├── .gitignore
├── Dockerfile-kafka-connect
├── README.md
├── clickhouse-client.sh
├── demo-start.sh
├── demo-stop.sh
├── docker-compose.yml
├── ksql-custom-udfs
│   ├── pom.xml
│   └── src
│       └── main
│           └── java
│               └── io
│                   └── streamthoughts
│                       └── ksql
│                           └── udfs
│                               ├── ArrayToString.java
│                               └── ExtractArrayField.java
└── sql
    ├── ch_create_table_tweets.sql
    ├── ksql_create_connector_sink_jdbc_clickhouse.sql
    ├── ksql_create_connector_source_twitter.sql
    ├── ksql_create_stream_tweets.sql
    ├── ksql_create_stream_tweets_json.sql
    └── ksql_create_stream_tweets_normalized.sql

/.gitignore:
--------------------------------------------------------------------------------
### Scala template
*.class
*.log

# sbt specific
.cache
.history
.lib/
dist/*
target/
lib_managed/
src_managed/
project/boot/
project/plugins/project/

# Scala-IDE specific
.scala_dependencies
.worksheet
### Java template
*.class

# Mobile Tools for Java (J2ME)
.mtj.tmp/

# Package Files #
*.jar
*.war
*.ear

# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
hs_err_pid*
### Maven template
target/
pom.xml.tag
pom.xml.releaseBackup
pom.xml.versionsBackup
pom.xml.next
release.properties
dependency-reduced-pom.xml
buildNumber.properties
.mvn/timing.properties
### Eclipse template
*.pydevproject
.metadata
.gradle
tmp/
*.tmp
*.bak
*.swp
*~.nib
local.properties
.settings/
.loadpath

# Eclipse Core
.project

# External tool builders
.externalToolBuilders/

# Locally stored "Eclipse launch configurations"
*.launch

# CDT-specific
.cproject

# JDT-specific (Eclipse Java Development Tools)
.classpath

# Java annotation processor (APT)
.factorypath

# PDT-specific
.buildpath

# sbteclipse plugin
.target

# TeXlipse plugin
.texlipse
### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio

*.iml

## Directory-based project format:
.idea/
# if you remove the above rule, at least ignore the following:

# User-specific stuff:
# .idea/workspace.xml
# .idea/tasks.xml
# .idea/dictionaries

# Sensitive or high-churn files:
# .idea/dataSources.ids
# .idea/dataSources.xml
# .idea/sqlDataSources.xml
# .idea/dynamic.xml
# .idea/uiDesigner.xml

# Gradle:
# .idea/gradle.xml
# .idea/libraries

# Mongo Explorer plugin:
# .idea/mongoSettings.xml

## File-based project format:
*.ipr
*.iws

## Plugin-specific files:

# IntelliJ
/out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties

# Created by .ignore support plugin (hsz.mobi)
azkarra-*/docs
azkarra-server/src/main/resources/io/
*/node_modules
site/resources

--------------------------------------------------------------------------------
/Dockerfile-kafka-connect:
--------------------------------------------------------------------------------
FROM confluentinc/cp-kafka-connect-base:5.4.1

ARG clickHouseVersion
ARG connectTwitterVersion

ENV JDBC_DRIVER_PATH=/usr/share/confluent-hub-components/confluentinc-kafka-connect-jdbc/lib
ENV JDBC_DRIVER=clickhouse-jdbc-$clickHouseVersion.jar

RUN confluent-hub install --no-prompt jcustenborder/kafka-connect-twitter:$connectTwitterVersion
RUN confluent-hub install --no-prompt confluentinc/kafka-connect-jdbc:5.4.1

RUN mkdir -p $JDBC_DRIVER_PATH && \
    echo "Downloading JDBC Driver for ClickHouse v$clickHouseVersion" && \
    wget -O ${JDBC_DRIVER_PATH}/${JDBC_DRIVER} https://github.com/ClickHouse/clickhouse-jdbc/releases/download/release_$clickHouseVersion/${JDBC_DRIVER}

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# PoC: Fast Data Analytics Platform with ClickHouse and Apache Kafka


## Prerequisites

* Git
* Maven (we recommend version 3.5.3)
* Java 11
* Docker, Docker Compose

## Project Tree

```
├── clickhouse-client.sh      // Utility script to start a ClickHouse client
├── demo-start.sh             // Utility script to start the project
├── demo-stop.sh              // Utility script to stop the project
├── docker-compose.yml
├── Dockerfile-kafka-connect
├── ksql-custom-udfs          // Maven project containing User Defined Functions (UDFs) for ksqlDB
│   ├── pom.xml
│   └── src
│       └── main
│           └── java
│               └── io
│                   └── streamthoughts
│                       └── ksql
│                           └── udfs
│                               ├── ArrayToString.java
│                               └── ExtractArrayField.java
├── README.md
└── sql                       // ksqlDB and ClickHouse queries
    ├── ch_create_table_tweets.sql
    ├── ksql_create_connector_sink_jdbc_clickhouse.sql
    ├── ksql_create_connector_source_twitter.sql
    ├── ksql_create_stream_tweets_json.sql
    ├── ksql_create_stream_tweets_normalized.sql
    └── ksql_create_stream_tweets.sql
```

## Twitter API OAuth

To run this demo project, you need credentials for the [Twitter API](https://developer.twitter.com/en/docs/basics/authentication/oauth-1-0a).

* Edit the file `./sql/ksql_create_connector_source_twitter.sql` to set your Twitter API credentials:

```sql
CREATE SOURCE CONNECTOR tweeterconnector WITH (
    'connector.class'='com.github.jcustenborder.kafka.connect.twitter.TwitterSourceConnector',
    'twitter.oauth.accessTokenSecret'='%ACCESS_TOKEN_SECRET%',
    'twitter.oauth.consumerSecret'='%CONSUMER_SECRET%',
    'twitter.oauth.accessToken'='%ACCESS_TOKEN%',
    'twitter.oauth.consumerKey'='%CONSUMER_KEY%',
    'kafka.status.topic'='tweets',
    'process.deletes'=false,
    'filter.keywords'='coronavirus,2019nCoV,SARSCoV2,covid19,cov19'
);
```

## Starting the Project

**Start the demonstration:**

```bash
$ ./demo-start.sh
```

**Example ClickHouse SQL queries** (interactively via the helper script, or as a one-off command):

```bash
$ ./clickhouse-client.sh
$ docker exec -it clickhouse bash -c "clickhouse-client -q 'SELECT COUNT(*) AS COUNT, LANG FROM tweets GROUP BY LANG ORDER BY (COUNT) DESC LIMIT 10;'"
```

**Stopping:**

```bash
$ ./demo-stop.sh
```

--------------------------------------------------------------------------------
/clickhouse-client.sh:
--------------------------------------------------------------------------------
#!/bin/bash

set -e

docker exec -it clickhouse bash -c "clickhouse-client --multiline"

--------------------------------------------------------------------------------
/demo-start.sh:
--------------------------------------------------------------------------------
#!/bin/bash

set -e


# Functions
function exec_clickhouse_query() {
    SQL=$1
    echo -e "\n🚀 Executing ClickHouse query\n"
    echo "${SQL}"
    docker exec -it clickhouse bash -c "echo \"$SQL\" | clickhouse-client --multiline"
    echo "----------------------------------------------"
}

function exec_ksql_query() {
    echo -e "\n🚀 Executing KSQL query\n"
    KSQL_QUERY=$1
    docker exec -it ksqldb-server bash -c "echo \"$KSQL_QUERY\" | ksql"
}

# Main
echo "--------------------------------------------------------------------------------"
echo "--- Demo : Fast Analytic Platform with Kafka, ClickHouse & Superset ---"
echo "--------------------------------------------------------------------------------"

export COMPOSE_PROJECT_NAME=demo-twitter-streams

echo -e "\n🐳 Stopping all previously started Docker containers"
docker-compose down -v

echo -e "\n🏭 Building Maven project (cd ksql-custom-udfs; mvn clean -q package)\n"
(cd ksql-custom-udfs; mvn clean -q package)

echo -e "\n🐳 Starting all docker containers"
docker-compose up -d

KAFKA_CONTAINER_NAME=kafka

echo -e "\n⏳ Waiting for Kafka Broker to be up and running"
while true
do
    if [ $(docker logs $KAFKA_CONTAINER_NAME 2>&1 | grep "started (kafka.server.KafkaServer)" >/dev/null; echo $?) -eq 0 ]; then
        echo
        break
    fi
    printf "."
    sleep 1
done;

echo -e "\n⏳ Creating Kafka topics"
docker exec -it kafka kafka-topics --zookeeper zookeeper:2181 --create --topic tweets --partitions 4 --replication-factor 1
docker exec -it kafka kafka-topics --zookeeper zookeeper:2181 --create --topic tweets-normalized --partitions 4 --replication-factor 1

exec_clickhouse_query "$(cat ./sql/ch_create_table_tweets.sql)"

echo -e "\n⏳ Waiting for Kafka Connect to be up and running."
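# Probe the Connect REST API root until it returns HTTP 200: `curl -sI` fetches
# only the response headers, and `cut` extracts the status code from the first
# line of the response.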
while true
do
    res=$(curl -sI http://localhost:8083 | head -n 1 | cut -d$' ' -f2)
    if [ "$res" == "200" ]; then
        echo
        break
    fi
    printf "."
    sleep 1
done;

echo -e "\n⏳ Waiting for KSQL to be available before launching CLI\n"
while [ $(curl -s -o /dev/null -w %{http_code} http://localhost:8088/) -eq 000 ]
do
    echo -e $(date) "KSQL Server HTTP state: " $(curl -s -o /dev/null -w %{http_code} http://localhost:8088/) " (waiting for 200)"
    sleep 5
done

echo -e "\n⏳ Starting JdbcSinkConnector for ClickHouse\n"

exec_ksql_query "$(cat ./sql/ksql_create_connector_sink_jdbc_clickhouse.sql)"

sleep 5

echo -e "\n⏳ Starting TwitterSourceConnector\n"
exec_ksql_query "$(cat ./sql/ksql_create_connector_source_twitter.sql)"

exec_ksql_query "SET 'auto.offset.reset' = 'earliest';"
exec_ksql_query "$(cat ./sql/ksql_create_stream_tweets.sql)"
exec_ksql_query "$(cat ./sql/ksql_create_stream_tweets_normalized.sql)"
exec_ksql_query "$(cat ./sql/ksql_create_stream_tweets_json.sql)"

exit 0

--------------------------------------------------------------------------------
/demo-stop.sh:
--------------------------------------------------------------------------------
#!/bin/bash

set -e

export COMPOSE_PROJECT_NAME=demo-twitter-streams

echo -e "\n🐳 Stopping all previously started Docker containers"
docker-compose down -v

exit 0

--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
#
# Copyright 2020 StreamThoughts.
#
version: '3'
services:
  zookeeper:
    image: confluentinc/cp-zookeeper:5.4.1
    hostname: zookeeper
    container_name: zookeeper
    ports:
      - "2181:2181"
    environment:
      ZOOKEEPER_CLIENT_PORT: 2181
      ZOOKEEPER_TICK_TIME: 2000

  kafka:
    image: confluentinc/cp-kafka:5.4.1
    hostname: kafka
    container_name: kafka
    depends_on:
      - zookeeper
    ports:
      - "29092:29092"
      - "9092:9092"
    environment:
      KAFKA_BROKER_ID: 1
      KAFKA_ZOOKEEPER_CONNECT: 'zookeeper:2181'
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092
      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
      KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0
      KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1
      KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1
      CONFLUENT_METRICS_REPORTER_BOOTSTRAP_SERVERS: kafka:29092
      CONFLUENT_METRICS_REPORTER_ZOOKEEPER_CONNECT: zookeeper:2181
      CONFLUENT_METRICS_REPORTER_TOPIC_REPLICAS: 1
      CONFLUENT_METRICS_ENABLE: 'false'

  schema-registry:
    image: confluentinc/cp-schema-registry:5.4.1
    hostname: schema-registry
    container_name: schema-registry
    depends_on:
      - zookeeper
      - kafka
    ports:
      - "8081:8081"
    environment:
      SCHEMA_REGISTRY_HOST_NAME: schema-registry
      SCHEMA_REGISTRY_KAFKASTORE_CONNECTION_URL: 'zookeeper:2181'

  kafka-connect:
    container_name: kafka-connect
    build:
      context: ./
      dockerfile: Dockerfile-kafka-connect
      args:
        clickHouseVersion: "0.2.4"
        connectTwitterVersion: "0.3.33"
    depends_on:
      - zookeeper
      - schema-registry
      - kafka
    ports:
      - "8083:8083"
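    # Each CONNECT_* variable below is translated by the image's launcher into the
    # matching Kafka Connect worker property (e.g. CONNECT_GROUP_ID -> group.id).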
    environment:
      CONNECT_BOOTSTRAP_SERVERS: 'kafka:29092'
      CONNECT_REST_ADVERTISED_HOST_NAME: kafka-connect
      CONNECT_REST_PORT: 8083
      CONNECT_GROUP_ID: compose-connect-group
      CONNECT_CONFIG_STORAGE_TOPIC: docker-connect-configs
      CONNECT_CONFIG_STORAGE_REPLICATION_FACTOR: 1
      CONNECT_OFFSET_FLUSH_INTERVAL_MS: 10000
      CONNECT_OFFSET_STORAGE_TOPIC: docker-connect-offsets
      CONNECT_OFFSET_STORAGE_REPLICATION_FACTOR: 1
      CONNECT_STATUS_STORAGE_TOPIC: docker-connect-status
      CONNECT_STATUS_STORAGE_REPLICATION_FACTOR: 1
      CONNECT_KEY_CONVERTER: org.apache.kafka.connect.storage.StringConverter
      CONNECT_VALUE_CONVERTER: io.confluent.connect.avro.AvroConverter
      CONNECT_VALUE_CONVERTER_SCHEMA_REGISTRY_URL: http://schema-registry:8081
      CONNECT_INTERNAL_KEY_CONVERTER: "org.apache.kafka.connect.json.JsonConverter"
      CONNECT_INTERNAL_VALUE_CONVERTER: "org.apache.kafka.connect.json.JsonConverter"
      CONNECT_ZOOKEEPER_CONNECT: 'zookeeper:2181'
      CONNECT_PLUGIN_PATH: "/usr/share/java,/usr/share/confluent-hub-components/"
      CONNECT_LOG4J_ROOT_LOGLEVEL: "INFO"
      CONNECT_LOG4J_LOGGERS: org.apache.zookeeper=ERROR,org.I0Itec.zkclient=ERROR,org.reflections=ERROR

  ksqldb-server:
    #image: confluentinc/cp-ksql-server:5.4.1
    image: confluentinc/ksqldb-server:0.8.1
    hostname: ksqldb-server
    container_name: ksqldb-server
    depends_on:
      - zookeeper
      - kafka
      - schema-registry
      - kafka-connect
    ports:
      - "8088:8088"
    environment:
      KSQL_BOOTSTRAP_SERVERS: kafka:29092
      KSQL_LISTENERS: http://0.0.0.0:8088
      KSQL_KSQL_CONNECT_URL: http://kafka-connect:8083
      KSQL_KSQL_SERVICE_ID: ksql-docker
      KSQL_KSQL_SCHEMA_REGISTRY_URL: http://schema-registry:8081
      KSQL_KSQL_EXTENSION_DIR: /var/lib/ksql/extension
    volumes:
      - ./ksql-custom-udfs/target/ksql-custom-udfs-1.0.jar:/var/lib/ksql/extension/ksql-custom-udfs-1.0.jar

  clickhouse:
    image: yandex/clickhouse-server:20.4
    hostname: clickhouse
    container_name: clickhouse
    # ports:
      # Default port for HTTP interfaces (https://clickhouse.yandex/docs/en/interfaces/http_interface/)
      # - "8123:8123"
      # Default port for native client (https://clickhouse.yandex/docs/en/interfaces/tcp/)
      # - "9000:9000"
    # volumes:
      # - /tmp/clickhouse/data:/var/lib/clickhouse
      # - /tmp/clickhouse/log:/var/log/clickhouse-server
      # - /tmp/clickhouse-config.xml:/etc/clickhouse-server/config.xml
    # Zookeeper is required for replicated tables (https://clickhouse.tech/docs/en/operations/table_engines/replication/)
    # depends_on:
    #   - zookeeper

  superset:
    image: amancevice/superset:latest
    hostname: superset
    container_name: superset
    ports:
      - "8080:8088"

--------------------------------------------------------------------------------
/ksql-custom-udfs/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>io.streamthoughts</groupId>
    <artifactId>ksql-custom-udfs</artifactId>
    <version>1.0</version>

    <repositories>
        <repository>
            <id>confluent</id>
            <url>http://packages.confluent.io/maven/</url>
        </repository>
    </repositories>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.8.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                    <compilerArgs>
                        <arg>-parameters</arg>
                    </compilerArgs>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
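        <!-- The ksqlDB UDF API and the Kafka Connect data API are already on the
             ksqldb-server classpath at runtime, hence the 'provided' scope. -->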
        <dependency>
            <groupId>io.confluent.ksql</groupId>
            <artifactId>ksql-common</artifactId>
            <version>5.4.1</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>io.confluent.ksql</groupId>
            <artifactId>ksql-udf</artifactId>
            <version>5.4.1</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>connect-api</artifactId>
            <version>2.4.1</version>
            <scope>provided</scope>
        </dependency>
    </dependencies>
</project>

--------------------------------------------------------------------------------
/ksql-custom-udfs/src/main/java/io/streamthoughts/ksql/udfs/ArrayToString.java:
--------------------------------------------------------------------------------
/*
 * Copyright 2020 StreamThoughts.
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.streamthoughts.ksql.udfs;

import io.confluent.ksql.function.udf.Udf;
import io.confluent.ksql.function.udf.UdfDescription;
import io.confluent.ksql.function.udf.UdfParameter;

import java.util.List;
import java.util.stream.Collectors;

@UdfDescription(
    name = "array_to_string",
    description = "Concatenates array elements using supplied delimiter and null string")
public class ArrayToString {

    @Udf
    public String arrayToString(@UdfParameter("array") final List<String> array,
                                @UdfParameter("delimiter") final String delimiter,
                                @UdfParameter("nullString") final String nullString) {
        if (array == null) return nullString;
        return array
            .stream()
            .map(Object::toString)
            .collect(Collectors.joining(delimiter));
    }
}

--------------------------------------------------------------------------------
/ksql-custom-udfs/src/main/java/io/streamthoughts/ksql/udfs/ExtractArrayField.java:
--------------------------------------------------------------------------------
/*
 * Copyright 2020 StreamThoughts.
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.streamthoughts.ksql.udfs;

import io.confluent.ksql.function.udf.Udf;
import io.confluent.ksql.function.udf.UdfDescription;
import io.confluent.ksql.function.udf.UdfParameter;
import org.apache.kafka.connect.data.Field;
import org.apache.kafka.connect.data.Struct;

import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;

@UdfDescription(name = "extract_array_field", description = "Extract a single field from an array of struct")
public class ExtractArrayField {

    @Udf
    @SuppressWarnings("unchecked")
    public List<String> extractArrayField(@UdfParameter(value = "array") final List array,
                                          @UdfParameter(value = "field") final String field) {

        if (array == null) return null;

        final String ufield = field.toUpperCase();

        final List<Struct> structs = (List<Struct>) array;
        return structs.stream()
            .map(record -> {
                final Field sf = record.schema().field(ufield);
                return sf != null ? record.getString(ufield) : null;
            })
            .filter(Objects::nonNull)
            .collect(Collectors.toList());
    }
}

--------------------------------------------------------------------------------
/sql/ch_create_table_tweets.sql:
--------------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS default.tweets
(
    ID String,
    CREATEDAT DateTime,
    TEXT String,
    LANG String,
    RETWEETED UInt8,
    USERID String,
    USERNAME String,
    USERDESCRIPTION String,
    USERLOCATION String,
    HASHTAGS String,
    MENTIONS String
) ENGINE = MergeTree()
PARTITION BY toYYYYMM(CREATEDAT)
ORDER BY (CREATEDAT, LANG);

--------------------------------------------------------------------------------
/sql/ksql_create_connector_sink_jdbc_clickhouse.sql:
--------------------------------------------------------------------------------
CREATE SINK CONNECTOR clickhousejdbcconnector WITH (
    'connector.class'='io.confluent.connect.jdbc.JdbcSinkConnector',
    'topics'='tweets-normalized',
    'tasks.max'='1',
    'connection.url'='jdbc:clickhouse://clickhouse:8123/default',
    'table.name.format'='tweets'
);

--------------------------------------------------------------------------------
/sql/ksql_create_connector_source_twitter.sql:
--------------------------------------------------------------------------------
CREATE SOURCE CONNECTOR tweeterconnector WITH (
    'connector.class'='com.github.jcustenborder.kafka.connect.twitter.TwitterSourceConnector',
    'twitter.oauth.accessTokenSecret'='%OAUTH_ACCESS_TOKEN_SECRET%',
    'twitter.oauth.consumerSecret'='%OAUTH_CONSUMER_SECRET%',
    'twitter.oauth.accessToken'='%OAUTH_ACCESS_TOKEN%',
    'twitter.oauth.consumerKey'='%OAUTH_CONSUMER_KEY%',
    'kafka.status.topic'='tweets',
    'process.deletes'=false,
    'filter.keywords'='coronavirus,2019nCoV,SARSCoV2,covid19,cov19'
);

--------------------------------------------------------------------------------
/sql/ksql_create_stream_tweets.sql:
--------------------------------------------------------------------------------
CREATE STREAM tweets WITH (KAFKA_TOPIC = 'tweets', VALUE_FORMAT='AVRO');

--------------------------------------------------------------------------------
/sql/ksql_create_stream_tweets_json.sql:
--------------------------------------------------------------------------------
CREATE STREAM TWEETS_NORMALIZED_JSON
WITH (KAFKA_TOPIC='tweets-normalized-json', VALUE_FORMAT='JSON')
AS SELECT * FROM TWEETS_NORMALIZED;

--------------------------------------------------------------------------------
/sql/ksql_create_stream_tweets_normalized.sql:
--------------------------------------------------------------------------------
CREATE STREAM TWEETS_NORMALIZED
WITH (kafka_topic = 'tweets-normalized') AS
SELECT
    Id,
    CreatedAt / 1000 as CreatedAt,
    Text,
    Lang,
    Retweeted,
    User->Id as UserId,
    User->Name as UserName,
    IFNULL(User->Description, '') as UserDescription,
    IFNULL(User->Location, '') as UserLocation,
    ARRAY_TO_STRING(EXTRACT_ARRAY_FIELD(UserMentionEntities, 'Name'), ',', '') as Mentions,
    ARRAY_TO_STRING(EXTRACT_ARRAY_FIELD(HashtagEntities, 'Text'), ',', '') as Hashtags
FROM tweets EMIT CHANGES;

--------------------------------------------------------------------------------
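
Once `demo-start.sh` completes, a quick end-to-end check is to sample the normalized stream from the ksqlDB CLI and then count the rows landed in ClickHouse. A minimal sketch of such a session (column names match the stream definition above; `EMIT CHANGES LIMIT` assumes the ksqlDB 0.8.1 image used in `docker-compose.yml`):

```sql
-- In the ksqlDB CLI (docker exec -it ksqldb-server ksql):
SET 'auto.offset.reset' = 'earliest';
SELECT USERNAME, LANG, HASHTAGS FROM TWEETS_NORMALIZED EMIT CHANGES LIMIT 5;

-- In the ClickHouse client (./clickhouse-client.sh):
SELECT COUNT(*) FROM default.tweets;
```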