├── .gitignore ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── bin ├── activator └── activator.bat ├── build.sbt ├── clickhouse_files └── config.xml ├── docker_files └── docker_start.sh ├── libexec └── activator-launch-1.3.12.jar ├── project ├── build.properties └── plugins.sbt └── src ├── main └── scala │ └── io │ └── clickhouse │ └── ext │ ├── ClickhouseClient.scala │ ├── ClickhouseConnectionFactory.scala │ ├── ClickhouseResultSetExt.scala │ ├── ClusterResultSet.scala │ ├── Utils.scala │ └── spark │ └── DataFrameExt.scala └── test └── scala ├── DFExtSpec.scala └── UtilsSpec.scala /.gitignore: -------------------------------------------------------------------------------- 1 | # SBT 2 | boot/ 3 | lib_managed/ 4 | src_managed/ 5 | test-output/ 6 | target/ 7 | .history 8 | 9 | # IntelliJ 10 | .idea/ 11 | *.iml 12 | *.ipr 13 | *.iws 14 | out/ 15 | 16 | # Eclipse 17 | .cache 18 | .classpath 19 | .project 20 | .scala_dependencies 21 | .settings 22 | .target/ 23 | 24 | # Mac 25 | .DS_Store 26 | 27 | # Other 28 | *.pyc 29 | *.swp 30 | sync.sh 31 | 32 | logs/ 33 | metastore_db/ 34 | *.log -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM p7hb/docker-spark:2.1.0 2 | 3 | ARG CLICKHOUSE_LOGS=/var/log/clickhouse-server 4 | 5 | ENV APP_DIR /app 6 | 7 | RUN mkdir -p /etc/apt/sources.list.d && \ 8 | apt-key adv --keyserver keyserver.ubuntu.com --recv E0C56BD4 && \ 9 | echo "deb http://repo.yandex.ru/clickhouse/trusty stable main" | tee /etc/apt/sources.list.d/clickhouse.list && \ 10 | apt-get -y update && \ 11 | apt-get -y install clickhouse-server-common clickhouse-client && \ 12 | mkdir -p ${CLICKHOUSE_LOGS} && \ 13 | touch ${CLICKHOUSE_LOGS}/tmp 14 | 15 | ADD docker_files/docker_start.sh /docker_start.sh 16 | 17 | RUN mkdir -p ${APP_DIR} 18 | WORKDIR ${APP_DIR} 19 | 20 | # clickhouse config with cluster def 21 | COPY /clickhouse_files/config.xml /etc/clickhouse-server/ 22 | 23 | COPY /target/pack/lib/clickhouse* ${APP_DIR}/lib/ 24 | COPY /target/pack/lib/guava* ${APP_DIR}/lib/ 25 | COPY Makefile ${APP_DIR} 26 | 27 | ENTRYPOINT ["/docker_start.sh"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2014 Typesafe, Inc. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | # docker images params 3 | REPO=dmitryb/clickhouse-spark-connector 4 | TAG=0.0.1 5 | 6 | build: 7 | sbt compile 8 | sbt pack 9 | 10 | pack: 11 | sbt pack-archive 12 | 13 | run: 14 | env JAVA_OPTS="-Xmx4g -Xms4g -server -XX:+UseParallelGC -XX:NewRatio=1" \ 15 | ./target/pack/bin/main --conf 16 | 17 | start-activator: 18 | ./bin/activator ui -Dhttp.address=0.0.0.0 -Dhttp.port=8088 19 | 20 | docker-build: 21 | docker build -t $(REPO):$(TAG) . 22 | 23 | docker-push: 24 | docker push $(REPO):$(TAG) 25 | 26 | docker-clean: 27 | docker rm $$(docker ps -a -q) 28 | docker rmi $$(docker images | grep "dmitryb/clickhouse-spark-connector" | awk '{print $$3}') 29 | 30 | # to create fat jar (not used) 31 | assembly: 32 | sbt assembly 33 | 34 | dev-local: 35 | #sbt clean compile 36 | #sbt pack 37 | mkdir -p target/l 38 | cp -f target/pack/lib/clickhouse* target/l/ 39 | cp -f target/pack/lib/guava* target/l/ 40 | 41 | clickhouse-server-start: 42 | docker run -it -d --name clickhouse-server -p 8123:8123 -v `pwd`/clickhouse_files/config.xml:/etc/clickhouse-server/config.xml yandex/clickhouse-server -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | clickhouse spark connector 2 | ========================== 3 | 4 | > Connector for writing a Spark DataFrame to a Yandex ClickHouse table 5 | 6 | Example 7 | ``` scala 8 | 9 | import io.clickhouse.ext.ClickhouseConnectionFactory 10 | import io.clickhouse.ext.spark.ClickhouseSparkExt._ 11 | import org.apache.spark.sql.SparkSession 12 | 13 | // spark config 14 | val sparkSession = SparkSession.builder 15 | .master("local") 16 | .appName("local spark") 17 | .getOrCreate() 18 | 19 | val sc = sparkSession.sparkContext 20 | val sqlContext = sparkSession.sqlContext 21 | 22 | // create test DF 23 | case class Row1(name: String, v: Int, v2: Int) 24 | val df = sqlContext.createDataFrame(1 to 1000 map(i => Row1(s"$i", i, i + 10)) ) 25 | 26 | // clickhouse params 27 | 28 | // any node 29 | val anyHost = "localhost" 30 | val db = "tmp1" 31 | val tableName = "t1" 32 | // cluster configuration must be defined in config.xml (clickhouse config) 33 | val clusterName = Some("perftest_1shards_1replicas"): Option[String] 34 | 35 | // define clickhouse datasource 36 | implicit val clickhouseDataSource = ClickhouseConnectionFactory.get(anyHost) 37 | 38 | // create db / table 39 | //df.dropClickhouseDb(db, clusterName) 40 | df.createClickhouseDb(db, clusterName) 41 | df.createClickhouseTable(db, tableName, "mock_date", Seq("name"), clusterNameO = clusterName) 42 | 43 | // save DF to clickhouse table 44 | val res = df.saveToClickhouse("tmp1", "t1", (row) => java.sql.Date.valueOf("2000-12-01"), "mock_date", clusterNameO = clusterName) 45 | assert(res.size == 1) 46 | assert(res.get("localhost") == Some(df.count())) 47 | 48 | ``` 49 | 50 | Docker image: 51 | [dmitryb/clickhouse-spark-connector on Docker Hub](https://hub.docker.com/r/dmitryb/clickhouse-spark-connector/) -------------------------------------------------------------------------------- /bin/activator: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ### ------------------------------- ### 4 | ### Helper methods for BASH scripts ### 5 | ### ------------------------------- ### 6 | 7 | realpath () { 8 | ( 9 |
TARGET_FILE="$1" 10 | FIX_CYGPATH="$2" 11 | 12 | cd "$(dirname "$TARGET_FILE")" 13 | TARGET_FILE=$(basename "$TARGET_FILE") 14 | 15 | COUNT=0 16 | while [ -L "$TARGET_FILE" -a $COUNT -lt 100 ] 17 | do 18 | TARGET_FILE=$(readlink "$TARGET_FILE") 19 | cd "$(dirname "$TARGET_FILE")" 20 | TARGET_FILE=$(basename "$TARGET_FILE") 21 | COUNT=$(($COUNT + 1)) 22 | done 23 | 24 | # make sure we grab the actual windows path, instead of cygwin's path. 25 | if [[ "x$FIX_CYGPATH" != "x" ]]; then 26 | echo "$(cygwinpath "$(pwd -P)/$TARGET_FILE")" 27 | else 28 | echo "$(pwd -P)/$TARGET_FILE" 29 | fi 30 | ) 31 | } 32 | 33 | 34 | # Uses uname to detect if we're in the odd cygwin environment. 35 | is_cygwin() { 36 | local os=$(uname -s) 37 | case "$os" in 38 | CYGWIN*) return 0 ;; 39 | *) return 1 ;; 40 | esac 41 | } 42 | 43 | # TODO - Use nicer bash-isms here. 44 | CYGWIN_FLAG=$(if is_cygwin; then echo true; else echo false; fi) 45 | 46 | 47 | # This can fix cygwin style /cygdrive paths so we get the 48 | # windows style paths. 49 | cygwinpath() { 50 | local file="$1" 51 | if [[ "$CYGWIN_FLAG" == "true" ]]; then 52 | echo $(cygpath -w $file) 53 | else 54 | echo $file 55 | fi 56 | } 57 | 58 | # Make something URI friendly 59 | make_url() { 60 | url="$1" 61 | local nospaces=${url// /%20} 62 | if is_cygwin; then 63 | echo "/${nospaces//\\//}" 64 | else 65 | echo "$nospaces" 66 | fi 67 | } 68 | 69 | declare -a residual_args 70 | declare -a java_args 71 | declare -a scalac_args 72 | declare -a sbt_commands 73 | declare java_cmd=java 74 | declare java_version 75 | declare -r real_script_path="$(realpath "$0")" 76 | declare -r sbt_home="$(realpath "$(dirname "$(dirname "$real_script_path")")")" 77 | declare -r sbt_bin_dir="$(dirname "$real_script_path")" 78 | declare -r app_version="1.3.12" 79 | 80 | declare -r script_name=activator 81 | declare -r java_opts=( "${ACTIVATOR_OPTS[@]}" "${SBT_OPTS[@]}" "${JAVA_OPTS[@]}" "${java_opts[@]}" ) 82 | userhome="$HOME" 83 | if is_cygwin; then 84 | # cygwin sets home to something f-d up, set to real windows homedir 85 | userhome="$USERPROFILE" 86 | fi 87 | declare -r activator_user_home_dir="${userhome}/.activator" 88 | declare -r java_opts_config_home="${activator_user_home_dir}/activatorconfig.txt" 89 | declare -r java_opts_config_version="${activator_user_home_dir}/${app_version}/activatorconfig.txt" 90 | 91 | echoerr () { 92 | echo 1>&2 "$@" 93 | } 94 | vlog () { 95 | [[ $verbose || $debug ]] && echoerr "$@" 96 | } 97 | dlog () { 98 | [[ $debug ]] && echoerr "$@" 99 | } 100 | 101 | jar_file () { 102 | echo "$(cygwinpath "${sbt_home}/libexec/activator-launch-${app_version}.jar")" 103 | } 104 | 105 | acquire_sbt_jar () { 106 | sbt_jar="$(jar_file)" 107 | 108 | if [[ ! -f "$sbt_jar" ]]; then 109 | echoerr "Could not find launcher jar: $sbt_jar" 110 | exit 2 111 | fi 112 | } 113 | 114 | execRunner () { 115 | # print the arguments one to a line, quoting any containing spaces 116 | [[ $verbose || $debug ]] && echo "# Executing command line:" && { 117 | for arg; do 118 | if printf "%s\n" "$arg" | grep -q ' '; then 119 | printf "\"%s\"\n" "$arg" 120 | else 121 | printf "%s\n" "$arg" 122 | fi 123 | done 124 | echo "" 125 | } 126 | 127 | # THis used to be exec, but we loose the ability to re-hook stty then 128 | # for cygwin... Maybe we should flag the feature here... 
129 | "$@" 130 | } 131 | 132 | addJava () { 133 | dlog "[addJava] arg = '$1'" 134 | java_args=( "${java_args[@]}" "$1" ) 135 | } 136 | addSbt () { 137 | dlog "[addSbt] arg = '$1'" 138 | sbt_commands=( "${sbt_commands[@]}" "$1" ) 139 | } 140 | addResidual () { 141 | dlog "[residual] arg = '$1'" 142 | residual_args=( "${residual_args[@]}" "$1" ) 143 | } 144 | addDebugger () { 145 | addJava "-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=$1" 146 | } 147 | addConfigOpts () { 148 | dlog "[addConfigOpts] arg = '$*'" 149 | for item in $* 150 | do 151 | addJava "$item" 152 | done 153 | } 154 | 155 | get_mem_opts () { 156 | # if we detect any of these settings in ${JAVA_OPTS} we need to NOT output our settings. 157 | # The reason is the Xms/Xmx, if they don't line up, cause errors. 158 | if [[ "${JAVA_OPTS}" == *-Xmx* ]] || [[ "${JAVA_OPTS}" == *-Xms* ]] || [[ "${JAVA_OPTS}" == *-XX:MaxPermSize* ]] || [[ "${JAVA_OPTS}" == *-XX:MaxMetaspaceSize* ]] || [[ "${JAVA_OPTS}" == *-XX:ReservedCodeCacheSize* ]]; then 159 | echo "" 160 | else 161 | # a ham-fisted attempt to move some memory settings in concert 162 | # so they need not be messed around with individually. 163 | local mem=${1:-1024} 164 | local codecache=$(( $mem / 8 )) 165 | (( $codecache > 128 )) || codecache=128 166 | (( $codecache < 512 )) || codecache=512 167 | local class_metadata_size=$(( $codecache * 2 )) 168 | local class_metadata_opt=$([[ "$java_version" < "1.8" ]] && echo "MaxPermSize" || echo "MaxMetaspaceSize") 169 | 170 | echo "-Xms${mem}m -Xmx${mem}m -XX:ReservedCodeCacheSize=${codecache}m -XX:${class_metadata_opt}=${class_metadata_size}m" 171 | fi 172 | } 173 | 174 | require_arg () { 175 | local type="$1" 176 | local opt="$2" 177 | local arg="$3" 178 | if [[ -z "$arg" ]] || [[ "${arg:0:1}" == "-" ]]; then 179 | echo "$opt requires <$type> argument" 180 | exit 1 181 | fi 182 | } 183 | 184 | is_function_defined() { 185 | declare -f "$1" > /dev/null 186 | } 187 | 188 | # If we're *not* running in a terminal, and we don't have any arguments, then we need to add the 'ui' parameter 189 | detect_terminal_for_ui() { 190 | [[ ! -t 0 ]] && [[ "${#residual_args}" == "0" ]] && { 191 | addResidual "ui" 192 | } 193 | # SPECIAL TEST FOR MAC 194 | [[ "$(uname)" == "Darwin" ]] && [[ "$HOME" == "$PWD" ]] && [[ "${#residual_args}" == "0" ]] && { 195 | echo "Detected MAC OSX launched script...." 196 | echo "Swapping to UI" 197 | addResidual "ui" 198 | } 199 | } 200 | 201 | process_args () { 202 | while [[ $# -gt 0 ]]; do 203 | case "$1" in 204 | -h|-help) usage; exit 1 ;; 205 | -v|-verbose) verbose=1 && shift ;; 206 | -d|-debug) debug=1 && shift ;; 207 | 208 | -ivy) require_arg path "$1" "$2" && addJava "-Dsbt.ivy.home=$2" && shift 2 ;; 209 | -mem) require_arg integer "$1" "$2" && sbt_mem="$2" && shift 2 ;; 210 | -jvm-debug) require_arg port "$1" "$2" && addDebugger $2 && shift 2 ;; 211 | -batch) exec &1 | awk -F '"' '/version/ {print $2}') 230 | vlog "[process_args] java_version = '$java_version'" 231 | } 232 | 233 | # Detect that we have java installed. 234 | checkJava() { 235 | local required_version="$1" 236 | # Now check to see if it's a good enough version 237 | if [[ "$java_version" == "" ]]; then 238 | echo 239 | echo No java installations was detected. 240 | echo Please go to http://www.java.com/getjava/ and download 241 | echo 242 | exit 1 243 | elif [[ ! 
"$java_version" > "$required_version" ]]; then 244 | echo 245 | echo The java installation you have is not up to date 246 | echo $script_name requires at least version $required_version+, you have 247 | echo version $java_version 248 | echo 249 | echo Please go to http://www.java.com/getjava/ and download 250 | echo a valid Java Runtime and install before running $script_name. 251 | echo 252 | exit 1 253 | fi 254 | } 255 | 256 | 257 | run() { 258 | # no jar? download it. 259 | [[ -f "$sbt_jar" ]] || acquire_sbt_jar "$sbt_version" || { 260 | # still no jar? uh-oh. 261 | echo "Download failed. Obtain the sbt-launch.jar manually and place it at $sbt_jar" 262 | exit 1 263 | } 264 | 265 | # process the combined args, then reset "$@" to the residuals 266 | process_args "$@" 267 | detect_terminal_for_ui 268 | set -- "${residual_args[@]}" 269 | argumentCount=$# 270 | 271 | # TODO - java check should be configurable... 272 | checkJava "1.6" 273 | 274 | #If we're in cygwin, we should use the windows config, and terminal hacks 275 | if [[ "$CYGWIN_FLAG" == "true" ]]; then 276 | stty -icanon min 1 -echo > /dev/null 2>&1 277 | addJava "-Djline.terminal=jline.UnixTerminal" 278 | addJava "-Dsbt.cygwin=true" 279 | fi 280 | 281 | # run sbt 282 | execRunner "$java_cmd" \ 283 | "-Dactivator.home=$(make_url "$sbt_home")" \ 284 | ${SBT_OPTS:-$default_sbt_opts} \ 285 | $(get_mem_opts $sbt_mem) \ 286 | ${JAVA_OPTS} \ 287 | ${java_args[@]} \ 288 | -jar "$sbt_jar" \ 289 | "${sbt_commands[@]}" \ 290 | "${residual_args[@]}" 291 | 292 | exit_code=$? 293 | 294 | # Clean up the terminal from cygwin hacks. 295 | if [[ "$CYGWIN_FLAG" == "true" ]]; then 296 | stty icanon echo > /dev/null 2>&1 297 | fi 298 | exit $exit_code 299 | } 300 | 301 | 302 | declare -r noshare_opts="-Dsbt.global.base=project/.sbtboot -Dsbt.boot.directory=project/.boot -Dsbt.ivy.home=project/.ivy" 303 | declare -r sbt_opts_file=".sbtopts" 304 | declare -r etc_sbt_opts_file="${sbt_home}/conf/sbtopts" 305 | declare -r win_sbt_opts_file="${sbt_home}/conf/sbtconfig.txt" 306 | 307 | usage() { 308 | cat < path to global settings/plugins directory (default: ~/.sbt) 323 | -sbt-boot path to shared boot directory (default: ~/.sbt/boot in 0.11 series) 324 | -ivy path to local Ivy repository (default: ~/.ivy2) 325 | -mem set memory options (default: $sbt_mem, which is $(get_mem_opts $sbt_mem)) 326 | -no-share use all local caches; no sharing 327 | -no-global uses global caches, but does not use global ~/.sbt directory. 328 | -jvm-debug Turn on JVM debugging, open at the given port. 
329 | -batch Disable interactive mode 330 | 331 | # sbt version (default: from project/build.properties if present, else latest release) 332 | -sbt-version use the specified version of sbt 333 | -sbt-jar use the specified jar as the sbt launcher 334 | -sbt-rc use an RC version of sbt 335 | -sbt-snapshot use a snapshot version of sbt 336 | 337 | # java version (default: java from PATH, currently $(java -version 2>&1 | grep version)) 338 | -java-home alternate JAVA_HOME 339 | 340 | # jvm options and output control 341 | JAVA_OPTS environment variable, if unset uses "$java_opts" 342 | SBT_OPTS environment variable, if unset uses "$default_sbt_opts" 343 | ACTIVATOR_OPTS Environment variable, if unset uses "" 344 | .sbtopts if this file exists in the current directory, it is 345 | prepended to the runner args 346 | /etc/sbt/sbtopts if this file exists, it is prepended to the runner args 347 | -Dkey=val pass -Dkey=val directly to the java runtime 348 | -J-X pass option -X directly to the java runtime 349 | (-J is stripped) 350 | -S-X add -X to sbt's scalacOptions (-S is stripped) 351 | 352 | In the case of duplicated or conflicting options, the order above 353 | shows precedence: JAVA_OPTS lowest, command line options highest. 354 | EOM 355 | } 356 | 357 | 358 | 359 | process_my_args () { 360 | while [[ $# -gt 0 ]]; do 361 | case "$1" in 362 | -no-colors) addJava "-Dsbt.log.noformat=true" && shift ;; 363 | -no-share) addJava "$noshare_opts" && shift ;; 364 | -no-global) addJava "-Dsbt.global.base=$(pwd)/project/.sbtboot" && shift ;; 365 | -sbt-boot) require_arg path "$1" "$2" && addJava "-Dsbt.boot.directory=$2" && shift 2 ;; 366 | -sbt-dir) require_arg path "$1" "$2" && addJava "-Dsbt.global.base=$2" && shift 2 ;; 367 | -debug-inc) addJava "-Dxsbt.inc.debug=true" && shift ;; 368 | -batch) exec ^&1') do ( 109 | if %%~j==java set JAVAINSTALLED=1 110 | if %%~j==openjdk set JAVAINSTALLED=1 111 | ) 112 | 113 | rem Detect the same thing about javac 114 | if "%_JAVACCMD%"=="" ( 115 | if not "%JAVA_HOME%"=="" ( 116 | if exist "%JAVA_HOME%\bin\javac.exe" set "_JAVACCMD=%JAVA_HOME%\bin\javac.exe" 117 | ) 118 | ) 119 | if "%_JAVACCMD%"=="" set _JAVACCMD=javac 120 | for /F %%j in ('"%_JAVACCMD%" -version 2^>^&1') do ( 121 | if %%~j==javac set JAVACINSTALLED=1 122 | ) 123 | 124 | rem BAT has no logical or, so we do it OLD SCHOOL! Oppan Redmond Style 125 | set JAVAOK=true 126 | if not defined JAVAINSTALLED set JAVAOK=false 127 | if not defined JAVACINSTALLED set JAVAOK=false 128 | 129 | if "%JAVAOK%"=="false" ( 130 | echo. 131 | echo A Java JDK is not installed or can't be found. 132 | if not "%JAVA_HOME%"=="" ( 133 | echo JAVA_HOME = "%JAVA_HOME%" 134 | ) 135 | echo. 136 | echo Please go to 137 | echo http://www.oracle.com/technetwork/java/javase/downloads/index.html 138 | echo and download a valid Java JDK and install before running Activator. 139 | echo. 140 | echo If you think this message is in error, please check 141 | echo your environment variables to see if "java.exe" and "javac.exe" are 142 | echo available via JAVA_HOME or PATH. 143 | echo. 
144 | if defined DOUBLECLICKED pause 145 | exit /B 1 146 | ) 147 | 148 | rem Check what Java version is being used to determine what memory options to use 149 | for /f "tokens=3" %%g in ('java -version 2^>^&1 ^| findstr /i "version"') do ( 150 | set JAVA_VERSION=%%g 151 | ) 152 | 153 | rem Strips away the " characters 154 | set JAVA_VERSION=%JAVA_VERSION:"=% 155 | 156 | rem TODO Check if there are existing mem settings in JAVA_OPTS/CFG_OPTS and use those instead of the below 157 | for /f "delims=. tokens=1-3" %%v in ("%JAVA_VERSION%") do ( 158 | set MAJOR=%%v 159 | set MINOR=%%w 160 | set BUILD=%%x 161 | 162 | set META_SIZE=-XX:MetaspaceSize=64M -XX:MaxMetaspaceSize=256M 163 | if "!MINOR!" LSS "8" ( 164 | set META_SIZE=-XX:PermSize=64M -XX:MaxPermSize=256M 165 | ) 166 | 167 | set MEM_OPTS=!META_SIZE! 168 | ) 169 | 170 | rem We use the value of the JAVA_OPTS environment variable if defined, rather than the config. 171 | set _JAVA_OPTS=%JAVA_OPTS% 172 | if "%_JAVA_OPTS%"=="" set _JAVA_OPTS=%CFG_OPTS% 173 | 174 | set DEBUG_OPTS= 175 | 176 | rem Loop through the arguments, building remaining args in args variable 177 | set args= 178 | :argsloop 179 | if not "%~1"=="" ( 180 | rem Checks if the argument contains "-D" and if true, adds argument 1 with 2 and puts an equal sign between them. 181 | rem This is done since batch considers "=" to be a delimiter so we need to circumvent this behavior with a small hack. 182 | set arg1=%~1 183 | if "!arg1:~0,2!"=="-D" ( 184 | set "args=%args% "%~1"="%~2"" 185 | shift 186 | shift 187 | goto argsloop 188 | ) 189 | 190 | if "%~1"=="-jvm-debug" ( 191 | if not "%~2"=="" ( 192 | rem This piece of magic somehow checks that an argument is a number 193 | for /F "delims=0123456789" %%i in ("%~2") do ( 194 | set var="%%i" 195 | ) 196 | if defined var ( 197 | rem Not a number, assume no argument given and default to 9999 198 | set JPDA_PORT=9999 199 | ) else ( 200 | rem Port was given, shift arguments 201 | set JPDA_PORT=%~2 202 | shift 203 | ) 204 | ) else ( 205 | set JPDA_PORT=9999 206 | ) 207 | shift 208 | 209 | set DEBUG_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=!JPDA_PORT! 210 | goto argsloop 211 | ) 212 | rem else 213 | set "args=%args% "%~1"" 214 | shift 215 | goto argsloop 216 | ) 217 | 218 | :run 219 | 220 | if "!args!"=="" ( 221 | if defined DOUBLECLICKED ( 222 | set CMDS="ui" 223 | ) else set CMDS=!args! 224 | ) else set CMDS=!args! 225 | 226 | rem We add a / in front, so we get file:///C: instead of file://C: 227 | rem Java considers the later a UNC path. 228 | rem We also attempt a solid effort at making it URI friendly. 229 | rem We don't even bother with UNC paths. 230 | set JAVA_FRIENDLY_HOME_1=/!ACTIVATOR_HOME:\=/! 231 | set JAVA_FRIENDLY_HOME=/!JAVA_FRIENDLY_HOME_1: =%%20! 
232 | 233 | rem Checks if the command contains spaces to know if it should be wrapped in quotes or not 234 | set NON_SPACED_CMD=%_JAVACMD: =% 235 | if "%_JAVACMD%"=="%NON_SPACED_CMD%" %_JAVACMD% %DEBUG_OPTS% %MEM_OPTS% %ACTIVATOR_OPTS% %SBT_OPTS% %_JAVA_OPTS% "-Dactivator.home=%JAVA_FRIENDLY_HOME%" -jar "%ACTIVATOR_HOME%\libexec\%ACTIVATOR_LAUNCH_JAR%" %CMDS% 236 | if NOT "%_JAVACMD%"=="%NON_SPACED_CMD%" "%_JAVACMD%" %DEBUG_OPTS% %MEM_OPTS% %ACTIVATOR_OPTS% %SBT_OPTS% %_JAVA_OPTS% "-Dactivator.home=%JAVA_FRIENDLY_HOME%" -jar "%ACTIVATOR_HOME%\libexec\%ACTIVATOR_LAUNCH_JAR%" %CMDS% 237 | 238 | if ERRORLEVEL 1 goto error 239 | goto end 240 | 241 | :error 242 | set ERROR_CODE=1 243 | 244 | :end 245 | 246 | @endlocal 247 | 248 | exit /B %ERROR_CODE% 249 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | 2 | name := """clickhouse-spark-connector""" 3 | 4 | version := "1.2" 5 | 6 | scalaVersion := "2.11.7" 7 | 8 | publishTo := Some("jFrog" at "http://10.2.95.5:8080/artifactory/libs-release") 9 | //credentials += Credentials("jFrog", "10.2.95.5", "admin", "password") 10 | 11 | libraryDependencies ++= Seq( 12 | "org.apache.spark" % "spark-core_2.11" % "2.0.0", 13 | "org.apache.spark" % "spark-sql_2.11" % "2.0.0", 14 | "ru.yandex.clickhouse" % "clickhouse-jdbc" % "0.1.14", 15 | "org.scalatest" %% "scalatest" % "2.2.4" % "test", 16 | "com.fasterxml.jackson.module" % "jackson-module-scala_2.11" % "2.7.4" 17 | ) 18 | 19 | fork in run := true 20 | 21 | test in assembly := {} 22 | 23 | assemblyMergeStrategy in assembly := { 24 | case n if n.startsWith("META-INF/MANIFEST.MF") => MergeStrategy.discard 25 | case "reference.conf" => MergeStrategy.concat 26 | case x => MergeStrategy.first 27 | } 28 | 29 | packAutoSettings -------------------------------------------------------------------------------- /clickhouse_files/config.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | trace 5 | /var/log/clickhouse-server/clickhouse-server.log 6 | /var/log/clickhouse-server/clickhouse-server.err.log 7 | 1000M 8 | 10 9 | 10 | 11 | 12 | 8123 13 | 9000 14 | 15 | 16 | 9009 17 | 18 | 22 | 25 | 26 | 27 | :: 28 | 29 | 4096 30 | 3 31 | 32 | 33 | 100 34 | 35 | 37 | 38 | 39 | 44 | 8589934592 45 | 46 | 50 | 5368709120 51 | 52 | 53 | 54 | /opt/clickhouse/ 55 | 56 | 57 | /opt/clickhouse/tmp/ 58 | 59 | 60 | users.xml 61 | 62 | 63 | default 64 | 65 | 66 | default 67 | 68 | 71 | 73 | 74 | 75 | 76 | 77 | localhost 78 | 9000 79 | 80 | 81 | 82 | 83 | 84 | 88 | 89 | 94 | 95 | 96 | 101 | 102 | 103 | 104 | 105 | 3600 106 | 107 | 108 | 109 | false 110 | 111 | 119 | 120 | 121 | 122 | 123 | 127 | system 128 | query_log
129 | 130 | 131 | 7500 132 |
133 | 134 | 135 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 149 | *_dictionary.xml 150 | 151 | 152 | 155 | 156 | 168 | 169 | 170 | 171 | /clickhouse/task_queue 172 | 173 | 174 | 175 | 180 | 181 | 182 | 183 | 184 | click_cost 185 | any 186 | 187 | 0 188 | 3600 189 | 190 | 191 | 86400 192 | 60 193 | 194 | 195 | 196 | max 197 | 198 | 0 199 | 60 200 | 201 | 202 | 3600 203 | 300 204 | 205 | 206 | 86400 207 | 3600 208 | 209 | 210 | 211 |
212 | -------------------------------------------------------------------------------- /docker_files/docker_start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | service clickhouse-server start 4 | #exec "$@" 5 | tail -f /var/log/clickhouse-server/* -------------------------------------------------------------------------------- /libexec/activator-launch-1.3.12.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DmitryBe/spark-clickhouse/d8e546505dc937ad567a32d8ab74e226ce5179e9/libexec/activator-launch-1.3.12.jar -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | #Activator-generated Properties 2 | #Thu Jan 19 11:34:38 SGT 2017 3 | template.uuid=e17acfbb-1ff5-41f5-b8cf-2c40be6a8340 4 | sbt.version=0.13.8 5 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("org.xerial.sbt" % "sbt-pack" % "0.8.2") 2 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.3") -------------------------------------------------------------------------------- /src/main/scala/io/clickhouse/ext/ClickhouseClient.scala: -------------------------------------------------------------------------------- 1 | package io.clickhouse.ext 2 | 3 | import ru.yandex.clickhouse.ClickHouseDataSource 4 | import io.clickhouse.ext.Utils._ 5 | 6 | case class ClickhouseClient(clusterNameO: Option[String] = None) 7 | (implicit ds: ClickHouseDataSource){ 8 | 9 | import io.clickhouse.ext.ClickhouseResultSetExt._ 10 | 11 | def createDb(dbName: String){ 12 | query(s"create database if not exists $dbName") 13 | } 14 | 15 | def dropDb(dbName: String){ 16 | query(s"DROP DATABASE IF EXISTS $dbName") 17 | } 18 | 19 | def query(sql: String) = { 20 | using(ds.getConnection){ conn => 21 | val statement = conn.createStatement() 22 | val rs = statement.executeQuery(sql) 23 | rs 24 | } 25 | } 26 | 27 | def queryCluster(sql: String) = { 28 | val resultSet = runOnAllNodes(sql) 29 | ClusterResultSet(resultSet) 30 | } 31 | 32 | def createDbCluster(dbName: String) = { 33 | runOnAllNodes(s"create database if not exists $dbName") 34 | .count(x => x._2 == null) 35 | } 36 | 37 | def dropDbCluster(dbName: String) = { 38 | runOnAllNodes(s"DROP DATABASE IF EXISTS $dbName") 39 | .count(x => x._2 == null) 40 | } 41 | 42 | def getClusterNodes() = { 43 | val clusterName = isClusterNameProvided() 44 | using(ds.getConnection) { conn => 45 | val statement = conn.createStatement() 46 | val rs = statement.executeQuery(s"select host_name, host_address from system.clusters where cluster == '$clusterName'") 47 | val r = rs.map(x => x.getString("host_name")) 48 | require(r.nonEmpty, s"cluster $clusterName not found") 49 | r 50 | } 51 | } 52 | 53 | private def runOnAllNodes(sql: String) = { 54 | getClusterNodes().map{ nodeIp => 55 | val nodeDs = ClickhouseConnectionFactory.get(nodeIp) 56 | val client = ClickhouseClient()(nodeDs) 57 | (nodeIp, client.query(sql)) 58 | } 59 | } 60 | 61 | private def isClusterNameProvided() = { 62 | clusterNameO match { 63 | case None => throw new Exception("cluster name is required") 64 | case Some(clusterName) => clusterName 65 | } 66 | } 67 | } --------------------------------------------------------------------------------
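`ClickhouseClient` resolves the cluster topology from `system.clusters` and fans each statement out to every node. A minimal usage sketch, assuming a ClickHouse server reachable on localhost:8123 and the `perftest_1shards_1replicas` cluster name used elsewhere in this repo:

``` scala
import io.clickhouse.ext.{ClickhouseClient, ClickhouseConnectionFactory}

object ClickhouseClientSketch extends App {
  // The factory caches one datasource per (host, port), so repeated lookups reuse the same instance.
  implicit val ds = ClickhouseConnectionFactory.get("localhost") // assumed host, default HTTP port 8123

  // The cluster name must match a cluster defined in the server's config.xml.
  val client = ClickhouseClient(Some("perftest_1shards_1replicas"))

  // Runs "create database if not exists" on every node listed in system.clusters,
  // then prints the host names the client resolved for that cluster.
  client.createDbCluster("tmp1")
  client.getClusterNodes().foreach(println)
}
```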
/src/main/scala/io/clickhouse/ext/ClickhouseConnectionFactory.scala: -------------------------------------------------------------------------------- 1 | package io.clickhouse.ext 2 | 3 | import java.util.Properties 4 | import ru.yandex.clickhouse.ClickHouseDataSource 5 | import ru.yandex.clickhouse.settings.ClickHouseProperties 6 | 7 | object ClickhouseConnectionFactory extends Serializable{ 8 | 9 | private val dataSources = scala.collection.mutable.Map[(String, Int), ClickHouseDataSource]() 10 | 11 | def get(host: String, port: Int = 8123): ClickHouseDataSource ={ 12 | dataSources.get((host, port)) match { 13 | case Some(ds) => 14 | ds 15 | case None => 16 | val ds = createDatasource(host, port = port) 17 | dataSources += ((host, port) -> ds) 18 | ds 19 | } 20 | } 21 | 22 | private def createDatasource(host: String, dbO: Option[String] = None, port: Int = 8123) = { 23 | val props = new Properties() 24 | dbO map {db => props.setProperty("database", db)} 25 | 26 | val clickHouseProps = new ClickHouseProperties(props) 27 | new ClickHouseDataSource(s"jdbc:clickhouse://$host:$port", clickHouseProps) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/io/clickhouse/ext/ClickhouseResultSetExt.scala: -------------------------------------------------------------------------------- 1 | package io.clickhouse.ext 2 | 3 | object ClickhouseResultSetExt{ 4 | implicit class ResultSetExt(rs: java.sql.ResultSet){ 5 | 6 | def map[T](delegate: (java.sql.ResultSet) => T): Seq[T] = { 7 | var results = List[T]() 8 | while(rs.next()){ 9 | results = delegate(rs) :: results 10 | } 11 | results 12 | } 13 | 14 | def toTab = { 15 | // rs meta: (colId, name, type) 16 | val header = getMeta.map(v => s"${v._2}").mkString("\t") 17 | 18 | val body = getData.map{ row => 19 | row.map(v => s"$v").mkString("\t") 20 | }.mkString("\n") 21 | 22 | val table = List(header, body).mkString("\n") 23 | println(s"%table $table") 24 | } 25 | 26 | def getMeta = { 27 | 1 to rs.getMetaData.getColumnCount map { i => 28 | (i, rs.getMetaData.getColumnName(i), rs.getMetaData.getColumnTypeName(i)) 29 | } 30 | } 31 | 32 | def getData = { 33 | val meta = getMeta 34 | val results = scala.collection.mutable.MutableList[Seq[AnyRef]]() 35 | while(rs.next()){ 36 | val row = meta.map(i => rs.getObject(i._1)) 37 | results += row 38 | } 39 | results.toList 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/io/clickhouse/ext/ClusterResultSet.scala: -------------------------------------------------------------------------------- 1 | package io.clickhouse.ext 2 | 3 | case class ClusterResultSet(clusterRs: Seq[(String, java.sql.ResultSet)]){ 4 | import io.clickhouse.ext.ClickhouseResultSetExt._ 5 | 6 | def get = clusterRs 7 | 8 | def toTab = { 9 | val firstRow = clusterRs.head 10 | val firstRowRs = firstRow._2 11 | 12 | val metaTab = if(firstRowRs != null){ 13 | val meta = firstRowRs.getMeta 14 | ("host" :: meta.map(x => s"${x._2}").toList).mkString("\t") 15 | }else{ 16 | Seq("host", "result").mkString("\t") 17 | } 18 | 19 | val bodyTab = clusterRs.map{ cur => 20 | val hostIp = cur._1 21 | if(cur._2 != null){ 22 | val ds = cur._2.getData // Seq[Seq[AnyRef]] 23 | ds.map{ row => 24 | (hostIp :: row.map(v => s"$v").toList).mkString("\t") 25 | }.mkString("\n") 26 | }else{ 27 | Seq(hostIp, "null").mkString("\t") 28 | } 29 | }.mkString("\n") 30 | 31 | val table = List(metaTab, bodyTab).mkString("\n") 32 | println(s"%table 
$table") 33 | } 34 | } -------------------------------------------------------------------------------- /src/main/scala/io/clickhouse/ext/Utils.scala: -------------------------------------------------------------------------------- 1 | package io.clickhouse.ext 2 | 3 | object Utils { 4 | def using[A, B <: {def close(): Unit}] (closeable: B) (f: B => A): A = 5 | try { 6 | f(closeable) 7 | } 8 | finally { 9 | closeable.close() 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/main/scala/io/clickhouse/ext/spark/DataFrameExt.scala: -------------------------------------------------------------------------------- 1 | package io.clickhouse.ext.spark 2 | 3 | import io.clickhouse.ext.{ClickhouseClient, ClickhouseConnectionFactory} 4 | import ru.yandex.clickhouse.ClickHouseDataSource 5 | import io.clickhouse.ext.Utils._ 6 | import org.apache.spark.sql.types._ 7 | 8 | object ClickhouseSparkExt{ 9 | implicit def extraOperations(df: org.apache.spark.sql.DataFrame) = DataFrameExt(df) 10 | } 11 | 12 | case class DataFrameExt(df: org.apache.spark.sql.DataFrame) extends Serializable { 13 | 14 | def dropClickhouseDb(dbName: String, clusterNameO: Option[String] = None) 15 | (implicit ds: ClickHouseDataSource){ 16 | val client = ClickhouseClient(clusterNameO)(ds) 17 | clusterNameO match { 18 | case None => client.dropDb(dbName) 19 | case Some(x) => client.dropDbCluster(dbName) 20 | } 21 | } 22 | 23 | def createClickhouseDb(dbName: String, clusterNameO: Option[String] = None) 24 | (implicit ds: ClickHouseDataSource){ 25 | val client = ClickhouseClient(clusterNameO)(ds) 26 | clusterNameO match { 27 | case None => client.createDb(dbName) 28 | case Some(x) => client.createDbCluster(dbName) 29 | } 30 | } 31 | 32 | def createClickhouseTable(dbName: String, tableName: String, partitionColumnName: String, indexColumns: Seq[String], clusterNameO: Option[String] = None) 33 | (implicit ds: ClickHouseDataSource){ 34 | val client = ClickhouseClient(clusterNameO)(ds) 35 | val sqlStmt = createClickhouseTableDefinitionSQL(dbName, tableName, partitionColumnName, indexColumns) 36 | clusterNameO match { 37 | case None => client.query(sqlStmt) 38 | case Some(clusterName) => 39 | // create local table on every node 40 | client.queryCluster(sqlStmt) 41 | // create distrib table (view) on every node 42 | val sqlStmt2 = s"CREATE TABLE IF NOT EXISTS ${dbName}.${tableName}_all AS ${dbName}.${tableName} ENGINE = Distributed($clusterName, $dbName, $tableName, rand());" 43 | client.queryCluster(sqlStmt2) 44 | } 45 | } 46 | 47 | def saveToClickhouse(dbName: String, tableName: String, partitionFunc: (org.apache.spark.sql.Row) => java.sql.Date, partitionColumnName: String = "mock_date", clusterNameO: Option[String] = None, batchSize: Int = 100000) 48 | (implicit ds: ClickHouseDataSource)={ 49 | 50 | val defaultHost = ds.getHost 51 | val defaultPort = ds.getPort 52 | 53 | val (clusterTableName, clickHouseHosts) = clusterNameO match { 54 | case Some(clusterName) => 55 | // get nodes from cluster 56 | val client = ClickhouseClient(clusterNameO)(ds) 57 | (s"${tableName}_all", client.getClusterNodes()) 58 | case None => 59 | (tableName, Seq(defaultHost)) 60 | } 61 | 62 | val schema = df.schema 63 | 64 | // following code is going to be run on executors 65 | val insertResults = df.rdd.mapPartitions((partition: Iterator[org.apache.spark.sql.Row])=>{ 66 | 67 | val rnd = scala.util.Random.nextInt(clickHouseHosts.length) 68 | val targetHost = clickHouseHosts(rnd) 69 | val targetHostDs = 
ClickhouseConnectionFactory.get(targetHost, defaultPort) 70 | 71 | // explicit closing 72 | using(targetHostDs.getConnection) { conn => 73 | 74 | val insertStatementSql = generateInsertStatment(schema, dbName, clusterTableName, partitionColumnName) 75 | val statement = conn.prepareStatement(insertStatementSql) 76 | 77 | var totalInsert = 0 78 | var counter = 0 79 | 80 | while(partition.hasNext){ 81 | 82 | counter += 1 83 | val row = partition.next() 84 | 85 | // create mock date 86 | val partitionVal = partitionFunc(row) 87 | statement.setDate(1, partitionVal) 88 | 89 | // map fields 90 | schema.foreach{ f => 91 | val fieldName = f.name 92 | val fieldIdx = row.fieldIndex(fieldName) 93 | val fieldVal = row.get(fieldIdx) 94 | if(fieldVal != null) 95 | statement.setObject(fieldIdx + 2, fieldVal) 96 | else{ 97 | val defVal = defaultNullValue(f.dataType, fieldVal) 98 | statement.setObject(fieldIdx + 2, defVal) 99 | } 100 | } 101 | statement.addBatch() 102 | 103 | if(counter >= batchSize){ 104 | val r = statement.executeBatch() 105 | totalInsert += r.sum 106 | counter = 0 107 | } 108 | 109 | } // end: while 110 | 111 | if(counter > 0) { 112 | val r = statement.executeBatch() 113 | totalInsert += r.sum 114 | counter = 0 115 | } 116 | 117 | // return: Seq((host, insertCount)) 118 | List((targetHost, totalInsert)).toIterator 119 | } 120 | 121 | }).collect() 122 | 123 | // aggr insert results by hosts 124 | insertResults.groupBy(_._1) 125 | .map(x => (x._1, x._2.map(_._2).sum)) 126 | } 127 | 128 | private def generateInsertStatment(schema: org.apache.spark.sql.types.StructType, dbName: String, tableName: String, partitionColumnName: String) = { 129 | val columns = partitionColumnName :: schema.map(f => f.name).toList 130 | val vals = 1 to (columns.length) map (i => "?") 131 | s"INSERT INTO $dbName.$tableName (${columns.mkString(",")}) VALUES (${vals.mkString(",")})" 132 | } 133 | 134 | private def defaultNullValue(sparkType: org.apache.spark.sql.types.DataType, v: Any) = sparkType match { 135 | case DoubleType => 0 136 | case LongType => 0 137 | case FloatType => 0 138 | case IntegerType => 0 139 | case StringType => null 140 | case BooleanType => false 141 | case _ => null 142 | } 143 | 144 | private def createClickhouseTableDefinitionSQL(dbName: String, tableName: String, partitionColumnName: String, indexColumns: Seq[String])= { 145 | 146 | val header = s""" 147 | CREATE TABLE IF NOT EXISTS $dbName.$tableName( 148 | """ 149 | 150 | val columns = s"$partitionColumnName Date" :: df.schema.map{ f => 151 | Seq(f.name, sparkType2ClickhouseType(f.dataType)).mkString(" ") 152 | }.toList 153 | val columnsStr = columns.mkString(",\n") 154 | 155 | val footer = s""" 156 | )ENGINE = MergeTree($partitionColumnName, (${indexColumns.mkString(",")}), 8192); 157 | """ 158 | 159 | Seq(header, columnsStr, footer).mkString("\n") 160 | } 161 | 162 | private def sparkType2ClickhouseType(sparkType: org.apache.spark.sql.types.DataType)= sparkType match { 163 | case LongType => "Int64" 164 | case DoubleType => "Float64" 165 | case FloatType => "Float32" 166 | case IntegerType => "Int32" 167 | case StringType => "String" 168 | case BooleanType => "UInt8" 169 | case _ => "unknown" 170 | } 171 | 172 | } 173 | -------------------------------------------------------------------------------- /src/test/scala/DFExtSpec.scala: -------------------------------------------------------------------------------- 1 | 2 | import org.scalatest._ 3 | import io.clickhouse.ext.ClickhouseConnectionFactory 4 | import 
io.clickhouse.ext.spark.ClickhouseSparkExt._ 5 | import org.apache.spark.sql.SparkSession 6 | 7 | case class Row1(name: String, v: Int, v2: Int) 8 | 9 | class TestSpec extends FlatSpec with Matchers { 10 | 11 | "case0" should "" in { 12 | 13 | val max = 25e6 14 | val monthSize = max / 11 15 | val daySize = monthSize / 28 16 | 17 | def yearMap(chrom: String) = { 18 | 1900 + (math.abs(chrom.hashCode) % 200) 19 | } 20 | 21 | def monthDayMap(pos: Int) = { 22 | val m = (pos / monthSize).toInt 23 | val d = ((pos % monthSize) / daySize).toInt 24 | (m + 1, d + 1) 25 | } 26 | 27 | val r = (5024637 to 48119824).toList map { pos => 28 | monthDayMap(pos) 29 | } 30 | 31 | val month_range = r.map(_._1).distinct 32 | val day_range = r.map(_._2).distinct 33 | 34 | assert(true) 35 | } 36 | 37 | "case 11" should "" in { 38 | 39 | val a = 1 40 | 41 | def calc(pos: Int) = { 42 | val x = pos / 25e6 * 348 43 | val m = x % 12 44 | val d = x % 29 45 | (m.toInt, d.toInt) 46 | } 47 | 48 | val r = (0 to 1000000).toList map { pos => 49 | calc(pos) 50 | } 51 | 52 | val month_range = r.map(_._1).distinct 53 | val day_range = r.map(_._2).distinct 54 | 55 | 56 | assert(true) 57 | } 58 | 59 | "case1" should "ok" in { 60 | 61 | val sparkSession = SparkSession.builder 62 | .master("local") 63 | .appName("local spark") 64 | .getOrCreate() 65 | 66 | val sc = sparkSession.sparkContext 67 | val sqlContext = sparkSession.sqlContext 68 | 69 | // test dframe 70 | val df = sqlContext.createDataFrame(1 to 10 map(i => Row1(s"$i", i, i + 10)) ) 71 | 72 | // clickhouse params 73 | val anyHost = "localhost" 74 | val db = "tmp1" 75 | val tableName = "t1" 76 | // val clusterName = None: Option[String] 77 | // start clickhouse docker using config.xml from clickhouse_files 78 | val clusterName = Some("perftest_1shards_1replicas"): Option[String] 79 | 80 | // define clickhouse connection 81 | implicit val clickhouseDataSource = ClickhouseConnectionFactory.get(anyHost) 82 | 83 | // create db / table 84 | df.dropClickhouseDb(db, clusterName) 85 | df.createClickhouseDb(db, clusterName) 86 | df.createClickhouseTable(db, tableName, "mock_date", Seq("name"), clusterNameO = clusterName) 87 | 88 | // save data 89 | val res = df.saveToClickhouse("tmp1", "t1", (row) => java.sql.Date.valueOf("2000-12-01"), "mock_date", clusterNameO = clusterName) 90 | assert(res.size == 1) 91 | assert(res.get("localhost") == Some(df.count())) 92 | 93 | true should === (true) 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /src/test/scala/UtilsSpec.scala: -------------------------------------------------------------------------------- 1 | import java.util.Properties 2 | 3 | import org.scalatest.{FlatSpec, Matchers} 4 | import io.clickhouse.ext.Utils._ 5 | 6 | class UtilsSpec extends FlatSpec with Matchers{ 7 | 8 | "case 1" should "ok" in { 9 | 10 | var f = false 11 | case class Mock(){ 12 | def print(): Unit = { 13 | println("mock print") 14 | } 15 | def close(): Unit ={ 16 | f = true 17 | } 18 | } 19 | 20 | using(Mock()){ mock => 21 | mock.print() 22 | } 23 | assert(f.equals(true)) 24 | } 25 | 26 | } 27 | --------------------------------------------------------------------------------
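`UtilsSpec` only checks that `close()` fires; in the connector itself `using` is the loan pattern guarding every JDBC connection (see `ClickhouseClient.query` and the executor-side code in `DataFrameExt`). A minimal sketch of the same pattern against a live connection, assuming the localhost datasource used by the other tests:

``` scala
import io.clickhouse.ext.Utils.using
import io.clickhouse.ext.ClickhouseConnectionFactory

object UsingSketch extends App {
  val ds = ClickhouseConnectionFactory.get("localhost") // assumed host, default port 8123

  // The connection is closed in using's finally block even if the query throws.
  val version = using(ds.getConnection) { conn =>
    val rs = conn.createStatement().executeQuery("select version()")
    rs.next()
    rs.getString(1)
  }
  println(s"connected to ClickHouse $version")
}
```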