├── .gitignore
├── LICENSE
├── README.md
├── build.sbt
├── project
│   ├── Versions.scala
│   ├── build.properties
│   └── plugins.sbt
└── spark-clickhouse-connector
    └── src
        └── main
            └── scala
                ├── io
                │   └── clickhouse
                │       └── spark
                │           ├── connection
                │           │   ├── ClickHouseDataSource.scala
                │           │   └── ConnectionPooledDBUrl.scala
                │           ├── connector
                │           │   ├── ClickhouseConnector.scala
                │           │   ├── ClickhouseRDD.scala
                │           │   ├── ConnectorConf.scala
                │           │   ├── SparkClickhouseFunctions.scala
                │           │   ├── TableScanner.scala
                │           │   ├── package.scala
                │           │   └── partitioner
                │           │       ├── ClickhousePartition.scala
                │           │       ├── ClickhousePartitioner.scala
                │           │       ├── NodeAddress.scala
                │           │       └── PartitionQuery.scala
                │           └── sql
                │               └── RowReaderFactory.scala
                └── org
                    └── apache
                        └── spark
                            ├── TableIterator.scala
                            ├── metrics
                            │   └── clickhouse
                            │       ├── ClickhouseYandexRowMeter.scala
                            │       └── InputMetricsUpdater.scala
                            └── sql
                                └── ClickhouseRow.scala
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # Ignore thumbnails created by Windows
3 | Thumbs.db
4 | # Ignore files built by Visual Studio
5 | *.obj
6 | *.exe
7 | *.pdb
8 | *.user
9 | *.aps
10 | *.pch
11 | *.vspscc
12 | *_i.c
13 | *_p.c
14 | *.ncb
15 | *.suo
16 | *.tlb
17 | *.tlh
18 | *.bak
19 | *.cache
20 | *.ilk
21 | *.log
22 | [Bb]in
23 | [Dd]ebug*/
24 | *.lib
25 | *.sbr
26 | obj/
27 | [Rr]elease*/
28 | _ReSharper*/
29 | [Tt]est[Rr]esult*
30 |
31 | target/
32 | project/boot/
33 | project/target/
34 | project/plugins/project/
35 | sbt/sbt-launch*.jar
36 |
37 | # Scala-IDE specific
38 | .scala_dependencies
39 | .worksheet
40 | .idea
41 | .idea_modules
42 |
43 |
44 |
45 | *.~sql
46 | *.iml
47 | .idea/*
48 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # spark-clickhouse-connector
2 | Spark Clickhouse Connector
3 |
4 | ## Description
5 |
6 | *Package for integration between Yandex Clickhouse and Apache Spark.*
7 |
8 | This library provides functionality to represent a Clickhouse table as a ClickhouseRDD.
9 |
10 | - allows executing SQL queries
11 | - allows filtering rows on the server side
12 | - allows managing Spark partition granularity
13 | - provides failover across Clickhouse replicas
14 | - provides data locality if Clickhouse nodes are collocated with Spark nodes
15 | - provides load balancing across Clickhouse replicas
16 | - provides Clickhouse cluster auto-discovery
17 | - can be used with either driver: ru.yandex.clickhouse.clickhouse-jdbc or com.github.housepower.clickhouse-native-jdbc
18 | - allows throttling consumption of database resources
19 |
20 | ClickhouseRDD is the main entry point for analyzing data in a Clickhouse database with Spark. You can obtain an object of this class by calling SparkClickhouseFunctions.clickhouseTable().
21 |
22 | Configuration properties should be passed in the SparkConf configuration of the SparkContext. ClickhouseRDD needs to open connections to Clickhouse, so it requires the appropriate connection property values to be present in the SparkConf. For the list of required and available properties, see ConnectorConf.
23 |
24 | A `ClickhouseRDD` object gets serialized and sent to every Spark Executor, which then calls the `compute` method to fetch the data on every node. The `getPreferredLocations` method tells Spark the preferred nodes to fetch a partition from, so that the data for the partition are on the same node the task was sent to. If Clickhouse nodes are collocated with Spark nodes, the queries are always sent to the Clickhouse process running on the same node as the Spark Executor process, so data are not transferred between nodes. If a Clickhouse node fails or gets overloaded during a read, the queries are retried on a different node.
25 |
26 | ## Build
27 |
28 | In the root directory, run:
29 |
30 | for Scala 2.11:
31 |
32 | sbt '++2.11.7 assembly'
33 |
34 | or for Scala 2.12:
35 |
36 | sbt '++2.12.9 assembly'
37 |
38 | or for both Scala versions:
39 |
40 | sbt '+ assembly'
41 |
42 | A jar with shaded dependencies will be generated in the directory spark-clickhouse-connector/target/scala-2.11, e.g. spark-clickhouse-connector_2.11-2.4.0_0.23.jar
43 |
44 | To publish to a Nexus repository:
45 |
46 | publish
47 |
48 | Or to publish to the local Maven repository (~/.m2):
49 |
50 | publishM2
51 |
52 | You need to provide two files in your home directory:
53 | - Credentials(Path.userHome / ".sbt" / "credentials")
54 | - Path.userHome.absolutePath + "/.sbt/nexus_url"
55 |
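As a rough sketch of what the build expects (the host, realm, user, and password below are placeholders, not values from this project): `~/.sbt/nexus_url` holds a single line with the Nexus base URL, and `~/.sbt/credentials` uses the standard sbt properties format (realm=, host=, user=, password=), which is equivalent to the inline setting:

```scala
// Placeholder values for illustration only.
credentials += Credentials(
  "Sonatype Nexus Repository Manager", // realm reported by your Nexus instance
  "nexus.example.com",                 // host part of the URL stored in ~/.sbt/nexus_url
  "deploy-user",
  "deploy-password"
)
```
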
56 | ## Usage
57 |
58 | ### Prerequisites
59 |
60 | * Copy spark-clickhouse-connector_2.11-2.4.0_0.25.jar to the Spark lib directory
61 | * Copy the ru.yandex.clickhouse clickhouse-jdbc jar to the Spark lib directory
62 | * Add the dependency to your Spark project
63 | ```xml
64 | <dependency>
65 |     <groupId>io.clickhouse</groupId>
66 |     <artifactId>spark-clickhouse-connector_2.11</artifactId>
67 |     <version>0.23</version>
68 | </dependency>
69 | ```
70 | ### Set parameters
71 |
72 | ```scala
73 | val sparkConf = new SparkConf()
74 | .set("spark.clickhouse.driver","ru.yandex.clickhouse.ClickHouseDriver")
75 | .set("spark.clickhouse.url", "jdbc:clickhouse://192.168.1.1:8123,192.168.1.2:8123")
76 | .set("spark.clickhouse.user", null)
77 | .set("spark.clickhouse.password", null)
78 | .set("spark.clickhouse.connection.per.executor.max", "5")
79 | .set("spark.clickhouse.metrics.enable", "false")
80 | .set("spark.clickhouse.socket.timeout.ms", "10000")
81 | .set("spark.clickhouse.cluster.auto-discovery", "false")
82 |
83 | val ss = SparkSession.builder()
84 | .master("spark_master_url")
85 | .appName("test_app")
86 | .config(sparkConf)
87 | .getOrCreate()
88 |
89 | ```
90 |
91 | ### Add Clickhouse functions to the `SparkContext` and `RDD`:
92 |
93 | ```scala
94 | import io.clickhouse.spark.connector._
95 | ```
96 | ### Loading from Clickhouse
97 |
98 | A sample table indexed by date:
99 |
100 | ```sql
101 | CREATE TABLE IF NOT EXISTS data.some_table_local ON CLUSTER some_cluster
102 | (
103 |     dated Date DEFAULT toDate(started),
104 |     started DateTime,
105 |     counter_id UInt32,
106 |     col1 String,
107 |     col2 String
108 | ) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/some_table_local', '{replica}')
109 | PARTITION BY (toYYYYMM(dated))
110 | ORDER BY (dated, counter_id)
111 |
112 | ```
113 |
114 | Loading data into a ClickhouseRDD with Spark partitions split by `Shard` and by `DAY`. For example, if you have a cluster with 2 shards and need to analyze a 30-day period, the connector creates 60 Spark partitions.
115 |
116 | ```scala
117 |
118 | val sc = ss.sparkContext
119 | val query = s"select started, counter_id, col1, col2 from data.some_table_local " +
120 | s" where started >='${startDate.toString("yyyy-MM-dd HH:mm:ss")}' and started <= '${endDate.toString("yyyy-MM-dd HH:mm:ss")}'"
121 |
122 | sc.clickhouseTable(query, "some_cluster")
123 | .withPeriod(startDate, endDate, partitioner.RangeType.DAY, "dated")
124 | .map(row => {
125 | val counterId = row.getAs[Long]("counter_id")
126 | val started = new DateTime(row.getAs[Timestamp]("started"))
127 | val col1 = row.getAs[String]("col1")
128 | val col2 = row.getAs[String]("col2")
129 |
130 | (started, counterId, col1, col2)
131 | })
132 | .filter ()
133 | .groupBy()
134 | .<...>
135 |
136 | ```
137 |
138 | Loading data into a ClickhouseRDD with Spark partitions split by `Shard` only, without a date range. For example, if you have a cluster with 2 shards and need to analyze a 30-day period, the connector creates only 2 Spark partitions.
139 |
140 |
141 | ```scala
142 |
143 | val sc = ss.sparkContext
144 | val query = s"select started, counter_id, col1, col2 from data.some_table_local " +
145 | s" where dated >='${startDate.toString("yyyy-MM-dd")}' and dated <= '${endDate.toString("yyyy-MM-dd")} " +
146 | s" and counter_id in (1,2,3)'"
147 |
148 | sc.clickhouseTable(query, "some_cluster")
149 | .map(row => {
150 | val counterId = row.getAs[Long]("counter_id")
151 | val started = new DateTime(row.getAs[Timestamp]("started"))
152 | val col1 = row.getAs[String]("col1")
153 | val col2 = row.getAs[String]("col2")
154 |
155 | (started, counterId, col1, col2)
156 | })
157 | .filter ()
158 | .groupBy()
159 | .<...>
160 |
161 | ```
162 |
163 | Loading data into a ClickhouseRDD with Spark partitions split by `Shard` with a custom date range.
164 | For example, if you have a cluster with 2 shards and need to analyze a 30-day period,
165 | the connector creates only 4 Spark partitions in this case.
166 |
167 |
168 | ```scala
169 |
170 | val sc = ss.sparkContext
171 | val query = s"select started, counter_id, col1, col2 from data.some_table_local " +
172 | s" where counter_id in (1,2,3)'"
173 |
174 | sc.clickhouseTable(query, "some_cluster")
175 | .withCustomPartitioning(Seq("dated >= '2019-01-01' and dated < '2019-01-16'",
176 | "dated >= '2019-01-16' and dated < '2019-02-01'"))
177 | .map(row => {
178 | val counterId = row.getAs[Long]("counter_id")
179 | val started = new DateTime(row.getAs[Timestamp]("started"))
180 | val col1 = row.getAs[String]("col1")
181 | val col2 = row.getAs[String]("col2")
182 |
183 | (started, counterId, col1, col2)
184 | })
185 | .filter ()
186 | .groupBy()
187 | .<...>
188 |
189 | ```
190 |
--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
1 | import sbt.Keys.{credentials, libraryDependencies, pomExtra, publishM2Configuration, run, version}
2 | import Versions._
3 | import sbt.{Credentials, CrossVersion}
4 | import sbtassembly.AssemblyKeys.{assembly, assemblyJarName, assemblyOption}
5 |
6 |
7 | val versionStatus = settingKey[Unit]("The Scala version used in cross-build reapply for '+ package', '+ publish'.")
8 | val nexus_local = IO.read(new File(Path.userHome.absolutePath + "/.sbt/nexus_url"))
9 | lazy val creds = Seq(credentials += Credentials(Path.userHome / ".sbt" / "credentials"))
10 |
11 |
12 | val commonSettings = creds ++ Seq(
13 | name := "spark-clickhouse-connector",
14 | organization := "io.clickhouse",
15 | version := "0.25",
16 | crossScalaVersions := Seq(Versions.scala211, Versions.scala212),
17 | crossVersion := CrossVersion.binary,
18 | versionStatus := Versions.status(scalaVersion.value, scalaBinaryVersion.value),
19 | publishMavenStyle := true,
20 |
21 | publishConfiguration := publishConfiguration.value.withOverwrite(true),
22 | publishM2Configuration := publishM2Configuration.value.withOverwrite(true),
23 | publishLocalConfiguration := publishLocalConfiguration.value.withOverwrite(true),
24 |
25 | publishTo := {
26 | if (isSnapshot.value)
27 | Some("Snapshot repository" at nexus_local+ "/content/repositories/snapshots/")
28 | else
29 | Some("Release repository" at nexus_local + "/content/repositories/releases/")
30 | },
31 |
32 | pomExtra :=
33 |   <developers>
34 |     <developer>
35 |       <id>vbezruchko</id>
36 |       <name>Vadim Bezruchko</name>
37 |       <email>va.bezruchko@gmail.com</email>
38 |     </developer>
39 |   </developers>
40 |
41 | )
42 |
43 | //lazy val scalaBinary = scala_version.dropRight(2)
44 |
45 |
46 |
47 | val deps = Seq (
48 | "org.apache.spark" %% "spark-core" % Spark % "provided",
49 | "org.apache.spark" %% "spark-sql" % Spark % "provided",
50 |
51 | "org.slf4j" % "slf4j-api" % Slf4j % "provided",
52 |
53 | "org.eclipse.jetty" % "jetty-server" % SparkJetty % "provided",
54 | "org.eclipse.jetty" % "jetty-servlet" % SparkJetty % "provided",
55 |
56 | "com.codahale.metrics" % "metrics-core" % CodaHaleMetrics % "provided",
57 | "com.codahale.metrics" % "metrics-json" % CodaHaleMetrics % "provided",
58 |
59 | "ru.yandex.clickhouse" % "clickhouse-jdbc" % clickhouse_jdbc % "provided",
60 |
61 | "joda-time" % "joda-time" % joda_version % "provided",
62 | "org.apache.commons" % "commons-pool2" % commons_pool
63 | )
64 |
65 | //multi-project for shading commons-pool2
66 | lazy val assemblyJar = (project in file("spark-clickhouse-connector"))
67 | .enablePlugins(AssemblyPlugin)
68 | .settings(commonSettings)
69 | .settings(libraryDependencies ++=deps)
70 | .settings(
71 | skip in publish := true,
72 | skip in publishM2 := true,
73 | assemblyOption in assembly ~= { _.copy(includeScala = false) },
74 | run in Compile := Defaults.runTask(fullClasspath in Compile, mainClass in (Compile, run), runner in (Compile, run)),
75 | assemblyJarName in assembly := s"${name.value}_${scalaBinaryVersion.value}-${Versions.Spark}_${version.value}.jar",
76 | assemblyShadeRules in assembly := Seq(
77 | ShadeRule.rename("org.apache.commons.**" -> "shade.io.clickhouse.apache.commons.@1").inAll
78 | )
79 | )
80 |
81 | //hide shaded dependencies
82 | lazy val connectorDistribution = (project in file("./"))
83 | .settings(commonSettings)
84 | .settings(
85 | packageBin in Compile := (assembly in (assemblyJar, Compile)).value,
86 | assembly := (assembly in assemblyJar).value
87 |
88 | )
--------------------------------------------------------------------------------
/project/Versions.scala:
--------------------------------------------------------------------------------
1 | import Versions.JDK
2 |
3 | import scala.util.Properties
4 |
5 | object Versions {
6 |
7 | val scala211 = "2.11.7"
8 | val scala212 = "2.12.9"
9 | val Spark = "2.4.0"
10 | val Slf4j = "1.6.1"
11 | val SparkJetty = "8.1.14.v20131031"
12 | val CodaHaleMetrics = "3.0.2"
13 | val commons_pool = "2.5.0"
14 | val clickhouse_jdbc = "0.1.50"
15 | val JDK = "1.8"
16 | val joda_version = "2.10.6"
17 |
18 | val status = (versionInReapply: String, binaryInReapply: String) =>
19 | println(s"""
20 | | Scala: $versionInReapply
21 | | Scala Binary: $binaryInReapply
22 | | Java: target=$JDK user=${Properties.javaVersion}
23 | """.stripMargin)
24 | }
25 |
--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version=1.2.8
2 |
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
1 |
2 | resolvers += Classpaths.sbtPluginReleases
3 |
4 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.10")
5 |
6 | addSbtPlugin("com.scalapenos" % "sbt-prompt" % "1.0.2")
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/spark-clickhouse-connector/src/main/scala/io/clickhouse/spark/connection/ClickHouseDataSource.scala:
--------------------------------------------------------------------------------
1 | package io.clickhouse.spark.connection
2 |
3 | import java.util.regex.Pattern
4 |
5 | case class ClickHouseDataSource(value: Map[String, String], database: String)
6 |
7 | object ClickHouseDataSource {
8 | private val JDBC_PREFIX = "jdbc:"
9 | private val JDBC_CLICKHOUSE_PREFIX = JDBC_PREFIX + "clickhouse:"
10 | private val URL_TEMPLATE = Pattern.compile(JDBC_CLICKHOUSE_PREFIX + "//([a-zA-Z0-9_:,.-]+)(/[a-zA-Z0-9_]+)?")
11 |
12 | def apply(url: String): ClickHouseDataSource = splitUrl(url)
13 |
14 | private def splitUrl(url: String): ClickHouseDataSource = {
15 | val m = URL_TEMPLATE.matcher(url)
16 | if (!m.matches) throw new IllegalArgumentException("Incorrect url")
17 | var database = m.group(2)
18 | if (database == null) database = ""
19 | val hosts = m.group(1).split(",")
20 |
21 | val value =
22 | hosts.map(hostWithPort =>
23 | (hostWithPort.split(":")(0), JDBC_CLICKHOUSE_PREFIX + "//" + hostWithPort + database)).toMap
24 |
25 | new ClickHouseDataSource(value, database)
26 |
27 | }
28 |
29 | def apply(hosts: Iterable[String], port: Int, database: String): ClickHouseDataSource = {
30 |
31 | val value = hosts.map(host => (host, JDBC_CLICKHOUSE_PREFIX + s"//$host:$port" + database)).toMap
32 |
33 | new ClickHouseDataSource(value, database)
34 | }
35 | }
36 |
37 |
38 |
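A small usage sketch (hypothetical addresses) of the two `apply` overloads above, showing the host-to-JDBC-URL map they build for the connection pool:

```scala
import io.clickhouse.spark.connection.ClickHouseDataSource

// Parse a multi-host JDBC URL: keys are host names, values are per-host JDBC URLs.
val fromUrl = ClickHouseDataSource("jdbc:clickhouse://192.168.1.1:8123,192.168.1.2:8123/default")
// fromUrl.database == "/default"
// fromUrl.value == Map(
//   "192.168.1.1" -> "jdbc:clickhouse://192.168.1.1:8123/default",
//   "192.168.1.2" -> "jdbc:clickhouse://192.168.1.2:8123/default")

// Build the same structure from a host list and a default port
// (used when cluster auto-discovery is enabled); the database keeps its leading slash.
val fromHosts = ClickHouseDataSource(Seq("192.168.1.1", "192.168.1.2"), 8123, "/default")
```
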
--------------------------------------------------------------------------------
/spark-clickhouse-connector/src/main/scala/io/clickhouse/spark/connection/ConnectionPooledDBUrl.scala:
--------------------------------------------------------------------------------
1 | package io.clickhouse.spark.connection
2 |
3 | import java.io.Serializable
4 | import java.sql.{Connection, Driver, SQLException, Statement}
5 | import java.util.{NoSuchElementException, Properties}
6 |
7 | import org.apache.commons.pool2.impl.{DefaultPooledObject, GenericKeyedObjectPool, GenericKeyedObjectPoolConfig}
8 | import org.apache.commons.pool2.{KeyedPooledObjectFactory, PooledObject}
9 | import org.slf4j.LoggerFactory
10 | import ru.yandex.clickhouse.except.ClickHouseException
11 |
12 | case class JdbcConnection(shard: String, connection: Connection)
13 |
14 |
15 | class ConnectionPooledDBUrl(val dataSource: Map[String, String],
16 | val driverName: String,
17 | val poolSizePerShard: Int,
18 | val socketTimeoutMs: Int,
19 | val user: String,
20 | val password: String) extends Serializable {
21 |
22 | private val LOG = LoggerFactory.getLogger(classOf[ConnectionPooledDBUrl])
23 |
24 | private val driver = Class.forName(driverName).newInstance.asInstanceOf[Driver]
25 |
26 | private val connectionProperties = {
27 | val prop = new Properties
28 | prop.put("socket_timeout", socketTimeoutMs.toString)
29 |
30 | if (user != null) {
31 | prop.put("user", user)
32 | }
33 | if (password != null) {
34 | prop.put("password", password)
35 | }
36 | prop
37 | }
38 |
39 | private val pool = {
40 |
41 | val config = new GenericKeyedObjectPoolConfig
42 |
43 | config.setMaxTotalPerKey(poolSizePerShard)
44 | config.setTestOnBorrow(true)
45 | config.setTestOnReturn(false)
46 |
47 | new GenericKeyedObjectPool[String, JdbcConnection](new PoolableFactory, config)
48 | }
49 |
50 |
51 | def getConnection(shard: String): JdbcConnection = this.pool.borrowObject(shard)
52 |
53 | def releaseConnection(con: JdbcConnection): Unit = {
54 | try
55 | this.pool.returnObject(con.shard, con)
56 | catch {
57 | case ex: Exception =>
58 | LOG.warn("Can not close connection.", ex)
59 | }
60 | }
61 |
62 | implicit def funcToRunnable(func: () => Unit): Runnable = () => func()
63 |
64 | def close(): Unit = {
65 | new Thread(() => {
66 |
67 | try {
68 | val p = this.pool
69 | LOG.debug(">>>> Clearing pool, active: {}, idle: {}", p.getNumActive, p.getNumIdle)
70 | p.clear()
71 | while ( {
72 | p.getNumActive > 0
73 | }) {
74 | p.setMaxTotal(p.getNumActive)
75 | try
76 | Thread.sleep(p.getMaxWaitMillis)
77 | catch {
78 | case _: InterruptedException =>
79 | // do nothing
80 | }
81 | }
82 | LOG.debug(">>>> Closing pool, active: {}, idle: {}", p.getNumActive, p.getNumIdle)
83 | p.close()
84 | } catch {
85 | case ex: Exception =>
86 | LOG.warn(">>>> Exception closing pool", ex)
87 | }
88 |
89 | }).start()
90 | }
91 |
92 |
93 | private class PoolableFactory extends KeyedPooledObjectFactory[String, JdbcConnection] {
94 | @throws[SQLException]
95 | override def makeObject(shard: String): PooledObject[JdbcConnection] = {
96 | val dbURL = dataSource(shard)
97 | val connection = driver.connect(dbURL, connectionProperties)
98 | new DefaultPooledObject[JdbcConnection](JdbcConnection(shard, connection))
99 | }
100 |
101 | @throws[SQLException]
102 | override def destroyObject(key: String, obj: PooledObject[JdbcConnection]): Unit = {
103 | val dbURL = dataSource.get(key)
104 | LOG.debug("---- Closing connection in pool {}", dbURL)
105 | obj.getObject.connection.close()
106 | }
107 |
108 | override def validateObject(key: String, obj: PooledObject[JdbcConnection]): Boolean = {
109 | val dbURL = dataSource.get(key)
110 | val connection = obj.getObject.connection
111 | var st: Statement = null
112 | try {
113 | st = connection.createStatement
114 | st.execute("SELECT 1")
115 | return true
116 | } catch {
117 | case _: SQLException =>
118 | LOG.info("Invalidate connection for url: {}", dbURL)
119 | } finally try
120 | if (st != null) st.close()
121 | catch {
122 | case ex: SQLException =>
123 | LOG.info("Exception closing statement", ex)
124 | }
125 | false
126 | }
127 |
128 | override def activateObject(key: String, `object`: PooledObject[JdbcConnection]): Unit = {
129 | }
130 |
131 | override def passivateObject(key: String, `object`: PooledObject[JdbcConnection]): Unit = {
132 | }
133 | }
134 |
135 |
136 | }
137 |
138 |
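A minimal usage sketch of the pool API consumed by `TableScanner` (hypothetical host and URL, no credentials):

```scala
import io.clickhouse.spark.connection.{ConnectionPooledDBUrl, JdbcConnection}

val pool = new ConnectionPooledDBUrl(
  Map("192.168.1.1" -> "jdbc:clickhouse://192.168.1.1:8123/default"), // shard key -> JDBC URL
  "ru.yandex.clickhouse.ClickHouseDriver",
  5,     // max pooled connections per shard
  10000, // socket timeout, ms
  null,  // user
  null)  // password

val jdbc: JdbcConnection = pool.getConnection("192.168.1.1")
try {
  val rs = jdbc.connection.createStatement().executeQuery("SELECT 1")
  while (rs.next()) println(rs.getInt(1))
} finally {
  pool.releaseConnection(jdbc) // return the connection to the pool for reuse
}
pool.close() // drains and closes the pool on a background thread
```
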
--------------------------------------------------------------------------------
/spark-clickhouse-connector/src/main/scala/io/clickhouse/spark/connector/ClickhouseConnector.scala:
--------------------------------------------------------------------------------
1 | package io.clickhouse.spark.connector
2 |
3 | import java.net.InetAddress
4 | import java.util.ConcurrentModificationException
5 |
6 | import io.clickhouse.spark.connection.{ClickHouseDataSource, ConnectionPooledDBUrl, JdbcConnection}
7 | import io.clickhouse.spark.connector.ClickhouseConnector.getConnectionPool
8 | import io.clickhouse.spark.connector.partitioner.{ClickhousePartition, PartitionQuery}
9 | import org.apache.spark.SparkContext
10 | import org.apache.spark.internal.Logging
11 |
12 | import scala.collection.concurrent.TrieMap
13 |
14 | final case class ShardUnavailableException(private val message: String = "",
15 | private val cause: Throwable = None.orNull)
16 | extends Exception(message, cause)
17 |
18 | class ClickhouseConnector(conf: ConnectorConf,
19 | initDataSource: ClickHouseDataSource,
20 | cluster: Option[String]
21 | )
22 | extends Serializable with Logging {
23 |
24 | val (
25 | dataSource: Map[Int, Seq[InetAddress]],
26 | theDataSource: ClickHouseDataSource
27 | ) = makeDataSource()
28 |
29 | def execute(partition: ClickhousePartition, query: String): TableScanner = {
30 |
31 | try {
32 | executeStatement(
33 | partition.endpoints.map(_.getHostAddress).iterator,
34 | PartitionQuery.queryForPartition(query, partition),
35 | getConnectionPool(conf, theDataSource)
36 | )
37 | }
38 | catch {
39 | case e: ShardUnavailableException =>
40 | throw ShardUnavailableException(
41 | s"all servers for shard (${partition.shardId}) are not accessible: (${partition.endpoints.map(_.getHostAddress).mkString(",")})", e)
42 | }
43 | }
44 |
45 | @scala.annotation.tailrec
46 | private def executeStatement(shardNodes: Iterator[String], query: String, cp: ConnectionPooledDBUrl): TableScanner = {
47 |
48 | if (!shardNodes.hasNext) throw ShardUnavailableException() //there are no replica left
49 |
50 | val replicaServer: String = shardNodes.next()
51 | var jdbc: JdbcConnection = null
52 | try {
53 | jdbc = cp.getConnection(replicaServer)
54 |
55 | val statement = jdbc.connection.prepareStatement(query)
56 |
57 | new TableScanner(cp, jdbc, statement)
58 | }
59 | catch {
60 | case e: Throwable =>
61 | // go to the next replica with warning message
62 | logWarning(s"Failed to execute query at $replicaServer", e)
63 |
64 | if (jdbc != null) {
65 | cp.releaseConnection(jdbc)
66 | }
67 | executeStatement(shardNodes, query, cp)
68 | }
69 | }
70 |
71 | private def getClusterMetadata = {
72 | val query =
73 | s"select shard_num, groupArray(host_name) as names, groupArray(host_address) as addresses from system.clusters " +
74 | s"where cluster = '${cluster.get}' group by shard_num"
75 |
76 | executeStatement(initDataSource.value.keys.iterator, query, getConnectionPool(conf, initDataSource))
77 | .map(rs => (rs.getInt("shard_num"),
78 | rs.getArray("names").getArray.asInstanceOf[Array[String]],
79 | rs.getArray("addresses").getArray.asInstanceOf[Array[String]]))
80 | .toList
81 | }
82 |
83 | /** Finds the host in the cluster metadata and detects its shard.
84 | * Returns the shard_num. */
85 | private def detectShard(clusterMetadata: List[(Int, Array[String], Array[String])], host: String): Option[Int] = {
86 | clusterMetadata.find(v => v._2.contains(host) || v._3.contains(host)).map(_._1)
87 | }
88 |
89 | private def makeDataSource(): (Map[Int, Seq[InetAddress]], ClickHouseDataSource) = {
90 |
91 | if (cluster.isDefined) {
92 |
93 | val clusterMeta = getClusterMetadata
94 |
95 | if (!conf.clickhouseAutoDiscoveryEnable) {
96 |
97 | // For each host in the data source, detect its shard_id and then group the hosts by shard (replicas).
98 | // Hosts that are not contained in the cluster metadata are filtered out.
99 | val ds =
100 | initDataSource.value.keys
101 | .map(v => (detectShard(clusterMeta, v), v)) //(shard_num, host)
102 | .filter(_._1.isDefined) //filter undefined hosts
103 | .map(v => (v._1.get, v._2)) //remove Option[]
104 | .groupBy(_._1) //group by shard_num
105 | .map(v => (v._1, v._2.map(m => InetAddress.getByName(m._2)).toList)) // (shard_num, List(InetAddress))
106 | (ds, initDataSource)
107 | }
108 | else {
109 | logDebug("cluster auto-discovery enabled")
110 | //cluster auto-discovery enabled, make new datasource from cluster metadata
111 | val newDataSource =
112 | ClickHouseDataSource(clusterMeta.flatMap(_._3), conf.clickhousePortDefault, initDataSource.database)
113 |
114 | val ds = clusterMeta
115 | .map(v => (v._1, v._3.map(m => InetAddress.getByName(m)).toList))
116 | .toMap
117 |
118 | (ds, newDataSource)
119 | }
120 | }
121 | else {
122 | // Used for a Clickhouse installation without the 'cluster' option, e.g. a single-server installation.
123 | // It is assumed that all hosts in the data source belong to a single shard and contain the same data.
124 | (Map(0 -> initDataSource.value.keys.map(InetAddress.getByName).toList), initDataSource)
125 | }
126 | }
127 | }
128 |
129 |
130 | object ClickhouseConnector {
131 |
132 | private val connectionPoolCache = new TrieMap[(ConnectorConf, ClickHouseDataSource), ConnectionPooledDBUrl]
133 |
134 | def apply(sc: SparkContext, cluster: Option[String]): ClickhouseConnector = {
135 | val conf: ConnectorConf = ConnectorConf.fromSparkConf(sc.getConf)
136 |
137 | val dataSource = ClickHouseDataSource(conf.сlickhouseUrl)
138 |
139 | new ClickhouseConnector(conf, dataSource, cluster)
140 | }
141 |
142 | def getConnectionPool(conf: ConnectorConf, ds: ClickHouseDataSource): ConnectionPooledDBUrl = synchronized {
143 |
144 | connectionPoolCache.get((conf, ds)) match {
145 | case Some(value) =>
146 | value
147 | case None =>
148 | val value = new ConnectionPooledDBUrl(ds.value, conf.сlickhouseDriver,
149 | conf.maxConnectionsPerExecutor, conf.сlickhouseSocketTimeoutMs,
150 | conf.clickhouseUser, conf.clickhousePassword)
151 | connectionPoolCache.putIfAbsent((conf, ds), value) match {
152 | case None =>
153 | value
154 | case Some(_) =>
155 | throw new ConcurrentModificationException("It shouldn't reach here as it is synchronized")
156 | }
157 | }
158 | }
159 |
160 | }
161 |
--------------------------------------------------------------------------------
/spark-clickhouse-connector/src/main/scala/io/clickhouse/spark/connector/ClickhouseRDD.scala:
--------------------------------------------------------------------------------
1 | package io.clickhouse.spark.connector
2 |
3 | import io.clickhouse.spark.connector.partitioner._
4 | import org.apache.spark._
5 | import org.apache.spark.internal.Logging
6 | import org.apache.spark.metrics.clickhouse.InputMetricsUpdater
7 | import org.apache.spark.rdd.RDD
8 | import org.apache.spark.sql.{ClickhouseRowFactory, Row}
9 | import org.joda.time.DateTime
10 |
11 |
12 | /** RDD representing a Table Scan of a Clickhouse table.
13 | *
14 | * This class is the main entry point for analyzing data in Clickhouse database with Spark.
15 | * Obtain objects of this class by calling SparkClickhouseFunctions.clickhouseTable()
16 | *
17 | * Configuration properties should be passed in the [[org.apache.spark.SparkConf SparkConf]]
18 | * configuration of [[org.apache.spark.SparkContext SparkContext]].
19 | * `ClickhouseRDD` needs to open connection to Clickhouse, therefore it requires appropriate
20 | * connection property values to be present in [[org.apache.spark.SparkConf SparkConf]].
21 | * For the list of required and available properties, see [[ConnectorConf]].
22 | *
23 | * `ClickhouseRDD` divides the data set into smaller partitions, processed locally on every
24 | * cluster node. There are several partitioning strategies:
25 | * - DatedClickhousePartitioner splits each shard into small partitions by date range.
26 | * Several range types are supported, e.g. daily and hourly.
27 | * - SimpleClickhousePartitioner creates one partition per shard.
28 | * - CustomClickhousePartitioner applies a custom split strategy to each shard.
29 | *
30 | * A `ClickhouseRDD` object gets serialized and sent to every Spark Executor, which then
31 | * calls the `compute` method to fetch the data on every node. The `getPreferredLocations`
32 | * method tells Spark the preferred nodes to fetch a partition from, so that the data for
33 | * the partition are at the same node the task was sent to. If Clickhouse nodes are collocated
34 | * with Spark nodes, the queries are always sent to the Clickhouse process running on the same
35 | * node as the Spark Executor process, hence data are not transferred between nodes.
36 | * If a Clickhouse node fails or gets overloaded during read, the queries are retried
37 | * to a different node.
38 | *
39 | */
40 | class ClickhouseRDD
41 | (
42 | @transient val sc: SparkContext,
43 | val connector: ClickhouseConnector,
44 | val query: String,
45 | val connectorConf: ConnectorConf,
46 | clickhousePartitioner: ClickhousePartitioner
47 | ) extends RDD[Row](sc, Seq.empty) with Logging {
48 |
49 | type Self = ClickhouseRDD
50 |
51 | //don't override partitioner
52 | //@transient override val partitioner = Some(clickhousePartitioner)
53 |
54 | /** Allows to copy this RDD with changing some of the properties */
55 | protected def copy(
56 | query: String = query,
57 | connectorConf: ConnectorConf = connectorConf,
58 | connector: ClickhouseConnector = connector,
59 | clickhousePartitioner: ClickhousePartitioner = clickhousePartitioner
60 | ): Self = {
61 |
62 | new ClickhouseRDD(
63 | sc = sc,
64 | connector = connector,
65 | query = query,
66 | connectorConf = connectorConf,
67 | clickhousePartitioner = clickhousePartitioner
68 | )
69 | }
70 |
71 | def query(sql: String, cluster: String): Self = {
72 | copy(query = sql)
73 | }
74 |
75 | /**
76 | * Partitioning strategy that splits each shard into smaller partitions by date range.
77 | *
78 | * @param startPeriod begin of date range
79 | * @param endPeriod end of date range
80 | * @param rangeType type of date range. e.g. Hour, Day.
81 | * @param pk name of primary key
82 | */
83 | def withPeriod(startPeriod: DateTime, endPeriod: DateTime, rangeType: RangeType, pk: String = "dated"): Self = {
84 | copy(
85 | clickhousePartitioner = new DatedClickhousePartitioner(connector, (startPeriod, endPeriod), rangeType, pk)
86 | )
87 | }
88 |
89 | /**
90 | * Base partitioning strategy: single partition for each shard.
91 | */
92 | def withoutPartitioning(): Self = {
93 | copy(
94 | clickhousePartitioner = new SimpleClickhousePartitioner(connector)
95 | )
96 | }
97 |
98 | /**
99 | * User defined partitioning strategy
100 | *
101 | * @param customPartitions Sequence of partition predicates used to split each shard into smaller parts.
102 | */
103 | def withCustomPartitioning(customPartitions: Seq[String]): Self = {
104 | copy(
105 | clickhousePartitioner = new CustomClickhousePartitioner(connector, customPartitions)
106 | )
107 | }
108 |
109 | override def compute(split: Partition, context: TaskContext): Iterator[Row] = {
110 |
111 | val partition = split.asInstanceOf[ClickhousePartition]
112 | logDebug(s" Start computation fot partition $partition")
113 | val metricsUpdater = InputMetricsUpdater(context, connectorConf)
114 |
115 | val scanner = connector.execute(partition, query)
116 |
117 | val scannerWithMetrics = scanner.map(metricsUpdater.updateMetrics)
118 |
119 | context.addTaskCompletionListener[Unit] { context =>
120 | val duration = metricsUpdater.finish() / 1000000000d
121 | logInfo(s"Complete computation for partition $partition in $duration%.3f s. Fetched ${scanner.count} rows")
122 | scanner.closeIfNeeded()
123 | }
124 |
125 | val rowFactory = ClickhouseRowFactory(scanner)
126 |
127 | scannerWithMetrics.map(rowFactory.create)
128 | }
129 |
130 | override def getPreferredLocations(split: Partition): Seq[String] = {
131 | logDebug(s"Get preferred locations for $split")
132 | val locations = split.asInstanceOf[ClickhousePartition].endpoints
133 | .flatMap(ev => NodeAddress.hostNames(ev)).toList
134 |
135 | logTrace(s" Locations:\n${locations.mkString("\n")}")
136 | locations
137 | }
138 |
139 | override protected def getPartitions: Array[Partition] = {
140 |
141 | val partitions = clickhousePartitioner.partitions
142 |
143 | logDebug(s"Created total ${partitions.length} partitions for datasource with $clickhousePartitioner.")
144 | logTrace("Partitions: \n" + partitions.mkString("\n"))
145 | partitions
146 | }
147 | }
148 |
--------------------------------------------------------------------------------
/spark-clickhouse-connector/src/main/scala/io/clickhouse/spark/connector/ConnectorConf.scala:
--------------------------------------------------------------------------------
1 | package io.clickhouse.spark.connector
2 |
3 | import org.apache.spark.SparkConf
4 |
5 | case class ConnectorConf(сlickhouseDriver: String = ConnectorConf.DefaultClickhouseDriver,
6 | сlickhouseUrl: String = ConnectorConf.DefaultClickhouseUrl,
7 | maxConnectionsPerExecutor: Int = ConnectorConf.DefaultMaxConnectionsPerExecutor,
8 | clickhouseMetricsEnable: Boolean = ConnectorConf.DefaultClickhouseMetricsEnable,
9 | сlickhouseSocketTimeoutMs: Int = ConnectorConf.DefaultClickhouseSocketTimeoutMs,
10 | clickhouseAutoDiscoveryEnable: Boolean = ConnectorConf.DefaultClickhouseAutoDiscoveryEnable,
11 | clickhousePortDefault: Int = ConnectorConf.DefaultClickhousePortDefault,
12 | clickhouseUser: String = ConnectorConf.DefaultClickhouseUser,
13 | clickhousePassword: String = ConnectorConf.DefaultClickhousePassword
14 | )
15 |
16 | object ConnectorConf {
17 |
18 | val ClickhouseDriverProperty = "spark.clickhouse.driver"
19 | val ClickhouseUrlProperty = "spark.clickhouse.url"
20 | val ClickhouseUserProperty = "spark.clickhouse.user"
21 | val ClickhousePasswordProperty = "spark.clickhouse.password"
22 | val ClickhouseAutoDiscoveryProperty = "spark.clickhouse.cluster.auto-discovery"
23 | val ClickhouseHttpPortDefaultProperty = "spark.clickhouse.cluster.port.default" // used with the auto-discovery option
24 | val ClickhouseSocketTimeoutProperty = "spark.clickhouse.socket.timeout.ms"
25 | val MaxConnectionsPerExecutorProperty = "spark.clickhouse.connection.per.executor.max"
26 | val ClickhouseMetricsEnableProperty = "spark.clickhouse.metrics.enable"
27 |
28 | val DefaultClickhouseDriver = "ru.yandex.clickhouse.ClickHouseDriver"
29 | val DefaultClickhouseUrl = "jdbc:clickhouse://127.0.0.1:8123"
30 | val DefaultMaxConnectionsPerExecutor: Int = 1
31 | val DefaultClickhouseSocketTimeoutMs: Int = 60000
32 | val DefaultClickhouseMetricsEnable: Boolean = false
33 | val DefaultClickhouseAutoDiscoveryEnable: Boolean = false
34 | val DefaultClickhousePortDefault: Int = 8123
35 |
36 | val DefaultClickhouseUser: String = null
37 | val DefaultClickhousePassword: String = null
38 |
39 |
40 | def fromSparkConf(conf: SparkConf): ConnectorConf = {
41 |
42 |
43 | ConnectorConf(
44 | сlickhouseDriver = conf.get(ClickhouseDriverProperty, DefaultClickhouseDriver),
45 | сlickhouseUrl = conf.get(ClickhouseUrlProperty, DefaultClickhouseUrl),
46 | maxConnectionsPerExecutor = conf.getInt(MaxConnectionsPerExecutorProperty, DefaultMaxConnectionsPerExecutor),
47 | clickhouseMetricsEnable = conf.getBoolean(ClickhouseMetricsEnableProperty, DefaultClickhouseMetricsEnable),
48 | сlickhouseSocketTimeoutMs = conf.getInt(ClickhouseSocketTimeoutProperty, DefaultClickhouseSocketTimeoutMs),
49 | clickhouseAutoDiscoveryEnable = conf.getBoolean(ClickhouseAutoDiscoveryProperty, DefaultClickhouseAutoDiscoveryEnable),
50 | clickhousePortDefault = conf.getInt(ClickhouseHttpPortDefaultProperty, DefaultClickhousePortDefault),
51 | clickhouseUser = conf.get(ClickhouseUserProperty, DefaultClickhouseUser),
52 | clickhousePassword = conf.get(ClickhousePasswordProperty, DefaultClickhousePassword)
53 | )
54 | }
55 |
56 | }
57 |
58 |
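A short sketch (illustrative values) of how these properties flow from a `SparkConf` into a `ConnectorConf`:

```scala
import org.apache.spark.SparkConf
import io.clickhouse.spark.connector.ConnectorConf

val sparkConf = new SparkConf()
  .set(ConnectorConf.ClickhouseUrlProperty, "jdbc:clickhouse://192.168.1.1:8123")
  .set(ConnectorConf.MaxConnectionsPerExecutorProperty, "5")
  .set(ConnectorConf.ClickhouseMetricsEnableProperty, "true")

val connectorConf = ConnectorConf.fromSparkConf(sparkConf)
// Properties that are not set fall back to the defaults above, e.g.
// connectorConf.maxConnectionsPerExecutor == 5
// connectorConf.clickhouseMetricsEnable == true
// connectorConf.clickhouseAutoDiscoveryEnable == false (default)
```
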
--------------------------------------------------------------------------------
/spark-clickhouse-connector/src/main/scala/io/clickhouse/spark/connector/SparkClickhouseFunctions.scala:
--------------------------------------------------------------------------------
1 | package io.clickhouse.spark.connector
2 |
3 | import io.clickhouse.spark.connector.partitioner.SimpleClickhousePartitioner
4 | import org.apache.spark.SparkContext
5 |
6 | /** Provides Clickhouse-specific methods on [[org.apache.spark.SparkContext SparkContext]]. */
7 | class SparkClickhouseFunctions(@transient val sc: SparkContext) extends Serializable {
8 |
9 | def clickhouseTable(query: String, cluster: String)
10 | (implicit connector: ClickhouseConnector = ClickhouseConnector(sc, Some(cluster)),
11 | readConf: ConnectorConf = ConnectorConf.fromSparkConf(sc.getConf)
12 | ) = new ClickhouseRDD(sc, connector, query, readConf, SimpleClickhousePartitioner(connector))
13 |
14 | /**
15 | * Used for a Clickhouse installation without the 'cluster' option, e.g. a single-server installation.
16 | * It is assumed that all hosts in the data source belong to a single shard and contain the same data.
17 | */
18 | def clickhouseTableWithoutCluster(query: String)
19 | (implicit connector: ClickhouseConnector = ClickhouseConnector(sc, None),
20 | readConf: ConnectorConf = ConnectorConf.fromSparkConf(sc.getConf)
21 | ) = new ClickhouseRDD(sc, connector, query, readConf, SimpleClickhousePartitioner(connector))
22 |
23 | }
24 |
--------------------------------------------------------------------------------
/spark-clickhouse-connector/src/main/scala/io/clickhouse/spark/connector/TableScanner.scala:
--------------------------------------------------------------------------------
1 | package io.clickhouse.spark.connector
2 |
3 | import java.sql.{PreparedStatement, ResultSet}
4 |
5 | import io.clickhouse.spark.connection.{ConnectionPooledDBUrl, JdbcConnection}
6 | import org.apache.spark.TableIterator
7 | import org.apache.spark.internal.Logging
8 |
9 | class TableScanner(connectionPool: ConnectionPooledDBUrl,
10 | connection: JdbcConnection,
11 | statement: PreparedStatement) extends TableIterator[ResultSet] with Logging {
12 |
13 | private var _count = 0
14 |
15 | val resultSet: ResultSet = statement.executeQuery()
16 |
17 | /** Returns the number of successful invocations of `next` */
18 | def count: Int = _count
19 |
20 | override protected def getNext: ResultSet = {
21 | if (resultSet.next()) {
22 | _count += 1
23 | resultSet
24 | } else {
25 | finished = true
26 | null.asInstanceOf[ResultSet]
27 | }
28 | }
29 |
30 | def close(): Unit = {
31 |
32 | try {
33 | if (null != resultSet) {
34 | resultSet.close()
35 | }
36 | } catch {
37 | case e: Exception => logWarning("Exception closing resultset", e)
38 | }
39 | try {
40 | if (null != statement) {
41 | statement.close()
42 | }
43 | } catch {
44 | case e: Exception => logWarning("Exception closing statement", e)
45 | }
46 | try {
47 | if (null != connection) {
48 | connectionPool.releaseConnection(connection)
49 | }
50 | logDebug(s"release connection: ${connection.shard}")
51 | } catch {
52 | case e: Exception => logWarning(s"Exception releasing connection: ${connection.shard}", e)
53 | }
54 | }
55 |
56 | }
57 |
58 |
--------------------------------------------------------------------------------
/spark-clickhouse-connector/src/main/scala/io/clickhouse/spark/connector/package.scala:
--------------------------------------------------------------------------------
1 | package io.clickhouse.spark
2 |
3 | import org.apache.spark.SparkContext
4 |
5 | package object connector {
6 | implicit def toSparkClickhouseFunctions(sc: SparkContext): SparkClickhouseFunctions =
7 | new SparkClickhouseFunctions(sc)
8 | }
9 |
--------------------------------------------------------------------------------
/spark-clickhouse-connector/src/main/scala/io/clickhouse/spark/connector/partitioner/ClickhousePartition.scala:
--------------------------------------------------------------------------------
1 | package io.clickhouse.spark.connector.partitioner
2 |
3 | import java.net.InetAddress
4 |
5 | import org.apache.spark.Partition
6 | import org.joda.time.{DateTime, Days, Hours}
7 |
8 | sealed trait RangeType {}
9 |
10 | object RangeType {
11 |
12 | case object HOUR extends RangeType
13 |
14 | case object DAY extends RangeType
15 |
16 | }
17 |
18 | case class DateRange(dated: DateTime,
19 | rType: RangeType,
20 | pk: String // primary key name for partitioning
21 | ) {
22 |
23 | def sql(): String = {
24 | if (rType == RangeType.HOUR)
25 | s"$pk = '${dated.hourOfDay.roundFloorCopy.toString("yyyy-MM-dd HH:mm:ss")}'"
26 | else
27 | s"$pk = '${dated.toString("yyyy-MM-dd")}'"
28 | }
29 |
30 | override def toString: String = {
31 | if (rType == RangeType.HOUR)
32 | s"DateRange(${dated.toString("yyyy-MM-dd HH")})"
33 | else
34 | s"DateRange(${dated.toString("yyyy-MM-dd")})"
35 | }
36 | }
37 |
38 | trait EndpointPartition extends Partition {
39 | def endpoints: Iterable[InetAddress]
40 | }
41 |
42 | case class ClickhousePartition(
43 | index: Int,
44 | shardId: Int,
45 | endpoints: Iterable[InetAddress],
46 | partitionSplit: Option[String] // additional primary key clause for Spark partition splitting
47 | ) extends EndpointPartition
48 |
49 |
50 | object DateRange {
51 |
52 | def range(startDate: DateTime, endDate: DateTime, rType: RangeType): Seq[DateTime] = {
53 |
54 | if (rType == RangeType.DAY)
55 | rangeByDay(startDate, endDate)
56 | else
57 | rangeByHour(startDate, endDate)
58 | }
59 |
60 | def rangeByHour(startDate: DateTime, endDate: DateTime): Seq[DateTime] = {
61 |
62 | val hours = Hours.hoursBetween(
63 | startDate.hourOfDay().roundFloorCopy(),
64 | endDate.hourOfDay().roundFloorCopy().plus(1)
65 | ).getHours
66 | (0 to hours).map(i => startDate.plusHours(i)).toList
67 | }
68 |
69 | def rangeByDay(startDate: DateTime, endDate: DateTime): Seq[DateTime] = {
70 |
71 | val days = Days.daysBetween(
72 | startDate.withTimeAtStartOfDay(),
73 | endDate.withTimeAtStartOfDay().plus(1)
74 | ).getDays
75 | (0 to days).map(i => startDate.plusDays(i))
76 | }
77 | }
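A brief illustration (assumed dates) of the predicates and date ranges these helpers produce:

```scala
import io.clickhouse.spark.connector.partitioner.{DateRange, RangeType}
import org.joda.time.DateTime

// Day-grained predicate used to split a shard into per-day Spark partitions.
DateRange(new DateTime(2019, 1, 1, 10, 30), RangeType.DAY, "dated").sql()
// "dated = '2019-01-01'"

// Hour-grained predicate rounds down to the start of the hour.
DateRange(new DateTime(2019, 1, 1, 10, 30), RangeType.HOUR, "dated").sql()
// "dated = '2019-01-01 10:00:00'"

// One DateTime per day between the bounds, inclusive (3 values here).
DateRange.range(new DateTime(2019, 1, 1, 0, 0), new DateTime(2019, 1, 3, 0, 0), RangeType.DAY)
```
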
--------------------------------------------------------------------------------
/spark-clickhouse-connector/src/main/scala/io/clickhouse/spark/connector/partitioner/ClickhousePartitioner.scala:
--------------------------------------------------------------------------------
1 | package io.clickhouse.spark.connector.partitioner
2 |
3 | import io.clickhouse.spark.connector.ClickhouseConnector
4 | import org.apache.spark.Partition
5 | import org.joda.time.DateTime
6 |
7 | trait ClickhousePartitioner extends Serializable {
8 |
9 | val partitions: Array[Partition]
10 |
11 | def numPartitions: Int = partitions.length
12 | }
13 |
14 | /**
15 | * Partitioner that splits each shard into small partitions by date range.
16 | * Several range types are supported, e.g. daily and hourly.
17 | */
18 | class DatedClickhousePartitioner(connector: ClickhouseConnector,
19 | dated: (DateTime, DateTime),
20 | rangeType: RangeType,
21 | primaryKeyName: String
22 | ) extends SupportPartitionReplica with ClickhousePartitioner {
23 |
24 |
25 | override val partitions: Array[Partition] = {
26 |
27 | var i = -1
28 | for (date <- DateRange.range(dated._1, dated._2, rangeType)) yield {
29 | i += 1
30 | for (source <- connector.dataSource) yield {
31 | val rotatedHosts = rotateRight(source._2, i)
32 | val shardId = source._1
33 | // partition index will be set later
34 | ClickhousePartition(0, shardId, rotatedHosts, Some(DateRange(date, rangeType, primaryKeyName).sql()))
35 | }
36 | }
37 | }.flatMap(_.seq)
38 | .zipWithIndex
39 | .map { case (p, index) => p.copy(index = index) }
40 | .toArray[Partition]
41 |
42 | override def toString: String = s"DatedPartitioner with period ${dated._1} - ${dated._2} by $rangeType "
43 | }
44 |
45 | /**
46 | * Partitioner that splits the RDD into one partition per shard.
47 | */
48 | class SimpleClickhousePartitioner(connector: ClickhouseConnector) extends ClickhousePartitioner {
49 |
50 | override val partitions: Array[Partition] = (for {
51 | source <- connector.dataSource
52 | } yield {
53 |
54 | val shardId = source._1
55 | val hosts = source._2
56 | // partition index will be set later
57 | ClickhousePartition(0, shardId, hosts, None)
58 | }).zipWithIndex
59 | .map { case (p, index) => p.copy(index = index) }
60 | .toArray[Partition]
61 |
62 | override def toString: String = s"SimplePartitioner"
63 | }
64 |
65 | /**
66 | * Partitioner with custom split strategy for each shard
67 | */
68 | class CustomClickhousePartitioner(connector: ClickhouseConnector,
69 | partitionSeq: Seq[String]
70 | ) extends SupportPartitionReplica with ClickhousePartitioner {
71 |
72 | override val partitions: Array[Partition] = {
73 |
74 | for (source <- connector.dataSource) yield { // (shard_num, List(InetAddress))
75 |
76 | var i = 0
77 | for (part <- partitionSeq) yield {
78 |
79 | val rotatedHosts = rotateRight(source._2, i)
80 | val shardId = source._1
81 | i += 1
82 | // partition index will be set later
83 | ClickhousePartition(0, shardId, rotatedHosts, Some(part))
84 | }
85 | }
86 | }.flatMap(_.seq)
87 | .zipWithIndex
88 | .map { case (p, index) => p.copy(index = index) }
89 | .toArray[Partition]
90 |
91 | override def toString: String = s"CustomClickhousePartitioner"
92 | }
93 |
94 | object SimpleClickhousePartitioner {
95 | def apply(connector: ClickhouseConnector): SimpleClickhousePartitioner = new SimpleClickhousePartitioner(connector)
96 | }
97 |
98 | /** Supports replica placement by rotating a shard's replica list from partition to partition. */
99 | abstract class SupportPartitionReplica {
100 |
101 |   /** Circular right shift of a collection by i positions. */
102 | def rotateRight[A](seq: Seq[A], i: Int): Seq[A] = {
103 |
104 | val size = seq.size
105 | if (i != 0 && size > 1) {
106 | seq.drop(size - (i % size)) ++ seq.take(size - (i % size))
107 | }
108 | else {
109 | seq.toList
110 | }
111 | }
112 |
113 |
114 | }
115 |
116 |
117 |
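118 | // Usage sketch (illustrative only; `connector` stands for an already-built
119 | // ClickhouseConnector, `rangeType` for a daily RangeType value, and the host
120 | // names are hypothetical):
121 | //
122 | //   val simple = SimpleClickhousePartitioner(connector)   // one partition per shard
123 | //
124 | //   val dated = new DatedClickhousePartitioner(
125 | //     connector,
126 | //     (new DateTime("2020-01-01"), new DateTime("2020-01-07")),
127 | //     rangeType,
128 | //     "event_date")            // primary key / date column used in the generated split predicate
129 | //   // -> one partition per (shard, day), each carrying a date-range predicate
130 | //
131 | //   // rotateRight implements the replica rotation used above:
132 | //   //   rotateRight(Seq(hostA, hostB, hostC), 1) == Seq(hostC, hostA, hostB)
133 | //   // so consecutive partitions of the same shard prefer different replicas.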
--------------------------------------------------------------------------------
/spark-clickhouse-connector/src/main/scala/io/clickhouse/spark/connector/partitioner/NodeAddress.scala:
--------------------------------------------------------------------------------
1 | package io.clickhouse.spark.connector.partitioner
2 |
3 | import java.net.InetAddress
4 |
5 | import scala.collection.concurrent.TrieMap
6 |
7 | object NodeAddress {
8 |
9 | private val addressCache = new TrieMap[InetAddress, Set[String]]
10 |
11 |   /** Returns the set of IP addresses and host names that identify a node.
12 |     * Useful for giving Spark the preferred locations (data locality) of a partition. */
13 | def hostNames(rpcAddress: InetAddress): Set[String] = {
14 |
15 | addressCache.get(rpcAddress) match {
16 | case Some(value) =>
17 | value
18 | case None =>
19 | val address = Set(
20 | rpcAddress.getHostAddress,
21 | rpcAddress.getHostName
22 | )
23 | addressCache.putIfAbsent(rpcAddress, address)
24 | address
25 | }
26 | }
27 | }
28 |
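29 | // Usage sketch (illustrative only; the address and host name are hypothetical):
30 | //
31 | //   NodeAddress.hostNames(InetAddress.getByName("10.0.0.5"))
32 | //   // -> Set("10.0.0.5", "ch-node-1.internal")
33 | //
34 | // The result is cached in the TrieMap, so the reverse DNS lookup behind
35 | // InetAddress.getHostName is not repeated on subsequent calls for the same node.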
--------------------------------------------------------------------------------
/spark-clickhouse-connector/src/main/scala/io/clickhouse/spark/connector/partitioner/PartitionQuery.scala:
--------------------------------------------------------------------------------
1 | package io.clickhouse.spark.connector.partitioner
2 |
3 | object PartitionQuery {
4 |
5 | def queryForPartition(query: String, partition: ClickhousePartition): String = {
6 |
7 | partition.partitionSplit match {
8 | case Some(split: String) => s"$query and $split"
9 | case None => query
10 | }
11 | }
12 | }
13 |
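14 | // Usage sketch (illustrative only; the query text and split predicate are hypothetical):
15 | //
16 | //   // partition.partitionSplit == Some("dt >= '2020-01-01 00:00:00' and dt < '2020-01-02 00:00:00'")
17 | //   PartitionQuery.queryForPartition("select * from events where status = 1", partition)
18 | //   // -> "select * from events where status = 1 and dt >= '2020-01-01 00:00:00' and dt < '2020-01-02 00:00:00'"
19 | //
20 | // When partitionSplit is None (e.g. SimpleClickhousePartitioner), the query is returned unchanged.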
--------------------------------------------------------------------------------
/spark-clickhouse-connector/src/main/scala/io/clickhouse/spark/sql/RowReaderFactory.scala:
--------------------------------------------------------------------------------
1 | package io.clickhouse.spark.sql
2 |
3 | import java.sql.ResultSet
4 |
5 | import org.apache.spark.sql.Row
6 |
7 | trait RowReaderFactory {
8 |
9 | def create(rs: ResultSet): Row
10 | }
11 |
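12 | // Minimal implementation sketch (illustrative only): a factory that reads just the
13 | // first column of each row as a String.
14 | //
15 | //   class FirstColumnRowFactory extends RowReaderFactory {
16 | //     def create(rs: ResultSet): Row = Row(rs.getString(1))
17 | //   }
18 | //
19 | // The connector's own implementation is ClickhouseRowFactory (see
20 | // org/apache/spark/sql/ClickhouseRow.scala), which maps every column via rs.getObject.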
--------------------------------------------------------------------------------
/spark-clickhouse-connector/src/main/scala/org/apache/spark/TableIterator.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark
2 |
3 | import org.apache.spark.util.NextIterator
4 |
5 | /** Provides a basic/boilerplate iterator implementation.
6 |   * Lives in the org.apache.spark package so it can extend Spark's private[spark] NextIterator. */
7 | abstract class TableIterator[U] extends NextIterator[U] {
8 |
9 | }
10 |
--------------------------------------------------------------------------------
/spark-clickhouse-connector/src/main/scala/org/apache/spark/metrics/clickhouse/ClickhouseYandexRowMeter.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.metrics.clickhouse
2 |
3 | import java.sql.ResultSet
4 |
5 | import ru.yandex.clickhouse.response.ClickHouseResultSet
6 |
7 | /** Calculates the size of a row, in bytes, from the Yandex ClickHouse driver's result set. */
8 | class ClickhouseYandexRowMeter extends JdbcRowMeter {
9 |
10 | def sizeOf(resultSet: ResultSet): Int = {
11 | resultSet.asInstanceOf[ClickHouseResultSet].getValues.map(_.getLen).sum
12 | }
13 | }
14 |
--------------------------------------------------------------------------------
/spark-clickhouse-connector/src/main/scala/org/apache/spark/metrics/clickhouse/InputMetricsUpdater.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.metrics.clickhouse
2 |
3 | import java.sql.ResultSet
4 |
5 | import io.clickhouse.spark.connector.ConnectorConf
6 | import org.apache.spark.TaskContext
7 | import org.apache.spark.executor.InputMetrics
8 |
9 | /** A trait that provides a method for updating the read metrics collected for connector-related tasks.
10 | * The appropriate instance is created by the companion object.
11 | *
12 | */
13 | trait InputMetricsUpdater {
14 |   /** Updates the metrics collected for the connector after each row is read. This method
15 | * is not thread-safe.
16 | *
17 | * @param row the row which has just been read
18 | */
19 | def updateMetrics(row: ResultSet): ResultSet = row
20 |
21 | def finish(): Long
22 |
23 | private[metrics] def updateTaskMetrics(count: Int, dataLength: Int): Unit = {}
24 |
25 | }
26 |
27 | trait JdbcRowMeter {
28 | def sizeOf(rs: ResultSet): Int
29 | }
30 |
31 | object InputMetricsUpdater {
32 |
33 | /** Creates the appropriate instance of `InputMetricsUpdater`.
34 | *
35 |     * The created instance updates task metrics so
36 |     * that Spark reports them in the UI. Note that this requires Spark 1.2 or later.
37 | *
38 | */
39 | def apply(
40 | taskContext: TaskContext,
41 | conf: ConnectorConf
42 | ): InputMetricsUpdater = {
43 |
44 | val tm = taskContext.taskMetrics()
45 | val inputMetrics = tm.inputMetrics
46 |
47 | if (conf.clickhouseMetricsEnable) {
48 |
49 | val jdbcRowMeter: JdbcRowMeter =
50 | if (conf.сlickhouseDriver == ConnectorConf.DefaultClickhouseDriver) {
51 |           // metrics are supported for the Yandex driver only
52 | new ClickhouseYandexRowMeter
53 | } else {
54 | null
55 | }
56 | new ClickhouseInputMetricsUpdater(jdbcRowMeter, inputMetrics)
57 | }
58 | else {
59 | new StubInputMetricsUpdater
60 | }
61 | }
62 |
63 | trait Timer {
64 | private val startTime = System.nanoTime()
65 |
66 | def stopTimer(): Long = System.nanoTime() - startTime
67 | }
68 |
69 | private class ClickhouseInputMetricsUpdater(rowMeter: JdbcRowMeter, inputMetrics: InputMetrics)
70 | extends InputMetricsUpdater with Timer {
71 |
72 | def getRowBinarySize(row: ResultSet): Int = {
73 |
74 | if (rowMeter != null) {
75 | rowMeter.sizeOf(row)
76 | }
77 | else {
78 | 0
79 | }
80 | }
81 |
82 | override def updateMetrics(row: ResultSet): ResultSet = {
83 | val binarySize = getRowBinarySize(row)
84 |
85 | updateTaskMetrics(1, binarySize)
86 |
87 | row
88 | }
89 |
90 | def finish(): Long = stopTimer()
91 |
92 | override def updateTaskMetrics(count: Int, dataLength: Int): Unit = {
93 | inputMetrics.incBytesRead(dataLength)
94 | inputMetrics.incRecordsRead(count)
95 | }
96 |
97 | }
98 |
99 |   /** An [[InputMetricsUpdater]] that does not update anything; used when ClickHouse metrics are disabled. */
100 | private class StubInputMetricsUpdater extends InputMetricsUpdater with Timer {
101 | def finish(): Long = stopTimer()
102 | }
103 |
104 | }
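105 | 
106 | // Usage sketch (illustrative only; `scanner` and `conf` are assumed to exist in the
107 | // calling read task):
108 | //
109 | //   val updater = InputMetricsUpdater(TaskContext.get(), conf)
110 | //   while (scanner.resultSet.next()) {
111 | //     val rs = updater.updateMetrics(scanner.resultSet)   // counts the row and its byte size
112 | //     // ... convert rs into a Row ...
113 | //   }
114 | //   val elapsedNanos = updater.finish()                    // nanoseconds since the updater was created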
--------------------------------------------------------------------------------
/spark-clickhouse-connector/src/main/scala/org/apache/spark/sql/ClickhouseRow.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.sql
2 |
3 | import java.sql.ResultSet
4 |
5 | import io.clickhouse.spark.connector.TableScanner
6 | import io.clickhouse.spark.sql.RowReaderFactory
7 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
8 | import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils
9 | import org.apache.spark.sql.jdbc.JdbcDialects
10 | import org.apache.spark.sql.types.StructType
11 |
12 |
13 | final class ClickhouseRow(values: Array[Any],
14 | override val schema: StructType
15 | )
16 | extends GenericRowWithSchema(values, schema) {
17 | }
18 |
19 |
20 | class ClickhouseRowFactory(metadata: StructType) extends RowReaderFactory {
21 |
22 | private def resultSetToObjectArray(rs: ResultSet): Array[Any] = {
23 | Array.tabulate[Any](rs.getMetaData.getColumnCount)(i => rs.getObject(i + 1))
24 | }
25 |
26 | def create(rs: ResultSet): ClickhouseRow = {
27 | new ClickhouseRow(resultSetToObjectArray(rs), metadata)
28 | }
29 | }
30 |
31 | object ClickhouseRowFactory {
32 |
33 | def apply(scanner: TableScanner): ClickhouseRowFactory =
34 | new ClickhouseRowFactory(
35 | JdbcUtils.getSchema(scanner.resultSet, JdbcDialects.get("jdbc:clickhouse")))
36 | }
37 |
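38 | // Usage sketch (illustrative only; `scanner` is assumed to be an open TableScanner
39 | // whose query has already been executed, and the column name is hypothetical):
40 | //
41 | //   val factory = ClickhouseRowFactory(scanner)      // schema derived once from the ResultSet metadata
42 | //   while (scanner.resultSet.next()) {
43 | //     val row = factory.create(scanner.resultSet)    // GenericRowWithSchema backed by rs.getObject values
44 | //     row.getAs[Long]("some_column")
45 | //   }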
--------------------------------------------------------------------------------