├── .gitignore ├── LICENSE ├── README.md ├── build.sbt ├── project ├── Versions.scala ├── build.properties └── plugins.sbt └── spark-clickhouse-connector └── src └── main └── scala ├── io └── clickhouse │ └── spark │ ├── connection │ ├── ClickHouseDataSource.scala │ └── ConnectionPooledDBUrl.scala │ ├── connector │ ├── ClickhouseConnector.scala │ ├── ClickhouseRDD.scala │ ├── ConnectorConf.scala │ ├── SparkClickhouseFunctions.scala │ ├── TableScanner.scala │ ├── package.scala │ └── partitioner │ │ ├── ClickhousePartition.scala │ │ ├── ClickhousePartitioner.scala │ │ ├── NodeAddress.scala │ │ └── PartitionQuery.scala │ └── sql │ └── RowReaderFactory.scala └── org └── apache └── spark ├── TableIterator.scala ├── metrics └── clickhouse │ ├── ClickhouseYandexRowMeter.scala │ └── InputMetricsUpdater.scala └── sql └── ClickhouseRow.scala /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | #ignore thumbnails created by windows 3 | Thumbs.db 4 | #Ignore files build by Visual Studio 5 | *.obj 6 | *.exe 7 | *.pdb 8 | *.user 9 | *.aps 10 | *.pch 11 | *.vspscc 12 | *_i.c 13 | *_p.c 14 | *.ncb 15 | *.suo 16 | *.tlb 17 | *.tlh 18 | *.bak 19 | *.cache 20 | *.ilk 21 | *.log 22 | [Bb]in 23 | [Dd]ebug*/ 24 | *.lib 25 | *.sbr 26 | obj/ 27 | [Rr]elease*/ 28 | _ReSharper*/ 29 | [Tt]est[Rr]esult* 30 | 31 | target/ 32 | project/boot/ 33 | project/target/ 34 | project/plugins/project/ 35 | sbt/sbt-launch*.jar 36 | 37 | # Scala-IDE specific 38 | .scala_dependencies 39 | .worksheet 40 | .idea 41 | .idea_modules 42 | 43 | 44 | 45 | *.~sql 46 | *.iml 47 | .idea/* 48 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # spark-clickhouse-connector 2 | Spark Clickhouse Connector 3 | 4 | ## Description 5 | 6 | *Package for integration between Yandex Clickhouse and Apache Spark.* 7 | 8 | This assembly provides functionality to represent a Clickhouse table as ClickhouseRdd. 
9 | 
10 | - allows executing SQL queries
11 | - allows filtering rows on the server side
12 | - allows managing Spark partition granularity
13 | - provides failover across Clickhouse replicas
14 | - provides data locality if Clickhouse nodes are collocated with Spark nodes
15 | - provides load balancing across Clickhouse replicas
16 | - provides Clickhouse cluster auto-discovery
17 | - can be used with either driver: ru.yandex.clickhouse.clickhouse-jdbc or com.github.housepower.clickhouse-native-jdbc
18 | - allows throttling of database resource consumption
19 | 
20 | ClickhouseRDD is the main entry point for analyzing data in a Clickhouse database with Spark. You can obtain an object of this class by calling SparkClickhouseFunctions.clickhouseTable().
21 | 
22 | Configuration properties should be passed in the SparkConf configuration of the SparkContext. ClickhouseRDD needs to open connections to Clickhouse, so it requires the appropriate connection properties to be present in SparkConf. For the list of required and available properties, see ConnectorConf.
23 | 
24 | A `ClickhouseRDD` object gets serialized and sent to every Spark Executor, which then calls the `compute` method to fetch the data on every node. The `getPreferredLocations` method tells Spark the preferred nodes to fetch a partition from, so that the data for the partition is on the same node the task was sent to. If Clickhouse nodes are collocated with Spark nodes, the queries are always sent to the Clickhouse process running on the same node as the Spark Executor process, so data is not transferred between nodes. If a Clickhouse node fails or gets overloaded during a read, the queries are retried against a different node.
25 | 
26 | ## Build
27 | 
28 | In the root directory run
29 | 
30 | for Scala 2.11:
31 | 
32 | sbt '++2.11.7 assembly'
33 | 
34 | or for Scala 2.12:
35 | 
36 | sbt '++2.12.9 assembly'
37 | 
38 | or for both Scala versions:
39 | 
40 | sbt '+ assembly'
41 | 
42 | A jar with shaded dependencies is generated in spark-clickhouse-connector/target/scala-2.11 (or scala-2.12), e.g. spark-clickhouse-connector_2.11-2.4.0_0.25.jar
43 | 
44 | To publish to a local Maven Nexus repository, run
45 | 
46 | publish
47 | 
48 | or to publish to the local Maven repository, run
49 | 
50 | publishM2
51 | 
52 | Publishing requires two files in your home directory (examples below):
53 | - `Credentials(Path.userHome / ".sbt" / "credentials")`
54 | - `Path.userHome.absolutePath + "/.sbt/nexus_url"`
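For reference, here is a minimal sketch of the two files. The host name, user, and password below are placeholders, and the `realm` must match the realm reported by your Nexus instance ("Sonatype Nexus Repository Manager" is the common default):

`~/.sbt/credentials`:

```
realm=Sonatype Nexus Repository Manager
host=nexus.example.com
user=deploy-user
password=deploy-password
```

`~/.sbt/nexus_url` holds a single line with the Nexus base URL; `build.sbt` reads this file verbatim and appends `/content/repositories/...` to it:

```
http://nexus.example.com:8081/nexus
```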
55 | 
56 | ## Usage
57 | 
58 | ### Prerequisites
59 | 
60 | * Copy spark-clickhouse-connector_2.11-2.4.0_0.25.jar to the Spark jars directory
61 | * Copy the ru.yandex.clickhouse.clickhouse-jdbc driver to the Spark jars directory
62 | * Add the dependency to your Spark project:
63 | ```
64 | <dependency>
65 |     <groupId>io.clickhouse</groupId>
66 |     <artifactId>spark-clickhouse-connector_2.11</artifactId>
67 |     <version>0.25</version>
68 | </dependency>
69 | ```
70 | ### Set parameters
71 | 
72 | ```scala
73 | val sparkConf = new SparkConf()
74 |   .set("spark.clickhouse.driver", "ru.yandex.clickhouse.ClickHouseDriver")
75 |   .set("spark.clickhouse.url", "jdbc:clickhouse://192.168.1.1:8123,192.168.1.2:8123")
76 |   // optional: .set("spark.clickhouse.user", "<user>")  (SparkConf.set rejects null values; omit these lines if the server has no authentication)
77 |   // optional: .set("spark.clickhouse.password", "<password>")
78 |   .set("spark.clickhouse.connection.per.executor.max", "5")
79 |   .set("spark.clickhouse.metrics.enable", "false")
80 |   .set("spark.clickhouse.socket.timeout.ms", "10000")
81 |   .set("spark.clickhouse.cluster.auto-discovery", "false")
82 | 
83 | val ss = SparkSession.builder()
84 |   .master("spark_master_url")
85 |   .appName("test_app")
86 |   .config(sparkConf)
87 |   .getOrCreate()
88 | 
89 | ```
90 | 
91 | ### Add functions to the `SparkContext` and `RDD`:
92 | 
93 | ```scala
94 | import io.clickhouse.spark.connector._
95 | ```
96 | ### Loading from Clickhouse
97 | 
98 | A sample table indexed by date:
99 | 
100 | ```sql
101 | CREATE TABLE IF NOT EXISTS data.some_table_local on cluster some_cluster
102 | (
103 |     dated Date DEFAULT toDate(started),
104 |     started DateTime,
105 |     counter_id UInt32,
106 |     col1 String,
107 |     col2 String
108 | ) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/some_table_local', '{replica}')
109 | PARTITION BY (toYYYYMM(dated))
110 | ORDER BY (dated, counter_id)
111 | 
112 | ```
113 | 
114 | Loading data into a ClickhouseRDD with Spark partitions split by `Shard` and by `DAY`. For example, if the cluster has 2 shards and you need to analyze a 30-day period, the connector creates 60 Spark partitions (2 shards × 30 days).
115 | 
116 | ```scala
117 | 
118 | val sc = ss.sparkContext
119 | val query = s"select started, counter_id, col1, col2 from data.some_table_local " +
120 |   s" where started >= '${startDate.toString("yyyy-MM-dd HH:mm:ss")}' and started <= '${endDate.toString("yyyy-MM-dd HH:mm:ss")}'"
121 | 
122 | sc.clickhouseTable(query, "some_cluster")
123 |   .withPeriod(startDate, endDate, partitioner.RangeType.DAY, "dated")
124 |   .map(row => {
125 |     val counterId = row.getAs[Long]("counter_id")
126 |     val started = new DateTime(row.getAs[Timestamp]("started"))
127 |     val col1 = row.getAs[String]("col1")
128 |     val col2 = row.getAs[String]("col2")
129 | 
130 |     (started, counterId, col1, col2)
131 |   })
132 |   .filter ()
133 |   .groupBy()
134 |   .<...>
135 | 
136 | ```
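`withPeriod` also supports `HOUR` granularity. Below is a minimal sketch, assuming the same table and session as above. The fourth argument is injected verbatim into the WHERE clause as `<pk> = '<hour>'`, so an hour-floored expression (`toStartOfHour(started)`, a standard ClickHouse function) is passed here instead of the raw `DateTime` column. With 2 shards and a 30-day period this produces 2 × 720 Spark partitions.

```scala
val sc = ss.sparkContext
val query = s"select started, counter_id, col1, col2 from data.some_table_local " +
  s" where started >= '${startDate.toString("yyyy-MM-dd HH:mm:ss")}' and started <= '${endDate.toString("yyyy-MM-dd HH:mm:ss")}'"

sc.clickhouseTable(query, "some_cluster")
  // one Spark partition per shard per hour of the period
  .withPeriod(startDate, endDate, partitioner.RangeType.HOUR, "toStartOfHour(started)")
  .map(row => new DateTime(row.getAs[Timestamp]("started")))
  .count()
```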
137 | 
138 | Loading data into a ClickhouseRDD with Spark partitions split by `Shard` only, without a date range. For example, if the cluster has 2 shards and you need to analyze a 30-day period, the connector creates only 2 Spark partitions (one per shard).
139 | 
140 | 
141 | ```scala
142 | 
143 | val sc = ss.sparkContext
144 | val query = s"select started, counter_id, col1, col2 from data.some_table_local " +
145 |   s" where dated >= '${startDate.toString("yyyy-MM-dd")}' and dated <= '${endDate.toString("yyyy-MM-dd")}'" +
146 |   s" and counter_id in (1,2,3)"
147 | 
148 | sc.clickhouseTable(query, "some_cluster")
149 |   .map(row => {
150 |     val counterId = row.getAs[Long]("counter_id")
151 |     val started = new DateTime(row.getAs[Timestamp]("started"))
152 |     val col1 = row.getAs[String]("col1")
153 |     val col2 = row.getAs[String]("col2")
154 | 
155 |     (started, counterId, col1, col2)
156 |   })
157 |   .filter ()
158 |   .groupBy()
159 |   .<...>
160 | 
161 | ```
162 | 
163 | Loading data into a ClickhouseRDD with Spark partitions split by `Shard` with a custom date range.
164 | For example, if the cluster has 2 shards and the period is split into the two custom ranges below,
165 | the connector creates only 4 Spark partitions (2 shards × 2 ranges).
166 | 
167 | 
168 | ```scala
169 | 
170 | val sc = ss.sparkContext
171 | val query = s"select started, counter_id, col1, col2 from data.some_table_local " +
172 |   s" where counter_id in (1,2,3)"
173 | 
174 | sc.clickhouseTable(query, "some_cluster")
175 |   .withCustomPartitioning(Seq("dated >= '2019-01-01' and dated < '2019-01-16'",
176 |     "dated >= '2019-01-16' and dated < '2019-02-01'"))
177 |   .map(row => {
178 |     val counterId = row.getAs[Long]("counter_id")
179 |     val started = new DateTime(row.getAs[Timestamp]("started"))
180 |     val col1 = row.getAs[String]("col1")
181 |     val col2 = row.getAs[String]("col2")
182 | 
183 |     (started, counterId, col1, col2)
184 |   })
185 |   .filter ()
186 |   .groupBy()
187 |   .<...>
188 | 
189 | ```
190 | 
-------------------------------------------------------------------------------- /build.sbt: --------------------------------------------------------------------------------
1 | import sbt.Keys.{credentials, libraryDependencies, pomExtra, publishM2Configuration, run, version}
2 | import Versions._
3 | import sbt.{Credentials, CrossVersion}
4 | import sbtassembly.AssemblyKeys.{assembly, assemblyJarName, assemblyOption}
5 | 
6 | 
7 | val versionStatus = settingKey[Unit]("The Scala version used in cross-build reapply for '+ package', '+ publish'.")
8 | val nexus_local = IO.read(new File(Path.userHome.absolutePath + "/.sbt/nexus_url"))
9 | lazy val creds = Seq(credentials += Credentials(Path.userHome / ".sbt" / "credentials"))
10 | 
11 | 
12 | val commonSettings = creds ++ Seq(
13 |   name := "spark-clickhouse-connector",
14 |   organization := "io.clickhouse",
15 |   version := "0.25",
16 |   crossScalaVersions := Seq(Versions.scala211, Versions.scala212),
17 |   crossVersion := CrossVersion.binary,
18 |   versionStatus := Versions.status(scalaVersion.value, scalaBinaryVersion.value),
19 |   publishMavenStyle := true,
20 | 
21 |   publishConfiguration := publishConfiguration.value.withOverwrite(true),
22 |   publishM2Configuration := publishM2Configuration.value.withOverwrite(true),
23 |   publishLocalConfiguration := publishLocalConfiguration.value.withOverwrite(true),
24 | 
25 |   publishTo := {
26 |     if (isSnapshot.value)
27 |       Some("Snapshot repository" at nexus_local + "/content/repositories/snapshots/")
28 |     else
29 |       Some("Release repository" at nexus_local + "/content/repositories/releases/")
30 |   },
31 | 
32 |   pomExtra :=
33 |     <developers>
34 |       <developer>
35 |         <id>vbezruchko</id>
36 |         <name>Vadim Bezruchko</name>
37 |         <email>va.bezruchko@gmail.com</email>
38 |       </developer>
39 |     </developers>
40 | 
41 | )
42 | 
43 | //lazy val scalaBinary = scala_version.dropRight(2)
44 | 
45 | 
46 | 
47 | val deps = Seq (
48 |   "org.apache.spark" %% "spark-core" % Spark %
"provided", 49 | "org.apache.spark" %% "spark-sql" % Spark % "provided", 50 | 51 | "org.slf4j" % "slf4j-api" % Slf4j % "provided", 52 | 53 | "org.eclipse.jetty" % "jetty-server" % SparkJetty % "provided", 54 | "org.eclipse.jetty" % "jetty-servlet" % SparkJetty % "provided", 55 | 56 | "com.codahale.metrics" % "metrics-core" % CodaHaleMetrics % "provided", 57 | "com.codahale.metrics" % "metrics-json" % CodaHaleMetrics % "provided", 58 | 59 | "ru.yandex.clickhouse" % "clickhouse-jdbc" % clickhouse_jdbc % "provided", 60 | 61 | "joda-time" % "joda-time" % joda_version % "provided", 62 | "org.apache.commons" % "commons-pool2" % commons_pool 63 | ) 64 | 65 | //multi-project for shading commons-pool2 66 | lazy val assemblyJar = (project in file("spark-clickhouse-connector")) 67 | .enablePlugins(AssemblyPlugin) 68 | .settings(commonSettings) 69 | .settings(libraryDependencies ++=deps) 70 | .settings( 71 | skip in publish := true, 72 | skip in publishM2 := true, 73 | assemblyOption in assembly ~= { _.copy(includeScala = false) }, 74 | run in Compile := Defaults.runTask(fullClasspath in Compile, mainClass in (Compile, run), runner in (Compile, run)), 75 | assemblyJarName in assembly := s"${name.value}_${scalaBinaryVersion.value}-${Versions.Spark}_${version.value}.jar", 76 | assemblyShadeRules in assembly := Seq( 77 | ShadeRule.rename("org.apache.commons.**" -> "shade.io.clickhouse.apache.commons.@1").inAll 78 | ) 79 | ) 80 | 81 | //hide shaded dependencies 82 | lazy val connectorDistribution = (project in file("./")) 83 | .settings(commonSettings) 84 | .settings( 85 | packageBin in Compile := (assembly in (assemblyJar, Compile)).value, 86 | assembly := (assembly in assemblyJar).value 87 | 88 | ) -------------------------------------------------------------------------------- /project/Versions.scala: -------------------------------------------------------------------------------- 1 | import Versions.JDK 2 | 3 | import scala.util.Properties 4 | 5 | object Versions { 6 | 7 | val scala211 = "2.11.7" 8 | val scala212 = "2.12.9" 9 | val Spark = "2.4.0" 10 | val Slf4j = "1.6.1" 11 | val SparkJetty = "8.1.14.v20131031" 12 | val CodaHaleMetrics = "3.0.2" 13 | val commons_pool = "2.5.0" 14 | val clickhouse_jdbc = "0.1.50" 15 | val JDK = "1.8" 16 | val joda_version = "2.10.6" 17 | 18 | val status = (versionInReapply: String, binaryInReapply: String) => 19 | println(s""" 20 | | Scala: $versionInReapply 21 | | Scala Binary: $binaryInReapply 22 | | Java: target=$JDK user=${Properties.javaVersion} 23 | """.stripMargin) 24 | } 25 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.2.8 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | 2 | resolvers += Classpaths.sbtPluginReleases 3 | 4 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.10") 5 | 6 | addSbtPlugin("com.scalapenos" % "sbt-prompt" % "1.0.2") 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /spark-clickhouse-connector/src/main/scala/io/clickhouse/spark/connection/ClickHouseDataSource.scala: -------------------------------------------------------------------------------- 1 | package io.clickhouse.spark.connection 2 | 3 | import java.util.regex.Pattern 4 | 5 | case class ClickHouseDataSource(value: Map[String, 
String], database: String) 6 | 7 | object ClickHouseDataSource { 8 | private val JDBC_PREFIX = "jdbc:" 9 | private val JDBC_CLICKHOUSE_PREFIX = JDBC_PREFIX + "clickhouse:" 10 | private val URL_TEMPLATE = Pattern.compile(JDBC_CLICKHOUSE_PREFIX + "//([a-zA-Z0-9_:,.-]+)(/[a-zA-Z0-9_]+)?") 11 | 12 | def apply(url: String): ClickHouseDataSource = splitUrl(url) 13 | 14 | private def splitUrl(url: String): ClickHouseDataSource = { 15 | val m = URL_TEMPLATE.matcher(url) 16 | if (!m.matches) throw new IllegalArgumentException("Incorrect url") 17 | var database = m.group(2) 18 | if (database == null) database = "" 19 | val hosts = m.group(1).split(",") 20 | 21 | val value = 22 | hosts.map(hostWithPort => 23 | (hostWithPort.split(":")(0), JDBC_CLICKHOUSE_PREFIX + "//" + hostWithPort + database)).toMap 24 | 25 | new ClickHouseDataSource(value, database) 26 | 27 | } 28 | 29 | def apply(hosts: Iterable[String], port: Int, database: String): ClickHouseDataSource = { 30 | 31 | val value = hosts.map(host => (host, JDBC_CLICKHOUSE_PREFIX + s"//$host:$port" + database)).toMap 32 | 33 | new ClickHouseDataSource(value, database) 34 | } 35 | } 36 | 37 | 38 | -------------------------------------------------------------------------------- /spark-clickhouse-connector/src/main/scala/io/clickhouse/spark/connection/ConnectionPooledDBUrl.scala: -------------------------------------------------------------------------------- 1 | package io.clickhouse.spark.connection 2 | 3 | import java.io.Serializable 4 | import java.sql.{Connection, Driver, SQLException, Statement} 5 | import java.util.{NoSuchElementException, Properties} 6 | 7 | import org.apache.commons.pool2.impl.{DefaultPooledObject, GenericKeyedObjectPool, GenericKeyedObjectPoolConfig} 8 | import org.apache.commons.pool2.{KeyedPooledObjectFactory, PooledObject} 9 | import org.slf4j.LoggerFactory 10 | import ru.yandex.clickhouse.except.ClickHouseException 11 | 12 | case class JdbcConnection(shard: String, connection: Connection) 13 | 14 | 15 | class ConnectionPooledDBUrl(val dataSource: Map[String, String], 16 | val driverName: String, 17 | val poolSizePetShard: Int, 18 | val socketTimeoutMs: Int, 19 | val user: String, 20 | val password: String) extends Serializable { 21 | 22 | private val LOG = LoggerFactory.getLogger(classOf[ConnectionPooledDBUrl]) 23 | 24 | private val driver = Class.forName(driverName).newInstance.asInstanceOf[Driver] 25 | 26 | private val connectionProperties = { 27 | val prop = new Properties 28 | prop.put("socket_timeout", socketTimeoutMs.toString) 29 | 30 | if (user != null) { 31 | prop.put("user", user) 32 | } 33 | if (password != null) { 34 | prop.put("password", password) 35 | } 36 | prop 37 | } 38 | 39 | private val pool = { 40 | 41 | val config = new GenericKeyedObjectPoolConfig 42 | 43 | config.setMaxTotalPerKey(poolSizePetShard) 44 | config.setTestOnBorrow(true) 45 | config.setTestOnReturn(false) 46 | 47 | new GenericKeyedObjectPool[String, JdbcConnection](new PoolableFactory, config) 48 | } 49 | 50 | 51 | def getConnection(shard: String): JdbcConnection = this.pool.borrowObject(shard) 52 | 53 | def releaseConnection(con: JdbcConnection): Unit = { 54 | try 55 | this.pool.returnObject(con.shard, con) 56 | catch { 57 | case ex: Exception => 58 | LOG.warn("Can not close connection.", ex) 59 | } 60 | } 61 | 62 | implicit def funcToRunnable(func: () => Unit): Runnable = () => func() 63 | 64 | def close(): Unit = { 65 | new Thread(() => { 66 | 67 | try { 68 | val p = this.pool 69 | LOG.debug(">>>> Clearing pool, active: {}, idle: {}", 
p.getNumActive, p.getNumIdle) 70 | p.clear() 71 | while ( { 72 | p.getNumActive > 0 73 | }) { 74 | p.setMaxTotal(p.getNumActive) 75 | try 76 | Thread.sleep(p.getMaxWaitMillis) 77 | catch { 78 | case _: InterruptedException => 79 | //do noting 80 | } 81 | } 82 | LOG.debug(">>>> Closing pool, active: {}, idle: {}", p.getNumActive, p.getNumIdle) 83 | p.close() 84 | } catch { 85 | case ex: Exception => 86 | LOG.warn(">>>> Exception closing pool", ex) 87 | } 88 | 89 | }).start() 90 | } 91 | 92 | 93 | private class PoolableFactory extends KeyedPooledObjectFactory[String, JdbcConnection] { 94 | @throws[SQLException] 95 | override def makeObject(shard: String): PooledObject[JdbcConnection] = { 96 | val dbURL = dataSource(shard) 97 | val connection = driver.connect(dbURL, connectionProperties) 98 | new DefaultPooledObject[JdbcConnection](JdbcConnection(shard, connection)) 99 | } 100 | 101 | @throws[SQLException] 102 | override def destroyObject(key: String, obj: PooledObject[JdbcConnection]): Unit = { 103 | val dbURL = dataSource.get(key) 104 | LOG.debug("---- Closing connection in pool {}", dbURL) 105 | obj.getObject.connection.close() 106 | } 107 | 108 | override def validateObject(key: String, obj: PooledObject[JdbcConnection]): Boolean = { 109 | val dbURL = dataSource.get(key) 110 | val connection = obj.getObject.connection 111 | var st: Statement = null 112 | try { 113 | st = connection.createStatement 114 | st.execute("SELECT 1") 115 | return true 116 | } catch { 117 | case _: SQLException => 118 | LOG.info("Invalidate connection for url: {}", dbURL) 119 | } finally try 120 | if (st != null) st.close() 121 | catch { 122 | case ex: SQLException => 123 | LOG.info("Exception closing statement", ex) 124 | } 125 | false 126 | } 127 | 128 | override def activateObject(key: String, `object`: PooledObject[JdbcConnection]): Unit = { 129 | } 130 | 131 | override def passivateObject(key: String, `object`: PooledObject[JdbcConnection]): Unit = { 132 | } 133 | } 134 | 135 | 136 | } 137 | 138 | -------------------------------------------------------------------------------- /spark-clickhouse-connector/src/main/scala/io/clickhouse/spark/connector/ClickhouseConnector.scala: -------------------------------------------------------------------------------- 1 | package io.clickhouse.spark.connector 2 | 3 | import java.net.InetAddress 4 | import java.util.ConcurrentModificationException 5 | 6 | import io.clickhouse.spark.connection.{ClickHouseDataSource, ConnectionPooledDBUrl, JdbcConnection} 7 | import io.clickhouse.spark.connector.ClickhouseConnector.getConnectionPool 8 | import io.clickhouse.spark.connector.partitioner.{ClickhousePartition, PartitionQuery} 9 | import org.apache.spark.SparkContext 10 | import org.apache.spark.internal.Logging 11 | 12 | import scala.collection.concurrent.TrieMap 13 | 14 | final case class ShardUnavailableException(private val message: String = "", 15 | private val cause: Throwable = None.orNull) 16 | extends Exception(message, cause) 17 | 18 | class ClickhouseConnector(conf: ConnectorConf, 19 | initDataSource: ClickHouseDataSource, 20 | cluster: Option[String] 21 | ) 22 | extends Serializable with Logging { 23 | 24 | val ( 25 | dataSource: Map[Int, Seq[InetAddress]], 26 | theDataSource: ClickHouseDataSource 27 | ) = makeDataSource() 28 | 29 | def execute(partition: ClickhousePartition, query: String): TableScanner = { 30 | 31 | try { 32 | executeStatement( 33 | partition.endpoints.map(_.getHostAddress).iterator, 34 | PartitionQuery.queryForPartition(query, partition), 35 | 
getConnectionPool(conf, theDataSource) 36 | ) 37 | } 38 | catch { 39 | case e: ShardUnavailableException => 40 | throw ShardUnavailableException( 41 | s"all servers for shard (${partition.shardId}) are not accessible: (${partition.endpoints.map(_.getHostAddress).mkString(",")})", e) 42 | } 43 | } 44 | 45 | @scala.annotation.tailrec 46 | private def executeStatement(shardNodes: Iterator[String], query: String, cp: ConnectionPooledDBUrl): TableScanner = { 47 | 48 | if (!shardNodes.hasNext) throw ShardUnavailableException() //there are no replica left 49 | 50 | val replicaServer: String = shardNodes.next() 51 | var jdbc: JdbcConnection = null 52 | try { 53 | jdbc = cp.getConnection(replicaServer) 54 | 55 | val statement = jdbc.connection.prepareStatement(query) 56 | 57 | new TableScanner(cp, jdbc, statement) 58 | } 59 | catch { 60 | case e: Throwable => 61 | // go to the next replica with warning message 62 | logWarning(s"Failed to execute query at $replicaServer", e) 63 | 64 | if (jdbc != null) { 65 | cp.releaseConnection(jdbc) 66 | } 67 | executeStatement(shardNodes, query, cp) 68 | } 69 | } 70 | 71 | private def getClusterMetadata = { 72 | val query = 73 | s"select shard_num, groupArray(host_name) as names, groupArray(host_address) as addresses from system.clusters " + 74 | s"where cluster = '${cluster.get}' group by shard_num" 75 | 76 | executeStatement(initDataSource.value.keys.iterator, query, getConnectionPool(conf, initDataSource)) 77 | .map(rs => (rs.getInt("shard_num"), 78 | rs.getArray("names").getArray.asInstanceOf[Array[String]], 79 | rs.getArray("addresses").getArray.asInstanceOf[Array[String]])) 80 | .toList 81 | } 82 | 83 | /** find host in cluster metadata and detect shard 84 | * return shard_num */ 85 | private def detectShard(clusterMetadata: List[(Int, Array[String], Array[String])], host: String): Option[Int] = { 86 | clusterMetadata.find(v => v._2.contains(host) || v._3.contains(host)).map(_._1) 87 | } 88 | 89 | private def makeDataSource(): (Map[Int, Seq[InetAddress]], ClickHouseDataSource) = { 90 | 91 | if (cluster.isDefined) { 92 | 93 | val clusterMeta = getClusterMetadata 94 | 95 | if (!conf.clickhouseAutoDiscoveryEnable) { 96 | 97 | //for each host in data_source detects shard_id, after that performed group by replicas. 98 | //Also performed filtering hosts which doesn't contained into cluster metadata. 99 | val ds = 100 | initDataSource.value.keys 101 | .map(v => (detectShard(clusterMeta, v), v)) //(shard_num, host) 102 | .filter(_._1.isDefined) //filter undefined hosts 103 | .map(v => (v._1.get, v._2)) //remove Option[] 104 | .groupBy(_._1) //group by shard_num 105 | .map(v => (v._1, v._2.map(m => InetAddress.getByName(m._2)).toList)) // (shard_num, List(InetAddress)) 106 | (ds, initDataSource) 107 | } 108 | else { 109 | logDebug("cluster auto-discovery enabled") 110 | //cluster auto-discovery enabled, make new datasource from cluster metadata 111 | val newDataSource = 112 | ClickHouseDataSource(clusterMeta.flatMap(_._3), conf.clickhousePortDefault, initDataSource.database) 113 | 114 | val ds = clusterMeta 115 | .map(v => (v._1, v._3.map(m => InetAddress.getByName(m)).toList)) 116 | .toMap 117 | 118 | (ds, newDataSource) 119 | } 120 | } 121 | else { 122 | //Used for clickhouse installation without 'cluster' option e.g. single server installation. 123 | //It's assumed, that all hosts in datasource are single shard and contains the same data. 
124 | (Map(0 -> initDataSource.value.keys.map(InetAddress.getByName).toList), initDataSource) 125 | } 126 | } 127 | } 128 | 129 | 130 | object ClickhouseConnector { 131 | 132 | private val connectionPoolCache = new TrieMap[(ConnectorConf, ClickHouseDataSource), ConnectionPooledDBUrl] 133 | 134 | def apply(sc: SparkContext, cluster: Option[String]): ClickhouseConnector = { 135 | val conf: ConnectorConf = ConnectorConf.fromSparkConf(sc.getConf) 136 | 137 | val dataSource = ClickHouseDataSource(conf.сlickhouseUrl) 138 | 139 | new ClickhouseConnector(conf, dataSource, cluster) 140 | } 141 | 142 | def getConnectionPool(conf: ConnectorConf, ds: ClickHouseDataSource): ConnectionPooledDBUrl = synchronized { 143 | 144 | connectionPoolCache.get((conf, ds)) match { 145 | case Some(value) => 146 | value 147 | case None => 148 | val value = new ConnectionPooledDBUrl(ds.value, conf.сlickhouseDriver, 149 | conf.maxConnectionsPerExecutor, conf.сlickhouseSocketTimeoutMs, 150 | conf.clickhouseUser, conf.clickhousePassword) 151 | connectionPoolCache.putIfAbsent((conf, ds), value) match { 152 | case None => 153 | value 154 | case Some(_) => 155 | throw new ConcurrentModificationException("It shouldn't reach here as it is synchronized") 156 | } 157 | } 158 | } 159 | 160 | } 161 | -------------------------------------------------------------------------------- /spark-clickhouse-connector/src/main/scala/io/clickhouse/spark/connector/ClickhouseRDD.scala: -------------------------------------------------------------------------------- 1 | package io.clickhouse.spark.connector 2 | 3 | import io.clickhouse.spark.connector.partitioner._ 4 | import org.apache.spark._ 5 | import org.apache.spark.internal.Logging 6 | import org.apache.spark.metrics.clickhouse.InputMetricsUpdater 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.{ClickhouseRowFactory, Row} 9 | import org.joda.time.DateTime 10 | 11 | 12 | /** RDD representing a Table Scan of a Clickhouse table. 13 | * 14 | * This class is the main entry point for analyzing data in Clickhouse database with Spark. 15 | * Obtain objects of this class by calling SparkClickhouseFunctions.clickhouseTable() 16 | * 17 | * Configuration properties should be passed in the [[org.apache.spark.SparkConf SparkConf]] 18 | * configuration of [[org.apache.spark.SparkContext SparkContext]]. 19 | * `ClickhouseRDD` needs to open connection to Clickhouse, therefore it requires appropriate 20 | * connection property values to be present in [[org.apache.spark.SparkConf SparkConf]]. 21 | * For the list of required and available properties, see [[ConnectorConf]]. 22 | * 23 | * `ClickhouseRDD` divides the data set into smaller partitions, processed locally on every 24 | * cluster node. There are several partition strategy: 25 | * - DatedClickhousePartitioner that provides functionality for splitting shard into small partitions by date range. 26 | * Supported several range types e.g. Daily, Hourly. 27 | * - SimpleClickhousePartitioner that provides functionality for splitting RDD with partitions by shards. 28 | * - CustomClickhousePartitioner with custom split strategy for each shard. 29 | * 30 | * A `ClickhouseRDD` object gets serialized and sent to every Spark Executor, which then 31 | * calls the `compute` method to fetch the data on every node. The `getPreferredLocations` 32 | * method tells Spark the preferred nodes to fetch a partition from, so that the data for 33 | * the partition are at the same node the task was sent to. 
If Clickhouse nodes are collocated
34 | * with Spark nodes, the queries are always sent to the Clickhouse process running on the same
35 | * node as the Spark Executor process, hence data are not transferred between nodes.
36 | * If a Clickhouse node fails or gets overloaded during read, the queries are retried
37 | * to a different node.
38 | *
39 | */
40 | class ClickhouseRDD
41 | (
42 |   @transient val sc: SparkContext,
43 |   val connector: ClickhouseConnector,
44 |   val query: String,
45 |   val connectorConf: ConnectorConf,
46 |   clickhousePartitioner: ClickhousePartitioner
47 | ) extends RDD[Row](sc, Seq.empty) with Logging {
48 | 
49 |   type Self = ClickhouseRDD
50 | 
51 |   //don't override partitioner
52 |   //@transient override val partitioner = Some(clickhousePartitioner)
53 | 
54 |   /** Allows copying this RDD while changing some of the properties */
55 |   protected def copy(
56 |     query: String = query,
57 |     connectorConf: ConnectorConf = connectorConf,
58 |     connector: ClickhouseConnector = connector,
59 |     clickhousePartitioner: ClickhousePartitioner = clickhousePartitioner
60 |   ): Self = {
61 | 
62 |     new ClickhouseRDD(
63 |       sc = sc,
64 |       connector = connector,
65 |       query = query,
66 |       connectorConf = connectorConf,
67 |       clickhousePartitioner = clickhousePartitioner
68 |     )
69 |   }
70 | 
71 |   def query(sql: String, cluster: String): Self = {
72 |     copy(query = sql) // the cluster argument is not used here; the target cluster is fixed by the connector
73 |   }
74 | 
75 |   /**
76 |    * Partitioning strategy that splits each shard into smaller parts by date range.
77 |    *
78 |    * @param startPeriod begin of the date range
79 |    * @param endPeriod   end of the date range
80 |    * @param rangeType   type of date range, e.g. Hour, Day.
81 |    * @param pk          name of the primary key
82 |    */
83 |   def withPeriod(startPeriod: DateTime, endPeriod: DateTime, rangeType: RangeType, pk: String = "dated"): Self = {
84 |     copy(
85 |       clickhousePartitioner = new DatedClickhousePartitioner(connector, (startPeriod, endPeriod), rangeType, pk)
86 |     )
87 |   }
88 | 
89 |   /**
90 |    * Base partitioning strategy: a single partition for each shard.
91 |    */
92 |   def withoutPartitioning(): Self = {
93 |     copy(
94 |       clickhousePartitioner = new SimpleClickhousePartitioner(connector)
95 |     )
96 |   }
97 | 
98 |   /**
99 |    * User-defined partitioning strategy.
100 |    *
101 |    * @param customPartitions sequence of partition predicates for splitting each shard into smaller parts.
102 |    */
103 |   def withCustomPartitioning(customPartitions: Seq[String]): Self = {
104 |     copy(
105 |       clickhousePartitioner = new CustomClickhousePartitioner(connector, customPartitions)
106 |     )
107 |   }
108 | 
109 |   override def compute(split: Partition, context: TaskContext): Iterator[Row] = {
110 | 
111 |     val partition = split.asInstanceOf[ClickhousePartition]
112 |     logDebug(s" Start computation for partition $partition")
113 |     val metricsUpdater = InputMetricsUpdater(context, connectorConf)
114 | 
115 |     val scanner = connector.execute(partition, query)
116 | 
117 |     val scannerWithMetrics = scanner.map(metricsUpdater.updateMetrics)
118 | 
119 |     context.addTaskCompletionListener[Unit] { context =>
120 |       val duration = metricsUpdater.finish() / 1000000000d
121 |       logInfo(f"Complete computation for partition $partition in $duration%.3f s.
Fetched ${scanner.count} rows") 122 | scanner.closeIfNeeded() 123 | } 124 | 125 | val rowFactory = ClickhouseRowFactory(scanner) 126 | 127 | scannerWithMetrics.map(rowFactory.create) 128 | } 129 | 130 | override def getPreferredLocations(split: Partition): Seq[String] = { 131 | logDebug(s"Get preferred locations for $split") 132 | val locations = split.asInstanceOf[ClickhousePartition].endpoints 133 | .flatMap(ev => NodeAddress.hostNames(ev)).toList 134 | 135 | logTrace(s" Locations:\n${locations.mkString("\n")}") 136 | locations 137 | } 138 | 139 | override protected def getPartitions: Array[Partition] = { 140 | 141 | val partitions = clickhousePartitioner.partitions 142 | 143 | logDebug(s"Created total ${partitions.length} partitions for datasource with $clickhousePartitioner.") 144 | logTrace("Partitions: \n" + partitions.mkString("\n")) 145 | partitions 146 | } 147 | } 148 | -------------------------------------------------------------------------------- /spark-clickhouse-connector/src/main/scala/io/clickhouse/spark/connector/ConnectorConf.scala: -------------------------------------------------------------------------------- 1 | package io.clickhouse.spark.connector 2 | 3 | import org.apache.spark.SparkConf 4 | 5 | case class ConnectorConf(сlickhouseDriver: String = ConnectorConf.DefaultClickhouseDriver, 6 | сlickhouseUrl: String = ConnectorConf.DefaultClickhouseUrl, 7 | maxConnectionsPerExecutor: Int = ConnectorConf.DefaultMaxConnectionsPerExecutor, 8 | clickhouseMetricsEnable: Boolean = ConnectorConf.DefaultClickhouseMetricsEnable, 9 | сlickhouseSocketTimeoutMs: Int = ConnectorConf.DefaultClickhouseSocketTimeoutMs, 10 | clickhouseAutoDiscoveryEnable: Boolean = ConnectorConf.DefaultClickhouseAutoDiscoveryEnable, 11 | clickhousePortDefault: Int = ConnectorConf.DefaultClickhousePortDefault, 12 | clickhouseUser: String = ConnectorConf.DefaultClickhouseUser, 13 | clickhousePassword: String = ConnectorConf.DefaultClickhousePassword 14 | ) 15 | 16 | object ConnectorConf { 17 | 18 | val ClickhouseDriverProperty = "spark.clickhouse.driver" 19 | val ClickhouseUrlProperty = "spark.clickhouse.url" 20 | val ClickhouseUserProperty = "spark.clickhouse.user" 21 | val ClickhousePasswordProperty = "spark.clickhouse.password" 22 | val ClickhouseAutoDiscoveryProperty = "spark.clickhouse.cluster.auto-discovery" 23 | val ClickhouseHttpPortDefaultProperty = "spark.clickhouse.cluster.port.default" //is used with auto-discovery options 24 | val ClickhouseSocketTimeoutProperty = "spark.clickhouse.socket.timeout.ms" 25 | val MaxConnectionsPerExecutorProperty = "spark.clickhouse.connection.per.executor.max" 26 | val ClickhouseMetricsEnableProperty = "spark.clickhouse.metrics.enable" 27 | 28 | val DefaultClickhouseDriver = "ru.yandex.clickhouse.ClickHouseDriver" 29 | val DefaultClickhouseUrl = "jdbc:clickhouse://127.0.0.1:8123" 30 | val DefaultMaxConnectionsPerExecutor: Int = 1 31 | val DefaultClickhouseSocketTimeoutMs: Int = 60000 32 | val DefaultClickhouseMetricsEnable: Boolean = false 33 | val DefaultClickhouseAutoDiscoveryEnable: Boolean = false 34 | val DefaultClickhousePortDefault: Int = 8123 35 | 36 | val DefaultClickhouseUser: String = null 37 | val DefaultClickhousePassword: String = null 38 | 39 | 40 | def fromSparkConf(conf: SparkConf): ConnectorConf = { 41 | 42 | 43 | ConnectorConf( 44 | сlickhouseDriver = conf.get(ClickhouseDriverProperty, DefaultClickhouseDriver), 45 | сlickhouseUrl = conf.get(ClickhouseUrlProperty, DefaultClickhouseUrl), 46 | maxConnectionsPerExecutor = 
conf.getInt(MaxConnectionsPerExecutorProperty, DefaultMaxConnectionsPerExecutor), 47 | clickhouseMetricsEnable = conf.getBoolean(ClickhouseMetricsEnableProperty, DefaultClickhouseMetricsEnable), 48 | сlickhouseSocketTimeoutMs = conf.getInt(ClickhouseSocketTimeoutProperty, DefaultClickhouseSocketTimeoutMs), 49 | clickhouseAutoDiscoveryEnable = conf.getBoolean(ClickhouseAutoDiscoveryProperty, DefaultClickhouseAutoDiscoveryEnable), 50 | clickhousePortDefault = conf.getInt(ClickhouseHttpPortDefaultProperty, DefaultClickhousePortDefault), 51 | clickhouseUser = conf.get(ClickhouseUserProperty, DefaultClickhouseUser), 52 | clickhousePassword = conf.get(ClickhousePasswordProperty, DefaultClickhousePassword) 53 | ) 54 | } 55 | 56 | } 57 | 58 | -------------------------------------------------------------------------------- /spark-clickhouse-connector/src/main/scala/io/clickhouse/spark/connector/SparkClickhouseFunctions.scala: -------------------------------------------------------------------------------- 1 | package io.clickhouse.spark.connector 2 | 3 | import io.clickhouse.spark.connector.partitioner.SimpleClickhousePartitioner 4 | import org.apache.spark.SparkContext 5 | 6 | /** Provides Clickhouse-specific methods on org.apache.spark.SparkContext SparkContext */ 7 | class SparkClickhouseFunctions(@transient val sc: SparkContext) extends Serializable { 8 | 9 | def clickhouseTable(query: String, cluster: String) 10 | (implicit connector: ClickhouseConnector = ClickhouseConnector(sc, Some(cluster)), 11 | readConf: ConnectorConf = ConnectorConf.fromSparkConf(sc.getConf) 12 | ) = new ClickhouseRDD(sc, connector, query, readConf, SimpleClickhousePartitioner(connector)) 13 | 14 | /** 15 | * Used for clickhouse installation without 'cluster' option e.g. single server installation. 16 | * It's assumed, that all hosts in datasource are single shard and contains the same data. 
17 | */ 18 | def clickhouseTableWithoutCluster(query: String) 19 | (implicit connector: ClickhouseConnector = ClickhouseConnector(sc, None), 20 | readConf: ConnectorConf = ConnectorConf.fromSparkConf(sc.getConf) 21 | ) = new ClickhouseRDD(sc, connector, query, readConf, SimpleClickhousePartitioner(connector)) 22 | 23 | } 24 | -------------------------------------------------------------------------------- /spark-clickhouse-connector/src/main/scala/io/clickhouse/spark/connector/TableScanner.scala: -------------------------------------------------------------------------------- 1 | package io.clickhouse.spark.connector 2 | 3 | import java.sql.{PreparedStatement, ResultSet} 4 | 5 | import io.clickhouse.spark.connection.{ConnectionPooledDBUrl, JdbcConnection} 6 | import org.apache.spark.TableIterator 7 | import org.apache.spark.internal.Logging 8 | 9 | class TableScanner(connectionPool: ConnectionPooledDBUrl, 10 | connection: JdbcConnection, 11 | statement: PreparedStatement) extends TableIterator[ResultSet] with Logging { 12 | 13 | private var _count = 0 14 | 15 | val resultSet: ResultSet = statement.executeQuery() 16 | 17 | /** Returns the number of successful invocations of `next` */ 18 | def count: Int = _count 19 | 20 | override protected def getNext: ResultSet = { 21 | if (resultSet.next()) { 22 | _count += 1 23 | resultSet 24 | } else { 25 | finished = true 26 | null.asInstanceOf[ResultSet] 27 | } 28 | } 29 | 30 | def close(): Unit = { 31 | 32 | try { 33 | if (null != resultSet) { 34 | resultSet.close() 35 | } 36 | } catch { 37 | case e: Exception => logWarning("Exception closing resultset", e) 38 | } 39 | try { 40 | if (null != statement) { 41 | statement.close() 42 | } 43 | } catch { 44 | case e: Exception => logWarning("Exception closing statement", e) 45 | } 46 | try { 47 | if (null != connection) { 48 | connectionPool.releaseConnection(connection) 49 | } 50 | logDebug(s"release connection: ${connection.shard}") 51 | } catch { 52 | case e: Exception => logWarning(s"Exception releasing connection: ${connection.shard}", e) 53 | } 54 | } 55 | 56 | } 57 | 58 | -------------------------------------------------------------------------------- /spark-clickhouse-connector/src/main/scala/io/clickhouse/spark/connector/package.scala: -------------------------------------------------------------------------------- 1 | package io.clickhouse.spark 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | package object connector { 6 | implicit def toSparkClickhouseFunctions(sc: SparkContext): SparkClickhouseFunctions = 7 | new SparkClickhouseFunctions(sc) 8 | } 9 | -------------------------------------------------------------------------------- /spark-clickhouse-connector/src/main/scala/io/clickhouse/spark/connector/partitioner/ClickhousePartition.scala: -------------------------------------------------------------------------------- 1 | package io.clickhouse.spark.connector.partitioner 2 | 3 | import java.net.InetAddress 4 | 5 | import org.apache.spark.Partition 6 | import org.joda.time.{DateTime, Days, Hours} 7 | 8 | sealed trait RangeType {} 9 | 10 | object RangeType { 11 | 12 | case object HOUR extends RangeType 13 | 14 | case object DAY extends RangeType 15 | 16 | } 17 | 18 | case class DateRange(dated: DateTime, 19 | rType: RangeType, 20 | pk: String // primary key name for partitioning 21 | ) { 22 | 23 | def sql(): String = { 24 | if (rType == RangeType.HOUR) 25 | s"$pk = '${dated.hourOfDay.roundFloorCopy.toString("yyyy-MM-dd HH:mm:ss")}'" 26 | else 27 | s"$pk = 
'${dated.toString("yyyy-MM-dd")}'" 28 | } 29 | 30 | override def toString: String = { 31 | if (rType == RangeType.HOUR) 32 | s"DateRange(${dated.toString("yyyy-MM-dd HH")})" 33 | else 34 | s"DateRange(${dated.toString("yyyy-MM-dd")})" 35 | } 36 | } 37 | 38 | trait EndpointPartition extends Partition { 39 | def endpoints: Iterable[InetAddress] 40 | } 41 | 42 | case class ClickhousePartition( 43 | index: Int, 44 | shardId: Int, 45 | endpoints: Iterable[InetAddress], 46 | partitionSplit: Option[String] //addition primary key clause for spark partition splitting. 47 | ) extends EndpointPartition 48 | 49 | 50 | object DateRange { 51 | 52 | def range(startDate: DateTime, endDate: DateTime, rType: RangeType): Seq[DateTime] = { 53 | 54 | if (rType == RangeType.DAY) 55 | rangeByDay(startDate, endDate) 56 | else 57 | rangeByHour(startDate, endDate) 58 | } 59 | 60 | def rangeByHour(startDate: DateTime, endDate: DateTime): Seq[DateTime] = { 61 | 62 | val hours = Hours.hoursBetween( 63 | startDate.hourOfDay().roundFloorCopy(), 64 | endDate.hourOfDay().roundFloorCopy().plus(1) 65 | ).getHours 66 | (0 to hours).map(i => startDate.plusHours(i)).toList 67 | } 68 | 69 | def rangeByDay(startDate: DateTime, endDate: DateTime): Seq[DateTime] = { 70 | 71 | val days = Days.daysBetween( 72 | startDate.withTimeAtStartOfDay(), 73 | endDate.withTimeAtStartOfDay().plus(1) 74 | ).getDays 75 | (0 to days).map(i => startDate.plusDays(i)) 76 | } 77 | } -------------------------------------------------------------------------------- /spark-clickhouse-connector/src/main/scala/io/clickhouse/spark/connector/partitioner/ClickhousePartitioner.scala: -------------------------------------------------------------------------------- 1 | package io.clickhouse.spark.connector.partitioner 2 | 3 | import io.clickhouse.spark.connector.ClickhouseConnector 4 | import org.apache.spark.Partition 5 | import org.joda.time.DateTime 6 | 7 | trait ClickhousePartitioner extends Serializable { 8 | 9 | val partitions: Array[Partition] 10 | 11 | def numPartitions: Int = partitions.length 12 | } 13 | 14 | /** 15 | * Partitioner that provides functionality for splitting shard into small partitions by date range. 16 | * Supported several range types e.g. 
Daily, Hourly 17 | */ 18 | class DatedClickhousePartitioner(connector: ClickhouseConnector, 19 | dated: (DateTime, DateTime), 20 | rangeType: RangeType, 21 | primaryKeyName: String 22 | ) extends SupportPartitionReplica with ClickhousePartitioner { 23 | 24 | 25 | override val partitions: Array[Partition] = { 26 | 27 | var i = -1 28 | for (date <- DateRange.range(dated._1, dated._2, rangeType)) yield { 29 | i += 1 30 | for (source <- connector.dataSource) yield { 31 | val rotatedHosts = rotateRight(source._2, i) 32 | val shardId = source._1 33 | // partition index will be set later 34 | ClickhousePartition(0, shardId, rotatedHosts, Some(DateRange(date, rangeType, primaryKeyName).sql())) 35 | } 36 | } 37 | }.flatMap(_.seq) 38 | .zipWithIndex 39 | .map { case (p, index) => p.copy(index = index) } 40 | .toArray[Partition] 41 | 42 | override def toString: String = s"DatedPartitioner with period ${dated._1} - ${dated._2} by $rangeType " 43 | } 44 | 45 | /** 46 | * Partitioner that provides functionality for splitting RDD with partitions by shards 47 | */ 48 | class SimpleClickhousePartitioner(connector: ClickhouseConnector) extends ClickhousePartitioner { 49 | 50 | override val partitions: Array[Partition] = (for { 51 | source <- connector.dataSource 52 | } yield { 53 | 54 | val shardId = source._1 55 | val hosts = source._2 56 | // partition index will be set later 57 | ClickhousePartition(0, shardId, hosts, None) 58 | }).zipWithIndex 59 | .map { case (p, index) => p.copy(index = index) } 60 | .toArray[Partition] 61 | 62 | override def toString: String = s"SimplePartitioner" 63 | } 64 | 65 | /** 66 | * Partitioner with custom split strategy for each shard 67 | */ 68 | class CustomClickhousePartitioner(connector: ClickhouseConnector, 69 | partitionSeq: Seq[String] 70 | ) extends SupportPartitionReplica with ClickhousePartitioner { 71 | 72 | override val partitions: Array[Partition] = { 73 | 74 | for (source <- connector.dataSource) yield { // (shard_num, List(InetAddress)) 75 | 76 | var i = 0 77 | for (part <- partitionSeq) yield { 78 | 79 | val rotatedHosts = rotateRight(source._2, i) 80 | val shardId = source._1 81 | i += 1 82 | // partition index will be set later 83 | ClickhousePartition(0, shardId, rotatedHosts, Some(part)) 84 | } 85 | } 86 | }.flatMap(_.seq) 87 | .zipWithIndex 88 | .map { case (p, index) => p.copy(index = index) } 89 | .toArray[Partition] 90 | 91 | override def toString: String = s"CustomClickhousePartitioner" 92 | } 93 | 94 | object SimpleClickhousePartitioner { 95 | def apply(connector: ClickhouseConnector): SimpleClickhousePartitioner = new SimpleClickhousePartitioner(connector) 96 | } 97 | 98 | /** Support replica placement */ 99 | abstract class SupportPartitionReplica { 100 | 101 | /** circular shift of a Scala collection */ 102 | def rotateRight[A](seq: Seq[A], i: Int): Seq[A] = { 103 | 104 | val size = seq.size 105 | if (i != 0 && size > 1) { 106 | seq.drop(size - (i % size)) ++ seq.take(size - (i % size)) 107 | } 108 | else { 109 | seq.toList 110 | } 111 | } 112 | 113 | 114 | } 115 | 116 | 117 | -------------------------------------------------------------------------------- /spark-clickhouse-connector/src/main/scala/io/clickhouse/spark/connector/partitioner/NodeAddress.scala: -------------------------------------------------------------------------------- 1 | package io.clickhouse.spark.connector.partitioner 2 | 3 | import java.net.InetAddress 4 | 5 | import scala.collection.concurrent.TrieMap 6 | 7 | object NodeAddress { 8 | 9 | private val addressCache = new 
TrieMap[InetAddress, Set[String]] 10 | 11 | /** Returns the set of IP addresses and host names that identify a node. 12 | * Useful for giving Spark the list of preferred nodes for the Spark partition. */ 13 | def hostNames(rpcAddress: InetAddress): Set[String] = { 14 | 15 | addressCache.get(rpcAddress) match { 16 | case Some(value) => 17 | value 18 | case None => 19 | val address = Set( 20 | rpcAddress.getHostAddress, 21 | rpcAddress.getHostName 22 | ) 23 | addressCache.putIfAbsent(rpcAddress, address) 24 | address 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /spark-clickhouse-connector/src/main/scala/io/clickhouse/spark/connector/partitioner/PartitionQuery.scala: -------------------------------------------------------------------------------- 1 | package io.clickhouse.spark.connector.partitioner 2 | 3 | object PartitionQuery { 4 | 5 | def queryForPartition(query: String, partition: ClickhousePartition): String = { 6 | 7 | partition.partitionSplit match { 8 | case Some(split: String) => s"$query and $split" 9 | case None => query 10 | } 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /spark-clickhouse-connector/src/main/scala/io/clickhouse/spark/sql/RowReaderFactory.scala: -------------------------------------------------------------------------------- 1 | package io.clickhouse.spark.sql 2 | 3 | import java.sql.ResultSet 4 | 5 | import org.apache.spark.sql.Row 6 | 7 | trait RowReaderFactory { 8 | 9 | def create(rs: ResultSet): Row 10 | } 11 | -------------------------------------------------------------------------------- /spark-clickhouse-connector/src/main/scala/org/apache/spark/TableIterator.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark 2 | 3 | import org.apache.spark.util.NextIterator 4 | 5 | /** Provides a basic/boilerplate Iterator implementation. 6 | * Lives in the org.apache.spark package to access the private[spark] NextIterator. */ 7 | abstract class TableIterator[U] extends NextIterator[U] { 8 | 9 | } 10 | -------------------------------------------------------------------------------- /spark-clickhouse-connector/src/main/scala/org/apache/spark/metrics/clickhouse/ClickhouseYandexRowMeter.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.metrics.clickhouse 2 | 3 | import java.sql.ResultSet 4 | 5 | import ru.yandex.clickhouse.response.ClickHouseResultSet 6 | 7 | /** Class that provides a method to calculate the row size from the Yandex driver's ResultSet. */ 8 | class ClickhouseYandexRowMeter extends JdbcRowMeter { 9 | 10 | def sizeOf(resultSet: ResultSet): Int = { 11 | resultSet.asInstanceOf[ClickHouseResultSet].getValues.map(_.getLen).sum 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /spark-clickhouse-connector/src/main/scala/org/apache/spark/metrics/clickhouse/InputMetricsUpdater.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.metrics.clickhouse 2 | 3 | import java.sql.ResultSet 4 | 5 | import io.clickhouse.spark.connector.ConnectorConf 6 | import org.apache.spark.TaskContext 7 | import org.apache.spark.executor.InputMetrics 8 | 9 | /** A trait that provides a method to update read metrics collected for connector-related tasks. 10 | * The appropriate instance is created by the companion object. 
11 | * 12 | */ 13 | trait InputMetricsUpdater { 14 | /** Updates the metrics being collected for the connector after reading each row. This method 15 | * is not thread-safe. 16 | * 17 | * @param row the row which has just been read 18 | */ 19 | def updateMetrics(row: ResultSet): ResultSet = row 20 | 21 | def finish(): Long 22 | 23 | private[metrics] def updateTaskMetrics(count: Int, dataLength: Int): Unit = {} 24 | 25 | } 26 | 27 | trait JdbcRowMeter { 28 | def sizeOf(rs: ResultSet): Int 29 | } 30 | 31 | object InputMetricsUpdater { 32 | 33 | /** Creates the appropriate instance of `InputMetricsUpdater`. 34 | * 35 | * The created instance updates task metrics so 36 | * that Spark reports them in the UI. Note that this is supported for Spark 1.2+. 37 | * 38 | */ 39 | def apply( 40 | taskContext: TaskContext, 41 | conf: ConnectorConf 42 | ): InputMetricsUpdater = { 43 | 44 | val tm = taskContext.taskMetrics() 45 | val inputMetrics = tm.inputMetrics 46 | 47 | if (conf.clickhouseMetricsEnable) { 48 | 49 | val jdbcRowMeter: JdbcRowMeter = 50 | if (conf.сlickhouseDriver == ConnectorConf.DefaultClickhouseDriver) { 51 | // metrics are supported for the Yandex driver only 52 | new ClickhouseYandexRowMeter 53 | } else { 54 | null 55 | } 56 | new ClickhouseInputMetricsUpdater(jdbcRowMeter, inputMetrics) 57 | } 58 | else { 59 | new StubInputMetricsUpdater 60 | } 61 | } 62 | 63 | trait Timer { 64 | private val startTime = System.nanoTime() 65 | 66 | def stopTimer(): Long = System.nanoTime() - startTime 67 | } 68 | 69 | private class ClickhouseInputMetricsUpdater(rowMeter: JdbcRowMeter, inputMetrics: InputMetrics) 70 | extends InputMetricsUpdater with Timer { 71 | 72 | def getRowBinarySize(row: ResultSet): Int = { 73 | 74 | if (rowMeter != null) { 75 | rowMeter.sizeOf(row) 76 | } 77 | else { 78 | 0 79 | } 80 | } 81 | 82 | override def updateMetrics(row: ResultSet): ResultSet = { 83 | val binarySize = getRowBinarySize(row) 84 | 85 | updateTaskMetrics(1, binarySize) 86 | 87 | row 88 | } 89 | 90 | def finish(): Long = stopTimer() 91 | 92 | override def updateTaskMetrics(count: Int, dataLength: Int): Unit = { 93 | inputMetrics.incBytesRead(dataLength) 94 | inputMetrics.incRecordsRead(count) 95 | } 96 | 97 | } 98 | 99 | /** The implementation of [[InputMetricsUpdater]] which does not update anything. 
*/ 100 | private class StubInputMetricsUpdater extends InputMetricsUpdater with Timer { 101 | def finish(): Long = stopTimer() 102 | } 103 | 104 | } -------------------------------------------------------------------------------- /spark-clickhouse-connector/src/main/scala/org/apache/spark/sql/ClickhouseRow.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql 2 | 3 | import java.sql.{ResultSet, ResultSetMetaData} 4 | 5 | import io.clickhouse.spark.connector.TableScanner 6 | import io.clickhouse.spark.sql.RowReaderFactory 7 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 8 | import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils 9 | import org.apache.spark.sql.jdbc.JdbcDialects 10 | import org.apache.spark.sql.types.StructType 11 | 12 | 13 | final class ClickhouseRow(values: Array[Any], 14 | override val schema: StructType 15 | ) 16 | extends GenericRowWithSchema(values, schema) { 17 | } 18 | 19 | 20 | class ClickhouseRowFactory(metadata: StructType) extends RowReaderFactory { 21 | 22 | private def resultSetToObjectArray(rs: ResultSet): Array[Any] = { 23 | Array.tabulate[Any](rs.getMetaData.getColumnCount)(i => rs.getObject(i + 1)) 24 | } 25 | 26 | def create(rs: ResultSet): ClickhouseRow = { 27 | new ClickhouseRow(resultSetToObjectArray(rs), metadata) 28 | } 29 | } 30 | 31 | object ClickhouseRowFactory { 32 | 33 | def apply(scanner: TableScanner): ClickhouseRowFactory = 34 | new ClickhouseRowFactory( 35 | JdbcUtils.getSchema(scanner.resultSet, JdbcDialects.get("jdbc:clickhouse"))) 36 | } 37 | --------------------------------------------------------------------------------
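
How the partitioner pieces fit together: a partitioner turns the cluster layout and an optional date range into `ClickhousePartition`s, and `PartitionQuery.queryForPartition` appends each partition's split clause to the base query before it is executed on one of the shard's replicas. The sketch below illustrates that flow under stated assumptions: the shard addresses, the `event_date` primary-key column and the base query are invented for illustration, and the real connector obtains the shard layout from `ClickhouseConnector.dataSource` rather than from a hard-coded map.

```scala
// Illustrative sketch only -- not part of the repository sources.
import java.net.InetAddress

import org.joda.time.DateTime

import io.clickhouse.spark.connector.partitioner._

object PartitioningSketch extends SupportPartitionReplica {

  def main(args: Array[String]): Unit = {
    // Hypothetical cluster layout: two shards with two replicas each.
    // The real layout comes from ClickhouseConnector.dataSource.
    val shards: Map[Int, Seq[InetAddress]] = Map(
      1 -> Seq(InetAddress.getByName("10.0.0.1"), InetAddress.getByName("10.0.0.2")),
      2 -> Seq(InetAddress.getByName("10.0.1.1"), InetAddress.getByName("10.0.1.2"))
    )

    val start = new DateTime(2020, 1, 1, 0, 0)
    val end   = new DateTime(2020, 1, 3, 0, 0)

    // One partition per (day, shard); the replica list is rotated per date range
    // so that consecutive splits of the same shard prefer different replicas.
    val partitions =
      for {
        (date, i)           <- DateRange.range(start, end, RangeType.DAY).zipWithIndex
        (shardId, replicas) <- shards
      } yield ClickhousePartition(
        index = 0, // assigned properly below, as the partitioners above do
        shardId = shardId,
        endpoints = rotateRight(replicas, i),
        partitionSplit = Some(DateRange(date, RangeType.DAY, "event_date").sql())
      )

    // The split clause is simply appended to the base query with "and".
    val baseQuery = "select count() from events where 1 = 1"
    partitions.zipWithIndex.foreach { case (p, idx) =>
      println(PartitionQuery.queryForPartition(baseQuery, p.copy(index = idx)))
    }
  }
}
```

Rotating the replica list per split spreads the preferred endpoints across the replicas of a shard instead of always reading from the first one, which is the purpose of `SupportPartitionReplica.rotateRight` in the partitioners shown above.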