├── .gitignore ├── LICENSE ├── README.md ├── build.sbt ├── project ├── build.properties ├── build.sbt └── plugins.sbt ├── src ├── it │ ├── resources │ │ └── log4j.properties │ └── scala │ │ └── com │ │ └── miraisolutions │ │ └── spark │ │ └── bigquery │ │ ├── DirectWriteAndReadSpec.scala │ │ ├── ParquetWriteDirectReadSpec.scala │ │ └── test │ │ ├── BigQueryConfiguration.scala │ │ ├── BigQueryTesting.scala │ │ ├── data │ │ ├── DataFrameGenerator.scala │ │ └── TestData.scala │ │ └── package.scala └── main │ ├── resources │ └── META-INF │ │ └── services │ │ ├── org.apache.hadoop.fs.FileSystem │ │ └── org.apache.spark.sql.sources.DataSourceRegister │ └── scala │ └── com │ └── miraisolutions │ └── spark │ └── bigquery │ ├── BigQueryPartition.scala │ ├── BigQueryRowRDD.scala │ ├── BigQuerySchemaConverter.scala │ ├── BigQueryTableReference.scala │ ├── BigQueryTableRelation.scala │ ├── DefaultSource.scala │ ├── FileFormat.scala │ ├── client │ ├── BigQueryClient.scala │ ├── BigQueryTableReader.scala │ └── package.scala │ ├── config │ ├── BigQueryConfig.scala │ └── package.scala │ ├── examples │ └── Shakespeare.scala │ ├── exception │ ├── IOException.scala │ ├── MissingParameterException.scala │ ├── ParseException.scala │ └── UnsupportedFormatException.scala │ ├── sql │ ├── BigQueryDialect.scala │ └── BigQuerySqlGeneration.scala │ └── utils │ ├── DateTime.scala │ ├── Files.scala │ ├── SqlLogger.scala │ └── format │ ├── FormatConverter.scala │ ├── Generic.scala │ ├── Parquet.scala │ └── package.scala └── version.sbt /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | lib_managed 3 | project/project 4 | project/target 5 | target 6 | derby.log 7 | logs 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Mirai Solutions GmbH 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # [DEPRECATED] spark-bigquery: A Google BigQuery Data Source for Apache Spark 2 | 3 | ## Deprecation Notice 4 | 5 | This project has been deprecated in favor of the official 6 | [Apache Spark SQL connector for Google BigQuery](https://github.com/GoogleCloudDataproc/spark-bigquery-connector). 
7 | 8 | 9 | ## Overview 10 | 11 | This project provides a [Google BigQuery](https://cloud.google.com/bigquery/) data source (`com.miraisolutions.spark.bigquery.DefaultSource`) to [Apache Spark](https://spark.apache.org/) using the new [Google Cloud client libraries](https://cloud.google.com/bigquery/docs/reference/libraries) for the Google BigQuery API. It supports "direct" import/export where records are directly streamed from/to BigQuery. In addition, data may be imported/exported via intermediate data extracts on [Google Cloud Storage](https://cloud.google.com/storage/) (GCS). Note that when using "direct" (streaming) export, data may not be immediately available for further querying/processing in BigQuery. It may take several minutes for streamed records to become "available". See the following resources for more information: 12 | 13 | * https://cloud.google.com/bigquery/streaming-data-into-bigquery 14 | * https://cloud.google.com/blog/big-data/2017/06/life-of-a-bigquery-streaming-insert 15 | 16 | The following import/export combinations are currently supported: 17 | 18 | | | Direct | Parquet | Avro | ORC | JSON | CSV | 19 | | -------------------------------------- | ------------------ | ------------------ | ------------------ | ------------------ | ------------------ | ------------------ | 20 | | Import to Spark (export from BigQuery) | :heavy_check_mark: | :x: | :heavy_check_mark: | :x: | :heavy_check_mark: | :heavy_check_mark: | 21 | | Export from Spark (import to BigQuery) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :x: | :x: | 22 | 23 | 24 | More information on the various supported formats can be found at: 25 | 26 | * Parquet: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-parquet 27 | * Avro: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-avro 28 | * ORC: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-orc 29 | * JSON: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-json 30 | * CSV: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-csv 31 | 32 | CSV and JSON are not recommended as data exchange formats between Apache Spark and BigQuery due to their lack of type safety. Better options are direct import/export, Parquet, Avro and ORC. 33 | 34 | 35 | This data source is used in the [sparkbq](https://github.com/miraisolutions/sparkbq) R package. 36 | 37 | ## Building 38 | 39 | Due to dependency version mismatches between Apache Spark and Google client libraries (e.g. Google Guava) this project uses [`sbt-assembly`](https://github.com/sbt/sbt-assembly) to build a fat JAR using [shading](https://github.com/sbt/sbt-assembly#shading) to relocate relevant Google classes. 
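As a condensed sketch of what this shading involves (the complete set of rules lives in `build.sbt`, reproduced further below):

```scala
// build.sbt (excerpt, sketch only) – relocate Google classes so they don't clash
// with the versions already deployed on Google Cloud Dataproc
assemblyShadeRules in assembly := Seq(
  ShadeRule.rename("com.google.**" -> "shadegoogle.@1").inAll
)
```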
40 | 41 | ## Version Information 42 | 43 | The following table provides an overview of the supported versions of Apache Spark, Scala and [Google Dataproc](https://cloud.google.com/dataproc/docs/concepts/versioning/dataproc-versions): 44 | 45 | | spark-bigquery | Spark | Scala | Google Dataproc | 46 | | :------------: | --------------- | ----- | --------------- | 47 | | 0.1.x | 2.2.x and 2.3.x | 2.11 | 1.2.x and 1.3.x | 48 | 49 | ## Example 50 | 51 | The provided Google BigQuery data source (`com.miraisolutions.spark.bigquery.DefaultSource`) can be used as follows: 52 | 53 | ```scala 54 | import org.apache.spark.sql.{SaveMode, SparkSession} 55 | import com.miraisolutions.spark.bigquery.config._ 56 | 57 | // Initialize Spark session 58 | val spark = SparkSession 59 | .builder 60 | .appName("Google BigQuery Shakespeare") 61 | .getOrCreate 62 | 63 | import spark.implicits._ 64 | 65 | // Define BigQuery options 66 | val config = BigQueryConfig( 67 | project = "", // Google BigQuery billing project ID 68 | location = "", // Google BigQuery dataset location 69 | stagingDataset = StagingDatasetConfig( 70 | gcsBucket = "" // Google Cloud Storage bucket for staging files 71 | ), 72 | // Google Cloud service account key file - works only in local cluster mode 73 | serviceAccountKeyFile = if(args.length > 3) Some(args(3)) else None 74 | ) 75 | 76 | // Read public shakespeare data table using direct import (streaming) 77 | val shakespeare = spark.read 78 | .bigquery(config) 79 | .option("table", "bigquery-public-data.samples.shakespeare") 80 | .option("type", "direct") 81 | .load() 82 | 83 | val hamlet = shakespeare.filter($"corpus".like("hamlet")) 84 | hamlet.show(100) 85 | 86 | shakespeare.createOrReplaceTempView("shakespeare") 87 | val macbeth = spark.sql("SELECT * FROM shakespeare WHERE corpus = 'macbeth'").persist() 88 | macbeth.show(100) 89 | 90 | // Write filtered data table via a Parquet export on GCS 91 | macbeth.write 92 | .bigquery(config) 93 | .option("table", ".samples.macbeth") 94 | .option("type", "parquet") 95 | .mode(SaveMode.Overwrite) 96 | .save() 97 | ``` 98 | 99 | You can find a complete example at `com.miraisolutions.spark.bigquery.examples.Shakespeare`. 100 | 101 | To run this example, first compile and assemble the fat JAR using `sbt assembly`. Then run: 102 | 103 | **Local Spark Cluster** 104 | 105 | `spark-submit --class com.miraisolutions.spark.bigquery.examples.Shakespeare --master local[*] target/scala-2.11/spark-bigquery-assembly-.jar ` 106 | 107 | **[Google Cloud Dataproc](https://cloud.google.com/dataproc/)** 108 | 109 | Log in to the service account (it needs permissions to access all required resources): 110 | 111 | `gcloud auth activate-service-account --key-file=[KEY-FILE]` 112 | 113 | Run the Spark job: 114 | 115 | `gcloud dataproc jobs submit spark --cluster --class com.miraisolutions.spark.bigquery.examples.Shakespeare --jars target/scala-2.11/spark-bigquery-assembly-.jar -- ` 116 | 117 | where `` are: 118 | 1. Google BigQuery billing project ID 119 | 2. Google BigQuery dataset location (EU, US) 120 | 3. Google Cloud Storage (GCS) bucket where staging files will be located 121 | 122 | ## Using the spark-bigquery Spark package 123 | 124 | spark-bigquery is available as a Spark package from https://spark-packages.org/package/miraisolutions/spark-bigquery and as such via the Maven coordinates `miraisolutions:spark-bigquery:`. You can simply specify the appropriate Maven coordinates with the `--packages` option when using the Spark shell or when using `spark-submit`.
125 | 126 | ### Using the Spark Shell 127 | 128 | `spark-shell --master local[*] --packages miraisolutions:spark-bigquery:` 129 | 130 | ```scala 131 | import com.miraisolutions.spark.bigquery.config._ 132 | 133 | // Define BigQuery options 134 | val config = BigQueryConfig( 135 | project = "", 136 | location = "US", 137 | stagingDataset = StagingDatasetConfig( 138 | gcsBucket = "" 139 | ), 140 | serviceAccountKeyFile = Some("") 141 | ) 142 | 143 | val shakespeare = spark.read 144 | .bigquery(config) 145 | .option("table", "bigquery-public-data.samples.shakespeare") 146 | .option("type", "direct") 147 | .load() 148 | 149 | shakespeare.show() 150 | ``` 151 | 152 | ### Using PySpark 153 | 154 | `pyspark --master local[*] --packages miraisolutions:spark-bigquery:` 155 | 156 | ```python 157 | shakespeare = spark.read \ 158 | .format("bigquery") \ 159 | .option("bq.project", "") \ 160 | .option("bq.location", "US") \ 161 | .option("bq.service_account_key_file", "") \ 162 | .option("bq.staging_dataset.gcs_bucket", "") \ 163 | .option("table", "bigquery-public-data.samples.shakespeare") \ 164 | .option("type", "direct") \ 165 | .load() 166 | 167 | shakespeare.show() 168 | ``` 169 | 170 | ### Using `spark-submit` 171 | 172 | Assume the following Spark application, which has been compiled into a JAR named `Shakespeare.jar` (you may want to use something like [Holden Karau's Giter8 Spark project template](https://github.com/holdenk/sparkProjectTemplate.g8) for this): 173 | 174 | ```scala 175 | package com.example 176 | 177 | import org.apache.spark.sql.SparkSession 178 | import com.miraisolutions.spark.bigquery.config._ 179 | 180 | object Shakespeare { 181 | 182 | def main(args: Array[String]): Unit = { 183 | 184 | // Initialize Spark session 185 | val spark = SparkSession 186 | .builder 187 | .appName("Google BigQuery Shakespeare") 188 | .getOrCreate 189 | 190 | // Define BigQuery options 191 | val config = BigQueryConfig( 192 | project = "", 193 | location = "US", 194 | stagingDataset = StagingDatasetConfig( 195 | gcsBucket = "" 196 | ), 197 | serviceAccountKeyFile = Some("") 198 | ) 199 | 200 | // Read public shakespeare data table using direct import (streaming) 201 | val shakespeare = spark.read 202 | .bigquery(config) 203 | .option("table", "bigquery-public-data.samples.shakespeare") 204 | .option("type", "direct") 205 | .load() 206 | 207 | shakespeare.show() 208 | 209 | } 210 | 211 | } 212 | ``` 213 | 214 | You can run this application using `spark-submit` in the following way: 215 | 216 | `spark-submit --class com.example.Shakespeare --master local[*] --packages miraisolutions:spark-bigquery: Shakespeare.jar` 217 | 218 | 219 | ### Using `gcloud dataproc jobs submit` 220 | 221 | Log in to the service account (it needs permissions to access all required resources): 222 | 223 | `gcloud auth activate-service-account --key-file=[KEY-FILE]` 224 | 225 | Similar to the `spark-submit` example above, the Spark application can be submitted to Google Dataproc using: 226 | 227 | `gcloud dataproc jobs submit spark --cluster --class com.example.Shakespeare --jars Shakespeare.jar --properties "spark.jars.packages=miraisolutions:spark-bigquery:"` 228 | 229 | When running on Dataproc, you may choose not to specify a service account key file and use application default credentials instead. 230 | 231 | 232 | ## Configuration 233 | 234 | The three main Spark read/write options are: 235 | 236 | * `table`: The BigQuery table to read/write. To be specified in the form `[projectId].[datasetId].[tableId]`.
One of `table` or `sqlQuery` must be specified. 237 | * `sqlQuery`: A SQL query in Google BigQuery standard SQL dialect (SQL-2011). One of `table` or `sqlQuery` must be specified. 238 | * `type` (optional): The BigQuery import/export type to use. Options include "direct", "parquet", "avro", "orc", "json" and "csv". Defaults to "direct". See the table at the top for supported type and import/export combinations. 239 | 240 | 241 | In addition, there are a number of BigQuery configuration options that can be specified in two ways: the traditional way using Spark's read/write options (e.g. `spark.read.option("bq.project", "")`) and using the `bigquery` extension method (`spark.read.bigquery(config)`; see example above) which is usually more straightforward to use. If you prefer the traditional way or if you are using spark-bigquery in a non-Scala environment (e.g. PySpark), the configuration keys are as follows: 242 | 243 | * `bq.project` (required): Google BigQuery billing project id 244 | * `bq.location` (required): Geographic location where newly created datasets should reside. "EU" or "US". 245 | * `bq.service_account_key_file` (optional): Google Cloud service account key file to use for authentication with Google Cloud services. The use of service accounts is highly recommended. Specifically, the service account will be used to interact with BigQuery and Google Cloud Storage (GCS). If not specified, application default credentials will be used. 246 | * `bq.staging_dataset.name` (optional): Prefix of BigQuery staging dataset. A staging dataset is used to temporarily store the results of SQL queries. Defaults to "spark_staging". 247 | * `bq.staging_dataset.lifetime` (optional): Default staging dataset table lifetime in milliseconds. Tables are automatically deleted once the lifetime has been reached. Defaults to 86400000 ms (= 1 day). 248 | * `bq.staging_dataset.gcs_bucket` (required): Google Cloud Storage (GCS) bucket to use for storing temporary files. Temporary files are used when importing through BigQuery load jobs and exporting through BigQuery extraction jobs (i.e. when using data extracts such as Parquet, Avro, ORC, ...). The service account specified in `bq.service_account_key_file` needs to be given appropriate rights. 249 | * `bq.job.priority` (optional): BigQuery job priority when executing SQL queries. Options include "interactive" and "batch". Defaults to "interactive", i.e. the query is executed as soon as possible. 250 | * `bq.job.timeout` (optional): Timeout in milliseconds after which a file import/export job should be considered as failed. Defaults to 3600000 ms (= 1 h). 
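For example, the same read shown in the PySpark snippet above can be expressed in Scala through plain reader options, without the `bigquery` extension method (placeholder values in angle brackets are illustrative):

```scala
// Traditional reader options – equivalent to spark.read.bigquery(config);
// the <...> placeholders are illustrative and must be replaced with real values
val shakespeare = spark.read
  .format("bigquery")
  .option("bq.project", "<billing-project-id>")
  .option("bq.location", "US")
  .option("bq.staging_dataset.gcs_bucket", "<gcs-staging-bucket>")
  .option("bq.staging_dataset.name", "spark_staging")   // optional, default shown
  .option("bq.job.priority", "interactive")             // optional, default shown
  .option("table", "bigquery-public-data.samples.shakespeare")
  .option("type", "direct")
  .load()
```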
251 | 252 | 253 | See the following resources for more information: 254 | * [BigQuery pricing](https://cloud.google.com/bigquery/pricing) 255 | * [BigQuery dataset locations](https://cloud.google.com/bigquery/docs/dataset-locations) 256 | * [General authentication](https://cloud.google.com/docs/authentication/) 257 | * [BigQuery authentication](https://cloud.google.com/bigquery/docs/authentication/) 258 | * [Cloud Storage authentication](https://cloud.google.com/storage/docs/authentication/) 259 | 260 | 261 | ## Schema Conversion 262 | 263 | ### Using Direct Mode 264 | 265 | In **direct** (streaming) mode, spark-bigquery performs the following data type conversions between supported Spark data types and BigQuery data types: 266 | 267 | **Importing to Spark / Exporting from BigQuery** 268 | 269 | | BigQuery Source Data Type | Spark Target Data Type | 270 | | ------------------------- | ---------------------- | 271 | | BOOL | BooleanType | 272 | | INT64 | LongType | 273 | | FLOAT64 | DoubleType | 274 | | NUMERIC | DecimalType(38, 9) | 275 | | STRING | StringType | 276 | | BYTES | BinaryType | 277 | | STRUCT | StructType | 278 | | TIMESTAMP | TimestampType | 279 | | DATE | DateType | 280 | | TIME | StringType | 281 | | DATETIME | StringType | 282 | 283 | BigQuery repeated fields are mapped to the corresponding Spark ArrayType. Also, BigQuery's nullable mode is used to determine the appropriate nullable property of the target Spark data type. 284 | 285 | **Exporting from Spark / Importing to BigQuery** 286 | 287 | | Spark Source Data Type | BigQuery Target Data Type | 288 | | ---------------------- | ------------------------- | 289 | | BooleanType | BOOL | 290 | | ByteType | INT64 | 291 | | ShortType | INT64 | 292 | | IntegerType | INT64 | 293 | | LongType | INT64 | 294 | | FloatType | FLOAT64 | 295 | | DoubleType | FLOAT64 | 296 | | <= DecimalType(38, 9) | NUMERIC | 297 | | > DecimalType(38, 9) | STRING | 298 | | StringType | STRING | 299 | | BinaryType | BYTES | 300 | | StructType | STRUCT | 301 | | TimestampType | TIMESTAMP | 302 | | DateType | DATE | 303 | | MapType | Repeated key-value STRUCT | 304 | | ArrayType | Repeated field | 305 | | Other | STRING | 306 | 307 | For more information on supported BigQuery data types, see https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types. 308 | 309 | ### Using GCS Data Extracts 310 | 311 | When using intermediate GCS data extracts (Parquet, Avro, ORC, ...), the result depends on the data format being used. Consult the data format's specification for information on supported data types. Furthermore, see the following resources on type conversions supported by BigQuery: 312 | 313 | * Parquet: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-parquet 314 | * Avro: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-avro 315 | * ORC: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-orc 316 | * JSON: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-json 317 | * CSV: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-csv 318 | 319 | ## Authentication 320 | 321 | Providing a service account key file is only possible in local cluster mode, since an application deployed on Google Cloud will try to resolve the file path on HDFS. 322 | Keeping key files stored as cloud resources is not good practice anyway.
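A minimal sketch of the local-cluster-mode approach, mirroring the Shakespeare example at the top of this README (the argument position and the angle-bracket placeholders are illustrative):

```scala
import com.miraisolutions.spark.bigquery.config._

// Local cluster mode: take the service account key file path from a trailing job argument
// (argument position and placeholder values are illustrative)
val keyFile: Option[String] = if (args.length > 3) Some(args(3)) else None

val config = BigQueryConfig(
  project = "<billing-project-id>",
  location = "EU",
  stagingDataset = StagingDatasetConfig(gcsBucket = "<gcs-staging-bucket>"),
  serviceAccountKeyFile = keyFile
)
```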
323 | 324 | If you need to run via gcloud, you can authenticate with a service account JSON file using: 325 | 326 | `gcloud auth activate-service-account --key-file=[KEY-FILE]` 327 | 328 | When using local cluster mode, it is possible to provide the key file as an argument to the Spark job, as in the example above. 329 | 330 | Information on how to generate service account credentials can be found at 331 | https://cloud.google.com/storage/docs/authentication#service_accounts. 332 | The service account key file can either be passed directly via `BigQueryConfig` or 333 | it can be passed through an environment variable: 334 | `export GOOGLE_APPLICATION_CREDENTIALS=/path/to/your/service_account_keyfile.json` 335 | (see https://cloud.google.com/docs/authentication/getting-started for more information). 336 | When running on Google Cloud, e.g. Google Cloud Dataproc, 337 | [application default credentials](https://developers.google.com/identity/protocols/application-default-credentials) 338 | may be used, in which case it is not necessary to specify a service account key file. 339 | 340 | ## License 341 | 342 | MIT License 343 | 344 | Copyright (c) 2018 Mirai Solutions GmbH 345 | 346 | Permission is hereby granted, free of charge, to any person obtaining a copy 347 | of this software and associated documentation files (the "Software"), to deal 348 | in the Software without restriction, including without limitation the rights 349 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 350 | copies of the Software, and to permit persons to whom the Software is 351 | furnished to do so, subject to the following conditions: 352 | 353 | The above copyright notice and this permission notice shall be included in all 354 | copies or substantial portions of the Software. 355 | 356 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 357 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 358 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 359 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 360 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 361 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 362 | SOFTWARE.
363 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | import net.ruippeixotog.scalascraper.browser.JsoupBrowser 2 | import net.ruippeixotog.scalascraper.dsl.DSL._ 3 | import net.ruippeixotog.scalascraper.dsl.DSL.Extract._ 4 | import com.typesafe.sbt.license.{DepModuleInfo, LicenseInfo} 5 | import ReleaseTransformations._ 6 | import sbtrelease.{Version, versionFormatError} 7 | 8 | import scala.xml.{Elem, Node => XmlNode, NodeSeq => XmlNodeSeq} 9 | import scala.xml.transform._ 10 | 11 | // Apache Spark version setting 12 | val sparkVersion = settingKey[String]("The version of Spark to use.") 13 | 14 | // Custom task for creating a Spark package release artifact 15 | val sparkPackage = taskKey[File]("Creates a Spark package release artifact.") 16 | 17 | // Setting Maven properties as needed by gcs-connector 18 | val mavenProps = settingKey[Unit]("Setting Maven properties") 19 | 20 | lazy val commonSettings = Seq( 21 | // Name must match github repository name 22 | name := "spark-bigquery", 23 | organization := "com.miraisolutions", 24 | organizationName := "Mirai Solutions GmbH", 25 | description := "A Google BigQuery Data Source for Apache Spark", 26 | startYear := Some(2018), 27 | licenses += ("MIT", new URL("https://opensource.org/licenses/MIT")), 28 | sparkVersion := "2.4.5", 29 | scalaVersion := "2.11.12", 30 | crossScalaVersions := Seq("2.11.12"), 31 | scalacOptions ++= Seq( 32 | "-target:jvm-1.8", 33 | "-deprecation", 34 | "-feature", 35 | "-unchecked" 36 | ) 37 | ) 38 | 39 | // Dependency exclusions 40 | lazy val exclusions = Seq( 41 | // Clash with Spark 42 | ExclusionRule("com.fasterxml.jackson.core", "jackson-core"), 43 | ExclusionRule("commons-logging", "commons-logging"), 44 | ExclusionRule("commons-lang", "commons-lang"), 45 | // Not required 46 | ExclusionRule("com.google.auto.value", "auto-value"), 47 | ExclusionRule("com.google.auto.value", "auto-value-annotations") 48 | ) 49 | 50 | // Spark provided dependencies 51 | lazy val sparkDependencies = Def.setting(Seq( 52 | "org.apache.spark" %% "spark-core" % sparkVersion.value % "provided", 53 | "org.apache.spark" %% "spark-sql" % sparkVersion.value % "provided", 54 | "org.apache.spark" %% "spark-mllib" % sparkVersion.value % "provided" 55 | )) 56 | 57 | // Dependencies which need to be shaded to run on Google Cloud Dataproc 58 | lazy val dependenciesToShade = Seq( 59 | "com.google.cloud" % "google-cloud-bigquery" % "1.108.1" excludeAll(exclusions: _*), 60 | "com.google.cloud.bigdataoss" % "gcs-connector" % "1.9.3-hadoop2" excludeAll(exclusions: _*), 61 | "com.google.http-client" % "google-http-client-apache" % "2.0.0" excludeAll(exclusions: _*) 62 | ) 63 | 64 | // Dependencies which don't need any shading 65 | lazy val nonShadedDependencies = Seq( 66 | "com.databricks" %% "spark-avro" % "4.0.0" 67 | ) 68 | 69 | // Test dependencies 70 | lazy val testDependencies = Def.setting(Seq( 71 | "org.scalatest" %% "scalatest" % "3.0.5" % "it,test", 72 | "com.holdenkarau" %% "spark-testing-base" % s"${sparkVersion.value}_0.14.0" % "it,test", 73 | "org.apache.spark" %% "spark-hive" % sparkVersion.value % "it,test" // required by spark-testing-base 74 | )) 75 | 76 | lazy val browser = JsoupBrowser() 77 | 78 | def existsUrl(url: String): Boolean = { 79 | import java.net.{URL, HttpURLConnection} 80 | (new URL(url)).openConnection().asInstanceOf[HttpURLConnection].getResponseCode == 200 81 | } 82 | 83 | 84 | lazy val 
root = (project in file(".")) 85 | .enablePlugins(AssemblyPlugin, AutomateHeaderPlugin) 86 | .configs(IntegrationTest) 87 | .settings(commonSettings: _*) 88 | .settings( 89 | libraryDependencies := dependenciesToShade ++ sparkDependencies.value ++ 90 | nonShadedDependencies.map(_ % "provided") ++ testDependencies.value, 91 | skip in publish := true, 92 | mavenProps := { 93 | // Required by gcs-connector 94 | sys.props("hadoop.identifier") = "hadoop2" 95 | () 96 | }, 97 | 98 | Defaults.itSettings, 99 | IntegrationTest / fork := true, 100 | IntegrationTest / javaOptions ++= Seq( 101 | "-Xmx2048m", 102 | "-Xms512m", 103 | "-XX:+CMSClassUnloadingEnabled" 104 | ), 105 | IntegrationTest / logBuffered := false, 106 | IntegrationTest / testOptions += Tests.Argument("-oF"), 107 | automateHeaderSettings(IntegrationTest), 108 | 109 | // See https://spark-packages.org/artifact-help 110 | assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false), 111 | // Shade google dependencies due to version mismatches with dependencies deployed on Google Dataproc 112 | assemblyShadeRules in assembly := Seq( 113 | // ShadeRule.rename("com.google.cloud.hadoop.fs.**" -> "com.google.cloud.hadoop.fs.@1").inAll, 114 | ShadeRule.rename("com.google.**" -> "shadehttpclient.@1").inLibrary("com.google.http-client" % "google-http-client-apache" % "2.0.0"), 115 | ShadeRule.rename("com.google.**" -> "shadegoogle.@1").inAll 116 | ), 117 | assemblyMergeStrategy in assembly := { 118 | case PathList("META-INF", "services", "org.apache.hadoop.fs.FileSystem") => 119 | // Take our "shaded" version 120 | MergeStrategy.first 121 | case r => 122 | MergeStrategy.defaultMergeStrategy(r) 123 | }, 124 | 125 | // We release the distribution module only 126 | releaseProcess := Seq.empty, 127 | 128 | licenseConfigurations := Set("compile"), 129 | licenseOverrides := { 130 | case DepModuleInfo("org.slf4j", "slf4j-api", _) => 131 | LicenseInfo.MIT 132 | }, 133 | // Extends license report to include artifact description and link to JAR files 134 | licenseReportNotes := { 135 | case DepModuleInfo(group, id, version) => 136 | try { 137 | // Fetch artifact information 138 | val doc = browser.get(s"https://mvnrepository.com/artifact/$group/$id/$version") 139 | // Extract title 140 | val title = (doc >> text(".im-title")).replaceFirst("\\s»\\s.+$", "") 141 | // Extract description 142 | val description = doc >> text(".im-description") 143 | // Locate link to JAR file 144 | val mainJar = (doc >> elementList("a.vbtn")) 145 | .filter(element => element.innerHtml.startsWith("jar") || element.innerHtml.startsWith("bundle")) 146 | .map(_ >> attr("href")) 147 | .headOption 148 | .getOrElse(throw new NoSuchElementException("Can't locate JAR file")) 149 | 150 | // Derive link to sources JAR file 151 | val sourcesJar = mainJar.replaceFirst("\\.jar$", "-sources.jar") 152 | 153 | // Check if JAR file exists 154 | require(existsUrl(mainJar), "Invalid link to JAR file") 155 | // Check if sources JAR file exists 156 | require(existsUrl(sourcesJar), "Invalid link to sources JAR file") 157 | // https://en.wikipedia.org/wiki/C0_and_C1_control_codes (unit separator) 158 | title + '\u001F' + description + '\u001F' + mainJar + '\u001F' + sourcesJar 159 | } catch { 160 | case t: Throwable => 161 | "**** " + t.getMessage + " ****" 162 | } 163 | } 164 | ) 165 | 166 | // A "virtual" project with configurations to build Spark packages 167 | // See https://spark-packages.org/artifact-help 168 | lazy val distribution = (project in 
file("distribution")) 169 | .settings(commonSettings: _*) 170 | .settings( 171 | // Include the Scala binary version here 172 | version := s"${(root / version).value}-s_${scalaBinaryVersion.value}", 173 | libraryDependencies := nonShadedDependencies, 174 | // Spark packages need the github organization name as the group ID 175 | organization := "miraisolutions", 176 | crossPaths := false, 177 | pomExtra := { 178 | https://github.com/miraisolutions/spark-bigquery 179 | 180 | git@github.com:miraisolutions/spark-bigquery.git 181 | scm:git:git@github.com:miraisolutions/spark-bigquery.git 182 | 183 | 184 | 185 | martinstuder 186 | Martin Studer 187 | https://github.com/martinstuder 188 | 189 | 190 | lambiase 191 | Nicola Lambiase 192 | https://github.com/lambiase 193 | 194 | 195 | }, 196 | // Spark packages need the github repository name as the artifact ID 197 | pomPostProcess := { (node: XmlNode) => 198 | val rule = new RewriteRule { 199 | override def transform(n: XmlNode): XmlNodeSeq = n match { 200 | case n: Elem if n.label == "project" => 201 | val updatedChildren = n.child map { 202 | case c if c.label == "artifactId" => 203 | {normalizedName.value} 204 | 205 | case c => 206 | c 207 | } 208 | n.copy(child = updatedChildren) 209 | 210 | case n => 211 | n 212 | } 213 | } 214 | new RuleTransformer(rule)(node) 215 | }, 216 | Compile / packageBin := (root / Compile / assembly).value, 217 | sparkPackage := { 218 | val jar = (Compile / packageBin).value 219 | val pom = makePom.value 220 | val packageName = s"${normalizedName.value}-${version.value}" 221 | val zipFile = target.value / s"$packageName.zip" 222 | IO.delete(zipFile) 223 | IO.zip(Seq(jar -> s"$packageName.jar", pom -> s"$packageName.pom"), zipFile) 224 | println(s"\nSpark Package created at: $zipFile\n") 225 | zipFile 226 | }, 227 | 228 | releaseVersion := { _ => 229 | Version((root / version).value).map(_.withoutQualifier.string).getOrElse(versionFormatError) 230 | }, 231 | releaseVersionFile := (ThisBuild / baseDirectory).value / "version.sbt", 232 | releaseProcess := Seq[ReleaseStep]( 233 | checkSnapshotDependencies, 234 | inquireVersions, 235 | runClean, 236 | setReleaseVersion, 237 | commitReleaseVersion, 238 | tagRelease, 239 | releaseStepTask(sparkPackage), 240 | setNextVersion, 241 | commitNextVersion, 242 | pushChanges 243 | ) 244 | ) 245 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.1.6 2 | -------------------------------------------------------------------------------- /project/build.sbt: -------------------------------------------------------------------------------- 1 | resolvers += DefaultMavenRepository 2 | 3 | // Used to scrape mvnrepository artifact information 4 | libraryDependencies += "net.ruippeixotog" %% "scala-scraper" % "2.0.0" 5 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.10") 2 | 3 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.9.0") 4 | 5 | addSbtPlugin("com.typesafe.sbt" % "sbt-license-report" % "1.2.0") 6 | 7 | addSbtPlugin("de.heikoseeberger" % "sbt-header" % "5.0.0") 8 | 9 | addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.9") 10 | -------------------------------------------------------------------------------- 
/src/it/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Set everything to be logged to the console 19 | log4j.rootCategory=WARN, console 20 | log4j.appender.console=org.apache.log4j.ConsoleAppender 21 | log4j.appender.console.target=System.err 22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 23 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 24 | 25 | # Set the default spark-shell log level to WARN. When running the spark-shell, the 26 | # log level for this class is used to overwrite the root logger's log level, so that 27 | # the user can have different defaults for the shell and regular Spark apps. 28 | log4j.logger.org.apache.spark.repl.Main=WARN 29 | 30 | # Settings to quiet third party logs that are too verbose 31 | log4j.logger.org.spark-project.jetty=WARN 32 | log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR 33 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 34 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 35 | log4j.logger.org.apache.parquet=ERROR 36 | log4j.logger.parquet=ERROR 37 | log4j.logger.com.miraisolutions.spark.bigquery=INFO 38 | 39 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support 40 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL 41 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR 42 | -------------------------------------------------------------------------------- /src/it/scala/com/miraisolutions/spark/bigquery/DirectWriteAndReadSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery 23 | 24 | import com.miraisolutions.spark.bigquery.test._ 25 | import com.miraisolutions.spark.bigquery.test.data.{DataFrameGenerator, TestData} 26 | import org.apache.spark.sql.types.StructType 27 | import org.apache.spark.sql.{DataFrame, SaveMode} 28 | import org.scalactic.anyvals.PosZInt 29 | import org.scalatest.FunSuite 30 | import org.scalatest.prop.{Checkers, GeneratorDrivenPropertyChecks} 31 | 32 | /** 33 | * Test suite which tests reading and writing single Spark fields/columns to and from BigQuery. 34 | * 35 | * Data frames are written to BigQuery via "direct" export. Because attempts to query the new fields might require 36 | * a waiting time of up to 90 minutes (https://cloud.google.com/bigquery/streaming-data-into-bigquery), this test 37 | * suite only verifies the correctness of the generated BigQuery schema. 38 | * 39 | * BigQuery's streaming system caches table schemas for up to two minutes. That also seems to be the case when a 40 | * table gets deleted. For this reason, this test suite generates unique table names for each test case and then 41 | * manually deletes the table afterwards. 42 | * 43 | * @see [[https://cloud.google.com/bigquery/streaming-data-into-bigquery]] 44 | * @see [[https://stackoverflow.com/q/25279116]] 45 | * @see [[https://cloud.google.com/blog/big-data/2017/06/life-of-a-bigquery-streaming-insert]] 46 | */ 47 | class DirectWriteAndReadSpec extends FunSuite with BigQueryTesting with Checkers 48 | with GeneratorDrivenPropertyChecks { 49 | 50 | private val testTablePrefix = "direct_test" 51 | 52 | private class RandomDataFrame(schema: StructType, size: PosZInt) 53 | extends Checkers with GeneratorDrivenPropertyChecks { 54 | 55 | override implicit val generatorDrivenConfig = 56 | PropertyCheckConfiguration(minSuccessful = 1, minSize = size, sizeRange = size) 57 | 58 | implicit val arbitraryDataFrame = DataFrameGenerator.generate(sqlContext, schema) 59 | // Use unique table name to avoid BigQuery schema caching issues 60 | val tableName = testTablePrefix + "_" + System.currentTimeMillis().toString 61 | 62 | forAll { df: DataFrame => 63 | df.write 64 | .mode(SaveMode.Overwrite) 65 | .bigqueryTest(tableName) 66 | .save() 67 | 68 | val in = spark.read 69 | .bigqueryTest(tableName) 70 | .load() 71 | .persist() 72 | 73 | val tableReference = getTestDatasetTableReference(tableName) 74 | 75 | assert(df.aligned.schema, in.aligned.schema) 76 | val deleted = bigQueryClient.deleteTable(tableReference) 77 | assert(deleted, true) 78 | } 79 | } 80 | 81 | 82 | (TestData.atomicFields ++ TestData.arrayFields ++ TestData.mapFields) foreach { field => 83 | 84 | test(s"Column of type ${field.dataType} (nullable: ${field.nullable}) " + 85 | s"can be written to and read from BigQuery using direct imports (streaming)") { 86 | new RandomDataFrame(StructType(List(field)), 10) 87 | } 88 | 89 | } 90 | 91 | test("Nested struct columns can be written to and read from BigQuery using direct imports (streaming)") { 92 | new RandomDataFrame(StructType(List(TestData.customStructField)), 2) 93 | } 94 | 95 | test("Data frames with mixed data types can be written to and read from BigQuery using direct imports (streaming)") { 96 | new 
RandomDataFrame(StructType(TestData.atomicFields ++ TestData.arrayFields.take(2) ++ 97 | TestData.mapFields.take(2)), 2) 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /src/it/scala/com/miraisolutions/spark/bigquery/ParquetWriteDirectReadSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery 23 | 24 | import com.miraisolutions.spark.bigquery.test._ 25 | import com.miraisolutions.spark.bigquery.test.data.{DataFrameGenerator, TestData} 26 | import org.apache.spark.sql.types._ 27 | import org.apache.spark.sql.{DataFrame, SaveMode} 28 | import org.scalactic.anyvals.PosZInt 29 | import org.scalatest.FunSuite 30 | import org.scalatest.prop.{Checkers, GeneratorDrivenPropertyChecks} 31 | 32 | /** 33 | * Test suite which tests reading and writing Spark fields/columns to and from BigQuery. 34 | * 35 | * Data frames are written to BigQuery via Parquet export to ensure data is immediately available for querying and 36 | * doesn't end up in BigQuery's streaming buffer as it would when using "direct" mode. 
37 | * 38 | * @see [[https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-parquet]] 39 | * @see [[https://cloud.google.com/bigquery/streaming-data-into-bigquery]] 40 | * @see [[https://cloud.google.com/blog/big-data/2017/06/life-of-a-bigquery-streaming-insert]] 41 | */ 42 | class ParquetWriteDirectReadSpec extends FunSuite with BigQueryTesting { 43 | 44 | private val testTable = "test" 45 | 46 | private class RandomDataFrame(schema: StructType, size: PosZInt) 47 | extends Checkers with GeneratorDrivenPropertyChecks { 48 | 49 | override implicit val generatorDrivenConfig = 50 | PropertyCheckConfiguration(minSuccessful = 1, minSize = size, sizeRange = size) 51 | 52 | implicit val arbitraryDataFrame = DataFrameGenerator.generate(sqlContext, schema) 53 | 54 | forAll { out: DataFrame => 55 | 56 | out.write 57 | .mode(SaveMode.Overwrite) 58 | .bigqueryTest(testTable, exportType = "parquet") 59 | .save() 60 | 61 | val in = spark.read 62 | .bigqueryTest(testTable) 63 | .load() 64 | .persist() 65 | 66 | assertDataFrameEquals(out.aligned, in.aligned) 67 | } 68 | 69 | } 70 | 71 | (TestData.atomicFields ++ TestData.arrayFields ++ TestData.mapFields) foreach { field => 72 | 73 | test(s"Column of type ${field.dataType} (nullable: ${field.nullable}) " + 74 | s"can be written to and read from BigQuery") { 75 | new RandomDataFrame(StructType(List(field)), 10) 76 | } 77 | 78 | } 79 | 80 | test("Nested struct columns can be written to and read from BigQuery") { 81 | new RandomDataFrame(StructType(List(TestData.customStructField)), 2) 82 | } 83 | 84 | test("Data frames with mixed data types can be written to and read from BigQuery") { 85 | new RandomDataFrame(StructType(TestData.atomicFields ++ TestData.arrayFields.take(2) ++ 86 | TestData.mapFields.take(2)), 2) 87 | } 88 | 89 | } 90 | -------------------------------------------------------------------------------- /src/it/scala/com/miraisolutions/spark/bigquery/test/BigQueryConfiguration.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.test 23 | 24 | import com.miraisolutions.spark.bigquery.BigQueryTableReference 25 | import com.miraisolutions.spark.bigquery.client.BigQueryClient 26 | import com.miraisolutions.spark.bigquery.config.{BigQueryConfig, _} 27 | import org.apache.spark.sql.{DataFrameReader, DataFrameWriter, Row} 28 | import org.scalatest.{BeforeAndAfterAll, Outcome, TestSuite, TestSuiteMixin} 29 | 30 | private object BigQueryConfiguration { 31 | // BigQuery test dataset name 32 | private val BIGQUERY_TEST_DATASET = "spark_bigquery_test" 33 | } 34 | 35 | /** 36 | * BigQuery configuration test suite mixin. Tests mixing in that trait can be run in sbt via: 37 | * 38 | * it:testOnly com.miraisolutions.spark.bigquery.* -- 39 | * -Dbq.project= 40 | * -Dbq.location= 41 | * -Dbq.staging_dataset.gcs_bucket= 42 | * -Dbq.staging_dataset.service_account_key_file= 43 | */ 44 | private[bigquery] trait BigQueryConfiguration extends TestSuiteMixin with BeforeAndAfterAll { this: TestSuite => 45 | import BigQueryConfiguration._ 46 | 47 | // Captured BigQuery test configuration 48 | private var _config: BigQueryConfig = _ 49 | 50 | /** BigQuery client */ 51 | protected lazy val bigQueryClient: BigQueryClient = new BigQueryClient(config) 52 | 53 | /** BigQuery configuration */ 54 | protected def config: BigQueryConfig = _config 55 | 56 | /** 57 | * Construct a table reference to a table in the configured BigQuery test dataset. 58 | * @param table Table name 59 | * @return BigQuery table reference 60 | */ 61 | protected def getTestDatasetTableReference(table: String): BigQueryTableReference = { 62 | BigQueryTableReference(_config.project, BIGQUERY_TEST_DATASET, table) 63 | } 64 | 65 | /** 66 | * Gets the unquoted table identifier for a table in the configured BigQuery test dataset. 67 | * @param table Table name 68 | * @return Unquoted table identifier 69 | */ 70 | protected def getTestDatasetTableIdentifier(table: String): String = { 71 | getTestDatasetTableReference(table).unquotedIdentifier 72 | } 73 | 74 | protected implicit class DataFrameReaderTestConfig(val reader: DataFrameReader) { 75 | /** 76 | * Applies BigQuery test configuration options derived from parameters passed to the test. 77 | * @param table BigQuery table to read 78 | * @param importType Import type (e.g. "direct", "parquet", "avro", ...) 79 | * @return Spark [[DataFrameReader]] 80 | */ 81 | def bigqueryTest(table: String, importType: String = "direct"): DataFrameReader = { 82 | applyDataFrameOptions(reader, _config) 83 | .option("table", getTestDatasetTableIdentifier(table)) 84 | .option("type", importType) 85 | } 86 | } 87 | 88 | protected implicit class DataFrameWriterTestConfig(val writer: DataFrameWriter[Row]) { 89 | /** 90 | * Applies BigQuery test configuration options derived from parameters passed to the test. 91 | * @param table BigQuery table to write 92 | * @param exportType Export type (e.g. "direct", "parquet", "avro", ...) 
93 | * @return Spark [[DataFrameWriter]] 94 | */ 95 | def bigqueryTest(table: String, exportType: String = "direct"): DataFrameWriter[Row] = { 96 | applyDataFrameOptions(writer, _config) 97 | .option("table", getTestDatasetTableIdentifier(table)) 98 | .option("type", exportType) 99 | } 100 | } 101 | 102 | override protected def afterAll(): Unit = { 103 | // Removes the BigQuery test dataset at the end of a test suite 104 | bigQueryClient.deleteDataset(_config.project, BIGQUERY_TEST_DATASET) 105 | } 106 | 107 | // See {{TestSuiteMixin}} 108 | abstract override def withFixture(test: NoArgTest): Outcome = { 109 | // Extract BigQuery configuration from config map 110 | _config = BigQueryConfig(test.configMap.mapValues(_.toString)) 111 | super.withFixture(test) 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /src/it/scala/com/miraisolutions/spark/bigquery/test/BigQueryTesting.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.test 23 | 24 | import com.holdenkarau.spark.testing.{DataFrameSuiteBase, RDDComparisons} 25 | import org.apache.spark.sql.DataFrame 26 | import org.scalatest.TestSuite 27 | 28 | private[bigquery] trait BigQueryTesting extends BigQueryConfiguration with DataFrameSuiteBase 29 | with RDDComparisons { this: TestSuite => 30 | 31 | // See https://github.com/holdenk/spark-testing-base/issues/148 32 | // See https://issues.apache.org/jira/browse/SPARK-22918 33 | System.setSecurityManager(null) 34 | 35 | override def assertDataFrameEquals(expected: DataFrame, result: DataFrame): Unit = { 36 | assert("Schemas don't match", expected.schema, result.schema) 37 | assert("Number of rows don't match", expected.count(), result.count()) 38 | 39 | val mismatch = compareRDD(expected.rdd, result.rdd) 40 | if(mismatch.isDefined) { 41 | println("#### Expected ####") 42 | expected.show(10, 100, true) 43 | println("#### Result ####") 44 | result.show(10, 100, true) 45 | } 46 | 47 | assertTrue(mismatch.isEmpty) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/it/scala/com/miraisolutions/spark/bigquery/test/data/DataFrameGenerator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.test.data 23 | 24 | import java.math.BigInteger 25 | import java.sql.{Date, Timestamp} 26 | import java.time._ 27 | 28 | import com.holdenkarau.spark.testing.RDDGenerator 29 | import org.apache.spark.sql.{DataFrame, Row, SQLContext} 30 | import org.apache.spark.sql.types._ 31 | import org.scalacheck.{Arbitrary, Gen} 32 | 33 | /** 34 | * Generator of arbitrary Spark data frames used in property-based testing. 
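 * Usage sketch (hypothetical schema value; mirrors how the integration test suites in this repository use it):
 * {{{
 *   implicit val arbitraryDataFrame = DataFrameGenerator.generate(sqlContext, schema)
 *   forAll { df: DataFrame =>
 *     // exercise the generated data frame, e.g. write it to BigQuery and read it back
 *   }
 * }}}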
35 | * 36 | * @see [[https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types]] 37 | */ 38 | private[bigquery] object DataFrameGenerator { 39 | 40 | // Min and max BigQuery timestamp and date values 41 | private val MIN_INSTANT = Instant.parse("0001-01-01T00:00:00.000000Z") 42 | private val MAX_INSTANT = Instant.parse("9999-12-31T23:59:59.999999Z") 43 | 44 | // Number of milliseconds in one day 45 | private val MILLIS_PER_DAY = 86400000L 46 | 47 | /** 48 | * Generates an arbitrary Spark data frame with the specified schema and minimum number of partitions. 49 | * @param sqlContext Spark SQL context 50 | * @param schema Schema of data frame to generate 51 | * @param minPartitions Minimum number of partitions 52 | * @return Arbitrary Spark data frame 53 | */ 54 | def generate(sqlContext: SQLContext, schema: StructType, minPartitions: Int = 1): Arbitrary[DataFrame] = { 55 | val genRow = getRowGenerator(schema) 56 | val genDataFrame = RDDGenerator.genRDD[Row](sqlContext.sparkContext, minPartitions)(genRow) 57 | Arbitrary(genDataFrame.map(sqlContext.createDataFrame(_, schema))) 58 | } 59 | 60 | /** 61 | * Creates a generator of a row in a Spark data frame. 62 | * @param schema Schema of row to generate 63 | * @return Generator for a row 64 | */ 65 | private def getRowGenerator(schema: StructType): Gen[Row] = { 66 | import scala.collection.JavaConverters._ 67 | val fieldGenerators = schema.fields.map(field => getGeneratorForType(field.dataType)) 68 | val rowGen = Gen.sequence(fieldGenerators) 69 | rowGen.map(values => Row.fromSeq(values.asScala)) 70 | } 71 | 72 | /** 73 | * Creates a generator for a target data type. 74 | * @param dataType Data type 75 | * @return Generator of values of the specified data type 76 | */ 77 | private def getGeneratorForType(dataType: DataType): Gen[Any] = { 78 | import Arbitrary._ 79 | 80 | dataType match { 81 | case ByteType => 82 | arbitrary[Byte] 83 | 84 | case ShortType => 85 | arbitrary[Short] 86 | 87 | case IntegerType => 88 | arbitrary[Int] 89 | 90 | case LongType => 91 | arbitrary[Long] 92 | 93 | case FloatType => 94 | arbitrary[Float] 95 | 96 | case DoubleType => 97 | arbitrary[Double] 98 | 99 | case dt: DecimalType => 100 | for { 101 | digits <- Gen.listOfN(dt.precision, Gen.numChar) 102 | sign <- Gen.oneOf("", "-") 103 | unscaledValue = new BigInteger(sign + digits.mkString) 104 | } yield new java.math.BigDecimal(unscaledValue, dt.scale) 105 | 106 | case StringType => 107 | arbitrary[String] 108 | 109 | case BinaryType => 110 | Gen.listOf(arbitrary[Byte]).map(_.toArray) 111 | 112 | case BooleanType => 113 | arbitrary[Boolean] 114 | 115 | case TimestampType => 116 | // BigQuery allowed timestamp range: [0001-01-1 00:00:00.000000, 9999-12-31 23:59:59.999999] 117 | Gen.chooseNum[Long](MIN_INSTANT.toEpochMilli, MAX_INSTANT.toEpochMilli).map(new Timestamp(_)) 118 | 119 | case DateType => 120 | // BigQuery allowed date range: [0001-01-1, 9999-12-31] 121 | Gen.chooseNum[Long](MIN_INSTANT.toEpochMilli, MAX_INSTANT.toEpochMilli) map { millis => 122 | // We need to round the milliseconds to full days as otherwise the time components will be set to the 123 | // time components in the default time zone; see javadoc for java.sql.Date for more details 124 | new Date(millis / MILLIS_PER_DAY * MILLIS_PER_DAY) 125 | } 126 | 127 | case arr: ArrayType => 128 | val elementGenerator = getGeneratorForType(arr.elementType) 129 | Gen.listOf(elementGenerator) 130 | 131 | case map: MapType => 132 | val keyGenerator = getGeneratorForType(map.keyType) 133 | val 
valueGenerator = getGeneratorForType(map.valueType) 134 | val keyValueGenerator: Gen[(Any, Any)] = for { 135 | key <- keyGenerator 136 | value <- valueGenerator 137 | } yield (key, value) 138 | 139 | Gen.mapOf(keyValueGenerator) 140 | 141 | case row: StructType => 142 | getRowGenerator(row) 143 | 144 | case _ => 145 | throw new UnsupportedOperationException(s"Data type '$dataType' is not supported") 146 | } 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /src/it/scala/com/miraisolutions/spark/bigquery/test/data/TestData.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.test.data 23 | 24 | import org.apache.spark.sql.types._ 25 | 26 | /** 27 | * Definition of Spark data frame test data types and fields. 
28 | */ 29 | private[bigquery] object TestData { 30 | 31 | private val atomicTypes = List(BooleanType, ByteType, ShortType, IntegerType, LongType, FloatType, 32 | DoubleType, StringType, BinaryType, TimestampType, DateType, DataTypes.createDecimalType(38, 9), 33 | DataTypes.createDecimalType(12, 4), DataTypes.createDecimalType(33, 4), 34 | DataTypes.createDecimalType(7,7)) 35 | 36 | private def createName(dt: DataType, nullable: Boolean): String = { 37 | dt.typeName 38 | val base = dt.typeName.replaceAll("[^A-Za-z0-9]+", "") 39 | if(nullable) { 40 | base + "0" 41 | } else { 42 | base 43 | } 44 | } 45 | 46 | private def createFields[T <: DataType](dataTypes: List[T], createName: (T, Boolean) => String): List[StructField] = { 47 | for { 48 | dt <- dataTypes 49 | nullable <- List(true, false) 50 | } yield StructField(createName(dt, nullable), dt, nullable) 51 | } 52 | 53 | val atomicFields: List[StructField] = createFields(atomicTypes, createName) 54 | 55 | private def createArrayName(dt: ArrayType, nullable: Boolean): String = { 56 | val elementName = createName(dt.elementType, dt.containsNull) 57 | val array = if(nullable) "array0" else "array" 58 | s"${array}_${elementName}_" 59 | } 60 | 61 | private val arrayTypes: List[ArrayType] = { 62 | for { 63 | bt <- atomicTypes 64 | containsNull <- List(true, false) 65 | } yield ArrayType(bt, containsNull) 66 | } 67 | 68 | val arrayFields: List[StructField] = createFields(arrayTypes, createArrayName) 69 | 70 | private def createMapName(dt: MapType, nullable: Boolean): String = { 71 | val keyName = createName(dt.keyType, false) 72 | val valueName = createName(dt.valueType, dt.valueContainsNull) 73 | val map = if(nullable) "map0" else "map" 74 | s"${map}_${keyName}_${valueName}" 75 | } 76 | 77 | private val mapTypes: List[MapType] = { 78 | for { 79 | keyType <- atomicTypes 80 | valueType <- atomicTypes 81 | valueContainsNull <- List(true, false) 82 | } yield MapType(keyType, valueType, valueContainsNull) 83 | } 84 | 85 | val mapFields: List[StructField] = createFields(mapTypes, createMapName) 86 | 87 | val customStructField: StructField = StructField( 88 | "customStruct0", 89 | StructType( 90 | Array( 91 | StructField("a", BooleanType, false), 92 | StructField("b", ShortType, true), 93 | StructField("c", FloatType, false), 94 | StructField("d", DataTypes.createDecimalType(18, 6)), 95 | StructField("e", TimestampType, true), 96 | StructField("f", StringType, false), 97 | StructField("g", BinaryType, true), 98 | StructField("h", ArrayType(DoubleType, true)), 99 | StructField("i", MapType(StringType, LongType, false)), 100 | StructField("j", StructType(List( 101 | StructField("k", DateType, false), 102 | StructField("l", ByteType, true), 103 | StructField("m", ArrayType(IntegerType, false), true), 104 | StructField("n", MapType(BinaryType, ByteType, true), true) 105 | )), true) 106 | ) 107 | ), 108 | true 109 | ) 110 | } 111 | -------------------------------------------------------------------------------- /src/it/scala/com/miraisolutions/spark/bigquery/test/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the 
Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery 23 | 24 | import com.miraisolutions.spark.bigquery.utils.format._ 25 | import java.sql.Timestamp 26 | 27 | import org.apache.spark.sql.{Column, DataFrame} 28 | import org.apache.spark.sql.functions.{base64, explode_outer, udf} 29 | import org.apache.spark.sql.types._ 30 | 31 | package object test { 32 | import BigQuerySchemaConverter.BIGQUERY_NUMERIC_DECIMAL 33 | 34 | // Column flattener: partial function from field to flattened fields and an appropriate converter function 35 | private type ColumnFlattener = PartialFunction[StructField, (Seq[StructField], DataFrame => DataFrame)] 36 | 37 | // Rounds a timestamp to milliseconds 38 | private def roundTimestampToMillis(ts: Timestamp): Timestamp = { 39 | val roundedTs = new Timestamp(ts.getTime) 40 | roundedTs.setNanos(Math.round(ts.getNanos / 1e6).toInt) 41 | roundedTs 42 | } 43 | 44 | // Spark SQL UDF to round timestamps to milliseconds 45 | private val roundTimestampToMillisUdf = udf(roundTimestampToMillis _) 46 | 47 | 48 | // Explodes a nested column such as an array or map column 49 | private def explode(df: DataFrame, columnName: String, f: Column => Column = identity): DataFrame = { 50 | df.select(df.col("*"), f(explode_outer(df.col(columnName)))).drop(columnName) 51 | } 52 | 53 | // Column converter used to convert atomic data types into data types supported in BigQuery 54 | private val atomicTypeConverter: ColumnConverter = { 55 | case f @ StructField(_, ByteType | ShortType | IntegerType, _, _) => 56 | (f.copy(dataType = LongType), _.cast(LongType)) 57 | 58 | case f @ StructField(_, FloatType, _, _) => 59 | (f.copy(dataType = DoubleType), _.cast(DoubleType)) 60 | 61 | case f @ StructField(_, dt: DecimalType, _, _) if dt.precision < 38 => 62 | (f.copy(dataType = BIGQUERY_NUMERIC_DECIMAL), _.cast(BIGQUERY_NUMERIC_DECIMAL)) 63 | 64 | case f @ StructField(_, TimestampType, _, _) => 65 | (f, roundTimestampToMillisUdf(_)) 66 | 67 | case f @ StructField(_, BinaryType, _, _) => 68 | (f.copy(dataType = StringType), base64) 69 | } 70 | 71 | // Column flattener for nested data types 72 | private val nestedTypeFlattener: ColumnFlattener = { 73 | case StructField(name, ArrayType(elementType, containsNull), _, _) => 74 | val newName = name + "_element" 75 | (Seq(StructField(newName, elementType, containsNull)), explode(_, name, _.as(newName))) 76 | 77 | case StructField(name, MapType(keyType, valueType, containsNull), _, _) => 78 | val keyName = name + "_key" 79 | val valueName = name + "_value" 80 | (Seq(StructField(keyName, keyType, false), StructField(valueName, valueType, containsNull)), 81 | explode(_, name, _.as(Seq(keyName, valueName)))) 82 | 83 | case StructField(name, StructType(fields), _, _) => 84 | def unnest(df: 
DataFrame): DataFrame = { 85 | val subFields = df.col("*") :: (fields.toList map { sub => 86 | df.col(name).getItem(sub.name).as(name + "_" + sub.name) 87 | }) 88 | df.select(subFields: _*).drop(name) 89 | } 90 | (fields.toSeq, unnest) 91 | } 92 | 93 | /** 94 | * Implicit helper class to align column types in a data frame to types supported in BigQuery and types 95 | * suitable for comparison. 96 | * @param dataFrame Source data frame 97 | */ 98 | private[bigquery] implicit class AlignedDataFrame(val dataFrame: DataFrame) extends AnyVal { 99 | 100 | /** 101 | * Converts/casts columns to the appropriate types supported in BigQuery and types which are suitable for 102 | * comparison. Specifically: 103 | * 104 | * - BigQuery only supports 8 byte integer and floating point types. 105 | * 106 | * - BigQuery only supports decimal types with precision 38 and scale 9 107 | * 108 | * - Binary types are converted to base64 encoded strings for comparison 109 | * 110 | * - Array and map types are exploded for easier comparison 111 | * 112 | * - Struct types are unfolded 113 | * 114 | * - Parquet-style formatted columns (LIST, MAP) are converted to their native Spark equivalent 115 | * 116 | * @see [[https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types]] 117 | * @see [[FormatConverter]], [[Parquet]] 118 | */ 119 | def aligned: DataFrame = { 120 | import Parquet._ 121 | import Generic._ 122 | 123 | var df: DataFrame = dataFrame 124 | 125 | do { 126 | df = FormatConverter.transform(df, List(parquetListToArray, parquetMapToMap, keyValueRecordToMap)) 127 | df = df.schema.fields.foldLeft(df) { case (agg, field) => 128 | if(nestedTypeFlattener.isDefinedAt(field)) { 129 | val (_, converter) = nestedTypeFlattener(field) 130 | converter(agg) 131 | } else { 132 | agg 133 | } 134 | } 135 | } while(df.schema.fields.exists(nestedTypeFlattener.isDefinedAt)) 136 | 137 | FormatConverter.transform(df, List(atomicTypeConverter)) 138 | } 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /src/main/resources/META-INF/services/org.apache.hadoop.fs.FileSystem: -------------------------------------------------------------------------------- 1 | shadegoogle.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem -------------------------------------------------------------------------------- /src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister: -------------------------------------------------------------------------------- 1 | com.miraisolutions.spark.bigquery.DefaultSource -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/BigQueryPartition.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 
13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery 23 | 24 | import org.apache.spark.Partition 25 | 26 | /** 27 | * BigQuery table partition identifier 28 | * @param index Partition index 29 | */ 30 | private final case class BigQueryPartition(override val index: Int) extends Partition 31 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/BigQueryRowRDD.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery 23 | 24 | import com.miraisolutions.spark.bigquery.client.BigQueryTableReader 25 | import org.apache.spark.{Partition, SparkContext, TaskContext} 26 | import org.apache.spark.rdd.RDD 27 | import org.apache.spark.sql.Row 28 | 29 | /** 30 | * BigQuery row RDD which reads a BigQuery table by streaming records through a set of pages. 
31 | * @param sc Spark context 32 | * @param table Table reader used to read a BigQuery table through a number of partitions 33 | */ 34 | class BigQueryRowRDD(sc: SparkContext, val table: BigQueryTableReader) extends RDD[Row](sc, Seq.empty) { 35 | 36 | override def compute(split: Partition, context: TaskContext): Iterator[Row] = { 37 | table.getRows(split.index).iterator 38 | } 39 | 40 | override protected def getPartitions: Array[Partition] = { 41 | (0 until table.numPartitions).map(BigQueryPartition(_)).toArray 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/BigQuerySchemaConverter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery 23 | 24 | import com.google.cloud.bigquery.{Option => _, _} 25 | import org.apache.spark.sql.Row 26 | import org.apache.spark.sql.types._ 27 | 28 | import LegacySQLTypeName._ 29 | import com.google.common.io.BaseEncoding 30 | import com.miraisolutions.spark.bigquery.utils.DateTime 31 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 32 | 33 | import scala.collection.JavaConverters._ 34 | import scala.language.postfixOps 35 | 36 | /** 37 | * Schema conversion functions to convert schemas between Apache Spark and Google BigQuery. 38 | * @see [[https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types]] 39 | */ 40 | private[bigquery] object BigQuerySchemaConverter { 41 | 42 | // BigQuery's NUMERIC is a Decimal with precision 38 and scale 9 43 | private[bigquery] val BIGQUERY_NUMERIC_DECIMAL = DataTypes.createDecimalType(38, 9) 44 | 45 | private[bigquery] val KEY_FIELD_NAME = "key" 46 | private[bigquery] val VALUE_FIELD_NAME = "value" 47 | 48 | /** 49 | * Converts a BigQuery schema to a Spark schema. 50 | * @param schema BigQuery schema 51 | * @return Spark schema 52 | */ 53 | def fromBigQueryToSpark(schema: Schema): StructType = { 54 | val fields = schema.getFields.asScala.map(bigQueryToSparkField) 55 | StructType(fields) 56 | } 57 | 58 | /** 59 | * Creates a function that can be used to convert a BigQuery row (represented as [[FieldValueList]]) to a Spark 60 | * row. 
61 | * @param schema BigQuery schema 62 | * @return Function to convert a BigQuery row to a Spark row 63 | */ 64 | def getBigQueryToSparkConverterFunction(schema: Schema): FieldValueList => Row = { fields => 65 | val meta = fromBigQueryToSpark(schema).fields.zip(fields.asScala) 66 | val values = meta map { case (field, value) => getRowValue(value, field.dataType) } 67 | Row.fromSeq(values) 68 | } 69 | 70 | /** 71 | * Converts a BigQuery [[Field]] to a Spark [[StructField]]. 72 | * @param field BigQuery [[Field]] 73 | * @return Spark [[StructField]] 74 | */ 75 | private def bigQueryToSparkField(field: Field): StructField = { 76 | val dataType = field.getType match { 77 | case BOOLEAN => 78 | BooleanType 79 | 80 | case INTEGER => 81 | LongType 82 | 83 | case FLOAT => 84 | DoubleType 85 | 86 | case NUMERIC => 87 | BIGQUERY_NUMERIC_DECIMAL 88 | 89 | case STRING => 90 | StringType 91 | 92 | case BYTES => 93 | BinaryType 94 | 95 | case RECORD => 96 | val fields = field.getSubFields.asScala.map(bigQueryToSparkField) 97 | StructType(fields.toArray) 98 | 99 | case TIMESTAMP => 100 | TimestampType 101 | 102 | case DATE => 103 | DateType 104 | 105 | case TIME => 106 | // Not supported in Spark 107 | StringType 108 | 109 | case DATETIME => 110 | // Not supported in Spark 111 | StringType 112 | } 113 | 114 | // Mode may be null in which case the default nullable is assumed 115 | val mode = Option(field.getMode).getOrElse(Field.Mode.NULLABLE) 116 | val isNullable = mode.equals(Field.Mode.NULLABLE) 117 | val isRepeated = mode.equals(Field.Mode.REPEATED) 118 | 119 | if(isRepeated) { 120 | StructField(field.getName, ArrayType(dataType, isNullable), false) 121 | } else { 122 | StructField(field.getName, dataType, isNullable) 123 | } 124 | } 125 | 126 | /** 127 | * Extracts a value from a BigQuery field that can be used to construct a Spark row. 128 | * @param value BigQuery [[FieldValue]] 129 | * @param dataType Target Spark data type 130 | * @return Spark row value 131 | */ 132 | private def getRowValue(value: FieldValue, dataType: DataType): Any = { 133 | if(value.isNull) { 134 | null 135 | } else { 136 | dataType match { 137 | case BooleanType => 138 | value.getBooleanValue 139 | case LongType => 140 | value.getLongValue 141 | case DoubleType => 142 | value.getDoubleValue 143 | case _: DecimalType => 144 | value.getNumericValue 145 | case StringType => 146 | value.getStringValue 147 | case BinaryType => 148 | value.getBytesValue 149 | case ArrayType(elementType, _) => 150 | value.getRepeatedValue.asScala.map(getRowValue(_, elementType)).toArray 151 | case StructType(fields) => 152 | Row(value.getRecordValue.asScala.zip(fields.map(_.dataType)).map((getRowValue _).tupled): _*) 153 | case TimestampType => 154 | DateTime.epochMicrosToTimestamp(value.getTimestampValue) 155 | case DateType => 156 | DateTime.parseDate(value.getStringValue) 157 | } 158 | } 159 | } 160 | 161 | /** 162 | * Converts a Spark schema to a BigQuery schema. 163 | * @param schema Spark schema 164 | * @return BigQuery schema 165 | */ 166 | def fromSparkToBigQuery(schema: StructType): Schema = { 167 | Schema.of(schema.fields.map(sparkToBigQueryField): _*) 168 | } 169 | 170 | /** 171 | * Creates a custom (key, value) [[StructField]] pair from a Spark [[MapType]] that can be used 172 | * to construct a BigQuery record type. 
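 *
 * Illustrative example (annotation added to this listing, not part of the original source): together with
 * [[sparkToBigQueryField]], a Spark field of type MapType(StringType, LongType, valueContainsNull = true)
 * maps to a BigQuery field in REPEATED mode of type RECORD with sub-fields "key" (STRING, REQUIRED) and
 * "value" (INTEGER, NULLABLE).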
173 | * @param mapType Spark map type 174 | * @return Key/value [[StructField]] pair according to the map's key/value data types 175 | */ 176 | private def customKeyValueStructFields(mapType: MapType): (StructField, StructField) = { 177 | val keyField = StructField(KEY_FIELD_NAME, mapType.keyType, false) 178 | val valueField = StructField(VALUE_FIELD_NAME, mapType.valueType, mapType.valueContainsNull) 179 | (keyField, valueField) 180 | } 181 | 182 | /** 183 | * Creates a custom [[StructField]] from a Spark [[ArrayType]] that can be used to construct a BigQuery 184 | * field with "repeated" mode. 185 | * @param arrayType Spark array type 186 | * @return [[StructField]] according to the array's element data type 187 | */ 188 | private def customArrayStructField(arrayType: ArrayType): StructField = { 189 | StructField(VALUE_FIELD_NAME, arrayType.elementType, arrayType.containsNull) 190 | } 191 | 192 | /** 193 | * Converts a Spark [[StructField]] to a BigQuery [[Field]]. 194 | * @param field Spark [[StructField]] 195 | * @return BigQuery [[Field]] 196 | */ 197 | private def sparkToBigQueryField(field: StructField): Field = { 198 | def f(tpe: LegacySQLTypeName): Field = { 199 | val mode = if(field.nullable) Field.Mode.NULLABLE else Field.Mode.REQUIRED 200 | Field.newBuilder(field.name, tpe).setMode(mode).build() 201 | } 202 | 203 | field.dataType match { 204 | case BooleanType => 205 | f(BOOLEAN) 206 | 207 | case ByteType => 208 | f(INTEGER) 209 | 210 | case ShortType => 211 | f(INTEGER) 212 | 213 | case IntegerType => 214 | f(INTEGER) 215 | 216 | case LongType => 217 | f(INTEGER) 218 | 219 | case FloatType => 220 | f(FLOAT) 221 | 222 | case DoubleType => 223 | f(FLOAT) 224 | 225 | case dt: DecimalType if dt.precision <= BIGQUERY_NUMERIC_DECIMAL.precision && 226 | dt.scale <= BIGQUERY_NUMERIC_DECIMAL.scale => 227 | f(NUMERIC) 228 | 229 | case _: DecimalType => 230 | f(STRING) 231 | 232 | case StringType => 233 | f(STRING) 234 | 235 | case BinaryType => 236 | f(BYTES) 237 | 238 | case StructType(fields) => 239 | Field.of(field.name, RECORD, fields.map(sparkToBigQueryField): _*) 240 | 241 | case TimestampType => 242 | f(TIMESTAMP) 243 | 244 | case DateType => 245 | f(DATE) 246 | 247 | case t: MapType => 248 | val (keyField, valueField) = customKeyValueStructFields(t) 249 | val key = sparkToBigQueryField(keyField) 250 | val value = sparkToBigQueryField(valueField) 251 | Field.newBuilder(field.name, RECORD, key, value).setMode(Field.Mode.REPEATED).build() 252 | 253 | case t: ArrayType => 254 | val elementField = sparkToBigQueryField(customArrayStructField(t)) 255 | Field.newBuilder(field.name, elementField.getType).setMode(Field.Mode.REPEATED).build() 256 | 257 | case _ => // HiveStringType, NullType, ObjectType, CalendarIntervalType 258 | f(STRING) 259 | } 260 | } 261 | 262 | /** 263 | * Creates a function that can be used to convert a Spark row to a BigQuery row (represented as a map). 264 | * @param schema Spark schema 265 | * @return Function to convert a Spark row to a BigQuery row 266 | */ 267 | def getSparkToBigQueryConverterFunction(schema: StructType): Row => java.util.Map[String, Any] = { row => 268 | val meta = fromSparkToBigQuery(schema) 269 | val result = new java.util.HashMap[String, Any](meta.getFields.size) 270 | 271 | meta.getFields.asScala foreach { field => 272 | result.put(field.getName, getFieldValue(row, field)) 273 | } 274 | result 275 | } 276 | 277 | /** 278 | * Extracts a value from a Spark row that can be used to construct a BigQuery row. 
279 | * @param row Spark row 280 | * @param field BigQuery field 281 | * @return BigQuery field value 282 | */ 283 | private def getFieldValue(row: Row, field: Field): Any = { 284 | val idx = row.fieldIndex(field.getName) 285 | 286 | if(row.isNullAt(idx)) { 287 | null 288 | } else { 289 | val sourceType = row.schema(idx).dataType 290 | val targetType = field.getType 291 | 292 | (sourceType, targetType) match { 293 | case (BooleanType, BOOLEAN) => 294 | row.getBoolean(idx) 295 | 296 | case (ByteType, INTEGER) => 297 | row.getByte(idx) 298 | 299 | case (ShortType, INTEGER) => 300 | row.getShort(idx) 301 | 302 | case (IntegerType, INTEGER) => 303 | row.getInt(idx) 304 | 305 | case (LongType, INTEGER) => 306 | row.getLong(idx) 307 | 308 | case (FloatType, FLOAT) => 309 | row.getFloat(idx).toDouble 310 | 311 | case (DoubleType, FLOAT) => 312 | row.getDouble(idx) 313 | 314 | case (_: DecimalType, _) => 315 | row.getDecimal(idx).toPlainString 316 | 317 | case (StringType, STRING) => 318 | row.getString(idx) 319 | 320 | case (BinaryType, BYTES) => 321 | val bytes = row.getAs[Array[Byte]](idx) 322 | BaseEncoding.base64().encode(bytes) 323 | 324 | case (StructType(_), RECORD) => 325 | val struct = row.getStruct(idx) 326 | val result = new java.util.HashMap[String, Any](struct.size) 327 | 328 | field.getSubFields.asScala foreach { subField => 329 | result.put(subField.getName, getFieldValue(struct, subField)) 330 | } 331 | result 332 | 333 | case (TimestampType, TIMESTAMP) => 334 | // BigQuery requires specifying the number of seconds since the epoch 335 | DateTime.timestampToEpochSeconds(row.getTimestamp(idx)) 336 | 337 | case (DateType, DATE) => 338 | DateTime.formatSparkDate(row.getDate(idx)) 339 | 340 | case (t: MapType, RECORD) => 341 | val m = row.getMap[Any, Any](idx) 342 | 343 | val (keyField, valueField) = customKeyValueStructFields(t) 344 | val mapSchema = StructType(Array(keyField, valueField)) 345 | 346 | m.toList map { case (k, v) => 347 | val record = new java.util.HashMap[Any, Any](2) 348 | val kvRow = new GenericRowWithSchema(Array(k, v), mapSchema) 349 | val keyValue = getFieldValue(kvRow, field.getSubFields.get(0)) 350 | val valueValue = getFieldValue(kvRow, field.getSubFields.get(1)) 351 | record.put(keyField.name, keyValue) 352 | record.put(valueField.name, valueValue) 353 | record 354 | } asJava 355 | 356 | case (st: ArrayType, _) => 357 | val arrayField = customArrayStructField(st) 358 | val arraySchema = StructType(Array(arrayField)) 359 | val arrayBigQueryField = sparkToBigQueryField(arrayField) 360 | 361 | row.getSeq[Any](idx) map { value => 362 | getFieldValue(new GenericRowWithSchema(Array(value), arraySchema), arrayBigQueryField) 363 | } asJava 364 | } 365 | } 366 | } 367 | } 368 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/BigQueryTableReference.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright 
notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery 23 | 24 | import com.google.cloud.bigquery.{Table, TableId} 25 | import com.miraisolutions.spark.bigquery.exception.ParseException 26 | import scala.language.implicitConversions 27 | 28 | /** 29 | * BigQuery table reference 30 | * @param project Project ID 31 | * @param dataset Dataset ID 32 | * @param table Table ID 33 | */ 34 | private final case class BigQueryTableReference(project: String, dataset: String, table: String) { 35 | 36 | /** Returns the unquoted table identifier (BigQuery Standard SQL) */ 37 | def unquotedIdentifier: String = s"$project.$dataset.$table" 38 | /** Returns the quoted table identifier (BigQuery Standard SQL) */ 39 | def quotedIdentifier: String = "`" + unquotedIdentifier + "`" 40 | 41 | /** BigQuery Standard SQL table identifier (quoted) */ 42 | override def toString: String = quotedIdentifier 43 | } 44 | 45 | private object BigQueryTableReference { 46 | 47 | /** 48 | * Creates a [[BigQueryTableReference]] from a [[TableId]] 49 | * @param tableId Table ID 50 | * @return BigQuery table reference 51 | */ 52 | def apply(tableId: TableId): BigQueryTableReference = 53 | BigQueryTableReference(tableId.getProject, tableId.getDataset, tableId.getTable) 54 | 55 | /** 56 | * Creates a [[BigQueryTableReference]] from a table reference [[String]] 57 | * @param tableRef Table reference string 58 | * @return BigQuery table reference 59 | */ 60 | def apply(tableRef: String): BigQueryTableReference = { 61 | val tableId = raw"((.+:)?[\w_\-]+)\.([\w_\-]+)\.([\w_\-]+)".r 62 | tableRef.replace("`", "") match { 63 | case tableId(project, _, dataset, table) => BigQueryTableReference(project, dataset, table) 64 | case _ => throw new ParseException("Failed to parse BigQuery table reference which needs to be of the form " + 65 | "[projectId].[datasetId].[tableId]") 66 | } 67 | } 68 | 69 | // Converts an internal BigQuery table reference to a Google BigQuery API `TableId` 70 | implicit def bigQueryTableReferenceToTableId(table: BigQueryTableReference): TableId = { 71 | TableId.of(table.project, table.dataset, table.table) 72 | } 73 | 74 | // Converts a Google BigQuery API `TableId` to an internal BigQuery table reference 75 | implicit def tableIdToBigQueryTableReference(tableId: TableId): BigQueryTableReference = apply(tableId) 76 | 77 | // Converts a Google BigQuery API `Table` to an internal BigQuery table reference 78 | implicit def tableToBigQueryTableReference(table: Table): BigQueryTableReference = apply(table.getTableId) 79 | } 80 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/BigQueryTableRelation.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 
of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery 23 | 24 | import com.miraisolutions.spark.bigquery.client.BigQueryClient 25 | import com.miraisolutions.spark.bigquery.sql.BigQuerySqlGeneration 26 | import org.apache.spark.rdd.RDD 27 | import org.apache.spark.sql.sources._ 28 | import org.apache.spark.sql.types.StructType 29 | import org.apache.spark.sql.{DataFrame, Row, SQLContext, SaveMode} 30 | import org.slf4j.LoggerFactory 31 | 32 | /** 33 | * Relation for a Google BigQuery table 34 | * 35 | * @param sqlContext Spark SQL context 36 | * @param client BigQuery client 37 | * @param table BigQuery table reference 38 | */ 39 | private final case class BigQueryTableRelation(sqlContext: SQLContext, client: BigQueryClient, 40 | table: BigQueryTableReference) 41 | extends BaseRelation with TableScan with PrunedScan with PrunedFilteredScan with InsertableRelation { 42 | 43 | private val logger = LoggerFactory.getLogger(classOf[BigQueryTableRelation]) 44 | private val sql = BigQuerySqlGeneration(table) 45 | 46 | // See {{BaseRelation}} 47 | override def schema: StructType = client.getSchema(table) 48 | 49 | // See {{TableScan}} 50 | override def buildScan(): RDD[Row] = { 51 | logger.info(s"Executing full scan of table $table") 52 | val tbl = client.getTable(table, sqlContext.sparkContext.defaultParallelism) 53 | new BigQueryRowRDD(sqlContext.sparkContext, tbl) 54 | } 55 | 56 | // See {{PrunedScan}} 57 | override def buildScan(requiredColumns: Array[String]): RDD[Row] = { 58 | buildScan(requiredColumns, Array.empty) 59 | } 60 | 61 | // See {{PrunedFilteredScan}} 62 | override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = { 63 | logger.info(s"Executing pruned filtered scan of table $table ") 64 | val sqlQuery = sql.getQuery(requiredColumns, filters) 65 | val tbl = client.executeQuery(sqlQuery, sqlContext.sparkContext.defaultParallelism) 66 | new BigQueryRowRDD(sqlContext.sparkContext, tbl) 67 | } 68 | 69 | // See {{InsertableRelation}} 70 | override def insert(data: DataFrame, overwrite: Boolean): Unit = { 71 | logger.info(s"Writing to table $table (overwrite = $overwrite)") 72 | val mode = if(overwrite) SaveMode.Overwrite else SaveMode.Append 73 | client.writeTable(data, table, mode) 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/DefaultSource.scala: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery 23 | 24 | import com.google.cloud.hadoop.fs.gcs.{GoogleHadoopFS, GoogleHadoopFileSystem} 25 | import com.miraisolutions.spark.bigquery.FileFormat.CSV 26 | import com.miraisolutions.spark.bigquery.client.BigQueryClient 27 | import com.miraisolutions.spark.bigquery.config.BigQueryConfig 28 | import com.miraisolutions.spark.bigquery.exception.MissingParameterException 29 | import com.miraisolutions.spark.bigquery.utils.Files 30 | import org.apache.hadoop.conf.Configuration 31 | import org.apache.spark.sql.execution.datasources.DataSource 32 | import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} 33 | import org.apache.spark.sql.sources._ 34 | import org.apache.spark.sql.execution.FileRelation 35 | 36 | /** 37 | * Google BigQuery default data source. 
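 *
 * Illustrative usage sketch (annotation added to this listing, not part of the original source). The option
 * names follow the parameter handling in this file ("table", "sqlQuery", "type"); project, staging and
 * credential settings are handled by [[com.miraisolutions.spark.bigquery.config.BigQueryConfig]] and are
 * omitted here, and all example values are hypothetical:
 * {{{
 *   // Read: directly (streaming) or via a file-based export, depending on 'type'
 *   val df = spark.read
 *     .format("bigquery")
 *     .option("table", "someProject.someDataset.someTable")  // alternatively .option("sqlQuery", "SELECT ...")
 *     .option("type", "direct")                               // or e.g. "avro", "json", "csv" for reads
 *     .load()
 *
 *   // Write: directly (streaming) or via a file-based import, depending on 'type'
 *   df.write
 *     .format("bigquery")
 *     .option("table", "someProject.someDataset.someTargetTable")
 *     .option("type", "direct")                               // or e.g. "parquet", "avro", "orc" for writes
 *     .mode("append")
 *     .save()
 * }}}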
38 | */ 39 | class DefaultSource extends RelationProvider with CreatableRelationProvider with DataSourceRegister { 40 | import DefaultSource._ 41 | 42 | // See {{DataSourceRegister}} 43 | override def shortName(): String = BIGQUERY_DATA_SOURCE_NAME 44 | 45 | // See {{RelationProvider}} 46 | override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { 47 | withBigQueryClient(sqlContext, parameters, false) { (client, table) => 48 | parameters.foldType[BaseRelation](BigQueryTableRelation(sqlContext, client, table)) { format => 49 | val stagingDirectory = client.exportTable(table, format) 50 | // Register staging directory for deletion when FileSystem gets closed 51 | Files.deleteOnExit(stagingDirectory, sqlContext.sparkContext.hadoopConfiguration) 52 | 53 | getStagingDataFileRelation(sqlContext, stagingDirectory, format) 54 | } 55 | } 56 | } 57 | 58 | // See {{CreatableRelationProvider}} 59 | override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], 60 | data: DataFrame): BaseRelation = { 61 | 62 | withBigQueryClient(sqlContext, parameters, true) { (client, table) => 63 | parameters.foldType[Unit](client.writeTable(data, table, mode)) { format => 64 | val stagingDirectory = client.getStagingDirectory() 65 | 66 | // Use TIMESTAMP_MICROS in Parquet (supported since Spark 2.3.0) 67 | sqlContext.setConf("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS") 68 | 69 | data.write 70 | .format(format.sparkFormatIdentifier) 71 | .options(getFormatOptions(format)) 72 | .save(stagingDirectory) 73 | 74 | client.importTable(stagingDirectory, format, table, mode) 75 | 76 | // Remove staging directory after import has been completed 77 | Files.delete(stagingDirectory, sqlContext.sparkContext.hadoopConfiguration) 78 | } 79 | 80 | BigQueryTableRelation(sqlContext, client, table) 81 | } 82 | } 83 | } 84 | 85 | private[bigquery] object DefaultSource { 86 | 87 | // BigQuery data source name 88 | val BIGQUERY_DATA_SOURCE_NAME = "bigquery" 89 | 90 | // Direct import/export type 91 | private val TYPE_DIRECT = "direct" 92 | 93 | /** Creates a BigQuery client with the provided configuration parameters */ 94 | private def getBigQueryClient(parameters: Map[String, String]): BigQueryClient = { 95 | new BigQueryClient(BigQueryConfig(parameters)) 96 | } 97 | 98 | /** 99 | * Sets several necessary Spark Hadoop configuration options to enable access to Google Cloud Storage (GCS). 
100 | * @param conf Spark Hadoop configuration 101 | * @param project Google Cloud project 102 | * @param serviceAccountKeyFile Optional Google Cloud service account key file 103 | * @see [[https://cloud.google.com/storage/docs/authentication#service_accounts]] 104 | * @see [[https://github.com/GoogleCloudPlatform/bigdata-interop/blob/master/gcs/INSTALL.md]] 105 | */ 106 | private def initHadoop(conf: Configuration, project: String, serviceAccountKeyFile: Option[String]): Unit = { 107 | conf.set("fs.gs.impl", classOf[GoogleHadoopFileSystem].getName) 108 | conf.set("fs.AbstractFileSystem.gs.impl", classOf[GoogleHadoopFS].getName) 109 | conf.set("fs.gs.project.id", project) 110 | 111 | serviceAccountKeyFile foreach { file => 112 | conf.set("google.cloud.auth.service.account.enable", "true") 113 | conf.set("google.cloud.auth.service.account.json.keyfile", file) 114 | } 115 | } 116 | 117 | /** 118 | * Determines Spark options to be provided for a particular file format 119 | * @param format File format 120 | * @return Spark import/export options 121 | */ 122 | private def getFormatOptions(format: FileFormat): Map[String, String] = { 123 | format match { 124 | case CSV => 125 | Map("header" -> "true") 126 | 127 | case _ => 128 | Map.empty 129 | } 130 | } 131 | 132 | /** 133 | * Creates a Spark [[FileRelation]] for staged data files in a Google Cloud Storage (GCS) staging directory. 134 | * @param sqlContext Spark SQL context 135 | * @param stagingDirectory Staging directory path 136 | * @param format File export format 137 | * @return Spark [[BaseRelation]] for the staged data files 138 | */ 139 | private def getStagingDataFileRelation(sqlContext: SQLContext, stagingDirectory: String, 140 | format: FileFormat): BaseRelation = { 141 | val dataSource = DataSource( 142 | sparkSession = sqlContext.sparkSession, 143 | className = format.sparkFormatIdentifier, 144 | paths = List(stagingDirectory), 145 | userSpecifiedSchema = None, 146 | options = getFormatOptions(format) 147 | ) 148 | 149 | dataSource.resolveRelation(true) 150 | } 151 | 152 | /** 153 | * Gets a BigQuery table reference for the specified parameters. The parameters must either specify a table or 154 | * a SQL query. 155 | * @param sqlContext Spark SQL context 156 | * @param client BigQuery client 157 | * @param parameters Parameters - must either specify a table or SQL query. 158 | * @param tableOnly Specifies whether a direct table reference is required. 159 | * @return Reference to a BigQuery table that holds the data 160 | */ 161 | private def getBigQueryTableReference(sqlContext: SQLContext, client: BigQueryClient, 162 | parameters: Map[String, String], tableOnly: Boolean): BigQueryTableReference = { 163 | // Get direct table reference if 'table' has been specified 164 | val tableOpt = parameters.get("table").map(BigQueryTableReference(_)) 165 | 166 | if(tableOnly) { 167 | tableOpt.getOrElse(throw new MissingParameterException( 168 | "A parameter 'table' of the form [projectId].[datasetId].[tableId] must be specified." 169 | )) 170 | } else { 171 | // Execute 'sqlQuery' and get reference to table containing the results 172 | def sqlTableOpt: Option[BigQueryTableReference] = parameters.get("sqlQuery") map { sqlQuery => 173 | client.executeQuery(sqlQuery, sqlContext.sparkContext.defaultParallelism).table 174 | } 175 | 176 | tableOpt.orElse(sqlTableOpt).getOrElse(throw new MissingParameterException( 177 | "Either a parameter 'table' of the form [projectId].[datasetId].[tableId] or 'sqlQuery' must be specified." 
178 | )) 179 | } 180 | } 181 | 182 | /** 183 | * Constructs a BigQuery client, applies the necessary Spark Hadoop configuration and then calls a provided 184 | * function to create a [[BaseRelation]]. 185 | * @param sqlContext Spark SQL context 186 | * @param parameters Parameters 187 | * @param tableOnly Specifies whether a direct table reference is required. 188 | * @param createRelation Function to create a [[BaseRelation]] given a BigQuery client and a table reference. 189 | * @return Spark [[BaseRelation]] 190 | */ 191 | private def withBigQueryClient(sqlContext: SQLContext, parameters: Map[String, String], tableOnly: Boolean) 192 | (createRelation: 193 | (BigQueryClient, BigQueryTableReference) => BaseRelation): BaseRelation = { 194 | 195 | val client = getBigQueryClient(parameters) 196 | 197 | initHadoop(sqlContext.sparkContext.hadoopConfiguration, client.config.project, 198 | client.config.serviceAccountKeyFile) 199 | 200 | val tableReference = getBigQueryTableReference(sqlContext, client, parameters, tableOnly) 201 | 202 | createRelation(client, tableReference) 203 | } 204 | 205 | /** Helper class for parameter handling */ 206 | private implicit class TypeParameter(parameters: Map[String, String]) { 207 | /** 208 | * Fold on import/export type. 209 | * @param direct Block to execute when 'type' = 'direct' 210 | * @param handleFileFormat Function to execute when 'type' = 211 | */ 212 | def foldType[T](direct: => T)(handleFileFormat: FileFormat => T): T = { 213 | parameters.getOrElse("type", TYPE_DIRECT) match { 214 | case TYPE_DIRECT => 215 | direct 216 | 217 | case tpe => 218 | handleFileFormat(FileFormat(tpe)) 219 | } 220 | } 221 | } 222 | } 223 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/FileFormat.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery 23 | 24 | import com.google.cloud.bigquery.FormatOptions 25 | 26 | /** 27 | * File format used in conjunction with Spark and BigQuery import/export. 
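 *
 * Illustrative note (annotation added to this listing, not part of the original source): these case objects
 * correspond to the lower-case values accepted through the data source's 'type' option, e.g.
 * FileFormat("avro") yields [[FileFormat.AVRO]]; unknown strings cause an IllegalArgumentException (see the
 * companion object's apply method below).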
28 | */ 29 | private sealed trait FileFormat { 30 | /** Spark format identifier */ 31 | def sparkFormatIdentifier: String 32 | /** BigQuery format options */ 33 | def bigQueryFormatOptions: FormatOptions 34 | /** File extension */ 35 | def fileExtension: String 36 | } 37 | 38 | private object FileFormat { 39 | 40 | /** 41 | * JSON format. 42 | * @see [[https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-json]] 43 | */ 44 | case object JSON extends FileFormat { 45 | override val sparkFormatIdentifier: String = "json" 46 | override val bigQueryFormatOptions: FormatOptions = FormatOptions.json() 47 | override val fileExtension: String = "json" 48 | } 49 | 50 | /** 51 | * CSV format. 52 | * @see [[https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-csv]] 53 | */ 54 | case object CSV extends FileFormat { 55 | override val sparkFormatIdentifier: String = "csv" 56 | override val bigQueryFormatOptions: FormatOptions = FormatOptions.csv() 57 | override val fileExtension: String = "csv" 58 | } 59 | 60 | /** 61 | * Avro format. 62 | * @see [[https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-avro]] 63 | * @see [[https://github.com/databricks/spark-avro]] 64 | */ 65 | case object AVRO extends FileFormat { 66 | override val sparkFormatIdentifier: String = "com.databricks.spark.avro" 67 | override val bigQueryFormatOptions: FormatOptions = FormatOptions.avro() 68 | override val fileExtension: String = "avro" 69 | } 70 | 71 | /** 72 | * Parquet format. 73 | * @see [[https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-parquet]] 74 | * @see [[https://spark.apache.org/docs/latest/sql-programming-guide.html#parquet-files]] 75 | */ 76 | case object PARQUET extends FileFormat { 77 | override val sparkFormatIdentifier: String = "parquet" 78 | override val bigQueryFormatOptions: FormatOptions = FormatOptions.parquet() 79 | override val fileExtension: String = "parquet" 80 | } 81 | 82 | /** 83 | * ORC format. 84 | * @see [[https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-orc]] 85 | * @see [[https://spark.apache.org/docs/latest/sql-programming-guide.html#orc-files]] 86 | */ 87 | case object ORC extends FileFormat { 88 | override val sparkFormatIdentifier: String = "orc" 89 | override val bigQueryFormatOptions: FormatOptions = FormatOptions.orc() 90 | override val fileExtension: String = "orc" 91 | } 92 | 93 | /** Creates a file format from a string. 
*/ 94 | def apply(format: String): FileFormat = { 95 | format.toLowerCase match { 96 | case "parquet" => PARQUET 97 | case "avro" => AVRO 98 | case "orc" => ORC 99 | case "json" => JSON 100 | case "csv" => CSV 101 | case _ => 102 | throw new IllegalArgumentException(s"Unsupported file format: $format") 103 | } 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/client/BigQueryClient.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.client 23 | 24 | import java.io.FileInputStream 25 | import java.time.{Instant, ZoneId} 26 | import java.time.format.DateTimeFormatter 27 | 28 | import com.google.auth.oauth2.ServiceAccountCredentials 29 | import com.google.cloud.RetryOption 30 | import com.google.cloud.bigquery.BigQuery.DatasetDeleteOption 31 | import com.google.cloud.bigquery.InsertAllRequest.RowToInsert 32 | import com.google.cloud.bigquery.JobInfo.{CreateDisposition, WriteDisposition} 33 | import com.google.cloud.bigquery.{Option => _, _} 34 | import com.miraisolutions.spark.bigquery.config.{BigQueryConfig, StagingDatasetConfig} 35 | import com.miraisolutions.spark.bigquery.exception.{IOException, UnsupportedFormatException} 36 | import com.miraisolutions.spark.bigquery.utils.SqlLogger 37 | import com.miraisolutions.spark.bigquery.{BigQuerySchemaConverter, BigQueryTableReference, FileFormat} 38 | import org.apache.spark.sql.types.StructType 39 | import org.apache.spark.sql.{DataFrame, SaveMode} 40 | import org.slf4j.LoggerFactory 41 | import org.threeten.bp.Duration 42 | 43 | import scala.collection.JavaConverters._ 44 | import scala.language.implicitConversions 45 | import scala.util.Random 46 | 47 | private object BigQueryClient { 48 | import FileFormat._ 49 | 50 | // Prefix for temporary tables and directories 51 | private val TEMP_PREFIX = "spark_" 52 | 53 | // Timestamp formatter for temporary tables and GCS staging directories 54 | private val TIMESTAMP_FORMATTER = 55 | DateTimeFormatter.ofPattern("yyyyMMddHHmmss").withZone(ZoneId.of("UTC")) 56 | 57 | // File formats not currently available for BigQuery imports 58 | private val UNSUPPORTED_BIGQUERY_IMPORT_FORMATS: Set[FileFormat] = Set(JSON, CSV) 59 | // File formats not 
currently available for BigQuery exports 60 | private val UNSUPPORTED_BIGQUERY_EXPORT_FORMATS: Set[FileFormat] = Set(PARQUET, ORC) 61 | } 62 | 63 | /** 64 | * BigQuery Client 65 | * 66 | * @param config BigQuery configuration 67 | */ 68 | private[bigquery] class BigQueryClient(val config: BigQueryConfig) { 69 | import BigQueryClient._ 70 | 71 | // Internal BigQuery client 72 | private val bigquery: BigQuery = getBigQueryService() 73 | 74 | private val logger = LoggerFactory.getLogger(classOf[BigQueryClient]) 75 | private val sqlLogger = SqlLogger(logger) 76 | 77 | /** 78 | * Creates an internal BigQuery client that uses the provided service account credentials or the application 79 | * default credentials if no service account credentials have been provided. 80 | * @return BigQuery service interface 81 | * @see [[https://cloud.google.com/docs/authentication/]] 82 | * @see [[https://cloud.google.com/bigquery/docs/authentication/]] 83 | * @see [[https://github.com/GoogleCloudPlatform/google-cloud-java#authentication]] 84 | */ 85 | private def getBigQueryService(): BigQuery = { 86 | config.serviceAccountKeyFile.fold(BigQueryOptions.getDefaultInstance.getService) { keyFile => 87 | BigQueryOptions.newBuilder() 88 | .setCredentials(ServiceAccountCredentials.fromStream(new FileInputStream(keyFile))) 89 | .build() 90 | .getService 91 | } 92 | } 93 | 94 | /** 95 | * Retrieves a dataset or creates it if it doesn't exist. 96 | * @param project Project ID 97 | * @param dataset Dataset ID 98 | * @param build Function to configure the dataset to be created if it doesn't exist yet 99 | * @return BigQuery [[Dataset]] 100 | */ 101 | private def getOrCreateDataset(project: String, dataset: String) 102 | (build: DatasetInfo.Builder => DatasetInfo.Builder): Dataset = { 103 | val datasetId = DatasetId.of(project, dataset) 104 | Option(bigquery.getDataset(datasetId)).getOrElse { 105 | logger.info(s"Creating dataset $dataset in project $project") 106 | 107 | val datasetBuilder = DatasetInfo.newBuilder(datasetId) 108 | // New datasets are always created in the configured location 109 | val datasetInfo = build(datasetBuilder).setLocation(config.location).build() 110 | bigquery.create(datasetInfo) 111 | } 112 | } 113 | 114 | /** 115 | * Retrieve or create staging dataset which hosts temporary SQL query result tables. 116 | * @return Staging dataset ID 117 | */ 118 | private def getOrCreateStagingDataset(): DatasetId = { 119 | import config._ 120 | 121 | val ds = getOrCreateDataset(project, stagingDataset.name + "_" + location) { builder => 122 | builder 123 | .setDefaultTableLifetime(stagingDataset.lifetime) 124 | .setDescription(StagingDatasetConfig.DESCRIPTION) 125 | } 126 | 127 | ds.getDatasetId 128 | } 129 | 130 | /** 131 | * Creates a temporary name that can be used for temporary tables and directories. 132 | */ 133 | private def createTempName(): String = { 134 | TEMP_PREFIX + TIMESTAMP_FORMATTER.format(Instant.now()) + "_" + Random.nextInt(Int.MaxValue) 135 | } 136 | 137 | /** 138 | * Creates a new (unique) reference to a temporary table which will contain the results of an executed SQL query. 
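 *
 * Illustrative example (annotation added to this listing, not part of the original source; the concrete
 * names are hypothetical): with a staging dataset named "spark_staging" and location "EU", a generated
 * reference might be "spark_staging_EU.spark_20180601120000_123456789" within the configured project,
 * following [[getOrCreateStagingDataset]] and [[createTempName]].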
139 | * @return BigQuery table reference 140 | */ 141 | private def createTemporaryTableReference(): BigQueryTableReference = { 142 | val stagingDataset = getOrCreateStagingDataset() 143 | val tempTableName = createTempName() 144 | 145 | BigQueryTableReference(stagingDataset.getProject, stagingDataset.getDataset, tempTableName) 146 | } 147 | 148 | /** 149 | * Retrieves the Spark schema for a BigQuery table 150 | * @param table BigQuery table reference 151 | * @return Spark schema 152 | */ 153 | def getSchema(table: BigQueryTableReference): StructType = { 154 | val schema = bigquery.getTable(table).getDefinition[TableDefinition].getSchema 155 | BigQuerySchemaConverter.fromBigQueryToSpark(schema) 156 | } 157 | 158 | /** 159 | * Gets a BigQuery table reader that can be used to read a BigQuery table through a number of pages/partitions. 160 | * @param table BigQuery table reference 161 | * @param numPartitions Suggested number of target partitions. The effective number of partitions may be different. 162 | * @return BigQuery table reader 163 | */ 164 | def getTable(table: BigQueryTableReference, numPartitions: Int): BigQueryTableReader = { 165 | val tbl = bigquery.getTable(table) 166 | BigQueryTableReader(tbl, tbl.list().getTotalRows, numPartitions) 167 | } 168 | 169 | /** 170 | * Deletes a BigQuery table. 171 | * @param table BigQuery table reference 172 | * @return True if the table was deleted and false if the table was not found 173 | */ 174 | def deleteTable(table: BigQueryTableReference): Boolean = { 175 | logger.info(s"Deleting table $table") 176 | bigquery.getTable(table).delete() 177 | } 178 | 179 | /** 180 | * Deletes a BigQuery dataset and its contents. 181 | * @param project Project ID 182 | * @param dataset Dataset ID 183 | * @return True if the dataset was deleted, false if it was not found. 184 | */ 185 | def deleteDataset(project: String, dataset: String): Boolean = { 186 | logger.info(s"Deleting dataset $dataset in project $project") 187 | bigquery.delete(DatasetId.of(project, dataset), DatasetDeleteOption.deleteContents()) 188 | } 189 | 190 | /** 191 | * Executes a BigQuery standard SQL query and returns a BigQuery table reader to retrieve the results. 192 | * @param query BigQuery standard SQL (SQL-2011) query 193 | * @param numPartitions Number of target partitions 194 | * @return BigQuery table reader 195 | */ 196 | def executeQuery(query: String, numPartitions: Int): BigQueryTableReader = { 197 | sqlLogger.logSqlQuery(query) 198 | val tempTable = createTemporaryTableReference() 199 | 200 | val queryJobConfiguration = 201 | QueryJobConfiguration.newBuilder(query) 202 | .setUseLegacySql(false) 203 | .setAllowLargeResults(true) 204 | .setFlattenResults(false) 205 | .setPriority(config.job.priority.underlying) 206 | .setDestinationTable(tempTable) 207 | .setCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) 208 | .setWriteDisposition(WriteDisposition.WRITE_EMPTY) 209 | .build() 210 | 211 | val totalRows = bigquery.query(queryJobConfiguration).getTotalRows 212 | val tbl = bigquery.getTable(tempTable) 213 | BigQueryTableReader(tbl, totalRows, numPartitions) 214 | } 215 | 216 | /** 217 | * Inserts the rows of a Spark [[DataFrame]] into a BigQuery table. 
218 | * @param df Spark [[DataFrame]] 219 | * @param table BigQuery table 220 | */ 221 | private def insertRows(df: DataFrame, table: Table): Unit = { 222 | // Getting a stable reference to the schema for serialization with the following closure 223 | val schema = df.schema 224 | 225 | df foreachPartition { rows => 226 | if(rows.nonEmpty) { 227 | val converter = BigQuerySchemaConverter.getSparkToBigQueryConverterFunction(schema) 228 | val rowsToInsert = rows.map(row => RowToInsert.of(converter(row))).toIterable.asJava 229 | val response = table.insert(rowsToInsert, false, false) 230 | 231 | if (response.hasErrors) { 232 | val msg = response.getInsertErrors.asScala.values.flatMap(_.asScala.map(_.getMessage)).toSet.mkString("\n") 233 | throw new IOException(msg) 234 | } 235 | } 236 | } 237 | } 238 | 239 | /** 240 | * Writes a Spark [[DataFrame]] to a BigQuery table. 241 | * @param df Spark [[DataFrame]] 242 | * @param table Target BigQuery table 243 | * @param mode Save mode 244 | */ 245 | def writeTable(df: DataFrame, table: BigQueryTableReference, mode: SaveMode): Unit = { 246 | import SaveMode._ 247 | 248 | logger.info(s"Attempting to insert ${df.count()} rows to table $table" + 249 | s" (mode: $mode, partitions: ${df.rdd.getNumPartitions})") 250 | 251 | val ds = getOrCreateDataset(table.project, table.dataset)(identity) 252 | 253 | val schema = BigQuerySchemaConverter.fromSparkToBigQuery(df.schema) 254 | 255 | mode match { 256 | case Append => 257 | val tbl = ds.getOrCreateTable(table.table, schema) 258 | insertRows(df, tbl) 259 | 260 | case Overwrite => 261 | val tbl = ds.dropAndCreateTable(table.table, schema) 262 | insertRows(df, tbl) 263 | 264 | case ErrorIfExists => 265 | if(ds.existsNonEmptyTable(table.table)) { 266 | throw new IllegalStateException(s"Table $table already exists and is not empty") 267 | } else { 268 | val tbl = ds.getOrCreateTable(table.table, schema) 269 | insertRows(df, tbl) 270 | } 271 | 272 | case Ignore => 273 | if(!ds.existsNonEmptyTable(table.table)) { 274 | val tbl = ds.getOrCreateTable(table.table, schema) 275 | insertRows(df, tbl) 276 | } 277 | } 278 | } 279 | 280 | /** 281 | * Constructs a Google Cloud Storage (GCS) staging directory path that can be used to stage data files for data 282 | * import and export. 283 | * @return GCS directory path 284 | */ 285 | def getStagingDirectory(): String = { 286 | import config.stagingDataset._ 287 | 288 | val tempDirectoryName = createTempName() 289 | s"gs://$gcsBucket/$name/$tempDirectoryName" 290 | } 291 | 292 | /** 293 | * Exports a BigQuery table as a series of files to a temporary directory in a Google Cloud Storage (GCS) bucket. 
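 * The destination URI ends in a wildcard pattern of the form `<table>_*.<extension>`, which allows BigQuery
 * to split large tables across multiple extract files in the staging directory.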
294 | * @param table BigQuery table to export 295 | * @param format File export format 296 | * @return Temporary GCS staging directory containing the exported files in the specified format 297 | * @see [[https://cloud.google.com/bigquery/docs/exporting-data]] 298 | */ 299 | def exportTable(table: BigQueryTableReference, format: FileFormat): String = { 300 | if(UNSUPPORTED_BIGQUERY_EXPORT_FORMATS.contains(format)) { 301 | throw new UnsupportedFormatException(s"Unsupported BigQuery export format: $format") 302 | } 303 | 304 | val stagingDirectory = getStagingDirectory() 305 | val destinationUri = s"$stagingDirectory/${table.table}_*.${format.fileExtension}" 306 | 307 | logger.info(s"Starting export of table $table to $destinationUri (format: $format)") 308 | val job = bigquery.getTable(table).extract(format.bigQueryFormatOptions.getType, destinationUri) 309 | waitForJob(job) 310 | logger.info(s"Done exporting table $table") 311 | 312 | stagingDirectory 313 | } 314 | 315 | /** 316 | * Imports data from a Google Cloud Storage (GCS) directory into a BigQuery table. 317 | * @param path GCS directory path 318 | * @param format File format 319 | * @param table BigQuery table reference 320 | * @param mode Save mode 321 | */ 322 | def importTable(path: String, format: FileFormat, table: BigQueryTableReference, mode: SaveMode): Unit = { 323 | import SaveMode._ 324 | 325 | if(UNSUPPORTED_BIGQUERY_IMPORT_FORMATS.contains(format)) { 326 | throw new UnsupportedFormatException(s"Unsupported BigQuery import format: $format") 327 | } 328 | 329 | getOrCreateDataset(table.project, table.dataset)(identity) 330 | 331 | val baseConfig = LoadJobConfiguration.builder(table, path + s"*.${format.fileExtension}") 332 | .setAutodetect(true) 333 | .setIgnoreUnknownValues(false) 334 | .setMaxBadRecords(0) 335 | .setFormatOptions(format.bigQueryFormatOptions) 336 | .setCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) 337 | 338 | val (writeDisposition, ignoreDuplicateError) = mode match { 339 | case Append => 340 | (WriteDisposition.WRITE_APPEND, false) 341 | 342 | case Overwrite => 343 | (WriteDisposition.WRITE_TRUNCATE, false) 344 | 345 | case ErrorIfExists => 346 | (WriteDisposition.WRITE_EMPTY, false) 347 | 348 | case Ignore => 349 | (WriteDisposition.WRITE_EMPTY, true) 350 | } 351 | 352 | val jobInfo = JobInfo.of(baseConfig.setWriteDisposition(writeDisposition).build()) 353 | val job = bigquery.create(jobInfo) 354 | 355 | logger.info(s"Starting import into table $table from $path (format: $format, mode: $mode)") 356 | waitForJob(job, ignoreDuplicateError) 357 | logger.info(s"Done importing into table $table") 358 | } 359 | 360 | /** 361 | * Waits for completion of a job and check for errors. 
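 * Completion is polled with an exponential backoff starting at 1 second, growing by a factor of 1.2 and
 * capped at 30 seconds, until the configured job timeout is reached.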
362 | * @param job Job to wait for 363 | * @param ignoreDuplicateError Whether to ignore duplicate errors or not 364 | * @see [[https://cloud.google.com/bigquery/troubleshooting-errors]] 365 | */ 366 | private def waitForJob(job: Job, ignoreDuplicateError: Boolean = false): Unit = { 367 | val status = job.waitFor( 368 | RetryOption.initialRetryDelay(Duration.ofSeconds(1)), 369 | RetryOption.retryDelayMultiplier(1.2), 370 | RetryOption.maxRetryDelay(Duration.ofSeconds(30)), 371 | RetryOption.totalTimeout(Duration.ofMillis(config.job.timeout)) 372 | ).getStatus 373 | 374 | if(status.getError != null && (!ignoreDuplicateError || status.getError.getReason != "duplicate")) { 375 | throw new IOException(s"BigQuery job ${job.getJobId} failed with message: ${status.getError.getMessage}") 376 | } 377 | } 378 | } 379 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/client/BigQueryTableReader.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.client 23 | 24 | import com.google.cloud.bigquery.BigQuery.TableDataListOption 25 | import com.google.cloud.bigquery._ 26 | import com.miraisolutions.spark.bigquery.{BigQuerySchemaConverter, BigQueryTableReference} 27 | import org.apache.spark.sql.Row 28 | import org.slf4j.LoggerFactory 29 | 30 | import scala.collection.JavaConverters._ 31 | 32 | private[bigquery] object BigQueryTableReader { 33 | // Maximum number of rows per BigQuery page. See https://cloud.google.com/bigquery/docs/paging-results 34 | private val MAX_ROWS_PER_PAGE: Long = 100000L 35 | } 36 | 37 | /** 38 | * Table reader used to read a BigQuery table through a number of pages/partitions. 39 | * @param table BigQuery table 40 | * @param totalRows Total number of rows to read (across all pages) 41 | * @param suggestedNumPartitions Suggested number of target partitions. The effective number of partitions may 42 | * be different. 
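 * For example, with `totalRows = 250000` and `suggestedNumPartitions = 2`, the suggested page size of
 * 125000 rows is capped at the BigQuery maximum of 100000 rows per page, which yields 3 effective partitions.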
43 | * @see [[https://cloud.google.com/bigquery/docs/paging-results]] 44 | */ 45 | private[bigquery] case class BigQueryTableReader private (table: Table, totalRows: Long, suggestedNumPartitions: Int) { 46 | import BigQueryTableReader._ 47 | 48 | private val logger = LoggerFactory.getLogger(classOf[BigQueryTableReader]) 49 | 50 | // BigQuery => Spark schema converter 51 | private lazy val converter: FieldValueList => Row = { 52 | val schema = table.getDefinition[TableDefinition].getSchema 53 | BigQuerySchemaConverter.getBigQueryToSparkConverterFunction(schema) 54 | } 55 | 56 | // Page size to use when reading from BigQuery; note that there is a limit of 100k rows per page 57 | private val pageSize: Long = { 58 | val suggestedPageSize = (totalRows + suggestedNumPartitions - 1) / suggestedNumPartitions 59 | Math.min(Math.min(suggestedPageSize, MAX_ROWS_PER_PAGE), totalRows) 60 | } 61 | 62 | /** 63 | * The effective number of partitions. This may be different from the `suggestedNumPartitions`. 64 | */ 65 | def numPartitions: Int = Math.ceil(totalRows.toDouble / pageSize).toInt 66 | 67 | /** 68 | * Reads a page/partition of a specified size. 69 | * @param pageIndex Page index 70 | * @return BigQuery [[TableResult]] that can be used to iterate through the results 71 | */ 72 | private def getTableResult(pageIndex: Int): TableResult = { 73 | table.list( 74 | TableDataListOption.pageSize(pageSize), 75 | TableDataListOption.startIndex(pageSize * pageIndex) 76 | ) 77 | } 78 | 79 | /** 80 | * Get a row iterable for the specified partition. 81 | * @param partitionIndex Partition index 82 | * @return Row iterable 83 | */ 84 | def getRows(partitionIndex: Int): Iterable[Row] = { 85 | logger.info(s"Retrieving rows of table ${BigQueryTableReference(table.getTableId)} partition $partitionIndex" + 86 | s" (page size: $pageSize, total rows: $totalRows, partitions: $numPartitions)") 87 | 88 | val result = getTableResult(partitionIndex) 89 | result.getValues.asScala.map(converter) 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/client/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery 23 | 24 | import com.google.cloud.bigquery.BigQuery.TableDataListOption 25 | import com.google.cloud.bigquery._ 26 | import org.slf4j.LoggerFactory 27 | 28 | package object client { 29 | 30 | private val logger = LoggerFactory.getLogger(this.getClass.getName) 31 | 32 | /** 33 | * Some convenience methods on [[Dataset]] 34 | * @param ds BigQuery [[Dataset]] 35 | */ 36 | private[client] implicit class BigQueryDataset(val ds: Dataset) { 37 | 38 | private def fold[T](table: String)(ifNotExists: => T)(f: Table => T): T = { 39 | Option(ds.get(table)).fold(ifNotExists) { tbl => 40 | if(tbl.exists()) f(tbl) else ifNotExists 41 | } 42 | } 43 | 44 | def getOrCreateTable(table: String, schema: Schema): Table = { 45 | fold(table)(createTable(table, schema))(identity) 46 | } 47 | 48 | def existsTable(table: String): Boolean = { 49 | fold(table)(false)(_ => true) 50 | } 51 | 52 | def isNonEmptyTable(table: String): Boolean = { 53 | fold(table)(false)(_.list(TableDataListOption.pageSize(1)).getTotalRows > 0) 54 | } 55 | 56 | def existsNonEmptyTable(table: String): Boolean = { 57 | existsTable(table) && isNonEmptyTable(table) 58 | } 59 | 60 | def createTable(table: String, schema: Schema): Table = { 61 | logger.info(s"Creating table $table in dataset ${ds.getDatasetId.getDataset} " + 62 | s"of project ${ds.getDatasetId.getProject}") 63 | 64 | val tableDefinition = StandardTableDefinition.newBuilder() 65 | .setType(TableDefinition.Type.TABLE) 66 | .setSchema(schema) 67 | .build() 68 | 69 | ds.create(table, tableDefinition) 70 | } 71 | 72 | def dropTable(table: String): Unit = { 73 | fold(table)((): Unit) { table => 74 | logger.info(s"Deleting table $table in dataset ${ds.getDatasetId.getDataset} " + 75 | s"of project ${ds.getDatasetId.getProject}") 76 | table.delete() 77 | } 78 | } 79 | 80 | def dropAndCreateTable(table: String, schema: Schema): Table = { 81 | dropTable(table) 82 | createTable(table, schema) 83 | } 84 | } 85 | 86 | } 87 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/config/BigQueryConfig.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.config 23 | 24 | import com.miraisolutions.spark.bigquery.config.JobConfig.Priority 25 | 26 | object StagingDatasetConfig { 27 | private val namespace = "bq.staging_dataset." 28 | 29 | private[bigquery] val DESCRIPTION = "Spark BigQuery staging dataset" 30 | 31 | private[config] object Keys { 32 | val NAME = namespace + "name" 33 | val LIFETIME = namespace + "lifetime" 34 | val GCS_BUCKET = namespace + "gcs_bucket" 35 | } 36 | 37 | private[config] object Defaults { 38 | val NAME = "spark_staging" 39 | val LIFETIME = 86400000L 40 | } 41 | } 42 | 43 | /** 44 | * BigQuery staging dataset configuration. A staging dataset is used to temporarily store the results of SQL queries. 45 | * @param name Name of staging dataset 46 | * @param lifetime Default table lifetime in milliseconds. Tables are automatically deleted once the lifetime has 47 | * been reached. 48 | * @param gcsBucket Google Cloud Storage (GCS) bucket to use for storing temporary files. Temporary files are used 49 | * when importing through BigQuery load jobs and exporting through BigQuery extraction jobs. 50 | * @see [[https://cloud.google.com/bigquery/docs/dataset-locations]] 51 | */ 52 | case class StagingDatasetConfig( 53 | name: String = StagingDatasetConfig.Defaults.NAME, 54 | lifetime: Long = StagingDatasetConfig.Defaults.LIFETIME, 55 | gcsBucket: String 56 | ) 57 | 58 | 59 | object JobConfig { 60 | import com.google.cloud.bigquery.QueryJobConfiguration.{Priority => BQPriority} 61 | 62 | private val namespace = "bq.job." 63 | 64 | /** BigQuery job priority */ 65 | sealed trait Priority { 66 | private[bigquery] def underlying: BQPriority 67 | } 68 | object Priority { 69 | 70 | /** 71 | * BigQuery interactive priority. Runs jobs as soon as possible. Interactive queries count towards the 72 | * concurrent rate limit and the daily limit. 73 | * 74 | * @see https://cloud.google.com/bigquery/docs/running-queries 75 | * @see https://cloud.google.com/bigquery/quotas 76 | */ 77 | case object Interactive extends Priority { 78 | override private[bigquery] def underlying: BQPriority = BQPriority.INTERACTIVE 79 | } 80 | 81 | /** 82 | * BigQuery batch priority. Jobs start as soon as idle resources are available, usually within a few minutes. 83 | * Batch queries don't count towards the concurrent rate limit and the daily limit. 84 | * 85 | * @see https://cloud.google.com/bigquery/docs/running-queries 86 | */ 87 | case object Batch extends Priority { 88 | override private[bigquery] def underlying: BQPriority = BQPriority.BATCH 89 | } 90 | 91 | private[config] def parse(s: String): Priority = s.toLowerCase match { 92 | case "interactive" => Interactive 93 | case "batch" => Batch 94 | case _ => throw new IllegalArgumentException("Invalid priority: " + s) 95 | } 96 | } 97 | 98 | private[config] object Keys { 99 | val PRIORITY = namespace + "priority" 100 | val TIMEOUT = namespace + "timeout" 101 | } 102 | 103 | private[config] object Defaults { 104 | val PRIORITY = Priority.Interactive 105 | val TIMEOUT = 3600000L 106 | } 107 | } 108 | 109 | /** 110 | * BigQuery job configuration options. 111 | * @param priority BigQuery job priority when executing SQL queries. Defaults to "interactive", i.e. the 112 | * query is executed as soon as possible. 113 | * @param timeout Timeout in milliseconds after which a file import/export job should be considered as failed. 114 | * Defaults to 3600000 ms = 1 h. 
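 * A minimal construction sketch (the batch priority and 30-minute timeout below are illustrative values,
 * not recommendations):
 * {{{
 *   val jobConfig = JobConfig(priority = JobConfig.Priority.Batch, timeout = 1800000L) // 30 minutes
 * }}}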
115 | * @see [[https://cloud.google.com/bigquery/quota-policy]] 116 | */ 117 | case class JobConfig( 118 | priority: Priority = JobConfig.Defaults.PRIORITY, 119 | timeout: Long = JobConfig.Defaults.TIMEOUT 120 | ) 121 | 122 | 123 | object BigQueryConfig { 124 | private val namespace = "bq." 125 | 126 | private[config] object Keys { 127 | val PROJECT = namespace + "project" 128 | val LOCATION = namespace + "location" 129 | val SERVICE_ACCOUNT_KEY_FILE = namespace + "service_account_key_file" 130 | } 131 | 132 | /** 133 | * Constructs typed BigQuery configuration options from a parameter map. 134 | * @param parameters Parameter map 135 | */ 136 | def apply(parameters: Map[String, String]): BigQueryConfig = { 137 | val project = parameters(Keys.PROJECT) 138 | val location = parameters(Keys.LOCATION) 139 | val serviceAccountKeyFile = parameters.get(Keys.SERVICE_ACCOUNT_KEY_FILE) 140 | 141 | val stagingDataset = StagingDatasetConfig( 142 | name = parameters.getOrElse(StagingDatasetConfig.Keys.NAME, StagingDatasetConfig.Defaults.NAME), 143 | lifetime = parameters.get(StagingDatasetConfig.Keys.LIFETIME).map(_.toLong) 144 | .getOrElse(StagingDatasetConfig.Defaults.LIFETIME), 145 | gcsBucket = parameters(StagingDatasetConfig.Keys.GCS_BUCKET) 146 | ) 147 | 148 | val job = JobConfig( 149 | priority = parameters.get(JobConfig.Keys.PRIORITY).map(Priority.parse).getOrElse(JobConfig.Defaults.PRIORITY), 150 | timeout = parameters.get(JobConfig.Keys.TIMEOUT).map(_.toLong).getOrElse(JobConfig.Defaults.TIMEOUT) 151 | ) 152 | 153 | BigQueryConfig(project, location , serviceAccountKeyFile, stagingDataset, job) 154 | } 155 | } 156 | 157 | /** 158 | * BigQuery configuration. 159 | * 160 | * @param project BigQuery billing project ID. 161 | * @param location Geographic location where newly created datasets should reside. "EU" or "US". 162 | * This holds for new datasets that are being created as part of a Spark write operation 163 | * and for temporary staging datasets. 164 | * @param serviceAccountKeyFile Optional Google Cloud service account key file to use for authentication with Google 165 | * Cloud services. The use of service accounts is highly recommended. Specifically, the 166 | * service account will be used to interact with BigQuery and Google Cloud Storage (GCS). 167 | * If not specified, application default credentials will be used. 168 | * @param stagingDataset BigQuery staging dataset configuration options. 169 | * @param job BigQuery job configuration options. 
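 * A minimal construction sketch (the project ID, location and bucket name below are illustrative):
 * {{{
 *   val config = BigQueryConfig(
 *     project = "my-billing-project",
 *     location = "EU",
 *     stagingDataset = StagingDatasetConfig(gcsBucket = "my-staging-bucket")
 *   )
 * }}}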
170 | * @see [[https://cloud.google.com/bigquery/pricing]] 171 | * @see [[https://cloud.google.com/bigquery/docs/dataset-locations]] 172 | * @see [[https://cloud.google.com/docs/authentication/]] 173 | * @see [[https://cloud.google.com/bigquery/docs/authentication/]] 174 | * @see [[https://cloud.google.com/storage/docs/authentication/]] 175 | */ 176 | case class BigQueryConfig( 177 | project: String, 178 | location: String, 179 | serviceAccountKeyFile: Option[String] = None, 180 | stagingDataset: StagingDatasetConfig, 181 | job: JobConfig = JobConfig() 182 | ) 183 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/config/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery 23 | 24 | import org.apache.spark.sql.{DataFrameReader, DataFrameWriter, Row} 25 | import scala.language.reflectiveCalls 26 | 27 | package object config { 28 | 29 | // Structural helper type 30 | private type OPT[T] = { 31 | def format(source: String): T 32 | def option(key: String, value: String): T 33 | } 34 | 35 | // Applies format and configuration options on a DataFrameReader or DataFrameWriter 36 | private[bigquery] def applyDataFrameOptions[T <: OPT[T]](obj: T, config: BigQueryConfig): T = { 37 | import config._ 38 | 39 | val objWithOptions = 40 | obj 41 | .format(DefaultSource.BIGQUERY_DATA_SOURCE_NAME) 42 | .option(BigQueryConfig.Keys.PROJECT, project) 43 | .option(BigQueryConfig.Keys.LOCATION, location) 44 | .option(StagingDatasetConfig.Keys.NAME, stagingDataset.name) 45 | .option(StagingDatasetConfig.Keys.LIFETIME, stagingDataset.lifetime.toString) 46 | .option(StagingDatasetConfig.Keys.GCS_BUCKET, stagingDataset.gcsBucket) 47 | .option(JobConfig.Keys.PRIORITY, job.priority.toString) 48 | 49 | serviceAccountKeyFile.fold(objWithOptions) { file => 50 | objWithOptions.option(BigQueryConfig.Keys.SERVICE_ACCOUNT_KEY_FILE, file) 51 | } 52 | } 53 | 54 | implicit class DataFrameReaderConfig(val reader: DataFrameReader) extends AnyVal { 55 | /** 56 | * Utility method to apply typed BigQuery configuration. 
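 * A usage sketch (requires `import com.miraisolutions.spark.bigquery.config._`; the table and options mirror
 * the Shakespeare example in this project):
 * {{{
 *   val df = spark.read
 *     .bigquery(config)
 *     .option("table", "bigquery-public-data.samples.shakespeare")
 *     .option("type", "direct")
 *     .load()
 * }}}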
57 | * @param config BigQuery configuration 58 | */ 59 | def bigquery(config: BigQueryConfig): DataFrameReader = applyDataFrameOptions(reader, config) 60 | } 61 | 62 | implicit class DataFrameWriterConfig(val writer: DataFrameWriter[Row]) extends AnyVal { 63 | /** 64 | * Utility method to apply typed BigQuery configuration. 65 | * @param config BigQuery configuration 66 | */ 67 | def bigquery(config: BigQueryConfig): DataFrameWriter[Row] = applyDataFrameOptions(writer, config) 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/examples/Shakespeare.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.examples 23 | 24 | import org.apache.spark.sql.{SaveMode, SparkSession} 25 | import com.miraisolutions.spark.bigquery.config._ 26 | 27 | /** 28 | * Reads the public Google BigQuery sample dataset 'shakespeare'. 29 | * 30 | * To run this example, first compile an assembly using `sbt assembly`. Then run: 31 | * 32 | * ==Local Spark Cluster== 33 | * `spark-submit --class com.miraisolutions.spark.bigquery.examples.Shakespeare --master local[*] 34 | * target/scala-2.11/spark-bigquery-assembly-<version>.jar <arguments>` 35 | * 36 | * ==Google Cloud Dataproc== 37 | * Log in to the service account: 38 | * `gcloud auth activate-service-account --key-file=[KEY-FILE]` 39 | * 40 | * @see [[https://cloud.google.com/storage/docs/authentication#service_accounts]] 41 | * 42 | * `gcloud dataproc jobs submit spark --cluster <cluster-name> --class 43 | * com.miraisolutions.spark.bigquery.examples.Shakespeare --jars 44 | * target/scala-2.11/spark-bigquery-assembly-<version>.jar -- <arguments>` 45 | * 46 | * Where `<arguments>` are: 47 | * 1. Google BigQuery billing project ID 48 | * 2. Google BigQuery dataset location (EU, US) 49 | * 3. Google Cloud Storage (GCS) bucket where staging files will be located 50 | * 4.
Google Cloud service account key file (works only when run against local cluster) 51 | * 52 | * @see [[https://cloud.google.com/bigquery/public-data/]] 53 | * @see [[https://cloud.google.com/bigquery/docs/dataset-locations]] 54 | * @see [[https://cloud.google.com/storage/docs/authentication#service_accounts]] 55 | * @see [[https://cloud.google.com/dataproc/]] 56 | */ 57 | object Shakespeare { 58 | def main(args: Array[String]): Unit = { 59 | 60 | // Initialize Spark session 61 | val spark = SparkSession 62 | .builder 63 | .appName("Google BigQuery Shakespeare") 64 | .getOrCreate 65 | 66 | import spark.implicits._ 67 | 68 | // Define BigQuery options 69 | val config = BigQueryConfig( 70 | project = args(0), // Google BigQuery billing project ID 71 | location = args(1), // Google BigQuery dataset location 72 | stagingDataset = StagingDatasetConfig( 73 | gcsBucket = args(2) // Google Cloud Storage bucket for staging files 74 | ), 75 | // Google Cloud service account key file - works only in local cluster mode 76 | serviceAccountKeyFile = if(args.length > 3) Some(args(3)) else None 77 | ) 78 | 79 | // Read public shakespeare data table using direct import (streaming) 80 | val shakespeare = spark.read 81 | .bigquery(config) 82 | .option("table", "bigquery-public-data.samples.shakespeare") 83 | .option("type", "direct") 84 | .load() 85 | 86 | val hamlet = shakespeare.filter($"corpus".like("hamlet")) 87 | hamlet.show(100) 88 | 89 | shakespeare.createOrReplaceTempView("shakespeare") 90 | val macbeth = spark.sql("SELECT * FROM shakespeare WHERE corpus = 'macbeth'").persist() 91 | macbeth.show(100) 92 | 93 | // Write filtered data table via a Parquet export on GCS 94 | macbeth.write 95 | .bigquery(config) 96 | .option("table", args(0) + ".samples.macbeth") 97 | .option("type", "parquet") 98 | .mode(SaveMode.Overwrite) 99 | .save() 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/exception/IOException.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.exception 23 | 24 | /** 25 | * Signals an error when trying to read from or write to BigQuery. 
26 | * @param message Exception message 27 | */ 28 | private[bigquery] class IOException(message: String) extends java.io.IOException(message) 29 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/exception/MissingParameterException.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.exception 23 | 24 | /** 25 | * Exception thrown in case of a missing parameter. 26 | * @param message Exception message 27 | */ 28 | private[bigquery] class MissingParameterException(message: String) extends Exception(message) 29 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/exception/ParseException.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.exception 23 | 24 | /** 25 | * Signals that an error occurred while parsing a string. 
26 | * @param message Exception message 27 | */ 28 | private[bigquery] class ParseException(message: String) extends Exception(message) 29 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/exception/UnsupportedFormatException.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.exception 23 | 24 | /** 25 | * Thrown to indicate that an unsupported BigQuery import/export format is being used. 26 | * @param message Exception message 27 | */ 28 | private[bigquery] class UnsupportedFormatException(message: String) extends UnsupportedOperationException(message) 29 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/sql/BigQueryDialect.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.sql 23 | 24 | import org.apache.spark.sql.jdbc.JdbcDialect 25 | 26 | /** 27 | * Google BigQuery standard SQL dialect (SQL-2011) 28 | * 29 | * @see [[https://cloud.google.com/bigquery/docs/reference/standard-sql/]] 30 | */ 31 | private case object BigQueryDialect extends JdbcDialect { 32 | 33 | override def canHandle(url: String): Boolean = false 34 | 35 | override def quoteIdentifier(colName: String): String = s"`$colName`" 36 | 37 | override def getTableExistsQuery(table: String): String = s"SELECT 1 FROM $table LIMIT 1" 38 | 39 | override def getSchemaQuery(table: String): String = s"SELECT * FROM $table LIMIT 1" 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/sql/BigQuerySqlGeneration.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.sql 23 | 24 | import com.miraisolutions.spark.bigquery.BigQueryTableReference 25 | import org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD 26 | import org.apache.spark.sql.sources.Filter 27 | 28 | 29 | /** 30 | * Google BigQuery SQL Generation 31 | * 32 | * @param table BigQuery table reference 33 | * @see [[https://cloud.google.com/bigquery/docs/reference/standard-sql/]] 34 | */ 35 | private[bigquery] case class BigQuerySqlGeneration(table: BigQueryTableReference) { 36 | 37 | // Generates column list for SELECT statement 38 | private def getColumnList(columns: Array[String]): String = { 39 | if(columns.isEmpty) { 40 | "1" 41 | } else { 42 | columns.map(BigQueryDialect.quoteIdentifier).mkString(",") 43 | } 44 | } 45 | 46 | // Generates the filter expressions in a WHERE clause 47 | private def getWhereClauseFilters(filters: Array[Filter]): String = { 48 | filters 49 | .flatMap(JDBCRDD.compileFilter(_, BigQueryDialect)) 50 | .map(p => s"($p)") 51 | .mkString(" AND ") 52 | } 53 | 54 | def getQuery(columns: Array[String]): String = { 55 | s"SELECT ${getColumnList(columns)} FROM $table" 56 | } 57 | 58 | def getQuery(columns: Array[String], filters: Array[Filter]): String = { 59 | val whereClause = if(filters.isEmpty) "" else " WHERE " + getWhereClauseFilters(filters) 60 | getQuery(columns) + whereClause 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/utils/DateTime.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.utils 23 | 24 | import java.util.TimeZone 25 | import java.sql.{Date, Timestamp} 26 | import java.time.{Instant, LocalDate, ZoneId} 27 | import java.time.format.DateTimeFormatter 28 | 29 | private[bigquery] object DateTime { 30 | 31 | private val UTC = ZoneId.of("UTC") 32 | private val DATE_FORMATTER = DateTimeFormatter.ISO_LOCAL_DATE.withZone(UTC) 33 | 34 | /** 35 | * Formats milliseconds since the epoch in the format 'yyyy-MM-dd'. 
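 * For example, `formatMillisSinceEpoch(0L)` returns "1970-01-01" and `formatMillisSinceEpoch(86400000L)`
 * returns "1970-01-02".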
36 | * @param millis Milliseconds since the epoch 37 | * @return Date string of the form 'yyyy-MM-dd' 38 | */ 39 | def formatMillisSinceEpoch(millis: Long): String = { 40 | val instant = Instant.ofEpochMilli(millis) 41 | DATE_FORMATTER.format(instant) 42 | } 43 | 44 | /** 45 | * Formats a [[java.sql.Date]] returned by Spark using the format 'yyyy-MM-dd'. 46 | * @param date Date value from Spark 47 | * @return Date string of the form 'yyyy-MM-dd' 48 | * @note Spark generally seems to be using local timezone 49 | * @see [[https://issues.apache.org/jira/browse/SPARK-18350]] 50 | * @see [[https://groups.google.com/a/lists.datastax.com/forum/#!topic/spark-connector-user/Uv9UoFjA9SU]] 51 | */ 52 | def formatSparkDate(date: Date): String = { 53 | formatMillisSinceEpoch(date.getTime + TimeZone.getDefault.getOffset(date.getTime)) 54 | } 55 | 56 | /** 57 | * Parses a string of the form 'yyyy-MM-dd' to a [[java.sql.Date]]. 58 | * @param s String of the form 'yyyy-MM-dd' 59 | * @return Date 60 | */ 61 | def parseDate(s: String): Date = { 62 | val localDate = LocalDate.parse(s, DATE_FORMATTER) 63 | new Date(localDate.atStartOfDay.atZone(UTC).toInstant.toEpochMilli) 64 | } 65 | 66 | /** 67 | * Creates a [[java.sql.Timestamp]] from microseconds since the epoch as returned by the BigQuery API. 68 | * @param m Microseconds since the epoch 69 | * @return Timestamp 70 | */ 71 | def epochMicrosToTimestamp(m: Long): Timestamp = { 72 | val ts = new Timestamp(0) 73 | val nanos = (m % 1000000L).toInt * 1000 74 | 75 | if(nanos < 0) { 76 | ts.setTime((m / 1000000L - 1L) * 1000L) 77 | ts.setNanos(nanos + 1000000000) 78 | } else { 79 | ts.setTime(m / 1000000L * 1000L) 80 | ts.setNanos(nanos) 81 | } 82 | 83 | ts 84 | } 85 | 86 | /** 87 | * Converts a [[java.sql.Timestamp]] to the number of (fractional) seconds since the epoch. 88 | * @param ts Timestamp 89 | * @return Seconds since the epoch 90 | */ 91 | def timestampToEpochSeconds(ts: Timestamp): Double = { 92 | ts.getTime / 1000L + ts.getNanos.toDouble / 1e9 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/utils/Files.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.utils 23 | 24 | import org.apache.hadoop.conf.Configuration 25 | import org.apache.hadoop.fs.{FileSystem, Path} 26 | 27 | /** 28 | * File utilities. 29 | */ 30 | private[bigquery] object Files { 31 | 32 | /** 33 | * Returns a Hadoop filesystem and path for the provided path. 34 | * @param path File or directory path 35 | * @param conf Hadoop configuration 36 | * @return Hadoop filesystem and path 37 | */ 38 | private def getFsAndPath(path: String, conf: Configuration): (FileSystem, Path) = { 39 | val p = new Path(path) 40 | val fs = FileSystem.get(p.toUri, conf) 41 | (fs, p) 42 | } 43 | 44 | /** 45 | * Deletes the specified path recursively. 46 | * @param path Path to delete 47 | * @param conf Hadoop configuration 48 | */ 49 | def delete(path: String, conf: Configuration): Unit = { 50 | val (fs, p) = getFsAndPath(path, conf) 51 | fs.delete(p, true) 52 | } 53 | 54 | /** 55 | * Registers the specified path for deletion when the underlying filesystem is being closed. 56 | * @param path Path to delete 57 | * @param conf Hadoop configuration 58 | */ 59 | def deleteOnExit(path: String, conf: Configuration): Unit = { 60 | val (fs, p) = getFsAndPath(path, conf) 61 | fs.deleteOnExit(p) 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/utils/SqlLogger.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.utils 23 | 24 | import org.slf4j.Logger 25 | 26 | /** 27 | * SQL logging wrapper for logging SQL queries 28 | * @param logger Slf4j logger to use for logging 29 | */ 30 | private[bigquery] case class SqlLogger(logger: Logger) { 31 | def logSqlQuery(sqlQuery: String): Unit = { 32 | logger.info("Executing SQL query: " + sqlQuery.replaceAllLiterally("\n", "")) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/utils/format/FormatConverter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.utils.format 23 | 24 | import org.apache.spark.sql.types.{StructField, StructType} 25 | import org.apache.spark.sql.DataFrame 26 | 27 | /** 28 | * Provides format conversion utility methods. 29 | */ 30 | object FormatConverter { 31 | 32 | /** 33 | * Transforms a data frame by applying a list of column converters. Converters are applied in the order specified. 34 | * Columns not matching any of the converter's domains remain unchanged. 
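 * A usage sketch (assumes `df` contains Parquet-style LIST/MAP columns; the converters are those defined in
 * this package):
 * {{{
 *   val converted = FormatConverter.transform(df, List(Parquet.parquetListToArray, Parquet.parquetMapToMap))
 * }}}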
35 | * @param df Input data frame 36 | * @param converters List of column converters to apply 37 | * @return Output data frame 38 | */ 39 | def transform(df: DataFrame, converters: List[ColumnConverter]): DataFrame = { 40 | val convert = converters.reduce(_ orElse _) 41 | val transformed = df.schema.fields.foldRight((df, List.empty[StructField])) { case (field, (aggDf, aggFields)) => 42 | if(convert.isDefinedAt(field)) { 43 | val (newField, converterFunction) = convert(field) 44 | val newDf = aggDf.withColumn(field.name, converterFunction(aggDf.col(field.name))) 45 | (newDf, newField :: aggFields) 46 | 47 | } else { 48 | // leave column unchanged 49 | (aggDf, field :: aggFields) 50 | } 51 | } 52 | val (newDf, newFields) = transformed 53 | df.sqlContext.createDataFrame(newDf.rdd, StructType(newFields)) 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/utils/format/Generic.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.utils.format 23 | 24 | import com.miraisolutions.spark.bigquery.BigQuerySchemaConverter 25 | import org.apache.spark.sql.Row 26 | import org.apache.spark.sql.expressions.UserDefinedFunction 27 | import org.apache.spark.sql.functions.udf 28 | import org.apache.spark.sql.types.{ArrayType, MapType, StructField, StructType} 29 | import scala.collection.mutable.WrappedArray 30 | 31 | /** 32 | * Generic format converters. 33 | */ 34 | object Generic { 35 | import BigQuerySchemaConverter._ 36 | 37 | /** 38 | * Creates a Spark UDF to convert an array of key-value structs to a Spark map column. 39 | * @param mapType Resulting map type 40 | * @return Spark UDF 41 | */ 42 | private def keyValueRecordToMapUdf(mapType: MapType): UserDefinedFunction = udf((kvMap: WrappedArray[Row]) => { 43 | val kvPairs = kvMap map { kvRecord => 44 | val key = kvRecord.getAs[Any](KEY_FIELD_NAME) 45 | val value = kvRecord.getAs[Any](VALUE_FIELD_NAME) 46 | (key, value) 47 | } 48 | kvPairs.toMap 49 | }, mapType) 50 | 51 | /** 52 | * Transforms a BigQuery repeated record of key-value fields to a Spark map column. 
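 * The converter matches columns typed as a non-nullable array of key/value structs (the field names are
 * those defined in `BigQuerySchemaConverter`) and rewrites them to a Spark map column of the corresponding
 * key and value types.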
53 | */ 54 | val keyValueRecordToMap: ColumnConverter = { 55 | case StructField( 56 | name, 57 | ArrayType( 58 | StructType( 59 | Array( 60 | StructField(KEY_FIELD_NAME, keyType, false, _), // key field 61 | StructField(VALUE_FIELD_NAME, valueType, valueNullable, _) // value field 62 | ) 63 | ), 64 | false 65 | ), 66 | nullable, 67 | meta 68 | ) => 69 | val mapType = MapType(keyType, valueType, valueNullable) 70 | val newField = StructField(name, mapType, nullable, meta) 71 | (newField, keyValueRecordToMapUdf(mapType)(_)) 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/utils/format/Parquet.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.utils.format 23 | 24 | import org.apache.spark.sql.Row 25 | import org.apache.spark.sql.expressions.UserDefinedFunction 26 | import org.apache.spark.sql.functions.udf 27 | import org.apache.spark.sql.types.{ArrayType, MapType, StructField, StructType} 28 | 29 | /** 30 | * Apache Parquet format converters. 31 | * 32 | * @see [[https://github.com/apache/parquet-format/blob/master/LogicalTypes.md]] 33 | */ 34 | object Parquet { 35 | 36 | private val PARQUET_LIST_LIST_FIELD_NAME = "list" 37 | private val PARQUET_LIST_ELEMENT_FIELD_NAME = "element" 38 | private val PARQUET_MAP_KEYVALUE_FIELD_NAME = "key_value" 39 | private val PARQUET_MAP_KEY_FIELD_NAME = "key" 40 | private val PARQUET_MAP_VALUE_FIELD_NAME = "value" 41 | 42 | /** 43 | * Creates a Spark UDF to convert a Parquet-LIST-structured column to a Spark array column. 44 | * @param arrayType Resulting array type 45 | * @return Spark UDF 46 | */ 47 | private def parquetListToArrayUdf(arrayType: ArrayType): UserDefinedFunction = udf((row: Row) => { 48 | row.getAs[Seq[Row]](PARQUET_LIST_LIST_FIELD_NAME).map(_.getAs[Any](PARQUET_LIST_ELEMENT_FIELD_NAME)) 49 | }, arrayType) 50 | 51 | /** 52 | * Transforms a Parquet-LIST-structured column to a Spark array column. 
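 * The converter matches struct columns containing a single repeated `list` field whose entries wrap an
 * `element` field, the shape produced by Parquet's LIST encoding, and flattens them to a plain Spark array
 * column.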
53 | */ 54 | val parquetListToArray: ColumnConverter = { 55 | case StructField( 56 | name, 57 | StructType( 58 | Array( 59 | StructField( // Parquet: repeated group list 60 | PARQUET_LIST_LIST_FIELD_NAME, 61 | ArrayType( 62 | StructType( 63 | Array( 64 | // Parquet: element field 65 | StructField(PARQUET_LIST_ELEMENT_FIELD_NAME, elementType, elementNullable, _) 66 | ) 67 | ), 68 | false 69 | ), 70 | false, // repeated fields are not nullable 71 | _ 72 | ) 73 | ) 74 | ), 75 | nullable, 76 | meta 77 | ) => 78 | val arrayType = ArrayType(elementType, elementNullable) 79 | val newField = StructField(name, arrayType, nullable, meta) 80 | (newField, parquetListToArrayUdf(arrayType)(_)) 81 | } 82 | 83 | /** 84 | * Creates a Spark UDF to convert a Parquet-MAP-structured column to a Spark map column. 85 | * @param mapType Resulting map type 86 | * @return Spark UDF 87 | */ 88 | private def parquetMapToMapUdf(mapType: MapType): UserDefinedFunction = udf((row: Row) => { 89 | val kvPairs = row.getAs[Seq[Row]](PARQUET_MAP_KEYVALUE_FIELD_NAME) map { kv => 90 | val key = kv.getAs[Any](PARQUET_MAP_KEY_FIELD_NAME) 91 | val value = kv.getAs[Any](PARQUET_MAP_VALUE_FIELD_NAME) 92 | (key, value) 93 | } 94 | kvPairs.toMap 95 | }, mapType) 96 | 97 | /** 98 | * Transforms a Parquet-MAP-structured column to a Spark map column. 99 | */ 100 | val parquetMapToMap: ColumnConverter = { 101 | case StructField( 102 | name, 103 | StructType( 104 | Array( 105 | StructField( 106 | PARQUET_MAP_KEYVALUE_FIELD_NAME, 107 | ArrayType( 108 | StructType( 109 | Array( 110 | StructField(PARQUET_MAP_KEY_FIELD_NAME, keyType, false, _), 111 | StructField(PARQUET_MAP_VALUE_FIELD_NAME, valueType, valueNullable, _) 112 | ) 113 | ), 114 | false 115 | ), 116 | false, 117 | _ 118 | ) 119 | ) 120 | ), 121 | nullable, 122 | meta 123 | ) => 124 | val mapType = MapType(keyType, valueType, valueNullable) 125 | val newField = StructField(name, mapType, nullable, meta) 126 | (newField, parquetMapToMapUdf(mapType)(_)) 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/utils/format/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.utils 23 | 24 | import org.apache.spark.sql.Column 25 | import org.apache.spark.sql.types.StructField 26 | 27 | package object format { 28 | 29 | /** 30 | * Generic converter partial function for converting data frame columns. 31 | * 32 | * The result of a converter is the new field definition/format and a column-to-column function to transform an 33 | * existing column into the specified format. 34 | */ 35 | type ColumnConverter = PartialFunction[StructField, (StructField, Column => Column)] 36 | 37 | } 38 | -------------------------------------------------------------------------------- /version.sbt: -------------------------------------------------------------------------------- 1 | version in ThisBuild := "0.1.1" 2 | --------------------------------------------------------------------------------
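
Usage note (illustrative, not part of the repository): the sketch below shows how the converters defined above might be combined through the `transform` method to normalize a DataFrame loaded from a Parquet extract. The object name `FormatConverter` is inferred from the file name `FormatConverter.scala`, and `ExtractNormalization` is a hypothetical wrapper; the converter values themselves (`parquetListToArray`, `parquetMapToMap`, `keyValueRecordToMap`) are the ones defined in the sources above.

import com.miraisolutions.spark.bigquery.utils.format.{FormatConverter, Generic, Parquet}
import org.apache.spark.sql.DataFrame

object ExtractNormalization {
  // Sketch: apply the column converters defined above so that Parquet LIST/MAP
  // groups and BigQuery repeated key/value records become native Spark array
  // and map columns. Assumes `transform` is a member of a `FormatConverter`
  // object (inferred from the file name, not confirmed by this excerpt).
  def normalize(df: DataFrame): DataFrame =
    FormatConverter.transform(
      df,
      List(
        Parquet.parquetListToArray,   // Parquet "list"/"element" group -> ArrayType
        Parquet.parquetMapToMap,      // Parquet "key_value" group      -> MapType
        Generic.keyValueRecordToMap   // repeated key/value RECORD      -> MapType
      )
    )
}

Since `transform` reduces the converter list with `orElse` and folds over the schema once, the first converter whose pattern matches a field is applied to that column and all non-matching columns are left unchanged.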
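
The `ColumnConverter` alias in the package object also allows plugging in project-specific conversions. As a purely hypothetical illustration (the field name `ingestion_time` and this converter are not part of the repository), a converter that reinterprets an epoch-microseconds `LongType` column as a Spark timestamp could look like this:

import com.miraisolutions.spark.bigquery.utils.format.ColumnConverter
import org.apache.spark.sql.Column
import org.apache.spark.sql.types.{LongType, StructField, TimestampType}

object CustomConverters {
  // Hypothetical converter: match a LongType column named "ingestion_time"
  // holding epoch microseconds and expose it as a TimestampType column.
  // Numeric-to-timestamp casts in Spark interpret the value as seconds,
  // hence the division by 1,000,000.
  val epochMicrosToTimestamp: ColumnConverter = {
    case StructField("ingestion_time", LongType, nullable, meta) =>
      val newField = StructField("ingestion_time", TimestampType, nullable, meta)
      (newField, (col: Column) => (col / 1000000L).cast(TimestampType))
  }
}

Such a converter could simply be appended to the list passed to `transform`.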