├── .gitignore ├── LICENSE ├── README.md ├── build.sbt ├── project ├── build.properties ├── build.sbt └── plugins.sbt ├── src ├── it │ ├── resources │ │ └── log4j.properties │ └── scala │ │ └── com │ │ └── miraisolutions │ │ └── spark │ │ └── bigquery │ │ ├── DirectWriteAndReadSpec.scala │ │ ├── ParquetWriteDirectReadSpec.scala │ │ └── test │ │ ├── BigQueryConfiguration.scala │ │ ├── BigQueryTesting.scala │ │ ├── data │ │ ├── DataFrameGenerator.scala │ │ └── TestData.scala │ │ └── package.scala └── main │ ├── resources │ └── META-INF │ │ └── services │ │ ├── org.apache.hadoop.fs.FileSystem │ │ └── org.apache.spark.sql.sources.DataSourceRegister │ └── scala │ └── com │ └── miraisolutions │ └── spark │ └── bigquery │ ├── BigQueryPartition.scala │ ├── BigQueryRowRDD.scala │ ├── BigQuerySchemaConverter.scala │ ├── BigQueryTableReference.scala │ ├── BigQueryTableRelation.scala │ ├── DefaultSource.scala │ ├── FileFormat.scala │ ├── client │ ├── BigQueryClient.scala │ ├── BigQueryTableReader.scala │ └── package.scala │ ├── config │ ├── BigQueryConfig.scala │ └── package.scala │ ├── examples │ └── Shakespeare.scala │ ├── exception │ ├── IOException.scala │ ├── MissingParameterException.scala │ ├── ParseException.scala │ └── UnsupportedFormatException.scala │ ├── sql │ ├── BigQueryDialect.scala │ └── BigQuerySqlGeneration.scala │ └── utils │ ├── DateTime.scala │ ├── Files.scala │ ├── SqlLogger.scala │ └── format │ ├── FormatConverter.scala │ ├── Generic.scala │ ├── Parquet.scala │ └── package.scala └── version.sbt /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | lib_managed 3 | project/project 4 | project/target 5 | target 6 | derby.log 7 | logs 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Mirai Solutions GmbH 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # [DEPRECATED] spark-bigquery: A Google BigQuery Data Source for Apache Spark 2 | 3 | ## Deprecation Notice 4 | 5 | This project has been deprecated in favor of the official 6 | [Apache Spark SQL connector for Google BigQuery](https://github.com/GoogleCloudDataproc/spark-bigquery-connector). 
7 | 8 | 9 | ## Overview 10 | 11 | This project provides a [Google BigQuery](https://cloud.google.com/bigquery/) data source (`com.miraisolutions.spark.bigquery.DefaultSource`) to [Apache Spark](https://spark.apache.org/) using the new [Google Cloud client libraries](https://cloud.google.com/bigquery/docs/reference/libraries) for the Google BigQuery API. It supports "direct" import/export where records are directly streamed from/to BigQuery. In addition, data may be imported/exported via intermediate data extracts on [Google Cloud Storage](https://cloud.google.com/storage/) (GCS). Note that when using "direct" (streaming) export, data may not be immediately available for further querying/processing in BigQuery. It may take several minutes for streamed records to become "available". See the following resources for more information: 12 | 13 | * https://cloud.google.com/bigquery/streaming-data-into-bigquery 14 | * https://cloud.google.com/blog/big-data/2017/06/life-of-a-bigquery-streaming-insert 15 | 16 | The following import/export combinations are currently supported: 17 | 18 | | | Direct | Parquet | Avro | ORC | JSON | CSV | 19 | | -------------------------------------- | ------------------ | ------------------ | ------------------ | ------------------ | ------------------ | ------------------ | 20 | | Import to Spark (export from BigQuery) | :heavy_check_mark: | :x: | :heavy_check_mark: | :x: | :heavy_check_mark: | :heavy_check_mark: | 21 | | Export from Spark (import to BigQuery) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :x: | :x: | 22 | 23 | 24 | More information on the various supported formats can be found at: 25 | 26 | * Parquet: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-parquet 27 | * Avro: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-avro 28 | * ORC: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-orc 29 | * JSON: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-json 30 | * CSV: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-csv 31 | 32 | CSV and JSON are not recommended as data exchange formats between Apache Spark and BigQuery due to their lack of type safety. Better options are direct import/export, Parquet, Avro and ORC. 33 | 34 | 35 | This data source is used in the [sparkbq](https://github.com/miraisolutions/sparkbq) R package. 36 | 37 | ## Building 38 | 39 | Due to dependency version mismatches between Apache Spark and Google client libraries (e.g. Google Guava) this project uses [`sbt-assembly`](https://github.com/sbt/sbt-assembly) to build a fat JAR using [shading](https://github.com/sbt/sbt-assembly#shading) to relocate relevant Google classes. 
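As a condensed sketch of what this shading involves (the complete set of rules lives in `build.sbt`, reproduced further below):

```scala
// build.sbt (excerpt, sketch only) – relocate Google classes so they don't clash
// with the versions already deployed on Google Cloud Dataproc
assemblyShadeRules in assembly := Seq(
  ShadeRule.rename("com.google.**" -> "shadegoogle.@1").inAll
)
```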
40 | 41 | ## Version Information 42 | 43 | The following table provides an overview of the supported versions of Apache Spark, Scala and [Google Dataproc](https://cloud.google.com/dataproc/docs/concepts/versioning/dataproc-versions): 44 | 45 | | spark-bigquery | Spark | Scala | Google Dataproc | 46 | | :------------: | --------------- | ----- | --------------- | 47 | | 0.1.x | 2.2.x and 2.3.x | 2.11 | 1.2.x and 1.3.x | 48 | 49 | ## Example 50 | 51 | The provided Google BigQuery data source (`com.miraisolutions.spark.bigquery.DefaultSource`) can be used as follows: 52 | 53 | ```scala 54 | import org.apache.spark.sql.{SaveMode, SparkSession} 55 | import com.miraisolutions.spark.bigquery.config._ 56 | 57 | // Initialize Spark session 58 | val spark = SparkSession 59 | .builder 60 | .appName("Google BigQuery Shakespeare") 61 | .getOrCreate 62 | 63 | import spark.implicits._ 64 | 65 | // Define BigQuery options 66 | val config = BigQueryConfig( 67 | project = "", // Google BigQuery billing project ID 68 | location = "", // Google BigQuery dataset location 69 | stagingDataset = StagingDatasetConfig( 70 | gcsBucket = "" // Google Cloud Storage bucket for staging files 71 | ), 72 | // Google Cloud service account key file - works only in local cluster mode 73 | serviceAccountKeyFile = if(args.length > 3) Some(args(3)) else None 74 | ) 75 | 76 | // Read public shakespeare data table using direct import (streaming) 77 | val shakespeare = spark.read 78 | .bigquery(config) 79 | .option("table", "bigquery-public-data.samples.shakespeare") 80 | .option("type", "direct") 81 | .load() 82 | 83 | val hamlet = shakespeare.filter($"corpus".like("hamlet")) 84 | hamlet.show(100) 85 | 86 | shakespeare.createOrReplaceTempView("shakespeare") 87 | val macbeth = spark.sql("SELECT * FROM shakespeare WHERE corpus = 'macbeth'").persist() 88 | macbeth.show(100) 89 | 90 | // Write filtered data table via a Parquet export on GCS 91 | macbeth.write 92 | .bigquery(config) 93 | .option("table", ".samples.macbeth") 94 | .option("type", "parquet") 95 | .mode(SaveMode.Overwrite) 96 | .save() 97 | ``` 98 | 99 | You can find a complete example at `com.miraisolutions.spark.bigquery.examples.Shakespeare`. 100 | 101 | To run this example, first compile and assemble the fat JAR using `sbt assembly`. Then run: 102 | 103 | **Local Spark Cluster** 104 | 105 | `spark-submit --class com.miraisolutions.spark.bigquery.examples.Shakespeare --master local[*] target/scala-2.11/spark-bigquery-assembly-.jar ` 106 | 107 | **[Google Cloud Dataproc](https://cloud.google.com/dataproc/)** 108 | 109 | Log in to the service account (it needs permissions to access all required resources): 110 | 111 | `gcloud auth activate-service-account --key-file=[KEY-FILE]` 112 | 113 | Run the Spark job: 114 | 115 | `gcloud dataproc jobs submit spark --cluster --class com.miraisolutions.spark.bigquery.examples.Shakespeare --jars target/scala-2.11/spark-bigquery-assembly-.jar -- ` 116 | 117 | where `` are: 118 | 1. Google BigQuery billing project ID 119 | 2. Google BigQuery dataset location (EU, US) 120 | 3. Google Cloud Storage (GCS) bucket where staging files will be located 121 | 122 | ## Using the spark-bigquery Spark package 123 | 124 | spark-bigquery is available as a Spark package from https://spark-packages.org/package/miraisolutions/spark-bigquery and as such via the Maven coordinates `miraisolutions:spark-bigquery:`. You can simply specify the appropriate Maven coordinates with the `--packages` option when using the Spark shell or when using `spark-submit`.
125 | 126 | ### Using the Spark Shell 127 | 128 | `spark-shell --master local[*] --packages miraisolutions:spark-bigquery:` 129 | 130 | ```scala 131 | import com.miraisolutions.spark.bigquery.config._ 132 | 133 | // Define BigQuery options 134 | val config = BigQueryConfig( 135 | project = "", 136 | location = "US", 137 | stagingDataset = StagingDatasetConfig( 138 | gcsBucket = "" 139 | ), 140 | serviceAccountKeyFile = Some("") 141 | ) 142 | 143 | val shakespeare = spark.read 144 | .bigquery(config) 145 | .option("table", "bigquery-public-data.samples.shakespeare") 146 | .option("type", "direct") 147 | .load() 148 | 149 | shakespeare.show() 150 | ``` 151 | 152 | ### Using PySpark 153 | 154 | `pyspark --master local[*] --packages miraisolutions:spark-bigquery:` 155 | 156 | ```python 157 | shakespeare = spark.read \ 158 | .format("bigquery") \ 159 | .option("bq.project", "") \ 160 | .option("bq.location", "US") \ 161 | .option("bq.service_account_key_file", "") \ 162 | .option("bq.staging_dataset.gcs_bucket", "") \ 163 | .option("table", "bigquery-public-data.samples.shakespeare") \ 164 | .option("type", "direct") \ 165 | .load() 166 | 167 | shakespeare.show() 168 | ``` 169 | 170 | ### Using `spark-submit` 171 | 172 | Assume the following Spark application, which has been compiled into a JAR named `Shakespeare.jar` (you may want to use something like [Holden Karau's Giter8 Spark project template](https://github.com/holdenk/sparkProjectTemplate.g8) for this): 173 | 174 | ```scala 175 | package com.example 176 | 177 | import org.apache.spark.sql.SparkSession 178 | import com.miraisolutions.spark.bigquery.config._ 179 | 180 | object Shakespeare { 181 | 182 | def main(args: Array[String]): Unit = { 183 | 184 | // Initialize Spark session 185 | val spark = SparkSession 186 | .builder 187 | .appName("Google BigQuery Shakespeare") 188 | .getOrCreate 189 | 190 | // Define BigQuery options 191 | val config = BigQueryConfig( 192 | project = "", 193 | location = "US", 194 | stagingDataset = StagingDatasetConfig( 195 | gcsBucket = "" 196 | ), 197 | serviceAccountKeyFile = Some("") 198 | ) 199 | 200 | // Read public shakespeare data table using direct import (streaming) 201 | val shakespeare = spark.read 202 | .bigquery(config) 203 | .option("table", "bigquery-public-data.samples.shakespeare") 204 | .option("type", "direct") 205 | .load() 206 | 207 | shakespeare.show() 208 | 209 | } 210 | 211 | } 212 | ``` 213 | 214 | You can run this application using `spark-submit` in the following way: 215 | 216 | `spark-submit --class com.example.Shakespeare --master local[*] --packages miraisolutions:spark-bigquery: Shakespeare.jar` 217 | 218 | 219 | ### Using `gcloud dataproc jobs submit` 220 | 221 | Log in to the service account (it needs permissions to access all required resources): 222 | 223 | `gcloud auth activate-service-account --key-file=[KEY-FILE]` 224 | 225 | Similar to the `spark-submit` example above, the Spark application can be submitted to Google Dataproc using: 226 | 227 | `gcloud dataproc jobs submit spark --cluster --class com.example.Shakespeare --jars Shakespeare.jar --properties "spark.jars.packages=miraisolutions:spark-bigquery:"` 228 | 229 | When running on Dataproc, you may choose not to specify a service account key file and use application default credentials instead. 230 | 231 | 232 | ## Configuration 233 | 234 | The three main Spark read/write options are: 235 | 236 | * `table`: The BigQuery table to read/write. To be specified in the form `[projectId].[datasetId].[tableId]`.
One of `table` or `sqlQuery` must be specified. 237 | * `sqlQuery`: A SQL query in Google BigQuery standard SQL dialect (SQL-2011). One of `table` or `sqlQuery` must be specified. 238 | * `type` (optional): The BigQuery import/export type to use. Options include "direct", "parquet", "avro", "orc", "json" and "csv". Defaults to "direct". See the table at the top for supported type and import/export combinations. 239 | 240 | 241 | In addition, there are a number of BigQuery configuration options that can be specified in two ways: the traditional way using Spark's read/write options (e.g. `spark.read.option("bq.project", "")`) and using the `bigquery` extension method (`spark.read.bigquery(config)`; see example above) which is usually more straightforward to use. If you prefer the traditional way or if you are using spark-bigquery in a non-Scala environment (e.g. PySpark), the configuration keys are as follows: 242 | 243 | * `bq.project` (required): Google BigQuery billing project id 244 | * `bq.location` (required): Geographic location where newly created datasets should reside. "EU" or "US". 245 | * `bq.service_account_key_file` (optional): Google Cloud service account key file to use for authentication with Google Cloud services. The use of service accounts is highly recommended. Specifically, the service account will be used to interact with BigQuery and Google Cloud Storage (GCS). If not specified, application default credentials will be used. 246 | * `bq.staging_dataset.name` (optional): Prefix of BigQuery staging dataset. A staging dataset is used to temporarily store the results of SQL queries. Defaults to "spark_staging". 247 | * `bq.staging_dataset.lifetime` (optional): Default staging dataset table lifetime in milliseconds. Tables are automatically deleted once the lifetime has been reached. Defaults to 86400000 ms (= 1 day). 248 | * `bq.staging_dataset.gcs_bucket` (required): Google Cloud Storage (GCS) bucket to use for storing temporary files. Temporary files are used when importing through BigQuery load jobs and exporting through BigQuery extraction jobs (i.e. when using data extracts such as Parquet, Avro, ORC, ...). The service account specified in `bq.service_account_key_file` needs to be given appropriate rights. 249 | * `bq.job.priority` (optional): BigQuery job priority when executing SQL queries. Options include "interactive" and "batch". Defaults to "interactive", i.e. the query is executed as soon as possible. 250 | * `bq.job.timeout` (optional): Timeout in milliseconds after which a file import/export job should be considered as failed. Defaults to 3600000 ms (= 1 h). 
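For example, the same read shown in the PySpark snippet above can be expressed in Scala through plain reader options, without the `bigquery` extension method (placeholder values in angle brackets are illustrative):

```scala
// Traditional reader options – equivalent to spark.read.bigquery(config);
// the <...> placeholders are illustrative and must be replaced with real values
val shakespeare = spark.read
  .format("bigquery")
  .option("bq.project", "<billing-project-id>")
  .option("bq.location", "US")
  .option("bq.staging_dataset.gcs_bucket", "<gcs-staging-bucket>")
  .option("bq.staging_dataset.name", "spark_staging")   // optional, default shown
  .option("bq.job.priority", "interactive")             // optional, default shown
  .option("table", "bigquery-public-data.samples.shakespeare")
  .option("type", "direct")
  .load()
```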
251 | 252 | 253 | See the following resources for more information: 254 | * [BigQuery pricing](https://cloud.google.com/bigquery/pricing) 255 | * [BigQuery dataset locations](https://cloud.google.com/bigquery/docs/dataset-locations) 256 | * [General authentication](https://cloud.google.com/docs/authentication/) 257 | * [BigQuery authentication](https://cloud.google.com/bigquery/docs/authentication/) 258 | * [Cloud Storage authentication](https://cloud.google.com/storage/docs/authentication/) 259 | 260 | 261 | ## Schema Conversion 262 | 263 | ### Using Direct Mode 264 | 265 | In **direct** (streaming) mode, spark-bigquery performs the following data type conversions between supported Spark data types and BigQuery data types: 266 | 267 | **Importing to Spark / Exporting from BigQuery** 268 | 269 | | BigQuery Source Data Type | Spark Target Data Type | 270 | | ------------------------- | ---------------------- | 271 | | BOOL | BooleanType | 272 | | INT64 | LongType | 273 | | FLOAT64 | DoubleType | 274 | | NUMERIC | DecimalType(38, 9) | 275 | | STRING | StringType | 276 | | BYTES | BinaryType | 277 | | STRUCT | StructType | 278 | | TIMESTAMP | TimestampType | 279 | | DATE | DateType | 280 | | TIME | StringType | 281 | | DATETIME | StringType | 282 | 283 | BigQuery repeated fields are mapped to the corresponding Spark ArrayType. Also, BigQuery's nullable mode is used to determine the appropriate nullable property of the target Spark data type. 284 | 285 | **Exporting from Spark / Importing to BigQuery** 286 | 287 | | Spark Source Data Type | BigQuery Target Data Type | 288 | | ---------------------- | ------------------------- | 289 | | BooleanType | BOOL | 290 | | ByteType | INT64 | 291 | | ShortType | INT64 | 292 | | IntegerType | INT64 | 293 | | LongType | INT64 | 294 | | FloatType | FLOAT64 | 295 | | DoubleType | FLOAT64 | 296 | | <= DecimalType(38, 9) | NUMERIC | 297 | | > DecimalType(38, 9) | STRING | 298 | | StringType | STRING | 299 | | BinaryType | BYTES | 300 | | StructType | STRUCT | 301 | | TimestampType | TIMESTAMP | 302 | | DateType | DATE | 303 | | MapType | Repeated key-value STRUCT | 304 | | ArrayType | Repeated field | 305 | | Other | STRING | 306 | 307 | For more information on supported BigQuery data types, see https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types. 308 | 309 | ### Using GCS Data Extracts 310 | 311 | When using intermediate GCS data extracts (Parquet, Avro, ORC, ...), the result depends on the data format being used. Consult the data format's specification for information on supported data types. Furthermore, see the following resources on type conversions supported by BigQuery: 312 | 313 | * Parquet: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-parquet 314 | * Avro: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-avro 315 | * ORC: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-orc 316 | * JSON: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-json 317 | * CSV: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-csv 318 | 319 | ## Authentication 320 | 321 | Providing a service account key file is only possible in local cluster mode, since an application deployed on Google Cloud will try to resolve the file path on HDFS. 322 | Keeping key files stored as cloud resources is not good practice anyway.
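A minimal sketch of the local-cluster-mode approach, mirroring the Shakespeare example at the top of this README (the argument position and the angle-bracket placeholders are illustrative):

```scala
import com.miraisolutions.spark.bigquery.config._

// Local cluster mode: take the service account key file path from a trailing job argument
// (argument position and placeholder values are illustrative)
val keyFile: Option[String] = if (args.length > 3) Some(args(3)) else None

val config = BigQueryConfig(
  project = "<billing-project-id>",
  location = "EU",
  stagingDataset = StagingDatasetConfig(gcsBucket = "<gcs-staging-bucket>"),
  serviceAccountKeyFile = keyFile
)
```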
323 | 324 | If you need to run via gcloud, you can authenticate with a service account JSON file using: 325 | 326 | `gcloud auth activate-service-account --key-file=[KEY-FILE]` 327 | 328 | When using local cluster mode, it is possible to provide the key file as an argument to the Spark job, as in the example above. 329 | 330 | Information on how to generate service account credentials can be found at 331 | https://cloud.google.com/storage/docs/authentication#service_accounts. 332 | The service account key file can either be passed directly via `BigQueryConfig` or 333 | it can be passed through an environment variable: 334 | `export GOOGLE_APPLICATION_CREDENTIALS=/path/to/your/service_account_keyfile.json` 335 | (see https://cloud.google.com/docs/authentication/getting-started for more information). 336 | When running on Google Cloud, e.g. Google Cloud Dataproc, 337 | [application default credentials](https://developers.google.com/identity/protocols/application-default-credentials) 338 | may be used, in which case it is not necessary to specify a service account key file. 339 | 340 | ## License 341 | 342 | MIT License 343 | 344 | Copyright (c) 2018 Mirai Solutions GmbH 345 | 346 | Permission is hereby granted, free of charge, to any person obtaining a copy 347 | of this software and associated documentation files (the "Software"), to deal 348 | in the Software without restriction, including without limitation the rights 349 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 350 | copies of the Software, and to permit persons to whom the Software is 351 | furnished to do so, subject to the following conditions: 352 | 353 | The above copyright notice and this permission notice shall be included in all 354 | copies or substantial portions of the Software. 355 | 356 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 357 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 358 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 359 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 360 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 361 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 362 | SOFTWARE.
363 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | import net.ruippeixotog.scalascraper.browser.JsoupBrowser 2 | import net.ruippeixotog.scalascraper.dsl.DSL._ 3 | import net.ruippeixotog.scalascraper.dsl.DSL.Extract._ 4 | import com.typesafe.sbt.license.{DepModuleInfo, LicenseInfo} 5 | import ReleaseTransformations._ 6 | import sbtrelease.{Version, versionFormatError} 7 | 8 | import scala.xml.{Elem, Node => XmlNode, NodeSeq => XmlNodeSeq} 9 | import scala.xml.transform._ 10 | 11 | // Apache Spark version setting 12 | val sparkVersion = settingKey[String]("The version of Spark to use.") 13 | 14 | // Custom task for creating a Spark package release artifact 15 | val sparkPackage = taskKey[File]("Creates a Spark package release artifact.") 16 | 17 | // Setting Maven properties as needed by gcs-connector 18 | val mavenProps = settingKey[Unit]("Setting Maven properties") 19 | 20 | lazy val commonSettings = Seq( 21 | // Name must match github repository name 22 | name := "spark-bigquery", 23 | organization := "com.miraisolutions", 24 | organizationName := "Mirai Solutions GmbH", 25 | description := "A Google BigQuery Data Source for Apache Spark", 26 | startYear := Some(2018), 27 | licenses += ("MIT", new URL("https://opensource.org/licenses/MIT")), 28 | sparkVersion := "2.4.5", 29 | scalaVersion := "2.11.12", 30 | crossScalaVersions := Seq("2.11.12"), 31 | scalacOptions ++= Seq( 32 | "-target:jvm-1.8", 33 | "-deprecation", 34 | "-feature", 35 | "-unchecked" 36 | ) 37 | ) 38 | 39 | // Dependency exclusions 40 | lazy val exclusions = Seq( 41 | // Clash with Spark 42 | ExclusionRule("com.fasterxml.jackson.core", "jackson-core"), 43 | ExclusionRule("commons-logging", "commons-logging"), 44 | ExclusionRule("commons-lang", "commons-lang"), 45 | // Not required 46 | ExclusionRule("com.google.auto.value", "auto-value"), 47 | ExclusionRule("com.google.auto.value", "auto-value-annotations") 48 | ) 49 | 50 | // Spark provided dependencies 51 | lazy val sparkDependencies = Def.setting(Seq( 52 | "org.apache.spark" %% "spark-core" % sparkVersion.value % "provided", 53 | "org.apache.spark" %% "spark-sql" % sparkVersion.value % "provided", 54 | "org.apache.spark" %% "spark-mllib" % sparkVersion.value % "provided" 55 | )) 56 | 57 | // Dependencies which need to be shaded to run on Google Cloud Dataproc 58 | lazy val dependenciesToShade = Seq( 59 | "com.google.cloud" % "google-cloud-bigquery" % "1.108.1" excludeAll(exclusions: _*), 60 | "com.google.cloud.bigdataoss" % "gcs-connector" % "1.9.3-hadoop2" excludeAll(exclusions: _*), 61 | "com.google.http-client" % "google-http-client-apache" % "2.0.0" excludeAll(exclusions: _*) 62 | ) 63 | 64 | // Dependencies which don't need any shading 65 | lazy val nonShadedDependencies = Seq( 66 | "com.databricks" %% "spark-avro" % "4.0.0" 67 | ) 68 | 69 | // Test dependencies 70 | lazy val testDependencies = Def.setting(Seq( 71 | "org.scalatest" %% "scalatest" % "3.0.5" % "it,test", 72 | "com.holdenkarau" %% "spark-testing-base" % s"${sparkVersion.value}_0.14.0" % "it,test", 73 | "org.apache.spark" %% "spark-hive" % sparkVersion.value % "it,test" // required by spark-testing-base 74 | )) 75 | 76 | lazy val browser = JsoupBrowser() 77 | 78 | def existsUrl(url: String): Boolean = { 79 | import java.net.{URL, HttpURLConnection} 80 | (new URL(url)).openConnection().asInstanceOf[HttpURLConnection].getResponseCode == 200 81 | } 82 | 83 | 84 | lazy val 
root = (project in file(".")) 85 | .enablePlugins(AssemblyPlugin, AutomateHeaderPlugin) 86 | .configs(IntegrationTest) 87 | .settings(commonSettings: _*) 88 | .settings( 89 | libraryDependencies := dependenciesToShade ++ sparkDependencies.value ++ 90 | nonShadedDependencies.map(_ % "provided") ++ testDependencies.value, 91 | skip in publish := true, 92 | mavenProps := { 93 | // Required by gcs-connector 94 | sys.props("hadoop.identifier") = "hadoop2" 95 | () 96 | }, 97 | 98 | Defaults.itSettings, 99 | IntegrationTest / fork := true, 100 | IntegrationTest / javaOptions ++= Seq( 101 | "-Xmx2048m", 102 | "-Xms512m", 103 | "-XX:+CMSClassUnloadingEnabled" 104 | ), 105 | IntegrationTest / logBuffered := false, 106 | IntegrationTest / testOptions += Tests.Argument("-oF"), 107 | automateHeaderSettings(IntegrationTest), 108 | 109 | // See https://spark-packages.org/artifact-help 110 | assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false), 111 | // Shade google dependencies due to version mismatches with dependencies deployed on Google Dataproc 112 | assemblyShadeRules in assembly := Seq( 113 | // ShadeRule.rename("com.google.cloud.hadoop.fs.**" -> "com.google.cloud.hadoop.fs.@1").inAll, 114 | ShadeRule.rename("com.google.**" -> "shadehttpclient.@1").inLibrary("com.google.http-client" % "google-http-client-apache" % "2.0.0"), 115 | ShadeRule.rename("com.google.**" -> "shadegoogle.@1").inAll 116 | ), 117 | assemblyMergeStrategy in assembly := { 118 | case PathList("META-INF", "services", "org.apache.hadoop.fs.FileSystem") => 119 | // Take our "shaded" version 120 | MergeStrategy.first 121 | case r => 122 | MergeStrategy.defaultMergeStrategy(r) 123 | }, 124 | 125 | // We release the distribution module only 126 | releaseProcess := Seq.empty, 127 | 128 | licenseConfigurations := Set("compile"), 129 | licenseOverrides := { 130 | case DepModuleInfo("org.slf4j", "slf4j-api", _) => 131 | LicenseInfo.MIT 132 | }, 133 | // Extends license report to include artifact description and link to JAR files 134 | licenseReportNotes := { 135 | case DepModuleInfo(group, id, version) => 136 | try { 137 | // Fetch artifact information 138 | val doc = browser.get(s"https://mvnrepository.com/artifact/$group/$id/$version") 139 | // Extract title 140 | val title = (doc >> text(".im-title")).replaceFirst("\\s»\\s.+$", "") 141 | // Extract description 142 | val description = doc >> text(".im-description") 143 | // Locate link to JAR file 144 | val mainJar = (doc >> elementList("a.vbtn")) 145 | .filter(element => element.innerHtml.startsWith("jar") || element.innerHtml.startsWith("bundle")) 146 | .map(_ >> attr("href")) 147 | .headOption 148 | .getOrElse(throw new NoSuchElementException("Can't locate JAR file")) 149 | 150 | // Derive link to sources JAR file 151 | val sourcesJar = mainJar.replaceFirst("\\.jar$", "-sources.jar") 152 | 153 | // Check if JAR file exists 154 | require(existsUrl(mainJar), "Invalid link to JAR file") 155 | // Check if sources JAR file exists 156 | require(existsUrl(sourcesJar), "Invalid link to sources JAR file") 157 | // https://en.wikipedia.org/wiki/C0_and_C1_control_codes (unit separator) 158 | title + '\u001F' + description + '\u001F' + mainJar + '\u001F' + sourcesJar 159 | } catch { 160 | case t: Throwable => 161 | "**** " + t.getMessage + " ****" 162 | } 163 | } 164 | ) 165 | 166 | // A "virtual" project with configurations to build Spark packages 167 | // See https://spark-packages.org/artifact-help 168 | lazy val distribution = (project in 
file("distribution")) 169 | .settings(commonSettings: _*) 170 | .settings( 171 | // Include the Scala binary version here 172 | version := s"${(root / version).value}-s_${scalaBinaryVersion.value}", 173 | libraryDependencies := nonShadedDependencies, 174 | // Spark packages need the github organization name as the group ID 175 | organization := "miraisolutions", 176 | crossPaths := false, 177 | pomExtra := { 178 | https://github.com/miraisolutions/spark-bigquery 179 | 180 | git@github.com:miraisolutions/spark-bigquery.git 181 | scm:git:git@github.com:miraisolutions/spark-bigquery.git 182 | 183 | 184 | 185 | martinstuder 186 | Martin Studer 187 | https://github.com/martinstuder 188 | 189 | 190 | lambiase 191 | Nicola Lambiase 192 | https://github.com/lambiase 193 | 194 | 195 | }, 196 | // Spark packages need the github repository name as the artifact ID 197 | pomPostProcess := { (node: XmlNode) => 198 | val rule = new RewriteRule { 199 | override def transform(n: XmlNode): XmlNodeSeq = n match { 200 | case n: Elem if n.label == "project" => 201 | val updatedChildren = n.child map { 202 | case c if c.label == "artifactId" => 203 | {normalizedName.value} 204 | 205 | case c => 206 | c 207 | } 208 | n.copy(child = updatedChildren) 209 | 210 | case n => 211 | n 212 | } 213 | } 214 | new RuleTransformer(rule)(node) 215 | }, 216 | Compile / packageBin := (root / Compile / assembly).value, 217 | sparkPackage := { 218 | val jar = (Compile / packageBin).value 219 | val pom = makePom.value 220 | val packageName = s"${normalizedName.value}-${version.value}" 221 | val zipFile = target.value / s"$packageName.zip" 222 | IO.delete(zipFile) 223 | IO.zip(Seq(jar -> s"$packageName.jar", pom -> s"$packageName.pom"), zipFile) 224 | println(s"\nSpark Package created at: $zipFile\n") 225 | zipFile 226 | }, 227 | 228 | releaseVersion := { _ => 229 | Version((root / version).value).map(_.withoutQualifier.string).getOrElse(versionFormatError) 230 | }, 231 | releaseVersionFile := (ThisBuild / baseDirectory).value / "version.sbt", 232 | releaseProcess := Seq[ReleaseStep]( 233 | checkSnapshotDependencies, 234 | inquireVersions, 235 | runClean, 236 | setReleaseVersion, 237 | commitReleaseVersion, 238 | tagRelease, 239 | releaseStepTask(sparkPackage), 240 | setNextVersion, 241 | commitNextVersion, 242 | pushChanges 243 | ) 244 | ) 245 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.1.6 2 | -------------------------------------------------------------------------------- /project/build.sbt: -------------------------------------------------------------------------------- 1 | resolvers += DefaultMavenRepository 2 | 3 | // Used to scrape mvnrepository artifact information 4 | libraryDependencies += "net.ruippeixotog" %% "scala-scraper" % "2.0.0" 5 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.10") 2 | 3 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.9.0") 4 | 5 | addSbtPlugin("com.typesafe.sbt" % "sbt-license-report" % "1.2.0") 6 | 7 | addSbtPlugin("de.heikoseeberger" % "sbt-header" % "5.0.0") 8 | 9 | addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.9") 10 | -------------------------------------------------------------------------------- 
/src/it/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Set everything to be logged to the console 19 | log4j.rootCategory=WARN, console 20 | log4j.appender.console=org.apache.log4j.ConsoleAppender 21 | log4j.appender.console.target=System.err 22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 23 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 24 | 25 | # Set the default spark-shell log level to WARN. When running the spark-shell, the 26 | # log level for this class is used to overwrite the root logger's log level, so that 27 | # the user can have different defaults for the shell and regular Spark apps. 28 | log4j.logger.org.apache.spark.repl.Main=WARN 29 | 30 | # Settings to quiet third party logs that are too verbose 31 | log4j.logger.org.spark-project.jetty=WARN 32 | log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR 33 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 34 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 35 | log4j.logger.org.apache.parquet=ERROR 36 | log4j.logger.parquet=ERROR 37 | log4j.logger.com.miraisolutions.spark.bigquery=INFO 38 | 39 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support 40 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL 41 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR 42 | -------------------------------------------------------------------------------- /src/it/scala/com/miraisolutions/spark/bigquery/DirectWriteAndReadSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery 23 | 24 | import com.miraisolutions.spark.bigquery.test._ 25 | import com.miraisolutions.spark.bigquery.test.data.{DataFrameGenerator, TestData} 26 | import org.apache.spark.sql.types.StructType 27 | import org.apache.spark.sql.{DataFrame, SaveMode} 28 | import org.scalactic.anyvals.PosZInt 29 | import org.scalatest.FunSuite 30 | import org.scalatest.prop.{Checkers, GeneratorDrivenPropertyChecks} 31 | 32 | /** 33 | * Test suite which tests reading and writing single Spark fields/columns to and from BigQuery. 34 | * 35 | * Data frames are written to BigQuery via "direct" export. Because attempts to query the new fields might require 36 | * a waiting time of up to 90 minutes (https://cloud.google.com/bigquery/streaming-data-into-bigquery), this test 37 | * suite only verifies the correctness of the generated BigQuery schema. 38 | * 39 | * BigQuery's streaming system caches table schemas for up to two minutes. That also seems to be the case when a 40 | * table gets deleted. For this reason, this test suite generates unique table names for each test case and then 41 | * manually deletes the table afterwards. 42 | * 43 | * @see [[https://cloud.google.com/bigquery/streaming-data-into-bigquery]] 44 | * @see [[https://stackoverflow.com/q/25279116]] 45 | * @see [[https://cloud.google.com/blog/big-data/2017/06/life-of-a-bigquery-streaming-insert]] 46 | */ 47 | class DirectWriteAndReadSpec extends FunSuite with BigQueryTesting with Checkers 48 | with GeneratorDrivenPropertyChecks { 49 | 50 | private val testTablePrefix = "direct_test" 51 | 52 | private class RandomDataFrame(schema: StructType, size: PosZInt) 53 | extends Checkers with GeneratorDrivenPropertyChecks { 54 | 55 | override implicit val generatorDrivenConfig = 56 | PropertyCheckConfiguration(minSuccessful = 1, minSize = size, sizeRange = size) 57 | 58 | implicit val arbitraryDataFrame = DataFrameGenerator.generate(sqlContext, schema) 59 | // Use unique table name to avoid BigQuery schema caching issues 60 | val tableName = testTablePrefix + "_" + System.currentTimeMillis().toString 61 | 62 | forAll { df: DataFrame => 63 | df.write 64 | .mode(SaveMode.Overwrite) 65 | .bigqueryTest(tableName) 66 | .save() 67 | 68 | val in = spark.read 69 | .bigqueryTest(tableName) 70 | .load() 71 | .persist() 72 | 73 | val tableReference = getTestDatasetTableReference(tableName) 74 | 75 | assert(df.aligned.schema, in.aligned.schema) 76 | val deleted = bigQueryClient.deleteTable(tableReference) 77 | assert(deleted, true) 78 | } 79 | } 80 | 81 | 82 | (TestData.atomicFields ++ TestData.arrayFields ++ TestData.mapFields) foreach { field => 83 | 84 | test(s"Column of type ${field.dataType} (nullable: ${field.nullable}) " + 85 | s"can be written to and read from BigQuery using direct imports (streaming)") { 86 | new RandomDataFrame(StructType(List(field)), 10) 87 | } 88 | 89 | } 90 | 91 | test("Nested struct columns can be written to and read from BigQuery using direct imports (streaming)") { 92 | new RandomDataFrame(StructType(List(TestData.customStructField)), 2) 93 | } 94 | 95 | test("Data frames with mixed data types can be written to and read from BigQuery using direct imports (streaming)") { 96 | new 
RandomDataFrame(StructType(TestData.atomicFields ++ TestData.arrayFields.take(2) ++ 97 | TestData.mapFields.take(2)), 2) 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /src/it/scala/com/miraisolutions/spark/bigquery/ParquetWriteDirectReadSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery 23 | 24 | import com.miraisolutions.spark.bigquery.test._ 25 | import com.miraisolutions.spark.bigquery.test.data.{DataFrameGenerator, TestData} 26 | import org.apache.spark.sql.types._ 27 | import org.apache.spark.sql.{DataFrame, SaveMode} 28 | import org.scalactic.anyvals.PosZInt 29 | import org.scalatest.FunSuite 30 | import org.scalatest.prop.{Checkers, GeneratorDrivenPropertyChecks} 31 | 32 | /** 33 | * Test suite which tests reading and writing Spark fields/columns to and from BigQuery. 34 | * 35 | * Data frames are written to BigQuery via Parquet export to ensure data is immediately available for querying and 36 | * doesn't end up in BigQuery's streaming buffer as it would when using "direct" mode. 
37 | * 38 | * @see [[https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-parquet]] 39 | * @see [[https://cloud.google.com/bigquery/streaming-data-into-bigquery]] 40 | * @see [[https://cloud.google.com/blog/big-data/2017/06/life-of-a-bigquery-streaming-insert]] 41 | */ 42 | class ParquetWriteDirectReadSpec extends FunSuite with BigQueryTesting { 43 | 44 | private val testTable = "test" 45 | 46 | private class RandomDataFrame(schema: StructType, size: PosZInt) 47 | extends Checkers with GeneratorDrivenPropertyChecks { 48 | 49 | override implicit val generatorDrivenConfig = 50 | PropertyCheckConfiguration(minSuccessful = 1, minSize = size, sizeRange = size) 51 | 52 | implicit val arbitraryDataFrame = DataFrameGenerator.generate(sqlContext, schema) 53 | 54 | forAll { out: DataFrame => 55 | 56 | out.write 57 | .mode(SaveMode.Overwrite) 58 | .bigqueryTest(testTable, exportType = "parquet") 59 | .save() 60 | 61 | val in = spark.read 62 | .bigqueryTest(testTable) 63 | .load() 64 | .persist() 65 | 66 | assertDataFrameEquals(out.aligned, in.aligned) 67 | } 68 | 69 | } 70 | 71 | (TestData.atomicFields ++ TestData.arrayFields ++ TestData.mapFields) foreach { field => 72 | 73 | test(s"Column of type ${field.dataType} (nullable: ${field.nullable}) " + 74 | s"can be written to and read from BigQuery") { 75 | new RandomDataFrame(StructType(List(field)), 10) 76 | } 77 | 78 | } 79 | 80 | test("Nested struct columns can be written to and read from BigQuery") { 81 | new RandomDataFrame(StructType(List(TestData.customStructField)), 2) 82 | } 83 | 84 | test("Data frames with mixed data types can be written to and read from BigQuery") { 85 | new RandomDataFrame(StructType(TestData.atomicFields ++ TestData.arrayFields.take(2) ++ 86 | TestData.mapFields.take(2)), 2) 87 | } 88 | 89 | } 90 | -------------------------------------------------------------------------------- /src/it/scala/com/miraisolutions/spark/bigquery/test/BigQueryConfiguration.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.test 23 | 24 | import com.miraisolutions.spark.bigquery.BigQueryTableReference 25 | import com.miraisolutions.spark.bigquery.client.BigQueryClient 26 | import com.miraisolutions.spark.bigquery.config.{BigQueryConfig, _} 27 | import org.apache.spark.sql.{DataFrameReader, DataFrameWriter, Row} 28 | import org.scalatest.{BeforeAndAfterAll, Outcome, TestSuite, TestSuiteMixin} 29 | 30 | private object BigQueryConfiguration { 31 | // BigQuery test dataset name 32 | private val BIGQUERY_TEST_DATASET = "spark_bigquery_test" 33 | } 34 | 35 | /** 36 | * BigQuery configuration test suite mixin. Tests mixing in that trait can be run in sbt via: 37 | * 38 | * it:testOnly com.miraisolutions.spark.bigquery.* -- 39 | * -Dbq.project= 40 | * -Dbq.location= 41 | * -Dbq.staging_dataset.gcs_bucket= 42 | * -Dbq.staging_dataset.service_account_key_file= 43 | */ 44 | private[bigquery] trait BigQueryConfiguration extends TestSuiteMixin with BeforeAndAfterAll { this: TestSuite => 45 | import BigQueryConfiguration._ 46 | 47 | // Captured BigQuery test configuration 48 | private var _config: BigQueryConfig = _ 49 | 50 | /** BigQuery client */ 51 | protected lazy val bigQueryClient: BigQueryClient = new BigQueryClient(config) 52 | 53 | /** BigQuery configuration */ 54 | protected def config: BigQueryConfig = _config 55 | 56 | /** 57 | * Construct a table reference to a table in the configured BigQuery test dataset. 58 | * @param table Table name 59 | * @return BigQuery table reference 60 | */ 61 | protected def getTestDatasetTableReference(table: String): BigQueryTableReference = { 62 | BigQueryTableReference(_config.project, BIGQUERY_TEST_DATASET, table) 63 | } 64 | 65 | /** 66 | * Gets the unquoted table identifier for a table in the configured BigQuery test dataset. 67 | * @param table Table name 68 | * @return Unquoted table identifier 69 | */ 70 | protected def getTestDatasetTableIdentifier(table: String): String = { 71 | getTestDatasetTableReference(table).unquotedIdentifier 72 | } 73 | 74 | protected implicit class DataFrameReaderTestConfig(val reader: DataFrameReader) { 75 | /** 76 | * Applies BigQuery test configuration options derived from parameters passed to the test. 77 | * @param table BigQuery table to read 78 | * @param importType Import type (e.g. "direct", "parquet", "avro", ...) 79 | * @return Spark [[DataFrameReader]] 80 | */ 81 | def bigqueryTest(table: String, importType: String = "direct"): DataFrameReader = { 82 | applyDataFrameOptions(reader, _config) 83 | .option("table", getTestDatasetTableIdentifier(table)) 84 | .option("type", importType) 85 | } 86 | } 87 | 88 | protected implicit class DataFrameWriterTestConfig(val writer: DataFrameWriter[Row]) { 89 | /** 90 | * Applies BigQuery test configuration options derived from parameters passed to the test. 91 | * @param table BigQuery table to write 92 | * @param exportType Export type (e.g. "direct", "parquet", "avro", ...) 
93 | * @return Spark [[DataFrameWriter]] 94 | */ 95 | def bigqueryTest(table: String, exportType: String = "direct"): DataFrameWriter[Row] = { 96 | applyDataFrameOptions(writer, _config) 97 | .option("table", getTestDatasetTableIdentifier(table)) 98 | .option("type", exportType) 99 | } 100 | } 101 | 102 | override protected def afterAll(): Unit = { 103 | // Removes the BigQuery test dataset at the end of a test suite 104 | bigQueryClient.deleteDataset(_config.project, BIGQUERY_TEST_DATASET) 105 | } 106 | 107 | // See {{TestSuiteMixin}} 108 | abstract override def withFixture(test: NoArgTest): Outcome = { 109 | // Extract BigQuery configuration from config map 110 | _config = BigQueryConfig(test.configMap.mapValues(_.toString)) 111 | super.withFixture(test) 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /src/it/scala/com/miraisolutions/spark/bigquery/test/BigQueryTesting.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.test 23 | 24 | import com.holdenkarau.spark.testing.{DataFrameSuiteBase, RDDComparisons} 25 | import org.apache.spark.sql.DataFrame 26 | import org.scalatest.TestSuite 27 | 28 | private[bigquery] trait BigQueryTesting extends BigQueryConfiguration with DataFrameSuiteBase 29 | with RDDComparisons { this: TestSuite => 30 | 31 | // See https://github.com/holdenk/spark-testing-base/issues/148 32 | // See https://issues.apache.org/jira/browse/SPARK-22918 33 | System.setSecurityManager(null) 34 | 35 | override def assertDataFrameEquals(expected: DataFrame, result: DataFrame): Unit = { 36 | assert("Schemas don't match", expected.schema, result.schema) 37 | assert("Number of rows don't match", expected.count(), result.count()) 38 | 39 | val mismatch = compareRDD(expected.rdd, result.rdd) 40 | if(mismatch.isDefined) { 41 | println("#### Expected ####") 42 | expected.show(10, 100, true) 43 | println("#### Result ####") 44 | result.show(10, 100, true) 45 | } 46 | 47 | assertTrue(mismatch.isEmpty) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/it/scala/com/miraisolutions/spark/bigquery/test/data/DataFrameGenerator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.test.data 23 | 24 | import java.math.BigInteger 25 | import java.sql.{Date, Timestamp} 26 | import java.time._ 27 | 28 | import com.holdenkarau.spark.testing.RDDGenerator 29 | import org.apache.spark.sql.{DataFrame, Row, SQLContext} 30 | import org.apache.spark.sql.types._ 31 | import org.scalacheck.{Arbitrary, Gen} 32 | 33 | /** 34 | * Generator of arbitrary Spark data frames used in property-based testing. 
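 * Usage sketch (hypothetical schema value; mirrors how the integration test suites in this repository use it):
 * {{{
 *   implicit val arbitraryDataFrame = DataFrameGenerator.generate(sqlContext, schema)
 *   forAll { df: DataFrame =>
 *     // exercise the generated data frame, e.g. write it to BigQuery and read it back
 *   }
 * }}}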
35 | * 36 | * @see [[https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types]] 37 | */ 38 | private[bigquery] object DataFrameGenerator { 39 | 40 | // Min and max BigQuery timestamp and date values 41 | private val MIN_INSTANT = Instant.parse("0001-01-01T00:00:00.000000Z") 42 | private val MAX_INSTANT = Instant.parse("9999-12-31T23:59:59.999999Z") 43 | 44 | // Number of milliseconds in one day 45 | private val MILLIS_PER_DAY = 86400000L 46 | 47 | /** 48 | * Generates an arbitrary Spark data frame with the specified schema and minimum number of partitions. 49 | * @param sqlContext Spark SQL context 50 | * @param schema Schema of data frame to generate 51 | * @param minPartitions Minimum number of partitions 52 | * @return Arbitrary Spark data frame 53 | */ 54 | def generate(sqlContext: SQLContext, schema: StructType, minPartitions: Int = 1): Arbitrary[DataFrame] = { 55 | val genRow = getRowGenerator(schema) 56 | val genDataFrame = RDDGenerator.genRDD[Row](sqlContext.sparkContext, minPartitions)(genRow) 57 | Arbitrary(genDataFrame.map(sqlContext.createDataFrame(_, schema))) 58 | } 59 | 60 | /** 61 | * Creates a generator of a row in a Spark data frame. 62 | * @param schema Schema of row to generate 63 | * @return Generator for a row 64 | */ 65 | private def getRowGenerator(schema: StructType): Gen[Row] = { 66 | import scala.collection.JavaConverters._ 67 | val fieldGenerators = schema.fields.map(field => getGeneratorForType(field.dataType)) 68 | val rowGen = Gen.sequence(fieldGenerators) 69 | rowGen.map(values => Row.fromSeq(values.asScala)) 70 | } 71 | 72 | /** 73 | * Creates a generator for a target data type. 74 | * @param dataType Data type 75 | * @return Generator of values of the specified data type 76 | */ 77 | private def getGeneratorForType(dataType: DataType): Gen[Any] = { 78 | import Arbitrary._ 79 | 80 | dataType match { 81 | case ByteType => 82 | arbitrary[Byte] 83 | 84 | case ShortType => 85 | arbitrary[Short] 86 | 87 | case IntegerType => 88 | arbitrary[Int] 89 | 90 | case LongType => 91 | arbitrary[Long] 92 | 93 | case FloatType => 94 | arbitrary[Float] 95 | 96 | case DoubleType => 97 | arbitrary[Double] 98 | 99 | case dt: DecimalType => 100 | for { 101 | digits <- Gen.listOfN(dt.precision, Gen.numChar) 102 | sign <- Gen.oneOf("", "-") 103 | unscaledValue = new BigInteger(sign + digits.mkString) 104 | } yield new java.math.BigDecimal(unscaledValue, dt.scale) 105 | 106 | case StringType => 107 | arbitrary[String] 108 | 109 | case BinaryType => 110 | Gen.listOf(arbitrary[Byte]).map(_.toArray) 111 | 112 | case BooleanType => 113 | arbitrary[Boolean] 114 | 115 | case TimestampType => 116 | // BigQuery allowed timestamp range: [0001-01-1 00:00:00.000000, 9999-12-31 23:59:59.999999] 117 | Gen.chooseNum[Long](MIN_INSTANT.toEpochMilli, MAX_INSTANT.toEpochMilli).map(new Timestamp(_)) 118 | 119 | case DateType => 120 | // BigQuery allowed date range: [0001-01-1, 9999-12-31] 121 | Gen.chooseNum[Long](MIN_INSTANT.toEpochMilli, MAX_INSTANT.toEpochMilli) map { millis => 122 | // We need to round the milliseconds to full days as otherwise the time components will be set to the 123 | // time components in the default time zone; see javadoc for java.sql.Date for more details 124 | new Date(millis / MILLIS_PER_DAY * MILLIS_PER_DAY) 125 | } 126 | 127 | case arr: ArrayType => 128 | val elementGenerator = getGeneratorForType(arr.elementType) 129 | Gen.listOf(elementGenerator) 130 | 131 | case map: MapType => 132 | val keyGenerator = getGeneratorForType(map.keyType) 133 | val 
valueGenerator = getGeneratorForType(map.valueType) 134 | val keyValueGenerator: Gen[(Any, Any)] = for { 135 | key <- keyGenerator 136 | value <- valueGenerator 137 | } yield (key, value) 138 | 139 | Gen.mapOf(keyValueGenerator) 140 | 141 | case row: StructType => 142 | getRowGenerator(row) 143 | 144 | case _ => 145 | throw new UnsupportedOperationException(s"Data type '$dataType' is not supported") 146 | } 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /src/it/scala/com/miraisolutions/spark/bigquery/test/data/TestData.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.test.data 23 | 24 | import org.apache.spark.sql.types._ 25 | 26 | /** 27 | * Definition of Spark data frame test data types and fields. 
28 | */ 29 | private[bigquery] object TestData { 30 | 31 | private val atomicTypes = List(BooleanType, ByteType, ShortType, IntegerType, LongType, FloatType, 32 | DoubleType, StringType, BinaryType, TimestampType, DateType, DataTypes.createDecimalType(38, 9), 33 | DataTypes.createDecimalType(12, 4), DataTypes.createDecimalType(33, 4), 34 | DataTypes.createDecimalType(7,7)) 35 | 36 | private def createName(dt: DataType, nullable: Boolean): String = { 37 | dt.typeName 38 | val base = dt.typeName.replaceAll("[^A-Za-z0-9]+", "") 39 | if(nullable) { 40 | base + "0" 41 | } else { 42 | base 43 | } 44 | } 45 | 46 | private def createFields[T <: DataType](dataTypes: List[T], createName: (T, Boolean) => String): List[StructField] = { 47 | for { 48 | dt <- dataTypes 49 | nullable <- List(true, false) 50 | } yield StructField(createName(dt, nullable), dt, nullable) 51 | } 52 | 53 | val atomicFields: List[StructField] = createFields(atomicTypes, createName) 54 | 55 | private def createArrayName(dt: ArrayType, nullable: Boolean): String = { 56 | val elementName = createName(dt.elementType, dt.containsNull) 57 | val array = if(nullable) "array0" else "array" 58 | s"${array}_${elementName}_" 59 | } 60 | 61 | private val arrayTypes: List[ArrayType] = { 62 | for { 63 | bt <- atomicTypes 64 | containsNull <- List(true, false) 65 | } yield ArrayType(bt, containsNull) 66 | } 67 | 68 | val arrayFields: List[StructField] = createFields(arrayTypes, createArrayName) 69 | 70 | private def createMapName(dt: MapType, nullable: Boolean): String = { 71 | val keyName = createName(dt.keyType, false) 72 | val valueName = createName(dt.valueType, dt.valueContainsNull) 73 | val map = if(nullable) "map0" else "map" 74 | s"${map}_${keyName}_${valueName}" 75 | } 76 | 77 | private val mapTypes: List[MapType] = { 78 | for { 79 | keyType <- atomicTypes 80 | valueType <- atomicTypes 81 | valueContainsNull <- List(true, false) 82 | } yield MapType(keyType, valueType, valueContainsNull) 83 | } 84 | 85 | val mapFields: List[StructField] = createFields(mapTypes, createMapName) 86 | 87 | val customStructField: StructField = StructField( 88 | "customStruct0", 89 | StructType( 90 | Array( 91 | StructField("a", BooleanType, false), 92 | StructField("b", ShortType, true), 93 | StructField("c", FloatType, false), 94 | StructField("d", DataTypes.createDecimalType(18, 6)), 95 | StructField("e", TimestampType, true), 96 | StructField("f", StringType, false), 97 | StructField("g", BinaryType, true), 98 | StructField("h", ArrayType(DoubleType, true)), 99 | StructField("i", MapType(StringType, LongType, false)), 100 | StructField("j", StructType(List( 101 | StructField("k", DateType, false), 102 | StructField("l", ByteType, true), 103 | StructField("m", ArrayType(IntegerType, false), true), 104 | StructField("n", MapType(BinaryType, ByteType, true), true) 105 | )), true) 106 | ) 107 | ), 108 | true 109 | ) 110 | } 111 | -------------------------------------------------------------------------------- /src/it/scala/com/miraisolutions/spark/bigquery/test/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the 
Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery 23 | 24 | import com.miraisolutions.spark.bigquery.utils.format._ 25 | import java.sql.Timestamp 26 | 27 | import org.apache.spark.sql.{Column, DataFrame} 28 | import org.apache.spark.sql.functions.{base64, explode_outer, udf} 29 | import org.apache.spark.sql.types._ 30 | 31 | package object test { 32 | import BigQuerySchemaConverter.BIGQUERY_NUMERIC_DECIMAL 33 | 34 | // Column flattener: partial function from field to flattened fields and an appropriate converter function 35 | private type ColumnFlattener = PartialFunction[StructField, (Seq[StructField], DataFrame => DataFrame)] 36 | 37 | // Rounds a timestamp to milliseconds 38 | private def roundTimestampToMillis(ts: Timestamp): Timestamp = { 39 | val roundedTs = new Timestamp(ts.getTime) 40 | roundedTs.setNanos(Math.round(ts.getNanos / 1e6).toInt) 41 | roundedTs 42 | } 43 | 44 | // Spark SQL UDF to round timestamps to milliseconds 45 | private val roundTimestampToMillisUdf = udf(roundTimestampToMillis _) 46 | 47 | 48 | // Explodes a nested column such as an array or map column 49 | private def explode(df: DataFrame, columnName: String, f: Column => Column = identity): DataFrame = { 50 | df.select(df.col("*"), f(explode_outer(df.col(columnName)))).drop(columnName) 51 | } 52 | 53 | // Column converter used to convert atomic data types into data types supported in BigQuery 54 | private val atomicTypeConverter: ColumnConverter = { 55 | case f @ StructField(_, ByteType | ShortType | IntegerType, _, _) => 56 | (f.copy(dataType = LongType), _.cast(LongType)) 57 | 58 | case f @ StructField(_, FloatType, _, _) => 59 | (f.copy(dataType = DoubleType), _.cast(DoubleType)) 60 | 61 | case f @ StructField(_, dt: DecimalType, _, _) if dt.precision < 38 => 62 | (f.copy(dataType = BIGQUERY_NUMERIC_DECIMAL), _.cast(BIGQUERY_NUMERIC_DECIMAL)) 63 | 64 | case f @ StructField(_, TimestampType, _, _) => 65 | (f, roundTimestampToMillisUdf(_)) 66 | 67 | case f @ StructField(_, BinaryType, _, _) => 68 | (f.copy(dataType = StringType), base64) 69 | } 70 | 71 | // Column flattener for nested data types 72 | private val nestedTypeFlattener: ColumnFlattener = { 73 | case StructField(name, ArrayType(elementType, containsNull), _, _) => 74 | val newName = name + "_element" 75 | (Seq(StructField(newName, elementType, containsNull)), explode(_, name, _.as(newName))) 76 | 77 | case StructField(name, MapType(keyType, valueType, containsNull), _, _) => 78 | val keyName = name + "_key" 79 | val valueName = name + "_value" 80 | (Seq(StructField(keyName, keyType, false), StructField(valueName, valueType, containsNull)), 81 | explode(_, name, _.as(Seq(keyName, valueName)))) 82 | 83 | case StructField(name, StructType(fields), _, _) => 84 | def unnest(df: 
DataFrame): DataFrame = { 85 | val subFields = df.col("*") :: (fields.toList map { sub => 86 | df.col(name).getItem(sub.name).as(name + "_" + sub.name) 87 | }) 88 | df.select(subFields: _*).drop(name) 89 | } 90 | (fields.toSeq, unnest) 91 | } 92 | 93 | /** 94 | * Implicit helper class to align column types in a data frame to types supported in BigQuery and types 95 | * suitable for comparison. 96 | * @param dataFrame Source data frame 97 | */ 98 | private[bigquery] implicit class AlignedDataFrame(val dataFrame: DataFrame) extends AnyVal { 99 | 100 | /** 101 | * Converts/casts columns to the appropriate types supported in BigQuery and types which are suitable for 102 | * comparison. Specifically: 103 | * 104 | * - BigQuery only supports 8 byte integer and floating point types. 105 | * 106 | * - BigQuery only supports decimal types with precision 38 and scale 9 107 | * 108 | * - Binary types are converted to base64 encoded strings for comparison 109 | * 110 | * - Array and map types are exploded for easier comparison 111 | * 112 | * - Struct types are unfolded 113 | * 114 | * - Parquet-style formatted columns (LIST, MAP) are converted to their native Spark equivalent 115 | * 116 | * @see [[https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types]] 117 | * @see [[FormatConverter]], [[Parquet]] 118 | */ 119 | def aligned: DataFrame = { 120 | import Parquet._ 121 | import Generic._ 122 | 123 | var df: DataFrame = dataFrame 124 | 125 | do { 126 | df = FormatConverter.transform(df, List(parquetListToArray, parquetMapToMap, keyValueRecordToMap)) 127 | df = df.schema.fields.foldLeft(df) { case (agg, field) => 128 | if(nestedTypeFlattener.isDefinedAt(field)) { 129 | val (_, converter) = nestedTypeFlattener(field) 130 | converter(agg) 131 | } else { 132 | agg 133 | } 134 | } 135 | } while(df.schema.fields.exists(nestedTypeFlattener.isDefinedAt)) 136 | 137 | FormatConverter.transform(df, List(atomicTypeConverter)) 138 | } 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /src/main/resources/META-INF/services/org.apache.hadoop.fs.FileSystem: -------------------------------------------------------------------------------- 1 | shadegoogle.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem -------------------------------------------------------------------------------- /src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister: -------------------------------------------------------------------------------- 1 | com.miraisolutions.spark.bigquery.DefaultSource -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/BigQueryPartition.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 
13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery 23 | 24 | import org.apache.spark.Partition 25 | 26 | /** 27 | * BigQuery table partition identifier 28 | * @param index Partition index 29 | */ 30 | private final case class BigQueryPartition(override val index: Int) extends Partition 31 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/BigQueryRowRDD.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery 23 | 24 | import com.miraisolutions.spark.bigquery.client.BigQueryTableReader 25 | import org.apache.spark.{Partition, SparkContext, TaskContext} 26 | import org.apache.spark.rdd.RDD 27 | import org.apache.spark.sql.Row 28 | 29 | /** 30 | * BigQuery row RDD which reads a BigQuery table by streaming records through a set of pages. 
31 | * @param sc Spark context 32 | * @param table Table reader used to read a BigQuery table through a number of partitions 33 | */ 34 | class BigQueryRowRDD(sc: SparkContext, val table: BigQueryTableReader) extends RDD[Row](sc, Seq.empty) { 35 | 36 | override def compute(split: Partition, context: TaskContext): Iterator[Row] = { 37 | table.getRows(split.index).iterator 38 | } 39 | 40 | override protected def getPartitions: Array[Partition] = { 41 | (0 until table.numPartitions).map(BigQueryPartition(_)).toArray 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/BigQuerySchemaConverter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery 23 | 24 | import com.google.cloud.bigquery.{Option => _, _} 25 | import org.apache.spark.sql.Row 26 | import org.apache.spark.sql.types._ 27 | 28 | import LegacySQLTypeName._ 29 | import com.google.common.io.BaseEncoding 30 | import com.miraisolutions.spark.bigquery.utils.DateTime 31 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 32 | 33 | import scala.collection.JavaConverters._ 34 | import scala.language.postfixOps 35 | 36 | /** 37 | * Schema conversion functions to convert schemas between Apache Spark and Google BigQuery. 38 | * @see [[https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types]] 39 | */ 40 | private[bigquery] object BigQuerySchemaConverter { 41 | 42 | // BigQuery's NUMERIC is a Decimal with precision 38 and scale 9 43 | private[bigquery] val BIGQUERY_NUMERIC_DECIMAL = DataTypes.createDecimalType(38, 9) 44 | 45 | private[bigquery] val KEY_FIELD_NAME = "key" 46 | private[bigquery] val VALUE_FIELD_NAME = "value" 47 | 48 | /** 49 | * Converts a BigQuery schema to a Spark schema. 50 | * @param schema BigQuery schema 51 | * @return Spark schema 52 | */ 53 | def fromBigQueryToSpark(schema: Schema): StructType = { 54 | val fields = schema.getFields.asScala.map(bigQueryToSparkField) 55 | StructType(fields) 56 | } 57 | 58 | /** 59 | * Creates a function that can be used to convert a BigQuery row (represented as [[FieldValueList]]) to a Spark 60 | * row. 
61 | * @param schema BigQuery schema 62 | * @return Function to convert a BigQuery row to a Spark row 63 | */ 64 | def getBigQueryToSparkConverterFunction(schema: Schema): FieldValueList => Row = { fields => 65 | val meta = fromBigQueryToSpark(schema).fields.zip(fields.asScala) 66 | val values = meta map { case (field, value) => getRowValue(value, field.dataType) } 67 | Row.fromSeq(values) 68 | } 69 | 70 | /** 71 | * Converts a BigQuery [[Field]] to a Spark [[StructField]]. 72 | * @param field BigQuery [[Field]] 73 | * @return Spark [[StructField]] 74 | */ 75 | private def bigQueryToSparkField(field: Field): StructField = { 76 | val dataType = field.getType match { 77 | case BOOLEAN => 78 | BooleanType 79 | 80 | case INTEGER => 81 | LongType 82 | 83 | case FLOAT => 84 | DoubleType 85 | 86 | case NUMERIC => 87 | BIGQUERY_NUMERIC_DECIMAL 88 | 89 | case STRING => 90 | StringType 91 | 92 | case BYTES => 93 | BinaryType 94 | 95 | case RECORD => 96 | val fields = field.getSubFields.asScala.map(bigQueryToSparkField) 97 | StructType(fields.toArray) 98 | 99 | case TIMESTAMP => 100 | TimestampType 101 | 102 | case DATE => 103 | DateType 104 | 105 | case TIME => 106 | // Not supported in Spark 107 | StringType 108 | 109 | case DATETIME => 110 | // Not supported in Spark 111 | StringType 112 | } 113 | 114 | // Mode may be null in which case the default nullable is assumed 115 | val mode = Option(field.getMode).getOrElse(Field.Mode.NULLABLE) 116 | val isNullable = mode.equals(Field.Mode.NULLABLE) 117 | val isRepeated = mode.equals(Field.Mode.REPEATED) 118 | 119 | if(isRepeated) { 120 | StructField(field.getName, ArrayType(dataType, isNullable), false) 121 | } else { 122 | StructField(field.getName, dataType, isNullable) 123 | } 124 | } 125 | 126 | /** 127 | * Extracts a value from a BigQuery field that can be used to construct a Spark row. 128 | * @param value BigQuery [[FieldValue]] 129 | * @param dataType Target Spark data type 130 | * @return Spark row value 131 | */ 132 | private def getRowValue(value: FieldValue, dataType: DataType): Any = { 133 | if(value.isNull) { 134 | null 135 | } else { 136 | dataType match { 137 | case BooleanType => 138 | value.getBooleanValue 139 | case LongType => 140 | value.getLongValue 141 | case DoubleType => 142 | value.getDoubleValue 143 | case _: DecimalType => 144 | value.getNumericValue 145 | case StringType => 146 | value.getStringValue 147 | case BinaryType => 148 | value.getBytesValue 149 | case ArrayType(elementType, _) => 150 | value.getRepeatedValue.asScala.map(getRowValue(_, elementType)).toArray 151 | case StructType(fields) => 152 | Row(value.getRecordValue.asScala.zip(fields.map(_.dataType)).map((getRowValue _).tupled): _*) 153 | case TimestampType => 154 | DateTime.epochMicrosToTimestamp(value.getTimestampValue) 155 | case DateType => 156 | DateTime.parseDate(value.getStringValue) 157 | } 158 | } 159 | } 160 | 161 | /** 162 | * Converts a Spark schema to a BigQuery schema. 163 | * @param schema Spark schema 164 | * @return BigQuery schema 165 | */ 166 | def fromSparkToBigQuery(schema: StructType): Schema = { 167 | Schema.of(schema.fields.map(sparkToBigQueryField): _*) 168 | } 169 | 170 | /** 171 | * Creates a custom (key, value) [[StructField]] pair from a Spark [[MapType]] that can be used 172 | * to construct a BigQuery record type. 
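 *
 * Illustrative example (annotation added to this listing, not part of the original source): together with
 * [[sparkToBigQueryField]], a Spark field of type MapType(StringType, LongType, valueContainsNull = true)
 * maps to a BigQuery field in REPEATED mode of type RECORD with sub-fields "key" (STRING, REQUIRED) and
 * "value" (INTEGER, NULLABLE).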
173 | * @param mapType Spark map type 174 | * @return Key/value [[StructField]] pair according to the map's key/value data types 175 | */ 176 | private def customKeyValueStructFields(mapType: MapType): (StructField, StructField) = { 177 | val keyField = StructField(KEY_FIELD_NAME, mapType.keyType, false) 178 | val valueField = StructField(VALUE_FIELD_NAME, mapType.valueType, mapType.valueContainsNull) 179 | (keyField, valueField) 180 | } 181 | 182 | /** 183 | * Creates a custom [[StructField]] from a Spark [[ArrayType]] that can be used to construct a BigQuery 184 | * field with "repeated" mode. 185 | * @param arrayType Spark array type 186 | * @return [[StructField]] according to the array's element data type 187 | */ 188 | private def customArrayStructField(arrayType: ArrayType): StructField = { 189 | StructField(VALUE_FIELD_NAME, arrayType.elementType, arrayType.containsNull) 190 | } 191 | 192 | /** 193 | * Converts a Spark [[StructField]] to a BigQuery [[Field]]. 194 | * @param field Spark [[StructField]] 195 | * @return BigQuery [[Field]] 196 | */ 197 | private def sparkToBigQueryField(field: StructField): Field = { 198 | def f(tpe: LegacySQLTypeName): Field = { 199 | val mode = if(field.nullable) Field.Mode.NULLABLE else Field.Mode.REQUIRED 200 | Field.newBuilder(field.name, tpe).setMode(mode).build() 201 | } 202 | 203 | field.dataType match { 204 | case BooleanType => 205 | f(BOOLEAN) 206 | 207 | case ByteType => 208 | f(INTEGER) 209 | 210 | case ShortType => 211 | f(INTEGER) 212 | 213 | case IntegerType => 214 | f(INTEGER) 215 | 216 | case LongType => 217 | f(INTEGER) 218 | 219 | case FloatType => 220 | f(FLOAT) 221 | 222 | case DoubleType => 223 | f(FLOAT) 224 | 225 | case dt: DecimalType if dt.precision <= BIGQUERY_NUMERIC_DECIMAL.precision && 226 | dt.scale <= BIGQUERY_NUMERIC_DECIMAL.scale => 227 | f(NUMERIC) 228 | 229 | case _: DecimalType => 230 | f(STRING) 231 | 232 | case StringType => 233 | f(STRING) 234 | 235 | case BinaryType => 236 | f(BYTES) 237 | 238 | case StructType(fields) => 239 | Field.of(field.name, RECORD, fields.map(sparkToBigQueryField): _*) 240 | 241 | case TimestampType => 242 | f(TIMESTAMP) 243 | 244 | case DateType => 245 | f(DATE) 246 | 247 | case t: MapType => 248 | val (keyField, valueField) = customKeyValueStructFields(t) 249 | val key = sparkToBigQueryField(keyField) 250 | val value = sparkToBigQueryField(valueField) 251 | Field.newBuilder(field.name, RECORD, key, value).setMode(Field.Mode.REPEATED).build() 252 | 253 | case t: ArrayType => 254 | val elementField = sparkToBigQueryField(customArrayStructField(t)) 255 | Field.newBuilder(field.name, elementField.getType).setMode(Field.Mode.REPEATED).build() 256 | 257 | case _ => // HiveStringType, NullType, ObjectType, CalendarIntervalType 258 | f(STRING) 259 | } 260 | } 261 | 262 | /** 263 | * Creates a function that can be used to convert a Spark row to a BigQuery row (represented as a map). 264 | * @param schema Spark schema 265 | * @return Function to convert a Spark row to a BigQuery row 266 | */ 267 | def getSparkToBigQueryConverterFunction(schema: StructType): Row => java.util.Map[String, Any] = { row => 268 | val meta = fromSparkToBigQuery(schema) 269 | val result = new java.util.HashMap[String, Any](meta.getFields.size) 270 | 271 | meta.getFields.asScala foreach { field => 272 | result.put(field.getName, getFieldValue(row, field)) 273 | } 274 | result 275 | } 276 | 277 | /** 278 | * Extracts a value from a Spark row that can be used to construct a BigQuery row. 
279 | * @param row Spark row 280 | * @param field BigQuery field 281 | * @return BigQuery field value 282 | */ 283 | private def getFieldValue(row: Row, field: Field): Any = { 284 | val idx = row.fieldIndex(field.getName) 285 | 286 | if(row.isNullAt(idx)) { 287 | null 288 | } else { 289 | val sourceType = row.schema(idx).dataType 290 | val targetType = field.getType 291 | 292 | (sourceType, targetType) match { 293 | case (BooleanType, BOOLEAN) => 294 | row.getBoolean(idx) 295 | 296 | case (ByteType, INTEGER) => 297 | row.getByte(idx) 298 | 299 | case (ShortType, INTEGER) => 300 | row.getShort(idx) 301 | 302 | case (IntegerType, INTEGER) => 303 | row.getInt(idx) 304 | 305 | case (LongType, INTEGER) => 306 | row.getLong(idx) 307 | 308 | case (FloatType, FLOAT) => 309 | row.getFloat(idx).toDouble 310 | 311 | case (DoubleType, FLOAT) => 312 | row.getDouble(idx) 313 | 314 | case (_: DecimalType, _) => 315 | row.getDecimal(idx).toPlainString 316 | 317 | case (StringType, STRING) => 318 | row.getString(idx) 319 | 320 | case (BinaryType, BYTES) => 321 | val bytes = row.getAs[Array[Byte]](idx) 322 | BaseEncoding.base64().encode(bytes) 323 | 324 | case (StructType(_), RECORD) => 325 | val struct = row.getStruct(idx) 326 | val result = new java.util.HashMap[String, Any](struct.size) 327 | 328 | field.getSubFields.asScala foreach { subField => 329 | result.put(subField.getName, getFieldValue(struct, subField)) 330 | } 331 | result 332 | 333 | case (TimestampType, TIMESTAMP) => 334 | // BigQuery requires specifying the number of seconds since the epoch 335 | DateTime.timestampToEpochSeconds(row.getTimestamp(idx)) 336 | 337 | case (DateType, DATE) => 338 | DateTime.formatSparkDate(row.getDate(idx)) 339 | 340 | case (t: MapType, RECORD) => 341 | val m = row.getMap[Any, Any](idx) 342 | 343 | val (keyField, valueField) = customKeyValueStructFields(t) 344 | val mapSchema = StructType(Array(keyField, valueField)) 345 | 346 | m.toList map { case (k, v) => 347 | val record = new java.util.HashMap[Any, Any](2) 348 | val kvRow = new GenericRowWithSchema(Array(k, v), mapSchema) 349 | val keyValue = getFieldValue(kvRow, field.getSubFields.get(0)) 350 | val valueValue = getFieldValue(kvRow, field.getSubFields.get(1)) 351 | record.put(keyField.name, keyValue) 352 | record.put(valueField.name, valueValue) 353 | record 354 | } asJava 355 | 356 | case (st: ArrayType, _) => 357 | val arrayField = customArrayStructField(st) 358 | val arraySchema = StructType(Array(arrayField)) 359 | val arrayBigQueryField = sparkToBigQueryField(arrayField) 360 | 361 | row.getSeq[Any](idx) map { value => 362 | getFieldValue(new GenericRowWithSchema(Array(value), arraySchema), arrayBigQueryField) 363 | } asJava 364 | } 365 | } 366 | } 367 | } 368 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/BigQueryTableReference.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright 
notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery 23 | 24 | import com.google.cloud.bigquery.{Table, TableId} 25 | import com.miraisolutions.spark.bigquery.exception.ParseException 26 | import scala.language.implicitConversions 27 | 28 | /** 29 | * BigQuery table reference 30 | * @param project Project ID 31 | * @param dataset Dataset ID 32 | * @param table Table ID 33 | */ 34 | private final case class BigQueryTableReference(project: String, dataset: String, table: String) { 35 | 36 | /** Returns the unquoted table identifier (BigQuery Standard SQL) */ 37 | def unquotedIdentifier: String = s"$project.$dataset.$table" 38 | /** Returns the quoted table identifier (BigQuery Standard SQL) */ 39 | def quotedIdentifier: String = "`" + unquotedIdentifier + "`" 40 | 41 | /** BigQuery Standard SQL table identifier (quoted) */ 42 | override def toString: String = quotedIdentifier 43 | } 44 | 45 | private object BigQueryTableReference { 46 | 47 | /** 48 | * Creates a [[BigQueryTableReference]] from a [[TableId]] 49 | * @param tableId Table ID 50 | * @return BigQuery table reference 51 | */ 52 | def apply(tableId: TableId): BigQueryTableReference = 53 | BigQueryTableReference(tableId.getProject, tableId.getDataset, tableId.getTable) 54 | 55 | /** 56 | * Creates a [[BigQueryTableReference]] from a table reference [[String]] 57 | * @param tableRef Table reference string 58 | * @return BigQuery table reference 59 | */ 60 | def apply(tableRef: String): BigQueryTableReference = { 61 | val tableId = raw"((.+:)?[\w_\-]+)\.([\w_\-]+)\.([\w_\-]+)".r 62 | tableRef.replace("`", "") match { 63 | case tableId(project, _, dataset, table) => BigQueryTableReference(project, dataset, table) 64 | case _ => throw new ParseException("Failed to parse BigQuery table reference which needs to be of the form " + 65 | "[projectId].[datasetId].[tableId]") 66 | } 67 | } 68 | 69 | // Converts an internal BigQuery table reference to a Google BigQuery API `TableId` 70 | implicit def bigQueryTableReferenceToTableId(table: BigQueryTableReference): TableId = { 71 | TableId.of(table.project, table.dataset, table.table) 72 | } 73 | 74 | // Converts a Google BigQuery API `TableId` to an internal BigQuery table reference 75 | implicit def tableIdToBigQueryTableReference(tableId: TableId): BigQueryTableReference = apply(tableId) 76 | 77 | // Converts a Google BigQuery API `Table` to an internal BigQuery table reference 78 | implicit def tableToBigQueryTableReference(table: Table): BigQueryTableReference = apply(table.getTableId) 79 | } 80 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/BigQueryTableRelation.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 
of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery 23 | 24 | import com.miraisolutions.spark.bigquery.client.BigQueryClient 25 | import com.miraisolutions.spark.bigquery.sql.BigQuerySqlGeneration 26 | import org.apache.spark.rdd.RDD 27 | import org.apache.spark.sql.sources._ 28 | import org.apache.spark.sql.types.StructType 29 | import org.apache.spark.sql.{DataFrame, Row, SQLContext, SaveMode} 30 | import org.slf4j.LoggerFactory 31 | 32 | /** 33 | * Relation for a Google BigQuery table 34 | * 35 | * @param sqlContext Spark SQL context 36 | * @param client BigQuery client 37 | * @param table BigQuery table reference 38 | */ 39 | private final case class BigQueryTableRelation(sqlContext: SQLContext, client: BigQueryClient, 40 | table: BigQueryTableReference) 41 | extends BaseRelation with TableScan with PrunedScan with PrunedFilteredScan with InsertableRelation { 42 | 43 | private val logger = LoggerFactory.getLogger(classOf[BigQueryTableRelation]) 44 | private val sql = BigQuerySqlGeneration(table) 45 | 46 | // See {{BaseRelation}} 47 | override def schema: StructType = client.getSchema(table) 48 | 49 | // See {{TableScan}} 50 | override def buildScan(): RDD[Row] = { 51 | logger.info(s"Executing full scan of table $table") 52 | val tbl = client.getTable(table, sqlContext.sparkContext.defaultParallelism) 53 | new BigQueryRowRDD(sqlContext.sparkContext, tbl) 54 | } 55 | 56 | // See {{PrunedScan}} 57 | override def buildScan(requiredColumns: Array[String]): RDD[Row] = { 58 | buildScan(requiredColumns, Array.empty) 59 | } 60 | 61 | // See {{PrunedFilteredScan}} 62 | override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = { 63 | logger.info(s"Executing pruned filtered scan of table $table ") 64 | val sqlQuery = sql.getQuery(requiredColumns, filters) 65 | val tbl = client.executeQuery(sqlQuery, sqlContext.sparkContext.defaultParallelism) 66 | new BigQueryRowRDD(sqlContext.sparkContext, tbl) 67 | } 68 | 69 | // See {{InsertableRelation}} 70 | override def insert(data: DataFrame, overwrite: Boolean): Unit = { 71 | logger.info(s"Writing to table $table (overwrite = $overwrite)") 72 | val mode = if(overwrite) SaveMode.Overwrite else SaveMode.Append 73 | client.writeTable(data, table, mode) 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/DefaultSource.scala: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery 23 | 24 | import com.google.cloud.hadoop.fs.gcs.{GoogleHadoopFS, GoogleHadoopFileSystem} 25 | import com.miraisolutions.spark.bigquery.FileFormat.CSV 26 | import com.miraisolutions.spark.bigquery.client.BigQueryClient 27 | import com.miraisolutions.spark.bigquery.config.BigQueryConfig 28 | import com.miraisolutions.spark.bigquery.exception.MissingParameterException 29 | import com.miraisolutions.spark.bigquery.utils.Files 30 | import org.apache.hadoop.conf.Configuration 31 | import org.apache.spark.sql.execution.datasources.DataSource 32 | import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} 33 | import org.apache.spark.sql.sources._ 34 | import org.apache.spark.sql.execution.FileRelation 35 | 36 | /** 37 | * Google BigQuery default data source. 
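 *
 * Illustrative usage sketch (annotation added to this listing, not part of the original source). The option
 * names follow the parameter handling in this file ("table", "sqlQuery", "type"); project, staging and
 * credential settings are handled by [[com.miraisolutions.spark.bigquery.config.BigQueryConfig]] and are
 * omitted here, and all example values are hypothetical:
 * {{{
 *   // Read: directly (streaming) or via a file-based export, depending on 'type'
 *   val df = spark.read
 *     .format("bigquery")
 *     .option("table", "someProject.someDataset.someTable")  // alternatively .option("sqlQuery", "SELECT ...")
 *     .option("type", "direct")                               // or e.g. "avro", "json", "csv" for reads
 *     .load()
 *
 *   // Write: directly (streaming) or via a file-based import, depending on 'type'
 *   df.write
 *     .format("bigquery")
 *     .option("table", "someProject.someDataset.someTargetTable")
 *     .option("type", "direct")                               // or e.g. "parquet", "avro", "orc" for writes
 *     .mode("append")
 *     .save()
 * }}}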
38 | */ 39 | class DefaultSource extends RelationProvider with CreatableRelationProvider with DataSourceRegister { 40 | import DefaultSource._ 41 | 42 | // See {{DataSourceRegister}} 43 | override def shortName(): String = BIGQUERY_DATA_SOURCE_NAME 44 | 45 | // See {{RelationProvider}} 46 | override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { 47 | withBigQueryClient(sqlContext, parameters, false) { (client, table) => 48 | parameters.foldType[BaseRelation](BigQueryTableRelation(sqlContext, client, table)) { format => 49 | val stagingDirectory = client.exportTable(table, format) 50 | // Register staging directory for deletion when FileSystem gets closed 51 | Files.deleteOnExit(stagingDirectory, sqlContext.sparkContext.hadoopConfiguration) 52 | 53 | getStagingDataFileRelation(sqlContext, stagingDirectory, format) 54 | } 55 | } 56 | } 57 | 58 | // See {{CreatableRelationProvider}} 59 | override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], 60 | data: DataFrame): BaseRelation = { 61 | 62 | withBigQueryClient(sqlContext, parameters, true) { (client, table) => 63 | parameters.foldType[Unit](client.writeTable(data, table, mode)) { format => 64 | val stagingDirectory = client.getStagingDirectory() 65 | 66 | // Use TIMESTAMP_MICROS in Parquet (supported since Spark 2.3.0) 67 | sqlContext.setConf("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS") 68 | 69 | data.write 70 | .format(format.sparkFormatIdentifier) 71 | .options(getFormatOptions(format)) 72 | .save(stagingDirectory) 73 | 74 | client.importTable(stagingDirectory, format, table, mode) 75 | 76 | // Remove staging directory after import has been completed 77 | Files.delete(stagingDirectory, sqlContext.sparkContext.hadoopConfiguration) 78 | } 79 | 80 | BigQueryTableRelation(sqlContext, client, table) 81 | } 82 | } 83 | } 84 | 85 | private[bigquery] object DefaultSource { 86 | 87 | // BigQuery data source name 88 | val BIGQUERY_DATA_SOURCE_NAME = "bigquery" 89 | 90 | // Direct import/export type 91 | private val TYPE_DIRECT = "direct" 92 | 93 | /** Creates a BigQuery client with the provided configuration parameters */ 94 | private def getBigQueryClient(parameters: Map[String, String]): BigQueryClient = { 95 | new BigQueryClient(BigQueryConfig(parameters)) 96 | } 97 | 98 | /** 99 | * Sets several necessary Spark Hadoop configuration options to enable access to Google Cloud Storage (GCS). 
100 | * @param conf Spark Hadoop configuration 101 | * @param project Google Cloud project 102 | * @param serviceAccountKeyFile Optional Google Cloud service account key file 103 | * @see [[https://cloud.google.com/storage/docs/authentication#service_accounts]] 104 | * @see [[https://github.com/GoogleCloudPlatform/bigdata-interop/blob/master/gcs/INSTALL.md]] 105 | */ 106 | private def initHadoop(conf: Configuration, project: String, serviceAccountKeyFile: Option[String]): Unit = { 107 | conf.set("fs.gs.impl", classOf[GoogleHadoopFileSystem].getName) 108 | conf.set("fs.AbstractFileSystem.gs.impl", classOf[GoogleHadoopFS].getName) 109 | conf.set("fs.gs.project.id", project) 110 | 111 | serviceAccountKeyFile foreach { file => 112 | conf.set("google.cloud.auth.service.account.enable", "true") 113 | conf.set("google.cloud.auth.service.account.json.keyfile", file) 114 | } 115 | } 116 | 117 | /** 118 | * Determines Spark options to be provided for a particular file format 119 | * @param format File format 120 | * @return Spark import/export options 121 | */ 122 | private def getFormatOptions(format: FileFormat): Map[String, String] = { 123 | format match { 124 | case CSV => 125 | Map("header" -> "true") 126 | 127 | case _ => 128 | Map.empty 129 | } 130 | } 131 | 132 | /** 133 | * Creates a Spark [[FileRelation]] for staged data files in a Google Cloud Storage (GCS) staging directory. 134 | * @param sqlContext Spark SQL context 135 | * @param stagingDirectory Staging directory path 136 | * @param format File export format 137 | * @return Spark [[BaseRelation]] for the staged data files 138 | */ 139 | private def getStagingDataFileRelation(sqlContext: SQLContext, stagingDirectory: String, 140 | format: FileFormat): BaseRelation = { 141 | val dataSource = DataSource( 142 | sparkSession = sqlContext.sparkSession, 143 | className = format.sparkFormatIdentifier, 144 | paths = List(stagingDirectory), 145 | userSpecifiedSchema = None, 146 | options = getFormatOptions(format) 147 | ) 148 | 149 | dataSource.resolveRelation(true) 150 | } 151 | 152 | /** 153 | * Gets a BigQuery table reference for the specified parameters. The parameters must either specify a table or 154 | * a SQL query. 155 | * @param sqlContext Spark SQL context 156 | * @param client BigQuery client 157 | * @param parameters Parameters - must either specify a table or SQL query. 158 | * @param tableOnly Specifies whether a direct table reference is required. 159 | * @return Reference to a BigQuery table that holds the data 160 | */ 161 | private def getBigQueryTableReference(sqlContext: SQLContext, client: BigQueryClient, 162 | parameters: Map[String, String], tableOnly: Boolean): BigQueryTableReference = { 163 | // Get direct table reference if 'table' has been specified 164 | val tableOpt = parameters.get("table").map(BigQueryTableReference(_)) 165 | 166 | if(tableOnly) { 167 | tableOpt.getOrElse(throw new MissingParameterException( 168 | "A parameter 'table' of the form [projectId].[datasetId].[tableId] must be specified." 169 | )) 170 | } else { 171 | // Execute 'sqlQuery' and get reference to table containing the results 172 | def sqlTableOpt: Option[BigQueryTableReference] = parameters.get("sqlQuery") map { sqlQuery => 173 | client.executeQuery(sqlQuery, sqlContext.sparkContext.defaultParallelism).table 174 | } 175 | 176 | tableOpt.orElse(sqlTableOpt).getOrElse(throw new MissingParameterException( 177 | "Either a parameter 'table' of the form [projectId].[datasetId].[tableId] or 'sqlQuery' must be specified." 
178 | )) 179 | } 180 | } 181 | 182 | /** 183 | * Constructs a BigQuery client, applies the necessary Spark Hadoop configuration and then calls a provided 184 | * function to create a [[BaseRelation]]. 185 | * @param sqlContext Spark SQL context 186 | * @param parameters Parameters 187 | * @param tableOnly Specifies whether a direct table reference is required. 188 | * @param createRelation Function to create a [[BaseRelation]] given a BigQuery client and a table reference. 189 | * @return Spark [[BaseRelation]] 190 | */ 191 | private def withBigQueryClient(sqlContext: SQLContext, parameters: Map[String, String], tableOnly: Boolean) 192 | (createRelation: 193 | (BigQueryClient, BigQueryTableReference) => BaseRelation): BaseRelation = { 194 | 195 | val client = getBigQueryClient(parameters) 196 | 197 | initHadoop(sqlContext.sparkContext.hadoopConfiguration, client.config.project, 198 | client.config.serviceAccountKeyFile) 199 | 200 | val tableReference = getBigQueryTableReference(sqlContext, client, parameters, tableOnly) 201 | 202 | createRelation(client, tableReference) 203 | } 204 | 205 | /** Helper class for parameter handling */ 206 | private implicit class TypeParameter(parameters: Map[String, String]) { 207 | /** 208 | * Fold on import/export type. 209 | * @param direct Block to execute when 'type' = 'direct' 210 | * @param handleFileFormat Function to execute when 'type' = 211 | */ 212 | def foldType[T](direct: => T)(handleFileFormat: FileFormat => T): T = { 213 | parameters.getOrElse("type", TYPE_DIRECT) match { 214 | case TYPE_DIRECT => 215 | direct 216 | 217 | case tpe => 218 | handleFileFormat(FileFormat(tpe)) 219 | } 220 | } 221 | } 222 | } 223 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/FileFormat.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery 23 | 24 | import com.google.cloud.bigquery.FormatOptions 25 | 26 | /** 27 | * File format used in conjunction with Spark and BigQuery import/export. 
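 *
 * Illustrative note (annotation added to this listing, not part of the original source): these case objects
 * correspond to the lower-case values accepted through the data source's 'type' option, e.g.
 * FileFormat("avro") yields [[FileFormat.AVRO]]; unknown strings cause an IllegalArgumentException (see the
 * companion object's apply method below).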
28 | */ 29 | private sealed trait FileFormat { 30 | /** Spark format identifier */ 31 | def sparkFormatIdentifier: String 32 | /** BigQuery format options */ 33 | def bigQueryFormatOptions: FormatOptions 34 | /** File extension */ 35 | def fileExtension: String 36 | } 37 | 38 | private object FileFormat { 39 | 40 | /** 41 | * JSON format. 42 | * @see [[https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-json]] 43 | */ 44 | case object JSON extends FileFormat { 45 | override val sparkFormatIdentifier: String = "json" 46 | override val bigQueryFormatOptions: FormatOptions = FormatOptions.json() 47 | override val fileExtension: String = "json" 48 | } 49 | 50 | /** 51 | * CSV format. 52 | * @see [[https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-csv]] 53 | */ 54 | case object CSV extends FileFormat { 55 | override val sparkFormatIdentifier: String = "csv" 56 | override val bigQueryFormatOptions: FormatOptions = FormatOptions.csv() 57 | override val fileExtension: String = "csv" 58 | } 59 | 60 | /** 61 | * Avro format. 62 | * @see [[https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-avro]] 63 | * @see [[https://github.com/databricks/spark-avro]] 64 | */ 65 | case object AVRO extends FileFormat { 66 | override val sparkFormatIdentifier: String = "com.databricks.spark.avro" 67 | override val bigQueryFormatOptions: FormatOptions = FormatOptions.avro() 68 | override val fileExtension: String = "avro" 69 | } 70 | 71 | /** 72 | * Parquet format. 73 | * @see [[https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-parquet]] 74 | * @see [[https://spark.apache.org/docs/latest/sql-programming-guide.html#parquet-files]] 75 | */ 76 | case object PARQUET extends FileFormat { 77 | override val sparkFormatIdentifier: String = "parquet" 78 | override val bigQueryFormatOptions: FormatOptions = FormatOptions.parquet() 79 | override val fileExtension: String = "parquet" 80 | } 81 | 82 | /** 83 | * ORC format. 84 | * @see [[https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-orc]] 85 | * @see [[https://spark.apache.org/docs/latest/sql-programming-guide.html#orc-files]] 86 | */ 87 | case object ORC extends FileFormat { 88 | override val sparkFormatIdentifier: String = "orc" 89 | override val bigQueryFormatOptions: FormatOptions = FormatOptions.orc() 90 | override val fileExtension: String = "orc" 91 | } 92 | 93 | /** Creates a file format from a string. 
*/ 94 | def apply(format: String): FileFormat = { 95 | format.toLowerCase match { 96 | case "parquet" => PARQUET 97 | case "avro" => AVRO 98 | case "orc" => ORC 99 | case "json" => JSON 100 | case "csv" => CSV 101 | case _ => 102 | throw new IllegalArgumentException(s"Unsupported file format: $format") 103 | } 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/client/BigQueryClient.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.client 23 | 24 | import java.io.FileInputStream 25 | import java.time.{Instant, ZoneId} 26 | import java.time.format.DateTimeFormatter 27 | 28 | import com.google.auth.oauth2.ServiceAccountCredentials 29 | import com.google.cloud.RetryOption 30 | import com.google.cloud.bigquery.BigQuery.DatasetDeleteOption 31 | import com.google.cloud.bigquery.InsertAllRequest.RowToInsert 32 | import com.google.cloud.bigquery.JobInfo.{CreateDisposition, WriteDisposition} 33 | import com.google.cloud.bigquery.{Option => _, _} 34 | import com.miraisolutions.spark.bigquery.config.{BigQueryConfig, StagingDatasetConfig} 35 | import com.miraisolutions.spark.bigquery.exception.{IOException, UnsupportedFormatException} 36 | import com.miraisolutions.spark.bigquery.utils.SqlLogger 37 | import com.miraisolutions.spark.bigquery.{BigQuerySchemaConverter, BigQueryTableReference, FileFormat} 38 | import org.apache.spark.sql.types.StructType 39 | import org.apache.spark.sql.{DataFrame, SaveMode} 40 | import org.slf4j.LoggerFactory 41 | import org.threeten.bp.Duration 42 | 43 | import scala.collection.JavaConverters._ 44 | import scala.language.implicitConversions 45 | import scala.util.Random 46 | 47 | private object BigQueryClient { 48 | import FileFormat._ 49 | 50 | // Prefix for temporary tables and directories 51 | private val TEMP_PREFIX = "spark_" 52 | 53 | // Timestamp formatter for temporary tables and GCS staging directories 54 | private val TIMESTAMP_FORMATTER = 55 | DateTimeFormatter.ofPattern("yyyyMMddHHmmss").withZone(ZoneId.of("UTC")) 56 | 57 | // File formats not currently available for BigQuery imports 58 | private val UNSUPPORTED_BIGQUERY_IMPORT_FORMATS: Set[FileFormat] = Set(JSON, CSV) 59 | // File formats not 
currently available for BigQuery exports 60 | private val UNSUPPORTED_BIGQUERY_EXPORT_FORMATS: Set[FileFormat] = Set(PARQUET, ORC) 61 | } 62 | 63 | /** 64 | * BigQuery Client 65 | * 66 | * @param config BigQuery configuration 67 | */ 68 | private[bigquery] class BigQueryClient(val config: BigQueryConfig) { 69 | import BigQueryClient._ 70 | 71 | // Internal BigQuery client 72 | private val bigquery: BigQuery = getBigQueryService() 73 | 74 | private val logger = LoggerFactory.getLogger(classOf[BigQueryClient]) 75 | private val sqlLogger = SqlLogger(logger) 76 | 77 | /** 78 | * Creates an internal BigQuery client that uses the provided service account credentials or the application 79 | * default credentials if no service account credentials have been provided. 80 | * @return BigQuery service interface 81 | * @see [[https://cloud.google.com/docs/authentication/]] 82 | * @see [[https://cloud.google.com/bigquery/docs/authentication/]] 83 | * @see [[https://github.com/GoogleCloudPlatform/google-cloud-java#authentication]] 84 | */ 85 | private def getBigQueryService(): BigQuery = { 86 | config.serviceAccountKeyFile.fold(BigQueryOptions.getDefaultInstance.getService) { keyFile => 87 | BigQueryOptions.newBuilder() 88 | .setCredentials(ServiceAccountCredentials.fromStream(new FileInputStream(keyFile))) 89 | .build() 90 | .getService 91 | } 92 | } 93 | 94 | /** 95 | * Retrieves a dataset or creates it if it doesn't exist. 96 | * @param project Project ID 97 | * @param dataset Dataset ID 98 | * @param build Function to configure the dataset to be created if it doesn't exist yet 99 | * @return BigQuery [[Dataset]] 100 | */ 101 | private def getOrCreateDataset(project: String, dataset: String) 102 | (build: DatasetInfo.Builder => DatasetInfo.Builder): Dataset = { 103 | val datasetId = DatasetId.of(project, dataset) 104 | Option(bigquery.getDataset(datasetId)).getOrElse { 105 | logger.info(s"Creating dataset $dataset in project $project") 106 | 107 | val datasetBuilder = DatasetInfo.newBuilder(datasetId) 108 | // New datasets are always created in the configured location 109 | val datasetInfo = build(datasetBuilder).setLocation(config.location).build() 110 | bigquery.create(datasetInfo) 111 | } 112 | } 113 | 114 | /** 115 | * Retrieve or create staging dataset which hosts temporary SQL query result tables. 116 | * @return Staging dataset ID 117 | */ 118 | private def getOrCreateStagingDataset(): DatasetId = { 119 | import config._ 120 | 121 | val ds = getOrCreateDataset(project, stagingDataset.name + "_" + location) { builder => 122 | builder 123 | .setDefaultTableLifetime(stagingDataset.lifetime) 124 | .setDescription(StagingDatasetConfig.DESCRIPTION) 125 | } 126 | 127 | ds.getDatasetId 128 | } 129 | 130 | /** 131 | * Creates a temporary name that can be used for temporary tables and directories. 132 | */ 133 | private def createTempName(): String = { 134 | TEMP_PREFIX + TIMESTAMP_FORMATTER.format(Instant.now()) + "_" + Random.nextInt(Int.MaxValue) 135 | } 136 | 137 | /** 138 | * Creates a new (unique) reference to a temporary table which will contain the results of an executed SQL query. 
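 *
 * Illustrative example (annotation added to this listing, not part of the original source; the concrete
 * names are hypothetical): with a staging dataset named "spark_staging" and location "EU", a generated
 * reference might be "spark_staging_EU.spark_20180601120000_123456789" within the configured project,
 * following [[getOrCreateStagingDataset]] and [[createTempName]].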
139 | * @return BigQuery table reference 140 | */ 141 | private def createTemporaryTableReference(): BigQueryTableReference = { 142 | val stagingDataset = getOrCreateStagingDataset() 143 | val tempTableName = createTempName() 144 | 145 | BigQueryTableReference(stagingDataset.getProject, stagingDataset.getDataset, tempTableName) 146 | } 147 | 148 | /** 149 | * Retrieves the Spark schema for a BigQuery table 150 | * @param table BigQuery table reference 151 | * @return Spark schema 152 | */ 153 | def getSchema(table: BigQueryTableReference): StructType = { 154 | val schema = bigquery.getTable(table).getDefinition[TableDefinition].getSchema 155 | BigQuerySchemaConverter.fromBigQueryToSpark(schema) 156 | } 157 | 158 | /** 159 | * Gets a BigQuery table reader that can be used to read a BigQuery table through a number of pages/partitions. 160 | * @param table BigQuery table reference 161 | * @param numPartitions Suggested number of target partitions. The effective number of partitions may be different. 162 | * @return BigQuery table reader 163 | */ 164 | def getTable(table: BigQueryTableReference, numPartitions: Int): BigQueryTableReader = { 165 | val tbl = bigquery.getTable(table) 166 | BigQueryTableReader(tbl, tbl.list().getTotalRows, numPartitions) 167 | } 168 | 169 | /** 170 | * Deletes a BigQuery table. 171 | * @param table BigQuery table reference 172 | * @return True if the table was deleted and false if the table was not found 173 | */ 174 | def deleteTable(table: BigQueryTableReference): Boolean = { 175 | logger.info(s"Deleting table $table") 176 | bigquery.getTable(table).delete() 177 | } 178 | 179 | /** 180 | * Deletes a BigQuery dataset and its contents. 181 | * @param project Project ID 182 | * @param dataset Dataset ID 183 | * @return True if the dataset was deleted, false if it was not found. 184 | */ 185 | def deleteDataset(project: String, dataset: String): Boolean = { 186 | logger.info(s"Deleting dataset $dataset in project $project") 187 | bigquery.delete(DatasetId.of(project, dataset), DatasetDeleteOption.deleteContents()) 188 | } 189 | 190 | /** 191 | * Executes a BigQuery standard SQL query and returns a BigQuery table reader to retrieve the results. 192 | * @param query BigQuery standard SQL (SQL-2011) query 193 | * @param numPartitions Number of target partitions 194 | * @return BigQuery table reader 195 | */ 196 | def executeQuery(query: String, numPartitions: Int): BigQueryTableReader = { 197 | sqlLogger.logSqlQuery(query) 198 | val tempTable = createTemporaryTableReference() 199 | 200 | val queryJobConfiguration = 201 | QueryJobConfiguration.newBuilder(query) 202 | .setUseLegacySql(false) 203 | .setAllowLargeResults(true) 204 | .setFlattenResults(false) 205 | .setPriority(config.job.priority.underlying) 206 | .setDestinationTable(tempTable) 207 | .setCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) 208 | .setWriteDisposition(WriteDisposition.WRITE_EMPTY) 209 | .build() 210 | 211 | val totalRows = bigquery.query(queryJobConfiguration).getTotalRows 212 | val tbl = bigquery.getTable(tempTable) 213 | BigQueryTableReader(tbl, totalRows, numPartitions) 214 | } 215 | 216 | /** 217 | * Inserts the rows of a Spark [[DataFrame]] into a BigQuery table. 
218 | * @param df Spark [[DataFrame]] 219 | * @param table BigQuery table 220 | */ 221 | private def insertRows(df: DataFrame, table: Table): Unit = { 222 | // Getting a stable reference to the schema for serialization with the following closure 223 | val schema = df.schema 224 | 225 | df foreachPartition { rows => 226 | if(rows.nonEmpty) { 227 | val converter = BigQuerySchemaConverter.getSparkToBigQueryConverterFunction(schema) 228 | val rowsToInsert = rows.map(row => RowToInsert.of(converter(row))).toIterable.asJava 229 | val response = table.insert(rowsToInsert, false, false) 230 | 231 | if (response.hasErrors) { 232 | val msg = response.getInsertErrors.asScala.values.flatMap(_.asScala.map(_.getMessage)).toSet.mkString("\n") 233 | throw new IOException(msg) 234 | } 235 | } 236 | } 237 | } 238 | 239 | /** 240 | * Writes a Spark [[DataFrame]] to a BigQuery table. 241 | * @param df Spark [[DataFrame]] 242 | * @param table Target BigQuery table 243 | * @param mode Save mode 244 | */ 245 | def writeTable(df: DataFrame, table: BigQueryTableReference, mode: SaveMode): Unit = { 246 | import SaveMode._ 247 | 248 | logger.info(s"Attempting to insert ${df.count()} rows to table $table" + 249 | s" (mode: $mode, partitions: ${df.rdd.getNumPartitions})") 250 | 251 | val ds = getOrCreateDataset(table.project, table.dataset)(identity) 252 | 253 | val schema = BigQuerySchemaConverter.fromSparkToBigQuery(df.schema) 254 | 255 | mode match { 256 | case Append => 257 | val tbl = ds.getOrCreateTable(table.table, schema) 258 | insertRows(df, tbl) 259 | 260 | case Overwrite => 261 | val tbl = ds.dropAndCreateTable(table.table, schema) 262 | insertRows(df, tbl) 263 | 264 | case ErrorIfExists => 265 | if(ds.existsNonEmptyTable(table.table)) { 266 | throw new IllegalStateException(s"Table $table already exists and is not empty") 267 | } else { 268 | val tbl = ds.getOrCreateTable(table.table, schema) 269 | insertRows(df, tbl) 270 | } 271 | 272 | case Ignore => 273 | if(!ds.existsNonEmptyTable(table.table)) { 274 | val tbl = ds.getOrCreateTable(table.table, schema) 275 | insertRows(df, tbl) 276 | } 277 | } 278 | } 279 | 280 | /** 281 | * Constructs a Google Cloud Storage (GCS) staging directory path that can be used to stage data files for data 282 | * import and export. 283 | * @return GCS directory path 284 | */ 285 | def getStagingDirectory(): String = { 286 | import config.stagingDataset._ 287 | 288 | val tempDirectoryName = createTempName() 289 | s"gs://$gcsBucket/$name/$tempDirectoryName" 290 | } 291 | 292 | /** 293 | * Exports a BigQuery table as a series of files to a temporary directory in a Google Cloud Storage (GCS) bucket. 
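 * The destination URI ends in a wildcard pattern of the form `<table>_*.<extension>`, which allows BigQuery
 * to split large tables across multiple extract files in the staging directory.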
294 | * @param table BigQuery table to export 295 | * @param format File export format 296 | * @return Temporary GCS staging directory containing the exported files in the specified format 297 | * @see [[https://cloud.google.com/bigquery/docs/exporting-data]] 298 | */ 299 | def exportTable(table: BigQueryTableReference, format: FileFormat): String = { 300 | if(UNSUPPORTED_BIGQUERY_EXPORT_FORMATS.contains(format)) { 301 | throw new UnsupportedFormatException(s"Unsupported BigQuery export format: $format") 302 | } 303 | 304 | val stagingDirectory = getStagingDirectory() 305 | val destinationUri = s"$stagingDirectory/${table.table}_*.${format.fileExtension}" 306 | 307 | logger.info(s"Starting export of table $table to $destinationUri (format: $format)") 308 | val job = bigquery.getTable(table).extract(format.bigQueryFormatOptions.getType, destinationUri) 309 | waitForJob(job) 310 | logger.info(s"Done exporting table $table") 311 | 312 | stagingDirectory 313 | } 314 | 315 | /** 316 | * Imports data from a Google Cloud Storage (GCS) directory into a BigQuery table. 317 | * @param path GCS directory path 318 | * @param format File format 319 | * @param table BigQuery table reference 320 | * @param mode Save mode 321 | */ 322 | def importTable(path: String, format: FileFormat, table: BigQueryTableReference, mode: SaveMode): Unit = { 323 | import SaveMode._ 324 | 325 | if(UNSUPPORTED_BIGQUERY_IMPORT_FORMATS.contains(format)) { 326 | throw new UnsupportedFormatException(s"Unsupported BigQuery import format: $format") 327 | } 328 | 329 | getOrCreateDataset(table.project, table.dataset)(identity) 330 | 331 | val baseConfig = LoadJobConfiguration.builder(table, path + s"*.${format.fileExtension}") 332 | .setAutodetect(true) 333 | .setIgnoreUnknownValues(false) 334 | .setMaxBadRecords(0) 335 | .setFormatOptions(format.bigQueryFormatOptions) 336 | .setCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) 337 | 338 | val (writeDisposition, ignoreDuplicateError) = mode match { 339 | case Append => 340 | (WriteDisposition.WRITE_APPEND, false) 341 | 342 | case Overwrite => 343 | (WriteDisposition.WRITE_TRUNCATE, false) 344 | 345 | case ErrorIfExists => 346 | (WriteDisposition.WRITE_EMPTY, false) 347 | 348 | case Ignore => 349 | (WriteDisposition.WRITE_EMPTY, true) 350 | } 351 | 352 | val jobInfo = JobInfo.of(baseConfig.setWriteDisposition(writeDisposition).build()) 353 | val job = bigquery.create(jobInfo) 354 | 355 | logger.info(s"Starting import into table $table from $path (format: $format, mode: $mode)") 356 | waitForJob(job, ignoreDuplicateError) 357 | logger.info(s"Done importing into table $table") 358 | } 359 | 360 | /** 361 | * Waits for completion of a job and check for errors. 
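 * Completion is polled with an exponential backoff starting at 1 second, growing by a factor of 1.2 and
 * capped at 30 seconds, until the configured job timeout is reached.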
362 | * @param job Job to wait for 363 | * @param ignoreDuplicateError Whether to ignore duplicate errors or not 364 | * @see [[https://cloud.google.com/bigquery/troubleshooting-errors]] 365 | */ 366 | private def waitForJob(job: Job, ignoreDuplicateError: Boolean = false): Unit = { 367 | val status = job.waitFor( 368 | RetryOption.initialRetryDelay(Duration.ofSeconds(1)), 369 | RetryOption.retryDelayMultiplier(1.2), 370 | RetryOption.maxRetryDelay(Duration.ofSeconds(30)), 371 | RetryOption.totalTimeout(Duration.ofMillis(config.job.timeout)) 372 | ).getStatus 373 | 374 | if(status.getError != null && (!ignoreDuplicateError || status.getError.getReason != "duplicate")) { 375 | throw new IOException(s"BigQuery job ${job.getJobId} failed with message: ${status.getError.getMessage}") 376 | } 377 | } 378 | } 379 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/client/BigQueryTableReader.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.client 23 | 24 | import com.google.cloud.bigquery.BigQuery.TableDataListOption 25 | import com.google.cloud.bigquery._ 26 | import com.miraisolutions.spark.bigquery.{BigQuerySchemaConverter, BigQueryTableReference} 27 | import org.apache.spark.sql.Row 28 | import org.slf4j.LoggerFactory 29 | 30 | import scala.collection.JavaConverters._ 31 | 32 | private[bigquery] object BigQueryTableReader { 33 | // Maximum number of rows per BigQuery page. See https://cloud.google.com/bigquery/docs/paging-results 34 | private val MAX_ROWS_PER_PAGE: Long = 100000L 35 | } 36 | 37 | /** 38 | * Table reader used to read a BigQuery table through a number of pages/partitions. 39 | * @param table BigQuery table 40 | * @param totalRows Total number of rows to read (across all pages) 41 | * @param suggestedNumPartitions Suggested number of target partitions. The effective number of partitions may 42 | * be different. 
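 * For example, with `totalRows = 250000` and `suggestedNumPartitions = 2`, the suggested page size of
 * 125000 rows is capped at the BigQuery maximum of 100000 rows per page, which yields 3 effective partitions.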
43 | * @see [[https://cloud.google.com/bigquery/docs/paging-results]] 44 | */ 45 | private[bigquery] case class BigQueryTableReader private (table: Table, totalRows: Long, suggestedNumPartitions: Int) { 46 | import BigQueryTableReader._ 47 | 48 | private val logger = LoggerFactory.getLogger(classOf[BigQueryTableReader]) 49 | 50 | // BigQuery => Spark schema converter 51 | private lazy val converter: FieldValueList => Row = { 52 | val schema = table.getDefinition[TableDefinition].getSchema 53 | BigQuerySchemaConverter.getBigQueryToSparkConverterFunction(schema) 54 | } 55 | 56 | // Page size to use when reading from BigQuery; note that there is a limit of 100k rows per page 57 | private val pageSize: Long = { 58 | val suggestedPageSize = (totalRows + suggestedNumPartitions - 1) / suggestedNumPartitions 59 | Math.min(Math.min(suggestedPageSize, MAX_ROWS_PER_PAGE), totalRows) 60 | } 61 | 62 | /** 63 | * The effective number of partitions. This may be different from the `suggestedNumPartitions`. 64 | */ 65 | def numPartitions: Int = Math.ceil(totalRows.toDouble / pageSize).toInt 66 | 67 | /** 68 | * Reads a page/partition of a specified size. 69 | * @param pageIndex Page index 70 | * @return BigQuery [[TableResult]] that can be used to iterate through the results 71 | */ 72 | private def getTableResult(pageIndex: Int): TableResult = { 73 | table.list( 74 | TableDataListOption.pageSize(pageSize), 75 | TableDataListOption.startIndex(pageSize * pageIndex) 76 | ) 77 | } 78 | 79 | /** 80 | * Get a row iterable for the specified partition. 81 | * @param partitionIndex Partition index 82 | * @return Row iterable 83 | */ 84 | def getRows(partitionIndex: Int): Iterable[Row] = { 85 | logger.info(s"Retrieving rows of table ${BigQueryTableReference(table.getTableId)} partition $partitionIndex" + 86 | s" (page size: $pageSize, total rows: $totalRows, partitions: $numPartitions)") 87 | 88 | val result = getTableResult(partitionIndex) 89 | result.getValues.asScala.map(converter) 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/client/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery 23 | 24 | import com.google.cloud.bigquery.BigQuery.TableDataListOption 25 | import com.google.cloud.bigquery._ 26 | import org.slf4j.LoggerFactory 27 | 28 | package object client { 29 | 30 | private val logger = LoggerFactory.getLogger(this.getClass.getName) 31 | 32 | /** 33 | * Some convenience methods on [[Dataset]] 34 | * @param ds BigQuery [[Dataset]] 35 | */ 36 | private[client] implicit class BigQueryDataset(val ds: Dataset) { 37 | 38 | private def fold[T](table: String)(ifNotExists: => T)(f: Table => T): T = { 39 | Option(ds.get(table)).fold(ifNotExists) { tbl => 40 | if(tbl.exists()) f(tbl) else ifNotExists 41 | } 42 | } 43 | 44 | def getOrCreateTable(table: String, schema: Schema): Table = { 45 | fold(table)(createTable(table, schema))(identity) 46 | } 47 | 48 | def existsTable(table: String): Boolean = { 49 | fold(table)(false)(_ => true) 50 | } 51 | 52 | def isNonEmptyTable(table: String): Boolean = { 53 | fold(table)(false)(_.list(TableDataListOption.pageSize(1)).getTotalRows > 0) 54 | } 55 | 56 | def existsNonEmptyTable(table: String): Boolean = { 57 | existsTable(table) && isNonEmptyTable(table) 58 | } 59 | 60 | def createTable(table: String, schema: Schema): Table = { 61 | logger.info(s"Creating table $table in dataset ${ds.getDatasetId.getDataset} " + 62 | s"of project ${ds.getDatasetId.getProject}") 63 | 64 | val tableDefinition = StandardTableDefinition.newBuilder() 65 | .setType(TableDefinition.Type.TABLE) 66 | .setSchema(schema) 67 | .build() 68 | 69 | ds.create(table, tableDefinition) 70 | } 71 | 72 | def dropTable(table: String): Unit = { 73 | fold(table)((): Unit) { table => 74 | logger.info(s"Deleting table $table in dataset ${ds.getDatasetId.getDataset} " + 75 | s"of project ${ds.getDatasetId.getProject}") 76 | table.delete() 77 | } 78 | } 79 | 80 | def dropAndCreateTable(table: String, schema: Schema): Table = { 81 | dropTable(table) 82 | createTable(table, schema) 83 | } 84 | } 85 | 86 | } 87 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/config/BigQueryConfig.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.config 23 | 24 | import com.miraisolutions.spark.bigquery.config.JobConfig.Priority 25 | 26 | object StagingDatasetConfig { 27 | private val namespace = "bq.staging_dataset." 28 | 29 | private[bigquery] val DESCRIPTION = "Spark BigQuery staging dataset" 30 | 31 | private[config] object Keys { 32 | val NAME = namespace + "name" 33 | val LIFETIME = namespace + "lifetime" 34 | val GCS_BUCKET = namespace + "gcs_bucket" 35 | } 36 | 37 | private[config] object Defaults { 38 | val NAME = "spark_staging" 39 | val LIFETIME = 86400000L 40 | } 41 | } 42 | 43 | /** 44 | * BigQuery staging dataset configuration. A staging dataset is used to temporarily store the results of SQL queries. 45 | * @param name Name of staging dataset 46 | * @param lifetime Default table lifetime in milliseconds. Tables are automatically deleted once the lifetime has 47 | * been reached. 48 | * @param gcsBucket Google Cloud Storage (GCS) bucket to use for storing temporary files. Temporary files are used 49 | * when importing through BigQuery load jobs and exporting through BigQuery extraction jobs. 50 | * @see [[https://cloud.google.com/bigquery/docs/dataset-locations]] 51 | */ 52 | case class StagingDatasetConfig( 53 | name: String = StagingDatasetConfig.Defaults.NAME, 54 | lifetime: Long = StagingDatasetConfig.Defaults.LIFETIME, 55 | gcsBucket: String 56 | ) 57 | 58 | 59 | object JobConfig { 60 | import com.google.cloud.bigquery.QueryJobConfiguration.{Priority => BQPriority} 61 | 62 | private val namespace = "bq.job." 63 | 64 | /** BigQuery job priority */ 65 | sealed trait Priority { 66 | private[bigquery] def underlying: BQPriority 67 | } 68 | object Priority { 69 | 70 | /** 71 | * BigQuery interactive priority. Runs jobs as soon as possible. Interactive queries count towards the 72 | * concurrent rate limit and the daily limit. 73 | * 74 | * @see https://cloud.google.com/bigquery/docs/running-queries 75 | * @see https://cloud.google.com/bigquery/quotas 76 | */ 77 | case object Interactive extends Priority { 78 | override private[bigquery] def underlying: BQPriority = BQPriority.INTERACTIVE 79 | } 80 | 81 | /** 82 | * BigQuery batch priority. Jobs start as soon as idle resources are available, usually within a few minutes. 83 | * Batch queries don't count towards the concurrent rate limit and the daily limit. 84 | * 85 | * @see https://cloud.google.com/bigquery/docs/running-queries 86 | */ 87 | case object Batch extends Priority { 88 | override private[bigquery] def underlying: BQPriority = BQPriority.BATCH 89 | } 90 | 91 | private[config] def parse(s: String): Priority = s.toLowerCase match { 92 | case "interactive" => Interactive 93 | case "batch" => Batch 94 | case _ => throw new IllegalArgumentException("Invalid priority: " + s) 95 | } 96 | } 97 | 98 | private[config] object Keys { 99 | val PRIORITY = namespace + "priority" 100 | val TIMEOUT = namespace + "timeout" 101 | } 102 | 103 | private[config] object Defaults { 104 | val PRIORITY = Priority.Interactive 105 | val TIMEOUT = 3600000L 106 | } 107 | } 108 | 109 | /** 110 | * BigQuery job configuration options. 111 | * @param priority BigQuery job priority when executing SQL queries. Defaults to "interactive", i.e. the 112 | * query is executed as soon as possible. 113 | * @param timeout Timeout in milliseconds after which a file import/export job should be considered as failed. 114 | * Defaults to 3600000 ms = 1 h. 
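 * A minimal construction sketch (the batch priority and 30-minute timeout below are illustrative values,
 * not recommendations):
 * {{{
 *   val jobConfig = JobConfig(priority = JobConfig.Priority.Batch, timeout = 1800000L) // 30 minutes
 * }}}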
115 | * @see [[https://cloud.google.com/bigquery/quota-policy]] 116 | */ 117 | case class JobConfig( 118 | priority: Priority = JobConfig.Defaults.PRIORITY, 119 | timeout: Long = JobConfig.Defaults.TIMEOUT 120 | ) 121 | 122 | 123 | object BigQueryConfig { 124 | private val namespace = "bq." 125 | 126 | private[config] object Keys { 127 | val PROJECT = namespace + "project" 128 | val LOCATION = namespace + "location" 129 | val SERVICE_ACCOUNT_KEY_FILE = namespace + "service_account_key_file" 130 | } 131 | 132 | /** 133 | * Constructs typed BigQuery configuration options from a parameter map. 134 | * @param parameters Parameter map 135 | */ 136 | def apply(parameters: Map[String, String]): BigQueryConfig = { 137 | val project = parameters(Keys.PROJECT) 138 | val location = parameters(Keys.LOCATION) 139 | val serviceAccountKeyFile = parameters.get(Keys.SERVICE_ACCOUNT_KEY_FILE) 140 | 141 | val stagingDataset = StagingDatasetConfig( 142 | name = parameters.getOrElse(StagingDatasetConfig.Keys.NAME, StagingDatasetConfig.Defaults.NAME), 143 | lifetime = parameters.get(StagingDatasetConfig.Keys.LIFETIME).map(_.toLong) 144 | .getOrElse(StagingDatasetConfig.Defaults.LIFETIME), 145 | gcsBucket = parameters(StagingDatasetConfig.Keys.GCS_BUCKET) 146 | ) 147 | 148 | val job = JobConfig( 149 | priority = parameters.get(JobConfig.Keys.PRIORITY).map(Priority.parse).getOrElse(JobConfig.Defaults.PRIORITY), 150 | timeout = parameters.get(JobConfig.Keys.TIMEOUT).map(_.toLong).getOrElse(JobConfig.Defaults.TIMEOUT) 151 | ) 152 | 153 | BigQueryConfig(project, location , serviceAccountKeyFile, stagingDataset, job) 154 | } 155 | } 156 | 157 | /** 158 | * BigQuery configuration. 159 | * 160 | * @param project BigQuery billing project ID. 161 | * @param location Geographic location where newly created datasets should reside. "EU" or "US". 162 | * This holds for new datasets that are being created as part of a Spark write operation 163 | * and for temporary staging datasets. 164 | * @param serviceAccountKeyFile Optional Google Cloud service account key file to use for authentication with Google 165 | * Cloud services. The use of service accounts is highly recommended. Specifically, the 166 | * service account will be used to interact with BigQuery and Google Cloud Storage (GCS). 167 | * If not specified, application default credentials will be used. 168 | * @param stagingDataset BigQuery staging dataset configuration options. 169 | * @param job BigQuery job configuration options. 
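 * A minimal construction sketch (the project ID, location and bucket name below are illustrative):
 * {{{
 *   val config = BigQueryConfig(
 *     project = "my-billing-project",
 *     location = "EU",
 *     stagingDataset = StagingDatasetConfig(gcsBucket = "my-staging-bucket")
 *   )
 * }}}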
170 | * @see [[https://cloud.google.com/bigquery/pricing]] 171 | * @see [[https://cloud.google.com/bigquery/docs/dataset-locations]] 172 | * @see [[https://cloud.google.com/docs/authentication/]] 173 | * @see [[https://cloud.google.com/bigquery/docs/authentication/]] 174 | * @see [[https://cloud.google.com/storage/docs/authentication/]] 175 | */ 176 | case class BigQueryConfig( 177 | project: String, 178 | location: String, 179 | serviceAccountKeyFile: Option[String] = None, 180 | stagingDataset: StagingDatasetConfig, 181 | job: JobConfig = JobConfig() 182 | ) 183 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/config/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery 23 | 24 | import org.apache.spark.sql.{DataFrameReader, DataFrameWriter, Row} 25 | import scala.language.reflectiveCalls 26 | 27 | package object config { 28 | 29 | // Structural helper type 30 | private type OPT[T] = { 31 | def format(source: String): T 32 | def option(key: String, value: String): T 33 | } 34 | 35 | // Applies format and configuration options on a DataFrameReader or DataFrameWriter 36 | private[bigquery] def applyDataFrameOptions[T <: OPT[T]](obj: T, config: BigQueryConfig): T = { 37 | import config._ 38 | 39 | val objWithOptions = 40 | obj 41 | .format(DefaultSource.BIGQUERY_DATA_SOURCE_NAME) 42 | .option(BigQueryConfig.Keys.PROJECT, project) 43 | .option(BigQueryConfig.Keys.LOCATION, location) 44 | .option(StagingDatasetConfig.Keys.NAME, stagingDataset.name) 45 | .option(StagingDatasetConfig.Keys.LIFETIME, stagingDataset.lifetime.toString) 46 | .option(StagingDatasetConfig.Keys.GCS_BUCKET, stagingDataset.gcsBucket) 47 | .option(JobConfig.Keys.PRIORITY, job.priority.toString) 48 | 49 | serviceAccountKeyFile.fold(objWithOptions) { file => 50 | objWithOptions.option(BigQueryConfig.Keys.SERVICE_ACCOUNT_KEY_FILE, file) 51 | } 52 | } 53 | 54 | implicit class DataFrameReaderConfig(val reader: DataFrameReader) extends AnyVal { 55 | /** 56 | * Utility method to apply typed BigQuery configuration. 
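 * A usage sketch (requires `import com.miraisolutions.spark.bigquery.config._`; the table and options mirror
 * the Shakespeare example in this project):
 * {{{
 *   val df = spark.read
 *     .bigquery(config)
 *     .option("table", "bigquery-public-data.samples.shakespeare")
 *     .option("type", "direct")
 *     .load()
 * }}}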
57 | * @param config BigQuery configuration 58 | */ 59 | def bigquery(config: BigQueryConfig): DataFrameReader = applyDataFrameOptions(reader, config) 60 | } 61 | 62 | implicit class DataFrameWriterConfig(val writer: DataFrameWriter[Row]) extends AnyVal { 63 | /** 64 | * Utility method to apply typed BigQuery configuration. 65 | * @param config BigQuery configuration 66 | */ 67 | def bigquery(config: BigQueryConfig): DataFrameWriter[Row] = applyDataFrameOptions(writer, config) 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/examples/Shakespeare.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.examples 23 | 24 | import org.apache.spark.sql.{SaveMode, SparkSession} 25 | import com.miraisolutions.spark.bigquery.config._ 26 | 27 | /** 28 | * Reads the public Google BigQuery sample dataset 'shakespeare'. 29 | * 30 | * To run this example, first compile an assembly using `sbt assembly`. Then run: 31 | * 32 | * ==Local Spark Cluster== 33 | * `spark-submit --class com.miraisolutions.spark.bigquery.examples.Shakespeare --master local[*] 34 | * target/scala-2.11/spark-bigquery-assembly-<version>.jar <arguments>` 35 | * 36 | * ==Google Cloud Dataproc== 37 | * Log in to the service account: 38 | * `gcloud auth activate-service-account --key-file=[KEY-FILE]` 39 | * 40 | * @see [[https://cloud.google.com/storage/docs/authentication#service_accounts]] 41 | * 42 | * `gcloud dataproc jobs submit spark --cluster <cluster-name> --class 43 | * com.miraisolutions.spark.bigquery.examples.Shakespeare --jars 44 | * target/scala-2.11/spark-bigquery-assembly-<version>.jar -- <arguments>` 45 | * 46 | * Where `<arguments>` are: 47 | * 1. Google BigQuery billing project ID 48 | * 2. Google BigQuery dataset location (EU, US) 49 | * 3. Google Cloud Storage (GCS) bucket where staging files will be located 50 | * 4.
Google Cloud service account key file (works only when run against local cluster) 51 | * 52 | * @see [[https://cloud.google.com/bigquery/public-data/]] 53 | * @see [[https://cloud.google.com/bigquery/docs/dataset-locations]] 54 | * @see [[https://cloud.google.com/storage/docs/authentication#service_accounts]] 55 | * @see [[https://cloud.google.com/dataproc/]] 56 | */ 57 | object Shakespeare { 58 | def main(args: Array[String]): Unit = { 59 | 60 | // Initialize Spark session 61 | val spark = SparkSession 62 | .builder 63 | .appName("Google BigQuery Shakespeare") 64 | .getOrCreate 65 | 66 | import spark.implicits._ 67 | 68 | // Define BigQuery options 69 | val config = BigQueryConfig( 70 | project = args(0), // Google BigQuery billing project ID 71 | location = args(1), // Google BigQuery dataset location 72 | stagingDataset = StagingDatasetConfig( 73 | gcsBucket = args(2) // Google Cloud Storage bucket for staging files 74 | ), 75 | // Google Cloud service account key file - works only in local cluster mode 76 | serviceAccountKeyFile = if(args.length > 3) Some(args(3)) else None 77 | ) 78 | 79 | // Read public shakespeare data table using direct import (streaming) 80 | val shakespeare = spark.read 81 | .bigquery(config) 82 | .option("table", "bigquery-public-data.samples.shakespeare") 83 | .option("type", "direct") 84 | .load() 85 | 86 | val hamlet = shakespeare.filter($"corpus".like("hamlet")) 87 | hamlet.show(100) 88 | 89 | shakespeare.createOrReplaceTempView("shakespeare") 90 | val macbeth = spark.sql("SELECT * FROM shakespeare WHERE corpus = 'macbeth'").persist() 91 | macbeth.show(100) 92 | 93 | // Write filtered data table via a Parquet export on GCS 94 | macbeth.write 95 | .bigquery(config) 96 | .option("table", args(0) + ".samples.macbeth") 97 | .option("type", "parquet") 98 | .mode(SaveMode.Overwrite) 99 | .save() 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/exception/IOException.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.exception 23 | 24 | /** 25 | * Signals an error when trying to read from or write to BigQuery. 
26 | * @param message Exception message 27 | */ 28 | private[bigquery] class IOException(message: String) extends java.io.IOException(message) 29 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/exception/MissingParameterException.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.exception 23 | 24 | /** 25 | * Exception thrown in case of a missing parameter. 26 | * @param message Exception message 27 | */ 28 | private[bigquery] class MissingParameterException(message: String) extends Exception(message) 29 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/exception/ParseException.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.exception 23 | 24 | /** 25 | * Signals that an error occurred while parsing a string. 
26 | * @param message Exception message 27 | */ 28 | private[bigquery] class ParseException(message: String) extends Exception(message) 29 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/exception/UnsupportedFormatException.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.exception 23 | 24 | /** 25 | * Thrown to indicate that an unsupported BigQuery import/export format is being used. 26 | * @param message Exception message 27 | */ 28 | private[bigquery] class UnsupportedFormatException(message: String) extends UnsupportedOperationException(message) 29 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/sql/BigQueryDialect.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.sql 23 | 24 | import org.apache.spark.sql.jdbc.JdbcDialect 25 | 26 | /** 27 | * Google BigQuery standard SQL dialect (SQL-2011) 28 | * 29 | * @see [[https://cloud.google.com/bigquery/docs/reference/standard-sql/]] 30 | */ 31 | private case object BigQueryDialect extends JdbcDialect { 32 | 33 | override def canHandle(url: String): Boolean = false 34 | 35 | override def quoteIdentifier(colName: String): String = s"`$colName`" 36 | 37 | override def getTableExistsQuery(table: String): String = s"SELECT 1 FROM $table LIMIT 1" 38 | 39 | override def getSchemaQuery(table: String): String = s"SELECT * FROM $table LIMIT 1" 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/sql/BigQuerySqlGeneration.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.sql 23 | 24 | import com.miraisolutions.spark.bigquery.BigQueryTableReference 25 | import org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD 26 | import org.apache.spark.sql.sources.Filter 27 | 28 | 29 | /** 30 | * Google BigQuery SQL Generation 31 | * 32 | * @param table BigQuery table reference 33 | * @see [[https://cloud.google.com/bigquery/docs/reference/standard-sql/]] 34 | */ 35 | private[bigquery] case class BigQuerySqlGeneration(table: BigQueryTableReference) { 36 | 37 | // Generates column list for SELECT statement 38 | private def getColumnList(columns: Array[String]): String = { 39 | if(columns.isEmpty) { 40 | "1" 41 | } else { 42 | columns.map(BigQueryDialect.quoteIdentifier).mkString(",") 43 | } 44 | } 45 | 46 | // Generates the filter expressions in a WHERE clause 47 | private def getWhereClauseFilters(filters: Array[Filter]): String = { 48 | filters 49 | .flatMap(JDBCRDD.compileFilter(_, BigQueryDialect)) 50 | .map(p => s"($p)") 51 | .mkString(" AND ") 52 | } 53 | 54 | def getQuery(columns: Array[String]): String = { 55 | s"SELECT ${getColumnList(columns)} FROM $table" 56 | } 57 | 58 | def getQuery(columns: Array[String], filters: Array[Filter]): String = { 59 | val whereClause = if(filters.isEmpty) "" else " WHERE " + getWhereClauseFilters(filters) 60 | getQuery(columns) + whereClause 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/utils/DateTime.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.utils 23 | 24 | import java.util.TimeZone 25 | import java.sql.{Date, Timestamp} 26 | import java.time.{Instant, LocalDate, ZoneId} 27 | import java.time.format.DateTimeFormatter 28 | 29 | private[bigquery] object DateTime { 30 | 31 | private val UTC = ZoneId.of("UTC") 32 | private val DATE_FORMATTER = DateTimeFormatter.ISO_LOCAL_DATE.withZone(UTC) 33 | 34 | /** 35 | * Formats milliseconds since the epoch in the format 'yyyy-MM-dd'. 
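 * For example, `formatMillisSinceEpoch(0L)` returns "1970-01-01" and `formatMillisSinceEpoch(86400000L)`
 * returns "1970-01-02".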
36 | * @param millis Milliseconds since the epoch 37 | * @return Date string of the form 'yyyy-MM-dd' 38 | */ 39 | def formatMillisSinceEpoch(millis: Long): String = { 40 | val instant = Instant.ofEpochMilli(millis) 41 | DATE_FORMATTER.format(instant) 42 | } 43 | 44 | /** 45 | * Formats a [[java.sql.Date]] returned by Spark using the format 'yyyy-MM-dd'. 46 | * @param date Date value from Spark 47 | * @return Date string of the form 'yyyy-MM-dd' 48 | * @note Spark generally seems to be using local timezone 49 | * @see [[https://issues.apache.org/jira/browse/SPARK-18350]] 50 | * @see [[https://groups.google.com/a/lists.datastax.com/forum/#!topic/spark-connector-user/Uv9UoFjA9SU]] 51 | */ 52 | def formatSparkDate(date: Date): String = { 53 | formatMillisSinceEpoch(date.getTime + TimeZone.getDefault.getOffset(date.getTime)) 54 | } 55 | 56 | /** 57 | * Parses a string of the form 'yyyy-MM-dd' to a [[java.sql.Date]]. 58 | * @param s String of the form 'yyyy-MM-dd' 59 | * @return Date 60 | */ 61 | def parseDate(s: String): Date = { 62 | val localDate = LocalDate.parse(s, DATE_FORMATTER) 63 | new Date(localDate.atStartOfDay.atZone(UTC).toInstant.toEpochMilli) 64 | } 65 | 66 | /** 67 | * Creates a [[java.sql.Timestamp]] from microseconds since the epoch as returned by the BigQuery API. 68 | * @param m Microseconds since the epoch 69 | * @return Timestamp 70 | */ 71 | def epochMicrosToTimestamp(m: Long): Timestamp = { 72 | val ts = new Timestamp(0) 73 | val nanos = (m % 1000000L).toInt * 1000 74 | 75 | if(nanos < 0) { 76 | ts.setTime((m / 1000000L - 1L) * 1000L) 77 | ts.setNanos(nanos + 1000000000) 78 | } else { 79 | ts.setTime(m / 1000000L * 1000L) 80 | ts.setNanos(nanos) 81 | } 82 | 83 | ts 84 | } 85 | 86 | /** 87 | * Converts a [[java.sql.Timestamp]] to the number of (fractional) seconds since the epoch. 88 | * @param ts Timestamp 89 | * @return Seconds since the epoch 90 | */ 91 | def timestampToEpochSeconds(ts: Timestamp): Double = { 92 | ts.getTime / 1000L + ts.getNanos.toDouble / 1e9 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/utils/Files.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.utils 23 | 24 | import org.apache.hadoop.conf.Configuration 25 | import org.apache.hadoop.fs.{FileSystem, Path} 26 | 27 | /** 28 | * File utilities. 29 | */ 30 | private[bigquery] object Files { 31 | 32 | /** 33 | * Returns a Hadoop filesystem and path for the provided path. 34 | * @param path File or directory path 35 | * @param conf Hadoop configuration 36 | * @return Hadoop filesystem and path 37 | */ 38 | private def getFsAndPath(path: String, conf: Configuration): (FileSystem, Path) = { 39 | val p = new Path(path) 40 | val fs = FileSystem.get(p.toUri, conf) 41 | (fs, p) 42 | } 43 | 44 | /** 45 | * Deletes the specified path recursively. 46 | * @param path Path to delete 47 | * @param conf Hadoop configuration 48 | */ 49 | def delete(path: String, conf: Configuration): Unit = { 50 | val (fs, p) = getFsAndPath(path, conf) 51 | fs.delete(p, true) 52 | } 53 | 54 | /** 55 | * Registers the specified path for deletion when the underlying filesystem is being closed. 56 | * @param path Path to delete 57 | * @param conf Hadoop configuration 58 | */ 59 | def deleteOnExit(path: String, conf: Configuration): Unit = { 60 | val (fs, p) = getFsAndPath(path, conf) 61 | fs.deleteOnExit(p) 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/utils/SqlLogger.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.utils 23 | 24 | import org.slf4j.Logger 25 | 26 | /** 27 | * SQL logging wrapper for logging SQL queries 28 | * @param logger Slf4j logger to use for logging 29 | */ 30 | private[bigquery] case class SqlLogger(logger: Logger) { 31 | def logSqlQuery(sqlQuery: String): Unit = { 32 | logger.info("Executing SQL query: " + sqlQuery.replaceAllLiterally("\n", "")) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/utils/format/FormatConverter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.utils.format 23 | 24 | import org.apache.spark.sql.types.{StructField, StructType} 25 | import org.apache.spark.sql.DataFrame 26 | 27 | /** 28 | * Provides format conversion utility methods. 29 | */ 30 | object FormatConverter { 31 | 32 | /** 33 | * Transforms a data frame by applying a list of column converters. Converters are applied in the order specified. 34 | * Columns not matching any of the converter's domains remain unchanged. 
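 * A usage sketch (assumes `df` contains Parquet-style LIST/MAP columns; the converters are those defined in
 * this package):
 * {{{
 *   val converted = FormatConverter.transform(df, List(Parquet.parquetListToArray, Parquet.parquetMapToMap))
 * }}}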
35 | * @param df Input data frame 36 | * @param converters List of column converters to apply 37 | * @return Output data frame 38 | */ 39 | def transform(df: DataFrame, converters: List[ColumnConverter]): DataFrame = { 40 | val convert = converters.reduce(_ orElse _) 41 | val transformed = df.schema.fields.foldRight((df, List.empty[StructField])) { case (field, (aggDf, aggFields)) => 42 | if(convert.isDefinedAt(field)) { 43 | val (newField, converterFunction) = convert(field) 44 | val newDf = aggDf.withColumn(field.name, converterFunction(aggDf.col(field.name))) 45 | (newDf, newField :: aggFields) 46 | 47 | } else { 48 | // leave column unchanged 49 | (aggDf, field :: aggFields) 50 | } 51 | } 52 | val (newDf, newFields) = transformed 53 | df.sqlContext.createDataFrame(newDf.rdd, StructType(newFields)) 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/utils/format/Generic.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.utils.format 23 | 24 | import com.miraisolutions.spark.bigquery.BigQuerySchemaConverter 25 | import org.apache.spark.sql.Row 26 | import org.apache.spark.sql.expressions.UserDefinedFunction 27 | import org.apache.spark.sql.functions.udf 28 | import org.apache.spark.sql.types.{ArrayType, MapType, StructField, StructType} 29 | import scala.collection.mutable.WrappedArray 30 | 31 | /** 32 | * Generic format converters. 33 | */ 34 | object Generic { 35 | import BigQuerySchemaConverter._ 36 | 37 | /** 38 | * Creates a Spark UDF to convert an array of key-value structs to a Spark map column. 39 | * @param mapType Resulting map type 40 | * @return Spark UDF 41 | */ 42 | private def keyValueRecordToMapUdf(mapType: MapType): UserDefinedFunction = udf((kvMap: WrappedArray[Row]) => { 43 | val kvPairs = kvMap map { kvRecord => 44 | val key = kvRecord.getAs[Any](KEY_FIELD_NAME) 45 | val value = kvRecord.getAs[Any](VALUE_FIELD_NAME) 46 | (key, value) 47 | } 48 | kvPairs.toMap 49 | }, mapType) 50 | 51 | /** 52 | * Transforms a BigQuery repeated record of key-value fields to a Spark map column. 
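 * The converter matches columns typed as a non-nullable array of key/value structs (the field names are
 * those defined in `BigQuerySchemaConverter`) and rewrites them to a Spark map column of the corresponding
 * key and value types.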
53 | */ 54 | val keyValueRecordToMap: ColumnConverter = { 55 | case StructField( 56 | name, 57 | ArrayType( 58 | StructType( 59 | Array( 60 | StructField(KEY_FIELD_NAME, keyType, false, _), // key field 61 | StructField(VALUE_FIELD_NAME, valueType, valueNullable, _) // value field 62 | ) 63 | ), 64 | false 65 | ), 66 | nullable, 67 | meta 68 | ) => 69 | val mapType = MapType(keyType, valueType, valueNullable) 70 | val newField = StructField(name, mapType, nullable, meta) 71 | (newField, keyValueRecordToMapUdf(mapType)(_)) 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/utils/format/Parquet.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.utils.format 23 | 24 | import org.apache.spark.sql.Row 25 | import org.apache.spark.sql.expressions.UserDefinedFunction 26 | import org.apache.spark.sql.functions.udf 27 | import org.apache.spark.sql.types.{ArrayType, MapType, StructField, StructType} 28 | 29 | /** 30 | * Apache Parquet format converters. 31 | * 32 | * @see [[https://github.com/apache/parquet-format/blob/master/LogicalTypes.md]] 33 | */ 34 | object Parquet { 35 | 36 | private val PARQUET_LIST_LIST_FIELD_NAME = "list" 37 | private val PARQUET_LIST_ELEMENT_FIELD_NAME = "element" 38 | private val PARQUET_MAP_KEYVALUE_FIELD_NAME = "key_value" 39 | private val PARQUET_MAP_KEY_FIELD_NAME = "key" 40 | private val PARQUET_MAP_VALUE_FIELD_NAME = "value" 41 | 42 | /** 43 | * Creates a Spark UDF to convert a Parquet-LIST-structured column to a Spark array column. 44 | * @param arrayType Resulting array type 45 | * @return Spark UDF 46 | */ 47 | private def parquetListToArrayUdf(arrayType: ArrayType): UserDefinedFunction = udf((row: Row) => { 48 | row.getAs[Seq[Row]](PARQUET_LIST_LIST_FIELD_NAME).map(_.getAs[Any](PARQUET_LIST_ELEMENT_FIELD_NAME)) 49 | }, arrayType) 50 | 51 | /** 52 | * Transforms a Parquet-LIST-structured column to a Spark array column. 
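 * The converter matches struct columns containing a single repeated `list` field whose entries wrap an
 * `element` field, the shape produced by Parquet's LIST encoding, and flattens them to a plain Spark array
 * column.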
53 | */ 54 | val parquetListToArray: ColumnConverter = { 55 | case StructField( 56 | name, 57 | StructType( 58 | Array( 59 | StructField( // Parquet: repeated group list 60 | PARQUET_LIST_LIST_FIELD_NAME, 61 | ArrayType( 62 | StructType( 63 | Array( 64 | // Parquet: element field 65 | StructField(PARQUET_LIST_ELEMENT_FIELD_NAME, elementType, elementNullable, _) 66 | ) 67 | ), 68 | false 69 | ), 70 | false, // repeated fields are not nullable 71 | _ 72 | ) 73 | ) 74 | ), 75 | nullable, 76 | meta 77 | ) => 78 | val arrayType = ArrayType(elementType, elementNullable) 79 | val newField = StructField(name, arrayType, nullable, meta) 80 | (newField, parquetListToArrayUdf(arrayType)(_)) 81 | } 82 | 83 | /** 84 | * Creates a Spark UDF to convert a Parquet-MAP-structured column to a Spark map column. 85 | * @param mapType Resulting map type 86 | * @return Spark UDF 87 | */ 88 | private def parquetMapToMapUdf(mapType: MapType): UserDefinedFunction = udf((row: Row) => { 89 | val kvPairs = row.getAs[Seq[Row]](PARQUET_MAP_KEYVALUE_FIELD_NAME) map { kv => 90 | val key = kv.getAs[Any](PARQUET_MAP_KEY_FIELD_NAME) 91 | val value = kv.getAs[Any](PARQUET_MAP_VALUE_FIELD_NAME) 92 | (key, value) 93 | } 94 | kvPairs.toMap 95 | }, mapType) 96 | 97 | /** 98 | * Transforms a Parquet-MAP-structured column to a Spark map column. 99 | */ 100 | val parquetMapToMap: ColumnConverter = { 101 | case StructField( 102 | name, 103 | StructType( 104 | Array( 105 | StructField( 106 | PARQUET_MAP_KEYVALUE_FIELD_NAME, 107 | ArrayType( 108 | StructType( 109 | Array( 110 | StructField(PARQUET_MAP_KEY_FIELD_NAME, keyType, false, _), 111 | StructField(PARQUET_MAP_VALUE_FIELD_NAME, valueType, valueNullable, _) 112 | ) 113 | ), 114 | false 115 | ), 116 | false, 117 | _ 118 | ) 119 | ) 120 | ), 121 | nullable, 122 | meta 123 | ) => 124 | val mapType = MapType(keyType, valueType, valueNullable) 125 | val newField = StructField(name, mapType, nullable, meta) 126 | (newField, parquetMapToMapUdf(mapType)(_)) 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /src/main/scala/com/miraisolutions/spark/bigquery/utils/format/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018 Mirai Solutions GmbH 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | * this software and associated documentation files (the "Software"), to deal in 6 | * the Software without restriction, including without limitation the rights to 7 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | * the Software, and to permit persons to whom the Software is furnished to do so, 9 | * subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in all 12 | * copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
20 | */ 21 | 22 | package com.miraisolutions.spark.bigquery.utils 23 | 24 | import org.apache.spark.sql.Column 25 | import org.apache.spark.sql.types.StructField 26 | 27 | package object format { 28 | 29 | /** 30 | * Generic converter partial function for converting data frame columns. 31 | * 32 | * The result of a converter is the new field definition/format and a column-to-column function to transform an 33 | * existing column into the specified format. 34 | */ 35 | type ColumnConverter = PartialFunction[StructField, (StructField, Column => Column)] 36 | 37 | } 38 | -------------------------------------------------------------------------------- /version.sbt: -------------------------------------------------------------------------------- 1 | version in ThisBuild := "0.1.1" 2 | --------------------------------------------------------------------------------
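
Usage note (illustrative, not part of the repository): the sketch below shows how the converters defined above might be combined through the `transform` method to normalize a DataFrame loaded from a Parquet extract. The object name `FormatConverter` is inferred from the file name `FormatConverter.scala`, and `ExtractNormalization` is a hypothetical wrapper; the converter values themselves (`parquetListToArray`, `parquetMapToMap`, `keyValueRecordToMap`) are the ones defined in the sources above.

import com.miraisolutions.spark.bigquery.utils.format.{FormatConverter, Generic, Parquet}
import org.apache.spark.sql.DataFrame

object ExtractNormalization {
  // Sketch: apply the column converters defined above so that Parquet LIST/MAP
  // groups and BigQuery repeated key/value records become native Spark array
  // and map columns. Assumes `transform` is a member of a `FormatConverter`
  // object (inferred from the file name, not confirmed by this excerpt).
  def normalize(df: DataFrame): DataFrame =
    FormatConverter.transform(
      df,
      List(
        Parquet.parquetListToArray,   // Parquet "list"/"element" group -> ArrayType
        Parquet.parquetMapToMap,      // Parquet "key_value" group      -> MapType
        Generic.keyValueRecordToMap   // repeated key/value RECORD      -> MapType
      )
    )
}

Since `transform` reduces the converter list with `orElse` and folds over the schema once, the first converter whose pattern matches a field is applied to that column and all non-matching columns are left unchanged.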
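
The `ColumnConverter` alias in the package object also allows plugging in project-specific conversions. As a purely hypothetical illustration (the field name `ingestion_time` and this converter are not part of the repository), a converter that reinterprets an epoch-microseconds `LongType` column as a Spark timestamp could look like this:

import com.miraisolutions.spark.bigquery.utils.format.ColumnConverter
import org.apache.spark.sql.Column
import org.apache.spark.sql.types.{LongType, StructField, TimestampType}

object CustomConverters {
  // Hypothetical converter: match a LongType column named "ingestion_time"
  // holding epoch microseconds and expose it as a TimestampType column.
  // Numeric-to-timestamp casts in Spark interpret the value as seconds,
  // hence the division by 1,000,000.
  val epochMicrosToTimestamp: ColumnConverter = {
    case StructField("ingestion_time", LongType, nullable, meta) =>
      val newField = StructField("ingestion_time", TimestampType, nullable, meta)
      (newField, (col: Column) => (col / 1000000L).cast(TimestampType))
  }
}

Such a converter could simply be appended to the list passed to `transform`.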