├── .github └── workflows │ ├── check.yml │ └── publish.yml ├── .gitignore ├── .travis.yml ├── CHANGELOG.md ├── Gemfile ├── LICENSE.txt ├── README.md ├── Rakefile ├── embulk-output-bigquery.gemspec ├── example ├── config_append_direct_schema_update_options.yml ├── config_client_options.yml ├── config_csv.yml ├── config_delete_in_advance.yml ├── config_delete_in_advance_field_partitioned_table.yml ├── config_delete_in_advance_partitioned_table.yml ├── config_destination_project.yml ├── config_expose_errors.yml ├── config_gcs.yml ├── config_guess_from_embulk_schema.yml ├── config_guess_with_column_options.yml ├── config_gzip.yml ├── config_jsonl.yml ├── config_max_threads.yml ├── config_min_ouput_tasks.yml ├── config_mode_append.yml ├── config_mode_append_direct.yml ├── config_nested_record.yml ├── config_payload_column.yml ├── config_payload_column_index.yml ├── config_progress_log_interval.yml ├── config_replace.yml ├── config_replace_backup.yml ├── config_replace_backup_field_partitioned_table.yml ├── config_replace_backup_partitioned_table.yml ├── config_replace_field_partitioned_table.yml ├── config_replace_field_range_partitioned_table.yml ├── config_replace_partitioned_table.yml ├── config_replace_schema_update_options.yml ├── config_skip_file_generation.yml ├── config_table_strftime.yml ├── config_template_table.yml ├── config_uncompressed.yml ├── config_with_rehearsal.yml ├── example.csv ├── example.yml ├── example2_1.csv ├── example2_2.csv ├── example4_1.csv ├── example4_2.csv ├── example4_3.csv ├── example4_4.csv ├── json_key.json ├── nested_example.jsonl ├── schema.json └── schema_expose_errors.json ├── lib └── embulk │ └── output │ ├── bigquery.rb │ └── bigquery │ ├── auth.rb │ ├── bigquery_client.rb │ ├── file_writer.rb │ ├── gcs_client.rb │ ├── google_client.rb │ ├── helper.rb │ └── value_converter_factory.rb └── test ├── helper.rb ├── test_bigquery_client.rb ├── test_configure.rb ├── test_example.rb ├── test_file_writer.rb ├── test_helper.rb ├── test_transaction.rb └── test_value_converter_factory.rb /.github/workflows/check.yml: -------------------------------------------------------------------------------- 1 | name: Check 2 | on: [ pull_request, push ] 3 | jobs: 4 | check: 5 | runs-on: ubuntu-latest 6 | # push: always run. 7 | # pull_request: run only when the PR is submitted from a forked repository, not within this repository. 8 | if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository 9 | strategy: 10 | matrix: 11 | jruby_version: 12 | - 9.3.15.0 13 | - 9.4.8.0 14 | fail-fast: false 15 | steps: 16 | - uses: actions/checkout@v4 17 | - name: Set up OpenJDK 8 18 | uses: actions/setup-java@v4 19 | with: 20 | java-version: 8 21 | distribution: "temurin" 22 | - name: download jruby 23 | run: "curl -L -o jruby.jar https://repo1.maven.org/maven2/org/jruby/jruby-complete/${{ matrix.jruby_version }}/jruby-complete-${{ matrix.jruby_version }}.jar" 24 | # 25 | # For avoiding permission denied. 
install gems into `gems` directory 26 | # 27 | - name: bundle install 28 | run: "env GEM_HOME=gems java -jar jruby.jar -S bundle install" 29 | 30 | - name: install embulk.jar 31 | run: "curl -L -o embulk.jar https://github.com/embulk/embulk/releases/download/v0.11.4/embulk-0.11.4.jar" 32 | - name: rake test 33 | run: 'env GEM_HOME=gems RUBYOPT="-r ./embulk.jar -r rubygems" java -jar jruby.jar -S bundle exec rake test' 34 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | on: 3 | push: 4 | tags: 5 | - "v0.*" 6 | jobs: 7 | publish: 8 | runs-on: ubuntu-latest 9 | environment: maven-central-and-ruby-gems 10 | strategy: 11 | fail-fast: true 12 | steps: 13 | - uses: actions/checkout@v4 14 | - name: Set up Ruby 15 | uses: ruby/setup-ruby@v1 16 | with: 17 | ruby-version: 3.3.0 18 | # get tag variable using {{ github.ref_name }} 19 | # 20 | # References: 21 | # * https://docs.github.com/en/actions/learn-github-actions/contexts#github-context 22 | # * https://docs.github.com/en/actions/learn-github-actions/variables#default-environment-variables 23 | - name: extract gem version from tag 24 | id: vars 25 | run: echo version=${{ github.ref_name }} | sed -e 's/v0/0/' >> $GITHUB_OUTPUT 26 | # 27 | # From gem push documents. 28 | # 29 | # The push command will use ~/.gem/credentials to authenticate to a server, 30 | # but you can use the RubyGems environment variable GEM_HOST_API_KEY 31 | # to set the api key to authenticate. 32 | # 33 | # https://guides.rubygems.org/command-reference/#gem-push 34 | # 35 | - name: Publish 36 | run: | 37 | rake build 38 | gem push pkg/${EMBULK_PLUGIN_NAME}-${{ steps.vars.outputs.version }}.gem 39 | env: 40 | EMBULK_PLUGIN_NAME: embulk-output-bigquery 41 | GEM_HOST_API_KEY: "${{secrets.RUBYGEMS_API_KEY}}" 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | /pkg/ 3 | /tmp/ 4 | /.bundle/ 5 | /Gemfile.lock 6 | vendor/ 7 | .ruby-version 8 | .tags 9 | your-project-000.json 10 | embulk.jar 11 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: ruby 2 | matrix: 3 | include: 4 | - env: EMBULK_VERSION=0.9.15 5 | rvm: jruby-9.1.15.0 # bundled jruby version 6 | jdk: openjdk8 # embulk 0.9.x uses jdk8 7 | - env: EMBULK_VERSION=latest 8 | rvm: jruby-9.1.15.0 # ? 9 | jdk: openjdk8 # ? 
10 | allow_failures: 11 | - env: EMBULK_VERSION=latest 12 | before_install: 13 | - curl -o embulk.jar --create-dirs -L "http://dl.embulk.org/embulk-${EMBULK_VERSION}.jar" 14 | - chmod a+x embulk.jar 15 | - BUNDLER_VERSION=$(echo "require 'bundler'; Bundler::VERSION" | ./embulk.jar irb | tail -n 2 | tr -d '"') 16 | - gem uninstall bundler -x 17 | - gem install bundler -v ${BUNDLER_VERSION} 18 | install: 19 | - ./embulk.jar bundle install --jobs=3 --retry=3 --path vendor/bundle 20 | script: 21 | - bundle exec env RUBYOPT="-r ./embulk.jar -r embulk -r embulk/java/bootstrap" rake test 22 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## 0.7.5 - 2025-05-13 2 | * [enhancement] Add range partitioning support (Thanks to kitagry) #174 3 | 4 | ## 0.7.4 - 2024-12-19 5 | * [maintenance] Primary location unless location is set explicitly (Thanks to joker1007) #172 6 | 7 | ## 0.7.3 - 2024-08-28 8 | * [enhancement] Add TIME type conversion to string converter (Thanks to p-eye) 9 | 10 | ## 0.7.2 - 2024-07-21 11 | * [maintenance] Fix GitHub Actions #166 12 | * [maintenance] Fix gcs_client in order to load data using gcs_bucket parameter (Thanks to kashira202111) #164 13 | * [maintenance] Prevent creating unnecessary tables. (Thanks to kashira202111) #148 14 | 15 | ## 0.7.1 - 2024-03-4 16 | * [enhancement] Support description of columns and tables (Thanks to @kyoshidajp and @fagai ) #142 17 | * [maintenance] Add missing GitHub Actions environment setting. #160 18 | * [maintenance] Replace google-api-client with specific Google APIs (Thanks to @Nozomuts) #161 19 | * [maintenance] Update GitHub Actions use checkout@v4 and setup-java@v4 #162 20 | 21 | ## 0.7.0 - 2024-02-1 22 | * [enhancement] Add support Embulk 0.11.x 23 | 24 | ## 0.6.9 - 2023-03-16 25 | * [enhancement] Add SSLException to retry job (thanks to @mzumi) 26 | 27 | ## 0.6.8 - 2022-10-12 28 | * [enhancement] Support JSON type (thanks to @civitaspo ) 29 | * [maintenance] Add an error message in order to retry (thanks to @mzumi) 30 | 31 | ## 0.6.7 - 2021-09-10 32 | * [enhancement] Add an expiration option of temporary table to clean up (thanks to @TKNGUE) 33 | 34 | ## 0.6.6 - 2021-06-10 35 | 36 | * [maintenance] Fix network retry function (thanks to @case-k-git) 37 | * [enhancement] Allow to specify the billing project and the project to which the data will be loaded separately (thanks to @ck-fm0211) 38 | * [enhancement] Include original error message on json parse error (thanks to @k-yomo) 39 | 40 | ## 0.6.5 - 2021-06-10 41 | * [maintenance] Fix failed tests (thanks to @kyoshidajp) 42 | * [maintenance] Lock representable version for avoiding requiring Ruby 2.4 (thanks to @hiroyuki-sato) 43 | 44 | ## 0.6.4 - 2019-11-06 45 | 46 | * [enhancement] Add DATETIME type conveter (thanks to @kekekenta) 47 | 48 | ## 0.6.3 - 2019-10-28 49 | 50 | * [enhancement] Add DATE type conveter (thanks to @tksfjt1024) 51 | 52 | ## 0.6.2 - 2019-10-16 53 | 54 | * [maintenance] Lock signet and google-api-client version (thanks to @hiroyuki-sato) 55 | 56 | ## 0.6.1 - 2019-08-28 57 | 58 | * [maintenance] Release a new gem not to include symlinks to make it work on Windows. 
59 | 60 | ## 0.6.0 - 2019-08-11 61 | 62 | Cleanup `auth_method`: 63 | 64 | * [enhancement] Support `auth_method: authorized_user` (OAuth) 65 | * [incompatibility change] Rename `auth_method: json_key` to `auth_method: service_account` (`json_key` is kept for backward compatibility) 66 | * [incompatibility change] Remove deprecated `auth_method: private_key` (p12 key) 67 | * [incompatibility change] Change the default `auth_method` to `application_default` from `private_key` because `private_key` was dropped. 68 | 69 | ## 0.5.0 - 2019-08-10 70 | 71 | * [incompatibility change] Drop deprecated `time_partitioning`.`require_partition_filter` 72 | * [incompatibility change] Drop `prevent_duplicate_insert` which has no use-case now 73 | * [incompatibility change] Modes `replace`, `replace_backup`, `append`, and `delete_in_advance` require `auto_create_table: true` now because, previously, these modes had created a target table even with `auto_create_table: false` and made users being confused. Note that `auto_create_table: true` is always required even for a partition (a table name with a partition decorator) which may not require creating a table. This is for simplicity of logics and implementations. 74 | * [incompatibility change] Change default value of `auto_create_table` to `true` because the above 4 modes, that is, except `append_direct` always require `auto_create_table: true` now. 75 | 76 | ## 0.4.14 - 2019-08-10 77 | 78 | * [enhancement] Support field partitioning correctly. 79 | 80 | ## 0.4.13 - 2019-03-20 81 | 82 | * [enhancement] Support clustered table as an experimental feature 83 | 84 | ## 0.4.12 - 2019-03-20 85 | 86 | * [maintenance] Fix `time_partitioning.requirePartitionFilter` was not working. Use `time_partitioning.require_partition_filter` (thanks to @gitetsu) 87 | 88 | ## 0.4.11 - 2019-03-07 89 | 90 | * [maintenance] Fix to use `response.status.error_result` instead of `response.status.errors` to check job failure status (thanks to @nownabe) 91 | 92 | ## 0.4.10 - 2018-11-08 93 | * [enhancement] Support column-based partition (thanks to Chi-Ruei Li) 94 | 95 | ## 0.4.9 - 2018-09-08 96 | * [enhancement] Enable object lifecycle management when creating buckets with `auto_create_gcs_bucket` (thanks to @potato2003) 97 | 98 | ## 0.4.8 - 2017-05-23 99 | * [enhancement] Support location option for `auto_create_gcs_bucket` option (thanks to @potato2003) 100 | 101 | ## 0.4.7 - 2017-05-02 102 | * [enhancement] Support location option to allow to use 'asia-northeast1' region 103 | 104 | ## 0.4.6 - 2017-04-17 105 | * [enhancement] Support auth_method 'application_default' 106 | 107 | ## 0.4.5 - 2017-04-04 108 | 109 | * [maintenance] Fix deprecated warning log condition for `timeout_sec` 110 | 111 | ## 0.4.4 - 2017-04-04 112 | 113 | * [maintenance] Support google-api-ruby-client >= v0.11.0 114 | * [maintenance] Add `send_timeout_sec` and `read_timeout_sec` option for google-api-ruby-client >= v0.11.0 115 | 116 | ## 0.4.3 - 2017-02-11 117 | 118 | * [maintenance] Fix `schma_update_options` was not set with load_from_gcs (thanks to h10a-bf) 119 | 120 | ## 0.4.2 - 2016-10-12 121 | 122 | * [maintenance] Fix `schema_update_options` was not working (nil error) 123 | 124 | ## 0.4.1 - 2016-10-03 125 | 126 | * [enhancement] Support `schema_update_options` option 127 | 128 | ## 0.4.0 - 2016-10-01 129 | 130 | * [enhancement] Support partitioned table 131 | * [maintenance] Add `progress_log_interval` option to control the interval of showing progress log, and now showing progress log is off by default 
132 | 133 | ## 0.3.7 - 2016-08-03 134 | 135 | * [maintenance] Fix Thread.new to use thread local variables to avoid nil idx error (thanks to @shyouhei and @umisora) 136 | 137 | ## 0.3.6 - 2016-06-15 138 | 139 | * [maintenance] if `is_skip_job_result_check` is true, skip output_rows checking (thanks to @joker1007) 140 | 141 | ## 0.3.5 - 2016-06-13 142 | 143 | * [enhancement] retry backendError and internalError in waiting load job 144 | * [enhancement] retry Broken pipe and Connection reset in inserting object to GCS 145 | 146 | ## 0.3.4 - 2016-06-01 147 | 148 | * [new feature] Add `gcs_bucket` option to load multiple files from a GCS bucket with one load job 149 | 150 | ## 0.3.3 - 2016-05-24 151 | 152 | * [maintenance] Fix `private_key` auth is not working 153 | 154 | ## 0.3.2 - 2016-05-03 155 | 156 | * [new feature] Add `abort_on_error` option 157 | * [maintenance] Use uuid instead of current time for temp_table name 158 | 159 | ## 0.3.1 - 2016-04-15 160 | 161 | * [new feature] Add `sdk_log_level` option to show log of google-api-client 162 | * [maintenance] Fix `prevent_duplicate_insert` was not working correctly 163 | * [maintenance] Change to get `num_output_rows` of `transaction_report` from `get_table` API 164 | * [maintenance] Log response.statistics of load jobs 165 | * [maintenance] Always create job_id on client side as [google recommends](https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs) so that duplication not to be occurred 166 | * [maintenance] Fix a possibility which rehearsal would load 0 rows file 167 | 168 | ## 0.3.0 - 2016-04-08 169 | 170 | Big change is introduced. Now, embulk-output-bigquery is written in JRuby. 171 | 172 | * [new feature] Support parallel loads. Fix [#28](https://github.com/embulk/embulk-output-bigquery/issues/28). 173 | * [new feature] Create table first. Fix [#29](https://github.com/embulk/embulk-output-bigquery/issues/29). 174 | * [new feature] Introduce rehearsal mode. Fix [#30](https://github.com/embulk/embulk-output-bigquery/issues/30). 175 | * [new feature] Support `dataset_old` option for `replace_backup`. Fix [#31](https://github.com/embulk/embulk-output-bigquery/issues/31). 176 | * [maintenance] Fix default timestamp format to `%Y-%m-%d %H:%M:%S.%6`. Fix [#32](https://github.com/embulk/embulk-output-bigquery/issues/32). 177 | * [new feature] Support request options such as `timeout_sec`, `open_timeout_sec`, `retries`. Fix [#33](https://github.com/embulk/embulk-output-bigquery/issues/33). 178 | * [new feature] Support continuing from file generation with `skip_file_generation` option. 179 | * [new feature] Guess BigQuery schema from Embulk schema. Fix [#1](https://github.com/embulk/embulk-output-bigquery/issues/1). 180 | * [new feature] Support automatically create dataset. 181 | * [new feature] Support transactional append mode. 182 | * [incompatibility change] Formatter plugin support is dropped. Formatter is done in this plugin for specified `source_format`. 183 | * [incompatibility change] Encoder plugin support is dropped. Encoding is done in this plugin for specified `compression`. 184 | * [incompatibility change] `append` mode now expresses a transactional append, and `append_direct` is one which is not transactional (this was `append` mode before) 185 | 186 | ## 0.2.3 - 2016-02-19 187 | 188 | * [maintenance] Fix detect logic of delete_in_advance mode. [#26](https://github.com/embulk/embulk-output-bigquery/issues/26). @sonots thanks! 
189 | 190 | ## 0.2.2 - 2016-02-15 191 | 192 | * [new feature] Added template_table option. [#25](https://github.com/embulk/embulk-output-bigquery/pull/25). @joker1007 thanks! 193 | 194 | ## 0.2.1 - 2016-01-28 195 | 196 | * [maintenance] Upgraded Embulk version to 0.8.1 [#22](https://github.com/embulk/embulk-output-bigquery/pull/22). @joker1007 thanks! 197 | * [maintenance] Formatted code style by checkstyle [#23](https://github.com/embulk/embulk-output-bigquery/pull/23) 198 | 199 | ## 0.2.0 - 2016-01-26 200 | 201 | * [new feature] Added mode parameters and support 4 modes(append, replace, replace_backup, delete_in_advance). [#20](https://github.com/embulk/embulk-output-bigquery/pull/20) [#21](https://github.com/embulk/embulk-output-bigquery/pull/21) @joker1007 thanks! 202 | 203 | ## 0.1.11 - 2015-11-16 204 | 205 | * [maintenance] Change error result display for easy investigation. [#18](https://github.com/embulk/embulk-output-bigquery/pull/18) 206 | 207 | ## 0.1.10 - 2015-10-06 208 | 209 | * [new feature] Added new auth method - json_keyfile of GCP(Google Cloud Platform)'s service account [#17](https://github.com/embulk/embulk-output-bigquery/pull/17) 210 | 211 | ## 0.1.9 - 2015-08-19 212 | 213 | * [maintenance] Upgraded Embulk version to 0.7.1 214 | 215 | ## 0.1.8 - 2015-08-19 216 | 217 | * [new feature] Supported mapreduce-executor. @frsyuki thanks! [#13](https://github.com/embulk/embulk-output-bigquery/pull/13) 218 | * [maintenance] Fixed job_id generation logic [#15](https://github.com/embulk/embulk-output-bigquery/pull/15) 219 | * [maintenance] Refactored [#11](https://github.com/embulk/embulk-output-bigquery/pull/11) 220 | 221 | ## 0.1.7 - 2015-05-20 222 | 223 | * [new feature] Added allow_quoted_newlines option [#10](https://github.com/embulk/embulk-output-bigquery/pull/10) 224 | * [maintenance] Upgraded embulk version to 0.6.8 225 | 226 | ## 0.1.6 - 2015-04-23 227 | 228 | * [new feature] Added ignore_unknown_values option to job_id generation logic. [#9](https://github.com/embulk/embulk-output-bigquery/pull/9) 229 | 230 | ## 0.1.5 - 2015-04-23 231 | 232 | * [new feature] Added ignore_unknown_values option. [#8](https://github.com/embulk/embulk-output-bigquery/pull/8) @takus thanks! 
233 | 234 | ## 0.1.4 - 2015-04-21 235 | 236 | * [new feature] Added prevent_duplicate_insert option 237 | 238 | ## 0.1.3 - 2015-04-06 239 | 240 | * [new feature] Added new auth method - pre-defined access token of GCE(Google Compute Engine) 241 | * [maintenance] Updated Google provided libraries 242 | * http-client:google-http-client-jackson2 from 1.19.0 to 1.20.0 243 | * apis:google-api-services-bigquery from v2-rev193-1.19.1 to v2-rev205-1.20.0 244 | 245 | ## 0.1.2 - 2015-04-01 246 | 247 | * [new feature] Changed bulk-load method from "via GCS" to direct-insert 248 | * [new feature] added dynamic table creationg option 249 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org/' 2 | 3 | gemspec 4 | gem 'embulk', '= 0.11.4' 5 | gem 'embulk-parser-none' 6 | gem 'embulk-parser-jsonl' 7 | gem 'pry-nav' 8 | gem 'test-unit' 9 | gem 'test-unit-rr' 10 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # embulk-output-bigquery 2 | 3 | [Embulk](https://github.com/embulk/embulk/) output plugin to load/insert data into [Google BigQuery](https://cloud.google.com/bigquery/) using [direct insert](https://cloud.google.com/bigquery/loading-data-into-bigquery#loaddatapostrequest) 4 | 5 | ## Overview 6 | 7 | load data into Google BigQuery as batch jobs for big amount of data 8 | https://developers.google.com/bigquery/loading-data-into-bigquery 9 | 10 | * **Plugin type**: output 11 | * **Resume supported**: no 12 | * **Cleanup supported**: no 13 | * **Dynamic table creating**: yes 14 | 15 | ### Supported Embulk 16 | 17 | | gem version | Embulk version | 18 | |------------------|--------------------| 19 | | 0.7.0 and higher | v0.11.0 and higher | 20 | | 0.6.9 and lower | v0.9.X and lower | 21 | 22 | ### NOT IMPLEMENTED 23 | * insert data over streaming inserts 24 | * for continuous real-time insertions 25 | * Please use other product, like [fluent-plugin-bigquery](https://github.com/kaizenplatform/fluent-plugin-bigquery) 26 | * https://developers.google.com/bigquery/streaming-data-into-bigquery#usecases 27 | 28 | Current version of this plugin supports Google API with Service Account Authentication, but does not support 29 | OAuth flow for installed applications. 30 | 31 | ## Configuration 32 | 33 | #### Original options 34 | 35 | | name | type | required? | default | description | 36 | |:-------------------------------------|:------------|:-----------|:-------------------------|:-----------------------| 37 | | mode | string | optional | "append" | See [Mode](#mode) | 38 | | auth_method | string | optional | "application\_default" | See [Authentication](#authentication) | 39 | | json_keyfile | string | optional | | keyfile path or `content` | 40 | | project | string | required unless service\_account's `json_keyfile` is given. | | project\_id | 41 | | destination_project | string | optional | `project` value | A destination project to which the data will be loaded. Use this if you want to separate a billing project (the `project` value) and a destination project (the `destination_project` value). | 42 | | dataset | string | required | | dataset | 43 | | location | string | optional | nil | geographic location of dataset. See [Location](#location) | 44 | | table | string | required | | table name, or table name with a partition decorator such as `table_name$20160929`| 45 | | auto_create_dataset | boolean | optional | false | automatically create dataset | 46 | | auto_create_table | boolean | optional | true | `false` is available only for `append_direct` mode. Other modes require `true`. See [Dynamic Table Creating](#dynamic-table-creating) and [Time Partitioning](#time-partitioning) | 47 | | schema_file | string | optional | | /path/to/schema.json | 48 | | template_table | string | optional | | template table name. See [Dynamic Table Creating](#dynamic-table-creating) | 49 | | job_status_max_polling_time | int | optional | 3600 sec | Max job status polling time | 50 | | job_status_polling_interval | int | optional | 10 sec | Job status polling interval | 51 | | is_skip_job_result_check | boolean | optional | false | Skip waiting Load job finishes. Available for append, or delete_in_advance mode | 52 | | with_rehearsal | boolean | optional | false | Load `rehearsal_counts` records as a rehearsal. 
Rehearsal loads into REHEARSAL temporary table, and delete finally. You may use this option to investigate data errors as early stage as possible | 53 | | rehearsal_counts | integer | optional | 1000 | Specify number of records to load in a rehearsal | 54 | | abort_on_error | boolean | optional | true if max_bad_records is 0, otherwise false | Raise an error if number of input rows and number of output rows does not match | 55 | | column_options | hash | optional | | See [Column Options](#column-options) | 56 | | default_timezone | string | optional | UTC | | 57 | | default_timestamp_format | string | optional | %Y-%m-%d %H:%M:%S.%6N | | 58 | | payload_column | string | optional | nil | See [Formatter Performance Issue](#formatter-performance-issue) | 59 | | payload_column_index | integer | optional | nil | See [Formatter Performance Issue](#formatter-performance-issue) | 60 | | gcs_bucket | string | optional | nil | See [GCS Bucket](#gcs-bucket) | 61 | | auto_create_gcs_bucket | boolean | optional | false | See [GCS Bucket](#gcs-bucket) | 62 | | progress_log_interval | float | optional | nil (Disabled) | Progress log interval. The progress log is disabled by nil (default). NOTE: This option may be removed in a future because a filter plugin can achieve the same goal | 63 | | description | string | optional | nil | description of table | 64 | 65 | Client or request options 66 | 67 | | name | type | required? | default | description | 68 | |:-------------------------------------|:------------|:-----------|:-------------------------|:-----------------------| 69 | | open_timeout_sec | integer | optional | 300 | Seconds to wait for the connection to open | 70 | | timeout_sec | integer | optional | 300 | Seconds to wait for one block to be read (google-api-ruby-client < v0.11.0) | 71 | | send_timeout_sec | integer | optional | 300 | Seconds to wait to send a request (google-api-ruby-client >= v0.11.0) | 72 | | read_timeout_sec | integer | optional | 300 | Seconds to wait to read a response (google-api-ruby-client >= v0.11.0) | 73 | | retries | integer | optional | 5 | Number of retries | 74 | | application_name | string | optional | "Embulk BigQuery plugin" | User-Agent | 75 | | sdk_log_level | string | optional | nil (WARN) | Log level of google api client library | 76 | 77 | Options for intermediate local files 78 | 79 | | name | type | required? | default | description | 80 | |:-------------------------------------|:------------|:-----------|:-------------------------|:-----------------------| 81 | | path_prefix | string | optional | | Path prefix of local files such as "/tmp/prefix_". Default randomly generates with [tempfile](http://ruby-doc.org/stdlib-2.2.3/libdoc/tempfile/rdoc/Tempfile.html) | 82 | | sequence_format | string | optional | .%d.%d | Sequence format for pid, thread id | 83 | | file_ext | string | optional | | The file extension of local files such as ".csv.gz" ".json.gz". Default automatically generates from `source_format` and `compression`| 84 | | skip_file_generation | boolean | optional | | Load already generated local files into BigQuery if available. Specify correct path_prefix and file_ext. | 85 | | delete_from_local_when_job_end | boolean | optional | true | If set to true, delete generate local files when job is end | 86 | | compression | string | optional | "NONE" | Compression of local files (`GZIP` or `NONE`) | 87 | 88 | 89 | Options for intermediate tables on BigQuery 90 | 91 | | name | type | required? 
| default | description | 92 | |:-------------------------------------|:------------|:-----------|:-------------------------|:-----------------------| 93 | | temporary_table_expiration | integer | optional | | Temporary table's expiration time in seconds | 94 | 95 | `source_format` is also used to determine formatter (csv or jsonl). 96 | 97 | #### Same options as bq command-line tools or BigQuery job's property 98 | 99 | The following options are the same as [bq command-line tools](https://cloud.google.com/bigquery/bq-command-line-tool#creatingtablefromfile) or BigQuery [job's property](https://cloud.google.com/bigquery/docs/reference/v2/jobs#resource). 100 | 101 | | name | type | required? | default | description | 102 | |:----------------------------------|:---------|:----------|:--------|:-----------------------| 103 | | source_format | string | required | "CSV" | File type (`NEWLINE_DELIMITED_JSON` or `CSV`) | 104 | | max_bad_records | int | optional | 0 | | 105 | | field_delimiter | char | optional | "," | | 106 | | encoding | string | optional | "UTF-8" | `UTF-8` or `ISO-8859-1` | 107 | | ignore_unknown_values | boolean | optional | false | | 108 | | allow_quoted_newlines | boolean | optional | false | Set to true if data contains newline characters. It may cause slow processing | 109 | | time_partitioning | hash | optional | `{"type":"DAY"}` if `table` parameter has a partition decorator, otherwise nil | See [Time Partitioning](#time-partitioning) | 110 | | time_partitioning.type | string | required | nil | The only type supported is DAY, which will generate one partition per day based on data loading time. | 111 | | time_partitioning.expiration_ms | int | optional | nil | Number of milliseconds for which to keep the storage for a partition. | 112 | | time_partitioning.field | string | optional | nil | `DATE` or `TIMESTAMP` column used for partitioning | 113 | | range_partitioning | hash | optional | nil | See [Range Partitioning](#range-partitioning) | 114 | | range_partitioning.field | string | required | nil | `INT64` column used for partitioning | 115 | | range_partitioning.range | hash | required | nil | Defines the ranges for range partitioning | 116 | | range_partitioning.range.start | int | required | nil | The start of range partitioning, inclusive. | 117 | | range_partitioning.range.end | int | required | nil | The end of range partitioning, exclusive. | 118 | | range_partitioning.range.interval | int | required | nil | The width of each interval. | 119 | | clustering | hash | optional | nil | Currently, clustering is supported for partitioned tables, so it must be used with the `time_partitioning` option. See [clustered tables](https://cloud.google.com/bigquery/docs/clustered-tables) | 120 | | clustering.fields | array | required | nil | One or more fields on which data should be clustered. The order of the specified columns determines the sort order of the data. | 121 | | schema_update_options | array | optional | nil | (Experimental) List of `ALLOW_FIELD_ADDITION` or `ALLOW_FIELD_RELAXATION` or both. See [jobs#configuration.load.schemaUpdateOptions](https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.load.schemaUpdateOptions). NOTE for the current status: `schema_update_options` does not work for `copy` jobs, that is, it is not effective for most modes such as `append`, `replace` and `replace_backup`. `delete_in_advance` deletes the origin table, so it does not need to update the schema. Only `append_direct` can utilize schema update. 
| 122 | 123 | ### Example 124 | 125 | ```yaml 126 | out: 127 | type: bigquery 128 | mode: append 129 | auth_method: service_account 130 | json_keyfile: /path/to/json_keyfile.json 131 | project: your-project-000 132 | dataset: your_dataset_name 133 | table: your_table_name 134 | compression: GZIP 135 | source_format: NEWLINE_DELIMITED_JSON 136 | ``` 137 | 138 | ### Location 139 | 140 | The geographic location of the dataset. Required except for US and EU. 141 | 142 | GCS bucket should be in same region when you use `gcs_bucket`. 143 | 144 | See also [Dataset Locations | BigQuery | Google Cloud](https://cloud.google.com/bigquery/docs/dataset-locations) 145 | 146 | ### Mode 147 | 148 | 5 modes are provided. 149 | 150 | ##### append 151 | 152 | 1. Load to temporary table (Create and WRITE_APPEND in parallel) 153 | 2. Copy temporary table to destination table (or partition). (WRITE_APPEND) 154 | 155 | ##### append_direct 156 | 157 | 1. Insert data into existing table (or partition) directly. (WRITE_APPEND in parallel) 158 | 159 | This is not transactional, i.e., if fails, the target table could have some rows inserted. 160 | 161 | ##### replace 162 | 163 | 1. Load to temporary table (Create and WRITE_APPEND in parallel) 164 | 2. Copy temporary table to destination table (or partition). (WRITE_TRUNCATE) 165 | 166 | ```is_skip_job_result_check``` must be false when replace mode 167 | 168 | NOTE: BigQuery does not support replacing (actually, copying into) a non-partitioned table with a paritioned table atomically. You must once delete the non-partitioned table, otherwise, you get `Incompatible table partitioning specification when copying to the column partitioned table` error. 169 | 170 | ##### replace_backup 171 | 172 | 1. Load to temporary table (Create and WRITE_APPEND in parallel) 173 | 2. Copy destination table (or partition) to backup table (or partition). (dataset_old, table_old) 174 | 3. Copy temporary table to destination table (or partition). (WRITE_TRUNCATE) 175 | 176 | ```is_skip_job_result_check``` must be false when replace_backup mode. 177 | 178 | ##### delete_in_advance 179 | 180 | 1. Delete destination table (or partition), if it exists. 181 | 2. Load to destination table (or partition). 182 | 183 | ### Authentication 184 | 185 | There are four authentication methods 186 | 187 | 1. `service_account` (or `json_key` for backward compatibility) 188 | 1. `authorized_user` 189 | 1. `compute_engine` 190 | 1. `application_default` 191 | 192 | #### service\_account (or json\_key) 193 | 194 | Use GCP service account credentials. 195 | You first need to create a service account, download its json key and deploy the key with embulk. 196 | 197 | ```yaml 198 | out: 199 | type: bigquery 200 | auth_method: service_account 201 | json_keyfile: /path/to/json_keyfile.json 202 | ``` 203 | 204 | You can also embed contents of `json_keyfile` at config.yml. 205 | 206 | ```yaml 207 | out: 208 | type: bigquery 209 | auth_method: service_account 210 | json_keyfile: 211 | content: | 212 | { 213 | "private_key_id": "123456789", 214 | "private_key": "-----BEGIN PRIVATE KEY-----\nABCDEF", 215 | "client_email": "..." 216 | } 217 | ``` 218 | 219 | #### authorized\_user 220 | 221 | Use Google user credentials. 222 | You can get your credentials at `~/.config/gcloud/application_default_credentials.json` by running `gcloud auth login`. 
223 | 224 | ```yaml 225 | out: 226 | type: bigquery 227 | auth_method: authorized_user 228 | json_keyfile: /path/to/credentials.json 229 | ``` 230 | 231 | You can also embed contents of `json_keyfile` at config.yml. 232 | 233 | ```yaml 234 | out: 235 | type: bigquery 236 | auth_method: authorized_user 237 | json_keyfile: 238 | content: | 239 | { 240 | "client_id":"xxxxxxxxxxx.apps.googleusercontent.com", 241 | "client_secret":"xxxxxxxxxxx", 242 | "refresh_token":"xxxxxxxxxxx", 243 | "type":"authorized_user" 244 | } 245 | ``` 246 | 247 | #### compute\_engine 248 | 249 | On the other hand, you don't need to explicitly create a service account for embulk when you 250 | run embulk in Google Compute Engine. In this third authentication method, you need to 251 | add the API scope "https://www.googleapis.com/auth/bigquery" to the scope list of your 252 | Compute Engine VM instance, then you can configure embulk like this. 253 | 254 | ```yaml 255 | out: 256 | type: bigquery 257 | auth_method: compute_engine 258 | ``` 259 | 260 | #### application\_default 261 | 262 | Use Application Default Credentials (ADC). ADC is a strategy to locate Google Cloud Service Account credentials. 263 | 264 | 1. ADC checks to see if the environment variable `GOOGLE_APPLICATION_CREDENTIALS` is set. If the variable is set, ADC uses the service account file that the variable points to. 265 | 2. ADC checks to see if `~/.config/gcloud/application_default_credentials.json` is located. This file is created by running `gcloud auth application-default login`. 266 | 3. Use the default service account for credentials if the application running on Compute Engine, App Engine, Kubernetes Engine, Cloud Functions or Cloud Run. 267 | 268 | See https://cloud.google.com/docs/authentication/production for details. 269 | 270 | ```yaml 271 | out: 272 | type: bigquery 273 | auth_method: application_default 274 | ``` 275 | 276 | ### Table id formatting 277 | 278 | `table` and option accept [Time#strftime](http://ruby-doc.org/core-1.9.3/Time.html#method-i-strftime) 279 | format to construct table ids. 280 | Table ids are formatted at runtime 281 | using the local time of the embulk server. 282 | 283 | For example, with the configuration below, 284 | data is inserted into tables `table_20150503`, `table_20150504` and so on. 285 | 286 | ```yaml 287 | out: 288 | type: bigquery 289 | table: table_%Y%m%d 290 | ``` 291 | 292 | ### Dynamic table creating 293 | 294 | There are 3 ways to set schema. 295 | 296 | #### Set schema.json 297 | 298 | Please set file path of schema.json. 299 | 300 | ```yaml 301 | out: 302 | type: bigquery 303 | auto_create_table: true 304 | table: table_%Y%m%d 305 | schema_file: /path/to/schema.json 306 | ``` 307 | 308 | #### Set template_table in dataset 309 | 310 | Plugin will try to read schema from existing table and use it as schema template. 311 | 312 | ```yaml 313 | out: 314 | type: bigquery 315 | auto_create_table: true 316 | table: table_%Y%m%d 317 | template_table: existing_table_name 318 | ``` 319 | 320 | #### Guess from Embulk Schema 321 | 322 | Plugin will try to guess BigQuery schema from Embulk schema. It is also configurable with `column_options`. See [Column Options](#column-options). 
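For reference, the `schema_file` used in the first approach above is a plain BigQuery schema definition, i.e. a JSON array of field objects. A minimal, hypothetical schema file might look like the following (the field names are illustrative only, not the contents of the shipped `example/schema.json`):

```json
[
  {"name": "id",         "type": "INTEGER",   "mode": "NULLABLE"},
  {"name": "name",       "type": "STRING",    "mode": "NULLABLE"},
  {"name": "created_at", "type": "TIMESTAMP", "mode": "NULLABLE"},
  {"name": "attributes", "type": "RECORD",    "mode": "NULLABLE", "fields": [
    {"name": "key1", "type": "STRING", "mode": "NULLABLE"}
  ]}
]
```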
323 | 324 | ### Column Options 325 | 326 | Column options are used to aid guessing the BigQuery schema, or to define conversion of values: 327 | 328 | - **column_options**: advanced: an array of options for columns 329 | - **name**: column name 330 | - **type**: BigQuery type such as `BOOLEAN`, `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP`, `DATETIME`, `DATE`, and `RECORD`. See below for supported conversion types. 331 | - boolean: `BOOLEAN`, `STRING` (default: `BOOLEAN`) 332 | - long: `BOOLEAN`, `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP` (default: `INTEGER`) 333 | - double: `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP` (default: `FLOAT`) 334 | - string: `BOOLEAN`, `INTEGER`, `FLOAT`, `STRING`, `TIME`, `TIMESTAMP`, `DATETIME`, `DATE`, `RECORD` (default: `STRING`) 335 | - timestamp: `INTEGER`, `FLOAT`, `STRING`, `TIME`, `TIMESTAMP`, `DATETIME`, `DATE` (default: `TIMESTAMP`) 336 | - json: `STRING`, `RECORD` (default: `STRING`) 337 | - **mode**: BigQuery mode such as `NULLABLE`, `REQUIRED`, and `REPEATED` (string, default: `NULLABLE`) 338 | - **fields**: Describes the nested schema fields if the type property is set to RECORD. Please note that this is **required** for a `RECORD` column. 339 | - **description**: description (string, default is `None`). 340 | - **timestamp_format**: timestamp format to convert into/from `timestamp` (string, default is `default_timestamp_format`) 341 | - **timezone**: timezone to convert into/from `timestamp`, `date` (string, default is `default_timezone`). 342 | - **default_timestamp_format**: default timestamp format for column_options (string, default is "%Y-%m-%d %H:%M:%S.%6N") 343 | - **default_timezone**: default timezone for column_options (string, default is "UTC") 344 | 345 | Example) 346 | 347 | ```yaml 348 | out: 349 | type: bigquery 350 | auto_create_table: true 351 | column_options: 352 | - {name: date, type: STRING, timestamp_format: "%Y-%m-%d", timezone: "Asia/Tokyo"} 353 | - name: json_column 354 | type: RECORD 355 | fields: 356 | - {name: key1, type: STRING} 357 | - {name: key2, type: STRING} 358 | ``` 359 | 360 | NOTE: Type conversion is done in this jruby plugin, and can be slow. See [Formatter Performance Issue](#formatter-performance-issue) to improve the performance. 361 | 362 | ### Formatter Performance Issue 363 | 364 | embulk-output-bigquery supports formatting records into CSV or JSON (and also formatting timestamp columns). 365 | However, this plugin is written in jruby, and jruby plugins are generally slower than java plugins. 
366 | 367 | Therefore, it is recommended to format records with filter plugins written in Java such as [embulk-filter-to_json](https://github.com/civitaspo/embulk-filter-to_json) as: 368 | 369 | ```yaml 370 | filters: 371 | - type: to_json 372 | column: {name: payload, type: string} 373 | default_format: "%Y-%m-%d %H:%M:%S.%6N" 374 | out: 375 | type: bigquery 376 | payload_column_index: 0 # or, payload_column: payload 377 | ``` 378 | 379 | Furtheremore, if your files are originally jsonl or csv files, you can even skip a parser with [embulk-parser-none](https://github.com/sonots/embulk-parser-none) as: 380 | 381 | ```yaml 382 | in: 383 | type: file 384 | path_prefix: example/example.jsonl 385 | parser: 386 | type: none 387 | column_name: payload 388 | out: 389 | type: bigquery 390 | payload_column_index: 0 # or, payload_column: payload 391 | ``` 392 | 393 | ### GCS Bucket 394 | 395 | This is useful to reduce number of consumed jobs, which is limited by [100,000 jobs per project per day](https://cloud.google.com/bigquery/quotas#load_jobs). 396 | 397 | This plugin originally loads local files into BigQuery in parallel, that is, consumes a number of jobs, say 24 jobs on 24 CPU core machine for example (this depends on embulk parameters such as `min_output_tasks` and `max_threads`). 398 | 399 | BigQuery supports loading multiple files from GCS with one job, therefore, uploading local files to GCS in parallel and then loading from GCS into BigQuery reduces number of consumed jobs to 1. 400 | 401 | Using `gcs_bucket` option, such strategy is enabled. You may also use `auto_create_gcs_bucket` to create the specified GCS bucket automatically. 402 | 403 | ```yaml 404 | out: 405 | type: bigquery 406 | gcs_bucket: bucket_name 407 | auto_create_gcs_bucket: true 408 | ``` 409 | 410 | ToDo: Use https://cloud.google.com/storage/docs/streaming if google-api-ruby-client supports streaming transfers into GCS. 411 | 412 | ### Time Partitioning 413 | 414 | From 0.4.0, embulk-output-bigquery supports to load into partitioned table. 415 | See also [Creating and Updating Date-Partitioned Tables](https://cloud.google.com/bigquery/docs/creating-partitioned-tables). 416 | 417 | To load into a partition, specify `table` parameter with a partition decorator as: 418 | 419 | ```yaml 420 | out: 421 | type: bigquery 422 | table: table_name$20160929 423 | ``` 424 | 425 | You may configure `time_partitioning` parameter together as: 426 | 427 | ```yaml 428 | out: 429 | type: bigquery 430 | table: table_name$20160929 431 | time_partitioning: 432 | type: DAY 433 | expiration_ms: 259200000 434 | ``` 435 | 436 | You can also create column-based partitioning table as: 437 | 438 | ```yaml 439 | out: 440 | type: bigquery 441 | mode: replace 442 | table: table_name 443 | time_partitioning: 444 | type: DAY 445 | field: timestamp 446 | ``` 447 | 448 | Note the `time_partitioning.field` should be top-level `DATE` or `TIMESTAMP`. 449 | 450 | Use [Tables: patch](https://cloud.google.com/bigquery/docs/reference/v2/tables/patch) API to update the schema of the partitioned table, embulk-output-bigquery itself does not support it, though. 451 | Note that only adding a new column, and relaxing non-necessary columns to be `NULLABLE` are supported now. Deleting columns, and renaming columns are not supported. 
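For example, one way to add a column to the partitioned table outside of this plugin is the `bq` command-line tool; a sketch with placeholder names (the schema file must contain the full, updated field list):

```
$ bq update your_dataset_name.your_partitioned_table_name /path/to/updated_schema.json
```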
452 | 453 | MEMO: [jobs#configuration.load.schemaUpdateOptions](https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.load.schemaUpdateOptions) is available 454 | to update the schema of the desitination table as a side effect of the load job, but it is not available for copy job. 455 | Thus, it was not suitable for embulk-output-bigquery idempotence modes, `append`, `replace`, and `replace_backup`, sigh. 456 | 457 | ### Range Partitioning 458 | 459 | See also [Creating and Updating Range-Partitioned Tables](https://cloud.google.com/bigquery/docs/creating-partitioned-tables). 460 | 461 | To load into a partition, specify `range_partitioning` and `table` parameter with a partition decorator as: 462 | 463 | ```yaml 464 | out: 465 | type: bigquery 466 | table: table_name$1 467 | range_partitioning: 468 | field: customer_id 469 | range: 470 | start: 1 471 | end: 99999 472 | interval: 1 473 | ``` 474 | 475 | ## Development 476 | 477 | ### Run example: 478 | 479 | Prepare a json\_keyfile at example/your-project-000.json, then 480 | 481 | ``` 482 | $ embulk bundle install --path vendor/bundle 483 | $ embulk run -X page_size=1 -b . -l trace example/example.yml 484 | ``` 485 | 486 | ### Run test: 487 | 488 | Place your embulk with `.jar` extension: 489 | 490 | 491 | ``` 492 | $ curl -o embulk.jar --create-dirs -L "http://dl.embulk.org/embulk-latest.jar" 493 | $ chmod a+x embulk.jar 494 | ``` 495 | 496 | Investigate JRUBY\_VERSION and Bundler::VERSION included in the embulk.jar: 497 | 498 | ``` 499 | $ echo JRUBY_VERSION | ./embulk.jar irb 500 | 2019-08-10 00:59:11.866 +0900: Embulk v0.9.17 501 | Switch to inspect mode. 502 | JRUBY_VERSION 503 | "X.X.X.X" 504 | 505 | $ echo "require 'bundler'; Bundler::VERSION" | ./embulk.jar irb 506 | 2019-08-10 01:59:10.460 +0900: Embulk v0.9.17 507 | Switch to inspect mode. 508 | require 'bundler'; Bundler::VERSION 509 | "Y.Y.Y" 510 | ``` 511 | 512 | Install the same version of jruby (change X.X.X.X to the version shown above) and bundler: 513 | 514 | ``` 515 | $ rbenv install jruby-X.X.X.X 516 | $ rbenv local jruby-X.X.X.X 517 | $ gem install bundler -v Y.Y.Y 518 | ``` 519 | 520 | Install dependencies (NOTE: Use bundler included in the embulk.jar, otherwise, `gem 'embulk'` is not found): 521 | 522 | ``` 523 | $ ./embulk.jar bundle install --path vendor/bundle 524 | ``` 525 | 526 | Run tests with `env RUBYOPT="-r ./embulk.jar`: 527 | 528 | ``` 529 | $ bundle exec env RUBYOPT="-r ./embulk.jar" rake test 530 | ``` 531 | 532 | To run tests which actually connects to BigQuery such as test/test\_bigquery\_client.rb, 533 | prepare a json\_keyfile at example/your-project-000.json, then 534 | 535 | ``` 536 | $ bundle exec env RUBYOPT="-r ./embulk.jar" ruby test/test_bigquery_client.rb 537 | $ bundle exec env RUBYOPT="-r ./embulk.jar" ruby test/test_example.rb 538 | ``` 539 | 540 | ### Release gem: 541 | 542 | Change the version of gemspec, and write CHANGELOG.md. 
Then, 543 | 544 | ``` 545 | $ bundle exec rake release 546 | ``` 547 | 548 | ## ChangeLog 549 | 550 | [CHANGELOG.md](CHANGELOG.md) 551 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | require 'rake/testtask' 3 | 4 | desc 'Run test_unit based test' 5 | Rake::TestTask.new(:test) do |t| 6 | t.libs << "test" 7 | t.test_files = Dir["test/**/test_*.rb"].sort 8 | t.verbose = true 9 | t.warning = false 10 | end 11 | task :default => :test 12 | -------------------------------------------------------------------------------- /embulk-output-bigquery.gemspec: -------------------------------------------------------------------------------- 1 | Gem::Specification.new do |spec| 2 | spec.name = "embulk-output-bigquery" 3 | spec.version = "0.7.5" 4 | spec.authors = ["Satoshi Akama", "Naotoshi Seo"] 5 | spec.summary = "Google BigQuery output plugin for Embulk" 6 | spec.description = "Embulk plugin that insert records to Google BigQuery." 7 | spec.email = ["satoshiakama@gmail.com", "sonots@gmail.com"] 8 | spec.licenses = ["MIT"] 9 | spec.homepage = "https://github.com/embulk/embulk-output-bigquery" 10 | 11 | # Exclude example directory which uses symlinks from generating gem. 12 | # Symlinks do not work properly on the Windows platform without administrator privilege. 13 | spec.files = `git ls-files`.split("\n") + Dir["classpath/*.jar"] - Dir["example/*" ] 14 | spec.test_files = spec.files.grep(%r{^(test|spec)/}) 15 | spec.require_paths = ["lib"] 16 | 17 | # the latest version 18 | spec.add_dependency 'google-apis-storage_v1' 19 | spec.add_dependency 'google-apis-bigquery_v2' 20 | spec.add_dependency 'time_with_zone' 21 | spec.add_dependency 'thwait' 22 | # activesupport require Ruby >= 2.7.0 23 | # jruby-9.3.0.0 is MRI 2.6 compatible 24 | spec.add_dependency 'activesupport', "< 7.0" 25 | 26 | spec.add_development_dependency 'bundler', ['>= 1.10.6'] 27 | spec.add_development_dependency 'rake', ['>= 10.0'] 28 | end 29 | -------------------------------------------------------------------------------- /example/config_append_direct_schema_update_options.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: append_direct 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_table_name 26 | source_format: NEWLINE_DELIMITED_JSON 27 | compression: NONE 28 | auto_create_dataset: true 29 | auto_create_table: true 30 | schema_file: example/schema.json 31 | schema_update_options: [ALLOW_FIELD_ADDITION, ALLOW_FIELD_RELAXATION] 32 | -------------------------------------------------------------------------------- /example/config_client_options.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | 
type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_table_name 26 | source_format: NEWLINE_DELIMITED_JSON 27 | auto_create_dataset: true 28 | auto_create_table: true 29 | schema_file: example/schema.json 30 | timeout_sec: 400 31 | open_timeout_sec: 400 32 | retries: 2 33 | application_name: "Embulk BigQuery plugin test" 34 | -------------------------------------------------------------------------------- /example/config_csv.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_table_name 26 | source_format: CSV 27 | compression: GZIP 28 | auto_create_dataset: true 29 | auto_create_table: true 30 | schema_file: example/schema.json 31 | -------------------------------------------------------------------------------- /example/config_delete_in_advance.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: delete_in_advance 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_table_name 26 | source_format: NEWLINE_DELIMITED_JSON 27 | auto_create_dataset: true 28 | auto_create_table: true 29 | schema_file: example/schema.json 30 | -------------------------------------------------------------------------------- /example/config_delete_in_advance_field_partitioned_table.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: 
"%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: delete_in_advance 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_field_partitioned_table_name 26 | source_format: NEWLINE_DELIMITED_JSON 27 | compression: NONE 28 | auto_create_dataset: true 29 | auto_create_table: true 30 | schema_file: example/schema.json 31 | time_partitioning: 32 | type: 'DAY' 33 | field: timestamp 34 | -------------------------------------------------------------------------------- /example/config_delete_in_advance_partitioned_table.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: delete_in_advance 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_partitioned_table_name$20160929 26 | source_format: NEWLINE_DELIMITED_JSON 27 | compression: NONE 28 | auto_create_dataset: true 29 | auto_create_table: true 30 | schema_file: example/schema.json 31 | time_partitioning: 32 | type: 'DAY' 33 | expiration_ms: 100 34 | -------------------------------------------------------------------------------- /example/config_destination_project.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | project: your_project_name 25 | destination_project: your_destination_project_name 26 | dataset: your_dataset_name 27 | table: your_table_name 28 | source_format: NEWLINE_DELIMITED_JSON 29 | compression: NONE 30 | auto_create_dataset: true 31 | auto_create_table: true 32 | schema_file: example/schema.json 33 | -------------------------------------------------------------------------------- /example/config_expose_errors.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d 
%H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_table_name 26 | source_format: NEWLINE_DELIMITED_JSON 27 | compression: NONE 28 | auto_create_dataset: true 29 | auto_create_table: true 30 | schema_file: example/schema_expose_errors.json 31 | -------------------------------------------------------------------------------- /example/config_gcs.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_table_name 26 | source_format: NEWLINE_DELIMITED_JSON 27 | compression: GZIP 28 | auto_create_dataset: true 29 | auto_create_table: true 30 | schema_file: example/schema.json 31 | gcs_bucket: your_bucket_name 32 | auto_create_gcs_bucket: true 33 | -------------------------------------------------------------------------------- /example/config_guess_from_embulk_schema.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_table_name 26 | compression: GZIP 27 | source_format: NEWLINE_DELIMITED_JSON 28 | auto_create_dataset: true 29 | auto_create_table: true 30 | -------------------------------------------------------------------------------- /example/config_guess_with_column_options.yml: -------------------------------------------------------------------------------- 1 | # embulk gem install embulk-parser-jsonl 2 | in: 3 | type: file 4 | path_prefix: example/nested_example.jsonl 5 | parser: 6 | type: jsonl 7 | columns: 8 | - {name: date, type: string} 9 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 10 | - {name: "null", type: string} 11 | - {name: long, type: long} 12 | - {name: string, type: string} 13 | - {name: double, type: double} 14 | - {name: json, type: json} 15 | - {name: boolean, type: boolean} 16 | out: 17 | type: bigquery 18 | mode: replace 19 | auth_method: 
service_account 20 | json_keyfile: example/your-project-000.json 21 | dataset: your_dataset_name 22 | table: your_table_name 23 | compression: GZIP 24 | source_format: NEWLINE_DELIMITED_JSON 25 | auto_create_dataset: true 26 | auto_create_table: true 27 | column_options: 28 | - {name: date, type: TIMESTAMP, timestamp_format: "%Y-%m-%d", timezone: "+09:00"} 29 | - {name: timestamp, type: STRING, timestamp_format: "%Y-%m-%d", timezone: "+09:00"} 30 | - {name: long, type: STRING} 31 | - {name: string, type: STRING} 32 | - {name: double, type: STRING} 33 | - {name: boolean, type: STRING} 34 | - name: json 35 | type: RECORD 36 | fields: 37 | - {name: k1, type: STRING} 38 | - {name: k2, type: STRING} 39 | # 2015-07-13 40 | # 2015-07-12 15:00:00 41 | -------------------------------------------------------------------------------- /example/config_gzip.yml: -------------------------------------------------------------------------------- 1 | config_csv.yml -------------------------------------------------------------------------------- /example/config_jsonl.yml: -------------------------------------------------------------------------------- 1 | config_replace.yml -------------------------------------------------------------------------------- /example/config_max_threads.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example4_ 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_table_name 26 | source_format: NEWLINE_DELIMITED_JSON 27 | compression: NONE 28 | auto_create_dataset: true 29 | auto_create_table: true 30 | schema_file: example/schema.json 31 | exec: 32 | type: local 33 | min_output_tasks: 2 34 | max_threads: 2 35 | -------------------------------------------------------------------------------- /example/config_min_ouput_tasks.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example2_ 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_table_name 26 | source_format: NEWLINE_DELIMITED_JSON 27 | compression: GZIP 28 | auto_create_dataset: true 29 | auto_create_table: true 30 | schema_file: example/schema.json 31 | exec: 32 | type: local 33 | min_output_tasks: 8 34 | max_threads: 4 35 | 
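config_max_threads.yml and config_min_ouput_tasks.yml above exercise Embulk's local executor settings rather than plugin options: min_output_tasks sets a lower bound on the number of output tasks Embulk creates (and therefore on the number of intermediate files this plugin writes and later loads), while max_threads caps how many of those tasks run concurrently. A minimal sketch of such an exec section, with purely illustrative values (not recommended settings):

exec:
  type: local
  min_output_tasks: 4   # at least 4 output tasks -> roughly 4 intermediate files / load jobs
  max_threads: 4        # up to 4 tasks running at once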
-------------------------------------------------------------------------------- /example/config_mode_append.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | null_string: 'NULL' 8 | skip_header_lines: 1 9 | comment_line_marker: '#' 10 | columns: 11 | - {name: date, type: string} 12 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 13 | - {name: "null", type: string} 14 | - {name: long, type: long} 15 | - {name: string, type: string} 16 | - {name: double, type: double} 17 | - {name: boolean, type: boolean} 18 | out: 19 | type: bigquery 20 | mode: append 21 | auth_method: service_account 22 | json_keyfile: example/your-project-000.json 23 | dataset: your_dataset_name 24 | table: your_table_name 25 | compression: GZIP 26 | source_format: CSV 27 | auto_create_dataset: true 28 | auto_create_table: true 29 | schema_file: example/schema.json 30 | delete_from_local_when_job_end: false 31 | -------------------------------------------------------------------------------- /example/config_mode_append_direct.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | null_string: 'NULL' 8 | skip_header_lines: 1 9 | comment_line_marker: '#' 10 | columns: 11 | - {name: date, type: string} 12 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 13 | - {name: "null", type: string} 14 | - {name: long, type: long} 15 | - {name: string, type: string} 16 | - {name: double, type: double} 17 | - {name: boolean, type: boolean} 18 | out: 19 | type: bigquery 20 | mode: append_direct 21 | auth_method: service_account 22 | json_keyfile: example/your-project-000.json 23 | dataset: your_dataset_name 24 | table: your_table_name 25 | compression: GZIP 26 | source_format: CSV 27 | auto_create_dataset: true 28 | auto_create_table: true 29 | schema_file: example/schema.json 30 | delete_from_local_when_job_end: false 31 | -------------------------------------------------------------------------------- /example/config_nested_record.yml: -------------------------------------------------------------------------------- 1 | config_guess_with_column_options.yml -------------------------------------------------------------------------------- /example/config_payload_column.yml: -------------------------------------------------------------------------------- 1 | # embulk gem install embulk-parser-none 2 | in: 3 | type: file 4 | path_prefix: example/example.jsonl 5 | parser: 6 | type: none 7 | column_name: payload 8 | out: 9 | type: bigquery 10 | mode: replace 11 | auth_method: service_account 12 | json_keyfile: example/your-project-000.json 13 | dataset: your_dataset_name 14 | table: your_table_name 15 | compression: GZIP 16 | source_format: NEWLINE_DELIMITED_JSON 17 | auto_create_dataset: true 18 | auto_create_table: true 19 | schema_file: example/schema.json 20 | payload_column: payload 21 | -------------------------------------------------------------------------------- /example/config_payload_column_index.yml: -------------------------------------------------------------------------------- 1 | # embulk gem install embulk-parser-none 2 | in: 3 | type: file 4 | path_prefix: example/example.jsonl 5 | parser: 6 | type: none 7 | column_name: payload 8 | out: 9 | type: bigquery 10 | 
mode: replace 11 | auth_method: service_account 12 | json_keyfile: example/your-project-000.json 13 | dataset: your_dataset_name 14 | table: your_table_name 15 | compression: GZIP 16 | source_format: NEWLINE_DELIMITED_JSON 17 | auto_create_dataset: true 18 | auto_create_table: true 19 | schema_file: example/schema.json 20 | payload_column_index: 0 21 | -------------------------------------------------------------------------------- /example/config_progress_log_interval.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_table_name 26 | source_format: NEWLINE_DELIMITED_JSON 27 | compression: NONE 28 | auto_create_dataset: true 29 | auto_create_table: true 30 | schema_file: example/schema.json 31 | progress_log_interval: 0.1 32 | -------------------------------------------------------------------------------- /example/config_replace.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_table_name 26 | source_format: NEWLINE_DELIMITED_JSON 27 | compression: NONE 28 | auto_create_dataset: true 29 | auto_create_table: true 30 | schema_file: example/schema.json 31 | -------------------------------------------------------------------------------- /example/config_replace_backup.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace_backup 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_table_name 26 | dataset_old: your_dataset_name_old 27 | table_old: your_table_name_old 28 | 
source_format: NEWLINE_DELIMITED_JSON 29 | auto_create_dataset: true 30 | auto_create_table: true 31 | schema_file: example/schema.json 32 | skip_load: true # for debug 33 | -------------------------------------------------------------------------------- /example/config_replace_backup_field_partitioned_table.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace_backup 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_field_partitioned_table_name 26 | table_old: your_field_partitioned_table_name_old 27 | source_format: NEWLINE_DELIMITED_JSON 28 | compression: NONE 29 | auto_create_dataset: true 30 | auto_create_table: true 31 | schema_file: example/schema.json 32 | time_partitioning: 33 | type: 'DAY' 34 | field: 'timestamp' 35 | -------------------------------------------------------------------------------- /example/config_replace_backup_partitioned_table.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace_backup 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_partitioned_table_name$20160929 26 | table_old: your_partitioned_table_name_old$20160929 27 | source_format: NEWLINE_DELIMITED_JSON 28 | compression: NONE 29 | auto_create_dataset: true 30 | auto_create_table: true 31 | schema_file: example/schema.json 32 | time_partitioning: 33 | type: 'DAY' 34 | expiration_ms: 100 35 | -------------------------------------------------------------------------------- /example/config_replace_field_partitioned_table.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: 
your_dataset_name 25 | table: your_field_partitioned_table_name 26 | source_format: NEWLINE_DELIMITED_JSON 27 | compression: NONE 28 | auto_create_dataset: true 29 | auto_create_table: true 30 | schema_file: example/schema.json 31 | time_partitioning: 32 | type: 'DAY' 33 | field: 'timestamp' 34 | -------------------------------------------------------------------------------- /example/config_replace_field_range_partitioned_table.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_field_partitioned_table_name 26 | source_format: NEWLINE_DELIMITED_JSON 27 | compression: NONE 28 | auto_create_dataset: true 29 | auto_create_table: true 30 | schema_file: example/schema.json 31 | range_partitioning: 32 | field: 'long' 33 | range: 34 | start: 90 35 | end: 100 36 | interval: 1 37 | -------------------------------------------------------------------------------- /example/config_replace_partitioned_table.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_partitioned_table_name$20160929 26 | source_format: NEWLINE_DELIMITED_JSON 27 | compression: NONE 28 | auto_create_dataset: true 29 | auto_create_table: true 30 | schema_file: example/schema.json 31 | time_partitioning: 32 | type: 'DAY' 33 | expiration_ms: 100 34 | -------------------------------------------------------------------------------- /example/config_replace_schema_update_options.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: 
example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_partitioned_table_name$20160929 26 | source_format: NEWLINE_DELIMITED_JSON 27 | compression: NONE 28 | auto_create_dataset: true 29 | auto_create_table: true 30 | schema_file: example/schema.json 31 | time_partitioning: 32 | type: 'DAY' 33 | expiration_ms: 100 34 | -------------------------------------------------------------------------------- /example/config_skip_file_generation.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_table_name 26 | source_format: NEWLINE_DELIMITED_JSON 27 | auto_create_dataset: true 28 | auto_create_table: true 29 | schema_file: example/schema.json 30 | path_prefix: example/example 31 | file_ext: .jsonl 32 | skip_file_generation: true 33 | -------------------------------------------------------------------------------- /example/config_table_strftime.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_table_name_%Y%m%d 26 | source_format: NEWLINE_DELIMITED_JSON 27 | auto_create_dataset: true 28 | auto_create_table: true 29 | schema_file: example/schema.json 30 | skip_load: true # for debug 31 | -------------------------------------------------------------------------------- /example/config_template_table.yml: -------------------------------------------------------------------------------- 1 | # embulk gem install embulk-parser-none 2 | in: 3 | type: file 4 | path_prefix: example/example.jsonl 5 | parser: 6 | type: none 7 | column_name: payload 8 | out: 9 | type: bigquery 10 | mode: replace 11 | auth_method: service_account 12 | json_keyfile: example/your-project-000.json 13 | dataset: your_dataset_name 14 | table: your_table_name_%Y%m%d 15 | compression: GZIP 16 | source_format: NEWLINE_DELIMITED_JSON 17 | auto_create_dataset: true 18 | auto_create_table: true 19 | template_table: your_table_name 20 | payload_column: payload 21 | skip_load: true # for debug 22 | -------------------------------------------------------------------------------- /example/config_uncompressed.yml: 
-------------------------------------------------------------------------------- 1 | config_replace.yml -------------------------------------------------------------------------------- /example/config_with_rehearsal.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_table_name 26 | source_format: NEWLINE_DELIMITED_JSON 27 | auto_create_dataset: true 28 | auto_create_table: true 29 | schema_file: example/schema.json 30 | with_rehearsal: true 31 | rehearsal_counts: 1 32 | skip_load: true # for debug 33 | compression: GZIP 34 | -------------------------------------------------------------------------------- /example/example.csv: -------------------------------------------------------------------------------- 1 | date,timestamp,null,long,string,double,boolean 2 | 2015-07-13,2015-07-13 00:00:00.100000,,90,l6lTsvxd,903.4,true 3 | 2015-07-13,2015-07-13 00:00:00.100000,,91,XoALSEQg,394.5,true 4 | 2015-07-13,2015-07-13 00:00:00.100000,,92,0hgDRI_m,810.9,true 5 | 2015-07-13,2015-07-13 00:00:00.100000,,93,KjCRAc-A,477.4,true 6 | 2015-07-13,2015-07-13 00:00:00.100000,,94,fyQVGlT8,725.3,true 7 | 2015-07-13,2015-07-13 00:00:00.100000,,95,FpBYRPWK,316.6,true 8 | 2015-07-13,2015-07-13 00:00:00.100000,,96,9ikvnUqp,369.5,true 9 | 2015-07-13,2015-07-13 00:00:00.100000,,97,RRNYDAzK,506.5,true 10 | 2015-07-13,2015-07-13 00:00:00.100000,,90,l6lTsvxd,903.4,false 11 | 2015-07-13,2015-07-13 00:00:00.100000,,91,XoALSEQg,394.5,false 12 | 2015-07-13,2015-07-13 00:00:00.100000,,92,0hgDRI_m,810.9,false 13 | 2015-07-13,2015-07-13 00:00:00.100000,,93,KjCRAc-A,477.4,false 14 | 2015-07-13,2015-07-13 00:00:00.100000,,94,fyQVGlT8,725.3,false 15 | 2015-07-13,2015-07-13 00:00:00.100000,,95,FpBYRPWK,316.6,false 16 | 2015-07-13,2015-07-13 00:00:00.100000,,96,9ikvnUqp,369.5,false 17 | 2015-07-13,2015-07-13 00:00:00.100000,,97,RRNYDAzK,506.5,false 18 | -------------------------------------------------------------------------------- /example/example.yml: -------------------------------------------------------------------------------- 1 | config_replace.yml -------------------------------------------------------------------------------- /example/example2_1.csv: -------------------------------------------------------------------------------- 1 | example.csv -------------------------------------------------------------------------------- /example/example2_2.csv: -------------------------------------------------------------------------------- 1 | example.csv -------------------------------------------------------------------------------- /example/example4_1.csv: -------------------------------------------------------------------------------- 1 | example.csv -------------------------------------------------------------------------------- /example/example4_2.csv: 
-------------------------------------------------------------------------------- 1 | example.csv -------------------------------------------------------------------------------- /example/example4_3.csv: -------------------------------------------------------------------------------- 1 | example.csv -------------------------------------------------------------------------------- /example/example4_4.csv: -------------------------------------------------------------------------------- 1 | example.csv -------------------------------------------------------------------------------- /example/json_key.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "service_account", 3 | "project_id": "your_project_name", 4 | "private_key_id": "your_private_key_id", 5 | "private_key": "-----BEGIN PRIVATE KEY-----\n\n-----END PRIVATE KEY-----\n", 6 | "client_email": "your_service_account_email", 7 | "client_id": "your_client_id", 8 | "auth_uri": "https://accounts.google.com/o/oauth2/auth", 9 | "token_uri": "https://accounts.google.com/o/oauth2/token", 10 | "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", 11 | "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/account-3%40your_project_name.iam.gserviceaccount.com" 12 | } 13 | -------------------------------------------------------------------------------- /example/nested_example.jsonl: -------------------------------------------------------------------------------- 1 | {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":90,"string":"l6lTsvxd","double":903.4,"json":{"k1":"v1","k2":"v2"},"boolean":true} 2 | {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":91,"string":"XoALSEQg","double":394.5,"json":{"k1":"v1","k2":"v2"},"boolean":true} 3 | {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":92,"string":"0hgDRI_m","double":810.9,"json":{"k1":"v1","k2":"v2"},"boolean":true} 4 | {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":93,"string":"KjCRAc-A","double":477.4,"json":{"k1":"v1","k2":"v2"},"boolean":true} 5 | {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":94,"string":"fyQVGlT8","double":725.3,"json":{"k1":"v1","k2":"v2"},"boolean":true} 6 | {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":95,"string":"FpBYRPWK","double":316.6,"json":{"k1":"v1","k2":"v2"},"boolean":true} 7 | {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":96,"string":"9ikvnUqp","double":369.5,"json":{"k1":"v1","k2":"v2"},"boolean":true} 8 | {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":97,"string":"RRNYDAzK","double":506.5,"json":{"k1":"v1","k2":"v2"},"boolean":true} 9 | {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":90,"string":"l6lTsvxd","double":903.4,"json":{"k1":"v1","k2":"v2"},"boolean":false} 10 | {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":91,"string":"XoALSEQg","double":394.5,"json":{"k1":"v1","k2":"v2"},"boolean":false} 11 | {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":92,"string":"0hgDRI_m","double":810.9,"json":{"k1":"v1","k2":"v2"},"boolean":false} 12 | {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":93,"string":"KjCRAc-A","double":477.4,"json":{"k1":"v1","k2":"v2"},"boolean":false} 13 | 
{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":94,"string":"fyQVGlT8","double":725.3,"json":{"k1":"v1","k2":"v2"},"boolean":false} 14 | {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":95,"string":"FpBYRPWK","double":316.6,"json":{"k1":"v1","k2":"v2"},"boolean":false} 15 | {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":96,"string":"9ikvnUqp","double":369.5,"json":{"k1":"v1","k2":"v2"},"boolean":false} 16 | {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":97,"string":"RRNYDAzK","double":506.5,"json":{"k1":"v1","k2":"v2"},"boolean":false} 17 | -------------------------------------------------------------------------------- /example/schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name":"date", 4 | "type":"STRING" 5 | }, 6 | { 7 | "name":"timestamp", 8 | "type":"TIMESTAMP" 9 | }, 10 | { 11 | "name":"null", 12 | "type":"STRING" 13 | }, 14 | { 15 | "name":"long", 16 | "type":"INTEGER" 17 | }, 18 | { 19 | "name":"string", 20 | "type":"STRING" 21 | }, 22 | { 23 | "name":"double", 24 | "type":"FLOAT" 25 | }, 26 | { 27 | "name":"boolean", 28 | "type":"BOOLEAN" 29 | } 30 | ] 31 | -------------------------------------------------------------------------------- /example/schema_expose_errors.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name":"dat", 4 | "type":"STRING" 5 | }, 6 | { 7 | "name":"timestamp", 8 | "type":"TIMESTAMP" 9 | }, 10 | { 11 | "name":"null", 12 | "type":"STRING" 13 | }, 14 | { 15 | "name":"long", 16 | "type":"INTEGER" 17 | }, 18 | { 19 | "name":"string", 20 | "type":"STRING" 21 | }, 22 | { 23 | "name":"double", 24 | "type":"FLOAT" 25 | }, 26 | { 27 | "name":"boolean", 28 | "type":"BOOLEAN" 29 | } 30 | ] 31 | -------------------------------------------------------------------------------- /lib/embulk/output/bigquery/auth.rb: -------------------------------------------------------------------------------- 1 | require 'googleauth' 2 | 3 | module Embulk 4 | module Output 5 | class Bigquery < OutputPlugin 6 | class Auth 7 | 8 | attr_reader :auth_method, :json_key, :scope 9 | 10 | def initialize(task, scope) 11 | @auth_method = task['auth_method'] 12 | @json_key = task['json_keyfile'] 13 | @scope = scope 14 | end 15 | 16 | def authenticate 17 | case auth_method 18 | when 'authorized_user' 19 | key = StringIO.new(json_key) 20 | return Google::Auth::UserRefreshCredentials.make_creds(json_key_io: key, scope: scope) 21 | when 'compute_engine' 22 | return Google::Auth::GCECredentials.new 23 | when 'service_account', 'json_key' # json_key is for backward compatibility 24 | key = StringIO.new(json_key) 25 | return Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: key, scope: scope) 26 | when 'application_default' 27 | return Google::Auth.get_application_default([scope]) 28 | else 29 | raise ConfigError.new("Unknown auth method: #{auth_method}") 30 | end 31 | end 32 | end 33 | end 34 | end 35 | end 36 | -------------------------------------------------------------------------------- /lib/embulk/output/bigquery/bigquery_client.rb: -------------------------------------------------------------------------------- 1 | require 'google/apis/bigquery_v2' 2 | require 'json' 3 | require 'thwait' 4 | require_relative 'google_client' 5 | require_relative 'helper' 6 | 7 | module Embulk 8 | module Output 9 | class Bigquery < OutputPlugin 10 | class 
BigqueryClient < GoogleClient 11 | BIGQUERY_TABLE_OPERATION_INTERVAL = 2 # https://cloud.google.com/bigquery/quotas 12 | 13 | def initialize(task, schema, fields = nil) 14 | scope = "https://www.googleapis.com/auth/bigquery" 15 | client_class = Google::Apis::BigqueryV2::BigqueryService 16 | super(task, scope, client_class) 17 | 18 | @schema = schema 19 | reset_fields(fields) if fields 20 | @project = @task['project'] 21 | @destination_project = @task['destination_project'] 22 | @dataset = @task['dataset'] 23 | @location = @task['location'] 24 | @location_for_log = @location.nil? ? 'Primary location' : @location 25 | 26 | @task['source_format'] ||= 'CSV' 27 | @task['max_bad_records'] ||= 0 28 | @task['field_delimiter'] ||= ',' 29 | @task['field_delimiter'] = @task['source_format'] == 'CSV' ? @task['field_delimiter'] : nil 30 | @task['encoding'] ||= 'UTF-8' 31 | @task['ignore_unknown_values'] = false if @task['ignore_unknown_values'].nil? 32 | @task['allow_quoted_newlines'] = false if @task['allow_quoted_newlines'].nil? 33 | end 34 | 35 | def fields 36 | return @fields if @fields 37 | if @task['schema_file'] 38 | @fields = Helper.deep_symbolize_keys(JSON.parse(File.read(@task['schema_file']))) 39 | elsif @task['template_table'] 40 | @fields = fields_from_table(@task['template_table']) 41 | else 42 | @fields = Helper.fields_from_embulk_schema(@task, @schema) 43 | end 44 | end 45 | 46 | def fields_from_table(table) 47 | response = get_table(table) 48 | response.schema.fields.map {|field| field.to_h } 49 | end 50 | 51 | def reset_fields(fields = nil) 52 | @fields = fields 53 | self.fields 54 | end 55 | 56 | def with_job_retry(&block) 57 | retries = 0 58 | begin 59 | yield 60 | rescue BackendError, InternalError, RateLimitExceeded => e 61 | if e.is_a?(RateLimitExceeded) 62 | sleep(BIGQUERY_TABLE_OPERATION_INTERVAL) 63 | end 64 | 65 | if retries < @task['retries'] 66 | retries += 1 67 | Embulk.logger.warn { "embulk-output-bigquery: retry \##{retries}, #{e.message}" } 68 | retry 69 | else 70 | Embulk.logger.error { "embulk-output-bigquery: retry exhausted \##{retries}, #{e.message}" } 71 | raise e 72 | end 73 | end 74 | end 75 | 76 | # @param object_uris [Array] array of GCS URIs such as gs://bucket/path 77 | # @return [Array] responses 78 | def load_from_gcs(object_uris, table) 79 | with_job_retry do 80 | begin 81 | # As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says, 82 | # we should generate job_id in client code, otherwise retrying would cause duplication 83 | job_id = "embulk_load_job_#{SecureRandom.uuid}" 84 | Embulk.logger.info { "embulk-output-bigquery: Load job starting... job_id:[#{job_id}] #{object_uris} => #{@destination_project}:#{@dataset}.#{table} in #{@location_for_log}" } 85 | 86 | body = { 87 | job_reference: { 88 | project_id: @project, 89 | job_id: job_id, 90 | }, 91 | configuration: { 92 | load: { 93 | destination_table: { 94 | project_id: @destination_project, 95 | dataset_id: @dataset, 96 | table_id: table, 97 | }, 98 | schema: { 99 | fields: fields, 100 | }, 101 | write_disposition: 'WRITE_APPEND', 102 | source_format: @task['source_format'], 103 | max_bad_records: @task['max_bad_records'], 104 | field_delimiter: @task['source_format'] == 'CSV' ? 
@task['field_delimiter'] : nil, 105 | encoding: @task['encoding'], 106 | ignore_unknown_values: @task['ignore_unknown_values'], 107 | allow_quoted_newlines: @task['allow_quoted_newlines'], 108 | source_uris: object_uris, 109 | } 110 | } 111 | } 112 | 113 | if @location 114 | body[:job_reference][:location] = @location 115 | end 116 | 117 | if @task['schema_update_options'] 118 | body[:configuration][:load][:schema_update_options] = @task['schema_update_options'] 119 | end 120 | 121 | opts = {} 122 | 123 | Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" } 124 | response = with_network_retry { client.insert_job(@project, body, **opts) } 125 | unless @task['is_skip_job_result_check'] 126 | response = wait_load('Load', response) 127 | end 128 | [response] 129 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 130 | response = {status_code: e.status_code, message: e.message, error_class: e.class} 131 | Embulk.logger.error { 132 | "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}" 133 | } 134 | raise Error, "failed to load #{object_uris} to #{@destination_project}:#{@dataset}.#{table} in #{@location_for_log}, response:#{response}" 135 | end 136 | end 137 | end 138 | 139 | def load_in_parallel(paths, table) 140 | return [] if paths.empty? 141 | # You may think that, because a load job is a background job, sending requests in parallel 142 | # would not improve performance. In actual experiments, however, parallel 143 | # loading drastically shortened the waiting time; a single jobs.insert looks to take about 50 sec. 144 | # NOTICE: uploading files in parallel consumes network traffic. With 24 concurrent uploads 145 | # of 100MB files, it consumed about 500Mbps at the peak in the environment we experimented in. 146 | # 147 | # We previously had a `max_load_parallels` option, but it was not extensible to the map-reduce executor, 148 | # so we dropped it. See https://github.com/embulk/embulk-output-bigquery/pull/35 149 | responses = [] 150 | threads = [] 151 | Embulk.logger.debug { "embulk-output-bigquery: LOAD IN PARALLEL #{paths}" } 152 | paths.each_with_index do |path, idx| 153 | threads << Thread.new(path, idx) do |path, idx| 154 | # I am not sure whether google-api-ruby-client is thread-safe, 155 | # so create a new instance for each thread to be safe 156 | bigquery = self.class.new(@task, @schema, fields) 157 | response = bigquery.load(path, table) 158 | [idx, response] 159 | end 160 | end 161 | ThreadsWait.all_waits(*threads) do |th| 162 | idx, response = th.value # raises errors that occurred in threads 163 | responses[idx] = response 164 | end 165 | responses 166 | end 167 | 168 | def load(path, table, write_disposition: 'WRITE_APPEND') 169 | with_job_retry do 170 | begin 171 | if File.exist?(path) 172 | # As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says, 173 | # we should generate job_id in client code, otherwise retrying would cause duplication 174 | job_id = "embulk_load_job_#{SecureRandom.uuid}" 175 | Embulk.logger.info { "embulk-output-bigquery: Load job starting... job_id:[#{job_id}] #{path} => #{@destination_project}:#{@dataset}.#{table} in #{@location_for_log}" } 176 | else 177 | Embulk.logger.info { "embulk-output-bigquery: Load job starting... 
#{path} does not exist, skipped" } 178 | return 179 | end 180 | 181 | body = { 182 | job_reference: { 183 | project_id: @project, 184 | job_id: job_id, 185 | }, 186 | configuration: { 187 | load: { 188 | destination_table: { 189 | project_id: @destination_project, 190 | dataset_id: @dataset, 191 | table_id: table, 192 | }, 193 | schema: { 194 | fields: fields, 195 | }, 196 | write_disposition: write_disposition, 197 | source_format: @task['source_format'], 198 | max_bad_records: @task['max_bad_records'], 199 | field_delimiter: @task['source_format'] == 'CSV' ? @task['field_delimiter'] : nil, 200 | encoding: @task['encoding'], 201 | ignore_unknown_values: @task['ignore_unknown_values'], 202 | allow_quoted_newlines: @task['allow_quoted_newlines'], 203 | } 204 | } 205 | } 206 | 207 | if @location 208 | body[:job_reference][:location] = @location 209 | end 210 | 211 | if @task['schema_update_options'] 212 | body[:configuration][:load][:schema_update_options] = @task['schema_update_options'] 213 | end 214 | 215 | opts = { 216 | upload_source: path, 217 | content_type: "application/octet-stream", 218 | # options: { 219 | # retries: @task['retries'], 220 | # timeout_sec: @task['timeout_sec'], 221 | # open_timeout_sec: @task['open_timeout_sec'] 222 | # }, 223 | } 224 | Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" } 225 | response = with_network_retry { client.insert_job(@project, body, **opts) } 226 | if @task['is_skip_job_result_check'] 227 | response 228 | else 229 | response = wait_load('Load', response) 230 | end 231 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 232 | response = {status_code: e.status_code, message: e.message, error_class: e.class} 233 | Embulk.logger.error { 234 | "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}" 235 | } 236 | raise Error, "failed to load #{path} to #{@destination_project}:#{@dataset}.#{table} in #{@location_for_log}, response:#{response}" 237 | end 238 | end 239 | end 240 | 241 | def copy(source_table, destination_table, destination_dataset = nil, write_disposition: 'WRITE_TRUNCATE') 242 | with_job_retry do 243 | begin 244 | destination_dataset ||= @dataset 245 | job_id = "embulk_copy_job_#{SecureRandom.uuid}" 246 | 247 | Embulk.logger.info { 248 | "embulk-output-bigquery: Copy job starting... 
job_id:[#{job_id}] " \ 249 | "#{@destination_project}:#{@dataset}.#{source_table} => #{@destination_project}:#{destination_dataset}.#{destination_table}" 250 | } 251 | 252 | body = { 253 | job_reference: { 254 | project_id: @project, 255 | job_id: job_id, 256 | }, 257 | configuration: { 258 | copy: { 259 | create_deposition: 'CREATE_IF_NEEDED', 260 | write_disposition: write_disposition, 261 | source_table: { 262 | project_id: @destination_project, 263 | dataset_id: @dataset, 264 | table_id: source_table, 265 | }, 266 | destination_table: { 267 | project_id: @destination_project, 268 | dataset_id: destination_dataset, 269 | table_id: destination_table, 270 | }, 271 | } 272 | } 273 | } 274 | 275 | if @location 276 | body[:job_reference][:location] = @location 277 | end 278 | 279 | opts = {} 280 | Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" } 281 | response = with_network_retry { client.insert_job(@project, body, **opts) } 282 | wait_load('Copy', response) 283 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 284 | response = {status_code: e.status_code, message: e.message, error_class: e.class} 285 | Embulk.logger.error { 286 | "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}" 287 | } 288 | raise Error, "failed to copy #{@destination_project}:#{@dataset}.#{source_table} " \ 289 | "to #{@destination_project}:#{destination_dataset}.#{destination_table}, response:#{response}" 290 | end 291 | end 292 | end 293 | 294 | def wait_load(kind, response) 295 | started = Time.now 296 | 297 | wait_interval = @task['job_status_polling_interval'] 298 | max_polling_time = @task['job_status_max_polling_time'] 299 | _response = response 300 | 301 | while true 302 | job_id = _response.job_reference.job_id 303 | location = @location || _response.job_reference.location 304 | elapsed = Time.now - started 305 | status = _response.status.state 306 | if status == "DONE" 307 | Embulk.logger.info { 308 | "embulk-output-bigquery: #{kind} job completed... " \ 309 | "job_id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[#{status}]" 310 | } 311 | break 312 | elsif elapsed.to_i > max_polling_time 313 | message = "embulk-output-bigquery: #{kind} job checking... " \ 314 | "job_id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[TIMEOUT]" 315 | Embulk.logger.info { message } 316 | raise JobTimeoutError.new(message) 317 | else 318 | Embulk.logger.info { 319 | "embulk-output-bigquery: #{kind} job checking... " \ 320 | "job_id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[#{status}]" 321 | } 322 | sleep wait_interval 323 | _response = with_network_retry { client.get_job(@project, job_id, location: location) } 324 | end 325 | end 326 | 327 | # `errors` returns Array if any error exists. 328 | _errors = _response.status.errors 329 | 330 | # cf. http://www.rubydoc.info/github/google/google-api-ruby-client/Google/Apis/BigqueryV2/JobStatus#errors-instance_method 331 | # `error_result` returns Google::Apis::BigqueryV2::ErrorProto if job failed. 332 | # Otherwise, this returns nil. 333 | if _response.status.error_result 334 | msg = "failed during waiting a #{kind} job, get_job(#{@project}, #{job_id}), errors:#{_errors.map(&:to_h)}" 335 | if _errors.any? {|error| error.reason == 'backendError' } 336 | raise BackendError, msg 337 | elsif _errors.any? {|error| error.reason == 'internalError' } 338 | raise InternalError, msg 339 | elsif _errors.any? 
{|error| error.reason == 'rateLimitExceeded' } 340 | raise RateLimitExceeded, msg 341 | else 342 | Embulk.logger.error { "embulk-output-bigquery: #{msg}" } 343 | raise Error, msg 344 | end 345 | end 346 | 347 | if _errors 348 | Embulk.logger.warn { "embulk-output-bigquery: #{kind} job errors... job_id:[#{job_id}] errors:#{_errors.map(&:to_h)}" } 349 | end 350 | 351 | Embulk.logger.info { "embulk-output-bigquery: #{kind} job response... job_id:[#{job_id}] response.statistics:#{_response.statistics.to_h}" } 352 | 353 | _response 354 | end 355 | 356 | def create_dataset(dataset = nil, reference: nil) 357 | dataset ||= @dataset 358 | begin 359 | Embulk.logger.info { "embulk-output-bigquery: Create dataset... #{@destination_project}:#{dataset} in #{@location_for_log}" } 360 | hint = {} 361 | if reference 362 | response = get_dataset(reference) 363 | hint = { access: response.access } 364 | end 365 | body = { 366 | dataset_reference: { 367 | project_id: @project, 368 | dataset_id: dataset, 369 | }, 370 | }.merge(hint) 371 | if @location 372 | body[:location] = @location 373 | end 374 | opts = {} 375 | Embulk.logger.debug { "embulk-output-bigquery: insert_dataset(#{@project}, #{dataset}, #{@location_for_log}, #{body}, #{opts})" } 376 | with_network_retry { client.insert_dataset(@project, body, **opts) } 377 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 378 | if e.status_code == 409 && /Already Exists:/ =~ e.message 379 | # ignore 'Already Exists' error 380 | return 381 | end 382 | 383 | response = {status_code: e.status_code, message: e.message, error_class: e.class} 384 | Embulk.logger.error { 385 | "embulk-output-bigquery: insert_dataset(#{@project}, #{body}, #{opts}), response:#{response}" 386 | } 387 | raise Error, "failed to create dataset #{@destination_project}:#{dataset} in #{@location_for_log}, response:#{response}" 388 | end 389 | end 390 | 391 | def get_dataset(dataset = nil) 392 | dataset ||= @dataset 393 | begin 394 | Embulk.logger.info { "embulk-output-bigquery: Get dataset... #{@destination_project}:#{dataset}" } 395 | with_network_retry { client.get_dataset(@destination_project, dataset) } 396 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 397 | if e.status_code == 404 398 | raise NotFoundError, "Dataset #{@destination_project}:#{dataset} is not found" 399 | end 400 | 401 | response = {status_code: e.status_code, message: e.message, error_class: e.class} 402 | Embulk.logger.error { 403 | "embulk-output-bigquery: get_dataset(#{@destination_project}, #{dataset}), response:#{response}" 404 | } 405 | raise Error, "failed to get dataset #{@destination_project}:#{dataset}, response:#{response}" 406 | end 407 | end 408 | 409 | def create_table_if_not_exists(table, dataset: nil, options: nil) 410 | begin 411 | dataset ||= @dataset 412 | options ||= {} 413 | options['time_partitioning'] ||= @task['time_partitioning'] 414 | if Helper.has_partition_decorator?(table) 415 | options['time_partitioning'] ||= {'type' => 'DAY'} 416 | table = Helper.chomp_partition_decorator(table) 417 | end 418 | 419 | Embulk.logger.info { "embulk-output-bigquery: Create table... 
#{@destination_project}:#{dataset}.#{table}" } 420 | body = { 421 | table_reference: { 422 | table_id: table, 423 | }, 424 | description: @task['description'], 425 | schema: { 426 | fields: fields, 427 | } 428 | } 429 | 430 | if options['time_partitioning'] 431 | body[:time_partitioning] = { 432 | type: options['time_partitioning']['type'], 433 | expiration_ms: options['time_partitioning']['expiration_ms'], 434 | field: options['time_partitioning']['field'], 435 | } 436 | end 437 | 438 | options['range_partitioning'] ||= @task['range_partitioning'] 439 | if options['range_partitioning'] 440 | body[:range_partitioning] = { 441 | field: options['range_partitioning']['field'], 442 | range: { 443 | start: options['range_partitioning']['range']['start'].to_s, 444 | end: options['range_partitioning']['range']['end'].to_s, 445 | interval: options['range_partitioning']['range']['interval'].to_s, 446 | }, 447 | } 448 | end 449 | 450 | options['clustering'] ||= @task['clustering'] 451 | if options['clustering'] 452 | body[:clustering] = { 453 | fields: options['clustering']['fields'], 454 | } 455 | end 456 | 457 | if options['expiration_time'] 458 | # expiration_time is expressed in milliseconds 459 | body[:expiration_time] = (Time.now.to_i + options['expiration_time']) * 1000 460 | end 461 | 462 | opts = {} 463 | Embulk.logger.debug { "embulk-output-bigquery: insert_table(#{@destination_project}, #{dataset}, #{@location_for_log}, #{body}, #{opts})" } 464 | with_network_retry { client.insert_table(@destination_project, dataset, body, **opts) } 465 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 466 | if e.status_code == 409 && /Already Exists:/ =~ e.message 467 | # ignore 'Already Exists' error 468 | return 469 | end 470 | 471 | response = {status_code: e.status_code, message: e.message, error_class: e.class} 472 | Embulk.logger.error { 473 | "embulk-output-bigquery: insert_table(#{@destination_project}, #{dataset}, #{@location_for_log}, #{body}, #{opts}), response:#{response}" 474 | } 475 | raise Error, "failed to create table #{@destination_project}:#{dataset}.#{table} in #{@location_for_log}, response:#{response}" 476 | end 477 | end 478 | 479 | def delete_table(table, dataset: nil) 480 | table = Helper.chomp_partition_decorator(table) 481 | delete_table_or_partition(table, dataset: dataset) 482 | end 483 | 484 | def delete_partition(table, dataset: nil) 485 | delete_table_or_partition(table, dataset: dataset) 486 | end 487 | 488 | # if `table` with a partition decorator is given, a partition is deleted. 489 | def delete_table_or_partition(table, dataset: nil) 490 | begin 491 | dataset ||= @dataset 492 | Embulk.logger.info { "embulk-output-bigquery: Delete table... 
#{@destination_project}:#{dataset}.#{table}" } 493 | with_network_retry { client.delete_table(@destination_project, dataset, table) } 494 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 495 | if e.status_code == 404 && /Not found:/ =~ e.message 496 | # ignore 'Not Found' error 497 | return 498 | end 499 | 500 | response = {status_code: e.status_code, message: e.message, error_class: e.class} 501 | Embulk.logger.error { 502 | "embulk-output-bigquery: delete_table(#{@destination_project}, #{dataset}, #{table}), response:#{response}" 503 | } 504 | raise Error, "failed to delete table #{@destination_project}:#{dataset}.#{table}, response:#{response}" 505 | end 506 | end 507 | 508 | def get_table(table, dataset: nil) 509 | table = Helper.chomp_partition_decorator(table) 510 | get_table_or_partition(table) 511 | end 512 | 513 | def get_partition(table, dataset: nil) 514 | get_table_or_partition(table) 515 | end 516 | 517 | def get_table_or_partition(table, dataset: nil) 518 | begin 519 | dataset ||= @dataset 520 | Embulk.logger.info { "embulk-output-bigquery: Get table... #{@destination_project}:#{dataset}.#{table}" } 521 | with_network_retry { client.get_table(@destination_project, dataset, table) } 522 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 523 | if e.status_code == 404 524 | raise NotFoundError, "Table #{@destination_project}:#{dataset}.#{table} is not found" 525 | end 526 | 527 | response = {status_code: e.status_code, message: e.message, error_class: e.class} 528 | Embulk.logger.error { 529 | "embulk-output-bigquery: get_table(#{@destination_project}, #{dataset}, #{table}), response:#{response}" 530 | } 531 | raise Error, "failed to get table #{@destination_project}:#{dataset}.#{table}, response:#{response}" 532 | end 533 | end 534 | end 535 | end 536 | end 537 | end 538 | -------------------------------------------------------------------------------- /lib/embulk/output/bigquery/file_writer.rb: -------------------------------------------------------------------------------- 1 | require 'zlib' 2 | require 'json' 3 | require 'csv' 4 | require_relative 'value_converter_factory' 5 | 6 | module Embulk 7 | module Output 8 | class Bigquery < OutputPlugin 9 | class FileWriter 10 | attr_reader :num_rows 11 | 12 | def initialize(task, schema, index, converters = nil) 13 | @task = task 14 | @schema = schema 15 | @index = index 16 | @converters = converters || ValueConverterFactory.create_converters(task, schema) 17 | 18 | @num_rows = 0 19 | if @task['progress_log_interval'] 20 | @progress_log_interval = @task['progress_log_interval'] 21 | @progress_log_timer = Time.now 22 | @previous_num_rows = 0 23 | end 24 | 25 | if @task['payload_column_index'] 26 | @payload_column_index = @task['payload_column_index'] 27 | @formatter_proc = self.method(:to_payload) 28 | else 29 | case @task['source_format'].downcase 30 | when 'csv' 31 | @formatter_proc = self.method(:to_csv) 32 | else 33 | @formatter_proc = self.method(:to_jsonl) 34 | end 35 | end 36 | end 37 | 38 | def io 39 | return @io if @io 40 | 41 | path = sprintf( 42 | "#{@task['path_prefix']}#{@task['sequence_format']}#{@task['file_ext']}", 43 | Process.pid, Thread.current.object_id 44 | ) 45 | if File.exist?(path) 46 | Embulk.logger.warn { "embulk-output-bigquery: unlink already existing #{path}" } 47 | File.unlink(path) rescue nil 48 | end 49 | Embulk.logger.info { "embulk-output-bigquery: create #{path}" } 50 | 51 | @io = open(path, 'w') 
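# The intermediate file is opened lazily, on first use; its name is built from path_prefix,
# sequence_format (filled with Process.pid and Thread.current.object_id) and file_ext, so
# concurrent output tasks never collide on the same local file. `open` below wraps the File
# in Zlib::GzipWriter when compression is GZIP, so data is compressed as it is written.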
52 | end 53 | 54 | def open(path, mode = 'w') 55 | file_io = File.open(path, mode) 56 | case @task['compression'].downcase 57 | when 'gzip' 58 | io = Zlib::GzipWriter.new(file_io) 59 | else 60 | io = file_io 61 | end 62 | io 63 | end 64 | 65 | def close 66 | io.close rescue nil 67 | io 68 | end 69 | 70 | def reopen 71 | @io = open(io.path, 'a') 72 | end 73 | 74 | def to_payload(record) 75 | "#{record[@payload_column_index]}\n" 76 | end 77 | 78 | def to_csv(record) 79 | record.map.with_index do |value, column_index| 80 | @converters[column_index].call(value) 81 | end.to_csv 82 | end 83 | 84 | def to_jsonl(record) 85 | hash = {} 86 | column_names = @schema.names 87 | record.each_with_index do |value, column_index| 88 | column_name = column_names[column_index] 89 | hash[column_name] = @converters[column_index].call(value) 90 | end 91 | "#{hash.to_json}\n" 92 | end 93 | 94 | def num_format(number) 95 | number.to_s.gsub(/(\d)(?=(\d{3})+(?!\d))/, '\1,') 96 | end 97 | 98 | def add(page) 99 | _io = io 100 | # I once tried to split IO writing into another IO thread using SizedQueue 101 | # However, it resulted in worse performance, so I removed the codes. 102 | page.each do |record| 103 | Embulk.logger.trace { "embulk-output-bigquery: record #{record}" } 104 | formatted_record = @formatter_proc.call(record) 105 | Embulk.logger.trace { "embulk-output-bigquery: formatted_record #{formatted_record.chomp}" } 106 | _io.write formatted_record 107 | @num_rows += 1 108 | end 109 | show_progress if @task['progress_log_interval'] 110 | @num_rows 111 | end 112 | 113 | private 114 | 115 | def show_progress 116 | now = Time.now 117 | if @progress_log_timer < now - @progress_log_interval 118 | speed = ((@num_rows - @previous_num_rows) / (now - @progress_log_timer).to_f).round(1) 119 | @progress_log_timer = now 120 | @previous_num_rows = @num_rows 121 | Embulk.logger.info { "embulk-output-bigquery: num_rows #{num_format(@num_rows)} (#{num_format(speed)} rows/sec)" } 122 | end 123 | end 124 | end 125 | end 126 | end 127 | end 128 | -------------------------------------------------------------------------------- /lib/embulk/output/bigquery/gcs_client.rb: -------------------------------------------------------------------------------- 1 | require 'uri' 2 | require 'java' 3 | require 'google/apis/storage_v1' 4 | require_relative 'google_client' 5 | require_relative 'helper' 6 | 7 | # ToDo: Use https://cloud.google.com/storage/docs/streaming if google-api-ruby-client supports streaming transfers 8 | # ToDo: Tests are not written because this implementation will probably entirely changed on supporting streaming transfers 9 | module Embulk 10 | module Output 11 | class Bigquery < OutputPlugin 12 | class GcsClient < GoogleClient 13 | def initialize(task) 14 | scope = "https://www.googleapis.com/auth/cloud-platform" 15 | client_class = Google::Apis::StorageV1::StorageService 16 | super(task, scope, client_class) 17 | 18 | @project = @task['project'] 19 | @destination_project = @task['destination_project'] 20 | @bucket = @task['gcs_bucket'] 21 | @location = @task['location'] 22 | end 23 | 24 | def insert_temporary_bucket(bucket = nil) 25 | bucket ||= @bucket 26 | begin 27 | Embulk.logger.info { "embulk-output-bigquery: Insert bucket... 
#{@destination_project}:#{bucket}" } 28 | body = { 29 | name: bucket, 30 | lifecycle: { 31 | rule: [ 32 | { 33 | action: { 34 | type: "Delete", 35 | }, 36 | condition: { 37 | age: 1, 38 | } 39 | }, 40 | ] 41 | } 42 | } 43 | 44 | if @location 45 | body[:location] = @location 46 | end 47 | 48 | opts = {} 49 | 50 | Embulk.logger.debug { "embulk-output-bigquery: insert_temporary_bucket(#{@project}, #{body}, #{opts})" } 51 | with_network_retry { client.insert_bucket(@project, body, **opts) } 52 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 53 | if e.status_code == 409 && /conflict:/ =~ e.message 54 | # ignore 'Already Exists' error 55 | return nil 56 | end 57 | response = {status_code: e.status_code, message: e.message, error_class: e.class} 58 | Embulk.logger.error { 59 | "embulk-output-bigquery: insert_temporary_bucket(#{@project}, #{body}, #{opts}), response:#{response}" 60 | } 61 | raise Error, "failed to insert bucket #{@destination_project}:#{bucket}, response:#{response}" 62 | end 63 | end 64 | 65 | def insert_object(path, object: nil, bucket: nil) 66 | bucket ||= @bucket 67 | object ||= path 68 | object = object.start_with?('/') ? object[1..-1] : object 69 | object_uri = URI.join("gs://#{bucket}", object).to_s 70 | 71 | started = Time.now 72 | begin 73 | Embulk.logger.info { "embulk-output-bigquery: Insert object... #{path} => #{@destination_project}:#{object_uri}" } 74 | body = { 75 | name: object, 76 | } 77 | opts = { 78 | upload_source: path, 79 | content_type: 'application/octet-stream' 80 | } 81 | 82 | Embulk.logger.debug { "embulk-output-bigquery: insert_object(#{bucket}, #{body}, #{opts})" } 83 | # memo: gcs is strongly consistent for insert (read-after-write). ref: https://cloud.google.com/storage/docs/consistency 84 | with_network_retry { client.insert_object(bucket, body, **opts) } 85 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 86 | response = {status_code: e.status_code, message: e.message, error_class: e.class} 87 | Embulk.logger.error { 88 | "embulk-output-bigquery: insert_object(#{bucket}, #{body}, #{opts}), response:#{response}" 89 | } 90 | raise Error, "failed to insert object #{@destination_project}:#{object_uri}, response:#{response}" 91 | end 92 | end 93 | 94 | def insert_objects(paths, objects: nil, bucket: nil) 95 | return [] if paths.empty? 96 | bucket ||= @bucket 97 | objects ||= paths 98 | raise "number of paths and objects are different" if paths.size != objects.size 99 | 100 | responses = [] 101 | paths.each_with_index do |path, idx| 102 | object = objects[idx] 103 | responses << insert_object(path, object: object, bucket: bucket) 104 | end 105 | responses 106 | end 107 | 108 | def delete_object(object, bucket: nil) 109 | bucket ||= @bucket 110 | object = object.start_with?('/') ? object[1..-1] : object 111 | object_uri = URI.join("gs://#{bucket}", object).to_s 112 | begin 113 | Embulk.logger.info { "embulk-output-bigquery: Delete object... 
#{@destination_project}:#{object_uri}" } 114 | opts = {} 115 | 116 | Embulk.logger.debug { "embulk-output-bigquery: delete_object(#{bucket}, #{object}, #{opts})" } 117 | response = with_network_retry { client.delete_object(bucket, object, **opts) } 118 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 119 | if e.status_code == 404 # ignore 'notFound' error 120 | return nil 121 | end 122 | response = {status_code: e.status_code, message: e.message, error_class: e.class} 123 | Embulk.logger.error { 124 | "embulk-output-bigquery: delete_object(#{bucket}, #{object}, #{opts}), response:#{response}" 125 | } 126 | raise Error, "failed to delete object #{@destination_project}:#{object_uri}, response:#{response}" 127 | end 128 | end 129 | end 130 | end 131 | end 132 | end 133 | -------------------------------------------------------------------------------- /lib/embulk/output/bigquery/google_client.rb: -------------------------------------------------------------------------------- 1 | require_relative 'auth' 2 | 3 | module Embulk 4 | module Output 5 | class Bigquery < OutputPlugin 6 | class Error < StandardError; end 7 | class JobTimeoutError < Error; end 8 | class NotFoundError < Error; end 9 | class BackendError < Error; end 10 | class InternalError < Error; end 11 | class RateLimitExceeded < Error; end 12 | 13 | class GoogleClient 14 | def initialize(task, scope, client_class) 15 | @task = task 16 | @scope = scope 17 | @auth = Auth.new(task, scope) 18 | @client_class = client_class 19 | end 20 | 21 | def client 22 | return @cached_client if @cached_client && @cached_client_expiration > Time.now 23 | 24 | client = @client_class.new 25 | client.client_options.application_name = @task['application_name'] 26 | client.request_options.retries = @task['retries'] 27 | if client.request_options.respond_to?(:timeout_sec) 28 | client.request_options.timeout_sec = @task['timeout_sec'] || 300 29 | client.request_options.open_timeout_sec = @task['open_timeout_sec'] || 300 30 | else # google-api-ruby-client >= v0.11.0 31 | if @task['timeout_sec'] 32 | Embulk.logger.warn { "embulk-output-bigquery: timeout_sec is deprecated in google-api-ruby-client >= v0.11.0. Use read_timeout_sec instead" } 33 | end 34 | client.client_options.open_timeout_sec = @task['open_timeout_sec'] || 300 # default: 60 35 | client.client_options.send_timeout_sec = @task['send_timeout_sec'] || 300 # default: 120 36 | client.client_options.read_timeout_sec = @task['read_timeout_sec'] || @task['timeout_sec'] || 300 # default: 60 37 | end 38 | Embulk.logger.debug { "embulk-output-bigquery: client_options: #{client.client_options.to_h}" } 39 | Embulk.logger.debug { "embulk-output-bigquery: request_options: #{client.request_options.to_h}" } 40 | 41 | client.authorization = @auth.authenticate 42 | 43 | @cached_client_expiration = Time.now + 1800 44 | @cached_client = client 45 | end 46 | 47 | # google-api-ruby-client itself has a retry feature, but it does not retry with SocketException 48 | def with_network_retry(&block) 49 | retries = 0 50 | begin 51 | yield 52 | rescue ::Java::Java.net.SocketException, ::Java::Java.net.ConnectException, ::Java::JavaxNetSsl::SSLException => e 53 | retry_messages = [ 54 | 'Broken pipe', 55 | 'Connection reset', 56 | 'Connection timed out', 57 | 'Connection or outbound has closed', 58 | ] 59 | if retry_messages.select { |x| e.message.include?(x) }.empty? 
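# If the exception message matches none of the messages above, it is not
# treated as a transient network error and is re-raised immediately;
# otherwise the block is retried up to @task['retries'] times, e.g.
#   with_network_retry { client.insert_object(bucket, body, **opts) }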
60 | raise e 61 | else 62 | if retries < @task['retries'] 63 | retries += 1 64 | Embulk.logger.warn { "embulk-output-bigquery: retry \##{retries}, #{e.class} #{e.message}" } 65 | retry 66 | else 67 | Embulk.logger.error { "embulk-output-bigquery: retry exhausted \##{retries}, #{e.class} #{e.message}" } 68 | raise e 69 | end 70 | end 71 | end 72 | end 73 | end 74 | end 75 | end 76 | end 77 | -------------------------------------------------------------------------------- /lib/embulk/output/bigquery/helper.rb: -------------------------------------------------------------------------------- 1 | require 'digest/md5' 2 | require 'securerandom' 3 | 4 | module Embulk 5 | module Output 6 | class Bigquery < OutputPlugin 7 | class Helper 8 | PARTITION_DECORATOR_REGEXP = /\$.+\z/ 9 | 10 | def self.field_partitioning?(task) 11 | (task['time_partitioning'] || {}).key?('field') 12 | end 13 | 14 | def self.has_partition_decorator?(table_name) 15 | !!(table_name =~ PARTITION_DECORATOR_REGEXP) 16 | end 17 | 18 | def self.chomp_partition_decorator(table_name) 19 | table_name.sub(PARTITION_DECORATOR_REGEXP, '') 20 | end 21 | 22 | def self.bq_type_from_embulk_type(embulk_type) 23 | case embulk_type 24 | when :boolean then 'BOOLEAN' 25 | when :long then 'INTEGER' 26 | when :double then 'FLOAT' 27 | when :string then 'STRING' 28 | when :timestamp then 'TIMESTAMP' 29 | when :json then 'STRING' # NOTE: Default is not RECORD since it requires `fields` 30 | else raise ArgumentError, "embulk type #{embulk_type} is not supported" 31 | end 32 | end 33 | 34 | # @return [Hash] name => column_option. 35 | # ToDo: recursively map fields? 36 | def self.column_options_map(column_options) 37 | (column_options || {}).map do |column_option| 38 | [column_option['name'], column_option] 39 | end.to_h 40 | end 41 | 42 | def self.fields_from_embulk_schema(task, schema) 43 | column_options_map = self.column_options_map(task['column_options']) 44 | schema.map do |column| 45 | column_name = column[:name] 46 | embulk_type = column[:type] 47 | column_option = column_options_map[column_name] || {} 48 | {}.tap do |field| 49 | field[:name] = column_name 50 | field[:type] = (column_option['type'] || bq_type_from_embulk_type(embulk_type)).upcase 51 | field[:mode] = column_option['mode'] if column_option['mode'] 52 | field[:fields] = deep_symbolize_keys(column_option['fields']) if column_option['fields'] 53 | field[:description] = column_option['description'] if column_option['description'] 54 | end 55 | end 56 | end 57 | 58 | def self.deep_symbolize_keys(obj) 59 | if obj.is_a?(Hash) 60 | obj.inject({}) do |options, (key, value)| 61 | options[(key.to_sym rescue key) || key] = deep_symbolize_keys(value) 62 | options 63 | end 64 | elsif obj.is_a?(Array) 65 | obj.map {|value| deep_symbolize_keys(value) } 66 | else 67 | obj 68 | end 69 | end 70 | 71 | def self.create_load_job_id(task, path, fields) 72 | elements = [ 73 | Digest::MD5.file(path).hexdigest, 74 | task['dataset'], 75 | task['location'], 76 | task['table'], 77 | fields, 78 | task['source_format'], 79 | task['max_bad_records'], 80 | task['field_delimiter'], 81 | task['encoding'], 82 | task['ignore_unknown_values'], 83 | task['allow_quoted_newlines'], 84 | ] 85 | 86 | str = elements.map(&:to_s).join('') 87 | md5 = Digest::MD5.hexdigest(str) 88 | "embulk_load_job_#{md5}" 89 | end 90 | end 91 | end 92 | end 93 | end 94 | -------------------------------------------------------------------------------- /lib/embulk/output/bigquery/value_converter_factory.rb: 
-------------------------------------------------------------------------------- 1 | require 'time' 2 | require 'time_with_zone' 3 | require 'json' 4 | require_relative 'helper' 5 | 6 | module Embulk 7 | module Output 8 | class Bigquery < OutputPlugin 9 | class ValueConverterFactory 10 | class NotSupportedType < StandardError; end 11 | class TypeCastError < StandardError; end 12 | 13 | # ref. https://cloud.google.com/bigquery/preparing-data-for-bigquery 14 | 15 | DEFAULT_TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S.%6N" # BigQuery timestamp format 16 | DEFAULT_TIMEZONE = "UTC" 17 | 18 | # @param [Hash] task 19 | # @option task [String] default_timestamp_format 20 | # @option task [String] default_timezone 21 | # @option task [Hash] column_options user defined column types 22 | # @param [Schema] schema embulk defined column types 23 | # @return [Array] an arary whose key is column_index, and value is its converter (Proc) 24 | def self.create_converters(task, schema) 25 | column_options_map = Helper.column_options_map(task['column_options']) 26 | default_timestamp_format = task['default_timestamp_format'] || DEFAULT_TIMESTAMP_FORMAT 27 | default_timezone = task['default_timezone'] || DEFAULT_TIMEZONE 28 | schema.map do |column| 29 | column_name = column[:name] 30 | embulk_type = column[:type] 31 | column_option = column_options_map[column_name] || {} 32 | self.new( 33 | embulk_type, column_option['type'], 34 | timestamp_format: column_option['timestamp_format'], 35 | timezone: column_option['timezone'], 36 | strict: column_option['strict'], 37 | default_timestamp_format: default_timestamp_format, 38 | default_timezone: default_timezone, 39 | ).create_converter 40 | end 41 | end 42 | 43 | attr_reader :embulk_type, :type, :timestamp_format, :timezone, :zone_offset, :strict 44 | 45 | def initialize( 46 | embulk_type, type = nil, 47 | timestamp_format: nil, timezone: nil, strict: nil, 48 | default_timestamp_format: DEFAULT_TIMESTAMP_FORMAT, 49 | default_timezone: DEFAULT_TIMEZONE 50 | ) 51 | @embulk_type = embulk_type 52 | @type = (type || Helper.bq_type_from_embulk_type(embulk_type)).upcase 53 | @timestamp_format = timestamp_format 54 | @default_timestamp_format = default_timestamp_format 55 | @timezone = timezone || default_timezone 56 | @zone_offset = TimeWithZone.zone_offset(@timezone) 57 | @strict = strict.nil? ? true : strict 58 | end 59 | 60 | def create_converter 61 | case embulk_type 62 | when :boolean then boolean_converter 63 | when :long then long_converter 64 | when :double then double_converter 65 | when :string then string_converter 66 | when :timestamp then timestamp_converter 67 | when :json then json_converter 68 | else raise NotSupportedType, "embulk type #{embulk_type} is not supported" 69 | end 70 | end 71 | 72 | def with_typecast_error(val) 73 | begin 74 | yield(val) 75 | rescue => e 76 | raise_typecast_error(val) 77 | end 78 | end 79 | 80 | def raise_typecast_error(val) 81 | message = "cannot cast #{@embulk_type} `#{val}` to #{@type}" 82 | if @strict 83 | raise TypeCastError, message 84 | else 85 | Embulk.logger.trace { message } 86 | return nil 87 | end 88 | end 89 | 90 | def boolean_converter 91 | case type 92 | when 'BOOLEAN' 93 | Proc.new {|val| 94 | val 95 | } 96 | when 'STRING' 97 | Proc.new {|val| 98 | next nil if val.nil? 99 | val.to_s 100 | } 101 | else 102 | raise NotSupportedType, "cannot take column type #{type} for boolean column" 103 | end 104 | end 105 | 106 | def long_converter 107 | case type 108 | when 'BOOLEAN' 109 | Proc.new {|val| 110 | next nil if val.nil? 
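# Only 0 and 1 are accepted as boolean values for a long column; any other
# integer falls through to raise_typecast_error, which raises in strict mode
# or yields nil otherwise.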
111 | next true if val == 1 112 | next false if val == 0 113 | raise_typecast_error(val) 114 | } 115 | when 'INTEGER' 116 | Proc.new {|val| 117 | val 118 | } 119 | when 'FLOAT' 120 | Proc.new {|val| 121 | next nil if val.nil? 122 | val.to_f 123 | } 124 | when 'STRING' 125 | Proc.new {|val| 126 | next nil if val.nil? 127 | val.to_s 128 | } 129 | when 'TIMESTAMP' 130 | Proc.new {|val| 131 | next nil if val.nil? 132 | val # BigQuery supports UNIX timestamp 133 | } 134 | else 135 | raise NotSupportedType, "cannot take column type #{type} for long column" 136 | end 137 | end 138 | 139 | def double_converter 140 | case type 141 | when 'INTEGER' 142 | Proc.new {|val| 143 | next nil if val.nil? 144 | val.to_i 145 | } 146 | when 'FLOAT' 147 | Proc.new {|val| 148 | val 149 | } 150 | when 'STRING' 151 | Proc.new {|val| 152 | next nil if val.nil? 153 | val.to_s 154 | } 155 | when 'TIMESTAMP' 156 | Proc.new {|val| 157 | next nil if val.nil? 158 | val # BigQuery supports UNIX timestamp 159 | } 160 | else 161 | raise NotSupportedType, "cannot take column type #{type} for double column" 162 | end 163 | end 164 | 165 | def string_converter 166 | case type 167 | when 'BOOLEAN' 168 | Proc.new {|val| 169 | next nil if val.nil? 170 | next true if val == 'true'.freeze 171 | next false if val == 'false'.freeze 172 | raise_typecast_error(val) 173 | } 174 | when 'INTEGER' 175 | Proc.new {|val| 176 | next nil if val.nil? 177 | with_typecast_error(val) do |val| 178 | Integer(val) 179 | end 180 | } 181 | when 'FLOAT' 182 | Proc.new {|val| 183 | next nil if val.nil? 184 | with_typecast_error(val) do |val| 185 | Float(val) 186 | end 187 | } 188 | when 'STRING' 189 | Proc.new {|val| 190 | val 191 | } 192 | when 'TIMESTAMP' 193 | if @timestamp_format 194 | Proc.new {|val| 195 | next nil if val.nil? 196 | with_typecast_error(val) do |val| 197 | TimeWithZone.set_zone_offset(Time.strptime(val, @timestamp_format), zone_offset).strftime("%Y-%m-%d %H:%M:%S.%6N %:z") 198 | end 199 | } 200 | else 201 | Proc.new {|val| 202 | next nil if val.nil? 203 | val # Users must care of BQ timestamp format 204 | } 205 | end 206 | when 'DATE' 207 | Proc.new {|val| 208 | next nil if val.nil? 209 | with_typecast_error(val) do |val| 210 | TimeWithZone.set_zone_offset(Time.parse(val), zone_offset).strftime("%Y-%m-%d") 211 | end 212 | } 213 | when 'DATETIME' 214 | if @timestamp_format 215 | Proc.new {|val| 216 | next nil if val.nil? 217 | with_typecast_error(val) do |val| 218 | Time.strptime(val, @timestamp_format).strftime("%Y-%m-%d %H:%M:%S.%6N") 219 | end 220 | } 221 | else 222 | Proc.new {|val| 223 | next nil if val.nil? 224 | val # Users must care of BQ timestamp format 225 | } 226 | end 227 | when 'TIME' 228 | # TimeWithZone doesn't affect any change to the time value 229 | Proc.new {|val| 230 | next nil if val.nil? 231 | with_typecast_error(val) do |val| 232 | TimeWithZone.set_zone_offset(Time.parse(val), zone_offset).strftime("%H:%M:%S.%6N") 233 | end 234 | } 235 | when 'RECORD' 236 | Proc.new {|val| 237 | next nil if val.nil? 238 | with_typecast_error(val) do |val| 239 | JSON.parse(val) 240 | end 241 | } 242 | else 243 | raise NotSupportedType, "cannot take column type #{type} for string column" 244 | end 245 | end 246 | 247 | def timestamp_converter 248 | case type 249 | when 'INTEGER' 250 | Proc.new {|val| 251 | next nil if val.nil? 252 | val.to_i 253 | } 254 | when 'FLOAT' 255 | Proc.new {|val| 256 | next nil if val.nil? 
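# Time#to_f yields epoch seconds with sub-second precision, i.e. the UNIX
# timestamp form noted in the converters above.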
257 | val.to_f 258 | } 259 | when 'STRING' 260 | _timestamp_format = @timestamp_format || @default_timestamp_format 261 | Proc.new {|val| 262 | next nil if val.nil? 263 | with_typecast_error(val) do |val| 264 | val.localtime(zone_offset).strftime(_timestamp_format) 265 | end 266 | } 267 | when 'TIMESTAMP' 268 | Proc.new {|val| 269 | next nil if val.nil? 270 | val.strftime("%Y-%m-%d %H:%M:%S.%6N %:z") 271 | } 272 | when 'DATE' 273 | Proc.new {|val| 274 | next nil if val.nil? 275 | val.localtime(zone_offset).strftime("%Y-%m-%d") 276 | } 277 | when 'DATETIME' 278 | Proc.new {|val| 279 | next nil if val.nil? 280 | val.localtime(zone_offset).strftime("%Y-%m-%d %H:%M:%S.%6N") 281 | } 282 | when 'TIME' 283 | Proc.new {|val| 284 | next nil if val.nil? 285 | val.localtime(zone_offset).strftime("%H:%M:%S.%6N") 286 | } 287 | else 288 | raise NotSupportedType, "cannot take column type #{type} for timestamp column" 289 | end 290 | end 291 | 292 | # ToDo: recursive conversion 293 | def json_converter 294 | case type 295 | when 'STRING' 296 | Proc.new {|val| 297 | next nil if val.nil? 298 | val.to_json 299 | } 300 | when 'RECORD' 301 | Proc.new {|val| 302 | val 303 | } 304 | when 'JSON' 305 | Proc.new {|val| 306 | val 307 | } 308 | else 309 | raise NotSupportedType, "cannot take column type #{type} for json column" 310 | end 311 | end 312 | end 313 | end 314 | end 315 | end 316 | -------------------------------------------------------------------------------- /test/helper.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'bundler/setup' 4 | require 'test/unit' 5 | require 'test/unit/rr' 6 | 7 | # Embulk 0.10.x introduced new bootstrap mechanism. 8 | # https://github.com/embulk/embulk/blob/641f35fec064cca7b1a7314d634a4b64ef8637f1/embulk-ruby/test/vanilla/run-test.rb#L8-L13 9 | static_initializer = Java::org.embulk.EmbulkDependencyClassLoader.staticInitializer().useSelfContainedJarFiles() 10 | static_initializer.java_send :initialize 11 | 12 | require 'embulk/java/bootstrap' 13 | require 'embulk' 14 | 15 | Embulk.logger = Embulk::Logger.new('/dev/null') 16 | 17 | APP_ROOT = File.expand_path('../', __dir__) 18 | EXAMPLE_ROOT = File.expand_path('../example', __dir__) 19 | TEST_ROOT = File.expand_path(File.dirname(__FILE__)) 20 | JSON_KEYFILE = File.join(EXAMPLE_ROOT, 'your-project-000.json') 21 | -------------------------------------------------------------------------------- /test/test_bigquery_client.rb: -------------------------------------------------------------------------------- 1 | require_relative './helper' 2 | require 'embulk/output/bigquery/bigquery_client' 3 | require 'csv' 4 | 5 | # 1. Prepare example/your-project-000.json 6 | # 2. bunlde exec ruby test/test_bigquery_client.rb 7 | 8 | unless File.exist?(JSON_KEYFILE) 9 | puts "#{JSON_KEYFILE} is not found. 
Skip test/test_bigquery_client.rb" 10 | else 11 | module Embulk 12 | class Output::Bigquery 13 | class TestBigqueryClient < Test::Unit::TestCase 14 | class << self 15 | def startup 16 | FileUtils.mkdir_p('tmp') 17 | end 18 | 19 | def shutdown 20 | FileUtils.rm_rf('tmp') 21 | end 22 | end 23 | 24 | def client(task = {}) 25 | task = least_task.merge(task) 26 | BigqueryClient.new(task, schema) 27 | end 28 | 29 | def least_task 30 | { 31 | 'project' => JSON.parse(File.read(JSON_KEYFILE))['project_id'], 32 | 'destination_project' => JSON.parse(File.read(JSON_KEYFILE))['project_id'], 33 | 'dataset' => 'your_dataset_name', 34 | 'table' => 'your_table_name', 35 | 'auth_method' => 'json_key', 36 | 'json_keyfile' => File.read(JSON_KEYFILE), 37 | 'retries' => 3, 38 | 'timeout_sec' => 300, 39 | 'open_timeout_sec' => 300, 40 | 'job_status_max_polling_time' => 3600, 41 | 'job_status_polling_interval' => 10, 42 | 'source_format' => 'CSV' 43 | } 44 | end 45 | 46 | def schema 47 | Schema.new([ 48 | Column.new({index: 0, name: 'boolean', type: :boolean}), 49 | Column.new({index: 1, name: 'long', type: :long}), 50 | Column.new({index: 2, name: 'double', type: :double}), 51 | Column.new({index: 3, name: 'string', type: :string}), 52 | Column.new({index: 4, name: 'timestamp', type: :timestamp}), 53 | Column.new({index: 5, name: 'json', type: :json}), 54 | ]) 55 | end 56 | 57 | def record 58 | [true,1,1.1,'1',Time.parse("2016-02-26 +00:00"),'{"foo":"bar"}'] 59 | end 60 | 61 | sub_test_case "client" do 62 | def test_json_keyfile 63 | assert_nothing_raised { BigqueryClient.new(least_task, schema).client } 64 | end 65 | end 66 | 67 | sub_test_case "create_dataset" do 68 | def test_create_dataset 69 | assert_nothing_raised { client.create_dataset } 70 | end 71 | 72 | def test_create_dataset_with_reference 73 | response = client.get_dataset 74 | any_instance_of(BigqueryClient) do |obj| 75 | mock(obj).get_dataset('your_dataset_name') { response } 76 | end 77 | assert_nothing_raised do 78 | client.create_dataset('your_dataset_name_old', reference: 'your_dataset_name') 79 | end 80 | end 81 | end 82 | 83 | sub_test_case "get_dataset" do 84 | def test_get_dataset 85 | assert_nothing_raised { client.create_dataset } 86 | assert_nothing_raised { client.get_dataset } 87 | end 88 | 89 | def test_get_dataset_not_found 90 | assert_raise(NotFoundError) { 91 | client.get_dataset('something_does_not_exist') 92 | } 93 | end 94 | end 95 | 96 | sub_test_case "create_table_if_not_exists" do 97 | def test_create_table_if_not_exists 98 | client.delete_table('your_table_name') 99 | assert_nothing_raised { client.create_table_if_not_exists('your_table_name') } 100 | end 101 | 102 | def test_create_table_if_not_exists_already_exists 103 | assert_nothing_raised { client.create_table_if_not_exists('your_table_name') } 104 | end 105 | 106 | def test_create_partitioned_table 107 | client.delete_table('your_table_name') 108 | assert_nothing_raised do 109 | client.create_table_if_not_exists('your_table_name$20160929', options:{ 110 | 'time_partitioning' => {'type'=>'DAY', 'expiration_ms'=>1000} 111 | }) 112 | end 113 | end 114 | end 115 | 116 | sub_test_case "delete_table" do 117 | def test_delete_table 118 | client.create_table_if_not_exists('your_table_name') 119 | assert_nothing_raised { client.delete_table('your_table_name') } 120 | end 121 | 122 | def test_delete_table_not_found 123 | assert_nothing_raised { client.delete_table('your_table_name') } 124 | end 125 | 126 | def test_delete_partitioned_table 127 | 
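# delete_table chomps the partition decorator before calling the API, so
# deleting 'your_table_name$20160929' succeeds even though the table created
# here is not partitioned.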
client.create_table_if_not_exists('your_table_name') 128 | assert_nothing_raised { client.delete_table('your_table_name$20160929') } 129 | end 130 | end 131 | 132 | sub_test_case "get_table" do 133 | def test_get_table 134 | client.create_table_if_not_exists('your_table_name') 135 | assert_nothing_raised { client.get_table('your_table_name') } 136 | end 137 | 138 | def test_get_table_not_found 139 | client.delete_table('your_table_name') 140 | assert_raise(NotFoundError) { 141 | client.get_table('your_table_name') 142 | } 143 | end 144 | 145 | def test_get_partitioned_table 146 | client.create_table_if_not_exists('your_table_name') 147 | assert_nothing_raised { client.get_table('your_table_name$20160929') } 148 | end 149 | end 150 | 151 | sub_test_case "delete_partition" do 152 | def test_delete_partition 153 | client.delete_table('your_table_name') 154 | client.create_table_if_not_exists('your_table_name$20160929') 155 | assert_nothing_raised { client.delete_partition('your_table_name$20160929') } 156 | ensure 157 | client.delete_table('your_table_name') 158 | end 159 | 160 | def test_delete_partition_of_non_partitioned_table 161 | client.delete_table('your_table_name') 162 | client.create_table_if_not_exists('your_table_name') 163 | assert_raise { client.delete_partition('your_table_name$20160929') } 164 | ensure 165 | client.delete_table('your_table_name') 166 | end 167 | 168 | def test_delete_partition_table_not_found 169 | assert_nothing_raised { client.delete_partition('your_table_name$20160929') } 170 | end 171 | end 172 | 173 | sub_test_case "fields" do 174 | def test_fields_from_table 175 | client.create_table_if_not_exists('your_table_name') 176 | fields = client.fields_from_table('your_table_name') 177 | expected = [ 178 | {:type=>"BOOLEAN", :name=>"boolean"}, 179 | {:type=>"INTEGER", :name=>"long"}, 180 | {:type=>"FLOAT", :name=>"double"}, 181 | {:type=>"STRING", :name=>"string"}, 182 | {:type=>"TIMESTAMP", :name=>"timestamp"}, 183 | {:type=>"STRING", :name=>"json"}, 184 | ] 185 | assert_equal expected, fields 186 | end 187 | end 188 | 189 | sub_test_case "copy" do 190 | def test_create_table_if_not_exists 191 | client.create_table_if_not_exists('your_table_name') 192 | assert_nothing_raised { client.copy('your_table_name', 'your_table_name_old') } 193 | end 194 | end 195 | 196 | sub_test_case "load" do 197 | def test_load 198 | client.create_table_if_not_exists('your_table_name') 199 | File.write("tmp/your_file_name.csv", record.to_csv) 200 | assert_nothing_raised { client.load("/tmp/your_file_name.csv", 'your_table_name') } 201 | end 202 | end 203 | end 204 | end 205 | end 206 | end 207 | -------------------------------------------------------------------------------- /test/test_configure.rb: -------------------------------------------------------------------------------- 1 | require_relative './helper' 2 | require 'embulk/output/bigquery' 3 | 4 | Bigquery = Embulk::Output::Bigquery unless defined?(Bigquery) 5 | 6 | module Embulk 7 | class Output::Bigquery 8 | class TestConfigure < Test::Unit::TestCase 9 | class << self 10 | def startup 11 | FileUtils.mkdir_p('tmp') 12 | end 13 | 14 | def shutdown 15 | FileUtils.rm_rf('tmp') 16 | end 17 | end 18 | 19 | def least_config 20 | DataSource.new({ 21 | 'project' => 'your_project_name', 22 | 'dataset' => 'your_dataset_name', 23 | 'table' => 'your_table_name', 24 | }) 25 | end 26 | 27 | def schema 28 | Schema.new([ 29 | Column.new({index: 0, name: 'boolean', type: :boolean}), 30 | Column.new({index: 1, name: 'long', type: :long}), 31 
| Column.new({index: 2, name: 'double', type: :double}), 32 | Column.new({index: 3, name: 'string', type: :string}), 33 | Column.new({index: 4, name: 'timestamp', type: :timestamp}), 34 | Column.new({index: 5, name: 'json', type: :json}), 35 | ]) 36 | end 37 | 38 | def processor_count 39 | 1 40 | end 41 | 42 | def test_configure_default 43 | task = Bigquery.configure(least_config, schema, processor_count) 44 | assert_equal "append", task['mode'] 45 | assert_equal "application_default", task['auth_method'] 46 | assert_equal nil, task['json_keyfile'] 47 | assert_equal "your_project_name", task['project'] 48 | assert_equal "your_project_name", task['destination_project'] 49 | assert_equal "your_dataset_name", task['dataset'] 50 | assert_equal nil, task['location'] 51 | assert_equal "your_table_name", task['table'] 52 | assert_equal nil, task['dataset_old'] 53 | assert_equal nil, task['table_old'] 54 | assert_equal nil, task['table_name_old'] 55 | assert_equal false, task['auto_create_dataset'] 56 | assert_equal true, task['auto_create_table'] 57 | assert_equal nil, task['schema_file'] 58 | assert_equal nil, task['template_table'] 59 | assert_equal true, task['delete_from_local_when_job_end'] 60 | assert_equal 3600, task['job_status_max_polling_time'] 61 | assert_equal 10, task['job_status_polling_interval'] 62 | assert_equal false, task['is_skip_job_result_check'] 63 | assert_equal false, task['with_rehearsal'] 64 | assert_equal 1000, task['rehearsal_counts'] 65 | assert_equal [], task['column_options'] 66 | assert_equal "UTC", task['default_timezone'] 67 | assert_equal "%Y-%m-%d %H:%M:%S.%6N", task['default_timestamp_format'] 68 | assert_equal nil, task['payload_column'] 69 | assert_equal nil, task['payload_column_index'] 70 | assert_equal 5, task['retries'] 71 | assert_equal "Embulk BigQuery plugin", task['application_name'] 72 | # assert_equal "/tmp/embulk_output_bigquery_20160228-27184-pubcn0", task['path_prefix'] 73 | assert_equal ".%d.%d", task['sequence_format'] 74 | assert_equal ".csv", task['file_ext'] 75 | assert_equal false, task['skip_file_generation'] 76 | assert_equal "NONE", task['compression'] 77 | assert_equal "CSV", task['source_format'] 78 | assert_equal 0, task['max_bad_records'] 79 | assert_equal ",", task['field_delimiter'] 80 | assert_equal "UTF-8", task['encoding'] 81 | assert_equal false, task['ignore_unknown_values'] 82 | assert_equal false, task['allow_quoted_newlines'] 83 | assert_equal nil, task['time_partitioning'] 84 | assert_equal nil, task['clustering'] 85 | assert_equal false, task['skip_load'] 86 | end 87 | 88 | def test_mode 89 | config = least_config.merge('mode' => 'foobar') 90 | assert_raise { Bigquery.configure(config, schema, processor_count) } 91 | 92 | config = least_config.merge('mode' => 'append') 93 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 94 | 95 | config = least_config.merge('mode' => 'replace') 96 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 97 | 98 | config = least_config.merge('mode' => 'delete_in_advance') 99 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 100 | 101 | config = least_config.merge('mode' => 'replace_backup') 102 | assert_raise { Bigquery.configure(config, schema, processor_count) } 103 | end 104 | 105 | def test_location 106 | config = least_config.merge('location' => 'us') 107 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 108 | 109 | config = least_config.merge('location' => 'eu') 110 | 
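# Multi-region locations such as 'eu' are accepted the same way as single
# regions like 'asia-northeast1' below.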
assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 111 | 112 | config = least_config.merge('location' => 'asia-northeast1') 113 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 114 | end 115 | 116 | def test_dataset_table_old 117 | task = nil 118 | config = least_config.merge('mode' => 'replace_backup', 'table_old' => 'backup') 119 | assert_nothing_raised { task = Bigquery.configure(config, schema, processor_count) } 120 | assert_equal task['dataset_old'], task['dataset'] 121 | assert_equal task['table_old'], 'backup' 122 | 123 | config = least_config.merge('mode' => 'replace_backup', 'dataset_old' => 'backup') 124 | assert_nothing_raised { task = Bigquery.configure(config, schema, processor_count) } 125 | assert_equal task['dataset_old'], 'backup' 126 | assert_equal task['table_old'], task['table'] 127 | end 128 | 129 | def test_auth_method 130 | config = least_config.merge('auth_method' => 'foobar') 131 | assert_raise { Bigquery.configure(config, schema, processor_count) } 132 | 133 | config = least_config.merge('auth_method' => 'json_key').tap {|h| h.delete('json_keyfile') } 134 | assert_raise { Bigquery.configure(config, schema, processor_count) } 135 | config = least_config.merge('auth_method' => 'json_key', 'json_keyfile' => "#{EXAMPLE_ROOT}/json_key.json") 136 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 137 | 138 | config = least_config.merge('auth_method' => 'compute_engine') 139 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 140 | end 141 | 142 | def test_json_keyfile 143 | json_keyfile = "#{EXAMPLE_ROOT}/json_key.json" 144 | config = least_config.merge('auth_method' => 'json_key', 'json_keyfile' => json_keyfile).tap {|h| h.delete('project') } 145 | task = Bigquery.configure(config, schema, processor_count) 146 | assert_not_equal nil, task['project'] # project is obtained from json_keyfile if available 147 | 148 | config = least_config.merge('auth_method' => 'json_key', 'json_keyfile' => { 'content' => File.read(json_keyfile) }).tap {|h| h.delete('project') } 149 | task = Bigquery.configure(config, schema, processor_count) 150 | assert_not_equal nil, task['project'] # project is obtained from json_keyfile if available 151 | 152 | config = least_config.merge('auth_method' => 'json_key', 'json_keyfile' => { 'content' => 'not a json' }) 153 | assert_raise { Bigquery.configure(config, schema, processor_count) } 154 | end 155 | 156 | def test_payload_column 157 | config = least_config.merge('payload_column' => schema.first.name, 'auto_create_table' => false, 'mode' => 'append_direct') 158 | task = Bigquery.configure(config, schema, processor_count) 159 | assert_equal task['payload_column_index'], 0 160 | 161 | config = least_config.merge('payload_column' => 'not_exist', 'auto_create_table' => false, 'mode' => 'append_direct') 162 | assert_raise { Bigquery.configure(config, schema, processor_count) } 163 | end 164 | 165 | def test_payload_column_index 166 | config = least_config.merge('payload_column_index' => 0, 'auto_create_table' => false, 'mode' => 'append_direct') 167 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 168 | 169 | config = least_config.merge('payload_column_index' => -1, 'auto_create_table' => false, 'mode' => 'append_direct') 170 | assert_raise { Bigquery.configure(config, schema, processor_count) } 171 | 172 | config = least_config.merge('payload_column_index' => schema.size, 'auto_create_table' => false, 
'mode' => 'append_direct') 173 | assert_raise { Bigquery.configure(config, schema, processor_count) } 174 | end 175 | 176 | def test_auto_create_table_with_payload_column 177 | config = least_config.merge('auto_create_table' => true, 'payload_column' => 'json') 178 | assert_raise { Bigquery.configure(config, schema, processor_count) } 179 | 180 | config = least_config.merge('auto_create_table' => true, 'payload_column' => 'json', 'schema_file' => "#{EXAMPLE_ROOT}/schema.json") 181 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 182 | 183 | config = least_config.merge('auto_create_table' => true, 'payload_column' => 'json', 'template_table' => 'foo') 184 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 185 | end 186 | 187 | def test_auto_create_table_with_payload_column_index 188 | config = least_config.merge('auto_create_table' => true, 'payload_column_index' => 0) 189 | assert_raise { Bigquery.configure(config, schema, processor_count) } 190 | 191 | config = least_config.merge('auto_create_table' => true, 'payload_column_index' => 0, 'schema_file' => "#{EXAMPLE_ROOT}/schema.json") 192 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 193 | 194 | config = least_config.merge('auto_create_table' => true, 'payload_column_index' => 0, 'template_table' => 'foo') 195 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 196 | end 197 | 198 | def test_schema_file 199 | config = least_config.merge('schema_file' => "#{EXAMPLE_ROOT}/schema.json") 200 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 201 | 202 | config = least_config.merge('schema_file' => "not_found.json") 203 | assert_raise { Bigquery.configure(config, schema, processor_count) } 204 | 205 | File.write("tmp/bad_schema.json", "not_a_json") 206 | config = least_config.merge('schema_file' => "tmp/bad_schema.json") 207 | assert_raise { Bigquery.configure(config, schema, processor_count) } 208 | end 209 | 210 | def test_source_format 211 | config = least_config.merge('source_format' => 'csv') 212 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 213 | 214 | config = least_config.merge('source_format' => 'jsonl') 215 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 216 | 217 | config = least_config.merge('source_format' => 'newline_delimited_json') 218 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 219 | 220 | config = least_config.merge('source_format' => 'foobar') 221 | assert_raise { Bigquery.configure(config, schema, processor_count) } 222 | end 223 | 224 | def test_compression 225 | config = least_config.merge('compression' => 'gzip') 226 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 227 | 228 | config = least_config.merge('compression' => 'none') 229 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 230 | 231 | config = least_config.merge('compression' => 'foobar') 232 | assert_raise { Bigquery.configure(config, schema, processor_count) } 233 | end 234 | 235 | def test_file_ext 236 | config = least_config.merge('source_format' => 'csv', 'compression' => 'gzip') 237 | task = Bigquery.configure(config, schema, processor_count) 238 | assert_equal '.csv.gz', task['file_ext'] 239 | 240 | config = least_config.merge('source_format' => 'NEWLINE_DELIMITED_JSON', 'compression' => 'gzip') 241 | task = Bigquery.configure(config, schema, 
processor_count) 242 | assert_equal '.jsonl.gz', task['file_ext'] 243 | 244 | config = least_config.merge('source_format' => 'csv', 'compression' => 'none') 245 | task = Bigquery.configure(config, schema, processor_count) 246 | assert_equal '.csv', task['file_ext'] 247 | 248 | config = least_config.merge('source_format' => 'NEWLINE_DELIMITED_JSON', 'compression' => 'none') 249 | task = Bigquery.configure(config, schema, processor_count) 250 | assert_equal '.jsonl', task['file_ext'] 251 | 252 | config = least_config.merge('file_ext' => '.foo') 253 | task = Bigquery.configure(config, schema, processor_count) 254 | assert_equal '.foo', task['file_ext'] 255 | end 256 | 257 | def test_time_partitioning 258 | config = least_config.merge('time_partitioning' => {'type' => 'DAY'}) 259 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 260 | 261 | config = least_config.merge('time_partitioning' => {'foo' => 'bar'}) 262 | assert_raise { Bigquery.configure(config, schema, processor_count) } 263 | 264 | config = least_config.merge('table' => 'table') 265 | task = Bigquery.configure(config, schema, processor_count) 266 | assert_equal nil, task['time_partitioning'] 267 | 268 | config = least_config.merge('table' => 'table_name$20160912') 269 | task = Bigquery.configure(config, schema, processor_count) 270 | assert_equal 'DAY', task['time_partitioning']['type'] 271 | end 272 | 273 | def test_range_partitioning 274 | config = least_config.merge('range_partitioning' => {'field' => 'foo', 'range' => { 'start' => 1, 'end' => 3, 'interval' => 1 }}) 275 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 276 | 277 | # field is required 278 | config = least_config.merge('range_partitioning' => {'range' => { 'start' => 1, 'end' => 2, 'interval' => 1 }}) 279 | assert_raise { Bigquery.configure(config, schema, processor_count) } 280 | 281 | 282 | # range is required 283 | config = least_config.merge('range_partitioning' => {'field' => 'foo'}) 284 | assert_raise { Bigquery.configure(config, schema, processor_count) } 285 | 286 | # range.start is required 287 | config = least_config.merge('range_partitioning' => {'field' => 'foo', 'range' => { 'end' => 2, 'interval' => 1 }}) 288 | assert_raise { Bigquery.configure(config, schema, processor_count) } 289 | 290 | # range.end is required 291 | config = least_config.merge('range_partitioning' => {'field' => 'foo', 'range' => { 'start' => 1, 'interval' => 1 }}) 292 | assert_raise { Bigquery.configure(config, schema, processor_count) } 293 | 294 | # range.interval is required 295 | config = least_config.merge('range_partitioning' => {'field' => 'foo', 'range' => { 'start' => 1, 'end' => 2 }}) 296 | assert_raise { Bigquery.configure(config, schema, processor_count) } 297 | 298 | # range.start + range.interval should be less than range.end 299 | config = least_config.merge('range_partitioning' => {'field' => 'foo', 'range' => { 'start' => 1, 'end' => 2, 'interval' => 2 }}) 300 | assert_raise { Bigquery.configure(config, schema, processor_count) } 301 | end 302 | 303 | def test_time_and_range_partitioning_error 304 | config = least_config.merge('time_partitioning' => {'type' => 'DAY'}, 'range_partitioning' => {'field' => 'foo', 'range' => { 'start' => 1, 'end' => 2, 'interval' => 1 }}) 305 | assert_raise { Bigquery.configure(config, schema, processor_count) } 306 | 307 | config = least_config.merge('table' => 'table_name$20160912', 'range_partitioning' => {'field' => 'foo', 'range' => { 'start' => 1, 'end' => 2, 
'interval' => 1 }}) 308 | assert_raise { Bigquery.configure(config, schema, processor_count) } 309 | end 310 | 311 | def test_clustering 312 | config = least_config.merge('clustering' => {'fields' => ['field_a']}) 313 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 314 | 315 | config = least_config.merge('clustering' => {}) 316 | assert_raise { Bigquery.configure(config, schema, processor_count) } 317 | end 318 | 319 | def test_schema_update_options 320 | config = least_config.merge('schema_update_options' => ['ALLOW_FIELD_ADDITION', 'ALLOW_FIELD_RELAXATION']) 321 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 322 | 323 | config = least_config.merge('schema_update_options' => ['FOO']) 324 | assert_raise { Bigquery.configure(config, schema, processor_count) } 325 | end 326 | 327 | def test_destination_project 328 | config = least_config.merge('destination_project' => 'your_destination_project_name') 329 | task = Bigquery.configure(config, schema, processor_count) 330 | 331 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 332 | assert_equal 'your_destination_project_name', task['destination_project'] 333 | assert_equal 'your_project_name', task['project'] 334 | end 335 | 336 | end 337 | end 338 | end 339 | -------------------------------------------------------------------------------- /test/test_example.rb: -------------------------------------------------------------------------------- 1 | require_relative './helper' 2 | 3 | # 1. Prepare example/your-project-000.json 4 | # 2. embulk bundle 5 | # 3. bundle exec ruby test/test_example.rb 6 | 7 | unless File.exist?(JSON_KEYFILE) 8 | puts "#{JSON_KEYFILE} is not found. Skip test/test_example.rb" 9 | else 10 | class TestExample < Test::Unit::TestCase 11 | def embulk_path 12 | if File.exist?("#{ENV['HOME']}/.embulk/bin/embulk") 13 | "#{ENV['HOME']}/.embulk/bin/embulk" 14 | elsif File.exist?("#{ENV['PWD']}/embulk.jar") 15 | "#{ENV['PWD']}/embulk.jar" 16 | elsif File.exist?("/usr/local/bin/embulk") 17 | "/usr/local/bin/embulk" 18 | else 19 | "embulk" 20 | end 21 | end 22 | 23 | def embulk_run(config_path) 24 | ::Bundler.with_clean_env do 25 | cmd = "#{embulk_path} run -X page_size=1 -b . 
-l trace #{config_path}" 26 | puts "=" * 64 27 | puts cmd 28 | system(cmd) 29 | end 30 | end 31 | 32 | files = Dir.glob("#{APP_ROOT}/example/config_*.yml").reject {|file| File.symlink?(file) }.sort 33 | files.each do |config_path| 34 | if %w[ 35 | config_expose_errors.yml 36 | ].include?(File.basename(config_path)) 37 | define_method(:"test_#{File.basename(config_path, ".yml")}") do 38 | assert_false embulk_run(config_path) 39 | end 40 | else 41 | define_method(:"test_#{File.basename(config_path, ".yml")}") do 42 | assert_true embulk_run(config_path) 43 | end 44 | end 45 | end 46 | end 47 | end 48 | -------------------------------------------------------------------------------- /test/test_file_writer.rb: -------------------------------------------------------------------------------- 1 | require_relative './helper' 2 | require 'embulk/output/bigquery/file_writer' 3 | require 'fileutils' 4 | require 'zlib' 5 | 6 | module Embulk 7 | class Output::Bigquery 8 | class TestFileWriter < Test::Unit::TestCase 9 | class << self 10 | def startup 11 | FileUtils.mkdir_p('tmp') 12 | end 13 | 14 | def shutdown 15 | FileUtils.rm_rf('tmp') 16 | end 17 | end 18 | 19 | def default_task 20 | { 21 | 'compression' => 'GZIP', 22 | 'payload_column' => nil, 23 | 'source_format' => 'CSV', 24 | 'path_prefix' => 'tmp/path_prefix', 25 | 'sequence_format' => '.%d.%03d', 26 | 'file_ext' => nil, 27 | } 28 | end 29 | 30 | def schema 31 | Schema.new([ 32 | Column.new({index: 0, name: 'boolean', type: :boolean}), 33 | Column.new({index: 1, name: 'long', type: :long}), 34 | Column.new({index: 2, name: 'double', type: :double}), 35 | Column.new({index: 3, name: 'string', type: :string}), 36 | Column.new({index: 4, name: 'timestamp', type: :timestamp}), 37 | Column.new({index: 5, name: 'json', type: :json}), 38 | ]) 39 | end 40 | 41 | def converters 42 | @converters ||= ValueConverterFactory.create_converters(default_task, schema) 43 | end 44 | 45 | def record 46 | [true, 1, 1.1, 'foo', Time.parse("2016-02-26 00:00:00 +00:00").utc, {"foo"=>"foo"}] 47 | end 48 | 49 | def page 50 | [record] 51 | end 52 | 53 | sub_test_case "path" do 54 | def test_path 55 | task = default_task.merge('path_prefix' => 'tmp/foo', 'sequence_format' => '', 'file_ext' => '.1') 56 | file_writer = FileWriter.new(task, schema, 0, converters) 57 | 58 | begin 59 | file_writer.add(page) 60 | ensure 61 | io.close rescue nil 62 | end 63 | path = file_writer.io.path 64 | assert_equal 'tmp/foo.1', path 65 | end 66 | end 67 | 68 | sub_test_case "formatter" do 69 | def test_payload_column_index 70 | task = default_task.merge('payload_column_index' => 0) 71 | file_writer = FileWriter.new(task, schema, 0, converters) 72 | formatter_proc = file_writer.instance_variable_get(:@formatter_proc) 73 | assert_equal :to_payload, formatter_proc.name 74 | 75 | assert_equal %Q[true\n], formatter_proc.call(record) 76 | end 77 | 78 | def test_csv 79 | task = default_task.merge('source_format' => 'CSV') 80 | file_writer = FileWriter.new(task, schema, 0, converters) 81 | formatter_proc = file_writer.instance_variable_get(:@formatter_proc) 82 | assert_equal :to_csv, formatter_proc.name 83 | 84 | expected = %Q[true,1,1.1,foo,2016-02-26 00:00:00.000000 +00:00,"{""foo"":""foo""}"\n] 85 | assert_equal expected, formatter_proc.call(record) 86 | end 87 | 88 | def test_jsonl 89 | task = default_task.merge('source_format' => 'NEWLINE_DELIMITED_JSON') 90 | file_writer = FileWriter.new(task, schema, 0, converters) 91 | formatter_proc = file_writer.instance_variable_get(:@formatter_proc) 92 | 
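# source_format 'NEWLINE_DELIMITED_JSON' selects FileWriter#to_jsonl, which
# serializes each record as one JSON object per line.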
assert_equal :to_jsonl, formatter_proc.name 93 | 94 | expected = %Q[{"boolean":true,"long":1,"double":1.1,"string":"foo","timestamp":"2016-02-26 00:00:00.000000 +00:00","json":"{\\"foo\\":\\"foo\\"}"}\n] 95 | assert_equal expected, formatter_proc.call(record) 96 | end 97 | end 98 | 99 | sub_test_case "compression" do 100 | def test_gzip 101 | task = default_task.merge('compression' => 'GZIP') 102 | file_writer = FileWriter.new(task, schema, 0, converters) 103 | 104 | begin 105 | file_writer.add(page) 106 | io = file_writer.io 107 | assert_equal Zlib::GzipWriter, io.class 108 | ensure 109 | io.close rescue nil 110 | end 111 | path = file_writer.io.path 112 | assert_true File.exist?(path) 113 | assert_nothing_raised { Zlib::GzipReader.open(path) {|gz| } } 114 | end 115 | 116 | def test_uncompressed 117 | task = default_task.merge('compression' => 'NONE') 118 | file_writer = FileWriter.new(task, schema, 0, converters) 119 | 120 | begin 121 | file_writer.add(page) 122 | io = file_writer.io 123 | assert_equal File, io.class 124 | ensure 125 | io.close rescue nil 126 | end 127 | path = file_writer.io.path 128 | assert_true File.exist?(path) 129 | assert_raise { Zlib::GzipReader.open(path) {|gz| } } 130 | end 131 | end 132 | end 133 | end 134 | end 135 | -------------------------------------------------------------------------------- /test/test_helper.rb: -------------------------------------------------------------------------------- 1 | require_relative './helper' 2 | require 'embulk/output/bigquery/helper' 3 | 4 | module Embulk 5 | class Output::Bigquery 6 | class TestHelper < Test::Unit::TestCase 7 | class << self 8 | def startup 9 | FileUtils.mkdir_p('tmp') 10 | end 11 | 12 | def shutdown 13 | FileUtils.rm_rf('tmp') 14 | end 15 | end 16 | 17 | def has_partition_decorator? 
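# NOTE: this method (and chomp_partition_decorator below) lacks the `test_`
# prefix, so Test::Unit does not pick it up automatically.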
18 | assert_true Helper.has_partition_decorator?('table$20160929') 19 | assert_false Helper.has_partition_decorator?('table') 20 | end 21 | 22 | def chomp_partition_decorator 23 | assert_equal 'table', Helper.chomp_partition_decorator?('table$20160929') 24 | assert_equal 'table', Helper.chomp_partition_decorator?('table') 25 | end 26 | 27 | def bq_type_from_embulk_type 28 | assert_equal 'BOOLEAN', Helper.bq_type_from_embulk_type(:boolean) 29 | assert_equal 'STRING', Helper.bq_type_from_embulk_type(:string) 30 | assert_equal 'FLOAT', Helper.bq_type_from_embulk_type(:double) 31 | assert_equal 'STRING', Helper.bq_type_from_embulk_type(:string) 32 | assert_equal 'TIMESTAMP', Helper.bq_type_from_embulk_type(:timestamp) 33 | assert_equal 'STRING', Helper.bq_type_from_embulk_type(:json) 34 | end 35 | 36 | sub_test_case "fields_from_embulk_schema" do 37 | def test_fields_from_embulk_schema_without_column_options 38 | schema = Schema.new([ 39 | Column.new({index: 0, name: 'boolean', type: :boolean}), 40 | Column.new({index: 1, name: 'long', type: :long}), 41 | Column.new({index: 2, name: 'double', type: :double}), 42 | Column.new({index: 3, name: 'string', type: :string}), 43 | Column.new({index: 4, name: 'timestamp', type: :timestamp}), 44 | Column.new({index: 5, name: 'json', type: :json}), 45 | ]) 46 | expected = [ 47 | {name: 'boolean', type: 'BOOLEAN'}, 48 | {name: 'long', type: 'INTEGER'}, 49 | {name: 'double', type: 'FLOAT'}, 50 | {name: 'string', type: 'STRING'}, 51 | {name: 'timestamp', type: 'TIMESTAMP'}, 52 | {name: 'json', type: 'STRING'}, 53 | ] 54 | fields = Helper.fields_from_embulk_schema({}, schema) 55 | assert_equal expected, fields 56 | end 57 | 58 | def test_fields_from_embulk_schema_with_column_options 59 | schema = Schema.new([ 60 | Column.new({index: 0, name: 'boolean', type: :boolean}), 61 | Column.new({index: 1, name: 'long', type: :long}), 62 | Column.new({index: 2, name: 'double', type: :double}), 63 | Column.new({index: 3, name: 'string', type: :string}), 64 | Column.new({index: 4, name: 'timestamp', type: :timestamp}), 65 | Column.new({index: 5, name: 'date', type: :timestamp}), 66 | Column.new({index: 6, name: 'datetime', type: :timestamp}), 67 | Column.new({index: 7, name: 'json', type: :json}), 68 | ]) 69 | task = { 70 | 'column_options' => [ 71 | {'name' => 'boolean', 'type' => 'STRING', 'mode' => 'REQUIRED', 'description' => 'hoge'}, 72 | {'name' => 'long', 'type' => 'STRING'}, 73 | {'name' => 'double', 'type' => 'STRING'}, 74 | {'name' => 'string', 'type' => 'INTEGER'}, 75 | {'name' => 'timestamp', 'type' => 'INTEGER'}, 76 | {'name' => 'date', 'type' => 'DATE'}, 77 | {'name' => 'datetime', 'type' => 'DATETIME'}, 78 | {'name' => 'json', 'type' => 'RECORD', 'fields' => [ 79 | { 'name' => 'key1', 'type' => 'STRING' }, 80 | ]}, 81 | ], 82 | } 83 | expected = [ 84 | {name: 'boolean', type: 'STRING', mode: 'REQUIRED', description: 'hoge'}, 85 | {name: 'long', type: 'STRING'}, 86 | {name: 'double', type: 'STRING'}, 87 | {name: 'string', type: 'INTEGER'}, 88 | {name: 'timestamp', type: 'INTEGER'}, 89 | {name: 'date', type: 'DATE'}, 90 | {name: 'datetime', type: 'DATETIME'}, 91 | {name: 'json', type: 'RECORD', fields: [ 92 | {name: 'key1', type: 'STRING'}, 93 | ]}, 94 | ] 95 | fields = Helper.fields_from_embulk_schema(task, schema) 96 | assert_equal expected, fields 97 | end 98 | end 99 | 100 | def test_create_load_job_id 101 | task = { 102 | 'dataset' => 'your_dataset_name', 103 | 'location' => 'asia-northeast1', 104 | 'table' => 'your_table_name', 105 | 'source_format' 
=> 'CSV', 106 | 'max_bad_records' => nil, 107 | 'field_delimiter' => ',', 108 | 'encoding' => 'UTF-8', 109 | 'ignore_unknown_values' => nil, 110 | 'allow_quoted_newlines' => nil, 111 | } 112 | fields = { 113 | name: 'a', type: 'STRING', 114 | } 115 | File.write("tmp/your_file_name", "foobarbaz") 116 | job_id = Helper.create_load_job_id(task, 'tmp/your_file_name', fields) 117 | assert job_id.is_a?(String) 118 | assert_equal 'embulk_load_job_2abaf528b69987db0224e52bbd1f0eec', job_id 119 | end 120 | end 121 | end 122 | end 123 | -------------------------------------------------------------------------------- /test/test_transaction.rb: -------------------------------------------------------------------------------- 1 | require_relative './helper' 2 | require 'embulk/output/bigquery' 3 | 4 | Bigquery = Embulk::Output::Bigquery unless defined?(Bigquery) 5 | 6 | module Embulk 7 | class Output::Bigquery 8 | class TestTransaction < Test::Unit::TestCase 9 | def least_config 10 | DataSource.new({ 11 | 'project' => 'your_project_name', 12 | 'dataset' => 'your_dataset_name', 13 | 'table' => 'your_table_name', 14 | 'temp_table' => 'temp_table', # randomly created is not good for our test 15 | 'path_prefix' => 'tmp/', # randomly created is not good for our test 16 | }) 17 | end 18 | 19 | def schema 20 | Schema.new([ 21 | Column.new({index: 0, name: 'boolean', type: :boolean}), 22 | Column.new({index: 1, name: 'long', type: :long}), 23 | Column.new({index: 2, name: 'double', type: :double}), 24 | Column.new({index: 3, name: 'string', type: :string}), 25 | Column.new({index: 4, name: 'timestamp', type: :timestamp}), 26 | Column.new({index: 5, name: 'json', type: :json}), 27 | ]) 28 | end 29 | 30 | def processor_count 31 | 1 32 | end 33 | 34 | def control 35 | Proc.new {|task| task_reports = [] } 36 | end 37 | 38 | def setup 39 | stub(Bigquery).transaction_report { {'num_input_rows' => 1, 'num_output_rows' => 1, 'num_rejected_rows' => 0} } 40 | end 41 | 42 | sub_test_case "append_direct" do 43 | def test_append_direc_without_auto_create 44 | config = least_config.merge('mode' => 'append_direct', 'auto_create_dataset' => false, 'auto_create_table' => false) 45 | any_instance_of(BigqueryClient) do |obj| 46 | mock(obj).get_dataset(config['dataset']) 47 | mock(obj).get_table(config['table']) 48 | end 49 | Bigquery.transaction(config, schema, processor_count, &control) 50 | end 51 | 52 | def test_append_direct_with_auto_create 53 | config = least_config.merge('mode' => 'append_direct', 'auto_create_dataset' => true, 'auto_create_table' => true) 54 | task = Bigquery.configure(config, schema, processor_count) 55 | any_instance_of(BigqueryClient) do |obj| 56 | mock(obj).create_dataset(config['dataset']) 57 | mock(obj).create_table_if_not_exists(config['table']) 58 | end 59 | Bigquery.transaction(config, schema, processor_count, &control) 60 | end 61 | 62 | def test_append_direct_with_partition_without_auto_create 63 | config = least_config.merge('mode' => 'append_direct', 'table' => 'table$20160929', 'auto_create_dataset' => false, 'auto_create_table' => false) 64 | any_instance_of(BigqueryClient) do |obj| 65 | mock(obj).get_dataset(config['dataset']) 66 | mock(obj).get_table(config['table']) 67 | end 68 | Bigquery.transaction(config, schema, processor_count, &control) 69 | end 70 | 71 | def test_append_direct_with_partition_with_auto_create 72 | config = least_config.merge('mode' => 'append_direct', 'table' => 'table$20160929', 'auto_create_dataset' => true, 'auto_create_table' => true) 73 | task = 
Bigquery.configure(config, schema, processor_count) 74 | any_instance_of(BigqueryClient) do |obj| 75 | mock(obj).create_dataset(config['dataset']) 76 | mock(obj).create_table_if_not_exists(config['table']) 77 | end 78 | Bigquery.transaction(config, schema, processor_count, &control) 79 | end 80 | end 81 | 82 | sub_test_case "delete_in_advance" do 83 | def test_delete_in_advance 84 | config = least_config.merge('mode' => 'delete_in_advance') 85 | task = Bigquery.configure(config, schema, processor_count) 86 | any_instance_of(BigqueryClient) do |obj| 87 | mock(obj).get_dataset(config['dataset']) 88 | mock(obj).delete_table_or_partition(config['table']) 89 | mock(obj).create_table_if_not_exists(config['table']) 90 | end 91 | Bigquery.transaction(config, schema, processor_count, &control) 92 | end 93 | 94 | def test_delete_in_advance_with_partitioning 95 | config = least_config.merge('mode' => 'delete_in_advance', 'table' => 'table$20160929', 'auto_create_table' => true) 96 | task = Bigquery.configure(config, schema, processor_count) 97 | any_instance_of(BigqueryClient) do |obj| 98 | mock(obj).get_dataset(config['dataset']) 99 | mock(obj).delete_table_or_partition(config['table']) 100 | mock(obj).create_table_if_not_exists(config['table']) 101 | end 102 | Bigquery.transaction(config, schema, processor_count, &control) 103 | end 104 | end 105 | 106 | sub_test_case "replace" do 107 | def test_replace 108 | config = least_config.merge('mode' => 'replace') 109 | task = Bigquery.configure(config, schema, processor_count) 110 | any_instance_of(BigqueryClient) do |obj| 111 | mock(obj).get_dataset(config['dataset']) 112 | mock(obj).create_table_if_not_exists(config['temp_table'], options: {"expiration_time"=>nil}) 113 | mock(obj).create_table_if_not_exists(config['table']) 114 | mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_TRUNCATE') 115 | mock(obj).delete_table(config['temp_table']) 116 | end 117 | Bigquery.transaction(config, schema, processor_count, &control) 118 | end 119 | 120 | def test_replace_with_partitioning 121 | config = least_config.merge('mode' => 'replace', 'table' => 'table$20160929') 122 | task = Bigquery.configure(config, schema, processor_count) 123 | any_instance_of(BigqueryClient) do |obj| 124 | mock(obj).get_dataset(config['dataset']) 125 | mock(obj).create_table_if_not_exists(config['temp_table'], options: {"expiration_time"=>nil}) 126 | mock(obj).create_table_if_not_exists(config['table']) 127 | mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_TRUNCATE') 128 | mock(obj).delete_table(config['temp_table']) 129 | end 130 | Bigquery.transaction(config, schema, processor_count, &control) 131 | end 132 | end 133 | 134 | sub_test_case "replace_backup" do 135 | def test_replace_backup 136 | config = least_config.merge('mode' => 'replace_backup', 'dataset_old' => 'dataset_old', 'table_old' => 'table_old', 'temp_table' => 'temp_table') 137 | task = Bigquery.configure(config, schema, processor_count) 138 | any_instance_of(BigqueryClient) do |obj| 139 | mock(obj).get_dataset(config['dataset']) 140 | mock(obj).get_dataset(config['dataset_old']) 141 | mock(obj).create_table_if_not_exists(config['temp_table'], options: {"expiration_time"=>nil}) 142 | mock(obj).create_table_if_not_exists(config['table']) 143 | mock(obj).create_table_if_not_exists(config['table_old'], dataset: config['dataset_old']) 144 | 145 | mock(obj).get_table_or_partition(config['table']) 146 | mock(obj).copy(config['table'], config['table_old'], 
config['dataset_old']) 147 | 148 | mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_TRUNCATE') 149 | mock(obj).delete_table(config['temp_table']) 150 | end 151 | Bigquery.transaction(config, schema, processor_count, &control) 152 | end 153 | 154 | def test_replace_backup_auto_create_dataset 155 | config = least_config.merge('mode' => 'replace_backup', 'dataset_old' => 'dataset_old', 'table_old' => 'table_old', 'temp_table' => 'temp_table', 'auto_create_dataset' => true) 156 | task = Bigquery.configure(config, schema, processor_count) 157 | any_instance_of(BigqueryClient) do |obj| 158 | mock(obj).create_dataset(config['dataset']) 159 | mock(obj).create_dataset(config['dataset_old'], reference: config['dataset']) 160 | mock(obj).create_table_if_not_exists(config['table']) 161 | mock(obj).create_table_if_not_exists(config['temp_table'], options: {"expiration_time"=>nil}) 162 | mock(obj).create_table_if_not_exists(config['table_old'], dataset: config['dataset_old']) 163 | 164 | mock(obj).get_table_or_partition(config['table']) 165 | mock(obj).copy(config['table'], config['table_old'], config['dataset_old']) 166 | 167 | mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_TRUNCATE') 168 | mock(obj).delete_table(config['temp_table']) 169 | end 170 | Bigquery.transaction(config, schema, processor_count, &control) 171 | end 172 | 173 | def test_replace_backup_with_partitioning 174 | config = least_config.merge('mode' => 'replace_backup', 'table' => 'table$20160929', 'dataset_old' => 'dataset_old', 'table_old' => 'table_old$20160929', 'temp_table' => 'temp_table', 'auto_create_table' => true) 175 | task = Bigquery.configure(config, schema, processor_count) 176 | any_instance_of(BigqueryClient) do |obj| 177 | mock(obj).get_dataset(config['dataset']) 178 | mock(obj).get_dataset(config['dataset_old']) 179 | mock(obj).create_table_if_not_exists(config['temp_table'], options: {"expiration_time"=>nil}) 180 | mock(obj).create_table_if_not_exists(config['table']) 181 | mock(obj).create_table_if_not_exists(config['table_old'], dataset: config['dataset_old']) 182 | 183 | mock(obj).get_table_or_partition(config['table']) 184 | mock(obj).copy(config['table'], config['table_old'], config['dataset_old']) 185 | 186 | mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_TRUNCATE') 187 | mock(obj).delete_table(config['temp_table']) 188 | end 189 | Bigquery.transaction(config, schema, processor_count, &control) 190 | end 191 | end 192 | 193 | sub_test_case "append" do 194 | def test_append 195 | config = least_config.merge('mode' => 'append') 196 | task = Bigquery.configure(config, schema, processor_count) 197 | any_instance_of(BigqueryClient) do |obj| 198 | mock(obj).get_dataset(config['dataset']) 199 | mock(obj).create_table_if_not_exists(config['temp_table'], options: {"expiration_time"=>nil}) 200 | mock(obj).create_table_if_not_exists(config['table']) 201 | mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_APPEND') 202 | mock(obj).delete_table(config['temp_table']) 203 | end 204 | Bigquery.transaction(config, schema, processor_count, &control) 205 | end 206 | 207 | def test_append_with_partitioning 208 | config = least_config.merge('mode' => 'append', 'table' => 'table$20160929', 'auto_create_table' => true) 209 | task = Bigquery.configure(config, schema, processor_count) 210 | any_instance_of(BigqueryClient) do |obj| 211 | mock(obj).get_dataset(config['dataset']) 212 | 
mock(obj).create_table_if_not_exists(config['temp_table'], options: {"expiration_time"=>nil}) 213 | mock(obj).create_table_if_not_exists(config['table']) 214 | mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_APPEND') 215 | mock(obj).delete_table(config['temp_table']) 216 | end 217 | Bigquery.transaction(config, schema, processor_count, &control) 218 | end 219 | end 220 | end 221 | end 222 | end 223 | -------------------------------------------------------------------------------- /test/test_value_converter_factory.rb: -------------------------------------------------------------------------------- 1 | require_relative './helper' 2 | require 'embulk/output/bigquery/value_converter_factory' 3 | 4 | module Embulk 5 | class Output::Bigquery 6 | class TestValueConverterFactory < Test::Unit::TestCase 7 | 8 | class TestCreateConverters < Test::Unit::TestCase 9 | def test_create_default_converter 10 | schema = Schema.new([ 11 | Column.new({index: 0, name: 'boolean', type: :boolean}), 12 | Column.new({index: 1, name: 'long', type: :long}), 13 | Column.new({index: 2, name: 'double', type: :double}), 14 | Column.new({index: 3, name: 'string', type: :string}), 15 | Column.new({index: 4, name: 'timestamp', type: :timestamp}), 16 | Column.new({index: 5, name: 'json', type: :json}), 17 | ]) 18 | converters = ValueConverterFactory.create_converters({}, schema) 19 | assert_equal schema.size, converters.size 20 | # Check correct converters are created 21 | # Proc can not have names, so we have to execute to check... 22 | assert_equal true, converters[0].call(true) 23 | assert_equal 1, converters[1].call(1) 24 | assert_equal 1.1, converters[2].call(1.1) 25 | assert_equal 'foo', converters[3].call('foo') 26 | timestamp = Time.parse("2016-02-26 00:00:00.500000 +00:00") 27 | assert_equal "2016-02-26 00:00:00.500000 +00:00", converters[4].call(timestamp) 28 | assert_equal %Q[{"foo":"foo"}], converters[5].call({'foo'=>'foo'}) 29 | end 30 | 31 | def test_create_custom_converter 32 | schema = Schema.new([ 33 | Column.new({index: 0, name: 'boolean', type: :boolean}), 34 | Column.new({index: 1, name: 'long', type: :long}), 35 | Column.new({index: 2, name: 'double', type: :double}), 36 | Column.new({index: 3, name: 'string', type: :string}), 37 | Column.new({index: 4, name: 'timestamp', type: :timestamp}), 38 | Column.new({index: 5, name: 'json', type: :json}), 39 | ]) 40 | task = { 41 | 'column_options' => [ 42 | {'name' => 'boolean', 'type' => 'STRING'}, 43 | {'name' => 'long', 'type' => 'STRING'}, 44 | {'name' => 'double', 'type' => 'STRING'}, 45 | {'name' => 'string', 'type' => 'INTEGER'}, 46 | {'name' => 'timestamp', 'type' => 'INTEGER'}, 47 | {'name' => 'json', 'type' => 'RECORD'}, 48 | ], 49 | } 50 | converters = ValueConverterFactory.create_converters(task, schema) 51 | assert_equal schema.size, converters.size 52 | # Check correct converters are created 53 | # Proc can not have names, so we have to execute to check... 
54 | assert_equal 'true', converters[0].call(true) 55 | assert_equal '1', converters[1].call(1) 56 | assert_equal '1.1', converters[2].call(1.1) 57 | assert_equal 1, converters[3].call('1') 58 | timestamp = Time.parse("2016-02-26 00:00:00.100000 +00:00") 59 | assert_equal 1456444800, converters[4].call(timestamp) 60 | assert_equal({'foo'=>'foo'}, converters[5].call({'foo'=>'foo'})) 61 | end 62 | end 63 | 64 | class TestBooleanConverter < Test::Unit::TestCase 65 | SCHEMA_TYPE = :boolean 66 | 67 | def test_boolean 68 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'BOOLEAN').create_converter 69 | assert_equal nil, converter.call(nil) 70 | assert_equal true, converter.call(true) 71 | assert_equal false, converter.call(false) 72 | end 73 | 74 | def test_integer 75 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'INTEGER').create_converter } 76 | end 77 | 78 | def test_float 79 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'FLOAT').create_converter } 80 | end 81 | 82 | def test_string 83 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'STRING').create_converter 84 | assert_equal nil, converter.call(nil) 85 | assert_equal "true", converter.call(true) 86 | assert_equal "false", converter.call(false) 87 | end 88 | 89 | def test_timestamp 90 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'TIMESTAMP').create_converter } 91 | end 92 | 93 | def test_date 94 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'DATE').create_converter } 95 | end 96 | 97 | def test_datetime 98 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'DATETIME').create_converter } 99 | end 100 | 101 | def test_record 102 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'RECORD').create_converter } 103 | end 104 | end 105 | 106 | class TestLongConverter < Test::Unit::TestCase 107 | SCHEMA_TYPE = :long 108 | 109 | def test_boolean 110 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'BOOLEAN').create_converter 111 | assert_equal nil, converter.call(nil) 112 | assert_equal true, converter.call(1) 113 | assert_equal false, converter.call(0) 114 | assert_raise { converter.call(2) } 115 | end 116 | 117 | def test_integer 118 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'INTEGER').create_converter 119 | assert_equal nil, converter.call(nil) 120 | assert_equal 1, converter.call(1) 121 | end 122 | 123 | def test_float 124 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'FLOAT').create_converter 125 | assert_equal nil, converter.call(nil) 126 | assert_equal 1.0, converter.call(1) 127 | end 128 | 129 | def test_string 130 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'STRING').create_converter 131 | assert_equal nil, converter.call(nil) 132 | assert_equal "1", converter.call(1) 133 | end 134 | 135 | def test_timestamp 136 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'TIMESTAMP').create_converter 137 | assert_equal nil, converter.call(nil) 138 | assert_equal 1408452095, converter.call(1408452095) 139 | end 140 | 141 | def test_date 142 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'DATE').create_converter } 143 | end 144 | 145 | def test_datetime 146 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'DATETIME').create_converter } 147 | end 148 | 149 | def test_record 150 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'RECORD').create_converter } 151 | end 152 | end 153 | 154 | class TestDoubleConverter < Test::Unit::TestCase 155 | SCHEMA_TYPE = :double 156 | 157 | def test_boolean 158 | assert_raise { 
ValueConverterFactory.new(SCHEMA_TYPE, 'BOOLEAN').create_converter } 159 | end 160 | 161 | def test_integer 162 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'INTEGER').create_converter 163 | assert_equal nil, converter.call(nil) 164 | assert_equal 1, converter.call(1.1) 165 | end 166 | 167 | def test_float 168 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'FLOAT').create_converter 169 | assert_equal nil, converter.call(nil) 170 | assert_equal 1.1, converter.call(1.1) 171 | end 172 | 173 | def test_string 174 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'STRING').create_converter 175 | assert_equal nil, converter.call(nil) 176 | assert_equal "1.1", converter.call(1.1) 177 | end 178 | 179 | def test_timestamp 180 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'TIMESTAMP').create_converter 181 | assert_equal nil, converter.call(nil) 182 | assert_equal 1408452095.188766, converter.call(1408452095.188766) 183 | end 184 | 185 | def test_date 186 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'DATE').create_converter } 187 | end 188 | 189 | def test_datetime 190 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'DATETIME').create_converter } 191 | end 192 | 193 | def test_record 194 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'RECORD').create_converter } 195 | end 196 | end 197 | 198 | class TestStringConverter < Test::Unit::TestCase 199 | SCHEMA_TYPE = :string 200 | 201 | def test_boolean 202 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'BOOLEAN').create_converter 203 | assert_equal nil, converter.call(nil) 204 | assert_equal true, converter.call('true') 205 | assert_equal false, converter.call('false') 206 | assert_raise { converter.call('foo') } 207 | end 208 | 209 | def test_integer 210 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'INTEGER').create_converter 211 | assert_equal nil, converter.call(nil) 212 | assert_equal 1, converter.call('1') 213 | assert_raise { converter.call('1.1') } 214 | end 215 | 216 | def test_float 217 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'FLOAT').create_converter 218 | assert_equal nil, converter.call(nil) 219 | assert_equal 1.1, converter.call('1.1') 220 | assert_raise { converter.call('foo') } 221 | end 222 | 223 | def test_string 224 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'STRING').create_converter 225 | assert_equal nil, converter.call(nil) 226 | assert_equal "foo", converter.call("foo") 227 | end 228 | 229 | def test_timestamp 230 | converter = ValueConverterFactory.new( 231 | SCHEMA_TYPE, 'TIMESTAMP', 232 | timestamp_format: '%Y-%m-%d', timezone: 'Asia/Tokyo' 233 | ).create_converter 234 | assert_equal nil, converter.call(nil) 235 | assert_equal "2016-02-26 00:00:00.000000 +09:00", converter.call("2016-02-26") 236 | 237 | # Users must care of BQ timestamp format by themselves with no timestamp_format 238 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'TIMESTAMP').create_converter 239 | assert_equal nil, converter.call(nil) 240 | assert_equal "2016-02-26 00:00:00", converter.call("2016-02-26 00:00:00") 241 | end 242 | 243 | def test_date 244 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'DATE').create_converter 245 | assert_equal nil, converter.call(nil) 246 | assert_equal "2016-02-26", converter.call("2016-02-26") 247 | assert_equal "2016-02-26", converter.call("2016-02-26 00:00:00") 248 | assert_raise { converter.call('foo') } 249 | end 250 | 251 | def test_datetime 252 | converter = ValueConverterFactory.new( 253 | SCHEMA_TYPE, 'DATETIME', 254 | 
timestamp_format: '%Y/%m/%d' 255 | ).create_converter 256 | assert_equal nil, converter.call(nil) 257 | assert_equal "2016-02-26 00:00:00.000000", converter.call("2016/02/26") 258 | 259 | # Users must care of BQ datetime format by themselves with no timestamp_format 260 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'DATETIME').create_converter 261 | assert_equal nil, converter.call(nil) 262 | assert_equal "2016-02-26 00:00:00", converter.call("2016-02-26 00:00:00") 263 | end 264 | 265 | def test_time 266 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'TIME').create_converter 267 | assert_equal nil, converter.call(nil) 268 | assert_equal "00:03:22.000000", converter.call("00:03:22") 269 | assert_equal "15:22:00.000000", converter.call("3:22 PM") 270 | assert_equal "03:22:00.000000", converter.call("3:22 AM") 271 | assert_equal "00:00:00.000000", converter.call("2016-02-26 00:00:00") 272 | 273 | # TimeWithZone doesn't affect any change to the time value 274 | converter = ValueConverterFactory.new( 275 | SCHEMA_TYPE, 'TIME', timezone: 'Asia/Tokyo' 276 | ).create_converter 277 | assert_equal "15:00:01.000000", converter.call("15:00:01") 278 | 279 | assert_raise { converter.call('foo') } 280 | end 281 | 282 | def test_record 283 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'RECORD').create_converter 284 | assert_equal({'foo'=>'foo'}, converter.call(%Q[{"foo":"foo"}])) 285 | assert_raise { converter.call('foo') } 286 | end 287 | end 288 | 289 | class TestTimestampConverter < Test::Unit::TestCase 290 | SCHEMA_TYPE = :timestamp 291 | 292 | def test_boolean 293 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'BOOLEAN').create_converter } 294 | end 295 | 296 | def test_integer 297 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'INTEGER').create_converter 298 | assert_equal nil, converter.call(nil) 299 | expected = 1456444800 300 | assert_equal expected, converter.call(Time.at(expected)) 301 | end 302 | 303 | def test_float 304 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'FLOAT').create_converter 305 | assert_equal nil, converter.call(nil) 306 | expected = 1456444800.500000 307 | assert_equal expected, converter.call(Time.at(expected)) 308 | end 309 | 310 | def test_string 311 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'STRING').create_converter 312 | assert_equal nil, converter.call(nil) 313 | timestamp = Time.parse("2016-02-26 00:00:00.500000 +00:00") 314 | expected = "2016-02-26 00:00:00.500000" 315 | assert_equal expected, converter.call(timestamp) 316 | 317 | converter = ValueConverterFactory.new( 318 | SCHEMA_TYPE, 'STRING', 319 | timestamp_format: '%Y-%m-%d', timezone: 'Asia/Tokyo' 320 | ).create_converter 321 | timestamp = Time.parse("2016-02-25 15:00:00.500000 +00:00") 322 | expected = "2016-02-26" 323 | assert_equal expected, converter.call(timestamp) 324 | end 325 | 326 | def test_timestamp 327 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'TIMESTAMP').create_converter 328 | assert_equal nil, converter.call(nil) 329 | subject = 1456444800.500000 330 | expected = "2016-02-26 00:00:00.500000 +00:00" 331 | assert_equal expected, converter.call(Time.at(subject).utc) 332 | end 333 | 334 | def test_date 335 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'DATE').create_converter 336 | assert_equal nil, converter.call(nil) 337 | timestamp = Time.parse("2016-02-26 00:00:00.500000 +00:00") 338 | expected = "2016-02-26" 339 | assert_equal expected, converter.call(timestamp) 340 | 341 | converter = ValueConverterFactory.new( 342 | 
SCHEMA_TYPE, 'DATE', timezone: 'Asia/Tokyo' 343 | ).create_converter 344 | assert_equal nil, converter.call(nil) 345 | timestamp = Time.parse("2016-02-25 15:00:00.500000 +00:00") 346 | expected = "2016-02-26" 347 | assert_equal expected, converter.call(timestamp) 348 | 349 | assert_raise { converter.call('foo') } 350 | end 351 | 352 | def test_datetime 353 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'DATETIME').create_converter 354 | assert_equal nil, converter.call(nil) 355 | timestamp = Time.parse("2016-02-26 00:00:00.500000 +00:00") 356 | expected = "2016-02-26 00:00:00.500000" 357 | assert_equal expected, converter.call(timestamp) 358 | 359 | converter = ValueConverterFactory.new( 360 | SCHEMA_TYPE, 'DATETIME', timezone: 'Asia/Tokyo' 361 | ).create_converter 362 | assert_equal nil, converter.call(nil) 363 | timestamp = Time.parse("2016-02-25 15:00:00.500000 +00:00") 364 | expected = "2016-02-26 00:00:00.500000" 365 | assert_equal expected, converter.call(timestamp) 366 | 367 | assert_raise { converter.call('foo') } 368 | end 369 | 370 | def test_time 371 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'TIME').create_converter 372 | assert_equal nil, converter.call(nil) 373 | timestamp = Time.parse("2016-02-26 00:00:00.500000 +00:00") 374 | expected = "00:00:00.500000" 375 | assert_equal expected, converter.call(timestamp) 376 | 377 | converter = ValueConverterFactory.new( 378 | SCHEMA_TYPE, 'TIME', timezone: 'Asia/Tokyo' 379 | ).create_converter 380 | assert_equal nil, converter.call(nil) 381 | timestamp = Time.parse("2016-02-25 15:00:00.500000 +00:00") 382 | expected = "00:00:00.500000" 383 | assert_equal expected, converter.call(timestamp) 384 | 385 | assert_raise { converter.call('foo') } 386 | end 387 | 388 | def test_record 389 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'RECORD').create_converter } 390 | end 391 | end 392 | 393 | class TestJsonConverter < Test::Unit::TestCase 394 | SCHEMA_TYPE = :json 395 | 396 | def test_boolean 397 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'BOOLEAN').create_converter } 398 | end 399 | 400 | def test_integer 401 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'INTEGER').create_converter } 402 | end 403 | 404 | def test_float 405 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'FLOAT').create_converter } 406 | end 407 | 408 | def test_string 409 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'STRING').create_converter 410 | assert_equal nil, converter.call(nil) 411 | assert_equal(%Q[{"foo":"foo"}], converter.call({'foo'=>'foo'})) 412 | end 413 | 414 | def test_timestamp 415 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'TIMESTAMP').create_converter } 416 | end 417 | 418 | def test_date 419 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'DATE').create_converter } 420 | end 421 | 422 | def test_record 423 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'RECORD').create_converter 424 | assert_equal nil, converter.call(nil) 425 | assert_equal({'foo'=>'foo'}, converter.call({'foo'=>'foo'})) 426 | end 427 | 428 | def test_json 429 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'JSON').create_converter 430 | assert_equal nil, converter.call(nil) 431 | assert_equal({'foo'=>'foo'}, converter.call({'foo'=>'foo'})) 432 | end 433 | end 434 | 435 | def test_strict_false 436 | converter = ValueConverterFactory.new(:string, 'BOOLEAN', strict: false).create_converter 437 | assert_equal nil, converter.call('foo') 438 | 439 | converter = ValueConverterFactory.new(:string, 
'INTEGER', strict: false).create_converter 440 | assert_equal nil, converter.call('foo') 441 | end 442 | end 443 | end 444 | end 445 | --------------------------------------------------------------------------------
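The tests above exercise the plugin's two pure-Ruby entry points: Embulk::Output::Bigquery::Helper (mapping an Embulk schema to BigQuery field definitions) and Embulk::Output::Bigquery::ValueConverterFactory (building one per-column conversion Proc). The sketch below is illustrative only and is not part of the repository; it simply strings those calls together the same way the tests do. It assumes an Embulk (JRuby) runtime where the Embulk Ruby API (Embulk::Schema, Embulk::Column) is already loaded and this gem is on the load path; the column names and record values are made up, and the expected results in the comments mirror the assertions in the tests above.

# Illustrative sketch (assumption: run inside an Embulk/JRuby environment with
# embulk-output-bigquery on the load path; column names and values are made up).
require 'time'
require 'embulk/output/bigquery/helper'
require 'embulk/output/bigquery/value_converter_factory'

schema = Embulk::Schema.new([
  Embulk::Column.new({index: 0, name: 'id',         type: :long}),
  Embulk::Column.new({index: 1, name: 'name',       type: :string}),
  Embulk::Column.new({index: 2, name: 'created_at', type: :timestamp}),
])

# column_options uses the same shape as in the tests: one entry per column,
# overriding the BigQuery type by column name.
task = {
  'column_options' => [
    {'name' => 'id',         'type' => 'INTEGER'},
    {'name' => 'name',       'type' => 'STRING'},
    {'name' => 'created_at', 'type' => 'DATETIME'},
  ],
}

# BigQuery field definitions derived from the Embulk schema plus column_options,
# e.g. [{name: 'id', type: 'INTEGER'}, {name: 'name', type: 'STRING'},
#       {name: 'created_at', type: 'DATETIME'}]
fields = Embulk::Output::Bigquery::Helper.fields_from_embulk_schema(task, schema)

# One converter Proc per column; call each with the raw Embulk value.
converters = Embulk::Output::Bigquery::ValueConverterFactory.create_converters(task, schema)
record    = [1, 'foo', Time.parse('2016-02-26 00:00:00.500000 +00:00')]
converted = record.each_with_index.map {|value, i| converters[i].call(value) }
# => [1, "foo", "2016-02-26 00:00:00.500000"]  (timestamp rendered as a DATETIME
#    string in the default UTC timezone, matching TestTimestampConverter#test_datetime)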