├── .github └── workflows │ ├── check.yml │ └── publish.yml ├── .gitignore ├── .travis.yml ├── CHANGELOG.md ├── Gemfile ├── LICENSE.txt ├── README.md ├── Rakefile ├── embulk-output-bigquery.gemspec ├── example ├── config_append_direct_schema_update_options.yml ├── config_client_options.yml ├── config_csv.yml ├── config_delete_in_advance.yml ├── config_delete_in_advance_field_partitioned_table.yml ├── config_delete_in_advance_partitioned_table.yml ├── config_destination_project.yml ├── config_expose_errors.yml ├── config_gcs.yml ├── config_guess_from_embulk_schema.yml ├── config_guess_with_column_options.yml ├── config_gzip.yml ├── config_jsonl.yml ├── config_max_threads.yml ├── config_min_ouput_tasks.yml ├── config_mode_append.yml ├── config_mode_append_direct.yml ├── config_nested_record.yml ├── config_payload_column.yml ├── config_payload_column_index.yml ├── config_progress_log_interval.yml ├── config_replace.yml ├── config_replace_backup.yml ├── config_replace_backup_field_partitioned_table.yml ├── config_replace_backup_partitioned_table.yml ├── config_replace_field_partitioned_table.yml ├── config_replace_field_range_partitioned_table.yml ├── config_replace_partitioned_table.yml ├── config_replace_schema_update_options.yml ├── config_skip_file_generation.yml ├── config_table_strftime.yml ├── config_template_table.yml ├── config_uncompressed.yml ├── config_with_rehearsal.yml ├── example.csv ├── example.yml ├── example2_1.csv ├── example2_2.csv ├── example4_1.csv ├── example4_2.csv ├── example4_3.csv ├── example4_4.csv ├── json_key.json ├── nested_example.jsonl ├── schema.json └── schema_expose_errors.json ├── lib └── embulk │ └── output │ ├── bigquery.rb │ └── bigquery │ ├── auth.rb │ ├── bigquery_client.rb │ ├── file_writer.rb │ ├── gcs_client.rb │ ├── google_client.rb │ ├── helper.rb │ └── value_converter_factory.rb └── test ├── helper.rb ├── test_bigquery_client.rb ├── test_configure.rb ├── test_example.rb ├── test_file_writer.rb ├── test_helper.rb ├── test_transaction.rb └── test_value_converter_factory.rb /.github/workflows/check.yml: -------------------------------------------------------------------------------- 1 | name: Check 2 | on: [ pull_request, push ] 3 | jobs: 4 | check: 5 | runs-on: ubuntu-latest 6 | # push: always run. 7 | # pull_request: run only when the PR is submitted from a forked repository, not within this repository. 8 | if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository 9 | strategy: 10 | matrix: 11 | jruby_version: 12 | - 9.3.15.0 13 | - 9.4.8.0 14 | fail-fast: false 15 | steps: 16 | - uses: actions/checkout@v4 17 | - name: Set up OpenJDK 8 18 | uses: actions/setup-java@v4 19 | with: 20 | java-version: 8 21 | distribution: "temurin" 22 | - name: download jruby 23 | run: "curl -L -o jruby.jar https://repo1.maven.org/maven2/org/jruby/jruby-complete/${{ matrix.jruby_version }}/jruby-complete-${{ matrix.jruby_version }}.jar" 24 | # 25 | # For avoiding permission denied. 
install gems into `gems` directory 26 | # 27 | - name: bundle install 28 | run: "env GEM_HOME=gems java -jar jruby.jar -S bundle install" 29 | 30 | - name: install embulk.jar 31 | run: "curl -L -o embulk.jar https://github.com/embulk/embulk/releases/download/v0.11.4/embulk-0.11.4.jar" 32 | - name: rake test 33 | run: 'env GEM_HOME=gems RUBYOPT="-r ./embulk.jar -r rubygems" java -jar jruby.jar -S bundle exec rake test' 34 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | on: 3 | push: 4 | tags: 5 | - "v0.*" 6 | jobs: 7 | publish: 8 | runs-on: ubuntu-latest 9 | environment: maven-central-and-ruby-gems 10 | strategy: 11 | fail-fast: true 12 | steps: 13 | - uses: actions/checkout@v4 14 | - name: Set up Ruby 15 | uses: ruby/setup-ruby@v1 16 | with: 17 | ruby-version: 3.3.0 18 | # get tag variable using {{ github.ref_name }} 19 | # 20 | # References: 21 | # * https://docs.github.com/en/actions/learn-github-actions/contexts#github-context 22 | # * https://docs.github.com/en/actions/learn-github-actions/variables#default-environment-variables 23 | - name: extract gem version from tag 24 | id: vars 25 | run: echo version=${{ github.ref_name }} | sed -e 's/v0/0/' >> $GITHUB_OUTPUT 26 | # 27 | # From gem push documents. 28 | # 29 | # The push command will use ~/.gem/credentials to authenticate to a server, 30 | # but you can use the RubyGems environment variable GEM_HOST_API_KEY 31 | # to set the api key to authenticate. 32 | # 33 | # https://guides.rubygems.org/command-reference/#gem-push 34 | # 35 | - name: Publish 36 | run: | 37 | rake build 38 | gem push pkg/${EMBULK_PLUGIN_NAME}-${{ steps.vars.outputs.version }}.gem 39 | env: 40 | EMBULK_PLUGIN_NAME: embulk-output-bigquery 41 | GEM_HOST_API_KEY: "${{secrets.RUBYGEMS_API_KEY}}" 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | /pkg/ 3 | /tmp/ 4 | /.bundle/ 5 | /Gemfile.lock 6 | vendor/ 7 | .ruby-version 8 | .tags 9 | your-project-000.json 10 | embulk.jar 11 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: ruby 2 | matrix: 3 | include: 4 | - env: EMBULK_VERSION=0.9.15 5 | rvm: jruby-9.1.15.0 # bundled jruby version 6 | jdk: openjdk8 # embulk 0.9.x uses jdk8 7 | - env: EMBULK_VERSION=latest 8 | rvm: jruby-9.1.15.0 # ? 9 | jdk: openjdk8 # ? 
10 | allow_failures: 11 | - env: EMBULK_VERSION=latest 12 | before_install: 13 | - curl -o embulk.jar --create-dirs -L "http://dl.embulk.org/embulk-${EMBULK_VERSION}.jar" 14 | - chmod a+x embulk.jar 15 | - BUNDLER_VERSION=$(echo "require 'bundler'; Bundler::VERSION" | ./embulk.jar irb | tail -n 2 | tr -d '"') 16 | - gem uninstall bundler -x 17 | - gem install bundler -v ${BUNDLER_VERSION} 18 | install: 19 | - ./embulk.jar bundle install --jobs=3 --retry=3 --path vendor/bundle 20 | script: 21 | - bundle exec env RUBYOPT="-r ./embulk.jar -r embulk -r embulk/java/bootstrap" rake test 22 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## 0.7.5 - 2025-05-13 2 | * [enhancement] Add range partitioning support (Thanks to kitagry) #174 3 | 4 | ## 0.7.4 - 2024-12-19 5 | * [maintenance] Primary location unless location is set explicitly (Thanks to joker1007) #172 6 | 7 | ## 0.7.3 - 2024-08-28 8 | * [enhancement] Add TIME type conversion to string converter (Thanks to p-eye) 9 | 10 | ## 0.7.2 - 2024-07-21 11 | * [maintenance] Fix GitHub Actions #166 12 | * [maintenance] Fix gcs_client in order to load data using gcs_bucket parameter (Thanks to kashira202111) #164 13 | * [maintenance] Prevent creating unnecessary tables. (Thanks to kashira202111) #148 14 | 15 | ## 0.7.1 - 2024-03-4 16 | * [enhancement] Support description of columns and tables (Thanks to @kyoshidajp and @fagai ) #142 17 | * [maintenance] Add missing GitHub Actions environment setting. #160 18 | * [maintenance] Replace google-api-client with specific Google APIs (Thanks to @Nozomuts) #161 19 | * [maintenance] Update GitHub Actions use checkout@v4 and setup-java@v4 #162 20 | 21 | ## 0.7.0 - 2024-02-1 22 | * [enhancement] Add support Embulk 0.11.x 23 | 24 | ## 0.6.9 - 2023-03-16 25 | * [enhancement] Add SSLException to retry job (thanks to @mzumi) 26 | 27 | ## 0.6.8 - 2022-10-12 28 | * [enhancement] Support JSON type (thanks to @civitaspo ) 29 | * [maintenance] Add an error message in order to retry (thanks to @mzumi) 30 | 31 | ## 0.6.7 - 2021-09-10 32 | * [enhancement] Add an expiration option of temporary table to clean up (thanks to @TKNGUE) 33 | 34 | ## 0.6.6 - 2021-06-10 35 | 36 | * [maintenance] Fix network retry function (thanks to @case-k-git) 37 | * [enhancement] Allow to specify the billing project and the project to which the data will be loaded separately (thanks to @ck-fm0211) 38 | * [enhancement] Include original error message on json parse error (thanks to @k-yomo) 39 | 40 | ## 0.6.5 - 2021-06-10 41 | * [maintenance] Fix failed tests (thanks to @kyoshidajp) 42 | * [maintenance] Lock representable version for avoiding requiring Ruby 2.4 (thanks to @hiroyuki-sato) 43 | 44 | ## 0.6.4 - 2019-11-06 45 | 46 | * [enhancement] Add DATETIME type conveter (thanks to @kekekenta) 47 | 48 | ## 0.6.3 - 2019-10-28 49 | 50 | * [enhancement] Add DATE type conveter (thanks to @tksfjt1024) 51 | 52 | ## 0.6.2 - 2019-10-16 53 | 54 | * [maintenance] Lock signet and google-api-client version (thanks to @hiroyuki-sato) 55 | 56 | ## 0.6.1 - 2019-08-28 57 | 58 | * [maintenance] Release a new gem not to include symlinks to make it work on Windows. 
59 | 60 | ## 0.6.0 - 2019-08-11 61 | 62 | Cleanup `auth_method`: 63 | 64 | * [enhancement] Support `auth_method: authorized_user` (OAuth) 65 | * [incompatibility change] Rename `auth_method: json_key` to `auth_method: service_account` (`json_key` is kept for backward compatibility) 66 | * [incompatibility change] Remove deprecated `auth_method: private_key` (p12 key) 67 | * [incompatibility change] Change the default `auth_method` to `application_default` from `private_key` because `private_key` was dropped. 68 | 69 | ## 0.5.0 - 2019-08-10 70 | 71 | * [incompatibility change] Drop deprecated `time_partitioning`.`require_partition_filter` 72 | * [incompatibility change] Drop `prevent_duplicate_insert` which has no use-case now 73 | * [incompatibility change] Modes `replace`, `replace_backup`, `append`, and `delete_in_advance` require `auto_create_table: true` now because, previously, these modes had created a target table even with `auto_create_table: false` and made users being confused. Note that `auto_create_table: true` is always required even for a partition (a table name with a partition decorator) which may not require creating a table. This is for simplicity of logics and implementations. 74 | * [incompatibility change] Change default value of `auto_create_table` to `true` because the above 4 modes, that is, except `append_direct` always require `auto_create_table: true` now. 75 | 76 | ## 0.4.14 - 2019-08-10 77 | 78 | * [enhancement] Support field partitioning correctly. 79 | 80 | ## 0.4.13 - 2019-03-20 81 | 82 | * [enhancement] Support clustered table as an experimental feature 83 | 84 | ## 0.4.12 - 2019-03-20 85 | 86 | * [maintenance] Fix `time_partitioning.requirePartitionFilter` was not working. Use `time_partitioning.require_partition_filter` (thanks to @gitetsu) 87 | 88 | ## 0.4.11 - 2019-03-07 89 | 90 | * [maintenance] Fix to use `response.status.error_result` instead of `response.status.errors` to check job failure status (thanks to @nownabe) 91 | 92 | ## 0.4.10 - 2018-11-08 93 | * [enhancement] Support column-based partition (thanks to Chi-Ruei Li) 94 | 95 | ## 0.4.9 - 2018-09-08 96 | * [enhancement] Enable object lifecycle management when creating buckets with `auto_create_gcs_bucket` (thanks to @potato2003) 97 | 98 | ## 0.4.8 - 2017-05-23 99 | * [enhancement] Support location option for `auto_create_gcs_bucket` option (thanks to @potato2003) 100 | 101 | ## 0.4.7 - 2017-05-02 102 | * [enhancement] Support location option to allow to use 'asia-northeast1' region 103 | 104 | ## 0.4.6 - 2017-04-17 105 | * [enhancement] Support auth_method 'application_default' 106 | 107 | ## 0.4.5 - 2017-04-04 108 | 109 | * [maintenance] Fix deprecated warning log condition for `timeout_sec` 110 | 111 | ## 0.4.4 - 2017-04-04 112 | 113 | * [maintenance] Support google-api-ruby-client >= v0.11.0 114 | * [maintenance] Add `send_timeout_sec` and `read_timeout_sec` option for google-api-ruby-client >= v0.11.0 115 | 116 | ## 0.4.3 - 2017-02-11 117 | 118 | * [maintenance] Fix `schma_update_options` was not set with load_from_gcs (thanks to h10a-bf) 119 | 120 | ## 0.4.2 - 2016-10-12 121 | 122 | * [maintenance] Fix `schema_update_options` was not working (nil error) 123 | 124 | ## 0.4.1 - 2016-10-03 125 | 126 | * [enhancement] Support `schema_update_options` option 127 | 128 | ## 0.4.0 - 2016-10-01 129 | 130 | * [enhancement] Support partitioned table 131 | * [maintenance] Add `progress_log_interval` option to control the interval of showing progress log, and now showing progress log is off by default 
132 | 133 | ## 0.3.7 - 2016-08-03 134 | 135 | * [maintenance] Fix Thread.new to use thread local variables to avoid nil idx error (thanks to @shyouhei and @umisora) 136 | 137 | ## 0.3.6 - 2016-06-15 138 | 139 | * [maintenance] if `is_skip_job_result_check` is true, skip output_rows checking (thanks to @joker1007) 140 | 141 | ## 0.3.5 - 2016-06-13 142 | 143 | * [enhancement] retry backendError and internalError in waiting load job 144 | * [enhancement] retry Broken pipe and Connection reset in inserting object to GCS 145 | 146 | ## 0.3.4 - 2016-06-01 147 | 148 | * [new feature] Add `gcs_bucket` option to load multiple files from a GCS bucket with one load job 149 | 150 | ## 0.3.3 - 2016-05-24 151 | 152 | * [maintenance] Fix `private_key` auth is not working 153 | 154 | ## 0.3.2 - 2016-05-03 155 | 156 | * [new feature] Add `abort_on_error` option 157 | * [maintenance] Use uuid instead of current time for temp_table name 158 | 159 | ## 0.3.1 - 2016-04-15 160 | 161 | * [new feature] Add `sdk_log_level` option to show log of google-api-client 162 | * [maintenance] Fix `prevent_duplicate_insert` was not working correctly 163 | * [maintenance] Change to get `num_output_rows` of `transaction_report` from `get_table` API 164 | * [maintenance] Log response.statistics of load jobs 165 | * [maintenance] Always create job_id on client side as [google recommends](https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs) so that duplication not to be occurred 166 | * [maintenance] Fix a possibility which rehearsal would load 0 rows file 167 | 168 | ## 0.3.0 - 2016-04-08 169 | 170 | Big change is introduced. Now, embulk-output-bigquery is written in JRuby. 171 | 172 | * [new feature] Support parallel loads. Fix [#28](https://github.com/embulk/embulk-output-bigquery/issues/28). 173 | * [new feature] Create table first. Fix [#29](https://github.com/embulk/embulk-output-bigquery/issues/29). 174 | * [new feature] Introduce rehearsal mode. Fix [#30](https://github.com/embulk/embulk-output-bigquery/issues/30). 175 | * [new feature] Support `dataset_old` option for `replace_backup`. Fix [#31](https://github.com/embulk/embulk-output-bigquery/issues/31). 176 | * [maintenance] Fix default timestamp format to `%Y-%m-%d %H:%M:%S.%6`. Fix [#32](https://github.com/embulk/embulk-output-bigquery/issues/32). 177 | * [new feature] Support request options such as `timeout_sec`, `open_timeout_sec`, `retries`. Fix [#33](https://github.com/embulk/embulk-output-bigquery/issues/33). 178 | * [new feature] Support continuing from file generation with `skip_file_generation` option. 179 | * [new feature] Guess BigQuery schema from Embulk schema. Fix [#1](https://github.com/embulk/embulk-output-bigquery/issues/1). 180 | * [new feature] Support automatically create dataset. 181 | * [new feature] Support transactional append mode. 182 | * [incompatibility change] Formatter plugin support is dropped. Formatter is done in this plugin for specified `source_format`. 183 | * [incompatibility change] Encoder plugin support is dropped. Encoding is done in this plugin for specified `compression`. 184 | * [incompatibility change] `append` mode now expresses a transactional append, and `append_direct` is one which is not transactional (this was `append` mode before) 185 | 186 | ## 0.2.3 - 2016-02-19 187 | 188 | * [maintenance] Fix detect logic of delete_in_advance mode. [#26](https://github.com/embulk/embulk-output-bigquery/issues/26). @sonots thanks! 
189 | 190 | ## 0.2.2 - 2016-02-15 191 | 192 | * [new feature] Added template_table option. [#25](https://github.com/embulk/embulk-output-bigquery/pull/25). @joker1007 thanks! 193 | 194 | ## 0.2.1 - 2016-01-28 195 | 196 | * [maintenance] Upgraded Embulk version to 0.8.1 [#22](https://github.com/embulk/embulk-output-bigquery/pull/22). @joker1007 thanks! 197 | * [maintenance] Formatted code style by checkstyle [#23](https://github.com/embulk/embulk-output-bigquery/pull/23) 198 | 199 | ## 0.2.0 - 2016-01-26 200 | 201 | * [new feature] Added mode parameters and support 4 modes(append, replace, replace_backup, delete_in_advance). [#20](https://github.com/embulk/embulk-output-bigquery/pull/20) [#21](https://github.com/embulk/embulk-output-bigquery/pull/21) @joker1007 thanks! 202 | 203 | ## 0.1.11 - 2015-11-16 204 | 205 | * [maintenance] Change error result display for easy investigation. [#18](https://github.com/embulk/embulk-output-bigquery/pull/18) 206 | 207 | ## 0.1.10 - 2015-10-06 208 | 209 | * [new feature] Added new auth method - json_keyfile of GCP(Google Cloud Platform)'s service account [#17](https://github.com/embulk/embulk-output-bigquery/pull/17) 210 | 211 | ## 0.1.9 - 2015-08-19 212 | 213 | * [maintenance] Upgraded Embulk version to 0.7.1 214 | 215 | ## 0.1.8 - 2015-08-19 216 | 217 | * [new feature] Supported mapreduce-executor. @frsyuki thanks! [#13](https://github.com/embulk/embulk-output-bigquery/pull/13) 218 | * [maintenance] Fixed job_id generation logic [#15](https://github.com/embulk/embulk-output-bigquery/pull/15) 219 | * [maintenance] Refactored [#11](https://github.com/embulk/embulk-output-bigquery/pull/11) 220 | 221 | ## 0.1.7 - 2015-05-20 222 | 223 | * [new feature] Added allow_quoted_newlines option [#10](https://github.com/embulk/embulk-output-bigquery/pull/10) 224 | * [maintenance] Upgraded embulk version to 0.6.8 225 | 226 | ## 0.1.6 - 2015-04-23 227 | 228 | * [new feature] Added ignore_unknown_values option to job_id generation logic. [#9](https://github.com/embulk/embulk-output-bigquery/pull/9) 229 | 230 | ## 0.1.5 - 2015-04-23 231 | 232 | * [new feature] Added ignore_unknown_values option. [#8](https://github.com/embulk/embulk-output-bigquery/pull/8) @takus thanks! 
233 | 234 | ## 0.1.4 - 2015-04-21 235 | 236 | * [new feature] Added prevent_duplicate_insert option 237 | 238 | ## 0.1.3 - 2015-04-06 239 | 240 | * [new feature] Added new auth method - pre-defined access token of GCE(Google Compute Engine) 241 | * [maintenance] Updated Google provided libraries 242 | * http-client:google-http-client-jackson2 from 1.19.0 to 1.20.0 243 | * apis:google-api-services-bigquery from v2-rev193-1.19.1 to v2-rev205-1.20.0 244 | 245 | ## 0.1.2 - 2015-04-01 246 | 247 | * [new feature] Changed bulk-load method from "via GCS" to direct-insert 248 | * [new feature] added dynamic table creationg option 249 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org/' 2 | 3 | gemspec 4 | gem 'embulk', '= 0.11.4' 5 | gem 'embulk-parser-none' 6 | gem 'embulk-parser-jsonl' 7 | gem 'pry-nav' 8 | gem 'test-unit' 9 | gem 'test-unit-rr' 10 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # embulk-output-bigquery 2 | 3 | [Embulk](https://github.com/embulk/embulk/) output plugin to load/insert data into [Google BigQuery](https://cloud.google.com/bigquery/) using [direct insert](https://cloud.google.com/bigquery/loading-data-into-bigquery#loaddatapostrequest) 4 | 5 | ## Overview 6 | 7 | load data into Google BigQuery as batch jobs for big amount of data 8 | https://developers.google.com/bigquery/loading-data-into-bigquery 9 | 10 | * **Plugin type**: output 11 | * **Resume supported**: no 12 | * **Cleanup supported**: no 13 | * **Dynamic table creating**: yes 14 | 15 | ### Supported Embulk 16 | 17 | | gem version | Embulk version | 18 | |------------------|--------------------| 19 | | 0.7.0 and higher | v0.11.0 and higher | 20 | | 0.6.9 and lower | v0.9.X and lower | 21 | 22 | ### NOT IMPLEMENTED 23 | * insert data over streaming inserts 24 | * for continuous real-time insertions 25 | * Please use other product, like [fluent-plugin-bigquery](https://github.com/kaizenplatform/fluent-plugin-bigquery) 26 | * https://developers.google.com/bigquery/streaming-data-into-bigquery#usecases 27 | 28 | Current version of this plugin supports Google API with Service Account Authentication, but does not support 29 | OAuth flow for installed applications. 30 | 31 | ## Configuration 32 | 33 | #### Original options 34 | 35 | | name | type | required? | default | description | 36 | |:-------------------------------------|:------------|:-----------|:-------------------------|:-----------------------| 37 | | mode | string | optional | "append" | See [Mode](#mode) | 38 | | auth_method | string | optional | "application\_default" | See [Authentication](#authentication) | 39 | | json_keyfile | string | optional | | keyfile path or `content` | 40 | | project | string | required unless service\_account's `json_keyfile` is given. | | project\_id | 41 | | destination_project | string | optional | `project` value | A destination project to which the data will be loaded. Use this if you want to separate a billing project (the `project` value) and a destination project (the `destination_project` value). | 42 | | dataset | string | required | | dataset | 43 | | location | string | optional | nil | geographic location of dataset. See [Location](#location) | 44 | | table | string | required | | table name, or table name with a partition decorator such as `table_name$20160929`| 45 | | auto_create_dataset | boolean | optional | false | automatically create dataset | 46 | | auto_create_table | boolean | optional | true | `false` is available only for `append_direct` mode. Other modes require `true`. See [Dynamic Table Creating](#dynamic-table-creating) and [Time Partitioning](#time-partitioning) | 47 | | schema_file | string | optional | | /path/to/schema.json | 48 | | template_table | string | optional | | template table name. See [Dynamic Table Creating](#dynamic-table-creating) | 49 | | job_status_max_polling_time | int | optional | 3600 sec | Max job status polling time | 50 | | job_status_polling_interval | int | optional | 10 sec | Job status polling interval | 51 | | is_skip_job_result_check | boolean | optional | false | Skip waiting Load job finishes. Available for append, or delete_in_advance mode | 52 | | with_rehearsal | boolean | optional | false | Load `rehearsal_counts` records as a rehearsal. 
Rehearsal loads into REHEARSAL temporary table, and delete finally. You may use this option to investigate data errors as early stage as possible | 53 | | rehearsal_counts | integer | optional | 1000 | Specify number of records to load in a rehearsal | 54 | | abort_on_error | boolean | optional | true if max_bad_records is 0, otherwise false | Raise an error if number of input rows and number of output rows does not match | 55 | | column_options | hash | optional | | See [Column Options](#column-options) | 56 | | default_timezone | string | optional | UTC | | 57 | | default_timestamp_format | string | optional | %Y-%m-%d %H:%M:%S.%6N | | 58 | | payload_column | string | optional | nil | See [Formatter Performance Issue](#formatter-performance-issue) | 59 | | payload_column_index | integer | optional | nil | See [Formatter Performance Issue](#formatter-performance-issue) | 60 | | gcs_bucket | string | optional | nil | See [GCS Bucket](#gcs-bucket) | 61 | | auto_create_gcs_bucket | boolean | optional | false | See [GCS Bucket](#gcs-bucket) | 62 | | progress_log_interval | float | optional | nil (Disabled) | Progress log interval. The progress log is disabled by nil (default). NOTE: This option may be removed in a future because a filter plugin can achieve the same goal | 63 | | description | string | optional | nil | description of table | 64 | 65 | Client or request options 66 | 67 | | name | type | required? | default | description | 68 | |:-------------------------------------|:------------|:-----------|:-------------------------|:-----------------------| 69 | | open_timeout_sec | integer | optional | 300 | Seconds to wait for the connection to open | 70 | | timeout_sec | integer | optional | 300 | Seconds to wait for one block to be read (google-api-ruby-client < v0.11.0) | 71 | | send_timeout_sec | integer | optional | 300 | Seconds to wait to send a request (google-api-ruby-client >= v0.11.0) | 72 | | read_timeout_sec | integer | optional | 300 | Seconds to wait to read a response (google-api-ruby-client >= v0.11.0) | 73 | | retries | integer | optional | 5 | Number of retries | 74 | | application_name | string | optional | "Embulk BigQuery plugin" | User-Agent | 75 | | sdk_log_level | string | optional | nil (WARN) | Log level of google api client library | 76 | 77 | Options for intermediate local files 78 | 79 | | name | type | required? | default | description | 80 | |:-------------------------------------|:------------|:-----------|:-------------------------|:-----------------------| 81 | | path_prefix | string | optional | | Path prefix of local files such as "/tmp/prefix_". Default randomly generates with [tempfile](http://ruby-doc.org/stdlib-2.2.3/libdoc/tempfile/rdoc/Tempfile.html) | 82 | | sequence_format | string | optional | .%d.%d | Sequence format for pid, thread id | 83 | | file_ext | string | optional | | The file extension of local files such as ".csv.gz" ".json.gz". Default automatically generates from `source_format` and `compression`| 84 | | skip_file_generation | boolean | optional | | Load already generated local files into BigQuery if available. Specify correct path_prefix and file_ext. | 85 | | delete_from_local_when_job_end | boolean | optional | true | If set to true, delete generate local files when job is end | 86 | | compression | string | optional | "NONE" | Compression of local files (`GZIP` or `NONE`) | 87 | 88 | 89 | Options for intermediate tables on BigQuery 90 | 91 | | name | type | required? 
| default | description | 92 | |:-------------------------------------|:------------|:-----------|:-------------------------|:-----------------------| 93 | | temporary_table_expiration | integer | optional | | Temporary table's expiration time in seconds | 94 | 95 | `source_format` is also used to determine formatter (csv or jsonl). 96 | 97 | #### Same options as bq command-line tools or BigQuery job's property 98 | 99 | The following options are the same as [bq command-line tools](https://cloud.google.com/bigquery/bq-command-line-tool#creatingtablefromfile) or BigQuery [job's property](https://cloud.google.com/bigquery/docs/reference/v2/jobs#resource). 100 | 101 | | name | type | required? | default | description | 102 | |:----------------------------------|:---------|:----------|:--------|:-----------------------| 103 | | source_format | string | required | "CSV" | File type (`NEWLINE_DELIMITED_JSON` or `CSV`) | 104 | | max_bad_records | int | optional | 0 | | 105 | | field_delimiter | char | optional | "," | | 106 | | encoding | string | optional | "UTF-8" | `UTF-8` or `ISO-8859-1` | 107 | | ignore_unknown_values | boolean | optional | false | | 108 | | allow_quoted_newlines | boolean | optional | false | Set to true if data contains newline characters. It may cause slow processing | 109 | | time_partitioning | hash | optional | `{"type":"DAY"}` if `table` parameter has a partition decorator, otherwise nil | See [Time Partitioning](#time-partitioning) | 110 | | time_partitioning.type | string | required | nil | The only type supported is DAY, which will generate one partition per day based on data loading time. | 111 | | time_partitioning.expiration_ms | int | optional | nil | Number of milliseconds for which to keep the storage for a partition. | 112 | | time_partitioning.field | string | optional | nil | `DATE` or `TIMESTAMP` column used for partitioning | 113 | | range_partitioning | hash | optional | nil | See [Range Partitioning](#range-partitioning) | 114 | | range_partitioning.field | string | required | nil | `INT64` column used for partitioning | 115 | | range_partitioning.range | hash | required | nil | Defines the ranges for range partitioning | 116 | | range_partitioning.range.start | int | required | nil | The start of range partitioning, inclusive. | 117 | | range_partitioning.range.end | int | required | nil | The end of range partitioning, exclusive. | 118 | | range_partitioning.range.interval | int | required | nil | The width of each interval. | 119 | | clustering | hash | optional | nil | Currently, clustering is supported for partitioned tables, so it must be used with the `time_partitioning` option. See [clustered tables](https://cloud.google.com/bigquery/docs/clustered-tables) | 120 | | clustering.fields | array | required | nil | One or more fields on which data should be clustered. The order of the specified columns determines the sort order of the data. | 121 | | schema_update_options | array | optional | nil | (Experimental) List of `ALLOW_FIELD_ADDITION` or `ALLOW_FIELD_RELAXATION` or both. See [jobs#configuration.load.schemaUpdateOptions](https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.load.schemaUpdateOptions). NOTE for the current status: `schema_update_options` does not work for `copy` jobs, that is, it is not effective for most modes such as `append`, `replace` and `replace_backup`. `delete_in_advance` deletes the origin table, so it does not need to update the schema. Only `append_direct` can utilize schema update. 
| 122 | 123 | ### Example 124 | 125 | ```yaml 126 | out: 127 | type: bigquery 128 | mode: append 129 | auth_method: service_account 130 | json_keyfile: /path/to/json_keyfile.json 131 | project: your-project-000 132 | dataset: your_dataset_name 133 | table: your_table_name 134 | compression: GZIP 135 | source_format: NEWLINE_DELIMITED_JSON 136 | ``` 137 | 138 | ### Location 139 | 140 | The geographic location of the dataset. Required except for US and EU. 141 | 142 | GCS bucket should be in same region when you use `gcs_bucket`. 143 | 144 | See also [Dataset Locations | BigQuery | Google Cloud](https://cloud.google.com/bigquery/docs/dataset-locations) 145 | 146 | ### Mode 147 | 148 | 5 modes are provided. 149 | 150 | ##### append 151 | 152 | 1. Load to temporary table (Create and WRITE_APPEND in parallel) 153 | 2. Copy temporary table to destination table (or partition). (WRITE_APPEND) 154 | 155 | ##### append_direct 156 | 157 | 1. Insert data into existing table (or partition) directly. (WRITE_APPEND in parallel) 158 | 159 | This is not transactional, i.e., if fails, the target table could have some rows inserted. 160 | 161 | ##### replace 162 | 163 | 1. Load to temporary table (Create and WRITE_APPEND in parallel) 164 | 2. Copy temporary table to destination table (or partition). (WRITE_TRUNCATE) 165 | 166 | ```is_skip_job_result_check``` must be false when replace mode 167 | 168 | NOTE: BigQuery does not support replacing (actually, copying into) a non-partitioned table with a paritioned table atomically. You must once delete the non-partitioned table, otherwise, you get `Incompatible table partitioning specification when copying to the column partitioned table` error. 169 | 170 | ##### replace_backup 171 | 172 | 1. Load to temporary table (Create and WRITE_APPEND in parallel) 173 | 2. Copy destination table (or partition) to backup table (or partition). (dataset_old, table_old) 174 | 3. Copy temporary table to destination table (or partition). (WRITE_TRUNCATE) 175 | 176 | ```is_skip_job_result_check``` must be false when replace_backup mode. 177 | 178 | ##### delete_in_advance 179 | 180 | 1. Delete destination table (or partition), if it exists. 181 | 2. Load to destination table (or partition). 182 | 183 | ### Authentication 184 | 185 | There are four authentication methods 186 | 187 | 1. `service_account` (or `json_key` for backward compatibility) 188 | 1. `authorized_user` 189 | 1. `compute_engine` 190 | 1. `application_default` 191 | 192 | #### service\_account (or json\_key) 193 | 194 | Use GCP service account credentials. 195 | You first need to create a service account, download its json key and deploy the key with embulk. 196 | 197 | ```yaml 198 | out: 199 | type: bigquery 200 | auth_method: service_account 201 | json_keyfile: /path/to/json_keyfile.json 202 | ``` 203 | 204 | You can also embed contents of `json_keyfile` at config.yml. 205 | 206 | ```yaml 207 | out: 208 | type: bigquery 209 | auth_method: service_account 210 | json_keyfile: 211 | content: | 212 | { 213 | "private_key_id": "123456789", 214 | "private_key": "-----BEGIN PRIVATE KEY-----\nABCDEF", 215 | "client_email": "..." 216 | } 217 | ``` 218 | 219 | #### authorized\_user 220 | 221 | Use Google user credentials. 222 | You can get your credentials at `~/.config/gcloud/application_default_credentials.json` by running `gcloud auth login`. 
223 | 224 | ```yaml 225 | out: 226 | type: bigquery 227 | auth_method: authorized_user 228 | json_keyfile: /path/to/credentials.json 229 | ``` 230 | 231 | You can also embed contents of `json_keyfile` at config.yml. 232 | 233 | ```yaml 234 | out: 235 | type: bigquery 236 | auth_method: authorized_user 237 | json_keyfile: 238 | content: | 239 | { 240 | "client_id":"xxxxxxxxxxx.apps.googleusercontent.com", 241 | "client_secret":"xxxxxxxxxxx", 242 | "refresh_token":"xxxxxxxxxxx", 243 | "type":"authorized_user" 244 | } 245 | ``` 246 | 247 | #### compute\_engine 248 | 249 | On the other hand, you don't need to explicitly create a service account for embulk when you 250 | run embulk in Google Compute Engine. In this third authentication method, you need to 251 | add the API scope "https://www.googleapis.com/auth/bigquery" to the scope list of your 252 | Compute Engine VM instance, then you can configure embulk like this. 253 | 254 | ```yaml 255 | out: 256 | type: bigquery 257 | auth_method: compute_engine 258 | ``` 259 | 260 | #### application\_default 261 | 262 | Use Application Default Credentials (ADC). ADC is a strategy to locate Google Cloud Service Account credentials. 263 | 264 | 1. ADC checks to see if the environment variable `GOOGLE_APPLICATION_CREDENTIALS` is set. If the variable is set, ADC uses the service account file that the variable points to. 265 | 2. ADC checks to see if `~/.config/gcloud/application_default_credentials.json` is located. This file is created by running `gcloud auth application-default login`. 266 | 3. Use the default service account for credentials if the application running on Compute Engine, App Engine, Kubernetes Engine, Cloud Functions or Cloud Run. 267 | 268 | See https://cloud.google.com/docs/authentication/production for details. 269 | 270 | ```yaml 271 | out: 272 | type: bigquery 273 | auth_method: application_default 274 | ``` 275 | 276 | ### Table id formatting 277 | 278 | `table` and option accept [Time#strftime](http://ruby-doc.org/core-1.9.3/Time.html#method-i-strftime) 279 | format to construct table ids. 280 | Table ids are formatted at runtime 281 | using the local time of the embulk server. 282 | 283 | For example, with the configuration below, 284 | data is inserted into tables `table_20150503`, `table_20150504` and so on. 285 | 286 | ```yaml 287 | out: 288 | type: bigquery 289 | table: table_%Y%m%d 290 | ``` 291 | 292 | ### Dynamic table creating 293 | 294 | There are 3 ways to set schema. 295 | 296 | #### Set schema.json 297 | 298 | Please set file path of schema.json. 299 | 300 | ```yaml 301 | out: 302 | type: bigquery 303 | auto_create_table: true 304 | table: table_%Y%m%d 305 | schema_file: /path/to/schema.json 306 | ``` 307 | 308 | #### Set template_table in dataset 309 | 310 | Plugin will try to read schema from existing table and use it as schema template. 311 | 312 | ```yaml 313 | out: 314 | type: bigquery 315 | auto_create_table: true 316 | table: table_%Y%m%d 317 | template_table: existing_table_name 318 | ``` 319 | 320 | #### Guess from Embulk Schema 321 | 322 | Plugin will try to guess BigQuery schema from Embulk schema. It is also configurable with `column_options`. See [Column Options](#column-options). 
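For reference, the `schema_file` used in the first approach above is a plain BigQuery schema definition, i.e. a JSON array of field objects. A minimal, hypothetical schema file might look like the following (the field names are illustrative only, not the contents of the shipped `example/schema.json`):

```json
[
  {"name": "id",         "type": "INTEGER",   "mode": "NULLABLE"},
  {"name": "name",       "type": "STRING",    "mode": "NULLABLE"},
  {"name": "created_at", "type": "TIMESTAMP", "mode": "NULLABLE"},
  {"name": "attributes", "type": "RECORD",    "mode": "NULLABLE", "fields": [
    {"name": "key1", "type": "STRING", "mode": "NULLABLE"}
  ]}
]
```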
323 | 324 | ### Column Options 325 | 326 | Column options are used to aid guessing the BigQuery schema, or to define conversion of values: 327 | 328 | - **column_options**: advanced: an array of options for columns 329 | - **name**: column name 330 | - **type**: BigQuery type such as `BOOLEAN`, `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP`, `DATETIME`, `DATE`, and `RECORD`. See below for supported conversion types. 331 | - boolean: `BOOLEAN`, `STRING` (default: `BOOLEAN`) 332 | - long: `BOOLEAN`, `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP` (default: `INTEGER`) 333 | - double: `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP` (default: `FLOAT`) 334 | - string: `BOOLEAN`, `INTEGER`, `FLOAT`, `STRING`, `TIME`, `TIMESTAMP`, `DATETIME`, `DATE`, `RECORD` (default: `STRING`) 335 | - timestamp: `INTEGER`, `FLOAT`, `STRING`, `TIME`, `TIMESTAMP`, `DATETIME`, `DATE` (default: `TIMESTAMP`) 336 | - json: `STRING`, `RECORD` (default: `STRING`) 337 | - **mode**: BigQuery mode such as `NULLABLE`, `REQUIRED`, and `REPEATED` (string, default: `NULLABLE`) 338 | - **fields**: Describes the nested schema fields if the type property is set to RECORD. Please note that this is **required** for a `RECORD` column. 339 | - **description**: description (string, default is `None`). 340 | - **timestamp_format**: timestamp format to convert into/from `timestamp` (string, default is `default_timestamp_format`) 341 | - **timezone**: timezone to convert into/from `timestamp`, `date` (string, default is `default_timezone`). 342 | - **default_timestamp_format**: default timestamp format for column_options (string, default is "%Y-%m-%d %H:%M:%S.%6N") 343 | - **default_timezone**: default timezone for column_options (string, default is "UTC") 344 | 345 | Example) 346 | 347 | ```yaml 348 | out: 349 | type: bigquery 350 | auto_create_table: true 351 | column_options: 352 | - {name: date, type: STRING, timestamp_format: "%Y-%m-%d", timezone: "Asia/Tokyo"} 353 | - name: json_column 354 | type: RECORD 355 | fields: 356 | - {name: key1, type: STRING} 357 | - {name: key2, type: STRING} 358 | ``` 359 | 360 | NOTE: Type conversion is done in this jruby plugin, and can be slow. See [Formatter Performance Issue](#formatter-performance-issue) to improve the performance. 361 | 362 | ### Formatter Performance Issue 363 | 364 | embulk-output-bigquery supports formatting records into CSV or JSON (and also formatting timestamp columns). 365 | However, this plugin is written in jruby, and jruby plugins are generally slower than java plugins. 
366 | 367 | Therefore, it is recommended to format records with filter plugins written in Java such as [embulk-filter-to_json](https://github.com/civitaspo/embulk-filter-to_json) as: 368 | 369 | ```yaml 370 | filters: 371 | - type: to_json 372 | column: {name: payload, type: string} 373 | default_format: "%Y-%m-%d %H:%M:%S.%6N" 374 | out: 375 | type: bigquery 376 | payload_column_index: 0 # or, payload_column: payload 377 | ``` 378 | 379 | Furtheremore, if your files are originally jsonl or csv files, you can even skip a parser with [embulk-parser-none](https://github.com/sonots/embulk-parser-none) as: 380 | 381 | ```yaml 382 | in: 383 | type: file 384 | path_prefix: example/example.jsonl 385 | parser: 386 | type: none 387 | column_name: payload 388 | out: 389 | type: bigquery 390 | payload_column_index: 0 # or, payload_column: payload 391 | ``` 392 | 393 | ### GCS Bucket 394 | 395 | This is useful to reduce number of consumed jobs, which is limited by [100,000 jobs per project per day](https://cloud.google.com/bigquery/quotas#load_jobs). 396 | 397 | This plugin originally loads local files into BigQuery in parallel, that is, consumes a number of jobs, say 24 jobs on 24 CPU core machine for example (this depends on embulk parameters such as `min_output_tasks` and `max_threads`). 398 | 399 | BigQuery supports loading multiple files from GCS with one job, therefore, uploading local files to GCS in parallel and then loading from GCS into BigQuery reduces number of consumed jobs to 1. 400 | 401 | Using `gcs_bucket` option, such strategy is enabled. You may also use `auto_create_gcs_bucket` to create the specified GCS bucket automatically. 402 | 403 | ```yaml 404 | out: 405 | type: bigquery 406 | gcs_bucket: bucket_name 407 | auto_create_gcs_bucket: true 408 | ``` 409 | 410 | ToDo: Use https://cloud.google.com/storage/docs/streaming if google-api-ruby-client supports streaming transfers into GCS. 411 | 412 | ### Time Partitioning 413 | 414 | From 0.4.0, embulk-output-bigquery supports to load into partitioned table. 415 | See also [Creating and Updating Date-Partitioned Tables](https://cloud.google.com/bigquery/docs/creating-partitioned-tables). 416 | 417 | To load into a partition, specify `table` parameter with a partition decorator as: 418 | 419 | ```yaml 420 | out: 421 | type: bigquery 422 | table: table_name$20160929 423 | ``` 424 | 425 | You may configure `time_partitioning` parameter together as: 426 | 427 | ```yaml 428 | out: 429 | type: bigquery 430 | table: table_name$20160929 431 | time_partitioning: 432 | type: DAY 433 | expiration_ms: 259200000 434 | ``` 435 | 436 | You can also create column-based partitioning table as: 437 | 438 | ```yaml 439 | out: 440 | type: bigquery 441 | mode: replace 442 | table: table_name 443 | time_partitioning: 444 | type: DAY 445 | field: timestamp 446 | ``` 447 | 448 | Note the `time_partitioning.field` should be top-level `DATE` or `TIMESTAMP`. 449 | 450 | Use [Tables: patch](https://cloud.google.com/bigquery/docs/reference/v2/tables/patch) API to update the schema of the partitioned table, embulk-output-bigquery itself does not support it, though. 451 | Note that only adding a new column, and relaxing non-necessary columns to be `NULLABLE` are supported now. Deleting columns, and renaming columns are not supported. 
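For example, one way to add a column to the partitioned table outside of this plugin is the `bq` command-line tool; a sketch with placeholder names (the schema file must contain the full, updated field list):

```
$ bq update your_dataset_name.your_partitioned_table_name /path/to/updated_schema.json
```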
452 | 453 | MEMO: [jobs#configuration.load.schemaUpdateOptions](https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.load.schemaUpdateOptions) is available 454 | to update the schema of the desitination table as a side effect of the load job, but it is not available for copy job. 455 | Thus, it was not suitable for embulk-output-bigquery idempotence modes, `append`, `replace`, and `replace_backup`, sigh. 456 | 457 | ### Range Partitioning 458 | 459 | See also [Creating and Updating Range-Partitioned Tables](https://cloud.google.com/bigquery/docs/creating-partitioned-tables). 460 | 461 | To load into a partition, specify `range_partitioning` and `table` parameter with a partition decorator as: 462 | 463 | ```yaml 464 | out: 465 | type: bigquery 466 | table: table_name$1 467 | range_partitioning: 468 | field: customer_id 469 | range: 470 | start: 1 471 | end: 99999 472 | interval: 1 473 | ``` 474 | 475 | ## Development 476 | 477 | ### Run example: 478 | 479 | Prepare a json\_keyfile at example/your-project-000.json, then 480 | 481 | ``` 482 | $ embulk bundle install --path vendor/bundle 483 | $ embulk run -X page_size=1 -b . -l trace example/example.yml 484 | ``` 485 | 486 | ### Run test: 487 | 488 | Place your embulk with `.jar` extension: 489 | 490 | 491 | ``` 492 | $ curl -o embulk.jar --create-dirs -L "http://dl.embulk.org/embulk-latest.jar" 493 | $ chmod a+x embulk.jar 494 | ``` 495 | 496 | Investigate JRUBY\_VERSION and Bundler::VERSION included in the embulk.jar: 497 | 498 | ``` 499 | $ echo JRUBY_VERSION | ./embulk.jar irb 500 | 2019-08-10 00:59:11.866 +0900: Embulk v0.9.17 501 | Switch to inspect mode. 502 | JRUBY_VERSION 503 | "X.X.X.X" 504 | 505 | $ echo "require 'bundler'; Bundler::VERSION" | ./embulk.jar irb 506 | 2019-08-10 01:59:10.460 +0900: Embulk v0.9.17 507 | Switch to inspect mode. 508 | require 'bundler'; Bundler::VERSION 509 | "Y.Y.Y" 510 | ``` 511 | 512 | Install the same version of jruby (change X.X.X.X to the version shown above) and bundler: 513 | 514 | ``` 515 | $ rbenv install jruby-X.X.X.X 516 | $ rbenv local jruby-X.X.X.X 517 | $ gem install bundler -v Y.Y.Y 518 | ``` 519 | 520 | Install dependencies (NOTE: Use bundler included in the embulk.jar, otherwise, `gem 'embulk'` is not found): 521 | 522 | ``` 523 | $ ./embulk.jar bundle install --path vendor/bundle 524 | ``` 525 | 526 | Run tests with `env RUBYOPT="-r ./embulk.jar`: 527 | 528 | ``` 529 | $ bundle exec env RUBYOPT="-r ./embulk.jar" rake test 530 | ``` 531 | 532 | To run tests which actually connects to BigQuery such as test/test\_bigquery\_client.rb, 533 | prepare a json\_keyfile at example/your-project-000.json, then 534 | 535 | ``` 536 | $ bundle exec env RUBYOPT="-r ./embulk.jar" ruby test/test_bigquery_client.rb 537 | $ bundle exec env RUBYOPT="-r ./embulk.jar" ruby test/test_example.rb 538 | ``` 539 | 540 | ### Release gem: 541 | 542 | Change the version of gemspec, and write CHANGELOG.md. 
Then, 543 | 544 | ``` 545 | $ bundle exec rake release 546 | ``` 547 | 548 | ## ChangeLog 549 | 550 | [CHANGELOG.md](CHANGELOG.md) 551 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | require 'rake/testtask' 3 | 4 | desc 'Run test_unit based test' 5 | Rake::TestTask.new(:test) do |t| 6 | t.libs << "test" 7 | t.test_files = Dir["test/**/test_*.rb"].sort 8 | t.verbose = true 9 | t.warning = false 10 | end 11 | task :default => :test 12 | -------------------------------------------------------------------------------- /embulk-output-bigquery.gemspec: -------------------------------------------------------------------------------- 1 | Gem::Specification.new do |spec| 2 | spec.name = "embulk-output-bigquery" 3 | spec.version = "0.7.5" 4 | spec.authors = ["Satoshi Akama", "Naotoshi Seo"] 5 | spec.summary = "Google BigQuery output plugin for Embulk" 6 | spec.description = "Embulk plugin that insert records to Google BigQuery." 7 | spec.email = ["satoshiakama@gmail.com", "sonots@gmail.com"] 8 | spec.licenses = ["MIT"] 9 | spec.homepage = "https://github.com/embulk/embulk-output-bigquery" 10 | 11 | # Exclude example directory which uses symlinks from generating gem. 12 | # Symlinks do not work properly on the Windows platform without administrator privilege. 13 | spec.files = `git ls-files`.split("\n") + Dir["classpath/*.jar"] - Dir["example/*" ] 14 | spec.test_files = spec.files.grep(%r{^(test|spec)/}) 15 | spec.require_paths = ["lib"] 16 | 17 | # the latest version 18 | spec.add_dependency 'google-apis-storage_v1' 19 | spec.add_dependency 'google-apis-bigquery_v2' 20 | spec.add_dependency 'time_with_zone' 21 | spec.add_dependency 'thwait' 22 | # activesupport require Ruby >= 2.7.0 23 | # jruby-9.3.0.0 is MRI 2.6 compatible 24 | spec.add_dependency 'activesupport', "< 7.0" 25 | 26 | spec.add_development_dependency 'bundler', ['>= 1.10.6'] 27 | spec.add_development_dependency 'rake', ['>= 10.0'] 28 | end 29 | -------------------------------------------------------------------------------- /example/config_append_direct_schema_update_options.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: append_direct 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_table_name 26 | source_format: NEWLINE_DELIMITED_JSON 27 | compression: NONE 28 | auto_create_dataset: true 29 | auto_create_table: true 30 | schema_file: example/schema.json 31 | schema_update_options: [ALLOW_FIELD_ADDITION, ALLOW_FIELD_RELAXATION] 32 | -------------------------------------------------------------------------------- /example/config_client_options.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | 
type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_table_name 26 | source_format: NEWLINE_DELIMITED_JSON 27 | auto_create_dataset: true 28 | auto_create_table: true 29 | schema_file: example/schema.json 30 | timeout_sec: 400 31 | open_timeout_sec: 400 32 | retries: 2 33 | application_name: "Embulk BigQuery plugin test" 34 | -------------------------------------------------------------------------------- /example/config_csv.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_table_name 26 | source_format: CSV 27 | compression: GZIP 28 | auto_create_dataset: true 29 | auto_create_table: true 30 | schema_file: example/schema.json 31 | -------------------------------------------------------------------------------- /example/config_delete_in_advance.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: delete_in_advance 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_table_name 26 | source_format: NEWLINE_DELIMITED_JSON 27 | auto_create_dataset: true 28 | auto_create_table: true 29 | schema_file: example/schema.json 30 | -------------------------------------------------------------------------------- /example/config_delete_in_advance_field_partitioned_table.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: 
"%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: delete_in_advance 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_field_partitioned_table_name 26 | source_format: NEWLINE_DELIMITED_JSON 27 | compression: NONE 28 | auto_create_dataset: true 29 | auto_create_table: true 30 | schema_file: example/schema.json 31 | time_partitioning: 32 | type: 'DAY' 33 | field: timestamp 34 | -------------------------------------------------------------------------------- /example/config_delete_in_advance_partitioned_table.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: delete_in_advance 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_partitioned_table_name$20160929 26 | source_format: NEWLINE_DELIMITED_JSON 27 | compression: NONE 28 | auto_create_dataset: true 29 | auto_create_table: true 30 | schema_file: example/schema.json 31 | time_partitioning: 32 | type: 'DAY' 33 | expiration_ms: 100 34 | -------------------------------------------------------------------------------- /example/config_destination_project.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | project: your_project_name 25 | destination_project: your_destination_project_name 26 | dataset: your_dataset_name 27 | table: your_table_name 28 | source_format: NEWLINE_DELIMITED_JSON 29 | compression: NONE 30 | auto_create_dataset: true 31 | auto_create_table: true 32 | schema_file: example/schema.json 33 | -------------------------------------------------------------------------------- /example/config_expose_errors.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d 
%H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_table_name 26 | source_format: NEWLINE_DELIMITED_JSON 27 | compression: NONE 28 | auto_create_dataset: true 29 | auto_create_table: true 30 | schema_file: example/schema_expose_errors.json 31 | -------------------------------------------------------------------------------- /example/config_gcs.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_table_name 26 | source_format: NEWLINE_DELIMITED_JSON 27 | compression: GZIP 28 | auto_create_dataset: true 29 | auto_create_table: true 30 | schema_file: example/schema.json 31 | gcs_bucket: your_bucket_name 32 | auto_create_gcs_bucket: true 33 | -------------------------------------------------------------------------------- /example/config_guess_from_embulk_schema.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_table_name 26 | compression: GZIP 27 | source_format: NEWLINE_DELIMITED_JSON 28 | auto_create_dataset: true 29 | auto_create_table: true 30 | -------------------------------------------------------------------------------- /example/config_guess_with_column_options.yml: -------------------------------------------------------------------------------- 1 | # embulk gem install embulk-parser-jsonl 2 | in: 3 | type: file 4 | path_prefix: example/nested_example.jsonl 5 | parser: 6 | type: jsonl 7 | columns: 8 | - {name: date, type: string} 9 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 10 | - {name: "null", type: string} 11 | - {name: long, type: long} 12 | - {name: string, type: string} 13 | - {name: double, type: double} 14 | - {name: json, type: json} 15 | - {name: boolean, type: boolean} 16 | out: 17 | type: bigquery 18 | mode: replace 19 | auth_method: 
service_account 20 | json_keyfile: example/your-project-000.json 21 | dataset: your_dataset_name 22 | table: your_table_name 23 | compression: GZIP 24 | source_format: NEWLINE_DELIMITED_JSON 25 | auto_create_dataset: true 26 | auto_create_table: true 27 | column_options: 28 | - {name: date, type: TIMESTAMP, timestamp_format: "%Y-%m-%d", timezone: "+09:00"} 29 | - {name: timestamp, type: STRING, timestamp_format: "%Y-%m-%d", timezone: "+09:00"} 30 | - {name: long, type: STRING} 31 | - {name: string, type: STRING} 32 | - {name: double, type: STRING} 33 | - {name: boolean, type: STRING} 34 | - name: json 35 | type: RECORD 36 | fields: 37 | - {name: k1, type: STRING} 38 | - {name: k2, type: STRING} 39 | # 2015-07-13 40 | # 2015-07-12 15:00:00 41 | -------------------------------------------------------------------------------- /example/config_gzip.yml: -------------------------------------------------------------------------------- 1 | config_csv.yml -------------------------------------------------------------------------------- /example/config_jsonl.yml: -------------------------------------------------------------------------------- 1 | config_replace.yml -------------------------------------------------------------------------------- /example/config_max_threads.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example4_ 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_table_name 26 | source_format: NEWLINE_DELIMITED_JSON 27 | compression: NONE 28 | auto_create_dataset: true 29 | auto_create_table: true 30 | schema_file: example/schema.json 31 | exec: 32 | type: local 33 | min_output_tasks: 2 34 | max_threads: 2 35 | -------------------------------------------------------------------------------- /example/config_min_ouput_tasks.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example2_ 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_table_name 26 | source_format: NEWLINE_DELIMITED_JSON 27 | compression: GZIP 28 | auto_create_dataset: true 29 | auto_create_table: true 30 | schema_file: example/schema.json 31 | exec: 32 | type: local 33 | min_output_tasks: 8 34 | max_threads: 4 35 | 
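config_max_threads.yml and config_min_ouput_tasks.yml above exercise Embulk's local executor settings rather than plugin options: min_output_tasks sets a lower bound on the number of output tasks Embulk creates (and therefore on the number of intermediate files this plugin writes and later loads), while max_threads caps how many of those tasks run concurrently. A minimal sketch of such an exec section, with purely illustrative values (not recommended settings):

exec:
  type: local
  min_output_tasks: 4   # at least 4 output tasks -> roughly 4 intermediate files / load jobs
  max_threads: 4        # up to 4 tasks running at once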
-------------------------------------------------------------------------------- /example/config_mode_append.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | null_string: 'NULL' 8 | skip_header_lines: 1 9 | comment_line_marker: '#' 10 | columns: 11 | - {name: date, type: string} 12 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 13 | - {name: "null", type: string} 14 | - {name: long, type: long} 15 | - {name: string, type: string} 16 | - {name: double, type: double} 17 | - {name: boolean, type: boolean} 18 | out: 19 | type: bigquery 20 | mode: append 21 | auth_method: service_account 22 | json_keyfile: example/your-project-000.json 23 | dataset: your_dataset_name 24 | table: your_table_name 25 | compression: GZIP 26 | source_format: CSV 27 | auto_create_dataset: true 28 | auto_create_table: true 29 | schema_file: example/schema.json 30 | delete_from_local_when_job_end: false 31 | -------------------------------------------------------------------------------- /example/config_mode_append_direct.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | null_string: 'NULL' 8 | skip_header_lines: 1 9 | comment_line_marker: '#' 10 | columns: 11 | - {name: date, type: string} 12 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 13 | - {name: "null", type: string} 14 | - {name: long, type: long} 15 | - {name: string, type: string} 16 | - {name: double, type: double} 17 | - {name: boolean, type: boolean} 18 | out: 19 | type: bigquery 20 | mode: append_direct 21 | auth_method: service_account 22 | json_keyfile: example/your-project-000.json 23 | dataset: your_dataset_name 24 | table: your_table_name 25 | compression: GZIP 26 | source_format: CSV 27 | auto_create_dataset: true 28 | auto_create_table: true 29 | schema_file: example/schema.json 30 | delete_from_local_when_job_end: false 31 | -------------------------------------------------------------------------------- /example/config_nested_record.yml: -------------------------------------------------------------------------------- 1 | config_guess_with_column_options.yml -------------------------------------------------------------------------------- /example/config_payload_column.yml: -------------------------------------------------------------------------------- 1 | # embulk gem install embulk-parser-none 2 | in: 3 | type: file 4 | path_prefix: example/example.jsonl 5 | parser: 6 | type: none 7 | column_name: payload 8 | out: 9 | type: bigquery 10 | mode: replace 11 | auth_method: service_account 12 | json_keyfile: example/your-project-000.json 13 | dataset: your_dataset_name 14 | table: your_table_name 15 | compression: GZIP 16 | source_format: NEWLINE_DELIMITED_JSON 17 | auto_create_dataset: true 18 | auto_create_table: true 19 | schema_file: example/schema.json 20 | payload_column: payload 21 | -------------------------------------------------------------------------------- /example/config_payload_column_index.yml: -------------------------------------------------------------------------------- 1 | # embulk gem install embulk-parser-none 2 | in: 3 | type: file 4 | path_prefix: example/example.jsonl 5 | parser: 6 | type: none 7 | column_name: payload 8 | out: 9 | type: bigquery 10 | 
mode: replace 11 | auth_method: service_account 12 | json_keyfile: example/your-project-000.json 13 | dataset: your_dataset_name 14 | table: your_table_name 15 | compression: GZIP 16 | source_format: NEWLINE_DELIMITED_JSON 17 | auto_create_dataset: true 18 | auto_create_table: true 19 | schema_file: example/schema.json 20 | payload_column_index: 0 21 | -------------------------------------------------------------------------------- /example/config_progress_log_interval.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_table_name 26 | source_format: NEWLINE_DELIMITED_JSON 27 | compression: NONE 28 | auto_create_dataset: true 29 | auto_create_table: true 30 | schema_file: example/schema.json 31 | progress_log_interval: 0.1 32 | -------------------------------------------------------------------------------- /example/config_replace.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_table_name 26 | source_format: NEWLINE_DELIMITED_JSON 27 | compression: NONE 28 | auto_create_dataset: true 29 | auto_create_table: true 30 | schema_file: example/schema.json 31 | -------------------------------------------------------------------------------- /example/config_replace_backup.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace_backup 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_table_name 26 | dataset_old: your_dataset_name_old 27 | table_old: your_table_name_old 28 | 
source_format: NEWLINE_DELIMITED_JSON 29 | auto_create_dataset: true 30 | auto_create_table: true 31 | schema_file: example/schema.json 32 | skip_load: true # for debug 33 | -------------------------------------------------------------------------------- /example/config_replace_backup_field_partitioned_table.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace_backup 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_field_partitioned_table_name 26 | table_old: your_field_partitioned_table_name_old 27 | source_format: NEWLINE_DELIMITED_JSON 28 | compression: NONE 29 | auto_create_dataset: true 30 | auto_create_table: true 31 | schema_file: example/schema.json 32 | time_partitioning: 33 | type: 'DAY' 34 | field: 'timestamp' 35 | -------------------------------------------------------------------------------- /example/config_replace_backup_partitioned_table.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace_backup 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_partitioned_table_name$20160929 26 | table_old: your_partitioned_table_name_old$20160929 27 | source_format: NEWLINE_DELIMITED_JSON 28 | compression: NONE 29 | auto_create_dataset: true 30 | auto_create_table: true 31 | schema_file: example/schema.json 32 | time_partitioning: 33 | type: 'DAY' 34 | expiration_ms: 100 35 | -------------------------------------------------------------------------------- /example/config_replace_field_partitioned_table.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: 
your_dataset_name 25 | table: your_field_partitioned_table_name 26 | source_format: NEWLINE_DELIMITED_JSON 27 | compression: NONE 28 | auto_create_dataset: true 29 | auto_create_table: true 30 | schema_file: example/schema.json 31 | time_partitioning: 32 | type: 'DAY' 33 | field: 'timestamp' 34 | -------------------------------------------------------------------------------- /example/config_replace_field_range_partitioned_table.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_field_partitioned_table_name 26 | source_format: NEWLINE_DELIMITED_JSON 27 | compression: NONE 28 | auto_create_dataset: true 29 | auto_create_table: true 30 | schema_file: example/schema.json 31 | range_partitioning: 32 | field: 'long' 33 | range: 34 | start: 90 35 | end: 100 36 | interval: 1 37 | -------------------------------------------------------------------------------- /example/config_replace_partitioned_table.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_partitioned_table_name$20160929 26 | source_format: NEWLINE_DELIMITED_JSON 27 | compression: NONE 28 | auto_create_dataset: true 29 | auto_create_table: true 30 | schema_file: example/schema.json 31 | time_partitioning: 32 | type: 'DAY' 33 | expiration_ms: 100 34 | -------------------------------------------------------------------------------- /example/config_replace_schema_update_options.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: 
example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_partitioned_table_name$20160929 26 | source_format: NEWLINE_DELIMITED_JSON 27 | compression: NONE 28 | auto_create_dataset: true 29 | auto_create_table: true 30 | schema_file: example/schema.json 31 | time_partitioning: 32 | type: 'DAY' 33 | expiration_ms: 100 34 | -------------------------------------------------------------------------------- /example/config_skip_file_generation.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_table_name 26 | source_format: NEWLINE_DELIMITED_JSON 27 | auto_create_dataset: true 28 | auto_create_table: true 29 | schema_file: example/schema.json 30 | path_prefix: example/example 31 | file_ext: .jsonl 32 | skip_file_generation: true 33 | -------------------------------------------------------------------------------- /example/config_table_strftime.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_table_name_%Y%m%d 26 | source_format: NEWLINE_DELIMITED_JSON 27 | auto_create_dataset: true 28 | auto_create_table: true 29 | schema_file: example/schema.json 30 | skip_load: true # for debug 31 | -------------------------------------------------------------------------------- /example/config_template_table.yml: -------------------------------------------------------------------------------- 1 | # embulk gem install embulk-parser-none 2 | in: 3 | type: file 4 | path_prefix: example/example.jsonl 5 | parser: 6 | type: none 7 | column_name: payload 8 | out: 9 | type: bigquery 10 | mode: replace 11 | auth_method: service_account 12 | json_keyfile: example/your-project-000.json 13 | dataset: your_dataset_name 14 | table: your_table_name_%Y%m%d 15 | compression: GZIP 16 | source_format: NEWLINE_DELIMITED_JSON 17 | auto_create_dataset: true 18 | auto_create_table: true 19 | template_table: your_table_name 20 | payload_column: payload 21 | skip_load: true # for debug 22 | -------------------------------------------------------------------------------- /example/config_uncompressed.yml: 
-------------------------------------------------------------------------------- 1 | config_replace.yml -------------------------------------------------------------------------------- /example/config_with_rehearsal.yml: -------------------------------------------------------------------------------- 1 | in: 2 | type: file 3 | path_prefix: example/example.csv 4 | parser: 5 | type: csv 6 | charset: UTF-8 7 | newline: CRLF 8 | null_string: 'NULL' 9 | skip_header_lines: 1 10 | comment_line_marker: '#' 11 | columns: 12 | - {name: date, type: string} 13 | - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"} 14 | - {name: "null", type: string} 15 | - {name: long, type: long} 16 | - {name: string, type: string} 17 | - {name: double, type: double} 18 | - {name: boolean, type: boolean} 19 | out: 20 | type: bigquery 21 | mode: replace 22 | auth_method: service_account 23 | json_keyfile: example/your-project-000.json 24 | dataset: your_dataset_name 25 | table: your_table_name 26 | source_format: NEWLINE_DELIMITED_JSON 27 | auto_create_dataset: true 28 | auto_create_table: true 29 | schema_file: example/schema.json 30 | with_rehearsal: true 31 | rehearsal_counts: 1 32 | skip_load: true # for debug 33 | compression: GZIP 34 | -------------------------------------------------------------------------------- /example/example.csv: -------------------------------------------------------------------------------- 1 | date,timestamp,null,long,string,double,boolean 2 | 2015-07-13,2015-07-13 00:00:00.100000,,90,l6lTsvxd,903.4,true 3 | 2015-07-13,2015-07-13 00:00:00.100000,,91,XoALSEQg,394.5,true 4 | 2015-07-13,2015-07-13 00:00:00.100000,,92,0hgDRI_m,810.9,true 5 | 2015-07-13,2015-07-13 00:00:00.100000,,93,KjCRAc-A,477.4,true 6 | 2015-07-13,2015-07-13 00:00:00.100000,,94,fyQVGlT8,725.3,true 7 | 2015-07-13,2015-07-13 00:00:00.100000,,95,FpBYRPWK,316.6,true 8 | 2015-07-13,2015-07-13 00:00:00.100000,,96,9ikvnUqp,369.5,true 9 | 2015-07-13,2015-07-13 00:00:00.100000,,97,RRNYDAzK,506.5,true 10 | 2015-07-13,2015-07-13 00:00:00.100000,,90,l6lTsvxd,903.4,false 11 | 2015-07-13,2015-07-13 00:00:00.100000,,91,XoALSEQg,394.5,false 12 | 2015-07-13,2015-07-13 00:00:00.100000,,92,0hgDRI_m,810.9,false 13 | 2015-07-13,2015-07-13 00:00:00.100000,,93,KjCRAc-A,477.4,false 14 | 2015-07-13,2015-07-13 00:00:00.100000,,94,fyQVGlT8,725.3,false 15 | 2015-07-13,2015-07-13 00:00:00.100000,,95,FpBYRPWK,316.6,false 16 | 2015-07-13,2015-07-13 00:00:00.100000,,96,9ikvnUqp,369.5,false 17 | 2015-07-13,2015-07-13 00:00:00.100000,,97,RRNYDAzK,506.5,false 18 | -------------------------------------------------------------------------------- /example/example.yml: -------------------------------------------------------------------------------- 1 | config_replace.yml -------------------------------------------------------------------------------- /example/example2_1.csv: -------------------------------------------------------------------------------- 1 | example.csv -------------------------------------------------------------------------------- /example/example2_2.csv: -------------------------------------------------------------------------------- 1 | example.csv -------------------------------------------------------------------------------- /example/example4_1.csv: -------------------------------------------------------------------------------- 1 | example.csv -------------------------------------------------------------------------------- /example/example4_2.csv: 
-------------------------------------------------------------------------------- 1 | example.csv -------------------------------------------------------------------------------- /example/example4_3.csv: -------------------------------------------------------------------------------- 1 | example.csv -------------------------------------------------------------------------------- /example/example4_4.csv: -------------------------------------------------------------------------------- 1 | example.csv -------------------------------------------------------------------------------- /example/json_key.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "service_account", 3 | "project_id": "your_project_name", 4 | "private_key_id": "your_private_key_id", 5 | "private_key": "-----BEGIN PRIVATE KEY-----\n\n-----END PRIVATE KEY-----\n", 6 | "client_email": "your_service_account_email", 7 | "client_id": "your_client_id", 8 | "auth_uri": "https://accounts.google.com/o/oauth2/auth", 9 | "token_uri": "https://accounts.google.com/o/oauth2/token", 10 | "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", 11 | "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/account-3%40your_project_name.iam.gserviceaccount.com" 12 | } 13 | -------------------------------------------------------------------------------- /example/nested_example.jsonl: -------------------------------------------------------------------------------- 1 | {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":90,"string":"l6lTsvxd","double":903.4,"json":{"k1":"v1","k2":"v2"},"boolean":true} 2 | {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":91,"string":"XoALSEQg","double":394.5,"json":{"k1":"v1","k2":"v2"},"boolean":true} 3 | {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":92,"string":"0hgDRI_m","double":810.9,"json":{"k1":"v1","k2":"v2"},"boolean":true} 4 | {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":93,"string":"KjCRAc-A","double":477.4,"json":{"k1":"v1","k2":"v2"},"boolean":true} 5 | {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":94,"string":"fyQVGlT8","double":725.3,"json":{"k1":"v1","k2":"v2"},"boolean":true} 6 | {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":95,"string":"FpBYRPWK","double":316.6,"json":{"k1":"v1","k2":"v2"},"boolean":true} 7 | {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":96,"string":"9ikvnUqp","double":369.5,"json":{"k1":"v1","k2":"v2"},"boolean":true} 8 | {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":97,"string":"RRNYDAzK","double":506.5,"json":{"k1":"v1","k2":"v2"},"boolean":true} 9 | {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":90,"string":"l6lTsvxd","double":903.4,"json":{"k1":"v1","k2":"v2"},"boolean":false} 10 | {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":91,"string":"XoALSEQg","double":394.5,"json":{"k1":"v1","k2":"v2"},"boolean":false} 11 | {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":92,"string":"0hgDRI_m","double":810.9,"json":{"k1":"v1","k2":"v2"},"boolean":false} 12 | {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":93,"string":"KjCRAc-A","double":477.4,"json":{"k1":"v1","k2":"v2"},"boolean":false} 13 | 
{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":94,"string":"fyQVGlT8","double":725.3,"json":{"k1":"v1","k2":"v2"},"boolean":false} 14 | {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":95,"string":"FpBYRPWK","double":316.6,"json":{"k1":"v1","k2":"v2"},"boolean":false} 15 | {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":96,"string":"9ikvnUqp","double":369.5,"json":{"k1":"v1","k2":"v2"},"boolean":false} 16 | {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":97,"string":"RRNYDAzK","double":506.5,"json":{"k1":"v1","k2":"v2"},"boolean":false} 17 | -------------------------------------------------------------------------------- /example/schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name":"date", 4 | "type":"STRING" 5 | }, 6 | { 7 | "name":"timestamp", 8 | "type":"TIMESTAMP" 9 | }, 10 | { 11 | "name":"null", 12 | "type":"STRING" 13 | }, 14 | { 15 | "name":"long", 16 | "type":"INTEGER" 17 | }, 18 | { 19 | "name":"string", 20 | "type":"STRING" 21 | }, 22 | { 23 | "name":"double", 24 | "type":"FLOAT" 25 | }, 26 | { 27 | "name":"boolean", 28 | "type":"BOOLEAN" 29 | } 30 | ] 31 | -------------------------------------------------------------------------------- /example/schema_expose_errors.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name":"dat", 4 | "type":"STRING" 5 | }, 6 | { 7 | "name":"timestamp", 8 | "type":"TIMESTAMP" 9 | }, 10 | { 11 | "name":"null", 12 | "type":"STRING" 13 | }, 14 | { 15 | "name":"long", 16 | "type":"INTEGER" 17 | }, 18 | { 19 | "name":"string", 20 | "type":"STRING" 21 | }, 22 | { 23 | "name":"double", 24 | "type":"FLOAT" 25 | }, 26 | { 27 | "name":"boolean", 28 | "type":"BOOLEAN" 29 | } 30 | ] 31 | -------------------------------------------------------------------------------- /lib/embulk/output/bigquery/auth.rb: -------------------------------------------------------------------------------- 1 | require 'googleauth' 2 | 3 | module Embulk 4 | module Output 5 | class Bigquery < OutputPlugin 6 | class Auth 7 | 8 | attr_reader :auth_method, :json_key, :scope 9 | 10 | def initialize(task, scope) 11 | @auth_method = task['auth_method'] 12 | @json_key = task['json_keyfile'] 13 | @scope = scope 14 | end 15 | 16 | def authenticate 17 | case auth_method 18 | when 'authorized_user' 19 | key = StringIO.new(json_key) 20 | return Google::Auth::UserRefreshCredentials.make_creds(json_key_io: key, scope: scope) 21 | when 'compute_engine' 22 | return Google::Auth::GCECredentials.new 23 | when 'service_account', 'json_key' # json_key is for backward compatibility 24 | key = StringIO.new(json_key) 25 | return Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: key, scope: scope) 26 | when 'application_default' 27 | return Google::Auth.get_application_default([scope]) 28 | else 29 | raise ConfigError.new("Unknown auth method: #{auth_method}") 30 | end 31 | end 32 | end 33 | end 34 | end 35 | end 36 | -------------------------------------------------------------------------------- /lib/embulk/output/bigquery/bigquery_client.rb: -------------------------------------------------------------------------------- 1 | require 'google/apis/bigquery_v2' 2 | require 'json' 3 | require 'thwait' 4 | require_relative 'google_client' 5 | require_relative 'helper' 6 | 7 | module Embulk 8 | module Output 9 | class Bigquery < OutputPlugin 10 | class 
BigqueryClient < GoogleClient 11 | BIGQUERY_TABLE_OPERATION_INTERVAL = 2 # https://cloud.google.com/bigquery/quotas 12 | 13 | def initialize(task, schema, fields = nil) 14 | scope = "https://www.googleapis.com/auth/bigquery" 15 | client_class = Google::Apis::BigqueryV2::BigqueryService 16 | super(task, scope, client_class) 17 | 18 | @schema = schema 19 | reset_fields(fields) if fields 20 | @project = @task['project'] 21 | @destination_project = @task['destination_project'] 22 | @dataset = @task['dataset'] 23 | @location = @task['location'] 24 | @location_for_log = @location.nil? ? 'Primary location' : @location 25 | 26 | @task['source_format'] ||= 'CSV' 27 | @task['max_bad_records'] ||= 0 28 | @task['field_delimiter'] ||= ',' 29 | @task['field_delimiter'] = @task['source_format'] == 'CSV' ? @task['field_delimiter'] : nil 30 | @task['encoding'] ||= 'UTF-8' 31 | @task['ignore_unknown_values'] = false if @task['ignore_unknown_values'].nil? 32 | @task['allow_quoted_newlines'] = false if @task['allow_quoted_newlines'].nil? 33 | end 34 | 35 | def fields 36 | return @fields if @fields 37 | if @task['schema_file'] 38 | @fields = Helper.deep_symbolize_keys(JSON.parse(File.read(@task['schema_file']))) 39 | elsif @task['template_table'] 40 | @fields = fields_from_table(@task['template_table']) 41 | else 42 | @fields = Helper.fields_from_embulk_schema(@task, @schema) 43 | end 44 | end 45 | 46 | def fields_from_table(table) 47 | response = get_table(table) 48 | response.schema.fields.map {|field| field.to_h } 49 | end 50 | 51 | def reset_fields(fields = nil) 52 | @fields = fields 53 | self.fields 54 | end 55 | 56 | def with_job_retry(&block) 57 | retries = 0 58 | begin 59 | yield 60 | rescue BackendError, InternalError, RateLimitExceeded => e 61 | if e.is_a?(RateLimitExceeded) 62 | sleep(BIGQUERY_TABLE_OPERATION_INTERVAL) 63 | end 64 | 65 | if retries < @task['retries'] 66 | retries += 1 67 | Embulk.logger.warn { "embulk-output-bigquery: retry \##{retries}, #{e.message}" } 68 | retry 69 | else 70 | Embulk.logger.error { "embulk-output-bigquery: retry exhausted \##{retries}, #{e.message}" } 71 | raise e 72 | end 73 | end 74 | end 75 | 76 | # @param object_uris [Array] array of GCS URIs such as gs://bucket/path 77 | # @return [Array] responses 78 | def load_from_gcs(object_uris, table) 79 | with_job_retry do 80 | begin 81 | # As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says, 82 | # we should generate job_id in client code, otherwise retrying would cause duplication 83 | job_id = "embulk_load_job_#{SecureRandom.uuid}" 84 | Embulk.logger.info { "embulk-output-bigquery: Load job starting... job_id:[#{job_id}] #{object_uris} => #{@destination_project}:#{@dataset}.#{table} in #{@location_for_log}" } 85 | 86 | body = { 87 | job_reference: { 88 | project_id: @project, 89 | job_id: job_id, 90 | }, 91 | configuration: { 92 | load: { 93 | destination_table: { 94 | project_id: @destination_project, 95 | dataset_id: @dataset, 96 | table_id: table, 97 | }, 98 | schema: { 99 | fields: fields, 100 | }, 101 | write_disposition: 'WRITE_APPEND', 102 | source_format: @task['source_format'], 103 | max_bad_records: @task['max_bad_records'], 104 | field_delimiter: @task['source_format'] == 'CSV' ? 
@task['field_delimiter'] : nil, 105 | encoding: @task['encoding'], 106 | ignore_unknown_values: @task['ignore_unknown_values'], 107 | allow_quoted_newlines: @task['allow_quoted_newlines'], 108 | source_uris: object_uris, 109 | } 110 | } 111 | } 112 | 113 | if @location 114 | body[:job_reference][:location] = @location 115 | end 116 | 117 | if @task['schema_update_options'] 118 | body[:configuration][:load][:schema_update_options] = @task['schema_update_options'] 119 | end 120 | 121 | opts = {} 122 | 123 | Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" } 124 | response = with_network_retry { client.insert_job(@project, body, **opts) } 125 | unless @task['is_skip_job_result_check'] 126 | response = wait_load('Load', response) 127 | end 128 | [response] 129 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 130 | response = {status_code: e.status_code, message: e.message, error_class: e.class} 131 | Embulk.logger.error { 132 | "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}" 133 | } 134 | raise Error, "failed to load #{object_uris} to #{@destination_project}:#{@dataset}.#{table} in #{@location_for_log}, response:#{response}" 135 | end 136 | end 137 | end 138 | 139 | def load_in_parallel(paths, table) 140 | return [] if paths.empty? 141 | # You may think that, because a load job is a background job, sending requests in parallel 142 | # would not improve performance. In actual experiments, however, parallel 143 | # loading drastically shortened the waiting time; a single jobs.insert looks to take about 50 sec. 144 | # NOTICE: uploading files in parallel consumes network traffic. With 24 concurrent uploads 145 | # of 100MB files, it consumed about 500Mbps at the peak in the environment we experimented in. 146 | # 147 | # We previously had a `max_load_parallels` option, but it was not extensible to the map-reduce executor, 148 | # so we dropped it. See https://github.com/embulk/embulk-output-bigquery/pull/35 149 | responses = [] 150 | threads = [] 151 | Embulk.logger.debug { "embulk-output-bigquery: LOAD IN PARALLEL #{paths}" } 152 | paths.each_with_index do |path, idx| 153 | threads << Thread.new(path, idx) do |path, idx| 154 | # I am not sure whether google-api-ruby-client is thread-safe, 155 | # so create a new instance for each thread to be safe 156 | bigquery = self.class.new(@task, @schema, fields) 157 | response = bigquery.load(path, table) 158 | [idx, response] 159 | end 160 | end 161 | ThreadsWait.all_waits(*threads) do |th| 162 | idx, response = th.value # raises errors that occurred in threads 163 | responses[idx] = response 164 | end 165 | responses 166 | end 167 | 168 | def load(path, table, write_disposition: 'WRITE_APPEND') 169 | with_job_retry do 170 | begin 171 | if File.exist?(path) 172 | # As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says, 173 | # we should generate job_id in client code, otherwise retrying would cause duplication 174 | job_id = "embulk_load_job_#{SecureRandom.uuid}" 175 | Embulk.logger.info { "embulk-output-bigquery: Load job starting... job_id:[#{job_id}] #{path} => #{@destination_project}:#{@dataset}.#{table} in #{@location_for_log}" } 176 | else 177 | Embulk.logger.info { "embulk-output-bigquery: Load job starting... 
#{path} does not exist, skipped" } 178 | return 179 | end 180 | 181 | body = { 182 | job_reference: { 183 | project_id: @project, 184 | job_id: job_id, 185 | }, 186 | configuration: { 187 | load: { 188 | destination_table: { 189 | project_id: @destination_project, 190 | dataset_id: @dataset, 191 | table_id: table, 192 | }, 193 | schema: { 194 | fields: fields, 195 | }, 196 | write_disposition: write_disposition, 197 | source_format: @task['source_format'], 198 | max_bad_records: @task['max_bad_records'], 199 | field_delimiter: @task['source_format'] == 'CSV' ? @task['field_delimiter'] : nil, 200 | encoding: @task['encoding'], 201 | ignore_unknown_values: @task['ignore_unknown_values'], 202 | allow_quoted_newlines: @task['allow_quoted_newlines'], 203 | } 204 | } 205 | } 206 | 207 | if @location 208 | body[:job_reference][:location] = @location 209 | end 210 | 211 | if @task['schema_update_options'] 212 | body[:configuration][:load][:schema_update_options] = @task['schema_update_options'] 213 | end 214 | 215 | opts = { 216 | upload_source: path, 217 | content_type: "application/octet-stream", 218 | # options: { 219 | # retries: @task['retries'], 220 | # timeout_sec: @task['timeout_sec'], 221 | # open_timeout_sec: @task['open_timeout_sec'] 222 | # }, 223 | } 224 | Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" } 225 | response = with_network_retry { client.insert_job(@project, body, **opts) } 226 | if @task['is_skip_job_result_check'] 227 | response 228 | else 229 | response = wait_load('Load', response) 230 | end 231 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 232 | response = {status_code: e.status_code, message: e.message, error_class: e.class} 233 | Embulk.logger.error { 234 | "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}" 235 | } 236 | raise Error, "failed to load #{path} to #{@destination_project}:#{@dataset}.#{table} in #{@location_for_log}, response:#{response}" 237 | end 238 | end 239 | end 240 | 241 | def copy(source_table, destination_table, destination_dataset = nil, write_disposition: 'WRITE_TRUNCATE') 242 | with_job_retry do 243 | begin 244 | destination_dataset ||= @dataset 245 | job_id = "embulk_copy_job_#{SecureRandom.uuid}" 246 | 247 | Embulk.logger.info { 248 | "embulk-output-bigquery: Copy job starting... 
job_id:[#{job_id}] " \ 249 | "#{@destination_project}:#{@dataset}.#{source_table} => #{@destination_project}:#{destination_dataset}.#{destination_table}" 250 | } 251 | 252 | body = { 253 | job_reference: { 254 | project_id: @project, 255 | job_id: job_id, 256 | }, 257 | configuration: { 258 | copy: { 259 | create_deposition: 'CREATE_IF_NEEDED', 260 | write_disposition: write_disposition, 261 | source_table: { 262 | project_id: @destination_project, 263 | dataset_id: @dataset, 264 | table_id: source_table, 265 | }, 266 | destination_table: { 267 | project_id: @destination_project, 268 | dataset_id: destination_dataset, 269 | table_id: destination_table, 270 | }, 271 | } 272 | } 273 | } 274 | 275 | if @location 276 | body[:job_reference][:location] = @location 277 | end 278 | 279 | opts = {} 280 | Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" } 281 | response = with_network_retry { client.insert_job(@project, body, **opts) } 282 | wait_load('Copy', response) 283 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 284 | response = {status_code: e.status_code, message: e.message, error_class: e.class} 285 | Embulk.logger.error { 286 | "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}" 287 | } 288 | raise Error, "failed to copy #{@destination_project}:#{@dataset}.#{source_table} " \ 289 | "to #{@destination_project}:#{destination_dataset}.#{destination_table}, response:#{response}" 290 | end 291 | end 292 | end 293 | 294 | def wait_load(kind, response) 295 | started = Time.now 296 | 297 | wait_interval = @task['job_status_polling_interval'] 298 | max_polling_time = @task['job_status_max_polling_time'] 299 | _response = response 300 | 301 | while true 302 | job_id = _response.job_reference.job_id 303 | location = @location || _response.job_reference.location 304 | elapsed = Time.now - started 305 | status = _response.status.state 306 | if status == "DONE" 307 | Embulk.logger.info { 308 | "embulk-output-bigquery: #{kind} job completed... " \ 309 | "job_id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[#{status}]" 310 | } 311 | break 312 | elsif elapsed.to_i > max_polling_time 313 | message = "embulk-output-bigquery: #{kind} job checking... " \ 314 | "job_id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[TIMEOUT]" 315 | Embulk.logger.info { message } 316 | raise JobTimeoutError.new(message) 317 | else 318 | Embulk.logger.info { 319 | "embulk-output-bigquery: #{kind} job checking... " \ 320 | "job_id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[#{status}]" 321 | } 322 | sleep wait_interval 323 | _response = with_network_retry { client.get_job(@project, job_id, location: location) } 324 | end 325 | end 326 | 327 | # `errors` returns Array if any error exists. 328 | _errors = _response.status.errors 329 | 330 | # cf. http://www.rubydoc.info/github/google/google-api-ruby-client/Google/Apis/BigqueryV2/JobStatus#errors-instance_method 331 | # `error_result` returns Google::Apis::BigqueryV2::ErrorProto if job failed. 332 | # Otherwise, this returns nil. 333 | if _response.status.error_result 334 | msg = "failed during waiting a #{kind} job, get_job(#{@project}, #{job_id}), errors:#{_errors.map(&:to_h)}" 335 | if _errors.any? {|error| error.reason == 'backendError' } 336 | raise BackendError, msg 337 | elsif _errors.any? {|error| error.reason == 'internalError' } 338 | raise InternalError, msg 339 | elsif _errors.any? 
{|error| error.reason == 'rateLimitExceeded' } 340 | raise RateLimitExceeded, msg 341 | else 342 | Embulk.logger.error { "embulk-output-bigquery: #{msg}" } 343 | raise Error, msg 344 | end 345 | end 346 | 347 | if _errors 348 | Embulk.logger.warn { "embulk-output-bigquery: #{kind} job errors... job_id:[#{job_id}] errors:#{_errors.map(&:to_h)}" } 349 | end 350 | 351 | Embulk.logger.info { "embulk-output-bigquery: #{kind} job response... job_id:[#{job_id}] response.statistics:#{_response.statistics.to_h}" } 352 | 353 | _response 354 | end 355 | 356 | def create_dataset(dataset = nil, reference: nil) 357 | dataset ||= @dataset 358 | begin 359 | Embulk.logger.info { "embulk-output-bigquery: Create dataset... #{@destination_project}:#{dataset} in #{@location_for_log}" } 360 | hint = {} 361 | if reference 362 | response = get_dataset(reference) 363 | hint = { access: response.access } 364 | end 365 | body = { 366 | dataset_reference: { 367 | project_id: @project, 368 | dataset_id: dataset, 369 | }, 370 | }.merge(hint) 371 | if @location 372 | body[:location] = @location 373 | end 374 | opts = {} 375 | Embulk.logger.debug { "embulk-output-bigquery: insert_dataset(#{@project}, #{dataset}, #{@location_for_log}, #{body}, #{opts})" } 376 | with_network_retry { client.insert_dataset(@project, body, **opts) } 377 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 378 | if e.status_code == 409 && /Already Exists:/ =~ e.message 379 | # ignore 'Already Exists' error 380 | return 381 | end 382 | 383 | response = {status_code: e.status_code, message: e.message, error_class: e.class} 384 | Embulk.logger.error { 385 | "embulk-output-bigquery: insert_dataset(#{@project}, #{body}, #{opts}), response:#{response}" 386 | } 387 | raise Error, "failed to create dataset #{@destination_project}:#{dataset} in #{@location_for_log}, response:#{response}" 388 | end 389 | end 390 | 391 | def get_dataset(dataset = nil) 392 | dataset ||= @dataset 393 | begin 394 | Embulk.logger.info { "embulk-output-bigquery: Get dataset... #{@destination_project}:#{dataset}" } 395 | with_network_retry { client.get_dataset(@destination_project, dataset) } 396 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 397 | if e.status_code == 404 398 | raise NotFoundError, "Dataset #{@destination_project}:#{dataset} is not found" 399 | end 400 | 401 | response = {status_code: e.status_code, message: e.message, error_class: e.class} 402 | Embulk.logger.error { 403 | "embulk-output-bigquery: get_dataset(#{@destination_project}, #{dataset}), response:#{response}" 404 | } 405 | raise Error, "failed to get dataset #{@destination_project}:#{dataset}, response:#{response}" 406 | end 407 | end 408 | 409 | def create_table_if_not_exists(table, dataset: nil, options: nil) 410 | begin 411 | dataset ||= @dataset 412 | options ||= {} 413 | options['time_partitioning'] ||= @task['time_partitioning'] 414 | if Helper.has_partition_decorator?(table) 415 | options['time_partitioning'] ||= {'type' => 'DAY'} 416 | table = Helper.chomp_partition_decorator(table) 417 | end 418 | 419 | Embulk.logger.info { "embulk-output-bigquery: Create table... 
#{@destination_project}:#{dataset}.#{table}" } 420 | body = { 421 | table_reference: { 422 | table_id: table, 423 | }, 424 | description: @task['description'], 425 | schema: { 426 | fields: fields, 427 | } 428 | } 429 | 430 | if options['time_partitioning'] 431 | body[:time_partitioning] = { 432 | type: options['time_partitioning']['type'], 433 | expiration_ms: options['time_partitioning']['expiration_ms'], 434 | field: options['time_partitioning']['field'], 435 | } 436 | end 437 | 438 | options['range_partitioning'] ||= @task['range_partitioning'] 439 | if options['range_partitioning'] 440 | body[:range_partitioning] = { 441 | field: options['range_partitioning']['field'], 442 | range: { 443 | start: options['range_partitioning']['range']['start'].to_s, 444 | end: options['range_partitioning']['range']['end'].to_s, 445 | interval: options['range_partitioning']['range']['interval'].to_s, 446 | }, 447 | } 448 | end 449 | 450 | options['clustering'] ||= @task['clustering'] 451 | if options['clustering'] 452 | body[:clustering] = { 453 | fields: options['clustering']['fields'], 454 | } 455 | end 456 | 457 | if options['expiration_time'] 458 | # expiration_time is expressed in milliseconds 459 | body[:expiration_time] = (Time.now.to_i + options['expiration_time']) * 1000 460 | end 461 | 462 | opts = {} 463 | Embulk.logger.debug { "embulk-output-bigquery: insert_table(#{@destination_project}, #{dataset}, #{@location_for_log}, #{body}, #{opts})" } 464 | with_network_retry { client.insert_table(@destination_project, dataset, body, **opts) } 465 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 466 | if e.status_code == 409 && /Already Exists:/ =~ e.message 467 | # ignore 'Already Exists' error 468 | return 469 | end 470 | 471 | response = {status_code: e.status_code, message: e.message, error_class: e.class} 472 | Embulk.logger.error { 473 | "embulk-output-bigquery: insert_table(#{@destination_project}, #{dataset}, #{@location_for_log}, #{body}, #{opts}), response:#{response}" 474 | } 475 | raise Error, "failed to create table #{@destination_project}:#{dataset}.#{table} in #{@location_for_log}, response:#{response}" 476 | end 477 | end 478 | 479 | def delete_table(table, dataset: nil) 480 | table = Helper.chomp_partition_decorator(table) 481 | delete_table_or_partition(table, dataset: dataset) 482 | end 483 | 484 | def delete_partition(table, dataset: nil) 485 | delete_table_or_partition(table, dataset: dataset) 486 | end 487 | 488 | # if `table` with a partition decorator is given, a partition is deleted. 489 | def delete_table_or_partition(table, dataset: nil) 490 | begin 491 | dataset ||= @dataset 492 | Embulk.logger.info { "embulk-output-bigquery: Delete table... 
#{@destination_project}:#{dataset}.#{table}" } 493 | with_network_retry { client.delete_table(@destination_project, dataset, table) } 494 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 495 | if e.status_code == 404 && /Not found:/ =~ e.message 496 | # ignore 'Not Found' error 497 | return 498 | end 499 | 500 | response = {status_code: e.status_code, message: e.message, error_class: e.class} 501 | Embulk.logger.error { 502 | "embulk-output-bigquery: delete_table(#{@destination_project}, #{dataset}, #{table}), response:#{response}" 503 | } 504 | raise Error, "failed to delete table #{@destination_project}:#{dataset}.#{table}, response:#{response}" 505 | end 506 | end 507 | 508 | def get_table(table, dataset: nil) 509 | table = Helper.chomp_partition_decorator(table) 510 | get_table_or_partition(table) 511 | end 512 | 513 | def get_partition(table, dataset: nil) 514 | get_table_or_partition(table) 515 | end 516 | 517 | def get_table_or_partition(table, dataset: nil) 518 | begin 519 | dataset ||= @dataset 520 | Embulk.logger.info { "embulk-output-bigquery: Get table... #{@destination_project}:#{dataset}.#{table}" } 521 | with_network_retry { client.get_table(@destination_project, dataset, table) } 522 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 523 | if e.status_code == 404 524 | raise NotFoundError, "Table #{@destination_project}:#{dataset}.#{table} is not found" 525 | end 526 | 527 | response = {status_code: e.status_code, message: e.message, error_class: e.class} 528 | Embulk.logger.error { 529 | "embulk-output-bigquery: get_table(#{@destination_project}, #{dataset}, #{table}), response:#{response}" 530 | } 531 | raise Error, "failed to get table #{@destination_project}:#{dataset}.#{table}, response:#{response}" 532 | end 533 | end 534 | end 535 | end 536 | end 537 | end 538 | -------------------------------------------------------------------------------- /lib/embulk/output/bigquery/file_writer.rb: -------------------------------------------------------------------------------- 1 | require 'zlib' 2 | require 'json' 3 | require 'csv' 4 | require_relative 'value_converter_factory' 5 | 6 | module Embulk 7 | module Output 8 | class Bigquery < OutputPlugin 9 | class FileWriter 10 | attr_reader :num_rows 11 | 12 | def initialize(task, schema, index, converters = nil) 13 | @task = task 14 | @schema = schema 15 | @index = index 16 | @converters = converters || ValueConverterFactory.create_converters(task, schema) 17 | 18 | @num_rows = 0 19 | if @task['progress_log_interval'] 20 | @progress_log_interval = @task['progress_log_interval'] 21 | @progress_log_timer = Time.now 22 | @previous_num_rows = 0 23 | end 24 | 25 | if @task['payload_column_index'] 26 | @payload_column_index = @task['payload_column_index'] 27 | @formatter_proc = self.method(:to_payload) 28 | else 29 | case @task['source_format'].downcase 30 | when 'csv' 31 | @formatter_proc = self.method(:to_csv) 32 | else 33 | @formatter_proc = self.method(:to_jsonl) 34 | end 35 | end 36 | end 37 | 38 | def io 39 | return @io if @io 40 | 41 | path = sprintf( 42 | "#{@task['path_prefix']}#{@task['sequence_format']}#{@task['file_ext']}", 43 | Process.pid, Thread.current.object_id 44 | ) 45 | if File.exist?(path) 46 | Embulk.logger.warn { "embulk-output-bigquery: unlink already existing #{path}" } 47 | File.unlink(path) rescue nil 48 | end 49 | Embulk.logger.info { "embulk-output-bigquery: create #{path}" } 50 | 51 | @io = open(path, 'w') 
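# The intermediate file is opened lazily, on first use; its name is built from path_prefix,
# sequence_format (filled with Process.pid and Thread.current.object_id) and file_ext, so
# concurrent output tasks never collide on the same local file. `open` below wraps the File
# in Zlib::GzipWriter when compression is GZIP, so data is compressed as it is written.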
52 | end 53 | 54 | def open(path, mode = 'w') 55 | file_io = File.open(path, mode) 56 | case @task['compression'].downcase 57 | when 'gzip' 58 | io = Zlib::GzipWriter.new(file_io) 59 | else 60 | io = file_io 61 | end 62 | io 63 | end 64 | 65 | def close 66 | io.close rescue nil 67 | io 68 | end 69 | 70 | def reopen 71 | @io = open(io.path, 'a') 72 | end 73 | 74 | def to_payload(record) 75 | "#{record[@payload_column_index]}\n" 76 | end 77 | 78 | def to_csv(record) 79 | record.map.with_index do |value, column_index| 80 | @converters[column_index].call(value) 81 | end.to_csv 82 | end 83 | 84 | def to_jsonl(record) 85 | hash = {} 86 | column_names = @schema.names 87 | record.each_with_index do |value, column_index| 88 | column_name = column_names[column_index] 89 | hash[column_name] = @converters[column_index].call(value) 90 | end 91 | "#{hash.to_json}\n" 92 | end 93 | 94 | def num_format(number) 95 | number.to_s.gsub(/(\d)(?=(\d{3})+(?!\d))/, '\1,') 96 | end 97 | 98 | def add(page) 99 | _io = io 100 | # I once tried to split IO writing into another IO thread using SizedQueue 101 | # However, it resulted in worse performance, so I removed the codes. 102 | page.each do |record| 103 | Embulk.logger.trace { "embulk-output-bigquery: record #{record}" } 104 | formatted_record = @formatter_proc.call(record) 105 | Embulk.logger.trace { "embulk-output-bigquery: formatted_record #{formatted_record.chomp}" } 106 | _io.write formatted_record 107 | @num_rows += 1 108 | end 109 | show_progress if @task['progress_log_interval'] 110 | @num_rows 111 | end 112 | 113 | private 114 | 115 | def show_progress 116 | now = Time.now 117 | if @progress_log_timer < now - @progress_log_interval 118 | speed = ((@num_rows - @previous_num_rows) / (now - @progress_log_timer).to_f).round(1) 119 | @progress_log_timer = now 120 | @previous_num_rows = @num_rows 121 | Embulk.logger.info { "embulk-output-bigquery: num_rows #{num_format(@num_rows)} (#{num_format(speed)} rows/sec)" } 122 | end 123 | end 124 | end 125 | end 126 | end 127 | end 128 | -------------------------------------------------------------------------------- /lib/embulk/output/bigquery/gcs_client.rb: -------------------------------------------------------------------------------- 1 | require 'uri' 2 | require 'java' 3 | require 'google/apis/storage_v1' 4 | require_relative 'google_client' 5 | require_relative 'helper' 6 | 7 | # ToDo: Use https://cloud.google.com/storage/docs/streaming if google-api-ruby-client supports streaming transfers 8 | # ToDo: Tests are not written because this implementation will probably entirely changed on supporting streaming transfers 9 | module Embulk 10 | module Output 11 | class Bigquery < OutputPlugin 12 | class GcsClient < GoogleClient 13 | def initialize(task) 14 | scope = "https://www.googleapis.com/auth/cloud-platform" 15 | client_class = Google::Apis::StorageV1::StorageService 16 | super(task, scope, client_class) 17 | 18 | @project = @task['project'] 19 | @destination_project = @task['destination_project'] 20 | @bucket = @task['gcs_bucket'] 21 | @location = @task['location'] 22 | end 23 | 24 | def insert_temporary_bucket(bucket = nil) 25 | bucket ||= @bucket 26 | begin 27 | Embulk.logger.info { "embulk-output-bigquery: Insert bucket... 
#{@destination_project}:#{bucket}" } 28 | body = { 29 | name: bucket, 30 | lifecycle: { 31 | rule: [ 32 | { 33 | action: { 34 | type: "Delete", 35 | }, 36 | condition: { 37 | age: 1, 38 | } 39 | }, 40 | ] 41 | } 42 | } 43 | 44 | if @location 45 | body[:location] = @location 46 | end 47 | 48 | opts = {} 49 | 50 | Embulk.logger.debug { "embulk-output-bigquery: insert_temporary_bucket(#{@project}, #{body}, #{opts})" } 51 | with_network_retry { client.insert_bucket(@project, body, **opts) } 52 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 53 | if e.status_code == 409 && /conflict:/ =~ e.message 54 | # ignore 'Already Exists' error 55 | return nil 56 | end 57 | response = {status_code: e.status_code, message: e.message, error_class: e.class} 58 | Embulk.logger.error { 59 | "embulk-output-bigquery: insert_temporary_bucket(#{@project}, #{body}, #{opts}), response:#{response}" 60 | } 61 | raise Error, "failed to insert bucket #{@destination_project}:#{bucket}, response:#{response}" 62 | end 63 | end 64 | 65 | def insert_object(path, object: nil, bucket: nil) 66 | bucket ||= @bucket 67 | object ||= path 68 | object = object.start_with?('/') ? object[1..-1] : object 69 | object_uri = URI.join("gs://#{bucket}", object).to_s 70 | 71 | started = Time.now 72 | begin 73 | Embulk.logger.info { "embulk-output-bigquery: Insert object... #{path} => #{@destination_project}:#{object_uri}" } 74 | body = { 75 | name: object, 76 | } 77 | opts = { 78 | upload_source: path, 79 | content_type: 'application/octet-stream' 80 | } 81 | 82 | Embulk.logger.debug { "embulk-output-bigquery: insert_object(#{bucket}, #{body}, #{opts})" } 83 | # memo: gcs is strongly consistent for insert (read-after-write). ref: https://cloud.google.com/storage/docs/consistency 84 | with_network_retry { client.insert_object(bucket, body, **opts) } 85 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 86 | response = {status_code: e.status_code, message: e.message, error_class: e.class} 87 | Embulk.logger.error { 88 | "embulk-output-bigquery: insert_object(#{bucket}, #{body}, #{opts}), response:#{response}" 89 | } 90 | raise Error, "failed to insert object #{@destination_project}:#{object_uri}, response:#{response}" 91 | end 92 | end 93 | 94 | def insert_objects(paths, objects: nil, bucket: nil) 95 | return [] if paths.empty? 96 | bucket ||= @bucket 97 | objects ||= paths 98 | raise "number of paths and objects are different" if paths.size != objects.size 99 | 100 | responses = [] 101 | paths.each_with_index do |path, idx| 102 | object = objects[idx] 103 | responses << insert_object(path, object: object, bucket: bucket) 104 | end 105 | responses 106 | end 107 | 108 | def delete_object(object, bucket: nil) 109 | bucket ||= @bucket 110 | object = object.start_with?('/') ? object[1..-1] : object 111 | object_uri = URI.join("gs://#{bucket}", object).to_s 112 | begin 113 | Embulk.logger.info { "embulk-output-bigquery: Delete object... 
#{@destination_project}:#{object_uri}" } 114 | opts = {} 115 | 116 | Embulk.logger.debug { "embulk-output-bigquery: delete_object(#{bucket}, #{object}, #{opts})" } 117 | response = with_network_retry { client.delete_object(bucket, object, **opts) } 118 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 119 | if e.status_code == 404 # ignore 'notFound' error 120 | return nil 121 | end 122 | response = {status_code: e.status_code, message: e.message, error_class: e.class} 123 | Embulk.logger.error { 124 | "embulk-output-bigquery: delete_object(#{bucket}, #{object}, #{opts}), response:#{response}" 125 | } 126 | raise Error, "failed to delete object #{@destination_project}:#{object_uri}, response:#{response}" 127 | end 128 | end 129 | end 130 | end 131 | end 132 | end 133 | -------------------------------------------------------------------------------- /lib/embulk/output/bigquery/google_client.rb: -------------------------------------------------------------------------------- 1 | require_relative 'auth' 2 | 3 | module Embulk 4 | module Output 5 | class Bigquery < OutputPlugin 6 | class Error < StandardError; end 7 | class JobTimeoutError < Error; end 8 | class NotFoundError < Error; end 9 | class BackendError < Error; end 10 | class InternalError < Error; end 11 | class RateLimitExceeded < Error; end 12 | 13 | class GoogleClient 14 | def initialize(task, scope, client_class) 15 | @task = task 16 | @scope = scope 17 | @auth = Auth.new(task, scope) 18 | @client_class = client_class 19 | end 20 | 21 | def client 22 | return @cached_client if @cached_client && @cached_client_expiration > Time.now 23 | 24 | client = @client_class.new 25 | client.client_options.application_name = @task['application_name'] 26 | client.request_options.retries = @task['retries'] 27 | if client.request_options.respond_to?(:timeout_sec) 28 | client.request_options.timeout_sec = @task['timeout_sec'] || 300 29 | client.request_options.open_timeout_sec = @task['open_timeout_sec'] || 300 30 | else # google-api-ruby-client >= v0.11.0 31 | if @task['timeout_sec'] 32 | Embulk.logger.warn { "embulk-output-bigquery: timeout_sec is deprecated in google-api-ruby-client >= v0.11.0. Use read_timeout_sec instead" } 33 | end 34 | client.client_options.open_timeout_sec = @task['open_timeout_sec'] || 300 # default: 60 35 | client.client_options.send_timeout_sec = @task['send_timeout_sec'] || 300 # default: 120 36 | client.client_options.read_timeout_sec = @task['read_timeout_sec'] || @task['timeout_sec'] || 300 # default: 60 37 | end 38 | Embulk.logger.debug { "embulk-output-bigquery: client_options: #{client.client_options.to_h}" } 39 | Embulk.logger.debug { "embulk-output-bigquery: request_options: #{client.request_options.to_h}" } 40 | 41 | client.authorization = @auth.authenticate 42 | 43 | @cached_client_expiration = Time.now + 1800 44 | @cached_client = client 45 | end 46 | 47 | # google-api-ruby-client itself has a retry feature, but it does not retry with SocketException 48 | def with_network_retry(&block) 49 | retries = 0 50 | begin 51 | yield 52 | rescue ::Java::Java.net.SocketException, ::Java::Java.net.ConnectException, ::Java::JavaxNetSsl::SSLException => e 53 | retry_messages = [ 54 | 'Broken pipe', 55 | 'Connection reset', 56 | 'Connection timed out', 57 | 'Connection or outbound has closed', 58 | ] 59 | if retry_messages.select { |x| e.message.include?(x) }.empty? 
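# If the exception message matches none of the messages above, it is not
# treated as a transient network error and is re-raised immediately;
# otherwise the block is retried up to @task['retries'] times, e.g.
#   with_network_retry { client.insert_object(bucket, body, **opts) }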
60 | raise e 61 | else 62 | if retries < @task['retries'] 63 | retries += 1 64 | Embulk.logger.warn { "embulk-output-bigquery: retry \##{retries}, #{e.class} #{e.message}" } 65 | retry 66 | else 67 | Embulk.logger.error { "embulk-output-bigquery: retry exhausted \##{retries}, #{e.class} #{e.message}" } 68 | raise e 69 | end 70 | end 71 | end 72 | end 73 | end 74 | end 75 | end 76 | end 77 | -------------------------------------------------------------------------------- /lib/embulk/output/bigquery/helper.rb: -------------------------------------------------------------------------------- 1 | require 'digest/md5' 2 | require 'securerandom' 3 | 4 | module Embulk 5 | module Output 6 | class Bigquery < OutputPlugin 7 | class Helper 8 | PARTITION_DECORATOR_REGEXP = /\$.+\z/ 9 | 10 | def self.field_partitioning?(task) 11 | (task['time_partitioning'] || {}).key?('field') 12 | end 13 | 14 | def self.has_partition_decorator?(table_name) 15 | !!(table_name =~ PARTITION_DECORATOR_REGEXP) 16 | end 17 | 18 | def self.chomp_partition_decorator(table_name) 19 | table_name.sub(PARTITION_DECORATOR_REGEXP, '') 20 | end 21 | 22 | def self.bq_type_from_embulk_type(embulk_type) 23 | case embulk_type 24 | when :boolean then 'BOOLEAN' 25 | when :long then 'INTEGER' 26 | when :double then 'FLOAT' 27 | when :string then 'STRING' 28 | when :timestamp then 'TIMESTAMP' 29 | when :json then 'STRING' # NOTE: Default is not RECORD since it requires `fields` 30 | else raise ArgumentError, "embulk type #{embulk_type} is not supported" 31 | end 32 | end 33 | 34 | # @return [Hash] name => column_option. 35 | # ToDo: recursively map fields? 36 | def self.column_options_map(column_options) 37 | (column_options || {}).map do |column_option| 38 | [column_option['name'], column_option] 39 | end.to_h 40 | end 41 | 42 | def self.fields_from_embulk_schema(task, schema) 43 | column_options_map = self.column_options_map(task['column_options']) 44 | schema.map do |column| 45 | column_name = column[:name] 46 | embulk_type = column[:type] 47 | column_option = column_options_map[column_name] || {} 48 | {}.tap do |field| 49 | field[:name] = column_name 50 | field[:type] = (column_option['type'] || bq_type_from_embulk_type(embulk_type)).upcase 51 | field[:mode] = column_option['mode'] if column_option['mode'] 52 | field[:fields] = deep_symbolize_keys(column_option['fields']) if column_option['fields'] 53 | field[:description] = column_option['description'] if column_option['description'] 54 | end 55 | end 56 | end 57 | 58 | def self.deep_symbolize_keys(obj) 59 | if obj.is_a?(Hash) 60 | obj.inject({}) do |options, (key, value)| 61 | options[(key.to_sym rescue key) || key] = deep_symbolize_keys(value) 62 | options 63 | end 64 | elsif obj.is_a?(Array) 65 | obj.map {|value| deep_symbolize_keys(value) } 66 | else 67 | obj 68 | end 69 | end 70 | 71 | def self.create_load_job_id(task, path, fields) 72 | elements = [ 73 | Digest::MD5.file(path).hexdigest, 74 | task['dataset'], 75 | task['location'], 76 | task['table'], 77 | fields, 78 | task['source_format'], 79 | task['max_bad_records'], 80 | task['field_delimiter'], 81 | task['encoding'], 82 | task['ignore_unknown_values'], 83 | task['allow_quoted_newlines'], 84 | ] 85 | 86 | str = elements.map(&:to_s).join('') 87 | md5 = Digest::MD5.hexdigest(str) 88 | "embulk_load_job_#{md5}" 89 | end 90 | end 91 | end 92 | end 93 | end 94 | -------------------------------------------------------------------------------- /lib/embulk/output/bigquery/value_converter_factory.rb: 
-------------------------------------------------------------------------------- 1 | require 'time' 2 | require 'time_with_zone' 3 | require 'json' 4 | require_relative 'helper' 5 | 6 | module Embulk 7 | module Output 8 | class Bigquery < OutputPlugin 9 | class ValueConverterFactory 10 | class NotSupportedType < StandardError; end 11 | class TypeCastError < StandardError; end 12 | 13 | # ref. https://cloud.google.com/bigquery/preparing-data-for-bigquery 14 | 15 | DEFAULT_TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S.%6N" # BigQuery timestamp format 16 | DEFAULT_TIMEZONE = "UTC" 17 | 18 | # @param [Hash] task 19 | # @option task [String] default_timestamp_format 20 | # @option task [String] default_timezone 21 | # @option task [Hash] column_options user defined column types 22 | # @param [Schema] schema embulk defined column types 23 | # @return [Array] an arary whose key is column_index, and value is its converter (Proc) 24 | def self.create_converters(task, schema) 25 | column_options_map = Helper.column_options_map(task['column_options']) 26 | default_timestamp_format = task['default_timestamp_format'] || DEFAULT_TIMESTAMP_FORMAT 27 | default_timezone = task['default_timezone'] || DEFAULT_TIMEZONE 28 | schema.map do |column| 29 | column_name = column[:name] 30 | embulk_type = column[:type] 31 | column_option = column_options_map[column_name] || {} 32 | self.new( 33 | embulk_type, column_option['type'], 34 | timestamp_format: column_option['timestamp_format'], 35 | timezone: column_option['timezone'], 36 | strict: column_option['strict'], 37 | default_timestamp_format: default_timestamp_format, 38 | default_timezone: default_timezone, 39 | ).create_converter 40 | end 41 | end 42 | 43 | attr_reader :embulk_type, :type, :timestamp_format, :timezone, :zone_offset, :strict 44 | 45 | def initialize( 46 | embulk_type, type = nil, 47 | timestamp_format: nil, timezone: nil, strict: nil, 48 | default_timestamp_format: DEFAULT_TIMESTAMP_FORMAT, 49 | default_timezone: DEFAULT_TIMEZONE 50 | ) 51 | @embulk_type = embulk_type 52 | @type = (type || Helper.bq_type_from_embulk_type(embulk_type)).upcase 53 | @timestamp_format = timestamp_format 54 | @default_timestamp_format = default_timestamp_format 55 | @timezone = timezone || default_timezone 56 | @zone_offset = TimeWithZone.zone_offset(@timezone) 57 | @strict = strict.nil? ? true : strict 58 | end 59 | 60 | def create_converter 61 | case embulk_type 62 | when :boolean then boolean_converter 63 | when :long then long_converter 64 | when :double then double_converter 65 | when :string then string_converter 66 | when :timestamp then timestamp_converter 67 | when :json then json_converter 68 | else raise NotSupportedType, "embulk type #{embulk_type} is not supported" 69 | end 70 | end 71 | 72 | def with_typecast_error(val) 73 | begin 74 | yield(val) 75 | rescue => e 76 | raise_typecast_error(val) 77 | end 78 | end 79 | 80 | def raise_typecast_error(val) 81 | message = "cannot cast #{@embulk_type} `#{val}` to #{@type}" 82 | if @strict 83 | raise TypeCastError, message 84 | else 85 | Embulk.logger.trace { message } 86 | return nil 87 | end 88 | end 89 | 90 | def boolean_converter 91 | case type 92 | when 'BOOLEAN' 93 | Proc.new {|val| 94 | val 95 | } 96 | when 'STRING' 97 | Proc.new {|val| 98 | next nil if val.nil? 99 | val.to_s 100 | } 101 | else 102 | raise NotSupportedType, "cannot take column type #{type} for boolean column" 103 | end 104 | end 105 | 106 | def long_converter 107 | case type 108 | when 'BOOLEAN' 109 | Proc.new {|val| 110 | next nil if val.nil? 
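# Only 0 and 1 are accepted as boolean values for a long column; any other
# integer falls through to raise_typecast_error, which raises in strict mode
# or yields nil otherwise.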
111 | next true if val == 1 112 | next false if val == 0 113 | raise_typecast_error(val) 114 | } 115 | when 'INTEGER' 116 | Proc.new {|val| 117 | val 118 | } 119 | when 'FLOAT' 120 | Proc.new {|val| 121 | next nil if val.nil? 122 | val.to_f 123 | } 124 | when 'STRING' 125 | Proc.new {|val| 126 | next nil if val.nil? 127 | val.to_s 128 | } 129 | when 'TIMESTAMP' 130 | Proc.new {|val| 131 | next nil if val.nil? 132 | val # BigQuery supports UNIX timestamp 133 | } 134 | else 135 | raise NotSupportedType, "cannot take column type #{type} for long column" 136 | end 137 | end 138 | 139 | def double_converter 140 | case type 141 | when 'INTEGER' 142 | Proc.new {|val| 143 | next nil if val.nil? 144 | val.to_i 145 | } 146 | when 'FLOAT' 147 | Proc.new {|val| 148 | val 149 | } 150 | when 'STRING' 151 | Proc.new {|val| 152 | next nil if val.nil? 153 | val.to_s 154 | } 155 | when 'TIMESTAMP' 156 | Proc.new {|val| 157 | next nil if val.nil? 158 | val # BigQuery supports UNIX timestamp 159 | } 160 | else 161 | raise NotSupportedType, "cannot take column type #{type} for double column" 162 | end 163 | end 164 | 165 | def string_converter 166 | case type 167 | when 'BOOLEAN' 168 | Proc.new {|val| 169 | next nil if val.nil? 170 | next true if val == 'true'.freeze 171 | next false if val == 'false'.freeze 172 | raise_typecast_error(val) 173 | } 174 | when 'INTEGER' 175 | Proc.new {|val| 176 | next nil if val.nil? 177 | with_typecast_error(val) do |val| 178 | Integer(val) 179 | end 180 | } 181 | when 'FLOAT' 182 | Proc.new {|val| 183 | next nil if val.nil? 184 | with_typecast_error(val) do |val| 185 | Float(val) 186 | end 187 | } 188 | when 'STRING' 189 | Proc.new {|val| 190 | val 191 | } 192 | when 'TIMESTAMP' 193 | if @timestamp_format 194 | Proc.new {|val| 195 | next nil if val.nil? 196 | with_typecast_error(val) do |val| 197 | TimeWithZone.set_zone_offset(Time.strptime(val, @timestamp_format), zone_offset).strftime("%Y-%m-%d %H:%M:%S.%6N %:z") 198 | end 199 | } 200 | else 201 | Proc.new {|val| 202 | next nil if val.nil? 203 | val # Users must care of BQ timestamp format 204 | } 205 | end 206 | when 'DATE' 207 | Proc.new {|val| 208 | next nil if val.nil? 209 | with_typecast_error(val) do |val| 210 | TimeWithZone.set_zone_offset(Time.parse(val), zone_offset).strftime("%Y-%m-%d") 211 | end 212 | } 213 | when 'DATETIME' 214 | if @timestamp_format 215 | Proc.new {|val| 216 | next nil if val.nil? 217 | with_typecast_error(val) do |val| 218 | Time.strptime(val, @timestamp_format).strftime("%Y-%m-%d %H:%M:%S.%6N") 219 | end 220 | } 221 | else 222 | Proc.new {|val| 223 | next nil if val.nil? 224 | val # Users must care of BQ timestamp format 225 | } 226 | end 227 | when 'TIME' 228 | # TimeWithZone doesn't affect any change to the time value 229 | Proc.new {|val| 230 | next nil if val.nil? 231 | with_typecast_error(val) do |val| 232 | TimeWithZone.set_zone_offset(Time.parse(val), zone_offset).strftime("%H:%M:%S.%6N") 233 | end 234 | } 235 | when 'RECORD' 236 | Proc.new {|val| 237 | next nil if val.nil? 238 | with_typecast_error(val) do |val| 239 | JSON.parse(val) 240 | end 241 | } 242 | else 243 | raise NotSupportedType, "cannot take column type #{type} for string column" 244 | end 245 | end 246 | 247 | def timestamp_converter 248 | case type 249 | when 'INTEGER' 250 | Proc.new {|val| 251 | next nil if val.nil? 252 | val.to_i 253 | } 254 | when 'FLOAT' 255 | Proc.new {|val| 256 | next nil if val.nil? 
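# Time#to_f yields epoch seconds with sub-second precision, i.e. the UNIX
# timestamp form noted in the converters above.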
257 | val.to_f 258 | } 259 | when 'STRING' 260 | _timestamp_format = @timestamp_format || @default_timestamp_format 261 | Proc.new {|val| 262 | next nil if val.nil? 263 | with_typecast_error(val) do |val| 264 | val.localtime(zone_offset).strftime(_timestamp_format) 265 | end 266 | } 267 | when 'TIMESTAMP' 268 | Proc.new {|val| 269 | next nil if val.nil? 270 | val.strftime("%Y-%m-%d %H:%M:%S.%6N %:z") 271 | } 272 | when 'DATE' 273 | Proc.new {|val| 274 | next nil if val.nil? 275 | val.localtime(zone_offset).strftime("%Y-%m-%d") 276 | } 277 | when 'DATETIME' 278 | Proc.new {|val| 279 | next nil if val.nil? 280 | val.localtime(zone_offset).strftime("%Y-%m-%d %H:%M:%S.%6N") 281 | } 282 | when 'TIME' 283 | Proc.new {|val| 284 | next nil if val.nil? 285 | val.localtime(zone_offset).strftime("%H:%M:%S.%6N") 286 | } 287 | else 288 | raise NotSupportedType, "cannot take column type #{type} for timestamp column" 289 | end 290 | end 291 | 292 | # ToDo: recursive conversion 293 | def json_converter 294 | case type 295 | when 'STRING' 296 | Proc.new {|val| 297 | next nil if val.nil? 298 | val.to_json 299 | } 300 | when 'RECORD' 301 | Proc.new {|val| 302 | val 303 | } 304 | when 'JSON' 305 | Proc.new {|val| 306 | val 307 | } 308 | else 309 | raise NotSupportedType, "cannot take column type #{type} for json column" 310 | end 311 | end 312 | end 313 | end 314 | end 315 | end 316 | -------------------------------------------------------------------------------- /test/helper.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'bundler/setup' 4 | require 'test/unit' 5 | require 'test/unit/rr' 6 | 7 | # Embulk 0.10.x introduced new bootstrap mechanism. 8 | # https://github.com/embulk/embulk/blob/641f35fec064cca7b1a7314d634a4b64ef8637f1/embulk-ruby/test/vanilla/run-test.rb#L8-L13 9 | static_initializer = Java::org.embulk.EmbulkDependencyClassLoader.staticInitializer().useSelfContainedJarFiles() 10 | static_initializer.java_send :initialize 11 | 12 | require 'embulk/java/bootstrap' 13 | require 'embulk' 14 | 15 | Embulk.logger = Embulk::Logger.new('/dev/null') 16 | 17 | APP_ROOT = File.expand_path('../', __dir__) 18 | EXAMPLE_ROOT = File.expand_path('../example', __dir__) 19 | TEST_ROOT = File.expand_path(File.dirname(__FILE__)) 20 | JSON_KEYFILE = File.join(EXAMPLE_ROOT, 'your-project-000.json') 21 | -------------------------------------------------------------------------------- /test/test_bigquery_client.rb: -------------------------------------------------------------------------------- 1 | require_relative './helper' 2 | require 'embulk/output/bigquery/bigquery_client' 3 | require 'csv' 4 | 5 | # 1. Prepare example/your-project-000.json 6 | # 2. bunlde exec ruby test/test_bigquery_client.rb 7 | 8 | unless File.exist?(JSON_KEYFILE) 9 | puts "#{JSON_KEYFILE} is not found. 
Skip test/test_bigquery_client.rb" 10 | else 11 | module Embulk 12 | class Output::Bigquery 13 | class TestBigqueryClient < Test::Unit::TestCase 14 | class << self 15 | def startup 16 | FileUtils.mkdir_p('tmp') 17 | end 18 | 19 | def shutdown 20 | FileUtils.rm_rf('tmp') 21 | end 22 | end 23 | 24 | def client(task = {}) 25 | task = least_task.merge(task) 26 | BigqueryClient.new(task, schema) 27 | end 28 | 29 | def least_task 30 | { 31 | 'project' => JSON.parse(File.read(JSON_KEYFILE))['project_id'], 32 | 'destination_project' => JSON.parse(File.read(JSON_KEYFILE))['project_id'], 33 | 'dataset' => 'your_dataset_name', 34 | 'table' => 'your_table_name', 35 | 'auth_method' => 'json_key', 36 | 'json_keyfile' => File.read(JSON_KEYFILE), 37 | 'retries' => 3, 38 | 'timeout_sec' => 300, 39 | 'open_timeout_sec' => 300, 40 | 'job_status_max_polling_time' => 3600, 41 | 'job_status_polling_interval' => 10, 42 | 'source_format' => 'CSV' 43 | } 44 | end 45 | 46 | def schema 47 | Schema.new([ 48 | Column.new({index: 0, name: 'boolean', type: :boolean}), 49 | Column.new({index: 1, name: 'long', type: :long}), 50 | Column.new({index: 2, name: 'double', type: :double}), 51 | Column.new({index: 3, name: 'string', type: :string}), 52 | Column.new({index: 4, name: 'timestamp', type: :timestamp}), 53 | Column.new({index: 5, name: 'json', type: :json}), 54 | ]) 55 | end 56 | 57 | def record 58 | [true,1,1.1,'1',Time.parse("2016-02-26 +00:00"),'{"foo":"bar"}'] 59 | end 60 | 61 | sub_test_case "client" do 62 | def test_json_keyfile 63 | assert_nothing_raised { BigqueryClient.new(least_task, schema).client } 64 | end 65 | end 66 | 67 | sub_test_case "create_dataset" do 68 | def test_create_dataset 69 | assert_nothing_raised { client.create_dataset } 70 | end 71 | 72 | def test_create_dataset_with_reference 73 | response = client.get_dataset 74 | any_instance_of(BigqueryClient) do |obj| 75 | mock(obj).get_dataset('your_dataset_name') { response } 76 | end 77 | assert_nothing_raised do 78 | client.create_dataset('your_dataset_name_old', reference: 'your_dataset_name') 79 | end 80 | end 81 | end 82 | 83 | sub_test_case "get_dataset" do 84 | def test_get_dataset 85 | assert_nothing_raised { client.create_dataset } 86 | assert_nothing_raised { client.get_dataset } 87 | end 88 | 89 | def test_get_dataset_not_found 90 | assert_raise(NotFoundError) { 91 | client.get_dataset('something_does_not_exist') 92 | } 93 | end 94 | end 95 | 96 | sub_test_case "create_table_if_not_exists" do 97 | def test_create_table_if_not_exists 98 | client.delete_table('your_table_name') 99 | assert_nothing_raised { client.create_table_if_not_exists('your_table_name') } 100 | end 101 | 102 | def test_create_table_if_not_exists_already_exists 103 | assert_nothing_raised { client.create_table_if_not_exists('your_table_name') } 104 | end 105 | 106 | def test_create_partitioned_table 107 | client.delete_table('your_table_name') 108 | assert_nothing_raised do 109 | client.create_table_if_not_exists('your_table_name$20160929', options:{ 110 | 'time_partitioning' => {'type'=>'DAY', 'expiration_ms'=>1000} 111 | }) 112 | end 113 | end 114 | end 115 | 116 | sub_test_case "delete_table" do 117 | def test_delete_table 118 | client.create_table_if_not_exists('your_table_name') 119 | assert_nothing_raised { client.delete_table('your_table_name') } 120 | end 121 | 122 | def test_delete_table_not_found 123 | assert_nothing_raised { client.delete_table('your_table_name') } 124 | end 125 | 126 | def test_delete_partitioned_table 127 | 
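# delete_table chomps the partition decorator before calling the API, so
# deleting 'your_table_name$20160929' succeeds even though the table created
# here is not partitioned.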
client.create_table_if_not_exists('your_table_name') 128 | assert_nothing_raised { client.delete_table('your_table_name$20160929') } 129 | end 130 | end 131 | 132 | sub_test_case "get_table" do 133 | def test_get_table 134 | client.create_table_if_not_exists('your_table_name') 135 | assert_nothing_raised { client.get_table('your_table_name') } 136 | end 137 | 138 | def test_get_table_not_found 139 | client.delete_table('your_table_name') 140 | assert_raise(NotFoundError) { 141 | client.get_table('your_table_name') 142 | } 143 | end 144 | 145 | def test_get_partitioned_table 146 | client.create_table_if_not_exists('your_table_name') 147 | assert_nothing_raised { client.get_table('your_table_name$20160929') } 148 | end 149 | end 150 | 151 | sub_test_case "delete_partition" do 152 | def test_delete_partition 153 | client.delete_table('your_table_name') 154 | client.create_table_if_not_exists('your_table_name$20160929') 155 | assert_nothing_raised { client.delete_partition('your_table_name$20160929') } 156 | ensure 157 | client.delete_table('your_table_name') 158 | end 159 | 160 | def test_delete_partition_of_non_partitioned_table 161 | client.delete_table('your_table_name') 162 | client.create_table_if_not_exists('your_table_name') 163 | assert_raise { client.delete_partition('your_table_name$20160929') } 164 | ensure 165 | client.delete_table('your_table_name') 166 | end 167 | 168 | def test_delete_partition_table_not_found 169 | assert_nothing_raised { client.delete_partition('your_table_name$20160929') } 170 | end 171 | end 172 | 173 | sub_test_case "fields" do 174 | def test_fields_from_table 175 | client.create_table_if_not_exists('your_table_name') 176 | fields = client.fields_from_table('your_table_name') 177 | expected = [ 178 | {:type=>"BOOLEAN", :name=>"boolean"}, 179 | {:type=>"INTEGER", :name=>"long"}, 180 | {:type=>"FLOAT", :name=>"double"}, 181 | {:type=>"STRING", :name=>"string"}, 182 | {:type=>"TIMESTAMP", :name=>"timestamp"}, 183 | {:type=>"STRING", :name=>"json"}, 184 | ] 185 | assert_equal expected, fields 186 | end 187 | end 188 | 189 | sub_test_case "copy" do 190 | def test_create_table_if_not_exists 191 | client.create_table_if_not_exists('your_table_name') 192 | assert_nothing_raised { client.copy('your_table_name', 'your_table_name_old') } 193 | end 194 | end 195 | 196 | sub_test_case "load" do 197 | def test_load 198 | client.create_table_if_not_exists('your_table_name') 199 | File.write("tmp/your_file_name.csv", record.to_csv) 200 | assert_nothing_raised { client.load("/tmp/your_file_name.csv", 'your_table_name') } 201 | end 202 | end 203 | end 204 | end 205 | end 206 | end 207 | -------------------------------------------------------------------------------- /test/test_configure.rb: -------------------------------------------------------------------------------- 1 | require_relative './helper' 2 | require 'embulk/output/bigquery' 3 | 4 | Bigquery = Embulk::Output::Bigquery unless defined?(Bigquery) 5 | 6 | module Embulk 7 | class Output::Bigquery 8 | class TestConfigure < Test::Unit::TestCase 9 | class << self 10 | def startup 11 | FileUtils.mkdir_p('tmp') 12 | end 13 | 14 | def shutdown 15 | FileUtils.rm_rf('tmp') 16 | end 17 | end 18 | 19 | def least_config 20 | DataSource.new({ 21 | 'project' => 'your_project_name', 22 | 'dataset' => 'your_dataset_name', 23 | 'table' => 'your_table_name', 24 | }) 25 | end 26 | 27 | def schema 28 | Schema.new([ 29 | Column.new({index: 0, name: 'boolean', type: :boolean}), 30 | Column.new({index: 1, name: 'long', type: :long}), 31 
| Column.new({index: 2, name: 'double', type: :double}), 32 | Column.new({index: 3, name: 'string', type: :string}), 33 | Column.new({index: 4, name: 'timestamp', type: :timestamp}), 34 | Column.new({index: 5, name: 'json', type: :json}), 35 | ]) 36 | end 37 | 38 | def processor_count 39 | 1 40 | end 41 | 42 | def test_configure_default 43 | task = Bigquery.configure(least_config, schema, processor_count) 44 | assert_equal "append", task['mode'] 45 | assert_equal "application_default", task['auth_method'] 46 | assert_equal nil, task['json_keyfile'] 47 | assert_equal "your_project_name", task['project'] 48 | assert_equal "your_project_name", task['destination_project'] 49 | assert_equal "your_dataset_name", task['dataset'] 50 | assert_equal nil, task['location'] 51 | assert_equal "your_table_name", task['table'] 52 | assert_equal nil, task['dataset_old'] 53 | assert_equal nil, task['table_old'] 54 | assert_equal nil, task['table_name_old'] 55 | assert_equal false, task['auto_create_dataset'] 56 | assert_equal true, task['auto_create_table'] 57 | assert_equal nil, task['schema_file'] 58 | assert_equal nil, task['template_table'] 59 | assert_equal true, task['delete_from_local_when_job_end'] 60 | assert_equal 3600, task['job_status_max_polling_time'] 61 | assert_equal 10, task['job_status_polling_interval'] 62 | assert_equal false, task['is_skip_job_result_check'] 63 | assert_equal false, task['with_rehearsal'] 64 | assert_equal 1000, task['rehearsal_counts'] 65 | assert_equal [], task['column_options'] 66 | assert_equal "UTC", task['default_timezone'] 67 | assert_equal "%Y-%m-%d %H:%M:%S.%6N", task['default_timestamp_format'] 68 | assert_equal nil, task['payload_column'] 69 | assert_equal nil, task['payload_column_index'] 70 | assert_equal 5, task['retries'] 71 | assert_equal "Embulk BigQuery plugin", task['application_name'] 72 | # assert_equal "/tmp/embulk_output_bigquery_20160228-27184-pubcn0", task['path_prefix'] 73 | assert_equal ".%d.%d", task['sequence_format'] 74 | assert_equal ".csv", task['file_ext'] 75 | assert_equal false, task['skip_file_generation'] 76 | assert_equal "NONE", task['compression'] 77 | assert_equal "CSV", task['source_format'] 78 | assert_equal 0, task['max_bad_records'] 79 | assert_equal ",", task['field_delimiter'] 80 | assert_equal "UTF-8", task['encoding'] 81 | assert_equal false, task['ignore_unknown_values'] 82 | assert_equal false, task['allow_quoted_newlines'] 83 | assert_equal nil, task['time_partitioning'] 84 | assert_equal nil, task['clustering'] 85 | assert_equal false, task['skip_load'] 86 | end 87 | 88 | def test_mode 89 | config = least_config.merge('mode' => 'foobar') 90 | assert_raise { Bigquery.configure(config, schema, processor_count) } 91 | 92 | config = least_config.merge('mode' => 'append') 93 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 94 | 95 | config = least_config.merge('mode' => 'replace') 96 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 97 | 98 | config = least_config.merge('mode' => 'delete_in_advance') 99 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 100 | 101 | config = least_config.merge('mode' => 'replace_backup') 102 | assert_raise { Bigquery.configure(config, schema, processor_count) } 103 | end 104 | 105 | def test_location 106 | config = least_config.merge('location' => 'us') 107 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 108 | 109 | config = least_config.merge('location' => 'eu') 110 | 
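# Multi-region locations such as 'eu' are accepted the same way as single
# regions like 'asia-northeast1' below.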
assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 111 | 112 | config = least_config.merge('location' => 'asia-northeast1') 113 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 114 | end 115 | 116 | def test_dataset_table_old 117 | task = nil 118 | config = least_config.merge('mode' => 'replace_backup', 'table_old' => 'backup') 119 | assert_nothing_raised { task = Bigquery.configure(config, schema, processor_count) } 120 | assert_equal task['dataset_old'], task['dataset'] 121 | assert_equal task['table_old'], 'backup' 122 | 123 | config = least_config.merge('mode' => 'replace_backup', 'dataset_old' => 'backup') 124 | assert_nothing_raised { task = Bigquery.configure(config, schema, processor_count) } 125 | assert_equal task['dataset_old'], 'backup' 126 | assert_equal task['table_old'], task['table'] 127 | end 128 | 129 | def test_auth_method 130 | config = least_config.merge('auth_method' => 'foobar') 131 | assert_raise { Bigquery.configure(config, schema, processor_count) } 132 | 133 | config = least_config.merge('auth_method' => 'json_key').tap {|h| h.delete('json_keyfile') } 134 | assert_raise { Bigquery.configure(config, schema, processor_count) } 135 | config = least_config.merge('auth_method' => 'json_key', 'json_keyfile' => "#{EXAMPLE_ROOT}/json_key.json") 136 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 137 | 138 | config = least_config.merge('auth_method' => 'compute_engine') 139 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 140 | end 141 | 142 | def test_json_keyfile 143 | json_keyfile = "#{EXAMPLE_ROOT}/json_key.json" 144 | config = least_config.merge('auth_method' => 'json_key', 'json_keyfile' => json_keyfile).tap {|h| h.delete('project') } 145 | task = Bigquery.configure(config, schema, processor_count) 146 | assert_not_equal nil, task['project'] # project is obtained from json_keyfile if available 147 | 148 | config = least_config.merge('auth_method' => 'json_key', 'json_keyfile' => { 'content' => File.read(json_keyfile) }).tap {|h| h.delete('project') } 149 | task = Bigquery.configure(config, schema, processor_count) 150 | assert_not_equal nil, task['project'] # project is obtained from json_keyfile if available 151 | 152 | config = least_config.merge('auth_method' => 'json_key', 'json_keyfile' => { 'content' => 'not a json' }) 153 | assert_raise { Bigquery.configure(config, schema, processor_count) } 154 | end 155 | 156 | def test_payload_column 157 | config = least_config.merge('payload_column' => schema.first.name, 'auto_create_table' => false, 'mode' => 'append_direct') 158 | task = Bigquery.configure(config, schema, processor_count) 159 | assert_equal task['payload_column_index'], 0 160 | 161 | config = least_config.merge('payload_column' => 'not_exist', 'auto_create_table' => false, 'mode' => 'append_direct') 162 | assert_raise { Bigquery.configure(config, schema, processor_count) } 163 | end 164 | 165 | def test_payload_column_index 166 | config = least_config.merge('payload_column_index' => 0, 'auto_create_table' => false, 'mode' => 'append_direct') 167 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 168 | 169 | config = least_config.merge('payload_column_index' => -1, 'auto_create_table' => false, 'mode' => 'append_direct') 170 | assert_raise { Bigquery.configure(config, schema, processor_count) } 171 | 172 | config = least_config.merge('payload_column_index' => schema.size, 'auto_create_table' => false, 
'mode' => 'append_direct') 173 | assert_raise { Bigquery.configure(config, schema, processor_count) } 174 | end 175 | 176 | def test_auto_create_table_with_payload_column 177 | config = least_config.merge('auto_create_table' => true, 'payload_column' => 'json') 178 | assert_raise { Bigquery.configure(config, schema, processor_count) } 179 | 180 | config = least_config.merge('auto_create_table' => true, 'payload_column' => 'json', 'schema_file' => "#{EXAMPLE_ROOT}/schema.json") 181 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 182 | 183 | config = least_config.merge('auto_create_table' => true, 'payload_column' => 'json', 'template_table' => 'foo') 184 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 185 | end 186 | 187 | def test_auto_create_table_with_payload_column_index 188 | config = least_config.merge('auto_create_table' => true, 'payload_column_index' => 0) 189 | assert_raise { Bigquery.configure(config, schema, processor_count) } 190 | 191 | config = least_config.merge('auto_create_table' => true, 'payload_column_index' => 0, 'schema_file' => "#{EXAMPLE_ROOT}/schema.json") 192 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 193 | 194 | config = least_config.merge('auto_create_table' => true, 'payload_column_index' => 0, 'template_table' => 'foo') 195 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 196 | end 197 | 198 | def test_schema_file 199 | config = least_config.merge('schema_file' => "#{EXAMPLE_ROOT}/schema.json") 200 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 201 | 202 | config = least_config.merge('schema_file' => "not_found.json") 203 | assert_raise { Bigquery.configure(config, schema, processor_count) } 204 | 205 | File.write("tmp/bad_schema.json", "not_a_json") 206 | config = least_config.merge('schema_file' => "tmp/bad_schema.json") 207 | assert_raise { Bigquery.configure(config, schema, processor_count) } 208 | end 209 | 210 | def test_source_format 211 | config = least_config.merge('source_format' => 'csv') 212 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 213 | 214 | config = least_config.merge('source_format' => 'jsonl') 215 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 216 | 217 | config = least_config.merge('source_format' => 'newline_delimited_json') 218 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 219 | 220 | config = least_config.merge('source_format' => 'foobar') 221 | assert_raise { Bigquery.configure(config, schema, processor_count) } 222 | end 223 | 224 | def test_compression 225 | config = least_config.merge('compression' => 'gzip') 226 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 227 | 228 | config = least_config.merge('compression' => 'none') 229 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 230 | 231 | config = least_config.merge('compression' => 'foobar') 232 | assert_raise { Bigquery.configure(config, schema, processor_count) } 233 | end 234 | 235 | def test_file_ext 236 | config = least_config.merge('source_format' => 'csv', 'compression' => 'gzip') 237 | task = Bigquery.configure(config, schema, processor_count) 238 | assert_equal '.csv.gz', task['file_ext'] 239 | 240 | config = least_config.merge('source_format' => 'NEWLINE_DELIMITED_JSON', 'compression' => 'gzip') 241 | task = Bigquery.configure(config, schema, 
processor_count) 242 | assert_equal '.jsonl.gz', task['file_ext'] 243 | 244 | config = least_config.merge('source_format' => 'csv', 'compression' => 'none') 245 | task = Bigquery.configure(config, schema, processor_count) 246 | assert_equal '.csv', task['file_ext'] 247 | 248 | config = least_config.merge('source_format' => 'NEWLINE_DELIMITED_JSON', 'compression' => 'none') 249 | task = Bigquery.configure(config, schema, processor_count) 250 | assert_equal '.jsonl', task['file_ext'] 251 | 252 | config = least_config.merge('file_ext' => '.foo') 253 | task = Bigquery.configure(config, schema, processor_count) 254 | assert_equal '.foo', task['file_ext'] 255 | end 256 | 257 | def test_time_partitioning 258 | config = least_config.merge('time_partitioning' => {'type' => 'DAY'}) 259 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 260 | 261 | config = least_config.merge('time_partitioning' => {'foo' => 'bar'}) 262 | assert_raise { Bigquery.configure(config, schema, processor_count) } 263 | 264 | config = least_config.merge('table' => 'table') 265 | task = Bigquery.configure(config, schema, processor_count) 266 | assert_equal nil, task['time_partitioning'] 267 | 268 | config = least_config.merge('table' => 'table_name$20160912') 269 | task = Bigquery.configure(config, schema, processor_count) 270 | assert_equal 'DAY', task['time_partitioning']['type'] 271 | end 272 | 273 | def test_range_partitioning 274 | config = least_config.merge('range_partitioning' => {'field' => 'foo', 'range' => { 'start' => 1, 'end' => 3, 'interval' => 1 }}) 275 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 276 | 277 | # field is required 278 | config = least_config.merge('range_partitioning' => {'range' => { 'start' => 1, 'end' => 2, 'interval' => 1 }}) 279 | assert_raise { Bigquery.configure(config, schema, processor_count) } 280 | 281 | 282 | # range is required 283 | config = least_config.merge('range_partitioning' => {'field' => 'foo'}) 284 | assert_raise { Bigquery.configure(config, schema, processor_count) } 285 | 286 | # range.start is required 287 | config = least_config.merge('range_partitioning' => {'field' => 'foo', 'range' => { 'end' => 2, 'interval' => 1 }}) 288 | assert_raise { Bigquery.configure(config, schema, processor_count) } 289 | 290 | # range.end is required 291 | config = least_config.merge('range_partitioning' => {'field' => 'foo', 'range' => { 'start' => 1, 'interval' => 1 }}) 292 | assert_raise { Bigquery.configure(config, schema, processor_count) } 293 | 294 | # range.interval is required 295 | config = least_config.merge('range_partitioning' => {'field' => 'foo', 'range' => { 'start' => 1, 'end' => 2 }}) 296 | assert_raise { Bigquery.configure(config, schema, processor_count) } 297 | 298 | # range.start + range.interval should be less than range.end 299 | config = least_config.merge('range_partitioning' => {'field' => 'foo', 'range' => { 'start' => 1, 'end' => 2, 'interval' => 2 }}) 300 | assert_raise { Bigquery.configure(config, schema, processor_count) } 301 | end 302 | 303 | def test_time_and_range_partitioning_error 304 | config = least_config.merge('time_partitioning' => {'type' => 'DAY'}, 'range_partitioning' => {'field' => 'foo', 'range' => { 'start' => 1, 'end' => 2, 'interval' => 1 }}) 305 | assert_raise { Bigquery.configure(config, schema, processor_count) } 306 | 307 | config = least_config.merge('table' => 'table_name$20160912', 'range_partitioning' => {'field' => 'foo', 'range' => { 'start' => 1, 'end' => 2, 
'interval' => 1 }}) 308 | assert_raise { Bigquery.configure(config, schema, processor_count) } 309 | end 310 | 311 | def test_clustering 312 | config = least_config.merge('clustering' => {'fields' => ['field_a']}) 313 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 314 | 315 | config = least_config.merge('clustering' => {}) 316 | assert_raise { Bigquery.configure(config, schema, processor_count) } 317 | end 318 | 319 | def test_schema_update_options 320 | config = least_config.merge('schema_update_options' => ['ALLOW_FIELD_ADDITION', 'ALLOW_FIELD_RELAXATION']) 321 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 322 | 323 | config = least_config.merge('schema_update_options' => ['FOO']) 324 | assert_raise { Bigquery.configure(config, schema, processor_count) } 325 | end 326 | 327 | def test_destination_project 328 | config = least_config.merge('destination_project' => 'your_destination_project_name') 329 | task = Bigquery.configure(config, schema, processor_count) 330 | 331 | assert_nothing_raised { Bigquery.configure(config, schema, processor_count) } 332 | assert_equal 'your_destination_project_name', task['destination_project'] 333 | assert_equal 'your_project_name', task['project'] 334 | end 335 | 336 | end 337 | end 338 | end 339 | -------------------------------------------------------------------------------- /test/test_example.rb: -------------------------------------------------------------------------------- 1 | require_relative './helper' 2 | 3 | # 1. Prepare example/your-project-000.json 4 | # 2. embulk bundle 5 | # 3. bundle exec ruby test/test_example.rb 6 | 7 | unless File.exist?(JSON_KEYFILE) 8 | puts "#{JSON_KEYFILE} is not found. Skip test/test_example.rb" 9 | else 10 | class TestExample < Test::Unit::TestCase 11 | def embulk_path 12 | if File.exist?("#{ENV['HOME']}/.embulk/bin/embulk") 13 | "#{ENV['HOME']}/.embulk/bin/embulk" 14 | elsif File.exist?("#{ENV['PWD']}/embulk.jar") 15 | "#{ENV['PWD']}/embulk.jar" 16 | elsif File.exist?("/usr/local/bin/embulk") 17 | "/usr/local/bin/embulk" 18 | else 19 | "embulk" 20 | end 21 | end 22 | 23 | def embulk_run(config_path) 24 | ::Bundler.with_clean_env do 25 | cmd = "#{embulk_path} run -X page_size=1 -b . 
-l trace #{config_path}" 26 | puts "=" * 64 27 | puts cmd 28 | system(cmd) 29 | end 30 | end 31 | 32 | files = Dir.glob("#{APP_ROOT}/example/config_*.yml").reject {|file| File.symlink?(file) }.sort 33 | files.each do |config_path| 34 | if %w[ 35 | config_expose_errors.yml 36 | ].include?(File.basename(config_path)) 37 | define_method(:"test_#{File.basename(config_path, ".yml")}") do 38 | assert_false embulk_run(config_path) 39 | end 40 | else 41 | define_method(:"test_#{File.basename(config_path, ".yml")}") do 42 | assert_true embulk_run(config_path) 43 | end 44 | end 45 | end 46 | end 47 | end 48 | -------------------------------------------------------------------------------- /test/test_file_writer.rb: -------------------------------------------------------------------------------- 1 | require_relative './helper' 2 | require 'embulk/output/bigquery/file_writer' 3 | require 'fileutils' 4 | require 'zlib' 5 | 6 | module Embulk 7 | class Output::Bigquery 8 | class TestFileWriter < Test::Unit::TestCase 9 | class << self 10 | def startup 11 | FileUtils.mkdir_p('tmp') 12 | end 13 | 14 | def shutdown 15 | FileUtils.rm_rf('tmp') 16 | end 17 | end 18 | 19 | def default_task 20 | { 21 | 'compression' => 'GZIP', 22 | 'payload_column' => nil, 23 | 'source_format' => 'CSV', 24 | 'path_prefix' => 'tmp/path_prefix', 25 | 'sequence_format' => '.%d.%03d', 26 | 'file_ext' => nil, 27 | } 28 | end 29 | 30 | def schema 31 | Schema.new([ 32 | Column.new({index: 0, name: 'boolean', type: :boolean}), 33 | Column.new({index: 1, name: 'long', type: :long}), 34 | Column.new({index: 2, name: 'double', type: :double}), 35 | Column.new({index: 3, name: 'string', type: :string}), 36 | Column.new({index: 4, name: 'timestamp', type: :timestamp}), 37 | Column.new({index: 5, name: 'json', type: :json}), 38 | ]) 39 | end 40 | 41 | def converters 42 | @converters ||= ValueConverterFactory.create_converters(default_task, schema) 43 | end 44 | 45 | def record 46 | [true, 1, 1.1, 'foo', Time.parse("2016-02-26 00:00:00 +00:00").utc, {"foo"=>"foo"}] 47 | end 48 | 49 | def page 50 | [record] 51 | end 52 | 53 | sub_test_case "path" do 54 | def test_path 55 | task = default_task.merge('path_prefix' => 'tmp/foo', 'sequence_format' => '', 'file_ext' => '.1') 56 | file_writer = FileWriter.new(task, schema, 0, converters) 57 | 58 | begin 59 | file_writer.add(page) 60 | ensure 61 | io.close rescue nil 62 | end 63 | path = file_writer.io.path 64 | assert_equal 'tmp/foo.1', path 65 | end 66 | end 67 | 68 | sub_test_case "formatter" do 69 | def test_payload_column_index 70 | task = default_task.merge('payload_column_index' => 0) 71 | file_writer = FileWriter.new(task, schema, 0, converters) 72 | formatter_proc = file_writer.instance_variable_get(:@formatter_proc) 73 | assert_equal :to_payload, formatter_proc.name 74 | 75 | assert_equal %Q[true\n], formatter_proc.call(record) 76 | end 77 | 78 | def test_csv 79 | task = default_task.merge('source_format' => 'CSV') 80 | file_writer = FileWriter.new(task, schema, 0, converters) 81 | formatter_proc = file_writer.instance_variable_get(:@formatter_proc) 82 | assert_equal :to_csv, formatter_proc.name 83 | 84 | expected = %Q[true,1,1.1,foo,2016-02-26 00:00:00.000000 +00:00,"{""foo"":""foo""}"\n] 85 | assert_equal expected, formatter_proc.call(record) 86 | end 87 | 88 | def test_jsonl 89 | task = default_task.merge('source_format' => 'NEWLINE_DELIMITED_JSON') 90 | file_writer = FileWriter.new(task, schema, 0, converters) 91 | formatter_proc = file_writer.instance_variable_get(:@formatter_proc) 92 | 
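# source_format 'NEWLINE_DELIMITED_JSON' selects FileWriter#to_jsonl, which
# serializes each record as one JSON object per line.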
assert_equal :to_jsonl, formatter_proc.name 93 | 94 | expected = %Q[{"boolean":true,"long":1,"double":1.1,"string":"foo","timestamp":"2016-02-26 00:00:00.000000 +00:00","json":"{\\"foo\\":\\"foo\\"}"}\n] 95 | assert_equal expected, formatter_proc.call(record) 96 | end 97 | end 98 | 99 | sub_test_case "compression" do 100 | def test_gzip 101 | task = default_task.merge('compression' => 'GZIP') 102 | file_writer = FileWriter.new(task, schema, 0, converters) 103 | 104 | begin 105 | file_writer.add(page) 106 | io = file_writer.io 107 | assert_equal Zlib::GzipWriter, io.class 108 | ensure 109 | io.close rescue nil 110 | end 111 | path = file_writer.io.path 112 | assert_true File.exist?(path) 113 | assert_nothing_raised { Zlib::GzipReader.open(path) {|gz| } } 114 | end 115 | 116 | def test_uncompressed 117 | task = default_task.merge('compression' => 'NONE') 118 | file_writer = FileWriter.new(task, schema, 0, converters) 119 | 120 | begin 121 | file_writer.add(page) 122 | io = file_writer.io 123 | assert_equal File, io.class 124 | ensure 125 | io.close rescue nil 126 | end 127 | path = file_writer.io.path 128 | assert_true File.exist?(path) 129 | assert_raise { Zlib::GzipReader.open(path) {|gz| } } 130 | end 131 | end 132 | end 133 | end 134 | end 135 | -------------------------------------------------------------------------------- /test/test_helper.rb: -------------------------------------------------------------------------------- 1 | require_relative './helper' 2 | require 'embulk/output/bigquery/helper' 3 | 4 | module Embulk 5 | class Output::Bigquery 6 | class TestHelper < Test::Unit::TestCase 7 | class << self 8 | def startup 9 | FileUtils.mkdir_p('tmp') 10 | end 11 | 12 | def shutdown 13 | FileUtils.rm_rf('tmp') 14 | end 15 | end 16 | 17 | def has_partition_decorator? 
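# NOTE: this method (and chomp_partition_decorator below) lacks the `test_`
# prefix, so Test::Unit does not pick it up automatically.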
18 | assert_true Helper.has_partition_decorator?('table$20160929') 19 | assert_false Helper.has_partition_decorator?('table') 20 | end 21 | 22 | def chomp_partition_decorator 23 | assert_equal 'table', Helper.chomp_partition_decorator?('table$20160929') 24 | assert_equal 'table', Helper.chomp_partition_decorator?('table') 25 | end 26 | 27 | def bq_type_from_embulk_type 28 | assert_equal 'BOOLEAN', Helper.bq_type_from_embulk_type(:boolean) 29 | assert_equal 'STRING', Helper.bq_type_from_embulk_type(:string) 30 | assert_equal 'FLOAT', Helper.bq_type_from_embulk_type(:double) 31 | assert_equal 'STRING', Helper.bq_type_from_embulk_type(:string) 32 | assert_equal 'TIMESTAMP', Helper.bq_type_from_embulk_type(:timestamp) 33 | assert_equal 'STRING', Helper.bq_type_from_embulk_type(:json) 34 | end 35 | 36 | sub_test_case "fields_from_embulk_schema" do 37 | def test_fields_from_embulk_schema_without_column_options 38 | schema = Schema.new([ 39 | Column.new({index: 0, name: 'boolean', type: :boolean}), 40 | Column.new({index: 1, name: 'long', type: :long}), 41 | Column.new({index: 2, name: 'double', type: :double}), 42 | Column.new({index: 3, name: 'string', type: :string}), 43 | Column.new({index: 4, name: 'timestamp', type: :timestamp}), 44 | Column.new({index: 5, name: 'json', type: :json}), 45 | ]) 46 | expected = [ 47 | {name: 'boolean', type: 'BOOLEAN'}, 48 | {name: 'long', type: 'INTEGER'}, 49 | {name: 'double', type: 'FLOAT'}, 50 | {name: 'string', type: 'STRING'}, 51 | {name: 'timestamp', type: 'TIMESTAMP'}, 52 | {name: 'json', type: 'STRING'}, 53 | ] 54 | fields = Helper.fields_from_embulk_schema({}, schema) 55 | assert_equal expected, fields 56 | end 57 | 58 | def test_fields_from_embulk_schema_with_column_options 59 | schema = Schema.new([ 60 | Column.new({index: 0, name: 'boolean', type: :boolean}), 61 | Column.new({index: 1, name: 'long', type: :long}), 62 | Column.new({index: 2, name: 'double', type: :double}), 63 | Column.new({index: 3, name: 'string', type: :string}), 64 | Column.new({index: 4, name: 'timestamp', type: :timestamp}), 65 | Column.new({index: 5, name: 'date', type: :timestamp}), 66 | Column.new({index: 6, name: 'datetime', type: :timestamp}), 67 | Column.new({index: 7, name: 'json', type: :json}), 68 | ]) 69 | task = { 70 | 'column_options' => [ 71 | {'name' => 'boolean', 'type' => 'STRING', 'mode' => 'REQUIRED', 'description' => 'hoge'}, 72 | {'name' => 'long', 'type' => 'STRING'}, 73 | {'name' => 'double', 'type' => 'STRING'}, 74 | {'name' => 'string', 'type' => 'INTEGER'}, 75 | {'name' => 'timestamp', 'type' => 'INTEGER'}, 76 | {'name' => 'date', 'type' => 'DATE'}, 77 | {'name' => 'datetime', 'type' => 'DATETIME'}, 78 | {'name' => 'json', 'type' => 'RECORD', 'fields' => [ 79 | { 'name' => 'key1', 'type' => 'STRING' }, 80 | ]}, 81 | ], 82 | } 83 | expected = [ 84 | {name: 'boolean', type: 'STRING', mode: 'REQUIRED', description: 'hoge'}, 85 | {name: 'long', type: 'STRING'}, 86 | {name: 'double', type: 'STRING'}, 87 | {name: 'string', type: 'INTEGER'}, 88 | {name: 'timestamp', type: 'INTEGER'}, 89 | {name: 'date', type: 'DATE'}, 90 | {name: 'datetime', type: 'DATETIME'}, 91 | {name: 'json', type: 'RECORD', fields: [ 92 | {name: 'key1', type: 'STRING'}, 93 | ]}, 94 | ] 95 | fields = Helper.fields_from_embulk_schema(task, schema) 96 | assert_equal expected, fields 97 | end 98 | end 99 | 100 | def test_create_load_job_id 101 | task = { 102 | 'dataset' => 'your_dataset_name', 103 | 'location' => 'asia-northeast1', 104 | 'table' => 'your_table_name', 105 | 'source_format' 
=> 'CSV', 106 | 'max_bad_records' => nil, 107 | 'field_delimiter' => ',', 108 | 'encoding' => 'UTF-8', 109 | 'ignore_unknown_values' => nil, 110 | 'allow_quoted_newlines' => nil, 111 | } 112 | fields = { 113 | name: 'a', type: 'STRING', 114 | } 115 | File.write("tmp/your_file_name", "foobarbaz") 116 | job_id = Helper.create_load_job_id(task, 'tmp/your_file_name', fields) 117 | assert job_id.is_a?(String) 118 | assert_equal 'embulk_load_job_2abaf528b69987db0224e52bbd1f0eec', job_id 119 | end 120 | end 121 | end 122 | end 123 | -------------------------------------------------------------------------------- /test/test_transaction.rb: -------------------------------------------------------------------------------- 1 | require_relative './helper' 2 | require 'embulk/output/bigquery' 3 | 4 | Bigquery = Embulk::Output::Bigquery unless defined?(Bigquery) 5 | 6 | module Embulk 7 | class Output::Bigquery 8 | class TestTransaction < Test::Unit::TestCase 9 | def least_config 10 | DataSource.new({ 11 | 'project' => 'your_project_name', 12 | 'dataset' => 'your_dataset_name', 13 | 'table' => 'your_table_name', 14 | 'temp_table' => 'temp_table', # randomly created is not good for our test 15 | 'path_prefix' => 'tmp/', # randomly created is not good for our test 16 | }) 17 | end 18 | 19 | def schema 20 | Schema.new([ 21 | Column.new({index: 0, name: 'boolean', type: :boolean}), 22 | Column.new({index: 1, name: 'long', type: :long}), 23 | Column.new({index: 2, name: 'double', type: :double}), 24 | Column.new({index: 3, name: 'string', type: :string}), 25 | Column.new({index: 4, name: 'timestamp', type: :timestamp}), 26 | Column.new({index: 5, name: 'json', type: :json}), 27 | ]) 28 | end 29 | 30 | def processor_count 31 | 1 32 | end 33 | 34 | def control 35 | Proc.new {|task| task_reports = [] } 36 | end 37 | 38 | def setup 39 | stub(Bigquery).transaction_report { {'num_input_rows' => 1, 'num_output_rows' => 1, 'num_rejected_rows' => 0} } 40 | end 41 | 42 | sub_test_case "append_direct" do 43 | def test_append_direc_without_auto_create 44 | config = least_config.merge('mode' => 'append_direct', 'auto_create_dataset' => false, 'auto_create_table' => false) 45 | any_instance_of(BigqueryClient) do |obj| 46 | mock(obj).get_dataset(config['dataset']) 47 | mock(obj).get_table(config['table']) 48 | end 49 | Bigquery.transaction(config, schema, processor_count, &control) 50 | end 51 | 52 | def test_append_direct_with_auto_create 53 | config = least_config.merge('mode' => 'append_direct', 'auto_create_dataset' => true, 'auto_create_table' => true) 54 | task = Bigquery.configure(config, schema, processor_count) 55 | any_instance_of(BigqueryClient) do |obj| 56 | mock(obj).create_dataset(config['dataset']) 57 | mock(obj).create_table_if_not_exists(config['table']) 58 | end 59 | Bigquery.transaction(config, schema, processor_count, &control) 60 | end 61 | 62 | def test_append_direct_with_partition_without_auto_create 63 | config = least_config.merge('mode' => 'append_direct', 'table' => 'table$20160929', 'auto_create_dataset' => false, 'auto_create_table' => false) 64 | any_instance_of(BigqueryClient) do |obj| 65 | mock(obj).get_dataset(config['dataset']) 66 | mock(obj).get_table(config['table']) 67 | end 68 | Bigquery.transaction(config, schema, processor_count, &control) 69 | end 70 | 71 | def test_append_direct_with_partition_with_auto_create 72 | config = least_config.merge('mode' => 'append_direct', 'table' => 'table$20160929', 'auto_create_dataset' => true, 'auto_create_table' => true) 73 | task = 
Bigquery.configure(config, schema, processor_count) 74 | any_instance_of(BigqueryClient) do |obj| 75 | mock(obj).create_dataset(config['dataset']) 76 | mock(obj).create_table_if_not_exists(config['table']) 77 | end 78 | Bigquery.transaction(config, schema, processor_count, &control) 79 | end 80 | end 81 | 82 | sub_test_case "delete_in_advance" do 83 | def test_delete_in_advance 84 | config = least_config.merge('mode' => 'delete_in_advance') 85 | task = Bigquery.configure(config, schema, processor_count) 86 | any_instance_of(BigqueryClient) do |obj| 87 | mock(obj).get_dataset(config['dataset']) 88 | mock(obj).delete_table_or_partition(config['table']) 89 | mock(obj).create_table_if_not_exists(config['table']) 90 | end 91 | Bigquery.transaction(config, schema, processor_count, &control) 92 | end 93 | 94 | def test_delete_in_advance_with_partitioning 95 | config = least_config.merge('mode' => 'delete_in_advance', 'table' => 'table$20160929', 'auto_create_table' => true) 96 | task = Bigquery.configure(config, schema, processor_count) 97 | any_instance_of(BigqueryClient) do |obj| 98 | mock(obj).get_dataset(config['dataset']) 99 | mock(obj).delete_table_or_partition(config['table']) 100 | mock(obj).create_table_if_not_exists(config['table']) 101 | end 102 | Bigquery.transaction(config, schema, processor_count, &control) 103 | end 104 | end 105 | 106 | sub_test_case "replace" do 107 | def test_replace 108 | config = least_config.merge('mode' => 'replace') 109 | task = Bigquery.configure(config, schema, processor_count) 110 | any_instance_of(BigqueryClient) do |obj| 111 | mock(obj).get_dataset(config['dataset']) 112 | mock(obj).create_table_if_not_exists(config['temp_table'], options: {"expiration_time"=>nil}) 113 | mock(obj).create_table_if_not_exists(config['table']) 114 | mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_TRUNCATE') 115 | mock(obj).delete_table(config['temp_table']) 116 | end 117 | Bigquery.transaction(config, schema, processor_count, &control) 118 | end 119 | 120 | def test_replace_with_partitioning 121 | config = least_config.merge('mode' => 'replace', 'table' => 'table$20160929') 122 | task = Bigquery.configure(config, schema, processor_count) 123 | any_instance_of(BigqueryClient) do |obj| 124 | mock(obj).get_dataset(config['dataset']) 125 | mock(obj).create_table_if_not_exists(config['temp_table'], options: {"expiration_time"=>nil}) 126 | mock(obj).create_table_if_not_exists(config['table']) 127 | mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_TRUNCATE') 128 | mock(obj).delete_table(config['temp_table']) 129 | end 130 | Bigquery.transaction(config, schema, processor_count, &control) 131 | end 132 | end 133 | 134 | sub_test_case "replace_backup" do 135 | def test_replace_backup 136 | config = least_config.merge('mode' => 'replace_backup', 'dataset_old' => 'dataset_old', 'table_old' => 'table_old', 'temp_table' => 'temp_table') 137 | task = Bigquery.configure(config, schema, processor_count) 138 | any_instance_of(BigqueryClient) do |obj| 139 | mock(obj).get_dataset(config['dataset']) 140 | mock(obj).get_dataset(config['dataset_old']) 141 | mock(obj).create_table_if_not_exists(config['temp_table'], options: {"expiration_time"=>nil}) 142 | mock(obj).create_table_if_not_exists(config['table']) 143 | mock(obj).create_table_if_not_exists(config['table_old'], dataset: config['dataset_old']) 144 | 145 | mock(obj).get_table_or_partition(config['table']) 146 | mock(obj).copy(config['table'], config['table_old'], 
config['dataset_old']) 147 | 148 | mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_TRUNCATE') 149 | mock(obj).delete_table(config['temp_table']) 150 | end 151 | Bigquery.transaction(config, schema, processor_count, &control) 152 | end 153 | 154 | def test_replace_backup_auto_create_dataset 155 | config = least_config.merge('mode' => 'replace_backup', 'dataset_old' => 'dataset_old', 'table_old' => 'table_old', 'temp_table' => 'temp_table', 'auto_create_dataset' => true) 156 | task = Bigquery.configure(config, schema, processor_count) 157 | any_instance_of(BigqueryClient) do |obj| 158 | mock(obj).create_dataset(config['dataset']) 159 | mock(obj).create_dataset(config['dataset_old'], reference: config['dataset']) 160 | mock(obj).create_table_if_not_exists(config['table']) 161 | mock(obj).create_table_if_not_exists(config['temp_table'], options: {"expiration_time"=>nil}) 162 | mock(obj).create_table_if_not_exists(config['table_old'], dataset: config['dataset_old']) 163 | 164 | mock(obj).get_table_or_partition(config['table']) 165 | mock(obj).copy(config['table'], config['table_old'], config['dataset_old']) 166 | 167 | mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_TRUNCATE') 168 | mock(obj).delete_table(config['temp_table']) 169 | end 170 | Bigquery.transaction(config, schema, processor_count, &control) 171 | end 172 | 173 | def test_replace_backup_with_partitioning 174 | config = least_config.merge('mode' => 'replace_backup', 'table' => 'table$20160929', 'dataset_old' => 'dataset_old', 'table_old' => 'table_old$20160929', 'temp_table' => 'temp_table', 'auto_create_table' => true) 175 | task = Bigquery.configure(config, schema, processor_count) 176 | any_instance_of(BigqueryClient) do |obj| 177 | mock(obj).get_dataset(config['dataset']) 178 | mock(obj).get_dataset(config['dataset_old']) 179 | mock(obj).create_table_if_not_exists(config['temp_table'], options: {"expiration_time"=>nil}) 180 | mock(obj).create_table_if_not_exists(config['table']) 181 | mock(obj).create_table_if_not_exists(config['table_old'], dataset: config['dataset_old']) 182 | 183 | mock(obj).get_table_or_partition(config['table']) 184 | mock(obj).copy(config['table'], config['table_old'], config['dataset_old']) 185 | 186 | mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_TRUNCATE') 187 | mock(obj).delete_table(config['temp_table']) 188 | end 189 | Bigquery.transaction(config, schema, processor_count, &control) 190 | end 191 | end 192 | 193 | sub_test_case "append" do 194 | def test_append 195 | config = least_config.merge('mode' => 'append') 196 | task = Bigquery.configure(config, schema, processor_count) 197 | any_instance_of(BigqueryClient) do |obj| 198 | mock(obj).get_dataset(config['dataset']) 199 | mock(obj).create_table_if_not_exists(config['temp_table'], options: {"expiration_time"=>nil}) 200 | mock(obj).create_table_if_not_exists(config['table']) 201 | mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_APPEND') 202 | mock(obj).delete_table(config['temp_table']) 203 | end 204 | Bigquery.transaction(config, schema, processor_count, &control) 205 | end 206 | 207 | def test_append_with_partitioning 208 | config = least_config.merge('mode' => 'append', 'table' => 'table$20160929', 'auto_create_table' => true) 209 | task = Bigquery.configure(config, schema, processor_count) 210 | any_instance_of(BigqueryClient) do |obj| 211 | mock(obj).get_dataset(config['dataset']) 212 | 
mock(obj).create_table_if_not_exists(config['temp_table'], options: {"expiration_time"=>nil}) 213 | mock(obj).create_table_if_not_exists(config['table']) 214 | mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_APPEND') 215 | mock(obj).delete_table(config['temp_table']) 216 | end 217 | Bigquery.transaction(config, schema, processor_count, &control) 218 | end 219 | end 220 | end 221 | end 222 | end 223 | -------------------------------------------------------------------------------- /test/test_value_converter_factory.rb: -------------------------------------------------------------------------------- 1 | require_relative './helper' 2 | require 'embulk/output/bigquery/value_converter_factory' 3 | 4 | module Embulk 5 | class Output::Bigquery 6 | class TestValueConverterFactory < Test::Unit::TestCase 7 | 8 | class TestCreateConverters < Test::Unit::TestCase 9 | def test_create_default_converter 10 | schema = Schema.new([ 11 | Column.new({index: 0, name: 'boolean', type: :boolean}), 12 | Column.new({index: 1, name: 'long', type: :long}), 13 | Column.new({index: 2, name: 'double', type: :double}), 14 | Column.new({index: 3, name: 'string', type: :string}), 15 | Column.new({index: 4, name: 'timestamp', type: :timestamp}), 16 | Column.new({index: 5, name: 'json', type: :json}), 17 | ]) 18 | converters = ValueConverterFactory.create_converters({}, schema) 19 | assert_equal schema.size, converters.size 20 | # Check correct converters are created 21 | # Proc can not have names, so we have to execute to check... 22 | assert_equal true, converters[0].call(true) 23 | assert_equal 1, converters[1].call(1) 24 | assert_equal 1.1, converters[2].call(1.1) 25 | assert_equal 'foo', converters[3].call('foo') 26 | timestamp = Time.parse("2016-02-26 00:00:00.500000 +00:00") 27 | assert_equal "2016-02-26 00:00:00.500000 +00:00", converters[4].call(timestamp) 28 | assert_equal %Q[{"foo":"foo"}], converters[5].call({'foo'=>'foo'}) 29 | end 30 | 31 | def test_create_custom_converter 32 | schema = Schema.new([ 33 | Column.new({index: 0, name: 'boolean', type: :boolean}), 34 | Column.new({index: 1, name: 'long', type: :long}), 35 | Column.new({index: 2, name: 'double', type: :double}), 36 | Column.new({index: 3, name: 'string', type: :string}), 37 | Column.new({index: 4, name: 'timestamp', type: :timestamp}), 38 | Column.new({index: 5, name: 'json', type: :json}), 39 | ]) 40 | task = { 41 | 'column_options' => [ 42 | {'name' => 'boolean', 'type' => 'STRING'}, 43 | {'name' => 'long', 'type' => 'STRING'}, 44 | {'name' => 'double', 'type' => 'STRING'}, 45 | {'name' => 'string', 'type' => 'INTEGER'}, 46 | {'name' => 'timestamp', 'type' => 'INTEGER'}, 47 | {'name' => 'json', 'type' => 'RECORD'}, 48 | ], 49 | } 50 | converters = ValueConverterFactory.create_converters(task, schema) 51 | assert_equal schema.size, converters.size 52 | # Check correct converters are created 53 | # Proc can not have names, so we have to execute to check... 
54 | assert_equal 'true', converters[0].call(true) 55 | assert_equal '1', converters[1].call(1) 56 | assert_equal '1.1', converters[2].call(1.1) 57 | assert_equal 1, converters[3].call('1') 58 | timestamp = Time.parse("2016-02-26 00:00:00.100000 +00:00") 59 | assert_equal 1456444800, converters[4].call(timestamp) 60 | assert_equal({'foo'=>'foo'}, converters[5].call({'foo'=>'foo'})) 61 | end 62 | end 63 | 64 | class TestBooleanConverter < Test::Unit::TestCase 65 | SCHEMA_TYPE = :boolean 66 | 67 | def test_boolean 68 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'BOOLEAN').create_converter 69 | assert_equal nil, converter.call(nil) 70 | assert_equal true, converter.call(true) 71 | assert_equal false, converter.call(false) 72 | end 73 | 74 | def test_integer 75 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'INTEGER').create_converter } 76 | end 77 | 78 | def test_float 79 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'FLOAT').create_converter } 80 | end 81 | 82 | def test_string 83 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'STRING').create_converter 84 | assert_equal nil, converter.call(nil) 85 | assert_equal "true", converter.call(true) 86 | assert_equal "false", converter.call(false) 87 | end 88 | 89 | def test_timestamp 90 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'TIMESTAMP').create_converter } 91 | end 92 | 93 | def test_date 94 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'DATE').create_converter } 95 | end 96 | 97 | def test_datetime 98 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'DATETIME').create_converter } 99 | end 100 | 101 | def test_record 102 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'RECORD').create_converter } 103 | end 104 | end 105 | 106 | class TestLongConverter < Test::Unit::TestCase 107 | SCHEMA_TYPE = :long 108 | 109 | def test_boolean 110 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'BOOLEAN').create_converter 111 | assert_equal nil, converter.call(nil) 112 | assert_equal true, converter.call(1) 113 | assert_equal false, converter.call(0) 114 | assert_raise { converter.call(2) } 115 | end 116 | 117 | def test_integer 118 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'INTEGER').create_converter 119 | assert_equal nil, converter.call(nil) 120 | assert_equal 1, converter.call(1) 121 | end 122 | 123 | def test_float 124 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'FLOAT').create_converter 125 | assert_equal nil, converter.call(nil) 126 | assert_equal 1.0, converter.call(1) 127 | end 128 | 129 | def test_string 130 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'STRING').create_converter 131 | assert_equal nil, converter.call(nil) 132 | assert_equal "1", converter.call(1) 133 | end 134 | 135 | def test_timestamp 136 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'TIMESTAMP').create_converter 137 | assert_equal nil, converter.call(nil) 138 | assert_equal 1408452095, converter.call(1408452095) 139 | end 140 | 141 | def test_date 142 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'DATE').create_converter } 143 | end 144 | 145 | def test_datetime 146 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'DATETIME').create_converter } 147 | end 148 | 149 | def test_record 150 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'RECORD').create_converter } 151 | end 152 | end 153 | 154 | class TestDoubleConverter < Test::Unit::TestCase 155 | SCHEMA_TYPE = :double 156 | 157 | def test_boolean 158 | assert_raise { 
ValueConverterFactory.new(SCHEMA_TYPE, 'BOOLEAN').create_converter } 159 | end 160 | 161 | def test_integer 162 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'INTEGER').create_converter 163 | assert_equal nil, converter.call(nil) 164 | assert_equal 1, converter.call(1.1) 165 | end 166 | 167 | def test_float 168 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'FLOAT').create_converter 169 | assert_equal nil, converter.call(nil) 170 | assert_equal 1.1, converter.call(1.1) 171 | end 172 | 173 | def test_string 174 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'STRING').create_converter 175 | assert_equal nil, converter.call(nil) 176 | assert_equal "1.1", converter.call(1.1) 177 | end 178 | 179 | def test_timestamp 180 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'TIMESTAMP').create_converter 181 | assert_equal nil, converter.call(nil) 182 | assert_equal 1408452095.188766, converter.call(1408452095.188766) 183 | end 184 | 185 | def test_date 186 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'DATE').create_converter } 187 | end 188 | 189 | def test_datetime 190 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'DATETIME').create_converter } 191 | end 192 | 193 | def test_record 194 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'RECORD').create_converter } 195 | end 196 | end 197 | 198 | class TestStringConverter < Test::Unit::TestCase 199 | SCHEMA_TYPE = :string 200 | 201 | def test_boolean 202 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'BOOLEAN').create_converter 203 | assert_equal nil, converter.call(nil) 204 | assert_equal true, converter.call('true') 205 | assert_equal false, converter.call('false') 206 | assert_raise { converter.call('foo') } 207 | end 208 | 209 | def test_integer 210 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'INTEGER').create_converter 211 | assert_equal nil, converter.call(nil) 212 | assert_equal 1, converter.call('1') 213 | assert_raise { converter.call('1.1') } 214 | end 215 | 216 | def test_float 217 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'FLOAT').create_converter 218 | assert_equal nil, converter.call(nil) 219 | assert_equal 1.1, converter.call('1.1') 220 | assert_raise { converter.call('foo') } 221 | end 222 | 223 | def test_string 224 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'STRING').create_converter 225 | assert_equal nil, converter.call(nil) 226 | assert_equal "foo", converter.call("foo") 227 | end 228 | 229 | def test_timestamp 230 | converter = ValueConverterFactory.new( 231 | SCHEMA_TYPE, 'TIMESTAMP', 232 | timestamp_format: '%Y-%m-%d', timezone: 'Asia/Tokyo' 233 | ).create_converter 234 | assert_equal nil, converter.call(nil) 235 | assert_equal "2016-02-26 00:00:00.000000 +09:00", converter.call("2016-02-26") 236 | 237 | # Users must care of BQ timestamp format by themselves with no timestamp_format 238 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'TIMESTAMP').create_converter 239 | assert_equal nil, converter.call(nil) 240 | assert_equal "2016-02-26 00:00:00", converter.call("2016-02-26 00:00:00") 241 | end 242 | 243 | def test_date 244 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'DATE').create_converter 245 | assert_equal nil, converter.call(nil) 246 | assert_equal "2016-02-26", converter.call("2016-02-26") 247 | assert_equal "2016-02-26", converter.call("2016-02-26 00:00:00") 248 | assert_raise { converter.call('foo') } 249 | end 250 | 251 | def test_datetime 252 | converter = ValueConverterFactory.new( 253 | SCHEMA_TYPE, 'DATETIME', 254 | 
timestamp_format: '%Y/%m/%d' 255 | ).create_converter 256 | assert_equal nil, converter.call(nil) 257 | assert_equal "2016-02-26 00:00:00.000000", converter.call("2016/02/26") 258 | 259 | # Users must care of BQ datetime format by themselves with no timestamp_format 260 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'DATETIME').create_converter 261 | assert_equal nil, converter.call(nil) 262 | assert_equal "2016-02-26 00:00:00", converter.call("2016-02-26 00:00:00") 263 | end 264 | 265 | def test_time 266 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'TIME').create_converter 267 | assert_equal nil, converter.call(nil) 268 | assert_equal "00:03:22.000000", converter.call("00:03:22") 269 | assert_equal "15:22:00.000000", converter.call("3:22 PM") 270 | assert_equal "03:22:00.000000", converter.call("3:22 AM") 271 | assert_equal "00:00:00.000000", converter.call("2016-02-26 00:00:00") 272 | 273 | # TimeWithZone doesn't affect any change to the time value 274 | converter = ValueConverterFactory.new( 275 | SCHEMA_TYPE, 'TIME', timezone: 'Asia/Tokyo' 276 | ).create_converter 277 | assert_equal "15:00:01.000000", converter.call("15:00:01") 278 | 279 | assert_raise { converter.call('foo') } 280 | end 281 | 282 | def test_record 283 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'RECORD').create_converter 284 | assert_equal({'foo'=>'foo'}, converter.call(%Q[{"foo":"foo"}])) 285 | assert_raise { converter.call('foo') } 286 | end 287 | end 288 | 289 | class TestTimestampConverter < Test::Unit::TestCase 290 | SCHEMA_TYPE = :timestamp 291 | 292 | def test_boolean 293 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'BOOLEAN').create_converter } 294 | end 295 | 296 | def test_integer 297 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'INTEGER').create_converter 298 | assert_equal nil, converter.call(nil) 299 | expected = 1456444800 300 | assert_equal expected, converter.call(Time.at(expected)) 301 | end 302 | 303 | def test_float 304 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'FLOAT').create_converter 305 | assert_equal nil, converter.call(nil) 306 | expected = 1456444800.500000 307 | assert_equal expected, converter.call(Time.at(expected)) 308 | end 309 | 310 | def test_string 311 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'STRING').create_converter 312 | assert_equal nil, converter.call(nil) 313 | timestamp = Time.parse("2016-02-26 00:00:00.500000 +00:00") 314 | expected = "2016-02-26 00:00:00.500000" 315 | assert_equal expected, converter.call(timestamp) 316 | 317 | converter = ValueConverterFactory.new( 318 | SCHEMA_TYPE, 'STRING', 319 | timestamp_format: '%Y-%m-%d', timezone: 'Asia/Tokyo' 320 | ).create_converter 321 | timestamp = Time.parse("2016-02-25 15:00:00.500000 +00:00") 322 | expected = "2016-02-26" 323 | assert_equal expected, converter.call(timestamp) 324 | end 325 | 326 | def test_timestamp 327 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'TIMESTAMP').create_converter 328 | assert_equal nil, converter.call(nil) 329 | subject = 1456444800.500000 330 | expected = "2016-02-26 00:00:00.500000 +00:00" 331 | assert_equal expected, converter.call(Time.at(subject).utc) 332 | end 333 | 334 | def test_date 335 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'DATE').create_converter 336 | assert_equal nil, converter.call(nil) 337 | timestamp = Time.parse("2016-02-26 00:00:00.500000 +00:00") 338 | expected = "2016-02-26" 339 | assert_equal expected, converter.call(timestamp) 340 | 341 | converter = ValueConverterFactory.new( 342 | 
SCHEMA_TYPE, 'DATE', timezone: 'Asia/Tokyo' 343 | ).create_converter 344 | assert_equal nil, converter.call(nil) 345 | timestamp = Time.parse("2016-02-25 15:00:00.500000 +00:00") 346 | expected = "2016-02-26" 347 | assert_equal expected, converter.call(timestamp) 348 | 349 | assert_raise { converter.call('foo') } 350 | end 351 | 352 | def test_datetime 353 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'DATETIME').create_converter 354 | assert_equal nil, converter.call(nil) 355 | timestamp = Time.parse("2016-02-26 00:00:00.500000 +00:00") 356 | expected = "2016-02-26 00:00:00.500000" 357 | assert_equal expected, converter.call(timestamp) 358 | 359 | converter = ValueConverterFactory.new( 360 | SCHEMA_TYPE, 'DATETIME', timezone: 'Asia/Tokyo' 361 | ).create_converter 362 | assert_equal nil, converter.call(nil) 363 | timestamp = Time.parse("2016-02-25 15:00:00.500000 +00:00") 364 | expected = "2016-02-26 00:00:00.500000" 365 | assert_equal expected, converter.call(timestamp) 366 | 367 | assert_raise { converter.call('foo') } 368 | end 369 | 370 | def test_time 371 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'TIME').create_converter 372 | assert_equal nil, converter.call(nil) 373 | timestamp = Time.parse("2016-02-26 00:00:00.500000 +00:00") 374 | expected = "00:00:00.500000" 375 | assert_equal expected, converter.call(timestamp) 376 | 377 | converter = ValueConverterFactory.new( 378 | SCHEMA_TYPE, 'TIME', timezone: 'Asia/Tokyo' 379 | ).create_converter 380 | assert_equal nil, converter.call(nil) 381 | timestamp = Time.parse("2016-02-25 15:00:00.500000 +00:00") 382 | expected = "00:00:00.500000" 383 | assert_equal expected, converter.call(timestamp) 384 | 385 | assert_raise { converter.call('foo') } 386 | end 387 | 388 | def test_record 389 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'RECORD').create_converter } 390 | end 391 | end 392 | 393 | class TestJsonConverter < Test::Unit::TestCase 394 | SCHEMA_TYPE = :json 395 | 396 | def test_boolean 397 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'BOOLEAN').create_converter } 398 | end 399 | 400 | def test_integer 401 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'INTEGER').create_converter } 402 | end 403 | 404 | def test_float 405 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'FLOAT').create_converter } 406 | end 407 | 408 | def test_string 409 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'STRING').create_converter 410 | assert_equal nil, converter.call(nil) 411 | assert_equal(%Q[{"foo":"foo"}], converter.call({'foo'=>'foo'})) 412 | end 413 | 414 | def test_timestamp 415 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'TIMESTAMP').create_converter } 416 | end 417 | 418 | def test_date 419 | assert_raise { ValueConverterFactory.new(SCHEMA_TYPE, 'DATE').create_converter } 420 | end 421 | 422 | def test_record 423 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'RECORD').create_converter 424 | assert_equal nil, converter.call(nil) 425 | assert_equal({'foo'=>'foo'}, converter.call({'foo'=>'foo'})) 426 | end 427 | 428 | def test_json 429 | converter = ValueConverterFactory.new(SCHEMA_TYPE, 'JSON').create_converter 430 | assert_equal nil, converter.call(nil) 431 | assert_equal({'foo'=>'foo'}, converter.call({'foo'=>'foo'})) 432 | end 433 | end 434 | 435 | def test_strict_false 436 | converter = ValueConverterFactory.new(:string, 'BOOLEAN', strict: false).create_converter 437 | assert_equal nil, converter.call('foo') 438 | 439 | converter = ValueConverterFactory.new(:string, 
'INTEGER', strict: false).create_converter 440 | assert_equal nil, converter.call('foo') 441 | end 442 | end 443 | end 444 | end 445 | --------------------------------------------------------------------------------
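The tests above exercise the plugin's two pure-Ruby entry points: Embulk::Output::Bigquery::Helper (mapping an Embulk schema to BigQuery field definitions) and Embulk::Output::Bigquery::ValueConverterFactory (building one per-column conversion Proc). The sketch below is illustrative only and is not part of the repository; it simply strings those calls together the same way the tests do. It assumes an Embulk (JRuby) runtime where the Embulk Ruby API (Embulk::Schema, Embulk::Column) is already loaded and this gem is on the load path; the column names and record values are made up, and the expected results in the comments mirror the assertions in the tests above.

# Illustrative sketch (assumption: run inside an Embulk/JRuby environment with
# embulk-output-bigquery on the load path; column names and values are made up).
require 'time'
require 'embulk/output/bigquery/helper'
require 'embulk/output/bigquery/value_converter_factory'

schema = Embulk::Schema.new([
  Embulk::Column.new({index: 0, name: 'id',         type: :long}),
  Embulk::Column.new({index: 1, name: 'name',       type: :string}),
  Embulk::Column.new({index: 2, name: 'created_at', type: :timestamp}),
])

# column_options uses the same shape as in the tests: one entry per column,
# overriding the BigQuery type by column name.
task = {
  'column_options' => [
    {'name' => 'id',         'type' => 'INTEGER'},
    {'name' => 'name',       'type' => 'STRING'},
    {'name' => 'created_at', 'type' => 'DATETIME'},
  ],
}

# BigQuery field definitions derived from the Embulk schema plus column_options,
# e.g. [{name: 'id', type: 'INTEGER'}, {name: 'name', type: 'STRING'},
#       {name: 'created_at', type: 'DATETIME'}]
fields = Embulk::Output::Bigquery::Helper.fields_from_embulk_schema(task, schema)

# One converter Proc per column; call each with the raw Embulk value.
converters = Embulk::Output::Bigquery::ValueConverterFactory.create_converters(task, schema)
record    = [1, 'foo', Time.parse('2016-02-26 00:00:00.500000 +00:00')]
converted = record.each_with_index.map {|value, i| converters[i].call(value) }
# => [1, "foo", "2016-02-26 00:00:00.500000"]  (timestamp rendered as a DATETIME
#    string in the default UTC timezone, matching TestTimestampConverter#test_datetime)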