├── .github ├── ISSUE_TEMPLATE.md └── workflows │ ├── linux.yml │ └── windows.yml ├── .gitignore ├── CHANGELOG.md ├── Gemfile ├── LICENSE.txt ├── README.md ├── Rakefile ├── fluent-plugin-bigquery.gemspec ├── gemfiles └── activesupport-4.gemfile ├── integration ├── README.md ├── create_table.sh ├── dummer_insert.rb ├── dummer_load.rb ├── fluent.conf └── schema.json ├── lib └── fluent │ └── plugin │ ├── bigquery │ ├── errors.rb │ ├── helper.rb │ ├── schema.rb │ ├── version.rb │ └── writer.rb │ ├── out_bigquery_base.rb │ ├── out_bigquery_insert.rb │ └── out_bigquery_load.rb └── test ├── helper.rb ├── plugin ├── test_out_bigquery_base.rb ├── test_out_bigquery_insert.rb ├── test_out_bigquery_load.rb ├── test_record_schema.rb └── testdata │ ├── apache.schema │ ├── json_key.json │ └── sudo.schema └── run_test.rb /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Environments 4 | 5 | - fluentd version: 6 | - plugin version: 7 | 8 | ## Configuration 9 | 10 | 11 | ## Expected Behavior 12 | 13 | ## Actual Behavior 14 | 15 | ## Log (if you have) 16 | 17 | -------------------------------------------------------------------------------- /.github/workflows/linux.yml: -------------------------------------------------------------------------------- 1 | name: Testing on Ubuntu 2 | on: 3 | - push 4 | - pull_request 5 | jobs: 6 | build: 7 | runs-on: ${{ matrix.os }} 8 | strategy: 9 | fail-fast: false 10 | matrix: 11 | ruby: 12 | - 3.0 13 | - 3.1 14 | - 3.2 15 | - 3.3 16 | os: 17 | - ubuntu-latest 18 | name: Ruby ${{ matrix.ruby }} unit testing on ${{ matrix.os }} 19 | steps: 20 | - uses: actions/checkout@v2 21 | - uses: ruby/setup-ruby@v1 22 | with: 23 | ruby-version: ${{ matrix.ruby }} 24 | - name: unit testing 25 | env: 26 | CI: true 27 | run: | 28 | ruby -v 29 | bundle install --jobs 4 --retry 3 30 | bundle exec rake test 31 | -------------------------------------------------------------------------------- /.github/workflows/windows.yml: -------------------------------------------------------------------------------- 1 | name: Testing on Windows 2 | on: 3 | - push 4 | - pull_request 5 | jobs: 6 | build: 7 | runs-on: ${{ matrix.os }} 8 | strategy: 9 | fail-fast: false 10 | matrix: 11 | ruby: 12 | - 3.0 13 | - 3.1 14 | - 3.2 15 | - 3.3 16 | os: 17 | - windows-latest 18 | name: Ruby ${{ matrix.ruby }} unit testing on ${{ matrix.os }} 19 | steps: 20 | - uses: actions/checkout@v2 21 | - uses: ruby/setup-ruby@v1 22 | with: 23 | ruby-version: ${{ matrix.ruby }} 24 | - name: unit testing 25 | env: 26 | CI: true 27 | run: | 28 | ruby -v 29 | bundle install --jobs 4 --retry 3 30 | bundle exec rake test 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | *.rbc 3 | .bundle 4 | .config 5 | .yardoc 6 | .ruby-version 7 | Gemfile.lock 8 | InstalledFiles 9 | _yardoc 10 | coverage 11 | doc/ 12 | lib/bundler/man 13 | pkg 14 | rdoc 15 | spec/reports 16 | test/tmp 17 | test/version_tmp 18 | tmp 19 | script/ 20 | .idea/ 21 | 22 | fluentd-0.12 23 | 24 | integration/log 25 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## [v3.1.0](https://github.com/fluent-plugins-nursery/fluent-plugin-bigquery/compare/v3.0.1...v3.1.0) (2022-12-16) 2 | 3 | 4 | ### Features 5 | 6 | * Support GEOGRAPHY 
type field ([#201](https://github.com/fluent-plugins-nursery/fluent-plugin-bigquery/issues/201)) ([734faa9](https://github.com/fluent-plugins-nursery/fluent-plugin-bigquery/commit/734faa9adb7cec1ed579fc6a0bd9ce72d48b82d0)) 7 | * Support JSON type field ([#204](https://github.com/fluent-plugins-nursery/fluent-plugin-bigquery/issues/204)) ([ec62bfa](https://github.com/fluent-plugins-nursery/fluent-plugin-bigquery/commit/ec62bfa2f858feb440e8bb8e8f8d6b8689f709bb)) 8 | 9 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | # Specify your gem's dependencies in fluent-plugin-bigquery.gemspec 4 | gemspec 5 | 6 | gem "oj" 7 | gem "dummer" 8 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012- TAGOMORI Satoshi 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # fluent-plugin-bigquery 2 | 3 | [Fluentd](http://fluentd.org) output plugin to load/insert data into Google BigQuery. 4 | 5 | - **Plugin type**: Output 6 | 7 | * insert data over streaming inserts 8 | * plugin type is `bigquery_insert` 9 | * for continuous real-time insertions 10 | * https://developers.google.com/bigquery/streaming-data-into-bigquery#usecases 11 | * load data 12 | * plugin type is `bigquery_load` 13 | * for data loading as batch jobs, for big amount of data 14 | * https://developers.google.com/bigquery/loading-data-into-bigquery 15 | 16 | Current version of this plugin supports Google API with Service Account Authentication, but does not support 17 | OAuth flow for installed applications. 18 | 19 | ## Support Version 20 | 21 | | plugin version | fluentd version | ruby version | 22 | | :----------- | :----------- | :----------- | 23 | | v0.4.x | 0.12.x | 2.0 or later | 24 | | v1.x.x | 0.14.x or later | 2.2 or later | 25 | | v2.x.x | 0.14.x or later | 2.3 or later | 26 | | v3.x.x | 1.x or later | 2.7 or later | 27 | 28 | ## With docker image 29 | If you use official alpine based fluentd docker image (https://github.com/fluent/fluentd-docker-image), 30 | You need to install `bigdecimal` gem on your own dockerfile. 31 | Because alpine based image has only minimal ruby environment in order to reduce image size. 32 | And in most case, dependency to embedded gem is not written on gemspec. 33 | Because embedded gem dependency sometimes restricts ruby environment. 34 | 35 | ## Configuration 36 | 37 | ### Options 38 | 39 | #### common 40 | 41 | | name | type | required? | placeholder? 
| default | description | 42 | | :-------------------------------------------- | :------------ | :----------- | :---------- | :------------------------- | :----------------------- | 43 | | auth_method | enum | yes | no | private_key | `private_key` or `json_key` or `compute_engine` or `application_default` (GKE Workload Identity) | 44 | | email | string | yes (private_key) | no | nil | GCP Service Account Email | 45 | | private_key_path | string | yes (private_key) | no | nil | GCP Private Key file path | 46 | | private_key_passphrase | string | yes (private_key) | no | nil | GCP Private Key Passphrase | 47 | | json_key | string | yes (json_key) | no | nil | GCP JSON Key file path or JSON Key string | 48 | | project | string | yes | yes | nil | | 49 | | dataset | string | yes | yes | nil | | 50 | | table | string | yes (either `tables`) | yes | nil | | 51 | | tables | array(string) | yes (either `table`) | yes | nil | can set multi table names splitted by `,` | 52 | | auto_create_table | bool | no | no | false | If true, creates table automatically | 53 | | ignore_unknown_values | bool | no | no | false | Accept rows that contain values that do not match the schema. The unknown values are ignored. | 54 | | schema | array | yes (either `fetch_schema` or `schema_path`) | no | nil | Schema Definition. It is formatted by JSON. | 55 | | schema_path | string | yes (either `fetch_schema`) | yes | nil | Schema Definition file path. It is formatted by JSON. | 56 | | fetch_schema | bool | yes (either `schema_path`) | no | false | If true, fetch table schema definition from Bigquery table automatically. | 57 | | fetch_schema_table | string | no | yes | nil | If set, fetch table schema definition from this table, If fetch_schema is false, this param is ignored | 58 | | schema_cache_expire | integer | no | no | 600 | Value is second. If current time is after expiration interval, re-fetch table schema definition. | 59 | | request_timeout_sec | integer | no | no | nil | Bigquery API response timeout | 60 | | request_open_timeout_sec | integer | no | no | 60 | Bigquery API connection, and request timeout. If you send big data to Bigquery, set large value. | 61 | | time_partitioning_type | enum | no (either day or hour) | no | nil | Type of bigquery time partitioning feature. | 62 | | time_partitioning_field | string | no | no | nil | Field used to determine how to create a time-based partition. | 63 | | time_partitioning_expiration | time | no | no | nil | Expiration milliseconds for bigquery time partitioning. | 64 | | clustering_fields | array(string) | no | no | nil | One or more fields on which data should be clustered. The order of the specified columns determines the sort order of the data. | 65 | 66 | #### bigquery_insert 67 | 68 | | name | type | required? | placeholder? | default | description | 69 | | :------------------------------------- | :------------ | :----------- | :---------- | :------------------------- | :----------------------- | 70 | | template_suffix | string | no | yes | nil | can use `%{time_slice}` placeholder replaced by `time_slice_format` | 71 | | skip_invalid_rows | bool | no | no | false | | 72 | | insert_id_field | string | no | no | nil | Use key as `insert_id` of Streaming Insert API parameter. see. https://docs.fluentd.org/v1.0/articles/api-plugin-helper-record_accessor | 73 | | add_insert_timestamp | string | no | no | nil | Adds a timestamp column just before sending the rows to BigQuery, so that buffering time is not taken into account. 
Gives a field in BigQuery which represents the insert time of the row. | 74 | | allow_retry_insert_errors | bool | no | no | false | Retry to insert rows when an insertErrors occurs. There is a possibility that rows are inserted in duplicate. | 75 | | require_partition_filter | bool | no | no | false | If true, queries over this table require a partition filter that can be used for partition elimination to be specified. | 76 | 77 | #### bigquery_load 78 | 79 | | name | type | required? | placeholder? | default | description | 80 | | :------------------------------------- | :------------ | :----------- | :---------- | :------------------------- | :----------------------- | 81 | | source_format | enum | no | no | json | Specify source format `json` or `csv` or `avro`. If you change this parameter, you must change formatter plugin via `` config section. | 82 | | max_bad_records | integer | no | no | 0 | If the number of bad records exceeds this value, an invalid error is returned in the job result. | 83 | 84 | ### Buffer section 85 | 86 | | name | type | required? | default | description | 87 | | :------------------------------------- | :------------ | :----------- | :------------------------- | :----------------------- | 88 | | @type | string | no | memory (insert) or file (load) | | 89 | | chunk_limit_size | integer | no | 1MB (insert) or 1GB (load) | | 90 | | total_limit_size | integer | no | 1GB (insert) or 32GB (load) | | 91 | | chunk_records_limit | integer | no | 500 (insert) or nil (load) | | 92 | | flush_mode | enum | no | interval | default, lazy, interval, immediate | 93 | | flush_interval | float | no | 1.0 (insert) or 3600 (load) | | 94 | | flush_thread_interval | float | no | 0.05 (insert) or 5 (load) | | 95 | | flush_thread_burst_interval | float | no | 0.05 (insert) or 5 (load) | | 96 | 97 | And, other params (defined by base class) are available 98 | 99 | see. https://github.com/fluent/fluentd/blob/master/lib/fluent/plugin/output.rb 100 | 101 | ### Inject section 102 | 103 | It is replacement of previous version `time_field` and `time_format`. 104 | 105 | For example. 106 | 107 | ``` 108 | 109 | time_key time_field_name 110 | time_type string 111 | time_format %Y-%m-%d %H:%M:%S 112 | 113 | ``` 114 | 115 | | name | type | required? | default | description | 116 | | :------------------------------------- | :------------ | :----------- | :------------------------- | :----------------------- | 117 | | hostname_key | string | no | nil | | 118 | | hostname | string | no | nil | | 119 | | tag_key | string | no | nil | | 120 | | time_key | string | no | nil | | 121 | | time_type | string | no | nil | | 122 | | time_format | string | no | nil | | 123 | | localtime | bool | no | true | | 124 | | utc | bool | no | false | | 125 | | timezone | string | no | nil | | 126 | 127 | see. https://github.com/fluent/fluentd/blob/master/lib/fluent/plugin_helper/inject.rb 128 | 129 | ### Formatter section 130 | 131 | This section is for `load` mode only. 132 | If you use `insert` mode, used formatter is `json` only. 133 | 134 | Bigquery supports `csv`, `json` and `avro` format. Default is `json` 135 | I recommend to use `json` for now. 136 | 137 | For example. 138 | 139 | ``` 140 | source_format csv 141 | 142 | 143 | @type csv 144 | fields col1, col2, col3 145 | 146 | ``` 147 | 148 | see. 
https://github.com/fluent/fluentd/blob/master/lib/fluent/plugin_helper/formatter.rb 149 | 150 | ## Examples 151 | 152 | ### Streaming inserts 153 | 154 | Configure insert specifications with target table schema, with your credentials. This is minimum configurations: 155 | 156 | ```apache 157 | 158 | @type bigquery_insert 159 | 160 | auth_method private_key # default 161 | email xxxxxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxx@developer.gserviceaccount.com 162 | private_key_path /home/username/.keys/00000000000000000000000000000000-privatekey.p12 163 | # private_key_passphrase notasecret # default 164 | 165 | project yourproject_id 166 | dataset yourdataset_id 167 | table tablename 168 | 169 | schema [ 170 | {"name": "time", "type": "INTEGER"}, 171 | {"name": "status", "type": "INTEGER"}, 172 | {"name": "bytes", "type": "INTEGER"}, 173 | {"name": "vhost", "type": "STRING"}, 174 | {"name": "path", "type": "STRING"}, 175 | {"name": "method", "type": "STRING"}, 176 | {"name": "protocol", "type": "STRING"}, 177 | {"name": "agent", "type": "STRING"}, 178 | {"name": "referer", "type": "STRING"}, 179 | {"name": "remote", "type": "RECORD", "fields": [ 180 | {"name": "host", "type": "STRING"}, 181 | {"name": "ip", "type": "STRING"}, 182 | {"name": "user", "type": "STRING"} 183 | ]}, 184 | {"name": "requesttime", "type": "FLOAT"}, 185 | {"name": "bot_access", "type": "BOOLEAN"}, 186 | {"name": "loginsession", "type": "BOOLEAN"} 187 | ] 188 | 189 | ``` 190 | 191 | For high rate inserts over streaming inserts, you should specify flush intervals and buffer chunk options: 192 | 193 | ```apache 194 | 195 | @type bigquery_insert 196 | 197 | 198 | flush_interval 0.1 # flush as frequent as possible 199 | 200 | total_limit_size 10g 201 | 202 | flush_thread_count 16 203 | 204 | 205 | auth_method private_key # default 206 | email xxxxxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxx@developer.gserviceaccount.com 207 | private_key_path /home/username/.keys/00000000000000000000000000000000-privatekey.p12 208 | # private_key_passphrase notasecret # default 209 | 210 | project yourproject_id 211 | dataset yourdataset_id 212 | tables accesslog1,accesslog2,accesslog3 213 | 214 | schema [ 215 | {"name": "time", "type": "INTEGER"}, 216 | {"name": "status", "type": "INTEGER"}, 217 | {"name": "bytes", "type": "INTEGER"}, 218 | {"name": "vhost", "type": "STRING"}, 219 | {"name": "path", "type": "STRING"}, 220 | {"name": "method", "type": "STRING"}, 221 | {"name": "protocol", "type": "STRING"}, 222 | {"name": "agent", "type": "STRING"}, 223 | {"name": "referer", "type": "STRING"}, 224 | {"name": "remote", "type": "RECORD", "fields": [ 225 | {"name": "host", "type": "STRING"}, 226 | {"name": "ip", "type": "STRING"}, 227 | {"name": "user", "type": "STRING"} 228 | ]}, 229 | {"name": "requesttime", "type": "FLOAT"}, 230 | {"name": "bot_access", "type": "BOOLEAN"}, 231 | {"name": "loginsession", "type": "BOOLEAN"} 232 | ] 233 | 234 | ``` 235 | 236 | Important options for high rate events are: 237 | 238 | * `tables` 239 | * 2 or more tables are available with ',' separator 240 | * `out_bigquery` uses these tables for Table Sharding inserts 241 | * these must have same schema 242 | * `buffer/chunk_limit_size` 243 | * max size of an insert or chunk (default 1000000 or 1MB) 244 | * the max size is limited to 1MB on BigQuery 245 | * `buffer/chunk_records_limit` 246 | * number of records over streaming inserts API call is limited as 500, per insert or chunk 247 | * `out_bigquery` flushes buffer with 500 records for 1 inserts API call 248 | * 
`buffer/queue_length_limit` 249 | * BigQuery streaming inserts needs very small buffer chunks 250 | * for high-rate events, `buffer_queue_limit` should be configured with big number 251 | * Max 1GB memory may be used under network problem in default configuration 252 | * `chunk_limit_size (default 1MB)` x `queue_length_limit (default 1024)` 253 | * `buffer/flush_thread_count` 254 | * threads for insert api calls in parallel 255 | * specify this option for 100 or more records per seconds 256 | * 10 or more threads seems good for inserts over internet 257 | * fewer threads may be good for Google Compute Engine instances (with low latency for BigQuery) 258 | * `buffer/flush_interval` 259 | * interval between data flushes (default 0.25) 260 | * you can set subsecond values such as `0.15` on Fluentd v0.10.42 or later 261 | 262 | See [Quota policy](https://cloud.google.com/bigquery/streaming-data-into-bigquery#quota) 263 | section in the Google BigQuery document. 264 | 265 | ### Load 266 | ```apache 267 | 268 | @type bigquery_load 269 | 270 | 271 | path bigquery.*.buffer 272 | flush_at_shutdown true 273 | timekey_use_utc 274 | 275 | 276 | auth_method json_key 277 | json_key json_key_path.json 278 | 279 | project yourproject_id 280 | dataset yourdataset_id 281 | auto_create_table true 282 | table yourtable%{time_slice} 283 | schema_path bq_schema.json 284 | 285 | ``` 286 | 287 | I recommend to use file buffer and long flush interval. 288 | 289 | ### Authentication 290 | 291 | There are four methods supported to fetch access token for the service account. 292 | 293 | 1. Public-Private key pair of GCP(Google Cloud Platform)'s service account 294 | 2. JSON key of GCP(Google Cloud Platform)'s service account 295 | 3. Predefined access token (Compute Engine only) 296 | 4. [Google application default credentials](https://cloud.google.com/docs/authentication/application-default-credentials) / GKE Workload Identity 297 | 298 | #### Public-Private key pair of GCP's service account 299 | 300 | The examples above use the first one. You first need to create a service account (client ID), 301 | download its private key and deploy the key with fluentd. 302 | 303 | #### JSON key of GCP(Google Cloud Platform)'s service account 304 | 305 | You first need to create a service account (client ID), 306 | download its JSON key and deploy the key with fluentd. 307 | 308 | ```apache 309 | 310 | @type bigquery_insert 311 | 312 | auth_method json_key 313 | json_key /home/username/.keys/00000000000000000000000000000000-jsonkey.json 314 | 315 | project yourproject_id 316 | dataset yourdataset_id 317 | table tablename 318 | ... 319 | 320 | ``` 321 | 322 | You can also provide `json_key` as embedded JSON string like this. 323 | You need to only include `private_key` and `client_email` key from JSON key file. 324 | 325 | ```apache 326 | 327 | @type bigquery_insert 328 | 329 | auth_method json_key 330 | json_key {"private_key": "-----BEGIN PRIVATE KEY-----\n...", "client_email": "xxx@developer.gserviceaccount.com"} 331 | 332 | project yourproject_id 333 | dataset yourdataset_id 334 | table tablename 335 | ... 336 | 337 | ``` 338 | 339 | #### Predefined access token (Compute Engine only) 340 | 341 | When you run fluentd on Google Compute Engine instance, 342 | you don't need to explicitly create a service account for fluentd. 
343 | In this authentication method, you need to add the API scope "https://www.googleapis.com/auth/bigquery" to the scope list of your 344 | Compute Engine instance, then you can configure fluentd like this. 345 | 346 | ```apache 347 | 348 | @type bigquery_insert 349 | 350 | auth_method compute_engine 351 | 352 | project yourproject_id 353 | dataset yourdataset_id 354 | table tablename 355 | 356 | ... 357 | 358 | ``` 359 | 360 | #### Application default credentials 361 | 362 | The Application Default Credentials provide a simple way to get authorization credentials for use in calling Google APIs, which are described in detail at https://cloud.google.com/docs/authentication/application-default-credentials. 363 | 364 | **This is the method you should choose if you want to use Workload Identity on GKE**. 365 | 366 | In this authentication method, the credentials returned are determined by the environment the code is running in. Conditions are checked in the following order:credentials are get from following order. 367 | 368 | 1. The environment variable `GOOGLE_APPLICATION_CREDENTIALS` is checked. If this variable is specified it should point to a JSON key file that defines the credentials. 369 | 2. The environment variable `GOOGLE_PRIVATE_KEY` and `GOOGLE_CLIENT_EMAIL` are checked. If these variables are specified `GOOGLE_PRIVATE_KEY` should point to `private_key`, `GOOGLE_CLIENT_EMAIL` should point to `client_email` in a JSON key. 370 | 3. Well known path is checked. If the file exists, it is used as a JSON key file. This path is `$HOME/.config/gcloud/application_default_credentials.json`. 371 | 4. System default path is checked. If the file exists, it is used as a JSON key file. This path is `/etc/google/auth/application_default_credentials.json`. 372 | 5. If you are running in Google Compute Engine production, the built-in service account associated with the virtual machine instance will be used. 373 | 6. If none of these conditions is true, an error will occur. 374 | 375 | ### Table id formatting 376 | 377 | this plugin supports fluentd-0.14 style placeholder. 378 | 379 | #### strftime formatting 380 | `table` and `tables` options accept [Time#strftime](http://ruby-doc.org/core-1.9.3/Time.html#method-i-strftime) 381 | format to construct table ids. 382 | Table ids are formatted at runtime 383 | using the chunk key time. 384 | 385 | see. https://docs.fluentd.org/configuration/buffer-section 386 | 387 | For example, with the configuration below, 388 | data is inserted into tables `accesslog_2014_08_02`, `accesslog_2014_08_03` and so on. 389 | 390 | ```apache 391 | 392 | @type bigquery_insert 393 | 394 | ... 395 | 396 | project yourproject_id 397 | dataset yourdataset_id 398 | table accesslog_%Y_%m_%d 399 | 400 | 401 | timekey 1d 402 | 403 | ... 404 | 405 | ``` 406 | 407 | **NOTE: In current fluentd (v1.15.x), The maximum unit supported by strftime formatting is the granularity of days** 408 | 409 | #### record attribute formatting 410 | The format can be suffixed with attribute name. 411 | 412 | __CAUTION: format is different with previous version__ 413 | 414 | ```apache 415 | 416 | ... 417 | table accesslog_${status_code} 418 | 419 | 420 | 421 | ... 422 | 423 | ``` 424 | 425 | If attribute name is given, the time to be used for formatting is value of each row. 426 | The value for the time should be a UNIX time. 427 | 428 | #### time_slice_key formatting 429 | 430 | Instead, Use strftime formatting. 431 | 432 | strftime formatting of current version is based on chunk key. 
433 | That is same with previous time_slice_key formatting . 434 | 435 | ### Date partitioned table support 436 | this plugin can insert (load) into date partitioned table. 437 | 438 | Use placeholder. 439 | 440 | ```apache 441 | 442 | @type bigquery_load 443 | 444 | ... 445 | table accesslog$%Y%m%d 446 | 447 | 448 | timekey 1d 449 | 450 | ... 451 | 452 | ``` 453 | 454 | But, Dynamic table creating doesn't support date partitioned table yet. 455 | And streaming insert is not allowed to insert with `$%Y%m%d` suffix. 456 | If you use date partitioned table with streaming insert, Please omit `$%Y%m%d` suffix from `table`. 457 | 458 | ### Dynamic table creating 459 | 460 | When `auto_create_table` is set to `true`, try to create the table using BigQuery API when insertion failed with code=404 "Not Found: Table ...". 461 | Next retry of insertion is expected to be success. 462 | 463 | NOTE: `auto_create_table` option cannot be used with `fetch_schema`. You should create the table on ahead to use `fetch_schema`. 464 | 465 | ```apache 466 | 467 | @type bigquery_insert 468 | 469 | ... 470 | 471 | auto_create_table true 472 | table accesslog_%Y_%m 473 | 474 | ... 475 | 476 | ``` 477 | 478 | Also, you can create clustered table by using `clustering_fields`. 479 | 480 | ### Table schema 481 | 482 | There are three methods to describe the schema of the target table. 483 | 484 | 1. List fields in fluent.conf 485 | 2. Load a schema file in JSON. 486 | 3. Fetch a schema using BigQuery API 487 | 488 | The examples above use the first method. In this method, 489 | you can also specify nested fields by prefixing their belonging record fields. 490 | 491 | ```apache 492 | 493 | @type bigquery_insert 494 | 495 | ... 496 | 497 | schema [ 498 | {"name": "time", "type": "INTEGER"}, 499 | {"name": "status", "type": "INTEGER"}, 500 | {"name": "bytes", "type": "INTEGER"}, 501 | {"name": "vhost", "type": "STRING"}, 502 | {"name": "path", "type": "STRING"}, 503 | {"name": "method", "type": "STRING"}, 504 | {"name": "protocol", "type": "STRING"}, 505 | {"name": "agent", "type": "STRING"}, 506 | {"name": "referer", "type": "STRING"}, 507 | {"name": "remote", "type": "RECORD", "fields": [ 508 | {"name": "host", "type": "STRING"}, 509 | {"name": "ip", "type": "STRING"}, 510 | {"name": "user", "type": "STRING"} 511 | ]}, 512 | {"name": "requesttime", "type": "FLOAT"}, 513 | {"name": "bot_access", "type": "BOOLEAN"}, 514 | {"name": "loginsession", "type": "BOOLEAN"} 515 | ] 516 | 517 | ``` 518 | 519 | This schema accepts structured JSON data like: 520 | 521 | ```json 522 | { 523 | "request":{ 524 | "time":1391748126.7000976, 525 | "vhost":"www.example.com", 526 | "path":"/", 527 | "method":"GET", 528 | "protocol":"HTTP/1.1", 529 | "agent":"HotJava", 530 | "bot_access":false 531 | }, 532 | "remote":{ "ip": "192.0.2.1" }, 533 | "response":{ 534 | "status":200, 535 | "bytes":1024 536 | } 537 | } 538 | ``` 539 | 540 | The second method is to specify a path to a BigQuery schema file instead of listing fields. In this case, your fluent.conf looks like: 541 | 542 | ```apache 543 | 544 | @type bigquery_insert 545 | 546 | ... 547 | 548 | schema_path /path/to/httpd.schema 549 | 550 | ``` 551 | where /path/to/httpd.schema is a path to the JSON-encoded schema file which you used for creating the table on BigQuery. By using external schema file you are able to write full schema that does support NULLABLE/REQUIRED/REPEATED, this feature is really useful and adds full flexibility. 
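For reference, the `/path/to/httpd.schema` file mentioned above could look like the following sketch. It reuses the field names from the inline schema example earlier in this README; the `mode` values shown are illustrative assumptions, added to show how NULLABLE/REQUIRED (and REPEATED) can be expressed in an external schema file:

```json
[
  {"name": "time",     "type": "INTEGER", "mode": "REQUIRED"},
  {"name": "status",   "type": "INTEGER", "mode": "NULLABLE"},
  {"name": "bytes",    "type": "INTEGER", "mode": "NULLABLE"},
  {"name": "vhost",    "type": "STRING",  "mode": "NULLABLE"},
  {"name": "path",     "type": "STRING",  "mode": "NULLABLE"},
  {"name": "method",   "type": "STRING",  "mode": "NULLABLE"},
  {"name": "protocol", "type": "STRING",  "mode": "NULLABLE"},
  {"name": "agent",    "type": "STRING",  "mode": "NULLABLE"},
  {"name": "referer",  "type": "STRING",  "mode": "NULLABLE"},
  {"name": "remote",   "type": "RECORD",  "mode": "NULLABLE", "fields": [
    {"name": "host", "type": "STRING", "mode": "NULLABLE"},
    {"name": "ip",   "type": "STRING", "mode": "NULLABLE"},
    {"name": "user", "type": "STRING", "mode": "NULLABLE"}
  ]},
  {"name": "requesttime",  "type": "FLOAT",   "mode": "NULLABLE"},
  {"name": "bot_access",   "type": "BOOLEAN", "mode": "NULLABLE"},
  {"name": "loginsession", "type": "BOOLEAN", "mode": "NULLABLE"}
]
```

This is the same structure used by `integration/schema.json` in this repository, and the same file can be passed to `bq mk --schema` when creating the table (see `integration/create_table.sh`).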
552 | 553 | The third method is to set `fetch_schema` to `true` to enable fetch a schema using BigQuery API. In this case, your fluent.conf looks like: 554 | 555 | ```apache 556 | 557 | @type bigquery_insert 558 | 559 | ... 560 | 561 | fetch_schema true 562 | # fetch_schema_table other_table # if you want to fetch schema from other table 563 | 564 | ``` 565 | 566 | If you specify multiple tables in configuration file, plugin get all schema data from BigQuery and merge it. 567 | 568 | NOTE: Since JSON does not define how to encode data of TIMESTAMP type, 569 | you are still recommended to specify JSON types for TIMESTAMP fields as "time" field does in the example, if you use second or third method. 570 | 571 | ### Specifying insertId property 572 | 573 | BigQuery uses `insertId` property to detect duplicate insertion requests (see [data consistency](https://cloud.google.com/bigquery/streaming-data-into-bigquery#dataconsistency) in Google BigQuery documents). 574 | You can set `insert_id_field` option to specify the field to use as `insertId` property. 575 | `insert_id_field` can use fluentd record_accessor format like `$['key1'][0]['key2']`. 576 | (detail. https://docs.fluentd.org/v1.0/articles/api-plugin-helper-record_accessor) 577 | 578 | ```apache 579 | 580 | @type bigquery_insert 581 | 582 | ... 583 | 584 | insert_id_field uuid 585 | schema [{"name": "uuid", "type": "STRING"}] 586 | 587 | ``` 588 | 589 | ## TODO 590 | 591 | * OAuth installed application credentials support 592 | * Google API discovery expiration 593 | * check row size limits 594 | 595 | ## Authors 596 | 597 | * @tagomoris: First author, original version 598 | * KAIZEN platform Inc.: Maintainer, Since 2014.08.19 599 | * @joker1007 600 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env rake 2 | require "bundler/gem_tasks" 3 | 4 | require 'rake/testtask' 5 | Rake::TestTask.new(:test) do |test| 6 | test.libs << 'lib' << 'test' 7 | test.pattern = 'test/**/test_*.rb' 8 | test.verbose = true 9 | test.warning = false 10 | end 11 | 12 | task :default => :test 13 | -------------------------------------------------------------------------------- /fluent-plugin-bigquery.gemspec: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | lib = File.expand_path('../lib', __FILE__) 3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 4 | require 'fluent/plugin/bigquery/version' 5 | 6 | Gem::Specification.new do |spec| 7 | spec.name = "fluent-plugin-bigquery" 8 | spec.version = Fluent::BigQueryPlugin::VERSION 9 | spec.authors = ["Naoya Ito", "joker1007"] 10 | spec.email = ["i.naoya@gmail.com", "kakyoin.hierophant@gmail.com"] 11 | spec.description = %q{Fluentd plugin to store data on Google BigQuery, by load, or by stream inserts} 12 | spec.summary = %q{Fluentd plugin to store data on Google BigQuery} 13 | spec.homepage = "https://github.com/kaizenplatform/fluent-plugin-bigquery" 14 | spec.license = "Apache-2.0" 15 | 16 | spec.files = `git ls-files`.split($/) 17 | spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) } 18 | spec.test_files = spec.files.grep(%r{^(test|spec|features)/}) 19 | spec.require_paths = ["lib"] 20 | 21 | spec.add_development_dependency "rake" 22 | spec.add_development_dependency "rr" 23 | spec.add_development_dependency "test-unit" 24 | spec.add_development_dependency "test-unit-rr" 25 | 26 | 
spec.add_runtime_dependency "google-api-client", ">= 0.11.0" 27 | spec.add_runtime_dependency "googleauth", ">= 0.5.0" 28 | spec.add_runtime_dependency "multi_json" 29 | spec.add_runtime_dependency "fluentd", ">= 0.14.0", "< 2" 30 | end 31 | -------------------------------------------------------------------------------- /gemfiles/activesupport-4.gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | gem 'activesupport', '< 5' 4 | 5 | # Specify your gem's dependencies in fluent-plugin-bigquery.gemspec 6 | gemspec :path => '../' 7 | -------------------------------------------------------------------------------- /integration/README.md: -------------------------------------------------------------------------------- 1 | # Requirements 2 | 3 | Set Environment Variable 4 | 5 | - GOOGLE_APPLICATION_CREDENTIALS (json key path) 6 | - PROJECT_NAME 7 | - DATASET_NAME 8 | - TABLE_NAME 9 | 10 | # How to use 11 | 12 | 1. execute `create_table.sh` 13 | 1. `bundle exec fluentd -c fluent.conf` 14 | 1. `bundle exec dummer -c dummer_insert.rb` or `bundle exec dummer -c dummer_load.rb` 15 | -------------------------------------------------------------------------------- /integration/create_table.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -eux 4 | bq mk -t --project_id=${PROJECT_NAME} --schema=$(dirname $0)/schema.json ${DATASET_NAME}.${TABLE_NAME} 5 | -------------------------------------------------------------------------------- /integration/dummer_insert.rb: -------------------------------------------------------------------------------- 1 | require "time" 2 | 3 | configure "insert" do 4 | host "localhost" 5 | port 24224 6 | rate 100 7 | tag type: :string, any: %w(insert_data) 8 | field :id, type: :integer, countup: true 9 | field :string_field, type: :string, any: %w(str1 str2 str3 str4) 10 | field :timestamp_field, type: :string, value: Time.now.iso8601 11 | field :date, type: :string, value: Time.now.strftime("%Y-%m-%d") 12 | end 13 | -------------------------------------------------------------------------------- /integration/dummer_load.rb: -------------------------------------------------------------------------------- 1 | require "time" 2 | 3 | configure "load" do 4 | host "localhost" 5 | port 24224 6 | rate 100 7 | tag type: :string, any: %w(load_data) 8 | field :id, type: :integer, countup: true 9 | field :string_field, type: :string, any: %w(str1 str2 str3 str4) 10 | field :timestamp_field, type: :string, value: Time.now.iso8601 11 | field :date, type: :string, value: Time.now.strftime("%Y-%m-%d") 12 | end 13 | -------------------------------------------------------------------------------- /integration/fluent.conf: -------------------------------------------------------------------------------- 1 | 2 | @type forward 3 | port 24224 4 | bind 0.0.0.0 5 | 6 | 7 | 8 | @type dummy 9 | dummy {"json_field": {"foo": "val1", "bar": "val2", "hoge": 1}, "geography_field": {"type": "LineString", "coordinates": [[-118.4085, 33.9416], [-73.7781, 40.6413]]}, "timestamp_field": "2022-12-15T22:40:21+09:00", "date": "2022-12-15", "record_field": {"inner_field": "hoge", "inner_json": {"key1": "val1", "key2": "val2"}}, "repeated_string_field": ["a", "b", "c"]} 10 | auto_increment_key id 11 | 12 | tag insert_data 13 | 14 | 15 | 16 | @id bigquery-insert-integration 17 | @type bigquery_insert 18 | 19 | allow_retry_insert_errors true 20 | 21 | auth_method json_key 22 | 
json_key "#{ENV["GOOGLE_APPLICATION_CREDENTIALS"]}" 23 | 24 | 25 | @type file 26 | 27 | chunk_limit_size 1m 28 | chunk_limit_records 1500 29 | total_limit_size 1g 30 | path ./log/bigquery-insert-integration 31 | 32 | flush_interval 15 33 | flush_thread_count 4 34 | flush_at_shutdown true 35 | 36 | retry_max_times 14 37 | retry_max_interval 30m 38 | 39 | 40 | request_open_timeout_sec 2m 41 | 42 | slow_flush_log_threshold 30.0 43 | 44 | project "#{ENV["PROJECT_NAME"]}" 45 | dataset "#{ENV["DATASET_NAME"]}" 46 | table "#{ENV["TABLE_NAME"]}" 47 | auto_create_table false 48 | # schema_path integration/schema.json 49 | fetch_schema true 50 | fetch_schema_table "#{ENV["TABLE_NAME"]}" 51 | 52 | insert_id_field id 53 | 54 | 55 | @type file 56 | path ./log/bigquery-insert-integration.errors 57 | 58 | 59 | 60 | 61 | @id bigquery-load-integration 62 | @type bigquery_load 63 | 64 | auth_method json_key 65 | json_key "#{ENV["GOOGLE_APPLICATION_CREDENTIALS"]}" 66 | 67 | 68 | @type file 69 | 70 | chunk_limit_size 1m 71 | total_limit_size 1g 72 | path ./log/bigquery-load-integration 73 | 74 | flush_interval 120 75 | flush_thread_count 4 76 | flush_at_shutdown true 77 | 78 | retry_max_times 14 79 | retry_max_interval 30m 80 | 81 | 82 | request_open_timeout_sec 2m 83 | 84 | slow_flush_log_threshold 300.0 85 | 86 | project "#{ENV["PROJECT_NAME"]}" 87 | dataset "#{ENV["DATASET_NAME"]}" 88 | table "#{ENV["TABLE_NAME"]}" 89 | auto_create_table false 90 | # schema_path integration/schema.json 91 | fetch_schema true 92 | fetch_schema_table "#{ENV["TABLE_NAME"]}" 93 | 94 | 95 | @type file 96 | path ./log/bigquery-load-integration.errors 97 | 98 | 99 | -------------------------------------------------------------------------------- /integration/schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "id", 4 | "type": "INTEGER", 5 | "mode": "REQUIRED" 6 | }, 7 | { 8 | "name": "string_field", 9 | "type": "STRING", 10 | "mode": "NULLABLE" 11 | }, 12 | { 13 | "name": "json_field", 14 | "type": "JSON", 15 | "mode": "NULLABLE" 16 | }, 17 | { 18 | "name": "geography_field", 19 | "type": "GEOGRAPHY", 20 | "mode": "NULLABLE" 21 | }, 22 | { 23 | "name": "timestamp_field", 24 | "type": "TIMESTAMP", 25 | "mode": "NULLABLE" 26 | }, 27 | { 28 | "name": "date", 29 | "type": "DATE", 30 | "mode": "REQUIRED" 31 | }, 32 | { 33 | "name": "record_field", 34 | "type": "RECORD", 35 | "mode": "NULLABLE", 36 | "fields": [ 37 | { 38 | "name": "inner_field", 39 | "type": "STRING", 40 | "mode": "REQUIRED" 41 | }, 42 | { 43 | "name": "inner_json", 44 | "type": "JSON", 45 | "mode": "REQUIRED" 46 | } 47 | ] 48 | }, 49 | { 50 | "name": "repeated_string_field", 51 | "type": "STRING", 52 | "mode": "REPEATED" 53 | } 54 | ] 55 | -------------------------------------------------------------------------------- /lib/fluent/plugin/bigquery/errors.rb: -------------------------------------------------------------------------------- 1 | module Fluent 2 | module BigQuery 3 | # @abstract 4 | class Error < StandardError 5 | RETRYABLE_ERROR_REASON = %w(backendError internalError rateLimitExceeded tableUnavailable).freeze 6 | RETRYABLE_INSERT_ERRORS_REASON = %w(timeout backendError internalError rateLimitExceeded).freeze 7 | RETRYABLE_STATUS_CODE = [500, 502, 503, 504] 8 | REGION_NOT_WRITABLE_MESSAGE = -"is not writable in the region" 9 | SSL_UNEXPECTED_EOF_MESSAGE = -"SSL_read: unexpected eof while reading" 10 | 11 | class << self 12 | # @param e [Google::Apis::Error] 13 | # @param message [String] 14 
| def wrap(e, message = nil) 15 | if retryable_error?(e) 16 | RetryableError.new(message, e) 17 | else 18 | UnRetryableError.new(message, e) 19 | end 20 | end 21 | 22 | # @param e [Google::Apis::Error] 23 | def retryable_error?(e) 24 | retryable_server_error?(e) || retryable_region_not_writable?(e) || retryable_ssl_unexpected_eof?(e) 25 | end 26 | 27 | def retryable_server_error?(e) 28 | e.is_a?(Google::Apis::ServerError) && RETRYABLE_STATUS_CODE.include?(e.status_code) 29 | end 30 | 31 | def retryable_error_reason?(reason) 32 | RETRYABLE_ERROR_REASON.include?(reason) 33 | end 34 | 35 | def retryable_insert_errors_reason?(reason) 36 | RETRYABLE_INSERT_ERRORS_REASON.include?(reason) 37 | end 38 | 39 | def retryable_region_not_writable?(e) 40 | e.is_a?(Google::Apis::ClientError) && e.status_code == 400 && e.message.include?(REGION_NOT_WRITABLE_MESSAGE) 41 | end 42 | 43 | def retryable_ssl_unexpected_eof?(e) 44 | e.message.include?(SSL_UNEXPECTED_EOF_MESSAGE) 45 | end 46 | 47 | # Guard for instantiation 48 | private :new 49 | def inherited(subclass) 50 | subclass.class_eval do 51 | class << self 52 | public :new 53 | end 54 | end 55 | end 56 | end 57 | 58 | attr_reader :origin 59 | 60 | def initialize(message, origin = nil) 61 | @origin = origin 62 | super(message || origin.message) 63 | end 64 | 65 | def method_missing(name, *args) 66 | if @origin 67 | @origin.send(name, *args) 68 | else 69 | super 70 | end 71 | end 72 | 73 | def reason 74 | @origin && @origin.respond_to?(:reason) ? @origin.reason : nil 75 | end 76 | 77 | def status_code 78 | @origin && @origin.respond_to?(:status_code) ? @origin.status_code : nil 79 | end 80 | 81 | def body 82 | @origin && @origin.respond_to?(:body) ? @origin.body : nil 83 | end 84 | 85 | def retryable? 86 | false 87 | end 88 | end 89 | 90 | class UnRetryableError < Error; end 91 | 92 | class RetryableError < Error 93 | def retryable? 
94 | true 95 | end 96 | end 97 | end 98 | end 99 | -------------------------------------------------------------------------------- /lib/fluent/plugin/bigquery/helper.rb: -------------------------------------------------------------------------------- 1 | module Fluent 2 | module BigQuery 3 | module Helper 4 | class << self 5 | def deep_symbolize_keys(object) 6 | case object 7 | when Hash 8 | object.each_with_object({}) do |(key, value), result| 9 | result[key.to_sym] = deep_symbolize_keys(value) 10 | end 11 | when Array 12 | object.map {|e| deep_symbolize_keys(e) } 13 | else 14 | object 15 | end 16 | end 17 | 18 | def deep_stringify_keys(object) 19 | case object 20 | when Hash 21 | object.each_with_object({}) do |(key, value), result| 22 | result[key.to_s] = deep_stringify_keys(value) 23 | end 24 | when Array 25 | object.map {|e| deep_stringify_keys(e) } 26 | else 27 | object 28 | end 29 | end 30 | end 31 | end 32 | end 33 | end 34 | -------------------------------------------------------------------------------- /lib/fluent/plugin/bigquery/schema.rb: -------------------------------------------------------------------------------- 1 | require 'multi_json' 2 | 3 | module Fluent 4 | module BigQuery 5 | class FieldSchema 6 | def initialize(name, mode = :nullable) 7 | unless [:nullable, :required, :repeated].include?(mode) 8 | raise ConfigError, "Unrecognized mode for #{name}: #{mode}" 9 | end 10 | ### https://developers.google.com/bigquery/docs/tables 11 | # Each field has the following properties: 12 | # 13 | # name - The name must contain only letters (a-z, A-Z), numbers (0-9), or underscores (_), 14 | # and must start with a letter or underscore. The maximum length is 128 characters. 15 | # https://cloud.google.com/bigquery/docs/reference/v2/tables#schema.fields.name 16 | unless name =~ /^[_A-Za-z][_A-Za-z0-9]{,127}$/ 17 | raise ConfigError, "invalid bigquery field name: '#{name}'" 18 | end 19 | 20 | @name = name 21 | @mode = mode 22 | end 23 | 24 | attr_reader :name, :mode 25 | 26 | def format(value, is_load: false) 27 | case @mode 28 | when :nullable 29 | format_one(value, is_load: is_load) unless value.nil? 30 | when :required 31 | if value.nil? 32 | log.warn "Required field #{name} cannot be null" 33 | nil 34 | else 35 | format_one(value, is_load: is_load) 36 | end 37 | when :repeated 38 | value.nil? ? 
[] : value.each_with_object([]) { |v, arr| arr << format_one(v, is_load: true) if v } 39 | end 40 | end 41 | 42 | def format_one(value, is_load: false) 43 | raise NotImplementedError, "Must implement in a subclass" 44 | end 45 | 46 | def to_h 47 | { 48 | :name => name, 49 | :type => type.to_s.upcase, 50 | :mode => mode.to_s.upcase, 51 | } 52 | end 53 | end 54 | 55 | class StringFieldSchema < FieldSchema 56 | def type 57 | :string 58 | end 59 | 60 | def format_one(value, is_load: false) 61 | if value.is_a?(Hash) || value.is_a?(Array) 62 | MultiJson.dump(value) 63 | else 64 | value.to_s 65 | end 66 | end 67 | end 68 | 69 | class JsonFieldSchema < FieldSchema 70 | def type 71 | :json 72 | end 73 | 74 | def format_one(value, is_load: false) 75 | if is_load 76 | value 77 | else 78 | MultiJson.dump(value) 79 | end 80 | end 81 | end 82 | 83 | class GeographyFieldSchema < StringFieldSchema 84 | def type 85 | :geography 86 | end 87 | end 88 | 89 | class IntegerFieldSchema < FieldSchema 90 | def type 91 | :integer 92 | end 93 | 94 | def format_one(value, is_load: false) 95 | value.to_i 96 | end 97 | end 98 | 99 | class FloatFieldSchema < FieldSchema 100 | def type 101 | :float 102 | end 103 | 104 | def format_one(value, is_load: false) 105 | value.to_f 106 | end 107 | end 108 | 109 | class NumericFieldSchema < FieldSchema 110 | def type 111 | :numeric 112 | end 113 | 114 | def format_one(value, is_load: false) 115 | value.to_s 116 | end 117 | end 118 | 119 | class BigNumericFieldSchema < FieldSchema 120 | def type 121 | :bignumeric 122 | end 123 | 124 | def format_one(value, is_load: false) 125 | value.to_s 126 | end 127 | end 128 | 129 | class BooleanFieldSchema < FieldSchema 130 | def type 131 | :boolean 132 | end 133 | 134 | def format_one(value, is_load: false) 135 | !!value 136 | end 137 | end 138 | 139 | class TimestampFieldSchema < FieldSchema 140 | INTEGER_REGEXP = /\A-?[[:digit:]]+\z/.freeze 141 | FLOAT_REGEXP = /\A-?[[:digit:]]+(\.[[:digit:]]+)\z/.freeze 142 | 143 | def type 144 | :timestamp 145 | end 146 | 147 | def format_one(value, is_load: false) 148 | case value 149 | when Time 150 | value.strftime("%Y-%m-%d %H:%M:%S.%6L %:z") 151 | when String 152 | if value =~ INTEGER_REGEXP 153 | value.to_i 154 | elsif value =~ FLOAT_REGEXP 155 | value.to_f 156 | else 157 | value 158 | end 159 | else 160 | value 161 | end 162 | end 163 | end 164 | 165 | class DateFieldSchema < FieldSchema 166 | def type 167 | :date 168 | end 169 | 170 | def format_one(value, is_load: false) 171 | if value.respond_to?(:strftime) 172 | value.strftime("%Y-%m-%d") 173 | else 174 | value 175 | end 176 | end 177 | end 178 | 179 | class DateTimeFieldSchema < FieldSchema 180 | def type 181 | :datetime 182 | end 183 | 184 | def format_one(value, is_load: false) 185 | if value.respond_to?(:strftime) 186 | value.strftime("%Y-%m-%dT%H:%M:%S.%6L") 187 | else 188 | value 189 | end 190 | end 191 | end 192 | 193 | class TimeFieldSchema < FieldSchema 194 | def type 195 | :time 196 | end 197 | 198 | def format_one(value, is_load: false) 199 | if value.respond_to?(:strftime) 200 | value.strftime("%H:%M:%S.%6L") 201 | else 202 | value 203 | end 204 | end 205 | end 206 | 207 | class RecordSchema < FieldSchema 208 | FIELD_TYPES = { 209 | string: StringFieldSchema, 210 | integer: IntegerFieldSchema, 211 | float: FloatFieldSchema, 212 | numeric: NumericFieldSchema, 213 | bignumeric: BigNumericFieldSchema, 214 | boolean: BooleanFieldSchema, 215 | timestamp: TimestampFieldSchema, 216 | date: DateFieldSchema, 217 | datetime: 
DateTimeFieldSchema, 218 | time: TimeFieldSchema, 219 | json: JsonFieldSchema, 220 | geography: GeographyFieldSchema, 221 | record: RecordSchema 222 | }.freeze 223 | 224 | def initialize(name, mode = :nullable) 225 | super(name, mode) 226 | @fields = {} 227 | end 228 | 229 | def type 230 | :record 231 | end 232 | 233 | def [](name) 234 | @fields[name] 235 | end 236 | 237 | def empty? 238 | @fields.empty? 239 | end 240 | 241 | def to_a 242 | @fields.map do |_, field_schema| 243 | field_schema.to_h 244 | end 245 | end 246 | 247 | def to_h 248 | { 249 | :name => name, 250 | :type => type.to_s.upcase, 251 | :mode => mode.to_s.upcase, 252 | :fields => self.to_a, 253 | } 254 | end 255 | 256 | def load_schema(schema) 257 | schema.each do |field| 258 | raise ConfigError, 'field must have type' unless field.key?('type') 259 | 260 | name = field['name'] 261 | mode = (field['mode'] || 'nullable').downcase.to_sym 262 | 263 | type = field['type'].downcase.to_sym 264 | field_schema_class = FIELD_TYPES[type] 265 | raise ConfigError, "Invalid field type: #{field['type']}" unless field_schema_class 266 | 267 | field_schema = field_schema_class.new(name, mode) 268 | @fields[name] = field_schema 269 | if type == :record 270 | raise ConfigError, "record field must have fields" unless field.key?('fields') 271 | field_schema.load_schema(field['fields']) 272 | end 273 | end 274 | end 275 | 276 | def register_field(name, type) 277 | if @fields.key?(name) and @fields[name].type != :timestamp 278 | raise ConfigError, "field #{name} is registered twice" 279 | end 280 | if name[/\./] 281 | recordname = $` 282 | fieldname = $' 283 | register_record_field(recordname) 284 | @fields[recordname].register_field(fieldname, type) 285 | else 286 | schema = FIELD_TYPES[type] 287 | raise ConfigError, "[Bug] Invalid field type #{type}" unless schema 288 | @fields[name] = schema.new(name) 289 | end 290 | end 291 | 292 | def format_one(record, is_load: false) 293 | out = {} 294 | record.each do |key, value| 295 | next if value.nil? 296 | schema = @fields[key] 297 | out[key] = schema ? 
schema.format(value, is_load: is_load) : value 298 | end 299 | out 300 | end 301 | 302 | private 303 | def register_record_field(name) 304 | if !@fields.key?(name) 305 | @fields[name] = RecordSchema.new(name) 306 | else 307 | unless @fields[name].kind_of?(RecordSchema) 308 | raise ConfigError, "field #{name} is required to be a record but already registered as #{@field[name]}" 309 | end 310 | end 311 | end 312 | end 313 | end 314 | end 315 | -------------------------------------------------------------------------------- /lib/fluent/plugin/bigquery/version.rb: -------------------------------------------------------------------------------- 1 | module Fluent 2 | module BigQueryPlugin 3 | VERSION = "3.3.2".freeze 4 | end 5 | end 6 | -------------------------------------------------------------------------------- /lib/fluent/plugin/bigquery/writer.rb: -------------------------------------------------------------------------------- 1 | module Fluent 2 | module BigQuery 3 | class Writer 4 | def initialize(log, auth_method, **options) 5 | @auth_method = auth_method 6 | @scope = "https://www.googleapis.com/auth/bigquery" 7 | @options = options 8 | @log = log 9 | @num_errors_per_chunk = {} 10 | end 11 | 12 | def client 13 | @client ||= Google::Apis::BigqueryV2::BigqueryService.new.tap do |cl| 14 | cl.authorization = get_auth 15 | cl.client_options.open_timeout_sec = @options[:open_timeout_sec] if @options[:open_timeout_sec] 16 | cl.client_options.read_timeout_sec = @options[:timeout_sec] if @options[:timeout_sec] 17 | cl.client_options.send_timeout_sec = @options[:timeout_sec] if @options[:timeout_sec] 18 | end 19 | end 20 | 21 | def create_table(project, dataset, table_id, record_schema) 22 | create_table_retry_limit = 3 23 | create_table_retry_wait = 1 24 | create_table_retry_count = 0 25 | table_id = safe_table_id(table_id) 26 | 27 | begin 28 | definition = { 29 | table_reference: { 30 | table_id: table_id, 31 | }, 32 | schema: { 33 | fields: record_schema.to_a, 34 | } 35 | } 36 | 37 | definition.merge!(time_partitioning: time_partitioning) if time_partitioning 38 | definition.merge!(require_partition_filter: require_partition_filter) if require_partition_filter 39 | definition.merge!(clustering: clustering) if clustering 40 | client.insert_table(project, dataset, definition, **{}) 41 | log.debug "create table", project_id: project, dataset: dataset, table: table_id 42 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 43 | message = e.message 44 | if e.status_code == 409 && /Already Exists:/ =~ message 45 | log.debug "already created table", project_id: project, dataset: dataset, table: table_id 46 | # ignore 'Already Exists' error 47 | return 48 | end 49 | 50 | log.error "tables.insert API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: message 51 | 52 | if create_table_retry_count < create_table_retry_limit 53 | sleep create_table_retry_wait 54 | create_table_retry_wait *= 2 55 | create_table_retry_count += 1 56 | retry 57 | else 58 | raise Fluent::BigQuery::UnRetryableError.new("failed to create table in bigquery", e) 59 | end 60 | end 61 | end 62 | 63 | def fetch_schema(project, dataset, table_id) 64 | res = client.get_table(project, dataset, table_id) 65 | schema = Fluent::BigQuery::Helper.deep_stringify_keys(res.schema.to_h[:fields]) 66 | log.debug "Load schema from BigQuery: #{project}:#{dataset}.#{table_id} #{schema}" 67 | 68 | schema 69 | rescue Google::Apis::ServerError, 
Google::Apis::ClientError, Google::Apis::AuthorizationError => e 70 | message = e.message 71 | log.error "tables.get API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: message 72 | nil 73 | end 74 | 75 | def insert_rows(project, dataset, table_id, rows, schema, template_suffix: nil) 76 | body = { 77 | rows: rows, 78 | skip_invalid_rows: @options[:skip_invalid_rows], 79 | ignore_unknown_values: @options[:ignore_unknown_values], 80 | } 81 | body.merge!(template_suffix: template_suffix) if template_suffix 82 | 83 | if @options[:auto_create_table] 84 | res = insert_all_table_data_with_create_table(project, dataset, table_id, body, schema) 85 | else 86 | res = client.insert_all_table_data(project, dataset, table_id, body, **{}) 87 | end 88 | log.debug "insert rows", project_id: project, dataset: dataset, table: table_id, count: rows.size 89 | 90 | if res.insert_errors && !res.insert_errors.empty? 91 | log.warn "insert errors", project_id: project, dataset: dataset, table: table_id, insert_errors: res.insert_errors.to_s 92 | if @options[:allow_retry_insert_errors] 93 | is_included_any_retryable_insert_error = res.insert_errors.any? do |insert_error| 94 | insert_error.errors.any? { |error| Fluent::BigQuery::Error.retryable_insert_errors_reason?(error.reason) } 95 | end 96 | if is_included_any_retryable_insert_error 97 | raise Fluent::BigQuery::RetryableError.new("failed to insert into bigquery(insert errors), retry") 98 | else 99 | raise Fluent::BigQuery::UnRetryableError.new("failed to insert into bigquery(insert errors), and cannot retry") 100 | end 101 | end 102 | end 103 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 104 | log.debug "insert error: #{e.message}", status_code: e.respond_to?(:status_code) ? e.status_code : nil, reason: e.respond_to?(:reason) ? e.reason : nil 105 | error_data = { project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message } 106 | wrapped = Fluent::BigQuery::Error.wrap(e) 107 | if wrapped.retryable? 108 | log.warn "tabledata.insertAll API", error_data 109 | else 110 | log.error "tabledata.insertAll API", error_data 111 | end 112 | 113 | raise wrapped 114 | end 115 | 116 | JobReference = Struct.new(:chunk_id, :chunk_id_hex, :project_id, :dataset_id, :table_id, :job_id, :location) do 117 | def as_hash(*keys) 118 | if keys.empty? 119 | to_h 120 | else 121 | to_h.select { |k, _| keys.include?(k) } 122 | end 123 | end 124 | end 125 | 126 | def create_load_job(chunk_id, chunk_id_hex, project, dataset, table_id, upload_source, fields) 127 | configuration = { 128 | configuration: { 129 | load: { 130 | destination_table: { 131 | project_id: project, 132 | dataset_id: dataset, 133 | table_id: table_id, 134 | }, 135 | write_disposition: "WRITE_APPEND", 136 | source_format: source_format, 137 | ignore_unknown_values: @options[:ignore_unknown_values], 138 | max_bad_records: @options[:max_bad_records], 139 | } 140 | } 141 | } 142 | 143 | job_id = create_job_id(chunk_id_hex, dataset, table_id, fields.to_a) if @options[:prevent_duplicate_load] 144 | configuration.merge!({job_reference: {project_id: project, job_id: job_id}}) if job_id 145 | 146 | begin 147 | # Check table existance and use its location for the result when the load jobs is duplicated. 
148 | table = client.get_table(project, dataset, table_id) 149 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 150 | if e.status_code == 404 && /Not Found: Table/i =~ e.message 151 | raise Fluent::BigQuery::UnRetryableError.new("Table is not found") unless @options[:auto_create_table] 152 | raise Fluent::BigQuery::UnRetryableError.new("Schema is empty") if fields.empty? 153 | configuration[:configuration][:load].merge!(schema: {fields: fields.to_a}) 154 | configuration[:configuration][:load].merge!(time_partitioning: time_partitioning) if time_partitioning 155 | configuration[:configuration][:load].merge!(clustering: clustering) if clustering 156 | end 157 | end 158 | 159 | res = client.insert_job( 160 | project, 161 | configuration, 162 | upload_source: upload_source, 163 | content_type: "application/octet-stream", 164 | ) 165 | JobReference.new(chunk_id, chunk_id_hex, project, dataset, table_id, res.job_reference.job_id, res.job_reference.location) 166 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 167 | log.error "job.load API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message 168 | 169 | if job_id && e.status_code == 409 && e.message =~ /Job/ # duplicate load job 170 | # If a load job is duplicated, the API response may not be available to create the result. 171 | # Therefore, we need to use the location of the table instead of the job's location to determine the result. 172 | return JobReference.new(chunk_id, chunk_id_hex, project, dataset, table_id, job_id, table.location) 173 | end 174 | 175 | raise Fluent::BigQuery::Error.wrap(e) 176 | end 177 | 178 | def fetch_load_job(job_reference) 179 | project = job_reference.project_id 180 | job_id = job_reference.job_id 181 | location = job_reference.location 182 | 183 | res = client.get_job(project, job_id, location: location) 184 | log.debug "load job fetched", id: job_id, state: res.status.state, **job_reference.as_hash(:project_id, :dataset_id, :table_id) 185 | 186 | if res.status.state == "DONE" 187 | res 188 | end 189 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 190 | e = Fluent::BigQuery::Error.wrap(e) 191 | raise e unless e.retryable? 
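# When the error is retryable we fall through and return nil: the job state could not be confirmed as DONE on this attempt.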
192 | end 193 | 194 | def commit_load_job(chunk_id_hex, response) 195 | job_id = response.id 196 | project = response.configuration.load.destination_table.project_id 197 | dataset = response.configuration.load.destination_table.dataset_id 198 | table_id = response.configuration.load.destination_table.table_id 199 | 200 | errors = response.status.errors 201 | if errors 202 | errors.each do |e| 203 | log.error "job.load API (rows)", job_id: job_id, project_id: project, dataset: dataset, table: table_id, message: e.message, reason: e.reason 204 | end 205 | end 206 | 207 | error_result = response.status.error_result 208 | if error_result 209 | log.error "job.load API (result)", job_id: job_id, project_id: project, dataset: dataset, table: table_id, message: error_result.message, reason: error_result.reason 210 | if Fluent::BigQuery::Error.retryable_error_reason?(error_result.reason) 211 | @num_errors_per_chunk[chunk_id_hex] = @num_errors_per_chunk[chunk_id_hex].to_i + 1 212 | raise Fluent::BigQuery::RetryableError.new("failed to load into bigquery, retry") 213 | else 214 | @num_errors_per_chunk.delete(chunk_id_hex) 215 | raise Fluent::BigQuery::UnRetryableError.new("failed to load into bigquery, and cannot retry") 216 | end 217 | end 218 | 219 | # `stats` can be nil if we receive a warning like "Warning: Load job succeeded with data imported, however statistics may be lost due to internal error." 220 | stats = response.statistics.load 221 | duration = (response.statistics.end_time - response.statistics.creation_time) / 1000.0 222 | log.debug "load job finished", id: job_id, state: response.status.state, input_file_bytes: stats&.input_file_bytes, input_files: stats&.input_files, output_bytes: stats&.output_bytes, output_rows: stats&.output_rows, bad_records: stats&.bad_records, duration: duration.round(2), project_id: project, dataset: dataset, table: table_id 223 | @num_errors_per_chunk.delete(chunk_id_hex) 224 | end 225 | 226 | private 227 | 228 | def log 229 | @log 230 | end 231 | 232 | def get_auth 233 | case @auth_method 234 | when :private_key 235 | get_auth_from_private_key 236 | when :compute_engine 237 | get_auth_from_compute_engine 238 | when :json_key 239 | get_auth_from_json_key 240 | when :application_default 241 | get_auth_from_application_default 242 | else 243 | raise ConfigError, "Unknown auth method: #{@auth_method}" 244 | end 245 | end 246 | 247 | def get_auth_from_private_key 248 | require 'google/api_client/auth/key_utils' 249 | private_key_path = @options[:private_key_path] 250 | private_key_passphrase = @options[:private_key_passphrase] 251 | email = @options[:email] 252 | 253 | key = Google::APIClient::KeyUtils.load_from_pkcs12(private_key_path, private_key_passphrase) 254 | Signet::OAuth2::Client.new( 255 | token_credential_uri: "https://accounts.google.com/o/oauth2/token", 256 | audience: "https://accounts.google.com/o/oauth2/token", 257 | scope: @scope, 258 | issuer: email, 259 | signing_key: key 260 | ) 261 | end 262 | 263 | def get_auth_from_compute_engine 264 | Google::Auth::GCECredentials.new 265 | end 266 | 267 | def get_auth_from_json_key 268 | json_key = @options[:json_key] 269 | 270 | begin 271 | JSON.parse(json_key) 272 | key = StringIO.new(json_key) 273 | Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: key, scope: @scope) 274 | rescue JSON::ParserError 275 | key = json_key 276 | File.open(json_key) do |f| 277 | Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: f, scope: @scope) 278 | end 279 | end 280 | end 281 | 282 | def 
get_auth_from_application_default 283 | Google::Auth.get_application_default([@scope]) 284 | end 285 | 286 | def safe_table_id(table_id) 287 | table_id.gsub(/\$\d+$/, "") 288 | end 289 | 290 | def create_job_id(chunk_id_hex, dataset, table, schema) 291 | job_id_key = "#{chunk_id_hex}#{dataset}#{table}#{schema.to_s}#{@options[:max_bad_records]}#{@options[:ignore_unknown_values]}#{@num_errors_per_chunk[chunk_id_hex]}" 292 | @log.debug "job_id_key: #{job_id_key}" 293 | "fluentd_job_" + Digest::SHA1.hexdigest(job_id_key) 294 | end 295 | 296 | def source_format 297 | case @options[:source_format] 298 | when :json 299 | "NEWLINE_DELIMITED_JSON" 300 | when :avro 301 | "AVRO" 302 | when :csv 303 | "CSV" 304 | else 305 | "NEWLINE_DELIMITED_JSON" 306 | end 307 | end 308 | 309 | def time_partitioning 310 | return @time_partitioning if instance_variable_defined?(:@time_partitioning) 311 | 312 | if @options[:time_partitioning_type] 313 | @time_partitioning = { 314 | type: @options[:time_partitioning_type].to_s.upcase, 315 | field: @options[:time_partitioning_field] ? @options[:time_partitioning_field].to_s : nil, 316 | expiration_ms: @options[:time_partitioning_expiration] ? @options[:time_partitioning_expiration] * 1000 : nil, 317 | }.reject { |_, v| v.nil? } 318 | else 319 | @time_partitioning 320 | end 321 | end 322 | 323 | def require_partition_filter 324 | return @require_partition_filter if instance_variable_defined?(:@require_partition_filter) 325 | 326 | if @options[:require_partition_filter] 327 | @require_partition_filter = @options[:require_partition_filter] 328 | else 329 | @require_partition_filter 330 | end 331 | end 332 | 333 | def clustering 334 | return @clustering if instance_variable_defined?(:@clustering) 335 | 336 | if @options[:clustering_fields] 337 | @clustering = { 338 | fields: @options[:clustering_fields] 339 | } 340 | else 341 | @clustering 342 | end 343 | end 344 | 345 | def insert_all_table_data_with_create_table(project, dataset, table_id, body, schema) 346 | try_count ||= 1 347 | res = client.insert_all_table_data(project, dataset, table_id, body, **{}) 348 | rescue Google::Apis::ClientError => e 349 | if e.status_code == 404 && /Not Found: Table/i =~ e.message 350 | if try_count == 1 351 | # Table Not Found: Auto Create Table 352 | create_table(project, dataset, table_id, schema) 353 | elsif try_count > 60 # timeout in about 300 seconds 354 | raise "A new table was created but it is not found." 355 | end 356 | 357 | # Retry to insert several times because the created table is not visible from Streaming insert for a little while 358 | # cf. 
https://cloud.google.com/bigquery/troubleshooting-errors#metadata-errors-for-streaming-inserts 359 | try_count += 1 360 | sleep 5 361 | log.debug "Retry to insert rows", project_id: project, dataset: dataset, table: table_id 362 | retry 363 | end 364 | raise 365 | end 366 | end 367 | end 368 | end 369 | -------------------------------------------------------------------------------- /lib/fluent/plugin/out_bigquery_base.rb: -------------------------------------------------------------------------------- 1 | require 'fluent/plugin/output' 2 | 3 | require 'fluent/plugin/bigquery/version' 4 | 5 | require 'fluent/plugin/bigquery/helper' 6 | require 'fluent/plugin/bigquery/errors' 7 | require 'fluent/plugin/bigquery/schema' 8 | require 'fluent/plugin/bigquery/writer' 9 | 10 | require 'multi_json' 11 | require 'google/apis/bigquery_v2' 12 | require 'googleauth' 13 | 14 | module Fluent 15 | module Plugin 16 | # This class is abstract class 17 | class BigQueryBaseOutput < Output 18 | helpers :inject, :formatter 19 | 20 | # Available methods are: 21 | # * private_key -- Use service account credential from pkcs12 private key file 22 | # * compute_engine -- Use access token available in instances of ComputeEngine 23 | # * json_key -- Use service account credential from JSON key 24 | # * application_default -- Use application default credential 25 | config_param :auth_method, :enum, list: [:private_key, :compute_engine, :json_key, :application_default], default: :private_key 26 | 27 | ### Service Account credential 28 | config_param :email, :string, default: nil 29 | config_param :private_key_path, :string, default: nil 30 | config_param :private_key_passphrase, :string, default: 'notasecret', secret: true 31 | config_param :json_key, default: nil, secret: true 32 | 33 | # see as simple reference 34 | # https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb 35 | config_param :project, :string 36 | 37 | # dataset_name 38 | # The name can be up to 1,024 characters long, and consist of A-Z, a-z, 0-9, and the underscore, 39 | # but it cannot start with a number or underscore, or have spaces. 40 | config_param :dataset, :string 41 | 42 | # table_id 43 | # In Table ID, enter a name for your new table. Naming rules are the same as for your dataset. 44 | config_param :table, :string, default: nil 45 | config_param :tables, :array, value_type: :string, default: nil 46 | 47 | config_param :auto_create_table, :bool, default: false 48 | 49 | # ignore_unknown_values 50 | # Accept rows that contain values that do not match the schema. The unknown values are ignored. 51 | # Default is false, which treats unknown values as errors. 
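# For reference, a minimal configuration sketch (values are placeholders, not defaults):
#
#   <match my.logs.**>
#     @type bigquery_insert
#     auth_method json_key
#     json_key /path/to/service_account.json
#     project yourproject_id
#     dataset yourdataset_id
#     table   access_log
#     fetch_schema true
#     ignore_unknown_values true
#   </match>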
52 | config_param :ignore_unknown_values, :bool, default: false 53 | 54 | config_param :schema, :array, default: nil 55 | config_param :schema_path, :string, default: nil 56 | config_param :fetch_schema, :bool, default: false 57 | config_param :fetch_schema_table, :string, default: nil 58 | config_param :schema_cache_expire, :time, default: 600 59 | 60 | ## Timeout 61 | # request_timeout_sec 62 | # Bigquery API response timeout 63 | # request_open_timeout_sec 64 | # Bigquery API connection, and request timeout 65 | config_param :request_timeout_sec, :time, default: nil 66 | config_param :request_open_timeout_sec, :time, default: 60 67 | 68 | ## Partitioning 69 | config_param :time_partitioning_type, :enum, list: [:day, :hour], default: nil 70 | config_param :time_partitioning_field, :string, default: nil 71 | config_param :time_partitioning_expiration, :time, default: nil 72 | 73 | ## Clustering 74 | config_param :clustering_fields, :array, default: nil 75 | 76 | ## Formatter 77 | config_section :format do 78 | config_set_default :@type, 'json' 79 | end 80 | 81 | def configure(conf) 82 | super 83 | 84 | case @auth_method 85 | when :private_key 86 | unless @email && @private_key_path 87 | raise Fluent::ConfigError, "'email' and 'private_key_path' must be specified if auth_method == 'private_key'" 88 | end 89 | when :compute_engine 90 | # Do nothing 91 | when :json_key 92 | unless @json_key 93 | raise Fluent::ConfigError, "'json_key' must be specified if auth_method == 'json_key'" 94 | end 95 | when :application_default 96 | # Do nothing 97 | else 98 | raise Fluent::ConfigError, "unrecognized 'auth_method': #{@auth_method}" 99 | end 100 | 101 | unless @table.nil? ^ @tables.nil? 102 | raise Fluent::ConfigError, "'table' or 'tables' must be specified, and both are invalid" 103 | end 104 | 105 | @tablelist = @tables ? @tables : [@table] 106 | 107 | @table_schema = Fluent::BigQuery::RecordSchema.new('record') 108 | if @schema 109 | @table_schema.load_schema(@schema) 110 | end 111 | 112 | formatter_config = conf.elements("format")[0] 113 | @formatter = formatter_create(usage: 'out_bigquery_for_insert', default_type: 'json', conf: formatter_config) 114 | end 115 | 116 | def start 117 | super 118 | 119 | @tables_queue = @tablelist.shuffle 120 | @tables_mutex = Mutex.new 121 | @fetched_schemas = {} 122 | @last_fetch_schema_time = Hash.new(0) 123 | @read_schemas = {} 124 | end 125 | 126 | def multi_workers_ready? 127 | true 128 | end 129 | 130 | def writer 131 | @writer ||= Fluent::BigQuery::Writer.new(@log, @auth_method, 132 | private_key_path: @private_key_path, private_key_passphrase: @private_key_passphrase, 133 | email: @email, 134 | json_key: @json_key, 135 | source_format: @source_format, 136 | skip_invalid_rows: @skip_invalid_rows, 137 | ignore_unknown_values: @ignore_unknown_values, 138 | max_bad_records: @max_bad_records, 139 | allow_retry_insert_errors: @allow_retry_insert_errors, 140 | prevent_duplicate_load: @prevent_duplicate_load, 141 | auto_create_table: @auto_create_table, 142 | time_partitioning_type: @time_partitioning_type, 143 | time_partitioning_field: @time_partitioning_field, 144 | time_partitioning_expiration: @time_partitioning_expiration, 145 | require_partition_filter: @require_partition_filter, 146 | clustering_fields: @clustering_fields, 147 | timeout_sec: @request_timeout_sec, 148 | open_timeout_sec: @request_open_timeout_sec, 149 | ) 150 | end 151 | 152 | def format(tag, time, record) 153 | if record.nil? 154 | log.warn("nil record detected. corrupted chunks? 
tag=#{tag}, time=#{time}") 155 | return 156 | end 157 | 158 | record = inject_values_to_record(tag, time, record) 159 | 160 | meta = metadata(tag, time, record) 161 | schema = 162 | if @fetch_schema 163 | fetch_schema(meta) 164 | elsif @schema_path 165 | read_schema(meta) 166 | else 167 | @table_schema 168 | end 169 | 170 | begin 171 | row = schema.format(record, is_load: !!@is_load) 172 | return if row.empty? 173 | @formatter.format(tag, time, row) 174 | rescue 175 | log.error("format error", record: record, schema: schema) 176 | raise 177 | end 178 | end 179 | 180 | def write(chunk) 181 | end 182 | 183 | def fetch_schema(metadata) 184 | table_id = nil 185 | project = extract_placeholders(@project, metadata) 186 | dataset = extract_placeholders(@dataset, metadata) 187 | table_id = fetch_schema_target_table(metadata) 188 | 189 | if Fluent::Engine.now - @last_fetch_schema_time["#{project}.#{dataset}.#{table_id}"] > @schema_cache_expire 190 | schema = writer.fetch_schema(project, dataset, table_id) 191 | 192 | if schema 193 | table_schema = Fluent::BigQuery::RecordSchema.new("record") 194 | table_schema.load_schema(schema) 195 | @fetched_schemas["#{project}.#{dataset}.#{table_id}"] = table_schema 196 | else 197 | if @fetched_schemas["#{project}.#{dataset}.#{table_id}"].nil? 198 | raise "failed to fetch schema from bigquery" 199 | else 200 | log.warn "#{table_id} uses previous schema" 201 | end 202 | end 203 | 204 | @last_fetch_schema_time["#{project}.#{dataset}.#{table_id}"] = Fluent::Engine.now 205 | end 206 | 207 | @fetched_schemas["#{project}.#{dataset}.#{table_id}"] 208 | end 209 | 210 | def fetch_schema_target_table(metadata) 211 | extract_placeholders(@fetch_schema_table || @tablelist[0], metadata) 212 | end 213 | 214 | def read_schema(metadata) 215 | schema_path = read_schema_target_path(metadata) 216 | 217 | unless @read_schemas[schema_path] 218 | table_schema = Fluent::BigQuery::RecordSchema.new("record") 219 | table_schema.load_schema(MultiJson.load(File.read(schema_path))) 220 | @read_schemas[schema_path] = table_schema 221 | end 222 | @read_schemas[schema_path] 223 | end 224 | 225 | def read_schema_target_path(metadata) 226 | extract_placeholders(@schema_path, metadata) 227 | end 228 | 229 | def get_schema(project, dataset, metadata) 230 | if @fetch_schema 231 | @fetched_schemas["#{project}.#{dataset}.#{fetch_schema_target_table(metadata)}"] || fetch_schema(metadata) 232 | elsif @schema_path 233 | @read_schemas[read_schema_target_path(metadata)] || read_schema(metadata) 234 | else 235 | @table_schema 236 | end 237 | end 238 | end 239 | end 240 | end 241 | -------------------------------------------------------------------------------- /lib/fluent/plugin/out_bigquery_insert.rb: -------------------------------------------------------------------------------- 1 | require 'fluent/plugin/out_bigquery_base' 2 | 3 | module Fluent 4 | module Plugin 5 | class BigQueryInsertOutput < BigQueryBaseOutput 6 | Fluent::Plugin.register_output('bigquery_insert', self) 7 | 8 | helpers :record_accessor 9 | 10 | # template_suffix (only insert) 11 | # https://cloud.google.com/bigquery/streaming-data-into-bigquery#template_table_details 12 | config_param :template_suffix, :string, default: nil 13 | 14 | # skip_invalid_rows (only insert) 15 | # Insert all valid rows of a request, even if invalid rows exist. 16 | # The default value is false, which causes the entire request to fail if any invalid rows exist. 
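# This flag (together with `ignore_unknown_values`) is handed straight to the
# tabledata.insertAll request body built in Fluent::BigQuery::Writer#insert_rows, roughly:
#   { rows: [...], skip_invalid_rows: @options[:skip_invalid_rows], ignore_unknown_values: @options[:ignore_unknown_values] }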
17 | config_param :skip_invalid_rows, :bool, default: false 18 | 19 | # insert_id_field (only insert) 20 | config_param :insert_id_field, :string, default: nil 21 | 22 | # add_insert_timestamp (only insert) 23 | # adds a timestamp just before sending the rows to bigquery, so that 24 | # buffering time is not taken into account. Gives a field in bigquery 25 | # which represents the insert time of the row. 26 | config_param :add_insert_timestamp, :string, default: nil 27 | 28 | # allow_retry_insert_errors (only insert) 29 | # If insert_id_field is not specified, true means to allow duplicate rows 30 | config_param :allow_retry_insert_errors, :bool, default: false 31 | 32 | ## RequirePartitionFilter 33 | config_param :require_partition_filter, :bool, default: false 34 | 35 | ## Buffer 36 | config_section :buffer do 37 | config_set_default :@type, "memory" 38 | config_set_default :flush_mode, :interval 39 | config_set_default :flush_interval, 1 40 | config_set_default :flush_thread_interval, 0.05 41 | config_set_default :flush_thread_burst_interval, 0.05 42 | config_set_default :chunk_limit_size, 1 * 1024 ** 2 # 1MB 43 | config_set_default :total_limit_size, 1 * 1024 ** 3 # 1GB 44 | config_set_default :chunk_limit_records, 500 45 | end 46 | 47 | def configure(conf) 48 | super 49 | @is_load = false 50 | 51 | if @insert_id_field 52 | if @insert_id_field !~ /^\$[\[\.]/ && @insert_id_field =~ /\./ 53 | warn "[BREAKING CHANGE] insert_id_field format is changed. Use fluentd record_accessor helper. (https://docs.fluentd.org/v1.0/articles/api-plugin-helper-record_accessor)" 54 | end 55 | @get_insert_id = record_accessor_create(@insert_id_field) 56 | end 57 | 58 | formatter_config = conf.elements("format")[0] 59 | if formatter_config && formatter_config['@type'] != "json" 60 | raise ConfigError, "`bigquery_insert` supports only json formatter." 61 | end 62 | @formatter = formatter_create(usage: 'out_bigquery_for_insert', type: 'json', conf: formatter_config) 63 | 64 | placeholder_params = "project=#{@project}/dataset=#{@dataset}/table=#{@tablelist.join(",")}/fetch_schema_table=#{@fetch_schema_table}/template_suffix=#{@template_suffix}" 65 | placeholder_validate!(:bigquery_insert, placeholder_params) 66 | end 67 | 68 | # for Fluent::Plugin::Output#implement? method 69 | def format(tag, time, record) 70 | super 71 | end 72 | 73 | def write(chunk) 74 | table_format = @tables_mutex.synchronize do 75 | t = @tables_queue.shift 76 | @tables_queue.push t 77 | t 78 | end 79 | 80 | now = Time.now.utc.strftime("%Y-%m-%d %H:%M:%S.%6N") if @add_insert_timestamp 81 | 82 | rows = chunk.open do |io| 83 | io.map do |line| 84 | record = MultiJson.load(line) 85 | record[@add_insert_timestamp] = now if @add_insert_timestamp 86 | row = {"json" => record} 87 | row["insert_id"] = @get_insert_id.call(record) if @get_insert_id 88 | Fluent::BigQuery::Helper.deep_symbolize_keys(row) 89 | end 90 | end 91 | 92 | metadata = chunk.metadata 93 | project = extract_placeholders(@project, metadata) 94 | dataset = extract_placeholders(@dataset, metadata) 95 | table_id = extract_placeholders(table_format, metadata) 96 | template_suffix = @template_suffix ? 
extract_placeholders(@template_suffix, metadata) : nil 97 | schema = get_schema(project, dataset, metadata) 98 | 99 | insert(project, dataset, table_id, rows, schema, template_suffix) 100 | rescue MultiJson::ParseError => e 101 | raise Fluent::UnrecoverableError.new(e) 102 | end 103 | 104 | def insert(project, dataset, table_id, rows, schema, template_suffix) 105 | writer.insert_rows(project, dataset, table_id, rows, schema, template_suffix: template_suffix) 106 | rescue Fluent::BigQuery::Error => e 107 | raise if e.retryable? 108 | 109 | if @secondary 110 | # TODO: find better way 111 | @retry = retry_state_create( 112 | :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout, 113 | forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base, 114 | max_interval: @buffer_config.retry_max_interval, 115 | secondary: true, secondary_threshold: Float::EPSILON, 116 | randomize: @buffer_config.retry_randomize 117 | ) 118 | else 119 | @retry = retry_state_create( 120 | :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout, 121 | forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base, 122 | max_interval: @buffer_config.retry_max_interval, 123 | randomize: @buffer_config.retry_randomize 124 | ) 125 | end 126 | 127 | raise 128 | end 129 | end 130 | end 131 | end 132 | -------------------------------------------------------------------------------- /lib/fluent/plugin/out_bigquery_load.rb: -------------------------------------------------------------------------------- 1 | require 'fluent/plugin/out_bigquery_base' 2 | 3 | module Fluent 4 | module Plugin 5 | class BigQueryLoadOutput < BigQueryBaseOutput 6 | Fluent::Plugin.register_output('bigquery_load', self) 7 | 8 | helpers :timer 9 | 10 | config_param :source_format, :enum, list: [:json, :avro, :csv], default: :json 11 | 12 | # max_bad_records (only load) 13 | # The maximum number of bad records that BigQuery can ignore when running the job. 14 | # If the number of bad records exceeds this value, an invalid error is returned in the job result. 15 | # The default value is 0, which requires that all records are valid. 
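# These load-only options end up in the job configuration assembled by
# Fluent::BigQuery::Writer#create_load_job; with the defaults it looks roughly like:
#   load: { write_disposition: "WRITE_APPEND", source_format: "NEWLINE_DELIMITED_JSON",
#           ignore_unknown_values: false, max_bad_records: 0, ... }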
16 | config_param :max_bad_records, :integer, default: 0 17 | 18 | # prevent_duplicate_load (only load) 19 | config_param :prevent_duplicate_load, :bool, default: false 20 | 21 | config_param :use_delayed_commit, :bool, default: true 22 | config_param :wait_job_interval, :time, default: 3 23 | 24 | ## Buffer 25 | config_section :buffer do 26 | config_set_default :@type, "file" 27 | config_set_default :flush_mode, :interval 28 | config_set_default :flush_interval, 3600 # 1h 29 | config_set_default :flush_thread_interval, 5 30 | config_set_default :flush_thread_burst_interval, 5 31 | config_set_default :chunk_limit_size, 1 * 1024 ** 3 # 1GB 32 | config_set_default :total_limit_size, 32 * 1024 ** 3 # 32GB 33 | 34 | config_set_default :delayed_commit_timeout, 1800 # 30m 35 | end 36 | 37 | def configure(conf) 38 | super 39 | @is_load = true 40 | 41 | placeholder_params = "project=#{@project}/dataset=#{@dataset}/table=#{@tablelist.join(",")}/fetch_schema_table=#{@fetch_schema_table}" 42 | placeholder_validate!(:bigquery_load, placeholder_params) 43 | end 44 | 45 | def start 46 | super 47 | 48 | if prefer_delayed_commit 49 | @polling_targets = [] 50 | @polling_mutex = Mutex.new 51 | log.debug("start load job polling") 52 | timer_execute(:polling_bigquery_load_job, @wait_job_interval, &method(:poll)) 53 | end 54 | end 55 | 56 | def prefer_delayed_commit 57 | @use_delayed_commit 58 | end 59 | 60 | # for Fluent::Plugin::Output#implement? method 61 | def format(tag, time, record) 62 | super 63 | end 64 | 65 | def write(chunk) 66 | job_reference = do_write(chunk) 67 | 68 | until response = writer.fetch_load_job(job_reference) 69 | sleep @wait_job_interval 70 | end 71 | 72 | writer.commit_load_job(job_reference.chunk_id_hex, response) 73 | rescue Fluent::BigQuery::Error => e 74 | raise if e.retryable? 75 | 76 | @retry_mutex.synchronize do 77 | if @secondary 78 | # TODO: find better way 79 | @retry = retry_state_create( 80 | :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout, 81 | forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base, 82 | max_interval: @buffer_config.retry_max_interval, 83 | secondary: true, secondary_threshold: Float::EPSILON, 84 | randomize: @buffer_config.retry_randomize 85 | ) 86 | else 87 | @retry = retry_state_create( 88 | :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout, 89 | forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base, 90 | max_interval: @buffer_config.retry_max_interval, 91 | randomize: @buffer_config.retry_randomize 92 | ) 93 | end 94 | end 95 | 96 | raise 97 | end 98 | 99 | def try_write(chunk) 100 | job_reference = do_write(chunk) 101 | @polling_mutex.synchronize do 102 | @polling_targets << job_reference 103 | end 104 | rescue Fluent::BigQuery::Error => e 105 | raise if e.retryable? 
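# non-retryable: as in #write above, build a custom retry state (secondary-aware when
# <secondary> is configured, otherwise with max_steps: 0) and re-raise so the chunk is
# taken back and handled by the next try_flush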
106 | 107 | @retry_mutex.synchronize do 108 | if @secondary 109 | # TODO: find better way 110 | @retry = retry_state_create( 111 | :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout, 112 | forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base, 113 | max_interval: @buffer_config.retry_max_interval, 114 | secondary: true, secondary_threshold: Float::EPSILON, 115 | randomize: @buffer_config.retry_randomize 116 | ) 117 | else 118 | @retry = retry_state_create( 119 | :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout, 120 | forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base, 121 | max_interval: @buffer_config.retry_max_interval, 122 | randomize: @buffer_config.retry_randomize 123 | ) 124 | end 125 | end 126 | 127 | raise 128 | end 129 | 130 | private 131 | 132 | def do_write(chunk) 133 | table_format = @tables_mutex.synchronize do 134 | t = @tables_queue.shift 135 | @tables_queue.push t 136 | t 137 | end 138 | 139 | metadata = chunk.metadata 140 | project = extract_placeholders(@project, metadata) 141 | dataset = extract_placeholders(@dataset, metadata) 142 | table_id = extract_placeholders(table_format, metadata) 143 | schema = get_schema(project, dataset, metadata) 144 | 145 | create_upload_source(chunk) do |upload_source| 146 | writer.create_load_job(chunk.unique_id, dump_unique_id_hex(chunk.unique_id), project, dataset, table_id, upload_source, schema) 147 | end 148 | end 149 | 150 | def poll 151 | job_reference = @polling_mutex.synchronize do 152 | @polling_targets.shift 153 | end 154 | return unless job_reference 155 | 156 | begin 157 | response = writer.fetch_load_job(job_reference) 158 | if response 159 | writer.commit_load_job(job_reference.chunk_id_hex, response) 160 | commit_write(job_reference.chunk_id) 161 | log.debug("commit chunk", chunk: job_reference.chunk_id_hex, **job_reference.as_hash(:job_id, :project_id, :dataset_id, :table_id)) 162 | else 163 | @polling_mutex.synchronize do 164 | @polling_targets << job_reference 165 | end 166 | end 167 | rescue Fluent::BigQuery::Error => e 168 | # RetryableError comes only from `commit_load_job` 169 | # if the error is retryable, take back the chunk and do the next `try_flush` 170 | # if the error is not retryable, create a custom retry_state, take back the chunk, and do the next `try_flush` 171 | if e.retryable? 
172 | log.warn("failed to poll load job", error: e, chunk: job_reference.chunk_id_hex, **job_reference.as_hash(:job_id, :project_id, :dataset_id, :table_id)) 173 | else 174 | log.error("failed to poll load job", error: e, chunk: job_reference.chunk_id_hex, **job_reference.as_hash(:job_id, :project_id, :dataset_id, :table_id)) 175 | @retry_mutex.synchronize do 176 | if @secondary 177 | # TODO: find better way 178 | @retry = retry_state_create( 179 | :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout, 180 | forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base, 181 | max_interval: @buffer_config.retry_max_interval, 182 | secondary: true, secondary_threshold: Float::EPSILON, 183 | randomize: @buffer_config.retry_randomize 184 | ) 185 | else 186 | @retry = retry_state_create( 187 | :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout, 188 | forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base, 189 | max_interval: @buffer_config.retry_max_interval, 190 | randomize: @buffer_config.retry_randomize 191 | ) 192 | end 193 | end 194 | end 195 | 196 | rollback_write(job_reference.chunk_id) 197 | rescue => e 198 | log.error("unexpected error while polling", error: e) 199 | log.error_backtrace 200 | rollback_write(job_reference.chunk_id) 201 | end 202 | end 203 | 204 | def create_upload_source(chunk) 205 | chunk_is_file = @buffer_config["@type"] == 'file' 206 | if chunk_is_file 207 | File.open(chunk.path) do |file| 208 | yield file 209 | end 210 | else 211 | Tempfile.open("chunk-tmp") do |file| 212 | file.binmode 213 | chunk.write_to(file) 214 | file.sync 215 | file.rewind 216 | yield file 217 | end 218 | end 219 | end 220 | end 221 | end 222 | end 223 | -------------------------------------------------------------------------------- /test/helper.rb: -------------------------------------------------------------------------------- 1 | require 'bundler/setup' 2 | require 'test/unit' 3 | 4 | $LOAD_PATH.unshift(File.join(__dir__, '..', 'lib')) 5 | $LOAD_PATH.unshift(__dir__) 6 | require 'fluent/test' 7 | 8 | require 'fluent/plugin/buffer' 9 | require 'fluent/plugin/buf_memory' 10 | require 'fluent/plugin/buf_file' 11 | require 'fluent/test/driver/output' 12 | 13 | require 'fluent/plugin/out_bigquery_base' 14 | require 'fluent/plugin/out_bigquery_insert' 15 | require 'fluent/plugin/out_bigquery_load' 16 | require 'google/apis/bigquery_v2' 17 | require 'google/api_client/auth/key_utils' 18 | require 'googleauth' 19 | 20 | require 'test/unit/rr' 21 | -------------------------------------------------------------------------------- /test/plugin/test_out_bigquery_base.rb: -------------------------------------------------------------------------------- 1 | require 'helper' 2 | 3 | class BigQueryBaseOutputTest < Test::Unit::TestCase 4 | def setup 5 | Fluent::Test.setup 6 | end 7 | 8 | CONFIG = %[ 9 | table foo 10 | email foo@bar.example 11 | private_key_path /path/to/key 12 | project yourproject_id 13 | dataset yourdataset_id 14 | 15 | 16 | time_format %s 17 | time_key time 18 | 19 | 20 | schema [ 21 | {"name": "time", "type": "INTEGER"}, 22 | {"name": "status", "type": "INTEGER"}, 23 | {"name": "bytes", "type": "INTEGER"}, 24 | {"name": "vhost", "type": "STRING"}, 25 | {"name": "path", "type": "STRING"}, 26 | {"name": "method", "type": "STRING"}, 27 | {"name": "protocol", "type": "STRING"}, 28 | {"name": "agent", 
"type": "STRING"}, 29 | {"name": "referer", "type": "STRING"}, 30 | {"name": "remote", "type": "RECORD", "fields": [ 31 | {"name": "host", "type": "STRING"}, 32 | {"name": "ip", "type": "STRING"}, 33 | {"name": "user", "type": "STRING"} 34 | ]}, 35 | {"name": "requesttime", "type": "FLOAT"}, 36 | {"name": "bot_access", "type": "BOOLEAN"}, 37 | {"name": "loginsession", "type": "BOOLEAN"} 38 | ] 39 | ] 40 | 41 | API_SCOPE = "https://www.googleapis.com/auth/bigquery" 42 | 43 | def create_driver(conf = CONFIG) 44 | Fluent::Test::Driver::Output.new(Fluent::Plugin::BigQueryBaseOutput).configure(conf) 45 | end 46 | 47 | def stub_writer(stub_auth: true) 48 | stub.proxy(Fluent::BigQuery::Writer).new.with_any_args do |writer| 49 | stub(writer).get_auth { nil } if stub_auth 50 | yield writer 51 | writer 52 | end 53 | end 54 | 55 | private def sudo_schema_response 56 | { 57 | "schema" => { 58 | "fields" => [ 59 | { 60 | "name" => "time", 61 | "type" => "TIMESTAMP", 62 | "mode" => "REQUIRED" 63 | }, 64 | { 65 | "name" => "tty", 66 | "type" => "STRING", 67 | "mode" => "NULLABLE" 68 | }, 69 | { 70 | "name" => "pwd", 71 | "type" => "STRING", 72 | "mode" => "REQUIRED" 73 | }, 74 | { 75 | "name" => "user", 76 | "type" => "STRING", 77 | "mode" => "REQUIRED" 78 | }, 79 | { 80 | "name" => "argv", 81 | "type" => "STRING", 82 | "mode" => "REPEATED" 83 | } 84 | ] 85 | } 86 | } 87 | end 88 | 89 | def test_configure_table 90 | driver = create_driver 91 | assert_equal driver.instance.table, 'foo' 92 | assert_nil driver.instance.tables 93 | 94 | driver = create_driver(CONFIG.sub(/\btable\s+.*$/, 'tables foo,bar')) 95 | assert_nil driver.instance.table 96 | assert_equal driver.instance.tables, ['foo' ,'bar'] 97 | 98 | assert_raise(Fluent::ConfigError, "'table' or 'tables' must be specified, and both are invalid") { 99 | create_driver(CONFIG + "tables foo,bar") 100 | } 101 | end 102 | 103 | def test_configure_auth_private_key 104 | driver = create_driver 105 | stub_writer(stub_auth: false) do |writer| 106 | mock(writer).get_auth_from_private_key { stub! } 107 | end 108 | assert driver.instance.writer.client.is_a?(Google::Apis::BigqueryV2::BigqueryService) 109 | end 110 | 111 | def test_configure_auth_compute_engine 112 | driver = create_driver(%[ 113 | table foo 114 | auth_method compute_engine 115 | project yourproject_id 116 | dataset yourdataset_id 117 | schema [ 118 | {"name": "time", "type": "INTEGER"}, 119 | {"name": "status", "type": "INTEGER"}, 120 | {"name": "bytes", "type": "INTEGER"} 121 | ] 122 | ]) 123 | 124 | stub_writer(stub_auth: false) do |writer| 125 | mock(writer).get_auth_from_compute_engine { stub! } 126 | end 127 | assert driver.instance.writer.client.is_a?(Google::Apis::BigqueryV2::BigqueryService) 128 | end 129 | 130 | def test_configure_auth_json_key_as_file 131 | driver = create_driver(%[ 132 | table foo 133 | auth_method json_key 134 | json_key jsonkey.josn 135 | project yourproject_id 136 | dataset yourdataset_id 137 | schema [ 138 | {"name": "time", "type": "INTEGER"}, 139 | {"name": "status", "type": "INTEGER"}, 140 | {"name": "bytes", "type": "INTEGER"} 141 | ] 142 | ]) 143 | 144 | stub_writer(stub_auth: false) do |writer| 145 | mock(writer).get_auth_from_json_key { stub! 
} 146 | end 147 | assert driver.instance.writer.client.is_a?(Google::Apis::BigqueryV2::BigqueryService) 148 | end 149 | 150 | def test_configure_auth_json_key_as_string 151 | json_key = '{"private_key": "X", "client_email": "' + 'x' * 255 + '@developer.gserviceaccount.com"}' 152 | json_key_io = StringIO.new(json_key) 153 | authorization = Object.new 154 | stub(Google::Auth::ServiceAccountCredentials).make_creds(json_key_io: satisfy {|arg| JSON.parse(arg.read) == JSON.parse(json_key_io.read) }, scope: API_SCOPE) { authorization } 155 | 156 | driver = create_driver(%[ 157 | table foo 158 | auth_method json_key 159 | json_key #{json_key} 160 | project yourproject_id 161 | dataset yourdataset_id 162 | schema [ 163 | {"name": "time", "type": "INTEGER"}, 164 | {"name": "status", "type": "INTEGER"}, 165 | {"name": "bytes", "type": "INTEGER"} 166 | ] 167 | ]) 168 | stub_writer(stub_auth: false) do |writer| 169 | mock.proxy(writer).get_auth_from_json_key { stub! } 170 | end 171 | assert driver.instance.writer.client.is_a?(Google::Apis::BigqueryV2::BigqueryService) 172 | end 173 | 174 | def test_configure_auth_application_default 175 | omit "This testcase depends on some environment variables." if ENV["CI"] == "true" 176 | 177 | driver = create_driver(%[ 178 | table foo 179 | auth_method application_default 180 | project yourproject_id 181 | dataset yourdataset_id 182 | schema [ 183 | {"name": "time", "type": "INTEGER"}, 184 | {"name": "status", "type": "INTEGER"}, 185 | {"name": "bytes", "type": "INTEGER"} 186 | ] 187 | ]) 188 | 189 | stub_writer(stub_auth: false) do |writer| 190 | mock.proxy(writer).get_auth_from_application_default { stub! } 191 | end 192 | assert driver.instance.writer.client.is_a?(Google::Apis::BigqueryV2::BigqueryService) 193 | end 194 | 195 | def test_format 196 | now = Fluent::EventTime.new(Time.now.to_i) 197 | input = { 198 | "status" => "1", 199 | "bytes" => 3.0, 200 | "vhost" => :bar, 201 | "path" => "/path/to/baz", 202 | "method" => "GET", 203 | "protocol" => "HTTP/0.9", 204 | "agent" => "libwww", 205 | "referer" => "http://referer.example", 206 | "requesttime" => (now - 1).to_f.to_s, 207 | "bot_access" => true, 208 | "loginsession" => false, 209 | "something-else" => "would be ignored", 210 | "yet-another" => { 211 | "foo" => "bar", 212 | "baz" => 1, 213 | }, 214 | "remote" => { 215 | "host" => "remote.example", 216 | "ip" => "192.0.2.1", 217 | "port" => 12345, 218 | "user" => "tagomoris", 219 | } 220 | } 221 | expected = { 222 | "time" => now.to_i, 223 | "status" => 1, 224 | "bytes" => 3, 225 | "vhost" => "bar", 226 | "path" => "/path/to/baz", 227 | "method" => "GET", 228 | "protocol" => "HTTP/0.9", 229 | "agent" => "libwww", 230 | "referer" => "http://referer.example", 231 | "requesttime" => (now - 1).to_f.to_s.to_f, 232 | "bot_access" => true, 233 | "loginsession" => false, 234 | "something-else" => "would be ignored", 235 | "yet-another" => { 236 | "foo" => "bar", 237 | "baz" => 1, 238 | }, 239 | "remote" => { 240 | "host" => "remote.example", 241 | "ip" => "192.0.2.1", 242 | "port" => 12345, 243 | "user" => "tagomoris", 244 | } 245 | } 246 | 247 | driver = create_driver(CONFIG) 248 | buf = nil 249 | driver.run { buf = driver.instance.format("my.tag", now, input) } 250 | 251 | assert_equal expected, MultiJson.load(buf) 252 | end 253 | 254 | [ 255 | # ,