├── .github ├── ISSUE_TEMPLATE.md └── workflows │ ├── linux.yml │ └── windows.yml ├── .gitignore ├── CHANGELOG.md ├── Gemfile ├── LICENSE.txt ├── README.md ├── Rakefile ├── fluent-plugin-bigquery.gemspec ├── gemfiles └── activesupport-4.gemfile ├── integration ├── README.md ├── create_table.sh ├── dummer_insert.rb ├── dummer_load.rb ├── fluent.conf └── schema.json ├── lib └── fluent │ └── plugin │ ├── bigquery │ ├── errors.rb │ ├── helper.rb │ ├── schema.rb │ ├── version.rb │ └── writer.rb │ ├── out_bigquery_base.rb │ ├── out_bigquery_insert.rb │ └── out_bigquery_load.rb └── test ├── helper.rb ├── plugin ├── test_out_bigquery_base.rb ├── test_out_bigquery_insert.rb ├── test_out_bigquery_load.rb ├── test_record_schema.rb └── testdata │ ├── apache.schema │ ├── json_key.json │ └── sudo.schema └── run_test.rb /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Environments 4 | 5 | - fluentd version: 6 | - plugin version: 7 | 8 | ## Configuration 9 | 10 | 11 | ## Expected Behavior 12 | 13 | ## Actual Behavior 14 | 15 | ## Log (if you have) 16 | 17 | -------------------------------------------------------------------------------- /.github/workflows/linux.yml: -------------------------------------------------------------------------------- 1 | name: Testing on Ubuntu 2 | on: 3 | - push 4 | - pull_request 5 | jobs: 6 | build: 7 | runs-on: ${{ matrix.os }} 8 | strategy: 9 | fail-fast: false 10 | matrix: 11 | ruby: 12 | - 3.0 13 | - 3.1 14 | - 3.2 15 | - 3.3 16 | os: 17 | - ubuntu-latest 18 | name: Ruby ${{ matrix.ruby }} unit testing on ${{ matrix.os }} 19 | steps: 20 | - uses: actions/checkout@v2 21 | - uses: ruby/setup-ruby@v1 22 | with: 23 | ruby-version: ${{ matrix.ruby }} 24 | - name: unit testing 25 | env: 26 | CI: true 27 | run: | 28 | ruby -v 29 | bundle install --jobs 4 --retry 3 30 | bundle exec rake test 31 | -------------------------------------------------------------------------------- /.github/workflows/windows.yml: -------------------------------------------------------------------------------- 1 | name: Testing on Windows 2 | on: 3 | - push 4 | - pull_request 5 | jobs: 6 | build: 7 | runs-on: ${{ matrix.os }} 8 | strategy: 9 | fail-fast: false 10 | matrix: 11 | ruby: 12 | - 3.0 13 | - 3.1 14 | - 3.2 15 | - 3.3 16 | os: 17 | - windows-latest 18 | name: Ruby ${{ matrix.ruby }} unit testing on ${{ matrix.os }} 19 | steps: 20 | - uses: actions/checkout@v2 21 | - uses: ruby/setup-ruby@v1 22 | with: 23 | ruby-version: ${{ matrix.ruby }} 24 | - name: unit testing 25 | env: 26 | CI: true 27 | run: | 28 | ruby -v 29 | bundle install --jobs 4 --retry 3 30 | bundle exec rake test 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | *.rbc 3 | .bundle 4 | .config 5 | .yardoc 6 | .ruby-version 7 | Gemfile.lock 8 | InstalledFiles 9 | _yardoc 10 | coverage 11 | doc/ 12 | lib/bundler/man 13 | pkg 14 | rdoc 15 | spec/reports 16 | test/tmp 17 | test/version_tmp 18 | tmp 19 | script/ 20 | .idea/ 21 | 22 | fluentd-0.12 23 | 24 | integration/log 25 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## [v3.1.0](https://github.com/fluent-plugins-nursery/fluent-plugin-bigquery/compare/v3.0.1...v3.1.0) (2022-12-16) 2 | 3 | 4 | ### Features 5 | 6 | * Support GEOGRAPHY 
type field ([#201](https://github.com/fluent-plugins-nursery/fluent-plugin-bigquery/issues/201)) ([734faa9](https://github.com/fluent-plugins-nursery/fluent-plugin-bigquery/commit/734faa9adb7cec1ed579fc6a0bd9ce72d48b82d0)) 7 | * Support JSON type field ([#204](https://github.com/fluent-plugins-nursery/fluent-plugin-bigquery/issues/204)) ([ec62bfa](https://github.com/fluent-plugins-nursery/fluent-plugin-bigquery/commit/ec62bfa2f858feb440e8bb8e8f8d6b8689f709bb)) 8 | 9 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | # Specify your gem's dependencies in fluent-plugin-bigquery.gemspec 4 | gemspec 5 | 6 | gem "oj" 7 | gem "dummer" 8 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012- TAGOMORI Satoshi 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # fluent-plugin-bigquery 2 | 3 | [Fluentd](http://fluentd.org) output plugin to load/insert data into Google BigQuery. 4 | 5 | - **Plugin type**: Output 6 | 7 | * insert data over streaming inserts 8 | * plugin type is `bigquery_insert` 9 | * for continuous real-time insertions 10 | * https://developers.google.com/bigquery/streaming-data-into-bigquery#usecases 11 | * load data 12 | * plugin type is `bigquery_load` 13 | * for data loading as batch jobs, for big amount of data 14 | * https://developers.google.com/bigquery/loading-data-into-bigquery 15 | 16 | Current version of this plugin supports Google API with Service Account Authentication, but does not support 17 | OAuth flow for installed applications. 18 | 19 | ## Support Version 20 | 21 | | plugin version | fluentd version | ruby version | 22 | | :----------- | :----------- | :----------- | 23 | | v0.4.x | 0.12.x | 2.0 or later | 24 | | v1.x.x | 0.14.x or later | 2.2 or later | 25 | | v2.x.x | 0.14.x or later | 2.3 or later | 26 | | v3.x.x | 1.x or later | 2.7 or later | 27 | 28 | ## With docker image 29 | If you use official alpine based fluentd docker image (https://github.com/fluent/fluentd-docker-image), 30 | You need to install `bigdecimal` gem on your own dockerfile. 31 | Because alpine based image has only minimal ruby environment in order to reduce image size. 32 | And in most case, dependency to embedded gem is not written on gemspec. 33 | Because embedded gem dependency sometimes restricts ruby environment. 34 | 35 | ## Configuration 36 | 37 | ### Options 38 | 39 | #### common 40 | 41 | | name | type | required? | placeholder? 
| default | description | 42 | | :-------------------------------------------- | :------------ | :----------- | :---------- | :------------------------- | :----------------------- | 43 | | auth_method | enum | yes | no | private_key | `private_key` or `json_key` or `compute_engine` or `application_default` (GKE Workload Identity) | 44 | | email | string | yes (private_key) | no | nil | GCP Service Account Email | 45 | | private_key_path | string | yes (private_key) | no | nil | GCP Private Key file path | 46 | | private_key_passphrase | string | yes (private_key) | no | nil | GCP Private Key Passphrase | 47 | | json_key | string | yes (json_key) | no | nil | GCP JSON Key file path or JSON Key string | 48 | | project | string | yes | yes | nil | | 49 | | dataset | string | yes | yes | nil | | 50 | | table | string | yes (either `tables`) | yes | nil | | 51 | | tables | array(string) | yes (either `table`) | yes | nil | can set multi table names splitted by `,` | 52 | | auto_create_table | bool | no | no | false | If true, creates table automatically | 53 | | ignore_unknown_values | bool | no | no | false | Accept rows that contain values that do not match the schema. The unknown values are ignored. | 54 | | schema | array | yes (either `fetch_schema` or `schema_path`) | no | nil | Schema Definition. It is formatted by JSON. | 55 | | schema_path | string | yes (either `fetch_schema`) | yes | nil | Schema Definition file path. It is formatted by JSON. | 56 | | fetch_schema | bool | yes (either `schema_path`) | no | false | If true, fetch table schema definition from Bigquery table automatically. | 57 | | fetch_schema_table | string | no | yes | nil | If set, fetch table schema definition from this table, If fetch_schema is false, this param is ignored | 58 | | schema_cache_expire | integer | no | no | 600 | Value is second. If current time is after expiration interval, re-fetch table schema definition. | 59 | | request_timeout_sec | integer | no | no | nil | Bigquery API response timeout | 60 | | request_open_timeout_sec | integer | no | no | 60 | Bigquery API connection, and request timeout. If you send big data to Bigquery, set large value. | 61 | | time_partitioning_type | enum | no (either day or hour) | no | nil | Type of bigquery time partitioning feature. | 62 | | time_partitioning_field | string | no | no | nil | Field used to determine how to create a time-based partition. | 63 | | time_partitioning_expiration | time | no | no | nil | Expiration milliseconds for bigquery time partitioning. | 64 | | clustering_fields | array(string) | no | no | nil | One or more fields on which data should be clustered. The order of the specified columns determines the sort order of the data. | 65 | 66 | #### bigquery_insert 67 | 68 | | name | type | required? | placeholder? | default | description | 69 | | :------------------------------------- | :------------ | :----------- | :---------- | :------------------------- | :----------------------- | 70 | | template_suffix | string | no | yes | nil | can use `%{time_slice}` placeholder replaced by `time_slice_format` | 71 | | skip_invalid_rows | bool | no | no | false | | 72 | | insert_id_field | string | no | no | nil | Use key as `insert_id` of Streaming Insert API parameter. see. https://docs.fluentd.org/v1.0/articles/api-plugin-helper-record_accessor | 73 | | add_insert_timestamp | string | no | no | nil | Adds a timestamp column just before sending the rows to BigQuery, so that buffering time is not taken into account. 
Gives a field in BigQuery which represents the insert time of the row. | 74 | | allow_retry_insert_errors | bool | no | no | false | Retry to insert rows when an insertErrors occurs. There is a possibility that rows are inserted in duplicate. | 75 | | require_partition_filter | bool | no | no | false | If true, queries over this table require a partition filter that can be used for partition elimination to be specified. | 76 | 77 | #### bigquery_load 78 | 79 | | name | type | required? | placeholder? | default | description | 80 | | :------------------------------------- | :------------ | :----------- | :---------- | :------------------------- | :----------------------- | 81 | | source_format | enum | no | no | json | Specify source format `json` or `csv` or `avro`. If you change this parameter, you must change formatter plugin via `` config section. | 82 | | max_bad_records | integer | no | no | 0 | If the number of bad records exceeds this value, an invalid error is returned in the job result. | 83 | 84 | ### Buffer section 85 | 86 | | name | type | required? | default | description | 87 | | :------------------------------------- | :------------ | :----------- | :------------------------- | :----------------------- | 88 | | @type | string | no | memory (insert) or file (load) | | 89 | | chunk_limit_size | integer | no | 1MB (insert) or 1GB (load) | | 90 | | total_limit_size | integer | no | 1GB (insert) or 32GB (load) | | 91 | | chunk_records_limit | integer | no | 500 (insert) or nil (load) | | 92 | | flush_mode | enum | no | interval | default, lazy, interval, immediate | 93 | | flush_interval | float | no | 1.0 (insert) or 3600 (load) | | 94 | | flush_thread_interval | float | no | 0.05 (insert) or 5 (load) | | 95 | | flush_thread_burst_interval | float | no | 0.05 (insert) or 5 (load) | | 96 | 97 | And, other params (defined by base class) are available 98 | 99 | see. https://github.com/fluent/fluentd/blob/master/lib/fluent/plugin/output.rb 100 | 101 | ### Inject section 102 | 103 | It is replacement of previous version `time_field` and `time_format`. 104 | 105 | For example. 106 | 107 | ``` 108 | 109 | time_key time_field_name 110 | time_type string 111 | time_format %Y-%m-%d %H:%M:%S 112 | 113 | ``` 114 | 115 | | name | type | required? | default | description | 116 | | :------------------------------------- | :------------ | :----------- | :------------------------- | :----------------------- | 117 | | hostname_key | string | no | nil | | 118 | | hostname | string | no | nil | | 119 | | tag_key | string | no | nil | | 120 | | time_key | string | no | nil | | 121 | | time_type | string | no | nil | | 122 | | time_format | string | no | nil | | 123 | | localtime | bool | no | true | | 124 | | utc | bool | no | false | | 125 | | timezone | string | no | nil | | 126 | 127 | see. https://github.com/fluent/fluentd/blob/master/lib/fluent/plugin_helper/inject.rb 128 | 129 | ### Formatter section 130 | 131 | This section is for `load` mode only. 132 | If you use `insert` mode, used formatter is `json` only. 133 | 134 | Bigquery supports `csv`, `json` and `avro` format. Default is `json` 135 | I recommend to use `json` for now. 136 | 137 | For example. 138 | 139 | ``` 140 | source_format csv 141 | 142 | 143 | @type csv 144 | fields col1, col2, col3 145 | 146 | ``` 147 | 148 | see. 
https://github.com/fluent/fluentd/blob/master/lib/fluent/plugin_helper/formatter.rb 149 | 150 | ## Examples 151 | 152 | ### Streaming inserts 153 | 154 | Configure insert specifications with target table schema, with your credentials. This is minimum configurations: 155 | 156 | ```apache 157 | 158 | @type bigquery_insert 159 | 160 | auth_method private_key # default 161 | email xxxxxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxx@developer.gserviceaccount.com 162 | private_key_path /home/username/.keys/00000000000000000000000000000000-privatekey.p12 163 | # private_key_passphrase notasecret # default 164 | 165 | project yourproject_id 166 | dataset yourdataset_id 167 | table tablename 168 | 169 | schema [ 170 | {"name": "time", "type": "INTEGER"}, 171 | {"name": "status", "type": "INTEGER"}, 172 | {"name": "bytes", "type": "INTEGER"}, 173 | {"name": "vhost", "type": "STRING"}, 174 | {"name": "path", "type": "STRING"}, 175 | {"name": "method", "type": "STRING"}, 176 | {"name": "protocol", "type": "STRING"}, 177 | {"name": "agent", "type": "STRING"}, 178 | {"name": "referer", "type": "STRING"}, 179 | {"name": "remote", "type": "RECORD", "fields": [ 180 | {"name": "host", "type": "STRING"}, 181 | {"name": "ip", "type": "STRING"}, 182 | {"name": "user", "type": "STRING"} 183 | ]}, 184 | {"name": "requesttime", "type": "FLOAT"}, 185 | {"name": "bot_access", "type": "BOOLEAN"}, 186 | {"name": "loginsession", "type": "BOOLEAN"} 187 | ] 188 | 189 | ``` 190 | 191 | For high rate inserts over streaming inserts, you should specify flush intervals and buffer chunk options: 192 | 193 | ```apache 194 | 195 | @type bigquery_insert 196 | 197 | 198 | flush_interval 0.1 # flush as frequent as possible 199 | 200 | total_limit_size 10g 201 | 202 | flush_thread_count 16 203 | 204 | 205 | auth_method private_key # default 206 | email xxxxxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxx@developer.gserviceaccount.com 207 | private_key_path /home/username/.keys/00000000000000000000000000000000-privatekey.p12 208 | # private_key_passphrase notasecret # default 209 | 210 | project yourproject_id 211 | dataset yourdataset_id 212 | tables accesslog1,accesslog2,accesslog3 213 | 214 | schema [ 215 | {"name": "time", "type": "INTEGER"}, 216 | {"name": "status", "type": "INTEGER"}, 217 | {"name": "bytes", "type": "INTEGER"}, 218 | {"name": "vhost", "type": "STRING"}, 219 | {"name": "path", "type": "STRING"}, 220 | {"name": "method", "type": "STRING"}, 221 | {"name": "protocol", "type": "STRING"}, 222 | {"name": "agent", "type": "STRING"}, 223 | {"name": "referer", "type": "STRING"}, 224 | {"name": "remote", "type": "RECORD", "fields": [ 225 | {"name": "host", "type": "STRING"}, 226 | {"name": "ip", "type": "STRING"}, 227 | {"name": "user", "type": "STRING"} 228 | ]}, 229 | {"name": "requesttime", "type": "FLOAT"}, 230 | {"name": "bot_access", "type": "BOOLEAN"}, 231 | {"name": "loginsession", "type": "BOOLEAN"} 232 | ] 233 | 234 | ``` 235 | 236 | Important options for high rate events are: 237 | 238 | * `tables` 239 | * 2 or more tables are available with ',' separator 240 | * `out_bigquery` uses these tables for Table Sharding inserts 241 | * these must have same schema 242 | * `buffer/chunk_limit_size` 243 | * max size of an insert or chunk (default 1000000 or 1MB) 244 | * the max size is limited to 1MB on BigQuery 245 | * `buffer/chunk_records_limit` 246 | * number of records over streaming inserts API call is limited as 500, per insert or chunk 247 | * `out_bigquery` flushes buffer with 500 records for 1 inserts API call 248 | * 
`buffer/queue_length_limit` 249 | * BigQuery streaming inserts needs very small buffer chunks 250 | * for high-rate events, `buffer_queue_limit` should be configured with big number 251 | * Max 1GB memory may be used under network problem in default configuration 252 | * `chunk_limit_size (default 1MB)` x `queue_length_limit (default 1024)` 253 | * `buffer/flush_thread_count` 254 | * threads for insert api calls in parallel 255 | * specify this option for 100 or more records per seconds 256 | * 10 or more threads seems good for inserts over internet 257 | * fewer threads may be good for Google Compute Engine instances (with low latency for BigQuery) 258 | * `buffer/flush_interval` 259 | * interval between data flushes (default 0.25) 260 | * you can set subsecond values such as `0.15` on Fluentd v0.10.42 or later 261 | 262 | See [Quota policy](https://cloud.google.com/bigquery/streaming-data-into-bigquery#quota) 263 | section in the Google BigQuery document. 264 | 265 | ### Load 266 | ```apache 267 | 268 | @type bigquery_load 269 | 270 | 271 | path bigquery.*.buffer 272 | flush_at_shutdown true 273 | timekey_use_utc 274 | 275 | 276 | auth_method json_key 277 | json_key json_key_path.json 278 | 279 | project yourproject_id 280 | dataset yourdataset_id 281 | auto_create_table true 282 | table yourtable%{time_slice} 283 | schema_path bq_schema.json 284 | 285 | ``` 286 | 287 | I recommend to use file buffer and long flush interval. 288 | 289 | ### Authentication 290 | 291 | There are four methods supported to fetch access token for the service account. 292 | 293 | 1. Public-Private key pair of GCP(Google Cloud Platform)'s service account 294 | 2. JSON key of GCP(Google Cloud Platform)'s service account 295 | 3. Predefined access token (Compute Engine only) 296 | 4. [Google application default credentials](https://cloud.google.com/docs/authentication/application-default-credentials) / GKE Workload Identity 297 | 298 | #### Public-Private key pair of GCP's service account 299 | 300 | The examples above use the first one. You first need to create a service account (client ID), 301 | download its private key and deploy the key with fluentd. 302 | 303 | #### JSON key of GCP(Google Cloud Platform)'s service account 304 | 305 | You first need to create a service account (client ID), 306 | download its JSON key and deploy the key with fluentd. 307 | 308 | ```apache 309 | 310 | @type bigquery_insert 311 | 312 | auth_method json_key 313 | json_key /home/username/.keys/00000000000000000000000000000000-jsonkey.json 314 | 315 | project yourproject_id 316 | dataset yourdataset_id 317 | table tablename 318 | ... 319 | 320 | ``` 321 | 322 | You can also provide `json_key` as embedded JSON string like this. 323 | You need to only include `private_key` and `client_email` key from JSON key file. 324 | 325 | ```apache 326 | 327 | @type bigquery_insert 328 | 329 | auth_method json_key 330 | json_key {"private_key": "-----BEGIN PRIVATE KEY-----\n...", "client_email": "xxx@developer.gserviceaccount.com"} 331 | 332 | project yourproject_id 333 | dataset yourdataset_id 334 | table tablename 335 | ... 336 | 337 | ``` 338 | 339 | #### Predefined access token (Compute Engine only) 340 | 341 | When you run fluentd on Google Compute Engine instance, 342 | you don't need to explicitly create a service account for fluentd. 
343 | In this authentication method, you need to add the API scope "https://www.googleapis.com/auth/bigquery" to the scope list of your 344 | Compute Engine instance, then you can configure fluentd like this. 345 | 346 | ```apache 347 | 348 | @type bigquery_insert 349 | 350 | auth_method compute_engine 351 | 352 | project yourproject_id 353 | dataset yourdataset_id 354 | table tablename 355 | 356 | ... 357 | 358 | ``` 359 | 360 | #### Application default credentials 361 | 362 | The Application Default Credentials provide a simple way to get authorization credentials for use in calling Google APIs, which are described in detail at https://cloud.google.com/docs/authentication/application-default-credentials. 363 | 364 | **This is the method you should choose if you want to use Workload Identity on GKE**. 365 | 366 | In this authentication method, the credentials returned are determined by the environment the code is running in. Conditions are checked in the following order:credentials are get from following order. 367 | 368 | 1. The environment variable `GOOGLE_APPLICATION_CREDENTIALS` is checked. If this variable is specified it should point to a JSON key file that defines the credentials. 369 | 2. The environment variable `GOOGLE_PRIVATE_KEY` and `GOOGLE_CLIENT_EMAIL` are checked. If these variables are specified `GOOGLE_PRIVATE_KEY` should point to `private_key`, `GOOGLE_CLIENT_EMAIL` should point to `client_email` in a JSON key. 370 | 3. Well known path is checked. If the file exists, it is used as a JSON key file. This path is `$HOME/.config/gcloud/application_default_credentials.json`. 371 | 4. System default path is checked. If the file exists, it is used as a JSON key file. This path is `/etc/google/auth/application_default_credentials.json`. 372 | 5. If you are running in Google Compute Engine production, the built-in service account associated with the virtual machine instance will be used. 373 | 6. If none of these conditions is true, an error will occur. 374 | 375 | ### Table id formatting 376 | 377 | this plugin supports fluentd-0.14 style placeholder. 378 | 379 | #### strftime formatting 380 | `table` and `tables` options accept [Time#strftime](http://ruby-doc.org/core-1.9.3/Time.html#method-i-strftime) 381 | format to construct table ids. 382 | Table ids are formatted at runtime 383 | using the chunk key time. 384 | 385 | see. https://docs.fluentd.org/configuration/buffer-section 386 | 387 | For example, with the configuration below, 388 | data is inserted into tables `accesslog_2014_08_02`, `accesslog_2014_08_03` and so on. 389 | 390 | ```apache 391 | 392 | @type bigquery_insert 393 | 394 | ... 395 | 396 | project yourproject_id 397 | dataset yourdataset_id 398 | table accesslog_%Y_%m_%d 399 | 400 | 401 | timekey 1d 402 | 403 | ... 404 | 405 | ``` 406 | 407 | **NOTE: In current fluentd (v1.15.x), The maximum unit supported by strftime formatting is the granularity of days** 408 | 409 | #### record attribute formatting 410 | The format can be suffixed with attribute name. 411 | 412 | __CAUTION: format is different with previous version__ 413 | 414 | ```apache 415 | 416 | ... 417 | table accesslog_${status_code} 418 | 419 | 420 | 421 | ... 422 | 423 | ``` 424 | 425 | If attribute name is given, the time to be used for formatting is value of each row. 426 | The value for the time should be a UNIX time. 427 | 428 | #### time_slice_key formatting 429 | 430 | Instead, Use strftime formatting. 431 | 432 | strftime formatting of current version is based on chunk key. 
433 | That is same with previous time_slice_key formatting . 434 | 435 | ### Date partitioned table support 436 | this plugin can insert (load) into date partitioned table. 437 | 438 | Use placeholder. 439 | 440 | ```apache 441 | 442 | @type bigquery_load 443 | 444 | ... 445 | table accesslog$%Y%m%d 446 | 447 | 448 | timekey 1d 449 | 450 | ... 451 | 452 | ``` 453 | 454 | But, Dynamic table creating doesn't support date partitioned table yet. 455 | And streaming insert is not allowed to insert with `$%Y%m%d` suffix. 456 | If you use date partitioned table with streaming insert, Please omit `$%Y%m%d` suffix from `table`. 457 | 458 | ### Dynamic table creating 459 | 460 | When `auto_create_table` is set to `true`, try to create the table using BigQuery API when insertion failed with code=404 "Not Found: Table ...". 461 | Next retry of insertion is expected to be success. 462 | 463 | NOTE: `auto_create_table` option cannot be used with `fetch_schema`. You should create the table on ahead to use `fetch_schema`. 464 | 465 | ```apache 466 | 467 | @type bigquery_insert 468 | 469 | ... 470 | 471 | auto_create_table true 472 | table accesslog_%Y_%m 473 | 474 | ... 475 | 476 | ``` 477 | 478 | Also, you can create clustered table by using `clustering_fields`. 479 | 480 | ### Table schema 481 | 482 | There are three methods to describe the schema of the target table. 483 | 484 | 1. List fields in fluent.conf 485 | 2. Load a schema file in JSON. 486 | 3. Fetch a schema using BigQuery API 487 | 488 | The examples above use the first method. In this method, 489 | you can also specify nested fields by prefixing their belonging record fields. 490 | 491 | ```apache 492 | 493 | @type bigquery_insert 494 | 495 | ... 496 | 497 | schema [ 498 | {"name": "time", "type": "INTEGER"}, 499 | {"name": "status", "type": "INTEGER"}, 500 | {"name": "bytes", "type": "INTEGER"}, 501 | {"name": "vhost", "type": "STRING"}, 502 | {"name": "path", "type": "STRING"}, 503 | {"name": "method", "type": "STRING"}, 504 | {"name": "protocol", "type": "STRING"}, 505 | {"name": "agent", "type": "STRING"}, 506 | {"name": "referer", "type": "STRING"}, 507 | {"name": "remote", "type": "RECORD", "fields": [ 508 | {"name": "host", "type": "STRING"}, 509 | {"name": "ip", "type": "STRING"}, 510 | {"name": "user", "type": "STRING"} 511 | ]}, 512 | {"name": "requesttime", "type": "FLOAT"}, 513 | {"name": "bot_access", "type": "BOOLEAN"}, 514 | {"name": "loginsession", "type": "BOOLEAN"} 515 | ] 516 | 517 | ``` 518 | 519 | This schema accepts structured JSON data like: 520 | 521 | ```json 522 | { 523 | "request":{ 524 | "time":1391748126.7000976, 525 | "vhost":"www.example.com", 526 | "path":"/", 527 | "method":"GET", 528 | "protocol":"HTTP/1.1", 529 | "agent":"HotJava", 530 | "bot_access":false 531 | }, 532 | "remote":{ "ip": "192.0.2.1" }, 533 | "response":{ 534 | "status":200, 535 | "bytes":1024 536 | } 537 | } 538 | ``` 539 | 540 | The second method is to specify a path to a BigQuery schema file instead of listing fields. In this case, your fluent.conf looks like: 541 | 542 | ```apache 543 | 544 | @type bigquery_insert 545 | 546 | ... 547 | 548 | schema_path /path/to/httpd.schema 549 | 550 | ``` 551 | where /path/to/httpd.schema is a path to the JSON-encoded schema file which you used for creating the table on BigQuery. By using external schema file you are able to write full schema that does support NULLABLE/REQUIRED/REPEATED, this feature is really useful and adds full flexibility. 
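For reference, the `/path/to/httpd.schema` file mentioned above could look like the following sketch. It reuses the field names from the inline schema example earlier in this README; the `mode` values shown are illustrative assumptions, added to show how NULLABLE/REQUIRED (and REPEATED) can be expressed in an external schema file:

```json
[
  {"name": "time",     "type": "INTEGER", "mode": "REQUIRED"},
  {"name": "status",   "type": "INTEGER", "mode": "NULLABLE"},
  {"name": "bytes",    "type": "INTEGER", "mode": "NULLABLE"},
  {"name": "vhost",    "type": "STRING",  "mode": "NULLABLE"},
  {"name": "path",     "type": "STRING",  "mode": "NULLABLE"},
  {"name": "method",   "type": "STRING",  "mode": "NULLABLE"},
  {"name": "protocol", "type": "STRING",  "mode": "NULLABLE"},
  {"name": "agent",    "type": "STRING",  "mode": "NULLABLE"},
  {"name": "referer",  "type": "STRING",  "mode": "NULLABLE"},
  {"name": "remote",   "type": "RECORD",  "mode": "NULLABLE", "fields": [
    {"name": "host", "type": "STRING", "mode": "NULLABLE"},
    {"name": "ip",   "type": "STRING", "mode": "NULLABLE"},
    {"name": "user", "type": "STRING", "mode": "NULLABLE"}
  ]},
  {"name": "requesttime",  "type": "FLOAT",   "mode": "NULLABLE"},
  {"name": "bot_access",   "type": "BOOLEAN", "mode": "NULLABLE"},
  {"name": "loginsession", "type": "BOOLEAN", "mode": "NULLABLE"}
]
```

This is the same structure used by `integration/schema.json` in this repository, and the same file can be passed to `bq mk --schema` when creating the table (see `integration/create_table.sh`).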
552 | 553 | The third method is to set `fetch_schema` to `true` to enable fetch a schema using BigQuery API. In this case, your fluent.conf looks like: 554 | 555 | ```apache 556 | 557 | @type bigquery_insert 558 | 559 | ... 560 | 561 | fetch_schema true 562 | # fetch_schema_table other_table # if you want to fetch schema from other table 563 | 564 | ``` 565 | 566 | If you specify multiple tables in configuration file, plugin get all schema data from BigQuery and merge it. 567 | 568 | NOTE: Since JSON does not define how to encode data of TIMESTAMP type, 569 | you are still recommended to specify JSON types for TIMESTAMP fields as "time" field does in the example, if you use second or third method. 570 | 571 | ### Specifying insertId property 572 | 573 | BigQuery uses `insertId` property to detect duplicate insertion requests (see [data consistency](https://cloud.google.com/bigquery/streaming-data-into-bigquery#dataconsistency) in Google BigQuery documents). 574 | You can set `insert_id_field` option to specify the field to use as `insertId` property. 575 | `insert_id_field` can use fluentd record_accessor format like `$['key1'][0]['key2']`. 576 | (detail. https://docs.fluentd.org/v1.0/articles/api-plugin-helper-record_accessor) 577 | 578 | ```apache 579 | 580 | @type bigquery_insert 581 | 582 | ... 583 | 584 | insert_id_field uuid 585 | schema [{"name": "uuid", "type": "STRING"}] 586 | 587 | ``` 588 | 589 | ## TODO 590 | 591 | * OAuth installed application credentials support 592 | * Google API discovery expiration 593 | * check row size limits 594 | 595 | ## Authors 596 | 597 | * @tagomoris: First author, original version 598 | * KAIZEN platform Inc.: Maintainer, Since 2014.08.19 599 | * @joker1007 600 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env rake 2 | require "bundler/gem_tasks" 3 | 4 | require 'rake/testtask' 5 | Rake::TestTask.new(:test) do |test| 6 | test.libs << 'lib' << 'test' 7 | test.pattern = 'test/**/test_*.rb' 8 | test.verbose = true 9 | test.warning = false 10 | end 11 | 12 | task :default => :test 13 | -------------------------------------------------------------------------------- /fluent-plugin-bigquery.gemspec: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | lib = File.expand_path('../lib', __FILE__) 3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 4 | require 'fluent/plugin/bigquery/version' 5 | 6 | Gem::Specification.new do |spec| 7 | spec.name = "fluent-plugin-bigquery" 8 | spec.version = Fluent::BigQueryPlugin::VERSION 9 | spec.authors = ["Naoya Ito", "joker1007"] 10 | spec.email = ["i.naoya@gmail.com", "kakyoin.hierophant@gmail.com"] 11 | spec.description = %q{Fluentd plugin to store data on Google BigQuery, by load, or by stream inserts} 12 | spec.summary = %q{Fluentd plugin to store data on Google BigQuery} 13 | spec.homepage = "https://github.com/kaizenplatform/fluent-plugin-bigquery" 14 | spec.license = "Apache-2.0" 15 | 16 | spec.files = `git ls-files`.split($/) 17 | spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) } 18 | spec.test_files = spec.files.grep(%r{^(test|spec|features)/}) 19 | spec.require_paths = ["lib"] 20 | 21 | spec.add_development_dependency "rake" 22 | spec.add_development_dependency "rr" 23 | spec.add_development_dependency "test-unit" 24 | spec.add_development_dependency "test-unit-rr" 25 | 26 | 
spec.add_runtime_dependency "google-api-client", ">= 0.11.0" 27 | spec.add_runtime_dependency "googleauth", ">= 0.5.0" 28 | spec.add_runtime_dependency "multi_json" 29 | spec.add_runtime_dependency "fluentd", ">= 0.14.0", "< 2" 30 | end 31 | -------------------------------------------------------------------------------- /gemfiles/activesupport-4.gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | gem 'activesupport', '< 5' 4 | 5 | # Specify your gem's dependencies in fluent-plugin-bigquery.gemspec 6 | gemspec :path => '../' 7 | -------------------------------------------------------------------------------- /integration/README.md: -------------------------------------------------------------------------------- 1 | # Requirements 2 | 3 | Set Environment Variable 4 | 5 | - GOOGLE_APPLICATION_CREDENTIALS (json key path) 6 | - PROJECT_NAME 7 | - DATASET_NAME 8 | - TABLE_NAME 9 | 10 | # How to use 11 | 12 | 1. execute `create_table.sh` 13 | 1. `bundle exec fluentd -c fluent.conf` 14 | 1. `bundle exec dummer -c dummer_insert.rb` or `bundle exec dummer -c dummer_load.rb` 15 | -------------------------------------------------------------------------------- /integration/create_table.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -eux 4 | bq mk -t --project_id=${PROJECT_NAME} --schema=$(dirname $0)/schema.json ${DATASET_NAME}.${TABLE_NAME} 5 | -------------------------------------------------------------------------------- /integration/dummer_insert.rb: -------------------------------------------------------------------------------- 1 | require "time" 2 | 3 | configure "insert" do 4 | host "localhost" 5 | port 24224 6 | rate 100 7 | tag type: :string, any: %w(insert_data) 8 | field :id, type: :integer, countup: true 9 | field :string_field, type: :string, any: %w(str1 str2 str3 str4) 10 | field :timestamp_field, type: :string, value: Time.now.iso8601 11 | field :date, type: :string, value: Time.now.strftime("%Y-%m-%d") 12 | end 13 | -------------------------------------------------------------------------------- /integration/dummer_load.rb: -------------------------------------------------------------------------------- 1 | require "time" 2 | 3 | configure "load" do 4 | host "localhost" 5 | port 24224 6 | rate 100 7 | tag type: :string, any: %w(load_data) 8 | field :id, type: :integer, countup: true 9 | field :string_field, type: :string, any: %w(str1 str2 str3 str4) 10 | field :timestamp_field, type: :string, value: Time.now.iso8601 11 | field :date, type: :string, value: Time.now.strftime("%Y-%m-%d") 12 | end 13 | -------------------------------------------------------------------------------- /integration/fluent.conf: -------------------------------------------------------------------------------- 1 | 2 | @type forward 3 | port 24224 4 | bind 0.0.0.0 5 | 6 | 7 | 8 | @type dummy 9 | dummy {"json_field": {"foo": "val1", "bar": "val2", "hoge": 1}, "geography_field": {"type": "LineString", "coordinates": [[-118.4085, 33.9416], [-73.7781, 40.6413]]}, "timestamp_field": "2022-12-15T22:40:21+09:00", "date": "2022-12-15", "record_field": {"inner_field": "hoge", "inner_json": {"key1": "val1", "key2": "val2"}}, "repeated_string_field": ["a", "b", "c"]} 10 | auto_increment_key id 11 | 12 | tag insert_data 13 | 14 | 15 | 16 | @id bigquery-insert-integration 17 | @type bigquery_insert 18 | 19 | allow_retry_insert_errors true 20 | 21 | auth_method json_key 22 | 
json_key "#{ENV["GOOGLE_APPLICATION_CREDENTIALS"]}" 23 | 24 | 25 | @type file 26 | 27 | chunk_limit_size 1m 28 | chunk_limit_records 1500 29 | total_limit_size 1g 30 | path ./log/bigquery-insert-integration 31 | 32 | flush_interval 15 33 | flush_thread_count 4 34 | flush_at_shutdown true 35 | 36 | retry_max_times 14 37 | retry_max_interval 30m 38 | 39 | 40 | request_open_timeout_sec 2m 41 | 42 | slow_flush_log_threshold 30.0 43 | 44 | project "#{ENV["PROJECT_NAME"]}" 45 | dataset "#{ENV["DATASET_NAME"]}" 46 | table "#{ENV["TABLE_NAME"]}" 47 | auto_create_table false 48 | # schema_path integration/schema.json 49 | fetch_schema true 50 | fetch_schema_table "#{ENV["TABLE_NAME"]}" 51 | 52 | insert_id_field id 53 | 54 | 55 | @type file 56 | path ./log/bigquery-insert-integration.errors 57 | 58 | 59 | 60 | 61 | @id bigquery-load-integration 62 | @type bigquery_load 63 | 64 | auth_method json_key 65 | json_key "#{ENV["GOOGLE_APPLICATION_CREDENTIALS"]}" 66 | 67 | 68 | @type file 69 | 70 | chunk_limit_size 1m 71 | total_limit_size 1g 72 | path ./log/bigquery-load-integration 73 | 74 | flush_interval 120 75 | flush_thread_count 4 76 | flush_at_shutdown true 77 | 78 | retry_max_times 14 79 | retry_max_interval 30m 80 | 81 | 82 | request_open_timeout_sec 2m 83 | 84 | slow_flush_log_threshold 300.0 85 | 86 | project "#{ENV["PROJECT_NAME"]}" 87 | dataset "#{ENV["DATASET_NAME"]}" 88 | table "#{ENV["TABLE_NAME"]}" 89 | auto_create_table false 90 | # schema_path integration/schema.json 91 | fetch_schema true 92 | fetch_schema_table "#{ENV["TABLE_NAME"]}" 93 | 94 | 95 | @type file 96 | path ./log/bigquery-load-integration.errors 97 | 98 | 99 | -------------------------------------------------------------------------------- /integration/schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "id", 4 | "type": "INTEGER", 5 | "mode": "REQUIRED" 6 | }, 7 | { 8 | "name": "string_field", 9 | "type": "STRING", 10 | "mode": "NULLABLE" 11 | }, 12 | { 13 | "name": "json_field", 14 | "type": "JSON", 15 | "mode": "NULLABLE" 16 | }, 17 | { 18 | "name": "geography_field", 19 | "type": "GEOGRAPHY", 20 | "mode": "NULLABLE" 21 | }, 22 | { 23 | "name": "timestamp_field", 24 | "type": "TIMESTAMP", 25 | "mode": "NULLABLE" 26 | }, 27 | { 28 | "name": "date", 29 | "type": "DATE", 30 | "mode": "REQUIRED" 31 | }, 32 | { 33 | "name": "record_field", 34 | "type": "RECORD", 35 | "mode": "NULLABLE", 36 | "fields": [ 37 | { 38 | "name": "inner_field", 39 | "type": "STRING", 40 | "mode": "REQUIRED" 41 | }, 42 | { 43 | "name": "inner_json", 44 | "type": "JSON", 45 | "mode": "REQUIRED" 46 | } 47 | ] 48 | }, 49 | { 50 | "name": "repeated_string_field", 51 | "type": "STRING", 52 | "mode": "REPEATED" 53 | } 54 | ] 55 | -------------------------------------------------------------------------------- /lib/fluent/plugin/bigquery/errors.rb: -------------------------------------------------------------------------------- 1 | module Fluent 2 | module BigQuery 3 | # @abstract 4 | class Error < StandardError 5 | RETRYABLE_ERROR_REASON = %w(backendError internalError rateLimitExceeded tableUnavailable).freeze 6 | RETRYABLE_INSERT_ERRORS_REASON = %w(timeout backendError internalError rateLimitExceeded).freeze 7 | RETRYABLE_STATUS_CODE = [500, 502, 503, 504] 8 | REGION_NOT_WRITABLE_MESSAGE = -"is not writable in the region" 9 | SSL_UNEXPECTED_EOF_MESSAGE = -"SSL_read: unexpected eof while reading" 10 | 11 | class << self 12 | # @param e [Google::Apis::Error] 13 | # @param message [String] 14 
| def wrap(e, message = nil) 15 | if retryable_error?(e) 16 | RetryableError.new(message, e) 17 | else 18 | UnRetryableError.new(message, e) 19 | end 20 | end 21 | 22 | # @param e [Google::Apis::Error] 23 | def retryable_error?(e) 24 | retryable_server_error?(e) || retryable_region_not_writable?(e) || retryable_ssl_unexpected_eof?(e) 25 | end 26 | 27 | def retryable_server_error?(e) 28 | e.is_a?(Google::Apis::ServerError) && RETRYABLE_STATUS_CODE.include?(e.status_code) 29 | end 30 | 31 | def retryable_error_reason?(reason) 32 | RETRYABLE_ERROR_REASON.include?(reason) 33 | end 34 | 35 | def retryable_insert_errors_reason?(reason) 36 | RETRYABLE_INSERT_ERRORS_REASON.include?(reason) 37 | end 38 | 39 | def retryable_region_not_writable?(e) 40 | e.is_a?(Google::Apis::ClientError) && e.status_code == 400 && e.message.include?(REGION_NOT_WRITABLE_MESSAGE) 41 | end 42 | 43 | def retryable_ssl_unexpected_eof?(e) 44 | e.message.include?(SSL_UNEXPECTED_EOF_MESSAGE) 45 | end 46 | 47 | # Guard for instantiation 48 | private :new 49 | def inherited(subclass) 50 | subclass.class_eval do 51 | class << self 52 | public :new 53 | end 54 | end 55 | end 56 | end 57 | 58 | attr_reader :origin 59 | 60 | def initialize(message, origin = nil) 61 | @origin = origin 62 | super(message || origin.message) 63 | end 64 | 65 | def method_missing(name, *args) 66 | if @origin 67 | @origin.send(name, *args) 68 | else 69 | super 70 | end 71 | end 72 | 73 | def reason 74 | @origin && @origin.respond_to?(:reason) ? @origin.reason : nil 75 | end 76 | 77 | def status_code 78 | @origin && @origin.respond_to?(:status_code) ? @origin.status_code : nil 79 | end 80 | 81 | def body 82 | @origin && @origin.respond_to?(:body) ? @origin.body : nil 83 | end 84 | 85 | def retryable? 86 | false 87 | end 88 | end 89 | 90 | class UnRetryableError < Error; end 91 | 92 | class RetryableError < Error 93 | def retryable? 
94 | true 95 | end 96 | end 97 | end 98 | end 99 | -------------------------------------------------------------------------------- /lib/fluent/plugin/bigquery/helper.rb: -------------------------------------------------------------------------------- 1 | module Fluent 2 | module BigQuery 3 | module Helper 4 | class << self 5 | def deep_symbolize_keys(object) 6 | case object 7 | when Hash 8 | object.each_with_object({}) do |(key, value), result| 9 | result[key.to_sym] = deep_symbolize_keys(value) 10 | end 11 | when Array 12 | object.map {|e| deep_symbolize_keys(e) } 13 | else 14 | object 15 | end 16 | end 17 | 18 | def deep_stringify_keys(object) 19 | case object 20 | when Hash 21 | object.each_with_object({}) do |(key, value), result| 22 | result[key.to_s] = deep_stringify_keys(value) 23 | end 24 | when Array 25 | object.map {|e| deep_stringify_keys(e) } 26 | else 27 | object 28 | end 29 | end 30 | end 31 | end 32 | end 33 | end 34 | -------------------------------------------------------------------------------- /lib/fluent/plugin/bigquery/schema.rb: -------------------------------------------------------------------------------- 1 | require 'multi_json' 2 | 3 | module Fluent 4 | module BigQuery 5 | class FieldSchema 6 | def initialize(name, mode = :nullable) 7 | unless [:nullable, :required, :repeated].include?(mode) 8 | raise ConfigError, "Unrecognized mode for #{name}: #{mode}" 9 | end 10 | ### https://developers.google.com/bigquery/docs/tables 11 | # Each field has the following properties: 12 | # 13 | # name - The name must contain only letters (a-z, A-Z), numbers (0-9), or underscores (_), 14 | # and must start with a letter or underscore. The maximum length is 128 characters. 15 | # https://cloud.google.com/bigquery/docs/reference/v2/tables#schema.fields.name 16 | unless name =~ /^[_A-Za-z][_A-Za-z0-9]{,127}$/ 17 | raise ConfigError, "invalid bigquery field name: '#{name}'" 18 | end 19 | 20 | @name = name 21 | @mode = mode 22 | end 23 | 24 | attr_reader :name, :mode 25 | 26 | def format(value, is_load: false) 27 | case @mode 28 | when :nullable 29 | format_one(value, is_load: is_load) unless value.nil? 30 | when :required 31 | if value.nil? 32 | log.warn "Required field #{name} cannot be null" 33 | nil 34 | else 35 | format_one(value, is_load: is_load) 36 | end 37 | when :repeated 38 | value.nil? ? 
[] : value.each_with_object([]) { |v, arr| arr << format_one(v, is_load: true) if v } 39 | end 40 | end 41 | 42 | def format_one(value, is_load: false) 43 | raise NotImplementedError, "Must implement in a subclass" 44 | end 45 | 46 | def to_h 47 | { 48 | :name => name, 49 | :type => type.to_s.upcase, 50 | :mode => mode.to_s.upcase, 51 | } 52 | end 53 | end 54 | 55 | class StringFieldSchema < FieldSchema 56 | def type 57 | :string 58 | end 59 | 60 | def format_one(value, is_load: false) 61 | if value.is_a?(Hash) || value.is_a?(Array) 62 | MultiJson.dump(value) 63 | else 64 | value.to_s 65 | end 66 | end 67 | end 68 | 69 | class JsonFieldSchema < FieldSchema 70 | def type 71 | :json 72 | end 73 | 74 | def format_one(value, is_load: false) 75 | if is_load 76 | value 77 | else 78 | MultiJson.dump(value) 79 | end 80 | end 81 | end 82 | 83 | class GeographyFieldSchema < StringFieldSchema 84 | def type 85 | :geography 86 | end 87 | end 88 | 89 | class IntegerFieldSchema < FieldSchema 90 | def type 91 | :integer 92 | end 93 | 94 | def format_one(value, is_load: false) 95 | value.to_i 96 | end 97 | end 98 | 99 | class FloatFieldSchema < FieldSchema 100 | def type 101 | :float 102 | end 103 | 104 | def format_one(value, is_load: false) 105 | value.to_f 106 | end 107 | end 108 | 109 | class NumericFieldSchema < FieldSchema 110 | def type 111 | :numeric 112 | end 113 | 114 | def format_one(value, is_load: false) 115 | value.to_s 116 | end 117 | end 118 | 119 | class BigNumericFieldSchema < FieldSchema 120 | def type 121 | :bignumeric 122 | end 123 | 124 | def format_one(value, is_load: false) 125 | value.to_s 126 | end 127 | end 128 | 129 | class BooleanFieldSchema < FieldSchema 130 | def type 131 | :boolean 132 | end 133 | 134 | def format_one(value, is_load: false) 135 | !!value 136 | end 137 | end 138 | 139 | class TimestampFieldSchema < FieldSchema 140 | INTEGER_REGEXP = /\A-?[[:digit:]]+\z/.freeze 141 | FLOAT_REGEXP = /\A-?[[:digit:]]+(\.[[:digit:]]+)\z/.freeze 142 | 143 | def type 144 | :timestamp 145 | end 146 | 147 | def format_one(value, is_load: false) 148 | case value 149 | when Time 150 | value.strftime("%Y-%m-%d %H:%M:%S.%6L %:z") 151 | when String 152 | if value =~ INTEGER_REGEXP 153 | value.to_i 154 | elsif value =~ FLOAT_REGEXP 155 | value.to_f 156 | else 157 | value 158 | end 159 | else 160 | value 161 | end 162 | end 163 | end 164 | 165 | class DateFieldSchema < FieldSchema 166 | def type 167 | :date 168 | end 169 | 170 | def format_one(value, is_load: false) 171 | if value.respond_to?(:strftime) 172 | value.strftime("%Y-%m-%d") 173 | else 174 | value 175 | end 176 | end 177 | end 178 | 179 | class DateTimeFieldSchema < FieldSchema 180 | def type 181 | :datetime 182 | end 183 | 184 | def format_one(value, is_load: false) 185 | if value.respond_to?(:strftime) 186 | value.strftime("%Y-%m-%dT%H:%M:%S.%6L") 187 | else 188 | value 189 | end 190 | end 191 | end 192 | 193 | class TimeFieldSchema < FieldSchema 194 | def type 195 | :time 196 | end 197 | 198 | def format_one(value, is_load: false) 199 | if value.respond_to?(:strftime) 200 | value.strftime("%H:%M:%S.%6L") 201 | else 202 | value 203 | end 204 | end 205 | end 206 | 207 | class RecordSchema < FieldSchema 208 | FIELD_TYPES = { 209 | string: StringFieldSchema, 210 | integer: IntegerFieldSchema, 211 | float: FloatFieldSchema, 212 | numeric: NumericFieldSchema, 213 | bignumeric: BigNumericFieldSchema, 214 | boolean: BooleanFieldSchema, 215 | timestamp: TimestampFieldSchema, 216 | date: DateFieldSchema, 217 | datetime: 
DateTimeFieldSchema, 218 | time: TimeFieldSchema, 219 | json: JsonFieldSchema, 220 | geography: GeographyFieldSchema, 221 | record: RecordSchema 222 | }.freeze 223 | 224 | def initialize(name, mode = :nullable) 225 | super(name, mode) 226 | @fields = {} 227 | end 228 | 229 | def type 230 | :record 231 | end 232 | 233 | def [](name) 234 | @fields[name] 235 | end 236 | 237 | def empty? 238 | @fields.empty? 239 | end 240 | 241 | def to_a 242 | @fields.map do |_, field_schema| 243 | field_schema.to_h 244 | end 245 | end 246 | 247 | def to_h 248 | { 249 | :name => name, 250 | :type => type.to_s.upcase, 251 | :mode => mode.to_s.upcase, 252 | :fields => self.to_a, 253 | } 254 | end 255 | 256 | def load_schema(schema) 257 | schema.each do |field| 258 | raise ConfigError, 'field must have type' unless field.key?('type') 259 | 260 | name = field['name'] 261 | mode = (field['mode'] || 'nullable').downcase.to_sym 262 | 263 | type = field['type'].downcase.to_sym 264 | field_schema_class = FIELD_TYPES[type] 265 | raise ConfigError, "Invalid field type: #{field['type']}" unless field_schema_class 266 | 267 | field_schema = field_schema_class.new(name, mode) 268 | @fields[name] = field_schema 269 | if type == :record 270 | raise ConfigError, "record field must have fields" unless field.key?('fields') 271 | field_schema.load_schema(field['fields']) 272 | end 273 | end 274 | end 275 | 276 | def register_field(name, type) 277 | if @fields.key?(name) and @fields[name].type != :timestamp 278 | raise ConfigError, "field #{name} is registered twice" 279 | end 280 | if name[/\./] 281 | recordname = $` 282 | fieldname = $' 283 | register_record_field(recordname) 284 | @fields[recordname].register_field(fieldname, type) 285 | else 286 | schema = FIELD_TYPES[type] 287 | raise ConfigError, "[Bug] Invalid field type #{type}" unless schema 288 | @fields[name] = schema.new(name) 289 | end 290 | end 291 | 292 | def format_one(record, is_load: false) 293 | out = {} 294 | record.each do |key, value| 295 | next if value.nil? 296 | schema = @fields[key] 297 | out[key] = schema ? 
schema.format(value, is_load: is_load) : value 298 | end 299 | out 300 | end 301 | 302 | private 303 | def register_record_field(name) 304 | if !@fields.key?(name) 305 | @fields[name] = RecordSchema.new(name) 306 | else 307 | unless @fields[name].kind_of?(RecordSchema) 308 | raise ConfigError, "field #{name} is required to be a record but already registered as #{@field[name]}" 309 | end 310 | end 311 | end 312 | end 313 | end 314 | end 315 | -------------------------------------------------------------------------------- /lib/fluent/plugin/bigquery/version.rb: -------------------------------------------------------------------------------- 1 | module Fluent 2 | module BigQueryPlugin 3 | VERSION = "3.3.2".freeze 4 | end 5 | end 6 | -------------------------------------------------------------------------------- /lib/fluent/plugin/bigquery/writer.rb: -------------------------------------------------------------------------------- 1 | module Fluent 2 | module BigQuery 3 | class Writer 4 | def initialize(log, auth_method, **options) 5 | @auth_method = auth_method 6 | @scope = "https://www.googleapis.com/auth/bigquery" 7 | @options = options 8 | @log = log 9 | @num_errors_per_chunk = {} 10 | end 11 | 12 | def client 13 | @client ||= Google::Apis::BigqueryV2::BigqueryService.new.tap do |cl| 14 | cl.authorization = get_auth 15 | cl.client_options.open_timeout_sec = @options[:open_timeout_sec] if @options[:open_timeout_sec] 16 | cl.client_options.read_timeout_sec = @options[:timeout_sec] if @options[:timeout_sec] 17 | cl.client_options.send_timeout_sec = @options[:timeout_sec] if @options[:timeout_sec] 18 | end 19 | end 20 | 21 | def create_table(project, dataset, table_id, record_schema) 22 | create_table_retry_limit = 3 23 | create_table_retry_wait = 1 24 | create_table_retry_count = 0 25 | table_id = safe_table_id(table_id) 26 | 27 | begin 28 | definition = { 29 | table_reference: { 30 | table_id: table_id, 31 | }, 32 | schema: { 33 | fields: record_schema.to_a, 34 | } 35 | } 36 | 37 | definition.merge!(time_partitioning: time_partitioning) if time_partitioning 38 | definition.merge!(require_partition_filter: require_partition_filter) if require_partition_filter 39 | definition.merge!(clustering: clustering) if clustering 40 | client.insert_table(project, dataset, definition, **{}) 41 | log.debug "create table", project_id: project, dataset: dataset, table: table_id 42 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 43 | message = e.message 44 | if e.status_code == 409 && /Already Exists:/ =~ message 45 | log.debug "already created table", project_id: project, dataset: dataset, table: table_id 46 | # ignore 'Already Exists' error 47 | return 48 | end 49 | 50 | log.error "tables.insert API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: message 51 | 52 | if create_table_retry_count < create_table_retry_limit 53 | sleep create_table_retry_wait 54 | create_table_retry_wait *= 2 55 | create_table_retry_count += 1 56 | retry 57 | else 58 | raise Fluent::BigQuery::UnRetryableError.new("failed to create table in bigquery", e) 59 | end 60 | end 61 | end 62 | 63 | def fetch_schema(project, dataset, table_id) 64 | res = client.get_table(project, dataset, table_id) 65 | schema = Fluent::BigQuery::Helper.deep_stringify_keys(res.schema.to_h[:fields]) 66 | log.debug "Load schema from BigQuery: #{project}:#{dataset}.#{table_id} #{schema}" 67 | 68 | schema 69 | rescue Google::Apis::ServerError, 
Google::Apis::ClientError, Google::Apis::AuthorizationError => e 70 | message = e.message 71 | log.error "tables.get API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: message 72 | nil 73 | end 74 | 75 | def insert_rows(project, dataset, table_id, rows, schema, template_suffix: nil) 76 | body = { 77 | rows: rows, 78 | skip_invalid_rows: @options[:skip_invalid_rows], 79 | ignore_unknown_values: @options[:ignore_unknown_values], 80 | } 81 | body.merge!(template_suffix: template_suffix) if template_suffix 82 | 83 | if @options[:auto_create_table] 84 | res = insert_all_table_data_with_create_table(project, dataset, table_id, body, schema) 85 | else 86 | res = client.insert_all_table_data(project, dataset, table_id, body, **{}) 87 | end 88 | log.debug "insert rows", project_id: project, dataset: dataset, table: table_id, count: rows.size 89 | 90 | if res.insert_errors && !res.insert_errors.empty? 91 | log.warn "insert errors", project_id: project, dataset: dataset, table: table_id, insert_errors: res.insert_errors.to_s 92 | if @options[:allow_retry_insert_errors] 93 | is_included_any_retryable_insert_error = res.insert_errors.any? do |insert_error| 94 | insert_error.errors.any? { |error| Fluent::BigQuery::Error.retryable_insert_errors_reason?(error.reason) } 95 | end 96 | if is_included_any_retryable_insert_error 97 | raise Fluent::BigQuery::RetryableError.new("failed to insert into bigquery(insert errors), retry") 98 | else 99 | raise Fluent::BigQuery::UnRetryableError.new("failed to insert into bigquery(insert errors), and cannot retry") 100 | end 101 | end 102 | end 103 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 104 | log.debug "insert error: #{e.message}", status_code: e.respond_to?(:status_code) ? e.status_code : nil, reason: e.respond_to?(:reason) ? e.reason : nil 105 | error_data = { project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message } 106 | wrapped = Fluent::BigQuery::Error.wrap(e) 107 | if wrapped.retryable? 108 | log.warn "tabledata.insertAll API", error_data 109 | else 110 | log.error "tabledata.insertAll API", error_data 111 | end 112 | 113 | raise wrapped 114 | end 115 | 116 | JobReference = Struct.new(:chunk_id, :chunk_id_hex, :project_id, :dataset_id, :table_id, :job_id, :location) do 117 | def as_hash(*keys) 118 | if keys.empty? 119 | to_h 120 | else 121 | to_h.select { |k, _| keys.include?(k) } 122 | end 123 | end 124 | end 125 | 126 | def create_load_job(chunk_id, chunk_id_hex, project, dataset, table_id, upload_source, fields) 127 | configuration = { 128 | configuration: { 129 | load: { 130 | destination_table: { 131 | project_id: project, 132 | dataset_id: dataset, 133 | table_id: table_id, 134 | }, 135 | write_disposition: "WRITE_APPEND", 136 | source_format: source_format, 137 | ignore_unknown_values: @options[:ignore_unknown_values], 138 | max_bad_records: @options[:max_bad_records], 139 | } 140 | } 141 | } 142 | 143 | job_id = create_job_id(chunk_id_hex, dataset, table_id, fields.to_a) if @options[:prevent_duplicate_load] 144 | configuration.merge!({job_reference: {project_id: project, job_id: job_id}}) if job_id 145 | 146 | begin 147 | # Check table existance and use its location for the result when the load jobs is duplicated. 
148 | table = client.get_table(project, dataset, table_id) 149 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 150 | if e.status_code == 404 && /Not Found: Table/i =~ e.message 151 | raise Fluent::BigQuery::UnRetryableError.new("Table is not found") unless @options[:auto_create_table] 152 | raise Fluent::BigQuery::UnRetryableError.new("Schema is empty") if fields.empty? 153 | configuration[:configuration][:load].merge!(schema: {fields: fields.to_a}) 154 | configuration[:configuration][:load].merge!(time_partitioning: time_partitioning) if time_partitioning 155 | configuration[:configuration][:load].merge!(clustering: clustering) if clustering 156 | end 157 | end 158 | 159 | res = client.insert_job( 160 | project, 161 | configuration, 162 | upload_source: upload_source, 163 | content_type: "application/octet-stream", 164 | ) 165 | JobReference.new(chunk_id, chunk_id_hex, project, dataset, table_id, res.job_reference.job_id, res.job_reference.location) 166 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 167 | log.error "job.load API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message 168 | 169 | if job_id && e.status_code == 409 && e.message =~ /Job/ # duplicate load job 170 | # If a load job is duplicated, the API response may not be available to create the result. 171 | # Therefore, we need to use the location of the table instead of the job's location to determine the result. 172 | return JobReference.new(chunk_id, chunk_id_hex, project, dataset, table_id, job_id, table.location) 173 | end 174 | 175 | raise Fluent::BigQuery::Error.wrap(e) 176 | end 177 | 178 | def fetch_load_job(job_reference) 179 | project = job_reference.project_id 180 | job_id = job_reference.job_id 181 | location = job_reference.location 182 | 183 | res = client.get_job(project, job_id, location: location) 184 | log.debug "load job fetched", id: job_id, state: res.status.state, **job_reference.as_hash(:project_id, :dataset_id, :table_id) 185 | 186 | if res.status.state == "DONE" 187 | res 188 | end 189 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 190 | e = Fluent::BigQuery::Error.wrap(e) 191 | raise e unless e.retryable? 
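# When the error is retryable we fall through and return nil: the job state could not be confirmed as DONE on this attempt.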
192 | end 193 | 194 | def commit_load_job(chunk_id_hex, response) 195 | job_id = response.id 196 | project = response.configuration.load.destination_table.project_id 197 | dataset = response.configuration.load.destination_table.dataset_id 198 | table_id = response.configuration.load.destination_table.table_id 199 | 200 | errors = response.status.errors 201 | if errors 202 | errors.each do |e| 203 | log.error "job.load API (rows)", job_id: job_id, project_id: project, dataset: dataset, table: table_id, message: e.message, reason: e.reason 204 | end 205 | end 206 | 207 | error_result = response.status.error_result 208 | if error_result 209 | log.error "job.load API (result)", job_id: job_id, project_id: project, dataset: dataset, table: table_id, message: error_result.message, reason: error_result.reason 210 | if Fluent::BigQuery::Error.retryable_error_reason?(error_result.reason) 211 | @num_errors_per_chunk[chunk_id_hex] = @num_errors_per_chunk[chunk_id_hex].to_i + 1 212 | raise Fluent::BigQuery::RetryableError.new("failed to load into bigquery, retry") 213 | else 214 | @num_errors_per_chunk.delete(chunk_id_hex) 215 | raise Fluent::BigQuery::UnRetryableError.new("failed to load into bigquery, and cannot retry") 216 | end 217 | end 218 | 219 | # `stats` can be nil if we receive a warning like "Warning: Load job succeeded with data imported, however statistics may be lost due to internal error." 220 | stats = response.statistics.load 221 | duration = (response.statistics.end_time - response.statistics.creation_time) / 1000.0 222 | log.debug "load job finished", id: job_id, state: response.status.state, input_file_bytes: stats&.input_file_bytes, input_files: stats&.input_files, output_bytes: stats&.output_bytes, output_rows: stats&.output_rows, bad_records: stats&.bad_records, duration: duration.round(2), project_id: project, dataset: dataset, table: table_id 223 | @num_errors_per_chunk.delete(chunk_id_hex) 224 | end 225 | 226 | private 227 | 228 | def log 229 | @log 230 | end 231 | 232 | def get_auth 233 | case @auth_method 234 | when :private_key 235 | get_auth_from_private_key 236 | when :compute_engine 237 | get_auth_from_compute_engine 238 | when :json_key 239 | get_auth_from_json_key 240 | when :application_default 241 | get_auth_from_application_default 242 | else 243 | raise ConfigError, "Unknown auth method: #{@auth_method}" 244 | end 245 | end 246 | 247 | def get_auth_from_private_key 248 | require 'google/api_client/auth/key_utils' 249 | private_key_path = @options[:private_key_path] 250 | private_key_passphrase = @options[:private_key_passphrase] 251 | email = @options[:email] 252 | 253 | key = Google::APIClient::KeyUtils.load_from_pkcs12(private_key_path, private_key_passphrase) 254 | Signet::OAuth2::Client.new( 255 | token_credential_uri: "https://accounts.google.com/o/oauth2/token", 256 | audience: "https://accounts.google.com/o/oauth2/token", 257 | scope: @scope, 258 | issuer: email, 259 | signing_key: key 260 | ) 261 | end 262 | 263 | def get_auth_from_compute_engine 264 | Google::Auth::GCECredentials.new 265 | end 266 | 267 | def get_auth_from_json_key 268 | json_key = @options[:json_key] 269 | 270 | begin 271 | JSON.parse(json_key) 272 | key = StringIO.new(json_key) 273 | Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: key, scope: @scope) 274 | rescue JSON::ParserError 275 | key = json_key 276 | File.open(json_key) do |f| 277 | Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: f, scope: @scope) 278 | end 279 | end 280 | end 281 | 282 | def 
get_auth_from_application_default 283 | Google::Auth.get_application_default([@scope]) 284 | end 285 | 286 | def safe_table_id(table_id) 287 | table_id.gsub(/\$\d+$/, "") 288 | end 289 | 290 | def create_job_id(chunk_id_hex, dataset, table, schema) 291 | job_id_key = "#{chunk_id_hex}#{dataset}#{table}#{schema.to_s}#{@options[:max_bad_records]}#{@options[:ignore_unknown_values]}#{@num_errors_per_chunk[chunk_id_hex]}" 292 | @log.debug "job_id_key: #{job_id_key}" 293 | "fluentd_job_" + Digest::SHA1.hexdigest(job_id_key) 294 | end 295 | 296 | def source_format 297 | case @options[:source_format] 298 | when :json 299 | "NEWLINE_DELIMITED_JSON" 300 | when :avro 301 | "AVRO" 302 | when :csv 303 | "CSV" 304 | else 305 | "NEWLINE_DELIMITED_JSON" 306 | end 307 | end 308 | 309 | def time_partitioning 310 | return @time_partitioning if instance_variable_defined?(:@time_partitioning) 311 | 312 | if @options[:time_partitioning_type] 313 | @time_partitioning = { 314 | type: @options[:time_partitioning_type].to_s.upcase, 315 | field: @options[:time_partitioning_field] ? @options[:time_partitioning_field].to_s : nil, 316 | expiration_ms: @options[:time_partitioning_expiration] ? @options[:time_partitioning_expiration] * 1000 : nil, 317 | }.reject { |_, v| v.nil? } 318 | else 319 | @time_partitioning 320 | end 321 | end 322 | 323 | def require_partition_filter 324 | return @require_partition_filter if instance_variable_defined?(:@require_partition_filter) 325 | 326 | if @options[:require_partition_filter] 327 | @require_partition_filter = @options[:require_partition_filter] 328 | else 329 | @require_partition_filter 330 | end 331 | end 332 | 333 | def clustering 334 | return @clustering if instance_variable_defined?(:@clustering) 335 | 336 | if @options[:clustering_fields] 337 | @clustering = { 338 | fields: @options[:clustering_fields] 339 | } 340 | else 341 | @clustering 342 | end 343 | end 344 | 345 | def insert_all_table_data_with_create_table(project, dataset, table_id, body, schema) 346 | try_count ||= 1 347 | res = client.insert_all_table_data(project, dataset, table_id, body, **{}) 348 | rescue Google::Apis::ClientError => e 349 | if e.status_code == 404 && /Not Found: Table/i =~ e.message 350 | if try_count == 1 351 | # Table Not Found: Auto Create Table 352 | create_table(project, dataset, table_id, schema) 353 | elsif try_count > 60 # timeout in about 300 seconds 354 | raise "A new table was created but it is not found." 355 | end 356 | 357 | # Retry to insert several times because the created table is not visible from Streaming insert for a little while 358 | # cf. 
https://cloud.google.com/bigquery/troubleshooting-errors#metadata-errors-for-streaming-inserts 359 | try_count += 1 360 | sleep 5 361 | log.debug "Retry to insert rows", project_id: project, dataset: dataset, table: table_id 362 | retry 363 | end 364 | raise 365 | end 366 | end 367 | end 368 | end 369 | -------------------------------------------------------------------------------- /lib/fluent/plugin/out_bigquery_base.rb: -------------------------------------------------------------------------------- 1 | require 'fluent/plugin/output' 2 | 3 | require 'fluent/plugin/bigquery/version' 4 | 5 | require 'fluent/plugin/bigquery/helper' 6 | require 'fluent/plugin/bigquery/errors' 7 | require 'fluent/plugin/bigquery/schema' 8 | require 'fluent/plugin/bigquery/writer' 9 | 10 | require 'multi_json' 11 | require 'google/apis/bigquery_v2' 12 | require 'googleauth' 13 | 14 | module Fluent 15 | module Plugin 16 | # This class is abstract class 17 | class BigQueryBaseOutput < Output 18 | helpers :inject, :formatter 19 | 20 | # Available methods are: 21 | # * private_key -- Use service account credential from pkcs12 private key file 22 | # * compute_engine -- Use access token available in instances of ComputeEngine 23 | # * json_key -- Use service account credential from JSON key 24 | # * application_default -- Use application default credential 25 | config_param :auth_method, :enum, list: [:private_key, :compute_engine, :json_key, :application_default], default: :private_key 26 | 27 | ### Service Account credential 28 | config_param :email, :string, default: nil 29 | config_param :private_key_path, :string, default: nil 30 | config_param :private_key_passphrase, :string, default: 'notasecret', secret: true 31 | config_param :json_key, default: nil, secret: true 32 | 33 | # see as simple reference 34 | # https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb 35 | config_param :project, :string 36 | 37 | # dataset_name 38 | # The name can be up to 1,024 characters long, and consist of A-Z, a-z, 0-9, and the underscore, 39 | # but it cannot start with a number or underscore, or have spaces. 40 | config_param :dataset, :string 41 | 42 | # table_id 43 | # In Table ID, enter a name for your new table. Naming rules are the same as for your dataset. 44 | config_param :table, :string, default: nil 45 | config_param :tables, :array, value_type: :string, default: nil 46 | 47 | config_param :auto_create_table, :bool, default: false 48 | 49 | # ignore_unknown_values 50 | # Accept rows that contain values that do not match the schema. The unknown values are ignored. 51 | # Default is false, which treats unknown values as errors. 
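# For reference, a minimal configuration sketch (values are placeholders, not defaults):
#
#   <match my.logs.**>
#     @type bigquery_insert
#     auth_method json_key
#     json_key /path/to/service_account.json
#     project yourproject_id
#     dataset yourdataset_id
#     table   access_log
#     fetch_schema true
#     ignore_unknown_values true
#   </match>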
52 | config_param :ignore_unknown_values, :bool, default: false 53 | 54 | config_param :schema, :array, default: nil 55 | config_param :schema_path, :string, default: nil 56 | config_param :fetch_schema, :bool, default: false 57 | config_param :fetch_schema_table, :string, default: nil 58 | config_param :schema_cache_expire, :time, default: 600 59 | 60 | ## Timeout 61 | # request_timeout_sec 62 | # Bigquery API response timeout 63 | # request_open_timeout_sec 64 | # Bigquery API connection, and request timeout 65 | config_param :request_timeout_sec, :time, default: nil 66 | config_param :request_open_timeout_sec, :time, default: 60 67 | 68 | ## Partitioning 69 | config_param :time_partitioning_type, :enum, list: [:day, :hour], default: nil 70 | config_param :time_partitioning_field, :string, default: nil 71 | config_param :time_partitioning_expiration, :time, default: nil 72 | 73 | ## Clustering 74 | config_param :clustering_fields, :array, default: nil 75 | 76 | ## Formatter 77 | config_section :format do 78 | config_set_default :@type, 'json' 79 | end 80 | 81 | def configure(conf) 82 | super 83 | 84 | case @auth_method 85 | when :private_key 86 | unless @email && @private_key_path 87 | raise Fluent::ConfigError, "'email' and 'private_key_path' must be specified if auth_method == 'private_key'" 88 | end 89 | when :compute_engine 90 | # Do nothing 91 | when :json_key 92 | unless @json_key 93 | raise Fluent::ConfigError, "'json_key' must be specified if auth_method == 'json_key'" 94 | end 95 | when :application_default 96 | # Do nothing 97 | else 98 | raise Fluent::ConfigError, "unrecognized 'auth_method': #{@auth_method}" 99 | end 100 | 101 | unless @table.nil? ^ @tables.nil? 102 | raise Fluent::ConfigError, "'table' or 'tables' must be specified, and both are invalid" 103 | end 104 | 105 | @tablelist = @tables ? @tables : [@table] 106 | 107 | @table_schema = Fluent::BigQuery::RecordSchema.new('record') 108 | if @schema 109 | @table_schema.load_schema(@schema) 110 | end 111 | 112 | formatter_config = conf.elements("format")[0] 113 | @formatter = formatter_create(usage: 'out_bigquery_for_insert', default_type: 'json', conf: formatter_config) 114 | end 115 | 116 | def start 117 | super 118 | 119 | @tables_queue = @tablelist.shuffle 120 | @tables_mutex = Mutex.new 121 | @fetched_schemas = {} 122 | @last_fetch_schema_time = Hash.new(0) 123 | @read_schemas = {} 124 | end 125 | 126 | def multi_workers_ready? 127 | true 128 | end 129 | 130 | def writer 131 | @writer ||= Fluent::BigQuery::Writer.new(@log, @auth_method, 132 | private_key_path: @private_key_path, private_key_passphrase: @private_key_passphrase, 133 | email: @email, 134 | json_key: @json_key, 135 | source_format: @source_format, 136 | skip_invalid_rows: @skip_invalid_rows, 137 | ignore_unknown_values: @ignore_unknown_values, 138 | max_bad_records: @max_bad_records, 139 | allow_retry_insert_errors: @allow_retry_insert_errors, 140 | prevent_duplicate_load: @prevent_duplicate_load, 141 | auto_create_table: @auto_create_table, 142 | time_partitioning_type: @time_partitioning_type, 143 | time_partitioning_field: @time_partitioning_field, 144 | time_partitioning_expiration: @time_partitioning_expiration, 145 | require_partition_filter: @require_partition_filter, 146 | clustering_fields: @clustering_fields, 147 | timeout_sec: @request_timeout_sec, 148 | open_timeout_sec: @request_open_timeout_sec, 149 | ) 150 | end 151 | 152 | def format(tag, time, record) 153 | if record.nil? 154 | log.warn("nil record detected. corrupted chunks? 
tag=#{tag}, time=#{time}") 155 | return 156 | end 157 | 158 | record = inject_values_to_record(tag, time, record) 159 | 160 | meta = metadata(tag, time, record) 161 | schema = 162 | if @fetch_schema 163 | fetch_schema(meta) 164 | elsif @schema_path 165 | read_schema(meta) 166 | else 167 | @table_schema 168 | end 169 | 170 | begin 171 | row = schema.format(record, is_load: !!@is_load) 172 | return if row.empty? 173 | @formatter.format(tag, time, row) 174 | rescue 175 | log.error("format error", record: record, schema: schema) 176 | raise 177 | end 178 | end 179 | 180 | def write(chunk) 181 | end 182 | 183 | def fetch_schema(metadata) 184 | table_id = nil 185 | project = extract_placeholders(@project, metadata) 186 | dataset = extract_placeholders(@dataset, metadata) 187 | table_id = fetch_schema_target_table(metadata) 188 | 189 | if Fluent::Engine.now - @last_fetch_schema_time["#{project}.#{dataset}.#{table_id}"] > @schema_cache_expire 190 | schema = writer.fetch_schema(project, dataset, table_id) 191 | 192 | if schema 193 | table_schema = Fluent::BigQuery::RecordSchema.new("record") 194 | table_schema.load_schema(schema) 195 | @fetched_schemas["#{project}.#{dataset}.#{table_id}"] = table_schema 196 | else 197 | if @fetched_schemas["#{project}.#{dataset}.#{table_id}"].nil? 198 | raise "failed to fetch schema from bigquery" 199 | else 200 | log.warn "#{table_id} uses previous schema" 201 | end 202 | end 203 | 204 | @last_fetch_schema_time["#{project}.#{dataset}.#{table_id}"] = Fluent::Engine.now 205 | end 206 | 207 | @fetched_schemas["#{project}.#{dataset}.#{table_id}"] 208 | end 209 | 210 | def fetch_schema_target_table(metadata) 211 | extract_placeholders(@fetch_schema_table || @tablelist[0], metadata) 212 | end 213 | 214 | def read_schema(metadata) 215 | schema_path = read_schema_target_path(metadata) 216 | 217 | unless @read_schemas[schema_path] 218 | table_schema = Fluent::BigQuery::RecordSchema.new("record") 219 | table_schema.load_schema(MultiJson.load(File.read(schema_path))) 220 | @read_schemas[schema_path] = table_schema 221 | end 222 | @read_schemas[schema_path] 223 | end 224 | 225 | def read_schema_target_path(metadata) 226 | extract_placeholders(@schema_path, metadata) 227 | end 228 | 229 | def get_schema(project, dataset, metadata) 230 | if @fetch_schema 231 | @fetched_schemas["#{project}.#{dataset}.#{fetch_schema_target_table(metadata)}"] || fetch_schema(metadata) 232 | elsif @schema_path 233 | @read_schemas[read_schema_target_path(metadata)] || read_schema(metadata) 234 | else 235 | @table_schema 236 | end 237 | end 238 | end 239 | end 240 | end 241 | -------------------------------------------------------------------------------- /lib/fluent/plugin/out_bigquery_insert.rb: -------------------------------------------------------------------------------- 1 | require 'fluent/plugin/out_bigquery_base' 2 | 3 | module Fluent 4 | module Plugin 5 | class BigQueryInsertOutput < BigQueryBaseOutput 6 | Fluent::Plugin.register_output('bigquery_insert', self) 7 | 8 | helpers :record_accessor 9 | 10 | # template_suffix (only insert) 11 | # https://cloud.google.com/bigquery/streaming-data-into-bigquery#template_table_details 12 | config_param :template_suffix, :string, default: nil 13 | 14 | # skip_invalid_rows (only insert) 15 | # Insert all valid rows of a request, even if invalid rows exist. 16 | # The default value is false, which causes the entire request to fail if any invalid rows exist. 
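# This flag (together with `ignore_unknown_values`) is handed straight to the
# tabledata.insertAll request body built in Fluent::BigQuery::Writer#insert_rows, roughly:
#   { rows: [...], skip_invalid_rows: @options[:skip_invalid_rows], ignore_unknown_values: @options[:ignore_unknown_values] }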
17 | config_param :skip_invalid_rows, :bool, default: false 18 | 19 | # insert_id_field (only insert) 20 | config_param :insert_id_field, :string, default: nil 21 | 22 | # add_insert_timestamp (only insert) 23 | # adds a timestamp just before sending the rows to bigquery, so that 24 | # buffering time is not taken into account. Gives a field in bigquery 25 | # which represents the insert time of the row. 26 | config_param :add_insert_timestamp, :string, default: nil 27 | 28 | # allow_retry_insert_errors (only insert) 29 | # If insert_id_field is not specified, true means to allow duplicate rows 30 | config_param :allow_retry_insert_errors, :bool, default: false 31 | 32 | ## RequirePartitionFilter 33 | config_param :require_partition_filter, :bool, default: false 34 | 35 | ## Buffer 36 | config_section :buffer do 37 | config_set_default :@type, "memory" 38 | config_set_default :flush_mode, :interval 39 | config_set_default :flush_interval, 1 40 | config_set_default :flush_thread_interval, 0.05 41 | config_set_default :flush_thread_burst_interval, 0.05 42 | config_set_default :chunk_limit_size, 1 * 1024 ** 2 # 1MB 43 | config_set_default :total_limit_size, 1 * 1024 ** 3 # 1GB 44 | config_set_default :chunk_limit_records, 500 45 | end 46 | 47 | def configure(conf) 48 | super 49 | @is_load = false 50 | 51 | if @insert_id_field 52 | if @insert_id_field !~ /^\$[\[\.]/ && @insert_id_field =~ /\./ 53 | warn "[BREAKING CHANGE] insert_id_field format is changed. Use fluentd record_accessor helper. (https://docs.fluentd.org/v1.0/articles/api-plugin-helper-record_accessor)" 54 | end 55 | @get_insert_id = record_accessor_create(@insert_id_field) 56 | end 57 | 58 | formatter_config = conf.elements("format")[0] 59 | if formatter_config && formatter_config['@type'] != "json" 60 | raise ConfigError, "`bigquery_insert` supports only json formatter." 61 | end 62 | @formatter = formatter_create(usage: 'out_bigquery_for_insert', type: 'json', conf: formatter_config) 63 | 64 | placeholder_params = "project=#{@project}/dataset=#{@dataset}/table=#{@tablelist.join(",")}/fetch_schema_table=#{@fetch_schema_table}/template_suffix=#{@template_suffix}" 65 | placeholder_validate!(:bigquery_insert, placeholder_params) 66 | end 67 | 68 | # for Fluent::Plugin::Output#implement? method 69 | def format(tag, time, record) 70 | super 71 | end 72 | 73 | def write(chunk) 74 | table_format = @tables_mutex.synchronize do 75 | t = @tables_queue.shift 76 | @tables_queue.push t 77 | t 78 | end 79 | 80 | now = Time.now.utc.strftime("%Y-%m-%d %H:%M:%S.%6N") if @add_insert_timestamp 81 | 82 | rows = chunk.open do |io| 83 | io.map do |line| 84 | record = MultiJson.load(line) 85 | record[@add_insert_timestamp] = now if @add_insert_timestamp 86 | row = {"json" => record} 87 | row["insert_id"] = @get_insert_id.call(record) if @get_insert_id 88 | Fluent::BigQuery::Helper.deep_symbolize_keys(row) 89 | end 90 | end 91 | 92 | metadata = chunk.metadata 93 | project = extract_placeholders(@project, metadata) 94 | dataset = extract_placeholders(@dataset, metadata) 95 | table_id = extract_placeholders(table_format, metadata) 96 | template_suffix = @template_suffix ? 
extract_placeholders(@template_suffix, metadata) : nil 97 | schema = get_schema(project, dataset, metadata) 98 | 99 | insert(project, dataset, table_id, rows, schema, template_suffix) 100 | rescue MultiJson::ParseError => e 101 | raise Fluent::UnrecoverableError.new(e) 102 | end 103 | 104 | def insert(project, dataset, table_id, rows, schema, template_suffix) 105 | writer.insert_rows(project, dataset, table_id, rows, schema, template_suffix: template_suffix) 106 | rescue Fluent::BigQuery::Error => e 107 | raise if e.retryable? 108 | 109 | if @secondary 110 | # TODO: find better way 111 | @retry = retry_state_create( 112 | :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout, 113 | forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base, 114 | max_interval: @buffer_config.retry_max_interval, 115 | secondary: true, secondary_threshold: Float::EPSILON, 116 | randomize: @buffer_config.retry_randomize 117 | ) 118 | else 119 | @retry = retry_state_create( 120 | :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout, 121 | forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base, 122 | max_interval: @buffer_config.retry_max_interval, 123 | randomize: @buffer_config.retry_randomize 124 | ) 125 | end 126 | 127 | raise 128 | end 129 | end 130 | end 131 | end 132 | -------------------------------------------------------------------------------- /lib/fluent/plugin/out_bigquery_load.rb: -------------------------------------------------------------------------------- 1 | require 'fluent/plugin/out_bigquery_base' 2 | 3 | module Fluent 4 | module Plugin 5 | class BigQueryLoadOutput < BigQueryBaseOutput 6 | Fluent::Plugin.register_output('bigquery_load', self) 7 | 8 | helpers :timer 9 | 10 | config_param :source_format, :enum, list: [:json, :avro, :csv], default: :json 11 | 12 | # max_bad_records (only load) 13 | # The maximum number of bad records that BigQuery can ignore when running the job. 14 | # If the number of bad records exceeds this value, an invalid error is returned in the job result. 15 | # The default value is 0, which requires that all records are valid. 
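# These load-only options end up in the job configuration assembled by
# Fluent::BigQuery::Writer#create_load_job; with the defaults it looks roughly like:
#   load: { write_disposition: "WRITE_APPEND", source_format: "NEWLINE_DELIMITED_JSON",
#           ignore_unknown_values: false, max_bad_records: 0, ... }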
16 | config_param :max_bad_records, :integer, default: 0 17 | 18 | # prevent_duplicate_load (only load) 19 | config_param :prevent_duplicate_load, :bool, default: false 20 | 21 | config_param :use_delayed_commit, :bool, default: true 22 | config_param :wait_job_interval, :time, default: 3 23 | 24 | ## Buffer 25 | config_section :buffer do 26 | config_set_default :@type, "file" 27 | config_set_default :flush_mode, :interval 28 | config_set_default :flush_interval, 3600 # 1h 29 | config_set_default :flush_thread_interval, 5 30 | config_set_default :flush_thread_burst_interval, 5 31 | config_set_default :chunk_limit_size, 1 * 1024 ** 3 # 1GB 32 | config_set_default :total_limit_size, 32 * 1024 ** 3 # 32GB 33 | 34 | config_set_default :delayed_commit_timeout, 1800 # 30m 35 | end 36 | 37 | def configure(conf) 38 | super 39 | @is_load = true 40 | 41 | placeholder_params = "project=#{@project}/dataset=#{@dataset}/table=#{@tablelist.join(",")}/fetch_schema_table=#{@fetch_schema_table}" 42 | placeholder_validate!(:bigquery_load, placeholder_params) 43 | end 44 | 45 | def start 46 | super 47 | 48 | if prefer_delayed_commit 49 | @polling_targets = [] 50 | @polling_mutex = Mutex.new 51 | log.debug("start load job polling") 52 | timer_execute(:polling_bigquery_load_job, @wait_job_interval, &method(:poll)) 53 | end 54 | end 55 | 56 | def prefer_delayed_commit 57 | @use_delayed_commit 58 | end 59 | 60 | # for Fluent::Plugin::Output#implement? method 61 | def format(tag, time, record) 62 | super 63 | end 64 | 65 | def write(chunk) 66 | job_reference = do_write(chunk) 67 | 68 | until response = writer.fetch_load_job(job_reference) 69 | sleep @wait_job_interval 70 | end 71 | 72 | writer.commit_load_job(job_reference.chunk_id_hex, response) 73 | rescue Fluent::BigQuery::Error => e 74 | raise if e.retryable? 75 | 76 | @retry_mutex.synchronize do 77 | if @secondary 78 | # TODO: find better way 79 | @retry = retry_state_create( 80 | :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout, 81 | forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base, 82 | max_interval: @buffer_config.retry_max_interval, 83 | secondary: true, secondary_threshold: Float::EPSILON, 84 | randomize: @buffer_config.retry_randomize 85 | ) 86 | else 87 | @retry = retry_state_create( 88 | :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout, 89 | forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base, 90 | max_interval: @buffer_config.retry_max_interval, 91 | randomize: @buffer_config.retry_randomize 92 | ) 93 | end 94 | end 95 | 96 | raise 97 | end 98 | 99 | def try_write(chunk) 100 | job_reference = do_write(chunk) 101 | @polling_mutex.synchronize do 102 | @polling_targets << job_reference 103 | end 104 | rescue Fluent::BigQuery::Error => e 105 | raise if e.retryable? 
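# non-retryable: as in #write above, build a custom retry state (secondary-aware when
# <secondary> is configured, otherwise with max_steps: 0) and re-raise so the chunk is
# taken back and handled by the next try_flush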
106 | 107 | @retry_mutex.synchronize do 108 | if @secondary 109 | # TODO: find better way 110 | @retry = retry_state_create( 111 | :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout, 112 | forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base, 113 | max_interval: @buffer_config.retry_max_interval, 114 | secondary: true, secondary_threshold: Float::EPSILON, 115 | randomize: @buffer_config.retry_randomize 116 | ) 117 | else 118 | @retry = retry_state_create( 119 | :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout, 120 | forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base, 121 | max_interval: @buffer_config.retry_max_interval, 122 | randomize: @buffer_config.retry_randomize 123 | ) 124 | end 125 | end 126 | 127 | raise 128 | end 129 | 130 | private 131 | 132 | def do_write(chunk) 133 | table_format = @tables_mutex.synchronize do 134 | t = @tables_queue.shift 135 | @tables_queue.push t 136 | t 137 | end 138 | 139 | metadata = chunk.metadata 140 | project = extract_placeholders(@project, metadata) 141 | dataset = extract_placeholders(@dataset, metadata) 142 | table_id = extract_placeholders(table_format, metadata) 143 | schema = get_schema(project, dataset, metadata) 144 | 145 | create_upload_source(chunk) do |upload_source| 146 | writer.create_load_job(chunk.unique_id, dump_unique_id_hex(chunk.unique_id), project, dataset, table_id, upload_source, schema) 147 | end 148 | end 149 | 150 | def poll 151 | job_reference = @polling_mutex.synchronize do 152 | @polling_targets.shift 153 | end 154 | return unless job_reference 155 | 156 | begin 157 | response = writer.fetch_load_job(job_reference) 158 | if response 159 | writer.commit_load_job(job_reference.chunk_id_hex, response) 160 | commit_write(job_reference.chunk_id) 161 | log.debug("commit chunk", chunk: job_reference.chunk_id_hex, **job_reference.as_hash(:job_id, :project_id, :dataset_id, :table_id)) 162 | else 163 | @polling_mutex.synchronize do 164 | @polling_targets << job_reference 165 | end 166 | end 167 | rescue Fluent::BigQuery::Error => e 168 | # RetryableError comes only from `commit_load_job` 169 | # if the error is retryable, take back the chunk and do the next `try_flush` 170 | # if the error is not retryable, create a custom retry_state, take back the chunk, and do the next `try_flush` 171 | if e.retryable? 
172 | log.warn("failed to poll load job", error: e, chunk: job_reference.chunk_id_hex, **job_reference.as_hash(:job_id, :project_id, :dataset_id, :table_id)) 173 | else 174 | log.error("failed to poll load job", error: e, chunk: job_reference.chunk_id_hex, **job_reference.as_hash(:job_id, :project_id, :dataset_id, :table_id)) 175 | @retry_mutex.synchronize do 176 | if @secondary 177 | # TODO: find better way 178 | @retry = retry_state_create( 179 | :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout, 180 | forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base, 181 | max_interval: @buffer_config.retry_max_interval, 182 | secondary: true, secondary_threshold: Float::EPSILON, 183 | randomize: @buffer_config.retry_randomize 184 | ) 185 | else 186 | @retry = retry_state_create( 187 | :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout, 188 | forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base, 189 | max_interval: @buffer_config.retry_max_interval, 190 | randomize: @buffer_config.retry_randomize 191 | ) 192 | end 193 | end 194 | end 195 | 196 | rollback_write(job_reference.chunk_id) 197 | rescue => e 198 | log.error("unexpected error while polling", error: e) 199 | log.error_backtrace 200 | rollback_write(job_reference.chunk_id) 201 | end 202 | end 203 | 204 | def create_upload_source(chunk) 205 | chunk_is_file = @buffer_config["@type"] == 'file' 206 | if chunk_is_file 207 | File.open(chunk.path) do |file| 208 | yield file 209 | end 210 | else 211 | Tempfile.open("chunk-tmp") do |file| 212 | file.binmode 213 | chunk.write_to(file) 214 | file.sync 215 | file.rewind 216 | yield file 217 | end 218 | end 219 | end 220 | end 221 | end 222 | end 223 | -------------------------------------------------------------------------------- /test/helper.rb: -------------------------------------------------------------------------------- 1 | require 'bundler/setup' 2 | require 'test/unit' 3 | 4 | $LOAD_PATH.unshift(File.join(__dir__, '..', 'lib')) 5 | $LOAD_PATH.unshift(__dir__) 6 | require 'fluent/test' 7 | 8 | require 'fluent/plugin/buffer' 9 | require 'fluent/plugin/buf_memory' 10 | require 'fluent/plugin/buf_file' 11 | require 'fluent/test/driver/output' 12 | 13 | require 'fluent/plugin/out_bigquery_base' 14 | require 'fluent/plugin/out_bigquery_insert' 15 | require 'fluent/plugin/out_bigquery_load' 16 | require 'google/apis/bigquery_v2' 17 | require 'google/api_client/auth/key_utils' 18 | require 'googleauth' 19 | 20 | require 'test/unit/rr' 21 | -------------------------------------------------------------------------------- /test/plugin/test_out_bigquery_base.rb: -------------------------------------------------------------------------------- 1 | require 'helper' 2 | 3 | class BigQueryBaseOutputTest < Test::Unit::TestCase 4 | def setup 5 | Fluent::Test.setup 6 | end 7 | 8 | CONFIG = %[ 9 | table foo 10 | email foo@bar.example 11 | private_key_path /path/to/key 12 | project yourproject_id 13 | dataset yourdataset_id 14 | 15 | 16 | time_format %s 17 | time_key time 18 | 19 | 20 | schema [ 21 | {"name": "time", "type": "INTEGER"}, 22 | {"name": "status", "type": "INTEGER"}, 23 | {"name": "bytes", "type": "INTEGER"}, 24 | {"name": "vhost", "type": "STRING"}, 25 | {"name": "path", "type": "STRING"}, 26 | {"name": "method", "type": "STRING"}, 27 | {"name": "protocol", "type": "STRING"}, 28 | {"name": "agent", 
"type": "STRING"}, 29 | {"name": "referer", "type": "STRING"}, 30 | {"name": "remote", "type": "RECORD", "fields": [ 31 | {"name": "host", "type": "STRING"}, 32 | {"name": "ip", "type": "STRING"}, 33 | {"name": "user", "type": "STRING"} 34 | ]}, 35 | {"name": "requesttime", "type": "FLOAT"}, 36 | {"name": "bot_access", "type": "BOOLEAN"}, 37 | {"name": "loginsession", "type": "BOOLEAN"} 38 | ] 39 | ] 40 | 41 | API_SCOPE = "https://www.googleapis.com/auth/bigquery" 42 | 43 | def create_driver(conf = CONFIG) 44 | Fluent::Test::Driver::Output.new(Fluent::Plugin::BigQueryBaseOutput).configure(conf) 45 | end 46 | 47 | def stub_writer(stub_auth: true) 48 | stub.proxy(Fluent::BigQuery::Writer).new.with_any_args do |writer| 49 | stub(writer).get_auth { nil } if stub_auth 50 | yield writer 51 | writer 52 | end 53 | end 54 | 55 | private def sudo_schema_response 56 | { 57 | "schema" => { 58 | "fields" => [ 59 | { 60 | "name" => "time", 61 | "type" => "TIMESTAMP", 62 | "mode" => "REQUIRED" 63 | }, 64 | { 65 | "name" => "tty", 66 | "type" => "STRING", 67 | "mode" => "NULLABLE" 68 | }, 69 | { 70 | "name" => "pwd", 71 | "type" => "STRING", 72 | "mode" => "REQUIRED" 73 | }, 74 | { 75 | "name" => "user", 76 | "type" => "STRING", 77 | "mode" => "REQUIRED" 78 | }, 79 | { 80 | "name" => "argv", 81 | "type" => "STRING", 82 | "mode" => "REPEATED" 83 | } 84 | ] 85 | } 86 | } 87 | end 88 | 89 | def test_configure_table 90 | driver = create_driver 91 | assert_equal driver.instance.table, 'foo' 92 | assert_nil driver.instance.tables 93 | 94 | driver = create_driver(CONFIG.sub(/\btable\s+.*$/, 'tables foo,bar')) 95 | assert_nil driver.instance.table 96 | assert_equal driver.instance.tables, ['foo' ,'bar'] 97 | 98 | assert_raise(Fluent::ConfigError, "'table' or 'tables' must be specified, and both are invalid") { 99 | create_driver(CONFIG + "tables foo,bar") 100 | } 101 | end 102 | 103 | def test_configure_auth_private_key 104 | driver = create_driver 105 | stub_writer(stub_auth: false) do |writer| 106 | mock(writer).get_auth_from_private_key { stub! } 107 | end 108 | assert driver.instance.writer.client.is_a?(Google::Apis::BigqueryV2::BigqueryService) 109 | end 110 | 111 | def test_configure_auth_compute_engine 112 | driver = create_driver(%[ 113 | table foo 114 | auth_method compute_engine 115 | project yourproject_id 116 | dataset yourdataset_id 117 | schema [ 118 | {"name": "time", "type": "INTEGER"}, 119 | {"name": "status", "type": "INTEGER"}, 120 | {"name": "bytes", "type": "INTEGER"} 121 | ] 122 | ]) 123 | 124 | stub_writer(stub_auth: false) do |writer| 125 | mock(writer).get_auth_from_compute_engine { stub! } 126 | end 127 | assert driver.instance.writer.client.is_a?(Google::Apis::BigqueryV2::BigqueryService) 128 | end 129 | 130 | def test_configure_auth_json_key_as_file 131 | driver = create_driver(%[ 132 | table foo 133 | auth_method json_key 134 | json_key jsonkey.josn 135 | project yourproject_id 136 | dataset yourdataset_id 137 | schema [ 138 | {"name": "time", "type": "INTEGER"}, 139 | {"name": "status", "type": "INTEGER"}, 140 | {"name": "bytes", "type": "INTEGER"} 141 | ] 142 | ]) 143 | 144 | stub_writer(stub_auth: false) do |writer| 145 | mock(writer).get_auth_from_json_key { stub! 
} 146 | end 147 | assert driver.instance.writer.client.is_a?(Google::Apis::BigqueryV2::BigqueryService) 148 | end 149 | 150 | def test_configure_auth_json_key_as_string 151 | json_key = '{"private_key": "X", "client_email": "' + 'x' * 255 + '@developer.gserviceaccount.com"}' 152 | json_key_io = StringIO.new(json_key) 153 | authorization = Object.new 154 | stub(Google::Auth::ServiceAccountCredentials).make_creds(json_key_io: satisfy {|arg| JSON.parse(arg.read) == JSON.parse(json_key_io.read) }, scope: API_SCOPE) { authorization } 155 | 156 | driver = create_driver(%[ 157 | table foo 158 | auth_method json_key 159 | json_key #{json_key} 160 | project yourproject_id 161 | dataset yourdataset_id 162 | schema [ 163 | {"name": "time", "type": "INTEGER"}, 164 | {"name": "status", "type": "INTEGER"}, 165 | {"name": "bytes", "type": "INTEGER"} 166 | ] 167 | ]) 168 | stub_writer(stub_auth: false) do |writer| 169 | mock.proxy(writer).get_auth_from_json_key { stub! } 170 | end 171 | assert driver.instance.writer.client.is_a?(Google::Apis::BigqueryV2::BigqueryService) 172 | end 173 | 174 | def test_configure_auth_application_default 175 | omit "This testcase depends on some environment variables." if ENV["CI"] == "true" 176 | 177 | driver = create_driver(%[ 178 | table foo 179 | auth_method application_default 180 | project yourproject_id 181 | dataset yourdataset_id 182 | schema [ 183 | {"name": "time", "type": "INTEGER"}, 184 | {"name": "status", "type": "INTEGER"}, 185 | {"name": "bytes", "type": "INTEGER"} 186 | ] 187 | ]) 188 | 189 | stub_writer(stub_auth: false) do |writer| 190 | mock.proxy(writer).get_auth_from_application_default { stub! } 191 | end 192 | assert driver.instance.writer.client.is_a?(Google::Apis::BigqueryV2::BigqueryService) 193 | end 194 | 195 | def test_format 196 | now = Fluent::EventTime.new(Time.now.to_i) 197 | input = { 198 | "status" => "1", 199 | "bytes" => 3.0, 200 | "vhost" => :bar, 201 | "path" => "/path/to/baz", 202 | "method" => "GET", 203 | "protocol" => "HTTP/0.9", 204 | "agent" => "libwww", 205 | "referer" => "http://referer.example", 206 | "requesttime" => (now - 1).to_f.to_s, 207 | "bot_access" => true, 208 | "loginsession" => false, 209 | "something-else" => "would be ignored", 210 | "yet-another" => { 211 | "foo" => "bar", 212 | "baz" => 1, 213 | }, 214 | "remote" => { 215 | "host" => "remote.example", 216 | "ip" => "192.0.2.1", 217 | "port" => 12345, 218 | "user" => "tagomoris", 219 | } 220 | } 221 | expected = { 222 | "time" => now.to_i, 223 | "status" => 1, 224 | "bytes" => 3, 225 | "vhost" => "bar", 226 | "path" => "/path/to/baz", 227 | "method" => "GET", 228 | "protocol" => "HTTP/0.9", 229 | "agent" => "libwww", 230 | "referer" => "http://referer.example", 231 | "requesttime" => (now - 1).to_f.to_s.to_f, 232 | "bot_access" => true, 233 | "loginsession" => false, 234 | "something-else" => "would be ignored", 235 | "yet-another" => { 236 | "foo" => "bar", 237 | "baz" => 1, 238 | }, 239 | "remote" => { 240 | "host" => "remote.example", 241 | "ip" => "192.0.2.1", 242 | "port" => 12345, 243 | "user" => "tagomoris", 244 | } 245 | } 246 | 247 | driver = create_driver(CONFIG) 248 | buf = nil 249 | driver.run { buf = driver.instance.format("my.tag", now, input) } 250 | 251 | assert_equal expected, MultiJson.load(buf) 252 | end 253 | 254 | [ 255 | # ,