├── exe
│   ├── bq-migrate
│   └── bq_migrate
├── example
│   ├── example.yml
│   ├── schema.json
│   ├── copy_table.yml
│   ├── table_info.yml
│   ├── insert_select.yml
│   ├── migrate_table.yml
│   ├── application_default.yml
│   ├── migrate_partitioned_table.yml
│   └── migrate_clustered_table.yml
├── .rspec
├── Gemfile
├── lib
│   ├── bigquery_migration
│   │   ├── version.rb
│   │   ├── error.rb
│   │   ├── hash_util.rb
│   │   ├── logger.rb
│   │   ├── time_with_zone.rb
│   │   ├── config_loader.rb
│   │   ├── action_runner.rb
│   │   ├── cli.rb
│   │   ├── action.rb
│   │   ├── table_data.rb
│   │   ├── schema.rb
│   │   └── bigquery_wrapper.rb
│   └── bigquery_migration.rb
├── .travis.yml
├── bin
│   ├── setup
│   └── console
├── .gitignore
├── Rakefile
├── test
│   ├── helper.rb
│   ├── test_schema.rb
│   ├── test_table_data.rb
│   └── test_bigquery_wrapper.rb
├── LICENSE.txt
├── bigquery_migration.gemspec
├── CHANGELOG.md
├── CODE_OF_CONDUCT.md
└── README.md

/exe/bq-migrate:
--------------------------------------------------------------------------------
 1 | bq_migrate
--------------------------------------------------------------------------------
/example/example.yml:
--------------------------------------------------------------------------------
 1 | migrate_table.yml
--------------------------------------------------------------------------------
/.rspec:
--------------------------------------------------------------------------------
 1 | --format documentation
 2 | --color
 3 | 
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
 1 | source 'https://rubygems.org'
 2 | 
 3 | gemspec
 4 | 
--------------------------------------------------------------------------------
/lib/bigquery_migration/version.rb:
--------------------------------------------------------------------------------
 1 | class BigqueryMigration
 2 |   VERSION = "0.3.2"
 3 | end
 4 | 
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: ruby
 2 | rvm:
 3 |   - 2.3.0
 4 | before_install: gem install bundler -v 1.11.2
--------------------------------------------------------------------------------
/exe/bq_migrate:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ruby
 2 | 
 3 | require_relative '../lib/bigquery_migration/cli'
 4 | BigqueryMigration::CLI.start(ARGV)
 5 | 
--------------------------------------------------------------------------------
/bin/setup:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | set -euo pipefail
 3 | IFS=$'\n\t'
 4 | set -vx
 5 | 
 6 | bundle install
 7 | 
 8 | # Do any other automated setup that you need to do here
 9 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | /.bundle/
 2 | /.yardoc
 3 | /Gemfile.lock
 4 | /_yardoc/
 5 | /coverage/
 6 | /doc/
 7 | /pkg/
 8 | /spec/reports/
 9 | /tmp/
10 | your-project-000.json
11 | .tags
12 | .ruby-version
--------------------------------------------------------------------------------
/lib/bigquery_migration/error.rb:
--------------------------------------------------------------------------------
 1 | class BigqueryMigration
 2 |   class Error < StandardError; end
 3 |   class ConfigError < Error; end
 4 |   class JobTimeoutError < Error; end
 5 |   class NotFoundError < Error; end
 6 | end
 7 | 
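The error classes above are what a caller can rescue when a run fails. A minimal sketch (not part of the repository; it assumes a `config.yml` like the one shown in the README further down):

```ruby
# Hypothetical caller sketch: run a config through ActionRunner and handle
# the gem's own error types defined in error.rb above.
require 'bigquery_migration'
require 'bigquery_migration/action_runner'

begin
  # dry_run: true mirrors the CLI default (the CLI's --exec option turns it off)
  result = BigqueryMigration::ActionRunner.new('config.yml', dry_run: true).run
  puts result[:success] ? 'ok' : 'failed'
rescue BigqueryMigration::ConfigError => e
  # Raised, for example, when the YAML has no `actions` key (see action_runner.rb)
  warn "invalid config: #{e.message}"
end
```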
-------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | 3 | require 'rake/testtask' 4 | desc 'Run test_unit based test' 5 | Rake::TestTask.new(:test) do |t| 6 | t.libs << "test" 7 | t.test_files = Dir["test/**/test_*.rb"].sort 8 | t.verbose = true 9 | end 10 | task :default => :test 11 | -------------------------------------------------------------------------------- /test/helper.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'test/unit' 4 | require 'test/unit/rr' 5 | require 'pry' 6 | require 'bigquery_migration' 7 | 8 | APP_ROOT = File.dirname(__dir__) 9 | TEST_ROOT = File.join(APP_ROOT, 'test') 10 | JSON_KEYFILE = File.join(APP_ROOT, "example/your-project-000.json") 11 | 12 | BigqueryMigration.logger = Logger.new(nil) 13 | -------------------------------------------------------------------------------- /bin/console: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require "bundler/setup" 4 | require "bigquery_migration" 5 | 6 | # You can add fixtures and/or initialization code here to make experimenting 7 | # with your gem easier. You can also use a different console, if you like. 8 | 9 | # (If you use this, don't forget to add pry to your Gemfile!) 10 | # require "pry" 11 | # Pry.start 12 | 13 | require "irb" 14 | IRB.start 15 | -------------------------------------------------------------------------------- /example/schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name":"timestamp", 4 | "type":"TIMESTAMP" 5 | }, 6 | { 7 | "name":"long", 8 | "type":"INTEGER" 9 | }, 10 | { 11 | "name":"string", 12 | "type":"STRING" 13 | }, 14 | { 15 | "name":"double", 16 | "type":"FLOAT" 17 | }, 18 | { 19 | "name":"boolean", 20 | "type":"BOOLEAN" 21 | }, 22 | { 23 | "name":"date", 24 | "type":"DATE" 25 | } 26 | ] 27 | -------------------------------------------------------------------------------- /example/copy_table.yml: -------------------------------------------------------------------------------- 1 | bigquery: &bigquery 2 | credentials_file: example/your-project-000.json 3 | dataset: your_dataset_name 4 | table: your_table_name 5 | 6 | actions: 7 | - action: create_dataset 8 | <<: *bigquery 9 | - action: migrate_table 10 | <<: *bigquery 11 | columns: 12 | - { name: 'timestamp', type: 'TIMESTAMP' } 13 | - name: 'record' 14 | type: 'RECORD' 15 | fields: 16 | - { name: 'string', type: 'STRING' } 17 | - { name: 'integer', type: 'INTEGER' } 18 | - action: copy_table 19 | <<: *bigquery 20 | destination_table: your_table_name_copy 21 | -------------------------------------------------------------------------------- /example/table_info.yml: -------------------------------------------------------------------------------- 1 | bigquery: &bigquery 2 | credentials_file: example/your-project-000.json 3 | dataset: your_dataset_name 4 | table: your_table_name 5 | 6 | actions: 7 | - action: create_dataset 8 | <<: *bigquery 9 | - action: migrate_table 10 | <<: *bigquery 11 | columns: 12 | - { name: 'timestamp', type: 'TIMESTAMP' } 13 | - name: 'record' 14 | type: 'RECORD' 15 | fields: 16 | - { name: 'string', type: 'STRING' } 17 | - { name: 'integer', type: 'INTEGER' } 18 | - action: table_info 19 | table: your_table_name 20 | <<: *bigquery 21 | - action: table_info 
22 | prefix: your_table_name 23 | <<: *bigquery 24 | -------------------------------------------------------------------------------- /example/insert_select.yml: -------------------------------------------------------------------------------- 1 | bigquery: &bigquery 2 | credentials_file: example/your-project-000.json 3 | dataset: your_dataset_name 4 | table: your_table_name 5 | 6 | actions: 7 | - action: create_dataset 8 | <<: *bigquery 9 | - action: migrate_table 10 | <<: *bigquery 11 | columns: 12 | - { name: 'timestamp', type: 'TIMESTAMP' } 13 | - name: 'record' 14 | type: 'RECORD' 15 | fields: 16 | - { name: 'string', type: 'STRING' } 17 | - { name: 'integer', type: 'INTEGER' } 18 | - action: insert_select 19 | <<: *bigquery 20 | destination_table: your_table_name_insert_select 21 | query: select * from [your_dataset_name.your_table_name] 22 | -------------------------------------------------------------------------------- /example/migrate_table.yml: -------------------------------------------------------------------------------- 1 | bigquery: &bigquery 2 | credentials_file: example/your-project-000.json 3 | dataset: your_dataset_name 4 | table: your_table_name 5 | 6 | actions: 7 | - action: create_dataset 8 | <<: *bigquery 9 | - action: migrate_table 10 | <<: *bigquery 11 | columns: 12 | - { name: 'timestamp', type: 'TIMESTAMP' } 13 | - name: 'record' 14 | type: 'RECORD' 15 | fields: 16 | - { name: 'string', type: 'STRING' } 17 | - { name: 'integer', type: 'INTEGER' } 18 | - { name: 'bytes', type: 'BYTES' } 19 | - action: migrate_table 20 | <<: *bigquery 21 | schema_file: example/schema.json 22 | - action: delete_table 23 | <<: *bigquery 24 | -------------------------------------------------------------------------------- /example/application_default.yml: -------------------------------------------------------------------------------- 1 | bigquery: &bigquery 2 | # project: read from ~/.config/gcloud/configurations/config_default 3 | dataset: your_dataset_name 4 | table: your_table_name 5 | 6 | actions: 7 | - action: create_dataset 8 | <<: *bigquery 9 | - action: migrate_table 10 | <<: *bigquery 11 | columns: 12 | - { name: 'timestamp', type: 'TIMESTAMP' } 13 | - name: 'record' 14 | type: 'RECORD' 15 | fields: 16 | - { name: 'string', type: 'STRING' } 17 | - { name: 'integer', type: 'INTEGER' } 18 | - { name: 'bytes', type: 'BYTES' } 19 | - action: migrate_table 20 | <<: *bigquery 21 | schema_file: example/schema.json 22 | - action: delete_table 23 | <<: *bigquery 24 | -------------------------------------------------------------------------------- /example/migrate_partitioned_table.yml: -------------------------------------------------------------------------------- 1 | bigquery: &bigquery 2 | credentials_file: example/your-project-000.json 3 | dataset: your_dataset_name 4 | table: your_table_name 5 | 6 | actions: 7 | - action: create_dataset 8 | <<: *bigquery 9 | - action: migrate_partitioned_table 10 | <<: *bigquery 11 | columns: 12 | - { name: 'timestamp', type: 'TIMESTAMP' } 13 | - name: 'record' 14 | type: 'RECORD' 15 | fields: 16 | - { name: 'string', type: 'STRING' } 17 | - { name: 'integer', type: 'INTEGER' } 18 | - { name: 'bytes', type: 'BYTES' } 19 | - action: migrate_partitioned_table 20 | <<: *bigquery 21 | schema_file: example/schema.json 22 | - action: delete_table 23 | <<: *bigquery 24 | -------------------------------------------------------------------------------- /lib/bigquery_migration.rb: 
-------------------------------------------------------------------------------- 1 | require "bigquery_migration/version" 2 | require "bigquery_migration/error" 3 | require "bigquery_migration/schema" 4 | require "bigquery_migration/logger" 5 | require "bigquery_migration/bigquery_wrapper" 6 | 7 | class BigqueryMigration 8 | def self.logger 9 | @logger ||= Logger.new(STDOUT) 10 | end 11 | 12 | def self.logger=(logger) 13 | @logger = logger 14 | end 15 | 16 | def initialize(*args) 17 | @wrapper = BigqueryWrapper.new(*args) 18 | end 19 | 20 | # Delegate to BigqueryWrapper instance 21 | BigqueryWrapper.instance_methods(false).each do |name| 22 | next if method_defined?(name) 23 | class_eval <<-"EOS", __FILE__, __LINE__ + 1 24 | def #{name}(*args, &block) 25 | @wrapper.#{name}(*args, &block) 26 | end 27 | EOS 28 | end 29 | end 30 | -------------------------------------------------------------------------------- /example/migrate_clustered_table.yml: -------------------------------------------------------------------------------- 1 | bigquery: &bigquery 2 | credentials_file: example/your-project-000.json 3 | dataset: your_dataset_name 4 | table: your_clustered_table_name 5 | clustering: 6 | fields: 7 | - timestamp 8 | - integer 9 | 10 | actions: 11 | - action: create_dataset 12 | <<: *bigquery 13 | - action: migrate_partitioned_table 14 | <<: *bigquery 15 | columns: 16 | - { name: 'timestamp', type: 'TIMESTAMP' } 17 | - { name: 'integer', type: 'INTEGER' } 18 | - name: 'record' 19 | type: 'RECORD' 20 | fields: 21 | - { name: 'string', type: 'STRING' } 22 | - { name: 'integer', type: 'INTEGER' } 23 | - { name: 'bytes', type: 'BYTES' } 24 | - action: migrate_partitioned_table 25 | <<: *bigquery 26 | schema_file: example/schema.json 27 | - action: delete_table 28 | <<: *bigquery 29 | -------------------------------------------------------------------------------- /lib/bigquery_migration/hash_util.rb: -------------------------------------------------------------------------------- 1 | class BigqueryMigration 2 | class HashUtil 3 | def self.deep_symbolize_keys(hash) 4 | if hash.is_a?(Hash) 5 | hash.map do |key, val| 6 | new_key = key.to_sym 7 | new_val = deep_symbolize_keys(val) 8 | [new_key, new_val] 9 | end.to_h 10 | elsif hash.is_a?(Array) 11 | hash.map do |val| 12 | deep_symbolize_keys(val) 13 | end 14 | else 15 | hash 16 | end 17 | end 18 | 19 | def self.deep_stringify_keys(hash) 20 | if hash.is_a?(Hash) 21 | hash.map do |key, val| 22 | new_key = key.to_s 23 | new_val = deep_stringify_keys(val) 24 | [new_key, new_val] 25 | end.to_h 26 | elsif hash.is_a?(Array) 27 | hash.map do |val| 28 | deep_stringify_keys(val) 29 | end 30 | else 31 | hash 32 | end 33 | end 34 | end 35 | end 36 | -------------------------------------------------------------------------------- /lib/bigquery_migration/logger.rb: -------------------------------------------------------------------------------- 1 | require 'logger' 2 | 3 | class BigqueryMigration 4 | class LogFormatter 5 | FORMAT = "%s [%s] %s\n" 6 | 7 | def initialize(opts={}) 8 | end 9 | 10 | def call(severity, time, progname, msg) 11 | FORMAT % [format_datetime(time), severity, format_message(msg)] 12 | end 13 | 14 | private 15 | def format_datetime(time) 16 | time.iso8601 17 | end 18 | 19 | def format_severity(severity) 20 | severity 21 | end 22 | 23 | def format_message(message) 24 | case message 25 | when ::Exception 26 | e = message 27 | "#{e.class} (#{e.message})\n #{e.backtrace.join("\n ")}" 28 | else 29 | message.to_s 30 | end 31 | end 32 | end 33 | 34 | 
class Logger < ::Logger 35 | def initialize(logdev, shift_age = 0, shift_size = 1048576) 36 | logdev = STDOUT if logdev == 'STDOUT' 37 | super(logdev, shift_age, shift_size) 38 | @formatter = LogFormatter.new 39 | end 40 | 41 | def write(msg) 42 | @logdev.write msg 43 | end 44 | end 45 | end 46 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /lib/bigquery_migration/time_with_zone.rb: -------------------------------------------------------------------------------- 1 | require 'tzinfo' 2 | 3 | class BigqueryMigration 4 | class TimeWithZone 5 | # [+-]HH:MM, [+-]HHMM, [+-]HH 6 | NUMERIC_PATTERN = %r{\A[+-]\d\d(:?\d\d)?\z} 7 | 8 | # Region/Zone, Region/Zone/Zone 9 | NAME_PATTERN = %r{\A[^/]+/[^/]+(/[^/]+)?\z} 10 | 11 | class << self 12 | def time_with_zone(time, timezone) 13 | time.localtime(zone_offset(timezone)) 14 | end 15 | 16 | def strptime_with_zone(date, format, timezone) 17 | time = Time.strptime(date, format) 18 | _utc_offset = time.utc_offset 19 | _zone_offset = zone_offset(timezone) 20 | time.localtime(_zone_offset) + _utc_offset - _zone_offset 21 | end 22 | 23 | private 24 | def zone_offset(timezone) 25 | if NUMERIC_PATTERN === timezone 26 | Time.zone_offset(timezone) 27 | elsif NAME_PATTERN === timezone 28 | tz = TZInfo::Timezone.get(timezone) 29 | tz.current_period.utc_total_offset 30 | elsif "UTC" == timezone # special treatment 31 | 0 32 | else 33 | raise ArgumentError, "timezone format is invalid: #{timezone}" 34 | end 35 | end 36 | end 37 | end 38 | end 39 | -------------------------------------------------------------------------------- /bigquery_migration.gemspec: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | lib = File.expand_path('../lib', __FILE__) 3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 4 | require 'bigquery_migration/version' 5 | 6 | Gem::Specification.new do |spec| 7 | spec.name = "bigquery_migration" 8 | spec.version = BigqueryMigration::VERSION 9 | spec.authors = ["Naotoshi Seo", "kysnm", "potato2003"] 10 | spec.email = ["sonots@gmail.com", "tokyoincidents.g@gmail.com", "potato2003@gmail.com"] 11 | 12 | spec.summary = %q{Migrate BigQuery table schema} 13 | 
spec.description = %q{Migrate BigQuery table schema.} 14 | spec.homepage = "https://github.com/sonots/bigquery_migration" 15 | spec.license = "MIT" 16 | 17 | spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) } 18 | spec.bindir = "exe" 19 | spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) } 20 | spec.require_paths = ["lib"] 21 | 22 | spec.add_dependency "google-api-client" 23 | spec.add_dependency "tzinfo" 24 | spec.add_dependency "thor" 25 | spec.add_dependency "inifile" 26 | 27 | spec.add_development_dependency "bundler", "~> 1.11" 28 | spec.add_development_dependency "rake", "~> 10.0" 29 | spec.add_development_dependency "pry-byebug" 30 | spec.add_development_dependency "test-unit" 31 | spec.add_development_dependency "test-unit-rr" 32 | end 33 | -------------------------------------------------------------------------------- /lib/bigquery_migration/config_loader.rb: -------------------------------------------------------------------------------- 1 | require 'set' 2 | require 'yaml' 3 | require 'erb' 4 | require 'ostruct' 5 | 6 | class BigqueryMigration 7 | class ConfigLoader 8 | attr_reader :config_path, :namespace 9 | 10 | class AlreayIncluded < ::StandardError; end 11 | 12 | def initialize(config_path, vars = {}) 13 | @config_path = File.expand_path(config_path) 14 | @included_files = Set.new 15 | @namespace = OpenStruct.new(vars) 16 | 17 | unless @namespace.respond_to?(:include_file) 18 | itself = self 19 | # ToDo: better way? 20 | @namespace.define_singleton_method(:include_file) do |path| 21 | caller_path = caller[0][/^([^:]+):\d+:in `[^']*'$/, 1] 22 | abs_path = File.expand_path(path, File.dirname(caller_path)) 23 | if File.extname(path) == '.erb' 24 | itself.load_erb(abs_path) 25 | else 26 | File.read(abs_path) 27 | end 28 | end 29 | end 30 | end 31 | 32 | def load 33 | if File.extname(config_path) == '.erb' 34 | YAML.load(load_erb(config_path)) 35 | else 36 | YAML.load(File.read(config_path)) 37 | end 38 | end 39 | 40 | def load_erb(path = config_path) 41 | unless @included_files.add?(path) 42 | raise AlreayIncluded, "#{path} was included twice" 43 | end 44 | 45 | raw = File.read(path) 46 | erb = ERB.new(raw, nil, "-") 47 | erb.filename = path 48 | erb.result(namespace.instance_eval { binding }) 49 | end 50 | end 51 | end 52 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 0.3.2 (2019/04/29) 2 | 3 | Enhancements: 4 | 5 | * Support clustered table 6 | 7 | # 0.3.1 (2018/05/23) 8 | 9 | Enhancements: 10 | 11 | * Support newly added location option of google-api-ruby-client. 
12 | 13 | # 0.3.0 (2017/04/26) 14 | 15 | Enhancements: 16 | 17 | * Support more authentication methods such as oauth, compute_engine, application_default 18 | 19 | # 0.2.2 (2017/04/04) 20 | 21 | Enhancements: 22 | 23 | * Support google-api-ruby-client >= v0.11.0 24 | 25 | # 0.2.1 (2017/03/31) 26 | 27 | Enhancements: 28 | 29 | * Accept DATE, DATETIME, TIME as column types 30 | 31 | # 0.2.0 (2016/10/03) 32 | 33 | Enhancements: 34 | 35 | * Support migrate_partitioned_table 36 | 37 | Fixes: 38 | 39 | * Fix list_table_data for when a value is an empty hash 40 | 41 | # 0.1.7 (2016/09/17) 42 | 43 | Fixes: 44 | 45 | * Prohibit to create a table with empty columns 46 | * Create a table only if a table does not exist 47 | 48 | # 0.1.6 (2016/07/26) 49 | 50 | Fixes: 51 | 52 | * Fix empty hash to nil for list table data 53 | 54 | # 0.1.5 (2016/07/25) 55 | 56 | Enhancements: 57 | 58 | * Support record type and repeated mode for list table data 59 | 60 | # 0.1.4 (2016/07/12) 61 | 62 | Fixes: 63 | 64 | * Fix to allow downcase type and mode 65 | 66 | # 0.1.3 (2016/04/22) 67 | 68 | Enhancements: 69 | 70 | * Support new BYTES types 71 | * Add exe/bq-migrate as an alias to exe/bq_migrate 72 | 73 | # 0.1.2 (2016/04/14) 74 | 75 | Changes: 76 | 77 | * Genearate job_id on client side as [google recommends](https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs) 78 | 79 | # 0.1.1 (2016/04/12) 80 | 81 | Changes: 82 | 83 | * Expose wait_load method 84 | 85 | # 0.1.0 (2016/04/08) 86 | 87 | Initial release 88 | -------------------------------------------------------------------------------- /lib/bigquery_migration/action_runner.rb: -------------------------------------------------------------------------------- 1 | require_relative 'config_loader' 2 | require_relative 'error' 3 | require_relative 'action' 4 | require_relative 'hash_util' 5 | 6 | class BigqueryMigration 7 | class ActionRunner 8 | attr_reader :config, :config_path, :opts 9 | 10 | def initialize(config_path = nil, opts = {}) 11 | @config_path = config_path 12 | @opts = opts 13 | config = ConfigLoader.new(@config_path, opts[:vars]).load 14 | @config = HashUtil.deep_symbolize_keys(config) 15 | validate_config! 16 | end 17 | 18 | def run 19 | success, responses = run_actions 20 | { success: success, dry_run: @opts[:dry_run], actions: responses } 21 | end 22 | 23 | def run_actions 24 | success = true 25 | responses = [] 26 | 27 | @config[:actions].each do |action_config| 28 | _success, result = Action.new(action_config, @opts).run 29 | response = action_config.merge({'result' => result}) 30 | responses << response 31 | unless _success 32 | success = false 33 | break 34 | end 35 | end 36 | 37 | [success, responses] 38 | end 39 | 40 | def validate_config! 
41 | unless config.is_a?(Hash) 42 | raise ConfigError, "config file format has to be YAML Hash" 43 | end 44 | 45 | unless config[:actions] 46 | raise ConfigError, "config must have `actions` key" 47 | end 48 | 49 | unless config[:actions].is_a?(Array) 50 | raise ConfigError, "config[:actions] must be an Array" 51 | end 52 | 53 | config[:actions].each do |action_config| 54 | unless action_config[:action] 55 | raise ConfigError, "Elements of `config[:actions]` must have `action` key" 56 | end 57 | end 58 | end 59 | end 60 | end 61 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Code of Conduct 2 | 3 | As contributors and maintainers of this project, and in the interest of 4 | fostering an open and welcoming community, we pledge to respect all people who 5 | contribute through reporting issues, posting feature requests, updating 6 | documentation, submitting pull requests or patches, and other activities. 7 | 8 | We are committed to making participation in this project a harassment-free 9 | experience for everyone, regardless of level of experience, gender, gender 10 | identity and expression, sexual orientation, disability, personal appearance, 11 | body size, race, ethnicity, age, religion, or nationality. 12 | 13 | Examples of unacceptable behavior by participants include: 14 | 15 | * The use of sexualized language or imagery 16 | * Personal attacks 17 | * Trolling or insulting/derogatory comments 18 | * Public or private harassment 19 | * Publishing other's private information, such as physical or electronic 20 | addresses, without explicit permission 21 | * Other unethical or unprofessional conduct 22 | 23 | Project maintainers have the right and responsibility to remove, edit, or 24 | reject comments, commits, code, wiki edits, issues, and other contributions 25 | that are not aligned to this Code of Conduct, or to ban temporarily or 26 | permanently any contributor for other behaviors that they deem inappropriate, 27 | threatening, offensive, or harmful. 28 | 29 | By adopting this Code of Conduct, project maintainers commit themselves to 30 | fairly and consistently applying these principles to every aspect of managing 31 | this project. Project maintainers who do not follow or enforce the Code of 32 | Conduct may be permanently removed from the project team. 33 | 34 | This code of conduct applies both within project spaces and in public spaces 35 | when an individual is representing the project or its community. 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 38 | reported by contacting a project maintainer at sonots@gmail.com. All 39 | complaints will be reviewed and investigated and will result in a response that 40 | is deemed necessary and appropriate to the circumstances. Maintainers are 41 | obligated to maintain confidentiality with regard to the reporter of an 42 | incident. 
43 | 44 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 45 | version 1.3.0, available at 46 | [http://contributor-covenant.org/version/1/3/0/][version] 47 | 48 | [homepage]: http://contributor-covenant.org 49 | [version]: http://contributor-covenant.org/version/1/3/0/ -------------------------------------------------------------------------------- /lib/bigquery_migration/cli.rb: -------------------------------------------------------------------------------- 1 | require 'thor' 2 | require 'json' 3 | require 'bigquery_migration' 4 | require_relative 'action_runner' 5 | require_relative 'hash_util' 6 | 7 | class BigqueryMigration 8 | class CLI < Thor 9 | # cf. http://qiita.com/KitaitiMakoto/items/c6b9d6311c20a3cc21f9 10 | def self.exit_on_failure? 11 | true 12 | end 13 | 14 | # `run` is reserved by thor, we have to use def _run 15 | map "run" => "_run" 16 | 17 | option :config_path, :aliases => ['-c'], :type => :string, 18 | :default => 'config.yml' 19 | option :log_level, :aliases => ["-l"], :type => :string, 20 | :desc => 'Log level such as fatal, error, warn, info, or debug', 21 | :default => 'info' 22 | option :log, :type => :string, 23 | :desc => 'Output log to a file', 24 | :default => 'STDOUT' 25 | option :stdout, :type => :string, 26 | :desc => 'Redirect STDOUT to a file', 27 | :default => 'STDOUT' 28 | option :stderr, :type => :string, 29 | :desc => 'Redirect STDERR to a file', 30 | :default => 'STDERR' 31 | option :exec, :type => :boolean, 32 | :desc => 'Execute or dry-run (Default: dry-run)', 33 | :default => false 34 | option :vars, :type => :hash, 35 | :desc => 'Variables used in ERB, thor hash format' 36 | option :output, :aliases => ["-o"], :type => :string, 37 | :desc => 'Output result yaml to a file', 38 | :default => 'STDOUT' 39 | 40 | desc 'run ', 'run bigquery_migration' 41 | def _run(config_path) 42 | opts = options.merge( 43 | dry_run: !options[:exec] 44 | ) 45 | 46 | init_logger 47 | reopen_stdout 48 | reopen_stderr 49 | 50 | result = ActionRunner.new(config_path, opts).run 51 | open_output do |io| 52 | io.puts mask_secret(HashUtil.deep_stringify_keys(result).to_yaml) 53 | logger.info { "DRY-RUN has finished. Use --exec option to run." 
} if opts[:dry_run] 54 | end 55 | exit(1) unless result[:success] 56 | end 57 | 58 | private 59 | 60 | def logger 61 | BigqueryMigration.logger 62 | end 63 | 64 | def init_logger 65 | logger = BigqueryMigration::Logger.new(options[:log]) 66 | logger.level = options[:log_level] 67 | BigqueryMigration.logger = logger 68 | end 69 | 70 | def reopen_stdout 71 | unless options[:stdout] == 'STDOUT' 72 | $stdout.reopen(options[:stdout]) 73 | end 74 | $stdout.sync = true 75 | end 76 | 77 | def reopen_stderr 78 | unless options[:stderr] == 'STDERR' 79 | $stderr.reopen(options[:stderr]) 80 | end 81 | $stderr.sync = true 82 | end 83 | 84 | def open_output 85 | output = options[:output] 86 | if output == 'STDOUT' 87 | yield($stdout) 88 | elsif output == 'STDERR' 89 | yield($stderr) 90 | else 91 | File.open(output, 'w') do |io| 92 | yield(io) 93 | end 94 | end 95 | end 96 | 97 | def mask_secret(yaml_string) 98 | %w(password key).each do |secret| 99 | yaml_string.gsub!(/([^ ]*#{secret}): .*$/, '\1: xxxxx') 100 | end 101 | yaml_string.gsub!(/(-----BEGIN\s+PRIVATE\s+KEY-----)[0-9A-Za-z+\/=\s\\]+(-----END\s+PRIVATE\s+KEY-----)/m, '\1 xxxxx \2') 102 | yaml_string 103 | end 104 | end 105 | end 106 | -------------------------------------------------------------------------------- /lib/bigquery_migration/action.rb: -------------------------------------------------------------------------------- 1 | require_relative 'schema' 2 | require_relative 'error' 3 | require_relative 'hash_util' 4 | require_relative 'bigquery_wrapper' 5 | 6 | class BigqueryMigration 7 | class Action 8 | attr_reader :config, :opts 9 | 10 | def initialize(config, opts = {}) 11 | @config = HashUtil.deep_symbolize_keys(config) 12 | @opts = HashUtil.deep_symbolize_keys(opts) 13 | 14 | @action = @config[:action] 15 | unless self.class.supported_actions.include?(@action) 16 | raise ConfigError, "Action #{@action} is not supported" 17 | end 18 | end 19 | 20 | def run 21 | begin 22 | success = true 23 | result = send(@action) 24 | rescue => e 25 | result = { error: e.message, error_class: e.class.to_s, error_backtrace: e.backtrace } 26 | success = false 27 | ensure 28 | success = false if result[:success] == false 29 | end 30 | [success, result] 31 | end 32 | 33 | def self.supported_actions 34 | Set.new(%w[ 35 | create_dataset 36 | create_table 37 | delete_table 38 | patch_table 39 | migrate_table 40 | insert 41 | preview 42 | insert_select 43 | copy_table 44 | table_info 45 | migrate_partitioned_table 46 | ]) 47 | end 48 | 49 | def client 50 | @client ||= BigqueryMigration.new(@config, @opts) 51 | end 52 | 53 | def create_dataset 54 | client.create_dataset 55 | end 56 | 57 | def create_table 58 | client.create_table(columns: config[:columns]) 59 | end 60 | 61 | def delete_table 62 | client.delete_table 63 | end 64 | 65 | def patch_table 66 | client.patch_table( 67 | columns: config[:columns], 68 | add_columns: config[:add_columns] 69 | ) 70 | end 71 | 72 | def migrate_table 73 | client.migrate_table( 74 | schema_file: config[:schema_file], 75 | columns: config[:columns], 76 | backup_dataset: config[:backup_dataset], 77 | backup_table: config[:backup_table] 78 | ) 79 | end 80 | 81 | def migrate_partitioned_table 82 | client.migrate_partitioned_table( 83 | schema_file: config[:schema_file], 84 | columns: config[:columns], 85 | ) 86 | end 87 | 88 | def insert 89 | client.insert_all_table_data(rows: config[:rows]) 90 | end 91 | 92 | def preview 93 | client.list_table_data(max_results: config[:max_results]) 94 | end 95 | 96 | def copy_table 97 | 
client.copy_table( 98 | destination_table: config[:destination_table], 99 | destination_dataset: config[:destination_dataset], 100 | source_table: config[:source_table], 101 | source_dataset: config[:source_dataset], 102 | write_disposition: config[:write_disposition], 103 | ) 104 | end 105 | 106 | def insert_select 107 | client.insert_select( 108 | query: config[:query], 109 | destination_table: config[:destination_table], 110 | destination_dataset: config[:destination_dataset], 111 | write_disposition: config[:write_disposition], 112 | ) 113 | end 114 | 115 | def table_info 116 | if config[:prefix] 117 | tables = client.list_tables[:tables].select {|table| table.start_with?(config[:prefix]) } 118 | table_infos = tables.map do |table| 119 | result = client.get_table(table: table) 120 | result.delete(:responses) 121 | result 122 | end 123 | result = { 124 | sum_num_bytes: table_infos.map {|info| info[:num_bytes].to_i }.inject(:+), 125 | sum_num_rows: table_infos.map {|info| info[:num_rows].to_i }.inject(:+), 126 | table_infos: table_infos, 127 | } 128 | else 129 | client.get_table 130 | end 131 | end 132 | end 133 | end 134 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BigqueryMigration 2 | 3 | BigqueryMigraiton is a tool or a ruby library to migrate (or alter) BigQuery table schema. 4 | 5 | ## Requirements 6 | 7 | * Ruby >= 2.3.0 8 | 9 | ## Installation 10 | 11 | Add this line to your application's Gemfile: 12 | 13 | ```ruby 14 | gem 'bigquery_migration' 15 | ``` 16 | 17 | And then execute: 18 | 19 | $ bundle 20 | 21 | Or install it yourself as: 22 | 23 | $ gem install bigquery_migration 24 | 25 | ## Usage 26 | 27 | Define your desired schema, this tool automatically detects differences with the target table, and takes care of adding columns, or dropping columns (actually, select & copy is issued), or changing types. 
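For instance (a hypothetical illustration using the library API shown under "Library" below; `config` is the connection hash from that section, and the existing table is assumed to already have an `id` column plus an `obsolete` column):

```ruby
# Hypothetical example: reconcile the table with the desired column list.
desired_columns = [
  { name: 'id',    type: 'INTEGER' },
  { name: 'added', type: 'STRING' },  # missing in the table -> added via patch_table (not charged)
  # 'obsolete' is not listed          -> dropped via select & copy (a query is issued, so charged)
]
BigqueryMigration.new(config).migrate_table(columns: desired_columns)
```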
28 | 29 | ### CLI 30 | 31 | config.yml 32 | 33 | ```yaml 34 | bigquery: &bigquery 35 | json_keyfile: your-project-000.json 36 | dataset: your_dataset_name 37 | table: your_table_name 38 | # If your data is in a location other than the US or EU multi-region, you must specify the location 39 | # location: asia-northeast1 40 | 41 | actions: 42 | - action: create_dataset 43 | <<: *bigquery 44 | - action: migrate_table 45 | <<: *bigquery 46 | columns: 47 | - { name: 'timestamp', type: 'TIMESTAMP' } 48 | - name: 'record' 49 | type: 'RECORD' 50 | fields: 51 | - { name: 'string', type: 'STRING' } 52 | - { name: 'integer', type: 'INTEGER' } 53 | ``` 54 | 55 | Run 56 | 57 | ``` 58 | $ bundle exec bq_migrate run config.yml # dry-run 59 | $ bundle exec bq_migrate run config.yml --exec 60 | ``` 61 | 62 | ### Library 63 | 64 | ```ruby 65 | require 'bigquery_migration' 66 | 67 | config = { 68 | json_keyfile: '/path/to/your-project-000.json', 69 | dataset: 'your_dataset_name', 70 | table: 'your_table_name', 71 | 72 | # If your data is in a location other than the US or EU multi-region, you must specify the location 73 | # location: asia-northeast1, 74 | } 75 | columns = [ 76 | { name: 'string', type: 'STRING' }, 77 | { name: 'record', type: 'RECORD', fields: [ 78 | { name: 'integer', type: 'INTEGER' }, 79 | { name: 'timestamp', type: 'TIMESTAMP' }, 80 | ] } 81 | ] 82 | 83 | migrator = BigqueryMigration.new(config) 84 | migrator.migrate_table(columns: columns) 85 | # migrator.migrate_table(schema_file: '/path/to/schema.json') 86 | ``` 87 | 88 | ## LIMITATIONS 89 | 90 | There are serveral limitations because of BigQuery API limitations: 91 | 92 | * Can not handle `mode: REPEATED` columns 93 | * Can add only `mode: NULLABLE` columns 94 | * Columns become `mode: NULLABLE` after type changing 95 | * Will be charged because a query is issued (If only adding columns, it is not charged because it uses patch_table API) 96 | 97 | This tool has an advantage that it is **faster** than reloading data entirely. 98 | 99 | ## Further Details 100 | 101 | * See [BigQueryテーブルのスキーマを変更する - sonots:blog](http://blog.livedoor.jp/sonots/archives/47294596.html) (Japanese) 102 | 103 | ## Development 104 | 105 | ### Run example: 106 | 107 | **Service Account** 108 | 109 | Prepare your service account json at `example/your-project-000.json`, then 110 | 111 | ``` 112 | $ bundle exec bq_migrate run example/example.yml # dry-run 113 | $ bundle exec bq_migrate run example/example.yml --exec 114 | ``` 115 | 116 | **OAuth** 117 | 118 | Install gcloud into your development environment: 119 | 120 | ``` 121 | curl https://sdk.cloud.google.com | bash 122 | gcloud init 123 | gcloud auth login 124 | gcloud auth application-default login 125 | gcloud config set project 126 | ``` 127 | 128 | Make sure `gcloud` works 129 | 130 | ``` 131 | gcloud compute instances list 132 | ``` 133 | 134 | Run as: 135 | 136 | ``` 137 | $ bundle exec bq_migrate run example/application_default.yml # dry-run 138 | $ bundle exec bq_migrate run example/application_default.yml --exec 139 | ``` 140 | 141 | ### Run test: 142 | 143 | ``` 144 | $ bundle exec rake test 145 | ``` 146 | 147 | To run tests which directly connects to BigQuery, prepare `example/your-project-000.json`, then 148 | 149 | ``` 150 | $ bundle exec rake test 151 | ``` 152 | 153 | ## Contributing 154 | 155 | Bug reports and pull requests are welcome on GitHub at https://github.com/sonots/bigquery_migration. 
This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct. 156 | 157 | 158 | ## License 159 | 160 | The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT). 161 | -------------------------------------------------------------------------------- /lib/bigquery_migration/table_data.rb: -------------------------------------------------------------------------------- 1 | # This codes are translated from BigQuery Web console's JavaScript 2 | require_relative 'error' 3 | 4 | class BigqueryMigration 5 | class TableData 6 | attr_reader :rows, :columns 7 | 8 | def logger 9 | BigqueryMigration.logger 10 | end 11 | 12 | def initialize(columns, rows) 13 | @columns = columns || raise(Error, '`columns` is required.') 14 | @rows = rows || raise(Error, '`rows` is required.') 15 | end 16 | 17 | # format list_table_data response rows which is like 18 | # 19 | # [ 20 | # { f: [ 21 | # { v: "foo" }, 22 | # { v: "1" }, 23 | # { v: [] }, 24 | # { v: "1.1" }, 25 | # { v: "true" }, 26 | # { v: "1.444435200E9" } 27 | # ] }, 28 | # { f: [ 29 | # { v: "foo" }, 30 | # { v: "2" }, 31 | # { v: [ 32 | # { v: "foo" }, 33 | # { v: "bar" } 34 | # ] }, 35 | # { v: "2.2" }, 36 | # { v: "false" }, 37 | # { v: "1.444435200E9" } 38 | # ] } 39 | # ] 40 | # 41 | # into 42 | # 43 | # [ 44 | # # first row 45 | # [ 46 | # [ "foo", "1", nil, "1.1", "true", "1.444435200E9" ] 47 | # ], 48 | # # second row 49 | # [ 50 | # [ "foo", "2", "foo", "2.2", "false", "1.444435200E9" ], 51 | # [ nil, nil, "bar", nil, nil, nil ], 52 | # ], 53 | # ] 54 | def values 55 | values = @rows.map do |row| 56 | repeated_count = repeated_count(columns: @columns, rows: row) 57 | formatted_row = [] 58 | repeated_count.times do |count| 59 | formatted_row << format_row(columns: @columns, rows: row, count: count) 60 | end 61 | formatted_row 62 | end 63 | # flattern if there is no repeated column for backward compatibility 64 | values.map(&:length).max > 1 ? values : values.flatten(1) 65 | end 66 | 67 | private 68 | 69 | # Count maximum number of rows on repeated columns 70 | # 71 | # This method called recursively, rows must be a hash and hash has key f: 72 | def repeated_count(columns: nil, rows: nil) 73 | return 1 if (rows.nil? || rows.empty?) 74 | validate_rows!(rows) 75 | rows[:f].zip(columns).map do |row, column| 76 | if column[:type] == 'RECORD' 77 | if column[:mode] == 'REPEATED' 78 | if row[:v].length == 0 79 | 1 80 | else 81 | row[:v].map do |v| 82 | v[:repeated_count] = repeated_count(columns: column[:fields], rows: v[:v]) 83 | end.inject(:+) 84 | end 85 | else 86 | repeated_count(columns: column[:fields], rows: row[:v]) 87 | end 88 | elsif column[:mode] == 'REPEATED' 89 | [(row[:v] || []).length, 1].max 90 | else 91 | 1 92 | end 93 | end.max 94 | end 95 | 96 | # This method called recursively. 97 | # So, rows must be a hash and hash has key f:. 98 | def format_row(columns: nil, rows: nil, count: nil) 99 | formatted_row = [] 100 | return [nil] if (rows.nil? || rows.empty?) 
101 | validate_rows!(rows) 102 | rows[:f].zip(columns).each do |row, column| 103 | if column[:type] == 'RECORD' 104 | if column[:mode] == 'REPEATED' 105 | recursive = false 106 | current = 0 107 | row[:v].each do |v| 108 | repeated_count = v[:repeated_count] 109 | if current <= count && count < (current + repeated_count) 110 | formatted_row.concat format_row(columns: column[:fields], rows: v[:v], count: count - current) 111 | recursive = true 112 | end 113 | current = current + repeated_count 114 | end 115 | unless recursive 116 | nil_count = get_nil_count(column[:fields]) 117 | formatted_row.concat(Array.new(nil_count)) 118 | end 119 | elsif row[:v].nil? 120 | nil_count = get_nil_count(column[:fields]) 121 | formatted_row.concat(Array.new(nil_count)) 122 | else 123 | formatted_row.concat format_row(columns: column[:fields], rows: row[:v], count: count) 124 | end 125 | elsif column[:mode] == 'REPEATED' 126 | v = row[:v] 127 | count < v.length ? formatted_row.push(normalize_value(v[count][:v])) : formatted_row.push(nil) 128 | elsif count == 0 129 | formatted_row.push((normalize_value(row[:v]))) 130 | else 131 | formatted_row.push(nil) 132 | end 133 | end 134 | formatted_row 135 | end 136 | 137 | # special treatment empty hash. 138 | # nil is converted into {} by to_h 139 | def normalize_value(v) 140 | v.is_a?(Hash) && v.empty? ? nil : v 141 | end 142 | 143 | def get_nil_count(fields) 144 | fields.inject(0) do |acc, f| 145 | f[:type] == 'RECORD' ? acc + get_nil_count(f[:fields]) : acc + 1 146 | end 147 | end 148 | 149 | def validate_rows!(rows) 150 | raise Error, '`rows` must be a hash and hash has key `:f`.' if !rows.is_a?(Hash) || !rows.has_key?(:f) 151 | end 152 | end 153 | end 154 | -------------------------------------------------------------------------------- /lib/bigquery_migration/schema.rb: -------------------------------------------------------------------------------- 1 | require 'csv' 2 | require 'json' 3 | require_relative 'error' 4 | 5 | class BigqueryMigration 6 | class Schema < ::Array 7 | ALLOWED_FIELD_TYPES = Set.new(['STRING', 'INTEGER', 'FLOAT', 'BOOLEAN', 'RECORD', 'TIMESTAMP', 'BYTES', 'DATE', 'TIME', 'DATETIME']) 8 | ALLOWED_FIELD_MODES = Set.new(['NULLABLE', 'REQUIRED', 'REPEATED']) 9 | 10 | def initialize(columns = []) 11 | normalized = self.class.normalize_columns(columns) 12 | super(normalized) 13 | validate_columns! 14 | end 15 | 16 | def find_column_by_name(name) 17 | self.class.find_column_by_name(self, name) 18 | end 19 | 20 | def validate_columns! 21 | self.class.validate_columns!(self) 22 | end 23 | 24 | def validate_permitted_operations!(source_columns) 25 | target_columns = self 26 | self.class.validate_permitted_operations!(source_columns, target_columns) 27 | end 28 | 29 | def normalize_columns 30 | self.class.normalize_columns(self) 31 | end 32 | 33 | def shallow_normalize_columns 34 | self.class.shallow_normalize_columns(self) 35 | end 36 | def shallow_normalize_columns! 
37 | self.class.shallow_normalize_column!(self) 38 | end 39 | 40 | def flattened_columns 41 | self.class.flattened_columns(self) 42 | end 43 | 44 | def equals?(source_columns) 45 | self.class.equals?(source_columns, self) 46 | end 47 | 48 | # self - source_columns 49 | def diff_columns(source_columns) 50 | self.class.diff_columns(source_columns, self) 51 | end 52 | 53 | # diff with only column names 54 | # self - source_columns 55 | def diff_columns_by_name(source_columns) 56 | self.class.diff_columns_by_name(source_columns, self) 57 | end 58 | 59 | # A.merge!(B) => B overwrites A 60 | # A.reverse_merge!(B) => A overwrites B, but A is modified 61 | def reverse_merge!(source_columns) 62 | self.class.reverse_merge!(source_columns, self) 63 | end 64 | 65 | def reject_columns!(drop_columns) 66 | self.class.reject_columns!(drop_columns, self) 67 | end 68 | 69 | def build_query_fields(source_columns) 70 | self.class.build_query_fields(source_columns, self) 71 | end 72 | 73 | class << self 74 | # The name must contain only letters (a-z, A-Z), numbers (0-9), or underscores (_), 75 | # and must start with a letter or underscore. The maximum length is 128 characters. 76 | def validate_name!(name) 77 | unless name =~ /\A[a-zA-Z_]+\w*\Z/ 78 | raise ConfigError, "Column name `#{name}` is invalid format" 79 | end 80 | unless name.length < 128 81 | raise ConfigError, "Column name `#{name}` must be less than 128" 82 | end 83 | end 84 | 85 | def validate_type!(type) 86 | unless ALLOWED_FIELD_TYPES.include?(type.upcase) 87 | raise ConfigError, "Column type `#{type}` is not allowed type" 88 | end 89 | end 90 | 91 | def validate_mode!(mode) 92 | unless ALLOWED_FIELD_MODES.include?(mode.upcase) 93 | raise ConfigError, "Column mode `#{mode}` is not allowed mode" 94 | end 95 | end 96 | 97 | def validate_columns!(columns) 98 | columns.each do |column| 99 | validate_name!(column[:name]) 100 | validate_type!(column[:type]) 101 | validate_mode!(column[:mode]) if column[:mode] 102 | 103 | if column[:type] == 'RECORD' 104 | validate_columns!(column[:fields]) 105 | end 106 | end 107 | end 108 | 109 | def find_column_by_name(columns, name) 110 | (columns || []).find { |c| c[:name] == name } 111 | end 112 | 113 | # validates permitted changes from old schema to new schema 114 | def validate_permitted_operations!(source_columns, target_columns) 115 | flattened_source_columns = flattened_columns(normalize_columns(source_columns)) 116 | flattened_target_columns = flattened_columns(normalize_columns(target_columns)) 117 | 118 | flattened_target_columns.keys.each do |flattened_name| 119 | next unless flattened_source_columns.key?(flattened_name) 120 | validate_permitted_operations_for_type!( 121 | flattened_source_columns[flattened_name], 122 | flattened_target_columns[flattened_name] 123 | ) 124 | validate_permitted_operations_for_mode!( 125 | flattened_source_columns[flattened_name], 126 | flattened_target_columns[flattened_name] 127 | ) 128 | end 129 | end 130 | 131 | # @param [Hash] source_column 132 | # @param [Hash] target_column 133 | # 134 | # Disallowed conversion rule is as follows: 135 | # 136 | # type: RECORD => type: others 137 | # mode: REPEATED => change type 138 | # 139 | def validate_permitted_operations_for_type!(source_column, target_column) 140 | source_column = shallow_normalize_column(source_column) 141 | target_column = shallow_normalize_column(target_column) 142 | 143 | msg = "(#{source_column.to_h} => #{target_column.to_h})" 144 | if source_column[:type] == 'RECORD' 145 | if target_column[:type] != 
'RECORD' 146 | raise ConfigError, "`RECORD` can not be changed #{msg}" 147 | end 148 | end 149 | if source_column[:mode] and source_column[:mode] == 'REPEATED' 150 | if source_column[:type] != target_column[:type] 151 | raise ConfigError, "`REPEATED` mode column's type can not be changed #{msg}" 152 | end 153 | end 154 | end 155 | 156 | # @param [Hash] source_column 157 | # @param [Hash] target_column 158 | # 159 | # Allowed conversion rule is as follows: 160 | # 161 | # (new) => NULLABLE, REPEATED 162 | # NULLABLE => NULLABLE 163 | # REQUIRED => REQUIRED, NULLABLE 164 | # REPEATED => REPEATED 165 | def validate_permitted_operations_for_mode!(source_column, target_column) 166 | source_column = shallow_normalize_column(source_column) 167 | target_column = shallow_normalize_column(target_column) 168 | source_mode = source_column[:mode] 169 | target_mode = target_column[:mode] 170 | 171 | return if source_mode == target_mode 172 | msg = "(#{source_column.to_h} => #{target_column.to_h})" 173 | 174 | case source_mode 175 | when nil 176 | if target_mode == 'REQUIRED' 177 | raise ConfigError, "Newly adding a `REQUIRED` column is not allowed #{msg}" 178 | end 179 | when 'NULLABLE' 180 | raise ConfigError, "`NULLABLE` column can not be changed #{msg}" 181 | when 'REQUIRED' 182 | if target_mode == 'REPEATED' 183 | raise ConfigError, "`REQUIRED` column can not be changed to `REPEATED` #{msg}" 184 | end 185 | when 'REPEATED' 186 | raise ConfigError, "`REPEATED` column can not be changed #{msg}" 187 | end 188 | end 189 | 190 | def normalize_columns(columns) 191 | columns = shallow_normalize_columns(columns) 192 | columns.map do |column| 193 | if column[:type] == 'RECORD' and column[:fields] 194 | column[:fields] = normalize_columns(column[:fields]) 195 | end 196 | column 197 | end 198 | end 199 | 200 | def shallow_normalize_columns(columns) 201 | columns.map {|column| shallow_normalize_column(column) } 202 | end 203 | 204 | def shallow_normalize_columns!(columns) 205 | columns.each {|column| shallow_normalize_column!(column) } 206 | columns 207 | end 208 | 209 | def shallow_normalize_column(column) 210 | shallow_normalize_column!(column.dup) 211 | end 212 | 213 | def shallow_normalize_column!(column) 214 | symbolize_keys!(column) 215 | column[:type] = column[:type].upcase if column[:type] 216 | column[:mode] ||= 'NULLABLE' 217 | column[:mode] = column[:mode].upcase 218 | column 219 | end 220 | 221 | def symbolize_keys!(column) 222 | new_column = column.map do |key, val| 223 | [key.to_sym, val] 224 | end.to_h 225 | column.replace(new_column) 226 | end 227 | 228 | # @param [Array] columns 229 | # [{ 230 | # name: 'citiesLived', 231 | # type: 'RECORD', 232 | # fields: [ 233 | # { 234 | # name: 'place', type: 'RECORD', 235 | # fields: [ 236 | # { name: 'city', type: 'STRING' }, { name: 'postcode', type: 'STRING' } 237 | # ] 238 | # }, 239 | # { name: 'yearsLived', type: 'INTEGER' } 240 | # ] 241 | # }] 242 | # @return Hash 243 | # { 244 | # 'citiesLived.place.city' => { 245 | # type: 'STRING' 246 | # }, 247 | # 'citiesLived.place.postcode' => { 248 | # type: 'STRING' 249 | # }, 250 | # 'citiesLived.yearsLived' => { 251 | # type: 'INTEGER' 252 | # } 253 | # } 254 | def flattened_columns(columns, parent_name: nil) 255 | result = {} 256 | columns.each do |column| 257 | column_name = parent_name.nil? ? 
column[:name] : "#{parent_name}.#{column[:name]}" 258 | if column[:type].upcase != 'RECORD' 259 | result[column_name] = {}.tap do |value| 260 | value[:type] = column[:type] 261 | value[:mode] = column[:mode] if column[:mode] 262 | end 263 | else 264 | result.merge!(flattened_columns(column[:fields], parent_name: column_name)) 265 | end 266 | end 267 | result 268 | end 269 | 270 | def equals?(source_columns, target_columns) 271 | diff_columns(source_columns, target_columns).empty? and \ 272 | diff_columns(target_columns, source_columns).empty? 273 | end 274 | 275 | # target_columns - source_columns 276 | def diff_columns(source_columns, target_columns) 277 | _target_columns = shallow_normalize_columns(target_columns) 278 | _source_columns = shallow_normalize_columns(source_columns) 279 | diff_columns = _target_columns - _source_columns # shallow diff 280 | 281 | diff_columns.map do |target_column| 282 | t = target_column 283 | source_column = find_column_by_name(_source_columns, target_column[:name]) 284 | next t unless source_column 285 | next t unless target_column[:type] == 'RECORD' and source_column[:type] == 'RECORD' 286 | next t unless target_column[:fields] and source_column[:fields] 287 | # recusive diff for RECORD columns 288 | diff_fields = diff_columns(source_column[:fields], target_column[:fields]) 289 | next nil if diff_fields.empty? # remove 290 | target_column[:fields] = diff_fields 291 | target_column 292 | end.compact 293 | end 294 | 295 | # diff with only column_names 296 | # target_columns - source_columns 297 | def diff_columns_by_name(source_columns, target_columns) 298 | _target_columns = shallow_normalize_columns(target_columns) 299 | _source_columns = shallow_normalize_columns(source_columns) 300 | diff_columns = _target_columns - _source_columns # shallow diff 301 | 302 | diff_columns.map do |target_column| 303 | t = target_column 304 | source_column = find_column_by_name(_source_columns, target_column[:name]) 305 | next t unless source_column 306 | next nil unless target_column[:type] == 'RECORD' and source_column[:type] == 'RECORD' 307 | next nil unless target_column[:fields] and source_column[:fields] 308 | # recusive diff for RECORD columns 309 | diff_fields = diff_columns_by_name(source_column[:fields], target_column[:fields]) 310 | next nil if diff_fields.empty? # remove 311 | target_column[:fields] = diff_fields 312 | target_column 313 | end.compact 314 | end 315 | 316 | # 1. target_column[:mode] ||= source_column[:mode] || 'NULLABLE' (not overwrite, but set if does not exist) 317 | # 2. 
Add into target_columns if a source column does not exist in target_columns 318 | # 319 | # @param [Array] source_columns 320 | # @param [Array] target_columns 321 | def reverse_merge!(source_columns, target_columns) 322 | shallow_normalize_columns!(source_columns) 323 | shallow_normalize_columns!(target_columns) 324 | 325 | source_columns.map do |source_column| 326 | if target_column = find_column_by_name(target_columns, source_column[:name]) 327 | target_column[:mode] ||= source_column[:mode] || 'NULLABLE' 328 | target_column[:type] ||= source_column[:type] # should never be happened 329 | # Recursive merge fields of `RECORD` type 330 | if target_column[:type] == 'RECORD' and target_column[:fields] and source_column[:fields] 331 | reverse_merge!(source_column[:fields], target_column[:fields]) 332 | end 333 | else 334 | target_column = source_column.dup 335 | target_column[:mode] ||= 'NULLABLE' 336 | target_columns << target_column 337 | end 338 | end 339 | target_columns 340 | end 341 | 342 | def reject_columns!(drop_columns, target_columns) 343 | flattened_drop_columns = flattened_columns(drop_columns) 344 | 345 | flattened_drop_columns.keys.each do |flattened_name| 346 | # paths like a %w(citiesLived place city child1) 347 | paths = flattened_name.split('.') 348 | # object_id of fields and target_columns are different. 349 | # But the internal elements refer to the same ones 350 | fields = target_columns 351 | paths.each do |path| 352 | # The last element of the path does not have the fields 353 | next if path == paths.last 354 | # find recursively 355 | column = fields.find { |f| f[:name] == path } 356 | next if column.nil? 357 | fields = column[:fields] 358 | end 359 | 360 | unless fields.empty? 361 | fields.delete_if { |f| f[:name] == paths.last } 362 | end 363 | end 364 | target_columns 365 | end 366 | 367 | def build_query_fields(source_columns, target_columns) 368 | flattened_source_columns = flattened_columns(source_columns) 369 | flattened_target_columns = flattened_columns(target_columns) 370 | 371 | query_fields = flattened_target_columns.map do |flattened_name, flattened_target_column| 372 | flattened_source_column = flattened_source_columns[flattened_name] 373 | target_type = flattened_target_column[:type].upcase 374 | 375 | if flattened_source_column 376 | "#{target_type}(#{flattened_name}) AS #{flattened_name}" 377 | else 378 | flattened_name 379 | # MEMO: NULL cast like "#{target_type}(NULL) AS #{flattened_name}" breaks RECORD columns as 380 | # INTEGER(NULL) AS add_record.add_record.add_column1 => add_record_add_record_add_column1 381 | # We have to add columns with patch_table beforehand 382 | end 383 | end 384 | end 385 | 386 | def make_nullable!(columns) 387 | columns.each do |column| 388 | if column[:fields] 389 | make_nullable!(column[:fields]) 390 | else 391 | column[:mode] = 'NULLABLE' 392 | end 393 | end 394 | columns 395 | end 396 | end 397 | end 398 | end 399 | -------------------------------------------------------------------------------- /test/test_schema.rb: -------------------------------------------------------------------------------- 1 | require_relative 'helper.rb' 2 | require 'bigquery_migration/schema' 3 | 4 | class BigqueryMigration 5 | class TestSchema < Test::Unit::TestCase 6 | def columns 7 | [ 8 | {name: 'boolean', type: 'BOOLEAN', mode: 'NULLABLE'}, 9 | {name: 'integer', type: 'INTEGER'}, 10 | {name: 'float', type: 'FLOAT'}, 11 | {name: 'string', type: 'STRING'}, 12 | {name: 'timstamp', type: 'TIMESTAMP'}, 13 | {name: 'record', type: 'RECORD', 
fields: [ 14 | {name: 'record', type: 'RECORD', fields: [ 15 | {name: 'string', type: 'STRING', mode: 'NULLABLE'}, 16 | ]}, 17 | ]} 18 | ] 19 | end 20 | 21 | sub_test_case "find_column_by_name" do 22 | def test_find_column_by_name 23 | expected = {name: 'boolean', type: 'BOOLEAN', mode: 'NULLABLE'} 24 | assert { Schema.find_column_by_name(columns, 'boolean') == expected } 25 | assert { Schema.new(columns).find_column_by_name('boolean') == expected } 26 | end 27 | end 28 | 29 | sub_test_case "validate_columns!" do 30 | def test_validate_columns_with_valid 31 | assert_nothing_raised { Schema.new(columns).validate_columns! } 32 | assert_nothing_raised { Schema.validate_columns!(columns) } 33 | 34 | no_mode = [{name: 'name', type: 'STRING'}] 35 | assert_nothing_raised { Schema.validate_columns!(no_mode) } 36 | 37 | downcase_type = [{name: 'name', type: 'string'}] 38 | assert_nothing_raised { Schema.validate_columns!(downcase_type) } 39 | 40 | upcase_type = [{name: 'name', type: 'STRING'}] 41 | assert_nothing_raised { Schema.validate_columns!(upcase_type) } 42 | 43 | downcase_mode = [{name: 'name', type: 'STRING', mode: 'nullable'}] 44 | assert_nothing_raised { Schema.validate_columns!(downcase_mode) } 45 | 46 | upcase_mode = [{name: 'name', type: 'STRING', mode: 'NULLABLE'}] 47 | assert_nothing_raised { Schema.validate_columns!(upcase_mode) } 48 | end 49 | 50 | def test_validate_columns_with_invalid 51 | no_name = [{}] 52 | assert_raise { Schema.validate_columns!(no_name) } 53 | 54 | invalid_name = [{name: '%&%&^**'}] 55 | assert_raise { Schema.validate_columns!(invalid_name) } 56 | 57 | long_name = [{name: 'a'*129}] 58 | assert_raise { Schema.validate_columns!(long_name) } 59 | 60 | no_type = [{name: 'name'}] 61 | assert_raise { Schema.validate_columns!(no_type) } 62 | 63 | invalid_type = [{name: 'name', type: 'foobar'}] 64 | assert_raise { Schema.validate_columns!(invalid_type) } 65 | 66 | invalid_mode = [{name: 'name', type: 'STRING', mode: 'foobar'}] 67 | assert_raise { Schema.validate_columns!(no_mode) } 68 | end 69 | end 70 | 71 | sub_test_case "normalize_columns" do 72 | def test_normalize_columns 73 | downcase_columns = [ 74 | {name: 'boolean', type: 'boolean', mode: 'nullable'}, 75 | {name: 'integer', type: 'integer'}, 76 | {name: 'float', type: 'float'}, 77 | {name: 'string', type: 'string'}, 78 | {name: 'timstamp', type: 'timestamp'}, 79 | {name: 'record', type: 'record', fields: [ 80 | {name: 'record', type: 'record', fields: [ 81 | {name: 'string', type: 'string', mode: 'nullable'}, 82 | ]}, 83 | ]} 84 | ] 85 | expected = [ 86 | {name: 'boolean', type: 'BOOLEAN', mode: 'NULLABLE'}, 87 | {name: 'integer', type: 'INTEGER', mode: 'NULLABLE'}, 88 | {name: 'float', type: 'FLOAT', mode: 'NULLABLE'}, 89 | {name: 'string', type: 'STRING', mode: 'NULLABLE'}, 90 | {name: 'timstamp', type: 'TIMESTAMP', mode: 'NULLABLE'}, 91 | {name: 'record', type: 'RECORD', mode: 'NULLABLE', fields: [ 92 | {name: 'record', type: 'RECORD', mode: 'NULLABLE', fields: [ 93 | {name: 'string', type: 'STRING', mode: 'NULLABLE'}, 94 | ]}, 95 | ]} 96 | ] 97 | result = Schema.normalize_columns(downcase_columns) 98 | assert { result == expected } 99 | result = Schema.new(downcase_columns).normalize_columns 100 | assert { result == expected } 101 | end 102 | end 103 | 104 | sub_test_case "flattened_columns" do 105 | def test_flattened_columns 106 | columns = [ 107 | { name: 'id', type: 'INTEGER' }, 108 | { name: 'citiesLived', type: 'RECORD', fields: [ 109 | { name: 'place', type: 'RECORD', fields: [ 110 | { name: 
'city', type: 'STRING' }, 111 | { name: 'postcode', type: 'STRING' } 112 | ] }, 113 | { name: 'yearsLived', type: 'INTEGER' } 114 | ] } 115 | ] 116 | 117 | expected = { 118 | 'id' => { type: 'INTEGER' }, 119 | 'citiesLived.place.city' => { type: 'STRING' }, 120 | 'citiesLived.place.postcode' => { type: 'STRING' }, 121 | 'citiesLived.yearsLived' => { type: 'INTEGER' } 122 | } 123 | result = Schema.flattened_columns(columns) 124 | assert { result == expected } 125 | end 126 | end 127 | 128 | sub_test_case "diff_columns" do 129 | sub_test_case "without intersect" do 130 | def subset 131 | [ 132 | {:name=>"remained_column", :type=>"INTEGER"}, 133 | {:name=>"record", 134 | :type=>"RECORD", 135 | :fields=>[ 136 | {:name=>"record", :type=>"RECORD", :fields=>[ 137 | {:name=>"remained_column", :type=>"STRING"} 138 | ]} 139 | ]} 140 | ] 141 | end 142 | 143 | def superset 144 | [ 145 | {:name=>"remained_column", :type=>"INTEGER"}, 146 | {:name=>"record", :type=>"RECORD", :fields=>[ 147 | {:name=>"record", :type=>"RECORD", :fields=>[ 148 | {:name=>"remained_column", :type=>"STRING"}, 149 | {:name=>"new_column", :type=>"INTEGER"} 150 | ]}, 151 | {:name=>"new_record", :type=>"RECORD", :fields=>[ 152 | {:name=>"new_column", :type=>"INTEGER"} 153 | ]} 154 | ]}, 155 | {:name=>"new_required_column", :type=>"INTEGER", :mode=>"REQUIRED"} 156 | ] 157 | end 158 | 159 | def test_diff_columns_subset 160 | result = Schema.new(subset).diff_columns(superset) 161 | assert { result == [] } 162 | end 163 | 164 | def test_diff_columns_superset 165 | expected = [ 166 | {:name=>"record", :type=>"RECORD", :fields=>[ 167 | {:name=>"record", :type=>"RECORD", :fields=>[ 168 | {:name=>"new_column", :type=>"INTEGER", :mode=>"NULLABLE" } 169 | ], :mode=>"NULLABLE"}, 170 | {:name=>"new_record", :type=>"RECORD", :fields=>[ 171 | {"name"=>"new_column", "type"=>"INTEGER", :mode=>"NULLABLE" } 172 | ], :mode=>"NULLABLE"} 173 | ], :mode=>"NULLABLE"}, 174 | {:name=>"new_required_column", :type=>"INTEGER", :mode=>"REQUIRED"} 175 | ] 176 | result = Schema.new(superset).diff_columns(subset) 177 | assert { Schema.equals?(result, expected) } 178 | end 179 | end 180 | 181 | sub_test_case "with intersect" do 182 | def before_columns 183 | [ 184 | {"name"=>"drop_column", "type"=>"INTEGER"}, 185 | {"name"=>"remained_column", "type"=>"INTEGER"}, 186 | {"name"=>"record", "type"=>"RECORD", "fields"=>[ 187 | {"name"=>"record", "type"=>"RECORD", "fields"=>[ 188 | {"name"=>"drop_column", "type"=>"INTEGER"}, 189 | {"name"=>"remained_column", "type"=>"STRING"} 190 | ]} 191 | ]} 192 | ] 193 | end 194 | 195 | def after_columns 196 | [ 197 | {"name"=>"remained_column", "type"=>"INTEGER"}, 198 | {"name"=>"record", "type"=>"RECORD", "fields"=>[ 199 | {"name"=>"record", "type"=>"RECORD", "fields"=>[ 200 | {"name"=>"remained_column", "type"=>"STRING"}, 201 | {"name"=>"new_column", "type"=>"INTEGER"} 202 | ]}, 203 | {"name"=>"new_record", "type"=>"RECORD", "fields"=>[ 204 | {:name=>"new_column", :type=>"INTEGER"} 205 | ]} 206 | ]}, 207 | {"name"=>"new_required_column", "type"=>"INTEGER", "mode"=>"REQUIRED"} 208 | ] 209 | end 210 | 211 | def test_diff_columns_drop_columns 212 | drop_columns = Schema.new(before_columns).diff_columns(after_columns) 213 | expected = [ 214 | {:name=>"drop_column", :type=>"INTEGER", :mode=>"NULLABLE"}, 215 | {:name=>"record", :type=>"RECORD", :mode=>"NULLABLE", :fields=>[ 216 | {:name=>"record", :type=>"RECORD", :mode=>"NULLABLE", :fields=>[ 217 | {:name=>"drop_column", :type=>"INTEGER", :mode=>"NULLABLE" } 218 | ]} 219 | ]} 220 | ] 
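# diff_columns returns the columns (including nested RECORD fields) present in the
# receiver but missing from the argument, filling in absent modes as NULLABLE,
# so diffing before_columns against after_columns yields exactly the columns to drop.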
221 | assert { Schema.equals?(drop_columns, expected) } 222 | end 223 | 224 | def test_diff_columns_add_columns 225 | add_columns = Schema.new(after_columns).diff_columns(before_columns) 226 | expected = [ 227 | {:name=>"record", :type=>"RECORD", :mode=>"NULLABLE", :fields=>[ 228 | {:name=>"record", :type=>"RECORD", :mode=>"NULLABLE", :fields=>[ 229 | {:name=>"new_column", :type=>"INTEGER", :mode=>"NULLABLE"} 230 | ]}, 231 | {:name=>"new_record", :type=>"RECORD", :mode=>"NULLABLE", :fields=>[ 232 | {"name"=>"new_column", "type"=>"INTEGER", :mode=>"NULLABLE"} 233 | ]} 234 | ]}, 235 | {:name=>"new_required_column", :type=>"INTEGER", :mode=>"REQUIRED"} 236 | ] 237 | assert { Schema.equals?(add_columns, expected) } 238 | end 239 | end 240 | end 241 | 242 | sub_test_case "diff_columns_by_name" do 243 | def before_columns 244 | [ 245 | {:name=>"drop_column", :type=>"INTEGER"}, 246 | {:name=>"record", :type=>"RECORD", :fields=>[ 247 | {:name=>"record", :type=>"RECORD", :fields=>[ 248 | {:name=>"drop_column", :type=>"INTEGER"}, 249 | ]} 250 | ]} 251 | ] 252 | end 253 | 254 | def after_columns 255 | [ 256 | {:name=>"drop_column", :type=>"STRING"}, 257 | {:name=>"record", :type=>"RECORD", :fields=>[ 258 | {:name=>"record", :type=>"RECORD", :fields=>[ 259 | {:name=>"drop_column", :type=>"STRING"}, 260 | {:name=>"new_column", :type=>"INTEGER"} 261 | ]}, 262 | {:name=>"new_record", :type=>"RECORD", :fields=>[ 263 | {:name=>"new_column", :type=>"INTEGER"} 264 | ]} 265 | ]}, 266 | {:name=>"new_required_column", :type=>"INTEGER", :mode=>"REQUIRED"} 267 | ] 268 | end 269 | 270 | def test_diff_columns_by_name 271 | diff_columns = Schema.new(after_columns).diff_columns_by_name(before_columns) 272 | expected = [ 273 | {:name=>"record", :type=>"RECORD", :fields=>[ 274 | {:name=>"record", :type=>"RECORD", :fields=>[ 275 | {:name=>"new_column", :type=>"INTEGER"} 276 | ]}, 277 | {:name=>"new_record", :type=>"RECORD", :fields=>[ 278 | {:name=>"new_column", :type=>"INTEGER"} 279 | ]} 280 | ]}, 281 | {:name=>"new_required_column", :type=>"INTEGER", :mode=>"REQUIRED"} 282 | ] 283 | 284 | assert { Schema.equals?(expected, diff_columns) } 285 | end 286 | end 287 | 288 | sub_test_case "reverse_merge!" do 289 | def test_reverse_merge! 290 | source_columns = [ 291 | { name: 'id', type: 'INTEGER', mode: 'NULLABLE' }, 292 | { name: 'name', type: 'RECORD', mode: 'REQUIRED', fields: [ 293 | { name: 'first_name', type: 'STRING', mode: 'NULLABLE' }, 294 | { name: 'last_name', type: 'STRING' }, 295 | { name: 'new_column', type: 'STRING' }, 296 | ] } 297 | ] 298 | 299 | target_columns = [ 300 | { name: 'id', type: 'INTEGER' }, 301 | { name: 'name', type: 'RECORD', mode: 'NULLABLE', fields: [ 302 | { name: 'first_name', type: 'STRING' }, 303 | { name: 'last_name', type: 'STRING' }, 304 | ] }, 305 | ] 306 | 307 | expected = [ 308 | { name: 'id', type: 'INTEGER', mode: 'NULLABLE' }, 309 | { name: 'name', type: 'RECORD', mode: 'NULLABLE', fields: [ 310 | { name: 'first_name', type: 'STRING', mode: 'NULLABLE' }, 311 | { name: 'last_name', type: 'STRING', mode: 'NULLABLE' }, 312 | { name: 'new_column', type: 'STRING', mode: 'NULLABLE' }, 313 | ] } 314 | ] 315 | 316 | result = Schema.new(target_columns).reverse_merge!(source_columns) 317 | assert { result == expected } 318 | end 319 | end 320 | 321 | sub_test_case "reject_columns!" do 322 | def test_reject_columns! 
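# Schema.reject_columns! removes the drop_columns paths (flattened, dot-separated names)
# from target_columns in place, descending into nested RECORD fields.
# A minimal illustrative sketch (not part of the original test; hypothetical fixture data):
#   Schema.reject_columns!([{ name: 'a', type: 'STRING' }],
#                          [{ name: 'a', type: 'STRING' }, { name: 'b', type: 'INTEGER' }])
#   #=> [{ name: 'b', type: 'INTEGER' }]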
323 | target_columns = [ 324 | { name: 'id', type: 'INTEGER' }, 325 | { name: 'citiesLived', type: 'RECORD', fields: [ 326 | { name: 'place', type: 'RECORD', fields: [ 327 | { name: 'city', type: 'RECORD', fields: [ 328 | { name: 'child1', type: 'STRING' }, 329 | { name: 'child2', type: 'STRING' } 330 | ] }, 331 | { name: 'postcode', type: 'STRING' } 332 | ] }, 333 | { name: 'yearsLived', type: 'INTEGER' } 334 | ] } 335 | ] 336 | 337 | drop_columns = [ 338 | { name: 'citiesLived', type: 'RECORD', fields: [ 339 | { name: 'place', type: 'RECORD', fields: [ 340 | { name: 'city', type: 'RECORD', fields: [ 341 | { name: 'child2', type: 'STRING' }, 342 | ] } 343 | ] } 344 | ] } 345 | ] 346 | 347 | expected = [ 348 | { name: 'id', type: 'INTEGER' }, 349 | { name: 'citiesLived', type: 'RECORD', fields: [ 350 | { name: 'place', type: 'RECORD', fields: [ 351 | { name: 'city', type: 'RECORD', fields: [ 352 | name: 'child1', type: 'STRING' 353 | ] 354 | }, 355 | { name: 'postcode', type: 'STRING' } 356 | ] }, 357 | { name: 'yearsLived', type: 'INTEGER' } 358 | ] } 359 | ] 360 | 361 | result = Schema.reject_columns!(drop_columns, target_columns) 362 | assert { result == expected } 363 | end 364 | end 365 | 366 | sub_test_case "build_query_fields" do 367 | def subset 368 | subset = [ 369 | {name: "remained_column", type: "INTEGER"}, 370 | {name: "record", type: "RECORD", fields: [ 371 | {name: "record", type: "RECORD", fields: [ 372 | {name: "remained_column", type: "STRING" } 373 | ]} 374 | ]} 375 | ] 376 | end 377 | 378 | def superset 379 | [ 380 | {name: "remained_column", type: "INTEGER"}, 381 | {name: "record", type: "RECORD", fields: [ 382 | {name: "record", type: "RECORD", fields: [ 383 | {name: "remained_column", type: "STRING" }, 384 | {name: "new_column", type: "INTEGER" } 385 | ]}, 386 | {name: "new_record", type: "RECORD", fields: [ 387 | {name: "new_column", type: "INTEGER"} 388 | ]} 389 | ]}, 390 | {name: "new_required_column", type: "INTEGER", mode: "REQUIRED" } 391 | ] 392 | end 393 | 394 | def test_build_query_fields_for_subset 395 | target_columns = subset 396 | source_columns = superset 397 | 398 | schema = Schema.new(target_columns) 399 | result = schema.build_query_fields(source_columns) 400 | expected = [ 401 | "INTEGER(remained_column) AS remained_column", 402 | "STRING(record.record.remained_column) AS record.record.remained_column" 403 | ] 404 | assert { expected == result } 405 | end 406 | 407 | def test_build_query_fields_for_superset 408 | target_columns = superset 409 | source_columns = subset 410 | 411 | schema = Schema.new(target_columns) 412 | result = schema.build_query_fields(source_columns) 413 | expected = [ 414 | "INTEGER(remained_column) AS remained_column", 415 | "STRING(record.record.remained_column) AS record.record.remained_column", 416 | "record.record.new_column", 417 | "record.new_record.new_column", 418 | "new_required_column" 419 | ] 420 | assert { expected == result } 421 | end 422 | end 423 | end 424 | end 425 | -------------------------------------------------------------------------------- /test/test_table_data.rb: -------------------------------------------------------------------------------- 1 | require_relative 'helper.rb' 2 | require 'bigquery_migration/table_data' 3 | 4 | class BigqueryMigration 5 | class TestTableData < Test::Unit::TestCase 6 | sub_test_case "values" do 7 | def test_values_simple 8 | columns = [ 9 | { name: 'string', type: 'STRING', mode: 'NULLABLE'}, 10 | { name: 'integer', type: 'INTEGER', mode: 'NULLABLE'}, 11 | { name: 
'float', type: 'FLOAT', mode: 'NULLABLE'}, 12 | { name: 'boolean', type: 'BOOLEAN', mode: 'NULLABLE'}, 13 | { name: 'timestamp', type: 'TIMESTAMP', mode: 'NULLABLE'}, 14 | ] 15 | 16 | rows = [ 17 | { f: [ 18 | {v: "foo"}, 19 | {v: "1"}, 20 | {v: "1.1"}, 21 | {v: "true"}, 22 | {v: "1.444435200E9"} 23 | ] }, 24 | { f: [ 25 | {v: "bar"}, 26 | {v: "2"}, 27 | {v: "2.2"}, 28 | {v: "false"}, 29 | {v: "1.444435200E9"} 30 | ] } 31 | ] 32 | 33 | expected = [ 34 | [ "foo", "1", "1.1", "true", "1.444435200E9" ], 35 | [ "bar", "2", "2.2", "false", "1.444435200E9" ] 36 | ] 37 | 38 | assert { TableData.new(columns, rows).values == expected } 39 | end 40 | 41 | def test_values_with_empty_hash 42 | columns = [ 43 | {name: "category", type: "STRING"}, 44 | {name: "number", type: "INTEGER"}, 45 | {name: "null_string", type: "STRING"}, 46 | {name: "d", type: "STRING"}, 47 | {name: "t", type: "TIMESTAMP"} 48 | ] 49 | 50 | rows = [ 51 | { f: [ 52 | {v: "dummyEventCategory03"}, 53 | {v: "5678"}, 54 | {v: {}}, 55 | {v: "2016-07-25"}, 56 | {v: "1.4693724E9"} 57 | ] } 58 | ] 59 | 60 | expected = [ 61 | [ "dummyEventCategory03", "5678", nil, "2016-07-25", "1.4693724E9" ], 62 | ] 63 | 64 | assert { TableData.new(columns, rows).values == expected } 65 | end 66 | 67 | def test_values_repeated_and_record_simple 68 | columns = [ 69 | { name: 'repeated_record', type: 'RECORD', mode: 'REPEATED', fields: [ 70 | { name: 'record', type: 'RECORD', mode: 'NULLABLE', fields: [ 71 | { name: 'repeated_time', type: 'TIMESTAMP', mode: 'REPEATED' } 72 | ] }, 73 | ] } 74 | ] 75 | 76 | rows = [ 77 | { f: [ 78 | { v: [ 79 | { v: 80 | { f: [ 81 | { v: 82 | { f: [ 83 | { v: [ 84 | {v: "1.444435200E9"}, 85 | {v: "1.444435200E9"} 86 | ] } 87 | ] } 88 | } 89 | ] } 90 | }, 91 | v: { 92 | f: [ 93 | { v: 94 | { f: [ 95 | { v: [ 96 | {v: "1.444435200E9"}, 97 | {v: "1.444435200E9"}, 98 | {v: "1.444435200E9"} 99 | ] } 100 | ] } 101 | } 102 | ] 103 | } 104 | ] } 105 | ] } 106 | ] 107 | 108 | expected = [ 109 | # only single row 110 | [ 111 | ["1.444435200E9"], 112 | ["1.444435200E9"], 113 | ["1.444435200E9"], 114 | ["1.444435200E9"], 115 | ["1.444435200E9"] 116 | ] 117 | ] 118 | 119 | assert { TableData.new(columns, rows).values == expected } 120 | end 121 | 122 | def test_values_repeated_and_record_multiple 123 | columns = [ 124 | { name: 'repeated_record', type: 'RECORD', mode: 'REPEATED', fields: [ 125 | { name: 'record', type: 'RECORD', mode: 'NULLABLE', fields: [ 126 | { name: 'repeated_time', type: 'TIMESTAMP', mode: 'REPEATED' } 127 | ] }, 128 | ] } 129 | ] 130 | 131 | rows = [ 132 | { f: [ 133 | { v: [ 134 | { v: 135 | { f: [ 136 | { v: 137 | { f: [ 138 | { v: [ 139 | {v: "1.444435200E9"}, 140 | {v: "1.444435200E9"} 141 | ] } 142 | ] } 143 | } 144 | ] } 145 | } 146 | ] } 147 | ] }, 148 | { f: [ 149 | { v: [ 150 | { v: 151 | { f: [ 152 | { v: 153 | { f: [ 154 | { v: [ 155 | {v: "1.444435200E9"}, 156 | {v: "1.444435200E9"} 157 | ] } 158 | ] } 159 | } 160 | ] } 161 | } 162 | ] } 163 | ] } 164 | ] 165 | 166 | expected = [ 167 | # first row 168 | [ 169 | ["1.444435200E9"], 170 | ["1.444435200E9"], 171 | ], 172 | # second row 173 | [ 174 | ["1.444435200E9"], 175 | ["1.444435200E9"], 176 | ] 177 | ] 178 | 179 | assert { TableData.new(columns, rows).values == expected } 180 | end 181 | 182 | def test_values_repeated_in_middle_row 183 | columns = [ 184 | { "name": "string", "type": "STRING", "mode": "NULLABLE" }, 185 | { "name": "integer", "type": "INTEGER", "mode": "NULLABLE" }, 186 | { "name": "repeated", "type": "STRING", "mode": "REPEATED" }, 
187 | { "name": "float", "type": "FLOAT", "mode": "NULLABLE" }, 188 | { "name": "boolean", "type": "BOOLEAN", "mode": "NULLABLE" }, 189 | { "name": "timestamp", "type": "TIMESTAMP", "mode": "NULLABLE" } 190 | ] 191 | 192 | rows = [ 193 | { f: [ 194 | { v: "foo" }, 195 | { v: "1" }, 196 | { v: [] }, 197 | { v: "1.1" }, 198 | { v: "true" }, 199 | { v: "1.444435200E9" } 200 | ] }, 201 | { f: [ 202 | { v: "foo" }, 203 | { v: "3" }, 204 | { v: [] }, 205 | { v: "3.3" }, 206 | { v: "true" }, 207 | { v: "1.444435200E9" } 208 | ] }, 209 | { f: [ 210 | { v: "foo" }, 211 | { v: "4" }, 212 | { v: [] }, 213 | { v: "4.4" }, 214 | { v: "false" }, 215 | { v: "1.444435200E9" } 216 | ] }, 217 | { f: [ 218 | { v: "foo" }, 219 | { v: "2" }, 220 | { v: [ 221 | { v: "foo" }, 222 | { v: "bar" } 223 | ] }, 224 | { v: "2.2" }, 225 | { v: "false" }, 226 | { v: "1.444435200E9" } 227 | ] } 228 | ] 229 | 230 | expected = [ 231 | # first row 232 | [ 233 | [ "foo", "1", nil, "1.1", "true", "1.444435200E9" ] 234 | ], 235 | # second row 236 | [ 237 | [ "foo", "3", nil, "3.3", "true", "1.444435200E9" ], 238 | ], 239 | # third row 240 | [ 241 | [ "foo", "4", nil, "4.4", "false", "1.444435200E9" ] 242 | ], 243 | # fourth row 244 | [ 245 | [ "foo", "2", "foo", "2.2", "false", "1.444435200E9" ], 246 | [ nil, nil, "bar", nil, nil, nil ], 247 | ], 248 | ] 249 | 250 | 251 | assert { TableData.new(columns, rows).values == expected } 252 | end 253 | 254 | def test_values_repeated_and_record_in_middle_row 255 | columns = [ 256 | { "name": "string", "type": "STRING", "mode": "NULLABLE" }, 257 | { "name": "integer", "type": "INTEGER", "mode": "NULLABLE" }, 258 | { "name": "repeated", "type": "RECORD", "mode": "REPEATED", "fields": [ 259 | { "name": "record", "type": "STRING", "mode": "REPEATED" } 260 | ] }, 261 | { "name": "float", "type": "FLOAT", "mode": "NULLABLE" }, 262 | { "name": "boolean", "type": "BOOLEAN", "mode": "NULLABLE" }, 263 | { "name": "timestamp", "type": "TIMESTAMP", "mode": "NULLABLE" } 264 | ] 265 | 266 | rows = [ 267 | { f: [ 268 | { v: "foo" }, 269 | { v: "1" }, 270 | { v: [] }, 271 | { v: "1.1" }, 272 | { v: "true" }, 273 | { v: "1.444435200E9" } 274 | ] }, 275 | { f: [ 276 | { v: "foo" }, 277 | { v: "4" }, 278 | { v: [] }, 279 | { v: "4.4" }, 280 | { v: "true" }, 281 | { v: "1.444435200E9" } 282 | ] }, 283 | { f: [ 284 | { v: "foo" }, 285 | { v: "5" }, 286 | { v: [] }, 287 | { v: "5.5" }, 288 | { v: "false" }, 289 | { v: "1.444435200E9" } 290 | ] }, 291 | { f: [ 292 | { v: "foo" }, 293 | { v: "2" }, 294 | { v: [ 295 | { v: 296 | { f: [ 297 | { v: [ 298 | { v: "foo" }, 299 | { v: "bar" } 300 | ] } 301 | ] } 302 | }, 303 | { v: 304 | { f: [ 305 | { v: [ 306 | { v: "foo" }, 307 | { v: "bar" } 308 | ] } 309 | ] } 310 | } 311 | ] }, 312 | { v: "2.2" }, 313 | { v: "false" }, 314 | { v: "1.444435200E9" } 315 | ] }, 316 | { f: [ 317 | { v: "foo" }, 318 | { v: "3" }, 319 | { v: [ 320 | { v: 321 | { f: [ 322 | { v: [ 323 | { v: "foo" }, 324 | { v: "bar" } 325 | ] } 326 | ] } 327 | }, 328 | { v: 329 | { f: [ 330 | { v: [ 331 | { v: "foo" }, 332 | { v: "bar" } 333 | ] } 334 | ] } 335 | } 336 | ] }, 337 | { v: "3.3" }, 338 | { v: "false" }, 339 | { v: "1.444435200E9" } 340 | ] } 341 | ] 342 | 343 | expected = [ 344 | # first row 345 | [ 346 | [ "foo", "1", nil, "1.1", "true", "1.444435200E9" ] 347 | ], 348 | # second row 349 | [ 350 | [ "foo", "4", nil, "4.4", "true", "1.444435200E9" ] 351 | ], 352 | # third row 353 | [ 354 | [ "foo", "5", nil, "5.5", "false", "1.444435200E9" ] 355 | ], 356 | # fourth row 357 | [ 358 | [ 
"foo", "2", "foo", "2.2", "false", "1.444435200E9" ], 359 | [ nil, nil, "bar", nil, nil, nil ], 360 | [ nil, nil, "foo", nil, nil, nil ], 361 | [ nil, nil, "bar", nil, nil, nil ] 362 | ], 363 | # fifth row 364 | [ 365 | [ "foo", "3", "foo", "3.3", "false", "1.444435200E9" ], 366 | [ nil, nil, "bar", nil, nil, nil ], 367 | [ nil, nil, "foo", nil, nil, nil ], 368 | [ nil, nil, "bar", nil, nil, nil ] 369 | ], 370 | ] 371 | 372 | assert { TableData.new(columns, rows).values == expected } 373 | end 374 | 375 | def test_values_repeated_and_record_complex 376 | columns = [ 377 | { name: 'repeated_record', type: 'RECORD', mode: 'REPEATED', fields: [ 378 | { name: 'record', type: 'RECORD', mode: 'NULLABLE', fields: [ 379 | { name: 'child', type: 'STRING', mode: 'NULLABLE' }, 380 | { name: 'repeated_time', type: 'TIMESTAMP', mode: 'REPEATED' } 381 | ] }, 382 | { name: 'repeated_time', type: 'TIMESTAMP', mode: 'REPEATED' } 383 | ] }, 384 | { name: 'repeated_string', type: 'STRING', mode: 'REPEATED' }, 385 | { name: 'repeated_int', type: 'INTEGER', mode: 'REPEATED' }, 386 | { name: 'repeated_record2', type: 'RECORD', mode: 'REPEATED', fields: [ 387 | { name: 'record2', type: 'RECORD', mode: 'NULLABLE', fields: [ 388 | { name: 'repeated_float', type: 'FLOAT', mode: 'REPEATED' }, 389 | { name: 'child2', type: 'STRING', mode: 'REQUIRED' } 390 | ] } 391 | ] } 392 | ] 393 | 394 | rows = [ 395 | { f: [ 396 | { v: [ 397 | { v: 398 | { f: [ 399 | { v: 400 | { f: [ 401 | { v: "foo"}, 402 | { v: [ 403 | { v: "1.44423E9"}, 404 | { v: "1.4443164E9"} 405 | ] } 406 | ] } 407 | }, 408 | { v: [ 409 | { v: "1.4444028E9"}, 410 | { v: "1.4444028E9"} 411 | ] } 412 | ] } 413 | }, 414 | { v: 415 | { f: [ 416 | { v: 417 | { f: [ 418 | { v: "fuga"}, 419 | { v: [] } 420 | ] } 421 | }, 422 | { v: [ 423 | { v: "1.4445756E9"}, 424 | { v: "1.444662E9"} 425 | ] } 426 | ] } 427 | } 428 | ] }, 429 | { v: [ 430 | { v: "one"}, 431 | { v: "two"}, 432 | { v: "three"} 433 | ] }, 434 | { v: [ 435 | { v: "1"}, 436 | { v: "2"} 437 | ] }, 438 | { v: [ 439 | { v: 440 | { f: [ 441 | { v: 442 | { f: [ 443 | { v: [ 444 | { v: "1.1"}, 445 | { v: "2.2"}, 446 | { v: "3.3"} 447 | ] }, 448 | { v: "foo2"} 449 | ] } 450 | } 451 | ] } 452 | }, 453 | { v: 454 | { f: [ 455 | { v: 456 | { f: [ 457 | { v: [ 458 | { v: "4.4"}, 459 | { v: "5.5"}, 460 | { v: "6.6"}, 461 | { v: "7.7"} 462 | ] }, 463 | { v: "bar"} 464 | ] } 465 | } 466 | ] } 467 | } 468 | ] } 469 | ] } 470 | ] 471 | 472 | expected = [ 473 | # only single row 474 | [ 475 | ["foo", "1.44423E9", "1.4444028E9", "one", "1", "1.1", "foo2"], 476 | [nil, "1.4443164E9", "1.4444028E9", "two", "2", "2.2", nil], 477 | ["fuga", nil, "1.4445756E9", "three", nil, "3.3", nil], 478 | [nil, nil, "1.444662E9", nil, nil, "4.4", "bar"], 479 | [nil, nil, nil, nil, nil, "5.5", nil], 480 | [nil, nil, nil, nil, nil, "6.6", nil], 481 | [nil, nil, nil, nil, nil, "7.7", nil] 482 | ] 483 | ] 484 | 485 | assert { TableData.new(columns, rows).values == expected } 486 | end 487 | 488 | def test_values_record_with_empty_hash 489 | columns = [ 490 | { name: "test", type: "STRING" }, 491 | { name: "record1", type: "RECORD", fields: [ 492 | { name: "child", type: "STRING" }, 493 | ] }, 494 | { name: "record2", type: "RECORD", fields: [ 495 | { name: "child", type: "STRING" } 496 | ] }, 497 | { name: "record3", type: "RECORD", mode: "REPEATED", fields: [ 498 | { name: "array", type: "INTEGER", mode: "REPEATED" } 499 | ] }, 500 | { name: "date", type: "STRING" }, 501 | { name: "timestamp", type: "TIMESTAMP" } 502 | ] 503 | 504 | 
rows = [ 505 | { f: [ 506 | { v: 'fuga' }, 507 | { v: 508 | { f: [ 509 | { v: 'hoge' }, 510 | ] } 511 | }, 512 | { v: {} }, 513 | 514 | { v: [ 515 | { v: 516 | { f: [ 517 | { v: [ 518 | { v: '1' } 519 | ] } 520 | ] } 521 | }, 522 | { v: 523 | { f: [ 524 | { v: [ 525 | { v: '4' } 526 | ] } 527 | ] } 528 | } 529 | ] }, 530 | { v: '2016-10-17' }, 531 | { v: '1.47663E9' } 532 | ] } 533 | ] 534 | 535 | expected = [ 536 | [ 537 | ["fuga", "hoge", nil, "1", "2016-10-17", "1.47663E9"], 538 | [nil, nil, nil, "4", nil, nil] 539 | ] 540 | ] 541 | 542 | assert { TableData.new(columns, rows).values == expected } 543 | end 544 | end 545 | end 546 | end 547 | -------------------------------------------------------------------------------- /test/test_bigquery_wrapper.rb: -------------------------------------------------------------------------------- 1 | require_relative 'helper.rb' 2 | require 'bigquery_migration/bigquery_wrapper' 3 | 4 | unless File.exist?(JSON_KEYFILE) 5 | puts "#{JSON_KEYFILE} is not found. Skip test/test_bigquery_wrapper.rb" 6 | else 7 | class BigqueryMigration 8 | class TestBigqueryWrapper < Test::Unit::TestCase 9 | def instance 10 | @instance ||= BigqueryWrapper.new(config) 11 | end 12 | 13 | def config 14 | { 15 | 'json_keyfile' => JSON_KEYFILE, 16 | 'dataset' => 'bigquery_migration_unittest', 17 | 'table' => 'test', 18 | } 19 | end 20 | 21 | def config_for_location 22 | self.config.merge({ 23 | 'dataset' => 'bigquery_migration_unittest_asia_northeast1', 24 | 'location' => 'asia-northeast1', 25 | }) 26 | end 27 | 28 | sub_test_case "configure" do 29 | def test_configure_json_keyfile 30 | config = { 31 | 'json_keyfile' => JSON_KEYFILE, 32 | 'dataset' => 'bigquery_migration_unittest', 33 | 'table' => 'test', 34 | } 35 | instance = BigqueryWrapper.new(config) 36 | assert_nothing_raised { instance.project } 37 | assert_nothing_raised { instance.dataset } 38 | assert_nothing_raised { instance.table } 39 | assert_nothing_raised { instance.client } 40 | end 41 | 42 | def test_configure_json_keyfile_content_json 43 | config = { 44 | 'json_keyfile' => { 45 | 'content' => File.read(JSON_KEYFILE), 46 | }, 47 | 'dataset' => 'bigquery_migration_unittest', 48 | 'table' => 'test', 49 | } 50 | instance = BigqueryWrapper.new(config) 51 | assert_nothing_raised { instance.project } 52 | assert_nothing_raised { instance.dataset } 53 | assert_nothing_raised { instance.table } 54 | assert_nothing_raised { instance.client } 55 | end 56 | 57 | def test_configure_json_keyfile_content_hash 58 | config = { 59 | 'json_keyfile' => { 60 | 'content' => JSON.parse(File.read(JSON_KEYFILE)), 61 | }, 62 | 'dataset' => 'bigquery_migration_unittest', 63 | 'table' => 'test', 64 | } 65 | instance = BigqueryWrapper.new(config) 66 | assert_nothing_raised { instance.project } 67 | assert_nothing_raised { instance.dataset } 68 | assert_nothing_raised { instance.table } 69 | assert_nothing_raised { instance.client } 70 | end 71 | end 72 | 73 | sub_test_case "create_dataset" do 74 | def test_create_dataset 75 | assert_nothing_raised { instance.create_dataset } 76 | assert_nothing_raised { instance.get_dataset } 77 | end 78 | 79 | sub_test_case "with location option" do 80 | def test_create_dataset 81 | instance = BigqueryWrapper.new(config_for_location) 82 | assert_nothing_raised { instance.create_dataset } 83 | result = instance.get_dataset 84 | assert { result[:responses][:get_dataset].location == 'asia-northeast1' } 85 | end 86 | end 87 | end 88 | 89 | sub_test_case "create_table" do 90 | def test_create_table 91 | 
instance.drop_table rescue nil 92 | columns = [ 93 | { name: 'column1', type: 'INTEGER' }, 94 | { name: 'column2', type: 'STRING' }, 95 | { name: 'column3', type: 'FLOAT' }, 96 | { name: 't', type: 'TIMESTAMP' }, 97 | { name: 'record', type: 'RECORD', fields:[ 98 | { name: 'column4', type: 'STRING' }, 99 | { name: 'column5', type: 'INTEGER' }, 100 | ]}, 101 | ] 102 | assert_nothing_raised { instance.create_table(columns: columns) } 103 | assert_nothing_raised { instance.get_table } 104 | end 105 | 106 | sub_test_case "with location option" do 107 | def test_create_table 108 | instance = BigqueryWrapper.new(config_for_location) 109 | instance.drop_table rescue nil 110 | columns = [ 111 | { name: 'column1', type: 'INTEGER' }, 112 | ] 113 | assert_nothing_raised { instance.create_table(columns: columns) } 114 | result = instance.get_table 115 | assert { result[:location] == 'asia-northeast1' } 116 | end 117 | end 118 | end 119 | 120 | def test_drop_table 121 | instance.create_table(columns: [{ name: 'column1', type: 'INTEGER' }]) 122 | assert_nothing_raised { instance.drop_table } 123 | assert_raise(NotFoundError) { instance.get_table } 124 | end 125 | 126 | def test_list_tables 127 | instance.create_table(table: 'table1', columns: [{ name: 'column1', type: 'INTEGER' }]) 128 | instance.create_table(table: 'table2', columns: [{ name: 'column1', type: 'INTEGER' }]) 129 | result = instance.list_tables 130 | assert { result[:tables] == ['table1', 'table2'] } 131 | instance.drop_table(table: 'table1') 132 | instance.drop_table(table: 'table2') 133 | end 134 | 135 | sub_test_case "purge_tables" do 136 | def before_tables 137 | %w[ 138 | test_20160301 139 | test_20160301_00 140 | test_20160229 141 | test_20160229_23 142 | test_20160229_22 143 | test_20160228 144 | test_23_20160229 145 | test_22_20160229 146 | test_00_20160301 147 | ] 148 | end 149 | 150 | def test_purge_tables_daily 151 | stub(instance).list_tables { { tables: before_tables } } 152 | result = instance.purge_tables( 153 | table_prefix: 'test_', suffix_format: '%Y%m%d', purge_before: '20160229' 154 | ) 155 | expected = %w[test_20160229 test_20160228] 156 | assert { result[:delete_tables] == expected } 157 | end 158 | 159 | def test_purge_tables_hourly_1 160 | stub(instance).list_tables { { tables: before_tables } } 161 | result = instance.purge_tables( 162 | table_prefix: 'test_', suffix_format: '%Y%m%d_%H', purge_before: '20160229_23' 163 | ) 164 | expected = %w[test_20160229_23 test_20160229_22] 165 | assert { result[:delete_tables] == expected } 166 | end 167 | 168 | def test_purge_tables_hourly_2 169 | stub(instance).list_tables { { tables: before_tables } } 170 | result = instance.purge_tables( 171 | table_prefix: 'test_', suffix_format: '%H_%Y%m%d', purge_before: '23_20160229' 172 | ) 173 | expected = %w[test_23_20160229 test_22_20160229] 174 | assert { result[:delete_tables] == expected } 175 | end 176 | end 177 | 178 | sub_test_case "table_data" do 179 | def setup 180 | instance.drop_table 181 | end 182 | 183 | def teardown 184 | instance.drop_table 185 | end 186 | 187 | # Streaming insert takes time to be reflected. Let me coment out.... 
188 | =begin 189 | def test_insert_all_and_list_table_data 190 | instance.create_table(columns: [ 191 | { 'name' => 'repeated_record', 'type' => 'RECORD', 'mode' => 'REPEATED', 'fields' => [ 192 | { 'name' => 'record', 'type' => 'RECORD', 'mode' => 'NULLABLE', 'fields' => [ 193 | { 'name' => 'child', 'type' => 'STRING', 'mode' => 'NULLABLE' }, 194 | { 'name' => 'repeated_time', 'type' => 'TIMESTAMP', 'mode' => 'REPEATED' } 195 | ] }, 196 | { 'name' => 'repeated_time', 'type' => 'TIMESTAMP', 'mode' => 'REPEATED' } 197 | ] }, 198 | { 'name' => 'repeated_string', 'type' => 'STRING', 'mode' => 'REPEATED' }, 199 | { 'name' => 'repeated_int', 'type' => 'INTEGER', 'mode' => 'REPEATED' }, 200 | { 'name' => 'repeated_record2', 'type' => 'RECORD', 'mode' => 'REPEATED', 'fields' => [ 201 | { 'name' => 'record2', 'type' => 'RECORD', 'mode' => 'NULLABLE', 'fields' => [ 202 | { 'name' => 'repeated_float', 'type' => 'FLOAT', 'mode' => 'REPEATED' }, 203 | { 'name' => 'child2', 'type' => 'STRING', 'mode' => 'REQUIRED' } 204 | ] } 205 | ] } 206 | ]) 207 | 208 | assert_nothing_raised do 209 | instance.insert_all_table_data(rows: [ 210 | { 'repeated_record' => [ 211 | { 'record' => 212 | { 'child' => 'hoge', 213 | 'repeated_time' => [ 214 | '2015-10-08 00:00:00 +09:00', 215 | '2015-10-09 00:00:00 +09:00' 216 | ] 217 | }, 218 | 'repeated_time' => [ 219 | '2015-10-10 00:00:00 +09:00', 220 | '2015-10-10 00:00:00 +09:00' 221 | ] }, 222 | { 'record' => 223 | { 'child' => 'fuga'}, 224 | 'repeated_time' => [ 225 | '2015-10-12 00:00:00 +09:00', 226 | '2015-10-13 00:00:00 +09:00' 227 | ] 228 | } 229 | ], 230 | 'repeated_string' => [ 231 | 'one', 232 | 'two', 233 | 'three' 234 | ], 235 | 'repeated_int' => [ 236 | 1, 237 | 2, 238 | ], 239 | 'repeated_record2' => [ 240 | { 'record2' => 241 | { 'child2' => 'hoge2', 242 | 'repeated_float' => [ 243 | 1.1, 244 | 2.2, 245 | 3.3 246 | ] 247 | } 248 | }, 249 | { 'record2' => 250 | { 'child2' => 'fuga2', 251 | 'repeated_float' => [ 252 | 4.4, 253 | 5.5, 254 | 6.6, 255 | 7.7 256 | ] 257 | } 258 | } 259 | ] }, 260 | ]) 261 | end 262 | 263 | result = {} 264 | assert_nothing_raised { result = instance.list_table_data } 265 | 60.times do 266 | break if result[:values] 267 | sleep 1 268 | result = instance.list_table_data 269 | end 270 | 271 | expected = { 272 | total_rows: 4, 273 | columns: [ 274 | { name: 'repeated_record.record.child', type: 'STRING', mode: 'NULLABLE' }, 275 | { name: 'repeated_record.record.repeated_time', type: 'TIMESTAMP', mode: 'REPEATED' }, 276 | { name: 'repeated_record.repeated_time', type: 'TIMESTAMP', mode: 'REPEATED' }, 277 | { name: 'repeated_string', type: 'STRING', mode: 'REPEATED' }, 278 | { name: 'repeated_int', type: 'INTEGER', mode: 'REPEATED' }, 279 | { name: 'repeated_record2.record2.repeated_float', type: 'FLOAT', mode: 'REPEATED' }, 280 | { name: 'repeated_record2.record2.child2', type: 'STRING', mode: 'REQUIRED' }, 281 | ], 282 | values: [ 283 | [ 284 | ["hoge", "1.44423E9", "1.4444028E9", "one", "1", "1.1", "hoge2"], 285 | [nil, "1.4443164E9", "1.4444028E9", "two", "2", "2.2", nil], 286 | ["fuga", nil, "1.4445756E9", "three", nil, "3.3", nil], 287 | [nil, nil, "1.444662E9", nil, nil, "4.4", "fuga2"], 288 | [nil, nil, nil, nil, nil, "5.5", nil], 289 | [nil, nil, nil, nil, nil, "6.6", nil], 290 | [nil, nil, nil, nil, nil, "7.7", nil] 291 | ] 292 | ] 293 | } 294 | assert { result[:columns] == expected[:columns] } 295 | assert { result[:values] == expected[:values] } 296 | # total_rows is not reflected by streming insert .... 
297 | # assert { result[:total_rows] == expected[:total_rows] } 298 | end 299 | =end 300 | end 301 | 302 | sub_test_case "patch_table" do 303 | def setup 304 | instance.drop_table 305 | end 306 | 307 | def teardown 308 | instance.drop_table 309 | end 310 | 311 | def test_add_columns 312 | before_columns = [ 313 | { 'name' => 'id', 'type' => 'INTEGER' }, 314 | { 'name' => 'string', 'type' => 'STRING', 'mode' => 'REQUIRED' }, 315 | { 'name' => 'record', 'type' => 'RECORD', 'fields' => [ 316 | { 'name' => 'child1', 'type' => 'STRING' }, 317 | ] }, 318 | ] 319 | instance.create_table(columns: before_columns) 320 | 321 | add_columns = [ 322 | {"name"=>"new_nullable_column", "type"=>"STRING", "mode"=>"NULLABLE"}, 323 | {"name"=>"new_repeated_column", "type"=>"STRING", "mode"=>"REPEATED"}, 324 | {"name"=>"new_record", "type"=>"RECORD", "fields"=>[ 325 | {"name"=>"new_record_child2", "type"=>"RECORD", "fields"=>[ 326 | {"name"=>"new_record_child3", "type"=>"STRING"} 327 | ]} 328 | ]} 329 | ] 330 | expected = before_columns + add_columns 331 | 332 | result = instance.patch_table(add_columns: add_columns) 333 | after_columns = result[:after_columns] 334 | 335 | assert { Schema.diff_columns(expected, after_columns) == [] } 336 | end 337 | 338 | def test_mode_change 339 | before_columns = [ 340 | {"name"=>"id", "type"=>"INTEGER"}, 341 | {"name"=>"record", "type"=>"RECORD", "fields"=> [ 342 | {"name"=>"record", "type"=>"RECORD", "fields"=> [ 343 | {"name"=>"mode_change", "type"=>"STRING", "mode"=>"REQUIRED"} 344 | ]} 345 | ]} 346 | ] 347 | instance.create_table(columns: before_columns) 348 | 349 | add_columns = [ 350 | {"name"=>"record", "type"=>"RECORD", "fields"=> [ 351 | {"name"=>"record", "type"=>"RECORD", "fields"=> [ 352 | {"name"=>"mode_change", "type"=>"STRING", "mode"=>"NULLABLE"} 353 | ]} 354 | ]} 355 | ] 356 | 357 | expected = [ 358 | {"name"=>"id", "type"=>"INTEGER"}, 359 | {"name"=>"record", "type"=>"RECORD", "fields"=> [ 360 | {"name"=>"record", "type"=>"RECORD", "fields"=> [ 361 | {"name"=>"mode_change", "type"=>"STRING", "mode"=>"NULLABLE"} 362 | ]} 363 | ]} 364 | ] 365 | 366 | result = instance.patch_table(add_columns: add_columns) 367 | after_columns = result[:after_columns] 368 | 369 | assert { Schema.diff_columns(expected, after_columns) == [] } 370 | end 371 | end 372 | 373 | sub_test_case "insert_select" do 374 | def setup 375 | instance.drop_table 376 | end 377 | 378 | def teardown 379 | instance.drop_table 380 | end 381 | 382 | def test_insert_select 383 | columns = [{ 'name' => 'id', 'type' => 'INTEGER' }] 384 | instance.create_table(columns: columns) 385 | 386 | query = "SELECT id FROM [#{config['dataset']}.#{config['table']}]" 387 | assert_nothing_raised do 388 | instance.insert_select(destination_table: 'insert_table', query: query) 389 | end 390 | assert_nothing_raised { instance.get_table(table: 'insert_table') } 391 | ensure 392 | instance.drop_table(table: 'insert_table') 393 | end 394 | 395 | sub_test_case "with location option" do 396 | def test_insert_select 397 | columns = [{ 'name' => 'id', 'type' => 'INTEGER' }] 398 | 399 | instance = BigqueryWrapper.new(config_for_location) 400 | instance.drop_table rescue nil 401 | instance.create_table(columns: columns) 402 | 403 | query = "SELECT id FROM [#{config_for_location['dataset']}.#{config_for_location['table']}]" 404 | assert_nothing_raised do 405 | instance.insert_select(destination_table: 'insert_table', query: query) 406 | end 407 | result = instance.get_table(table: 'insert_table') 408 | assert { result[:location] 
== 'asia-northeast1' } 409 | ensure 410 | instance.drop_table(table: 'insert_table') 411 | end 412 | end 413 | end 414 | 415 | sub_test_case "drop_column" do 416 | def setup 417 | instance.drop_table 418 | end 419 | 420 | def teardown 421 | instance.drop_table 422 | end 423 | 424 | def test_drop_column_with_drop_columns 425 | before_columns = [ 426 | { name: 'drop_column', type: 'INTEGER' }, 427 | { name: 'remained_column', type: 'STRING' }, 428 | { name: 'record', type: 'RECORD', fields:[ 429 | { name: 'drop_column', type: 'STRING' }, 430 | { name: 'remained_column', type: 'STRING' }, 431 | ] } 432 | ] 433 | instance.create_table(columns: before_columns) 434 | 435 | drop_columns = [ 436 | { name: 'drop_column', type: 'STRING' }, 437 | { name: 'record', type: 'RECORD', fields:[ 438 | { name: 'drop_column', type: 'STRING' }, 439 | ] }, 440 | ] 441 | expected = [ 442 | { name: 'remained_column', type: 'STRING' }, 443 | { name: 'record', type: 'RECORD', fields:[ 444 | { name: 'remained_column', type: 'STRING' }, 445 | ] } 446 | ] 447 | 448 | result = instance.drop_column(drop_columns: drop_columns) 449 | after_columns = result[:after_columns] 450 | 451 | assert { Schema.diff_columns(expected, after_columns) == [] } 452 | end 453 | 454 | def test_drop_column_with_columns 455 | before_columns = [ 456 | { name: 'drop_column', type: 'INTEGER' }, 457 | { name: 'remained_column', type: 'STRING' }, 458 | { name: 'record', type: 'RECORD', fields:[ 459 | { name: 'drop_column', type: 'STRING' }, 460 | { name: 'remained_column', type: 'STRING' }, 461 | ] } 462 | ] 463 | instance.create_table(columns: before_columns) 464 | 465 | columns = [ 466 | { name: 'remained_column', type: 'STRING' }, 467 | { name: 'record', type: 'RECORD', fields:[ 468 | { name: 'remained_column', type: 'STRING' }, 469 | { name: 'add_column', type: 'STRING' }, 470 | ] }, 471 | { name: 'add_column', type: 'STRING' }, 472 | ] 473 | expected = columns.dup 474 | 475 | result = instance.drop_column(columns: columns) 476 | after_columns = result[:after_columns] 477 | 478 | assert { Schema.diff_columns(expected, after_columns) == [] } 479 | end 480 | 481 | end 482 | 483 | sub_test_case "migrate_table" do 484 | def setup 485 | instance.drop_table 486 | end 487 | 488 | def teardown 489 | instance.drop_table 490 | end 491 | 492 | def test_add_columns 493 | before_columns = [ 494 | { name: 'remained_column', type: 'INTEGER' }, 495 | { name: 'record', type: 'RECORD', fields: [ 496 | { name: 'record', type: 'RECORD', fields: [ 497 | { name: 'remained_column', type: 'STRING' }, 498 | ] } 499 | ] } 500 | ] 501 | instance.create_table(columns: before_columns) 502 | 503 | columns = [ 504 | { name: 'remained_column', type: 'INTEGER' }, 505 | { name: 'record', type: 'RECORD', fields: [ 506 | { name: 'record', type: 'RECORD', fields: [ 507 | { name: 'remained_column', type: 'STRING' }, 508 | { name: 'new_column', type: 'INTEGER' }, 509 | { name: 'new_record', type: 'RECORD', fields: [ 510 | { name: 'new_column', type: 'INTEGER' }, 511 | ] } 512 | ] } 513 | ] }, 514 | { name: 'new_column', type: 'INTEGER' }, 515 | ] 516 | expected = columns.dup 517 | 518 | result = instance.migrate_table(columns: columns) 519 | after_columns = result[:after_columns] 520 | 521 | assert { Schema.diff_columns(expected, after_columns) == [] } 522 | end 523 | 524 | def test_drop_columns 525 | before_columns = [ 526 | { name: 'drop_column', type: 'INTEGER' }, 527 | { name: 'remained_column', type: 'INTEGER' }, 528 | { name: 'record', type: 'RECORD', fields: [ 529 | { name: 
'record', type: 'RECORD', fields: [ 530 | { name: 'drop_column', type: 'STRING' }, 531 | { name: 'remained_column', type: 'STRING' }, 532 | ] } 533 | ] } 534 | ] 535 | instance.create_table(columns: before_columns) 536 | 537 | columns = [ 538 | { name: 'remained_column', type: 'INTEGER' }, 539 | { name: 'record', type: 'RECORD', fields: [ 540 | { name: 'record', type: 'RECORD', fields: [ 541 | { name: 'remained_column', type: 'STRING' }, 542 | ] } 543 | ] } 544 | ] 545 | expected = columns.dup 546 | 547 | result = instance.migrate_table(columns: columns) 548 | after_columns = result[:after_columns] 549 | 550 | assert { Schema.diff_columns(expected, after_columns) == [] } 551 | end 552 | 553 | def test_add_drop 554 | before_columns = [ 555 | { name: 'remained_column', type: 'INTEGER' }, 556 | { name: 'record', type: 'RECORD', fields: [ 557 | { name: 'record', type: 'RECORD', fields: [ 558 | { name: 'remained_column', type: 'STRING' }, 559 | { name: 'drop_column', type: 'STRING' }, 560 | ] } 561 | ] }, 562 | { name: 'drop_column', type: 'INTEGER' }, 563 | ] 564 | instance.create_table(columns: before_columns) 565 | 566 | columns = [ 567 | { name: 'remained_column', type: 'INTEGER' }, 568 | { name: 'record', type: 'RECORD', fields: [ 569 | { name: 'record', type: 'RECORD', fields: [ 570 | { name: 'remained_column', type: 'STRING' }, 571 | { name: 'add_column', type: 'INTEGER' }, 572 | ] }, 573 | ] }, 574 | { name: 'add_column', type: 'STRING', mode: 'REPEATED' }, 575 | { name: 'add_record', type: 'RECORD', fields: [ 576 | { name: 'add_record', type: 'RECORD', fields: [ 577 | { name: 'add_column1', type: 'STRING' }, 578 | { name: 'add_column2', type: 'INTEGER' }, 579 | ] } 580 | ]} 581 | ] 582 | expected = columns.dup 583 | 584 | result = instance.migrate_table(columns: columns) 585 | after_columns = result[:after_columns] 586 | 587 | assert { Schema.diff_columns(expected, after_columns) == [] } 588 | end 589 | 590 | def test_type_change 591 | before_columns = [ 592 | { name: 'type_change', type: 'STRING' }, 593 | { name: 'remained_column', type: 'INTEGER' }, 594 | { name: 'record', type: 'RECORD', fields: [ 595 | { name: 'record', type: 'RECORD', fields: [ 596 | { name: 'type_change', type: 'STRING' }, 597 | { name: 'remained_column', type: 'STRING' }, 598 | ] } 599 | ] } 600 | ] 601 | instance.create_table(columns: before_columns) 602 | 603 | columns = [ 604 | { name: 'type_change', type: 'INTEGER' }, 605 | { name: 'remained_column', type: 'INTEGER' }, 606 | { name: 'record', type: 'RECORD', fields: [ 607 | { name: 'record', type: 'RECORD', fields: [ 608 | { name: 'type_change', type: 'INTEGER' }, 609 | { name: 'remained_column', type: 'STRING' }, 610 | ] } 611 | ]} 612 | ] 613 | expected = columns.dup 614 | 615 | result = instance.migrate_table(columns: columns) 616 | after_columns = result[:after_columns] 617 | 618 | assert { Schema.diff_columns(expected, after_columns) == [] } 619 | end 620 | 621 | def test_mode_change 622 | before_columns = [ 623 | { name: 'mode_change', type: 'STRING', mode: 'REQUIRED' }, 624 | { name: 'remained_column', type: 'INTEGER' }, 625 | { name: 'record', type: 'RECORD', fields: [ 626 | { name: 'record', type: 'RECORD', fields: [ 627 | { name: 'mode_change', type: 'STRING', mode: 'REQUIRED' }, 628 | { name: 'remained_column', type: 'STRING' }, 629 | ] } 630 | ] } 631 | ] 632 | instance.create_table(columns: before_columns) 633 | 634 | columns = [ 635 | { name: 'mode_change', type: 'STRING', mode: 'NULLABLE' }, 636 | { name: 'remained_column', type: 'INTEGER' }, 
637 | { name: 'record', type: 'RECORD', fields: [ 638 | { name: 'record', type: 'RECORD', fields: [ 639 | { name: 'mode_change', type: 'STRING', mode: 'NULLABLE' }, 640 | { name: 'remained_column', type: 'STRING' }, 641 | ] } 642 | ] } 643 | ] 644 | expected = columns.dup 645 | 646 | result = instance.migrate_table(columns: columns) 647 | after_columns = result[:after_columns] 648 | 649 | assert { Schema.diff_columns(expected, after_columns) == [] } 650 | end 651 | end 652 | 653 | sub_test_case "migrate_partitioned_table" do 654 | def setup 655 | instance.drop_table 656 | end 657 | 658 | def teardown 659 | instance.drop_table 660 | end 661 | 662 | def test_create_partitioned_table 663 | columns = [ 664 | { name: 'remained_column', type: 'INTEGER' }, 665 | { name: 'record', type: 'RECORD', fields: [ 666 | { name: 'record', type: 'RECORD', fields: [ 667 | { name: 'remained_column', type: 'STRING' }, 668 | ] } 669 | ] } 670 | ] 671 | expected = columns.dup 672 | 673 | result = instance.migrate_partitioned_table(columns: columns) 674 | after_columns = result[:after_columns] 675 | 676 | assert { result[:responses][:insert_table].time_partitioning.type == 'DAY' } 677 | assert { Schema.diff_columns(expected, after_columns) == [] } 678 | assert { Schema.diff_columns(after_columns, expected) == [] } 679 | end 680 | 681 | def test_add_columns 682 | before_columns = [ 683 | { name: 'remained_column', type: 'INTEGER' }, 684 | { name: 'record', type: 'RECORD', fields: [ 685 | { name: 'record', type: 'RECORD', fields: [ 686 | { name: 'remained_column', type: 'STRING' }, 687 | ] } 688 | ] } 689 | ] 690 | instance.create_partitioned_table(columns: before_columns) 691 | 692 | columns = [ 693 | { name: 'remained_column', type: 'INTEGER' }, 694 | { name: 'record', type: 'RECORD', fields: [ 695 | { name: 'record', type: 'RECORD', fields: [ 696 | { name: 'remained_column', type: 'STRING' }, 697 | { name: 'new_column', type: 'INTEGER' }, 698 | { name: 'new_record', type: 'RECORD', fields: [ 699 | { name: 'new_column', type: 'INTEGER' }, 700 | ] } 701 | ] } 702 | ] }, 703 | { name: 'new_column', type: 'INTEGER' }, 704 | ] 705 | expected = columns.dup 706 | 707 | result = instance.migrate_partitioned_table(columns: columns) 708 | after_columns = result[:after_columns] 709 | 710 | assert { Schema.diff_columns(expected, after_columns) == [] } 711 | assert { Schema.diff_columns(after_columns, expected) == [] } 712 | end 713 | 714 | def test_add_drop 715 | before_columns = [ 716 | { name: 'remained_column', type: 'INTEGER' }, 717 | { name: 'record', type: 'RECORD', fields: [ 718 | { name: 'record', type: 'RECORD', fields: [ 719 | { name: 'remained_column', type: 'STRING' }, 720 | { name: 'drop_column', type: 'STRING' }, 721 | ] } 722 | ] }, 723 | { name: 'drop_column', type: 'INTEGER' }, 724 | ] 725 | instance.create_partitioned_table(columns: before_columns) 726 | 727 | columns = [ 728 | { name: 'remained_column', type: 'INTEGER' }, 729 | { name: 'record', type: 'RECORD', fields: [ 730 | { name: 'record', type: 'RECORD', fields: [ 731 | { name: 'remained_column', type: 'STRING' }, 732 | { name: 'add_column', type: 'INTEGER' }, 733 | ] }, 734 | ] }, 735 | { name: 'add_column', type: 'STRING', mode: 'REPEATED' }, 736 | { name: 'add_record', type: 'RECORD', fields: [ 737 | { name: 'add_column1', type: 'STRING' }, 738 | ]} 739 | ] 740 | 741 | expected = [ 742 | { name: 'remained_column', type: 'INTEGER' }, 743 | { name: 'record', type: 'RECORD', fields: [ 744 | { name: 'record', type: 'RECORD', fields: [ 745 | { name: 
'remained_column', type: 'STRING' }, 746 | { name: 'drop_column', type: 'STRING', mode: 'NULLABLE'}, 747 | { name: 'add_column', type: 'INTEGER' }, 748 | ] }, 749 | ] }, 750 | { name: 'drop_column', type: 'INTEGER', mode: 'NULLABLE' }, 751 | { name: 'add_column', type: 'STRING', mode: 'REPEATED' }, 752 | { name: 'add_record', type: 'RECORD', fields: [ 753 | { name: 'add_column1', type: 'STRING' }, 754 | ]} 755 | ] 756 | 757 | result = instance.migrate_partitioned_table(columns: columns) 758 | after_columns = result[:after_columns] 759 | 760 | assert { Schema.diff_columns(expected, after_columns) == [] } 761 | assert { Schema.diff_columns(after_columns, expected) == [] } 762 | end 763 | 764 | def test_type_change_raised 765 | before_columns = [ 766 | { name: 'type_change', type: 'STRING' }, 767 | ] 768 | instance.create_partitioned_table(columns: before_columns) 769 | 770 | columns = [ 771 | { name: 'type_change', type: 'INTEGER' }, 772 | ] 773 | 774 | assert_raise { instance.migrate_partitioned_table(columns: columns) } 775 | end 776 | 777 | sub_test_case "with clustering option" do 778 | def test_create_partitioned_table 779 | instance = BigqueryWrapper.new(config.merge({ 780 | clustering: { 781 | fields: ['remained_column_a', 'remained_column_b'], 782 | }, 783 | })) 784 | 785 | columns = [ 786 | { name: 'remained_column_a', type: 'STRING' }, 787 | { name: 'remained_column_b', type: 'INTEGER' }, 788 | { name: 'remained_column_c', type: 'INTEGER' }, 789 | { name: 'record', type: 'RECORD', fields: [ 790 | { name: 'record', type: 'RECORD', fields: [ 791 | { name: 'remained_column', type: 'STRING' }, 792 | ] } 793 | ] } 794 | ] 795 | expected = columns.dup 796 | 797 | result = instance.migrate_partitioned_table(columns: columns) 798 | after_columns = result[:after_columns] 799 | 800 | assert { result[:responses][:insert_table].time_partitioning.type == 'DAY' } 801 | assert { result[:responses][:insert_table].clustering.fields == ['remained_column_a', 'remained_column_b'] } 802 | assert { Schema.diff_columns(expected, after_columns) == [] } 803 | assert { Schema.diff_columns(after_columns, expected) == [] } 804 | ensure 805 | instance.drop_table 806 | end 807 | end 808 | end 809 | end 810 | end 811 | end 812 | -------------------------------------------------------------------------------- /lib/bigquery_migration/bigquery_wrapper.rb: -------------------------------------------------------------------------------- 1 | require 'csv' 2 | require 'json' 3 | require_relative 'schema' 4 | require_relative 'table_data' 5 | require_relative 'error' 6 | require_relative 'time_with_zone' 7 | require_relative 'hash_util' 8 | require 'google/apis/bigquery_v2' 9 | require 'google/api_client/auth/key_utils' 10 | require 'securerandom' 11 | require 'inifile' 12 | 13 | class BigqueryMigration 14 | class BigqueryWrapper 15 | attr_reader :config 16 | 17 | def logger 18 | BigqueryMigration.logger 19 | end 20 | 21 | def initialize(config, opts = {}) 22 | @config = HashUtil.deep_symbolize_keys(config) 23 | @opts = HashUtil.deep_symbolize_keys(opts) 24 | end 25 | 26 | def client 27 | return @cached_client if @cached_client && @cached_client_expiration > Time.now 28 | 29 | client = Google::Apis::BigqueryV2::BigqueryService.new 30 | client.request_options.retries = retries 31 | client.client_options.open_timeout_sec = open_timeout_sec 32 | if client.request_options.respond_to?(:timeout_sec) 33 | client.request_options.timeout_sec = timeout_sec 34 | else # google-api-ruby-client >= v0.11.0 35 | if timeout_sec 36 | 
logger.warn { "timeout_sec is deprecated in google-api-ruby-client >= v0.11.0. Use read_timeout_sec instead" } 37 | end 38 | client.client_options.send_timeout_sec = send_timeout_sec 39 | client.client_options.read_timeout_sec = read_timeout_sec 40 | end 41 | logger.debug { "client_options: #{client.client_options.to_h}" } 42 | logger.debug { "request_options: #{client.request_options.to_h}" } 43 | 44 | scope = "https://www.googleapis.com/auth/bigquery" 45 | 46 | case auth_method 47 | when 'authorized_user' 48 | auth = Signet::OAuth2::Client.new( 49 | token_credential_uri: "https://accounts.google.com/o/oauth2/token", 50 | audience: "https://accounts.google.com/o/oauth2/token", 51 | scope: scope, 52 | client_id: credentials[:client_id], 53 | client_secret: credentials[:client_secret], 54 | refresh_token: credentials[:refresh_token] 55 | ) 56 | auth.refresh! 57 | when 'compute_engine' 58 | auth = Google::Auth::GCECredentials.new 59 | when 'service_account' 60 | key = StringIO.new(credentials.to_json) 61 | auth = Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: key, scope: scope) 62 | when 'application_default' 63 | auth = Google::Auth.get_application_default([scope]) 64 | else 65 | raise ConfigError, "Unknown auth method: #{auth_method}" 66 | end 67 | 68 | client.authorization = auth 69 | 70 | @cached_client_expiration = Time.now + 1800 71 | @cached_client = client 72 | end 73 | 74 | def existing_columns 75 | begin 76 | result = get_table 77 | response = result[:responses][:get_table] 78 | return [] unless response 79 | return [] unless response.schema 80 | return [] unless response.schema.fields 81 | response.schema.fields.map {|column| column.to_h } 82 | rescue NotFoundError 83 | return [] 84 | end 85 | end 86 | 87 | def get_dataset(dataset: nil) 88 | dataset ||= self.dataset 89 | begin 90 | logger.info { "Get dataset... #{project}:#{dataset}" } 91 | response = client.get_dataset(project, dataset) 92 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 93 | if e.status_code == 404 94 | raise NotFoundError, "Dataset #{project}:#{dataset} is not found" 95 | end 96 | 97 | response = {status_code: e.status_code, message: e.message, error_class: e.class} 98 | raise Error, "Failed to get_dataset(#{project}, #{dataset}), response:#{response}" 99 | end 100 | 101 | { responses: { get_dataset: response } } 102 | end 103 | 104 | def insert_dataset(dataset: nil, reference: nil) 105 | dataset ||= self.dataset 106 | begin 107 | logger.info { "#{head}Insert (create) dataset... #{project}:#{dataset}" } 108 | hint = {} 109 | if reference 110 | response = get_dataset(reference) 111 | hint = { access: response.access } 112 | end 113 | body = { 114 | dataset_reference: { 115 | project_id: project, 116 | dataset_id: dataset, 117 | }, 118 | }.merge(hint) 119 | body[:location] = location if location 120 | opts = {} 121 | 122 | logger.debug { "#{head}insert_dataset(#{project}, #{body}, #{opts})" } 123 | unless dry_run? 
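# Skipped on dry-run (response stays nil); a 409 "Already Exists" from the call is
# rescued below and turned into an empty result, so create_dataset is safe to re-run.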
124 | response = client.insert_dataset(project, body, opts) 125 | end 126 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 127 | if e.status_code == 409 && /Already Exists:/ =~ e.message 128 | # ignore 'Already Exists' error 129 | return {} 130 | end 131 | 132 | response = {status_code: e.status_code, message: e.message, error_class: e.class} 133 | raise Error, "Failed to insert_dataset(#{project}, #{body}, #{opts}), response:#{response}" 134 | end 135 | 136 | { responses: { insert_dataset: response } } 137 | end 138 | alias :create_dataset :insert_dataset 139 | 140 | def get_table(dataset: nil, table: nil) 141 | dataset ||= self.dataset 142 | table ||= self.table 143 | begin 144 | logger.debug { "Get table... #{project}:#{dataset}.#{table}" } 145 | response = client.get_table(project, dataset, table) 146 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 147 | if e.status_code == 404 # not found 148 | raise NotFoundError, "Table #{project}:#{dataset}.#{table} is not found" 149 | end 150 | 151 | response = {status_code: e.status_code, message: e.message, error_class: e.class} 152 | raise Error, "Failed to get_table(#{project}, #{dataset}, #{table}), response:#{response}" 153 | end 154 | 155 | result = {} 156 | if response 157 | result = { 158 | table_id: response.id, 159 | creation_time: response.creation_time.to_i, # millisec 160 | last_modified_time: response.last_modified_time.to_i, # millisec 161 | location: response.location, 162 | num_bytes: response.num_bytes.to_i, 163 | num_rows: response.num_rows.to_i, 164 | } 165 | end 166 | 167 | result.merge!({ responses: { get_table: response } }) 168 | end 169 | 170 | def insert_table(dataset: nil, table: nil, columns:, options: {}) 171 | dataset ||= self.dataset 172 | table ||= self.table 173 | raise Error, "columns is empty" if columns.empty? 174 | schema = Schema.new(columns) 175 | 176 | begin 177 | logger.info { "#{head}Insert (create) table... #{project}:#{dataset}.#{table}" } 178 | body = { 179 | table_reference: { 180 | table_id: table, 181 | }, 182 | schema: { 183 | fields: schema, 184 | } 185 | } 186 | 187 | if options['time_partitioning'] 188 | body[:time_partitioning] = { 189 | type: options['time_partitioning']['type'], 190 | expiration_ms: options['time_partitioning']['expiration_ms'], 191 | } 192 | end 193 | 194 | if clustering && clustering[:fields] 195 | body[:clustering] = { 196 | fields: clustering[:fields] 197 | } 198 | end 199 | 200 | opts = {} 201 | logger.debug { "#{head}insert_table(#{project}, #{dataset}, #{body}, #{opts})" } 202 | unless dry_run? 
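# body carries the schema plus the optional time_partitioning / clustering settings
# assembled above; as with insert_dataset, an existing table only triggers the
# ignored 409 "Already Exists" branch below.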
203 | response = client.insert_table(project, dataset, body, opts) 204 | end 205 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 206 | if e.status_code == 409 && /Already Exists:/ =~ e.message 207 | # ignore 'Already Exists' error 208 | return {} 209 | end 210 | 211 | response = {status_code: e.status_code, message: e.message, error_class: e.class} 212 | raise Error, "Failed to insert_table(#{project}, #{dataset}, #{body}, #{opts}), response:#{response}" 213 | end 214 | 215 | { responses: { insert_table: response } } 216 | end 217 | alias :create_table :insert_table 218 | 219 | def insert_partitioned_table(dataset: nil, table: nil, columns:, options: {}) 220 | options['time_partitioning'] = {'type'=>'DAY'} 221 | insert_table(dataset: dataset, table: table, columns: columns, options: options) 222 | end 223 | alias :create_partitioned_table :insert_partitioned_table 224 | 225 | def delete_table(dataset: nil, table: nil) 226 | dataset ||= self.dataset 227 | table ||= self.table 228 | 229 | begin 230 | logger.info { "#{head}Delete (drop) table... #{project}:#{dataset}.#{table}" } 231 | unless dry_run? 232 | client.delete_table(project, dataset, table) # no response 233 | success = true 234 | end 235 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 236 | if e.status_code == 404 && /Not found:/ =~ e.message 237 | # ignore 'Not Found' error 238 | return {} 239 | end 240 | 241 | response = {status_code: e.status_code, message: e.message, error_class: e.class} 242 | raise Error, "Failed to delete_table(#{project}, #{dataset}, #{table}), response:#{response}" 243 | end 244 | 245 | { success: success } 246 | end 247 | alias :drop_table :delete_table 248 | 249 | def list_tables(dataset: nil, max_results: 999999) 250 | dataset ||= self.dataset 251 | 252 | tables = [] 253 | begin 254 | logger.info { "List tables... 
#{project}:#{dataset}" } 255 | response = client.list_tables(project, dataset, max_results: max_results) 256 | while true 257 | _tables = (response.tables || []).map { |t| t.table_reference.table_id.to_s } 258 | tables.concat(_tables) 259 | if next_page_token = response.next_page_token 260 | response = client.list_tables(project, dataset, page_token: next_page_token, max_results: max_results) 261 | else 262 | break 263 | end 264 | end 265 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 266 | if e.status_code == 404 && /Not found:/ =~ e.message 267 | raise NotFoundError, "Dataset #{project}:#{dataset} is not found" 268 | end 269 | 270 | response = {status_code: e.status_code, message: e.message, error_class: e.class} 271 | logger.error { "list_tables(#{project}, #{dataset}), response:#{response}" } 272 | raise Error, "failed to list tables #{project}:#{dataset}, response:#{response}" 273 | end 274 | 275 | { tables: tables } 276 | end 277 | 278 | def purge_tables(dataset: nil, table_prefix: , suffix_format: , purge_before: , timezone: nil) 279 | dataset ||= self.dataset 280 | timezone ||= Time.now.strftime('%z') 281 | 282 | before_tables = list_tables[:tables] 283 | 284 | purge_before_t = TimeWithZone.strptime_with_zone(purge_before, suffix_format, timezone) 285 | tables = before_tables.select do |tbl| 286 | suffix = tbl.gsub(table_prefix, '') 287 | begin 288 | suffix_t = TimeWithZone.strptime_with_zone(suffix, suffix_format, timezone) 289 | rescue 290 | next 291 | end 292 | # skip if different from the suffix_format 293 | next if suffix_t.strftime(suffix_format) != suffix 294 | suffix_t <= purge_before_t 295 | end 296 | 297 | tables.each do |_table| 298 | delete_table(table: _table) 299 | # If you make more than 100 requests per second, throttling might occur. 300 | # See https://cloud.google.com/bigquery/quota-policy#apirequests 301 | sleep 1 302 | end 303 | 304 | { delete_tables: tables } 305 | end 306 | 307 | # rows: 308 | # - id: 1 309 | # type: one 310 | # record: 311 | # child1: 'child1' 312 | # child2: 'child2' 313 | # - id: 2 314 | # type: two 315 | # record: 316 | # child1: 'child3' 317 | # child2: 'child4' 318 | def insert_all_table_data(dataset: nil, table: nil, rows: ) 319 | dataset ||= self.dataset 320 | table ||= self.table 321 | 322 | begin 323 | logger.info { "#{head}insertAll tableData... #{project}:#{dataset}.#{table}" } 324 | body = { 325 | rows: rows.map {|row| { json: row } }, 326 | } 327 | opts = {} 328 | unless dry_run? 
329 |           response = client.insert_all_table_data(project, dataset, table, body, opts)
330 |         end
331 |       rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
332 |         if e.status_code == 404 # not found
333 |           raise NotFoundError, "Table #{project}:#{dataset}.#{table} is not found"
334 |         end
335 |
336 |         response = {status_code: e.status_code, message: e.message, error_class: e.class}
337 |         logger.error {
338 |           "insert_all_table_data(#{project}, #{dataset}, #{table}, #{opts}), response:#{response}"
339 |         }
340 |         raise Error, "failed to insert_all table_data #{project}:#{dataset}.#{table}, response:#{response}"
341 |       end
342 |
343 |       { responses: { insert_all_table_data: response } }
344 |     end
345 |
346 |     # @return Hash result of list table_data
347 |     #
348 |     # Example:
349 |     # {
350 |     #   columns:
351 |     #     [
352 |     #       {
353 |     #         name: id,
354 |     #         type: INTEGER
355 |     #       },
356 |     #       {
357 |     #         name: type,
358 |     #         type: STRING
359 |     #       },
360 |     #       {
361 |     #         name: record.child1,
362 |     #         type: STRING
363 |     #       },
364 |     #       {
365 |     #         name: record.child2,
366 |     #         type: STRING
367 |     #       }],
368 |     #   values:
369 |     #     [
370 |     #       [2,"two","child3","child4"],
371 |     #       [1,"one","child1","child2"]
372 |     #     ],
373 |     #   total_rows: 2
374 |     # }
375 |     def list_table_data(dataset: nil, table: nil, max_results: 100)
376 |       dataset ||= self.dataset
377 |       table ||= self.table
378 |
379 |       begin
380 |         logger.info { "list_table_data(#{project}, #{dataset}, #{table}, max_results: #{max_results})" }
381 |         response = client.list_table_data(project, dataset, table, max_results: max_results)
382 |       rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
383 |         if e.status_code == 404 # not found
384 |           raise NotFoundError, "Table #{project}:#{dataset}.#{table} is not found"
385 |         end
386 |
387 |         response = {status_code: e.status_code, message: e.message, error_class: e.class}
388 |         logger.error { "list_table_data(#{project}, #{dataset}, #{table}, max_results: #{max_results}), response:#{response}" }
389 |         raise Error, "Failed to list table_data #{project}:#{dataset}.#{table}, response:#{response}"
390 |       end
391 |
392 |       columns = existing_columns
393 |       flattened_columns = Schema.new(columns).flattened_columns.map do |name, column|
394 |         {name: name}.merge!(column)
395 |       end
396 |       if rows = response.to_h[:rows]
397 |         values = TableData.new(columns, rows).values
398 |       end
399 |
400 |       {
401 |         total_rows: response.total_rows,
402 |         columns: flattened_columns,
403 |         values: values,
404 |         responses: {
405 |           list_table_data: response,
406 |         }
407 |       }
408 |     end
409 |
410 |     def patch_table(dataset: nil, table: nil, columns: nil, add_columns: nil)
411 |       dataset ||= self.dataset
412 |       table ||= self.table
413 |
414 |       if columns.nil? and add_columns.nil?
415 |         raise ArgumentError, 'patch_table: `columns` or `add_columns` is required'
416 |       end
417 |
418 |       before_columns = existing_columns
419 |       if columns # if already given
420 |         schema = Schema.new(columns)
421 |       else
422 |         schema = Schema.new(add_columns)
423 |         schema.reverse_merge!(before_columns)
424 |       end
425 |       schema.validate_permitted_operations!(before_columns)
426 |
427 |       begin
428 |         logger.info { "#{head}Patch table... 
#{project}:#{dataset}.#{table}" } 429 | fields = schema.map {|column| HashUtil.deep_symbolize_keys(column) } 430 | body = { 431 | schema: { 432 | fields: fields, 433 | } 434 | } 435 | opts = {} 436 | logger.debug { "#{head}patch_table(#{project}, #{dataset}, #{table}, #{body}, options: #{opts})" } 437 | unless dry_run? 438 | response = client.patch_table(project, dataset, table, body, options: opts) 439 | end 440 | rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e 441 | if e.status_code == 404 # not found 442 | raise NotFoundError, "Table #{project}:#{dataset}.#{table} is not found" 443 | end 444 | 445 | response = {status_code: e.status_code, message: e.message, error_class: e.class} 446 | logger.error { 447 | "patch_table(#{project}, #{dataset}, #{table}, #{body}, options: #{opts}), response:#{response}" 448 | } 449 | raise Error, "Failed to patch table #{project}:#{dataset}.#{table}, response:#{response}" 450 | end 451 | 452 | after_columns = existing_columns 453 | 454 | { 455 | before_columns: before_columns, 456 | after_columns: after_columns, 457 | responses: { patch_table: response }, 458 | } 459 | end 460 | alias :add_column :patch_table 461 | 462 | def copy_table(destination_table:, destination_dataset: nil, source_table: nil, source_dataset: nil, write_disposition: nil) 463 | source_table ||= self.table 464 | source_dataset ||= self.dataset 465 | destination_dataset ||= source_dataset 466 | write_disposition ||= 'WRITE_TRUNCATE' 467 | 468 | body = { 469 | job_reference: { 470 | project_id: self.project, 471 | job_id: "job_#{SecureRandom.uuid}", 472 | }, 473 | configuration: { 474 | copy: { 475 | create_deposition: 'CREATE_IF_NEEDED', 476 | write_disposition: write_disposition, 477 | source_table: { 478 | project_id: project, 479 | dataset_id: source_dataset, 480 | table_id: source_table, 481 | }, 482 | destination_table: { 483 | project_id: project, 484 | dataset_id: destination_dataset, 485 | table_id: destination_table, 486 | }, 487 | } 488 | } 489 | } 490 | body[:job_reference][:location] = location if location 491 | opts = {} 492 | 493 | logger.info { "#{head}insert_job(#{project}, #{body}, #{opts})" } 494 | unless dry_run? 495 | response = client.insert_job(project, body, opts) 496 | get_response = wait_load('copy', response) 497 | end 498 | 499 | { 500 | responses: { 501 | insert_job: response, 502 | last_get_job: get_response, 503 | } 504 | } 505 | end 506 | 507 | def insert_select(query:, destination_table: nil, destination_dataset: nil, write_disposition: nil) 508 | destination_table ||= self.table 509 | destination_dataset ||= self.dataset 510 | write_disposition ||= 'WRITE_TRUNCATE' 511 | 512 | body = { 513 | job_reference: { 514 | project_id: self.project, 515 | job_id: "job_#{SecureRandom.uuid}", 516 | }, 517 | configuration: { 518 | query: { 519 | allow_large_results: true, 520 | flatten_results: false, 521 | write_disposition: write_disposition, 522 | query: query, 523 | destination_table: { 524 | project_id: self.project, 525 | dataset_id: destination_dataset, 526 | table_id: destination_table, 527 | }, 528 | } 529 | } 530 | } 531 | body[:job_reference][:location] = location if location 532 | opts = {} 533 | 534 | logger.info { "#{head}insert_job(#{project}, #{body}, #{opts})" } 535 | unless dry_run? 
536 |         response = client.insert_job(project, body, opts)
537 |         get_response = wait_load('query', response)
538 |       end
539 |
540 |       {
541 |         responses: {
542 |           insert_job: response,
543 |           last_get_job: get_response,
544 |         }
545 |       }
546 |     end
547 |
548 |     def wait_load(kind, response)
549 |       started = Time.now
550 |
551 |       wait_interval = self.job_status_polling_interval
552 |       max_polling_time = self.job_status_max_polling_time
553 |       _response = response
554 |
555 |       while true
556 |         job_id = _response.job_reference.job_id
557 |         elapsed = Time.now - started
558 |         status = _response.status.state
559 |         if status == "DONE"
560 |           logger.info {
561 |             "#{kind} job completed... " \
562 |             "job_id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[#{status}]"
563 |           }
564 |           break
565 |         elsif elapsed.to_i > max_polling_time
566 |           message = "#{kind} job checking... " \
567 |             "job_id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[TIMEOUT]"
568 |           logger.info { message }
569 |           raise JobTimeoutError.new(message)
570 |         else
571 |           logger.info {
572 |             "#{kind} job checking... " \
573 |             "job_id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[#{status}]"
574 |           }
575 |           sleep wait_interval
576 |           if support_location_keyword?
577 |             _response = client.get_job(project, job_id, location: location)
578 |           else
579 |             _response = client.get_job(project, job_id)
580 |           end
581 |         end
582 |       end
583 |
584 |       # cf. http://www.rubydoc.info/github/google/google-api-ruby-client/Google/Apis/BigqueryV2/JobStatus#errors-instance_method
585 |       # `errors` returns an Array if any error exists.
586 |       # Otherwise, it returns nil.
587 |       if _errors = _response.status.errors
588 |         raise Error, "Failed while waiting for a job, get_job(#{project}, #{job_id}), errors:#{_errors.map(&:to_h)}"
589 |       end
590 |
591 |       _response
592 |     end
593 |
594 |     def drop_column(table: nil, columns: nil, drop_columns: nil, backup_dataset: nil, backup_table: nil)
595 |       table ||= self.table
596 |       backup_dataset ||= self.dataset
597 |       if columns.nil? and drop_columns.nil?
598 |         raise ArgumentError, '`drop_columns` or `columns` is required'
599 |       end
600 |
601 |       result = { responses: {} }
602 |
603 |       before_columns = existing_columns
604 |
605 |       if columns # if already given
606 |         schema = Schema.new(columns)
607 |       else
608 |         schema = Schema.new(existing_columns)
609 |         schema.reject_columns!(drop_columns)
610 |       end
611 |       if schema.empty? && !dry_run?
612 |         raise Error, 'No columns remain'
613 |       end
614 |
615 |       schema.validate_permitted_operations!(before_columns)
616 |
617 |       unless backup_dataset == self.dataset
618 |         create_dataset(dataset: backup_dataset)
619 |       end
620 |
621 |       if backup_table
622 |         _result = copy_table(source_table: table, destination_table: backup_table, destination_dataset: backup_dataset)
623 |         result[:responses].merge!(_result[:responses])
624 |       end
625 |
626 |       unless (add_columns = schema.diff_columns_by_name(before_columns)).empty?
627 |         _result = patch_table(add_columns: add_columns)
628 |         result[:responses].merge!(_result[:responses])
629 |       end
630 |
631 |       query_fields = schema.build_query_fields(before_columns)
632 |       query = "SELECT #{query_fields.join(',')} FROM [#{dataset}.#{table}]"
633 |       _result = insert_select(query: query, destination_table: table)
634 |       result[:responses].merge!(_result[:responses])
635 |
636 |       after_columns = existing_columns
637 |
638 |       result.merge!({before_columns: before_columns, after_columns: after_columns})
639 |     end
640 |
641 |     def migrate_table(table: nil, schema_file: nil, columns: nil, backup_dataset: nil, backup_table: nil)
642 |       table ||= self.table
643 |       backup_dataset ||= self.dataset
644 |
645 |       if schema_file.nil? and columns.nil?
646 |         raise ArgumentError, '`schema_file` or `columns` is required'
647 |       end
648 |       if schema_file
649 |         columns = HashUtil.deep_symbolize_keys(JSON.parse(File.read(schema_file)))
650 |       end
651 |       Schema.validate_columns!(columns)
652 |
653 |       result = {}
654 |       begin
655 |         get_table
656 |       rescue NotFoundError
657 |         before_columns = []
658 |         result = create_table(table: table, columns: columns)
659 |       else
660 |         before_columns = existing_columns
661 |         add_columns = Schema.diff_columns(before_columns, columns)
662 |         drop_columns = Schema.diff_columns(columns, before_columns)
663 |
664 |         if !drop_columns.empty?
665 |           drop_column(table: table, columns: columns,
666 |                       backup_dataset: backup_dataset, backup_table: backup_table)
667 |         elsif !add_columns.empty?
668 |           add_column(table: table, columns: columns)
669 |         end
670 |       end
671 |
672 |       after_columns = existing_columns
673 |
674 |       if after_columns.empty? and !dry_run?
675 |         raise Error, "after_columns is empty. " \
676 |           "before_columns: #{before_columns}, after_columns: #{after_columns}, columns: #{columns}"
677 |       end
678 |
679 |       result.merge!( before_columns: before_columns, after_columns: after_columns )
680 |     end
681 |
682 |     # migrates (or creates) a table with the time_partitioning option
683 |     # this version only uses the patch table API (no query job) because running a query job against a partitioned table can be costly
684 |     def migrate_partitioned_table(table: nil, schema_file: nil, columns: nil, options: {})
685 |       table ||= self.table
686 |
687 |       if schema_file.nil? and columns.nil?
688 |         raise ArgumentError, '`schema_file` or `columns` is required'
689 |       end
690 |       if schema_file
691 |         columns = HashUtil.deep_symbolize_keys(JSON.parse(File.read(schema_file)))
692 |       end
693 |       Schema.validate_columns!(columns)
694 |
695 |       result = {}
696 |       begin
697 |         get_table
698 |       rescue NotFoundError
699 |         before_columns = []
700 |         result = create_partitioned_table(table: table, columns: columns, options: options)
701 |       else
702 |         before_columns = existing_columns
703 |         add_columns = Schema.diff_columns(before_columns, columns)
704 |         drop_columns = Schema.diff_columns(columns, before_columns)
705 |
706 |         if !drop_columns.empty? || !add_columns.empty?
707 |           Schema.make_nullable!(drop_columns) # dropped columns are kept as NULLABLE columns
708 |           Schema.reverse_merge!(columns, patch_columns = drop_columns)
709 |           Schema.reverse_merge!(patch_columns, patch_columns = add_columns)
710 |           patch_table(table: table, columns: patch_columns)
711 |         end
712 |       end
713 |
714 |       after_columns = existing_columns
715 |
716 |       if after_columns.empty? and !dry_run?
717 |         raise Error, "after_columns is empty. 
" \ 718 | "before_columns: #{before_columns}, after_columns: #{after_columns}, columns: #{columns}" 719 | end 720 | 721 | result.merge!( before_columns: before_columns, after_columns: after_columns ) 722 | end 723 | 724 | # the location keyword arguments are available in google-api-client v0.19.6 or later 725 | def support_location_keyword? 726 | @support_location_keyword ||= client.method(:get_job).parameters.include?([:key, :location]) 727 | end 728 | 729 | # For old version compatibility 730 | # Use credentials_file or credentials instead 731 | def json_key 732 | if json_keyfile = config[:json_keyfile] 733 | begin 734 | case json_keyfile 735 | when String 736 | return HashUtil.deep_symbolize_keys(JSON.parse(File.read(json_keyfile))) 737 | when Hash 738 | case json_keyfile[:content] 739 | when String 740 | return HashUtil.deep_symbolize_keys(JSON.parse(json_keyfile[:content])) 741 | when Hash 742 | return json_keyfile[:content] 743 | else 744 | raise ConfigError.new "Unsupported json_keyfile type" 745 | end 746 | else 747 | raise ConfigError.new "Unsupported json_keyfile type" 748 | end 749 | rescue => e 750 | raise ConfigError.new "json_keyfile is not a JSON file" 751 | end 752 | end 753 | nil 754 | end 755 | 756 | # compute_engine, authorized_user, service_account 757 | def auth_method 758 | @auth_method ||= ENV['AUTH_METHOD'] || config.fetch(:auth_method, nil) || credentials[:type] || 'compute_engine' 759 | end 760 | 761 | def credentials 762 | json_key || HashUtil.deep_symbolize_keys(JSON.parse(config.fetch(:credentials, nil) || File.read(credentials_file))) 763 | end 764 | 765 | def credentials_file 766 | @credentials_file ||= File.expand_path( 767 | # ref. https://developers.google.com/identity/protocols/application-default-credentials 768 | ENV['GOOGLE_APPLICATION_CREDENTIALS'] || 769 | config.fetch(:credentials_file, nil) || 770 | (File.exist?(global_application_default_credentials_file) ? global_application_default_credentials_file : application_default_credentials_file) 771 | ) 772 | end 773 | 774 | def application_default_credentials_file 775 | @application_default_credentials_file ||= File.expand_path("~/.config/gcloud/application_default_credentials.json") 776 | end 777 | 778 | def global_application_default_credentials_file 779 | @global_application_default_credentials_file ||= '/etc/google/auth/application_default_credentials.json' 780 | end 781 | 782 | def config_default_file 783 | File.expand_path('~/.config/gcloud/configurations/config_default') 784 | end 785 | 786 | def config_default 787 | # {core:{account:'xxx',project:'xxx'},compute:{zone:'xxx}} 788 | @config_default ||= File.readable?(config_default_file) ? HashUtil.deep_symbolize_keys(IniFile.load(config_default_file).to_h) : {} 789 | end 790 | 791 | def service_account_default 792 | (config_default[:core] || {})[:account] 793 | end 794 | 795 | def project_default 796 | (config_default[:core] || {})[:project] 797 | end 798 | 799 | def zone_default 800 | (config_default[:compute] || {})[:zone] 801 | end 802 | 803 | def service_account 804 | @service_account ||= ENV['GOOGLE_SERVICE_ACCOUNT'] || config.fetch(:service_account, nil) || credentials[:client_email] || service_account_default 805 | end 806 | 807 | def retries 808 | @retries ||= ENV['RETRIES'] || config.fetch(:retries, nil) || 5 809 | end 810 | 811 | # For google-api-client < 0.11.0. 
Deprecated 812 | def timeout_sec 813 | @timeout_sec ||= ENV['TIMEOUT_SEC'] || config.fetch(:timeout_sec, nil) 814 | end 815 | 816 | def send_timeout_sec 817 | @send_timeout_sec ||= ENV['SEND_TIMEOUT_SEC'] || config.fetch(:send_timeout_sec, nil) || 60 818 | end 819 | 820 | def read_timeout_sec 821 | @read_timeout_sec ||= ENV['READ_TIMEOUT_SEC'] || config.fetch(:read_timeout_sec, nil) || timeout_sec || 300 822 | end 823 | 824 | def open_timeout_sec 825 | @open_timeout_sec ||= ENV['OPEN_TIMEOUT_SEC'] || config.fetch(:open_timeout_sec, nil) || 300 826 | end 827 | 828 | def project 829 | @project ||= ENV['GOOGLE_PROJECT'] || config.fetch(:project, nil) || credentials[:project_id] 830 | @project ||= credentials[:client_email].chomp('.iam.gserviceaccount.com').split('@').last if credentials[:client_email] 831 | @project ||= project_default || raise(ConfigError, '`project` is required.') 832 | end 833 | 834 | def dataset 835 | @dataset ||= config[:dataset] || raise(ConfigError, '`dataset` is required.') 836 | end 837 | 838 | def table 839 | @table ||= config[:table] || raise(ConfigError, '`table` is required.') 840 | end 841 | 842 | def location 843 | config[:location] 844 | end 845 | 846 | def clustering 847 | config[:clustering] 848 | end 849 | 850 | def job_status_polling_interval 851 | @job_status_polling_interval ||= config[:job_status_polling_interval] || 5 852 | end 853 | 854 | def job_status_max_polling_time 855 | @job_status_max_polling_time ||= config[:job_status_polling_time] || 3600 856 | end 857 | 858 | def dry_run? 859 | @opts[:dry_run] 860 | end 861 | 862 | def head 863 | dry_run? ? '(DRY-RUN) ' : '(EXECUTE) ' 864 | end 865 | end 866 | end 867 | --------------------------------------------------------------------------------
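Usage note (not part of the repository sources above): the wrapper is normally driven through the CLI and the YAML action files, but its methods can also be called directly for quick experiments. The sketch below is a minimal, hypothetical example; it assumes the class is BigqueryMigration::BigqueryWrapper and that its initializer accepts a config hash plus an options hash (inferred from how config[:dataset], config[:table], and @opts[:dry_run] are read in the file above), so verify the actual entry point in lib/bigquery_migration.rb before relying on it.

require 'bigquery_migration'

# Hypothetical direct use of the wrapper defined above; the constructor
# signature is an assumption inferred from this file, not a documented API.
config = {
  credentials_file: 'example/your-project-000.json',
  dataset: 'your_dataset_name',
  table: 'your_table_name',
}
wrapper = BigqueryMigration::BigqueryWrapper.new(config, dry_run: false)

# migrate_table diffs the existing schema against the given schema file and,
# as implemented above, either patches the table (added columns) or rebuilds
# it via copy_table + insert_select (dropped columns).
result = wrapper.migrate_table(schema_file: 'example/schema.json')
p result[:before_columns]
p result[:after_columns]

Passing dry_run: true would make the methods above log their intended requests with the '(DRY-RUN) ' prefix instead of calling the BigQuery API.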