├── .rspec ├── Gemfile ├── lib ├── samidare │ ├── version.rb │ ├── embulk_utility.rb │ ├── embulk.rb │ ├── bigquery_utility.rb │ └── mysql.rb └── samidare.rb ├── spec ├── spec_helper.rb ├── samidare_spec.rb ├── support │ ├── table.yml │ └── databe.yml └── samidare │ ├── embulk_spec.rb │ ├── bigquery_utility_spec.rb │ ├── embulk_utility_spec.rb │ └── mysql_spec.rb ├── Rakefile ├── .gitignore ├── .circleci └── config.yml ├── lint.sh ├── LICENSE.txt ├── samidare.gemspec ├── .rubocop.yml ├── Gemfile.lock └── README.md /.rspec: -------------------------------------------------------------------------------- 1 | --format documentation 2 | --color 3 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | gemspec 4 | -------------------------------------------------------------------------------- /lib/samidare/version.rb: -------------------------------------------------------------------------------- 1 | module Samidare 2 | VERSION = '0.3.0'.freeze 3 | end 4 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__) 2 | require 'samidare' 3 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | require "rspec/core/rake_task" 3 | 4 | RSpec::Core::RakeTask.new(:spec) 5 | 6 | task :default => :spec 7 | 8 | -------------------------------------------------------------------------------- /spec/samidare_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | describe Samidare do 4 | it 'has a version number' do 5 | 
expect(Samidare::VERSION).not_to be nil 6 | end 7 | end 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.bundle/ 2 | /.yardoc 3 | /_yardoc/ 4 | /coverage/ 5 | /doc/ 6 | /pkg/ 7 | /spec/reports/ 8 | /tmp/ 9 | /vendor/ 10 | *.bundle 11 | *.so 12 | *.o 13 | *.a 14 | .rspec 15 | mkmf.log 16 | -------------------------------------------------------------------------------- /spec/support/table.yml: -------------------------------------------------------------------------------- 1 | db01: 2 | tables: 3 | - name: hoge 4 | daily_snapshot: true 5 | - name: simple 6 | 7 | db02: 8 | tables: 9 | - name: fuga 10 | - name: with_condition 11 | condition: created_at < CURRENT_DATE() 12 | -------------------------------------------------------------------------------- /spec/support/databe.yml: -------------------------------------------------------------------------------- 1 | db01: 2 | host: localhost 3 | username: root 4 | password: 5 | database: embulk 6 | bq_dataset: mysql 7 | 8 | db02: 9 | host: localhost 10 | username: root 11 | password: 12 | database: embulk2 13 | bq_dataset: mysql2 14 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | jobs: 3 | build: 4 | working_directory: ~/samidare 5 | docker: 6 | - image: circleci/ruby:2.4.1 7 | environment: 8 | TZ: /usr/share/zoneinfo/Asia/Tokyo 9 | steps: 10 | - checkout 11 | 12 | - restore_cache: 13 | name: Restore bundle cache 14 | key: samidare-{{ checksum "Gemfile.lock" }} 15 | 16 | - run: 17 | name: Run bundle install 18 | command: bundle install --path vendor/bundle 19 | 20 | - save_cache: 21 | name: Store bundle cache 22 | key: samidare-{{ checksum "Gemfile.lock" }} 23 | paths: 24 | - vendor/bundle 25 | 26 | - run: 27 | name: chmod +x 
./lint.sh 28 | command: chmod +x ./lint.sh 29 | 30 | - run: 31 | name: ./lint.sh 32 | command: ./lint.sh 33 | 34 | - run: 35 | name: Run rspec 36 | command: bundle exec rspec spec/ 37 | -------------------------------------------------------------------------------- /spec/samidare/embulk_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | describe Samidare::Embulk do 4 | describe '#target_table_configs' do 5 | subject { Samidare::Embulk.new.target_table_configs(table_configs, target_table_names) } 6 | 7 | context 'all tables' do 8 | let(:table_hoge) { Samidare::MySQL::TableConfig.new({ 'name' => 'hoge' }) } 9 | let(:table_fuga) { Samidare::MySQL::TableConfig.new({ 'name' => 'fuga' }) } 10 | let(:table_configs) { [table_hoge, table_fuga] } 11 | let(:target_table_names) { [] } 12 | it { expect(subject).to match(table_configs) } 13 | end 14 | 15 | context 'target table selected' do 16 | let(:table_hoge) { Samidare::MySQL::TableConfig.new({ 'name' => 'hoge' }) } 17 | let(:table_fuga) { Samidare::MySQL::TableConfig.new({ 'name' => 'fuga' }) } 18 | let(:table_configs) { [table_hoge, table_fuga] } 19 | let(:target_table_names) { ['hoge'] } 20 | it { expect(subject).to match([table_hoge]) } 21 | end 22 | end 23 | end 24 | -------------------------------------------------------------------------------- /lint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo 'CIRCLE_BRANCH: ' ${CIRCLE_BRANCH} 4 | TARGET_BRANCH=${CIRCLE_BRANCH} 5 | 6 | # Set the current branch for local runs 7 | if [ "$TARGET_BRANCH" = '' ]; then 8 | TARGET_BRANCH=$(git rev-parse --abbrev-ref HEAD) 9 | fi 10 | echo 'TARGET_BRANCH: ' $TARGET_BRANCH 11 | 12 | echo 'CIRCLE_BASE_BRANCH: ' ${CIRCLE_BASE_BRANCH} 13 | BASE_BRANCH=${CIRCLE_BASE_BRANCH} 14 | 15 | # For local runs where the environment variable is not set, compare against the master branch 16 | if [ "$BASE_BRANCH" = '' ]; then 17 | BASE_BRANCH=origin/master 18 | fi 19 | echo
'BASE_BRANCH: ' $BASE_BRANCH 20 | 21 | files=$(git diff --name-only $TARGET_BRANCH $BASE_BRANCH | grep -E '\.rb$' | egrep -v 'db/migrate|db/schema.rb') 22 | 23 | error=false 24 | for file in ${files}; do 25 | if [ -e $file ]; then 26 | result=$(bundle exec rubocop ${file}) 27 | rubocop_error=$(echo "$result" | grep 'Offenses:') 28 | if [ "$rubocop_error" != '' ]; then 29 | error=true 30 | echo '' 31 | echo 'ERROR:' $file 32 | echo "$result" 33 | fi 34 | fi 35 | done 36 | 37 | if $error; then 38 | exit 1 39 | fi 40 | 41 | exit 0 42 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Ryoji Kobori 2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 | -------------------------------------------------------------------------------- /lib/samidare.rb: -------------------------------------------------------------------------------- 1 | require 'samidare/version' 2 | require 'samidare/embulk_utility' 3 | require 'samidare/embulk' 4 | require 'samidare/mysql' 5 | 6 | module Samidare 7 | class EmbulkClient 8 | def generate_config(bq_config) 9 | Samidare::EmbulkUtility::ConfigGenerator.new.generate_config(database_configs, bq_config) 10 | end 11 | 12 | def run(bq_config, target_table_names = [], retry_max = 0) 13 | error_tables = run_and_retry(bq_config, target_table_names, retry_max, 0) 14 | # return batch status(true: all tables success) 15 | error_tables.empty? 16 | end 17 | 18 | private 19 | 20 | def run_and_retry(bq_config, target_table_names, retry_max, retry_count) 21 | error_tables = Samidare::Embulk.new.run( 22 | database_configs, 23 | table_configs, 24 | bq_config, 25 | target_table_names 26 | ) 27 | if !error_tables.empty? && retry_count < retry_max 28 | puts '------------------------------------' 29 | puts "retry start -> #{retry_count + 1} time" 30 | puts '------------------------------------' 31 | error_tables = run_and_retry(bq_config, error_tables, retry_max, retry_count + 1) 32 | end 33 | error_tables 34 | end 35 | 36 | def database_configs 37 | @database_configs ||= YAML.load_file('database.yml') 38 | end 39 | 40 | def table_configs 41 | @table_configs ||= Samidare::MySQL::TableConfig.generate_table_configs 42 | end 43 | end 44 | end 45 | -------------------------------------------------------------------------------- /lib/samidare/embulk_utility.rb: -------------------------------------------------------------------------------- 1 | module Samidare 2 | module EmbulkUtility 3 | class ConfigGenerator 4 | def generate_config(database_configs, bq_config) 5 | bq_utility = BigQueryUtility.new(bq_config) 6 | 7 | database_configs.keys.each do |db_name| 8 | database_config = database_configs[db_name] 9 |
table_configs = all_table_configs[db_name] 10 | mysql_client = MySQL::MySQLClient.new(database_config) 11 | 12 | table_configs.each do |table_config| 13 | write( 14 | "#{bq_config['schema_dir']}/#{db_name}", 15 | "#{table_config.name}.json", 16 | mysql_client.generate_bq_schema(table_config.name) 17 | ) 18 | write( 19 | "#{bq_config['config_dir']}/#{db_name}", 20 | "#{table_config.name}.yml", 21 | bq_utility.generate_embulk_config( 22 | db_name, 23 | database_config, 24 | table_config, 25 | mysql_client.columns(table_config.name) 26 | ) 27 | ) 28 | end 29 | end 30 | end 31 | 32 | private 33 | 34 | def write(directory, file_name, content) 35 | FileUtils.mkdir_p(directory) unless FileTest.exist?(directory) 36 | File.write("#{directory}/#{file_name}", content) 37 | end 38 | 39 | def all_table_configs 40 | @all_table_configs ||= MySQL::TableConfig.generate_table_configs 41 | end 42 | end 43 | end 44 | end 45 | -------------------------------------------------------------------------------- /samidare.gemspec: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | lib = File.expand_path('../lib', __FILE__) 3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 4 | require 'samidare/version' 5 | 6 | Gem::Specification.new do |spec| 7 | spec.name = 'samidare' 8 | spec.version = Samidare::VERSION 9 | spec.authors = ['Ryoji Kobori'] 10 | spec.email = ['kobori75@gmail.com'] 11 | spec.summary = %q{Embulk utility for MySQL to BigQuery} 12 | spec.description = %q{Generate Embulk config and BigQuery schema from MySQL schema} 13 | spec.homepage = 'https://github.com/cobot00/samidare' 14 | spec.license = 'MIT' 15 | 16 | spec.files = `git ls-files -z`.split("\x0") 17 | spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) } 18 | spec.test_files = spec.files.grep(%r{^(test|spec|features)/}) 19 | spec.require_paths = ['lib'] 20 | 21 | spec.add_development_dependency 'bundler', '~> 1.7' 22 | 
spec.add_development_dependency 'rake', '~> 10.0' 23 | spec.add_development_dependency 'rspec', '~> 3.7.0' 24 | spec.add_development_dependency 'timecop', '~> 0.9.0' 25 | spec.add_development_dependency 'rubocop', '0.49.1' 26 | 27 | spec.add_dependency 'unindent', '1.0' 28 | spec.add_dependency 'mysql2-cs-bind', '0.0.6' 29 | spec.add_dependency 'embulk-output-bigquery', '0.4.3' 30 | spec.add_dependency 'embulk-input-mysql', '0.8.2' 31 | spec.add_dependency 'embulk-parser-jsonl', '0.2.0' 32 | spec.add_dependency 'embulk-formatter-jsonl', '0.1.4' 33 | spec.add_dependency 'bigquery', '0.8.3' 34 | end 35 | -------------------------------------------------------------------------------- /.rubocop.yml: -------------------------------------------------------------------------------- 1 | AllCops: 2 | TargetRubyVersion: 2.4.1 3 | Include: 4 | - '**/Gemfile' 5 | - '**/Rakefile' 6 | - '**/config.ru' 7 | Exclude: 8 | - 'db/**/*' 9 | - 'config/**/*' 10 | - 'script/**/*' 11 | - 'vendor/**/*' 12 | - 'bin/*' 13 | - !ruby/regexp /old_and_unused\.rb$/ 14 | 15 | AlignParameters: 16 | Enabled: false 17 | 18 | AsciiComments: 19 | Enabled: false 20 | 21 | BracesAroundHashParameters: 22 | Enabled: false 23 | 24 | Bundler/OrderedGems: 25 | Enabled: false 26 | 27 | ClassAndModuleChildren: 28 | Enabled: false 29 | 30 | ClassLength: 31 | Enabled: false 32 | 33 | Layout/AlignHash: 34 | Enabled: false 35 | 36 | Layout/EndOfLine: 37 | Enabled: false 38 | 39 | Layout/IndentHash: 40 | Enabled: false 41 | 42 | Layout/MultilineArrayBraceLayout: 43 | Enabled: false 44 | 45 | Layout/MultilineHashBraceLayout: 46 | Enabled: false 47 | 48 | Layout/MultilineMethodCallIndentation: 49 | Enabled: false 50 | 51 | Metrics/BlockLength: 52 | Exclude: 53 | - 'spec/**/*' 54 | - 'config/routes.rb' 55 | - 'app/jobs/**/*.rb' 56 | - 'lib/tasks/**/*.rb' 57 | 58 | Metrics/AbcSize: 59 | Enabled: false 60 | 61 | Metrics/CyclomaticComplexity: 62 | Enabled: false 63 | 64 | Metrics/LineLength: 65 | Enabled: false 66 | 67 | 
Metrics/MethodLength: 68 | Enabled: false 69 | 70 | Metrics/PerceivedComplexity: 71 | Enabled: false 72 | 73 | ModuleLength: 74 | Enabled: false 75 | 76 | Style/ClassAndModuleCamelCase: 77 | Enabled: false 78 | 79 | Style/Documentation: 80 | Enabled: false 81 | 82 | Style/FrozenStringLiteralComment: 83 | Enabled: false 84 | 85 | Style/GuardClause: 86 | Enabled: false 87 | 88 | Style/Next: 89 | Enabled: false 90 | 91 | Style/NumericLiterals: 92 | Enabled: false 93 | 94 | Style/RaiseArgs: 95 | Enabled: false 96 | 97 | Style/RedundantBegin: 98 | Enabled: false 99 | 100 | Style/Lambda: 101 | Enabled: false 102 | 103 | Rails/SkipsModelValidations: 104 | Enabled: false 105 | -------------------------------------------------------------------------------- /lib/samidare/embulk.rb: -------------------------------------------------------------------------------- 1 | module Samidare 2 | class Embulk 3 | def run(database_configs, all_table_configs, bq_config, target_table_names = []) 4 | error_tables = [] 5 | database_configs.keys.each do |db_name| 6 | table_configs = target_table_configs(all_table_configs[db_name], target_table_names) 7 | error_tables += run_by_database( 8 | db_name, 9 | table_configs, 10 | database_configs[db_name]['bq_dataset'], 11 | bq_config 12 | ) 13 | end 14 | error_tables 15 | end 16 | 17 | def target_table_configs(table_configs, target_table_names) 18 | return table_configs if target_table_names.empty? 
19 | table_configs.select { |table_config| target_table_names.include?(table_config.name) } 20 | end 21 | 22 | private 23 | 24 | def run_by_database(db_name, table_configs, bq_dataset, bq_config) 25 | process_times = [] 26 | error_tables = [] 27 | big_query = Samidare::BigQueryUtility.new(bq_config) 28 | table_configs.each do |table_config| 29 | start_time = Time.now 30 | log "table: #{table_config.name} - start" 31 | 32 | begin 33 | big_query.delete_table(bq_dataset, table_config.name) 34 | log "table: #{table_config.name} - deleted" 35 | rescue 36 | log "table: #{table_config.name} - does not exist" 37 | end 38 | 39 | cmd = "embulk run #{bq_config['config_dir']}/#{db_name}/#{table_config.name}.yml" 40 | log "cmd: #{cmd}" 41 | if system(cmd) 42 | result = 'success' 43 | else 44 | result = 'error' 45 | error_tables << table_config.name 46 | end 47 | 48 | process_time = "table: #{table_config.name} - result: #{result} #{format('%10.1f', Time.now - start_time)}sec" 49 | log process_time 50 | process_times << process_time 51 | end 52 | log '------------------------------------' 53 | log "db_name: #{db_name}" 54 | process_times.each { |process_time| log process_time } 55 | 56 | error_tables 57 | end 58 | 59 | def log(message) 60 | puts "[#{Time.now.strftime('%Y-%m-%d %H:%M:%S')}] #{message}" 61 | end 62 | end 63 | end 64 | -------------------------------------------------------------------------------- /lib/samidare/bigquery_utility.rb: -------------------------------------------------------------------------------- 1 | require 'json' 2 | require 'erb' 3 | require 'big_query' 4 | require 'unindent' 5 | require 'date' 6 | 7 | module Samidare 8 | class BigQueryUtility 9 | CONTENTS = <<-EOS.unindent 10 | in: 11 | type: mysql 12 | user: <%= user %> 13 | password: <%= password %> 14 | database: <%= database %> 15 | host: <%= host %> 16 | query: | 17 | <%= query %> 18 | out: 19 | type: bigquery 20 | project: <%= project %> 21 | p12_keyfile: <%= p12_keyfile %> 22 | 
service_account_email: <%= service_account_email %> 23 | dataset: <%= dataset %> 24 | table: <%= table_name %> 25 | schema_path: <%= schema_path %> 26 | auto_create_table: 1 27 | path_prefix: <%= path_prefix %> 28 | source_format: NEWLINE_DELIMITED_JSON 29 | file_ext: .json.gz 30 | delete_from_local_when_job_end: 1 31 | formatter: 32 | type: jsonl 33 | encoders: 34 | - {type: gzip} 35 | EOS 36 | 37 | def initialize(config) 38 | @config = config.dup 39 | @current_date = Date.today 40 | end 41 | 42 | def self.generate_schema(columns) 43 | json_body = columns.map(&:to_json).join(",\n") 44 | "[\n" + json_body + "\n]\n" 45 | end 46 | 47 | def self.generate_sql(table_config, columns) 48 | columns = columns.map(&:converted_value) 49 | sql = "SELECT #{columns.join(',')}" 50 | sql << " FROM #{table_config.name}" 51 | sql << " WHERE #{table_config.condition}" if table_config.condition 52 | sql << "\n" 53 | sql 54 | end 55 | 56 | def generate_embulk_config(db_name, database_config, table_config, columns) 57 | host = database_config['host'] 58 | user = database_config['username'] 59 | password = database_config['password'] 60 | database = database_config['database'] 61 | query = Samidare::BigQueryUtility.generate_sql(table_config, columns) 62 | project = @config['project_id'] 63 | p12_keyfile = @config['key'] 64 | service_account_email = @config['service_email'] 65 | dataset = database_config['bq_dataset'] 66 | table_name = actual_table_name(table_config.name, database_config['daily_snapshot'] || table_config.daily_snapshot) 67 | schema_path = "#{@config['schema_dir']}/#{db_name}/#{table_config.name}.json" 68 | path_prefix = "/var/tmp/embulk_#{db_name}_#{table_config.name}" 69 | 70 | ERB.new(CONTENTS).result(binding) 71 | end 72 | 73 | def delete_table(dataset, table_name) 74 | @config['dataset'] = dataset 75 | 76 | bq = BigQuery::Client.new(@config) 77 | bq.delete_table(table_name) 78 | end 79 | 80 | def actual_table_name(table_name, daily_snapshot) 81 | return table_name 
unless daily_snapshot 82 | table_name + @current_date.strftime('%Y%m%d') 83 | end 84 | end 85 | end 86 | -------------------------------------------------------------------------------- /spec/samidare/bigquery_utility_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | require 'unindent' 3 | require 'timecop' 4 | 5 | describe Samidare::BigQueryUtility do 6 | describe '.generate_schema' do 7 | subject { Samidare::BigQueryUtility.generate_schema(columns) } 8 | 9 | let(:columns) { [ 10 | Samidare::MySQL::Column.new('id', 'int'), 11 | Samidare::MySQL::Column.new('name', 'varchar'), 12 | Samidare::MySQL::Column.new('created_at', 'datetime') 13 | ] } 14 | let(:schema_json) { 15 | <<-JSON.unindent 16 | [ 17 | {"name":"id","type":"integer"}, 18 | {"name":"name","type":"string"}, 19 | {"name":"created_at","type":"timestamp"} 20 | ] 21 | JSON 22 | } 23 | it { expect(subject).to eq schema_json } 24 | end 25 | 26 | describe '.generate_sql' do 27 | subject { Samidare::BigQueryUtility.generate_sql(table_config, columns) } 28 | 29 | let(:columns) { [ 30 | Samidare::MySQL::Column.new('id', 'int'), 31 | Samidare::MySQL::Column.new('name', 'varchar'), 32 | Samidare::MySQL::Column.new('created_at', 'datetime') 33 | ] } 34 | 35 | context 'no condition' do 36 | let(:table_config) { Samidare::MySQL::TableConfig.new({ 'name' => 'simple' }) } 37 | let(:sql) { "SELECT `id`,`name`,UNIX_TIMESTAMP(`created_at`) AS `created_at` FROM simple\n" } 38 | it { expect(subject).to eq sql } 39 | end 40 | 41 | context 'has condition' do 42 | let(:table_config) { Samidare::MySQL::TableConfig.new({ 'name' => 'simple', 'condition' => 'created_at >= CURRENT_DATE() - INTERVAL 3 MONTH' }) } 43 | let(:sql) { "SELECT `id`,`name`,UNIX_TIMESTAMP(`created_at`) AS `created_at` FROM simple WHERE created_at >= CURRENT_DATE() - INTERVAL 3 MONTH\n" } 44 | it { expect(subject).to eq sql } 45 | end 46 | end 47 | 48 | describe '#actual_table_name' do 49 | 
before { Timecop.freeze(Time.now) } 50 | 51 | after { Timecop.return } 52 | 53 | subject { Samidare::BigQueryUtility.new({}).actual_table_name(table_name, daily_snapshot) } 54 | let(:table_name) { 'users' } 55 | let(:daily_snapshot) { false } 56 | 57 | context 'do not use daily snapshot' do 58 | it { expect(subject).to eq table_name } 59 | end 60 | 61 | context 'use daily snapshot' do 62 | let(:daily_snapshot) { true } 63 | it { expect(subject).to eq table_name + Time.now.strftime('%Y%m%d') } 64 | end 65 | end 66 | 67 | describe '#actual_table_name' do 68 | before { Timecop.freeze(Time.now) } 69 | 70 | after { Timecop.return } 71 | 72 | subject { Samidare::BigQueryUtility.new({}).actual_table_name(table_name, daily_snapshot) } 73 | let(:table_name) { 'users' } 74 | let(:daily_snapshot) { false } 75 | 76 | context 'do not use daily snapshot' do 77 | it { expect(subject).to eq table_name } 78 | end 79 | 80 | context 'use daily snapshot' do 81 | let(:daily_snapshot) { true } 82 | it { expect(subject).to eq table_name + Time.now.strftime('%Y%m%d') } 83 | end 84 | end 85 | end 86 | -------------------------------------------------------------------------------- /spec/samidare/embulk_utility_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | describe Samidare::MySQL::Column do 4 | let(:column) { Samidare::MySQL::Column.new(column_name, data_type) } 5 | let(:column_name) { 'id' } 6 | let(:data_type) { 'int' } 7 | 8 | it { expect(column.column_name).to eq 'id' } 9 | it { expect(column.data_type).to eq 'int' } 10 | 11 | describe '#bigquery_data_type' do 12 | subject { column.bigquery_data_type } 13 | 14 | context 'int' do 15 | let(:data_type) { 'int' } 16 | it { expect(subject).to eq 'integer' } 17 | end 18 | 19 | context 'tinyint' do 20 | let(:data_type) { 'tinyint' } 21 | it { expect(subject).to eq 'integer' } 22 | end 23 | 24 | context 'smallint' do 25 | let(:data_type) { 'smallint' } 26 | it { 
expect(subject).to eq 'integer' } 27 | end 28 | 29 | context 'mediumint' do 30 | let(:data_type) { 'mediumint' } 31 | it { expect(subject).to eq 'integer' } 32 | end 33 | 34 | context 'bigint' do 35 | let(:data_type) { 'bigint' } 36 | it { expect(subject).to eq 'integer' } 37 | end 38 | 39 | context 'float' do 40 | let(:data_type) { 'float' } 41 | it { expect(subject).to eq 'float' } 42 | end 43 | 44 | context 'double' do 45 | let(:data_type) { 'double' } 46 | it { expect(subject).to eq 'float' } 47 | end 48 | 49 | context 'decimal' do 50 | let(:data_type) { 'decimal' } 51 | it { expect(subject).to eq 'float' } 52 | end 53 | 54 | context 'char' do 55 | let(:data_type) { 'char' } 56 | it { expect(subject).to eq 'string' } 57 | end 58 | 59 | context 'varchar' do 60 | let(:data_type) { 'varchar' } 61 | it { expect(subject).to eq 'string' } 62 | end 63 | 64 | context 'tinytext' do 65 | let(:data_type) { 'tinytext' } 66 | it { expect(subject).to eq 'string' } 67 | end 68 | 69 | context 'text' do 70 | let(:data_type) { 'text' } 71 | it { expect(subject).to eq 'string' } 72 | end 73 | 74 | context 'date' do 75 | let(:data_type) { 'date' } 76 | it { expect(subject).to eq 'timestamp' } 77 | end 78 | 79 | context 'datetime' do 80 | let(:data_type) { 'datetime' } 81 | it { expect(subject).to eq 'timestamp' } 82 | end 83 | 84 | context 'timestamp' do 85 | let(:data_type) { 'timestamp' } 86 | it { expect(subject).to eq 'timestamp' } 87 | end 88 | end 89 | 90 | describe '#converted_value' do 91 | subject { column.converted_value } 92 | 93 | context 'datetime' do 94 | let(:column_name) { 'create_at' } 95 | let(:data_type) { 'datetime' } 96 | it { expect(subject).to eq 'UNIX_TIMESTAMP(`create_at`) AS `create_at`' } 97 | end 98 | 99 | context 'int' do 100 | let(:column_name) { 'id' } 101 | let(:data_type) { 'int' } 102 | it { expect(subject).to eq '`id`' } 103 | end 104 | 105 | context 'varchar' do 106 | let(:column_name) { 'explanation' } 107 | let(:data_type) { 'varchar' } 108 | 
it { expect(subject).to eq '`explanation`' } 109 | end 110 | end 111 | 112 | describe '#to_json' do 113 | subject { column.to_json } 114 | 115 | let(:column_name) { 'id' } 116 | let(:data_type) { 'int' } 117 | it { expect(subject).to eq '{"name":"id","type":"integer"}' } 118 | end 119 | end 120 | -------------------------------------------------------------------------------- /lib/samidare/mysql.rb: -------------------------------------------------------------------------------- 1 | require 'mysql2-cs-bind' 2 | require 'json' 3 | require 'yaml' 4 | require 'fileutils' 5 | require 'samidare/bigquery_utility' 6 | 7 | module Samidare 8 | module MySQL 9 | class MySQLClient 10 | COLUMN_SQL = <<-SQL.freeze 11 | SELECT column_name, data_type 12 | FROM INFORMATION_SCHEMA.COLUMNS 13 | WHERE table_schema = ? 14 | AND table_name = ? 15 | ORDER BY ordinal_position 16 | SQL 17 | 18 | def initialize(database_config) 19 | @database_config = database_config 20 | end 21 | 22 | def client 23 | @client ||= Mysql2::Client.new( 24 | host: @database_config['host'], 25 | username: @database_config['username'], 26 | password: @database_config['password'], 27 | database: @database_config['database'] 28 | ) 29 | end 30 | 31 | def generate_bq_schema(table_name) 32 | infos = columns(table_name) 33 | BigQueryUtility.generate_schema(infos) 34 | end 35 | 36 | def columns(table_name) 37 | rows = client.xquery(COLUMN_SQL, @database_config['database'], table_name) 38 | rows.map { |row| Column.new(row['column_name'], row['data_type']) } 39 | end 40 | end 41 | 42 | class TableConfig 43 | attr_reader :name, :daily_snapshot, :condition 44 | 45 | def initialize(config) 46 | @name = config['name'] 47 | @daily_snapshot = config['daily_snapshot'] || false 48 | @condition = config['condition'] 49 | end 50 | 51 | def self.generate_table_configs(file_path = 'table.yml') 52 | configs = YAML.load_file(file_path) 53 | configs.each_with_object({}) do |(db, database_config), table_configs| 54 | table_configs[db] = 
database_config['tables'].map { |config| TableConfig.new(config) } 55 | table_configs 56 | end 57 | end 58 | 59 | def ==(other) 60 | instance_variables.all? do |v| 61 | instance_variable_get(v) == other.instance_variable_get(v) 62 | end 63 | end 64 | end 65 | 66 | class Column 67 | attr_reader :column_name, :data_type 68 | 69 | TYPE_MAPPINGS = { 70 | 'int' => 'integer', 71 | 'tinyint' => 'integer', 72 | 'smallint' => 'integer', 73 | 'mediumint' => 'integer', 74 | 'bigint' => 'integer', 75 | 'float' => 'float', 76 | 'double' => 'float', 77 | 'decimal' => 'float', 78 | 'char' => 'string', 79 | 'varchar' => 'string', 80 | 'tinytext' => 'string', 81 | 'text' => 'string', 82 | 'date' => 'timestamp', 83 | 'datetime' => 'timestamp', 84 | 'timestamp' => 'timestamp' 85 | }.freeze 86 | 87 | def initialize(column_name, data_type) 88 | @column_name = column_name 89 | @data_type = data_type 90 | end 91 | 92 | def bigquery_data_type 93 | TYPE_MAPPINGS[@data_type] 94 | end 95 | 96 | def converted_value 97 | if bigquery_data_type == 'timestamp' 98 | # time zone translate to UTC 99 | "UNIX_TIMESTAMP(#{escaped_column_name}) AS #{escaped_column_name}" 100 | elsif data_type == 'tinyint' 101 | # for MySQL tinyint(1) problem 102 | "CAST(#{escaped_column_name} AS signed) AS #{escaped_column_name}" 103 | else 104 | escaped_column_name 105 | end 106 | end 107 | 108 | def to_json(*a) 109 | { 'name' => @column_name, 'type' => bigquery_data_type }.to_json(*a) 110 | end 111 | 112 | private 113 | 114 | def escaped_column_name 115 | "`#{@column_name}`" 116 | end 117 | end 118 | end 119 | end 120 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | PATH 2 | remote: . 
3 | specs: 4 | samidare (0.3.0) 5 | bigquery (= 0.8.3) 6 | embulk-formatter-jsonl (= 0.1.4) 7 | embulk-input-mysql (= 0.8.2) 8 | embulk-output-bigquery (= 0.4.3) 9 | embulk-parser-jsonl (= 0.2.0) 10 | mysql2-cs-bind (= 0.0.6) 11 | unindent (= 1.0) 12 | 13 | GEM 14 | remote: https://rubygems.org/ 15 | specs: 16 | activesupport (5.2.0) 17 | concurrent-ruby (~> 1.0, >= 1.0.2) 18 | i18n (>= 0.7, < 2) 19 | minitest (~> 5.1) 20 | tzinfo (~> 1.1) 21 | addressable (2.5.2) 22 | public_suffix (>= 2.0.2, < 4.0) 23 | ast (2.4.0) 24 | autoparse (0.3.3) 25 | addressable (>= 2.3.1) 26 | extlib (>= 0.9.15) 27 | multi_json (>= 1.0.0) 28 | bigquery (0.8.3) 29 | google-api-client (= 0.8.6) 30 | concurrent-ruby (1.0.5) 31 | diff-lcs (1.3) 32 | embulk-formatter-jsonl (0.1.4) 33 | jrjackson (~> 0.2.8) 34 | embulk-input-mysql (0.8.2) 35 | embulk-output-bigquery (0.4.3) 36 | google-api-client 37 | time_with_zone 38 | embulk-parser-jsonl (0.2.0) 39 | extlib (0.9.16) 40 | faraday (0.15.2) 41 | multipart-post (>= 1.2, < 3) 42 | google-api-client (0.8.6) 43 | activesupport (>= 3.2) 44 | addressable (~> 2.3) 45 | autoparse (~> 0.3) 46 | extlib (~> 0.9) 47 | faraday (~> 0.9) 48 | googleauth (~> 0.3) 49 | launchy (~> 2.4) 50 | multi_json (~> 1.10) 51 | retriable (~> 1.4) 52 | signet (~> 0.6) 53 | googleauth (0.6.2) 54 | faraday (~> 0.12) 55 | jwt (>= 1.4, < 3.0) 56 | logging (~> 2.0) 57 | memoist (~> 0.12) 58 | multi_json (~> 1.11) 59 | os (~> 0.9) 60 | signet (~> 0.7) 61 | i18n (1.0.1) 62 | concurrent-ruby (~> 1.0) 63 | jrjackson (0.2.9) 64 | jwt (2.1.0) 65 | launchy (2.4.3) 66 | addressable (~> 2.3) 67 | little-plugger (1.1.4) 68 | logging (2.2.2) 69 | little-plugger (~> 1.1) 70 | multi_json (~> 1.10) 71 | memoist (0.16.0) 72 | minitest (5.11.3) 73 | multi_json (1.13.1) 74 | multipart-post (2.0.0) 75 | mysql2 (0.5.1-x64-mingw32) 76 | mysql2-cs-bind (0.0.6) 77 | mysql2 78 | os (0.9.6) 79 | parallel (1.12.1) 80 | parser (2.5.1.0) 81 | ast (~> 2.4.0) 82 | powerpack (0.1.2) 83 | public_suffix 
(3.0.2) 84 | rainbow (2.2.2) 85 | rake 86 | rake (10.5.0) 87 | retriable (1.4.1) 88 | rspec (3.7.0) 89 | rspec-core (~> 3.7.0) 90 | rspec-expectations (~> 3.7.0) 91 | rspec-mocks (~> 3.7.0) 92 | rspec-core (3.7.1) 93 | rspec-support (~> 3.7.0) 94 | rspec-expectations (3.7.0) 95 | diff-lcs (>= 1.2.0, < 2.0) 96 | rspec-support (~> 3.7.0) 97 | rspec-mocks (3.7.0) 98 | diff-lcs (>= 1.2.0, < 2.0) 99 | rspec-support (~> 3.7.0) 100 | rspec-support (3.7.1) 101 | rubocop (0.49.1) 102 | parallel (~> 1.10) 103 | parser (>= 2.3.3.1, < 3.0) 104 | powerpack (~> 0.1) 105 | rainbow (>= 1.99.1, < 3.0) 106 | ruby-progressbar (~> 1.7) 107 | unicode-display_width (~> 1.0, >= 1.0.1) 108 | ruby-progressbar (1.9.0) 109 | signet (0.8.1) 110 | addressable (~> 2.3) 111 | faraday (~> 0.9) 112 | jwt (>= 1.5, < 3.0) 113 | multi_json (~> 1.10) 114 | thread_safe (0.3.6) 115 | time_with_zone (0.3.1) 116 | tzinfo 117 | timecop (0.9.1) 118 | tzinfo (1.2.5) 119 | thread_safe (~> 0.1) 120 | unicode-display_width (1.4.0) 121 | unindent (1.0) 122 | 123 | PLATFORMS 124 | x64-mingw32 125 | 126 | DEPENDENCIES 127 | bundler (~> 1.7) 128 | rake (~> 10.0) 129 | rspec (~> 3.7.0) 130 | rubocop (= 0.49.1) 131 | samidare! 
132 | timecop (~> 0.9.0) 133 | 134 | BUNDLED WITH 135 | 1.16.2 136 | -------------------------------------------------------------------------------- /spec/samidare/mysql_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | describe Samidare::MySQL::TableConfig do 4 | 5 | describe '.generate_table_configs' do 6 | subject { Samidare::MySQL::TableConfig.generate_table_configs('spec/support/table.yml') } 7 | let(:db01_hoge) { Samidare::MySQL::TableConfig.new({ 'name' => 'hoge', 'daily_snapshot' => true }) } 8 | let(:db01_simple) { Samidare::MySQL::TableConfig.new({ 'name' => 'simple' }) } 9 | let(:db02_fuga) { Samidare::MySQL::TableConfig.new({ 'name' => 'fuga' }) } 10 | let(:db02_with_condition) { Samidare::MySQL::TableConfig.new({ 'name' => 'with_condition', 'condition' => 'created_at < CURRENT_DATE()' }) } 11 | 12 | it { expect(subject['db01'][0]).to eq db01_hoge } 13 | it { expect(subject['db01'][1]).to eq db01_simple } 14 | it { expect(subject['db02'][0]).to eq db02_fuga } 15 | it { expect(subject['db02'][1]).to eq db02_with_condition } 16 | end 17 | end 18 | 19 | describe Samidare::MySQL::Column do 20 | let(:column) { Samidare::MySQL::Column.new(column_name, data_type) } 21 | let(:column_name) { 'id' } 22 | let(:data_type) { 'int' } 23 | 24 | it { expect(column.column_name).to eq 'id' } 25 | it { expect(column.data_type).to eq 'int' } 26 | 27 | describe '#bigquery_data_type' do 28 | subject { column.bigquery_data_type } 29 | 30 | context 'int' do 31 | let(:data_type) { 'int' } 32 | it { expect(subject).to eq 'integer' } 33 | end 34 | 35 | context 'tinyint' do 36 | let(:data_type) { 'tinyint' } 37 | it { expect(subject).to eq 'integer' } 38 | end 39 | 40 | context 'smallint' do 41 | let(:data_type) { 'smallint' } 42 | it { expect(subject).to eq 'integer' } 43 | end 44 | 45 | context 'mediumint' do 46 | let(:data_type) { 'mediumint' } 47 | it { expect(subject).to eq 'integer' } 48 | end 49 | 50 
| context 'bigint' do 51 | let(:data_type) { 'bigint' } 52 | it { expect(subject).to eq 'integer' } 53 | end 54 | 55 | context 'float' do 56 | let(:data_type) { 'float' } 57 | it { expect(subject).to eq 'float' } 58 | end 59 | 60 | context 'double' do 61 | let(:data_type) { 'double' } 62 | it { expect(subject).to eq 'float' } 63 | end 64 | 65 | context 'decimal' do 66 | let(:data_type) { 'decimal' } 67 | it { expect(subject).to eq 'float' } 68 | end 69 | 70 | context 'char' do 71 | let(:data_type) { 'char' } 72 | it { expect(subject).to eq 'string' } 73 | end 74 | 75 | context 'varchar' do 76 | let(:data_type) { 'varchar' } 77 | it { expect(subject).to eq 'string' } 78 | end 79 | 80 | context 'tinytext' do 81 | let(:data_type) { 'tinytext' } 82 | it { expect(subject).to eq 'string' } 83 | end 84 | 85 | context 'text' do 86 | let(:data_type) { 'text' } 87 | it { expect(subject).to eq 'string' } 88 | end 89 | 90 | context 'date' do 91 | let(:data_type) { 'date' } 92 | it { expect(subject).to eq 'timestamp' } 93 | end 94 | 95 | context 'datetime' do 96 | let(:data_type) { 'datetime' } 97 | it { expect(subject).to eq 'timestamp' } 98 | end 99 | 100 | context 'timestamp' do 101 | let(:data_type) { 'timestamp' } 102 | it { expect(subject).to eq 'timestamp' } 103 | end 104 | end 105 | 106 | describe '#converted_value' do 107 | subject { column.converted_value } 108 | 109 | context 'datetime' do 110 | let(:column_name) { 'create_at' } 111 | let(:data_type) { 'datetime' } 112 | it { expect(subject).to eq 'UNIX_TIMESTAMP(`create_at`) AS `create_at`' } 113 | end 114 | 115 | context 'int' do 116 | let(:column_name) { 'id' } 117 | let(:data_type) { 'int' } 118 | it { expect(subject).to eq '`id`' } 119 | end 120 | 121 | context 'varchar' do 122 | let(:column_name) { 'explanation' } 123 | let(:data_type) { 'varchar' } 124 | it { expect(subject).to eq '`explanation`' } 125 | end 126 | end 127 | 128 | describe '#to_json' do 129 | subject { column.to_json } 130 | 131 | 
let(:column_name) { 'id' } 132 | let(:data_type) { 'int' } 133 | it { expect(subject).to eq '{"name":"id","type":"integer"}' } 134 | end 135 | end 136 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Samidare 2 | 3 | Generate Embulk config and BigQuery schema from MySQL schema and run Embulk. 4 | 5 | ## Installation 6 | 7 | Add this line to your application's Gemfile: 8 | 9 | ```ruby 10 | gem 'samidare' 11 | ``` 12 | 13 | And then execute: 14 | 15 | $ bundle 16 | 17 | Or install it yourself as: 18 | 19 | $ gem install samidare 20 | 21 | ## Embulk setup 22 | `Samidare` is utility for `Embulk` . 23 | You need to install `Embulk` and install some gems like below. 24 | 25 | ```bash 26 | embulk gem install embulk-input-mysql --version 0.8.2 27 | embulk gem install embulk-output-bigquery --version 0.4.3 28 | embulk gem install embulk-parser-jsonl --version 0.2.0 29 | embulk gem install embulk-formatter-jsonl --version 0.1.4 30 | ``` 31 | 32 | ## Usage 33 | Require `database.yml` and `table.yml`. 34 | Below is a sample config file. 35 | 36 | ### database.yml 37 | ```yml 38 | db01: 39 | host: localhost 40 | username: root 41 | password: pswd 42 | database: production 43 | bq_dataset: mysql_db01 44 | 45 | db02: 46 | host: localhost 47 | username: root 48 | password: pswd 49 | database: production 50 | bq_dataset: mysql_db02 51 | 52 | ``` 53 | 54 | **Caution: Embulk doesn't allow no password for MySQL** 55 | 56 | ### table.yml 57 | ```yml 58 | db01: 59 | tables: 60 | - name: users 61 | - name: events 62 | - name: hobbies 63 | 64 | db02: 65 | tables: 66 | - name: administrators 67 | - name: configs 68 | ``` 69 | 70 | Samidare requires BigQuery parameters like below. 
71 | 72 | ```ruby 73 | [sample.rb] 74 | require 'samidare' 75 | 76 | config = { 77 | 'project_id' => 'BIGQUERY_PROJECT_ID', 78 | 'service_email' => 'SERVICE_ACCOUNT_EMAIL', 79 | 'key' => '/etc/embulk/bigquery.p12', 80 | 'schema_dir' => '/var/tmp/embulk/schema', 81 | 'config_dir' => '/var/tmp/embulk/config', 82 | 'auth_method' => 'private_key' 83 | } 84 | 85 | client = Samidare::EmbulkClient.new 86 | client.generate_config(config) 87 | client.run(config) 88 | ``` 89 | 90 | ```bash 91 | ruby sample.rb 92 | ``` 93 | 94 | ## Features 95 | ### process status 96 | `Samidare` returns the process status as a boolean. 97 | If all tables succeed, it returns `true`; otherwise it returns `false`. 98 | This is useful for controlling system flow. 99 | 100 | ```ruby 101 | process_status = Samidare::EmbulkClient.new.run(config) 102 | exit 1 unless process_status 103 | ``` 104 | 105 | ### narrow tables 106 | You can narrow the actual target tables from `table.yml` for testing or for a retry. 107 | If no target tables are given, `Samidare` will execute all tables. 108 | 109 | ```ruby 110 | # in case, all tables are ['users', 'purchases', 'items'] 111 | target_tables = ['users', 'purchases'] 112 | Samidare::EmbulkClient.new.run(config, target_tables) 113 | ``` 114 | 115 | ### retry 116 | You can set a retry count. 117 | If any table fails, only the failed tables are retried, up to the retry count. 118 | If no retry count is given, `Samidare` doesn't retry. 119 | 120 | ```ruby 121 | # 2 times retry will execute 122 | Samidare::EmbulkClient.new.run(config, [], 2) 123 | ``` 124 | 125 | ### SQL condition 126 | If you set `condition` on a table in `table.yml`, SQL is generated as shown below. 127 | This is useful for large tables. 
128 | 129 | ```yml 130 | [table.yml] 131 | production: 132 | tables: 133 | - name: users 134 | - name: events 135 | condition: created_at < CURRENT_DATE() 136 | ``` 137 | 138 | ```sql 139 | SELECT * FROM users 140 | SELECT * FROM events WHERE created_at < CURRENT_DATE() 141 | ``` 142 | 143 | ### daily snapshot 144 | BigQuery supports table wildcard expressions over a specific set of daily tables, for example, `sales20150701`. 145 | If you need a daily snapshot of a table for BigQuery, add the `daily_snapshot` option to `database.yml` or `table.yml` as shown below. 146 | In `database.yml`, the `daily_snapshot` option affects all tables. 147 | In `table.yml`, it affects only the table it is set on. 148 | **The date part is determined by the execution date.** 149 | 150 | ```yml 151 | [database.yml] 152 | production: 153 | host: localhost 154 | username: root 155 | password: pswd 156 | database: production 157 | bq_dataset: mysql 158 | daily_snapshot: true 159 | ``` 160 | 161 | ```yml 162 | [table.yml] 163 | production: 164 | tables: 165 | - name: users 166 | - name: events 167 | daily_snapshot: true 168 | - name: hobbies 169 | ``` 170 | 171 | Only `events` is renamed to `eventsYYYYMMDD` for BigQuery. 172 | 173 | ## Contributing 174 | 175 | 1. Fork it ( https://github.com/[my-github-username]/samidare/fork ) 176 | 2. Create your feature branch (`git checkout -b my-new-feature`) 177 | 3. Commit your changes (`git commit -am 'Add some feature'`) 178 | 4. Push to the branch (`git push origin my-new-feature`) 179 | 5. Create a new Pull Request 180 | --------------------------------------------------------------------------------