├── .rspec ├── lib ├── etl │ ├── version.rb │ └── helpers.rb └── etl.rb ├── .travis.yml ├── Gemfile ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── Rakefile ├── Gemfile.lock ├── etl.gemspec ├── examples ├── basic_etl.rb └── iterator_etl.rb ├── README.md └── spec └── etl_spec.rb /.rspec: -------------------------------------------------------------------------------- 1 | --color 2 | --format documentation 3 | -------------------------------------------------------------------------------- /lib/etl/version.rb: -------------------------------------------------------------------------------- 1 | class ETL 2 | VERSION = "1.1.1" 3 | end 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: ruby 2 | rvm: 3 | - 1.9.3 4 | - 2.2.0 5 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | # Specify your gem's dependencies in etl.gemspec 4 | gemspec 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | *.rbc 3 | .bundle 4 | .config 5 | .yardoc 6 | .rvmrc 7 | Gemfile.lock 8 | InstalledFiles 9 | _yardoc 10 | coverage 11 | doc/ 12 | lib/bundler/man 13 | pkg 14 | rdoc 15 | spec/reports 16 | test/tmp 17 | test/version_tmp 18 | tmp 19 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Contributing 2 | ============ 3 | 4 | If you would like to contribute code to ETL you can do so through GitHub by 5 | forking the repository and sending a pull request. 
6 | 7 | When submitting code, please make every effort to follow existing conventions 8 | and style in order to keep the code as readable as possible. 9 | 10 | Before your code can be accepted into the project you must also sign the 11 | [Individual Contributor License Agreement (CLA)][1]. 12 | 13 | 14 | [1]: https://spreadsheets.google.com/spreadsheet/viewform?formkey=dDViT2xzUHAwRkI3X3k5Z0lQM091OGc6MQ&ndplr=1 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2013 Square Inc. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env rake 2 | require "bundler/gem_tasks" 3 | begin 4 | require 'rspec/core/rake_task' 5 | 6 | RSpec::Core::RakeTask.new(:spec) do |t| 7 | t.rspec_opts = '-b' 8 | end 9 | 10 | task default: :spec 11 | rescue LoadError 12 | $stderr.puts "rspec not available, spec task not provided" 13 | end 14 | 15 | begin 16 | require 'cane/rake_task' 17 | 18 | desc "Run cane to check quality metrics" 19 | Cane::RakeTask.new(:quality) do |cane| 20 | cane.abc_max = 10 21 | cane.style_glob = "lib/**/*.rb" 22 | cane.no_doc = true 23 | end 24 | 25 | task :default => :quality 26 | rescue LoadError 27 | warn "cane not available, quality task not provided." 28 | end 29 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | PATH 2 | remote: . 3 | specs: 4 | ETL (1.1.1) 5 | activesupport (>= 3.2.3) 6 | 7 | GEM 8 | remote: https://rubygems.org/ 9 | specs: 10 | activesupport (3.2.13) 11 | i18n (= 0.6.1) 12 | multi_json (~> 1.0) 13 | cane (2.5.0) 14 | parallel 15 | diff-lcs (1.2.5) 16 | i18n (0.6.1) 17 | multi_json (1.7.3) 18 | mysql2 (0.3.17) 19 | parallel (0.6.1) 20 | rake (10.0.3) 21 | rspec (2.14.1) 22 | rspec-core (~> 2.14.0) 23 | rspec-expectations (~> 2.14.0) 24 | rspec-mocks (~> 2.14.0) 25 | rspec-core (2.14.8) 26 | rspec-expectations (2.14.5) 27 | diff-lcs (>= 1.1.3, < 2.0) 28 | rspec-mocks (2.14.6) 29 | 30 | PLATFORMS 31 | ruby 32 | 33 | DEPENDENCIES 34 | ETL! 
35 | cane 36 | mysql2 (~> 0.3.17) 37 | rake 38 | rspec (>= 2.14.0) 39 | -------------------------------------------------------------------------------- /etl.gemspec: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | require File.expand_path('../lib/etl/version', __FILE__) 3 | 4 | Gem::Specification.new do |gem| 5 | gem.authors = ["Jeff Iacono"] 6 | gem.email = ["iacono@squareup.com"] 7 | gem.description = %q{Extract, Transform, and Load (ETL) ruby wrapper} 8 | gem.summary = %q{Extract, Transform, and Load (ETL) ruby wrapper. Supports basic and iterative ETL operations.} 9 | gem.homepage = "https://github.com/square/ETL" 10 | 11 | gem.files = `git ls-files`.split($\) 12 | gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) } 13 | gem.test_files = gem.files.grep(%r{^(test|spec|features)/}) 14 | gem.name = "ETL" 15 | gem.require_paths = ["lib"] 16 | gem.version = ETL::VERSION 17 | 18 | gem.add_runtime_dependency "activesupport", [">= 3.2.3"] 19 | 20 | gem.add_development_dependency "rake" 21 | gem.add_development_dependency "cane" 22 | gem.add_development_dependency 'mysql2', '~> 0.3.17' 23 | gem.add_development_dependency "rspec", [">= 2.14.0"] 24 | end 25 | -------------------------------------------------------------------------------- /lib/etl/helpers.rb: -------------------------------------------------------------------------------- 1 | class ETL 2 | module Helpers 3 | # max_for returns the max value for the passed in column as found in the 4 | # specified database.table. If there is not currently a max, we use IFNULL 5 | # and a default value. You can specify a :default_floor value or the method 6 | # will try to derive it for you. 7 | # 8 | # Note: we try to detect if we want a date return type via the #date? 9 | # check. 10 | # 11 | # If this is found we wrap the whole SELECT clause in a DATE so it is cast 12 | # accordingly. 
13 | def max_for options = {} 14 | database = options[:database] 15 | table = options[:table] 16 | column = options[:column] 17 | 18 | default_value = options[:default_floor] || 19 | default_floor_for(column) 20 | 21 | if date? default_value 22 | default_value = "DATE('#{default_value}')" 23 | caster = ->(str) { "DATE(#{str})" } 24 | end 25 | 26 | max_sql_clause = "IFNULL(MAX(#{table}.#{column}), #{default_value})" 27 | max_sql_clause = caster.(max_sql_clause) if caster 28 | 29 | sql = <<-EOS 30 | SELECT #{max_sql_clause} AS the_max 31 | FROM #{database}.#{table} 32 | EOS 33 | sql += " WHERE #{options[:conditions]}" if options[:conditions] 34 | 35 | query(sql).to_a.first['the_max'] 36 | end 37 | 38 | private 39 | 40 | def date? val 41 | val =~ /^\d{4}-\d{1,2}-\d{1,2}( \d{2}:\d{2}:\d{2}( ((-|\+)\d+)| UTC)?)?$/ 42 | end 43 | 44 | def default_floor_for column 45 | case column 46 | when /_at$/ 47 | return '1970-01-01' 48 | when /_date$/ 49 | return '1970-01-01' 50 | when /(^id$|_id$)/ 51 | return 0 52 | else 53 | raise ArgumentError, "could not determine a default for #{column}" 54 | end 55 | end 56 | end 57 | end 58 | -------------------------------------------------------------------------------- /examples/basic_etl.rb: -------------------------------------------------------------------------------- 1 | require 'mysql2' 2 | require 'ETL' 3 | 4 | connection = Mysql2::Client.new host: 'localhost', 5 | username: 'root', 6 | password: '', 7 | database: 'some_database' 8 | 9 | # set up the source database 10 | connection.query %[ 11 | CREATE DATABASE IF NOT EXISTS some_database] 12 | 13 | connection.query %[ 14 | CREATE TABLE IF NOT EXISTS some_database.some_source_table ( 15 | user_id INT NOT NULL 16 | , created_at DATETIME NOT NULL 17 | , amount INT NOT NULL)] 18 | 19 | connection.query %[ 20 | TRUNCATE some_database.some_source_table] 21 | 22 | connection.query %[ 23 | INSERT INTO some_database.some_source_table ( 24 | user_id 25 | , created_at 26 | , amount 27 | ) 
VALUES 28 | (1, UTC_TIMESTAMP, 100) 29 | , (2, UTC_TIMESTAMP - INTERVAL 3 DAY, 200) 30 | , (2, UTC_TIMESTAMP - INTERVAL 3 DAY, 400) 31 | , (2, UTC_TIMESTAMP - INTERVAL 3 DAY, 600) 32 | , (3, UTC_TIMESTAMP - INTERVAL 3 DAY, 600) 33 | , (3, UTC_TIMESTAMP - INTERVAL 3 DAY, -100) 34 | , (3, UTC_TIMESTAMP - INTERVAL 3 DAY, 200) 35 | , (3, UTC_TIMESTAMP - INTERVAL 4 DAY, 200)] 36 | 37 | # set up the ETL 38 | etl = ETL.new(description: "a description of what this ETL does", 39 | connection: connection) 40 | 41 | # configure ETL 42 | etl.config do |etl| 43 | etl.ensure_destination do |etl| 44 | # For most ETLs you may want to ensure that the destination exists, so the 45 | # #ensure_destination block is ideally suited to fulfill this requirement. 46 | # 47 | # By way of example: 48 | # 49 | etl.query %[ 50 | CREATE TABLE IF NOT EXISTS some_database.some_destination_table ( 51 | user_id INT UNSIGNED NOT NULL 52 | , created_date DATE NOT NULL 53 | , total_amount INT SIGNED NOT NULL 54 | , message VARCHAR(100) DEFAULT NULL 55 | , PRIMARY KEY (user_id, created_date) 56 | , KEY (created_date) 57 | )] 58 | end 59 | 60 | etl.before_etl do |etl| 61 | # All pre-ETL work is performed in this block. 62 | # 63 | # This can be thought of as a before-ETL hook that will fire only once. When 64 | # you are not leveraging the ETL iteration capabilities, the value of this 65 | # block vs the #etl block is not very clear. We will see how and when to 66 | # leverage this block effectively when we introduce iteration. 67 | # 68 | # As an example, let's say we want to get rid of all entries that have an 69 | # amount less than zero before moving on to our actual etl: 70 | # 71 | etl.query %[DELETE FROM some_database.some_source_table WHERE amount < 0] 72 | end 73 | 74 | etl.etl do |etl| 75 | # Here is where the magic happens! This block contains the main ETL 76 | # operation. 
77 | # 78 | # For example: 79 | # 80 | etl.query %[ 81 | REPLACE INTO some_database.some_destination_table ( 82 | user_id 83 | , created_date 84 | , total_amount 85 | ) SELECT 86 | sst.user_id 87 | , DATE(sst.created_at) AS created_date 88 | , SUM(sst.amount) AS total_amount 89 | FROM 90 | some_database.some_source_table sst 91 | GROUP BY 92 | sst.user_id 93 | , DATE(sst.created_at)] 94 | end 95 | 96 | etl.after_etl do |etl| 97 | # All post-ETL work is performed in this block. 98 | # 99 | # Again, to finish up with an example: 100 | # 101 | etl.query %[ 102 | UPDATE some_database.some_destination_table 103 | SET message = "WOW" 104 | WHERE total_amount > 100] 105 | end 106 | end 107 | 108 | # ship it 109 | etl.run 110 | 111 | puts %[ 112 | ETL complete. Now go have a look at some_database.some_destination_table 113 | That was build from some_database.some_source_table using the above ETL configuration. 114 | 115 | SELECT * FROM some_database.some_destination_table;] 116 | -------------------------------------------------------------------------------- /lib/etl.rb: -------------------------------------------------------------------------------- 1 | require 'etl/version' 2 | require 'etl/helpers' 3 | require 'logger' 4 | require 'date' 5 | require 'time' 6 | 7 | class ETL 8 | include Helpers 9 | 10 | attr_accessor :description 11 | attr_accessor :connection 12 | attr_reader :logger 13 | 14 | ORDERED_ETL_OPERATIONS = [ 15 | :ensure_destination, 16 | :before_etl, 17 | :etl, 18 | :after_etl 19 | ] 20 | 21 | ITERATOR_OPERATIONS = [ 22 | :start, 23 | :step, 24 | :stop 25 | ] 26 | 27 | def self.connection= connection 28 | @connection = connection 29 | end 30 | 31 | def self.connection 32 | @connection 33 | end 34 | 35 | def self.defaults 36 | {connection: @connection} 37 | end 38 | 39 | def initialize attributes = {} 40 | self.class.defaults.merge(attributes).each do |key, value| 41 | self.send "#{key}=", value 42 | end 43 | default_logger! 
unless attributes.keys.include?(:logger) 44 | end 45 | 46 | def config &block 47 | yield self if block_given? 48 | self 49 | end 50 | 51 | def logger= logger 52 | @logger = logger 53 | end 54 | 55 | # A little metaprogramming to consolidate the generation of our sql 56 | # generating / querying methods. Note that we don't metaprogram the etl 57 | # operation as it's a little more complex. 58 | # 59 | # This will produce methods of the form: 60 | # 61 | # def [name] *args, &block 62 | # if block_given? 63 | # @[name] = block 64 | # else 65 | # @[name].call self, *args if @[name] 66 | # end 67 | # end 68 | # 69 | # for any given variable included in the method name's array 70 | (ORDERED_ETL_OPERATIONS - [:etl]).each do |method| 71 | define_method method do |*args, &block| 72 | warn_args_will_be_deprecated_for method unless args.empty? 73 | 74 | if block 75 | instance_variable_set("@#{method}", block) 76 | else 77 | instance_variable_get("@#{method}"). 78 | call(self, *args) if instance_variable_get("@#{method}") 79 | end 80 | end 81 | end 82 | 83 | def etl *args, &block 84 | warn_args_will_be_deprecated_for :etl unless args.empty? 85 | 86 | if block_given? 87 | @etl = block 88 | else 89 | if iterate? 90 | if @etl 91 | current = start 92 | @etl.call self, cast(current), cast(current += step) while stop >= current 93 | end 94 | else 95 | @etl.call self, *args if @etl 96 | end 97 | end 98 | end 99 | 100 | # A little more metaprogramming to consolidate the generation of 101 | # our sql generating / querying methods. 102 | # 103 | # This will produce methods of the form: 104 | # 105 | # def [method] *args, &block 106 | # if block 107 | # @_[method]_block = block 108 | # else 109 | # # cache block's result 110 | # if defined? 
@[method] 111 | # @[method] 112 | # else 113 | # @[method] = @_[method]_block.call(self, *args) 114 | # end 115 | # end 116 | # end 117 | # 118 | # for any given variable included in the method name's array 119 | ITERATOR_OPERATIONS.each do |method| 120 | define_method method do |*args, &block| 121 | warn_args_will_be_deprecated_for method unless args.empty? 122 | 123 | if block 124 | instance_variable_set("@_#{method}_block", block) 125 | else 126 | if instance_variable_defined?("@#{method}") 127 | instance_variable_get("@#{method}") 128 | else 129 | instance_variable_set("@#{method}", 130 | instance_variable_get("@_#{method}_block") 131 | .call(self, *args)) 132 | end 133 | end 134 | end 135 | end 136 | 137 | def run options = {} 138 | (ORDERED_ETL_OPERATIONS - [*options[:except]]).each do |method| 139 | send method 140 | end 141 | end 142 | 143 | def query sql 144 | time_and_log(sql: sql) do 145 | connection.query sql 146 | end 147 | end 148 | 149 | def info data = {} 150 | logger.info data.merge(emitter: self) if logger? 151 | end 152 | 153 | def debug data = {} 154 | logger.debug data.merge(emitter: self) if logger? 155 | end 156 | 157 | private 158 | 159 | def warn_args_will_be_deprecated_for method 160 | warn "DEPRECATED: passing arguments to ##{method} will be removed in an upcoming release and will raise an exception. Please remove this from your code." 161 | end 162 | 163 | def iterate? 164 | ITERATOR_OPERATIONS.all? do |method| 165 | instance_variable_defined?("@_#{method}_block") 166 | end 167 | end 168 | 169 | def default_logger! 170 | @logger = default_logger 171 | end 172 | 173 | def logger? 
174 | !!@logger 175 | end 176 | 177 | def default_logger 178 | ::Logger.new(STDOUT).tap do |logger| 179 | logger.formatter = proc do |severity, datetime, progname, msg| 180 | event_details = "[#{datetime}] #{severity} #{msg[:event_type]}" 181 | 182 | emitter_details = "\"#{msg[:emitter].description || 'no description given'}\"" 183 | emitter_details += " (object #{msg[:emitter].object_id})" 184 | 185 | leadin = "#{event_details} for #{emitter_details}" 186 | 187 | case msg[:event_type] 188 | when :query_start 189 | "#{leadin}\n#{msg[:sql]}\n" 190 | when :query_complete 191 | "#{leadin} runtime: #{msg[:runtime]}s\n" 192 | else 193 | "#{leadin}: #{msg[:message]}\n" 194 | end 195 | end 196 | end 197 | end 198 | 199 | def time_and_log data = {}, &block 200 | start_runtime = Time.now 201 | debug data.merge(event_type: :query_start) 202 | retval = yield 203 | info data.merge(event_type: :query_complete, 204 | runtime: Time.now - start_runtime) 205 | retval 206 | end 207 | 208 | # NOTE: If you needed to handle more type data type casting you can add a 209 | # case statement. 
If you need to be able to handle entirely different sets 210 | # of casting depending on database engine, you can modify #cast to take a 211 | # "type" arg and then determine which caster to route the arg through 212 | def cast arg 213 | case arg 214 | when Date then arg.strftime("%Y-%m-%d") 215 | when Time then arg.strftime("%Y-%m-%d %H:%M:%S") 216 | else 217 | arg 218 | end 219 | end 220 | end 221 | -------------------------------------------------------------------------------- /examples/iterator_etl.rb: -------------------------------------------------------------------------------- 1 | require 'mysql2' 2 | require 'ETL' 3 | 4 | connection = Mysql2::Client.new host: 'localhost', 5 | username: 'root', 6 | password: '', 7 | database: 'some_database' 8 | 9 | # set up the source database: 10 | connection.query %[ 11 | CREATE DATABASE IF NOT EXISTS some_database] 12 | 13 | connection.query %[ 14 | CREATE TABLE IF NOT EXISTS some_database.some_source_table ( 15 | user_id INT NOT NULL 16 | , created_at DATETIME NOT NULL 17 | , amount INT NOT NULL)] 18 | 19 | connection.query %[ 20 | TRUNCATE some_database.some_source_table] 21 | 22 | connection.query %[ 23 | INSERT INTO some_database.some_source_table ( 24 | user_id 25 | , created_at 26 | , amount 27 | ) VALUES 28 | (1, UTC_TIMESTAMP, 100) 29 | , (2, UTC_TIMESTAMP - INTERVAL 3 DAY, 200) 30 | , (2, UTC_TIMESTAMP - INTERVAL 3 DAY, 400) 31 | , (2, UTC_TIMESTAMP - INTERVAL 3 DAY, 600) 32 | , (3, UTC_TIMESTAMP - INTERVAL 3 DAY, 600) 33 | , (3, UTC_TIMESTAMP - INTERVAL 3 DAY, -100) 34 | , (3, UTC_TIMESTAMP - INTERVAL 3 DAY, 200) 35 | , (3, UTC_TIMESTAMP - INTERVAL 4 DAY, 200)] 36 | 37 | # set up the ETL 38 | etl = ETL.new(description: "a description of what this ETL does", 39 | connection: connection) 40 | 41 | # configure it 42 | etl.config do |etl| 43 | etl.ensure_destination do |etl| 44 | # For most ETLs you may want to ensure that the destination exists, so the 45 | # #ensure_destination block is ideally suited to 
fulfill this requirement. 46 | # 47 | # By way of example: 48 | # 49 | etl.query %[ 50 | CREATE TABLE IF NOT EXISTS some_database.some_destination_table ( 51 | user_id INT UNSIGNED NOT NULL 52 | , created_date DATE NOT NULL 53 | , total_amount INT SIGNED NOT NULL 54 | , message VARCHAR(100) DEFAULT NULL 55 | , PRIMARY KEY (user_id, created_date) 56 | , KEY (created_date) 57 | )] 58 | end 59 | 60 | etl.before_etl do |etl| 61 | # All pre-ETL work is performed in this block. 62 | # 63 | # Now that we are leveraging iteration the #before_etl block becomes 64 | # more useful as a way to execute an operation once before we begin 65 | # our iteration. 66 | # 67 | # As an example, let's say we want to get rid of all entries that have an 68 | # amount less than zero before moving on to our actual etl: 69 | # 70 | etl.query %[ 71 | DELETE FROM some_database.some_source_table 72 | WHERE amount < 0] 73 | end 74 | 75 | etl.start do |etl| 76 | # This defines where the ETL should start. This can be a flat number 77 | # or date, or even SQL / other code can be executed to produce a starting 78 | # value. 79 | # 80 | # Usually, this is the last known entry for the destination table with 81 | # some sensible default if the destination does not yet contain data. 82 | # 83 | # As an example: 84 | # 85 | # Note that we cast the default date as a DATE. If we don't, it will be 86 | # treated as a string and our iterator will fail under the hood when testing 87 | # if it is complete. 88 | res = etl.query %[ 89 | SELECT COALESCE(MAX(created_date), DATE('2010-01-01')) AS the_max 90 | FROM some_database.some_destination_table] 91 | 92 | res.to_a.first['the_max'] 93 | end 94 | 95 | etl.step do |etl| 96 | # The step block defines the size of the iteration block. To iterate by 97 | # ten records, the step block should be set to return 10. 
98 | # 99 | # As an alternative example, to set the iteration to go 10,000 units 100 | # at a time, the following value should be provided: 101 | # 102 | # 10_000 (Note: an underscore is used for readability) 103 | # 104 | # As an example, to iterate 7 days at a time: 105 | # 106 | 7 107 | end 108 | 109 | etl.stop do |etl| 110 | # The stop block defines when the iteration should halt. 111 | # Again, this can be a flat value or code. Either way, one value *must* be 112 | # returned. 113 | # 114 | # As a flat value: 115 | # 116 | # 1_000_000 117 | # 118 | # Or a date value: 119 | # 120 | # Time.now.to_date 121 | # 122 | # Or as a code example: 123 | # 124 | res = etl.query %[ 125 | SELECT DATE(MAX(created_at)) AS the_max 126 | FROM some_database.some_source_table] 127 | 128 | res.to_a.first['the_max'] 129 | end 130 | 131 | etl.etl do |etl, lbound, ubound| 132 | # The etl block is the main part of the framework. Note: there are 133 | # two extra args with the iterator this time around: "lbound" and "ubound" 134 | # 135 | # "lbound" is the lower bound of the current iteration. When iterating 136 | # from 0 to 10 and stepping by 2, the lbound would equal 2 on the 137 | # second iteration. 138 | # 139 | # "ubound" is the upper bound of the current iteration. In continuing with the 140 | # example above, when iterating from 0 to 10 and stepping by 2, the ubound would 141 | # equal 4 on the second iteration. 142 | # 143 | # These args can be used to "window" SQL queries or other code operations. 
144 | # 145 | # As a first example, to iterate over a set of ids: 146 | # 147 | # etl.query %[ 148 | # REPLACE INTO some_database.some_destination_table ( 149 | # created_date 150 | # , user_id 151 | # , total_amount 152 | # ) SELECT 153 | # DATE(sst.created_at) AS created_date 154 | # , sst.user_id 155 | # , SUM(sst.amount) AS total_amount 156 | # FROM 157 | # some_database.some_source_table sst 158 | # WHERE 159 | # sst.user_id > #{lbound} AND sst.user_id <= #{ubound} 160 | # GROUP BY 161 | # DATE(sst.created_at) 162 | # , sst.user_id] 163 | # 164 | # To "window" a SQL query using dates: 165 | # 166 | etl.query %[ 167 | REPLACE INTO some_database.some_destination_table ( 168 | created_date 169 | , user_id 170 | , total_amount 171 | ) SELECT 172 | DATE(sst.created_at) AS created_date 173 | , sst.user_id 174 | , SUM(sst.amount) AS total_amount 175 | FROM 176 | some_database.some_source_table sst 177 | WHERE 178 | -- Note the usage of quotes surrounding the lbound and ubound vars. 179 | -- This is is required when dealing with dates / datetimes 180 | sst.created_at >= '#{lbound}' AND sst.created_at < '#{ubound}' 181 | GROUP BY 182 | DATE(sst.created_at) 183 | , sst.user_id] 184 | 185 | # Note that there is no sql sanitization here so there is *potential* for SQL 186 | # injection. That being said you'll likely be using this gem in an internal 187 | # tool so hopefully your co-workers are not looking to sabotage your ETL 188 | # pipeline. Just be aware of this and handle it as you see fit. 189 | end 190 | 191 | etl.after_etl do |etl| 192 | # All post-ETL work is performed in this block. 193 | # 194 | # Again, to finish up with an example: 195 | # 196 | etl.query %[ 197 | UPDATE some_database.some_destination_table 198 | SET message = "WOW" 199 | WHERE total_amount > 100] 200 | end 201 | end 202 | 203 | etl.run 204 | 205 | puts %[ 206 | ETL complete. 
Now go have a look at some_database.some_destination_table 207 | That was build from some_database.some_source_table using the above ETL configuration. 208 | 209 | SELECT * FROM some_database.some_destination_table;] 210 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ETL 2 | 3 | Extract, transform, and load data with ruby! 4 | 5 | ## Installation 6 | 7 | Add this line to your application's Gemfile: 8 | 9 | gem 'ETL' 10 | 11 | And then execute: 12 | 13 | $ bundle 14 | 15 | Or install it yourself as: 16 | 17 | $ gem install ETL 18 | 19 | ## ETL Dependencies 20 | 21 | ETL depends on having a database connection object that __must__ respond 22 | to `#query`. The [mysql2](https://github.com/brianmario/mysql2) gem is a good option. 23 | You can also proxy another library using Ruby's `SimpleDelegator` and add a `#query` 24 | method if need be. 25 | 26 | The gem comes bundled with a default logger. If you'd like to write your own 27 | just make sure that it implements `#debug` and `#info`. For more information 28 | on what is logged and when, view the [logger details](#logger-details). 29 | 30 | ### Basic ETL 31 | 32 | Assume that we have a database connection represented by `connection`. 
33 | 34 | To run a basic ETL that is composed of sequential SQL statements, start by 35 | creating a new ETL instance: 36 | 37 | ```ruby 38 | # setting connection at the class level 39 | ETL.connection = connection 40 | 41 | etl = ETL.new(description: "a description of what this ETL does") 42 | ``` 43 | 44 | or 45 | 46 | ```ruby 47 | # setting connection at the instance level 48 | etl = ETL.new( 49 | description: "a description of what this ETL does", 50 | connection: connection 51 | ) 52 | ``` 53 | which can then be configured: 54 | 55 | ```ruby 56 | etl.config do |etl| 57 | etl.ensure_destination do |etl| 58 | # For most ETLs you may want to ensure that the destination exists, so the 59 | # #ensure_destination block is ideally suited to fulfill this requirement. 60 | # 61 | # By way of example: 62 | # 63 | etl.query %[ 64 | CREATE TABLE IF NOT EXISTS some_database.some_destination_table ( 65 | user_id INT UNSIGNED NOT NULL 66 | , created_date DATE NOT NULL 67 | , total_amount INT SIGNED NOT NULL 68 | , message VARCHAR(100) DEFAULT NULL 69 | , PRIMARY KEY (user_id, created_date) 70 | , KEY (created_date) 71 | ) 72 | ] 73 | end 74 | 75 | etl.before_etl do |etl| 76 | # All pre-ETL work is performed in this block. 77 | # 78 | # This can be thought of as a before-ETL hook that will fire only once. When 79 | # you are not leveraging the ETL iteration capabilities, the value of this 80 | # block vs the #etl block is not very clear. We will see how and when to 81 | # leverage this block effectively when we introduce iteration. 82 | # 83 | # As an example, let's say we want to get rid of all entries that have an 84 | # amount less than zero before moving on to our actual etl: 85 | # 86 | etl.query %[DELETE FROM some_database.some_source_table WHERE amount < 0] 87 | end 88 | 89 | etl.etl do |etl| 90 | # Here is where the magic happens! This block contains the main ETL 91 | # operation. 
92 | # 93 | # For example: 94 | # 95 | etl.query %[ 96 | REPLACE INTO some_database.some_destination_table ( 97 | user_id 98 | , created_date 99 | , total_amount 100 | ) SELECT 101 | user_id 102 | , DATE(created_at) AS created_date 103 | , SUM(amount) AS total_amount 104 | FROM 105 | some_database.some_source_table sst 106 | GROUP BY 107 | sst.user_id 108 | , DATE(sst.created_at) 109 | ] 110 | end 111 | 112 | etl.after_etl do |etl| 113 | # All post-ETL work is performed in this block. 114 | # 115 | # Again, to finish up with an example: 116 | # 117 | etl.query %[ 118 | UPDATE some_database.some_destination_table 119 | SET message = "WOW" 120 | WHERE total_amount > 100 121 | ] 122 | end 123 | end 124 | ``` 125 | 126 | At this point it is possible to run the ETL instance via: 127 | 128 | ```ruby 129 | etl.run 130 | ``` 131 | which executes `#ensure_destination`, `#before_etl`, `#etl`, and `#after_etl` in 132 | that order. 133 | 134 | ### ETL with iteration 135 | 136 | To add in iteration, simply supply `#start`, `#step`, and `#stop` blocks. This 137 | is useful when dealing with large data sets or when executing queries that, 138 | while optimized, are still slow. 139 | 140 | Again, to kick things off: 141 | 142 | ```ruby 143 | etl = ETL.new( 144 | description: "a description of what this ETL does", 145 | connection: connection 146 | ) 147 | ``` 148 | 149 | where `connection` is the same as described above. 150 | 151 | Next we can configure the ETL: 152 | 153 | ```ruby 154 | # assuming we have the ETL instance from above 155 | etl.config do |etl| 156 | etl.ensure_destination do |etl| 157 | # For most ETLs you may want to ensure that the destination exists, so the 158 | # #ensure_destination block is ideally suited to fulfill this requirement. 
159 | # 160 | # By way of example: 161 | # 162 | etl.query %[ 163 | CREATE TABLE IF NOT EXISTS some_database.some_destination_table ( 164 | user_id INT UNSIGNED NOT NULL 165 | , created_date DATE NOT NULL 166 | , total_amount INT SIGNED NOT NULL 167 | , message VARCHAR(100) DEFAULT NULL 168 | , PRIMARY KEY (user_id, created_date) 169 | , KEY (created_date) 170 | ) 171 | ] 172 | end 173 | 174 | etl.before_etl do |etl| 175 | # All pre-ETL work is performed in this block. 176 | # 177 | # Now that we are leveraging iteration the #before_etl block becomes 178 | # more useful as a way to execute an operation once before we begin 179 | # our iteration. 180 | # 181 | # As an example, let's say we want to get rid of all entries that have an 182 | # amount less than zero before moving on to our actual etl: 183 | # 184 | etl.query %[ 185 | DELETE FROM some_database.some_source_table 186 | WHERE amount < 0 187 | ] 188 | end 189 | 190 | etl.start do |etl| 191 | # This defines where the ETL should start. This can be a flat number 192 | # or date, or even SQL / other code can be executed to produce a starting 193 | # value. 194 | # 195 | # Usually, this is the last known entry for the destination table with 196 | # some sensible default if the destination does not yet contain data. 197 | # 198 | # As an example: 199 | # 200 | # Note that we cast the default date as a DATE. If we don't, it will be 201 | # treated as a string and our iterator will fail under the hood when testing 202 | # if it is complete. 203 | res = etl.query %[ 204 | SELECT COALESCE(MAX(created_date), DATE('2010-01-01')) AS the_max 205 | FROM some_database.some_destination_table 206 | ] 207 | 208 | res.to_a.first['the_max'] 209 | end 210 | 211 | etl.step do |etl| 212 | # The step block defines the size of the iteration block. To iterate by 213 | # ten records, the step block should be set to return 10. 
214 | # 215 | # As an alternative example, to set the iteration to go 10,000 units 216 | # at a time, the following value should be provided: 217 | # 218 | # 10_000 (Note: an underscore is used for readability) 219 | # 220 | # As an example, to iterate 7 days at a time: 221 | # 222 | 7 223 | end 224 | 225 | etl.stop do |etl| 226 | # The stop block defines when the iteration should halt. 227 | # Again, this can be a flat value or code. Either way, one value *must* be 228 | # returned. 229 | # 230 | # As a flat value: 231 | # 232 | # 1_000_000 233 | # 234 | # Or a date value: 235 | # 236 | # Time.now.to_date 237 | # 238 | # Or as a code example: 239 | # 240 | res = etl.query %[ 241 | SELECT DATE(MAX(created_at)) AS the_max 242 | FROM some_database.some_source_table 243 | ] 244 | 245 | res.to_a.first['the_max'] 246 | end 247 | 248 | etl.etl do |etl, lbound, ubound| 249 | # The etl block is the main part of the framework. Note: there are 250 | # two extra args with the iterator this time around: "lbound" and "ubound" 251 | # 252 | # "lbound" is the lower bound of the current iteration. When iterating 253 | # from 0 to 10 and stepping by 2, the lbound would equal 2 on the 254 | # second iteration. 255 | # 256 | # "ubound" is the upper bound of the current iteration. In continuing with the 257 | # example above, when iterating from 0 to 10 and stepping by 2, the ubound would 258 | # equal 4 on the second iteration. 259 | # 260 | # These args can be used to "window" SQL queries or other code operations. 
261 | # 262 | # As a first example, to iterate over a set of ids: 263 | # 264 | # etl.query %[ 265 | # REPLACE INTO some_database.some_destination_table ( 266 | # created_date 267 | # , user_id 268 | # , total_amount 269 | # ) SELECT 270 | # DATE(sst.created_at) AS created_date 271 | # , sst.user_id 272 | # , SUM(sst.amount) AS total_amount 273 | # FROM 274 | # some_database.some_source_table sst 275 | # WHERE 276 | # sst.user_id > #{lbound} AND sst.user_id <= #{ubound} 277 | # GROUP BY 278 | # DATE(sst.created_at) 279 | # , sst.user_id] 280 | # 281 | # To "window" a SQL query using dates: 282 | # 283 | etl.query %[ 284 | REPLACE INTO some_database.some_destination_table ( 285 | created_date 286 | , user_id 287 | , total_amount 288 | ) SELECT 289 | DATE(sst.created_at) AS created_date 290 | , sst.user_id 291 | , SUM(sst.amount) AS total_amount 292 | FROM 293 | some_database.some_source_table sst 294 | WHERE 295 | -- Note the usage of quotes surrounding the lbound and ubound vars. 296 | -- This is required when dealing with dates / datetimes 297 | sst.created_at >= '#{lbound}' AND sst.created_at < '#{ubound}' 298 | GROUP BY 299 | DATE(sst.created_at) 300 | , sst.user_id 301 | ] 302 | 303 | # Note that there is no sql sanitization here so there is *potential* for SQL 304 | # injection. That being said you'll likely be using this gem in an internal 305 | # tool so hopefully your co-workers are not looking to sabotage your ETL 306 | # pipeline. Just be aware of this and handle it as you see fit. 307 | end 308 | 309 | etl.after_etl do |etl| 310 | # All post-ETL work is performed in this block. 
311 | # 312 | # Again, to finish up with an example: 313 | # 314 | etl.query %[ 315 | UPDATE some_database.some_destination_table 316 | SET message = "WOW" 317 | WHERE total_amount > 100 318 | ] 319 | end 320 | end 321 | ``` 322 | 323 | At this point it is possible to run the ETL instance via: 324 | 325 | ```ruby 326 | etl.run 327 | ``` 328 | which executes `#ensure_destination`, `#before_etl`, `#etl`, and `#after_etl` in 329 | that order. 330 | 331 | Note that `#etl` executes `#start` and `#stop` once and memoizes the result for 332 | each. It then begins to iterate from what `#start` evaluated to up until what `#stop` 333 | evaluated to by what `#step` evaluates to. 334 | 335 | ## Examples 336 | 337 | There are two examples found in `./examples` that demonstrate the basic ETL and 338 | iteration ETL. Each file uses the [mysql2](https://github.com/brianmario/mysql2) 339 | gem and reads / writes data to localhost using the root user with no password. 340 | Adjust as needed. 341 | 342 | ## Logger Details 343 | 344 | A logger must support two methods: `#info` and `#debug`. 345 | 346 | Both methods should accept a single hash argument. The argument will contain: 347 | 348 | - `:emitter` => a reference to the ETL instance's `self` 349 | - `:event_type` => a symbol that indicates the type of event being logged. You 350 | can use this value to derive which other data you'll have available 351 | 352 | When `:event_type` is equal to `:query_start`, you'll have the following 353 | available in the hash argument: 354 | 355 | - `:sql` => the sql that is going to be run 356 | 357 | These events are logged at the debug level. 358 | 359 | When `:event_type` is equal to `:query_complete`, you'll have the following 360 | available in the hash argument: 361 | 362 | - `:sql` => the sql that was run 363 | - `:runtime` => how long the query took to execute 364 | 365 | These events are logged at the info level. 
366 | 367 | Following from this you could implement a simple logger as: 368 | 369 | ```ruby 370 | class PutsLogger 371 | def info data 372 | @data = data 373 | write! 374 | end 375 | 376 | def debug data 377 | @data = data 378 | write! 379 | end 380 | 381 | private 382 | 383 | def write! 384 | case (event_type = @data.delete(:event_type)) 385 | when :query_start 386 | output = "#{@data[:emitter].description} is about to run\n" 387 | output += "#{@data[:sql]}\n" 388 | when :query_complete 389 | output = "#{@data[:emitter].description} executed:\n" 390 | output += "#{@data[:sql]}\n" 391 | output += "query completed at #{Time.now} and took #{@data[:runtime]}s\n" 392 | else 393 | output = "no special logging for #{event_type} event_type yet\n" 394 | end 395 | puts output 396 | @data = nil 397 | end 398 | end 399 | ``` 400 | 401 | ## Contributing 402 | 403 | If you would like to contribute code to ETL you can do so through GitHub by 404 | forking the repository and sending a pull request. 405 | 406 | When submitting code, please make every effort to follow existing conventions 407 | and style in order to keep the code as readable as possible. 408 | 409 | Before your code can be accepted into the project you must also sign the 410 | [Individual Contributor License Agreement (CLA)][1]. 411 | 412 | 413 | [1]: https://spreadsheets.google.com/spreadsheet/viewform?formkey=dDViT2xzUHAwRkI3X3k5Z0lQM091OGc6MQ&ndplr=1 414 | 415 | ## License 416 | 417 | Copyright 2013 Square Inc. 418 | 419 | Licensed under the Apache License, Version 2.0 (the "License"); 420 | you may not use this file except in compliance with the License. 421 | You may obtain a copy of the License at 422 | 423 | http://www.apache.org/licenses/LICENSE-2.0 424 | 425 | Unless required by applicable law or agreed to in writing, software 426 | distributed under the License is distributed on an "AS IS" BASIS, 427 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
428 | See the License for the specific language governing permissions and 429 | limitations under the License. 430 | -------------------------------------------------------------------------------- /spec/etl_spec.rb: -------------------------------------------------------------------------------- 1 | require 'mysql2' 2 | require 'active_support/time' 3 | require 'etl' 4 | 5 | def test_connection 6 | Mysql2::Client.new( 7 | host: 'localhost', 8 | username: 'root', 9 | database: 'etl_test' 10 | ) 11 | end 12 | 13 | def reset_test_env connection, &block 14 | connection.query %[DROP DATABASE IF EXISTS etl_test] 15 | connection.query %[CREATE DATABASE etl_test] 16 | connection.query %[USE etl_test] 17 | 18 | if block_given? 19 | yield connection 20 | else 21 | connection.query %[ 22 | CREATE TABLE etl_source ( 23 | id INT NOT NULL 24 | , name VARCHAR(10) 25 | , amount INT(11) DEFAULT 0 26 | , PRIMARY KEY (id) 27 | ) 28 | ] 29 | 30 | connection.query %[ 31 | INSERT INTO etl_test.etl_source (id, name, amount) 32 | VALUES 33 | (1, 'Jeff', 100), 34 | (2, 'Ryan', 50), 35 | (3, 'Jack', 75), 36 | (4, 'Jeff', 10), 37 | (5, 'Jack', 45), 38 | (6, 'Nick', -90), 39 | (7, 'Nick', 90) 40 | ] 41 | end 42 | end 43 | 44 | describe ETL do 45 | let(:logger) { nil } 46 | 47 | describe "deprecations" do 48 | let(:etl) { described_class.new } 49 | 50 | context "#ensure_destination" do 51 | it "does not warn when no args are passed" do 52 | expect(etl).to receive(:warn).never 53 | etl.ensure_destination {} 54 | end 55 | 56 | it "warns when args are passed that this is deprecated" do 57 | expect(etl).to receive(:warn).with("DEPRECATED: passing arguments to #ensure_destination will be removed in an upcoming release and will raise an exception. 
Please remove this from your code.") 58 | etl.ensure_destination('some arg') {} 59 | end 60 | end 61 | 62 | context "#before_etl" do 63 | it "does not warn when no args are passed" do 64 | expect(etl).to receive(:warn).never 65 | etl.before_etl {} 66 | end 67 | 68 | it "warns when args are passed that this is deprecated" do 69 | expect(etl).to receive(:warn).with("DEPRECATED: passing arguments to #before_etl will be removed in an upcoming release and will raise an exception. Please remove this from your code.") 70 | etl.before_etl('some arg') {} 71 | end 72 | end 73 | 74 | context "#start" do 75 | it "does not warn when no args are passed" do 76 | expect(etl).to receive(:warn).never 77 | etl.start {} 78 | end 79 | 80 | it "warns when args are passed that this is deprecated" do 81 | expect(etl).to receive(:warn).with("DEPRECATED: passing arguments to #start will be removed in an upcoming release and will raise an exception. Please remove this from your code.") 82 | etl.start('some arg') {} 83 | end 84 | end 85 | 86 | context "#step" do 87 | it "does not warn when no args are passed" do 88 | expect(etl).to receive(:warn).never 89 | etl.step {} 90 | end 91 | 92 | it "warns when args are passed that this is deprecated" do 93 | expect(etl).to receive(:warn).with("DEPRECATED: passing arguments to #step will be removed in an upcoming release and will raise an exception. Please remove this from your code.") 94 | etl.step('some arg') {} 95 | end 96 | end 97 | 98 | context "#stop" do 99 | it "does not warn when no args are passed" do 100 | expect(etl).to receive(:warn).never 101 | etl.stop {} 102 | end 103 | 104 | it "warns when args are passed that this is deprecated" do 105 | expect(etl).to receive(:warn).with("DEPRECATED: passing arguments to #stop will be removed in an upcoming release and will raise an exception. 
Please remove this from your code.") 106 | etl.stop('some arg') {} 107 | end 108 | end 109 | 110 | context "#etl" do 111 | it "does not warn when no args are passed" do 112 | expect(etl).to receive(:warn).never 113 | etl.etl {} 114 | end 115 | 116 | it "warns when args are passed that this is deprecated" do 117 | expect(etl).to receive(:warn).with("DEPRECATED: passing arguments to #etl will be removed in an upcoming release and will raise an exception. Please remove this from your code.") 118 | etl.etl('some arg') {} 119 | end 120 | end 121 | 122 | context "#after_etl" do 123 | it "does not warn when no args are passed" do 124 | expect(etl).to receive(:warn).never 125 | etl.after_etl {} 126 | end 127 | 128 | it "warns when args are passed that this is deprecated" do 129 | expect(etl).to receive(:warn).with("DEPRECATED: passing arguments to #after_etl will be removed in an upcoming release and will raise an exception. Please remove this from your code.") 130 | etl.after_etl('some arg') {} 131 | end 132 | end 133 | end 134 | 135 | describe ".connection=" do 136 | let(:class_level_connection) { double('class_level_connection') } 137 | 138 | it "sets the #connection for all instances" do 139 | ETL.connection = class_level_connection 140 | etl = ETL.new 141 | expect(etl.connection).to eq(class_level_connection) 142 | end 143 | 144 | it "allows instance-level overrides" do 145 | instance_level_connection = double('instance_level_connection') 146 | ETL.connection = class_level_connection 147 | etl_with_connection_override = ETL.new(connection: instance_level_connection) 148 | etl = ETL.new 149 | expect(etl.connection).to eq class_level_connection 150 | expect(etl_with_connection_override.connection).to eq(instance_level_connection) 151 | end 152 | end 153 | 154 | describe "#logger=" do 155 | let(:etl) { described_class.new connection: double } 156 | 157 | it 'assigns' do 158 | logger = double 159 | etl.logger = logger 160 | expect(etl.logger).to eq(logger) 161 | end 162 | 
end 163 | 164 | describe '#max_for' do 165 | let(:connection) { test_connection } 166 | let(:etl) { described_class.new(connection: connection, logger: logger) } 167 | 168 | before do 169 | client = Mysql2::Client.new host: 'localhost', username: 'root' 170 | client.query %[DROP DATABASE IF EXISTS etl_test] 171 | client.query %[CREATE DATABASE etl_test] 172 | client.query %[USE etl_test] 173 | client.query %[ 174 | CREATE TABLE IF NOT EXISTS etl_source ( 175 | id INT(11) NOT NULL AUTO_INCREMENT 176 | , name VARCHAR(10) 177 | , amount INT(11) DEFAULT 0 178 | , the_date DATE DEFAULT NULL 179 | , the_null_date DATE DEFAULT NULL 180 | , the_time_at DATETIME DEFAULT NULL 181 | , the_null_time_at DATETIME DEFAULT NULL 182 | , PRIMARY KEY (id) 183 | ) 184 | ] 185 | 186 | client.query %[ 187 | INSERT INTO etl_source ( 188 | name 189 | , amount 190 | , the_date 191 | , the_null_date 192 | , the_time_at 193 | , the_null_time_at 194 | ) VALUES 195 | ('Jeff', 100, '2012-01-02', NULL, '2012-01-02 00:00:01', NULL) 196 | , ('Ryan', 50, '2012-01-01', NULL, '2012-01-01 00:00:00', NULL) 197 | , ('Jack', 75, '2012-01-01', NULL, '2012-01-01 00:00:00', NULL) 198 | , ('Jeff', 10, '2012-01-01', NULL, '2012-01-01 00:00:00', NULL) 199 | , ('Jack', 45, '2012-01-01', NULL, '2012-01-01 00:00:00', NULL) 200 | , ('Nick', -90, '2012-01-01', NULL, '2012-01-01 00:00:00', NULL) 201 | , ('Nick', 90, '2012-01-01', NULL, '2012-01-01 00:00:00', NULL) 202 | ] 203 | 204 | client.close 205 | end 206 | 207 | after { connection.close } 208 | 209 | it "finds the max for dates" do 210 | expect(etl.max_for(database: :etl_test, 211 | table: :etl_source, 212 | column: :the_date)).to eq(Date.parse('2012-01-02')) 213 | end 214 | 215 | it "defaults to the beginning of time date when a max date cannot be found" do 216 | expect( 217 | etl.max_for( 218 | database: :etl_test, 219 | table: :etl_source, 220 | column: :the_null_date 221 | ) 222 | ).to eq(Date.parse('1970-01-01')) 223 | end 224 | 225 | it "defaults to the 
specified default floor when a max date cannot be found" do 226 | expect( 227 | etl.max_for( 228 | database: :etl_test, 229 | table: :etl_source, 230 | column: :the_null_date, 231 | default_floor: '2011-01-01' 232 | ) 233 | ).to eq(Date.parse('2011-01-01')) 234 | end 235 | 236 | it "finds the max for datetimes" do 237 | expect( 238 | etl.max_for( 239 | database: :etl_test, 240 | table: :etl_source, 241 | column: :the_time_at 242 | ) 243 | ).to eq(Date.parse('2012-01-02')) 244 | end 245 | 246 | it "defaults to the beginning of time when a max datetime cannot be found" do 247 | expect( 248 | etl.max_for( 249 | database: :etl_test, 250 | table: :etl_source, 251 | column: :the_null_time_at 252 | ) 253 | ).to eq(Date.parse('1970-01-01 00:00:00')) 254 | end 255 | 256 | it "defaults to the specified default floor when a max datetime cannot be found" do 257 | expect( 258 | etl.max_for( 259 | database: :etl_test, 260 | table: :etl_source, 261 | column: :the_null_time_at, 262 | default_floor: '2011-01-01 00:00:00' 263 | ) 264 | ).to eq(Date.parse('2011-01-01 00:00:00')) 265 | end 266 | 267 | it "raises an error if a non-standard column is supplied with no default floor" do 268 | expect { 269 | etl.max_for( 270 | database: :etl_test, 271 | table: :etl_source, 272 | column: :amount 273 | ) 274 | }.to raise_exception 275 | end 276 | 277 | it "finds the max for a non-standard column, using the default floor" do 278 | expect( 279 | etl.max_for( 280 | database: :etl_test, 281 | table: :etl_source, 282 | column: :amount, 283 | default_floor: 0 284 | ) 285 | ).to eq(100) 286 | end 287 | end 288 | 289 | describe '#run' do 290 | let(:connection) { test_connection } 291 | let(:etl) { described_class.new connection: connection, logger: logger } 292 | 293 | before do 294 | client = Mysql2::Client.new(host: 'localhost', username: 'root') 295 | client.query %[DROP DATABASE IF EXISTS etl_test] 296 | client.query %[CREATE DATABASE etl_test] 297 | client.query %[USE etl_test] 298 | 
client.query %[ 299 | CREATE TABLE IF NOT EXISTS etl_source ( 300 | id INT(11) NOT NULL AUTO_INCREMENT 301 | , name VARCHAR(10) 302 | , amount INT(11) DEFAULT 0 303 | , PRIMARY KEY (id) 304 | ) 305 | ] 306 | 307 | client.query %[ 308 | INSERT INTO etl_source (name, amount) 309 | VALUES 310 | ('Jeff', 100), 311 | ('Ryan', 50), 312 | ('Jack', 75), 313 | ('Jeff', 10), 314 | ('Jack', 45), 315 | ('Nick', -90), 316 | ('Nick', 90) 317 | ] 318 | 319 | client.close 320 | end 321 | 322 | it "executes the specified sql in the appropriate order" do 323 | etl.ensure_destination do |etl| 324 | etl.query %[ 325 | CREATE TABLE IF NOT EXISTS etl_destination ( 326 | name VARCHAR(10) 327 | , total_amount INT(11) DEFAULT 0 328 | , PRIMARY KEY (name) 329 | ) 330 | ] 331 | end 332 | 333 | etl.before_etl do |etl| 334 | etl.query "DELETE FROM etl_source WHERE amount < 0" 335 | end 336 | 337 | etl.etl do |etl| 338 | etl.query %[ 339 | REPLACE INTO etl_destination 340 | SELECT name, SUM(amount) FROM etl_source 341 | GROUP BY name 342 | ] 343 | end 344 | 345 | etl.after_etl do |etl| 346 | etl.query %[ 347 | UPDATE etl_destination 348 | SET name = CONCAT("SUPER ", name) 349 | WHERE total_amount > 115 350 | ] 351 | end 352 | 353 | etl.run 354 | 355 | expect( 356 | connection 357 | .query("SELECT * FROM etl_destination ORDER BY total_amount DESC") 358 | .to_a 359 | ).to eq( 360 | [ 361 | {'name' => 'SUPER Jack', 'total_amount' => 120}, 362 | {'name' => 'Jeff', 'total_amount' => 110}, 363 | {'name' => 'Nick', 'total_amount' => 90}, 364 | {'name' => 'Ryan', 'total_amount' => 50} 365 | ] 366 | ) 367 | end 368 | end 369 | 370 | describe '#run with operations specified for exclusion' do 371 | let(:connection) { double } 372 | let(:etl) { described_class.new connection: connection, logger: logger } 373 | 374 | it "does not call the specified method" do 375 | etl.ensure_destination {} 376 | expect(etl).not_to receive(:ensure_destination) 377 | etl.run(except: :ensure_destination) 378 | end 379 | end 
380 | 381 | context "with iteration" do 382 | describe '#run over full table' do 383 | let(:connection) { test_connection } 384 | let(:etl) { described_class.new(connection: connection, logger: logger) } 385 | 386 | before { reset_test_env connection } 387 | after { connection.close } 388 | 389 | it "executes the specified sql in the appropriate order and ETLs properly" do 390 | etl.ensure_destination do |etl| 391 | etl.query %[ 392 | CREATE TABLE etl_destination ( 393 | id INT NOT NULL 394 | , name VARCHAR(10) 395 | , amount INT(11) DEFAULT 0 396 | , PRIMARY KEY (id) 397 | ) 398 | ] 399 | end 400 | 401 | etl.before_etl do |etl| 402 | etl.query "DELETE FROM etl_source WHERE amount < 0" 403 | end 404 | 405 | etl.start do |etl| 406 | etl.query( 407 | "SELECT COALESCE(MAX(id), 0) AS the_start FROM etl_destination" 408 | ).to_a.first['the_start'] 409 | end 410 | 411 | etl.step do 412 | 1 413 | end 414 | 415 | etl.stop do |etl| 416 | etl.query( 417 | "SELECT MAX(id) AS the_stop FROM etl_source" 418 | ).to_a.first['the_stop'] 419 | end 420 | 421 | etl.etl do |etl, lbound, ubound| 422 | etl.query %[ 423 | REPLACE INTO etl_destination 424 | SELECT id, name, amount FROM etl_source s 425 | WHERE s.id >= #{lbound} 426 | AND s.id < #{ubound} 427 | ] 428 | end 429 | 430 | etl.after_etl do |etl| 431 | etl.query %[ 432 | UPDATE etl_destination 433 | SET name = CONCAT("SUPER ", name) 434 | WHERE id <= 1 435 | ] 436 | end 437 | 438 | etl.run 439 | 440 | expect( 441 | connection 442 | .query("SELECT * FROM etl_destination ORDER BY id ASC") 443 | .to_a 444 | ).to eq( 445 | [ 446 | {'id' => 1, 'name' => 'SUPER Jeff', 'amount' => 100}, 447 | {'id' => 2, 'name' => 'Ryan', 'amount' => 50}, 448 | {'id' => 3, 'name' => 'Jack', 'amount' => 75}, 449 | {'id' => 4, 'name' => 'Jeff', 'amount' => 10}, 450 | {'id' => 5, 'name' => 'Jack', 'amount' => 45}, 451 | {'id' => 7, 'name' => 'Nick', 'amount' => 90} 452 | ] 453 | ) 454 | end 455 | end 456 | 457 | describe '#run over part of table' do 458 | 
let(:connection) { test_connection } 459 | let(:etl) { described_class.new(connection: connection, logger: logger) } 460 | 461 | before { reset_test_env connection } 462 | after { connection.close } 463 | 464 | it "executes the specified sql in the appropriate order and ETLs properly" do 465 | etl.ensure_destination do |etl| 466 | etl.query %[ 467 | CREATE TABLE etl_destination ( 468 | id INT NOT NULL 469 | , name VARCHAR(10) 470 | , amount INT(11) DEFAULT 0 471 | , PRIMARY KEY (id) 472 | ) 473 | ] 474 | end 475 | 476 | etl.before_etl do |etl| 477 | etl.query "DELETE FROM etl_source WHERE amount < 0" 478 | end 479 | 480 | etl.start do 481 | 4 482 | end 483 | 484 | etl.step do 485 | 1 486 | end 487 | 488 | etl.stop do |etl| 489 | etl.query( 490 | "SELECT MAX(id) AS the_stop FROM etl_source" 491 | ).to_a.first['the_stop'] 492 | end 493 | 494 | etl.etl do |etl, lbound, ubound| 495 | etl.query %[ 496 | REPLACE INTO etl_destination 497 | SELECT id, name, amount FROM etl_source s 498 | WHERE s.id >= #{lbound} 499 | AND s.id < #{ubound} 500 | ] 501 | end 502 | 503 | etl.run 504 | 505 | expect( 506 | connection 507 | .query("SELECT * FROM etl_destination ORDER BY id ASC") 508 | .to_a 509 | ).to eq( 510 | [ 511 | {'id' => 4, 'name' => 'Jeff', 'amount' => 10}, 512 | {'id' => 5, 'name' => 'Jack', 'amount' => 45}, 513 | {'id' => 7, 'name' => 'Nick', 'amount' => 90} 514 | ] 515 | ) 516 | end 517 | end 518 | 519 | describe "#run over gappy data" do 520 | let(:connection) { test_connection } 521 | let(:etl) { described_class.new(connection: connection, logger: logger) } 522 | 523 | before do 524 | reset_test_env(connection) do |connection| 525 | connection.query %[ 526 | CREATE TABLE etl_source ( 527 | id INT NOT NULL 528 | , name VARCHAR(10) 529 | , amount INT(11) DEFAULT 0 530 | , PRIMARY KEY (id) 531 | ) 532 | ] 533 | 534 | connection.query %[ 535 | INSERT INTO etl_source (id, name, amount) 536 | VALUES 537 | (1, 'Jeff', 100), 538 | (2, 'Ryan', 50), 539 | (13, 'Jack', 75), 540 
| (14, 'Jeff', 10), 541 | (15, 'Jack', 45), 542 | (16, 'Nick', -90), 543 | (17, 'Nick', 90) 544 | ] 545 | end 546 | end 547 | 548 | after { connection.close } 549 | 550 | it "executes the specified sql in the appropriate order without getting stuck" do 551 | etl.ensure_destination do |etl| 552 | etl.query %[ 553 | CREATE TABLE etl_destination ( 554 | id INT NOT NULL 555 | , name VARCHAR(10) 556 | , amount INT(11) DEFAULT 0 557 | , PRIMARY KEY (id) 558 | ) 559 | ] 560 | end 561 | 562 | etl.before_etl do |etl| 563 | etl.query "DELETE FROM etl_source WHERE amount < 0" 564 | end 565 | 566 | etl.start do |etl| 567 | 1 568 | end 569 | 570 | etl.step do 571 | 1 572 | end 573 | 574 | etl.stop do |etl| 575 | etl.query( 576 | "SELECT MAX(id) AS the_stop FROM etl_source" 577 | ).to_a.first['the_stop'] 578 | end 579 | 580 | etl.etl do |etl, lbound, ubound| 581 | etl.query %[ 582 | REPLACE INTO etl_destination 583 | SELECT 584 | id 585 | , name 586 | , amount 587 | FROM etl_source s 588 | WHERE s.id >= #{lbound} 589 | AND s.id < #{ubound} 590 | ] 591 | end 592 | 593 | etl.run 594 | 595 | expect( 596 | connection 597 | .query("SELECT * FROM etl_destination ORDER BY id ASC") 598 | .to_a 599 | ).to eq( 600 | [ 601 | {'id' => 1, 'name' => 'Jeff', 'amount' => 100}, 602 | {'id' => 2, 'name' => 'Ryan', 'amount' => 50}, 603 | {'id' => 13, 'name' => 'Jack', 'amount' => 75}, 604 | {'id' => 14, 'name' => 'Jeff', 'amount' => 10}, 605 | {'id' => 15, 'name' => 'Jack', 'amount' => 45}, 606 | {'id' => 17, 'name' => 'Nick', 'amount' => 90} 607 | ] 608 | ) 609 | end 610 | end 611 | 612 | describe "#run over date data" do 613 | let(:connection) { test_connection } 614 | let(:etl) { described_class.new(connection: connection, logger: logger) } 615 | 616 | before do 617 | reset_test_env(connection) do |connection| 618 | connection.query %[ 619 | CREATE TABLE etl_source ( 620 | the_date DATE NOT NULL 621 | , name VARCHAR(10) 622 | , amount INT(11) DEFAULT 0 623 | ) 624 | ] 625 | 626 | 
connection.query %[ 627 | INSERT INTO etl_source (the_date, name, amount) 628 | VALUES 629 | ('2012-01-01', 'Jeff', 100), 630 | ('2012-01-01', 'Ryan', 50), 631 | ('2012-01-01', 'Jack', 75), 632 | ('2012-01-01', 'Jeff', 10), 633 | ('2012-01-02', 'Jack', 45), 634 | ('2012-01-02', 'Nick', -90), 635 | ('2012-01-02', 'Nick', 90) 636 | ] 637 | end 638 | end 639 | 640 | after { connection.close } 641 | 642 | it "executes the specified sql in the appropriate order and ETLs properly" do 643 | etl.ensure_destination do |etl| 644 | etl.query %[ 645 | CREATE TABLE etl_destination ( 646 | the_date DATE NOT NULL 647 | , name VARCHAR(10) 648 | , total_amount INT(11) DEFAULT 0 649 | , PRIMARY KEY (the_date, name) 650 | ) 651 | ] 652 | end 653 | 654 | etl.before_etl do |etl| 655 | etl.query "DELETE FROM etl_source WHERE amount < 0" 656 | end 657 | 658 | etl.start do |etl| 659 | etl.query(%[ 660 | SELECT COALESCE(MAX(the_date), DATE('2012-01-01')) AS the_start 661 | FROM etl_destination 662 | ]).to_a.first['the_start'] 663 | end 664 | 665 | etl.step do 666 | 1.day 667 | end 668 | 669 | etl.stop do |etl| 670 | etl.query( 671 | "SELECT MAX(the_date) AS the_stop FROM etl_source" 672 | ).to_a.first['the_stop'] 673 | end 674 | 675 | etl.etl do |etl, lbound, ubound| 676 | etl.query %[ 677 | REPLACE INTO etl_destination 678 | SELECT 679 | the_date 680 | , name 681 | , SUM(amount) AS total_amount 682 | FROM etl_source s 683 | WHERE s.the_date >= '#{lbound}' 684 | AND s.the_date < '#{ubound}' 685 | GROUP BY 686 | the_date 687 | , name 688 | ] 689 | end 690 | 691 | etl.run 692 | 693 | expect( 694 | connection.query(%[ 695 | SELECT 696 | the_date 697 | , name 698 | , total_amount 699 | FROM 700 | etl_destination 701 | ORDER BY 702 | the_date ASC 703 | , name ASC 704 | ]).to_a 705 | ).to eq( 706 | [ 707 | {'the_date' => Date.parse('2012-01-01'), 'name' => 'Jack', 'total_amount' => 75}, 708 | {'the_date' => Date.parse('2012-01-01'), 'name' => 'Jeff', 'total_amount' => 110}, 709 | {'the_date' => 
Date.parse('2012-01-01'), 'name' => 'Ryan', 'total_amount' => 50}, 710 | {'the_date' => Date.parse('2012-01-02'), 'name' => 'Jack', 'total_amount' => 45}, 711 | {'the_date' => Date.parse('2012-01-02'), 'name' => 'Nick', 'total_amount' => 90} 712 | ] 713 | ) 714 | end 715 | end 716 | 717 | describe "#run over datetime data" do 718 | let(:connection) { test_connection } 719 | let(:etl) { described_class.new(connection: connection, logger: logger) } 720 | 721 | before do 722 | reset_test_env(connection) do |connection| 723 | connection.query %[ 724 | CREATE TABLE etl_source ( 725 | the_datetime DATETIME NOT NULL 726 | , name VARCHAR(10) 727 | , amount INT(11) DEFAULT 0 728 | ) 729 | ] 730 | 731 | connection.query %[ 732 | INSERT INTO etl_source (the_datetime, name, amount) 733 | VALUES 734 | ('2011-12-31 23:59:59', 'Jeff', 100), 735 | ('2012-01-01 00:01:00', 'Ryan', 50), 736 | ('2012-01-01 00:01:01', 'Jack', 75), 737 | ('2012-01-01 00:01:02', 'Jeff', 10), 738 | ('2012-01-02 00:02:00', 'Jack', 45), 739 | ('2012-01-02 00:02:01', 'Nick', -90), 740 | ('2012-01-02 00:02:02', 'Nick', 90) 741 | ] 742 | end 743 | end 744 | 745 | after { connection.close } 746 | 747 | it "executes the specified sql in the appropriate order and ETLs properly" do 748 | etl.ensure_destination do |etl| 749 | etl.query %[ 750 | CREATE TABLE etl_destination ( 751 | the_datetime DATETIME NOT NULL 752 | , name VARCHAR(10) 753 | , amount INT(11) DEFAULT 0 754 | , PRIMARY KEY (the_datetime, name) 755 | ) 756 | ] 757 | end 758 | 759 | etl.before_etl do |etl| 760 | etl.query "DELETE FROM etl_source WHERE amount < 0" 761 | end 762 | 763 | etl.start do |etl| 764 | etl.query(%[ 765 | SELECT CAST(COALESCE(MAX(the_datetime), '2012-01-01 00:00:00') AS DATETIME) AS the_start 766 | FROM etl_destination 767 | ]).to_a.first['the_start'] 768 | end 769 | 770 | etl.step do 771 | 1.minute 772 | end 773 | 774 | etl.stop do |etl| 775 | etl.query( 776 | "SELECT MAX(the_datetime) AS the_stop FROM etl_source" 777 | 
).to_a.first['the_stop'] 778 | end 779 | 780 | etl.etl do |etl, lbound, ubound| 781 | etl.query %[ 782 | REPLACE INTO etl_destination 783 | SELECT 784 | the_datetime 785 | , name 786 | , amount 787 | FROM etl_source s 788 | WHERE s.the_datetime >= '#{lbound}' 789 | AND s.the_datetime < '#{ubound}' 790 | ] 791 | end 792 | 793 | etl.run 794 | 795 | expect( 796 | connection.query(%[ 797 | SELECT 798 | the_datetime 799 | , name 800 | , amount 801 | FROM 802 | etl_destination 803 | ORDER BY 804 | the_datetime ASC 805 | , name ASC 806 | ]).to_a 807 | ).to eq( 808 | [ 809 | {'the_datetime' => Time.parse('2012-01-01 00:01:00'), 'name' => 'Ryan', 'amount' => 50}, 810 | {'the_datetime' => Time.parse('2012-01-01 00:01:01'), 'name' => 'Jack', 'amount' => 75}, 811 | {'the_datetime' => Time.parse('2012-01-01 00:01:02'), 'name' => 'Jeff', 'amount' => 10}, 812 | {'the_datetime' => Time.parse('2012-01-02 00:02:00'), 'name' => 'Jack', 'amount' => 45}, 813 | {'the_datetime' => Time.parse('2012-01-02 00:02:02'), 'name' => 'Nick', 'amount' => 90} 814 | ] 815 | ) 816 | end 817 | end 818 | end 819 | end 820 | --------------------------------------------------------------------------------