├── .rspec ├── lib ├── etl │ ├── version.rb │ └── helpers.rb └── etl.rb ├── .travis.yml ├── Gemfile ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── Rakefile ├── Gemfile.lock ├── etl.gemspec ├── examples ├── basic_etl.rb └── iterator_etl.rb ├── README.md └── spec └── etl_spec.rb /.rspec: -------------------------------------------------------------------------------- 1 | --color 2 | --format documentation 3 | -------------------------------------------------------------------------------- /lib/etl/version.rb: -------------------------------------------------------------------------------- 1 | class ETL 2 | VERSION = "1.1.1" 3 | end 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: ruby 2 | rvm: 3 | - 1.9.3 4 | - 2.2.0 5 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | # Specify your gem's dependencies in etl.gemspec 4 | gemspec 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | *.rbc 3 | .bundle 4 | .config 5 | .yardoc 6 | .rvmrc 7 | Gemfile.lock 8 | InstalledFiles 9 | _yardoc 10 | coverage 11 | doc/ 12 | lib/bundler/man 13 | pkg 14 | rdoc 15 | spec/reports 16 | test/tmp 17 | test/version_tmp 18 | tmp 19 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Contributing 2 | ============ 3 | 4 | If you would like to contribute code to ETL you can do so through GitHub by 5 | forking the repository and sending a pull request. 
6 | 7 | When submitting code, please make every effort to follow existing conventions 8 | and style in order to keep the code as readable as possible. 9 | 10 | Before your code can be accepted into the project you must also sign the 11 | [Individual Contributor License Agreement (CLA)][1]. 12 | 13 | 14 | [1]: https://spreadsheets.google.com/spreadsheet/viewform?formkey=dDViT2xzUHAwRkI3X3k5Z0lQM091OGc6MQ&ndplr=1 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2013 Square Inc. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env rake 2 | require "bundler/gem_tasks" 3 | begin 4 | require 'rspec/core/rake_task' 5 | 6 | RSpec::Core::RakeTask.new(:spec) do |t| 7 | t.rspec_opts = '-b' 8 | end 9 | 10 | task default: :spec 11 | rescue LoadError 12 | $stderr.puts "rspec not available, spec task not provided" 13 | end 14 | 15 | begin 16 | require 'cane/rake_task' 17 | 18 | desc "Run cane to check quality metrics" 19 | Cane::RakeTask.new(:quality) do |cane| 20 | cane.abc_max = 10 21 | cane.style_glob = "lib/**/*.rb" 22 | cane.no_doc = true 23 | end 24 | 25 | task :default => :quality 26 | rescue LoadError 27 | warn "cane not available, quality task not provided." 28 | end 29 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | PATH 2 | remote: . 3 | specs: 4 | ETL (1.1.1) 5 | activesupport (>= 3.2.3) 6 | 7 | GEM 8 | remote: https://rubygems.org/ 9 | specs: 10 | activesupport (3.2.13) 11 | i18n (= 0.6.1) 12 | multi_json (~> 1.0) 13 | cane (2.5.0) 14 | parallel 15 | diff-lcs (1.2.5) 16 | i18n (0.6.1) 17 | multi_json (1.7.3) 18 | mysql2 (0.3.17) 19 | parallel (0.6.1) 20 | rake (10.0.3) 21 | rspec (2.14.1) 22 | rspec-core (~> 2.14.0) 23 | rspec-expectations (~> 2.14.0) 24 | rspec-mocks (~> 2.14.0) 25 | rspec-core (2.14.8) 26 | rspec-expectations (2.14.5) 27 | diff-lcs (>= 1.1.3, < 2.0) 28 | rspec-mocks (2.14.6) 29 | 30 | PLATFORMS 31 | ruby 32 | 33 | DEPENDENCIES 34 | ETL! 
35 | cane 36 | mysql2 (~> 0.3.17) 37 | rake 38 | rspec (>= 2.14.0) 39 | -------------------------------------------------------------------------------- /etl.gemspec: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | require File.expand_path('../lib/etl/version', __FILE__) 3 | 4 | Gem::Specification.new do |gem| 5 | gem.authors = ["Jeff Iacono"] 6 | gem.email = ["iacono@squareup.com"] 7 | gem.description = %q{Extract, Transform, and Load (ETL) ruby wrapper} 8 | gem.summary = %q{Extract, Transform, and Load (ETL) ruby wrapper. Supports basic and iterative ETL operations.} 9 | gem.homepage = "https://github.com/square/ETL" 10 | 11 | gem.files = `git ls-files`.split($\) 12 | gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) } 13 | gem.test_files = gem.files.grep(%r{^(test|spec|features)/}) 14 | gem.name = "ETL" 15 | gem.require_paths = ["lib"] 16 | gem.version = ETL::VERSION 17 | 18 | gem.add_runtime_dependency "activesupport", [">= 3.2.3"] 19 | 20 | gem.add_development_dependency "rake" 21 | gem.add_development_dependency "cane" 22 | gem.add_development_dependency 'mysql2', '~> 0.3.17' 23 | gem.add_development_dependency "rspec", [">= 2.14.0"] 24 | end 25 | -------------------------------------------------------------------------------- /lib/etl/helpers.rb: -------------------------------------------------------------------------------- 1 | class ETL 2 | module Helpers 3 | # max_for returns the max value for the passed in column as found in the 4 | # specified database.table. If there is not currently a max, we use IFNULL 5 | # and a default value. You can specify a :default_floor value or the method 6 | # will try to derive it for you. 7 | # 8 | # Note: we try to detect if we want a date return type via the #date? 9 | # check. 10 | # 11 | # If this is found we wrap the whole SELECT clause in a DATE so it is cast 12 | # accordingly. 
13 | def max_for options = {} 14 | database = options[:database] 15 | table = options[:table] 16 | column = options[:column] 17 | 18 | default_value = options[:default_floor] || 19 | default_floor_for(column) 20 | 21 | if date? default_value 22 | default_value = "DATE('#{default_value}')" 23 | caster = ->(str) { "DATE(#{str})" } 24 | end 25 | 26 | max_sql_clause = "IFNULL(MAX(#{table}.#{column}), #{default_value})" 27 | max_sql_clause = caster.(max_sql_clause) if caster 28 | 29 | sql = <<-EOS 30 | SELECT #{max_sql_clause} AS the_max 31 | FROM #{database}.#{table} 32 | EOS 33 | sql += " WHERE #{options[:conditions]}" if options[:conditions] 34 | 35 | query(sql).to_a.first['the_max'] 36 | end 37 | 38 | private 39 | 40 | def date? val 41 | val =~ /^\d{4}-\d{1,2}-\d{1,2}( \d{2}:\d{2}:\d{2}( ((-|\+)\d+)| UTC)?)?$/ 42 | end 43 | 44 | def default_floor_for column 45 | case column 46 | when /_at$/ 47 | return '1970-01-01' 48 | when /_date$/ 49 | return '1970-01-01' 50 | when /(^id$|_id$)/ 51 | return 0 52 | else 53 | raise ArgumentError, "could not determine a default for #{column}" 54 | end 55 | end 56 | end 57 | end 58 | -------------------------------------------------------------------------------- /examples/basic_etl.rb: -------------------------------------------------------------------------------- 1 | require 'mysql2' 2 | require 'ETL' 3 | 4 | connection = Mysql2::Client.new host: 'localhost', 5 | username: 'root', 6 | password: '', 7 | database: 'some_database' 8 | 9 | # set up the source database 10 | connection.query %[ 11 | CREATE DATABASE IF NOT EXISTS some_database] 12 | 13 | connection.query %[ 14 | CREATE TABLE IF NOT EXISTS some_database.some_source_table ( 15 | user_id INT NOT NULL 16 | , created_at DATETIME NOT NULL 17 | , amount INT NOT NULL)] 18 | 19 | connection.query %[ 20 | TRUNCATE some_database.some_source_table] 21 | 22 | connection.query %[ 23 | INSERT INTO some_database.some_source_table ( 24 | user_id 25 | , created_at 26 | , amount 27 | ) 
VALUES 28 | (1, UTC_TIMESTAMP, 100) 29 | , (2, UTC_TIMESTAMP - INTERVAL 3 DAY, 200) 30 | , (2, UTC_TIMESTAMP - INTERVAL 3 DAY, 400) 31 | , (2, UTC_TIMESTAMP - INTERVAL 3 DAY, 600) 32 | , (3, UTC_TIMESTAMP - INTERVAL 3 DAY, 600) 33 | , (3, UTC_TIMESTAMP - INTERVAL 3 DAY, -100) 34 | , (3, UTC_TIMESTAMP - INTERVAL 3 DAY, 200) 35 | , (3, UTC_TIMESTAMP - INTERVAL 4 DAY, 200)] 36 | 37 | # set up the ETL 38 | etl = ETL.new(description: "a description of what this ETL does", 39 | connection: connection) 40 | 41 | # configure ETL 42 | etl.config do |etl| 43 | etl.ensure_destination do |etl| 44 | # For most ETLs you may want to ensure that the destination exists, so the 45 | # #ensure_destination block is ideally suited to fulfill this requirement. 46 | # 47 | # By way of example: 48 | # 49 | etl.query %[ 50 | CREATE TABLE IF NOT EXISTS some_database.some_destination_table ( 51 | user_id INT UNSIGNED NOT NULL 52 | , created_date DATE NOT NULL 53 | , total_amount INT SIGNED NOT NULL 54 | , message VARCHAR(100) DEFAULT NULL 55 | , PRIMARY KEY (user_id, created_date) 56 | , KEY (created_date) 57 | )] 58 | end 59 | 60 | etl.before_etl do |etl| 61 | # All pre-ETL work is performed in this block. 62 | # 63 | # This can be thought of as a before-ETL hook that will fire only once. When 64 | # you are not leveraging the ETL iteration capabilities, the value of this 65 | # block vs the #etl block is not very clear. We will see how and when to 66 | # leverage this block effectively when we introduce iteration. 67 | # 68 | # As an example, let's say we want to get rid of all entries that have an 69 | # amount less than zero before moving on to our actual etl: 70 | # 71 | etl.query %[DELETE FROM some_database.some_source_table WHERE amount < 0] 72 | end 73 | 74 | etl.etl do |etl| 75 | # Here is where the magic happens! This block contains the main ETL 76 | # operation. 
77 | # 78 | # For example: 79 | # 80 | etl.query %[ 81 | REPLACE INTO some_database.some_destination_table ( 82 | user_id 83 | , created_date 84 | , total_amount 85 | ) SELECT 86 | sst.user_id 87 | , DATE(sst.created_at) AS created_date 88 | , SUM(sst.amount) AS total_amount 89 | FROM 90 | some_database.some_source_table sst 91 | GROUP BY 92 | sst.user_id 93 | , DATE(sst.created_at)] 94 | end 95 | 96 | etl.after_etl do |etl| 97 | # All post-ETL work is performed in this block. 98 | # 99 | # Again, to finish up with an example: 100 | # 101 | etl.query %[ 102 | UPDATE some_database.some_destination_table 103 | SET message = "WOW" 104 | WHERE total_amount > 100] 105 | end 106 | end 107 | 108 | # ship it 109 | etl.run 110 | 111 | puts %[ 112 | ETL complete. Now go have a look at some_database.some_destination_table 113 | That was build from some_database.some_source_table using the above ETL configuration. 114 | 115 | SELECT * FROM some_database.some_destination_table;] 116 | -------------------------------------------------------------------------------- /lib/etl.rb: -------------------------------------------------------------------------------- 1 | require 'etl/version' 2 | require 'etl/helpers' 3 | require 'logger' 4 | require 'date' 5 | require 'time' 6 | 7 | class ETL 8 | include Helpers 9 | 10 | attr_accessor :description 11 | attr_accessor :connection 12 | attr_reader :logger 13 | 14 | ORDERED_ETL_OPERATIONS = [ 15 | :ensure_destination, 16 | :before_etl, 17 | :etl, 18 | :after_etl 19 | ] 20 | 21 | ITERATOR_OPERATIONS = [ 22 | :start, 23 | :step, 24 | :stop 25 | ] 26 | 27 | def self.connection= connection 28 | @connection = connection 29 | end 30 | 31 | def self.connection 32 | @connection 33 | end 34 | 35 | def self.defaults 36 | {connection: @connection} 37 | end 38 | 39 | def initialize attributes = {} 40 | self.class.defaults.merge(attributes).each do |key, value| 41 | self.send "#{key}=", value 42 | end 43 | default_logger! 
unless attributes.keys.include?(:logger) 44 | end 45 | 46 | def config &block 47 | yield self if block_given? 48 | self 49 | end 50 | 51 | def logger= logger 52 | @logger = logger 53 | end 54 | 55 | # A little metaprogramming to consolidate the generation of our sql 56 | # generating / querying methods. Note that we don't metaprogram the etl 57 | # operation as it's a little more complex. 58 | # 59 | # This will produce methods of the form: 60 | # 61 | # def [name] *args, &block 62 | # if block_given? 63 | # @[name] = block 64 | # else 65 | # @[name].call self, *args if @[name] 66 | # end 67 | # end 68 | # 69 | # for any given variable included in the method name's array 70 | (ORDERED_ETL_OPERATIONS - [:etl]).each do |method| 71 | define_method method do |*args, &block| 72 | warn_args_will_be_deprecated_for method unless args.empty? 73 | 74 | if block 75 | instance_variable_set("@#{method}", block) 76 | else 77 | instance_variable_get("@#{method}"). 78 | call(self, *args) if instance_variable_get("@#{method}") 79 | end 80 | end 81 | end 82 | 83 | def etl *args, &block 84 | warn_args_will_be_deprecated_for :etl unless args.empty? 85 | 86 | if block_given? 87 | @etl = block 88 | else 89 | if iterate? 90 | if @etl 91 | current = start 92 | @etl.call self, cast(current), cast(current += step) while stop >= current 93 | end 94 | else 95 | @etl.call self, *args if @etl 96 | end 97 | end 98 | end 99 | 100 | # A little more metaprogramming to consolidate the generation of 101 | # our sql generating / querying methods. 102 | # 103 | # This will produce methods of the form: 104 | # 105 | # def [method] *args, &block 106 | # if block 107 | # @_[method]_block = block 108 | # else 109 | # # cache block's result 110 | # if defined? 
@[method] 111 | # @[method] 112 | # else 113 | # @[method] = @_[method]_block.call(self, *args) 114 | # end 115 | # end 116 | # end 117 | # 118 | # for any given variable included in the method name's array 119 | ITERATOR_OPERATIONS.each do |method| 120 | define_method method do |*args, &block| 121 | warn_args_will_be_deprecated_for method unless args.empty? 122 | 123 | if block 124 | instance_variable_set("@_#{method}_block", block) 125 | else 126 | if instance_variable_defined?("@#{method}") 127 | instance_variable_get("@#{method}") 128 | else 129 | instance_variable_set("@#{method}", 130 | instance_variable_get("@_#{method}_block") 131 | .call(self, *args)) 132 | end 133 | end 134 | end 135 | end 136 | 137 | def run options = {} 138 | (ORDERED_ETL_OPERATIONS - [*options[:except]]).each do |method| 139 | send method 140 | end 141 | end 142 | 143 | def query sql 144 | time_and_log(sql: sql) do 145 | connection.query sql 146 | end 147 | end 148 | 149 | def info data = {} 150 | logger.info data.merge(emitter: self) if logger? 151 | end 152 | 153 | def debug data = {} 154 | logger.debug data.merge(emitter: self) if logger? 155 | end 156 | 157 | private 158 | 159 | def warn_args_will_be_deprecated_for method 160 | warn "DEPRECATED: passing arguments to ##{method} will be removed in an upcoming release and will raise an exception. Please remove this from your code." 161 | end 162 | 163 | def iterate? 164 | ITERATOR_OPERATIONS.all? do |method| 165 | instance_variable_defined?("@_#{method}_block") 166 | end 167 | end 168 | 169 | def default_logger! 170 | @logger = default_logger 171 | end 172 | 173 | def logger? 
174 | !!@logger 175 | end 176 | 177 | def default_logger 178 | ::Logger.new(STDOUT).tap do |logger| 179 | logger.formatter = proc do |severity, datetime, progname, msg| 180 | event_details = "[#{datetime}] #{severity} #{msg[:event_type]}" 181 | 182 | emitter_details = "\"#{msg[:emitter].description || 'no description given'}\"" 183 | emitter_details += " (object #{msg[:emitter].object_id})" 184 | 185 | leadin = "#{event_details} for #{emitter_details}" 186 | 187 | case msg[:event_type] 188 | when :query_start 189 | "#{leadin}\n#{msg[:sql]}\n" 190 | when :query_complete 191 | "#{leadin} runtime: #{msg[:runtime]}s\n" 192 | else 193 | "#{leadin}: #{msg[:message]}\n" 194 | end 195 | end 196 | end 197 | end 198 | 199 | def time_and_log data = {}, &block 200 | start_runtime = Time.now 201 | debug data.merge(event_type: :query_start) 202 | retval = yield 203 | info data.merge(event_type: :query_complete, 204 | runtime: Time.now - start_runtime) 205 | retval 206 | end 207 | 208 | # NOTE: If you needed to handle more type data type casting you can add a 209 | # case statement. 
If you need to be able to handle entirely different sets 210 | # of casting depending on database engine, you can modify #cast to take a 211 | # "type" arg and then determine which caster to route the arg through 212 | def cast arg 213 | case arg 214 | when Date then arg.strftime("%Y-%m-%d") 215 | when Time then arg.strftime("%Y-%m-%d %H:%M:%S") 216 | else 217 | arg 218 | end 219 | end 220 | end 221 | -------------------------------------------------------------------------------- /examples/iterator_etl.rb: -------------------------------------------------------------------------------- 1 | require 'mysql2' 2 | require 'ETL' 3 | 4 | connection = Mysql2::Client.new host: 'localhost', 5 | username: 'root', 6 | password: '', 7 | database: 'some_database' 8 | 9 | # set up the source database: 10 | connection.query %[ 11 | CREATE DATABASE IF NOT EXISTS some_database] 12 | 13 | connection.query %[ 14 | CREATE TABLE IF NOT EXISTS some_database.some_source_table ( 15 | user_id INT NOT NULL 16 | , created_at DATETIME NOT NULL 17 | , amount INT NOT NULL)] 18 | 19 | connection.query %[ 20 | TRUNCATE some_database.some_source_table] 21 | 22 | connection.query %[ 23 | INSERT INTO some_database.some_source_table ( 24 | user_id 25 | , created_at 26 | , amount 27 | ) VALUES 28 | (1, UTC_TIMESTAMP, 100) 29 | , (2, UTC_TIMESTAMP - INTERVAL 3 DAY, 200) 30 | , (2, UTC_TIMESTAMP - INTERVAL 3 DAY, 400) 31 | , (2, UTC_TIMESTAMP - INTERVAL 3 DAY, 600) 32 | , (3, UTC_TIMESTAMP - INTERVAL 3 DAY, 600) 33 | , (3, UTC_TIMESTAMP - INTERVAL 3 DAY, -100) 34 | , (3, UTC_TIMESTAMP - INTERVAL 3 DAY, 200) 35 | , (3, UTC_TIMESTAMP - INTERVAL 4 DAY, 200)] 36 | 37 | # set up the ETL 38 | etl = ETL.new(description: "a description of what this ETL does", 39 | connection: connection) 40 | 41 | # configure it 42 | etl.config do |etl| 43 | etl.ensure_destination do |etl| 44 | # For most ETLs you may want to ensure that the destination exists, so the 45 | # #ensure_destination block is ideally suited to 
fulfill this requirement. 46 | # 47 | # By way of example: 48 | # 49 | etl.query %[ 50 | CREATE TABLE IF NOT EXISTS some_database.some_destination_table ( 51 | user_id INT UNSIGNED NOT NULL 52 | , created_date DATE NOT NULL 53 | , total_amount INT SIGNED NOT NULL 54 | , message VARCHAR(100) DEFAULT NULL 55 | , PRIMARY KEY (user_id, created_date) 56 | , KEY (created_date) 57 | )] 58 | end 59 | 60 | etl.before_etl do |etl| 61 | # All pre-ETL work is performed in this block. 62 | # 63 | # Now that we are leveraging iteration the #before_etl block becomes 64 | # more useful as a way to execute an operation once before we begin 65 | # our iteration. 66 | # 67 | # As an example, let's say we want to get rid of all entries that have an 68 | # amount less than zero before moving on to our actual etl: 69 | # 70 | etl.query %[ 71 | DELETE FROM some_database.some_source_table 72 | WHERE amount < 0] 73 | end 74 | 75 | etl.start do |etl| 76 | # This defines where the ETL should start. This can be a flat number 77 | # or date, or even SQL / other code can be executed to produce a starting 78 | # value. 79 | # 80 | # Usually, this is the last known entry for the destination table with 81 | # some sensible default if the destination does not yet contain data. 82 | # 83 | # As an example: 84 | # 85 | # Note that we cast the default date as a DATE. If we don't, it will be 86 | # treated as a string and our iterator will fail under the hood when testing 87 | # if it is complete. 88 | res = etl.query %[ 89 | SELECT COALESCE(MAX(created_date), DATE('2010-01-01')) AS the_max 90 | FROM some_database.some_destination_table] 91 | 92 | res.to_a.first['the_max'] 93 | end 94 | 95 | etl.step do |etl| 96 | # The step block defines the size of the iteration block. To iterate by 97 | # ten records, the step block should be set to return 10. 
98 | # 99 | # As an alternative example, to set the iteration to go 10,000 units 100 | # at a time, the following value should be provided: 101 | # 102 | # 10_000 (Note: an underscore is used for readability) 103 | # 104 | # As an example, to iterate 7 days at a time: 105 | # 106 | 7 107 | end 108 | 109 | etl.stop do |etl| 110 | # The stop block defines when the iteration should halt. 111 | # Again, this can be a flat value or code. Either way, one value *must* be 112 | # returned. 113 | # 114 | # As a flat value: 115 | # 116 | # 1_000_000 117 | # 118 | # Or a date value: 119 | # 120 | # Time.now.to_date 121 | # 122 | # Or as a code example: 123 | # 124 | res = etl.query %[ 125 | SELECT DATE(MAX(created_at)) AS the_max 126 | FROM some_database.some_source_table] 127 | 128 | res.to_a.first['the_max'] 129 | end 130 | 131 | etl.etl do |etl, lbound, ubound| 132 | # The etl block is the main part of the framework. Note: there are 133 | # two extra args with the iterator this time around: "lbound" and "ubound" 134 | # 135 | # "lbound" is the lower bound of the current iteration. When iterating 136 | # from 0 to 10 and stepping by 2, the lbound would equal 2 on the 137 | # second iteration. 138 | # 139 | # "ubound" is the upper bound of the current iteration. In continuing with the 140 | # example above, when iterating from 0 to 10 and stepping by 2, the ubound would 141 | # equal 4 on the second iteration. 142 | # 143 | # These args can be used to "window" SQL queries or other code operations. 
144 | # 145 | # As a first example, to iterate over a set of ids: 146 | # 147 | # etl.query %[ 148 | # REPLACE INTO some_database.some_destination_table ( 149 | # created_date 150 | # , user_id 151 | # , total_amount 152 | # ) SELECT 153 | # DATE(sst.created_at) AS created_date 154 | # , sst.user_id 155 | # , SUM(sst.amount) AS total_amount 156 | # FROM 157 | # some_database.some_source_table sst 158 | # WHERE 159 | # sst.user_id > #{lbound} AND sst.user_id <= #{ubound} 160 | # GROUP BY 161 | # DATE(sst.created_at) 162 | # , sst.user_id] 163 | # 164 | # To "window" a SQL query using dates: 165 | # 166 | etl.query %[ 167 | REPLACE INTO some_database.some_destination_table ( 168 | created_date 169 | , user_id 170 | , total_amount 171 | ) SELECT 172 | DATE(sst.created_at) AS created_date 173 | , sst.user_id 174 | , SUM(sst.amount) AS total_amount 175 | FROM 176 | some_database.some_source_table sst 177 | WHERE 178 | -- Note the usage of quotes surrounding the lbound and ubound vars. 179 | -- This is is required when dealing with dates / datetimes 180 | sst.created_at >= '#{lbound}' AND sst.created_at < '#{ubound}' 181 | GROUP BY 182 | DATE(sst.created_at) 183 | , sst.user_id] 184 | 185 | # Note that there is no sql sanitization here so there is *potential* for SQL 186 | # injection. That being said you'll likely be using this gem in an internal 187 | # tool so hopefully your co-workers are not looking to sabotage your ETL 188 | # pipeline. Just be aware of this and handle it as you see fit. 189 | end 190 | 191 | etl.after_etl do |etl| 192 | # All post-ETL work is performed in this block. 193 | # 194 | # Again, to finish up with an example: 195 | # 196 | etl.query %[ 197 | UPDATE some_database.some_destination_table 198 | SET message = "WOW" 199 | WHERE total_amount > 100] 200 | end 201 | end 202 | 203 | etl.run 204 | 205 | puts %[ 206 | ETL complete. 
Now go have a look at some_database.some_destination_table 207 | That was build from some_database.some_source_table using the above ETL configuration. 208 | 209 | SELECT * FROM some_database.some_destination_table;] 210 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ETL 2 | 3 | Extract, transform, and load data with ruby! 4 | 5 | ## Installation 6 | 7 | Add this line to your application's Gemfile: 8 | 9 | gem 'ETL' 10 | 11 | And then execute: 12 | 13 | $ bundle 14 | 15 | Or install it yourself as: 16 | 17 | $ gem install ETL 18 | 19 | ## ETL Dependencies 20 | 21 | ETL depends on having a database connection object that __must__ respond 22 | to `#query`. The [mysql2](https://github.com/brianmario/mysql2) gem is a good option. 23 | You can also proxy another library using Ruby's `SimpleDelegator` and add a `#query` 24 | method if need be. 25 | 26 | The gem comes bundled with a default logger. If you'd like to write your own 27 | just make sure that it implements `#debug` and `#info`. For more information 28 | on what is logged and when, view the [logger details](#logger-details). 29 | 30 | ### Basic ETL 31 | 32 | Assume that we have a database connection represented by `connection`. 
33 | 34 | To run a basic ETL that is composed of sequential SQL statements, start by 35 | creating a new ETL instance: 36 | 37 | ```ruby 38 | # setting connection at the class level 39 | ETL.connection = connection 40 | 41 | etl = ETL.new(description: "a description of what this ETL does") 42 | ``` 43 | 44 | or 45 | 46 | ```ruby 47 | # setting connection at the instance level 48 | etl = ETL.new( 49 | description: "a description of what this ETL does", 50 | connection: connection 51 | ) 52 | ``` 53 | which can then be configured: 54 | 55 | ```ruby 56 | etl.config do |etl| 57 | etl.ensure_destination do |etl| 58 | # For most ETLs you may want to ensure that the destination exists, so the 59 | # #ensure_destination block is ideally suited to fulfill this requirement. 60 | # 61 | # By way of example: 62 | # 63 | etl.query %[ 64 | CREATE TABLE IF NOT EXISTS some_database.some_destination_table ( 65 | user_id INT UNSIGNED NOT NULL 66 | , created_date DATE NOT NULL 67 | , total_amount INT SIGNED NOT NULL 68 | , message VARCHAR(100) DEFAULT NULL 69 | , PRIMARY KEY (user_id, created_date) 70 | , KEY (created_date) 71 | ) 72 | ] 73 | end 74 | 75 | etl.before_etl do |etl| 76 | # All pre-ETL work is performed in this block. 77 | # 78 | # This can be thought of as a before-ETL hook that will fire only once. When 79 | # you are not leveraging the ETL iteration capabilities, the value of this 80 | # block vs the #etl block is not very clear. We will see how and when to 81 | # leverage this block effectively when we introduce iteration. 82 | # 83 | # As an example, let's say we want to get rid of all entries that have an 84 | # amount less than zero before moving on to our actual etl: 85 | # 86 | etl.query %[DELETE FROM some_database.some_source_table WHERE amount < 0] 87 | end 88 | 89 | etl.etl do |etl| 90 | # Here is where the magic happens! This block contains the main ETL 91 | # operation. 
92 | # 93 | # For example: 94 | # 95 | etl.query %[ 96 | REPLACE INTO some_database.some_destination_table ( 97 | user_id 98 | , created_date 99 | , total_amount 100 | ) SELECT 101 | user_id 102 | , DATE(created_at) AS created_date 103 | , SUM(amount) AS total_amount 104 | FROM 105 | some_database.some_source_table sst 106 | GROUP BY 107 | sst.user_id 108 | , DATE(sst.created_at) 109 | ] 110 | end 111 | 112 | etl.after_etl do |etl| 113 | # All post-ETL work is performed in this block. 114 | # 115 | # Again, to finish up with an example: 116 | # 117 | etl.query %[ 118 | UPDATE some_database.some_destination_table 119 | SET message = "WOW" 120 | WHERE total_amount > 100 121 | ] 122 | end 123 | end 124 | ``` 125 | 126 | At this point it is possible to run the ETL instance via: 127 | 128 | ```ruby 129 | etl.run 130 | ``` 131 | which executes `#ensure_destination`, `#before_etl`, `#etl`, and `#after_etl` in 132 | that order. 133 | 134 | ### ETL with iteration 135 | 136 | To add in iteration, simply supply `#start`, `#step`, and `#stop` blocks. This 137 | is useful when dealing with large data sets or when executing queries that, 138 | while optimized, are still slow. 139 | 140 | Again, to kick things off: 141 | 142 | ```ruby 143 | etl = ETL.new( 144 | description: "a description of what this ETL does", 145 | connection: connection 146 | ) 147 | ``` 148 | 149 | where `connection` is the same as described above. 150 | 151 | Next we can configure the ETL: 152 | 153 | ```ruby 154 | # assuming we have the ETL instance from above 155 | etl.config do |etl| 156 | etl.ensure_destination do |etl| 157 | # For most ETLs you may want to ensure that the destination exists, so the 158 | # #ensure_destination block is ideally suited to fulfill this requirement. 
159 | # 160 | # By way of example: 161 | # 162 | etl.query %[ 163 | CREATE TABLE IF NOT EXISTS some_database.some_destination_table ( 164 | user_id INT UNSIGNED NOT NULL 165 | , created_date DATE NOT NULL 166 | , total_amount INT SIGNED NOT NULL 167 | , message VARCHAR(100) DEFAULT NULL 168 | , PRIMARY KEY (user_id, created_date) 169 | , KEY (created_date) 170 | ) 171 | ] 172 | end 173 | 174 | etl.before_etl do |etl| 175 | # All pre-ETL work is performed in this block. 176 | # 177 | # Now that we are leveraging iteration the #before_etl block becomes 178 | # more useful as a way to execute an operation once before we begin 179 | # our iteration. 180 | # 181 | # As an example, let's say we want to get rid of all entries that have an 182 | # amount less than zero before moving on to our actual etl: 183 | # 184 | etl.query %[ 185 | DELETE FROM some_database.some_source_table 186 | WHERE amount < 0 187 | ] 188 | end 189 | 190 | etl.start do |etl| 191 | # This defines where the ETL should start. This can be a flat number 192 | # or date, or even SQL / other code can be executed to produce a starting 193 | # value. 194 | # 195 | # Usually, this is the last known entry for the destination table with 196 | # some sensible default if the destination does not yet contain data. 197 | # 198 | # As an example: 199 | # 200 | # Note that we cast the default date as a DATE. If we don't, it will be 201 | # treated as a string and our iterator will fail under the hood when testing 202 | # if it is complete. 203 | res = etl.query %[ 204 | SELECT COALESCE(MAX(created_date), DATE('2010-01-01')) AS the_max 205 | FROM some_database.some_destination_table 206 | ] 207 | 208 | res.to_a.first['the_max'] 209 | end 210 | 211 | etl.step do |etl| 212 | # The step block defines the size of the iteration block. To iterate by 213 | # ten records, the step block should be set to return 10. 
214 | # 215 | # As an alternative example, to set the iteration to go 10,000 units 216 | # at a time, the following value should be provided: 217 | # 218 | # 10_000 (Note: an underscore is used for readability) 219 | # 220 | # As an example, to iterate 7 days at a time: 221 | # 222 | 7 223 | end 224 | 225 | etl.stop do |etl| 226 | # The stop block defines when the iteration should halt. 227 | # Again, this can be a flat value or code. Either way, one value *must* be 228 | # returned. 229 | # 230 | # As a flat value: 231 | # 232 | # 1_000_000 233 | # 234 | # Or a date value: 235 | # 236 | # Time.now.to_date 237 | # 238 | # Or as a code example: 239 | # 240 | res = etl.query %[ 241 | SELECT DATE(MAX(created_at)) AS the_max 242 | FROM some_database.some_source_table 243 | ] 244 | 245 | res.to_a.first['the_max'] 246 | end 247 | 248 | etl.etl do |etl, lbound, ubound| 249 | # The etl block is the main part of the framework. Note: there are 250 | # two extra args with the iterator this time around: "lbound" and "ubound" 251 | # 252 | # "lbound" is the lower bound of the current iteration. When iterating 253 | # from 0 to 10 and stepping by 2, the lbound would equal 2 on the 254 | # second iteration. 255 | # 256 | # "ubound" is the upper bound of the current iteration. In continuing with the 257 | # example above, when iterating from 0 to 10 and stepping by 2, the ubound would 258 | # equal 4 on the second iteration. 259 | # 260 | # These args can be used to "window" SQL queries or other code operations. 
261 | # 262 | # As a first example, to iterate over a set of ids: 263 | # 264 | # etl.query %[ 265 | # REPLACE INTO some_database.some_destination_table ( 266 | # created_date 267 | # , user_id 268 | # , total_amount 269 | # ) SELECT 270 | # DATE(sst.created_at) AS created_date 271 | # , sst.user_id 272 | # , SUM(sst.amount) AS total_amount 273 | # FROM 274 | # some_database.some_source_table sst 275 | # WHERE 276 | # sst.user_id > #{lbound} AND sst.user_id <= #{ubound} 277 | # GROUP BY 278 | # DATE(sst.created_at) 279 | # , sst.user_id] 280 | # 281 | # To "window" a SQL query using dates: 282 | # 283 | etl.query %[ 284 | REPLACE INTO some_database.some_destination_table ( 285 | created_date 286 | , user_id 287 | , total_amount 288 | ) SELECT 289 | DATE(sst.created_at) AS created_date 290 | , sst.user_id 291 | , SUM(sst.amount) AS total_amount 292 | FROM 293 | some_database.some_source_table sst 294 | WHERE 295 | -- Note the usage of quotes surrounding the lbound and ubound vars. 296 | -- This is required when dealing with dates / datetimes 297 | sst.created_at >= '#{lbound}' AND sst.created_at < '#{ubound}' 298 | GROUP BY 299 | DATE(sst.created_at) 300 | , sst.user_id 301 | ] 302 | 303 | # Note that there is no sql sanitization here so there is *potential* for SQL 304 | # injection. That being said you'll likely be using this gem in an internal 305 | # tool so hopefully your co-workers are not looking to sabotage your ETL 306 | # pipeline. Just be aware of this and handle it as you see fit. 307 | end 308 | 309 | etl.after_etl do |etl| 310 | # All post-ETL work is performed in this block. 
311 | # 312 | # Again, to finish up with an example: 313 | # 314 | etl.query %[ 315 | UPDATE some_database.some_destination_table 316 | SET message = "WOW" 317 | WHERE total_amount > 100 318 | ] 319 | end 320 | end 321 | ``` 322 | 323 | At this point it is possible to run the ETL instance via: 324 | 325 | ```ruby 326 | etl.run 327 | ``` 328 | which executes `#ensure_destination`, `#before_etl`, `#etl`, and `#after_etl` in 329 | that order. 330 | 331 | Note that `#etl` executes `#start` and `#stop` once and memoizes the result for 332 | each. It then begins to iterate from what `#start` evaluated to up until what `#stop` 333 | evaluated to by what `#step` evaluates to. 334 | 335 | ## Examples 336 | 337 | There are two examples found in `./examples` that demonstrate the basic ETL and 338 | iteration ETL. Each file uses the [mysql2](https://github.com/brianmario/mysql2) 339 | gem and reads / writes data to localhost using the root user with no password. 340 | Adjust as needed. 341 | 342 | ## Logger Details 343 | 344 | A logger must support two methods: `#info` and `#debug`. 345 | 346 | Both methods should accept a single hash argument. The argument will contain: 347 | 348 | - `:emitter` => a reference to the ETL instance's `self` 349 | - `:event_type` => a symbol that indicates the type of event being logged. You 350 | can use this value to derive which other data you'll have available 351 | 352 | When `:event_type` is equal to `:query_start`, you'll have the following 353 | available in the hash argument: 354 | 355 | - `:sql` => the sql that is going to be run 356 | 357 | These events are logged at the debug level. 358 | 359 | When `:event_type` is equal to `:query_complete`, you'll have the following 360 | available in the hash argument: 361 | 362 | - `:sql` => the sql that was run 363 | - `:runtime` => how long the query took to execute 364 | 365 | These events are logged at the info level. 
366 | 367 | Following from this you could implement a simple logger as: 368 | 369 | ```ruby 370 | class PutsLogger 371 | def info data 372 | @data = data 373 | write! 374 | end 375 | 376 | def debug data 377 | @data = data 378 | write! 379 | end 380 | 381 | private 382 | 383 | def write! 384 | case (event_type = @data.delete(:event_type)) 385 | when :query_start 386 | output = "#{@data[:emitter].description} is about to run\n" 387 | output += "#{@data[:sql]}\n" 388 | when :query_complete 389 | output = "#{@data[:emitter].description} executed:\n" 390 | output += "#{@data[:sql]}\n" 391 | output += "query completed at #{Time.now} and took #{@data[:runtime]}s\n" 392 | else 393 | output = "no special logging for #{event_type} event_type yet\n" 394 | end 395 | puts output 396 | @data = nil 397 | end 398 | end 399 | ``` 400 | 401 | ## Contributing 402 | 403 | If you would like to contribute code to ETL you can do so through GitHub by 404 | forking the repository and sending a pull request. 405 | 406 | When submitting code, please make every effort to follow existing conventions 407 | and style in order to keep the code as readable as possible. 408 | 409 | Before your code can be accepted into the project you must also sign the 410 | [Individual Contributor License Agreement (CLA)][1]. 411 | 412 | 413 | [1]: https://spreadsheets.google.com/spreadsheet/viewform?formkey=dDViT2xzUHAwRkI3X3k5Z0lQM091OGc6MQ&ndplr=1 414 | 415 | ## License 416 | 417 | Copyright 2013 Square Inc. 418 | 419 | Licensed under the Apache License, Version 2.0 (the "License"); 420 | you may not use this file except in compliance with the License. 421 | You may obtain a copy of the License at 422 | 423 | http://www.apache.org/licenses/LICENSE-2.0 424 | 425 | Unless required by applicable law or agreed to in writing, software 426 | distributed under the License is distributed on an "AS IS" BASIS, 427 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
428 | See the License for the specific language governing permissions and 429 | limitations under the License. 430 | -------------------------------------------------------------------------------- /spec/etl_spec.rb: -------------------------------------------------------------------------------- 1 | require 'mysql2' 2 | require 'active_support/time' 3 | require 'etl' 4 | 5 | def test_connection 6 | Mysql2::Client.new( 7 | host: 'localhost', 8 | username: 'root', 9 | database: 'etl_test' 10 | ) 11 | end 12 | 13 | def reset_test_env connection, &block 14 | connection.query %[DROP DATABASE IF EXISTS etl_test] 15 | connection.query %[CREATE DATABASE etl_test] 16 | connection.query %[USE etl_test] 17 | 18 | if block_given? 19 | yield connection 20 | else 21 | connection.query %[ 22 | CREATE TABLE etl_source ( 23 | id INT NOT NULL 24 | , name VARCHAR(10) 25 | , amount INT(11) DEFAULT 0 26 | , PRIMARY KEY (id) 27 | ) 28 | ] 29 | 30 | connection.query %[ 31 | INSERT INTO etl_test.etl_source (id, name, amount) 32 | VALUES 33 | (1, 'Jeff', 100), 34 | (2, 'Ryan', 50), 35 | (3, 'Jack', 75), 36 | (4, 'Jeff', 10), 37 | (5, 'Jack', 45), 38 | (6, 'Nick', -90), 39 | (7, 'Nick', 90) 40 | ] 41 | end 42 | end 43 | 44 | describe ETL do 45 | let(:logger) { nil } 46 | 47 | describe "deprecations" do 48 | let(:etl) { described_class.new } 49 | 50 | context "#ensure_destination" do 51 | it "does not warn when no args are passed" do 52 | expect(etl).to receive(:warn).never 53 | etl.ensure_destination {} 54 | end 55 | 56 | it "warns when args are passed that this is deprecated" do 57 | expect(etl).to receive(:warn).with("DEPRECATED: passing arguments to #ensure_destination will be removed in an upcoming release and will raise an exception. 
Please remove this from your code.") 58 | etl.ensure_destination('some arg') {} 59 | end 60 | end 61 | 62 | context "#before_etl" do 63 | it "does not warn when no args are passed" do 64 | expect(etl).to receive(:warn).never 65 | etl.before_etl {} 66 | end 67 | 68 | it "warns when args are passed that this is deprecated" do 69 | expect(etl).to receive(:warn).with("DEPRECATED: passing arguments to #before_etl will be removed in an upcoming release and will raise an exception. Please remove this from your code.") 70 | etl.before_etl('some arg') {} 71 | end 72 | end 73 | 74 | context "#start" do 75 | it "does not warn when no args are passed" do 76 | expect(etl).to receive(:warn).never 77 | etl.start {} 78 | end 79 | 80 | it "warns when args are passed that this is deprecated" do 81 | expect(etl).to receive(:warn).with("DEPRECATED: passing arguments to #start will be removed in an upcoming release and will raise an exception. Please remove this from your code.") 82 | etl.start('some arg') {} 83 | end 84 | end 85 | 86 | context "#step" do 87 | it "does not warn when no args are passed" do 88 | expect(etl).to receive(:warn).never 89 | etl.step {} 90 | end 91 | 92 | it "warns when args are passed that this is deprecated" do 93 | expect(etl).to receive(:warn).with("DEPRECATED: passing arguments to #step will be removed in an upcoming release and will raise an exception. Please remove this from your code.") 94 | etl.step('some arg') {} 95 | end 96 | end 97 | 98 | context "#stop" do 99 | it "does not warn when no args are passed" do 100 | expect(etl).to receive(:warn).never 101 | etl.stop {} 102 | end 103 | 104 | it "warns when args are passed that this is deprecated" do 105 | expect(etl).to receive(:warn).with("DEPRECATED: passing arguments to #stop will be removed in an upcoming release and will raise an exception. 
Please remove this from your code.") 106 | etl.stop('some arg') {} 107 | end 108 | end 109 | 110 | context "#etl" do 111 | it "does not warn when no args are passed" do 112 | expect(etl).to receive(:warn).never 113 | etl.etl {} 114 | end 115 | 116 | it "warns when args are passed that this is deprecated" do 117 | expect(etl).to receive(:warn).with("DEPRECATED: passing arguments to #etl will be removed in an upcoming release and will raise an exception. Please remove this from your code.") 118 | etl.etl('some arg') {} 119 | end 120 | end 121 | 122 | context "#after_etl" do 123 | it "does not warn when no args are passed" do 124 | expect(etl).to receive(:warn).never 125 | etl.after_etl {} 126 | end 127 | 128 | it "warns when args are passed that this is deprecated" do 129 | expect(etl).to receive(:warn).with("DEPRECATED: passing arguments to #after_etl will be removed in an upcoming release and will raise an exception. Please remove this from your code.") 130 | etl.after_etl('some arg') {} 131 | end 132 | end 133 | end 134 | 135 | describe ".connection=" do 136 | let(:class_level_connection) { double('class_level_connection') } 137 | 138 | it "sets the #connection for all instances" do 139 | ETL.connection = class_level_connection 140 | etl = ETL.new 141 | expect(etl.connection).to eq(class_level_connection) 142 | end 143 | 144 | it "allows instance-level overrides" do 145 | instance_level_connection = double('instance_level_connection') 146 | ETL.connection = class_level_connection 147 | etl_with_connection_override = ETL.new(connection: instance_level_connection) 148 | etl = ETL.new 149 | expect(etl.connection).to eq class_level_connection 150 | expect(etl_with_connection_override.connection).to eq(instance_level_connection) 151 | end 152 | end 153 | 154 | describe "#logger=" do 155 | let(:etl) { described_class.new connection: double } 156 | 157 | it 'assigns' do 158 | logger = double 159 | etl.logger = logger 160 | expect(etl.logger).to eq(logger) 161 | end 162 | 
end 163 | 164 | describe '#max_for' do 165 | let(:connection) { test_connection } 166 | let(:etl) { described_class.new(connection: connection, logger: logger) } 167 | 168 | before do 169 | client = Mysql2::Client.new host: 'localhost', username: 'root' 170 | client.query %[DROP DATABASE IF EXISTS etl_test] 171 | client.query %[CREATE DATABASE etl_test] 172 | client.query %[USE etl_test] 173 | client.query %[ 174 | CREATE TABLE IF NOT EXISTS etl_source ( 175 | id INT(11) NOT NULL AUTO_INCREMENT 176 | , name VARCHAR(10) 177 | , amount INT(11) DEFAULT 0 178 | , the_date DATE DEFAULT NULL 179 | , the_null_date DATE DEFAULT NULL 180 | , the_time_at DATETIME DEFAULT NULL 181 | , the_null_time_at DATETIME DEFAULT NULL 182 | , PRIMARY KEY (id) 183 | ) 184 | ] 185 | 186 | client.query %[ 187 | INSERT INTO etl_source ( 188 | name 189 | , amount 190 | , the_date 191 | , the_null_date 192 | , the_time_at 193 | , the_null_time_at 194 | ) VALUES 195 | ('Jeff', 100, '2012-01-02', NULL, '2012-01-02 00:00:01', NULL) 196 | , ('Ryan', 50, '2012-01-01', NULL, '2012-01-01 00:00:00', NULL) 197 | , ('Jack', 75, '2012-01-01', NULL, '2012-01-01 00:00:00', NULL) 198 | , ('Jeff', 10, '2012-01-01', NULL, '2012-01-01 00:00:00', NULL) 199 | , ('Jack', 45, '2012-01-01', NULL, '2012-01-01 00:00:00', NULL) 200 | , ('Nick', -90, '2012-01-01', NULL, '2012-01-01 00:00:00', NULL) 201 | , ('Nick', 90, '2012-01-01', NULL, '2012-01-01 00:00:00', NULL) 202 | ] 203 | 204 | client.close 205 | end 206 | 207 | after { connection.close } 208 | 209 | it "finds the max for dates" do 210 | expect(etl.max_for(database: :etl_test, 211 | table: :etl_source, 212 | column: :the_date)).to eq(Date.parse('2012-01-02')) 213 | end 214 | 215 | it "defaults to the beginning of time date when a max date cannot be found" do 216 | expect( 217 | etl.max_for( 218 | database: :etl_test, 219 | table: :etl_source, 220 | column: :the_null_date 221 | ) 222 | ).to eq(Date.parse('1970-01-01')) 223 | end 224 | 225 | it "defaults to the 
specified default floor when a max date cannot be found" do 226 | expect( 227 | etl.max_for( 228 | database: :etl_test, 229 | table: :etl_source, 230 | column: :the_null_date, 231 | default_floor: '2011-01-01' 232 | ) 233 | ).to eq(Date.parse('2011-01-01')) 234 | end 235 | 236 | it "finds the max for datetimes" do 237 | expect( 238 | etl.max_for( 239 | database: :etl_test, 240 | table: :etl_source, 241 | column: :the_time_at 242 | ) 243 | ).to eq(Date.parse('2012-01-02')) 244 | end 245 | 246 | it "defaults to the beginning of time when a max datetime cannot be found" do 247 | expect( 248 | etl.max_for( 249 | database: :etl_test, 250 | table: :etl_source, 251 | column: :the_null_time_at 252 | ) 253 | ).to eq(Date.parse('1970-01-01 00:00:00')) 254 | end 255 | 256 | it "defaults to the specified default floor when a max datetime cannot be found" do 257 | expect( 258 | etl.max_for( 259 | database: :etl_test, 260 | table: :etl_source, 261 | column: :the_null_time_at, 262 | default_floor: '2011-01-01 00:00:00' 263 | ) 264 | ).to eq(Date.parse('2011-01-01 00:00:00')) 265 | end 266 | 267 | it "raises an error if a non-standard column is supplied with no default floor" do 268 | expect { 269 | etl.max_for( 270 | database: :etl_test, 271 | table: :etl_source, 272 | column: :amount 273 | ) 274 | }.to raise_exception 275 | end 276 | 277 | it "finds the max for a non-standard column, using the default floor" do 278 | expect( 279 | etl.max_for( 280 | database: :etl_test, 281 | table: :etl_source, 282 | column: :amount, 283 | default_floor: 0 284 | ) 285 | ).to eq(100) 286 | end 287 | end 288 | 289 | describe '#run' do 290 | let(:connection) { test_connection } 291 | let(:etl) { described_class.new connection: connection, logger: logger } 292 | 293 | before do 294 | client = Mysql2::Client.new(host: 'localhost', username: 'root') 295 | client.query %[DROP DATABASE IF EXISTS etl_test] 296 | client.query %[CREATE DATABASE etl_test] 297 | client.query %[USE etl_test] 298 | 
client.query %[ 299 | CREATE TABLE IF NOT EXISTS etl_source ( 300 | id INT(11) NOT NULL AUTO_INCREMENT 301 | , name VARCHAR(10) 302 | , amount INT(11) DEFAULT 0 303 | , PRIMARY KEY (id) 304 | ) 305 | ] 306 | 307 | client.query %[ 308 | INSERT INTO etl_source (name, amount) 309 | VALUES 310 | ('Jeff', 100), 311 | ('Ryan', 50), 312 | ('Jack', 75), 313 | ('Jeff', 10), 314 | ('Jack', 45), 315 | ('Nick', -90), 316 | ('Nick', 90) 317 | ] 318 | 319 | client.close 320 | end 321 | 322 | it "executes the specified sql in the appropriate order" do 323 | etl.ensure_destination do |etl| 324 | etl.query %[ 325 | CREATE TABLE IF NOT EXISTS etl_destination ( 326 | name VARCHAR(10) 327 | , total_amount INT(11) DEFAULT 0 328 | , PRIMARY KEY (name) 329 | ) 330 | ] 331 | end 332 | 333 | etl.before_etl do |etl| 334 | etl.query "DELETE FROM etl_source WHERE amount < 0" 335 | end 336 | 337 | etl.etl do |etl| 338 | etl.query %[ 339 | REPLACE INTO etl_destination 340 | SELECT name, SUM(amount) FROM etl_source 341 | GROUP BY name 342 | ] 343 | end 344 | 345 | etl.after_etl do |etl| 346 | etl.query %[ 347 | UPDATE etl_destination 348 | SET name = CONCAT("SUPER ", name) 349 | WHERE total_amount > 115 350 | ] 351 | end 352 | 353 | etl.run 354 | 355 | expect( 356 | connection 357 | .query("SELECT * FROM etl_destination ORDER BY total_amount DESC") 358 | .to_a 359 | ).to eq( 360 | [ 361 | {'name' => 'SUPER Jack', 'total_amount' => 120}, 362 | {'name' => 'Jeff', 'total_amount' => 110}, 363 | {'name' => 'Nick', 'total_amount' => 90}, 364 | {'name' => 'Ryan', 'total_amount' => 50} 365 | ] 366 | ) 367 | end 368 | end 369 | 370 | describe '#run with operations specified for exclusion' do 371 | let(:connection) { double } 372 | let(:etl) { described_class.new connection: connection, logger: logger } 373 | 374 | it "does not call the specified method" do 375 | etl.ensure_destination {} 376 | expect(etl).not_to receive(:ensure_destination) 377 | etl.run(except: :ensure_destination) 378 | end 379 | end 
380 | 381 | context "with iteration" do 382 | describe '#run over full table' do 383 | let(:connection) { test_connection } 384 | let(:etl) { described_class.new(connection: connection, logger: logger) } 385 | 386 | before { reset_test_env connection } 387 | after { connection.close } 388 | 389 | it "executes the specified sql in the appropriate order and ETLs properly" do 390 | etl.ensure_destination do |etl| 391 | etl.query %[ 392 | CREATE TABLE etl_destination ( 393 | id INT NOT NULL 394 | , name VARCHAR(10) 395 | , amount INT(11) DEFAULT 0 396 | , PRIMARY KEY (id) 397 | ) 398 | ] 399 | end 400 | 401 | etl.before_etl do |etl| 402 | etl.query "DELETE FROM etl_source WHERE amount < 0" 403 | end 404 | 405 | etl.start do |etl| 406 | etl.query( 407 | "SELECT COALESCE(MAX(id), 0) AS the_start FROM etl_destination" 408 | ).to_a.first['the_start'] 409 | end 410 | 411 | etl.step do 412 | 1 413 | end 414 | 415 | etl.stop do |etl| 416 | etl.query( 417 | "SELECT MAX(id) AS the_stop FROM etl_source" 418 | ).to_a.first['the_stop'] 419 | end 420 | 421 | etl.etl do |etl, lbound, ubound| 422 | etl.query %[ 423 | REPLACE INTO etl_destination 424 | SELECT id, name, amount FROM etl_source s 425 | WHERE s.id >= #{lbound} 426 | AND s.id < #{ubound} 427 | ] 428 | end 429 | 430 | etl.after_etl do |etl| 431 | etl.query %[ 432 | UPDATE etl_destination 433 | SET name = CONCAT("SUPER ", name) 434 | WHERE id <= 1 435 | ] 436 | end 437 | 438 | etl.run 439 | 440 | expect( 441 | connection 442 | .query("SELECT * FROM etl_destination ORDER BY id ASC") 443 | .to_a 444 | ).to eq( 445 | [ 446 | {'id' => 1, 'name' => 'SUPER Jeff', 'amount' => 100}, 447 | {'id' => 2, 'name' => 'Ryan', 'amount' => 50}, 448 | {'id' => 3, 'name' => 'Jack', 'amount' => 75}, 449 | {'id' => 4, 'name' => 'Jeff', 'amount' => 10}, 450 | {'id' => 5, 'name' => 'Jack', 'amount' => 45}, 451 | {'id' => 7, 'name' => 'Nick', 'amount' => 90} 452 | ] 453 | ) 454 | end 455 | end 456 | 457 | describe '#run over part of table' do 458 | 
let(:connection) { test_connection } 459 | let(:etl) { described_class.new(connection: connection, logger: logger) } 460 | 461 | before { reset_test_env connection } 462 | after { connection.close } 463 | 464 | it "executes the specified sql in the appropriate order and ETLs properly" do 465 | etl.ensure_destination do |etl| 466 | etl.query %[ 467 | CREATE TABLE etl_destination ( 468 | id INT NOT NULL 469 | , name VARCHAR(10) 470 | , amount INT(11) DEFAULT 0 471 | , PRIMARY KEY (id) 472 | ) 473 | ] 474 | end 475 | 476 | etl.before_etl do |etl| 477 | etl.query "DELETE FROM etl_source WHERE amount < 0" 478 | end 479 | 480 | etl.start do 481 | 4 482 | end 483 | 484 | etl.step do 485 | 1 486 | end 487 | 488 | etl.stop do |etl| 489 | etl.query( 490 | "SELECT MAX(id) AS the_stop FROM etl_source" 491 | ).to_a.first['the_stop'] 492 | end 493 | 494 | etl.etl do |etl, lbound, ubound| 495 | etl.query %[ 496 | REPLACE INTO etl_destination 497 | SELECT id, name, amount FROM etl_source s 498 | WHERE s.id >= #{lbound} 499 | AND s.id < #{ubound} 500 | ] 501 | end 502 | 503 | etl.run 504 | 505 | expect( 506 | connection 507 | .query("SELECT * FROM etl_destination ORDER BY id ASC") 508 | .to_a 509 | ).to eq( 510 | [ 511 | {'id' => 4, 'name' => 'Jeff', 'amount' => 10}, 512 | {'id' => 5, 'name' => 'Jack', 'amount' => 45}, 513 | {'id' => 7, 'name' => 'Nick', 'amount' => 90} 514 | ] 515 | ) 516 | end 517 | end 518 | 519 | describe "#run over gappy data" do 520 | let(:connection) { test_connection } 521 | let(:etl) { described_class.new(connection: connection, logger: logger) } 522 | 523 | before do 524 | reset_test_env(connection) do |connection| 525 | connection.query %[ 526 | CREATE TABLE etl_source ( 527 | id INT NOT NULL 528 | , name VARCHAR(10) 529 | , amount INT(11) DEFAULT 0 530 | , PRIMARY KEY (id) 531 | ) 532 | ] 533 | 534 | connection.query %[ 535 | INSERT INTO etl_source (id, name, amount) 536 | VALUES 537 | (1, 'Jeff', 100), 538 | (2, 'Ryan', 50), 539 | (13, 'Jack', 75), 540 
| (14, 'Jeff', 10), 541 | (15, 'Jack', 45), 542 | (16, 'Nick', -90), 543 | (17, 'Nick', 90) 544 | ] 545 | end 546 | end 547 | 548 | after { connection.close } 549 | 550 | it "executes the specified sql in the appropriate order without getting stuck" do 551 | etl.ensure_destination do |etl| 552 | etl.query %[ 553 | CREATE TABLE etl_destination ( 554 | id INT NOT NULL 555 | , name VARCHAR(10) 556 | , amount INT(11) DEFAULT 0 557 | , PRIMARY KEY (id) 558 | ) 559 | ] 560 | end 561 | 562 | etl.before_etl do |etl| 563 | etl.query "DELETE FROM etl_source WHERE amount < 0" 564 | end 565 | 566 | etl.start do |etl| 567 | 1 568 | end 569 | 570 | etl.step do 571 | 1 572 | end 573 | 574 | etl.stop do |etl| 575 | etl.query( 576 | "SELECT MAX(id) AS the_stop FROM etl_source" 577 | ).to_a.first['the_stop'] 578 | end 579 | 580 | etl.etl do |etl, lbound, ubound| 581 | etl.query %[ 582 | REPLACE INTO etl_destination 583 | SELECT 584 | id 585 | , name 586 | , amount 587 | FROM etl_source s 588 | WHERE s.id >= #{lbound} 589 | AND s.id < #{ubound} 590 | ] 591 | end 592 | 593 | etl.run 594 | 595 | expect( 596 | connection 597 | .query("SELECT * FROM etl_destination ORDER BY id ASC") 598 | .to_a 599 | ).to eq( 600 | [ 601 | {'id' => 1, 'name' => 'Jeff', 'amount' => 100}, 602 | {'id' => 2, 'name' => 'Ryan', 'amount' => 50}, 603 | {'id' => 13, 'name' => 'Jack', 'amount' => 75}, 604 | {'id' => 14, 'name' => 'Jeff', 'amount' => 10}, 605 | {'id' => 15, 'name' => 'Jack', 'amount' => 45}, 606 | {'id' => 17, 'name' => 'Nick', 'amount' => 90} 607 | ] 608 | ) 609 | end 610 | end 611 | 612 | describe "#run over date data" do 613 | let(:connection) { test_connection } 614 | let(:etl) { described_class.new(connection: connection, logger: logger) } 615 | 616 | before do 617 | reset_test_env(connection) do |connection| 618 | connection.query %[ 619 | CREATE TABLE etl_source ( 620 | the_date DATE NOT NULL 621 | , name VARCHAR(10) 622 | , amount INT(11) DEFAULT 0 623 | ) 624 | ] 625 | 626 | 
connection.query %[ 627 | INSERT INTO etl_source (the_date, name, amount) 628 | VALUES 629 | ('2012-01-01', 'Jeff', 100), 630 | ('2012-01-01', 'Ryan', 50), 631 | ('2012-01-01', 'Jack', 75), 632 | ('2012-01-01', 'Jeff', 10), 633 | ('2012-01-02', 'Jack', 45), 634 | ('2012-01-02', 'Nick', -90), 635 | ('2012-01-02', 'Nick', 90) 636 | ] 637 | end 638 | end 639 | 640 | after { connection.close } 641 | 642 | it "executes the specified sql in the appropriate order and ETLs properly" do 643 | etl.ensure_destination do |etl| 644 | etl.query %[ 645 | CREATE TABLE etl_destination ( 646 | the_date DATE NOT NULL 647 | , name VARCHAR(10) 648 | , total_amount INT(11) DEFAULT 0 649 | , PRIMARY KEY (the_date, name) 650 | ) 651 | ] 652 | end 653 | 654 | etl.before_etl do |etl| 655 | etl.query "DELETE FROM etl_source WHERE amount < 0" 656 | end 657 | 658 | etl.start do |etl| 659 | etl.query(%[ 660 | SELECT COALESCE(MAX(the_date), DATE('2012-01-01')) AS the_start 661 | FROM etl_destination 662 | ]).to_a.first['the_start'] 663 | end 664 | 665 | etl.step do 666 | 1.day 667 | end 668 | 669 | etl.stop do |etl| 670 | etl.query( 671 | "SELECT MAX(the_date) AS the_stop FROM etl_source" 672 | ).to_a.first['the_stop'] 673 | end 674 | 675 | etl.etl do |etl, lbound, ubound| 676 | etl.query %[ 677 | REPLACE INTO etl_destination 678 | SELECT 679 | the_date 680 | , name 681 | , SUM(amount) AS total_amount 682 | FROM etl_source s 683 | WHERE s.the_date >= '#{lbound}' 684 | AND s.the_date < '#{ubound}' 685 | GROUP BY 686 | the_date 687 | , name 688 | ] 689 | end 690 | 691 | etl.run 692 | 693 | expect( 694 | connection.query(%[ 695 | SELECT 696 | the_date 697 | , name 698 | , total_amount 699 | FROM 700 | etl_destination 701 | ORDER BY 702 | the_date ASC 703 | , name ASC 704 | ]).to_a 705 | ).to eq( 706 | [ 707 | {'the_date' => Date.parse('2012-01-01'), 'name' => 'Jack', 'total_amount' => 75}, 708 | {'the_date' => Date.parse('2012-01-01'), 'name' => 'Jeff', 'total_amount' => 110}, 709 | {'the_date' => 
Date.parse('2012-01-01'), 'name' => 'Ryan', 'total_amount' => 50}, 710 | {'the_date' => Date.parse('2012-01-02'), 'name' => 'Jack', 'total_amount' => 45}, 711 | {'the_date' => Date.parse('2012-01-02'), 'name' => 'Nick', 'total_amount' => 90} 712 | ] 713 | ) 714 | end 715 | end 716 | 717 | describe "#run over datetime data" do 718 | let(:connection) { test_connection } 719 | let(:etl) { described_class.new(connection: connection, logger: logger) } 720 | 721 | before do 722 | reset_test_env(connection) do |connection| 723 | connection.query %[ 724 | CREATE TABLE etl_source ( 725 | the_datetime DATETIME NOT NULL 726 | , name VARCHAR(10) 727 | , amount INT(11) DEFAULT 0 728 | ) 729 | ] 730 | 731 | connection.query %[ 732 | INSERT INTO etl_source (the_datetime, name, amount) 733 | VALUES 734 | ('2011-12-31 23:59:59', 'Jeff', 100), 735 | ('2012-01-01 00:01:00', 'Ryan', 50), 736 | ('2012-01-01 00:01:01', 'Jack', 75), 737 | ('2012-01-01 00:01:02', 'Jeff', 10), 738 | ('2012-01-02 00:02:00', 'Jack', 45), 739 | ('2012-01-02 00:02:01', 'Nick', -90), 740 | ('2012-01-02 00:02:02', 'Nick', 90) 741 | ] 742 | end 743 | end 744 | 745 | after { connection.close } 746 | 747 | it "executes the specified sql in the appropriate order and ETLs properly" do 748 | etl.ensure_destination do |etl| 749 | etl.query %[ 750 | CREATE TABLE etl_destination ( 751 | the_datetime DATETIME NOT NULL 752 | , name VARCHAR(10) 753 | , amount INT(11) DEFAULT 0 754 | , PRIMARY KEY (the_datetime, name) 755 | ) 756 | ] 757 | end 758 | 759 | etl.before_etl do |etl| 760 | etl.query "DELETE FROM etl_source WHERE amount < 0" 761 | end 762 | 763 | etl.start do |etl| 764 | etl.query(%[ 765 | SELECT CAST(COALESCE(MAX(the_datetime), '2012-01-01 00:00:00') AS DATETIME) AS the_start 766 | FROM etl_destination 767 | ]).to_a.first['the_start'] 768 | end 769 | 770 | etl.step do 771 | 1.minute 772 | end 773 | 774 | etl.stop do |etl| 775 | etl.query( 776 | "SELECT MAX(the_datetime) AS the_stop FROM etl_source" 777 | 
).to_a.first['the_stop'] 778 | end 779 | 780 | etl.etl do |etl, lbound, ubound| 781 | etl.query %[ 782 | REPLACE INTO etl_destination 783 | SELECT 784 | the_datetime 785 | , name 786 | , amount 787 | FROM etl_source s 788 | WHERE s.the_datetime >= '#{lbound}' 789 | AND s.the_datetime < '#{ubound}' 790 | ] 791 | end 792 | 793 | etl.run 794 | 795 | expect( 796 | connection.query(%[ 797 | SELECT 798 | the_datetime 799 | , name 800 | , amount 801 | FROM 802 | etl_destination 803 | ORDER BY 804 | the_datetime ASC 805 | , name ASC 806 | ]).to_a 807 | ).to eq( 808 | [ 809 | {'the_datetime' => Time.parse('2012-01-01 00:01:00'), 'name' => 'Ryan', 'amount' => 50}, 810 | {'the_datetime' => Time.parse('2012-01-01 00:01:01'), 'name' => 'Jack', 'amount' => 75}, 811 | {'the_datetime' => Time.parse('2012-01-01 00:01:02'), 'name' => 'Jeff', 'amount' => 10}, 812 | {'the_datetime' => Time.parse('2012-01-02 00:02:00'), 'name' => 'Jack', 'amount' => 45}, 813 | {'the_datetime' => Time.parse('2012-01-02 00:02:02'), 'name' => 'Nick', 'amount' => 90} 814 | ] 815 | ) 816 | end 817 | end 818 | end 819 | end 820 | --------------------------------------------------------------------------------