├── advanced ├── .rspec ├── data │ └── .gitignore ├── .ruby-version ├── .travis.yml ├── .bundle │ └── config ├── spec │ ├── spec_helper.rb │ ├── time_dimension_builder_spec.rb │ └── date_dimension_builder_spec.rb ├── .gitignore ├── Rakefile ├── etl │ ├── process_all.ebf │ ├── migrations │ │ ├── 003_create_time_dimension.rb │ │ ├── 004_create_commits_table.rb │ │ ├── 002_create_date_dimension.rb │ │ └── 001_create_user_dimension.rb │ ├── prepare_db.ctl │ ├── prepare_date_dimension.ctl │ ├── common.rb │ ├── prepare_time_dimension.ctl │ ├── prepare_user_dimension.ctl │ ├── extract.ctl │ └── import_new_commits.ctl ├── config │ └── database.yml ├── Gemfile ├── LICENSE ├── README.md ├── Gemfile.lock └── lib │ ├── date_dimension_builder.rb │ └── time_dimension_builder.rb └── simple ├── .gitignore ├── .rvmrc ├── etl ├── process_all.ebf ├── migrations │ └── 001_create_customers.rb ├── prepare_db.ctl └── upsert_customers.ctl ├── Gemfile ├── customers.csv ├── config └── database.yml ├── README.md └── Gemfile.lock /advanced/.rspec: -------------------------------------------------------------------------------- 1 | --format nested 2 | --color -------------------------------------------------------------------------------- /advanced/data/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /simple/.gitignore: -------------------------------------------------------------------------------- 1 | source_data 2 | etl.log -------------------------------------------------------------------------------- /advanced/.ruby-version: -------------------------------------------------------------------------------- 1 | ruby-1.9.3-p125@aw-etl-sample 2 | -------------------------------------------------------------------------------- /advanced/.travis.yml: -------------------------------------------------------------------------------- 1 | language: ruby 
# Gem dependencies of the "simple" ETL sample.
#
# The explicit https URL replaces the `:rubygems` symbol shortcut and matches
# the source declared in advanced/Gemfile.
source 'https://rubygems.org'

gem 'activewarehouse-etl', '1.0.0'
gem 'mysql2'
gem 'awesome_print'

group :test do
  gem 'rspec'
end
# Migration creating the `customers` table that upsert_customers.ctl writes to.
# All three columns are plain strings.
class CreateCustomers < ActiveRecord::Migration

  def change
    create_table(:customers, :force => true) do |table|
      [:full_name, :email, :email_provider].each do |column|
        table.string column
      end
    end
  end

end
# Gem dependencies of the "advanced" ETL sample.
source 'https://rubygems.org'

# use our own fork for bulk load support until issue fixed:
# https://github.com/brianmario/mysql2/pull/242
# (fetched over https: GitHub no longer serves the unauthenticated git:// protocol)
gem 'mysql2', :git => 'https://github.com/activewarehouse/mysql2.git'

gem 'activewarehouse-etl', '1.0.0'
gem 'adapter_extensions', '1.0.0'
gem 'awesome_print'

group :test do
  gem 'rspec'
end
:time_dimension do |t| 5 | t.time :sql_time_stamp 6 | 7 | t.integer :hour 8 | t.string :hour_description 9 | t.string :half_hour_description 10 | t.string :day_part 11 | t.string :hour_type 12 | end 13 | end 14 | 15 | def self.down 16 | drop_table :time_dimension 17 | end 18 | 19 | end -------------------------------------------------------------------------------- /simple/README.md: -------------------------------------------------------------------------------- 1 | ## How to run 2 | 3 | * make sure to have mysql installed and running (`brew install mysql`) 4 | * install ruby 1.9.3 (and I suggest RVM if possible) 5 | * edit `config/database.yml` to reflect your mysql setup 6 | * then: 7 | 8 | ``` 9 | bundle install 10 | mysql -u root -p -e "create database aw_etl_simple_etl_execution" 11 | mysql -u root -p -e "create database aw_etl_simple_datawarehouse CHARACTER SET utf8 COLLATE utf8_general_ci" 12 | 13 | bundle exec etl etl/process_all.ebf 14 | ``` 15 | -------------------------------------------------------------------------------- /advanced/etl/prepare_db.ctl: -------------------------------------------------------------------------------- 1 | # For convenience, we'll use ActiveRecord migrations to help us 2 | # build the dimensions and facts tables. You'd be free to use whatever you 3 | # like though (or to rely on an already existing schema) 4 | 5 | pre_process do 6 | migrations_folder = File.expand_path(File.dirname(__FILE__) + '/migrations') 7 | version = ENV["VERSION"] ? 
# Migration creating the `commits` table: one row per commit, identified by
# its sha1, carrying the change statistics (files changed, insertions,
# deletions) and the ids of the related date, time and user dimension rows.
class CreateCommitsTable < ActiveRecord::Migration

  def self.up
    create_table :commits do |t|
      t.string :sha1

      t.integer :files_changed
      t.integer :insertions
      t.integer :deletions

      t.integer :date_id
      t.integer :time_id
      t.integer :user_id
    end

    # uniqueness is requested through the options hash rather than a bare
    # symbol, which add_index only tolerated as a legacy index-type argument
    add_index :commits, :sha1, :unique => true
  end

  def self.down
    drop_table :commits
  end

end
# Control file: reads customers.csv and inserts/updates the rows into the
# `customers` table of the datawarehouse, using the email as primary key.

class Customer < ActiveRecord::Base
end

file = File.expand_path(File.dirname(__FILE__) + '/../customers.csv')

source :input,
  {
    :file => file,
    :parser => :csv,
    :skip_lines => 1
  },
  [
    :first_name,
    :last_name,
    :email
  ]

# the provider is the lower-cased part of the email after the last '@';
# to_s keeps a row without email from raising here, so the value ends up nil
# and is left for the :default transform below
transform(:email_provider) do |n,v,r|
  r[:email].to_s.downcase.split('@').last
end

# presumably replaces a blank provider by "Unknown" - confirm against the
# :default transform of activewarehouse-etl
transform :email_provider, :default,
  :default_value => "Unknown"

transform(:full_name) do |n,v,r|
  [r[:first_name], r[:last_name]].join(' ')
end

# the block returns nil instead of the row for hotmail addresses
before_write do |r|
  r[:email_provider] =~ /hotmail/ ? nil : r
end

destination :out, {
  :type => :insert_update_database,
  :target => :datawarehouse,
  :table => 'customers'
},
{
  :primarykey => [:email],
  :order => [:email, :full_name, :email_provider]
}

screen(:fatal) {
  assert_equal 1, Customer.where(:email => 'john.barry@gmail.com').count
}
# fail-fast process execution
#
# Runs cmd through Kernel#system and raises a RuntimeError naming the command
# unless that call returns a truthy value. Returns nil otherwise.
def system!(cmd)
  return if system(cmd)

  raise "Failed to run command #{cmd}"
end
Run with: 27 | 28 | `rake spec` 29 | 30 | ## Contributors 31 | 32 | * Thibaut Barrère 33 | * Alisson Cavalcante Agiani 34 | 35 | ## Contributing 36 | 37 | Pull requests are most welcome! Get in touch before working on anything significant though. 38 | 39 | ## License 40 | 41 | MIT -------------------------------------------------------------------------------- /simple/Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: http://rubygems.org/ 3 | specs: 4 | activemodel (3.2.6) 5 | activesupport (= 3.2.6) 6 | builder (~> 3.0.0) 7 | activerecord (3.2.6) 8 | activemodel (= 3.2.6) 9 | activesupport (= 3.2.6) 10 | arel (~> 3.0.2) 11 | tzinfo (~> 0.3.29) 12 | activesupport (3.2.6) 13 | i18n (~> 0.6) 14 | multi_json (~> 1.0) 15 | activewarehouse-etl (1.0.0) 16 | activerecord (>= 3.0.0) 17 | activesupport (>= 3.0.0) 18 | adapter_extensions (>= 0.9.5.rc1) 19 | fastercsv (>= 1.2.0) 20 | rake (>= 0.8.3) 21 | adapter_extensions (1.0.0) 22 | activerecord (>= 3.0.0) 23 | activesupport (>= 3.0.0) 24 | rake (>= 0.8.3) 25 | arel (3.0.2) 26 | awesome_print (1.0.2) 27 | builder (3.0.0) 28 | diff-lcs (1.1.3) 29 | fastercsv (1.5.5) 30 | i18n (0.6.0) 31 | multi_json (1.3.6) 32 | mysql2 (0.3.11) 33 | rake (0.9.2.2) 34 | rspec (2.10.0) 35 | rspec-core (~> 2.10.0) 36 | rspec-expectations (~> 2.10.0) 37 | rspec-mocks (~> 2.10.0) 38 | rspec-core (2.10.1) 39 | rspec-expectations (2.10.0) 40 | diff-lcs (~> 1.1.3) 41 | rspec-mocks (2.10.1) 42 | tzinfo (0.3.33) 43 | 44 | PLATFORMS 45 | ruby 46 | 47 | DEPENDENCIES 48 | activewarehouse-etl (= 1.0.0) 49 | awesome_print 50 | mysql2 51 | rspec 52 | -------------------------------------------------------------------------------- /advanced/etl/prepare_time_dimension.ctl: -------------------------------------------------------------------------------- 1 | require File.expand_path(File.dirname(__FILE__) + '/common') 2 | 3 | # you will notice this file is very, very similar to
prepare_date_dimension 4 | # so similar you can create a ruby method into common.rb or another file 5 | # to DRY things - it will definitely work. 6 | 7 | table = TimeDimension.table_name 8 | bulk_load_file = "#{table}.txt" 9 | 10 | pre_process :truncate, :target => :datawarehouse, :table => table 11 | 12 | records = TimeDimensionBuilder.new("0:00", "23:59").build 13 | 14 | source :in, { 15 | :type => :enumerable, 16 | :enumerable => records, 17 | :store_locally => false 18 | } 19 | 20 | # pick the first record to extract the column names 21 | columns = records.first.keys 22 | 23 | 24 | # write only the new records to a raw file prior to bulk loading 25 | destination :out, { :file => bulk_load_file }, { :order => columns } 26 | 27 | # then bulk-load the resulting file to the database 28 | post_process :bulk_import, { 29 | :file => bulk_load_file, 30 | :columns => columns, 31 | :target => :datawarehouse, :table => table 32 | } 33 | 34 | after_post_process_screen(:fatal) do 35 | assert_equal 60*24, TimeDimension.count 36 | 37 | # ensure we keep constant ids despite the truncating 38 | assert_equal '00:00', TimeDimension.find(1).sql_time_stamp.strftime("%H:%M") 39 | assert_equal '23:59', TimeDimension.find(60*24).sql_time_stamp.strftime("%H:%M") 40 | end -------------------------------------------------------------------------------- /advanced/Gemfile.lock: -------------------------------------------------------------------------------- 1 | GIT 2 | remote: git://github.com/activewarehouse/mysql2.git 3 | revision: 67a5e63804648c4fb98a0046eb31c85509021fe6 4 | specs: 5 | mysql2 (0.3.11) 6 | 7 | GEM 8 | remote: https://rubygems.org/ 9 | specs: 10 | activemodel (3.2.6) 11 | activesupport (= 3.2.6) 12 | builder (~> 3.0.0) 13 | activerecord (3.2.6) 14 | activemodel (= 3.2.6) 15 | activesupport (= 3.2.6) 16 | arel (~> 3.0.2) 17 | tzinfo (~> 0.3.29) 18 | activesupport (3.2.6) 19 | i18n (~> 0.6) 20 | multi_json (~> 1.0) 21 | activewarehouse-etl (1.0.0) 22 | activerecord (>= 
require 'date'

# Builds the records of a date dimension: one hash per calendar day between
# start_date and end_date (both included).
class DateDimensionBuilder
  # Specify the start date for the first record
  attr_accessor :start_date

  # Specify the end date for the last record
  attr_accessor :end_date

  # Initialize the builder.
  #
  # * start_date: The start date, as a Date or as a String understood by Date.parse.
  # * end_date: The end date, as a Date or as a String understood by Date.parse.
  def initialize(start_date, end_date)
    @start_date = parse_date(start_date)
    @end_date = parse_date(end_date)
  end

  # Returns an array of hashes representing records in the dimension. The values for each record are
  # accessed by name. The array is empty when start_date is after end_date.
  def build
    (start_date..end_date).map { |date| build_record(date) }
  end

  private

  # Strings (including instances of String subclasses) are parsed; any other value is used as is.
  def parse_date(value)
    value.is_a?(String) ? Date.parse(value) : value
  end

  # Builds the record of a single day. The order of the keys is part of the
  # contract: callers may derive their column order from records.first.keys.
  def build_record(date)
    year = date.year.to_s
    # compute quarter ourselves - available in Time but not in Date - anything better ?
    quarter = 1 + (date.month - 1) / 3

    {
      :date => date.strftime("%Y-%m-%d"),
      :month => Date::MONTHNAMES[date.month].downcase,
      :day_of_week => Date::DAYNAMES[date.wday].downcase,
      :day_of_week_as_number => date.wday,
      :year => year,
      :year_and_month => year + "-" + date.month.to_s.rjust(2, '0'),
      :sql_date_stamp => date,
      :week => "week #{date.to_date.cweek}",
      :quarter => "Q#{quarter}",
      # Q1 and Q2 map to S1, Q3 and Q4 map to S2 (integer division)
      :semester => "S#{(quarter + 1) / 2}"
    }
  end
end
# Control file: fills the user dimension with the commit authors found in
# git-commits.csv, plus one 'unknown' user, bulk-loading only the names that
# are not in the table yet.

require File.expand_path(File.dirname(__FILE__) + '/common')

table = UserDimension.table_name

# One can enable debug logging with:
#   ETL::Engine.logger = Logger.new(STDOUT)
#   ETL::Engine.logger.level = Logger::DEBUG
#
# And display rows in a processor using:
#   after_read do |row| ap row; row; end

# first source to create an 'unknown' user
source :unknown_user, :type => :enumerable,
  :enumerable => [{:author_name => UNKNOWN_USER_NAME}]

# read the users csv file (options are passed to CSV/FasterCSV)
source :git_users,
  :file => File.expand_path(File.join(DATA_FOLDER, 'git-commits.csv')),
  :skip_lines => 1, :parser => :csv

# in RAM unicity check - duplicate rows will be removed from the pipeline
# NOTE(review): the check is keyed on :author_name and runs before the :name
# transform below, so rows with a blank author_name but different emails
# presumably collapse into a single row - confirm this is intended
after_read :check_unique, :keys => [:author_name]

# use the email as name in case no name is provided
transform(:name) do |key, value, row|
  row[:author_name].blank? ? row[:author_email] : row[:author_name]
end

# remove rows that are already in the destination database
before_write :check_exist, :target => :datawarehouse, :table => table, :columns => [:name]

# here we'll just define a constant to be reused down there
bulk_load_file = File.expand_path(File.join(DATA_FOLDER, 'new_git_users.txt'))

# write only the new records to a raw file prior to bulk loading
destination :out, { :file => bulk_load_file }, { :order => [:name] }

# before the post-process, we have an opportunity to check the data
screen(:fatal) {
  IO.foreach(bulk_load_file) do |line|
    assert line.strip.size > 0, "Empty line detected in #{bulk_load_file}! This isn't expected."
  end
}

# then bulk-load the resulting file to the database
post_process :bulk_import, {
  :file => bulk_load_file,
  :columns => [:name],
  :target => :datawarehouse, :table => table
}

# after post-processes, we have another opportunity to check the data
after_post_process_screen(:fatal) {
  assert_equal 1, UserDimension.where(:name => 'Yehuda Katz').count, "More than 1 user named Yehuda Katz"
  assert_equal 0, UserDimension.where(:name => '').count, "No user should have an empty name"
  assert_equal 1, UserDimension.where(:name => 'José Valim').count, "José Valim not found"
  # look the row up with the same constant that created it in the first source
  assert_equal 1, UserDimension.where(:name => UNKNOWN_USER_NAME).count, "Unknown user not found"
}
9 | result 10 | end 11 | 12 | def day(from, key=nil) 13 | days(from, from, key).first 14 | end 15 | 16 | it "should return the expected number of days" do 17 | days('2007-01-01', '2007-12-31').size.should eql(365) 18 | end 19 | 20 | it "should return :date formatted as yyyy-mm-dd" do 21 | day('2007-12-31', :date).should eql('2007-12-31') 22 | end 23 | 24 | it "should return :month in english" do 25 | (1..12).map { |month| day("2007-#{month}-01", :month) }.should eql( 26 | %w(january february march april may june july august september october november december) 27 | ) 28 | end 29 | 30 | it "should return :year on four digits" do 31 | days('1959-1-1', '1959-12-31', :year).uniq.should eql(%w(1959)) 32 | end 33 | 34 | it "should return :year_and_month in a sortable fashion" do 35 | day('2008-02-03', :year_and_month).should eql('2008-02') 36 | day('2008-12-03', :year_and_month).should eql('2008-12') 37 | end 38 | 39 | it "should return all the days of week in english" do 40 | days('2007-12-01', '2007-12-07', :day_of_week).should eql(%w(saturday sunday monday tuesday wednesday thursday friday)) 41 | end 42 | 43 | it "should return all the days of week as numbers starting by sunday as 0" do 44 | days('2007-12-01', '2007-12-07', :day_of_week_as_number).should eql([6, 0, 1, 2, 3, 4, 5]) 45 | end 46 | 47 | it "should return a sql_date_stamp" do 48 | day('1991-1-1', :sql_date_stamp).should eql(Date.parse("1991-1-1")) 49 | end 50 | 51 | it "should return :quarter" do 52 | day('2007-12-01', :quarter).should eql('Q4') 53 | day('2007-06-30', :quarter).should eql('Q2') 54 | end 55 | 56 | it "should implement quarter properly - it's home baked after all" do 57 | (1..12).map { |month| day("2007-#{month}-01", :quarter) }.should eql(%w(Q1 Q1 Q1 Q2 Q2 Q2 Q3 Q3 Q3 Q4 Q4 Q4)) 58 | end 59 | 60 | it "should return :semester" do 61 | day('2007-7-1', :semester).should eql('S2') 62 | day('2007-06-30', :semester).should eql('S1') 63 | day('2007-12-31', :semester).should eql('S2') 64 | end 65 | 
require 'time'

# Builds the records of a time-of-day dimension: one hash per minute between
# start_time and end_time (both included).
class TimeDimensionBuilder
  # Specify the start time for the first record
  attr_accessor :start_time

  # Specify the end time for the last record
  attr_accessor :end_time

  # Avoid generating bad data on daylight change. If you are on a daylight changing day (ie: 2009/03/29) you'll get this:
  # >> Time.parse('1:59')+60
  # => Sun Mar 29 03:00:00 0200 2009 => 3:00 is just after 1:59
  # Instead, append a day that hasn't got daylight change (2009/03/01)
  # >> Time.parse('2009/03/01 1:59')+60
  # => Sun Mar 01 02:00:00 0100 2009 => 2:00 is just after 1:59
  DAY_WITHOUT_DAYLIGHT_CHANGE = "2009/03/01 "

  # strftime pattern used by the hour and half-hour descriptions (e.g. "05:30 pm")
  HOUR_FORMAT = "%I:%M %P"

  # Initialize the builder.
  #
  # * start_time: The start time, as a Time or as a String such as "0:00".
  # * end_time: The end time, as a Time or as a String such as "23:59".
  def initialize(start_time, end_time)
    @start_time = parse_time(start_time)
    @end_time = parse_time(end_time)
  end

  # Returns an array of hashes representing records in the dimension. The values for each record are
  # accessed by name. The options argument is accepted but not used.
  def build(options={})
    records = []
    time = start_time
    while time <= end_time
      records << build_record(time)
      time += 60 # next minute
    end
    records
  end

  private

  # Strings (including instances of String subclasses) are parsed on the
  # daylight-change-free day; any other value is used as is.
  def parse_time(value)
    value.is_a?(String) ? Time.parse(DAY_WITHOUT_DAYLIGHT_CHANGE + value) : value
  end

  # Builds the record of a single minute. The order of the keys is part of the
  # contract: callers may derive their column order from records.first.keys.
  def build_record(time)
    hour_start = at_minute(time, 0)
    half_hour_start = at_minute(time, 30 * (time.min / 30)) # round down to 0 or 30 minutes

    {
      :sql_time_stamp => time.strftime('%H:%M'),
      :hour => time.hour,
      :hour_description => describe_span(hour_start, 59),
      :half_hour_description => describe_span(half_hour_start, 29),
      :hour_type => hour_type(time.hour),
      :day_part => day_part(time.hour)
    }
  end

  # Returns a local Time with the same fields as time, except for the minutes.
  def at_minute(time, minute)
    fields = time.to_a
    fields[1] = minute # index 1 of Time#to_a holds the minutes
    Time.local(*fields)
  end

  # Describes the span going from from to its last minute, i.e. from + minutes.
  def describe_span(from, minutes)
    "between #{from.strftime(HOUR_FORMAT)} and #{(from + minutes * 60).strftime(HOUR_FORMAT)}"
  end

  def hour_type(hour)
    case hour
    when 9..16 then "opening hours"
    else "non opening hours"
    end
  end

  def day_part(hour)
    case hour
    when 8..11 then "morning"
    when 12..18 then "afternoon"
    when 19..21 then "evening"
    else "night"
    end
  end

end
"git clone https://github.com/rails/rails #{GIT_RAILS_REPO}" 8 | end 9 | end 10 | 11 | def to_git_char(char) 12 | "%x" + char.ord.to_s(16).rjust(2, "0") 13 | end 14 | 15 | git_commits_file = File.expand_path(File.join(DATA_FOLDER, 'git-commits.csv')) 16 | 17 | pre_process do 18 | commit_fields = { 19 | 'H' => :commit_hash, 20 | 'an' => :author_name, 21 | 'ae' => :author_email, 22 | 'ai' => :author_date 23 | }.to_a 24 | 25 | col_sep = "\t" 26 | 27 | git_fields = commit_fields.map(&:first).map { |e| "%#{e}" } 28 | git_fields = git_fields.join(to_git_char(col_sep)) 29 | csv_fields = commit_fields.map(&:last) + [:files_changed, :insertions, :deletions] 30 | 31 | CSV.open(git_commits_file, 'w') do |output| 32 | output << csv_fields 33 | 34 | cmd = "cd #{GIT_RAILS_REPO} && git log --shortstat --reverse --pretty=format:\"#{git_fields}\"" 35 | 36 | buffer = [] 37 | 38 | IO.popen(cmd).each_line do |line| 39 | case line 40 | when /^[0-9a-f]{40}/; 41 | buffer << 0 << 0 << 0 unless buffer.size == csv_fields.size || buffer.empty? 42 | output << buffer unless buffer.empty? 43 | buffer = line.strip.split(col_sep) 44 | when /(\d+) files? changed, (\d+) insertions?\(\+\), (\d+) deletions?\(\-\)/; 45 | buffer << $1 << $2 << $3 46 | when /(\d+) files? changed, (\d+) insertions?\(\+\)/; 47 | buffer << $1 << $2 << '0' 48 | when /(\d+) files? changed, (\d+) deletions?\(\-\)/; 49 | buffer << $1 << '0' << $2 50 | when /(\d+) files? changed/; 51 | buffer << $1 << '0' << '0' 52 | when "\n"; 53 | else raise "Failed to parse line #{line.inspect}" 54 | end 55 | end 56 | 57 | buffer << 0 << 0 << 0 unless buffer.size == csv_fields.size || buffer.empty? 58 | output << buffer unless buffer.empty? 
59 | end 60 | 61 | end 62 | 63 | screen(:fatal) do 64 | assert_equal %w(commit_hash author_name author_email author_date files_changed insertions deletions), 65 | CSV.parse_line(IO.popen("head -1 #{git_commits_file}").read) 66 | 67 | assert_equal [ 68 | "20d7d2415f99620590aec07cedcaace34cced1c6", 69 | "Xavier Noria", 70 | "fxn@hashref.com", 71 | "2011-06-18 10:14:32 +0200", 72 | "2", 73 | "3", 74 | "3" 75 | ], CSV.parse_line(IO.popen("grep 20d7d2415f99620590aec07cedcaace34cced1c6 #{git_commits_file}").read) 76 | 77 | # when no stat is provided, 0/0/0 should still be stored 78 | row_with_no_file_changes = CSV.parse_line(IO.popen("grep b3f45195aa8a35277c3f998917312797936a1f4e #{git_commits_file}").read) 79 | 80 | assert_equal [ 81 | "b3f45195aa8a35277c3f998917312797936a1f4e", 82 | "0", 83 | "0", 84 | "0" 85 | ], [ 86 | row_with_no_file_changes[0], 87 | row_with_no_file_changes[-1], 88 | row_with_no_file_changes[-2], 89 | row_with_no_file_changes[-3], 90 | ] 91 | end 92 | -------------------------------------------------------------------------------- /advanced/etl/import_new_commits.ctl: -------------------------------------------------------------------------------- 1 | require File.expand_path(File.dirname(__FILE__) + '/common') 2 | 3 | table = Commit.table_name 4 | source_fields = [:commit_hash, :author_name, :author_date, :files_changed, :insertions, :deletions] 5 | target_fields = [:sha1, :user_id, :date_id, :time_id, :files_changed, :insertions, :deletions] 6 | 7 | source :git_commits, 8 | :file => File.expand_path(File.join(DATA_FOLDER, 'git-commits.csv')), 9 | :skip_lines => 1, :parser => :csv 10 | 11 | # Ensure the fields we rely on are here on each row 12 | after_read :ensure_fields_presence, { :fields => source_fields } 13 | 14 | rename :commit_hash, :sha1 15 | 16 | # in RAM unicity check - duplicate rows will be removed from the pipeline 17 | after_read :check_unique, :keys => [:sha1] 18 | 19 | # rename is an after_read with some sugar 20 | rename 
# Spot-checks one known commit after the bulk import; any failure is fatal.
after_post_process_screen(:fatal) do
  sample_sha = '7e56bf724479ce92eff2f806573f382957f3a2b4'
  imported = Commit.where(:sha1 => sample_sha).first
  assert_not_nil imported, "missing expected commit 7e56bf72"
  assert_equal "Xavier Noria", imported.user.name

  # The author_date of this commit is 2011-06-24 23:27:40 +0200. The expected
  # values below are the date and time as written in that string.
  # NOTE(review): an earlier comment described them as UTC, which would be
  # 21:27 - confirm the intended timezone handling.
  date_stamp = imported.date.sql_date_stamp
  assert_equal "2011-06-24", date_stamp.to_s
  # rounded to the minute in our case
  time_stamp = imported.time.sql_time_stamp
  assert_equal "23:27:00", time_stamp.strftime('%H:%M:%S')

  # expected --shortstat figures for the sample commit
  expected_stats = { :files_changed => 1, :insertions => 2, :deletions => 0 }
  expected_stats.each do |column, expected|
    assert_equal expected, imported.public_send(column)
  end
end