├── advanced
│   ├── .rspec
│   ├── data
│   │   └── .gitignore
│   ├── .ruby-version
│   ├── .travis.yml
│   ├── .bundle
│   │   └── config
│   ├── spec
│   │   ├── spec_helper.rb
│   │   ├── time_dimension_builder_spec.rb
│   │   └── date_dimension_builder_spec.rb
│   ├── .gitignore
│   ├── Rakefile
│   ├── etl
│   │   ├── process_all.ebf
│   │   ├── migrations
│   │   │   ├── 003_create_time_dimension.rb
│   │   │   ├── 004_create_commits_table.rb
│   │   │   ├── 002_create_date_dimension.rb
│   │   │   └── 001_create_user_dimension.rb
│   │   ├── prepare_db.ctl
│   │   ├── prepare_date_dimension.ctl
│   │   ├── common.rb
│   │   ├── prepare_time_dimension.ctl
│   │   ├── prepare_user_dimension.ctl
│   │   ├── extract.ctl
│   │   └── import_new_commits.ctl
│   ├── config
│   │   └── database.yml
│   ├── Gemfile
│   ├── LICENSE
│   ├── README.md
│   ├── Gemfile.lock
│   └── lib
│       ├── date_dimension_builder.rb
│       └── time_dimension_builder.rb
└── simple
    ├── .gitignore
    ├── .rvmrc
    ├── etl
    │   ├── process_all.ebf
    │   ├── migrations
    │   │   └── 001_create_customers.rb
    │   ├── prepare_db.ctl
    │   └── upsert_customers.ctl
    ├── Gemfile
    ├── customers.csv
    ├── config
    │   └── database.yml
    ├── README.md
    └── Gemfile.lock
/advanced/.rspec:
--------------------------------------------------------------------------------
1 | --format nested
2 | --color
--------------------------------------------------------------------------------
/advanced/data/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
--------------------------------------------------------------------------------
/simple/.gitignore:
--------------------------------------------------------------------------------
1 | source_data
2 | etl.log
--------------------------------------------------------------------------------
/advanced/.ruby-version:
--------------------------------------------------------------------------------
1 | ruby-1.9.3-p125@aw-etl-sample
2 |
--------------------------------------------------------------------------------
/advanced/.travis.yml:
--------------------------------------------------------------------------------
1 | language: ruby
2 | rvm:
3 | - 1.9.3
--------------------------------------------------------------------------------
/advanced/.bundle/config:
--------------------------------------------------------------------------------
1 | ---
2 | BUNDLE_WITHOUT: production
3 |
--------------------------------------------------------------------------------
/advanced/spec/spec_helper.rb:
--------------------------------------------------------------------------------
1 | $: << File.dirname(__FILE__) + '/../lib'
--------------------------------------------------------------------------------
/simple/.rvmrc:
--------------------------------------------------------------------------------
1 | rvm --create use ruby-1.9.3-p125@aw-etl-sample
2 |
--------------------------------------------------------------------------------
/simple/etl/process_all.ebf:
--------------------------------------------------------------------------------
1 | run 'prepare_db.ctl'
2 | run 'upsert_customers.ctl'
--------------------------------------------------------------------------------
/advanced/.gitignore:
--------------------------------------------------------------------------------
1 | data/*
2 | etl.log
3 | source_data
4 | .DS_Store
5 | tmp
6 | *.txt
--------------------------------------------------------------------------------
/simple/Gemfile:
--------------------------------------------------------------------------------
1 | source :rubygems
2 |
3 | gem 'activewarehouse-etl', '1.0.0'
4 | gem 'mysql2'
5 | gem 'awesome_print'
6 |
7 | group :test do
8 | gem 'rspec'
9 | end
10 |
--------------------------------------------------------------------------------
/advanced/Rakefile:
--------------------------------------------------------------------------------
1 | require 'bundler'
2 | Bundler.setup
3 |
4 | require 'rspec/core/rake_task'
5 |
6 | desc "Run all examples"
7 | RSpec::Core::RakeTask.new
8 |
9 | task :default => :spec
--------------------------------------------------------------------------------
/simple/customers.csv:
--------------------------------------------------------------------------------
1 | first_name,last_name,email
2 | John,Barry,john.barry@gmail.com
3 | Jonathon,More,jon@coldcut.com
4 | Matt,Black,matt@coldcut.com
5 | Marlena,Shaw,marlena.shaw@gmail.com
6 | Neil,Young,neil.young@hotmail.com
--------------------------------------------------------------------------------
/advanced/etl/process_all.ebf:
--------------------------------------------------------------------------------
1 |
2 | run 'extract.ctl'
3 |
4 | run 'prepare_db.ctl'
5 |
6 | run 'prepare_date_dimension.ctl'
7 | run 'prepare_time_dimension.ctl'
8 | run 'prepare_user_dimension.ctl'
9 |
10 | run 'import_new_commits.ctl'
--------------------------------------------------------------------------------
/simple/etl/migrations/001_create_customers.rb:
--------------------------------------------------------------------------------
1 | class CreateCustomers < ActiveRecord::Migration
2 |
3 | def change
4 | create_table :customers, :force => true do |t|
5 | t.string :full_name
6 | t.string :email
7 | t.string :email_provider
8 | end
9 | end
10 |
11 | end
--------------------------------------------------------------------------------
/simple/etl/prepare_db.ctl:
--------------------------------------------------------------------------------
1 | pre_process do
2 | migrations_folder = File.expand_path(File.dirname(__FILE__) + '/migrations')
3 | version = ENV["VERSION"] ? ENV["VERSION"].to_i : nil
4 |
5 | ActiveRecord::Base.establish_connection(:datawarehouse)
6 | ActiveRecord::Migrator.migrate(migrations_folder, version)
7 | end
--------------------------------------------------------------------------------
/simple/config/database.yml:
--------------------------------------------------------------------------------
1 | common: &common
2 | adapter: mysql2
3 | username: root
4 | host: localhost
5 |
6 | etl_execution:
7 | <<: *common
8 | database: aw_etl_simple_etl_execution
9 |
10 | datawarehouse:
11 | <<: *common
12 | database: aw_etl_simple_datawarehouse
13 | encoding: utf8
14 | local_infile: true
--------------------------------------------------------------------------------
/advanced/config/database.yml:
--------------------------------------------------------------------------------
1 | common: &common
2 | adapter: mysql2
3 | username: root
4 | host: localhost
5 |
6 | etl_execution:
7 | <<: *common
8 | database: aw_etl_sample_etl_execution
9 |
10 | datawarehouse:
11 | <<: *common
12 | database: aw_etl_sample_datawarehouse
13 | encoding: utf8
14 | local_infile: true
--------------------------------------------------------------------------------
/advanced/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 |
3 | # use our own fork for bulk load support until the upstream issue is fixed:
4 | # https://github.com/brianmario/mysql2/pull/242
5 | gem 'mysql2', :git => 'git://github.com/activewarehouse/mysql2.git'
6 |
7 | gem 'activewarehouse-etl', '1.0.0'
8 | gem 'adapter_extensions', '1.0.0'
9 | gem 'awesome_print'
10 |
11 | group :test do
12 | gem 'rspec'
13 | end
14 |
--------------------------------------------------------------------------------
/advanced/etl/migrations/003_create_time_dimension.rb:
--------------------------------------------------------------------------------
1 | class CreateTimeDimension < ActiveRecord::Migration
2 |
3 | def self.up
4 | create_table :time_dimension do |t|
5 | t.time :sql_time_stamp
6 |
7 | t.integer :hour
8 | t.string :hour_description
9 | t.string :half_hour_description
10 | t.string :day_part
11 | t.string :hour_type
12 | end
13 | end
14 |
15 | def self.down
16 | drop_table :time_dimension
17 | end
18 |
19 | end
--------------------------------------------------------------------------------
/simple/README.md:
--------------------------------------------------------------------------------
1 | ## How to run
2 |
3 | * make sure MySQL is installed and running (`brew install mysql`)
4 | * install Ruby 1.9.3 (I suggest using RVM)
5 | * edit `config/database.yml` to reflect your MySQL setup
6 | * then:
7 |
8 | ```
9 | bundle install
10 | mysql -u root -p -e "create database aw_etl_simple_etl_execution"
11 | mysql -u root -p -e "create database aw_etl_simple_datawarehouse CHARACTER SET utf8 COLLATE utf8_general_ci"
12 |
13 | bundle exec etl etl/process_all.ebf
14 | ```
15 |
--------------------------------------------------------------------------------
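
A quick way to check the result of the simple run from Ruby, reusing the project's `config/database.yml` (a minimal sketch; adjust credentials to your setup and run it from the `simple/` folder, e.g. with `bundle exec irb`):

```ruby
# sanity check after `bundle exec etl etl/process_all.ebf` (sketch only)
require 'yaml'
require 'active_record'

ActiveRecord::Base.establish_connection(YAML.load_file('config/database.yml')['datawarehouse'])

class Customer < ActiveRecord::Base; end

Customer.count
# => 4 -- the hotmail row from customers.csv is dropped by the before_write filter
Customer.find_by_email('john.barry@gmail.com').email_provider
# => "gmail.com"
```
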
/advanced/etl/prepare_db.ctl:
--------------------------------------------------------------------------------
1 | # For convenience, we'll use ActiveRecord migrations to help us
2 | # build the dimension and fact tables. You're free to use whatever you
3 | # like, though, or to rely on an already existing schema (a plain-SQL sketch follows this file)
4 |
5 | pre_process do
6 | migrations_folder = File.expand_path(File.dirname(__FILE__) + '/migrations')
7 | version = ENV["VERSION"] ? ENV["VERSION"].to_i : nil
8 |
9 | ActiveRecord::Base.establish_connection(:datawarehouse)
10 | ActiveRecord::Migrator.migrate(migrations_folder, version)
11 | end
--------------------------------------------------------------------------------
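
As the comment above notes, migrations are only a convenience. A minimal sketch of the plain-SQL alternative it mentions, reusing only the connection call shown in the file (the table definition below is illustrative, not part of the project):

```ruby
# alternative to the migrations-based prepare_db.ctl: create the schema with raw
# SQL in a pre_process block (sketch only; columns shown are hypothetical)
pre_process do
  ActiveRecord::Base.establish_connection(:datawarehouse)
  ActiveRecord::Base.connection.execute <<-SQL
    CREATE TABLE IF NOT EXISTS user_dimension (
      id INT AUTO_INCREMENT PRIMARY KEY,
      name VARCHAR(255) COLLATE utf8_bin NOT NULL
    )
  SQL
end
```
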
/advanced/etl/migrations/004_create_commits_table.rb:
--------------------------------------------------------------------------------
1 | class CreateCommitsTable < ActiveRecord::Migration
2 |
3 | def self.up
4 | create_table :commits do |t|
5 | t.string :sha1
6 |
7 | t.integer :files_changed
8 | t.integer :insertions
9 | t.integer :deletions
10 |
11 | t.integer :date_id
12 | t.integer :time_id
13 | t.integer :user_id
14 | end
15 |
16 | add_index :commits, :sha1, :unique => true
17 | end
18 |
19 | def self.down
20 | drop_table :commits
21 | end
22 |
23 | end
--------------------------------------------------------------------------------
/advanced/etl/migrations/002_create_date_dimension.rb:
--------------------------------------------------------------------------------
1 | class CreateDateDimension < ActiveRecord::Migration
2 |
3 | def self.up
4 | create_table :date_dimension do |t|
5 | t.string :date
6 | t.string :month
7 | t.string :day_of_week
8 | t.integer :day_of_week_as_number
9 | t.string :year
10 | t.string :year_and_month
11 | t.date :sql_date_stamp
12 | t.string :week
13 | t.string :quarter
14 | t.string :semester
15 | end
16 | end
17 |
18 | def self.down
19 | drop_table :date_dimension
20 | end
21 |
22 | end
--------------------------------------------------------------------------------
/advanced/etl/migrations/001_create_user_dimension.rb:
--------------------------------------------------------------------------------
1 | class CreateUserDimension < ActiveRecord::Migration
2 |
3 | def change
4 | create_table :user_dimension, :force => true do |t|
5 | t.string :name, :null => false
6 | end
7 |
8 | # work-around: mysql bulk load chokes on "Adam" vs "adam" because it considers them duplicates
9 | # also: we use self.execute because of https://github.com/activewarehouse/activewarehouse-etl/issues/70
10 | self.execute %{ALTER TABLE user_dimension MODIFY name varchar(255) COLLATE utf8_bin NOT NULL}
11 |
12 | add_index :user_dimension, :name, :unique
13 | end
14 |
15 | end
--------------------------------------------------------------------------------
/simple/etl/upsert_customers.ctl:
--------------------------------------------------------------------------------
1 | class Customer < ActiveRecord::Base
2 | end
3 |
4 | file = File.expand_path(File.dirname(__FILE__) + '/../customers.csv')
5 |
6 | source :input,
7 | {
8 | :file => file,
9 | :parser => :csv,
10 | :skip_lines => 1
11 | },
12 | [
13 | :first_name,
14 | :last_name,
15 | :email
16 | ]
17 |
18 | transform(:email_provider) do |n,v,r|
19 | r[:email].downcase.split('@').last
20 | end
21 |
22 | transform :email_provider, :default,
23 | :default_value => "Unknown"
24 |
25 | transform(:full_name) do |n,v,r|
26 | [r[:first_name], r[:last_name]].join(' ')
27 | end
28 |
29 | before_write do |r|
30 | r[:email_provider] =~ /hotmail/ ? nil : r
31 | end
32 |
33 | destination :out, {
34 | :type => :insert_update_database,
35 | :target => :datawarehouse,
36 | :table => 'customers'
37 | },
38 | {
39 | :primarykey => [:email],
40 | :order => [:email, :full_name, :email_provider]
41 | }
42 |
43 | screen(:fatal) {
44 | assert_equal 1, Customer.where(:email => 'john.barry@gmail.com').count
45 | }
--------------------------------------------------------------------------------
/advanced/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2011-2012 Thibaut Barrère
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
4 | associated documentation files (the "Software"), to deal in the Software without restriction, including
5 | without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
6 | copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the
7 | following conditions:
8 |
9 | The above copyright notice and this permission notice shall be included in all copies or substantial
10 | portions of the Software.
11 |
12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
13 | LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
14 | NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
15 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
16 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/advanced/etl/prepare_date_dimension.ctl:
--------------------------------------------------------------------------------
1 | require File.expand_path(File.dirname(__FILE__) + '/common')
2 |
3 | table = DateDimension.table_name
4 | bulk_load_file = "#{table}.txt"
5 | start_date = Date.parse('2000-01-01')
6 | end_date = Date.parse('2020-01-01')
7 |
8 | pre_process :truncate, :target => :datawarehouse, :table => table
9 |
10 | records = DateDimensionBuilder.new(start_date, end_date).build
11 |
12 | source :in, {
13 | :type => :enumerable,
14 | :enumerable => records,
15 | :store_locally => false
16 | }
17 |
18 | # pick the first record to extract the column names
19 | columns = records.first.keys
20 |
21 | # write the records to a raw file prior to bulk loading (the table was just truncated)
22 | destination :out, { :file => bulk_load_file }, { :order => columns }
23 |
24 | # then bulk-load the resulting file to the database
25 | post_process :bulk_import, {
26 | :file => bulk_load_file,
27 | :columns => columns,
28 | :target => :datawarehouse, :table => table
29 | }
30 |
31 | after_post_process_screen(:fatal) do
32 | assert_equal end_date - start_date + 1, DateDimension.count
33 |
34 | # ensure we keep constant ids despite the truncating
35 | assert_equal start_date, DateDimension.find(1).sql_date_stamp
36 | end
--------------------------------------------------------------------------------
/advanced/etl/common.rb:
--------------------------------------------------------------------------------
1 | require 'awesome_print'
2 |
3 | $: << File.dirname(__FILE__) + '/../lib'
4 |
5 | require 'date_dimension_builder'
6 | require 'time_dimension_builder'
7 |
8 | DATA_FOLDER = File.dirname(__FILE__) + '/../data'
9 |
10 | GIT_RAILS_REPO = File.join(DATA_FOLDER, 'git_rails')
11 |
12 | UNKNOWN_USER_NAME = 'Unknown user'
13 |
14 | # fail-fast process execution
15 | def system!(cmd)
16 | raise "Failed to run command #{cmd}" unless system(cmd)
17 | end
18 |
19 | # define a couple of ActiveRecord models for later re-use (not mandatory)
20 |
21 | class UserDimension < ActiveRecord::Base; end
22 | UserDimension.table_name = 'user_dimension'
23 |
24 | class TimeDimension < ActiveRecord::Base; end
25 | TimeDimension.table_name = 'time_dimension'
26 |
27 | class DateDimension < ActiveRecord::Base; end
28 | DateDimension.table_name = 'date_dimension'
29 |
30 | class Commit < ActiveRecord::Base
31 | # TODO - rename as :author
32 | belongs_to :user, :class_name => 'UserDimension'
33 | belongs_to :date, :class_name => 'DateDimension'
34 | belongs_to :time, :class_name => 'TimeDimension'
35 | end
36 |
37 | # TODO - figure out why this is required here
38 | ActiveRecord::Base.establish_connection(:datawarehouse)
39 |
--------------------------------------------------------------------------------
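
Once a full `process_all.ebf` run has completed, the models defined above make it easy to poke at the resulting star schema, for example from irb (a small sketch, assuming the same `:datawarehouse` connection as in `common.rb`):

```ruby
# sketch: exploring the warehouse with the models defined in common.rb
commit = Commit.order(:id).last
commit.user.name            # author, resolved through user_dimension
commit.date.sql_date_stamp  # commit day, resolved through date_dimension
commit.time.sql_time_stamp  # commit time (minute precision), through time_dimension

UserDimension.count         # distinct authors, plus the 'Unknown user' row
Commit.where(:user_id => UserDimension.find_by_name('Xavier Noria').id).count
```
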
/advanced/README.md:
--------------------------------------------------------------------------------
1 | # ActiveWarehouse-ETL sample
2 |
3 | This is a sample ETL project built on a very small data set (the Rails git commit log) for educational purposes.
4 |
5 | [Build status](http://travis-ci.org/activewarehouse/activewarehouse-etl-sample)
6 |
7 | ## How to run
8 |
9 | More explanations will be added here later. In the meantime:
10 |
11 | * make sure MySQL is installed and running (`brew install mysql`)
12 | * install Ruby 1.9.3 (I suggest using RVM)
13 | * edit `config/database.yml` to reflect your MySQL setup
14 | * then:
15 |
16 | ```
17 | bundle install
18 | mysql -u root -p -e "create database aw_etl_sample_etl_execution"
19 | mysql -u root -p -e "create database aw_etl_sample_datawarehouse CHARACTER SET utf8 COLLATE utf8_general_ci"
20 |
21 | bundle exec etl etl/process_all.ebf
22 | ```
23 |
24 | ## Tests
25 |
26 | There are a couple of specs to show how to test your dimension builders. Run with:
27 |
28 | `rake spec`
29 |
30 | ## Contributors
31 |
32 | * Thibaut Barrère
33 | * Alisson Cavalcante Agiani
34 |
35 | ## Contributing
36 |
37 | Pull requests are most welcome! Get in touch before working on anything significant, though.
38 |
39 | ## License
40 |
41 | MIT
--------------------------------------------------------------------------------
/simple/Gemfile.lock:
--------------------------------------------------------------------------------
1 | GEM
2 | remote: http://rubygems.org/
3 | specs:
4 | activemodel (3.2.6)
5 | activesupport (= 3.2.6)
6 | builder (~> 3.0.0)
7 | activerecord (3.2.6)
8 | activemodel (= 3.2.6)
9 | activesupport (= 3.2.6)
10 | arel (~> 3.0.2)
11 | tzinfo (~> 0.3.29)
12 | activesupport (3.2.6)
13 | i18n (~> 0.6)
14 | multi_json (~> 1.0)
15 | activewarehouse-etl (1.0.0)
16 | activerecord (>= 3.0.0)
17 | activesupport (>= 3.0.0)
18 | adapter_extensions (>= 0.9.5.rc1)
19 | fastercsv (>= 1.2.0)
20 | rake (>= 0.8.3)
21 | adapter_extensions (1.0.0)
22 | activerecord (>= 3.0.0)
23 | activesupport (>= 3.0.0)
24 | rake (>= 0.8.3)
25 | arel (3.0.2)
26 | awesome_print (1.0.2)
27 | builder (3.0.0)
28 | diff-lcs (1.1.3)
29 | fastercsv (1.5.5)
30 | i18n (0.6.0)
31 | multi_json (1.3.6)
32 | mysql2 (0.3.11)
33 | rake (0.9.2.2)
34 | rspec (2.10.0)
35 | rspec-core (~> 2.10.0)
36 | rspec-expectations (~> 2.10.0)
37 | rspec-mocks (~> 2.10.0)
38 | rspec-core (2.10.1)
39 | rspec-expectations (2.10.0)
40 | diff-lcs (~> 1.1.3)
41 | rspec-mocks (2.10.1)
42 | tzinfo (0.3.33)
43 |
44 | PLATFORMS
45 | ruby
46 |
47 | DEPENDENCIES
48 | activewarehouse-etl (= 1.0.0)
49 | awesome_print
50 | mysql2
51 | rspec
52 |
--------------------------------------------------------------------------------
/advanced/etl/prepare_time_dimension.ctl:
--------------------------------------------------------------------------------
1 | require File.expand_path(File.dirname(__FILE__) + '/common')
2 |
3 | # you will notice this file is very, very similar to prepare_date_dimension -
4 | # so similar that you could extract a shared Ruby method into common.rb (or another
5 | # file) to DRY things up - it definitely works (a sketch follows this file).
6 |
7 | table = TimeDimension.table_name
8 | bulk_load_file = "#{table}.txt"
9 |
10 | pre_process :truncate, :target => :datawarehouse, :table => table
11 |
12 | records = TimeDimensionBuilder.new("0:00", "23:59").build
13 |
14 | source :in, {
15 | :type => :enumerable,
16 | :enumerable => records,
17 | :store_locally => false
18 | }
19 |
20 | # pick the first record to extract the column names
21 | columns = records.first.keys
22 |
23 |
24 | # write the records to a raw file prior to bulk loading (the table was just truncated)
25 | destination :out, { :file => bulk_load_file }, { :order => columns }
26 |
27 | # then bulk-load the resulting file to the database
28 | post_process :bulk_import, {
29 | :file => bulk_load_file,
30 | :columns => columns,
31 | :target => :datawarehouse, :table => table
32 | }
33 |
34 | after_post_process_screen(:fatal) do
35 | assert_equal 60*24, TimeDimension.count
36 |
37 | # ensure we keep constant ids despite the truncating
38 | assert_equal '00:00', TimeDimension.find(1).sql_time_stamp.strftime("%H:%M")
39 | assert_equal '23:59', TimeDimension.find(60*24).sql_time_stamp.strftime("%H:%M")
40 | end
--------------------------------------------------------------------------------
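
The comment at the top of this control file suggests DRY-ing the date/time preparation. A minimal sketch of a helper that could live in `etl/common.rb` (hypothetical method name; the DSL calls themselves stay in each `.ctl` file and only fetch the shared wiring from the helper):

```ruby
# hypothetical helper gathering what prepare_date_dimension.ctl and
# prepare_time_dimension.ctl have in common
def dimension_load_plan(model, records)
  table = model.table_name
  {
    :table          => table,
    :records        => records,
    :columns        => records.first.keys,
    :bulk_load_file => "#{table}.txt"
  }
end

# usage in prepare_time_dimension.ctl, for example:
#   plan = dimension_load_plan(TimeDimension, TimeDimensionBuilder.new("0:00", "23:59").build)
#   pre_process :truncate, :target => :datawarehouse, :table => plan[:table]
#   source :in, { :type => :enumerable, :enumerable => plan[:records], :store_locally => false }
#   destination :out, { :file => plan[:bulk_load_file] }, { :order => plan[:columns] }
#   post_process :bulk_import, { :file => plan[:bulk_load_file], :columns => plan[:columns],
#                                :target => :datawarehouse, :table => plan[:table] }
```
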
/advanced/Gemfile.lock:
--------------------------------------------------------------------------------
1 | GIT
2 | remote: git://github.com/activewarehouse/mysql2.git
3 | revision: 67a5e63804648c4fb98a0046eb31c85509021fe6
4 | specs:
5 | mysql2 (0.3.11)
6 |
7 | GEM
8 | remote: https://rubygems.org/
9 | specs:
10 | activemodel (3.2.6)
11 | activesupport (= 3.2.6)
12 | builder (~> 3.0.0)
13 | activerecord (3.2.6)
14 | activemodel (= 3.2.6)
15 | activesupport (= 3.2.6)
16 | arel (~> 3.0.2)
17 | tzinfo (~> 0.3.29)
18 | activesupport (3.2.6)
19 | i18n (~> 0.6)
20 | multi_json (~> 1.0)
21 | activewarehouse-etl (1.0.0)
22 | activerecord (>= 3.0.0)
23 | activesupport (>= 3.0.0)
24 | adapter_extensions (>= 0.9.5.rc1)
25 | fastercsv (>= 1.2.0)
26 | rake (>= 0.8.3)
27 | adapter_extensions (1.0.0)
28 | activerecord (>= 3.0.0)
29 | activesupport (>= 3.0.0)
30 | rake (>= 0.8.3)
31 | arel (3.0.2)
32 | awesome_print (1.0.2)
33 | builder (3.0.0)
34 | diff-lcs (1.1.3)
35 | fastercsv (1.5.5)
36 | i18n (0.6.0)
37 | multi_json (1.3.6)
38 | rake (0.9.2.2)
39 | rspec (2.8.0)
40 | rspec-core (~> 2.8.0)
41 | rspec-expectations (~> 2.8.0)
42 | rspec-mocks (~> 2.8.0)
43 | rspec-core (2.8.0)
44 | rspec-expectations (2.8.0)
45 | diff-lcs (~> 1.1.2)
46 | rspec-mocks (2.8.0)
47 | tzinfo (0.3.33)
48 |
49 | PLATFORMS
50 | ruby
51 |
52 | DEPENDENCIES
53 | activewarehouse-etl (= 1.0.0)
54 | adapter_extensions (= 1.0.0)
55 | awesome_print
56 | mysql2!
57 | rspec
58 |
--------------------------------------------------------------------------------
/advanced/lib/date_dimension_builder.rb:
--------------------------------------------------------------------------------
1 | class DateDimensionBuilder
2 | # Specify the start date for the first record
3 | attr_accessor :start_date
4 |
5 | # Specify the end date for the last record
6 | attr_accessor :end_date
7 |
8 | # Initialize the builder.
9 | #
10 | # * start_date: The start date.
11 | # * end_date: The end date.
12 | def initialize(start_date, end_date)
13 | @start_date = start_date.class == String ? Date.parse(start_date) : start_date
14 | @end_date = end_date.class == String ? Date.parse(end_date) : end_date
15 | end
16 |
17 | # Returns an array of hashes representing records in the dimension. The values for each record are
18 | # accessed by name.
19 | def build
20 | records = []
21 | date = start_date
22 | while date <= end_date
23 | record = {}
24 | record[:date] = date.strftime("%Y-%m-%d")
25 | record[:month] = Date::MONTHNAMES[date.month].downcase
26 | record[:day_of_week] = Date::DAYNAMES[date.wday].downcase
27 | record[:day_of_week_as_number] = date.wday
28 | record[:year] = date.year.to_s
29 | record[:year_and_month] = record[:year] + "-" + date.month.to_s.rjust(2,'0')
30 | record[:sql_date_stamp] = date
31 | record[:week] = "week #{date.to_date.cweek}"
32 | # compute quarter ourselves - available in Time but not in Date - anything better ?
33 | quarter = 1 + (date.month-1) / 3
34 | record[:quarter] = "Q#{quarter}"
35 | record[:semester] = "S#{(quarter+1)/2}"
36 | records << record
37 | date = date.next
38 | end
39 | records
40 | end
41 | end
42 |
--------------------------------------------------------------------------------
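
For reference, a single generated record looks like this; the values follow directly from the `build` method above (a sketch, not additional behaviour):

```ruby
# one record as produced by DateDimensionBuilder#build
record = DateDimensionBuilder.new('2012-02-29', '2012-02-29').build.first

record[:date]        # => "2012-02-29"
record[:month]       # => "february"
record[:day_of_week] # => "wednesday"
record[:week]        # => "week 9"
record[:quarter]     # => "Q1"
record[:semester]    # => "S1"
```
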
/advanced/spec/time_dimension_builder_spec.rb:
--------------------------------------------------------------------------------
1 | require File.dirname(__FILE__) + '/spec_helper'
2 | require 'time_dimension_builder'
3 |
4 | describe TimeDimensionBuilder do
5 |
6 | def times(from, to, key=nil)
7 | result = TimeDimensionBuilder.new(from, to).build
8 | result.map! { |e| e[key] } unless key.nil?
9 | result
10 | end
11 |
12 | def time(time, key=nil)
13 | times(time, time, key).first
14 | end
15 |
16 | it "should return exactly one record for each minute of the 24 hours day" do
17 | times('0:00', '23:59', :sql_time_stamp).size.should eql(60*24)
18 | end
19 |
20 | it "should fill the sql_time_stamp" do
21 | time('21:00', :sql_time_stamp).should eql('21:00')
22 | end
23 |
24 | it "should fill the hour as integer" do
25 | times('21:00', '21:59', :hour).uniq.should eql([21])
26 | end
27 |
28 | it "should fill the hour description" do
29 | times('17:00', '17:59', :hour_description).uniq.should eql(["between 05:00 pm and 05:59 pm"])
30 | end
31 |
32 | it "should fill the half hour description" do
33 | times('20:00', '20:29', :half_hour_description).uniq.should eql(["between 08:00 pm and 08:29 pm"])
34 | times('20:30', '20:59', :half_hour_description).uniq.should eql(["between 08:30 pm and 08:59 pm"])
35 | times('7:00', '7:29', :half_hour_description).uniq.should eql(["between 07:00 am and 07:29 am"])
36 | end
37 |
38 | it "should fill the day part" do
39 | times('0:00', '7:59', :day_part).uniq.should eql(["night"])
40 | times('8:00', '11:59', :day_part).uniq.should eql(["morning"])
41 | times('12:00', '18:59', :day_part).uniq.should eql(["afternoon"])
42 | times('19:00', '21:59', :day_part).uniq.should eql(["evening"])
43 | times('22:00', '23:59', :day_part).uniq.should eql(["night"])
44 | end
45 |
46 | it "should states weither the hour is a closed or opened hour" do
47 | times('0:00', '8:59', :hour_type).uniq.should eql(["non opening hours"])
48 | times('9:00', '16:59', :hour_type).uniq.should eql(["opening hours"])
49 | times('17:00', '23:59', :hour_type).uniq.should eql(["non opening hours"])
50 | end
51 |
52 | end
--------------------------------------------------------------------------------
/advanced/etl/prepare_user_dimension.ctl:
--------------------------------------------------------------------------------
1 | require File.expand_path(File.dirname(__FILE__) + '/common')
2 |
3 | table = UserDimension.table_name
4 |
5 | # One can enable debug logging with:
6 | # ETL::Engine.logger = Logger.new(STDOUT)
7 | # ETL::Engine.logger.level = Logger::DEBUG
8 | #
9 | # And display rows in a processor using:
10 | # after_read do |row| ap row; row; end
11 |
12 | # first source to create an 'unknown' user
13 | source :unknown_user, :type => :enumerable,
14 | :enumerable => [{:author_name => UNKNOWN_USER_NAME}]
15 |
16 | # read the users csv file (options are passed to CSV/FasterCSV)
17 | source :git_users,
18 | :file => File.expand_path(File.join(DATA_FOLDER, 'git-commits.csv')),
19 | :skip_lines => 1, :parser => :csv
20 |
21 | # in-RAM uniqueness check - duplicate rows will be removed from the pipeline
22 | after_read :check_unique, :keys => [:author_name]
23 |
24 | # use the email as name in case no name is provided
25 | transform(:name) do |key, value, row|
26 | row[:author_name].blank? ? row[:author_email] : row[:author_name]
27 | end
28 |
29 | # remove rows that are already in the destination database
30 | before_write :check_exist, :target => :datawarehouse, :table => table, :columns => [:name]
31 |
32 | # here we'll just define a constant to be reused down there
33 | bulk_load_file = File.expand_path(File.join(DATA_FOLDER, 'new_git_users.txt'))
34 |
35 | # write only the new records to a raw file prior to bulk loading
36 | destination :out, { :file => bulk_load_file }, { :order => [:name] }
37 |
38 | # before the post-process, we have an opportunity to check the data
39 | screen(:fatal) {
40 | IO.foreach(bulk_load_file) do |line|
41 | assert line.strip.size > 0, "Empty line detected in #{bulk_load_file}! This isn't expected."
42 | end
43 | }
44 |
45 | # then bulk-load the resulting file to the database
46 | post_process :bulk_import, {
47 | :file => bulk_load_file,
48 | :columns => [:name],
49 | :target => :datawarehouse, :table => table
50 | }
51 |
52 | # after post-processes, we have another opportunity to check the data
53 | after_post_process_screen(:fatal) {
54 | assert_equal 1, UserDimension.where(:name => 'Yehuda Katz').count, "More than 1 user named Yehuda Katz"
55 | assert_equal 0, UserDimension.where(:name => '').count, "No user should have an empty name"
56 | assert_equal 1, UserDimension.where(:name => 'José Valim').count, "José Valim not found"
57 | assert_equal 1, UserDimension.where(:name => 'Unknown user').count, "Unknown user not found"
58 | }
59 |
--------------------------------------------------------------------------------
/advanced/spec/date_dimension_builder_spec.rb:
--------------------------------------------------------------------------------
1 | require File.dirname(__FILE__) + '/spec_helper'
2 | require 'date_dimension_builder'
3 |
4 | describe DateDimensionBuilder do
5 |
6 | def days(from, to, key=nil)
7 | result = DateDimensionBuilder.new(from, to).build
8 | result.map! { |e| e[key] } unless key.nil?
9 | result
10 | end
11 |
12 | def day(from, key=nil)
13 | days(from, from, key).first
14 | end
15 |
16 | it "should return the expected number of days" do
17 | days('2007-01-01', '2007-12-31').size.should eql(365)
18 | end
19 |
20 | it "should return :date formatted as yyyy-mm-dd" do
21 | day('2007-12-31', :date).should eql('2007-12-31')
22 | end
23 |
24 | it "should return :month in english" do
25 | (1..12).map { |month| day("2007-#{month}-01", :month) }.should eql(
26 | %w(january february march april may june july august september october november december)
27 | )
28 | end
29 |
30 | it "should return :year on four digits" do
31 | days('1959-1-1', '1959-12-31', :year).uniq.should eql(%w(1959))
32 | end
33 |
34 | it "should return :year_and_month in a sortable fashion" do
35 | day('2008-02-03', :year_and_month).should eql('2008-02')
36 | day('2008-12-03', :year_and_month).should eql('2008-12')
37 | end
38 |
39 | it "should return all the days of week in english" do
40 | days('2007-12-01', '2007-12-07', :day_of_week).should eql(%w(saturday sunday monday tuesday wednesday thursday friday))
41 | end
42 |
43 | it "should return all the days of week as numbers starting by sunday as 0" do
44 | days('2007-12-01', '2007-12-07', :day_of_week_as_number).should eql([6, 0, 1, 2, 3, 4, 5])
45 | end
46 |
47 | it "should return a sql_date_stamp" do
48 | day('1991-1-1', :sql_date_stamp).should eql(Date.parse("1991-1-1"))
49 | end
50 |
51 | it "should return :quarter" do
52 | day('2007-12-01', :quarter).should eql('Q4')
53 | day('2007-06-30', :quarter).should eql('Q2')
54 | end
55 |
56 | it "should implement quarter properly - it's home baked after all" do
57 | (1..12).map { |month| day("2007-#{month}-01", :quarter) }.should eql(%w(Q1 Q1 Q1 Q2 Q2 Q2 Q3 Q3 Q3 Q4 Q4 Q4))
58 | end
59 |
60 | it "should return :semester" do
61 | day('2007-7-1', :semester).should eql('S2')
62 | day('2007-06-30', :semester).should eql('S1')
63 | day('2007-12-31', :semester).should eql('S2')
64 | end
65 |
66 | it "should return :week and respect ISO 8601" do
67 | # http://fr.wikipedia.org/wiki/ISO_8601
68 | day('2009-12-31', :week).should eql('week 53')
69 | day('2010-1-1', :week).should eql('week 53')
70 | day('2010-1-4', :week).should eql('week 1')
71 | end
72 |
73 | end
--------------------------------------------------------------------------------
/advanced/lib/time_dimension_builder.rb:
--------------------------------------------------------------------------------
1 | require 'time'
2 |
3 | class TimeDimensionBuilder
4 | # Specify the start time for the first record
5 | attr_accessor :start_time
6 |
7 | # Specify the end time for the last record
8 | attr_accessor :end_time
9 |
10 | # Avoid generating bad data on daylight saving changes. On a day when the clock changes (e.g. 2009/03/29) you'd get this:
11 | # >> Time.parse('1:59')+60
12 | # => Sun Mar 29 03:00:00 0200 2009 => 3:00 is just after 1:59
13 | # Instead, prefix the time with a day that has no daylight change (2009/03/01)
14 | # >> Time.parse('2009/03/01 1:59')+60
15 | # => Sun Mar 01 02:00:00 0100 2009 => 2:00 is just after 1:59
16 | DAY_WITHOUT_DAYLIGHT_CHANGE = "2009/03/01 "
17 |
18 | # Initialize the builder.
19 | #
20 | # * start_time: The start time.
21 | # * end_time: The end time.
22 | def initialize(start_time, end_time)
23 | @start_time = start_time.class == String ? Time.parse(DAY_WITHOUT_DAYLIGHT_CHANGE + start_time) : start_time
24 | @end_time = end_time.class == String ? Time.parse(DAY_WITHOUT_DAYLIGHT_CHANGE + end_time) : end_time
25 | end
26 |
27 | # Returns an array of hashes representing records in the dimension. The values for each record are
28 | # accessed by name.
29 | def build(options={})
30 | records = []
31 | time = start_time
32 | while time <= end_time
33 | record = {}
34 | record[:sql_time_stamp] = time.strftime('%H:%M')
35 | record[:hour] = time.hour
36 |
37 | hour_format = "%I:%M %P"
38 |
39 | # full hour description
40 | full_hour_start = time.to_a
41 | full_hour_start[1] = 0 # set minutes to 0
42 | full_hour_start = Time.local(*full_hour_start)
43 | record[:hour_description] = "between #{full_hour_start.strftime(hour_format)} and #{(full_hour_start+59*60).strftime(hour_format)}"
44 |
45 | # half hour computation
46 | half_hour_start = time.to_a
47 | half_hour_start[1] = 30*(half_hour_start[1] / 30) # round to 0 or 30 minutes
48 | half_hour_start = Time.local(*half_hour_start)
49 | half_hour_end = half_hour_start + 29*60 # grab the next half by adding 30 minutes
50 | half_hour_start = half_hour_start.strftime(hour_format)
51 | half_hour_end = half_hour_end.strftime(hour_format)
52 | record[:half_hour_description] = "between #{half_hour_start} and #{half_hour_end}"
53 |
54 | record[:hour_type] = case time.hour
55 | when 9..16; "opening hours"
56 | else "non opening hours"
57 | end
58 |
59 | record[:day_part] = case time.hour
60 | when 8..11; "morning"
61 | when 12..18; "afternoon"
62 | when 19..21; "evening"
63 | else "night"
64 | end
65 |
66 | records << record
67 | time = time + 60
68 | end
69 | records
70 | end
71 |
72 | end
73 |
--------------------------------------------------------------------------------
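
As with the date builder, here is what a single generated record looks like; the values are read off the `build` method above (a sketch):

```ruby
# one record as produced by TimeDimensionBuilder#build
record = TimeDimensionBuilder.new('17:30', '17:30').build.first

record[:sql_time_stamp]        # => "17:30"
record[:hour]                  # => 17
record[:hour_description]      # => "between 05:00 pm and 05:59 pm"
record[:half_hour_description] # => "between 05:30 pm and 05:59 pm"
record[:day_part]              # => "afternoon"
record[:hour_type]             # => "non opening hours"
```
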
/advanced/etl/extract.ctl:
--------------------------------------------------------------------------------
1 | require File.expand_path(File.dirname(__FILE__) + '/common')
2 |
3 | pre_process do
4 | if File.exist?(GIT_RAILS_REPO)
5 | system! "cd #{GIT_RAILS_REPO} && git pull"
6 | else
7 | system! "git clone https://github.com/rails/rails #{GIT_RAILS_REPO}"
8 | end
9 | end
10 |
11 | def to_git_char(char)
12 | "%x" + char.ord.to_s(16).rjust(2, "0")
13 | end
14 |
15 | git_commits_file = File.expand_path(File.join(DATA_FOLDER, 'git-commits.csv'))
16 |
17 | pre_process do
18 | commit_fields = {
19 | 'H' => :commit_hash,
20 | 'an' => :author_name,
21 | 'ae' => :author_email,
22 | 'ai' => :author_date
23 | }.to_a
24 |
25 | col_sep = "\t"
26 |
27 | git_fields = commit_fields.map(&:first).map { |e| "%#{e}" }
28 | git_fields = git_fields.join(to_git_char(col_sep))
29 | csv_fields = commit_fields.map(&:last) + [:files_changed, :insertions, :deletions]
30 |
31 | CSV.open(git_commits_file, 'w') do |output|
32 | output << csv_fields
33 |
34 | cmd = "cd #{GIT_RAILS_REPO} && git log --shortstat --reverse --pretty=format:\"#{git_fields}\""
35 |
36 | buffer = []
37 |
38 | IO.popen(cmd).each_line do |line|
39 | case line
40 | when /^[0-9a-f]{40}/;
41 | buffer << 0 << 0 << 0 unless buffer.size == csv_fields.size || buffer.empty?
42 | output << buffer unless buffer.empty?
43 | buffer = line.strip.split(col_sep)
44 | when /(\d+) files? changed, (\d+) insertions?\(\+\), (\d+) deletions?\(\-\)/;
45 | buffer << $1 << $2 << $3
46 | when /(\d+) files? changed, (\d+) insertions?\(\+\)/;
47 | buffer << $1 << $2 << '0'
48 | when /(\d+) files? changed, (\d+) deletions?\(\-\)/;
49 | buffer << $1 << '0' << $2
50 | when /(\d+) files? changed/;
51 | buffer << $1 << '0' << '0'
52 | when "\n";
53 | else raise "Failed to parse line #{line.inspect}"
54 | end
55 | end
56 |
57 | buffer << 0 << 0 << 0 unless buffer.size == csv_fields.size || buffer.empty?
58 | output << buffer unless buffer.empty?
59 | end
60 |
61 | end
62 |
63 | screen(:fatal) do
64 | assert_equal %w(commit_hash author_name author_email author_date files_changed insertions deletions),
65 | CSV.parse_line(IO.popen("head -1 #{git_commits_file}").read)
66 |
67 | assert_equal [
68 | "20d7d2415f99620590aec07cedcaace34cced1c6",
69 | "Xavier Noria",
70 | "fxn@hashref.com",
71 | "2011-06-18 10:14:32 +0200",
72 | "2",
73 | "3",
74 | "3"
75 | ], CSV.parse_line(IO.popen("grep 20d7d2415f99620590aec07cedcaace34cced1c6 #{git_commits_file}").read)
76 |
77 | # when no stat is provided, 0/0/0 should still be stored
78 | row_with_no_file_changes = CSV.parse_line(IO.popen("grep b3f45195aa8a35277c3f998917312797936a1f4e #{git_commits_file}").read)
79 |
80 | assert_equal [
81 | "b3f45195aa8a35277c3f998917312797936a1f4e",
82 | "0",
83 | "0",
84 | "0"
85 | ], [
86 | row_with_no_file_changes[0],
87 | row_with_no_file_changes[-1],
88 | row_with_no_file_changes[-2],
89 | row_with_no_file_changes[-3],
90 | ]
91 | end
92 |
--------------------------------------------------------------------------------
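
To make the pre_process above easier to follow, here is what the helper and the assembled pretty-format string expand to (worked out from the code above, not additional behaviour):

```ruby
to_git_char("\t")  # => "%x09" -- a tab encoded as a git pretty-format hex escape

# so git_fields becomes "%H%x09%an%x09%ae%x09%ai", and the git log call emits one
# tab-separated line per commit (hash, author name, author email, author date),
# which the parsing loop then augments with files_changed / insertions / deletions
```
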
/advanced/etl/import_new_commits.ctl:
--------------------------------------------------------------------------------
1 | require File.expand_path(File.dirname(__FILE__) + '/common')
2 |
3 | table = Commit.table_name
4 | source_fields = [:commit_hash, :author_name, :author_date, :files_changed, :insertions, :deletions]
5 | target_fields = [:sha1, :user_id, :date_id, :time_id, :files_changed, :insertions, :deletions]
6 |
7 | source :git_commits,
8 | :file => File.expand_path(File.join(DATA_FOLDER, 'git-commits.csv')),
9 | :skip_lines => 1, :parser => :csv
10 |
11 | # Ensure the fields we rely on are here on each row
12 | after_read :ensure_fields_presence, { :fields => source_fields }
13 |
14 | rename :commit_hash, :sha1
15 |
16 | # in-RAM uniqueness check - duplicate rows will be removed from the pipeline
17 | after_read :check_unique, :keys => [:sha1]
18 |
19 | # rename is an after_read with some sugar
20 | rename :author_name, :user_id
21 |
22 | copy :author_date, :date_id
23 | copy :author_date, :time_id
24 |
25 | transform(:date_id, :string_to_date)
26 |
27 | # look-up
28 | transform :user_id, :foreign_key_lookup, {
29 | :resolver => ActiveRecordResolver.new(UserDimension, :find_by_name),
30 | :default => UserDimension.find_by_name(UNKNOWN_USER_NAME).id
31 | # TODO - investigate to understand why the SQLResolver cannot find names
32 | # with accents (probably some SET NAMES option)
33 | }
34 |
35 | transform :date_id, :foreign_key_lookup,
36 | :resolver => ActiveRecordResolver.new(DateDimension, :find_by_sql_date_stamp)
37 |
38 | transform(:time_id) do |n,v,r|
39 | # only keep the HH:MM part before proceeding to look-up
40 | v[11..15]
41 | end
42 |
43 | transform :time_id, :foreign_key_lookup,
44 | :resolver => ActiveRecordResolver.new(TimeDimension, :find_by_sql_time_stamp)
45 |
46 | # here we'll just define a constant to be reused down there
47 | bulk_load_file = File.expand_path(File.join(DATA_FOLDER, 'new_git_commits.txt'))
48 |
49 | # remove rows that are already in the destination database
50 | before_write :check_exist, :target => :datawarehouse, :table => table, :columns => [:sha1]
51 |
52 | # write only the new records to a raw file prior to bulk loading
53 | destination :out, { :file => bulk_load_file }, { :order => target_fields }
54 |
55 | # then bulk-load the resulting file to the database
56 | post_process :bulk_import, {
57 | :file => bulk_load_file,
58 | :columns => target_fields,
59 | :target => :datawarehouse, :table => table
60 | }
61 |
62 | after_post_process_screen(:fatal) {
63 | commit = Commit.where(:sha1 => '7e56bf724479ce92eff2f806573f382957f3a2b4').first
64 | assert_not_nil commit, "missing expected commit 7e56bf72"
65 | assert_equal "Xavier Noria", commit.user.name
66 |
67 | # keep the author's local date and time for the moment (we may store the timezone in a dimension later)
68 | # so the author_date of 2011-06-24 23:27:40 +0200 ends up stored as:
69 | assert_equal "2011-06-24", commit.date.sql_date_stamp.to_s
70 | # rounded to the minute in our case
71 | assert_equal "23:27:00", commit.time.sql_time_stamp.strftime('%H:%M:%S')
72 |
73 | assert_equal 1, commit.files_changed
74 | assert_equal 2, commit.insertions
75 | assert_equal 0, commit.deletions
76 | }
77 |
--------------------------------------------------------------------------------
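
For the foreign-key lookups in this control file, this is roughly what happens to a single row (illustrative only; the `row` hash below is hypothetical and the real work is done by activewarehouse-etl's resolver classes):

```ruby
# what the :foreign_key_lookup transforms boil down to for one row (sketch)
row = { :user_id => 'Xavier Noria', :date_id => Date.parse('2011-06-24'), :time_id => '23:27' }

user = UserDimension.find_by_name(row[:user_id])
row[:user_id] = user ? user.id : UserDimension.find_by_name(UNKNOWN_USER_NAME).id

row[:date_id] = DateDimension.find_by_sql_date_stamp(row[:date_id]).id
row[:time_id] = TimeDimension.find_by_sql_time_stamp(row[:time_id]).id
```
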