├── .gitignore
├── .standalone_migrations
├── .travis.yml
├── 0.9-UPGRADE
├── CHANGELOG
├── Gemfile
├── Guardfile
├── HOW_TO_RELEASE
├── LICENSE
├── README.textile
├── Rakefile
├── TODO
├── activewarehouse-etl.gemspec
├── bin
├── etl
└── etl.cmd
├── db
├── migrate
│ └── 20120229203554_create_tables.rb
└── schema.rb
├── examples
└── database.example.yml
├── lib
├── etl.rb
└── etl
│ ├── batch.rb
│ ├── batch
│ ├── batch.rb
│ └── directives.rb
│ ├── builder.rb
│ ├── builder
│ ├── date_dimension_builder.rb
│ └── time_dimension_builder.rb
│ ├── commands
│ └── etl.rb
│ ├── control.rb
│ ├── control
│ ├── control.rb
│ ├── destination.rb
│ ├── destination
│ │ ├── csv_destination.rb
│ │ ├── database_destination.rb
│ │ ├── excel_destination.rb
│ │ ├── file_destination.rb
│ │ ├── insert_update_database_destination.rb
│ │ ├── update_database_destination.rb
│ │ └── yaml_destination.rb
│ ├── source.rb
│ └── source
│ │ ├── database_source.rb
│ │ ├── enumerable_source.rb
│ │ ├── file_source.rb
│ │ ├── model_source.rb
│ │ └── mysql_streamer.rb
│ ├── core_ext.rb
│ ├── core_ext
│ ├── time.rb
│ └── time
│ │ └── calculations.rb
│ ├── engine.rb
│ ├── execution.rb
│ ├── execution
│ ├── base.rb
│ ├── batch.rb
│ ├── job.rb
│ └── migration.rb
│ ├── generator.rb
│ ├── generator
│ ├── generator.rb
│ └── surrogate_key_generator.rb
│ ├── http_tools.rb
│ ├── parser.rb
│ ├── parser
│ ├── apache_combined_log_parser.rb
│ ├── csv_parser.rb
│ ├── excel_parser.rb
│ ├── fixed_width_parser.rb
│ ├── nokogiri_xml_parser.rb
│ ├── parser.rb
│ ├── sax_parser.rb
│ └── xml_parser.rb
│ ├── processor.rb
│ ├── processor
│ ├── block_processor.rb
│ ├── bulk_import_processor.rb
│ ├── check_exist_processor.rb
│ ├── check_unique_processor.rb
│ ├── copy_field_processor.rb
│ ├── database_join_processor.rb
│ ├── encode_processor.rb
│ ├── ensure_fields_presence_processor.rb
│ ├── escape_csv_processor.rb
│ ├── filter_row_processor.rb
│ ├── ftp_downloader_processor.rb
│ ├── ftp_uploader_processor.rb
│ ├── hierarchy_exploder_processor.rb
│ ├── imapattachment_downloader_processor.rb
│ ├── pop3attachment_downloader_processor.rb
│ ├── print_row_processor.rb
│ ├── processor.rb
│ ├── rename_processor.rb
│ ├── require_non_blank_processor.rb
│ ├── row_processor.rb
│ ├── sequence_processor.rb
│ ├── sftp_downloader_processor.rb
│ ├── sftp_uploader_processor.rb
│ ├── surrogate_key_processor.rb
│ ├── truncate_processor.rb
│ └── zip_file_processor.rb
│ ├── row.rb
│ ├── screen.rb
│ ├── screen
│ └── row_count_screen.rb
│ ├── transform.rb
│ ├── transform
│ ├── block_transform.rb
│ ├── calculation_transform.rb
│ ├── date_to_string_transform.rb
│ ├── decode_transform.rb
│ ├── default_transform.rb
│ ├── foreign_key_lookup_transform.rb
│ ├── hierarchy_lookup_transform.rb
│ ├── md5_transform.rb
│ ├── ordinalize_transform.rb
│ ├── sha1_transform.rb
│ ├── split_fields_transform.rb
│ ├── string_to_date_time_transform.rb
│ ├── string_to_date_transform.rb
│ ├── string_to_time_transform.rb
│ ├── transform.rb
│ ├── trim_transform.rb
│ └── type_transform.rb
│ ├── util.rb
│ └── version.rb
└── test
├── .gitignore
├── .ignore
├── all.ebf
├── apache_combined_log.ctl
├── batch_test.rb
├── batch_with_error.ebf
├── batched1.ctl
├── batched2.ctl
├── block_processor.ctl
├── block_processor_error.ctl
├── block_processor_pre_post_process.ctl
├── block_processor_remove_rows.ctl
├── block_processor_test.rb
├── check_exist_processor_test.rb
├── check_unique_processor_test.rb
├── config
├── .gitignore
├── database.yml
└── gemfiles
│ ├── Gemfile.rails-3.0.x
│ ├── Gemfile.rails-3.1.x
│ ├── Gemfile.rails-3.2.x
│ ├── Gemfile.rails-4.0.x
│ └── common.rb
├── control_test.rb
├── data
├── apache_combined_log.txt
├── bulk_import.txt
├── bulk_import_with_empties.txt
├── decode.txt
├── delimited.txt
├── encode_source_latin1.txt
├── excel.xls
├── excel2.xls
├── fixed_width.txt
├── multiple_delimited_1.txt
├── multiple_delimited_2.txt
├── nokogiri.xml
├── people.txt
├── sax.xml
└── xml.xml
├── database_join_processor_test.rb
├── date_dimension_builder_test.rb
├── delimited.ctl
├── delimited_absolute.ctl
├── delimited_destination_db.ctl
├── delimited_excel.ctl
├── delimited_insert_update.ctl
├── delimited_update.ctl
├── delimited_with_bulk_load.ctl
├── destination_test.rb
├── directive_test.rb
├── encode_processor_test.rb
├── engine_test.rb
├── ensure_fields_presence_processor_test.rb
├── errors.ctl
├── etl_test.rb
├── excel.ctl
├── excel2.ctl
├── fixed_width.ctl
├── foreign_key_lookup_transform_test.rb
├── generator_test.rb
├── inline_parser.ctl
├── mocks
├── mock_destination.rb
└── mock_source.rb
├── model_source.ctl
├── multiple_delimited.ctl
├── multiple_source_delimited.ctl
├── nokogiri_all.ctl
├── nokogiri_select.ctl
├── nokogiri_test.rb
├── output
└── .ignore
├── parser_test.rb
├── performance
└── delimited.ctl
├── processor_test.rb
├── row_processor_test.rb
├── sax.ctl
├── scd
├── 1.txt
├── 2.txt
└── 3.txt
├── scd_test.rb
├── scd_test_type_1.ctl
├── scd_test_type_2.ctl
├── screen_test.rb
├── screen_test_error.ctl
├── screen_test_fatal.ctl
├── source_test.rb
├── test_helper.rb
├── transform_test.rb
├── truncate_processor_test.rb
└── xml.ctl
/.gitignore:
--------------------------------------------------------------------------------
1 | pkg/*
2 | source_data
3 | test/output/*
4 | rdoc
5 | .rvmrc
6 | .bundle
7 | *.gem
8 | *.lock
--------------------------------------------------------------------------------
/.standalone_migrations:
--------------------------------------------------------------------------------
1 | config:
2 | database: test/config/database.yml
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: ruby
2 | gemfile:
3 | - test/config/gemfiles/Gemfile.rails-4.0.x
4 | - test/config/gemfiles/Gemfile.rails-3.2.x
5 | - test/config/gemfiles/Gemfile.rails-3.1.x
6 | - test/config/gemfiles/Gemfile.rails-3.0.x
7 | rvm:
8 | - 1.9.3
9 | - 1.8.7
10 | env:
11 | - DB=mysql2
12 | - DB=postgresql
13 | before_script:
14 | - bundle exec rake db:create RAILS_ENV=$DB
15 | - bundle exec rake db:create RAILS_ENV=etl_execution
16 | - bundle exec rake db:schema:load RAILS_ENV=$DB
17 |
18 | branches:
19 | only:
20 | - master
21 |
--------------------------------------------------------------------------------
/0.9-UPGRADE:
--------------------------------------------------------------------------------
1 | The 0.9 revision of ActiveWarehouse ETL significantly changes how connections are maintained. This release is not backwards compatible.
2 |
3 | To upgrade, you must do the following:
4 |
5 | 1.) All database connections used in ETL control files must be declared in database.yml in the directory that contains your ETL control files.
6 | 2.) All sources, destinations, transforms and processors that use a database connection must include the configuration name/value pair of :target => 'name' where name is replaced with the connection name defined in database.yml. Connection information should no longer be included in control files.
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source "https://rubygems.org"
2 |
3 | # Specify your gem's dependencies in activewarehouse-etl.gemspec
4 | gemspec
5 |
--------------------------------------------------------------------------------
/Guardfile:
--------------------------------------------------------------------------------
db = ENV['DB'] || 'mysql2'
gemfile = ENV['GEMFILE'] || 'test/config/gemfiles/Gemfile.rails-3.2.x'

# Re-run the CI task for one db/gemfile combination whenever a file
# under lib/ or test/ changes.
guard 'shell' do
  # FIX: the pattern was /(lib|test)\/\.*/ — the escaped "\.*" matched a
  # (possibly empty) run of literal dots rather than "any characters".
  # It only matched by accident because the regex is unanchored.
  watch(%r{(lib|test)/.*}) { |m| `bundle exec rake ci:run_one[#{db},#{gemfile}]` }
end
--------------------------------------------------------------------------------
/HOW_TO_RELEASE:
--------------------------------------------------------------------------------
1 | * update lib/etl/version
2 | * push your changes
3 | * then use bundler to build + git tag + push to rubygems
4 |
5 | rake release
6 |
7 | * if you remain stuck at "Pushed git commits and tags", the task may silently wait for your password. Check this if it's the case:
8 |
9 | https://github.com/carlhuda/bundler/issues/980
10 |
11 | * you can list changes using github:
12 |
13 | https://github.com/activewarehouse/activewarehouse-etl/compare/release-0.9.1...master
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2006-2011 Anthony Eden, Thibaut Barrère
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 |
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 |
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
require 'bundler'
Bundler::GemHelper.install_tasks
require 'rake'
require 'rake/testtask'

# Run a shell command, echoing it first and raising if it fails.
def system!(cmd)
  puts cmd
  raise "Command failed!" unless system(cmd)
end

require 'tasks/standalone_migrations'

# experimental tasks to reproduce the Travis behaviour locally
namespace :ci do

  desc "For current RVM, run the tests for one db and one gemfile"
  task :run_one, :db, :gemfile do |t, args|
    Bundler.with_clean_env do
      # Defaults: mysql2 against the Rails 3.2 gemfile when no args given.
      ENV['BUNDLE_GEMFILE'] = File.expand_path(args[:gemfile] || (File.dirname(__FILE__) + '/test/config/gemfiles/Gemfile.rails-3.2.x'))
      ENV['DB'] = args[:db] || 'mysql2'
      system! "bundle install"
      system! "bundle exec rake db:create"
      system! "bundle exec rake db:create RAILS_ENV=etl_execution"
      system! "bundle exec rake db:schema:load"
      system! "bundle exec rake"
    end
  end

  desc "For current RVM, run the tests for all the combination in travis configuration"
  task :run_matrix do
    require 'cartesian'
    # Build the db x gemfile matrix straight from the Travis config so the
    # local matrix can't drift from CI.
    config = YAML.load_file('.travis.yml')
    config['env'].cartesian(config['gemfile']).each do |*x|
      env, gemfile = *x.flatten
      db = env.gsub('DB=', '')
      print [db, gemfile].inspect.ljust(40) + ": "
      cmd = "rake \"ci:run_one[#{db},#{gemfile}]\""
      # Output is discarded; only pass/fail plus a reproduction command is shown.
      result = system "#{cmd} > /dev/null 2>&1"
      result = result ? "OK" : "FAILED! - re-run with: #{cmd}"
      puts result
    end
  end

end

task :default => :test

desc 'Test the ETL application.'
Rake::TestTask.new(:test) do |t|
  t.libs << 'lib' << '.'
  t.pattern = 'test/**/*_test.rb'
  t.verbose = true
  # TODO: reset the database
end
--------------------------------------------------------------------------------
/TODO:
--------------------------------------------------------------------------------
1 | TODO
2 |
3 | * Add built-in support for audit_dimension
4 | * Do not rerun the processing if it isn't needed, i.e. the source and control files have not been modified (allow forced override)
5 | * Provide greater control in error handling
6 | ** Allow an error threshold
7 | ** Don't die completely if a parse error, just stop processing that specific file if error threshold is reached
8 | ** Allow mismatch row length error in delimited parser to be ignored
9 | * Improve error messages throughout, but especially in problems with the control files
10 | * Add support for partitioned views during the insert process. Use specifiable columns as the trigger columns for determining the data output destination.
11 | * Check if a temp table exists and the last job run was successful, in which case skip during the current run
12 | * Create models for each of the tables in each of the databases defined in ETL::Engine.connections
13 |
14 | Audit Record
15 |
16 | Process-Level
17 | * Start Time
18 | * End Time
19 | * (Duration)
20 | * Rows Read
21 | * Rows Written
22 | * Rows Rejected
23 | * Errors
24 | * Destination
25 | Record-Level
26 | * Source
27 | * Timestamp
28 | * Transformation Log
29 |
--------------------------------------------------------------------------------
/activewarehouse-etl.gemspec:
--------------------------------------------------------------------------------
1 | # -*- encoding: utf-8 -*-
2 | lib = File.expand_path('../lib/', __FILE__)
3 | $:.unshift lib unless $:.include?(lib)
4 |
5 | require 'etl/version'
6 |
7 | Gem::Specification.new do |s|
8 | s.name = %q{activewarehouse-etl}
9 | s.version = ETL::VERSION
10 | s.platform = Gem::Platform::RUBY
11 | s.authors = ["Anthony Eden", "Thibaut Barrère"]
12 | s.email = ["thibaut.barrere@gmail.com"]
13 | s.homepage = "https://github.com/activewarehouse/activewarehouse-etl"
14 | s.summary = %q{Pure Ruby ETL package.}
15 | s.description = %q{ActiveWarehouse ETL is a pure Ruby Extract-Transform-Load application for loading data into a database.}
16 |
17 | s.required_rubygems_version = ">= 1.3.6"
18 |
19 | s.add_runtime_dependency('rake', '>= 0.8.3')
20 | s.add_runtime_dependency('activesupport', '>= 3.0.0')
21 | s.add_runtime_dependency('activerecord', '>= 3.0.0')
22 | s.add_runtime_dependency('fastercsv', '>= 1.2.0')
23 | s.add_runtime_dependency('adapter_extensions', '>= 0.9.5.rc1')
24 |
25 | s.add_development_dependency('shoulda', '~>2.11.3')
26 | s.add_development_dependency('flexmock', '~>0.9.0')
27 | s.add_development_dependency('cartesian')
28 | s.add_development_dependency('guard')
29 | s.add_development_dependency('guard-shell')
30 | s.add_development_dependency('standalone_migrations', '1.0.5')
31 | s.add_development_dependency('roo')
32 |
33 | s.files = `git ls-files`.split("\n")
34 | s.test_files = `git ls-files -- {test}/*`.split("\n")
35 | s.executables = %w(etl)
36 | s.require_path = "lib"
37 | end
38 |
--------------------------------------------------------------------------------
/bin/etl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | #--
4 | # Copyright (c) 2006 Anthony Eden
5 | #
6 | # Permission is hereby granted, free of charge, to any person obtaining
7 | # a copy of this software and associated documentation files (the
8 | # "Software"), to deal in the Software without restriction, including
9 | # without limitation the rights to use, copy, modify, merge, publish,
10 | # distribute, sublicense, and/or sell copies of the Software, and to
11 | # permit persons to whom the Software is furnished to do so, subject to
12 | # the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be
15 | # included in all copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 | #++
25 |
26 | $:.unshift(File.dirname(__FILE__) + '/../lib/')
27 | require 'etl'
28 | require 'etl/commands/etl'
--------------------------------------------------------------------------------
/bin/etl.cmd:
--------------------------------------------------------------------------------
1 | @echo off
2 |
3 | rem The purpose of this Windows script is to let you use the etl command line with a non-gem version of AW-ETL (eg: unpacked gem, pistoned trunk).
4 | rem Just add the current folder on top of your PATH variable to use it instead of the etl command provided with the gem release.
5 |
6 | rem %~dp0 returns the absolute path where the current script is. We just append 'etl' to it, and forward all the arguments with %*
7 |
8 | ruby "%~dp0etl" %*
9 |
--------------------------------------------------------------------------------
/db/migrate/20120229203554_create_tables.rb:
--------------------------------------------------------------------------------
# Creates the tables exercised by the test suite: two source tables
# (people, places), one dimension destination, and a truncate target.
class CreateTables < ActiveRecord::Migration
  def self.up
    # Source table of person records (loaded from test/data fixtures).
    create_table(:people, :force => true) do |t|
      t.column :first_name, :string
      t.column :last_name, :string
      t.column :ssn, :string, :limit => 64
    end

    # Source table of place/address records.
    create_table(:places, :force => true) do |t|
      t.column :address, :text
      t.column :city, :string
      t.column :state, :string
      t.column :country, :string, :limit => 2
    end

    # Destination dimension table; effective_date/end_date/latest_version
    # support the slowly-changing-dimension (scd_test) scenarios.
    create_table(:person_dimension, :force => true) do |t|
      t.column :first_name, :string, :limit => 50
      t.column :last_name, :string, :limit => 50
      t.column :address, :string, :limit => 100
      t.column :city, :string, :limit => 50
      t.column :state, :string, :limit => 50
      t.column :zip_code, :string, :limit => 20

      t.column :effective_date, :timestamp
      t.column :end_date, :timestamp
      t.column :latest_version, :boolean
    end

    # Target table for the truncate processor tests.
    create_table(:truncate_test, :force => true) do |t|
      t.column :x, :string, :limit => 4
    end
  end

  # Test-fixture setup only; rolling back is deliberately unsupported.
  def self.down
    raise ActiveRecord::IrreversibleMigration
  end
end
38 |
--------------------------------------------------------------------------------
/db/schema.rb:
--------------------------------------------------------------------------------
1 | # encoding: UTF-8
2 | # This file is auto-generated from the current state of the database. Instead
3 | # of editing this file, please use the migrations feature of Active Record to
4 | # incrementally modify your database, and then regenerate this schema definition.
5 | #
6 | # Note that this schema.rb definition is the authoritative source for your
7 | # database schema. If you need to create the application database on another
8 | # system, you should be using db:schema:load, not running all the migrations
9 | # from scratch. The latter is a flawed and unsustainable approach (the more migrations
10 | # you'll amass, the slower it'll run and the greater likelihood for issues).
11 | #
12 | # It's strongly recommended to check this file into your version control system.
13 |
14 | ActiveRecord::Schema.define(:version => 20120229203554) do
15 |
16 | create_table "people", :force => true do |t|
17 | t.string "first_name"
18 | t.string "last_name"
19 | t.string "ssn", :limit => 64
20 | end
21 |
22 | create_table "person_dimension", :force => true do |t|
23 | t.string "first_name", :limit => 50
24 | t.string "last_name", :limit => 50
25 | t.string "address", :limit => 100
26 | t.string "city", :limit => 50
27 | t.string "state", :limit => 50
28 | t.string "zip_code", :limit => 20
29 | t.datetime "effective_date"
30 | t.datetime "end_date"
31 | t.boolean "latest_version"
32 | end
33 |
34 | create_table "places", :force => true do |t|
35 | t.text "address"
36 | t.string "city"
37 | t.string "state"
38 | t.string "country", :limit => 2
39 | end
40 |
41 | create_table "truncate_test", :force => true do |t|
42 | t.string "x", :limit => 4
43 | end
44 |
45 | end
46 |
--------------------------------------------------------------------------------
/examples/database.example.yml:
--------------------------------------------------------------------------------
1 | etl_execution:
2 | adapter: mysql2
3 | username: root
4 | host: localhost
5 | database: etl_execution
6 | encoding: utf8
7 | datawarehouse:
8 | adapter: mysql2
9 | username: root
10 | host: localhost
11 | database: datawarehouse_development
12 | operational:
13 | adapter: mysql2
14 | username: root
15 | host: localhost
16 | database: operational_production
--------------------------------------------------------------------------------
/lib/etl.rb:
--------------------------------------------------------------------------------
1 | # This source file requires all of the necessary gems and source files for ActiveWarehouse ETL. If you
2 | # load this source file all of the other required files and gems will also be brought into the
3 | # runtime.
4 |
5 | #--
6 | # Copyright (c) 2006-2007 Anthony Eden
7 | #
8 | # Permission is hereby granted, free of charge, to any person obtaining
9 | # a copy of this software and associated documentation files (the
10 | # "Software"), to deal in the Software without restriction, including
11 | # without limitation the rights to use, copy, modify, merge, publish,
12 | # distribute, sublicense, and/or sell copies of the Software, and to
13 | # permit persons to whom the Software is furnished to do so, subject to
14 | # the following conditions:
15 | #
16 | # The above copyright notice and this permission notice shall be
17 | # included in all copies or substantial portions of the Software.
18 | #
19 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
22 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
23 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
24 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
25 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 | #++
27 |
28 | require 'logger'
29 | require 'yaml'
30 | require 'erb'
31 |
32 | require 'rubygems'
33 |
34 | unless defined?(REXML::VERSION)
35 | require 'rexml/rexml'
36 | unless defined?(REXML::VERSION)
37 | REXML::VERSION = REXML::Version
38 | end
39 | end
40 |
41 | require 'active_support'
42 | require 'active_record'
43 | require 'adapter_extensions'
44 |
45 | if ActiveSupport::VERSION::STRING >= '3.2'
46 | # support for cattr_accessor
47 | require 'active_support/core_ext/class/attribute_accessors'
48 | end
49 |
50 | if RUBY_VERSION < '1.9'
51 | require 'faster_csv'
52 | CSV = FasterCSV unless defined?(CSV)
53 | else
54 | require 'csv'
55 | end
56 |
57 | # patch for https://github.com/activewarehouse/activewarehouse-etl/issues/24
58 | # allow components to require optional gems
class Object
  # Require +feature+ but treat it as optional: returns whatever
  # Kernel#require returns (true/false) when the feature loads,
  # and nil when it is missing, instead of raising LoadError.
  def optional_require(feature)
    require feature
  rescue LoadError
    # optional dependency not installed — silently skip
  end
end
67 |
68 | $:.unshift(File.dirname(__FILE__))
69 |
70 | require 'etl/core_ext'
71 | require 'etl/util'
72 | require 'etl/http_tools'
73 | require 'etl/builder'
74 | require 'etl/version'
75 | require 'etl/engine'
76 | require 'etl/control'
77 | require 'etl/batch'
78 | require 'etl/row'
79 | require 'etl/parser'
80 | require 'etl/transform'
81 | require 'etl/processor'
82 | require 'etl/generator'
83 | require 'etl/screen'
84 |
module ETL #:nodoc:
  # Root of the ETL exception hierarchy.
  ETLError = Class.new(StandardError)
  # Raised for problems with control files/objects.
  ControlError = Class.new(ETLError)
  # Raised for invalid definitions inside a control file.
  DefinitionError = Class.new(ControlError)
  # Raised for invalid configuration of a control element.
  ConfigurationError = Class.new(ControlError)
  # Raised when data does not match what was expected.
  MismatchError = Class.new(ETLError)
  # Raised when a resolver cannot resolve its argument.
  ResolverError = Class.new(ETLError)
  # Raised when a screen fails.
  ScreenError = Class.new(ETLError)
  # Raised when a fatal screen fails.
  FatalScreenError = Class.new(ScreenError)
end
--------------------------------------------------------------------------------
/lib/etl/batch.rb:
--------------------------------------------------------------------------------
1 | require 'etl/batch/batch'
2 | require 'etl/batch/directives'
--------------------------------------------------------------------------------
/lib/etl/batch/batch.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Batch
    # Evaluation context for batch (*.ebf) files. Exposes a small DSL
    # (+run+, +use_temp_tables+) whose calls are recorded as directives
    # on the underlying Batch object.
    class Context
      # The Batch object that DSL calls are recorded on
      attr_reader :batch

      # Create a binding suitable for eval'ing a batch file against
      # the given batch.
      def self.create(batch)
        Context.new(batch).get_binding
      end

      # Wrap the given Batch object.
      def initialize(batch)
        @batch = batch
      end

      # Path of the batch file being evaluated.
      def file
        batch.file
      end

      # Expose this instance's binding for eval.
      def get_binding
        binding
      end

      # DSL: schedule the given control file, resolved relative to the
      # batch file's own directory.
      def run(file)
        batch.run(File.dirname(self.file) + "/" + file)
      end

      # DSL: request temp-table usage for the batch.
      def use_temp_tables(value=true)
        batch.use_temp_tables(value)
      end

    end

    # An ordered batch of ETL directives parsed from a *.ebf file.
    class Batch
      # Path to the batch file
      attr_accessor :file
      # The engine executing this batch
      attr_accessor :engine

      class << self
        # Resolve the given object to an ETL::Batch::Batch instance.
        # Acceptable arguments are:
        # * the path to a batch file as a String
        # * a File object referencing the batch file
        # * an ETL::Batch::Batch object (which will just be returned)
        #
        # Raises a RuntimeError if any other type is given.
        def resolve(batch, engine)
          resolved = do_resolve(batch)
          resolved.engine = engine
          resolved
        end

        protected

        # Evaluate the batch file's contents in a Context binding so its
        # DSL calls populate the new Batch, which is then returned.
        def parse(batch_file)
          batch_file = batch_file.path if batch_file.instance_of?(File)
          batch = ETL::Batch::Batch.new(batch_file)
          eval(IO.readlines(batch_file).join("\n"), Context.create(batch), batch_file)
          batch
        end

        # Coerce a String/File/Batch argument into a Batch (see resolve).
        def do_resolve(batch)
          case batch
          when String then ETL::Batch::Batch.parse(File.new(batch))
          when File then ETL::Batch::Batch.parse(batch)
          when ETL::Batch::Batch then batch
          else
            raise RuntimeError, "Batch must be a String, File or Batch object"
          end
        end
      end

      # Create a batch for the given file path.
      def initialize(file)
        @file = file
      end

      # Queue a Run directive for the given control file.
      def run(file)
        directives << Run.new(self, file)
      end

      # Queue a UseTempTables directive.
      # NOTE(review): +value+ is currently ignored — the directive always
      # enables temp tables when executed. Confirm before relying on it.
      def use_temp_tables(value = true)
        directives << UseTempTables.new(self)
      end

      # Execute every queued directive in order, wrapped by the
      # before/after hooks, announcing progress through the engine.
      def execute
        engine.say "Executing batch"
        before_execute
        directives.each(&:execute)
        engine.say "Finishing batch"
        after_execute
        engine.say "Batch complete"
      end

      # Directives queued so far (lazily initialized).
      def directives
        @directives ||= []
      end

      # Hook invoked before the directives run (no-op by default).
      def before_execute
      end

      # Hook invoked after the directives run.
      def after_execute
        ETL::Engine.finish # TODO: should be moved to the directive?
        ETL::Engine.use_temp_tables = false # reset the temp tables
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/batch/directives.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Batch #:nodoc:
    # Abstract base class for batch directives. Subclasses implement
    # #do_execute, which #execute delegates to (template method).
    class Directive
      # The batch this directive belongs to
      attr_reader :batch

      # Remember the owning batch.
      def initialize(batch)
        @batch = batch
      end

      # Run the directive.
      def execute
        do_execute
      end

      protected

      # Subclass hook; raises unless overridden.
      def do_execute
        raise RuntimeError, "Directive must implement do_execute method"
      end
    end

    # Directive indicating that the specified ETL control file should
    # be run as part of the batch.
    class Run < Directive
      # The control file to execute
      attr_reader :file

      # Remember the batch and the control file to run.
      def initialize(batch, file)
        super(batch)
        @file = file
      end

      protected

      # Process the control file. If the engine created a new job record
      # during the run, re-parent it onto the batch that was current
      # beforehand, then restore that batch as the engine's current one.
      def do_execute
        previous_batch = ETL::Engine.batch
        batch.engine.process(file)

        job = ETL::Engine.batch
        if job.is_a?(ETL::Execution::Batch) &&
           previous_batch[:id] != job[:id]
          job[:batch_id] = previous_batch[:id]
          job.save!
        end

        ETL::Engine.batch = previous_batch
      end
    end

    # Directive indicating temp tables should be used.
    class UseTempTables < Directive
      def initialize(batch)
        super(batch)
      end

      protected

      # Flip the engine-wide temp tables flag on.
      def do_execute
        ETL::Engine.use_temp_tables = true
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/builder.rb:
--------------------------------------------------------------------------------
1 | require 'etl/builder/date_dimension_builder'
2 | require 'etl/builder/time_dimension_builder'
--------------------------------------------------------------------------------
/lib/etl/builder/time_dimension_builder.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Builder #:nodoc:
    # Builder that creates a simple time dimension: one record per
    # second of the day (24 * 60 * 60 = 86,400 records).
    class TimeDimensionBuilder
      # No configuration required.
      #
      # BUG FIX: previously `def build` was nested inside `initialize`
      # (a missing `end`), so #build was only defined on the class as a
      # side effect of the first `new` call.
      def initialize
      end

      # Returns an array of hashes representing records in the dimension.
      # The values for each record are accessed by name: :hour, :minute,
      # :second, :minute_description ("HH:MM") and
      # :full_description ("HH:MM:SS").
      def build(options={})
        records = []
        0.upto(23) do |t_hour|
          0.upto(59) do |t_minute|
            0.upto(59) do |t_second|
              # Zero-pad each component to two digits for the descriptions.
              t_hour_string = t_hour.to_s.rjust(2, '0')
              t_minute_string = t_minute.to_s.rjust(2, '0')
              t_second_string = t_second.to_s.rjust(2, '0')
              record = {}
              record[:hour] = t_hour
              record[:minute] = t_minute
              record[:second] = t_second
              record[:minute_description] = "#{t_hour_string}:#{t_minute_string}"
              record[:full_description] = "#{t_hour_string}:#{t_minute_string}:#{t_second_string}"
              records << record
            end
          end
        end
        records
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/commands/etl.rb:
--------------------------------------------------------------------------------
1 | #--
2 | # Copyright (c) 2006 Anthony Eden
3 | #
4 | # Permission is hereby granted, free of charge, to any person obtaining
5 | # a copy of this software and associated documentation files (the
6 | # "Software"), to deal in the Software without restriction, including
7 | # without limitation the rights to use, copy, modify, merge, publish,
8 | # distribute, sublicense, and/or sell copies of the Software, and to
9 | # permit persons to whom the Software is furnished to do so, subject to
10 | # the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be
13 | # included in all copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 | #++
23 |
require 'benchmark'
require 'getoptlong'

# Print a usage statement
def usage #:nodoc:
  puts "Usage: etl file [file file ...]" # TODO: add the command line options
end

# Parse the command-line switches into an engine options hash, then run
# each remaining argument (a control file) through ETL::Engine.
def execute
  opts = GetoptLong.new(
    [ '--version', '-v', GetoptLong::NO_ARGUMENT],
    [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
    [ '--config', '-c', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--limit', '-l', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--offset', '-o', GetoptLong::REQUIRED_ARGUMENT],
    [ '--newlog', '-n', GetoptLong::NO_ARGUMENT ],
    [ '--skip-bulk-import', '-s', GetoptLong::NO_ARGUMENT ],
    [ '--read-locally', GetoptLong::NO_ARGUMENT],
    [ '--rails-root', GetoptLong::REQUIRED_ARGUMENT]
  )

  # Collect engine options; --version and --help short-circuit.
  options = {}
  opts.each do |opt, arg|
    case opt
    when '--version'
      puts "ActiveWarehouse ETL version #{ETL::VERSION::STRING}"
      return
    when '--help'
      usage
      return
    when '--config'
      options[:config] = arg
    when '--limit'
      options[:limit] = arg.to_i
    when '--offset'
      options[:offset] = arg.to_i
    when '--newlog'
      options[:newlog] = true
    when '--skip-bulk-import'
      puts "skip bulk import enabled"
      options[:skip_bulk_import] = true
    when '--read-locally'
      puts "read locally enabled"
      options[:read_locally] = true
    when '--rails-root'
      options[:rails_root] = arg
      puts "rails root set to #{options[:rails_root]}"
    end
  end

  # Remaining arguments are the control files to process, in order.
  if ARGV.length < 1
    usage
  else
    puts "Starting ETL process"

    ETL::Engine.init(options)
    ARGV.each do |f|
      ETL::Engine.realtime_activity = true
      ETL::Engine.process(f)
      # Abort with the engine's exit code if this run set one.
      exit(ETL::Engine.exit_code) if ETL::Engine.exit_code
    end

    puts "ETL process complete\n\n"
  end
end

execute
--------------------------------------------------------------------------------
/lib/etl/control.rb:
--------------------------------------------------------------------------------
1 | require 'etl/control/control'
2 | require 'etl/control/source'
3 | require 'etl/control/destination'
--------------------------------------------------------------------------------
/lib/etl/control/destination/csv_destination.rb:
--------------------------------------------------------------------------------
1 | # This source file contains the ETL::Control::CsvDestination
2 |
module ETL #:nodoc:
  module Control #:nodoc:
    # CSV File as the final destination.
    class CsvDestination < Destination
      # The File to write to
      attr_reader :file

      # The output order
      # (superseded by the #order method below, which lazily falls back to
      # the order derived from the source when no mapping order was given)
      attr_reader :order

      # Flag which indicates to append (default is to overwrite)
      attr_accessor :append

      # The separator
      attr_accessor :separator

      # The end of line marker
      attr_accessor :eol

      # The enclosure character
      attr_accessor :enclose

      # Initialize the object.
      # * control: The Control object
      # * configuration: The configuration map
      # * mapping: The output mapping
      #
      # Configuration options:
      # * :file: The file to write to (REQUIRED)
      # * :append: Set to true to append to the file (default is to overwrite)
      # * :separator: Record separator (default is a comma)
      # * :eol: End of line marker (default is \n)
      # * :enclose: Set to true or false
      # * :unique: Set to true to only write unique records
      # * :append_rows: Array of rows to append
      #
      # Mapping options:
      # * :order: The order array
      def initialize(control, configuration, mapping={})
        super
        # Resolve a relative :file against the control file's directory.
        path = Pathname.new(configuration[:file])
        @file = path.absolute? ? path : Pathname.new(File.dirname(File.expand_path(control.file))) + path
        # NOTE: `||=` also writes the defaults back into the caller's
        # configuration hash (side effect shared with other destinations).
        @append = configuration[:append] ||= false
        @separator = configuration[:separator] ||= ','
        @eol = configuration[:eol] ||= "\n"
        # `true &` coerces the option to a strict true/false (nil => false).
        @enclose = true & configuration[:enclose]
        # Unique-constraint fields always include the SCD bookkeeping fields.
        @unique = configuration[:unique] ? configuration[:unique] + scd_required_fields : configuration[:unique]
        @unique.uniq! unless @unique.nil?
        @write_header = configuration[:write_header]
        @order = mapping[:order] + scd_required_fields if mapping[:order]
        @order.uniq! unless @order.nil?
      end

      # The output order; falls back to the order derived from the source
      # when no explicit mapping order was configured.
      def order
        @order ||= order_from_source
      end

      # Close the destination. This will flush the buffer and close the underlying stream or connection.
      def close
        buffer << append_rows if append_rows
        flush
        f.close
      end

      # Flush the destination buffer
      #
      # Writes the header row once (when :write_header is set), skips rows
      # rejected by the unique-row check, and serializes dates/times with
      # the :db format.
      def flush
        #puts "Flushing buffer (#{file}) with #{buffer.length} rows"
        if @write_header && !@header_written
          f << order
          @header_written = true
        end

        buffer.flatten.each do |row|
          #puts "row change type: #{row.change_type}"
          # check to see if this row's compound key constraint already exists
          # note that the compound key constraint may not utilize virtual fields
          next unless row_allowed?(row)

          # add any virtual fields
          add_virtuals!(row)

          # collect all of the values using the order designated in the configuration
          values = order.collect do |name|
            value = row[name]
            case value
            when Date, Time, DateTime
              value.to_s(:db)
            else
              value.to_s
            end
          end

          f << values
        end
        f.flush
        buffer.clear
        #puts "After flush there are #{buffer.length} rows"
      end

      private
      # Get the open file stream (lazily opened FasterCSV handle)
      def f
        @f ||= FasterCSV.open(file, mode, options)
      end

      # Options passed to FasterCSV when opening the output stream
      def options
        @options ||= {
          :col_sep => separator,
          :row_sep => eol,
          :force_quotes => enclose
        }
      end

      # Get the appropriate mode to open the file stream
      def mode
        append ? 'a' : 'w'
      end
    end
  end
end
123 |
--------------------------------------------------------------------------------
/lib/etl/control/destination/database_destination.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Control #:nodoc:
    # Destination which writes directly to a database. This is useful when you are dealing with
    # a small amount of data. For larger amounts of data you should probably use the bulk
    # loader if it is supported with your target database as it will use a much faster load
    # method.
    class DatabaseDestination < Destination
      # The target connection
      attr_reader :target

      # The table
      attr_reader :table

      # Specify the order from the source
      attr_reader :order

      # Set to true to truncate the destination table first
      attr_reader :truncate

      # Initialize the database destination
      #
      # * control: The ETL::Control::Control instance
      # * configuration: The configuration Hash
      # * mapping: The mapping
      #
      # Configuration options:
      # * :database: The database name (REQUIRED)
      # * :target: The target connection (REQUIRED)
      # * :table: The table to write to (REQUIRED)
      # * :truncate: Set to true to truncate before writing (defaults to false)
      # * :unique: Set to true to only insert unique records (defaults to false)
      # * :append_rows: Array of rows to append
      #
      # Mapping options:
      # * :order: The order of fields to write (REQUIRED)
      def initialize(control, configuration, mapping={})
        super
        @target = configuration[:target]
        @table = configuration[:table]
        # NOTE: `||=` also writes the default back into the configuration hash
        @truncate = configuration[:truncate] ||= false
        # unique/order get the SCD bookkeeping fields appended when present
        @unique = configuration[:unique] ? configuration[:unique] + [scd_effective_date_field] : configuration[:unique]
        @unique.uniq! unless @unique.nil?
        @order = mapping[:order] ? mapping[:order] + scd_required_fields : order_from_source
        @order.uniq! unless @order.nil?
        raise ControlError, "Order required in mapping" unless @order
        raise ControlError, "Table required" unless @table
        raise ControlError, "Target required" unless @target
      end

      # Flush the currently buffered data
      #
      # Inserts every allowed buffered row inside a single transaction,
      # quoting all identifiers and values through the connection.
      def flush
        conn.transaction do
          buffer.flatten.each do |row|
            # check to see if this row's compound key constraint already exists
            # note that the compound key constraint may not utilize virtual fields
            next unless row_allowed?(row)

            # add any virtual fields
            add_virtuals!(row)

            names = []
            values = []
            order.each do |name|
              names << conn.quote_column_name(name)
              values << conn.quote(row[name])
            end
            q = "INSERT INTO #{conn.quote_table_name(table_name)} (#{names.join(',')}) VALUES (#{values.join(',')})"
            ETL::Engine.logger.debug("Executing insert: #{q}")
            conn.insert(q, "Insert row #{current_row}")
            @current_row += 1
          end
          buffer.clear
        end
      end

      # Close the connection
      def close
        buffer << append_rows if append_rows
        flush
      end

      private
      # Lazily acquire the target connection; truncates the destination
      # table on first access when :truncate was requested.
      def conn
        @conn ||= begin
          conn = ETL::Engine.connection(target)
          conn.truncate(table_name) if truncate
          conn
        end
      end

      # Resolve the physical table name through the engine.
      def table_name
        ETL::Engine.table(table, ETL::Engine.connection(target))
      end

    end
  end
end
98 |
--------------------------------------------------------------------------------
/lib/etl/control/destination/excel_destination.rb:
--------------------------------------------------------------------------------
1 | optional_require 'spreadsheet'
2 |
module ETL
  module Control
    # Excel as the final destination.
    class ExcelDestination < Destination
      # The File to write to
      attr_reader :file

      # The output order
      attr_reader :order

      # Flag which indicates to append (default is to overwrite)
      attr_accessor :append

      # Initialize the object.
      # * control: The Control object
      # * configuration: The configuration map
      # * mapping: The output mapping
      #
      # Configuration options:
      # * :file: The file to write to (REQUIRED)
      # * :append: Set to true to append to the file (default is to overwrite)
      # * :unique: Set to true to only write unique records
      # * :append_rows: Array of rows to append
      #
      # Mapping options:
      # * :order: The order array
      def initialize(control, configuration, mapping={})
        super
        # Resolve a relative :file against the control file's directory.
        path = Pathname.new(configuration[:file])
        @file = path.absolute? ? path : Pathname.new(File.dirname(File.expand_path(control.file))) + path
        # NOTE: `||=` also writes the default back into the configuration hash
        @append = configuration[:append] ||= false
        # unique/order get the SCD bookkeeping fields appended when present
        @unique = configuration[:unique] ? configuration[:unique] + scd_required_fields : configuration[:unique]
        @unique.uniq! unless @unique.nil?
        @order = mapping[:order] ? mapping[:order] + scd_required_fields : order_from_source
        @order.uniq! unless @order.nil?
        raise ControlError, "Order required in mapping" unless @order
      end

      # Close the destination. This will flush the buffer and close the underlying stream or connection.
      def close
        buffer << append_rows if append_rows
        flush
        # The workbook is only persisted to disk here, on close.
        book.write(file)
      end

      # Flush the destination buffer
      #
      # NOTE(review): the sheet row index restarts at 0 on every call, so a
      # second flush would overwrite rows written by the first — confirm that
      # flush is effectively only invoked once (from #close).
      def flush
        #puts "Flushing buffer (#{file}) with #{buffer.length} rows"
        buffer.flatten.each_with_index do |row, index|
          #puts "row change type: #{row.change_type}"
          # check to see if this row's compound key constraint already exists
          # note that the compound key constraint may not utilize virtual fields
          next unless row_allowed?(row)

          # add any virtual fields
          add_virtuals!(row)

          # collect all of the values using the order designated in the configuration
          values = order.collect do |name|
            value = row[name]
            case value
            when Date, Time, DateTime
              value.to_s(:db)
            else
              value.to_s
            end
          end

          # write the values
          sheet.insert_row(index, values)
        end
        buffer.clear
        #puts "After flush there are #{buffer.length} rows"
      end

      private
      # Get the open file excel (existing workbook when appending, else new)
      def book
        @book ||= ( append ? Spreadsheet.open(file) : Spreadsheet::Workbook.new(file) )
      end

      private
      # Get the open sheet (first sheet when appending, else a fresh one)
      def sheet
        @sheet ||= ( append ? book.worksheet(0) : book.create_worksheet() )
      end
    end
  end
end
92 |
--------------------------------------------------------------------------------
/lib/etl/control/destination/update_database_destination.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Control #:nodoc:
    # Destination which writes directly to a database. This is useful when you are dealing with
    # a small amount of data. For larger amounts of data you should probably use the bulk
    # loader if it is supported with your target database as it will use a much faster load
    # method.
    class UpdateDatabaseDestination < Destination
      # The target connection
      attr_reader :target

      # The table
      attr_reader :table

      # Specify the order from the source
      attr_reader :order

      # Specify the conditions from the source
      attr_reader :conditions

      # Initialize the database destination
      #
      # * control: The ETL::Control::Control instance
      # * configuration: The configuration Hash
      # * mapping: The mapping
      #
      # Configuration options:
      # * :database: The database name (REQUIRED)
      # * :target: The target connection (REQUIRED)
      # * :table: The table to write to (REQUIRED)
      # * :unique: Set to true to only insert unique records (defaults to false)
      # * :append_rows: Array of rows to append
      #
      # Mapping options:
      # * :order: The order of fields to write (REQUIRED)
      # * :conditions: The conditions on the fields to update (REQUIRED)
      def initialize(control, configuration, mapping={})
        super
        @target = configuration[:target]
        @table = configuration[:table]
        # unique/order/conditions get the SCD bookkeeping fields appended when present
        @unique = configuration[:unique] ? configuration[:unique] + [scd_effective_date_field] : configuration[:unique]
        @unique.uniq! unless @unique.nil?
        @order = mapping[:order] ? mapping[:order] + scd_required_fields : order_from_source
        @order.uniq! unless @order.nil?
        @conditions = mapping[:conditions] ? mapping[:conditions] + scd_required_fields : nil
        @conditions.uniq! unless @conditions.nil?
        raise ControlError, "Conditions required in mapping" unless @conditions
        raise ControlError, "Order required in mapping" unless @order
        raise ControlError, "Table required" unless @table
        raise ControlError, "Target required" unless @target
      end

      # Flush the currently buffered data
      #
      # Builds and executes one UPDATE statement per allowed row inside a
      # single transaction. The WHERE clause comes from the :conditions
      # mapping; the SET clause from the :order fields.
      def flush
        conn.transaction do
          buffer.flatten.each do |row|
            # check to see if this row's compound key constraint already exists
            # note that the compound key constraint may not utilize virtual fields
            next unless row_allowed?(row)

            # add any virtual fields
            add_virtuals!(row)

            conditionsfilter = []
            conditions.each do |cond|
              c = " #{cond[:field]} #{cond[:comp]} #{cond[:value]} "
              condition = c
              begin
                # Re-interpolate the condition string so #{...} expressions
                # inside :value are evaluated against the current scope.
                # SECURITY NOTE: this evals control-file content; conditions
                # must come from trusted control files only.
                condition = eval('"' + c + '"')
              rescue
                # fall back to the raw condition string if eval fails
              end
              conditionsfilter << condition
            end

            updatevalues = []
            order.each do |name|
              updatevalues << "#{conn.quote_column_name(name)} = #{conn.quote(row[name])}"
            end
            q = "UPDATE #{conn.quote_table_name(table_name)} SET #{updatevalues.join(',')} WHERE #{conditionsfilter.join(' AND ')}"
            ETL::Engine.logger.debug("Executing update: #{q}")
            conn.update(q, "Update row #{current_row}")
            @current_row += 1
          end
          buffer.clear
        end
      end

      # Close the connection
      def close
        buffer << append_rows if append_rows
        flush
      end

      private
      # Lazily acquire the target connection.
      # NOTE(review): the rescue masks the original connection error with a
      # generic RuntimeError, discarding the underlying cause.
      def conn
        @conn ||= begin
          conn = ETL::Engine.connection(target)
          conn
        rescue
          raise RuntimeError, "Problem to connect to db"
        end
      end

      # Resolve the physical table name through the engine.
      def table_name
        ETL::Engine.table(table, ETL::Engine.connection(target))
      end

    end
  end
end
110 |
--------------------------------------------------------------------------------
/lib/etl/control/destination/yaml_destination.rb:
--------------------------------------------------------------------------------
1 | require 'yaml'
2 |
module ETL #:nodoc:
  module Control #:nodoc:
    # Destination that serializes each row as a YAML document in one file.
    class YamlDestination < Destination
      attr_reader :file, :append, :only, :except
      # Initialize the object.
      # * control: The Control object
      # * configuration: The configuration map
      # * mapping: The output mapping
      #
      # Configuration options:
      # * :file: The file to write to (REQUIRED)
      # * :append: Set to true to append to the file (default is to overwrite)
      # * :only: keys to include (all other keys are dropped)
      # * :except: keys to exclude (all other keys are kept)
      def initialize(control, configuration, mapping={})
        super
        # The output path is always relative to the control file's directory.
        @file = File.join(File.dirname(control.file), configuration[:file])
        # NOTE: `||=` also writes the default back into the configuration hash
        @append = configuration[:append] ||= false
        @only = configuration[:only]
        @except = configuration[:except]
        raise ControlError, "the :only and :except options must be used seperately, do not specify both" if @only && @except
      end

      # Close the destination. This will flush the buffer and close the underlying stream or connection.
      def close
        flush
        f.close
      end

      # Flush the destination buffer
      #
      # Dumps each allowed row as a YAML document, honoring the :only /
      # :except key filters and converting dates/times to :db format.
      def flush
        #puts "Flushing buffer (#{file}) with #{buffer.length} rows"
        buffer.flatten.each do |row|
          # check to see if this row's compound key constraint already exists
          # note that the compound key constraint may not utilize virtual fields
          next unless row_allowed?(row)
          # add any virtual fields
          add_virtuals!(row)

          yaml = {}
          row.each do |key, value|
            next if only && !only.include?(key)
            next if except && except.include?(key)

            case value
            when Date, Time, DateTime
              value = value.to_s(:db)
            end

            yaml[key] = value
          end

          # write the values
          YAML.dump(yaml, f)
        end
        f.flush
        buffer.clear
      end

      private
      # Get the open file stream
      def f
        @f ||= File.open(file, mode)
      end

      # Get the appropriate mode to open the file stream
      def mode
        append ? 'a' : 'w'
      end
    end
  end
end
75 |
--------------------------------------------------------------------------------
/lib/etl/control/source/enumerable_source.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Control #:nodoc:
    # Source that reads rows out of any Enumerable object supplied via the
    # :enumerable configuration option.
    class EnumerableSource < ETL::Control::Source
      # Iterate over the configured enumerable, yielding each element.
      def each(&block)
        enumerable = configuration[:enumerable]
        enumerable.each(&block)
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/control/source/file_source.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Control #:nodoc:
    # A File source.
    class FileSource < Source
      # The number of lines to skip, default is 0
      attr_accessor :skip_lines

      # Accessor for the underlying parser
      attr_accessor :parser

      # The source file
      attr_accessor :file

      # Initialize the source
      #
      # Configuration options:
      # * :file: The source file
      # * :parser: One of the following: a parser name as a String or
      #   symbol, a class which extends from Parser, a Hash with :name and
      #   optionally an :options key. Whether or not the parser uses the
      #   options is dependent on which parser is used. See the documentation
      #   for each parser for information on what options it accepts.
      # * :skip_lines: The number of lines to skip (defaults to 0)
      # * :store_locally: Set to false to not store a copy of the
      #   source data locally for archival
      def initialize(control, configuration, definition)
        super
        configure
      end

      # Get a String identifier for the source
      def to_s
        file
      end

      # Get the local storage directory
      def local_directory
        File.join(local_base, File.basename(file, File.extname(file)))
      end

      # Returns each row from the source
      #
      # Rows before ETL::Engine.offset (when set) are counted but skipped;
      # every yielded row is wrapped in an ETL::Row tagged with this source.
      def each
        count = 0
        copy_sources if @store_locally
        @parser.each do |row|
          if ETL::Engine.offset && count < ETL::Engine.offset
            count += 1
          else
            row = ETL::Row[row]
            row.source = self
            yield row
          end
        end
      end

      # Field order as reported by the underlying parser.
      def order
        @parser.fields.collect {|field| field.name}
      end

      private
      # Copy source data to a local directory structure
      #
      # Expands the (possibly glob) file pattern relative to the control
      # file, copies each match to a sequenced local file and touches an
      # empty trigger file alongside each copy to mark it complete.
      def copy_sources
        sequence = 0
        path = Pathname.new(file)
        path = path.absolute? ? path : Pathname.new(File.dirname(control.file)) + path
        Pathname.glob(path).each do |f|
          next if f.directory?
          lf = local_file(sequence)
          FileUtils.cp(f, lf)
          # create an empty trigger file (the block intentionally writes nothing)
          File.open(local_file_trigger(lf), 'w') {|f| }
          sequence += 1
        end
      end

      # Configure the source
      #
      # Builds the parser from the :parser option, which may be a Class,
      # a String/Symbol parser name, or a Hash with :name and :options.
      def configure
        @file = configuration[:file]
        case configuration[:parser]
        when Class
          @parser = configuration[:parser].new(self)
        when String, Symbol
          @parser = ETL::Parser::Parser.class_for_name(configuration[:parser]).new(self)
        when Hash
          name = configuration[:parser][:name]
          options = configuration[:parser][:options]
          @parser = ETL::Parser::Parser.class_for_name(name).new(self, options)
        else
          raise ControlError, "Configuration option :parser must be a Class, String or Symbol"
        end
        # NOTE: `||=` also writes the default back into the configuration hash
        @skip_lines = configuration[:skip_lines] ||= 0
      end
    end
  end
end
95 |
--------------------------------------------------------------------------------
/lib/etl/control/source/model_source.rb:
--------------------------------------------------------------------------------
1 | #RAILS_ENV = 'development'
2 | #require '../config/environment'
3 |
module ETL #:nodoc:
  module Control #:nodoc:
    # Source that reads rows from an ActiveRecord model.
    class ModelSource < Source

      # Columns to read, derived from the source definition: either an
      # Array of column names or a Hash whose keys are the column names.
      def columns
        case definition
        when Array
          definition.collect(&:to_sym)
        when Hash
          definition.keys.collect(&:to_sym)
        else
          raise "Definition must be either an Array or a Hash"
        end
      end

      # Name of the Rails model (from the :model configuration option).
      def railsmodel
        configuration[:model]
      end

      # SQL order clause for the query; defaults to "id".
      def order
        configuration[:order] || "id"
      end

      # Yield one ETL::Row per record, containing the configured columns.
      # NOTE(review): uses the legacy find(:all, :order => ...) API removed
      # in Rails 4 — confirm the Rails versions this gem must support.
      def each(&block)
        railsmodel.to_s.camelize.constantize.find(:all,:order=>order).each do |row|
          result_row = ETL::Row.new
          result_row.source = self
          columns.each do |column|
            result_row[column.to_sym] = row.send(column)
          end
          yield result_row
        end
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/control/source/mysql_streamer.rb:
--------------------------------------------------------------------------------
1 | require 'open3'
2 |
# Internal: The MySQL streamer is a helper which works with the database_source
# in order to allow you to use the --quick option (which stops MySQL
# from building a full result set); we also avoid building a full result set
# in Ruby - instead we yield a row at a time
#
class MySqlStreamer
  # Internal: Creates a MySQL Streamer
  #
  # query - the SQL query
  # target - the name of the ETL configuration (ie. development/production)
  # connection - the ActiveRecord connection
  #
  # Examples
  #
  #   MySqlStreamer.new("select * from bob", "development", my_connection)
  #
  def initialize(query, target, connection)
    # Lets just be safe and also make sure there aren't new lines
    # in the SQL - its bound to cause trouble
    @query = query.split.join(' ')
    @name = target
    # Probe for a single row so any?/first can answer cheaply without
    # streaming the whole result set. Use the normalized @query so the
    # probe matches what the mysql CLI is later given (the raw query may
    # still contain newlines).
    @first_row = connection.select_all("#{@query} LIMIT 1")
  end

  # We implement some bits of a hash so that database_source
  # can use them
  def any?
    @first_row.any?
  end

  def first
    @first_row.first
  end

  # Fetch a required key from the connection configuration hash,
  # raising when it is missing or blank.
  def mandatory_option!(hash, key)
    value = hash[key]
    raise "Missing key #{key} in connection configuration #{@name}" if value.blank?
    value
  end

  # Stream the query through the mysql CLI with --quick, yielding one Hash
  # per row (column name => value, with the literal 'NULL' mapped to nil).
  def each
    keys = nil

    config = ETL::Base.configurations[@name.to_s]
    host = mandatory_option!(config, 'host')
    username = mandatory_option!(config, 'username')
    database = mandatory_option!(config, 'database')
    password = config['password'] # this one can be omitted in some cases

    # SECURITY NOTE: connection settings and the query are interpolated into
    # a shell command line; they must come from trusted configuration only.
    mysql_command = """mysql --quick -h #{host} -u #{username} -e \"#{@query.gsub("\n","")}\" -D #{database} --password=#{password} -B"""
    Open3.popen3(mysql_command) do |stdin, out, err, external|
      until (line = out.gets).nil? do
        line = line.gsub("\n","")
        if keys.nil?
          # the first tab-separated line holds the column headers
          keys = line.split("\t")
        else
          hash = Hash[keys.zip(line.split("\t"))]
          # map out NULL to nil
          hash.each do |k, v|
            hash[k] = nil if v == 'NULL'
          end
          yield hash
        end
      end
      error = err.gets
      if (!error.nil? && error.strip.length > 0)
        # BUGFIX: was `throw error`, which blows up with UncaughtThrowError
        # instead of reporting the mysql failure; raise the stderr output
        # as a RuntimeError so callers see the real problem.
        raise error
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/core_ext.rb:
--------------------------------------------------------------------------------
1 | require 'etl/core_ext/time'
--------------------------------------------------------------------------------
/lib/etl/core_ext/time.rb:
--------------------------------------------------------------------------------
1 | require File.dirname(__FILE__) + '/time/calculations'
2 |
class Time#:nodoc:
  # Mix the calendar/fiscal-year helpers (week, quarter, fiscal_year, ...)
  # into the core Time class.
  include ETL::CoreExtensions::Time::Calculations
end
6 |
--------------------------------------------------------------------------------
/lib/etl/core_ext/time/calculations.rb:
--------------------------------------------------------------------------------
1 | #Updated by Jack Hong on 04/05/08
2 |
module ETL #:nodoc:
  module CoreExtensions #:nodoc:
    module Time #:nodoc:
      # Calendar and fiscal-year calculations, mixed into Time.
      # Fiscal-year methods take the fiscal year's starting month
      # (default 10, i.e. October).
      module Calculations
        # Week of the year (1..52); the short 53rd week folds into week 52.
        def week
          w = (yday - 1) / 7 + 1
          w == 53 ? 52 : w
        end

        # Calendar quarter (1..4).
        def quarter
          (month + 2) / 3
        end

        # Week of the fiscal year (1..52); the short 53rd week folds into 52.
        def fiscal_year_week(offset_month=10)
          w = (fiscal_year_yday(offset_month) - 1) / 7 + 1
          w == 53 ? 52 : w
        end

        # Month of the fiscal year (1..12).
        def fiscal_year_month(offset_month=10)
          m = month - offset_month + 1
          m <= 0 ? m + 12 : m
        end

        # Quarter of the fiscal year (1..4).
        def fiscal_year_quarter(offset_month=10)
          (fiscal_year_month(offset_month) + 2) / 3
        end

        # Fiscal year: months at or after offset_month belong to the next
        # calendar year's fiscal year.
        def fiscal_year(offset_month=10)
          month >= offset_month ? year + 1 : year
        end

        # Day of the fiscal year (1..365).
        def fiscal_year_yday(offset_month=10)
          offset = (1...offset_month).inject(0) { |sum, m| sum + ::Time.days_in_month(m, year) }
          d = yday - offset
          d <= 0 ? d + 365 : d
        end
      end
    end
  end
end
43 |
--------------------------------------------------------------------------------
/lib/etl/execution.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc
  # Classes which store information about ETL execution
  module Execution
    # Execution management
    class Execution
      # Migrate the execution meta-data store to the latest schema version.
      def self.migrate
        ETL::Execution::Migration.migrate
      end
    end
  end
end
15 |
16 | require 'etl/execution/base'
17 | require 'etl/execution/batch'
18 | require 'etl/execution/job'
19 | require 'etl/execution/migration'
--------------------------------------------------------------------------------
/lib/etl/execution/base.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Execution #:nodoc:
    # Base class for ETL execution information
    #
    # Abstract ActiveRecord base class: the execution meta-data models
    # (Job, Batch) inherit from it and share the dedicated :etl_execution
    # database connection instead of the application's default connection.
    class Base < ActiveRecord::Base
      self.abstract_class = true
      establish_connection :etl_execution
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/execution/batch.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Execution #:nodoc:
    # Persistent class representing an ETL batch
    #
    # Batches can nest: a batch may belong to a parent batch and own child
    # batches, in addition to the jobs executed within it.
    class Batch < Base
      belongs_to :batch     # optional parent batch
      has_many :batches     # child batches
      has_many :jobs        # jobs executed as part of this batch
      attr_accessible :batch_file, :status, :completed_at
    end
  end
end
12 |
--------------------------------------------------------------------------------
/lib/etl/execution/job.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Execution #:nodoc:
    # Persistent class representing an ETL job
    #
    # One record per control-file execution, optionally grouped in a batch.
    class Job < Base
      belongs_to :batch
      attr_accessible :control_file, :status, :batch_id
    end
  end
end
10 |
--------------------------------------------------------------------------------
/lib/etl/execution/migration.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Execution #:nodoc
    # Handles migration of tables required for persistent storage of meta data
    # for the ETL engine
    class Migration
      class << self
        protected
        # Get the schema info table name
        def schema_info_table_name
          ActiveRecord::Migrator.schema_migrations_table_name
        end
        alias :schema_migrations_table_name :schema_info_table_name

        public
        # Execute the migrations
        #
        # Applies each migration_N method from the last recorded version up
        # to #target, recording progress in the schema migrations table.
        def migrate
          connection.initialize_schema_migrations_table
          last_migration.upto(target - 1) do |i|
            __send__("migration_#{i+1}".to_sym)
            connection.assume_migrated_upto_version(i+1)
          end
        end

        protected
        # Highest migration version already recorded (0 when none).
        def last_migration
          connection.select_values(
            "SELECT version FROM #{schema_migrations_table_name}"
          ).map(&:to_i).sort.last || 0
        end

        # Get the connection to use during migration
        def connection
          @connection ||= ETL::Execution::Base.connection
        end

        # Get the final target version number
        #
        # NOTE(review): migration_5 is defined below but target is 4, so it
        # is never applied — confirm whether this should be bumped to 5.
        def target
          4
        end

        private
        # Create the jobs and records tables.
        def migration_1 #:nodoc:
          connection.create_table :jobs do |t|
            t.column :control_file, :string, :null => false
            t.column :created_at, :datetime, :null => false
            t.column :completed_at, :datetime
            t.column :status, :string
          end
          connection.create_table :records do |t|
            t.column :control_file, :string, :null => false
            t.column :natural_key, :string, :null => false
            t.column :crc, :string, :null => false
            t.column :job_id, :integer, :null => false
          end
        end

        # Index the records table.
        def migration_2 #:nodoc:
          connection.add_index :records, :control_file
          connection.add_index :records, :natural_key
          connection.add_index :records, :job_id
        end

        # Create the batches table and link jobs to batches.
        def migration_3 #:nodoc:
          connection.create_table :batches do |t|
            t.column :batch_file, :string, :null => false
            t.column :created_at, :datetime, :null => false
            t.column :completed_at, :datetime
            t.column :status, :string
          end
          connection.add_column :jobs, :batch_id, :integer
          connection.add_index :jobs, :batch_id
        end

        # Drop the records table (superseded).
        def migration_4
          connection.drop_table :records
        end

        # Allow batches to nest under a parent batch.
        # NOTE(review): unreachable while #target returns 4 (see above).
        def migration_5
          connection.add_column :batches, :batch_id, :integer
          connection.add_index :batches, :batch_id
        end

        # Update the schema info table, setting the version value
        # NOTE(review): appears unused within this file; possibly legacy.
        def update_schema_info(version)
          connection.update("UPDATE #{schema_info_table_name} SET version = #{version}")
        end
      end
    end
  end
end
91 |
--------------------------------------------------------------------------------
/lib/etl/generator.rb:
--------------------------------------------------------------------------------
1 | require 'etl/generator/generator'
2 | Dir[File.dirname(__FILE__) + "/generator/*.rb"].each { |file| require(file) }
--------------------------------------------------------------------------------
/lib/etl/generator/generator.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Generator #:nodoc:
    # Base class for generators.
    class Generator
      # Resolve a generator name (string or symbol) to its implementing class.
      #
      # Example: class_for_name(:surrogate_key) returns the
      # SurrogateKeyGenerator class.
      def self.class_for_name(name)
        ETL::Generator.const_get("#{name.to_s.camelize}Generator")
      end

      # Generate the next value. This method must be implemented by a subclass.
      def next
        raise "Must be implemented by a subclass"
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/generator/surrogate_key_generator.rb:
--------------------------------------------------------------------------------
# This source file contains code for a basic sequential surrogate key generator

module ETL #:nodoc:
  module Generator #:nodoc:
    # Sequential surrogate key generator. The counter is seeded either from
    # SELECT max(column) on a table or from an arbitrary seed query, then
    # incremented on every call to +next+.
    class SurrogateKeyGenerator < Generator
      attr_reader :table, :target, :column, :query

      # Build the generator.
      #
      # Options:
      # * :table: seed from SELECT max(column) on this table
      # * :target: identifier of the target connection
      # * :column: the key column, defaults to 'id'
      # * :query: explicit seed query, consulted when no :table is given
      def initialize(options={})
        @table  = options[:table]
        @target = options[:target]
        @column = options[:column] || 'id'
        @query  = options[:query]

        seed =
          if table
            ETL::Engine.connection(target).select_value("SELECT max(#{column}) FROM #{table_name}")
          elsif query
            ETL::Engine.connection(target).select_value(query)
          end
        seed = 0 if seed.blank?
        @surrogate_key = seed.to_i
      end

      # Get the next surrogate key.
      def next
        @surrogate_key ||= 0
        @surrogate_key += 1
      end

      # Fully resolved table name on the target connection.
      def table_name
        ETL::Engine.table(table, ETL::Engine.connection(target))
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/parser.rb:
--------------------------------------------------------------------------------
1 | # This source file contains the ETL::Parser module and requires all of the files
2 | # in the parser directory ending with .rb
3 |
4 | module ETL #:nodoc:
5 | # The ETL::Parser module provides various text parsers.
6 | module Parser
7 | end
8 | end
9 |
10 | require 'etl/parser/parser'
11 | Dir[File.dirname(__FILE__) + "/parser/*.rb"].each { |file| require(file) }
--------------------------------------------------------------------------------
/lib/etl/parser/apache_combined_log_parser.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Parser #:nodoc:
    # Parser which can parse the Apache Combined Log Format as defined at
    # http://httpd.apache.org/docs/2.2/logs.html
    class ApacheCombinedLogParser < ETL::Parser::Parser
      include HttpTools

      # * <tt>source</tt>: the Source object
      # * <tt>options</tt>: optional Hash of parser options
      def initialize(source, options={})
        super
      end

      # Yield a parsed field Hash for each line of each file matched by the
      # source's file glob.
      def each
        Dir.glob(file).each do |file|
          # Block form so the handle is closed (the previous version leaked it).
          File.open(file) do |io|
            io.each_line do |line|
              yield parse(line)
            end
          end
        end
      end

      # Parse a single combined-format log line into a Hash of fields.
      # Fields logged as "-" (not available) are converted to nil. A line
      # that does not match the format yields a Hash of nils rather than
      # raising.
      def parse(line)
        # example line: 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 "http://www.example.com/start.html" "Mozilla/4.08 [en] (Win98; I ;Nav)"
        line =~ /^(\S+)\s(\S+)\s(\S+)\s\[([^\]]*)\]\s"([^"]*)"\s(\d*)\s(\d*)\s"([^"]*)"\s"([^"]*)"$/
        fields = {
          :ip_address => $1,
          :identd => $2,
          :user => $3,
          :timestamp => $4,
          :request => $5,
          :response_code => $6,
          :bytes => $7,
          :referrer => $8,
          :user_agent => $9,
        }
        d = Date._strptime(fields[:timestamp], '%d/%b/%Y:%H:%M:%S') unless fields[:timestamp].nil?
        unless d.nil?
          # Date._strptime reports :sec_fraction as a fraction of a second,
          # but Time.mktime expects its 7th argument in *microseconds* —
          # convert instead of passing the raw fraction through.
          usec = d[:sec_fraction] ? d[:sec_fraction] * 1_000_000 : 0
          fields[:timestamp] = Time.mktime(d[:year], d[:mon], d[:mday], d[:hour], d[:min], d[:sec], usec)
        end

        # Guard like the other derived fields: on a non-matching line
        # :request is nil and calling split on it would raise NoMethodError.
        fields[:method], fields[:path] = fields[:request].split(/\s/) unless fields[:request].nil?

        fields.merge!(parse_user_agent(fields[:user_agent])) unless fields[:user_agent].nil?
        fields.merge!(parse_uri(fields[:referrer], :prefix => 'referrer_'))

        fields.each do |key, value|
          fields[key] = nil if value == '-'
        end
        fields
      end

    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/parser/csv_parser.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Parser #:nodoc:
    # Parses CSV files
    class CsvParser < ETL::Parser::Parser
      # Initialize the parser
      # * source: The Source object
      # * options: Hash of options for the parser, defaults to an empty hash.
      #   The options Hash is passed straight through to the CSV library
      #   (separators, quoting, etc.).
      def initialize(source, options={})
        super
        configure
      end

      # True when each row's column count is validated against the field
      # definition (from source.configuration[:validate_rows]; defaults to true).
      attr_reader :validate_rows

      # Read the header line of +file+ and build the Field list from it.
      # Duplicate header names are disambiguated with a numeric suffix, so
      # two "amount" columns become :amount_1 and :amount_2.
      def get_fields_names(file)
        File.open(file) do |input|
          fields = CSV.parse(input.readline, options).first
          new_fields = []
          fields.each_with_index do |field,index|
            # compute the index of occurrence of this specific occurrence of the field (usually, will be 1)
            occurrence_index = fields[0..index].find_all { |e| e == field }.size
            number_of_occurrences = fields.find_all { |e| e == field }.size
            new_field = field + (number_of_occurrences > 1 ? "_#{occurrence_index}" : "")
            new_fields << Field.new(new_field.to_sym)
          end
          return new_fields
        end
      end

      # Returns each row.
      # Yields a Hash of field name => value for every data row of every
      # file matched by the source's file glob, honoring source.skip_lines.
      def each
        Dir.glob(file).each do |file|
          ETL::Engine.logger.debug "parsing #{file}"
          # With no configured fields, fall back to the file's header row.
          if fields.length == 0
            ETL::Engine.logger.debug "no columns specified so reading names from first line of #{file}"
            @fields = get_fields_names(file)
          end
          line = 0
          lines_skipped = 0
          CSV.foreach(file, options) do |raw_row|
            if lines_skipped < source.skip_lines
              ETL::Engine.logger.debug "skipping line"
              lines_skipped += 1
              next
            end
            line += 1
            row = {}
            validate_row(raw_row, line, file) if self.validate_rows
            # Positional mapping: the Nth CSV column feeds the Nth field.
            raw_row.each_with_index do |value, index|
              f = fields[index]
              row[f.name] = value
            end
            yield row
          end
        end
      end

      # Get an array of defined fields
      def fields
        @fields ||= []
      end

      private
      # Raise MismatchError when the raw row's column count differs from
      # the number of defined fields.
      def validate_row(row, line, file)
        ETL::Engine.logger.debug "validating line #{line} in file #{file}"
        if row.length != fields.length
          raise_with_info( MismatchError,
            "The number of columns from the source (#{row.length}) does not match the number of columns in the definition (#{fields.length})",
            line, file
          )
        end
      end

      # Build @validate_rows and the Field list from the source definition.
      def configure
        @validate_rows = if source.configuration.has_key?(:validate_rows)
          source.configuration[:validate_rows]
        else
          true
        end

        source.definition.each do |options|
          case options
          when Symbol
            fields << Field.new(options)
          when Hash
            fields << Field.new(options[:name])
          else
            raise DefinitionError, "Each field definition must either be a symbol or a hash"
          end
        end
      end

      # Named column in the parsed output.
      class Field #:nodoc:
        attr_reader :name
        def initialize(name)
          @name = name
        end
      end
    end
  end
end
102 |
--------------------------------------------------------------------------------
/lib/etl/parser/excel_parser.rb:
--------------------------------------------------------------------------------
optional_require 'roo'

module ETL
  module Parser
    # Parses Excel workbooks via the roo gem.
    class ExcelParser < ETL::Parser::Parser

      attr_accessor :ignore_blank_line, :worksheet_column, :validate_rows

      # Initialize the parser
      # * source: The Source object
      # * options: Parser options Hash
      def initialize(source, options={})
        super
        configure
      end

      # Yield a row Hash for every data row of every selected worksheet of
      # every file matched by the source's file glob. Honors
      # source.skip_lines, optional blank-line skipping and optional
      # per-row validation.
      def each
        Dir.glob(file).each do |file|
          ETL::Engine.logger.debug "parsing #{file}"
          line = 0
          lines_skipped = 0
          book = Roo::Spreadsheet.open file

          # (The previous version also pre-built a `loopworksheets` array of
          # sheet objects here that was never used; removed as dead code.)
          sheet_index = -1

          book.each_with_pagename do |name, sheet|
            sheet_index += 1
            # When specific worksheets were configured, process only those
            # (matched by zero-based position in the workbook).
            if !worksheets.empty? && !worksheets.include?(sheet_index)
              next
            end
            sheet.each do |raw_row|
              if lines_skipped < source.skip_lines
                ETL::Engine.logger.debug "skipping line"
                lines_skipped += 1
                next
              end
              line += 1
              row = {}
              if self.ignore_blank_line and raw_row.empty?
                lines_skipped += 1
                next
              end
              validate_row(raw_row, line, file) if self.validate_rows
              # Positional mapping: the Nth cell feeds the Nth defined field.
              raw_row.each_with_index do |value, index|
                f = fields[index]
                row[f.name] = value
              end
              # Optionally record which worksheet the row came from.
              row[worksheet_column] = name if worksheet_column
              yield row
            end
          end
        end
      end

      # Get an array of defined worksheets
      def worksheets
        @worksheets ||= []
      end

      # Get an array of defined fields
      def fields
        @fields ||= []
      end

      private
      # Raise MismatchError when the raw row's cell count differs from the
      # number of defined fields.
      def validate_row(row, line, file)
        ETL::Engine.logger.debug "validating line #{line} in file #{file}"
        if row.length != fields.length
          raise_with_info( MismatchError,
            "The number of columns from the source (#{row.length}) does not match the number of columns in the definition (#{fields.length})",
            line, file
          )
        end
      end

      # Build worksheet selection, row options and the Field list from the
      # source definition.
      def configure
        source.definition[:worksheets].each do |worksheet|
          # Kernel#Integer raises ArgumentError/TypeError for non-numeric
          # input, so the previous `if Integer(worksheet)` could never reach
          # its else branch and non-integers escaped as ArgumentError.
          # Translate the failure into the documented DefinitionError.
          begin
            worksheets << Integer(worksheet)
          rescue ArgumentError, TypeError
            raise DefinitionError, "Each worksheet definition must be an integer"
          end
        end unless source.definition[:worksheets].nil?

        self.ignore_blank_line = source.definition[:ignore_blank_line]
        self.worksheet_column = source.definition[:worksheet_column]
        self.validate_rows = if source.configuration.has_key?(:validate_rows)
          source.configuration[:validate_rows]
        else
          true
        end

        source.definition[:fields].each do |options|
          case options
          when Symbol
            fields << Field.new(options)
          when Hash
            fields << Field.new(options[:name])
          else
            raise DefinitionError, "Each field definition must either be a symbol or a hash"
          end
        end
      end

      # Named column in the parsed output.
      class Field #:nodoc:
        attr_reader :name
        def initialize(name)
          @name = name
        end
      end

    end
  end
end
129 |
--------------------------------------------------------------------------------
/lib/etl/parser/fixed_width_parser.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Parser #:nodoc:
    # Parser for fixed width files
    class FixedWidthParser < ETL::Parser::Parser
      # Initialize the parser
      # * source: The source object
      # * options: Parser options Hash
      def initialize(source, options={})
        super
        configure
      end

      # Yield a row Hash for each line of each file matched by the source's
      # file glob, skipping the first source.skip_lines lines of each file.
      def each
        Dir.glob(file).each do |file|
          # skip_lines counts *lines* at the top of each file. The previous
          # version reset the counter for every line and tested it inside the
          # per-field loop, which skipped the first N fields of every line
          # instead of the first N lines of the file.
          lines_skipped = 0
          # File.open in block form (instead of Kernel#open without a block)
          # so the handle is closed deterministically.
          File.open(file) do |input|
            input.each do |line|
              if lines_skipped < source.skip_lines
                lines_skipped += 1
                next
              end
              row = {}
              fields.each do |name, f|
                # TODO make strip optional?
                row[name] = line[f.field_start, f.field_length].strip
              end
              yield row
            end
          end
        end
      end

      # Return a map of defined fields
      def fields
        @fields ||= {}
      end

      private
      # Build the field map from the source definition.
      def configure
        source.definition.each do |field, options|
          fields[field] = FixedWidthField.new(
            options[:name], options[:start], options[:end], options[:length]
          )
        end
      end
    end

    # Describes one column of a fixed-width line by 0-based start offset and
    # length. Exactly one of field_end / field_length must be supplied; the
    # other is derived.
    class FixedWidthField #:nodoc:
      attr_reader :name, :field_start, :field_end, :field_length
      # Initialize the field. +field_start+ is 1-based as written in control
      # files and converted to a 0-based string offset here.
      def initialize(name, field_start, field_end=nil, field_length=nil)
        @name = name
        @field_start = field_start - 1
        if field_end
          @field_end = field_end
          @field_length = @field_end - @field_start
        elsif field_length
          @field_length = field_length
          @field_end = @field_start + @field_length
        else
          raise DefinitionError, "Either field_end or field_length required"
        end
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/parser/nokogiri_xml_parser.rb:
--------------------------------------------------------------------------------
optional_require 'nokogiri'
require 'open-uri'
optional_require 'zlib'

module ETL
  module Parser
    # Nokogiri-backed XML parser with transparent gzip support.
    class NokogiriXmlParser < ETL::Parser::Parser
      # Initialize the parser
      # * source: The Source object
      # * options: Parser options Hash
      def initialize(source, options={})
        super
        configure
      end

      # Yield a row Hash (field name => text value) for every node matched
      # by the collection XPath in every file matched by the source glob.
      def each
        Dir.glob(file).each do |source|

          doc = nil

          # Detect gzip archives by the 0x1f 0x8b magic number.
          gzip = false
          if File.exist?(source)
            # Read the first two bytes in binary mode and close the handle
            # (the previous version leaked the File object). The old check
            # `unpack("H2H2").to_s.to_i(16)` relied on Ruby 1.8's Array#to_s
            # joining elements; on Ruby >= 1.9 it inspects instead, so the
            # comparison always produced 0 and gzip was never detected.
            magic = File.open(source, 'rb') { |f| f.read(2) }
            gzip = !magic.nil? && magic.unpack("H2H2").join.to_i(16) == 0x1f8b
          end

          if gzip
            doc = Nokogiri::XML(Zlib::GzipReader.open(source))
          else
            doc = Nokogiri::XML(open(source))
          end

          doc.xpath(@collection_xpath).each do |nodeset|
            row = {}

            fields.each do |f|
              value = nodeset.xpath(f.xpath).text
              row[f.name] = value
            end
            yield row
          end

        end
      end

      # Get an array of defined fields
      def fields
        @fields ||= []
      end

      private
      # Read the collection XPath and field list from the source definition.
      def configure
        @collection_xpath = source.definition[:collection]
        if @collection_xpath.nil?
          raise ":collection => 'XPath' argument required"
        end
        source.definition[:fields].each do |options|
          case options
          when Symbol
            fields << Field.new(options, options.to_s)
          when Hash
            options[:xpath] ||= options[:name]
            fields << Field.new(options[:name], options[:xpath].to_s)
          else
            raise DefinitionError,
              "Each field definition must either be an symbol " +
              "or a hash of options for the field"
          end
        end
      end

      # Pairs a row key with the XPath used to extract its value.
      class Field
        attr_reader :name, :xpath
        def initialize(name, xpath)
          @name = name
          @xpath = xpath
        end
      end
    end
  end
end
84 |
--------------------------------------------------------------------------------
/lib/etl/parser/parser.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Parser #:nodoc:
    # Base parser class. Implementation classes must extend this class and implement
    # the each method. The each method should return each row of the source data as
    # a Hash.
    class Parser
      include Enumerable
      class << self
        # Convert the name (string or symbol) to a parser class.
        #
        # Example:
        # class_for_name(:fixed_width) returns a FixedWidthParser class
        def class_for_name(name)
          ETL::Parser.const_get("#{name.to_s.camelize}Parser")
        end
      end

      # The Source object for the data
      attr_reader :source

      # Options Hash for the parser
      attr_reader :options

      # * <tt>source</tt>: the Source object
      # * <tt>options</tt>: optional parser options Hash (nil becomes {})
      def initialize(source, options={})
        @source = source
        @options = options || {}
      end

      protected
      # Resolve the source's configured :file, interpreting a relative path
      # against the directory of the control file.
      def file
        path = Pathname.new(source.configuration[:file])
        path = path.absolute? ? path : Pathname.new(File.dirname(source.control.file)) + path
        path
      end

      # Raise +error+ with +message+ annotated with the offending line and
      # file. Declared as (line, file) to match how the subclasses in this
      # codebase invoke it (CsvParser and ExcelParser both pass line, file);
      # the previous declaration (file, line) swapped the two values in the
      # generated message.
      def raise_with_info(error, message, line, file)
        raise error, "#{message} (line #{line} in #{file})"
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/parser/xml_parser.rb:
--------------------------------------------------------------------------------
require 'rexml/document'

module ETL
  module Parser
    # REXML-backed XML parser: loads each matched file as a DOM document,
    # walks the configured collection XPath and emits one row Hash per
    # matching element.
    class XmlParser < ETL::Parser::Parser
      # * <tt>source</tt>: the Source object
      # * <tt>options</tt>: optional parser options Hash
      def initialize(source, options={})
        super
        configure
      end

      # Yield a Hash of field name => text value for every element matched
      # by the collection XPath, in every file matched by the source glob.
      def each
        Dir.glob(file).each do |path|
          document = nil
          elapsed = Benchmark.realtime do
            document = REXML::Document.new(File.new(path))
          end
          Engine.logger.info "XML #{path} parsed in #{elapsed}s"
          document.elements.each(@collection_xpath) do |element|
            row = {}
            fields.each do |field|
              row[field.name] = element.text(field.xpath)
            end
            yield row
          end
        end
      end

      # The configured Field definitions.
      def fields
        @fields ||= []
      end

      private

      # Read the collection XPath and field list from the source definition.
      def configure
        @collection_xpath = source.definition[:collection]
        raise "Collection XPath is required" if @collection_xpath.nil?

        source.definition[:fields].each do |field_options|
          case field_options
          when Symbol
            fields << Field.new(field_options, field_options.to_s)
          when Hash
            field_options[:xpath] ||= field_options[:name]
            fields << Field.new(field_options[:name], field_options[:xpath].to_s)
          else
            raise DefinitionError, "Each field definition must either be an symbol or a hash of options for the field"
          end
        end
      end

      # Pairs a row key with the XPath used to extract its value.
      class Field
        attr_reader :name, :xpath
        def initialize(name, xpath)
          @name = name
          @xpath = xpath
        end
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/processor.rb:
--------------------------------------------------------------------------------
1 | # This source file contains the ETL::Processor module and requires all of the processors
2 |
3 | module ETL #:nodoc:
4 | # The ETL::Processor module contains row-level and bulk processors
5 | module Processor
6 | end
7 | end
8 |
9 | require 'etl/processor/processor'
10 | require 'etl/processor/row_processor'
11 | Dir[File.dirname(__FILE__) + "/processor/*.rb"].each { |file| require(file) }
--------------------------------------------------------------------------------
/lib/etl/processor/block_processor.rb:
--------------------------------------------------------------------------------
module ETL
  module Processor
    # Usable both as a RowProcessor (invoked on every row via after_read)
    # and as a plain Processor (invoked once via pre_process or post_process).
    class BlockProcessor < ETL::Processor::RowProcessor
      # Capture the callable configured under :block.
      def initialize(control, configuration)
        super
        @block = configuration[:block]
      end

      # Delegate to the configured block; +row+ is nil when running in the
      # pre_process/post_process role.
      def process(row=nil)
        @block.call(row)
      end
    end
  end
end
15 |
--------------------------------------------------------------------------------
/lib/etl/processor/bulk_import_processor.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Processor #:nodoc:
    # Processor which is used to bulk import data into a target database. The
    # underlying database driver from ActiveRecord must support the
    # +bulk_load+ method.
    class BulkImportProcessor < ETL::Processor::Processor

      # The file to load from
      attr_reader :file
      # The target database
      attr_reader :target
      # The table name
      attr_reader :table
      # Set to true to truncate
      attr_reader :truncate
      # Array of symbols representing the column load order
      attr_reader :columns
      # The field separator (defaults to a comma)
      attr_accessor :field_separator
      # The field enclosure (defaults to nil)
      attr_accessor :field_enclosure
      # The line separator (defaults to a newline)
      attr_accessor :line_separator
      # The string that indicates a NULL (defaults to an empty string)
      attr_accessor :null_string
      # boolean that indicates disable keys before, then enable after load (MySql only optimization)
      attr_accessor :disable_keys
      # replace existing records, not just insert
      attr_accessor :replace

      # Initialize the processor.
      #
      # Configuration options:
      # * :file: The file to load data from (relative paths are resolved
      #   against the directory of the control file)
      # * :target: The target database
      # * :table: The table name
      # * :truncate: Set to true to truncate before loading
      # * :columns: The columns to load in the order they appear in
      # the bulk data file
      # * :field_separator: The field separator. Defaults to a comma
      # * :line_separator: The line separator. Defaults to a newline
      # * :field_enclosure: The field enclosure charcaters
      # * :disable_keys: Set to true to disable keys before, then enable after load (MySql only optimization)
      #
      # Raises ControlError unless both :target and :table are given.
      def initialize(control, configuration)
        super
        @target = configuration[:target]
        path = Pathname.new(configuration[:file])
        @file = path.absolute? ? path : Pathname.new(File.dirname(File.expand_path(control.file))) + path

        @table = configuration[:table]
        # NOTE: ||= writes the default back into the configuration Hash.
        @truncate = configuration[:truncate] ||= false
        @columns = configuration[:columns]
        @field_separator = (configuration[:field_separator] || ',')
        @line_separator = (configuration[:line_separator] || "\n")
        @null_string = (configuration[:null_string] || "")
        @field_enclosure = configuration[:field_enclosure]
        @disable_keys = configuration[:disable_keys] || false
        @replace = configuration[:replace] || false

        raise ControlError, "Target must be specified" unless @target
        raise ControlError, "Table must be specified" unless @table
      end

      # Execute the processor.
      #
      # No-op when the engine is set to skip bulk imports or when the data
      # file is empty. Optional truncate plus the load run inside a single
      # transaction on the target connection.
      def process
        return if ETL::Engine.skip_bulk_import
        return if File.size(file) == 0

        conn = ETL::Engine.connection(target)
        conn.transaction do
          conn.truncate(table_name) if truncate
          options = {}
          options[:columns] = columns

          options[:disable_keys] = true if disable_keys
          options[:replace] = true if replace

          # Field/line formatting options handed to the adapter's bulk_load.
          # (All four have non-nil defaults, so this branch is always taken.)
          if field_separator || field_enclosure || line_separator || null_string
            options[:fields] = {}
            options[:fields][:null_string] = null_string if null_string
            options[:fields][:delimited_by] = field_separator if field_separator
            options[:fields][:enclosed_by] = field_enclosure if field_enclosure
            options[:fields][:terminated_by] = line_separator if line_separator
          end
          conn.bulk_load(file, table_name, options)
        end
      end

      # Resolve the table name on the target connection.
      def table_name
        ETL::Engine.table(table, ETL::Engine.connection(target))
      end
    end
  end
end
95 |
--------------------------------------------------------------------------------
/lib/etl/processor/check_exist_processor.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Processor #:nodoc:
    # A row-level processor that checks if the row already exists in the
    # target table
    class CheckExistProcessor < ETL::Processor::RowProcessor
      # A symbol or array of symbols representing keys that should be skipped
      attr_accessor :skip

      # The target database
      attr_accessor :target

      # The name of the table to check against
      attr_accessor :table

      # An array of columns representing the natural key
      attr_accessor :columns

      # Is set to true if the processor should execute the check. If there are
      # no rows in the target table then this should return false.
      attr_accessor :should_check

      # Initialize the processor
      # Configuration options:
      # * :columns: An array of symbols for columns that should be included in the query conditions. If this option is not specified then all of the columns in the row will be included in the conditions (unless :skip is specified).
      # * :skip: A symbol or array of symbols that should not be included in the existence check. If this option is not specified then all of the columns will be included in the existence check (unless :columns is specified).
      # * :target: The target connection (required)
      # * :table: The table name (required)
      def initialize(control, configuration)
        super
        @skip = configuration[:skip] || []
        @target = configuration[:target] || raise(ETL::ControlError, "target must be specified")
        @table = configuration[:table] || raise(ETL::ControlError, "table must be specified")
        @columns = configuration[:columns]

        # Probe the target table once at construction: if it is empty there
        # is nothing to collide with and per-row checks are skipped.
        q = "SELECT COUNT(*) FROM #{table_name}"
        @should_check = ETL::Engine.connection(target).select_value(q).to_i > 0
      end

      # Return true if the given key should be skipped
      def skip?(key)
        case skip
        when Array
          skip.include?(key)
        else
          skip.to_sym == key.to_sym
        end
      end

      # Return true if the row should be checked
      def should_check?
        @should_check ? true : false
      end

      # Process the row.
      # Returns the row when no matching record exists in the target table;
      # returns nil (filtering the row out) when a match is found.
      def process(row)
        return row unless should_check?
        conn = ETL::Engine.connection(target)
        q = "SELECT * FROM #{table_name} WHERE "
        conditions = []
        ensure_columns_available_in_row!(row, columns, 'for existence check')
        # One equality condition per non-skipped key column. Values are
        # quoted through the connection; column names come from the row keys.
        row.each do |k,v|
          if columns.nil? || columns.include?(k.to_sym)
            conditions << "#{k} = #{conn.quote(v)}" unless skip?(k.to_sym)
          end
        end
        q << conditions.join(" AND ")
        q << " LIMIT 1"

        result = conn.select_one(q)
        return row if result.nil?
      end

      private

      # Resolve the table name on the target connection.
      def table_name
        ETL::Engine.table(table, ETL::Engine.connection(target))
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/processor/check_unique_processor.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row processor that drops rows whose combination of key-field values
    # has already been seen during this run.
    class CheckUniqueProcessor < ETL::Processor::RowProcessor

      # The keys to check
      attr_accessor :keys

      # Configuration options:
      # * :keys: An array of keys whose combined values identify a row
      def initialize(control, configuration)
        super
        @keys = configuration[:keys]
      end

      # Hash recording every key combination processed so far.
      def compound_key_constraints
        @compound_key_constraints ||= {}
      end

      # Return the row the first time its key combination is seen and nil on
      # every subsequent occurrence.
      #
      # An error is raised if the row lacks any of the keys.
      def process(row)
        ensure_columns_available_in_row!(row, keys, 'for unicity check')

        composite = keys.map { |k| row[k] }.join('|')
        if compound_key_constraints[composite]
          nil
        else
          compound_key_constraints[composite] = 1
          row
        end
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/processor/copy_field_processor.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row processor that will copy one field to another
    #
    # Configuration options:
    # * :destination: The destination field
    # * :dest: Alias for :destination
    # * :source: The source field
    class CopyFieldProcessor < ETL::Processor::RowProcessor
      # Copy the source field's value into the destination field and return
      # the row. Mutable values are duplicated so later edits to one field
      # don't bleed into the other; numbers and nil are assigned as-is.
      def process(row)
        target_field = configuration[:destination] || configuration[:dest]
        value = row[configuration[:source]]
        row[target_field] =
          case value
          when Numeric, nil then value
          else value.dup
          end
        row
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/processor/database_join_processor.rb:
--------------------------------------------------------------------------------
module ETL
  module Processor
    # Row processor that enriches each row with columns selected by a join
    # query executed against a target database connection.
    class DatabaseJoinProcessor < ETL::Processor::RowProcessor
      attr_reader :target
      attr_reader :query
      attr_reader :fields

      # Initialize the procesor.
      #
      # Arguments:
      # * control: The ETL::Control::Control instance
      # * configuration: The configuration Hash
      # * definition: The source definition
      #
      # Required configuration options:
      # * :target: The target connection
      # * :query: The join query
      # * :fields: The fields to add to the row
      def initialize(control, configuration)
        super
        @target = configuration[:target]
        @query = configuration[:query]
        @fields = configuration[:fields]
        raise ControlError, ":target must be specified" unless @target
        raise ControlError, ":query must be specified" unless @query
        raise ControlError, ":fields must be specified" unless @fields
      end

      # Get a String identifier for the source
      def to_s
        "#{host}/#{database}"
      end

      # Execute the join query and copy each configured field from the
      # result into the row. Returns nil for a nil row.
      def process(row)
        return nil if row.nil?

        # NOTE(review): eval is used to re-interpolate #{...} expressions
        # embedded in the configured query string (typically referencing
        # +row+). This executes arbitrary Ruby from the control file, so the
        # query must never contain untrusted input; the bare rescue silently
        # falls back to the raw query string on any interpolation error.
        q = @query
        begin
          q = eval('"' + @query + '"')
        rescue
        end

        ETL::Engine.logger.debug("Executing select: #{q}")
        res = connection.execute(q)

        # TODO - refactor this and move it (and similar code around) to adapter_extensions
        # Result iteration differs per adapter, hence the class-name dispatch.
        case connection.class.name
        when "ActiveRecord::ConnectionAdapters::PostgreSQLAdapter";
          res.each do |r|
            @fields.each do |field|
              row[field.to_sym] = r[field.to_s]
            end
          end
        when "ActiveRecord::ConnectionAdapters::Mysql2Adapter";
          res.each(:as => :hash) do |r|
            @fields.each do |field|
              row[field.to_sym] = r[field.to_s]
            end
          end
        when "ActiveRecord::ConnectionAdapters::MysqlAdapter";
          res.each_hash do |r|
            @fields.each do |field|
              row[field.to_sym] = r[field.to_s]
            end
          end
          # The legacy mysql adapter requires the result set to be freed.
          res.free
        else raise "Unsupported adapter #{connection.class} for this destination"
        end

        return row
      end

      private
      # Get the database connection to use
      def connection
        ETL::Engine.connection(target)
      end

      # Get the host, defaults to 'localhost'
      def host
        ETL::Base.configurations[target.to_s]['host'] || 'localhost'
      end

      # Get the configured database name for the target connection.
      def database
        ETL::Base.configurations[target.to_s]['database']
      end
    end
  end
end
90 |
--------------------------------------------------------------------------------
/lib/etl/processor/encode_processor.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Processor #:nodoc:
    # The encode processor uses Iconv to convert a file from one encoding (eg: utf-8) to another (eg: latin1), line by line.
    #
    # NOTE(review): Iconv was deprecated in Ruby 1.9 and removed from the
    # standard library in Ruby 2.0; on modern Rubies this processor needs the
    # iconv gem or a migration to String#encode — confirm the supported Ruby
    # versions before relying on it.
    class EncodeProcessor < ETL::Processor::Processor

      # The file to load from
      attr_reader :source_file
      # The file to write to
      attr_reader :target_file
      # The source file encoding
      attr_reader :source_encoding
      # The target file encoding
      attr_reader :target_encoding

      # Initialize the processor.
      #
      # Configuration options:
      # * :source_file: The file to load data from (resolved relative to the control file directory)
      # * :source_encoding: The source file encoding (eg: 'latin1','utf-8'), as supported by Iconv
      # * :target_file: The file to write data to (resolved relative to the control file directory)
      # * :target_encoding: The target file encoding
      #
      # Raises ControlError when either file is missing, when both point to
      # the same path, or when an encoding is not supported by Iconv.
      def initialize(control, configuration)
        super
        raise ControlError, "Source file must be specified" if configuration[:source_file].nil?
        raise ControlError, "Target file must be specified" if configuration[:target_file].nil?
        @source_file = File.join(File.dirname(control.file), configuration[:source_file])
        @source_encoding = configuration[:source_encoding]
        @target_file = File.join(File.dirname(control.file), configuration[:target_file])
        @target_encoding = configuration[:target_encoding]
        raise ControlError, "Source and target file cannot currently point to the same file" if source_file == target_file
        begin
          @iconv = Iconv.new(target_encoding,source_encoding)
        rescue Iconv::InvalidEncoding
          raise ControlError, "Either the source encoding '#{source_encoding}' or the target encoding '#{target_encoding}' is not supported"
        end
      end

      # Execute the processor
      def process
        # operate line by line to handle large files without loading them in-memory
        # could be replaced by a system iconv call when available, for greater performance
        File.open(source_file) do |source|
          #puts "Opening #{target_file}"
          File.open(target_file,'w') do |target|
            source.each_line do |line|
              target << @iconv.iconv(line)
            end
          end
        end
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/processor/ensure_fields_presence_processor.rb:
--------------------------------------------------------------------------------
module ETL
  module Processor
    # Ensure that each specified field is available
    class EnsureFieldsPresenceProcessor < ETL::Processor::RowProcessor

      # Initialize the processor.
      #
      # Configuration options:
      # * :fields: An array of keys whose presence should be verified in each row
      #
      # Raises a ControlError when :fields is not provided.
      def initialize(control, configuration)
        super
        @fields = configuration[:fields]
        raise ControlError, ":fields must be specified" unless @fields
      end

      # Verify the row contains every required field (keys are compared as
      # strings, so symbol and string keys are treated alike) and return it
      # unchanged. Raises ETL::ControlError listing the missing fields.
      def process(row)
        # use the validated @fields captured at initialization instead of
        # re-reading the configuration hash on every row (consistency fix)
        missing_fields = @fields.map(&:to_s) - row.keys.map(&:to_s)
        raise(ETL::ControlError,
          "Row missing required field(s) #{missing_fields.join(',')} in row. Available fields are : #{row.keys.join(',')}") unless missing_fields.empty?
        row
      end
    end
  end
end
25 |
--------------------------------------------------------------------------------
/lib/etl/processor/escape_csv_processor.rb:
--------------------------------------------------------------------------------
1 | require 'fileutils'
2 |
module ETL #:nodoc:
  module Processor #:nodoc:
    # Rewrites a CSV file line by line, applying regexp replacement filters
    # and optional per-line character-count sanity checks (lines failing a
    # count check are dropped).
    class EscapeCsvProcessor < ETL::Processor::Processor

      # The file to load from
      attr_reader :source_file
      # The file to write to
      attr_reader :target_file
      # whether to use a temporary file or not
      attr_reader :use_temp_file

      # Replacement rules: array of {:replace => pattern, :result => text}
      attr_reader :filters
      # Count checks: array of {:char => character, :count => expected}
      attr_reader :charcount

      # Initialize the processor.
      #
      # Configuration options:
      # * :source_file: The file to load data from
      # * :target_file: The file to write data to
      # * :file: short-cut which will set the same value to both source_file and target_file
      # * :filters: replacement rules (defaults to escaping \" as "")
      # * :charcount: drop lines whose character counts don't match
      def initialize(control, configuration)
        super
        if configuration[:file]
          @use_temp_file = true
          configuration[:source_file] = configuration[:file]
          configuration[:target_file] = configuration[:file] + '.tmp'
        end
        # validate presence BEFORE building Pathname objects — the original
        # checks ran afterwards, so a nil option raised TypeError from
        # Pathname.new instead of the intended ControlError
        raise ControlError, "Source file must be specified" if configuration[:source_file].nil?
        raise ControlError, "Target file must be specified" if configuration[:target_file].nil?
        path = Pathname.new(configuration[:source_file])
        @source_file = path.absolute? ? path : Pathname.new(File.dirname(File.expand_path(configuration[:source_file]))) + path
        path = Pathname.new(configuration[:target_file])
        @target_file = path.absolute? ? path : Pathname.new(File.dirname(File.expand_path(configuration[:target_file]))) + path
        @filters = configuration[:filters] || [{:replace => '\"', :result => '""'}]
        @charcount = configuration[:charcount]
        raise ControlError, "Source and target file cannot currently point to the same file" if @source_file == @target_file
      end

      # Execute the processor
      def process
        # block form guarantees both handles are closed even when a filter
        # or write raises (the original leaked them on error)
        File.open(@source_file, 'r') do |reader|
          File.open(@target_file, 'w') do |writer|
            reader.each_line do |line|
              reading = line
              @filters.each do |filter|
                next if filter[:replace].nil? || filter[:result].nil?
                reading = reading.gsub(Regexp.new(filter[:replace]), filter[:result])
              end unless @filters.nil?
              @charcount.each do |count|
                next if count[:char].nil? || count[:count].nil?
                # guard against reading already nil'd by an earlier count
                # rule (the original crashed with NoMethodError there)
                reading = nil if reading && reading.count(count[:char]) != count[:count]
              end unless @charcount.nil?
              writer.write(reading) unless reading.nil?
            end
          end
        end

        if use_temp_file
          FileUtils.rm(source_file)
          FileUtils.mv(target_file, source_file)
        end
      end
    end
  end
end
78 |
--------------------------------------------------------------------------------
/lib/etl/processor/filter_row_processor.rb:
--------------------------------------------------------------------------------
module ETL
  module Processor
    # Row processor that evaluates a configurable condition against each
    # row and runs optional Ruby snippets on the true/false outcomes.
    #
    # NOTE(review): conditions and outcomes are executed with Kernel#eval,
    # so control files supplying these options must be trusted input.
    class FilterRowProcessor < ETL::Processor::RowProcessor
      # Condition as a nested array: [operator, operand, operand]
      attr_reader :condition
      # Ruby source eval'd when the condition is true; when nil, a true
      # condition drops the row by returning []
      attr_reader :outtrue
      # Ruby source eval'd when the condition is false (optional)
      attr_reader :outfalse

      # Initialize the processor.
      #
      # Configuration options:
      # * :condition: condition array evaluated per row
      # * :outtrue: code to eval when the condition holds (optional)
      # * :outfalse: code to eval when the condition fails (optional)
      def initialize(control, configuration)
        @condition = configuration[:condition]
        @outtrue = configuration[:outtrue]
        @outfalse = configuration[:outfalse]
        super
      end

      # Process the row: nil rows pass through as nil; a true condition
      # with no :outtrue returns [] (dropping the row); otherwise the
      # applicable snippet is eval'd and the row returned.
      def process(row)
        return nil if row.nil?

        if eval_condition(row, @condition)
          return [] if @outtrue.nil?

          eval(@outtrue)
        else
          eval(@outfalse) unless @outfalse.nil?
        end

        return row
      end

      private
      # Recursively evaluate the condition array: cond[0] is the operator,
      # cond[1]/cond[2] the operands; nested arrays are evaluated first.
      # Any evaluation error is swallowed and treated as false.
      def eval_condition(row, cond)

        first = cond[1]
        if (cond[1].class == Array)
          first = eval_condition(row, cond[1])
        end

        second = cond[2]
        if (cond[2].class == Array)
          second = eval_condition(row, cond[2])
        end

        # unary prefix form, eg: "!<operand>"
        return eval("#{cond[0]}#{first}#{second}") if cond[0] == "!"

        # binary infix form, eg: "<first> == <second>"
        eval("#{first}#{cond[0]}#{second}")
      rescue => e
        return false
      end

    end
  end
end
52 |
--------------------------------------------------------------------------------
/lib/etl/processor/ftp_downloader_processor.rb:
--------------------------------------------------------------------------------
1 | # Written by Susan Potter under open source MIT license.
2 | # August 12, 2007.
3 |
4 | require 'net/ftp'
5 |
module ETL
  module Processor
    # Custom processor to download files via FTP
    class FtpDownloaderProcessor < ETL::Processor::Processor
      attr_reader :host
      attr_reader :port
      attr_reader :remote_dir
      attr_reader :files
      attr_reader :username
      attr_reader :local_dir

      # configuration options include:
      # * host - hostname or IP address of FTP server (required)
      # * port - port number for FTP server (default: 21)
      # * remote_dir - remote path on FTP server (default: /)
      # * files - list of files to download from FTP server (default: [])
      # * username - username for FTP server authentication (default: anonymous)
      # * password - password for FTP server authentication (default: nil)
      # * local_dir - local output directory to save downloaded files (default: '')
      #
      # As an example you might write something like the following in your control process file:
      # pre_process :ftp_downloader, {
      #   :host => 'ftp.sec.gov',
      #   :path => 'edgar/Feed/2007/QTR2',
      #   :files => ['20070402.nc.tar.gz', '20070403.nc.tar.gz', '20070404.nc.tar.gz',
      #     '20070405.nc.tar.gz', '20070406.nc.tar.gz'],
      #   :local_dir => '/data/sec/2007/04',
      # }
      # The above example will anonymously download via FTP the first week's worth of SEC filing feed data
      # from the second quarter of 2007 and download the files to the local directory +/data/sec/2007/04+.
      def initialize(control, configuration)
        super # store control/configuration and run the after_initialize hook
        @host = configuration[:host]
        @port = configuration[:port] || 21
        @remote_dir = configuration[:remote_dir] || '/'
        @files = configuration[:files] || []
        @username = configuration[:username] || 'anonymous'
        @password = configuration[:password]
        @local_dir = configuration[:local_dir] || ''
      end

      # Download each configured file in binary mode.
      #
      # The original used Net::FTP.open(@host), which connects to port 21
      # immediately and then connected a second time — defeating the :port
      # option. Building the session manually honors the configured port,
      # and the ensure clause still guarantees the connection is closed.
      def process
        conn = Net::FTP.new
        conn.connect(@host, @port)
        conn.login(@username, @password)
        @files.each do |f|
          conn.getbinaryfile(remote_file(f), local_file(f))
        end
      ensure
        conn.close if conn && !conn.closed?
      end

      private
      attr_accessor :password

      # Local destination path for +name+
      def local_file(name)
        File.join(@local_dir, name)
      end

      # Path of +name+ on the FTP server
      def remote_file(name)
        File.join(@remote_dir, name)
      end
    end
  end
end
69 |
--------------------------------------------------------------------------------
/lib/etl/processor/ftp_uploader_processor.rb:
--------------------------------------------------------------------------------
1 | require 'net/ftp'
2 |
module ETL
  module Processor
    # Custom processor to upload files via FTP
    class FtpUploaderProcessor < ETL::Processor::Processor
      attr_reader :host
      attr_reader :port
      attr_reader :remote_dir
      attr_reader :files
      attr_reader :username
      attr_reader :local_dir

      # configuration options include:
      # * host - hostname or IP address of FTP server (required)
      # * port - port number for FTP server (default: 21)
      # * remote_dir - remote path on FTP server (default: /)
      # * files - list of files to upload to the FTP server (default: [])
      # * username - username for FTP server authentication (default: anonymous)
      # * password - password for FTP server authentication (default: nil)
      # * local_dir - local directory the files are read from (default: '')
      #
      # As an example you might write something like the following in your control process file:
      # pre_process :ftp_uploader, {
      #   :host => 'ftp.sec.gov',
      #   :path => 'edgar/Feed/2007/QTR2',
      #   :files => ['20070402.nc.tar.gz', '20070403.nc.tar.gz', '20070404.nc.tar.gz',
      #     '20070405.nc.tar.gz', '20070406.nc.tar.gz'],
      #   :local_dir => '/data/sec/2007/04',
      # }
      def initialize(control, configuration)
        super # store control/configuration and run the after_initialize hook
        @host = configuration[:host]
        @port = configuration[:port] || 21
        @remote_dir = configuration[:remote_dir] || '/'
        @files = configuration[:files] || []
        @username = configuration[:username] || 'anonymous'
        @password = configuration[:password]
        @local_dir = configuration[:local_dir] || ''
      end

      # Upload each configured file in binary mode.
      #
      # The original used Net::FTP.open(@host), which connects to port 21
      # immediately and then connected a second time — defeating the :port
      # option. Building the session manually honors the configured port,
      # and the ensure clause still guarantees the connection is closed.
      def process
        conn = Net::FTP.new
        conn.connect(@host, @port)
        conn.login(@username, @password)
        @files.each do |f|
          conn.putbinaryfile(local_file(f), remote_file(f))
        end
      ensure
        conn.close if conn && !conn.closed?
      end

      private
      attr_accessor :password

      # Local source path for +name+
      def local_file(name)
        File.join(@local_dir, name)
      end

      # Destination path of +name+ on the FTP server
      def remote_file(name)
        File.join(@remote_dir, name)
      end
    end
  end
end
66 |
--------------------------------------------------------------------------------
/lib/etl/processor/hierarchy_exploder_processor.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row-level processor that will convert a single row into multiple rows designed to be inserted
    # into a hierarchy bridge table.
    class HierarchyExploderProcessor < ETL::Processor::RowProcessor
      attr_accessor :id_field
      attr_accessor :parent_id_field

      # Initialize the processor
      #
      # Configuration options:
      # * :connection: The ActiveRecord adapter connection
      # * :id_field: The name of the id field (defaults to 'id')
      # * :parent_id_field: The name of the parent id field (defaults to 'parent_id')
      #
      # TODO: Allow resolver to be implemented in a customizable fashion, i.e. don't rely
      # on AR as the only resolution method.
      def initialize(control, configuration={})
        @id_field = configuration[:id_field] || 'id'
        @parent_id_field = configuration[:parent_id_field] || 'parent_id'
        super
      end

      # Process the row expanding it into hierarchy values
      def process(row)
        rows = []
        target = configuration[:target]
        table = configuration[:table]
        conn = ETL::Engine.connection(target)
        # honor the configured field names — the original hard-coded
        # row[:id] / row[:parent_id], silently ignoring :id_field and
        # :parent_id_field for the seed row
        id = row[id_field.to_sym]
        parent_id = row[parent_id_field.to_sym]
        build_rows([id], id, id, parent_id.nil?, 0, rows, table, conn)
        rows
      end

      protected
      # Recursive function that will add a row for the current level and then call build_rows
      # for all of the children of the current level
      def build_rows(ids, parent_id, row_id, root, level, rows, table, conn)
        ids.each do |id|
          # NOTE(review): id is interpolated directly into SQL; this is
          # only safe while the values come from the key column itself
          child_ids = conn.select_values("SELECT #{id_field} FROM #{table} WHERE #{parent_id_field} = #{id}")

          row = {
            :parent_id => row_id,
            :child_id => id,
            :num_levels_from_parent => level,
            :is_bottom => (child_ids.empty? ? 1 : 0),
            :is_top => (root ? 1 : 0),
          }
          rows << row

          build_rows(child_ids, id, row_id, false, level + 1, rows, table, conn)
        end
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/processor/imapattachment_downloader_processor.rb:
--------------------------------------------------------------------------------
1 | optional_require 'net/imap'
2 | optional_require 'tmail'
3 |
module ETL
  module Processor
    # Custom processor to download files via Imap Attachment
    class ImapattachmentDownloaderProcessor < ETL::Processor::Processor
      attr_reader :host
      attr_reader :ssl
      attr_reader :port
      attr_reader :delete
      attr_reader :filters
      attr_reader :folder
      attr_reader :username
      attr_reader :local_dir

      # configuration options include:
      # * host - hostname or IP address of IMAP server (required)
      # * ssl - activate encryption (default false)
      # * port - port number for IMAP server (default: 220 or 993)
      # * delete - delete message after reading (default false)
      # * filters - filter mails (default [])
      # * folder - folder to select mails from (default INBOX)
      # * username - username for IMAP server authentication (default: anonymous)
      # * password - password for IMAP server authentication (default: nil)
      # * local_dir - local output directory to save downloaded files (default: '')
      #
      def initialize(control, configuration)
        super # store control/configuration and run the after_initialize hook
        @host = configuration[:host]
        @ssl = configuration[:ssl] || false
        @port = configuration[:port] || (@ssl ? 993 : 220 )
        @delete = configuration[:delete] || false
        @filters = configuration[:filters] || []
        @folder = configuration[:folder] || 'INBOX'
        @username = configuration[:username] || 'anonymous'
        @password = configuration[:password]
        @local_dir = configuration[:local_dir] || ''
      end

      # Fetch every non-deleted message in the folder, save the attachments
      # of mails matching the filters, and optionally flag them deleted.
      def process
        conn = Net::IMAP.new(@host, @port, @ssl)
        conn.login(@username, @password)

        conn.select(@folder)
        conn.uid_search(["NOT", "DELETED"]).each do |msguuid|
          mail = TMail::Mail.parse( conn.uid_fetch(msguuid, 'RFC822').first.attr['RFC822'] )
          next if mail.attachments.blank?
          if applyfilter(mail, @filters)
            mail.attachments.each do |attachment|
              filename = attachment.original_filename
              File.open(local_file(filename), "w") {|f|
                f << attachment.gets(nil)
              }
            end

            conn.store(msguuid, "+FLAGS", [:Deleted]) if @delete
          end
        end
        conn.expunge
        conn.close
      end

      private
      attr_accessor :password

      # Local destination path for +name+
      def local_file(name)
        File.join(@local_dir, name)
      end

      # Evaluate the filter condition [operator, operand, operand] against
      # the mail; nested arrays recurse. Errors are swallowed and count as
      # a non-match.
      #
      # NOTE(review): conditions are executed with Kernel#eval; filters in
      # control files must be trusted input.
      def applyfilter(mail, cond)
        return true if (cond.nil? or cond.size < 3)

        first = cond[1]
        if (cond[1].class == Array)
          # recurse via applyfilter — the original called an undefined
          # eval_condition(row, ...), so every nested condition raised and
          # was silently rescued into false
          first = applyfilter(mail, cond[1])
        end

        second = cond[2]
        if (cond[2].class == Array)
          second = applyfilter(mail, cond[2])
        end

        return eval("#{cond[0]}#{first}#{second}") if cond[0] == "!"

        eval("#{first}#{cond[0]}#{second}")
      rescue => e
        return false
      end
    end
  end
end
92 |
--------------------------------------------------------------------------------
/lib/etl/processor/pop3attachment_downloader_processor.rb:
--------------------------------------------------------------------------------
1 | optional_require 'net/pop'
2 | optional_require 'tmail'
3 |
module ETL
  module Processor
    # Custom processor to download files via Pop3 Attachment
    class Pop3attachmentDownloaderProcessor < ETL::Processor::Processor
      attr_reader :host
      attr_reader :ssl
      attr_reader :port
      attr_reader :delete
      attr_reader :filters
      attr_reader :username
      attr_reader :local_dir

      # configuration options include:
      # * host - hostname or IP address of POP3 server (required)
      # * ssl - activate encryption (default false)
      # * port - port number for POP3 server (default: Net::POP3.default_port or Net::POP3.default_pop3s_port)
      # * delete - delete message after reading (default false)
      # * filters - filter mails (default [])
      # * username - username for POP3 server authentication (default: anonymous)
      # * password - password for POP3 server authentication (default: nil)
      # * local_dir - local output directory to save downloaded files (default: '')
      #
      def initialize(control, configuration)
        super # store control/configuration and run the after_initialize hook
        @host = configuration[:host]
        @ssl = configuration[:ssl] || false
        @port = configuration[:port] || (@ssl ? Net::POP3.default_pop3s_port : Net::POP3.default_port )
        @delete = configuration[:delete] || false
        @filters = configuration[:filters] || []
        @username = configuration[:username] || 'anonymous'
        @password = configuration[:password]
        @local_dir = configuration[:local_dir] || ''
      end

      # Pop every message, save the attachments of mails matching the
      # filters, and optionally delete the processed messages.
      def process
        Net::POP3.enable_ssl(OpenSSL::SSL::VERIFY_NONE) if @ssl
        conn = Net::POP3.new(@host, @port)
        conn.start(@username, @password)
        if !conn.mails.empty?
          conn.each_mail do |message|
            stringmail = message.pop
            mail = TMail::Mail.parse(stringmail)
            next if mail.attachments.blank?
            if applyfilter(mail, @filters)
              mail.attachments.each do |attachment|
                filename = attachment.original_filename
                File.open(local_file(filename), "w") {|f|
                  f << attachment.gets(nil)
                }
              end

              message.delete if @delete
            end
          end
        end

        conn.finish
      end

      private
      attr_accessor :password

      # Local destination path for +name+
      def local_file(name)
        File.join(@local_dir, name)
      end

      # Evaluate the filter condition [operator, operand, operand] against
      # the mail; nested arrays recurse. Errors are swallowed and count as
      # a non-match.
      #
      # NOTE(review): conditions are executed with Kernel#eval; filters in
      # control files must be trusted input.
      def applyfilter(mail, cond)
        return true if (cond.nil? or cond.size < 3)

        first = cond[1]
        if (cond[1].class == Array)
          # recurse via applyfilter — the original called an undefined
          # eval_condition(row, ...), so every nested condition raised and
          # was silently rescued into false
          first = applyfilter(mail, cond[1])
        end

        second = cond[2]
        if (cond[2].class == Array)
          second = applyfilter(mail, cond[2])
        end

        return eval("#{cond[0]}#{first}#{second}") if cond[0] == "!"

        eval("#{first}#{cond[0]}#{second}")
      rescue => e
        return false
      end
    end
  end
end
91 |
--------------------------------------------------------------------------------
/lib/etl/processor/print_row_processor.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Processor #:nodoc:
    # Debugging processor for printing the current row
    class PrintRowProcessor < ETL::Processor::RowProcessor
      # Dump the row to stdout via #inspect and pass it through untouched.
      def process(row)
        row.tap { |r| puts r.inspect }
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/processor/processor.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Processor #:nodoc:
    # Base class for pre and post processors. Subclasses must implement the +process+ method.
    class Processor
      # Capture the owning control object and configuration Hash, then
      # invoke the subclass' after_initialize hook when one is defined.
      def initialize(control, configuration)
        @control = control
        @configuration = configuration
        after_initialize if respond_to?(:after_initialize)
      end

      protected

      # The control object this processor belongs to
      attr_reader :control

      # The configuration Hash the processor was built from
      attr_reader :configuration

      # Get the engine logger
      def log
        Engine.logger
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/processor/rename_processor.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row level processor to rename a field in the row.
    #
    # Configuration options:
    # * :source: the source field name
    # * :dest: The destination field name
    class RenameProcessor < ETL::Processor::RowProcessor
      # Move the value stored under :source to :dest and drop the :source
      # key. Mutable values are duplicated so later changes to one field
      # cannot leak into the other.
      def process(row)
        value = row[configuration[:source]]
        # numbers and nil are immutable — copy them by reference
        row[configuration[:dest]] =
          case value
          when Numeric, nil then value
          else value.dup
          end
        row.delete(configuration[:source])
        row
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/processor/require_non_blank_processor.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Processor #:nodoc:
    # A processor which requires that the particular fields are non-blank in
    # order for the row to be retained.
    class RequireNonBlankProcessor < ETL::Processor::RowProcessor
      # An array of fields to check
      attr_reader :fields

      # Initialize the processor
      #
      # Options:
      # * :fields: An array of fields to check, for example:
      #   [:first_name,:last_name]
      def initialize(control, configuration)
        super
        @fields = configuration[:fields] || []
      end

      # Return the row unchanged when every checked field is present and
      # non-blank; return nil (dropping the row) otherwise.
      def process(row)
        return nil if fields.any? { |field| row[field].blank? }
        row
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/processor/row_processor.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Processor #:nodoc:
    # Processor which processes a specific row. Unlike a transformer, which deals with a specific
    # value in the row, row processors can process an entire row at once, which can be used to
    # explode a single row into multiple rows (for example)
    class RowProcessor < Processor
      # Initialize the processor
      def initialize(control, configuration)
        super
      end

      # Process the specified row. This method must return the row.
      # Subclasses are expected to override it.
      def process(row)
        # message previously referred to a non-existent "process_row" method
        raise "process is an abstract method"
      end

      # Ensure a given row keys include all the provided columns
      # and raise an error using the provided message if it doesn't
      def ensure_columns_available_in_row!(row, columns, message)
        unless columns.nil?
          columns.each do |k|
            raise(ETL::ControlError, "Row missing required field #{k.inspect} #{message}") unless row.keys.include?(k)
          end
        end
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/processor/sequence_processor.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row level processor to generate a sequence.
    #
    # Configuration options:
    # * :context: A context name, if none is specified then the context will be
    #   the current ETL run
    # * :dest: The destination field name
    class SequenceProcessor < ETL::Processor::RowProcessor
      # Store the next sequence value for the configured context into the
      # row's :dest field and return the row.
      def process(row)
        context = configuration[:context]
        sequences[context] ||= 0
        sequences[context] += 1
        row[configuration[:dest]] = sequences[context]
        row
      end

      protected

      # Lazily-built Hash mapping context name to the last issued value
      def sequences
        @sequences ||= {}
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/processor/sftp_downloader_processor.rb:
--------------------------------------------------------------------------------
1 | optional_require 'net/sftp'
2 |
module ETL
  module Processor
    # Custom processor to download files via SFTP
    class SftpDownloaderProcessor < ETL::Processor::Processor
      attr_reader :host
      attr_reader :port
      attr_reader :remote_dir
      attr_reader :files
      attr_reader :username
      attr_reader :local_dir

      # configuration options include:
      # * host - hostname or IP address of the SFTP server (required)
      # * port - port number for the SFTP server (default: 22)
      # * remote_dir - remote path on the SFTP server (default: /)
      # * files - list of files to download (default: [])
      # * username - username for authentication (default: anonymous)
      # * password - password for authentication (default: nil)
      # * local_dir - local output directory to save downloaded files (default: '')
      #
      # As an example you might write something like the following in your control process file:
      # pre_process :sftp_downloader, {
      #   :host => 'sftp.sec.gov',
      #   :path => 'edgar/Feed/2007/QTR2',
      #   :files => ['20070402.nc.tar.gz', '20070403.nc.tar.gz', '20070404.nc.tar.gz',
      #     '20070405.nc.tar.gz', '20070406.nc.tar.gz'],
      #   :local_dir => '/data/sec/2007/04',
      # }
      # The above example will anonymously download via SFTP the first week's worth of SEC filing feed data
      # from the second quarter of 2007 and download the files to the local directory +/data/sec/2007/04+.
      def initialize(control, configuration)
        # the original never called super, so the base class never stored
        # control/configuration nor ran the after_initialize hook
        super
        @host = configuration[:host]
        @port = configuration[:port] || 22
        @remote_dir = configuration[:remote_dir] || '/'
        @files = configuration[:files] || []
        @username = configuration[:username] || 'anonymous'
        @password = configuration[:password]
        @local_dir = configuration[:local_dir] || ''
      end

      # Download each configured file; the session is closed by the block form.
      def process
        Net::SFTP.start(@host, @username, {:port => @port, :password => @password}) do |conn|
          @files.each do |f|
            conn.download!(remote_file(f), local_file(f))
          end
        end
      end

      private
      attr_accessor :password

      # Local destination path for +name+
      def local_file(name)
        File.join(@local_dir, name)
      end

      # Path of +name+ on the SFTP server
      def remote_file(name)
        File.join(@remote_dir, name)
      end
    end
  end
end
64 |
--------------------------------------------------------------------------------
/lib/etl/processor/sftp_uploader_processor.rb:
--------------------------------------------------------------------------------
1 | optional_require 'net/sftp'
2 |
module ETL
  module Processor
    # Custom processor to upload files via SFTP
    class SftpUploaderProcessor < ETL::Processor::Processor
      attr_reader :host
      attr_reader :port
      attr_reader :remote_dir
      attr_reader :files
      attr_reader :username
      attr_reader :local_dir

      # configuration options include:
      # * host - hostname or IP address of the SFTP server (required)
      # * port - port number for the SFTP server (default: 22)
      # * remote_dir - remote path on the SFTP server (default: /)
      # * files - list of files to upload (default: [])
      # * username - username for authentication (default: anonymous)
      # * password - password for authentication (default: nil)
      # * local_dir - local directory the files are read from (default: '')
      #
      # As an example you might write something like the following in your control process file:
      # pre_process :sftp_uploader, {
      #   :host => 'sftp.sec.gov',
      #   :path => 'edgar/Feed/2007/QTR2',
      #   :files => ['20070402.nc.tar.gz', '20070403.nc.tar.gz', '20070404.nc.tar.gz',
      #     '20070405.nc.tar.gz', '20070406.nc.tar.gz'],
      #   :local_dir => '/data/sec/2007/04',
      # }
      def initialize(control, configuration)
        # the original never called super, so the base class never stored
        # control/configuration nor ran the after_initialize hook
        super
        @host = configuration[:host]
        @port = configuration[:port] || 22
        @remote_dir = configuration[:remote_dir] || '/'
        @files = configuration[:files] || []
        @username = configuration[:username] || 'anonymous'
        @password = configuration[:password]
        @local_dir = configuration[:local_dir] || ''
      end

      # Upload each configured file; the session is closed by the block form.
      def process
        Net::SFTP.start(@host, @username, {:port => @port, :password => @password}) do |conn|
          @files.each do |f|
            conn.upload!(local_file(f), remote_file(f))
          end
        end
      end

      private
      attr_accessor :password

      # Local source path for +name+
      def local_file(name)
        File.join(@local_dir, name)
      end

      # Destination path of +name+ on the SFTP server
      def remote_file(name)
        File.join(@remote_dir, name)
      end
    end
  end
end
64 |
--------------------------------------------------------------------------------
/lib/etl/processor/surrogate_key_processor.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Processor #:nodoc:
    # A row level processor that provides surrogate keys
    class SurrogateKeyProcessor < ETL::Processor::RowProcessor
      attr_accessor :destination
      attr_accessor :table
      attr_accessor :column
      attr_accessor :target

      # Initialize the surrogate key generator
      #
      # Configuration options
      # * :table: Table holding existing keys; when given, the sequence
      #   resumes after SELECT max(:column) from that table
      # * :column: The surrogate key column (defaults to 'id')
      # * :target: The target connection
      # * :destination: The destination column name (defaults to :id)
      #
      # The legacy :query option is no longer supported and raises a
      # ControlError pointing at :column/:table.
      def initialize(control, configuration)
        super
        @table = configuration[:table]
        @column = configuration[:column] || 'id'
        @target = configuration[:target]
        if configuration[:query]
          # fixed message typo: previously read "no longer value"
          raise ControlError, "Query option is no longer valid, use :column and :table instead"
        end
        if table
          @surrogate_key = ETL::Engine.connection(target).select_value("SELECT max(#{column}) FROM #{table_name}")
        end
        # start from zero when the table is empty or no table was given
        @surrogate_key = 0 if @surrogate_key.blank?
        @surrogate_key = @surrogate_key.to_i
        @destination = configuration[:destination] || :id
      end

      # Add a surrogate key to the row (nil rows pass through as nil)
      def process(row)
        if row
          @surrogate_key += 1
          row[destination] = @surrogate_key
          row
        end
      end

      private
      # Resolve the fully-qualified table name through the engine
      def table_name
        ETL::Engine.table(table, ETL::Engine.connection(target))
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/processor/truncate_processor.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Processor #:nodoc:
    # A processor which will truncate a table. Use as a pre-processor for cleaning out a table
    # prior to loading
    class TruncateProcessor < ETL::Processor::Processor
      # Defines the table to truncate
      attr_reader :table

      # Defines the database connection to use
      attr_reader :target

      # Initialize the processor
      #
      # Options:
      # * :target: The target connection
      # * :table: The table name
      # * :options: Optional truncate options
      def initialize(control, configuration)
        super
        @target  = configuration[:target] || {}
        @table   = configuration[:table]
        @options = configuration[:options]
      end

      # Truncate the configured table, defaulting the options to
      # 'RESTART IDENTITY' on PostgreSQL when none were supplied.
      def process
        connection = ETL::Engine.connection(target)
        if !@options && connection.class.name =~ /postgres/i
          @options = 'RESTART IDENTITY'
        end
        connection.truncate(table_name, @options)
      end

      private
      # Resolve the fully-qualified table name through the engine
      def table_name
        ETL::Engine.table(table, ETL::Engine.connection(target))
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/processor/zip_file_processor.rb:
--------------------------------------------------------------------------------
optional_require 'zip/zip'

module ETL
  module Processor
    # Custom processor to zip files
    class ZipFileProcessor < ETL::Processor::Processor
      # Pathname of the file to be zipped
      attr_reader :infile
      # Name of the zip archive to create
      attr_reader :destination

      # configuration options include:
      # * infile - File to zip (required)
      # * destination - Zip file name (default: #{infile}.zip)
      #
      # NOTE(review): unlike the other processors, this initializer does not
      # call super, so @control/@configuration are never stored — confirm
      # whether that is intentional.
      def initialize(control, configuration)
        path = Pathname.new(configuration[:infile])
        # NOTE(review): for a relative path this resolves against the
        # expanded infile's own directory, not the control file's directory
        # — verify this is the intended base.
        @infile = path.absolute? ? path : Pathname.new(File.dirname(File.expand_path(configuration[:infile]))) + path
        @destination = configuration[:destination] || "#{infile}.zip"
      end

      # Create (or open) the destination archive and add the input file to it.
      def process
        Zip::ZipFile.open(@destination, Zip::ZipFile::CREATE) do |zipfile|
          zipfile.add(@infile.basename, @infile)
        end
      end

    end
  end
end
28 |
--------------------------------------------------------------------------------
/lib/etl/row.rb:
--------------------------------------------------------------------------------
# This source file contains the ETL::Row class.

module ETL #:nodoc:
  # A single row passing through the ETL pipeline. A Row is a Hash of
  # field name => value with pipeline-specific accessors layered on top.
  class Row < Hash
    # Accessor for the originating source
    attr_accessor :source

    # All change types (frozen to prevent accidental mutation)
    CHANGE_TYPES = [:insert, :update, :delete].freeze

    # Writer for the row's change type; the reader below supplies the
    # default. (Previously attr_accessor generated a reader that was
    # immediately shadowed by the custom one.)
    attr_writer :change_type

    # Get the change type, defaults to :insert
    def change_type
      @change_type ||= :insert
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/screen.rb:
--------------------------------------------------------------------------------
# This source file contains the ETL::Screen module and requires all of the
# screens

module ETL #:nodoc:
  # The ETL::Screen module contains pre-built screens useful for checking the
  # ETL state during execution. Screens may be fatal, which will result in
  # termination of the ETL process, errors, which will result in the
  # termination of just the current ETL control file, or warnings, which will
  # result in a warning message.
  module Screen
  end
end

# Load every screen implementation that ships alongside this file.
Dir[File.dirname(__FILE__) + "/screen/*.rb"].each { |file| require(file) }
--------------------------------------------------------------------------------
/lib/etl/screen/row_count_screen.rb:
--------------------------------------------------------------------------------
module ETL
  module Screen
    # Screen that validates the number of rows written against an expected
    # row count (the :rows configuration option). A mismatch fails the
    # screen by raising.
    class RowCountScreen
      attr_accessor :control, :configuration

      # Store the control and configuration, then run the check immediately.
      def initialize(control, configuration={})
        @control = control
        @configuration = configuration
        execute
      end

      # Raise unless the engine's written-row count equals the expected count.
      def execute
        return if Engine.rows_written == configuration[:rows]
        raise "Rows written (#{Engine.rows_written}) does not match expected rows (#{configuration[:rows]})"
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/transform.rb:
--------------------------------------------------------------------------------
1 | require 'etl/transform/transform'
2 | Dir[File.dirname(__FILE__) + "/transform/*.rb"].each { |file| require(file) }
--------------------------------------------------------------------------------
/lib/etl/transform/block_transform.rb:
--------------------------------------------------------------------------------
module ETL
  module Transform
    # Transform that delegates each value to a user-supplied block
    # (the :block configuration option).
    class BlockTransform < ETL::Transform::Transform
      # Capture the block to invoke for every transformed value.
      def initialize(control, name, configuration)
        super
        @block = configuration[:block]
      end

      # Invoke the configured block with the field name, value and row,
      # returning whatever the block returns.
      def transform(name, value, row)
        @block.call(name, value, row)
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/transform/calculation_transform.rb:
--------------------------------------------------------------------------------
module ETL
  module Transform
    # Applies a small set of named "functions" to row fields. The
    # :function option selects the operation ("A + B", "date A", "trim A",
    # "lower A", "upper A", "encoding A") and :fields lists the inputs.
    # The result is stored in row[name] and returned.
    class CalculationTransform < ETL::Transform::Transform
      # The function name selected by the control file
      attr_reader :function
      # The input field names (or literal strings for "A + B")
      attr_reader :fields

      # Configuration options:
      # * :function: operation name (see class comment)
      # * :fields: array of field names / literals used as inputs
      def initialize(control, name, configuration)
        @function = configuration[:function]
        @fields = configuration[:fields]
        super
      end

      # Apply the configured function to the row. Returns nil when the row
      # or its first input field is missing.
      def transform(name, value, row)
        return nil if row.nil?
        return nil if row[@fields[0]].nil?

        # "A + B": string-concatenate every entry of @fields. An entry that
        # equals its own to_s is treated as a literal; anything else (e.g.
        # a symbol) is looked up in the row.
        if (@function.eql? "A + B")
          result = ""
          @fields.each do |field|
            next if field.nil?

            string = ""
            if field.to_s.eql? field
              string = field
              begin
                # SECURITY(review): eval expands escape sequences inside
                # literal entries, but executes arbitrary Ruby if the
                # control file is untrusted. Failed evals keep the raw
                # literal.
                string = eval('"' + field + '"')
              rescue
              end
            else
              string = row[field]
            end
            next if string.nil?

            result = result + string
          end

          row[name] = result
        end

        # "date A": parse the first input field into a Time.
        if (@function.eql? "date A")
          first = row[@fields[0]]
          row[name] = Time.parse(first)
        end

        # "trim A": strip leading/trailing whitespace.
        if (@function.eql? "trim A")
          first = row[@fields[0]]
          row[name] = first.strip
        end

        # "lower A": downcase the first input field.
        if (@function.eql? "lower A")
          first = row[@fields[0]]
          row[name] = first.downcase
        end

        # "upper A": upcase the first input field.
        if (@function.eql? "upper A")
          first = row[@fields[0]]
          row[name] = first.upcase
        end

        # "encoding A": convert between the encodings given in fields[2]
        # -> fields[1]. NOTE(review): Iconv was removed in Ruby 2.0, so
        # this branch only works on 1.8/1.9. The appended space works
        # around a Ruby 1.8 invalid-UTF-8 issue (see linked article).
        if (@function.eql? "encoding A")
          # Bug from ruby 1.8 http://po-ru.com/diary/fixing-invalid-utf-8-in-ruby-revisited/
          first = row[@fields[0]]
          row[name] = Iconv.conv(@fields[1], @fields[2], first + ' ')[0..-2]
        end

        row[name]
      end

    end
  end
end
72 |
--------------------------------------------------------------------------------
/lib/etl/transform/date_to_string_transform.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Transform #:nodoc:
    # Formats Date/Time-like values as strings via strftime.
    class DateToStringTransform < ETL::Transform::Transform
      # Configuration options:
      # * :format: strftime format string (defaults to "%Y-%m-%d")
      def initialize(control, name, configuration={})
        super
        @format = configuration[:format] || "%Y-%m-%d"
      end

      # Format the value with strftime; anything that does not respond to
      # strftime passes through unchanged.
      def transform(name, value, row)
        value.respond_to?(:strftime) ? value.strftime(@format) : value
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/transform/decode_transform.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which decodes coded values using a delimited lookup file.
    class DecodeTransform < ETL::Transform::Transform
      # Path to the decode table file
      attr_accessor :decode_table_path

      # Delimiter separating code from value in the decode table
      attr_accessor :decode_table_delimiter

      # Value returned when a code is not found in the table
      attr_accessor :default_value

      # Initialize the transformer
      #
      # Configuration options:
      # * :decode_table_path: The path to the decode table (defaults to 'decode.txt'),
      #   resolved relative to the control file's directory when given
      # * :decode_table_delimiter: The decode table delimiter (defaults to ':')
      # * :default_value: The default value to use (defaults to 'No Value')
      def initialize(control, name, configuration={})
        super

        if configuration[:decode_table_path]
          configuration[:decode_table_path] = File.join(File.dirname(control.file), configuration[:decode_table_path])
        end

        @decode_table_path = (configuration[:decode_table_path] || 'decode.txt')
        @decode_table_delimiter = (configuration[:decode_table_delimiter] || ':')
        @default_value = (configuration[:default_value] || 'No Value')
      end

      # Transform the value, falling back to the default for unknown codes.
      def transform(name, value, row)
        decode_table[value] || default_value
      end

      # Lazily load and memoize the decode table. A line whose code part is
      # empty overrides the default value instead of adding an entry.
      def decode_table
        unless @decode_table
          @decode_table = {}
          # File.foreach closes the handle automatically; the previous
          # Kernel#open call leaked it and would execute a command for
          # paths starting with "|".
          File.foreach(decode_table_path) do |line|
            code, value = line.strip.split(decode_table_delimiter)
            if code && code.length > 0
              @decode_table[code] = value
            else
              @default_value = value
            end
          end
        end
        @decode_table
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/transform/default_transform.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Transform #:nodoc:
    # Replaces blank (nil or empty) values with a configured default.
    class DefaultTransform < Transform
      attr_accessor :default_value

      # Configuration options:
      # * :default_value: value substituted when the incoming value is blank
      def initialize(control, name, configuration)
        super
        @default_value = configuration[:default_value]
      end

      # Return the default when the value is blank, otherwise the value itself.
      def transform(name, value, row)
        if value.blank?
          default_value
        else
          value
        end
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/transform/hierarchy_lookup_transform.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which walks up the hierarchy tree to find a value of the current level's value
    # is nil.
    #
    # TODO: Let the resolver be implemented in a class so different resolution methods are
    # possible.
    class HierarchyLookupTransform < ETL::Transform::Transform
      # The name of the field to use for the parent ID
      attr_accessor :parent_id_field

      # The target connection name
      attr_accessor :target

      # Initialize the transform
      #
      # Configuration options:
      # * :target: The target connection name (required)
      # * :parent_id_field: The name of the field to use for the parent ID (defaults to :parent_id)
      def initialize(control, name, configuration={})
        super
        @parent_id_field = configuration[:parent_id_field] || :parent_id
        @target = configuration[:target]
      end

      # Transform the value. Starting from the row's parent, repeatedly look
      # up (parent_id, value) pairs until a non-nil value is found or the
      # root of the hierarchy (nil parent) is reached.
      def transform(name, value, row)
        if parent_id = row[parent_id_field]
          # TODO: should use more than just the first source out of the control
          # First step up the tree; the loop below continues the walk when
          # this lookup still returns a nil value.
          parent_id, value = lookup(name,
            control.sources.first.configuration[:table], parent_id, parent_id_field)
          until value || parent_id.nil?
            # TODO: should use more than just the first source out of the control
            parent_id, value = lookup(name,
              control.sources.first.configuration[:table], parent_id, parent_id_field)
          end
        end
        value
      end

      # Lookup the parent value.
      # SECURITY(review): field/table/parent_id are interpolated directly
      # into SQL — safe only because they come from the control file, not
      # from row data; confirm parent_id cannot carry untrusted input.
      def lookup(field, table, parent_id, parent_id_field)
        q = "SELECT #{parent_id_field}, #{field} FROM #{table} WHERE id = #{parent_id}"
        row = ETL::Engine.connection(target).select_one(q)
        return row[parent_id_field.to_s], row[field.to_s]
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/transform/md5_transform.rb:
--------------------------------------------------------------------------------
require 'digest/md5'

module ETL #:nodoc:
  module Transform #:nodoc:
    # Replaces a value with its MD5 digest.
    class Md5Transform < ETL::Transform::Transform
      # Return the 32-character hexadecimal MD5 digest of the value.
      def transform(name, value, row)
        Digest::MD5.hexdigest(value)
      end
    end
  end
end
14 |
--------------------------------------------------------------------------------
/lib/etl/transform/ordinalize_transform.rb:
--------------------------------------------------------------------------------
require 'active_support/core_ext/integer/inflections.rb'

module ETL #:nodoc:
  module Transform #:nodoc:
    # Converts a number into its ordinal form ("1st", "2nd", ...) using
    # ActiveSupport's Integer#ordinalize core extension.
    class OrdinalizeTransform < ETL::Transform::Transform
      # Return the ordinalized form of the numeric value.
      def transform(name, value, row)
        value.ordinalize
      end
    end
  end
end
15 |
--------------------------------------------------------------------------------
/lib/etl/transform/sha1_transform.rb:
--------------------------------------------------------------------------------
require 'digest/sha1'

module ETL #:nodoc:
  module Transform #:nodoc:
    # Replaces a value with its SHA-1 digest.
    class Sha1Transform < ETL::Transform::Transform
      # Return the 40-character hexadecimal SHA-1 digest of the value.
      def transform(name, value, row)
        Digest::SHA1.hexdigest(value)
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/transform/split_fields_transform.rb:
--------------------------------------------------------------------------------
module ETL
  module Transform
    # Splits a delimited field into several new fields on the same row.
    class SplitFieldsTransform < ETL::Transform::Transform
      # The delimiter used to split the source field
      attr_reader :delimiter
      # The names of the fields that receive the split parts
      attr_reader :new_fields

      # Configuration options:
      # * :delimiter: split delimiter (defaults to ',')
      # * :new_fields: names for the resulting fields, in order
      def initialize(control, name, configuration)
        @delimiter = configuration[:delimiter] || ','
        @new_fields = configuration[:new_fields]
        super
      end

      # Split row[name] on the delimiter and assign each part to the
      # corresponding new field (missing parts become nil). Returns the
      # original unsplit value, or nil when the row or field is absent.
      def transform(name, value, row)
        return nil if row.nil? || row[name].nil?

        parts = row[name].split(@delimiter)
        @new_fields.each_with_index do |target_field, position|
          row[target_field] = parts[position]
        end

        row[name]
      end

    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/transform/string_to_date_time_transform.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Transform #:nodoc:
    # Parses a String representation of a timestamp into a DateTime.
    class StringToDateTimeTransform < ETL::Transform::Transform
      # Parse the value with DateTime.parse; nil passes through as nil.
      #
      # WARNING: DateTime.parse is slow in Ruby, but it handles timestamps
      # outside the range supported by Time.parse.
      def transform(name, value, row)
        return nil if value.nil?
        DateTime.parse(value)
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/transform/string_to_date_transform.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Transform #:nodoc:
    # Parses a String representation of a date into a Date.
    class StringToDateTransform < ETL::Transform::Transform
      # Parse the value with Date.parse; nil and unparseable strings are
      # returned unchanged.
      def transform(name, value, row)
        return value if value.nil?
        Date.parse(value)
      rescue
        value
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/transform/string_to_time_transform.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Transform #:nodoc:
    # Parses a String representation of a time into a Time.
    class StringToTimeTransform < ETL::Transform::Transform
      # Parse the value with Time.parse; nil passes through as nil.
      def transform(name, value, row)
        return nil if value.nil?
        Time.parse(value)
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/transform/transform.rb:
--------------------------------------------------------------------------------
module ETL#:nodoc:
  module Transform#:nodoc:
    # Base class for transforms.
    #
    # A transform converts one value to another value using some sort of algorithm.
    #
    # A simple transform has two arguments, the field to transform and the name of the transform:
    #
    #   transform :ssn, :sha1
    #
    # Transforms can also be blocks:
    #
    #   transform(:ssn){ |v| v[0,24] }
    #
    # Finally, a transform can include a configuration hash:
    #
    #   transform :sex, :decode, {:decode_table_path => 'delimited_decode.txt'}
    class Transform
      class << self
        # Run +value+ through each entry of +transforms+ in order. Entries
        # may be Proc objects (called with [name, value, row]) or Transform
        # instances; anything else raises a ControlError. Per-class wall
        # time is accumulated into benchmarks.
        def transform(name, value, row, transforms)
          transforms.each do |step|
            benchmarks[step.class] ||= 0
            benchmarks[step.class] += Benchmark.realtime do
              Engine.logger.debug "Transforming field #{name} with #{step.inspect}"
              if step.is_a?(Proc)
                value = step.call([name, value, row])
              elsif step.is_a?(Transform)
                value = step.transform(name, value, row)
              else
                raise ControlError, "Unsupported transform configuration type: #{step}"
              end
            end
          end
          value
        end

        # Hash of transform class => cumulative seconds spent transforming.
        def benchmarks
          @benchmarks ||= {}
        end
      end

      attr_reader :control, :name, :configuration

      # Store the control object, the field name being transformed, and the
      # configuration hash for use by subclasses.
      def initialize(control, name, configuration={})
        @control = control
        @name = name
        @configuration = configuration
      end

      # Subclasses must override this to produce the transformed value.
      def transform(name, value, row)
        raise "transform is an abstract method"
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/transform/trim_transform.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Transform #:nodoc:
    # Strips whitespace from string values.
    class TrimTransform < ETL::Transform::Transform
      # Configuration options:
      # * :type: which side to trim — :left, :right or :both (default :both)
      def initialize(control, name, configuration={})
        super
        @type = (configuration[:type] || :both).to_sym
      end

      # Trim the value on the configured side; an unrecognized type raises.
      def transform(name, value, row)
        if @type == :left
          value.lstrip
        elsif @type == :right
          value.rstrip
        elsif @type == :both
          value.strip
        else
          raise "Trim type, if specified, must be :left, :right or :both"
        end
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/transform/type_transform.rb:
--------------------------------------------------------------------------------
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform from one type to another
    class TypeTransform < ETL::Transform::Transform
      # Initialize the transformer.
      #
      # Configuration options:
      # * :type: The type to convert to. Supported types:
      #   ** :string
      #   ** :number, :integer
      #   ** :float
      #   ** :decimal
      # * :significant: number of significant digits for :decimal (default 0,
      #   i.e. derive precision from the string)
      def initialize(control, name, configuration={})
        super
        @type = configuration[:type]
        # Use || rather than ||= so the caller's configuration hash is not
        # silently mutated as a side effect.
        @significant = configuration[:significant] || 0
      end

      # Convert the value to the configured type; an unsupported type raises.
      def transform(name, value, row)
        case @type
        when :string
          value.to_s
        when :number, :integer
          value.to_i
        when :float
          value.to_f
        when :decimal
          # BigDecimal.new was deprecated and then removed (Ruby 2.7+);
          # the Kernel#BigDecimal conversion method is the supported form.
          BigDecimal(value.to_s, @significant)
        else
          raise "Unsupported type: #{@type}"
        end
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/util.rb:
--------------------------------------------------------------------------------
module ETL
  module Util
    # Return the distance of time in words from the given from_time to the
    # specified to_time (defaults to Time.now), e.g.
    # "1 days, 2 hours, 3 minutes, 4 seconds". Units with a zero count are
    # omitted except seconds, which are always included.
    def distance_of_time_in_words(from_time, to_time=Time.now)
      from_time = from_time.to_time if from_time.respond_to?(:to_time)
      to_time = to_time.to_time if to_time.respond_to?(:to_time)
      seconds = (to_time - from_time).round
      distance_in_days = (seconds/(60*60*24)).round
      seconds = seconds % (60*60*24)
      distance_in_hours = (seconds/(60*60)).round
      seconds = seconds % (60*60)
      distance_in_minutes = (seconds/60).round
      seconds = seconds % 60
      distance_in_seconds = seconds

      s = ''
      # "days, " now matches the other units' ", " separator (it was
      # previously missing the trailing space).
      s << "#{distance_in_days} days, " if distance_in_days > 0
      s << "#{distance_in_hours} hours, " if distance_in_hours > 0
      s << "#{distance_in_minutes} minutes, " if distance_in_minutes > 0
      s << "#{distance_in_seconds} seconds"
      s
    end

    # Get the approximate distance of time in words from the given from_time
    # to the given to_time. If to_time is not specified then it is set
    # to Time.now. By default seconds are included for sub-minute
    # distances; set the include_seconds argument to false to disable them.
    def approximate_distance_of_time_in_words(from_time, to_time=Time.now, include_seconds=true)
      from_time = from_time.to_time if from_time.respond_to?(:to_time)
      to_time = to_time.to_time if to_time.respond_to?(:to_time)
      distance_in_minutes = (((to_time - from_time).abs)/60).round
      distance_in_seconds = ((to_time - from_time).abs).round

      case distance_in_minutes
      when 0..1
        return (distance_in_minutes == 0) ? 'less than a minute' : '1 minute' unless include_seconds
        case distance_in_seconds
        when 0..4   then 'less than 5 seconds'
        when 5..9   then 'less than 10 seconds'
        when 10..19 then 'less than 20 seconds'
        when 20..39 then 'half a minute'
        when 40..59 then 'less than a minute'
        else             '1 minute'
        end
      when 2..44           then "#{distance_in_minutes} minutes"
      when 45..89          then 'about 1 hour'
      when 90..1439        then "about #{(distance_in_minutes.to_f / 60.0).round} hours"
      when 1440..2879      then '1 day'
      when 2880..43199     then "#{(distance_in_minutes / 1440).round} days"
      when 43200..86399    then 'about 1 month'
      when 86400..525959   then "#{(distance_in_minutes / 43200).round} months"
      when 525960..1051919 then 'about 1 year'
      else                      "over #{(distance_in_minutes / 525960).round} years"
      end
    end
  end
end
--------------------------------------------------------------------------------
/lib/etl/version.rb:
--------------------------------------------------------------------------------
module ETL#:nodoc:
  # Gem version string for activewarehouse-etl.
  VERSION = "1.0.0"
end
4 |
--------------------------------------------------------------------------------
/test/.gitignore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/activewarehouse/activewarehouse-etl/0b0b50e140ed02081b3ed1de902f78308ed738a5/test/.gitignore
--------------------------------------------------------------------------------
/test/.ignore:
--------------------------------------------------------------------------------
1 | database.yml
2 | *.txt
--------------------------------------------------------------------------------
/test/all.ebf:
--------------------------------------------------------------------------------
# This is an ETL Batch File and defines a means for executing
# a collection of ETL scripts as a single process.

# use_temp_tables is a batch-level directive (see ETL::Batch directives);
# presumably it routes loads through temporary tables — confirm there.
use_temp_tables
run 'batched1.ctl'
run 'batched2.ctl'
--------------------------------------------------------------------------------
/test/apache_combined_log.ctl:
--------------------------------------------------------------------------------
# Reads an Apache combined-format access log with the built-in parser and
# writes the parsed rows to a flat output file.
source :in, {
  :file => 'data/apache_combined_log.txt',
  :parser => :apache_combined_log
}

destination :out, {
  :file => 'output/apache_combined_log.txt'
},
{
  # NOTE(review): empty :order — verify what field ordering the file
  # destination uses in this case.
  :order => []
}
--------------------------------------------------------------------------------
/test/batch_test.rb:
--------------------------------------------------------------------------------
require File.dirname(__FILE__) + '/test_helper'

# Tests for resolving and executing ETL batch (.ebf) files through
# ETL::Batch::Batch.resolve.
class BatchTest < Test::Unit::TestCase
  attr_reader :file, :db_yaml, :engine
  def setup
    @file = File.dirname(__FILE__) + '/all.ebf'
    @db_yaml = File.dirname(__FILE__) + '/database.yml'
    @engine = ETL::Engine.new
  end
  def teardown
    # no teardown needed
  end
  def test_etl_batch_file
    # disabled: would shell out to the etl command-line script
    #`etl #{file} -c #{db_yaml}`
  end
  def test_batch
    # resolve accepts a String path to the batch file
    assert_nothing_raised do
      batch = ETL::Batch::Batch.resolve(file, engine)
      batch.execute
    end
  end
  def test_batch_with_file
    # resolve also accepts a File object
    assert_nothing_raised do
      batch = ETL::Batch::Batch.resolve(File.new(file), engine)
      batch.execute
    end
  end
  def test_batch_with_batch_object
    # resolve passes an already-built Batch instance straight through
    assert_nothing_raised do
      batch_instance = ETL::Batch::Batch.new(File.new(file))
      batch_instance.engine = engine
      batch = ETL::Batch::Batch.resolve(batch_instance, engine)
      batch.execute
    end
  end
  def test_batch_with_object_should_fail
    # any other argument type is rejected with a RuntimeError
    assert_raise(RuntimeError) do
      batch = ETL::Batch::Batch.resolve(0, engine)
    end
  end
end
--------------------------------------------------------------------------------
/test/batch_with_error.ebf:
--------------------------------------------------------------------------------
# This is an ETL Batch File and defines a means for executing
# a collection of ETL scripts as a single process.
# The second control file raises a fatal screen, so this batch fixture is
# used to test error propagation out of batch execution.

use_temp_tables
run 'delimited_with_bulk_load.ctl'
run 'screen_test_fatal.ctl'
--------------------------------------------------------------------------------
/test/batched1.ctl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/activewarehouse/activewarehouse-etl/0b0b50e140ed02081b3ed1de902f78308ed738a5/test/batched1.ctl
--------------------------------------------------------------------------------
/test/batched2.ctl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/activewarehouse/activewarehouse-etl/0b0b50e140ed02081b3ed1de902f78308ed738a5/test/batched2.ctl
--------------------------------------------------------------------------------
/test/block_processor.ctl:
--------------------------------------------------------------------------------
# Fixture exercising after_read and before_write block processors together.
source :in, { :type => :mock, :name => :block_processed_input }

# after_read runs on each row as it is read from the source.
after_read { |row| row[:added_by_after_read] = "after-" +row[:first_name]; row }
# before_write may return an array of rows; here it emits the original row
# plus one extra row.
before_write { |row| row[:added_by_before_write] = "Row #{Engine.current_source_row}"; [row,{:new_row => 'added by post_processor'}] }

destination :out, { :type => :mock, :name => :block_processed_output }
--------------------------------------------------------------------------------
/test/block_processor_error.ctl:
--------------------------------------------------------------------------------
1 | pre_process { raise ControlError.new( "Cough!") }
--------------------------------------------------------------------------------
/test/block_processor_pre_post_process.ctl:
--------------------------------------------------------------------------------
# Fixture verifying pre_process and post_process blocks fire around the run.
source :in, { :type => :mock, :name => :another_input }
pre_process { TestWitness.call("I'm called from pre_process") }
post_process { TestWitness.call("I'm called from post_process") }
destination :out, { :type => :mock, :name => :another_output }
--------------------------------------------------------------------------------
/test/block_processor_remove_rows.ctl:
--------------------------------------------------------------------------------
# Fixture verifying that returning nil from before_write drops the row.
source :in, { :type => :mock, :name => :block_input }

before_write { |row| row[:obsolete] == true ? nil : row }

destination :out, { :type => :mock, :name => :block_output }
--------------------------------------------------------------------------------
/test/block_processor_test.rb:
--------------------------------------------------------------------------------
require File.dirname(__FILE__) + '/test_helper'
include ETL
include ETL::Control

# Stub collaborator; flexmock partial-mocks its :call method below.
class TestWitness
end

# Tests for the block-based processors (after_read, before_write,
# pre_process, post_process) driven by the block_processor*.ctl fixtures.
class BlockProcessorTest < Test::Unit::TestCase

  def test_block_processor_should_work_as_both_after_read_and_before_write_row_processor
    MockSource[:block_processed_input] = [{ :first_name => 'John'},{:first_name => 'Gary'}]
    process 'block_processor.ctl'
    # two input rows, each producing itself plus one extra row from before_write
    assert_equal 4, MockDestination[:block_processed_output].size
    assert_equal({ :first_name => 'John', :added_by_after_read => 'after-John', :added_by_before_write => "Row 1" }, MockDestination[:block_processed_output][0])
    assert_equal({ :new_row => 'added by post_processor' }, MockDestination[:block_processed_output][1])
    assert_equal({ :first_name => 'Gary', :added_by_after_read => 'after-Gary', :added_by_before_write => "Row 2" }, MockDestination[:block_processed_output][2])
    assert_equal({ :new_row => 'added by post_processor' }, MockDestination[:block_processed_output][3])
  end

  def test_block_processor_should_let_rows_be_removed_by_setting_it_to_nil
    MockSource[:block_input] = [{ :obsolete => true, :name => 'John'},{ :obsolete => false, :name => 'Gary'}]
    process 'block_processor_remove_rows.ctl'
    assert_equal([{ :obsolete => false, :name => 'Gary' }], MockDestination[:block_output]) # only one record should be kept
  end

  def test_block_processor_should_work_as_pre_or_post_processor
    # each witness call must happen exactly as declared in the control file
    flexmock(TestWitness).should_receive(:call).with("I'm called from pre_process")
    flexmock(TestWitness).should_receive(:call).with("I'm called from post_process")
    MockSource[:another_input] = [{ :obsolete => true, :name => 'John'},{ :obsolete => false, :name => 'Gary'}]
    process 'block_processor_pre_post_process.ctl'
    assert_equal(MockSource[:another_input], MockDestination[:another_output])
  end

  def test_block_error_should_be_propagated
    assert_raise(ControlError) { process 'block_processor_error.ctl' }
  end

end
--------------------------------------------------------------------------------
/test/check_exist_processor_test.rb:
--------------------------------------------------------------------------------
require File.dirname(__FILE__) + '/test_helper'

# Tests for ETL::Processor::CheckExistProcessor, which drops rows that
# already exist in the target database table (matched on :columns).
class CheckExistProcessorTest < Test::Unit::TestCase

  context 'CheckExistProcessor' do

    setup do
      # Baseline valid configuration; individual tests remove keys with
      # Hash#except to exercise the processor's validation.
      @config = {
        :target => :data_warehouse,
        :table => 'people',
        :columns => [:first_name, :last_name]
      }
    end

    should_eventually "compare based on all columns if no columns are provided" do
      # TBI
    end

    should_eventually "compare based on all columns except skipped ones if columns to skip are provided" do
      # TBI
    end

    should "raise an error if no table is provided" do
      error = assert_raises(ETL::ControlError) do
        ETL::Processor::CheckExistProcessor.new(nil, @config.except(:table))
      end
      # bug #2413 on assert_raises won't let me check error message above
      assert_equal 'table must be specified', error.message
    end

    should "raise an error if no target is provided" do
      error = assert_raises(ETL::ControlError) do
        ETL::Processor::CheckExistProcessor.new(nil, @config.except(:target))
      end

      assert_equal 'target must be specified', error.message
    end

    should "bypass checking if the table has no rows" do
      Person.delete_all

      processor = ETL::Processor::CheckExistProcessor.new(nil, @config)
      assert_equal false, processor.should_check?
    end

    should "raise an error if one of the keys used for checking existence is not available in a row" do
      Person.delete_all
      # we need at least one record to avoid automatic skipping
      # this should be mocked instead, probably
      Person.create!(:first_name => 'John', :last_name => 'Barry', :ssn => '1234')

      error = assert_raise(ETL::ControlError) do
        # row lacks :last_name, one of the configured check columns
        row = ETL::Row[:first_name => 'John']
        processor = ETL::Processor::CheckExistProcessor.new(nil, @config)

        # guard against bypassing
        assert_equal true, processor.should_check?

        processor.process(row)
      end

      assert_equal "Row missing required field :last_name for existence check", error.message
    end

    should "return nil if the same row is found in database" do
      Person.delete_all
      Person.create!(:first_name => 'John', :last_name => 'Barry', :ssn => '1234')

      row = ETL::Row[:first_name => 'John', :last_name => 'Barry']
      processor = ETL::Processor::CheckExistProcessor.new(nil, @config)
      assert_equal true, processor.should_check? # guard against bypassing

      # assert_nil instead of assert_equal(nil, ...): the latter is
      # deprecated and warns/fails on recent test-unit releases.
      assert_nil processor.process(row)
    end

    should "return the row if no same row is found in database" do
      Person.delete_all
      Person.create!(:first_name => 'John', :last_name => 'Barry', :ssn => '1234')

      row = ETL::Row[:first_name => 'John', :last_name => 'OtherName']
      processor = ETL::Processor::CheckExistProcessor.new(nil, @config)
      assert_equal true, processor.should_check? # guard against bypassing

      assert_equal row, processor.process(row)
    end

  end

end
90 |
--------------------------------------------------------------------------------
/test/check_unique_processor_test.rb:
--------------------------------------------------------------------------------
require File.dirname(__FILE__) + '/test_helper'

# Tests for ETL::Processor::CheckUniqueProcessor, which removes rows whose
# compound key (the configured :keys, joined with '|') was already seen.
class CheckUniqueProcessorTest < Test::Unit::TestCase

  context 'CheckUniqueProcessor' do
    attr_reader :processor

    setup do
      @processor = ETL::Processor::CheckUniqueProcessor.new(nil,
        :keys => [:first, :second])
    end

    should "keep a row whose keys didn't already appear in the pipeline" do
      row = ETL::Row[:first => 'A', :second => 'B']

      assert_equal row, processor.process(row)

      # The compound key is tracked internally as 'first|second' => count.
      assert_equal({ 'A|B' => 1 }, processor.compound_key_constraints)
    end

    should "remove a row whose keys already appeared in the pipeline" do
      row = ETL::Row[:first => 'A', :second => 'B']

      assert_equal row, processor.process(row)
      # assert_nil instead of assert_equal(nil, ...): the latter is
      # deprecated and warns/fails on recent test-unit releases.
      assert_nil processor.process(row)
    end

    should "raise an error if a row lacks one of the keys specified" do
      row = ETL::Row[:first => 'A']

      error = assert_raises(ETL::ControlError) do
        processor.process(row)
      end

      assert_equal "Row missing required field :second for unicity check", error.message
    end

  end

end
41 |
--------------------------------------------------------------------------------
/test/config/.gitignore:
--------------------------------------------------------------------------------
1 | *.lock
--------------------------------------------------------------------------------
/test/config/database.yml:
--------------------------------------------------------------------------------
1 | <% raise "ENV['DB'] not specified!" unless ENV['DB'] %>
2 |
3 | # a bit hackish - tests would require a refactoring instead
4 |
5 | mysql2: &mysql2
6 | host: 127.0.0.1
7 | adapter: mysql2
8 | database: activewarehouse_etl_test
9 | username: root
10 | encoding: utf8
11 | local_infile: true
12 | # the tests would require a rework: disabling casting for now
13 | cast: false
14 |
15 | postgresql: &postgresql
16 | adapter: postgresql
17 | database: activewarehouse_etl_test
18 | username: postgres
19 |
20 | # TODO - refactor test to avoid using 2 databases maybe?
21 | operational_database:
22 | <<: *<%= ENV['DB'] %>
23 |
24 | data_warehouse:
25 | <<: *<%= ENV['DB'] %>
26 |
27 | etl_execution:
28 | <<: *<%= ENV['DB'] %>
29 | database: etl_execution
30 |
--------------------------------------------------------------------------------
/test/config/gemfiles/Gemfile.rails-3.0.x:
--------------------------------------------------------------------------------
1 | require File.dirname(__FILE__) + '/common'
2 |
3 | declare_gems '~> 3.0.20'
4 |
--------------------------------------------------------------------------------
/test/config/gemfiles/Gemfile.rails-3.1.x:
--------------------------------------------------------------------------------
1 | require File.dirname(__FILE__) + '/common'
2 |
3 | declare_gems '~> 3.1.12'
4 |
--------------------------------------------------------------------------------
/test/config/gemfiles/Gemfile.rails-3.2.x:
--------------------------------------------------------------------------------
1 | require File.dirname(__FILE__) + '/common'
2 |
3 | declare_gems '~> 3.2.14'
4 |
--------------------------------------------------------------------------------
/test/config/gemfiles/Gemfile.rails-4.0.x:
--------------------------------------------------------------------------------
1 | require File.dirname(__FILE__) + '/common'
2 |
3 | declare_gems '~> 4.0.0'
4 |
--------------------------------------------------------------------------------
/test/config/gemfiles/common.rb:
--------------------------------------------------------------------------------
# Declares the gem set shared by all of the per-Rails-version Gemfiles.
#
# activerecord_version is a Gemfile requirement string such as '~> 3.0.20'.
# The numeric part is extracted before comparing against '3.1' because the
# previous plain string comparison ('~> 3.0.20' < '3.1') was always false:
# '~' sorts after every digit, so the mysql2 < 0.3 pin for Rails 3.0.x
# could never apply.
def declare_gems(activerecord_version)
  source "https://rubygems.org"

  gem 'activerecord', activerecord_version
  gem 'adapter_extensions', :git => 'https://github.com/activewarehouse/adapter_extensions.git'

  # Strip the requirement operator ('~>', '>=', ...) to get e.g. '3.0.20'.
  numeric_version = activerecord_version[/\d+(?:\.\d+)*/] || '0'

  if Gem::Version.new(numeric_version) < Gem::Version.new('3.1')
    gem 'mysql2', '< 0.3'
  else
    # use our own fork for bulk load support until issue fixed:
    # https://github.com/brianmario/mysql2/pull/242
    gem 'mysql2', :git => 'https://github.com/activewarehouse/mysql2.git'
  end

  gem 'pg'
  gem 'activerecord-sqlserver-adapter'

  gem 'awesome_print'
  gem 'rake'
  gem 'flexmock'
  gem 'shoulda', '3.0.1'
  gem 'sqlite3'

  gem 'spreadsheet'
  gem 'nokogiri'
  gem 'fastercsv'

  gem 'roo'

  gem 'standalone_migrations', '1.0.5'
end
--------------------------------------------------------------------------------
/test/control_test.rb:
--------------------------------------------------------------------------------
require File.dirname(__FILE__) + '/test_helper'

# Exercises parsing and resolution of ETL control (.ctl) files.
class ControlTest < Test::Unit::TestCase
  # Every .ctl file shipped alongside the tests should parse cleanly.
  def test_parse
    assert_nothing_raised do
      Dir.glob(File.join(File.dirname(__FILE__), '*.ctl')).each do |control_file|
        ETL::Control::Control.parse(control_file)
      end
    end
  end

  # Resolving something that is not a control/file/string must fail loudly.
  def test_bad_control_raises_error
    assert_raise(ETL::ControlError) { ETL::Control::Control.resolve(0) }
  end

  # An already-parsed Control object should resolve to itself without error.
  def test_resolve_control_object
    control_path = File.join(File.dirname(__FILE__), 'delimited.ctl')
    assert_nothing_raised do
      ETL::Control::Control.resolve(ETL::Control::Control.parse(control_path))
    end
  end

  # errors.ctl sets an error threshold of 1; processing it should not blow up.
  def test_set_error_threshold
    assert_nothing_raised do
      ETL::Engine.process(File.join(File.dirname(__FILE__), 'errors.ctl'))
    end
  end

  # Referencing an unknown processor in a directive is a control error.
  def test_bad_processor_name
    assert_raise(ETL::ControlError) do
      ETL::Control::Control.parse_text("before_write :chunky_monkey")
    end
  end

  # depends_on should expose its arguments via Control#dependencies.
  def test_dependencies
    control = ETL::Control::Control.parse_text("depends_on 'foo', 'bar'")
    assert_equal ['foo', 'bar'], control.dependencies
  end
end
--------------------------------------------------------------------------------
/test/data/apache_combined_log.txt:
--------------------------------------------------------------------------------
1 | 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 "http://www.example.com/start.html" "Mozilla/4.08 [en] (Win98; I ;Nav)"
2 | 127.0.0.1 - bob [11/Oct/2000:05:22:02 -0700] "GET /apache_pb.gif HTTP/1.1" 200 2326 "http://www.foo.com/" "Mozilla/4.08 [en] (Win98; I ;Nav)"
3 | 127.0.0.1 - bob [11/Oct/2000:05:52:31 -0700] "GET /apache_pb.gif HTTP/1.1" 200 2326 "-" "Mozilla/4.08 [en] (Win98; I ;Nav)"
4 |
--------------------------------------------------------------------------------
/test/data/bulk_import.txt:
--------------------------------------------------------------------------------
1 | 1,Chris,Smith,111223333
2 | 2,Jim,Foxworthy,444332222
3 | 3,Brian,Collingsworth,123443435
--------------------------------------------------------------------------------
/test/data/bulk_import_with_empties.txt:
--------------------------------------------------------------------------------
1 | 1,Chris,Smith,111223333
2 | 2,Jim,,444332222
3 | 3,Brian,Collingsworth,123443435
--------------------------------------------------------------------------------
/test/data/decode.txt:
--------------------------------------------------------------------------------
1 | M:Male
2 | F:Female
3 | :Unknown
--------------------------------------------------------------------------------
/test/data/delimited.txt:
--------------------------------------------------------------------------------
1 | Chris,Smith,111223333,24,M
2 | Jim,Foxworthy,444332222,51,M
3 | Brian,Collingsworth,123443435,10,M
--------------------------------------------------------------------------------
/test/data/encode_source_latin1.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/activewarehouse/activewarehouse-etl/0b0b50e140ed02081b3ed1de902f78308ed738a5/test/data/encode_source_latin1.txt
--------------------------------------------------------------------------------
/test/data/excel.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/activewarehouse/activewarehouse-etl/0b0b50e140ed02081b3ed1de902f78308ed738a5/test/data/excel.xls
--------------------------------------------------------------------------------
/test/data/excel2.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/activewarehouse/activewarehouse-etl/0b0b50e140ed02081b3ed1de902f78308ed738a5/test/data/excel2.xls
--------------------------------------------------------------------------------
/test/data/fixed_width.txt:
--------------------------------------------------------------------------------
1 | Bob Smith 12344555523
2 | Jane Doe 98766211145
3 | AbcdefghiJklmnopqrstu12345678920
--------------------------------------------------------------------------------
/test/data/multiple_delimited_1.txt:
--------------------------------------------------------------------------------
1 | Chris,Smith,111223333,24
2 | Jim,Foxworthy,444332222,51
3 | Brian,Collingsworth,123443435,10
--------------------------------------------------------------------------------
/test/data/multiple_delimited_2.txt:
--------------------------------------------------------------------------------
1 | Bob,Jones,444223333,28
2 | Tom,Allen,324001232,33
3 | Jesse,Baker,555443333,21
--------------------------------------------------------------------------------
/test/data/nokogiri.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Bob
6 | Smith
7 | bsmith@foo.com
8 |
9 | brown
10 | black
11 | fair
12 |
13 | 24
14 |
15 |
16 | Jane
17 | Doe
18 | jdoe@bar.com
19 |
20 | blue
21 | blond
22 | medium
23 |
24 | 45
25 |
26 |
27 | Jake
28 | Smithsonian
29 | jake@example.com
30 |
31 | brown
32 | black
33 | dark
34 |
35 | 37
36 |
37 |
38 |
39 |
--------------------------------------------------------------------------------
/test/data/people.txt:
--------------------------------------------------------------------------------
1 | Bob,Smith
2 | Jane,Doe
3 | Chris,Cornell
--------------------------------------------------------------------------------
/test/data/sax.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Smith
8 | 123456789
9 |
10 |
11 |
12 | Doe
13 | 222114545
14 |
15 |
--------------------------------------------------------------------------------
/test/data/xml.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Bob
6 | Smith
7 | 123456789
8 | 24
9 |
10 |
11 | John
12 | Doe
13 | 222114545
14 | 31
15 |
16 |
--------------------------------------------------------------------------------
/test/database_join_processor_test.rb:
--------------------------------------------------------------------------------
require File.dirname(__FILE__) + '/test_helper'

# Tests for ETL::Processor::DatabaseJoinProcessor, which enriches a row
# with fields fetched by a SQL query against the target connection.
class DatabaseJoinProcessorTest < Test::Unit::TestCase

  # Builds a processor with no control attached, from the given options.
  def new_processor(options)
    ETL::Processor::DatabaseJoinProcessor.new(nil, options)
  end

  # NOTE: with an empty configuration the :target validation fires first,
  # which is what the assertion checks — the description previously said
  # ":fields", a copy-paste slip from another processor's test.
  should 'raise an error unless :target is specified' do
    error = assert_raises(ETL::ControlError) { new_processor({}) }
    assert_equal ":target must be specified", error.message
  end

  should 'return the row and the database result' do
    row = ETL::Row[:id => 1, :first_name => 'Bob', :last_name => 'Smith', :ssn => '111234444']
    control = ETL::Control::Control.parse(File.dirname(__FILE__) +
      '/delimited.ctl')

    Person.delete_all
    assert_equal 0, Person.count

    # First define a basic configuration to check defaults
    configuration = {
      :target => :data_warehouse,
      :database => 'etl_unittest',
      :table => 'people',
      :buffer_size => 0
    }
    mapping = { :order => [:id, :first_name, :last_name, :ssn] }
    dest = ETL::Control::DatabaseDestination.new(control, configuration, mapping)
    dest.write(row)
    dest.close

    assert_equal 1, Person.find(:all).length

    # The processor merges the queried first_name into the row in place,
    # so we assert on the row itself rather than the return value.
    row = ETL::Row[:last_name => "Smith"]
    new_processor(:target => :data_warehouse,
      :query => "SELECT first_name FROM people WHERE last_name = \#{connection.quote(row[:last_name])}",
      :fields => ["first_name"]).process(row)
    assert_equal row[:first_name], "Bob"
  end

end
44 |
--------------------------------------------------------------------------------
/test/date_dimension_builder_test.rb:
--------------------------------------------------------------------------------
require File.dirname(__FILE__) + '/test_helper'

# Tests for ETL::Builder::DateDimensionBuilder, which generates one record
# per day between a start and an end date, with calendar and fiscal fields.
class DateDimensionBuilderTest < Test::Unit::TestCase

  context "the DateDimensionBuilder" do
    context "when initialized with defaults" do
      setup do
        @builder = ETL::Builder::DateDimensionBuilder.new
      end
      should "have a start date of 5 years ago" do
        assert_equal Time.now.years_ago(5).to_date, @builder.start_date.to_date
      end
      should "have an end date of now" do
        assert_equal Time.now.to_date, @builder.end_date.to_date
      end
      should "have an empty of array of holiday indicators" do
        assert_equal [], @builder.holiday_indicators
      end
    end
    context "when initialized with arguments" do
      setup do
        @start_date = Time.now.years_ago(2)
        @end_date = Time.now.years_ago(1)
        @builder = ETL::Builder::DateDimensionBuilder.new(@start_date, @end_date)
      end
      should "respect a custom start date" do
        assert_equal @start_date.to_date, @builder.start_date.to_date
      end
      should "respect a custom end date" do
        assert_equal @end_date.to_date, @builder.end_date.to_date
      end
    end
    context "when building a date dimension using the default settings" do
      setup do
        # specific dates required when testing, because leap years affect
        # how many records are built
        @start_date = Date.parse('2002-05-19').to_time
        @end_date = Date.parse('2007-05-19').to_time
        @builder = ETL::Builder::DateDimensionBuilder.new(@start_date, @end_date)
        @records = @builder.build
      end
      should "build a dimension with the correct number of records" do
        # 5 years inclusive of both endpoints, spanning one leap year (2004)
        assert_equal 1827, @records.length
      end
      should "have the correct first date" do
        assert_date_dimension_record_equal(@builder.start_date, @records.first)
      end
    end
    context "when building a date dimension with a fiscal year offset month" do
      should_eventually "respect the fiscal year offset month" do

      end
    end
  end

  # Asserts that every field of a date-dimension record matches the values
  # derived from the given date (calendar, fiscal and indicator fields).
  def assert_date_dimension_record_equal(date, record)
    real_date = date
    date = date.to_time
    assert_equal date.strftime("%m/%d/%Y"), record[:date]
    assert_equal date.strftime("%B %d,%Y"), record[:full_date_description]
    assert_equal date.strftime("%A"), record[:day_of_week]
    assert_equal date.day, record[:day_number_in_calendar_month]
    assert_equal date.yday, record[:day_number_in_calendar_year]
    assert_equal date.day, record[:day_number_in_fiscal_month]
    assert_equal date.fiscal_year_yday, record[:day_number_in_fiscal_year]
    assert_equal "Week #{date.week}", record[:calendar_week]
    assert_equal date.week, record[:calendar_week_number_in_year]
    assert_equal date.strftime("%B"), record[:calendar_month_name]
    assert_equal date.month, record[:calendar_month_number_in_year]
    assert_equal date.strftime("%Y-%m"), record[:calendar_year_month]
    assert_equal "Q#{date.quarter}", record[:calendar_quarter]
    assert_equal date.quarter, record[:calendar_quarter_number_in_year]
    assert_equal "#{date.strftime('%Y')}-#{record[:calendar_quarter]}", record[:calendar_year_quarter]
    assert_equal "#{date.year}", record[:calendar_year]
    assert_equal "FY Week #{date.fiscal_year_week}", record[:fiscal_week]
    assert_equal date.fiscal_year_week, record[:fiscal_week_number_in_year]
    assert_equal date.fiscal_year_month, record[:fiscal_month]
    assert_equal date.fiscal_year_month, record[:fiscal_month_number_in_year]
    assert_equal "FY#{date.fiscal_year}-" + date.fiscal_year_month.to_s.rjust(2, '0'), record[:fiscal_year_month]
    assert_equal "FY Q#{date.fiscal_year_quarter}", record[:fiscal_quarter]
    assert_equal "FY#{date.fiscal_year}-Q#{date.fiscal_year_quarter}", record[:fiscal_year_quarter]
    assert_equal date.fiscal_year_quarter, record[:fiscal_year_quarter_number]
    assert_equal "FY#{date.fiscal_year}", record[:fiscal_year]
    assert_equal date.fiscal_year, record[:fiscal_year_number]
    assert_equal 'Nonholiday', record[:holiday_indicator]
    assert_equal weekday_indicators[date.wday], record[:weekday_indicator]
    assert_equal 'None', record[:selling_season]
    assert_equal 'None', record[:major_event]
    # expected value first, to match the convention used above
    assert_equal real_date, record[:sql_date_stamp]
  end

  private
  # Maps Time#wday (0 = Sunday) to the dimension's weekday indicator.
  def weekday_indicators
    ['Weekend','Weekday','Weekday','Weekday','Weekday','Weekday','Weekend']
  end
end
--------------------------------------------------------------------------------
/test/delimited.ctl:
--------------------------------------------------------------------------------
1 | source :in, {
2 | :file => 'data/delimited.txt',
3 | :parser => {
4 | :name => :csv
5 | }
6 | },
7 | [
8 | :first_name,
9 | :last_name,
10 | :ssn,
11 | :age,
12 | :sex
13 | ]
14 |
15 | #transform :age, :type, :type => :number
16 | transform :ssn, :sha1
17 | transform(:ssn){ |n, v, row| v[0,24] }
18 | transform :sex, :decode, {:decode_table_path => 'data/decode.txt'}
19 |
20 | destination :out, {
21 | :file => 'output/delimited.txt'
22 | },
23 | {
24 | :order => [:id, :first_name, :last_name, :ssn, :age, :sex, :test, :calc_test],
25 | :virtual => {
26 | :id => :surrogate_key,
27 | :test => "test!",
28 | :calc_test => Time.now
29 | },
30 | }
--------------------------------------------------------------------------------
/test/delimited_absolute.ctl:
--------------------------------------------------------------------------------
1 | source :in, {
2 | :file => '/tmp/delimited_abs.txt',
3 | :parser => {
4 | :name => :csv
5 | }
6 | },
7 | [
8 | :first_name,
9 | :last_name,
10 | :ssn,
11 | {
12 | :name => :age,
13 | :type => :integer
14 | },
15 | :sex
16 | ]
17 |
18 | transform :ssn, :sha1
19 | transform(:ssn){ |n, v, row| v[0,24] }
20 | transform :sex, :decode, {:decode_table_path => 'data/decode.txt'}
21 |
22 | destination :out, {
23 | :file => 'data/delimited_abs.txt'
24 | },
25 | {
26 | :order => [:first_name, :last_name, :ssn, :age, :sex, :test, :calc_test],
27 | :virtual => {
28 | :test => "test!",
29 | :calc_test => Time.now
30 | }
31 | }
--------------------------------------------------------------------------------
/test/delimited_destination_db.ctl:
--------------------------------------------------------------------------------
1 | source :in, {
2 | :file => 'data/delimited.txt',
3 | :parser => :csv
4 | },
5 | [
6 | :id,
7 | :first_name,
8 | :last_name,
9 | :ssn
10 | ]
11 |
12 | transform :ssn, :sha1
13 | transform(:ssn){ |v| v[0,24] }
14 |
15 | destination :out, {
16 | :type => :database,
17 | :target => :data_warehouse,
18 | :database => 'etl_unittest',
19 | :table => 'people',
20 | },
21 | {
22 | :order => [:id, :first_name, :last_name, :ssn]
23 | }
--------------------------------------------------------------------------------
/test/delimited_excel.ctl:
--------------------------------------------------------------------------------
1 | source :in, {
2 | :file => 'data/delimited.txt',
3 | :parser => {
4 | :name => :csv
5 | }
6 | },
7 | [
8 | :first_name,
9 | :last_name,
10 | :ssn,
11 | :age,
12 | :sex
13 | ]
14 |
15 | #transform :age, :type, :type => :number
16 | transform :ssn, :sha1
17 | transform(:ssn){ |n, v, row| v[0,24] }
18 | transform :sex, :decode, {:decode_table_path => 'data/decode.txt'}
19 |
20 | destination :out, {
21 | :type => :excel,
22 | :file => 'output/delimited_excel.xls'
23 | },
24 | {
25 | :order => [:id, :first_name, :last_name, :ssn, :age, :sex, :test, :calc_test],
26 | :virtual => {
27 | :id => :surrogate_key,
28 | :test => "test!",
29 | :calc_test => Time.now
30 | },
31 | }
--------------------------------------------------------------------------------
/test/delimited_insert_update.ctl:
--------------------------------------------------------------------------------
1 | source :in, {
2 | :file => 'data/delimited.txt',
3 | :parser => {
4 | :name => :csv
5 | }
6 | },
7 | [
8 | :first_name,
9 | :last_name,
10 | :ssn,
11 | :age,
12 | :sex
13 | ]
14 |
15 | #transform :age, :type, :type => :number
16 | transform :ssn, :sha1
17 | transform(:ssn){ |n, v, row| v[0,24] }
18 | transform :sex, :decode, {:decode_table_path => 'data/decode.txt'}
19 |
20 | destination :out, {
21 | :type => :insert_update_database,
22 | :target => :data_warehouse,
23 | :database => 'etl_unittest',
24 | :table => 'people'
25 | },
26 | {
27 | :primarykey => [:id],
28 | :order => [:id, :first_name, :last_name, :ssn, :age, :sex, :test, :calc_test],
29 | :virtual => {
30 | :id => :surrogate_key,
31 | :test => "test!",
32 | :calc_test => Time.now
33 | },
34 | }
35 |
--------------------------------------------------------------------------------
/test/delimited_update.ctl:
--------------------------------------------------------------------------------
1 | source :in, {
2 | :file => 'data/delimited.txt',
3 | :parser => {
4 | :name => :csv
5 | }
6 | },
7 | [
8 | :first_name,
9 | :last_name,
10 | :ssn,
11 | :age,
12 | :sex
13 | ]
14 |
15 | #transform :age, :type, :type => :number
16 | transform :ssn, :sha1
17 | transform(:ssn){ |n, v, row| v[0,24] }
18 | transform :sex, :decode, {:decode_table_path => 'data/decode.txt'}
19 |
20 | destination :out, {
21 | :type => :update_database,
22 | :target => :data_warehouse,
23 | :database => 'etl_unittest',
24 | :table => 'people'
25 | },
26 | {
27 | :conditions => [{:field => "\#{conn.quote_column_name(:id)}", :value => "\#{conn.quote(row[:id])}", :comp => "="}],
28 | :order => [:id, :first_name, :last_name, :ssn, :age, :sex, :test, :calc_test],
29 | :virtual => {
30 | :id => :surrogate_key,
31 | :test => "test!",
32 | :calc_test => Time.now
33 | },
34 | }
35 |
--------------------------------------------------------------------------------
/test/delimited_with_bulk_load.ctl:
--------------------------------------------------------------------------------
1 | infile = 'data/people.txt'
2 | outfile = 'output/people.txt'
3 |
4 | source :in, {
5 | :file => infile,
6 | :parser => {
7 | :name => :csv
8 | }
9 | },
10 | [
11 | :first_name,
12 | :last_name,
13 | ]
14 |
15 | before_write :surrogate_key, :target => :data_warehouse, :table => 'person_dimension', :column => 'id'
16 | before_write :check_exist, {
17 | :target => :data_warehouse,
18 | :table => 'person_dimension',
19 | :columns => [:first_name, :last_name]
20 | }
21 |
22 | destination :out, {
23 | :file => outfile
24 | },
25 | {
26 | :order => [:id, :first_name, :last_name]
27 | }
28 |
29 | post_process :bulk_import, {
30 | :file => outfile,
31 | :target => :data_warehouse,
32 | :table => 'person_dimension',
33 | :order => [:id, :first_name, :last_name]
34 | }
--------------------------------------------------------------------------------
/test/directive_test.rb:
--------------------------------------------------------------------------------
require File.dirname(__FILE__) + '/test_helper'

# A directive subclass that deliberately provides no implementation, used
# to verify that the base class rejects directives lacking one.
class BadDirective < ETL::Batch::Directive

end

class BatchTest < Test::Unit::TestCase

  attr_reader :file, :engine

  def setup
    @file = File.dirname(__FILE__) + '/all.ebf'
    @engine = ETL::Engine.new
  end

  # Executing a directive that never implemented the required hook must
  # raise rather than silently do nothing.
  def test_directive_without_implementation_should_fail
    batch = ETL::Batch::Batch.resolve(file, engine)
    assert_raise(RuntimeError) do
      BadDirective.new(batch).execute
    end
  end
end
--------------------------------------------------------------------------------
/test/encode_processor_test.rb:
--------------------------------------------------------------------------------
# encoding: utf-8
require File.dirname(__FILE__) + '/test_helper'

# NOTE(review): Iconv was removed from the Ruby standard library in 2.0;
# on newer Rubies this presumably relies on the iconv gem — verify.
require 'iconv'

# Tests for ETL::Processor::EncodeProcessor, which rewrites a source file
# into a target file under a different character encoding.
class EncodeProcessorTest < Test::Unit::TestCase

  # Paths are resolved relative to the control file's directory.
  SOURCE = 'data/encode_source_latin1.txt'
  TARGET = 'output/encode_destination_utf-8.txt'

  def setup
    # The processor asks the control for its file exactly twice (once per
    # path to resolve), hence the .twice expectation.
    @control = flexmock("control")
    @control.should_receive(:file).twice.and_return(File.dirname(__FILE__) + '/fake-control.ctl')
  end

  def test_should_transform_a_latin1_file_to_utf8_with_grace
    configuration = { :source_file => SOURCE, :source_encoding => 'latin1', :target_file => TARGET, :target_encoding => 'utf-8' }
    ETL::Processor::EncodeProcessor.new(@control, configuration).process
    # The accented characters must survive the latin1 -> utf-8 conversion.
    assert_equal "éphémère has accents.\nlet's encode them.", IO.read(File.join(File.dirname(__FILE__),TARGET))
  end

  def test_should_throw_exception_on_unsupported_encoding
    configuration = { :source_file => SOURCE, :source_encoding => 'acme-encoding', :target_file => TARGET, :target_encoding => 'utf-8' }
    # Validation happens at construction time, not when process is called.
    error = assert_raise(ETL::ControlError) { ETL::Processor::EncodeProcessor.new(@control, configuration) }
    assert_equal "Either the source encoding 'acme-encoding' or the target encoding 'utf-8' is not supported", error.message
  end

  def test_should_throw_exception_when_target_and_source_are_the_same
    configuration = { :source_file => SOURCE, :source_encoding => 'latin1', :target_file => SOURCE, :target_encoding => 'utf-8' }
    error = assert_raise(ETL::ControlError) { ETL::Processor::EncodeProcessor.new(@control, configuration) }
    assert_equal "Source and target file cannot currently point to the same file", error.message
  end

end
--------------------------------------------------------------------------------
/test/engine_test.rb:
--------------------------------------------------------------------------------
1 | require File.dirname(__FILE__) + '/test_helper'
2 |
3 | class EngineTest < Test::Unit::TestCase
4 |
5 | context 'process' do
6 |
7 | should 'raise an error when a file which does not exist is given' do
8 | error = assert_raise(Errno::ENOENT) do
9 | ETL::Engine.process('foo-bar.ctl')
10 | end
11 |
12 | assert_equal "No such file or directory - foo-bar.ctl", error.message
13 | end
14 |
15 | should 'raise an error when an unknown file type is given' do
16 | error = assert_raise(RuntimeError) do
17 | ETL::Engine.process(__FILE__)
18 | end
19 |
20 | assert_match /Unsupported file type/, error.message
21 | end
22 |
23 | should_eventually 'stop as soon as the error threshold is reached' do
24 | engine = ETL::Engine.new
25 |
26 | assert_equal 0, engine.errors.size
27 |
28 | engine.process ETL::Control::Control.parse_text <<-CTL
29 |
30 | source :in, { :type => :enumerable, :enumerable => (1..100) }
31 | after_read { |row| raise "Failure" }
32 | CTL
33 |
34 | assert_equal 1, engine.errors.size
35 | end
36 |
37 | should 'call error callbacks' do
38 | engine = ETL::Engine.new
39 |
40 | $our_errors = []
41 | engine.process ETL::Control::Control.parse_text <<-CTL
42 | source :in, { :type => :enumerable, :enumerable => (1..100) }
43 | on_error { |error| $our_errors << error }
44 | after_read { |row| raise "Failure" }
45 | CTL
46 | assert_equal 100, $our_errors.size
47 | assert_match /on line 100: Failure$/, $our_errors.last
48 | end
49 |
50 | end
51 |
52 | context 'connection' do
53 |
54 | should 'return an ActiveRecord configuration by name' do
55 | assert_not_nil ETL::Engine.connection(:data_warehouse)
56 | end
57 |
58 | should 'raise an error on non existent connection' do
59 | error = assert_raise(ETL::ETLError) do
60 | ETL::Engine.connection(:does_not_exist)
61 | end
62 | assert_equal "Cannot find connection named :does_not_exist", error.message
63 | end
64 |
65 | should 'raise an error when requesting a connection with no name' do
66 | error = assert_raise(ETL::ETLError) do
67 | ETL::Engine.connection(" ")
68 | end
69 | assert_equal "Connection with no name requested. Is there a missing :target parameter somewhere?", error.message
70 | end
71 | end
72 |
73 | context 'temp tables' do
74 | attr_reader :connection
75 |
76 | setup do
77 | @connection = ETL::Engine.connection(:data_warehouse)
78 | end
79 |
80 | should 'return unmodified table name when temp tables are disabled' do
81 | assert_equal 'foo', ETL::Engine.table('foo', ETL::Engine.connection(:data_warehouse))
82 | end
83 |
84 | should 'return temp table name instead of table name when temp tables are enabled' do
85 | ETL::Engine.use_temp_tables = true
86 | assert_equal 'tmp_people', ETL::Engine.table('people', connection)
87 | ETL::Engine.use_temp_tables = false
88 | end
89 | end
90 |
91 | end
--------------------------------------------------------------------------------
/test/ensure_fields_presence_processor_test.rb:
--------------------------------------------------------------------------------
require File.dirname(__FILE__) + '/test_helper'

# Tests for ETL::Processor::EnsureFieldsPresenceProcessor, which raises a
# control error when a row lacks any of the configured fields.
class EnsureFieldsPresenceProcessorTest < Test::Unit::TestCase

  # Convenience constructor: a processor with no control attached.
  def new_processor(options)
    ETL::Processor::EnsureFieldsPresenceProcessor.new(nil, options)
  end

  should 'raise an error unless :fields is specified' do
    error = assert_raises(ETL::ControlError) { new_processor({}) }
    assert_equal ":fields must be specified", error.message
  end

  should 'raise an error if a field is missing in the row' do
    missing_field_error = assert_raise(ETL::ControlError) do
      new_processor(:fields => [:key]).process(ETL::Row[])
    end

    assert_match /missing required field\(s\)/, missing_field_error.message
  end

  should 'return the row if the required fields are in the row' do
    # A nil value still counts as present — only the key matters.
    row = ETL::Row[:first => nil, :second => "Barry"]
    processor = new_processor(:fields => [:first, :second])
    assert_equal row, processor.process(row)
  end

  should 'accept strings instead of symbols in both places' do
    row = ETL::Row[:first => nil, 'second' => "Barry"]
    processor = new_processor(:fields => ['first', :second])
    assert_equal row, processor.process(row)
  end

end
34 |
--------------------------------------------------------------------------------
/test/errors.ctl:
--------------------------------------------------------------------------------
# Row processor that fails on every row; used to exercise the engine's
# error-threshold handling.
# NOTE: the original defined initialize(control, configuration) whose body
# was only `super` - that is exactly the inherited behavior, so the
# redundant override has been removed (interface unchanged).
class ErrorProcessor < ETL::Processor::RowProcessor
  # Always raises, so every processed row counts as an error.
  def process(row)
    raise RuntimeError, "Generated error"
  end
end
9 |
# Abort the run after a single error has been recorded.
set_error_threshold 1

# Two in-memory rows; each one will fail inside ErrorProcessor.
source :in, {
  :type => :enumerable,
  :enumerable => [
    {:first_name => 'Bob',:last_name => 'Smith'},
    {:first_name => 'Joe', :last_name => 'Thompson'}
  ]
},
[
  :first_name,
  :last_name
]

# Raise on every row as soon as it is read from the source.
after_read ErrorProcessor
--------------------------------------------------------------------------------
/test/etl_test.rb:
--------------------------------------------------------------------------------
1 | require File.dirname(__FILE__) + '/test_helper'
2 |
3 | # This is an integration test
# This is an integration test: run complete .ctl control files through the
# engine and verify the output files they produce.
class ETLTest < Test::Unit::TestCase
  # Test end-to-end integration of ETL engine processing for the
  # delimited.ctl control file.
  def test_delimited_single_file_load
    #ETL::Engine.logger = Logger.new(STDOUT)
    #ETL::Engine.logger.level = Logger::DEBUG

    ETL::Engine.init(:config => File.dirname(__FILE__) + '/database.yml')
    ETL::Engine.process(File.dirname(__FILE__) + '/delimited.ctl')
    lines = read_output('delimited.txt')
    assert_equal 3, lines.length

    assert_person_row lines[0], '1', 'Chris', 'Smith', '23cc5914d48b146f0fbb73c4', '24'
    assert_person_row lines[1], '2', 'Jim', 'Foxworthy', '596e3534978b8c2b47851e37', '51'
  end

  # Test end-to-end integration of ETL engine processing for the
  # fixed_width.ctl control file.
  def test_fixed_width_single_file_load
    ETL::Engine.process(File.dirname(__FILE__) + '/fixed_width.ctl')
    # BUGFIX: previously asserted against output/delimited.txt (the OTHER
    # test's output file); fixed_width.ctl writes output/fixed_width.txt.
    lines = read_output('fixed_width.txt')
    assert_equal 3, lines.length
  end

  private

  # Read all lines of the named file under test/output. Uses File.readlines
  # so the handle is closed (the old Kernel#open(...).readlines leaked a
  # file descriptor).
  def read_output(name)
    File.readlines(File.dirname(__FILE__) + "/output/#{name}")
  end

  # Assert one CSV output row: id, first/last name, 24-char sha1 prefix of
  # the SSN, age, sex, a constant note field, and a parseable timestamp.
  def assert_person_row(line, id, first, last, ssn_digest, age)
    data = line.split(',')
    assert_equal id, data[0]
    assert_equal first, data[1]
    assert_equal last, data[2]
    assert_equal ssn_digest, data[3]
    assert_equal age, data[4]
    assert_equal 'Male', data[5]
    assert_equal 'test!', data[6]
    assert_nothing_raised { Time.parse(data[7]) }
  end
end
--------------------------------------------------------------------------------
/test/excel.ctl:
--------------------------------------------------------------------------------
1 | source :in, {
2 | :file => 'data/excel.xls',
3 | :parser => :excel
4 | },
5 | {
6 | :ignore_blank_line => false,
7 | :fields => [
8 | :first_name,
9 | :last_name,
10 | :ssn,
11 | :age
12 | ]
13 | }
14 |
15 | transform :ssn, :sha1
16 | transform(:ssn){ |n, v, r| v[0,24] }
17 |
18 |
19 | destination :out, {
20 | :file => 'output/excel.out.txt'
21 | },
22 | {
23 | :order => [:first_name, :last_name, :ssn, :age]
24 | }
--------------------------------------------------------------------------------
/test/excel2.ctl:
--------------------------------------------------------------------------------
1 | source :in, {
2 | :file => 'data/excel2.xls',
3 | :parser => :excel
4 | },
5 | {
6 | :ignore_blank_line => true,
7 | :worksheets => [ 1 ],
8 | :fields => [
9 | :first_name,
10 | :last_name,
11 | :ssn,
12 | :age
13 | ]
14 | }
15 |
16 | transform :ssn, :sha1
17 | transform(:ssn){ |n, v, r| v[0,24] }
18 |
19 |
20 | destination :out, {
21 | :file => 'output/excel2.out.txt'
22 | },
23 | {
24 | :order => [:first_name, :last_name, :ssn, :age]
25 | }
--------------------------------------------------------------------------------
/test/fixed_width.ctl:
--------------------------------------------------------------------------------
1 | # puts "executing fixed_width.ctl"
2 |
3 | source :in, {
4 | :file => 'data/fixed_width.txt',
5 | :parser => :fixed_width
6 | },
7 | {
8 | :first_name => {
9 | :start => 1,
10 | :length => 9
11 | },
12 | :last_name => {
13 | :start => 10,
14 | :length => 12
15 | },
16 | :ssn => {
17 | :start => 22,
18 | :length => 9
19 | },
20 | :age => {
21 | :start => 31,
22 | :length => 2,
23 | :type => :integer
24 | }
25 | }
26 |
27 | transform :ssn, :sha1
28 | transform(:ssn){ |n, v, r| v[0,24] }
29 |
30 | destination :out, {
31 | :file => 'output/fixed_width.txt'
32 | },
33 | {
34 | :order => [:first_name, :last_name, :ssn, :age]
35 | }
--------------------------------------------------------------------------------
/test/foreign_key_lookup_transform_test.rb:
--------------------------------------------------------------------------------
1 | require File.dirname(__FILE__) + '/test_helper'
2 |
3 | # TODO - use flexmock instead, but I'm not sure how to handle the respond_to part yet
# Minimal stand-in resolver: only records whether load_cache was invoked,
# so the transform's cache behavior can be observed.
class TestResolver
  attr_accessor :cache_loaded

  def initialize
    self.cache_loaded = false
  end

  # Pretend to warm the cache by flipping the flag.
  def load_cache
    self.cache_loaded = true
  end
end
15 |
# Verifies ForeignKeyLookupTransform cache configuration: the resolver's
# cache is loaded by default, and the :cache option can disable or enable
# it explicitly.
class ForeignKeyLookupTransformTest < Test::Unit::TestCase

  context 'configuration' do

    should 'enable cache by default' do
      resolver = TestResolver.new

      # Constructing the transform is the action under test; the returned
      # instance is not used (unused `transform` locals removed).
      ETL::Transform::ForeignKeyLookupTransform.new(nil, 'name',
        {:resolver => resolver})

      assert_equal true, resolver.cache_loaded
    end

    should 'allow to disable cache' do
      resolver = TestResolver.new

      ETL::Transform::ForeignKeyLookupTransform.new(nil, 'name',
        {:resolver => resolver, :cache => false})

      assert_equal false, resolver.cache_loaded
    end

    should 'allow to enable cache' do
      resolver = TestResolver.new

      ETL::Transform::ForeignKeyLookupTransform.new(nil, 'name',
        {:resolver => resolver, :cache => true})

      assert_equal true, resolver.cache_loaded
    end

  end

end
--------------------------------------------------------------------------------
/test/generator_test.rb:
--------------------------------------------------------------------------------
1 | require File.dirname(__FILE__) + '/test_helper'
2 |
3 | # Test generators
# Tests the generator registry and the surrogate key generator.
class GeneratorTest < Test::Unit::TestCase
  # :surrogate_key must resolve to SurrogateKeyGenerator, and a fresh
  # generator must hand out consecutive integers starting at 1.
  def test_surrogate_key_generator
    resolved = ETL::Generator::Generator.class_for_name(:surrogate_key)
    assert_equal ETL::Generator::SurrogateKeyGenerator, resolved
    generator = resolved.new
    (1..10).each do |expected|
      assert_equal expected, generator.next
    end
  end
end
--------------------------------------------------------------------------------
/test/inline_parser.ctl:
--------------------------------------------------------------------------------
# Inline parser used by this control file: emits three fixed rows, one per
# name, to exercise custom-parser support.
class MyParser < ETL::Parser::Parser
  def each
    %w[foo bar baz].each do |value|
      yield({:name => value})
    end
  end
end
8 |
9 | source :in, {
10 | :file => '',
11 | :parser => MyParser
12 | },
13 | [
14 | :name
15 | ]
16 |
17 | destination :out, {:file => 'output/inline_parser.txt'},{:order => [:name]}
--------------------------------------------------------------------------------
/test/mocks/mock_destination.rb:
--------------------------------------------------------------------------------
module ETL
  module Control
    # Usage:
    # - declare in the ctl file:
    #     destination :out, { :type => :mock, :name => :my_mock_output }
    # - run the .ctl from your test
    # - then assert the content of the rows
    #     assert_equal [{:name => 'John Barry'},{:name => 'Gary Moore'}], MockDestination[:my_mock_output]
    class MockDestination < Destination
      # Registry shared by all mock destinations, keyed by destination name.
      # BUGFIX: initialized at class-definition time - the original created
      # it lazily in #initialize, so MockDestination[...] raised NameError
      # (uninitialized class variable) when queried before any instance
      # existed.
      @@registry = {}

      def initialize(control, configuration, mapping={})
        super
        @mock_destination_name = configuration[:name] || 'mock_destination'
        @@registry[@mock_destination_name] ||= []
      end

      # Rows captured for the given destination name (nil if unknown).
      def self.[](mock_destination_name)
        @@registry[mock_destination_name]
      end

      # Append a row to this destination's captured list.
      def write(row)
        @@registry[@mock_destination_name] << row
      end

      # the presence of close is asserted - just do nothing
      def close; end
    end
  end
end
27 |
--------------------------------------------------------------------------------
/test/mocks/mock_source.rb:
--------------------------------------------------------------------------------
module ETL
  module Control
    # Usage:
    # - first set the data in your test setup
    #     MockSource[:my_input] = [ { :first_name => 'John', :last_name => 'Barry' }, { ...} ]
    # - then declare in the ctl file:
    #     source :in, { :type => :mock, :name => :my_input }
    class MockSource < EnumerableSource
      # Shared registry of mock data, keyed by source name.
      # BUGFIX: defined at class-definition time so #initialize cannot hit
      # an uninitialized class variable when no data was registered first.
      @@registry = {}

      def initialize(control, configuration, definition)
        super
        mock_source_name = configuration[:name] || 'mock_source'
        # BUGFIX: was `throw`, which expects a matching `catch` block and
        # would surface as a confusing UncaughtThrowError; `raise` with a
        # message is what was intended.
        raise "No mock source data set for mock source '#{mock_source_name}'" if @@registry[mock_source_name].nil?
        configuration[:enumerable] = @@registry[mock_source_name]
      end

      def self.[]=(mock_source_name, mock_source_data)
        @@registry[mock_source_name] = mock_source_data
      end

      def self.[](mock_source_name)
        @@registry[mock_source_name]
      end
    end
  end
end
25 |
26 |
--------------------------------------------------------------------------------
/test/model_source.ctl:
--------------------------------------------------------------------------------
1 | source :in, {
2 | :type => :model
3 | },
4 | [
5 | :first_name,
6 | :last_name
7 | ]
8 |
9 | destination :out, {
10 | :file => 'data/model_out.txt'
11 | },
12 | {
13 | :order => [:first_name, :last_name],
14 | }
--------------------------------------------------------------------------------
/test/multiple_delimited.ctl:
--------------------------------------------------------------------------------
# puts "executing multiple_delimited.ctl"
2 |
3 | source :in, {
4 | :file => 'data/multiple_delimited_*.txt',
5 | :parser => :csv
6 | },
7 | [
8 | :first_name,
9 | :last_name,
10 | :ssn,
11 | {
12 | :name => :age,
13 | :type => :integer
14 | }
15 | ]
16 |
17 | destination :out, {
18 | :file => 'output/multiple_delimited.txt'
19 | },
20 | {
21 | :order => [:first_name, :last_name, :ssn, :age]
22 | }
23 |
--------------------------------------------------------------------------------
/test/multiple_source_delimited.ctl:
--------------------------------------------------------------------------------
# puts "executing multiple_source_delimited.ctl"
2 |
3 | source :source1, {
4 | :file => 'data/multiple_delimited_*.txt',
5 | :parser => :csv
6 | },
7 | [
8 | :first_name,
9 | :last_name,
10 | :ssn,
11 | {
12 | :name => :age,
13 | :type => :integer
14 | }
15 | ]
16 |
17 | source :source2, {
18 | :file => 'data/multiple_delimited_*.txt',
19 | :parser => :csv
20 | },
21 | [
22 | :first_name,
23 | :last_name,
24 | :ssn,
25 | {
26 | :name => :age,
27 | :type => :integer
28 | }
29 | ]
30 |
31 | transform :ssn, :sha1
32 | transform(:ssn){ |v| v[0,24] }
33 |
34 | destination :out, {
35 | :file => 'output/multiple_source_delimited.txt'
36 | },
37 | {
38 | :order => [:first_name, :last_name, :ssn, :age]
39 | }
--------------------------------------------------------------------------------
/test/nokogiri_all.ctl:
--------------------------------------------------------------------------------
1 | # puts "executing nokogiri_all.ctl"
2 |
3 | source :in, {
4 | :file => 'data/nokogiri.xml',
5 | :parser => :nokogiri_xml
6 | },
7 | {
8 | :collection => 'people/person',
9 | :fields => [
10 | :first_name,
11 | :last_name,
12 | {
13 | :name => :ssn,
14 | :xpath => '@ssn'
15 | },
16 | {
17 | :name => :age,
18 | :type => :integer
19 | },
20 | {
21 | :name => :hair_colour,
22 | :xpath => 'colours/hair'
23 | }
24 | ]
25 | }
26 |
27 | destination :out, {
28 | :file => 'output/xml.txt'
29 | },
30 | {
31 | :order => [:first_name, :last_name, :ssn]
32 | }
33 |
34 | transform :ssn, :sha1
35 | transform(:ssn){ |v| v[0,24] }
36 |
--------------------------------------------------------------------------------
/test/nokogiri_select.ctl:
--------------------------------------------------------------------------------
1 | # puts "executing nokogiri_select.ctl"
2 |
3 | source :in, {
4 | :file => 'data/nokogiri.xml',
5 | :parser => :nokogiri_xml
6 | },
7 | {
8 | :collection => 'people/person[@type="client"]',
9 | :fields => [
10 | :first_name,
11 | :last_name,
12 | {
13 | :name => :ssn,
14 | :xpath => '@ssn'
15 | },
16 | {
17 | :name => :age,
18 | :type => :integer
19 | },
20 | {
21 | :name => :hair_colour,
22 | :xpath => 'colours/hair'
23 | }
24 | ]
25 | }
26 |
27 | destination :out, {
28 | :file => 'output/xml.txt'
29 | },
30 | {
31 | :order => [:first_name, :last_name, :ssn]
32 | }
33 |
34 | transform :ssn, :sha1
35 | transform(:ssn){ |v| v[0,24] }
36 |
--------------------------------------------------------------------------------
/test/nokogiri_test.rb:
--------------------------------------------------------------------------------
1 | require File.dirname(__FILE__) + '/test_helper'
2 |
3 | # Test the flat text parsers
# Exercises the DOM-based Nokogiri XML parser via the nokogiri_all.ctl and
# nokogiri_select.ctl control files.
# NOTE(review): other files in this suite may also define a class named
# ParserTest; Test::Unit would merge them - confirm no duplicate test
# method names across files.
class ParserTest < Test::Unit::TestCase

  # Test the DOM-based Nokogiri XML parser. .
  # All people/person nodes are returned (3 rows); attribute (@ssn) and
  # nested (colours/hair) xpaths map onto row fields. Note that :age is
  # asserted as the string "24" even though the ctl declares
  # :type => :integer - presumably type coercion happens downstream of the
  # parser; confirm before changing.
  def test_nokogiri_xml_parser_for_all_nodes
    control = ETL::Control::Control.resolve(
      File.dirname(__FILE__) + '/nokogiri_all.ctl')
    parser = ETL::Parser::NokogiriXmlParser.new(control.sources.first)
    rows = parser.collect { |row| row }
    assert_equal 3, rows.length
    assert_equal(
      { :hair_colour=>"black",
        :first_name=>"Bob",
        :last_name=>"Smith",
        :ssn=>"123456789", :age=>"24"}, rows.first)
  end

  # Test the DOM-based Nokogiri XML parser. .
  # Only person nodes matching [@type="client"] are selected (2 rows).
  def test_nokogiri_xml_parser_for_selected_nodes
    control = ETL::Control::Control.resolve(
      File.dirname(__FILE__) + '/nokogiri_select.ctl')
    parser = ETL::Parser::NokogiriXmlParser.new(control.sources.first)
    rows = parser.collect { |row| row }
    assert_equal 2, rows.length
    assert_equal(
      { :age=>"37",
        :hair_colour=>"black",
        :first_name=>"Jake",
        :last_name=>"Smithsonian",
        :ssn=>"133244566"}, rows.last)
  end

end
36 |
--------------------------------------------------------------------------------
/test/output/.ignore:
--------------------------------------------------------------------------------
1 | *.txt
--------------------------------------------------------------------------------
/test/performance/delimited.ctl:
--------------------------------------------------------------------------------
1 | # puts "executing delimited.ctl"
2 |
3 | source :in, {
4 | :file => 'delimited.txt',
5 | :parser => :csv
6 | },
7 | [
8 | :first_name,
9 | :last_name,
10 | :ssn,
11 | {
12 | :name => :age,
13 | :type => :integer
14 | },
15 | :sex
16 | ]
17 |
18 | transform :ssn, :sha1
19 | transform(:ssn){ |v| v[0,24] }
20 | transform :sex, :decode, {:decode_table_path => 'delimited_decode.txt'}
21 |
22 | destination :out, {
23 | :file => 'delimited.out.txt'
24 | },
25 | {
26 | :order => [:first_name, :last_name, :name, :ssn, :age, :sex],
27 | :virtual => {
28 | :name => Proc.new { |row| "#{row[:first_name]} #{row[:last_name]}" }
29 | }
30 | }
--------------------------------------------------------------------------------
/test/processor_test.rb:
--------------------------------------------------------------------------------
1 | require File.dirname(__FILE__) + '/test_helper'
2 |
3 | # Test pre- and post-processors
# Test pre- and post-processors (currently: the bulk import processor).
class ProcessorTest < Test::Unit::TestCase

  context "the bulk import processor" do
    # FIX: description previously read "should should import successfully"
    # (shoulda prefixes the description with "should").
    should "import successfully" do
      assert_nothing_raised { do_bulk_import }
      assert_equal 3, Person.count
      assert_equal "Foxworthy", Person.find(2).last_name
    end
  end

  def test_bulk_import_with_empties
    # this test ensures that one column with an empty value will still
    # allow the row to be imported
    # this doesn't apply to the id column though - untested
    assert_nothing_raised { do_bulk_import('bulk_import_with_empties.txt') }
    assert_equal 3, Person.count
    assert Person.find(2).last_name.blank?
  end

  def test_truncate
    # TODO: implement test
  end

  private

  # Truncate the people table and bulk-import the given fixture file into
  # the :data_warehouse target, using delimited.ctl's control context.
  def do_bulk_import(file = 'bulk_import.txt')
    control = ETL::Control::Control.new(File.join(File.dirname(__FILE__), 'delimited.ctl'))
    configuration = {
      :file => "data/#{file}",
      :truncate => true,
      :target => :data_warehouse,
      :table => 'people'
    }
    processor = ETL::Processor::BulkImportProcessor.new(control, configuration)
    processor.process
  end
end
42 |
--------------------------------------------------------------------------------
/test/row_processor_test.rb:
--------------------------------------------------------------------------------
1 | require File.dirname(__FILE__) + '/test_helper'
2 |
3 | # Test row processors
# Placeholder suite for row processors.
# NOTE(review): every test body is empty, so each passes unconditionally
# and gives a false sense of coverage - implement or remove.
class RowProcessorTest < Test::Unit::TestCase
  def test_copy_field_processor
    # TODO: implement
  end
  def test_hierarchy_exploder_processor
    # TODO: implement
  end
  def test_rename_processor
    # TODO: implement
  end
  def test_sequence_processor
    # TODO: implement
  end
end
--------------------------------------------------------------------------------
/test/sax.ctl:
--------------------------------------------------------------------------------
# puts "executing sax.ctl"
2 |
3 | source :in, {
4 | :file => 'data/sax.xml',
5 | :parser => :sax
6 | },
7 | {
8 | :write_trigger => 'people/person',
9 | :fields => {
10 | :first_name => 'people/person/first_name',
11 | :last_name => 'people/person/last_name',
12 | :ssn => 'people/person/social_security_number',
13 | :age => 'people/person[age]'
14 | }
15 | }
16 |
17 | transform :ssn, :sha1
18 | transform(:ssn){ |v| v[0,24] }
19 | transform :age, :type, {:type => :number}
20 |
21 | destination :out, {
22 | :file => 'output/sax.out.txt'
23 | },
24 | {
25 | :order => [:first_name, :last_name, :ssn, :age]
26 | }
--------------------------------------------------------------------------------
/test/scd/1.txt:
--------------------------------------------------------------------------------
1 | Bob,Smith,200 South Drive,Boston,MA,32123
--------------------------------------------------------------------------------
/test/scd/2.txt:
--------------------------------------------------------------------------------
1 | Bob,Smith,1010 SW 23rd St,Los Angeles,CA,90392
--------------------------------------------------------------------------------
/test/scd/3.txt:
--------------------------------------------------------------------------------
1 | Bob,Smith,280 Pine Street,Los Angeles,CA,90392
--------------------------------------------------------------------------------
/test/scd_test_type_1.ctl:
--------------------------------------------------------------------------------
1 | source :in, {
2 | :file => "scd/#{ENV['run_number']}.txt",
3 | :parser => :csv
4 | },
5 | [
6 | :first_name,
7 | :last_name,
8 | :address,
9 | :city,
10 | :state,
11 | :zip_code
12 | ]
13 |
14 | # NOTE: These are not usually required for a type 1 SCD dimension, but since
15 | # we're sharing this table with the type 2 tests, they're necessary.
16 | transform :effective_date, :default, :default_value => Time.now.to_s(:db)
17 | transform :end_date, :default, :default_value => '9999-12-31 00:00:00'
18 | transform :latest_version, :default, :default_value => true
19 |
20 | destination :out, {
21 | :file => 'output/scd_test_type_1.txt',
22 | :natural_key => [:first_name, :last_name],
23 | :scd => {
24 | :type => 1,
25 | :dimension_target => :data_warehouse,
26 | :dimension_table => 'person_dimension'
27 | },
28 | :scd_fields => [:address, :city, :state, :zip_code]
29 | },
30 | {
31 | :order => [
32 | :id, :first_name, :last_name, :address, :city, :state, :zip_code, :effective_date, :end_date, :latest_version
33 | ],
34 | :virtual => {
35 | :id => ETL::Generator::SurrogateKeyGenerator.new(:target => :data_warehouse, :table => 'person_dimension')
36 | }
37 | }
38 |
39 | post_process :bulk_import, {
40 | :file => 'output/scd_test_type_1.txt',
41 | :target => :data_warehouse,
42 | :table => 'person_dimension'
43 | }
--------------------------------------------------------------------------------
/test/scd_test_type_2.ctl:
--------------------------------------------------------------------------------
1 | source :in, {
2 | :file => "scd/#{ENV['run_number']}.txt",
3 | :parser => :csv
4 | },
5 | [
6 | :first_name,
7 | :last_name,
8 | :address,
9 | :city,
10 | :state,
11 | :zip_code
12 | ]
13 |
14 | destination :out, {
15 | :type => :database,
16 | :target => :data_warehouse,
17 | :database => 'etl_unittest',
18 | :table => 'person_dimension',
19 | :natural_key => [:first_name, :last_name],
20 | :scd => {
21 | :type => 2,
22 | :dimension_target => :data_warehouse,
23 | :dimension_table => 'person_dimension'
24 | },
25 | :scd_fields => ENV['type_2_scd_fields'] ? Marshal.load(ENV['type_2_scd_fields']) : [:address, :city, :state, :zip_code]
26 | },
27 | {
28 | :order => [
29 | :id, :first_name, :last_name, :address, :city, :state, :zip_code, :effective_date, :end_date, :latest_version
30 | ],
31 | :virtual => {
32 | :id => ETL::Generator::SurrogateKeyGenerator.new(:target => :data_warehouse, :table => 'person_dimension')
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/test/screen_test.rb:
--------------------------------------------------------------------------------
1 | require File.dirname(__FILE__) + '/test_helper'
2 |
# Runs screen_test_fatal.ctl, whose :fatal screen is a RowCountScreen
# expecting 1 row. A failed :fatal screen should make the engine report
# exit code 2.
# NOTE(review): presumably the screen fails because that control defines
# no source, so 0 rows are counted - confirm against RowCountScreen.
class ScreenTest < Test::Unit::TestCase
  def test_screen
    ETL::Engine.process(File.dirname(__FILE__) + '/screen_test_fatal.ctl')
    assert_equal 2, ETL::Engine.exit_code
  end
end
--------------------------------------------------------------------------------
/test/screen_test_error.ctl:
--------------------------------------------------------------------------------
# Non-fatal :error screen: RowCountScreen requires exactly 1 row.
screen(:error){
  ETL::Screen::RowCountScreen.new(self, :rows => 1)
}
--------------------------------------------------------------------------------
/test/screen_test_fatal.ctl:
--------------------------------------------------------------------------------
# :fatal screen: RowCountScreen requires exactly 1 row; its failure is
# asserted (exit code 2) by screen_test.rb.
screen(:fatal){
  ETL::Screen::RowCountScreen.new(self, :rows => 1)
}
--------------------------------------------------------------------------------
/test/test_helper.rb:
--------------------------------------------------------------------------------
$:.unshift(File.dirname(__FILE__) + '/../lib')
$:.unshift(File.dirname(__FILE__))

require 'test/unit'
require 'pp'
require 'etl'
require 'shoulda'
require 'flexmock/test_unit'

# The suite must be told which database adapter to run against, e.g. DB=mysql.
raise "Missing required DB environment variable" unless ENV['DB']

# Boot the engine against the test database configuration and silence the
# logger for the duration of the run.
database_yml = File.dirname(__FILE__) + '/config/database.yml'
ETL::Engine.init(:config => database_yml)
ETL::Engine.logger = Logger.new(STDOUT)
# ETL::Engine.logger.level = Logger::DEBUG
ETL::Engine.logger.level = Logger::FATAL

# Start every run with a clean job-execution history.
ActiveRecord::Base.establish_connection :operational_database
ETL::Execution::Job.delete_all

require 'mocks/mock_source'
require 'mocks/mock_destination'

# shortcut to launch a ctl file
# NOTE(review): refers to bare `Engine`, not `ETL::Engine` - this only
# resolves if the ETL namespace is exposed at top level; confirm, otherwise
# this helper raises NameError when first used.
def process(file)
  Engine.process(File.join(File.dirname(__FILE__), file))
end

puts "ActiveRecord::VERSION = #{ActiveRecord::VERSION::STRING}"

# Maps the `people` table exercised by the integration tests.
class Person < ActiveRecord::Base
end

# Adapter name under test, as supplied via the DB environment variable.
def current_adapter
  ENV['DB']
end
37 |
--------------------------------------------------------------------------------
/test/truncate_processor_test.rb:
--------------------------------------------------------------------------------
1 | require File.dirname(__FILE__) + '/test_helper'
2 |
3 | include ETL::Processor
4 |
5 | class TruncateTest < ActiveRecord::Base
6 | set_table_name 'truncate_test'
7 | end
8 |
# Tests TruncateProcessor against the truncate_test table.
class TruncateProcessorTest < Test::Unit::TestCase

  # Insert one fixed row into truncate_test and return it.
  def create_item!
    TruncateTest.create!(:x => 'ABC')
  end

  # Run TruncateProcessor on truncate_test; `options` is forwarded via the
  # :options key (e.g. 'CONTINUE IDENTITY' below - presumably appended to
  # the TRUNCATE statement; confirm in the processor).
  def truncate!(options=nil)
    TruncateProcessor.new(nil,
      :target => :data_warehouse,
      :table => TruncateTest.table_name,
      :options => options
    ).process
  end

  should 'reset ids by default' do
    create_item!
    truncate!
    # After a default truncate, the id sequence restarts at 1.
    assert_equal 1, create_item!.id
  end

  # NOTE(review): evaluated at class-definition time, so this test is
  # silently absent on non-postgres adapters.
  if ETL::Engine.connection(:data_warehouse).class.name =~ /postgres/i
    should 'allow disabling id reset for postgres' do
      truncate!
      create_item!
      truncate!('CONTINUE IDENTITY')
      # CONTINUE IDENTITY keeps the sequence, so the next id is 2.
      assert_equal 2, create_item!.id
    end
  end
end
--------------------------------------------------------------------------------
/test/xml.ctl:
--------------------------------------------------------------------------------
# puts "executing xml.ctl"
2 |
3 | source :in, {
4 | :file => 'data/xml.xml',
5 | :parser => :xml
6 | },
7 | {
8 | :collection => 'people/person',
9 | :fields => [
10 | :first_name,
11 | :last_name,
12 | {
13 | :name => :ssn,
14 | :xpath => 'social_security_number'
15 | },
16 | {
17 | :name => :age,
18 | :type => :integer
19 | }
20 | ]
21 | }
22 |
23 | destination :out, {
24 | :file => 'output/xml.txt'
25 | },
26 | {
27 | :order => [:first_name, :last_name, :ssn]
28 | }
29 |
30 | transform :ssn, :sha1
31 | transform(:ssn){ |v| v[0,24] }
--------------------------------------------------------------------------------