├── .gitignore ├── .standalone_migrations ├── .travis.yml ├── 0.9-UPGRADE ├── CHANGELOG ├── Gemfile ├── Guardfile ├── HOW_TO_RELEASE ├── LICENSE ├── README.textile ├── Rakefile ├── TODO ├── activewarehouse-etl.gemspec ├── bin ├── etl └── etl.cmd ├── db ├── migrate │ └── 20120229203554_create_tables.rb └── schema.rb ├── examples └── database.example.yml ├── lib ├── etl.rb └── etl │ ├── batch.rb │ ├── batch │ ├── batch.rb │ └── directives.rb │ ├── builder.rb │ ├── builder │ ├── date_dimension_builder.rb │ └── time_dimension_builder.rb │ ├── commands │ └── etl.rb │ ├── control.rb │ ├── control │ ├── control.rb │ ├── destination.rb │ ├── destination │ │ ├── csv_destination.rb │ │ ├── database_destination.rb │ │ ├── excel_destination.rb │ │ ├── file_destination.rb │ │ ├── insert_update_database_destination.rb │ │ ├── update_database_destination.rb │ │ └── yaml_destination.rb │ ├── source.rb │ └── source │ │ ├── database_source.rb │ │ ├── enumerable_source.rb │ │ ├── file_source.rb │ │ ├── model_source.rb │ │ └── mysql_streamer.rb │ ├── core_ext.rb │ ├── core_ext │ ├── time.rb │ └── time │ │ └── calculations.rb │ ├── engine.rb │ ├── execution.rb │ ├── execution │ ├── base.rb │ ├── batch.rb │ ├── job.rb │ └── migration.rb │ ├── generator.rb │ ├── generator │ ├── generator.rb │ └── surrogate_key_generator.rb │ ├── http_tools.rb │ ├── parser.rb │ ├── parser │ ├── apache_combined_log_parser.rb │ ├── csv_parser.rb │ ├── excel_parser.rb │ ├── fixed_width_parser.rb │ ├── nokogiri_xml_parser.rb │ ├── parser.rb │ ├── sax_parser.rb │ └── xml_parser.rb │ ├── processor.rb │ ├── processor │ ├── block_processor.rb │ ├── bulk_import_processor.rb │ ├── check_exist_processor.rb │ ├── check_unique_processor.rb │ ├── copy_field_processor.rb │ ├── database_join_processor.rb │ ├── encode_processor.rb │ ├── ensure_fields_presence_processor.rb │ ├── escape_csv_processor.rb │ ├── filter_row_processor.rb │ ├── ftp_downloader_processor.rb │ ├── ftp_uploader_processor.rb │ ├── 
hierarchy_exploder_processor.rb │ ├── imapattachment_downloader_processor.rb │ ├── pop3attachment_downloader_processor.rb │ ├── print_row_processor.rb │ ├── processor.rb │ ├── rename_processor.rb │ ├── require_non_blank_processor.rb │ ├── row_processor.rb │ ├── sequence_processor.rb │ ├── sftp_downloader_processor.rb │ ├── sftp_uploader_processor.rb │ ├── surrogate_key_processor.rb │ ├── truncate_processor.rb │ └── zip_file_processor.rb │ ├── row.rb │ ├── screen.rb │ ├── screen │ └── row_count_screen.rb │ ├── transform.rb │ ├── transform │ ├── block_transform.rb │ ├── calculation_transform.rb │ ├── date_to_string_transform.rb │ ├── decode_transform.rb │ ├── default_transform.rb │ ├── foreign_key_lookup_transform.rb │ ├── hierarchy_lookup_transform.rb │ ├── md5_transform.rb │ ├── ordinalize_transform.rb │ ├── sha1_transform.rb │ ├── split_fields_transform.rb │ ├── string_to_date_time_transform.rb │ ├── string_to_date_transform.rb │ ├── string_to_time_transform.rb │ ├── transform.rb │ ├── trim_transform.rb │ └── type_transform.rb │ ├── util.rb │ └── version.rb └── test ├── .gitignore ├── .ignore ├── all.ebf ├── apache_combined_log.ctl ├── batch_test.rb ├── batch_with_error.ebf ├── batched1.ctl ├── batched2.ctl ├── block_processor.ctl ├── block_processor_error.ctl ├── block_processor_pre_post_process.ctl ├── block_processor_remove_rows.ctl ├── block_processor_test.rb ├── check_exist_processor_test.rb ├── check_unique_processor_test.rb ├── config ├── .gitignore ├── database.yml └── gemfiles │ ├── Gemfile.rails-3.0.x │ ├── Gemfile.rails-3.1.x │ ├── Gemfile.rails-3.2.x │ ├── Gemfile.rails-4.0.x │ └── common.rb ├── control_test.rb ├── data ├── apache_combined_log.txt ├── bulk_import.txt ├── bulk_import_with_empties.txt ├── decode.txt ├── delimited.txt ├── encode_source_latin1.txt ├── excel.xls ├── excel2.xls ├── fixed_width.txt ├── multiple_delimited_1.txt ├── multiple_delimited_2.txt ├── nokogiri.xml ├── people.txt ├── sax.xml └── xml.xml ├── 
database_join_processor_test.rb ├── date_dimension_builder_test.rb ├── delimited.ctl ├── delimited_absolute.ctl ├── delimited_destination_db.ctl ├── delimited_excel.ctl ├── delimited_insert_update.ctl ├── delimited_update.ctl ├── delimited_with_bulk_load.ctl ├── destination_test.rb ├── directive_test.rb ├── encode_processor_test.rb ├── engine_test.rb ├── ensure_fields_presence_processor_test.rb ├── errors.ctl ├── etl_test.rb ├── excel.ctl ├── excel2.ctl ├── fixed_width.ctl ├── foreign_key_lookup_transform_test.rb ├── generator_test.rb ├── inline_parser.ctl ├── mocks ├── mock_destination.rb └── mock_source.rb ├── model_source.ctl ├── multiple_delimited.ctl ├── multiple_source_delimited.ctl ├── nokogiri_all.ctl ├── nokogiri_select.ctl ├── nokogiri_test.rb ├── output └── .ignore ├── parser_test.rb ├── performance └── delimited.ctl ├── processor_test.rb ├── row_processor_test.rb ├── sax.ctl ├── scd ├── 1.txt ├── 2.txt └── 3.txt ├── scd_test.rb ├── scd_test_type_1.ctl ├── scd_test_type_2.ctl ├── screen_test.rb ├── screen_test_error.ctl ├── screen_test_fatal.ctl ├── source_test.rb ├── test_helper.rb ├── transform_test.rb ├── truncate_processor_test.rb └── xml.ctl /.gitignore: -------------------------------------------------------------------------------- 1 | pkg/* 2 | source_data 3 | test/output/* 4 | rdoc 5 | .rvmrc 6 | .bundle 7 | *.gem 8 | *.lock -------------------------------------------------------------------------------- /.standalone_migrations: -------------------------------------------------------------------------------- 1 | config: 2 | database: test/config/database.yml -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: ruby 2 | gemfile: 3 | - test/config/gemfiles/Gemfile.rails-4.0.x 4 | - test/config/gemfiles/Gemfile.rails-3.2.x 5 | - test/config/gemfiles/Gemfile.rails-3.1.x 6 | - 
test/config/gemfiles/Gemfile.rails-3.0.x 7 | rvm: 8 | - 1.9.3 9 | - 1.8.7 10 | env: 11 | - DB=mysql2 12 | - DB=postgresql 13 | before_script: 14 | - bundle exec rake db:create RAILS_ENV=$DB 15 | - bundle exec rake db:create RAILS_ENV=etl_execution 16 | - bundle exec rake db:schema:load RAILS_ENV=$DB 17 | 18 | branches: 19 | only: 20 | - master 21 | -------------------------------------------------------------------------------- /0.9-UPGRADE: -------------------------------------------------------------------------------- 1 | The 0.9 revision of ActiveWarehouse ETL significantly changes how connections are maintained. This release is not backwards compatible. 2 | 3 | To upgrade, you must do the following: 4 | 5 | 1.) All database connections used in ETL control files must be declared in database.yml in the directory that contains your ETL control files. 6 | 2.) All sources, destinations, transforms and processors that use a database connection must include the configuration name/value pair of :target => 'name' where name is replaced with the connection name defined in database.yml. Connection information should no longer be included in control files. 
-------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org" 2 | 3 | # Specify your gem's dependencies in ..gemspec 4 | gemspec 5 | -------------------------------------------------------------------------------- /Guardfile: -------------------------------------------------------------------------------- 1 | db = ENV['DB'] || 'mysql2' 2 | gemfile = ENV['GEMFILE'] || 'test/config/gemfiles/Gemfile.rails-3.2.x' 3 | 4 | guard 'shell' do 5 | watch(/(lib|test)\/\.*/) {|m| `bundle exec rake ci:run_one[#{db},#{gemfile}]` } 6 | end 7 | -------------------------------------------------------------------------------- /HOW_TO_RELEASE: -------------------------------------------------------------------------------- 1 | * update lib/etl/version 2 | * push your changes 3 | * then use bundler to build + git tag + push to rubygems 4 | 5 | rake release 6 | 7 | * if you remain stuck at "Pushed git commits and tags", the task may silently wait for your password. 
Check this if it's the case: 8 | 9 | https://github.com/carlhuda/bundler/issues/980 10 | 11 | * you can list changes using github: 12 | 13 | https://github.com/activewarehouse/activewarehouse-etl/compare/release-0.9.1...master -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2006-2011 Anthony Eden, Thibaut Barrère 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require 'bundler' 2 | Bundler::GemHelper.install_tasks 3 | require 'rake' 4 | require 'rake/testtask' 5 | 6 | def system!(cmd) 7 | puts cmd 8 | raise "Command failed!" 
unless system(cmd) 9 | end 10 | 11 | require 'tasks/standalone_migrations' 12 | 13 | # experimental tasks to reproduce the Travis behaviour locally 14 | namespace :ci do 15 | 16 | desc "For current RVM, run the tests for one db and one gemfile" 17 | task :run_one, :db, :gemfile do |t, args| 18 | Bundler.with_clean_env do 19 | ENV['BUNDLE_GEMFILE'] = File.expand_path(args[:gemfile] || (File.dirname(__FILE__) + '/test/config/gemfiles/Gemfile.rails-3.2.x')) 20 | ENV['DB'] = args[:db] || 'mysql2' 21 | system! "bundle install" 22 | system! "bundle exec rake db:create" 23 | system! "bundle exec rake db:create RAILS_ENV=etl_execution" 24 | system! "bundle exec rake db:schema:load" 25 | system! "bundle exec rake" 26 | end 27 | end 28 | 29 | desc "For current RVM, run the tests for all the combination in travis configuration" 30 | task :run_matrix do 31 | require 'cartesian' 32 | config = YAML.load_file('.travis.yml') 33 | config['env'].cartesian(config['gemfile']).each do |*x| 34 | env, gemfile = *x.flatten 35 | db = env.gsub('DB=', '') 36 | print [db, gemfile].inspect.ljust(40) + ": " 37 | cmd = "rake \"ci:run_one[#{db},#{gemfile}]\"" 38 | result = system "#{cmd} > /dev/null 2>&1" 39 | result = result ? "OK" : "FAILED! - re-run with: #{cmd}" 40 | puts result 41 | end 42 | end 43 | 44 | end 45 | 46 | task :default => :test 47 | 48 | desc 'Test the ETL application.' 49 | Rake::TestTask.new(:test) do |t| 50 | t.libs << 'lib' << '.' 51 | t.pattern = 'test/**/*_test.rb' 52 | t.verbose = true 53 | # TODO: reset the database 54 | end 55 | -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | TODO 2 | 3 | * Add build-in support for audit_dimension 4 | * Do not rerun the processing if it isn't needed, i.e. 
the source and control files have not been modified (allow forced override) 5 | * Provide greater control in error handling 6 | ** Allow a error threshold 7 | ** Don't die completely if a parse error, just stop processing that specific file if error threshold is reached 8 | ** Allow mismatch row length error in delimited parser to be ignored 9 | * Improve error messages throughout, but especially in problems with the control files 10 | * Add support for paritioned views during the insert process. Use specifiable columns as the trigger columns for determining the data output destination. 11 | * Check if a temp table exists and the last job run was successful, in which case skip during the current run 12 | * Create models for each of the tables in each of the databases defined in ETL::Engine.connections 13 | 14 | Audit Record 15 | 16 | Process-Level 17 | * Start Time 18 | * End Time 19 | * (Duration) 20 | * Rows Read 21 | * Rows Written 22 | * Rows Rejected 23 | * Errors 24 | * Destination 25 | Record-Level 26 | * Source 27 | * Timestamp 28 | * Transformation Log 29 | -------------------------------------------------------------------------------- /activewarehouse-etl.gemspec: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | lib = File.expand_path('../lib/', __FILE__) 3 | $:.unshift lib unless $:.include?(lib) 4 | 5 | require 'etl/version' 6 | 7 | Gem::Specification.new do |s| 8 | s.name = %q{activewarehouse-etl} 9 | s.version = ETL::VERSION 10 | s.platform = Gem::Platform::RUBY 11 | s.authors = ["Anthony Eden", "Thibaut Barrère"] 12 | s.email = ["thibaut.barrere@gmail.com"] 13 | s.homepage = "https://github.com/activewarehouse/activewarehouse-etl" 14 | s.summary = %q{Pure Ruby ETL package.} 15 | s.description = %q{ActiveWarehouse ETL is a pure Ruby Extract-Transform-Load application for loading data into a database.} 16 | 17 | s.required_rubygems_version = ">= 1.3.6" 18 | 19 | 
s.add_runtime_dependency('rake', '>= 0.8.3') 20 | s.add_runtime_dependency('activesupport', '>= 3.0.0') 21 | s.add_runtime_dependency('activerecord', '>= 3.0.0') 22 | s.add_runtime_dependency('fastercsv', '>= 1.2.0') 23 | s.add_runtime_dependency('adapter_extensions', '>= 0.9.5.rc1') 24 | 25 | s.add_development_dependency('shoulda', '~>2.11.3') 26 | s.add_development_dependency('flexmock', '~>0.9.0') 27 | s.add_development_dependency('cartesian') 28 | s.add_development_dependency('guard') 29 | s.add_development_dependency('guard-shell') 30 | s.add_development_dependency('standalone_migrations', '1.0.5') 31 | s.add_development_dependency('roo') 32 | 33 | s.files = `git ls-files`.split("\n") 34 | s.test_files = `git ls-files -- {test}/*`.split("\n") 35 | s.executables = %w(etl) 36 | s.require_path = "lib" 37 | end 38 | -------------------------------------------------------------------------------- /bin/etl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | #-- 4 | # Copyright (c) 2006 Anthony Eden 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining 7 | # a copy of this software and associated documentation files (the 8 | # "Software"), to deal in the Software without restriction, including 9 | # without limitation the rights to use, copy, modify, merge, publish, 10 | # distribute, sublicense, and/or sell copies of the Software, and to 11 | # permit persons to whom the Software is furnished to do so, subject to 12 | # the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be 15 | # included in all copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | # NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 21 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 22 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 23 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 24 | #++ 25 | 26 | $:.unshift(File.dirname(__FILE__) + '/../lib/') 27 | require 'etl' 28 | require 'etl/commands/etl' -------------------------------------------------------------------------------- /bin/etl.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem The purpose of this Windows script is to let you use the etl command line with a non-gem version of AW-ETL (eg: unpacked gem, pistoned trunk). 4 | rem Just add the current folder on top of your PATH variable to use it instead of the etl command provided with the gem release. 5 | 6 | rem %~dp0 returns the absolute path where the current script is. We just append 'etl' to it, and forward all the arguments with %* 7 | 8 | ruby "%~dp0etl" %* 9 | -------------------------------------------------------------------------------- /db/migrate/20120229203554_create_tables.rb: -------------------------------------------------------------------------------- 1 | class CreateTables < ActiveRecord::Migration 2 | def self.up 3 | create_table(:people, :force => true) do |t| 4 | t.column :first_name, :string 5 | t.column :last_name, :string 6 | t.column :ssn, :string, :limit => 64 7 | end 8 | 9 | create_table(:places, :force => true) do |t| 10 | t.column :address, :text 11 | t.column :city, :string 12 | t.column :state, :string 13 | t.column :country, :string, :limit => 2 14 | end 15 | 16 | create_table(:person_dimension, :force => true) do |t| 17 | t.column :first_name, :string, :limit => 50 18 | t.column :last_name, :string, :limit => 50 19 | t.column :address, :string, :limit => 100 20 | t.column :city, :string, :limit => 50 21 | t.column :state, :string, :limit => 50 22 | t.column :zip_code, 
:string, :limit => 20 23 | 24 | t.column :effective_date, :timestamp 25 | t.column :end_date, :timestamp 26 | t.column :latest_version, :boolean 27 | end 28 | 29 | create_table(:truncate_test, :force => true) do |t| 30 | t.column :x, :string, :limit => 4 31 | end 32 | end 33 | 34 | def self.down 35 | raise ActiveRecord::IrreversibleMigration 36 | end 37 | end 38 | -------------------------------------------------------------------------------- /db/schema.rb: -------------------------------------------------------------------------------- 1 | # encoding: UTF-8 2 | # This file is auto-generated from the current state of the database. Instead 3 | # of editing this file, please use the migrations feature of Active Record to 4 | # incrementally modify your database, and then regenerate this schema definition. 5 | # 6 | # Note that this schema.rb definition is the authoritative source for your 7 | # database schema. If you need to create the application database on another 8 | # system, you should be using db:schema:load, not running all the migrations 9 | # from scratch. The latter is a flawed and unsustainable approach (the more migrations 10 | # you'll amass, the slower it'll run and the greater likelihood for issues). 11 | # 12 | # It's strongly recommended to check this file into your version control system. 
13 | 14 | ActiveRecord::Schema.define(:version => 20120229203554) do 15 | 16 | create_table "people", :force => true do |t| 17 | t.string "first_name" 18 | t.string "last_name" 19 | t.string "ssn", :limit => 64 20 | end 21 | 22 | create_table "person_dimension", :force => true do |t| 23 | t.string "first_name", :limit => 50 24 | t.string "last_name", :limit => 50 25 | t.string "address", :limit => 100 26 | t.string "city", :limit => 50 27 | t.string "state", :limit => 50 28 | t.string "zip_code", :limit => 20 29 | t.datetime "effective_date" 30 | t.datetime "end_date" 31 | t.boolean "latest_version" 32 | end 33 | 34 | create_table "places", :force => true do |t| 35 | t.text "address" 36 | t.string "city" 37 | t.string "state" 38 | t.string "country", :limit => 2 39 | end 40 | 41 | create_table "truncate_test", :force => true do |t| 42 | t.string "x", :limit => 4 43 | end 44 | 45 | end 46 | -------------------------------------------------------------------------------- /examples/database.example.yml: -------------------------------------------------------------------------------- 1 | etl_execution: 2 | adapter: mysql2 3 | username: root 4 | host: localhost 5 | database: etl_execution 6 | encoding: utf8 7 | datawarehouse: 8 | adapter: mysql2 9 | username: root 10 | host: localhost 11 | database: datawarehouse_development 12 | operational: 13 | adapter: mysql2 14 | username: root 15 | host: localhost 16 | database: operational_production -------------------------------------------------------------------------------- /lib/etl.rb: -------------------------------------------------------------------------------- 1 | # This source file requires all of the necessary gems and source files for ActiveWarehouse ETL. If you 2 | # load this source file all of the other required files and gems will also be brought into the 3 | # runtime. 
4 | 5 | #-- 6 | # Copyright (c) 2006-2007 Anthony Eden 7 | # 8 | # Permission is hereby granted, free of charge, to any person obtaining 9 | # a copy of this software and associated documentation files (the 10 | # "Software"), to deal in the Software without restriction, including 11 | # without limitation the rights to use, copy, modify, merge, publish, 12 | # distribute, sublicense, and/or sell copies of the Software, and to 13 | # permit persons to whom the Software is furnished to do so, subject to 14 | # the following conditions: 15 | # 16 | # The above copyright notice and this permission notice shall be 17 | # included in all copies or substantial portions of the Software. 18 | # 19 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 20 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 21 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 22 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 23 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 24 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 25 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
26 | #++ 27 | 28 | require 'logger' 29 | require 'yaml' 30 | require 'erb' 31 | 32 | require 'rubygems' 33 | 34 | unless defined?(REXML::VERSION) 35 | require 'rexml/rexml' 36 | unless defined?(REXML::VERSION) 37 | REXML::VERSION = REXML::Version 38 | end 39 | end 40 | 41 | require 'active_support' 42 | require 'active_record' 43 | require 'adapter_extensions' 44 | 45 | if ActiveSupport::VERSION::STRING >= '3.2' 46 | # support for cattr_accessor 47 | require 'active_support/core_ext/class/attribute_accessors' 48 | end 49 | 50 | if RUBY_VERSION < '1.9' 51 | require 'faster_csv' 52 | CSV = FasterCSV unless defined?(CSV) 53 | else 54 | require 'csv' 55 | end 56 | 57 | # patch for https://github.com/activewarehouse/activewarehouse-etl/issues/24 58 | # allow components to require optional gems 59 | class Object 60 | def optional_require(feature) 61 | begin 62 | require feature 63 | rescue LoadError 64 | end 65 | end 66 | end 67 | 68 | $:.unshift(File.dirname(__FILE__)) 69 | 70 | require 'etl/core_ext' 71 | require 'etl/util' 72 | require 'etl/http_tools' 73 | require 'etl/builder' 74 | require 'etl/version' 75 | require 'etl/engine' 76 | require 'etl/control' 77 | require 'etl/batch' 78 | require 'etl/row' 79 | require 'etl/parser' 80 | require 'etl/transform' 81 | require 'etl/processor' 82 | require 'etl/generator' 83 | require 'etl/screen' 84 | 85 | module ETL #:nodoc: 86 | class ETLError < StandardError #:nodoc: 87 | end 88 | class ControlError < ETLError #:nodoc: 89 | end 90 | class DefinitionError < ControlError #:nodoc: 91 | end 92 | class ConfigurationError < ControlError #:nodoc: 93 | end 94 | class MismatchError < ETLError #:nodoc: 95 | end 96 | class ResolverError < ETLError #:nodoc: 97 | end 98 | class ScreenError < ETLError #:nodoc: 99 | end 100 | class FatalScreenError < ScreenError #:nodoc: 101 | end 102 | end -------------------------------------------------------------------------------- /lib/etl/batch.rb: 
-------------------------------------------------------------------------------- 1 | require 'etl/batch/batch' 2 | require 'etl/batch/directives' -------------------------------------------------------------------------------- /lib/etl/batch/batch.rb: -------------------------------------------------------------------------------- 1 | module ETL #:nodoc: 2 | module Batch 3 | class Context 4 | attr_reader :batch 5 | 6 | class << self 7 | # Create a context that is used when evaluating the batch file 8 | def create(batch) 9 | Context.new(batch).get_binding 10 | end 11 | end 12 | 13 | def initialize(batch) 14 | @batch = batch 15 | end 16 | 17 | def file 18 | batch.file 19 | end 20 | 21 | def get_binding 22 | binding 23 | end 24 | 25 | def run(file) 26 | batch.run(File.dirname(self.file) + "/" + file) 27 | end 28 | 29 | def use_temp_tables(value=true) 30 | batch.use_temp_tables(value) 31 | end 32 | 33 | end 34 | class Batch 35 | attr_accessor :file 36 | attr_accessor :engine 37 | 38 | class << self 39 | # Resolve the given object to an ETL::Control::Control instance. 
Acceptable arguments 40 | # are: 41 | # * The path to a control file as a String 42 | # * A File object referencing the control file 43 | # * The ETL::Control::Control object (which will just be returned) 44 | # 45 | # Raises a ControlError if any other type is given 46 | def resolve(batch, engine) 47 | batch = do_resolve(batch) 48 | batch.engine = engine 49 | batch 50 | end 51 | 52 | protected 53 | def parse(batch_file) 54 | batch_file = batch_file.path if batch_file.instance_of?(File) 55 | batch = ETL::Batch::Batch.new(batch_file) 56 | eval(IO.readlines(batch_file).join("\n"), Context.create(batch), batch_file) 57 | batch 58 | end 59 | 60 | def do_resolve(batch) 61 | case batch 62 | when String 63 | ETL::Batch::Batch.parse(File.new(batch)) 64 | when File 65 | ETL::Batch::Batch.parse(batch) 66 | when ETL::Batch::Batch 67 | batch 68 | else 69 | raise RuntimeError, "Batch must be a String, File or Batch object" 70 | end 71 | end 72 | end 73 | 74 | def initialize(file) 75 | @file = file 76 | end 77 | 78 | def run(file) 79 | directives << Run.new(self, file) 80 | end 81 | 82 | def use_temp_tables(value = true) 83 | directives << UseTempTables.new(self) 84 | end 85 | 86 | def execute 87 | engine.say "Executing batch" 88 | before_execute 89 | directives.each do |directive| 90 | directive.execute 91 | end 92 | engine.say "Finishing batch" 93 | after_execute 94 | engine.say "Batch complete" 95 | end 96 | 97 | def directives 98 | @directives ||= [] 99 | end 100 | 101 | def before_execute 102 | 103 | end 104 | 105 | def after_execute 106 | ETL::Engine.finish # TODO: should be moved to the directive? 
107 | ETL::Engine.use_temp_tables = false # reset the temp tables 108 | end 109 | end 110 | end 111 | end -------------------------------------------------------------------------------- /lib/etl/batch/directives.rb: -------------------------------------------------------------------------------- 1 | module ETL #:nodoc: 2 | module Batch #:nodoc: 3 | # Abstract base class for directives 4 | class Directive 5 | # Method to access the batch object 6 | attr_reader :batch 7 | 8 | # Initialize the directive with the given batch object 9 | def initialize(batch) 10 | @batch = batch 11 | end 12 | 13 | # Execute the directive 14 | def execute 15 | do_execute 16 | end 17 | 18 | protected 19 | # Implemented by subclasses 20 | def do_execute 21 | raise RuntimeError, "Directive must implement do_execute method" 22 | end 23 | end 24 | 25 | # Directive indicating that the specified ETL control file should be 26 | # run 27 | class Run < Directive 28 | # The file to execute 29 | attr_reader :file 30 | 31 | # Initialize the directive with the given batch object and file 32 | def initialize(batch, file) 33 | super(batch) 34 | @file = file 35 | end 36 | 37 | protected 38 | # Execute the process 39 | def do_execute 40 | current_batch = ETL::Engine.batch 41 | batch.engine.process(file) 42 | 43 | job = ETL::Engine.batch 44 | if (job.kind_of? ETL::Execution::Batch and 45 | current_batch[:id] != job[:id]) 46 | job[:batch_id] = current_batch[:id] 47 | job.save! 48 | end 49 | 50 | ETL::Engine.batch = current_batch 51 | end 52 | end 53 | 54 | # Directive indicating temp tables should be used. 
55 | class UseTempTables < Directive 56 | def initialize(batch) 57 | super(batch) 58 | end 59 | protected 60 | def do_execute 61 | ETL::Engine.use_temp_tables = true 62 | end 63 | end 64 | end 65 | end 66 | -------------------------------------------------------------------------------- /lib/etl/builder.rb: -------------------------------------------------------------------------------- 1 | require 'etl/builder/date_dimension_builder' 2 | require 'etl/builder/time_dimension_builder' -------------------------------------------------------------------------------- /lib/etl/builder/time_dimension_builder.rb: -------------------------------------------------------------------------------- 1 | module ETL #:nodoc: 2 | module Builder #:nodoc: 3 | # Builder that creates a simple time dimension. 4 | class TimeDimensionBuilder 5 | def initialize 6 | # Returns an array of hashes representing records in the dimension. The values for each record are 7 | # accessed by name. 8 | def build(options={}) 9 | records = [] 10 | 0.upto(23) do |t_hour| 11 | 0.upto(59) do |t_minute| 12 | 0.upto(59) do |t_second| 13 | t_hour_string = t_hour.to_s.rjust(2, '0') 14 | t_minute_string = t_minute.to_s.rjust(2, '0') 15 | t_second_string = t_second.to_s.rjust(2, '0') 16 | record = {} 17 | record[:hour] = t_hour 18 | record[:minute] = t_minute 19 | record[:second] = t_second 20 | record[:minute_description] = "#{t_hour_string}:#{t_minute_string}" 21 | record[:full_description] = "#{t_hour_string}:#{t_minute_string}:#{t_second_string}" 22 | records << record 23 | end 24 | end 25 | end 26 | records 27 | end 28 | end 29 | end 30 | end 31 | end -------------------------------------------------------------------------------- /lib/etl/commands/etl.rb: -------------------------------------------------------------------------------- 1 | #-- 2 | # Copyright (c) 2006 Anthony Eden 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining 5 | # a copy of this software and associated 
documentation files (the 6 | # "Software"), to deal in the Software without restriction, including 7 | # without limitation the rights to use, copy, modify, merge, publish, 8 | # distribute, sublicense, and/or sell copies of the Software, and to 9 | # permit persons to whom the Software is furnished to do so, subject to 10 | # the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be 13 | # included in all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 19 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 20 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 21 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
22 | #++ 23 | 24 | require 'benchmark' 25 | require 'getoptlong' 26 | 27 | # Print a usage statement 28 | def usage #:nodoc: 29 | puts "Usage: etl file [file file ...]" # TODO: add the command line options 30 | end 31 | 32 | def execute 33 | opts = GetoptLong.new( 34 | [ '--version', '-v', GetoptLong::NO_ARGUMENT], 35 | [ '--help', '-h', GetoptLong::NO_ARGUMENT ], 36 | [ '--config', '-c', GetoptLong::REQUIRED_ARGUMENT ], 37 | [ '--limit', '-l', GetoptLong::REQUIRED_ARGUMENT ], 38 | [ '--offset', '-o', GetoptLong::REQUIRED_ARGUMENT], 39 | [ '--newlog', '-n', GetoptLong::NO_ARGUMENT ], 40 | [ '--skip-bulk-import', '-s', GetoptLong::NO_ARGUMENT ], 41 | [ '--read-locally', GetoptLong::NO_ARGUMENT], 42 | [ '--rails-root', GetoptLong::REQUIRED_ARGUMENT] 43 | ) 44 | 45 | options = {} 46 | opts.each do |opt, arg| 47 | case opt 48 | when '--version' 49 | puts "ActiveWarehouse ETL version #{ETL::VERSION::STRING}" 50 | return 51 | when '--help' 52 | usage 53 | return 54 | when '--config' 55 | options[:config] = arg 56 | when '--limit' 57 | options[:limit] = arg.to_i 58 | when '--offset' 59 | options[:offset] = arg.to_i 60 | when '--newlog' 61 | options[:newlog] = true 62 | when '--skip-bulk-import' 63 | puts "skip bulk import enabled" 64 | options[:skip_bulk_import] = true 65 | when '--read-locally' 66 | puts "read locally enabled" 67 | options[:read_locally] = true 68 | when '--rails-root' 69 | options[:rails_root] = arg 70 | puts "rails root set to #{options[:rails_root]}" 71 | end 72 | end 73 | 74 | if ARGV.length < 1 75 | usage 76 | else 77 | puts "Starting ETL process" 78 | 79 | ETL::Engine.init(options) 80 | ARGV.each do |f| 81 | ETL::Engine.realtime_activity = true 82 | ETL::Engine.process(f) 83 | exit(ETL::Engine.exit_code) if ETL::Engine.exit_code 84 | end 85 | 86 | puts "ETL process complete\n\n" 87 | end 88 | end 89 | 90 | execute -------------------------------------------------------------------------------- /lib/etl/control.rb: 
-------------------------------------------------------------------------------- 1 | require 'etl/control/control' 2 | require 'etl/control/source' 3 | require 'etl/control/destination' -------------------------------------------------------------------------------- /lib/etl/control/destination/csv_destination.rb: -------------------------------------------------------------------------------- 1 | # This source file contains the ETL::Control::CsvDestination 2 | 3 | module ETL #:nodoc: 4 | module Control #:nodoc: 5 | # CSV File as the final destination. 6 | class CsvDestination < Destination 7 | # The File to write to 8 | attr_reader :file 9 | 10 | # The output order 11 | attr_reader :order 12 | 13 | # Flag which indicates to append (default is to overwrite) 14 | attr_accessor :append 15 | 16 | # The separator 17 | attr_accessor :separator 18 | 19 | # The end of line marker 20 | attr_accessor :eol 21 | 22 | # The enclosure character 23 | attr_accessor :enclose 24 | 25 | # Initialize the object. 26 | # * control: The Control object 27 | # * configuration: The configuration map 28 | # * mapping: The output mapping 29 | # 30 | # Configuration options: 31 | # * :file: The file to write to (REQUIRED) 32 | # * :append: Set to true to append to the file (default is to overwrite) 33 | # * :separator: Record separator (default is a comma) 34 | # * :eol: End of line marker (default is \n) 35 | # * :enclose: Set to true of false 36 | # * :unique: Set to true to only write unique records 37 | # * :append_rows: Array of rows to append 38 | # 39 | # Mapping options: 40 | # * :order: The order array 41 | def initialize(control, configuration, mapping={}) 42 | super 43 | path = Pathname.new(configuration[:file]) 44 | @file = path.absolute? ? 
path : Pathname.new(File.dirname(File.expand_path(control.file))) + path 45 | @append = configuration[:append] ||= false 46 | @separator = configuration[:separator] ||= ',' 47 | @eol = configuration[:eol] ||= "\n" 48 | @enclose = true & configuration[:enclose] 49 | @unique = configuration[:unique] ? configuration[:unique] + scd_required_fields : configuration[:unique] 50 | @unique.uniq! unless @unique.nil? 51 | @write_header = configuration[:write_header] 52 | @order = mapping[:order] + scd_required_fields if mapping[:order] 53 | @order.uniq! unless @order.nil? 54 | end 55 | 56 | def order 57 | @order ||= order_from_source 58 | end 59 | 60 | # Close the destination. This will flush the buffer and close the underlying stream or connection. 61 | def close 62 | buffer << append_rows if append_rows 63 | flush 64 | f.close 65 | end 66 | 67 | # Flush the destination buffer 68 | def flush 69 | #puts "Flushing buffer (#{file}) with #{buffer.length} rows" 70 | if @write_header && !@header_written 71 | f << order 72 | @header_written = true 73 | end 74 | 75 | buffer.flatten.each do |row| 76 | #puts "row change type: #{row.change_type}" 77 | # check to see if this row's compound key constraint already exists 78 | # note that the compound key constraint may not utilize virtual fields 79 | next unless row_allowed?(row) 80 | 81 | # add any virtual fields 82 | add_virtuals!(row) 83 | 84 | # collect all of the values using the order designated in the configuration 85 | values = order.collect do |name| 86 | value = row[name] 87 | case value 88 | when Date, Time, DateTime 89 | value.to_s(:db) 90 | else 91 | value.to_s 92 | end 93 | end 94 | 95 | f << values 96 | end 97 | f.flush 98 | buffer.clear 99 | #puts "After flush there are #{buffer.length} rows" 100 | end 101 | 102 | private 103 | # Get the open file stream 104 | def f 105 | @f ||= FasterCSV.open(file, mode, options) 106 | end 107 | 108 | def options 109 | @options ||= { 110 | :col_sep => separator, 111 | :row_sep => eol, 112 
| :force_quotes => enclose 113 | } 114 | end 115 | 116 | # Get the appropriate mode to open the file stream 117 | def mode 118 | append ? 'a' : 'w' 119 | end 120 | end 121 | end 122 | end 123 | -------------------------------------------------------------------------------- /lib/etl/control/destination/database_destination.rb: -------------------------------------------------------------------------------- 1 | module ETL #:nodoc: 2 | module Control #:nodoc: 3 | # Destination which writes directly to a database. This is useful when you are dealing with 4 | # a small amount of data. For larger amounts of data you should probably use the bulk 5 | # loader if it is supported with your target database as it will use a much faster load 6 | # method. 7 | class DatabaseDestination < Destination 8 | # The target connection 9 | attr_reader :target 10 | 11 | # The table 12 | attr_reader :table 13 | 14 | # Specify the order from the source 15 | attr_reader :order 16 | 17 | # Set to true to truncate the destination table first 18 | attr_reader :truncate 19 | 20 | # Initialize the database destination 21 | # 22 | # * control: The ETL::Control::Control instance 23 | # * configuration: The configuration Hash 24 | # * mapping: The mapping 25 | # 26 | # Configuration options: 27 | # * :database: The database name (REQUIRED) 28 | # * :target: The target connection (REQUIRED) 29 | # * :table: The table to write to (REQUIRED) 30 | # * :truncate: Set to true to truncate before writing (defaults to false) 31 | # * :unique: Set to true to only insert unique records (defaults to false) 32 | # * :append_rows: Array of rows to append 33 | # 34 | # Mapping options: 35 | # * :order: The order of fields to write (REQUIRED) 36 | def initialize(control, configuration, mapping={}) 37 | super 38 | @target = configuration[:target] 39 | @table = configuration[:table] 40 | @truncate = configuration[:truncate] ||= false 41 | @unique = configuration[:unique] ? 
configuration[:unique] + [scd_effective_date_field] : configuration[:unique] 42 | @unique.uniq! unless @unique.nil? 43 | @order = mapping[:order] ? mapping[:order] + scd_required_fields : order_from_source 44 | @order.uniq! unless @order.nil? 45 | raise ControlError, "Order required in mapping" unless @order 46 | raise ControlError, "Table required" unless @table 47 | raise ControlError, "Target required" unless @target 48 | end 49 | 50 | # Flush the currently buffered data 51 | def flush 52 | conn.transaction do 53 | buffer.flatten.each do |row| 54 | # check to see if this row's compound key constraint already exists 55 | # note that the compound key constraint may not utilize virtual fields 56 | next unless row_allowed?(row) 57 | 58 | # add any virtual fields 59 | add_virtuals!(row) 60 | 61 | names = [] 62 | values = [] 63 | order.each do |name| 64 | names << conn.quote_column_name(name) 65 | values << conn.quote(row[name]) 66 | end 67 | q = "INSERT INTO #{conn.quote_table_name(table_name)} (#{names.join(',')}) VALUES (#{values.join(',')})" 68 | ETL::Engine.logger.debug("Executing insert: #{q}") 69 | conn.insert(q, "Insert row #{current_row}") 70 | @current_row += 1 71 | end 72 | buffer.clear 73 | end 74 | end 75 | 76 | # Close the connection 77 | def close 78 | buffer << append_rows if append_rows 79 | flush 80 | end 81 | 82 | private 83 | def conn 84 | @conn ||= begin 85 | conn = ETL::Engine.connection(target) 86 | conn.truncate(table_name) if truncate 87 | conn 88 | end 89 | end 90 | 91 | def table_name 92 | ETL::Engine.table(table, ETL::Engine.connection(target)) 93 | end 94 | 95 | end 96 | end 97 | end 98 | -------------------------------------------------------------------------------- /lib/etl/control/destination/excel_destination.rb: -------------------------------------------------------------------------------- 1 | optional_require 'spreadsheet' 2 | 3 | module ETL 4 | module Control 5 | # Excel as the final destination. 
6 | class ExcelDestination < Destination 7 | # The File to write to 8 | attr_reader :file 9 | 10 | # The output order 11 | attr_reader :order 12 | 13 | # Flag which indicates to append (default is to overwrite) 14 | attr_accessor :append 15 | 16 | # Initialize the object. 17 | # * control: The Control object 18 | # * configuration: The configuration map 19 | # * mapping: The output mapping 20 | # 21 | # Configuration options: 22 | # * :file: The file to write to (REQUIRED) 23 | # * :append: Set to true to append to the file (default is to overwrite) 24 | # * :unique: Set to true to only write unique records 25 | # * :append_rows: Array of rows to append 26 | # 27 | # Mapping options: 28 | # * :order: The order array 29 | def initialize(control, configuration, mapping={}) 30 | super 31 | path = Pathname.new(configuration[:file]) 32 | @file = path.absolute? ? path : Pathname.new(File.dirname(File.expand_path(control.file))) + path 33 | @append = configuration[:append] ||= false 34 | @unique = configuration[:unique] ? configuration[:unique] + scd_required_fields : configuration[:unique] 35 | @unique.uniq! unless @unique.nil? 36 | @order = mapping[:order] ? mapping[:order] + scd_required_fields : order_from_source 37 | @order.uniq! unless @order.nil? 38 | raise ControlError, "Order required in mapping" unless @order 39 | end 40 | 41 | # Close the destination. This will flush the buffer and close the underlying stream or connection. 
42 | def close 43 | buffer << append_rows if append_rows 44 | flush 45 | book.write(file) 46 | end 47 | 48 | # Flush the destination buffer 49 | def flush 50 | #puts "Flushing buffer (#{file}) with #{buffer.length} rows" 51 | buffer.flatten.each_with_index do |row, index| 52 | #puts "row change type: #{row.change_type}" 53 | # check to see if this row's compound key constraint already exists 54 | # note that the compound key constraint may not utilize virtual fields 55 | next unless row_allowed?(row) 56 | 57 | # add any virtual fields 58 | add_virtuals!(row) 59 | 60 | # collect all of the values using the order designated in the configuration 61 | values = order.collect do |name| 62 | value = row[name] 63 | case value 64 | when Date, Time, DateTime 65 | value.to_s(:db) 66 | else 67 | value.to_s 68 | end 69 | end 70 | 71 | # write the values 72 | sheet.insert_row(index, values) 73 | end 74 | buffer.clear 75 | #puts "After flush there are #{buffer.length} rows" 76 | end 77 | 78 | private 79 | # Get the open file excel 80 | def book 81 | @book ||= ( append ? Spreadsheet.open(file) : Spreadsheet::Workbook.new(file) ) 82 | end 83 | 84 | private 85 | # Get the open sheet 86 | def sheet 87 | @sheet ||= ( append ? book.worksheet(0) : book.create_worksheet() ) 88 | end 89 | end 90 | end 91 | end 92 | -------------------------------------------------------------------------------- /lib/etl/control/destination/update_database_destination.rb: -------------------------------------------------------------------------------- 1 | module ETL #:nodoc: 2 | module Control #:nodoc: 3 | # Destination which writes directly to a database. This is useful when you are dealing with 4 | # a small amount of data. For larger amounts of data you should probably use the bulk 5 | # loader if it is supported with your target database as it will use a much faster load 6 | # method. 
7 | class UpdateDatabaseDestination < Destination 8 | # The target connection 9 | attr_reader :target 10 | 11 | # The table 12 | attr_reader :table 13 | 14 | # Specify the order from the source 15 | attr_reader :order 16 | 17 | # Specify the conditions from the source 18 | attr_reader :conditions 19 | 20 | # Initialize the database destination 21 | # 22 | # * control: The ETL::Control::Control instance 23 | # * configuration: The configuration Hash 24 | # * mapping: The mapping 25 | # 26 | # Configuration options: 27 | # * :database: The database name (REQUIRED) 28 | # * :target: The target connection (REQUIRED) 29 | # * :table: The table to write to (REQUIRED) 30 | # * :unique: Set to true to only insert unique records (defaults to false) 31 | # * :append_rows: Array of rows to append 32 | # 33 | # Mapping options: 34 | # * :order: The order of fields to write (REQUIRED) 35 | # * :conditions: The conditions on the fields to update (REQUIRED) 36 | def initialize(control, configuration, mapping={}) 37 | super 38 | @target = configuration[:target] 39 | @table = configuration[:table] 40 | @unique = configuration[:unique] ? configuration[:unique] + [scd_effective_date_field] : configuration[:unique] 41 | @unique.uniq! unless @unique.nil? 42 | @order = mapping[:order] ? mapping[:order] + scd_required_fields : order_from_source 43 | @order.uniq! unless @order.nil? 44 | @conditions = mapping[:conditions] ? mapping[:conditions] + scd_required_fields : nil 45 | @conditions.uniq! unless @conditions.nil? 
46 | raise ControlError, "Conditions required in mapping" unless @conditions 47 | raise ControlError, "Order required in mapping" unless @order 48 | raise ControlError, "Table required" unless @table 49 | raise ControlError, "Target required" unless @target 50 | end 51 | 52 | # Flush the currently buffered data 53 | def flush 54 | conn.transaction do 55 | buffer.flatten.each do |row| 56 | # check to see if this row's compound key constraint already exists 57 | # note that the compound key constraint may not utilize virtual fields 58 | next unless row_allowed?(row) 59 | 60 | # add any virtual fields 61 | add_virtuals!(row) 62 | 63 | conditionsfilter = [] 64 | conditions.each do |cond| 65 | c = " #{cond[:field]} #{cond[:comp]} #{cond[:value]} " 66 | condition = c 67 | begin 68 | condition = eval('"' + c + '"') 69 | rescue 70 | end 71 | conditionsfilter << condition 72 | end 73 | 74 | updatevalues = [] 75 | order.each do |name| 76 | updatevalues << "#{conn.quote_column_name(name)} = #{conn.quote(row[name])}" 77 | end 78 | q = "UPDATE #{conn.quote_table_name(table_name)} SET #{updatevalues.join(',')} WHERE #{conditionsfilter.join(' AND ')}" 79 | ETL::Engine.logger.debug("Executing update: #{q}") 80 | conn.update(q, "Update row #{current_row}") 81 | @current_row += 1 82 | end 83 | buffer.clear 84 | end 85 | end 86 | 87 | # Close the connection 88 | def close 89 | buffer << append_rows if append_rows 90 | flush 91 | end 92 | 93 | private 94 | def conn 95 | @conn ||= begin 96 | conn = ETL::Engine.connection(target) 97 | conn 98 | rescue 99 | raise RuntimeError, "Problem to connect to db" 100 | end 101 | end 102 | 103 | def table_name 104 | ETL::Engine.table(table, ETL::Engine.connection(target)) 105 | end 106 | 107 | end 108 | end 109 | end 110 | -------------------------------------------------------------------------------- /lib/etl/control/destination/yaml_destination.rb: -------------------------------------------------------------------------------- 1 | require 
'yaml' 2 | 3 | module ETL #:nodoc: 4 | module Control #:nodoc: 5 | class YamlDestination < Destination 6 | attr_reader :file, :append, :only, :except 7 | # Initialize the object. 8 | # * control: The Control object 9 | # * configuration: The configuration map 10 | # * mapping: The output mapping 11 | # 12 | # Configuration options: 13 | # * :file: The file to write to (REQUIRED) 14 | # * :append: Set to true to append to the file (default is to overwrite) 15 | # * :only 16 | # * :except 17 | def initialize(control, configuration, mapping={}) 18 | super 19 | @file = File.join(File.dirname(control.file), configuration[:file]) 20 | @append = configuration[:append] ||= false 21 | @only = configuration[:only] 22 | @except = configuration[:except] 23 | raise ControlError, "the :only and :except options must be used seperately, do not specify both" if @only && @except 24 | end 25 | 26 | # Close the destination. This will flush the buffer and close the underlying stream or connection. 27 | def close 28 | flush 29 | f.close 30 | end 31 | 32 | # Flush the destination buffer 33 | def flush 34 | #puts "Flushing buffer (#{file}) with #{buffer.length} rows" 35 | buffer.flatten.each do |row| 36 | # check to see if this row's compound key constraint already exists 37 | # note that the compound key constraint may not utilize virtual fields 38 | next unless row_allowed?(row) 39 | # add any virtual fields 40 | add_virtuals!(row) 41 | 42 | yaml = {} 43 | row.each do |key, value| 44 | next if only && !only.include?(key) 45 | next if except && except.include?(key) 46 | 47 | case value 48 | when Date, Time, DateTime 49 | value = value.to_s(:db) 50 | end 51 | 52 | yaml[key] = value 53 | end 54 | 55 | # write the values 56 | YAML.dump(yaml, f) 57 | end 58 | f.flush 59 | buffer.clear 60 | end 61 | 62 | private 63 | # Get the open file stream 64 | def f 65 | @f ||= File.open(file, mode) 66 | end 67 | 68 | # Get the appropriate mode to open the file stream 69 | def mode 70 | append ? 
'a' : 'w' 71 | end 72 | end 73 | end 74 | end 75 | -------------------------------------------------------------------------------- /lib/etl/control/source/enumerable_source.rb: -------------------------------------------------------------------------------- 1 | module ETL #:nodoc: 2 | module Control #:nodoc: 3 | # Use an Enumerable as a source 4 | class EnumerableSource < ETL::Control::Source 5 | # Iterate through the enumerable 6 | def each(&block) 7 | configuration[:enumerable].each(&block) 8 | end 9 | end 10 | end 11 | end -------------------------------------------------------------------------------- /lib/etl/control/source/file_source.rb: -------------------------------------------------------------------------------- 1 | module ETL #:nodoc: 2 | module Control #:nodoc: 3 | # A File source. 4 | class FileSource < Source 5 | # The number of lines to skip, default is 0 6 | attr_accessor :skip_lines 7 | 8 | # Accessor for the underlying parser 9 | attr_accessor :parser 10 | 11 | # The source file 12 | attr_accessor :file 13 | 14 | # Initialize the source 15 | # 16 | # Configuration options: 17 | # * :file: The source file 18 | # * :parser: One of the following: a parser name as a String or 19 | # symbol, a class which extends from Parser, a Hash with :name and 20 | # optionally an :options key. Whether or not the parser uses the 21 | # options is dependent on which parser is used. See the documentation 22 | # for each parser for information on what options it accepts. 
23 | # * :skip_lines: The number of lines to skip (defaults to 0) 24 | # * :store_locally: Set to false to not store a copy of the 25 | # source data locally for archival 26 | def initialize(control, configuration, definition) 27 | super 28 | configure 29 | end 30 | 31 | # Get a String identifier for the source 32 | def to_s 33 | file 34 | end 35 | 36 | # Get the local storage directory 37 | def local_directory 38 | File.join(local_base, File.basename(file, File.extname(file))) 39 | end 40 | 41 | # Returns each row from the source 42 | def each 43 | count = 0 44 | copy_sources if @store_locally 45 | @parser.each do |row| 46 | if ETL::Engine.offset && count < ETL::Engine.offset 47 | count += 1 48 | else 49 | row = ETL::Row[row] 50 | row.source = self 51 | yield row 52 | end 53 | end 54 | end 55 | 56 | def order 57 | @parser.fields.collect {|field| field.name} 58 | end 59 | 60 | private 61 | # Copy source data to a local directory structure 62 | def copy_sources 63 | sequence = 0 64 | path = Pathname.new(file) 65 | path = path.absolute? ? path : Pathname.new(File.dirname(control.file)) + path 66 | Pathname.glob(path).each do |f| 67 | next if f.directory? 
68 | lf = local_file(sequence) 69 | FileUtils.cp(f, lf) 70 | File.open(local_file_trigger(lf), 'w') {|f| } 71 | sequence += 1 72 | end 73 | end 74 | 75 | # Configure the source 76 | def configure 77 | @file = configuration[:file] 78 | case configuration[:parser] 79 | when Class 80 | @parser = configuration[:parser].new(self) 81 | when String, Symbol 82 | @parser = ETL::Parser::Parser.class_for_name(configuration[:parser]).new(self) 83 | when Hash 84 | name = configuration[:parser][:name] 85 | options = configuration[:parser][:options] 86 | @parser = ETL::Parser::Parser.class_for_name(name).new(self, options) 87 | else 88 | raise ControlError, "Configuration option :parser must be a Class, String or Symbol" 89 | end 90 | @skip_lines = configuration[:skip_lines] ||= 0 91 | end 92 | end 93 | end 94 | end 95 | -------------------------------------------------------------------------------- /lib/etl/control/source/model_source.rb: -------------------------------------------------------------------------------- 1 | #RAILS_ENV = 'development' 2 | #require '../config/environment' 3 | 4 | module ETL #:nodoc: 5 | module Control #:nodoc: 6 | class ModelSource < Source 7 | 8 | def columns 9 | case definition 10 | when Array 11 | definition.collect(&:to_sym) 12 | when Hash 13 | definition.keys.collect(&:to_sym) 14 | else 15 | raise "Definition must be either an Array or a Hash" 16 | end 17 | end 18 | 19 | def railsmodel 20 | configuration[:model] 21 | end 22 | 23 | def order 24 | configuration[:order] || "id" 25 | end 26 | 27 | def each(&block) 28 | railsmodel.to_s.camelize.constantize.find(:all,:order=>order).each do |row| 29 | result_row = ETL::Row.new 30 | result_row.source = self 31 | columns.each do |column| 32 | result_row[column.to_sym] = row.send(column) 33 | end 34 | yield result_row 35 | end 36 | end 37 | end 38 | end 39 | end -------------------------------------------------------------------------------- /lib/etl/control/source/mysql_streamer.rb: 
# --------------------------------------------------------------------------------
require 'open3'

# Internal: The MySQL streamer is a helper which works with the database_source
# in order to allow you to use the --quick option (which stops MySQL
# from building a full result set); we also don't build a full resultset
# in Ruby - instead we yield a row at a time.
#
class MySqlStreamer
  # Internal: Creates a MySQL Streamer
  #
  # query - the SQL query
  # target - the name of the ETL configuration (ie. development/production)
  # connection - the ActiveRecord connection
  #
  # Examples
  #
  #   MySqlStreamer.new("select * from bob", "development", my_connection)
  #
  def initialize(query, target, connection)
    # Lets just be safe and also make sure there aren't new lines
    # in the SQL - its bound to cause trouble
    @query = query.split.join(' ')
    @name = target
    @first_row = connection.select_all("#{query} LIMIT 1")
  end

  # We implement some bits of a hash so that database_source
  # can use them
  def any?
    @first_row.any?
  end

  def first
    @first_row.first
  end

  # Fetch a required key from the connection configuration or fail loudly.
  def mandatory_option!(hash, key)
    value = hash[key]
    raise "Missing key #{key} in connection configuration #{@name}" if value.blank?
    value
  end

  # Stream the query through the mysql command line client, yielding one
  # Hash per result row with 'NULL' mapped to nil.
  #
  # SECURITY NOTE: host/user/database/password and the query itself are
  # interpolated straight into a shell command line; only use this with
  # trusted configuration and SQL.
  def each
    keys = nil

    config = ETL::Base.configurations[@name.to_s]
    host = mandatory_option!(config, 'host')
    username = mandatory_option!(config, 'username')
    database = mandatory_option!(config, 'database')
    password = config['password'] # this one can be omitted in some cases

    mysql_command = """mysql --quick -h #{host} -u #{username} -e \"#{@query.gsub("\n","")}\" -D #{database} --password=#{password} -B"""
    Open3.popen3(mysql_command) do |stdin, out, err, external|
      until (line = out.gets).nil? do
        line = line.gsub("\n","")
        if keys.nil?
          # The first line of -B output is the tab-separated header row.
          keys = line.split("\t")
        else
          hash = Hash[keys.zip(line.split("\t"))]
          # map out NULL to nil
          hash.each do |k, v|
            hash[k] = nil if v == 'NULL'
          end
          yield hash
        end
      end
      error = err.gets
      if (!error.nil? && error.strip.length > 0)
        # BUGFIX: this previously used Kernel#throw, which is catch/throw
        # control flow and aborts with an UncaughtThrowError instead of
        # surfacing the mysql client error; raise is what was intended.
        raise error
      end
    end
  end
end
# -------------------------------------------------------------------------------- /lib/etl/core_ext.rb:
require 'etl/core_ext/time'
# -------------------------------------------------------------------------------- /lib/etl/core_ext/time.rb:
require File.dirname(__FILE__) + '/time/calculations'

class Time #:nodoc:
  include ETL::CoreExtensions::Time::Calculations
end
# -------------------------------------------------------------------------------- /lib/etl/core_ext/time/calculations.rb:
#Updated by Jack Hong on 04/05/08

module ETL #:nodoc:
  module CoreExtensions #:nodoc:
    module Time #:nodoc:
      # Enables the use of time calculations within Time itself
      module Calculations
        # Calendar week (1..52); a 53rd partial week is folded into week 52.
        def week
          cyw = ((yday - 1) / 7) + 1
          cyw = 52 if cyw == 53
          cyw
        end

        # Calendar quarter (1..4).
        def quarter
          ((month - 1) / 3) + 1
        end

        # Fiscal week (1..52) for a fiscal year starting at offset_month.
        def fiscal_year_week(offset_month=10)
          fyw = ((fiscal_year_yday(offset_month) - 1) / 7) + 1
          fyw = 52 if fyw == 53
          fyw
        end

        # Fiscal month (1..12) for a fiscal year starting at offset_month.
        def fiscal_year_month(offset_month=10)
          shifted_month = month - (offset_month - 1)
          shifted_month += 12 if shifted_month <= 0
          shifted_month
        end

        # Fiscal quarter (1..4).
        def fiscal_year_quarter(offset_month=10)
          ((fiscal_year_month(offset_month) - 1) / 3) + 1
        end

        # The fiscal year this date falls into.
        def fiscal_year(offset_month=10)
          month >= offset_month ? year + 1 : year
        end

        # Day of the fiscal year (1-based).
        # NOTE(review): the wrap-around adds a flat 365 days, so leap years
        # may be off by one - confirm before relying on exact day numbers.
        def fiscal_year_yday(offset_month=10)
          offset_days = 0
          1.upto(offset_month - 1) { |m| offset_days += ::Time.days_in_month(m, year) }
          shifted_year_day = yday - offset_days
          shifted_year_day += 365 if shifted_year_day <= 0
          shifted_year_day
        end
      end
    end
  end
end
# -------------------------------------------------------------------------------- /lib/etl/execution.rb:
module ETL #:nodoc
  # Classes which store information about ETL execution
  module Execution
    # Execution management
    class Execution
      class << self
        # Migrate the data store
        def migrate
          ETL::Execution::Migration.migrate
        end
      end
    end
  end
end

require 'etl/execution/base'
require 'etl/execution/batch'
require 'etl/execution/job'
require 'etl/execution/migration'
# -------------------------------------------------------------------------------- /lib/etl/execution/base.rb:
module ETL #:nodoc:
  module Execution #:nodoc:
    # Base class for ETL execution information
    class Base < ActiveRecord::Base
      self.abstract_class = true
      establish_connection :etl_execution
    end
  end
end
# -------------------------------------------------------------------------------- /lib/etl/execution/batch.rb:
module ETL #:nodoc:
  module Execution #:nodoc:
    # Persistent class representing an ETL batch
    class Batch < Base
      belongs_to :batch
      has_many :batches
      has_many :jobs
      attr_accessible :batch_file, :status, :completed_at
    end
  end
end
# -------------------------------------------------------------------------------- /lib/etl/execution/job.rb:
-------------------------------------------------------------------------------- 1 | module ETL #:nodoc: 2 | module Execution #:nodoc: 3 | # Persistent class representing an ETL job 4 | class Job < Base 5 | belongs_to :batch 6 | attr_accessible :control_file, :status, :batch_id 7 | end 8 | end 9 | end 10 | -------------------------------------------------------------------------------- /lib/etl/execution/migration.rb: -------------------------------------------------------------------------------- 1 | module ETL #:nodoc: 2 | module Execution #:nodoc 3 | # Handles migration of tables required for persistent storage of meta data 4 | # for the ETL engine 5 | class Migration 6 | class << self 7 | protected 8 | # Get the schema info table name 9 | def schema_info_table_name 10 | ActiveRecord::Migrator.schema_migrations_table_name 11 | end 12 | alias :schema_migrations_table_name :schema_info_table_name 13 | 14 | public 15 | # Execute the migrations 16 | def migrate 17 | connection.initialize_schema_migrations_table 18 | last_migration.upto(target - 1) do |i| 19 | __send__("migration_#{i+1}".to_sym) 20 | connection.assume_migrated_upto_version(i+1) 21 | end 22 | end 23 | 24 | protected 25 | def last_migration 26 | connection.select_values( 27 | "SELECT version FROM #{schema_migrations_table_name}" 28 | ).map(&:to_i).sort.last || 0 29 | end 30 | 31 | # Get the connection to use during migration 32 | def connection 33 | @connection ||= ETL::Execution::Base.connection 34 | end 35 | 36 | # Get the final target version number 37 | def target 38 | 4 39 | end 40 | 41 | private 42 | def migration_1 #:nodoc: 43 | connection.create_table :jobs do |t| 44 | t.column :control_file, :string, :null => false 45 | t.column :created_at, :datetime, :null => false 46 | t.column :completed_at, :datetime 47 | t.column :status, :string 48 | end 49 | connection.create_table :records do |t| 50 | t.column :control_file, :string, :null => false 51 | t.column :natural_key, :string, :null => false 
52 | t.column :crc, :string, :null => false 53 | t.column :job_id, :integer, :null => false 54 | end 55 | end 56 | 57 | def migration_2 #:nodoc: 58 | connection.add_index :records, :control_file 59 | connection.add_index :records, :natural_key 60 | connection.add_index :records, :job_id 61 | end 62 | 63 | def migration_3 #:nodoc: 64 | connection.create_table :batches do |t| 65 | t.column :batch_file, :string, :null => false 66 | t.column :created_at, :datetime, :null => false 67 | t.column :completed_at, :datetime 68 | t.column :status, :string 69 | end 70 | connection.add_column :jobs, :batch_id, :integer 71 | connection.add_index :jobs, :batch_id 72 | end 73 | 74 | def migration_4 75 | connection.drop_table :records 76 | end 77 | 78 | def migration_5 79 | connection.add_column :batches, :batch_id, :integer 80 | connection.add_index :batches, :batch_id 81 | end 82 | 83 | # Update the schema info table, setting the version value 84 | def update_schema_info(version) 85 | connection.update("UPDATE #{schema_info_table_name} SET version = #{version}") 86 | end 87 | end 88 | end 89 | end 90 | end 91 | -------------------------------------------------------------------------------- /lib/etl/generator.rb: -------------------------------------------------------------------------------- 1 | require 'etl/generator/generator' 2 | Dir[File.dirname(__FILE__) + "/generator/*.rb"].each { |file| require(file) } -------------------------------------------------------------------------------- /lib/etl/generator/generator.rb: -------------------------------------------------------------------------------- 1 | module ETL #:nodoc: 2 | module Generator #:nodoc: 3 | # Base class for generators. 4 | class Generator 5 | class << self 6 | # Get the Class for the specified name. 
# This source file contains code for a basic sequential surrogate key generator

module ETL #:nodoc:
  module Generator #:nodoc:
    # Sequential surrogate key generator. The sequence can be seeded from
    # the current maximum of a table column or from a custom SQL query.
    class SurrogateKeyGenerator < Generator
      attr_reader :table
      attr_reader :target
      attr_reader :column
      attr_reader :query

      # Initialize the generator.
      #
      # Options:
      # * <tt>:table</tt>: seed from <tt>MAX(column)</tt> of this table
      # * <tt>:target</tt>: the target connection identifier
      # * <tt>:column</tt>: the key column, defaults to 'id'
      # * <tt>:query</tt>: seed from this SQL query (only consulted when
      #   <tt>:table</tt> is not given)
      def initialize(options={})
        @table  = options[:table]
        @target = options[:target]
        @column = options[:column] || 'id'
        @query  = options[:query]

        seed =
          if table
            ETL::Engine.connection(target).select_value("SELECT max(#{column}) FROM #{table_name}")
          elsif query
            ETL::Engine.connection(target).select_value(query)
          end
        # A missing/empty seed starts the sequence at zero.
        seed = 0 if seed.blank?
        @surrogate_key = seed.to_i
      end

      # Get the next surrogate key
      def next
        @surrogate_key ||= 0
        @surrogate_key += 1
      end

      # Resolve the (possibly prefixed) table name on the target connection.
      def table_name
        ETL::Engine.table(table, ETL::Engine.connection(target))
      end
    end
  end
end
module ETL #:nodoc:
  module Parser #:nodoc:
    # Parses CSV files
    class CsvParser < ETL::Parser::Parser
      # Initialize the parser
      # * source: The Source object
      # * options: Hash of options for the parser, defaults to an empty hash
      def initialize(source, options={})
        super
        configure
      end

      # When true, each row's column count is checked against the definition.
      attr_reader :validate_rows

      # Read the header line of +file+ and build Field objects from it.
      # Duplicate column names are disambiguated with a numeric suffix,
      # e.g. two "amount" columns become :amount_1 and :amount_2.
      def get_fields_names(file)
        File.open(file) do |input|
          header = CSV.parse(input.readline, options).first
          header.each_with_index.map do |column, index|
            duplicates = header.find_all { |c| c == column }.size
            # index of this occurrence among identically-named columns (usually 1)
            occurrence = header[0..index].find_all { |c| c == column }.size
            name = column + (duplicates > 1 ? "_#{occurrence}" : "")
            Field.new(name.to_sym)
          end
        end
      end

      # Returns each row. Every file matched by the source glob is read;
      # rows are yielded as Hashes keyed by field name.
      def each
        Dir.glob(file).each do |file|
          ETL::Engine.logger.debug "parsing #{file}"
          if fields.length == 0
            ETL::Engine.logger.debug "no columns specified so reading names from first line of #{file}"
            @fields = get_fields_names(file)
          end
          line = 0
          lines_skipped = 0
          CSV.foreach(file, options) do |raw_row|
            if lines_skipped < source.skip_lines
              ETL::Engine.logger.debug "skipping line"
              lines_skipped += 1
              next
            end
            line += 1
            validate_row(raw_row, line, file) if validate_rows
            row = {}
            raw_row.each_with_index do |value, index|
              row[fields[index].name] = value
            end
            yield row
          end
        end
      end

      # Get an array of defined fields
      def fields
        @fields ||= []
      end

      private

      # Raise a MismatchError when the raw row width differs from the
      # field definition width.
      def validate_row(row, line, file)
        ETL::Engine.logger.debug "validating line #{line} in file #{file}"
        return if row.length == fields.length
        raise_with_info( MismatchError,
          "The number of columns from the source (#{row.length}) does not match the number of columns in the definition (#{fields.length})",
          line, file
        )
      end

      # Read the row-validation flag and the field definitions from the source.
      def configure
        @validate_rows =
          if source.configuration.has_key?(:validate_rows)
            source.configuration[:validate_rows]
          else
            true
          end

        source.definition.each do |options|
          case options
          when Symbol then fields << Field.new(options)
          when Hash   then fields << Field.new(options[:name])
          else raise DefinitionError, "Each field definition must either be a symbol or a hash"
          end
        end
      end

      class Field #:nodoc:
        attr_reader :name
        def initialize(name)
          @name = name
        end
      end
    end
  end
end
optional_require 'roo'

module ETL
  module Parser
    # Parses Excel workbooks via the +roo+ gem. Rows from each selected
    # worksheet are yielded as Hashes keyed by field name.
    class ExcelParser < ETL::Parser::Parser

      # ignore_blank_line: skip rows whose cells are all empty
      # worksheet_column: when set, the sheet name is stored under this key in each row
      # validate_rows: when true, each row's width is checked against the definition
      attr_accessor :ignore_blank_line, :worksheet_column, :validate_rows

      # Initialize the parser
      # * source: The Source object
      # * options: Parser options Hash
      def initialize(source, options={})
        super
        configure
      end

      # Returns each row. When :worksheets was configured, only sheets at
      # those indexes are read; otherwise every sheet in the workbook is.
      def each
        Dir.glob(file).each do |file|
          ETL::Engine.logger.debug "parsing #{file}"
          line = 0
          lines_skipped = 0
          book = Roo::Spreadsheet.open file
          # NOTE: the original also pre-collected sheets via book.sheet(index)
          # into an unused local; that dead code has been removed. Filtering
          # happens below on the sheet index.
          sheet_index = -1
          book.each_with_pagename do |name, sheet|
            sheet_index += 1
            next if !worksheets.empty? && !worksheets.include?(sheet_index)
            sheet.each do |raw_row|
              if lines_skipped < source.skip_lines
                ETL::Engine.logger.debug "skipping line"
                lines_skipped += 1
                next
              end
              line += 1
              row = {}
              if self.ignore_blank_line and raw_row.empty?
                lines_skipped += 1
                next
              end
              validate_row(raw_row, line, file) if self.validate_rows
              raw_row.each_with_index do |value, index|
                f = fields[index]
                row[f.name] = value
              end
              row[worksheet_column] = name if worksheet_column
              yield row
            end
          end
        end
      end

      # Get an array of defined worksheet indexes
      def worksheets
        @worksheets ||= []
      end

      # Get an array of defined fields
      def fields
        @fields ||= []
      end

      private

      # Raise a MismatchError when the raw row width differs from the definition.
      def validate_row(row, line, file)
        ETL::Engine.logger.debug "validating line #{line} in file #{file}"
        if row.length != fields.length
          raise_with_info( MismatchError,
            "The number of columns from the source (#{row.length}) does not match the number of columns in the definition (#{fields.length})",
            line, file
          )
        end
      end

      # Read worksheet filter, blank-line handling, validation flag and
      # field definitions from the source.
      def configure
        source.definition[:worksheets].each do |worksheet|
          # BUGFIX: Kernel#Integer raises (rather than returning false) for
          # non-integer input, so the original `if Integer(...)` could never
          # reach its else branch; rescue the coercion error instead so a
          # bad definition reports the intended DefinitionError.
          begin
            worksheets << Integer(worksheet)
          rescue ArgumentError, TypeError
            raise DefinitionError, "Each worksheet definition must be an integer"
          end
        end unless source.definition[:worksheets].nil?

        self.ignore_blank_line = source.definition[:ignore_blank_line]
        self.worksheet_column = source.definition[:worksheet_column]
        self.validate_rows = if source.configuration.has_key?(:validate_rows)
          source.configuration[:validate_rows]
        else
          true
        end

        source.definition[:fields].each do |options|
          case options
          when Symbol
            fields << Field.new(options)
          when Hash
            fields << Field.new(options[:name])
          else
            raise DefinitionError, "Each field definition must either be a symbol or a hash"
          end
        end
      end

      class Field #:nodoc:
        attr_reader :name
        def initialize(name)
          @name = name
        end
      end

    end
  end
end
optional_require 'nokogiri'
require 'open-uri'
optional_require 'zlib'

module ETL
  module Parser
    # Parses XML documents with Nokogiri. Gzip-compressed source files are
    # detected via the 0x1F8B magic bytes and decompressed transparently.
    class NokogiriXmlParser < ETL::Parser::Parser
      # Initialize the parser
      # * source: The Source object
      # * options: Parser options Hash
      def initialize(source, options={})
        super
        configure
      end

      # Returns each row: one Hash of field name => node text per node
      # matched by the :collection XPath.
      def each
        Dir.glob(file).each do |source|
          gzip = false
          magic = "1F8B".to_i(16) # gzip archives start with these two bytes
          if File.exist?(source)
            # BUGFIX: the original leaked the File handle and relied on
            # Array#to_s concatenating its elements, which stopped being
            # true in Ruby 1.9 (it returns an inspect string), so gzip
            # files were never detected. Read the header in a block and
            # join the unpacked hex pairs explicitly; guard against files
            # shorter than two bytes, where read(2) can return nil.
            header = File.open(source, 'rb') { |f| f.read(2) }
            gzip = true if header && magic == header.unpack("H2H2").join.to_i(16)
          end

          doc =
            if gzip
              Nokogiri::XML(Zlib::GzipReader.open(source))
            else
              Nokogiri::XML(open(source))
            end

          doc.xpath(@collection_xpath).each do |nodeset|
            row = {}
            fields.each do |f|
              value = nodeset.xpath(f.xpath).text
              row[f.name] = value
            end
            yield row
          end
        end
      end

      # Get an array of defined fields
      def fields
        @fields ||= []
      end

      private

      # Read the :collection XPath and the field definitions from the source.
      def configure
        @collection_xpath = source.definition[:collection]
        if @collection_xpath.nil?
          raise ":collection => 'XPath' argument required"
        end
        source.definition[:fields].each do |options|
          case options
          when Symbol
            fields << Field.new(options, options.to_s)
          when Hash
            options[:xpath] ||= options[:name]
            fields << Field.new(options[:name], options[:xpath].to_s)
          else
            raise DefinitionError,
              "Each field definition must either be an symbol " +
              "or a hash of options for the field"
          end
        end
      end

      class Field
        attr_reader :name, :xpath
        def initialize(name, xpath)
          @name = name
          @xpath = xpath
        end
      end
    end
  end
end
require 'rexml/document'

module ETL
  module Parser
    # Parses XML documents with REXML, yielding one row per element matched
    # by the :collection XPath.
    class XmlParser < ETL::Parser::Parser
      # Initialize the parser
      # * source: The Source object
      # * options: Parser options Hash
      def initialize(source, options={})
        super
        configure
      end

      # Returns each row as a Hash of field name => element text.
      def each
        Dir.glob(file).each do |file|
          doc = nil
          t = Benchmark.realtime do
            # BUGFIX: open the file in a block so the handle is closed as
            # soon as the document is parsed (File.new was never closed).
            File.open(file) do |io|
              doc = REXML::Document.new(io)
            end
          end
          Engine.logger.info "XML #{file} parsed in #{t}s"
          doc.elements.each(@collection_xpath) do |element|
            row = {}
            fields.each do |f|
              value = element.text(f.xpath)
              row[f.name] = value
            end
            yield row
          end
        end
      end

      # Get an array of defined fields
      def fields
        @fields ||= []
      end

      private

      # Read the :collection XPath and the field definitions from the source.
      def configure
        @collection_xpath = source.definition[:collection]
        raise "Collection XPath is required" if @collection_xpath.nil?

        source.definition[:fields].each do |options|
          case options
          when Symbol
            fields << Field.new(options, options.to_s)
          when Hash
            options[:xpath] ||= options[:name]
            fields << Field.new(options[:name], options[:xpath].to_s)
          else
            raise DefinitionError, "Each field definition must either be an symbol or a hash of options for the field"
          end
        end
      end

      class Field
        attr_reader :name, :xpath
        def initialize(name, xpath)
          @name = name
          @xpath = xpath
        end
      end
    end
  end
end
ETL #:nodoc: 2 | module Processor #:nodoc: 3 | # Processor which is used to bulk import data into a target database. The 4 | # underlying database driver from ActiveRecord must support the methods 5 | # +bulk_load+ method. 6 | class BulkImportProcessor < ETL::Processor::Processor 7 | 8 | # The file to load from 9 | attr_reader :file 10 | # The target database 11 | attr_reader :target 12 | # The table name 13 | attr_reader :table 14 | # Set to true to truncate 15 | attr_reader :truncate 16 | # Array of symbols representing the column load order 17 | attr_reader :columns 18 | # The field separator (defaults to a comma) 19 | attr_accessor :field_separator 20 | # The field enclosure (defaults to nil) 21 | attr_accessor :field_enclosure 22 | # The line separator (defaults to a newline) 23 | attr_accessor :line_separator 24 | # The string that indicates a NULL (defaults to an empty string) 25 | attr_accessor :null_string 26 | # boolean that indicates disable keys before, then enable after load (MySql only optimization) 27 | attr_accessor :disable_keys 28 | # replace existing records, not just insert 29 | attr_accessor :replace 30 | 31 | # Initialize the processor. 32 | # 33 | # Configuration options: 34 | # * :file: The file to load data from 35 | # * :target: The target database 36 | # * :table: The table name 37 | # * :truncate: Set to true to truncate before loading 38 | # * :columns: The columns to load in the order they appear in 39 | # the bulk data file 40 | # * :field_separator: The field separator. Defaults to a comma 41 | # * :line_separator: The line separator. Defaults to a newline 42 | # * :field_enclosure: The field enclosure charcaters 43 | # * :disable_keys: Set to true to disable keys before, then enable after load (MySql only optimization) 44 | def initialize(control, configuration) 45 | super 46 | @target = configuration[:target] 47 | path = Pathname.new(configuration[:file]) 48 | @file = path.absolute? ? 
module ETL #:nodoc:
  module Processor #:nodoc:
    # A row-level processor that checks if the row already exists in the
    # target table, dropping it when it does.
    class CheckExistProcessor < ETL::Processor::RowProcessor
      # A symbol or array of symbols representing keys that should be skipped
      attr_accessor :skip

      # The target database
      attr_accessor :target

      # The name of the table to check against
      attr_accessor :table

      # An array of columns representing the natural key
      attr_accessor :columns

      # True when the target table holds at least one row. An empty table
      # cannot contain duplicates, so the check is skipped entirely.
      attr_accessor :should_check

      # Initialize the processor
      # Configuration options:
      # * :columns: symbols naming the columns used in the existence query;
      #   defaults to every column in the row (minus :skip)
      # * :skip: a symbol or array of symbols excluded from the check
      # * :target: the target connection (required)
      # * :table: the table name (required)
      def initialize(control, configuration)
        super
        @skip    = configuration[:skip] || []
        @target  = configuration[:target] || raise(ETL::ControlError, "target must be specified")
        @table   = configuration[:table] || raise(ETL::ControlError, "table must be specified")
        @columns = configuration[:columns]

        count_sql = "SELECT COUNT(*) FROM #{table_name}"
        @should_check = ETL::Engine.connection(target).select_value(count_sql).to_i > 0
      end

      # Return true if the given key should be skipped
      def skip?(key)
        return skip.include?(key) if skip.is_a?(Array)
        skip.to_sym == key.to_sym
      end

      # Return true if the row should be checked
      def should_check?
        @should_check ? true : false
      end

      # Process the row: return it unless a matching row already exists in
      # the target table, in which case nil is returned and the row dropped.
      def process(row)
        return row unless should_check?
        conn = ETL::Engine.connection(target)
        ensure_columns_available_in_row!(row, columns, 'for existence check')
        conditions = row.map do |k, v|
          next unless columns.nil? || columns.include?(k.to_sym)
          next if skip?(k.to_sym)
          "#{k} = #{conn.quote(v)}"
        end.compact
        sql = "SELECT * FROM #{table_name} WHERE #{conditions.join(" AND ")} LIMIT 1"
        row if conn.select_one(sql).nil?
      end

      private

      # Resolve the (possibly prefixed) table name on the target connection.
      def table_name
        ETL::Engine.table(table, ETL::Engine.connection(target))
      end
    end
  end
end
module ETL #:nodoc:
  module Processor #:nodoc:
    # Row processor that will copy one field to another
    #
    # Configuration options:
    # * :destination: The destination field
    # * :dest: Alias for :destination
    # * :source: The source field
    class CopyFieldProcessor < ETL::Processor::RowProcessor
      # Copy the configured source field into the destination field and
      # return the row. Numbers and nil are assigned as-is; any other value
      # is duplicated so mutating one field later cannot affect the other.
      def process(row)
        destination = configuration[:destination] || configuration[:dest]
        value = row[configuration[:source]]
        row[destination] =
          if value.nil? || value.is_a?(Numeric)
            value
          else
            value.dup
          end
        row
      end
    end
  end
end
9 | # 10 | # Arguments: 11 | # * control: The ETL::Control::Control instance 12 | # * configuration: The configuration Hash 13 | # * definition: The source definition 14 | # 15 | # Required configuration options: 16 | # * :target: The target connection 17 | # * :query: The join query 18 | # * :fields: The fields to add to the row 19 | def initialize(control, configuration) 20 | super 21 | @target = configuration[:target] 22 | @query = configuration[:query] 23 | @fields = configuration[:fields] 24 | raise ControlError, ":target must be specified" unless @target 25 | raise ControlError, ":query must be specified" unless @query 26 | raise ControlError, ":fields must be specified" unless @fields 27 | end 28 | 29 | # Get a String identifier for the source 30 | def to_s 31 | "#{host}/#{database}" 32 | end 33 | 34 | def process(row) 35 | return nil if row.nil? 36 | 37 | q = @query 38 | begin 39 | q = eval('"' + @query + '"') 40 | rescue 41 | end 42 | 43 | ETL::Engine.logger.debug("Executing select: #{q}") 44 | res = connection.execute(q) 45 | 46 | # TODO - refactor this and move it (and similar code around) to adapter_extensions 47 | case connection.class.name 48 | when "ActiveRecord::ConnectionAdapters::PostgreSQLAdapter"; 49 | res.each do |r| 50 | @fields.each do |field| 51 | row[field.to_sym] = r[field.to_s] 52 | end 53 | end 54 | when "ActiveRecord::ConnectionAdapters::Mysql2Adapter"; 55 | res.each(:as => :hash) do |r| 56 | @fields.each do |field| 57 | row[field.to_sym] = r[field.to_s] 58 | end 59 | end 60 | when "ActiveRecord::ConnectionAdapters::MysqlAdapter"; 61 | res.each_hash do |r| 62 | @fields.each do |field| 63 | row[field.to_sym] = r[field.to_s] 64 | end 65 | end 66 | res.free 67 | else raise "Unsupported adapter #{connection.class} for this destination" 68 | end 69 | 70 | return row 71 | end 72 | 73 | private 74 | # Get the database connection to use 75 | def connection 76 | ETL::Engine.connection(target) 77 | end 78 | 79 | # Get the host, defaults to 
module ETL #:nodoc:
  module Processor #:nodoc:
    # The encode processor uses Iconv to convert a file from one encoding (eg: utf-8) to another (eg: latin1), line by line.
    #
    # NOTE(review): Iconv was removed from the Ruby standard library in
    # 1.9/2.0 — on modern Rubies this processor requires the `iconv` gem
    # (or a port to String#encode); confirm against the supported Ruby range.
    class EncodeProcessor < ETL::Processor::Processor

      # The file to load from
      attr_reader :source_file
      # The file to write to
      attr_reader :target_file
      # The source file encoding
      attr_reader :source_encoding
      # The target file encoding
      attr_reader :target_encoding

      # Initialize the processor.
      #
      # Configuration options:
      # * :source_file: The file to load data from
      # * :source_encoding: The source file encoding (eg: 'latin1','utf-8'), as supported by Iconv
      # * :target_file: The file to write data to
      # * :target_encoding: The target file encoding
      def initialize(control, configuration)
        super
        raise ControlError, "Source file must be specified" if configuration[:source_file].nil?
        raise ControlError, "Target file must be specified" if configuration[:target_file].nil?
        # Both paths are resolved relative to the control file's directory.
        control_dir = File.dirname(control.file)
        @source_file = File.join(control_dir, configuration[:source_file])
        @target_file = File.join(control_dir, configuration[:target_file])
        @source_encoding = configuration[:source_encoding]
        @target_encoding = configuration[:target_encoding]
        raise ControlError, "Source and target file cannot currently point to the same file" if source_file == target_file
        begin
          @iconv = Iconv.new(target_encoding,source_encoding)
        rescue Iconv::InvalidEncoding
          raise ControlError, "Either the source encoding '#{source_encoding}' or the target encoding '#{target_encoding}' is not supported"
        end
      end

      # Execute the processor: stream the source file through Iconv into
      # the target file line by line, so large files are never loaded
      # in-memory. Could be replaced by a system iconv call when available,
      # for greater performance.
      def process
        File.open(source_file) do |source|
          File.open(target_file,'w') do |target|
            source.each_line do |line|
              target << @iconv.iconv(line)
            end
          end
        end
      end
    end
  end
end
require 'fileutils'

module ETL #:nodoc:
  module Processor #:nodoc:
    # Rewrites a CSV file line by line, applying regex replacement filters
    # and optional per-line character-count checks (a line whose count does
    # not match is dropped).
    class EscapeCsvProcessor < ETL::Processor::Processor

      # The file to load from
      attr_reader :source_file
      # The file to write to
      attr_reader :target_file
      # whether to use a temporary file or not
      attr_reader :use_temp_file

      # Array of {:replace => pattern, :result => replacement} hashes
      attr_reader :filters
      # Array of {:char => character, :count => expected} hashes
      attr_reader :charcount

      # Initialize the processor.
      #
      # Configuration options:
      # * :source_file: The file to load data from
      # * :target_file: The file to write data to
      # * :file: short-cut which will set the same value to both source_file and target_file
      def initialize(control, configuration)
        super
        if configuration[:file]
          @use_temp_file = true
          configuration[:source_file] = configuration[:file]
          configuration[:target_file] = configuration[:file] + '.tmp'
        end
        @source_file = absolute_path(configuration[:source_file])
        @target_file = absolute_path(configuration[:target_file])
        @filters = configuration[:filters] || [{:replace => '\"', :result => '""'}]
        @charcount = configuration[:charcount]
        raise ControlError, "Source file must be specified" if @source_file.nil?
        raise ControlError, "Target file must be specified" if @target_file.nil?
        raise ControlError, "Source and target file cannot currently point to the same file" if @source_file == @target_file
      end

      # Execute the processor
      def process
        # BUGFIX: use block forms of File.open so both handles are closed
        # even when a filter or write raises (the originals leaked on error).
        File.open(@source_file, 'r') do |reader|
          File.open(@target_file, 'w') do |writer|
            reader.each_line do |line|
              processed = apply_filters(line)
              processed = apply_charcount(processed)
              writer.write(processed) unless processed.nil?
            end
          end
        end

        if use_temp_file
          FileUtils.rm(source_file)
          FileUtils.mv(target_file,source_file)
        end
      end

      private

      # Resolve +file+ to an absolute Pathname (preserving the original
      # resolution behavior for relative paths).
      def absolute_path(file)
        path = Pathname.new(file)
        path.absolute? ? path : Pathname.new(File.dirname(File.expand_path(file))) + path
      end

      # Apply each configured :replace/:result regex substitution in order
      # and return the resulting line.
      def apply_filters(line)
        return line if @filters.nil?
        @filters.each do |filter|
          if (!filter[:replace].nil? &&
              !filter[:result].nil?)
            line = line.gsub(Regexp.new(filter[:replace]), filter[:result])
          end
        end
        line
      end

      # Return nil (drop the line) when any configured character count does
      # not match; otherwise return the line unchanged.
      def apply_charcount(line)
        return line if @charcount.nil? || line.nil?
        @charcount.each do |count|
          if (!count[:char].nil? &&
              !count[:count].nil?)
            return nil if line.count(count[:char]) != count[:count]
          end
        end
        line
      end
    end
  end
end
3 | 4 | require 'net/ftp' 5 | 6 | module ETL 7 | module Processor 8 | # Custom processor to download files via FTP 9 | class FtpDownloaderProcessor < ETL::Processor::Processor 10 | attr_reader :host 11 | attr_reader :port 12 | attr_reader :remote_dir 13 | attr_reader :files 14 | attr_reader :username 15 | attr_reader :local_dir 16 | 17 | # configuration options include: 18 | # * host - hostname or IP address of FTP server (required) 19 | # * port - port number for FTP server (default: 21) 20 | # * remote_dir - remote path on FTP server (default: /) 21 | # * files - list of files to download from FTP server (default: []) 22 | # * username - username for FTP server authentication (default: anonymous) 23 | # * password - password for FTP server authentication (default: nil) 24 | # * local_dir - local output directory to save downloaded files (default: '') 25 | # 26 | # As an example you might write something like the following in your control process file: 27 | # pre_process :ftp_downloader, { 28 | # :host => 'ftp.sec.gov', 29 | # :path => 'edgar/Feed/2007/QTR2', 30 | # :files => ['20070402.nc.tar.gz', '20070403.nc.tar.gz', '20070404.nc.tar.gz', 31 | # '20070405.nc.tar.gz', '20070406.nc.tar.gz'], 32 | # :local_dir => '/data/sec/2007/04', 33 | # } 34 | # The above example will anonymously download via FTP the first week's worth of SEC filing feed data 35 | # from the second quarter of 2007 and download the files to the local directory +/data/sec/2007/04+. 
36 | def initialize(control, configuration) 37 | @host = configuration[:host] 38 | @port = configuration[:port] || 21 39 | @remote_dir = configuration[:remote_dir] || '/' 40 | @files = configuration[:files] || [] 41 | @username = configuration[:username] || 'anonymous' 42 | @password = configuration[:password] 43 | @local_dir = configuration[:local_dir] || '' 44 | end 45 | 46 | def process 47 | Net::FTP.open(@host) do |conn| 48 | conn.connect(@host, @port) 49 | conn.login(@username, @password) 50 | @files.each do |f| 51 | conn.getbinaryfile(remote_file(f), local_file(f)) 52 | end 53 | end 54 | end 55 | 56 | private 57 | attr_accessor :password 58 | 59 | def local_file(name) 60 | File.join(@local_dir, name) 61 | end 62 | 63 | def remote_file(name) 64 | File.join(@remote_dir, name) 65 | end 66 | end 67 | end 68 | end 69 | -------------------------------------------------------------------------------- /lib/etl/processor/ftp_uploader_processor.rb: -------------------------------------------------------------------------------- 1 | require 'net/ftp' 2 | 3 | module ETL 4 | module Processor 5 | # Custom processor to download files via FTP 6 | class FtpUploaderProcessor < ETL::Processor::Processor 7 | attr_reader :host 8 | attr_reader :port 9 | attr_reader :remote_dir 10 | attr_reader :files 11 | attr_reader :username 12 | attr_reader :local_dir 13 | 14 | # configuration options include: 15 | # * host - hostname or IP address of FTP server (required) 16 | # * port - port number for FTP server (default: 21) 17 | # * remote_dir - remote path on FTP server (default: /) 18 | # * files - list of files to download from FTP server (default: []) 19 | # * username - username for FTP server authentication (default: anonymous) 20 | # * password - password for FTP server authentication (default: nil) 21 | # * local_dir - local output directory to save downloaded files (default: '') 22 | # 23 | # As an example you might write something like the following in your control process file: 
24 | # pre_process :ftp_uploader, { 25 | # :host => 'ftp.sec.gov', 26 | # :path => 'edgar/Feed/2007/QTR2', 27 | # :files => ['20070402.nc.tar.gz', '20070403.nc.tar.gz', '20070404.nc.tar.gz', 28 | # '20070405.nc.tar.gz', '20070406.nc.tar.gz'], 29 | # :local_dir => '/data/sec/2007/04', 30 | # } 31 | # The above example will anonymously download via FTP the first week's worth of SEC filing feed data 32 | # from the second quarter of 2007 and download the files to the local directory +/data/sec/2007/04+. 33 | def initialize(control, configuration) 34 | @host = configuration[:host] 35 | @port = configuration[:port] || 21 36 | @remote_dir = configuration[:remote_dir] || '/' 37 | @files = configuration[:files] || [] 38 | @username = configuration[:username] || 'anonymous' 39 | @password = configuration[:password] 40 | @local_dir = configuration[:local_dir] || '' 41 | end 42 | 43 | def process 44 | Net::FTP.open(@host) do |conn| 45 | conn.connect(@host, @port) 46 | conn.login(@username, @password) 47 | @files.each do |f| 48 | conn.putbinaryfile(local_file(f), remote_file(f)) 49 | end 50 | end 51 | end 52 | 53 | private 54 | attr_accessor :password 55 | 56 | def local_file(name) 57 | File.join(@local_dir, name) 58 | end 59 | 60 | def remote_file(name) 61 | File.join(@remote_dir, name) 62 | end 63 | end 64 | end 65 | end 66 | -------------------------------------------------------------------------------- /lib/etl/processor/hierarchy_exploder_processor.rb: -------------------------------------------------------------------------------- 1 | module ETL #:nodoc: 2 | module Processor #:nodoc: 3 | # Row-level processor that will convert a single row into multiple rows designed to be inserted 4 | # into a hierarchy bridge table. 
5 | class HierarchyExploderProcessor < ETL::Processor::RowProcessor 6 | attr_accessor :id_field 7 | attr_accessor :parent_id_field 8 | 9 | # Initialize the processor 10 | # 11 | # Configuration options: 12 | # * :connection: The ActiveRecord adapter connection 13 | # * :id_field: The name of the id field (defaults to 'id') 14 | # * :parent_id_field: The name of the parent id field (defaults to 'parent_id') 15 | # 16 | # TODO: Allow resolver to be implemented in a customizable fashion, i.e. don't rely 17 | # on AR as the only resolution method. 18 | def initialize(control, configuration={}) 19 | @id_field = configuration[:id_field] || 'id' 20 | @parent_id_field = configuration[:parent_id_field] || 'parent_id' 21 | super 22 | end 23 | 24 | # Process the row expanding it into hierarchy values 25 | def process(row) 26 | rows = [] 27 | target = configuration[:target] 28 | table = configuration[:table] 29 | conn = ETL::Engine.connection(target) 30 | build_rows([row[:id]], row[:id], row[:id], row[:parent_id].nil?, 0, rows, table, conn) 31 | rows 32 | end 33 | 34 | protected 35 | # Recursive function that will add a row for the current level and then call build_rows 36 | # for all of the children of the current level 37 | def build_rows(ids, parent_id, row_id, root, level, rows, table, conn) 38 | ids.each do |id| 39 | child_ids = conn.select_values("SELECT #{id_field} FROM #{table} WHERE #{parent_id_field} = #{id}") 40 | 41 | row = { 42 | :parent_id => row_id, 43 | :child_id => id, 44 | :num_levels_from_parent => level, 45 | :is_bottom => (child_ids.empty? ? 1 : 0), 46 | :is_top => (root ? 
1 : 0), 47 | } 48 | rows << row 49 | 50 | build_rows(child_ids, id, row_id, false, level + 1, rows, table, conn) 51 | end 52 | end 53 | end 54 | end 55 | end -------------------------------------------------------------------------------- /lib/etl/processor/imapattachment_downloader_processor.rb: -------------------------------------------------------------------------------- 1 | optional_require 'net/imap' 2 | optional_require 'tmail' 3 | 4 | module ETL 5 | module Processor 6 | # Custom processor to download files via Imap Attachment 7 | class ImapattachmentDownloaderProcessor < ETL::Processor::Processor 8 | attr_reader :host 9 | attr_reader :ssl 10 | attr_reader :port 11 | attr_reader :delete 12 | attr_reader :filters 13 | attr_reader :folder 14 | attr_reader :username 15 | attr_reader :local_dir 16 | 17 | # configuration options include: 18 | # * host - hostname or IP address of IMAP server (required) 19 | # * ssl - activate encryption (default false) 20 | # * port - port number for IMAP server (default: 220 or 993) 21 | # * delete - delete message after reading (default false) 22 | # * filters - filter mails (default []) 23 | # * folder - folder to select mails from (default INBOX) 24 | # * username - username for IMAP server authentication (default: anonymous) 25 | # * password - password for IMAP server authentication (default: nil) 26 | # * local_dir - local output directory to save downloaded files (default: '') 27 | # 28 | def initialize(control, configuration) 29 | @host = configuration[:host] 30 | @ssl = configuration[:ssl] || false 31 | @port = configuration[:port] || (@ssl ? 
993 : 220 ) 32 | @delete = configuration[:delete] || false 33 | @filters = configuration[:filters] || [] 34 | @folder = configuration[:folder] || 'INBOX' 35 | @username = configuration[:username] || 'anonymous' 36 | @password = configuration[:password] 37 | @local_dir = configuration[:local_dir] || '' 38 | end 39 | 40 | def process 41 | conn = Net::IMAP.new(@host, @port, @ssl) 42 | conn.login(@username, @password) 43 | 44 | conn.select(@folder) 45 | conn.uid_search(["NOT", "DELETED"]).each do |msguuid| 46 | mail = TMail::Mail.parse( conn.uid_fetch(msguuid, 'RFC822').first.attr['RFC822'] ) 47 | next if mail.attachments.blank? 48 | if applyfilter(mail, @filters) 49 | mail.attachments.each do |attachment| 50 | filename = attachment.original_filename 51 | File.open(local_file(filename), "w") {|f| 52 | f << attachment.gets(nil) 53 | } 54 | end 55 | 56 | conn.store(msguuid, "+FLAGS", [:Deleted]) if @delete 57 | end 58 | end 59 | conn.expunge 60 | conn.close 61 | end 62 | 63 | private 64 | attr_accessor :password 65 | 66 | def local_file(name) 67 | File.join(@local_dir, name) 68 | end 69 | 70 | def applyfilter(mail, cond) 71 | return true if (cond.nil? or cond.size < 3) 72 | 73 | first = cond[1] 74 | if (cond[1].class == Array) 75 | first = eval_condition(row, cond[1]) 76 | end 77 | 78 | second = cond[2] 79 | if (cond[2].class == Array) 80 | second = eval_condition(row, cond[2]) 81 | end 82 | 83 | return eval("#{cond[0]}#{first}#{second}") if cond[0] == "!" 
84 | 85 | eval("#{first}#{cond[0]}#{second}") 86 | rescue => e 87 | return false 88 | end 89 | end 90 | end 91 | end 92 | -------------------------------------------------------------------------------- /lib/etl/processor/pop3attachment_downloader_processor.rb: -------------------------------------------------------------------------------- 1 | optional_require 'net/pop' 2 | optional_require 'tmail' 3 | 4 | module ETL 5 | module Processor 6 | # Custom processor to download files via Pop3 Attachment 7 | class Pop3attachmentDownloaderProcessor < ETL::Processor::Processor 8 | attr_reader :host 9 | attr_reader :ssl 10 | attr_reader :port 11 | attr_reader :delete 12 | attr_reader :filters 13 | attr_reader :username 14 | attr_reader :local_dir 15 | 16 | # configuration options include: 17 | # * host - hostname or IP address of POP3 server (required) 18 | # * ssl - activate encryption (default false) 19 | # * port - port number for POP3 server (default: Net::POP3.default_port or Net::POP3.default_pop3s_port) 20 | # * delete - delete message after reading (default false) 21 | # * filters - filter mails (default []) 22 | # * username - username for POP3 server authentication (default: anonymous) 23 | # * password - password for POP3 server authentication (default: nil) 24 | # * local_dir - local output directory to save downloaded files (default: '') 25 | # 26 | def initialize(control, configuration) 27 | @host = configuration[:host] 28 | @ssl = configuration[:ssl] || false 29 | @port = configuration[:port] || (@ssl ? 
Net::POP3.default_pop3s_port : Net::POP3.default_port ) 30 | @delete = configuration[:delete] || false 31 | @filters = configuration[:filters] || [] 32 | @username = configuration[:username] || 'anonymous' 33 | @password = configuration[:password] 34 | @local_dir = configuration[:local_dir] || '' 35 | end 36 | 37 | def process 38 | Net::POP3.enable_ssl(OpenSSL::SSL::VERIFY_NONE) if @ssl 39 | conn = Net::POP3.new(@host, @port) 40 | conn.start(@username, @password) 41 | if !conn.mails.empty? 42 | conn.each_mail do |message| 43 | stringmail = message.pop 44 | mail = TMail::Mail.parse(stringmail) 45 | next if mail.attachments.blank? 46 | if applyfilter(mail, @filters) 47 | mail.attachments.each do |attachment| 48 | filename = attachment.original_filename 49 | File.open(local_file(filename), "w") {|f| 50 | f << attachment.gets(nil) 51 | } 52 | end 53 | 54 | message.delete if @delete 55 | end 56 | end 57 | end 58 | 59 | conn.finish 60 | end 61 | 62 | private 63 | attr_accessor :password 64 | 65 | def local_file(name) 66 | File.join(@local_dir, name) 67 | end 68 | 69 | def applyfilter(mail, cond) 70 | return true if (cond.nil? or cond.size < 3) 71 | 72 | first = cond[1] 73 | if (cond[1].class == Array) 74 | first = eval_condition(row, cond[1]) 75 | end 76 | 77 | second = cond[2] 78 | if (cond[2].class == Array) 79 | second = eval_condition(row, cond[2]) 80 | end 81 | 82 | return eval("#{cond[0]}#{first}#{second}") if cond[0] == "!" 
83 | 84 | eval("#{first}#{cond[0]}#{second}") 85 | rescue => e 86 | return false 87 | end 88 | end 89 | end 90 | end 91 | -------------------------------------------------------------------------------- /lib/etl/processor/print_row_processor.rb: -------------------------------------------------------------------------------- 1 | module ETL #:nodoc: 2 | module Processor #:nodoc: 3 | # Debugging processor for printing the current row 4 | class PrintRowProcessor < ETL::Processor::RowProcessor 5 | # Process the row 6 | def process(row) 7 | puts row.inspect 8 | row 9 | end 10 | end 11 | end 12 | end -------------------------------------------------------------------------------- /lib/etl/processor/processor.rb: -------------------------------------------------------------------------------- 1 | module ETL #:nodoc: 2 | module Processor #:nodoc: 3 | # Base class for pre and post processors. Subclasses must implement the +process+ method. 4 | class Processor 5 | def initialize(control, configuration) 6 | @control = control 7 | @configuration = configuration 8 | after_initialize if respond_to?(:after_initialize) 9 | end 10 | protected 11 | # Get the control object 12 | def control 13 | @control 14 | end 15 | # Get the configuration Hash 16 | def configuration 17 | @configuration 18 | end 19 | # Get the engine logger 20 | def log 21 | Engine.logger 22 | end 23 | end 24 | end 25 | end -------------------------------------------------------------------------------- /lib/etl/processor/rename_processor.rb: -------------------------------------------------------------------------------- 1 | module ETL #:nodoc: 2 | module Processor #:nodoc: 3 | # Row level processor to rename a field in the row. 
4 | # 5 | # Configuration options: 6 | # * :source: the source field name 7 | # * :dest: The destination field name 8 | class RenameProcessor < ETL::Processor::RowProcessor 9 | def process(row) 10 | source_value = row[configuration[:source]] 11 | case source_value 12 | when Numeric 13 | row[configuration[:dest]] = source_value 14 | when nil 15 | row[configuration[:dest]] = nil 16 | else 17 | row[configuration[:dest]] = source_value.dup 18 | end 19 | row.delete(configuration[:source]) 20 | row 21 | end 22 | end 23 | end 24 | end -------------------------------------------------------------------------------- /lib/etl/processor/require_non_blank_processor.rb: -------------------------------------------------------------------------------- 1 | module ETL #:nodoc: 2 | module Processor #:nodoc: 3 | # A processor which requires that the particular fields are non-blank in 4 | # order for the row to be retained. 5 | class RequireNonBlankProcessor < ETL::Processor::RowProcessor 6 | # An array of fields to check 7 | attr_reader :fields 8 | 9 | # Initialize the processor 10 | # 11 | # Options: 12 | # * :fields: An array of fields to check, for example: 13 | # [:first_name,:last_name] 14 | def initialize(control, configuration) 15 | super 16 | @fields = configuration[:fields] || [] 17 | end 18 | 19 | # Process the row. 20 | def process(row) 21 | fields.each { |field| return if row[field].blank? } 22 | row 23 | end 24 | end 25 | end 26 | end -------------------------------------------------------------------------------- /lib/etl/processor/row_processor.rb: -------------------------------------------------------------------------------- 1 | module ETL #:nodoc: 2 | module Processor #:nodoc: 3 | # Processor which processes a specific row. 
Unlike a transformer, which deals with a specific 4 | # value in the row, row processors can process an entire row at once, which can be used to 5 | # explode a single row into multiple rows (for example) 6 | class RowProcessor < Processor 7 | # Initialize the processor 8 | def initialize(control, configuration) 9 | super 10 | end 11 | # Process the specified row. This method must return the row. 12 | def process(row) 13 | raise "process_row is an abstract method" 14 | end 15 | 16 | # Ensure a given row keys include all the provided columns 17 | # and raise an error using the provided message if it doesn't 18 | def ensure_columns_available_in_row!(row, columns, message) 19 | unless columns.nil? 20 | columns.each do |k| 21 | raise(ETL::ControlError, "Row missing required field #{k.inspect} #{message}") unless row.keys.include?(k) 22 | end 23 | end 24 | end 25 | end 26 | end 27 | end -------------------------------------------------------------------------------- /lib/etl/processor/sequence_processor.rb: -------------------------------------------------------------------------------- 1 | module ETL #:nodoc: 2 | module Processor #:nodoc: 3 | # Row level processor to generate a sequence. 
4 | # 5 | # Configuration options: 6 | # * :context: A context name, if none is specified then the context will be 7 | # the current ETL run 8 | # * :dest: The destination field name 9 | class SequenceProcessor < ETL::Processor::RowProcessor 10 | def process(row) 11 | sequences[configuration[:context]] ||= 0 12 | row[configuration[:dest]] = sequences[configuration[:context]] += 1 13 | row 14 | end 15 | 16 | protected 17 | # Get a Hash of sequences 18 | def sequences 19 | @sequences ||= {} 20 | end 21 | end 22 | end 23 | end -------------------------------------------------------------------------------- /lib/etl/processor/sftp_downloader_processor.rb: -------------------------------------------------------------------------------- 1 | optional_require 'net/sftp' 2 | 3 | module ETL 4 | module Processor 5 | # Custom processor to download files via SFTP 6 | class SftpDownloaderProcessor < ETL::Processor::Processor 7 | attr_reader :host 8 | attr_reader :port 9 | attr_reader :remote_dir 10 | attr_reader :files 11 | attr_reader :username 12 | attr_reader :local_dir 13 | 14 | # configuration options include: 15 | # * host - hostname or IP address of FTP server (required) 16 | # * port - port number for FTP server (default: 22) 17 | # * remote_dir - remote path on FTP server (default: /) 18 | # * files - list of files to download from FTP server (default: []) 19 | # * username - username for FTP server authentication (default: anonymous) 20 | # * password - password for FTP server authentication (default: nil) 21 | # * local_dir - local output directory to save downloaded files (default: '') 22 | # 23 | # As an example you might write something like the following in your control process file: 24 | # pre_process :sftp_downloader, { 25 | # :host => 'sftp.sec.gov', 26 | # :path => 'edgar/Feed/2007/QTR2', 27 | # :files => ['20070402.nc.tar.gz', '20070403.nc.tar.gz', '20070404.nc.tar.gz', 28 | # '20070405.nc.tar.gz', '20070406.nc.tar.gz'], 29 | # :local_dir => 
'/data/sec/2007/04', 30 | # } 31 | # The above example will anonymously download via SFTP the first week's worth of SEC filing feed data 32 | # from the second quarter of 2007 and download the files to the local directory +/data/sec/2007/04+. 33 | def initialize(control, configuration) 34 | @host = configuration[:host] 35 | @port = configuration[:port] || 22 36 | @remote_dir = configuration[:remote_dir] || '/' 37 | @files = configuration[:files] || [] 38 | @username = configuration[:username] || 'anonymous' 39 | @password = configuration[:password] 40 | @local_dir = configuration[:local_dir] || '' 41 | end 42 | 43 | def process 44 | Net::SFTP.start(@host, @username, {:port => @port, :password => @password}) do |conn| 45 | @files.each do |f| 46 | conn.download!(remote_file(f), local_file(f)) 47 | end 48 | end 49 | end 50 | 51 | private 52 | attr_accessor :password 53 | 54 | def local_file(name) 55 | File.join(@local_dir, name) 56 | end 57 | 58 | def remote_file(name) 59 | File.join(@remote_dir, name) 60 | end 61 | end 62 | end 63 | end 64 | -------------------------------------------------------------------------------- /lib/etl/processor/sftp_uploader_processor.rb: -------------------------------------------------------------------------------- 1 | optional_require 'net/sftp' 2 | 3 | module ETL 4 | module Processor 5 | # Custom processor to download files via SFTP 6 | class SftpUploaderProcessor < ETL::Processor::Processor 7 | attr_reader :host 8 | attr_reader :port 9 | attr_reader :remote_dir 10 | attr_reader :files 11 | attr_reader :username 12 | attr_reader :local_dir 13 | 14 | # configuration options include: 15 | # * host - hostname or IP address of FTP server (required) 16 | # * port - port number for FTP server (default: 22) 17 | # * remote_dir - remote path on FTP server (default: /) 18 | # * files - list of files to download from FTP server (default: []) 19 | # * username - username for FTP server authentication (default: anonymous) 20 | # * password - 
password for FTP server authentication (default: nil) 21 | # * local_dir - local output directory to save downloaded files (default: '') 22 | # 23 | # As an example you might write something like the following in your control process file: 24 | # pre_process :sftp_uploader, { 25 | # :host => 'sftp.sec.gov', 26 | # :path => 'edgar/Feed/2007/QTR2', 27 | # :files => ['20070402.nc.tar.gz', '20070403.nc.tar.gz', '20070404.nc.tar.gz', 28 | # '20070405.nc.tar.gz', '20070406.nc.tar.gz'], 29 | # :local_dir => '/data/sec/2007/04', 30 | # } 31 | # The above example will anonymously download via SFTP the first week's worth of SEC filing feed data 32 | # from the second quarter of 2007 and download the files to the local directory +/data/sec/2007/04+. 33 | def initialize(control, configuration) 34 | @host = configuration[:host] 35 | @port = configuration[:port] || 22 36 | @remote_dir = configuration[:remote_dir] || '/' 37 | @files = configuration[:files] || [] 38 | @username = configuration[:username] || 'anonymous' 39 | @password = configuration[:password] 40 | @local_dir = configuration[:local_dir] || '' 41 | end 42 | 43 | def process 44 | Net::SFTP.start(@host, @username, {:port => @port, :password => @password}) do |conn| 45 | @files.each do |f| 46 | conn.upload!(local_file(f), remote_file(f)) 47 | end 48 | end 49 | end 50 | 51 | private 52 | attr_accessor :password 53 | 54 | def local_file(name) 55 | File.join(@local_dir, name) 56 | end 57 | 58 | def remote_file(name) 59 | File.join(@remote_dir, name) 60 | end 61 | end 62 | end 63 | end 64 | -------------------------------------------------------------------------------- /lib/etl/processor/surrogate_key_processor.rb: -------------------------------------------------------------------------------- 1 | module ETL #:nodoc: 2 | module Processor #:nodoc: 3 | # A row level processor that provides surrogate keys 4 | class SurrogateKeyProcessor < ETL::Processor::RowProcessor 5 | attr_accessor :destination 6 | attr_accessor :table 
7 | attr_accessor :column 8 | attr_accessor :target 9 | 10 | # Initialize the surrogate key generator 11 | # 12 | # Configuration options 13 | # * :query: If specified it contains a query to be used to 14 | # locate the last surrogate key. If this is specified then :target 15 | # must also be specified. 16 | # * :target: The target connection 17 | # * :destination: The destination column name (defaults to :id) 18 | def initialize(control, configuration) 19 | super 20 | @table = configuration[:table] 21 | @column = configuration[:column] || 'id' 22 | @target = configuration[:target] 23 | if configuration[:query] 24 | raise ControlError, "Query option is no longer value, use :column and :table instead" 25 | end 26 | if table 27 | @surrogate_key = ETL::Engine.connection(target).select_value("SELECT max(#{column}) FROM #{table_name}") 28 | end 29 | #puts "initial surrogate key: #{@surrogate_key}" 30 | @surrogate_key = 0 if @surrogate_key.blank? 31 | @surrogate_key = @surrogate_key.to_i 32 | #puts "surrogate key: #{@surrogate_key}" 33 | @destination = configuration[:destination] || :id 34 | end 35 | 36 | # Add a surrogate key to the row 37 | def process(row) 38 | if row 39 | #puts "processing row #{row.inspect}" 40 | @surrogate_key += 1 41 | #puts "adding surrogate key to row: #{@surrogate_key}" 42 | row[destination] = @surrogate_key 43 | row 44 | end 45 | end 46 | 47 | private 48 | def table_name 49 | ETL::Engine.table(table, ETL::Engine.connection(target)) 50 | end 51 | end 52 | end 53 | end -------------------------------------------------------------------------------- /lib/etl/processor/truncate_processor.rb: -------------------------------------------------------------------------------- 1 | module ETL #:nodoc: 2 | module Processor #:nodoc: 3 | # A processor which will truncate a table. 
Use as a pre-processor for cleaning out a table 4 | # prior to loading 5 | class TruncateProcessor < ETL::Processor::Processor 6 | # Defines the table to truncate 7 | attr_reader :table 8 | 9 | # Defines the database connection to use 10 | attr_reader :target 11 | 12 | # Initialize the processor 13 | # 14 | # Options: 15 | # * :target: The target connection 16 | # * :table: The table name 17 | # * :options: Optional truncate options 18 | def initialize(control, configuration) 19 | super 20 | #@file = File.join(File.dirname(control.file), configuration[:file]) 21 | @target = configuration[:target] || {} 22 | @table = configuration[:table] 23 | @options = configuration[:options] 24 | end 25 | 26 | def process 27 | conn = ETL::Engine.connection(target) 28 | @options ||= 'RESTART IDENTITY' if conn.class.name =~ /postgres/i 29 | conn.truncate(table_name, @options) 30 | end 31 | 32 | private 33 | def table_name 34 | ETL::Engine.table(table, ETL::Engine.connection(target)) 35 | end 36 | end 37 | end 38 | end -------------------------------------------------------------------------------- /lib/etl/processor/zip_file_processor.rb: -------------------------------------------------------------------------------- 1 | optional_require 'zip/zip' 2 | 3 | module ETL 4 | module Processor 5 | # Custom processor to zip files 6 | class ZipFileProcessor < ETL::Processor::Processor 7 | attr_reader :infile 8 | attr_reader :destination 9 | 10 | # configuration options include: 11 | # * infile - File to zip (required) 12 | # * destination - Zip file name (default: #{infile}.zip) 13 | def initialize(control, configuration) 14 | path = Pathname.new(configuration[:infile]) 15 | @infile = path.absolute? ? 
path : Pathname.new(File.dirname(File.expand_path(configuration[:infile]))) + path 16 | @destination = configuration[:destination] || "#{infile}.zip" 17 | end 18 | 19 | def process 20 | Zip::ZipFile.open(@destination, Zip::ZipFile::CREATE) do |zipfile| 21 | zipfile.add(@infile.basename, @infile) 22 | end 23 | end 24 | 25 | end 26 | end 27 | end 28 | -------------------------------------------------------------------------------- /lib/etl/row.rb: -------------------------------------------------------------------------------- 1 | # This source file contains the ETL::Row class. 2 | 3 | module ETL #:nodoc: 4 | # This class represents a single row currently passing through the ETL pipeline 5 | class Row < Hash 6 | # Accessor for the originating source 7 | attr_accessor :source 8 | 9 | # All change types 10 | CHANGE_TYPES = [:insert, :update, :delete] 11 | 12 | # Accessor for the row's change type 13 | attr_accessor :change_type 14 | 15 | # Get the change type, defaults to :insert 16 | def change_type 17 | @change_type ||= :insert 18 | end 19 | end 20 | end -------------------------------------------------------------------------------- /lib/etl/screen.rb: -------------------------------------------------------------------------------- 1 | # This source file contains the ETL::Screen module and requires all of the 2 | # screens 3 | 4 | module ETL #:nodoc: 5 | # The ETL::Screen module contains pre-built screens useful for checking the 6 | # ETL state during execution. Screens may be fatal, which will result in 7 | # termination of the ETL process, errors, which will result in the 8 | # termination of just the current ETL control file, or warnings, which will 9 | # result in a warning message. 
10 | module Screen 11 | end 12 | end 13 | 14 | Dir[File.dirname(__FILE__) + "/screen/*.rb"].each { |file| require(file) } -------------------------------------------------------------------------------- /lib/etl/screen/row_count_screen.rb: -------------------------------------------------------------------------------- 1 | module ETL 2 | module Screen 3 | # This screen validates the number of rows which will be bulk loaded 4 | # against the results from some sort of a row count query. If there 5 | # is a difference then the screen will not pass 6 | class RowCountScreen 7 | attr_accessor :control, :configuration 8 | def initialize(control, configuration={}) 9 | @control = control 10 | @configuration = configuration 11 | execute 12 | end 13 | def execute 14 | unless Engine.rows_written == configuration[:rows] 15 | raise "Rows written (#{Engine.rows_written}) does not match expected rows (#{configuration[:rows]})" 16 | end 17 | end 18 | end 19 | end 20 | end -------------------------------------------------------------------------------- /lib/etl/transform.rb: -------------------------------------------------------------------------------- 1 | require 'etl/transform/transform' 2 | Dir[File.dirname(__FILE__) + "/transform/*.rb"].each { |file| require(file) } -------------------------------------------------------------------------------- /lib/etl/transform/block_transform.rb: -------------------------------------------------------------------------------- 1 | module ETL 2 | module Transform 3 | class BlockTransform < ETL::Transform::Transform 4 | def initialize(control, name, configuration) 5 | super 6 | @block = configuration[:block] 7 | end 8 | def transform(name, value, row) 9 | @block.call(name, value, row) 10 | end 11 | end 12 | end 13 | end -------------------------------------------------------------------------------- /lib/etl/transform/calculation_transform.rb: -------------------------------------------------------------------------------- 1 | module ETL 2 | 
module Transform 3 | class CalculationTransform < ETL::Transform::Transform 4 | attr_reader :function 5 | attr_reader :fields 6 | 7 | def initialize(control, name, configuration) 8 | @function = configuration[:function] 9 | @fields = configuration[:fields] 10 | super 11 | end 12 | 13 | def transform(name, value, row) 14 | return nil if row.nil? 15 | return nil if row[@fields[0]].nil? 16 | 17 | if (@function.eql? "A + B") 18 | result = "" 19 | @fields.each do |field| 20 | next if field.nil? 21 | 22 | string = "" 23 | if field.to_s.eql? field 24 | string = field 25 | begin 26 | string = eval('"' + field + '"') 27 | rescue 28 | end 29 | else 30 | string = row[field] 31 | end 32 | next if string.nil? 33 | 34 | result = result + string 35 | end 36 | 37 | row[name] = result 38 | end 39 | 40 | if (@function.eql? "date A") 41 | first = row[@fields[0]] 42 | row[name] = Time.parse(first) 43 | end 44 | 45 | if (@function.eql? "trim A") 46 | first = row[@fields[0]] 47 | row[name] = first.strip 48 | end 49 | 50 | if (@function.eql? "lower A") 51 | first = row[@fields[0]] 52 | row[name] = first.downcase 53 | end 54 | 55 | if (@function.eql? "upper A") 56 | first = row[@fields[0]] 57 | row[name] = first.upcase 58 | end 59 | 60 | if (@function.eql? "encoding A") 61 | # Bug from ruby 1.8 http://po-ru.com/diary/fixing-invalid-utf-8-in-ruby-revisited/ 62 | first = row[@fields[0]] 63 | row[name] = Iconv.conv(@fields[1], @fields[2], first + ' ')[0..-2] 64 | end 65 | 66 | row[name] 67 | end 68 | 69 | end 70 | end 71 | end 72 | -------------------------------------------------------------------------------- /lib/etl/transform/date_to_string_transform.rb: -------------------------------------------------------------------------------- 1 | module ETL #:nodoc: 2 | module Transform #:nodoc: 3 | # Transform a Date or Time to a formatted string instance 4 | class DateToStringTransform < ETL::Transform::Transform 5 | # Initialize the transformer. 
module ETL #:nodoc:
  module Transform #:nodoc:
    # Transform which decodes coded values using a delimited lookup file.
    # Each line of the decode table is "code<delimiter>value"; a line with an
    # empty code sets the default value instead.
    class DecodeTransform < ETL::Transform::Transform
      # Path to the decode table file
      attr_accessor :decode_table_path

      # Field delimiter used in the decode table
      attr_accessor :decode_table_delimiter

      # Value returned when the code is not found in the table
      attr_accessor :default_value

      # Initialize the transformer
      #
      # Configuration options:
      # * :decode_table_path: The path to the decode table (defaults to 'decode.txt');
      #   when given, it is resolved relative to the control file's directory
      # * :decode_table_delimiter: The decode table delimiter (defaults to ':')
      # * :default_value: The default value to use (defaults to 'No Value')
      def initialize(control, name, configuration={})
        super

        if configuration[:decode_table_path]
          configuration[:decode_table_path] = File.join(File.dirname(control.file), configuration[:decode_table_path])
        end

        @decode_table_path = (configuration[:decode_table_path] || 'decode.txt')
        @decode_table_delimiter = (configuration[:decode_table_delimiter] || ':')
        @default_value = (configuration[:default_value] || 'No Value')
      end

      # Transform the value: return its decoded form, or the default value
      # when the code is absent from the table.
      def transform(name, value, row)
        decode_table[value] || default_value
      end

      # Get the decode table, lazily loading and memoizing it on first use.
      def decode_table
        unless @decode_table
          @decode_table = {}
          # File.foreach (rather than Kernel#open) closes the handle when done
          # and cannot be tricked into spawning a subprocess by a path that
          # begins with a pipe character.
          File.foreach(decode_table_path) do |line|
            code, value = line.strip.split(decode_table_delimiter)
            if code && code.length > 0
              @decode_table[code] = value
            else
              # A line with an empty code overrides the configured default.
              @default_value = value
            end
          end
        end
        @decode_table
      end
    end
  end
end
8 | class HierarchyLookupTransform < ETL::Transform::Transform 9 | # The name of the field to use for the parent ID 10 | attr_accessor :parent_id_field 11 | 12 | # The target connection name 13 | attr_accessor :target 14 | 15 | # Initialize the transform 16 | # 17 | # Configuration options: 18 | # * :target: The target connection name (required) 19 | # * :parent_id_field: The name of the field to use for the parent ID (defaults to :parent_id) 20 | def initialize(control, name, configuration={}) 21 | super 22 | @parent_id_field = configuration[:parent_id_field] || :parent_id 23 | @target = configuration[:target] 24 | end 25 | 26 | # Transform the value. 27 | def transform(name, value, row) 28 | if parent_id = row[parent_id_field] 29 | # TODO: should use more than just the first source out of the control 30 | parent_id, value = lookup(name, 31 | control.sources.first.configuration[:table], parent_id, parent_id_field) 32 | until value || parent_id.nil? 33 | # TODO: should use more than just the first source out of the control 34 | parent_id, value = lookup(name, 35 | control.sources.first.configuration[:table], parent_id, parent_id_field) 36 | end 37 | end 38 | value 39 | end 40 | 41 | # Lookup the parent value. 42 | def lookup(field, table, parent_id, parent_id_field) 43 | q = "SELECT #{parent_id_field}, #{field} FROM #{table} WHERE id = #{parent_id}" 44 | row = ETL::Engine.connection(target).select_one(q) 45 | return row[parent_id_field.to_s], row[field.to_s] 46 | end 47 | end 48 | end 49 | end -------------------------------------------------------------------------------- /lib/etl/transform/md5_transform.rb: -------------------------------------------------------------------------------- 1 | require 'digest/md5' 2 | 3 | module ETL #:nodoc: 4 | module Transform #:nodoc: 5 | # Transform which hashes the original value with a MD5 hash algorithm 6 | class Md5Transform < ETL::Transform::Transform 7 | # Transform the value with a MD5 digest algorithm. 
8 | def transform(name, value, row) 9 | Digest::MD5.hexdigest(value) 10 | end 11 | end 12 | end 13 | end 14 | -------------------------------------------------------------------------------- /lib/etl/transform/ordinalize_transform.rb: -------------------------------------------------------------------------------- 1 | require 'active_support/core_ext/integer/inflections.rb' 2 | 3 | module ETL #:nodoc: 4 | module Transform #:nodoc: 5 | # Transform a number to an ordinalized version using the ActiveSupport ordinalize 6 | # core extension 7 | class OrdinalizeTransform < ETL::Transform::Transform 8 | # Transform the value from a number to an ordinalized number 9 | def transform(name, value, row) 10 | value.ordinalize 11 | end 12 | end 13 | end 14 | end 15 | -------------------------------------------------------------------------------- /lib/etl/transform/sha1_transform.rb: -------------------------------------------------------------------------------- 1 | require 'digest/sha1' 2 | 3 | module ETL #:nodoc: 4 | module Transform #:nodoc: 5 | # Transform which hashes the original value with a SHA-1 hash algorithm 6 | class Sha1Transform < ETL::Transform::Transform 7 | # Transform the value with a SHA1 digest algorithm. 8 | def transform(name, value, row) 9 | Digest::SHA1.hexdigest(value) 10 | end 11 | end 12 | end 13 | end -------------------------------------------------------------------------------- /lib/etl/transform/split_fields_transform.rb: -------------------------------------------------------------------------------- 1 | module ETL 2 | module Transform 3 | class SplitFieldsTransform < ETL::Transform::Transform 4 | attr_reader :delimiter 5 | attr_reader :new_fields 6 | 7 | def initialize(control, name, configuration) 8 | @delimiter = configuration[:delimiter] || ',' 9 | @new_fields = configuration[:new_fields] 10 | super 11 | end 12 | 13 | def transform(name, value, row) 14 | return nil if row.nil? 15 | return nil if row[name].nil? 
16 | 17 | fields = row[name].split(@delimiter) 18 | @new_fields.each_with_index do |new, index| 19 | row[new] = fields[index] 20 | end 21 | 22 | row[name] 23 | end 24 | 25 | end 26 | end 27 | end 28 | -------------------------------------------------------------------------------- /lib/etl/transform/string_to_date_time_transform.rb: -------------------------------------------------------------------------------- 1 | module ETL #:nodoc: 2 | module Transform #:nodoc: 3 | # Transform a String representation of a date to a DateTime instance 4 | class StringToDateTimeTransform < ETL::Transform::Transform 5 | # Transform the value using DateTime.parse. 6 | # 7 | # WARNING: This transform is slow (due to the Ruby implementation), but if you need to 8 | # parse timestamps before or after the values supported by the Time.parse. 9 | def transform(name, value, row) 10 | DateTime.parse(value) unless value.nil? 11 | end 12 | end 13 | end 14 | end -------------------------------------------------------------------------------- /lib/etl/transform/string_to_date_transform.rb: -------------------------------------------------------------------------------- 1 | module ETL #:nodoc: 2 | module Transform #:nodoc: 3 | # Transform a String representation of a date to a Date instance 4 | class StringToDateTransform < ETL::Transform::Transform 5 | # Transform the value using Date.parse 6 | def transform(name, value, row) 7 | return value if value.nil? 
module ETL#:nodoc:
  module Transform#:nodoc:
    # Base class for transforms.
    #
    # A transform converts one value to another value using some sort of algorithm.
    #
    # A simple transform has two arguments, the field to transform and the name of the transform:
    #
    #   transform :ssn, :sha1
    #
    # Transforms can also be blocks:
    #
    #   transform(:ssn){ |v| v[0,24] }
    #
    # Finally, a transform can include a configuration hash:
    #
    #   transform :sex, :decode, {:decode_table_path => 'delimited_decode.txt'}
    class Transform
      class << self
        # Run +value+ through each entry of +transforms+ in order and return
        # the final result. Each entry must be a Proc (called with the single
        # array argument [name, value, row]) or a Transform instance (its
        # #transform method is called); anything else raises ControlError.
        # Wall-clock time per transform class is accumulated in #benchmarks.
        def transform(name, value, row, transforms)
          transforms.inject(value) do |current, step|
            benchmarks[step.class] ||= 0
            result = nil
            benchmarks[step.class] += Benchmark.realtime do
              Engine.logger.debug "Transforming field #{name} with #{step.inspect}"
              result =
                case step
                when Proc
                  step.call([name, current, row])
                when Transform
                  step.transform(name, current, row)
                else
                  raise ControlError, "Unsupported transform configuration type: #{step}"
                end
            end
            result
          end
        end

        # Hash mapping transform class => accumulated seconds spent in it.
        def benchmarks
          @benchmarks ||= {}
        end
      end

      attr_reader :control, :name, :configuration

      # Initialize the transform object with the given control object, field
      # name and configuration hash.
      def initialize(control, name, configuration={})
        @control = control
        @name = name
        @configuration = configuration
      end

      # Subclasses must override this; the base implementation always raises.
      def transform(name, value, row)
        raise "transform is an abstract method"
      end
    end
  end
end
Default is :both 7 | def initialize(control, name, configuration={}) 8 | super 9 | @type = (configuration[:type] || :both).to_sym 10 | end 11 | # Transform the value 12 | def transform(name, value, row) 13 | case @type 14 | when :left 15 | value.lstrip 16 | when :right 17 | value.rstrip 18 | when :both 19 | value.strip 20 | else 21 | raise "Trim type, if specified, must be :left, :right or :both" 22 | end 23 | end 24 | end 25 | end 26 | end -------------------------------------------------------------------------------- /lib/etl/transform/type_transform.rb: -------------------------------------------------------------------------------- 1 | module ETL #:nodoc: 2 | module Transform #:nodoc: 3 | # Transform from one type to another 4 | class TypeTransform < ETL::Transform::Transform 5 | # Initialize the transformer. 6 | # 7 | # Configuration options: 8 | # * :type: The type to convert to. Supported types: 9 | # ** :string 10 | # ** :number,:integer 11 | # ** :float 12 | # ** :decimal 13 | def initialize(control, name, configuration={}) 14 | super 15 | @type = configuration[:type] 16 | @significant = configuration[:significant] ||= 0 17 | end 18 | # Transform the value 19 | def transform(name, value, row) 20 | case @type 21 | when :string 22 | value.to_s 23 | when :number, :integer 24 | value.to_i 25 | when :float 26 | value.to_f 27 | when :decimal 28 | BigDecimal.new(value.to_s, @significant) 29 | else 30 | raise "Unsupported type: #{@type}" 31 | end 32 | end 33 | end 34 | end 35 | end -------------------------------------------------------------------------------- /lib/etl/util.rb: -------------------------------------------------------------------------------- 1 | module ETL 2 | module Util 3 | # Return the distance of time in words from the given from_time to the specified to_time. If to_time 4 | # is not specified then Time.now is used. By default seconds are included...set the include_seconds 5 | # argument to false to disable the seconds. 
module ETL
  module Util
    # Return the distance of time in words from the given from_time to the
    # specified to_time. If to_time is not specified then Time.now is used.
    # The seconds component is always included, e.g.
    # "2 days, 3 hours, 4 minutes, 5 seconds"; zero-valued leading parts are
    # omitted.
    def distance_of_time_in_words(from_time, to_time=Time.now)
      from_time = from_time.to_time if from_time.respond_to?(:to_time)
      to_time = to_time.to_time if to_time.respond_to?(:to_time)
      seconds = (to_time - from_time).round
      # Successively peel off days, hours and minutes; divmod yields
      # [quotient, remainder], leaving the leftover seconds each step.
      distance_in_days, seconds = seconds.divmod(60 * 60 * 24)
      distance_in_hours, seconds = seconds.divmod(60 * 60)
      distance_in_minutes, distance_in_seconds = seconds.divmod(60)

      s = ''
      # Trailing ", " on the days part fixes the original's missing space,
      # matching the separator used by the hours and minutes parts.
      s << "#{distance_in_days} days, " if distance_in_days > 0
      s << "#{distance_in_hours} hours, " if distance_in_hours > 0
      s << "#{distance_in_minutes} minutes, " if distance_in_minutes > 0
      s << "#{distance_in_seconds} seconds"
      s
    end

    # Get the approximate distance of time in words from the given from_time
    # to the given to_time. If to_time is not specified then it is set
    # to Time.now. By default seconds are included...set the include_seconds
    # argument to false to disable the seconds.
    def approximate_distance_of_time_in_words(from_time, to_time=Time.now, include_seconds=true)
      from_time = from_time.to_time if from_time.respond_to?(:to_time)
      to_time = to_time.to_time if to_time.respond_to?(:to_time)
      distance_in_minutes = (((to_time - from_time).abs)/60).round
      distance_in_seconds = ((to_time - from_time).abs).round

      case distance_in_minutes
      when 0..1
        # Without seconds, anything under ~1.5 minutes collapses to two buckets.
        return (distance_in_minutes == 0) ? 'less than a minute' : '1 minute' unless include_seconds
        case distance_in_seconds
        when 0..4   then 'less than 5 seconds'
        when 5..9   then 'less than 10 seconds'
        when 10..19 then 'less than 20 seconds'
        when 20..39 then 'half a minute'
        when 40..59 then 'less than a minute'
        else             '1 minute'
        end
      when 2..44           then "#{distance_in_minutes} minutes"
      when 45..89          then 'about 1 hour'
      when 90..1439        then "about #{(distance_in_minutes.to_f / 60.0).round} hours"
      when 1440..2879      then '1 day'
      when 2880..43199     then "#{(distance_in_minutes / 1440).round} days"
      when 43200..86399    then 'about 1 month'
      when 86400..525959   then "#{(distance_in_minutes / 43200).round} months"
      when 525960..1051919 then 'about 1 year'
      else                      "over #{(distance_in_minutes / 525960).round} years"
      end
    end
  end
end
3 | 4 | use_temp_tables 5 | run 'batched1.ctl' 6 | run 'batched2.ctl' -------------------------------------------------------------------------------- /test/apache_combined_log.ctl: -------------------------------------------------------------------------------- 1 | source :in, { 2 | :file => 'data/apache_combined_log.txt', 3 | :parser => :apache_combined_log 4 | } 5 | 6 | destination :out, { 7 | :file => 'output/apache_combined_log.txt' 8 | }, 9 | { 10 | :order => [] 11 | } -------------------------------------------------------------------------------- /test/batch_test.rb: -------------------------------------------------------------------------------- 1 | require File.dirname(__FILE__) + '/test_helper' 2 | 3 | class BatchTest < Test::Unit::TestCase 4 | attr_reader :file, :db_yaml, :engine 5 | def setup 6 | @file = File.dirname(__FILE__) + '/all.ebf' 7 | @db_yaml = File.dirname(__FILE__) + '/database.yml' 8 | @engine = ETL::Engine.new 9 | end 10 | def teardown 11 | 12 | end 13 | def test_etl_batch_file 14 | #`etl #{file} -c #{db_yaml}` 15 | end 16 | def test_batch 17 | assert_nothing_raised do 18 | batch = ETL::Batch::Batch.resolve(file, engine) 19 | batch.execute 20 | end 21 | end 22 | def test_batch_with_file 23 | assert_nothing_raised do 24 | batch = ETL::Batch::Batch.resolve(File.new(file), engine) 25 | batch.execute 26 | end 27 | end 28 | def test_batch_with_batch_object 29 | assert_nothing_raised do 30 | batch_instance = ETL::Batch::Batch.new(File.new(file)) 31 | batch_instance.engine = engine 32 | batch = ETL::Batch::Batch.resolve(batch_instance, engine) 33 | batch.execute 34 | end 35 | end 36 | def test_batch_with_object_should_fail 37 | assert_raise(RuntimeError) do 38 | batch = ETL::Batch::Batch.resolve(0, engine) 39 | end 40 | end 41 | end -------------------------------------------------------------------------------- /test/batch_with_error.ebf: -------------------------------------------------------------------------------- 1 | # This is an ETL Batch 
File and defines a means for executing 2 | # a collection of ETL scripts as a single process. 3 | 4 | use_temp_tables 5 | run 'delimited_with_bulk_load.ctl' 6 | run 'screen_test_fatal.ctl' -------------------------------------------------------------------------------- /test/batched1.ctl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/activewarehouse/activewarehouse-etl/0b0b50e140ed02081b3ed1de902f78308ed738a5/test/batched1.ctl -------------------------------------------------------------------------------- /test/batched2.ctl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/activewarehouse/activewarehouse-etl/0b0b50e140ed02081b3ed1de902f78308ed738a5/test/batched2.ctl -------------------------------------------------------------------------------- /test/block_processor.ctl: -------------------------------------------------------------------------------- 1 | source :in, { :type => :mock, :name => :block_processed_input } 2 | 3 | after_read { |row| row[:added_by_after_read] = "after-" +row[:first_name]; row } 4 | before_write { |row| row[:added_by_before_write] = "Row #{Engine.current_source_row}"; [row,{:new_row => 'added by post_processor'}] } 5 | 6 | destination :out, { :type => :mock, :name => :block_processed_output } -------------------------------------------------------------------------------- /test/block_processor_error.ctl: -------------------------------------------------------------------------------- 1 | pre_process { raise ControlError.new( "Cough!") } -------------------------------------------------------------------------------- /test/block_processor_pre_post_process.ctl: -------------------------------------------------------------------------------- 1 | source :in, { :type => :mock, :name => :another_input } 2 | pre_process { TestWitness.call("I'm called from pre_process") } 3 | post_process { TestWitness.call("I'm 
require File.dirname(__FILE__) + '/test_helper'
include ETL
include ETL::Control

# Stand-in collaborator whose .call invocations are mocked below to observe
# pre_process/post_process block execution.
class TestWitness
end

# Exercises the block processor directives (after_read, before_write,
# pre_process, post_process) via the block_processor*.ctl fixtures, using
# MockSource/MockDestination to feed and capture rows.
class BlockProcessorTest < Test::Unit::TestCase

  # after_read should enrich each row; before_write should both enrich the
  # row and emit an extra row (returning [row, extra] doubles the output).
  def test_block_processor_should_work_as_both_after_read_and_before_write_row_processor
    MockSource[:block_processed_input] = [{ :first_name => 'John'},{:first_name => 'Gary'}]
    process 'block_processor.ctl'
    assert_equal 4, MockDestination[:block_processed_output].size
    assert_equal({ :first_name => 'John', :added_by_after_read => 'after-John', :added_by_before_write => "Row 1" }, MockDestination[:block_processed_output][0])
    assert_equal({ :new_row => 'added by post_processor' }, MockDestination[:block_processed_output][1])
    assert_equal({ :first_name => 'Gary', :added_by_after_read => 'after-Gary', :added_by_before_write => "Row 2" }, MockDestination[:block_processed_output][2])
    assert_equal({ :new_row => 'added by post_processor' }, MockDestination[:block_processed_output][3])
  end

  # A before_write block returning nil should drop the row from the output.
  def test_block_processor_should_let_rows_be_removed_by_setting_it_to_nil
    MockSource[:block_input] = [{ :obsolete => true, :name => 'John'},{ :obsolete => false, :name => 'Gary'}]
    process 'block_processor_remove_rows.ctl'
    assert_equal([{ :obsolete => false, :name => 'Gary' }], MockDestination[:block_output]) # only one record should be kept
  end

  # pre_process and post_process blocks should each run exactly once around
  # the pipeline, leaving the rows themselves untouched.
  def test_block_processor_should_work_as_pre_or_post_processor
    flexmock(TestWitness).should_receive(:call).with("I'm called from pre_process")
    flexmock(TestWitness).should_receive(:call).with("I'm called from post_process")
    MockSource[:another_input] = [{ :obsolete => true, :name => 'John'},{ :obsolete => false, :name => 'Gary'}]
    process 'block_processor_pre_post_process.ctl'
    assert_equal(MockSource[:another_input], MockDestination[:another_output])
  end

  # An error raised inside a block should propagate out of process().
  def test_block_error_should_be_propagated
    assert_raise(ControlError) { process 'block_processor_error.ctl' }
  end

end
end 35 | 36 | assert_equal 'target must be specified', error.message 37 | end 38 | 39 | should "bypass checking if the table has no rows" do 40 | Person.delete_all 41 | 42 | processor = ETL::Processor::CheckExistProcessor.new(nil, @config) 43 | assert_equal false, processor.should_check? 44 | end 45 | 46 | should "raise an error if one of the keys used for checking existence is not available in a row" do 47 | Person.delete_all 48 | # we need at least one record to avoid automatic skipping 49 | # this should be mocked instead, probably 50 | Person.create!(:first_name => 'John', :last_name => 'Barry', :ssn => '1234') 51 | 52 | error = assert_raise(ETL::ControlError) do 53 | row = ETL::Row[:first_name => 'John'] 54 | processor = ETL::Processor::CheckExistProcessor.new(nil, @config) 55 | 56 | # guard against bypassing 57 | assert_equal true, processor.should_check? 58 | 59 | processor.process(row) 60 | end 61 | 62 | assert_equal "Row missing required field :last_name for existence check", error.message 63 | end 64 | 65 | should "return nil if the same row is found in database" do 66 | Person.delete_all 67 | Person.create!(:first_name => 'John', :last_name => 'Barry', :ssn => '1234') 68 | 69 | row = ETL::Row[:first_name => 'John', :last_name => 'Barry'] 70 | processor = ETL::Processor::CheckExistProcessor.new(nil, @config) 71 | assert_equal true, processor.should_check? # guard against bypassing 72 | 73 | assert_equal nil, processor.process(row) 74 | end 75 | 76 | should "return the row if no same row is found in database" do 77 | Person.delete_all 78 | Person.create!(:first_name => 'John', :last_name => 'Barry', :ssn => '1234') 79 | 80 | row = ETL::Row[:first_name => 'John', :last_name => 'OtherName'] 81 | processor = ETL::Processor::CheckExistProcessor.new(nil, @config) 82 | assert_equal true, processor.should_check? 
# guard against bypassing 83 | 84 | assert_equal row, processor.process(row) 85 | end 86 | 87 | end 88 | 89 | end 90 | -------------------------------------------------------------------------------- /test/check_unique_processor_test.rb: -------------------------------------------------------------------------------- 1 | require File.dirname(__FILE__) + '/test_helper' 2 | 3 | class CheckUniqueProcessorTest < Test::Unit::TestCase 4 | 5 | context 'CheckUniqueProcessor' do 6 | attr_reader :processor 7 | 8 | setup do 9 | @processor = ETL::Processor::CheckUniqueProcessor.new(nil, 10 | :keys => [:first, :second]) 11 | end 12 | 13 | should "keep a row whose keys didn't already appear in the pipeline" do 14 | row = ETL::Row[:first => 'A', :second => 'B'] 15 | 16 | assert_equal row, processor.process(row) 17 | 18 | assert_equal({ 'A|B' => 1 }, processor.compound_key_constraints) 19 | end 20 | 21 | should "remove a row whose keys already appeared in the pipeline" do 22 | row = ETL::Row[:first => 'A', :second => 'B'] 23 | 24 | assert_equal row, processor.process(row) 25 | assert_equal nil, processor.process(row) 26 | end 27 | 28 | should "raise an error if a row lacks one of the keys specified" do 29 | row = ETL::Row[:first => 'A'] 30 | 31 | error = assert_raises(ETL::ControlError) do 32 | processor.process(row) 33 | end 34 | 35 | assert_equal "Row missing required field :second for unicity check", error.message 36 | end 37 | 38 | end 39 | 40 | end 41 | -------------------------------------------------------------------------------- /test/config/.gitignore: -------------------------------------------------------------------------------- 1 | *.lock -------------------------------------------------------------------------------- /test/config/database.yml: -------------------------------------------------------------------------------- 1 | <% raise "ENV['DB'] not specified!" 
unless ENV['DB'] %> 2 | 3 | # a bit hackish - tests would require a refactoring instead 4 | 5 | mysql2: &mysql2 6 | host: 127.0.0.1 7 | adapter: mysql2 8 | database: activewarehouse_etl_test 9 | username: root 10 | encoding: utf8 11 | local_infile: true 12 | # the tests would require a rework: disabling casting for now 13 | cast: false 14 | 15 | postgresql: &postgresql 16 | adapter: postgresql 17 | database: activewarehouse_etl_test 18 | username: postgres 19 | 20 | # TODO - refactor test to avoid using 2 databases maybe? 21 | operational_database: 22 | <<: *<%= ENV['DB'] %> 23 | 24 | data_warehouse: 25 | <<: *<%= ENV['DB'] %> 26 | 27 | etl_execution: 28 | <<: *<%= ENV['DB'] %> 29 | database: etl_execution 30 | -------------------------------------------------------------------------------- /test/config/gemfiles/Gemfile.rails-3.0.x: -------------------------------------------------------------------------------- 1 | require File.dirname(__FILE__) + '/common' 2 | 3 | declare_gems '~> 3.0.20' 4 | -------------------------------------------------------------------------------- /test/config/gemfiles/Gemfile.rails-3.1.x: -------------------------------------------------------------------------------- 1 | require File.dirname(__FILE__) + '/common' 2 | 3 | declare_gems '~> 3.1.12' 4 | -------------------------------------------------------------------------------- /test/config/gemfiles/Gemfile.rails-3.2.x: -------------------------------------------------------------------------------- 1 | require File.dirname(__FILE__) + '/common' 2 | 3 | declare_gems '~> 3.2.14' 4 | -------------------------------------------------------------------------------- /test/config/gemfiles/Gemfile.rails-4.0.x: -------------------------------------------------------------------------------- 1 | require File.dirname(__FILE__) + '/common' 2 | 3 | declare_gems '~> 4.0.0' 4 | -------------------------------------------------------------------------------- /test/config/gemfiles/common.rb: 
# Shared Gemfile logic for the per-Rails-version test Gemfiles
# (Gemfile.rails-3.0.x etc.).
#
# activerecord_version is a Bundler requirement string such as '~> 3.0.20'.
def declare_gems(activerecord_version)
  source "https://rubygems.org"

  gem 'activerecord', activerecord_version
  gem 'adapter_extensions', :git => 'https://github.com/activewarehouse/adapter_extensions.git'

  # Compare as versions, not strings. The raw requirement begins with
  # '~> ', and '~' sorts after '3', so the previous String comparison
  # (activerecord_version < '3.1') was always false and the Rails 3.0
  # Gemfile never received the legacy mysql2 pin.
  numeric_version = activerecord_version[/\d+(\.\d+)*/]
  if Gem::Version.new(numeric_version) < Gem::Version.new('3.1')
    gem 'mysql2', '< 0.3'
  else
    # use our own fork for bulk load support until issue fixed:
    # https://github.com/brianmario/mysql2/pull/242
    gem 'mysql2', :git => 'https://github.com/activewarehouse/mysql2.git'
  end

  gem 'pg'
  gem 'activerecord-sqlserver-adapter'

  gem 'awesome_print'
  gem 'rake'
  gem 'flexmock'
  gem 'shoulda', '3.0.1'
  gem 'sqlite3'

  gem 'spreadsheet'
  gem 'nokogiri'
  gem 'fastercsv'

  gem 'roo'

  gem 'standalone_migrations', '1.0.5'
end
5 | def test_parse 6 | assert_nothing_raised do 7 | Dir.glob(File.join(File.dirname(__FILE__), '*.ctl')) do |f| 8 | ETL::Control::Control.parse(f) 9 | end 10 | end 11 | end 12 | 13 | def test_bad_control_raises_error 14 | assert_raise ETL::ControlError do 15 | ETL::Control::Control.resolve(0) 16 | end 17 | end 18 | 19 | def test_resolve_control_object 20 | assert_nothing_raised do 21 | ETL::Control::Control.resolve(ETL::Control::Control.parse(File.join(File.dirname(__FILE__), 'delimited.ctl'))) 22 | end 23 | end 24 | 25 | def test_set_error_threshold 26 | assert_nothing_raised do 27 | ETL::Engine.process(File.join(File.dirname(__FILE__), 'errors.ctl')) 28 | end 29 | end 30 | 31 | def test_bad_processor_name 32 | assert_raise ETL::ControlError do 33 | s = "before_write :chunky_monkey" 34 | ETL::Control::Control.parse_text(s) 35 | end 36 | end 37 | 38 | def test_dependencies 39 | s = "depends_on 'foo', 'bar'" 40 | control = ETL::Control::Control.parse_text(s) 41 | assert_equal control.dependencies, ['foo','bar'] 42 | end 43 | end -------------------------------------------------------------------------------- /test/data/apache_combined_log.txt: -------------------------------------------------------------------------------- 1 | 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 "http://www.example.com/start.html" "Mozilla/4.08 [en] (Win98; I ;Nav)" 2 | 127.0.0.1 - bob [11/Oct/2000:05:22:02 -0700] "GET /apache_pb.gif HTTP/1.1" 200 2326 "http://www.foo.com/" "Mozilla/4.08 [en] (Win98; I ;Nav)" 3 | 127.0.0.1 - bob [11/Oct/2000:05:52:31 -0700] "GET /apache_pb.gif HTTP/1.1" 200 2326 "-" "Mozilla/4.08 [en] (Win98; I ;Nav)" 4 | -------------------------------------------------------------------------------- /test/data/bulk_import.txt: -------------------------------------------------------------------------------- 1 | 1,Chris,Smith,111223333 2 | 2,Jim,Foxworthy,444332222 3 | 3,Brian,Collingsworth,123443435 
-------------------------------------------------------------------------------- /test/data/bulk_import_with_empties.txt: -------------------------------------------------------------------------------- 1 | 1,Chris,Smith,111223333 2 | 2,Jim,,444332222 3 | 3,Brian,Collingsworth,123443435 -------------------------------------------------------------------------------- /test/data/decode.txt: -------------------------------------------------------------------------------- 1 | M:Male 2 | F:Female 3 | :Unknown -------------------------------------------------------------------------------- /test/data/delimited.txt: -------------------------------------------------------------------------------- 1 | Chris,Smith,111223333,24,M 2 | Jim,Foxworthy,444332222,51,M 3 | Brian,Collingsworth,123443435,10,M -------------------------------------------------------------------------------- /test/data/encode_source_latin1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/activewarehouse/activewarehouse-etl/0b0b50e140ed02081b3ed1de902f78308ed738a5/test/data/encode_source_latin1.txt -------------------------------------------------------------------------------- /test/data/excel.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/activewarehouse/activewarehouse-etl/0b0b50e140ed02081b3ed1de902f78308ed738a5/test/data/excel.xls -------------------------------------------------------------------------------- /test/data/excel2.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/activewarehouse/activewarehouse-etl/0b0b50e140ed02081b3ed1de902f78308ed738a5/test/data/excel2.xls -------------------------------------------------------------------------------- /test/data/fixed_width.txt: -------------------------------------------------------------------------------- 1 | Bob Smith 12344555523 2 | 
Jane Doe 98766211145 3 | AbcdefghiJklmnopqrstu12345678920 -------------------------------------------------------------------------------- /test/data/multiple_delimited_1.txt: -------------------------------------------------------------------------------- 1 | Chris,Smith,111223333,24 2 | Jim,Foxworthy,444332222,51 3 | Brian,Collingsworth,123443435,10 -------------------------------------------------------------------------------- /test/data/multiple_delimited_2.txt: -------------------------------------------------------------------------------- 1 | Bob,Jones,444223333,28 2 | Tom,Allen,324001232,33 3 | Jesse,Baker,555443333,21 -------------------------------------------------------------------------------- /test/data/nokogiri.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Bob 6 | Smith 7 | bsmith@foo.com 8 | 9 | brown 10 | black 11 | fair 12 | 13 | 24 14 | 15 | 16 | Jane 17 | Doe 18 | jdoe@bar.com 19 | 20 | blue 21 | blond 22 | medium 23 | 24 | 45 25 | 26 | 27 | Jake 28 | Smithsonian 29 | jake@example.com 30 | 31 | brown 32 | black 33 | dark 34 | 35 | 37 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /test/data/people.txt: -------------------------------------------------------------------------------- 1 | Bob,Smith 2 | Jane,Doe 3 | Chris,Cornell -------------------------------------------------------------------------------- /test/data/sax.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Smith 8 | 123456789 9 | 10 | 11 | 12 | Doe 13 | 222114545 14 | 15 | -------------------------------------------------------------------------------- /test/data/xml.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Bob 6 | Smith 7 | 123456789 8 | 24 9 | 10 | 11 | John 12 | Doe 13 | 222114545 14 | 31 15 | 16 | 
require File.dirname(__FILE__) + '/test_helper'

# Tests for DatabaseJoinProcessor, which enriches a row with the result of a
# SQL query run against a configured connection target.
class DatabaseJoinProcessorTest < Test::Unit::TestCase

  # Build a processor with no control file, only an options hash.
  def new_processor(options)
    ETL::Processor::DatabaseJoinProcessor.new(nil, options)
  end

  # NOTE: description fixed — it previously claimed :fields was being
  # checked, but the assertion (and the processor) validate :target first.
  should 'raise an error unless :target is specified' do
    error = assert_raises(ETL::ControlError) { new_processor({}) }
    assert_equal ":target must be specified", error.message
  end

  should 'return the row and the database result' do
    row = ETL::Row[:id => 1, :first_name => 'Bob', :last_name => 'Smith', :ssn => '111234444']
    control = ETL::Control::Control.parse(File.dirname(__FILE__) +
      '/delimited.ctl')

    Person.delete_all
    assert_equal 0, Person.count

    # First define a basic configuration to check defaults
    configuration = {
      :target => :data_warehouse,
      :database => 'etl_unittest',
      :table => 'people',
      :buffer_size => 0
    }
    mapping = { :order => [:id, :first_name, :last_name, :ssn] }
    dest = ETL::Control::DatabaseDestination.new(control, configuration, mapping)
    dest.write(row)
    dest.close

    assert_equal 1, Person.find(:all).length

    # The query is interpolated per-row at process time, hence the
    # escaped \#{...} so the control DSL (not this file) evaluates it.
    row = ETL::Row[:last_name => "Smith"]
    processor = new_processor(:target => :data_warehouse,
      :query => "SELECT first_name FROM people WHERE last_name = \#{connection.quote(row[:last_name])}",
      :fields => ["first_name"]).process(row)
    assert_equal row[:first_name], "Bob"
  end

end
Test::Unit::TestCase 4 | 5 | context "the DateDimensionBuilder" do 6 | context "when initialized with defaults" do 7 | setup do 8 | @builder = ETL::Builder::DateDimensionBuilder.new 9 | end 10 | should "have a start date of 5 years ago" do 11 | assert_equal Time.now.years_ago(5).to_date, @builder.start_date.to_date 12 | end 13 | should "have an end date of now" do 14 | assert_equal Time.now.to_date, @builder.end_date.to_date 15 | end 16 | should "have an empty of array of holiday indicators" do 17 | assert_equal [], @builder.holiday_indicators 18 | end 19 | end 20 | context "when initialized with arguments" do 21 | setup do 22 | @start_date = Time.now.years_ago(2) 23 | @end_date = Time.now.years_ago(1) 24 | @builder = ETL::Builder::DateDimensionBuilder.new(@start_date, @end_date) 25 | end 26 | should "respect a custom start date" do 27 | assert_equal @start_date.to_date, @builder.start_date.to_date 28 | end 29 | should "respect a custom end date" do 30 | assert_equal @end_date.to_date, @builder.end_date.to_date 31 | end 32 | end 33 | context "when building a date dimension using the default settings" do 34 | setup do 35 | # specific dates required when testing, because leap years affect 36 | # how many records are built 37 | @start_date = Date.parse('2002-05-19').to_time 38 | @end_date = Date.parse('2007-05-19').to_time 39 | @builder = ETL::Builder::DateDimensionBuilder.new(@start_date, @end_date) 40 | @records = @builder.build 41 | end 42 | should "build a dimension with the correct number of records" do 43 | assert_equal 1827, @records.length 44 | end 45 | should "have the correct first date" do 46 | assert_date_dimension_record_equal(@builder.start_date, @records.first) 47 | end 48 | end 49 | context "when building a date dimension with a fiscal year offset month" do 50 | should_eventually "respect the fiscal year offset month" do 51 | 52 | end 53 | end 54 | end 55 | 56 | def assert_date_dimension_record_equal(date, record) 57 | real_date = date 58 | date = 
date.to_time 59 | assert_equal date.strftime("%m/%d/%Y"), record[:date] 60 | assert_equal date.strftime("%B %d,%Y"), record[:full_date_description] 61 | assert_equal date.strftime("%A"), record[:day_of_week] 62 | assert_equal date.day, record[:day_number_in_calendar_month] 63 | assert_equal date.yday, record[:day_number_in_calendar_year] 64 | assert_equal date.day, record[:day_number_in_fiscal_month] 65 | assert_equal date.fiscal_year_yday, record[:day_number_in_fiscal_year] 66 | assert_equal "Week #{date.week}", record[:calendar_week] 67 | assert_equal date.week, record[:calendar_week_number_in_year] 68 | assert_equal date.strftime("%B"), record[:calendar_month_name] 69 | assert_equal date.month, record[:calendar_month_number_in_year] 70 | assert_equal date.strftime("%Y-%m"), record[:calendar_year_month] 71 | assert_equal "Q#{date.quarter}", record[:calendar_quarter] 72 | assert_equal date.quarter, record[:calendar_quarter_number_in_year] 73 | assert_equal "#{date.strftime('%Y')}-#{record[:calendar_quarter]}", record[:calendar_year_quarter] 74 | assert_equal "#{date.year}", record[:calendar_year] 75 | assert_equal "FY Week #{date.fiscal_year_week}", record[:fiscal_week] 76 | assert_equal date.fiscal_year_week, record[:fiscal_week_number_in_year] 77 | assert_equal date.fiscal_year_month, record[:fiscal_month] 78 | assert_equal date.fiscal_year_month, record[:fiscal_month_number_in_year] 79 | assert_equal "FY#{date.fiscal_year}-" + date.fiscal_year_month.to_s.rjust(2, '0'), record[:fiscal_year_month] 80 | assert_equal "FY Q#{date.fiscal_year_quarter}", record[:fiscal_quarter] 81 | assert_equal "FY#{date.fiscal_year}-Q#{date.fiscal_year_quarter}", record[:fiscal_year_quarter] 82 | assert_equal date.fiscal_year_quarter, record[:fiscal_year_quarter_number] 83 | assert_equal "FY#{date.fiscal_year}", record[:fiscal_year] 84 | assert_equal date.fiscal_year, record[:fiscal_year_number] 85 | assert_equal 'Nonholiday', record[:holiday_indicator] 86 | assert_equal 
weekday_indicators[date.wday], record[:weekday_indicator] 87 | assert_equal 'None', record[:selling_season] 88 | assert_equal 'None', record[:major_event] 89 | assert_equal record[:sql_date_stamp], real_date 90 | end 91 | 92 | private 93 | def weekday_indicators 94 | ['Weekend','Weekday','Weekday','Weekday','Weekday','Weekday','Weekend'] 95 | end 96 | end -------------------------------------------------------------------------------- /test/delimited.ctl: -------------------------------------------------------------------------------- 1 | source :in, { 2 | :file => 'data/delimited.txt', 3 | :parser => { 4 | :name => :csv 5 | } 6 | }, 7 | [ 8 | :first_name, 9 | :last_name, 10 | :ssn, 11 | :age, 12 | :sex 13 | ] 14 | 15 | #transform :age, :type, :type => :number 16 | transform :ssn, :sha1 17 | transform(:ssn){ |n, v, row| v[0,24] } 18 | transform :sex, :decode, {:decode_table_path => 'data/decode.txt'} 19 | 20 | destination :out, { 21 | :file => 'output/delimited.txt' 22 | }, 23 | { 24 | :order => [:id, :first_name, :last_name, :ssn, :age, :sex, :test, :calc_test], 25 | :virtual => { 26 | :id => :surrogate_key, 27 | :test => "test!", 28 | :calc_test => Time.now 29 | }, 30 | } -------------------------------------------------------------------------------- /test/delimited_absolute.ctl: -------------------------------------------------------------------------------- 1 | source :in, { 2 | :file => '/tmp/delimited_abs.txt', 3 | :parser => { 4 | :name => :csv 5 | } 6 | }, 7 | [ 8 | :first_name, 9 | :last_name, 10 | :ssn, 11 | { 12 | :name => :age, 13 | :type => :integer 14 | }, 15 | :sex 16 | ] 17 | 18 | transform :ssn, :sha1 19 | transform(:ssn){ |n, v, row| v[0,24] } 20 | transform :sex, :decode, {:decode_table_path => 'data/decode.txt'} 21 | 22 | destination :out, { 23 | :file => 'data/delimited_abs.txt' 24 | }, 25 | { 26 | :order => [:first_name, :last_name, :ssn, :age, :sex, :test, :calc_test], 27 | :virtual => { 28 | :test => "test!", 29 | :calc_test => Time.now 30 
| } 31 | } -------------------------------------------------------------------------------- /test/delimited_destination_db.ctl: -------------------------------------------------------------------------------- 1 | source :in, { 2 | :file => 'data/delimited.txt', 3 | :parser => :csv 4 | }, 5 | [ 6 | :id, 7 | :first_name, 8 | :last_name, 9 | :ssn 10 | ] 11 | 12 | transform :ssn, :sha1 13 | transform(:ssn){ |v| v[0,24] } 14 | 15 | destination :out, { 16 | :type => :database, 17 | :target => :data_warehouse, 18 | :database => 'etl_unittest', 19 | :table => 'people', 20 | }, 21 | { 22 | :order => [:id, :first_name, :last_name, :ssn] 23 | } -------------------------------------------------------------------------------- /test/delimited_excel.ctl: -------------------------------------------------------------------------------- 1 | source :in, { 2 | :file => 'data/delimited.txt', 3 | :parser => { 4 | :name => :csv 5 | } 6 | }, 7 | [ 8 | :first_name, 9 | :last_name, 10 | :ssn, 11 | :age, 12 | :sex 13 | ] 14 | 15 | #transform :age, :type, :type => :number 16 | transform :ssn, :sha1 17 | transform(:ssn){ |n, v, row| v[0,24] } 18 | transform :sex, :decode, {:decode_table_path => 'data/decode.txt'} 19 | 20 | destination :out, { 21 | :type => :excel, 22 | :file => 'output/delimited_excel.xls' 23 | }, 24 | { 25 | :order => [:id, :first_name, :last_name, :ssn, :age, :sex, :test, :calc_test], 26 | :virtual => { 27 | :id => :surrogate_key, 28 | :test => "test!", 29 | :calc_test => Time.now 30 | }, 31 | } -------------------------------------------------------------------------------- /test/delimited_insert_update.ctl: -------------------------------------------------------------------------------- 1 | source :in, { 2 | :file => 'data/delimited.txt', 3 | :parser => { 4 | :name => :csv 5 | } 6 | }, 7 | [ 8 | :first_name, 9 | :last_name, 10 | :ssn, 11 | :age, 12 | :sex 13 | ] 14 | 15 | #transform :age, :type, :type => :number 16 | transform :ssn, :sha1 17 | transform(:ssn){ |n, v, row| 
v[0,24] } 18 | transform :sex, :decode, {:decode_table_path => 'data/decode.txt'} 19 | 20 | destination :out, { 21 | :type => :insert_update_database, 22 | :target => :data_warehouse, 23 | :database => 'etl_unittest', 24 | :table => 'people' 25 | }, 26 | { 27 | :primarykey => [:id], 28 | :order => [:id, :first_name, :last_name, :ssn, :age, :sex, :test, :calc_test], 29 | :virtual => { 30 | :id => :surrogate_key, 31 | :test => "test!", 32 | :calc_test => Time.now 33 | }, 34 | } 35 | -------------------------------------------------------------------------------- /test/delimited_update.ctl: -------------------------------------------------------------------------------- 1 | source :in, { 2 | :file => 'data/delimited.txt', 3 | :parser => { 4 | :name => :csv 5 | } 6 | }, 7 | [ 8 | :first_name, 9 | :last_name, 10 | :ssn, 11 | :age, 12 | :sex 13 | ] 14 | 15 | #transform :age, :type, :type => :number 16 | transform :ssn, :sha1 17 | transform(:ssn){ |n, v, row| v[0,24] } 18 | transform :sex, :decode, {:decode_table_path => 'data/decode.txt'} 19 | 20 | destination :out, { 21 | :type => :update_database, 22 | :target => :data_warehouse, 23 | :database => 'etl_unittest', 24 | :table => 'people' 25 | }, 26 | { 27 | :conditions => [{:field => "\#{conn.quote_column_name(:id)}", :value => "\#{conn.quote(row[:id])}", :comp => "="}], 28 | :order => [:id, :first_name, :last_name, :ssn, :age, :sex, :test, :calc_test], 29 | :virtual => { 30 | :id => :surrogate_key, 31 | :test => "test!", 32 | :calc_test => Time.now 33 | }, 34 | } 35 | -------------------------------------------------------------------------------- /test/delimited_with_bulk_load.ctl: -------------------------------------------------------------------------------- 1 | infile = 'data/people.txt' 2 | outfile = 'output/people.txt' 3 | 4 | source :in, { 5 | :file => infile, 6 | :parser => { 7 | :name => :csv 8 | } 9 | }, 10 | [ 11 | :first_name, 12 | :last_name, 13 | ] 14 | 15 | before_write :surrogate_key, :target => 
:data_warehouse, :table => 'person_dimension', :column => 'id' 16 | before_write :check_exist, { 17 | :target => :data_warehouse, 18 | :table => 'person_dimension', 19 | :columns => [:first_name, :last_name] 20 | } 21 | 22 | destination :out, { 23 | :file => outfile 24 | }, 25 | { 26 | :order => [:id, :first_name, :last_name] 27 | } 28 | 29 | post_process :bulk_import, { 30 | :file => outfile, 31 | :target => :data_warehouse, 32 | :table => 'person_dimension', 33 | :order => [:id, :first_name, :last_name] 34 | } -------------------------------------------------------------------------------- /test/directive_test.rb: -------------------------------------------------------------------------------- 1 | require File.dirname(__FILE__) + '/test_helper' 2 | 3 | class BadDirective < ETL::Batch::Directive 4 | 5 | end 6 | 7 | class BatchTest < Test::Unit::TestCase 8 | 9 | attr_reader :file 10 | attr_reader :engine 11 | def setup 12 | @file = File.dirname(__FILE__) + '/all.ebf' 13 | @engine = ETL::Engine.new 14 | end 15 | 16 | def test_directive_without_implementation_should_fail 17 | batch = ETL::Batch::Batch.resolve(file, engine) 18 | assert_raise RuntimeError do 19 | d = BadDirective.new(batch) 20 | d.execute 21 | end 22 | end 23 | end -------------------------------------------------------------------------------- /test/encode_processor_test.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | require File.dirname(__FILE__) + '/test_helper' 3 | 4 | require 'iconv' 5 | 6 | class EncodeProcessorTest < Test::Unit::TestCase 7 | 8 | SOURCE = 'data/encode_source_latin1.txt' 9 | TARGET = 'output/encode_destination_utf-8.txt' 10 | 11 | def setup 12 | @control = flexmock("control") 13 | @control.should_receive(:file).twice.and_return(File.dirname(__FILE__) + '/fake-control.ctl') 14 | end 15 | 16 | def test_should_transform_a_latin1_file_to_utf8_with_grace 17 | configuration = { :source_file => SOURCE, :source_encoding => 
'latin1', :target_file => TARGET, :target_encoding => 'utf-8' } 18 | ETL::Processor::EncodeProcessor.new(@control, configuration).process 19 | assert_equal "éphémère has accents.\nlet's encode them.", IO.read(File.join(File.dirname(__FILE__),TARGET)) 20 | end 21 | 22 | def test_should_throw_exception_on_unsupported_encoding 23 | configuration = { :source_file => SOURCE, :source_encoding => 'acme-encoding', :target_file => TARGET, :target_encoding => 'utf-8' } 24 | error = assert_raise(ETL::ControlError) { ETL::Processor::EncodeProcessor.new(@control, configuration) } 25 | assert_equal "Either the source encoding 'acme-encoding' or the target encoding 'utf-8' is not supported", error.message 26 | end 27 | 28 | def test_should_throw_exception_when_target_and_source_are_the_same 29 | configuration = { :source_file => SOURCE, :source_encoding => 'latin1', :target_file => SOURCE, :target_encoding => 'utf-8' } 30 | error = assert_raise(ETL::ControlError) { ETL::Processor::EncodeProcessor.new(@control, configuration) } 31 | assert_equal "Source and target file cannot currently point to the same file", error.message 32 | end 33 | 34 | end -------------------------------------------------------------------------------- /test/engine_test.rb: -------------------------------------------------------------------------------- 1 | require File.dirname(__FILE__) + '/test_helper' 2 | 3 | class EngineTest < Test::Unit::TestCase 4 | 5 | context 'process' do 6 | 7 | should 'raise an error when a file which does not exist is given' do 8 | error = assert_raise(Errno::ENOENT) do 9 | ETL::Engine.process('foo-bar.ctl') 10 | end 11 | 12 | assert_equal "No such file or directory - foo-bar.ctl", error.message 13 | end 14 | 15 | should 'raise an error when an unknown file type is given' do 16 | error = assert_raise(RuntimeError) do 17 | ETL::Engine.process(__FILE__) 18 | end 19 | 20 | assert_match /Unsupported file type/, error.message 21 | end 22 | 23 | should_eventually 'stop as soon as the 
error threshold is reached' do 24 | engine = ETL::Engine.new 25 | 26 | assert_equal 0, engine.errors.size 27 | 28 | engine.process ETL::Control::Control.parse_text < :enumerable, :enumerable => (1..100) } 31 | after_read { |row| raise "Failure" } 32 | CTL 33 | 34 | assert_equal 1, engine.errors.size 35 | end 36 | 37 | should 'call error callbacks' do 38 | engine = ETL::Engine.new 39 | 40 | $our_errors = [] 41 | engine.process ETL::Control::Control.parse_text < :enumerable, :enumerable => (1..100) } 43 | on_error { |error| $our_errors << error } 44 | after_read { |row| raise "Failure" } 45 | CTL 46 | assert_equal 100, $our_errors.size 47 | assert_match /on line 100: Failure$/, $our_errors.last 48 | end 49 | 50 | end 51 | 52 | context 'connection' do 53 | 54 | should 'return an ActiveRecord configuration by name' do 55 | assert_not_nil ETL::Engine.connection(:data_warehouse) 56 | end 57 | 58 | should 'raise an error on non existent connection' do 59 | error = assert_raise(ETL::ETLError) do 60 | ETL::Engine.connection(:does_not_exist) 61 | end 62 | assert_equal "Cannot find connection named :does_not_exist", error.message 63 | end 64 | 65 | should 'raise an error when requesting a connection with no name' do 66 | error = assert_raise(ETL::ETLError) do 67 | ETL::Engine.connection(" ") 68 | end 69 | assert_equal "Connection with no name requested. 
Is there a missing :target parameter somewhere?", error.message 70 | end 71 | end 72 | 73 | context 'temp tables' do 74 | attr_reader :connection 75 | 76 | setup do 77 | @connection = ETL::Engine.connection(:data_warehouse) 78 | end 79 | 80 | should 'return unmodified table name when temp tables are disabled' do 81 | assert_equal 'foo', ETL::Engine.table('foo', ETL::Engine.connection(:data_warehouse)) 82 | end 83 | 84 | should 'return temp table name instead of table name when temp tables are enabled' do 85 | ETL::Engine.use_temp_tables = true 86 | assert_equal 'tmp_people', ETL::Engine.table('people', connection) 87 | ETL::Engine.use_temp_tables = false 88 | end 89 | end 90 | 91 | end -------------------------------------------------------------------------------- /test/ensure_fields_presence_processor_test.rb: -------------------------------------------------------------------------------- 1 | require File.dirname(__FILE__) + '/test_helper' 2 | 3 | class EnsureFieldsPresenceProcessorTest < Test::Unit::TestCase 4 | 5 | def new_processor(options) 6 | ETL::Processor::EnsureFieldsPresenceProcessor.new(nil, options) 7 | end 8 | 9 | should 'raise an error unless :fields is specified' do 10 | error = assert_raises(ETL::ControlError) { new_processor({}) } 11 | assert_equal ":fields must be specified", error.message 12 | end 13 | 14 | should 'raise an error if a field is missing in the row' do 15 | error = assert_raise(ETL::ControlError) do 16 | processor = new_processor(:fields => [:key]) 17 | processor.process(ETL::Row[]) 18 | end 19 | 20 | assert_match /missing required field\(s\)/, error.message 21 | end 22 | 23 | should 'return the row if the required fields are in the row' do 24 | row = ETL::Row[:first => nil, :second => "Barry"] 25 | assert_equal row, new_processor(:fields => [:first, :second]).process(row) 26 | end 27 | 28 | should 'accept strings instead of symbols in both places' do 29 | row = ETL::Row[:first => nil, 'second' => "Barry"] 30 | assert_equal row, 
new_processor(:fields => ['first', :second]).process(row) 31 | end 32 | 33 | end 34 | -------------------------------------------------------------------------------- /test/errors.ctl: -------------------------------------------------------------------------------- 1 | class ErrorProcessor < ETL::Processor::RowProcessor 2 | def initialize(control, configuration) 3 | super 4 | end 5 | def process(row) 6 | raise RuntimeError, "Generated error" 7 | end 8 | end 9 | 10 | set_error_threshold 1 11 | 12 | source :in, { 13 | :type => :enumerable, 14 | :enumerable => [ 15 | {:first_name => 'Bob',:last_name => 'Smith'}, 16 | {:first_name => 'Joe', :last_name => 'Thompson'} 17 | ] 18 | }, 19 | [ 20 | :first_name, 21 | :last_name 22 | ] 23 | 24 | after_read ErrorProcessor -------------------------------------------------------------------------------- /test/etl_test.rb: -------------------------------------------------------------------------------- 1 | require File.dirname(__FILE__) + '/test_helper' 2 | 3 | # This is an integration test 4 | class ETLTest < Test::Unit::TestCase 5 | # Test end-to-end integration of ETL engine processing for the delimited.ctl control file 6 | def test_delimited_single_file_load 7 | #ETL::Engine.logger = Logger.new(STDOUT) 8 | #ETL::Engine.logger.level = Logger::DEBUG 9 | 10 | ETL::Engine.init(:config => File.dirname(__FILE__) + '/database.yml') 11 | ETL::Engine.process(File.dirname(__FILE__) + '/delimited.ctl') 12 | lines = open(File.dirname(__FILE__) + '/output/delimited.txt').readlines 13 | assert_equal 3, lines.length 14 | 15 | data = lines[0].split(',') 16 | assert_equal '1', data[0] 17 | assert_equal 'Chris', data[1] 18 | assert_equal 'Smith', data[2] 19 | assert_equal '23cc5914d48b146f0fbb73c4', data[3] 20 | assert_equal '24', data[4] 21 | assert_equal 'Male', data[5] 22 | assert_equal 'test!', data[6] 23 | assert_nothing_raised { Time.parse(data[7]) } 24 | 25 | data = lines[1].split(',') 26 | assert_equal '2', data[0] 27 | assert_equal 
'Jim', data[1] 28 | assert_equal 'Foxworthy', data[2] 29 | assert_equal '596e3534978b8c2b47851e37', data[3] 30 | assert_equal '51', data[4] 31 | assert_equal 'Male', data[5] 32 | assert_equal 'test!', data[6] 33 | assert_nothing_raised { Time.parse(data[7]) } 34 | end 35 | 36 | # Test end-to-end integration of ETL engine processing for the fixed_width.ctl control file 37 | def test_fixed_width_single_file_load 38 | ETL::Engine.process(File.dirname(__FILE__) + '/fixed_width.ctl') 39 | lines = open(File.dirname(__FILE__) + '/output/delimited.txt').readlines 40 | assert_equal 3, lines.length 41 | end 42 | end -------------------------------------------------------------------------------- /test/excel.ctl: -------------------------------------------------------------------------------- 1 | source :in, { 2 | :file => 'data/excel.xls', 3 | :parser => :excel 4 | }, 5 | { 6 | :ignore_blank_line => false, 7 | :fields => [ 8 | :first_name, 9 | :last_name, 10 | :ssn, 11 | :age 12 | ] 13 | } 14 | 15 | transform :ssn, :sha1 16 | transform(:ssn){ |n, v, r| v[0,24] } 17 | 18 | 19 | destination :out, { 20 | :file => 'output/excel.out.txt' 21 | }, 22 | { 23 | :order => [:first_name, :last_name, :ssn, :age] 24 | } -------------------------------------------------------------------------------- /test/excel2.ctl: -------------------------------------------------------------------------------- 1 | source :in, { 2 | :file => 'data/excel2.xls', 3 | :parser => :excel 4 | }, 5 | { 6 | :ignore_blank_line => true, 7 | :worksheets => [ 1 ], 8 | :fields => [ 9 | :first_name, 10 | :last_name, 11 | :ssn, 12 | :age 13 | ] 14 | } 15 | 16 | transform :ssn, :sha1 17 | transform(:ssn){ |n, v, r| v[0,24] } 18 | 19 | 20 | destination :out, { 21 | :file => 'output/excel2.out.txt' 22 | }, 23 | { 24 | :order => [:first_name, :last_name, :ssn, :age] 25 | } -------------------------------------------------------------------------------- /test/fixed_width.ctl: 
-------------------------------------------------------------------------------- 1 | # puts "executing fixed_width.ctl" 2 | 3 | source :in, { 4 | :file => 'data/fixed_width.txt', 5 | :parser => :fixed_width 6 | }, 7 | { 8 | :first_name => { 9 | :start => 1, 10 | :length => 9 11 | }, 12 | :last_name => { 13 | :start => 10, 14 | :length => 12 15 | }, 16 | :ssn => { 17 | :start => 22, 18 | :length => 9 19 | }, 20 | :age => { 21 | :start => 31, 22 | :length => 2, 23 | :type => :integer 24 | } 25 | } 26 | 27 | transform :ssn, :sha1 28 | transform(:ssn){ |n, v, r| v[0,24] } 29 | 30 | destination :out, { 31 | :file => 'output/fixed_width.txt' 32 | }, 33 | { 34 | :order => [:first_name, :last_name, :ssn, :age] 35 | } -------------------------------------------------------------------------------- /test/foreign_key_lookup_transform_test.rb: -------------------------------------------------------------------------------- 1 | require File.dirname(__FILE__) + '/test_helper' 2 | 3 | # TODO - use flexmock instead, but I'm not sure how to handle the respond_to part yet 4 | class TestResolver 5 | attr_accessor :cache_loaded 6 | 7 | def initialize 8 | @cache_loaded = false 9 | end 10 | 11 | def load_cache 12 | @cache_loaded = true 13 | end 14 | end 15 | 16 | class ForeignKeyLookupTransformTest < Test::Unit::TestCase 17 | 18 | context 'configuration' do 19 | 20 | should 'enable cache by default' do 21 | resolver = TestResolver.new 22 | 23 | transform = ETL::Transform::ForeignKeyLookupTransform.new(nil, 'name', 24 | {:resolver => resolver}) 25 | 26 | assert_equal true, resolver.cache_loaded 27 | end 28 | 29 | should 'allow to disable cache' do 30 | resolver = TestResolver.new 31 | 32 | transform = ETL::Transform::ForeignKeyLookupTransform.new(nil, 'name', 33 | {:resolver => resolver, :cache => false}) 34 | 35 | assert_equal false, resolver.cache_loaded 36 | end 37 | 38 | should 'allow to enable cache' do 39 | resolver = TestResolver.new 40 | 41 | transform = 
# Test generators
class GeneratorTest < Test::Unit::TestCase
  # The :surrogate_key name must resolve to SurrogateKeyGenerator, and a
  # fresh instance must hand out the sequence 1, 2, ... 10 in order.
  def test_surrogate_key_generator
    klass = ETL::Generator::Generator.class_for_name(:surrogate_key)
    assert_equal ETL::Generator::SurrogateKeyGenerator, klass

    gen = klass.new
    (1..10).each { |expected| assert_equal expected, gen.next }
  end
end
module ETL
  module Control
    # Usage:
    # - first set the data in your test setup
    #     MockSource[:my_input] = [ { :first_name => 'John', :last_name => 'Barry' }, { ...} ]
    # - then declare in the ctl file:
    #     source :in, { :type => :mock, :name => :my_input }
    class MockSource < EnumerableSource
      # Class-level registry of mock rows, keyed by mock source name.
      # Initialized up front so lookups never hit an uninitialized class
      # variable (previously a NameError if no data had been registered).
      @@registry = {}

      # Wires the registered mock data into the :enumerable configuration
      # key consumed by EnumerableSource. Raises if no data was registered
      # under the configured name.
      def initialize(control, configuration, definition)
        super
        mock_source_name = configuration[:name] || 'mock_source'
        # raise, not throw: throw is for catch/throw control flow and, with
        # no matching catch, would surface as UncaughtThrowError instead of
        # this error message.
        raise "No mock source data set for mock source '#{mock_source_name}'" if @@registry[mock_source_name].nil?
        configuration[:enumerable] = @@registry[mock_source_name]
      end

      # Register mock rows for the given source name.
      def self.[]=(mock_source_name,mock_source_data)
        @@registry[mock_source_name] = mock_source_data
      end

      # Fetch the rows registered under the given source name (nil if none).
      def self.[](mock_source_name)
        @@registry[mock_source_name]
      end
    end
  end
end
# puts "executing nokogiri_all.ctl"

# Control file: parse every person node from the Nokogiri sample document
# and write selected fields to a flat file.
source :in, {
  :file => 'data/nokogiri.xml',
  :parser => :nokogiri_xml
},
{
  # One row is produced per node matching this path.
  :collection => 'people/person',
  :fields => [
    :first_name,
    :last_name,
    {
      :name => :ssn,
      :xpath => '@ssn'           # read from the node's ssn attribute
    },
    {
      :name => :age,
      :type => :integer
    },
    {
      :name => :hair_colour,
      :xpath => 'colours/hair'   # nested element lookup
    }
  ]
}

destination :out, {
  :file => 'output/xml.txt'
},
{
  :order => [:first_name, :last_name, :ssn]
}

# Obfuscate the SSN: SHA-1 digest, then truncate to 24 hex characters.
transform :ssn, :sha1
transform(:ssn){ |v| v[0,24] }
require File.dirname(__FILE__) + '/test_helper'

# Exercises the DOM-based Nokogiri XML parser against the sample document.
class ParserTest < Test::Unit::TestCase

  # Parsing people/person with no predicate should yield every person node.
  def test_nokogiri_xml_parser_for_all_nodes
    rows = rows_for('nokogiri_all.ctl')

    assert_equal 3, rows.length
    expected_first = {
      :hair_colour => "black",
      :first_name  => "Bob",
      :last_name   => "Smith",
      :ssn         => "123456789",
      :age         => "24"
    }
    assert_equal expected_first, rows.first
  end

  # An XPath predicate on the collection should restrict the rows returned.
  def test_nokogiri_xml_parser_for_selected_nodes
    rows = rows_for('nokogiri_select.ctl')

    assert_equal 2, rows.length
    expected_last = {
      :age         => "37",
      :hair_colour => "black",
      :first_name  => "Jake",
      :last_name   => "Smithsonian",
      :ssn         => "133244566"
    }
    assert_equal expected_last, rows.last
  end

  private

  # Resolve a control file relative to this test and collect all parsed rows.
  def rows_for(control_file)
    control = ETL::Control::Control.resolve(
      File.join(File.dirname(__FILE__), control_file))
    ETL::Parser::NokogiriXmlParser.new(control.sources.first).collect { |row| row }
  end
end
require File.dirname(__FILE__) + '/test_helper'

# Test pre- and post-processors
class ProcessorTest < Test::Unit::TestCase
  # Test bulk import functionality

  context "the bulk import processor" do
    # shoulda prefixes descriptions with "should ", so the description must
    # not repeat the word (previously rendered as "should should import ...").
    should "import successfully" do
      assert_nothing_raised { do_bulk_import }
      assert_equal 3, Person.count
      assert_equal "Foxworthy", Person.find(2).last_name
    end
  end

  def test_bulk_import_with_empties
    # this test ensure that one column with empty value will still allow
    # the row to be imported
    # this doesn't apply to the id column though - untested
    assert_nothing_raised { do_bulk_import('bulk_import_with_empties.txt') }
    assert_equal 3, Person.count
    assert Person.find(2).last_name.blank?
  end

  def test_truncate
    # TODO: implement test
  end

  private

  # Run the bulk import processor against the people table using the given
  # data file; truncates the table first so the row counts asserted above
  # are deterministic across runs.
  def do_bulk_import(file = 'bulk_import.txt')
    control = ETL::Control::Control.new(File.join(File.dirname(__FILE__), 'delimited.ctl'))
    configuration = {
      :file => "data/#{file}",
      :truncate => true,
      :target => :data_warehouse,
      :table => 'people'
    }
    processor = ETL::Processor::BulkImportProcessor.new(control, configuration)
    processor.process
  end
end
# Control file for the type-1 slowly-changing-dimension test: loads one of
# the scd/*.txt fixtures (selected via the run_number env var) into the
# person_dimension table, overwriting changed attributes in place.
source :in, {
  :file => "scd/#{ENV['run_number']}.txt",
  :parser => :csv
},
[
  :first_name,
  :last_name,
  :address,
  :city,
  :state,
  :zip_code
]

# NOTE: These are not usually required for a type 1 SCD dimension, but since
# we're sharing this table with the type 2 tests, they're necessary.
transform :effective_date, :default, :default_value => Time.now.to_s(:db)
transform :end_date, :default, :default_value => '9999-12-31 00:00:00'
transform :latest_version, :default, :default_value => true

destination :out, {
  :file => 'output/scd_test_type_1.txt',
  # Rows sharing a natural key are treated as the same dimension member.
  :natural_key => [:first_name, :last_name],
  :scd => {
    :type => 1,
    :dimension_target => :data_warehouse,
    :dimension_table => 'person_dimension'
  },
  # Only changes to these fields trigger an SCD update.
  :scd_fields => [:address, :city, :state, :zip_code]
},
{
  :order => [
    :id, :first_name, :last_name, :address, :city, :state, :zip_code, :effective_date, :end_date, :latest_version
  ],
  :virtual => {
    :id => ETL::Generator::SurrogateKeyGenerator.new(:target => :data_warehouse, :table => 'person_dimension')
  }
}

# Load the staged file into the warehouse once processing completes.
post_process :bulk_import, {
  :file => 'output/scd_test_type_1.txt',
  :target => :data_warehouse,
  :table => 'person_dimension'
}
require File.dirname(__FILE__) + '/test_helper'

# A fatal screen failure should abort processing with exit code 2.
class ScreenTest < Test::Unit::TestCase
  def test_screen
    control_path = File.join(File.dirname(__FILE__), 'screen_test_fatal.ctl')
    ETL::Engine.process(control_path)
    assert_equal 2, ETL::Engine.exit_code
  end
end
require File.dirname(__FILE__) + '/test_helper'

include ETL::Processor

# AR model bound to the table the truncate processor operates on.
class TruncateTest < ActiveRecord::Base
  # set_table_name was removed in Rails 3.2; self.table_name= works on
  # every Rails version this project supports (3.0 through 4.0).
  self.table_name = 'truncate_test'
end

class TruncateProcessorTest < Test::Unit::TestCase

  def create_item!
    TruncateTest.create!(:x => 'ABC')
  end

  # Run the truncate processor against the test table; +options+ is passed
  # through verbatim (e.g. 'CONTINUE IDENTITY' on PostgreSQL).
  def truncate!(options=nil)
    TruncateProcessor.new(nil,
      :target => :data_warehouse,
      :table => TruncateTest.table_name,
      :options => options
    ).process
  end

  should 'reset ids by default' do
    create_item!
    truncate!
    assert_equal 1, create_item!.id
  end

  # Only PostgreSQL supports opting out of the identity reset.
  if ETL::Engine.connection(:data_warehouse).class.name =~ /postgres/i
    should 'allow disabling id reset for postgres' do
      truncate!
      create_item!
      truncate!('CONTINUE IDENTITY')
      assert_equal 2, create_item!.id
    end
  end
end