├── .gitignore
├── Gemfile
├── LICENSE.txt
├── README.md
├── Rakefile
├── embulk-filter-ruby_proc.gemspec
├── example
    ├── comment_upcase.rb
    ├── config.yml
    └── sample_01.csv
└── lib
    └── embulk
        └── filter
            └── ruby_proc.rb


/.gitignore:
--------------------------------------------------------------------------------
1 | *~
2 | /pkg/
3 | /tmp/
4 | /.bundle/
5 | /Gemfile.lock
6 | /example/out*
7 | .ruby-version
8 | 


--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org/'
2 | gemspec
3 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | MIT License
 3 | 
 4 | Permission is hereby granted, free of charge, to any person obtaining
 5 | a copy of this software and associated documentation files (the
 6 | "Software"), to deal in the Software without restriction, including
 7 | without limitation the rights to use, copy, modify, merge, publish,
 8 | distribute, sublicense, and/or sell copies of the Software, and to
 9 | permit persons to whom the Software is furnished to do so, subject to
10 | the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be
13 | included in all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Ruby Proc filter plugin for Embulk
  2 | 
  3 | This plugin is inspired by [mgi166/embulk-filter-eval: Eval ruby code on filtering](https://github.com/mgi166/embulk-filter-eval "mgi166/embulk-filter-eval: Eval ruby code on filtering")
  4 | 
  5 | This plugin apply ruby proc to each record.
  6 | 
  7 | ## Overview
  8 | 
  9 | * **Plugin type**: filter
 10 | 
 11 | ## Configuration
 12 | 
 13 | - **columns**: filter definition (hash, required)
 14 | - **requires**: pre required libraries (array, default: `[]`)
 15 | 
 16 | ## Example
 17 | 
 18 | ### input
 19 | ```csv
 20 | id,account,time,purchase,comment,data
 21 | 1,32864,2015-01-27 19:23:49,20150127,embulk,"{\"foo\": \"bar\", \"events\": [{\"id\": 1, \"name\": \"Name1\"}, {\"id\": 2, \"name\": \"Name2\"}]}"
 22 | 2,14824,2015-01-27 19:01:23,20150127,embulk jruby,NULL
 23 | 3,27559,2015-01-28 02:20:02,20150128,"Embulk ""csv"" parser plugin",NULL
 24 | 4,11270,2015-01-29 11:54:36,20150129,NULL,NULL
 25 | ```
 26 | 
 27 | ### config
 28 | ```yaml
 29 | # ...
 30 | 
 31 | filters:
 32 |   - type: ruby_proc
 33 |     requires:
 34 |       - cgi
 35 |     variables:
 36 |       multiply: 3
 37 |     before:
 38 |       - proc: |
 39 |           -> do
 40 |             puts "before proc"
 41 |             @started_at = Time.now
 42 |           end
 43 |     after:
 44 |       - proc: |
 45 |           -> do
 46 |             puts "after proc"
 47 |             p Time.now - @started_at
 48 |           end
 49 |     rows:
 50 |       - proc: |
 51 |           ->(record) do
 52 |             [record.dup, record.dup.tap { |r| r["id"] += 10 }]
 53 |           end
 54 |     skip_rows:
 55 |       - proc: |
 56 |           ->(record) do
 57 |             record["id"].odd?
 58 |           end
 59 |     columns:
 60 |       - name: data
 61 |         proc: |
 62 |           ->(data) do
 63 |             data["events"] = data["events"].map.with_index do |e, idx|
 64 |               e.tap { |e_| e_["idx"] = idx }
 65 |             end
 66 |             data
 67 |           end
 68 |       - name: id
 69 |         proc: |
 70 |           ->(id) do
 71 |             id * variables["multiply"]
 72 |           end
 73 |         type: string
 74 |       - name: comment
 75 |         proc_file: comment_upcase.rb
 76 |         skip_nil: false
 77 |         type: json
 78 |     pages:
 79 |       - proc: |
 80 |           ->(records) do
 81 |             records.map do |record|
 82 |               record.tap { |r| r["id"] += 1 }
 83 |             end
 84 |           end
 85 | 
 86 | # ...
 87 | 
 88 | ```
 89 | 
 90 | If you want to skip record in "rows proc" or "columns proc", use `throw :skip_record`.
 91 | 
 92 | ```rb
 93 | # comment_upcase.rb
 94 | 
 95 | ->(comment, record) do
 96 |   return [record["account"].to_s].to_json unless comment
 97 |   comment.upcase.split(" ").map { |s| CGI.escape(s) }
 98 | end
 99 | ```
100 | 
101 | - `before` and `after` is executed at once
102 | - procs is evaluated on same binding (instance of Evaluator class)
103 |   - instance variable is shared
104 | - rows proc must return record hash or array of record hash.
105 |   - user must take care of object identity. Otherwise, error may be occurred when plugin applys column procs.
106 | - pages proc must return array of record hash.
107 |   - use `page_size` option to increase size of processing record (ex. `-X page_size=64KB`)
108 | 
109 | ### proc execution order
110 | 
111 | 1. before procs
112 | 1. per record
113 |     1. all row procs
114 |     1. per record applied row procs
115 |         1. all skip\_row procs
116 |         1. column procs
117 | 1. per page procs
118 | 1. after procs
119 | 
120 | ### preview
121 | ```
122 | +-----------+--------------+-------------------------+-------------------------+------------------------------------------+------------------------------------------------------------------------------------------+
123 | | id:string | account:long |          time:timestamp |      purchase:timestamp |                             comment:json |                                                                                data:json |
124 | +-----------+--------------+-------------------------+-------------------------+------------------------------------------+------------------------------------------------------------------------------------------+
125 | |         3 |       32,864 | 2015-01-27 19:23:49 UTC | 2015-01-27 00:00:00 UTC |                               ["EMBULK"] | {"events":[{"id":1,"name":"Name1","idx":0},{"id":2,"name":"Name2","idx":1}],"foo":"bar"} |
126 | |        34 |       32,864 | 2015-01-27 19:23:49 UTC | 2015-01-27 00:00:00 UTC |                               ["EMBULK"] | {"events":[{"id":1,"name":"Name1","idx":0},{"id":2,"name":"Name2","idx":1}],"foo":"bar"} |
127 | |         6 |       14,824 | 2015-01-27 19:01:23 UTC | 2015-01-27 00:00:00 UTC |                       ["EMBULK","JRUBY"] |                                                                                          |
128 | |        37 |       14,824 | 2015-01-27 19:01:23 UTC | 2015-01-27 00:00:00 UTC |                       ["EMBULK","JRUBY"] |                                                                                          |
129 | |         9 |       27,559 | 2015-01-28 02:20:02 UTC | 2015-01-28 00:00:00 UTC | ["EMBULK","%22CSV%22","PARSER","PLUGIN"] |                                                                                          |
130 | |        40 |       27,559 | 2015-01-28 02:20:02 UTC | 2015-01-28 00:00:00 UTC | ["EMBULK","%22CSV%22","PARSER","PLUGIN"] |                                                                                          |
131 | |        12 |       11,270 | 2015-01-29 11:54:36 UTC | 2015-01-29 00:00:00 UTC |                                ["11270"] |                                                                                          |
132 | |        43 |       11,270 | 2015-01-29 11:54:36 UTC | 2015-01-29 00:00:00 UTC |                                ["11270"] |                                                                                          |
133 | +-----------+--------------+-------------------------+-------------------------+------------------------------------------+------------------------------------------------------------------------------------------+
134 | ```
135 | 
136 | ## Build
137 | 
138 | ```
139 | $ rake
140 | ```
141 | 


--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
1 | require "bundler/gem_tasks"
2 | 
3 | task default: :build
4 | 


--------------------------------------------------------------------------------
/embulk-filter-ruby_proc.gemspec:
--------------------------------------------------------------------------------
 1 | 
 2 | Gem::Specification.new do |spec|
 3 |   spec.name          = "embulk-filter-ruby_proc"
 4 |   spec.version       = "0.8.1"
 5 |   spec.authors       = ["joker1007"]
 6 |   spec.summary       = "Ruby Proc filter plugin for Embulk"
 7 |   spec.description   = "Filter each record by ruby proc"
 8 |   spec.email         = ["kakyoin.hierophant@gmail.com"]
 9 |   spec.licenses      = ["MIT"]
10 |   spec.homepage      = "https://github.com/joker1007/embulk-filter-ruby_proc"
11 | 
12 |   spec.files         = `git ls-files`.split("\n") + Dir["classpath/*.jar"]
13 |   spec.test_files    = spec.files.grep(%r{^(test|spec)/})
14 |   spec.require_paths = ["lib"]
15 | 
16 |   spec.add_development_dependency 'embulk', ['>= 0.8.1']
17 |   spec.add_development_dependency 'bundler', ['>= 1.10.6']
18 |   spec.add_development_dependency 'rake', ['>= 10.0']
19 | end
20 | 


--------------------------------------------------------------------------------
/example/comment_upcase.rb:
--------------------------------------------------------------------------------
1 | ->(comment, record) do
2 |   return [record["account"].to_s].to_json unless comment
3 |   comment.upcase.split(" ").map { |s| CGI.escape(s) }
4 | end
5 | 


--------------------------------------------------------------------------------
/example/config.yml:
--------------------------------------------------------------------------------
 1 | in:
 2 |   type: file
 3 |   path_prefix: ./sample_
 4 |   parser:
 5 |     charset: UTF-8
 6 |     newline: CRLF
 7 |     type: csv
 8 |     delimiter: ','
 9 |     quote: '"'
10 |     escape: '\'
11 |     null_string: 'NULL'
12 |     trim_if_not_quoted: false
13 |     skip_header_lines: 1
14 |     allow_extra_columns: false
15 |     allow_optional_columns: false
16 |     columns:
17 |     - {name: id, type: long}
18 |     - {name: account, type: long}
19 |     - {name: time, type: timestamp, format: '%Y-%m-%d %H:%M:%S'}
20 |     - {name: purchase, type: timestamp, format: '%Y%m%d'}
21 |     - {name: comment, type: string}
22 |     - {name: data, type: json}
23 | 
24 | filters:
25 |   - type: ruby_proc
26 |     variables:
27 |       multiply: 3
28 |     before:
29 |       - proc: |
30 |           -> do
31 |             puts "before proc"
32 |             @started_at = Time.now
33 |           end
34 |     after:
35 |       - proc: |
36 |           -> do
37 |             puts "after proc"
38 |             p Time.now - @started_at
39 |           end
40 |     rows:
41 |       - proc: |
42 |           ->(record) do
43 |             [record.dup, record.dup.tap { |r| r["id"] += 10 }]
44 |           end
45 |     columns:
46 |       - name: data
47 |         proc: |
48 |           ->(data) do
49 |             data["events"] = data["events"].map.with_index do |e, idx|
50 |               e.tap { |e_| e_["idx"] = idx }
51 |             end
52 |             data
53 |           end
54 |       - name: id
55 |         proc: |
56 |           ->(id) do
57 |             id * variables["multiply"]
58 |           end
59 |         type: string
60 |     pages:
61 |       - proc: |
62 |           ->(records) do
63 |             records.map do |record|
64 |               record.tap { |r| r["id"] += 1 }
65 |             end
66 |           end
67 | 
68 |   - type: ruby_proc
69 |     requires:
70 |       - cgi
71 |     columns:
72 |       - name: comment
73 |         proc_file: comment_upcase.rb
74 |         skip_nil: false
75 |         type: json
76 | 
77 | out:
78 |   type: file
79 |   path_prefix: ./out_
80 |   file_ext: tsv
81 |   formatter:
82 |     type: csv
83 |     delimiter: "\t"
84 |     newline: CRLF
85 |     newline_in_field: LF
86 |     charset: UTF-8
87 |     quote_policy: MINIMAL
88 |     quote: '"'
89 |     escape: "\\"
90 |     null_string: 'NULL'
91 |     default_timezone: 'UTC'
92 | 


--------------------------------------------------------------------------------
/example/sample_01.csv:
--------------------------------------------------------------------------------
1 | id,account,time,purchase,comment,data
2 | 1,32864,2015-01-27 19:23:49,20150127,embulk,"{\"foo\": \"bar\", \"events\": [{\"id\": 1, \"name\": \"Name1\"}, {\"id\": 2, \"name\": \"Name2\"}]}"
3 | 2,14824,2015-01-27 19:01:23,20150127,embulk jruby,NULL
4 | 3,27559,2015-01-28 02:20:02,20150128,"Embulk ""csv"" parser plugin",NULL
5 | 4,11270,2015-01-29 11:54:36,20150129,NULL,NULL
6 | 


--------------------------------------------------------------------------------
/lib/embulk/filter/ruby_proc.rb:
--------------------------------------------------------------------------------
  1 | require 'thread'
  2 | require 'securerandom'
  3 | 
  4 | module Embulk
  5 |   module Filter
  6 | 
  7 |     class RubyProc < FilterPlugin
  8 | 
  9 |       class Evaluator
 10 |         attr_reader :variables
 11 |         @mutex = Mutex.new
 12 | 
 13 |         def self.mutex
 14 |           @mutex
 15 |         end
 16 | 
 17 |         def initialize(variables)
 18 |           @variables = variables
 19 |         end
 20 | 
 21 |         def get_binding
 22 |           binding
 23 |         end
 24 | 
 25 |         def mutex
 26 |           self.class.mutex
 27 |         end
 28 |       end
 29 | 
 30 |       Plugin.register_filter("ruby_proc", self)
 31 | 
 32 |       def self.transaction(config, in_schema, &control)
 33 |         task = {
 34 |           "columns" => config.param("columns", :array, default: []),
 35 |           "rows" => config.param("rows", :array, default: []),
 36 |           "pages" => config.param("pages", :array, default: []),
 37 |           "skip_rows" => config.param("skip_rows", :array, default: []),
 38 |           "before" => config.param("before", :array, default: []),
 39 |           "after" => config.param("after", :array, default: []),
 40 |           "requires" => config.param("requires", :array, default: []),
 41 |           "variables" => config.param("variables", :hash, default: {}),
 42 |         }
 43 | 
 44 |         out_columns = in_schema.map do |col|
 45 |           target = task["columns"].find { |filter_col| filter_col["name"] == col.name }
 46 |           if target
 47 |             type = target["type"] ? target["type"].to_sym : col.type
 48 |             Embulk::Column.new(index: col.index, name: col.name, type: type || col.type, format: target["format"] || col.format)
 49 |           else
 50 |             col
 51 |           end
 52 |         end
 53 | 
 54 |         task["requires"].each do |lib|
 55 |           require lib
 56 |         end
 57 | 
 58 |         @proc_store ||= {}
 59 |         @row_proc_store ||= {}
 60 |         @page_proc_store ||= {}
 61 |         @skip_row_proc_store ||= {}
 62 |         transaction_id = rand(100000000)
 63 |         until !@proc_store.has_key?(transaction_id)
 64 |           transaction_id = rand(100000000)
 65 |         end
 66 |         evaluator_binding = Evaluator.new(task["variables"]).get_binding
 67 | 
 68 |         # In order to avoid multithread probrem, initialize procs here
 69 |         before_procs = task["before"].map {|before|
 70 |           if before["proc"]
 71 |             eval(before["proc"], evaluator_binding)
 72 |           else
 73 |             eval(File.read(before["proc_file"]), evaluator_binding, File.expand_path(before["proc_file"]))
 74 |           end
 75 |         }
 76 |         @proc_store[transaction_id] = procs = Hash[task["columns"].map {|col|
 77 |           if col["proc"]
 78 |             [col["name"], eval(col["proc"], evaluator_binding)]
 79 |           else
 80 |             [col["name"], eval(File.read(col["proc_file"]), evaluator_binding, File.expand_path(col["proc_file"]))]
 81 |           end
 82 |         }]
 83 |         @row_proc_store[transaction_id] = row_procs = task["rows"].map {|rowdef|
 84 |           if rowdef["proc"]
 85 |             eval(rowdef["proc"], evaluator_binding)
 86 |           else
 87 |             eval(File.read(rowdef["proc_file"]), evaluator_binding, File.expand_path(rowdef["proc_file"]))
 88 |           end
 89 |         }.compact
 90 |         @page_proc_store[transaction_id] = page_procs = task["pages"].map {|page|
 91 |           if page["proc"]
 92 |             eval(page["proc"], evaluator_binding)
 93 |           else
 94 |             eval(File.read(page["proc_file"]), evaluator_binding, File.expand_path(page["proc_file"]))
 95 |           end
 96 |         }.compact
 97 |         @skip_row_proc_store[transaction_id] = skip_row_procs = task["skip_rows"].map {|rowdef|
 98 |           if rowdef["proc"]
 99 |             eval(rowdef["proc"], evaluator_binding)
100 |           else
101 |             eval(File.read(rowdef["proc_file"]), evaluator_binding, File.expand_path(rowdef["proc_file"]))
102 |           end
103 |         }.compact
104 |         task["transaction_id"] = transaction_id
105 |         if procs.empty? && row_procs.empty? && page_procs.empty? && skip_row_procs.empty?
106 |           raise "Need columns or rows or pages parameter"
107 |         end
108 | 
109 |         before_procs.each do |pr|
110 |           pr.call
111 |         end
112 | 
113 |         yield(task, out_columns)
114 | 
115 |         after_procs = task["after"].map {|after|
116 |           if after["proc"]
117 |             eval(after["proc"], evaluator_binding)
118 |           else
119 |             eval(File.read(after["proc_file"]), evaluator_binding, File.expand_path(after["proc_file"]))
120 |           end
121 |         }
122 | 
123 |         after_procs.each do |pr|
124 |           pr.call
125 |         end
126 |       end
127 | 
128 |       def self.proc_store
129 |         @proc_store
130 |       end
131 | 
132 |       def self.row_proc_store
133 |         @row_proc_store
134 |       end
135 | 
136 |       def self.page_proc_store
137 |         @page_proc_store
138 |       end
139 | 
140 |       def self.skip_row_proc_store
141 |         @skip_row_proc_store
142 |       end
143 | 
144 |       def self.parse_col_procs(columns, evaluator_binding)
145 |         Hash[columns.map {|col|
146 |           if col["proc"]
147 |             [col["name"], eval(col["proc"], evaluator_binding)]
148 |           else
149 |             [col["name"], eval(File.read(col["proc_file"]), evaluator_binding, File.expand_path(col["proc_file"]))]
150 |           end
151 |         }]
152 |       end
153 | 
154 |       def self.parse_row_procs(rows, evaluator_binding)
155 |         rows.map {|rowdef|
156 |           if rowdef["proc"]
157 |             eval(rowdef["proc"], evaluator_binding)
158 |           else
159 |             eval(File.read(rowdef["proc_file"]), evaluator_binding, File.expand_path(rowdef["proc_file"]))
160 |           end
161 |         }.compact
162 |       end
163 | 
164 |       def self.parse_page_procs(pages, evaluator_binding)
165 |         pages.map {|page|
166 |           if page["proc"]
167 |             eval(page["proc"], evaluator_binding)
168 |           else
169 |             eval(File.read(page["proc_file"]), evaluator_binding, File.expand_path(page["proc_file"]))
170 |           end
171 |         }.compact
172 |       end
173 | 
174 |       def init
175 |         task["requires"].each do |lib|
176 |           require lib
177 |         end
178 | 
179 |         if self.class.proc_store.nil? || self.class.row_proc_store.nil? || self.class.page_proc_store.nil? || self.class.skip_row_proc_store.nil?
180 |           evaluator_binding = Evaluator.new(task["variables"]).get_binding
181 |           @procs = self.class.parse_col_procs(task["columns"], evaluator_binding)
182 |           @row_procs = self.class.parse_row_procs(task["rows"], evaluator_binding)
183 |           @page_procs = self.class.parse_page_procs(task["pages"], evaluator_binding)
184 |           @skip_row_procs = self.class.parse_row_procs(task["skip_rows"], evaluator_binding)
185 |         else
186 |           @procs = self.class.proc_store[task["transaction_id"]]
187 |           @row_procs = self.class.row_proc_store[task["transaction_id"]]
188 |           @page_procs = self.class.page_proc_store[task["transaction_id"]]
189 |           @skip_row_procs = self.class.skip_row_proc_store[task["transaction_id"]]
190 |         end
191 |         @skip_nils = Hash[task["columns"].map {|col|
192 |           [col["name"], col["skip_nil"].nil? ? true : !!col["skip_nil"]]
193 |         }]
194 |       end
195 | 
196 |       def close
197 |       end
198 | 
199 |       def add(page)
200 |         proc_records = []
201 |         page.each do |record|
202 |           if row_procs.empty?
203 |             record_hashes = [hashrize(record)]
204 |           else
205 |             record_hashes = row_procs.each_with_object([]) do |pr, arr|
206 |               catch :skip_record do
207 |                 result = pr.call(hashrize(record))
208 |                 case result
209 |                 when Array
210 |                   result.each do |r|
211 |                     arr << r
212 |                   end
213 |                 when Hash
214 |                   arr << result
215 |                 else
216 |                   raise "row proc return value must be a Array or Hash"
217 |                 end
218 |               end
219 |             end
220 |           end
221 | 
222 |           record_hashes.each do |record_hash|
223 |             catch :skip_record do
224 |               skip_row_procs.each do |pr|
225 |                 throw :skip_record if pr.call(record_hash)
226 |               end
227 | 
228 |               procs.each do |col, pr|
229 |                 next unless record_hash.has_key?(col)
230 |                 next if record_hash[col].nil? && skip_nils[col]
231 | 
232 |                 if pr.arity == 1
233 |                   record_hash[col] = pr.call(record_hash[col])
234 |                 else
235 |                   record_hash[col] = pr.call(record_hash[col], record_hash)
236 |                 end
237 |               end
238 |               if page_procs.empty?
239 |                 page_builder.add(record_hash.values)
240 |               else
241 |                 proc_records << record_hash
242 |               end
243 |             end
244 |           end
245 |         end
246 | 
247 |         unless page_procs.empty?
248 |           page_procs.each do |pr|
249 |             result = pr.call(proc_records)
250 |             result.each { |record| page_builder.add(record.values) }
251 |           end
252 |         end
253 |       end
254 | 
255 |       def finish
256 |         page_builder.finish
257 |       end
258 | 
259 |       private
260 | 
261 |       def hashrize(record)
262 |         Hash[in_schema.names.zip(record)]
263 |       end
264 | 
265 |       def procs
266 |         @procs
267 |       end
268 | 
269 |       def row_procs
270 |         @row_procs
271 |       end
272 | 
273 |       def page_procs
274 |         @page_procs
275 |       end
276 | 
277 |       def skip_row_procs
278 |         @skip_row_procs
279 |       end
280 | 
281 |       def skip_nils
282 |         @skip_nils
283 |       end
284 |     end
285 | 
286 |   end
287 | end
288 | 


--------------------------------------------------------------------------------