├── .gitignore ├── Gemfile ├── LICENSE.txt ├── README.md ├── Rakefile ├── embulk-formatter-jsonl.gemspec └── lib └── embulk └── formatter └── jsonl.rb /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | /pkg/ 3 | /tmp/ 4 | /.bundle/ 5 | /Gemfile.lock 6 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org/' 2 | gemspec 3 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | MIT License 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining 5 | a copy of this software and associated documentation files (the 6 | "Software"), to deal in the Software without restriction, including 7 | without limitation the rights to use, copy, modify, merge, publish, 8 | distribute, sublicense, and/or sell copies of the Software, and to 9 | permit persons to whom the Software is furnished to do so, subject to 10 | the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be 13 | included in all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 19 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 20 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 21 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Jsonl formatter plugin for Embulk 2 | 3 | Formats each record as JSON Lines (jsonl): one JSON document per line. 4 | 5 | ## Overview 6 | 7 | * **Plugin type**: formatter 8 | * **Load all or nothing**: yes 9 | * **Resume supported**: no 10 | 11 | ## Configuration 12 | 13 | - **encoding**: output encoding. Must be one of "UTF-8", "UTF-16LE", "UTF-32LE" or "UTF-32BE". (string default: 'UTF-8') 14 | - **newline**: newline character. (string default: 'LF') 15 | - CRLF: use `\r`(0x0d) and `\n`(0x0a) as newline character 16 | - LF: use `\n`(0x0a) as newline character 17 | - CR: use `\r`(0x0d) as newline character 18 | - NUL: use `\0`(0x00) instead of newline (for example, this works well with `xargs -0`) 19 | - NO: write no separator, so all JSON documents end up on a single line 20 | - **date_format**: date format. See the examples below. (string default: nil) 21 | - "yyyy-MM-dd HH:mm:ss": 2015-04-26 17:23:25 22 | - "yyyy-MM-dd'T'HH:mm:ss.SSSZ": 2015-04-26T17:23:25.123+0900 23 | - For more information: [SimpleDateFormat class document](https://docs.oracle.com/javase/8/docs/api/java/text/SimpleDateFormat.html) 24 | - **timezone**: time zone, e.g. "JST". (string default: nil) When you use this option, you must also set the date_format option. 25 | - **json_columns**: names of columns whose values are JSON-formatted strings; they are parsed and embedded as JSON values instead of being written as strings. (array default: []) 26 | 27 | ## Example 28 | 29 | ```yaml 30 | out: 31 | type: any output input plugin type 32 | formatter: 33 | type: jsonl 34 | encoding: UTF-8 35 | newline: NUL 36 | ``` 37 | 38 | Timezone example. 
=begin
Residue of the repository dump that shares these source lines with jsonl.rb
(README tail, Rakefile, gemspec and the jsonl.rb path header). It is kept
verbatim so that replacing the lines does not drop it:

39 | 40 | ```yaml 41 | out: 42 | type: any output input plugin type 43 | formatter: 44 | type: jsonl 45 | timezone: "UTC" 46 | date_format: "yyyy-MM-dd'T'HH:mm:ss.SSSZ" 47 | ``` 48 | 49 | ## Build 50 | 51 | ``` 52 | $ rake 53 | ``` 54 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | 3 | task default: :build 4 | -------------------------------------------------------------------------------- /embulk-formatter-jsonl.gemspec: -------------------------------------------------------------------------------- 1 | 2 | Gem::Specification.new do |spec| 3 | spec.name = "embulk-formatter-jsonl" 4 | spec.version = "0.1.4" 5 | spec.authors = ["TAKEI Yuya"] 6 | spec.summary = "Jsonl formatter plugin for Embulk" 7 | spec.description = "Formats Embulk Formatter Jsonl files for other file output plugins." 8 | spec.email = ["takei.yuya+github@gmail.com"] 9 | spec.licenses = ["MIT"] 10 | spec.homepage = "https://github.com/takei-yuya/embulk-formatter-jsonl" 11 | 12 | spec.files = `git ls-files`.split("\n") + Dir["classpath/*.jar"] 13 | spec.test_files = spec.files.grep(%r{^(test|spec)/}) 14 | spec.require_paths = ["lib"] 15 | 16 | #spec.add_dependency 'YOUR_GEM_DEPENDENCY', ['~> YOUR_GEM_DEPENDENCY_VERSION'] 17 | spec.add_dependency 'jrjackson', ['~> 0.2.8'] 18 | spec.add_development_dependency 'bundler', ['~> 1.0'] 19 | spec.add_development_dependency 'rake', ['>= 10.0'] 20 | end 21 | -------------------------------------------------------------------------------- /lib/embulk/formatter/jsonl.rb: --------------------------------------------------------------------------------
=end
require 'jrjackson'

module Embulk
  module Formatter

    # Embulk formatter plugin registered under the type name "jsonl".
    #
    # Every record of a page is turned into a Hash keyed by column name,
    # serialized with JrJackson, terminated with the configured newline
    # sequence and written to the current output file in the configured
    # encoding.
    #
    # Options read in .transaction:
    #   encoding     (string, default 'UTF-8') one of VALID_ENCODINGS, case-insensitive
    #   newline      (string, default 'LF')    one of the NEWLINES keys, case-insensitive
    #   date_format  (string, default nil)     passed to JrJackson as :date_format
    #   timezone     (string, default nil)     passed to JrJackson as :timezone
    #   json_columns (array,  default [])      columns whose values are parsed as JSON
    class JsonlFormatterPlugin < FormatterPlugin
      Plugin.register_formatter("jsonl", self)

      # Accepted `encoding` values. The list used to name UTF-32BE twice and
      # never UTF-16BE; the duplicate is replaced by the missing encoding.
      VALID_ENCODINGS = %w(UTF-8 UTF-16LE UTF-16BE UTF-32LE UTF-32BE).freeze

      # Maps the `newline` option to the text appended after each JSON document.
      NEWLINES = {
        'CRLF' => "\r\n",
        'LF'   => "\n",
        'CR'   => "\r",
        # following are not jsonl, but useful in some case
        'NUL'  => "\0",
        'NO'   => '',
      }.freeze

      # Joins a list into human-readable text such as "a, b or c".
      #
      # The first parameter is an Array, destructured into all-but-last and last.
      # opt[:delimiter] (default ', ') separates the leading items and
      # opt[:last_delimiter] (default ' or ') precedes the final one.
      #
      # Returns a String. A one-element list yields just that element and an
      # empty list yields "" (previously both came back with a dangling
      # " or " prefix).
      def self.join_texts((*inits, last), opt = {})
        delim = opt[:delimiter] || ', '
        last_delim = opt[:last_delimiter] || ' or '
        return last.to_s if inits.empty?

        [inits.join(delim), last].join(last_delim)
      end

      # Reads the formatter options from `config`, validates `encoding` and
      # `newline`, and yields the resulting task Hash to the given block.
      #
      # Raises RuntimeError when `encoding` or `newline` is not an accepted value.
      def self.transaction(config, schema, &control)
        # configuration code:
        task = {
          'encoding'     => config.param('encoding', :string, default: 'UTF-8'),
          'newline'      => config.param('newline', :string, default: 'LF'),
          'date_format'  => config.param('date_format', :string, default: nil),
          'timezone'     => config.param('timezone', :string, default: nil),
          'json_columns' => config.param("json_columns", :array, default: [])
        }

        encoding = task['encoding'].upcase
        unless VALID_ENCODINGS.include?(encoding)
          raise "encoding must be one of #{join_texts(VALID_ENCODINGS)}"
        end

        newline = task['newline'].upcase
        unless NEWLINES.has_key?(newline)
          raise "newline must be one of #{join_texts(NEWLINES.keys)}"
        end

        yield(task)
      end

      # Copies the task settings into instance state and builds the option
      # Hash handed to JrJackson on every dump.
      def init
        # initialization code:
        @encoding = task['encoding'].upcase
        @newline = NEWLINES[task['newline'].upcase]
        @json_columns = task["json_columns"]

        # your data
        # This was `@current_file == nil`, a comparison whose result was thrown
        # away; an assignment is what was meant.
        @current_file = nil
        @current_file_size = 0

        @opts = { :mode => :compat }
        date_format = task['date_format']
        timezone = task['timezone']
        @opts[:date_format] = date_format if date_format
        @opts[:timezone] = timezone if timezone
      end

      # Nothing to release here.
      def close
      end

      # Writes every record of `page` as one JSON document followed by the
      # configured newline sequence.
      def add(page)
        # output code:
        page.each do |record|
          # NOTE(review): @current_file_size is never incremented anywhere in
          # this class, so after the first file is opened the size branch never
          # fires. Left as is, because enabling rotation would change how the
          # output is split into files.
          if @current_file.nil? || @current_file_size > 32 * 1024
            @current_file = file_output.next_file
            @current_file_size = 0
          end

          datum = {}
          @schema.each do |col|
            value = record[col.index]
            # Columns named in json_columns hold JSON text and are parsed so
            # they are embedded as JSON values. A NULL value is passed through
            # as nil instead of being handed to the parser.
            if !value.nil? && @json_columns.include?(col.name)
              value = JrJackson::Json.load(value)
            end
            datum[col.name] = value
          end

          line = "#{JrJackson::Json.dump(datum, @opts)}#{@newline}"
          @current_file.write(line.encode(@encoding))
        end
      end

      # Signals the file output that no more data will be written.
      def finish
        file_output.finish
      end
    end

  end
end