├── lib ├── fastest-csv.rb ├── fastest-csv │ └── version.rb └── fastest_csv.rb ├── Gemfile ├── .gitignore ├── ext └── csv_parser │ ├── extconf.rb │ ├── parser.c │ ├── CsvParser.java │ └── CsvParserService.java ├── Rakefile ├── fastest-csv.gemspec ├── LICENSE ├── README.md └── test ├── tc_interface.rb └── tc_csv_parsing.rb /lib/fastest-csv.rb: -------------------------------------------------------------------------------- 1 | require 'fastest_csv' 2 | -------------------------------------------------------------------------------- /lib/fastest-csv/version.rb: -------------------------------------------------------------------------------- 1 | class FastestCSV 2 | VERSION = "0.0.4" 3 | end 4 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | # Specify your gem's dependencies in fastest-csv.gemspec 4 | gemspec 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | *.rbc 3 | .bundle 4 | .config 5 | .yardoc 6 | .DS_Store 7 | Gemfile.lock 8 | InstalledFiles 9 | _yardoc 10 | coverage 11 | doc/ 12 | lib/bundler/man 13 | pkg 14 | rdoc 15 | spec/reports 16 | test/tmp 17 | test/version_tmp 18 | tmp 19 | lib/*.bundle 20 | lib/*.jar 21 | -------------------------------------------------------------------------------- /ext/csv_parser/extconf.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby -w 2 | 3 | require 'mkmf' 4 | extension_name = 'csv_parser' 5 | #dir_config(extension_name) 6 | 7 | if RUBY_VERSION =~ /1.8/ then 8 | $CPPFLAGS += " -DRUBY_18" 9 | end 10 | 11 | #if CONFIG["arch"] =~ /mswin32|mingw/ 12 | # $CFLAGS += " -march=i686" 13 | #end 14 | 15 | create_makefile(extension_name) 16 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env rake 2 | require "bundler/gem_tasks" 3 | 4 | spec = Gem::Specification.load('fastest-csv.gemspec') 5 | 6 | if RUBY_PLATFORM =~ /java/ 7 | require 'rake/javaextensiontask' 8 | Rake::JavaExtensionTask.new('csv_parser', spec) 9 | else 10 | require 'rake/extensiontask' 11 | Rake::ExtensionTask.new('csv_parser', spec) 12 | end 13 | 14 | require 'rake/testtask' 15 | Rake::TestTask.new do |t| 16 | t.libs << "test" 17 | t.test_files = FileList['test/tc_*.rb'] 18 | #test.libs << 'lib' << 'test' 19 | #test.pattern = 'test/**/test_*.rb' 20 | #test.verbose = true 21 | end 22 | 23 | -------------------------------------------------------------------------------- /fastest-csv.gemspec: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | require File.expand_path('../lib/fastest-csv/version', __FILE__) 3 | 4 | Gem::Specification.new do |gem| 5 | gem.authors = ["Maarten Oelering"] 6 | gem.email = ["maarten@brightcode.nl"] 7 | gem.description = %q{Fastest standard CSV parser for MRI Ruby and JRuby} 8 | gem.summary = %q{Fastest standard CSV parser for MRI Ruby and JRuby} 9 | gem.homepage = "https://github.com/brightcode/fastest-csv" 10 | 11 | gem.files = `git ls-files`.split($\) 12 | #gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) } 13 | gem.test_files = gem.files.grep(%r{^(test|spec|features)/}) 14 | gem.name = "fastest-csv" 15 | gem.require_paths = ["lib"] 16 | gem.version = FastestCSV::VERSION 17 | 18 | if RUBY_PLATFORM =~ /java/ 19 | gem.platform = "java" 20 | gem.files << "lib/csv_parser.jar" 21 | else 22 | gem.extensions = ['ext/csv_parser/extconf.rb'] 23 | end 24 | 25 | gem.add_development_dependency "rake-compiler" 26 | 27 | gem.license = 'MIT' 28 | end 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012, 2013 Maarten Oelering 2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FastestCSV 2 | 3 | Fastest CSV class for MRI Ruby and JRuby. Faster than faster_csv and fasterer-csv. 4 | 5 | Uses native C code to parse CSV lines in MRI Ruby and Java in JRuby. 6 | 7 | Supports standard CSV according to RFC4180. Not the so-called "csv" from Excel. 8 | 9 | The interface is a subset of the CSV interface in Ruby 1.9.3. The options parameter is not supported. 10 | 11 | Originally developed to parse large CSV log files from PowerMTA. 12 | 13 | ## Installation 14 | 15 | Add this line to your application's Gemfile: 16 | 17 | ```ruby 18 | gem 'fastest-csv' 19 | ``` 20 | 21 | And then execute: 22 | 23 | $ bundle 24 | 25 | Or install it yourself as: 26 | 27 | $ gem install fastest-csv 28 | 29 | ## Usage 30 | 31 | Parse single line 32 | 33 | ```ruby 34 | FastestCSV.parse_line("one,two,three") 35 | => ["one", "two", "three"] 36 | 37 | "one,two,three".parse_csv 38 | => ["one", "two", "three"] 39 | ``` 40 | 41 | Parse file without header 42 | 43 | ```ruby 44 | FastestCSV.foreach("path/to/file.csv") do |row| 45 | # ... 46 | end 47 | ``` 48 | 49 | Parse file with header 50 | 51 | ```ruby 52 | FastestCSV.open("path/to/file.csv") do |csv| 53 | fields = csv.shift 54 | while values = csv.shift 55 | # ... 56 | end 57 | end 58 | ``` 59 | 60 | Parse file in array of arrays 61 | 62 | ```ruby 63 | rows = FastestCSV.read("path/to/file.csv") 64 | ``` 65 | 66 | Parse string in array of arrays 67 | 68 | ```ruby 69 | rows = FastestCSV.parse(csv_data) 70 | ``` 71 | 72 | ## Contributing 73 | 74 | 1. Fork it 75 | 2. Create your feature branch (`git checkout -b my-new-feature`) 76 | 3. Commit your changes (`git commit -am 'Added some feature'`) 77 | 4. Push to the branch (`git push origin my-new-feature`) 78 | 5. Create new Pull Request 79 | -------------------------------------------------------------------------------- /ext/csv_parser/parser.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Maarten Oelering, BrightCode BV 3 | */ 4 | 5 | #include "ruby.h" 6 | #ifdef RUBY_18 7 | #include "rubyio.h" 8 | #else 9 | #include "ruby/io.h" 10 | #endif 11 | 12 | #define DEF_ARRAY_LEN 32 13 | 14 | #define UNQUOTED 0 15 | #define IN_QUOTED 1 16 | #define QUOTE_IN_QUOTED 2 17 | 18 | static VALUE mCsvParser; 19 | 20 | static VALUE parse_line(VALUE self, VALUE str) 21 | { 22 | if (NIL_P(str)) 23 | return Qnil; 24 | 25 | const char *ptr = RSTRING_PTR(str); 26 | int len = (int) RSTRING_LEN(str); /* cast to prevent warning in 64-bit OS */ 27 | 28 | if (len == 0) 29 | return Qnil; 30 | 31 | VALUE array = rb_ary_new2(DEF_ARRAY_LEN); /* default allocated size is 16 */ 32 | char value[len]; /* field value, no longer than line */ 33 | int state = 0; 34 | int index = 0; 35 | int i; 36 | char c; 37 | for (i = 0; i < len; i++) 38 | { 39 | c = ptr[i]; 40 | switch (c) 41 | { 42 | case ',': 43 | if (state == UNQUOTED) { 44 | rb_ary_push(array, (index == 0 ? Qnil: rb_str_new(value, index))); 45 | index = 0; 46 | } 47 | else if (state == IN_QUOTED) { 48 | value[index++] = c; 49 | } 50 | else if (state == QUOTE_IN_QUOTED) { 51 | rb_ary_push(array, rb_str_new(value, index)); 52 | index = 0; 53 | state = UNQUOTED; 54 | } 55 | break; 56 | case '"': 57 | if (state == UNQUOTED) { 58 | state = IN_QUOTED; 59 | } 60 | else if (state == 1) { 61 | state = QUOTE_IN_QUOTED; 62 | } 63 | else if (state == QUOTE_IN_QUOTED) { 64 | value[index++] = c; /* escaped quote */ 65 | state = IN_QUOTED; 66 | } 67 | break; 68 | case 13: /* \r */ 69 | case 10: /* \n */ 70 | if (state == IN_QUOTED) { 71 | value[index++] = c; 72 | } 73 | else { 74 | i = len; /* only parse first line if multiline */ 75 | } 76 | break; 77 | default: 78 | value[index++] = c; 79 | } 80 | } 81 | 82 | if (state == UNQUOTED) { 83 | rb_ary_push(array, (index == 0 ? Qnil: rb_str_new(value, index))); 84 | } 85 | else if (state == QUOTE_IN_QUOTED) { 86 | rb_ary_push(array, rb_str_new(value, index)); 87 | } 88 | return array; 89 | } 90 | 91 | void Init_csv_parser() 92 | { 93 | mCsvParser = rb_define_module("CsvParser"); 94 | rb_define_module_function(mCsvParser, "parse_line", parse_line, 1); 95 | } 96 | -------------------------------------------------------------------------------- /ext/csv_parser/CsvParser.java: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright (c) Maarten Oelering, BrightCode BV 3 | // 4 | 5 | package org.brightcode; 6 | 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | 10 | public class CsvParser { 11 | 12 | private static int DEF_ARRAY_LEN = 32; 13 | 14 | private static int UNQUOTED = 0; 15 | private static int IN_QUOTED = 1; 16 | private static int QUOTE_IN_QUOTED = 2; 17 | 18 | public static List parseLine(String line) { 19 | int length = line.length(); 20 | if (length == 0) 21 | return null; 22 | 23 | int state = UNQUOTED; 24 | StringBuilder value = new StringBuilder(length); // field value, no longer than line 25 | List array = new ArrayList(DEF_ARRAY_LEN); 26 | 27 | for (int i = 0; i < length; i++) { 28 | char c = line.charAt(i); 29 | switch (c) { 30 | case ',': 31 | if (state == UNQUOTED) { 32 | if (value.length() == 0) { 33 | array.add(null); 34 | } 35 | else { 36 | array.add(value.toString()); 37 | value.setLength(0); 38 | } 39 | } 40 | else if (state == IN_QUOTED) { 41 | value.append(c); 42 | } 43 | else if (state == 2) { 44 | array.add(value.toString()); 45 | value.setLength(0); 46 | state = UNQUOTED; 47 | } 48 | break; 49 | case '"': 50 | if (state == UNQUOTED) { 51 | state = IN_QUOTED; 52 | } 53 | else if (state == IN_QUOTED) { 54 | state = QUOTE_IN_QUOTED; 55 | } 56 | else if (state == QUOTE_IN_QUOTED) { 57 | value.append(c); // escaped quote 58 | state = IN_QUOTED; 59 | } 60 | break; 61 | case '\r': 62 | case '\n': 63 | if (state == IN_QUOTED) { 64 | value.append(c); 65 | } 66 | else { 67 | i = length; // only parse first line if multiline 68 | } 69 | break; 70 | default: 71 | value.append(c); 72 | break; 73 | } 74 | } 75 | if (state == UNQUOTED) { 76 | if (value.length() == 0) { 77 | array.add(null); 78 | } 79 | else { 80 | array.add(value.toString()); 81 | value.setLength(0); 82 | } 83 | } 84 | else if (state == QUOTE_IN_QUOTED) { 85 | array.add(value.toString()); 86 | } 87 | return array; 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /lib/fastest_csv.rb: -------------------------------------------------------------------------------- 1 | # This loads either csv_parser.so, csv_parser.bundle or 2 | # csv_parser.jar, depending on your Ruby platform and OS 3 | require 'csv_parser' 4 | require 'stringio' 5 | 6 | # Fast CSV parser using native code 7 | class FastestCSV 8 | include Enumerable 9 | 10 | if RUBY_PLATFORM =~ /java/ 11 | if JRUBY_VERSION =~ /^1\.[0-6]/ 12 | require 'jruby' 13 | org.brightcode.CsvParserService.new.basicLoad(JRuby.runtime) 14 | else 15 | include_package "org.brightcode" 16 | end 17 | end 18 | 19 | # Pass each line of the specified +path+ as array to the provided +block+ 20 | def self.foreach(path, &block) 21 | open(path) do |reader| 22 | reader.each(&block) 23 | end 24 | end 25 | 26 | # Opens a csv file. Pass a FastestCSV instance to the provided block, 27 | # or return it when no block is provided 28 | def self.open(path, mode = "rb") 29 | csv = new(File.open(path, mode)) 30 | if block_given? 31 | begin 32 | yield csv 33 | ensure 34 | csv.close 35 | end 36 | else 37 | csv 38 | end 39 | end 40 | 41 | # Read all lines from the specified +path+ into an array of arrays 42 | def self.read(path) 43 | open(path, "rb") { |csv| csv.read } 44 | end 45 | 46 | # Alias for FastestCSV.read 47 | def self.readlines(path) 48 | read(path) 49 | end 50 | 51 | # Read all lines from the specified String into an array of arrays 52 | def self.parse(data, &block) 53 | csv = new(StringIO.new(data)) 54 | if block.nil? 55 | begin 56 | csv.read 57 | ensure 58 | csv.close 59 | end 60 | else 61 | csv.each(&block) 62 | end 63 | end 64 | 65 | def self.parse_line(line) 66 | CsvParser.parse_line(line) 67 | end 68 | 69 | # Create new FastestCSV wrapping the specified IO object 70 | def initialize(io) 71 | @io = io 72 | end 73 | 74 | # Read from the wrapped IO passing each line as array to the specified block 75 | def each 76 | if block_given? 77 | while row = shift 78 | yield row 79 | end 80 | else 81 | to_enum # return enumerator 82 | end 83 | end 84 | 85 | # Read all remaining lines from the wrapped IO into an array of arrays 86 | def read 87 | table = Array.new 88 | each {|row| table << row} 89 | table 90 | end 91 | alias_method :readlines, :read 92 | 93 | # Rewind the underlying IO object and reset line counter 94 | def rewind 95 | @io.rewind 96 | end 97 | 98 | # Read next line from the wrapped IO and return as array or nil at EOF 99 | def shift 100 | if line = @io.gets 101 | CsvParser.parse_line(line) 102 | else 103 | nil 104 | end 105 | end 106 | alias_method :gets, :shift 107 | alias_method :readline, :shift 108 | 109 | # Close the wrapped IO 110 | def close 111 | @io.close 112 | end 113 | 114 | def closed? 115 | @io.closed? 116 | end 117 | end 118 | 119 | class String 120 | # Equivalent to FasterCSV::parse_line(self) 121 | def parse_csv 122 | CsvParser.parse_line(self) 123 | end 124 | end 125 | 126 | -------------------------------------------------------------------------------- /test/tc_interface.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Tests copied from faster_csv by James Edward Gray II 3 | # 4 | 5 | require 'test/unit' 6 | require 'fastest_csv' 7 | 8 | class TestFastestCSVInterface < Test::Unit::TestCase 9 | 10 | def setup 11 | @path = File.join(File.dirname(__FILE__), "temp_test_data.csv") 12 | 13 | File.open(@path, "w") do |file| 14 | file << "1,2,3\r\n" 15 | file << "4,5\r\n" 16 | end 17 | 18 | @expected = [%w{1 2 3}, %w{4 5}] 19 | end 20 | 21 | def teardown 22 | File.unlink(@path) 23 | end 24 | 25 | ### Test Read Interface ### 26 | 27 | def test_foreach 28 | FastestCSV.foreach(@path) do |row| 29 | assert_equal(@expected.shift, row) 30 | end 31 | end 32 | 33 | def test_open_and_close 34 | csv = FastestCSV.open(@path, "r+") 35 | assert_not_nil(csv) 36 | assert_instance_of(FastestCSV, csv) 37 | assert_equal(false, csv.closed?) 38 | csv.close 39 | assert(csv.closed?) 40 | 41 | ret = FastestCSV.open(@path) do |csv| 42 | assert_instance_of(FastestCSV, csv) 43 | "Return value." 44 | end 45 | assert(csv.closed?) 46 | assert_equal("Return value.", ret) 47 | end 48 | 49 | def test_parse 50 | data = File.read(@path) 51 | assert_equal( @expected, 52 | FastestCSV.parse(data) ) 53 | 54 | FastestCSV.parse(data) do |row| 55 | assert_equal(@expected.shift, row) 56 | end 57 | end 58 | 59 | #def test_parse_line 60 | # row = FasterCSV.parse_line("1;2;3", :col_sep => ";") 61 | # assert_not_nil(row) 62 | # assert_instance_of(Array, row) 63 | # assert_equal(%w{1 2 3}, row) 64 | # 65 | # # shortcut interface 66 | # row = "1;2;3".parse_csv(:col_sep => ";") 67 | # assert_not_nil(row) 68 | # assert_instance_of(Array, row) 69 | # assert_equal(%w{1 2 3}, row) 70 | #end 71 | 72 | def test_parse_line_with_empty_lines 73 | assert_equal(nil, FastestCSV.parse_line("")) # to signal eof 74 | #assert_equal(Array.new, FastestCSV.parse_line("\n1,2,3")) 75 | assert_equal([nil], FastestCSV.parse_line("\n1,2,3")) 76 | end 77 | 78 | def test_read_and_readlines 79 | assert_equal( @expected, 80 | FastestCSV.read(@path) ) 81 | assert_equal( @expected, 82 | FastestCSV.readlines(@path)) 83 | 84 | 85 | data = FastestCSV.open(@path) do |csv| 86 | csv.read 87 | end 88 | assert_equal(@expected, data) 89 | data = FastestCSV.open(@path) do |csv| 90 | csv.readlines 91 | end 92 | assert_equal(@expected, data) 93 | end 94 | 95 | #def test_table 96 | # table = FastestCSV.table(@path) 97 | # assert_instance_of(FastestCSV::Table, table) 98 | # assert_equal([[:"1", :"2", :"3"], [4, 5, nil]], table.to_a) 99 | #end 100 | 101 | def test_shift # aliased as gets() and readline() 102 | FastestCSV.open(@path, "r+") do |csv| 103 | assert_equal(@expected.shift, csv.shift) 104 | assert_equal(@expected.shift, csv.shift) 105 | assert_equal(nil, csv.shift) 106 | end 107 | end 108 | 109 | def test_long_line # ruby's regex parser may have problems with long rows 110 | File.unlink(@path) 111 | 112 | long_field_length = 2800 113 | File.open(@path, "w") do |file| 114 | file << "1,2,#{'3' * long_field_length}\r\n" 115 | end 116 | @expected = [%w{1 2} + ['3' * long_field_length]] 117 | test_shift 118 | end 119 | 120 | def test_enumerable 121 | FastestCSV.open(@path) do |csv| 122 | assert(csv.include?(["1", "2", "3"])) 123 | csv.rewind 124 | assert_equal([["1", "2", "3"], ["4", "5"]], csv.to_a) 125 | end 126 | end 127 | 128 | end 129 | -------------------------------------------------------------------------------- /ext/csv_parser/CsvParserService.java: -------------------------------------------------------------------------------- 1 | // 2 | // Copyright (c) Maarten Oelering, BrightCode BV 3 | // 4 | 5 | package org.brightcode; 6 | 7 | import java.io.IOException; 8 | 9 | import org.jruby.Ruby; 10 | import org.jruby.RubyArray; 11 | import org.jruby.RubyModule; 12 | import org.jruby.RubyString; 13 | import org.jruby.runtime.Block; 14 | import org.jruby.runtime.CallbackFactory; 15 | import org.jruby.runtime.builtin.IRubyObject; 16 | import org.jruby.runtime.load.BasicLibraryService; 17 | 18 | public class CsvParserService implements BasicLibraryService { 19 | 20 | private Ruby runtime; 21 | 22 | private static int DEF_ARRAY_LEN = 32; 23 | 24 | private static int UNQUOTED = 0; 25 | private static int IN_QUOTED = 1; 26 | private static int QUOTE_IN_QUOTED = 2; 27 | 28 | // Initial setup function. Takes a reference to the current JRuby runtime and 29 | // sets up our modules. 30 | public boolean basicLoad(Ruby runtime) throws IOException { 31 | this.runtime = runtime; 32 | 33 | RubyModule mCsvParser = runtime.defineModule("CsvParser"); 34 | // TODO: CallbackFactory#getSingletonMethod is deprecated 35 | CallbackFactory callbackFactory = runtime.callbackFactory(CsvParserService.class); 36 | mCsvParser.defineModuleFunction("parse_line", 37 | callbackFactory.getSingletonMethod("parseLine", RubyString.class)); 38 | return true; 39 | } 40 | 41 | public static IRubyObject parseLine(IRubyObject recv, RubyString line, Block unusedBlock) { 42 | Ruby runtime = recv.getRuntime(); 43 | 44 | CharSequence seq = line.getValue(); 45 | int length = seq.length(); 46 | if (length == 0) 47 | return runtime.getNil(); 48 | 49 | int state = UNQUOTED; 50 | StringBuilder value = new StringBuilder(length); // field value, no longer than line 51 | RubyArray array = RubyArray.newArray(runtime, DEF_ARRAY_LEN); 52 | 53 | for (int i = 0; i < length; i++) { 54 | char c = seq.charAt(i); 55 | switch (c) { 56 | case ',': 57 | if (state == UNQUOTED) { 58 | if (value.length() == 0) { 59 | array.append(runtime.getNil()); 60 | } 61 | else { 62 | array.append(RubyString.newString(runtime, value)); 63 | value.setLength(0); 64 | } 65 | } 66 | else if (state == IN_QUOTED) { 67 | value.append(c); 68 | } 69 | else if (state == 2) { 70 | array.append(RubyString.newString(runtime, value)); 71 | value.setLength(0); 72 | state = UNQUOTED; 73 | } 74 | break; 75 | case '"': 76 | if (state == UNQUOTED) { 77 | state = IN_QUOTED; 78 | } 79 | else if (state == IN_QUOTED) { 80 | state = QUOTE_IN_QUOTED; 81 | } 82 | else if (state == QUOTE_IN_QUOTED) { 83 | value.append(c); // escaped quote 84 | state = IN_QUOTED; 85 | } 86 | break; 87 | case '\r': 88 | case '\n': 89 | if (state == IN_QUOTED) { 90 | value.append(c); 91 | } 92 | else { 93 | i = length; // only parse first line if multiline 94 | } 95 | break; 96 | default: 97 | value.append(c); 98 | break; 99 | } 100 | } 101 | if (state == UNQUOTED) { 102 | if (value.length() == 0) { 103 | array.append(runtime.getNil()); 104 | } 105 | else { 106 | array.append(RubyString.newString(runtime, value)); 107 | value.setLength(0); 108 | } 109 | } 110 | else if (state == QUOTE_IN_QUOTED) { 111 | array.append(RubyString.newString(runtime, value)); 112 | } 113 | return array; 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /test/tc_csv_parsing.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Tests copied from faster_csv by James Edward Gray II 3 | # 4 | 5 | require 'test/unit' 6 | require 'fastest_csv' 7 | 8 | # 9 | # Following tests are my interpretation of the 10 | # {CSV RCF}[http://www.ietf.org/rfc/rfc4180.txt]. I only deviate from that 11 | # document in one place (intentionally) and that is to make the default row 12 | # separator $/. 13 | # 14 | class TestCSVParsing < Test::Unit::TestCase 15 | 16 | if RUBY_PLATFORM =~ /java/ 17 | include_package "org.brightcode" 18 | end 19 | 20 | def test_mastering_regex_example 21 | ex = %Q{Ten Thousand,10000, 2710 ,,"10,000","It's ""10 Grand"", baby",10K} 22 | assert_equal( [ "Ten Thousand", "10000", " 2710 ", nil, "10,000", 23 | "It's \"10 Grand\", baby", "10K" ], 24 | CsvParser.parse_line(ex) ) 25 | end 26 | 27 | # Pulled from: http://www.ruby-lang.org/cgi-bin/cvsweb.cgi/ruby/test/csv/test_csv.rb?rev=1.12.2.2;content-type=text%2Fplain 28 | def test_std_lib_csv 29 | [ ["\t", ["\t"]], 30 | ["foo,\"\"\"\"\"\",baz", ["foo", "\"\"", "baz"]], 31 | ["foo,\"\"\"bar\"\"\",baz", ["foo", "\"bar\"", "baz"]], 32 | ["\"\"\"\n\",\"\"\"\n\"", ["\"\n", "\"\n"]], 33 | ["foo,\"\r\n\",baz", ["foo", "\r\n", "baz"]], 34 | ["\"\"", [""]], 35 | ["foo,\"\"\"\",baz", ["foo", "\"", "baz"]], 36 | ["foo,\"\r.\n\",baz", ["foo", "\r.\n", "baz"]], 37 | ["foo,\"\r\",baz", ["foo", "\r", "baz"]], 38 | ["foo,\"\",baz", ["foo", "", "baz"]], 39 | ["\",\"", [","]], 40 | ["foo", ["foo"]], 41 | [",,", [nil, nil, nil]], 42 | [",", [nil, nil]], 43 | ["foo,\"\n\",baz", ["foo", "\n", "baz"]], 44 | ["foo,,baz", ["foo", nil, "baz"]], 45 | ["\"\"\"\r\",\"\"\"\r\"", ["\"\r", "\"\r"]], 46 | ["\",\",\",\"", [",", ","]], 47 | ["foo,bar,", ["foo", "bar", nil]], 48 | [",foo,bar", [nil, "foo", "bar"]], 49 | ["foo,bar", ["foo", "bar"]], 50 | [";", [";"]], 51 | ["\t,\t", ["\t", "\t"]], 52 | ["foo,\"\r\n\r\",baz", ["foo", "\r\n\r", "baz"]], 53 | ["foo,\"\r\n\n\",baz", ["foo", "\r\n\n", "baz"]], 54 | ["foo,\"foo,bar\",baz", ["foo", "foo,bar", "baz"]], 55 | [";,;", [";", ";"]] ].each do |csv_test| 56 | assert_equal(csv_test.last, CsvParser.parse_line(csv_test.first)) 57 | end 58 | 59 | [ ["foo,\"\"\"\"\"\",baz", ["foo", "\"\"", "baz"]], 60 | ["foo,\"\"\"bar\"\"\",baz", ["foo", "\"bar\"", "baz"]], 61 | ["foo,\"\r\n\",baz", ["foo", "\r\n", "baz"]], 62 | ["\"\"", [""]], 63 | ["foo,\"\"\"\",baz", ["foo", "\"", "baz"]], 64 | ["foo,\"\r.\n\",baz", ["foo", "\r.\n", "baz"]], 65 | ["foo,\"\r\",baz", ["foo", "\r", "baz"]], 66 | ["foo,\"\",baz", ["foo", "", "baz"]], 67 | ["foo", ["foo"]], 68 | [",,", [nil, nil, nil]], 69 | [",", [nil, nil]], 70 | ["foo,\"\n\",baz", ["foo", "\n", "baz"]], 71 | ["foo,,baz", ["foo", nil, "baz"]], 72 | ["foo,bar", ["foo", "bar"]], 73 | ["foo,\"\r\n\n\",baz", ["foo", "\r\n\n", "baz"]], 74 | ["foo,\"foo,bar\",baz", ["foo", "foo,bar", "baz"]] ].each do |csv_test| 75 | assert_equal(csv_test.last, CsvParser.parse_line(csv_test.first)) 76 | end 77 | end 78 | 79 | # From: http://ruby-talk.org/cgi-bin/scat.rb/ruby/ruby-core/6496 80 | def test_aras_edge_cases 81 | [ [%Q{a,b}, ["a", "b"]], 82 | [%Q{a,"""b"""}, ["a", "\"b\""]], 83 | [%Q{a,"""b"}, ["a", "\"b"]], 84 | [%Q{a,"b"""}, ["a", "b\""]], 85 | [%Q{a,"\nb"""}, ["a", "\nb\""]], 86 | [%Q{a,"""\nb"}, ["a", "\"\nb"]], 87 | [%Q{a,"""\nb\n"""}, ["a", "\"\nb\n\""]], 88 | [%Q{a,"""\nb\n""",\nc}, ["a", "\"\nb\n\"", nil]], 89 | [%Q{a,,,}, ["a", nil, nil, nil]], 90 | [%Q{,}, [nil, nil]], 91 | [%Q{"",""}, ["", ""]], 92 | [%Q{""""}, ["\""]], 93 | [%Q{"""",""}, ["\"",""]], 94 | [%Q{,""}, [nil,""]], 95 | [%Q{,"\r"}, [nil,"\r"]], 96 | [%Q{"\r\n,"}, ["\r\n,"]], 97 | [%Q{"\r\n,",}, ["\r\n,", nil]] ].each do |edge_case| 98 | assert_equal(edge_case.last, CsvParser.parse_line(edge_case.first)) 99 | end 100 | end 101 | 102 | def test_james_edge_cases 103 | # A read at eof? should return nil. 104 | assert_equal(nil, CsvParser.parse_line("")) 105 | # 106 | # With CSV it's impossible to tell an empty line from a line containing a 107 | # single +nil+ field. The standard CSV library returns [nil] 108 | # in these cases, but Array.new makes more sense to me. 109 | # 110 | #assert_equal(Array.new, FastestCSV.parse_line("\n1,2,3\n")) 111 | assert_equal([nil], CsvParser.parse_line("\n1,2,3\n")) 112 | end 113 | 114 | def test_rob_edge_cases 115 | [ [%Q{"a\nb"}, ["a\nb"]], 116 | [%Q{"\n\n\n"}, ["\n\n\n"]], 117 | [%Q{a,"b\n\nc"}, ['a', "b\n\nc"]], 118 | [%Q{,"\r\n"}, [nil,"\r\n"]], 119 | [%Q{,"\r\n."}, [nil,"\r\n."]], 120 | [%Q{"a\na","one newline"}, ["a\na", 'one newline']], 121 | [%Q{"a\n\na","two newlines"}, ["a\n\na", 'two newlines']], 122 | [%Q{"a\r\na","one CRLF"}, ["a\r\na", 'one CRLF']], 123 | [%Q{"a\r\n\r\na","two CRLFs"}, ["a\r\n\r\na", 'two CRLFs']], 124 | [%Q{with blank,"start\n\nfinish"\n}, ['with blank', "start\n\nfinish"]], 125 | ].each do |edge_case| 126 | assert_equal(edge_case.last, CsvParser.parse_line(edge_case.first)) 127 | end 128 | end 129 | 130 | end 131 | --------------------------------------------------------------------------------