├── CODE_OF_CONDUCT.md ├── LICENSE.md ├── README.md ├── benchmarks ├── Gemfile ├── README.md ├── benchmark.rb ├── datasets │ ├── README.md │ ├── finance │ │ ├── AAPL.csv │ │ ├── AMZN.csv │ │ ├── FB.csv │ │ ├── GOOGL.csv │ │ ├── MSFT.csv │ │ └── o │ │ │ ├── MSFT.json.csv │ │ │ ├── MSFT.tab │ │ │ └── MSFT.txt │ └── weather │ │ ├── Hobo_15minute_2017.csv │ │ └── o │ │ ├── Hobo_15minute_2017.json.csv │ │ └── Hobo_15minute_2017.txt ├── helper.rb ├── io │ ├── README.md │ ├── benchmark.rb │ ├── buffer.rb │ ├── buffer_line.rb │ ├── buffer_line_pos.rb │ ├── buffer_line_scanner.rb │ ├── buffer_num.rb │ ├── helper.rb │ ├── parser.rb │ ├── parser_nobuf.rb │ ├── parser_num.rb │ ├── parser_scanner.rb │ ├── read.rb │ └── test │ │ ├── test_io.rb │ │ ├── test_sample.rb │ │ └── test_scanner.rb ├── split.rb └── test │ ├── test_numeric.rb │ └── test_read.rb ├── csv11 ├── .gitignore ├── HISTORY.md ├── Manifest.txt ├── README.md ├── Rakefile ├── lib │ ├── csv11.rb │ └── csv11 │ │ └── version.rb └── test │ ├── helper.rb │ ├── test_split.rb │ └── test_version.rb ├── csvhuman ├── .gitignore ├── ATTRIBUTES.md ├── HISTORY.md ├── Manifest.txt ├── NOTES.md ├── README.md ├── Rakefile ├── TAGS.md ├── config │ ├── attributes.csv │ ├── langs.csv │ ├── tags.csv │ ├── types.csv │ └── versions.csv ├── lib │ ├── csvhuman.rb │ └── csvhuman │ │ ├── base.rb │ │ ├── column.rb │ │ ├── converter.rb │ │ ├── doc │ │ ├── helper.rb │ │ └── schema.rb │ │ ├── reader.rb │ │ ├── tag.rb │ │ └── version.rb ├── scripts │ ├── mkattributes.rb │ ├── mktags.rb │ ├── pages │ │ ├── attributes.txt │ │ └── tags.txt │ └── txt2csv.rb └── test │ ├── data │ ├── airports.csv │ ├── ebola.csv │ ├── hdx │ │ ├── ebola_treatment_centres.csv │ │ ├── phl_haima_houses_damaged.csv │ │ └── zika_cases.csv │ ├── sample1.csv │ ├── sample2.csv │ ├── sample3.csv │ ├── sample4.csv │ ├── test.csv │ └── unhcr.csv │ ├── helper.rb │ ├── test_doc.rb │ ├── test_hdx.rb │ ├── test_header_converter.rb │ ├── test_misc.rb │ ├── test_reader.rb │ ├── test_samples.rb │ ├── test_tags.rb │ ├── test_type_converters.rb │ └── test_type_mappings.rb ├── csvjson ├── .gitignore ├── HISTORY.md ├── Manifest.txt ├── README.md ├── Rakefile ├── datasets │ ├── hello.json.csv │ └── hello11.json.csv ├── lib │ ├── csvjson.rb │ └── csvjson │ │ ├── parser.rb │ │ └── version.rb └── test │ ├── helper.rb │ ├── test_parser.rb │ └── test_parser_misc.rb ├── csvpack ├── .gitignore ├── HISTORY.md ├── Manifest.txt ├── NOTES.md ├── README.md ├── Rakefile ├── getting-started-samples │ └── start.rb ├── lib │ ├── csvpack.rb │ └── csvpack │ │ ├── downloader.rb │ │ ├── pack.rb │ │ └── version.rb └── test │ ├── helper.rb │ ├── pack │ └── beer │ │ ├── data.csv │ │ └── datapackage.json │ ├── test_companies.rb │ ├── test_countries.rb │ ├── test_downloader.rb │ └── test_import.rb ├── csvreader ├── .gitignore ├── ALTERNATIVES.md ├── CHANGELOG.md ├── DIALECTS.md ├── ERRORS.md ├── Manifest.txt ├── NOTES.md ├── README.md ├── Rakefile ├── datasets │ ├── beer.csv │ ├── beer11.csv │ ├── cars11.csv │ ├── cities11.csv │ ├── customers11.csv │ ├── iris.attrib.csv │ ├── iris11.csv │ ├── lcc.attrib.csv │ ├── shakespeare.csv │ └── test.csv ├── lib │ ├── csvreader.rb │ └── csvreader │ │ ├── base.rb │ │ ├── buffer.rb │ │ ├── builder.rb │ │ ├── converter.rb │ │ ├── parser.rb │ │ ├── parser_fixed.rb │ │ ├── parser_json.rb │ │ ├── parser_std.rb │ │ ├── parser_strict.rb │ │ ├── parser_tab.rb │ │ ├── parser_table.rb │ │ ├── parser_yaml.rb │ │ ├── reader.rb │ │ ├── reader_hash.rb │ │ └── version.rb └── test │ ├── helper.rb │ ├── test_buffer.rb │ ├── test_converter.rb │ ├── test_parser.rb │ ├── test_parser_autofix.rb │ ├── test_parser_directive.rb │ ├── test_parser_fixed.rb │ ├── test_parser_formats.rb │ ├── test_parser_java.rb │ ├── test_parser_meta.rb │ ├── test_parser_null.rb │ ├── test_parser_numeric.rb │ ├── test_parser_quotes.rb │ ├── test_parser_strict.rb │ ├── test_parser_tab.rb │ ├── test_parser_table.rb │ ├── test_reader.rb │ ├── test_reader_converters.rb │ ├── test_reader_hash.rb │ ├── test_reader_hash_converters.rb │ └── test_samples.rb ├── csvrecord ├── .gitignore ├── HISTORY.md ├── Manifest.txt ├── NOTES.md ├── README.md ├── Rakefile ├── lib │ ├── csvrecord.rb │ └── csvrecord │ │ ├── base.rb │ │ └── version.rb └── test │ ├── data │ ├── beer.csv │ └── beer11.csv │ ├── helper.rb │ ├── test_record.rb │ ├── test_record_auto.rb │ └── test_version.rb ├── csvutils ├── .gitignore ├── HISTORY.md ├── Manifest.txt ├── NOTES.md ├── README.md ├── Rakefile ├── bin │ ├── csvcut │ ├── csvhead │ ├── csvheader │ ├── csvsplit │ └── csvstat ├── datasets │ ├── at-austria │ │ └── AUT.csv │ ├── de-deutschland │ │ └── bundesliga.csv │ └── eng-england │ │ └── 2017-18 │ │ └── E0.csv ├── getting-started-samples │ ├── AUT.csv │ ├── ENG.csv │ ├── start.rb │ └── start.sh ├── lib │ ├── csvutils.rb │ └── csvutils │ │ ├── commands │ │ ├── cut.rb │ │ ├── head.rb │ │ ├── header.rb │ │ ├── split.rb │ │ └── stat.rb │ │ ├── cut.rb │ │ ├── head.rb │ │ ├── header.rb │ │ ├── split.rb │ │ ├── stat.rb │ │ ├── test.rb │ │ ├── utils.rb │ │ └── version.rb └── test │ ├── helper.rb │ ├── test_cut.rb │ ├── test_head.rb │ ├── test_header.rb │ ├── test_misc.rb │ ├── test_split.rb │ └── test_version.rb ├── csvyaml ├── .gitignore ├── HISTORY.md ├── Manifest.txt ├── README.md ├── Rakefile ├── datasets │ ├── hello.yaml.csv │ └── hello11.yaml.csv ├── lib │ ├── csvyaml.rb │ └── csvyaml │ │ ├── parser.rb │ │ └── version.rb └── test │ ├── helper.rb │ ├── test_parser.rb │ └── test_parser_misc.rb ├── docs ├── README.md ├── csv-array-hash-struct.md ├── csv-formats.md ├── csv-libraries.md ├── csv-numerics.md ├── csv-parser.md ├── csv-quotes.md ├── csv-types.md ├── csv_stdlib_human.rb ├── csv_stdlib_test.rb ├── smarter-csv.md ├── sorry-sorry-sorry.md └── why-the-csv-stdlib-is-broken.md └── tabreader ├── .gitignore ├── CHANGELOG.md ├── Manifest.txt ├── NOTES.md ├── README.md ├── Rakefile ├── datasets ├── empty.tab └── test.tab ├── lib ├── tabreader.rb └── tabreader │ ├── reader.rb │ ├── reader_hash.rb │ └── version.rb └── test ├── helper.rb ├── test_reader.rb └── test_reader_hash.rb /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | Be A Mensch. 2 | 3 | 4 | ## Attribution 5 | 6 | This Code of Conduct is adapted from the Choose-A-Conduct org, version 1.1, available at . 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Comma-Separated Values (CSV) Tabular Data Readers Incl. CSV <3 JSON, CSV <3 YAML And More 2 | 3 | --- 4 | 5 | NOTE: I (Gerald Bauer) am happy to work on a better (alternate) csv library for ruby, BUT unfortunately for now I have first to figure out how to survive and earn some money. unfortunately, ruby is no longer (never was really) in demand over here (in austria). 6 | thus, if you want to see progress - please ping me (gerald.bauer @ gmail.com) and your support is more than welcome. 7 | 8 | --- 9 | 10 | 11 | 12 | 13 | Gem Family 14 | 15 | [**csvreader**](csvreader) - read tabular data in the comma-separated values (csv) format the right way (uses best practices out-of-the-box with zero-configuration) 16 | 17 | [csvjson](csvjson) - read tabular data in the CSV <3 JSON format, that is, comma-separated values CSV (line-by-line) records with javascript object notation (JSON) encoding rules 18 | 19 | [csvyaml](csvyaml) - read tabular data in the CSV <3 YAML format, that is, comma-separated values (CSV) line-by-line records with yaml ain't markup language (YAML) encoding rules 20 | 21 | [csvhuman](csvhuman) - read tabular data in the CSV Humanitarian eXchange Language (HXL) format, that is, comma-separated values (CSV) line-by-line records with a hashtag (meta data) line using the Humanitarian eXchange Language (HXL) rules 22 | 23 | [tabreader](tabreader) - read in tabular datafiles in text in the tabular (TAB) format 24 | 25 | [csvpack](csvpack) - tools 'n' scripts for working with tabular data packages using comma-separated values (CSV) datafiles in text with meta info (that is, schema, datatypes, ..) in datapackage.json; download, read into and query CSV datafiles with your SQL database (e.g. SQLite, PostgreSQL, ...) of choice and much more 26 | 27 | 28 | 29 | 30 | 31 | Typed Structs & More 32 | 33 | [csvrecord](csvrecord) - read in comma-separated values (csv) records with typed structs / schemas 34 | 35 | 36 | 37 | 38 | (Command Line) Tools & More 39 | 40 | [csvutils](csvutils) - tools 'n' scripts for working with comma-separated values (csv) datafiles - the world's most popular tabular data interchange format in text 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /benchmarks/Gemfile: -------------------------------------------------------------------------------- 1 | 2 | gem 'csvreader' 3 | 4 | 5 | gem 'hippie_csv' 6 | gem 'wtf_csv' 7 | gem 'lenientcsv' 8 | -------------------------------------------------------------------------------- /benchmarks/benchmark.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | require 'benchmark' 5 | 6 | require_relative 'helper' 7 | 8 | 9 | 10 | ## "raw" string reader benchmark - no type inference and data conversion 11 | 12 | n = 1000 13 | # n = 2 14 | 15 | 16 | Benchmark.bm(15) do |x| 17 | x.report( 'std:' ) { n.times do CSV.read( "#{data_dir}/finance/MSFT.csv" ); end } 18 | 19 | x.report( 'split:' ) { n.times do read_csv( "#{data_dir}/finance/MSFT.csv" ); end } 20 | x.report( 'split(tab):' ) { n.times do read_tab( "#{data_dir}/finance/o/MSFT.tab" ); end } 21 | x.report( 'split(table)*:' ) { n.times do read_table( "#{data_dir}/finance/o/MSFT.tab" ); end } 22 | x.report( 'split(table):' ) { n.times do read_table( "#{data_dir}/finance/o/MSFT.txt" ); end } 23 | 24 | x.report( 'reader:' ) { n.times do CsvReader.read( "#{data_dir}/finance/MSFT.csv" ); end } 25 | x.report( 'reader(tab):' ) { n.times do CsvReader.tab.read( "#{data_dir}/finance/o/MSFT.tab" ); end } 26 | x.report( 'reader(table)*:' ) { n.times do CsvReader.table.read( "#{data_dir}/finance/o/MSFT.tab" ); end } 27 | x.report( 'reader(table):' ) { n.times do CsvReader.table.read( "#{data_dir}/finance/o/MSFT.txt" ); end } 28 | x.report( 'reader(json):' ) { n.times do CsvReader.json.read( "#{data_dir}/finance/o/MSFT.json.csv" ); end } 29 | x.report( 'reader(yaml):' ) { n.times do CsvReader.yaml.read( "#{data_dir}/finance/MSFT.csv" ); end } 30 | 31 | x.report( 'hippie:' ) { n.times do HippieCSV.read( "#{data_dir}/finance/MSFT.csv" ); end } 32 | x.report( 'wtf:' ) { n.times do WtfCSV.scan( "#{data_dir}/finance/MSFT.csv" );end } 33 | x.report( 'lenient:' ) { n.times do LenientCSV.read( "#{data_dir}/finance/MSFT.csv" ); end } 34 | end 35 | 36 | 37 | 38 | ## numerics reader benchmark - all records numeric (limited type inference and data conversion) 39 | 40 | n = 100 41 | # n=2 42 | 43 | Benchmark.bm(15) do |x| 44 | x.report( 'std:' ) { n.times do CSV.read( "#{data_dir}/weather/Hobo_15minute_2017.csv", { :converters => :all }); end } 45 | 46 | x.report( 'split:' ) { n.times do read_faster_csv( "#{data_dir}/weather/Hobo_15minute_2017.csv", { converter: ->(v) { Float(v) rescue v } }); end } 47 | x.report( 'split(table):' ) { n.times do read_faster_csv( "#{data_dir}/weather/o/Hobo_15minute_2017.txt", { sep: /[ \t]+/, converter: ->(v) { Float(v) rescue v }}); end } 48 | 49 | x.report( 'reader:' ) { n.times do CsvReader.read( "#{data_dir}/weather/Hobo_15minute_2017.csv", { :converters => :all }); end } 50 | x.report( 'reader(table):' ) { n.times do CsvReader.table.read( "#{data_dir}/weather/Hobo_15minute_2017.csv", { :converters => :all }); end } 51 | x.report( 'reader(numeric):' ) { n.times do CsvReader.numeric.read( "#{data_dir}/weather/Hobo_15minute_2017.csv" ); end } 52 | x.report( 'reader(json):' ) { n.times do CsvReader.json.read( "#{data_dir}/weather/o/Hobo_15minute_2017.json.csv" ); end } 53 | x.report( 'reader(yaml):' ) { n.times do CsvReader.yaml.read( "#{data_dir}/weather/Hobo_15minute_2017.csv" ); end } 54 | end 55 | -------------------------------------------------------------------------------- /benchmarks/datasets/README.md: -------------------------------------------------------------------------------- 1 | # Sample Datasets 2 | 3 | 4 | ## Finance 5 | 6 | Source: Yahoo! Finance <> - Historical Stock Price Data (Available for Download as CSV). 7 | Steps: 8 | 9 | - Search for stock ticker symbol 10 | - Click on Historicial Data 11 | - Select Time Period (e.g. YTD - Year to Date) and Frequeny (Daily) 12 | - Click Download Data 13 | 14 | Examples: 15 | - [GOOGL](https://finance.yahoo.com/quote/GOOGL/history) - Alphabet Inc. (Google) - Class A Shares 16 | - [APPL](https://finance.yahoo.com/quote/AAPL/history) - Apple Inc. 17 | - [FB](https://finance.yahoo.com/quote/FB/history) - Facebook Inc. 18 | - [AMZN](https://finance.yahoo.com/quote/AMZN/history) - Amazon Inc. 19 | - [MSFT](https://finance.yahoo.com/quote/MSFT/history) - Microsoft Corp. 20 | 21 | 22 | 23 | 24 | ## Weather 25 | 26 | Source: Weather Station of the University of Waterloo, Ontario, Canada <> 27 | 28 | Datasets (Available for Download as CSV) in Archive: 29 | - <> 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /benchmarks/helper.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | require 'pp' 4 | 5 | 6 | require 'csv' 7 | require 'csvreader' 8 | 9 | 10 | require_relative 'split' 11 | 12 | require 'hippie_csv' 13 | require 'wtf_csv' 14 | require 'lenient_csv' 15 | 16 | 17 | 18 | def data_dir 19 | './datasets' 20 | end 21 | 22 | 23 | 24 | class LenientCSV 25 | def self.read( path ) 26 | txt = File.open( path, 'r:bom|utf-8' ) { |f| f.read } 27 | csv = new( txt ) 28 | csv.to_a 29 | end 30 | end 31 | -------------------------------------------------------------------------------- /benchmarks/io/benchmark.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | require 'benchmark' 5 | 6 | 7 | 8 | require_relative 'helper' 9 | 10 | 11 | n = 100 12 | # n = 10_000 13 | 14 | Benchmark.bm(26) do |x| 15 | x.report( 'line (each_line):' ) { n.times do readline_sample; end } 16 | x.report( 'line (each_line+chomp!):' ) { n.times do readline_inplace_sample; end } 17 | x.report( 'line (each_line+scanner):' ) { n.times do readline_scanner_sample; end } 18 | x.report( 'line (each_line+each_char):' ) { n.times do readchar_sample; end } 19 | 20 | x.report( 'line (parse+getch):' ) { n.times do parse1_sample; end } 21 | x.report( 'line (parse+gets+slice):' ) { n.times do parse2_sample; end } 22 | x.report( 'line (parse+gets+pos):' ) { n.times do parse3_sample; end } 23 | x.report( 'line (parse+nobuf):' ) { n.times do parse4_sample; end } 24 | x.report( 'line (parse+getch+num):' ) { n.times do parse5_sample; end } 25 | 26 | x.report( 'line (parse+gets+scanner):' ) { n.times do parse_scanner_sample; end } 27 | x.report( 'line (parse+gets+scanner*):' ) { n.times do parse_scanner_scanner_sample; end } 28 | end 29 | 30 | 31 | ## user system total real 32 | ## line (each_line): 5.375000 6.141000 11.516000 ( 11.522474) 33 | ## line (each_line+chomp!): 4.375000 6.109000 10.484000 ( 10.496063) 34 | ## line (each_line+scanner): 13.984000 5.656000 19.640000 ( 19.644859) 35 | ## line (each_line+each_char): 43.188000 8.141000 51.329000 ( 51.325110) 36 | ## line (parse+getch): 116.921000 7.312000 124.233000 (124.293261) 37 | ## line (parse+gets+slice): 188.032000 8.500000 196.532000 (196.711467) 38 | ## line (parse+gets+pos): 141.375000 13.485000 154.860000 (154.922206) 39 | ## line (parse+nobuf): 63.718000 7.047000 70.765000 ( 70.774960) 40 | ## line (parse+getch+num): 127.750000 8.156000 135.906000 (136.168328) 41 | ## line (parse+gets+scanner): 127.875000 8.140000 136.015000 (136.358474) 42 | ## line (parse+gets+scanner*): 26.516000 7.375000 33.891000 ( 33.912854) 43 | -------------------------------------------------------------------------------- /benchmarks/io/buffer.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | class Buffer 4 | def initialize( data ) 5 | # create the IO object we will read from 6 | @input = data 7 | @buf = [] ## last (buffer) chars (used for peek) 8 | end 9 | 10 | def eof?() @buf.size == 0 && @input.eof?; end 11 | 12 | def getc 13 | if @buf.size > 0 14 | @buf.shift ## get first char from buffer 15 | else 16 | @input.getc 17 | end 18 | end # method getc 19 | 20 | def peek 21 | if @buf.size == 0 && @input.eof? 22 | ## puts "peek - hitting eof!!!" 23 | return "\0" ## return NUL char (0) for now 24 | end 25 | 26 | if @buf.size == 0 27 | c = @input.getc 28 | @buf.push( c ) 29 | ## puts "peek - fill buffer >#{c}< (#{c.ord})" 30 | end 31 | 32 | @buf[0] ## @buf.first 33 | end # method peek 34 | end # class Buffer 35 | -------------------------------------------------------------------------------- /benchmarks/io/buffer_line.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | class BufferLine 5 | def initialize( data ) 6 | # create the IO object we will read from 7 | @input = data 8 | @buf = "" ## last (buffer) chars (used for peek) 9 | end 10 | 11 | def eof?() @buf.empty? && @input.eof?; end 12 | 13 | def getc 14 | if @buf.empty? 15 | @buf = @input.gets 16 | end 17 | 18 | ## todo: check - if works for multi-byte chars?? 19 | @buf.slice!(0) ## get first char from buffer 20 | end # method getc 21 | 22 | 23 | def peek 24 | if @buf.empty? && @input.eof? 25 | ## puts "peek - hitting eof!!!" 26 | return "\0" ## return NUL char (0) for now 27 | end 28 | 29 | if @buf.empty? 30 | @buf = @input.gets 31 | ## puts "peek - fill buffer >#{c}< (#{c.ord})" 32 | end 33 | 34 | ## todo: check - if works for multi-byte chars?? 35 | @buf[0] ## @buf.first 36 | end # method peek 37 | end # class Buffer 38 | -------------------------------------------------------------------------------- /benchmarks/io/buffer_line_pos.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | class BufferLinePos 5 | def initialize( data ) 6 | # create the IO object we will read from 7 | @input = data 8 | @buf = "" ## last (buffer) chars (used for peek) 9 | @pos = 0 10 | @length = 0 11 | end 12 | 13 | def empty? 14 | @length == 0 || @pos >= @length 15 | end 16 | 17 | def eof?() @input.eof? && empty?; end 18 | 19 | def getc 20 | if empty? 21 | @buf = @input.gets 22 | @length = @buf.length 23 | @pos = 0 24 | end 25 | 26 | ## todo: check - if works for multi-byte chars?? 27 | c = @buf[@pos] 28 | @pos += 1 29 | c 30 | end # method getc 31 | 32 | 33 | def peek 34 | if empty? 35 | 36 | if @input.eof? 37 | ## puts "peek - hitting eof!!!" 38 | return "\0" ## return NUL char (0) for now 39 | else 40 | @buf = @input.gets 41 | @length = @buf.length 42 | @pos = 0 43 | ## puts "peek - fill buffer >#{c}< (#{c.ord})" 44 | end 45 | end 46 | 47 | ## todo: check - if works for multi-byte chars?? 48 | @buf[@pos] ## @buf.first 49 | end # method peek 50 | end # class BufferLinePos 51 | -------------------------------------------------------------------------------- /benchmarks/io/buffer_line_scanner.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | class BufferLineScanner 5 | def initialize( data ) 6 | # create the IO object we will read from 7 | @input = data 8 | @buf = StringScanner.new("") ## last (buffer) chars (used for peek) 9 | end 10 | 11 | def eof?() @buf.eos? && @input.eof?; end 12 | 13 | def getc 14 | if @buf.eos? 15 | @buf.string = @input.gets ## was: StringScanner.new( @input.gets ) 16 | end 17 | 18 | ## todo: check - if works for multi-byte chars?? 19 | @buf.getch 20 | end # method getc 21 | 22 | 23 | def skip( pattern ) @buf.skip( pattern ); end 24 | def scan( pattern ) @buf.scan( pattern ); end 25 | def scan_until( pattern ) @buf.scan_until( pattern ); end 26 | 27 | 28 | 29 | def peek 30 | if @buf.eos? && @input.eof? 31 | ## puts "peek - hitting eof!!!" 32 | return "\0" ## return NUL char (0) for now 33 | end 34 | 35 | if @buf.eos? 36 | @buf.string = @input.gets ## was: StringScanner.new( @input.gets ) 37 | end 38 | 39 | ## todo: check - if works for multi-byte chars?? 40 | @buf.peek(1) 41 | end # method peek 42 | end # class BufferLineScanner 43 | -------------------------------------------------------------------------------- /benchmarks/io/buffer_num.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | class BufferNum 5 | def initialize( data ) 6 | # create the IO object we will read from 7 | @input = data 8 | @buf = [] ## last (buffer) chars (used for peek) 9 | end 10 | 11 | def eof?() @buf.size == 0 && @input.eof?; end 12 | 13 | 14 | def getc 15 | if @buf.size > 0 16 | @buf.shift.chr ## get first char from buffer (convert back to char/string from ord number/integer) 17 | else 18 | @input.getc 19 | end 20 | end # method getc 21 | 22 | ## note: peek always returns an integer 23 | def peek 24 | if @buf.size == 0 25 | 26 | return 0 if @input.eof? 27 | 28 | c = @input.getc 29 | @buf.push( c.ord ) 30 | ## puts "peek - fill buffer >#{c}< (#{c.ord})" 31 | end 32 | 33 | @buf[0] ## @buf.first 34 | end # method peek 35 | end # class Buffer 36 | -------------------------------------------------------------------------------- /benchmarks/io/helper.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | require 'pp' 5 | require 'strscan' 6 | 7 | 8 | 9 | require_relative 'read' 10 | 11 | require_relative 'buffer' 12 | require_relative 'buffer_line' 13 | require_relative 'buffer_line_pos' 14 | require_relative 'buffer_line_scanner' 15 | require_relative 'buffer_num' 16 | 17 | require_relative 'parser' 18 | require_relative 'parser_nobuf' 19 | require_relative 'parser_num' 20 | require_relative 'parser_scanner' 21 | 22 | 23 | 24 | def data_dir 25 | './datasets' 26 | end 27 | 28 | 29 | 30 | def readline_sample 31 | File.open( "#{data_dir}/finance/MSFT.csv", 'r:utf-8' ) do |f| 32 | readline( f ) 33 | end 34 | end 35 | 36 | def readline_inplace_sample 37 | File.open( "#{data_dir}/finance/MSFT.csv", 'r:utf-8' ) do |f| 38 | readline_inplace( f ) 39 | end 40 | end 41 | 42 | def readline_scanner_sample 43 | File.open( "#{data_dir}/finance/MSFT.csv", 'r:utf-8' ) do |f| 44 | readline_scanner( f ) 45 | end 46 | end 47 | 48 | 49 | def readchar_sample 50 | File.open( "#{data_dir}/finance/MSFT.csv", 'r:utf-8' ) do |f| 51 | readchar( f ) 52 | end 53 | end 54 | 55 | 56 | def parse1_sample 57 | File.open( "#{data_dir}/finance/MSFT.csv", 'r:utf-8' ) do |f| 58 | Parser.parse( Buffer.new( f ) ) 59 | end 60 | end 61 | 62 | def parse2_sample 63 | File.open( "#{data_dir}/finance/MSFT.csv", 'r:utf-8' ) do |f| 64 | Parser.parse( BufferLine.new( f ) ) 65 | end 66 | end 67 | 68 | def parse3_sample 69 | File.open( "#{data_dir}/finance/MSFT.csv", 'r:utf-8' ) do |f| 70 | Parser.parse( BufferLinePos.new( f ) ) 71 | end 72 | end 73 | 74 | def parse4_sample 75 | File.open( "#{data_dir}/finance/MSFT.csv", 'r:utf-8' ) do |f| 76 | ParserNobuf.parse( f ) 77 | end 78 | end 79 | 80 | def parse5_sample 81 | File.open( "#{data_dir}/finance/MSFT.csv", 'r:utf-8' ) do |f| 82 | ParserNum.parse( BufferNum.new( f ) ) 83 | end 84 | end 85 | 86 | 87 | def parse_scanner_sample 88 | File.open( "#{data_dir}/finance/MSFT.csv", 'r:utf-8' ) do |f| 89 | Parser.parse( BufferLineScanner.new( f ) ) 90 | end 91 | end 92 | 93 | def parse_scanner_scanner_sample 94 | File.open( "#{data_dir}/finance/MSFT.csv", 'r:utf-8' ) do |f| 95 | ParserScanner.parse( BufferLineScanner.new( f ) ) 96 | end 97 | end 98 | 99 | 100 | ## pp read_sample 101 | ## pp getch_sample 102 | ## pp getch2_sample 103 | ## pp getch3_sample 104 | ## pp parse4_sample 105 | ## pp parse_scanner_sample 106 | ## pp parse_scanner_scanner_sample 107 | -------------------------------------------------------------------------------- /benchmarks/io/parser.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | class Parser 4 | 5 | LF = "\n" ## \n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed) 6 | CR = "\r" ## \r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return) 7 | 8 | def self.parse( input ) 9 | recs = [] 10 | loop do 11 | break if input.eof? 12 | 13 | ## non-blanl line 14 | line = "" 15 | 16 | c = input.peek 17 | if c==LF || c==CR || input.eof? 18 | ## blank line 19 | recs << line 20 | skip_newline( input ) 21 | else 22 | loop do 23 | line << input.getc 24 | c = input.peek 25 | break if c==LF || c==CR || input.eof? 26 | end 27 | recs << line 28 | skip_newline( input ) 29 | end 30 | end 31 | recs 32 | end 33 | 34 | def self.skip_newline( input ) ## note: singular (strict) version 35 | return if input.eof? 36 | 37 | ## only skip CR LF or LF or CR 38 | if input.peek == CR 39 | input.getc ## eat-up 40 | input.getc if input.peek == LF 41 | elsif input.peek == LF 42 | input.getc ## eat-up 43 | else 44 | # do nothing 45 | end 46 | end 47 | 48 | end # class Parser 49 | -------------------------------------------------------------------------------- /benchmarks/io/parser_nobuf.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | class ParserNobuf 4 | 5 | LF = "\n" ## \n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed) 6 | CR = "\r" ## \r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return) 7 | 8 | def self.parse( input ) 9 | recs = [] 10 | if input.eof? 11 | else 12 | c = input.getc 13 | loop do 14 | ## non-blanl line 15 | line = "" 16 | 17 | if c==LF || c==CR || input.eof? 18 | ## blank line 19 | recs << line 20 | break if input.eof? 21 | c = skip_newline( c, input ) 22 | else 23 | loop do 24 | line << c 25 | c = input.getc 26 | break if c==LF || c==CR || input.eof? 27 | end 28 | recs << line 29 | break if input.eof? 30 | c = skip_newline( c, input ) 31 | end 32 | end 33 | end 34 | recs 35 | end 36 | 37 | 38 | def self.skip_newline( c, input ) 39 | return c if input.eof? 40 | 41 | ## only skip CR LF or LF or CR 42 | if c == CR 43 | c = input.getc 44 | c = input.getc if c == LF 45 | c 46 | elsif c == LF 47 | c = input.getc ## eat-up 48 | c 49 | else 50 | # do nothing 51 | c 52 | end 53 | end 54 | 55 | end # class ParserNobuf 56 | -------------------------------------------------------------------------------- /benchmarks/io/parser_num.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | class ParserNum 5 | 6 | LF = "\n".ord ## \n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed) 7 | CR = "\r".ord ## \r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return) 8 | 9 | def self.parse( input ) 10 | recs = [] 11 | loop do 12 | break if input.eof? 13 | 14 | ## non-blanl line 15 | line = "" 16 | 17 | c = input.peek 18 | if c==LF || c==CR || input.eof? 19 | ## blank line 20 | recs << line 21 | skip_newline( input ) 22 | else 23 | loop do 24 | line << input.getc 25 | c = input.peek 26 | break if c==LF || c==CR || input.eof? 27 | end 28 | recs << line 29 | skip_newline( input ) 30 | end 31 | end 32 | recs 33 | end 34 | 35 | def self.skip_newline( input ) ## note: singular (strict) version 36 | return if input.eof? 37 | 38 | ## only skip CR LF or LF or CR 39 | if input.peek == CR 40 | input.getc ## eat-up 41 | input.getc if input.peek == LF 42 | elsif input.peek == LF 43 | input.getc ## eat-up 44 | else 45 | # do nothing 46 | end 47 | end 48 | 49 | end # class ParserNum 50 | -------------------------------------------------------------------------------- /benchmarks/io/parser_scanner.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | class ParserScanner 4 | 5 | 6 | NOT_COMMA_OR_NEWLINE_RX = /[^,\n\r]*/ 7 | 8 | NEWLINE_RX = /\r?\n/ 9 | 10 | 11 | LF = "\n" ## \n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed) 12 | CR = "\r" ## \r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return) 13 | COMMA = "," 14 | 15 | def self.parse_record( input ) 16 | values = [] 17 | 18 | loop do 19 | value = input.scan( NOT_COMMA_OR_NEWLINE_RX ) 20 | values << value 21 | 22 | if input.eof? 23 | break 24 | elsif (c=input.peek; c==LF || c==CR) 25 | skip_newline( input ) 26 | break 27 | elsif input.peek == COMMA 28 | input.getc ## eat-up comma 29 | else 30 | puts "!! error - found >#{input.peek} (#{input.peek.ord})< - FS (,) or RS (\\n) expected!!!!" 31 | exit(1) 32 | end 33 | end 34 | 35 | values 36 | end 37 | 38 | 39 | def self.parse( input ) 40 | recs = [] 41 | loop do 42 | break if input.eof? 43 | 44 | ## non-blanl line 45 | line = "" 46 | 47 | c = input.peek 48 | if c==LF || c==CR || input.eof? 49 | ## blank line 50 | recs << line 51 | skip_newline( input ) 52 | else 53 | ## line << input.scan_until_newline 54 | ## recs << line 55 | ## skip_newline( input ) 56 | 57 | line = parse_record( input ) 58 | recs << line 59 | end 60 | end 61 | recs 62 | end 63 | 64 | def self.skip_newline( input ) ## note: singular (strict) version 65 | return if input.eof? 66 | 67 | input.skip( NEWLINE_RX ) 68 | end 69 | 70 | end # class Parser 71 | -------------------------------------------------------------------------------- /benchmarks/io/read.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | def readline( input ) 5 | recs = [] 6 | input.each_line do |line| 7 | line = line.chomp 8 | recs << line.split( "," ) 9 | end 10 | recs 11 | end 12 | 13 | def readline_inplace( input ) 14 | recs = [] 15 | input.each_line do |line| 16 | line.chomp! ## use "inplace" chomp version 17 | recs << line.split( "," ) 18 | end 19 | recs 20 | end 21 | 22 | 23 | 24 | 25 | NOT_COMMA_RX = / [^,]* /x ## everything until the next comma (or end of line) 26 | 27 | def readline_scanner( input ) 28 | recs = [] 29 | buf = StringScanner.new( "" ) 30 | input.each_line do |line| 31 | buf.string = line.chomp ## was: StringScanner.new( line.chomp ) 32 | rec = [] 33 | loop do 34 | value = buf.scan( NOT_COMMA_RX ) 35 | rec << value ## todo: check for value nil/no match - no more value found - why? why not? 36 | break if buf.eos? 37 | buf.getch ## eat-up comma 38 | end 39 | recs << rec # add record 40 | end 41 | recs 42 | end 43 | 44 | 45 | 46 | def readchar( input ) 47 | recs = [] 48 | input.each_line do |line| 49 | line = line.chomp 50 | rec = [] 51 | value = "" 52 | line.each_char do |c| 53 | if c == "," 54 | rec << value 55 | value = "" 56 | else 57 | value << c 58 | end 59 | end 60 | rec << value # add last value 61 | recs << rec # add record 62 | end 63 | recs 64 | end 65 | -------------------------------------------------------------------------------- /benchmarks/io/test/test_io.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | ## note: to run use: 5 | ## ruby ./io/test/test_io.rb 6 | 7 | 8 | require 'minitest/autorun' 9 | 10 | 11 | 12 | class TestIo < MiniTest::Test 13 | 14 | 15 | def test_chomp 16 | ## If $/ has not been changed from the default Ruby record separator, 17 | ## then chomp also removes carriage return characters 18 | ## (that is it will remove \n, \r, and \r\n). 19 | 20 | assert_equal "line", "line".chomp 21 | assert_equal "line ", "line ".chomp 22 | assert_equal "line\r\n ", "line\r\n ".chomp 23 | 24 | assert_equal "line", "line\r\n".chomp 25 | assert_equal "line", "line\r".chomp 26 | assert_equal "line", "line\n".chomp 27 | assert_equal "line\r\n", "line\r\n\r\n".chomp 28 | assert_equal "line\r", "line\r\r".chomp 29 | assert_equal "line\n", "line\n\n".chomp 30 | 31 | ## If $/ is an empty string, it will remove all trailing newlines from the string. 32 | assert_equal "line", "line\r\n\r\n".chomp('') 33 | assert_equal "line", "line\n\n".chomp('') 34 | assert_equal "line\r\r", "line\r\r".chomp('') 35 | end 36 | 37 | end # class TestIo 38 | -------------------------------------------------------------------------------- /benchmarks/io/test/test_sample.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | ## note: to run use: 5 | ## ruby ./io/test/test_sample.rb 6 | 7 | 8 | require 'minitest/autorun' 9 | 10 | 11 | require_relative '../helper' 12 | 13 | 14 | class TestSample < MiniTest::Test 15 | 16 | def recs 17 | [["Date", "Open", "High", "Low", "Close", "Adj Close", "Volume"], 18 | ["2018-01-02", "86.129997", "86.309998", "85.500000", "85.949997", "84.487411", "22483800"], 19 | ["2018-01-03", "86.059998", "86.510002", "85.970001", "86.349998", "84.880608", "26061400"], 20 | ["2018-01-04", "86.589996", "87.660004", "86.570000", "87.110001", "85.627678", "21912000"], 21 | ["2018-01-05", "87.660004", "88.410004", "87.430000", "88.190002", "86.689301", "23407100"], 22 | ["2018-01-08", "88.199997", "88.580002", "87.599998", "88.279999", "86.777763", "22113000"]] 23 | end 24 | 25 | 26 | def test_readline_sample 27 | assert_equal recs, readline_sample[0..5] 28 | end 29 | 30 | def test_readline_inplace_sample 31 | assert_equal recs, readline_inplace_sample[0..5] 32 | end 33 | 34 | def test_readline_scanner_sample 35 | assert_equal recs, readline_scanner_sample[0..5] 36 | end 37 | 38 | def test_readchar_sample 39 | assert_equal recs, readchar_sample[0..5] 40 | end 41 | 42 | 43 | def test_parse_scanner_scanner_sample 44 | assert_equal recs, parse_scanner_scanner_sample[0..5] 45 | end 46 | end # class TestSample 47 | -------------------------------------------------------------------------------- /benchmarks/io/test/test_scanner.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | ## note: to run use: 5 | ## ruby ./io/test/test_scanner.rb 6 | 7 | 8 | require 'minitest/autorun' 9 | 10 | 11 | require 'strscan' 12 | 13 | 14 | class TestScanner < MiniTest::Test 15 | 16 | 17 | NOT_COMMA_OR_NEWLINE_RX = /[^,\n\r]*/ 18 | 19 | def test_line 20 | buf = StringScanner.new( "a,b,\r\n" ) 21 | 22 | assert_equal "a", buf.scan( NOT_COMMA_OR_NEWLINE_RX ) 23 | assert_equal ",", buf.peek(1) 24 | assert_equal ",", buf.getch 25 | assert_equal "b", buf.scan( NOT_COMMA_OR_NEWLINE_RX ) 26 | assert_equal ",", buf.peek(1) 27 | assert_equal ",", buf.getch 28 | assert_equal "", buf.scan( NOT_COMMA_OR_NEWLINE_RX ) 29 | assert_equal "\r", buf.peek(1) 30 | assert_equal 2, buf.skip( /\r?\n/ ) 31 | assert buf.eos? 32 | end 33 | 34 | 35 | COMMA_LOOKAHEAD_RX = /(?=,|\n|\r) | $ /x 36 | 37 | def test_line_with_lookahead 38 | buf = StringScanner.new( "a,b,\r\n" ) 39 | 40 | assert_equal "a", buf.scan_until( COMMA_LOOKAHEAD_RX ) 41 | assert_equal ",", buf.peek(1) 42 | assert_equal ",", buf.getch 43 | assert_equal "b", buf.scan_until( COMMA_LOOKAHEAD_RX ) 44 | assert_equal ",", buf.peek(1) 45 | assert_equal ",", buf.getch 46 | assert_equal "", buf.scan_until( COMMA_LOOKAHEAD_RX ) 47 | assert_equal "\r", buf.peek(1) 48 | assert_equal 2, buf.skip( /\r?\n/ ) 49 | assert buf.eos? 50 | end 51 | 52 | 53 | def test_empty 54 | buf = StringScanner.new( "" ) 55 | 56 | assert_equal "", buf.scan_until( /$/ ) 57 | assert_equal "", buf.scan_until( /$/ ) 58 | assert buf.eos? 59 | 60 | assert_equal "", buf.scan_until( /(?=,) | $/x ) 61 | assert_equal "", buf.scan_until( /(?=,) | $/x ) 62 | assert buf.eos? 63 | 64 | assert_equal "", buf.scan( NOT_COMMA_OR_NEWLINE_RX ) 65 | assert buf.eos? 66 | 67 | assert_equal "", buf.scan_until( COMMA_LOOKAHEAD_RX ) 68 | assert buf.eos? 69 | end 70 | end # class TestScanner 71 | -------------------------------------------------------------------------------- /benchmarks/split.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | def read_csv( path, sep: ',' ) 4 | recs = [] 5 | File.open( path, 'r:utf-8' ) do |f| 6 | f.each_line do |line| 7 | line = line.chomp( '' ) ## fix: use line.chomp! inplace - why? why not? 8 | values = line.split( sep ) 9 | recs << values 10 | end 11 | end 12 | recs 13 | end 14 | 15 | 16 | def read_tab( path ) read_csv( path, sep: "\t" ); end 17 | 18 | ## todo: add converter for read_table - why, why not?? 19 | ## translate interpunct back to space 20 | ## values = values.map { |value| value.tr( '•', ' ' ) } 21 | def read_table( path ) read_csv( path, sep: /[ \t]+/ ); end 22 | 23 | 24 | 25 | 26 | 27 | def read_faster_csv( path, sep: ',', converter: nil ) 28 | recs = [] 29 | File.open( path, 'r:utf-8' ) do |f| 30 | f.each_line do |line| 31 | ## note: chomp('') if is an empty string, 32 | line = line.chomp( '' ) ## fix: use line.chomp! inplace - why? why not? 33 | values = line.split( sep ) 34 | 35 | values = values.map { |v| converter.call(v) } if converter 36 | 37 | recs << values 38 | end 39 | end 40 | recs 41 | end 42 | 43 | 44 | 45 | if __FILE__ == $0 46 | 47 | require 'pp' 48 | 49 | data = read_csv( './datasets/finance/MSFT.csv' ) 50 | pp data 51 | date = read_tab( './datasets/finance/o/MSFT.tab' ) 52 | pp data 53 | data = read_table( './datasets/finance/o/MSFT.txt' ) 54 | pp data[0..2] 55 | 56 | 57 | end 58 | -------------------------------------------------------------------------------- /csv11/.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | *.rbc 3 | /.config 4 | /coverage/ 5 | /InstalledFiles 6 | /pkg/ 7 | /spec/reports/ 8 | /spec/examples.txt 9 | /test/tmp/ 10 | /test/version_tmp/ 11 | /tmp/ 12 | 13 | # Used by dotenv library to load environment variables. 14 | # .env 15 | 16 | ## Specific to RubyMotion: 17 | .dat* 18 | .repl_history 19 | build/ 20 | *.bridgesupport 21 | build-iPhoneOS/ 22 | build-iPhoneSimulator/ 23 | 24 | ## Specific to RubyMotion (use of CocoaPods): 25 | # 26 | # We recommend against adding the Pods directory to your .gitignore. However 27 | # you should judge for yourself, the pros and cons are mentioned at: 28 | # https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control 29 | # 30 | # vendor/Pods/ 31 | 32 | ## Documentation cache and generated files: 33 | /.yardoc/ 34 | /_yardoc/ 35 | /doc/ 36 | /rdoc/ 37 | 38 | ## Environment normalization: 39 | /.bundle/ 40 | /vendor/bundle 41 | /lib/bundler/man/ 42 | 43 | # for a library or gem, you might want to ignore these files since the code is 44 | # intended to run in multiple environments; otherwise, check them in: 45 | # Gemfile.lock 46 | # .ruby-version 47 | # .ruby-gemset 48 | 49 | # unless supporting rvm < 1.11.0 or doing something fancy, ignore this: 50 | .rvmrc 51 | -------------------------------------------------------------------------------- /csv11/HISTORY.md: -------------------------------------------------------------------------------- 1 | ### 0.0.1 / 2017-07-02 2 | 3 | * Everything is new. First release. 4 | -------------------------------------------------------------------------------- /csv11/Manifest.txt: -------------------------------------------------------------------------------- 1 | HISTORY.md 2 | Manifest.txt 3 | README.md 4 | Rakefile 5 | lib/csv11.rb 6 | lib/csv11/version.rb 7 | test/helper.rb 8 | test/test_version.rb 9 | -------------------------------------------------------------------------------- /csv11/README.md: -------------------------------------------------------------------------------- 1 | # csv11 2 | 3 | csv11 library / gem - read / parse comma-separated values (csv); supports csv 1.1 incl. comments, named values, multi-line records, and more 4 | 5 | * home :: [github.com/csv11/csv11](https://github.com/csv11/csv11) 6 | * bugs :: [github.com/csv11/csv11/issues](https://github.com/csv11/csv11/issues) 7 | * gem :: [rubygems.org/gems/csv11](https://rubygems.org/gems/csv11) 8 | * rdoc :: [rubydoc.info/gems/csv11](http://rubydoc.info/gems/csv11) 9 | 10 | 11 | 12 | 13 | ## Usage 14 | 15 | to be done 16 | 17 | 18 | ## License 19 | 20 | ![](https://publicdomainworks.github.io/buttons/zero88x31.png) 21 | 22 | The `csv11` scripts are dedicated to the public domain. 23 | Use it as you please with no restrictions whatsoever. 24 | 25 | ## Questions? Comments? 26 | 27 | Post them to the [wwwmake forum](http://groups.google.com/group/wwwmake). Thanks! 28 | -------------------------------------------------------------------------------- /csv11/Rakefile: -------------------------------------------------------------------------------- 1 | require 'hoe' 2 | require './lib/csv11/version.rb' 3 | 4 | Hoe.spec 'csv11' do 5 | 6 | self.version = Values::VERSION 7 | 8 | self.summary = 'csv11 - read / parse comma-separated values (csv); supports csv 1.1 incl. comments, named values, multi-line records, and more' 9 | self.description = summary 10 | 11 | self.urls = ['https://github.com/csv11/csv11'] 12 | 13 | self.author = 'Gerald Bauer' 14 | self.email = 'ruby-talk@ruby-lang.org' 15 | 16 | # switch extension to .markdown for gihub formatting 17 | self.readme_file = 'README.md' 18 | self.history_file = 'HISTORY.md' 19 | 20 | self.licenses = ['Public Domain'] 21 | 22 | self.spec_extras = { 23 | required_ruby_version: '>= 2.2.2' 24 | } 25 | 26 | end 27 | -------------------------------------------------------------------------------- /csv11/lib/csv11/version.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | module Values 4 | MAJOR = 0 ## todo: namespace inside version or something - why? why not?? 5 | MINOR = 0 6 | PATCH = 3 7 | VERSION = [MAJOR,MINOR,PATCH].join('.') 8 | 9 | def self.version 10 | VERSION 11 | end 12 | 13 | def self.banner 14 | "csv11/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]" 15 | end 16 | 17 | def self.root 18 | "#{File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) )}" 19 | end 20 | end # module Values 21 | -------------------------------------------------------------------------------- /csv11/test/helper.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ## minitest setup 4 | require 'minitest/autorun' 5 | 6 | 7 | $RUBYLIBS_DEBUG = true 8 | 9 | ## our own code 10 | require 'csv11' 11 | -------------------------------------------------------------------------------- /csv11/test/test_split.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_split.rb 6 | 7 | 8 | require 'helper' 9 | 10 | 11 | class TestSplit < MiniTest::Test 12 | 13 | def test_split 14 | 15 | assert_equal ['a', 'b', 'c'], Values.split( 'a,b,c' ) 16 | assert_equal ['a', 'b', 'c'], Values.split( 'a, b, c' ) 17 | assert_equal ['a', 'b', ''], Values.split( 'a, b,' ) 18 | assert_equal ['a', 'b'], Values.split( 'a, b' ) 19 | 20 | assert_equal ['a', ['n','b'], ['m','c'],'d'], Values.split( 'a,n:b,m:c,d' ) 21 | assert_equal ['a', ['n','b'], ['m','c'],'d'], Values.split( 'a, n: b, m: c, d' ) 22 | 23 | ## check reserved names (e.g. http and https) 24 | assert_equal ['a', 'http://example.com', 'b'], Values.split( 'a, http://example.com, b' ) 25 | assert_equal ['a', 'http://example.com:80', 'b'], Values.split( 'a, http://example.com:80, b' ) 26 | assert_equal ['a', 'https://example.com', 'b'], Values.split( 'a, https://example.com, b' ) 27 | assert_equal ['a', 'https://example.com:80', 'b'], Values.split( 'a, https://example.com:80, b' ) 28 | assert_equal ['https://example.com'], Values.split( 'https://example.com' ) 29 | assert_equal ['https://example.com:80'], Values.split( 'https://example.com:80' ) 30 | 31 | assert_equal ['a', 'n n: b', 'm&m: c','d'], Values.split( 'a, n n: b, m&m: c, d' ) 32 | 33 | assert_equal [%{Hello, World!}], Values.split( %{"Hello, World!"} ) 34 | assert_equal [%{Hello, World!}], Values.split( %{'Hello, World!'} ) 35 | assert_equal [%{'Hello, World!'}], Values.split( %{"'Hello, World!'"} ) 36 | assert_equal [%{"Hello, World!"}], Values.split( %{'"Hello, World!"'} ) 37 | assert_equal [%{'Hello, World!'}, %{"Hello, World!"}], Values.split( %{"'Hello, World!'",'"Hello, World!"'} ) 38 | 39 | assert_equal [%{The "Quoted" World}], Values.split( %{The "Quoted" World} ) ## no need to escape quotes if not first (letter) of value 40 | 41 | assert_equal [%{'""Hello""', Quotes}], Values.split( %{"""'""Hello""', Quotes"""} ) 42 | 43 | ## check single-line named Values - will IGNORE commas (not special) 44 | assert_equal [['open', '12h, 13h, 14, 15h']], Values.split( 'open: 12h, 13h, 14, 15h') 45 | 46 | ## check named value with comma escaped with quote 47 | assert_equal ['a', 'b,c', 'd'], Values.split( %{a,"b,c",d} ) 48 | assert_equal ['a', 'b,c', 'd'], Values.split( %{ a , "b,c" , d } ) 49 | assert_equal ['a', ['n','b,c'], ['m','d,e'],'f,g'], Values.split( %{a,n:"b,c",m:"d,e","f,g"} ) 50 | 51 | assert_equal ['a', ['n','b:c'], ['m','d:e'],'f'], Values.split( 'a, n: b:c, m:d:e, f' ) 52 | assert_equal ['a', ['n','b:c'], ['m','d:e:f'],'g h:i:j'], Values.split( 'a,n:b:c,m:d:e:f,g h:i:j' ) 53 | 54 | ## note: space in quotes is significant - keep? why ? why not?? 55 | assert_equal ['a', ' b , c ', 'd'], Values.split( %{ a , " b , c " , d } ) 56 | end 57 | 58 | 59 | end # class TestSplit 60 | -------------------------------------------------------------------------------- /csv11/test/test_version.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_version.rb 6 | 7 | 8 | require 'helper' 9 | 10 | 11 | class TestVersion < MiniTest::Test 12 | 13 | 14 | def test_version 15 | 16 | puts Values::VERSION 17 | assert true 18 | ## assume everything ok if get here 19 | end 20 | 21 | end # class TestVersion 22 | -------------------------------------------------------------------------------- /csvhuman/.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | *.rbc 3 | /.config 4 | /coverage/ 5 | /InstalledFiles 6 | /pkg/ 7 | /spec/reports/ 8 | /test/tmp/ 9 | /test/version_tmp/ 10 | /tmp/ 11 | 12 | ## Specific to RubyMotion: 13 | .dat* 14 | .repl_history 15 | build/ 16 | 17 | ## Documentation cache and generated files: 18 | /.yardoc/ 19 | /_yardoc/ 20 | /doc/ 21 | /rdoc/ 22 | 23 | ## Environment normalisation: 24 | /.bundle/ 25 | /vendor/bundle 26 | /lib/bundler/man/ 27 | 28 | # for a library or gem, you might want to ignore these files since the code is 29 | # intended to run in multiple environments; otherwise, check them in: 30 | # Gemfile.lock 31 | # .ruby-version 32 | # .ruby-gemset 33 | 34 | # unless supporting rvm < 1.11.0 or doing something fancy, ignore this: 35 | .rvmrc 36 | -------------------------------------------------------------------------------- /csvhuman/HISTORY.md: -------------------------------------------------------------------------------- 1 | ### 0.0.1 / 2018-11-06 2 | 3 | * Everything is new. First release 4 | -------------------------------------------------------------------------------- /csvhuman/Manifest.txt: -------------------------------------------------------------------------------- 1 | HISTORY.md 2 | Manifest.txt 3 | README.md 4 | Rakefile 5 | config/attributes.csv 6 | config/langs.csv 7 | config/tags.csv 8 | config/types.csv 9 | config/versions.csv 10 | lib/csvhuman.rb 11 | lib/csvhuman/base.rb 12 | lib/csvhuman/column.rb 13 | lib/csvhuman/converter.rb 14 | lib/csvhuman/doc/helper.rb 15 | lib/csvhuman/doc/schema.rb 16 | lib/csvhuman/reader.rb 17 | lib/csvhuman/tag.rb 18 | lib/csvhuman/version.rb 19 | test/data/airports.csv 20 | test/data/ebola.csv 21 | test/data/hdx/ebola_treatment_centres.csv 22 | test/data/hdx/phl_haima_houses_damaged.csv 23 | test/data/hdx/zika_cases.csv 24 | test/data/sample1.csv 25 | test/data/sample2.csv 26 | test/data/sample3.csv 27 | test/data/sample4.csv 28 | test/data/test.csv 29 | test/data/unhcr.csv 30 | test/helper.rb 31 | test/test_doc.rb 32 | test/test_hdx.rb 33 | test/test_header_converter.rb 34 | test/test_misc.rb 35 | test/test_reader.rb 36 | test/test_samples.rb 37 | test/test_tags.rb 38 | test/test_type_converters.rb 39 | test/test_type_mappings.rb 40 | -------------------------------------------------------------------------------- /csvhuman/NOTES.md: -------------------------------------------------------------------------------- 1 | # Notes 2 | 3 | ## Todos 4 | 5 | - [ ] check if `+id` is always a number type (for auto-conversion) e.g. `#event+id` 6 | - [ ] add check for `#geo` and `+lat`, `+lon` (for auto-conversion) to floats 7 | - [ ] add type converter for `#date` (e.g. `#date+start`, `#date+reported`, etc.) - support 2017-12-11 and 11/14/2017 for now? 8 | - [ ] header converter for symbols - turn `+` into `_x_` or `_I_` or into `$` - why? why not? (check if `$` supported in ruby inline? - no, it's not possible) 9 | 10 | 11 | ## Examples 12 | 13 | Add more .csv examples with hxl tags, see 14 | 15 | 16 | 17 | 18 | 19 | 20 | ### Use `$` in symbol for `+` 21 | 22 | #### Ruby 23 | 24 | ``` 25 | >> s = :adm1 26 | => :adm1 27 | >> s = :adm1_x_code 28 | => :adm1_x_code 29 | >> s = :adm1$code 30 | SyntaxError: (irb):3: syntax error, unexpected tGVAR, expecting end-of-input 31 | ``` 32 | -------------------------------------------------------------------------------- /csvhuman/Rakefile: -------------------------------------------------------------------------------- 1 | require 'hoe' 2 | require './lib/csvhuman/version.rb' 3 | 4 | Hoe.spec 'csvhuman' do 5 | 6 | self.version = CsvHuman::VERSION 7 | 8 | self.summary = "csvhuman - read tabular data in the CSV Humanitarian eXchange Language (HXL) format, that is, comma-separated values (CSV) line-by-line records with a hashtag (meta data) line using the Humanitarian eXchange Language (HXL) rules" 9 | self.description = summary 10 | 11 | self.urls = ['https://github.com/csvreader/csvhuman'] 12 | 13 | self.author = 'Gerald Bauer' 14 | self.email = 'wwwmake@googlegroups.com' 15 | 16 | # switch extension to .markdown for gihub formatting 17 | self.readme_file = 'README.md' 18 | self.history_file = 'HISTORY.md' 19 | 20 | self.extra_deps = [ 21 | ['csvreader', '>=1.2.1'] 22 | ] 23 | 24 | self.licenses = ['Public Domain'] 25 | 26 | self.spec_extras = { 27 | required_ruby_version: '>= 2.2.2' 28 | } 29 | 30 | end 31 | -------------------------------------------------------------------------------- /csvhuman/config/langs.csv: -------------------------------------------------------------------------------- 1 | code, name+en, name 2 | ar, Arabic, 3 | en, English, English 4 | es, Spanish, Español 5 | de, German, Deutsch 6 | fa, Dari / Farsi / Persian, 7 | fr, French, 8 | ms, Malay, 9 | ps, Pashto, 10 | ru, Russian, 11 | sw, Swahili, 12 | tl, Tagalog, 13 | uk, Ukrainian, 14 | ur, Urdu, 15 | -------------------------------------------------------------------------------- /csvhuman/config/types.csv: -------------------------------------------------------------------------------- 1 | type,description 2 | text, 3 | number, 4 | url, 5 | email, 6 | phone, 7 | date, 8 | -------------------------------------------------------------------------------- /csvhuman/config/versions.csv: -------------------------------------------------------------------------------- 1 | version, date 2 | 1.0, 2016-03-18 3 | 1.1, 2018-04-30 4 | 1.2, 5 | -------------------------------------------------------------------------------- /csvhuman/lib/csvhuman.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | require 'csvreader' ## add all "alternative" shortcut aliases 4 | 5 | 6 | ## our own code (without "top-level" shortcuts e.g. "modular version") 7 | require 'csvhuman/base' 8 | 9 | 10 | #### 11 | # add some "alternative" shortcut aliases 12 | CsvHum = CsvHuman 13 | CSV_HXL = CsvHuman 14 | CSVHXL = CsvHuman 15 | HXL = CsvHuman 16 | -------------------------------------------------------------------------------- /csvhuman/lib/csvhuman/base.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | require 'csvreader/base' 4 | 5 | 6 | ## our own code 7 | require 'csvhuman/version' # note: let version always go first 8 | require 'csvhuman/tag' 9 | require 'csvhuman/column' 10 | require 'csvhuman/converter' 11 | require 'csvhuman/reader' 12 | 13 | require 'csvhuman/doc/helper.rb' 14 | require 'csvhuman/doc/schema.rb' 15 | 16 | 17 | # say hello 18 | puts CsvHuman.banner if $DEBUG || (defined?($RUBYCOCO_DEBUG) && $RUBYCOCO_DEBUG) 19 | -------------------------------------------------------------------------------- /csvhuman/lib/csvhuman/column.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | class CsvHuman 5 | 6 | 7 | class Columns 8 | 9 | 10 | def self.build( values, header_converter ) 11 | 12 | ## "clean" unify/normalize names 13 | keys = values.map do |value| 14 | if value 15 | if value.empty? 16 | nil 17 | else 18 | ## e.g. #ADM1 CODE => #adm1 +code 19 | ## POPULATION F CHILDREN AFFECTED => #population +affected +children +f 20 | tag_key = Tag.normalize( value ) 21 | ## turn empty normalized tags (e.g. "stray" hashtag) into nil too 22 | 23 | if value.empty? 24 | nil 25 | else 26 | header_key = 27 | ## todo/fix: pass in column index - why? why not? 28 | ## pass in column index for all columns (or only tagged ones?) or both? 29 | ## if header_converter.arity == 1 # straight converter 30 | header_converter.call( tag_key ) 31 | ## else 32 | ## header_converter.call( value, index ) 33 | ## end 34 | 35 | ## note: 36 | ## return nil, "" or false to skip column 37 | if header_key.nil? || header_key.empty? || header_key == false ## check again: skip empty "" columns 38 | nil 39 | else 40 | ## note: return header_key (used for returned record/hash) AND tag_key (used for type conversion config) 41 | ## lets us fold more columns into one or splat single list/array columns into many 42 | [header_key,tag_key] 43 | end 44 | end 45 | end 46 | else # keep (nil) as is 47 | nil 48 | end 49 | end 50 | 51 | 52 | counts = {} 53 | keys.each_with_index do |key,i| 54 | if key 55 | header_key = key[0] 56 | counts[header_key] ||= [] 57 | counts[header_key] << i 58 | end 59 | end 60 | ## puts "counts:" 61 | ## pp counts 62 | 63 | 64 | ## create all unique tags (used for type conversion) 65 | tags = {} 66 | keys.each do |key| 67 | if key 68 | tag_key = key[1] 69 | tags[tag_key] ||= Tag.parse( tag_key ) ## note: "reuse" tag for all columns if same tag key 70 | end 71 | end 72 | ## puts "tags:" 73 | ## pp tags 74 | 75 | 76 | cols = [] 77 | keys.each do |key| 78 | if key 79 | header_key = key[0] 80 | tag_key = key[1] 81 | 82 | count = counts[header_key] 83 | tag = tags[tag_key] ## note: "reuse" tag for all columns if same tag key 84 | 85 | if count.size > 1 86 | ## note: defaults to use "standard/default" tag key (as a string) 87 | cols << Column.new( header_key, tag, list: true ) 88 | else 89 | cols << Column.new( header_key, tag ) 90 | end 91 | else 92 | cols << Column.new 93 | end 94 | end 95 | 96 | cols 97 | end 98 | end ## class Columns 99 | 100 | 101 | 102 | 103 | class Column 104 | attr_reader :key # used for record (record key); note: list columns must use the same key 105 | attr_reader :tag 106 | 107 | 108 | def initialize( key=nil, tag=nil, list: false ) 109 | @key = key 110 | @tag = tag 111 | @list = list 112 | end 113 | 114 | 115 | def tagged?() @tag.nil? == false; end 116 | def list?() @list; end 117 | end # class Column 118 | 119 | end # class CsvHuman 120 | -------------------------------------------------------------------------------- /csvhuman/lib/csvhuman/doc/helper.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | class CsvHuman 4 | module DocHelper 5 | 6 | 7 | HASHTAG_LINE_RX= /^ 8 | \s* 9 | \# 10 | (?[a-z][a-z0-9]+) 11 | \s* 12 | $/x 13 | 14 | def match_hashtag( line ) 15 | if (m=HASHTAG_LINE_RX.match(line)) 16 | puts "hashtag >#{m[:name]}<" 17 | m 18 | else 19 | nil 20 | end 21 | end 22 | 23 | 24 | 25 | ## note: attrib might be one letter only (e.g.) +m,+f, etc. 26 | ATTRIBUTE_LINE_RX= /^ 27 | \s* 28 | \+ 29 | (?[a-z][a-z0-9]*) 30 | \s* 31 | $/x 32 | 33 | def match_attribute( line ) 34 | if (m=ATTRIBUTE_LINE_RX.match(line)) 35 | puts "attrib >#{m[:name]}<" 36 | m 37 | else 38 | false 39 | end 40 | end 41 | 42 | 43 | 44 | ## 45 | ## e.g. 1.1. Places 46 | ## 2.1. Sex- and-age disaggregation (SADD) attributes 47 | 48 | HEADING_LINE_RX=/^ 49 | \s* 50 | (?[1-9]) 51 | \. 52 | (?[1-9]) 53 | \. 54 | \s+ 55 | (?.+?) 56 | \s* 57 | $/x 58 | 59 | def match_heading( line ) 60 | if (m=HEADING_LINE_RX.match(line)) 61 | puts "heading #{m[:level1]}.#{m[:level2]}. (#{m[:level2]}) >#{m[:title]}<" 62 | m 63 | else 64 | false 65 | end 66 | end 67 | 68 | 69 | 70 | TYPE_RX = /Every value must be a (?<type>[a-z]+)./ 71 | def match_type( line ) 72 | if (m=TYPE_RX.match(line)) 73 | puts "type: >#{m[:type]}<" 74 | m 75 | else 76 | false 77 | end 78 | end 79 | 80 | 81 | 82 | SINCE_HXL_RX = /Since HXL (?<version>[1]\.[0-9])\.?/ 83 | def match_since_hxl( line ) 84 | if (m=SINCE_HXL_RX.match(line)) 85 | puts "version: >#{m[:version]}<" 86 | m 87 | else 88 | false 89 | end 90 | end 91 | 92 | 93 | 94 | def split_descr( line ) 95 | if( m=match_since_hxl( line )) 96 | version = m[:version] 97 | ## remove "Since HXL 1.0" from text 98 | text = line.gsub( SINCE_HXL_RX, '' ).strip 99 | else 100 | version = '?' 101 | text = line 102 | end 103 | [text,version] 104 | end 105 | 106 | 107 | end # module DocHelper 108 | end # class CsvHuman 109 | -------------------------------------------------------------------------------- /csvhuman/lib/csvhuman/version.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | class CsvHuman 5 | 6 | MAJOR = 1 7 | MINOR = 1 8 | PATCH = 1 9 | VERSION = [MAJOR,MINOR,PATCH].join('.') 10 | 11 | 12 | def self.version 13 | VERSION 14 | end 15 | 16 | def self.banner 17 | "csvhuman/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]" 18 | end 19 | 20 | def self.root 21 | "#{File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) )}" 22 | end 23 | 24 | end # module CsvHuman 25 | -------------------------------------------------------------------------------- /csvhuman/scripts/mkattributes.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | require 'csvreader' 4 | 5 | 6 | 7 | def linkify_attribute( attribute ) 8 | "[`+#{attribute}`](##{attribute})" 9 | end 10 | 11 | 12 | ATTRIBUTE_RX = / 13 | \+[a-z][a-z0-9_]* 14 | /x 15 | 16 | HASHTAG_RX = / 17 | \#[a-z][a-z0-9]+ 18 | /x 19 | 20 | 21 | 22 | def linkify_hashtags( line, page: '' ) 23 | ## note: assumes #adm1 etc. (that is, includes leading hashtag) 24 | line.gsub( HASHTAG_RX ) do |hashtag| 25 | puts "linkify hashtag >#{hashtag}<" 26 | "[`#{hashtag}`](#{page}#{hashtag})" 27 | end 28 | end 29 | 30 | def linkify_attributes( line ) 31 | ## note: assumes +f etc. (that is, includes leading plus) 32 | line.gsub( ATTRIBUTE_RX ) do |attrib| 33 | puts "linkify attribute >#{attrib}<" 34 | if attrib.index( '_' ) 35 | "`#{attrib}`" ## note: do NOT linkify custom attributes for now (if include underscore e.g. +age12_17 etc.) 36 | else 37 | "[`#{attrib}`](##{attrib[1..-1]})" ## note: cut-of leading + in intralink 38 | end 39 | end 40 | end 41 | 42 | 43 | 44 | def build_summary( attributes ) 45 | pp attributes 46 | 47 | attributes_a_to_z = attributes.sort { |l,r| l['attribute'] <=> r['attribute'] } 48 | pp attributes_a_to_z 49 | 50 | 51 | buf = "" 52 | buf << "# Humanitarian eXchangle Language (HXL) Attributes\n\n" 53 | 54 | attributes_a_to_z.each do |attribute| 55 | buf << linkify_attribute( attribute['attribute']) 56 | buf << "\n" 57 | end 58 | 59 | buf << "\n\n" 60 | 61 | 62 | 63 | last_category = nil 64 | 65 | attributes.each do |attribute| 66 | 67 | if attribute['category'] != last_category 68 | buf << "## #{attribute['category']}\n\n" 69 | end 70 | 71 | buf << "### `+#{attribute['attribute']}`\n\n" 72 | buf << "#{linkify_attributes(attribute['description'])}" 73 | buf << " " 74 | buf << "_Since version #{attribute['since']}_\n\n" 75 | 76 | unless attribute['tags'].empty? 77 | buf << "Tags: #{linkify_hashtags(attribute['tags'], page: 'TAGS.md')}\n\n" 78 | end 79 | 80 | 81 | last_category = attribute['category'] 82 | end 83 | 84 | buf 85 | end 86 | 87 | 88 | 89 | 90 | ## pp Csv.read( "./config/attributes.csv" ) 91 | 92 | attributes = CsvHash.read( "./config/attributes.csv" ) 93 | 94 | buf = build_summary( attributes ) 95 | puts buf 96 | 97 | File.open( "./ATTRIBUTES.md", 'w:utf-8') do |f| 98 | f.write( buf ) 99 | end 100 | -------------------------------------------------------------------------------- /csvhuman/scripts/mktags.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | require 'csvreader' 4 | 5 | 6 | 7 | def linkify_tag( tag ) 8 | "[`##{tag}`](##{tag})" 9 | end 10 | 11 | 12 | ATTRIBUTE_RX = / 13 | \+[a-z][a-z0-9_]* 14 | /x 15 | 16 | HASHTAG_RX = / 17 | \#[a-z][a-z0-9]+ 18 | /x 19 | 20 | def linkify_hashtags( line ) 21 | ## note: assumes #adm1 etc. (that is, includes leading hashtag) 22 | line.gsub( HASHTAG_RX ) do |hashtag| 23 | puts "linkify hashtag >#{hashtag}<" 24 | "[`#{hashtag}`](#{hashtag})" 25 | end 26 | end 27 | 28 | def linkify_attributes( line, page: '' ) 29 | ## note: assumes +f etc. (that is, includes leading plus) 30 | line.gsub( ATTRIBUTE_RX ) do |attrib| 31 | puts "linkify attribute >#{attrib}<" 32 | if attrib.index( '_' ) 33 | "`#{attrib}`" ## note: do NOT linkify custom attributes for now (if include underscore e.g. +age12_17 etc.) 34 | else 35 | "[`#{attrib}`](#{page}##{attrib[1..-1]})" ## note: cut-of leading + in intralink 36 | end 37 | end 38 | end 39 | 40 | 41 | 42 | 43 | def build_summary( tags ) 44 | pp tags 45 | 46 | tags_a_to_z = tags.sort { |l,r| l['tag'] <=> r['tag'] } 47 | pp tags_a_to_z 48 | 49 | 50 | buf = "" 51 | buf << "# Humanitarian eXchangle Language (HXL) Tags\n\n" 52 | 53 | tags_a_to_z.each do |tag| 54 | buf << linkify_tag( tag['tag']) 55 | buf << "\n" 56 | end 57 | 58 | buf << "\n\n" 59 | 60 | 61 | 62 | last_category = nil 63 | 64 | tags.each do |tag| 65 | 66 | if tag['category'] != last_category 67 | buf << "## #{tag['category']}\n\n" 68 | end 69 | 70 | buf << "### `##{tag['tag']}`\n\n" 71 | buf << "#{linkify_hashtags(tag['description'])}" 72 | buf << " " 73 | buf << "_Since version #{tag['since']}_\n\n" 74 | 75 | unless tag['type'].empty? 76 | buf << "Every value must be a **#{tag['type']}**.\n\n" 77 | end 78 | 79 | unless tag['attributes'].empty? 80 | buf << "Attributes: #{linkify_attributes(tag['attributes'], page: 'ATTRIBUTES.md')}\n\n" 81 | end 82 | 83 | 84 | last_category = tag['category'] 85 | end 86 | 87 | buf 88 | end 89 | 90 | 91 | ## pp Csv.read( "./config/tags.csv" ) 92 | 93 | tags = CsvHash.read( "./config/tags.csv" ) 94 | 95 | buf = build_summary( tags ) 96 | puts buf 97 | 98 | File.open( "./TAGS.md", 'w:utf-8') do |f| 99 | f.write( buf ) 100 | end 101 | -------------------------------------------------------------------------------- /csvhuman/scripts/txt2csv.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | require 'csvhuman' 4 | 5 | 6 | def csv_row( *values ) 7 | values.map do |value| 8 | if value && (value.index(",") || value.index('"')) 9 | ## double quotes and enclose in double qoutes 10 | value = %Q{"#{value.gsub('"', '""')}"} 11 | else 12 | value 13 | end 14 | end 15 | end 16 | 17 | 18 | attribs = CsvHuman::Doc.read_attributes( "./scripts/pages/attributes.txt" ) 19 | pp attribs 20 | 21 | File.open( "./config/attributes.csv", 'w:utf-8') do |f| 22 | f.write ["attribute","since","category","tags","description"].join(",") 23 | f.write "\n" 24 | attribs.each do |attrib| 25 | f.write csv_row(*attrib).join(",") 26 | f.write "\n" 27 | end 28 | end 29 | 30 | 31 | tags = CsvHuman::Doc.read_tags( "./scripts/pages/tags.txt" ) 32 | pp tags 33 | 34 | File.open( "./config/tags.csv", 'w:utf-8') do |f| 35 | f.write ["tag", "type", "since", "category", "attributes", "description"].join(",") 36 | f.write "\n" 37 | tags.each do |tag| 38 | f.write csv_row(*tag).join(",") 39 | f.write "\n" 40 | end 41 | end 42 | -------------------------------------------------------------------------------- /csvhuman/test/data/airports.csv: -------------------------------------------------------------------------------- 1 | id,ident,type,name,latitude_deg,longitude_deg,elevation_ft,continent,iso_country,iso_region,municipality,scheduled_service,gps_code,iata_code,local_code,home_link,wikipedia_link,keywords,score,last_updated 2 | "#meta +id","#meta +code","#loc +airport +type","#loc +airport +name","#geo +lat","#geo +lon","#geo +elevation +ft","#region +continent +code","#country +code +iso2","#adm1 +code +iso","#loc +municipality +name","#status +scheduled","#loc +airport +code +gps","#loc +airport +code +iata","#loc +airport +code +local","#meta +url +airport","#meta +url +wikipedia","#meta +keywords","#meta +score","#date +updated" 3 | 4976,NSFA,medium_airport,"Faleolo International Airport",-13.829999923706055,-172.00799560546875,58,OC,WS,WS-AA,Apia,1,NSFA,APW,,,http://en.wikipedia.org/wiki/Faleolo_International_Airport,,1050,2009-08-31T16:22:49+00:00 4 | 35173,NSMA,small_airport,"Maota Airport",-13.742300033569336,-172.25799560546875,,OC,WS,WS-PA,Maota,1,NSMA,MXS,,,http://en.wikipedia.org/wiki/Maota_Airport,"Savaii Island",450,2009-08-31T16:13:53+00:00 5 | 31127,NSFI,small_airport,"Fagali'i Airport",-13.848699569699999,-171.740005493,131,OC,WS,WS-TU,Apia,0,NSFI,FGI,,,http://en.wikipedia.org/wiki/Fagali'i_Airport,,50,2012-11-26T12:09:24+00:00 6 | 30608,NSAU,small_airport,"Asau Airport",-13.505132,-172.627888,,OC,WS,WS-VS,Asau,1,NSAU,AAU,,,http://en.wikipedia.org/wiki/Asau_Airport,,50,2016-06-01T06:17:22+00:00 7 | -------------------------------------------------------------------------------- /csvhuman/test/data/sample1.csv: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%%% 2 | % sample from HXL tagging conventions (version 1.1) 3 | % see http://hxlstandard.org/standard/1_1final/tagging/ 4 | 5 | CAMP INFORMATION, , NEEDS 6 | LOCATION NAME, LOCATION CODE, NUMBER AFFECTED 7 | #loc +name, #loc +code, #affected 8 | Camp A, 01000001, 2000 9 | Camp B, 01000002, 750 10 | Camp C, 01000003, 1920 11 | -------------------------------------------------------------------------------- /csvhuman/test/data/sample2.csv: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%%% 2 | % sample from HXL tagging conventions (version 1.1) 3 | % see http://hxlstandard.org/standard/1_1final/tagging/ 4 | 5 | #event+id, #affected+killed, #region, #meta+source+reliability, #date+reported, #geo+lat, #geo+lon 6 | 1, 1, Mediterranean, Verified, 2015-11-05, 36.891500, 27.287700 7 | 3, 1, Central America incl. Mexico, Partially Verified, 2015-11-03, 15.956400, -93.663099 8 | -------------------------------------------------------------------------------- /csvhuman/test/data/sample3.csv: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%%% 2 | % sample from HXL tagging conventions (version 1.1) 3 | % see http://hxlstandard.org/standard/1_1final/tagging/ 4 | 5 | P-CODE 1, P-CODE 2, P-CODE 3 6 | #loc+code, #loc+code, #loc+code 7 | 020503 8 | 060107, 060108 9 | 173219 10 | 530012 11 | 530013, 530015, 279333 12 | -------------------------------------------------------------------------------- /csvhuman/test/data/sample4.csv: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%%% 2 | % sample from HXL tagging conventions (version 1.1) 3 | % see http://hxlstandard.org/standard/1_1final/tagging/ 4 | 5 | REGION, 2008, 2009, 2010, 2011 6 | #adm1 +name, #affected+label, #affected+label, #affected+label, #affected+label 7 | Coast District, 0, 30, 100, 250 8 | Mountain District, 15, 75, 30, 45 9 | -------------------------------------------------------------------------------- /csvhuman/test/data/test.csv: -------------------------------------------------------------------------------- 1 | What,,,Who,Where,For whom, 2 | Record,Sector/Cluster,Subsector,Organisation,Country,Males,Females,Subregion 3 | ,#sector+en,#subsector,#org,#country,#sex+#targeted,#sex+#targeted,#adm1 4 | 001,WASH,Subsector 1,Org 1,Country 1,100,100,Region 1 5 | 002,Health,Subsector 2,Org 2,Country 2,,,Region 2 6 | 003,Education,Subsector 3,Org 3,Country 2,250,300,Region 3 7 | 004,WASH,Subsector 4,Org 1,Country 3,80,95,Region 4 8 | -------------------------------------------------------------------------------- /csvhuman/test/helper.rb: -------------------------------------------------------------------------------- 1 | ## $:.unshift(File.dirname(__FILE__)) 2 | 3 | ## minitest setup 4 | 5 | require 'minitest/autorun' 6 | 7 | 8 | ## our own code 9 | ## require 'csvhuman/base' 10 | require 'csvhuman' 11 | 12 | 13 | ## add test_data_dir helper 14 | class CsvHuman 15 | def self.test_data_dir 16 | "#{root}/test/data" 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /csvhuman/test/test_hdx.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_hdx.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestHdxSamples < MiniTest::Test 11 | 12 | 13 | def test_ebola 14 | recs = CsvHuman.read( "#{CsvHuman.test_data_dir}/hdx/ebola_treatment_centres.csv" ) 15 | pp recs 16 | end 17 | 18 | def test_phl_haima 19 | recs = CsvHuman.read( "#{CsvHuman.test_data_dir}/hdx/phl_haima_houses_damaged.csv" ) 20 | pp recs 21 | end 22 | 23 | def test_zika_cases 24 | recs = CsvHuman.read( "#{CsvHuman.test_data_dir}/hdx/zika_cases.csv" ) 25 | pp recs 26 | end 27 | 28 | end # class TestHdxSamples 29 | -------------------------------------------------------------------------------- /csvhuman/test/test_header_converter.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_header_converter.rb 6 | 7 | 8 | 9 | require 'helper' 10 | 11 | class TestHeaderConverter < MiniTest::Test 12 | 13 | def conv_none( value ) 14 | CsvHuman::HEADER_CONVERTERS[:none].call( value ) 15 | end 16 | 17 | def conv_default( value ) 18 | CsvHuman::HEADER_CONVERTERS[:default].call( value ) 19 | end 20 | 21 | def conv_symbol( value ) 22 | CsvHuman::HEADER_CONVERTERS[:symbol].call( value ) 23 | end 24 | 25 | 26 | 27 | def test_none 28 | assert_equal "#sector", conv_none( "#sector" ) 29 | assert_equal "#adm1", conv_none( "#adm1" ) 30 | 31 | assert_equal "#sector +en", conv_none( "#sector +en" ) 32 | assert_equal "#adm1 +code", conv_none( "#adm1 +code" ) 33 | 34 | assert_equal "#affected +children +f", conv_none( "#affected +children +f" ) 35 | assert_equal "#population +affected +children +m", conv_none( "#population +affected +children +m" ) 36 | end 37 | 38 | 39 | def test_default 40 | assert_equal "sector", conv_default( "#sector" ) 41 | assert_equal "adm1", conv_default( "#adm1" ) 42 | 43 | assert_equal "sector+en", conv_default( "#sector +en" ) 44 | assert_equal "adm1+code", conv_default( "#adm1 +code" ) 45 | 46 | assert_equal "affected+children+f", conv_default( "#affected +children +f" ) 47 | assert_equal "population+affected+children+m", conv_default( "#population +affected +children +m" ) 48 | end 49 | 50 | 51 | def test_symbol 52 | assert_equal :sector, conv_symbol( "#sector" ) 53 | assert_equal :adm1, conv_symbol( "#adm1" ) 54 | 55 | assert_equal :sector_en, conv_symbol( "#sector +en" ) 56 | assert_equal :adm1_code, conv_symbol( "#adm1 +code" ) 57 | 58 | assert_equal :affected_children_f, conv_symbol( "#affected +children +f" ) 59 | assert_equal :population_affected_children_m, conv_symbol( "#population +affected +children +m" ) 60 | end 61 | 62 | 63 | end # class TestHeaderConverter 64 | -------------------------------------------------------------------------------- /csvhuman/test/test_misc.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_misc.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestMisc < MiniTest::Test 11 | 12 | 13 | def test_airports 14 | recs = CsvHuman.read( "#{CsvHuman.test_data_dir}/airports.csv" ) 15 | pp recs 16 | end 17 | 18 | def test_unhcr 19 | recs = CsvHuman.read( "#{CsvHuman.test_data_dir}/unhcr.csv" ) 20 | pp recs 21 | end 22 | 23 | def test_ebola 24 | recs = CsvHuman.read( "#{CsvHuman.test_data_dir}/ebola.csv" ) 25 | pp recs 26 | end 27 | 28 | end # class TestMisc 29 | -------------------------------------------------------------------------------- /csvhuman/test/test_samples.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_samples.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestSamples < MiniTest::Test 11 | 12 | 13 | def test_sample1 14 | recs = CsvHuman.read( "#{CsvHuman.test_data_dir}/sample1.csv" ) 15 | ## pp recs 16 | assert_equal [{"loc+name"=>"Camp A", "loc+code"=>"01000001", "affected"=>2000}, 17 | {"loc+name"=>"Camp B", "loc+code"=>"01000002", "affected"=>750}, 18 | {"loc+name"=>"Camp C", "loc+code"=>"01000003", "affected"=>1920}], recs 19 | end 20 | 21 | def test_sample2 22 | recs = CsvHuman.read( "#{CsvHuman.test_data_dir}/sample2.csv" ) 23 | ## pp recs 24 | assert_equal [{"event+id"=>1, 25 | "affected+killed"=>1, 26 | "region"=>"Mediterranean", 27 | "meta+reliability+source"=>"Verified", 28 | "date+reported"=>Date.new( 2015, 11, 5 ), 29 | "geo+lat"=>36.8915, 30 | "geo+lon"=>27.2877}, 31 | {"event+id"=>3, 32 | "affected+killed"=>1, 33 | "region"=>"Central America incl. Mexico", 34 | "meta+reliability+source"=>"Partially Verified", 35 | "date+reported"=>Date.new( 2015, 11, 3 ), 36 | "geo+lat"=>15.9564, 37 | "geo+lon"=>-93.663099}], recs 38 | end 39 | 40 | def test_sample3 41 | recs = CsvHuman.read( "#{CsvHuman.test_data_dir}/sample3.csv" ) 42 | ## pp recs 43 | assert_equal [{"loc+code"=>["020503", nil, nil]}, 44 | {"loc+code"=>["060107", "060108", nil]}, 45 | {"loc+code"=>["173219", nil, nil]}, 46 | {"loc+code"=>["530012", nil, nil]}, 47 | {"loc+code"=>["530013", "530015", "279333"]}], recs 48 | end 49 | 50 | def test_sample4 51 | recs = CsvHuman.read( "#{CsvHuman.test_data_dir}/sample4.csv" ) 52 | ## pp recs 53 | assert_equal [{"adm1+name"=>"Coast District", "affected+label"=>[0, 30, 100, 250]}, 54 | {"adm1+name"=>"Mountain District", "affected+label"=>[15, 75, 30, 45]}], recs 55 | end 56 | 57 | 58 | end # class TestSamples 59 | -------------------------------------------------------------------------------- /csvhuman/test/test_type_converters.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_type_converters.rb 6 | 7 | 8 | 9 | require 'helper' 10 | 11 | class TestTypeConverters < MiniTest::Test 12 | 13 | def conv_to_i( value ) 14 | CsvHuman::TYPE_CONVERTERS[Integer].call( value ) 15 | end 16 | 17 | def conv_to_f( value ) 18 | CsvHuman::TYPE_CONVERTERS[Float].call( value ) 19 | end 20 | 21 | def conv_to_date( value ) 22 | CsvHuman::TYPE_CONVERTERS[Date].call( value ) 23 | end 24 | 25 | 26 | 27 | def test_integer 28 | assert_equal 0, conv_to_i( "0" ) 29 | assert_equal 2011, conv_to_i( "2011" ) 30 | end 31 | 32 | def test_float 33 | assert_equal 0.0, conv_to_f( "0" ) 34 | assert_equal 2011.0, conv_to_f( "2011" ) 35 | end 36 | 37 | def test_date 38 | assert_equal Date.new( 2011, 12, 25 ), conv_to_date( "2011-12-25") 39 | end 40 | 41 | 42 | end # class TestTypeConverters 43 | -------------------------------------------------------------------------------- /csvhuman/test/test_type_mappings.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_type_mappings.rb 6 | 7 | 8 | 9 | require 'helper' 10 | 11 | class TestTypeMappings < MiniTest::Test 12 | 13 | def split( value ) 14 | parts = CsvHuman::Tag.split( value ) 15 | 16 | name = parts[0] 17 | attributes = parts[1..-1] ## todo/fix: check if nil (make it empty array [] always) - why? why not? 18 | 19 | [name, attributes] 20 | end 21 | 22 | 23 | def conv_guess( value ) 24 | CsvHuman.guess_type( *split(value) ) 25 | end 26 | 27 | def conv_default( value ) 28 | CsvHuman::TYPE_MAPPINGS[:default].call( *split(value) ) 29 | end 30 | 31 | def conv_none( value ) 32 | CsvHuman::TYPE_MAPPINGS[:none].call( *split(value) ) 33 | end 34 | 35 | 36 | 37 | def test_none 38 | assert_equal String, conv_none( "#date" ) 39 | assert_equal String, conv_none( "#date +year" ) 40 | assert_equal String, conv_none( "#geo +lat" ) 41 | assert_equal String, conv_none( "#geo +elevation" ) 42 | end 43 | 44 | def test_guess_and_default 45 | assert_equal Date, conv_guess( "#date" ) 46 | assert_equal Integer, conv_guess( "#date +year" ) 47 | assert_equal Float, conv_guess( "#geo +lat" ) 48 | assert_equal Float, conv_guess( "#geo +elevation" ) 49 | 50 | assert_equal Date, conv_default( "#date" ) 51 | assert_equal Integer, conv_default( "#date +year" ) 52 | assert_equal Float, conv_default( "#geo +lat" ) 53 | assert_equal Float, conv_default( "#geo +elevation" ) 54 | end 55 | 56 | 57 | end # class TestTypeMappings 58 | -------------------------------------------------------------------------------- /csvjson/.gitignore: -------------------------------------------------------------------------------- 1 | ####################### 2 | # ignore ruby rake generated folders 3 | 4 | /pkg/ 5 | /doc/ 6 | 7 | 8 | ################ 9 | # ignore (top-level) datapackage folders 10 | 11 | /pack/ 12 | /.pack/ 13 | -------------------------------------------------------------------------------- /csvjson/HISTORY.md: -------------------------------------------------------------------------------- 1 | ### 0.0.1 / 2018-10-14 2 | 3 | * Everything is new. First release 4 | -------------------------------------------------------------------------------- /csvjson/Manifest.txt: -------------------------------------------------------------------------------- 1 | HISTORY.md 2 | LICENSE.md 3 | Manifest.txt 4 | README.md 5 | Rakefile 6 | datasets/hello.json.csv 7 | datasets/hello11.json.csv 8 | lib/csvjson.rb 9 | lib/csvjson/parser.rb 10 | lib/csvjson/version.rb 11 | test/helper.rb 12 | test/test_parser.rb 13 | test/test_parser_misc.rb 14 | -------------------------------------------------------------------------------- /csvjson/Rakefile: -------------------------------------------------------------------------------- 1 | require 'hoe' 2 | require './lib/csvjson/version.rb' 3 | 4 | Hoe.spec 'csvjson' do 5 | 6 | self.version = CsvJson::VERSION 7 | 8 | self.summary = "csvjson - read tabular data in the CSV <3 JSON format, that is, comma-separated values CSV (line-by-line) records with javascript object notation (JSON) encoding rules" 9 | self.description = summary 10 | 11 | self.urls = ['https://github.com/csvreader/csvjson'] 12 | 13 | self.author = 'Gerald Bauer' 14 | self.email = 'wwwmake@googlegroups.com' 15 | 16 | # switch extension to .markdown for gihub formatting 17 | self.readme_file = 'README.md' 18 | self.history_file = 'HISTORY.md' 19 | 20 | self.extra_deps = [ 21 | ] 22 | 23 | self.licenses = ['Public Domain'] 24 | 25 | self.spec_extras = { 26 | required_ruby_version: '>= 2.2.2' 27 | } 28 | 29 | end 30 | -------------------------------------------------------------------------------- /csvjson/datasets/hello.json.csv: -------------------------------------------------------------------------------- 1 | 1,"John","12 Totem Rd. Aspen",true 2 | 2,"Bob",null,false 3 | 3,"Sue","Bigsby, 345 Carnival, WA 23009",false 4 | -------------------------------------------------------------------------------- /csvjson/datasets/hello11.json.csv: -------------------------------------------------------------------------------- 1 | # hello world 2 | 3 | 1, "John", "12 Totem Rd. Aspen", true 4 | 2, "Bob", null, false 5 | 3, "Sue", "Bigsby, 345 Carnival, WA 23009", false 6 | -------------------------------------------------------------------------------- /csvjson/lib/csvjson.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | require 'pp' 4 | require 'json' 5 | require 'logger' 6 | 7 | 8 | ## our own code 9 | ## todo/check: use require_relative - why? why not? 10 | require 'csvjson/version' # note: let version always go first 11 | require 'csvjson/parser' 12 | 13 | 14 | ## add some "alternative" shortcut aliases 15 | CSV_JSON = CsvJson 16 | CSVJSON = CsvJson 17 | CSVJ = CsvJson 18 | CsvJ = CsvJson 19 | 20 | 21 | # say hello 22 | puts CsvJson.banner if $DEBUG || (defined?($RUBYCOCO_DEBUG) && $RUBYCOCO_DEBUG) 23 | -------------------------------------------------------------------------------- /csvjson/lib/csvjson/version.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ## note: for now CsvJson is a class!! (not a module) 4 | 5 | class CsvJson 6 | 7 | MAJOR = 1 8 | MINOR = 0 9 | PATCH = 1 10 | VERSION = [MAJOR,MINOR,PATCH].join('.') 11 | 12 | def self.version 13 | VERSION 14 | end 15 | 16 | def self.banner 17 | "csvjson/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]" 18 | end 19 | 20 | def self.root 21 | "#{File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) )}" 22 | end 23 | 24 | end # class CsvJson 25 | -------------------------------------------------------------------------------- /csvjson/test/helper.rb: -------------------------------------------------------------------------------- 1 | ## $:.unshift(File.dirname(__FILE__)) 2 | 3 | ## minitest setup 4 | 5 | require 'minitest/autorun' 6 | 7 | 8 | ## our own code 9 | require 'csvjson' 10 | 11 | 12 | ## add test_data_dir helper 13 | class CsvJson 14 | def self.test_data_dir 15 | "#{root}/datasets" 16 | end 17 | end 18 | 19 | 20 | CsvJson.logger.level = :debug ## turn on "global" logging 21 | -------------------------------------------------------------------------------- /csvjson/test/test_parser.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_parser.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestParser < MiniTest::Test 11 | 12 | 13 | def parser 14 | CsvJson 15 | end 16 | 17 | def records ## "standard" records for testing 18 | [[1, "John", "12 Totem Rd. Aspen", true], 19 | [2, "Bob", nil, false], 20 | [3, "Sue", "Bigsby, 345 Carnival, WA 23009", false]] 21 | end 22 | 23 | 24 | 25 | def test_parse 26 | assert_equal records, parser.parse( <<TXT ) 27 | 1,"John","12 Totem Rd. Aspen",true 28 | 2,"Bob",null,false 29 | 3,"Sue","Bigsby, 345 Carnival, WA 23009",false 30 | TXT 31 | 32 | assert_equal records, parser.parse( <<TXT ) 33 | # hello world 34 | 35 | 1,"John","12 Totem Rd. Aspen",true 36 | 2,"Bob",null,false 37 | 3,"Sue","Bigsby, 345 Carnival, WA 23009",false 38 | TXT 39 | 40 | assert_equal records, parser.parse( <<TXT ) 41 | # hello world (pretty printed) 42 | 43 | 1, "John", "12 Totem Rd. Aspen", true 44 | 2, "Bob", null, false 45 | 3, "Sue", "Bigsby, 345 Carnival, WA 23009", false 46 | 47 | # try more comments and empty lines 48 | 49 | TXT 50 | 51 | 52 | txt =<<TXT 53 | # hello world 54 | 55 | 1,"John","12 Totem Rd. Aspen",true 56 | 2,"Bob",null,false 57 | 3,"Sue","Bigsby, 345 Carnival, WA 23009",false 58 | TXT 59 | 60 | recs = [] 61 | parser.parse( txt ) { |rec| recs << rec } 62 | assert_equal records, recs 63 | end 64 | 65 | 66 | def test_read 67 | assert_equal records, parser.read( "#{CsvJson.test_data_dir}/hello.json.csv" ) 68 | assert_equal records, parser.read( "#{CsvJson.test_data_dir}/hello11.json.csv" ) 69 | end 70 | 71 | 72 | def test_open 73 | assert_equal records, parser.open( "#{CsvJson.test_data_dir}/hello.json.csv", "r:bom|utf-8" ).read 74 | assert_equal records, parser.open( "#{CsvJson.test_data_dir}/hello11.json.csv", "r:bom|utf-8" ).read 75 | end 76 | 77 | 78 | def test_foreach 79 | recs = [] 80 | parser.foreach( "#{CsvJson.test_data_dir}/hello.json.csv" ) { |rec| recs << rec } 81 | assert_equal records, recs 82 | 83 | recs = [] 84 | parser.foreach( "#{CsvJson.test_data_dir}/hello11.json.csv" ) { |rec| recs << rec } 85 | assert_equal records, recs 86 | end 87 | 88 | 89 | def test_enum 90 | csv = CsvJson.new( <<TXT ) 91 | # hello world 92 | 93 | 1,"John","12 Totem Rd. Aspen",true 94 | 2,"Bob",null,false 95 | 3,"Sue","Bigsby, 345 Carnival, WA 23009",false 96 | TXT 97 | 98 | it = csv.to_enum 99 | assert_equal [1, "John", "12 Totem Rd. Aspen", true], it.next 100 | assert_equal [2, "Bob", nil, false], it.next 101 | assert_equal [3, "Sue", "Bigsby, 345 Carnival, WA 23009", false], it.next 102 | end 103 | 104 | end # class TestParser 105 | -------------------------------------------------------------------------------- /csvjson/test/test_parser_misc.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_parser_misc.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestParserMisc < MiniTest::Test 11 | 12 | 13 | def parser 14 | CsvJson 15 | end 16 | 17 | 18 | def test_quotes_and_commas 19 | assert_equal [ 20 | [1, "John", "12 Totem Rd., Aspen", true], 21 | [2, "Bob", nil, false], 22 | [3, "Sue", "\"Bigsby\", 345 Carnival, WA 23009", false] 23 | ], parser.parse( <<TXT ) 24 | 1,"John","12 Totem Rd., Aspen",true 25 | 2,"Bob",null,false 26 | 3,"Sue","\\"Bigsby\\", 345 Carnival, WA 23009",false 27 | TXT 28 | end 29 | 30 | 31 | def test_arrays 32 | assert_equal [ 33 | [1, "directions", ["north","south","east","west"]], 34 | [2, "colors", ["red","green","blue"]], 35 | [3, "drinks", ["soda","water","tea","coffe"]], 36 | [4, "spells", []], 37 | ], parser.parse( <<TXT ) 38 | # CSV <3 JSON with array values 39 | 40 | 1,"directions",["north","south","east","west"] 41 | 2,"colors",["red","green","blue"] 42 | 3,"drinks",["soda","water","tea","coffe"] 43 | 4,"spells",[] 44 | TXT 45 | end 46 | 47 | def test_misc 48 | ## note: 49 | ## in the csv <3 json source text backslash needs to get doubled / escaped twice e.g. 50 | ## \\" for quotes 51 | ## \\n for newlines and so on 52 | 53 | assert_equal [ 54 | ["index", "value1", "value2"], 55 | ["number", 1, 2], 56 | ["boolean", false, true], 57 | ["null", nil, "non null"], 58 | ["array of numbers", [1], [1,2]], 59 | ["simple object", {"a" => 1}, {"a" => 1, "b" => 2}], 60 | ["array with mixed objects", [1, nil,"ball"], [2,{"a" => 10, "b" => 20},"cube"]], 61 | ["string with quotes", "a\"b", "alert(\"Hi!\")"], 62 | ["string with bell&newlines","bell is \u0007","multi\nline\ntext"] 63 | ], parser.parse( <<TXT ) 64 | # CSV with all kinds of values 65 | 66 | "index","value1","value2" 67 | "number",1,2 68 | "boolean",false,true 69 | "null",null,"non null" 70 | "array of numbers",[1],[1,2] 71 | "simple object",{"a": 1},{"a":1, "b":2} 72 | "array with mixed objects",[1,null,"ball"],[2,{"a": 10, "b": 20},"cube"] 73 | "string with quotes","a\\"b","alert(\\"Hi!\\")" 74 | "string with bell&newlines","bell is \\u0007","multi\\nline\\ntext" 75 | TXT 76 | 77 | end 78 | 79 | 80 | end # class TestParserMisc 81 | -------------------------------------------------------------------------------- /csvpack/.gitignore: -------------------------------------------------------------------------------- 1 | ####################### 2 | # ignore ruby rake generated folders 3 | 4 | /pkg/ 5 | /doc/ 6 | 7 | 8 | ################ 9 | # ignore (top-level) datapackage folders 10 | 11 | /pack/ 12 | /.pack/ 13 | -------------------------------------------------------------------------------- /csvpack/HISTORY.md: -------------------------------------------------------------------------------- 1 | ### 0.0.1 / 2015-04-23 2 | 3 | * Everything is new. First release 4 | 5 | -------------------------------------------------------------------------------- /csvpack/Manifest.txt: -------------------------------------------------------------------------------- 1 | HISTORY.md 2 | Manifest.txt 3 | README.md 4 | Rakefile 5 | lib/csvpack.rb 6 | lib/csvpack/downloader.rb 7 | lib/csvpack/pack.rb 8 | lib/csvpack/version.rb 9 | test/helper.rb 10 | test/test_companies.rb 11 | test/test_countries.rb 12 | test/test_downloader.rb 13 | test/test_import.rb 14 | -------------------------------------------------------------------------------- /csvpack/NOTES.md: -------------------------------------------------------------------------------- 1 | # Notes 2 | 3 | 4 | ## Todos 5 | 6 | - [ ] add CsvPack.config !!! - e.g config.cache_dir "global" setting and others 7 | - [ ] add loading from directory 8 | - [ ] add loading from zip 9 | - [ ] add csvpack command line tool / bin 10 | - [ ] add schema class for json_schema - why? why not? 11 | - [ ] add table lookup by name e.g. table['constituent'] 12 | -------------------------------------------------------------------------------- /csvpack/Rakefile: -------------------------------------------------------------------------------- 1 | require 'hoe' 2 | require './lib/csvpack/version.rb' 3 | 4 | Hoe.spec 'csvpack' do 5 | 6 | self.version = CsvPack::VERSION 7 | 8 | self.summary = "csvpack - tools 'n' scripts for working with tabular data packages using comma-separated values (CSV) datafiles in text with meta info (that is, schema, datatypes, ..) in datapackage.json; download, read into and query CSV datafiles with your SQL database (e.g. SQLite, PostgreSQL, ...) of choice and much more" 9 | self.description = summary 10 | 11 | self.urls = ['https://github.com/csv11/csvpack'] 12 | 13 | self.author = 'Gerald Bauer' 14 | self.email = 'wwwmake@googlegroups.com' 15 | 16 | # switch extension to .markdown for gihub formatting 17 | self.readme_file = 'README.md' 18 | self.history_file = 'HISTORY.md' 19 | 20 | self.extra_deps = [ 21 | ['logutils', '>=0.6.1'], 22 | ['fetcher', '>=0.4.5'], 23 | ['activerecord', '>=5.0.0'], 24 | ] 25 | 26 | self.licenses = ['Public Domain'] 27 | 28 | self.spec_extras = { 29 | required_ruby_version: '>= 2.2.2' 30 | } 31 | 32 | end 33 | -------------------------------------------------------------------------------- /csvpack/getting-started-samples/start.rb: -------------------------------------------------------------------------------- 1 | ### 2 | # ruby script (data work flow) getting started sample from the csvpack readme 3 | # see https://github.com/csvreader/csvpack 4 | # 5 | 6 | require 'csvpack' 7 | 8 | 9 | CsvPack.import( 10 | 's-and-p-500-companies', 11 | 'gdb' 12 | ) 13 | 14 | 15 | 16 | 17 | 18 | pack = CsvPack::Pack.new( './pack/s-and-p-500-companies' ) 19 | 20 | Constituent = pack.table.ar_clazz 21 | 22 | 23 | pp Constituent.count 24 | # SELECT COUNT(*) FROM "constituents" 25 | # => 496 26 | 27 | 28 | pp Constituent.first 29 | # SELECT "constituents".* FROM "constituents" ORDER BY "constituents"."id" ASC LIMIT 1 30 | # => #<Constituent:0x9f8cb78 31 | # id: 1, 32 | # symbol: "MMM", 33 | # name: "3M Company", 34 | # sector: "Industrials"> 35 | 36 | 37 | pp Constituent.find_by!( symbol: 'MMM' ) 38 | # SELECT "constituents".* 39 | # FROM "constituents" 40 | # WHERE "constituents"."symbol" = "MMM" 41 | # LIMIT 1 42 | # => #<Constituent:0x9f8cb78 43 | # id: 1, 44 | # symbol: "MMM", 45 | # name: "3M Company", 46 | # sector: "Industrials"> 47 | 48 | 49 | pp Constituent.find_by!( name: '3M Company' ) 50 | # SELECT "constituents".* 51 | # FROM "constituents" 52 | # WHERE "constituents"."name" = "3M Company" 53 | # LIMIT 1 54 | # => #<Constituent:0x9f8cb78 55 | # id: 1, 56 | # symbol: "MMM", 57 | # name: "3M Company", 58 | # sector: "Industrials"> 59 | 60 | 61 | pp Constituent.where( sector: 'Industrials' ).count 62 | # SELECT COUNT(*) FROM "constituents" 63 | # WHERE "constituents"."sector" = "Industrials" 64 | # => 63 65 | 66 | 67 | pp Constituent.where( sector: 'Industrials' ).all 68 | # SELECT "constituents".* 69 | # FROM "constituents" 70 | # WHERE "constituents"."sector" = "Industrials" 71 | # => [#<Constituent:0x9f8cb78 72 | # id: 1, 73 | # symbol: "MMM", 74 | # name: "3M Company", 75 | # sector: "Industrials">, 76 | # #<Constituent:0xa2a4180 77 | # id: 8, 78 | # symbol: "ADT", 79 | # name: "ADT Corp (The)", 80 | # sector: "Industrials">,...] 81 | 82 | 83 | 84 | ##### 85 | # From F.A.Q. 86 | 87 | 88 | dl = CsvPack::Downloader.new 89 | dl.fetch( 'language-codes' ) 90 | dl.fetch( 's-and-p-500-companies' ) 91 | dl.fetch( 'un-locode') 92 | 93 | 94 | 95 | ####### 96 | # New db connection - store to ./mine.db 97 | 98 | ActiveRecord::Base.establish_connection( adapter: 'sqlite3', 99 | database: './mine.db' ) 100 | 101 | ## import 1) "auto"-magic 102 | CsvPack.import( 103 | 's-and-p-500-companies' 104 | ) 105 | 106 | ## import 2) "by hand" 107 | pack = CsvPack::Pack.new( './pack/gdb' ) 108 | pack.tables.each do |table| 109 | table.up! # (auto-) add table using SQL create_table via ActiveRecord migration 110 | table.import! # import all records using SQL inserts 111 | end 112 | -------------------------------------------------------------------------------- /csvpack/lib/csvpack.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | require 'pp' 5 | require 'forwardable' 6 | 7 | ### csv 8 | require 'csv' 9 | require 'json' 10 | require 'fileutils' 11 | 12 | 13 | ### downloader 14 | require 'fetcher' 15 | 16 | ### activerecord w/ sqlite3 17 | ## require 'active_support/all' ## needed for String#binary? method 18 | require 'active_record' 19 | 20 | 21 | 22 | # our own code 23 | 24 | require 'csvpack/version' ## let version always go first 25 | require 'csvpack/pack' 26 | require 'csvpack/downloader' 27 | 28 | module CsvPack 29 | 30 | def self.import( *args ) 31 | ## step 1: download 32 | dl = Downloader.new 33 | args.each do |arg| 34 | dl.fetch( arg ) 35 | end 36 | 37 | ## step 2: up 'n' import 38 | args.each do |arg| 39 | pack = Pack.new( "./pack/#{arg}/datapackage.json" ) 40 | pack.tables.each do |table| 41 | table.up! 42 | table.import! 43 | end 44 | end 45 | end 46 | 47 | end # module CsvPack 48 | 49 | 50 | 51 | # say hello 52 | puts CsvPack.banner if defined?($RUBYLIBS_DEBUG) && $RUBYLIBS_DEBUG 53 | -------------------------------------------------------------------------------- /csvpack/lib/csvpack/downloader.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | module CsvPack 4 | 5 | class Downloader 6 | 7 | def initialize( cache_dir='./pack' ) 8 | @cache_dir = cache_dir # todo: check if folder exists now (or on demand)? 9 | @worker = Fetcher::Worker.new 10 | end 11 | 12 | SHORTCUTS = { 13 | ## to be done 14 | } 15 | 16 | def fetch( name_or_shortcut_or_url ) ## todo/check: use (re)name to get/update/etc. why? why not?? 17 | 18 | name = name_or_shortcut_or_url 19 | 20 | ## 21 | ## e.g. try 22 | ## country-list 23 | ## 24 | 25 | ## url_base = "http://data.okfn.org/data/core/#{name}" 26 | ## url_base = "https://datahub.io/core/#{name}" 27 | 28 | ## or use "https://github.com/datasets/#{name}/raw/master" 29 | url_base = "https://raw.githubusercontent.com/datasets/#{name}/master" 30 | 31 | 32 | url = "#{url_base}/datapackage.json" 33 | 34 | dest_dir = "#{@cache_dir}/#{name}" 35 | FileUtils.mkdir_p( dest_dir ) 36 | 37 | pack_path = "#{dest_dir}/datapackage.json" ## todo/fix: rename to meta_path - why? why not? 38 | @worker.copy( url, pack_path ) 39 | 40 | h = Meta.load_file( pack_path ) 41 | pp h 42 | 43 | ## copy resources (tables) 44 | h.resources.each do |r| 45 | puts "== resource:" 46 | pp r 47 | 48 | res_name = r['name'] 49 | res_relative_path = r['path'] ## fix/todo: might no contain the url - is now res_url_or_relative_path !!!!! 50 | if res_relative_path.nil? 51 | res_relative_path = "#{res_name}.csv" 52 | end 53 | 54 | res_url = r['url'] ## check - old package format - url NO longer used!!!! 55 | if res_url.nil? 56 | ## build url 57 | res_url = "#{url_base}/#{res_relative_path}" 58 | end 59 | 60 | ## todo/fix: rename - use just res_path - why? why not? 61 | local_res_path = "#{dest_dir}/#{res_relative_path}" 62 | puts "[debug] local_res_path: >#{local_res_path}<" 63 | local_res_dir = File.dirname( local_res_path ) 64 | FileUtils.mkdir_p( local_res_dir ) 65 | 66 | @worker.copy( res_url, local_res_path ) 67 | end 68 | end 69 | 70 | end # class Downloader 71 | 72 | end # module CsvPack 73 | -------------------------------------------------------------------------------- /csvpack/lib/csvpack/version.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | module CsvPack 4 | 5 | MAJOR = 0 ## todo: namespace inside version or something - why? why not?? 6 | MINOR = 2 7 | PATCH = 0 8 | VERSION = [MAJOR,MINOR,PATCH].join('.') 9 | 10 | def self.version 11 | VERSION 12 | end 13 | 14 | def self.banner 15 | "csvpack/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]" 16 | end 17 | 18 | def self.root 19 | File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) ) 20 | end 21 | 22 | end # module CsvPack 23 | -------------------------------------------------------------------------------- /csvpack/test/helper.rb: -------------------------------------------------------------------------------- 1 | 2 | ## minitest setup 3 | require 'minitest/autorun' 4 | 5 | 6 | ## our own code 7 | require 'csvpack' 8 | -------------------------------------------------------------------------------- /csvpack/test/pack/beer/data.csv: -------------------------------------------------------------------------------- 1 | Brewery,City,Name,Abv 2 | Andechser Klosterbrauerei,Andechs,Doppelbock Dunkel,7% 3 | Augustiner Bräu München,München,Edelstoff,5.6% 4 | Bayerische Staatsbrauerei Weihenstephan,Freising,Hefe Weissbier,5.4% 5 | Brauerei Spezial,Bamberg,Rauchbier Märzen,5.1% 6 | Hacker-Pschorr Bräu,München,Münchner Dunkel,5.0% 7 | Staatliches Hofbräuhaus München,München,Hofbräu Oktoberfestbier,6.3% 8 | -------------------------------------------------------------------------------- /csvpack/test/pack/beer/datapackage.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "beer", 3 | "resources": [ 4 | { 5 | "path": "data.csv", 6 | "schema": { 7 | "fields": [{ "name": "Brewery", "type": "string" }, 8 | { "name": "City", "type": "string" }, 9 | { "name": "Name", "type": "string" }, 10 | { "name": "Abv", "type": "number" }] 11 | } 12 | } 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /csvpack/test/test_companies.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_companies.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestCompanies < MiniTest::Test 11 | 12 | def test_s_and_p_500_companies 13 | 14 | pack = CsvPack::Pack.new( './pack/s-and-p-500-companies/datapackage.json' ) 15 | 16 | meta = pack.meta 17 | puts "name: #{meta.name}" 18 | puts "title: #{meta.title}" 19 | puts "license: #{meta.license}" 20 | 21 | pp pack.tables 22 | pp pack.table[0]['Symbol'] 23 | pp pack.table[495]['Symbol'] 24 | 25 | ## pak.table.each do |row| 26 | ## pp row 27 | ## end 28 | 29 | puts pack.tables[0].dump_schema 30 | 31 | # database setup 'n' config 32 | ActiveRecord::Base.establish_connection( adapter: 'sqlite3', 33 | database: ':memory:' ) 34 | ActiveRecord::Base.logger = Logger.new( STDOUT ) 35 | 36 | pack.table.up! 37 | pack.table.import! 38 | 39 | ## pack.tables[0].up! 40 | ## pack.tables[0].import! 41 | 42 | 43 | pp pack.table.ar_clazz 44 | 45 | 46 | company = pack.table.ar_clazz 47 | 48 | puts "Company:" 49 | pp company.count 50 | pp company.first 51 | pp company.find_by!( symbol: 'MMM' ) 52 | pp company.find_by!( name: '3M Company' ) 53 | pp company.where( sector: 'Industrials' ).count 54 | pp company.where( sector: 'Industrials' ).all 55 | 56 | 57 | ### todo: try a join w/ belongs_to ?? 58 | 59 | assert true # if we get here - test success 60 | end 61 | 62 | end # class TestCompanies 63 | -------------------------------------------------------------------------------- /csvpack/test/test_countries.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_countries.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestCountries < MiniTest::Test 11 | 12 | def test_country_list 13 | pack = CsvPack::Pack.new( './pack/country-list/datapackage.json' ) 14 | 15 | meta = pack.meta 16 | puts "name: #{meta.name}" 17 | puts "title: #{meta.title}" 18 | puts "license: #{meta.license}" 19 | 20 | pp pack.tables 21 | 22 | ## pak.table.each do |row| 23 | ## pp row 24 | ## end 25 | 26 | puts pack.table.dump_schema 27 | 28 | # database setup 'n' config 29 | ActiveRecord::Base.establish_connection( adapter: 'sqlite3', 30 | database: ':memory:' ) 31 | ActiveRecord::Base.logger = Logger.new( STDOUT ) 32 | 33 | pack.table.up! 34 | pack.table.import! 35 | 36 | pp pack.table.ar_clazz 37 | 38 | assert true # if we get here - test success 39 | end 40 | 41 | end # class TestCountries 42 | -------------------------------------------------------------------------------- /csvpack/test/test_downloader.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_downloader.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestDownloader < MiniTest::Test 11 | 12 | def test_download 13 | 14 | names = [ 15 | 'country-list', 16 | 'country-codes', 17 | 'language-codes', 18 | 'cpi', ## Annual Consumer Price Index (CPI) 19 | 'gdp', ## Country, Regional and World GDP (Gross Domestic Product) 20 | 's-and-p-500-companies', ## S&P 500 Companies with Financial Information 21 | 'un-locode', ## UN-LOCODE Codelist - note: incl. country-codes.csv 22 | ] 23 | 24 | dl = CsvPack::Downloader.new 25 | names.each do |name| 26 | dl.fetch( name ) 27 | end 28 | 29 | assert true # if we get here - test success 30 | end 31 | 32 | end # class TestDownloader 33 | -------------------------------------------------------------------------------- /csvpack/test/test_import.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_import.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestImport < MiniTest::Test 11 | 12 | def test_import 13 | 14 | CsvPack.import( 15 | 'cpi', ## Annual Consumer Price Index (CPI) 16 | 'gdp', ## Country, Regional and World GDP (Gross Domestic Product) 17 | ) 18 | 19 | assert true # if we get here - test success 20 | end 21 | 22 | end # class TestImport 23 | -------------------------------------------------------------------------------- /csvreader/.gitignore: -------------------------------------------------------------------------------- 1 | ####################### 2 | # ignore ruby rake generated folders 3 | 4 | /pkg/ 5 | /doc/ 6 | 7 | 8 | ################ 9 | # ignore (top-level) datapackage folders 10 | 11 | /pack/ 12 | /.pack/ 13 | -------------------------------------------------------------------------------- /csvreader/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ### 0.0.1 / 2018-08-15 2 | 3 | * Everything is new. First release. 4 | -------------------------------------------------------------------------------- /csvreader/ERRORS.md: -------------------------------------------------------------------------------- 1 | # Errors / Error Recovery / Auto-Fixes 2 | 3 | 4 | 5 | ### Quoted Value with Trailing Data (Auto-Fixed) 6 | 7 | ``` 8 | Farrokh,"Freddy" Mercury,Bulsara 9 | ``` 10 | 11 | See `"Freddy" Mercury` for example. 12 | 13 | How to handle? 14 | 15 | Add new rule! 16 | If quoted value is followed by more data auto-add all the data 17 | until hitting the separator (that is, comma) 18 | and turn the quotes into "literal" quotes as part of the value. 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /csvreader/Manifest.txt: -------------------------------------------------------------------------------- 1 | CHANGELOG.md 2 | Manifest.txt 3 | README.md 4 | Rakefile 5 | datasets/beer.csv 6 | datasets/beer11.csv 7 | datasets/cars11.csv 8 | datasets/cities11.csv 9 | datasets/customers11.csv 10 | datasets/iris.attrib.csv 11 | datasets/iris11.csv 12 | datasets/lcc.attrib.csv 13 | datasets/shakespeare.csv 14 | datasets/test.csv 15 | lib/csvreader.rb 16 | lib/csvreader/base.rb 17 | lib/csvreader/buffer.rb 18 | lib/csvreader/builder.rb 19 | lib/csvreader/converter.rb 20 | lib/csvreader/parser.rb 21 | lib/csvreader/parser_fixed.rb 22 | lib/csvreader/parser_json.rb 23 | lib/csvreader/parser_std.rb 24 | lib/csvreader/parser_strict.rb 25 | lib/csvreader/parser_tab.rb 26 | lib/csvreader/parser_table.rb 27 | lib/csvreader/parser_yaml.rb 28 | lib/csvreader/reader.rb 29 | lib/csvreader/reader_hash.rb 30 | lib/csvreader/version.rb 31 | test/helper.rb 32 | test/test_buffer.rb 33 | test/test_converter.rb 34 | test/test_parser.rb 35 | test/test_parser_autofix.rb 36 | test/test_parser_directive.rb 37 | test/test_parser_fixed.rb 38 | test/test_parser_formats.rb 39 | test/test_parser_java.rb 40 | test/test_parser_meta.rb 41 | test/test_parser_null.rb 42 | test/test_parser_numeric.rb 43 | test/test_parser_quotes.rb 44 | test/test_parser_strict.rb 45 | test/test_parser_tab.rb 46 | test/test_parser_table.rb 47 | test/test_reader.rb 48 | test/test_reader_converters.rb 49 | test/test_reader_hash.rb 50 | test/test_reader_hash_converters.rb 51 | test/test_samples.rb 52 | -------------------------------------------------------------------------------- /csvreader/Rakefile: -------------------------------------------------------------------------------- 1 | require 'hoe' 2 | require './lib/csvreader/version.rb' 3 | 4 | 5 | Hoe.spec 'csvreader' do 6 | 7 | self.version = CsvReader::VERSION 8 | 9 | self.summary = "csvreader - read tabular data in the comma-separated values (csv) format the right way (uses best practices out-of-the-box with zero-configuration)" 10 | self.description = summary 11 | 12 | self.urls = { home: 'https://github.com/csvreader/csvreader' } 13 | 14 | self.author = 'Gerald Bauer' 15 | self.email = 'wwwmake@googlegroups.com' 16 | 17 | # switch extension to .markdown for gihub formatting 18 | self.readme_file = 'README.md' 19 | self.history_file = 'CHANGELOG.md' 20 | 21 | self.extra_deps = [ 22 | ['tabreader', '>=1.0.1'], 23 | ['csvyaml', '>=0.1.0'], 24 | ['csvjson', '>=1.0.0'] 25 | ] 26 | 27 | self.licenses = ['Public Domain'] 28 | 29 | self.spec_extras = { 30 | required_ruby_version: '>= 2.2.2' 31 | } 32 | 33 | end 34 | -------------------------------------------------------------------------------- /csvreader/datasets/beer.csv: -------------------------------------------------------------------------------- 1 | Brewery,City,Name,Abv 2 | Andechser Klosterbrauerei,Andechs,Doppelbock Dunkel,7% 3 | Augustiner Bräu München,München,Edelstoff,5.6% 4 | Bayerische Staatsbrauerei Weihenstephan,Freising,Hefe Weissbier,5.4% 5 | Brauerei Spezial,Bamberg,Rauchbier Märzen,5.1% 6 | Hacker-Pschorr Bräu,München,Münchner Dunkel,5.0% 7 | Staatliches Hofbräuhaus München,München,Hofbräu Oktoberfestbier,6.3% 8 | -------------------------------------------------------------------------------- /csvreader/datasets/beer11.csv: -------------------------------------------------------------------------------- 1 | ####### 2 | # try with some comments 3 | # and blank lines even before header 4 | 5 | Brewery,City,Name,Abv 6 | Andechser Klosterbrauerei,Andechs,Doppelbock Dunkel,7% 7 | Augustiner Bräu München,München,Edelstoff,5.6% 8 | Bayerische Staatsbrauerei Weihenstephan,Freising,Hefe Weissbier,5.4% 9 | 10 | Brauerei Spezial, Bamberg, Rauchbier Märzen, 5.1% 11 | 12 | Hacker-Pschorr Bräu, München, Münchner Dunkel, 5.0% 13 | 14 | ## some more comments here 15 | 16 | Staatliches Hofbräuhaus München,München,Hofbräu Oktoberfestbier,6.3% 17 | 18 | ## check for nil 19 | "", ,,"", 20 | 21 | ## check for blank line with spaces 22 | ## yes, will get added as a record!! e.g. ["", nil, nil, nil] 23 | ## use regex to skip blank lines with spaces!!!! 24 | 25 | 26 | ## test double quotes and double quotes escaped 27 | ## note: double quotes do NOT work with leading AND/OR trailing spaces 28 | ## leads to: 29 | ## CSV::MalformedCSVError - Missing or stray quote in line xxx 30 | ## 31 | ## note: for now double quote does not accept leading AND/OR trailing spaces!!!! 32 | ## 33 | ## todo/fix: check liberal_quote option starting in csv ruby 2.4 ??? 34 | ## 35 | ## examples: 36 | ## "value with comma, comma","some ""hello""","some ""hello""", 37 | ## works - but does NOT work (note the leading and trailing spaces for double quotes): 38 | ## "value with comma, comma" ,"some ""hello""", "some ""hello""", 39 | ## 40 | ## check for "multi-line": 41 | ## "hello 42 | ## and another line 43 | ## and another",two,three, 44 | 45 | 46 | "value with comma, comma","some ""hello""","some ""hello""", 47 | 48 | ## check for "multi-line" 49 | "hello 50 | and another line 51 | and another",two,three, 52 | -------------------------------------------------------------------------------- /csvreader/datasets/cars11.csv: -------------------------------------------------------------------------------- 1 | ##### 2 | # csv sample from the wikipedia article "Comma-separated values" 3 | # see en.wikipedia.org/wiki/Comma-separated_values 4 | 5 | Year,Make,Model,Description,Price 6 | 1997, Ford, E350,"ac, abs, moon",3000.00 7 | 1999, Chevy, "Venture ""Extended Edition""","",4900.00 8 | 1999, Chevy, "Venture ""Extended Edition, Very Large""",,5000.00 9 | 1996, Jeep, Grand Cherokee,"MUST SELL! 10 | air, moon roof, loaded",4799.00 11 | -------------------------------------------------------------------------------- /csvreader/datasets/cities11.csv: -------------------------------------------------------------------------------- 1 | ##### 2 | # csv sample from the wikipedia article "Comma-separated values" 3 | # see en.wikipedia.org/wiki/Comma-separated_values 4 | # 5 | # note: 6 | # Double quote processing need only apply if the field starts 7 | # with a double quote. Note, however, that double quotes are not 8 | # allowed in unquoted fields according to RFC 4180 9 | 10 | Los Angeles, 34°03'N, 118°15'W 11 | New York City, 40°42'46"N, 74°00'21"W 12 | Paris, 48°51'24"N, 2°21'03"E 13 | -------------------------------------------------------------------------------- /csvreader/datasets/customers11.csv: -------------------------------------------------------------------------------- 1 | ##### 2 | # csv sample from the article: 3 | # A Guide to the Ruby CSV Library, Part I 4 | # - sitepoint.com/guide-ruby-csv-library-part 5 | 6 | Name,Times arrived,Total $ spent,Food feedback 7 | Dan, 34, 2548, Lovin it! 8 | Maria, 55, 5054, "Good, delicious food" 9 | Carlos, 22, 4352, "I am ""pleased"", but could be better" 10 | Stephany, 34, 6542, I want bigger steaks!!!!! 11 | James, 1, 43, Not bad 12 | Robin, 1, 56, Fish is tasty 13 | Anna, 1, 79, "Good, better, the best!" 14 | -------------------------------------------------------------------------------- /csvreader/datasets/iris.attrib.csv: -------------------------------------------------------------------------------- 1 | % 1. Title: Iris Plants Database 2 | % 3 | % 2. Sources: 4 | % (a) Creator: R.A. Fisher 5 | 6 | 7 | @RELATION iris 8 | 9 | @ATTRIBUTE sepallength NUMERIC 10 | @ATTRIBUTE sepalwidth NUMERIC 11 | @ATTRIBUTE petallength NUMERIC 12 | @ATTRIBUTE petalwidth NUMERIC 13 | @ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica} 14 | 15 | @DATA 16 | 5.1,3.5,1.4,0.2,Iris-setosa 17 | 4.9,3.0,1.4,0.2,Iris-setosa 18 | 4.7,3.2,1.3,0.2,Iris-setosa 19 | 4.6,3.1,1.5,0.2,Iris-setosa 20 | 5.0,3.6,1.4,0.2,Iris-setosa 21 | 5.4,3.9,1.7,0.4,Iris-setosa 22 | 4.6,3.4,1.4,0.3,Iris-setosa 23 | 5.0,3.4,1.5,0.2,Iris-setosa 24 | 4.4,2.9,1.4,0.2,Iris-setosa 25 | 4.9,3.1,1.5,0.1,Iris-setosa 26 | -------------------------------------------------------------------------------- /csvreader/datasets/lcc.attrib.csv: -------------------------------------------------------------------------------- 1 | % Attribute-Relation File Format (ARFF) Example 2 | % see https://www.cs.waikato.ac.nz/ml/weka/arff.html 3 | 4 | @relation LCCvsLCSH 5 | 6 | @attribute LCC string 7 | @attribute LCSH string 8 | 9 | @data 10 | AG5, 'Encyclopedias and dictionaries.;Twentieth century.' 11 | AS262, 'Science -- Soviet Union -- History.' 12 | AE5, 'Encyclopedias and dictionaries.' 13 | AS281, 'Astronomy, Assyro-Babylonian.;Moon -- Phases.' 14 | AS281, 'Astronomy, Assyro-Babylonian.;Moon -- Tables.' 15 | -------------------------------------------------------------------------------- /csvreader/datasets/shakespeare.csv: -------------------------------------------------------------------------------- 1 | Quote,Play,Cite 2 | Sweet are the uses of adversity,As You Like It,"Act 2, scene 1, 12" 3 | All the world's a stage,As You Like It,"Act 2, scene 7, 139" 4 | "We few, we happy few",Henry V, 5 | """Seems,"" madam! Nay it is; I know not ""seems.""",Hamlet,(1.ii.76) 6 | "To be, or not to be",Hamlet,"Act 3, scene 1, 55" 7 | What's in a name? That which we call a rose by any other name would smell as sweet.,Romeo and Juliet,"(II, ii, 1-2)" 8 | "O Romeo, Romeo, wherefore art thou Romeo?",Romeo and Juliet,"Act 2, scene 2, 33" 9 | "Tomorrow, and tomorrow, and tomorrow",Macbeth,"Act 5, scene 5, 19" 10 | -------------------------------------------------------------------------------- /csvreader/datasets/test.csv: -------------------------------------------------------------------------------- 1 | ################################################## 2 | ## Apache Commons CSV Reader Test Sample 3 | ## see https://github.com/apache/commons-csv/blob/master/src/test/resources/CSVFileParser/test.csv 4 | 5 | 6 | A,B,C,"D" 7 | # plain values 8 | a,b,c,d 9 | # spaces before and after 10 | e ,f , g,h 11 | # quoted: with spaces before and after 12 | " i ", " j " , " k "," l " 13 | # empty values 14 | ,,, 15 | # empty quoted values 16 | "","","","" 17 | # 3 empty lines 18 | 19 | 20 | 21 | # EOF on next line 22 | -------------------------------------------------------------------------------- /csvreader/lib/csvreader.rb: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## our own code (without "top-level" shortcuts e.g. "modular version") 4 | require 'csvreader/base' 5 | 6 | 7 | ### 8 | # add convenience top-level shortcuts / aliases 9 | 10 | Csv = CsvReader 11 | CsvHash = CsvHashReader 12 | -------------------------------------------------------------------------------- /csvreader/lib/csvreader/buffer.rb: -------------------------------------------------------------------------------- 1 | 2 | class CsvReader 3 | class Buffer ## todo: find a better name: 4 | ## BufferedReader 5 | ## BufferedInput 6 | ## BufferI 7 | ## - why? why not? is really just for reading (keep io?) 8 | 9 | def initialize( data ) 10 | # create the IO object we will read from 11 | @io = data.is_a?(String) ? StringIO.new(data) : data 12 | @buf = [] ## last (buffer) chars (used for peek) 13 | end 14 | 15 | def eof?() @buf.size == 0 && @io.eof?; end 16 | 17 | def getc 18 | if @buf.size > 0 19 | @buf.shift ## get first char from buffer 20 | else 21 | @io.getc 22 | end 23 | end # method getc 24 | 25 | 26 | def peekn( lookahead ) 27 | ## todo/check: use a new method peekstr or match or something 28 | ## for more than 29 | if @buf.size == 0 && @io.eof? 30 | ## puts "peek - hitting eof!!!" 31 | return "\0" ## return NUL char (0) for now 32 | end 33 | 34 | while @buf.size < lookahead do 35 | ## todo/check: add/append NUL char (0) - why? why not? 36 | break if @io.eof? ## nothing more to read; break out of filling up buffer 37 | 38 | c = @io.getc 39 | @buf.push( c ) 40 | ## puts "peek - fill buffer >#{c}< (#{c.ord})" 41 | end 42 | 43 | @buf[0,lookahead].join 44 | end 45 | 46 | 47 | def peek1 48 | if @buf.size == 0 && @io.eof? 49 | ## puts "peek - hitting eof!!!" 50 | return "\0" ## return NUL char (0) for now 51 | end 52 | 53 | if @buf.size == 0 54 | c = @io.getc 55 | @buf.push( c ) 56 | ## puts "peek - fill buffer >#{c}< (#{c.ord})" 57 | end 58 | 59 | @buf[0] ## @buf.first 60 | end # method peek1 61 | alias :peek :peek1 ## for now alias for peek1 62 | 63 | 64 | 65 | end # class Buffer 66 | end # class CsvReader 67 | -------------------------------------------------------------------------------- /csvreader/lib/csvreader/builder.rb: -------------------------------------------------------------------------------- 1 | 2 | class CsvReader 3 | class Builder 4 | 5 | 6 | def initialize( parser ) 7 | @parser = parser 8 | end 9 | 10 | 11 | ## (auto-)forward to wrapped parser 12 | ## note/fix: not all parser use/have config e.g. ParserTab, ParserFixed, etc. 13 | ## 14 | ## todo/fix: 15 | ## add parser config (attribute) setter e.g. 16 | ## - sep=(value) 17 | ## - comment=(value) 18 | ## - and so on!!! 19 | def config() @parser.config; end 20 | 21 | 22 | 23 | def open( path, mode=nil, **kwargs, &block ) 24 | CsvReader.open( path, mode, parser: @parser, **kwargs, &block ) 25 | end 26 | 27 | def read( path, **kwargs ) 28 | CsvReader.read( path, parser: @parser, **kwargs ) 29 | end 30 | 31 | def header( path, **kwargs ) 32 | CsvReader.header( path, parser: @parser, **kwargs ) 33 | end 34 | 35 | def foreach( path, **kwargs, &block ) 36 | CsvReader.foreach( path, parser: @parser, **kwargs, &block ) 37 | end 38 | 39 | 40 | def parse( str_or_readable, **kwargs, &block ) 41 | CsvReader.parse( str_or_readable, parser: @parser, **kwargs, &block ) 42 | end 43 | end # class Builder 44 | end # class CsvReader 45 | 46 | 47 | 48 | class CsvHashReader 49 | class Builder 50 | def initialize( parser ) 51 | @parser = parser 52 | end 53 | 54 | ## (auto-)forward to wrapped parser 55 | ## note/fix: not all parser use/have config e.g. ParserTab, ParserFixed, etc. 56 | ## 57 | ## todo/fix: 58 | ## add parser config (attribute) setter e.g. 59 | ## - sep=(value) 60 | ## - comment=(value) 61 | ## - and so on!!! 62 | def config() @parser.config; end 63 | 64 | 65 | 66 | def open( path, mode=nil, **kwargs, &block ) 67 | CsvHashReader.open( path, mode, parser: @parser, **kwargs, &block ) 68 | end 69 | 70 | def read( path, **kwargs ) 71 | CsvHashReader.read( path, parser: @parser, **kwargs ) 72 | end 73 | 74 | def foreach( path, **kwargs, &block ) 75 | CsvHashReader.foreach( path, parser: @parser, **kwargs, &block ) 76 | end 77 | 78 | 79 | def parse( str_or_readable, **kwargs, &block ) 80 | CsvHashReader.parse( str_or_readable, parser: @parser, **kwargs, &block ) 81 | end 82 | end # class Builder 83 | end # class CsvHashReader 84 | -------------------------------------------------------------------------------- /csvreader/lib/csvreader/parser.rb: -------------------------------------------------------------------------------- 1 | 2 | class CsvReader 3 | 4 | class Parser 5 | ## "forward" reference, 6 | ## see base.rb for more 7 | end 8 | 9 | 10 | #################################### 11 | # define errors / exceptions 12 | # for all parsers for (re)use 13 | 14 | class Error < StandardError 15 | end 16 | 17 | #### 18 | # todo/check: 19 | # use "common" error class - why? why not? 20 | 21 | class ParseError < Error 22 | attr_reader :message 23 | 24 | def initialize( message ) 25 | @message = message 26 | end 27 | 28 | def to_s 29 | "*** csv parse error: #{@message}" 30 | end 31 | end # class ParseError 32 | end # class CsvReader 33 | -------------------------------------------------------------------------------- /csvreader/lib/csvreader/parser_fixed.rb: -------------------------------------------------------------------------------- 1 | 2 | class CsvReader 3 | 4 | class ParserFixed 5 | 6 | ################################### 7 | ## add simple logger with debug flag/switch 8 | # 9 | # use Parser.debug = true # to turn on 10 | # 11 | # todo/fix: use logutils instead of std logger - why? why not? 12 | 13 | def self.build_logger() 14 | l = Logger.new( STDOUT ) 15 | l.level = :info ## set to :info on start; note: is 0 (debug) by default 16 | l 17 | end 18 | def self.logger() @@logger ||= build_logger; end 19 | def logger() self.class.logger; end 20 | 21 | 22 | def parse( data, width:, &block ) 23 | 24 | ## note: input: required each_line (string or io/file for example) 25 | 26 | input = data ## assume it's a string or io/file handle 27 | 28 | if block_given? 29 | parse_lines( input, width: width, &block ) 30 | else 31 | records = [] 32 | 33 | parse_lines( input, width: width ) do |record| 34 | records << record 35 | end 36 | 37 | records 38 | end 39 | end ## method parse 40 | 41 | 42 | 43 | private 44 | 45 | def parse_lines( input, width:, &block ) 46 | 47 | ## note: each line only works with \n (windows) or \r\n (unix) 48 | ## will NOT work with \r (old mac, any others?) only!!!! 49 | input.each_line do |line| 50 | 51 | ## note: chomp('') if is an empty string, 52 | ## it will remove all trailing newlines from the string. 53 | ## use line.sub(/[\n\r]*$/, '') or similar instead - why? why not? 54 | line = line.chomp( '' ) 55 | logger.debug "line:" if logger.debug? 56 | logger.debug line.pretty_inspect if logger.debug? 57 | 58 | 59 | ## skip empty lines and comments 60 | if line =~ /^[ \t]*$/ ## skip blank lines (with whitespace only) 61 | logger.debug "skip blank line" if logger.debug? 62 | next 63 | end 64 | 65 | if line =~ /^[ \t]*#/ # start_with?( "#" ) -- skip comment lines (note: allow leading whitespaces) 66 | logger.debug "skip comment line" if logger.debug? 67 | next 68 | end 69 | 70 | 71 | if width.is_a?( String ) 72 | ## assume it's String#unpack format e.g. 73 | ## "209231-231992395 MoreData".unpack('aa5A1A9a4Z*') 74 | ## returns an array as follows : 75 | ## ["2", "09231", "-", "231992395", " ", "MoreData"] 76 | ## see String#unpack 77 | 78 | values = line.unpack( width ) 79 | else ## assume array with integers 80 | values = [] 81 | offset = 0 # start position / offset 82 | width.each_with_index do |w,i| 83 | logger.debug "[#{i}] start: #{offset}, width: #{w}" if logger.debug? 84 | 85 | if w < 0 ## convention - if width negative, skip column 86 | # note: minus (-) and minus (-) equal plus (+) 87 | ## e.g. 2 - -2 = 4 88 | offset -= w 89 | else 90 | value = line[offset, w] 91 | value = value.strip if value ## note: if not nil strip; only use rstrip (for trailing only) - why? why not? 92 | values << value 93 | offset += w 94 | end 95 | end 96 | end 97 | 98 | ## note: requires block - enforce? how? why? why not? 99 | block.call( values ) 100 | end 101 | end # method parse_lines 102 | 103 | 104 | end # class ParserFixed 105 | end # class CsvReader 106 | -------------------------------------------------------------------------------- /csvreader/lib/csvreader/parser_json.rb: -------------------------------------------------------------------------------- 1 | 2 | 3 | class CsvReader 4 | 5 | class ParserJson 6 | 7 | def parse( data, **kwargs, &block ) 8 | ## note: kwargs NOT used for now (but required for "protocol/interface" by other parsers) 9 | 10 | ## note: input: required each_line (string or io/file for example) 11 | ## assume data is a string or io/file handle 12 | csv = CsvJson.new( data ) 13 | 14 | if block_given? 15 | csv.each( &block ) 16 | else 17 | csv.to_a 18 | end 19 | end ## method parse 20 | 21 | 22 | end # class ParserJson 23 | end # class CsvReader 24 | -------------------------------------------------------------------------------- /csvreader/lib/csvreader/parser_tab.rb: -------------------------------------------------------------------------------- 1 | 2 | class CsvReader 3 | 4 | class ParserTab 5 | 6 | def parse( data, **kwargs, &block ) 7 | ## note: kwargs NOT used for now (but required for "protocol/interface" by other parsers) 8 | 9 | ## note: input: required each_line (string or io/file for example) 10 | ## assume data is a string or io/file handle 11 | tab = TabReader.new( data ) 12 | 13 | if block_given? 14 | tab.each( &block ) 15 | else 16 | tab.to_a 17 | end 18 | end ## method parse 19 | 20 | 21 | end # class ParserTab 22 | end # class CsvReader 23 | -------------------------------------------------------------------------------- /csvreader/lib/csvreader/parser_yaml.rb: -------------------------------------------------------------------------------- 1 | 2 | 3 | class CsvReader 4 | 5 | class ParserYaml 6 | 7 | def parse( data, **kwargs, &block ) 8 | ## note: kwargs NOT used for now (but required for "protocol/interface" by other parsers) 9 | 10 | ## note: input: required each_line (string or io/file for example) 11 | ## assume data is a string or io/file handle 12 | csv = CsvYaml.new( data ) 13 | 14 | if block_given? 15 | csv.each( &block ) 16 | else 17 | csv.to_a 18 | end 19 | end ## method parse 20 | 21 | 22 | end # class ParserYaml 23 | end # class CsvReader 24 | -------------------------------------------------------------------------------- /csvreader/lib/csvreader/version.rb: -------------------------------------------------------------------------------- 1 | 2 | class CsvReader ## note: uses a class for now - change to module - why? why not? 3 | 4 | module Version 5 | MAJOR = 1 ## todo: namespace inside version or something - why? why not?? 6 | MINOR = 2 7 | PATCH = 5 8 | 9 | ## self.to_s - why? why not? 10 | end 11 | 12 | VERSION = [Version::MAJOR, 13 | Version::MINOR, 14 | Version::PATCH].join('.') 15 | 16 | def self.version ## keep (as an alternative to VERSION) - why? why not? 17 | VERSION 18 | end 19 | 20 | 21 | 22 | def self.banner 23 | "csvreader/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}] in (#{root})" 24 | end 25 | 26 | def self.root 27 | File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) ) 28 | end 29 | 30 | end # class CsvReader 31 | -------------------------------------------------------------------------------- /csvreader/test/helper.rb: -------------------------------------------------------------------------------- 1 | ## $:.unshift(File.dirname(__FILE__)) 2 | 3 | ## minitest setup 4 | 5 | require 'minitest/autorun' 6 | 7 | 8 | ## our own code 9 | require 'csvreader' 10 | ## require 'csvreader/base' ## try modular version (that is, without Csv,CsvHash "top-level" shortcuts) 11 | 12 | 13 | ## add test_data_dir helper 14 | class CsvReader 15 | def self.test_data_dir 16 | "#{root}/datasets" 17 | end 18 | end 19 | 20 | 21 | ## CsvReader::ParserStd.logger.level = :debug ## turn on "global" logging 22 | ## CsvReader::ParserStrict.logger.level = :debug ## turn on "global" logging 23 | ## CsvReader::ParserFixed.logger.level = :debug ## turn on "global" logging 24 | CsvReader::ParserTable.logger.level = :debug ## turn on "global" logging 25 | -------------------------------------------------------------------------------- /csvreader/test/test_buffer.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_buffer.rb 6 | 7 | 8 | require 'helper' 9 | 10 | 11 | class TestBuffer < MiniTest::Test 12 | 13 | 14 | def test_peek 15 | 16 | buf = CsvReader::Buffer.new( <<TXT ) 17 | # hello 18 | 1,2,3 19 | TXT 20 | 21 | assert_equal '#', buf.peek 22 | assert_equal '#', buf.peek1 23 | assert_equal '#', buf.peekn(1) 24 | assert_equal '# ', buf.peekn(2) 25 | assert_equal '# h', buf.peekn(3) 26 | assert_equal '# he', buf.peekn(4) 27 | 28 | buf.getc ## eat first char 29 | 30 | assert_equal ' ', buf.peek 31 | assert_equal ' ', buf.peek1 32 | assert_equal ' ', buf.peekn(1) 33 | assert_equal ' h', buf.peekn(2) 34 | assert_equal ' he', buf.peekn(3) 35 | assert_equal ' hel', buf.peekn(4) 36 | end 37 | 38 | 39 | end # class TestBuffer 40 | -------------------------------------------------------------------------------- /csvreader/test/test_parser_autofix.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_parser_autofix.rb 6 | 7 | 8 | require 'helper' 9 | 10 | 11 | class TestParserAutofix < MiniTest::Test 12 | 13 | 14 | def parser 15 | CsvReader::Parser::DEFAULT 16 | end 17 | 18 | 19 | def test_quote_with_trailing_value 20 | recs = [[ "Farrokh", "\"Freddy\" Mercury", "Bulsara" ]] 21 | 22 | assert_equal recs, parser.parse( %Q{Farrokh,"Freddy" Mercury,Bulsara} ) 23 | assert_equal recs, parser.parse( %Q{ Farrokh , "Freddy" Mercury , Bulsara } ) 24 | assert_equal recs, parser.parse( %Q{Farrokh, "Freddy" Mercury ,Bulsara} ) 25 | end 26 | 27 | 28 | end # class TestParserAutofix 29 | -------------------------------------------------------------------------------- /csvreader/test/test_parser_directive.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_parser_directive.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestParserDirective < MiniTest::Test 11 | 12 | 13 | def parser 14 | CsvReader::Parser::DEFAULT 15 | end 16 | 17 | 18 | def test_iris 19 | records = [["5.1","3.5","1.4","0.2","Iris-setosa"], 20 | ["4.9","3.0","1.4","0.2","Iris-setosa"]] 21 | 22 | 23 | assert_equal records, parser.parse( <<TXT ) 24 | % with meta data - arff (attribute relation file format)-style 25 | % 26 | 27 | @RELATION iris 28 | 29 | @ATTRIBUTE sepallength NUMERIC 30 | @ATTRIBUTE sepalwidth NUMERIC 31 | @ATTRIBUTE petallength NUMERIC 32 | @ATTRIBUTE petalwidth NUMERIC 33 | @ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica} 34 | 35 | @DATA 36 | 5.1,3.5,1.4,0.2,Iris-setosa 37 | 4.9,3.0,1.4,0.2,Iris-setosa 38 | TXT 39 | end 40 | 41 | 42 | def test_lcc 43 | records = [['AG5', 'Encyclopedias and dictionaries.;Twentieth century.'], 44 | ['AS262', 'Science -- Soviet Union -- History.'], 45 | ['AE5', 'Encyclopedias and dictionaries.'], 46 | ['AS281', 'Astronomy, Assyro-Babylonian.;Moon -- Phases.'], 47 | ['AS281', 'Astronomy, Assyro-Babylonian.;Moon -- Tables.']] 48 | 49 | 50 | assert_equal records, parser.parse( <<TXT ) 51 | % Attribute-Relation File Format (ARFF) Example 52 | % see https://www.cs.waikato.ac.nz/ml/weka/arff.html 53 | 54 | @relation LCCvsLCSH 55 | 56 | @attribute LCC string 57 | @attribute LCSH string 58 | 59 | @data 60 | AG5, 'Encyclopedias and dictionaries.;Twentieth century.' 61 | AS262, 'Science -- Soviet Union -- History.' 62 | AE5, 'Encyclopedias and dictionaries.' 63 | AS281, 'Astronomy, Assyro-Babylonian.;Moon -- Phases.' 64 | AS281, 'Astronomy, Assyro-Babylonian.;Moon -- Tables.' 65 | TXT 66 | end 67 | 68 | end # class TestParserDirective 69 | -------------------------------------------------------------------------------- /csvreader/test/test_parser_fixed.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_parser_fixed.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestParserFixed < MiniTest::Test 11 | 12 | 13 | def parser() CsvReader::Parser::FIXED; end 14 | def reader() CsvReader.fixed; end 15 | 16 | 17 | def test_numbers 18 | numbers( parser ) 19 | numbers( reader ) 20 | end 21 | 22 | def test_contacts 23 | contacts( parser ) 24 | contacts( reader ) 25 | end 26 | 27 | 28 | def numbers( parser ) 29 | records = [["12345678","12345678", "12345678901234567890123456789012", "12345678901234"]] 30 | 31 | assert_equal records, parser.parse( <<TXT, width: [8,8,32,14] ) 32 | # fixed width with comments and blank lines 33 | 34 | 12345678123456781234567890123456789012345678901212345678901234 35 | 36 | TXT 37 | 38 | assert_equal records, parser.parse( <<TXT, width: [8,8,32,14] ) 39 | 12345678123456781234567890123456789012345678901212345678901234 40 | TXT 41 | 42 | ## note: negative width fields gets skipped 43 | assert_equal records, parser.parse( <<TXT, width: [8,-2,8,-3,32,-2,14] ) 44 | 12345678XX12345678XXX12345678901234567890123456789012XX12345678901234XXX 45 | TXT 46 | end 47 | 48 | 49 | def contacts( parser ) 50 | records = [["John", "Smith", "john@example.com", "1-888-555-6666"], 51 | ["Michele", "O'Reiley", "michele@example.com", "1-333-321-8765"]] 52 | 53 | assert_equal records, parser.parse( <<TXT, width: [8,8,32,14] ) 54 | # fixed width with comments and blank lines 55 | 56 | John Smith john@example.com 1-888-555-6666 57 | Michele O'Reileymichele@example.com 1-333-321-8765 58 | 59 | TXT 60 | 61 | 62 | assert_equal records, parser.parse( <<TXT, width: [8,8,32,14] ) 63 | John Smith john@example.com 1-888-555-6666 64 | Michele O'Reileymichele@example.com 1-333-321-8765 65 | TXT 66 | end 67 | 68 | 69 | 70 | def test_unpack_numbers 71 | records = [["12345678","12345678", "12345678901234567890123456789012", "12345678901234"]] 72 | 73 | assert_equal records, parser.parse( <<TXT, width: 'a8 a8 a32 Z*' ) 74 | 12345678123456781234567890123456789012345678901212345678901234 75 | TXT 76 | end 77 | 78 | def test_unpack_contacts 79 | records = [["John", "Smith", "john@example.com", "1-888-555-6666"], 80 | ["Michele", "O'Reiley", "michele@example.com", "1-333-321-8765"]] 81 | 82 | assert_equal records, parser.parse( <<TXT, width: 'A8 A8 A32 Z*' ) 83 | John Smith john@example.com 1-888-555-6666 84 | Michele O'Reileymichele@example.com 1-333-321-8765 85 | TXT 86 | end 87 | 88 | 89 | 90 | end # class TestParserFixed 91 | -------------------------------------------------------------------------------- /csvreader/test/test_parser_formats.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_parser_formats.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestParserFormats < MiniTest::Test 11 | 12 | 13 | def parser 14 | CsvReader::Parser 15 | end 16 | 17 | 18 | def test_parse_whitespace 19 | records = [["a", "b", "c"], 20 | ["1", "2", "3"]] 21 | 22 | ## don't care about newlines (\r\n) ??? - fix? why? why not? 23 | assert_equal records, parser.default.parse( "a,b,c\n1,2,3" ) 24 | assert_equal records, parser.default.parse( "a,b,c\n1,2,3\n" ) 25 | assert_equal records, parser.default.parse( " a, b ,c \n\n1,2,3\n" ) 26 | assert_equal records, parser.default.parse( " a, b ,c \n \n1,2,3\n" ) 27 | 28 | assert_equal [["a", "b", "c"], 29 | [""], 30 | ["1", "2", "3"]], parser.default.parse( %Q{a,b,c\n""\n1,2,3\n} ) 31 | assert_equal [["", ""], 32 | [""], 33 | ["", "", ""]], parser.default.parse( %Q{,\n""\n"","",""\n} ) 34 | 35 | 36 | ## strict rfc4180 - no trim leading or trailing spaces or blank lines 37 | assert_equal records, parser.strict.parse( "a,b,c\n1,2,3" ) 38 | assert_equal [["a", "b", "c"], 39 | [""], 40 | ["1", "2", "3"]], parser.strict.parse( "a,b,c\n\n1,2,3" ) 41 | assert_equal [[" a", " b ", "c "], 42 | [""], 43 | ["1", "2", "3"]], parser.strict.parse( " a, b ,c \n\n1,2,3" ) 44 | assert_equal [[" a", " b ", "c "], 45 | [" "], 46 | ["",""], 47 | ["1", "2", "3"]], parser.strict.parse( " a, b ,c \n \n,\n1,2,3" ) 48 | end 49 | 50 | 51 | def test_parse_empties 52 | assert_equal [], parser.default.parse( "\n \n \n" ) 53 | 54 | ## strict rfc4180 - no trim leading or trailing spaces or blank lines 55 | assert_equal [[""], 56 | [" "], 57 | [" "]], parser.strict.parse( "\n \n \n" ) 58 | assert_equal [[""], 59 | [" "], 60 | [" "]], parser.strict.parse( "\n \n " ) 61 | 62 | assert_equal [[""]], parser.strict.parse( "\n" ) 63 | assert_equal [], parser.strict.parse( "" ) 64 | end 65 | 66 | end # class TestParserFormats 67 | -------------------------------------------------------------------------------- /csvreader/test/test_parser_meta.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_parser_meta.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestParserMeta < MiniTest::Test 11 | 12 | 13 | def parser 14 | CsvReader::Parser::DEFAULT 15 | end 16 | 17 | 18 | def test_parse 19 | pp parser.config 20 | 21 | records = [["a", "b", "c"], 22 | ["1", "2", "3"]] 23 | 24 | assert_equal records, parser.parse( <<TXT ) 25 | # with meta data 26 | ## see https://blog.datacite.org/using-yaml-frontmatter-with-csv/ 27 | --- 28 | columns: 29 | - title: Purchase Date 30 | type: date 31 | - title: Item 32 | type: string 33 | - title: Amount (€) 34 | type: float 35 | --- 36 | a,b,c 37 | 1,2,3 38 | TXT 39 | 40 | pp parser.meta 41 | meta = { "columns"=> 42 | [{"title"=>"Purchase Date", "type"=>"date"}, 43 | {"title"=>"Item", "type"=>"string"}, 44 | {"title"=>"Amount (€)", "type"=>"float"}] 45 | } 46 | assert_equal meta, parser.meta 47 | 48 | 49 | assert_equal records, parser.parse( <<TXT ) 50 | # with (empty) meta data 51 | --- 52 | --- 53 | a,b,c 54 | 1,2,3 55 | TXT 56 | 57 | pp parser.meta 58 | meta = {} 59 | assert_equal meta, parser.meta 60 | 61 | 62 | 63 | assert_equal records, parser.parse( <<TXT ) 64 | # without meta data 65 | a,b,c 66 | 1,2,3 67 | TXT 68 | 69 | assert_nil parser.meta 70 | end 71 | 72 | 73 | end # class TestParserMeta 74 | -------------------------------------------------------------------------------- /csvreader/test/test_parser_null.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_parser_null.rb 6 | 7 | 8 | require 'helper' 9 | 10 | 11 | class TestParserNull < MiniTest::Test 12 | 13 | 14 | def parser 15 | CsvReader::Parser 16 | end 17 | 18 | 19 | def test_escaped_mysql_null_value 20 | ## MySQL uses \N to symbolize null values. We have to restore this 21 | 22 | ## note: "unknown escape sequences e.g. \N get passed "through" as-is (unescaped)" 23 | ## only supports \n \r (sep e.g \, or \t) (quote e.g. \") for now - any others? 24 | assert_equal [[ "character\\NEscaped" ]], 25 | parser.default.parse( "character\\NEscaped" ) 26 | 27 | assert_equal [[ "character\\NEscaped" ]], 28 | parser.strict.parse( "character\\NEscaped" ) 29 | end 30 | 31 | 32 | def test_mysql_null_value 33 | default_null_values = parser.default.config[:null] ## save default null settings 34 | 35 | assert_equal [[ nil, nil, "" ]], 36 | parser.default.parse( "\\N, \\N ," ) 37 | 38 | ## escaped with quotes 39 | assert_equal [[ "\\N", "\\N", "" ]], 40 | parser.default.parse( %Q{"\\N", "\\N" ,} ) 41 | 42 | ## try single \N setting 43 | parser.default.null = "\\N" 44 | assert_equal [[ nil, nil, "" ]], 45 | parser.default.parse( "\\N, \\N ," ) 46 | 47 | ## try no null values setting 48 | parser.default.null = nil 49 | assert_equal [[ "\\N", "\\N", "" ]], 50 | parser.default.parse( "\\N, \\N ," ) 51 | 52 | ## try postgresql unquoted empty string is nil/null 53 | parser.default.null = "" 54 | assert_equal [[ nil, nil, "" ], 55 | [ nil, nil, "", nil ]], 56 | parser.default.parse( %Q{,,""\n , , "" ,} ) 57 | 58 | ## try proc 59 | parser.default.null = ->(value) { value.downcase == 'nil' } 60 | assert_equal [[ nil, nil, nil, "" ]], 61 | parser.default.parse( "nil, Nil, NIL," ) 62 | 63 | ## try array 64 | parser.default.null = ['nil', 'Nil', 'NIL'] 65 | assert_equal [[ nil, nil, nil, "" ]], 66 | parser.default.parse( "nil, Nil, NIL," ) 67 | 68 | ## restore defaults 69 | parser.default.null = default_null_values ## ['\N', 'NA'] 70 | end 71 | 72 | 73 | def test_strict_mysql_null_value 74 | assert_equal [[ "\\N", " \\N ", "" ]], 75 | parser.strict.parse( "\\N, \\N ," ) 76 | 77 | ## try single \N setting 78 | parser.strict.null = "\\N" 79 | assert_equal [[ nil, nil, " \\N", "\\N ", "" ]], 80 | parser.strict.parse( "\\N,\\N, \\N,\\N ," ) 81 | 82 | ## escaped with quotes 83 | assert_equal [[ "\\N", "\\N", nil, "" ]], 84 | parser.strict.parse( %Q{"\\N","\\N",\\N,} ) 85 | 86 | 87 | ## try postgresql unquoted empty string is nil/null 88 | parser.strict.null = "" 89 | assert_equal [[ nil, nil, "" ], 90 | [ " ", " ", "", nil ]], 91 | parser.strict.parse( %Q{,,""\n , ,"",} ) 92 | 93 | ## try proc 94 | parser.strict.null = ->(value) { value.downcase == 'nil' } 95 | assert_equal [[ nil, nil, nil, "" ]], 96 | parser.strict.parse( "nil,Nil,NIL," ) 97 | 98 | ## try array 99 | parser.strict.null = ['nil', 'Nil', 'NIL'] 100 | assert_equal [[ nil, nil, nil, "" ]], 101 | parser.strict.parse( "nil,Nil,NIL," ) 102 | 103 | ## restore defaults 104 | parser.strict.null = nil 105 | end 106 | 107 | end # class TestParserNull 108 | -------------------------------------------------------------------------------- /csvreader/test/test_parser_numeric.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_parser_numeric.rb 6 | 7 | 8 | require 'helper' 9 | 10 | 11 | class TestParserNumeric < MiniTest::Test 12 | 13 | def parser 14 | CsvReader::Parser::NUMERIC 15 | end 16 | 17 | 18 | def test_parser_numeric 19 | pp CsvReader::Parser::NUMERIC 20 | pp CsvReader::Parser.numeric 21 | assert true 22 | end 23 | 24 | def test_parse 25 | assert_equal [[1.0,2.0,3.0], 26 | [4.0,5.0,6.0]], parser.parse( "1,2,3\n4,5,6" ) 27 | assert_equal [[1.0,2.0,3.0], 28 | ["4","5","6"]], parser.parse( %Q{ 1,2 , 3\n"4","5","6"} ) 29 | assert_equal [[1.0,2.0,3.0], 30 | ["4","5","6"]], parser.parse( %Q{ 1,2 , 3\n "4", "5" ,"6" } ) 31 | assert_equal [["a","b","c"]], parser.parse( %Q{"a","b","c"} ) 32 | end 33 | 34 | 35 | def test_empty 36 | assert_equal [[nil,nil,nil], 37 | ["","",""]], parser.parse( %Q{,,\n"","",""} ) 38 | end 39 | 40 | end # class TestParserNumeric 41 | -------------------------------------------------------------------------------- /csvreader/test/test_parser_quotes.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_parser_quotes.rb 6 | 7 | 8 | require 'helper' 9 | 10 | 11 | class TestParserQuotes < MiniTest::Test 12 | 13 | 14 | def parser 15 | CsvReader::Parser::DEFAULT 16 | end 17 | 18 | 19 | def test_french_single 20 | assert_equal [[ "a", "b", "c" ]], 21 | parser.parse( " ‹a›, ‹b›, ›c‹ " ) 22 | 23 | assert_equal [[ "a,1", " b,2", "c, 3" ]], 24 | parser.parse( " ‹a,1›, ‹ b,2›, ›c, 3‹ " ) 25 | 26 | assert_equal [[ %Q{"a"}, %Q{'b'}, %Q{c'"'"} ]], 27 | parser.parse( %Q{ ‹"a"›, ‹'b'›, ›c'"'"‹} ) 28 | 29 | # note: quote matches only if first non-whitespace char 30 | assert_equal [[ "_‹a›", "_‹b›", "›c‹" ]], 31 | parser.parse( %Q{ _‹a›, _‹b›, "›c‹"} ) 32 | 33 | end 34 | 35 | 36 | def test_french_double 37 | assert_equal [[ "a", "b", "c" ]], 38 | parser.parse( " «a», «b», »c« " ) 39 | 40 | assert_equal [[ "a,1", " b,2", "c, 3" ]], 41 | parser.parse( " «a,1», « b,2», »c, 3« " ) 42 | 43 | assert_equal [[ %Q{"a"}, %Q{'b'}, %Q{c'"'"} ]], 44 | parser.parse( %Q{ «"a"», «'b'», »c'"'"«} ) 45 | 46 | # note: quote matches only if first non-whitespace char 47 | assert_equal [[ "_«a»", "_«b»", "»c«" ]], 48 | parser.parse( %Q{ _«a», _«b», "»c«"} ) 49 | 50 | end 51 | 52 | 53 | end # class TestParserQuotes 54 | -------------------------------------------------------------------------------- /csvreader/test/test_parser_strict.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_parser_strict.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestParserStrict < MiniTest::Test 11 | 12 | 13 | def parser 14 | CsvReader::Parser::STRICT 15 | end 16 | 17 | 18 | def test_parser_strict 19 | pp CsvReader::Parser::STRICT 20 | pp CsvReader::Parser.strict 21 | assert true 22 | end 23 | 24 | def test_parse 25 | records = [["a", "b", "c"], 26 | ["1", "2", "3"], 27 | ["4", "5", "6"]] 28 | 29 | ## don't care about newlines (\r\n) ??? - fix? why? why not? 30 | assert_equal records, parser.parse( "a,b,c\n1,2,3\n4,5,6" ) 31 | assert_equal records, parser.parse( "a,b,c\n1,2,3\n4,5,6\n" ) 32 | assert_equal records, parser.parse( "a,b,c\r1,2,3\r4,5,6" ) 33 | assert_equal records, parser.parse( "a,b,c\r\n1,2,3\r\n4,5,6\r\n" ) 34 | end 35 | 36 | def test_parse_semicolon 37 | records = [["a", "b", "c"], 38 | ["1", "2", "3"], 39 | ["4", "5", "6"]] 40 | 41 | ## don't care about newlines (\r\n) ??? - fix? why? why not? 42 | assert_equal records, parser.parse( "a;b;c\n1;2;3\n4;5;6", sep: ';' ) 43 | assert_equal records, parser.parse( "a;b;c\n1;2;3\n4;5;6\n", sep: ';' ) 44 | assert_equal records, parser.parse( "a;b;c\r1;2;3\r4;5;6", sep: ';' ) 45 | assert_equal records, parser.parse( "a;b;c\r\n1;2;3\r\n4;5;6\r\n", sep: ';' ) 46 | end 47 | 48 | def test_parse_tab 49 | records = [["a", "b", "c"], 50 | ["1", "2", "3"], 51 | ["4", "5", "6"]] 52 | 53 | ## don't care about newlines (\r\n) ??? - fix? why? why not? 54 | assert_equal records, parser.parse( "a\tb\tc\n1\t2\t3\n4\t5\t6", sep: "\t" ) 55 | assert_equal records, parser.parse( "a\tb\tc\n1\t2\t3\n4\t5\t6\n", sep: "\t" ) 56 | assert_equal records, parser.parse( "a\tb\tc\r1\t2\t3\r4\t5\t6", sep: "\t" ) 57 | assert_equal records, parser.parse( "a\tb\tc\r\n1\t2\t3\r\n4\t5\t6\r\n", sep: "\t" ) 58 | end 59 | 60 | 61 | 62 | def test_parse_empties 63 | assert_equal [["","",""],["","",""]], 64 | parser.parse( %Q{"","",""\n,,} ) 65 | 66 | parser.null = "" 67 | assert_equal [["","",""," "],[nil,nil,nil," "]], 68 | parser.parse( %Q{"","",""," "\n,,, } ) 69 | parser.null = [""] ## try array (allows multiple null values) 70 | assert_equal [[nil,nil,nil," "],["","",""," "]], 71 | parser.parse( %Q{,,, \n"","",""," "} ) 72 | 73 | ## reset to defaults 74 | parser.null = nil 75 | assert_equal [["","",""],["","",""]], 76 | parser.parse( %Q{"","",""\n,,} ) 77 | end 78 | 79 | 80 | end # class TestParserStrict 81 | -------------------------------------------------------------------------------- /csvreader/test/test_parser_tab.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_parser_tab.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestParserTab < MiniTest::Test 11 | 12 | 13 | def parser 14 | CsvReader::Parser::TAB 15 | end 16 | 17 | 18 | def test_parser_tab 19 | pp CsvReader::Parser::TAB 20 | pp CsvReader::Parser.tab 21 | assert true 22 | end 23 | 24 | def test_parse 25 | records = [["a", "b", "c"], 26 | ["1", "2", "3"], 27 | ["4", "5", "6"]] 28 | 29 | ## don't care about newlines (\r\n) 30 | assert_equal records, parser.parse( "a\tb\tc\n1\t2\t3\n4\t5\t6" ) 31 | assert_equal records, parser.parse( "a\tb\tc\n1\t2\t3\n4\t5\t6\n" ) 32 | assert_equal records, parser.parse( "a\tb\tc\r\n1\t2\t3\r\n4\t5\t6\r\n" ) 33 | end 34 | 35 | def test_parse_empties 36 | # note: trailing empty fields got (auto-)trimmed !!!!!!!; 37 | # add missing -1 limit option :-) now works 38 | assert_equal [["","",""]], parser.parse( "\t\t" ) 39 | assert_equal [["","","","",""]], parser.parse( "\t\t\t\t" ) 40 | assert_equal [["1","",""]], parser.parse( "1\t\t" ) 41 | assert_equal [["1","","","",""]], parser.parse( "1\t\t\t\t" ) 42 | assert_equal [["","","3"]], parser.parse( "\t\t3" ) 43 | assert_equal [["","","","","5"]], parser.parse( "\t\t\t\t5" ) 44 | 45 | assert_equal [], parser.parse( "" ) 46 | end 47 | 48 | 49 | end # class TestParserTab 50 | -------------------------------------------------------------------------------- /csvreader/test/test_parser_table.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_parser_table.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestParserTable < MiniTest::Test 11 | 12 | 13 | def parser() CsvReader::Parser::TABLE; end 14 | 15 | 16 | def test_space 17 | records = [["1", "Man City", "10", "8", "2", "0", "27", "3", "24", "26"], 18 | ["2", "Liverpool", "10", "8", "2", "0", "20", "4", "16", "26"], 19 | ["3", "Chelsea", "10", "7", "3", "0", "24", "7", "17", "24"], 20 | ["4", "Arsenal", "10", "7", "1", "2", "24", "13", "11", "22"], 21 | ["8", "Man Utd", "10", "5", "2", "3", "17", "17", "0", "17"], 22 | ["13", "West Ham", "10", "2", "2", "6", "9", "15", "-6", "8"], 23 | ["14", "Crystal Palace", "10", "2", "2", "6", "7", "13", "-6", "8"]] 24 | 25 | parser.space='_' 26 | 27 | assert_equal records, parser.parse( <<TXT ) 28 | 1 Man_City 10 8 2 0 27 3 24 26 29 | 2 Liverpool 10 8 2 0 20 4 16 26 30 | 3 Chelsea 10 7 3 0 24 7 17 24 31 | 4 Arsenal 10 7 1 2 24 13 11 22 32 | 8 Man_Utd 10 5 2 3 17 17 0 17 33 | 13 West_Ham 10 2 2 6 9 15 -6 8 34 | 14 Crystal_Palace 10 2 2 6 7 13 -6 8 35 | TXT 36 | 37 | assert_equal [[" "," "," "]], parser.parse( "_ _ __" ) 38 | 39 | 40 | parser.space='•' 41 | 42 | assert_equal records, parser.parse( <<TXT ) 43 | 1 Man•City 10 8 2 0 27 3 24 26 44 | 2 Liverpool 10 8 2 0 20 4 16 26 45 | 3 Chelsea 10 7 3 0 24 7 17 24 46 | 4 Arsenal 10 7 1 2 24 13 11 22 47 | 8 Man•Utd 10 5 2 3 17 17 0 17 48 | 13 West•Ham 10 2 2 6 9 15 -6 8 49 | 14 Crystal•Palace 10 2 2 6 7 13 -6 8 50 | TXT 51 | 52 | assert_equal [[" "," "," "]], parser.parse( "• • ••" ) 53 | 54 | parser.space = nil ## reset to default setting 55 | end 56 | 57 | 58 | def test_contacts 59 | records = [["aa", "bbb"], 60 | ["cc", "dd", "ee"]] 61 | 62 | assert_equal records, parser.parse( <<TXT ) 63 | # space-separated with comments and blank lines 64 | 65 | aa bbb 66 | cc dd ee 67 | 68 | TXT 69 | 70 | assert_equal records, parser.parse( <<TXT ) 71 | aa bbb 72 | cc dd ee 73 | TXT 74 | end 75 | 76 | 77 | end # class TestParserTable 78 | -------------------------------------------------------------------------------- /csvreader/test/test_reader.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_reader.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestReader < MiniTest::Test 11 | 12 | 13 | 14 | 15 | def test_parse_line 16 | puts "== parse_line:" 17 | row = CsvReader.parse_line( <<TXT ) 18 | Augustiner Bräu München, München, Edelstoff, 5.6% 19 | Bayerische Staatsbrauerei Weihenstephan, Freising, Hefe Weissbier, 5.4% 20 | TXT 21 | 22 | pp row 23 | assert_equal ['Augustiner Bräu München', 'München', 'Edelstoff', '5.6%'], row 24 | end 25 | 26 | def test_parse_line11 27 | puts "== parse_line:" 28 | row = CsvReader.parse_line( <<TXT ) 29 | ####### 30 | # try with some comments 31 | # and blank lines even before header 32 | 33 | Augustiner Bräu München, München, Edelstoff, 5.6% 34 | Bayerische Staatsbrauerei Weihenstephan, Freising, Hefe Weissbier, 5.4% 35 | TXT 36 | 37 | pp row 38 | assert_equal ['Augustiner Bräu München', 'München', 'Edelstoff', '5.6%'], row 39 | end 40 | 41 | 42 | 43 | def test_read 44 | puts "== read: beer.csv:" 45 | rows = CsvReader.read( "#{CsvReader.test_data_dir}/beer.csv" ) 46 | pp rows 47 | 48 | rows.each do |row| 49 | pp row 50 | end 51 | puts " #{rows.size} rows" 52 | assert_equal 7, rows.size ## note: include header row in count 53 | end 54 | 55 | 56 | def test_header 57 | puts "== header: beer.csv:" 58 | header = CsvReader.header( "#{CsvReader.test_data_dir}/beer.csv" ) 59 | pp header 60 | assert_equal ['Brewery','City','Name','Abv'], header 61 | end 62 | 63 | def test_header11 64 | puts "== header: beer11.csv:" 65 | header = CsvReader.header( "#{CsvReader.test_data_dir}/beer11.csv" ) 66 | pp header 67 | assert_equal ['Brewery','City','Name','Abv'], header 68 | end 69 | 70 | 71 | 72 | def test_foreach 73 | puts "== foreach: beer11.csv:" 74 | CsvReader.foreach( "#{CsvReader.test_data_dir}/beer11.csv" ) do |row| 75 | pp row 76 | end 77 | assert true 78 | end 79 | 80 | 81 | def test_enum 82 | csv = CsvReader.new( "a,b,c" ) 83 | enum = csv.to_enum 84 | assert_equal ["a","b","c"], enum.next 85 | 86 | ## test Csv == CsvReader class alias 87 | if defined?( Csv ) 88 | csv = Csv.new( "a,b,c" ) 89 | enum = csv.to_enum 90 | assert_equal ["a","b","c"], enum.next 91 | end 92 | end 93 | 94 | 95 | end # class TestReader 96 | -------------------------------------------------------------------------------- /csvreader/test/test_reader_converters.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_reader_converters.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestReaderConverters < MiniTest::Test 11 | 12 | 13 | def test_all 14 | rows = CsvReader.parse( <<TXT, :converters => :all ) 15 | 1,2,3 16 | true,false,null 17 | ,, 18 | TXT 19 | 20 | pp rows 21 | 22 | assert_equal 3, rows.size 23 | assert_equal [1,2,3], rows[0] 24 | assert_equal [true,false,nil], rows[1] 25 | assert_equal [nil,nil,nil], rows[2] 26 | end 27 | 28 | 29 | def test_all_quotes 30 | ## only convert unquoted values - why? why not? 31 | ## e.g. 1 => 1 (integer) 32 | ## "1" => "1" (string) 33 | ## true => true (boolean) 34 | ## "true" => "true" (string) 35 | ## 36 | ## 37 | ## note: use CsvRecord for by column types / converters 38 | 39 | rows = CsvReader.parse( <<TXT, :converters => :all ) 40 | "1","2","3" 41 | "true","false","null" 42 | "","","" 43 | TXT 44 | 45 | pp rows 46 | 47 | assert_equal 3, rows.size 48 | assert_equal [1,2,3], rows[0] 49 | assert_equal [true,false,nil], rows[1] 50 | assert_equal [nil,nil,nil], rows[2] 51 | end 52 | 53 | 54 | end # class TestReaderConverters 55 | -------------------------------------------------------------------------------- /csvreader/test/test_reader_hash.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_reader_hash.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestHashReader < MiniTest::Test 11 | 12 | 13 | def test_read 14 | puts "== read (hash): beer.csv:" 15 | rows = CsvHashReader.read( "#{CsvReader.test_data_dir}/beer.csv" ) 16 | pp rows 17 | pp rows.to_a 18 | 19 | rows.each do |row| ## note: will skip (NOT include) header row!! 20 | pp row 21 | end 22 | puts " #{rows.size} rows" ## note: again will skip (NOT include) header row in count!!! 23 | assert_equal 6, rows.size 24 | end 25 | 26 | def test_read11 27 | puts "== read (hash): beer11.csv:" 28 | rows = CsvHashReader.read( "#{CsvReader.test_data_dir}/beer11.csv" ) 29 | pp rows 30 | pp rows.to_a ## note: includes header (first row with column names) 31 | 32 | assert true 33 | end 34 | 35 | 36 | def test_foreach 37 | puts "== foreach (hash): beer.csv:" 38 | CsvHashReader.foreach( "#{CsvReader.test_data_dir}/beer.csv" ) do |row| 39 | pp row 40 | end 41 | assert true 42 | end 43 | 44 | def test_foreach11 45 | puts "== foreach (hash): beer11.csv:" 46 | CsvHashReader.foreach( "#{CsvReader.test_data_dir}/beer11.csv" ) do |row| 47 | pp row 48 | end 49 | assert true 50 | end 51 | 52 | end # class TestHashReader 53 | -------------------------------------------------------------------------------- /csvreader/test/test_reader_hash_converters.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_reader_hash_converters.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestReaderHashConverters < MiniTest::Test 11 | 12 | 13 | def test_nil 14 | ## default no converters 15 | rows = CsvHashReader.parse( <<TXT ) 16 | a,b,c 17 | 1,2,3 18 | true,false,null 19 | ,, 20 | TXT 21 | 22 | pp rows 23 | 24 | assert_equal 3, rows.size 25 | assert_equal( {'a'=>'1', 'b'=>'2', 'c'=>'3'}, rows[0] ) 26 | assert_equal( {'a'=>'true','b'=>'false','c'=>'null'}, rows[1] ) 27 | assert_equal( {'a'=>'', 'b'=>'', 'c'=>'' }, rows[2] ) 28 | end 29 | 30 | 31 | def test_all 32 | rows = CsvHashReader.parse( <<TXT, :converters => :all ) 33 | a,b,c 34 | 1,2,3 35 | true,false,null 36 | ,, 37 | TXT 38 | 39 | pp rows 40 | 41 | assert_equal 3, rows.size 42 | assert_equal( {'a'=>1, 'b'=>2, 'c'=>3}, rows[0] ) 43 | assert_equal( {'a'=>true,'b'=>false,'c'=>nil}, rows[1] ) 44 | assert_equal( {'a'=>nil, 'b'=>nil, 'c'=>nil}, rows[2] ) 45 | end 46 | 47 | 48 | def test_downcase 49 | rows = CsvHashReader.parse( <<TXT, :converters => :all, :header_converters => :downcase ) 50 | A,B,C 51 | 1,2,3 52 | true,false,null 53 | ,, 54 | TXT 55 | 56 | pp rows 57 | 58 | assert_equal 3, rows.size 59 | assert_equal( {'a'=>1, 'b'=>2, 'c'=>3}, rows[0] ) 60 | assert_equal( {'a'=>true,'b'=>false,'c'=>nil}, rows[1] ) 61 | assert_equal( {'a'=>nil, 'b'=>nil, 'c'=>nil}, rows[2] ) 62 | end 63 | 64 | 65 | def test_symbol 66 | rows = CsvHashReader.parse( <<TXT, :converters => :all, :header_converters => :symbol ) 67 | a,b,c 68 | 1,2,3 69 | true,false,null 70 | ,, 71 | TXT 72 | 73 | pp rows 74 | 75 | assert_equal 3, rows.size 76 | assert_equal( {a: 1, b: 2, c: 3}, rows[0] ) 77 | assert_equal( {a: true, b: false, c: nil}, rows[1] ) 78 | assert_equal( {a: nil, b: nil, c: nil}, rows[2] ) 79 | end 80 | 81 | 82 | 83 | def test_all_quotes 84 | ## only convert unquoted values - why? why not? 85 | ## e.g. 1 => 1 (integer) 86 | ## "1" => "1" (string) 87 | ## true => true (boolean) 88 | ## "true" => "true" (string) 89 | ## 90 | ## 91 | ## note: use CsvRecord for by column types / converters 92 | 93 | rows = CsvHashReader.parse( <<TXT, :converters => :all ) 94 | "a","b","c" 95 | "1","2","3" 96 | "true","false","null" 97 | "","","" 98 | TXT 99 | 100 | pp rows 101 | 102 | assert_equal 3, rows.size 103 | assert_equal( {'a'=>1, 'b'=>2, 'c'=>3}, rows[0] ) 104 | assert_equal( {'a'=>true,'b'=>false,'c'=>nil}, rows[1] ) 105 | assert_equal( {'a'=>nil, 'b'=>nil, 'c'=>nil}, rows[2] ) 106 | end 107 | 108 | 109 | end # class TestHashReaderConverters 110 | -------------------------------------------------------------------------------- /csvreader/test/test_samples.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_samples.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestSamples < MiniTest::Test 11 | 12 | 13 | def test_cities11 14 | records = CsvReader.read( "#{CsvReader.test_data_dir}/cities11.csv" ) 15 | pp records 16 | 17 | assert_equal [["Los Angeles", "34°03'N", "118°15'W"], 18 | ["New York City", %Q{40°42'46"N}, %Q{74°00'21"W}], 19 | ["Paris", %Q{48°51'24"N}, %Q{2°21'03"E}]], records 20 | end 21 | 22 | 23 | def test_cars11 24 | records = CsvReader.read( "#{CsvReader.test_data_dir}/cars11.csv" ) 25 | pp records 26 | 27 | assert_equal [["Year", "Make", "Model", "Description", "Price"], 28 | ["1997", "Ford", "E350", "ac, abs, moon", "3000.00"], 29 | ["1999", "Chevy", %Q{Venture "Extended Edition"}, "", "4900.00"], 30 | ["1999", "Chevy", %Q{Venture "Extended Edition, Very Large"}, "", "5000.00"], 31 | ["1996", "Jeep", "Grand Cherokee", "MUST SELL!\nair, moon roof, loaded", "4799.00"]], records 32 | end 33 | 34 | 35 | def test_customers11 36 | records = CsvReader.read( "#{CsvReader.test_data_dir}/customers11.csv" ) 37 | pp records 38 | 39 | assert_equal [["Name", "Times arrived", "Total $ spent", "Food feedback"], 40 | ["Dan", "34", "2548", "Lovin it!"], 41 | ["Maria", "55", "5054", "Good, delicious food"], 42 | ["Carlos", "22", "4352", %Q{I am "pleased", but could be better}], 43 | ["Stephany", "34", "6542", "I want bigger steaks!!!!!"], 44 | ["James", "1", "43", "Not bad"], 45 | ["Robin", "1", "56", "Fish is tasty"], 46 | ["Anna", "1", "79", "Good, better, the best!"]], records 47 | end 48 | 49 | def test_shakespeare11 50 | records = CsvReader.read( "#{CsvReader.test_data_dir}/shakespeare.csv" ) 51 | pp records 52 | 53 | assert_equal [["Quote", "Play", "Cite"], 54 | ["Sweet are the uses of adversity", "As You Like It", "Act 2, scene 1, 12"], 55 | ["All the world's a stage", "As You Like It", "Act 2, scene 7, 139"], 56 | ["We few, we happy few", "Henry V", ""], 57 | [%Q{"Seems," madam! Nay it is; I know not "seems."}, "Hamlet", "(1.ii.76)"], 58 | ["To be, or not to be", "Hamlet", "Act 3, scene 1, 55"], 59 | ["What's in a name? That which we call a rose by any other name would smell as sweet.", "Romeo and Juliet", "(II, ii, 1-2)"], 60 | ["O Romeo, Romeo, wherefore art thou Romeo?", "Romeo and Juliet", "Act 2, scene 2, 33"], 61 | ["Tomorrow, and tomorrow, and tomorrow", "Macbeth", "Act 5, scene 5, 19"]], records 62 | end 63 | 64 | 65 | def test_test 66 | records = CsvReader.read( "#{CsvReader.test_data_dir}/test.csv" ) 67 | pp records 68 | 69 | assert_equal [["A", "B", "C", "D"], 70 | ["a", "b", "c", "d"], 71 | ["e", "f", "g", "h"], 72 | [" i ", " j ", " k ", " l "], 73 | ["", "", "", ""], 74 | ["", "", "", ""]], records 75 | end 76 | 77 | 78 | end # class TestSamples 79 | -------------------------------------------------------------------------------- /csvrecord/.gitignore: -------------------------------------------------------------------------------- 1 | ####################### 2 | # ignore ruby rake generated folders 3 | 4 | /pkg/ 5 | /doc/ 6 | 7 | 8 | ################ 9 | # ignore (top-level) datapackage folders 10 | 11 | /pack/ 12 | /.pack/ 13 | -------------------------------------------------------------------------------- /csvrecord/HISTORY.md: -------------------------------------------------------------------------------- 1 | ### 0.0.1 / 2018-08-11 2 | 3 | * Everything is new. First release. 4 | -------------------------------------------------------------------------------- /csvrecord/Manifest.txt: -------------------------------------------------------------------------------- 1 | HISTORY.md 2 | LICENSE.md 3 | Manifest.txt 4 | README.md 5 | Rakefile 6 | lib/csvrecord.rb 7 | lib/csvrecord/base.rb 8 | lib/csvrecord/version.rb 9 | test/data/beer.csv 10 | test/data/beer11.csv 11 | test/helper.rb 12 | test/test_record.rb 13 | test/test_record_auto.rb 14 | test/test_version.rb 15 | -------------------------------------------------------------------------------- /csvrecord/Rakefile: -------------------------------------------------------------------------------- 1 | require 'hoe' 2 | require './lib/csvrecord/version.rb' 3 | 4 | Hoe.spec 'csvrecord' do 5 | 6 | self.version = CsvRecord::VERSION 7 | 8 | self.summary = "csvrecord - read in comma-separated values (csv) records with typed structs / schemas" 9 | self.description = summary 10 | 11 | self.urls = ['https://github.com/csvreader/csvrecord'] 12 | 13 | self.author = 'Gerald Bauer' 14 | self.email = 'wwwmake@googlegroups.com' 15 | 16 | # switch extension to .markdown for gihub formatting 17 | self.readme_file = 'README.md' 18 | self.history_file = 'HISTORY.md' 19 | 20 | self.extra_deps = [ 21 | ['record', '>=1.2.0'], 22 | ['csvreader', '>=1.1.4'] 23 | ] 24 | 25 | self.licenses = ['Public Domain'] 26 | 27 | self.spec_extras = { 28 | required_ruby_version: '>= 2.2.2' 29 | } 30 | 31 | end 32 | -------------------------------------------------------------------------------- /csvrecord/lib/csvrecord.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # 3rd party gems 5 | require 'record' 6 | require 'csvreader' 7 | 8 | 9 | ### 10 | # our own code 11 | require 'csvrecord/version' # let version always go first 12 | require 'csvrecord/base' 13 | 14 | 15 | # say hello 16 | puts CsvRecord.banner if $DEBUG || (defined?($RUBYCOCO_DEBUG) && $RUBYCOCO_DEBUG) 17 | -------------------------------------------------------------------------------- /csvrecord/lib/csvrecord/base.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | module CsvRecord 5 | 6 | ######################## 7 | # Base 8 | 9 | class Base < Record::Base 10 | 11 | def self.foreach( path, sep: nil, headers: true ) 12 | 13 | ## note: always use reader w/o headers to get row/record values as array of strings 14 | ## if headers: true -> skip first row 15 | names = nil 16 | 17 | CsvReader.foreach( path, sep: sep ) do |row| 18 | if headers && names.nil? 19 | names = row ## store header row / a.k.a. field/column names 20 | else 21 | rec = new 22 | rec.parse( row ) 23 | 24 | yield( rec ) ## check: use block.class( rec ) - why? why not? 25 | end 26 | end 27 | end 28 | 29 | 30 | 31 | 32 | def self.parse( txt_or_rows, sep: nil, headers: true ) ## note: returns an (lazy) enumarator 33 | if txt_or_rows.is_a? String 34 | txt = txt_or_rows 35 | ## note: always use reader w/o headers to get row/record values as array of strings 36 | ## if headers: true -> skip first row 37 | rows = CsvReader.parse( txt, sep: sep ) 38 | else 39 | ### todo/fix: use only self.create( array-like ) for array-like data - why? why not? 40 | rows = txt_or_rows 41 | end 42 | 43 | ## pp rows 44 | 45 | 46 | names = nil 47 | 48 | Enumerator.new do |yielder| 49 | rows.each do |row| 50 | if headers && names.nil? 51 | names = row ## store header row / a.k.a. field/column names 52 | else 53 | rec = new 54 | rec.parse( row ) 55 | 56 | yielder.yield( rec ) 57 | end 58 | end 59 | end 60 | end 61 | 62 | 63 | def self.read( path, sep: nil, headers: true ) ## not returns an enumarator 64 | txt = File.open( path, 'r:utf-8' ).read 65 | parse( txt, sep: sep, headers: headers ) 66 | end 67 | 68 | 69 | 70 | def to_csv ## use/rename/alias to to_row too - why? why not? 71 | ## todo/fix: check for date and use own date to string format!!!! 72 | @values.map{ |value| value.to_s } 73 | end 74 | 75 | end # class Base 76 | 77 | 78 | 79 | 80 | 81 | ########################################### 82 | ## "magic" lazy auto-build schema from headers versions 83 | 84 | def self.build_class( headers ) ## check: find a better name - why? why not? 85 | ## (auto-)build record class from an array of headers 86 | ## add fields (all types will be string for now) 87 | clazz = Class.new( Base ) 88 | headers.each do |header| 89 | ## downcase and remove all non-ascii chars etc. 90 | ## todo/fix: remove all non-ascii chars!!! 91 | ## todo: check if header starts with a number too!! 92 | name = header.downcase.gsub( ' ', '_' ) 93 | name = name.to_sym ## symbol-ify 94 | clazz.field( name ) 95 | end 96 | clazz 97 | end 98 | 99 | def self.read( path, sep: nil ) 100 | headers = CsvReader.header( path, sep: sep ) 101 | 102 | clazz = build_class( headers ) 103 | clazz.read( path, sep: sep ) 104 | end 105 | 106 | def self.foreach( path, sep: nil, &block ) 107 | headers = CsvReader.header( path, sep: sep ) 108 | 109 | clazz = build_class( headers ) 110 | clazz.foreach( path, sep: sep, &block ) 111 | end 112 | 113 | 114 | ######### 115 | # alternative class (record) builder 116 | 117 | def self.define( &block ) ## check: rename super_class to base - why? why not? 118 | Record.define( Base, &block ) 119 | end 120 | 121 | end # module CsvRecord 122 | -------------------------------------------------------------------------------- /csvrecord/lib/csvrecord/version.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | module CsvRecord 5 | 6 | module Version 7 | MAJOR = 0 8 | MINOR = 4 9 | PATCH = 3 10 | end 11 | VERSION = [Version::MAJOR, 12 | Version::MINOR, 13 | Version::PATCH].join('.') 14 | 15 | 16 | def self.version 17 | VERSION 18 | end 19 | 20 | def self.banner 21 | "csvrecord/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]" 22 | end 23 | 24 | def self.root 25 | File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) ) 26 | end 27 | 28 | end # module CsvRecord 29 | -------------------------------------------------------------------------------- /csvrecord/test/data/beer.csv: -------------------------------------------------------------------------------- 1 | Brewery,City,Name,Abv 2 | Andechser Klosterbrauerei,Andechs,Doppelbock Dunkel,7% 3 | Augustiner Bräu München,München,Edelstoff,5.6% 4 | Bayerische Staatsbrauerei Weihenstephan,Freising,Hefe Weissbier,5.4% 5 | Brauerei Spezial,Bamberg,Rauchbier Märzen,5.1% 6 | Hacker-Pschorr Bräu,München,Münchner Dunkel,5.0% 7 | Staatliches Hofbräuhaus München,München,Hofbräu Oktoberfestbier,6.3% 8 | -------------------------------------------------------------------------------- /csvrecord/test/data/beer11.csv: -------------------------------------------------------------------------------- 1 | ####### 2 | # try with some comments 3 | # and blank lines even before header 4 | 5 | Brewery,City,Name,Abv 6 | Andechser Klosterbrauerei,Andechs,Doppelbock Dunkel,7% 7 | Augustiner Bräu München,München,Edelstoff,5.6% 8 | Bayerische Staatsbrauerei Weihenstephan,Freising,Hefe Weissbier,5.4% 9 | 10 | Brauerei Spezial, Bamberg, Rauchbier Märzen, 5.1% 11 | 12 | Hacker-Pschorr Bräu, München, Münchner Dunkel, 5.0% 13 | 14 | ## some more comments here 15 | 16 | Staatliches Hofbräuhaus München,München,Hofbräu Oktoberfestbier,6.3% 17 | 18 | ## check for nil 19 | "", ,,"", 20 | 21 | ## check for blank line with spaces 22 | ## yes, will get added as a record!! e.g. ["", nil, nil, nil] 23 | ## use regex to skip blank lines with spaces!!!! 24 | 25 | 26 | ## test double quotes and double quotes escaped 27 | ## note: double quotes do NOT work with leading AND/OR trailing spaces 28 | ## leads to: 29 | ## CSV::MalformedCSVError - Missing or stray quote in line xxx 30 | ## 31 | ## note: for now double quote does not accept leading AND/OR trailing spaces!!!! 32 | ## 33 | ## todo/fix: check liberal_quote option starting in csv ruby 2.4 ??? 34 | ## 35 | ## examples: 36 | ## "value with comma, comma","some ""hello""","some ""hello""", 37 | ## works - but does NOT work (note the leading and trailing spaces for double quotes): 38 | ## "value with comma, comma" ,"some ""hello""", "some ""hello""", 39 | ## 40 | ## check for "multi-line": 41 | ## "hello 42 | ## and another line 43 | ## and another",two,three, 44 | 45 | 46 | "value with comma, comma","some ""hello""","some ""hello""", 47 | 48 | ## check for "multi-line" 49 | "hello 50 | and another line 51 | and another",two,three, 52 | -------------------------------------------------------------------------------- /csvrecord/test/helper.rb: -------------------------------------------------------------------------------- 1 | ## $:.unshift(File.dirname(__FILE__)) 2 | 3 | ## minitest setup 4 | 5 | require 'minitest/autorun' 6 | 7 | 8 | ## our own code 9 | require 'csvrecord' 10 | 11 | ## add test_data_dir helper 12 | module CsvRecord 13 | def self.test_data_dir 14 | "#{root}/test/data" 15 | end 16 | end 17 | -------------------------------------------------------------------------------- /csvrecord/test/test_record_auto.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_record_auto.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestRecordAuto < MiniTest::Test 11 | 12 | 13 | def test_read 14 | beers = CsvRecord.read( "#{CsvRecord.test_data_dir}/beer.csv" ).to_a 15 | pp beers 16 | 17 | assert_equal 6, beers.size 18 | assert_equal 'Andechser Klosterbrauerei', beers[0].brewery 19 | assert_equal 'Andechs', beers[0].city 20 | assert_equal 'Doppelbock Dunkel', beers[0].name 21 | assert_equal '7%', beers[0].abv 22 | end 23 | 24 | 25 | def test_foreach 26 | CsvRecord.foreach( "#{CsvRecord.test_data_dir}/beer.csv" ) do |rec| 27 | pp rec 28 | puts "#{rec.name} (#{rec.abv}%) by #{rec.brewery}, #{rec.city}" 29 | end 30 | 31 | assert true 32 | end 33 | 34 | end # class TestRecordAuto 35 | -------------------------------------------------------------------------------- /csvrecord/test/test_version.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_version.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestVersion < MiniTest::Test 11 | 12 | def test_version 13 | pp CsvRecord::VERSION 14 | pp CsvRecord.banner 15 | pp CsvRecord.root 16 | 17 | assert true ## assume ok if we get here 18 | end 19 | 20 | end # class TestVersion 21 | -------------------------------------------------------------------------------- /csvutils/.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | *.rbc 3 | /.config 4 | /coverage/ 5 | /InstalledFiles 6 | /pkg/ 7 | /spec/reports/ 8 | /test/tmp/ 9 | /test/version_tmp/ 10 | /tmp/ 11 | 12 | ## Specific to RubyMotion: 13 | .dat* 14 | .repl_history 15 | build/ 16 | 17 | ## Documentation cache and generated files: 18 | /.yardoc/ 19 | /_yardoc/ 20 | /doc/ 21 | /rdoc/ 22 | 23 | ## Environment normalisation: 24 | /.bundle/ 25 | /lib/bundler/man/ 26 | 27 | # for a library or gem, you might want to ignore these files since the code is 28 | # intended to run in multiple environments; otherwise, check them in: 29 | # Gemfile.lock 30 | # .ruby-version 31 | # .ruby-gemset 32 | 33 | # unless supporting rvm < 1.11.0 or doing something fancy, ignore this: 34 | .rvmrc 35 | 36 | 37 | #### 38 | # add some auto-generated getting started samples 39 | 40 | getting-started-samples/AUT_2016-2017.csv 41 | getting-started-samples/AUT_2017-2018.csv 42 | -------------------------------------------------------------------------------- /csvutils/HISTORY.md: -------------------------------------------------------------------------------- 1 | ### 0.0.1 / 2018-08-04 2 | 3 | * Everything is new. First release. 4 | -------------------------------------------------------------------------------- /csvutils/Manifest.txt: -------------------------------------------------------------------------------- 1 | HISTORY.md 2 | LICENSE.md 3 | Manifest.txt 4 | README.md 5 | Rakefile 6 | bin/csvcut 7 | bin/csvhead 8 | bin/csvheader 9 | bin/csvsplit 10 | bin/csvstat 11 | datasets/at-austria/AUT.csv 12 | datasets/de-deutschland/bundesliga.csv 13 | datasets/eng-england/2017-18/E0.csv 14 | lib/csvutils.rb 15 | lib/csvutils/commands/cut.rb 16 | lib/csvutils/commands/head.rb 17 | lib/csvutils/commands/header.rb 18 | lib/csvutils/commands/split.rb 19 | lib/csvutils/commands/stat.rb 20 | lib/csvutils/cut.rb 21 | lib/csvutils/head.rb 22 | lib/csvutils/header.rb 23 | lib/csvutils/split.rb 24 | lib/csvutils/stat.rb 25 | lib/csvutils/test.rb 26 | lib/csvutils/utils.rb 27 | lib/csvutils/version.rb 28 | test/helper.rb 29 | test/test_cut.rb 30 | test/test_head.rb 31 | test/test_header.rb 32 | test/test_misc.rb 33 | test/test_split.rb 34 | test/test_version.rb 35 | -------------------------------------------------------------------------------- /csvutils/NOTES.md: -------------------------------------------------------------------------------- 1 | # Notes 2 | 3 | 4 | ## Todos 5 | 6 | - [ ] use line-by-line reading / streaming for utils - do NOT read all into memory 7 | - [ ] add "classic" stdin (standard input) support too (e.g. `-` on the command line) or check tty? 8 | - [ ] package (include) test datasets in gem - why? why not? 9 | 10 | 11 | 12 | ## More CSV Tools 13 | 14 | ### Ruby 15 | 16 | See csvlint in ruby - <https://github.com/theodi/csvlint.rb> - 17 | supports validating CSV files to check their syntax and contents; 18 | by Stuart Harrison (pezholio) et al 19 | 20 | See ? 21 | 22 | Add more CSV tools here. 23 | 24 | 25 | ### Other Langs 26 | 27 | See xcv in rust - <https://github.com/BurntSushi/xsv> - 28 | fast CSV command line toolkit; 29 | written in Rust by Andrew Gallant (burntsushi) et al 30 | 31 | See csvkit in python - <https://github.com/wireservice/csvkit>, <https://csvkit.readthedocs.io> - 32 | a suite of command-line tools for converting to and working with CSV, the king of tabular file formats; 33 | written in Python by Christopher Groskopf (onyxfish) et al 34 | 35 | See ? 36 | 37 | Add more CSV tools here. 38 | -------------------------------------------------------------------------------- /csvutils/Rakefile: -------------------------------------------------------------------------------- 1 | require 'hoe' 2 | require './lib/csvutils/version.rb' 3 | 4 | Hoe.spec 'csvutils' do 5 | 6 | self.version = CsvUtils::VERSION 7 | 8 | self.summary = "csvutils - tools 'n' scripts for working with comma-separated values (csv) datafiles - the world's most popular tabular data interchange format in text" 9 | self.description = summary 10 | 11 | self.urls = ['https://github.com/csvreader/csvutils'] 12 | 13 | self.author = 'Gerald Bauer' 14 | self.email = 'wwwmake@googlegroups.com' 15 | 16 | # switch extension to .markdown for gihub formatting 17 | self.readme_file = 'README.md' 18 | self.history_file = 'HISTORY.md' 19 | 20 | self.extra_deps = [ 21 | ['csvreader', '>=1.2.3'] 22 | ] 23 | 24 | self.licenses = ['Public Domain'] 25 | 26 | self.spec_extras = { 27 | required_ruby_version: '>= 2.2.2' 28 | } 29 | 30 | end 31 | -------------------------------------------------------------------------------- /csvutils/bin/csvcut: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | ################### 4 | # DEV TIPS: 5 | # 6 | # For local testing run like: 7 | # 8 | # ruby -Ilib bin/csvcut 9 | # 10 | # Set the executable bit in Linux. Example: 11 | # 12 | # % chmod a+x bin/csvcut 13 | # 14 | 15 | require 'csvutils' 16 | 17 | CsvTool.cut( ARGV ) 18 | -------------------------------------------------------------------------------- /csvutils/bin/csvhead: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | ################### 4 | # DEV TIPS: 5 | # 6 | # For local testing run like: 7 | # 8 | # ruby -Ilib bin/csvhead 9 | # 10 | # Set the executable bit in Linux. Example: 11 | # 12 | # % chmod a+x bin/csvhead 13 | # 14 | 15 | require 'csvutils' 16 | 17 | CsvTool.head( ARGV ) 18 | -------------------------------------------------------------------------------- /csvutils/bin/csvheader: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | ################### 4 | # DEV TIPS: 5 | # 6 | # For local testing run like: 7 | # 8 | # ruby -Ilib bin/csvheader 9 | # 10 | # Set the executable bit in Linux. Example: 11 | # 12 | # % chmod a+x bin/csvheader 13 | # 14 | 15 | require 'csvutils' 16 | 17 | CsvTool.header( ARGV ) 18 | -------------------------------------------------------------------------------- /csvutils/bin/csvsplit: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | ################### 4 | # DEV TIPS: 5 | # 6 | # For local testing run like: 7 | # 8 | # ruby -Ilib bin/csvsplit 9 | # 10 | # Set the executable bit in Linux. Example: 11 | # 12 | # % chmod a+x bin/csvsplit 13 | # 14 | 15 | require 'csvutils' 16 | 17 | CsvTool.split( ARGV ) 18 | -------------------------------------------------------------------------------- /csvutils/bin/csvstat: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | ################### 4 | # DEV TIPS: 5 | # 6 | # For local testing run like: 7 | # 8 | # ruby -Ilib bin/csvstat 9 | # 10 | # Set the executable bit in Linux. Example: 11 | # 12 | # % chmod a+x bin/csvstat 13 | # 14 | 15 | require 'csvutils' 16 | 17 | CsvTool.stat( ARGV ) 18 | -------------------------------------------------------------------------------- /csvutils/getting-started-samples/start.rb: -------------------------------------------------------------------------------- 1 | ### 2 | # ruby script (data work flow) getting started sample from the csvutils readme 3 | # see https://github.com/csv11/csvutils 4 | # 5 | 6 | require 'csvutils' 7 | 8 | 9 | CsvUtils.head( 'ENG.csv' ) 10 | # same as: 11 | # $ csvhead ENG.csv 12 | 13 | CsvUtils.header( 'ENG.csv' ) 14 | # same as: 15 | # $ csvheader ENG.csv 16 | 17 | CsvUtils.stat( 'ENG.csv', 'Team1', 'Team2' ) 18 | # same as: 19 | # $ csvstat -c Team1,Team2 ENG.csv 20 | 21 | 22 | CsvUtils.stat( 'AUT.csv', 'Season' ) 23 | # same as: 24 | # $ csvstat -c Season AUT.csv 25 | 26 | 27 | CsvUtils.split( 'AUT.csv', 'Season' ) 28 | # same as: 29 | # $ csvsplit -c Season AUT.csv 30 | 31 | CsvUtils.cut( 'AUT_2016-2017.csv', 'Date', 'Team1', 'Team2', 'FT1', 'FT2' ) 32 | # same as: 33 | # $ csvcut -c Date,Team1,Team2,FT1,FT2 AUT_2016-2017.csv 34 | -------------------------------------------------------------------------------- /csvutils/getting-started-samples/start.sh: -------------------------------------------------------------------------------- 1 | ### 2 | # command line shell script getting started sample from the csvutils readme 3 | # see https://github.com/csv11/csvutils 4 | # 5 | 6 | ########################################## 7 | ## try help output of tools 8 | csvhead -h # or 9 | csvhead --help 10 | 11 | csvheader -h 12 | csvstat -h 13 | csvsplit -h 14 | csvcut -h 15 | 16 | #################################################### 17 | # Working with Comma-Separated Values (CSV) Datafile Examples 18 | 19 | csvhead ENG.csv 20 | csvheader ENG.csv 21 | csvstat -c Team1,Team2 ENG.csv 22 | 23 | ##################################################### 24 | # Split & Cut - Split One Datafile into Many or Cut / Reorder Columns 25 | 26 | csvstat -c Season AUT.csv 27 | csvsplit -c Season AUT.csv 28 | csvcut -c Date,Team1,Team2,FT1,FT2 AUT_2016-2017.csv 29 | csvhead AUT_2016-2017.csv 30 | -------------------------------------------------------------------------------- /csvutils/lib/csvutils.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | require 'date' 4 | require 'fileutils' 5 | require 'optparse' 6 | 7 | 8 | require 'csvreader' 9 | 10 | 11 | 12 | ### 13 | # our own code 14 | require 'csvutils/version' # let version always go first 15 | require 'csvutils/utils' 16 | require 'csvutils/split' 17 | require 'csvutils/cut' 18 | require 'csvutils/test' 19 | require 'csvutils/stat' 20 | require 'csvutils/header' 21 | require 'csvutils/head' 22 | 23 | require 'csvutils/commands/head' 24 | require 'csvutils/commands/header' 25 | require 'csvutils/commands/stat' 26 | require 'csvutils/commands/cut' 27 | require 'csvutils/commands/split' 28 | 29 | 30 | 31 | # say hello 32 | puts CsvUtils.banner if $DEBUG || (defined?($RUBYCOCO_DEBUG) && $RUBYCOCO_DEBUG) 33 | -------------------------------------------------------------------------------- /csvutils/lib/csvutils/commands/cut.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | class CsvTool 5 | 6 | ## command line tools 7 | def self.cut( args ) 8 | 9 | config = { columns: [] } 10 | 11 | parser = OptionParser.new do |opts| 12 | opts.banner = "Usage: csvcut [OPTS] source [dest]" 13 | 14 | opts.on("-c", "--columns=COLUMNS", "Name of header columns" ) do |columns| 15 | config[:columns] = columns.split(/[,|;]/) ## allow differnt separators 16 | end 17 | 18 | opts.on("-h", "--help", "Prints this help") do 19 | puts opts 20 | exit 21 | end 22 | end 23 | 24 | parser.parse!( args ) 25 | 26 | ## pp config 27 | ## pp args 28 | 29 | source = args[0] 30 | dest = args[1] || source ## default to same as source (note: overwrites datafile in place!!!) 31 | 32 | unless args[0] 33 | puts "** error: arg missing - source filepath required - #{args.inspect}" 34 | exit 1 35 | end 36 | 37 | columns = config[:columns] 38 | 39 | CsvUtils.cut( source, *columns, output: dest ) 40 | end 41 | 42 | 43 | end # class CsvTool 44 | -------------------------------------------------------------------------------- /csvutils/lib/csvutils/commands/head.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | class CsvTool 5 | 6 | ## command line tools 7 | def self.head( args ) 8 | 9 | config = { n: 4 } 10 | 11 | parser = OptionParser.new do |opts| 12 | opts.banner = "Usage: csvhead [OPTS] datafile ..." 13 | 14 | opts.on("-n", "--num=NUM", "Number of rows" ) do |num| 15 | config[:n] = num.to_i 16 | end 17 | 18 | opts.on("-h", "--help", "Prints this help") do 19 | puts opts 20 | exit 21 | end 22 | end 23 | 24 | parser.parse!( args ) 25 | 26 | ## pp config 27 | ## pp args 28 | 29 | args.each do |arg| 30 | path = arg 31 | n = config[:n] 32 | 33 | puts "== #{File.basename(path)} (#{File.dirname(path)}) ==" 34 | puts 35 | CsvUtils.head( path, n: n ) 36 | puts 37 | end # each arg 38 | end 39 | 40 | end # class CsvTool 41 | -------------------------------------------------------------------------------- /csvutils/lib/csvutils/commands/header.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | class CsvTool 5 | 6 | ## command line tools 7 | def self.header( args ) 8 | 9 | config = {} 10 | 11 | parser = OptionParser.new do |opts| 12 | opts.banner = "Usage: csvheader [OPTS] datafile ..." 13 | 14 | opts.on("-h", "--help", "Prints this help") do 15 | puts opts 16 | exit 17 | end 18 | end 19 | 20 | parser.parse!( args ) 21 | 22 | ## pp config 23 | ## pp args 24 | 25 | args.each do |arg| 26 | path = arg 27 | 28 | puts "== #{File.basename(path)} (#{File.dirname(path)}) ==" 29 | puts 30 | CsvUtils.pp_header( CsvUtils.header( path ) ) 31 | puts 32 | end # each arg 33 | end 34 | 35 | end # class CsvTool 36 | -------------------------------------------------------------------------------- /csvutils/lib/csvutils/commands/split.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | class CsvTool 5 | 6 | ## command line tools 7 | def self.split( args ) 8 | 9 | config = { columns: [] } 10 | 11 | parser = OptionParser.new do |opts| 12 | opts.banner = "Usage: csvsplit [OPTS] datafile ..." 13 | 14 | opts.on("-c", "--columns=COLUMNS", "Name of header columns" ) do |columns| 15 | config[:columns] = columns.split(/[,|;]/) ## allow differnt separators 16 | end 17 | 18 | opts.on("-h", "--help", "Prints this help") do 19 | puts opts 20 | exit 21 | end 22 | end 23 | 24 | parser.parse!( args ) 25 | 26 | ## pp config 27 | ## pp args 28 | 29 | args.each do |arg| 30 | path = arg 31 | columns = config[:columns] 32 | 33 | puts "== #{File.basename(path)} (#{File.dirname(path)}) ==" 34 | puts 35 | CsvUtils.split( path, *columns ) 36 | puts 37 | end 38 | end 39 | 40 | 41 | end # class CsvTool 42 | -------------------------------------------------------------------------------- /csvutils/lib/csvutils/commands/stat.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | class CsvTool 5 | 6 | ## command line tools 7 | def self.stat( args ) 8 | 9 | config = { columns: [] } 10 | 11 | parser = OptionParser.new do |opts| 12 | opts.banner = "Usage: csvstat [OPTS] datafile ..." 13 | 14 | opts.on("-c", "--columns=COLUMNS", "Name of header columns" ) do |columns| 15 | config[:columns] = columns.split(/[,|;]/) ## allow differnt separators 16 | end 17 | 18 | opts.on("-h", "--help", "Prints this help") do 19 | puts opts 20 | exit 21 | end 22 | end 23 | 24 | parser.parse!( args ) 25 | 26 | ## pp config 27 | ## pp args 28 | 29 | args.each do |arg| 30 | path = arg 31 | columns = config[:columns] 32 | 33 | puts "== #{File.basename(path)} (#{File.dirname(path)}) ==" 34 | puts 35 | CsvUtils.stat( path, *columns ) 36 | puts 37 | end # each arg 38 | end 39 | 40 | 41 | end # class CsvTool 42 | -------------------------------------------------------------------------------- /csvutils/lib/csvutils/cut.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ## check/use class or module ??? 4 | 5 | 6 | class CsvUtils 7 | 8 | def self.cut( path, *columns, output: path, sep: ',' ) 9 | 10 | inpath = path 11 | outpath = output # note: output defaults to inpath (overwrites datafile in-place!!!) 12 | 13 | puts "cvscut in: >#{inpath}< out: >#{outpath}<" 14 | 15 | ## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"] 16 | puts "columns:" 17 | pp columns 18 | 19 | csv_options = { sep: sep } 20 | 21 | recs = CsvHash.read( inpath, csv_options ) 22 | 23 | 24 | ## for convenience - make sure parent folders/directories exist 25 | FileUtils.mkdir_p( File.dirname( outpath )) unless Dir.exists?( File.dirname( outpath )) 26 | 27 | 28 | ## note: 29 | ## todo/fix: add two trailing spaces for pretty printing - why? why not? 30 | File.open( outpath, 'w:utf-8' ) do |out| 31 | out << csv_row( *columns, sep: sep ).join( sep ) ## for row add headers/columns 32 | out << "\n" 33 | recs.each do |rec| 34 | values = columns.map { |col| rec[col] } ## find data for column 35 | out << csv_row( *values, sep: sep ).join( sep ) 36 | out << "\n" 37 | end 38 | end 39 | 40 | puts 'Done.' 41 | end ## method self.cut 42 | 43 | end # class CsvUtils 44 | -------------------------------------------------------------------------------- /csvutils/lib/csvutils/head.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | class CsvUtils 5 | 6 | ## test or dry run to check if rows can get read/scanned 7 | def self.head( path, sep: ',', n: 4 ) 8 | i = 0 9 | csv_options = { sep: sep } 10 | 11 | CsvHash.foreach( path, csv_options ) do |rec| 12 | i += 1 13 | 14 | pp rec 15 | 16 | break if i >= n 17 | end 18 | 19 | puts " #{i} records" 20 | end 21 | 22 | end # class CsvUtils 23 | -------------------------------------------------------------------------------- /csvutils/lib/csvutils/header.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | class CsvUtils 5 | 6 | def self.header( path, sep: ',', debug: false ) ## use header or headers - or use both (with alias)? 7 | row = CsvReader.header( path, sep: sep ) 8 | 9 | pp row if debug 10 | ## e.g.: 11 | # "Country,League,Season,Date,Time,Home,Away,HG,AG,Res,PH,PD,PA,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA\n" 12 | 13 | row 14 | end # method self.header 15 | 16 | end # class CsvUtils 17 | -------------------------------------------------------------------------------- /csvutils/lib/csvutils/split.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | 5 | 6 | class CsvUtils 7 | 8 | def self.split( path, *columns, sep: ',', &blk ) 9 | 10 | puts "cvssplit in: >#{path}<" 11 | 12 | ## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"] 13 | puts "columns:" 14 | pp columns 15 | 16 | ## note: do NOT use headers 17 | ## for easy sorting use "plain" array of array for records 18 | csv_options = { sep: sep } 19 | 20 | data = CsvReader.read( path, csv_options ) 21 | 22 | ## todo/check: (auto-) strip (remove all leading and trailing spaces) 23 | ## from all values - why? why not? 24 | ## check if CSV.parse has an option for it? 25 | 26 | headers = data.shift ## remove top array item (that is, row with headers) 27 | 28 | header_mapping = {} 29 | headers.each_with_index { | header,i | header_mapping[header]=i } 30 | pp header_mapping 31 | 32 | ## map columns to array indices e.g. ['Season', 'Div'] => [1,2] 33 | column_indices = columns.map { |col| header_mapping[col] } 34 | pp column_indices 35 | 36 | 37 | ################################################### 38 | ## note: sort data by columns (before split) 39 | data = data.sort do |row1,row2| 40 | res = 0 41 | column_indices.each do |col| 42 | res = row1[col] <=> row2[col] if res == 0 43 | end 44 | res 45 | end 46 | 47 | chunk = [] 48 | data.each_with_index do |row,i| 49 | chunk << row 50 | 51 | next_row = data[i+1] 52 | 53 | changed = false 54 | if next_row.nil? ## end-of-file 55 | changed = true 56 | else 57 | column_indices.each do |col| 58 | if row[col] != next_row[col] 59 | changed = true 60 | break ## out of each column_indices loop 61 | end 62 | end 63 | end 64 | 65 | if changed 66 | puts "save new chunk:" 67 | column_values = column_indices.map {|col| row[col] } 68 | pp column_values 69 | 70 | # note: add header(s) row upfront (as first row) to chunk (with unshift) 71 | chunk_with_headers = chunk.unshift( headers ) 72 | if blk 73 | yield( column_values, chunk_with_headers ) 74 | else 75 | ## auto-save (write-to-file) by default - why? why not? 76 | split_write( path, column_values, chunk_with_headers, sep: sep ) 77 | end 78 | 79 | chunk = [] ## reset chunk for next batch of records 80 | end 81 | end 82 | 83 | puts 'Done.' 84 | end ## method self.split 85 | 86 | 87 | def self.split_write( inpath, values, chunk, sep: ) 88 | basename = File.basename( inpath, '.*' ) 89 | dirname = File.dirname( inpath ) 90 | 91 | ## check/change invalid filename chars 92 | ## e.g. change 1990/91 to 1990-91 93 | extraname = values.map {|value| value.tr('/','-')}.join('~') 94 | 95 | outpath = "#{dirname}/#{basename}_#{extraname}.csv" 96 | puts "saving >#{basename}_#{extraname}.csv<..." 97 | 98 | File.open( outpath, 'w:utf-8' ) do |out| 99 | chunk.each do |row| 100 | out << csv_row( *row, sep: sep ).join( sep ) 101 | out << "\n" 102 | end 103 | end 104 | end 105 | 106 | end # class CsvUtils 107 | -------------------------------------------------------------------------------- /csvutils/lib/csvutils/stat.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | class CsvUtils 5 | 6 | def self.stat( path, *columns, sep: ',', debug: false ) 7 | 8 | csv_options = { sep: sep } 9 | 10 | values = {} 11 | nulls = {} 12 | # check 1) nulls/nils (e.g. empty strings ""), 13 | # 2) not/appliation or available n/a NA or NaN or ... 14 | # 3) missing - e.g. ? 15 | 16 | i=0 17 | CsvHash.foreach( path, csv_options ) do |rec| 18 | i += 1 19 | 20 | pp rec if i == 1 && debug 21 | 22 | print '.' if i % 100 == 0 23 | 24 | ## collect unique values for passed in columns 25 | columns.each do |col| 26 | value = rec[col] ## note: value might be nil!!!!! 27 | 28 | values[col] ||= Hash.new(0) 29 | values[col][ value ? value : '<nil>' ] +=1 30 | end 31 | 32 | ## alway track nulls - why? why not 33 | rec.each do |col,value| 34 | ## if value.nil? ## todo/check - nil value possible (not always empty string - why? why not?) 35 | ## puts "[debug] nil value in row:" 36 | ## puts "#{col} = #{value.inspect} : #{value.class.name}" 37 | ## end 38 | 39 | if value.nil? 40 | nulls[col] ||= Hash.new(0) 41 | nulls[col]['nil'] +=1 42 | elsif value.empty? 43 | nulls[col] ||= Hash.new(0) 44 | nulls[col]['empty'] +=1 45 | elsif ['na', 'n/a', '-'].include?( value.downcase ) 46 | nulls[col] ||= Hash.new(0) 47 | nulls[col]['na'] +=1 48 | elsif value == '?' ## check for (?) e.g. value.include?( '(?)') - why? why not? 49 | nulls[col] ||= Hash.new(0) 50 | nulls[col]['?'] +=1 51 | else 52 | # do nothing; "regular" value 53 | end 54 | end 55 | end 56 | 57 | puts " #{i} rows" 58 | puts 59 | puts " nils/nulls :: empty strings :: na / n/a / undefined :: missing (?):" 60 | puts " #{nulls.inspect}" 61 | puts 62 | 63 | ## dump headers first (first row with names of columns) 64 | headers = header( path, sep: sep, debug: debug ) 65 | pp_header( headers ) ## pretty print header columns 66 | puts 67 | 68 | if values.any? 69 | ## pretty print (pp) / dump unique values for passed in columns 70 | values.each do |col,h| 71 | puts " column >#{col}< #{h.size} unique values:" 72 | ## sort by name/value for now (not frequency) - change - why? why not? 73 | sorted_values = h.to_a.sort {|l,r| l[0] <=> r[0] } 74 | sorted_values.each do |rec| 75 | puts " #{rec[1]} x #{rec[0]}" 76 | end 77 | end 78 | end 79 | end # method self.stat 80 | 81 | end # class CsvUtils 82 | -------------------------------------------------------------------------------- /csvutils/lib/csvutils/test.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | class CsvUtils 5 | 6 | ## test or dry run to check if rows can get read/scanned 7 | def self.test( path, sep: ',' ) 8 | i = 0 9 | csv_options = { sep: sep } 10 | 11 | CsvHash.foreach( path, csv_options ) do |rec| 12 | i += 1 13 | print '.' if i % 100 == 0 14 | end 15 | 16 | puts " #{i} rows" 17 | end 18 | 19 | end # class CsvUtils 20 | -------------------------------------------------------------------------------- /csvutils/lib/csvutils/utils.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | class CsvUtils 5 | 6 | def self.pp_header( headers ) ## check: rename to print_headers or prettyprint_header - why? why not? 7 | puts "#{headers.size} columns:" 8 | headers.each_with_index do |header,i| 9 | puts " #{i+1}: #{header}" 10 | end 11 | end 12 | 13 | 14 | ################### 15 | ## (simple) helper for "csv-encoding" values / row 16 | ## 17 | ## todo: check for newline in value too? why? why not? 18 | def self.csv_row( *values, sep: ',' ) 19 | values.map do |value| 20 | if value && (value.index( sep ) || value.index('"')) 21 | ## double quotes and enclose in double qoutes 22 | value = %Q{"#{value.gsub('"', '""')}"} 23 | else 24 | value 25 | end 26 | end 27 | end 28 | 29 | end # class CsvUtils 30 | -------------------------------------------------------------------------------- /csvutils/lib/csvutils/version.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | ## note: for now CsvUtils is a class!!! NOT a module - change - why? why not? 5 | class CsvUtils 6 | 7 | MAJOR = 0 ## todo: namespace inside version or something - why? why not?? 8 | MINOR = 3 9 | PATCH = 0 10 | VERSION = [MAJOR,MINOR,PATCH].join('.') 11 | 12 | def self.version 13 | VERSION 14 | end 15 | 16 | def self.banner 17 | "csvutils/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]" 18 | end 19 | 20 | def self.root 21 | File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) ) 22 | end 23 | 24 | end # class CsvUtils 25 | -------------------------------------------------------------------------------- /csvutils/test/helper.rb: -------------------------------------------------------------------------------- 1 | ## $:.unshift(File.dirname(__FILE__)) 2 | 3 | ## minitest setup 4 | 5 | require 'minitest/autorun' 6 | 7 | 8 | ## our own code 9 | require 'csvutils' 10 | 11 | ## add test_data_dir helper 12 | class CsvUtils 13 | def self.test_data_dir 14 | "#{root}/datasets" 15 | end 16 | end 17 | -------------------------------------------------------------------------------- /csvutils/test/test_cut.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_cut.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestCut < MiniTest::Test 11 | 12 | def test_eng 13 | path = "#{CsvUtils.test_data_dir}/eng-england/2017-18/E0.csv" 14 | columns = [ 'HomeTeam', 'FTHG', 'FTAG', 'AwayTeam', 'Date' ] 15 | CsvUtils.cut( path, *columns, output: './tmp/cut_test_eng.csv' ) 16 | end 17 | 18 | def test_at 19 | path = "#{CsvUtils.test_data_dir}/at-austria/AUT.csv" 20 | columns = [ 'Home', 'HG', 'AG', 'Away', 'Date', 'Time' ] 21 | CsvUtils.cut( path, *columns, output: './tmp/cut_test_at.csv' ) 22 | end 23 | 24 | def test_de 25 | path = "#{CsvUtils.test_data_dir}/de-deutschland/bundesliga.csv" 26 | columns = ['Saison', 'Spieltag', 27 | 'Heim', 'Ergebnis', 'Gast', 'Datum', 'Uhrzeit' ] 28 | CsvUtils.cut( path, *columns, sep: ';', output: './tmp/cut_test_de.csv' ) 29 | end 30 | 31 | end # class TestHead 32 | -------------------------------------------------------------------------------- /csvutils/test/test_head.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_head.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestHead < MiniTest::Test 11 | 12 | def test_eng 13 | path = "#{CsvUtils.test_data_dir}/eng-england/2017-18/E0.csv" 14 | 15 | CsvUtils.head( path ) 16 | end 17 | 18 | def test_at 19 | path = "#{CsvUtils.test_data_dir}/at-austria/AUT.csv" 20 | 21 | CsvUtils.head( path ) 22 | end 23 | 24 | def test_de 25 | path = "#{CsvUtils.test_data_dir}/de-deutschland/bundesliga.csv" 26 | 27 | CsvUtils.head( path, sep: ';' ) 28 | end 29 | 30 | end # class TestHead 31 | -------------------------------------------------------------------------------- /csvutils/test/test_header.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_header.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestHeader < MiniTest::Test 11 | 12 | 13 | ## 14 | # Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR, 15 | # Referee,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR, 16 | # B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,LBH,LBD,LBA,PSH,PSD,PSA, 17 | # WHH,WHD,WHA,VCH,VCD,VCA, 18 | # Bb1X2,BbMxH,BbAvH,BbMxD,BbAvD,BbMxA,BbAvA,BbOU,BbMx>2.5,BbAv>2.5,BbMx<2.5,BbAv<2.5, 19 | # BbAH,BbAHh,BbMxAHH,BbAvAHH,BbMxAHA,BbAvAHA,PSCH,PSCD,PSCA 20 | def test_eng 21 | path = "#{CsvUtils.test_data_dir}/eng-england/2017-18/E0.csv" 22 | 23 | headers = CsvUtils.header( path ) 24 | pp headers 25 | 26 | assert_equal ['Date','HomeTeam','AwayTeam','FTHG','FTAG','HTHG','HTAG'], headers 27 | end 28 | 29 | ### 30 | # Country,League,Season,Date,Time,Home,Away,HG,AG, 31 | # Res,PH,PD,PA,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA 32 | def test_at 33 | path = "#{CsvUtils.test_data_dir}/at-austria/AUT.csv" 34 | 35 | headers = CsvUtils.header( path ) 36 | pp headers 37 | 38 | assert_equal ['Season','Date','Time','Home','Away','HG','AG'], headers 39 | end 40 | 41 | def test_de 42 | path = "#{CsvUtils.test_data_dir}/de-deutschland/bundesliga.csv" 43 | 44 | headers = CsvUtils.header( path, sep: ';' ) 45 | pp headers 46 | 47 | assert_equal ['Spielzeit','Saison','Spieltag','Datum','Uhrzeit','Heim','Gast','Ergebnis','Halbzeit'], headers 48 | end 49 | 50 | end # class TestHeader 51 | -------------------------------------------------------------------------------- /csvutils/test/test_misc.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_misc.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestMiscellaneous < MiniTest::Test 11 | 12 | def test_eng 13 | path = "#{CsvUtils.test_data_dir}/eng-england/2017-18/E0.csv" 14 | 15 | CsvUtils.test( path ) 16 | 17 | CsvUtils.stat( path ) 18 | CsvUtils.stat( path, 'HomeTeam', 'AwayTeam' ) 19 | 20 | assert true 21 | end 22 | 23 | def test_test_de 24 | path = "#{CsvUtils.test_data_dir}/de-deutschland/bundesliga.csv" 25 | 26 | CsvUtils.test( path, sep: ';' ) 27 | 28 | CsvUtils.stat( path, sep: ';' ) 29 | CsvUtils.stat( path, 'Spielzeit', 'Saison', 'Heim', 'Gast', sep: ';' ) 30 | 31 | assert true 32 | end 33 | 34 | def test_test_at 35 | path = "#{CsvUtils.test_data_dir}/at-austria/AUT.csv" 36 | 37 | CsvUtils.test( path ) 38 | 39 | CsvUtils.stat( path ) 40 | CsvUtils.stat( path, 'Season', 'Home', 'Away' ) 41 | assert true 42 | end 43 | 44 | end # class TestMiscellaneous 45 | -------------------------------------------------------------------------------- /csvutils/test/test_split.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_split.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestSplit < MiniTest::Test 11 | 12 | def test_eng 13 | path = "#{CsvUtils.test_data_dir}/eng-england/2017-18/E0.csv" 14 | columns = [ 'HomeTeam' ] 15 | CsvUtils.split( path, *columns ) do |values, chunk| 16 | pp values 17 | pp chunk 18 | end 19 | end 20 | 21 | 22 | def test_de 23 | path = "#{CsvUtils.test_data_dir}/de-deutschland/bundesliga.csv" 24 | columns = ['Saison', 'Spieltag' ] 25 | CsvUtils.split( path, *columns, sep: ';' ) do |values, chunk| 26 | pp values 27 | pp chunk 28 | end 29 | end 30 | 31 | end # class TestSplit 32 | -------------------------------------------------------------------------------- /csvutils/test/test_version.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_version.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestVersion < MiniTest::Test 11 | 12 | def test_version 13 | pp CsvUtils::VERSION 14 | pp CsvUtils.banner 15 | pp CsvUtils.root 16 | 17 | assert true ## assume ok if we get here 18 | end 19 | 20 | end # class TestVersion 21 | -------------------------------------------------------------------------------- /csvyaml/.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | *.rbc 3 | /.config 4 | /coverage/ 5 | /InstalledFiles 6 | /pkg/ 7 | /spec/reports/ 8 | /test/tmp/ 9 | /test/version_tmp/ 10 | /tmp/ 11 | 12 | ## Specific to RubyMotion: 13 | .dat* 14 | .repl_history 15 | build/ 16 | 17 | ## Documentation cache and generated files: 18 | /.yardoc/ 19 | /_yardoc/ 20 | /doc/ 21 | /rdoc/ 22 | 23 | ## Environment normalisation: 24 | /.bundle/ 25 | /vendor/bundle 26 | /lib/bundler/man/ 27 | 28 | # for a library or gem, you might want to ignore these files since the code is 29 | # intended to run in multiple environments; otherwise, check them in: 30 | # Gemfile.lock 31 | # .ruby-version 32 | # .ruby-gemset 33 | 34 | # unless supporting rvm < 1.11.0 or doing something fancy, ignore this: 35 | .rvmrc 36 | -------------------------------------------------------------------------------- /csvyaml/HISTORY.md: -------------------------------------------------------------------------------- 1 | ### 0.0.1 / 2018-10-18 2 | 3 | * Everything is new. First release 4 | -------------------------------------------------------------------------------- /csvyaml/Manifest.txt: -------------------------------------------------------------------------------- 1 | HISTORY.md 2 | LICENSE.md 3 | Manifest.txt 4 | README.md 5 | Rakefile 6 | datasets/hello.yaml.csv 7 | datasets/hello11.yaml.csv 8 | lib/csvyaml.rb 9 | lib/csvyaml/parser.rb 10 | lib/csvyaml/version.rb 11 | test/helper.rb 12 | test/test_parser.rb 13 | test/test_parser_misc.rb 14 | -------------------------------------------------------------------------------- /csvyaml/Rakefile: -------------------------------------------------------------------------------- 1 | require 'hoe' 2 | require './lib/csvyaml/version.rb' 3 | 4 | Hoe.spec 'csvyaml' do 5 | 6 | self.version = CsvYaml::VERSION 7 | 8 | self.summary = "csvyaml - read tabular data in the CSV <3 YAML format, that is, comma-separated values CSV (line-by-line) records with yaml ain't markup language (YAML) encoding rules" 9 | self.description = summary 10 | 11 | self.urls = ['https://github.com/csvreader/csvyaml'] 12 | 13 | self.author = 'Gerald Bauer' 14 | self.email = 'wwwmake@googlegroups.com' 15 | 16 | # switch extension to .markdown for gihub formatting 17 | self.readme_file = 'README.md' 18 | self.history_file = 'HISTORY.md' 19 | 20 | self.extra_deps = [ 21 | ] 22 | 23 | self.licenses = ['Public Domain'] 24 | 25 | self.spec_extras = { 26 | required_ruby_version: '>= 2.2.2' 27 | } 28 | 29 | end 30 | -------------------------------------------------------------------------------- /csvyaml/datasets/hello.yaml.csv: -------------------------------------------------------------------------------- 1 | 1,John,12 Totem Rd. Aspen,true 2 | 2,Bob,null,false 3 | 3,Sue,"Bigsby, 345 Carnival, WA 23009",false 4 | -------------------------------------------------------------------------------- /csvyaml/datasets/hello11.yaml.csv: -------------------------------------------------------------------------------- 1 | # hello world 2 | 3 | 1, John, 12 Totem Rd. Aspen, true 4 | 2, Bob, null, false 5 | 3, Sue, "Bigsby, 345 Carnival, WA 23009", false 6 | -------------------------------------------------------------------------------- /csvyaml/lib/csvyaml.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | require 'pp' 4 | require 'yaml' 5 | require 'logger' 6 | 7 | ## our own code 8 | require 'csvyaml/version' # note: let version always go first 9 | require 'csvyaml/parser' 10 | 11 | 12 | 13 | ## add some "alternative" shortcut aliases 14 | CSV_YAML = CsvYaml 15 | CSVYAML = CsvYaml 16 | CSVY = CsvYaml 17 | CsvY = CsvYaml 18 | 19 | 20 | # say hello 21 | puts CsvYaml.banner if $DEBUG || (defined?($RUBYCOCO_DEBUG) && $RUBYCOCO_DEBUG) 22 | -------------------------------------------------------------------------------- /csvyaml/lib/csvyaml/version.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | class CsvYaml 5 | 6 | MAJOR = 1 7 | MINOR = 0 8 | PATCH = 0 9 | VERSION = [MAJOR,MINOR,PATCH].join('.') 10 | 11 | 12 | def self.version 13 | VERSION 14 | end 15 | 16 | def self.banner 17 | "csvyaml/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]" 18 | end 19 | 20 | def self.root 21 | "#{File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) )}" 22 | end 23 | 24 | end # module CsvYaml 25 | -------------------------------------------------------------------------------- /csvyaml/test/helper.rb: -------------------------------------------------------------------------------- 1 | ## $:.unshift(File.dirname(__FILE__)) 2 | 3 | ## minitest setup 4 | 5 | require 'minitest/autorun' 6 | 7 | 8 | ## our own code 9 | require 'csvyaml' 10 | 11 | 12 | ## add test_data_dir helper 13 | class CsvYaml 14 | def self.test_data_dir 15 | "#{root}/datasets" 16 | end 17 | end 18 | 19 | 20 | CsvYaml.logger.level = :debug ## turn on "global" logging 21 | -------------------------------------------------------------------------------- /csvyaml/test/test_parser.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_parser.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestParser < MiniTest::Test 11 | 12 | 13 | def parser 14 | CsvYaml 15 | end 16 | 17 | 18 | def records ## "standard" records for testing 19 | [[1, "John", "12 Totem Rd. Aspen", true], 20 | [2, "Bob", nil, false], 21 | [3, "Sue", "Bigsby, 345 Carnival, WA 23009", false]] 22 | end 23 | 24 | 25 | 26 | def test_parse 27 | assert_equal records, parser.parse( <<TXT ) 28 | 1,John,12 Totem Rd. Aspen,true 29 | 2,Bob,null,false 30 | 3,Sue,"Bigsby, 345 Carnival, WA 23009",false 31 | TXT 32 | 33 | assert_equal records, parser.parse( <<TXT ) 34 | # hello world 35 | 36 | 1,John,12 Totem Rd. Aspen,true 37 | 2,Bob,null,false 38 | 3,Sue,"Bigsby, 345 Carnival, WA 23009",false 39 | TXT 40 | 41 | assert_equal records, parser.parse( <<TXT ) 42 | # hello world (pretty printed) 43 | 44 | 1, John, 12 Totem Rd. Aspen, true 45 | 2, Bob, null, false 46 | 3, Sue, "Bigsby, 345 Carnival, WA 23009", false 47 | 48 | # try more comments and empty lines 49 | 50 | TXT 51 | 52 | 53 | txt =<<TXT 54 | # hello world 55 | 1,John,12 Totem Rd. Aspen,true 56 | 2,Bob,null,false 57 | 3,Sue,"Bigsby, 345 Carnival, WA 23009",false 58 | TXT 59 | 60 | recs = [] 61 | parser.parse( txt ) { |rec| recs << rec } 62 | assert_equal records, recs 63 | end 64 | 65 | 66 | def test_read 67 | assert_equal records, parser.read( "#{CsvYaml.test_data_dir}/hello.yaml.csv" ) 68 | assert_equal records, parser.read( "#{CsvYaml.test_data_dir}/hello11.yaml.csv" ) 69 | end 70 | 71 | 72 | def test_open 73 | assert_equal records, parser.open( "#{CsvYaml.test_data_dir}/hello.yaml.csv", "r:bom|utf-8" ).read 74 | assert_equal records, parser.open( "#{CsvYaml.test_data_dir}/hello11.yaml.csv", "r:bom|utf-8" ).read 75 | end 76 | 77 | 78 | def test_foreach 79 | recs = [] 80 | parser.foreach( "#{CsvYaml.test_data_dir}/hello.yaml.csv" ) { |rec| recs << rec } 81 | assert_equal records, recs 82 | 83 | recs = [] 84 | parser.foreach( "#{CsvYaml.test_data_dir}/hello11.yaml.csv" ) { |rec| recs << rec } 85 | assert_equal records, recs 86 | end 87 | 88 | 89 | def test_enum 90 | csv = CsvYaml.new( <<TXT ) 91 | # hello world 92 | 93 | 1,John,12 Totem Rd. Aspen,true 94 | 2,Bob,null,false 95 | 3,Sue,"Bigsby, 345 Carnival, WA 23009",false 96 | TXT 97 | 98 | it = csv.to_enum 99 | assert_equal [1, "John", "12 Totem Rd. Aspen", true], it.next 100 | assert_equal [2, "Bob", nil, false], it.next 101 | assert_equal [3, "Sue", "Bigsby, 345 Carnival, WA 23009", false], it.next 102 | end 103 | 104 | end # class TestParser 105 | -------------------------------------------------------------------------------- /csvyaml/test/test_parser_misc.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_parser_misc.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestParserMisc < MiniTest::Test 11 | 12 | 13 | def parser 14 | CsvYaml 15 | end 16 | 17 | 18 | def test_quotes_and_commas 19 | assert_equal [ 20 | [1, "John", "12 Totem Rd., Aspen", true], 21 | [2, "Bob", nil, false], 22 | [3, "Sue", "\"Bigsby\", 345 Carnival, WA 23009", false] 23 | ], parser.parse( <<TXT ) 24 | 1,John,"12 Totem Rd., Aspen",true 25 | 2,Bob,null,false 26 | 3,Sue,"\\"Bigsby\\", 345 Carnival, WA 23009",false 27 | TXT 28 | end 29 | 30 | 31 | def test_arrays 32 | assert_equal [ 33 | [1, "directions", ["north","south","east","west"]], 34 | [2, "colors", ["red","green","blue"]], 35 | [3, "drinks", ["soda","water","tea","coffe"]], 36 | [4, "spells", []], 37 | ], parser.parse( <<TXT ) 38 | # CSV <3 YAML with array values 39 | 40 | 1,directions,[north,south,east,west] 41 | 2,colors,[red,green,blue] 42 | 3,drinks,[soda,water,tea,coffe] 43 | 4,spells,[] 44 | TXT 45 | end 46 | 47 | def test_misc 48 | ## note: 49 | ## in the csv <3 json source text backslash needs to get doubled / escaped twice e.g. 50 | ## \\" for quotes 51 | ## \\n for newlines and so on 52 | 53 | assert_equal [ 54 | ["index", "value1", "value2"], 55 | ["number", 1, 2], 56 | ["boolean", false, true], 57 | ["null", nil, "non null"], 58 | ["array of numbers", [1], [1,2]], 59 | ["simple object", {"a" => 1}, {"a" => 1, "b" => 2}], 60 | ["array with mixed objects", [1, nil,"ball"], [2,{"a" => 10, "b" => 20},"cube"]], 61 | ["string with quotes", "a\"b", "alert(\"Hi!\")"], 62 | ["string with bell&newlines","bell is \u0007","multi\nline\ntext"] 63 | ], parser.parse( <<TXT ) 64 | # CSV with all kinds of values 65 | 66 | index,value1,value2 67 | number,1,2 68 | boolean,false,true 69 | "null",null,non null 70 | array of numbers,[1],[1,2] 71 | ## note: key:value pairs need a space after colon!!! NOT working {a:1},{a:1, b:2} 72 | simple object,{a: 1},{a: 1, b: 2} 73 | ## note: again - key:value pairs need a space after colon!!! NOT working {a:10, b:20} 74 | array with mixed objects,[1,null,ball],[2,{a: 10,b: 20},cube] 75 | string with quotes,"a\\"b","alert(\\"Hi!\\")" 76 | string with bell&newlines,"bell is \\u0007","multi\\nline\\ntext" 77 | TXT 78 | 79 | end 80 | 81 | 82 | end # class TestParserMisc 83 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Comma-separated values (csv) scripts & tools docs 2 | 3 | 4 | 5 | ## Article Series - Why the CSV standard library is broken (and how to fix it) 6 | 7 | <!-- comment out introduction 8 | 9 | ### Introduction 10 | 11 | <details> 12 | <summary>Show/Hide Text</summary> 13 | 14 | 15 | Reminder: Dear [James Edward Gray II](https://twitter.com/JEG2), We love you. We thank you for your code. 16 | You're a genius. You're beautiful. [We stand on your shoulders. You're a giant.¹](https://en.wikipedia.org/wiki/Standing_on_the_shoulders_of_giants) 17 | Please, please, please - these articles are NOT about you. 18 | It's about the code and how to fix it. 19 | 20 | > I'm seeing from you is that we should not consider people's feelings when criticizing their work. [...] 21 | > Please take time to sit down [..] and offer an apology to the author of the CSV library. 22 | 23 | [I Apologize - Sorry, Sorry, Sorry - Why the standard CSV library author deserves our hugs and thank yous and why new giants are wanted »](sorry-sorry-sorry.md) 24 | 25 | 26 | --- 27 | ¹: stand on someone's shoulders - to make discoveries, insights, or progress due to the discoveries or previous work of those who have come before. 28 | 29 | </details> 30 | 31 | --> 32 | 33 | 34 | <!-- 35 | ### Content 36 | --> 37 | 38 | 39 | > "Criticism is something we can avoid easily by saying nothing, doing nothing, and being nothing." 40 | > 41 | > -- Aristotle 42 | 43 | 44 | _What's broken (and wrong, wrong, wrong) in the CSV standard library? Let's count the ways:_ 45 | 46 | - [**Part I or A (Simplistic) String#split Kludge vs A Purpose Built CSV Parser**](why-the-csv-stdlib-is-broken.md) 47 | - [**Part II or The Wonders of CSV Formats / Dialects**](csv-formats.md) 48 | - [**Part III or Returning a CSV Record as an Array? Hash? Struct? Row?**](csv-array-hash-struct.md) 49 | - [**Part IV or Numerics a.k.a. Auto-Magic Type Inference for Strings and Numbers**](csv-numerics.md) 50 | - [**Part V or Escaping the Stray Quote Error Hell - Do You Want Single, Double, or French Quotes With That Comma?**](csv-quotes.md) 51 | - [**Part VI or Fixes in Alternative CSV Libraries or Evolve or Die or Fast, Faster, Fasterer, Fastest**](csv-libraries.md) 52 | - [**Part VII or What's Your Type? Guess. Again. And Again. And Again. Guess What's a Schema For?**](csv-types.md) 53 | 54 | 55 | 56 | <!-- 57 | 58 | > "He has a right to criticize, who has a heart to help." 59 | > 60 | > -- Abraham Lincoln 61 | 62 | 63 | --> 64 | 65 | 66 | 67 | 68 | ## Migrate / Upgrade from ___ - Side-by-Side Examples 69 | 70 | - [**Migrate / Upgrade from Smarter CSV to CSV Reader - Side-by-Side Examples**](smarter-csv.md) 71 | 72 | -------------------------------------------------------------------------------- /docs/csv_stdlib_human.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | require 'csv' 4 | require 'pp' 5 | 6 | 7 | txt = <<TXT 8 | ####### 9 | # try with some comments 10 | # and blank lines even before header (first row) 11 | 12 | Brewery,City,Name,Abv 13 | Andechser Klosterbrauerei,Andechs,Doppelbock Dunkel,7% 14 | Augustiner Bräu München,München,Edelstoff,5.6% 15 | 16 | Bayerische Staatsbrauerei Weihenstephan, Freising, Hefe Weissbier, 5.4% 17 | Brauerei Spezial, Bamberg, Rauchbier Märzen, 5.1% 18 | Hacker-Pschorr Bräu, München, Münchner Dunkel, 5.0% 19 | Staatliches Hofbräuhaus München, München, Hofbräu Oktoberfestbier, 6.3% 20 | TXT 21 | 22 | 23 | COMMENTS_REGEX = /^\s*#/ 24 | BLANK_REGEX = /^\s*$/ ## skip all whitespace lines - note: use "" for a blank record 25 | SKIP_REGEX = Regexp.union( COMMENTS_REGEX, BLANK_REGEX ) 26 | 27 | ## register our own converters 28 | CSV::Converters[:strip] = ->(field) { field.strip } 29 | 30 | csv_opts = { 31 | skip_lines: SKIP_REGEX, 32 | skip_blanks: true, ## note: skips lines with no whitespaces only!! (e.g. line with space is NOT blank!!) 33 | :converters => [:strip], 34 | encoding: 'utf-8' 35 | } 36 | 37 | pp CSV.parse( txt, csv_opts ) 38 | 39 | # => [["Brewery", "City", "Name", "Abv"], 40 | # ["Andechser Klosterbrauerei", "Andechs", "Doppelbock Dunkel", "7%"], 41 | # ["Augustiner Br\u00E4u M\u00FCnchen", "M\u00FCnchen", "Edelstoff", "5.6%"], 42 | # ["Bayerische Staatsbrauerei Weihenstephan", "Freising", "Hefe Weissbier", "5.4%"], 43 | # ["Brauerei Spezial", "Bamberg", "Rauchbier M\u00E4rzen", "5.1%"], 44 | # ["Hacker-Pschorr Br\u00E4u", "M\u00FCnchen", "M\u00FCnchner Dunkel", "5.0%"], 45 | # ["Staatliches Hofbr\u00E4uhaus M\u00FCnchen", "M\u00FCnchen", "Hofbr\u00E4u Oktoberfestbier", "6.3%"]] 46 | -------------------------------------------------------------------------------- /docs/csv_stdlib_test.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | require 'csv' 4 | require 'pp' 5 | 6 | 7 | 8 | begin 9 | CSV.parse( %{1, "2"}) 10 | rescue CSV::MalformedCSVError => ex 11 | pp ex 12 | end 13 | # => #<CSV::MalformedCSVError: Illegal quoting in line 1.> 14 | 15 | begin 16 | CSV.parse( %{"3" , 4}) 17 | rescue CSV::MalformedCSVError => ex 18 | pp ex 19 | end 20 | # => #<CSV::MalformedCSVError: Unclosed quoted field on line 1.> 21 | 22 | pp CSV.parse( %{"","",,} ) 23 | # => ["", "", nil, nil] 24 | -------------------------------------------------------------------------------- /tabreader/.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | *.rbc 3 | /.config 4 | /coverage/ 5 | /InstalledFiles 6 | /pkg/ 7 | /spec/reports/ 8 | /test/tmp/ 9 | /test/version_tmp/ 10 | /tmp/ 11 | 12 | ## Specific to RubyMotion: 13 | .dat* 14 | .repl_history 15 | build/ 16 | 17 | ## Documentation cache and generated files: 18 | /.yardoc/ 19 | /_yardoc/ 20 | /doc/ 21 | /rdoc/ 22 | 23 | ## Environment normalisation: 24 | /.bundle/ 25 | /lib/bundler/man/ 26 | 27 | # for a library or gem, you might want to ignore these files since the code is 28 | # intended to run in multiple environments; otherwise, check them in: 29 | # Gemfile.lock 30 | # .ruby-version 31 | # .ruby-gemset 32 | 33 | # unless supporting rvm < 1.11.0 or doing something fancy, ignore this: 34 | .rvmrc 35 | 36 | 37 | #### 38 | # add some auto-generated getting started samples 39 | 40 | getting-started-samples/AUT_2016-2017.csv 41 | getting-started-samples/AUT_2017-2018.csv 42 | -------------------------------------------------------------------------------- /tabreader/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ### 0.0.1 / 2018-08-17 2 | 3 | * Everything is new. First release. 4 | -------------------------------------------------------------------------------- /tabreader/Manifest.txt: -------------------------------------------------------------------------------- 1 | HISTORY.md 2 | LICENSE.md 3 | Manifest.txt 4 | README.md 5 | Rakefile 6 | datasets/empty.tab 7 | datasets/test.tab 8 | lib/tabreader.rb 9 | lib/tabreader/reader.rb 10 | lib/tabreader/reader_hash.rb 11 | lib/tabreader/version.rb 12 | test/helper.rb 13 | test/test_reader.rb 14 | test/test_reader_hash.rb 15 | -------------------------------------------------------------------------------- /tabreader/NOTES.md: -------------------------------------------------------------------------------- 1 | # Notes 2 | 3 | ## Todos 4 | 5 | - [ ] add encoding option to all file convenience methods 6 | - [ ] add (auto) skip blank lines? 7 | - [ ] add (auto) skip comment lines (`#`) or (`%`)? why? why not? 8 | -------------------------------------------------------------------------------- /tabreader/Rakefile: -------------------------------------------------------------------------------- 1 | require 'hoe' 2 | require './lib/tabreader/version.rb' 3 | 4 | Hoe.spec 'tabreader' do 5 | 6 | self.version = TabReader::VERSION 7 | 8 | self.summary = "tabreader - read in tabular datafiles in text in the tabular (TAB) format" 9 | self.description = summary 10 | 11 | self.urls = ['https://github.com/csvreader/tabreader'] 12 | 13 | self.author = 'Gerald Bauer' 14 | self.email = 'wwwmake@googlegroups.com' 15 | 16 | # switch extension to .markdown for gihub formatting 17 | self.readme_file = 'README.md' 18 | self.history_file = 'HISTORY.md' 19 | 20 | self.licenses = ['Public Domain'] 21 | 22 | self.spec_extras = { 23 | required_ruby_version: '>= 2.2.2' 24 | } 25 | 26 | end 27 | -------------------------------------------------------------------------------- /tabreader/datasets/empty.tab: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rubycocos/csvreader/5f33603e8f2f8db57bebfb167561111317ed7d0a/tabreader/datasets/empty.tab -------------------------------------------------------------------------------- /tabreader/datasets/test.tab: -------------------------------------------------------------------------------- 1 | a b c 2 | 1 2 3 3 | 4 5 6 4 | {"one":1,"two":2,[3,4,5],null} 7 8 5 | test newline \n and tab \t literals 9 10 6 | -------------------------------------------------------------------------------- /tabreader/lib/tabreader.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | require 'pp' 5 | require 'logger' 6 | 7 | 8 | ### 9 | # our own code 10 | # check: use require_relative - why? why not? 11 | require 'tabreader/version' # let version always go first 12 | require 'tabreader/reader' 13 | require 'tabreader/reader_hash' 14 | 15 | 16 | 17 | ## add some "convenience" shortcuts 18 | TAB = TabReader 19 | Tab = TabReader 20 | TabHash = TabHashReader 21 | 22 | 23 | # say hello 24 | puts TabReader.banner if $DEBUG || (defined?($RUBYCOCO_DEBUG) && $RUBYCOCO_DEBUG) 25 | -------------------------------------------------------------------------------- /tabreader/lib/tabreader/reader_hash.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | class TabHashReader 4 | 5 | 6 | def self.open( path, mode=nil, headers: nil, &block ) ## rename path to filename or name - why? why not? 7 | 8 | ## note: default mode (if nil/not passed in) to 'r:bom|utf-8' 9 | f = File.open( path, mode ? mode : 'r:bom|utf-8' ) 10 | tab = new(f, headers: headers ) 11 | 12 | # handle blocks like Ruby's open() 13 | if block_given? 14 | begin 15 | block.call( tab ) 16 | ensure 17 | tab.close 18 | end 19 | else 20 | tab 21 | end 22 | end # method self.open 23 | 24 | 25 | def self.read( path, headers: nil ) 26 | open( path, headers: headers ) { |tab| tab.read } 27 | end 28 | 29 | 30 | 31 | def self.foreach( path, headers: nil, &block ) 32 | tab = open( path, headers: headers) 33 | 34 | if block_given? 35 | begin 36 | tab.each( &block ) 37 | ensure 38 | tab.close 39 | end 40 | else 41 | tab.to_enum ## note: caller (responsible) must close file!!! 42 | ## remove version without block given - why? why not? 43 | ## use Tab.open().to_enum or Tab.open().each 44 | ## or Tab.new( File.new() ).to_enum or Tab.new( File.new() ).each ??? 45 | end 46 | end # method self.foreach 47 | 48 | 49 | def self.parse( data, headers: nil, &block ) 50 | tab = new( data, headers: headers ) 51 | 52 | if block_given? 53 | tab.each( &block ) ## note: caller (responsible) must close file!!! - add autoclose - why? why not? 54 | else # slurp contents, if no block is given 55 | tab.read ## note: caller (responsible) must close file!!! - add autoclose - why? why not? 56 | end 57 | end # method self.parse 58 | 59 | 60 | 61 | 62 | def initialize( data, headers: nil ) 63 | raise ArgumentError.new( "Cannot parse nil as TAB" ) if data.nil? 64 | 65 | if data.is_a?( String ) 66 | @input = data # note: just needs each for each_line 67 | else ## assume io 68 | @input = data 69 | end 70 | 71 | ## pass in headers as array e.g. ['A', 'B', 'C'] 72 | @names = headers ? headers : nil 73 | end 74 | 75 | 76 | 77 | include Enumerable 78 | 79 | 80 | def each( &block ) 81 | 82 | ## todo/fix: 83 | ## add case for headers/names.size != values.size 84 | ## - add rest option? for if less headers than values (see python csv.DictReader - why? why not?) 85 | ## 86 | ## handle case with duplicate and empty header names etc. 87 | 88 | 89 | if block_given? 90 | TabReader.parse( @input ) do |values| 91 | if @names.nil? ## check for (first) headers row 92 | @names = values ## store header row / a.k.a. field/column names 93 | else ## "regular" record 94 | record = @names.zip( values ).to_h ## todo/fix: check for more values than names/headers!!! 95 | block.call( record ) 96 | end 97 | end 98 | else 99 | to_enum 100 | end 101 | end # method each 102 | 103 | def read() to_a; end # method read 104 | 105 | 106 | def close 107 | @input.close if @input.respond_to?(:close) ## note: string needs no close 108 | end 109 | 110 | 111 | end # class TabHashReader 112 | -------------------------------------------------------------------------------- /tabreader/lib/tabreader/version.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | 4 | ## note: for now TabReader is a class!!! NOT a module - change - why? why not? 5 | class TabReader 6 | 7 | MAJOR = 1 ## todo: namespace inside version or something - why? why not?? 8 | MINOR = 0 9 | PATCH = 1 10 | VERSION = [MAJOR,MINOR,PATCH].join('.') 11 | 12 | def self.version 13 | VERSION 14 | end 15 | 16 | def self.banner 17 | "tabreader/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]" 18 | end 19 | 20 | def self.root 21 | File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) ) 22 | end 23 | 24 | end # class TabReader 25 | -------------------------------------------------------------------------------- /tabreader/test/helper.rb: -------------------------------------------------------------------------------- 1 | ## $:.unshift(File.dirname(__FILE__)) 2 | 3 | ## minitest setup 4 | 5 | require 'minitest/autorun' 6 | 7 | 8 | ## our own code 9 | require 'tabreader' 10 | 11 | ## add test_data_dir helper 12 | class TabReader 13 | def self.test_data_dir 14 | "#{root}/datasets" 15 | end 16 | end 17 | 18 | 19 | 20 | TabReader.logger.level = :debug ## turn on "global" logging 21 | -------------------------------------------------------------------------------- /tabreader/test/test_reader.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_reader.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestReader < MiniTest::Test 11 | 12 | 13 | def test_parse 14 | 15 | txt1 = <<TXT 16 | a\tb\tc 17 | 1\t2\t3 18 | 4\t5\t6 19 | TXT 20 | 21 | txt2 = <<TXT 22 | a b c d 23 | 1 2 3 4 24 | 5 6 7 8 25 | TXT 26 | 27 | puts "== parse:" 28 | pp TabReader.parse( txt1 ) 29 | 30 | puts "== parse:" 31 | pp TabReader.parse( txt2 ) 32 | 33 | puts "== parse_line:" 34 | pp TabReader.parse_line( "1\t2\t3" ) 35 | 36 | puts "== parse_line:" 37 | pp TabReader.parse_line( "1 2 3 4" ) 38 | 39 | puts "== parse_line:" 40 | pp TabReader.parse_line( "1\t2\t3\r\n" ) 41 | 42 | assert true 43 | end 44 | 45 | 46 | def test_read 47 | 48 | puts "== read:" 49 | pp TabReader.read( "#{TabReader.test_data_dir}/test.tab" ) 50 | puts "== header:" 51 | pp TabReader.header( "#{TabReader.test_data_dir}/test.tab" ) 52 | puts "== foreach:" 53 | TabReader.foreach( "#{TabReader.test_data_dir}/test.tab" ) do |row| 54 | pp row 55 | end 56 | end 57 | 58 | 59 | def test_read_empty 60 | 61 | puts "== read (empty):" 62 | pp TabReader.read( "#{TabReader.test_data_dir}/empty.tab" ) 63 | puts "== header (empty):" 64 | pp TabReader.header( "#{TabReader.test_data_dir}/empty.tab" ) 65 | puts "== foreach (empty):" 66 | TabReader.foreach( "#{TabReader.test_data_dir}/empty.tab" ) do |row| 67 | pp row 68 | end 69 | puts "== parse (empty):" 70 | pp TabReader.parse( "" ) 71 | pp TabReader.parse_line( "" ) 72 | end 73 | 74 | end 75 | -------------------------------------------------------------------------------- /tabreader/test/test_reader_hash.rb: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | ### 4 | # to run use 5 | # ruby -I ./lib -I ./test test/test_reader_hash.rb 6 | 7 | 8 | require 'helper' 9 | 10 | class TestReader < MiniTest::Test 11 | 12 | 13 | def test_parse 14 | 15 | txt1 = <<TXT 16 | a\tb\tc 17 | 1\t2\t3 18 | 4\t5\t6 19 | TXT 20 | 21 | txt2 = <<TXT 22 | a b c d 23 | 1 2 3 4 24 | 5 6 7 8 25 | TXT 26 | 27 | puts "== parse:" 28 | pp TabHashReader.parse( txt1 ) 29 | 30 | puts "== parse:" 31 | pp TabHashReader.parse( txt2 ) 32 | 33 | assert true 34 | end 35 | 36 | 37 | def test_read 38 | 39 | puts "== read:" 40 | pp TabHashReader.read( "#{TabReader.test_data_dir}/test.tab" ) 41 | puts "== foreach:" 42 | TabHashReader.foreach( "#{TabReader.test_data_dir}/test.tab" ) do |row| 43 | pp row 44 | end 45 | end 46 | 47 | 48 | def test_read_empty 49 | 50 | puts "== read (empty):" 51 | pp TabHashReader.read( "#{TabReader.test_data_dir}/empty.tab" ) 52 | puts "== foreach (empty):" 53 | TabHashReader.foreach( "#{TabReader.test_data_dir}/empty.tab" ) do |row| 54 | pp row 55 | end 56 | puts "== parse (empty):" 57 | pp TabHashReader.parse( "" ) 58 | end 59 | 60 | end 61 | --------------------------------------------------------------------------------