├── testdata ├── claset.dsv ├── writeraw.exp ├── expected_skip.dat ├── expected.dat ├── input.dat ├── expected_merge_rows.dat ├── expected_merge_columns.dat ├── expected_simplemerge.dat ├── config.dsv ├── config_simpleread.dsv └── config_skip.dsv ├── .gitignore ├── config.go ├── claset_test.go ├── metadata_test.go ├── writerinterface.go ├── metadatainterface.go ├── configinterface.go ├── readererror.go ├── LICENSE ├── dsvinterface.go ├── dsv_test.go ├── data_test.go ├── writer_test.go ├── dsv.go ├── common_test.go ├── metadata.go ├── readerinterface.go ├── README.md ├── reader_test.go ├── writer.go └── reader.go /testdata/claset.dsv: -------------------------------------------------------------------------------- 1 | { 2 | "Input" :"input.dat" 3 | , "Rejected" :"rejected.dat" 4 | , "Skip" :1 5 | , "MaxRows" :2 6 | , "ClassIndex" :3 7 | } 8 | -------------------------------------------------------------------------------- /testdata/writeraw.exp: -------------------------------------------------------------------------------- 1 | 0,1,A 2 | 1,1.1,B 3 | 2,1.2,A 4 | 3,1.3,B 5 | 4,1.4,C 6 | 5,1.5,D 7 | 6,1.6,C 8 | 7,1.7,D 9 | 8,1.8,E 10 | 9,1.9,F 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | rejected.dat 2 | testdata/output.dat 3 | testdata/output_merge_columns.dat 4 | testdata/output_merge_rows.dat 5 | testdata/output_skip.dat 6 | testdata/rejected.dat 7 | testdata/writerawcolumns.out 8 | testdata/writerawrows.out 9 | -------------------------------------------------------------------------------- /testdata/expected_skip.dat: -------------------------------------------------------------------------------- 1 | 0.1;1;{{AB}};A-B# 2 | 0.02;2;{{BCD}};A-B-C# 3 | 0.003;3;{{A;B C,D}};A;B-C,D# 4 | 0.0004;4;{{A;B C D}};A;B-C,D# 5 | 0.000006;6;{{}};# 6 | 0.000000009;9;{{missing right-quote];8;0.00000008 7 | 9;"ok"-[[ok}};ok# 8 | 0.101;10;{{integer}};test# 9 | 0.123456789;123456789;{{real}};test# 10 | 13;13;{{string with}};string with# 11 | 14;14;{{string with]] escape}};string with" quote# 12 | -------------------------------------------------------------------------------- /testdata/expected.dat: -------------------------------------------------------------------------------- 1 | ID 1/A-B# {{AB}};1;0.1 2 | ID 2/A-B-C# {{BCD}};2;0.02 3 | ID 3/A;B-C,D# {{A;B C,D}};3;0.003 4 | ID 4/A;B-C,D# {{A;B C D}};4;0.0004 5 | ID 6/# {{}};6;0.000006 6 | ID 8/ok# {{missing right-quote];8;0.00000008 7 | 9;"ok"-[[ok}};9;0.000000009 8 | ID 10/test# {{integer}};10;0.101 9 | ID 12/test# {{real}};123456789;0.123456789 10 | ID 13/string with# {{string with}};13;13 11 | ID 14/string with" quote# {{string with]] escape}};14;14 12 | -------------------------------------------------------------------------------- /testdata/input.dat: -------------------------------------------------------------------------------- 1 | "id","name","value","integer";"real" 2 | 1;"A-B"-[[AB]];1;0.1 3 | 2;"A-B-C"-[[BCD]];2;0.02 4 | 3;"A;B-C,D"-[[A;B C,D]];3;0.003 5 | 4;"A;B-C,D"-[[A;B C D]];4;0.0004 6 | 5;"A;B-C,D-"[[A;B C D]];5;0.00005 7 | 6;""-[[]];6;0.000006 8 | 7;"ok"-[missing left-quote]];7;0.0000007 9 | 8;"ok"-[[missing right-quote];8;0.00000008 10 | 9;"ok"-[[ok]];9;0.000000009 11 | 10;"test"-[[integer]];010;0.101 12 | 11;"test"-[[integer]];1a;0.1001 13 | 12;"test"-[[real]];123456789;00.123456789 14 | 13;"string with" quote"-[[string with]];13;13.0 15 | 14;"string with\" quote"-[[string with\]] escape]];14;14.0 
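A note on the fixture above: each row of `input.dat` is tokenized by the `InputMetadata` in `testdata/config.dsv` (shown further below). A sketch of how the fields of line 2 line up under that configuration:

    1;"A-B"-[[AB]];1;0.1

    id      = 1     (Separator ";", Type "integer")
    name    = A-B   (LeftQuote and RightQuote "\"", then Separator "-")
    value   = AB    (LeftQuote "[[", RightQuote "]]", then Separator ";")
    integer = 1     (Separator ";", Type "integer")
    real    = 0.1   (Type "real", no separator: the rest of the line)

Lines 6, 8, and 12 deliberately violate this format (a misplaced quote, a missing left-quote, and the non-integer value `1a`), so those rows end up in `rejected.dat` and never appear in `expected.dat`. Line 9 lacks its closing `]]`, so the parser reads on into line 10 looking for it; that is why `expected.dat` shows those two input rows merged into a single record.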
16 | -------------------------------------------------------------------------------- /testdata/expected_merge_rows.dat: -------------------------------------------------------------------------------- 1 | 1 A-B AB 1 0.1 2 | 2 A-B-C BCD 2 0.02 3 | 3 A;B-C,D A;B C,D 3 0.003 4 | 4 A;B-C,D A;B C D 4 0.0004 5 | 6 6 0.000006 6 | 8 ok missing right-quote];8;0.00000008 7 | 9;"ok"-[[ok 9 0.000000009 8 | 10 test integer 10 0.101 9 | 12 test real 123456789 0.123456789 10 | 13 string with string with 13 13 11 | 14 string with" quote string with]] escape 14 14 12 | A-B AB 1 0.1 13 | A-B-C BCD 2 0.02 14 | A;B-C,D A;B C,D 3 0.003 15 | A;B-C,D A;B C D 4 0.0004 16 | 6 0.000006 17 | ok missing right-quote];8;0.00000008 18 | 9;"ok"-[[ok 9 0.000000009 19 | test integer 10 0.101 20 | test real 123456789 0.123456789 21 | string with string with 13 13 22 | string with" quote string with]] escape 14 14 23 | -------------------------------------------------------------------------------- /config.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan . All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dsv 6 | 7 | // 8 | // Config for working with DSV configuration. 9 | // 10 | type Config struct { 11 | // ConfigPath path to configuration file. 12 | ConfigPath string 13 | } 14 | 15 | // 16 | // GetConfigPath return the base path of configuration file. 17 | // 18 | func (cfg *Config) GetConfigPath() string { 19 | return cfg.ConfigPath 20 | } 21 | 22 | // 23 | // SetConfigPath for reading input and writing rejected file. 24 | // 25 | func (cfg *Config) SetConfigPath(dir string) { 26 | cfg.ConfigPath = dir 27 | } 28 | -------------------------------------------------------------------------------- /testdata/expected_merge_columns.dat: -------------------------------------------------------------------------------- 1 | 1 A-B AB 1 0.1 2 | 2 A-B-C BCD 2 0.02 3 | 3 A;B-C,D A;B C,D 3 0.003 4 | 4 A;B-C,D A;B C D 4 0.0004 5 | 6 6 0.000006 6 | 8 ok missing right-quote];8;0.00000008 7 | 9;"ok"-[[ok 9 0.000000009 8 | 10 test integer 10 0.101 9 | 12 test real 123456789 0.123456789 10 | 13 string with string with 13 13 11 | 14 string with" quote string with]] escape 14 14 12 | A-B AB 1 0.1 13 | A-B-C BCD 2 0.02 14 | A;B-C,D A;B C,D 3 0.003 15 | A;B-C,D A;B C D 4 0.0004 16 | 6 0.000006 17 | ok missing right-quote];8;0.00000008 18 | 9;"ok"-[[ok 9 0.000000009 19 | test integer 10 0.101 20 | test real 123456789 0.123456789 21 | string with string with 13 13 22 | string with" quote string with]] escape 14 14 23 | -------------------------------------------------------------------------------- /testdata/expected_simplemerge.dat: -------------------------------------------------------------------------------- 1 | ID 1/A-B# {{AB}};1;0.1 2 | ID 2/A-B-C# {{BCD}};2;0.02 3 | ID 3/A;B-C,D# {{A;B C,D}};3;0.003 4 | ID 4/A;B-C,D# {{A;B C D}};4;0.0004 5 | ID 6/# {{}};6;0.000006 6 | ID 8/ok# {{missing right-quote];8;0.00000008 7 | 9;"ok"-[[ok}};9;0.000000009 8 | ID 10/test# {{integer}};10;0.101 9 | ID 12/test# {{real}};123456789;0.123456789 10 | ID 13/string with# {{string with}};13;13 11 | ID 14/string with" quote# {{string with]] escape}};14;14 12 | ID 1/A-B# {{AB}};1;0.1 13 | ID 2/A-B-C# {{BCD}};2;0.02 14 | ID 3/A;B-C,D# {{A;B C,D}};3;0.003 15 | ID 4/A;B-C,D# {{A;B C D}};4;0.0004 16 | ID 6/# {{}};6;0.000006 17 | ID 8/ok# {{missing right-quote];8;0.00000008 18 | 
9;"ok"-[[ok}};9;0.000000009 19 | ID 10/test# {{integer}};10;0.101 20 | ID 12/test# {{real}};123456789;0.123456789 21 | ID 13/string with# {{string with}};13;13 22 | ID 14/string with" quote# {{string with]] escape}};14;14 23 | -------------------------------------------------------------------------------- /claset_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan . All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dsv_test 6 | 7 | import ( 8 | "github.com/shuLhan/dsv" 9 | "github.com/shuLhan/tabula" 10 | "testing" 11 | ) 12 | 13 | func TestReaderWithClaset(t *testing.T) { 14 | fcfg := "testdata/claset.dsv" 15 | 16 | claset := tabula.Claset{} 17 | 18 | _, e := dsv.NewReader(fcfg, &claset) 19 | if e != nil { 20 | t.Fatal(e) 21 | } 22 | 23 | assert(t, 3, claset.GetClassIndex(), true) 24 | 25 | claset.SetMajorityClass("regular") 26 | claset.SetMinorityClass("vandalism") 27 | 28 | clone := claset.Clone().(tabula.ClasetInterface) 29 | 30 | assert(t, 3, clone.GetClassIndex(), true) 31 | assert(t, "regular", clone.MajorityClass(), true) 32 | assert(t, "vandalism", clone.MinorityClass(), true) 33 | } 34 | -------------------------------------------------------------------------------- /metadata_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan . All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dsv_test 6 | 7 | import ( 8 | "github.com/shuLhan/dsv" 9 | "testing" 10 | ) 11 | 12 | func TestMetadataIsEqual(t *testing.T) { 13 | cases := []struct { 14 | in dsv.Metadata 15 | out dsv.Metadata 16 | result bool 17 | }{ 18 | { 19 | dsv.Metadata{ 20 | Name: "A", 21 | Separator: ",", 22 | }, 23 | dsv.Metadata{ 24 | Name: "A", 25 | Separator: ",", 26 | }, 27 | true, 28 | }, 29 | { 30 | dsv.Metadata{ 31 | Name: "A", 32 | Separator: ",", 33 | }, 34 | dsv.Metadata{ 35 | Name: "A", 36 | Separator: ";", 37 | }, 38 | false, 39 | }, 40 | } 41 | 42 | for _, c := range cases { 43 | r := c.in.IsEqual(&c.out) 44 | 45 | if r != c.result { 46 | t.Error("Test failed on ", c.in, c.out) 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /testdata/config.dsv: -------------------------------------------------------------------------------- 1 | { 2 | "Input" :"input.dat" 3 | , "Rejected" :"rejected.dat" 4 | , "Skip" :1 5 | , "MaxRows" :1 6 | , "InputMetadata" : 7 | [{ 8 | "Name" :"id" 9 | , "Separator" :";" 10 | , "Type" :"integer" 11 | },{ 12 | "Name" :"name" 13 | , "Separator" :"-" 14 | , "LeftQuote" :"\"" 15 | , "RightQuote" :"\"" 16 | },{ 17 | "Name" :"value" 18 | , "Separator" :";" 19 | , "LeftQuote" :"[[" 20 | , "RightQuote" :"]]" 21 | },{ 22 | "Name" :"integer" 23 | , "Type" :"integer" 24 | , "Separator" :";" 25 | },{ 26 | "Name" :"real" 27 | , "Type" :"real" 28 | }] 29 | , "Output" :"output.dat" 30 | , "OutputMetadata": 31 | [{ 32 | "Name" :"id" 33 | , "LeftQuote" :"ID " 34 | , "Separator" :"/" 35 | },{ 36 | "Name" :"name" 37 | , "RightQuote" :"#" 38 | , "Separator" :"\t" 39 | },{ 40 | "Name" :"value" 41 | , "Separator" :";" 42 | , "LeftQuote" :"{{" 43 | , "RightQuote" :"}}" 44 | },{ 45 | "Name" :"integer" 46 | , "Separator" :";" 47 | },{ 48 | "Name" :"real" 49 | }] 50 | } 51 | 
-------------------------------------------------------------------------------- /testdata/config_simpleread.dsv: -------------------------------------------------------------------------------- 1 | { 2 | "Input" :"input.dat" 3 | , "Rejected" :"rejected.dat" 4 | , "Skip" :1 5 | , "MaxRows" :-1 6 | , "InputMetadata" : 7 | [{ 8 | "Name" :"id" 9 | , "Separator" :";" 10 | , "Type" :"integer" 11 | },{ 12 | "Name" :"name" 13 | , "Separator" :"-" 14 | , "LeftQuote" :"\"" 15 | , "RightQuote" :"\"" 16 | },{ 17 | "Name" :"value" 18 | , "Separator" :";" 19 | , "LeftQuote" :"[[" 20 | , "RightQuote" :"]]" 21 | },{ 22 | "Name" :"integer" 23 | , "Type" :"integer" 24 | , "Separator" :";" 25 | },{ 26 | "Name" :"real" 27 | , "Type" :"real" 28 | }] 29 | , "Output" :"output.dat" 30 | , "OutputMetadata": 31 | [{ 32 | "Name" :"id" 33 | , "LeftQuote" :"ID " 34 | , "Separator" :"/" 35 | },{ 36 | "Name" :"name" 37 | , "RightQuote" :"#" 38 | , "Separator" :"\t" 39 | },{ 40 | "Name" :"value" 41 | , "Separator" :";" 42 | , "LeftQuote" :"{{" 43 | , "RightQuote" :"}}" 44 | },{ 45 | "Name" :"integer" 46 | , "Separator" :";" 47 | },{ 48 | "Name" :"real" 49 | }] 50 | } 51 | -------------------------------------------------------------------------------- /testdata/config_skip.dsv: -------------------------------------------------------------------------------- 1 | { 2 | "Input" :"input.dat" 3 | , "Rejected" :"rejected.dat" 4 | , "Skip" :1 5 | , "MaxRows" :1 6 | , "InputMetadata" : 7 | [{ 8 | "Name" :"id" 9 | , "Separator" :";" 10 | , "Type" :"integer" 11 | , "Skip" :true 12 | },{ 13 | "Name" :"name" 14 | , "Separator" :"-" 15 | , "LeftQuote" :"\"" 16 | , "RightQuote" :"\"" 17 | },{ 18 | "Name" :"value" 19 | , "Separator" :";" 20 | , "LeftQuote" :"[[" 21 | , "RightQuote" :"]]" 22 | },{ 23 | "Name" :"integer" 24 | , "Type" :"integer" 25 | , "Separator" :";" 26 | },{ 27 | "Name" :"real" 28 | , "Type" :"real" 29 | }] 30 | , "Output" :"testdata/output_skip.dat" 31 | , "OutputMetadata": 32 | [{ 33 | "Name" :"real" 34 | , "Separator" :";" 35 | },{ 36 | "Name" :"integer" 37 | , "Separator" :";" 38 | },{ 39 | "Name" :"value" 40 | , "Separator" :";" 41 | , "LeftQuote" :"{{" 42 | , "RightQuote" :"}}" 43 | },{ 44 | "Name" :"name" 45 | , "RightQuote" :"#" 46 | },{ 47 | "Name" :"id" 48 | , "LeftQuote" :"ID " 49 | , "Separator" :"/" 50 | },{ 51 | "Name" :"invalid" 52 | , "Separator" :";" 53 | }] 54 | } 55 | -------------------------------------------------------------------------------- /writerinterface.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan . All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dsv 6 | 7 | // 8 | // WriterInterface is an interface for writing DSV data to file. 9 | // 10 | type WriterInterface interface { 11 | ConfigInterface 12 | GetOutput() string 13 | SetOutput(path string) 14 | OpenOutput(file string) error 15 | Flush() error 16 | Close() error 17 | } 18 | 19 | // 20 | // OpenWriter configuration file and initialize the attributes. 21 | // 22 | func OpenWriter(writer WriterInterface, fcfg string) (e error) { 23 | e = ConfigOpen(writer, fcfg) 24 | if e != nil { 25 | return 26 | } 27 | 28 | return InitWriter(writer) 29 | } 30 | 31 | // 32 | // InitWriter initialize writer by opening output file. 
33 | // 34 | func InitWriter(writer WriterInterface) error { 35 | out := writer.GetOutput() 36 | 37 | // Exit immediately if no output file is defined in config. 38 | if "" == out { 39 | return ErrNoOutput 40 | } 41 | 42 | writer.SetOutput(ConfigCheckPath(writer, out)) 43 | 44 | return writer.OpenOutput("") 45 | } 46 | -------------------------------------------------------------------------------- /metadatainterface.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dsv 6 | 7 | // 8 | // MetadataInterface is the interface for field metadata. 9 | // This allows anyone to extend the DSV library, including the metadata. 10 | // 11 | type MetadataInterface interface { 12 | Init() 13 | GetName() string 14 | GetType() int 15 | GetTypeName() string 16 | GetLeftQuote() string 17 | GetRightQuote() string 18 | GetSeparator() string 19 | GetSkip() bool 20 | GetValueSpace() []string 21 | 22 | IsEqual(MetadataInterface) bool 23 | } 24 | 25 | // 26 | // FindMetadata, given a slice of metadata, find `mdin` in the slice that has the 27 | // same name, counting the index only over metadata whose Skip value is false. 28 | // If found, return the index and the matched metadata object. 29 | // If not found, return the number of non-skipped metadata as index and nil in `mdout`. 30 | // 31 | func FindMetadata(mdin MetadataInterface, mds []MetadataInterface) ( 32 | idx int, 33 | mdout MetadataInterface, 34 | ) { 35 | for _, md := range mds { 36 | if md.GetName() == mdin.GetName() { 37 | mdout = md 38 | break 39 | } 40 | if !md.GetSkip() { 41 | idx++ 42 | } 43 | } 44 | return idx, mdout 45 | } 46 | -------------------------------------------------------------------------------- /configinterface.go: -------------------------------------------------------------------------------- 1 | package dsv 2 | 3 | import ( 4 | "encoding/json" 5 | "io/ioutil" 6 | "path" 7 | ) 8 | 9 | // 10 | // ConfigInterface for reader and writer for initializing the config from JSON. 11 | // 12 | type ConfigInterface interface { 13 | GetConfigPath() string 14 | SetConfigPath(dir string) 15 | } 16 | 17 | // 18 | // ConfigOpen open the configuration file and initialize the attributes. 19 | // 20 | func ConfigOpen(rw interface{}, fcfg string) error { 21 | cfg, e := ioutil.ReadFile(fcfg) 22 | 23 | if nil != e { 24 | return e 25 | } 26 | 27 | // Get the directory where the config resides. 28 | rwconfig := rw.(ConfigInterface) 29 | rwconfig.SetConfigPath(path.Dir(fcfg)) 30 | 31 | return ConfigParse(rw, cfg) 32 | } 33 | 34 | // 35 | // ConfigParse from JSON string. 36 | // 37 | func ConfigParse(rw interface{}, cfg []byte) error { 38 | return json.Unmarshal(cfg, rw) 39 | } 40 | 41 | // 42 | // ConfigCheckPath if `file` contains no directory, prefix it with the config 43 | // path, otherwise return it unchanged. 44 | // 45 | func ConfigCheckPath(comin ConfigInterface, file string) string { 46 | dir := path.Dir(file) 47 | 48 | if dir == "." { 49 | cfgPath := comin.GetConfigPath() 50 | if cfgPath != "" && cfgPath != "." { 51 | return cfgPath + "/" + file 52 | } 53 | } 54 | 55 | // Leave the path unchanged. 56 | return file 57 | } 58 | -------------------------------------------------------------------------------- /readererror.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved.
2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dsv 6 | 7 | import ( 8 | "fmt" 9 | ) 10 | 11 | const ( 12 | _ = iota 13 | // EReadMissLeftQuote read error when no left-quote found on line. 14 | EReadMissLeftQuote 15 | // EReadMissRightQuote read error when no right-quote found on line. 16 | EReadMissRightQuote 17 | // EReadMissSeparator read error when no separator found on line. 18 | EReadMissSeparator 19 | // EReadLine error when reading line from file. 20 | EReadLine 21 | // EReadEOF error which indicated end-of-file. 22 | EReadEOF 23 | // ETypeConversion error when converting type from string to numeric or 24 | // vice versa. 25 | ETypeConversion 26 | ) 27 | 28 | // 29 | // ReaderError to handle error data and message. 30 | // 31 | type ReaderError struct { 32 | // T define type of error. 33 | T int 34 | // Func where error happened 35 | Func string 36 | // What cause the error? 37 | What string 38 | // Line define the line which cause error 39 | Line string 40 | // Pos character position which cause error 41 | Pos int 42 | // N line number 43 | N int 44 | } 45 | 46 | // 47 | // Error to string. 48 | // 49 | func (e *ReaderError) Error() string { 50 | return fmt.Sprintf("dsv.Reader.%-20s [%d:%d]: %-30s data:|%s|", e.Func, e.N, 51 | e.Pos, e.What, e.Line) 52 | } 53 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2015-2018, Shulhan (ms@kilabit.info). All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are met: 5 | 6 | * Redistributions of source code must retain the above copyright notice, this 7 | list of conditions and the following disclaimer. 8 | 9 | * Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | * Neither the name of Kilabit nor the names of its 14 | contributors may be used to endorse or promote products derived from 15 | this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY M.SHULHAN "AS IS" AND ANY EXPRESS OR IMPLIED 18 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 19 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 20 | EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 21 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 22 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 25 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 26 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
27 | 28 | --- --- --- --- --- --- --- 29 | 30 | TT TT II BB AAAA LLLLLL II KKKKKKKK 31 | TT TT II BB AA AA LL LL II KK 32 | TTTT II BB AA AA LL LL II KK 33 | TT TT II BB AAAAAAAA LLLLLL II KK 34 | TT TT II BB AA AA LL LL II KK 35 | TT TT II BBBBBBBB AA AA LLLLLL II KK 36 | 37 | Website: http://kilabit.info 38 | Contact: ms@kilabit.info 39 | -------------------------------------------------------------------------------- /dsvinterface.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dsv 6 | 7 | import ( 8 | "io" 9 | ) 10 | 11 | // 12 | // SimpleRead provide a shortcut to read data from file using configuration file 13 | // from `fcfg`. 14 | // Return the reader that contains the data, or an error if it failed. 15 | // The returned Reader object has already been closed, so if one needs to read all 16 | // data in it simply set the `MaxRows` to `-1` in the config file. 17 | // 18 | func SimpleRead(fcfg string, dataset interface{}) ( 19 | reader ReaderInterface, 20 | e error, 21 | ) { 22 | reader, e = NewReader(fcfg, dataset) 23 | 24 | if e != nil { 25 | return 26 | } 27 | 28 | _, e = Read(reader) 29 | if e != nil && e != io.EOF { 30 | return nil, e 31 | } 32 | 33 | e = reader.Close() 34 | 35 | return 36 | } 37 | 38 | // 39 | // SimpleWrite provide a shortcut to write data from reader using the output 40 | // metadata format and output file defined in `fcfg`. 41 | // 42 | func SimpleWrite(reader ReaderInterface, fcfg string) (nrows int, e error) { 43 | writer, e := NewWriter(fcfg) 44 | if e != nil { 45 | return 46 | } 47 | 48 | nrows, e = writer.Write(reader) 49 | if e != nil { 50 | return 51 | } 52 | 53 | e = writer.Close() 54 | 55 | return 56 | } 57 | 58 | // 59 | // SimpleMerge provide a shortcut to merge two dsv files using configuration 60 | // files passed in parameters. 61 | // 62 | // One must remember to set, 63 | // - "MaxRows" to -1 to be able to read all rows, in both input configurations, and 64 | // - "DatasetMode" to "columns" to speed up the process. 65 | // 66 | // This function return the merged reader, or an error if it failed. 67 | // 68 | func SimpleMerge(fin1, fin2 string, dataset1, dataset2 interface{}) ( 69 | ReaderInterface, 70 | error, 71 | ) { 72 | reader1, e := SimpleRead(fin1, dataset1) 73 | if e != nil { 74 | return nil, e 75 | } 76 | 77 | reader2, e := SimpleRead(fin2, dataset2) 78 | if e != nil { 79 | return nil, e 80 | } 81 | 82 | reader1.MergeColumns(reader2) 83 | 84 | return reader1, nil 85 | } 86 | -------------------------------------------------------------------------------- /dsv_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dsv_test 6 | 7 | import ( 8 | "github.com/shuLhan/dsv" 9 | "testing" 10 | ) 11 | 12 | // 13 | // doInit create the read-write object. 14 | // 15 | func doInit(t *testing.T, fcfg string) (rw *dsv.ReadWriter, e error) { 16 | // Initialize dsv 17 | rw, e = dsv.New(fcfg, nil) 18 | 19 | if nil != e { 20 | t.Fatal(e) 21 | } 22 | 23 | return 24 | } 25 | 26 | // 27 | // TestReadWriter test reading and writing DSV.
28 | // 29 | func TestReadWriter(t *testing.T) { 30 | rw, _ := doInit(t, "testdata/config.dsv") 31 | 32 | doReadWrite(t, &rw.Reader, &rw.Writer, expectation, true) 33 | 34 | e := rw.Close() 35 | if e != nil { 36 | t.Fatal(e) 37 | } 38 | 39 | assertFile(t, rw.GetOutput(), "testdata/expected.dat", true) 40 | } 41 | 42 | // 43 | // TestReadWriterAll test reading and writing all rows of DSV with MaxRows set to -1. 44 | // 45 | func TestReadWriterAll(t *testing.T) { 46 | rw, _ := doInit(t, "testdata/config.dsv") 47 | 48 | rw.SetMaxRows(-1) 49 | 50 | doReadWrite(t, &rw.Reader, &rw.Writer, expectation, false) 51 | 52 | e := rw.Close() 53 | if e != nil { 54 | t.Fatal(e) 55 | } 56 | 57 | assertFile(t, rw.GetOutput(), "testdata/expected.dat", true) 58 | } 59 | 60 | func TestSimpleReadWrite(t *testing.T) { 61 | fcfg := "testdata/config_simpleread.dsv" 62 | 63 | reader, e := dsv.SimpleRead(fcfg, nil) 64 | if e != nil { 65 | t.Fatal(e) 66 | } 67 | 68 | fout := "testdata/output.dat" 69 | fexp := "testdata/expected.dat" 70 | 71 | _, e = dsv.SimpleWrite(reader, fcfg) 72 | if e != nil { 73 | t.Fatal(e) 74 | } 75 | 76 | assertFile(t, fexp, fout, true) 77 | } 78 | 79 | func TestSimpleMerge(t *testing.T) { 80 | fcfg1 := "testdata/config_simpleread.dsv" 81 | fcfg2 := "testdata/config_simpleread.dsv" 82 | 83 | reader, e := dsv.SimpleMerge(fcfg1, fcfg2, nil, nil) 84 | if e != nil { 85 | t.Fatal(e) 86 | } 87 | 88 | _, e = dsv.SimpleWrite(reader, fcfg1) 89 | if e != nil { 90 | t.Fatal(e) 91 | } 92 | 93 | fexp := "testdata/expected_simplemerge.dat" 94 | fout := "testdata/output.dat" 95 | 96 | assertFile(t, fexp, fout, true) 97 | } 98 | -------------------------------------------------------------------------------- /data_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file.
4 | 5 | package dsv_test 6 | 7 | var expectation = []string{ 8 | "&[1 A-B AB 1 0.1]", 9 | "&[2 A-B-C BCD 2 0.02]", 10 | "&[3 A;B-C,D A;B C,D 3 0.003]", 11 | "&[4 A;B-C,D A;B C D 4 0.0004]", 12 | "&[6 6 0.000006]", 13 | "&[8 ok missing right-quote];8;0.00000008\n9;\"ok\"-[[ok 9 0.000000009]", 14 | "&[10 test integer 10 0.101]", 15 | "&[12 test real 123456789 0.123456789]", 16 | "&[13 string with string with 13 13]", 17 | "&[14 string with\" quote string with]] escape 14 14]", 18 | } 19 | 20 | var expSkip = []string{ 21 | "&[A-B AB 1 0.1]", 22 | "&[A-B-C BCD 2 0.02]", 23 | "&[A;B-C,D A;B C,D 3 0.003]", 24 | "&[A;B-C,D A;B C D 4 0.0004]", 25 | "&[ 6 0.000006]", 26 | "&[ok missing right-quote];8;0.00000008\n9;\"ok\"-[[ok 9 0.000000009]", 27 | "&[test integer 10 0.101]", 28 | "&[test real 123456789 0.123456789]", 29 | "&[string with string with 13 13]", 30 | "&[string with\" quote string with]] escape 14 14]", 31 | } 32 | 33 | var expSkipColumns = []string{ 34 | "[{name 0 0 [] [A-B]} {value 0 0 [] [AB]} {integer 1 0 [] [1]} {real 2 0 [] [0.1]}]", 35 | "[{name 0 0 [] [A-B-C]} {value 0 0 [] [BCD]} {integer 1 0 [] [2]} {real 2 0 [] [0.02]}]", 36 | "[{name 0 0 [] [A;B-C,D]} {value 0 0 [] [A;B C,D]} {integer 1 0 [] [3]} {real 2 0 [] [0.003]}]", 37 | "[{name 0 0 [] [A;B-C,D]} {value 0 0 [] [A;B C D]} {integer 1 0 [] [4]} {real 2 0 [] [0.0004]}]", 38 | "[{name 0 0 [] []} {value 0 0 [] []} {integer 1 0 [] [6]} {real 2 0 [] [0.000006]}]", 39 | "[{name 0 0 [] [ok]} {value 0 0 [] [missing right-quote];8;0.00000008\n9;\"ok\"-[[ok]} {integer 1 0 [] [9]} {real 2 0 [] [0.000000009]}]", 40 | "[{name 0 0 [] [test]} {value 0 0 [] [integer]} {integer 1 0 [] [10]} {real 2 0 [] [0.101]}]", 41 | "[{name 0 0 [] [test]} {value 0 0 [] [real]} {integer 1 0 [] [123456789]} {real 2 0 [] [0.123456789]}]", 42 | "[{name 0 0 [] [string with]} {value 0 0 [] [string with]} {integer 1 0 [] [13]} {real 2 0 [] [13]}]", 43 | "[{name 0 0 [] [string with\" quote]} {value 0 0 [] [string with]] escape]} {integer 1 0 [] [14]} {real 2 0 [] [14]}]", 44 | } 45 | 46 | var expSkipColumnsAll = []string{ 47 | "{name 0 0 [] [A-B A-B-C A;B-C,D A;B-C,D ok test test string with string with\" quote]}", 48 | "{value 0 0 [] [AB BCD A;B C,D A;B C D missing right-quote];8;0.00000008\n9;\"ok\"-[[ok integer real string with string with]] escape]}", 49 | "{integer 1 0 [] [1 2 3 4 6 9 10 123456789 13 14]}", 50 | "{real 2 0 [] [0.1 0.02 0.003 0.0004 0.000006 0.000000009 0.101 0.123456789 13 14]}", 51 | } 52 | 53 | var expSkipColumnsAllRev = []string{ 54 | "{name 0 0 [] [string with\" quote string with test test ok A;B-C,D A;B-C,D A-B-C A-B]}", 55 | "{value 0 0 [] [string with]] escape string with real integer missing right-quote];8;0.00000008\n9;\"ok\"-[[ok A;B C D A;B C,D BCD AB]}", 56 | "{integer 1 0 [] [14 13 123456789 10 9 6 4 3 2 1]}", 57 | "{real 2 0 [] [14 13 0.123456789 0.101 0.000000009 0.000006 0.0004 0.003 0.02 0.1]}", 58 | } 59 | -------------------------------------------------------------------------------- /writer_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan . All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dsv_test 6 | 7 | import ( 8 | "testing" 9 | 10 | "github.com/shuLhan/dsv" 11 | "github.com/shuLhan/tabula" 12 | ) 13 | 14 | // 15 | // TestWriter test reading and writing DSV. 
16 | // 17 | func TestWriter(t *testing.T) { 18 | rw, e := dsv.New("testdata/config.dsv", nil) 19 | if e != nil { 20 | t.Fatal(e) 21 | } 22 | 23 | doReadWrite(t, &rw.Reader, &rw.Writer, expectation, true) 24 | 25 | e = rw.Close() 26 | if e != nil { 27 | t.Fatal(e) 28 | } 29 | 30 | assertFile(t, rw.GetOutput(), "testdata/expected.dat", true) 31 | } 32 | 33 | // 34 | // TestWriterWithSkip test reading and writing DSV with some columns in input being 35 | // skipped. 36 | // 37 | func TestWriterWithSkip(t *testing.T) { 38 | rw, e := dsv.New("testdata/config_skip.dsv", nil) 39 | if e != nil { 40 | t.Fatal(e) 41 | } 42 | 43 | doReadWrite(t, &rw.Reader, &rw.Writer, expSkip, true) 44 | 45 | e = rw.Close() 46 | if e != nil { 47 | t.Fatal(e) 48 | } 49 | 50 | assertFile(t, rw.GetOutput(), "testdata/expected_skip.dat", true) 51 | } 52 | 53 | // 54 | // TestWriterWithColumns test reading and writing DSV where each row 55 | // is saved in DatasetMode = 'columns'. 56 | // 57 | func TestWriterWithColumns(t *testing.T) { 58 | rw, e := dsv.New("testdata/config_skip.dsv", nil) 59 | if e != nil { 60 | t.Fatal(e) 61 | } 62 | 63 | rw.SetDatasetMode(dsv.DatasetModeCOLUMNS) 64 | 65 | doReadWrite(t, &rw.Reader, &rw.Writer, expSkipColumns, true) 66 | 67 | e = rw.Close() 68 | if e != nil { 69 | t.Fatal(e) 70 | } 71 | 72 | assertFile(t, "testdata/expected_skip.dat", rw.GetOutput(), true) 73 | } 74 | 75 | func TestWriteRawRows(t *testing.T) { 76 | dataset := tabula.NewDataset(tabula.DatasetModeRows, nil, nil) 77 | 78 | populateWithRows(t, dataset) 79 | 80 | writer, e := dsv.NewWriter("") 81 | if e != nil { 82 | t.Fatal(e) 83 | } 84 | 85 | outfile := "testdata/writerawrows.out" 86 | expfile := "testdata/writeraw.exp" 87 | 88 | e = writer.OpenOutput(outfile) 89 | if e != nil { 90 | t.Fatal(e) 91 | } 92 | 93 | _, e = writer.WriteRawDataset(dataset, nil) 94 | if e != nil { 95 | t.Fatal(e) 96 | } 97 | 98 | assertFile(t, outfile, expfile, true) 99 | } 100 | 101 | func TestWriteRawColumns(t *testing.T) { 102 | var e error 103 | 104 | dataset := tabula.NewDataset(tabula.DatasetModeColumns, nil, nil) 105 | 106 | populateWithColumns(t, dataset) 107 | 108 | writer, e := dsv.NewWriter("") 109 | if e != nil { 110 | t.Fatal(e) 111 | } 112 | 113 | outfile := "testdata/writerawcolumns.out" 114 | expfile := "testdata/writeraw.exp" 115 | 116 | e = writer.OpenOutput(outfile) 117 | if e != nil { 118 | t.Fatal(e) 119 | } 120 | 121 | _, e = writer.WriteRawDataset(dataset, nil) 122 | if e != nil { 123 | t.Fatal(e) 124 | } 125 | 126 | assertFile(t, outfile, expfile, true) 127 | } 128 | -------------------------------------------------------------------------------- /dsv.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // 6 | // Package dsv is a library for working with delimited separated value (DSV). 7 | // 8 | // DSV is a free-style form of Comma Separated Value (CSV) format of text data, 9 | // where each row is separated by newline, and each column can be separated by 10 | // any string enclosed with left-quote and right-quote. 11 | // 12 | package dsv 13 | 14 | import ( 15 | "errors" 16 | "os" 17 | "strconv" 18 | ) 19 | 20 | const ( 21 | // DefaultRejected define the default file which will contain the 22 | // rejected row.
23 | DefaultRejected = "rejected.dat" 24 | // DefaultMaxRows define default maximum row that will be saved 25 | // in memory for each read if input data is too large and can not be 26 | // consumed in one read operation. 27 | DefaultMaxRows = 256 28 | // DefDatasetMode default output mode is rows. 29 | DefDatasetMode = DatasetModeROWS 30 | // DefEOL default end-of-line 31 | DefEOL = '\n' 32 | ) 33 | 34 | var ( 35 | // ErrNoInput define an error when no Input file is given to Reader. 36 | ErrNoInput = errors.New("dsv: No input file is given in config") 37 | // ErrMissRecordsLen define an error when trying to push Row 38 | // to Field, when their length is not equal. 39 | // See reader.PushRowToColumns(). 40 | ErrMissRecordsLen = errors.New("dsv: Mismatch between number of record in row and columns length") 41 | // ErrNoOutput define an error when no output file is given to Writer. 42 | ErrNoOutput = errors.New("dsv: No output file is given in config") 43 | // ErrNotOpen define an error when output file has not been opened 44 | // by Writer. 45 | ErrNotOpen = errors.New("dsv: Output file is not opened") 46 | // ErrNilReader define an error when Reader object is nil when passed 47 | // to Write function. 48 | ErrNilReader = errors.New("dsv: Reader object is nil") 49 | 50 | // DEBUG imported from environment DSV_DEBUG to debug the library. 51 | DEBUG = 0 52 | ) 53 | 54 | // 55 | // ReadWriter combine reader and writer. 56 | // 57 | type ReadWriter struct { 58 | Reader 59 | Writer 60 | } 61 | 62 | func init() { 63 | var e error 64 | DEBUG, e = strconv.Atoi(os.Getenv("DSV_DEBUG")) 65 | if e != nil { 66 | DEBUG = 0 67 | } 68 | } 69 | 70 | // 71 | // New create a new ReadWriter object. 72 | // 73 | func New(config string, dataset interface{}) (rw *ReadWriter, e error) { 74 | rw = &ReadWriter{} 75 | 76 | e = rw.Reader.Init(config, dataset) 77 | if e != nil { 78 | return nil, e 79 | } 80 | 81 | e = OpenWriter(&rw.Writer, config) 82 | if e != nil { 83 | return nil, e 84 | } 85 | 86 | return 87 | } 88 | 89 | // 90 | // SetConfigPath of input and output file. 91 | // 92 | func (dsv *ReadWriter) SetConfigPath(dir string) { 93 | dsv.Reader.SetConfigPath(dir) 94 | dsv.Writer.SetConfigPath(dir) 95 | } 96 | 97 | // 98 | // Close reader and writer. 99 | // 100 | func (dsv *ReadWriter) Close() (e error) { 101 | e = dsv.Writer.Close() 102 | if e != nil { 103 | return 104 | } 105 | return dsv.Reader.Close() 106 | } 107 | -------------------------------------------------------------------------------- /common_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan . All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dsv_test 6 | 7 | import ( 8 | "bytes" 9 | "fmt" 10 | "github.com/shuLhan/dsv" 11 | "github.com/shuLhan/tabula" 12 | "io" 13 | "io/ioutil" 14 | "reflect" 15 | "runtime/debug" 16 | "testing" 17 | ) 18 | 19 | func assert(t *testing.T, exp, got interface{}, equal bool) { 20 | if reflect.DeepEqual(exp, got) != equal { 21 | debug.PrintStack() 22 | t.Fatalf("\n"+ 23 | ">>> Expecting '%v'\n"+ 24 | " got '%v'\n", exp, got) 25 | } 26 | } 27 | 28 | // 29 | // assertFile compare content of two file, print error message and exit 30 | // when both are different. 
31 | // 32 | func assertFile(t *testing.T, a, b string, equal bool) { 33 | out, e := ioutil.ReadFile(a) 34 | 35 | if nil != e { 36 | debug.PrintStack() 37 | t.Error(e) 38 | } 39 | 40 | exp, e := ioutil.ReadFile(b) 41 | 42 | if nil != e { 43 | debug.PrintStack() 44 | t.Error(e) 45 | } 46 | 47 | r := bytes.Compare(out, exp) 48 | 49 | if equal && 0 != r { 50 | debug.PrintStack() 51 | t.Fatal("Comparing", a, "with", b, ": result is different (", 52 | r, ")") 53 | } 54 | } 55 | 56 | func checkDataset(t *testing.T, r *dsv.Reader, exp string) { 57 | var got string 58 | ds := r.GetDataset().(tabula.DatasetInterface) 59 | data := ds.GetData() 60 | 61 | switch data.(type) { 62 | case *tabula.Rows: 63 | rows := data.(*tabula.Rows) 64 | got = fmt.Sprint(*rows) 65 | case *tabula.Columns: 66 | cols := data.(*tabula.Columns) 67 | got = fmt.Sprint(*cols) 68 | case *tabula.Matrix: 69 | matrix := data.(*tabula.Matrix) 70 | got = fmt.Sprint(*matrix) 71 | default: 72 | fmt.Println("data type unknown") 73 | } 74 | 75 | assert(t, exp, got, true) 76 | } 77 | 78 | // 79 | // doReadWrite test reading and writing the DSV data. 80 | // 81 | func doReadWrite(t *testing.T, dsvReader *dsv.Reader, dsvWriter *dsv.Writer, 82 | expectation []string, check bool) { 83 | i := 0 84 | 85 | for { 86 | n, e := dsv.Read(dsvReader) 87 | 88 | if e == io.EOF || n == 0 { 89 | _, e = dsvWriter.Write(dsvReader) 90 | if e != nil { 91 | t.Fatal(e) 92 | } 93 | 94 | break 95 | } 96 | 97 | if e != nil { 98 | continue 99 | } 100 | 101 | if check { 102 | checkDataset(t, dsvReader, expectation[i]) 103 | i++ 104 | } 105 | 106 | _, e = dsvWriter.Write(dsvReader) 107 | if e != nil { 108 | t.Fatal(e) 109 | } 110 | } 111 | 112 | e := dsvWriter.Flush() 113 | if e != nil { 114 | t.Fatal(e) 115 | } 116 | } 117 | 118 | var datasetRows = [][]string{ 119 | {"0", "1", "A"}, 120 | {"1", "1.1", "B"}, 121 | {"2", "1.2", "A"}, 122 | {"3", "1.3", "B"}, 123 | {"4", "1.4", "C"}, 124 | {"5", "1.5", "D"}, 125 | {"6", "1.6", "C"}, 126 | {"7", "1.7", "D"}, 127 | {"8", "1.8", "E"}, 128 | {"9", "1.9", "F"}, 129 | } 130 | 131 | var datasetCols = [][]string{ 132 | {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}, 133 | {"1", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7", "1.8", "1.9"}, 134 | {"A", "B", "A", "B", "C", "D", "C", "D", "E", "F"}, 135 | } 136 | 137 | var datasetTypes = []int{ 138 | tabula.TInteger, 139 | tabula.TReal, 140 | tabula.TString, 141 | } 142 | 143 | var datasetNames = []string{"int", "real", "string"} 144 | 145 | func populateWithRows(t *testing.T, dataset *tabula.Dataset) { 146 | for _, rowin := range datasetRows { 147 | row := make(tabula.Row, len(rowin)) 148 | 149 | for x, recin := range rowin { 150 | rec, e := tabula.NewRecordBy(recin, datasetTypes[x]) 151 | if e != nil { 152 | t.Fatal(e) 153 | } 154 | 155 | row[x] = rec 156 | } 157 | 158 | dataset.PushRow(&row) 159 | } 160 | } 161 | 162 | func populateWithColumns(t *testing.T, dataset *tabula.Dataset) { 163 | for x := range datasetCols { 164 | col, e := tabula.NewColumnString(datasetCols[x], datasetTypes[x], 165 | datasetNames[x]) 166 | if e != nil { 167 | t.Fatal(e) 168 | } 169 | 170 | dataset.PushColumn(*col) 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /metadata.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan . All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package dsv 6 | 7 | import ( 8 | "encoding/json" 9 | "github.com/shuLhan/tabula" 10 | "log" 11 | "strings" 12 | ) 13 | 14 | // 15 | // Metadata represent how to parse each column in a record. 16 | // 17 | type Metadata struct { 18 | // Name of the column, optional. 19 | Name string `json:"Name"` 20 | // Type of the column, default to "string". 21 | // Valid values are: "string", "integer", "real" 22 | Type string `json:"Type"` 23 | // T type of column in integer. 24 | T int 25 | // Separator for column in record. 26 | Separator string `json:"Separator"` 27 | // LeftQuote define the characters that enclose the column on the left 28 | // side. 29 | LeftQuote string `json:"LeftQuote"` 30 | // RightQuote define the characters that enclose the column on the 31 | // right side. 32 | RightQuote string `json:"RightQuote"` 33 | // Skip, if it is true this column will be ignored, i.e. not saved in the 34 | // reader object. Default to false. 35 | Skip bool `json:"Skip"` 36 | // ValueSpace contain the possible values in records. 37 | ValueSpace []string `json:"ValueSpace"` 38 | } 39 | 40 | // 41 | // NewMetadata create and return new metadata. 42 | // 43 | func NewMetadata(name, tipe, sep, leftq, rightq string, vs []string) ( 44 | md *Metadata, 45 | ) { 46 | md = &Metadata{ 47 | Name: name, 48 | Type: tipe, 49 | Separator: sep, 50 | LeftQuote: leftq, 51 | RightQuote: rightq, 52 | ValueSpace: vs, 53 | } 54 | 55 | md.Init() 56 | 57 | return 58 | } 59 | 60 | // 61 | // Init initialize metadata column, i.e. check and set column type. 62 | // 63 | // If type is unknown it will default to string. 64 | // 65 | func (md *Metadata) Init() { 66 | switch strings.ToUpper(md.Type) { 67 | case "INTEGER", "INT": 68 | md.T = tabula.TInteger 69 | case "REAL": 70 | md.T = tabula.TReal 71 | default: 72 | md.T = tabula.TString 73 | md.Type = "string" 74 | } 75 | } 76 | 77 | // 78 | // GetName return the name of metadata. 79 | // 80 | func (md *Metadata) GetName() string { 81 | return md.Name 82 | } 83 | 84 | // 85 | // GetType return type of metadata. 86 | // 87 | func (md *Metadata) GetType() int { 88 | return md.T 89 | } 90 | 91 | // 92 | // GetTypeName return string representation of type. 93 | // 94 | func (md *Metadata) GetTypeName() string { 95 | return md.Type 96 | } 97 | 98 | // 99 | // GetSeparator return the field separator. 100 | // 101 | func (md *Metadata) GetSeparator() string { 102 | return md.Separator 103 | } 104 | 105 | // 106 | // GetLeftQuote return the string used at the beginning of the record value. 107 | // 108 | func (md *Metadata) GetLeftQuote() string { 109 | return md.LeftQuote 110 | } 111 | 112 | // 113 | // GetRightQuote return the string that ends the record value. 114 | // 115 | func (md *Metadata) GetRightQuote() string { 116 | return md.RightQuote 117 | } 118 | 119 | // 120 | // GetSkip return true if this column must be skipped when reading data. 121 | // 122 | func (md *Metadata) GetSkip() bool { 123 | return md.Skip 124 | } 125 | 126 | // 127 | // GetValueSpace return value space. 128 | // 129 | func (md *Metadata) GetValueSpace() []string { 130 | return md.ValueSpace 131 | } 132 | 133 | // 134 | // IsEqual return true if this metadata is equal with the other instance, return 135 | // false otherwise.
136 | // 137 | func (md *Metadata) IsEqual(o MetadataInterface) bool { 138 | if md.Name != o.GetName() { 139 | return false 140 | } 141 | if md.Separator != o.GetSeparator() { 142 | return false 143 | } 144 | if md.LeftQuote != o.GetLeftQuote() { 145 | return false 146 | } 147 | if md.RightQuote != o.GetRightQuote() { 148 | return false 149 | } 150 | return true 151 | } 152 | 153 | // 154 | // String yes, it will print it JSON like format. 155 | // 156 | func (md *Metadata) String() string { 157 | r, e := json.MarshalIndent(md, "", "\t") 158 | if nil != e { 159 | log.Print(e) 160 | } 161 | return string(r) 162 | } 163 | -------------------------------------------------------------------------------- /readerinterface.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan . All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dsv 6 | 7 | import ( 8 | "bytes" 9 | "fmt" 10 | "github.com/shuLhan/tabula" 11 | "github.com/shuLhan/tekstus" 12 | "io" 13 | "os" 14 | ) 15 | 16 | // 17 | // ReaderInterface is the interface for reading DSV file. 18 | // 19 | type ReaderInterface interface { 20 | ConfigInterface 21 | AddInputMetadata(*Metadata) 22 | AppendMetadata(MetadataInterface) 23 | GetInputMetadata() []MetadataInterface 24 | GetInputMetadataAt(idx int) MetadataInterface 25 | GetMaxRows() int 26 | SetMaxRows(max int) 27 | GetDatasetMode() string 28 | SetDatasetMode(mode string) 29 | GetNColumnIn() int 30 | GetInput() string 31 | SetInput(path string) 32 | GetRejected() string 33 | SetRejected(path string) 34 | GetSkip() int 35 | SetSkip(n int) 36 | IsTrimSpace() bool 37 | SetDefault() 38 | OpenInput() error 39 | OpenRejected() error 40 | SkipLines() error 41 | 42 | Reset() error 43 | Flush() error 44 | ReadLine() ([]byte, error) 45 | FetchNextLine([]byte) ([]byte, error) 46 | Reject(line []byte) (int, error) 47 | Close() error 48 | 49 | GetDataset() interface{} 50 | MergeColumns(ReaderInterface) 51 | } 52 | 53 | // 54 | // Read row from input file. 55 | // 56 | func Read(reader ReaderInterface) (n int, e error) { 57 | var ( 58 | row *tabula.Row 59 | line []byte 60 | linenum int 61 | eRead *ReaderError 62 | ) 63 | maxrows := reader.GetMaxRows() 64 | 65 | e = reader.Reset() 66 | if e != nil { 67 | return 68 | } 69 | 70 | dataset := reader.GetDataset().(tabula.DatasetInterface) 71 | 72 | // Loop until we reached MaxRows (> 0) or when all rows has been 73 | // read (= -1) 74 | for { 75 | row, line, linenum, eRead = ReadRow(reader, linenum) 76 | if nil == eRead { 77 | dataset.PushRow(row) 78 | 79 | n++ 80 | if maxrows > 0 && n >= maxrows { 81 | break 82 | } 83 | continue 84 | } 85 | 86 | if eRead.T&EReadEOF == EReadEOF { 87 | _ = reader.Flush() 88 | e = io.EOF 89 | return 90 | } 91 | 92 | eRead.N = linenum 93 | fmt.Fprintf(os.Stderr, "%s\n", eRead) 94 | 95 | // If error, save the rejected line. 96 | line = append(line, DefEOL) 97 | 98 | _, e = reader.Reject(line) 99 | if e != nil { 100 | break 101 | } 102 | } 103 | 104 | // remember to flush if we have rejected rows. 105 | e = reader.Flush() 106 | 107 | return n, e 108 | } 109 | 110 | // 111 | // parsingLeftQuote parse the left-quote string from line. 
112 | // 113 | func parsingLeftQuote(lq, line []byte, startAt int) ( 114 | p int, eRead *ReaderError, 115 | ) { 116 | p = startAt 117 | 118 | // parsing until we found left quote token 119 | p, found := tekstus.BytesSkipUntil(line, lq, p, false) 120 | 121 | if found { 122 | return p, nil 123 | } 124 | 125 | eRead = &ReaderError{ 126 | T: EReadMissLeftQuote, 127 | Func: "parsingLeftQuote", 128 | What: "Missing left-quote '" + string(lq) + "'", 129 | Line: string(line), 130 | Pos: p, 131 | N: 0, 132 | } 133 | 134 | return p, eRead 135 | } 136 | 137 | // 138 | // parsingSeparator parsing the line until we found the separator. 139 | // 140 | // Return the data and index of last parsed line, or error if separator is not 141 | // found or not match with specification. 142 | // 143 | func parsingSeparator(sep, line []byte, startAt int) ( 144 | v []byte, p int, eRead *ReaderError, 145 | ) { 146 | p = startAt 147 | 148 | v, p, found := tekstus.BytesCutUntil(line, sep, p, false) 149 | 150 | if found { 151 | return v, p, nil 152 | } 153 | 154 | eRead = &ReaderError{ 155 | Func: "parsingSeparator", 156 | What: "Missing separator '" + string(sep) + "'", 157 | Line: string(line), 158 | Pos: p, 159 | N: 0, 160 | } 161 | 162 | return v, p, eRead 163 | } 164 | 165 | // 166 | // parsingRightQuote parsing the line until we found the right quote or separator. 167 | // 168 | // Return the data and index of last parsed line, or error if right-quote is not 169 | // found or not match with specification. 170 | // 171 | func parsingRightQuote(reader ReaderInterface, rq, line []byte, startAt int) ( 172 | v, lines []byte, p int, eRead *ReaderError, 173 | ) { 174 | var e error 175 | var content []byte 176 | p = startAt 177 | var found bool 178 | 179 | // (2.2.1) 180 | for { 181 | content, p, found = tekstus.BytesCutUntil(line, rq, p, true) 182 | 183 | v = append(v, content...) 184 | 185 | if found { 186 | return v, line, p, nil 187 | } 188 | 189 | // EOL before finding right-quote. 190 | // Read and join with the next line. 191 | line, e = reader.FetchNextLine(line) 192 | 193 | if e != nil { 194 | break 195 | } 196 | } 197 | 198 | eRead = &ReaderError{ 199 | T: EReadMissRightQuote, 200 | Func: "parsingRightQuote", 201 | What: "Missing right-quote '" + string(rq) + "'", 202 | Line: string(line), 203 | Pos: p, 204 | N: 0, 205 | } 206 | 207 | if e == io.EOF { 208 | eRead.T &= EReadEOF 209 | } 210 | 211 | return v, line, p, eRead 212 | } 213 | 214 | // 215 | // parsingSkipSeparator parse until we found separator or EOF 216 | // 217 | func parsingSkipSeparator(sep, line []byte, startAt int) ( 218 | p int, eRead *ReaderError, 219 | ) { 220 | p = startAt 221 | 222 | p, found := tekstus.BytesSkipUntil(line, sep, p, false) 223 | 224 | if found { 225 | return p, nil 226 | } 227 | 228 | eRead = &ReaderError{ 229 | T: EReadMissSeparator, 230 | Func: "parsingSkipSeparator", 231 | What: "Missing separator '" + string(sep) + "'", 232 | Line: string(line), 233 | Pos: p, 234 | N: 0, 235 | } 236 | 237 | return p, eRead 238 | } 239 | 240 | // 241 | // parsingSkipSpace skip all space starting from `startAt`. 242 | // 243 | func parsingSkipSpace(line []byte, startAt int) (p int) { 244 | linelen := len(line) 245 | 246 | for p = startAt; p < linelen; p++ { 247 | if line[p] == ' ' || line[p] == '\t' || line[p] == '\n' || 248 | line[p] == '\r' { 249 | continue 250 | } 251 | break 252 | } 253 | return 254 | } 255 | 256 | // 257 | // ParseLine parse a line containing records. The output is array of record 258 | // (or single row). 
259 | // 260 | // This is how the algorithm works 261 | // (1) create n slice of record, where n is number of column metadata 262 | // (2) for each metadata 263 | // (2.0) Check if the next sequence matched with separator. 264 | // (2.0.1) If its match, create empty record 265 | // (2.1) If using left quote, skip until we found left-quote 266 | // (2.2) If using right quote, append byte to buffer until right-quote 267 | // (2.2.1) If using separator, skip until separator 268 | // (2.3) If using separator, append byte to buffer until separator 269 | // (2.4) else append all byte to buffer. 270 | // (3) save buffer to record 271 | // 272 | func ParseLine(reader ReaderInterface, line []byte) ( 273 | prow *tabula.Row, eRead *ReaderError, 274 | ) { 275 | p := 0 276 | rIdx := 0 277 | inputMd := reader.GetInputMetadata() 278 | row := make(tabula.Row, 0) 279 | 280 | for _, md := range inputMd { 281 | lq := md.GetLeftQuote() 282 | rq := md.GetRightQuote() 283 | sep := md.GetSeparator() 284 | v := []byte{} 285 | 286 | // (2.0) 287 | if sep != "" && sep != lq { 288 | match := tekstus.BytesMatchForward(line, []byte(sep), 289 | p) 290 | 291 | // (2.0.1) 292 | if match { 293 | p += len(sep) 294 | goto empty 295 | } 296 | } 297 | 298 | // (2.1) 299 | if lq != "" { 300 | p, eRead = parsingLeftQuote([]byte(lq), line, p) 301 | 302 | if eRead != nil { 303 | return 304 | } 305 | } 306 | 307 | // (2.2) 308 | if rq != "" { 309 | v, line, p, eRead = parsingRightQuote(reader, []byte(rq), 310 | line, p) 311 | 312 | if eRead != nil { 313 | return 314 | } 315 | 316 | if sep != "" { 317 | p, eRead = parsingSkipSeparator([]byte(sep), 318 | line, p) 319 | 320 | if eRead != nil { 321 | return 322 | } 323 | 324 | // Handle multi space if separator is a single 325 | // space. 326 | if sep == " " { 327 | p = parsingSkipSpace(line, p) 328 | } 329 | } 330 | } else { 331 | if sep != "" { 332 | // Skip space at beginning if separator is a 333 | // single space. 334 | if sep == " " { 335 | p = parsingSkipSpace(line, p) 336 | } 337 | 338 | v, p, eRead = parsingSeparator([]byte(sep), 339 | line, p) 340 | 341 | if eRead != nil { 342 | return 343 | } 344 | 345 | // Handle multi space if separator is a single 346 | // space. 347 | if sep == " " { 348 | p = parsingSkipSpace(line, p) 349 | } 350 | } else { 351 | v = line[p:] 352 | p = p + len(line) 353 | } 354 | } 355 | 356 | if md.GetSkip() { 357 | continue 358 | } 359 | empty: 360 | r, e := tabula.NewRecordBy(string(v), md.GetType()) 361 | 362 | if nil != e { 363 | msg := fmt.Sprintf("md %s: Type convertion error from %q to %s", 364 | md.GetName(), string(v), md.GetTypeName()) 365 | 366 | return nil, &ReaderError{ 367 | T: ETypeConversion, 368 | Func: "ParseLine", 369 | What: msg, 370 | Line: string(line), 371 | Pos: p, 372 | N: 0, 373 | } 374 | } 375 | 376 | row = append(row, r) 377 | rIdx++ 378 | } 379 | 380 | return &row, nil 381 | } 382 | 383 | // 384 | // ReadRow read one line at a time until we get one row or error when parsing the 385 | // data. 386 | // 387 | func ReadRow(reader ReaderInterface, linenum int) ( 388 | row *tabula.Row, 389 | line []byte, 390 | n int, 391 | eRead *ReaderError, 392 | ) { 393 | var e error 394 | n = linenum 395 | 396 | // Read one line, skip empty line. 
397 | for { 398 | line, e = reader.ReadLine() 399 | n++ 400 | 401 | if e != nil { 402 | goto err 403 | } 404 | 405 | // check for empty line 406 | linetrimed := bytes.TrimSpace(line) 407 | 408 | if len(linetrimed) > 0 { 409 | break 410 | } 411 | } 412 | 413 | if reader.IsTrimSpace() { 414 | line = bytes.TrimSpace(line) 415 | } 416 | 417 | row, eRead = ParseLine(reader, line) 418 | 419 | return row, line, n, eRead 420 | 421 | err: 422 | eRead = &ReaderError{ 423 | Func: "ReadRow", 424 | What: fmt.Sprint(e), 425 | } 426 | 427 | if e == io.EOF { 428 | eRead.T = EReadEOF 429 | } else { 430 | eRead.T = EReadLine 431 | } 432 | 433 | return nil, line, n, eRead 434 | } 435 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![GoDoc](https://godoc.org/github.com/shuLhan/dsv?status.svg)](https://godoc.org/github.com/shuLhan/dsv) 2 | [![Go Report Card](https://goreportcard.com/badge/github.com/shuLhan/dsv)](https://goreportcard.com/report/github.com/shuLhan/dsv) 3 | 4 | Package `dsv` is a Go library for working with delimited separated value (DSV). 5 | 6 | NOTE: This package has been deprecated. See 7 | https://github.com/shuLhan/share/lib/dsv for the latest implementation. 8 | 9 | DSV is a free-style form of the CSV format of text data, where each record is 10 | separated by a newline, and each column can be separated by any string, not just 11 | a comma. 12 | 13 | - [Example](#example) 14 | - [Terminology](#terminology) 15 | - [Configuration](#configuration) 16 | - [Metadata](#metadata) 17 | - [Input](#input) 18 | - [DatasetMode Explained](#datasetmode-explained) 19 | - [Output](#output) 20 | - [Working with DSV](#working-with-dsv) 21 | - [Processing each Rows/Columns](#processing-each-rowscolumns) 22 | - [Using different Dataset](#using-different-dataset) 23 | - [Builtin Functions for Dataset](#builtin-functions-for-dataset) 24 | - [Limitations](#limitations) 25 | 26 | --- 27 | 28 | ## Example 29 | 30 | Let's process this input file `input.dat`, 31 | 32 | Mon Dt HH MM SS Process 33 | Nov 29 23:14:36 process-1 34 | Nov 29 23:14:37 process-2 35 | Nov 29 23:14:38 process-3 36 | 37 | and generate an output file `output.dat` with a format like this, 38 | 39 | "process_1","29-Nov" 40 | "process_2","29-Nov" 41 | "process_3","29-Nov" 42 | 43 | How do we do it? 44 | 45 | First, create the file metadata for input and output, and name it `config.dsv`, 46 | 47 | { 48 | "Input" :"input.dat" 49 | , "Skip" :1 50 | , "InputMetadata" : 51 | [{ 52 | "Name" :"month" 53 | , "Separator" :" " 54 | },{ 55 | "Name" :"date" 56 | , "Separator" :" " 57 | , "Type" :"integer" 58 | },{ 59 | "Name" :"hour" 60 | , "Separator" :":" 61 | , "Type" :"integer" 62 | },{ 63 | "Name" :"minute" 64 | , "Separator" :":" 65 | , "Type" :"integer" 66 | },{ 67 | "Name" :"second" 68 | , "Separator" :" " 69 | , "Type" :"integer" 70 | },{ 71 | "Name" :"process_name" 72 | , "Separator" :"-" 73 | },{ 74 | "Name" :"process_id" 75 | }] 76 | , "Output" :"output.dat" 77 | , "OutputMetadata": 78 | [{ 79 | "Name" :"process_name" 80 | , "LeftQuote" :"\"" 81 | , "Separator" :"_" 82 | },{ 83 | "Name" :"process_id" 84 | , "RightQuote":"\"" 85 | , "Separator" :"," 86 | },{ 87 | "Name" :"date" 88 | , "LeftQuote" :"\"" 89 | , "Separator" :"-" 90 | },{ 91 | "Name" :"month" 92 | , "RightQuote":"\"" 93 | }] 94 | } 95 | 96 | The metadata uses the JSON format. For more information see `metadata.go` 97 | and `reader.go`.
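To see why this configuration produces the output above, here is a sketch of how the first log line is split by `InputMetadata` and re-assembled by `OutputMetadata` (the values are traced by hand from the rules above, not produced by running the code):

    Nov 29 23:14:36 process-1

    month        = "Nov"      (Separator " ")
    date         = 29         (Separator " ", Type "integer")
    hour         = 23         (Separator ":", Type "integer")
    minute       = 14         (Separator ":", Type "integer")
    second       = 36         (Separator " ", Type "integer")
    process_name = "process"  (Separator "-")
    process_id   = "1"        (no separator: the rest of the line)

`OutputMetadata` then emits `LeftQuote + value + RightQuote + Separator` for each listed column, giving `"process_` + `1",` + `"29-` + `Nov"`, that is: `"process_1","29-Nov"`.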
98 | 99 | Second, we create a reader to read the input file. 100 | 101 | dsvReader, e := dsv.NewReader("config.dsv", nil) 102 | 103 | if nil != e { 104 | t.Fatal(e) 105 | } 106 | 107 | Third, we create a writer to write our output data, 108 | 109 | dsvWriter, e := dsv.NewWriter("config.dsv") 110 | 111 | if nil != e { 112 | t.Error(e) 113 | } 114 | 115 | Last, we process them: read the input records and pass them to the writer. 116 | 117 | for { 118 | n, e := dsv.Read(dsvReader) 119 | 120 | if n > 0 { 121 | dsvWriter.Write(dsvReader) 122 | 123 | // EOF, no more records. 124 | } else if e == io.EOF { 125 | break 126 | } 127 | } 128 | 129 | // Make sure all open descriptors are closed. 130 | _ = dsvReader.Close() 131 | 132 | Easy enough? We can combine the reader and writer using `dsv.New()`, which will 133 | create both the reader and the writer, 134 | 135 | rw, e := dsv.New("config.dsv", nil) 136 | 137 | if nil != e { 138 | t.Error(e) 139 | } 140 | 141 | // do usual process like in the last step. 142 | 143 | That's it! 144 | 145 | ## Terminology 146 | 147 | Here is some terminology that we use in developing this library, which may 148 | help the reader understand the configuration and API. 149 | 150 | - Dataset: the content of a file 151 | - Record: a single cell in a row or column, or the smallest building block of 152 | a dataset 153 | - Row: a horizontal representation of records in a dataset 154 | - Column: a vertical representation of records in a dataset 155 | 156 | ``` 157 | COL-0 COL-1 ... COL-x 158 | ROW-0: record record ... record 159 | ROW-1: record record ... record 160 | ... 161 | ROW-y: record record ... record 162 | ``` 163 | 164 | ## Configuration 165 | 166 | We chose JSON for the configuration because, 167 | 168 | 1. No additional source to test. 169 | 2. Easy to extend. Users can embed the current metadata, add additional 170 | configuration, and create another reader to work with it. 171 | 172 | ### Metadata 173 | 174 | Metadata contain information about each column when reading the input file and 175 | writing to the output file, 176 | 177 | - `Name`: mandatory, the name of the column 178 | - `Type`: optional, type of record when reading the input file. Valid values are 179 | "integer", "real", or "string" (default) 180 | - `Separator`: optional, default to `"\n"`. Separator is a string that 181 | separates the current record from the next one. 182 | - `LeftQuote`: optional, default is empty `""`. LeftQuote is a string that 183 | starts at the beginning of the record. 184 | - `RightQuote`: optional, default is empty `""`. RightQuote is a string at the 185 | end of the record. 186 | - `Skip`: optional, boolean, default is `false`. If true the column will be 187 | ignored when reading the input file, i.e. it will not be saved in the dataset. 188 | - `ValueSpace`: optional, slice of string, default is empty. This contains the 189 | string representation of all possible values in the column. 190 | 191 | ### Input 192 | 193 | Input configuration contain information about the input file. 194 | 195 | - `Input`: mandatory, the name of the input file, using a relative or absolute 196 | path. If no path is given then it is assumed that the input file is in the same 197 | directory as the configuration file. 198 | - `InputMetadata`: mandatory, list of metadata. 199 | - `Skip`: optional, number, default 0. Skip defines the number of lines that will 200 | be skipped when the input file is first opened. 201 | - `TrimSpace`: optional, boolean, default is true.
If true, the white space at the 202 | beginning and end of each input line will be removed before parsing, 203 | otherwise the line is left unmodified. 204 | - `Rejected`: optional, defaults to `rejected.dat`. Rejected is the file where 205 | data that does not match the metadata will be saved. One can inspect the 206 | rejected file, fix it for re-processing, or ignore it. 207 | - `MaxRows`: optional, defaults to `256`. The maximum number of rows for one read 208 | operation that will be kept in memory. If it is negative, e.g. `-1`, all data 209 | in the input file will be processed. 210 | - `DatasetMode`: optional, defaults to "rows". The mode of the dataset in memory. 211 | Valid values are "rows", "columns", or "matrix". Matrix mode is a combination of 212 | rows and columns; it gives more flexibility when processing the dataset but 213 | requires additional memory. 214 | 215 | #### `DatasetMode` Explained 216 | 217 | For example, given the input data file, 218 | 219 | col1,col2,col3 220 | a,b,c 221 | 1,2,3 222 | 223 | "rows" mode is where each line is saved in its own slice, resulting in Rows: 224 | 225 | Rows[0]: [a b c] 226 | Rows[1]: [1 2 3] 227 | 228 | "columns" mode is where each line is saved by columns, resulting in Columns: 229 | 230 | Columns[0]: {col1 0 0 [] [a 1]} 231 | Columns[1]: {col2 0 0 [] [b 2]} 232 | Columns[2]: {col3 0 0 [] [c 3]} 233 | 234 | Unlike rows mode, each column contains metadata, including the column name, type, 235 | flag, and value space (all possible values that the column _may_ contain). 236 | 237 | "matrix" mode is where each record is saved in both rows and columns. 238 | 239 | ### Output 240 | 241 | The output configuration contains information about the output file when writing the 242 | dataset. 243 | 244 | - `Output`: mandatory, the name of the output file, which may be a relative or absolute 245 | path. If no path is given then it is assumed that the output file is in the same 246 | directory as the configuration file. 247 | - `OutputMetadata`: mandatory, list of metadata. 248 | 249 | ## Working with DSV 250 | 251 | ### Processing each Rows/Columns 252 | 253 | After opening the input file, we can process the dataset based on rows/columns 254 | mode using a simple `for` loop. For example, 255 | 256 | ``` 257 | // Save the dataset object for later use. 258 | dataset := dsvReader.GetDataset().(tabula.DatasetInterface) 259 | 260 | for { 261 | n, e := dsv.Read(dsvReader) 262 | 263 | if n > 0 { 264 | // Process each row ... 265 | for x, row := range *dataset.GetDataAsRows() { 266 | 267 | for y, record := range *row { 268 | // process each record in row 269 | } 270 | } 271 | 272 | // Or, process each column 273 | for x, column := range *dataset.GetDataAsColumns() { 274 | 275 | for y, record := range column.Records { 276 | // process each record in column 277 | } 278 | } 279 | 280 | // Write the dataset to file after processing 281 | dsvWriter.Write(dsvReader) 282 | } 283 | if e == io.EOF { 284 | break 285 | } 286 | if e != nil { 287 | // handle error 288 | } 289 | } 290 | ``` 291 | 292 | ### Using different Dataset 293 | 294 | The default dataset used by Reader is 295 | [tabula.Dataset](https://godoc.org/github.com/shuLhan/tabula#Dataset). 296 | 297 | You can extend and implement 298 | [DatasetInterface](https://godoc.org/github.com/shuLhan/tabula#DatasetInterface) 299 | and use it in the reader object, either by 300 | 301 | - passing it as the second parameter to `NewReader`, for example, 302 | 303 | ``` 304 | myset := MySet{ 305 | ...
306 | } 307 | reader, e := dsv.NewReader("config.dsv", &myset) 308 | ``` 309 | 310 | - or by calling `reader.Init` after creating a new Reader, 311 | 312 | ``` 313 | myset := MySet{ 314 | ... 315 | } 316 | reader := dsv.Reader{ 317 | ... 318 | } 319 | reader.Init("config.dsv", &myset) 320 | ``` 321 | 322 | ### Builtin Functions for Dataset 323 | 324 | Since we use the tabula package to manage the data, any feature in that package 325 | can be used with our dataset. 326 | For more information see the [tabula 327 | package](https://godoc.org/github.com/shuLhan/tabula). 328 | 329 | ## Limitations 330 | 331 | - The newline for each row is `\n`. 332 | 333 | - Reader and Writer operate on ASCII (8-bit characters); UTF-8 is not 334 | supported yet, since we cannot test it. Patches for supporting UTF-8 (or 335 | runes) are welcome. 336 | 337 | - About escaped characters in the data content. 338 | 339 | Since we handle a free-style form of CSV, the 340 | left-quote, right-quote, and separator can each be a string. It is not limited to a single 341 | character like a single quote or double quote, but can be 342 | literally one or more characters without space. Any escaped character will be 343 | read as-is (along with the `'\'`) unless it is followed by the right-quote or the separator. 344 | For example, 345 | 346 | "test\'" 347 | 348 | will be read as `test\'`. But 349 | 350 | "test\"" 351 | 352 | will be read as `test"`, since the right-quote matches the escaped 353 | token. 354 | -------------------------------------------------------------------------------- /reader_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan . All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file.
4 | 5 | package dsv_test 6 | 7 | import ( 8 | "fmt" 9 | "github.com/shuLhan/dsv" 10 | "github.com/shuLhan/tabula" 11 | "io" 12 | "strings" 13 | "testing" 14 | ) 15 | 16 | var jsonSample = []string{ 17 | `{}`, 18 | `{ 19 | "Input" :"testdata/input.dat" 20 | }`, 21 | `{ 22 | "Input" :"testdata/input.dat" 23 | }`, 24 | `{ 25 | "Input" :"testdata/input.dat" 26 | , "InputMetadata" : 27 | [{ 28 | "Name" :"A" 29 | , "Separator" :"," 30 | },{ 31 | "Name" :"B" 32 | , "Separator" :";" 33 | }] 34 | }`, 35 | `{ 36 | "Input" :"testdata/input.dat" 37 | , "Skip" :1 38 | , "MaxRows" :1 39 | , "InputMetadata" : 40 | [{ 41 | "Name" :"id" 42 | , "Separator" :";" 43 | , "Type" :"integer" 44 | },{ 45 | "Name" :"name" 46 | , "Separator" :"-" 47 | , "LeftQuote" :"\"" 48 | , "RightQuote" :"\"" 49 | },{ 50 | "Name" :"value" 51 | , "Separator" :";" 52 | , "LeftQuote" :"[[" 53 | , "RightQuote" :"]]" 54 | },{ 55 | "Name" :"integer" 56 | , "Type" :"integer" 57 | , "Separator" :";" 58 | },{ 59 | "Name" :"real" 60 | , "Type" :"real" 61 | }] 62 | }`, 63 | `{ 64 | "Input" :"testdata/input.dat" 65 | , "Skip" :1 66 | , "MaxRows" :1 67 | , "InputMetadata" : 68 | [{ 69 | "Name" :"id" 70 | },{ 71 | "Name" :"editor" 72 | },{ 73 | "Name" :"old_rev_id" 74 | },{ 75 | "Name" :"new_rev_id" 76 | },{ 77 | "Name" :"diff_url" 78 | },{ 79 | "Name" :"edit_time" 80 | },{ 81 | "Name" :"edit_comment" 82 | },{ 83 | "Name" :"article_id" 84 | },{ 85 | "Name" :"article_title" 86 | }] 87 | }`, 88 | } 89 | 90 | var readers = []*dsv.Reader{ 91 | {}, 92 | { 93 | Input: "testdata/input.dat", 94 | }, 95 | { 96 | Input: "test-another.dsv", 97 | }, 98 | { 99 | Input: "testdata/input.dat", 100 | InputMetadata: []dsv.Metadata{ 101 | { 102 | Name: "A", 103 | Separator: ",", 104 | }, 105 | { 106 | Name: "B", 107 | Separator: ";", 108 | }, 109 | }, 110 | }, 111 | } 112 | 113 | // 114 | // TestReaderNoInput will print error that the input is not defined. 115 | // 116 | func TestReaderNoInput(t *testing.T) { 117 | dsvReader := &dsv.Reader{} 118 | 119 | e := dsv.ConfigParse(dsvReader, []byte(jsonSample[0])) 120 | 121 | if nil != e { 122 | t.Fatal(e) 123 | } 124 | 125 | e = dsvReader.Init("", nil) 126 | 127 | if nil == e { 128 | t.Fatal("TestReaderNoInput: failed, should return non nil!") 129 | } 130 | } 131 | 132 | // 133 | // TestConfigParse test parsing metadata. 134 | // 135 | func TestConfigParse(t *testing.T) { 136 | cases := []struct { 137 | in string 138 | out *dsv.Reader 139 | }{ 140 | { 141 | jsonSample[1], 142 | readers[1], 143 | }, 144 | { 145 | jsonSample[3], 146 | readers[3], 147 | }, 148 | } 149 | 150 | dsvReader := &dsv.Reader{} 151 | 152 | for _, c := range cases { 153 | e := dsv.ConfigParse(dsvReader, []byte(c.in)) 154 | 155 | if e != nil { 156 | t.Fatal(e) 157 | } 158 | if !dsvReader.IsEqual(c.out) { 159 | t.Fatal("Test failed on ", c.in) 160 | } 161 | } 162 | } 163 | 164 | func TestReaderIsEqual(t *testing.T) { 165 | cases := []struct { 166 | in *dsv.Reader 167 | out *dsv.Reader 168 | result bool 169 | }{ 170 | { 171 | readers[1], 172 | &dsv.Reader{ 173 | Input: "testdata/input.dat", 174 | }, 175 | true, 176 | }, 177 | { 178 | readers[1], 179 | readers[2], 180 | false, 181 | }, 182 | } 183 | 184 | var r bool 185 | 186 | for _, c := range cases { 187 | r = c.in.IsEqual(c.out) 188 | 189 | if r != c.result { 190 | t.Fatal("Test failed on equality between ", c.in, 191 | "\n and ", c.out) 192 | } 193 | } 194 | } 195 | 196 | // 197 | // doRead test reading the DSV data. 
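// It calls dsv.Read in a loop: each time n > 0, the rows currently in the
// dataset are formatted with fmt.Sprint and compared against exp[i]; the
// loop stops when e == io.EOF.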
198 | // 199 | func doRead(t *testing.T, dsvReader *dsv.Reader, exp []string) { 200 | i := 0 201 | var n int 202 | var e error 203 | 204 | for { 205 | n, e = dsv.Read(dsvReader) 206 | 207 | if n > 0 { 208 | r := fmt.Sprint(dsvReader. 209 | GetDataset().(tabula.DatasetInterface). 210 | GetDataAsRows()) 211 | 212 | assert(t, exp[i], r, true) 213 | 214 | i++ 215 | } else if e == io.EOF { 216 | // EOF 217 | break 218 | } 219 | } 220 | } 221 | 222 | // 223 | // TestReader test reading. 224 | // 225 | func TestReaderRead(t *testing.T) { 226 | dsvReader := &dsv.Reader{} 227 | 228 | e := dsv.ConfigParse(dsvReader, []byte(jsonSample[4])) 229 | 230 | if nil != e { 231 | t.Fatal(e) 232 | } 233 | 234 | e = dsvReader.Init("", nil) 235 | if nil != e { 236 | t.Fatal(e) 237 | } 238 | 239 | doRead(t, dsvReader, expectation) 240 | 241 | e = dsvReader.Close() 242 | if e != nil { 243 | t.Fatal(e) 244 | } 245 | } 246 | 247 | // 248 | // TestReaderOpen real example from the start. 249 | // 250 | func TestReaderOpen(t *testing.T) { 251 | dsvReader, e := dsv.NewReader("testdata/config.dsv", nil) 252 | if nil != e { 253 | t.Fatal(e) 254 | } 255 | 256 | doRead(t, dsvReader, expectation) 257 | 258 | e = dsvReader.Close() 259 | if e != nil { 260 | t.Fatal(e) 261 | } 262 | } 263 | 264 | func TestDatasetMode(t *testing.T) { 265 | var e error 266 | var config = []string{`{ 267 | "Input" :"testdata/input.dat" 268 | , "DatasetMode" :"row" 269 | }`, `{ 270 | "Input" :"testdata/input.dat" 271 | , "DatasetMode" :"rows" 272 | }`, `{ 273 | "Input" :"testdata/input.dat" 274 | , "DatasetMode" :"columns" 275 | }`} 276 | 277 | var exps = []struct { 278 | status bool 279 | value string 280 | }{{ 281 | false, 282 | string(config[0]), 283 | }, { 284 | true, 285 | string(config[1]), 286 | }, { 287 | true, 288 | string(config[2]), 289 | }} 290 | 291 | reader := &dsv.Reader{} 292 | 293 | for k, v := range exps { 294 | e = dsv.ConfigParse(reader, []byte(config[k])) 295 | 296 | if e != nil { 297 | t.Fatal(e) 298 | } 299 | 300 | e = reader.Init("", nil) 301 | if e != nil { 302 | if v.status { 303 | t.Fatal(e) 304 | } 305 | } 306 | } 307 | } 308 | 309 | func TestReaderToColumns(t *testing.T) { 310 | reader := &dsv.Reader{} 311 | 312 | e := dsv.ConfigParse(reader, []byte(jsonSample[4])) 313 | if nil != e { 314 | t.Fatal(e) 315 | } 316 | 317 | e = reader.Init("", nil) 318 | if nil != e { 319 | t.Fatal(e) 320 | } 321 | 322 | reader.SetDatasetMode(dsv.DatasetModeCOLUMNS) 323 | 324 | var n, i int 325 | for { 326 | n, e = dsv.Read(reader) 327 | 328 | if n > 0 { 329 | ds := reader.GetDataset().(tabula.DatasetInterface) 330 | ds.TransposeToRows() 331 | 332 | r := fmt.Sprint(ds.GetData()) 333 | 334 | assert(t, expectation[i], r, true) 335 | 336 | i++ 337 | } else if e == io.EOF { 338 | // EOF 339 | break 340 | } 341 | } 342 | } 343 | 344 | // 345 | // TestReaderSkip will test the 'Skip' option in Metadata. 
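// Columns whose metadata set Skip to true are not saved in the dataset, so
// the rows read from testdata/config_skip.dsv are compared against expSkip,
// which omits the skipped columns.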
346 | // 347 | func TestReaderSkip(t *testing.T) { 348 | dsvReader, e := dsv.NewReader("testdata/config_skip.dsv", nil) 349 | if nil != e { 350 | t.Fatal(e) 351 | } 352 | 353 | doRead(t, dsvReader, expSkip) 354 | 355 | e = dsvReader.Close() 356 | if e != nil { 357 | t.Fatal(e) 358 | } 359 | } 360 | 361 | func TestTransposeToColumns(t *testing.T) { 362 | reader, e := dsv.NewReader("testdata/config_skip.dsv", nil) 363 | if nil != e { 364 | t.Fatal(e) 365 | } 366 | 367 | reader.SetMaxRows(-1) 368 | 369 | _, e = dsv.Read(reader) 370 | 371 | if e != io.EOF { 372 | t.Fatal(e) 373 | } 374 | 375 | ds := reader.GetDataset().(tabula.DatasetInterface) 376 | ds.TransposeToColumns() 377 | 378 | exp := fmt.Sprint(expSkipColumnsAll) 379 | 380 | columns := ds.GetDataAsColumns() 381 | 382 | got := fmt.Sprint(*columns) 383 | 384 | assert(t, exp, got, true) 385 | 386 | e = reader.Close() 387 | if e != nil { 388 | t.Fatal(e) 389 | } 390 | } 391 | 392 | func TestSortColumnsByIndex(t *testing.T) { 393 | reader, e := dsv.NewReader("testdata/config_skip.dsv", nil) 394 | if nil != e { 395 | t.Fatal(e) 396 | } 397 | 398 | reader.SetMaxRows(-1) 399 | 400 | _, e = dsv.Read(reader) 401 | if e != io.EOF { 402 | t.Fatal(e) 403 | } 404 | 405 | // reverse the data 406 | var idxReverse []int 407 | var expReverse []string 408 | 409 | for x := len(expSkip) - 1; x >= 0; x-- { 410 | idxReverse = append(idxReverse, x) 411 | expReverse = append(expReverse, expSkip[x]) 412 | } 413 | 414 | ds := reader.GetDataset().(tabula.DatasetInterface) 415 | 416 | tabula.SortColumnsByIndex(ds, idxReverse) 417 | 418 | exp := strings.Join(expReverse, "") 419 | got := fmt.Sprint(ds.GetDataAsRows()) 420 | 421 | assert(t, exp, got, true) 422 | 423 | exp = "[" + strings.Join(expSkipColumnsAllRev, " ") + "]" 424 | 425 | columns := ds.GetDataAsColumns() 426 | 427 | got = fmt.Sprint(*columns) 428 | 429 | assert(t, exp, got, true) 430 | 431 | e = reader.Close() 432 | if e != nil { 433 | t.Fatal(e) 434 | } 435 | } 436 | 437 | func TestSplitRowsByValue(t *testing.T) { 438 | reader, e := dsv.NewReader("testdata/config.dsv", nil) 439 | if nil != e { 440 | t.Fatal(e) 441 | } 442 | 443 | reader.SetMaxRows(256) 444 | 445 | _, e = dsv.Read(reader) 446 | 447 | if e != nil && e != io.EOF { 448 | t.Fatal(e) 449 | } 450 | 451 | ds := reader.GetDataset().(tabula.DatasetInterface) 452 | splitL, splitR, e := tabula.SplitRowsByValue(ds, 0, 6) 453 | 454 | if e != nil { 455 | t.Fatal(e) 456 | } 457 | 458 | // test left split 459 | exp := "" 460 | for x := 0; x < 4; x++ { 461 | exp += expectation[x] 462 | } 463 | 464 | got := fmt.Sprint(splitL.GetDataAsRows()) 465 | 466 | assert(t, exp, got, true) 467 | 468 | // test right split 469 | exp = "" 470 | for x := 4; x < len(expectation); x++ { 471 | exp += expectation[x] 472 | } 473 | 474 | got = fmt.Sprint(splitR.GetDataAsRows()) 475 | 476 | assert(t, exp, got, true) 477 | 478 | e = reader.Close() 479 | if e != nil { 480 | t.Fatal(e) 481 | } 482 | } 483 | 484 | // 485 | // testWriteOutput will write merged reader and check with expected file output. 
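// The dataset is written with WriteRawDataset using a tab separator, and
// then the content of outfile is compared with expfile.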
486 | // 487 | func testWriteOutput(t *testing.T, r *dsv.Reader, outfile, expfile string) { 488 | 489 | writer, e := dsv.NewWriter("") 490 | if e != nil { 491 | t.Fatal(e) 492 | } 493 | 494 | e = writer.OpenOutput(outfile) 495 | 496 | if e != nil { 497 | t.Fatal(e) 498 | } 499 | 500 | sep := "\t" 501 | ds := r.GetDataset().(tabula.DatasetInterface) 502 | 503 | _, e = writer.WriteRawDataset(ds, &sep) 504 | if e != nil { 505 | t.Fatal(e) 506 | } 507 | 508 | e = writer.Close() 509 | if e != nil { 510 | t.Fatal(e) 511 | } 512 | 513 | assertFile(t, outfile, expfile, true) 514 | } 515 | 516 | func TestMergeColumns(t *testing.T) { 517 | reader1, e := dsv.NewReader("testdata/config.dsv", nil) 518 | if nil != e { 519 | t.Fatal(e) 520 | } 521 | 522 | reader2, e := dsv.NewReader("testdata/config_skip.dsv", nil) 523 | if nil != e { 524 | t.Fatal(e) 525 | } 526 | 527 | reader1.SetMaxRows(-1) 528 | reader2.SetMaxRows(-1) 529 | 530 | _, e = dsv.Read(reader1) 531 | if e != io.EOF { 532 | t.Fatal(e) 533 | } 534 | 535 | _, e = dsv.Read(reader2) 536 | if e != io.EOF { 537 | t.Fatal(e) 538 | } 539 | 540 | e = reader1.Close() 541 | if e != nil { 542 | t.Fatal(e) 543 | } 544 | 545 | e = reader2.Close() 546 | if e != nil { 547 | t.Fatal(e) 548 | } 549 | 550 | reader1.InputMetadata[len(reader1.InputMetadata)-1].Separator = ";" 551 | 552 | reader1.MergeColumns(reader2) 553 | 554 | outfile := "testdata/output_merge_columns.dat" 555 | expfile := "testdata/expected_merge_columns.dat" 556 | 557 | testWriteOutput(t, reader1, outfile, expfile) 558 | } 559 | 560 | func TestMergeRows(t *testing.T) { 561 | reader1, e := dsv.NewReader("testdata/config.dsv", nil) 562 | if nil != e { 563 | t.Fatal(e) 564 | } 565 | 566 | reader2, e := dsv.NewReader("testdata/config_skip.dsv", nil) 567 | if nil != e { 568 | t.Fatal(e) 569 | } 570 | 571 | reader1.SetMaxRows(-1) 572 | reader2.SetMaxRows(-1) 573 | 574 | _, e = dsv.Read(reader1) 575 | if e != io.EOF { 576 | t.Fatal(e) 577 | } 578 | 579 | _, e = dsv.Read(reader2) 580 | if e != io.EOF { 581 | t.Fatal(e) 582 | } 583 | 584 | e = reader1.Close() 585 | if e != nil { 586 | t.Fatal(e) 587 | } 588 | 589 | e = reader2.Close() 590 | if e != nil { 591 | t.Fatal(e) 592 | } 593 | 594 | reader1.MergeRows(reader2) 595 | 596 | outfile := "testdata/output_merge_rows.dat" 597 | expfile := "testdata/expected_merge_rows.dat" 598 | 599 | testWriteOutput(t, reader1, outfile, expfile) 600 | } 601 | -------------------------------------------------------------------------------- /writer.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan . All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dsv 6 | 7 | import ( 8 | "bufio" 9 | "encoding/json" 10 | "github.com/shuLhan/tabula" 11 | "github.com/shuLhan/tekstus" 12 | "log" 13 | "os" 14 | ) 15 | 16 | const ( 17 | // DefSeparator default separator that will be used if its not given 18 | // in config file. 19 | DefSeparator = "," 20 | // DefOutput file. 21 | DefOutput = "output.dat" 22 | // DefEscape default string to escape the right quote or separator. 23 | DefEscape = "\\" 24 | ) 25 | 26 | // 27 | // Writer write records from reader or slice using format configuration in 28 | // metadata. 29 | // 30 | type Writer struct { 31 | Config `json:"-"` 32 | // Output file where the records will be written. 33 | Output string `json:"Output"` 34 | // OutputMetadata define format for each column. 
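// Name is used to match the corresponding input column, while LeftQuote,
// RightQuote, and Separator control how each record is written (see
// WriteRow).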
35 | OutputMetadata []Metadata `json:"OutputMetadata"` 36 | // fWriter as write descriptor. 37 | fWriter *os.File 38 | // BufWriter for buffered writer. 39 | BufWriter *bufio.Writer 40 | } 41 | 42 | // 43 | // NewWriter create a writer object. 44 | // User must call Open after that to populate the output and metadata. 45 | // 46 | func NewWriter(config string) (writer *Writer, e error) { 47 | writer = &Writer{ 48 | Output: "", 49 | OutputMetadata: nil, 50 | fWriter: nil, 51 | BufWriter: nil, 52 | } 53 | 54 | if config == "" { 55 | return 56 | } 57 | 58 | e = OpenWriter(writer, config) 59 | if e != nil { 60 | return nil, e 61 | } 62 | 63 | return 64 | } 65 | 66 | // 67 | // GetOutput return output filename. 68 | // 69 | func (writer *Writer) GetOutput() string { 70 | return writer.Output 71 | } 72 | 73 | // 74 | // SetOutput will set the output file to path. 75 | // 76 | func (writer *Writer) SetOutput(path string) { 77 | writer.Output = path 78 | } 79 | 80 | // 81 | // AddMetadata will add new output metadata to writer. 82 | // 83 | func (writer *Writer) AddMetadata(md Metadata) { 84 | writer.OutputMetadata = append(writer.OutputMetadata, md) 85 | } 86 | 87 | // 88 | // open a generic method to open output file with specific flag. 89 | // 90 | func (writer *Writer) open(file string, flag int) (e error) { 91 | if file == "" { 92 | if writer.Output == "" { 93 | file = DefOutput 94 | } else { 95 | file = writer.Output 96 | } 97 | } 98 | 99 | writer.fWriter, e = os.OpenFile(file, flag, 0600) 100 | if nil != e { 101 | return e 102 | } 103 | 104 | writer.BufWriter = bufio.NewWriter(writer.fWriter) 105 | 106 | return nil 107 | } 108 | 109 | // 110 | // OpenOutput file and buffered writer. 111 | // File will be truncated if its exist. 112 | // 113 | func (writer *Writer) OpenOutput(file string) (e error) { 114 | return writer.open(file, os.O_CREATE|os.O_TRUNC|os.O_WRONLY) 115 | } 116 | 117 | // 118 | // ReopenOutput will open the output file back without truncating the content. 119 | // 120 | func (writer *Writer) ReopenOutput(file string) (e error) { 121 | if e = writer.Close(); e != nil { 122 | return 123 | } 124 | return writer.open(file, os.O_CREATE|os.O_APPEND|os.O_WRONLY) 125 | } 126 | 127 | // 128 | // Flush output buffer to disk. 129 | // 130 | func (writer *Writer) Flush() error { 131 | return writer.BufWriter.Flush() 132 | } 133 | 134 | // 135 | // Close all open descriptor. 136 | // 137 | func (writer *Writer) Close() (e error) { 138 | if nil != writer.BufWriter { 139 | e = writer.BufWriter.Flush() 140 | if e != nil { 141 | return 142 | } 143 | } 144 | if nil != writer.fWriter { 145 | e = writer.fWriter.Close() 146 | } 147 | return 148 | } 149 | 150 | // 151 | // WriteRow dump content of Row to file using format in metadata. 152 | // 153 | func (writer *Writer) WriteRow(row *tabula.Row, recordMd []MetadataInterface) ( 154 | e error, 155 | ) { 156 | nRecord := row.Len() 157 | v := []byte{} 158 | esc := []byte(DefEscape) 159 | 160 | for i := range writer.OutputMetadata { 161 | md := writer.OutputMetadata[i] 162 | 163 | // find the input index based on name on record metadata. 164 | rIdx, mdMatch := FindMetadata(&md, recordMd) 165 | 166 | // No input metadata matched? skip it too. 167 | if rIdx >= nRecord { 168 | continue 169 | } 170 | 171 | // If input column is ignored, continue to next record. 
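// An input column is ignored when its metadata has Skip set to true.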
172 | if mdMatch != nil && mdMatch.GetSkip() { 173 | continue 174 | } 175 | 176 | recV := (*row)[rIdx].Bytes() 177 | lq := md.GetLeftQuote() 178 | 179 | if "" != lq { 180 | v = append(v, []byte(lq)...) 181 | } 182 | 183 | rq := md.GetRightQuote() 184 | sep := md.GetSeparator() 185 | 186 | // Escape the escape character itself. 187 | if md.T == tabula.TString { 188 | recV, _ = tekstus.BytesEncapsulate(esc, recV, esc, nil) 189 | } 190 | 191 | // Escape the right quote in field content before writing it. 192 | if "" != rq && md.T == tabula.TString { 193 | recV, _ = tekstus.BytesEncapsulate([]byte(rq), recV, 194 | esc, nil) 195 | } else { 196 | // Escape the separator 197 | if "" != sep && md.T == tabula.TString { 198 | recV, _ = tekstus.BytesEncapsulate([]byte(sep), 199 | recV, esc, nil) 200 | } 201 | } 202 | 203 | v = append(v, recV...) 204 | 205 | if "" != rq { 206 | v = append(v, []byte(rq)...) 207 | } 208 | 209 | if "" != sep { 210 | v = append(v, []byte(sep)...) 211 | } 212 | } 213 | 214 | v = append(v, DefEOL) 215 | 216 | _, e = writer.BufWriter.Write(v) 217 | 218 | return e 219 | } 220 | 221 | // 222 | // WriteRows will loop each row in the list of rows and write their content to 223 | // output file. 224 | // Return n for number of row written, and e if error happened. 225 | // 226 | func (writer *Writer) WriteRows(rows tabula.Rows, recordMd []MetadataInterface) ( 227 | n int, 228 | e error, 229 | ) { 230 | for n = range rows { 231 | e = writer.WriteRow(rows[n], recordMd) 232 | if nil != e { 233 | break 234 | } 235 | } 236 | 237 | _ = writer.Flush() 238 | return 239 | } 240 | 241 | // 242 | // WriteColumns will write content of columns to output file. 243 | // Return n for number of row written, and e if error happened. 244 | // 245 | func (writer *Writer) WriteColumns(columns tabula.Columns, 246 | colMd []MetadataInterface, 247 | ) ( 248 | n int, 249 | e error, 250 | ) { 251 | nColumns := len(columns) 252 | if nColumns <= 0 { 253 | return 254 | } 255 | 256 | emptyRec := tabula.NewRecordString("") 257 | 258 | // Get minimum and maximum length of all columns. 259 | // In case one of the column have different length (shorter or longer), 260 | // we will take the column with minimum length first and continue with 261 | // the maximum length. 262 | 263 | minlen, maxlen := columns.GetMinMaxLength() 264 | 265 | // If metadata is nil, generate it from column name. 266 | if colMd == nil { 267 | for _, col := range columns { 268 | md := &Metadata{ 269 | Name: col.Name, 270 | T: col.Type, 271 | } 272 | 273 | colMd = append(colMd, md) 274 | } 275 | } 276 | 277 | // First loop, iterate until minimum column length. 278 | row := make(tabula.Row, nColumns) 279 | 280 | for ; n < minlen; n++ { 281 | // Convert columns to record. 282 | for y, col := range columns { 283 | row[y] = col.Records[n] 284 | } 285 | 286 | e = writer.WriteRow(&row, colMd) 287 | if e != nil { 288 | goto err 289 | } 290 | } 291 | 292 | // Second loop, iterate until maximum column length. 293 | for ; n < maxlen; n++ { 294 | // Convert columns to record. 295 | for y, col := range columns { 296 | if col.Len() > n { 297 | row[y] = col.Records[n] 298 | } else { 299 | row[y] = emptyRec 300 | } 301 | } 302 | 303 | e = writer.WriteRow(&row, colMd) 304 | if e != nil { 305 | goto err 306 | } 307 | } 308 | 309 | err: 310 | _ = writer.Flush() 311 | return n, e 312 | } 313 | 314 | // 315 | // WriteRawRow will write row data using separator `sep` for each record. 
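// If sep or esc is nil, DefSeparator and DefEscape are used. Any separator
// that appears inside a record of type string is escaped with esc before
// writing.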
316 | // 317 | func (writer *Writer) WriteRawRow(row *tabula.Row, sep, esc []byte) (e error) { 318 | if sep == nil { 319 | sep = []byte(DefSeparator) 320 | } 321 | if esc == nil { 322 | esc = []byte(DefEscape) 323 | } 324 | 325 | v := []byte{} 326 | for x, rec := range *row { 327 | if x > 0 { 328 | v = append(v, sep...) 329 | } 330 | 331 | recV := rec.Bytes() 332 | 333 | if rec.Type() == tabula.TString { 334 | recV, _ = tekstus.BytesEncapsulate(sep, recV, esc, nil) 335 | } 336 | 337 | v = append(v, recV...) 338 | } 339 | 340 | v = append(v, DefEOL) 341 | 342 | _, e = writer.BufWriter.Write(v) 343 | 344 | _ = writer.Flush() 345 | 346 | return e 347 | } 348 | 349 | // 350 | // WriteRawRows write rows data using separator `sep` for each record. 351 | // We use pointer in separator parameter, so we can use empty string as 352 | // separator. 353 | // 354 | func (writer *Writer) WriteRawRows(rows *tabula.Rows, sep *string) ( 355 | nrow int, 356 | e error, 357 | ) { 358 | nrow = len(*rows) 359 | if nrow <= 0 { 360 | return 361 | } 362 | 363 | if sep == nil { 364 | sep = new(string) 365 | *sep = DefSeparator 366 | } 367 | 368 | escbytes := []byte(DefEscape) 369 | sepbytes := []byte(*sep) 370 | x := 0 371 | 372 | for ; x < nrow; x++ { 373 | e = writer.WriteRawRow((*rows)[x], sepbytes, escbytes) 374 | if nil != e { 375 | break 376 | } 377 | } 378 | 379 | return x, e 380 | } 381 | 382 | // 383 | // WriteRawColumns write raw columns using separator `sep` for each record to 384 | // file. 385 | // 386 | // We use pointer in separator parameter, so we can use empty string as 387 | // separator. 388 | // 389 | func (writer *Writer) WriteRawColumns(cols *tabula.Columns, sep *string) ( 390 | nrow int, 391 | e error, 392 | ) { 393 | ncol := len(*cols) 394 | if ncol <= 0 { 395 | return 396 | } 397 | 398 | if sep == nil { 399 | sep = new(string) 400 | *sep = DefSeparator 401 | } 402 | 403 | // Find minimum and maximum column length. 404 | minlen, maxlen := cols.GetMinMaxLength() 405 | 406 | esc := []byte(DefEscape) 407 | sepbytes := []byte(*sep) 408 | x := 0 409 | 410 | // First, write until minimum column length. 411 | for ; x < minlen; x++ { 412 | v := cols.Join(x, sepbytes, esc) 413 | v = append(v, DefEOL) 414 | 415 | _, e = writer.BufWriter.Write(v) 416 | 417 | if nil != e { 418 | return x, e 419 | } 420 | } 421 | 422 | // and then write column until max length. 423 | for ; x < maxlen; x++ { 424 | v := cols.Join(x, sepbytes, esc) 425 | v = append(v, DefEOL) 426 | 427 | _, e = writer.BufWriter.Write(v) 428 | 429 | if nil != e { 430 | break 431 | } 432 | } 433 | 434 | _ = writer.Flush() 435 | return x, e 436 | } 437 | 438 | // 439 | // WriteRawDataset will write content of dataset to file without metadata but 440 | // using separator `sep` for each record. 441 | // 442 | // We use pointer in separator parameter, so we can use empty string as 443 | // separator. 
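//
// A minimal usage sketch, assuming ds is a dataset that has been read and
// the writer output is already open:
//
//	sep := "\t"
//	n, e := writer.WriteRawDataset(ds, &sep)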
444 | // 445 | func (writer *Writer) WriteRawDataset(dataset tabula.DatasetInterface, 446 | sep *string, 447 | ) ( 448 | int, error, 449 | ) { 450 | if nil == writer.fWriter { 451 | return 0, ErrNotOpen 452 | } 453 | if nil == dataset { 454 | return 0, nil 455 | } 456 | if sep == nil { 457 | sep = new(string) 458 | *sep = DefSeparator 459 | } 460 | 461 | var rows *tabula.Rows 462 | 463 | switch dataset.GetMode() { 464 | case tabula.DatasetModeColumns: 465 | cols := dataset.GetDataAsColumns() 466 | return writer.WriteRawColumns(cols, sep) 467 | case tabula.DatasetModeRows, tabula.DatasetModeMatrix: 468 | fallthrough 469 | default: 470 | rows = dataset.GetDataAsRows() 471 | } 472 | 473 | return writer.WriteRawRows(rows, sep) 474 | } 475 | 476 | // 477 | // Write rows from Reader to file. 478 | // Return n for number of row written, or e if error happened. 479 | // 480 | func (writer *Writer) Write(reader ReaderInterface) (int, error) { 481 | if nil == reader { 482 | return 0, ErrNilReader 483 | } 484 | if nil == writer.fWriter { 485 | return 0, ErrNotOpen 486 | } 487 | 488 | ds := reader.GetDataset().(tabula.DatasetInterface) 489 | 490 | var rows *tabula.Rows 491 | 492 | switch ds.GetMode() { 493 | case tabula.DatasetModeColumns: 494 | cols := ds.GetDataAsColumns() 495 | return writer.WriteColumns(*cols, reader.GetInputMetadata()) 496 | case tabula.DatasetModeRows, tabula.DatasetModeMatrix: 497 | fallthrough 498 | default: 499 | rows = ds.GetDataAsRows() 500 | } 501 | 502 | return writer.WriteRows(*rows, reader.GetInputMetadata()) 503 | } 504 | 505 | // 506 | // String yes, it will print it in JSON like format. 507 | // 508 | func (writer *Writer) String() string { 509 | r, e := json.MarshalIndent(writer, "", "\t") 510 | 511 | if nil != e { 512 | log.Print(e) 513 | } 514 | 515 | return string(r) 516 | } 517 | -------------------------------------------------------------------------------- /reader.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan . All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dsv 6 | 7 | import ( 8 | "bufio" 9 | "github.com/shuLhan/tabula" 10 | "log" 11 | "os" 12 | "strings" 13 | ) 14 | 15 | const ( 16 | // DatasetModeROWS is a string representation of output mode rows. 17 | DatasetModeROWS = "ROWS" 18 | // DatasetModeCOLUMNS is a string representation of output mode columns. 19 | DatasetModeCOLUMNS = "COLUMNS" 20 | // DatasetModeMATRIX will save data in rows and columns. This mode will 21 | // consume more memory that "rows" and "columns" but give greater 22 | // flexibility when working with data. 23 | DatasetModeMATRIX = "MATRIX" 24 | ) 25 | 26 | // 27 | // Reader hold all configuration, metadata and input data. 28 | // 29 | // DSV Reader work like this, 30 | // 31 | // (1) Initialize new dsv reader object 32 | // 33 | // dsvReader, e := dsv.NewReader(configfile) 34 | // 35 | // (2) Do not forget to check for error ... 36 | // 37 | // if e != nil { 38 | // // handle error 39 | // } 40 | // 41 | // (3) Make sure to close all files after finished 42 | // 43 | // defer dsvReader.Close () 44 | // 45 | // (4) Create loop to read input data 46 | // 47 | // for { 48 | // n, e := dsv.Read (dsvReader) 49 | // 50 | // if e == io.EOF { 51 | // break 52 | // } 53 | // 54 | // (4.1) Iterate through rows 55 | // 56 | // for row := range dsvReader.GetDataAsRows() { 57 | // // work with row ... 
58 | // } 59 | // } 60 | // 61 | // Thats it. 62 | // 63 | // 64 | type Reader struct { 65 | // Config define path of configuration file. 66 | // 67 | // If the configuration located in other directory, e.g. 68 | // "../../config.dsv", and the Input option is set with name only, like 69 | // "input.dat", we assume that its in the same directory where the 70 | // configuration file belong. 71 | Config 72 | // Dataset contains the content of input file after read. 73 | dataset interface{} 74 | // Input file, mandatory. 75 | Input string `json:"Input"` 76 | // Skip n lines from the head. 77 | Skip int `json:"Skip"` 78 | // TrimSpace or not. If its true, before parsing the line, the white 79 | // space in the beginning and end of each input line will be removed, 80 | // otherwise it will leave unmodified. Default is true. 81 | TrimSpace bool `json:"TrimSpace"` 82 | // Rejected is the file name where row that does not fit 83 | // with metadata will be saved. 84 | Rejected string `json:"Rejected"` 85 | // InputMetadata define format for each column in input data. 86 | InputMetadata []Metadata `json:"InputMetadata"` 87 | // MaxRows define maximum row that this reader will read and 88 | // saved in the memory at one read operation. 89 | // If the value is -1, all rows will read. 90 | MaxRows int `json:"MaxRows"` 91 | // DatasetMode define on how do you want the result is saved. There are 92 | // three options: either in "rows", "columns", or "matrix" mode. 93 | // For example, input data file, 94 | // 95 | // a,b,c 96 | // 1,2,3 97 | // 98 | // "rows" mode is where each line saved in its own slice, resulting 99 | // in Rows: 100 | // 101 | // [a b c] 102 | // [1 2 3] 103 | // 104 | // "columns" mode is where each line saved by columns, resulting in 105 | // Columns: 106 | // 107 | // [a 1] 108 | // [b 2] 109 | // [c 3] 110 | // 111 | // "matrix" mode is where each record saved in their own row and column. 112 | // 113 | DatasetMode string `json:"DatasetMode"` 114 | // fRead is read descriptor. 115 | fRead *os.File 116 | // fReject is reject descriptor. 117 | fReject *os.File 118 | // bufRead is a buffer for working with input file. 119 | bufRead *bufio.Reader 120 | // bufReject is a buffer for working with rejected file. 121 | bufReject *bufio.Writer 122 | } 123 | 124 | // 125 | // NewReader create and initialize new instance of DSV Reader with default values. 126 | // 127 | func NewReader(config string, dataset interface{}) (reader *Reader, e error) { 128 | reader = &Reader{ 129 | Input: "", 130 | Skip: 0, 131 | TrimSpace: true, 132 | Rejected: DefaultRejected, 133 | InputMetadata: nil, 134 | MaxRows: DefaultMaxRows, 135 | DatasetMode: DefDatasetMode, 136 | dataset: dataset, 137 | fRead: nil, 138 | fReject: nil, 139 | bufRead: nil, 140 | bufReject: nil, 141 | } 142 | 143 | e = reader.Init(config, dataset) 144 | if e != nil { 145 | return nil, e 146 | } 147 | 148 | return 149 | } 150 | 151 | // 152 | // Init will initialize reader object by 153 | // 154 | // (1) Check if dataset is not empty. 155 | // (2) Read config file. 156 | // (3) Set reader object default value. 157 | // (4) Check if output mode is valid and initialize it if valid. 158 | // (5) Check and initialize metadata and columns attributes. 159 | // (6) Check if Input is name only without path, so we can prefix it with 160 | // config path. 161 | // (7) Open rejected file. 162 | // (8) Open input file. 
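//
// For example, to initialize a plain Reader with a configuration file:
//
//	reader := &dsv.Reader{}
//	e := reader.Init("testdata/config.dsv", nil)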
163 | // 164 | func (reader *Reader) Init(fcfg string, dataset interface{}) (e error) { 165 | // (1) 166 | if dataset == nil { 167 | dataset = reader.GetDataset() 168 | if dataset == nil { 169 | dataset = &tabula.Dataset{} 170 | reader.dataset = dataset 171 | } 172 | } 173 | 174 | // (2) 175 | fcfg = strings.TrimSpace(fcfg) 176 | if fcfg != "" { 177 | e = ConfigOpen(reader, fcfg) 178 | if e != nil { 179 | return e 180 | } 181 | 182 | e = tabula.ReadDatasetConfig(dataset, fcfg) 183 | if e != nil { 184 | return e 185 | } 186 | } 187 | 188 | // (3) 189 | reader.SetDefault() 190 | 191 | // (4) 192 | reader.SetDatasetMode(reader.GetDatasetMode()) 193 | 194 | // (5) 195 | ds := dataset.(tabula.DatasetInterface) 196 | md := reader.GetInputMetadata() 197 | for i := range md { 198 | md[i].Init() 199 | 200 | // Count number of output columns. 201 | if !md[i].GetSkip() { 202 | // add type of metadata to list of type 203 | col := tabula.Column{ 204 | Type: md[i].GetType(), 205 | Name: md[i].GetName(), 206 | ValueSpace: md[i].GetValueSpace(), 207 | } 208 | ds.PushColumn(col) 209 | } 210 | } 211 | 212 | // (6) 213 | reader.SetInput(ConfigCheckPath(reader, reader.GetInput())) 214 | reader.SetRejected(ConfigCheckPath(reader, reader.GetRejected())) 215 | 216 | // (7) 217 | e = reader.OpenRejected() 218 | if nil != e { 219 | return 220 | } 221 | 222 | // (8) 223 | e = reader.OpenInput() 224 | if nil != e { 225 | return 226 | } 227 | 228 | return 229 | } 230 | 231 | // 232 | // SetDefault options for global config and each metadata. 233 | // 234 | func (reader *Reader) SetDefault() { 235 | if "" == strings.TrimSpace(reader.Rejected) { 236 | reader.Rejected = DefaultRejected 237 | } 238 | if 0 == reader.MaxRows { 239 | reader.MaxRows = DefaultMaxRows 240 | } 241 | if "" == strings.TrimSpace(reader.DatasetMode) { 242 | reader.DatasetMode = DefDatasetMode 243 | } 244 | if nil == reader.dataset { 245 | reader.dataset = &tabula.Dataset{} 246 | } 247 | } 248 | 249 | // 250 | // CopyConfig copy configuration from other reader object not including data 251 | // and metadata. 252 | // 253 | func (reader *Reader) CopyConfig(src *Reader) { 254 | reader.ConfigPath = src.GetConfigPath() 255 | reader.Input = src.GetInput() 256 | reader.Skip = src.GetSkip() 257 | reader.TrimSpace = src.IsTrimSpace() 258 | reader.Rejected = src.GetRejected() 259 | reader.MaxRows = src.GetMaxRows() 260 | reader.DatasetMode = src.GetDatasetMode() 261 | } 262 | 263 | // 264 | // GetInput return the input file. 265 | // 266 | func (reader *Reader) GetInput() string { 267 | return reader.Input 268 | } 269 | 270 | // 271 | // SetInput file. 272 | // 273 | func (reader *Reader) SetInput(path string) { 274 | reader.Input = path 275 | } 276 | 277 | // 278 | // GetSkip return number of line that will be skipped. 279 | // 280 | func (reader *Reader) GetSkip() int { 281 | return reader.Skip 282 | } 283 | 284 | // 285 | // SetSkip set number of lines that will be skipped before reading actual data. 286 | // 287 | func (reader *Reader) SetSkip(n int) { 288 | reader.Skip = n 289 | } 290 | 291 | // 292 | // IsTrimSpace return value of TrimSpace option. 293 | // 294 | func (reader *Reader) IsTrimSpace() bool { 295 | return reader.TrimSpace 296 | } 297 | 298 | // 299 | // GetRejected return name of rejected file. 300 | // 301 | func (reader *Reader) GetRejected() string { 302 | return reader.Rejected 303 | } 304 | 305 | // 306 | // SetRejected file. 
307 | // 308 | func (reader *Reader) SetRejected(path string) { 309 | reader.Rejected = path 310 | } 311 | 312 | // 313 | // AddInputMetadata add new input metadata to reader. 314 | // 315 | func (reader *Reader) AddInputMetadata(md *Metadata) { 316 | reader.InputMetadata = append(reader.InputMetadata, *md) 317 | ds := reader.dataset.(tabula.DatasetInterface) 318 | ds.AddColumn(md.GetType(), md.GetName(), md.GetValueSpace()) 319 | } 320 | 321 | // 322 | // AppendMetadata will append new metadata `md` to list of reader input metadata. 323 | // 324 | func (reader *Reader) AppendMetadata(mdi MetadataInterface) { 325 | md := mdi.(*Metadata) 326 | reader.InputMetadata = append(reader.InputMetadata, *md) 327 | } 328 | 329 | // 330 | // GetInputMetadata return pointer to slice of metadata. 331 | // 332 | func (reader *Reader) GetInputMetadata() []MetadataInterface { 333 | md := make([]MetadataInterface, len(reader.InputMetadata)) 334 | for i := range reader.InputMetadata { 335 | md[i] = &reader.InputMetadata[i] 336 | } 337 | 338 | return md 339 | } 340 | 341 | // 342 | // GetInputMetadataAt return pointer to metadata at index 'idx'. 343 | // 344 | func (reader *Reader) GetInputMetadataAt(idx int) MetadataInterface { 345 | return &reader.InputMetadata[idx] 346 | } 347 | 348 | // 349 | // GetMaxRows return number of maximum rows for reading. 350 | // 351 | func (reader *Reader) GetMaxRows() int { 352 | return reader.MaxRows 353 | } 354 | 355 | // 356 | // SetMaxRows will set maximum rows that will be read from input file. 357 | // 358 | func (reader *Reader) SetMaxRows(max int) { 359 | reader.MaxRows = max 360 | } 361 | 362 | // 363 | // GetDatasetMode return output mode of data. 364 | // 365 | func (reader *Reader) GetDatasetMode() string { 366 | return reader.DatasetMode 367 | } 368 | 369 | // 370 | // SetDatasetMode to `mode`. 371 | // 372 | func (reader *Reader) SetDatasetMode(mode string) { 373 | ds := reader.dataset.(tabula.DatasetInterface) 374 | switch strings.ToUpper(mode) { 375 | case DatasetModeROWS: 376 | ds.SetMode(tabula.DatasetModeRows) 377 | case DatasetModeCOLUMNS: 378 | ds.SetMode(tabula.DatasetModeColumns) 379 | case DatasetModeMATRIX: 380 | fallthrough 381 | default: 382 | ds.SetMode(tabula.DatasetModeMatrix) 383 | mode = DatasetModeMATRIX 384 | } 385 | reader.DatasetMode = mode 386 | } 387 | 388 | // 389 | // GetNColumnIn return number of input columns, or number of metadata, including 390 | // column with Skip=true. 391 | // 392 | func (reader *Reader) GetNColumnIn() int { 393 | return len(reader.InputMetadata) 394 | } 395 | 396 | // 397 | // OpenInput open the input file, metadata must have been initialize. 398 | // 399 | func (reader *Reader) OpenInput() (e error) { 400 | reader.fRead, e = os.OpenFile(reader.Input, os.O_RDONLY, 0600) 401 | if nil != e { 402 | return e 403 | } 404 | 405 | reader.bufRead = bufio.NewReader(reader.fRead) 406 | 407 | // Skip lines 408 | if reader.GetSkip() > 0 { 409 | e = reader.SkipLines() 410 | 411 | if nil != e { 412 | return 413 | } 414 | } 415 | 416 | return nil 417 | } 418 | 419 | // 420 | // OpenRejected open rejected file, for saving unparseable line. 421 | // 422 | func (reader *Reader) OpenRejected() (e error) { 423 | reader.fReject, e = os.OpenFile(reader.Rejected, 424 | os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0600) 425 | if nil != e { 426 | return e 427 | } 428 | 429 | reader.bufReject = bufio.NewWriter(reader.fReject) 430 | 431 | return nil 432 | } 433 | 434 | // 435 | // Open input and rejected file. 
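// Any descriptors from a previous call are closed first, so calling Open
// repeatedly does not leak file descriptors.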
436 | // 437 | func (reader *Reader) Open() (e error) { 438 | // do not let file descriptor leaked 439 | e = reader.Close() 440 | if e != nil { 441 | return 442 | } 443 | 444 | e = reader.OpenInput() 445 | if e != nil { 446 | return 447 | } 448 | 449 | e = reader.OpenRejected() 450 | 451 | return 452 | } 453 | 454 | // 455 | // SkipLines skip parsing n lines from input file. 456 | // The n is defined in the attribute "Skip" 457 | // 458 | func (reader *Reader) SkipLines() (e error) { 459 | for i := 0; i < reader.Skip; i++ { 460 | _, e = reader.ReadLine() 461 | 462 | if nil != e { 463 | log.Print("dsv: ", e) 464 | return 465 | } 466 | } 467 | return 468 | } 469 | 470 | // 471 | // Reset all variables for next read operation. Number of rows will be 0, and 472 | // Rows will be empty again. 473 | // 474 | func (reader *Reader) Reset() (e error) { 475 | e = reader.Flush() 476 | if e != nil { 477 | return 478 | } 479 | e = reader.dataset.(tabula.DatasetInterface).Reset() 480 | return 481 | } 482 | 483 | // 484 | // Flush all output buffer. 485 | // 486 | func (reader *Reader) Flush() error { 487 | return reader.bufReject.Flush() 488 | } 489 | 490 | // 491 | // ReadLine will read one line from input file. 492 | // 493 | func (reader *Reader) ReadLine() (line []byte, e error) { 494 | line, e = reader.bufRead.ReadBytes(DefEOL) 495 | 496 | if e == nil { 497 | // remove EOL 498 | line = line[:len(line)-1] 499 | } 500 | 501 | return 502 | } 503 | 504 | // 505 | // FetchNextLine read the next line and combine it with the `lastline`. 506 | // 507 | func (reader *Reader) FetchNextLine(lastline []byte) (line []byte, e error) { 508 | line, e = reader.ReadLine() 509 | 510 | lastline = append(lastline, DefEOL) 511 | lastline = append(lastline, line...) 512 | 513 | return lastline, e 514 | } 515 | 516 | // 517 | // Reject the line and save it to the reject file. 518 | // 519 | func (reader *Reader) Reject(line []byte) (int, error) { 520 | return reader.bufReject.Write(line) 521 | } 522 | 523 | // 524 | // deleteEmptyRejected if rejected file is empty, delete it. 525 | // 526 | func (reader *Reader) deleteEmptyRejected() { 527 | finfo, e := os.Stat(reader.Rejected) 528 | if e != nil { 529 | return 530 | } 531 | 532 | if finfo.Size() >= 0 { 533 | _ = os.Remove(reader.Rejected) 534 | } 535 | } 536 | 537 | // 538 | // Close all open descriptors. 539 | // 540 | func (reader *Reader) Close() (e error) { 541 | if nil != reader.bufReject { 542 | e = reader.bufReject.Flush() 543 | if e != nil { 544 | return 545 | } 546 | } 547 | if nil != reader.fReject { 548 | e = reader.fReject.Close() 549 | if e != nil { 550 | return 551 | } 552 | } 553 | 554 | reader.deleteEmptyRejected() 555 | 556 | if nil != reader.fRead { 557 | e = reader.fRead.Close() 558 | } 559 | return 560 | } 561 | 562 | // 563 | // IsEqual compare only the configuration and metadata with other instance. 564 | // 565 | func (reader *Reader) IsEqual(other *Reader) bool { 566 | if reader == other { 567 | return true 568 | } 569 | if reader.Input != other.Input { 570 | return false 571 | } 572 | 573 | l, r := len(reader.InputMetadata), len(other.InputMetadata) 574 | 575 | if l != r { 576 | return false 577 | } 578 | 579 | for a := 0; a < l; a++ { 580 | if !reader.InputMetadata[a].IsEqual(&other.InputMetadata[a]) { 581 | return false 582 | } 583 | } 584 | 585 | return true 586 | } 587 | 588 | // 589 | // GetDataset return reader dataset. 
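// The dataset is stored as interface{}; callers are expected to assert it
// to tabula.DatasetInterface, e.g.
// reader.GetDataset().(tabula.DatasetInterface).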
590 | // 591 | func (reader *Reader) GetDataset() interface{} { 592 | return reader.dataset 593 | } 594 | 595 | // 596 | // MergeColumns append metadata and columns from another reader if not exist in 597 | // current metadata set. 598 | // 599 | func (reader *Reader) MergeColumns(other ReaderInterface) { 600 | for _, md := range other.GetInputMetadata() { 601 | if md.GetSkip() { 602 | continue 603 | } 604 | 605 | // Check if the same metadata name exist in current dataset. 606 | found := false 607 | for _, lmd := range reader.GetInputMetadata() { 608 | if lmd.GetName() == md.GetName() { 609 | found = true 610 | break 611 | } 612 | } 613 | 614 | if found { 615 | continue 616 | } 617 | 618 | reader.AppendMetadata(md) 619 | } 620 | 621 | reader.dataset.(tabula.DatasetInterface).MergeColumns( 622 | other.GetDataset().(tabula.DatasetInterface)) 623 | } 624 | 625 | // 626 | // MergeRows append rows from another reader. 627 | // 628 | func (reader *Reader) MergeRows(other *Reader) { 629 | reader.dataset.(tabula.DatasetInterface).MergeRows( 630 | other.GetDataset().(tabula.DatasetInterface)) 631 | } 632 | --------------------------------------------------------------------------------