├── testdata ├── claset.dsv ├── writeraw.exp ├── expected_skip.dat ├── expected.dat ├── input.dat ├── expected_merge_rows.dat ├── expected_merge_columns.dat ├── expected_simplemerge.dat ├── config.dsv ├── config_simpleread.dsv └── config_skip.dsv ├── .gitignore ├── config.go ├── claset_test.go ├── metadata_test.go ├── writerinterface.go ├── metadatainterface.go ├── configinterface.go ├── readererror.go ├── LICENSE ├── dsvinterface.go ├── dsv_test.go ├── data_test.go ├── writer_test.go ├── dsv.go ├── common_test.go ├── metadata.go ├── readerinterface.go ├── README.md ├── reader_test.go ├── writer.go └── reader.go /testdata/claset.dsv: -------------------------------------------------------------------------------- 1 | { 2 | "Input" :"input.dat" 3 | , "Rejected" :"rejected.dat" 4 | , "Skip" :1 5 | , "MaxRows" :2 6 | , "ClassIndex" :3 7 | } 8 | -------------------------------------------------------------------------------- /testdata/writeraw.exp: -------------------------------------------------------------------------------- 1 | 0,1,A 2 | 1,1.1,B 3 | 2,1.2,A 4 | 3,1.3,B 5 | 4,1.4,C 6 | 5,1.5,D 7 | 6,1.6,C 8 | 7,1.7,D 9 | 8,1.8,E 10 | 9,1.9,F 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | rejected.dat 2 | testdata/output.dat 3 | testdata/output_merge_columns.dat 4 | testdata/output_merge_rows.dat 5 | testdata/output_skip.dat 6 | testdata/rejected.dat 7 | testdata/writerawcolumns.out 8 | testdata/writerawrows.out 9 | -------------------------------------------------------------------------------- /testdata/expected_skip.dat: -------------------------------------------------------------------------------- 1 | 0.1;1;{{AB}};A-B# 2 | 0.02;2;{{BCD}};A-B-C# 3 | 0.003;3;{{A;B C,D}};A;B-C,D# 4 | 0.0004;4;{{A;B C D}};A;B-C,D# 5 | 0.000006;6;{{}};# 6 | 0.000000009;9;{{missing right-quote];8;0.00000008 7 | 9;"ok"-[[ok}};ok# 8 | 0.101;10;{{integer}};test# 9 | 0.123456789;123456789;{{real}};test# 10 | 13;13;{{string with}};string with# 11 | 14;14;{{string with]] escape}};string with" quote# 12 | -------------------------------------------------------------------------------- /testdata/expected.dat: -------------------------------------------------------------------------------- 1 | ID 1/A-B# {{AB}};1;0.1 2 | ID 2/A-B-C# {{BCD}};2;0.02 3 | ID 3/A;B-C,D# {{A;B C,D}};3;0.003 4 | ID 4/A;B-C,D# {{A;B C D}};4;0.0004 5 | ID 6/# {{}};6;0.000006 6 | ID 8/ok# {{missing right-quote];8;0.00000008 7 | 9;"ok"-[[ok}};9;0.000000009 8 | ID 10/test# {{integer}};10;0.101 9 | ID 12/test# {{real}};123456789;0.123456789 10 | ID 13/string with# {{string with}};13;13 11 | ID 14/string with" quote# {{string with]] escape}};14;14 12 | -------------------------------------------------------------------------------- /testdata/input.dat: -------------------------------------------------------------------------------- 1 | "id","name","value","integer";"real" 2 | 1;"A-B"-[[AB]];1;0.1 3 | 2;"A-B-C"-[[BCD]];2;0.02 4 | 3;"A;B-C,D"-[[A;B C,D]];3;0.003 5 | 4;"A;B-C,D"-[[A;B C D]];4;0.0004 6 | 5;"A;B-C,D-"[[A;B C D]];5;0.00005 7 | 6;""-[[]];6;0.000006 8 | 7;"ok"-[missing left-quote]];7;0.0000007 9 | 8;"ok"-[[missing right-quote];8;0.00000008 10 | 9;"ok"-[[ok]];9;0.000000009 11 | 10;"test"-[[integer]];010;0.101 12 | 11;"test"-[[integer]];1a;0.1001 13 | 12;"test"-[[real]];123456789;00.123456789 14 | 13;"string with" quote"-[[string with]];13;13.0 15 | 14;"string with\" quote"-[[string with\]] escape]];14;14.0 
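A note on the fixture above: each row of `input.dat` is tokenized by the `InputMetadata` in `testdata/config.dsv` (shown further below). A sketch of how the fields of line 2 line up under that configuration:

    1;"A-B"-[[AB]];1;0.1

    id      = 1     (Separator ";", Type "integer")
    name    = A-B   (LeftQuote and RightQuote "\"", then Separator "-")
    value   = AB    (LeftQuote "[[", RightQuote "]]", then Separator ";")
    integer = 1     (Separator ";", Type "integer")
    real    = 0.1   (Type "real", no separator: the rest of the line)

Lines 6, 8, and 12 deliberately violate this format (a misplaced quote, a missing left-quote, and the non-integer value `1a`), so those rows end up in `rejected.dat` and never appear in `expected.dat`. Line 9 lacks its closing `]]`, so the parser reads on into line 10 looking for it; that is why `expected.dat` shows those two input rows merged into a single record.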
16 | -------------------------------------------------------------------------------- /testdata/expected_merge_rows.dat: -------------------------------------------------------------------------------- 1 | 1 A-B AB 1 0.1 2 | 2 A-B-C BCD 2 0.02 3 | 3 A;B-C,D A;B C,D 3 0.003 4 | 4 A;B-C,D A;B C D 4 0.0004 5 | 6 6 0.000006 6 | 8 ok missing right-quote];8;0.00000008 7 | 9;"ok"-[[ok 9 0.000000009 8 | 10 test integer 10 0.101 9 | 12 test real 123456789 0.123456789 10 | 13 string with string with 13 13 11 | 14 string with" quote string with]] escape 14 14 12 | A-B AB 1 0.1 13 | A-B-C BCD 2 0.02 14 | A;B-C,D A;B C,D 3 0.003 15 | A;B-C,D A;B C D 4 0.0004 16 | 6 0.000006 17 | ok missing right-quote];8;0.00000008 18 | 9;"ok"-[[ok 9 0.000000009 19 | test integer 10 0.101 20 | test real 123456789 0.123456789 21 | string with string with 13 13 22 | string with" quote string with]] escape 14 14 23 | -------------------------------------------------------------------------------- /config.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan . All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dsv 6 | 7 | // 8 | // Config for working with DSV configuration. 9 | // 10 | type Config struct { 11 | // ConfigPath path to configuration file. 12 | ConfigPath string 13 | } 14 | 15 | // 16 | // GetConfigPath return the base path of configuration file. 17 | // 18 | func (cfg *Config) GetConfigPath() string { 19 | return cfg.ConfigPath 20 | } 21 | 22 | // 23 | // SetConfigPath for reading input and writing rejected file. 24 | // 25 | func (cfg *Config) SetConfigPath(dir string) { 26 | cfg.ConfigPath = dir 27 | } 28 | -------------------------------------------------------------------------------- /testdata/expected_merge_columns.dat: -------------------------------------------------------------------------------- 1 | 1 A-B AB 1 0.1 2 | 2 A-B-C BCD 2 0.02 3 | 3 A;B-C,D A;B C,D 3 0.003 4 | 4 A;B-C,D A;B C D 4 0.0004 5 | 6 6 0.000006 6 | 8 ok missing right-quote];8;0.00000008 7 | 9;"ok"-[[ok 9 0.000000009 8 | 10 test integer 10 0.101 9 | 12 test real 123456789 0.123456789 10 | 13 string with string with 13 13 11 | 14 string with" quote string with]] escape 14 14 12 | A-B AB 1 0.1 13 | A-B-C BCD 2 0.02 14 | A;B-C,D A;B C,D 3 0.003 15 | A;B-C,D A;B C D 4 0.0004 16 | 6 0.000006 17 | ok missing right-quote];8;0.00000008 18 | 9;"ok"-[[ok 9 0.000000009 19 | test integer 10 0.101 20 | test real 123456789 0.123456789 21 | string with string with 13 13 22 | string with" quote string with]] escape 14 14 23 | -------------------------------------------------------------------------------- /testdata/expected_simplemerge.dat: -------------------------------------------------------------------------------- 1 | ID 1/A-B# {{AB}};1;0.1 2 | ID 2/A-B-C# {{BCD}};2;0.02 3 | ID 3/A;B-C,D# {{A;B C,D}};3;0.003 4 | ID 4/A;B-C,D# {{A;B C D}};4;0.0004 5 | ID 6/# {{}};6;0.000006 6 | ID 8/ok# {{missing right-quote];8;0.00000008 7 | 9;"ok"-[[ok}};9;0.000000009 8 | ID 10/test# {{integer}};10;0.101 9 | ID 12/test# {{real}};123456789;0.123456789 10 | ID 13/string with# {{string with}};13;13 11 | ID 14/string with" quote# {{string with]] escape}};14;14 12 | ID 1/A-B# {{AB}};1;0.1 13 | ID 2/A-B-C# {{BCD}};2;0.02 14 | ID 3/A;B-C,D# {{A;B C,D}};3;0.003 15 | ID 4/A;B-C,D# {{A;B C D}};4;0.0004 16 | ID 6/# {{}};6;0.000006 17 | ID 8/ok# {{missing right-quote];8;0.00000008 18 | 
9;"ok"-[[ok}};9;0.000000009 19 | ID 10/test# {{integer}};10;0.101 20 | ID 12/test# {{real}};123456789;0.123456789 21 | ID 13/string with# {{string with}};13;13 22 | ID 14/string with" quote# {{string with]] escape}};14;14 23 | -------------------------------------------------------------------------------- /claset_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan . All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dsv_test 6 | 7 | import ( 8 | "github.com/shuLhan/dsv" 9 | "github.com/shuLhan/tabula" 10 | "testing" 11 | ) 12 | 13 | func TestReaderWithClaset(t *testing.T) { 14 | fcfg := "testdata/claset.dsv" 15 | 16 | claset := tabula.Claset{} 17 | 18 | _, e := dsv.NewReader(fcfg, &claset) 19 | if e != nil { 20 | t.Fatal(e) 21 | } 22 | 23 | assert(t, 3, claset.GetClassIndex(), true) 24 | 25 | claset.SetMajorityClass("regular") 26 | claset.SetMinorityClass("vandalism") 27 | 28 | clone := claset.Clone().(tabula.ClasetInterface) 29 | 30 | assert(t, 3, clone.GetClassIndex(), true) 31 | assert(t, "regular", clone.MajorityClass(), true) 32 | assert(t, "vandalism", clone.MinorityClass(), true) 33 | } 34 | -------------------------------------------------------------------------------- /metadata_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan . All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dsv_test 6 | 7 | import ( 8 | "github.com/shuLhan/dsv" 9 | "testing" 10 | ) 11 | 12 | func TestMetadataIsEqual(t *testing.T) { 13 | cases := []struct { 14 | in dsv.Metadata 15 | out dsv.Metadata 16 | result bool 17 | }{ 18 | { 19 | dsv.Metadata{ 20 | Name: "A", 21 | Separator: ",", 22 | }, 23 | dsv.Metadata{ 24 | Name: "A", 25 | Separator: ",", 26 | }, 27 | true, 28 | }, 29 | { 30 | dsv.Metadata{ 31 | Name: "A", 32 | Separator: ",", 33 | }, 34 | dsv.Metadata{ 35 | Name: "A", 36 | Separator: ";", 37 | }, 38 | false, 39 | }, 40 | } 41 | 42 | for _, c := range cases { 43 | r := c.in.IsEqual(&c.out) 44 | 45 | if r != c.result { 46 | t.Error("Test failed on ", c.in, c.out) 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /testdata/config.dsv: -------------------------------------------------------------------------------- 1 | { 2 | "Input" :"input.dat" 3 | , "Rejected" :"rejected.dat" 4 | , "Skip" :1 5 | , "MaxRows" :1 6 | , "InputMetadata" : 7 | [{ 8 | "Name" :"id" 9 | , "Separator" :";" 10 | , "Type" :"integer" 11 | },{ 12 | "Name" :"name" 13 | , "Separator" :"-" 14 | , "LeftQuote" :"\"" 15 | , "RightQuote" :"\"" 16 | },{ 17 | "Name" :"value" 18 | , "Separator" :";" 19 | , "LeftQuote" :"[[" 20 | , "RightQuote" :"]]" 21 | },{ 22 | "Name" :"integer" 23 | , "Type" :"integer" 24 | , "Separator" :";" 25 | },{ 26 | "Name" :"real" 27 | , "Type" :"real" 28 | }] 29 | , "Output" :"output.dat" 30 | , "OutputMetadata": 31 | [{ 32 | "Name" :"id" 33 | , "LeftQuote" :"ID " 34 | , "Separator" :"/" 35 | },{ 36 | "Name" :"name" 37 | , "RightQuote" :"#" 38 | , "Separator" :"\t" 39 | },{ 40 | "Name" :"value" 41 | , "Separator" :";" 42 | , "LeftQuote" :"{{" 43 | , "RightQuote" :"}}" 44 | },{ 45 | "Name" :"integer" 46 | , "Separator" :";" 47 | },{ 48 | "Name" :"real" 49 | }] 50 | } 51 | 
-------------------------------------------------------------------------------- /testdata/config_simpleread.dsv: -------------------------------------------------------------------------------- 1 | { 2 | "Input" :"input.dat" 3 | , "Rejected" :"rejected.dat" 4 | , "Skip" :1 5 | , "MaxRows" :-1 6 | , "InputMetadata" : 7 | [{ 8 | "Name" :"id" 9 | , "Separator" :";" 10 | , "Type" :"integer" 11 | },{ 12 | "Name" :"name" 13 | , "Separator" :"-" 14 | , "LeftQuote" :"\"" 15 | , "RightQuote" :"\"" 16 | },{ 17 | "Name" :"value" 18 | , "Separator" :";" 19 | , "LeftQuote" :"[[" 20 | , "RightQuote" :"]]" 21 | },{ 22 | "Name" :"integer" 23 | , "Type" :"integer" 24 | , "Separator" :";" 25 | },{ 26 | "Name" :"real" 27 | , "Type" :"real" 28 | }] 29 | , "Output" :"output.dat" 30 | , "OutputMetadata": 31 | [{ 32 | "Name" :"id" 33 | , "LeftQuote" :"ID " 34 | , "Separator" :"/" 35 | },{ 36 | "Name" :"name" 37 | , "RightQuote" :"#" 38 | , "Separator" :"\t" 39 | },{ 40 | "Name" :"value" 41 | , "Separator" :";" 42 | , "LeftQuote" :"{{" 43 | , "RightQuote" :"}}" 44 | },{ 45 | "Name" :"integer" 46 | , "Separator" :";" 47 | },{ 48 | "Name" :"real" 49 | }] 50 | } 51 | -------------------------------------------------------------------------------- /testdata/config_skip.dsv: -------------------------------------------------------------------------------- 1 | { 2 | "Input" :"input.dat" 3 | , "Rejected" :"rejected.dat" 4 | , "Skip" :1 5 | , "MaxRows" :1 6 | , "InputMetadata" : 7 | [{ 8 | "Name" :"id" 9 | , "Separator" :";" 10 | , "Type" :"integer" 11 | , "Skip" :true 12 | },{ 13 | "Name" :"name" 14 | , "Separator" :"-" 15 | , "LeftQuote" :"\"" 16 | , "RightQuote" :"\"" 17 | },{ 18 | "Name" :"value" 19 | , "Separator" :";" 20 | , "LeftQuote" :"[[" 21 | , "RightQuote" :"]]" 22 | },{ 23 | "Name" :"integer" 24 | , "Type" :"integer" 25 | , "Separator" :";" 26 | },{ 27 | "Name" :"real" 28 | , "Type" :"real" 29 | }] 30 | , "Output" :"testdata/output_skip.dat" 31 | , "OutputMetadata": 32 | [{ 33 | "Name" :"real" 34 | , "Separator" :";" 35 | },{ 36 | "Name" :"integer" 37 | , "Separator" :";" 38 | },{ 39 | "Name" :"value" 40 | , "Separator" :";" 41 | , "LeftQuote" :"{{" 42 | , "RightQuote" :"}}" 43 | },{ 44 | "Name" :"name" 45 | , "RightQuote" :"#" 46 | },{ 47 | "Name" :"id" 48 | , "LeftQuote" :"ID " 49 | , "Separator" :"/" 50 | },{ 51 | "Name" :"invalid" 52 | , "Separator" :";" 53 | }] 54 | } 55 | -------------------------------------------------------------------------------- /writerinterface.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan . All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dsv 6 | 7 | // 8 | // WriterInterface is an interface for writing DSV data to file. 9 | // 10 | type WriterInterface interface { 11 | ConfigInterface 12 | GetOutput() string 13 | SetOutput(path string) 14 | OpenOutput(file string) error 15 | Flush() error 16 | Close() error 17 | } 18 | 19 | // 20 | // OpenWriter configuration file and initialize the attributes. 21 | // 22 | func OpenWriter(writer WriterInterface, fcfg string) (e error) { 23 | e = ConfigOpen(writer, fcfg) 24 | if e != nil { 25 | return 26 | } 27 | 28 | return InitWriter(writer) 29 | } 30 | 31 | // 32 | // InitWriter initialize writer by opening output file. 
33 | // 34 | func InitWriter(writer WriterInterface) error { 35 | out := writer.GetOutput() 36 | 37 | // Exit immediately if no output file is defined in config. 38 | if "" == out { 39 | return ErrNoOutput 40 | } 41 | 42 | writer.SetOutput(ConfigCheckPath(writer, out)) 43 | 44 | return writer.OpenOutput("") 45 | } 46 | -------------------------------------------------------------------------------- /metadatainterface.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dsv 6 | 7 | // 8 | // MetadataInterface is the interface for field metadata. 9 | // This allows anyone to extend the DSV library, including the metadata. 10 | // 11 | type MetadataInterface interface { 12 | Init() 13 | GetName() string 14 | GetType() int 15 | GetTypeName() string 16 | GetLeftQuote() string 17 | GetRightQuote() string 18 | GetSeparator() string 19 | GetSkip() bool 20 | GetValueSpace() []string 21 | 22 | IsEqual(MetadataInterface) bool 23 | } 24 | 25 | // 26 | // FindMetadata, given a slice of metadata, find `mdin` in the slice that has the 27 | // same name, counting the index only over metadata whose Skip value is false. 28 | // If found, return the index and the matched metadata object. 29 | // If not found, return the number of non-skipped metadata as index and nil in `mdout`. 30 | // 31 | func FindMetadata(mdin MetadataInterface, mds []MetadataInterface) ( 32 | idx int, 33 | mdout MetadataInterface, 34 | ) { 35 | for _, md := range mds { 36 | if md.GetName() == mdin.GetName() { 37 | mdout = md 38 | break 39 | } 40 | if !md.GetSkip() { 41 | idx++ 42 | } 43 | } 44 | return idx, mdout 45 | } 46 | -------------------------------------------------------------------------------- /configinterface.go: -------------------------------------------------------------------------------- 1 | package dsv 2 | 3 | import ( 4 | "encoding/json" 5 | "io/ioutil" 6 | "path" 7 | ) 8 | 9 | // 10 | // ConfigInterface for reader and writer for initializing the config from JSON. 11 | // 12 | type ConfigInterface interface { 13 | GetConfigPath() string 14 | SetConfigPath(dir string) 15 | } 16 | 17 | // 18 | // ConfigOpen open the configuration file and initialize the attributes. 19 | // 20 | func ConfigOpen(rw interface{}, fcfg string) error { 21 | cfg, e := ioutil.ReadFile(fcfg) 22 | 23 | if nil != e { 24 | return e 25 | } 26 | 27 | // Get the directory where the config resides. 28 | rwconfig := rw.(ConfigInterface) 29 | rwconfig.SetConfigPath(path.Dir(fcfg)) 30 | 31 | return ConfigParse(rw, cfg) 32 | } 33 | 34 | // 35 | // ConfigParse from JSON string. 36 | // 37 | func ConfigParse(rw interface{}, cfg []byte) error { 38 | return json.Unmarshal(cfg, rw) 39 | } 40 | 41 | // 42 | // ConfigCheckPath if `file` contains no directory, prefix it with the config 43 | // path, otherwise return it unchanged. 44 | // 45 | func ConfigCheckPath(comin ConfigInterface, file string) string { 46 | dir := path.Dir(file) 47 | 48 | if dir == "." { 49 | cfgPath := comin.GetConfigPath() 50 | if cfgPath != "" && cfgPath != "." { 51 | return cfgPath + "/" + file 52 | } 53 | } 54 | 55 | // Leave the path unchanged. 56 | return file 57 | } 58 | -------------------------------------------------------------------------------- /readererror.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved.
2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dsv 6 | 7 | import ( 8 | "fmt" 9 | ) 10 | 11 | const ( 12 | _ = iota 13 | // EReadMissLeftQuote read error when no left-quote found on line. 14 | EReadMissLeftQuote 15 | // EReadMissRightQuote read error when no right-quote found on line. 16 | EReadMissRightQuote 17 | // EReadMissSeparator read error when no separator found on line. 18 | EReadMissSeparator 19 | // EReadLine error when reading line from file. 20 | EReadLine 21 | // EReadEOF error which indicated end-of-file. 22 | EReadEOF 23 | // ETypeConversion error when converting type from string to numeric or 24 | // vice versa. 25 | ETypeConversion 26 | ) 27 | 28 | // 29 | // ReaderError to handle error data and message. 30 | // 31 | type ReaderError struct { 32 | // T define type of error. 33 | T int 34 | // Func where error happened 35 | Func string 36 | // What cause the error? 37 | What string 38 | // Line define the line which cause error 39 | Line string 40 | // Pos character position which cause error 41 | Pos int 42 | // N line number 43 | N int 44 | } 45 | 46 | // 47 | // Error to string. 48 | // 49 | func (e *ReaderError) Error() string { 50 | return fmt.Sprintf("dsv.Reader.%-20s [%d:%d]: %-30s data:|%s|", e.Func, e.N, 51 | e.Pos, e.What, e.Line) 52 | } 53 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2015-2018, Shulhan (ms@kilabit.info). All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are met: 5 | 6 | * Redistributions of source code must retain the above copyright notice, this 7 | list of conditions and the following disclaimer. 8 | 9 | * Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | * Neither the name of Kilabit nor the names of its 14 | contributors may be used to endorse or promote products derived from 15 | this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY M.SHULHAN "AS IS" AND ANY EXPRESS OR IMPLIED 18 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 19 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 20 | EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 21 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 22 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 25 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 26 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
27 | 28 | --- --- --- --- --- --- --- 29 | 30 | TT TT II BB AAAA LLLLLL II KKKKKKKK 31 | TT TT II BB AA AA LL LL II KK 32 | TTTT II BB AA AA LL LL II KK 33 | TT TT II BB AAAAAAAA LLLLLL II KK 34 | TT TT II BB AA AA LL LL II KK 35 | TT TT II BBBBBBBB AA AA LLLLLL II KK 36 | 37 | Website: http://kilabit.info 38 | Contact: ms@kilabit.info 39 | -------------------------------------------------------------------------------- /dsvinterface.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dsv 6 | 7 | import ( 8 | "io" 9 | ) 10 | 11 | // 12 | // SimpleRead provide a shortcut to read data from file using configuration file 13 | // from `fcfg`. 14 | // Return the reader that contains the data, or an error if it failed. 15 | // The returned Reader object has already been closed, so if one needs to read all 16 | // data in it simply set the `MaxRows` to `-1` in the config file. 17 | // 18 | func SimpleRead(fcfg string, dataset interface{}) ( 19 | reader ReaderInterface, 20 | e error, 21 | ) { 22 | reader, e = NewReader(fcfg, dataset) 23 | 24 | if e != nil { 25 | return 26 | } 27 | 28 | _, e = Read(reader) 29 | if e != nil && e != io.EOF { 30 | return nil, e 31 | } 32 | 33 | e = reader.Close() 34 | 35 | return 36 | } 37 | 38 | // 39 | // SimpleWrite provide a shortcut to write data from reader using the output 40 | // metadata format and output file defined in `fcfg`. 41 | // 42 | func SimpleWrite(reader ReaderInterface, fcfg string) (nrows int, e error) { 43 | writer, e := NewWriter(fcfg) 44 | if e != nil { 45 | return 46 | } 47 | 48 | nrows, e = writer.Write(reader) 49 | if e != nil { 50 | return 51 | } 52 | 53 | e = writer.Close() 54 | 55 | return 56 | } 57 | 58 | // 59 | // SimpleMerge provide a shortcut to merge two dsv files using configuration 60 | // files passed in parameters. 61 | // 62 | // One must remember to set, 63 | // - "MaxRows" to -1 to be able to read all rows, in both input configurations, and 64 | // - "DatasetMode" to "columns" to speed up the process. 65 | // 66 | // This function return the merged reader, or an error if it failed. 67 | // 68 | func SimpleMerge(fin1, fin2 string, dataset1, dataset2 interface{}) ( 69 | ReaderInterface, 70 | error, 71 | ) { 72 | reader1, e := SimpleRead(fin1, dataset1) 73 | if e != nil { 74 | return nil, e 75 | } 76 | 77 | reader2, e := SimpleRead(fin2, dataset2) 78 | if e != nil { 79 | return nil, e 80 | } 81 | 82 | reader1.MergeColumns(reader2) 83 | 84 | return reader1, nil 85 | } 86 | -------------------------------------------------------------------------------- /dsv_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dsv_test 6 | 7 | import ( 8 | "github.com/shuLhan/dsv" 9 | "testing" 10 | ) 11 | 12 | // 13 | // doInit create the read-write object. 14 | // 15 | func doInit(t *testing.T, fcfg string) (rw *dsv.ReadWriter, e error) { 16 | // Initialize dsv 17 | rw, e = dsv.New(fcfg, nil) 18 | 19 | if nil != e { 20 | t.Fatal(e) 21 | } 22 | 23 | return 24 | } 25 | 26 | // 27 | // TestReadWriter test reading and writing DSV.
28 | // 29 | func TestReadWriter(t *testing.T) { 30 | rw, _ := doInit(t, "testdata/config.dsv") 31 | 32 | doReadWrite(t, &rw.Reader, &rw.Writer, expectation, true) 33 | 34 | e := rw.Close() 35 | if e != nil { 36 | t.Fatal(e) 37 | } 38 | 39 | assertFile(t, rw.GetOutput(), "testdata/expected.dat", true) 40 | } 41 | 42 | // 43 | // TestReadWriterAll test reading and writing all rows of DSV with MaxRows set to -1. 44 | // 45 | func TestReadWriterAll(t *testing.T) { 46 | rw, _ := doInit(t, "testdata/config.dsv") 47 | 48 | rw.SetMaxRows(-1) 49 | 50 | doReadWrite(t, &rw.Reader, &rw.Writer, expectation, false) 51 | 52 | e := rw.Close() 53 | if e != nil { 54 | t.Fatal(e) 55 | } 56 | 57 | assertFile(t, rw.GetOutput(), "testdata/expected.dat", true) 58 | } 59 | 60 | func TestSimpleReadWrite(t *testing.T) { 61 | fcfg := "testdata/config_simpleread.dsv" 62 | 63 | reader, e := dsv.SimpleRead(fcfg, nil) 64 | if e != nil { 65 | t.Fatal(e) 66 | } 67 | 68 | fout := "testdata/output.dat" 69 | fexp := "testdata/expected.dat" 70 | 71 | _, e = dsv.SimpleWrite(reader, fcfg) 72 | if e != nil { 73 | t.Fatal(e) 74 | } 75 | 76 | assertFile(t, fexp, fout, true) 77 | } 78 | 79 | func TestSimpleMerge(t *testing.T) { 80 | fcfg1 := "testdata/config_simpleread.dsv" 81 | fcfg2 := "testdata/config_simpleread.dsv" 82 | 83 | reader, e := dsv.SimpleMerge(fcfg1, fcfg2, nil, nil) 84 | if e != nil { 85 | t.Fatal(e) 86 | } 87 | 88 | _, e = dsv.SimpleWrite(reader, fcfg1) 89 | if e != nil { 90 | t.Fatal(e) 91 | } 92 | 93 | fexp := "testdata/expected_simplemerge.dat" 94 | fout := "testdata/output.dat" 95 | 96 | assertFile(t, fexp, fout, true) 97 | } 98 | -------------------------------------------------------------------------------- /data_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file.
4 | 5 | package dsv_test 6 | 7 | var expectation = []string{ 8 | "&[1 A-B AB 1 0.1]", 9 | "&[2 A-B-C BCD 2 0.02]", 10 | "&[3 A;B-C,D A;B C,D 3 0.003]", 11 | "&[4 A;B-C,D A;B C D 4 0.0004]", 12 | "&[6 6 0.000006]", 13 | "&[8 ok missing right-quote];8;0.00000008\n9;\"ok\"-[[ok 9 0.000000009]", 14 | "&[10 test integer 10 0.101]", 15 | "&[12 test real 123456789 0.123456789]", 16 | "&[13 string with string with 13 13]", 17 | "&[14 string with\" quote string with]] escape 14 14]", 18 | } 19 | 20 | var expSkip = []string{ 21 | "&[A-B AB 1 0.1]", 22 | "&[A-B-C BCD 2 0.02]", 23 | "&[A;B-C,D A;B C,D 3 0.003]", 24 | "&[A;B-C,D A;B C D 4 0.0004]", 25 | "&[ 6 0.000006]", 26 | "&[ok missing right-quote];8;0.00000008\n9;\"ok\"-[[ok 9 0.000000009]", 27 | "&[test integer 10 0.101]", 28 | "&[test real 123456789 0.123456789]", 29 | "&[string with string with 13 13]", 30 | "&[string with\" quote string with]] escape 14 14]", 31 | } 32 | 33 | var expSkipColumns = []string{ 34 | "[{name 0 0 [] [A-B]} {value 0 0 [] [AB]} {integer 1 0 [] [1]} {real 2 0 [] [0.1]}]", 35 | "[{name 0 0 [] [A-B-C]} {value 0 0 [] [BCD]} {integer 1 0 [] [2]} {real 2 0 [] [0.02]}]", 36 | "[{name 0 0 [] [A;B-C,D]} {value 0 0 [] [A;B C,D]} {integer 1 0 [] [3]} {real 2 0 [] [0.003]}]", 37 | "[{name 0 0 [] [A;B-C,D]} {value 0 0 [] [A;B C D]} {integer 1 0 [] [4]} {real 2 0 [] [0.0004]}]", 38 | "[{name 0 0 [] []} {value 0 0 [] []} {integer 1 0 [] [6]} {real 2 0 [] [0.000006]}]", 39 | "[{name 0 0 [] [ok]} {value 0 0 [] [missing right-quote];8;0.00000008\n9;\"ok\"-[[ok]} {integer 1 0 [] [9]} {real 2 0 [] [0.000000009]}]", 40 | "[{name 0 0 [] [test]} {value 0 0 [] [integer]} {integer 1 0 [] [10]} {real 2 0 [] [0.101]}]", 41 | "[{name 0 0 [] [test]} {value 0 0 [] [real]} {integer 1 0 [] [123456789]} {real 2 0 [] [0.123456789]}]", 42 | "[{name 0 0 [] [string with]} {value 0 0 [] [string with]} {integer 1 0 [] [13]} {real 2 0 [] [13]}]", 43 | "[{name 0 0 [] [string with\" quote]} {value 0 0 [] [string with]] escape]} {integer 1 0 [] [14]} {real 2 0 [] [14]}]", 44 | } 45 | 46 | var expSkipColumnsAll = []string{ 47 | "{name 0 0 [] [A-B A-B-C A;B-C,D A;B-C,D ok test test string with string with\" quote]}", 48 | "{value 0 0 [] [AB BCD A;B C,D A;B C D missing right-quote];8;0.00000008\n9;\"ok\"-[[ok integer real string with string with]] escape]}", 49 | "{integer 1 0 [] [1 2 3 4 6 9 10 123456789 13 14]}", 50 | "{real 2 0 [] [0.1 0.02 0.003 0.0004 0.000006 0.000000009 0.101 0.123456789 13 14]}", 51 | } 52 | 53 | var expSkipColumnsAllRev = []string{ 54 | "{name 0 0 [] [string with\" quote string with test test ok A;B-C,D A;B-C,D A-B-C A-B]}", 55 | "{value 0 0 [] [string with]] escape string with real integer missing right-quote];8;0.00000008\n9;\"ok\"-[[ok A;B C D A;B C,D BCD AB]}", 56 | "{integer 1 0 [] [14 13 123456789 10 9 6 4 3 2 1]}", 57 | "{real 2 0 [] [14 13 0.123456789 0.101 0.000000009 0.000006 0.0004 0.003 0.02 0.1]}", 58 | } 59 | -------------------------------------------------------------------------------- /writer_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan . All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dsv_test 6 | 7 | import ( 8 | "testing" 9 | 10 | "github.com/shuLhan/dsv" 11 | "github.com/shuLhan/tabula" 12 | ) 13 | 14 | // 15 | // TestWriter test reading and writing DSV. 
16 | // 17 | func TestWriter(t *testing.T) { 18 | rw, e := dsv.New("testdata/config.dsv", nil) 19 | if e != nil { 20 | t.Fatal(e) 21 | } 22 | 23 | doReadWrite(t, &rw.Reader, &rw.Writer, expectation, true) 24 | 25 | e = rw.Close() 26 | if e != nil { 27 | t.Fatal(e) 28 | } 29 | 30 | assertFile(t, rw.GetOutput(), "testdata/expected.dat", true) 31 | } 32 | 33 | // 34 | // TestWriterWithSkip test reading and writing DSV with some columns in input being 35 | // skipped. 36 | // 37 | func TestWriterWithSkip(t *testing.T) { 38 | rw, e := dsv.New("testdata/config_skip.dsv", nil) 39 | if e != nil { 40 | t.Fatal(e) 41 | } 42 | 43 | doReadWrite(t, &rw.Reader, &rw.Writer, expSkip, true) 44 | 45 | e = rw.Close() 46 | if e != nil { 47 | t.Fatal(e) 48 | } 49 | 50 | assertFile(t, rw.GetOutput(), "testdata/expected_skip.dat", true) 51 | } 52 | 53 | // 54 | // TestWriterWithColumns test reading and writing DSV where each row 55 | // is saved in DatasetMode = 'columns'. 56 | // 57 | func TestWriterWithColumns(t *testing.T) { 58 | rw, e := dsv.New("testdata/config_skip.dsv", nil) 59 | if e != nil { 60 | t.Fatal(e) 61 | } 62 | 63 | rw.SetDatasetMode(dsv.DatasetModeCOLUMNS) 64 | 65 | doReadWrite(t, &rw.Reader, &rw.Writer, expSkipColumns, true) 66 | 67 | e = rw.Close() 68 | if e != nil { 69 | t.Fatal(e) 70 | } 71 | 72 | assertFile(t, "testdata/expected_skip.dat", rw.GetOutput(), true) 73 | } 74 | 75 | func TestWriteRawRows(t *testing.T) { 76 | dataset := tabula.NewDataset(tabula.DatasetModeRows, nil, nil) 77 | 78 | populateWithRows(t, dataset) 79 | 80 | writer, e := dsv.NewWriter("") 81 | if e != nil { 82 | t.Fatal(e) 83 | } 84 | 85 | outfile := "testdata/writerawrows.out" 86 | expfile := "testdata/writeraw.exp" 87 | 88 | e = writer.OpenOutput(outfile) 89 | if e != nil { 90 | t.Fatal(e) 91 | } 92 | 93 | _, e = writer.WriteRawDataset(dataset, nil) 94 | if e != nil { 95 | t.Fatal(e) 96 | } 97 | 98 | assertFile(t, outfile, expfile, true) 99 | } 100 | 101 | func TestWriteRawColumns(t *testing.T) { 102 | var e error 103 | 104 | dataset := tabula.NewDataset(tabula.DatasetModeColumns, nil, nil) 105 | 106 | populateWithColumns(t, dataset) 107 | 108 | writer, e := dsv.NewWriter("") 109 | if e != nil { 110 | t.Fatal(e) 111 | } 112 | 113 | outfile := "testdata/writerawcolumns.out" 114 | expfile := "testdata/writeraw.exp" 115 | 116 | e = writer.OpenOutput(outfile) 117 | if e != nil { 118 | t.Fatal(e) 119 | } 120 | 121 | _, e = writer.WriteRawDataset(dataset, nil) 122 | if e != nil { 123 | t.Fatal(e) 124 | } 125 | 126 | assertFile(t, outfile, expfile, true) 127 | } 128 | -------------------------------------------------------------------------------- /dsv.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan <ms@kilabit.info>. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // 6 | // Package dsv is a library for working with delimited separated value (DSV). 7 | // 8 | // DSV is a free-style form of Comma Separated Value (CSV) format of text data, 9 | // where each row is separated by newline, and each column can be separated by 10 | // any string enclosed with left-quote and right-quote. 11 | // 12 | package dsv 13 | 14 | import ( 15 | "errors" 16 | "os" 17 | "strconv" 18 | ) 19 | 20 | const ( 21 | // DefaultRejected define the default file which will contain the 22 | // rejected row.
23 | DefaultRejected = "rejected.dat" 24 | // DefaultMaxRows define default maximum row that will be saved 25 | // in memory for each read if input data is too large and can not be 26 | // consumed in one read operation. 27 | DefaultMaxRows = 256 28 | // DefDatasetMode default output mode is rows. 29 | DefDatasetMode = DatasetModeROWS 30 | // DefEOL default end-of-line 31 | DefEOL = '\n' 32 | ) 33 | 34 | var ( 35 | // ErrNoInput define an error when no Input file is given to Reader. 36 | ErrNoInput = errors.New("dsv: No input file is given in config") 37 | // ErrMissRecordsLen define an error when trying to push Row 38 | // to Field, when their length is not equal. 39 | // See reader.PushRowToColumns(). 40 | ErrMissRecordsLen = errors.New("dsv: Mismatch between number of record in row and columns length") 41 | // ErrNoOutput define an error when no output file is given to Writer. 42 | ErrNoOutput = errors.New("dsv: No output file is given in config") 43 | // ErrNotOpen define an error when output file has not been opened 44 | // by Writer. 45 | ErrNotOpen = errors.New("dsv: Output file is not opened") 46 | // ErrNilReader define an error when Reader object is nil when passed 47 | // to Write function. 48 | ErrNilReader = errors.New("dsv: Reader object is nil") 49 | 50 | // DEBUG imported from environment DSV_DEBUG to debug the library. 51 | DEBUG = 0 52 | ) 53 | 54 | // 55 | // ReadWriter combine reader and writer. 56 | // 57 | type ReadWriter struct { 58 | Reader 59 | Writer 60 | } 61 | 62 | func init() { 63 | var e error 64 | DEBUG, e = strconv.Atoi(os.Getenv("DSV_DEBUG")) 65 | if e != nil { 66 | DEBUG = 0 67 | } 68 | } 69 | 70 | // 71 | // New create a new ReadWriter object. 72 | // 73 | func New(config string, dataset interface{}) (rw *ReadWriter, e error) { 74 | rw = &ReadWriter{} 75 | 76 | e = rw.Reader.Init(config, dataset) 77 | if e != nil { 78 | return nil, e 79 | } 80 | 81 | e = OpenWriter(&rw.Writer, config) 82 | if e != nil { 83 | return nil, e 84 | } 85 | 86 | return 87 | } 88 | 89 | // 90 | // SetConfigPath of input and output file. 91 | // 92 | func (dsv *ReadWriter) SetConfigPath(dir string) { 93 | dsv.Reader.SetConfigPath(dir) 94 | dsv.Writer.SetConfigPath(dir) 95 | } 96 | 97 | // 98 | // Close reader and writer. 99 | // 100 | func (dsv *ReadWriter) Close() (e error) { 101 | e = dsv.Writer.Close() 102 | if e != nil { 103 | return 104 | } 105 | return dsv.Reader.Close() 106 | } 107 | -------------------------------------------------------------------------------- /common_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan . All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dsv_test 6 | 7 | import ( 8 | "bytes" 9 | "fmt" 10 | "github.com/shuLhan/dsv" 11 | "github.com/shuLhan/tabula" 12 | "io" 13 | "io/ioutil" 14 | "reflect" 15 | "runtime/debug" 16 | "testing" 17 | ) 18 | 19 | func assert(t *testing.T, exp, got interface{}, equal bool) { 20 | if reflect.DeepEqual(exp, got) != equal { 21 | debug.PrintStack() 22 | t.Fatalf("\n"+ 23 | ">>> Expecting '%v'\n"+ 24 | " got '%v'\n", exp, got) 25 | } 26 | } 27 | 28 | // 29 | // assertFile compare content of two file, print error message and exit 30 | // when both are different. 
31 | // 32 | func assertFile(t *testing.T, a, b string, equal bool) { 33 | out, e := ioutil.ReadFile(a) 34 | 35 | if nil != e { 36 | debug.PrintStack() 37 | t.Error(e) 38 | } 39 | 40 | exp, e := ioutil.ReadFile(b) 41 | 42 | if nil != e { 43 | debug.PrintStack() 44 | t.Error(e) 45 | } 46 | 47 | r := bytes.Compare(out, exp) 48 | 49 | if equal && 0 != r { 50 | debug.PrintStack() 51 | t.Fatal("Comparing", a, "with", b, ": result is different (", 52 | r, ")") 53 | } 54 | } 55 | 56 | func checkDataset(t *testing.T, r *dsv.Reader, exp string) { 57 | var got string 58 | ds := r.GetDataset().(tabula.DatasetInterface) 59 | data := ds.GetData() 60 | 61 | switch data.(type) { 62 | case *tabula.Rows: 63 | rows := data.(*tabula.Rows) 64 | got = fmt.Sprint(*rows) 65 | case *tabula.Columns: 66 | cols := data.(*tabula.Columns) 67 | got = fmt.Sprint(*cols) 68 | case *tabula.Matrix: 69 | matrix := data.(*tabula.Matrix) 70 | got = fmt.Sprint(*matrix) 71 | default: 72 | fmt.Println("data type unknown") 73 | } 74 | 75 | assert(t, exp, got, true) 76 | } 77 | 78 | // 79 | // doReadWrite test reading and writing the DSV data. 80 | // 81 | func doReadWrite(t *testing.T, dsvReader *dsv.Reader, dsvWriter *dsv.Writer, 82 | expectation []string, check bool) { 83 | i := 0 84 | 85 | for { 86 | n, e := dsv.Read(dsvReader) 87 | 88 | if e == io.EOF || n == 0 { 89 | _, e = dsvWriter.Write(dsvReader) 90 | if e != nil { 91 | t.Fatal(e) 92 | } 93 | 94 | break 95 | } 96 | 97 | if e != nil { 98 | continue 99 | } 100 | 101 | if check { 102 | checkDataset(t, dsvReader, expectation[i]) 103 | i++ 104 | } 105 | 106 | _, e = dsvWriter.Write(dsvReader) 107 | if e != nil { 108 | t.Fatal(e) 109 | } 110 | } 111 | 112 | e := dsvWriter.Flush() 113 | if e != nil { 114 | t.Fatal(e) 115 | } 116 | } 117 | 118 | var datasetRows = [][]string{ 119 | {"0", "1", "A"}, 120 | {"1", "1.1", "B"}, 121 | {"2", "1.2", "A"}, 122 | {"3", "1.3", "B"}, 123 | {"4", "1.4", "C"}, 124 | {"5", "1.5", "D"}, 125 | {"6", "1.6", "C"}, 126 | {"7", "1.7", "D"}, 127 | {"8", "1.8", "E"}, 128 | {"9", "1.9", "F"}, 129 | } 130 | 131 | var datasetCols = [][]string{ 132 | {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}, 133 | {"1", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7", "1.8", "1.9"}, 134 | {"A", "B", "A", "B", "C", "D", "C", "D", "E", "F"}, 135 | } 136 | 137 | var datasetTypes = []int{ 138 | tabula.TInteger, 139 | tabula.TReal, 140 | tabula.TString, 141 | } 142 | 143 | var datasetNames = []string{"int", "real", "string"} 144 | 145 | func populateWithRows(t *testing.T, dataset *tabula.Dataset) { 146 | for _, rowin := range datasetRows { 147 | row := make(tabula.Row, len(rowin)) 148 | 149 | for x, recin := range rowin { 150 | rec, e := tabula.NewRecordBy(recin, datasetTypes[x]) 151 | if e != nil { 152 | t.Fatal(e) 153 | } 154 | 155 | row[x] = rec 156 | } 157 | 158 | dataset.PushRow(&row) 159 | } 160 | } 161 | 162 | func populateWithColumns(t *testing.T, dataset *tabula.Dataset) { 163 | for x := range datasetCols { 164 | col, e := tabula.NewColumnString(datasetCols[x], datasetTypes[x], 165 | datasetNames[x]) 166 | if e != nil { 167 | t.Fatal(e) 168 | } 169 | 170 | dataset.PushColumn(*col) 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /metadata.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan . All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package dsv 6 | 7 | import ( 8 | "encoding/json" 9 | "github.com/shuLhan/tabula" 10 | "log" 11 | "strings" 12 | ) 13 | 14 | // 15 | // Metadata represent how to parse each column in a record. 16 | // 17 | type Metadata struct { 18 | // Name of the column, optional. 19 | Name string `json:"Name"` 20 | // Type of the column, default to "string". 21 | // Valid values are: "string", "integer", "real" 22 | Type string `json:"Type"` 23 | // T type of column in integer. 24 | T int 25 | // Separator for column in record. 26 | Separator string `json:"Separator"` 27 | // LeftQuote define the characters that enclose the column on the left 28 | // side. 29 | LeftQuote string `json:"LeftQuote"` 30 | // RightQuote define the characters that enclose the column on the 31 | // right side. 32 | RightQuote string `json:"RightQuote"` 33 | // Skip, if it is true this column will be ignored, i.e. not saved in the 34 | // reader object. Default to false. 35 | Skip bool `json:"Skip"` 36 | // ValueSpace contain the possible values in records. 37 | ValueSpace []string `json:"ValueSpace"` 38 | } 39 | 40 | // 41 | // NewMetadata create and return new metadata. 42 | // 43 | func NewMetadata(name, tipe, sep, leftq, rightq string, vs []string) ( 44 | md *Metadata, 45 | ) { 46 | md = &Metadata{ 47 | Name: name, 48 | Type: tipe, 49 | Separator: sep, 50 | LeftQuote: leftq, 51 | RightQuote: rightq, 52 | ValueSpace: vs, 53 | } 54 | 55 | md.Init() 56 | 57 | return 58 | } 59 | 60 | // 61 | // Init initialize metadata column, i.e. check and set column type. 62 | // 63 | // If type is unknown it will default to string. 64 | // 65 | func (md *Metadata) Init() { 66 | switch strings.ToUpper(md.Type) { 67 | case "INTEGER", "INT": 68 | md.T = tabula.TInteger 69 | case "REAL": 70 | md.T = tabula.TReal 71 | default: 72 | md.T = tabula.TString 73 | md.Type = "string" 74 | } 75 | } 76 | 77 | // 78 | // GetName return the name of metadata. 79 | // 80 | func (md *Metadata) GetName() string { 81 | return md.Name 82 | } 83 | 84 | // 85 | // GetType return type of metadata. 86 | // 87 | func (md *Metadata) GetType() int { 88 | return md.T 89 | } 90 | 91 | // 92 | // GetTypeName return string representation of type. 93 | // 94 | func (md *Metadata) GetTypeName() string { 95 | return md.Type 96 | } 97 | 98 | // 99 | // GetSeparator return the field separator. 100 | // 101 | func (md *Metadata) GetSeparator() string { 102 | return md.Separator 103 | } 104 | 105 | // 106 | // GetLeftQuote return the string used at the beginning of the record value. 107 | // 108 | func (md *Metadata) GetLeftQuote() string { 109 | return md.LeftQuote 110 | } 111 | 112 | // 113 | // GetRightQuote return the string that ends the record value. 114 | // 115 | func (md *Metadata) GetRightQuote() string { 116 | return md.RightQuote 117 | } 118 | 119 | // 120 | // GetSkip return true if this column must be skipped when reading data. 121 | // 122 | func (md *Metadata) GetSkip() bool { 123 | return md.Skip 124 | } 125 | 126 | // 127 | // GetValueSpace return value space. 128 | // 129 | func (md *Metadata) GetValueSpace() []string { 130 | return md.ValueSpace 131 | } 132 | 133 | // 134 | // IsEqual return true if this metadata is equal with the other instance, return 135 | // false otherwise.
136 | // 137 | func (md *Metadata) IsEqual(o MetadataInterface) bool { 138 | if md.Name != o.GetName() { 139 | return false 140 | } 141 | if md.Separator != o.GetSeparator() { 142 | return false 143 | } 144 | if md.LeftQuote != o.GetLeftQuote() { 145 | return false 146 | } 147 | if md.RightQuote != o.GetRightQuote() { 148 | return false 149 | } 150 | return true 151 | } 152 | 153 | // 154 | // String yes, it will print it JSON like format. 155 | // 156 | func (md *Metadata) String() string { 157 | r, e := json.MarshalIndent(md, "", "\t") 158 | if nil != e { 159 | log.Print(e) 160 | } 161 | return string(r) 162 | } 163 | -------------------------------------------------------------------------------- /readerinterface.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan . All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dsv 6 | 7 | import ( 8 | "bytes" 9 | "fmt" 10 | "github.com/shuLhan/tabula" 11 | "github.com/shuLhan/tekstus" 12 | "io" 13 | "os" 14 | ) 15 | 16 | // 17 | // ReaderInterface is the interface for reading DSV file. 18 | // 19 | type ReaderInterface interface { 20 | ConfigInterface 21 | AddInputMetadata(*Metadata) 22 | AppendMetadata(MetadataInterface) 23 | GetInputMetadata() []MetadataInterface 24 | GetInputMetadataAt(idx int) MetadataInterface 25 | GetMaxRows() int 26 | SetMaxRows(max int) 27 | GetDatasetMode() string 28 | SetDatasetMode(mode string) 29 | GetNColumnIn() int 30 | GetInput() string 31 | SetInput(path string) 32 | GetRejected() string 33 | SetRejected(path string) 34 | GetSkip() int 35 | SetSkip(n int) 36 | IsTrimSpace() bool 37 | SetDefault() 38 | OpenInput() error 39 | OpenRejected() error 40 | SkipLines() error 41 | 42 | Reset() error 43 | Flush() error 44 | ReadLine() ([]byte, error) 45 | FetchNextLine([]byte) ([]byte, error) 46 | Reject(line []byte) (int, error) 47 | Close() error 48 | 49 | GetDataset() interface{} 50 | MergeColumns(ReaderInterface) 51 | } 52 | 53 | // 54 | // Read row from input file. 55 | // 56 | func Read(reader ReaderInterface) (n int, e error) { 57 | var ( 58 | row *tabula.Row 59 | line []byte 60 | linenum int 61 | eRead *ReaderError 62 | ) 63 | maxrows := reader.GetMaxRows() 64 | 65 | e = reader.Reset() 66 | if e != nil { 67 | return 68 | } 69 | 70 | dataset := reader.GetDataset().(tabula.DatasetInterface) 71 | 72 | // Loop until we reached MaxRows (> 0) or when all rows has been 73 | // read (= -1) 74 | for { 75 | row, line, linenum, eRead = ReadRow(reader, linenum) 76 | if nil == eRead { 77 | dataset.PushRow(row) 78 | 79 | n++ 80 | if maxrows > 0 && n >= maxrows { 81 | break 82 | } 83 | continue 84 | } 85 | 86 | if eRead.T&EReadEOF == EReadEOF { 87 | _ = reader.Flush() 88 | e = io.EOF 89 | return 90 | } 91 | 92 | eRead.N = linenum 93 | fmt.Fprintf(os.Stderr, "%s\n", eRead) 94 | 95 | // If error, save the rejected line. 96 | line = append(line, DefEOL) 97 | 98 | _, e = reader.Reject(line) 99 | if e != nil { 100 | break 101 | } 102 | } 103 | 104 | // remember to flush if we have rejected rows. 105 | e = reader.Flush() 106 | 107 | return n, e 108 | } 109 | 110 | // 111 | // parsingLeftQuote parse the left-quote string from line. 
112 | // 113 | func parsingLeftQuote(lq, line []byte, startAt int) ( 114 | p int, eRead *ReaderError, 115 | ) { 116 | p = startAt 117 | 118 | // parsing until we found left quote token 119 | p, found := tekstus.BytesSkipUntil(line, lq, p, false) 120 | 121 | if found { 122 | return p, nil 123 | } 124 | 125 | eRead = &ReaderError{ 126 | T: EReadMissLeftQuote, 127 | Func: "parsingLeftQuote", 128 | What: "Missing left-quote '" + string(lq) + "'", 129 | Line: string(line), 130 | Pos: p, 131 | N: 0, 132 | } 133 | 134 | return p, eRead 135 | } 136 | 137 | // 138 | // parsingSeparator parsing the line until we found the separator. 139 | // 140 | // Return the data and index of last parsed line, or error if separator is not 141 | // found or not match with specification. 142 | // 143 | func parsingSeparator(sep, line []byte, startAt int) ( 144 | v []byte, p int, eRead *ReaderError, 145 | ) { 146 | p = startAt 147 | 148 | v, p, found := tekstus.BytesCutUntil(line, sep, p, false) 149 | 150 | if found { 151 | return v, p, nil 152 | } 153 | 154 | eRead = &ReaderError{ 155 | Func: "parsingSeparator", 156 | What: "Missing separator '" + string(sep) + "'", 157 | Line: string(line), 158 | Pos: p, 159 | N: 0, 160 | } 161 | 162 | return v, p, eRead 163 | } 164 | 165 | // 166 | // parsingRightQuote parsing the line until we found the right quote or separator. 167 | // 168 | // Return the data and index of last parsed line, or error if right-quote is not 169 | // found or not match with specification. 170 | // 171 | func parsingRightQuote(reader ReaderInterface, rq, line []byte, startAt int) ( 172 | v, lines []byte, p int, eRead *ReaderError, 173 | ) { 174 | var e error 175 | var content []byte 176 | p = startAt 177 | var found bool 178 | 179 | // (2.2.1) 180 | for { 181 | content, p, found = tekstus.BytesCutUntil(line, rq, p, true) 182 | 183 | v = append(v, content...) 184 | 185 | if found { 186 | return v, line, p, nil 187 | } 188 | 189 | // EOL before finding right-quote. 190 | // Read and join with the next line. 191 | line, e = reader.FetchNextLine(line) 192 | 193 | if e != nil { 194 | break 195 | } 196 | } 197 | 198 | eRead = &ReaderError{ 199 | T: EReadMissRightQuote, 200 | Func: "parsingRightQuote", 201 | What: "Missing right-quote '" + string(rq) + "'", 202 | Line: string(line), 203 | Pos: p, 204 | N: 0, 205 | } 206 | 207 | if e == io.EOF { 208 | eRead.T &= EReadEOF 209 | } 210 | 211 | return v, line, p, eRead 212 | } 213 | 214 | // 215 | // parsingSkipSeparator parse until we found separator or EOF 216 | // 217 | func parsingSkipSeparator(sep, line []byte, startAt int) ( 218 | p int, eRead *ReaderError, 219 | ) { 220 | p = startAt 221 | 222 | p, found := tekstus.BytesSkipUntil(line, sep, p, false) 223 | 224 | if found { 225 | return p, nil 226 | } 227 | 228 | eRead = &ReaderError{ 229 | T: EReadMissSeparator, 230 | Func: "parsingSkipSeparator", 231 | What: "Missing separator '" + string(sep) + "'", 232 | Line: string(line), 233 | Pos: p, 234 | N: 0, 235 | } 236 | 237 | return p, eRead 238 | } 239 | 240 | // 241 | // parsingSkipSpace skip all space starting from `startAt`. 242 | // 243 | func parsingSkipSpace(line []byte, startAt int) (p int) { 244 | linelen := len(line) 245 | 246 | for p = startAt; p < linelen; p++ { 247 | if line[p] == ' ' || line[p] == '\t' || line[p] == '\n' || 248 | line[p] == '\r' { 249 | continue 250 | } 251 | break 252 | } 253 | return 254 | } 255 | 256 | // 257 | // ParseLine parse a line containing records. The output is array of record 258 | // (or single row). 
259 | // 260 | // This is how the algorithm works 261 | // (1) create n slice of record, where n is number of column metadata 262 | // (2) for each metadata 263 | // (2.0) Check if the next sequence matched with separator. 264 | // (2.0.1) If its match, create empty record 265 | // (2.1) If using left quote, skip until we found left-quote 266 | // (2.2) If using right quote, append byte to buffer until right-quote 267 | // (2.2.1) If using separator, skip until separator 268 | // (2.3) If using separator, append byte to buffer until separator 269 | // (2.4) else append all byte to buffer. 270 | // (3) save buffer to record 271 | // 272 | func ParseLine(reader ReaderInterface, line []byte) ( 273 | prow *tabula.Row, eRead *ReaderError, 274 | ) { 275 | p := 0 276 | rIdx := 0 277 | inputMd := reader.GetInputMetadata() 278 | row := make(tabula.Row, 0) 279 | 280 | for _, md := range inputMd { 281 | lq := md.GetLeftQuote() 282 | rq := md.GetRightQuote() 283 | sep := md.GetSeparator() 284 | v := []byte{} 285 | 286 | // (2.0) 287 | if sep != "" && sep != lq { 288 | match := tekstus.BytesMatchForward(line, []byte(sep), 289 | p) 290 | 291 | // (2.0.1) 292 | if match { 293 | p += len(sep) 294 | goto empty 295 | } 296 | } 297 | 298 | // (2.1) 299 | if lq != "" { 300 | p, eRead = parsingLeftQuote([]byte(lq), line, p) 301 | 302 | if eRead != nil { 303 | return 304 | } 305 | } 306 | 307 | // (2.2) 308 | if rq != "" { 309 | v, line, p, eRead = parsingRightQuote(reader, []byte(rq), 310 | line, p) 311 | 312 | if eRead != nil { 313 | return 314 | } 315 | 316 | if sep != "" { 317 | p, eRead = parsingSkipSeparator([]byte(sep), 318 | line, p) 319 | 320 | if eRead != nil { 321 | return 322 | } 323 | 324 | // Handle multi space if separator is a single 325 | // space. 326 | if sep == " " { 327 | p = parsingSkipSpace(line, p) 328 | } 329 | } 330 | } else { 331 | if sep != "" { 332 | // Skip space at beginning if separator is a 333 | // single space. 334 | if sep == " " { 335 | p = parsingSkipSpace(line, p) 336 | } 337 | 338 | v, p, eRead = parsingSeparator([]byte(sep), 339 | line, p) 340 | 341 | if eRead != nil { 342 | return 343 | } 344 | 345 | // Handle multi space if separator is a single 346 | // space. 347 | if sep == " " { 348 | p = parsingSkipSpace(line, p) 349 | } 350 | } else { 351 | v = line[p:] 352 | p = p + len(line) 353 | } 354 | } 355 | 356 | if md.GetSkip() { 357 | continue 358 | } 359 | empty: 360 | r, e := tabula.NewRecordBy(string(v), md.GetType()) 361 | 362 | if nil != e { 363 | msg := fmt.Sprintf("md %s: Type convertion error from %q to %s", 364 | md.GetName(), string(v), md.GetTypeName()) 365 | 366 | return nil, &ReaderError{ 367 | T: ETypeConversion, 368 | Func: "ParseLine", 369 | What: msg, 370 | Line: string(line), 371 | Pos: p, 372 | N: 0, 373 | } 374 | } 375 | 376 | row = append(row, r) 377 | rIdx++ 378 | } 379 | 380 | return &row, nil 381 | } 382 | 383 | // 384 | // ReadRow read one line at a time until we get one row or error when parsing the 385 | // data. 386 | // 387 | func ReadRow(reader ReaderInterface, linenum int) ( 388 | row *tabula.Row, 389 | line []byte, 390 | n int, 391 | eRead *ReaderError, 392 | ) { 393 | var e error 394 | n = linenum 395 | 396 | // Read one line, skip empty line. 
397 | for { 398 | line, e = reader.ReadLine() 399 | n++ 400 | 401 | if e != nil { 402 | goto err 403 | } 404 | 405 | // check for empty line 406 | linetrimed := bytes.TrimSpace(line) 407 | 408 | if len(linetrimed) > 0 { 409 | break 410 | } 411 | } 412 | 413 | if reader.IsTrimSpace() { 414 | line = bytes.TrimSpace(line) 415 | } 416 | 417 | row, eRead = ParseLine(reader, line) 418 | 419 | return row, line, n, eRead 420 | 421 | err: 422 | eRead = &ReaderError{ 423 | Func: "ReadRow", 424 | What: fmt.Sprint(e), 425 | } 426 | 427 | if e == io.EOF { 428 | eRead.T = EReadEOF 429 | } else { 430 | eRead.T = EReadLine 431 | } 432 | 433 | return nil, line, n, eRead 434 | } 435 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![GoDoc](https://godoc.org/github.com/shuLhan/dsv?status.svg)](https://godoc.org/github.com/shuLhan/dsv) 2 | [![Go Report Card](https://goreportcard.com/badge/github.com/shuLhan/dsv)](https://goreportcard.com/report/github.com/shuLhan/dsv) 3 | 4 | Package `dsv` is a Go library for working with delimited separated value (DSV). 5 | 6 | NOTE: This package has been deprecated. See 7 | https://github.com/shuLhan/share/lib/dsv for the latest implementation. 8 | 9 | DSV is a free-style form of the CSV format of text data, where each record is 10 | separated by a newline, and each column can be separated by any string, not just 11 | a comma. 12 | 13 | - [Example](#example) 14 | - [Terminology](#terminology) 15 | - [Configuration](#configuration) 16 | - [Metadata](#metadata) 17 | - [Input](#input) 18 | - [DatasetMode Explained](#datasetmode-explained) 19 | - [Output](#output) 20 | - [Working with DSV](#working-with-dsv) 21 | - [Processing each Rows/Columns](#processing-each-rowscolumns) 22 | - [Using different Dataset](#using-different-dataset) 23 | - [Builtin Functions for Dataset](#builtin-functions-for-dataset) 24 | - [Limitations](#limitations) 25 | 26 | --- 27 | 28 | ## Example 29 | 30 | Let's process this input file `input.dat`, 31 | 32 | Mon Dt HH MM SS Process 33 | Nov 29 23:14:36 process-1 34 | Nov 29 23:14:37 process-2 35 | Nov 29 23:14:38 process-3 36 | 37 | and generate an output file `output.dat` with a format like this, 38 | 39 | "process_1","29-Nov" 40 | "process_2","29-Nov" 41 | "process_3","29-Nov" 42 | 43 | How do we do it? 44 | 45 | First, create the file metadata for input and output, and name it `config.dsv`, 46 | 47 | { 48 | "Input" :"input.dat" 49 | , "Skip" :1 50 | , "InputMetadata" : 51 | [{ 52 | "Name" :"month" 53 | , "Separator" :" " 54 | },{ 55 | "Name" :"date" 56 | , "Separator" :" " 57 | , "Type" :"integer" 58 | },{ 59 | "Name" :"hour" 60 | , "Separator" :":" 61 | , "Type" :"integer" 62 | },{ 63 | "Name" :"minute" 64 | , "Separator" :":" 65 | , "Type" :"integer" 66 | },{ 67 | "Name" :"second" 68 | , "Separator" :" " 69 | , "Type" :"integer" 70 | },{ 71 | "Name" :"process_name" 72 | , "Separator" :"-" 73 | },{ 74 | "Name" :"process_id" 75 | }] 76 | , "Output" :"output.dat" 77 | , "OutputMetadata": 78 | [{ 79 | "Name" :"process_name" 80 | , "LeftQuote" :"\"" 81 | , "Separator" :"_" 82 | },{ 83 | "Name" :"process_id" 84 | , "RightQuote":"\"" 85 | , "Separator" :"," 86 | },{ 87 | "Name" :"date" 88 | , "LeftQuote" :"\"" 89 | , "Separator" :"-" 90 | },{ 91 | "Name" :"month" 92 | , "RightQuote":"\"" 93 | }] 94 | } 95 | 96 | The metadata uses the JSON format. For more information see `metadata.go` 97 | and `reader.go`.
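To see why this configuration produces the output above, here is a sketch of how the first log line is split by `InputMetadata` and re-assembled by `OutputMetadata` (the values are traced by hand from the rules above, not produced by running the code):

    Nov 29 23:14:36 process-1

    month        = "Nov"      (Separator " ")
    date         = 29         (Separator " ", Type "integer")
    hour         = 23         (Separator ":", Type "integer")
    minute       = 14         (Separator ":", Type "integer")
    second       = 36         (Separator " ", Type "integer")
    process_name = "process"  (Separator "-")
    process_id   = "1"        (no separator: the rest of the line)

`OutputMetadata` then emits `LeftQuote + value + RightQuote + Separator` for each listed column, giving `"process_` + `1",` + `"29-` + `Nov"`, that is: `"process_1","29-Nov"`.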
98 | 99 | Second, we create a reader to read the input file. 100 | 101 | dsvReader, e := dsv.NewReader("config.dsv", nil) 102 | 103 | if nil != e { 104 | t.Fatal(e) 105 | } 106 | 107 | Third, we create a writer to write our output data, 108 | 109 | dsvWriter, e := dsv.NewWriter("config.dsv") 110 | 111 | if nil != e { 112 | t.Error(e) 113 | } 114 | 115 | Last, we process them: read the input records and pass them to the writer. 116 | 117 | for { 118 | n, e := dsv.Read(dsvReader) 119 | 120 | if n > 0 { 121 | dsvWriter.Write(dsvReader) 122 | 123 | // EOF, no more records. 124 | } else if e == io.EOF { 125 | break 126 | } 127 | } 128 | 129 | // Make sure all open descriptors are closed. 130 | _ = dsvReader.Close() 131 | 132 | Easy enough? We can combine the reader and writer using `dsv.New()`, which will 133 | create both the reader and the writer, 134 | 135 | rw, e := dsv.New("config.dsv", nil) 136 | 137 | if nil != e { 138 | t.Error(e) 139 | } 140 | 141 | // do usual process like in the last step. 142 | 143 | That's it! 144 | 145 | ## Terminology 146 | 147 | Here is some terminology that we use in developing this library, which may 148 | help the reader understand the configuration and API. 149 | 150 | - Dataset: the content of a file 151 | - Record: a single cell in a row or column, or the smallest building block of 152 | a dataset 153 | - Row: a horizontal representation of records in a dataset 154 | - Column: a vertical representation of records in a dataset 155 | 156 | ``` 157 | COL-0 COL-1 ... COL-x 158 | ROW-0: record record ... record 159 | ROW-1: record record ... record 160 | ... 161 | ROW-y: record record ... record 162 | ``` 163 | 164 | ## Configuration 165 | 166 | We chose JSON for the configuration because, 167 | 168 | 1. No additional source to test. 169 | 2. Easy to extend. Users can embed the current metadata, add additional 170 | configuration, and create another reader to work with it. 171 | 172 | ### Metadata 173 | 174 | Metadata contain information about each column when reading the input file and 175 | writing to the output file, 176 | 177 | - `Name`: mandatory, the name of the column 178 | - `Type`: optional, type of record when reading the input file. Valid values are 179 | "integer", "real", or "string" (default) 180 | - `Separator`: optional, default to `"\n"`. Separator is a string that 181 | separates the current record from the next one. 182 | - `LeftQuote`: optional, default is empty `""`. LeftQuote is a string that 183 | starts at the beginning of the record. 184 | - `RightQuote`: optional, default is empty `""`. RightQuote is a string at the 185 | end of the record. 186 | - `Skip`: optional, boolean, default is `false`. If true the column will be 187 | ignored when reading the input file, i.e. it will not be saved in the dataset. 188 | - `ValueSpace`: optional, slice of string, default is empty. This contains the 189 | string representation of all possible values in the column. 190 | 191 | ### Input 192 | 193 | Input configuration contain information about the input file. 194 | 195 | - `Input`: mandatory, the name of the input file, using a relative or absolute 196 | path. If no path is given then it is assumed that the input file is in the same 197 | directory as the configuration file. 198 | - `InputMetadata`: mandatory, list of metadata. 199 | - `Skip`: optional, number, default 0. Skip defines the number of lines that will 200 | be skipped when the input file is first opened. 201 | - `TrimSpace`: optional, boolean, default is true.
If true, the white space at the 202 | beginning and end of each input line will be removed before parsing, 203 | otherwise the line is left unmodified. 204 | - `Rejected`: optional, defaults to `rejected.dat`. Rejected is the file where 205 | data that does not match the metadata will be saved. One can inspect the 206 | rejected file, fix it for re-processing, or ignore it. 207 | - `MaxRows`: optional, defaults to `256`. The maximum number of rows for one read 208 | operation that will be kept in memory. If it is negative, e.g. `-1`, all data 209 | in the input file will be processed. 210 | - `DatasetMode`: optional, defaults to "rows". The mode of the dataset in memory. 211 | Valid values are "rows", "columns", or "matrix". Matrix mode is a combination of 212 | rows and columns; it gives more flexibility when processing the dataset but 213 | requires additional memory. 214 | 215 | #### `DatasetMode` Explained 216 | 217 | For example, given the input data file, 218 | 219 | col1,col2,col3 220 | a,b,c 221 | 1,2,3 222 | 223 | "rows" mode is where each line is saved in its own slice, resulting in Rows: 224 | 225 | Rows[0]: [a b c] 226 | Rows[1]: [1 2 3] 227 | 228 | "columns" mode is where each line is saved by columns, resulting in Columns: 229 | 230 | Columns[0]: {col1 0 0 [] [a 1]} 231 | Columns[1]: {col2 0 0 [] [b 2]} 232 | Columns[2]: {col3 0 0 [] [c 3]} 233 | 234 | Unlike rows mode, each column contains metadata, including the column name, type, 235 | flag, and value space (all possible values that the column _may_ contain). 236 | 237 | "matrix" mode is where each record is saved in both rows and columns. 238 | 239 | ### Output 240 | 241 | The output configuration contains information about the output file when writing the 242 | dataset. 243 | 244 | - `Output`: mandatory, the name of the output file, which may be a relative or absolute 245 | path. If no path is given then it is assumed that the output file is in the same 246 | directory as the configuration file. 247 | - `OutputMetadata`: mandatory, list of metadata. 248 | 249 | ## Working with DSV 250 | 251 | ### Processing each Rows/Columns 252 | 253 | After opening the input file, we can process the dataset based on rows/columns 254 | mode using a simple `for` loop. For example, 255 | 256 | ``` 257 | // Save the dataset object for later use. 258 | dataset := dsvReader.GetDataset().(tabula.DatasetInterface) 259 | 260 | for { 261 | n, e := dsv.Read(dsvReader) 262 | 263 | if n > 0 { 264 | // Process each row ... 265 | for x, row := range *dataset.GetDataAsRows() { 266 | 267 | for y, record := range *row { 268 | // process each record in row 269 | } 270 | } 271 | 272 | // Or, process each column 273 | for x, column := range *dataset.GetDataAsColumns() { 274 | 275 | for y, record := range column.Records { 276 | // process each record in column 277 | } 278 | } 279 | 280 | // Write the dataset to file after processing 281 | dsvWriter.Write(dsvReader) 282 | } 283 | if e == io.EOF { 284 | break 285 | } 286 | if e != nil { 287 | // handle error 288 | } 289 | } 290 | ``` 291 | 292 | ### Using different Dataset 293 | 294 | The default dataset used by Reader is 295 | [tabula.Dataset](https://godoc.org/github.com/shuLhan/tabula#Dataset). 296 | 297 | You can extend and implement 298 | [DatasetInterface](https://godoc.org/github.com/shuLhan/tabula#DatasetInterface) 299 | and use it in the reader object, either by 300 | 301 | - passing it as the second parameter to `NewReader`, for example, 302 | 303 | ``` 304 | myset := MySet{ 305 | ...
306 | } 307 | reader, e := dsv.NewReader("config.dsv", &myset) 308 | ``` 309 | 310 | - or by calling `reader.Init` after creating a new Reader, 311 | 312 | ``` 313 | myset := MySet{ 314 | ... 315 | } 316 | reader := dsv.Reader{ 317 | ... 318 | } 319 | reader.Init("config.dsv", &myset) 320 | ``` 321 | 322 | ### Builtin Functions for Dataset 323 | 324 | Since we use the tabula package to manage the data, any feature in that package 325 | can be used with our dataset. 326 | For more information see the [tabula 327 | package](https://godoc.org/github.com/shuLhan/tabula). 328 | 329 | ## Limitations 330 | 331 | - The newline for each row is `\n`. 332 | 333 | - Reader and Writer operate on ASCII (8-bit characters); UTF-8 is not 334 | supported yet, since we cannot test it. Patches for supporting UTF-8 (or 335 | runes) are welcome. 336 | 337 | - About escaped characters in the data content. 338 | 339 | Since we handle a free-style form of CSV, the 340 | left-quote, right-quote, and separator can each be a string. It is not limited to a single 341 | character like a single quote or double quote, but can be 342 | literally one or more characters without space. Any escaped character will be 343 | read as-is (along with the `'\'`) unless it is followed by the right-quote or the separator. 344 | For example, 345 | 346 | "test\'" 347 | 348 | will be read as `test\'`. But 349 | 350 | "test\"" 351 | 352 | will be read as `test"`, since the right-quote matches the escaped 353 | token. 354 | -------------------------------------------------------------------------------- /reader_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan . All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file.
4 | 5 | package dsv_test 6 | 7 | import ( 8 | "fmt" 9 | "github.com/shuLhan/dsv" 10 | "github.com/shuLhan/tabula" 11 | "io" 12 | "strings" 13 | "testing" 14 | ) 15 | 16 | var jsonSample = []string{ 17 | `{}`, 18 | `{ 19 | "Input" :"testdata/input.dat" 20 | }`, 21 | `{ 22 | "Input" :"testdata/input.dat" 23 | }`, 24 | `{ 25 | "Input" :"testdata/input.dat" 26 | , "InputMetadata" : 27 | [{ 28 | "Name" :"A" 29 | , "Separator" :"," 30 | },{ 31 | "Name" :"B" 32 | , "Separator" :";" 33 | }] 34 | }`, 35 | `{ 36 | "Input" :"testdata/input.dat" 37 | , "Skip" :1 38 | , "MaxRows" :1 39 | , "InputMetadata" : 40 | [{ 41 | "Name" :"id" 42 | , "Separator" :";" 43 | , "Type" :"integer" 44 | },{ 45 | "Name" :"name" 46 | , "Separator" :"-" 47 | , "LeftQuote" :"\"" 48 | , "RightQuote" :"\"" 49 | },{ 50 | "Name" :"value" 51 | , "Separator" :";" 52 | , "LeftQuote" :"[[" 53 | , "RightQuote" :"]]" 54 | },{ 55 | "Name" :"integer" 56 | , "Type" :"integer" 57 | , "Separator" :";" 58 | },{ 59 | "Name" :"real" 60 | , "Type" :"real" 61 | }] 62 | }`, 63 | `{ 64 | "Input" :"testdata/input.dat" 65 | , "Skip" :1 66 | , "MaxRows" :1 67 | , "InputMetadata" : 68 | [{ 69 | "Name" :"id" 70 | },{ 71 | "Name" :"editor" 72 | },{ 73 | "Name" :"old_rev_id" 74 | },{ 75 | "Name" :"new_rev_id" 76 | },{ 77 | "Name" :"diff_url" 78 | },{ 79 | "Name" :"edit_time" 80 | },{ 81 | "Name" :"edit_comment" 82 | },{ 83 | "Name" :"article_id" 84 | },{ 85 | "Name" :"article_title" 86 | }] 87 | }`, 88 | } 89 | 90 | var readers = []*dsv.Reader{ 91 | {}, 92 | { 93 | Input: "testdata/input.dat", 94 | }, 95 | { 96 | Input: "test-another.dsv", 97 | }, 98 | { 99 | Input: "testdata/input.dat", 100 | InputMetadata: []dsv.Metadata{ 101 | { 102 | Name: "A", 103 | Separator: ",", 104 | }, 105 | { 106 | Name: "B", 107 | Separator: ";", 108 | }, 109 | }, 110 | }, 111 | } 112 | 113 | // 114 | // TestReaderNoInput will print error that the input is not defined. 115 | // 116 | func TestReaderNoInput(t *testing.T) { 117 | dsvReader := &dsv.Reader{} 118 | 119 | e := dsv.ConfigParse(dsvReader, []byte(jsonSample[0])) 120 | 121 | if nil != e { 122 | t.Fatal(e) 123 | } 124 | 125 | e = dsvReader.Init("", nil) 126 | 127 | if nil == e { 128 | t.Fatal("TestReaderNoInput: failed, should return non nil!") 129 | } 130 | } 131 | 132 | // 133 | // TestConfigParse test parsing metadata. 134 | // 135 | func TestConfigParse(t *testing.T) { 136 | cases := []struct { 137 | in string 138 | out *dsv.Reader 139 | }{ 140 | { 141 | jsonSample[1], 142 | readers[1], 143 | }, 144 | { 145 | jsonSample[3], 146 | readers[3], 147 | }, 148 | } 149 | 150 | dsvReader := &dsv.Reader{} 151 | 152 | for _, c := range cases { 153 | e := dsv.ConfigParse(dsvReader, []byte(c.in)) 154 | 155 | if e != nil { 156 | t.Fatal(e) 157 | } 158 | if !dsvReader.IsEqual(c.out) { 159 | t.Fatal("Test failed on ", c.in) 160 | } 161 | } 162 | } 163 | 164 | func TestReaderIsEqual(t *testing.T) { 165 | cases := []struct { 166 | in *dsv.Reader 167 | out *dsv.Reader 168 | result bool 169 | }{ 170 | { 171 | readers[1], 172 | &dsv.Reader{ 173 | Input: "testdata/input.dat", 174 | }, 175 | true, 176 | }, 177 | { 178 | readers[1], 179 | readers[2], 180 | false, 181 | }, 182 | } 183 | 184 | var r bool 185 | 186 | for _, c := range cases { 187 | r = c.in.IsEqual(c.out) 188 | 189 | if r != c.result { 190 | t.Fatal("Test failed on equality between ", c.in, 191 | "\n and ", c.out) 192 | } 193 | } 194 | } 195 | 196 | // 197 | // doRead test reading the DSV data. 
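// It calls dsv.Read in a loop: each time n > 0, the rows currently in the
// dataset are formatted with fmt.Sprint and compared against exp[i]; the
// loop stops when e == io.EOF.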
198 | // 199 | func doRead(t *testing.T, dsvReader *dsv.Reader, exp []string) { 200 | i := 0 201 | var n int 202 | var e error 203 | 204 | for { 205 | n, e = dsv.Read(dsvReader) 206 | 207 | if n > 0 { 208 | r := fmt.Sprint(dsvReader. 209 | GetDataset().(tabula.DatasetInterface). 210 | GetDataAsRows()) 211 | 212 | assert(t, exp[i], r, true) 213 | 214 | i++ 215 | } else if e == io.EOF { 216 | // EOF 217 | break 218 | } 219 | } 220 | } 221 | 222 | // 223 | // TestReader test reading. 224 | // 225 | func TestReaderRead(t *testing.T) { 226 | dsvReader := &dsv.Reader{} 227 | 228 | e := dsv.ConfigParse(dsvReader, []byte(jsonSample[4])) 229 | 230 | if nil != e { 231 | t.Fatal(e) 232 | } 233 | 234 | e = dsvReader.Init("", nil) 235 | if nil != e { 236 | t.Fatal(e) 237 | } 238 | 239 | doRead(t, dsvReader, expectation) 240 | 241 | e = dsvReader.Close() 242 | if e != nil { 243 | t.Fatal(e) 244 | } 245 | } 246 | 247 | // 248 | // TestReaderOpen real example from the start. 249 | // 250 | func TestReaderOpen(t *testing.T) { 251 | dsvReader, e := dsv.NewReader("testdata/config.dsv", nil) 252 | if nil != e { 253 | t.Fatal(e) 254 | } 255 | 256 | doRead(t, dsvReader, expectation) 257 | 258 | e = dsvReader.Close() 259 | if e != nil { 260 | t.Fatal(e) 261 | } 262 | } 263 | 264 | func TestDatasetMode(t *testing.T) { 265 | var e error 266 | var config = []string{`{ 267 | "Input" :"testdata/input.dat" 268 | , "DatasetMode" :"row" 269 | }`, `{ 270 | "Input" :"testdata/input.dat" 271 | , "DatasetMode" :"rows" 272 | }`, `{ 273 | "Input" :"testdata/input.dat" 274 | , "DatasetMode" :"columns" 275 | }`} 276 | 277 | var exps = []struct { 278 | status bool 279 | value string 280 | }{{ 281 | false, 282 | string(config[0]), 283 | }, { 284 | true, 285 | string(config[1]), 286 | }, { 287 | true, 288 | string(config[2]), 289 | }} 290 | 291 | reader := &dsv.Reader{} 292 | 293 | for k, v := range exps { 294 | e = dsv.ConfigParse(reader, []byte(config[k])) 295 | 296 | if e != nil { 297 | t.Fatal(e) 298 | } 299 | 300 | e = reader.Init("", nil) 301 | if e != nil { 302 | if v.status { 303 | t.Fatal(e) 304 | } 305 | } 306 | } 307 | } 308 | 309 | func TestReaderToColumns(t *testing.T) { 310 | reader := &dsv.Reader{} 311 | 312 | e := dsv.ConfigParse(reader, []byte(jsonSample[4])) 313 | if nil != e { 314 | t.Fatal(e) 315 | } 316 | 317 | e = reader.Init("", nil) 318 | if nil != e { 319 | t.Fatal(e) 320 | } 321 | 322 | reader.SetDatasetMode(dsv.DatasetModeCOLUMNS) 323 | 324 | var n, i int 325 | for { 326 | n, e = dsv.Read(reader) 327 | 328 | if n > 0 { 329 | ds := reader.GetDataset().(tabula.DatasetInterface) 330 | ds.TransposeToRows() 331 | 332 | r := fmt.Sprint(ds.GetData()) 333 | 334 | assert(t, expectation[i], r, true) 335 | 336 | i++ 337 | } else if e == io.EOF { 338 | // EOF 339 | break 340 | } 341 | } 342 | } 343 | 344 | // 345 | // TestReaderSkip will test the 'Skip' option in Metadata. 
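// Columns whose metadata set Skip to true are not saved in the dataset, so
// the rows read from testdata/config_skip.dsv are compared against expSkip,
// which omits the skipped columns.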
346 | // 347 | func TestReaderSkip(t *testing.T) { 348 | dsvReader, e := dsv.NewReader("testdata/config_skip.dsv", nil) 349 | if nil != e { 350 | t.Fatal(e) 351 | } 352 | 353 | doRead(t, dsvReader, expSkip) 354 | 355 | e = dsvReader.Close() 356 | if e != nil { 357 | t.Fatal(e) 358 | } 359 | } 360 | 361 | func TestTransposeToColumns(t *testing.T) { 362 | reader, e := dsv.NewReader("testdata/config_skip.dsv", nil) 363 | if nil != e { 364 | t.Fatal(e) 365 | } 366 | 367 | reader.SetMaxRows(-1) 368 | 369 | _, e = dsv.Read(reader) 370 | 371 | if e != io.EOF { 372 | t.Fatal(e) 373 | } 374 | 375 | ds := reader.GetDataset().(tabula.DatasetInterface) 376 | ds.TransposeToColumns() 377 | 378 | exp := fmt.Sprint(expSkipColumnsAll) 379 | 380 | columns := ds.GetDataAsColumns() 381 | 382 | got := fmt.Sprint(*columns) 383 | 384 | assert(t, exp, got, true) 385 | 386 | e = reader.Close() 387 | if e != nil { 388 | t.Fatal(e) 389 | } 390 | } 391 | 392 | func TestSortColumnsByIndex(t *testing.T) { 393 | reader, e := dsv.NewReader("testdata/config_skip.dsv", nil) 394 | if nil != e { 395 | t.Fatal(e) 396 | } 397 | 398 | reader.SetMaxRows(-1) 399 | 400 | _, e = dsv.Read(reader) 401 | if e != io.EOF { 402 | t.Fatal(e) 403 | } 404 | 405 | // reverse the data 406 | var idxReverse []int 407 | var expReverse []string 408 | 409 | for x := len(expSkip) - 1; x >= 0; x-- { 410 | idxReverse = append(idxReverse, x) 411 | expReverse = append(expReverse, expSkip[x]) 412 | } 413 | 414 | ds := reader.GetDataset().(tabula.DatasetInterface) 415 | 416 | tabula.SortColumnsByIndex(ds, idxReverse) 417 | 418 | exp := strings.Join(expReverse, "") 419 | got := fmt.Sprint(ds.GetDataAsRows()) 420 | 421 | assert(t, exp, got, true) 422 | 423 | exp = "[" + strings.Join(expSkipColumnsAllRev, " ") + "]" 424 | 425 | columns := ds.GetDataAsColumns() 426 | 427 | got = fmt.Sprint(*columns) 428 | 429 | assert(t, exp, got, true) 430 | 431 | e = reader.Close() 432 | if e != nil { 433 | t.Fatal(e) 434 | } 435 | } 436 | 437 | func TestSplitRowsByValue(t *testing.T) { 438 | reader, e := dsv.NewReader("testdata/config.dsv", nil) 439 | if nil != e { 440 | t.Fatal(e) 441 | } 442 | 443 | reader.SetMaxRows(256) 444 | 445 | _, e = dsv.Read(reader) 446 | 447 | if e != nil && e != io.EOF { 448 | t.Fatal(e) 449 | } 450 | 451 | ds := reader.GetDataset().(tabula.DatasetInterface) 452 | splitL, splitR, e := tabula.SplitRowsByValue(ds, 0, 6) 453 | 454 | if e != nil { 455 | t.Fatal(e) 456 | } 457 | 458 | // test left split 459 | exp := "" 460 | for x := 0; x < 4; x++ { 461 | exp += expectation[x] 462 | } 463 | 464 | got := fmt.Sprint(splitL.GetDataAsRows()) 465 | 466 | assert(t, exp, got, true) 467 | 468 | // test right split 469 | exp = "" 470 | for x := 4; x < len(expectation); x++ { 471 | exp += expectation[x] 472 | } 473 | 474 | got = fmt.Sprint(splitR.GetDataAsRows()) 475 | 476 | assert(t, exp, got, true) 477 | 478 | e = reader.Close() 479 | if e != nil { 480 | t.Fatal(e) 481 | } 482 | } 483 | 484 | // 485 | // testWriteOutput will write merged reader and check with expected file output. 
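// The dataset is written with WriteRawDataset using a tab separator, and
// then the content of outfile is compared with expfile.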
486 | // 487 | func testWriteOutput(t *testing.T, r *dsv.Reader, outfile, expfile string) { 488 | 489 | writer, e := dsv.NewWriter("") 490 | if e != nil { 491 | t.Fatal(e) 492 | } 493 | 494 | e = writer.OpenOutput(outfile) 495 | 496 | if e != nil { 497 | t.Fatal(e) 498 | } 499 | 500 | sep := "\t" 501 | ds := r.GetDataset().(tabula.DatasetInterface) 502 | 503 | _, e = writer.WriteRawDataset(ds, &sep) 504 | if e != nil { 505 | t.Fatal(e) 506 | } 507 | 508 | e = writer.Close() 509 | if e != nil { 510 | t.Fatal(e) 511 | } 512 | 513 | assertFile(t, outfile, expfile, true) 514 | } 515 | 516 | func TestMergeColumns(t *testing.T) { 517 | reader1, e := dsv.NewReader("testdata/config.dsv", nil) 518 | if nil != e { 519 | t.Fatal(e) 520 | } 521 | 522 | reader2, e := dsv.NewReader("testdata/config_skip.dsv", nil) 523 | if nil != e { 524 | t.Fatal(e) 525 | } 526 | 527 | reader1.SetMaxRows(-1) 528 | reader2.SetMaxRows(-1) 529 | 530 | _, e = dsv.Read(reader1) 531 | if e != io.EOF { 532 | t.Fatal(e) 533 | } 534 | 535 | _, e = dsv.Read(reader2) 536 | if e != io.EOF { 537 | t.Fatal(e) 538 | } 539 | 540 | e = reader1.Close() 541 | if e != nil { 542 | t.Fatal(e) 543 | } 544 | 545 | e = reader2.Close() 546 | if e != nil { 547 | t.Fatal(e) 548 | } 549 | 550 | reader1.InputMetadata[len(reader1.InputMetadata)-1].Separator = ";" 551 | 552 | reader1.MergeColumns(reader2) 553 | 554 | outfile := "testdata/output_merge_columns.dat" 555 | expfile := "testdata/expected_merge_columns.dat" 556 | 557 | testWriteOutput(t, reader1, outfile, expfile) 558 | } 559 | 560 | func TestMergeRows(t *testing.T) { 561 | reader1, e := dsv.NewReader("testdata/config.dsv", nil) 562 | if nil != e { 563 | t.Fatal(e) 564 | } 565 | 566 | reader2, e := dsv.NewReader("testdata/config_skip.dsv", nil) 567 | if nil != e { 568 | t.Fatal(e) 569 | } 570 | 571 | reader1.SetMaxRows(-1) 572 | reader2.SetMaxRows(-1) 573 | 574 | _, e = dsv.Read(reader1) 575 | if e != io.EOF { 576 | t.Fatal(e) 577 | } 578 | 579 | _, e = dsv.Read(reader2) 580 | if e != io.EOF { 581 | t.Fatal(e) 582 | } 583 | 584 | e = reader1.Close() 585 | if e != nil { 586 | t.Fatal(e) 587 | } 588 | 589 | e = reader2.Close() 590 | if e != nil { 591 | t.Fatal(e) 592 | } 593 | 594 | reader1.MergeRows(reader2) 595 | 596 | outfile := "testdata/output_merge_rows.dat" 597 | expfile := "testdata/expected_merge_rows.dat" 598 | 599 | testWriteOutput(t, reader1, outfile, expfile) 600 | } 601 | -------------------------------------------------------------------------------- /writer.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan . All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dsv 6 | 7 | import ( 8 | "bufio" 9 | "encoding/json" 10 | "github.com/shuLhan/tabula" 11 | "github.com/shuLhan/tekstus" 12 | "log" 13 | "os" 14 | ) 15 | 16 | const ( 17 | // DefSeparator default separator that will be used if its not given 18 | // in config file. 19 | DefSeparator = "," 20 | // DefOutput file. 21 | DefOutput = "output.dat" 22 | // DefEscape default string to escape the right quote or separator. 23 | DefEscape = "\\" 24 | ) 25 | 26 | // 27 | // Writer write records from reader or slice using format configuration in 28 | // metadata. 29 | // 30 | type Writer struct { 31 | Config `json:"-"` 32 | // Output file where the records will be written. 33 | Output string `json:"Output"` 34 | // OutputMetadata define format for each column. 
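// Name is used to match the corresponding input column, while LeftQuote,
// RightQuote, and Separator control how each record is written (see
// WriteRow).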
35 | OutputMetadata []Metadata `json:"OutputMetadata"` 36 | // fWriter as write descriptor. 37 | fWriter *os.File 38 | // BufWriter for buffered writer. 39 | BufWriter *bufio.Writer 40 | } 41 | 42 | // 43 | // NewWriter create a writer object. 44 | // User must call Open after that to populate the output and metadata. 45 | // 46 | func NewWriter(config string) (writer *Writer, e error) { 47 | writer = &Writer{ 48 | Output: "", 49 | OutputMetadata: nil, 50 | fWriter: nil, 51 | BufWriter: nil, 52 | } 53 | 54 | if config == "" { 55 | return 56 | } 57 | 58 | e = OpenWriter(writer, config) 59 | if e != nil { 60 | return nil, e 61 | } 62 | 63 | return 64 | } 65 | 66 | // 67 | // GetOutput return output filename. 68 | // 69 | func (writer *Writer) GetOutput() string { 70 | return writer.Output 71 | } 72 | 73 | // 74 | // SetOutput will set the output file to path. 75 | // 76 | func (writer *Writer) SetOutput(path string) { 77 | writer.Output = path 78 | } 79 | 80 | // 81 | // AddMetadata will add new output metadata to writer. 82 | // 83 | func (writer *Writer) AddMetadata(md Metadata) { 84 | writer.OutputMetadata = append(writer.OutputMetadata, md) 85 | } 86 | 87 | // 88 | // open a generic method to open output file with specific flag. 89 | // 90 | func (writer *Writer) open(file string, flag int) (e error) { 91 | if file == "" { 92 | if writer.Output == "" { 93 | file = DefOutput 94 | } else { 95 | file = writer.Output 96 | } 97 | } 98 | 99 | writer.fWriter, e = os.OpenFile(file, flag, 0600) 100 | if nil != e { 101 | return e 102 | } 103 | 104 | writer.BufWriter = bufio.NewWriter(writer.fWriter) 105 | 106 | return nil 107 | } 108 | 109 | // 110 | // OpenOutput file and buffered writer. 111 | // File will be truncated if its exist. 112 | // 113 | func (writer *Writer) OpenOutput(file string) (e error) { 114 | return writer.open(file, os.O_CREATE|os.O_TRUNC|os.O_WRONLY) 115 | } 116 | 117 | // 118 | // ReopenOutput will open the output file back without truncating the content. 119 | // 120 | func (writer *Writer) ReopenOutput(file string) (e error) { 121 | if e = writer.Close(); e != nil { 122 | return 123 | } 124 | return writer.open(file, os.O_CREATE|os.O_APPEND|os.O_WRONLY) 125 | } 126 | 127 | // 128 | // Flush output buffer to disk. 129 | // 130 | func (writer *Writer) Flush() error { 131 | return writer.BufWriter.Flush() 132 | } 133 | 134 | // 135 | // Close all open descriptor. 136 | // 137 | func (writer *Writer) Close() (e error) { 138 | if nil != writer.BufWriter { 139 | e = writer.BufWriter.Flush() 140 | if e != nil { 141 | return 142 | } 143 | } 144 | if nil != writer.fWriter { 145 | e = writer.fWriter.Close() 146 | } 147 | return 148 | } 149 | 150 | // 151 | // WriteRow dump content of Row to file using format in metadata. 152 | // 153 | func (writer *Writer) WriteRow(row *tabula.Row, recordMd []MetadataInterface) ( 154 | e error, 155 | ) { 156 | nRecord := row.Len() 157 | v := []byte{} 158 | esc := []byte(DefEscape) 159 | 160 | for i := range writer.OutputMetadata { 161 | md := writer.OutputMetadata[i] 162 | 163 | // find the input index based on name on record metadata. 164 | rIdx, mdMatch := FindMetadata(&md, recordMd) 165 | 166 | // No input metadata matched? skip it too. 167 | if rIdx >= nRecord { 168 | continue 169 | } 170 | 171 | // If input column is ignored, continue to next record. 
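// An input column is ignored when its metadata has Skip set to true.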
172 | if mdMatch != nil && mdMatch.GetSkip() { 173 | continue 174 | } 175 | 176 | recV := (*row)[rIdx].Bytes() 177 | lq := md.GetLeftQuote() 178 | 179 | if "" != lq { 180 | v = append(v, []byte(lq)...) 181 | } 182 | 183 | rq := md.GetRightQuote() 184 | sep := md.GetSeparator() 185 | 186 | // Escape the escape character itself. 187 | if md.T == tabula.TString { 188 | recV, _ = tekstus.BytesEncapsulate(esc, recV, esc, nil) 189 | } 190 | 191 | // Escape the right quote in field content before writing it. 192 | if "" != rq && md.T == tabula.TString { 193 | recV, _ = tekstus.BytesEncapsulate([]byte(rq), recV, 194 | esc, nil) 195 | } else { 196 | // Escape the separator 197 | if "" != sep && md.T == tabula.TString { 198 | recV, _ = tekstus.BytesEncapsulate([]byte(sep), 199 | recV, esc, nil) 200 | } 201 | } 202 | 203 | v = append(v, recV...) 204 | 205 | if "" != rq { 206 | v = append(v, []byte(rq)...) 207 | } 208 | 209 | if "" != sep { 210 | v = append(v, []byte(sep)...) 211 | } 212 | } 213 | 214 | v = append(v, DefEOL) 215 | 216 | _, e = writer.BufWriter.Write(v) 217 | 218 | return e 219 | } 220 | 221 | // 222 | // WriteRows will loop each row in the list of rows and write their content to 223 | // output file. 224 | // Return n for number of row written, and e if error happened. 225 | // 226 | func (writer *Writer) WriteRows(rows tabula.Rows, recordMd []MetadataInterface) ( 227 | n int, 228 | e error, 229 | ) { 230 | for n = range rows { 231 | e = writer.WriteRow(rows[n], recordMd) 232 | if nil != e { 233 | break 234 | } 235 | } 236 | 237 | _ = writer.Flush() 238 | return 239 | } 240 | 241 | // 242 | // WriteColumns will write content of columns to output file. 243 | // Return n for number of row written, and e if error happened. 244 | // 245 | func (writer *Writer) WriteColumns(columns tabula.Columns, 246 | colMd []MetadataInterface, 247 | ) ( 248 | n int, 249 | e error, 250 | ) { 251 | nColumns := len(columns) 252 | if nColumns <= 0 { 253 | return 254 | } 255 | 256 | emptyRec := tabula.NewRecordString("") 257 | 258 | // Get minimum and maximum length of all columns. 259 | // In case one of the column have different length (shorter or longer), 260 | // we will take the column with minimum length first and continue with 261 | // the maximum length. 262 | 263 | minlen, maxlen := columns.GetMinMaxLength() 264 | 265 | // If metadata is nil, generate it from column name. 266 | if colMd == nil { 267 | for _, col := range columns { 268 | md := &Metadata{ 269 | Name: col.Name, 270 | T: col.Type, 271 | } 272 | 273 | colMd = append(colMd, md) 274 | } 275 | } 276 | 277 | // First loop, iterate until minimum column length. 278 | row := make(tabula.Row, nColumns) 279 | 280 | for ; n < minlen; n++ { 281 | // Convert columns to record. 282 | for y, col := range columns { 283 | row[y] = col.Records[n] 284 | } 285 | 286 | e = writer.WriteRow(&row, colMd) 287 | if e != nil { 288 | goto err 289 | } 290 | } 291 | 292 | // Second loop, iterate until maximum column length. 293 | for ; n < maxlen; n++ { 294 | // Convert columns to record. 295 | for y, col := range columns { 296 | if col.Len() > n { 297 | row[y] = col.Records[n] 298 | } else { 299 | row[y] = emptyRec 300 | } 301 | } 302 | 303 | e = writer.WriteRow(&row, colMd) 304 | if e != nil { 305 | goto err 306 | } 307 | } 308 | 309 | err: 310 | _ = writer.Flush() 311 | return n, e 312 | } 313 | 314 | // 315 | // WriteRawRow will write row data using separator `sep` for each record. 
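// If sep or esc is nil, DefSeparator and DefEscape are used. Any separator
// that appears inside a record of type string is escaped with esc before
// writing.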
316 | // 317 | func (writer *Writer) WriteRawRow(row *tabula.Row, sep, esc []byte) (e error) { 318 | if sep == nil { 319 | sep = []byte(DefSeparator) 320 | } 321 | if esc == nil { 322 | esc = []byte(DefEscape) 323 | } 324 | 325 | v := []byte{} 326 | for x, rec := range *row { 327 | if x > 0 { 328 | v = append(v, sep...) 329 | } 330 | 331 | recV := rec.Bytes() 332 | 333 | if rec.Type() == tabula.TString { 334 | recV, _ = tekstus.BytesEncapsulate(sep, recV, esc, nil) 335 | } 336 | 337 | v = append(v, recV...) 338 | } 339 | 340 | v = append(v, DefEOL) 341 | 342 | _, e = writer.BufWriter.Write(v) 343 | 344 | _ = writer.Flush() 345 | 346 | return e 347 | } 348 | 349 | // 350 | // WriteRawRows write rows data using separator `sep` for each record. 351 | // We use pointer in separator parameter, so we can use empty string as 352 | // separator. 353 | // 354 | func (writer *Writer) WriteRawRows(rows *tabula.Rows, sep *string) ( 355 | nrow int, 356 | e error, 357 | ) { 358 | nrow = len(*rows) 359 | if nrow <= 0 { 360 | return 361 | } 362 | 363 | if sep == nil { 364 | sep = new(string) 365 | *sep = DefSeparator 366 | } 367 | 368 | escbytes := []byte(DefEscape) 369 | sepbytes := []byte(*sep) 370 | x := 0 371 | 372 | for ; x < nrow; x++ { 373 | e = writer.WriteRawRow((*rows)[x], sepbytes, escbytes) 374 | if nil != e { 375 | break 376 | } 377 | } 378 | 379 | return x, e 380 | } 381 | 382 | // 383 | // WriteRawColumns write raw columns using separator `sep` for each record to 384 | // file. 385 | // 386 | // We use pointer in separator parameter, so we can use empty string as 387 | // separator. 388 | // 389 | func (writer *Writer) WriteRawColumns(cols *tabula.Columns, sep *string) ( 390 | nrow int, 391 | e error, 392 | ) { 393 | ncol := len(*cols) 394 | if ncol <= 0 { 395 | return 396 | } 397 | 398 | if sep == nil { 399 | sep = new(string) 400 | *sep = DefSeparator 401 | } 402 | 403 | // Find minimum and maximum column length. 404 | minlen, maxlen := cols.GetMinMaxLength() 405 | 406 | esc := []byte(DefEscape) 407 | sepbytes := []byte(*sep) 408 | x := 0 409 | 410 | // First, write until minimum column length. 411 | for ; x < minlen; x++ { 412 | v := cols.Join(x, sepbytes, esc) 413 | v = append(v, DefEOL) 414 | 415 | _, e = writer.BufWriter.Write(v) 416 | 417 | if nil != e { 418 | return x, e 419 | } 420 | } 421 | 422 | // and then write column until max length. 423 | for ; x < maxlen; x++ { 424 | v := cols.Join(x, sepbytes, esc) 425 | v = append(v, DefEOL) 426 | 427 | _, e = writer.BufWriter.Write(v) 428 | 429 | if nil != e { 430 | break 431 | } 432 | } 433 | 434 | _ = writer.Flush() 435 | return x, e 436 | } 437 | 438 | // 439 | // WriteRawDataset will write content of dataset to file without metadata but 440 | // using separator `sep` for each record. 441 | // 442 | // We use pointer in separator parameter, so we can use empty string as 443 | // separator. 
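//
// A minimal usage sketch, assuming ds is a dataset that has been read and
// the writer output is already open:
//
//	sep := "\t"
//	n, e := writer.WriteRawDataset(ds, &sep)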
444 | // 445 | func (writer *Writer) WriteRawDataset(dataset tabula.DatasetInterface, 446 | sep *string, 447 | ) ( 448 | int, error, 449 | ) { 450 | if nil == writer.fWriter { 451 | return 0, ErrNotOpen 452 | } 453 | if nil == dataset { 454 | return 0, nil 455 | } 456 | if sep == nil { 457 | sep = new(string) 458 | *sep = DefSeparator 459 | } 460 | 461 | var rows *tabula.Rows 462 | 463 | switch dataset.GetMode() { 464 | case tabula.DatasetModeColumns: 465 | cols := dataset.GetDataAsColumns() 466 | return writer.WriteRawColumns(cols, sep) 467 | case tabula.DatasetModeRows, tabula.DatasetModeMatrix: 468 | fallthrough 469 | default: 470 | rows = dataset.GetDataAsRows() 471 | } 472 | 473 | return writer.WriteRawRows(rows, sep) 474 | } 475 | 476 | // 477 | // Write rows from Reader to file. 478 | // Return n for number of row written, or e if error happened. 479 | // 480 | func (writer *Writer) Write(reader ReaderInterface) (int, error) { 481 | if nil == reader { 482 | return 0, ErrNilReader 483 | } 484 | if nil == writer.fWriter { 485 | return 0, ErrNotOpen 486 | } 487 | 488 | ds := reader.GetDataset().(tabula.DatasetInterface) 489 | 490 | var rows *tabula.Rows 491 | 492 | switch ds.GetMode() { 493 | case tabula.DatasetModeColumns: 494 | cols := ds.GetDataAsColumns() 495 | return writer.WriteColumns(*cols, reader.GetInputMetadata()) 496 | case tabula.DatasetModeRows, tabula.DatasetModeMatrix: 497 | fallthrough 498 | default: 499 | rows = ds.GetDataAsRows() 500 | } 501 | 502 | return writer.WriteRows(*rows, reader.GetInputMetadata()) 503 | } 504 | 505 | // 506 | // String yes, it will print it in JSON like format. 507 | // 508 | func (writer *Writer) String() string { 509 | r, e := json.MarshalIndent(writer, "", "\t") 510 | 511 | if nil != e { 512 | log.Print(e) 513 | } 514 | 515 | return string(r) 516 | } 517 | -------------------------------------------------------------------------------- /reader.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2018, Shulhan . All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dsv 6 | 7 | import ( 8 | "bufio" 9 | "github.com/shuLhan/tabula" 10 | "log" 11 | "os" 12 | "strings" 13 | ) 14 | 15 | const ( 16 | // DatasetModeROWS is a string representation of output mode rows. 17 | DatasetModeROWS = "ROWS" 18 | // DatasetModeCOLUMNS is a string representation of output mode columns. 19 | DatasetModeCOLUMNS = "COLUMNS" 20 | // DatasetModeMATRIX will save data in rows and columns. This mode will 21 | // consume more memory that "rows" and "columns" but give greater 22 | // flexibility when working with data. 23 | DatasetModeMATRIX = "MATRIX" 24 | ) 25 | 26 | // 27 | // Reader hold all configuration, metadata and input data. 28 | // 29 | // DSV Reader work like this, 30 | // 31 | // (1) Initialize new dsv reader object 32 | // 33 | // dsvReader, e := dsv.NewReader(configfile) 34 | // 35 | // (2) Do not forget to check for error ... 36 | // 37 | // if e != nil { 38 | // // handle error 39 | // } 40 | // 41 | // (3) Make sure to close all files after finished 42 | // 43 | // defer dsvReader.Close () 44 | // 45 | // (4) Create loop to read input data 46 | // 47 | // for { 48 | // n, e := dsv.Read (dsvReader) 49 | // 50 | // if e == io.EOF { 51 | // break 52 | // } 53 | // 54 | // (4.1) Iterate through rows 55 | // 56 | // for row := range dsvReader.GetDataAsRows() { 57 | // // work with row ... 
58 | // } 59 | // } 60 | // 61 | // Thats it. 62 | // 63 | // 64 | type Reader struct { 65 | // Config define path of configuration file. 66 | // 67 | // If the configuration located in other directory, e.g. 68 | // "../../config.dsv", and the Input option is set with name only, like 69 | // "input.dat", we assume that its in the same directory where the 70 | // configuration file belong. 71 | Config 72 | // Dataset contains the content of input file after read. 73 | dataset interface{} 74 | // Input file, mandatory. 75 | Input string `json:"Input"` 76 | // Skip n lines from the head. 77 | Skip int `json:"Skip"` 78 | // TrimSpace or not. If its true, before parsing the line, the white 79 | // space in the beginning and end of each input line will be removed, 80 | // otherwise it will leave unmodified. Default is true. 81 | TrimSpace bool `json:"TrimSpace"` 82 | // Rejected is the file name where row that does not fit 83 | // with metadata will be saved. 84 | Rejected string `json:"Rejected"` 85 | // InputMetadata define format for each column in input data. 86 | InputMetadata []Metadata `json:"InputMetadata"` 87 | // MaxRows define maximum row that this reader will read and 88 | // saved in the memory at one read operation. 89 | // If the value is -1, all rows will read. 90 | MaxRows int `json:"MaxRows"` 91 | // DatasetMode define on how do you want the result is saved. There are 92 | // three options: either in "rows", "columns", or "matrix" mode. 93 | // For example, input data file, 94 | // 95 | // a,b,c 96 | // 1,2,3 97 | // 98 | // "rows" mode is where each line saved in its own slice, resulting 99 | // in Rows: 100 | // 101 | // [a b c] 102 | // [1 2 3] 103 | // 104 | // "columns" mode is where each line saved by columns, resulting in 105 | // Columns: 106 | // 107 | // [a 1] 108 | // [b 2] 109 | // [c 3] 110 | // 111 | // "matrix" mode is where each record saved in their own row and column. 112 | // 113 | DatasetMode string `json:"DatasetMode"` 114 | // fRead is read descriptor. 115 | fRead *os.File 116 | // fReject is reject descriptor. 117 | fReject *os.File 118 | // bufRead is a buffer for working with input file. 119 | bufRead *bufio.Reader 120 | // bufReject is a buffer for working with rejected file. 121 | bufReject *bufio.Writer 122 | } 123 | 124 | // 125 | // NewReader create and initialize new instance of DSV Reader with default values. 126 | // 127 | func NewReader(config string, dataset interface{}) (reader *Reader, e error) { 128 | reader = &Reader{ 129 | Input: "", 130 | Skip: 0, 131 | TrimSpace: true, 132 | Rejected: DefaultRejected, 133 | InputMetadata: nil, 134 | MaxRows: DefaultMaxRows, 135 | DatasetMode: DefDatasetMode, 136 | dataset: dataset, 137 | fRead: nil, 138 | fReject: nil, 139 | bufRead: nil, 140 | bufReject: nil, 141 | } 142 | 143 | e = reader.Init(config, dataset) 144 | if e != nil { 145 | return nil, e 146 | } 147 | 148 | return 149 | } 150 | 151 | // 152 | // Init will initialize reader object by 153 | // 154 | // (1) Check if dataset is not empty. 155 | // (2) Read config file. 156 | // (3) Set reader object default value. 157 | // (4) Check if output mode is valid and initialize it if valid. 158 | // (5) Check and initialize metadata and columns attributes. 159 | // (6) Check if Input is name only without path, so we can prefix it with 160 | // config path. 161 | // (7) Open rejected file. 162 | // (8) Open input file. 
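//
// For example, to initialize a plain Reader with a configuration file:
//
//	reader := &dsv.Reader{}
//	e := reader.Init("testdata/config.dsv", nil)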
163 | // 164 | func (reader *Reader) Init(fcfg string, dataset interface{}) (e error) { 165 | // (1) 166 | if dataset == nil { 167 | dataset = reader.GetDataset() 168 | if dataset == nil { 169 | dataset = &tabula.Dataset{} 170 | reader.dataset = dataset 171 | } 172 | } 173 | 174 | // (2) 175 | fcfg = strings.TrimSpace(fcfg) 176 | if fcfg != "" { 177 | e = ConfigOpen(reader, fcfg) 178 | if e != nil { 179 | return e 180 | } 181 | 182 | e = tabula.ReadDatasetConfig(dataset, fcfg) 183 | if e != nil { 184 | return e 185 | } 186 | } 187 | 188 | // (3) 189 | reader.SetDefault() 190 | 191 | // (4) 192 | reader.SetDatasetMode(reader.GetDatasetMode()) 193 | 194 | // (5) 195 | ds := dataset.(tabula.DatasetInterface) 196 | md := reader.GetInputMetadata() 197 | for i := range md { 198 | md[i].Init() 199 | 200 | // Count number of output columns. 201 | if !md[i].GetSkip() { 202 | // add type of metadata to list of type 203 | col := tabula.Column{ 204 | Type: md[i].GetType(), 205 | Name: md[i].GetName(), 206 | ValueSpace: md[i].GetValueSpace(), 207 | } 208 | ds.PushColumn(col) 209 | } 210 | } 211 | 212 | // (6) 213 | reader.SetInput(ConfigCheckPath(reader, reader.GetInput())) 214 | reader.SetRejected(ConfigCheckPath(reader, reader.GetRejected())) 215 | 216 | // (7) 217 | e = reader.OpenRejected() 218 | if nil != e { 219 | return 220 | } 221 | 222 | // (8) 223 | e = reader.OpenInput() 224 | if nil != e { 225 | return 226 | } 227 | 228 | return 229 | } 230 | 231 | // 232 | // SetDefault options for global config and each metadata. 233 | // 234 | func (reader *Reader) SetDefault() { 235 | if "" == strings.TrimSpace(reader.Rejected) { 236 | reader.Rejected = DefaultRejected 237 | } 238 | if 0 == reader.MaxRows { 239 | reader.MaxRows = DefaultMaxRows 240 | } 241 | if "" == strings.TrimSpace(reader.DatasetMode) { 242 | reader.DatasetMode = DefDatasetMode 243 | } 244 | if nil == reader.dataset { 245 | reader.dataset = &tabula.Dataset{} 246 | } 247 | } 248 | 249 | // 250 | // CopyConfig copy configuration from other reader object not including data 251 | // and metadata. 252 | // 253 | func (reader *Reader) CopyConfig(src *Reader) { 254 | reader.ConfigPath = src.GetConfigPath() 255 | reader.Input = src.GetInput() 256 | reader.Skip = src.GetSkip() 257 | reader.TrimSpace = src.IsTrimSpace() 258 | reader.Rejected = src.GetRejected() 259 | reader.MaxRows = src.GetMaxRows() 260 | reader.DatasetMode = src.GetDatasetMode() 261 | } 262 | 263 | // 264 | // GetInput return the input file. 265 | // 266 | func (reader *Reader) GetInput() string { 267 | return reader.Input 268 | } 269 | 270 | // 271 | // SetInput file. 272 | // 273 | func (reader *Reader) SetInput(path string) { 274 | reader.Input = path 275 | } 276 | 277 | // 278 | // GetSkip return number of line that will be skipped. 279 | // 280 | func (reader *Reader) GetSkip() int { 281 | return reader.Skip 282 | } 283 | 284 | // 285 | // SetSkip set number of lines that will be skipped before reading actual data. 286 | // 287 | func (reader *Reader) SetSkip(n int) { 288 | reader.Skip = n 289 | } 290 | 291 | // 292 | // IsTrimSpace return value of TrimSpace option. 293 | // 294 | func (reader *Reader) IsTrimSpace() bool { 295 | return reader.TrimSpace 296 | } 297 | 298 | // 299 | // GetRejected return name of rejected file. 300 | // 301 | func (reader *Reader) GetRejected() string { 302 | return reader.Rejected 303 | } 304 | 305 | // 306 | // SetRejected file. 
307 | // 308 | func (reader *Reader) SetRejected(path string) { 309 | reader.Rejected = path 310 | } 311 | 312 | // 313 | // AddInputMetadata add new input metadata to reader. 314 | // 315 | func (reader *Reader) AddInputMetadata(md *Metadata) { 316 | reader.InputMetadata = append(reader.InputMetadata, *md) 317 | ds := reader.dataset.(tabula.DatasetInterface) 318 | ds.AddColumn(md.GetType(), md.GetName(), md.GetValueSpace()) 319 | } 320 | 321 | // 322 | // AppendMetadata will append new metadata `md` to list of reader input metadata. 323 | // 324 | func (reader *Reader) AppendMetadata(mdi MetadataInterface) { 325 | md := mdi.(*Metadata) 326 | reader.InputMetadata = append(reader.InputMetadata, *md) 327 | } 328 | 329 | // 330 | // GetInputMetadata return pointer to slice of metadata. 331 | // 332 | func (reader *Reader) GetInputMetadata() []MetadataInterface { 333 | md := make([]MetadataInterface, len(reader.InputMetadata)) 334 | for i := range reader.InputMetadata { 335 | md[i] = &reader.InputMetadata[i] 336 | } 337 | 338 | return md 339 | } 340 | 341 | // 342 | // GetInputMetadataAt return pointer to metadata at index 'idx'. 343 | // 344 | func (reader *Reader) GetInputMetadataAt(idx int) MetadataInterface { 345 | return &reader.InputMetadata[idx] 346 | } 347 | 348 | // 349 | // GetMaxRows return number of maximum rows for reading. 350 | // 351 | func (reader *Reader) GetMaxRows() int { 352 | return reader.MaxRows 353 | } 354 | 355 | // 356 | // SetMaxRows will set maximum rows that will be read from input file. 357 | // 358 | func (reader *Reader) SetMaxRows(max int) { 359 | reader.MaxRows = max 360 | } 361 | 362 | // 363 | // GetDatasetMode return output mode of data. 364 | // 365 | func (reader *Reader) GetDatasetMode() string { 366 | return reader.DatasetMode 367 | } 368 | 369 | // 370 | // SetDatasetMode to `mode`. 371 | // 372 | func (reader *Reader) SetDatasetMode(mode string) { 373 | ds := reader.dataset.(tabula.DatasetInterface) 374 | switch strings.ToUpper(mode) { 375 | case DatasetModeROWS: 376 | ds.SetMode(tabula.DatasetModeRows) 377 | case DatasetModeCOLUMNS: 378 | ds.SetMode(tabula.DatasetModeColumns) 379 | case DatasetModeMATRIX: 380 | fallthrough 381 | default: 382 | ds.SetMode(tabula.DatasetModeMatrix) 383 | mode = DatasetModeMATRIX 384 | } 385 | reader.DatasetMode = mode 386 | } 387 | 388 | // 389 | // GetNColumnIn return number of input columns, or number of metadata, including 390 | // column with Skip=true. 391 | // 392 | func (reader *Reader) GetNColumnIn() int { 393 | return len(reader.InputMetadata) 394 | } 395 | 396 | // 397 | // OpenInput open the input file, metadata must have been initialize. 398 | // 399 | func (reader *Reader) OpenInput() (e error) { 400 | reader.fRead, e = os.OpenFile(reader.Input, os.O_RDONLY, 0600) 401 | if nil != e { 402 | return e 403 | } 404 | 405 | reader.bufRead = bufio.NewReader(reader.fRead) 406 | 407 | // Skip lines 408 | if reader.GetSkip() > 0 { 409 | e = reader.SkipLines() 410 | 411 | if nil != e { 412 | return 413 | } 414 | } 415 | 416 | return nil 417 | } 418 | 419 | // 420 | // OpenRejected open rejected file, for saving unparseable line. 421 | // 422 | func (reader *Reader) OpenRejected() (e error) { 423 | reader.fReject, e = os.OpenFile(reader.Rejected, 424 | os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0600) 425 | if nil != e { 426 | return e 427 | } 428 | 429 | reader.bufReject = bufio.NewWriter(reader.fReject) 430 | 431 | return nil 432 | } 433 | 434 | // 435 | // Open input and rejected file. 
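// Any descriptors from a previous call are closed first, so calling Open
// repeatedly does not leak file descriptors.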
436 | // 437 | func (reader *Reader) Open() (e error) { 438 | // do not let file descriptor leaked 439 | e = reader.Close() 440 | if e != nil { 441 | return 442 | } 443 | 444 | e = reader.OpenInput() 445 | if e != nil { 446 | return 447 | } 448 | 449 | e = reader.OpenRejected() 450 | 451 | return 452 | } 453 | 454 | // 455 | // SkipLines skip parsing n lines from input file. 456 | // The n is defined in the attribute "Skip" 457 | // 458 | func (reader *Reader) SkipLines() (e error) { 459 | for i := 0; i < reader.Skip; i++ { 460 | _, e = reader.ReadLine() 461 | 462 | if nil != e { 463 | log.Print("dsv: ", e) 464 | return 465 | } 466 | } 467 | return 468 | } 469 | 470 | // 471 | // Reset all variables for next read operation. Number of rows will be 0, and 472 | // Rows will be empty again. 473 | // 474 | func (reader *Reader) Reset() (e error) { 475 | e = reader.Flush() 476 | if e != nil { 477 | return 478 | } 479 | e = reader.dataset.(tabula.DatasetInterface).Reset() 480 | return 481 | } 482 | 483 | // 484 | // Flush all output buffer. 485 | // 486 | func (reader *Reader) Flush() error { 487 | return reader.bufReject.Flush() 488 | } 489 | 490 | // 491 | // ReadLine will read one line from input file. 492 | // 493 | func (reader *Reader) ReadLine() (line []byte, e error) { 494 | line, e = reader.bufRead.ReadBytes(DefEOL) 495 | 496 | if e == nil { 497 | // remove EOL 498 | line = line[:len(line)-1] 499 | } 500 | 501 | return 502 | } 503 | 504 | // 505 | // FetchNextLine read the next line and combine it with the `lastline`. 506 | // 507 | func (reader *Reader) FetchNextLine(lastline []byte) (line []byte, e error) { 508 | line, e = reader.ReadLine() 509 | 510 | lastline = append(lastline, DefEOL) 511 | lastline = append(lastline, line...) 512 | 513 | return lastline, e 514 | } 515 | 516 | // 517 | // Reject the line and save it to the reject file. 518 | // 519 | func (reader *Reader) Reject(line []byte) (int, error) { 520 | return reader.bufReject.Write(line) 521 | } 522 | 523 | // 524 | // deleteEmptyRejected if rejected file is empty, delete it. 525 | // 526 | func (reader *Reader) deleteEmptyRejected() { 527 | finfo, e := os.Stat(reader.Rejected) 528 | if e != nil { 529 | return 530 | } 531 | 532 | if finfo.Size() >= 0 { 533 | _ = os.Remove(reader.Rejected) 534 | } 535 | } 536 | 537 | // 538 | // Close all open descriptors. 539 | // 540 | func (reader *Reader) Close() (e error) { 541 | if nil != reader.bufReject { 542 | e = reader.bufReject.Flush() 543 | if e != nil { 544 | return 545 | } 546 | } 547 | if nil != reader.fReject { 548 | e = reader.fReject.Close() 549 | if e != nil { 550 | return 551 | } 552 | } 553 | 554 | reader.deleteEmptyRejected() 555 | 556 | if nil != reader.fRead { 557 | e = reader.fRead.Close() 558 | } 559 | return 560 | } 561 | 562 | // 563 | // IsEqual compare only the configuration and metadata with other instance. 564 | // 565 | func (reader *Reader) IsEqual(other *Reader) bool { 566 | if reader == other { 567 | return true 568 | } 569 | if reader.Input != other.Input { 570 | return false 571 | } 572 | 573 | l, r := len(reader.InputMetadata), len(other.InputMetadata) 574 | 575 | if l != r { 576 | return false 577 | } 578 | 579 | for a := 0; a < l; a++ { 580 | if !reader.InputMetadata[a].IsEqual(&other.InputMetadata[a]) { 581 | return false 582 | } 583 | } 584 | 585 | return true 586 | } 587 | 588 | // 589 | // GetDataset return reader dataset. 
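// The dataset is stored as interface{}; callers are expected to assert it
// to tabula.DatasetInterface, e.g.
// reader.GetDataset().(tabula.DatasetInterface).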
590 | // 591 | func (reader *Reader) GetDataset() interface{} { 592 | return reader.dataset 593 | } 594 | 595 | // 596 | // MergeColumns append metadata and columns from another reader if not exist in 597 | // current metadata set. 598 | // 599 | func (reader *Reader) MergeColumns(other ReaderInterface) { 600 | for _, md := range other.GetInputMetadata() { 601 | if md.GetSkip() { 602 | continue 603 | } 604 | 605 | // Check if the same metadata name exist in current dataset. 606 | found := false 607 | for _, lmd := range reader.GetInputMetadata() { 608 | if lmd.GetName() == md.GetName() { 609 | found = true 610 | break 611 | } 612 | } 613 | 614 | if found { 615 | continue 616 | } 617 | 618 | reader.AppendMetadata(md) 619 | } 620 | 621 | reader.dataset.(tabula.DatasetInterface).MergeColumns( 622 | other.GetDataset().(tabula.DatasetInterface)) 623 | } 624 | 625 | // 626 | // MergeRows append rows from another reader. 627 | // 628 | func (reader *Reader) MergeRows(other *Reader) { 629 | reader.dataset.(tabula.DatasetInterface).MergeRows( 630 | other.GetDataset().(tabula.DatasetInterface)) 631 | } 632 | --------------------------------------------------------------------------------