├── .gitignore
├── matrix.go
├── columninterface.go
├── dataset_bench_test.go
├── records_test.go
├── record_test.go
├── row_test.go
├── Makefile
├── clasetinterface.go
├── maprows_test.go
├── records.go
├── columns_test.go
├── column_test.go
├── maprows.go
├── LICENSE
├── tabula_test.go
├── tabula.go
├── row.go
├── columns.go
├── rows_test.go
├── README.md
├── rows.go
├── record.go
├── column.go
├── claset.go
├── dataset_test.go
├── datasetinterface.go
└── dataset.go


/.gitignore:
--------------------------------------------------------------------------------
1 | cover.html
2 | cover.out
3 | *.bench
4 | *.prof
5 | *.test
6 | 


--------------------------------------------------------------------------------
/matrix.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2017 M. Shulhan <ms@kilabit.info>. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style license that can be found
 3 | // in the LICENSE file.
 4 | 
 5 | package tabula
 6 | 
//
// Matrix is a combination of columns and rows.
// It holds nothing but two pointers: one to a Columns value and one to a
// Rows value.
//
type Matrix struct {
	// Columns points to the column part of the matrix.
	Columns *Columns
	// Rows points to the row part of the matrix.
	Rows    *Rows
}
14 | 


--------------------------------------------------------------------------------
/columninterface.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2017 M. Shulhan <ms@kilabit.info>. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style license that can be found
 3 | // in the LICENSE file.
 4 | 
 5 | package tabula
 6 | 
//
// ColumnInterface define an interface for working with Column.
//
type ColumnInterface interface {
	// SetType takes the column type as an int, SetName the column name.
	SetType(tipe int)
	SetName(name string)

	// GetType and GetName are the matching read accessors.
	GetType() int
	GetName() string

	// SetRecords takes a pointer to the Records for the column.
	SetRecords(recs *Records)

	// Interface returns an untyped value.
	// NOTE(review): what it wraps is decided by the implementation, which
	// is not visible here.
	Interface() interface{}
}
21 | 


--------------------------------------------------------------------------------
/dataset_bench_test.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2017 M. Shulhan <ms@kilabit.info>. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style license that can be found
 3 | // in the LICENSE file.
 4 | 
 5 | package tabula_test
 6 | 
 7 | import (
 8 | 	"github.com/shuLhan/tabula"
 9 | 	"testing"
10 | )
11 | 
12 | func BenchmarkPushRow(b *testing.B) {
13 | 	dataset := tabula.NewDataset(tabula.DatasetModeRows, nil, nil)
14 | 
15 | 	for i := 0; i < b.N; i++ {
16 | 		e := populateWithRows(dataset)
17 | 		if e != nil {
18 | 			b.Fatal(e)
19 | 		}
20 | 	}
21 | }
22 | 


--------------------------------------------------------------------------------
/records_test.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2017 M. Shulhan <ms@kilabit.info>. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style license that can be found
 3 | // in the LICENSE file.
 4 | 
 5 | package tabula_test
 6 | 
 7 | import (
 8 | 	"fmt"
 9 | 	"github.com/shuLhan/tabula"
10 | 	"testing"
11 | )
12 | 
13 | func TestSortByIndex(t *testing.T) {
14 | 	data := make(tabula.Records, 3)
15 | 	data[0] = tabula.NewRecordInt(3)
16 | 	data[1] = tabula.NewRecordInt(2)
17 | 	data[2] = tabula.NewRecordInt(1)
18 | 
19 | 	sortedIdx := []int{2, 1, 0}
20 | 	expect := []int{1, 2, 3}
21 | 
22 | 	sorted := data.SortByIndex(sortedIdx)
23 | 
24 | 	got := fmt.Sprint(sorted)
25 | 	exp := fmt.Sprint(&expect)
26 | 
27 | 	assert(t, exp, got, true)
28 | }
29 | 


--------------------------------------------------------------------------------
/record_test.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2017 M. Shulhan <ms@kilabit.info>. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style license that can be found
 3 | // in the LICENSE file.
 4 | 
 5 | package tabula_test
 6 | 
 7 | import (
 8 | 	"fmt"
 9 | 	"github.com/shuLhan/tabula"
10 | 	"testing"
11 | )
12 | 
13 | //
14 | // TestRecord simply check how the stringer work.
15 | //
16 | func TestRecord(t *testing.T) {
17 | 	expec := []string{"test", "1", "2"}
18 | 	expType := []int{tabula.TString, tabula.TInteger, tabula.TInteger}
19 | 
20 | 	row := make(tabula.Row, 0)
21 | 
22 | 	for i := range expec {
23 | 		r, e := tabula.NewRecordBy(expec[i], expType[i])
24 | 		if nil != e {
25 | 			t.Error(e)
26 | 		}
27 | 
28 | 		row = append(row, r)
29 | 	}
30 | 
31 | 	exp := fmt.Sprint(expec)
32 | 	got := fmt.Sprint(row)
33 | 	assert(t, exp, got, true)
34 | }
35 | 


--------------------------------------------------------------------------------
/row_test.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2017 M. Shulhan <ms@kilabit.info>. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style license that can be found
 3 | // in the LICENSE file.
 4 | 
 5 | package tabula_test
 6 | 
 7 | import (
 8 | 	"github.com/shuLhan/tabula"
 9 | 	"testing"
10 | )
11 | 
12 | var dataFloat64 = []float64{0.1, 0.2, 0.3, 0.4, 0.5}
13 | 
14 | func createRow() (row tabula.Row) {
15 | 	for _, v := range dataFloat64 {
16 | 		row.PushBack(tabula.NewRecordReal(v))
17 | 	}
18 | 	return
19 | }
20 | 
21 | func TestClone(t *testing.T) {
22 | 	row := createRow()
23 | 	rowClone := row.Clone()
24 | 	rowClone2 := row.Clone()
25 | 
26 | 	assert(t, &row, rowClone, true)
27 | 
28 | 	// changing the clone value should not change the original copy.
29 | 	(*rowClone2)[0].SetFloat(0)
30 | 	assert(t, &row, rowClone, true)
31 | 	assert(t, &row, rowClone2, false)
32 | }
33 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | #!/bin/make
 2 | 
 3 | ## Copyright 2017 M. Shulhan <ms@kilabit.info>. All rights reserved.
 4 | ## Use of this source code is governed by a BSD-style license that can be found
 5 | ## in the LICENSE file.
 6 | 
## Names of the Go source, in-package test, and external test files, queried
## from `go list` for the package in the current directory.
SRC_FILES	:=$(shell go list -f '{{ join .GoFiles " " }}')
TEST_FILES	:=$(shell go list -f '{{ join .TestGoFiles " " }}')
XTEST_FILES	:=$(shell go list -f '{{ join .XTestGoFiles " " }}')
## Coverage profile written by `go test` and its HTML rendering.
COVER_OUT	:=cover.out
COVER_HTML	:=cover.html
## Install target of the package as reported by `go list`.
TARGET		:=$(shell go list -f '{{ .Target }}')

.PHONY: all clean coverbrowse

## Default goal: build the install target.
all: ${TARGET}

## Installing depends on the HTML coverage report, so the tests run first.
${TARGET}: ${COVER_HTML}
	go install -a .

## Render the coverage profile ($<) as HTML ($@).
${COVER_HTML}: ${COVER_OUT}
	go tool cover -html=$< -o $@

## Re-run the tests whenever a source or test file changes.
${COVER_OUT}: ${SRC_FILES} ${TEST_FILES} ${XTEST_FILES}
	go test -v -coverprofile $@

## Open the HTML coverage report with xdg-open.
coverbrowse: ${COVER_HTML}
	xdg-open $<

## Remove the coverage output and the bench, profile, and test binaries.
clean:
	rm -f ${COVER_HTML} ${COVER_OUT} *.bench *.prof *.test
32 | 


--------------------------------------------------------------------------------
/clasetinterface.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2017 M. Shulhan <ms@kilabit.info>. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style license that can be found
 3 | // in the LICENSE file.
 4 | 
 5 | package tabula
 6 | 
//
// ClasetInterface is the interface for working with dataset containing class
// or target attribute. It embed dataset interface.
//
// Yes, the name is Claset with single `s` not Classset with triple `s` to
// minimize typo.
//
type ClasetInterface interface {
	// Every method of DatasetInterface is part of this interface too.
	DatasetInterface

	// Read accessors. NOTE(review): their exact semantics live in the
	// implementation, which is not visible here.
	GetClassType() int
	GetClassValueSpace() []string
	GetClassColumn() *Column
	GetClassRecords() *Records
	GetClassAsStrings() []string
	GetClassAsReals() []float64
	GetClassIndex() int
	MajorityClass() string
	MinorityClass() string
	Counts() []int

	// Setters for the dataset, class index, and majority/minority class.
	SetDataset(DatasetInterface)
	SetClassIndex(int)
	SetMajorityClass(string)
	SetMinorityClass(string)

	// Counting helpers; IsInSingleClass returns a flag and a string.
	CountValueSpaces()
	RecountMajorMinor()
	IsInSingleClass() (bool, string)

	// GetMinorityRows returns a pointer to Rows.
	GetMinorityRows() *Rows
}
39 | 


--------------------------------------------------------------------------------
/maprows_test.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2017 M. Shulhan <ms@kilabit.info>. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style license that can be found
 3 | // in the LICENSE file.
 4 | 
 5 | package tabula_test
 6 | 
 7 | import (
 8 | 	"fmt"
 9 | 	"github.com/shuLhan/tabula"
10 | 	"testing"
11 | )
12 | 
13 | func TestAddRow(t *testing.T) {
14 | 	mapRows := tabula.MapRows{}
15 | 	rows, e := initRows()
16 | 
17 | 	if e != nil {
18 | 		t.Fatal(e)
19 | 	}
20 | 
21 | 	for _, row := range rows {
22 | 		key := fmt.Sprint((*row)[testClassIdx].Interface())
23 | 		mapRows.AddRow(key, row)
24 | 	}
25 | 
26 | 	got := fmt.Sprint(mapRows)
27 | 
28 | 	assert(t, groupByExpect, got, true)
29 | }
30 | 
31 | func TestGetMinority(t *testing.T) {
32 | 	mapRows := tabula.MapRows{}
33 | 	rows, e := initRows()
34 | 
35 | 	if e != nil {
36 | 		t.Fatal(e)
37 | 	}
38 | 
39 | 	for _, row := range rows {
40 | 		key := fmt.Sprint((*row)[testClassIdx].Interface())
41 | 		mapRows.AddRow(key, row)
42 | 	}
43 | 
44 | 	// remove the first row in the first key, so we can make it minority.
45 | 	mapRows[0].Value.PopFront()
46 | 
47 | 	_, minRows := mapRows.GetMinority()
48 | 
49 | 	exp := rowsExpect[3]
50 | 	got := fmt.Sprint(minRows)
51 | 
52 | 	assert(t, exp, got, true)
53 | }
54 | 


--------------------------------------------------------------------------------
/records.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2017 M. Shulhan <ms@kilabit.info>. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style license that can be found
 3 | // in the LICENSE file.
 4 | 
 5 | package tabula
 6 | 
 7 | //
 8 | // Records define slice of pointer to Record.
 9 | //
10 | type Records []*Record
11 | 
12 | //
13 | // Len will return the length of records.
14 | //
15 | func (recs *Records) Len() int {
16 | 	return len(*recs)
17 | }
18 | 
19 | //
20 | // SortByIndex will sort the records using slice of index `sortedIDx` and
21 | // return it.
22 | //
23 | func (recs *Records) SortByIndex(sortedIdx []int) *Records {
24 | 	sorted := make(Records, len(*recs))
25 | 
26 | 	for x, v := range sortedIdx {
27 | 		sorted[x] = (*recs)[v]
28 | 	}
29 | 	return &sorted
30 | }
31 | 
32 | //
33 | // CountWhere return number of record where its value is equal to `v` type and
34 | // value.
35 | //
36 | func (recs *Records) CountWhere(v interface{}) (c int) {
37 | 	for _, r := range *recs {
38 | 		if r.IsEqualToInterface(v) {
39 | 			c++
40 | 		}
41 | 	}
42 | 	return
43 | }
44 | 
45 | //
46 | // CountsWhere will return count of each value in slice `sv`.
47 | //
48 | func (recs *Records) CountsWhere(vs []interface{}) (counts []int) {
49 | 	for _, v := range vs {
50 | 		c := recs.CountWhere(v)
51 | 		counts = append(counts, c)
52 | 	}
53 | 	return
54 | }
55 | 


--------------------------------------------------------------------------------
/columns_test.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2017 M. Shulhan <ms@kilabit.info>. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style license that can be found
 3 | // in the LICENSE file.
 4 | 
 5 | package tabula_test
 6 | 
 7 | import (
 8 | 	"github.com/shuLhan/tabula"
 9 | 	"testing"
10 | )
11 | 
12 | func TestRandomPickColumns(t *testing.T) {
13 | 	var dataset tabula.Dataset
14 | 	var e error
15 | 
16 | 	dataset.Init(tabula.DatasetModeRows, testColTypes, testColNames)
17 | 
18 | 	dataset.Rows, e = initRows()
19 | 	if e != nil {
20 | 		t.Fatal(e)
21 | 	}
22 | 
23 | 	dataset.TransposeToColumns()
24 | 
25 | 	// random pick with duplicate
26 | 	ncols := 6
27 | 	dup := true
28 | 	excludeIdx := []int{3}
29 | 
30 | 	for i := 0; i < 5; i++ {
31 | 		picked, unpicked, _, _ :=
32 | 			dataset.Columns.RandomPick(ncols, dup, excludeIdx)
33 | 
34 | 		// check if unpicked item exist in picked items.
35 | 		for _, un := range unpicked {
36 | 			for _, pick := range picked {
37 | 				assert(t, un, pick, false)
38 | 			}
39 | 		}
40 | 	}
41 | 
42 | 	// random pick without duplicate
43 | 	dup = false
44 | 	for i := 0; i < 5; i++ {
45 | 		picked, unpicked, _, _ :=
46 | 			dataset.Columns.RandomPick(ncols, dup, excludeIdx)
47 | 
48 | 		// check if unpicked item exist in picked items.
49 | 		for _, un := range unpicked {
50 | 			for _, pick := range picked {
51 | 				assert(t, un, pick, false)
52 | 			}
53 | 		}
54 | 	}
55 | }
56 | 


--------------------------------------------------------------------------------
/column_test.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2017 M. Shulhan <ms@kilabit.info>. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style license that can be found
 3 | // in the LICENSE file.
 4 | 
 5 | package tabula_test
 6 | 
 7 | import (
 8 | 	"github.com/shuLhan/tabula"
 9 | 	"testing"
10 | )
11 | 
12 | var data = []string{"9.987654321", "8.8", "7.7", "6.6", "5.5", "4.4", "3.3"}
13 | var expFloat = []float64{9.987654321, 8.8, 7.7, 6.6, 5.5, 4.4, 3.3}
14 | 
15 | func initColReal(t *testing.T) (col *tabula.Column) {
16 | 	col = tabula.NewColumn(tabula.TReal, "TREAL")
17 | 
18 | 	for x := range data {
19 | 		rec, e := tabula.NewRecordBy(data[x], tabula.TReal)
20 | 		if e != nil {
21 | 			t.Fatal(e)
22 | 		}
23 | 
24 | 		col.PushBack(rec)
25 | 	}
26 | 
27 | 	return col
28 | }
29 | 
30 | func TestToFloatSlice(t *testing.T) {
31 | 	col := initColReal(t)
32 | 	got := col.ToFloatSlice()
33 | 
34 | 	assert(t, expFloat, got, true)
35 | }
36 | 
37 | func TestToStringSlice(t *testing.T) {
38 | 	var col tabula.Column
39 | 
40 | 	for x := range data {
41 | 		rec, e := tabula.NewRecordBy(data[x], tabula.TString)
42 | 		if e != nil {
43 | 			t.Fatal(e)
44 | 		}
45 | 
46 | 		col.PushBack(rec)
47 | 	}
48 | 
49 | 	got := col.ToStringSlice()
50 | 
51 | 	assert(t, data, got, true)
52 | }
53 | 
54 | func TestDeleteRecordAt(t *testing.T) {
55 | 	var exp []float64
56 | 	del := 2
57 | 
58 | 	exp = append(exp, expFloat[:del]...)
59 | 	exp = append(exp, expFloat[del+1:]...)
60 | 
61 | 	col := initColReal(t)
62 | 	col.DeleteRecordAt(del)
63 | 	got := col.ToFloatSlice()
64 | 
65 | 	assert(t, exp, got, true)
66 | }
67 | 


--------------------------------------------------------------------------------
/maprows.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2017 M. Shulhan <ms@kilabit.info>. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style license that can be found
 3 | // in the LICENSE file.
 4 | 
 5 | package tabula
 6 | 
 7 | import (
 8 | 	"math"
 9 | )
10 | 
//
// MapRowsElement represent a single mapping of string key to rows.
//
type MapRowsElement struct {
	// Key identifies the group.
	Key   string
	// Value holds the rows stored under Key.
	Value Rows
}

//
// MapRows represent a list of mapping between string key and rows.
// It is a slice, not a Go map, so elements keep the order in which they were
// appended.
//
type MapRows []MapRowsElement
23 | 
24 | //
25 | // insertRow will insert a row `v` into map using key `k`.
26 | //
27 | func (mapRows *MapRows) insertRow(k string, v *Row) {
28 | 	rows := Rows{}
29 | 	rows.PushBack(v)
30 | 	el := MapRowsElement{k, rows}
31 | 	(*mapRows) = append((*mapRows), el)
32 | }
33 | 
34 | //
35 | // AddRow will append a row `v` into map value if they key `k` exist in map,
36 | // otherwise it will insert a new map element.
37 | //
38 | func (mapRows *MapRows) AddRow(k string, v *Row) {
39 | 	for x := range *mapRows {
40 | 		if (*mapRows)[x].Key == k {
41 | 			(*mapRows)[x].Value.PushBack(v)
42 | 			return
43 | 		}
44 | 	}
45 | 	// no key found on map
46 | 	mapRows.insertRow(k, v)
47 | }
48 | 
49 | //
50 | // GetMinority return map value which contain the minimum rows.
51 | //
52 | func (mapRows *MapRows) GetMinority() (keyMin string, valMin Rows) {
53 | 	min := math.MaxInt32
54 | 
55 | 	for k := range *mapRows {
56 | 		v := (*mapRows)[k].Value
57 | 		l := len(v)
58 | 		if l < min {
59 | 			keyMin = (*mapRows)[k].Key
60 | 			valMin = v
61 | 			min = l
62 | 		}
63 | 	}
64 | 	return
65 | }
66 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright 2017, M. Shulhan (ms@kilabit.info).
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are met:
 6 | 
 7 | 1. Redistributions of source code must retain the above copyright notice, this
 8 |    list of conditions and the following disclaimer.
 9 | 
10 | 2. Redistributions in binary form must reproduce the above copyright notice,
11 |    this list of conditions and the following disclaimer in the documentation
12 |    and/or other materials provided with the distribution.
13 | 
14 | 3. Neither the name of copyright holder nor the names of its contributors may be
15 |    used to endorse or promote products derived from this software without
16 |    specific prior written permission.
17 | 
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR
22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 
29 |         ---      --- ---       ---       ---      --- ---
30 | 
31 | 	TT  TT   II  BB          AAAA     LLLLLL  II  KKKKKKKK
32 | 	TT TT    II  BB         AA  AA   LL   LL  II     KK
33 | 	TTTT     II  BB        AA    AA   LL  LL  II     KK
34 | 	TT TT    II  BB        AAAAAAAA   LLLLLL  II     KK
35 | 	TT  TT   II  BB        AA    AA  LL   LL  II     KK
36 | 	TT   TT  II  BBBBBBBB  AA    AA   LLLLLL  II     KK
37 | 
38 | Website: http://kilabit.info
39 | Contact: ms@kilabit.info
40 | 


--------------------------------------------------------------------------------
/tabula_test.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2017 M. Shulhan <ms@kilabit.info>. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style license that can be found
 3 | // in the LICENSE file.
 4 | 
 5 | package tabula_test
 6 | 
 7 | import (
 8 | 	"os"
 9 | 	"reflect"
10 | 	"runtime"
11 | 	"testing"
12 | 
13 | 	"github.com/shuLhan/tabula"
14 | )
15 | 
var (
	// traces is the shared buffer that assert fills with the stack dump and
	// printStackTrace reads from.
	traces = make([]byte, 1024)
)

// stackExcerpt returns the part of buf that starts at its third newline and
// ends just after its fifth newline. When buf has fewer than five newlines
// the excerpt runs to the end of buf, so the slice bounds are always valid.
func stackExcerpt(buf []byte) []byte {
	var lines, start int

	for x, b := range buf {
		if b != '\n' {
			continue
		}
		lines++
		switch lines {
		case 3:
			start = x
		case 5:
			return buf[start : x+1]
		}
	}
	return buf[start:]
}

// printStackTrace writes the excerpt of traces chosen by stackExcerpt to
// standard error.
func printStackTrace() {
	// Diagnostics only: a failed write to stderr is deliberately ignored.
	_, _ = os.Stderr.Write(stackExcerpt(traces))
}

// assert stops the test with a fatal message, after printing a short stack
// excerpt, unless reflect.DeepEqual(exp, got) matches `equal`. Pass true to
// require deep equality and false to require a difference.
func assert(t *testing.T, exp, got interface{}, equal bool) {
	if reflect.DeepEqual(exp, got) == equal {
		return
	}

	// Restore the full buffer, then trim it to what runtime.Stack wrote so
	// bytes left over from an earlier dump are never printed.
	traces = traces[:cap(traces)]
	traces = traces[:runtime.Stack(traces, true)]
	printStackTrace()

	t.Fatalf("\n"+
		">>> Expecting '%v'\n"+
		"          got '%v'\n", exp, got)
}
48 | 
// testColTypes lists the record type of each test column: three integer
// columns followed by one string column.
var testColTypes = []int{
	tabula.TInteger,
	tabula.TInteger,
	tabula.TInteger,
	tabula.TString,
}

// testColNames lists one name per test column, in the same order as
// testColTypes.
var testColNames = []string{"int01", "int02", "int03", "class"}
57 | 
// Testing data for Rows and MapRows.
var (
	// rowsData is the raw input: one inner slice per row, and the last
	// field of each row is the class.
	rowsData = [][]string{
		{"1", "5", "9", "+"},
		{"2", "6", "0", "-"},
		{"3", "7", "1", "-"},
		{"4", "8", "2", "+"},
	}

	// testClassIdx is the index of the class field inside each row.
	testClassIdx = 3

	// rowsExpect is the expected printed form of each row in rowsData.
	rowsExpect = []string{
		"&[1 5 9 +]",
		"&[2 6 0 -]",
		"&[3 7 1 -]",
		"&[4 8 2 +]",
	}

	// groupByExpect is the expected printed form of the rows grouped by
	// class.
	groupByExpect = "[{+ &[1 5 9 +]&[4 8 2 +]} {- &[2 6 0 -]&[3 7 1 -]}]"
)
76 | 
77 | func initRows() (rows tabula.Rows, e error) {
78 | 	for i := range rowsData {
79 | 		l := len(rowsData[i])
80 | 		row := make(tabula.Row, 0)
81 | 
82 | 		for j := 0; j < l; j++ {
83 | 			rec, e := tabula.NewRecordBy(rowsData[i][j],
84 | 				testColTypes[j])
85 | 
86 | 			if nil != e {
87 | 				return nil, e
88 | 			}
89 | 
90 | 			row = append(row, rec)
91 | 		}
92 | 
93 | 		rows.PushBack(&row)
94 | 	}
95 | 	return rows, nil
96 | }
97 | 


--------------------------------------------------------------------------------
/tabula.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2017 M. Shulhan <ms@kilabit.info>. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style license that can be found
 3 | // in the LICENSE file.
 4 | 
 5 | //
 6 | // Package tabula is a Go library for working with rows, columns, or matrix
 7 | // (table), or in another terms working with data set.
 8 | //
 9 | // # Overview
10 | //
11 | // Go's slice gave a flexible way to manage sequence of data in one type, but what
12 | // if you want to manage a sequence of value but with different type of data?
13 | // Or manage a bunch of values like a table?
14 | //
15 | // You can use this library to manage sequence of value with different type
16 | // and manage data in two dimensional tuple.
17 | //
18 | // ## Terminology
19 | //
20 | // Here are some terminologies that we used in developing this library, which may
21 | // help reader understand the internal and API.
22 | //
23 | // Record is a single cell in row or column, or the smallest building block of
24 | // dataset.
25 | //
26 | // Row is a horizontal representation of records in dataset.
27 | //
28 | // Column is a vertical representation of records in dataset.
29 | // Each column has a unique name and has the same type data.
30 | //
31 | // Dataset is a collection of rows and columns.
32 | //
33 | // Given those definitions we can draw the representation of rows, columns, or
34 | // matrix:
35 | //
36 | // 	        COL-0  COL-1 ...  COL-x
37 | // 	ROW-0: record record ... record
38 | // 	ROW-1: record record ... record
39 | // 	...
40 | // 	ROW-y: record record ... record
41 | //
42 | // ## Record Type
43 | //
 44 | // There are only three valid types in a record: int64, float64, and string.
45 | //
46 | // ## Dataset Mode
47 | //
 48 | // Tabula has three modes for a dataset: rows, columns, or matrix.
49 | //
50 | // For example, given a table of data,
51 | //
52 | //     col1,col2,col3
53 | //     a,b,c
54 | //     1,2,3
55 | //
56 | // "rows" mode is where each line saved in its own slice, resulting in Rows:
57 | //
58 | //     Rows[0]: [a b c]
59 | //     Rows[1]: [1 2 3]
60 | //
61 | // "columns" mode is where each line saved by columns, resulting in Columns:
62 | //
63 | //     Columns[0]: {col1 0 0 [] [a 1]}
64 | //     Columns[1]: {col2 0 0 [] [b 2]}
 65 | //     Columns[2]: {col3 0 0 [] [c 3]}
66 | //
67 | // Unlike rows mode, each column contain metadata including column name, type,
68 | // flag, and value space (all possible value that _may_ contain in column value).
69 | //
70 | // "matrix" mode is where each record saved both in row and column.
71 | //
72 | // Matrix mode consume more memory but give a flexible way to manage records.
73 | //
74 | //
75 | package tabula
76 | 
77 | import (
78 | 	"os"
79 | 	"strconv"
80 | )
81 | 
// DEBUG is the debug level. It is set at start-up from the environment
// variable TABULA_DEBUG and stays 0 when that variable is missing or is not
// a valid integer.
var DEBUG = 0

func init() {
	level, err := strconv.Atoi(os.Getenv("TABULA_DEBUG"))
	if err != nil {
		level = 0
	}
	DEBUG = level
}
94 | 


--------------------------------------------------------------------------------
/row.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2017 M. Shulhan <ms@kilabit.info>. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style license that can be found
  3 | // in the LICENSE file.
  4 | 
  5 | package tabula
  6 | 
  7 | //
  8 | // Row represent slice of record.
  9 | //
 10 | type Row []*Record
 11 | 
 12 | //
 13 | // Len return number of record in row.
 14 | //
 15 | func (row *Row) Len() int {
 16 | 	return len(*row)
 17 | }
 18 | 
 19 | //
 20 | // PushBack will add new record to the end of row.
 21 | //
 22 | func (row *Row) PushBack(r *Record) {
 23 | 	*row = append(*row, r)
 24 | }
 25 | 
 26 | //
 27 | // Types return type of all records.
 28 | //
 29 | func (row *Row) Types() (types []int) {
 30 | 	for _, r := range *row {
 31 | 		types = append(types, r.Type())
 32 | 	}
 33 | 	return
 34 | }
 35 | 
 36 | //
 37 | // Clone create and return a clone of row.
 38 | //
 39 | func (row *Row) Clone() *Row {
 40 | 	clone := make(Row, len(*row))
 41 | 
 42 | 	for x, rec := range *row {
 43 | 		clone[x] = rec.Clone()
 44 | 	}
 45 | 	return &clone
 46 | }
 47 | 
 48 | //
 49 | // IsNilAt return true if there is no record value in row at `idx`, otherwise
 50 | // return false.
 51 | //
 52 | func (row *Row) IsNilAt(idx int) bool {
 53 | 	if idx < 0 {
 54 | 		return true
 55 | 	}
 56 | 	if idx >= len(*row) {
 57 | 		return true
 58 | 	}
 59 | 	if (*row)[idx] == nil {
 60 | 		return true
 61 | 	}
 62 | 	return (*row)[idx].IsNil()
 63 | }
 64 | 
 65 | //
 66 | // SetValueAt will set the value of row at cell index `idx` with record `rec`.
 67 | //
 68 | func (row *Row) SetValueAt(idx int, rec *Record) {
 69 | 	(*row)[idx] = rec
 70 | }
 71 | 
 72 | //
 73 | // GetRecord will return pointer to record at index `i`, or nil if index
 74 | // is out of range.
 75 | //
 76 | func (row *Row) GetRecord(i int) *Record {
 77 | 	if i < 0 {
 78 | 		return nil
 79 | 	}
 80 | 	if i >= row.Len() {
 81 | 		return nil
 82 | 	}
 83 | 	return (*row)[i]
 84 | }
 85 | 
 86 | //
 87 | // GetValueAt return the value of row record at index `idx`. If the index is
 88 | // out of range it will return nil and false
 89 | //
 90 | func (row *Row) GetValueAt(idx int) (interface{}, bool) {
 91 | 	if row.Len() <= idx {
 92 | 		return nil, false
 93 | 	}
 94 | 	return (*row)[idx].Interface(), true
 95 | }
 96 | 
 97 | //
 98 | // GetIntAt return the integer value of row record at index `idx`.
 99 | // If the index is out of range it will return 0 and false.
100 | //
101 | func (row *Row) GetIntAt(idx int) (int64, bool) {
102 | 	if row.Len() <= idx {
103 | 		return 0, false
104 | 	}
105 | 
106 | 	return (*row)[idx].Integer(), true
107 | }
108 | 
109 | //
110 | // IsEqual return true if row content equal with `other` row, otherwise return
111 | // false.
112 | //
113 | func (row *Row) IsEqual(other *Row) bool {
114 | 	if len(*row) != len(*other) {
115 | 		return false
116 | 	}
117 | 	for x, xrec := range *row {
118 | 		if !xrec.IsEqual((*other)[x]) {
119 | 			return false
120 | 		}
121 | 	}
122 | 	return true
123 | }
124 | 


--------------------------------------------------------------------------------
/columns.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2017 M. Shulhan <ms@kilabit.info>. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style license that can be found
  3 | // in the LICENSE file.
  4 | 
  5 | package tabula
  6 | 
  7 | import (
  8 | 	"github.com/shuLhan/numerus"
  9 | 	"github.com/shuLhan/tekstus"
 10 | )
 11 | 
 12 | //
 13 | // Columns represent slice of Column.
 14 | //
 15 | type Columns []Column
 16 | 
 17 | //
 18 | // Len return length of columns.
 19 | //
 20 | func (cols *Columns) Len() int {
 21 | 	return len(*cols)
 22 | }
 23 | 
 24 | //
 25 | // Reset each data and attribute in all columns.
 26 | //
 27 | func (cols *Columns) Reset() {
 28 | 	for x := range *cols {
 29 | 		(*cols)[x].Reset()
 30 | 	}
 31 | }
 32 | 
 33 | //
 34 | // SetTypes of each column. The length of type must be equal with the number of
 35 | // column, otherwise it will used the minimum length between types or columns.
 36 | //
 37 | func (cols *Columns) SetTypes(types []int) {
 38 | 	typeslen := len(types)
 39 | 	colslen := len(*cols)
 40 | 	minlen := typeslen
 41 | 
 42 | 	if colslen < minlen {
 43 | 		minlen = colslen
 44 | 	}
 45 | 
 46 | 	for x := 0; x < minlen; x++ {
 47 | 		(*cols)[x].Type = types[x]
 48 | 	}
 49 | }
 50 | 
 51 | //
 52 | // RandomPick column in columns until n item and return it like its has been
 53 | // shuffled.  If duplicate is true, column that has been picked can be picked up
 54 | // again, otherwise it will only picked up once.
 55 | //
 56 | // This function return picked and unpicked column and index of them.
 57 | //
 58 | func (cols *Columns) RandomPick(n int, dup bool, excludeIdx []int) (
 59 | 	picked Columns,
 60 | 	unpicked Columns,
 61 | 	pickedIdx []int,
 62 | 	unpickedIdx []int,
 63 | ) {
 64 | 	excLen := len(excludeIdx)
 65 | 	colsLen := len(*cols)
 66 | 	allowedLen := colsLen - excLen
 67 | 
 68 | 	// if duplication is not allowed, limit the number of selected
 69 | 	// column.
 70 | 	if n > allowedLen && !dup {
 71 | 		n = allowedLen
 72 | 	}
 73 | 
 74 | 	for ; n >= 1; n-- {
 75 | 		idx := numerus.IntPickRandPositive(colsLen, dup, pickedIdx,
 76 | 			excludeIdx)
 77 | 
 78 | 		pickedIdx = append(pickedIdx, idx)
 79 | 		picked = append(picked, (*cols)[idx])
 80 | 	}
 81 | 
 82 | 	// select unpicked columns using picked index.
 83 | 	for cid := range *cols {
 84 | 		// check if column index has been picked up
 85 | 		isPicked := false
 86 | 		for _, idx := range pickedIdx {
 87 | 			if cid == idx {
 88 | 				isPicked = true
 89 | 				break
 90 | 			}
 91 | 		}
 92 | 		if !isPicked {
 93 | 			unpicked = append(unpicked, (*cols)[cid])
 94 | 			unpickedIdx = append(unpickedIdx, cid)
 95 | 		}
 96 | 	}
 97 | 
 98 | 	return
 99 | }
100 | 
101 | //
102 | // GetMinMaxLength given a slice of column, find the minimum and maximum column
103 | // length among them.
104 | //
105 | func (cols *Columns) GetMinMaxLength() (min, max int) {
106 | 	for _, col := range *cols {
107 | 		collen := col.Len()
108 | 		if collen < min {
109 | 			min = collen
110 | 		} else if collen > max {
111 | 			max = collen
112 | 		}
113 | 	}
114 | 	return
115 | }
116 | 
117 | //
118 | // Join all column records value at index `row` using separator `sep` and make
119 | // sure if there is a separator in value it will be escaped with `esc`.
120 | //
121 | // Given slice of columns, where row is 1 and sep is `,` and escape is `\`
122 | //
123 | // 	  0 1 2
124 | // 	0 A B C
125 | // 	1 D , F <- row
126 | // 	2 G H I
127 | //
128 | // this function will return "D,\,,F" in bytes.
129 | //
130 | //
131 | func (cols *Columns) Join(row int, sep, esc []byte) (v []byte) {
132 | 	for y, col := range *cols {
133 | 		if y > 0 {
134 | 			v = append(v, sep...)
135 | 		}
136 | 
137 | 		rec := col.Records[row]
138 | 		recV := rec.Bytes()
139 | 
140 | 		if rec.Type() == TString {
141 | 			recV, _ = tekstus.BytesEncapsulate(sep, recV, esc, nil)
142 | 		}
143 | 
144 | 		v = append(v, recV...)
145 | 	}
146 | 	return
147 | }
148 | 


--------------------------------------------------------------------------------
/rows_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2017 M. Shulhan <ms@kilabit.info>. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style license that can be found
  3 | // in the LICENSE file.
  4 | 
  5 | package tabula_test
  6 | 
  7 | import (
  8 | 	"fmt"
  9 | 	"strings"
 10 | 	"testing"
 11 | )
 12 | 
 13 | func TestPushBack(t *testing.T) {
 14 | 	rows, e := initRows()
 15 | 	if e != nil {
 16 | 		t.Fatal(e)
 17 | 	}
 18 | 
 19 | 	exp := strings.Join(rowsExpect, "")
 20 | 	got := fmt.Sprint(rows)
 21 | 
 22 | 	assert(t, exp, got, true)
 23 | }
 24 | 
// TestPopFront pops every row from the front and, after each pop, compares
// both the popped row and the remaining rows with rowsExpect. It ends by
// checking that popping from the emptied rows prints as "<nil>".
func TestPopFront(t *testing.T) {
	rows, e := initRows()
	if e != nil {
		t.Fatal(e)
	}

	// Index of the last row, taken before any row is removed.
	l := len(rows) - 1
	// The range expression is evaluated once, so the loop runs once per
	// initial row even though PopFront shrinks rows on each iteration.
	for i := range rows {
		row := rows.PopFront()

		exp := rowsExpect[i]
		got := fmt.Sprint(row)

		assert(t, exp, got, true)

		// What is left must print as the not-yet-popped expectations,
		// or as an empty string once the last row has been popped.
		if i < l {
			exp = strings.Join(rowsExpect[i+1:], "")
		} else {
			exp = ""
		}
		got = fmt.Sprint(rows)

		assert(t, exp, got, true)
	}

	// empty rows
	row := rows.PopFront()

	exp := "<nil>"
	got := fmt.Sprint(row)

	assert(t, exp, got, true)
}
 58 | 
// TestPopFrontRow is the PopFrontAsRows counterpart of the PopFront test:
// each popped head is returned as rows and compared with rowsExpect, as is
// the remainder. Popping from the emptied rows must print as an empty string.
func TestPopFrontRow(t *testing.T) {
	rows, e := initRows()
	if e != nil {
		t.Fatal(e)
	}

	// Index of the last row, taken before any row is removed.
	l := len(rows) - 1
	// The range expression is evaluated once, so the loop runs once per
	// initial row even though rows shrinks on each iteration.
	for i := range rows {
		newRows := rows.PopFrontAsRows()

		exp := rowsExpect[i]
		got := fmt.Sprint(newRows)

		assert(t, exp, got, true)

		// What is left must print as the not-yet-popped expectations,
		// or as an empty string once the last row has been popped.
		if i < l {
			exp = strings.Join(rowsExpect[i+1:], "")
		} else {
			exp = ""
		}
		got = fmt.Sprint(rows)

		assert(t, exp, got, true)
	}

	// empty rows
	row := rows.PopFrontAsRows()

	exp := ""
	got := fmt.Sprint(row)

	assert(t, exp, got, true)
}
 92 | 
 93 | func TestGroupByValue(t *testing.T) {
 94 | 	rows, e := initRows()
 95 | 	if e != nil {
 96 | 		t.Fatal(e)
 97 | 	}
 98 | 
 99 | 	mapRows := rows.GroupByValue(testClassIdx)
100 | 
101 | 	got := fmt.Sprint(mapRows)
102 | 
103 | 	assert(t, groupByExpect, got, true)
104 | }
105 | 
// TestRandomPick runs RandomPick five times with duplication allowed and five
// times without. Each run fails if picked.Contains(unpicked) reports true,
// that is if the picked rows contain all of the unpicked rows.
func TestRandomPick(t *testing.T) {
	rows, e := initRows()
	if e != nil {
		t.Fatal(e)
	}

	// random pick with duplicate
	for i := 0; i < 5; i++ {
		picked, unpicked, pickedIdx, unpickedIdx := rows.RandomPick(6,
			true)

		// check if unpicked item exist in picked items.
		isin, _ := picked.Contains(unpicked)

		if isin {
			fmt.Println("Random pick with duplicate rows")
			fmt.Println("==> picked rows   :", picked)
			fmt.Println("==> picked idx    :", pickedIdx)
			fmt.Println("==> unpicked rows :", unpicked)
			fmt.Println("==> unpicked idx  :", unpickedIdx)
			t.Fatal("random pick: unpicked is false")
		}
	}

	// random pick without duplication
	for i := 0; i < 5; i++ {
		picked, unpicked, pickedIdx, unpickedIdx := rows.RandomPick(3,
			false)

		// check if picked rows is duplicate
		// NOTE(review): only the first two picked rows are compared;
		// presumably the trailing false makes assert expect them to
		// differ -- confirm against the assert helper.
		assert(t, picked[0], picked[1], false)

		// check if unpicked item exist in picked items.
		isin, _ := picked.Contains(unpicked)

		if isin {
			fmt.Println("Random pick with no duplicate rows")
			fmt.Println("==> picked rows   :", picked)
			fmt.Println("==> picked idx    :", pickedIdx)
			fmt.Println("==> unpicked rows :", unpicked)
			fmt.Println("==> unpicked idx  :", unpickedIdx)
			t.Fatal("random pick: unpicked is false")
		}
	}
}
151 | 
// TestRowsDel checks that Del returns nil for an index below zero or equal to
// the number of rows, and that deleting index 0 returns the first row and
// leaves the remaining rows in order.
func TestRowsDel(t *testing.T) {
	rows, e := initRows()
	if e != nil {
		t.Fatal(e)
	}

	// Test deleting row index out of range.
	row := rows.Del(-1)
	if row != nil {
		t.Fatal("row should be nil!")
	}

	row = rows.Del(rows.Len())
	if row != nil {
		t.Fatal("row should be nil!")
	}

	// Test deleting index that is actually exist.
	row = rows.Del(0)

	exp := strings.Join(rowsExpect[1:], "")
	got := fmt.Sprint(rows)

	assert(t, exp, got, true)

	// The detached row must be the former head.
	got = fmt.Sprint(row)
	assert(t, rowsExpect[0], got, true)
}
180 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![GoDoc](https://godoc.org/github.com/shuLhan/tabula?status.svg)](https://godoc.org/github.com/shuLhan/tabula)
  2 | [![Go Report Card](https://goreportcard.com/badge/github.com/shuLhan/tabula)](https://goreportcard.com/report/github.com/shuLhan/tabula)
  3 | ![cover.run go](https://cover.run/go/github.com/shuLhan/tabula.svg)
  4 | 
  5 | Package tabula is a Go library for working with rows, columns, or a matrix
  6 | (table); in other words, for working with data sets.
  7 | 
  8 | NOTE: This package has been deprecated. See
  9 | https://github.com/shuLhan/share/tree/master/lib/tabula for latest implementation.
 10 | 
 11 | # Overview
 12 | 
 13 | Go's slices give a flexible way to manage a sequence of data of one type, but
 14 | what if you want to manage a sequence of values with different types of data?
 15 | Or manage a bunch of values like a table?
 16 | 
 17 | You can use this library to manage a sequence of values with different types
 18 | and to manage data as two-dimensional tuples.
 19 | 
 20 | ## Terminology
 21 | 
 22 | Here are some terms that we used in developing this library, which may
 23 | help the reader understand the internals and the API.
 24 | 
 25 | Record is a single cell in row or column, or the smallest building block of
 26 | dataset.
 27 | 
 28 | Row is a horizontal representation of records in dataset.
 29 | 
 30 | Column is a vertical representation of records in dataset.
 31 | Each column has a unique name, and all of its records have the same data type.
 32 | 
 33 | Dataset is a collection of rows and columns.
 34 | 
 35 | Given those definitions we can draw the representation of rows, columns, or
 36 | matrix:
 37 | 
 38 |             COL-0  COL-1 ...  COL-x
 39 |     ROW-0: record record ... record
 40 |     ROW-1: record record ... record
 41 |     ...
 42 |     ROW-y: record record ... record
 43 | 
 44 | ## What make this package different from other dataset packages?
 45 | 
 46 | ### Record Type
 47 | 
 48 | There are only three valid types of record: int64, float64, and string.
 49 | 
 50 | Each record is a pointer to interface value. Which means,
 51 | 
 52 | - Switching from rows mode to columns mode, or vice versa, is only a matter of
 53 |   pointer switching, with no memory relocation.
 54 | - When using matrix mode, additional memory is used only to allocate the
 55 |   slices; the records in each row and column are shared.
 56 | 
 57 | ### Dataset Mode
 58 | 
 59 | Tabula has three modes for a dataset: rows, columns, or matrix.
 60 | 
 61 | For example, given a table of data,
 62 | 
 63 |     col1,col2,col3
 64 |     a,b,c
 65 |     1,2,3
 66 | 
 67 | - When in "rows" mode, each line is saved in its own slice, resulting in Rows:
 68 | 
 69 |   ```
 70 |   Rows[0]: [a b c]
 71 |   Rows[1]: [1 2 3]
 72 |   ```
 73 | 
 74 |   Columns is used only to save record metadata: column name, type, flag and
 75 |   value space.
 76 | 
 77 | - When in "columns" mode, each line is saved in columns, resulting in Columns:
 78 | 
 79 |   ```
 80 |   Columns[0]: {col1 0 0 [] [a 1]}
 81 |   Columns[1]: {col2 0 0 [] [b 2]}
 82 |   Columns[2]: {col3 0 0 [] [c 3]}
 83 |   ```
 84 | 
 85 |   Each column will contain metadata including column name, type, flag, and
 86 |   value space (all possible values that the column _may_ contain).
 87 | 
 88 |   Rows in "columns" mode is empty.
 89 | 
 90 | - When in "matrix" mode, each record is saved both in row and column using
 91 |   shared pointer to record.
 92 | 
 93 |   Matrix mode consumes more memory by allocating two slices, one for rows and
 94 |   one for columns, but gives a flexible way to manage records.
 95 | 
 96 | ## Features
 97 | 
 98 | - **Switching between rows and columns mode**.
 99 | 
100 | - [**Random pick rows with or without replacement**](https://godoc.org/github.com/shuLhan/tabula#RandomPickRows).
101 | 
102 | - [**Random pick columns with or without replacement**](https://godoc.org/github.com/shuLhan/tabula#RandomPickColumns).
103 | 
104 | - [**Select column from dataset by index**](https://godoc.org/github.com/shuLhan/tabula#SelectColumnsByIdx).
105 | 
106 | - [**Sort columns by index**](https://godoc.org/github.com/shuLhan/tabula#SortColumnsByIndex),
107 |   or indirect sort.
108 | 
109 | - [**Split rows value by numeric**](https://godoc.org/github.com/shuLhan/tabula#SplitRowsByNumeric).
110 |   For example, given two numeric rows,
111 | 
112 |   ```
113 |   A: {1,2,3,4}
114 |   B: {5,6,7,8}
115 |   ```
116 | 
117 |   if we split the rows by value 7, the data will be split into the left set
118 | 
119 |   ```
120 |   A': {1,2}
121 |   B': {5,6}
122 |   ```
123 | 
124 |   and the right set would be
125 | 
126 |   ```
127 |   A'': {3,4}
128 |   B'': {7,8}
129 |   ```
130 | 
131 | - [**Split rows by string**](https://godoc.org/github.com/shuLhan/tabula#SplitRowsByCategorical).
132 |   For example, given two rows,
133 | 
134 |   ```
135 |   X: [A,B,A,B,C,D,C,D]
136 |   Y: [1,2,3,4,5,6,7,8]
137 |   ```
138 | 
139 |   if we split the rows with value set `[A,C]`, the data will be split into the
140 |   left set, which contains all rows that have A or C,
141 | 
142 |   ```
143 |   		X': [A,A,C,C]
144 |   		Y': [1,3,5,7]
145 |   ```
146 | 
147 |   and the right set, the excluded set, will contain all rows which have neither A nor C,
148 | 
149 |   ```
150 |   		X'': [B,B,D,D]
151 |   		Y'': [2,4,6,8]
152 |   ```
153 | 
154 | - [**Select row where**](https://godoc.org/github.com/shuLhan/tabula#SelectRowsWhere).
155 |   Select rows at column index x where their value is equal to y (an analogy to
156 |   _select where_ in SQL).
157 |   For example, given the rows of a dataset,
158 |   ```
159 |   ROW-1: {1,A}
160 |   ROW-2: {2,B}
161 |   ROW-3: {3,A}
162 |   ROW-4: {4,C}
163 |   ```
164 |   we can select rows where the second column contains 'A', which results in,
165 |   ```
166 |   ROW-1: {1,A}
167 |   ROW-3: {3,A}
168 |   ```
169 | 


--------------------------------------------------------------------------------
/rows.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2017 M. Shulhan <ms@kilabit.info>. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style license that can be found
  3 | // in the LICENSE file.
  4 | 
  5 | package tabula
  6 | 
  7 | import (
  8 | 	"fmt"
  9 | 	"math/rand"
 10 | 	"time"
 11 | )
 12 | 
//
// Rows represent slice of Row, where each element is a pointer to a Row.
//
type Rows []*Row
 17 | 
//
// Len return the number of rows currently in the slice.
//
func (rows *Rows) Len() int {
	return len(*rows)
}
 24 | 
 25 | //
 26 | // PushBack append record r to the end of rows.
 27 | //
 28 | func (rows *Rows) PushBack(r *Row) {
 29 | 	if r != nil {
 30 | 		(*rows) = append((*rows), r)
 31 | 	}
 32 | }
 33 | 
 34 | //
 35 | // PopFront remove the head, return the record value.
 36 | //
 37 | func (rows *Rows) PopFront() (row *Row) {
 38 | 	l := len(*rows)
 39 | 	if l > 0 {
 40 | 		row = (*rows)[0]
 41 | 		(*rows) = (*rows)[1:]
 42 | 	}
 43 | 	return
 44 | }
 45 | 
 46 | //
 47 | // PopFrontAsRows remove the head and return ex-head as new rows.
 48 | //
 49 | func (rows *Rows) PopFrontAsRows() (newRows Rows) {
 50 | 	row := rows.PopFront()
 51 | 	if nil == row {
 52 | 		return
 53 | 	}
 54 | 	newRows.PushBack(row)
 55 | 	return
 56 | }
 57 | 
 58 | //
 59 | // Del will detach row at index `i` from slice and return it.
 60 | //
 61 | func (rows *Rows) Del(i int) (row *Row) {
 62 | 	if i < 0 {
 63 | 		return
 64 | 	}
 65 | 	if i >= rows.Len() {
 66 | 		return
 67 | 	}
 68 | 
 69 | 	row = (*rows)[i]
 70 | 
 71 | 	last := len(*rows) - 1
 72 | 	copy((*rows)[i:], (*rows)[i+1:])
 73 | 	(*rows)[last] = nil
 74 | 	(*rows) = (*rows)[0:last]
 75 | 
 76 | 	return row
 77 | }
 78 | 
 79 | //
 80 | // GroupByValue will group each row based on record value in index recGroupIdx
 81 | // into map of string -> *Row.
 82 | //
 83 | // WARNING: returned rows will be empty!
 84 | //
 85 | // For example, given rows with target group in column index 1,
 86 | //
 87 | // 	[1 +]
 88 | // 	[2 -]
 89 | // 	[3 -]
 90 | // 	[4 +]
 91 | //
 92 | // this function will create a map with key is string of target and value is
 93 | // pointer to sub-rows,
 94 | //
 95 | // 	+ -> [1 +]
 96 | //           [4 +]
 97 | // 	- -> [2 -]
 98 | //           [3 -]
 99 | //
100 | //
101 | func (rows *Rows) GroupByValue(GroupIdx int) (mapRows MapRows) {
102 | 	for {
103 | 		row := rows.PopFront()
104 | 		if nil == row {
105 | 			break
106 | 		}
107 | 
108 | 		key := fmt.Sprint((*row)[GroupIdx])
109 | 
110 | 		mapRows.AddRow(key, row)
111 | 	}
112 | 	return
113 | }
114 | 
115 | //
116 | // RandomPick row in rows until n item and return it like its has been shuffled.
117 | // If duplicate is true, row that has been picked can be picked up again,
118 | // otherwise it will only picked up once.
119 | //
120 | // This function return picked and unpicked rows and index of them.
121 | //
122 | func (rows *Rows) RandomPick(n int, duplicate bool) (
123 | 	picked Rows,
124 | 	unpicked Rows,
125 | 	pickedIdx []int,
126 | 	unpickedIdx []int,
127 | ) {
128 | 	rowsLen := len(*rows)
129 | 
130 | 	// if duplication is not allowed, we can only select as many as rows
131 | 	// that we have.
132 | 	if n > rowsLen && !duplicate {
133 | 		n = rowsLen
134 | 	}
135 | 
136 | 	rand.Seed(time.Now().UnixNano())
137 | 
138 | 	for ; n >= 1; n-- {
139 | 		idx := 0
140 | 		for {
141 | 			idx = rand.Intn(len(*rows))
142 | 
143 | 			if duplicate {
144 | 				// allow duplicate idx
145 | 				pickedIdx = append(pickedIdx, idx)
146 | 				break
147 | 			}
148 | 
149 | 			// check if its already picked
150 | 			isPicked := false
151 | 			for _, pastIdx := range pickedIdx {
152 | 				if idx == pastIdx {
153 | 					isPicked = true
154 | 					break
155 | 				}
156 | 			}
157 | 			// get another random idx again
158 | 			if isPicked {
159 | 				continue
160 | 			}
161 | 
162 | 			// bingo, we found unique idx that has not been picked.
163 | 			pickedIdx = append(pickedIdx, idx)
164 | 			break
165 | 		}
166 | 
167 | 		row := (*rows)[idx]
168 | 
169 | 		picked.PushBack(row)
170 | 	}
171 | 
172 | 	// select unpicked rows using picked index.
173 | 	for rid := range *rows {
174 | 		// check if row index has been picked up
175 | 		isPicked := false
176 | 		for _, idx := range pickedIdx {
177 | 			if rid == idx {
178 | 				isPicked = true
179 | 				break
180 | 			}
181 | 		}
182 | 		if !isPicked {
183 | 			unpicked.PushBack((*rows)[rid])
184 | 			unpickedIdx = append(unpickedIdx, rid)
185 | 		}
186 | 	}
187 | 	return
188 | }
189 | 
190 | //
191 | // Contain return true and index of row, if rows has data that has the same value
192 | // with `row`, otherwise return false and -1 as index.
193 | //
194 | func (rows *Rows) Contain(xrow *Row) (bool, int) {
195 | 	for x, row := range *rows {
196 | 		if xrow.IsEqual(row) {
197 | 			return true, x
198 | 		}
199 | 	}
200 | 	return false, -1
201 | }
202 | 
203 | //
204 | // Contains return true and indices of row, if rows has data that has the same
205 | // value with `rows`, otherwise return false and empty indices.
206 | //
207 | func (rows *Rows) Contains(xrows Rows) (isin bool, indices []int) {
208 | 	// No data to compare.
209 | 	if len(xrows) <= 0 {
210 | 		return
211 | 	}
212 | 
213 | 	for _, xrow := range xrows {
214 | 		isin, idx := rows.Contain(xrow)
215 | 
216 | 		if isin {
217 | 			indices = append(indices, idx)
218 | 		}
219 | 	}
220 | 
221 | 	// Check if indices length equal to searched rows
222 | 	if len(indices) == len(xrows) {
223 | 		return true, indices
224 | 	}
225 | 
226 | 	return false, nil
227 | }
228 | 
229 | //
230 | // SelectWhere return all rows which column value in `colidx` is equal
231 | // to `colval`.
232 | //
233 | func (rows *Rows) SelectWhere(colidx int, colval string) (selected Rows) {
234 | 	for _, row := range *rows {
235 | 		col := (*row)[colidx]
236 | 		if col.IsEqualToString(colval) {
237 | 			selected.PushBack(row)
238 | 		}
239 | 	}
240 | 	return
241 | }
242 | 
243 | //
244 | // String return the string representation of each row.
245 | //
246 | func (rows Rows) String() (s string) {
247 | 	for x := range rows {
248 | 		s += fmt.Sprint(rows[x])
249 | 	}
250 | 	return
251 | }
252 | 


--------------------------------------------------------------------------------
/record.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2017 M. Shulhan <ms@kilabit.info>. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style license that can be found
  3 | // in the LICENSE file.
  4 | 
  5 | package tabula
  6 | 
  7 | import (
  8 | 	"math"
  9 | 	"reflect"
 10 | 	"strconv"
 11 | )
 12 | 
// List of type identifiers used for record value and column type.
const (
	// TUndefined for undefined type
	TUndefined = -1
	// TString string type.
	TString = 0
	// TInteger integer type (64 bit).
	TInteger = 1
	// TReal float type (64 bit).
	TReal = 2
)
 23 | 
//
// Record represent the smallest building block of data-set.
//
type Record struct {
	// v hold the record value: a string, an int64, a float64, or nil if
	// the value has not been set.
	v interface{}
}
 30 | 
 31 | //
 32 | // NewRecord will create and return record with nil value.
 33 | //
 34 | func NewRecord() *Record {
 35 | 	return &Record{v: nil}
 36 | }
 37 | 
 38 | //
 39 | // NewRecordBy create new record from string with type set to `t`.
 40 | //
 41 | func NewRecordBy(v string, t int) (r *Record, e error) {
 42 | 	r = NewRecord()
 43 | 	e = r.SetValue(v, t)
 44 | 	return
 45 | }
 46 | 
//
// NewRecordString will create new record that hold string `v`.
//
func NewRecordString(v string) (r *Record) {
	return &Record{v: v}
}
 53 | 
//
// NewRecordInt create new record that hold 64 bit integer `v`.
//
func NewRecordInt(v int64) (r *Record) {
	return &Record{v: v}
}
 60 | 
//
// NewRecordReal create new record that hold 64 bit float `v`.
//
func NewRecordReal(v float64) (r *Record) {
	return &Record{v: v}
}
 67 | 
//
// Clone will create and return a new record that hold the same value as the
// original.
//
func (r *Record) Clone() *Record {
	return &Record{v: r.v}
}
 74 | 
//
// IsNil return true if the record value is nil, which is the case when it has
// not been set with any value.
//
func (r *Record) IsNil() bool {
	return r.v == nil
}
 81 | 
 82 | //
 83 | // Type of record.
 84 | //
 85 | func (r *Record) Type() int {
 86 | 	switch r.v.(type) {
 87 | 	case int64:
 88 | 		return TInteger
 89 | 	case float64:
 90 | 		return TReal
 91 | 	}
 92 | 	return TString
 93 | }
 94 | 
 95 | //
 96 | // SetValue set the record value from string using type `t`. If value can not
 97 | // be converted to type, it will return an error.
 98 | //
 99 | func (r *Record) SetValue(v string, t int) error {
100 | 	switch t {
101 | 	case TString:
102 | 		r.v = v
103 | 
104 | 	case TInteger:
105 | 		i64, e := strconv.ParseInt(v, 10, 64)
106 | 		if nil != e {
107 | 			return e
108 | 		}
109 | 
110 | 		r.v = i64
111 | 
112 | 	case TReal:
113 | 		f64, e := strconv.ParseFloat(v, 64)
114 | 		if nil != e {
115 | 			return e
116 | 		}
117 | 
118 | 		r.v = f64
119 | 	}
120 | 	return nil
121 | }
122 | 
//
// SetString will replace the record value with string `v`, whatever the
// previous value was.
//
func (r *Record) SetString(v string) {
	r.v = v
}
129 | 
//
// SetFloat will replace the record value with 64 bit float `v`, whatever the
// previous value was.
//
func (r *Record) SetFloat(v float64) {
	r.v = v
}
136 | 
//
// SetInteger will replace the record value with 64 bit integer `v`, whatever
// the previous value was.
//
func (r *Record) SetInteger(v int64) {
	r.v = v
}
143 | 
144 | //
145 | // IsMissingValue check wether the value is a missing attribute.
146 | //
147 | // If its string the missing value is indicated by character '?'.
148 | //
149 | // If its integer the missing value is indicated by minimum negative integer,
150 | // or math.MinInt64.
151 | //
152 | // If its real the missing value is indicated by -Inf.
153 | //
154 | func (r *Record) IsMissingValue() bool {
155 | 	switch r.v.(type) {
156 | 	case string:
157 | 		str := r.v.(string)
158 | 		if str == "?" {
159 | 			return true
160 | 		}
161 | 
162 | 	case int64:
163 | 		i64 := r.v.(int64)
164 | 		if i64 == math.MinInt64 {
165 | 			return true
166 | 		}
167 | 
168 | 	case float64:
169 | 		f64 := r.v.(float64)
170 | 		return math.IsInf(f64, -1)
171 | 	}
172 | 
173 | 	return false
174 | }
175 | 
//
// Interface return the record value as is, without any conversion.
//
func (r *Record) Interface() interface{} {
	return r.v
}
182 | 
//
// Bytes convert record value to slice of byte, using the string
// representation of the value.
//
func (r *Record) Bytes() []byte {
	return []byte(r.String())
}
189 | 
190 | //
191 | // String convert record value to string.
192 | //
193 | func (r Record) String() (s string) {
194 | 	switch r.v.(type) {
195 | 	case string:
196 | 		s = r.v.(string)
197 | 
198 | 	case int64:
199 | 		s = strconv.FormatInt(r.v.(int64), 10)
200 | 
201 | 	case float64:
202 | 		s = strconv.FormatFloat(r.v.(float64), 'f', -1, 64)
203 | 	}
204 | 	return
205 | }
206 | 
207 | //
208 | // Float convert given record to float value. If its failed it will return
209 | // the -Infinity value.
210 | //
211 | func (r *Record) Float() (f64 float64) {
212 | 	var e error
213 | 
214 | 	switch r.v.(type) {
215 | 	case string:
216 | 		f64, e = strconv.ParseFloat(r.v.(string), 64)
217 | 
218 | 		if nil != e {
219 | 			f64 = math.Inf(-1)
220 | 		}
221 | 
222 | 	case int64:
223 | 		f64 = float64(r.v.(int64))
224 | 
225 | 	case float64:
226 | 		f64 = r.v.(float64)
227 | 	}
228 | 
229 | 	return
230 | }
231 | 
232 | //
233 | // Integer convert given record to integer value. If its failed, it will return
234 | // the minimum integer in 64bit.
235 | //
236 | func (r *Record) Integer() (i64 int64) {
237 | 	var e error
238 | 
239 | 	switch r.v.(type) {
240 | 	case string:
241 | 		i64, e = strconv.ParseInt(r.v.(string), 10, 64)
242 | 
243 | 		if nil != e {
244 | 			i64 = math.MinInt64
245 | 		}
246 | 
247 | 	case int64:
248 | 		i64 = r.v.(int64)
249 | 
250 | 	case float64:
251 | 		i64 = int64(r.v.(float64))
252 | 	}
253 | 
254 | 	return
255 | }
256 | 
//
// IsEqual return true if the value of record is deeply equal, in type and
// value, with the value of record `o`; otherwise return false.
//
// The record `o` must not be nil.
//
func (r *Record) IsEqual(o *Record) bool {
	return reflect.DeepEqual(r.v, o.Interface())
}
263 | 
//
// IsEqualToString return true if string representation of record value is
// equal to string `v`. The type of record value is not considered, only its
// output from String.
//
func (r *Record) IsEqualToString(v string) bool {
	return r.String() == v
}
271 | 
//
// IsEqualToInterface return true if interface type and value is deeply equal
// to record type and value.
//
func (r *Record) IsEqualToInterface(v interface{}) bool {
	return reflect.DeepEqual(r.v, v)
}
279 | 
280 | //
281 | // Reset will reset record value to empty string or zero, depend on type.
282 | //
283 | func (r *Record) Reset() {
284 | 	switch r.v.(type) {
285 | 	case string:
286 | 		r.v = ""
287 | 	case int64:
288 | 		r.v = int64(0)
289 | 	case float64:
290 | 		r.v = float64(0)
291 | 	}
292 | }
293 | 


--------------------------------------------------------------------------------
/column.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2017 M. Shulhan <ms@kilabit.info>. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style license that can be found
  3 | // in the LICENSE file.
  4 | 
  5 | package tabula
  6 | 
  7 | import (
  8 | 	"strconv"
  9 | )
 10 | 
//
// Column represent slice of record, together with its metadata. A vertical
// representation of data.
//
type Column struct {
	// Name of column. String identifier for the column.
	Name string
	// Type of column. All record in column have the same type.
	Type int
	// Flag additional attribute that can be set to mark some value on this
	// column
	Flag int
	// ValueSpace contain the possible value in records
	ValueSpace []string
	// Records contain column data.
	Records Records
}
 27 | 
 28 | //
 29 | // NewColumn return new column with type and name.
 30 | //
 31 | func NewColumn(colType int, colName string) (col *Column) {
 32 | 	col = &Column{
 33 | 		Type: colType,
 34 | 		Name: colName,
 35 | 		Flag: 0,
 36 | 	}
 37 | 
 38 | 	col.Records = make([]*Record, 0)
 39 | 
 40 | 	return
 41 | }
 42 | 
 43 | //
 44 | // NewColumnString initialize column with type anda data as string.
 45 | //
 46 | func NewColumnString(data []string, colType int, colName string) (
 47 | 	col *Column,
 48 | 	e error,
 49 | ) {
 50 | 	col = NewColumn(colType, colName)
 51 | 
 52 | 	datalen := len(data)
 53 | 
 54 | 	if datalen <= 0 {
 55 | 		return
 56 | 	}
 57 | 
 58 | 	col.Records = make([]*Record, datalen)
 59 | 
 60 | 	for x := 0; x < datalen; x++ {
 61 | 		col.Records[x] = NewRecordString(data[x])
 62 | 	}
 63 | 
 64 | 	return col, nil
 65 | }
 66 | 
 67 | //
 68 | // NewColumnInt create new column with record type as integer, and fill it
 69 | // with `data`.
 70 | //
 71 | func NewColumnInt(data []int64, colName string) (col *Column) {
 72 | 	col = NewColumn(TInteger, colName)
 73 | 
 74 | 	datalen := len(data)
 75 | 	if datalen <= 0 {
 76 | 		return
 77 | 	}
 78 | 
 79 | 	col.Records = make([]*Record, datalen)
 80 | 
 81 | 	for x, v := range data {
 82 | 		col.Records[x] = NewRecordInt(v)
 83 | 	}
 84 | 	return
 85 | }
 86 | 
 87 | //
 88 | // NewColumnReal create new column with record type is real.
 89 | //
 90 | func NewColumnReal(data []float64, colName string) (col *Column) {
 91 | 	col = NewColumn(TReal, colName)
 92 | 
 93 | 	datalen := len(data)
 94 | 
 95 | 	if datalen <= 0 {
 96 | 		return
 97 | 	}
 98 | 
 99 | 	col.Records = make([]*Record, datalen)
100 | 
101 | 	for x := 0; x < datalen; x++ {
102 | 		rec := NewRecordReal(data[x])
103 | 		col.Records[x] = rec
104 | 	}
105 | 
106 | 	return
107 | }
108 | 
//
// SetType will set the type of column to `tipe`. Records that are already in
// the column are not converted.
//
func (col *Column) SetType(tipe int) {
	col.Type = tipe
}
115 | 
//
// SetName will set the name of column to `name`.
//
func (col *Column) SetName(name string) {
	col.Name = name
}
122 | 
//
// GetType return the type of column, as stored in field Type.
//
func (col *Column) GetType() int {
	return col.Type
}
129 | 
//
// GetName return the column name, as stored in field Name.
//
func (col *Column) GetName() string {
	return col.Name
}
136 | 
//
// SetRecords will replace the records in column with the records pointed by
// `recs`. The pointer `recs` must not be nil.
//
func (col *Column) SetRecords(recs *Records) {
	col.Records = *recs
}
143 | 
//
// Interface return the column object, a pointer to Column, as an interface.
//
func (col *Column) Interface() interface{} {
	return col
}
150 | 
//
// Reset column data and flag: the flag is set to zero and the records are
// replaced with a new empty slice. Name, type, and value space are kept.
//
func (col *Column) Reset() {
	col.Flag = 0
	col.Records = make([]*Record, 0)
}
158 | 
//
// Len return the number of record in column.
//
func (col *Column) Len() int {
	return len(col.Records)
}
165 | 
//
// PushBack push record `r` to the end of column. The record is appended as
// is, even if its nil.
//
func (col *Column) PushBack(r *Record) {
	col.Records = append(col.Records, r)
}
172 | 
//
// PushRecords append slice of record `rs` to the end of column's records.
//
func (col *Column) PushRecords(rs []*Record) {
	col.Records = append(col.Records, rs...)
}
179 | 
180 | //
181 | // ToIntegers convert slice of record to slice of int64.
182 | //
183 | func (col *Column) ToIntegers() []int64 {
184 | 	newcol := make([]int64, col.Len())
185 | 
186 | 	for x := range col.Records {
187 | 		newcol[x] = col.Records[x].Integer()
188 | 	}
189 | 
190 | 	return newcol
191 | }
192 | 
193 | //
194 | // ToFloatSlice convert slice of record to slice of float64.
195 | //
196 | func (col *Column) ToFloatSlice() (newcol []float64) {
197 | 	newcol = make([]float64, col.Len())
198 | 
199 | 	for i := range col.Records {
200 | 		newcol[i] = col.Records[i].Float()
201 | 	}
202 | 
203 | 	return
204 | }
205 | 
206 | //
207 | // ToStringSlice convert slice of record to slice of string.
208 | //
209 | func (col *Column) ToStringSlice() (newcol []string) {
210 | 	newcol = make([]string, col.Len())
211 | 
212 | 	for i := range col.Records {
213 | 		newcol[i] = col.Records[i].String()
214 | 	}
215 | 
216 | 	return
217 | }
218 | 
219 | //
220 | // ClearValues set all value in column to empty string or zero if column type is
221 | // numeric.
222 | //
223 | func (col *Column) ClearValues() {
224 | 	for _, r := range col.Records {
225 | 		r.Reset()
226 | 	}
227 | }
228 | 
229 | //
230 | // SetValueAt will set column value at cell `idx` with `v`, unless the index
231 | // is out of range.
232 | //
233 | func (col *Column) SetValueAt(idx int, v string) {
234 | 	if idx < 0 {
235 | 		return
236 | 	}
237 | 	if col.Records.Len() <= idx {
238 | 		return
239 | 	}
240 | 	_ = col.Records[idx].SetValue(v, col.Type)
241 | }
242 | 
243 | //
244 | // SetValueByNumericAt will set column value at cell `idx` with numeric value
245 | // `v`, unless the index is out of range.
246 | //
247 | func (col *Column) SetValueByNumericAt(idx int, v float64) {
248 | 	if idx < 0 {
249 | 		return
250 | 	}
251 | 	if col.Records.Len() <= idx {
252 | 		return
253 | 	}
254 | 	switch col.Type {
255 | 	case TString:
256 | 		col.Records[idx].SetString(strconv.FormatFloat(v, 'f', -1, 64))
257 | 	case TInteger:
258 | 		col.Records[idx].SetInteger(int64(v))
259 | 	case TReal:
260 | 		col.Records[idx].SetFloat(v)
261 | 	}
262 | }
263 | 
264 | //
265 | // SetValues of all column record.
266 | //
267 | func (col *Column) SetValues(values []string) {
268 | 	vallen := len(values)
269 | 	reclen := col.Len()
270 | 
271 | 	// initialize column record if its empty.
272 | 	if reclen <= 0 {
273 | 		col.Records = make([]*Record, vallen)
274 | 		reclen = vallen
275 | 	}
276 | 
277 | 	// pick the least length
278 | 	minlen := reclen
279 | 	if vallen < reclen {
280 | 		minlen = vallen
281 | 	}
282 | 
283 | 	for x := 0; x < minlen; x++ {
284 | 		_ = col.Records[x].SetValue(values[x], col.Type)
285 | 	}
286 | }
287 | 
288 | //
289 | // DeleteRecordAt will delete record at index `i` and return it.
290 | //
291 | func (col *Column) DeleteRecordAt(i int) *Record {
292 | 	if i < 0 {
293 | 		return nil
294 | 	}
295 | 
296 | 	clen := col.Len()
297 | 	if i >= clen {
298 | 		return nil
299 | 	}
300 | 
301 | 	r := col.Records[i]
302 | 
303 | 	last := clen - 1
304 | 	copy(col.Records[i:], col.Records[i+1:])
305 | 	col.Records[last] = nil
306 | 	col.Records = col.Records[0:last]
307 | 
308 | 	return r
309 | }
310 | 


--------------------------------------------------------------------------------
/claset.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2017 M. Shulhan <ms@kilabit.info>. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style license that can be found
  3 | // in the LICENSE file.
  4 | 
  5 | package tabula
  6 | 
  7 | import (
  8 | 	"fmt"
  9 | 	"github.com/shuLhan/numerus"
 10 | 	"github.com/shuLhan/tekstus"
 11 | 	"strconv"
 12 | )
 13 | 
//
// Claset define a dataset with class attribute, that is a dataset where one
// of the columns, indicated by ClassIndex, is the classification target.
//
type Claset struct {
	// Dataset embedded, for implementing the dataset interface.
	Dataset
	// ClassIndex contain index for target classification in columns.
	ClassIndex int `json:"ClassIndex"`

	// vs contain a copy of value space.
	vs []string
	// counts number of value space in current set.
	counts []int

	// major contain the name of majority class in dataset.
	major string
	// minor contain the name of minority class in dataset.
	minor string
}
 33 | 
 34 | //
 35 | // NewClaset create and return new Claset object.
 36 | //
 37 | func NewClaset(mode int, types []int, names []string) (claset *Claset) {
 38 | 	claset = &Claset{
 39 | 		ClassIndex: -1,
 40 | 	}
 41 | 
 42 | 	claset.Init(mode, types, names)
 43 | 
 44 | 	return
 45 | }
 46 | 
 47 | //
 48 | // Clone return a copy of current claset object.
 49 | //
 50 | func (claset *Claset) Clone() interface{} {
 51 | 	clone := Claset{
 52 | 		ClassIndex: claset.GetClassIndex(),
 53 | 		major:      claset.MajorityClass(),
 54 | 		minor:      claset.MinorityClass(),
 55 | 	}
 56 | 	clone.SetDataset(claset.GetDataset().Clone().(DatasetInterface))
 57 | 	return &clone
 58 | }
 59 | 
 60 | //
 61 | // GetDataset return the dataset.
 62 | //
 63 | func (claset *Claset) GetDataset() DatasetInterface {
 64 | 	return &claset.Dataset
 65 | }
 66 | 
 67 | //
 68 | // GetClassType return type of class in dataset.
 69 | //
 70 | func (claset *Claset) GetClassType() int {
 71 | 	if claset.Columns.Len() <= 0 {
 72 | 		return TString
 73 | 	}
 74 | 	return claset.Columns[claset.ClassIndex].Type
 75 | }
 76 | 
 77 | //
 78 | // GetClassValueSpace return the class value space.
 79 | //
 80 | func (claset *Claset) GetClassValueSpace() []string {
 81 | 	if claset.Columns.Len() <= 0 {
 82 | 		return nil
 83 | 	}
 84 | 	return claset.Columns[claset.ClassIndex].ValueSpace
 85 | }
 86 | 
 87 | //
 88 | // GetClassColumn return dataset class values in column.
 89 | //
 90 | func (claset *Claset) GetClassColumn() *Column {
 91 | 	if claset.Mode == DatasetModeRows {
 92 | 		claset.TransposeToColumns()
 93 | 	}
 94 | 	if claset.Columns.Len() <= 0 {
 95 | 		return nil
 96 | 	}
 97 | 	return &claset.Columns[claset.ClassIndex]
 98 | }
 99 | 
100 | //
101 | // GetClassRecords return class values as records.
102 | //
103 | func (claset *Claset) GetClassRecords() *Records {
104 | 	if claset.Mode == DatasetModeRows {
105 | 		claset.TransposeToColumns()
106 | 	}
107 | 	if claset.Columns.Len() <= 0 {
108 | 		return nil
109 | 	}
110 | 	return &claset.Columns[claset.ClassIndex].Records
111 | }
112 | 
113 | //
114 | // GetClassAsStrings return all class values as slice of string.
115 | //
116 | func (claset *Claset) GetClassAsStrings() []string {
117 | 	if claset.Mode == DatasetModeRows {
118 | 		claset.TransposeToColumns()
119 | 	}
120 | 	if claset.Columns.Len() <= 0 {
121 | 		return nil
122 | 	}
123 | 	return claset.Columns[claset.ClassIndex].ToStringSlice()
124 | }
125 | 
126 | //
127 | // GetClassAsReals return class record value as slice of float64.
128 | //
129 | func (claset *Claset) GetClassAsReals() []float64 {
130 | 	if claset.Mode == DatasetModeRows {
131 | 		claset.TransposeToColumns()
132 | 	}
133 | 	if claset.Columns.Len() <= 0 {
134 | 		return nil
135 | 	}
136 | 	return claset.Columns[claset.ClassIndex].ToFloatSlice()
137 | }
138 | 
139 | //
140 | // GetClassAsInteger return class record value as slice of int64.
141 | //
142 | func (claset *Claset) GetClassAsInteger() []int64 {
143 | 	if claset.Mode == DatasetModeRows {
144 | 		claset.TransposeToColumns()
145 | 	}
146 | 	if claset.Columns.Len() <= 0 {
147 | 		return nil
148 | 	}
149 | 	return claset.Columns[claset.ClassIndex].ToIntegers()
150 | }
151 | 
152 | //
153 | // GetClassIndex return index of class attribute in dataset.
154 | //
155 | func (claset *Claset) GetClassIndex() int {
156 | 	return claset.ClassIndex
157 | }
158 | 
159 | //
160 | // MajorityClass return the majority class of data.
161 | //
162 | func (claset *Claset) MajorityClass() string {
163 | 	return claset.major
164 | }
165 | 
166 | //
167 | // MinorityClass return the minority class in dataset.
168 | //
169 | func (claset *Claset) MinorityClass() string {
170 | 	return claset.minor
171 | }
172 | 
173 | //
174 | // Counts return the number of each class in value-space.
175 | //
176 | func (claset *Claset) Counts() []int {
177 | 	if len(claset.counts) <= 0 {
178 | 		claset.CountValueSpaces()
179 | 	}
180 | 	return claset.counts
181 | }
182 | 
183 | //
184 | // SetDataset in class set.
185 | //
186 | func (claset *Claset) SetDataset(dataset DatasetInterface) {
187 | 	claset.Dataset = *(dataset.(*Dataset))
188 | }
189 | 
190 | //
191 | // SetClassIndex will set the class index to `v`.
192 | //
193 | func (claset *Claset) SetClassIndex(v int) {
194 | 	claset.ClassIndex = v
195 | }
196 | 
197 | //
198 | // SetMajorityClass will set the majority class to `v`.
199 | //
200 | func (claset *Claset) SetMajorityClass(v string) {
201 | 	claset.major = v
202 | }
203 | 
204 | //
205 | // SetMinorityClass will set the minority class to `v`.
206 | //
207 | func (claset *Claset) SetMinorityClass(v string) {
208 | 	claset.minor = v
209 | }
210 | 
211 | //
212 | // CountValueSpaces will count number of value space in current dataset.
213 | //
214 | func (claset *Claset) CountValueSpaces() {
215 | 	classv := claset.GetClassAsStrings()
216 | 	claset.vs = claset.GetClassValueSpace()
217 | 
218 | 	claset.counts = tekstus.WordsCountTokens(classv, claset.vs, false)
219 | }
220 | 
221 | //
222 | // RecountMajorMinor recount major and minor class in claset.
223 | //
224 | func (claset *Claset) RecountMajorMinor() {
225 | 	claset.CountValueSpaces()
226 | 
227 | 	_, maxIdx, maxok := numerus.IntsFindMax(claset.counts)
228 | 	_, minIdx, minok := numerus.IntsFindMin(claset.counts)
229 | 
230 | 	if maxok {
231 | 		claset.major = claset.vs[maxIdx]
232 | 	}
233 | 	if minok {
234 | 		claset.minor = claset.vs[minIdx]
235 | 	}
236 | }
237 | 
238 | //
239 | // IsInSingleClass check whether all target class contain only single value.
240 | // Return true and name of target if all rows is in the same class,
241 | // false and empty string otherwise.
242 | //
243 | func (claset *Claset) IsInSingleClass() (single bool, class string) {
244 | 	classv := claset.GetClassAsStrings()
245 | 
246 | 	for i, t := range classv {
247 | 		if i == 0 {
248 | 			single = true
249 | 			class = t
250 | 			continue
251 | 		}
252 | 		if t != class {
253 | 			return false, ""
254 | 		}
255 | 	}
256 | 	return
257 | }
258 | 
259 | //
260 | // GetMinorityRows return rows where their class is minority in dataset, or nil
261 | // if dataset is empty.
262 | //
263 | func (claset *Claset) GetMinorityRows() *Rows {
264 | 	if claset.Len() == 0 {
265 | 		return nil
266 | 	}
267 | 	if claset.vs == nil {
268 | 		claset.RecountMajorMinor()
269 | 	}
270 | 
271 | 	minRows := claset.GetRows().SelectWhere(claset.ClassIndex,
272 | 		claset.minor)
273 | 
274 | 	return &minRows
275 | }
276 | 
277 | //
278 | // String, yes it will pretty print the meta-data in JSON format.
279 | //
280 | func (claset *Claset) String() (s string) {
281 | 	if claset.vs == nil {
282 | 		claset.RecountMajorMinor()
283 | 	}
284 | 
285 | 	s = fmt.Sprintf("'claset':{'rows': %d, 'columns': %d, ", claset.Len(),
286 | 		claset.GetNColumn())
287 | 
288 | 	s += "'vs':{"
289 | 	for x, v := range claset.vs {
290 | 		if x > 0 {
291 | 			s += ", "
292 | 		}
293 | 		s += "'" + v + "':" + strconv.Itoa(claset.counts[x])
294 | 	}
295 | 	s += "}"
296 | 
297 | 	s += ", 'major': '" + claset.major + "'"
298 | 	s += ", 'minor': '" + claset.minor + "'"
299 | 	s += "}"
300 | 
301 | 	return
302 | }
303 | 


--------------------------------------------------------------------------------
/dataset_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2017 M. Shulhan <ms@kilabit.info>. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style license that can be found
  3 | // in the LICENSE file.
  4 | 
  5 | package tabula_test
  6 | 
  7 | import (
  8 | 	"fmt"
  9 | 	"github.com/shuLhan/tabula"
 10 | 	"testing"
 11 | )
 12 | 
// datasetRows is the test fixture in row form: ten rows of three fields
// (integer, real, string), written as strings.
var datasetRows = [][]string{
	{"0", "1", "A"},
	{"1", "1.1", "B"},
	{"2", "1.2", "A"},
	{"3", "1.3", "B"},
	{"4", "1.4", "C"},
	{"5", "1.5", "D"},
	{"6", "1.6", "C"},
	{"7", "1.7", "D"},
	{"8", "1.8", "E"},
	{"9", "1.9", "F"},
}

// datasetCols is the same fixture as datasetRows, in column form.
var datasetCols = [][]string{
	{"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"},
	{"1", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7", "1.8", "1.9"},
	{"A", "B", "A", "B", "C", "D", "C", "D", "E", "F"},
}

// datasetTypes is the type of each fixture column, in order.
var datasetTypes = []int{
	tabula.TInteger,
	tabula.TReal,
	tabula.TString,
}

// datasetNames is the name of each fixture column, in order.
var datasetNames = []string{"int", "real", "string"}
 39 | 
 40 | func populateWithRows(dataset *tabula.Dataset) error {
 41 | 	for _, rowin := range datasetRows {
 42 | 		row := make(tabula.Row, len(rowin))
 43 | 
 44 | 		for x, recin := range rowin {
 45 | 			rec, e := tabula.NewRecordBy(recin, datasetTypes[x])
 46 | 			if e != nil {
 47 | 				return e
 48 | 			}
 49 | 
 50 | 			row[x] = rec
 51 | 		}
 52 | 
 53 | 		dataset.PushRow(&row)
 54 | 	}
 55 | 	return nil
 56 | }
 57 | 
 58 | func populateWithColumns(t *testing.T, dataset *tabula.Dataset) {
 59 | 	for x := range datasetCols {
 60 | 		col, e := tabula.NewColumnString(datasetCols[x], datasetTypes[x],
 61 | 			datasetNames[x])
 62 | 		if e != nil {
 63 | 			t.Fatal(e)
 64 | 		}
 65 | 
 66 | 		dataset.PushColumn(*col)
 67 | 	}
 68 | }
 69 | 
 70 | func createDataset(t *testing.T) (dataset *tabula.Dataset) {
 71 | 	dataset = tabula.NewDataset(tabula.DatasetModeRows, datasetTypes,
 72 | 		datasetNames)
 73 | 
 74 | 	e := populateWithRows(dataset)
 75 | 	if e != nil {
 76 | 		t.Fatal(e)
 77 | 	}
 78 | 
 79 | 	return
 80 | }
 81 | 
 82 | func DatasetStringJoinByIndex(t *testing.T, dataset [][]string, indis []int) (res string) {
 83 | 	for x := range indis {
 84 | 		res += fmt.Sprint("&", dataset[indis[x]])
 85 | 	}
 86 | 	return res
 87 | }
 88 | 
 89 | func DatasetRowsJoin(t *testing.T) (s string) {
 90 | 	for x := range datasetRows {
 91 | 		s += fmt.Sprint("&", datasetRows[x])
 92 | 	}
 93 | 	return
 94 | }
 95 | 
 96 | func DatasetColumnsJoin(t *testing.T) (s string) {
 97 | 	for x := range datasetCols {
 98 | 		s += fmt.Sprint(datasetCols[x])
 99 | 	}
100 | 	return
101 | }
102 | 
103 | func TestSplitRowsByNumeric(t *testing.T) {
104 | 	dataset := createDataset(t)
105 | 
106 | 	// Split integer by float
107 | 	splitL, splitR, e := tabula.SplitRowsByNumeric(dataset, 0, 4.5)
108 | 	if e != nil {
109 | 		t.Fatal(e)
110 | 	}
111 | 
112 | 	expIdx := []int{0, 1, 2, 3, 4}
113 | 	exp := DatasetStringJoinByIndex(t, datasetRows, expIdx)
114 | 	rows := splitL.GetDataAsRows()
115 | 	got := fmt.Sprint(rows)
116 | 
117 | 	assert(t, exp, got, true)
118 | 
119 | 	expIdx = []int{5, 6, 7, 8, 9}
120 | 	exp = DatasetStringJoinByIndex(t, datasetRows, expIdx)
121 | 	got = fmt.Sprint(splitR.GetDataAsRows())
122 | 
123 | 	assert(t, exp, got, true)
124 | 
125 | 	// Split by float
126 | 	splitL, splitR, e = tabula.SplitRowsByNumeric(dataset, 1, 1.8)
127 | 	if e != nil {
128 | 		t.Fatal(e)
129 | 	}
130 | 
131 | 	expIdx = []int{0, 1, 2, 3, 4, 5, 6, 7}
132 | 	exp = DatasetStringJoinByIndex(t, datasetRows, expIdx)
133 | 	got = fmt.Sprint(splitL.GetDataAsRows())
134 | 
135 | 	assert(t, exp, got, true)
136 | 
137 | 	expIdx = []int{8, 9}
138 | 	exp = DatasetStringJoinByIndex(t, datasetRows, expIdx)
139 | 	got = fmt.Sprint(splitR.GetDataAsRows())
140 | 
141 | 	assert(t, exp, got, true)
142 | }
143 | 
144 | func TestSplitRowsByCategorical(t *testing.T) {
145 | 	dataset := createDataset(t)
146 | 	splitval := []string{"A", "D"}
147 | 
148 | 	splitL, splitR, e := tabula.SplitRowsByCategorical(dataset, 2,
149 | 		splitval)
150 | 	if e != nil {
151 | 		t.Fatal(e)
152 | 	}
153 | 
154 | 	expIdx := []int{0, 2, 5, 7}
155 | 	exp := DatasetStringJoinByIndex(t, datasetRows, expIdx)
156 | 	got := fmt.Sprint(splitL.GetDataAsRows())
157 | 
158 | 	assert(t, exp, got, true)
159 | 
160 | 	expIdx = []int{1, 3, 4, 6, 8, 9}
161 | 	exp = DatasetStringJoinByIndex(t, datasetRows, expIdx)
162 | 	got = fmt.Sprint(splitR.GetDataAsRows())
163 | 
164 | 	assert(t, exp, got, true)
165 | }
166 | 
167 | func TestModeColumnsPushColumn(t *testing.T) {
168 | 	dataset := tabula.NewDataset(tabula.DatasetModeColumns, nil, nil)
169 | 
170 | 	exp := ""
171 | 	got := ""
172 | 	for x := range datasetCols {
173 | 		col, e := tabula.NewColumnString(datasetCols[x], datasetTypes[x],
174 | 			datasetNames[x])
175 | 		if e != nil {
176 | 			t.Fatal(e)
177 | 		}
178 | 
179 | 		dataset.PushColumn(*col)
180 | 
181 | 		exp += fmt.Sprint(datasetCols[x])
182 | 		got += fmt.Sprint(dataset.Columns[x].Records)
183 | 	}
184 | 
185 | 	assert(t, exp, got, true)
186 | 
187 | 	// Check rows
188 | 	exp = ""
189 | 	got = fmt.Sprint(dataset.Rows)
190 | 	assert(t, exp, got, true)
191 | }
192 | 
193 | func TestModeRowsPushColumn(t *testing.T) {
194 | 	dataset := tabula.NewDataset(tabula.DatasetModeRows, nil, nil)
195 | 
196 | 	populateWithColumns(t, dataset)
197 | 
198 | 	// Check rows
199 | 	exp := DatasetRowsJoin(t)
200 | 	got := fmt.Sprint(dataset.Rows)
201 | 
202 | 	assert(t, exp, got, true)
203 | 
204 | 	// Check columns
205 | 	exp = "[{int 1 0 [] []} {real 2 0 [] []} {string 0 0 [] []}]"
206 | 	got = fmt.Sprint(dataset.Columns)
207 | 
208 | 	assert(t, exp, got, true)
209 | }
210 | 
211 | func TestModeMatrixPushColumn(t *testing.T) {
212 | 	dataset := tabula.NewDataset(tabula.DatasetModeMatrix, nil, nil)
213 | 
214 | 	exp := ""
215 | 	got := ""
216 | 	for x := range datasetCols {
217 | 		col, e := tabula.NewColumnString(datasetCols[x], datasetTypes[x],
218 | 			datasetNames[x])
219 | 		if e != nil {
220 | 			t.Fatal(e)
221 | 		}
222 | 
223 | 		dataset.PushColumn(*col)
224 | 
225 | 		exp += fmt.Sprint(datasetCols[x])
226 | 		got += fmt.Sprint(dataset.Columns[x].Records)
227 | 	}
228 | 
229 | 	assert(t, exp, got, true)
230 | 
231 | 	// Check rows
232 | 	exp = DatasetRowsJoin(t)
233 | 	got = fmt.Sprint(dataset.Rows)
234 | 
235 | 	assert(t, exp, got, true)
236 | }
237 | 
238 | func TestModeRowsPushRows(t *testing.T) {
239 | 	dataset := tabula.NewDataset(tabula.DatasetModeRows, nil, nil)
240 | 
241 | 	e := populateWithRows(dataset)
242 | 	if e != nil {
243 | 		t.Fatal(e)
244 | 	}
245 | 
246 | 	exp := DatasetRowsJoin(t)
247 | 	got := fmt.Sprint(dataset.Rows)
248 | 
249 | 	assert(t, exp, got, true)
250 | }
251 | 
252 | func TestModeColumnsPushRows(t *testing.T) {
253 | 	dataset := tabula.NewDataset(tabula.DatasetModeColumns, nil, nil)
254 | 
255 | 	e := populateWithRows(dataset)
256 | 	if e != nil {
257 | 		t.Fatal(e)
258 | 	}
259 | 
260 | 	// check rows
261 | 	exp := ""
262 | 	got := fmt.Sprint(dataset.Rows)
263 | 
264 | 	assert(t, exp, got, true)
265 | 
266 | 	// check columns
267 | 	exp = DatasetColumnsJoin(t)
268 | 	got = ""
269 | 	for x := range dataset.Columns {
270 | 		got += fmt.Sprint(dataset.Columns[x].Records)
271 | 	}
272 | 
273 | 	assert(t, exp, got, true)
274 | }
275 | 
276 | func TestModeMatrixPushRows(t *testing.T) {
277 | 	dataset := tabula.NewDataset(tabula.DatasetModeMatrix, nil, nil)
278 | 
279 | 	e := populateWithRows(dataset)
280 | 	if e != nil {
281 | 		t.Fatal(e)
282 | 	}
283 | 
284 | 	exp := DatasetRowsJoin(t)
285 | 	got := fmt.Sprint(dataset.Rows)
286 | 
287 | 	assert(t, exp, got, true)
288 | 
289 | 	// check columns
290 | 	exp = DatasetColumnsJoin(t)
291 | 	got = ""
292 | 	for x := range dataset.Columns {
293 | 		got += fmt.Sprint(dataset.Columns[x].Records)
294 | 	}
295 | 
296 | 	assert(t, exp, got, true)
297 | }
298 | 
299 | func TestSelectRowsWhere(t *testing.T) {
300 | 	dataset := tabula.NewDataset(tabula.DatasetModeMatrix, nil, nil)
301 | 
302 | 	e := populateWithRows(dataset)
303 | 	if e != nil {
304 | 		t.Fatal(e)
305 | 	}
306 | 
307 | 	// select all rows where the first column value is 9.
308 | 	selected := tabula.SelectRowsWhere(dataset, 0, "9")
309 | 	exp := dataset.GetRow(9)
310 | 	got := selected.GetRow(0)
311 | 
312 | 	assert(t, exp, got, true)
313 | }
314 | 
315 | func TestDeleteRow(t *testing.T) {
316 | 	dataset := tabula.NewDataset(tabula.DatasetModeMatrix, nil, nil)
317 | 
318 | 	e := populateWithRows(dataset)
319 | 	if e != nil {
320 | 		t.Fatal(e)
321 | 	}
322 | 
323 | 	delIdx := 2
324 | 
325 | 	// Check rows len.
326 | 	exp := dataset.Len() - 1
327 | 	dataset.DeleteRow(delIdx)
328 | 	got := dataset.Len()
329 | 
330 | 	assert(t, exp, got, true)
331 | 
332 | 	// Check columns len.
333 | 	for _, col := range dataset.Columns {
334 | 		got = col.Len()
335 | 
336 | 		assert(t, exp, got, true)
337 | 	}
338 | 
339 | 	// Check rows data.
340 | 	ridx := 0
341 | 	for x, row := range datasetRows {
342 | 		if x == delIdx {
343 | 			continue
344 | 		}
345 | 		exp := fmt.Sprint("&", row)
346 | 		got := fmt.Sprint(dataset.GetRow(ridx))
347 | 		ridx++
348 | 
349 | 		assert(t, exp, got, true)
350 | 	}
351 | 
352 | 	// Check columns data.
353 | 	for x := range dataset.Columns {
354 | 		col := datasetCols[x]
355 | 
356 | 		coldel := []string{}
357 | 		coldel = append(coldel, col[:delIdx]...)
358 | 		coldel = append(coldel, col[delIdx+1:]...)
359 | 
360 | 		exp := fmt.Sprint(coldel)
361 | 		got := fmt.Sprint(dataset.Columns[x].Records)
362 | 		assert(t, exp, got, true)
363 | 	}
364 | }
365 | 


--------------------------------------------------------------------------------
/datasetinterface.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2017 M. Shulhan <ms@kilabit.info>. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style license that can be found
  3 | // in the LICENSE file.
  4 | 
  5 | package tabula
  6 | 
  7 | import (
  8 | 	"encoding/json"
  9 | 	"fmt"
 10 | 	"io/ioutil"
 11 | )
 12 | 
//
// DatasetInterface is the interface for working with DSV data.
//
// The package level functions that split, select, pick, and sort data accept
// this interface instead of a concrete dataset type.
//
type DatasetInterface interface {
	// Initialization, copy, and reset.
	Init(mode int, types []int, names []string)
	Clone() interface{}
	Reset() error

	// Mode of saved data (one of the DatasetMode* values).
	GetMode() int
	SetMode(mode int)

	// Size of dataset.
	GetNColumn() int
	GetNRow() int
	Len() int

	// Type of all columns.
	GetColumnsType() []int
	SetColumnsType(types []int)

	// Type of a single column, by index.
	GetColumnTypeAt(idx int) (int, error)
	SetColumnTypeAt(idx, tipe int) error

	// Name of all columns.
	GetColumnsName() []string
	SetColumnsName(names []string)

	// Column access.
	AddColumn(tipe int, name string, vs []string)
	GetColumn(idx int) *Column
	GetColumnByName(name string) *Column
	GetColumns() *Columns
	SetColumns(*Columns)

	// Row access.
	GetRow(idx int) *Row
	GetRows() *Rows
	SetRows(*Rows)
	DeleteRow(idx int) *Row

	// Data access.
	GetData() interface{}
	GetDataAsRows() *Rows
	GetDataAsColumns() *Columns

	// Conversion between rows and columns.
	TransposeToColumns()
	TransposeToRows()

	// Adding row or column.
	PushRow(r *Row)
	PushRowToColumns(r *Row)
	FillRowsWithColumn(colidx int, col Column)
	PushColumn(col Column)
	PushColumnToRows(col Column)

	// Merging with other dataset.
	MergeColumns(DatasetInterface)
	MergeRows(DatasetInterface)
}
 64 | 
 65 | //
 66 | // ReadDatasetConfig open dataset configuration file and initialize dataset
 67 | // field from there.
 68 | //
 69 | func ReadDatasetConfig(ds interface{}, fcfg string) (e error) {
 70 | 	cfg, e := ioutil.ReadFile(fcfg)
 71 | 
 72 | 	if nil != e {
 73 | 		return e
 74 | 	}
 75 | 
 76 | 	return json.Unmarshal(cfg, ds)
 77 | }
 78 | 
 79 | //
 80 | // SortColumnsByIndex will sort all columns using sorted index.
 81 | //
 82 | func SortColumnsByIndex(di DatasetInterface, sortedIdx []int) {
 83 | 	if di.GetMode() == DatasetModeRows {
 84 | 		di.TransposeToColumns()
 85 | 	}
 86 | 
 87 | 	cols := di.GetColumns()
 88 | 	for x, col := range *cols {
 89 | 		colsorted := col.Records.SortByIndex(sortedIdx)
 90 | 		(*cols)[x].SetRecords(colsorted)
 91 | 	}
 92 | }
 93 | 
 94 | //
 95 | // SplitRowsByNumeric will split the data using splitVal in column `colidx`.
 96 | //
 97 | // For example, given two continuous attribute,
 98 | //
 99 | // 	A: {1,2,3,4}
100 | // 	B: {5,6,7,8}
101 | //
102 | // if colidx is (1) B and splitVal is 7, the data will splitted into left set
103 | //
104 | // 	A': {1,2}
105 | // 	B': {5,6}
106 | //
107 | // and right set
108 | //
109 | // 	A'': {3,4}
110 | // 	B'': {7,8}
111 | //
112 | func SplitRowsByNumeric(di DatasetInterface, colidx int, splitVal float64) (
113 | 	splitLess DatasetInterface,
114 | 	splitGreater DatasetInterface,
115 | 	e error,
116 | ) {
117 | 	// check type of column
118 | 	coltype, e := di.GetColumnTypeAt(colidx)
119 | 	if e != nil {
120 | 		return
121 | 	}
122 | 
123 | 	if !(coltype == TInteger || coltype == TReal) {
124 | 		return splitLess, splitGreater, ErrInvalidColType
125 | 	}
126 | 
127 | 	// Should we convert the data mode back later.
128 | 	orgmode := di.GetMode()
129 | 
130 | 	if orgmode == DatasetModeColumns {
131 | 		di.TransposeToRows()
132 | 	}
133 | 
134 | 	if DEBUG >= 2 {
135 | 		fmt.Println("[tabula] dataset:", di)
136 | 	}
137 | 
138 | 	splitLess = di.Clone().(DatasetInterface)
139 | 	splitGreater = di.Clone().(DatasetInterface)
140 | 
141 | 	rows := di.GetRows()
142 | 	for _, row := range *rows {
143 | 		if (*row)[colidx].Float() < splitVal {
144 | 			splitLess.PushRow(row)
145 | 		} else {
146 | 			splitGreater.PushRow(row)
147 | 		}
148 | 	}
149 | 
150 | 	if DEBUG >= 2 {
151 | 		fmt.Println("[tabula] split less:", splitLess)
152 | 		fmt.Println("[tabula] split greater:", splitGreater)
153 | 	}
154 | 
155 | 	switch orgmode {
156 | 	case DatasetModeColumns:
157 | 		di.TransposeToColumns()
158 | 		splitLess.TransposeToColumns()
159 | 		splitGreater.TransposeToColumns()
160 | 	case DatasetModeMatrix:
161 | 		// do nothing, since its already filled when pushing new row.
162 | 	}
163 | 
164 | 	return
165 | }
166 | 
167 | //
168 | // SplitRowsByCategorical will split the data using a set of split value in
169 | // column `colidx`.
170 | //
171 | // For example, given two attributes,
172 | //
173 | // 	X: [A,B,A,B,C,D,C,D]
174 | // 	Y: [1,2,3,4,5,6,7,8]
175 | //
176 | // if colidx is (0) or A and split value is a set `[A,C]`, the data will
177 | // splitted into left set which contain all rows that have A or C,
178 | //
179 | // 	X': [A,A,C,C]
180 | // 	Y': [1,3,5,7]
181 | //
182 | // and the right set, excluded set, will contain all rows which is not A or C,
183 | //
184 | // 	X'': [B,B,D,D]
185 | // 	Y'': [2,4,6,8]
186 | //
187 | func SplitRowsByCategorical(di DatasetInterface, colidx int,
188 | 	splitVal []string) (
189 | 	splitIn DatasetInterface,
190 | 	splitEx DatasetInterface,
191 | 	e error,
192 | ) {
193 | 	// check type of column
194 | 	coltype, e := di.GetColumnTypeAt(colidx)
195 | 	if e != nil {
196 | 		return
197 | 	}
198 | 
199 | 	if coltype != TString {
200 | 		return splitIn, splitEx, ErrInvalidColType
201 | 	}
202 | 
203 | 	// should we convert the data mode back?
204 | 	orgmode := di.GetMode()
205 | 
206 | 	if orgmode == DatasetModeColumns {
207 | 		di.TransposeToRows()
208 | 	}
209 | 
210 | 	splitIn = di.Clone().(DatasetInterface)
211 | 	splitEx = di.Clone().(DatasetInterface)
212 | 
213 | 	for _, row := range *di.GetRows() {
214 | 		found := false
215 | 		for _, val := range splitVal {
216 | 			if (*row)[colidx].String() == val {
217 | 				splitIn.PushRow(row)
218 | 				found = true
219 | 				break
220 | 			}
221 | 		}
222 | 		if !found {
223 | 			splitEx.PushRow(row)
224 | 		}
225 | 	}
226 | 
227 | 	// convert all dataset based on original
228 | 	switch orgmode {
229 | 	case DatasetModeColumns:
230 | 		di.TransposeToColumns()
231 | 		splitIn.TransposeToColumns()
232 | 		splitEx.TransposeToColumns()
233 | 	case DatasetModeMatrix, DatasetNoMode:
234 | 		splitIn.TransposeToColumns()
235 | 		splitEx.TransposeToColumns()
236 | 	}
237 | 
238 | 	return
239 | }
240 | 
241 | //
242 | // SplitRowsByValue generic function to split data by value. This function will
243 | // split data using value in column `colidx`. If value is numeric it will return
244 | // any rows that have column value less than `value` in `splitL`, and any column
245 | // value greater or equal to `value` in `splitR`.
246 | //
247 | func SplitRowsByValue(di DatasetInterface, colidx int, value interface{}) (
248 | 	splitL DatasetInterface,
249 | 	splitR DatasetInterface,
250 | 	e error,
251 | ) {
252 | 	coltype, e := di.GetColumnTypeAt(colidx)
253 | 	if e != nil {
254 | 		return
255 | 	}
256 | 
257 | 	if coltype == TString {
258 | 		splitL, splitR, e = SplitRowsByCategorical(di, colidx,
259 | 			value.([]string))
260 | 	} else {
261 | 		var splitval float64
262 | 
263 | 		switch value.(type) {
264 | 		case int:
265 | 			splitval = float64(value.(int))
266 | 		case int64:
267 | 			splitval = float64(value.(int64))
268 | 		case float32:
269 | 			splitval = float64(value.(float32))
270 | 		case float64:
271 | 			splitval = value.(float64)
272 | 		}
273 | 
274 | 		splitL, splitR, e = SplitRowsByNumeric(di, colidx,
275 | 			splitval)
276 | 	}
277 | 
278 | 	if e != nil {
279 | 		return nil, nil, e
280 | 	}
281 | 
282 | 	return
283 | }
284 | 
285 | //
286 | // SelectRowsWhere return all rows which column value in `colidx` is equal to
287 | // `colval`.
288 | //
289 | func SelectRowsWhere(dataset DatasetInterface, colidx int, colval string) DatasetInterface {
290 | 	orgmode := dataset.GetMode()
291 | 
292 | 	if orgmode == DatasetModeColumns {
293 | 		dataset.TransposeToRows()
294 | 	}
295 | 
296 | 	selected := NewDataset(dataset.GetMode(), nil, nil)
297 | 
298 | 	selected.Rows = dataset.GetRows().SelectWhere(colidx, colval)
299 | 
300 | 	switch orgmode {
301 | 	case DatasetModeColumns:
302 | 		dataset.TransposeToColumns()
303 | 		selected.TransposeToColumns()
304 | 	case DatasetModeMatrix, DatasetNoMode:
305 | 		selected.TransposeToColumns()
306 | 	}
307 | 
308 | 	return selected
309 | }
310 | 
311 | //
312 | // RandomPickRows return `n` item of row that has been selected randomly from
313 | // dataset.Rows. The ids of rows that has been picked is saved id `pickedIdx`.
314 | //
315 | // If duplicate is true, the row that has been picked can be picked up again,
316 | // otherwise it only allow one pick. This is also called as random selection
317 | // with or without replacement in machine learning domain.
318 | //
319 | // If output mode is columns, it will be transposed to rows.
320 | //
321 | func RandomPickRows(dataset DatasetInterface, n int, duplicate bool) (
322 | 	picked DatasetInterface,
323 | 	unpicked DatasetInterface,
324 | 	pickedIdx []int,
325 | 	unpickedIdx []int,
326 | ) {
327 | 	orgmode := dataset.GetMode()
328 | 
329 | 	if orgmode == DatasetModeColumns {
330 | 		dataset.TransposeToRows()
331 | 	}
332 | 
333 | 	picked = dataset.Clone().(DatasetInterface)
334 | 	unpicked = dataset.Clone().(DatasetInterface)
335 | 
336 | 	pickedRows, unpickedRows, pickedIdx, unpickedIdx :=
337 | 		dataset.GetRows().RandomPick(n, duplicate)
338 | 
339 | 	picked.SetRows(&pickedRows)
340 | 	unpicked.SetRows(&unpickedRows)
341 | 
342 | 	// switch the dataset based on original mode
343 | 	switch orgmode {
344 | 	case DatasetModeColumns:
345 | 		dataset.TransposeToColumns()
346 | 		// transform the picked and unpicked set.
347 | 		picked.TransposeToColumns()
348 | 		unpicked.TransposeToColumns()
349 | 
350 | 	case DatasetModeMatrix, DatasetNoMode:
351 | 		// transform the picked and unpicked set.
352 | 		picked.TransposeToColumns()
353 | 		unpicked.TransposeToColumns()
354 | 	}
355 | 
356 | 	return
357 | }
358 | 
359 | //
360 | // RandomPickColumns will select `n` column randomly from dataset and return
361 | // new dataset with picked and unpicked columns, and their column index.
362 | //
363 | // If duplicate is true, column that has been pick up can be pick up again.
364 | //
365 | // If dataset output mode is rows, it will transposed to columns.
366 | //
367 | func RandomPickColumns(dataset DatasetInterface, n int, dup bool,
368 | 	excludeIdx []int) (
369 | 	picked DatasetInterface,
370 | 	unpicked DatasetInterface,
371 | 	pickedIdx []int,
372 | 	unpickedIdx []int,
373 | ) {
374 | 	orgmode := dataset.GetMode()
375 | 
376 | 	if orgmode == DatasetModeRows {
377 | 		dataset.TransposeToColumns()
378 | 	}
379 | 
380 | 	picked = dataset.Clone().(DatasetInterface)
381 | 	unpicked = dataset.Clone().(DatasetInterface)
382 | 
383 | 	pickedColumns, unpickedColumns, pickedIdx, unpickedIdx :=
384 | 		dataset.GetColumns().RandomPick(n, dup, excludeIdx)
385 | 
386 | 	picked.SetColumns(&pickedColumns)
387 | 	unpicked.SetColumns(&unpickedColumns)
388 | 
389 | 	// transpose picked and unpicked dataset based on original mode
390 | 	switch orgmode {
391 | 	case DatasetModeRows:
392 | 		dataset.TransposeToRows()
393 | 		picked.TransposeToRows()
394 | 		unpicked.TransposeToRows()
395 | 	case DatasetModeMatrix, DatasetNoMode:
396 | 		picked.TransposeToRows()
397 | 		unpicked.TransposeToRows()
398 | 	}
399 | 
400 | 	return
401 | }
402 | 
403 | //
404 | // SelectColumnsByIdx return new dataset with selected column index.
405 | //
406 | func SelectColumnsByIdx(dataset DatasetInterface, colsIdx []int) (
407 | 	newset DatasetInterface,
408 | ) {
409 | 	var col *Column
410 | 
411 | 	orgmode := dataset.GetMode()
412 | 
413 | 	if orgmode == DatasetModeRows {
414 | 		dataset.TransposeToColumns()
415 | 	}
416 | 
417 | 	newset = dataset.Clone().(DatasetInterface)
418 | 
419 | 	for _, idx := range colsIdx {
420 | 		col = dataset.GetColumn(idx)
421 | 		if col == nil {
422 | 			continue
423 | 		}
424 | 
425 | 		newset.PushColumn(*col)
426 | 	}
427 | 
428 | 	// revert the mode back
429 | 	switch orgmode {
430 | 	case DatasetModeRows:
431 | 		dataset.TransposeToRows()
432 | 		newset.TransposeToRows()
433 | 	case DatasetModeColumns:
434 | 		// do nothing
435 | 	case DatasetModeMatrix:
436 | 		// do nothing
437 | 	}
438 | 
439 | 	return
440 | }
441 | 


--------------------------------------------------------------------------------
/dataset.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2017 M. Shulhan <ms@kilabit.info>. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style license that can be found
  3 | // in the LICENSE file.
  4 | 
  5 | package tabula
  6 | 
  7 | import (
  8 | 	"errors"
  9 | 	"math"
 10 | )
 11 | 
// List of dataset mode, which define how the data is saved in dataset.
const (
	// DatasetNoMode default to matrix.
	DatasetNoMode = 0
	// DatasetModeRows for output mode in rows.
	DatasetModeRows = 1
	// DatasetModeColumns for output mode in columns.
	DatasetModeColumns = 2
	// DatasetModeMatrix will save data in rows and columns.
	DatasetModeMatrix = 4
)
 22 | 
// List of errors that can be returned by this package.
var (
	// ErrColIdxOutOfRange operation on column index is invalid
	ErrColIdxOutOfRange = errors.New("tabula: Column index out of range")
	// ErrInvalidColType operation on column with different type, for
	// example splitting by numeric value on a column which type is not
	// integer or real.
	ErrInvalidColType = errors.New("tabula: Invalid column type")
	// ErrMisColLength returned when operation on columns does not match
	// between parameter and their length
	ErrMisColLength = errors.New("tabula: mismatch on column length")
)
 32 | 
//
// Dataset contain the data, mode of saved data, number of columns and rows in
// data.
//
type Dataset struct {
	// Mode define the numeric value of output mode, one of the
	// DatasetMode* constants.
	Mode int
	// Columns is input data that has been parsed.
	Columns Columns
	// Rows is input data that has been parsed.
	Rows Rows
}
 45 | 
 46 | //
 47 | // NewDataset create new dataset, use the mode to initialize the dataset.
 48 | //
 49 | func NewDataset(mode int, types []int, names []string) (
 50 | 	dataset *Dataset,
 51 | ) {
 52 | 	dataset = &Dataset{}
 53 | 
 54 | 	dataset.Init(mode, types, names)
 55 | 
 56 | 	return
 57 | }
 58 | 
 59 | //
 60 | // Init will set the dataset using mode and types.
 61 | //
 62 | func (dataset *Dataset) Init(mode int, types []int, names []string) {
 63 | 	if types == nil {
 64 | 		dataset.Columns = make(Columns, 0)
 65 | 	} else {
 66 | 		dataset.Columns = make(Columns, len(types))
 67 | 		dataset.Columns.SetTypes(types)
 68 | 	}
 69 | 
 70 | 	dataset.SetColumnsName(names)
 71 | 	dataset.SetMode(mode)
 72 | }
 73 | 
 74 | //
 75 | // Clone return a copy of current dataset.
 76 | //
 77 | func (dataset *Dataset) Clone() interface{} {
 78 | 	clone := NewDataset(dataset.GetMode(), nil, nil)
 79 | 
 80 | 	for _, col := range dataset.Columns {
 81 | 		newcol := Column{
 82 | 			Type:       col.Type,
 83 | 			Name:       col.Name,
 84 | 			ValueSpace: col.ValueSpace,
 85 | 		}
 86 | 		clone.PushColumn(newcol)
 87 | 	}
 88 | 
 89 | 	return clone
 90 | }
 91 | 
 92 | //
 93 | // Reset all data and attributes.
 94 | //
 95 | func (dataset *Dataset) Reset() error {
 96 | 	dataset.Rows = Rows{}
 97 | 	dataset.Columns.Reset()
 98 | 	return nil
 99 | }
100 | 
101 | //
102 | // GetMode return mode of data.
103 | //
104 | func (dataset *Dataset) GetMode() int {
105 | 	return dataset.Mode
106 | }
107 | 
108 | //
109 | // SetMode of saved data to `mode`.
110 | //
111 | func (dataset *Dataset) SetMode(mode int) {
112 | 	switch mode {
113 | 	case DatasetModeRows:
114 | 		dataset.Mode = DatasetModeRows
115 | 		dataset.Rows = make(Rows, 0)
116 | 	case DatasetModeColumns:
117 | 		dataset.Mode = DatasetModeColumns
118 | 		dataset.Columns.Reset()
119 | 	default:
120 | 		dataset.Mode = DatasetModeMatrix
121 | 		dataset.Rows = make(Rows, 0)
122 | 		dataset.Columns.Reset()
123 | 	}
124 | }
125 | 
126 | //
127 | // GetNColumn return the number of column in dataset.
128 | //
129 | func (dataset *Dataset) GetNColumn() (ncol int) {
130 | 	ncol = len(dataset.Columns)
131 | 
132 | 	if ncol > 0 {
133 | 		return
134 | 	}
135 | 
136 | 	switch dataset.Mode {
137 | 	case DatasetModeRows:
138 | 		if len(dataset.Rows) <= 0 {
139 | 			return 0
140 | 		}
141 | 		return dataset.Rows[0].Len()
142 | 	}
143 | 
144 | 	return
145 | }
146 | 
147 | //
148 | // GetNRow return number of rows in dataset.
149 | //
150 | func (dataset *Dataset) GetNRow() (nrow int) {
151 | 	switch dataset.Mode {
152 | 	case DatasetModeRows:
153 | 		nrow = len(dataset.Rows)
154 | 	case DatasetModeColumns:
155 | 		if len(dataset.Columns) <= 0 {
156 | 			nrow = 0
157 | 		} else {
158 | 			// get length of record in the first column
159 | 			nrow = dataset.Columns[0].Len()
160 | 		}
161 | 	case DatasetModeMatrix, DatasetNoMode:
162 | 		// matrix mode could have empty either in rows or column.
163 | 		nrow = len(dataset.Rows)
164 | 	}
165 | 	return
166 | }
167 | 
168 | //
169 | // Len return number of row in dataset.
170 | //
171 | func (dataset *Dataset) Len() int {
172 | 	return dataset.GetNRow()
173 | }
174 | 
175 | //
176 | // GetColumnsType return the type of all columns.
177 | //
178 | func (dataset *Dataset) GetColumnsType() (types []int) {
179 | 	for x := range dataset.Columns {
180 | 		types = append(types, dataset.Columns[x].Type)
181 | 	}
182 | 
183 | 	return
184 | }
185 | 
186 | //
187 | // SetColumnsType of data in all columns.
188 | //
189 | func (dataset *Dataset) SetColumnsType(types []int) {
190 | 	dataset.Columns = make(Columns, len(types))
191 | 	dataset.Columns.SetTypes(types)
192 | }
193 | 
194 | //
195 | // GetColumnTypeAt return type of column in index `colidx` in dataset.
196 | //
197 | func (dataset *Dataset) GetColumnTypeAt(idx int) (int, error) {
198 | 	if idx >= dataset.GetNColumn() {
199 | 		return TUndefined, ErrColIdxOutOfRange
200 | 	}
201 | 
202 | 	return dataset.Columns[idx].Type, nil
203 | }
204 | 
205 | //
206 | // SetColumnTypeAt will set column type at index `colidx` to `tipe`.
207 | //
208 | func (dataset *Dataset) SetColumnTypeAt(idx, tipe int) error {
209 | 	if idx >= dataset.GetNColumn() {
210 | 		return ErrColIdxOutOfRange
211 | 	}
212 | 
213 | 	dataset.Columns[idx].Type = tipe
214 | 	return nil
215 | }
216 | 
217 | //
218 | // GetColumnsName return name of all columns.
219 | //
220 | func (dataset *Dataset) GetColumnsName() (names []string) {
221 | 	for x := range dataset.Columns {
222 | 		names = append(names, dataset.Columns[x].Name)
223 | 	}
224 | 
225 | 	return
226 | }
227 | 
228 | //
229 | // SetColumnsName set column name.
230 | //
231 | func (dataset *Dataset) SetColumnsName(names []string) {
232 | 	nameslen := len(names)
233 | 
234 | 	if nameslen <= 0 {
235 | 		// empty names, return immediately.
236 | 		return
237 | 	}
238 | 
239 | 	collen := dataset.GetNColumn()
240 | 
241 | 	if collen <= 0 {
242 | 		dataset.Columns = make(Columns, nameslen)
243 | 		collen = nameslen
244 | 	}
245 | 
246 | 	// find minimum length
247 | 	minlen := collen
248 | 	if nameslen < collen {
249 | 		minlen = nameslen
250 | 	}
251 | 
252 | 	for x := 0; x < minlen; x++ {
253 | 		dataset.Columns[x].Name = names[x]
254 | 	}
255 | }
256 | 
257 | //
258 | // AddColumn will create and add new empty column with specific type and name
259 | // into dataset.
260 | //
261 | func (dataset *Dataset) AddColumn(tipe int, name string, vs []string) {
262 | 	col := Column{
263 | 		Type:       tipe,
264 | 		Name:       name,
265 | 		ValueSpace: vs,
266 | 	}
267 | 	dataset.PushColumn(col)
268 | }
269 | 
270 | //
271 | // GetColumn return pointer to column object at index `idx`.  If `idx` is out of
272 | // range return nil.
273 | //
274 | func (dataset *Dataset) GetColumn(idx int) (col *Column) {
275 | 	if idx > dataset.GetNColumn() {
276 | 		return
277 | 	}
278 | 
279 | 	switch dataset.Mode {
280 | 	case DatasetModeRows:
281 | 		dataset.TransposeToColumns()
282 | 	case DatasetModeColumns:
283 | 		// do nothing
284 | 	case DatasetModeMatrix:
285 | 		// do nothing
286 | 	}
287 | 
288 | 	return &dataset.Columns[idx]
289 | }
290 | 
291 | //
292 | // GetColumnByName return column based on their `name`.
293 | //
294 | func (dataset *Dataset) GetColumnByName(name string) (col *Column) {
295 | 	switch dataset.Mode {
296 | 	case DatasetModeRows:
297 | 		dataset.TransposeToColumns()
298 | 	}
299 | 
300 | 	for x, col := range dataset.Columns {
301 | 		if col.Name == name {
302 | 			return &dataset.Columns[x]
303 | 		}
304 | 	}
305 | 	return
306 | }
307 | 
308 | //
309 | // GetColumns return columns in dataset, without transposing.
310 | //
311 | func (dataset *Dataset) GetColumns() *Columns {
312 | 	return &dataset.Columns
313 | }
314 | 
315 | //
316 | // SetColumns will replace current columns with new one from parameter.
317 | //
318 | func (dataset *Dataset) SetColumns(cols *Columns) {
319 | 	dataset.Columns = *cols
320 | }
321 | 
322 | //
323 | // GetRow return pointer to row at index `idx` or nil if index is out of range.
324 | //
325 | func (dataset *Dataset) GetRow(idx int) *Row {
326 | 	if idx < 0 {
327 | 		return nil
328 | 	}
329 | 	if idx >= dataset.Rows.Len() {
330 | 		return nil
331 | 	}
332 | 	return dataset.Rows[idx]
333 | }
334 | 
335 | //
336 | // GetRows return rows in dataset, without transposing.
337 | //
338 | func (dataset *Dataset) GetRows() *Rows {
339 | 	return &dataset.Rows
340 | }
341 | 
342 | //
343 | // SetRows will replace current rows with new one from parameter.
344 | //
345 | func (dataset *Dataset) SetRows(rows *Rows) {
346 | 	dataset.Rows = *rows
347 | }
348 | 
349 | //
350 | // GetData return the data, based on mode (rows, columns, or matrix).
351 | //
352 | func (dataset *Dataset) GetData() interface{} {
353 | 	switch dataset.Mode {
354 | 	case DatasetModeRows:
355 | 		return &dataset.Rows
356 | 	case DatasetModeColumns:
357 | 		return &dataset.Columns
358 | 	case DatasetModeMatrix, DatasetNoMode:
359 | 		return &Matrix{
360 | 			Columns: &dataset.Columns,
361 | 			Rows:    &dataset.Rows,
362 | 		}
363 | 	}
364 | 
365 | 	return nil
366 | }
367 | 
368 | //
369 | // GetDataAsRows return data in rows mode.
370 | //
371 | func (dataset *Dataset) GetDataAsRows() *Rows {
372 | 	if dataset.Mode == DatasetModeColumns {
373 | 		dataset.TransposeToRows()
374 | 	}
375 | 	return &dataset.Rows
376 | }
377 | 
378 | //
379 | // GetDataAsColumns return data in columns mode.
380 | //
381 | func (dataset *Dataset) GetDataAsColumns() (columns *Columns) {
382 | 	if dataset.Mode == DatasetModeRows {
383 | 		dataset.TransposeToColumns()
384 | 	}
385 | 	return &dataset.Columns
386 | }
387 | 
//
// TransposeToColumns move all data from rows (horizontal) to columns
// (vertical) mode.
//
// Nothing is done if dataset has no row, or, when the mode is not rows, if
// the first column already contain records.
//
// If dataset is in rows mode (or has no mode) the mode is changed to columns.
// The rows are released only if the original mode is rows, so matrix mode
// keep both rows and columns.
//
func (dataset *Dataset) TransposeToColumns() {
	if dataset.GetNRow() <= 0 {
		// nothing to transpose
		return
	}

	ncol := dataset.GetNColumn()
	if ncol <= 0 {
		// if no columns defined, initialize it using record type
		// in the first row.
		types := dataset.GetRow(0).Types()
		dataset.SetColumnsType(types)
		ncol = len(types)
	}

	orgmode := dataset.GetMode()

	switch orgmode {
	case DatasetModeRows:
		// do nothing.
	case DatasetModeColumns, DatasetModeMatrix, DatasetNoMode:
		// check if column records contain data.
		// NOTE(review): this index the first column without checking
		// that one exist; confirm that callers cannot reach here with
		// rows that have zero records.
		nrow := dataset.Columns[0].Len()
		if nrow > 0 {
			// return if column record is not empty, its already
			// transposed
			return
		}
	}

	// use the least length between the first row and number of columns,
	// so records without a matching column are not transposed.
	minlen := len(*dataset.GetRow(0))

	if minlen > ncol {
		minlen = ncol
	}

	// Switching to columns mode also reset the columns (see SetMode)
	// before the records are pushed below.
	switch orgmode {
	case DatasetModeRows, DatasetNoMode:
		dataset.SetMode(DatasetModeColumns)
	}

	// NOTE(review): every row is indexed up to minlen, which is computed
	// from the first row only; presumably all rows have the same length.
	for _, row := range dataset.Rows {
		for y := 0; y < minlen; y++ {
			dataset.Columns[y].PushBack((*row)[y])
		}
	}

	// reset the rows data only if original mode is rows
	// this to prevent empty data when mode is matrix.
	switch orgmode {
	case DatasetModeRows:
		dataset.Rows = nil
	}
}
447 | 
448 | //
449 | // TransposeToRows will move all data from columns (vertical) to rows
450 | // (horizontal) mode.
451 | //
452 | func (dataset *Dataset) TransposeToRows() {
453 | 	orgmode := dataset.GetMode()
454 | 
455 | 	if orgmode == DatasetModeRows {
456 | 		// already transposed
457 | 		return
458 | 	}
459 | 
460 | 	if orgmode == DatasetModeColumns {
461 | 		// only set mode if transposing from columns to rows
462 | 		dataset.SetMode(DatasetModeRows)
463 | 	}
464 | 
465 | 	// Get the max length of columns.
466 | 	rowlen := math.MinInt32
467 | 	flen := len(dataset.Columns)
468 | 
469 | 	for f := 0; f < flen; f++ {
470 | 		l := dataset.Columns[f].Len()
471 | 
472 | 		if l > rowlen {
473 | 			rowlen = l
474 | 		}
475 | 	}
476 | 
477 | 	dataset.Rows = make(Rows, 0)
478 | 
479 | 	// Transpose record from column to row.
480 | 	for r := 0; r < rowlen; r++ {
481 | 		row := make(Row, flen)
482 | 
483 | 		for f := 0; f < flen; f++ {
484 | 			if dataset.Columns[f].Len() > r {
485 | 				row[f] = dataset.Columns[f].Records[r]
486 | 			} else {
487 | 				row[f] = NewRecord()
488 | 			}
489 | 		}
490 | 
491 | 		dataset.Rows = append(dataset.Rows, &row)
492 | 	}
493 | 
494 | 	// Only reset the columns if original dataset mode is "columns".
495 | 	// This to prevent empty data when mode is matrix.
496 | 	if orgmode == DatasetModeColumns {
497 | 		dataset.Columns.Reset()
498 | 	}
499 | }
500 | 
501 | //
502 | // PushRow save the data, which is already in row object, to Rows.
503 | //
504 | func (dataset *Dataset) PushRow(row *Row) {
505 | 	switch dataset.GetMode() {
506 | 	case DatasetModeRows:
507 | 		dataset.Rows = append(dataset.Rows, row)
508 | 	case DatasetModeColumns:
509 | 		dataset.PushRowToColumns(row)
510 | 	case DatasetModeMatrix, DatasetNoMode:
511 | 		dataset.Rows = append(dataset.Rows, row)
512 | 		dataset.PushRowToColumns(row)
513 | 	}
514 | }
515 | 
516 | //
517 | // PushRowToColumns push each data in Row to Columns.
518 | //
519 | func (dataset *Dataset) PushRowToColumns(row *Row) {
520 | 	rowlen := row.Len()
521 | 	if rowlen <= 0 {
522 | 		// return immediately if no data in row.
523 | 		return
524 | 	}
525 | 
526 | 	// check if columns is initialize.
527 | 	collen := len(dataset.Columns)
528 | 	if collen <= 0 {
529 | 		dataset.Columns = make(Columns, rowlen)
530 | 		collen = rowlen
531 | 	}
532 | 
533 | 	// pick the minimum length.
534 | 	min := rowlen
535 | 	if collen < rowlen {
536 | 		min = collen
537 | 	}
538 | 
539 | 	for x := 0; x < min; x++ {
540 | 		dataset.Columns[x].PushBack((*row)[x])
541 | 	}
542 | }
543 | 
544 | //
545 | // FillRowsWithColumn given a column, fill the dataset with row where the record
546 | // only set at index `colIdx`.
547 | //
548 | // Example, content of dataset was,
549 | //
550 | // index:	0 1 2
551 | // 	A B C
552 | // 	X     (step 1) nrow = 2
553 | //
554 | // If we filled column at index 2 with [Y Z], the dataset will become:
555 | //
556 | // index:	0 1 2
557 | // 	A B C
558 | // 	X   Y (step 2) fill the empty row
559 | // 	    Z (step 3) create dummy row which contain the rest of column data.
560 | //
561 | func (dataset *Dataset) FillRowsWithColumn(colIdx int, col Column) {
562 | 	if dataset.GetMode() != DatasetModeRows {
563 | 		// Only work if dataset mode is ROWS
564 | 		return
565 | 	}
566 | 
567 | 	nrow := dataset.GetNRow()
568 | 	emptyAt := nrow
569 | 
570 | 	// (step 1) Find the row with empty records
571 | 	for x, row := range dataset.Rows {
572 | 		if row.IsNilAt(colIdx) {
573 | 			emptyAt = x
574 | 			break
575 | 		}
576 | 	}
577 | 
578 | 	// (step 2) Fill the empty rows using column records.
579 | 	y := 0
580 | 	for x := emptyAt; x < nrow; x++ {
581 | 		dataset.Rows[x].SetValueAt(colIdx, col.Records[y])
582 | 		y++
583 | 	}
584 | 
585 | 	// (step 3) Continue filling the column but using dummy row which
586 | 	// contain only record at index `colIdx`.
587 | 	ncol := dataset.GetNColumn()
588 | 	nrow = col.Len()
589 | 	for ; y < nrow; y++ {
590 | 		row := make(Row, ncol)
591 | 
592 | 		for z := 0; z < ncol; z++ {
593 | 			if z == colIdx {
594 | 				row[colIdx] = col.Records[y]
595 | 			} else {
596 | 				row[z] = NewRecord()
597 | 			}
598 | 		}
599 | 
600 | 		dataset.PushRow(&row)
601 | 	}
602 | }
603 | 
604 | //
605 | // PushColumn will append new column to the end of slice if no existing column
606 | // with the same name. If it exist, the records will be merged.
607 | //
608 | func (dataset *Dataset) PushColumn(col Column) {
609 | 	exist := false
610 | 	colIdx := 0
611 | 	for x, c := range dataset.Columns {
612 | 		if c.Name == col.Name {
613 | 			exist = true
614 | 			colIdx = x
615 | 			break
616 | 		}
617 | 	}
618 | 
619 | 	switch dataset.GetMode() {
620 | 	case DatasetModeRows:
621 | 		if exist {
622 | 			dataset.FillRowsWithColumn(colIdx, col)
623 | 		} else {
624 | 			// append new column
625 | 			dataset.Columns = append(dataset.Columns, col)
626 | 			dataset.PushColumnToRows(col)
627 | 			// Remove records in column
628 | 			dataset.Columns[dataset.GetNColumn()-1].Reset()
629 | 		}
630 | 	case DatasetModeColumns:
631 | 		if exist {
632 | 			dataset.Columns[colIdx].PushRecords(col.Records)
633 | 		} else {
634 | 			dataset.Columns = append(dataset.Columns, col)
635 | 		}
636 | 	case DatasetModeMatrix, DatasetNoMode:
637 | 		if exist {
638 | 			dataset.Columns[colIdx].PushRecords(col.Records)
639 | 		} else {
640 | 			dataset.Columns = append(dataset.Columns, col)
641 | 			dataset.PushColumnToRows(col)
642 | 		}
643 | 	}
644 | }
645 | 
646 | //
647 | // PushColumnToRows add each record in column to each rows, from top to bottom.
648 | //
649 | func (dataset *Dataset) PushColumnToRows(col Column) {
650 | 	colsize := col.Len()
651 | 	if colsize <= 0 {
652 | 		// Do nothing if column is empty.
653 | 		return
654 | 	}
655 | 
656 | 	nrow := dataset.GetNRow()
657 | 	if nrow <= 0 {
658 | 		// If no existing rows in dataset, initialize the rows slice.
659 | 		dataset.Rows = make(Rows, colsize)
660 | 
661 | 		for nrow = 0; nrow < colsize; nrow++ {
662 | 			row := make(Row, 0)
663 | 			dataset.Rows[nrow] = &row
664 | 		}
665 | 	}
666 | 
667 | 	// Pick the minimum length between column or current row length.
668 | 	minrow := nrow
669 | 
670 | 	if colsize < nrow {
671 | 		minrow = colsize
672 | 	}
673 | 
674 | 	// Push each record in column to each rows
675 | 	var row *Row
676 | 	var rec *Record
677 | 
678 | 	for x := 0; x < minrow; x++ {
679 | 		row = dataset.Rows[x]
680 | 		rec = col.Records[x]
681 | 
682 | 		row.PushBack(rec)
683 | 	}
684 | }
685 | 
686 | //
687 | // MergeColumns append columns from other dataset into current dataset.
688 | //
689 | func (dataset *Dataset) MergeColumns(other DatasetInterface) {
690 | 	othermode := other.GetMode()
691 | 	if othermode == DatasetModeRows {
692 | 		other.TransposeToColumns()
693 | 	}
694 | 
695 | 	cols := other.GetDataAsColumns()
696 | 	for _, col := range *cols {
697 | 		dataset.PushColumn(col)
698 | 	}
699 | 
700 | 	switch othermode {
701 | 	case DatasetModeRows:
702 | 		other.TransposeToRows()
703 | 	}
704 | }
705 | 
706 | //
707 | // MergeRows append rows from other dataset into current dataset.
708 | //
709 | func (dataset *Dataset) MergeRows(other DatasetInterface) {
710 | 	rows := other.GetDataAsRows()
711 | 	for _, row := range *rows {
712 | 		dataset.PushRow(row)
713 | 	}
714 | }
715 | 
716 | //
717 | // DeleteRow will detach row at index `i` from dataset and return it.
718 | //
719 | func (dataset *Dataset) DeleteRow(i int) (row *Row) {
720 | 	if i < 0 {
721 | 		return
722 | 	}
723 | 	if i >= dataset.Rows.Len() {
724 | 		return
725 | 	}
726 | 
727 | 	orgmode := dataset.GetMode()
728 | 	if orgmode == DatasetModeColumns {
729 | 		dataset.TransposeToRows()
730 | 	}
731 | 
732 | 	row = dataset.Rows.Del(i)
733 | 
734 | 	if orgmode == DatasetModeColumns {
735 | 		dataset.TransposeToColumns()
736 | 	}
737 | 
738 | 	if orgmode != DatasetModeRows {
739 | 		// Delete record in each columns as the same index as deleted
740 | 		// row.
741 | 		for x := range dataset.Columns {
742 | 			dataset.Columns[x].DeleteRecordAt(i)
743 | 		}
744 | 	}
745 | 
746 | 	return row
747 | }
748 | 


--------------------------------------------------------------------------------