├── series ├── type-string.go ├── type-int.go ├── type-float.go ├── type-bool.go ├── benchmarks_test.go ├── series.go └── series_test.go ├── dataframe ├── benchmark_test.go ├── examples_test.go └── dataframe.go ├── CHANGELOG.md ├── LICENSE.md └── README.md /series/type-string.go: -------------------------------------------------------------------------------- 1 | package series 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "strconv" 7 | "strings" 8 | ) 9 | 10 | type stringElement struct { 11 | e string 12 | nan bool 13 | } 14 | 15 | func (e *stringElement) Set(value interface{}) { 16 | e.nan = false 17 | switch value.(type) { 18 | case string: 19 | e.e = string(value.(string)) 20 | if e.e == "NaN" { 21 | e.nan = true 22 | return 23 | } 24 | case int: 25 | e.e = strconv.Itoa(value.(int)) 26 | case float64: 27 | e.e = strconv.FormatFloat(value.(float64), 'f', 6, 64) 28 | case bool: 29 | b := value.(bool) 30 | if b { 31 | e.e = "true" 32 | } else { 33 | e.e = "false" 34 | } 35 | case Element: 36 | e.e = value.(Element).String() 37 | default: 38 | e.nan = true 39 | return 40 | } 41 | return 42 | } 43 | 44 | func (e stringElement) Copy() Element { 45 | if e.IsNA() { 46 | return &stringElement{"", true} 47 | } 48 | return &stringElement{e.e, false} 49 | } 50 | 51 | func (e stringElement) IsNA() bool { 52 | if e.nan { 53 | return true 54 | } 55 | return false 56 | } 57 | 58 | func (e stringElement) Type() Type { 59 | return String 60 | } 61 | 62 | func (e stringElement) Val() ElementValue { 63 | if e.IsNA() { 64 | return nil 65 | } 66 | return string(e.e) 67 | } 68 | 69 | func (e stringElement) String() string { 70 | if e.IsNA() { 71 | return "NaN" 72 | } 73 | return string(e.e) 74 | } 75 | 76 | func (e stringElement) Int() (int, error) { 77 | if e.IsNA() { 78 | return 0, fmt.Errorf("can't convert NaN to int") 79 | } 80 | return strconv.Atoi(e.e) 81 | } 82 | 83 | func (e stringElement) Float() float64 { 84 | if e.IsNA() { 85 | return math.NaN() 86 | } 87 | f, err := 
strconv.ParseFloat(e.e, 64) 88 | if err != nil { 89 | return math.NaN() 90 | } 91 | return f 92 | } 93 | 94 | func (e stringElement) Bool() (bool, error) { 95 | if e.IsNA() { 96 | return false, fmt.Errorf("can't convert NaN to bool") 97 | } 98 | switch strings.ToLower(e.e) { 99 | case "true", "t", "1": 100 | return true, nil 101 | case "false", "f", "0": 102 | return false, nil 103 | } 104 | return false, fmt.Errorf("can't convert String \"%v\" to bool", e.e) 105 | } 106 | 107 | func (e stringElement) Eq(elem Element) bool { 108 | if e.IsNA() || elem.IsNA() { 109 | return false 110 | } 111 | return e.e == elem.String() 112 | } 113 | 114 | func (e stringElement) Neq(elem Element) bool { 115 | if e.IsNA() || elem.IsNA() { 116 | return false 117 | } 118 | return e.e != elem.String() 119 | } 120 | 121 | func (e stringElement) Less(elem Element) bool { 122 | if e.IsNA() || elem.IsNA() { 123 | return false 124 | } 125 | return e.e < elem.String() 126 | } 127 | 128 | func (e stringElement) LessEq(elem Element) bool { 129 | if e.IsNA() || elem.IsNA() { 130 | return false 131 | } 132 | return e.e <= elem.String() 133 | } 134 | 135 | func (e stringElement) Greater(elem Element) bool { 136 | if e.IsNA() || elem.IsNA() { 137 | return false 138 | } 139 | return e.e > elem.String() 140 | } 141 | 142 | func (e stringElement) GreaterEq(elem Element) bool { 143 | if e.IsNA() || elem.IsNA() { 144 | return false 145 | } 146 | return e.e >= elem.String() 147 | } 148 | -------------------------------------------------------------------------------- /series/type-int.go: -------------------------------------------------------------------------------- 1 | package series 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "strconv" 7 | ) 8 | 9 | type intElement struct { 10 | e int 11 | nan bool 12 | } 13 | 14 | func (e *intElement) Set(value interface{}) { 15 | e.nan = false 16 | switch value.(type) { 17 | case string: 18 | if value.(string) == "NaN" { 19 | e.nan = true 20 | return 21 | } 22 | i, 
err := strconv.Atoi(value.(string)) 23 | if err != nil { 24 | e.nan = true 25 | return 26 | } 27 | e.e = i 28 | case int: 29 | e.e = int(value.(int)) 30 | case float64: 31 | f := value.(float64) 32 | if math.IsNaN(f) || 33 | math.IsInf(f, 0) || 34 | math.IsInf(f, 1) { 35 | e.nan = true 36 | return 37 | } 38 | e.e = int(f) 39 | case bool: 40 | b := value.(bool) 41 | if b { 42 | e.e = 1 43 | } else { 44 | e.e = 0 45 | } 46 | case Element: 47 | v, err := value.(Element).Int() 48 | if err != nil { 49 | e.nan = true 50 | return 51 | } 52 | e.e = v 53 | default: 54 | e.nan = true 55 | return 56 | } 57 | return 58 | } 59 | 60 | func (e intElement) Copy() Element { 61 | if e.IsNA() { 62 | return &intElement{0, true} 63 | } 64 | return &intElement{e.e, false} 65 | } 66 | 67 | func (e intElement) IsNA() bool { 68 | if e.nan { 69 | return true 70 | } 71 | return false 72 | } 73 | 74 | func (e intElement) Type() Type { 75 | return Int 76 | } 77 | 78 | func (e intElement) Val() ElementValue { 79 | if e.IsNA() { 80 | return nil 81 | } 82 | return int(e.e) 83 | } 84 | 85 | func (e intElement) String() string { 86 | if e.IsNA() { 87 | return "NaN" 88 | } 89 | return fmt.Sprint(e.e) 90 | } 91 | 92 | func (e intElement) Int() (int, error) { 93 | if e.IsNA() { 94 | return 0, fmt.Errorf("can't convert NaN to int") 95 | } 96 | return int(e.e), nil 97 | } 98 | 99 | func (e intElement) Float() float64 { 100 | if e.IsNA() { 101 | return math.NaN() 102 | } 103 | return float64(e.e) 104 | } 105 | 106 | func (e intElement) Bool() (bool, error) { 107 | if e.IsNA() { 108 | return false, fmt.Errorf("can't convert NaN to bool") 109 | } 110 | switch e.e { 111 | case 1: 112 | return true, nil 113 | case 0: 114 | return false, nil 115 | } 116 | return false, fmt.Errorf("can't convert Int \"%v\" to bool", e.e) 117 | } 118 | 119 | func (e intElement) Eq(elem Element) bool { 120 | i, err := elem.Int() 121 | if err != nil || e.IsNA() { 122 | return false 123 | } 124 | return e.e == i 125 | } 126 | 127 
| func (e intElement) Neq(elem Element) bool { 128 | i, err := elem.Int() 129 | if err != nil || e.IsNA() { 130 | return false 131 | } 132 | return e.e != i 133 | } 134 | 135 | func (e intElement) Less(elem Element) bool { 136 | i, err := elem.Int() 137 | if err != nil || e.IsNA() { 138 | return false 139 | } 140 | return e.e < i 141 | } 142 | 143 | func (e intElement) LessEq(elem Element) bool { 144 | i, err := elem.Int() 145 | if err != nil || e.IsNA() { 146 | return false 147 | } 148 | return e.e <= i 149 | } 150 | 151 | func (e intElement) Greater(elem Element) bool { 152 | i, err := elem.Int() 153 | if err != nil || e.IsNA() { 154 | return false 155 | } 156 | return e.e > i 157 | } 158 | 159 | func (e intElement) GreaterEq(elem Element) bool { 160 | i, err := elem.Int() 161 | if err != nil || e.IsNA() { 162 | return false 163 | } 164 | return e.e >= i 165 | } 166 | -------------------------------------------------------------------------------- /series/type-float.go: -------------------------------------------------------------------------------- 1 | package series 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "strconv" 7 | ) 8 | 9 | type floatElement struct { 10 | e float64 11 | nan bool 12 | } 13 | 14 | func (e *floatElement) Set(value interface{}) { 15 | e.nan = false 16 | switch value.(type) { 17 | case string: 18 | if value.(string) == "NaN" { 19 | e.nan = true 20 | return 21 | } 22 | f, err := strconv.ParseFloat(value.(string), 64) 23 | if err != nil { 24 | e.nan = true 25 | return 26 | } 27 | e.e = f 28 | case int: 29 | e.e = float64(value.(int)) 30 | case float64: 31 | e.e = float64(value.(float64)) 32 | case bool: 33 | b := value.(bool) 34 | if b { 35 | e.e = 1 36 | } else { 37 | e.e = 0 38 | } 39 | case Element: 40 | e.e = value.(Element).Float() 41 | default: 42 | e.nan = true 43 | return 44 | } 45 | return 46 | } 47 | 48 | func (e floatElement) Copy() Element { 49 | if e.IsNA() { 50 | return &floatElement{0.0, true} 51 | } 52 | return 
&floatElement{e.e, false} 53 | } 54 | 55 | func (e floatElement) IsNA() bool { 56 | if e.nan || math.IsNaN(e.e) { 57 | return true 58 | } 59 | return false 60 | } 61 | 62 | func (e floatElement) Type() Type { 63 | return Float 64 | } 65 | 66 | func (e floatElement) Val() ElementValue { 67 | if e.IsNA() { 68 | return nil 69 | } 70 | return float64(e.e) 71 | } 72 | 73 | func (e floatElement) String() string { 74 | if e.IsNA() { 75 | return "NaN" 76 | } 77 | return fmt.Sprintf("%f", e.e) 78 | } 79 | 80 | func (e floatElement) Int() (int, error) { 81 | if e.IsNA() { 82 | return 0, fmt.Errorf("can't convert NaN to int") 83 | } 84 | f := e.e 85 | if math.IsInf(f, 1) || math.IsInf(f, -1) { 86 | return 0, fmt.Errorf("can't convert Inf to int") 87 | } 88 | if math.IsNaN(f) { 89 | return 0, fmt.Errorf("can't convert NaN to int") 90 | } 91 | return int(f), nil 92 | } 93 | 94 | func (e floatElement) Float() float64 { 95 | if e.IsNA() { 96 | return math.NaN() 97 | } 98 | return float64(e.e) 99 | } 100 | 101 | func (e floatElement) Bool() (bool, error) { 102 | if e.IsNA() { 103 | return false, fmt.Errorf("can't convert NaN to bool") 104 | } 105 | switch e.e { 106 | case 1: 107 | return true, nil 108 | case 0: 109 | return false, nil 110 | } 111 | return false, fmt.Errorf("can't convert Float \"%v\" to bool", e.e) 112 | } 113 | 114 | func (e floatElement) Eq(elem Element) bool { 115 | f := elem.Float() 116 | if e.IsNA() || math.IsNaN(f) { 117 | return false 118 | } 119 | return e.e == f 120 | } 121 | 122 | func (e floatElement) Neq(elem Element) bool { 123 | f := elem.Float() 124 | if e.IsNA() || math.IsNaN(f) { 125 | return false 126 | } 127 | return e.e != f 128 | } 129 | 130 | func (e floatElement) Less(elem Element) bool { 131 | f := elem.Float() 132 | if e.IsNA() || math.IsNaN(f) { 133 | return false 134 | } 135 | return e.e < f 136 | } 137 | 138 | func (e floatElement) LessEq(elem Element) bool { 139 | f := elem.Float() 140 | if e.IsNA() || math.IsNaN(f) { 141 | return 
false 142 | } 143 | return e.e <= f 144 | } 145 | 146 | func (e floatElement) Greater(elem Element) bool { 147 | f := elem.Float() 148 | if e.IsNA() || math.IsNaN(f) { 149 | return false 150 | } 151 | return e.e > f 152 | } 153 | 154 | func (e floatElement) GreaterEq(elem Element) bool { 155 | f := elem.Float() 156 | if e.IsNA() || math.IsNaN(f) { 157 | return false 158 | } 159 | return e.e >= f 160 | } 161 | -------------------------------------------------------------------------------- /series/type-bool.go: -------------------------------------------------------------------------------- 1 | package series 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "strings" 7 | ) 8 | 9 | type boolElement struct { 10 | e bool 11 | nan bool 12 | } 13 | 14 | func (e *boolElement) Set(value interface{}) { 15 | e.nan = false 16 | switch value.(type) { 17 | case string: 18 | if value.(string) == "NaN" { 19 | e.nan = true 20 | return 21 | } 22 | switch strings.ToLower(value.(string)) { 23 | case "true", "t", "1": 24 | e.e = true 25 | case "false", "f", "0": 26 | e.e = false 27 | default: 28 | e.nan = true 29 | return 30 | } 31 | case int: 32 | switch value.(int) { 33 | case 1: 34 | e.e = true 35 | case 0: 36 | e.e = false 37 | default: 38 | e.nan = true 39 | return 40 | } 41 | case float64: 42 | switch value.(float64) { 43 | case 1: 44 | e.e = true 45 | case 0: 46 | e.e = false 47 | default: 48 | e.nan = true 49 | return 50 | } 51 | case bool: 52 | e.e = value.(bool) 53 | case Element: 54 | b, err := value.(Element).Bool() 55 | if err != nil { 56 | e.nan = true 57 | return 58 | } 59 | e.e = b 60 | default: 61 | e.nan = true 62 | return 63 | } 64 | return 65 | } 66 | 67 | func (e boolElement) Copy() Element { 68 | if e.IsNA() { 69 | return &boolElement{false, true} 70 | } 71 | return &boolElement{e.e, false} 72 | } 73 | 74 | func (e boolElement) IsNA() bool { 75 | if e.nan { 76 | return true 77 | } 78 | return false 79 | } 80 | 81 | func (e boolElement) Type() Type { 82 | return Bool 83 
| } 84 | 85 | func (e boolElement) Val() ElementValue { 86 | if e.IsNA() { 87 | return nil 88 | } 89 | return bool(e.e) 90 | } 91 | 92 | func (e boolElement) String() string { 93 | if e.IsNA() { 94 | return "NaN" 95 | } 96 | if e.e { 97 | return "true" 98 | } 99 | return "false" 100 | } 101 | 102 | func (e boolElement) Int() (int, error) { 103 | if e.IsNA() { 104 | return 0, fmt.Errorf("can't convert NaN to int") 105 | } 106 | if e.e == true { 107 | return 1, nil 108 | } 109 | return 0, nil 110 | } 111 | 112 | func (e boolElement) Float() float64 { 113 | if e.IsNA() { 114 | return math.NaN() 115 | } 116 | if e.e { 117 | return 1.0 118 | } 119 | return 0.0 120 | } 121 | 122 | func (e boolElement) Bool() (bool, error) { 123 | if e.IsNA() { 124 | return false, fmt.Errorf("can't convert NaN to bool") 125 | } 126 | return bool(e.e), nil 127 | } 128 | 129 | func (e boolElement) Eq(elem Element) bool { 130 | b, err := elem.Bool() 131 | if err != nil || e.IsNA() { 132 | return false 133 | } 134 | return e.e == b 135 | } 136 | 137 | func (e boolElement) Neq(elem Element) bool { 138 | b, err := elem.Bool() 139 | if err != nil || e.IsNA() { 140 | return false 141 | } 142 | return e.e != b 143 | } 144 | 145 | func (e boolElement) Less(elem Element) bool { 146 | b, err := elem.Bool() 147 | if err != nil || e.IsNA() { 148 | return false 149 | } 150 | return !e.e && b 151 | } 152 | 153 | func (e boolElement) LessEq(elem Element) bool { 154 | b, err := elem.Bool() 155 | if err != nil || e.IsNA() { 156 | return false 157 | } 158 | return !e.e || b 159 | } 160 | 161 | func (e boolElement) Greater(elem Element) bool { 162 | b, err := elem.Bool() 163 | if err != nil || e.IsNA() { 164 | return false 165 | } 166 | return e.e && !b 167 | } 168 | 169 | func (e boolElement) GreaterEq(elem Element) bool { 170 | b, err := elem.Bool() 171 | if err != nil || e.IsNA() { 172 | return false 173 | } 174 | return e.e || !b 175 | } 176 | 
-------------------------------------------------------------------------------- /series/benchmarks_test.go: -------------------------------------------------------------------------------- 1 | package series_test 2 | 3 | import ( 4 | "math/rand" 5 | "strconv" 6 | "testing" 7 | 8 | "github.com/libonomy/libonomy-gota/series" 9 | ) 10 | 11 | func generateInts(n int) (data []int) { 12 | for i := 0; i < n; i++ { 13 | data = append(data, rand.Int()) 14 | } 15 | return 16 | } 17 | 18 | func generateFloats(n int) (data []float64) { 19 | for i := 0; i < n; i++ { 20 | data = append(data, rand.Float64()) 21 | } 22 | return 23 | } 24 | 25 | func generateStrings(n int) (data []string) { 26 | for i := 0; i < n; i++ { 27 | data = append(data, strconv.Itoa(rand.Int())) 28 | } 29 | return 30 | } 31 | 32 | func generateBools(n int) (data []bool) { 33 | for i := 0; i < n; i++ { 34 | r := rand.Intn(2) 35 | b := false 36 | if r == 1 { 37 | b = true 38 | } 39 | data = append(data, b) 40 | } 41 | return 42 | } 43 | 44 | func generateIntsN(n, k int) (data []int) { 45 | for i := 0; i < n; i++ { 46 | data = append(data, rand.Intn(k)) 47 | } 48 | return 49 | } 50 | 51 | func BenchmarkSeries_New(b *testing.B) { 52 | rand.Seed(100) 53 | table := []struct { 54 | name string 55 | data interface{} 56 | seriesType series.Type 57 | }{ 58 | { 59 | "[]bool(100000)_Int", 60 | generateBools(100000), 61 | series.Int, 62 | }, 63 | { 64 | "[]bool(100000)_String", 65 | generateBools(100000), 66 | series.String, 67 | }, 68 | { 69 | "[]bool(100000)_Bool", 70 | generateBools(100000), 71 | series.Bool, 72 | }, 73 | { 74 | "[]bool(100000)_Float", 75 | generateBools(100000), 76 | series.Float, 77 | }, 78 | { 79 | "[]string(100000)_Int", 80 | generateStrings(100000), 81 | series.Int, 82 | }, 83 | { 84 | "[]string(100000)_String", 85 | generateStrings(100000), 86 | series.String, 87 | }, 88 | { 89 | "[]string(100000)_Bool", 90 | generateStrings(100000), 91 | series.Bool, 92 | }, 93 | { 94 | 
"[]string(100000)_Float", 95 | generateStrings(100000), 96 | series.Float, 97 | }, 98 | { 99 | "[]float64(100000)_Int", 100 | generateFloats(100000), 101 | series.Int, 102 | }, 103 | { 104 | "[]float64(100000)_String", 105 | generateFloats(100000), 106 | series.String, 107 | }, 108 | { 109 | "[]float64(100000)_Bool", 110 | generateFloats(100000), 111 | series.Bool, 112 | }, 113 | { 114 | "[]float64(100000)_Float", 115 | generateFloats(100000), 116 | series.Float, 117 | }, 118 | { 119 | "[]int(100000)_Int", 120 | generateInts(100000), 121 | series.Int, 122 | }, 123 | { 124 | "[]int(100000)_String", 125 | generateInts(100000), 126 | series.String, 127 | }, 128 | { 129 | "[]int(100000)_Bool", 130 | generateInts(100000), 131 | series.Bool, 132 | }, 133 | { 134 | "[]int(100000)_Float", 135 | generateInts(100000), 136 | series.Float, 137 | }, 138 | } 139 | for _, test := range table { 140 | b.Run(test.name, func(b *testing.B) { 141 | for i := 0; i < b.N; i++ { 142 | series.New(test.data, test.seriesType, test.name) 143 | } 144 | }) 145 | } 146 | } 147 | 148 | func BenchmarkSeries_Copy(b *testing.B) { 149 | rand.Seed(100) 150 | table := []struct { 151 | name string 152 | series series.Series 153 | }{ 154 | { 155 | "[]int(100000)_Int", 156 | series.Ints(generateInts(100000)), 157 | }, 158 | { 159 | "[]int(100000)_String", 160 | series.Strings(generateInts(100000)), 161 | }, 162 | { 163 | "[]int(100000)_Bool", 164 | series.Bools(generateInts(100000)), 165 | }, 166 | { 167 | "[]int(100000)_Float", 168 | series.Floats(generateInts(100000)), 169 | }, 170 | } 171 | for _, test := range table { 172 | b.Run(test.name, func(b *testing.B) { 173 | for i := 0; i < b.N; i++ { 174 | test.series.Copy() 175 | } 176 | }) 177 | } 178 | } 179 | 180 | func BenchmarkSeries_Subset(b *testing.B) { 181 | rand.Seed(100) 182 | table := []struct { 183 | name string 184 | indexes interface{} 185 | series series.Series 186 | }{ 187 | { 188 | "[]int(100000)_Int", 189 | generateIntsN(10000, 2), 190 | 
series.Ints(generateInts(100000)), 191 | }, 192 | { 193 | "[]int(100000)_String", 194 | generateIntsN(10000, 2), 195 | series.Strings(generateInts(100000)), 196 | }, 197 | { 198 | "[]int(100000)_Bool", 199 | generateIntsN(10000, 2), 200 | series.Bools(generateInts(100000)), 201 | }, 202 | { 203 | "[]int(100000)_Float", 204 | generateIntsN(10000, 2), 205 | series.Floats(generateInts(100000)), 206 | }, 207 | } 208 | for _, test := range table { 209 | b.Run(test.name, func(b *testing.B) { 210 | for i := 0; i < b.N; i++ { 211 | test.series.Subset(test.indexes) 212 | } 213 | }) 214 | } 215 | } 216 | 217 | func BenchmarkSeries_Set(b *testing.B) { 218 | rand.Seed(100) 219 | table := []struct { 220 | name string 221 | indexes interface{} 222 | newValues series.Series 223 | series series.Series 224 | }{ 225 | { 226 | "[]int(100000)_Int", 227 | generateIntsN(10000, 2), 228 | series.Ints(generateIntsN(10000, 2)), 229 | series.Ints(generateInts(100000)), 230 | }, 231 | { 232 | "[]int(100000)_String", 233 | generateIntsN(10000, 2), 234 | series.Strings(generateIntsN(10000, 2)), 235 | series.Strings(generateInts(100000)), 236 | }, 237 | { 238 | "[]int(100000)_Bool", 239 | generateIntsN(10000, 2), 240 | series.Bools(generateIntsN(10000, 2)), 241 | series.Bools(generateInts(100000)), 242 | }, 243 | { 244 | "[]int(100000)_Float", 245 | generateIntsN(10000, 2), 246 | series.Floats(generateIntsN(10000, 2)), 247 | series.Floats(generateInts(100000)), 248 | }, 249 | } 250 | for _, test := range table { 251 | s := test.series.Copy() 252 | b.Run(test.name, func(b *testing.B) { 253 | for i := 0; i < b.N; i++ { 254 | s.Set(test.indexes, test.newValues) 255 | } 256 | }) 257 | } 258 | } 259 | -------------------------------------------------------------------------------- /dataframe/benchmark_test.go: -------------------------------------------------------------------------------- 1 | package dataframe_test 2 | 3 | import ( 4 | "math/rand" 5 | "strconv" 6 | "testing" 7 | 8 | 
"github.com/libonomy/libonomy-gota/dataframe" 9 | "github.com/libonomy/libonomy-gota/series" 10 | ) 11 | 12 | func generateSeries(n, rep int) (data []series.Series) { 13 | rand.Seed(100) 14 | for j := 0; j < rep; j++ { 15 | var is []int 16 | var bs []bool 17 | var fs []float64 18 | var ss []string 19 | for i := 0; i < n; i++ { 20 | is = append(is, rand.Int()) 21 | } 22 | for i := 0; i < n; i++ { 23 | fs = append(fs, rand.Float64()) 24 | } 25 | for i := 0; i < n; i++ { 26 | ss = append(ss, strconv.Itoa(rand.Int())) 27 | } 28 | for i := 0; i < n; i++ { 29 | r := rand.Intn(2) 30 | b := false 31 | if r == 1 { 32 | b = true 33 | } 34 | bs = append(bs, b) 35 | } 36 | data = append(data, series.Ints(is)) 37 | data = append(data, series.Bools(bs)) 38 | data = append(data, series.Floats(fs)) 39 | data = append(data, series.Strings(ss)) 40 | } 41 | return 42 | } 43 | 44 | func generateIntsN(n, k int) (data []int) { 45 | for i := 0; i < n; i++ { 46 | data = append(data, rand.Intn(k)) 47 | } 48 | return 49 | } 50 | 51 | func BenchmarkNew(b *testing.B) { 52 | table := []struct { 53 | name string 54 | data []series.Series 55 | }{ 56 | { 57 | "100000x4", 58 | generateSeries(100000, 1), 59 | }, 60 | { 61 | "100000x40", 62 | generateSeries(100000, 10), 63 | }, 64 | { 65 | "100000x400", 66 | generateSeries(100000, 100), 67 | }, 68 | { 69 | "1000x40", 70 | generateSeries(1000, 10), 71 | }, 72 | { 73 | "1000x4000", 74 | generateSeries(1000, 1000), 75 | }, 76 | { 77 | "1000x40000", 78 | generateSeries(1000, 10000), 79 | }, 80 | } 81 | for _, test := range table { 82 | b.Run(test.name, func(b *testing.B) { 83 | for i := 0; i < b.N; i++ { 84 | dataframe.New(test.data...) 85 | } 86 | }) 87 | } 88 | } 89 | 90 | func BenchmarkDataFrame_Arrange(b *testing.B) { 91 | data := dataframe.New(generateSeries(100000, 5)...) 
92 | table := []struct { 93 | name string 94 | data dataframe.DataFrame 95 | key []dataframe.Order 96 | }{ 97 | { 98 | "100000x20_1", 99 | data, 100 | []dataframe.Order{dataframe.Sort("X0")}, 101 | }, 102 | { 103 | "100000x20_2", 104 | data, 105 | []dataframe.Order{ 106 | dataframe.Sort("X0"), 107 | dataframe.Sort("X1"), 108 | }, 109 | }, 110 | { 111 | "100000x20_3", 112 | data, 113 | []dataframe.Order{ 114 | dataframe.Sort("X0"), 115 | dataframe.Sort("X1"), 116 | dataframe.Sort("X2"), 117 | }, 118 | }, 119 | } 120 | for _, test := range table { 121 | b.Run(test.name, func(b *testing.B) { 122 | for i := 0; i < b.N; i++ { 123 | test.data.Arrange(test.key...) 124 | } 125 | }) 126 | } 127 | } 128 | 129 | func BenchmarkDataFrame_Subset(b *testing.B) { 130 | b.ReportAllocs() 131 | data1000x20 := dataframe.New(generateSeries(1000, 5)...) 132 | data1000x200 := dataframe.New(generateSeries(1000, 50)...) 133 | data1000x2000 := dataframe.New(generateSeries(1000, 500)...) 134 | data100000x20 := dataframe.New(generateSeries(100000, 5)...) 135 | data1000000x20 := dataframe.New(generateSeries(1000000, 5)...) 
136 | idx10 := generateIntsN(10, 10) 137 | idx100 := generateIntsN(100, 100) 138 | idx1000 := generateIntsN(1000, 1000) 139 | idx10000 := generateIntsN(10000, 10000) 140 | idx100000 := generateIntsN(100000, 100000) 141 | idx1000000 := generateIntsN(1000000, 1000000) 142 | table := []struct { 143 | name string 144 | data dataframe.DataFrame 145 | indexes interface{} 146 | }{ 147 | { 148 | "1000000x20_100", 149 | data1000000x20, 150 | idx100, 151 | }, 152 | { 153 | "1000000x20_1000", 154 | data1000000x20, 155 | idx1000, 156 | }, 157 | { 158 | "1000000x20_10000", 159 | data1000000x20, 160 | idx10000, 161 | }, 162 | { 163 | "1000000x20_100000", 164 | data1000000x20, 165 | idx100000, 166 | }, 167 | { 168 | "1000000x20_1000000", 169 | data1000000x20, 170 | idx1000000, 171 | }, 172 | { 173 | "100000x20_100", 174 | data100000x20, 175 | idx100, 176 | }, 177 | { 178 | "100000x20_1000", 179 | data100000x20, 180 | idx1000, 181 | }, 182 | { 183 | "100000x20_10000", 184 | data100000x20, 185 | idx10000, 186 | }, 187 | { 188 | "100000x20_100000", 189 | data100000x20, 190 | idx100000, 191 | }, 192 | { 193 | "1000x20_10", 194 | data1000x20, 195 | idx10, 196 | }, 197 | { 198 | "1000x20_100", 199 | data1000x20, 200 | idx100, 201 | }, 202 | { 203 | "1000x20_1000", 204 | data1000x20, 205 | idx1000, 206 | }, 207 | { 208 | "1000x200_10", 209 | data1000x200, 210 | idx10, 211 | }, 212 | { 213 | "1000x200_100", 214 | data1000x200, 215 | idx100, 216 | }, 217 | { 218 | "1000x200_1000", 219 | data1000x200, 220 | idx1000, 221 | }, 222 | { 223 | "1000x2000_10", 224 | data1000x2000, 225 | idx10, 226 | }, 227 | { 228 | "1000x2000_100", 229 | data1000x2000, 230 | idx100, 231 | }, 232 | { 233 | "1000x2000_1000", 234 | data1000x2000, 235 | idx1000, 236 | }, 237 | } 238 | for _, test := range table { 239 | b.Run(test.name, func(b *testing.B) { 240 | for i := 0; i < b.N; i++ { 241 | test.data.Subset(test.indexes) 242 | } 243 | }) 244 | } 245 | } 246 | 247 | func BenchmarkDataFrame_Elem(b *testing.B) { 
248 | data := dataframe.New(generateSeries(100000, 5)...) 249 | table := []struct { 250 | name string 251 | data dataframe.DataFrame 252 | }{ 253 | { 254 | "100000x20_ALL", 255 | data, 256 | }, 257 | } 258 | for _, test := range table { 259 | b.Run(test.name, func(b *testing.B) { 260 | for i := 0; i < b.N; i++ { 261 | for k := 0; k < 100000; k++ { 262 | test.data.Elem(k, 0) 263 | } 264 | } 265 | }) 266 | } 267 | } 268 | -------------------------------------------------------------------------------- /dataframe/examples_test.go: -------------------------------------------------------------------------------- 1 | package dataframe_test 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | 7 | "github.com/libonomy/libonomy-gota/dataframe" 8 | "github.com/libonomy/libonomy-gota/series" 9 | ) 10 | 11 | func ExampleNew() { 12 | df := dataframe.New( 13 | series.New([]string{"b", "a"}, series.String, "COL.1"), 14 | series.New([]int{1, 2}, series.Int, "COL.2"), 15 | series.New([]float64{3.0, 4.0}, series.Float, "COL.3"), 16 | ) 17 | fmt.Println(df) 18 | } 19 | 20 | func ExampleLoadStructs() { 21 | type User struct { 22 | Name string 23 | Age int 24 | Accuracy float64 25 | } 26 | users := []User{ 27 | User{"Aram", 17, 0.2}, 28 | User{"Juan", 18, 0.8}, 29 | User{"Ana", 22, 0.5}, 30 | } 31 | df := dataframe.LoadStructs(users) 32 | fmt.Println(df) 33 | } 34 | 35 | func ExampleLoadRecords() { 36 | df := dataframe.LoadRecords( 37 | [][]string{ 38 | []string{"A", "B", "C", "D"}, 39 | []string{"a", "4", "5.1", "true"}, 40 | []string{"k", "5", "7.0", "true"}, 41 | []string{"k", "4", "6.0", "true"}, 42 | []string{"a", "2", "7.1", "false"}, 43 | }, 44 | ) 45 | fmt.Println(df) 46 | } 47 | 48 | func ExampleLoadRecords_options() { 49 | df := dataframe.LoadRecords( 50 | [][]string{ 51 | []string{"A", "B", "C", "D"}, 52 | []string{"a", "4", "5.1", "true"}, 53 | []string{"k", "5", "7.0", "true"}, 54 | []string{"k", "4", "6.0", "true"}, 55 | []string{"a", "2", "7.1", "false"}, 56 | }, 57 | 
dataframe.DetectTypes(false), 58 | dataframe.DefaultType(series.Float), 59 | dataframe.WithTypes(map[string]series.Type{ 60 | "A": series.String, 61 | "D": series.Bool, 62 | }), 63 | ) 64 | fmt.Println(df) 65 | } 66 | 67 | func ExampleLoadMaps() { 68 | df := dataframe.LoadMaps( 69 | []map[string]interface{}{ 70 | map[string]interface{}{ 71 | "A": "a", 72 | "B": 1, 73 | "C": true, 74 | "D": 0, 75 | }, 76 | map[string]interface{}{ 77 | "A": "b", 78 | "B": 2, 79 | "C": true, 80 | "D": 0.5, 81 | }, 82 | }, 83 | ) 84 | fmt.Println(df) 85 | } 86 | 87 | func ExampleReadCSV() { 88 | csvStr := ` 89 | Country,Date,Age,Amount,Id 90 | "United States",2012-02-01,50,112.1,01234 91 | "United States",2012-02-01,32,321.31,54320 92 | "United Kingdom",2012-02-01,17,18.2,12345 93 | "United States",2012-02-01,32,321.31,54320 94 | "United Kingdom",2012-02-01,NA,18.2,12345 95 | "United States",2012-02-01,32,321.31,54320 96 | "United States",2012-02-01,32,321.31,54320 97 | Spain,2012-02-01,66,555.42,00241 98 | ` 99 | df := dataframe.ReadCSV(strings.NewReader(csvStr)) 100 | fmt.Println(df) 101 | } 102 | 103 | func ExampleReadJSON() { 104 | jsonStr := `[{"COL.2":1,"COL.3":3},{"COL.1":5,"COL.2":2,"COL.3":2},{"COL.1":6,"COL.2":3,"COL.3":1}]` 105 | df := dataframe.ReadJSON(strings.NewReader(jsonStr)) 106 | fmt.Println(df) 107 | } 108 | 109 | func ExampleDataFrame_Subset() { 110 | df := dataframe.LoadRecords( 111 | [][]string{ 112 | []string{"A", "B", "C", "D"}, 113 | []string{"a", "4", "5.1", "true"}, 114 | []string{"k", "5", "7.0", "true"}, 115 | []string{"k", "4", "6.0", "true"}, 116 | []string{"a", "2", "7.1", "false"}, 117 | }, 118 | ) 119 | sub := df.Subset([]int{0, 2}) 120 | fmt.Println(sub) 121 | } 122 | 123 | func ExampleDataFrame_Select() { 124 | df := dataframe.LoadRecords( 125 | [][]string{ 126 | []string{"A", "B", "C", "D"}, 127 | []string{"a", "4", "5.1", "true"}, 128 | []string{"k", "5", "7.0", "true"}, 129 | []string{"k", "4", "6.0", "true"}, 130 | []string{"a", "2", "7.1", 
"false"}, 131 | }, 132 | ) 133 | sel1 := df.Select([]int{0, 2}) 134 | sel2 := df.Select([]string{"A", "C"}) 135 | fmt.Println(sel1) 136 | fmt.Println(sel2) 137 | } 138 | 139 | func ExampleDataFrame_Filter() { 140 | df := dataframe.LoadRecords( 141 | [][]string{ 142 | []string{"A", "B", "C", "D"}, 143 | []string{"a", "4", "5.1", "true"}, 144 | []string{"k", "5", "7.0", "true"}, 145 | []string{"k", "4", "6.0", "true"}, 146 | []string{"a", "2", "7.1", "false"}, 147 | }, 148 | ) 149 | fil := df.Filter( 150 | dataframe.F{ 151 | Colname: "A", 152 | Comparator: series.Eq, 153 | Comparando: "a", 154 | }, 155 | dataframe.F{ 156 | Colname: "B", 157 | Comparator: series.Greater, 158 | Comparando: 4, 159 | }, 160 | ) 161 | fil2 := fil.Filter( 162 | dataframe.F{ 163 | Colname: "D", 164 | Comparator: series.Eq, 165 | Comparando: true, 166 | }, 167 | ) 168 | fmt.Println(fil) 169 | fmt.Println(fil2) 170 | } 171 | 172 | func ExampleDataFrame_Mutate() { 173 | df := dataframe.LoadRecords( 174 | [][]string{ 175 | []string{"A", "B", "C", "D"}, 176 | []string{"a", "4", "5.1", "true"}, 177 | []string{"k", "5", "7.0", "true"}, 178 | []string{"k", "4", "6.0", "true"}, 179 | []string{"a", "2", "7.1", "false"}, 180 | }, 181 | ) 182 | // Change column C with a new one 183 | mut := df.Mutate( 184 | series.New([]string{"a", "b", "c", "d"}, series.String, "C"), 185 | ) 186 | // Add a new column E 187 | mut2 := df.Mutate( 188 | series.New([]string{"a", "b", "c", "d"}, series.String, "E"), 189 | ) 190 | fmt.Println(mut) 191 | fmt.Println(mut2) 192 | } 193 | 194 | func ExampleDataFrame_InnerJoin() { 195 | df := dataframe.LoadRecords( 196 | [][]string{ 197 | []string{"A", "B", "C", "D"}, 198 | []string{"a", "4", "5.1", "true"}, 199 | []string{"k", "5", "7.0", "true"}, 200 | []string{"k", "4", "6.0", "true"}, 201 | []string{"a", "2", "7.1", "false"}, 202 | }, 203 | ) 204 | df2 := dataframe.LoadRecords( 205 | [][]string{ 206 | []string{"A", "F", "D"}, 207 | []string{"1", "1", "true"}, 208 | 
[]string{"4", "2", "false"}, 209 | []string{"2", "8", "false"}, 210 | []string{"5", "9", "false"}, 211 | }, 212 | ) 213 | join := df.InnerJoin(df2, "D") 214 | fmt.Println(join) 215 | } 216 | 217 | func ExampleDataFrame_Set() { 218 | df := dataframe.LoadRecords( 219 | [][]string{ 220 | []string{"A", "B", "C", "D"}, 221 | []string{"a", "4", "5.1", "true"}, 222 | []string{"k", "5", "7.0", "true"}, 223 | []string{"k", "4", "6.0", "true"}, 224 | []string{"a", "2", "7.1", "false"}, 225 | }, 226 | ) 227 | df2 := df.Set( 228 | series.Ints([]int{0, 2}), 229 | dataframe.LoadRecords( 230 | [][]string{ 231 | []string{"A", "B", "C", "D"}, 232 | []string{"b", "4", "6.0", "true"}, 233 | []string{"c", "3", "6.0", "false"}, 234 | }, 235 | ), 236 | ) 237 | fmt.Println(df2) 238 | } 239 | 240 | func ExampleDataFrame_Arrange() { 241 | df := dataframe.LoadRecords( 242 | [][]string{ 243 | []string{"A", "B", "C", "D"}, 244 | []string{"a", "4", "5.1", "true"}, 245 | []string{"b", "4", "6.0", "true"}, 246 | []string{"c", "3", "6.0", "false"}, 247 | []string{"a", "2", "7.1", "false"}, 248 | }, 249 | ) 250 | sorted := df.Arrange( 251 | dataframe.Sort("A"), 252 | dataframe.RevSort("B"), 253 | ) 254 | fmt.Println(sorted) 255 | } 256 | 257 | func ExampleDataFrame_Describe() { 258 | df := dataframe.LoadRecords( 259 | [][]string{ 260 | []string{"A", "B", "C", "D"}, 261 | []string{"a", "4", "5.1", "true"}, 262 | []string{"b", "4", "6.0", "true"}, 263 | []string{"c", "3", "6.0", "false"}, 264 | []string{"a", "2", "7.1", "false"}, 265 | }, 266 | ) 267 | fmt.Println(df.Describe()) 268 | } 269 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | 3 | All notable changes to this project will be documented in this file. 4 | This project adheres to [Semantic Versioning](http://semver.org/). 
5 | 6 | ## [0.10.1] - 2019-11-08 7 | 8 | ### Fixed 9 | 10 | - LoadRecords printing type debug information 11 | - Missing closing brackets in series.go 12 | - Fix gonum import path in dataframe_test 13 | 14 | ## [0.10.0] - 2019-11-08 15 | 16 | ### Changed 17 | 18 | - Merged dev branch changes from multiple collaborators (Sam Zaydel, Kyle 19 | Ellrott, Daniela Petruzalek, Christoph Laaber). 20 | 21 | ## [0.9.0] - 2016-10-03 22 | 23 | ### Added 24 | 25 | - Additional method to load arbitrary struct slices to DataFrames (Juan Álvarez) 26 | - New LoadOption Names to set initial column names (Sander van Harmelen). 27 | - Parser option for csv delimiter (Kyle Ellrott) 28 | - New Describe method for reporting summary statistics (Daniela Petruzalek) 29 | 30 | ### Changed 31 | 32 | - Improve the performance of multiple operations. 33 | - Code cleanup for better consistency (Sander van Harmelen) 34 | - Renamed 'Deselect' function to 'Drop' (Ben Marshall) 35 | 36 | ## [0.8.0] - 2016-12-12 37 | 38 | ### Added 39 | 40 | - Series.Order method and tests. 41 | - Series.IsNaN method and tests. 42 | - DataFrame.Arrange method and tests. 43 | - DataFrame.Capply method and tests. 44 | - DataFrame.Rapply method and tests. 45 | - Benchmarks for several operations on both the `series` and 46 | `dataframe` packages. 47 | - Many optimizations that increase the performance dramatically. 48 | - New LoadOption where the elements to be parsed as NaN from string 49 | can be selected. 50 | - Gota can now return an implementation of `gonum/mat64.Matrix` 51 | interface via `DataFrame.Matrix()` and load a `mat64.Matrix` via 52 | `dataframe.LoadMatrix()`. 53 | 54 | ### Changed 55 | 56 | - elementInterface is now exported as Element. 57 | - Split element.go into separate files for the implementations of the 58 | Element interface. 59 | - LoadOptions API has been renamed for better documentation via `godoc`. 
60 | - `Series.Set` and `DataFrame.Set` now modify the structure in place 61 | for performance considerations. If one wants to use the old 62 | behaviour, it is suggested to use `DataFrame.Copy().Set(...)` 63 | instead of `DataFrame.Set(...)`. 64 | - `DataFrame.Dim` has been changed to `DataFrame.Dims` for consistency 65 | with the `mat64.Matrix` interface. 66 | - When printing a large `DataFrame` now the behaviour of the stringer 67 | interface is much nicer, showing only the first 10 rows and limiting 68 | the number of characters that can be shown per line. 69 | 70 | ### Removed 71 | 72 | - Some unused functions from the helpers.go file. 73 | 74 | ### Fixed 75 | 76 | - Linter errors. 77 | - stringElement.Float now returns NaN instead of 0 when applicable. 78 | - Autorenaming column names when `hasHeaders == false` now is 79 | consistent with the autorename used with `dataframe.New` 80 | - Bug where duplicated column names were not being assigned consecutive 81 | suffix numbers if the number of duplicates was greater than two. 82 | 83 | ## [0.7.0] - 2016-11-27 84 | 85 | ### Added 86 | 87 | - Many more table tests for both `series` and `dataframe` 88 | - Set method for `Series` and `DataFrame` 89 | - When loading data from CSV, JSON, or Records, different 90 | `LoadOptions` can now be configured. This includes assigning 91 | a default type, manually specifying the column types and others. 92 | - More documentation for previously undocumented functions. 93 | 94 | ### Changed 95 | 96 | - The project has been restructured into separate `dataframe` and 97 | `series` packages. 98 | - Reviewed entire `Series` codebase for better style and 99 | maintainability. 100 | - `DataFrame.Select` now accepts several types of indexes 101 | - Error messages are now more consistent. 102 | - The standard way of checking for errors on both `series` and 103 | `dataframe` is to check the `Err` field on each structure.
104 | - `ReadCSV`/`ReadJSON` and `WriteCSV`/`WriteJSON` now accept 105 | `io.Reader` and `io.Writer` respectively. 106 | - Updated README with the new changes. 107 | 108 | ### Removed 109 | 110 | - Removed unnecessary abstraction layer on `Series.elements` 111 | 112 | ## [0.6.0] - 2016-10-29 113 | 114 | ### Added 115 | 116 | - InnerJoin, CrossJoin, RightJoin, LeftJoin, OuterJoin functions 117 | 118 | ### Changed 119 | 120 | - More code refactoring for easier maintenance and management 121 | - Add more documentation to the exported functions 122 | - Remove unnecessary methods and structures from the exported API 123 | 124 | ### Removed 125 | 126 | - colnames and coltypes from the DataFrame structure 127 | 128 | ## [0.5.0] - 2016-08-09 129 | 130 | ### Added 131 | 132 | - Read and write DataFrames from CSV, JSON, []map[string]interface{}, 133 | [][]string. 134 | - New constructor for DataFrame accepts Series and NamedSeries as 135 | arguments. 136 | - Subset, Select, Rename, Mutate, Filter, RBind and CBind methods 137 | - Much better error handling 138 | 139 | ### Changed 140 | 141 | - Almost complete rewrite of DataFrame code. 142 | - Now using Series as first class citizens and building blocks for 143 | DataFrames. 144 | 145 | ### Removed 146 | 147 | - Merge/Join functions have been temporarily removed to be adapted to 148 | the new architecture. 149 | - Cell interface for allowing custom types into the system. 150 | 151 | ## [0.4.0] - 2016-02-18 152 | 153 | ### Added 154 | 155 | - Getter methods for nrows and ncols. 156 | - An InnerJoin function that performs an Inner Merge/Join of two 157 | DataFrames by the given keys. 158 | - RightJoin and LeftJoin functions that perform outer right/outer 159 | left joins of two DataFrames by the given keys. 160 | - A CrossJoin function that performs a Cross Merge/Join of two 161 | DataFrames. 162 | - Cell interface now has to implement the NA() method that will 163 | return an empty cell for the given type.
164 | - Cell interface now has to implement a Copy method. 165 | 166 | ### Changed 167 | 168 | - The `cell` interface is now exported: `Cell`. 169 | - Cell method NA() is now IsNA(). 170 | - The function parseColumn is now a method. 171 | - A number of fields and methods are now exported. 172 | 173 | ### Fixed 174 | 175 | - Now ensuring that generated subsets are in fact new copies entirely, 176 | not copying pointers to the same memory address. 177 | 178 | ## [0.3.0] - 2016-02-18 179 | 180 | ### Added 181 | 182 | - Getter and setter methods for the column names of a DataFrame 183 | - Bool column type has been made available 184 | - New Bool() interface 185 | - A `column` can now report whether any of its elements is NA, along 186 | with a list of said NA elements ([]bool). 187 | 188 | ### Changed 189 | 190 | - Renamed `cell` interface elements to be more idiomatic: 191 | - ToInteger() is now Int() 192 | - ToFloat() is now Float() 193 | - The `cell` interface has changed. Int() and Float() now 194 | return pointers instead of values to prevent future conflicts when 195 | returning an error. 196 | - The `cell` interface has changed. Checksum() [16]byte added. 197 | - Using cell.Checksum() for identification of unique elements instead 198 | of raw strings. 199 | - The `cell` interface has changed, now also requires ToBool() method. 200 | - String type now does not contain a string, but a pointer to a string. 201 | 202 | ### Fixed 203 | 204 | - Bool type constructor function Bools now parses `bool` and `[]bool` 205 | elements correctly. 206 | - Int type constructor function Ints now parses `bool` and `[]bool` 207 | elements correctly. 208 | - Float type constructor function Floats now parses `bool` and `[]bool` 209 | elements correctly. 210 | - String type constructor function Strings now parses `bool` and `[]bool` 211 | elements correctly.
212 | 213 | ## [0.2.1] - 2016-02-14 214 | 215 | ### Fixed 216 | 217 | - Fixed a bug when the maximum number of characters on a column was 218 | not being updated properly when subsetting. 219 | 220 | ## [0.2.0] - 2016-02-13 221 | 222 | ### Added 223 | 224 | - Added a lot of unit tests 225 | 226 | ### Changed 227 | 228 | - The base types are now `df.String`, `df.Int`, and `df.Float`. 229 | - Restructured the project in different files. 230 | - Refactored the project so that it will allow columns to be of any 231 | type as long as it complies with the necessary interfaces. 232 | 233 | ## [0.1.0] - 2016-02-06 234 | 235 | ### Added 236 | 237 | - Load csv data to DataFrame. 238 | - Parse data to four supported types: `int`, `float64`, `date` 239 | & `string`. 240 | - Row/Column subsetting (Indexing, column names, row numbers, range). 241 | - Unique/Duplicated row subsetting. 242 | - DataFrame combinations by rows and columns (cbind/rbind). 243 | 244 | [0.1.0]: https://github.com/go-gota/gota/compare/v0.1.0...v0.1.0 245 | [0.2.0]: https://github.com/go-gota/gota/compare/v0.1.0...v0.2.0 246 | [0.2.1]: https://github.com/go-gota/gota/compare/v0.2.0...v0.2.1 247 | [0.3.0]: https://github.com/go-gota/gota/compare/v0.2.1...v0.3.0 248 | [0.4.0]: https://github.com/go-gota/gota/compare/v0.3.0...v0.4.0 249 | [0.5.0]: https://github.com/go-gota/gota/compare/v0.4.0...v0.5.0 250 | [0.6.0]: https://github.com/go-gota/gota/compare/v0.5.0...v0.6.0 251 | [0.7.0]: https://github.com/go-gota/gota/compare/v0.6.0...v0.7.0 252 | [0.8.0]: https://github.com/go-gota/gota/compare/v0.7.0...v0.8.0 253 | [0.9.0]: https://github.com/go-gota/gota/compare/v0.8.0...v0.9.0 254 | [0.10.0]: https://github.com/go-gota/gota/compare/v0.9.0...v0.10.0 255 | [0.10.1]: https://github.com/go-gota/gota/compare/v0.10.0...v0.10.1 256 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | 
Apache License 2 | ============== 3 | 4 | _Version 2.0, January 2004_ 5 | __ 6 | 7 | ### Terms and Conditions for use, reproduction, and distribution 8 | 9 | #### 1. Definitions 10 | 11 | “License” shall mean the terms and conditions for use, reproduction, 12 | and distribution as defined by Sections 1 through 9 of this document. 13 | 14 | “Licensor” shall mean the copyright owner or entity authorized by the 15 | copyright owner that is granting the License. 16 | 17 | “Legal Entity” shall mean the union of the acting entity and all other 18 | entities that control, are controlled by, or are under common control 19 | with that entity. For the purposes of this definition, “control” 20 | means **(i)** the power, direct or indirect, to cause the direction or 21 | management of such entity, whether by contract or otherwise, or 22 | **(ii)** ownership of fifty percent (50%) or more of the outstanding 23 | shares, or **(iii)** beneficial ownership of such entity. 24 | 25 | “You” (or “Your”) shall mean an individual or Legal Entity exercising 26 | permissions granted by this License. 27 | 28 | “Source” form shall mean the preferred form for making modifications, 29 | including but not limited to software source code, documentation 30 | source, and configuration files. 31 | 32 | “Object” form shall mean any form resulting from mechanical 33 | transformation or translation of a Source form, including but not 34 | limited to compiled object code, generated documentation, and 35 | conversions to other media types. 36 | 37 | “Work” shall mean the work of authorship, whether in Source or Object 38 | form, made available under the License, as indicated by a copyright 39 | notice that is included in or attached to the work (an example is 40 | provided in the Appendix below). 
41 | 42 | “Derivative Works” shall mean any work, whether in Source or Object 43 | form, that is based on (or derived from) the Work and for which the 44 | editorial revisions, annotations, elaborations, or other modifications 45 | represent, as a whole, an original work of authorship. For the 46 | purposes of this License, Derivative Works shall not include works 47 | that remain separable from, or merely link (or bind by name) to the 48 | interfaces of, the Work and Derivative Works thereof. 49 | 50 | “Contribution” shall mean any work of authorship, including the 51 | original version of the Work and any modifications or additions to 52 | that Work or Derivative Works thereof, that is intentionally submitted 53 | to Licensor for inclusion in the Work by the copyright owner or by an 54 | individual or Legal Entity authorized to submit on behalf of the 55 | copyright owner. For the purposes of this definition, “submitted” 56 | means any form of electronic, verbal, or written communication sent to 57 | the Licensor or its representatives, including but not limited to 58 | communication on electronic mailing lists, source code control 59 | systems, and issue tracking systems that are managed by, or on behalf 60 | of, the Licensor for the purpose of discussing and improving the Work, 61 | but excluding communication that is conspicuously marked or otherwise 62 | designated in writing by the copyright owner as “Not a Contribution.” 63 | 64 | “Contributor” shall mean Licensor and any individual or Legal Entity 65 | on behalf of whom a Contribution has been received by Licensor and 66 | subsequently incorporated within the Work. 67 | 68 | #### 2. 
Grant of Copyright License 69 | 70 | Subject to the terms and conditions of this License, each Contributor 71 | hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, 72 | royalty-free, irrevocable copyright license to reproduce, prepare 73 | Derivative Works of, publicly display, publicly perform, sublicense, 74 | and distribute the Work and such Derivative Works in Source or Object 75 | form. 76 | 77 | #### 3. Grant of Patent License 78 | 79 | Subject to the terms and conditions of this License, each Contributor 80 | hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, 81 | royalty-free, irrevocable (except as stated in this section) patent 82 | license to make, have made, use, offer to sell, sell, import, and 83 | otherwise transfer the Work, where such license applies only to those 84 | patent claims licensable by such Contributor that are necessarily 85 | infringed by their Contribution(s) alone or by combination of their 86 | Contribution(s) with the Work to which such Contribution(s) was 87 | submitted. If You institute patent litigation against any entity 88 | (including a cross-claim or counterclaim in a lawsuit) alleging that 89 | the Work or a Contribution incorporated within the Work constitutes 90 | direct or contributory patent infringement, then any patent licenses 91 | granted to You under this License for that Work shall terminate as of 92 | the date such litigation is filed. 93 | 94 | #### 4. 
Redistribution 95 | 96 | You may reproduce and distribute copies of the Work or Derivative 97 | Works thereof in any medium, with or without modifications, and in 98 | Source or Object form, provided that You meet the following 99 | conditions: 100 | 101 | * **(a)** You must give any other recipients of the Work or Derivative 102 | Works a copy of this License; and 103 | * **(b)** You must cause any modified files to carry prominent notices 104 | stating that You changed the files; and 105 | * **(c)** You must retain, in the Source form of any Derivative Works 106 | that You distribute, all copyright, patent, trademark, and 107 | attribution notices from the Source form of the Work, excluding 108 | those notices that do not pertain to any part of the Derivative 109 | Works; and 110 | * **(d)** If the Work includes a “NOTICE” text file as part of its 111 | distribution, then any Derivative Works that You distribute must 112 | include a readable copy of the attribution notices contained within 113 | such NOTICE file, excluding those notices that do not pertain to any 114 | part of the Derivative Works, in at least one of the following 115 | places: within a NOTICE text file distributed as part of the 116 | Derivative Works; within the Source form or documentation, if 117 | provided along with the Derivative Works; or, within a display 118 | generated by the Derivative Works, if and wherever such third-party 119 | notices normally appear. The contents of the NOTICE file are for 120 | informational purposes only and do not modify the License. You may 121 | add Your own attribution notices within Derivative Works that You 122 | distribute, alongside or as an addendum to the NOTICE text from the 123 | Work, provided that such additional attribution notices cannot be 124 | construed as modifying the License. 
125 | 126 | You may add Your own copyright statement to Your modifications and may 127 | provide additional or different license terms and conditions for use, 128 | reproduction, or distribution of Your modifications, or for any such 129 | Derivative Works as a whole, provided Your use, reproduction, and 130 | distribution of the Work otherwise complies with the conditions stated 131 | in this License. 132 | 133 | #### 5. Submission of Contributions 134 | 135 | Unless You explicitly state otherwise, any Contribution intentionally 136 | submitted for inclusion in the Work by You to the Licensor shall be 137 | under the terms and conditions of this License, without any additional 138 | terms or conditions. Notwithstanding the above, nothing herein shall 139 | supersede or modify the terms of any separate license agreement you 140 | may have executed with Licensor regarding such Contributions. 141 | 142 | #### 6. Trademarks 143 | 144 | This License does not grant permission to use the trade names, 145 | trademarks, service marks, or product names of the Licensor, except as 146 | required for reasonable and customary use in describing the origin of 147 | the Work and reproducing the content of the NOTICE file. 148 | 149 | #### 7. Disclaimer of Warranty 150 | 151 | Unless required by applicable law or agreed to in writing, Licensor 152 | provides the Work (and each Contributor provides its Contributions) on 153 | an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 154 | express or implied, including, without limitation, any warranties or 155 | conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR 156 | A PARTICULAR PURPOSE. You are solely responsible for determining the 157 | appropriateness of using or redistributing the Work and assume any 158 | risks associated with Your exercise of permissions under this License. 159 | 160 | #### 8. 
Limitation of Liability 161 | 162 | In no event and under no legal theory, whether in tort (including 163 | negligence), contract, or otherwise, unless required by applicable law 164 | (such as deliberate and grossly negligent acts) or agreed to in 165 | writing, shall any Contributor be liable to You for damages, including 166 | any direct, indirect, special, incidental, or consequential damages of 167 | any character arising as a result of this License or out of the use or 168 | inability to use the Work (including but not limited to damages for 169 | loss of goodwill, work stoppage, computer failure or malfunction, or 170 | any and all other commercial damages or losses), even if such 171 | Contributor has been advised of the possibility of such damages. 172 | 173 | #### 9. Accepting Warranty or Additional Liability 174 | 175 | While redistributing the Work or Derivative Works thereof, You may 176 | choose to offer, and charge a fee for, acceptance of support, 177 | warranty, indemnity, or other liability obligations and/or rights 178 | consistent with this License. However, in accepting such obligations, 179 | You may act only on Your own behalf and on Your sole responsibility, 180 | not on behalf of any other Contributor, and only if You agree to 181 | indemnify, defend, and hold each Contributor harmless for any 182 | liability incurred by, or claims asserted against, such Contributor by 183 | reason of your accepting any such warranty or additional liability. 184 | 185 | _END OF TERMS AND CONDITIONS_ 186 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Gota: DataFrames, Series and Data Wrangling for Go 2 | 3 | This is an implementation of DataFrames, Series and data wrangling 4 | methods for the Go programming language. The API is still in flux so 5 | _use at your own risk_. 
6 | 7 | ## DataFrame 8 | 9 | The term DataFrame typically refers to a tabular dataset that can be 10 | viewed as a two-dimensional table. Often the columns of this dataset 11 | refer to a list of features, while the rows represent a number of 12 | measurements. As the data in the real world is not perfect, DataFrame 13 | supports missing measurements or NaN elements. 14 | 15 | Common examples of DataFrames can be found in Excel sheets, CSV files 16 | or SQL database tables, but this data can come in a variety of other 17 | formats, like a collection of JSON objects or XML files. 18 | 19 | The utility of DataFrames resides in the ability to subset them, merge 20 | them, summarize the data for individual features or apply functions to 21 | entire rows or columns, all while keeping column type integrity. 22 | 23 | ### Usage 24 | 25 | #### Loading data 26 | 27 | DataFrames can be constructed by passing Series to the dataframe.New constructor 28 | function: 29 | 30 | ```go 31 | df := dataframe.New( 32 | series.New([]string{"b", "a"}, series.String, "COL.1"), 33 | series.New([]int{1, 2}, series.Int, "COL.2"), 34 | series.New([]float64{3.0, 4.0}, series.Float, "COL.3"), 35 | ) 36 | ``` 37 | 38 | You can also load the data directly from other formats.
39 | The base loading function takes some records in the 40 | form `[][]string` and returns a new DataFrame from there: 41 | 42 | ```go 43 | df := dataframe.LoadRecords( 44 | [][]string{ 45 | []string{"A", "B", "C", "D"}, 46 | []string{"a", "4", "5.1", "true"}, 47 | []string{"k", "5", "7.0", "true"}, 48 | []string{"k", "4", "6.0", "true"}, 49 | []string{"a", "2", "7.1", "false"}, 50 | }, 51 | ) 52 | ``` 53 | 54 | Now you can also create DataFrames by loading a slice of arbitrary structs: 55 | 56 | ```go 57 | type User struct { 58 | Name string 59 | Age int 60 | Accuracy float64 61 | ignored bool // ignored since unexported 62 | } 63 | users := []User{ 64 | {"Aram", 17, 0.2, true}, 65 | {"Juan", 18, 0.8, true}, 66 | {"Ana", 22, 0.5, true}, 67 | } 68 | df := dataframe.LoadStructs(users) 69 | ``` 70 | 71 | By default, the column types will be auto-detected, but this can be 72 | configured. For example, if we wish the default type to be `Float` but 73 | columns `A` and `D` are `String` and `Bool` respectively: 74 | 75 | ```go 76 | df := dataframe.LoadRecords( 77 | [][]string{ 78 | []string{"A", "B", "C", "D"}, 79 | []string{"a", "4", "5.1", "true"}, 80 | []string{"k", "5", "7.0", "true"}, 81 | []string{"k", "4", "6.0", "true"}, 82 | []string{"a", "2", "7.1", "false"}, 83 | }, 84 | dataframe.DetectTypes(false), 85 | dataframe.DefaultType(series.Float), 86 | dataframe.WithTypes(map[string]series.Type{ 87 | "A": series.String, 88 | "D": series.Bool, 89 | }), 90 | ) 91 | ``` 92 | 93 | Similarly, you can load the data stored in a `[]map[string]interface{}`: 94 | 95 | ```go 96 | df := dataframe.LoadMaps( 97 | []map[string]interface{}{ 98 | map[string]interface{}{ 99 | "A": "a", 100 | "B": 1, 101 | "C": true, 102 | "D": 0, 103 | }, 104 | map[string]interface{}{ 105 | "A": "b", 106 | "B": 2, 107 | "C": true, 108 | "D": 0.5, 109 | }, 110 | }, 111 | ) 112 | ``` 113 | 114 | You can also pass an `io.Reader` to the functions `ReadCSV`/`ReadJSON` 115 | and it will work as expected
given that the data is correct: 116 | 117 | ```go 118 | csvStr := ` 119 | Country,Date,Age,Amount,Id 120 | "United States",2012-02-01,50,112.1,01234 121 | "United States",2012-02-01,32,321.31,54320 122 | "United Kingdom",2012-02-01,17,18.2,12345 123 | "United States",2012-02-01,32,321.31,54320 124 | "United Kingdom",2012-02-01,NA,18.2,12345 125 | "United States",2012-02-01,32,321.31,54320 126 | "United States",2012-02-01,32,321.31,54320 127 | Spain,2012-02-01,66,555.42,00241 128 | ` 129 | df := dataframe.ReadCSV(strings.NewReader(csvStr)) 130 | ``` 131 | 132 | ```go 133 | jsonStr := `[{"COL.2":1,"COL.3":3},{"COL.1":5,"COL.2":2,"COL.3":2},{"COL.1":6,"COL.2":3,"COL.3":1}]` 134 | df := dataframe.ReadJSON(strings.NewReader(jsonStr)) 135 | ``` 136 | 137 | #### Subsetting 138 | 139 | We can subset our DataFrames with the Subset method. For example if we 140 | want the first and third rows we can do the following: 141 | 142 | ```go 143 | sub := df.Subset([]int{0, 2}) 144 | ``` 145 | 146 | #### Column selection 147 | 148 | If instead of subsetting the rows we want to select specific columns, 149 | by an index or column name: 150 | 151 | ```go 152 | sel1 := df.Select([]int{0, 2}) 153 | sel2 := df.Select([]string{"A", "C"}) 154 | ``` 155 | 156 | #### Updating values 157 | 158 | In order to update the values of a DataFrame we can use the Set 159 | method: 160 | 161 | ```go 162 | df2 := df.Set( 163 | []int{0, 2}, 164 | dataframe.LoadRecords( 165 | [][]string{ 166 | []string{"A", "B", "C", "D"}, 167 | []string{"b", "4", "6.0", "true"}, 168 | []string{"c", "3", "6.0", "false"}, 169 | }, 170 | ), 171 | ) 172 | ``` 173 | 174 | #### Filtering 175 | 176 | For more complex row subsetting we can use the Filter method. 
For 177 | example, if we want the rows where the column "A" is equal to "a" or 178 | column "B" is greater than 4: 179 | 180 | ```go 181 | fil := df.Filter( 182 | dataframe.F{"A", series.Eq, "a"}, 183 | dataframe.F{"B", series.Greater, 4}, 184 | ) 185 | fil2 := fil.Filter( 186 | dataframe.F{"D", series.Eq, true}, 187 | ) 188 | ``` 189 | 190 | Filters inside Filter are combined as OR operations whereas if we chain 191 | Filter methods, they will behave as AND. 192 | 193 | #### Arrange 194 | 195 | With Arrange a DataFrame can be sorted by the given column names: 196 | 197 | ```go 198 | sorted := df.Arrange( 199 | dataframe.Sort("A"), // Sort in ascending order 200 | dataframe.RevSort("B"), // Sort in descending order 201 | ) 202 | ``` 203 | 204 | #### Mutate 205 | 206 | If we want to modify a column or add one based on a given Series at 207 | the end we can use the Mutate method: 208 | 209 | ```go 210 | // Change column C with a new one 211 | mut := df.Mutate( 212 | series.New([]string{"a", "b", "c", "d"}, series.String, "C"), 213 | ) 214 | // Add a new column E 215 | mut2 := df.Mutate( 216 | series.New([]string{"a", "b", "c", "d"}, series.String, "E"), 217 | ) 218 | ``` 219 | 220 | #### Joins 221 | 222 | Different Join operations are supported (`InnerJoin`, `LeftJoin`, 223 | `RightJoin`, `CrossJoin`). 
In order to use these methods you have to 224 | specify which are the keys to be used for joining the DataFrames: 225 | 226 | ```go 227 | df := dataframe.LoadRecords( 228 | [][]string{ 229 | []string{"A", "B", "C", "D"}, 230 | []string{"a", "4", "5.1", "true"}, 231 | []string{"k", "5", "7.0", "true"}, 232 | []string{"k", "4", "6.0", "true"}, 233 | []string{"a", "2", "7.1", "false"}, 234 | }, 235 | ) 236 | df2 := dataframe.LoadRecords( 237 | [][]string{ 238 | []string{"A", "F", "D"}, 239 | []string{"1", "1", "true"}, 240 | []string{"4", "2", "false"}, 241 | []string{"2", "8", "false"}, 242 | []string{"5", "9", "false"}, 243 | }, 244 | ) 245 | join := df.InnerJoin(df2, "D") 246 | ``` 247 | 248 | #### Function application 249 | 250 | Functions can be applied to the rows or columns of a DataFrame, 251 | casting the types as necessary: 252 | 253 | ```go 254 | mean := func(s series.Series) series.Series { 255 | floats := s.Float() 256 | sum := 0.0 257 | for _, f := range floats { 258 | sum += f 259 | } 260 | return series.Floats(sum / float64(len(floats))) 261 | } 262 | df.Capply(mean) 263 | df.Rapply(mean) 264 | ``` 265 | 266 | #### Chaining operations 267 | 268 | DataFrames support a number of methods for wrangling the data, 269 | filtering, subsetting, selecting columns, adding new columns or 270 | modifying existing ones. All these methods can be chained one after 271 | another, and at the end of the procedure you can check whether there have been 272 | any errors via the DataFrame Err field. If any of the methods in the chain 273 | returns an error, the remaining operations on the chain will become 274 | no-ops. 275 | 276 | ```go 277 | a = a.Rename("Origin", "Country"). 278 | Filter(dataframe.F{"Age", "<", 50}). 279 | Filter(dataframe.F{"Origin", "==", "United States"}). 280 | Select("Id", "Origin", "Date").
281 | Subset([]int{1, 3}) 282 | if a.Err != nil { 283 | log.Fatal("Oh noes!") 284 | } 285 | ``` 286 | 287 | #### Print to console 288 | 289 | ```go 290 | fmt.Println(flights) 291 | 292 | > [336776x20] DataFrame 293 | > 294 | > X0 year month day dep_time sched_dep_time dep_delay arr_time ... 295 | > 0: 1 2013 1 1 517 515 2 830 ... 296 | > 1: 2 2013 1 1 533 529 4 850 ... 297 | > 2: 3 2013 1 1 542 540 2 923 ... 298 | > 3: 4 2013 1 1 544 545 -1 1004 ... 299 | > 4: 5 2013 1 1 554 600 -6 812 ... 300 | > 5: 6 2013 1 1 554 558 -4 740 ... 301 | > 6: 7 2013 1 1 555 600 -5 913 ... 302 | > 7: 8 2013 1 1 557 600 -3 709 ... 303 | > 8: 9 2013 1 1 557 600 -3 838 ... 304 | > 9: 10 2013 1 1 558 600 -2 753 ... 305 | > ... ... ... ... ... ... ... ... ... 306 | > ... 307 | > 308 | > Not Showing: sched_arr_time , arr_delay , carrier , flight , 309 | > tailnum , origin , dest , air_time , distance , hour , 310 | > minute , time_hour 311 | ``` 312 | 313 | #### Interfacing with gonum 314 | 315 | A `gonum/mat.Matrix` or any object that implements the `dataframe.Matrix` 316 | interface can be loaded as a `DataFrame` by using the `LoadMatrix()` method. If 317 | one wants to convert a `DataFrame` to a `mat.Matrix` one has to create 318 | the required structs and method implementations. Since a `DataFrame` already 319 | implements the `Dims() (r, c int)` method, only implementations for the `At` and 320 | `T` methods are necessary: 321 | 322 | ```go 323 | type matrix struct { 324 | DataFrame 325 | } 326 | 327 | func (m matrix) At(i, j int) float64 { 328 | return m.columns[j].Elem(i).Float() 329 | } 330 | 331 | func (m matrix) T() mat.Matrix { 332 | return mat.Transpose{Matrix: m} 333 | } 334 | ``` 335 | 336 | ## Series 337 | 338 | Series are essentially vectors of elements of the same type with 339 | support for missing values. Series are the building blocks for 340 | DataFrame columns.
341 | 342 | Four types are currently supported: 343 | 344 | ```go 345 | Int 346 | Float 347 | String 348 | Bool 349 | ``` 350 | 351 | For more information about the API, make sure to check: 352 | 353 | - [dataframe godoc][3] 354 | - [series godoc][4] 355 | 356 | ## License 357 | 358 | Copyright 2016 Alejandro Sanchez Brotons 359 | 360 | Licensed under the Apache License, Version 2.0 (the "License"); you 361 | may not use this file except in compliance with the License. You may 362 | obtain a copy of the License at 363 | 364 | http://www.apache.org/licenses/LICENSE-2.0 365 | 366 | Unless required by applicable law or agreed to in writing, software 367 | distributed under the License is distributed on an "AS IS" BASIS, 368 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 369 | implied. See the License for the specific language governing 370 | permissions and limitations under the License. 371 | 372 | [1]: https://github.com/gonum 373 | [2]: https://github.com/go-gota/gota 374 | [3]: https://godoc.org/github.com/go-gota/gota/dataframe 375 | [4]: https://godoc.org/github.com/go-gota/gota/series 376 | -------------------------------------------------------------------------------- /series/series.go: -------------------------------------------------------------------------------- 1 | package series 2 | 3 | import ( 4 | "fmt" 5 | "reflect" 6 | "sort" 7 | "strings" 8 | 9 | "math" 10 | 11 | "gonum.org/v1/gonum/stat" 12 | ) 13 | 14 | // Series is a data structure designed for operating on arrays of elements that 15 | // should comply with a certain type structure. They are flexible enough that they 16 | // can be transformed to other Series types and account for missing or non-valid 17 | // elements. Most of the power of Series resides in the ability to compare and 18 | // subset Series of different types.
type Series struct {
	Name     string   // The name of the series
	elements Elements // The values of the elements
	t        Type     // The type of the series
	Err      error    // If there are errors they are stored here
}

// Elements is the interface that represents the array of elements contained on
// a Series.
type Elements interface {
	Elem(int) Element // Element stored at the given position
	Len() int         // Number of stored elements
}

// Element is the interface that defines the types of methods to be present for
// elements of a Series.
type Element interface {
	// Setter method
	Set(interface{})

	// Comparison methods
	Eq(Element) bool
	Neq(Element) bool
	Less(Element) bool
	LessEq(Element) bool
	Greater(Element) bool
	GreaterEq(Element) bool

	// Accessor/conversion methods
	Copy() Element     // FIXME: Returning interface is a recipe for pain
	Val() ElementValue // FIXME: Returning interface is a recipe for pain
	String() string
	Int() (int, error)
	Float() float64
	Bool() (bool, error)

	// Information methods
	IsNA() bool
	Type() Type
}

// intElements is the concrete implementation of Elements for Int elements.
type intElements []intElement

func (e intElements) Len() int { return len(e) }

// Elem hands out a pointer into the slice, so calling Set on the returned
// Element updates the stored value in place.
func (e intElements) Elem(i int) Element { return &e[i] }

// stringElements is the concrete implementation of Elements for String elements.
type stringElements []stringElement

func (e stringElements) Len() int { return len(e) }

// Elem hands out a pointer into the slice, so calling Set on the returned
// Element updates the stored value in place.
func (e stringElements) Elem(i int) Element { return &e[i] }

// floatElements is the concrete implementation of Elements for Float elements.
type floatElements []floatElement

func (e floatElements) Len() int { return len(e) }

// Elem hands out a pointer into the slice, so calling Set on the returned
// Element updates the stored value in place.
func (e floatElements) Elem(i int) Element { return &e[i] }

// boolElements is the concrete implementation of Elements for Bool elements.
type boolElements []boolElement

func (e boolElements) Len() int { return len(e) }

// Elem hands out a pointer into the slice, so calling Set on the returned
// Element updates the stored value in place.
func (e boolElements) Elem(i int) Element { return &e[i] }

// ElementValue represents the value that can be used for marshaling or
// unmarshaling Elements.
type ElementValue interface{}

// MapFunction is the signature of the functions accepted by Series.Map: it
// receives one Element and returns the Element that takes its place in the
// resulting Series.
type MapFunction func(Element) Element

// Comparator is a named string type that makes reasoning about and using
// comparators more type safe.
type Comparator string

// Supported Comparators
const (
	Eq        Comparator = "=="  // Equal
	Neq       Comparator = "!="  // Not equal
	Greater   Comparator = ">"   // Greater than
	GreaterEq Comparator = ">="  // Greater than or equal
	Less      Comparator = "<"   // Less than
	LessEq    Comparator = "<="  // Less than or equal
	In        Comparator = "in"  // Inside
)

// Type is a named string type that makes reasoning about and using Series
// types more type safe.
type Type string

// Supported Series Types
const (
	String Type = "string"
	Int    Type = "int"
	Float  Type = "float"
	Bool   Type = "bool"
)

// Indexes represent the elements that can be used for selecting a subset of
// elements within a Series.
Currently supported are: 119 | // 120 | // int // Matches the given index number 121 | // []int // Matches all given index numbers 122 | // []bool // Matches all elements in a Series marked as true 123 | // Series [Int] // Same as []int 124 | // Series [Bool] // Same as []bool 125 | type Indexes interface{} 126 | 127 | // New is the generic Series constructor 128 | func New(values interface{}, t Type, name string) Series { 129 | ret := Series{ 130 | Name: name, 131 | t: t, 132 | } 133 | 134 | // Pre-allocate elements 135 | preAlloc := func(n int) { 136 | switch t { 137 | case String: 138 | ret.elements = make(stringElements, n) 139 | case Int: 140 | ret.elements = make(intElements, n) 141 | case Float: 142 | ret.elements = make(floatElements, n) 143 | case Bool: 144 | ret.elements = make(boolElements, n) 145 | default: 146 | panic(fmt.Sprintf("unknown type %v", t)) 147 | } 148 | } 149 | 150 | if values == nil { 151 | preAlloc(1) 152 | ret.elements.Elem(0).Set(nil) 153 | return ret 154 | } 155 | 156 | switch values.(type) { 157 | case []string: 158 | v := values.([]string) 159 | l := len(v) 160 | preAlloc(l) 161 | for i := 0; i < l; i++ { 162 | ret.elements.Elem(i).Set(v[i]) 163 | } 164 | case []float64: 165 | v := values.([]float64) 166 | l := len(v) 167 | preAlloc(l) 168 | for i := 0; i < l; i++ { 169 | ret.elements.Elem(i).Set(v[i]) 170 | } 171 | case []int: 172 | v := values.([]int) 173 | l := len(v) 174 | preAlloc(l) 175 | for i := 0; i < l; i++ { 176 | ret.elements.Elem(i).Set(v[i]) 177 | } 178 | case []bool: 179 | v := values.([]bool) 180 | l := len(v) 181 | preAlloc(l) 182 | for i := 0; i < l; i++ { 183 | ret.elements.Elem(i).Set(v[i]) 184 | } 185 | case Series: 186 | v := values.(Series) 187 | l := v.Len() 188 | preAlloc(l) 189 | for i := 0; i < l; i++ { 190 | ret.elements.Elem(i).Set(v.elements.Elem(i)) 191 | } 192 | default: 193 | switch reflect.TypeOf(values).Kind() { 194 | case reflect.Slice: 195 | v := reflect.ValueOf(values) 196 | l := v.Len() 197 | 
preAlloc(v.Len()) 198 | for i := 0; i < l; i++ { 199 | val := v.Index(i).Interface() 200 | ret.elements.Elem(i).Set(val) 201 | } 202 | default: 203 | preAlloc(1) 204 | v := reflect.ValueOf(values) 205 | val := v.Interface() 206 | ret.elements.Elem(0).Set(val) 207 | } 208 | } 209 | 210 | return ret 211 | } 212 | 213 | // Strings is a constructor for a String Series 214 | func Strings(values interface{}) Series { 215 | return New(values, String, "") 216 | } 217 | 218 | // Ints is a constructor for an Int Series 219 | func Ints(values interface{}) Series { 220 | return New(values, Int, "") 221 | } 222 | 223 | // Floats is a constructor for a Float Series 224 | func Floats(values interface{}) Series { 225 | return New(values, Float, "") 226 | } 227 | 228 | // Bools is a constructor for a Bool Series 229 | func Bools(values interface{}) Series { 230 | return New(values, Bool, "") 231 | } 232 | 233 | // Empty returns an empty Series of the same type 234 | func (s Series) Empty() Series { 235 | return New([]int{}, s.t, s.Name) 236 | } 237 | 238 | // Append adds new elements to the end of the Series. When using Append, the 239 | // Series is modified in place. 240 | func (s *Series) Append(values interface{}) { 241 | if err := s.Err; err != nil { 242 | return 243 | } 244 | news := New(values, s.t, s.Name) 245 | switch s.t { 246 | case String: 247 | s.elements = append(s.elements.(stringElements), news.elements.(stringElements)...) 248 | case Int: 249 | s.elements = append(s.elements.(intElements), news.elements.(intElements)...) 250 | case Float: 251 | s.elements = append(s.elements.(floatElements), news.elements.(floatElements)...) 252 | case Bool: 253 | s.elements = append(s.elements.(boolElements), news.elements.(boolElements)...) 254 | } 255 | } 256 | 257 | // Concat concatenates two series together. It will return a new Series with the 258 | // combined elements of both Series. 
259 | func (s Series) Concat(x Series) Series { 260 | if err := s.Err; err != nil { 261 | return s 262 | } 263 | if err := x.Err; err != nil { 264 | s.Err = fmt.Errorf("concat error: argument has errors: %v", err) 265 | return s 266 | } 267 | y := s.Copy() 268 | y.Append(x) 269 | return y 270 | } 271 | 272 | // Subset returns a subset of the series based on the given Indexes. 273 | func (s Series) Subset(indexes Indexes) Series { 274 | if err := s.Err; err != nil { 275 | return s 276 | } 277 | idx, err := parseIndexes(s.Len(), indexes) 278 | if err != nil { 279 | s.Err = err 280 | return s 281 | } 282 | ret := Series{ 283 | Name: s.Name, 284 | t: s.t, 285 | } 286 | switch s.t { 287 | case String: 288 | elements := make(stringElements, len(idx)) 289 | for k, i := range idx { 290 | elements[k] = s.elements.(stringElements)[i] 291 | } 292 | ret.elements = elements 293 | case Int: 294 | elements := make(intElements, len(idx)) 295 | for k, i := range idx { 296 | elements[k] = s.elements.(intElements)[i] 297 | } 298 | ret.elements = elements 299 | case Float: 300 | elements := make(floatElements, len(idx)) 301 | for k, i := range idx { 302 | elements[k] = s.elements.(floatElements)[i] 303 | } 304 | ret.elements = elements 305 | case Bool: 306 | elements := make(boolElements, len(idx)) 307 | for k, i := range idx { 308 | elements[k] = s.elements.(boolElements)[i] 309 | } 310 | ret.elements = elements 311 | default: 312 | panic("unknown series type") 313 | } 314 | return ret 315 | } 316 | 317 | // Set sets the values on the indexes of a Series and returns the reference 318 | // for itself. The original Series is modified. 
319 | func (s Series) Set(indexes Indexes, newvalues Series) Series { 320 | if err := s.Err; err != nil { 321 | return s 322 | } 323 | if err := newvalues.Err; err != nil { 324 | s.Err = fmt.Errorf("set error: argument has errors: %v", err) 325 | return s 326 | } 327 | idx, err := parseIndexes(s.Len(), indexes) 328 | if err != nil { 329 | s.Err = err 330 | return s 331 | } 332 | if len(idx) != newvalues.Len() { 333 | s.Err = fmt.Errorf("set error: dimensions mismatch") 334 | return s 335 | } 336 | for k, i := range idx { 337 | if i < 0 || i >= s.Len() { 338 | s.Err = fmt.Errorf("set error: index out of range") 339 | return s 340 | } 341 | s.elements.Elem(i).Set(newvalues.elements.Elem(k)) 342 | } 343 | return s 344 | } 345 | 346 | // HasNaN checks whether the Series contain NaN elements. 347 | func (s Series) HasNaN() bool { 348 | for i := 0; i < s.Len(); i++ { 349 | if s.elements.Elem(i).IsNA() { 350 | return true 351 | } 352 | } 353 | return false 354 | } 355 | 356 | // IsNaN returns an array that identifies which of the elements are NaN. 357 | func (s Series) IsNaN() []bool { 358 | ret := make([]bool, s.Len()) 359 | for i := 0; i < s.Len(); i++ { 360 | ret[i] = s.elements.Elem(i).IsNA() 361 | } 362 | return ret 363 | } 364 | 365 | // Compare compares the values of a Series with other elements. To do so, the 366 | // elements with are to be compared are first transformed to a Series of the same 367 | // type as the caller. 
368 | func (s Series) Compare(comparator Comparator, comparando interface{}) Series { 369 | if err := s.Err; err != nil { 370 | return s 371 | } 372 | compareElements := func(a, b Element, c Comparator) (bool, error) { 373 | var ret bool 374 | switch c { 375 | case Eq: 376 | ret = a.Eq(b) 377 | case Neq: 378 | ret = a.Neq(b) 379 | case Greater: 380 | ret = a.Greater(b) 381 | case GreaterEq: 382 | ret = a.GreaterEq(b) 383 | case Less: 384 | ret = a.Less(b) 385 | case LessEq: 386 | ret = a.LessEq(b) 387 | default: 388 | return false, fmt.Errorf("unknown comparator: %v", c) 389 | } 390 | return ret, nil 391 | } 392 | 393 | comp := New(comparando, s.t, "") 394 | bools := make([]bool, s.Len()) 395 | // In comparator comparation 396 | if comparator == In { 397 | for i := 0; i < s.Len(); i++ { 398 | e := s.elements.Elem(i) 399 | b := false 400 | for j := 0; j < comp.Len(); j++ { 401 | m := comp.elements.Elem(j) 402 | c, err := compareElements(e, m, Eq) 403 | if err != nil { 404 | s = s.Empty() 405 | s.Err = err 406 | return s 407 | } 408 | if c { 409 | b = true 410 | break 411 | } 412 | } 413 | bools[i] = b 414 | } 415 | return Bools(bools) 416 | } 417 | 418 | // Single element comparison 419 | if comp.Len() == 1 { 420 | for i := 0; i < s.Len(); i++ { 421 | e := s.elements.Elem(i) 422 | c, err := compareElements(e, comp.elements.Elem(0), comparator) 423 | if err != nil { 424 | s = s.Empty() 425 | s.Err = err 426 | return s 427 | } 428 | bools[i] = c 429 | } 430 | return Bools(bools) 431 | } 432 | 433 | // Multiple element comparison 434 | if s.Len() != comp.Len() { 435 | s := s.Empty() 436 | s.Err = fmt.Errorf("can't compare: length mismatch") 437 | return s 438 | } 439 | for i := 0; i < s.Len(); i++ { 440 | e := s.elements.Elem(i) 441 | c, err := compareElements(e, comp.elements.Elem(i), comparator) 442 | if err != nil { 443 | s = s.Empty() 444 | s.Err = err 445 | return s 446 | } 447 | bools[i] = c 448 | } 449 | return Bools(bools) 450 | } 451 | 452 | // Copy will 
return a copy of the Series. 453 | func (s Series) Copy() Series { 454 | name := s.Name 455 | t := s.t 456 | err := s.Err 457 | var elements Elements 458 | switch s.t { 459 | case String: 460 | elements = make(stringElements, s.Len()) 461 | copy(elements.(stringElements), s.elements.(stringElements)) 462 | case Float: 463 | elements = make(floatElements, s.Len()) 464 | copy(elements.(floatElements), s.elements.(floatElements)) 465 | case Bool: 466 | elements = make(boolElements, s.Len()) 467 | copy(elements.(boolElements), s.elements.(boolElements)) 468 | case Int: 469 | elements = make(intElements, s.Len()) 470 | copy(elements.(intElements), s.elements.(intElements)) 471 | } 472 | ret := Series{ 473 | Name: name, 474 | t: t, 475 | elements: elements, 476 | Err: err, 477 | } 478 | return ret 479 | } 480 | 481 | // Records returns the elements of a Series as a []string 482 | func (s Series) Records() []string { 483 | ret := make([]string, s.Len()) 484 | for i := 0; i < s.Len(); i++ { 485 | e := s.elements.Elem(i) 486 | ret[i] = e.String() 487 | } 488 | return ret 489 | } 490 | 491 | // Float returns the elements of a Series as a []float64. If the elements can not 492 | // be converted to float64 or contains a NaN returns the float representation of 493 | // NaN. 494 | func (s Series) Float() []float64 { 495 | ret := make([]float64, s.Len()) 496 | for i := 0; i < s.Len(); i++ { 497 | e := s.elements.Elem(i) 498 | ret[i] = e.Float() 499 | } 500 | return ret 501 | } 502 | 503 | // Int returns the elements of a Series as a []int or an error if the 504 | // transformation is not possible. 
505 | func (s Series) Int() ([]int, error) { 506 | ret := make([]int, s.Len()) 507 | for i := 0; i < s.Len(); i++ { 508 | e := s.elements.Elem(i) 509 | val, err := e.Int() 510 | if err != nil { 511 | return nil, err 512 | } 513 | ret[i] = val 514 | } 515 | return ret, nil 516 | } 517 | 518 | // Bool returns the elements of a Series as a []bool or an error if the 519 | // transformation is not possible. 520 | func (s Series) Bool() ([]bool, error) { 521 | ret := make([]bool, s.Len()) 522 | for i := 0; i < s.Len(); i++ { 523 | e := s.elements.Elem(i) 524 | val, err := e.Bool() 525 | if err != nil { 526 | return nil, err 527 | } 528 | ret[i] = val 529 | } 530 | return ret, nil 531 | } 532 | 533 | // Type returns the type of a given series 534 | func (s Series) Type() Type { 535 | return s.t 536 | } 537 | 538 | // Len returns the length of a given Series 539 | func (s Series) Len() int { 540 | return s.elements.Len() 541 | } 542 | 543 | // String implements the Stringer interface for Series 544 | func (s Series) String() string { 545 | return fmt.Sprint(s.elements) 546 | } 547 | 548 | // Str prints some extra information about a given series 549 | func (s Series) Str() string { 550 | var ret []string 551 | // If name exists print name 552 | if s.Name != "" { 553 | ret = append(ret, "Name: "+s.Name) 554 | } 555 | ret = append(ret, "Type: "+fmt.Sprint(s.t)) 556 | ret = append(ret, "Length: "+fmt.Sprint(s.Len())) 557 | if s.Len() != 0 { 558 | ret = append(ret, "Values: "+fmt.Sprint(s)) 559 | } 560 | return strings.Join(ret, "\n") 561 | } 562 | 563 | // Val returns the value of a series for the given index. Will panic if the index 564 | // is out of bounds. 565 | func (s Series) Val(i int) interface{} { 566 | return s.elements.Elem(i).Val() 567 | } 568 | 569 | // Elem returns the element of a series for the given index. Will panic if the 570 | // index is out of bounds. 
571 | func (s Series) Elem(i int) Element { 572 | return s.elements.Elem(i) 573 | } 574 | 575 | // parseIndexes will parse the given indexes for a given series of length `l`. No 576 | // out of bounds checks is performed. 577 | func parseIndexes(l int, indexes Indexes) ([]int, error) { 578 | var idx []int 579 | switch indexes.(type) { 580 | case []int: 581 | idx = indexes.([]int) 582 | case int: 583 | idx = []int{indexes.(int)} 584 | case []bool: 585 | bools := indexes.([]bool) 586 | if len(bools) != l { 587 | return nil, fmt.Errorf("indexing error: index dimensions mismatch") 588 | } 589 | for i, b := range bools { 590 | if b { 591 | idx = append(idx, i) 592 | } 593 | } 594 | case Series: 595 | s := indexes.(Series) 596 | if err := s.Err; err != nil { 597 | return nil, fmt.Errorf("indexing error: new values has errors: %v", err) 598 | } 599 | if s.HasNaN() { 600 | return nil, fmt.Errorf("indexing error: indexes contain NaN") 601 | } 602 | switch s.t { 603 | case Int: 604 | return s.Int() 605 | case Bool: 606 | bools, err := s.Bool() 607 | if err != nil { 608 | return nil, fmt.Errorf("indexing error: %v", err) 609 | } 610 | return parseIndexes(l, bools) 611 | default: 612 | return nil, fmt.Errorf("indexing error: unknown indexing mode") 613 | } 614 | default: 615 | return nil, fmt.Errorf("indexing error: unknown indexing mode") 616 | } 617 | return idx, nil 618 | } 619 | 620 | // Order returns the indexes for sorting a Series. NaN elements are pushed to the 621 | // end by order of appearance. 
622 | func (s Series) Order(reverse bool) []int { 623 | var ie indexedElements 624 | var nasIdx []int 625 | for i := 0; i < s.Len(); i++ { 626 | e := s.elements.Elem(i) 627 | if e.IsNA() { 628 | nasIdx = append(nasIdx, i) 629 | } else { 630 | ie = append(ie, indexedElement{i, e}) 631 | } 632 | } 633 | var srt sort.Interface 634 | srt = ie 635 | if reverse { 636 | srt = sort.Reverse(srt) 637 | } 638 | sort.Sort(srt) 639 | var ret []int 640 | for _, e := range ie { 641 | ret = append(ret, e.index) 642 | } 643 | return append(ret, nasIdx...) 644 | } 645 | 646 | type indexedElement struct { 647 | index int 648 | element Element 649 | } 650 | 651 | type indexedElements []indexedElement 652 | 653 | func (e indexedElements) Len() int { return len(e) } 654 | func (e indexedElements) Less(i, j int) bool { return e[i].element.Less(e[j].element) } 655 | func (e indexedElements) Swap(i, j int) { e[i], e[j] = e[j], e[i] } 656 | 657 | // StdDev calculates the standard deviation of a series 658 | func (s Series) StdDev() float64 { 659 | stdDev := stat.StdDev(s.Float(), nil) 660 | return stdDev 661 | } 662 | 663 | // Mean calculates the average value of a series 664 | func (s Series) Mean() float64 { 665 | stdDev := stat.Mean(s.Float(), nil) 666 | return stdDev 667 | } 668 | 669 | // Median calculates the middle or median value, as opposed to 670 | // mean, and there is less susceptible to being affected by outliers. 671 | func (s Series) Median() float64 { 672 | if s.elements.Len() == 0 || 673 | s.Type() == String || 674 | s.Type() == Bool { 675 | return math.NaN() 676 | } 677 | ix := s.Order(false) 678 | newElem := make([]Element, len(ix)) 679 | 680 | for newpos, oldpos := range ix { 681 | newElem[newpos] = s.elements.Elem(oldpos) 682 | } 683 | 684 | // When length is odd, we just take length(list)/2 685 | // value as the median. 
686 | if len(newElem)%2 != 0 { 687 | return newElem[len(newElem)/2].Float() 688 | } 689 | // When length is even, we take middle two elements of 690 | // list and the median is an average of the two of them. 691 | return (newElem[(len(newElem)/2)-1].Float() + 692 | newElem[len(newElem)/2].Float()) * 0.5 693 | } 694 | 695 | // Max return the biggest element in the series 696 | func (s Series) Max() float64 { 697 | if s.elements.Len() == 0 || s.Type() == String { 698 | return math.NaN() 699 | } 700 | 701 | max := s.elements.Elem(0) 702 | for i := 1; i < s.elements.Len(); i++ { 703 | elem := s.elements.Elem(i) 704 | if elem.Greater(max) { 705 | max = elem 706 | } 707 | } 708 | return max.Float() 709 | } 710 | 711 | // MaxStr return the biggest element in a series of type String 712 | func (s Series) MaxStr() string { 713 | if s.elements.Len() == 0 || s.Type() != String { 714 | return "" 715 | } 716 | 717 | max := s.elements.Elem(0) 718 | for i := 1; i < s.elements.Len(); i++ { 719 | elem := s.elements.Elem(i) 720 | if elem.Greater(max) { 721 | max = elem 722 | } 723 | } 724 | return max.String() 725 | } 726 | 727 | // Min return the lowest element in the series 728 | func (s Series) Min() float64 { 729 | if s.elements.Len() == 0 || s.Type() == String { 730 | return math.NaN() 731 | } 732 | 733 | min := s.elements.Elem(0) 734 | for i := 1; i < s.elements.Len(); i++ { 735 | elem := s.elements.Elem(i) 736 | if elem.Less(min) { 737 | min = elem 738 | } 739 | } 740 | return min.Float() 741 | } 742 | 743 | // MinStr return the lowest element in a series of type String 744 | func (s Series) MinStr() string { 745 | if s.elements.Len() == 0 || s.Type() != String { 746 | return "" 747 | } 748 | 749 | min := s.elements.Elem(0) 750 | for i := 1; i < s.elements.Len(); i++ { 751 | elem := s.elements.Elem(i) 752 | if elem.Less(min) { 753 | min = elem 754 | } 755 | } 756 | return min.String() 757 | } 758 | 759 | // Quantile returns the sample of x such that x is greater than or 760 | 
// equal to the fraction p of samples.
// Note: gonum/stat panics when called with strings
func (s Series) Quantile(p float64) float64 {
	// String Series and empty Series yield NaN instead of reaching gonum.
	if s.Type() == String || s.Len() == 0 {
		return math.NaN()
	}

	// Values are handed over in sorted order (Order places NaN elements last).
	ordered := s.Subset(s.Order(false)).Float()

	return stat.Quantile(p, stat.Empirical, ordered, nil)
}

// Map applies the function f, which must match the MapFunction signature, to
// each element of the Series and returns a new Series built from the results.
// The function must be compatible with the underlying type of data in the
// Series: when working with a Float Series, for example, f is expected to
// handle Elements of type Float rather than some other type.
func (s Series) Map(f MapFunction) Series {

	mappedValues := make([]Element, s.Len())
	for i := 0; i < s.Len(); i++ {
		// f receives the Series' own element (Elem returns a pointer to it),
		// not a copy.
		value := f(s.elements.Elem(i))
		mappedValues[i] = value
	}
	// The result keeps the type and name of the original Series.
	return New(mappedValues, s.Type(), s.Name)
}
--------------------------------------------------------------------------------
/series/series_test.go:
--------------------------------------------------------------------------------
package series

import (
	"fmt"
	"math"
	"reflect"
	"testing"
	"strings"
)

// Check that there are no shared memory addresses between the elements of two Series
//func checkAddr(addra, addrb []string) error {
//for i := 0; i < len(addra); i++ {
//for j := 0; j < len(addrb); j++ {
//if addra[i] == "" || addrb[j] == "" {
//continue
//}
//if addra[i] == addrb[j] {
//return fmt.Errorf("found same address on\nA:%v\nB:%v", i, j)
//}
//}
//}
//return nil
//}

// Check that all the types on a Series are the
same type and that it matches with 27 | // Series.t 28 | func checkTypes(s Series) error { 29 | var types []Type 30 | for i := 0; i < s.Len(); i++ { 31 | e := s.elements.Elem(i) 32 | types = append(types, e.Type()) 33 | } 34 | for _, t := range types { 35 | if t != s.t { 36 | return fmt.Errorf("bad types for %v Series:\n%v", s.t, types) 37 | } 38 | } 39 | return nil 40 | } 41 | 42 | // compareFloats compares floating point values up to the number of digits specified. 43 | // Returns true if both values are equal with the given precision 44 | func compareFloats(lvalue, rvalue float64, digits int) bool { 45 | if math.IsNaN(lvalue) || math.IsNaN(rvalue) { 46 | return math.IsNaN(lvalue) && math.IsNaN(rvalue) 47 | } 48 | d := math.Pow(10.0, float64(digits)) 49 | lv := int(lvalue * d) 50 | rv := int(rvalue * d) 51 | return lv == rv 52 | } 53 | 54 | func TestSeries_Compare(t *testing.T) { 55 | table := []struct { 56 | series Series 57 | comparator Comparator 58 | comparando interface{} 59 | expected Series 60 | }{ 61 | { 62 | Strings([]string{"A", "B", "C", "B", "D", "BADA"}), 63 | Eq, 64 | "B", 65 | Bools([]bool{false, true, false, true, false, false}), 66 | }, 67 | { 68 | Strings([]string{"A", "B", "C", "B", "D", "BADA"}), 69 | Eq, 70 | []string{"B", "B", "C", "D", "A", "A"}, 71 | Bools([]bool{false, true, true, false, false, false}), 72 | }, 73 | { 74 | Ints([]int{0, 2, 1, 5, 9}), 75 | Eq, 76 | "2", 77 | Bools([]bool{false, true, false, false, false}), 78 | }, 79 | { 80 | Ints([]int{0, 2, 1, 5, 9}), 81 | Eq, 82 | []int{0, 2, 0, 5, 10}, 83 | Bools([]bool{true, true, false, true, false}), 84 | }, 85 | { 86 | Floats([]float64{0.1, 2, 1, 5, 9}), 87 | Eq, 88 | "2", 89 | Bools([]bool{false, true, false, false, false}), 90 | }, 91 | { 92 | Floats([]float64{0.1, 2, 1, 5, 9}), 93 | Eq, 94 | []float64{0.1, 2, 0, 5, 10}, 95 | Bools([]bool{true, true, false, true, false}), 96 | }, 97 | { 98 | Bools([]bool{true, true, false}), 99 | Eq, 100 | "true", 101 | Bools([]bool{true, true, 
false}), 102 | }, 103 | { 104 | Bools([]bool{true, true, false}), 105 | Eq, 106 | []bool{true, false, false}, 107 | Bools([]bool{true, false, true}), 108 | }, 109 | { 110 | Strings([]string{"A", "B", "C", "B", "D", "BADA"}), 111 | Neq, 112 | "B", 113 | Bools([]bool{true, false, true, false, true, true}), 114 | }, 115 | { 116 | Strings([]string{"A", "B", "C", "B", "D", "BADA"}), 117 | Neq, 118 | []string{"B", "B", "C", "D", "A", "A"}, 119 | Bools([]bool{true, false, false, true, true, true}), 120 | }, 121 | { 122 | Ints([]int{0, 2, 1, 5, 9}), 123 | Neq, 124 | "2", 125 | Bools([]bool{true, false, true, true, true}), 126 | }, 127 | { 128 | Ints([]int{0, 2, 1, 5, 9}), 129 | Neq, 130 | []int{0, 2, 0, 5, 10}, 131 | Bools([]bool{false, false, true, false, true}), 132 | }, 133 | { 134 | Floats([]float64{0.1, 2, 1, 5, 9}), 135 | Neq, 136 | "2", 137 | Bools([]bool{true, false, true, true, true}), 138 | }, 139 | { 140 | Floats([]float64{0.1, 2, 1, 5, 9}), 141 | Neq, 142 | []float64{0.1, 2, 0, 5, 10}, 143 | Bools([]bool{false, false, true, false, true}), 144 | }, 145 | { 146 | Bools([]bool{true, true, false}), 147 | Neq, 148 | "true", 149 | Bools([]bool{false, false, true}), 150 | }, 151 | { 152 | Bools([]bool{true, true, false}), 153 | Neq, 154 | []bool{true, false, false}, 155 | Bools([]bool{false, true, false}), 156 | }, 157 | { 158 | Strings([]string{"A", "B", "C", "B", "D", "BADA"}), 159 | Greater, 160 | "B", 161 | Bools([]bool{false, false, true, false, true, true}), 162 | }, 163 | { 164 | Strings([]string{"A", "B", "C", "B", "D", "BADA"}), 165 | Greater, 166 | []string{"B", "B", "C", "D", "A", "A"}, 167 | Bools([]bool{false, false, false, false, true, true}), 168 | }, 169 | { 170 | Ints([]int{0, 2, 1, 5, 9}), 171 | Greater, 172 | "2", 173 | Bools([]bool{false, false, false, true, true}), 174 | }, 175 | { 176 | Ints([]int{0, 2, 1, 5, 9}), 177 | Greater, 178 | []int{0, 2, 0, 5, 10}, 179 | Bools([]bool{false, false, true, false, false}), 180 | }, 181 | { 182 | 
Floats([]float64{0.1, 2, 1, 5, 9}), 183 | Greater, 184 | "2", 185 | Bools([]bool{false, false, false, true, true}), 186 | }, 187 | { 188 | Floats([]float64{0.1, 2, 1, 5, 9}), 189 | Greater, 190 | []float64{0.1, 2, 0, 5, 10}, 191 | Bools([]bool{false, false, true, false, false}), 192 | }, 193 | { 194 | Bools([]bool{true, true, false}), 195 | Greater, 196 | "true", 197 | Bools([]bool{false, false, false}), 198 | }, 199 | { 200 | Bools([]bool{true, true, false}), 201 | Greater, 202 | []bool{true, false, false}, 203 | Bools([]bool{false, true, false}), 204 | }, 205 | { 206 | Strings([]string{"A", "B", "C", "B", "D", "BADA"}), 207 | GreaterEq, 208 | "B", 209 | Bools([]bool{false, true, true, true, true, true}), 210 | }, 211 | { 212 | Strings([]string{"A", "B", "C", "B", "D", "BADA"}), 213 | GreaterEq, 214 | []string{"B", "B", "C", "D", "A", "A"}, 215 | Bools([]bool{false, true, true, false, true, true}), 216 | }, 217 | { 218 | Ints([]int{0, 2, 1, 5, 9}), 219 | GreaterEq, 220 | "2", 221 | Bools([]bool{false, true, false, true, true}), 222 | }, 223 | { 224 | Ints([]int{0, 2, 1, 5, 9}), 225 | GreaterEq, 226 | []int{0, 2, 0, 5, 10}, 227 | Bools([]bool{true, true, true, true, false}), 228 | }, 229 | { 230 | Floats([]float64{0.1, 2, 1, 5, 9}), 231 | GreaterEq, 232 | "2", 233 | Bools([]bool{false, true, false, true, true}), 234 | }, 235 | { 236 | Floats([]float64{0.1, 2, 1, 5, 9}), 237 | GreaterEq, 238 | []float64{0.1, 2, 0, 5, 10}, 239 | Bools([]bool{true, true, true, true, false}), 240 | }, 241 | { 242 | Bools([]bool{true, true, false}), 243 | GreaterEq, 244 | "true", 245 | Bools([]bool{true, true, false}), 246 | }, 247 | { 248 | Bools([]bool{true, true, false}), 249 | GreaterEq, 250 | []bool{true, false, false}, 251 | Bools([]bool{true, true, true}), 252 | }, 253 | { 254 | Strings([]string{"A", "B", "C", "B", "D", "BADA"}), 255 | Less, 256 | "B", 257 | Bools([]bool{true, false, false, false, false, false}), 258 | }, 259 | { 260 | Strings([]string{"A", "B", "C", "B", "D", 
"BADA"}), 261 | Less, 262 | []string{"B", "B", "C", "D", "A", "A"}, 263 | Bools([]bool{true, false, false, true, false, false}), 264 | }, 265 | { 266 | Ints([]int{0, 2, 1, 5, 9}), 267 | Less, 268 | "2", 269 | Bools([]bool{true, false, true, false, false}), 270 | }, 271 | { 272 | Ints([]int{0, 2, 1, 5, 9}), 273 | Less, 274 | []int{0, 2, 0, 5, 10}, 275 | Bools([]bool{false, false, false, false, true}), 276 | }, 277 | { 278 | Floats([]float64{0.1, 2, 1, 5, 9}), 279 | Less, 280 | "2", 281 | Bools([]bool{true, false, true, false, false}), 282 | }, 283 | { 284 | Floats([]float64{0.1, 2, 1, 5, 9}), 285 | Less, 286 | []float64{0.1, 2, 0, 5, 10}, 287 | Bools([]bool{false, false, false, false, true}), 288 | }, 289 | { 290 | Bools([]bool{true, true, false}), 291 | Less, 292 | "true", 293 | Bools([]bool{false, false, true}), 294 | }, 295 | { 296 | Bools([]bool{true, true, false}), 297 | Less, 298 | []bool{true, false, false}, 299 | Bools([]bool{false, false, false}), 300 | }, 301 | { 302 | Strings([]string{"A", "B", "C", "B", "D", "BADA"}), 303 | LessEq, 304 | "B", 305 | Bools([]bool{true, true, false, true, false, false}), 306 | }, 307 | { 308 | Strings([]string{"A", "B", "C", "B", "D", "BADA"}), 309 | LessEq, 310 | []string{"B", "B", "C", "D", "A", "A"}, 311 | Bools([]bool{true, true, true, true, false, false}), 312 | }, 313 | { 314 | Ints([]int{0, 2, 1, 5, 9}), 315 | LessEq, 316 | "2", 317 | Bools([]bool{true, true, true, false, false}), 318 | }, 319 | { 320 | Ints([]int{0, 2, 1, 5, 9}), 321 | LessEq, 322 | []int{0, 2, 0, 5, 10}, 323 | Bools([]bool{true, true, false, true, true}), 324 | }, 325 | { 326 | Floats([]float64{0.1, 2, 1, 5, 9}), 327 | LessEq, 328 | "2", 329 | Bools([]bool{true, true, true, false, false}), 330 | }, 331 | { 332 | Floats([]float64{0.1, 2, 1, 5, 9}), 333 | LessEq, 334 | []float64{0.1, 2, 0, 5, 10}, 335 | Bools([]bool{true, true, false, true, true}), 336 | }, 337 | { 338 | Bools([]bool{true, true, false}), 339 | LessEq, 340 | "true", 341 | 
Bools([]bool{true, true, true}), 342 | }, 343 | { 344 | Bools([]bool{true, true, false}), 345 | LessEq, 346 | []bool{true, false, false}, 347 | Bools([]bool{true, false, true}), 348 | }, 349 | { 350 | Strings([]string{"A", "B", "C", "B", "D", "BADA"}), 351 | In, 352 | "B", 353 | Bools([]bool{false, true, false, true, false, false}), 354 | }, 355 | { 356 | Strings([]string{"Hello", "world", "this", "is", "a", "test"}), 357 | In, 358 | []string{"cat", "world", "hello", "a"}, 359 | Bools([]bool{false, true, false, false, true, false}), 360 | }, 361 | { 362 | Ints([]int{0, 2, 1, 5, 9}), 363 | In, 364 | "2", 365 | Bools([]bool{false, true, false, false, false}), 366 | }, 367 | { 368 | Ints([]int{0, 2, 1, 5, 9}), 369 | In, 370 | []int{2, 99, 1234, 9}, 371 | Bools([]bool{false, true, false, false, true}), 372 | }, 373 | { 374 | Floats([]float64{0.1, 2, 1, 5, 9}), 375 | In, 376 | "2", 377 | Bools([]bool{false, true, false, false, false}), 378 | }, 379 | { 380 | Floats([]float64{0.1, 2, 1, 5, 9}), 381 | In, 382 | []float64{2, 99, 1234, 9}, 383 | Bools([]bool{false, true, false, false, true}), 384 | }, 385 | { 386 | Bools([]bool{true, true, false}), 387 | In, 388 | "true", 389 | Bools([]bool{true, true, false}), 390 | }, 391 | { 392 | Bools([]bool{true, true, false}), 393 | In, 394 | []bool{false, false, false}, 395 | Bools([]bool{false, false, true}), 396 | }, 397 | } 398 | for testnum, test := range table { 399 | a := test.series 400 | b := a.Compare(test.comparator, test.comparando) 401 | if err := b.Err; err != nil { 402 | t.Errorf("Test:%v\nError:%v", testnum, err) 403 | } 404 | expected := test.expected.Records() 405 | received := b.Records() 406 | if !reflect.DeepEqual(expected, received) { 407 | t.Errorf( 408 | "Test:%v\nExpected:\n%v\nReceived:\n%v", 409 | testnum, expected, received, 410 | ) 411 | } 412 | if err := checkTypes(b); err != nil { 413 | t.Errorf( 414 | "Test:%v\nError:%v", 415 | testnum, err, 416 | ) 417 | } 418 | //if err := checkAddr(a.Addr(), 
b.Addr()); err != nil { 419 | //t.Errorf("Test:%v\nError:%v\nA:%v\nB:%v", testnum, err, a.Addr(), b.Addr()) 420 | //} 421 | } 422 | } 423 | 424 | func TestSeries_Subset(t *testing.T) { 425 | table := []struct { 426 | series Series 427 | indexes Indexes 428 | expected string 429 | }{ 430 | { 431 | Strings([]string{"A", "B", "C", "K", "D"}), 432 | []int{2, 1, 4, 4, 0, 3}, 433 | "[C B D D A K]", 434 | }, 435 | { 436 | Strings([]string{"A", "B", "C", "K", "D"}), 437 | int(1), 438 | "[B]", 439 | }, 440 | { 441 | Strings([]string{"A", "B", "C", "K", "D"}), 442 | []bool{true, false, false, true, true}, 443 | "[A K D]", 444 | }, 445 | { 446 | Strings([]string{"A", "B", "C", "K", "D"}), 447 | Ints([]int{3, 2, 1, 0}), 448 | "[K C B A]", 449 | }, 450 | { 451 | Strings([]string{"A", "B", "C", "K", "D"}), 452 | Ints([]int{1}), 453 | "[B]", 454 | }, 455 | { 456 | Strings([]string{"A", "B", "C", "K", "D"}), 457 | Ints(2), 458 | "[C]", 459 | }, 460 | { 461 | Strings([]string{"A", "B", "C", "K", "D"}), 462 | Bools([]bool{true, false, false, true, true}), 463 | "[A K D]", 464 | }, 465 | } 466 | for testnum, test := range table { 467 | a := test.series 468 | b := a.Subset(test.indexes) 469 | if err := b.Err; err != nil { 470 | t.Errorf("Test:%v\nError:%v", testnum, err) 471 | } 472 | expected := test.expected 473 | received := fmt.Sprint(b) 474 | if expected != received { 475 | t.Errorf( 476 | "Test:%v\nExpected:\n%v\nReceived:\n%v", 477 | testnum, expected, received, 478 | ) 479 | } 480 | if err := checkTypes(b); err != nil { 481 | t.Errorf( 482 | "Test:%v\nError:%v", 483 | testnum, err, 484 | ) 485 | } 486 | //if err := checkAddr(a.Addr(), b.Addr()); err != nil { 487 | //t.Errorf("Test:%v\nError:%v\nA:%v\nB:%v", testnum, err, a.Addr(), b.Addr()) 488 | //} 489 | } 490 | } 491 | 492 | func TestSeries_Set(t *testing.T) { 493 | table := []struct { 494 | series Series 495 | indexes Indexes 496 | values Series 497 | expected string 498 | }{ 499 | { 500 | Strings([]string{"A", "B", "C", 
"K", "D"}), 501 | []int{1, 2, 4}, 502 | Ints([]string{"1", "2", "3"}), 503 | "[A 1 2 K 3]", 504 | }, 505 | { 506 | Strings([]string{"A", "B", "C", "K", "D"}), 507 | []bool{false, true, true, false, true}, 508 | Ints([]string{"1", "2", "3"}), 509 | "[A 1 2 K 3]", 510 | }, 511 | { 512 | Strings([]string{"A", "B", "C", "K", "D"}), 513 | Ints([]int{1, 2, 4}), 514 | Ints([]string{"1", "2", "3"}), 515 | "[A 1 2 K 3]", 516 | }, 517 | { 518 | Strings([]string{"A", "B", "C", "K", "D"}), 519 | Bools([]bool{false, true, true, false, true}), 520 | Ints([]string{"1", "2", "3"}), 521 | "[A 1 2 K 3]", 522 | }, 523 | } 524 | for testnum, test := range table { 525 | b := test.series.Set(test.indexes, test.values) 526 | if err := b.Err; err != nil { 527 | t.Errorf("Test:%v\nError:%v", testnum, err) 528 | } 529 | expected := test.expected 530 | received := fmt.Sprint(b) 531 | if expected != received { 532 | t.Errorf( 533 | "Test:%v\nExpected:\n%v\nReceived:\n%v", 534 | testnum, expected, received, 535 | ) 536 | } 537 | if err := checkTypes(b); err != nil { 538 | t.Errorf( 539 | "Test:%v\nError:%v", 540 | testnum, err, 541 | ) 542 | } 543 | //if err := checkAddr(test.values.Addr(), b.Addr()); err != nil { 544 | //t.Errorf("Test:%v\nError:%v\nNV:%v\nB:%v", testnum, err, test.values.Addr(), b.Addr()) 545 | //} 546 | } 547 | } 548 | 549 | func TestStrings(t *testing.T) { 550 | table := []struct { 551 | series Series 552 | expected string 553 | }{ 554 | { 555 | Strings([]string{"A", "B", "C", "D"}), 556 | "[A B C D]", 557 | }, 558 | { 559 | Strings([]string{"A"}), 560 | "[A]", 561 | }, 562 | { 563 | Strings("A"), 564 | "[A]", 565 | }, 566 | { 567 | Strings([]int{1, 2, 3}), 568 | "[1 2 3]", 569 | }, 570 | { 571 | Strings([]int{2}), 572 | "[2]", 573 | }, 574 | { 575 | Strings(-1), 576 | "[-1]", 577 | }, 578 | { 579 | Strings([]float64{1, 2, 3}), 580 | "[1.000000 2.000000 3.000000]", 581 | }, 582 | { 583 | Strings([]float64{2}), 584 | "[2.000000]", 585 | }, 586 | { 587 | Strings(-1.0), 588 | 
"[-1.000000]", 589 | }, 590 | { 591 | Strings(math.NaN()), 592 | "[NaN]", 593 | }, 594 | { 595 | Strings(math.Inf(1)), 596 | "[+Inf]", 597 | }, 598 | { 599 | Strings(math.Inf(-1)), 600 | "[-Inf]", 601 | }, 602 | { 603 | Strings([]bool{true, true, false}), 604 | "[true true false]", 605 | }, 606 | { 607 | Strings([]bool{false}), 608 | "[false]", 609 | }, 610 | { 611 | Strings(true), 612 | "[true]", 613 | }, 614 | { 615 | Strings([]int{}), 616 | "[]", 617 | }, 618 | { 619 | Strings(nil), 620 | "[NaN]", 621 | }, 622 | { 623 | Strings(Strings([]string{"A", "B", "C"})), 624 | "[A B C]", 625 | }, 626 | } 627 | for testnum, test := range table { 628 | if err := test.series.Err; err != nil { 629 | t.Errorf("Test:%v\nError:%v", testnum, err) 630 | } 631 | expected := test.expected 632 | received := fmt.Sprint(test.series) 633 | if expected != received { 634 | t.Errorf( 635 | "Test:%v\nExpected:\n%v\nReceived:\n%v", 636 | testnum, expected, received, 637 | ) 638 | } 639 | if err := checkTypes(test.series); err != nil { 640 | t.Errorf("Test:%v\nError:%v", testnum, err) 641 | } 642 | } 643 | } 644 | 645 | func TestInts(t *testing.T) { 646 | table := []struct { 647 | series Series 648 | expected string 649 | }{ 650 | { 651 | Ints([]string{"A", "B", "1", "2"}), 652 | "[NaN NaN 1 2]", 653 | }, 654 | { 655 | Ints([]string{"1"}), 656 | "[1]", 657 | }, 658 | { 659 | Ints("2"), 660 | "[2]", 661 | }, 662 | { 663 | Ints([]int{1, 2, 3}), 664 | "[1 2 3]", 665 | }, 666 | { 667 | Ints([]int{2}), 668 | "[2]", 669 | }, 670 | { 671 | Ints(-1), 672 | "[-1]", 673 | }, 674 | { 675 | Ints([]float64{1, 2, 3}), 676 | "[1 2 3]", 677 | }, 678 | { 679 | Ints([]float64{2}), 680 | "[2]", 681 | }, 682 | { 683 | Ints(-1.0), 684 | "[-1]", 685 | }, 686 | { 687 | Ints(math.NaN()), 688 | "[NaN]", 689 | }, 690 | { 691 | Ints(math.Inf(1)), 692 | "[NaN]", 693 | }, 694 | { 695 | Ints(math.Inf(-1)), 696 | "[NaN]", 697 | }, 698 | { 699 | Ints([]bool{true, true, false}), 700 | "[1 1 0]", 701 | }, 702 | { 703 | 
Ints([]bool{false}), 704 | "[0]", 705 | }, 706 | { 707 | Ints(true), 708 | "[1]", 709 | }, 710 | { 711 | Ints([]int{}), 712 | "[]", 713 | }, 714 | { 715 | Ints(nil), 716 | "[NaN]", 717 | }, 718 | { 719 | Ints(Strings([]string{"1", "2", "3"})), 720 | "[1 2 3]", 721 | }, 722 | { 723 | Ints(Ints([]string{"1", "2", "3"})), 724 | "[1 2 3]", 725 | }, 726 | } 727 | for testnum, test := range table { 728 | if err := test.series.Err; err != nil { 729 | t.Errorf("Test:%v\nError:%v", testnum, err) 730 | } 731 | expected := test.expected 732 | received := fmt.Sprint(test.series) 733 | if expected != received { 734 | t.Errorf( 735 | "Test:%v\nExpected:\n%v\nReceived:\n%v", 736 | testnum, expected, received, 737 | ) 738 | } 739 | if err := checkTypes(test.series); err != nil { 740 | t.Errorf("Test:%v\nError:%v", testnum, err) 741 | } 742 | } 743 | } 744 | 745 | func TestFloats(t *testing.T) { 746 | table := []struct { 747 | series Series 748 | expected string 749 | }{ 750 | { 751 | Floats([]string{"A", "B", "1", "2"}), 752 | "[NaN NaN 1.000000 2.000000]", 753 | }, 754 | { 755 | Floats([]string{"1"}), 756 | "[1.000000]", 757 | }, 758 | { 759 | Floats("2.1"), 760 | "[2.100000]", 761 | }, 762 | { 763 | Floats([]int{1, 2, 3}), 764 | "[1.000000 2.000000 3.000000]", 765 | }, 766 | { 767 | Floats([]int{2}), 768 | "[2.000000]", 769 | }, 770 | { 771 | Floats(-1), 772 | "[-1.000000]", 773 | }, 774 | { 775 | Floats([]float64{1.1, 2, 3}), 776 | "[1.100000 2.000000 3.000000]", 777 | }, 778 | { 779 | Floats([]float64{2}), 780 | "[2.000000]", 781 | }, 782 | { 783 | Floats(-1.0), 784 | "[-1.000000]", 785 | }, 786 | { 787 | Floats(math.NaN()), 788 | "[NaN]", 789 | }, 790 | { 791 | Floats(math.Inf(1)), 792 | "[+Inf]", 793 | }, 794 | { 795 | Floats(math.Inf(-1)), 796 | "[-Inf]", 797 | }, 798 | { 799 | Floats([]bool{true, true, false}), 800 | "[1.000000 1.000000 0.000000]", 801 | }, 802 | { 803 | Floats([]bool{false}), 804 | "[0.000000]", 805 | }, 806 | { 807 | Floats(true), 808 | "[1.000000]", 809 
| }, 810 | { 811 | Floats([]int{}), 812 | "[]", 813 | }, 814 | { 815 | Floats(nil), 816 | "[NaN]", 817 | }, 818 | { 819 | Floats(Strings([]string{"1", "2", "3"})), 820 | "[1.000000 2.000000 3.000000]", 821 | }, 822 | } 823 | for testnum, test := range table { 824 | if err := test.series.Err; err != nil { 825 | t.Errorf("Test:%v\nError:%v", testnum, err) 826 | } 827 | expected := test.expected 828 | received := fmt.Sprint(test.series) 829 | if expected != received { 830 | t.Errorf( 831 | "Test:%v\nExpected:\n%v\nReceived:\n%v", 832 | testnum, expected, received, 833 | ) 834 | } 835 | if err := checkTypes(test.series); err != nil { 836 | t.Errorf("Test:%v\nError:%v", testnum, err) 837 | } 838 | } 839 | } 840 | 841 | func TestBools(t *testing.T) { 842 | table := []struct { 843 | series Series 844 | expected string 845 | }{ 846 | { 847 | Bools([]string{"A", "true", "1", "f"}), 848 | "[NaN true true false]", 849 | }, 850 | { 851 | Bools([]string{"t"}), 852 | "[true]", 853 | }, 854 | { 855 | Bools("False"), 856 | "[false]", 857 | }, 858 | { 859 | Bools([]int{1, 2, 0}), 860 | "[true NaN false]", 861 | }, 862 | { 863 | Bools([]int{1}), 864 | "[true]", 865 | }, 866 | { 867 | Bools(-1), 868 | "[NaN]", 869 | }, 870 | { 871 | Bools([]float64{1, 2, 0}), 872 | "[true NaN false]", 873 | }, 874 | { 875 | Bools([]float64{0}), 876 | "[false]", 877 | }, 878 | { 879 | Bools(-1.0), 880 | "[NaN]", 881 | }, 882 | { 883 | Bools(math.NaN()), 884 | "[NaN]", 885 | }, 886 | { 887 | Bools(math.Inf(1)), 888 | "[NaN]", 889 | }, 890 | { 891 | Bools(math.Inf(-1)), 892 | "[NaN]", 893 | }, 894 | { 895 | Bools([]bool{true, true, false}), 896 | "[true true false]", 897 | }, 898 | { 899 | Bools([]bool{false}), 900 | "[false]", 901 | }, 902 | { 903 | Bools(true), 904 | "[true]", 905 | }, 906 | { 907 | Bools([]int{}), 908 | "[]", 909 | }, 910 | { 911 | Bools(nil), 912 | "[NaN]", 913 | }, 914 | { 915 | Bools(Strings([]string{"1", "0", "1"})), 916 | "[true false true]", 917 | }, 918 | } 919 | for testnum, 
test := range table { 920 | if err := test.series.Err; err != nil { 921 | t.Errorf("Test:%v\nError:%v", testnum, err) 922 | } 923 | expected := test.expected 924 | received := fmt.Sprint(test.series) 925 | if expected != received { 926 | t.Errorf( 927 | "Test:%v\nExpected:\n%v\nReceived:\n%v", 928 | testnum, expected, received, 929 | ) 930 | } 931 | if err := checkTypes(test.series); err != nil { 932 | t.Errorf("Test:%v\nError:%v", testnum, err) 933 | } 934 | } 935 | } 936 | 937 | func TestSeries_Copy(t *testing.T) { 938 | tests := []Series{ 939 | Strings([]string{"1", "2", "3", "a", "b", "c"}), 940 | Ints([]string{"1", "2", "3", "a", "b", "c"}), 941 | Floats([]string{"1", "2", "3", "a", "b", "c"}), 942 | Bools([]string{"1", "0", "1", "t", "f", "c"}), 943 | } 944 | for testnum, test := range tests { 945 | a := test 946 | b := a.Copy() 947 | if fmt.Sprint(a) != fmt.Sprint(b) { 948 | t.Error("Different values when copying String elements") 949 | } 950 | if err := b.Err; err != nil { 951 | t.Errorf("Test:%v\nError:%v", testnum, err) 952 | } 953 | if err := checkTypes(b); err != nil { 954 | t.Errorf("Test:%v\nError:%v", testnum, err) 955 | } 956 | //if err := checkAddr(a.Addr(), b.Addr()); err != nil { 957 | //t.Errorf("Test:%v\nError:%v\nA:%v\nB:%v", testnum, err, a.Addr(), b.Addr()) 958 | //} 959 | } 960 | } 961 | 962 | func TestSeries_Records(t *testing.T) { 963 | tests := []struct { 964 | series Series 965 | expected []string 966 | }{ 967 | { 968 | Strings([]string{"1", "2", "3", "a", "b", "c"}), 969 | []string{"1", "2", "3", "a", "b", "c"}, 970 | }, 971 | { 972 | Ints([]string{"1", "2", "3", "a", "b", "c"}), 973 | []string{"1", "2", "3", "NaN", "NaN", "NaN"}, 974 | }, 975 | { 976 | Floats([]string{"1", "2", "3", "a", "b", "c"}), 977 | []string{"1.000000", "2.000000", "3.000000", "NaN", "NaN", "NaN"}, 978 | }, 979 | { 980 | Bools([]string{"1", "0", "1", "t", "f", "c"}), 981 | []string{"true", "false", "true", "true", "false", "NaN"}, 982 | }, 983 | } 984 | for 
testnum, test := range tests { 985 | expected := test.expected 986 | received := test.series.Records() 987 | if !reflect.DeepEqual(expected, received) { 988 | t.Errorf( 989 | "Test:%v\nExpected:\n%v\nReceived:\n%v", 990 | testnum, expected, received, 991 | ) 992 | } 993 | } 994 | } 995 | 996 | func TestSeries_Float(t *testing.T) { 997 | precision := 0.0000001 998 | floatEquals := func(x, y []float64) bool { 999 | if len(x) != len(y) { 1000 | return false 1001 | } 1002 | for i := 0; i < len(x); i++ { 1003 | a := x[i] 1004 | b := y[i] 1005 | if (a-b) > precision || (b-a) > precision { 1006 | return false 1007 | } 1008 | } 1009 | return true 1010 | } 1011 | tests := []struct { 1012 | series Series 1013 | expected []float64 1014 | }{ 1015 | { 1016 | Strings([]string{"1", "2", "3", "a", "b", "c"}), 1017 | []float64{1, 2, 3, math.NaN(), math.NaN(), math.NaN()}, 1018 | }, 1019 | { 1020 | Ints([]string{"1", "2", "3", "a", "b", "c"}), 1021 | []float64{1, 2, 3, math.NaN(), math.NaN(), math.NaN()}, 1022 | }, 1023 | { 1024 | Floats([]string{"1", "2", "3", "a", "b", "c"}), 1025 | []float64{1, 2, 3, math.NaN(), math.NaN(), math.NaN()}, 1026 | }, 1027 | { 1028 | Bools([]string{"1", "0", "1", "t", "f", "c"}), 1029 | []float64{1, 0, 1, 1, 0, math.NaN()}, 1030 | }, 1031 | } 1032 | for testnum, test := range tests { 1033 | expected := test.expected 1034 | received := test.series.Float() 1035 | if !floatEquals(expected, received) { 1036 | t.Errorf( 1037 | "Test:%v\nExpected:\n%v\nReceived:\n%v", 1038 | testnum, expected, received, 1039 | ) 1040 | } 1041 | } 1042 | } 1043 | 1044 | func TestSeries_Concat(t *testing.T) { 1045 | tests := []struct { 1046 | a Series 1047 | b Series 1048 | expected []string 1049 | }{ 1050 | { 1051 | Strings([]string{"1", "2", "3"}), 1052 | Strings([]string{"a", "b", "c"}), 1053 | []string{"1", "2", "3", "a", "b", "c"}, 1054 | }, 1055 | { 1056 | Ints([]string{"1", "2", "3"}), 1057 | Ints([]string{"a", "4", "c"}), 1058 | []string{"1", "2", "3", "NaN", "4", 
"NaN"}, 1059 | }, 1060 | { 1061 | Floats([]string{"1", "2", "3"}), 1062 | Floats([]string{"a", "4", "c"}), 1063 | []string{"1.000000", "2.000000", "3.000000", "NaN", "4.000000", "NaN"}, 1064 | }, 1065 | { 1066 | Bools([]string{"1", "1", "0"}), 1067 | Bools([]string{"0", "0", "0"}), 1068 | []string{"true", "true", "false", "false", "false", "false"}, 1069 | }, 1070 | } 1071 | for testnum, test := range tests { 1072 | ab := test.a.Concat(test.b) 1073 | if err := ab.Err; err != nil { 1074 | t.Errorf("Test:%v\nError:%v", testnum, err) 1075 | } 1076 | received := ab.Records() 1077 | expected := test.expected 1078 | if !reflect.DeepEqual(expected, received) { 1079 | t.Errorf( 1080 | "Test:%v\nExpected:\n%v\nReceived:\n%v", 1081 | testnum, expected, received, 1082 | ) 1083 | } 1084 | //a := test.a 1085 | //b := ab 1086 | //if err := checkAddr(a.Addr(), b.Addr()); err != nil { 1087 | //t.Errorf("Test:%v\nError:%v\nA:%v\nAB:%v", testnum, err, a.Addr(), b.Addr()) 1088 | //} 1089 | //a = test.b 1090 | //b = ab 1091 | //if err := checkAddr(a.Addr(), b.Addr()); err != nil { 1092 | //t.Errorf("Test:%v\nError:%v\nB:%v\nAB:%v", testnum, err, a.Addr(), b.Addr()) 1093 | //} 1094 | } 1095 | } 1096 | 1097 | func TestSeries_Order(t *testing.T) { 1098 | tests := []struct { 1099 | series Series 1100 | reverse bool 1101 | expected []int 1102 | }{ 1103 | { 1104 | Ints([]string{"2", "1", "3", "NaN", "4", "NaN"}), 1105 | false, 1106 | []int{1, 0, 2, 4, 3, 5}, 1107 | }, 1108 | { 1109 | Floats([]string{"2", "1", "3", "NaN", "4", "NaN"}), 1110 | false, 1111 | []int{1, 0, 2, 4, 3, 5}, 1112 | }, 1113 | { 1114 | Strings([]string{"c", "b", "a"}), 1115 | false, 1116 | []int{2, 1, 0}, 1117 | }, 1118 | { 1119 | Bools([]bool{true, false, false, false, true}), 1120 | false, 1121 | []int{1, 2, 3, 0, 4}, 1122 | }, 1123 | { 1124 | Ints([]string{"2", "1", "3", "NaN", "4", "NaN"}), 1125 | true, 1126 | []int{4, 2, 0, 1, 3, 5}, 1127 | }, 1128 | { 1129 | Floats([]string{"2", "1", "3", "NaN", "4", "NaN"}), 1130 
| true, 1131 | []int{4, 2, 0, 1, 3, 5}, 1132 | }, 1133 | { 1134 | Strings([]string{"c", "b", "a"}), 1135 | true, 1136 | []int{0, 1, 2}, 1137 | }, 1138 | { 1139 | Bools([]bool{true, false, false, false, true}), 1140 | true, 1141 | []int{0, 4, 1, 2, 3}, 1142 | }, 1143 | } 1144 | for testnum, test := range tests { 1145 | received := test.series.Order(test.reverse) 1146 | expected := test.expected 1147 | if !reflect.DeepEqual(expected, received) { 1148 | t.Errorf( 1149 | "Test:%v\nExpected:\n%v\nReceived:\n%v", 1150 | testnum, expected, received, 1151 | ) 1152 | } 1153 | } 1154 | } 1155 | 1156 | func TestSeries_IsNaN(t *testing.T) { 1157 | tests := []struct { 1158 | series Series 1159 | expected []bool 1160 | }{ 1161 | { 1162 | Ints([]string{"2", "1", "3", "NaN", "4", "NaN"}), 1163 | []bool{false, false, false, true, false, true}, 1164 | }, 1165 | { 1166 | Floats([]string{"A", "1", "B", "3"}), 1167 | []bool{true, false, true, false}, 1168 | }, 1169 | } 1170 | for testnum, test := range tests { 1171 | received := test.series.IsNaN() 1172 | expected := test.expected 1173 | if !reflect.DeepEqual(expected, received) { 1174 | t.Errorf( 1175 | "Test:%v\nExpected:\n%v\nReceived:\n%v", 1176 | testnum, expected, received, 1177 | ) 1178 | } 1179 | } 1180 | } 1181 | 1182 | func TestSeries_StdDev(t *testing.T) { 1183 | tests := []struct { 1184 | series Series 1185 | expected float64 1186 | }{ 1187 | { 1188 | Ints([]int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}), 1189 | 3.02765, 1190 | }, 1191 | { 1192 | Floats([]float64{1.0, 2.0, 3.0}), 1193 | 1.0, 1194 | }, 1195 | { 1196 | Strings([]string{"A", "B", "C", "D"}), 1197 | math.NaN(), 1198 | }, 1199 | { 1200 | Bools([]bool{true, true, false, true}), 1201 | 0.5, 1202 | }, 1203 | { 1204 | Floats([]float64{}), 1205 | math.NaN(), 1206 | }, 1207 | } 1208 | 1209 | for testnum, test := range tests { 1210 | received := test.series.StdDev() 1211 | expected := test.expected 1212 | if !compareFloats(received, expected, 6) { 1213 | t.Errorf( 1214 | 
"Test:%v\nExpected:\n%v\nReceived:\n%v", 1215 | testnum, expected, received, 1216 | ) 1217 | } 1218 | } 1219 | } 1220 | 1221 | func TestSeries_Mean(t *testing.T) { 1222 | tests := []struct { 1223 | series Series 1224 | expected float64 1225 | }{ 1226 | { 1227 | Ints([]int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}), 1228 | 5.5, 1229 | }, 1230 | { 1231 | Floats([]float64{1.0, 2.0, 3.0}), 1232 | 2.0, 1233 | }, 1234 | { 1235 | Strings([]string{"A", "B", "C", "D"}), 1236 | math.NaN(), 1237 | }, 1238 | { 1239 | Bools([]bool{true, true, false, true}), 1240 | 0.75, 1241 | }, 1242 | { 1243 | Floats([]float64{}), 1244 | math.NaN(), 1245 | }, 1246 | } 1247 | 1248 | for testnum, test := range tests { 1249 | received := test.series.Mean() 1250 | expected := test.expected 1251 | if !compareFloats(received, expected, 6) { 1252 | t.Errorf( 1253 | "Test:%v\nExpected:\n%v\nReceived:\n%v", 1254 | testnum, expected, received, 1255 | ) 1256 | } 1257 | } 1258 | } 1259 | 1260 | func TestSeries_Max(t *testing.T) { 1261 | tests := []struct { 1262 | series Series 1263 | expected float64 1264 | }{ 1265 | { 1266 | Ints([]int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}), 1267 | 10, 1268 | }, 1269 | { 1270 | Floats([]float64{1.0, 2.0, 3.0}), 1271 | 3.0, 1272 | }, 1273 | { 1274 | Strings([]string{"A", "B", "C", "D"}), 1275 | math.NaN(), 1276 | }, 1277 | { 1278 | Bools([]bool{true, true, false, true}), 1279 | 1.0, 1280 | }, 1281 | { 1282 | Floats([]float64{}), 1283 | math.NaN(), 1284 | }, 1285 | } 1286 | 1287 | for testnum, test := range tests { 1288 | received := test.series.Max() 1289 | expected := test.expected 1290 | if !compareFloats(received, expected, 6) { 1291 | t.Errorf( 1292 | "Test:%v\nExpected:\n%v\nReceived:\n%v", 1293 | testnum, expected, received, 1294 | ) 1295 | } 1296 | } 1297 | } 1298 | 1299 | func TestSeries_Median(t *testing.T) { 1300 | tests := []struct { 1301 | series Series 1302 | expected float64 1303 | }{ 1304 | { 1305 | // Extreme observations should not factor in. 
1306 | Ints([]int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 1000, 10000}), 1307 | 7, 1308 | }, 1309 | { 1310 | // Change in order should not influence result. 1311 | Ints([]int{1, 2, 3, 10, 100, 1000, 10000, 4, 5, 6, 7, 8, 9}), 1312 | 7, 1313 | }, 1314 | { 1315 | Floats([]float64{20.2755, 4.98964, -20.2006, 1.19854, 1.89977, 1316 | 1.51178, -17.4687, 4.65567, -8.65952, 6.31649, 1317 | }), 1318 | 1.705775, 1319 | }, 1320 | { 1321 | // Change in order should not influence result. 1322 | Floats([]float64{4.98964, -20.2006, 1.89977, 1.19854, 1323 | 1.51178, -17.4687, -8.65952, 20.2755, 4.65567, 6.31649, 1324 | }), 1325 | 1.705775, 1326 | }, 1327 | { 1328 | Strings([]string{"A", "B", "C", "D"}), 1329 | math.NaN(), 1330 | }, 1331 | { 1332 | Bools([]bool{true, true, false, true}), 1333 | math.NaN(), 1334 | }, 1335 | { 1336 | Floats([]float64{}), 1337 | math.NaN(), 1338 | }, 1339 | } 1340 | 1341 | for testnum, test := range tests { 1342 | received := test.series.Median() 1343 | expected := test.expected 1344 | if !compareFloats(received, expected, 6) { 1345 | t.Errorf( 1346 | "Test:%v\nExpected:\n%v\nReceived:\n%v", 1347 | testnum, expected, received, 1348 | ) 1349 | } 1350 | } 1351 | } 1352 | 1353 | func TestSeries_Min(t *testing.T) { 1354 | tests := []struct { 1355 | series Series 1356 | expected float64 1357 | }{ 1358 | { 1359 | Ints([]int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}), 1360 | 1.0, 1361 | }, 1362 | { 1363 | Floats([]float64{1.0, 2.0, 3.0}), 1364 | 1.0, 1365 | }, 1366 | { 1367 | Strings([]string{"A", "B", "C", "D"}), 1368 | math.NaN(), 1369 | }, 1370 | { 1371 | Bools([]bool{true, true, false, true}), 1372 | 0.0, 1373 | }, 1374 | { 1375 | Floats([]float64{}), 1376 | math.NaN(), 1377 | }, 1378 | } 1379 | 1380 | for testnum, test := range tests { 1381 | received := test.series.Min() 1382 | expected := test.expected 1383 | if !compareFloats(received, expected, 6) { 1384 | t.Errorf( 1385 | "Test:%v\nExpected:\n%v\nReceived:\n%v", 1386 | testnum, expected, received, 1387 | ) 1388 | }
1389 | } 1390 | } 1391 | 1392 | func TestSeries_MaxStr(t *testing.T) { 1393 | tests := []struct { 1394 | series Series 1395 | expected string 1396 | }{ 1397 | { 1398 | Ints([]int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}), 1399 | "", 1400 | }, 1401 | { 1402 | Floats([]float64{1.0, 2.0, 3.0}), 1403 | "", 1404 | }, 1405 | { 1406 | Strings([]string{"A", "B", "C", "D"}), 1407 | "D", 1408 | }, 1409 | { 1410 | Strings([]string{"quick", "Brown", "fox", "Lazy", "dog"}), 1411 | "quick", 1412 | }, 1413 | { 1414 | Bools([]bool{true, true, false, true}), 1415 | "", 1416 | }, 1417 | { 1418 | Floats([]float64{}), 1419 | "", 1420 | }, 1421 | } 1422 | 1423 | for testnum, test := range tests { 1424 | received := test.series.MaxStr() 1425 | expected := test.expected 1426 | if received != expected { 1427 | t.Errorf( 1428 | "Test:%v\nExpected:\n%v\nReceived:\n%v", 1429 | testnum, expected, received, 1430 | ) 1431 | } 1432 | } 1433 | } 1434 | 1435 | func TestSeries_MinStr(t *testing.T) { 1436 | tests := []struct { 1437 | series Series 1438 | expected string 1439 | }{ 1440 | { 1441 | Ints([]int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}), 1442 | "", 1443 | }, 1444 | { 1445 | Floats([]float64{1.0, 2.0, 3.0}), 1446 | "", 1447 | }, 1448 | { 1449 | Strings([]string{"A", "B", "C", "D"}), 1450 | "A", 1451 | }, 1452 | { 1453 | Strings([]string{"quick", "Brown", "fox", "Lazy", "dog"}), 1454 | "Brown", 1455 | }, 1456 | { 1457 | Bools([]bool{true, true, false, true}), 1458 | "", 1459 | }, 1460 | { 1461 | Floats([]float64{}), 1462 | "", 1463 | }, 1464 | } 1465 | 1466 | for testnum, test := range tests { 1467 | received := test.series.MinStr() 1468 | expected := test.expected 1469 | if received != expected { 1470 | t.Errorf( 1471 | "Test:%v\nExpected:\n%v\nReceived:\n%v", 1472 | testnum, expected, received, 1473 | ) 1474 | } 1475 | } 1476 | } 1477 | 1478 | func TestSeries_Quantile(t *testing.T) { 1479 | tests := []struct { 1480 | series Series 1481 | p float64 1482 | expected float64 1483 | }{ 1484 | { 1485 | 
Ints([]int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}), 1486 | 0.9, 1487 | 9, 1488 | }, 1489 | { 1490 | Floats([]float64{3.141592, math.Sqrt(3), 2.718281, math.Sqrt(2)}), 1491 | 0.8, 1492 | 3.141592, 1493 | }, 1494 | { 1495 | Floats([]float64{1.0, 2.0, 3.0}), 1496 | 0.5, 1497 | 2.0, 1498 | }, 1499 | { 1500 | Strings([]string{"A", "B", "C", "D"}), 1501 | 0.25, 1502 | math.NaN(), 1503 | }, 1504 | { 1505 | Bools([]bool{false, false, false, true}), 1506 | 0.75, 1507 | 0.0, 1508 | }, 1509 | { 1510 | Floats([]float64{}), 1511 | 0.50, 1512 | math.NaN(), 1513 | }, 1514 | } 1515 | 1516 | for testnum, test := range tests { 1517 | received := test.series.Quantile(test.p) 1518 | expected := test.expected 1519 | if !compareFloats(received, expected, 6) { 1520 | t.Errorf( 1521 | "Test:%v\nExpected:\n%v\nReceived:\n%v", 1522 | testnum, expected, received, 1523 | ) 1524 | } 1525 | } 1526 | } 1527 | 1528 | 1529 | func TestSeries_Map(t *testing.T) { 1530 | tests := []struct { 1531 | series Series 1532 | expected Series 1533 | }{ 1534 | { 1535 | Bools([]bool{false, true, false, false, true}), 1536 | Bools([]bool{false, true, false, false, true}), 1537 | }, 1538 | { 1539 | Floats([]float64{1.5, -3.23, -0.337397, -0.380079, 1.60979, 34.}), 1540 | Floats([]float64{3, -6.46, -0.674794, -0.760158, 3.21958, 68.}), 1541 | }, 1542 | { 1543 | Floats([]float64{math.Pi, math.Phi, math.SqrtE, math.Cbrt(64)}), 1544 | Floats([]float64{2 * math.Pi, 2 * math.Phi, 2 * math.SqrtE, 2 * math.Cbrt(64)}), 1545 | }, 1546 | { 1547 | Strings([]string{"XyZApple", "XyZBanana", "XyZCitrus", "XyZDragonfruit"}), 1548 | Strings([]string{"Apple", "Banana", "Citrus", "Dragonfruit"}), 1549 | }, 1550 | { 1551 | Strings([]string{"San Francisco", "XyZTokyo", "MoscowXyZ", "XyzSydney"}), 1552 | Strings([]string{"San Francisco", "Tokyo", "MoscowXyZ", "XyzSydney"}), 1553 | }, 1554 | { 1555 | Ints([]int{23, 13, 101, -64, -3}), 1556 | Ints([]int{28, 18, 106, -59, 2}), 1557 | }, 1558 | { 1559 | Ints([]string{"morning", "noon", 
"afternoon", "evening", "night"}), 1560 | Ints([]int{5, 5, 5, 5, 5}), 1561 | }, 1562 | } 1563 | 1564 | doubleFloat64 := func(e Element) Element { 1565 | var result Element 1566 | result = e.Copy() 1567 | result.Set(result.Float() * 2) 1568 | return Element(result) 1569 | } 1570 | 1571 | // and two booleans 1572 | and := func(e Element) Element { 1573 | var result Element 1574 | result = e.Copy() 1575 | b, err := result.Bool() 1576 | if err != nil { 1577 | t.Errorf("%v", err) 1578 | return Element(nil) 1579 | } 1580 | result.Set(b && true) 1581 | return Element(result) 1582 | } 1583 | 1584 | // add constant (+5) to value (v) 1585 | add5Int := func(e Element) Element { 1586 | var result Element 1587 | result = e.Copy() 1588 | i, err := result.Int() 1589 | if err != nil { 1590 | return Element(&intElement{ 1591 | e: +5, 1592 | nan: false, 1593 | }) 1594 | } 1595 | result.Set(i + 5) 1596 | return Element(result) 1597 | } 1598 | 1599 | // trim (XyZ) prefix from string 1600 | trimXyZPrefix := func(e Element) Element { 1601 | var result Element 1602 | result = e.Copy() 1603 | result.Set(strings.TrimPrefix(result.String(), "XyZ")) 1604 | return Element(result) 1605 | } 1606 | 1607 | for testnum, test := range tests { 1608 | switch test.series.Type() { 1609 | case Bool: 1610 | expected := test.expected 1611 | received := test.series.Map(and) 1612 | for i := 0 ; i maxRows { 140 | shortening = true 141 | df = df.Subset(idx) 142 | records = df.Records() 143 | } else { 144 | records = df.Records() 145 | } 146 | 147 | if showDims { 148 | str += fmt.Sprintf("[%dx%d] %s\n\n", nrows, ncols, class) 149 | } 150 | 151 | // Add the row numbers 152 | for i := 0; i < df.nrows+1; i++ { 153 | add := "" 154 | if i != 0 { 155 | add = strconv.Itoa(i-1) + ":" 156 | } 157 | records[i] = append([]string{add}, records[i]...) 158 | } 159 | if shortening { 160 | dots := make([]string, ncols+1) 161 | for i := 1; i < ncols+1; i++ { 162 | dots[i] = "..." 
163 | } 164 | records = append(records, dots) 165 | } 166 | types := df.Types() 167 | typesrow := make([]string, ncols) 168 | for i := 0; i < ncols; i++ { 169 | typesrow[i] = fmt.Sprintf("<%v>", types[i]) 170 | } 171 | typesrow = append([]string{""}, typesrow...) 172 | 173 | if showTypes { 174 | records = append(records, typesrow) 175 | } 176 | 177 | maxChars := make([]int, df.ncols+1) 178 | for i := 0; i < len(records); i++ { 179 | for j := 0; j < df.ncols+1; j++ { 180 | // Escape special characters 181 | records[i][j] = strconv.Quote(records[i][j]) 182 | records[i][j] = records[i][j][1 : len(records[i][j])-1] 183 | 184 | // Detect maximum number of characters per column 185 | if len(records[i][j]) > maxChars[j] { 186 | maxChars[j] = utf8.RuneCountInString(records[i][j]) 187 | } 188 | } 189 | } 190 | maxCols := len(records[0]) 191 | var notShowing []string 192 | if shortCols { 193 | maxCharsCum := 0 194 | for colnum, m := range maxChars { 195 | maxCharsCum += m 196 | if maxCharsCum > maxCharsTotal { 197 | maxCols = colnum 198 | break 199 | } 200 | } 201 | notShowingNames := records[0][maxCols:] 202 | notShowingTypes := typesrow[maxCols:] 203 | notShowing = make([]string, len(notShowingNames)) 204 | for i := 0; i < len(notShowingNames); i++ { 205 | notShowing[i] = fmt.Sprintf("%s %s", notShowingNames[i], notShowingTypes[i]) 206 | } 207 | } 208 | for i := 0; i < len(records); i++ { 209 | // Add right padding to all elements 210 | records[i][0] = addLeftPadding(records[i][0], maxChars[0]+1) 211 | for j := 1; j < df.ncols+1; j++ { 212 | records[i][j] = addRightPadding(records[i][j], maxChars[j]) 213 | } 214 | records[i] = records[i][0:maxCols] 215 | if shortCols && len(notShowing) != 0 { 216 | records[i] = append(records[i], "...") 217 | } 218 | // Create the final string 219 | str += strings.Join(records[i], " ") 220 | str += "\n" 221 | } 222 | if shortCols && len(notShowing) != 0 { 223 | var notShown string 224 | var notShownArr [][]string 225 | cum := 0 226 | i := 
0 227 | for n, ns := range notShowing { 228 | cum += len(ns) 229 | if cum > maxCharsTotal { 230 | notShownArr = append(notShownArr, notShowing[i:n]) 231 | cum = 0 232 | i = n 233 | } 234 | } 235 | if i < len(notShowing) { 236 | notShownArr = append(notShownArr, notShowing[i:len(notShowing)]) 237 | } 238 | for k, ns := range notShownArr { 239 | notShown += strings.Join(ns, ", ") 240 | if k != len(notShownArr)-1 { 241 | notShown += "," 242 | } 243 | notShown += "\n" 244 | } 245 | str += fmt.Sprintf("\nNot Showing: %s", notShown) 246 | } 247 | return str 248 | } 249 | 250 | // Subsetting, mutating and transforming DataFrame methods 251 | // ======================================================= 252 | 253 | // Set will update the values of a DataFrame for all rows selected via indexes. 254 | func (df DataFrame) Set(indexes series.Indexes, newvalues DataFrame) DataFrame { 255 | if df.Err != nil { 256 | return df 257 | } 258 | if newvalues.Err != nil { 259 | return DataFrame{Err: fmt.Errorf("argument has errors: %v", newvalues.Err)} 260 | } 261 | if df.ncols != newvalues.ncols { 262 | return DataFrame{Err: fmt.Errorf("different number of columns")} 263 | } 264 | columns := make([]series.Series, df.ncols) 265 | for i, s := range df.columns { 266 | columns[i] = s.Set(indexes, newvalues.columns[i]) 267 | if columns[i].Err != nil { 268 | df = DataFrame{Err: fmt.Errorf("setting error on column %d: %v", i, columns[i].Err)} 269 | return df 270 | } 271 | } 272 | return df 273 | } 274 | 275 | // Subset returns a subset of the rows of the original DataFrame based on the 276 | // Series subsetting indexes. 277 | func (df DataFrame) Subset(indexes series.Indexes) DataFrame { 278 | if df.Err != nil { 279 | return df 280 | } 281 | columns := make([]series.Series, df.ncols) 282 | for i, column := range df.columns { 283 | s := column.Subset(indexes) 284 | columns[i] = s 285 | } 286 | nrows, ncols, err := checkColumnsDimensions(columns...) 
287 | if err != nil { 288 | return DataFrame{Err: err} 289 | } 290 | return DataFrame{ 291 | columns: columns, 292 | ncols: ncols, 293 | nrows: nrows, 294 | } 295 | } 296 | 297 | // SelectIndexes are the supported indexes used for the DataFrame.Select method. Currently supported are: 298 | // 299 | // int // Matches the given index number 300 | // []int // Matches all given index numbers 301 | // []bool // Matches all columns marked as true 302 | // string // Matches the column with the matching column name 303 | // []string // Matches all columns with the matching column names 304 | // Series [Int] // Same as []int 305 | // Series [Bool] // Same as []bool 306 | // Series [String] // Same as []string 307 | type SelectIndexes interface{} 308 | 309 | // Select returns a new DataFrame holding copies of the given columns, in the order the indexes list them. 310 | func (df DataFrame) Select(indexes SelectIndexes) DataFrame { 311 | if df.Err != nil { 312 | return df 313 | } 314 | idx, err := parseSelectIndexes(df.ncols, indexes, df.Names()) 315 | if err != nil { 316 | return DataFrame{Err: fmt.Errorf("can't select columns: %v", err)} 317 | } 318 | columns := make([]series.Series, len(idx)) 319 | for k, i := range idx { 320 | if i < 0 || i >= df.ncols { 321 | return DataFrame{Err: fmt.Errorf("can't select columns: index out of range")} 322 | } 323 | columns[k] = df.columns[i].Copy() 324 | } 325 | nrows, ncols, err := checkColumnsDimensions(columns...)
326 | if err != nil { 327 | return DataFrame{Err: err} 328 | } 329 | df = DataFrame{ 330 | columns: columns, 331 | ncols: ncols, 332 | nrows: nrows, 333 | } 334 | colnames := df.Names() 335 | fixColnames(colnames) 336 | for i, colname := range colnames { 337 | df.columns[i].Name = colname 338 | } 339 | return df 340 | } 341 | 342 | // Drop the given DataFrame columns 343 | func (df DataFrame) Drop(indexes SelectIndexes) DataFrame { 344 | if df.Err != nil { 345 | return df 346 | } 347 | idx, err := parseSelectIndexes(df.ncols, indexes, df.Names()) 348 | if err != nil { 349 | return DataFrame{Err: fmt.Errorf("can't select columns: %v", err)} 350 | } 351 | var columns []series.Series 352 | for k, col := range df.columns { 353 | if !inIntSlice(k, idx) { 354 | columns = append(columns, col.Copy()) 355 | } 356 | } 357 | nrows, ncols, err := checkColumnsDimensions(columns...) 358 | if err != nil { 359 | return DataFrame{Err: err} 360 | } 361 | df = DataFrame{ 362 | columns: columns, 363 | ncols: ncols, 364 | nrows: nrows, 365 | } 366 | colnames := df.Names() 367 | fixColnames(colnames) 368 | for i, colname := range colnames { 369 | df.columns[i].Name = colname 370 | } 371 | return df 372 | } 373 | 374 | // Rename changes the name of one of the columns of a DataFrame 375 | func (df DataFrame) Rename(newname, oldname string) DataFrame { 376 | if df.Err != nil { 377 | return df 378 | } 379 | // Check that colname exist on dataframe 380 | colnames := df.Names() 381 | idx := findInStringSlice(oldname, colnames) 382 | if idx == -1 { 383 | return DataFrame{Err: fmt.Errorf("rename: can't find column name")} 384 | } 385 | 386 | copy := df.Copy() 387 | copy.columns[idx].Name = newname 388 | return copy 389 | } 390 | 391 | // CBind combines the columns of this DataFrame and dfb DataFrame. 
392 | func (df DataFrame) CBind(dfb DataFrame) DataFrame { 393 | if df.Err != nil { 394 | return df 395 | } 396 | if dfb.Err != nil { 397 | return dfb 398 | } 399 | cols := append(df.columns, dfb.columns...) 400 | return New(cols...) 401 | } 402 | 403 | // RBind matches the column names of two DataFrames and returns combined 404 | // rows from both of them. 405 | func (df DataFrame) RBind(dfb DataFrame) DataFrame { 406 | if df.Err != nil { 407 | return df 408 | } 409 | if dfb.Err != nil { 410 | return dfb 411 | } 412 | expandedSeries := make([]series.Series, df.ncols) 413 | for k, v := range df.Names() { 414 | idx := findInStringSlice(v, dfb.Names()) 415 | if idx == -1 { 416 | return DataFrame{Err: fmt.Errorf("rbind: column names are not compatible")} 417 | } 418 | 419 | originalSeries := df.columns[k] 420 | addedSeries := dfb.columns[idx] 421 | newSeries := originalSeries.Concat(addedSeries) 422 | if err := newSeries.Err; err != nil { 423 | return DataFrame{Err: fmt.Errorf("rbind: %v", err)} 424 | } 425 | expandedSeries[k] = newSeries 426 | } 427 | return New(expandedSeries...) 428 | } 429 | 430 | // Mutate changes a column of the DataFrame with the given Series or adds it as 431 | // a new column if the column name does not exist. 432 | func (df DataFrame) Mutate(s series.Series) DataFrame { 433 | if df.Err != nil { 434 | return df 435 | } 436 | if s.Len() != df.nrows { 437 | return DataFrame{Err: fmt.Errorf("mutate: wrong dimensions")} 438 | } 439 | df = df.Copy() 440 | // Check that colname exist on dataframe 441 | columns := df.columns 442 | if idx := findInStringSlice(s.Name, df.Names()); idx != -1 { 443 | columns[idx] = s 444 | } else { 445 | columns = append(columns, s) 446 | } 447 | nrows, ncols, err := checkColumnsDimensions(columns...) 
448 | if err != nil { 449 | return DataFrame{Err: err} 450 | } 451 | df = DataFrame{ 452 | columns: columns, 453 | ncols: ncols, 454 | nrows: nrows, 455 | } 456 | colnames := df.Names() 457 | fixColnames(colnames) 458 | for i, colname := range colnames { 459 | df.columns[i].Name = colname 460 | } 461 | return df 462 | } 463 | 464 | // F is the filtering structure 465 | type F struct { 466 | Colname string 467 | Comparator series.Comparator 468 | Comparando interface{} 469 | } 470 | 471 | // Filter will filter the rows of a DataFrame based on the given filters. All 472 | // filters on the argument of a Filter call are aggregated as an OR operation 473 | // whereas if we chain Filter calls, every filter will act as an AND operation 474 | // with regards to the rest. 475 | func (df DataFrame) Filter(filters ...F) DataFrame { 476 | if df.Err != nil { 477 | return df 478 | } 479 | compResults := make([]series.Series, len(filters)) 480 | for i, f := range filters { 481 | idx := findInStringSlice(f.Colname, df.Names()) 482 | if idx < 0 { 483 | return DataFrame{Err: fmt.Errorf("filter: can't find column name")} 484 | } 485 | res := df.columns[idx].Compare(f.Comparator, f.Comparando) 486 | if err := res.Err; err != nil { 487 | return DataFrame{Err: fmt.Errorf("filter: %v", err)} 488 | } 489 | compResults[i] = res 490 | } 491 | // Join compResults via "OR" 492 | if len(compResults) == 0 { 493 | return df.Copy() 494 | } 495 | res, err := compResults[0].Bool() 496 | if err != nil { 497 | return DataFrame{Err: fmt.Errorf("filter: %v", err)} 498 | } 499 | for i := 1; i < len(compResults); i++ { 500 | nextRes, err := compResults[i].Bool() 501 | if err != nil { 502 | return DataFrame{Err: fmt.Errorf("filter: %v", err)} 503 | } 504 | for j := 0; j < len(res); j++ { 505 | res[j] = res[j] || nextRes[j] 506 | } 507 | } 508 | return df.Subset(res) 509 | } 510 | 511 | // Order is the ordering structure 512 | type Order struct { 513 | Colname string 514 | Reverse bool 515 | } 516 | 517 | 
// Sort return an ordering structure for regular column sorting sort. 518 | func Sort(colname string) Order { 519 | return Order{colname, false} 520 | } 521 | 522 | // RevSort return an ordering structure for reverse column sorting. 523 | func RevSort(colname string) Order { 524 | return Order{colname, true} 525 | } 526 | 527 | // Arrange sort the rows of a DataFrame according to the given Order 528 | func (df DataFrame) Arrange(order ...Order) DataFrame { 529 | if df.Err != nil { 530 | return df 531 | } 532 | if order == nil || len(order) == 0 { 533 | return DataFrame{Err: fmt.Errorf("rename: no arguments")} 534 | } 535 | 536 | // Check that all colnames exist before starting to sort 537 | for i := 0; i < len(order); i++ { 538 | colname := order[i].Colname 539 | if df.colIndex(colname) == -1 { 540 | return DataFrame{Err: fmt.Errorf("colname %s doesn't exist", colname)} 541 | } 542 | } 543 | 544 | // Initialize the index that will be used to store temporary and final order 545 | // results. 
546 | origIdx := make([]int, df.nrows) 547 | for i := 0; i < df.nrows; i++ { 548 | origIdx[i] = i 549 | } 550 | 551 | swapOrigIdx := func(newidx []int) { 552 | newOrigIdx := make([]int, len(newidx)) 553 | for k, i := range newidx { 554 | newOrigIdx[k] = origIdx[i] 555 | } 556 | origIdx = newOrigIdx 557 | } 558 | 559 | suborder := origIdx 560 | for i := len(order) - 1; i >= 0; i-- { 561 | colname := order[i].Colname 562 | idx := df.colIndex(colname) 563 | nextSeries := df.columns[idx].Subset(suborder) 564 | suborder = nextSeries.Order(order[i].Reverse) 565 | swapOrigIdx(suborder) 566 | } 567 | return df.Subset(origIdx) 568 | } 569 | 570 | // Capply applies the given function to the columns of a DataFrame 571 | func (df DataFrame) Capply(f func(series.Series) series.Series) DataFrame { 572 | if df.Err != nil { 573 | return df 574 | } 575 | columns := make([]series.Series, df.ncols) 576 | for i, s := range df.columns { 577 | applied := f(s) 578 | applied.Name = s.Name 579 | columns[i] = applied 580 | } 581 | return New(columns...) 582 | } 583 | 584 | // Rapply applies the given function to the rows of a DataFrame. Prior to applying 585 | // the function the elements of each row are cast to a Series of a specific 586 | // type. In order of priority: String -> Float -> Int -> Bool. This casting also 587 | // takes place after the function application to equalize the type of the columns. 
588 | func (df DataFrame) Rapply(f func(series.Series) series.Series) DataFrame { 589 | if df.Err != nil { 590 | return df 591 | } 592 | 593 | detectType := func(types []series.Type) series.Type { 594 | var hasStrings, hasFloats, hasInts, hasBools bool 595 | for _, t := range types { 596 | switch t { 597 | case series.String: 598 | hasStrings = true 599 | case series.Float: 600 | hasFloats = true 601 | case series.Int: 602 | hasInts = true 603 | case series.Bool: 604 | hasBools = true 605 | } 606 | } 607 | switch { 608 | case hasStrings: 609 | return series.String 610 | case hasBools: 611 | return series.Bool 612 | case hasFloats: 613 | return series.Float 614 | case hasInts: 615 | return series.Int 616 | default: 617 | panic("type not supported") 618 | } 619 | } 620 | 621 | // Detect row type prior to function application 622 | types := df.Types() 623 | rowType := detectType(types) 624 | 625 | // Create Element matrix 626 | elements := make([][]series.Element, df.nrows) 627 | rowlen := -1 628 | for i := 0; i < df.nrows; i++ { 629 | row := series.New(nil, rowType, "").Empty() 630 | for _, col := range df.columns { 631 | row.Append(col.Elem(i)) 632 | } 633 | row = f(row) 634 | if row.Err != nil { 635 | return DataFrame{Err: fmt.Errorf("error applying function on row %d: %v", i, row.Err)} 636 | } 637 | 638 | if rowlen != -1 && rowlen != row.Len() { 639 | return DataFrame{Err: fmt.Errorf("error applying function: rows have different lengths")} 640 | } 641 | rowlen = row.Len() 642 | 643 | rowElems := make([]series.Element, rowlen) 644 | for j := 0; j < rowlen; j++ { 645 | rowElems[j] = row.Elem(j) 646 | } 647 | elements[i] = rowElems 648 | } 649 | 650 | // Cast columns if necessary 651 | columns := make([]series.Series, rowlen) 652 | for j := 0; j < rowlen; j++ { 653 | types := make([]series.Type, df.nrows) 654 | for i := 0; i < df.nrows; i++ { 655 | types[i] = elements[i][j].Type() 656 | } 657 | colType := detectType(types) 658 | s := series.New(nil, colType, 
"").Empty() 659 | for i := 0; i < df.nrows; i++ { 660 | s.Append(elements[i][j]) 661 | } 662 | columns[j] = s 663 | } 664 | 665 | nrows, ncols, err := checkColumnsDimensions(columns...) 666 | if err != nil { 667 | return DataFrame{Err: err} 668 | } 669 | df = DataFrame{ 670 | columns: columns, 671 | ncols: ncols, 672 | nrows: nrows, 673 | } 674 | colnames := df.Names() 675 | fixColnames(colnames) 676 | for i, colname := range colnames { 677 | df.columns[i].Name = colname 678 | } 679 | return df 680 | } 681 | 682 | // Read/Write Methods 683 | // ================= 684 | 685 | // LoadOption is the type used to configure the load of elements 686 | type LoadOption func(*loadOptions) 687 | 688 | type loadOptions struct { 689 | // Specifies which is the default type in case detectTypes is disabled. 690 | defaultType series.Type 691 | 692 | // If set, the type of each column will be automatically detected unless 693 | // otherwise specified. 694 | detectTypes bool 695 | 696 | // If set, the first row of the tabular structure will be used as column 697 | // names. 698 | hasHeader bool 699 | 700 | // The names to set as columns names. 701 | names []string 702 | 703 | // Defines which values are going to be considered as NaN when parsing from string. 704 | nanValues []string 705 | 706 | // Defines the csv delimiter 707 | delimiter rune 708 | 709 | // Defines the comment delimiter 710 | comment rune 711 | 712 | // The types of specific columns can be specified via column name. 713 | types map[string]series.Type 714 | } 715 | 716 | // DefaultType sets the defaultType option for loadOptions. 717 | func DefaultType(t series.Type) LoadOption { 718 | return func(c *loadOptions) { 719 | c.defaultType = t 720 | } 721 | } 722 | 723 | // DetectTypes sets the detectTypes option for loadOptions. 724 | func DetectTypes(b bool) LoadOption { 725 | return func(c *loadOptions) { 726 | c.detectTypes = b 727 | } 728 | } 729 | 730 | // HasHeader sets the hasHeader option for loadOptions. 
731 | func HasHeader(b bool) LoadOption { 732 | return func(c *loadOptions) { 733 | c.hasHeader = b 734 | } 735 | } 736 | 737 | // Names sets the names option for loadOptions. 738 | func Names(names ...string) LoadOption { 739 | return func(c *loadOptions) { 740 | c.names = names 741 | } 742 | } 743 | 744 | // NaNValues sets the nanValues option for loadOptions. 745 | func NaNValues(nanValues []string) LoadOption { 746 | return func(c *loadOptions) { 747 | c.nanValues = nanValues 748 | } 749 | } 750 | 751 | // WithTypes sets the types option for loadOptions. 752 | func WithTypes(coltypes map[string]series.Type) LoadOption { 753 | return func(c *loadOptions) { 754 | c.types = coltypes 755 | } 756 | } 757 | 758 | // WithDelimiter sets the csv delimiter other than ',', for example '\t' 759 | func WithDelimiter(b rune) LoadOption { 760 | return func(c *loadOptions) { 761 | c.delimiter = b 762 | } 763 | } 764 | 765 | // WithComments sets the csv comment line detect to remove lines 766 | func WithComments(b rune) LoadOption { 767 | return func(c *loadOptions) { 768 | c.comment = b 769 | } 770 | } 771 | 772 | // LoadStructs creates a new DataFrame from arbitrary struct slices. 773 | // 774 | // LoadStructs will ignore unexported fields inside an struct. Note also that 775 | // unless otherwise specified the column names will correspond with the name of 776 | // the field. 777 | // 778 | // You can configure each field with the `dataframe:"name[,type]"` struct 779 | // tag. If the name on the tag is the empty string `""` the field name will be 780 | // used instead. If the name is `"-"` the field will be ignored. 781 | // 782 | // Examples: 783 | // 784 | // // field will be ignored 785 | // field int 786 | // 787 | // // Field will be ignored 788 | // Field int `dataframe:"-"` 789 | // 790 | // // Field will be parsed with column name Field and type int 791 | // Field int 792 | // 793 | // // Field will be parsed with column name `field_column` and type int. 
794 | // Field int `dataframe:"field_column"` 795 | // 796 | // // Field will be parsed with column name `field` and type string. 797 | // Field int `dataframe:"field,string"` 798 | // 799 | // // Field will be parsed with column name `Field` and type string. 800 | // Field int `dataframe:",string"` 801 | // 802 | // If the struct tags and the given LoadOptions contradict each other, the later 803 | // will have preference over the former. 804 | func LoadStructs(i interface{}, options ...LoadOption) DataFrame { 805 | if i == nil { 806 | return DataFrame{Err: fmt.Errorf("load: can't create DataFrame from value")} 807 | } 808 | 809 | // Set the default load options 810 | cfg := loadOptions{ 811 | defaultType: series.String, 812 | detectTypes: true, 813 | hasHeader: true, 814 | nanValues: []string{"NA", "NaN", ""}, 815 | } 816 | 817 | // Set any custom load options 818 | for _, option := range options { 819 | option(&cfg) 820 | } 821 | 822 | tpy, val := reflect.TypeOf(i), reflect.ValueOf(i) 823 | switch tpy.Kind() { 824 | case reflect.Slice: 825 | if tpy.Elem().Kind() != reflect.Struct { 826 | return DataFrame{Err: fmt.Errorf( 827 | "load: type %s (%s %s) is not supported, must be []struct", tpy.Name(), tpy.Elem().Kind(), tpy.Kind())} 828 | } 829 | if val.Len() == 0 { 830 | return DataFrame{Err: fmt.Errorf("load: can't create DataFrame from empty slice")} 831 | } 832 | 833 | numFields := val.Index(0).Type().NumField() 834 | var columns []series.Series 835 | for j := 0; j < numFields; j++ { 836 | // Extract field metadata 837 | if !val.Index(0).Field(j).CanInterface() { 838 | continue 839 | } 840 | field := val.Index(0).Type().Field(j) 841 | fieldName := field.Name 842 | fieldType := field.Type.String() 843 | 844 | // Process struct tags 845 | fieldTags := field.Tag.Get("dataframe") 846 | if fieldTags == "-" { 847 | continue 848 | } 849 | tagOpts := strings.Split(fieldTags, ",") 850 | if len(tagOpts) > 2 { 851 | return DataFrame{Err: fmt.Errorf("malformed struct tag on 
field %s: %s", fieldName, fieldTags)} 852 | } 853 | if len(tagOpts) > 0 { 854 | if name := strings.TrimSpace(tagOpts[0]); name != "" { 855 | fieldName = name 856 | } 857 | if len(tagOpts) == 2 { 858 | if tagType := strings.TrimSpace(tagOpts[1]); tagType != "" { 859 | fieldType = tagType 860 | } 861 | } 862 | } 863 | 864 | // Handle `types` option 865 | var t series.Type 866 | if cfgtype, ok := cfg.types[fieldName]; ok { 867 | t = cfgtype 868 | } else { 869 | // Handle `detectTypes` option 870 | if cfg.detectTypes { 871 | // Parse field type 872 | parsedType, err := parseType(fieldType) 873 | if err != nil { 874 | return DataFrame{Err: err} 875 | } 876 | t = parsedType 877 | } else { 878 | t = cfg.defaultType 879 | } 880 | } 881 | 882 | // Create Series for this field 883 | elements := make([]interface{}, val.Len()) 884 | for i := 0; i < val.Len(); i++ { 885 | fieldValue := val.Index(i).Field(j) 886 | elements[i] = fieldValue.Interface() 887 | 888 | // Handle `nanValues` option 889 | if findInStringSlice(fmt.Sprint(elements[i]), cfg.nanValues) != -1 { 890 | elements[i] = nil 891 | } 892 | } 893 | 894 | // Handle `hasHeader` option 895 | if !cfg.hasHeader { 896 | tmp := make([]interface{}, 1) 897 | tmp[0] = fieldName 898 | elements = append(tmp, elements...) 899 | fieldName = "" 900 | } 901 | columns = append(columns, series.New(elements, t, fieldName)) 902 | } 903 | return New(columns...) 
904 | } 905 | return DataFrame{Err: fmt.Errorf( 906 | "load: type %s (%s) is not supported, must be []struct", tpy.Name(), tpy.Kind())} 907 | } 908 | 909 | func parseType(s string) (series.Type, error) { 910 | switch s { 911 | case "float", "float64", "float32": 912 | return series.Float, nil 913 | case "int", "int64", "int32", "int16", "int8": 914 | return series.Int, nil 915 | case "string": 916 | return series.String, nil 917 | case "bool": 918 | return series.Bool, nil 919 | } 920 | return "", fmt.Errorf("type (%s) is not supported", s) 921 | } 922 | 923 | // LoadRecords creates a new DataFrame based on the given records. 924 | func LoadRecords(records [][]string, options ...LoadOption) DataFrame { 925 | // Set the default load options 926 | cfg := loadOptions{ 927 | defaultType: series.String, 928 | detectTypes: true, 929 | hasHeader: true, 930 | nanValues: []string{"NA", "NaN", ""}, 931 | } 932 | 933 | // Set any custom load options 934 | for _, option := range options { 935 | option(&cfg) 936 | } 937 | 938 | if len(records) == 0 { 939 | return DataFrame{Err: fmt.Errorf("load records: empty DataFrame")} 940 | } 941 | if cfg.hasHeader && len(records) <= 1 { 942 | return DataFrame{Err: fmt.Errorf("load records: empty DataFrame")} 943 | } 944 | if cfg.names != nil && len(cfg.names) != len(records[0]) { 945 | if len(cfg.names) > len(records[0]) { 946 | return DataFrame{Err: fmt.Errorf("load records: too many column names")} 947 | } 948 | return DataFrame{Err: fmt.Errorf("load records: not enough column names")} 949 | } 950 | 951 | // Extract headers 952 | headers := make([]string, len(records[0])) 953 | if cfg.hasHeader { 954 | headers = records[0] 955 | records = records[1:] 956 | } 957 | if cfg.names != nil { 958 | headers = cfg.names 959 | } 960 | 961 | types := make([]series.Type, len(headers)) 962 | rawcols := make([][]string, len(headers)) 963 | for i, colname := range headers { 964 | rawcol := make([]string, len(records)) 965 | for j := 0; j < 
len(records); j++ { 966 | rawcol[j] = records[j][i] 967 | if findInStringSlice(rawcol[j], cfg.nanValues) != -1 { 968 | rawcol[j] = "NaN" 969 | } 970 | } 971 | rawcols[i] = rawcol 972 | 973 | t, ok := cfg.types[colname] 974 | if !ok { 975 | t = cfg.defaultType 976 | if cfg.detectTypes { 977 | if l, err := findType(rawcol); err == nil { 978 | t = l 979 | } 980 | } 981 | } 982 | types[i] = t 983 | } 984 | 985 | columns := make([]series.Series, len(headers)) 986 | for i, colname := range headers { 987 | col := series.New(rawcols[i], types[i], colname) 988 | if col.Err != nil { 989 | return DataFrame{Err: col.Err} 990 | } 991 | columns[i] = col 992 | } 993 | nrows, ncols, err := checkColumnsDimensions(columns...) 994 | if err != nil { 995 | return DataFrame{Err: err} 996 | } 997 | df := DataFrame{ 998 | columns: columns, 999 | ncols: ncols, 1000 | nrows: nrows, 1001 | } 1002 | 1003 | colnames := df.Names() 1004 | fixColnames(colnames) 1005 | for i, colname := range colnames { 1006 | df.columns[i].Name = colname 1007 | } 1008 | return df 1009 | } 1010 | 1011 | // LoadMaps creates a new DataFrame based on the given maps. This function assumes 1012 | // that every map on the array represents a row of observations. 
1013 | func LoadMaps(maps []map[string]interface{}, options ...LoadOption) DataFrame { 1014 | if len(maps) == 0 { 1015 | return DataFrame{Err: fmt.Errorf("load maps: empty array")} 1016 | } 1017 | inStrSlice := func(i string, s []string) bool { 1018 | for _, v := range s { 1019 | if v == i { 1020 | return true 1021 | } 1022 | } 1023 | return false 1024 | } 1025 | // Detect all colnames 1026 | var colnames []string 1027 | for _, v := range maps { 1028 | for k := range v { 1029 | if exists := inStrSlice(k, colnames); !exists { 1030 | colnames = append(colnames, k) 1031 | } 1032 | } 1033 | } 1034 | sort.Strings(colnames) 1035 | records := make([][]string, len(maps)+1) 1036 | records[0] = colnames 1037 | for k, m := range maps { 1038 | row := make([]string, len(colnames)) 1039 | for i, colname := range colnames { 1040 | element := "" 1041 | val, ok := m[colname] 1042 | if ok { 1043 | element = fmt.Sprint(val) 1044 | } 1045 | row[i] = element 1046 | } 1047 | records[k+1] = row 1048 | } 1049 | return LoadRecords(records, options...) 1050 | } 1051 | 1052 | // LoadMatrix loads the given Matrix as a DataFrame 1053 | // TODO: Add Loadoptions 1054 | func LoadMatrix(mat Matrix) DataFrame { 1055 | nrows, ncols := mat.Dims() 1056 | columns := make([]series.Series, ncols) 1057 | for i := 0; i < ncols; i++ { 1058 | floats := make([]float64, nrows) 1059 | for j := 0; j < nrows; j++ { 1060 | floats[j] = mat.At(j, i) 1061 | } 1062 | columns[i] = series.Floats(floats) 1063 | } 1064 | nrows, ncols, err := checkColumnsDimensions(columns...) 
1065 | if err != nil { 1066 | return DataFrame{Err: err} 1067 | } 1068 | df := DataFrame{ 1069 | columns: columns, 1070 | ncols: ncols, 1071 | nrows: nrows, 1072 | } 1073 | colnames := df.Names() 1074 | fixColnames(colnames) 1075 | for i, colname := range colnames { 1076 | df.columns[i].Name = colname 1077 | } 1078 | return df 1079 | } 1080 | 1081 | // ReadCSV reads a CSV file from a io.Reader and builds a DataFrame with the 1082 | // resulting records. 1083 | func ReadCSV(r io.Reader, options ...LoadOption) DataFrame { 1084 | csvReader := csv.NewReader(r) 1085 | cfg := loadOptions{ 1086 | delimiter: ',', 1087 | } 1088 | for _, option := range options { 1089 | option(&cfg) 1090 | } 1091 | if cfg.delimiter != ',' { 1092 | csvReader.Comma = cfg.delimiter 1093 | } 1094 | if cfg.comment != 0 { 1095 | csvReader.Comment = cfg.comment 1096 | } 1097 | 1098 | records, err := csvReader.ReadAll() 1099 | if err != nil { 1100 | return DataFrame{Err: err} 1101 | } 1102 | return LoadRecords(records, options...) 1103 | } 1104 | 1105 | // ReadJSON reads a JSON array from a io.Reader and builds a DataFrame with the 1106 | // resulting records. 1107 | func ReadJSON(r io.Reader, options ...LoadOption) DataFrame { 1108 | var m []map[string]interface{} 1109 | err := json.NewDecoder(r).Decode(&m) 1110 | if err != nil { 1111 | return DataFrame{Err: err} 1112 | } 1113 | return LoadMaps(m, options...) 1114 | } 1115 | 1116 | // WriteOption is the type used to configure the writing of elements 1117 | type WriteOption func(*writeOptions) 1118 | 1119 | type writeOptions struct { 1120 | // Specifies whether the header is also written 1121 | writeHeader bool 1122 | } 1123 | 1124 | // WriteHeader sets the writeHeader option for writeOptions. 1125 | func WriteHeader(b bool) WriteOption { 1126 | return func(c *writeOptions) { 1127 | c.writeHeader = b 1128 | } 1129 | } 1130 | 1131 | // WriteCSV writes the DataFrame to the given io.Writer as a CSV file. 
1132 | func (df DataFrame) WriteCSV(w io.Writer, options ...WriteOption) error { 1133 | if df.Err != nil { 1134 | return df.Err 1135 | } 1136 | 1137 | // Set the default write options 1138 | cfg := writeOptions{ 1139 | writeHeader: true, 1140 | } 1141 | 1142 | // Set any custom write options 1143 | for _, option := range options { 1144 | option(&cfg) 1145 | } 1146 | 1147 | records := df.Records() 1148 | if !cfg.writeHeader { 1149 | records = records[1:] 1150 | } 1151 | 1152 | return csv.NewWriter(w).WriteAll(records) 1153 | } 1154 | 1155 | // WriteJSON writes the DataFrame to the given io.Writer as a JSON array. 1156 | func (df DataFrame) WriteJSON(w io.Writer) error { 1157 | if df.Err != nil { 1158 | return df.Err 1159 | } 1160 | return json.NewEncoder(w).Encode(df.Maps()) 1161 | } 1162 | 1163 | // Getters/Setters for DataFrame fields 1164 | // ==================================== 1165 | 1166 | // Names returns the name of the columns on a DataFrame. 1167 | func (df DataFrame) Names() []string { 1168 | colnames := make([]string, df.ncols) 1169 | for i, s := range df.columns { 1170 | colnames[i] = s.Name 1171 | } 1172 | return colnames 1173 | } 1174 | 1175 | // Types returns the types of the columns on a DataFrame. 1176 | func (df DataFrame) Types() []series.Type { 1177 | coltypes := make([]series.Type, df.ncols) 1178 | for i, s := range df.columns { 1179 | coltypes[i] = s.Type() 1180 | } 1181 | return coltypes 1182 | } 1183 | 1184 | // SetNames changes the column names of a DataFrame to the ones passed as an 1185 | // argument. 1186 | func (df DataFrame) SetNames(colnames ...string) error { 1187 | if df.Err != nil { 1188 | return df.Err 1189 | } 1190 | if len(colnames) != df.ncols { 1191 | return fmt.Errorf("setting names: wrong dimensions") 1192 | } 1193 | for k, s := range colnames { 1194 | df.columns[k].Name = s 1195 | } 1196 | return nil 1197 | } 1198 | 1199 | // Dims retrieves the dimensions of a DataFrame. 
1200 | func (df DataFrame) Dims() (int, int) { 1201 | return df.Nrow(), df.Ncol() 1202 | } 1203 | 1204 | // Nrow returns the number of rows on a DataFrame. 1205 | func (df DataFrame) Nrow() int { 1206 | return df.nrows 1207 | } 1208 | 1209 | // Ncol returns the number of columns on a DataFrame. 1210 | func (df DataFrame) Ncol() int { 1211 | return df.ncols 1212 | } 1213 | 1214 | // Col returns a copy of the Series with the given column name contained in the DataFrame. 1215 | func (df DataFrame) Col(colname string) series.Series { 1216 | if df.Err != nil { 1217 | return series.Series{Err: df.Err} 1218 | } 1219 | // Check that colname exist on dataframe 1220 | idx := findInStringSlice(colname, df.Names()) 1221 | if idx < 0 { 1222 | return series.Series{Err: fmt.Errorf("unknown column name")} 1223 | } 1224 | return df.columns[idx].Copy() 1225 | } 1226 | 1227 | // InnerJoin returns a DataFrame containing the inner join of two DataFrames. 1228 | func (df DataFrame) InnerJoin(b DataFrame, keys ...string) DataFrame { 1229 | if len(keys) == 0 { 1230 | return DataFrame{Err: fmt.Errorf("join keys not specified")} 1231 | } 1232 | // Check that we have all given keys in both DataFrames 1233 | var iKeysA []int 1234 | var iKeysB []int 1235 | var errorArr []string 1236 | for _, key := range keys { 1237 | i := df.colIndex(key) 1238 | if i < 0 { 1239 | errorArr = append(errorArr, fmt.Sprintf("can't find key %q on left DataFrame", key)) 1240 | } 1241 | iKeysA = append(iKeysA, i) 1242 | j := b.colIndex(key) 1243 | if j < 0 { 1244 | errorArr = append(errorArr, fmt.Sprintf("can't find key %q on right DataFrame", key)) 1245 | } 1246 | iKeysB = append(iKeysB, j) 1247 | } 1248 | if len(errorArr) != 0 { 1249 | return DataFrame{Err: fmt.Errorf(strings.Join(errorArr, "\n"))} 1250 | } 1251 | 1252 | aCols := df.columns 1253 | bCols := b.columns 1254 | // Initialize newCols 1255 | var newCols []series.Series 1256 | for _, i := range iKeysA { 1257 | newCols = append(newCols, aCols[i].Empty()) 
1258 | } 1259 | var iNotKeysA []int 1260 | for i := 0; i < df.ncols; i++ { 1261 | if !inIntSlice(i, iKeysA) { 1262 | iNotKeysA = append(iNotKeysA, i) 1263 | newCols = append(newCols, aCols[i].Empty()) 1264 | } 1265 | } 1266 | var iNotKeysB []int 1267 | for i := 0; i < b.ncols; i++ { 1268 | if !inIntSlice(i, iKeysB) { 1269 | iNotKeysB = append(iNotKeysB, i) 1270 | newCols = append(newCols, bCols[i].Empty()) 1271 | } 1272 | } 1273 | 1274 | // Fill newCols 1275 | for i := 0; i < df.nrows; i++ { 1276 | for j := 0; j < b.nrows; j++ { 1277 | match := true 1278 | for k := range keys { 1279 | aElem := aCols[iKeysA[k]].Elem(i) 1280 | bElem := bCols[iKeysB[k]].Elem(j) 1281 | match = match && aElem.Eq(bElem) 1282 | } 1283 | if match { 1284 | ii := 0 1285 | for _, k := range iKeysA { 1286 | elem := aCols[k].Elem(i) 1287 | newCols[ii].Append(elem) 1288 | ii++ 1289 | } 1290 | for _, k := range iNotKeysA { 1291 | elem := aCols[k].Elem(i) 1292 | newCols[ii].Append(elem) 1293 | ii++ 1294 | } 1295 | for _, k := range iNotKeysB { 1296 | elem := bCols[k].Elem(j) 1297 | newCols[ii].Append(elem) 1298 | ii++ 1299 | } 1300 | } 1301 | } 1302 | } 1303 | return New(newCols...) 1304 | } 1305 | 1306 | // LeftJoin returns a DataFrame containing the left join of two DataFrames. 
1307 | func (df DataFrame) LeftJoin(b DataFrame, keys ...string) DataFrame { 1308 | if len(keys) == 0 { 1309 | return DataFrame{Err: fmt.Errorf("join keys not specified")} 1310 | } 1311 | // Check that we have all given keys in both DataFrames 1312 | var iKeysA []int 1313 | var iKeysB []int 1314 | var errorArr []string 1315 | for _, key := range keys { 1316 | i := df.colIndex(key) 1317 | if i < 0 { 1318 | errorArr = append(errorArr, fmt.Sprintf("can't find key %q on left DataFrame", key)) 1319 | } 1320 | iKeysA = append(iKeysA, i) 1321 | j := b.colIndex(key) 1322 | if j < 0 { 1323 | errorArr = append(errorArr, fmt.Sprintf("can't find key %q on right DataFrame", key)) 1324 | } 1325 | iKeysB = append(iKeysB, j) 1326 | } 1327 | if len(errorArr) != 0 { 1328 | return DataFrame{Err: fmt.Errorf(strings.Join(errorArr, "\n"))} 1329 | } 1330 | 1331 | aCols := df.columns 1332 | bCols := b.columns 1333 | // Initialize newCols 1334 | var newCols []series.Series 1335 | for _, i := range iKeysA { 1336 | newCols = append(newCols, aCols[i].Empty()) 1337 | } 1338 | var iNotKeysA []int 1339 | for i := 0; i < df.ncols; i++ { 1340 | if !inIntSlice(i, iKeysA) { 1341 | iNotKeysA = append(iNotKeysA, i) 1342 | newCols = append(newCols, aCols[i].Empty()) 1343 | } 1344 | } 1345 | var iNotKeysB []int 1346 | for i := 0; i < b.ncols; i++ { 1347 | if !inIntSlice(i, iKeysB) { 1348 | iNotKeysB = append(iNotKeysB, i) 1349 | newCols = append(newCols, bCols[i].Empty()) 1350 | } 1351 | } 1352 | 1353 | // Fill newCols 1354 | for i := 0; i < df.nrows; i++ { 1355 | matched := false 1356 | for j := 0; j < b.nrows; j++ { 1357 | match := true 1358 | for k := range keys { 1359 | aElem := aCols[iKeysA[k]].Elem(i) 1360 | bElem := bCols[iKeysB[k]].Elem(j) 1361 | match = match && aElem.Eq(bElem) 1362 | } 1363 | if match { 1364 | matched = true 1365 | ii := 0 1366 | for _, k := range iKeysA { 1367 | elem := aCols[k].Elem(i) 1368 | newCols[ii].Append(elem) 1369 | ii++ 1370 | } 1371 | for _, k := range iNotKeysA { 
1372 | elem := aCols[k].Elem(i) 1373 | newCols[ii].Append(elem) 1374 | ii++ 1375 | } 1376 | for _, k := range iNotKeysB { 1377 | elem := bCols[k].Elem(j) 1378 | newCols[ii].Append(elem) 1379 | ii++ 1380 | } 1381 | } 1382 | } 1383 | if !matched { 1384 | ii := 0 1385 | for _, k := range iKeysA { 1386 | elem := aCols[k].Elem(i) 1387 | newCols[ii].Append(elem) 1388 | ii++ 1389 | } 1390 | for _, k := range iNotKeysA { 1391 | elem := aCols[k].Elem(i) 1392 | newCols[ii].Append(elem) 1393 | ii++ 1394 | } 1395 | // for _ = range iNotKeysB { 1396 | // newCols[ii].Append(nil) 1397 | // ii++ 1398 | // } 1399 | 1400 | for _, k := range iNotKeysB { 1401 | _ = k 1402 | newCols[ii].Append(nil) 1403 | ii++ 1404 | } 1405 | } 1406 | } 1407 | return New(newCols...) 1408 | } 1409 | 1410 | // RightJoin returns a DataFrame containing the right join of two DataFrames. 1411 | func (df DataFrame) RightJoin(b DataFrame, keys ...string) DataFrame { 1412 | if len(keys) == 0 { 1413 | return DataFrame{Err: fmt.Errorf("join keys not specified")} 1414 | } 1415 | // Check that we have all given keys in both DataFrames 1416 | var iKeysA []int 1417 | var iKeysB []int 1418 | var errorArr []string 1419 | for _, key := range keys { 1420 | i := df.colIndex(key) 1421 | if i < 0 { 1422 | errorArr = append(errorArr, fmt.Sprintf("can't find key %q on left DataFrame", key)) 1423 | } 1424 | iKeysA = append(iKeysA, i) 1425 | j := b.colIndex(key) 1426 | if j < 0 { 1427 | errorArr = append(errorArr, fmt.Sprintf("can't find key %q on right DataFrame", key)) 1428 | } 1429 | iKeysB = append(iKeysB, j) 1430 | } 1431 | if len(errorArr) != 0 { 1432 | return DataFrame{Err: fmt.Errorf(strings.Join(errorArr, "\n"))} 1433 | } 1434 | 1435 | aCols := df.columns 1436 | bCols := b.columns 1437 | // Initialize newCols 1438 | var newCols []series.Series 1439 | for _, i := range iKeysA { 1440 | newCols = append(newCols, aCols[i].Empty()) 1441 | } 1442 | var iNotKeysA []int 1443 | for i := 0; i < df.ncols; i++ { 1444 | if 
!inIntSlice(i, iKeysA) { 1445 | iNotKeysA = append(iNotKeysA, i) 1446 | newCols = append(newCols, aCols[i].Empty()) 1447 | } 1448 | } 1449 | var iNotKeysB []int 1450 | for i := 0; i < b.ncols; i++ { 1451 | if !inIntSlice(i, iKeysB) { 1452 | iNotKeysB = append(iNotKeysB, i) 1453 | newCols = append(newCols, bCols[i].Empty()) 1454 | } 1455 | } 1456 | 1457 | // Fill newCols 1458 | var yesmatched []struct{ i, j int } 1459 | var nonmatched []int 1460 | for j := 0; j < b.nrows; j++ { 1461 | matched := false 1462 | for i := 0; i < df.nrows; i++ { 1463 | match := true 1464 | for k := range keys { 1465 | aElem := aCols[iKeysA[k]].Elem(i) 1466 | bElem := bCols[iKeysB[k]].Elem(j) 1467 | match = match && aElem.Eq(bElem) 1468 | } 1469 | if match { 1470 | matched = true 1471 | yesmatched = append(yesmatched, struct{ i, j int }{i, j}) 1472 | } 1473 | } 1474 | if !matched { 1475 | nonmatched = append(nonmatched, j) 1476 | } 1477 | } 1478 | for _, v := range yesmatched { 1479 | i := v.i 1480 | j := v.j 1481 | ii := 0 1482 | for _, k := range iKeysA { 1483 | elem := aCols[k].Elem(i) 1484 | newCols[ii].Append(elem) 1485 | ii++ 1486 | } 1487 | for _, k := range iNotKeysA { 1488 | elem := aCols[k].Elem(i) 1489 | newCols[ii].Append(elem) 1490 | ii++ 1491 | } 1492 | for _, k := range iNotKeysB { 1493 | elem := bCols[k].Elem(j) 1494 | newCols[ii].Append(elem) 1495 | ii++ 1496 | } 1497 | } 1498 | for _, j := range nonmatched { 1499 | ii := 0 1500 | for _, k := range iKeysB { 1501 | elem := bCols[k].Elem(j) 1502 | newCols[ii].Append(elem) 1503 | ii++ 1504 | } 1505 | // for _ = range iNotKeysA { 1506 | // newCols[ii].Append(nil) 1507 | // ii++ 1508 | // } 1509 | for _, k := range iNotKeysA { 1510 | _ = k 1511 | newCols[ii].Append(nil) 1512 | ii++ 1513 | } 1514 | for _, k := range iNotKeysB { 1515 | elem := bCols[k].Elem(j) 1516 | newCols[ii].Append(elem) 1517 | ii++ 1518 | } 1519 | } 1520 | return New(newCols...) 
1521 | } 1522 | 1523 | // OuterJoin returns a DataFrame containing the outer join of two DataFrames. 1524 | func (df DataFrame) OuterJoin(b DataFrame, keys ...string) DataFrame { 1525 | if len(keys) == 0 { 1526 | return DataFrame{Err: fmt.Errorf("join keys not specified")} 1527 | } 1528 | // Check that we have all given keys in both DataFrames 1529 | var iKeysA []int 1530 | var iKeysB []int 1531 | var errorArr []string 1532 | for _, key := range keys { 1533 | i := df.colIndex(key) 1534 | if i < 0 { 1535 | errorArr = append(errorArr, fmt.Sprintf("can't find key %q on left DataFrame", key)) 1536 | } 1537 | iKeysA = append(iKeysA, i) 1538 | j := b.colIndex(key) 1539 | if j < 0 { 1540 | errorArr = append(errorArr, fmt.Sprintf("can't find key %q on right DataFrame", key)) 1541 | } 1542 | iKeysB = append(iKeysB, j) 1543 | } 1544 | if len(errorArr) != 0 { 1545 | return DataFrame{Err: fmt.Errorf(strings.Join(errorArr, "\n"))} 1546 | } 1547 | 1548 | aCols := df.columns 1549 | bCols := b.columns 1550 | // Initialize newCols 1551 | var newCols []series.Series 1552 | for _, i := range iKeysA { 1553 | newCols = append(newCols, aCols[i].Empty()) 1554 | } 1555 | var iNotKeysA []int 1556 | for i := 0; i < df.ncols; i++ { 1557 | if !inIntSlice(i, iKeysA) { 1558 | iNotKeysA = append(iNotKeysA, i) 1559 | newCols = append(newCols, aCols[i].Empty()) 1560 | } 1561 | } 1562 | var iNotKeysB []int 1563 | for i := 0; i < b.ncols; i++ { 1564 | if !inIntSlice(i, iKeysB) { 1565 | iNotKeysB = append(iNotKeysB, i) 1566 | newCols = append(newCols, bCols[i].Empty()) 1567 | } 1568 | } 1569 | 1570 | // Fill newCols 1571 | for i := 0; i < df.nrows; i++ { 1572 | matched := false 1573 | for j := 0; j < b.nrows; j++ { 1574 | match := true 1575 | for k := range keys { 1576 | aElem := aCols[iKeysA[k]].Elem(i) 1577 | bElem := bCols[iKeysB[k]].Elem(j) 1578 | match = match && aElem.Eq(bElem) 1579 | } 1580 | if match { 1581 | matched = true 1582 | ii := 0 1583 | for _, k := range iKeysA { 1584 | elem := 
aCols[k].Elem(i) 1585 | newCols[ii].Append(elem) 1586 | ii++ 1587 | } 1588 | for _, k := range iNotKeysA { 1589 | elem := aCols[k].Elem(i) 1590 | newCols[ii].Append(elem) 1591 | ii++ 1592 | } 1593 | for _, k := range iNotKeysB { 1594 | elem := bCols[k].Elem(j) 1595 | newCols[ii].Append(elem) 1596 | ii++ 1597 | } 1598 | } 1599 | } 1600 | if !matched { 1601 | ii := 0 1602 | for _, k := range iKeysA { 1603 | elem := aCols[k].Elem(i) 1604 | newCols[ii].Append(elem) 1605 | ii++ 1606 | } 1607 | for _, k := range iNotKeysA { 1608 | elem := aCols[k].Elem(i) 1609 | newCols[ii].Append(elem) 1610 | ii++ 1611 | } 1612 | // for _,_ = range iNotKeysB { 1613 | // newCols[ii].Append(nil) 1614 | // ii++ 1615 | // } 1616 | for _, k := range iNotKeysB { 1617 | _ = k 1618 | newCols[ii].Append(nil) 1619 | ii++ 1620 | } 1621 | } 1622 | } 1623 | for j := 0; j < b.nrows; j++ { 1624 | matched := false 1625 | for i := 0; i < df.nrows; i++ { 1626 | match := true 1627 | for k := range keys { 1628 | aElem := aCols[iKeysA[k]].Elem(i) 1629 | bElem := bCols[iKeysB[k]].Elem(j) 1630 | match = match && aElem.Eq(bElem) 1631 | } 1632 | if match { 1633 | matched = true 1634 | } 1635 | } 1636 | if !matched { 1637 | ii := 0 1638 | for _, k := range iKeysB { 1639 | elem := bCols[k].Elem(j) 1640 | newCols[ii].Append(elem) 1641 | ii++ 1642 | } 1643 | // for _ = range iNotKeysA { 1644 | // newCols[ii].Append(nil) 1645 | // ii++ 1646 | // } 1647 | for _, k := range iNotKeysA { 1648 | _ = k 1649 | newCols[ii].Append(nil) 1650 | ii++ 1651 | } 1652 | for _, k := range iNotKeysB { 1653 | elem := bCols[k].Elem(j) 1654 | newCols[ii].Append(elem) 1655 | ii++ 1656 | } 1657 | } 1658 | } 1659 | return New(newCols...) 1660 | } 1661 | 1662 | // CrossJoin returns a DataFrame containing the cross join of two DataFrames. 
1663 | func (df DataFrame) CrossJoin(b DataFrame) DataFrame { 1664 | aCols := df.columns 1665 | bCols := b.columns 1666 | // Initialize newCols 1667 | var newCols []series.Series 1668 | for i := 0; i < df.ncols; i++ { 1669 | newCols = append(newCols, aCols[i].Empty()) 1670 | } 1671 | for i := 0; i < b.ncols; i++ { 1672 | newCols = append(newCols, bCols[i].Empty()) 1673 | } 1674 | // Fill newCols 1675 | for i := 0; i < df.nrows; i++ { 1676 | for j := 0; j < b.nrows; j++ { 1677 | for ii := 0; ii < df.ncols; ii++ { 1678 | elem := aCols[ii].Elem(i) 1679 | newCols[ii].Append(elem) 1680 | } 1681 | for ii := 0; ii < b.ncols; ii++ { 1682 | jj := ii + df.ncols 1683 | elem := bCols[ii].Elem(j) 1684 | newCols[jj].Append(elem) 1685 | } 1686 | } 1687 | } 1688 | return New(newCols...) 1689 | } 1690 | 1691 | // colIndex returns the index of the column with name `s`. If it fails to find the 1692 | // column it returns -1 instead. 1693 | func (df DataFrame) colIndex(s string) int { 1694 | for k, v := range df.Names() { 1695 | if v == s { 1696 | return k 1697 | } 1698 | } 1699 | return -1 1700 | } 1701 | 1702 | // Records return the string record representation of a DataFrame. 1703 | func (df DataFrame) Records() [][]string { 1704 | var records [][]string 1705 | records = append(records, df.Names()) 1706 | if df.ncols == 0 || df.nrows == 0 { 1707 | return records 1708 | } 1709 | var tRecords [][]string 1710 | for _, col := range df.columns { 1711 | tRecords = append(tRecords, col.Records()) 1712 | } 1713 | records = append(records, transposeRecords(tRecords)...) 1714 | return records 1715 | } 1716 | 1717 | // Maps return the array of maps representation of a DataFrame. 
1718 | func (df DataFrame) Maps() []map[string]interface{} { 1719 | maps := make([]map[string]interface{}, df.nrows) 1720 | colnames := df.Names() 1721 | for i := 0; i < df.nrows; i++ { 1722 | m := make(map[string]interface{}) 1723 | for k, v := range colnames { 1724 | val := df.columns[k].Val(i) 1725 | m[v] = val 1726 | } 1727 | maps[i] = m 1728 | } 1729 | return maps 1730 | } 1731 | 1732 | // Elem returns the element on row `r` and column `c`. Will panic if the index is 1733 | // out of bounds. 1734 | func (df DataFrame) Elem(r, c int) series.Element { 1735 | return df.columns[c].Elem(r) 1736 | } 1737 | 1738 | // fixColnames assigns a name to the missing column names and makes it so that the 1739 | // column names are unique. 1740 | func fixColnames(colnames []string) { 1741 | // Find duplicated colnames 1742 | dupnamesidx := make(map[string][]int) 1743 | var missingnames []int 1744 | for i := 0; i < len(colnames); i++ { 1745 | a := colnames[i] 1746 | if a == "" { 1747 | missingnames = append(missingnames, i) 1748 | continue 1749 | } 1750 | for j := 0; j < len(colnames); j++ { 1751 | b := colnames[j] 1752 | if i != j && a == b { 1753 | temp := dupnamesidx[a] 1754 | if !inIntSlice(i, temp) { 1755 | dupnamesidx[a] = append(temp, i) 1756 | } 1757 | } 1758 | } 1759 | } 1760 | 1761 | // Autofill missing column names 1762 | counter := 0 1763 | for _, i := range missingnames { 1764 | proposedName := fmt.Sprintf("X%d", counter) 1765 | for findInStringSlice(proposedName, colnames) != -1 { 1766 | counter++ 1767 | proposedName = fmt.Sprintf("X%d", counter) 1768 | } 1769 | colnames[i] = proposedName 1770 | counter++ 1771 | } 1772 | 1773 | // Sort map keys to make sure it always follows the same order 1774 | var keys []string 1775 | for k := range dupnamesidx { 1776 | keys = append(keys, k) 1777 | } 1778 | sort.Strings(keys) 1779 | 1780 | // Add a suffix to the duplicated colnames 1781 | for _, name := range keys { 1782 | idx := dupnamesidx[name] 1783 | if name == "" { 1784 | 
name = "X" 1785 | } 1786 | counter := 0 1787 | for _, i := range idx { 1788 | proposedName := fmt.Sprintf("%s_%d", name, counter) 1789 | for findInStringSlice(proposedName, colnames) != -1 { 1790 | counter++ 1791 | proposedName = fmt.Sprintf("%s_%d", name, counter) 1792 | } 1793 | colnames[i] = proposedName 1794 | counter++ 1795 | } 1796 | } 1797 | } 1798 | 1799 | func findInStringSlice(str string, s []string) int { 1800 | for i, e := range s { 1801 | if e == str { 1802 | return i 1803 | } 1804 | } 1805 | return -1 1806 | } 1807 | 1808 | func parseSelectIndexes(l int, indexes SelectIndexes, colnames []string) ([]int, error) { 1809 | var idx []int 1810 | switch indexes.(type) { 1811 | case []int: 1812 | idx = indexes.([]int) 1813 | case int: 1814 | idx = []int{indexes.(int)} 1815 | case []bool: 1816 | bools := indexes.([]bool) 1817 | if len(bools) != l { 1818 | return nil, fmt.Errorf("indexing error: index dimensions mismatch") 1819 | } 1820 | for i, b := range bools { 1821 | if b { 1822 | idx = append(idx, i) 1823 | } 1824 | } 1825 | case string: 1826 | s := indexes.(string) 1827 | i := findInStringSlice(s, colnames) 1828 | if i < 0 { 1829 | return nil, fmt.Errorf("can't select columns: column name %q not found", s) 1830 | } 1831 | idx = append(idx, i) 1832 | case []string: 1833 | xs := indexes.([]string) 1834 | for _, s := range xs { 1835 | i := findInStringSlice(s, colnames) 1836 | if i < 0 { 1837 | return nil, fmt.Errorf("can't select columns: column name %q not found", s) 1838 | } 1839 | idx = append(idx, i) 1840 | } 1841 | case series.Series: 1842 | s := indexes.(series.Series) 1843 | if err := s.Err; err != nil { 1844 | return nil, fmt.Errorf("indexing error: new values has errors: %v", err) 1845 | } 1846 | if s.HasNaN() { 1847 | return nil, fmt.Errorf("indexing error: indexes contain NaN") 1848 | } 1849 | switch s.Type() { 1850 | case series.Int: 1851 | return s.Int() 1852 | case series.Bool: 1853 | bools, err := s.Bool() 1854 | if err != nil { 1855 | return 
nil, fmt.Errorf("indexing error: %v", err) 1856 | } 1857 | return parseSelectIndexes(l, bools, colnames) 1858 | case series.String: 1859 | xs := indexes.(series.Series).Records() 1860 | return parseSelectIndexes(l, xs, colnames) 1861 | default: 1862 | return nil, fmt.Errorf("indexing error: unknown indexing mode") 1863 | } 1864 | default: 1865 | return nil, fmt.Errorf("indexing error: unknown indexing mode") 1866 | } 1867 | return idx, nil 1868 | } 1869 | 1870 | func findType(arr []string) (series.Type, error) { 1871 | var hasFloats, hasInts, hasBools, hasStrings bool 1872 | for _, str := range arr { 1873 | if str == "" || str == "NaN" { 1874 | continue 1875 | } 1876 | if _, err := strconv.Atoi(str); err == nil { 1877 | hasInts = true 1878 | continue 1879 | } 1880 | if _, err := strconv.ParseFloat(str, 64); err == nil { 1881 | hasFloats = true 1882 | continue 1883 | } 1884 | if str == "true" || str == "false" { 1885 | hasBools = true 1886 | continue 1887 | } 1888 | hasStrings = true 1889 | } 1890 | 1891 | switch { 1892 | case hasStrings: 1893 | return series.String, nil 1894 | case hasBools: 1895 | return series.Bool, nil 1896 | case hasFloats: 1897 | return series.Float, nil 1898 | case hasInts: 1899 | return series.Int, nil 1900 | default: 1901 | return series.String, fmt.Errorf("couldn't detect type") 1902 | } 1903 | } 1904 | 1905 | func transposeRecords(x [][]string) [][]string { 1906 | n := len(x) 1907 | if n == 0 { 1908 | return x 1909 | } 1910 | m := len(x[0]) 1911 | y := make([][]string, m) 1912 | for i := 0; i < m; i++ { 1913 | z := make([]string, n) 1914 | for j := 0; j < n; j++ { 1915 | z[j] = x[j][i] 1916 | } 1917 | y[i] = z 1918 | } 1919 | return y 1920 | } 1921 | 1922 | func inIntSlice(i int, is []int) bool { 1923 | for _, v := range is { 1924 | if v == i { 1925 | return true 1926 | } 1927 | } 1928 | return false 1929 | } 1930 | 1931 | // Matrix is an interface which is compatible with gonum's mat.Matrix interface 1932 | type Matrix interface { 1933 | 
Dims() (r, c int) 1934 | At(i, j int) float64 1935 | } 1936 | 1937 | // Describe prints the summary statistics for each column of the dataframe 1938 | func (df DataFrame) Describe() DataFrame { 1939 | labels := series.Strings([]string{ 1940 | "mean", 1941 | "median", 1942 | "stddev", 1943 | "min", 1944 | "25%", 1945 | "50%", 1946 | "75%", 1947 | "max", 1948 | }) 1949 | labels.Name = "column" 1950 | 1951 | ss := []series.Series{labels} 1952 | 1953 | for _, col := range df.columns { 1954 | var newCol series.Series 1955 | switch col.Type() { 1956 | case series.String: 1957 | newCol = series.New([]string{ 1958 | "-", 1959 | "-", 1960 | "-", 1961 | col.MinStr(), 1962 | "-", 1963 | "-", 1964 | "-", 1965 | col.MaxStr(), 1966 | }, 1967 | col.Type(), 1968 | col.Name, 1969 | ) 1970 | case series.Bool: 1971 | fallthrough 1972 | case series.Float: 1973 | fallthrough 1974 | case series.Int: 1975 | newCol = series.New([]float64{ 1976 | col.Mean(), 1977 | col.Median(), 1978 | col.StdDev(), 1979 | col.Min(), 1980 | col.Quantile(0.25), 1981 | col.Quantile(0.50), 1982 | col.Quantile(0.75), 1983 | col.Max(), 1984 | }, 1985 | series.Float, 1986 | col.Name, 1987 | ) 1988 | } 1989 | ss = append(ss, newCol) 1990 | } 1991 | 1992 | ddf := New(ss...) 1993 | return ddf 1994 | } 1995 | --------------------------------------------------------------------------------