├── csv_tests ├── empty.csv ├── pipe.csv └── pass.csv ├── dataframe ├── filter.go ├── filter_test.go ├── math_test.go ├── math.go ├── merge.go ├── merge_test.go ├── dataframe.go ├── constructor.go ├── columns.go ├── pivot.go ├── select.go ├── group.go └── describe_test.go ├── guides ├── docker │ ├── down.sh │ ├── Dockerfile │ ├── docker-compose.yml │ └── up.sh └── Options.ipynb ├── .gitignore ├── .travis.yml ├── go.mod ├── example_pd_test.go ├── internal └── values │ ├── options_test.go │ ├── sharedMeta.go │ ├── type-bool.go │ ├── type-float.go │ ├── sharedMeta_test.go │ ├── type-int.go │ ├── type-datetime.go │ ├── values.go │ ├── options.go │ ├── type-interface.go │ ├── type-string.go │ └── shared_template.go ├── benchmarking └── profiler │ ├── main.go │ ├── benchmarks │ ├── compare_test.go │ ├── benchmarks_test.go │ ├── profile.go │ ├── benchmarks.go │ ├── compare.go │ ├── profile.py │ └── config.go │ └── comparison_summary.txt ├── LICENSE ├── options ├── datatypes_test.go ├── datatypes.go ├── settable_test.go └── settable.go ├── series ├── constructor_benchmark_test.go ├── merge.go ├── series.go ├── select.go ├── constructor.go ├── merge_test.go ├── filter_test.go ├── describe_test.go ├── group_test.go ├── group.go ├── filter.go ├── math_test.go ├── select_test.go └── constructor_test.go ├── README.md └── pd.go /csv_tests/empty.csv: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /csv_tests/pipe.csv: -------------------------------------------------------------------------------- 1 | |A|B 2 | foo|1|2 -------------------------------------------------------------------------------- /csv_tests/pass.csv: -------------------------------------------------------------------------------- 1 | ,A 2 | foo,1 3 | bar,2 -------------------------------------------------------------------------------- /dataframe/filter.go: 
-------------------------------------------------------------------------------- 1 | package dataframe 2 | -------------------------------------------------------------------------------- /dataframe/filter_test.go: -------------------------------------------------------------------------------- 1 | package dataframe 2 | -------------------------------------------------------------------------------- /guides/docker/down.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | docker-compose down -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | *.out 3 | *.test 4 | cleanGeneratedCode.sh 5 | Makefile 6 | notebooks/ 7 | output_test.csv 8 | debug 9 | go.sum 10 | benchmarking/archive 11 | data*.csv -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | 3 | go: 4 | - 1.12.x 5 | 6 | before_install: 7 | - go get -t -v ./... 
8 | 9 | script: 10 | - go test -race -coverprofile=coverage.txt -covermode=atomic 11 | 12 | after_success: 13 | - bash <(curl -s https://codecov.io/bash) -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/ptiger10/pd 2 | 3 | go 1.12 4 | 5 | require ( 6 | github.com/araddon/dateparse v0.0.0-20190622164848-0fb0a474d195 7 | github.com/cheekybits/genny v1.0.0 8 | github.com/davecgh/go-spew v1.1.1 // indirect 9 | github.com/stretchr/testify v1.3.0 // indirect 10 | ) 11 | -------------------------------------------------------------------------------- /guides/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM yunabe/lgo:latest 2 | 3 | # Fetch gopandas 4 | RUN go get -u github.com/ptiger10/pd/... 5 | RUN lgo installpkg github.com/ptiger10/pd/... 6 | 7 | WORKDIR /notebooks 8 | 9 | # To use JupyterLab, replace "notebook" with "lab". 10 | CMD ["jupyter", "notebook", "--ip=0.0.0.0"] 11 | -------------------------------------------------------------------------------- /guides/docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | 2 | version: "3.3" 3 | 4 | services: 5 | jupyter: 6 | build: 7 | context: . 8 | # To use a different port of host, change the first 8888. 9 | ports: 10 | - "8888:8888" 11 | volumes: 12 | - type: "bind" 13 | source: ./../ 14 | target: /notebooks 15 | -------------------------------------------------------------------------------- /guides/docker/up.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo $1 4 | if [ "$1" = "-r" ]; then 5 | echo rebuilding 6 | docker build --no-cache -t docker_default . 
7 | fi 8 | docker-compose up -d 9 | 10 | url=$(docker-compose exec jupyter jupyter notebook list | grep http | awk '{print $1}') 11 | if [[ -z $url ]]; then 12 | echo Cannot determine url 13 | exit 1 14 | fi 15 | 16 | if [[ "$OSTYPE" == "linux-gnu" ]]; then 17 | xdg-open $url 18 | elif [[ "$OSTYPE" == "darwin"* ]]; then 19 | open $url 20 | else 21 | echo $url 22 | fi 23 | -------------------------------------------------------------------------------- /example_pd_test.go: -------------------------------------------------------------------------------- 1 | package pd 2 | 3 | import ( 4 | "fmt" 5 | ) 6 | 7 | func ExampleSeries_defaultIndex() { 8 | s, _ := Series([]string{"foo", "bar", "baz"}) 9 | fmt.Println(s) 10 | // Output: 11 | // 0 foo 12 | // 1 bar 13 | // 2 baz 14 | // 15 | // datatype: string 16 | } 17 | 18 | func ExampleDataFrame_default() { 19 | df, _ := DataFrame([]interface{}{[]string{"foo", "bar", "baz"}, []int{7, 11, 19}}) 20 | fmt.Println(df) 21 | // Output: 22 | // 0 1 23 | // 0 foo 7 24 | // 1 bar 11 25 | // 2 baz 19 26 | } 27 | -------------------------------------------------------------------------------- /internal/values/options_test.go: -------------------------------------------------------------------------------- 1 | package values 2 | 3 | import "testing" 4 | 5 | func TestNonsettableOptions(t *testing.T) { 6 | if GetDisplayValuesWhitespaceBuffer() != 4 { 7 | t.Error("Default setting not reading for DisplayValuesWhitespaceBuffer") 8 | } 9 | if GetDisplayColumnsWhitespaceBuffer() != 2 { 10 | t.Error("Default setting not reading for DisplayColumnsWhitespaceBuffer") 11 | } 12 | if GetDisplayElementWhitespaceBuffer() != 1 { 13 | t.Errorf("Default setting not reading for DisplayElementWhitespaceBuffer") 14 | } 15 | if GetDisplayIndexWhitespaceBuffer() != 1 { 16 | t.Errorf("Default setting not reading for DisplayIndexWhitespaceBuffer") 17 | } 18 | if GetMultiColNameSeparator() != " | " { 19 | t.Errorf("Default setting not reading for 
MultiColNameSeparator") 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /internal/values/sharedMeta.go: -------------------------------------------------------------------------------- 1 | package values 2 | 3 | // Generic methods for valueTypeValue type converters 4 | 5 | func (val valueTypeValue) toFloat64() float64Value { 6 | return float64Value{} 7 | } 8 | 9 | func (val valueTypeValue) toInt64() int64Value { 10 | return int64Value{} 11 | } 12 | 13 | func (val valueTypeValue) toString() stringValue { 14 | return stringValue{} 15 | } 16 | 17 | func (val valueTypeValue) toBool() boolValue { 18 | return boolValue{} 19 | } 20 | 21 | func (val valueTypeValue) toDateTime() dateTimeValue { 22 | return dateTimeValue{} 23 | } 24 | 25 | func (val valueTypeValues) Less(i, j int) bool { 26 | return true 27 | } 28 | 29 | func (val interfaceValue) tovalueType() valueTypeValue { 30 | return valueTypeValue{} 31 | } 32 | 33 | func newvalueType(vals valueType) valueTypeValue { 34 | return valueTypeValue{} 35 | } 36 | -------------------------------------------------------------------------------- /benchmarking/profiler/main.go: -------------------------------------------------------------------------------- 1 | // +build benchmarks 2 | 3 | package main 4 | 5 | import ( 6 | "fmt" 7 | "io/ioutil" 8 | "log" 9 | "path/filepath" 10 | "runtime" 11 | 12 | "github.com/ptiger10/pd/benchmarking/profiler/benchmarks" 13 | ) 14 | 15 | func main() { 16 | benchmarks.ReadData() 17 | goBenchmarks := benchmarks.RunGoProfiler() 18 | pyBenchmarks := benchmarks.RunPythonProfiler() 19 | 20 | // fmt.Println(goBenchmarks, pyBenchmarks) 21 | 22 | table := benchmarks.CompareBenchmarks( 23 | goBenchmarks, pyBenchmarks, 24 | benchmarks.SampleSizes, benchmarks.Descriptions) 25 | _, thisFile, _, _ := runtime.Caller(0) 26 | basename := "comparison_summary.txt" 27 | dest := filepath.Join(filepath.Dir(thisFile), basename) 28 | err := ioutil.WriteFile(dest, 
[]byte(table), 0666) 29 | if err != nil { 30 | log.Fatal(err) 31 | } 32 | fmt.Printf(">> %v\n", basename) 33 | } 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Dave Fort 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /benchmarking/profiler/benchmarks/compare_test.go: -------------------------------------------------------------------------------- 1 | // +build benchmarks 2 | 3 | package benchmarks 4 | 5 | // func TestPythonProfiler(t *testing.T) { 6 | // got := RunPythonProfiler() 7 | // fmt.Println(got) 8 | // } 9 | 10 | // func TestCompareBenchmarks(t *testing.T) { 11 | // type args struct { 12 | // goBenchmarks Results 13 | // pyBenchmarks Results 14 | // sampleSizes []string 15 | // descs map[string]desc 16 | // } 17 | // tests := []struct { 18 | // name string 19 | // args args 20 | // }{ 21 | // {name: "normal", args: args{ 22 | // goBenchmarks: Results{"100k": { 23 | // "sum": []interface{}{"50ms", 50.0}, "mean": []interface{}{"50ms", 50.0}}}, 24 | // pyBenchmarks: Results{"100k": {"sum": []interface{}{"100ms", 100.0}}}, 25 | // descs: map[string]desc{"sum": desc{1, "Simple sum"}, "mean": desc{2, "Simple mean"}}, 26 | // sampleSizes: []string{"100k", "200k"}}}, 27 | // } 28 | // for _, tt := range tests { 29 | // t.Run(tt.name, func(t *testing.T) { 30 | // got := CompareBenchmarks(tt.args.goBenchmarks, tt.args.pyBenchmarks, tt.args.sampleSizes, tt.args.descs) 31 | // print(got) 32 | // }) 33 | // } 34 | // } 35 | 36 | // func TestProfileGo(t *testing.T) { 37 | // ProfileGo(benchmarkMeanFloat64_100000) 38 | 39 | // } 40 | -------------------------------------------------------------------------------- /options/datatypes_test.go: -------------------------------------------------------------------------------- 1 | package options 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestDataType(t *testing.T) { 8 | var tests = []struct { 9 | DataType DataType 10 | expected string 11 | }{ 12 | 13 | {None, "none"}, 14 | {Float64, "float64"}, 15 | {Int64, "int64"}, 16 | {String, "string"}, 17 | {Bool, "bool"}, 18 | {DateTime, "dateTime"}, 19 | {Interface, "interface"}, 20 | {Unsupported, 
"unsupported"}, 21 | {-1, "unknown"}, 22 | {100, "unknown"}, 23 | } 24 | for _, test := range tests { 25 | if test.DataType.String() != test.expected { 26 | t.Errorf("DataType.String() for DataType %v returned %v, want %v", test.DataType, test.DataType.String(), test.expected) 27 | } 28 | } 29 | } 30 | 31 | func TestGetDataType(t *testing.T) { 32 | var tests = []struct { 33 | expected DataType 34 | dataType string 35 | }{ 36 | {Float64, "float"}, 37 | {Float64, "float64"}, 38 | {Float64, "Float64"}, 39 | {Int64, "int"}, 40 | {Int64, "int64"}, 41 | {String, "string"}, 42 | {String, "STRING"}, 43 | {Bool, "bool"}, 44 | {DateTime, "dateTime"}, 45 | {Interface, "interface"}, 46 | {Unsupported, "other"}, 47 | } 48 | for _, tt := range tests { 49 | got := DT(tt.dataType) 50 | if got != tt.expected { 51 | t.Errorf("DT() = %v, want %v", got, tt.expected) 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /benchmarking/profiler/benchmarks/benchmarks_test.go: -------------------------------------------------------------------------------- 1 | // +build benchmarks 2 | 3 | package benchmarks 4 | 5 | import ( 6 | "testing" 7 | ) 8 | 9 | func BenchmarkMath(b *testing.B) { 10 | benchmarks := []struct { 11 | name string 12 | fn func(b *testing.B) 13 | }{ 14 | // {"100k sum 1 column", benchmarkSumFloat64_100000}, 15 | {"100k sum 10 column", benchmarkSumFloat64_100k10x}, 16 | // {"100k read then sum 1 column", benchmarkReadSumFloat64_100000}, 17 | // {"100k mean 1 column", benchmarkMeanFloat64_100000}, 18 | // {"100k sync mean 1 column", benchmarkSyncMeanFloat64_100000}, 19 | // {"100k median 1 column", benchmarkMedianFloat64_100000}, 20 | // {"100k min 1 column", benchmarkMinFloat64_100000}, 21 | // {"100k max 1 column", benchmarkMaxFloat64_100000}, 22 | // {"100k std 1 column", benchmarkStdFloat64_100000}, 23 | // {"100k sync std 1 column", benchmarkSyncStdFloat64_100000}, 24 | // {"500k std 2 columns", 
benchmarkStdFloat64_500000}, 25 | // {"500k sync std 2 columns", benchmarkSyncStdFloat64_500000}, 26 | // {"500k sum 2 columns", benchmarkSumFloat64_500000}, 27 | // {"500k mean 2 columns", benchmarkMeanFloat64_500000}, 28 | // {"500k sync mean 2 columns", benchmarkSyncMeanFloat64_500000}, 29 | // {"5m sum 1 column", benchmarkSumFloat64_5m}, 30 | } 31 | ReadData() 32 | for _, bm := range benchmarks { 33 | b.Run(bm.name, func(b *testing.B) { 34 | bm.fn(b) 35 | }) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /benchmarking/profiler/comparison_summary.txt: -------------------------------------------------------------------------------- 1 | GoPandas vs Pandas speed comparison 2 | Fri, 19 Jul 2019 19:56:33 PDT 3 | +----+----------------------------------------+------+-----------+-----------+---------+ 4 | | # | DESCRIPTION | N | GOPANDAS | PANDAS | SPEED Δ | 5 | +----+----------------------------------------+------+-----------+-----------+---------+ 6 | | 1 | Sum one column | 100k | 248.2μs | 565.8μs | 2.28x | 7 | +----+----------------------------------------+------+-----------+-----------+---------+ 8 | | 2 | Simple mean of one column | 100k | 248.1μs | 588.2μs | 2.37x | 9 | +----+----------------------------------------+------+-----------+-----------+---------+ 10 | | 3 | Min of one column | 100k | 302.2μs | 536.6μs | 1.78x | 11 | +----+----------------------------------------+------+-----------+-----------+---------+ 12 | | 4 | Max of one column | 100k | 348.0μs | 528.6μs | 1.52x | 13 | +----+----------------------------------------+------+-----------+-----------+---------+ 14 | | 5 | Standard deviation of one column | 100k | 389.0μs | 904.9μs | 2.33x | 15 | +----+----------------------------------------+------+-----------+-----------+---------+ 16 | | 6 | Sum two columns | 500k | 2.6ms | 16.6ms | 6.46x | 17 | +----+----------------------------------------+------+-----------+-----------+---------+ 18 | 
// DataType identifies the type of a data object.
// For most values it is interchangeable with the reflect.Type value, but it supports custom identifiers as well (e.g., DateTime).
type DataType int

// DataType convenience options.
// The numeric order matters: dataTypeNames is indexed by these values and
// String treats None and Unsupported as the lower and upper bounds.
const (
	None DataType = iota
	Float64
	Int64
	String
	Bool
	DateTime
	Interface
	PlaceholdervalueType
	Unsupported
)

// dataTypeNames holds the display name of every DataType, keyed by constant so
// that the table cannot silently drift out of order. It is built once rather
// than on every String call.
var dataTypeNames = [...]string{
	None:                 "none",
	Float64:              "float64",
	Int64:                "int64",
	String:               "string",
	Bool:                 "bool",
	DateTime:             "dateTime",
	Interface:            "interface",
	PlaceholdervalueType: "placeholder",
	Unsupported:          "unsupported",
}

// DT returns the DataType associated with a string.
// Matching is case-insensitive. Exact names ("string", "bool", "datetime",
// "interface") are checked first; after that any name containing "float" maps
// to Float64 and any name containing "int" maps to Int64. Everything else is
// Unsupported.
func DT(datatype string) DataType {
	datatype = strings.ToLower(datatype)
	switch datatype {
	case "string":
		return String
	case "bool":
		return Bool
	case "datetime":
		return DateTime
	case "interface":
		return Interface
	}
	// Substring matches come last so that "interface" is not mistaken for an int.
	switch {
	case strings.Contains(datatype, "float"):
		return Float64
	case strings.Contains(datatype, "int"):
		return Int64
	}
	return Unsupported
}

// String returns the display name of the DataType,
// or "unknown" if the value lies outside the declared constants.
func (datatype DataType) String() string {
	if datatype < None || datatype > Unsupported {
		return "unknown"
	}
	return dataTypeNames[datatype]
}
// float64Slice returns a slice of n float64 values, each set to 1.
// It is a fixture generator for the constructor benchmarks.
// A non-positive n yields a nil slice.
func float64Slice(n int) []float64 {
	if n <= 0 {
		return nil
	}
	// The final length is known, so allocate once instead of growing via append.
	l := make([]float64, n)
	for i := range l {
		l[i] = 1
	}
	return l
}
want *series.Series 18 | }{ 19 | {name: "Empty", input: newEmptyDataFrame(), fn: (*DataFrame).Sum, want: series.MustNew(nil)}, 20 | {"Sum", df, (*DataFrame).Sum, 21 | series.MustNew([]float64{9, -4, 0}, series.Config{Index: []string{"foo", "bar", "baz"}, Name: "sum"}), 22 | }, 23 | {"Mean", df, (*DataFrame).Mean, 24 | series.MustNew([]float64{3, -2, 0}, series.Config{Index: []string{"foo", "bar", "baz"}, Name: "mean"}), 25 | }, 26 | {"Min", df, (*DataFrame).Min, 27 | series.MustNew([]float64{1, -3, -5}, series.Config{Index: []string{"foo", "bar", "baz"}, Name: "min"}), 28 | }, 29 | {"Max", df, (*DataFrame).Max, 30 | series.MustNew([]float64{5, -1, 5}, series.Config{Index: []string{"foo", "bar", "baz"}, Name: "max"}), 31 | }, 32 | {"Std", df, (*DataFrame).Std, 33 | series.MustNew([]float64{1.632993161855452, 1, 4.08248290463863}, series.Config{Index: []string{"foo", "bar", "baz"}, Name: "std"}), 34 | }, 35 | } 36 | for _, tt := range tests { 37 | t.Run(tt.name, func(t *testing.T) { 38 | got := tt.fn(tt.input) 39 | if !series.Equal(got, tt.want) { 40 | t.Errorf("%v() got %v, want %v", tt.name, got, tt.want) 41 | } 42 | }) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /dataframe/math.go: -------------------------------------------------------------------------------- 1 | package dataframe 2 | 3 | import ( 4 | "math" 5 | 6 | "github.com/ptiger10/pd/series" 7 | ) 8 | 9 | func (df *DataFrame) math(name string, fn func(s *series.Series) float64) *series.Series { 10 | if Equal(df, newEmptyDataFrame()) { 11 | return series.MustNew(nil) 12 | } 13 | var vals []interface{} 14 | var idx []interface{} 15 | for m := 0; m < df.NumCols(); m++ { 16 | s := df.hydrateSeries(m) 17 | if calc := fn(s); !math.IsNaN(calc) { 18 | vals = append(vals, calc) 19 | idx = append(idx, s.Name()) 20 | } 21 | } 22 | ret := series.MustNew(nil) 23 | for i := 0; i < len(vals); i++ { 24 | // ducks safe method because values are assumed to be supported 
25 | s := series.MustNew(vals[i], series.Config{Index: idx[i], Name: name}) 26 | ret.InPlace.Join(s) 27 | } 28 | 29 | return ret 30 | } 31 | 32 | // Sum all numerical or boolean columns. 33 | func (df *DataFrame) Sum() *series.Series { 34 | return df.math("sum", (*series.Series).Sum) 35 | } 36 | 37 | // Mean of all numerical or boolean columns. 38 | func (df *DataFrame) Mean() *series.Series { 39 | return df.math("mean", (*series.Series).Mean) 40 | } 41 | 42 | // Median of all numerical or boolean columns. 43 | func (df *DataFrame) Median() *series.Series { 44 | return df.math("median", (*series.Series).Median) 45 | } 46 | 47 | // Min all numerical columns. 48 | func (df *DataFrame) Min() *series.Series { 49 | return df.math("min", (*series.Series).Min) 50 | } 51 | 52 | // Max all numerical columns. 53 | func (df *DataFrame) Max() *series.Series { 54 | return df.math("max", (*series.Series).Max) 55 | } 56 | 57 | // Std returns the standard deviation of all numerical columns. 58 | func (df *DataFrame) Std() *series.Series { 59 | return df.math("std", (*series.Series).Std) 60 | } 61 | -------------------------------------------------------------------------------- /internal/values/type-bool.go: -------------------------------------------------------------------------------- 1 | package values 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "time" 7 | 8 | "github.com/ptiger10/pd/options" 9 | ) 10 | 11 | // [START Constructor Functions] 12 | 13 | // newBool creates a boolValue from atomic bool value 14 | func newBool(val bool) boolValue { 15 | return boolValue{val, false} 16 | } 17 | 18 | func (vals *boolValues) Less(i, j int) bool { 19 | if !(*vals)[i].v && (*vals)[j].v { 20 | return true 21 | } 22 | return false 23 | } 24 | 25 | // [END Constructor Functions] 26 | 27 | // [START Converters] 28 | // toFloat converts boolValues to float64Values. 
29 | // 30 | // true: 1.0, false: 0.0, null: NaN 31 | func (val boolValue) toFloat64() float64Value { 32 | if val.null { 33 | return float64Value{math.NaN(), true} 34 | } else if val.v { 35 | return float64Value{1, false} 36 | } else { 37 | return float64Value{0, false} 38 | } 39 | } 40 | 41 | // toInt converts boolValues to int64Values. 42 | // 43 | // true: 1, false: 0, null: 0 44 | func (val boolValue) toInt64() int64Value { 45 | if val.null { 46 | return int64Value{0, true} 47 | } else if val.v { 48 | return int64Value{1, false} 49 | } else { 50 | return int64Value{0, false} 51 | } 52 | } 53 | 54 | func (val boolValue) toString() stringValue { 55 | if val.null { 56 | return stringValue{options.GetDisplayStringNullFiller(), true} 57 | } 58 | return stringValue{fmt.Sprint(val.v), false} 59 | } 60 | 61 | // toBool returns itself. 62 | func (val boolValue) toBool() boolValue { 63 | return val 64 | } 65 | 66 | // !null: 1/1/1970; null: time.Time{} 67 | func (val boolValue) toDateTime() dateTimeValue { 68 | epochDate := time.Date(1970, 1, 1, 0, 0, 0, 0, time.UTC) 69 | if val.null { 70 | return dateTimeValue{time.Time{}, true} 71 | } 72 | return dateTimeValue{epochDate, false} 73 | } 74 | 75 | // [END Converters] 76 | -------------------------------------------------------------------------------- /internal/values/type-float.go: -------------------------------------------------------------------------------- 1 | package values 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "time" 7 | 8 | "github.com/ptiger10/pd/options" 9 | ) 10 | 11 | // newFloat64 creates a float64Value from atomic float64 value 12 | func newFloat64(val float64) float64Value { 13 | if math.IsNaN(val) { 14 | return float64Value{val, true} 15 | } 16 | return float64Value{val, false} 17 | } 18 | 19 | func (vals *float64Values) Less(i, j int) bool { 20 | if (*vals)[i].v < (*vals)[j].v { 21 | return true 22 | } 23 | return false 24 | } 25 | 26 | // [START Converters] 27 | 28 | // toFloat returns itself 29 | 
func (val float64Value) toFloat64() float64Value { 30 | return val 31 | } 32 | 33 | // toInt converts a float64Value to int64Value 34 | // 35 | // 1.9: 1, 1.5: 1, null: 0 36 | func (val float64Value) toInt64() int64Value { 37 | if val.null { 38 | return int64Value{0, true} 39 | } 40 | v := int64(val.v) 41 | return int64Value{v, false} 42 | 43 | } 44 | 45 | func (val float64Value) toString() stringValue { 46 | if val.null { 47 | return stringValue{options.GetDisplayStringNullFiller(), true} 48 | } 49 | return stringValue{fmt.Sprint(val.v), false} 50 | } 51 | 52 | // toBool converts float64Value to boolValue 53 | // 54 | // x != 0: true; x == 0: false; null: false 55 | func (val float64Value) toBool() boolValue { 56 | if val.null { 57 | return boolValue{false, true} 58 | } 59 | if val.v == 0 { 60 | return boolValue{false, false} 61 | } 62 | return boolValue{true, false} 63 | } 64 | 65 | // toDateTime converts float64Value to dateTimeValue. 66 | // Tries to convert from Unix EPOCH time, otherwise returns null 67 | func (val float64Value) toDateTime() dateTimeValue { 68 | if val.null { 69 | return dateTimeValue{time.Time{}, true} 70 | } 71 | return floatToDateTime(val.v) 72 | } 73 | 74 | func floatToDateTime(f float64) dateTimeValue { 75 | return intToDateTime(int64(f)) 76 | } 77 | 78 | // [END Converters] 79 | -------------------------------------------------------------------------------- /internal/values/sharedMeta_test.go: -------------------------------------------------------------------------------- 1 | package values 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | // Pro forma tests for generics 9 | func TestMeta(t *testing.T) { 10 | newSlicevalueType([]valueType{newvalueType("foo")}) 11 | 12 | val := newvalueType("foo") 13 | f := val.toFloat64() 14 | if vType := reflect.TypeOf(f); vType.Name() != "float64Value" { 15 | t.Errorf("%v", vType.Name()) 16 | } 17 | i := val.toInt64() 18 | if vType := reflect.TypeOf(i); vType.Name() != "int64Value" { 19 | 
t.Errorf("%v", vType.Name()) 20 | } 21 | s := val.toString() 22 | if vType := reflect.TypeOf(s); vType.Name() != "stringValue" { 23 | t.Errorf("%v", vType.Name()) 24 | } 25 | b := val.toBool() 26 | if vType := reflect.TypeOf(b); vType.Name() != "boolValue" { 27 | t.Errorf("%v", vType.Name()) 28 | } 29 | dt := val.toDateTime() 30 | if vType := reflect.TypeOf(dt); vType.Name() != "dateTimeValue" { 31 | t.Errorf("%v", vType.Name()) 32 | } 33 | 34 | nullVal := valueTypeValue{"foo", true} 35 | nullVal.toString() 36 | nullVals := valueTypeValues{nullVal} 37 | nullVals.ToInterface() 38 | 39 | vals := valueTypeValues{val} 40 | vals.Len() 41 | vals.Swap(0, 0) 42 | vals.Less(0, 0) 43 | vals.Values() 44 | vals.Vals() 45 | vals.Copy() 46 | vals.Value(0) 47 | vals.Null(0) 48 | vals.ToFloat64() 49 | vals.ToInt64() 50 | vals.ToString() 51 | vals.ToBool() 52 | vals.ToDateTime() 53 | vals.ToInterface() 54 | 55 | vals.Subset([]int{0}) 56 | vals.Set(0, "bar") 57 | vals.Set(0, "") 58 | vals.Drop(0) 59 | vals.Insert(0, "foo") 60 | 61 | v := interfaceValue{"foo", false} 62 | v.tovalueType() 63 | } 64 | 65 | // No easy way to Convert valueTypeValues, so expect panic 66 | func TestPanic(t *testing.T) { 67 | val := newvalueType("foo") 68 | vals := valueTypeValues{val} 69 | defer func() { 70 | if r := recover(); r == nil { 71 | t.Errorf("The code did not panic") 72 | } 73 | }() 74 | 75 | // The following is the code under test 76 | vals.Append(&vals) 77 | } 78 | -------------------------------------------------------------------------------- /internal/values/type-int.go: -------------------------------------------------------------------------------- 1 | package values 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "time" 7 | 8 | "github.com/ptiger10/pd/options" 9 | ) 10 | 11 | // [START Constructor Functions] 12 | 13 | // newInt64 creates an int64Value from atomic int64 value 14 | func newInt64(val int64) int64Value { 15 | return int64Value{val, false} 16 | } 17 | 18 | func (vals *int64Values) 
Less(i, j int) bool { 19 | if (*vals)[i].v < (*vals)[j].v { 20 | return true 21 | } 22 | return false 23 | } 24 | 25 | // [END Constructor Functions] 26 | 27 | // [START Converters] 28 | 29 | // toFloat converts int64Value to float64Value 30 | // 31 | // 1: 1.0 32 | func (val int64Value) toFloat64() float64Value { 33 | if val.null { 34 | return float64Value{math.NaN(), true} 35 | } 36 | v := float64(val.v) 37 | return float64Value{v, false} 38 | 39 | } 40 | 41 | // toInt returns itself 42 | func (val int64Value) toInt64() int64Value { 43 | return val 44 | } 45 | 46 | func (val int64Value) toString() stringValue { 47 | if val.null { 48 | return stringValue{options.GetDisplayStringNullFiller(), true} 49 | } 50 | return stringValue{fmt.Sprint(val.v), false} 51 | } 52 | 53 | // toBool converts int64Value to boolValue 54 | // 55 | // x != 0: true; x == 0: false; null: false 56 | func (val int64Value) toBool() boolValue { 57 | if val.null { 58 | return boolValue{false, true} 59 | } 60 | if val.v == 0 { 61 | return boolValue{false, false} 62 | } 63 | return boolValue{true, false} 64 | } 65 | 66 | // toDateTime converts int64Value to dateTimeValue. 67 | // Tries to convert from Unix EPOCH timestamp. 68 | // Defaults to 1970-01-01 00:00:00 +0000 UTC. 
69 | func (val int64Value) toDateTime() dateTimeValue { 70 | if val.null { 71 | return dateTimeValue{time.Time{}, true} 72 | } 73 | return intToDateTime(val.v) 74 | } 75 | 76 | func intToDateTime(i int64) dateTimeValue { 77 | // convert from nanoseconds to seconds 78 | i /= 1000000000 79 | v := time.Unix(i, 0).UTC() 80 | return dateTimeValue{v, false} 81 | } 82 | 83 | // [END Converters] 84 | -------------------------------------------------------------------------------- /internal/values/type-datetime.go: -------------------------------------------------------------------------------- 1 | package values 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "time" 7 | 8 | "github.com/ptiger10/pd/options" 9 | ) 10 | 11 | // [START Constructor Functions] 12 | 13 | // newDateTime creates a dateTimeValue from atomic time.Time value 14 | func newDateTime(val time.Time) dateTimeValue { 15 | if (time.Time{}) == val { 16 | return dateTimeValue{val, true} 17 | } 18 | return dateTimeValue{val, false} 19 | } 20 | 21 | func (vals *dateTimeValues) Less(i, j int) bool { 22 | if (*vals)[i].v.Before((*vals)[j].v) { 23 | return true 24 | } 25 | return false 26 | } 27 | 28 | // [END Constructor Functions] 29 | 30 | // [START Converters] 31 | 32 | // toFloat converts dateTimeValues to float64Values of the Unix EPOCH timestamp 33 | // (seconds since midnight January 1, 1970) 34 | // 2019-05-01 00:00:00 +0000 UTC: 1556757505 35 | func (val dateTimeValue) toFloat64() float64Value { 36 | if val.null || val.v == (time.Time{}) { 37 | return float64Value{math.NaN(), true} 38 | } 39 | v := val.v.UnixNano() 40 | return float64Value{float64(v), false} 41 | } 42 | 43 | // ToInt converts dateTimeValues to int64Values of the Unix EPOCH timestamp 44 | // (seconds since midnight January 1, 1970) 45 | // 46 | // 2019-05-01 00:00:00 +0000 UTC: 1556757505 47 | func (val dateTimeValue) toInt64() int64Value { 48 | if val.null || val.v == (time.Time{}) { 49 | return int64Value{0, true} 50 | } 51 | v := 
val.v.UnixNano() 52 | return int64Value{v, false} 53 | } 54 | 55 | func (val dateTimeValue) toString() stringValue { 56 | if val.null { 57 | return stringValue{options.GetDisplayStringNullFiller(), true} 58 | } 59 | return stringValue{fmt.Sprint(val.v), false} 60 | } 61 | 62 | // toBool converts a dateTimeValue to a boolValue 63 | // 64 | // x != time.Time{}: true; x == time.Time{} or null: false, flagged as null 65 | func (val dateTimeValue) toBool() boolValue { 66 | if val.null || val.v.IsZero() { 67 | return boolValue{false, true} 68 | } 69 | return boolValue{true, false} 70 | 71 | } 72 | 73 | // toDateTime returns itself 74 | func (val dateTimeValue) toDateTime() dateTimeValue { 75 | return val 76 | } 77 | 78 | // [END Converters] 79 | -------------------------------------------------------------------------------- /series/merge.go: -------------------------------------------------------------------------------- 1 | package series 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "reflect" 7 | 8 | "github.com/ptiger10/pd/internal/index" 9 | "github.com/ptiger10/pd/options" 10 | ) 11 | 12 | // Join converts s2 to the same type as the base Series (s), appends s2 to the end, and modifies s in place. 13 | // It returns an error if either Series is nil or if the two Series have a different number of index levels. 14 | func (ip InPlace) Join(s2 *Series) error { 15 | if ip.s == nil || s2 == nil { 16 | return fmt.Errorf("Series.Join(): s and s2 must both be non-nil") 17 | } 18 | // a base Series with no datatype yet takes its contents from s2 via replace 19 | if ip.s.datatype == options.None { 20 | ip.s.replace(s2) 21 | return nil 22 | } 23 | 24 | if s2.index.NumLevels() != ip.s.NumLevels() { 25 | return fmt.Errorf("Series.Join(): s2 must have same number of index levels as s (%d != %d)", s2.index.NumLevels(), ip.s.NumLevels()) 26 | } 27 | for i := 0; i < s2.Len(); i++ { 28 | elem := s2.Element(i) 29 | ip.s.InPlace.Append(elem.Value, elem.Labels...) 30 | } 31 | return nil 32 | } 33 | 34 | // Join converts s2 to the same type as the base Series (s), appends s2 to the end, and returns a new Series. 
35 | func (s *Series) Join(s2 *Series) (*Series, error) { 36 | s = s.Copy() 37 | err := s.InPlace.Join(s2) 38 | return s, err 39 | } 40 | 41 | // LookupSeries performs a vlookup of each index label in s against the index of s2 and returns the matching s2 values on the index of s. 42 | func (s *Series) LookupSeries(s2 *Series) *Series { 43 | if s2.index.NumLevels() != s.NumLevels() { 44 | if options.GetLogWarnings() { 45 | log.Printf("Series.LookupSeries(): s2 must have same number of index levels as s (%d != %d)\n", s2.index.NumLevels(), s.NumLevels()) 46 | } 47 | return newEmptySeries() 48 | } 49 | 50 | matchShallow := func(s *Series, idx index.Elements) int { 51 | for i := 0; i < s.Len(); i++ { 52 | if reflect.DeepEqual(idx, s.index.Elements(i)) { 53 | return i 54 | } 55 | } 56 | return -1 57 | } 58 | 59 | vals := make([]interface{}, 0, s.Len()) 60 | for i := 0; i < s.Len(); i++ { 61 | elems := s.index.Elements(i) 62 | pos := matchShallow(s2, elems) 63 | if pos != -1 { 64 | vals = append(vals, s2.At(pos)) 65 | } else { 66 | vals = append(vals, "") 67 | } 68 | } 69 | // ducks error because there will be no unsupported values coming from an existing series 70 | ret, _ := New(vals, Config{DataType: s2.datatype}) 71 | ret.index = s.index 72 | 73 | return ret 74 | } 75 | -------------------------------------------------------------------------------- /dataframe/merge.go: -------------------------------------------------------------------------------- 1 | package dataframe 2 | 3 | // // Join extends the columns, rows, or columns and rows of a dataframe by appending s2 and modifies the DataFrame in place. 4 | // // If extending rows, the values within a Values container are converted to []interface if the container datatypes are not the same. 
5 | // // 6 | // // Allowable append values: "rows", "cols", "both" 7 | // // 8 | // // Allowable method values: "left", "right", "inner", "outer" 9 | // func (ip InPlace) Join(append string, method string, df2 *DataFrame) error { 10 | // if ip.df.vals == nil { 11 | // ip.df.replace(df2) 12 | // return nil 13 | // } 14 | // switch append { 15 | // case "rows": 16 | // switch method { 17 | // case "left": 18 | 19 | // } 20 | // } 21 | // return nil 22 | // } 23 | 24 | // appendDataFrameRow appends the rows of df2 in place; assumes equivalent index levels and column positions 25 | func (ip InPlace) appendDataFrameRow(df2 *DataFrame) { 26 | // Handling empty DataFrame 27 | if Equal(ip.df, newEmptyDataFrame()) { 28 | ip.df.replace(df2) 29 | return 30 | } 31 | 32 | // Append 33 | // Index Levels 34 | for j := 0; j < ip.df.IndexLevels(); j++ { 35 | ip.df.index.Levels[j].Labels.Append(df2.index.Levels[j].Labels) 36 | ip.df.index.Levels[j].NeedsRefresh = true 37 | } 38 | // Values 39 | for m := 0; m < ip.df.NumCols(); m++ { 40 | ip.df.vals[m].Values.Append(df2.vals[m].Values) 41 | } 42 | return 43 | } 44 | 45 | // appendDataFrameColumn appends each column of df2 in place and returns the first AppendCol error encountered 46 | func (ip InPlace) appendDataFrameColumn(df2 *DataFrame) error { 47 | // Handling empty DataFrame 48 | if Equal(ip.df, newEmptyDataFrame()) { 49 | ip.df.replace(df2) 50 | return nil 51 | } 52 | 53 | // Append 54 | for m := 0; m < df2.NumCols(); m++ { 55 | // stop at the first column that cannot be appended and surface the error 56 | // (columns appended before the failure remain in place) 57 | err := ip.AppendCol( 58 | df2.hydrateSeries(m), 59 | df2.cols.MultiName(m)..., 60 | ) 61 | if err != nil { 62 | return err 63 | } 64 | } 65 | return nil 66 | } 67 | 68 | // // Join extends the columns, rows, or columns and rows of a dataframe by appending s2 and returns a new DataFrame. 69 | // // If extending rows, the values within a Values container are converted to []interface if the container datatypes are not the same. 
70 | // func (df *DataFrame) Join(append string, method string, df2 *DataFrame) *DataFrame { 71 | // df = df.Copy() 72 | // df.InPlace.Join(append, method, df2) 73 | // return df 74 | // } 75 | -------------------------------------------------------------------------------- /dataframe/merge_test.go: -------------------------------------------------------------------------------- 1 | package dataframe 2 | 3 | import "testing" 4 | 5 | func TestMerge_appendDataFrameRow(t *testing.T) { 6 | type args struct { 7 | df2 *DataFrame 8 | } 9 | tests := []struct { 10 | name string 11 | input *DataFrame 12 | args args 13 | want *DataFrame 14 | }{ 15 | {name: "empty", input: newEmptyDataFrame(), 16 | args: args{df2: MustNew([]interface{}{"foo"})}, 17 | want: MustNew([]interface{}{"foo"})}, 18 | {"same datatype", MustNew([]interface{}{"foo"}, Config{Index: "1"}), 19 | args{MustNew([]interface{}{"bar"}, Config{Index: "2"})}, 20 | MustNew([]interface{}{[]string{"foo", "bar"}}, Config{Index: []string{"1", "2"}})}, 21 | {"different datatype", MustNew([]interface{}{"foo"}, Config{Index: "1"}), 22 | args{MustNew([]interface{}{10}, Config{Index: "2"})}, 23 | MustNew([]interface{}{[]string{"foo", "10"}}, Config{Index: []string{"1", "2"}})}, 24 | } 25 | for _, tt := range tests { 26 | t.Run(tt.name, func(t *testing.T) { 27 | df := tt.input.Copy() 28 | df.InPlace.appendDataFrameRow(tt.args.df2) 29 | if !Equal(df, tt.want) { 30 | t.Errorf("InPlace.appendDataFrameRow() = %v, want %v", df, tt.want) 31 | } 32 | }) 33 | } 34 | } 35 | 36 | func TestMerge_appendDataFrameColumn(t *testing.T) { 37 | type args struct { 38 | df2 *DataFrame 39 | } 40 | type want struct { 41 | df *DataFrame 42 | err bool 43 | } 44 | tests := []struct { 45 | name string 46 | input *DataFrame 47 | args args 48 | want want 49 | }{ 50 | {name: "empty", input: newEmptyDataFrame(), 51 | args: args{df2: MustNew([]interface{}{"foo"})}, 52 | want: want{df: MustNew([]interface{}{"foo"}), err: false}}, 53 | {"pass", 
MustNew([]interface{}{"foo"}, Config{Col: []string{"1"}}), 54 | args{MustNew([]interface{}{"bar"}, Config{Col: []string{"2"}})}, 55 | want{MustNew([]interface{}{"foo", "bar"}, Config{Col: []string{"1", "2"}}), false}}, 56 | // fix to append multiple columns in order 57 | // {"extra columns in df2", MustNew([]interface{}{"foo"}, Config{Col: []string{"1"}}), 58 | // args{MustNew([]interface{}{"bar", "baz"}, Config{Col: []string{"1", "2"}})}, 59 | // want{MustNew([]interface{}{"foo"}, Config{Col: []string{"1"}}), false}}, 60 | } 61 | for _, tt := range tests { 62 | t.Run(tt.name, func(t *testing.T) { 63 | df := tt.input.Copy() 64 | err := df.InPlace.appendDataFrameColumn(tt.args.df2) 65 | if (err != nil) != tt.want.err { 66 | t.Errorf("DataFrame.appendDataFrameColumn() error = %v, want %v", err, tt.want.err) 67 | return 68 | } 69 | if !Equal(df, tt.want.df) { 70 | t.Errorf("InPlace.appendDataFrameColumn() = %v, want %v", df, tt.want.df) 71 | } 72 | }) 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /internal/values/values.go: -------------------------------------------------------------------------------- 1 | // Package values is an internal package that powers the values within pd/Series and pd/DataFrame. 2 | // This package defines the Values interface and multiple concrete implementations of the interface. 3 | package values 4 | 5 | import ( 6 | "fmt" 7 | 8 | "github.com/ptiger10/pd/options" 9 | ) 10 | 11 | // The Values interface is the primary means of handling a collection of values. 
12 | // The same interface and value types are used for both Series values and Index labels. 13 | type Values interface { 14 | Len() int // number of Value/Null structs 15 | Vals() interface{} // an interface of values, ready for type assertion into a slice of their native type 16 | Values() []interface{} // an interface slice of values, for handling values as a predictable slice 17 | Subset([]int) Values // a new Values object comprised of the Value/Null pairs at one or more integer positions 18 | Value(int) interface{} // the value field at an integer position 19 | Null(int) bool // the null field at an integer position 20 | Set(int, interface{}) // overwrite the value/null struct at an integer position 21 | Copy() Values // clone the Values 22 | Insert(int, interface{}) // insert a Value/Null pair at an integer position 23 | Append(Values) // append Values together 24 | Drop(int) // drop a Value/Null pair at an integer position 25 | Swap(i, j int) // swap two values - necessary for sorting 26 | Less(i, j int) bool // report whether the value at i is less than the value at j - required for sorting 27 | 28 | ToFloat64() Values 29 | ToInt64() Values 30 | ToString() Values 31 | ToBool() Values 32 | ToDateTime() Values 33 | ToInterface() Values 34 | } 35 | 36 | // Container contains Values (a list of Value/Null pairs satisfying the Values interface) and the DataType of those values. 
37 | type Container struct { 38 | Values Values 39 | DataType options.DataType 40 | } 41 | 42 | // Convert a collection of values from one type to another, and coerce to null if a value cannot be converted sensibly 43 | func Convert(currentVals Values, dataType options.DataType) (Values, error) { 44 | var vals Values 45 | switch dataType { 46 | case options.None: 47 | return nil, fmt.Errorf("unable to convert values: must supply a valid Kind") 48 | case options.Float64: 49 | vals = currentVals.ToFloat64() 50 | case options.Int64: 51 | vals = currentVals.ToInt64() 52 | case options.String: 53 | vals = currentVals.ToString() 54 | case options.Bool: 55 | vals = currentVals.ToBool() 56 | case options.DateTime: 57 | vals = currentVals.ToDateTime() 58 | case options.Interface: 59 | vals = currentVals.ToInterface() 60 | default: 61 | return nil, fmt.Errorf("unable to convert values: kind not supported: %v", dataType) 62 | } 63 | return vals, nil 64 | } 65 | -------------------------------------------------------------------------------- /internal/values/options.go: -------------------------------------------------------------------------------- 1 | package values 2 | 3 | var displayValuesWhitespaceBuffer = 4 4 | var displayColumnsWhitespaceBuffer = 2 5 | var displayElementWhitespaceBuffer = 1 6 | var displayIndexWhitespaceBuffer = 1 7 | var multiColNameSeparator = " | " 8 | var interpolationMaximum = 50 9 | var interpolationThreshold = .80 10 | 11 | // GetDisplayValuesWhitespaceBuffer returns displayValuesWhitespaceBuffer. 12 | // displayValuesWhitespaceBuffer is an option when printing a Series or DataFrame. 13 | // It is the number of spaces between the last level of index labels 14 | // and the first collection of values. In a Series, there is only one collection of values. 15 | // In a DataFrame, the first collection of values is the first Series. 
16 | // 17 | // Default buffer: 4 spaces 18 | func GetDisplayValuesWhitespaceBuffer() int { 19 | return displayValuesWhitespaceBuffer 20 | } 21 | 22 | // GetDisplayColumnsWhitespaceBuffer returns displayColumnsWhitespaceBuffer. 23 | // displayColumnsWhitespaceBuffer is an option when printing a Series or DataFrame. 24 | // It is the number of spaces between columns in a DataFrame. 25 | // 26 | // Default buffer: 2 spaces 27 | func GetDisplayColumnsWhitespaceBuffer() int { 28 | return displayColumnsWhitespaceBuffer 29 | } 30 | 31 | // GetDisplayElementWhitespaceBuffer returns displayElementWhitespaceBuffer. 32 | // displayElementWhitespaceBuffer is an option when printing an Element. 33 | // It is the number of spaces between each field name (e.g., "Value:") and the data printed after it. 34 | // 35 | // Default buffer: 1 space 36 | func GetDisplayElementWhitespaceBuffer() int { 37 | return displayElementWhitespaceBuffer 38 | } 39 | 40 | // GetDisplayIndexWhitespaceBuffer returns displayIndexWhitespaceBuffer. 41 | // displayIndexWhitespaceBuffer is an option when printing a Series. 42 | // It is the number of spaces between index labels. This applies only to a multi-level index. 43 | // 44 | // Default buffer: 1 space 45 | func GetDisplayIndexWhitespaceBuffer() int { 46 | return displayIndexWhitespaceBuffer 47 | } 48 | 49 | // GetMultiColNameSeparator returns the multiColNameSeparator. 50 | // The multiColNameSeparator separates col names whenever a multicol is concatenated together (e.g., into a Series name or index level name). 51 | // 52 | // Default: " | " 53 | func GetMultiColNameSeparator() string { 54 | return multiColNameSeparator 55 | } 56 | 57 | // GetInterpolationMaximum returns the max number of records that will be checked during an interpolation check. 
58 | // 59 | // Default: 50 60 | func GetInterpolationMaximum() int { 61 | return interpolationMaximum 62 | } 63 | 64 | // GetInterpolationThreshold returns the ratio of type inclusion required for a dataType to be interpolated. 65 | // 66 | // Default: .80 67 | func GetInterpolationThreshold() float64 { 68 | return interpolationThreshold 69 | } 70 | -------------------------------------------------------------------------------- /benchmarking/profiler/benchmarks/profile.go: -------------------------------------------------------------------------------- 1 | // +build benchmarks 2 | 3 | package benchmarks 4 | 5 | import ( 6 | "encoding/json" 7 | "fmt" 8 | "log" 9 | "os/exec" 10 | "path" 11 | "runtime" 12 | "testing" 13 | "time" 14 | ) 15 | 16 | type desc struct { 17 | order int 18 | str string 19 | } 20 | 21 | // RunGoProfiler specifies all the benchmarks to profile and return in the benchmark table. 22 | func RunGoProfiler() Results { 23 | fmt.Println("Profiling Go") 24 | Results := Results{ 25 | "100k": { 26 | "sum": ProfileGo(benchmarkSumFloat64_100000), 27 | // "sumx10": ProfileGo(benchmarkSumFloat64_100k10x), 28 | // "readCSVSum10x": ProfileGo(benchmarkReadSumFloat64_100k10x), 29 | "mean": ProfileGo(benchmarkMeanFloat64_100000), 30 | "min": ProfileGo(benchmarkMinFloat64_100000), 31 | "max": ProfileGo(benchmarkMaxFloat64_100000), 32 | "std": ProfileGo(benchmarkStdFloat64_100000), 33 | // "readCSVSum": ProfileGo(benchmarkReadSumFloat64_100000), 34 | }, 35 | "500k": { 36 | "sum2": ProfileGo(benchmarkSumFloat64_500000), 37 | // "mean2": ProfileGo(benchmarkMeanFloat64_500000), 38 | }, 39 | // "5m": { 40 | // "sum": ProfileGo(benchmarkSumFloat64_5m), 41 | // }, 42 | } 43 | return Results 44 | } 45 | 46 | // Results contains benchmarking results 47 | // {"num of samples": {"test1": "10ms"...}} 48 | type Results map[string]map[string]Result 49 | 50 | // A Result of benchmarking data in the form [string, float64] 51 | type Result []interface{} 52 | 53 | // ProfileGo 
runs the normal Go benchmarking command but formats the result as a rounded string 54 | // and raw ns float 55 | func ProfileGo(f func(b *testing.B)) Result { 56 | benchmark := testing.Benchmark(f).NsPerOp() 57 | var speed string 58 | switch { 59 | case benchmark < int64(time.Microsecond): 60 | speed = fmt.Sprintf("%vns", benchmark) 61 | case benchmark < int64(time.Millisecond): 62 | speed = fmt.Sprintf("%.1fμs", float64(benchmark)/float64(time.Microsecond)) 63 | case benchmark < int64(time.Second): 64 | speed = fmt.Sprintf("%.1fms", float64(benchmark)/float64(time.Millisecond)) 65 | default: 66 | speed = fmt.Sprintf("%.2fs", float64(benchmark)/float64(time.Second)) 67 | } 68 | return Result{speed, float64(benchmark)} 69 | } 70 | 71 | // RunPythonProfiler executes profile.py from this source file's directory with python3; the script is expected to print JSON 72 | // in the form of Results. This command is expected to be initiated from the directory above. 73 | func RunPythonProfiler() Results { 74 | fmt.Println("Profiling Python") 75 | _, thisFile, _, _ := runtime.Caller(0) 76 | script := "profile.py" 77 | scriptPath := path.Join(path.Dir(thisFile), script) 78 | cmd := exec.Command("python3", scriptPath) 79 | out, err := cmd.Output() 80 | if err != nil { 81 | log.Fatal(err) 82 | } 83 | 84 | var r Results 85 | err = json.Unmarshal(out, &r) 86 | if err != nil { 87 | log.Fatal(err) 88 | } 89 | return r 90 | } 91 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pd 2 | [![Go Report Card](https://goreportcard.com/badge/github.com/ptiger10/pd)](https://goreportcard.com/report/github.com/ptiger10/pd) 3 | [![GoDoc](https://godoc.org/github.com/ptiger10/pd?status.svg)](https://godoc.org/github.com/ptiger10/pd) 4 | [![Build Status](https://travis-ci.org/ptiger10/pd.svg?branch=master)](https://travis-ci.org/ptiger10/pd) 5 | 
[![codecov](https://codecov.io/gh/ptiger10/pd/branch/master/graph/badge.svg)](https://codecov.io/gh/ptiger10/pd) 6 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 7 | 8 | pd (informally known as "GoPandas") is a library for cleaning, aggregating, and transforming data using Series and DataFrames. GoPandas combines a flexible API familiar to Python pandas users with the qualities of Go, including type safety, predictable error handling, and fast concurrent processing. 9 | 10 | The API is still version 0 and subject to major revisions. Use in production code at your own risk. 11 | 12 | Some notable features of GoPandas: 13 | * flexible constructor that supports float, int, string, bool, time.Time, and interface Series 14 | * seamlessly handles null data and type conversions 15 | * well-suited to either the Jupyter notebook style of data exploration or conventional programming 16 | * advanced filtering, grouping, and pivoting 17 | * hierarchical indexing (i.e., multi-level indexes and columns) 18 | * reads from either CSV or any spreadsheet or tabular data structured as [][]interface (e.g., Google Sheets) 19 | * complete test coverage 20 | * minimal dependencies (total package size is <10MB, compared to Pandas at >200MB) 21 | * uses concurrent processing to achieve faster speeds than Pandas on many fundamental operations, and the performance differential becomes more pronounced with scale (6x+ superior performance summing two columns in a 500k-row spreadsheet - see the most recent [benchmarking table](benchmarking/profiler/comparison_summary.txt)) 22 | 23 | ## Getting Started 24 | Check out the Jupyter notebook examples in the [guides](https://github.com/ptiger10/pd/tree/master/guides). 
GitHub sometimes has trouble rendering .ipynb files; backup views are here: [Series](https://nbviewer.jupyter.org/github/ptiger10/pd/blob/master/guides/Series.ipynb?flush_cache=true), [DataFrame](https://nbviewer.jupyter.org/github/ptiger10/pd/blob/master/guides/DataFrame.ipynb?flush_cache=true), [Options](https://nbviewer.jupyter.org/github/ptiger10/pd/blob/master/guides/Options.ipynb?flush_cache=true). 25 | 26 | To run the Jupyter notebooks yourself, I recommend lgo (Docker required): 27 | * `cd guides/docker` 28 | * start: `./up.sh` 29 | * stop: `./down.sh` 30 | * rebuild package to newest version: `./up.sh -r` 31 | 32 | ## Replicating Benchmark Tests 33 | * Requires Python 3.x and pandas 34 | * Download data from [here](https://github.com/ptiger10/pdTestData) and save in benchmarking/profiler 35 | * `go run -tags=benchmarks benchmarking/profiler/main.go` -------------------------------------------------------------------------------- /benchmarking/profiler/benchmarks/benchmarks.go: -------------------------------------------------------------------------------- 1 | // +build benchmarks 2 | 3 | package benchmarks 4 | 5 | import ( 6 | "log" 7 | "testing" 8 | 9 | "github.com/ptiger10/pd" 10 | "github.com/ptiger10/pd/options" 11 | ) 12 | 13 | func benchmarkSumFloat64_5m(b *testing.B) { 14 | for n := 0; n < b.N; n++ { 15 | df5m.Sum() 16 | } 17 | } 18 | 19 | func benchmarkSumFloat64_500000(b *testing.B) { 20 | for n := 0; n < b.N; n++ { 21 | df500k.Sum() 22 | } 23 | } 24 | 25 | func benchmarkSumFloat64_100k10x(b *testing.B) { 26 | for n := 0; n < b.N; n++ { 27 | df100k10x.Sum() 28 | } 29 | } 30 | 31 | func benchmarkSumFloat64_100000(b *testing.B) { 32 | for n := 0; n < b.N; n++ { 33 | df100k.Sum() 34 | } 35 | } 36 | 37 | func benchmarkMeanFloat64_100000(b *testing.B) { 38 | for n := 0; n < b.N; n++ { 39 | df100k.Mean() 40 | } 41 | } 42 | 43 | func benchmarkSyncMeanFloat64_100000(b *testing.B) { 44 | options.SetAsync(false) 45 | for n := 0; n < b.N; n++ { 46 | df100k.Mean() 
47 | } 48 | options.RestoreDefaults() 49 | } 50 | 51 | func benchmarkMeanFloat64_500000(b *testing.B) { 52 | for n := 0; n < b.N; n++ { 53 | df500k.Mean() 54 | } 55 | } 56 | 57 | func benchmarkSyncMeanFloat64_500000(b *testing.B) { 58 | options.SetAsync(false) 59 | for n := 0; n < b.N; n++ { 60 | df500k.Mean() 61 | } 62 | options.RestoreDefaults() 63 | } 64 | 65 | func benchmarkMedianFloat64_100000(b *testing.B) { 66 | for n := 0; n < b.N; n++ { 67 | df100k.Median() 68 | } 69 | } 70 | 71 | func benchmarkMinFloat64_100000(b *testing.B) { 72 | for n := 0; n < b.N; n++ { 73 | df100k.Min() 74 | } 75 | } 76 | 77 | func benchmarkMaxFloat64_100000(b *testing.B) { 78 | for n := 0; n < b.N; n++ { 79 | df100k.Max() 80 | } 81 | } 82 | 83 | func benchmarkStdFloat64_100000(b *testing.B) { 84 | for n := 0; n < b.N; n++ { 85 | df100k.Std() // measure Std, matching the benchmark name and its Sync counterpart 86 | } 87 | } 88 | 89 | func benchmarkSyncStdFloat64_100000(b *testing.B) { 90 | options.SetAsync(false) 91 | for n := 0; n < b.N; n++ { 92 | df100k.Std() 93 | } 94 | options.RestoreDefaults() 95 | } 96 | 97 | func benchmarkStdFloat64_500000(b *testing.B) { 98 | for n := 0; n < b.N; n++ { 99 | df500k.Std() // measure Std, matching the benchmark name and its Sync counterpart 100 | } 101 | } 102 | 103 | func benchmarkSyncStdFloat64_500000(b *testing.B) { 104 | options.SetAsync(false) 105 | for n := 0; n < b.N; n++ { 106 | df500k.Std() 107 | } 108 | options.RestoreDefaults() 109 | } 110 | 111 | func benchmarkReadSumFloat64_100000(b *testing.B) { 112 | for n := 0; n < b.N; n++ { 113 | df, err := pd.ReadCSV(getPath("100k"), pd.ReadOptions{HeaderRows: 1}) 114 | if err != nil { 115 | log.Fatal(err) 116 | } 117 | df.Sum() 118 | } 119 | } 120 | 121 | func benchmarkReadSumFloat64_100k10x(b *testing.B) { 122 | for n := 0; n < b.N; n++ { 123 | df, err := pd.ReadCSV(getPath("100k10x"), pd.ReadOptions{HeaderRows: 1}) 124 | if err != nil { 125 | log.Fatal(err) 126 | } 127 | df.Sum() 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /series/series.go: 
-------------------------------------------------------------------------------- 1 | // Package series defines the Series, a typed 1-dimensional data structure with an n-level index, analogous to a column in a spreadsheet. 2 | package series 3 | 4 | import ( 5 | "fmt" 6 | "reflect" 7 | "strings" 8 | 9 | "github.com/ptiger10/pd/internal/index" 10 | "github.com/ptiger10/pd/internal/values" 11 | "github.com/ptiger10/pd/options" 12 | ) 13 | 14 | // A Series is a 1-D data container with a labeled index, static type, and the ability to handle null values 15 | type Series struct { 16 | index index.Index 17 | values values.Values 18 | datatype options.DataType 19 | name string 20 | Index Index 21 | InPlace InPlace 22 | } 23 | 24 | func (s *Series) String() string { 25 | if Equal(s, newEmptySeries()) { 26 | return "{Empty Series}" 27 | } 28 | return s.print() 29 | } 30 | 31 | // InPlace contains methods for modifying a Series in place. 32 | type InPlace struct { 33 | s *Series 34 | } 35 | 36 | func (ip InPlace) String() string { 37 | printer := "{InPlace Series Handler}\n" 38 | printer += "Methods:\n" 39 | t := reflect.TypeOf(InPlace{}) 40 | for i := 0; i < t.NumMethod(); i++ { 41 | method := t.Method(i) 42 | printer += fmt.Sprintln(method.Name) 43 | } 44 | return printer 45 | } 46 | 47 | // An Element is a single item in a Series. 
48 | type Element struct { 49 | Value interface{} 50 | Null bool 51 | Labels []interface{} 52 | LabelTypes []options.DataType 53 | } 54 | 55 | func (el Element) String() string { 56 | var printStr string 57 | for _, pair := range [][]interface{}{ 58 | {"Value", el.Value}, 59 | {"Null", el.Null}, 60 | {"Labels", el.Labels}, 61 | {"LabelTypes", el.LabelTypes}, 62 | } { 63 | // LabelTypes is 10 characters wide, so left padding set to 10 64 | printStr += fmt.Sprintf("%10v:%v%v\n", pair[0], strings.Repeat(" ", values.GetDisplayElementWhitespaceBuffer()), pair[1]) 65 | } 66 | return printStr 67 | } 68 | 69 | // The Config struct can be used in the custom Series constructor to name the Series or specify its data type. 70 | type Config struct { 71 | Name string 72 | DataType options.DataType 73 | Index interface{} 74 | IndexName string 75 | MultiIndex []interface{} 76 | MultiIndexNames []string 77 | Manual bool 78 | } 79 | 80 | // A Grouping returns a collection of index labels with mutually exclusive integer positions. 
81 | type Grouping struct { 82 | s *Series 83 | groups map[string]*group 84 | } 85 | 86 | func (g Grouping) String() string { 87 | printer := fmt.Sprintf("{Series Grouping | NumGroups: %v, Groups: [%v]}\n", len(g.groups), strings.Join(g.Groups(), ", ")) 88 | return printer 89 | } 90 | 91 | // Index contains index selection and conversion 92 | type Index struct { 93 | s *Series 94 | } 95 | 96 | func (idx Index) String() string { 97 | printer := fmt.Sprintf("{Series Index | Len: %d, NumLevels: %d}\n", idx.Len(), idx.s.NumLevels()) 98 | return printer 99 | } 100 | -------------------------------------------------------------------------------- /options/settable_test.go: -------------------------------------------------------------------------------- 1 | package options 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | func TestSettableOptions(t *testing.T) { 9 | if GetDisplayMaxWidth() != defaultOptions.displayMaxWidth { 10 | t.Errorf("Default setting not reading for DisplayMaxWidth") 11 | } 12 | SetDisplayMaxWidth(15) 13 | if GetDisplayMaxWidth() != 15 { 14 | t.Error("Unable to set/get DisplayMaxWidth") 15 | } 16 | 17 | if GetDisplayFloatPrecision() != defaultOptions.displayFloatPrecision { 18 | t.Errorf("Default setting not reading for DisplayFloatPrecision") 19 | } 20 | SetDisplayFloatPrecision(10) 21 | if GetDisplayFloatPrecision() != 10 { 22 | t.Error("Unable to set/get DisplayFloatPrecision") 23 | } 24 | 25 | if GetDisplayMaxRows() != defaultOptions.displayMaxRows { 26 | t.Errorf("Default setting not reading for DisplayMaxRows") 27 | } 28 | SetDisplayMaxRows(10) 29 | if GetDisplayMaxRows() != 10 { 30 | t.Error("Unable to set/get DisplayMaxRows") 31 | } 32 | 33 | if GetDisplayMaxColumns() != defaultOptions.displayMaxColumns { 34 | t.Errorf("Default setting not reading for DisplayMaxColumns") 35 | } 36 | SetDisplayMaxColumns(10) 37 | if GetDisplayMaxColumns() != 10 { 38 | t.Error("Unable to set/get DisplayMaxColumns") 39 | } 40 | 41 | if 
GetDisplayRepeatedLabels() != defaultOptions.displayRepeatedLabels { 42 | t.Errorf("Default setting not reading for DisplayRepeatedLabels") 43 | } 44 | SetDisplayRepeatedLabels(true) 45 | if GetDisplayRepeatedLabels() != true { 46 | t.Error("Unable to set/get DisplayRepeatedLabels") 47 | } 48 | 49 | if GetDisplayStringNullFiller() != defaultOptions.displayStringNullFiller { 50 | t.Errorf("Default setting not reading for DisplayStringNullFiller") 51 | } 52 | SetDisplayStringNullFiller("Nothing") 53 | if GetDisplayStringNullFiller() != "Nothing" { 54 | t.Error("Unable to set/get DisplayStringNullFiller") 55 | } 56 | 57 | if GetDisplayTimeFormat() != defaultOptions.displayTimeFormat { 58 | t.Errorf("Default setting not reading for DisplayTimeFormat") 59 | } 60 | SetDisplayTimeFormat("2006") 61 | if GetDisplayTimeFormat() != "2006" { 62 | t.Error("Unable to set/get DisplayTimeFormat") 63 | } 64 | 65 | SetStringNullValues([]string{"Nada", "Nothing"}) 66 | if !reflect.DeepEqual(GetStringNullValues(), []string{"Nada", "Nothing"}) { 67 | t.Error("Unable to set/get StringNullValues") 68 | } 69 | 70 | if GetLogWarnings() != defaultOptions.logWarnings { 71 | t.Errorf("Default setting not reading for LogWarnings") 72 | } 73 | SetLogWarnings(false) 74 | if GetLogWarnings() != false { 75 | t.Error("Unable to set/get LogWarnings") 76 | } 77 | 78 | if GetAsync() != defaultOptions.async { 79 | t.Errorf("Default setting not reading for Async") 80 | } 81 | SetAsync(false) 82 | if GetAsync() != false { 83 | t.Error("Unable to set/get Async") 84 | } 85 | 86 | RestoreDefaults() 87 | if GetDisplayMaxWidth() != 35 { 88 | t.Error("Unable to restore default for DisplayMaxWidth") 89 | } 90 | if GetLogWarnings() != true { 91 | t.Error("Unable to restore default for LogWarnings") 92 | } 93 | 94 | } 95 | -------------------------------------------------------------------------------- /dataframe/dataframe.go: -------------------------------------------------------------------------------- 1 | // 
Package dataframe defines the DataFrame, a 2-dimensional data structure with an n-level index, n-level column headers, 2 | // and columns of typed data. It is analogous to a spreadsheet. 3 | package dataframe 4 | 5 | import ( 6 | "fmt" 7 | "reflect" 8 | "strings" 9 | 10 | "github.com/ptiger10/pd/internal/index" 11 | "github.com/ptiger10/pd/internal/values" 12 | "github.com/ptiger10/pd/options" 13 | ) 14 | 15 | // A DataFrame is a 2D collection of one or more Series with a shared index and associated columns. 16 | type DataFrame struct { 17 | name string 18 | vals []values.Container 19 | cols index.Columns 20 | Columns Columns 21 | index index.Index 22 | Index Index 23 | InPlace InPlace 24 | } 25 | 26 | func (df *DataFrame) String() string { 27 | if Equal(df, newEmptyDataFrame()) { 28 | return "{Empty DataFrame}" 29 | } 30 | return df.print() 31 | } 32 | 33 | // Index contains index level data. 34 | type Index struct { 35 | df *DataFrame 36 | } 37 | 38 | func (idx Index) String() string { 39 | printer := fmt.Sprintf("{DataFrame Index | Len: %d, NumLevels: %d}\n", idx.Len(), idx.df.IndexLevels()) 40 | return printer 41 | } 42 | 43 | // Columns contains column level data. 44 | type Columns struct { 45 | df *DataFrame 46 | } 47 | 48 | func (col Columns) String() string { 49 | printer := fmt.Sprintf("{DataFrame Columns | NumCols: %d, NumLevels: %d}\n", col.df.NumCols(), col.df.ColLevels()) 50 | return printer 51 | } 52 | 53 | // A Row is a single row in a DataFrame. 
54 | type Row struct { 55 | Values []interface{} 56 | Nulls []bool 57 | ValueTypes []options.DataType 58 | Labels []interface{} 59 | LabelTypes []options.DataType 60 | } 61 | 62 | func (r Row) String() string { 63 | var printStr string 64 | for _, pair := range [][]interface{}{ 65 | {"Values", r.Values}, 66 | {"IsNull", r.Nulls}, 67 | {"ValueTypes", r.ValueTypes}, 68 | {"Labels", r.Labels}, 69 | {"LabelTypes", r.LabelTypes}, 70 | } { 71 | // LabelTypes is 10 characters wide, so left padding set to 10 72 | printStr += fmt.Sprintf("%10v:%v%v\n", pair[0], strings.Repeat(" ", values.GetDisplayElementWhitespaceBuffer()), pair[1]) 73 | } 74 | return printStr 75 | } 76 | 77 | // Config customizes the DataFrame constructor. 78 | type Config struct { 79 | Name string 80 | DataType options.DataType 81 | Index interface{} 82 | IndexName string 83 | MultiIndex []interface{} 84 | MultiIndexNames []string 85 | Col []string 86 | ColName string 87 | MultiCol [][]string 88 | MultiColNames []string 89 | Manual bool 90 | } 91 | 92 | // A Grouping returns a collection of index labels with mutually exclusive integer positions. 93 | type Grouping struct { 94 | df *DataFrame 95 | groups map[string]*group 96 | err bool 97 | } 98 | 99 | func (g Grouping) String() string { 100 | printer := fmt.Sprintf("{DataFrame Grouping | NumGroups: %v, Groups: [%v]}\n", len(g.groups), strings.Join(g.Groups(), ", ")) 101 | return printer 102 | } 103 | 104 | // InPlace contains methods for modifying a DataFrame in place. 
105 | type InPlace struct { 106 | df *DataFrame 107 | } 108 | 109 | func (ip InPlace) String() string { 110 | printer := "{InPlace DataFrame Handler}\n" 111 | printer += "Methods:\n" 112 | t := reflect.TypeOf(InPlace{}) 113 | for i := 0; i < t.NumMethod(); i++ { 114 | method := t.Method(i) 115 | printer += fmt.Sprintln(method.Name) 116 | } 117 | return printer 118 | } 119 | -------------------------------------------------------------------------------- /benchmarking/profiler/benchmarks/compare.go: -------------------------------------------------------------------------------- 1 | // +build benchmarks 2 | 3 | package benchmarks 4 | 5 | import ( 6 | "fmt" 7 | "log" 8 | "sort" 9 | "strconv" 10 | "strings" 11 | "time" 12 | ) 13 | 14 | // CompareBenchmarks creates a comparison table of GoPandas <> Pandas for equivalent operations 15 | func CompareBenchmarks( 16 | goBenchmarks, pyBenchmarks Results, 17 | sampleSizes []string, 18 | descs map[string]desc, 19 | ) string { 20 | 21 | var printer string 22 | printer += "GoPandas vs Pandas speed comparison\n" 23 | printer += time.Now().In(time.Local).Format(time.RFC1123) + "\n" 24 | // model 25 | // +-----+-----+ 26 | // | foo | bar | 27 | // +-----+-----+ 28 | spacerChar := "-" 29 | sepChar := "+" 30 | vChar := "|" 31 | 32 | // Sections 33 | type section struct { 34 | name string 35 | width int 36 | } 37 | num := section{name: "#", width: 4} 38 | desc := section{name: "DESCRIPTION", width: 40} 39 | sample := section{name: "N", width: 6} 40 | 41 | goBenchmark := section{name: "GOPANDAS", width: 11} 42 | pyBenchmark := section{name: "PANDAS", width: 11} 43 | comparison := section{name: "SPEED Δ", width: 9} 44 | sections := []section{num, desc, sample, goBenchmark, pyBenchmark, comparison} 45 | 46 | // Break Line 47 | breakLineComponents := make([]string, len(sections)) 48 | for i := 0; i < len(sections); i++ { 49 | breakLineComponents[i] = strings.Repeat(spacerChar, sections[i].width) 50 | } 51 | breakLine := sepChar + 
strings.Join(breakLineComponents, sepChar) + sepChar + "\n" 52 | 53 | // Header 54 | headerComponents := make([]string, len(sections)) 55 | for i := 0; i < len(sections); i++ { 56 | headerComponents[i] = fmt.Sprintf(" %-*v", sections[i].width-1, sections[i].name) 57 | } 58 | header := vChar + strings.Join(headerComponents, vChar) + vChar + "\n" 59 | printer += breakLine + header + breakLine 60 | 61 | // Rows 62 | var i int 63 | type orderedDesc struct { 64 | n int 65 | label string 66 | } 67 | var orderedDescs []orderedDesc 68 | for k, v := range descs { 69 | orderedDescs = append(orderedDescs, orderedDesc{v.order, k}) 70 | } 71 | sort.Slice(orderedDescs, func(i, j int) bool { 72 | if orderedDescs[i].n < orderedDescs[j].n { 73 | return true 74 | } 75 | return false 76 | }) 77 | for _, sample := range sampleSizes { 78 | results, ok := goBenchmarks[sample] 79 | if !ok { 80 | log.Printf("sample size %v not in %v", sample, goBenchmarks) 81 | continue 82 | } 83 | for _, desc := range orderedDescs { 84 | testName := desc.label 85 | goResult, ok := results[desc.label] 86 | if !ok { 87 | continue 88 | } 89 | i++ 90 | gospeed, gons := goResult[0], goResult[1] 91 | goSpeed := gospeed.(string) 92 | goNS := gons.(float64) 93 | pySpeed := "n/a" 94 | comparison := "n/a" 95 | py, ok := pyBenchmarks[sample] 96 | if ok { 97 | pyResult, ok := py[testName] 98 | if ok { 99 | pyspeed, pyns := pyResult[0], pyResult[1] 100 | pySpeed = pyspeed.(string) 101 | pyNS := pyns.(float64) 102 | comparison = fmt.Sprintf("%.2fx", pyNS/goNS) 103 | } 104 | } 105 | 106 | rowComponents := []string{ 107 | strconv.Itoa(i), descs[desc.label].str, sample, goSpeed, pySpeed, comparison, 108 | } 109 | for i := range rowComponents { 110 | rowComponents[i] = fmt.Sprintf( 111 | " %-*v", sections[i].width-1, rowComponents[i]) 112 | } 113 | printer += vChar + strings.Join(rowComponents, vChar) + vChar + "\n" 114 | printer += breakLine 115 | 116 | } 117 | } 118 | return printer 119 | } 120 | 
-------------------------------------------------------------------------------- /series/select.go: -------------------------------------------------------------------------------- 1 | package series 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | 7 | "github.com/ptiger10/pd/internal/values" 8 | 9 | "github.com/ptiger10/pd/options" 10 | ) 11 | 12 | // Element returns information about the value and index labels at this position but panics if an out-of-range position is provided. 13 | func (s *Series) Element(p int) Element { 14 | idxElems := s.index.Elements(p) 15 | return Element{ 16 | s.values.Value(p), 17 | s.values.Null(p), 18 | idxElems.Labels, 19 | idxElems.DataTypes, 20 | } 21 | } 22 | 23 | // At returns the value at a single integer position, but returns nil if value is null. Panics if position is out of range. 24 | func (s *Series) At(position int) interface{} { 25 | if position >= s.Len() { 26 | if options.GetLogWarnings() { 27 | log.Printf("s.Index.At(): invalid position: %d (max: %v)", position, s.Len()-1) 28 | } 29 | } 30 | if s.values.Null(position) { 31 | return nil 32 | } 33 | return s.values.Value(position) 34 | } 35 | 36 | // From subsets the Series from start to end (inclusive) and returns a new Series. 37 | // If an invalid position is provided, returns empty Series. 38 | func (s *Series) From(start int, end int) *Series { 39 | rowPositions := values.MakeIntRangeInclusive(start, end) 40 | var err error 41 | s, err = s.Subset(rowPositions) 42 | if err != nil { 43 | if options.GetLogWarnings() { 44 | log.Printf("s.From(): %v", err) 45 | } 46 | return newEmptySeries() 47 | } 48 | return s 49 | } 50 | 51 | // [END Series methods] 52 | 53 | // [START Selection] 54 | 55 | // XS returns a new Series with only the rows and index levels at the specified positions. 
56 | func (s *Series) XS(rowPositions []int, levelPositions []int) (*Series, error) { 57 | var err error 58 | s, err = s.Subset(rowPositions) 59 | if err != nil { 60 | return newEmptySeries(), fmt.Errorf("s.XS() rows: %v", err) 61 | } 62 | err = s.Index.SubsetLevels(levelPositions) 63 | if err != nil { 64 | return newEmptySeries(), fmt.Errorf("s.XS() index levels: %v", err) 65 | } 66 | return s, nil 67 | } 68 | 69 | // SelectLabel returns the integer location of the first row in index level 0 with the supplied label, or -1 if the label does not exist. 70 | func (s *Series) SelectLabel(label string) int { 71 | if s.NumLevels() == 0 { 72 | if options.GetLogWarnings() { 73 | log.Println("Series.SelectLabel(): index has no length") 74 | } 75 | return -1 76 | } 77 | s.index.Levels[0].UpdateLabelMap() 78 | val, ok := s.index.Levels[0].LabelMap[label] 79 | if !ok { 80 | if options.GetLogWarnings() { 81 | log.Printf("Series.SelectLabel(): %v not in label map\n", label) 82 | } 83 | return -1 84 | } 85 | return val[0] 86 | } 87 | 88 | // SelectLabels returns the integer locations of all rows with the supplied labels within the supplied level. 89 | // If an error is encountered, returns a new slice of 0 length. 90 | func (s *Series) SelectLabels(labels []string, level int) []int { 91 | empty := make([]int, 0) 92 | err := s.ensureLevelPositions([]int{level}) 93 | if err != nil { 94 | if options.GetLogWarnings() { 95 | log.Printf("Series.SelectLabels(): %v", err) 96 | } 97 | return empty 98 | } 99 | s.index.Levels[level].UpdateLabelMap() 100 | include := make([]int, 0) 101 | for _, label := range labels { 102 | val, ok := s.index.Levels[level].LabelMap[label] 103 | if !ok { 104 | if options.GetLogWarnings() { 105 | log.Printf("Series.SelectLabels(): %v not in label map", label) 106 | } 107 | return empty 108 | } 109 | include = append(include, val...) 
def main():
    """Run the enabled benchmarks and write the results to stdout as JSON.

    The output maps sample size -> test name -> [display string, nanoseconds].
    """
    # Start tests
    results = {
        "100k": {
            "sum": sumTest(),
            # "sumx10": sumTest100k10x(),
            # "readCSVSum10x": readCSVSumTest10x(),
            "mean": meanTest(),
            "min": minTest(),
            "max": maxTest(),
            "std": stdTest(),
            # "readCSVSum": readCSVSumTest(),
        },
        "500k": {
            "sum2": sumTest500(),
            # "mean2": meanTest500(),
        },
        # "5m": {
        #     "sum": sumTest5m(),
        # }
    }
    json.dump(results, sys.stdout)


# timer computes the average duration across n tests
# returns the duration as string and nanoseconds
def timer(n):
    """Decorator factory: run the wrapped function n times and report its mean duration.

    The decorated function returns a (speed, nanoseconds) tuple instead of its own
    result: speed is the mean duration formatted in the largest unit that keeps the
    number >= 1 (ns, μs, ms or s), and nanoseconds is the same duration as an int.
    """
    def decorator(fn):
        def wrapper(*args, **kwargs):
            times = []
            for _ in range(n):
                start = datetime.datetime.now()
                fn(*args, **kwargs)
                end = datetime.datetime.now()
                times.append((end - start).total_seconds())
            duration = sum(times) / len(times)
            ns = 1000000000
            mcs = 1000000
            ms = 1000
            # The branches must be mutually exclusive: a plain `if` on the second
            # test would overwrite every sub-microsecond result with a μs string.
            if duration * mcs < 1:
                speed = "{:.1f}ns".format(duration * ns)
            elif duration * ms < 1:
                speed = "{:.1f}μs".format(duration * mcs)
            elif duration < 1:
                speed = "{:.1f}ms".format(duration * ms)
            else:
                speed = "{:.1f}s".format(duration)
            return speed, int(duration * ns)
        return wrapper
    return decorator


def get_filepath(s):
    """Return the path of the data file registered under key s in `files`.

    The path is resolved relative to the directory of the running script (sys.argv[0]).
    """
    basename = files[s]
    thisFile = sys.argv[0]
    path = os.path.join(os.path.dirname(thisFile), basename)
    return path
72 | '5m': '../dataRandom5m1Col.csv', 73 | } 74 | df100 = pd.read_csv(get_filepath('100k')) 75 | df100k10x = pd.read_csv(get_filepath('100k10x')) 76 | df500 = pd.read_csv(get_filepath('500k')) 77 | # df5m = pd.read_csv(get_filepath('5m')) 78 | 79 | 80 | @timer(1000) 81 | def sumTest(): 82 | s = df100.sum() 83 | assert round(s.iloc[0], 2) == 50408.63 84 | 85 | 86 | @timer(100) 87 | def sumTest100k10x(): 88 | s = df100k10x.sum() 89 | assert round(s.iloc[0], 2) == 50408.63 90 | 91 | 92 | @timer(100) 93 | def sumTest500(): 94 | s = df500.sum() 95 | assert round(s.iloc[0], 2) == 130598.19 96 | 97 | 98 | # @timer(20) 99 | # def sumTest5m(): 100 | # s = df5m.sum() 101 | # assert round(s.iloc[0], 2) == 2520431.67 102 | 103 | 104 | @timer(1000) 105 | def meanTest(): 106 | s = df100.mean() 107 | assert round(s.iloc[0], 2) == 0.5 108 | 109 | 110 | @timer(100) 111 | def meanTest500(): 112 | s = df500.mean() 113 | assert round(s.iloc[0], 2) == 0.26 114 | 115 | 116 | @timer(1000) 117 | def minTest(): 118 | s = df100.min() 119 | assert round(s.iloc[0], 2) == 0.0 120 | 121 | 122 | @timer(1000) 123 | def maxTest(): 124 | s = df100.max() 125 | assert round(s.iloc[0], 2) == 1.0 126 | 127 | 128 | @timer(1000) 129 | def stdTest(): 130 | s = df100.std() 131 | assert round(s.iloc[0], 2) == 0.29 132 | 133 | 134 | @timer(100) 135 | def medianTest(): 136 | s = df100.median() 137 | assert round(s.iloc[0], 2) == 0.5 138 | 139 | 140 | @timer(50) 141 | def readCSVSumTest(): 142 | df = pd.read_csv(get_filepath('100k')) 143 | s = df.sum() 144 | assert round(s.iloc[0], 2) == 50408.63 145 | 146 | 147 | @timer(20) 148 | def readCSVSumTest10x(): 149 | df = pd.read_csv(get_filepath('100k10x')) 150 | s = df.sum() 151 | assert round(s.iloc[0], 2) == 50408.63 152 | 153 | 154 | if __name__ == "__main__": 155 | main() 156 | -------------------------------------------------------------------------------- /series/constructor.go: -------------------------------------------------------------------------------- 
1 | package series 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | 7 | "github.com/ptiger10/pd/internal/index" 8 | "github.com/ptiger10/pd/internal/values" 9 | "github.com/ptiger10/pd/options" 10 | ) 11 | 12 | // New creates a new Series with the supplied values and an optional config. 13 | func New(data interface{}, config ...Config) (*Series, error) { 14 | var idx index.Index 15 | configuration := index.Config{} // Series config 16 | 17 | if data == nil { 18 | return newEmptySeries(), nil 19 | } 20 | 21 | // Handling config 22 | if config != nil { 23 | if len(config) > 1 { 24 | return newEmptySeries(), fmt.Errorf("series.New(): can supply at most one Config (%d > 1)", len(config)) 25 | } 26 | tmp := config[0] 27 | configuration = index.Config{ 28 | Name: tmp.Name, DataType: tmp.DataType, 29 | Index: tmp.Index, IndexName: tmp.IndexName, 30 | MultiIndex: tmp.MultiIndex, MultiIndexNames: tmp.MultiIndexNames, 31 | } 32 | } 33 | 34 | // Handling values 35 | container, err := values.InterfaceFactory(data) 36 | if err != nil { 37 | return newEmptySeries(), fmt.Errorf("series.New(): %v", err) 38 | } 39 | 40 | // Handling index 41 | // empty data: return empty index 42 | if lenValues := container.Values.Len(); lenValues == 0 { 43 | idx = index.New() 44 | // not empty data: use config 45 | } else { 46 | idx, err = index.NewFromConfig(configuration, lenValues) 47 | if err != nil { 48 | return newEmptySeries(), fmt.Errorf("series.New(): %v", err) 49 | } 50 | } 51 | 52 | s := &Series{ 53 | values: container.Values, 54 | index: idx, 55 | datatype: container.DataType, 56 | name: configuration.Name, 57 | } 58 | 59 | // Optional datatype conversion 60 | if configuration.DataType != options.None { 61 | s.values, err = values.Convert(s.values, configuration.DataType) 62 | if err != nil { 63 | return newEmptySeries(), fmt.Errorf("series.New(): %v", err) 64 | } 65 | s.datatype = configuration.DataType 66 | } 67 | 68 | s.Index = Index{s: s} 69 | s.InPlace = InPlace{s: s} 70 | 71 | // 
Alignment check 72 | if err := s.ensureAlignment(); err != nil { 73 | return newEmptySeries(), fmt.Errorf("series.New(): %v", err) 74 | } 75 | 76 | return s, err 77 | } 78 | 79 | // MustNew returns a new Series or logs an error and returns an empty Series. 80 | func MustNew(data interface{}, config ...Config) *Series { 81 | s, err := New(data, config...) 82 | if err != nil { 83 | if options.GetLogWarnings() { 84 | log.Printf("series.MustNew(): %v", err) 85 | } 86 | return newEmptySeries() 87 | } 88 | return s 89 | } 90 | 91 | func newEmptySeries() *Series { 92 | // ducks error because InterfaceFactory supports nil data 93 | container, _ := values.InterfaceFactory(nil) 94 | s := &Series{index: index.New(), values: container.Values, datatype: container.DataType} 95 | s.Index = Index{s: s} 96 | s.InPlace = InPlace{s: s} 97 | return s 98 | } 99 | 100 | // Copy creates a new deep copy of a Series. 101 | func (s *Series) Copy() *Series { 102 | idx := s.index.Copy() 103 | valsCopy := s.values.Copy() 104 | copyS := &Series{ 105 | values: valsCopy, 106 | index: idx, 107 | datatype: s.datatype, 108 | name: s.name, 109 | } 110 | copyS.Index = Index{s: copyS} 111 | copyS.InPlace = InPlace{s: copyS} 112 | return copyS 113 | } 114 | 115 | // [START semi-private methods] 116 | 117 | // FromInternalComponents is a semi-private method for hydrating Series within the DataFrame module. 118 | // The required inputs are not available to the caller. 119 | func FromInternalComponents(container values.Container, index index.Index, name string) *Series { 120 | s := &Series{ 121 | values: container.Values, 122 | index: index, 123 | datatype: container.DataType, 124 | name: name, 125 | } 126 | s.Index = Index{s: s} 127 | s.InPlace = InPlace{s: s} 128 | return s 129 | } 130 | 131 | // ToInternalComponents is a semi-private method for using a Series within the DataFrame module. 132 | // The required inputs are not available to the caller. 
133 | func (s *Series) ToInternalComponents() (values.Container, index.Index) { 134 | return values.Container{Values: s.values.Copy(), DataType: s.datatype}, s.index.Copy() 135 | } 136 | 137 | // [END semi-private methods] 138 | -------------------------------------------------------------------------------- /series/merge_test.go: -------------------------------------------------------------------------------- 1 | package series 2 | 3 | import ( 4 | "bytes" 5 | "log" 6 | "os" 7 | "reflect" 8 | "strings" 9 | "testing" 10 | ) 11 | 12 | func TestSeries_Join(t *testing.T) { 13 | single := MustNew("foo", Config{Index: []int{1}, IndexName: "foobar"}) 14 | single2 := MustNew("bar", Config{Index: []int{2}, IndexName: "corge"}) 15 | single3 := MustNew(7.11, Config{Index: []int{2}, IndexName: "corge"}) 16 | multi := MustNew("foo", Config{MultiIndex: []interface{}{[]string{"A"}, []int{1}}, MultiIndexNames: []string{"foobar", "corge"}}) 17 | multi2 := MustNew("bar", Config{MultiIndex: []interface{}{[]string{"B"}, []int{2}}, MultiIndexNames: []string{"waldo", "fred"}}) 18 | type args struct { 19 | s2 *Series 20 | } 21 | type want struct { 22 | series *Series 23 | err bool 24 | } 25 | var tests = []struct { 26 | name string 27 | input *Series 28 | args args 29 | want want 30 | }{ 31 | {name: "singleIndex", 32 | input: single, args: args{s2: single2}, 33 | want: want{series: MustNew([]string{"foo", "bar"}, Config{Index: []int{1, 2}, IndexName: "foobar"}), err: false}}, 34 | {"replace empty s", 35 | newEmptySeries(), args{s2: single2}, 36 | want{MustNew([]string{"bar"}, Config{Index: []int{2}, IndexName: "corge"}), false}}, 37 | {"singleIndex convert", 38 | single, args{single3}, 39 | want{MustNew([]string{"foo", "7.11"}, Config{Index: []int{1, 2}, IndexName: "foobar"}), false}}, 40 | {"multiIndex", 41 | multi, args{multi2}, 42 | want{MustNew([]string{"foo", "bar"}, Config{MultiIndex: []interface{}{[]string{"A", "B"}, []int{1, 2}}, MultiIndexNames: []string{"foobar", "corge"}}), 
false}}, 43 | {"fail: empty s2", 44 | single, args{newEmptySeries()}, 45 | want{single, true}}, 46 | {"fail: nil s2", 47 | single, args{&Series{}}, 48 | want{single, true}}, 49 | {"fail: invalid num levels", 50 | single, args{multi}, 51 | want{single, true}}, 52 | } 53 | for _, tt := range tests { 54 | t.Run(tt.name, func(t *testing.T) { 55 | s := tt.input.Copy() 56 | sArchive := tt.input.Copy() 57 | err := s.InPlace.Join(tt.args.s2) 58 | if (err != nil) != tt.want.err { 59 | t.Errorf("InPlace.Join() error = %v, want %v", err, tt.want.err) 60 | return 61 | } 62 | 63 | if !Equal(s, tt.want.series) { 64 | t.Errorf("InPlace.Join() got %v, want %v", s, tt.want.series) 65 | } 66 | 67 | sCopy, err := sArchive.Join(tt.args.s2) 68 | if (err != nil) != tt.want.err { 69 | t.Errorf("Series.Join() error = %v, want %v", err, tt.want.err) 70 | return 71 | } 72 | if !Equal(sCopy, tt.want.series) { 73 | t.Errorf("Series.Join() got %v, want %v", sCopy, tt.want.series) 74 | } 75 | if !strings.Contains(tt.name, "fail") { 76 | if !strings.Contains(tt.name, "same") { 77 | if Equal(sArchive, sCopy) { 78 | t.Errorf("Series.Join() retained access to original, want copy") 79 | } 80 | } 81 | } 82 | }) 83 | } 84 | } 85 | 86 | func TestSeries_LookupSeries(t *testing.T) { 87 | multi := MustNew([]string{"foo", "bar"}, Config{MultiIndex: []interface{}{[]string{"baz", "qux"}, []int{1, 2}}}) 88 | multi2 := MustNew("corge", Config{MultiIndex: []interface{}{[]string{"baz"}, []int{1}}}) 89 | type args struct { 90 | s2 *Series 91 | } 92 | tests := []struct { 93 | name string 94 | input *Series 95 | args args 96 | want *Series 97 | wantFail bool 98 | }{ 99 | {name: "single", input: MustNew("foo"), args: args{s2: MustNew("bar")}, 100 | want: MustNew("bar"), wantFail: false}, 101 | {"multi", multi, args{multi2}, 102 | MustNew([]string{"corge", ""}, Config{MultiIndex: []interface{}{[]string{"baz", "qux"}, []int{1, 2}}}), false}, 103 | {"fail", MustNew("foo"), args{multi2}, 104 | newEmptySeries(), true}, 
105 | } 106 | for _, tt := range tests { 107 | t.Run(tt.name, func(t *testing.T) { 108 | var buf bytes.Buffer 109 | log.SetOutput(&buf) 110 | defer log.SetOutput(os.Stderr) 111 | 112 | if got := tt.input.LookupSeries(tt.args.s2); !reflect.DeepEqual(got, tt.want) { 113 | t.Errorf("Series.LookupSeries() = %v, want %v", got.index, tt.want.index) 114 | } 115 | if tt.wantFail { 116 | if buf.String() == "" { 117 | t.Errorf("Series.LookupSeries() returned no log message, want log due to fail") 118 | } 119 | } 120 | }) 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /internal/values/type-interface.go: -------------------------------------------------------------------------------- 1 | package values 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "reflect" 7 | "time" 8 | 9 | "github.com/ptiger10/pd/options" 10 | ) 11 | 12 | // [START Convenience Functions] 13 | 14 | func isNullInterface(i interface{}) bool { 15 | switch i.(type) { 16 | case string: 17 | s := i.(string) 18 | if isNullString(s) { 19 | return true 20 | } 21 | case float32, float64: 22 | f := reflect.ValueOf(i).Float() 23 | if math.IsNaN(f) { 24 | return true 25 | } 26 | } 27 | return false 28 | } 29 | 30 | func (vals *interfaceValues) Less(i, j int) bool { 31 | if fmt.Sprint((*vals)[i].v) < fmt.Sprint((*vals)[j].v) { 32 | return true 33 | } 34 | return false 35 | } 36 | 37 | // [END Convenience Functions] 38 | 39 | // newInterface creates an interfaceValue from atomic interface{} value 40 | func newInterface(val interface{}) interfaceValue { 41 | if isNullInterface(val) { 42 | return interfaceValue{val, true} 43 | } 44 | return interfaceValue{val, false} 45 | } 46 | 47 | // [START Converters] 48 | func (val interfaceValue) toFloat64() float64Value { 49 | if val.null { 50 | return float64Value{math.NaN(), true} 51 | } 52 | switch val.v.(type) { 53 | case float32, float64: 54 | v := reflect.ValueOf(val.v).Float() 55 | return newFloat64(v) 56 | case int, int8, 
int16, int32, int64: 57 | v := reflect.ValueOf(val.v).Int() 58 | return newInt64(v).toFloat64() 59 | case uint, uint8, uint16, uint32, uint64: 60 | v := reflect.ValueOf(val.v).Uint() 61 | return newInt64(int64(v)).toFloat64() 62 | case string: 63 | return newString(val.v.(string)).toFloat64() 64 | case bool: 65 | return newBool(val.v.(bool)).toFloat64() 66 | case time.Time: 67 | return newDateTime(val.v.(time.Time)).toFloat64() 68 | } 69 | return float64Value{math.NaN(), true} 70 | } 71 | 72 | func (val interfaceValue) toInt64() int64Value { 73 | if val.null { 74 | return int64Value{0, true} 75 | } 76 | switch val.v.(type) { 77 | case float32, float64: 78 | v := reflect.ValueOf(val.v).Float() 79 | return newFloat64(v).toInt64() 80 | case int, int8, int16, int32, int64: 81 | v := reflect.ValueOf(val.v).Int() 82 | return newInt64(v) 83 | case uint, uint8, uint16, uint32, uint64: 84 | v := reflect.ValueOf(val.v).Uint() 85 | return int64Value{int64(v), false} 86 | case string: 87 | return newString(val.v.(string)).toInt64() 88 | case bool: 89 | return newBool(val.v.(bool)).toInt64() 90 | case time.Time: 91 | return newDateTime(val.v.(time.Time)).toInt64() 92 | } 93 | return int64Value{0, true} 94 | } 95 | 96 | func (val interfaceValue) toString() stringValue { 97 | if isNullString(fmt.Sprint(val.v)) || val.null { 98 | return stringValue{options.GetDisplayStringNullFiller(), true} 99 | } 100 | return stringValue{fmt.Sprint(val.v), false} 101 | } 102 | 103 | func (val interfaceValue) toBool() boolValue { 104 | if val.null { 105 | return boolValue{false, true} 106 | } 107 | switch val.v.(type) { 108 | case float32, float64: 109 | v := reflect.ValueOf(val.v).Float() 110 | return newFloat64(v).toBool() 111 | case int, int8, int16, int32, int64: 112 | v := reflect.ValueOf(val.v).Int() 113 | return newInt64(v).toBool() 114 | case uint, uint8, uint16, uint32, uint64: 115 | v := reflect.ValueOf(val.v).Uint() 116 | return newInt64(int64(v)).toBool() 117 | case string: 118 | 
return newString(val.v.(string)).toBool() 119 | case bool: 120 | return newBool(val.v.(bool)) 121 | case time.Time: 122 | return newDateTime(val.v.(time.Time)).toBool() 123 | } 124 | return boolValue{false, true} 125 | } 126 | 127 | func (val interfaceValue) toDateTime() dateTimeValue { 128 | if val.null { 129 | return dateTimeValue{time.Time{}, true} 130 | } 131 | switch val.v.(type) { 132 | case float32, float64: 133 | v := reflect.ValueOf(val.v).Float() 134 | return newFloat64(v).toDateTime() 135 | case int, int8, int16, int32, int64: 136 | v := reflect.ValueOf(val.v).Int() 137 | return newInt64(v).toDateTime() 138 | case uint, uint8, uint16, uint32, uint64: 139 | v := reflect.ValueOf(val.v).Uint() 140 | return newInt64(int64(v)).toDateTime() 141 | case string: 142 | return newString(val.v.(string)).toDateTime() 143 | case bool: 144 | return newBool(val.v.(bool)).toDateTime() 145 | case time.Time: 146 | return newDateTime(val.v.(time.Time)) 147 | } 148 | return dateTimeValue{time.Time{}, true} 149 | } 150 | 151 | func (val interfaceValue) toInterface() interfaceValue { 152 | return val 153 | } 154 | 155 | // [END Converters] 156 | 157 | // emptyValues returns empty interface values 158 | func emptyValues() Values { 159 | return &interfaceValues{} 160 | } 161 | -------------------------------------------------------------------------------- /dataframe/constructor.go: -------------------------------------------------------------------------------- 1 | package dataframe 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | 7 | "github.com/ptiger10/pd/internal/index" 8 | "github.com/ptiger10/pd/internal/values" 9 | "github.com/ptiger10/pd/options" 10 | "github.com/ptiger10/pd/series" 11 | ) 12 | 13 | // New creates a new DataFrame with default column names. 
14 | func New(data []interface{}, config ...Config) (*DataFrame, error) { 15 | var vals []values.Container 16 | var idx index.Index 17 | var cols index.Columns 18 | configuration := index.Config{} 19 | tmp := Config{} 20 | var err error 21 | 22 | if len(data) == 0 { 23 | return newEmptyDataFrame(), nil 24 | } 25 | // Handling config 26 | if config != nil { 27 | if len(config) > 1 { 28 | return newEmptyDataFrame(), fmt.Errorf("dataframe.New(): can supply at most one Config (%d > 1)", len(config)) 29 | } 30 | tmp = config[0] 31 | configuration = index.Config{ 32 | Name: tmp.Name, 33 | DataType: tmp.DataType, 34 | Index: tmp.Index, IndexName: tmp.IndexName, 35 | MultiIndex: tmp.MultiIndex, MultiIndexNames: tmp.MultiIndexNames, 36 | Col: tmp.Col, ColName: tmp.ColName, 37 | MultiCol: tmp.MultiCol, MultiColNames: tmp.MultiColNames, 38 | } 39 | } 40 | 41 | // Handling map 42 | isSplit, extractedData, extractedColumns := values.MapSplitter(data) 43 | if isSplit { 44 | data = extractedData 45 | configuration.Col = extractedColumns 46 | } 47 | 48 | // Handling values 49 | vals, err = values.InterfaceSliceFactory(data, tmp.Manual, configuration.DataType) 50 | if err != nil { 51 | return newEmptyDataFrame(), fmt.Errorf("dataframe.New(): %v", err) 52 | } 53 | 54 | // Handling index 55 | idx, err = index.NewFromConfig(configuration, vals[0].Values.Len()) 56 | if err != nil { 57 | return newEmptyDataFrame(), fmt.Errorf("dataframe.New(): %v", err) 58 | } 59 | //Handling columns 60 | cols, err = index.NewColumnsFromConfig(configuration, len(data)) 61 | if err != nil { 62 | return newEmptyDataFrame(), fmt.Errorf("dataframe.New(): %v", err) 63 | } 64 | 65 | df := &DataFrame{ 66 | vals: vals, 67 | index: idx, 68 | cols: cols, 69 | name: configuration.Name, 70 | } 71 | 72 | df.Columns = Columns{df: df} 73 | df.Index = Index{df: df} 74 | df.InPlace = InPlace{df: df} 75 | 76 | if err := df.ensureAlignment(); err != nil { 77 | return newEmptyDataFrame(), fmt.Errorf("dataframe.New(): %v", 
err) 78 | } 79 | 80 | return df, err 81 | } 82 | 83 | func newEmptyDataFrame() *DataFrame { 84 | df := &DataFrame{vals: nil, index: index.New(), cols: index.NewColumns()} 85 | df.Columns = Columns{df: df} 86 | df.Index = Index{df: df} 87 | df.InPlace = InPlace{df: df} 88 | return df 89 | } 90 | 91 | // MustNew constructs a new DataFrame or logs an error and returns an empty DataFrame. 92 | func MustNew(data []interface{}, config ...Config) *DataFrame { 93 | df, err := New(data, config...) 94 | if err != nil { 95 | if options.GetLogWarnings() { 96 | log.Printf("dataframe.MustNew(): %v", err) 97 | } 98 | return newEmptyDataFrame() 99 | } 100 | return df 101 | } 102 | 103 | // newFromComponents constructs a dataframe from its constituent parts but returns an empty dataframe if series is nil 104 | func newFromComponents(vals []values.Container, idx index.Index, cols index.Columns, name string) *DataFrame { 105 | if vals == nil { 106 | return newEmptyDataFrame() 107 | } 108 | df := &DataFrame{ 109 | vals: vals, 110 | index: idx, 111 | cols: cols, 112 | name: name, 113 | } 114 | df.Columns = Columns{df: df} 115 | df.Index = Index{df: df} 116 | df.InPlace = InPlace{df: df} 117 | 118 | return df 119 | } 120 | 121 | func (df *DataFrame) valsAligned() error { 122 | if df.NumCols() == 0 { 123 | return nil 124 | } 125 | lvl0 := df.vals[0].Values.Len() 126 | for i := 1; i < df.NumCols(); i++ { 127 | if cmpLvl := df.vals[i].Values.Len(); lvl0 != cmpLvl { 128 | return fmt.Errorf("df.valsAligned(): values container at %v must have same number of labels as container 0, %d != %d", 129 | i, cmpLvl, lvl0) 130 | } 131 | } 132 | return nil 133 | } 134 | 135 | // Copy creates a new deep copy of a Series. 
136 | func (df *DataFrame) Copy() *DataFrame { 137 | var valsCopy []values.Container 138 | for i := 0; i < len(df.vals); i++ { 139 | valsCopy = append(valsCopy, df.vals[i].Copy()) 140 | } 141 | idxCopy := df.index.Copy() 142 | colsCopy := df.cols.Copy() 143 | dfCopy := &DataFrame{ 144 | vals: valsCopy, 145 | index: idxCopy, 146 | cols: colsCopy, 147 | name: df.name, 148 | } 149 | dfCopy.Columns = Columns{df: dfCopy} 150 | dfCopy.Index = Index{df: dfCopy} 151 | dfCopy.InPlace = InPlace{df: dfCopy} 152 | return dfCopy 153 | } 154 | 155 | // hydrateSeries converts a column of values.Values into a Series with the same index as df. 156 | func (df *DataFrame) hydrateSeries(col int) *series.Series { 157 | return series.FromInternalComponents( 158 | df.vals[col], df.index, df.cols.Name(col)) 159 | } 160 | -------------------------------------------------------------------------------- /guides/Options.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import \"github.com/ptiger10/pd/options\"\n", 10 | "import \"github.com/ptiger10/pd\"" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "name": "stdout", 20 | "output_type": "stream", 21 | "text": [ 22 | "0 foo\n", 23 | "1 bar\n", 24 | "2 foobarb...\n", 25 | "\n", 26 | "datatype: string\n", 27 | "\n", 28 | "\n" 29 | ] 30 | } 31 | ], 32 | "source": [ 33 | "options.RestoreDefaults()\n", 34 | "options.SetDisplayMaxWidth(10)\n", 35 | "pd.Series([]string{\"foo\", \"bar\", \"foobarbazbang\"})" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "0 1\n", 48 | "1 2\n", 49 | "2 3\n", 50 | "\n", 51 | "datatype: float64\n", 52 | "\n", 53 | "\n" 54 | ] 
55 | } 56 | ], 57 | "source": [ 58 | "options.RestoreDefaults()\n", 59 | "options.SetDisplayFloatPrecision(0)\n", 60 | "pd.Series([]float64{1, 2, 3})" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 8, 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "name": "stdout", 70 | "output_type": "stream", 71 | "text": [ 72 | "foo A 1.00\n", 73 | " B 2.00\n", 74 | "bar C 3.00\n", 75 | "\n", 76 | "datatype: float64\n", 77 | "\n", 78 | "\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "pd.Series([]float64{1, 2, 3}, pd.Config{\n", 84 | " MultiIndex: []interface{}{[]string{\"foo\", \"foo\", \"bar\"}, []string{\"A\", \"B\", \"C\"}}})" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 9, 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "name": "stdout", 94 | "output_type": "stream", 95 | "text": [ 96 | "foo A 1.00\n", 97 | "foo B 2.00\n", 98 | "bar C 3.00\n", 99 | "\n", 100 | "datatype: float64\n", 101 | "\n", 102 | "\n" 103 | ] 104 | } 105 | ], 106 | "source": [ 107 | "options.RestoreDefaults()\n", 108 | "options.SetDisplayRepeatedLabels(true)\n", 109 | "pd.Series([]float64{1, 2, 3}, pd.Config{\n", 110 | " MultiIndex: []interface{}{[]string{\"foo\", \"foo\", \"bar\"}, []string{\"A\", \"B\", \"C\"}}})" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 5, 116 | "metadata": { 117 | "scrolled": true 118 | }, 119 | "outputs": [ 120 | { 121 | "name": "stdout", 122 | "output_type": "stream", 123 | "text": [ 124 | "0 #ERR\n", 125 | "1 foo\n", 126 | "\n", 127 | "datatype: string\n", 128 | "\n", 129 | "\n" 130 | ] 131 | } 132 | ], 133 | "source": [ 134 | "options.RestoreDefaults()\n", 135 | "options.SetDisplayStringNullFiller(\"#ERR\")\n", 136 | "pd.Series([]string{\"\", \"foo\"})" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 6, 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "name": "stdout", 146 | "output_type": "stream", 147 | "text": [ 148 | "0 01/01/2019\n", 149 | "\n", 
150 | "datatype: dateTime\n", 151 | "\n", 152 | "\n" 153 | ] 154 | } 155 | ], 156 | "source": [ 157 | "import \"time\"\n", 158 | "options.RestoreDefaults()\n", 159 | "options.SetDisplayTimeFormat(\"01/02/2006\")\n", 160 | "pd.Series([]time.Time{time.Date(2019,1,1,0,0,0,0,time.UTC)})" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 7, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "0 NaN\n", 173 | "1 foo\n", 174 | "\n", 175 | "datatype: string\n", 176 | "\n", 177 | "\n" 178 | ] 179 | } 180 | ], 181 | "source": [ 182 | "options.RestoreDefaults()\n", 183 | "options.SetStringNullValues([]string{\"#REF\"})\n", 184 | "pd.Series([]string{\"#REF\", \"foo\"})" 185 | ] 186 | } 187 | ], 188 | "metadata": { 189 | "kernelspec": { 190 | "display_name": "Go (lgo)", 191 | "language": "go", 192 | "name": "lgo" 193 | }, 194 | "language_info": { 195 | "file_extension": "", 196 | "mimetype": "", 197 | "name": "go", 198 | "version": "" 199 | } 200 | }, 201 | "nbformat": 4, 202 | "nbformat_minor": 2 203 | } 204 | -------------------------------------------------------------------------------- /benchmarking/profiler/benchmarks/config.go: -------------------------------------------------------------------------------- 1 | // +build benchmarks 2 | 3 | package benchmarks 4 | 5 | import ( 6 | "log" 7 | "math" 8 | "os" 9 | "path/filepath" 10 | "runtime" 11 | 12 | "github.com/ptiger10/pd" 13 | "github.com/ptiger10/pd/dataframe" 14 | ) 15 | 16 | // Descriptions of the benchmarking tests 17 | var Descriptions = map[string]desc{ 18 | "sum": {1, "Sum one column"}, 19 | "sumx10": {2, "Sum 10 columns individually"}, 20 | "mean": {3, "Simple mean of one column"}, 21 | "min": {4, "Min of one column"}, 22 | "max": {5, "Max of one column"}, 23 | "std": {6, "Standard deviation of one column"}, 24 | "readCSVSum": {7, "Read in CSV then calculate sum"}, 25 | "readCSVSum10x": {7, "Read CSV, sum 10 cols 
individually"},
	"sum2":          {8, "Sum two columns"},
	"mean2":         {9, "Mean of two columns"},
}

// SampleSizes lists every sample size that may appear in the comparison table,
// in the order in which the table shows them.
var SampleSizes = []string{
	"100k",
	"500k",
	// "5m",
}

// Frames populated from the benchmark CSV files by the read* functions in this file.
var (
	df100k    *dataframe.DataFrame
	df100k10x *dataframe.DataFrame
	df500k    *dataframe.DataFrame
	df5m      *dataframe.DataFrame
)

// read100k loads the "100k" data set into df100k and then verifies position 0
// of several aggregates (sum, mean, median, min, max, std), each rounded to
// two decimals. A read error or the first mismatch ends the program through
// log.Fatal / log.Fatalf.
func read100k() {
	loaded, err := pd.ReadCSV(getPath("100k"), pd.ReadOptions{HeaderRows: 1})
	df100k = loaded
	if err != nil {
		log.Fatal(err)
	}

	// round2 asserts that v is a float64 and rounds it to two decimal places.
	round2 := func(v interface{}) float64 {
		return math.Round(v.(float64)*100) / 100
	}

	// Each aggregate is computed lazily so that evaluation stops at the first failure.
	checks := []struct {
		format string
		got    func() float64
		want   float64
	}{
		{
			"profiler/config.go: reading in test data: df.Sum() got %v, want %v",
			func() float64 { return round2(df100k.Sum().At(0)) },
			50408.63,
		},
		{
			"profiler/config.go: reading in test data: df.Mean() got %v, want %v",
			func() float64 { return round2(df100k.Mean().At(0)) },
			0.5,
		},
		{
			"profiler/config.go: reading in test data: df.Median() got %v, want %v",
			func() float64 { return round2(df100k.Median().At(0)) },
			0.50,
		},
		{
			"profiler/config.go: reading in test data: df.Min() got %v, want %v",
			func() float64 { return round2(df100k.Min().At(0)) },
			0.0,
		},
		{
			"profiler/config.go: reading in test data: df.Max() got %v, want %v",
			func() float64 { return round2(df100k.Max().At(0)) },
			1.0,
		},
		{
			"profiler/config.go: reading in test data: df.Std() got %v, want %v",
			func() float64 { return round2(df100k.Std().At(0)) },
			0.29,
		},
	}
	for _, check := range checks {
		if got := check.got(); got != check.want {
			log.Fatalf(check.format, got, check.want)
		}
	}
}

func read100k10x() {
	var err error
	df100k10x, err = 
pd.ReadCSV(getPath("100k10x"), pd.ReadOptions{HeaderRows: 1}) 90 | if err != nil { 91 | log.Fatal(err) 92 | } 93 | 94 | got := math.Round(df100k10x.Sum().At(0).(float64)*100) / 100 95 | want := 50408.63 96 | if got != want { 97 | log.Fatalf("profiler/config.go: reading in test data: df.Sum() got %v, want %v", got, want) 98 | } 99 | } 100 | 101 | func read500k() { 102 | var err error 103 | df500k, err = pd.ReadCSV(getPath("500k"), pd.ReadOptions{HeaderRows: 1}) 104 | if err != nil { 105 | log.Fatal(err) 106 | } 107 | 108 | got := math.Round(df500k.Sum().At(0).(float64)*100) / 100 109 | want := 130598.19 110 | if got != want { 111 | log.Fatalf("profiler/config.go: reading in test data: df.Sum500() got %v, want %v", got, want) 112 | } 113 | 114 | got = math.Round(df500k.Mean().At(0).(float64)*100) / 100 115 | want = 0.26 116 | if got != want { 117 | log.Fatalf("profiler/config.go: reading in test data: df.Mean() got %v, want %v", got, want) 118 | } 119 | } 120 | 121 | func read5m() { 122 | var err error 123 | df5m, err = pd.ReadCSV(getPath("5m"), pd.ReadOptions{HeaderRows: 1}) 124 | if err != nil { 125 | log.Fatal(err) 126 | } 127 | 128 | got := math.Round(df5m.Sum().At(0).(float64)*100) / 100 129 | want := 2520431.67 130 | if got != want { 131 | log.Fatalf("profiler/config.go: reading in test data: df.Sum() got %v, want %v", got, want) 132 | } 133 | } 134 | 135 | // ReadData initializes data for use in comparison tetss 136 | func ReadData() { 137 | read100k() 138 | read100k10x() 139 | read500k() 140 | // read5m() 141 | } 142 | 143 | var files = map[string]string{ 144 | "100k": "../dataRandom100k1Col.csv", 145 | "100k10x": "../dataRandom100k10Col.csv", 146 | "500k": "../dataRandom500k2Col.csv", 147 | "5m": "../dataRandom5m1Col.csv", 148 | } 149 | 150 | func getPath(s string) string { 151 | basename, ok := files[s] 152 | if !ok { 153 | log.Fatalf("profiler/config.go: reading in test data: df.%v not in %v", s, files) 154 | } 155 | _, thisFile, _, _ := runtime.Caller(0) 
156 | path := filepath.Join(filepath.Dir(thisFile), basename) 157 | if _, err := os.Stat(path); os.IsNotExist(err) { 158 | log.Fatalf("profiler/config.go: reading in test data: df.File does not exist at %s", path) 159 | } 160 | return path 161 | } 162 | -------------------------------------------------------------------------------- /series/filter_test.go: -------------------------------------------------------------------------------- 1 | package series 2 | 3 | import ( 4 | "reflect" 5 | "strings" 6 | "testing" 7 | "time" 8 | ) 9 | 10 | func TestApply(t *testing.T) { 11 | s := MustNew([]float64{1, 2, 3}) 12 | sArchive := s.Copy() 13 | 14 | s.InPlace.Apply(func(val interface{}) interface{} { 15 | v, ok := val.(float64) 16 | if !ok { 17 | return "" 18 | } 19 | return ((v - s.Mean()) / s.Std()) 20 | }) 21 | want := MustNew([]float64{-1.224744871391589, 0, 1.224744871391589}) 22 | if !Equal(s, want) { 23 | t.Errorf("InPlace.Apply() returned %v, want %v", s, want) 24 | } 25 | 26 | sCopy := sArchive.Apply(func(val interface{}) interface{} { 27 | v, ok := val.(float64) 28 | if !ok { 29 | return "" 30 | } 31 | return ((v - sArchive.Mean()) / sArchive.Std()) 32 | }) 33 | if !Equal(sCopy, want) { 34 | t.Errorf("Apply() returned %v, want %v", sCopy, want) 35 | } 36 | if Equal(sArchive, sCopy) { 37 | t.Errorf("Apply() retained access to original, want copy") 38 | } 39 | } 40 | 41 | func TestApply_riskier(t *testing.T) { 42 | s := MustNew([]float64{1, 2, 3}) 43 | got := s.Apply(func(val interface{}) interface{} { 44 | return (val.(float64) - s.Mean()) / s.Std() 45 | }) 46 | want := MustNew([]float64{-1.224744871391589, 0, 1.224744871391589}) 47 | if !Equal(got, want) { 48 | t.Errorf("Apply() returned %v, want %v", got, want) 49 | } 50 | } 51 | 52 | func TestFilterFloat64(t *testing.T) { 53 | tests := []struct { 54 | name string 55 | fn func(*Series, float64) []int 56 | arg float64 57 | want []int 58 | }{ 59 | {"GT", (*Series).GT, 2, []int{2}}, 60 | {"GTE", (*Series).GTE, 2, 
[]int{1, 2}}, 61 | {"LT", (*Series).LT, 2, []int{0}}, 62 | {"LTE", (*Series).LTE, 2, []int{0, 1}}, 63 | {"EQ", (*Series).EQ, 2, []int{1}}, 64 | {"NEQ", (*Series).NEQ, 2, []int{0, 2}}, 65 | } 66 | for _, tt := range tests { 67 | t.Run(tt.name, func(t *testing.T) { 68 | s := MustNew([]float64{1, 2, 3}) 69 | got := tt.fn(s, tt.arg) 70 | if !reflect.DeepEqual(got, tt.want) { 71 | t.Errorf("s.Filter() got %v, want %v for arg %v", got, tt.want, tt.arg) 72 | } 73 | }) 74 | } 75 | } 76 | 77 | func TestFilterBool(t *testing.T) { 78 | tests := []struct { 79 | name string 80 | fn func(*Series) []int 81 | want []int 82 | }{ 83 | {"True", (*Series).True, []int{1}}, 84 | {"False", (*Series).False, []int{0}}, 85 | } 86 | for _, tt := range tests { 87 | t.Run(tt.name, func(t *testing.T) { 88 | s := MustNew([]bool{false, true}) 89 | got := tt.fn(s) 90 | if !reflect.DeepEqual(got, tt.want) { 91 | t.Errorf("s.Filter() got %v, want %v", got, tt.want) 92 | } 93 | }) 94 | } 95 | } 96 | 97 | func TestFilterDateTime(t *testing.T) { 98 | tests := []struct { 99 | name string 100 | fn func(*Series, time.Time) []int 101 | arg time.Time 102 | want []int 103 | }{ 104 | {"Before", (*Series).Before, time.Date(2019, 1, 2, 0, 0, 0, 0, time.UTC), []int{0}}, 105 | {"After", (*Series).After, time.Date(2019, 1, 2, 0, 0, 0, 0, time.UTC), []int{1}}, 106 | } 107 | for _, tt := range tests { 108 | t.Run(tt.name, func(t *testing.T) { 109 | s := MustNew([]time.Time{time.Date(2019, 1, 1, 0, 0, 0, 0, time.UTC), time.Date(2019, 3, 1, 0, 0, 0, 0, time.UTC)}) 110 | got := tt.fn(s, tt.arg) 111 | if !reflect.DeepEqual(got, tt.want) { 112 | t.Errorf("s.Filter() got %v, want %v", got, tt.want) 113 | } 114 | }) 115 | } 116 | } 117 | 118 | func TestFilter_Contains(t *testing.T) { 119 | s := MustNew([]string{"foo", "bar", "baz"}) 120 | got := s.Contains("ba") 121 | want := []int{1, 2} 122 | if !reflect.DeepEqual(got, want) { 123 | t.Errorf("s.Contains() got %v, want %v", got, want) 124 | } 125 | 126 | got = 
s.InList([]string{"foo", "bar"}) 127 | want = []int{0, 1} 128 | if !reflect.DeepEqual(got, want) { 129 | t.Errorf("s.In() got %v, want %v", got, want) 130 | } 131 | } 132 | 133 | func TestFilter_float(t *testing.T) { 134 | s := MustNew([]float64{1, 2, 3}) 135 | got := s.Filter(func(val interface{}) bool { 136 | v, ok := val.(float64) 137 | if !ok { 138 | return false 139 | } 140 | if v > 2 { 141 | return true 142 | } 143 | return false 144 | }) 145 | want := []int{2} 146 | if !reflect.DeepEqual(got, want) { 147 | t.Errorf("s.Filter() got %v, want %v", got, want) 148 | } 149 | } 150 | 151 | func TestFilter_string(t *testing.T) { 152 | s := MustNew([]string{"bamboo", "leaves", "taboo"}) 153 | got := s.Filter(func(val interface{}) bool { 154 | v, ok := val.(string) 155 | if !ok { 156 | return false 157 | } 158 | if strings.HasSuffix(v, "boo") { 159 | return true 160 | } 161 | return false 162 | }) 163 | want := []int{0, 2} 164 | if !reflect.DeepEqual(got, want) { 165 | t.Errorf("s.Filter() got %v, want %v", got, want) 166 | } 167 | } 168 | 169 | func TestFilter_string_riskier(t *testing.T) { 170 | s := MustNew([]string{"bamboo", "leaves", "taboo"}) 171 | got := s.Filter(func(val interface{}) bool { 172 | if strings.HasSuffix(val.(string), "boo") { 173 | return true 174 | } 175 | return false 176 | }) 177 | want := []int{0, 2} 178 | if !reflect.DeepEqual(got, want) { 179 | t.Errorf("s.Filter() got %v, want %v", got, want) 180 | } 181 | } 182 | -------------------------------------------------------------------------------- /dataframe/columns.go: -------------------------------------------------------------------------------- 1 | package dataframe 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | 7 | "github.com/ptiger10/pd/internal/index" 8 | 9 | "github.com/ptiger10/pd/options" 10 | ) 11 | 12 | // Values returns an []string of the values at each level of the cols. 
13 | func (col Columns) Values() [][]string { 14 | ret := make([][]string, col.df.ColLevels()) 15 | for j := 0; j < col.df.ColLevels(); j++ { 16 | ret[j] = col.df.cols.Levels[j].Labels 17 | } 18 | return ret 19 | } 20 | 21 | // Reorder reorders the columns in the order in which the labels are supplied and excludes any unsupplied labels. 22 | // Reorder looks for these labels in level 0 and modifies the DataFrame in place. 23 | func (col Columns) Reorder(labels []string) { 24 | positions := col.df.SelectCols(labels, 0) 25 | col.df.InPlace.SubsetColumns(positions) 26 | } 27 | 28 | // SwapLevels swaps two column levels and modifies the cols in place. 29 | func (col Columns) SwapLevels(i, j int) error { 30 | if err := col.df.ensureColumnLevelPositions([]int{i, j}); err != nil { 31 | return fmt.Errorf("Columns.SwapLevels(): %v", err) 32 | } 33 | col.df.cols.Levels[i], col.df.cols.Levels[j] = col.df.cols.Levels[j], col.df.cols.Levels[i] 34 | col.df.cols.Refresh() 35 | return nil 36 | } 37 | 38 | // At returns the cols values at a specified col level and column position but returns nil if either integer is out of range. 39 | func (col Columns) At(level int, column int) string { 40 | if err := col.df.ensureColumnLevelPositions([]int{level}); err != nil { 41 | if options.GetLogWarnings() { 42 | log.Printf("Columns.At(): %v", err) 43 | } 44 | return "" 45 | } 46 | if err := col.df.ensureColumnPositions([]int{column}); err != nil { 47 | if options.GetLogWarnings() { 48 | log.Printf("Columns.At(): %v", err) 49 | } 50 | return "" 51 | } 52 | return col.df.cols.Levels[level].Labels[column] 53 | } 54 | 55 | // RenameLevel renames an cols level in place but does not change anything if level is out of range. 
56 | func (col Columns) RenameLevel(level int, name string) error { 57 | if err := col.df.ensureColumnLevelPositions([]int{level}); err != nil { 58 | return fmt.Errorf("df.cols.RenameLevel(): %v", err) 59 | } 60 | col.df.cols.Levels[level].Name = name 61 | col.df.cols.Refresh() 62 | return nil 63 | } 64 | 65 | // InsertLevel inserts a level into the cols and modifies the DataFrame in place. 66 | func (col Columns) InsertLevel(pos int, labels []string, name string) error { 67 | if err := col.df.cols.InsertLevel(pos, labels, name); err != nil { 68 | return fmt.Errorf("df.Column.InsertLevel(): %v", err) 69 | } 70 | return nil 71 | } 72 | 73 | // AppendLevel adds a new cols level to the end of the current cols and modifies the DataFrame in place. 74 | func (col Columns) AppendLevel(labels []string, name string) error { 75 | err := col.InsertLevel(col.df.ColLevels(), labels, name) 76 | if err != nil { 77 | return fmt.Errorf("df.cols.AppendLevel(): %v", err) 78 | } 79 | return nil 80 | } 81 | 82 | // SubsetLevels modifies the DataFrame in place with only the specified cols levels. 83 | func (col Columns) SubsetLevels(levelPositions []int) error { 84 | 85 | err := col.df.ensureColumnLevelPositions(levelPositions) 86 | if err != nil { 87 | return fmt.Errorf("df.cols.SubsetLevels(): %v", err) 88 | } 89 | if len(levelPositions) == 0 { 90 | return fmt.Errorf("df.cols.SubsetLevels(): no levels provided") 91 | } 92 | 93 | levels := make([]index.ColLevel, len(levelPositions)) 94 | for j := 0; j < len(levelPositions); j++ { 95 | levels[j] = col.df.cols.Levels[levelPositions[j]] 96 | } 97 | col.df.cols.Levels = levels 98 | col.df.cols.Refresh() 99 | return nil 100 | } 101 | 102 | // DropLevel drops the specified cols level and modifies the DataFrame in place. 103 | // If there is only one col level remaining, replaces with a new default col level. 
104 | func (col Columns) DropLevel(level int) error { 105 | if err := col.df.ensureColumnLevelPositions([]int{level}); err != nil { 106 | return fmt.Errorf("Columns.DropLevel(): %v", err) 107 | } 108 | if col.df.ColLevels() == 1 { 109 | col.df.cols.Levels = append(col.df.cols.Levels, index.NewDefaultColLevel(col.df.NumCols(), "")) 110 | } 111 | col.df.cols.Levels = append(col.df.cols.Levels[:level], col.df.cols.Levels[level+1:]...) 112 | col.df.cols.Refresh() 113 | return nil 114 | } 115 | 116 | // SelectName returns the integer position of the cols level at the first occurrence of the supplied name, or -1 if not a valid cols level name. 117 | func (col Columns) SelectName(name string) int { 118 | v, ok := col.df.cols.NameMap[name] 119 | if !ok { 120 | if options.GetLogWarnings() { 121 | log.Printf("Columns.SelectName(): name not in cols level names: %v\n", name) 122 | } 123 | return -1 124 | } 125 | return v[0] 126 | } 127 | 128 | // SelectNames returns the integer positions of the cols levels with the supplied names. 129 | func (col Columns) SelectNames(names []string) []int { 130 | include := make([]int, 0) 131 | empty := make([]int, 0) 132 | for _, name := range names { 133 | v, ok := col.df.cols.NameMap[name] 134 | if !ok { 135 | if options.GetLogWarnings() { 136 | log.Printf("Columns.SelectNames(): name not in cols level names: %v\n", name) 137 | } 138 | return empty 139 | } 140 | include = append(include, v...) 
141 | } 142 | return include 143 | } 144 | -------------------------------------------------------------------------------- /series/describe_test.go: -------------------------------------------------------------------------------- 1 | package series 2 | 3 | import ( 4 | "bytes" 5 | "log" 6 | "os" 7 | "reflect" 8 | "testing" 9 | "time" 10 | 11 | "github.com/ptiger10/pd/options" 12 | ) 13 | 14 | func TestSeries_Describe(t *testing.T) { 15 | type want struct { 16 | len int 17 | numIdxLevels int 18 | maxWidth int 19 | values []interface{} 20 | vals interface{} 21 | datatype string 22 | name string 23 | valid []int 24 | null []int 25 | } 26 | tests := []struct { 27 | name string 28 | input *Series 29 | want want 30 | }{ 31 | {"empty", 32 | newEmptySeries(), 33 | want{len: 0, numIdxLevels: 0, maxWidth: 0, 34 | values: []interface{}{}, vals: []interface{}{}, datatype: "none", name: "", 35 | valid: []int{}, null: []int{}}}, 36 | {name: "default index", 37 | input: MustNew([]string{"foo", "", "bar", ""}), 38 | want: want{len: 4, numIdxLevels: 1, maxWidth: 3, 39 | values: []interface{}{"foo", "NaN", "bar", "NaN"}, 40 | vals: []string{"foo", "NaN", "bar", "NaN"}, 41 | datatype: "string", name: "", 42 | valid: []int{0, 2}, null: []int{1, 3}}}, 43 | {"multi index", 44 | MustNew( 45 | 1.0, 46 | Config{MultiIndex: []interface{}{"baz", "qux"}, Name: "foo"}, 47 | ), 48 | want{len: 1, numIdxLevels: 2, maxWidth: 4, 49 | values: []interface{}{1.0}, vals: []float64{1}, 50 | datatype: "float64", name: "foo", 51 | valid: []int{0}, null: []int{}}}, 52 | } 53 | for _, tt := range tests { 54 | t.Run(tt.name, func(t *testing.T) { 55 | s := tt.input.Copy() 56 | gotLen := s.Len() 57 | if gotLen != tt.want.len { 58 | t.Errorf("s.Len(): got %v, want %v", gotLen, tt.want.len) 59 | } 60 | gotNumIdxLevels := s.NumLevels() 61 | if gotNumIdxLevels != tt.want.numIdxLevels { 62 | t.Errorf("s.NumLevels(): got %v, want %v", gotNumIdxLevels, tt.want.numIdxLevels) 63 | } 64 | gotMaxWidth := s.MaxWidth() 65 
| if gotMaxWidth != tt.want.maxWidth { 66 | t.Errorf("s.MaxWidth(): got %v, want %v", gotMaxWidth, tt.want.maxWidth) 67 | } 68 | gotValues := s.Values() 69 | if !reflect.DeepEqual(gotValues, tt.want.values) { 70 | t.Errorf("s.Values(): got %v, want %v", gotValues, tt.want.values) 71 | } 72 | gotVals := s.Vals() 73 | if !reflect.DeepEqual(gotVals, tt.want.vals) { 74 | t.Errorf("s.Vals(): got %#v, want %v", gotVals, tt.want.vals) 75 | } 76 | gotDatatype := s.DataType() 77 | if gotDatatype != tt.want.datatype { 78 | t.Errorf("s.Datatype(): got %v, want %v", gotDatatype, tt.want.datatype) 79 | } 80 | gotName := s.Name() 81 | if gotName != tt.want.name { 82 | t.Errorf("s.Name(): got %v, want %v", gotName, tt.want.name) 83 | } 84 | gotValid := s.valid() 85 | if !reflect.DeepEqual(gotValid, tt.want.valid) { 86 | t.Errorf("s.valid(): got %v, want %v", gotValid, tt.want.valid) 87 | } 88 | gotNull := s.null() 89 | if !reflect.DeepEqual(gotNull, tt.want.null) { 90 | t.Errorf("s.null(): got %v, want %v", gotNull, tt.want.null) 91 | } 92 | }) 93 | } 94 | } 95 | 96 | func TestSeries_Equal(t *testing.T) { 97 | s, err := New("foo", Config{Index: "bar", Name: "baz"}) 98 | if err != nil { 99 | t.Error(err) 100 | } 101 | s2, _ := New("foo", Config{Index: "bar", Name: "baz"}) 102 | if !Equal(s, s2) { 103 | t.Errorf("Equal() returned false, want true") 104 | } 105 | s2.datatype = options.Bool 106 | if Equal(s, s2) { 107 | t.Errorf("Equal() returned true for different kind, want false") 108 | } 109 | 110 | s2, _ = New("quux", Config{Index: "bar", Name: "baz"}) 111 | if Equal(s, s2) { 112 | t.Errorf("Equal() returned true for different values, want false") 113 | } 114 | s2, _ = New("foo", Config{Index: "corge", Name: "baz"}) 115 | if Equal(s, s2) { 116 | t.Errorf("Equal() returned true for different index, want false") 117 | } 118 | s2, _ = New("foo", Config{Index: "bar", Name: "qux"}) 119 | if Equal(s, s2) { 120 | t.Errorf("Equal() returned true for different name, want false") 121 | } 
122 | } 123 | 124 | func TestSeries_ReplaceNil(t *testing.T) { 125 | s := MustNew(nil) 126 | s2 := MustNew([]int{1, 2}) 127 | s.replace(s2) 128 | if !Equal(s, s2) { 129 | t.Errorf("Series.replace() returned %v, want %v", s, s2) 130 | } 131 | } 132 | 133 | func TestSeries_Describe_unsupported(t *testing.T) { 134 | s := MustNew([]float64{1, 2, 3}) 135 | tm := s.Earliest() 136 | if (time.Time{}) != tm { 137 | t.Errorf("Earliest() got %v, want time.Time{} for unsupported type", tm) 138 | } 139 | tm = s.Latest() 140 | if (time.Time{}) != tm { 141 | t.Errorf("Latest() got %v, want time.Time{} for unsupported type", tm) 142 | } 143 | } 144 | 145 | // [START ensure tests] 146 | func TestSeries_EnsureTypes_fail(t *testing.T) { 147 | defer log.SetOutput(os.Stderr) 148 | vals := []interface{}{1, 2, 3} 149 | 150 | var buf bytes.Buffer 151 | log.SetOutput(&buf) 152 | ensureFloatFromNumerics(vals) 153 | if buf.String() == "" { 154 | t.Errorf("ensureNumerics() returned no log message, want log due to fail") 155 | } 156 | buf.Reset() 157 | 158 | ensureDateTime(vals) 159 | if buf.String() == "" { 160 | t.Errorf("ensureDateTime() returned no log message, want log due to fail") 161 | } 162 | buf.Reset() 163 | 164 | ensureBools(vals) 165 | if buf.String() == "" { 166 | t.Errorf("ensureBools() returned no log message, want log due to fail") 167 | } 168 | buf.Reset() 169 | } 170 | 171 | // [END ensure tests] 172 | -------------------------------------------------------------------------------- /internal/values/type-string.go: -------------------------------------------------------------------------------- 1 | package values 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "strconv" 7 | "strings" 8 | "time" 9 | 10 | "github.com/araddon/dateparse" 11 | "github.com/ptiger10/pd/options" 12 | ) 13 | 14 | // [START Constructor Functions] 15 | 16 | func isNullString(s string) bool { 17 | nullStrings := options.GetStringNullValues() 18 | for _, ns := range nullStrings { 19 | if strings.TrimSpace(s) == 
ns { 20 | return true 21 | } 22 | } 23 | return false 24 | } 25 | 26 | // newString creates an stringValue from atomic string value 27 | func newString(val string) stringValue { 28 | if isNullString(val) { 29 | return stringValue{options.GetDisplayStringNullFiller(), true} 30 | } 31 | return stringValue{val, false} 32 | } 33 | 34 | func (vals *stringValues) Less(i, j int) bool { 35 | if (*vals)[i].v < (*vals)[j].v { 36 | return true 37 | } 38 | return false 39 | } 40 | 41 | // [END Constructor Functions] 42 | 43 | // [START Converters] 44 | 45 | // toFloat converts stringValue to float64Value 46 | // 47 | // "1": 1.0, Null: NaN 48 | func (val stringValue) toFloat64() float64Value { 49 | f, err := strconv.ParseFloat(val.v, 64) 50 | if math.IsNaN(f) || err != nil { 51 | return float64Value{math.NaN(), true} 52 | } 53 | return float64Value{f, false} 54 | } 55 | 56 | // toInt converts stringValue to int64Value 57 | // 58 | // "1": 1, null: NaN 59 | func (val stringValue) toInt64() int64Value { 60 | if val.null { 61 | return int64Value{0, true} 62 | } 63 | f, err := strconv.ParseFloat(val.v, 64) 64 | if err != nil { 65 | return int64Value{0, true} 66 | } 67 | return int64Value{int64(f), false} 68 | } 69 | 70 | func (val stringValue) toString() stringValue { 71 | if isNullString(val.v) || val.null { 72 | return stringValue{options.GetDisplayStringNullFiller(), true} 73 | } 74 | return stringValue{fmt.Sprint(val.v), false} 75 | } 76 | 77 | // toBool converts stringValue to boolValue 78 | // 79 | // null: false; notnull: true 80 | func (val stringValue) toBool() boolValue { 81 | if val.null { 82 | return boolValue{false, true} 83 | } 84 | return boolValue{true, false} 85 | } 86 | 87 | // toDateTime converts stringValue to dateTimeValue using an external parse library 88 | // 89 | // Jan 1 2019: 2019-01-01 00:00:00 90 | // 91 | // Acceptable DateTime string formats 92 | /* 93 | "May 8, 2009 5:57:51 PM", 94 | "oct 7, 1970", 95 | "oct 7, '70", 96 | "oct. 7, 1970", 97 | "oct. 
7, 70", 98 | "Mon Jan 2 15:04:05 2006", 99 | "Mon Jan 2 15:04:05 MST 2006", 100 | "Mon Jan 02 15:04:05 -0700 2006", 101 | "Monday, 02-Jan-06 15:04:05 MST", 102 | "Mon, 02 Jan 2006 15:04:05 MST", 103 | "Tue, 11 Jul 2017 16:28:13 +0200 (CEST)", 104 | "Mon, 02 Jan 2006 15:04:05 -0700", 105 | "Thu, 4 Jan 2018 17:53:36 +0000", 106 | "Mon Aug 10 15:44:11 UTC+0100 2015", 107 | "Fri Jul 03 2015 18:04:07 GMT+0100 (GMT Daylight Time)", 108 | "September 17, 2012 10:09am", 109 | "September 17, 2012 at 10:09am PST-08", 110 | "September 17, 2012, 10:10:09", 111 | "October 7, 1970", 112 | "October 7th, 1970", 113 | "12 Feb 2006, 19:17", 114 | "12 Feb 2006 19:17", 115 | "7 oct 70", 116 | "7 oct 1970", 117 | "03 February 2013", 118 | "1 July 2013", 119 | "2013-Feb-03", 120 | // mm/dd/yy 121 | "3/31/2014", 122 | "03/31/2014", 123 | "08/21/71", 124 | "8/1/71", 125 | "4/8/2014 22:05", 126 | "04/08/2014 22:05", 127 | "4/8/14 22:05", 128 | "04/2/2014 03:00:51", 129 | "8/8/1965 12:00:00 AM", 130 | "8/8/1965 01:00:01 PM", 131 | "8/8/1965 01:00 PM", 132 | "8/8/1965 1:00 PM", 133 | "8/8/1965 12:00 AM", 134 | "4/02/2014 03:00:51", 135 | "03/19/2012 10:11:59", 136 | "03/19/2012 10:11:59.3186369", 137 | // yyyy/mm/dd 138 | "2014/3/31", 139 | "2014/03/31", 140 | "2014/4/8 22:05", 141 | "2014/04/08 22:05", 142 | "2014/04/2 03:00:51", 143 | "2014/4/02 03:00:51", 144 | "2012/03/19 10:11:59", 145 | "2012/03/19 10:11:59.3186369", 146 | // Chinese 147 | "2014年04月08日", 148 | // yyyy-mm-ddThh 149 | "2006-01-02T15:04:05+0000", 150 | "2009-08-12T22:15:09-07:00", 151 | "2009-08-12T22:15:09", 152 | "2009-08-12T22:15:09Z", 153 | // yyyy-mm-dd hh:mm:ss 154 | "2014-04-26 17:24:37.3186369", 155 | "2012-08-03 18:31:59.257000000", 156 | "2014-04-26 17:24:37.123", 157 | "2013-04-01 22:43", 158 | "2013-04-01 22:43:22", 159 | "2014-12-16 06:20:00 UTC", 160 | "2014-12-16 06:20:00 GMT", 161 | "2014-04-26 05:24:37 PM", 162 | "2014-04-26 13:13:43 +0800", 163 | "2014-04-26 13:13:43 +0800 +08", 164 | "2014-04-26 13:13:44 
+09:00", 165 | "2012-08-03 18:31:59.257000000 +0000 UTC", 166 | "2015-09-30 18:48:56.35272715 +0000 UTC", 167 | "2015-02-18 00:12:00 +0000 GMT", 168 | "2015-02-18 00:12:00 +0000 UTC", 169 | "2015-02-08 03:02:00 +0300 MSK m=+0.000000001", 170 | "2015-02-08 03:02:00.001 +0300 MSK m=+0.000000001", 171 | "2017-07-19 03:21:51+00:00", 172 | "2014-04-26", 173 | "2014-04", 174 | "2014", 175 | "2014-05-11 08:20:13,787", 176 | // mm.dd.yy 177 | "3.31.2014", 178 | "03.31.2014", 179 | "08.21.71", 180 | "2014.03", 181 | "2014.03.30", 182 | // yyyymmdd and similar 183 | "20140601", 184 | "20140722105203", 185 | // unix seconds, ms, micro, nano 186 | "1332151919", 187 | "1384216367189", 188 | "1384216367111222", 189 | "1384216367111222333", 190 | } 191 | */ 192 | func (val stringValue) toDateTime() dateTimeValue { 193 | if val.null { 194 | return dateTimeValue{time.Time{}, true} 195 | } 196 | t, err := dateparse.ParseAny(val.v) 197 | if err != nil { 198 | return dateTimeValue{time.Time{}, true} 199 | } 200 | return dateTimeValue{t, false} 201 | } 202 | 203 | // [END Converters] 204 | -------------------------------------------------------------------------------- /dataframe/pivot.go: -------------------------------------------------------------------------------- 1 | package dataframe 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | 7 | "github.com/ptiger10/pd/internal/index" 8 | "github.com/ptiger10/pd/internal/values" 9 | "github.com/ptiger10/pd/options" 10 | "github.com/ptiger10/pd/series" 11 | ) 12 | 13 | // values: 14 | // make a [][]interface{} valsMatrix for rows x cols 15 | // # rows: unique non-stacked labels 16 | // # cols = unique stacked labels * number of columns 17 | // isolate first value of the stacked label within each non-stacked label 18 | // transpose to []interface and feed into interface factory to create []Values.Container 19 | func (df *DataFrame) stack(level int) (newIdxPositions []int, valsMatrix [][]interface{}, newColLvl []string) { 20 | var 
unstackedIndexLevels []int 21 | for j := 0; j < df.IndexLevels(); j++ { 22 | if j != level { 23 | unstackedIndexLevels = append(unstackedIndexLevels, j) 24 | } 25 | } 26 | g := df.GroupByIndex(unstackedIndexLevels...) 27 | 28 | labelsToStack := df.Index.unique(level) 29 | numRows := g.Len() 30 | numCols := len(labelsToStack) * df.NumCols() 31 | valsMatrix = make([][]interface{}, numRows) 32 | for i := 0; i < numRows; i++ { 33 | valsMatrix[i] = make([]interface{}, numCols) 34 | } 35 | 36 | // only extend the labels for the columns-to-be-stacked once 37 | extendColLevel := true 38 | for i, group := range g.Groups() { 39 | newIdxPositions = append(newIdxPositions, g.groups[group].Positions[0]) 40 | rows, _ := df.SubsetRows(g.groups[group].Positions) 41 | for labelOffset, label := range labelsToStack { 42 | // log warnings disabled because frequently a label will not exist in an index 43 | archive := options.GetLogWarnings() 44 | options.SetLogWarnings(false) 45 | row := rows.SelectLabels([]string{label}, level) 46 | options.SetLogWarnings(archive) 47 | // log warnings restored 48 | for m := 0; m < df.NumCols(); m++ { 49 | if len(row) > 0 { 50 | valsMatrix[i][m+labelOffset*df.NumCols()] = rows.vals[m].Values.Value(row[0]) 51 | } 52 | if extendColLevel { 53 | newColLvl = append(newColLvl, label) 54 | } 55 | } 56 | } 57 | extendColLevel = false 58 | } 59 | return newIdxPositions, valsMatrix, newColLvl 60 | } 61 | 62 | func (df *DataFrame) stackIndex(level int) *DataFrame { 63 | newIdxPositions, valsMatrix, newColLevel := df.stack(level) 64 | transposedVals := values.TransposeValues(valsMatrix) 65 | var containers []values.Container 66 | for i := 0; i < len(transposedVals); i++ { 67 | container := values.MustCreateValuesFromInterface(transposedVals[i]) 68 | containers = append(containers, container) 69 | } 70 | 71 | idx := df.index.Copy() 72 | idx.Subset(newIdxPositions) 73 | idx.DropLevel(level) 74 | 75 | cols := df.cols.Copy() 76 | for j := 0; j < df.ColLevels(); j++ { 
77 | // duplicate each level enough times that it is same length as new column level 78 | cols.Levels[j].Duplicate((len(newColLevel) / df.NumCols()) - 1) 79 | } 80 | 81 | // ducks error because input is controlled 82 | cols.InsertLevel(0, newColLevel, df.index.Levels[level].Name) 83 | 84 | ret := newFromComponents(containers, idx, cols, df.Name()) 85 | if df.dataType() != options.Interface { 86 | ret.InPlace.Convert(df.dataType().String()) 87 | } 88 | return ret 89 | } 90 | 91 | // Pivot transforms data into the desired form and calls aggFunc on the reshaped data. 92 | func (df *DataFrame) Pivot(index int, values int, columns int, aggFunc string) (*DataFrame, error) { 93 | df = df.Copy() 94 | df.InPlace.SubsetColumns([]int{index, columns, values}) 95 | g := df.GroupBy(index, columns) 96 | 97 | switch aggFunc { 98 | case "sum": 99 | df = g.Sum() 100 | case "mean": 101 | df = g.Mean() 102 | case "median": 103 | df = g.Median() 104 | case "min": 105 | df = g.Min() 106 | case "max": 107 | df = g.Max() 108 | case "std": 109 | df = g.Std() 110 | default: 111 | return newEmptyDataFrame(), fmt.Errorf("df.Pivot(): aggFunc (%v) does not exist", aggFunc) 112 | } 113 | df = df.stackIndex(1) 114 | df.Columns.DropLevel(1) 115 | return df, nil 116 | } 117 | 118 | // Transpose transforms all rows to columns. 119 | func (df *DataFrame) Transpose() *DataFrame { 120 | ret := newEmptyDataFrame() 121 | for m := 0; m < df.NumCols(); m++ { 122 | row := transposeSeries(df.hydrateSeries(m)) 123 | ret.InPlace.appendDataFrameRow(row) 124 | } 125 | return ret 126 | } 127 | 128 | func transposeSeries(s *series.Series) *DataFrame { 129 | // Columns 130 | lvls := make([]index.ColLevel, s.NumLevels()) 131 | cols := index.NewColumns(lvls...) 
132 | container, idx := s.ToInternalComponents() 133 | for j := 0; j < s.NumLevels(); j++ { 134 | cols.Levels[j].IsDefault = idx.Levels[j].IsDefault 135 | cols.Levels[j].DataType = idx.Levels[j].DataType 136 | cols.Levels[j].Name = idx.Levels[j].Name 137 | for m := 0; m < s.Len(); m++ { 138 | val := idx.Levels[j].Labels.Value(m) 139 | // TODO: test null value 140 | // if !elem.Null { 141 | // cols.Levels[j].Labels = append(cols.Levels[j].Labels, fmt.Sprint(elem.Value)) 142 | // } else { 143 | // cols.Levels[j].Labels = append(cols.Levels[j].Labels, "") 144 | // } 145 | cols.Levels[j].Labels = append(cols.Levels[j].Labels, fmt.Sprint(val)) 146 | } 147 | } 148 | cols.Refresh() 149 | 150 | // Index 151 | names := strings.Split(s.Name(), values.GetMultiColNameSeparator()) 152 | idxLvls := make([]index.Level, len(names)) 153 | retIdx := index.New(idxLvls...) 154 | for j := 0; j < len(names); j++ { 155 | name := names[j] 156 | // ducks error because type is known to be supported 157 | retIdx.Levels[j], _ = index.NewLevel(values.InterpolateString(name), "") 158 | } 159 | retIdx.NeedsRefresh = true 160 | 161 | // Values 162 | vals := make([]values.Container, s.Len()) 163 | for m := 0; m < s.Len(); m++ { 164 | vals[m].Values = container.Values.Subset([]int{m}) 165 | vals[m].DataType = container.DataType 166 | } 167 | 168 | return newFromComponents(vals, retIdx, cols, "") 169 | } 170 | -------------------------------------------------------------------------------- /series/group_test.go: -------------------------------------------------------------------------------- 1 | package series 2 | 3 | import ( 4 | "bytes" 5 | "log" 6 | "os" 7 | "reflect" 8 | "strings" 9 | "testing" 10 | 11 | "github.com/ptiger10/pd/options" 12 | ) 13 | 14 | func TestGroup_Copy(t *testing.T) { 15 | s := MustNew([]int{1, 2, 3, 4}, Config{Index: []int{1, 1, 2, 2}}) 16 | got := s.GroupByIndex(0).copy().groups 17 | want := map[string]*group{ 18 | "1": {Positions: []int{0, 1}, FirstPosition: 0}, 19 | "2": 
{Positions: []int{2, 3}, FirstPosition: 2}, 20 | } 21 | if !reflect.DeepEqual(got, want) { 22 | t.Errorf("group.copy() got %v, want %v", got, want) 23 | } 24 | } 25 | 26 | func TestGrouping_Math(t *testing.T) { 27 | s := MustNew([]int{1, 2, 3, 4}, Config{Index: []int{1, 1, 2, 2}}) 28 | tests := []struct { 29 | name string 30 | input *Series 31 | fn func(Grouping) *Series 32 | want *Series 33 | }{ 34 | {name: "fail: empty", input: newEmptySeries(), fn: Grouping.Sum, 35 | want: newEmptySeries()}, 36 | {"sum", s, Grouping.Sum, 37 | MustNew([]float64{3, 7}, Config{Index: []int{1, 2}})}, 38 | {"mean", s, Grouping.Mean, 39 | MustNew([]float64{1.5, 3.5}, Config{Index: []int{1, 2}})}, 40 | {"min", s, Grouping.Min, 41 | MustNew([]float64{1, 3}, Config{Index: []int{1, 2}})}, 42 | {"max", s, Grouping.Max, 43 | MustNew([]float64{2, 4}, Config{Index: []int{1, 2}})}, 44 | {"median", s, Grouping.Median, 45 | MustNew([]float64{1.5, 3.5}, Config{Index: []int{1, 2}})}, 46 | {"standard deviation", s, Grouping.Std, 47 | MustNew([]float64{0.5, 0.5}, Config{Index: []int{1, 2}})}, 48 | } 49 | for _, tt := range tests { 50 | t.Run(tt.name, func(t *testing.T) { 51 | g := tt.input.GroupByIndex() 52 | // Test Asynchronously 53 | got := tt.fn(g) 54 | if !Equal(got, tt.want) { 55 | t.Errorf("s.GroupByIndex math operation returned %v, want %v", got, tt.want) 56 | } 57 | // Test Synchronously 58 | options.SetAsync(false) 59 | gotSync := tt.fn(g) 60 | if !Equal(gotSync, tt.want) { 61 | t.Errorf("s.GroupByIndex synchronous math operation returned %v, want %v", gotSync, tt.want) 62 | } 63 | options.RestoreDefaults() 64 | }) 65 | } 66 | } 67 | 68 | func TestSeries_GroupByIndex(t *testing.T) { 69 | multi := MustNew([]string{"foo", "bar", "baz"}, Config{MultiIndex: []interface{}{[]int{1, 1, 2}, []int{2, 2, 1}}}) 70 | type args struct { 71 | levelPositions []int 72 | } 73 | tests := []struct { 74 | name string 75 | input *Series 76 | args args 77 | want map[string]*group 78 | }{ 79 | {name: "single no 
args", 80 | input: MustNew([]string{"foo", "bar", "baz"}, Config{Index: []int{1, 1, 2}}), 81 | args: args{[]int{}}, 82 | want: map[string]*group{ 83 | "1": {Positions: []int{0, 1}, FirstPosition: 0}, 84 | "2": {Positions: []int{2}, FirstPosition: 2}, 85 | }}, 86 | {"multi no args", 87 | multi, 88 | args{[]int{}}, 89 | map[string]*group{ 90 | "1 | 2": {Positions: []int{0, 1}, FirstPosition: 0}, 91 | "2 | 1": {Positions: []int{2}, FirstPosition: 2}, 92 | }}, 93 | {"multi one level", 94 | multi, 95 | args{[]int{0}}, 96 | map[string]*group{ 97 | "1": {Positions: []int{0, 1}, FirstPosition: 0}, 98 | "2": {Positions: []int{2}, FirstPosition: 2}, 99 | }}, 100 | {"multi two levels reversed", 101 | multi, 102 | args{[]int{1, 0}}, 103 | map[string]*group{ 104 | "2 | 1": {Positions: []int{0, 1}, FirstPosition: 0}, 105 | "1 | 2": {Positions: []int{2}, FirstPosition: 2}, 106 | }}, 107 | {"fail: invalid level", 108 | multi, 109 | args{[]int{10}}, 110 | newEmptyGrouping().groups}, 111 | {"fail: partial invalid level", 112 | multi, 113 | args{[]int{0, 10}}, 114 | newEmptyGrouping().groups}, 115 | } 116 | for _, tt := range tests { 117 | t.Run(tt.name, func(t *testing.T) { 118 | var buf bytes.Buffer 119 | log.SetOutput(&buf) 120 | defer log.SetOutput(os.Stderr) 121 | 122 | s := tt.input.Copy() 123 | got := s.GroupByIndex(tt.args.levelPositions...).groups 124 | if !reflect.DeepEqual(got, tt.want) { 125 | t.Errorf("Series.GroupByIndex() = %#v, want %#v", got, tt.want) 126 | } 127 | 128 | if strings.Contains(tt.name, "fail") { 129 | if buf.String() == "" { 130 | t.Errorf("Series.GroupByIndex() returned no log message, want log due to fail") 131 | } 132 | } 133 | }) 134 | } 135 | } 136 | 137 | func Test_Group(t *testing.T) { 138 | type args struct { 139 | label string 140 | } 141 | tests := []struct { 142 | name string 143 | args args 144 | want *Series 145 | }{ 146 | {name: "pass", args: args{"1"}, want: MustNew([]int{1, 2}, Config{Index: []int{1, 1}})}, 147 | {name: "fail", args: 
args{"100"}, want: newEmptySeries()}, 148 | } 149 | for _, tt := range tests { 150 | t.Run(tt.name, func(t *testing.T) { 151 | var buf bytes.Buffer 152 | log.SetOutput(&buf) 153 | defer log.SetOutput(os.Stderr) 154 | 155 | s := MustNew([]int{1, 2, 3, 4}, Config{Index: []int{1, 1, 2, 2}}) 156 | g := s.GroupByIndex() 157 | got := g.Group(tt.args.label) 158 | if !Equal(got, tt.want) { 159 | t.Errorf("Grouping.Group() = %v, want %v", got, tt.want) 160 | } 161 | if strings.Contains(tt.name, "fail") { 162 | if buf.String() == "" { 163 | t.Errorf("Grouping.Group() returned no log message, want log due to fail") 164 | } 165 | } 166 | 167 | }) 168 | } 169 | } 170 | 171 | func TestGrouping_Nth(t *testing.T) { 172 | s := MustNew([]string{"foo", "bar", "baz"}, Config{MultiIndex: []interface{}{[]int{1, 1, 2}, []int{2, 2, 1}}}) 173 | g := s.GroupByIndex() 174 | gotFirst := g.First() 175 | wantFirst := MustNew([]string{"foo", "baz"}, Config{MultiIndex: []interface{}{[]int{1, 2}, []int{2, 1}}}) 176 | if !Equal(gotFirst, wantFirst) { 177 | t.Errorf("Grouping.First() = %#v, want %#v", gotFirst, wantFirst) 178 | } 179 | gotLast := g.Last() 180 | wantLast := MustNew([]string{"bar", "baz"}, Config{MultiIndex: []interface{}{[]int{1, 2}, []int{2, 1}}}) 181 | if !Equal(gotLast, wantLast) { 182 | t.Errorf("Grouping.Last() = %#v, want %#v", gotLast, wantLast) 183 | } 184 | } 185 | -------------------------------------------------------------------------------- /internal/values/shared_template.go: -------------------------------------------------------------------------------- 1 | package values 2 | 3 | import ( 4 | "github.com/cheekybits/genny/generic" 5 | "github.com/ptiger10/pd/options" 6 | ) 7 | 8 | //go:generate genny -in=$GOFILE -out=shared_autogen.go gen "valueType=float64,int64,string,bool,time.Time,interface{}" 9 | 10 | // [START] valueTypeValues 11 | 12 | // valueType is the generic ValueType that will be replaced by specific types on `make generate` 13 | type valueType 
generic.Type 14 | 15 | // valueTypeValues is a slice of valueType-typed value/null structs. 16 | type valueTypeValues []valueTypeValue 17 | 18 | // valueTypeValue is a valueType-typed value/null struct. 19 | type valueTypeValue struct { 20 | v valueType 21 | null bool 22 | } 23 | 24 | // newSlicevalueType converts []valueType -> Container with valueTypeValues 25 | func newSlicevalueType(vals []valueType) Container { 26 | ret := make(valueTypeValues, len(vals)) 27 | for i := 0; i < len(vals); i++ { 28 | ret[i] = newvalueType(vals[i]) 29 | } 30 | return Container{&ret, options.PlaceholdervalueType} 31 | } 32 | 33 | // Len returns the number of value/null structs in the container. 34 | func (vals *valueTypeValues) Len() int { 35 | return len(*vals) 36 | } 37 | 38 | func (vals *valueTypeValues) Swap(i, j int) { 39 | (*vals)[i], (*vals)[j] = (*vals)[j], (*vals)[i] 40 | } 41 | 42 | // Subset returns the values located at specific index positions. 43 | func (vals *valueTypeValues) Subset(rowPositions []int) Values { 44 | ret := make(valueTypeValues, len(rowPositions)) 45 | for i := 0; i < len(rowPositions); i++ { 46 | ret[i] = (*vals)[rowPositions[i]] 47 | } 48 | return &ret 49 | } 50 | 51 | // Append converts vals2 to valueTypeValues and extends the original valueTypeValues. 52 | func (vals *valueTypeValues) Append(vals2 Values) { 53 | convertedVals, _ := Convert(vals2, options.PlaceholdervalueType) 54 | newVals := convertedVals.(*valueTypeValues) 55 | *vals = append(*vals, *newVals...) 56 | } 57 | 58 | // Values returns only the Value fields for the collection of Value/Null structs as an interface slice. 59 | func (vals *valueTypeValues) Values() []interface{} { 60 | v := *vals 61 | ret := make([]interface{}, len(v)) 62 | for i := 0; i < len(v); i++ { 63 | ret[i] = v[i].v 64 | } 65 | return ret 66 | } 67 | 68 | // Vals returns only the Value fields for the collection of Value/Null structs as an empty interface. 
69 | // 70 | // Caution: This operation excludes the Null field but retains any null values. 71 | func (vals *valueTypeValues) Vals() interface{} { 72 | v := *vals 73 | ret := make([]valueType, len(v)) 74 | for i := 0; i < len(v); i++ { 75 | ret[i] = v[i].v 76 | } 77 | return ret 78 | } 79 | 80 | // Value returns the Value field at the specified integer position. 81 | func (vals *valueTypeValues) Value(position int) interface{} { 82 | return (*vals)[position].v 83 | } 84 | 85 | // Value returns the Null field at the specified integer position. 86 | func (vals *valueTypeValues) Null(position int) bool { 87 | return (*vals)[position].null 88 | } 89 | 90 | // Copy transfers every value from the current valueTypeValues container into a new Values container 91 | func (vals *valueTypeValues) Copy() Values { 92 | v := *vals 93 | newValues := make(valueTypeValues, len(v)) 94 | for i := 0; i < len(v); i++ { 95 | newValues[i] = v[i] 96 | } 97 | return &newValues 98 | } 99 | 100 | // Set overwrites a Value/Null pair at an integer position. 101 | func (vals *valueTypeValues) Set(position int, newVal interface{}) { 102 | var v interfaceValue 103 | if isNullInterface(newVal) { 104 | v = interfaceValue{newVal, true} 105 | } else { 106 | v = interfaceValue{newVal, false} 107 | } 108 | (*vals)[position] = v.tovalueType() 109 | } 110 | 111 | // Drop drops the Value/Null pair at an integer position. 112 | func (vals *valueTypeValues) Drop(pos int) { 113 | *vals = append((*vals)[:pos], (*vals)[pos+1:]...) 114 | } 115 | 116 | // Insert inserts a new Value/Null pair at an integer position. 117 | func (vals *valueTypeValues) Insert(pos int, val interface{}) { 118 | v := interfaceValue{val, false} 119 | *vals = append((*vals)[:pos], append([]valueTypeValue{v.tovalueType()}, (*vals)[pos:]...)...) 120 | } 121 | 122 | // ToFloat converts valueTypeValues to floatValues. 
123 | func (vals *valueTypeValues) ToFloat64() Values { 124 | ret := make(float64Values, len(*vals)) 125 | for i := 0; i < len(*vals); i++ { 126 | ret[i] = (*vals)[i].toFloat64() 127 | } 128 | return &ret 129 | } 130 | 131 | // ToInt converts valueTypeValues to intValues. 132 | func (vals *valueTypeValues) ToInt64() Values { 133 | ret := make(int64Values, len(*vals)) 134 | for i := 0; i < len(*vals); i++ { 135 | ret[i] = (*vals)[i].toInt64() 136 | } 137 | return &ret 138 | } 139 | 140 | // ToString converts valueTypeValues to stringValues. 141 | func (vals *valueTypeValues) ToString() Values { 142 | ret := make(stringValues, len(*vals)) 143 | for i := 0; i < len(*vals); i++ { 144 | ret[i] = (*vals)[i].toString() 145 | } 146 | return &ret 147 | } 148 | 149 | // ToBool converts valueTypeValues to boolValues. 150 | func (vals *valueTypeValues) ToBool() Values { 151 | ret := make(boolValues, len(*vals)) 152 | for i := 0; i < len(*vals); i++ { 153 | ret[i] = (*vals)[i].toBool() 154 | } 155 | return &ret 156 | } 157 | 158 | // ToBool converts valueTypeValues to dateTimeValues. 159 | func (vals *valueTypeValues) ToDateTime() Values { 160 | ret := make(dateTimeValues, len(*vals)) 161 | for i := 0; i < len(*vals); i++ { 162 | ret[i] = (*vals)[i].toDateTime() 163 | } 164 | return &ret 165 | } 166 | 167 | // ToInterface converts valueTypeValues to interfaceValues. 
168 | func (vals *valueTypeValues) ToInterface() Values { 169 | ret := make(interfaceValues, len(*vals)) 170 | for i := 0; i < len(*vals); i++ { 171 | if (*vals)[i].null { 172 | ret[i] = interfaceValue{(*vals)[i].v, true} 173 | } else { 174 | ret[i] = interfaceValue{(*vals)[i].v, false} 175 | } 176 | } 177 | return &ret 178 | } 179 | 180 | // [END] valueTypeValues 181 | // --------------------------------------------------------------------------- 182 | var placeholder = true 183 | 184 | // the placeholder and this comment are overwritten on `make generate`, but are included so that the [END] comment survives 185 | -------------------------------------------------------------------------------- /series/group.go: -------------------------------------------------------------------------------- 1 | package series 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "sort" 7 | "strings" 8 | "sync" 9 | 10 | "github.com/ptiger10/pd/internal/values" 11 | "github.com/ptiger10/pd/options" 12 | ) 13 | 14 | type group struct { 15 | Positions []int 16 | FirstPosition int 17 | } 18 | 19 | func (grp *group) copy() *group { 20 | pos := make([]int, len(grp.Positions)) 21 | for i, p := range grp.Positions { 22 | pos[i] = p 23 | } 24 | return &group{Positions: pos, FirstPosition: grp.FirstPosition} 25 | } 26 | 27 | // copy a grouping 28 | func (g Grouping) copy() Grouping { 29 | grps := make(map[string]*group) 30 | for k, v := range g.groups { 31 | grps[k] = v.copy() 32 | } 33 | return Grouping{ 34 | s: g.s.Copy(), 35 | groups: grps, 36 | } 37 | } 38 | 39 | func (g Grouping) asyncMath(fn func(*Series) float64) *Series { 40 | var wg sync.WaitGroup 41 | g = g.copy() 42 | if g.Len() == 0 { 43 | return newEmptySeries() 44 | } 45 | 46 | // synchronous option 47 | if !options.GetAsync() { 48 | ret := newEmptySeries() 49 | for _, group := range g.Groups() { 50 | s := g.math(group, fn) 51 | ret.InPlace.Join(s) 52 | } 53 | return ret 54 | } 55 | 56 | // asynchronous option 57 | ch := make(chan 
calcReturn, g.Len()) 58 | for i, group := range g.Groups() { 59 | wg.Add(1) 60 | go g.awaitMath(ch, i, group, fn, &wg) 61 | } 62 | wg.Wait() 63 | close(ch) 64 | var container []calcReturn 65 | for result := range ch { 66 | container = append(container, result) 67 | } 68 | sort.Slice(container, func(i, j int) bool { 69 | return container[i].n < container[j].n 70 | }) 71 | 72 | s := newEmptySeries() 73 | for _, result := range container { 74 | s.InPlace.Join(result.s) 75 | } 76 | s.index.NeedsRefresh = true 77 | return s 78 | } 79 | 80 | type calcReturn struct { 81 | s *Series 82 | n int 83 | } 84 | 85 | func (g Grouping) awaitMath(ch chan<- calcReturn, n int, group string, fn func(*Series) float64, wg *sync.WaitGroup) { 86 | s := g.math(group, fn) 87 | ret := calcReturn{s: s, n: n} 88 | ch <- ret 89 | wg.Done() 90 | } 91 | 92 | func (g Grouping) math(group string, fn func(*Series) float64) *Series { 93 | positions := g.groups[group].Positions 94 | rows, _ := g.s.Subset(positions) 95 | calc := fn(rows) 96 | s := MustNew(calc) 97 | 98 | // index is the same as the index at the first row position of the group 99 | idxCopy := g.s.index.Copy() 100 | idxCopy.Subset([]int{g.groups[group].FirstPosition}) 101 | s.index = idxCopy 102 | return s 103 | } 104 | 105 | // Groups returns all valid group labels in the Grouping. 106 | func (g Grouping) Groups() []string { 107 | var keys []string 108 | for k := range g.groups { 109 | keys = append(keys, k) 110 | } 111 | sort.Strings(keys) 112 | return keys 113 | } 114 | 115 | // Len returns the number of groups in the Grouping. 116 | func (g Grouping) Len() int { 117 | return len(g.groups) 118 | } 119 | 120 | // Group returns the Series with the given group label, or an error if that label does not exist. 
121 | func (g Grouping) Group(label string) *Series { 122 | group, ok := g.groups[label] 123 | if !ok { 124 | if options.GetLogWarnings() { 125 | log.Printf("s.Grouping.Group(): label %v not in g.Groups()", label) 126 | } 127 | return newEmptySeries() 128 | } 129 | s, _ := g.s.Subset(group.Positions) 130 | return s 131 | } 132 | 133 | func newEmptyGrouping() Grouping { 134 | groups := make(map[string]*group) 135 | s := newEmptySeries() 136 | return Grouping{s: s, groups: groups} 137 | } 138 | 139 | // GroupByIndex groups a Series by one or more of its index levels. If no int is provided, all index levels are used. 140 | func (s *Series) GroupByIndex(levelPositions ...int) Grouping { 141 | groups := make(map[string]*group) 142 | if len(levelPositions) != 0 { 143 | var err error 144 | s = s.Copy() 145 | err = s.Index.SubsetLevels(levelPositions) 146 | if err != nil { 147 | if options.GetLogWarnings() { 148 | log.Printf("s.GroupByIndex() %v\n", err) 149 | } 150 | return newEmptyGrouping() 151 | } 152 | } 153 | 154 | for i := 0; i < s.Len(); i++ { 155 | labels := s.index.Elements(i).Labels 156 | var strLabels []string 157 | for _, label := range labels { 158 | strLabels = append(strLabels, fmt.Sprint(label)) 159 | } 160 | groupLabel := strings.Join(strLabels, values.GetMultiColNameSeparator()) 161 | 162 | if _, ok := groups[groupLabel]; !ok { 163 | groups[groupLabel] = &group{FirstPosition: i} 164 | } 165 | groups[groupLabel].Positions = append(groups[groupLabel].Positions, i) 166 | } 167 | return Grouping{s: s, groups: groups} 168 | } 169 | 170 | // First returns the first occurrence of each grouping in the Series. 
171 | func (g Grouping) First() *Series { 172 | first := func(group string) *Series { 173 | position := g.groups[group].Positions[0] 174 | s, _ := g.s.Subset([]int{position}) 175 | return s 176 | } 177 | ret := newEmptySeries() 178 | for _, group := range g.Groups() { 179 | s := first(group) 180 | ret.InPlace.Join(s) 181 | } 182 | return ret 183 | } 184 | 185 | // Last returns the last occurrence of each grouping in the Series. 186 | func (g Grouping) Last() *Series { 187 | last := func(group string) *Series { 188 | lastIdx := len(g.groups[group].Positions) - 1 189 | position := g.groups[group].Positions[lastIdx] 190 | s, _ := g.s.Subset([]int{position}) 191 | return s 192 | } 193 | ret := newEmptySeries() 194 | for _, group := range g.Groups() { 195 | s := last(group) 196 | ret.InPlace.Join(s) 197 | } 198 | return ret 199 | } 200 | 201 | // Sum for each group in the Grouping. 202 | func (g Grouping) Sum() *Series { 203 | return g.asyncMath((*Series).Sum) 204 | } 205 | 206 | // Mean for each group in the Grouping. 207 | func (g Grouping) Mean() *Series { 208 | return g.asyncMath((*Series).Mean) 209 | } 210 | 211 | // Min for each group in the Grouping. 212 | func (g Grouping) Min() *Series { 213 | return g.asyncMath((*Series).Min) 214 | } 215 | 216 | // Max for each group in the Grouping. 217 | func (g Grouping) Max() *Series { 218 | return g.asyncMath((*Series).Max) 219 | } 220 | 221 | // Median for each group in the Grouping. 222 | func (g Grouping) Median() *Series { 223 | return g.asyncMath((*Series).Median) 224 | } 225 | 226 | // Std for each group in the Grouping. 
227 | func (g Grouping) Std() *Series { 228 | return g.asyncMath((*Series).Std) 229 | } 230 | -------------------------------------------------------------------------------- /options/settable.go: -------------------------------------------------------------------------------- 1 | package options 2 | 3 | var defaultOptions = struct { 4 | displayMaxWidth int 5 | displayMaxRows int 6 | displayMaxColumns int 7 | displayFloatPrecision int 8 | displayRepeatedLabels bool 9 | displayStringNullFiller string 10 | displayTimeFormat string 11 | stringNullValues []string 12 | logWarnings bool 13 | async bool 14 | }{ 15 | displayMaxWidth, 16 | displayMaxRows, 17 | displayMaxColumns, 18 | displayFloatPrecision, 19 | displayRepeatedLabels, 20 | displayStringNullFiller, 21 | displayTimeFormat, 22 | stringNullValues, 23 | logWarnings, 24 | async, 25 | } 26 | 27 | // RestoreDefaults resets options back to their default setting 28 | func RestoreDefaults() { 29 | SetDisplayMaxWidth(defaultOptions.displayMaxWidth) 30 | SetDisplayMaxRows(defaultOptions.displayMaxRows) 31 | SetDisplayMaxColumns(defaultOptions.displayMaxColumns) 32 | SetDisplayFloatPrecision(defaultOptions.displayFloatPrecision) 33 | SetDisplayRepeatedLabels(defaultOptions.displayRepeatedLabels) 34 | SetDisplayStringNullFiller(defaultOptions.displayStringNullFiller) 35 | SetDisplayTimeFormat(defaultOptions.displayTimeFormat) 36 | SetStringNullValues(defaultOptions.stringNullValues) 37 | SetLogWarnings(defaultOptions.logWarnings) 38 | SetAsync(defaultOptions.async) 39 | } 40 | 41 | var displayMaxWidth = 35 42 | var displayMaxRows = 50 43 | var displayMaxColumns = 50 44 | var displayFloatPrecision = 2 45 | var displayRepeatedLabels = false 46 | var displayStringNullFiller = "NaN" 47 | var displayTimeFormat = "1/2/2006T15:04:05" 48 | var stringNullValues = []string{"NaN", "n/a", "N/A", "", "nil"} 49 | var logWarnings = true 50 | var async = true 51 | 52 | // SetDisplayMaxWidth sets DisplayMaxWidth to n characters. 
// DisplayMaxWidth is an option when printing a Series.
// It is the widest allowable character width for an index label or value.
// If a label is longer than the max, it will be elided at the end.
//
// Default: 35 characters
func SetDisplayMaxWidth(n int) {
	displayMaxWidth = n
}

// GetDisplayMaxWidth returns DisplayMaxWidth.
func GetDisplayMaxWidth() int {
	return displayMaxWidth
}

// SetDisplayMaxRows sets DisplayMaxRows to n rows.
// DisplayMaxRows is an option when printing a Series.
// It is the max number of rows that will be printed to the screen.
// If the actual number of rows is greater than the max, the first n/2 and last n/2 will be displayed, and the middle will be elided.
//
// Default: 50 rows
func SetDisplayMaxRows(n int) {
	displayMaxRows = n
}

// GetDisplayMaxRows returns DisplayMaxRows.
func GetDisplayMaxRows() int {
	return displayMaxRows
}

// SetDisplayMaxColumns sets DisplayMaxColumns to n columns.
// DisplayMaxColumns is an option when printing a Series.
// It is the max number of columns that will be printed to the screen.
// If the actual number of columns is greater than the max, the first n/2 and last n/2 will be displayed, and the middle will be elided.
//
// Default: 50 columns
func SetDisplayMaxColumns(n int) {
	displayMaxColumns = n
}

// GetDisplayMaxColumns returns DisplayMaxColumns.
func GetDisplayMaxColumns() int {
	return displayMaxColumns
}

// SetDisplayFloatPrecision sets DisplayFloatPrecision to n decimal places.
// DisplayFloatPrecision is an option when printing a Series.
// It is the number of decimal places in floating point values and index labels.
//
// Default precision: 2 decimal places
func SetDisplayFloatPrecision(n int) {
	displayFloatPrecision = n
}

// GetDisplayFloatPrecision returns DisplayFloatPrecision.
func GetDisplayFloatPrecision() int {
	return displayFloatPrecision
}

// SetDisplayRepeatedLabels sets DisplayRepeatedLabels to boolean.
// DisplayRepeatedLabels is an option when printing a Series.
// If true, all index labels will be shown, like so:
//
//	A 0 foo
//	B 0 bar
//	C 1 baz
//
// If false, repeated index labels in the same level will be excluded, like so:
//
//	A 0 foo
//	B ... bar
//	C 1 baz
//
// NB: ellipsis not included in actual printing
//
// Default: false
func SetDisplayRepeatedLabels(boolean bool) {
	displayRepeatedLabels = boolean
}

// GetDisplayRepeatedLabels returns DisplayRepeatedLabels.
func GetDisplayRepeatedLabels() bool {
	return displayRepeatedLabels
}

// SetDisplayStringNullFiller sets DisplayStringNullFiller to "s".
// DisplayStringNullFiller is an option when printing a Series.
// It is how null string values are represented.
//
// Default: "NaN"
func SetDisplayStringNullFiller(s string) {
	displayStringNullFiller = s
}

// GetDisplayStringNullFiller returns DisplayStringNullFiller.
func GetDisplayStringNullFiller() string {
	return displayStringNullFiller
}

// SetDisplayTimeFormat formats how datetimes are displayed, using the layout syntax of the standard library's time package.
//
// Default: "1/2/2006T15:04:05"
func SetDisplayTimeFormat(s string) {
	displayTimeFormat = s
}

// GetDisplayTimeFormat returns DisplayTimeFormat.
func GetDisplayTimeFormat() string {
	return displayTimeFormat
}

// SetStringNullValues sets StringNullValues to include only those items contained in nullList.
// StringNullValues is an option when constructing or converting a Series.
// It is the list of string values that are considered null.
// The slice is stored as-is, not copied, so later changes to nullList's elements are visible through this option.
//
// Default: []string{"NaN", "n/a", "N/A", "", "nil"}
func SetStringNullValues(nullList []string) {
	stringNullValues = nullList
}

// GetStringNullValues returns StringNullValues.
// The returned slice is the stored slice itself, not a copy.
func GetStringNullValues() []string {
	return stringNullValues
}

// SetLogWarnings sets LogWarnings to boolean.
// LogWarnings is an option when executing functions within this module.
// If true, non-returned errors are logged to stderr.
// This is relevant for many common exploratory methods, which are often chained together and therefore not designed to return an error value.
//
// Default: true
func SetLogWarnings(boolean bool) {
	logWarnings = boolean
}

// GetLogWarnings returns LogWarnings.
func GetLogWarnings() bool {
	return logWarnings
}

// SetAsync sets Async to boolean.
// Async is an option for executing certain operations over multiple groups (e.g., math on Groupings or Columns) as goroutines instead of synchronously.
// If true, eligible operations are split into goroutines and merged back together.
//
// Default: true
func SetAsync(boolean bool) {
	async = boolean
}

// GetAsync returns Async.
206 | func GetAsync() bool { 207 | return async 208 | } 209 | -------------------------------------------------------------------------------- /dataframe/select.go: -------------------------------------------------------------------------------- 1 | package dataframe 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | 7 | "github.com/ptiger10/pd/internal/values" 8 | "github.com/ptiger10/pd/options" 9 | "github.com/ptiger10/pd/series" 10 | ) 11 | 12 | // Row returns information about the values and index labels in this row but panics if an out-of-range position is provided. 13 | func (df *DataFrame) Row(position int) Row { 14 | vals := make([]interface{}, df.NumCols()) 15 | nulls := make([]bool, df.NumCols()) 16 | types := make([]options.DataType, df.NumCols()) 17 | for m := 0; m < df.NumCols(); m++ { 18 | vals[m] = df.vals[m].Values.Value(position) 19 | nulls[m] = df.vals[m].Values.Null(position) 20 | types[m] = df.vals[m].DataType 21 | } 22 | idxElems := df.index.Elements(position) 23 | return Row{Values: vals, Nulls: nulls, ValueTypes: types, Labels: idxElems.Labels, LabelTypes: idxElems.DataTypes} 24 | } 25 | 26 | // SelectLabel returns the integer location of the first row in index level 0 with the supplied label, or -1 if the label does not exist. 27 | func (df *DataFrame) SelectLabel(label string) int { 28 | if df.IndexLevels() == 0 { 29 | if options.GetLogWarnings() { 30 | log.Println("DataFrame.SelectLabel(): index has no levels") 31 | } 32 | return -1 33 | } 34 | df.index.Levels[0].UpdateLabelMap() 35 | val, ok := df.index.Levels[0].LabelMap[label] 36 | if !ok { 37 | if options.GetLogWarnings() { 38 | log.Printf("DataFrame.SelectLabel(): %v not in label map\n", label) 39 | } 40 | return -1 41 | } 42 | return val[0] 43 | } 44 | 45 | // SelectLabels returns the integer locations of all rows with the supplied labels within the supplied level. 46 | // If an error is encountered, returns a new slice of 0 length. 
47 | func (df *DataFrame) SelectLabels(labels []string, level int) []int { 48 | empty := make([]int, 0) 49 | err := df.ensureIndexLevelPositions([]int{level}) 50 | if err != nil { 51 | if options.GetLogWarnings() { 52 | log.Printf("DataFrame.SelectLabels(): %v", err) 53 | } 54 | return empty 55 | } 56 | df.index.Levels[level].UpdateLabelMap() 57 | include := make([]int, 0) 58 | for _, label := range labels { 59 | val, ok := df.index.Levels[level].LabelMap[label] 60 | if !ok { 61 | if options.GetLogWarnings() { 62 | log.Printf("DataFrame.SelectLabels(): %v not in label map", label) 63 | } 64 | return empty 65 | } 66 | include = append(include, val...) 67 | } 68 | return include 69 | } 70 | 71 | // SelectCol returns the integer location of the first row in index level 0 with the supplied label, or -1 if the label does not exist. 72 | func (df *DataFrame) SelectCol(label string) int { 73 | if df.ColLevels() == 0 { 74 | if options.GetLogWarnings() { 75 | log.Println("DataFrame.SelectCol(): index has no levels") 76 | } 77 | return -1 78 | } 79 | val, ok := df.cols.Levels[0].LabelMap[label] 80 | if !ok { 81 | if options.GetLogWarnings() { 82 | log.Printf("DataFrame.SelectCol(): %v not in label map\n", label) 83 | } 84 | return -1 85 | } 86 | return val[0] 87 | } 88 | 89 | // SelectCols returns the integer locations of all columns with the supplied labels within the supplied level. 90 | // If an error is encountered, returns a new slice of 0 length. 
91 | func (df *DataFrame) SelectCols(labels []string, level int) []int { 92 | empty := make([]int, 0) 93 | err := df.ensureColumnLevelPositions([]int{level}) 94 | if err != nil { 95 | if options.GetLogWarnings() { 96 | log.Printf("DataFrame.SelectCols(): %v", err) 97 | } 98 | return empty 99 | } 100 | include := make([]int, 0) 101 | for _, label := range labels { 102 | val, ok := df.cols.Levels[level].LabelMap[label] 103 | if !ok { 104 | if options.GetLogWarnings() { 105 | log.Printf("DataFrame.SelectCols(): %v not in label map", label) 106 | } 107 | return empty 108 | } 109 | include = append(include, val...) 110 | } 111 | return include 112 | } 113 | 114 | // Col returns the first Series with the specified column label at column level 0. 115 | func (df *DataFrame) Col(label string) *series.Series { 116 | colPos, ok := df.cols.Levels[0].LabelMap[label] 117 | if !ok { 118 | if options.GetLogWarnings() { 119 | log.Printf("df.Col(): invalid column label: %v not in labels", label) 120 | } 121 | s, _ := series.New(nil) 122 | return s 123 | } 124 | return df.hydrateSeries(colPos[0]) 125 | } 126 | 127 | // ColAt returns the Series at the specified column. 128 | func (df *DataFrame) ColAt(col int) *series.Series { 129 | if err := df.ensureColumnPositions([]int{col}); err != nil { 130 | if options.GetLogWarnings() { 131 | log.Printf("df.ColAt(): %v", err) 132 | } 133 | s, _ := series.New(nil) 134 | return s 135 | } 136 | return df.hydrateSeries(col) 137 | } 138 | 139 | // subsetRows subsets a DataFrame to include only index items and values at the row positions supplied and modifies the DataFrame in place. 
140 | func (ip InPlace) subsetRows(positions []int) { 141 | for m := 0; m < ip.df.NumCols(); m++ { 142 | ip.df.vals[m].Values = ip.df.vals[m].Values.Subset(positions) 143 | } 144 | 145 | ip.df.index.Subset(positions) 146 | } 147 | 148 | // subsetRows subsets a DataFrame to include only index items and values at the row positions supplied and modifies the DataFrame in place. 149 | // For use in internal functions that do not expect en error, such as GroupBy. 150 | func (df *DataFrame) subsetRows(positions []int) *DataFrame { 151 | df = df.Copy() 152 | df.InPlace.subsetRows(positions) 153 | return df 154 | } 155 | 156 | // SubsetRows subsets a DataFrame to include only the rows at supplied integer positions and modifies the DataFrame in place. 157 | func (ip InPlace) SubsetRows(rowPositions []int) error { 158 | if len(rowPositions) == 0 { 159 | return fmt.Errorf("dataframe.SubsetRows(): no valid rows provided") 160 | } 161 | if err := ip.df.ensureRowPositions(rowPositions); err != nil { 162 | return fmt.Errorf("dataframe.SubsetRows(): %v", err) 163 | } 164 | 165 | ip.subsetRows(rowPositions) 166 | return nil 167 | } 168 | 169 | // SubsetRows subsets a DataFrame to include only the rows at supplied integer positions and returns a new DataFrame. 170 | func (df *DataFrame) SubsetRows(rowPositions []int) (*DataFrame, error) { 171 | df = df.Copy() 172 | err := df.InPlace.SubsetRows(rowPositions) 173 | return df, err 174 | } 175 | 176 | // SubsetColumns subsets a DataFrame to include only the columns at supplied integer positions and modifies the DataFrame in place. 
177 | func (ip InPlace) SubsetColumns(columnPositions []int) error { 178 | if len(columnPositions) == 0 { 179 | return fmt.Errorf("dataframe.SubsetColumns(): no valid columns provided") 180 | } 181 | 182 | if err := ip.df.ensureColumnPositions(columnPositions); err != nil { 183 | return fmt.Errorf("dataframe.SubsetColumns(): %v", err) 184 | } 185 | 186 | vals := make([]values.Container, len(columnPositions)) 187 | for i, pos := range columnPositions { 188 | vals[i] = ip.df.vals[pos] 189 | } 190 | ip.df.vals = vals 191 | ip.df.cols.Subset(columnPositions) 192 | 193 | return nil 194 | } 195 | 196 | // SubsetColumns subsets a DataFrame to include only the columns at supplied integer positions and returns a new DataFrame. 197 | func (df *DataFrame) SubsetColumns(columnPositions []int) (*DataFrame, error) { 198 | df = df.Copy() 199 | err := df.InPlace.SubsetColumns(columnPositions) 200 | return df, err 201 | } 202 | -------------------------------------------------------------------------------- /series/filter.go: -------------------------------------------------------------------------------- 1 | package series 2 | 3 | import ( 4 | "strings" 5 | "time" 6 | 7 | "github.com/ptiger10/pd/internal/values" 8 | ) 9 | 10 | // Apply a callback function to every value in a Series and return a new Series. 11 | // The Apply function iterates over all Series values in interface{} form and applies the callback function to each. 12 | // The final values are then converted to match the datatype of the original Series. 13 | // The caller is responsible for handling the type assertion on the interface, though this step is not necessary if the datatype is known with certainty. 
14 | // For example, here are two ways to write an apply function that computes the z-score of every row and rounds it two decimal points: 15 | // 16 | // #1 (safer) error check type assertion 17 | // 18 | // s.Apply(func(val interface{}) interface{} { 19 | // v, ok := val.(float64) 20 | // if !ok { 21 | // return "" 22 | // } 23 | // return (v - s.Mean()) / s.Std() 24 | // 25 | // Input: 26 | // 0 1 27 | // 1 2 28 | // 2 3 29 | // 30 | // Output: 31 | // 0 -1.22... 32 | // 1 0 33 | // 2 1.22... 34 | // 35 | // #2 (riskier) no error check 36 | // 37 | // s.Apply(func(val interface{}) interface{} { 38 | // return (val.(float64) - s.Mean()) / s.Std() 39 | // }) 40 | func (s *Series) Apply(fn func(interface{}) interface{}) *Series { 41 | ret := s.Copy() 42 | ret.InPlace.Apply(fn) 43 | return ret 44 | } 45 | 46 | // Apply a callback function to every value in a Series and modify the Series in place. 47 | func (ip InPlace) Apply(fn func(interface{}) interface{}) { 48 | vals := ip.s.Values() 49 | newVals := make([]interface{}, 0) 50 | for _, val := range vals { 51 | newVal := fn(val) 52 | newVals = append(newVals, newVal) 53 | } 54 | // ducks error because []interface{} as arg in InterfaceFactory cannot trigger unsupported error 55 | container := values.MustCreateValuesFromInterface(newVals) 56 | ret, _ := values.Convert(container.Values, ip.s.datatype) 57 | ip.s.values = ret 58 | } 59 | 60 | // Filter a Series using a callback function test. 61 | // The Filter function iterates over all Series values in interface{} form and applies the callback test to each. 62 | // The return value is a slice of integer positions of all the rows passing the test. 63 | // The caller is responsible for handling the type assertion on the interface, though this step is not necessary if the datatype is known with certainty. 
64 | // For example, here are two ways to write a filter that returns all rows with the suffix "boo": 65 | // 66 | // #1 (safer) error check type assertion 67 | // 68 | // s.Filter(func(val interface{}) bool { 69 | // v, ok := val.(string) 70 | // if !ok { 71 | // return false 72 | // } 73 | // if strings.HasSuffix(v, "boo") { 74 | // return true 75 | // } 76 | // return false 77 | // }) 78 | // 79 | // Input: 80 | // 0 bamboo 81 | // 1 leaves 82 | // 2 taboo 83 | // 84 | // Output: 85 | // []int{0,2} 86 | // 87 | // #2 (riskier) no error check 88 | // 89 | // s.Filter(func(val interface{}) bool { 90 | // if strings.HasSuffix(val.(string), "boo") { 91 | // return true 92 | // } 93 | // return false 94 | // }) 95 | func (s *Series) Filter(cmp func(interface{}) bool) []int { 96 | vals := s.Values() 97 | include := make([]int, 0) 98 | for i, val := range vals { 99 | if cmp(val) { 100 | include = append(include, i) 101 | } 102 | } 103 | return include 104 | } 105 | 106 | // filterFloat64 converts a Series to float values, applies a filter, and returns the rows where the condition is true. 107 | func (s *Series) filterFloat64(cmp func(float64) bool) []int { 108 | include := make([]int, 0) 109 | vals := s.ToFloat64().values.Vals().([]float64) 110 | for i, val := range vals { 111 | if cmp(val) { 112 | include = append(include, i) 113 | } 114 | } 115 | return include 116 | } 117 | 118 | // filterString converts a Series to string values, applies a filter, and returns the rows where the condition is true. 119 | func (s *Series) filterString(cmp func(string) bool) []int { 120 | include := make([]int, 0) 121 | vals := s.ToString().values.Vals().([]string) 122 | for i, val := range vals { 123 | if cmp(val) { 124 | include = append(include, i) 125 | } 126 | } 127 | return include 128 | } 129 | 130 | // filterBool converts a Series to bool values, applies a filter, and returns the rows where the condition is true. 
131 | func (s *Series) filterBool(cmp func(bool) bool) []int { 132 | include := make([]int, 0) 133 | vals := s.ToBool().values.Vals().([]bool) 134 | for i, val := range vals { 135 | if cmp(val) { 136 | include = append(include, i) 137 | } 138 | } 139 | return include 140 | } 141 | 142 | // filterDateTime converts a Series to datetime values, applies a filter, and returns the rows where the condition is true. 143 | func (s *Series) filterDateTime(cmp func(time.Time) bool) []int { 144 | include := make([]int, 0) 145 | vals := s.ToDateTime().values.Vals().([]time.Time) 146 | for i, val := range vals { 147 | if cmp(val) { 148 | include = append(include, i) 149 | } 150 | } 151 | return include 152 | } 153 | 154 | // GT filter: Greater Than (numeric). 155 | func (s *Series) GT(comparison float64) []int { 156 | return s.filterFloat64(func(elem float64) bool { 157 | return elem > comparison 158 | }) 159 | } 160 | 161 | // GTE filter: Greater Than or Equal To (numeric). 162 | func (s *Series) GTE(comparison float64) []int { 163 | return s.filterFloat64(func(elem float64) bool { 164 | return elem >= comparison 165 | }) 166 | } 167 | 168 | // LT filter - Less Than (numeric). 169 | func (s *Series) LT(comparison float64) []int { 170 | return s.filterFloat64(func(elem float64) bool { 171 | return elem < comparison 172 | }) 173 | } 174 | 175 | // LTE filter - Less Than or Equal To (numeric). 176 | func (s *Series) LTE(comparison float64) []int { 177 | return s.filterFloat64(func(elem float64) bool { 178 | return elem <= comparison 179 | }) 180 | } 181 | 182 | // EQ filter - Equal To (numeric). 183 | func (s *Series) EQ(comparison float64) []int { 184 | return s.filterFloat64(func(elem float64) bool { 185 | return elem == comparison 186 | }) 187 | } 188 | 189 | // NEQ filter - Not Equal To (numeric). 
190 | func (s *Series) NEQ(comparison float64) []int { 191 | return s.filterFloat64(func(elem float64) bool { 192 | return elem != comparison 193 | }) 194 | } 195 | 196 | // Contains filter - value contains substr (string). 197 | func (s *Series) Contains(substr string) []int { 198 | return s.filterString(func(elem string) bool { 199 | return strings.Contains(elem, substr) 200 | }) 201 | } 202 | 203 | // InList filter - value is contained within list (string). 204 | func (s *Series) InList(list []string) []int { 205 | return s.filterString(func(elem string) bool { 206 | for _, s := range list { 207 | if elem == s { 208 | return true 209 | } 210 | } 211 | return false 212 | }) 213 | } 214 | 215 | // True filter - value is true (bool). 216 | func (s *Series) True() []int { 217 | return s.filterBool(func(elem bool) bool { 218 | return elem 219 | }) 220 | } 221 | 222 | // False filter - value is false (bool). 223 | func (s *Series) False() []int { 224 | return s.filterBool(func(elem bool) bool { 225 | return !elem 226 | }) 227 | } 228 | 229 | // Before filter - value is before a specific time (datetime). 230 | func (s *Series) Before(t time.Time) []int { 231 | return s.filterDateTime(func(elem time.Time) bool { 232 | return elem.Before(t) 233 | }) 234 | } 235 | 236 | // After filter - value is after a specific time (datetime). 
237 | func (s *Series) After(t time.Time) []int { 238 | return s.filterDateTime(func(elem time.Time) bool { 239 | return elem.After(t) 240 | }) 241 | } 242 | -------------------------------------------------------------------------------- /series/math_test.go: -------------------------------------------------------------------------------- 1 | package series 2 | 3 | import ( 4 | "math" 5 | "testing" 6 | 7 | "github.com/ptiger10/pd/options" 8 | ) 9 | 10 | func TestSeriesMath(t *testing.T) { 11 | s, _ := New([]int{1, 2, 3}) 12 | if sum := s.Sum(); sum != 6 { 13 | t.Errorf("s.Sum() returned %v, want %v", sum, 6) 14 | } 15 | if mean := s.Mean(); mean != 2 { 16 | t.Errorf("s.Mean() returned %v, want %v", mean, 2) 17 | } 18 | 19 | } 20 | 21 | func TestMath_numerics(t *testing.T) { 22 | var tests = []struct { 23 | name string 24 | s *Series 25 | wantSum float64 26 | wantMean float64 27 | wantMedian float64 28 | wantMin float64 29 | wantMax float64 30 | wantQ1 float64 31 | wantQ2 float64 32 | wantQ3 float64 33 | wantStd float64 34 | }{ 35 | {"float with null", MustNew([]float64{math.NaN(), math.NaN(), 2, 3, 1, 4}), 10, 2.5, 2.5, 1, 4, 1.5, 2.5, 3.5, 1.12}, 36 | {"float from string with null", MustNew([]string{"", "", "1", "2", "3", "4", "5"}).ToFloat64(), 15, 3, 3, 1, 5, 1.5, 3, 4.5, 1.41}, 37 | {"int from string with null", MustNew([]string{"", "", "1", "2", "3", "4", "5"}).ToInt64(), 15, 3, 3, 1, 5, 1.5, 3, 4.5, 1.41}, 38 | {"int", MustNew([]int{2, 1, 3, 4, 5, 6, 7, 8, 9}), 45, 5, 5, 1, 9, 2.5, 5, 7.5, 2.58}, 39 | {"float", MustNew([]float64{1, 2, 3, 4, 5, 6, 7, 8, 9}), 45, 5, 5, 1, 9, 2.5, 5, 7.5, 2.58}, 40 | {"float with negative", MustNew([]float64{2, -1, 4, 3}), 8, 2, 2.5, -1, 4, 0.5, 2.5, 3.5, 1.87}, 41 | } 42 | for _, tt := range tests { 43 | t.Run(tt.name, func(t *testing.T) { 44 | gotSum := tt.s.Sum() 45 | if gotSum != tt.wantSum { 46 | t.Errorf("Sum()returned %v, want %v", gotSum, tt.wantSum) 47 | } 48 | gotMean := tt.s.Mean() 49 | if gotMean != tt.wantMean { 50 
| t.Errorf("Mean()returned %v, want %v", gotMean, tt.wantMean) 51 | } 52 | gotMedian := tt.s.Median() 53 | if gotMedian != tt.wantMedian { 54 | t.Errorf("Median()returned %v, want %v", gotMedian, tt.wantMedian) 55 | } 56 | gotMin := tt.s.Min() 57 | if gotMin != tt.wantMin { 58 | t.Errorf("Min()returned %v, want %v", gotMin, tt.wantMin) 59 | } 60 | gotMax := tt.s.Max() 61 | if gotMax != tt.wantMax { 62 | t.Errorf("Max()returned %v, want %v", gotMax, tt.wantMax) 63 | } 64 | gotQ1 := tt.s.Quartile(1) 65 | if gotQ1 != tt.wantQ1 { 66 | t.Errorf("Quartile(1)returned %v, want %v", gotQ1, tt.wantQ1) 67 | } 68 | gotQ2 := tt.s.Quartile(2) 69 | if gotQ2 != tt.wantQ2 { 70 | t.Errorf("Quartile(2)returned %v, want %v", gotQ2, tt.wantQ2) 71 | } 72 | gotQ3 := tt.s.Quartile(3) 73 | if gotQ3 != tt.wantQ3 { 74 | t.Errorf("Quartile(3)returned %v, want %v", gotQ3, tt.wantQ3) 75 | } 76 | gotStd := tt.s.Std() 77 | if math.Round(gotStd*100)/100 != math.Round(tt.wantStd*100)/100 { 78 | t.Errorf("Std()returned %v, want %v", gotStd, tt.wantStd) 79 | } 80 | }) 81 | 82 | } 83 | } 84 | 85 | func TestMath_numerics_async(t *testing.T) { 86 | var tests = []struct { 87 | name string 88 | s *Series 89 | wantSum float64 90 | wantMean float64 91 | wantMedian float64 92 | wantMin float64 93 | wantMax float64 94 | wantQ1 float64 95 | wantQ2 float64 96 | wantQ3 float64 97 | wantStd float64 98 | }{ 99 | {"float with null", MustNew([]float64{math.NaN(), math.NaN(), 2, 3, 1, 4}), 10, 2.5, 2.5, 1, 4, 1.5, 2.5, 3.5, 1.12}, 100 | } 101 | for _, tt := range tests { 102 | options.SetAsync(false) 103 | defer options.RestoreDefaults() 104 | t.Run(tt.name, func(t *testing.T) { 105 | gotSum := tt.s.Sum() 106 | if gotSum != tt.wantSum { 107 | t.Errorf("Sum()returned %v, want %v", gotSum, tt.wantSum) 108 | } 109 | gotMean := tt.s.Mean() 110 | if gotMean != tt.wantMean { 111 | t.Errorf("Mean()returned %v, want %v", gotMean, tt.wantMean) 112 | } 113 | gotMedian := tt.s.Median() 114 | if gotMedian != tt.wantMedian { 115 
| t.Errorf("Median()returned %v, want %v", gotMedian, tt.wantMedian) 116 | } 117 | gotMin := tt.s.Min() 118 | if gotMin != tt.wantMin { 119 | t.Errorf("Min()returned %v, want %v", gotMin, tt.wantMin) 120 | } 121 | gotMax := tt.s.Max() 122 | if gotMax != tt.wantMax { 123 | t.Errorf("Max()returned %v, want %v", gotMax, tt.wantMax) 124 | } 125 | gotQ1 := tt.s.Quartile(1) 126 | if gotQ1 != tt.wantQ1 { 127 | t.Errorf("Quartile(1)returned %v, want %v", gotQ1, tt.wantQ1) 128 | } 129 | gotQ2 := tt.s.Quartile(2) 130 | if gotQ2 != tt.wantQ2 { 131 | t.Errorf("Quartile(2)returned %v, want %v", gotQ2, tt.wantQ2) 132 | } 133 | gotQ3 := tt.s.Quartile(3) 134 | if gotQ3 != tt.wantQ3 { 135 | t.Errorf("Quartile(3)returned %v, want %v", gotQ3, tt.wantQ3) 136 | } 137 | gotStd := tt.s.Std() 138 | if math.Round(gotStd*100)/100 != math.Round(tt.wantStd*100)/100 { 139 | t.Errorf("Std()returned %v, want %v", gotStd, tt.wantStd) 140 | } 141 | }) 142 | 143 | } 144 | } 145 | 146 | func TestMath_bool(t *testing.T) { 147 | var tests = []struct { 148 | s *Series 149 | wantSum float64 150 | wantMean float64 151 | }{ 152 | {MustNew([]string{"", "true"}).ToBool(), 1, 1}, 153 | {MustNew([]bool{true, false}), 1, .5}, 154 | {MustNew([]bool{false}), 0, 0}, 155 | } 156 | for _, tt := range tests { 157 | gotSum := tt.s.Sum() 158 | if gotSum != tt.wantSum { 159 | t.Errorf("Sum()returned %v, want %v", gotSum, tt.wantSum) 160 | } 161 | gotMean := tt.s.Mean() 162 | if gotMean != tt.wantMean { 163 | t.Errorf("Mean()returned %v, want %v", gotMean, tt.wantMean) 164 | } 165 | } 166 | } 167 | 168 | func TestMath_unsupported(t *testing.T) { 169 | var tests = []struct { 170 | name string 171 | s *Series 172 | }{ 173 | {"string", MustNew([]string{"foo"})}, 174 | {"null", MustNew([]string{})}, 175 | } 176 | for _, tt := range tests { 177 | t.Run(tt.name, func(t *testing.T) { 178 | gotSum := tt.s.Sum() 179 | if !math.IsNaN(gotSum) { 180 | t.Errorf("Sum()returned %v, want NaN", gotSum) 181 | } 182 | gotMean := 
tt.s.Mean() 183 | if !math.IsNaN(gotMean) { 184 | t.Errorf("Mean()returned %v, want NaN", gotMean) 185 | } 186 | gotMedian := tt.s.Median() 187 | if !math.IsNaN(gotMedian) { 188 | t.Errorf("Median()returned %v, want NaN", gotMedian) 189 | } 190 | gotMin := tt.s.Min() 191 | if !math.IsNaN(gotMin) { 192 | t.Errorf("Min()returned %v, want NaN", gotMin) 193 | } 194 | gotMax := tt.s.Max() 195 | if !math.IsNaN(gotMax) { 196 | t.Errorf("Max()returned %v, want NaN", gotMax) 197 | } 198 | gotQ1 := tt.s.Quartile(1) 199 | if !math.IsNaN(gotQ1) { 200 | t.Errorf("Quartile(1)returned %v, want NaN", gotQ1) 201 | } 202 | gotQ2 := tt.s.Quartile(2) 203 | if !math.IsNaN(gotQ2) { 204 | t.Errorf("Quartile(2)returned %v, want NaN", gotQ2) 205 | } 206 | gotQ3 := tt.s.Quartile(3) 207 | if !math.IsNaN(gotQ3) { 208 | t.Errorf("Quartile(3)returned %v, want NaN", gotQ3) 209 | } 210 | gotStd := tt.s.Std() 211 | if !math.IsNaN(gotStd) { 212 | t.Errorf("Std()returned %v, want NaN", gotStd) 213 | } 214 | }) 215 | } 216 | } 217 | 218 | func TestMath_unsupported_other(t *testing.T) { 219 | s := MustNew([]float64{}) 220 | got := s.Std() 221 | if !math.IsNaN(got) { 222 | t.Errorf("Std()returned %v, want NaN", got) 223 | } 224 | got = s.Median() 225 | if !math.IsNaN(got) { 226 | t.Errorf("Median()returned %v, want NaN", got) 227 | } 228 | gotQuartiles := s.quartiles() 229 | if gotQuartiles != nil { 230 | t.Errorf("Empty quartiles()returned %v, want nil", gotQuartiles) 231 | } 232 | 233 | s = MustNew([]float64{1}) 234 | got = s.Quartile(10) 235 | if !math.IsNaN(got) { 236 | t.Errorf("Quartile()returned %v, want NaN", got) 237 | } 238 | 239 | s = MustNew([]float64{1}) 240 | gotQuartiles = s.quartiles() 241 | if !math.IsNaN(gotQuartiles[0]) || gotQuartiles[1] != 1 || !math.IsNaN(gotQuartiles[2]) { 242 | t.Errorf("quartiles() of len < 4 returned %v, want NaN, median, NaN", gotQuartiles) 243 | } 244 | } 245 | -------------------------------------------------------------------------------- 
/series/select_test.go: -------------------------------------------------------------------------------- 1 | package series 2 | 3 | import ( 4 | "bytes" 5 | "log" 6 | "os" 7 | "reflect" 8 | "strings" 9 | "testing" 10 | 11 | "github.com/ptiger10/pd/options" 12 | ) 13 | 14 | func TestElement(t *testing.T) { 15 | s, err := New([]string{"", "valid"}, Config{MultiIndex: []interface{}{[]string{"A", "B"}, []int{1, 2}}}) 16 | if err != nil { 17 | t.Error(err) 18 | } 19 | var tests = []struct { 20 | position int 21 | wantVal interface{} 22 | wantNull bool 23 | wantIdx []interface{} 24 | }{ 25 | {0, "NaN", true, []interface{}{"A", int64(1)}}, 26 | {1, "valid", false, []interface{}{"B", int64(2)}}, 27 | } 28 | wantIdxTypes := []options.DataType{options.String, options.Int64} 29 | for _, test := range tests { 30 | got := s.Element(test.position) 31 | if got.Value != test.wantVal { 32 | t.Errorf("Element returned value %v, want %v", got.Value, test.wantVal) 33 | } 34 | if got.Null != test.wantNull { 35 | t.Errorf("Element returned bool %v, want %v", got.Null, test.wantNull) 36 | } 37 | if !reflect.DeepEqual(got.Labels, test.wantIdx) { 38 | t.Errorf("Element returned index %#v, want %#v", got.Labels, test.wantIdx) 39 | } 40 | if !reflect.DeepEqual(got.LabelTypes, wantIdxTypes) { 41 | t.Errorf("Element returned kind %v, want %v", got.LabelTypes, wantIdxTypes) 42 | } 43 | } 44 | } 45 | 46 | func TestSeries_At(t *testing.T) { 47 | type args struct { 48 | position int 49 | } 50 | tests := []struct { 51 | name string 52 | input *Series 53 | args args 54 | want interface{} 55 | }{ 56 | {name: "pass", input: MustNew([]string{"foo", "bar", "baz"}), args: args{1}, want: "bar"}, 57 | {name: "nil", input: MustNew([]string{"", "bar", "baz"}), args: args{0}, want: nil}, 58 | {"fail: invalid position", MustNew([]string{"foo", "bar", "baz"}), args{10}, nil}, 59 | } 60 | for _, tt := range tests { 61 | t.Run(tt.name, func(t *testing.T) { 62 | var buf bytes.Buffer 63 | log.SetOutput(&buf) 64 | 
defer log.SetOutput(os.Stderr) 65 | // test panic 66 | if strings.Contains(tt.name, "fail") { 67 | defer func() { 68 | if r := recover(); r == nil { 69 | t.Errorf("The code did not panic") 70 | } 71 | }() 72 | 73 | tt.input.At(tt.args.position) 74 | if buf.String() == "" { 75 | t.Errorf("Series.At() returned no log message, want log due to fail") 76 | } 77 | return 78 | } 79 | if got := tt.input.At(tt.args.position); !reflect.DeepEqual(got, tt.want) { 80 | t.Errorf("Series.At() = %v, want %v", got, tt.want) 81 | } 82 | 83 | }) 84 | } 85 | } 86 | 87 | func TestFrom(t *testing.T) { 88 | s := MustNew([]string{"foo", "bar", "baz"}, Config{Index: []int{0, 1, 2}}) 89 | type args struct { 90 | start int 91 | end int 92 | } 93 | tests := []struct { 94 | name string 95 | input *Series 96 | args args 97 | want *Series 98 | }{ 99 | {name: "ascending", input: s, args: args{start: 0, end: 2}, want: MustNew([]string{"foo", "bar", "baz"}, Config{Index: []int{0, 1, 2}})}, 100 | {"single", s, args{1, 1}, MustNew([]string{"bar"}, Config{Index: []int{1}})}, 101 | {"partial", s, args{1, 2}, MustNew([]string{"bar", "baz"}, Config{Index: []int{1, 2}})}, 102 | {"descending", s, args{2, 0}, MustNew([]string{"baz", "bar", "foo"}, Config{Index: []int{2, 1, 0}})}, 103 | {"fail: partial invalid", s, args{10, 0}, newEmptySeries()}, 104 | } 105 | for _, tt := range tests { 106 | t.Run(tt.name, func(t *testing.T) { 107 | var buf bytes.Buffer 108 | log.SetOutput(&buf) 109 | defer log.SetOutput(os.Stderr) 110 | 111 | got := tt.input.From(tt.args.start, tt.args.end) 112 | if !Equal(got, tt.want) { 113 | t.Errorf("Series.From() got %v, want %v", s, tt.want) 114 | } 115 | 116 | if strings.Contains(tt.name, "fail") { 117 | if buf.String() == "" { 118 | t.Errorf("Series.From() returned no log message, want log due to fail") 119 | } 120 | } 121 | }) 122 | 123 | } 124 | } 125 | 126 | func TestSeries_XS(t *testing.T) { 127 | s := MustNew([]string{"foo", "bar", "baz"}, Config{MultiIndex: 
[]interface{}{[]int{1, 2, 3}, []string{"qux", "quux", "quuz"}}}) 128 | type args struct { 129 | rowPositions []int 130 | levelPositions []int 131 | } 132 | type want struct { 133 | series *Series 134 | err bool 135 | } 136 | tests := []struct { 137 | name string 138 | input *Series 139 | args args 140 | want want 141 | }{ 142 | {name: "pass", input: s, args: args{[]int{0, 1}, []int{0}}, 143 | want: want{series: MustNew([]string{"foo", "bar"}, Config{MultiIndex: []interface{}{[]int{1, 2}}}), err: false}}, 144 | {"pass reverse", s, args{[]int{1, 0}, []int{1}}, 145 | want{MustNew([]string{"bar", "foo"}, Config{MultiIndex: []interface{}{[]string{"quux", "qux"}}}), false}}, 146 | {"pass multi reverse", s, args{[]int{1, 0}, []int{1, 0}}, 147 | want{MustNew([]string{"bar", "foo"}, Config{MultiIndex: []interface{}{[]string{"quux", "qux"}, []int{2, 1}}}), false}}, 148 | {"fail: invalid row position", s, args{[]int{10}, []int{0}}, want{newEmptySeries(), true}}, 149 | {"fail: partial invalid row position", s, args{[]int{0, 10}, []int{0}}, want{newEmptySeries(), true}}, 150 | {"fail: invalid level position", s, args{[]int{0}, []int{10}}, want{newEmptySeries(), true}}, 151 | {"fail: partial invalid level position", s, args{[]int{0}, []int{0, 10}}, want{newEmptySeries(), true}}, 152 | } 153 | for _, tt := range tests { 154 | t.Run(tt.name, func(t *testing.T) { 155 | got, err := tt.input.XS(tt.args.rowPositions, tt.args.levelPositions) 156 | if (err != nil) != tt.want.err { 157 | t.Errorf("Series.XS() error = %v, want %v", err, tt.want.err) 158 | } 159 | if !reflect.DeepEqual(got, tt.want.series) { 160 | t.Errorf("Series.XS() = %v, want %v", got, tt.want.series) 161 | } 162 | 163 | }) 164 | } 165 | } 166 | 167 | func TestSeries_SelectLabel(t *testing.T) { 168 | s := MustNew([]string{"foo", "bar", "baz"}, Config{Index: []int{1, 1, 1}}) 169 | type args struct { 170 | label string 171 | } 172 | tests := []struct { 173 | name string 174 | input *Series 175 | args args 176 | want int 
177 | }{ 178 | {name: "pass", input: s, args: args{label: "1"}, want: 0}, 179 | {"fail: empty Series", newEmptySeries(), args{label: "1"}, -1}, 180 | {"fail: label not in Series", s, args{label: "100"}, -1}, 181 | } 182 | for _, tt := range tests { 183 | t.Run(tt.name, func(t *testing.T) { 184 | var buf bytes.Buffer 185 | log.SetOutput(&buf) 186 | defer log.SetOutput(os.Stderr) 187 | if got := tt.input.SelectLabel(tt.args.label); got != tt.want { 188 | t.Errorf("Series.SelectLabel() = %v, want %v", got, tt.want) 189 | } 190 | if strings.Contains(tt.name, "fail") { 191 | if buf.String() == "" { 192 | t.Errorf("Series.SelectLabel() returned no log message, want log due to fail") 193 | } 194 | } 195 | }) 196 | } 197 | } 198 | 199 | func TestSeries_SelectLabels(t *testing.T) { 200 | s := MustNew([]string{"foo", "bar", "baz"}, Config{MultiIndex: []interface{}{[]int{1, 1, 1}, []string{"qux", "quux", "quuz"}}}) 201 | type args struct { 202 | labels []string 203 | level int 204 | } 205 | tests := []struct { 206 | name string 207 | input *Series 208 | args args 209 | want []int 210 | }{ 211 | {name: "pass", input: s, args: args{labels: []string{"1"}, level: 0}, want: []int{0, 1, 2}}, 212 | {"pass", s, args{[]string{"qux", "quux"}, 1}, []int{0, 1}}, 213 | {"fail: empty Series", newEmptySeries(), args{[]string{"1"}, 0}, []int{}}, 214 | {"fail: label not in Series", s, args{[]string{"100"}, 0}, []int{}}, 215 | {"fail: invalid level", s, args{[]string{"1"}, 100}, []int{}}, 216 | } 217 | for _, tt := range tests { 218 | t.Run(tt.name, func(t *testing.T) { 219 | var buf bytes.Buffer 220 | log.SetOutput(&buf) 221 | defer log.SetOutput(os.Stderr) 222 | if got := tt.input.SelectLabels(tt.args.labels, tt.args.level); !reflect.DeepEqual(got, tt.want) { 223 | t.Errorf("Series.SelectLabels() = %v, want %v", got, tt.want) 224 | } 225 | if strings.Contains(tt.name, "fail") { 226 | if buf.String() == "" { 227 | t.Errorf("Series.SelectLabels() returned no log message, want log due to fail") 
228 | } 229 | } 230 | }) 231 | } 232 | } 233 | -------------------------------------------------------------------------------- /pd.go: -------------------------------------------------------------------------------- 1 | // Package pd (aka GoPandas) is a library for cleaning, aggregating, and transforming data. 2 | // GoPandas combines a flexible API familiar to Python pandas users with the strengths of Go, 3 | // including type safety, predictable error handling, and concurrent processing. 4 | package pd 5 | 6 | import ( 7 | "bytes" 8 | "encoding/csv" 9 | "fmt" 10 | "io/ioutil" 11 | "log" 12 | 13 | "github.com/ptiger10/pd/dataframe" 14 | "github.com/ptiger10/pd/internal/values" 15 | "github.com/ptiger10/pd/options" 16 | "github.com/ptiger10/pd/series" 17 | ) 18 | 19 | // Series constructs a new Series. 20 | func Series(data interface{}, config ...Config) (*series.Series, error) { 21 | tmp := Config{} 22 | if config != nil { 23 | if len(config) > 1 { 24 | return series.MustNew(nil), fmt.Errorf("pd.Series(): can supply at most one Config (%d > 1)", len(config)) 25 | } 26 | tmp = config[0] 27 | } 28 | sConfig := series.Config{ 29 | Name: tmp.Name, DataType: tmp.DataType, 30 | Index: tmp.Index, IndexName: tmp.IndexName, 31 | MultiIndex: tmp.MultiIndex, MultiIndexNames: tmp.MultiIndexNames, 32 | } 33 | s, err := series.New(data, sConfig) 34 | if err != nil { 35 | return series.MustNew(nil), fmt.Errorf("pd.Series(): %v", err) 36 | } 37 | return s, nil 38 | 39 | } 40 | 41 | // DataFrame constructs a new DataFrame. 
42 | func DataFrame(data []interface{}, config ...Config) (*dataframe.DataFrame, error) { 43 | tmp := Config{} 44 | if config != nil { 45 | if len(config) > 1 { 46 | return dataframe.MustNew(nil), fmt.Errorf("pd.Series(): can supply at most one Config (%d > 1)", len(config)) 47 | } 48 | tmp = config[0] 49 | } 50 | dfConfig := dataframe.Config{ 51 | Name: tmp.Name, DataType: tmp.DataType, 52 | Index: tmp.Index, IndexName: tmp.IndexName, 53 | MultiIndex: tmp.MultiIndex, MultiIndexNames: tmp.MultiIndexNames, 54 | Col: tmp.Col, ColName: tmp.ColName, 55 | MultiCol: tmp.MultiCol, MultiColNames: tmp.MultiColNames, 56 | } 57 | df, err := dataframe.New(data, dfConfig) 58 | if err != nil { 59 | return dataframe.MustNew(nil), fmt.Errorf("pd.DataFrame(): %v", err) 60 | } 61 | return df, nil 62 | } 63 | 64 | // ReadInterface converts [][]interface{}{row1{col1, ...}...} into a DataFrame 65 | func ReadInterface(input [][]interface{}, config ...ReadOptions) (*dataframe.DataFrame, error) { 66 | if len(input) == 0 { 67 | return dataframe.MustNew(nil), fmt.Errorf("ReadInterface(): Input must contain at least one row") 68 | } 69 | if len(input[0]) == 0 { 70 | return dataframe.MustNew(nil), fmt.Errorf("ReadInterface(): must contain at least one column") 71 | } 72 | 73 | data := make([][]interface{}, len(input)) 74 | for i := 0; i < len(input); i++ { 75 | data[i] = make([]interface{}, len(input[0])) 76 | for m := 0; m < len(input[0]); m++ { 77 | data[i][m] = input[i][m] 78 | } 79 | } 80 | 81 | tmp := ReadOptions{} 82 | if config != nil { 83 | if len(config) > 1 { 84 | return dataframe.MustNew(nil), fmt.Errorf("ReadInterface(): can supply at most one ReadOptions (%d > 1)", 85 | len(config)) 86 | } 87 | tmp = config[0] 88 | } 89 | 90 | var tmpMultiCol [][]interface{} 91 | if tmp.DropRows > len(data) { 92 | return dataframe.MustNew(nil), fmt.Errorf("ReadInterface(): DropRows cannot exceed the number of rows (%d > %d)", 93 | tmp.DropRows, len(data)) 94 | } 95 | 96 | data = 
data[tmp.DropRows:] 97 | // header rows 98 | if tmp.HeaderRows > len(data) { 99 | return dataframe.MustNew(nil), fmt.Errorf("ReadInterface(): HeaderRows cannot exceed the number of rows (%d > %d)", 100 | tmp.HeaderRows, len(data)) 101 | } 102 | 103 | tmpMultiCol = data[:tmp.HeaderRows] 104 | for m := 0; m < tmp.HeaderRows; m++ { 105 | tmpMultiCol[m] = tmpMultiCol[m][tmp.IndexCols:] 106 | } 107 | 108 | data = data[tmp.HeaderRows:] 109 | 110 | if tmp.IndexCols > len(data[0]) { 111 | return dataframe.MustNew(nil), fmt.Errorf("ReadInterface(): IndexCols cannot exceed the number of rows (%d > %d)", 112 | tmp.IndexCols, len(data)) 113 | } 114 | 115 | tmpVals := make([][]interface{}, len(data[0])-tmp.IndexCols) 116 | tmpMultiIndex := make([][]interface{}, tmp.IndexCols) 117 | 118 | for m := 0; m < len(data[0])-tmp.IndexCols; m++ { 119 | tmpVals[m] = make([]interface{}, len(data)) 120 | } 121 | for m := 0; m < tmp.IndexCols; m++ { 122 | tmpMultiIndex[m] = make([]interface{}, len(data)) 123 | } 124 | 125 | // transpose index and values 126 | for i := 0; i < len(data); i++ { 127 | for m := 0; m < len(data[0]); m++ { 128 | if m < tmp.IndexCols { 129 | tmpMultiIndex[m][i] = data[i][m] 130 | } else { 131 | tmpVals[m-tmp.IndexCols][i] = data[i][m] 132 | } 133 | } 134 | } 135 | // convert [][]interface{} to []interface{} of []interface for compatibility with DataFrame constructor 136 | var ( 137 | multiIndex []interface{} 138 | vals []interface{} 139 | ) 140 | for _, col := range tmpMultiIndex { 141 | multiIndex = append(multiIndex, col) 142 | } 143 | for _, col := range tmpVals { 144 | vals = append(vals, col) 145 | } 146 | multiCol := make([][]string, len(tmpMultiCol)) 147 | 148 | if len(tmpMultiCol) > 0 { 149 | for j := 0; j < len(tmpMultiCol); j++ { 150 | multiCol[j] = make([]string, len(tmpMultiCol[0])) 151 | for m := 0; m < len(tmpMultiCol[0]); m++ { 152 | multiCol[j][m] = fmt.Sprint(tmpMultiCol[j][m]) 153 | } 154 | } 155 | } 156 | 157 | // ducks error because all 
[]interface{} values are supported and Config properties are controlled 158 | df, _ := DataFrame(vals, Config{Manual: tmp.Manual, MultiIndex: multiIndex, MultiCol: multiCol}) 159 | 160 | for k, v := range tmp.DataTypes { 161 | colInt := df.SelectCol(k) 162 | if colInt != -1 { 163 | df.InPlace.SetCol(colInt, df.ColAt(colInt).Convert(v)) 164 | } 165 | } 166 | for k, v := range tmp.IndexDataTypes { 167 | err := df.Index.Convert(v, k) 168 | if err != nil { 169 | if options.GetLogWarnings() { 170 | log.Printf("warning: ReadInterface() converting IndexDataTypes: %v", err) 171 | } 172 | } 173 | } 174 | df.RenameCols(tmp.Rename) 175 | 176 | return df, nil 177 | } 178 | 179 | // ReadCSV converts a CSV file into a DataFrame. 180 | func ReadCSV(path string, config ...ReadOptions) (*dataframe.DataFrame, error) { 181 | tmp := ReadOptions{} 182 | if config != nil { 183 | if len(config) > 1 { 184 | return dataframe.MustNew(nil), fmt.Errorf("ReadCSV(): can supply at most one ReadOptions (%d > 1)", 185 | len(config)) 186 | } 187 | tmp = config[0] 188 | } 189 | 190 | data, err := ioutil.ReadFile(path) 191 | if err != nil { 192 | return dataframe.MustNew(nil), fmt.Errorf("ReadCSV(): %s", err) 193 | } 194 | reader := csv.NewReader(bytes.NewReader(data)) 195 | if tmp.Delimiter != 0 { 196 | reader.Comma = tmp.Delimiter 197 | } 198 | 199 | records, err := reader.ReadAll() 200 | if err != nil { 201 | return dataframe.MustNew(nil), fmt.Errorf("ReadCSV(): %v", err) 202 | } 203 | if len(records) == 0 { 204 | return dataframe.MustNew(nil), fmt.Errorf("ReadCSV(): input must contain at least one row") 205 | } 206 | 207 | // convert to [][]interface 208 | var interfaceRecords [][]interface{} 209 | for j := 0; j < len(records); j++ { 210 | interfaceRecords = append(interfaceRecords, make([]interface{}, len(records[0]))) 211 | for m := 0; m < len(records[0]); m++ { 212 | // optional interpolation if not in Manual mode 213 | if !tmp.Manual { 214 | interfaceRecords[j][m] = 
values.InterpolateString(records[j][m]) 215 | } else { 216 | interfaceRecords[j][m] = records[j][m] 217 | } 218 | } 219 | } 220 | 221 | df, err := ReadInterface(interfaceRecords, tmp) 222 | if err != nil { 223 | return dataframe.MustNew(nil), fmt.Errorf("ReadCSV(): %v", err) 224 | } 225 | return df, nil 226 | } 227 | 228 | // Config customizes the construction of either a DataFrame or Series. 229 | type Config struct { 230 | Name string 231 | DataType options.DataType 232 | Index interface{} 233 | IndexName string 234 | MultiIndex []interface{} 235 | MultiIndexNames []string 236 | Col []string 237 | ColName string 238 | MultiCol [][]string 239 | MultiColNames []string 240 | Manual bool 241 | } 242 | 243 | // ReadOptions are options for reading in files from other formats 244 | type ReadOptions struct { 245 | DropRows int 246 | HeaderRows int 247 | IndexCols int 248 | Manual bool 249 | DataTypes map[string]string 250 | IndexDataTypes map[int]string 251 | ColumnDataTypes map[int]string 252 | Rename map[string]string 253 | Delimiter rune 254 | } 255 | -------------------------------------------------------------------------------- /dataframe/group.go: -------------------------------------------------------------------------------- 1 | package dataframe 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "sort" 7 | "strings" 8 | "sync" 9 | 10 | "github.com/ptiger10/pd/internal/values" 11 | "github.com/ptiger10/pd/options" 12 | "github.com/ptiger10/pd/series" 13 | ) 14 | 15 | type group struct { 16 | Positions []int 17 | FirstPosition int 18 | } 19 | 20 | func (grp *group) copy() *group { 21 | pos := make([]int, len(grp.Positions)) 22 | for i, p := range grp.Positions { 23 | pos[i] = p 24 | } 25 | return &group{Positions: pos, FirstPosition: grp.FirstPosition} 26 | } 27 | 28 | // copy a grouping 29 | func (g Grouping) copy() Grouping { 30 | grps := make(map[string]*group) 31 | for k, v := range g.groups { 32 | grps[k] = v.copy() 33 | } 34 | return Grouping{ 35 | df: g.df.Copy(), 
36 | groups: grps, 37 | } 38 | } 39 | 40 | // SortedGroups returns all valid group labels in the Grouping, sorted in alphabetical order. 41 | func (g Grouping) SortedGroups() []string { 42 | var keys []string 43 | for k := range g.groups { 44 | keys = append(keys, k) 45 | } 46 | sort.Strings(keys) 47 | return keys 48 | } 49 | 50 | // Groups returns all valid group labels in the Grouping, in their original group position. 51 | func (g Grouping) Groups() []string { 52 | type groupContainer struct { 53 | grp *group 54 | label string 55 | } 56 | var orderedGroups []groupContainer 57 | for k, v := range g.groups { 58 | orderedGroups = append(orderedGroups, groupContainer{grp: v, label: k}) 59 | } 60 | sort.Slice(orderedGroups, func(i, j int) bool { 61 | if orderedGroups[i].grp.FirstPosition < orderedGroups[j].grp.FirstPosition { 62 | return true 63 | } 64 | return false 65 | }) 66 | var labels []string 67 | for _, grp := range orderedGroups { 68 | labels = append(labels, grp.label) 69 | } 70 | return labels 71 | } 72 | 73 | // Len returns the number of groups in the Grouping. 74 | func (g Grouping) Len() int { 75 | return len(g.groups) 76 | } 77 | 78 | // Group returns the DataFrame with the given group label, or an error if that label does not exist. 79 | func (g Grouping) Group(label string) *DataFrame { 80 | group, ok := g.groups[label] 81 | if !ok { 82 | if options.GetLogWarnings() { 83 | log.Printf("s.Grouping.Group(): label %v not in g.Groups()", label) 84 | } 85 | return newEmptyDataFrame() 86 | } 87 | s := g.df.subsetRows(group.Positions) 88 | return s 89 | } 90 | 91 | func newEmptyGrouping() Grouping { 92 | groups := make(map[string]*group) 93 | df := newEmptyDataFrame() 94 | return Grouping{df: df, groups: groups, err: true} 95 | } 96 | 97 | // GroupByIndex groups a DataFrame by one or more of its index levels. If no level is provided, all index levels are used. 
98 | func (df *DataFrame) GroupByIndex(levelPositions ...int) Grouping { 99 | if len(levelPositions) != 0 { 100 | df = df.Copy() 101 | err := df.Index.SubsetLevels(levelPositions) 102 | if err != nil { 103 | if options.GetLogWarnings() { 104 | log.Printf("df.GroupByIndex() %v\n", err) 105 | } 106 | return newEmptyGrouping() 107 | } 108 | } 109 | 110 | // Default: use all label level positions 111 | return df.groupby() 112 | } 113 | 114 | // GroupBy groups a DataFrame by one or more columns. 115 | // If no column is supplied or an invalid column is supplied, an empty grouping is returned. 116 | func (df *DataFrame) GroupBy(cols ...int) Grouping { 117 | if len(cols) == 0 { 118 | if options.GetLogWarnings() { 119 | log.Print("df.GroupBy(): empty cols, returning empty Grouping\n") 120 | } 121 | return newEmptyGrouping() 122 | } 123 | if len(cols) == df.NumCols() { 124 | if options.GetLogWarnings() { 125 | log.Print("df.GroupBy(): at least one column must be excluded from the Grouping\n") 126 | } 127 | return newEmptyGrouping() 128 | } 129 | if err := df.ensureColumnPositions(cols); err != nil { 130 | if options.GetLogWarnings() { 131 | log.Printf("df.GroupBy(): %v\n", err) 132 | } 133 | return newEmptyGrouping() 134 | } 135 | df = df.Copy() 136 | df.InPlace.replaceIndex(cols) 137 | 138 | return df.groupby() 139 | } 140 | 141 | func (ip InPlace) replaceIndex(cols []int) { 142 | lengthArchive := ip.df.IndexLevels() 143 | // set new levels 144 | ip.setIndexes(cols) 145 | 146 | // Drop old levels 147 | for j := len(cols); j < len(cols)+lengthArchive; j++ { 148 | // use lower-level method to change index in place and duck error because level is certain to be in index 149 | ip.df.index.DropLevel(j) 150 | } 151 | ip.df.index.UpdateNameMap() 152 | } 153 | 154 | func (df *DataFrame) groupby() Grouping { 155 | groups := make(map[string]*group) 156 | for i := 0; i < df.Len(); i++ { 157 | labels := df.Row(i).Labels 158 | var strLabels []string 159 | for _, label := range labels { 
160 | strLabels = append(strLabels, fmt.Sprint(label)) 161 | } 162 | groupLabel := strings.Join(strLabels, values.GetMultiColNameSeparator()) 163 | 164 | // create group with groupLabel and index labels if it is not already within groups map 165 | if _, ok := groups[groupLabel]; !ok { 166 | groups[groupLabel] = &group{FirstPosition: i} 167 | } 168 | groups[groupLabel].Positions = append(groups[groupLabel].Positions, i) 169 | } 170 | return Grouping{df: df, groups: groups} 171 | } 172 | 173 | // First returns the first occurrence of each grouping in the DataFrame. 174 | func (g Grouping) First() *DataFrame { 175 | first := func(group string) *DataFrame { 176 | position := g.groups[group].Positions[0] 177 | df := g.df.subsetRows([]int{position}) 178 | return df 179 | } 180 | ret := newEmptyDataFrame() 181 | for _, group := range g.Groups() { 182 | df := first(group) 183 | ret.InPlace.appendDataFrameRow(df) 184 | } 185 | return ret 186 | } 187 | 188 | // Last returns the last occurrence of each grouping in the DataFrame. 
189 | func (g Grouping) Last() *DataFrame { 190 | last := func(group string) *DataFrame { 191 | lastIdx := len(g.groups[group].Positions) - 1 192 | position := g.groups[group].Positions[lastIdx] 193 | df := g.df.subsetRows([]int{position}) 194 | return df 195 | } 196 | ret := newEmptyDataFrame() 197 | for _, group := range g.Groups() { 198 | df := last(group) 199 | ret.InPlace.appendDataFrameRow(df) 200 | } 201 | return ret 202 | } 203 | 204 | type calcReturn struct { 205 | df *DataFrame 206 | n int 207 | } 208 | 209 | func (g Grouping) asyncMath(fn func(*DataFrame) *series.Series) *DataFrame { 210 | var wg sync.WaitGroup 211 | // synchronous option 212 | if !options.GetAsync() { 213 | ret := newEmptyDataFrame() 214 | for _, group := range g.Groups() { 215 | calc := g.mathSingleGroup(group, fn) 216 | ret.InPlace.appendDataFrameRow(calc) 217 | } 218 | return ret 219 | } 220 | 221 | // asynchronous option 222 | ch := make(chan calcReturn, g.Len()) 223 | for i, group := range g.Groups() { 224 | wg.Add(1) 225 | go g.awaitMath(ch, i, group, fn, &wg) 226 | } 227 | wg.Wait() 228 | close(ch) 229 | var returnedData []calcReturn 230 | for result := range ch { 231 | returnedData = append(returnedData, result) 232 | } 233 | sort.Slice(returnedData, func(i, j int) bool { 234 | return returnedData[i].n < returnedData[j].n 235 | }) 236 | 237 | df := newEmptyDataFrame() 238 | for _, result := range returnedData { 239 | df.InPlace.appendDataFrameRow((result.df)) 240 | } 241 | df.index.NeedsRefresh = true 242 | return df 243 | } 244 | 245 | func (g Grouping) awaitMath( 246 | ch chan<- calcReturn, n int, group string, 247 | fn func(*DataFrame) *series.Series, wg *sync.WaitGroup, 248 | ) { 249 | df := g.mathSingleGroup(group, fn) 250 | ret := calcReturn{df: df, n: n} 251 | ch <- ret 252 | wg.Done() 253 | } 254 | 255 | func (g Grouping) mathSingleGroup(group string, fn func(*DataFrame) *series.Series) *DataFrame { 256 | positions := g.groups[group].Positions 257 | rows := 
g.df.subsetRows(positions) 258 | calc := fn(rows) 259 | calc.Rename(group) 260 | df := transposeSeries(calc) 261 | return df 262 | } 263 | 264 | // Sum for each group in the Grouping. 265 | func (g Grouping) Sum() *DataFrame { 266 | return g.asyncMath((*DataFrame).Sum) 267 | } 268 | 269 | // Mean for each group in the Grouping. 270 | func (g Grouping) Mean() *DataFrame { 271 | return g.asyncMath((*DataFrame).Mean) 272 | } 273 | 274 | // Min for each group in the Grouping. 275 | func (g Grouping) Min() *DataFrame { 276 | return g.asyncMath((*DataFrame).Min) 277 | } 278 | 279 | // Max for each group in the Grouping. 280 | func (g Grouping) Max() *DataFrame { 281 | return g.asyncMath((*DataFrame).Max) 282 | } 283 | 284 | // Median for each group in the Grouping. 285 | func (g Grouping) Median() *DataFrame { 286 | return g.asyncMath((*DataFrame).Median) 287 | } 288 | 289 | // Std for each group in the Grouping. 290 | func (g Grouping) Std() *DataFrame { 291 | return g.asyncMath((*DataFrame).Std) 292 | } 293 | -------------------------------------------------------------------------------- /series/constructor_test.go: -------------------------------------------------------------------------------- 1 | package series 2 | 3 | import ( 4 | "bytes" 5 | "log" 6 | "os" 7 | "reflect" 8 | "strings" 9 | "testing" 10 | "time" 11 | 12 | "github.com/ptiger10/pd/internal/index" 13 | "github.com/ptiger10/pd/internal/values" 14 | "github.com/ptiger10/pd/options" 15 | ) 16 | 17 | func TestNew_emptySeries(t *testing.T) { 18 | got := newEmptySeries() 19 | want := &Series{values: values.MustCreateValuesFromInterface(nil).Values, index: index.New(), datatype: options.None} 20 | if !Equal(got, want) { 21 | t.Errorf("New(nil) returned %#v, want %#v", got, want) 22 | } 23 | _ = got.Len() 24 | _ = got.NumLevels() 25 | _ = got.Name() 26 | } 27 | 28 | func TestNew_nilWithConfig_emptySeries(t *testing.T) { 29 | got, err := New(nil, Config{Index: "foo"}) 30 | if err != nil { 31 | t.Errorf("New(): 
%v", err) 32 | } 33 | want := newEmptySeries() 34 | if !Equal(got, want) { 35 | t.Errorf("New(nil) returned %#v, want %#v", got, want) 36 | } 37 | } 38 | 39 | func TestNew(t *testing.T) { 40 | testDate := time.Date(2019, 1, 1, 0, 0, 0, 0, time.UTC) 41 | type args struct { 42 | data interface{} 43 | } 44 | type want struct { 45 | values interface{} 46 | dtype options.DataType 47 | } 48 | tests := []struct { 49 | name string 50 | args args 51 | want want 52 | }{ 53 | {"all null", args{""}, want{"", options.String}}, 54 | {"float32", args{float32(1)}, want{1.0, options.Float64}}, 55 | {"float64", args{float64(1)}, want{1.0, options.Float64}}, 56 | {"int", args{int(1)}, want{1, options.Int64}}, 57 | {"int8", args{int8(1)}, want{1, options.Int64}}, 58 | {"int16", args{int16(1)}, want{1, options.Int64}}, 59 | {"int32", args{int32(1)}, want{1, options.Int64}}, 60 | {"int64", args{int64(1)}, want{1, options.Int64}}, 61 | {"string", args{"foo"}, want{"foo", options.String}}, 62 | {"bool", args{true}, want{true, options.Bool}}, 63 | {"datetime", args{testDate}, want{testDate, options.DateTime}}, 64 | 65 | {"float32_slice", args{[]float32{1}}, want{1.0, options.Float64}}, 66 | {"float64_slice", args{[]float64{1}}, want{1.0, options.Float64}}, 67 | {"int_slice", args{[]int{1}}, want{1, options.Int64}}, 68 | {"int8_slice", args{[]int8{1}}, want{1, options.Int64}}, 69 | {"int16_slice", args{[]int16{1}}, want{1, options.Int64}}, 70 | {"int32_slice", args{[]int32{1}}, want{1, options.Int64}}, 71 | {"int64_slice", args{[]int64{1}}, want{1, options.Int64}}, 72 | {"string_slice", args{[]string{"foo"}}, want{"foo", options.String}}, 73 | {"bool_slice", args{[]bool{true}}, want{true, options.Bool}}, 74 | {"datetime_slice", args{[]time.Time{testDate}}, want{testDate, options.DateTime}}, 75 | } 76 | for _, tt := range tests { 77 | t.Run(tt.name, func(t *testing.T) { 78 | got, err := New(tt.args.data) 79 | if err != nil { 80 | t.Errorf("New() error = %v, wantErr nil", err) 81 | } 82 | 
container := values.MustCreateValuesFromInterface(tt.want.values) 83 | wantValues := container.Values 84 | wantIdx := index.NewDefault(1) 85 | want := &Series{values: wantValues, index: wantIdx, datatype: tt.want.dtype} 86 | if !Equal(got, want) { 87 | t.Errorf("New() = %v, want %v", got, want) 88 | } 89 | }) 90 | } 91 | } 92 | 93 | func TestNew_conversion(t *testing.T) { 94 | got, err := New("3.5", Config{DataType: options.Float64}) 95 | if err != nil { 96 | t.Errorf("New(): %v", err) 97 | } 98 | values, _ := values.InterfaceFactory(3.5) 99 | index := index.NewDefault(1) 100 | want := &Series{values: values.Values, index: index, datatype: options.Float64} 101 | if !Equal(got, want) { 102 | t.Errorf("New(nil) returned %v, want %v", got, want) 103 | } 104 | } 105 | 106 | func TestNew_Fail(t *testing.T) { 107 | type args struct { 108 | data interface{} 109 | config Config 110 | } 111 | tests := []struct { 112 | name string 113 | args args 114 | }{ 115 | {"unsupported value", args{complex64(1), Config{}}}, 116 | {"unsupported single index", args{"foo", Config{Index: complex64(1)}}}, 117 | {"unsupported multiIndex", args{"foo", Config{MultiIndex: []interface{}{complex64(1)}}}}, 118 | {"unsupported conversion", args{"3.5", Config{DataType: options.Unsupported}}}, 119 | {"index-multiIndex ambiguity", args{"foo", Config{Index: "foo", MultiIndex: []interface{}{"bar"}}}}, 120 | {"values-index alignmentV1", args{"foo", Config{Index: []string{"foo", "bar"}}}}, 121 | {"values-index alignmentV2", args{[]string{"foo"}, Config{Index: []string{"foo", "bar"}}}}, 122 | {"values-index alignmentV3", args{[]string{"foo", "bar"}, Config{Index: "foo"}}}, 123 | {"values-index alignmentV4", args{[]string{"foo", "bar"}, Config{Index: []string{"foo"}}}}, 124 | {"values-multiIndex alignmentV1", args{"foo", Config{MultiIndex: []interface{}{[]string{"foo", "bar"}}}}}, 125 | {"values-multiIndex alignment2", args{[]string{"foo"}, Config{MultiIndex: []interface{}{[]string{"foo", "bar"}}}}}, 126 | 
{"values-multiIndex alignmentV3", args{[]string{"foo", "bar"}, Config{MultiIndex: []interface{}{"foo"}}}}, 127 | {"values-multiIndex alignmentV4", args{[]string{"foo", "bar"}, Config{MultiIndex: []interface{}{"foo"}}}}, 128 | {"values-multiIndex alignmentV5", args{[]string{"foo", "bar"}, Config{MultiIndex: []interface{}{"foo", "bar"}}}}, 129 | {"multiIndex alignment", args{[]string{"foo", "bar"}, Config{ 130 | MultiIndex: []interface{}{[]string{"foo", "bar"}, []string{"baz"}}}}}, 131 | {"multiIndex names", args{[]string{"foo", "bar"}, Config{ 132 | MultiIndex: []interface{}{[]string{"foo", "bar"}, []string{"baz", "qux"}}, 133 | MultiIndexNames: []string{"1"}, 134 | }}}, 135 | } 136 | for _, tt := range tests { 137 | t.Run(tt.name, func(t *testing.T) { 138 | _, err := New(tt.args.data, tt.args.config) 139 | if err == nil { 140 | t.Error("New() error = nil, want error") 141 | return 142 | } 143 | }) 144 | } 145 | } 146 | 147 | func TestNew_Fail_multipleConfigs(t *testing.T) { 148 | _, err := New("foo", Config{}, Config{}) 149 | if err == nil { 150 | t.Error("New() error = nil, want error due to multiple configs") 151 | } 152 | } 153 | 154 | func TestMustNew(t *testing.T) { 155 | v, _ := values.InterfaceFactory(1.0) 156 | tests := []struct { 157 | name string 158 | args interface{} 159 | want *Series 160 | }{ 161 | {name: "pass", args: 1.0, 162 | want: &Series{values: v.Values, index: index.NewDefault(1), datatype: options.Float64}}, 163 | {name: "fail", args: complex64(1), 164 | want: newEmptySeries()}, 165 | } 166 | for _, tt := range tests { 167 | t.Run(tt.name, func(t *testing.T) { 168 | var buf bytes.Buffer 169 | log.SetOutput(&buf) 170 | defer log.SetOutput(os.Stderr) 171 | 172 | got := MustNew(tt.args) 173 | if !Equal(got, tt.want) { 174 | t.Errorf("MustNew() = %v, want %v", got, tt.want) 175 | } 176 | if strings.Contains(tt.name, "fail") { 177 | if buf.String() == "" { 178 | t.Errorf("series.MustNew() returned no log message, want log due to fail") 179 | } 180 
| } 181 | }) 182 | } 183 | } 184 | func TestMustNew_fail(t *testing.T) { 185 | var buf bytes.Buffer 186 | log.SetOutput(&buf) 187 | defer log.SetOutput(os.Stderr) 188 | MustNew(complex64(1)) 189 | if buf.String() == "" { 190 | t.Errorf("MustNew() returned no log message, want log due to fail") 191 | } 192 | } 193 | 194 | func Test_Copy(t *testing.T) { 195 | tests := []struct { 196 | name string 197 | input *Series 198 | want *Series 199 | }{ 200 | {name: "pass", input: MustNew("foo"), want: MustNew("foo")}, 201 | } 202 | for _, tt := range tests { 203 | t.Run(tt.name, func(t *testing.T) { 204 | got := tt.input.Copy() 205 | if !Equal(got, tt.want) { 206 | t.Errorf("s.Copy() returned %v, want %v", got, tt.want) 207 | } 208 | if reflect.ValueOf(tt.input).Pointer() == reflect.ValueOf(tt.want).Pointer() { 209 | t.Errorf("s.Copy() retained reference to original, want copy") 210 | } 211 | if reflect.ValueOf(tt.input.values).Pointer() == reflect.ValueOf(tt.want.values).Pointer() { 212 | t.Errorf("s.Copy() retained reference to original values, want copy") 213 | } 214 | if reflect.ValueOf(tt.input.index.Levels).Pointer() == reflect.ValueOf(tt.want.index.Levels).Pointer() { 215 | t.Errorf("s.Copy() retained reference to original index, want copy") 216 | } 217 | }) 218 | } 219 | } 220 | 221 | func TestFromInternalComponents(t *testing.T) { 222 | vals := values.MustCreateValuesFromInterface("foo") 223 | index := index.NewDefault(1) 224 | got := FromInternalComponents(vals, index, "bar") 225 | want := MustNew("foo", Config{Name: "bar"}) 226 | if !Equal(got, want) { 227 | t.Errorf("FromInternalComponents() returned %v, want %v", got, want) 228 | } 229 | 230 | } 231 | 232 | func TestToInternalComponents(t *testing.T) { 233 | s := MustNew("foo") 234 | vals, idx := s.ToInternalComponents() 235 | wantVals := values.MustCreateValuesFromInterface("foo") 236 | wantIdx := index.NewDefault(1) 237 | if !reflect.DeepEqual(vals, wantVals) { 238 | t.Errorf("ToInternalComponents() returned 
%v, want %v", vals, wantVals) 239 | } 240 | if !reflect.DeepEqual(idx, wantIdx) { 241 | t.Errorf("ToInternalComponents() returned %v, want %v", idx, wantIdx) 242 | } 243 | 244 | } 245 | -------------------------------------------------------------------------------- /dataframe/describe_test.go: -------------------------------------------------------------------------------- 1 | package dataframe 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | 7 | "github.com/ptiger10/pd/options" 8 | "github.com/ptiger10/pd/series" 9 | ) 10 | 11 | func TestDataFrame_Describe(t *testing.T) { 12 | type want struct { 13 | len int 14 | numCols int 15 | numIdxLevels int 16 | numColLevels int 17 | dataType options.DataType 18 | dataTypePrinter string 19 | dataTypes *series.Series 20 | } 21 | tests := []struct { 22 | name string 23 | input *DataFrame 24 | want want 25 | }{ 26 | {name: "empty", 27 | input: newEmptyDataFrame(), 28 | want: want{ 29 | len: 0, numCols: 0, numIdxLevels: 0, numColLevels: 0, 30 | dataType: options.None, dataTypePrinter: "empty", dataTypes: series.MustNew(nil), 31 | }}, 32 | {"default index, col", 33 | MustNew([]interface{}{"foo"}), 34 | want{ 35 | len: 1, numCols: 1, numIdxLevels: 1, numColLevels: 1, 36 | dataType: options.String, dataTypePrinter: "string", dataTypes: series.MustNew("string", series.Config{Name: "datatypes"}), 37 | }}, 38 | {"multi index, single col", 39 | MustNew([]interface{}{"foo"}, Config{MultiIndex: []interface{}{"baz", "qux"}}), 40 | want{ 41 | len: 1, numCols: 1, numIdxLevels: 2, numColLevels: 1, 42 | dataType: options.String, dataTypePrinter: "string", dataTypes: series.MustNew("string", series.Config{Name: "datatypes"}), 43 | }}, 44 | {"single index, two cols, mixed types", 45 | MustNew([]interface{}{"foo", 5}, Config{Col: []string{"baz", "qux"}}), 46 | want{ 47 | len: 1, numCols: 2, numIdxLevels: 1, numColLevels: 1, 48 | dataType: options.Unsupported, dataTypePrinter: "mixed", dataTypes: series.MustNew([]string{"string", "int64"}, 
series.Config{Name: "datatypes"}), 49 | }}, 50 | {"single index, multi col", 51 | MustNew([]interface{}{"foo", "bar"}, Config{MultiCol: [][]string{{"baz", "qux"}, {"corge", "fred"}}}), 52 | want{ 53 | len: 1, numCols: 2, numIdxLevels: 1, numColLevels: 2, 54 | dataType: options.String, dataTypePrinter: "string", dataTypes: series.MustNew([]string{"string", "string"}, series.Config{Name: "datatypes"}), 55 | }}, 56 | } 57 | for _, tt := range tests { 58 | t.Run(tt.name, func(t *testing.T) { 59 | df := tt.input.Copy() 60 | gotLen := df.Len() 61 | if gotLen != tt.want.len { 62 | t.Errorf("df.Len(): got %v, want %v", gotLen, tt.want.len) 63 | } 64 | gotNumCols := df.NumCols() 65 | if gotNumCols != tt.want.numCols { 66 | t.Errorf("df.NumCols(): got %v, want %v", gotNumCols, tt.want.numCols) 67 | } 68 | gotNumIdxLevels := df.IndexLevels() 69 | if gotNumIdxLevels != tt.want.numIdxLevels { 70 | t.Errorf("df.IndexLevels(): got %v, want %v", gotNumIdxLevels, tt.want.numIdxLevels) 71 | } 72 | gotNumColLevels := df.ColLevels() 73 | if gotNumColLevels != tt.want.numColLevels { 74 | t.Errorf("df.ColLevels(): got %v, want %v", gotNumColLevels, tt.want.numColLevels) 75 | } 76 | gotDataType := df.dataType() 77 | if gotDataType != tt.want.dataType { 78 | t.Errorf("df.gotDataType: got %v, want %v", gotDataType, tt.want.dataType) 79 | } 80 | gotDataTypePrinter := df.dataTypePrinter() 81 | if gotDataTypePrinter != tt.want.dataTypePrinter { 82 | t.Errorf("df.dataTypePrinter: got %v, want %v", gotDataTypePrinter, tt.want.dataTypePrinter) 83 | } 84 | gotDataTypes := df.DataTypes() 85 | if !series.Equal(gotDataTypes, tt.want.dataTypes) { 86 | t.Errorf("df.DataTypes(): got %v, want %v", gotDataTypes, tt.want.dataTypes) 87 | } 88 | }) 89 | } 90 | } 91 | 92 | func TestEqual(t *testing.T) { 93 | df := MustNew([]interface{}{[]string{"foo"}, []string{"bar"}}, Config{Index: "corge", Col: []string{"baz", "qux"}}) 94 | type args struct { 95 | df2 *DataFrame 96 | } 97 | tests := []struct { 98 | name 
string 99 | input *DataFrame 100 | args args 101 | want bool 102 | }{ 103 | {name: "equal", input: df, 104 | args: args{df2: MustNew([]interface{}{[]string{"foo"}, []string{"bar"}}, Config{Index: "corge", Col: []string{"baz", "qux"}})}, 105 | want: true}, 106 | {"equal empty", newEmptyDataFrame(), args{newEmptyDataFrame()}, true}, 107 | {"equal empty copy", newEmptyDataFrame().Copy(), args{newEmptyDataFrame()}, true}, 108 | {"not equal: values", df, 109 | args{MustNew([]interface{}{[]string{"foo"}, []string{"bar"}})}, false}, 110 | {"not equal: cols", df, 111 | args{MustNew([]interface{}{[]string{"foo"}, []string{"bar"}}, Config{Index: "corge", Col: []string{"fred", "qux"}})}, false}, 112 | {"not equal: name", df, 113 | args{MustNew([]interface{}{[]string{"foo"}, []string{"bar"}}, Config{Index: "corge", Col: []string{"baz", "qux"}, Name: "quux"})}, false}, 114 | } 115 | for _, tt := range tests { 116 | t.Run(tt.name, func(t *testing.T) { 117 | got := Equal(tt.input, tt.args.df2) 118 | if got != tt.want { 119 | t.Errorf("Equal() got %v, want %v", got, tt.want) 120 | } 121 | }) 122 | } 123 | } 124 | 125 | func TestMaxColWidth(t *testing.T) { 126 | type want struct { 127 | colWidths []int 128 | exclusionsTable [][]bool 129 | } 130 | tests := []struct { 131 | name string 132 | input *DataFrame 133 | want want 134 | }{ 135 | {name: "empty config", input: MustNew([]interface{}{[]string{"a", "foo"}, []string{"b", "quux"}}), 136 | want: want{colWidths: []int{3, 4}, exclusionsTable: [][]bool{{false, false}}}}, 137 | {"single level", 138 | MustNew([]interface{}{[]string{"a", "foo"}, []string{"b", "quux"}}, 139 | Config{Col: []string{"corge", "bar"}, ColName: "grapply"}), 140 | want{[]int{5, 4}, [][]bool{{false, false}}}}, 141 | {"multi level", 142 | MustNew([]interface{}{[]string{"a", "foo"}, []string{"b", "quux"}}, 143 | Config{MultiCol: [][]string{{"corge", "bar"}, {"qux", "quuz"}}, MultiColNames: []string{"grapply", "grault"}}), 144 | want{[]int{5, 4}, [][]bool{{false, 
false}, {false, false}}}}, 145 | {"nil: empty colWidths", newEmptyDataFrame(), want{nil, [][]bool{}}}, 146 | } 147 | 148 | for _, tt := range tests { 149 | t.Run(tt.name, func(t *testing.T) { 150 | df := tt.input 151 | excl := df.makeColumnExclusionsTable() 152 | got := df.maxColWidths(excl) 153 | if !reflect.DeepEqual(excl, tt.want.exclusionsTable) { 154 | t.Errorf("df.makeColumnExclusionsTable() got %v, want %v", excl, tt.want.exclusionsTable) 155 | } 156 | if !reflect.DeepEqual(got, tt.want.colWidths) { 157 | t.Errorf("df.maxColWidths() got %v, want %v", got, tt.want.colWidths) 158 | } 159 | }) 160 | } 161 | } 162 | 163 | func TestMaxColWidthExcludeRepeat(t *testing.T) { 164 | df := MustNew( 165 | []interface{}{[]string{"a", "b"}, []string{"c", "quux"}}, 166 | Config{MultiCol: [][]string{{"waldo", "waldo"}, {"d", "e"}}}) 167 | excl := [][]bool{{false, true}, {false, false}} 168 | got := df.maxColWidths(excl) 169 | want := []int{5, 4} 170 | if !reflect.DeepEqual(got, want) { 171 | t.Errorf("df.maxColWidths() got %v, want %v", got, want) 172 | } 173 | } 174 | 175 | func TestHeadTail(t *testing.T) { 176 | df := MustNew([]interface{}{[]string{"foo", "bar", "baz", "qux"}}, Config{Index: []int{0, 1, 2, 3}}) 177 | type args struct { 178 | n int 179 | } 180 | tests := []struct { 181 | name string 182 | input *DataFrame 183 | fn func(*DataFrame, int) *DataFrame 184 | args args 185 | want *DataFrame 186 | }{ 187 | {name: "head", input: df, fn: (*DataFrame).Head, args: args{n: 2}, 188 | want: MustNew([]interface{}{[]string{"foo", "bar"}}, Config{Index: []int{0, 1}})}, 189 | {name: "head - max", input: df, fn: (*DataFrame).Head, args: args{n: 10}, 190 | want: df}, 191 | {name: "tail", input: df, fn: (*DataFrame).Tail, args: args{n: 2}, 192 | want: MustNew([]interface{}{[]string{"baz", "qux"}}, Config{Index: []int{2, 3}})}, 193 | {name: "tail - max", input: df, fn: (*DataFrame).Tail, args: args{n: 10}, 194 | want: df}, 195 | } 196 | for _, tt := range tests { 197 | 
t.Run(tt.name, func(t *testing.T) { 198 | got := tt.fn(tt.input, tt.args.n) 199 | if !Equal(got, tt.want) { 200 | t.Errorf("df.Head/Tail() got %v, want %v", got, tt.want) 201 | } 202 | }) 203 | } 204 | } 205 | 206 | func TestDataFrame_Export(t *testing.T) { 207 | tests := []struct { 208 | name string 209 | input *DataFrame 210 | want [][]interface{} 211 | }{ 212 | {name: "pass", input: MustNew([]interface{}{"foo"}, Config{Index: "bar", Col: []string{"baz"}}), 213 | want: [][]interface{}{{nil, "baz"}, {"bar", "foo"}}}, 214 | } 215 | for _, tt := range tests { 216 | t.Run(tt.name, func(t *testing.T) { 217 | got := tt.input.Export() 218 | if !reflect.DeepEqual(got, tt.want) { 219 | t.Errorf("df.Export() got %v, want %v", got, tt.want) 220 | } 221 | }) 222 | } 223 | } 224 | 225 | func TestDataFrame_ExportToCSV(t *testing.T) { 226 | type args struct { 227 | filepath string 228 | } 229 | tests := []struct { 230 | name string 231 | input *DataFrame 232 | args args 233 | want bool 234 | }{ 235 | {name: "pass", input: MustNew([]interface{}{"foo"}, Config{Index: "bar", Col: []string{"baz"}}), 236 | want: false}, 237 | } 238 | for _, tt := range tests { 239 | t.Run(tt.name, func(t *testing.T) { 240 | tt.input.ExportToCSV("output_test.csv") 241 | //TODO: move ReadCSV to dataframe package to rehydrate output and compare to input 242 | }) 243 | } 244 | } 245 | --------------------------------------------------------------------------------