├── .circleci └── config.yml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── dataframe ├── constructors.go ├── dataframe.go ├── dataframe_test.go ├── doc.go ├── element.go ├── element_numeric.gen.go ├── element_numeric.gen.go.tmpl ├── example_test.go ├── mutations.go ├── smartbuilder.go ├── smartbuilder_test.go ├── stepvalue.go └── tablefacade.go ├── datatype_numeric.gen.go.tmpldata ├── doc.go ├── go.mod ├── go.sum ├── internal ├── cast │ ├── dense.go │ ├── doc.go │ └── sparse.go ├── constructors │ ├── doc.go │ └── interface.go └── debug │ ├── assert_disabled.go │ ├── assert_enabled.go │ ├── debug_disabled.go │ ├── debug_enabled.go │ ├── doc.go │ ├── warn_disabled.go │ └── warn_enabled.go ├── iterator ├── booleaniterator.go ├── chunkiterator.gen.go ├── chunkiterator.gen.go.tmpl ├── chunkiterator.go ├── chunkiterator_test.go ├── doc.go ├── stepiterator.go ├── stepiterator_test.go ├── stringiterator.go ├── valueiterator.gen.go ├── valueiterator.gen.go.tmpl ├── valueiterator.go └── valueiterator_test.go ├── numeric.tmpldata └── tools.go /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Golang CircleCI 2.0 configuration file 2 | # 3 | # Check https://circleci.com/docs/2.0/language-go/ for more details 4 | version: 2 5 | jobs: 6 | build: 7 | docker: 8 | # specify the version 9 | - image: circleci/golang:1.12 10 | steps: 11 | - checkout 12 | 13 | - run: make ci 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | tmp/ 2 | *.test 3 | bin/ 4 | vendor/ 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 
6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2019 Nick Poorman 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | GO_BUILD=go build 2 | GO_TEST?=go test 3 | GO_MOD=go mod 4 | 5 | GO_SOURCES := $(shell find . -path -prune -o -name '*.go' -not -name '*_test.go') 6 | SOURCES_NO_VENDOR := $(shell find . -path ./vendor -prune -o -name "*.go" -not -name '*_test.go' -print) 7 | GO_TEMPLATES := $(shell find . -path ./vendor -prune -o -name "*.tmpl" -print) 8 | GO_COMPILED_TEMPLATES = $(patsubst %.gen.go.tmpl,%.gen.go,$(GO_TEMPLATES)) 9 | 10 | default: build 11 | 12 | build: vendor go-templates 13 | 14 | clean: 15 | find . -type f -name '*.gen.go' -exec rm {} + 16 | rm -rf bin/ 17 | rm -rf vendor/ 18 | 19 | test: $(GO_SOURCES) 20 | $(GO_TEST) $(GO_TEST_ARGS) ./... 21 | 22 | ci: test-debug-assert 23 | 24 | test-debug-assert: $(GO_SOURCES) 25 | $(GO_TEST) $(GO_TEST_ARGS) -tags='debug assert' ./... 26 | 27 | bench: $(GO_SOURCES) 28 | $(GO_TEST) $(GO_TEST_ARGS) -bench=. -run=- ./... 
29 | 30 | go-templates: bin/tmpl $(GO_COMPILED_TEMPLATES) 31 | 32 | %.gen.go: %.gen.go.tmpl 33 | bin/tmpl -i -data=numeric.tmpldata $< 34 | 35 | fmt: $(SOURCES_NO_VENDOR) 36 | goimports -w $^ 37 | 38 | bin/tmpl: ./vendor/github.com/apache/arrow/go/arrow/_tools/tmpl/main.go 39 | $(GO_BUILD) -o $@ "./$( 23 | 24 | ## Installation 25 | 26 | Add the package to your `go.mod` file: 27 | 28 | require github.com/go-bullseye/bullseye master 29 | 30 | Or, clone the repository: 31 | 32 | git clone --branch master https://github.com/go-bullseye/bullseye.git $GOPATH/src/github.com/go-bullseye/bullseye 33 | 34 | A complete example: 35 | 36 | ```bash 37 | mkdir my-dataframe-app && cd my-dataframe-app 38 | 39 | cat > go.mod <<-END 40 | module my-dataframe-app 41 | 42 | require github.com/go-bullseye/bullseye master 43 | END 44 | 45 | cat > main.go <<-END 46 | package main 47 | 48 | import ( 49 | "fmt" 50 | 51 | "github.com/apache/arrow/go/arrow/memory" 52 | "github.com/go-bullseye/bullseye/dataframe" 53 | ) 54 | 55 | func main() { 56 | pool := memory.NewGoAllocator() 57 | df, _ := dataframe.NewDataFrameFromMem(pool, dataframe.Dict{ 58 | "col1": []int32{1, 2, 3, 4, 5}, 59 | "col2": []float64{1.1, 2.2, 3.3, 4.4, 5}, 60 | "col3": []string{"foo", "bar", "ping", "", "pong"}, 61 | "col4": []interface{}{2, 4, 6, nil, 8}, 62 | }) 63 | defer df.Release() 64 | fmt.Printf("DataFrame:\n%s\n", df.Display(0)) 65 | } 66 | 67 | // DataFrame: 68 | // rec[0]["col1"]: [1 2 3 4 5] 69 | // rec[0]["col2"]: [1.1 2.2 3.3 4.4 5] 70 | // rec[0]["col3"]: ["foo" "bar" "ping" "" "pong"] 71 | // rec[0]["col4"]: [2 4 6 (null) 8] 72 | END 73 | 74 | go run main.go 75 | ``` 76 | 77 | 78 | 79 | ## Usage 80 | 81 | See the [DataFrame tests](dataframe/dataframe_test.go) for extensive usage examples. 82 | 83 | ## Reference Counting 84 | 85 | From the [arrow/go README](https://github.com/apache/arrow/blob/master/go/README.md)... 
86 | 87 | > The library makes use of reference counting so that it can track when memory 88 | > buffers are no longer used. This allows Arrow to update resource accounting, 89 | > pool memory such and track overall memory usage as objects are created and 90 | > released. Types expose two methods to deal with this pattern. The `Retain` 91 | > method will increase the reference count by 1 and `Release` method will reduce 92 | > the count by 1. Once the reference count of an object is zero, any associated 93 | > object will be freed. `Retain` and `Release` are safe to call from multiple 94 | > goroutines. 95 | 96 | ### When to call `Retain` / `Release`? 97 | 98 | - If you are passed an object and wish to take ownership of it, you must call 99 | `Retain`. You must later pair this with a call to `Release` when you no 100 | longer need the object. "Taking ownership" typically means you wish to 101 | access the object outside the scope of the current function call. 102 | 103 | - You own any object you create via functions whose name begins with `New` or 104 | `Copy` or **any operation that results in a new immutable DataFrame being returned** 105 | or when receiving an object over a channel. Therefore you must call 106 | `Release` once you no longer need the object. 107 | 108 | - If you send an object over a channel, you must call `Retain` before sending 109 | it as the receiver is assumed to own the object and will later call `Release` 110 | when it no longer needs the object. 111 | 112 | _Note:_ You can write a test using `memory.NewCheckedAllocator` to assert that you have 113 | released all resources properly. See: [tests](https://github.com/go-bullseye/bullseye/blob/e0958263a91ec914aa4cd0a1b26e43aab29b4c74/dataframe/dataframe_test.go#L234) 114 | 115 | ## TODO 116 | 117 | This DataFrame currently implements most of the scalar types we've come across. 118 | There is still work to be done on some of the list and struct types. 
Feel free 119 | to submit a PR if you find you need them. This library will let you know when you do. 120 | 121 | - [ ] Implement all Arrow DataTypes. 122 | - [ ] Add a filter function to DataFrame. 123 | - [ ] Add an order by function to DataFrame. 124 | 125 | ## License 126 | 127 | (c) 2019 Nick Poorman. Licensed under the Apache License, Version 2.0. 128 | -------------------------------------------------------------------------------- /dataframe/constructors.go: -------------------------------------------------------------------------------- 1 | package dataframe 2 | 3 | import ( 4 | "github.com/apache/arrow/go/arrow/array" 5 | "github.com/apache/arrow/go/arrow/memory" 6 | "github.com/go-bullseye/bullseye/internal/cast" 7 | "github.com/go-bullseye/bullseye/internal/constructors" 8 | ) 9 | 10 | // NewColumnFromMem is a helper for creating a new Column from memory. 11 | func NewColumnFromMem(mem memory.Allocator, name string, values interface{}) (*array.Column, error) { 12 | arr, field, err := constructors.NewInterfaceFromMem(mem, name, values, nil) 13 | if err != nil { 14 | return nil, err 15 | } 16 | defer arr.Release() 17 | 18 | // create the chunk from the data 19 | chunk := array.NewChunked(arr.DataType(), []array.Interface{arr}) 20 | defer chunk.Release() 21 | 22 | // create the column from the schema and chunk 23 | col := array.NewColumn(*field, chunk) 24 | 25 | return col, nil 26 | } 27 | 28 | // NewColumnFromSparseMem is a helper for creating a new Column from sparse memory. 
29 | func NewColumnFromSparseMem(mem memory.Allocator, name string, values []interface{}, valueIndexes []int, size int) (*array.Column, error) { 30 | // build valid mask 31 | valid := make([]bool, size) 32 | for _, idx := range valueIndexes { 33 | valid[idx] = true 34 | } 35 | 36 | ifaceDense, err := cast.SparseCollectionToInterface(values, valueIndexes, size) 37 | if err != nil { 38 | return nil, err 39 | } 40 | 41 | arr, field, err := constructors.NewInterfaceFromMem(mem, name, ifaceDense, valid) 42 | if err != nil { 43 | return nil, err 44 | } 45 | defer arr.Release() 46 | 47 | // create the chunk from the data 48 | chunk := array.NewChunked(arr.DataType(), []array.Interface{arr}) 49 | defer chunk.Release() 50 | 51 | // create the column from the schema and chunk 52 | col := array.NewColumn(*field, chunk) 53 | 54 | return col, nil 55 | } 56 | -------------------------------------------------------------------------------- /dataframe/dataframe.go: -------------------------------------------------------------------------------- 1 | package dataframe 2 | 3 | import ( 4 | "fmt" 5 | "sort" 6 | "strings" 7 | "sync/atomic" 8 | 9 | "github.com/apache/arrow/go/arrow" 10 | "github.com/apache/arrow/go/arrow/array" 11 | "github.com/apache/arrow/go/arrow/memory" 12 | "github.com/go-bullseye/bullseye/internal/constructors" 13 | "github.com/go-bullseye/bullseye/internal/debug" 14 | "github.com/go-bullseye/bullseye/iterator" 15 | "github.com/pkg/errors" 16 | ) 17 | 18 | // Dict is a map of string to array of data. 19 | type Dict map[string]interface{} 20 | 21 | // Option is an option that may be passed to a function. 22 | type Option func(interface{}) error 23 | 24 | // NewDataFrame creates a new data frame from the provided schema and arrays. 
25 | func NewDataFrame(mem memory.Allocator, schema *arrow.Schema, arrs []array.Interface) (*DataFrame, error) { 26 | df := &DataFrame{ 27 | refs: 1, 28 | mem: mem, 29 | schema: schema, 30 | rows: -1, 31 | mutator: NewMutator(mem), 32 | } 33 | 34 | if df.rows < 0 { 35 | switch len(arrs) { 36 | case 0: 37 | df.rows = 0 38 | default: 39 | df.rows = int64(arrs[0].Len()) 40 | } 41 | } 42 | 43 | if df.schema == nil { 44 | return nil, errors.Errorf("dataframe: nil schema") 45 | } 46 | 47 | if len(df.schema.Fields()) != len(arrs) { 48 | return nil, errors.Errorf("dataframe: inconsistent schema/arrays") 49 | } 50 | 51 | for i, arr := range arrs { 52 | ft := df.schema.Field(i) 53 | if arr.DataType() != ft.Type { 54 | return nil, errors.Errorf("dataframe: column %q is inconsitent with schema", ft.Name) 55 | } 56 | 57 | if int64(arr.Len()) < df.rows { 58 | return nil, errors.Errorf("dataframe: column %q expected length >= %d but got length %d", ft.Name, df.rows, arr.Len()) 59 | } 60 | } 61 | 62 | df.cols = make([]array.Column, len(arrs)) 63 | for i := range arrs { 64 | func(i int) { 65 | chunk := array.NewChunked(arrs[i].DataType(), []array.Interface{arrs[i]}) 66 | defer chunk.Release() 67 | 68 | col := array.NewColumn(df.schema.Field(i), chunk) 69 | df.cols[i] = *col 70 | }(i) 71 | } 72 | 73 | return df, nil 74 | } 75 | 76 | // NewDataFrameFromColumns returns a DataFrame interface. 77 | func NewDataFrameFromColumns(mem memory.Allocator, cols []array.Column) (*DataFrame, error) { 78 | var rows int64 79 | if len(cols) > 0 { 80 | rows = columnLen(cols[0]) 81 | } 82 | 83 | return NewDataFrameFromShape(mem, cols, rows) 84 | } 85 | 86 | // NewDataFrameFromMem creates a new data frame from the provided in-memory data. 
87 | func NewDataFrameFromMem(mem memory.Allocator, dict Dict) (*DataFrame, error) { 88 | var ( 89 | err error 90 | arrs = make([]array.Interface, 0, len(dict)) 91 | fields = make([]arrow.Field, 0, len(dict)) 92 | ) 93 | 94 | keys := make([]string, 0, len(dict)) 95 | for k := range dict { 96 | keys = append(keys, k) 97 | } 98 | sort.Strings(keys) 99 | for _, k := range keys { 100 | v := dict[k] 101 | arr, field, newInterfaceErr := constructors.NewInterfaceFromMem(mem, k, v, nil) 102 | if newInterfaceErr != nil { 103 | err = newInterfaceErr 104 | break 105 | } 106 | arrs = append(arrs, arr) 107 | fields = append(fields, *field) 108 | } 109 | 110 | defer func() { 111 | for i := range arrs { 112 | arrs[i].Release() 113 | } 114 | }() 115 | 116 | if err != nil { 117 | return nil, err 118 | } 119 | 120 | schema := arrow.NewSchema(fields, nil) 121 | return NewDataFrame(mem, schema, arrs) 122 | } 123 | 124 | // NewDataFrameFromShape is the same as NewDataFrameFromColumns only it allows you to specify the number 125 | // of rows in the DataFrame. 126 | func NewDataFrameFromShape(mem memory.Allocator, cols []array.Column, rows int64) (*DataFrame, error) { 127 | df := &DataFrame{ 128 | refs: 1, 129 | mem: mem, 130 | schema: buildSchema(cols), 131 | cols: cols, 132 | rows: rows, 133 | mutator: NewMutator(mem), 134 | } 135 | 136 | // validate the data frame and its constituents. 137 | // note we retain the columns after having validated the data frame 138 | // in case the validation fails and panics (and would otherwise leak 139 | // a ref-count on the columns.) 
140 | 	if err := df.validate(); err != nil {
141 | 		return nil, err
142 | 	}
143 | 
144 | 	for i := range df.cols {
145 | 		df.cols[i].Retain()
146 | 	}
147 | 
148 | 	return df, nil
149 | }
150 | // NewDataFrameFromTable creates a new DataFrame from the provided Arrow table.
151 | func NewDataFrameFromTable(mem memory.Allocator, table array.Table) (*DataFrame, error) {
152 | 	cols := make([]array.Column, table.NumCols())
153 | 	for i := range cols {
154 | 		col := table.Column(i)
155 | 		cols[i] = *col // shallow copy of the column value; the underlying chunked data is shared
156 | 	}
157 | 
158 | 	return NewDataFrameFromShape(mem, cols, table.NumRows())
159 | }
160 | 
161 | // DataFrame is an immutable DataFrame that uses Arrow
162 | // to store its data in a standard columnar format.
163 | type DataFrame struct {
164 | 	refs   int64 // reference count
165 | 	mem    memory.Allocator
166 | 	schema *arrow.Schema
167 | 
168 | 	cols []array.Column
169 | 	rows int64
170 | 
171 | 	// Mutations that can be performed on this DataFrame
172 | 	// require the Mutator to be set up.
173 | 	mutator *Mutator
174 | }
175 | 
176 | // Allocator returns the memory allocator for this DataFrame.
177 | func (df *DataFrame) Allocator() memory.Allocator {
178 | 	return df.mem
179 | }
180 | 
181 | // Column returns the column matching the given name.
182 | func (df *DataFrame) Column(name string) *array.Column {
183 | 	for i, col := range df.cols {
184 | 		if col.Name() == name {
185 | 			return &df.cols[i]
186 | 		}
187 | 	}
188 | 	return nil
189 | }
190 | 
191 | // ColumnAt returns the i-th column of this Frame.
192 | func (df *DataFrame) ColumnAt(i int) *array.Column {
193 | 	return &df.cols[i]
194 | }
195 | 
196 | // Columns is the slice of Columns that make up this DataFrame.
197 | func (df *DataFrame) Columns() []array.Column {
198 | 	return df.cols
199 | }
200 | 
201 | // ColumnNames returns the column names that make up this DataFrame.
202 | func (df *DataFrame) ColumnNames() []string { 203 | fields := df.schema.Fields() 204 | names := make([]string, len(fields)) 205 | for i, field := range fields { 206 | names[i] = field.Name 207 | } 208 | return names 209 | } 210 | 211 | // ColumnTypes is the slice of column types that make up this DataFrame. 212 | func (df *DataFrame) ColumnTypes() []arrow.Field { 213 | return df.schema.Fields() 214 | } 215 | 216 | // Equals checks for equality between this DataFrame and DataFrame d. 217 | // nil elements at the same location are considered equal. 218 | func (df *DataFrame) Equals(d *DataFrame) bool { 219 | if !df.schema.Equal(d.schema) { 220 | return false 221 | } 222 | 223 | // compare the columns 224 | leftCols := df.Columns() 225 | rightCols := d.Columns() 226 | 227 | if len(leftCols) != len(rightCols) { 228 | return false 229 | } 230 | 231 | for i := range leftCols { 232 | leftCol := leftCols[i] 233 | rightCol := rightCols[i] 234 | 235 | // Could do this with a column iterator? 236 | same := compareColumns(&leftCol, &rightCol) 237 | if !same { 238 | return false 239 | } 240 | } 241 | 242 | return true 243 | } 244 | 245 | // NumCols returns the number of columns of this DataFrame using Go's len(). 246 | func (df *DataFrame) NumCols() int { 247 | return len(df.cols) 248 | } 249 | 250 | // NumRows returns the number of rows of this DataFrame. 251 | func (df *DataFrame) NumRows() int64 { 252 | return df.rows 253 | } 254 | 255 | // Name returns the name of the i-th column of this DataFrame. 256 | func (df *DataFrame) Name(i int) string { 257 | return df.schema.Field(i).Name 258 | } 259 | 260 | // Dims retrieves the dimensions of a DataFrame. 261 | func (df *DataFrame) Dims() (int, int64) { 262 | return len(df.cols), df.rows 263 | } 264 | 265 | // Display builds out a string representation of the DataFrame that is useful for debugging. 266 | // if chunkSize is <= 0, the biggest possible chunk will be selected. 
267 | func (df *DataFrame) Display(chunkSize int64) string { 268 | tr := array.NewTableReader(NewTableFacade(df), chunkSize) 269 | defer tr.Release() 270 | 271 | n := 0 272 | var output strings.Builder 273 | for tr.Next() { 274 | rec := tr.Record() 275 | for i, col := range rec.Columns() { 276 | fmt.Fprintf(&output, "rec[%d][%q]: %v\n", n, rec.ColumnName(i), col) 277 | } 278 | n++ 279 | } 280 | 281 | return output.String() 282 | } 283 | 284 | /** 285 | * These are column specific helpers 286 | */ 287 | 288 | // SelectColumns returns only columns matching names. 289 | func (df *DataFrame) SelectColumns(names ...string) []array.Column { 290 | if len(names) == 0 { 291 | return []array.Column{} 292 | } 293 | 294 | set := make(map[string]struct{}, len(names)) 295 | for _, name := range names { 296 | set[name] = struct{}{} 297 | } 298 | 299 | cols := make([]array.Column, 0, len(names)) 300 | 301 | dfColumns := df.Columns() 302 | for i := range dfColumns { 303 | if _, ok := set[dfColumns[i].Name()]; !ok { 304 | continue 305 | } 306 | cols = append(cols, dfColumns[i]) 307 | } 308 | 309 | return cols[:len(cols):len(cols)] 310 | } 311 | 312 | // RejectColumns returns only columns not matching names. 313 | func (df *DataFrame) RejectColumns(names ...string) []array.Column { 314 | if len(names) == 0 { 315 | return df.Columns() 316 | } 317 | 318 | set := make(map[string]struct{}, len(names)) 319 | for _, name := range names { 320 | set[name] = struct{}{} 321 | } 322 | 323 | cols := make([]array.Column, 0, df.NumCols()-len(names)) 324 | 325 | dfColumns := df.Columns() 326 | for i := range dfColumns { 327 | if _, drop := set[dfColumns[i].Name()]; drop { 328 | continue 329 | } 330 | cols = append(cols, dfColumns[i]) 331 | } 332 | 333 | return cols[:len(cols):len(cols)] 334 | } 335 | 336 | // Apply takes a series of MutationFunc and calls them with the existing DataFrame on the left. 
337 | func (df *DataFrame) Apply(fns ...MutationFunc) (*DataFrame, error) {
338 | 	left, err := df.Copy()
339 | 	if err != nil {
340 | 		return nil, err
341 | 	}
342 | 	if len(fns) == 0 {
343 | 		return left, err
344 | 	}
345 | 	for i := range fns {
346 | 		left, err = func() (*DataFrame, error) {
347 | 			defer left.Release() // release the intermediate frame once this mutation has produced its result
348 | 			return fns[i](left)
349 | 		}()
350 | 		if err != nil {
351 | 			return nil, err
352 | 		}
353 | 	}
354 | 	return left, err
355 | }
356 | 
357 | // ApplyToColumnFunc is a function type that will be called for each element
358 | // that is iterated over in a column. The return value will be appended to the new column being built.
359 | type ApplyToColumnFunc func(v interface{}) (interface{}, error)
360 | 
361 | // ApplyToColumn creates a new DataFrame with the new column appended. The new column is built
362 | // with the response values obtained from ApplyToColumnFunc. An error response value from
363 | // ApplyToColumnFunc will cause ApplyToColumn to return immediately.
364 | func (df *DataFrame) ApplyToColumn(columnName, newColumnName string, fn ApplyToColumnFunc) (*DataFrame, error) {
365 | 	return df.Apply(func(df *DataFrame) (*DataFrame, error) {
366 | 		// TODO(nickpoorman): refactor this
367 | 		col := df.Column(columnName)
368 | 		field := col.Field()
369 | 		field.Name = newColumnName
370 | 		schema := arrow.NewSchema([]arrow.Field{field}, nil)
371 | 		builder := array.NewRecordBuilder(df.Allocator(), schema)
372 | 		defer builder.Release()
373 | 		smartBuilder := NewSmartBuilder(builder, schema)
374 | 		valueIterator := iterator.NewValueIterator(col)
375 | 		defer valueIterator.Release()
376 | 		for valueIterator.Next() {
377 | 			value := valueIterator.ValueInterface()
378 | 			res, err := fn(value)
379 | 			if err != nil {
380 | 				return nil, err
381 | 			}
382 | 			smartBuilder.Append(0, res)
383 | 		}
384 | 		rec := builder.NewRecord()
385 | 		defer rec.Release()
386 | 		chunk := array.NewChunked(col.DataType(), rec.Columns())
387 | 		defer chunk.Release()
388 | 		newCol := array.NewColumn(field, chunk)
389 | 		defer newCol.Release()
390 | return df.AppendColumn(newCol) 391 | }) 392 | } 393 | 394 | /** 395 | * The following functions will always return a new DataFrame. 396 | */ 397 | 398 | // AppendColumn builds a new DataFrame with the provided Column included. 399 | func (df *DataFrame) AppendColumn(c *array.Column) (*DataFrame, error) { 400 | nCols := len(df.cols) 401 | cols := make([]array.Column, nCols+1) 402 | copy(cols, df.cols) 403 | cols[nCols] = *c 404 | return NewDataFrameFromShape(df.mem, cols, df.rows) 405 | } 406 | 407 | // Copy returns a copy of this dataframe. The underlying byte buffers will not be copied. 408 | func (df *DataFrame) Copy() (*DataFrame, error) { 409 | nCols := len(df.cols) 410 | cols := make([]array.Column, nCols) 411 | copy(cols, df.cols) 412 | return NewDataFrameFromShape(df.mem, cols, df.rows) 413 | } 414 | 415 | // CrossJoin returns a DataFrame containing the cross join of two DataFrames. 416 | func (df *DataFrame) CrossJoin(right *DataFrame, opts ...Option) (*DataFrame, error) { 417 | fn := df.mutator.CrossJoin(right, opts...) 418 | return fn(df) 419 | } 420 | 421 | // Select the given DataFrame columns by name. 422 | func (df *DataFrame) Select(names ...string) (*DataFrame, error) { 423 | fn := df.mutator.Select(names...) 424 | return fn(df) 425 | } 426 | 427 | // Drop the given DataFrame columns by name. 428 | func (df *DataFrame) Drop(names ...string) (*DataFrame, error) { 429 | fn := df.mutator.Drop(names...) 430 | return fn(df) 431 | } 432 | 433 | // InnerJoin returns a DataFrame containing the inner join of two DataFrames. 434 | func (df *DataFrame) InnerJoin(right *DataFrame, columns []string, opts ...Option) (*DataFrame, error) { 435 | fn := df.mutator.InnerJoin(right, columns, opts...) 436 | return fn(df) 437 | } 438 | 439 | // LeftJoin returns a DataFrame containing the left join of two DataFrames. 
440 | func (df *DataFrame) LeftJoin(right *DataFrame, columns []string, opts ...Option) (*DataFrame, error) { 441 | fn := df.mutator.LeftJoin(right, columns, opts...) 442 | return fn(df) 443 | } 444 | 445 | // OuterJoin returns a DataFrame containing the outer join of two DataFrames. 446 | // Use union of keys from both frames, similar to a SQL full outer join. 447 | func (df *DataFrame) OuterJoin(right *DataFrame, columns []string, opts ...Option) (*DataFrame, error) { 448 | fn := df.mutator.OuterJoin(right, columns, opts...) 449 | return fn(df) 450 | } 451 | 452 | // RightJoin returns a DataFrame containing the right join of two DataFrames. 453 | func (df *DataFrame) RightJoin(right *DataFrame, columns []string, opts ...Option) (*DataFrame, error) { 454 | fn := df.mutator.RightJoin(right, columns, opts...) 455 | return fn(df) 456 | } 457 | 458 | // Slice creates a new DataFrame consisting of rows[beg:end]. 459 | func (df *DataFrame) Slice(beg, end int64) (*DataFrame, error) { 460 | return df.mutator.Slice(beg, end)(df) 461 | } 462 | 463 | // Schema returns the schema of this Frame. 464 | func (df *DataFrame) Schema() *arrow.Schema { 465 | return df.schema 466 | } 467 | 468 | // Retain increases the reference count by 1. 469 | // Retain may be called simultaneously from multiple goroutines. 470 | func (df *DataFrame) Retain() { 471 | atomic.AddInt64(&df.refs, 1) 472 | } 473 | 474 | // Release decreases the reference count by 1. 475 | // When the reference count goes to zero, the memory is freed. 476 | // Release may be called simultaneously from multiple goroutines. 
477 | func (df *DataFrame) Release() { 478 | refs := atomic.AddInt64(&df.refs, -1) 479 | debug.Assert(refs >= 0, "too many releases") 480 | 481 | if refs == 0 { 482 | for i := range df.cols { 483 | df.cols[i].Release() 484 | } 485 | df.cols = nil 486 | } 487 | } 488 | 489 | func (df *DataFrame) validate() error { 490 | if len(df.Columns()) != len(df.schema.Fields()) { 491 | return errors.New("dataframe validate(): table schema mismatch") 492 | } 493 | for i, col := range df.cols { 494 | if !col.Field().Equal(df.schema.Field(i)) { 495 | return errors.Errorf("dataframe validate(): column field %q is inconsistent with schema", col.Name()) 496 | } 497 | colLen := columnLen(col) 498 | if colLen < df.rows { 499 | return errors.Errorf("dataframe validate(): column %q expected length >= %d but got length %d", col.Name(), df.rows, colLen) 500 | } 501 | } 502 | return nil 503 | } 504 | 505 | func compareColumns(left, right *array.Column) bool { 506 | // We have to use value iterators and the only way to do that is to switch on the type 507 | leftDtype := left.DataType() 508 | rightDtype := right.DataType() 509 | if leftDtype.ID() != rightDtype.ID() { 510 | debug.Warnf("warning: comparing different types of columns: %v | %v", leftDtype.Name(), rightDtype.Name()) 511 | return false 512 | } 513 | 514 | // Let's use the stuff we already have to do all columns 515 | it := iterator.NewStepIteratorForColumns([]array.Column{*left, *right}) 516 | defer it.Release() 517 | 518 | for it.Next() { 519 | stepValue := it.Values() 520 | var elTPrev Element 521 | for i := range stepValue.Values { 522 | elT := StepValueElementAt(stepValue, i) 523 | if elTPrev == nil { 524 | elTPrev = elT 525 | continue 526 | } 527 | eq, err := elT.EqStrict(elTPrev) 528 | if err != nil { 529 | debug.Warnf("warning: bullseye/dataframe#compareColumns: %v\n", err) 530 | // types must not be equal 531 | return false 532 | } 533 | if !eq { 534 | return false 535 | } 536 | } 537 | } 538 | 539 | return true 540 | } 

// buildSchema derives an arrow.Schema from the fields of the given columns.
func buildSchema(cols []array.Column) *arrow.Schema {
	fields := make([]arrow.Field, 0, len(cols))
	for i := range cols {
		fields = append(fields, cols[i].Field())
	}
	return arrow.NewSchema(fields, nil)
}

// columnLen returns the number of rows in the Column.
// Because Arrow chunks arrays, you may encounter an overflow if
// there are MaxInt64 rows, i.e. 9223372036854775807.
func columnLen(col array.Column) int64 {
	var length int64
	for _, chunk := range col.Data().Chunks() {
		// Keep our own counters instead of Chunked's
		length += int64(chunk.Len())
	}
	return length
}
-------------------------------------------------------------------------------- /dataframe/dataframe_test.go: --------------------------------------------------------------------------------
package dataframe

import (
	"reflect"
	"testing"

	"github.com/apache/arrow/go/arrow"
	"github.com/apache/arrow/go/arrow/array"
	"github.com/apache/arrow/go/arrow/memory"
	"github.com/go-bullseye/bullseye/iterator"
	"github.com/pkg/errors"
)

const (
	NUMROWS  = int64(30)
	NUMCOLS  = 2
	COL0NAME = "f1-i32"
	COL1NAME = "f2-f64"
)

// buildRecords builds three 10-row records over an (int32, float64) schema.
// `last` replaces the final int32 value of the third record so callers can
// force two otherwise-identical datasets to differ.
func buildRecords(pool *memory.CheckedAllocator, t *testing.T, last int32) ([]array.Record, *arrow.Schema) {
	schema := arrow.NewSchema(
		[]arrow.Field{
			{Name: COL0NAME, Type: arrow.PrimitiveTypes.Int32},
			{Name: COL1NAME, Type: arrow.PrimitiveTypes.Float64},
		},
		nil,
	)

	b := array.NewRecordBuilder(pool, schema)
	defer b.Release()

	b.Field(0).(*array.Int32Builder).AppendValues([]int32{1, 2, 3, 4, 5, 6}, nil)
	b.Field(0).(*array.Int32Builder).AppendValues([]int32{7, 8, 9, 10}, []bool{true, true, false, true})
	b.Field(1).(*array.Float64Builder).AppendValues([]float64{1, 2, 3, 4, 5, 6}, nil)
b.Field(1).(*array.Float64Builder).AppendValues([]float64{7, 8, 9, 10}, []bool{true, true, false, true}) 37 | 38 | rec1 := b.NewRecord() 39 | 40 | b.Field(0).(*array.Int32Builder).AppendValues([]int32{11, 12, 13, 14, 15, 16, 17, 18, 19, 20}, nil) 41 | b.Field(1).(*array.Float64Builder).AppendValues([]float64{11, 12, 13, 14, 15, 16, 17, 18, 19, 20}, nil) 42 | 43 | rec2 := b.NewRecord() 44 | 45 | b.Field(0).(*array.Int32Builder).AppendValues([]int32{31, 32, 33, 34, 35, 36, 37, 38, 39, last}, nil) 46 | b.Field(1).(*array.Float64Builder).AppendValues([]float64{31, 32, 33, 34, 35, 36, 37, 38, 39, 40}, nil) 47 | 48 | rec3 := b.NewRecord() 49 | 50 | return []array.Record{rec1, rec2, rec3}, schema 51 | } 52 | 53 | func getColumns(pool *memory.CheckedAllocator, t *testing.T, last int32) []array.Column { 54 | records, schema := buildRecords(pool, t, last) 55 | for i := range records { 56 | defer records[i].Release() 57 | } 58 | 59 | tbl := array.NewTableFromRecords(schema, records) 60 | defer tbl.Release() 61 | 62 | cols := make([]array.Column, tbl.NumCols()) 63 | for i := range cols { 64 | col := tbl.Column(i) 65 | col.Retain() 66 | cols[i] = *col 67 | } 68 | 69 | return cols 70 | } 71 | 72 | func genValues(length int) []int32 { 73 | colVals := make([]int32, 30) 74 | for i := range colVals { 75 | colVals[i] = int32(i) 76 | } 77 | return colVals 78 | } 79 | 80 | func TestNewDataFrameFromColumns(t *testing.T) { 81 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 82 | defer pool.AssertSize(t, 0) 83 | 84 | cols := getColumns(pool, t, 40) 85 | for i := range cols { 86 | defer cols[i].Release() 87 | } 88 | 89 | df, err := NewDataFrameFromColumns(pool, cols) 90 | if err != nil { 91 | t.Fatal(err) 92 | } 93 | defer df.Release() 94 | 95 | if got, want := df.NumRows(), NUMROWS; got != want { 96 | t.Fatalf("got=%d, want=%d", got, want) 97 | } 98 | } 99 | 100 | func TestNumCols(t *testing.T) { 101 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 102 | defer 
pool.AssertSize(t, 0) 103 | 104 | cols := getColumns(pool, t, 40) 105 | for i := range cols { 106 | defer cols[i].Release() 107 | } 108 | 109 | df, err := NewDataFrameFromColumns(pool, cols) 110 | if err != nil { 111 | t.Fatal(err) 112 | } 113 | defer df.Release() 114 | 115 | if got, want := df.NumCols(), NUMCOLS; got != want { 116 | t.Fatalf("got=%d, want=%d", got, want) 117 | } 118 | } 119 | 120 | func TestNumRows(t *testing.T) { 121 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 122 | defer pool.AssertSize(t, 0) 123 | 124 | cols := getColumns(pool, t, 40) 125 | for i := range cols { 126 | defer cols[i].Release() 127 | } 128 | 129 | df, err := NewDataFrameFromColumns(pool, cols) 130 | if err != nil { 131 | t.Fatal(err) 132 | } 133 | defer df.Release() 134 | 135 | if got, want := df.NumRows(), NUMROWS; got != want { 136 | t.Fatalf("got=%d, want=%d", got, want) 137 | } 138 | } 139 | 140 | func TestDims(t *testing.T) { 141 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 142 | defer pool.AssertSize(t, 0) 143 | 144 | cols := getColumns(pool, t, 40) 145 | for i := range cols { 146 | defer cols[i].Release() 147 | } 148 | 149 | df, err := NewDataFrameFromColumns(pool, cols) 150 | if err != nil { 151 | t.Fatal(err) 152 | } 153 | defer df.Release() 154 | 155 | w, l := df.Dims() 156 | 157 | if got, want := w, NUMCOLS; got != want { 158 | t.Fatalf("got=%d, want=%d", got, want) 159 | } 160 | 161 | if got, want := l, NUMROWS; got != want { 162 | t.Fatalf("got=%d, want=%d", got, want) 163 | } 164 | } 165 | 166 | func TestEquals(t *testing.T) { 167 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 168 | defer pool.AssertSize(t, 0) 169 | 170 | cols := getColumns(pool, t, 40) 171 | for i := range cols { 172 | defer cols[i].Release() 173 | } 174 | 175 | df, err := NewDataFrameFromColumns(pool, cols) 176 | if err != nil { 177 | t.Fatal(err) 178 | } 179 | defer df.Release() 180 | 181 | cols2 := getColumns(pool, t, 40) 182 | defer func() { 183 
		for _, col := range cols2 {
			col.Release()
		}
	}()

	df2, err := NewDataFrameFromColumns(pool, cols2)
	if err != nil {
		t.Fatal(err)
	}
	defer df2.Release()

	if got, want := df.Equals(df2), true; got != want {
		t.Fatalf("got=%v, want=%v", got, want)
	}
}

func TestEqualsFalse(t *testing.T) {
	// This test makes sure Equals returns false as well as true.
	pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer pool.AssertSize(t, 0)

	cols := getColumns(pool, t, 40)
	for i := range cols {
		defer cols[i].Release()
	}

	df, err := NewDataFrameFromColumns(pool, cols)
	if err != nil {
		t.Fatal(err)
	}
	defer df.Release()

	// Use a different `last` value (99) so the third record differs.
	cols2 := getColumns(pool, t, 99)
	defer func() {
		for _, col := range cols2 {
			col.Release()
		}
	}()

	df2, err := NewDataFrameFromColumns(pool, cols2)
	if err != nil {
		t.Fatal(err)
	}
	defer df2.Release()

	if got, want := df.Equals(df2), false; got != want {
		t.Fatalf("got=%v, want=%v", got, want)
	}
}

// TestName checks that Name returns each column's declared name.
func TestName(t *testing.T) {
	pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer pool.AssertSize(t, 0)

	cols := getColumns(pool, t, 40)
	for i := range cols {
		defer cols[i].Release()
	}

	df, err := NewDataFrameFromColumns(pool, cols)
	if err != nil {
		t.Fatal(err)
	}
	defer df.Release()

	if got, want := df.Name(0), COL0NAME; got != want {
		t.Fatalf("got=%s, want=%s", got, want)
	}

	if got, want := df.Name(1), COL1NAME; got != want {
		t.Fatalf("got=%s, want=%s", got, want)
	}
}

// TestSlice checks that Slice(0, 5) yields a 5-row frame.
func TestSlice(t *testing.T) {
	pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer pool.AssertSize(t, 0)

	cols := getColumns(pool, t, 40)
	for i := range cols {
		defer cols[i].Release()
	}

	df, err := NewDataFrameFromColumns(pool, cols)
	if err != nil {
		t.Fatal(err)
	}
	defer df.Release()

	df2, err := df.Slice(0, 5)
	if err != nil {
		t.Fatal(err)
	}
	defer df2.Release()

	if got, want := df2.NumRows(), int64(5); got != want {
		t.Fatalf("got=%d, want=%d", got, want)
	}
}

// TestColumnNames checks the name list of a frame built from a Dict.
func TestColumnNames(t *testing.T) {
	pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer pool.AssertSize(t, 0)

	df, err := NewDataFrameFromMem(pool, Dict{
		"col1-i32": []int32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
		"col2-f64": []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer df.Release()

	if got, want := df.ColumnNames(), []string{"col1-i32", "col2-f64"}; !reflect.DeepEqual(got, want) {
		t.Fatalf("got=%v, want=%v", got, want)
	}
}

// TestColumnTypes checks the field list of a frame built from a Dict.
func TestColumnTypes(t *testing.T) {
	pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer pool.AssertSize(t, 0)

	df, err := NewDataFrameFromMem(pool, Dict{
		"col1-i32": []int32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
		"col2-f64": []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer df.Release()

	field1 := arrow.Field{
		Name:     "col1-i32",
		Type:     arrow.PrimitiveTypes.Int32,
		Nullable: false,
		Metadata: arrow.Metadata{},
	}
	field2 := arrow.Field{
		Name:     "col2-f64",
		Type:     arrow.PrimitiveTypes.Float64,
		Nullable: false,
		Metadata: arrow.Metadata{},
	}

	if got, want := df.ColumnTypes(), []arrow.Field{field1, field2}; !reflect.DeepEqual(got, want) {
		t.Fatalf("got=%v, want=%v", got, want)
	}
}

// TestAppendColumn checks that a generated column is appended and displayed.
func TestAppendColumn(t *testing.T) {
	pool :=
memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer pool.AssertSize(t, 0)

	cols := getColumns(pool, t, 40)
	for i := range cols {
		defer cols[i].Release()
	}

	df, err := NewDataFrameFromColumns(pool, cols)
	if err != nil {
		t.Fatal(err)
	}
	defer df.Release()

	// Create a new Column to append
	col, err := NewColumnFromMem(pool, "col3-i32", genValues(int(df.NumRows())))
	if err != nil {
		t.Fatal(err)
	}
	defer col.Release()

	largerDf, err := df.AppendColumn(col)
	if err != nil {
		t.Fatal(err)
	}
	defer largerDf.Release()

	got := largerDf.Display(-1)
	want := `rec[0]["f1-i32"]: [1 2 3 4 5 6 7 8 (null) 10]
rec[0]["f2-f64"]: [1 2 3 4 5 6 7 8 (null) 10]
rec[0]["col3-i32"]: [0 1 2 3 4 5 6 7 8 9]
rec[1]["f1-i32"]: [11 12 13 14 15 16 17 18 19 20]
rec[1]["f2-f64"]: [11 12 13 14 15 16 17 18 19 20]
rec[1]["col3-i32"]: [10 11 12 13 14 15 16 17 18 19]
rec[2]["f1-i32"]: [31 32 33 34 35 36 37 38 39 40]
rec[2]["f2-f64"]: [31 32 33 34 35 36 37 38 39 40]
rec[2]["col3-i32"]: [20 21 22 23 24 25 26 27 28 29]
`

	if got != want {
		t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want)
	}
}

// TestCopy checks that Copy yields an equal-looking but distinct frame.
func TestCopy(t *testing.T) {
	pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer pool.AssertSize(t, 0)

	df, err := NewDataFrameFromMem(pool, Dict{
		"col1-i32": []int32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
		"col2-f64": []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer df.Release()

	df2, err := df.Copy()
	if err != nil {
		t.Fatal(err)
	}
	defer df2.Release()

	got := df2.Display(-1)
	want := `rec[0]["col1-i32"]: [1 2 3 4 5 6 7 8 9 10]
rec[0]["col2-f64"]: [1 2 3 4 5 6 7 8 9 10]
`
	if got != want {
		t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want)
	}

	if &df == &df2 {
		t.Fatalf("references are the same. df is not a copy of df2 (%v) == (%v)", &df, &df2)
	}
}

// TestSelect checks projection by column name.
func TestSelect(t *testing.T) {
	pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer pool.AssertSize(t, 0)

	df, err := NewDataFrameFromMem(pool, Dict{
		"col1-i32": []int32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
		"col2-f64": []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
		"col3-i32": []int32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
		"col4-f64": []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
		"col5-i32": []int32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
		"col6-f64": []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer df.Release()

	names := []string{"col1-i32", "col3-i32", "col6-f64"}
	df2, err := df.Select(names...)
	if err != nil {
		t.Fatal(err)
	}
	defer df2.Release()

	got := df2.Display(-1)
	want := `rec[0]["col1-i32"]: [1 2 3 4 5 6 7 8 9 10]
rec[0]["col3-i32"]: [1 2 3 4 5 6 7 8 9 10]
rec[0]["col6-f64"]: [1 2 3 4 5 6 7 8 9 10]
`

	if got != want {
		t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want)
	}
}

// TestDrop checks removal of columns by name.
func TestDrop(t *testing.T) {
	pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer pool.AssertSize(t, 0)

	df, err := NewDataFrameFromMem(pool, Dict{
		"col1-i32": []int32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
		"col2-f64": []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
		"col3-i32": []int32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
		"col4-f64": []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
		"col5-i32": []int32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
		"col6-f64": []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer df.Release()

	names := []string{"col1-i32", "col3-i32", "col6-f64"}
	df2, err := df.Drop(names...)
	if err != nil {
		t.Fatal(err)
	}
	defer df2.Release()

	got := df2.Display(-1)
	want := `rec[0]["col2-f64"]: [1 2 3 4 5 6 7 8 9 10]
rec[0]["col4-f64"]: [1 2 3 4 5 6 7 8 9 10]
rec[0]["col5-i32"]: [1 2 3 4 5 6 7 8 9 10]
`

	if got != want {
		t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want)
	}
}

// TestNewDataFrameFromMem checks Display chunking for a Dict-built frame.
func TestNewDataFrameFromMem(t *testing.T) {
	pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer pool.AssertSize(t, 0)

	df, err := NewDataFrameFromMem(pool, Dict{
		"col1-i32": []int32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
		"col2-f64": []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer df.Release()

	got := df.Display(5)
	want := `rec[0]["col1-i32"]: [1 2 3 4 5]
rec[0]["col2-f64"]: [1 2 3 4 5]
rec[1]["col1-i32"]: [6 7 8 9 10]
rec[1]["col2-f64"]: [6 7 8 9 10]
`
	if got != want {
		t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want)
	}
}

// TestNewColumnFromSparseMem checks a sparse column with explicit value indexes.
func TestNewColumnFromSparseMem(t *testing.T) {
	pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer pool.AssertSize(t, 0)

	values := []interface{}{1, nil, 3}
	valueIndexes := []int{0, 2, 4}
	col, err := NewColumnFromSparseMem(pool, "sparse-col-i32", values, valueIndexes, 10)
	if err != nil {
		t.Fatal(err)
	}
	defer col.Release()

	df, err := NewDataFrameFromColumns(pool, []array.Column{*col})
	if err != nil {
		t.Fatal(err)
	}
	defer df.Release()

	got := df.Display(-1)
	want := `rec[0]["sparse-col-i32"]: [1 (null) 0 (null) 3 (null) (null) (null) (null) (null)]
`
	if got != want {
		t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want)
	}
}

// TestColumn checks lookup of a column by name.
func TestColumn(t *testing.T) {
	pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer pool.AssertSize(t, 0)

	df, err := NewDataFrameFromMem(pool, Dict{
		"col1-i32": []int32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
		"col2-f64": []float64{10, 12, 13, 14, 15, 16, 17, 18, 19, 20},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer df.Release()

	name := "col2-f64"

	col := df.Column(name)
	if col == nil {
		t.Fatal("col should not be nil")
	}

	// Column should have the same name
	if got, want := col.Name(), name; got != want {
		t.Fatalf("got=%v, want=%v", got, want)
	}

	// Pointer should be the same
	cols := df.Columns()
	if got, want := &cols[1], col; got != want {
		t.Fatalf("got=%v, want=%v", got, want)
	}
}

// TestColumnAt checks lookup of a column by index.
func TestColumnAt(t *testing.T) {
	pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer pool.AssertSize(t, 0)

	df, err := NewDataFrameFromMem(pool, Dict{
		"col1-i32": []int32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
		"col2-f64": []float64{10, 12, 13, 14, 15, 16, 17, 18, 19, 20},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer df.Release()

	col := df.ColumnAt(1)
	if col == nil {
		t.Fatal("col should not be nil")
	}

	// Pointer should be the same
	cols := df.Columns()
	if got, want := &cols[1], col; got != want {
		t.Fatalf("got=%v, want=%v", got, want)
	}
}

// TestLeftJoin checks a basic left join on columns A and D.
func TestLeftJoin(t *testing.T) {
	pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer pool.AssertSize(t, 0)

	leftDf, err := NewDataFrameFromMem(pool, Dict{
		"A": []float32{5, 2, 3, 1},
		"B": []float64{6, 4, 3, 2},
		"C": []float64{1.7, 2.3, 2.3, 7.8},
		"D": []float64{5, 1, 0, 0},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer leftDf.Release()

	rightDf, err :=
NewDataFrameFromMem(pool, Dict{
		"A": []float32{5, 4, 2, 5},
		"F": []float64{7, 3, 5, 8},
		"D": []float64{5, 0, 0, 0},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer rightDf.Release()

	joinedDf, err := leftDf.LeftJoin(rightDf, []string{"A", "D"})
	if err != nil {
		t.Fatal(err)
	}
	defer joinedDf.Release()

	got := joinedDf.Display(-1)
	want := `rec[0]["A"]: [5 2 3 1]
rec[0]["D"]: [5 1 0 0]
rec[0]["B"]: [6 4 3 2]
rec[0]["C"]: [1.7 2.3 2.3 7.8]
rec[0]["F"]: [7 (null) (null) (null)]
`
	if got != want {
		t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want)
	}
}

func TestLeftJoinCase2(t *testing.T) {
	pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer pool.AssertSize(t, 0)

	// This test is meant to test LeftJoin
	// when there will be duplicate leftDf rows
	// because they matched more than one rightDf row.
	leftDf, err := NewDataFrameFromMem(pool, Dict{
		"A": []float64{5, 2, 3, 1},
		"B": []float64{6, 4, 3, 2},
		"C": []float64{1.7, 2.3, 2.3, 7.8},
		"D": []float64{5, 1, 0, 0},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer leftDf.Release()

	rightDf, err := NewDataFrameFromMem(pool, Dict{
		"A": []float64{5, 4, 2, 5, 3, 3},
		"F": []float64{7, 3, 5, 8, 99, 44},
		"D": []float64{5, 0, 0, 0, 0, 0},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer rightDf.Release()

	joinedDf, err := leftDf.LeftJoin(rightDf, []string{"A", "D"})
	if err != nil {
		t.Fatal(err)
	}
	defer joinedDf.Release()

	got := joinedDf.Display(-1)
	want := `rec[0]["A"]: [5 2 3 3 1]
rec[0]["D"]: [5 1 0 0 0]
rec[0]["B"]: [6 4 3 3 2]
rec[0]["C"]: [1.7 2.3 2.3 2.3 7.8]
rec[0]["F"]: [7 (null) 99 44 (null)]
`
	if got != want {
		t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want)
	}
}

func TestLeftJoinCase3(t *testing.T) {
	pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer pool.AssertSize(t, 0)

	// This test is meant to test LeftJoin
	// when there is only one column to match on
	// that would result in duplicate columns.
	leftDf, err := NewDataFrameFromMem(pool, Dict{
		"A": []float64{5, 2, 3, 1},
		"B": []float64{6, 4, 3, 2},
		"C": []float64{1.7, 2.3, 2.3, 7.8},
		"D": []float64{5, 1, 0, 0},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer leftDf.Release()

	rightDf, err := NewDataFrameFromMem(pool, Dict{
		"A": []float64{5, 4, 2, 5, 3, 3},
		"F": []float64{7, 3, 5, 8, 99, 44},
		"D": []float64{5, 0, 0, 0, 0, 0},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer rightDf.Release()

	joinedDf, err := leftDf.LeftJoin(rightDf, []string{"A"})
	if err != nil {
		t.Fatal(err)
	}
	defer joinedDf.Release()

	got := joinedDf.Display(-1)
	want := `rec[0]["A"]: [5 5 2 3 3 1]
rec[0]["B"]: [6 6 4 3 3 2]
rec[0]["C"]: [1.7 1.7 2.3 2.3 2.3 7.8]
rec[0]["D_0"]: [5 5 1 0 0 0]
rec[0]["D_1"]: [5 0 0 0 0 (null)]
rec[0]["F"]: [7 8 5 99 44 (null)]
`
	if got != want {
		t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want)
	}
}

// TestRightJoin checks a basic right join on columns A and D.
func TestRightJoin(t *testing.T) {
	pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer pool.AssertSize(t, 0)

	leftDf, err := NewDataFrameFromMem(pool, Dict{
		"A": []float64{5, 2, 3, 1},
		"B": []float64{6, 4, 3, 2},
		"C": []float64{1.7, 2.3, 2.3, 7.8},
		"D": []float64{5, 1, 0, 0},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer leftDf.Release()

	rightDf, err := NewDataFrameFromMem(pool, Dict{
		"A": []float64{5, 4, 2, 5},
		"F": []float64{7, 3, 5,
8},
		"D": []float64{5, 0, 0, 0},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer rightDf.Release()

	joinedDf, err := leftDf.RightJoin(rightDf, []string{"A", "D"})
	if err != nil {
		t.Fatal(err)
	}
	defer joinedDf.Release()

	got := joinedDf.Display(-1)
	want := `rec[0]["A"]: [5 4 2 5]
rec[0]["D"]: [5 0 0 0]
rec[0]["F"]: [7 3 5 8]
rec[0]["B"]: [6 (null) (null) (null)]
rec[0]["C"]: [1.7 (null) (null) (null)]
`
	if got != want {
		t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want)
	}
}

func TestRightJoinCase2(t *testing.T) {
	pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer pool.AssertSize(t, 0)

	// This test is meant to test RightJoin
	// when there will be duplicate rightDf rows
	// because they matched more than one leftDf row.
	leftDf, err := NewDataFrameFromMem(pool, Dict{
		"A": []float64{5, 2, 5, 1, 4},
		"B": []float64{6, 4, 3, 2, 9},
		"C": []float64{1.7, 2.3, 2.3, 7.8, 9.1},
		"D": []float64{5, 1, 5, 0, 0},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer leftDf.Release()

	rightDf, err := NewDataFrameFromMem(pool, Dict{
		"A": []float64{5, 4, 8},
		"F": []float64{7, 3, 8},
		"D": []float64{5, 0, 8},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer rightDf.Release()

	joinedDf, err := leftDf.RightJoin(rightDf, []string{"A", "D"})
	if err != nil {
		t.Fatal(err)
	}
	defer joinedDf.Release()

	got := joinedDf.Display(-1)
	want := `rec[0]["A"]: [5 5 4 8]
rec[0]["D"]: [5 5 0 8]
rec[0]["F"]: [7 7 3 8]
rec[0]["B"]: [6 3 9 (null)]
rec[0]["C"]: [1.7 2.3 9.1 (null)]
`
	if got != want {
		t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want)
	}
}

func TestRightJoinCase3(t *testing.T) {
	pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer pool.AssertSize(t, 0)

	// This test is meant to test RightJoin
	// when there is only one column to match on
	// that would result in duplicate columns.
	leftDf, err := NewDataFrameFromMem(pool, Dict{
		"A": []float64{5, 2, 3, 1},
		"B": []float64{6, 4, 3, 2},
		"C": []float64{1.7, 2.3, 2.3, 7.8},
		"D": []float64{5, 1, 0, 0},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer leftDf.Release()

	rightDf, err := NewDataFrameFromMem(pool, Dict{
		"A": []float64{5, 4, 2, 5, 3, 3},
		"F": []float64{7, 3, 5, 8, 99, 44},
		"D": []float64{5, 0, 0, 0, 0, 0},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer rightDf.Release()

	joinedDf, err := leftDf.RightJoin(rightDf, []string{"A"})
	if err != nil {
		t.Fatal(err)
	}
	defer joinedDf.Release()

	got := joinedDf.Display(-1)
	want := `rec[0]["A"]: [5 4 2 5 3 3]
rec[0]["D_1"]: [5 0 0 0 0 0]
rec[0]["F"]: [7 3 5 8 99 44]
rec[0]["B"]: [6 (null) 4 6 3 3]
rec[0]["C"]: [1.7 (null) 2.3 1.7 2.3 2.3]
rec[0]["D_0"]: [5 (null) 1 5 0 0]
`
	if got != want {
		t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want)
	}
}

// TestInnerJoinCase1 checks an inner join that keeps only matching rows.
func TestInnerJoinCase1(t *testing.T) {
	pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer pool.AssertSize(t, 0)

	leftDf, err := NewDataFrameFromMem(pool, Dict{
		"A": []float64{1, 7, 6, 1},
		"B": []float64{2.1, 2.2, 2.3, 2.4},
		"C": []float64{3.3, 8.0, 8.0, 1.1},
		"D": []float64{5, 3, 2, 0},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer leftDf.Release()

	rightDf, err := NewDataFrameFromMem(pool, Dict{
		"A": []float64{2, 0, 6, 1, 6},
		"F": []float64{2, 5, 2, 8, 9},
		"D": []float64{2, 7, 2, 2, 2},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer rightDf.Release()

	joinedDf, err := leftDf.InnerJoin(rightDf, []string{"A", "D"})
	if err != nil {
		t.Fatal(err)
	}
	defer joinedDf.Release()

	got := joinedDf.Display(-1)
	want := `rec[0]["A"]: [6 6]
rec[0]["D"]: [2 2]
rec[0]["B"]: [2.3 2.3]
rec[0]["C"]: [8 8]
rec[0]["F"]: [2 9]
`
	if got != want {
		t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want)
	}
}

// TestOuterJoin checks a full outer join on columns A and D.
func TestOuterJoin(t *testing.T) {
	pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer pool.AssertSize(t, 0)

	leftDf, err := NewDataFrameFromMem(pool, Dict{
		"A": []int32{5, 2, 3, 1},
		"B": []float64{6, 4, 3, 2},
		"C": []float64{1.7, 2.3, 2.3, 7.8},
		"D": []int64{5, 1, 0, 0},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer leftDf.Release()

	rightDf, err := NewDataFrameFromMem(pool, Dict{
		"A": []int32{5, 4, 2, 5},
		"F": []float64{7, 3, 5, 8},
		"D": []int64{5, 0, 0, 0},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer rightDf.Release()

	joinedDf, err := leftDf.OuterJoin(rightDf, []string{"A", "D"})
	if err != nil {
		t.Fatal(err)
	}
	defer joinedDf.Release()

	got := joinedDf.Display(-1)
	want := `rec[0]["A"]: [5 2 3 1 4 2 5]
rec[0]["D"]: [5 1 0 0 0 0 0]
rec[0]["B"]: [6 4 3 2 (null) (null) (null)]
rec[0]["C"]: [1.7 2.3 2.3 7.8 (null) (null) (null)]
rec[0]["F"]: [7 (null) (null) (null) 3 5 8]
`
	if got != want {
		t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want)
	}
}

// TestOuterJoinCase2 checks an outer join on a single key with mixed int widths.
func TestOuterJoinCase2(t *testing.T) {
	pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer pool.AssertSize(t, 0)

	leftDf, err := NewDataFrameFromMem(pool, Dict{
		"A": []uint8{5, 2, 3, 1},
		"B": []float64{6, 4, 3, 2},
		"C": []float64{1.7, 2.3, 2.3, 7.8},
		"D": []int16{5, 1, 0, 0},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer leftDf.Release()

	rightDf, err := NewDataFrameFromMem(pool, Dict{
		"A": []uint8{5, 4, 2, 5},
		"F": []int8{7, 3, 5, 8},
		"D": []int16{5, 0, 0, 0},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer rightDf.Release()

	joinedDf, err := leftDf.OuterJoin(rightDf, []string{"A"})
	if err != nil {
		t.Fatal(err)
	}
	defer joinedDf.Release()

	got := joinedDf.Display(-1)
	want := `rec[0]["A"]: [5 5 2 3 1 4]
rec[0]["B"]: [6 6 4 3 2 (null)]
rec[0]["C"]: [1.7 1.7 2.3 2.3 7.8 (null)]
rec[0]["D_0"]: [5 5 1 0 0 (null)]
rec[0]["D_1"]: [5 0 0 (null) (null) 0]
rec[0]["F"]: [7 8 5 (null) (null) 3]
`
	if got != want {
		t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want)
	}
}

func TestOuterJoinCase3(t *testing.T) {
	pool := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer pool.AssertSize(t, 0)

	// When elements are nil at the same location we should not consider them equal as they are unknown.
	// This follows SQL practices.
994 | leftDf, err := NewDataFrameFromMem(pool, Dict{ 995 | "A": []interface{}{nil, 2, 3, 1}, 996 | "B": []float64{6, 4, 3, 2}, 997 | "C": []float64{1.7, 2.3, 2.3, 7.8}, 998 | "D": []int64{5, 1, 0, 0}, 999 | }) 1000 | if err != nil { 1001 | t.Fatal(err) 1002 | } 1003 | defer leftDf.Release() 1004 | 1005 | rightDf, err := NewDataFrameFromMem(pool, Dict{ 1006 | "A": []interface{}{nil, 4, 2, 5}, 1007 | "F": []float64{7, 3, 5, 8}, 1008 | "D": []int64{5, 0, 0, 0}, 1009 | }) 1010 | if err != nil { 1011 | t.Fatal(err) 1012 | } 1013 | defer rightDf.Release() 1014 | 1015 | joinedDf, err := leftDf.OuterJoin(rightDf, []string{"A", "D"}) 1016 | if err != nil { 1017 | t.Fatal(err) 1018 | } 1019 | defer joinedDf.Release() 1020 | 1021 | got := joinedDf.Display(-1) 1022 | want := `rec[0]["A"]: [(null) 2 3 1 (null) 4 2 5] 1023 | rec[0]["D"]: [5 1 0 0 5 0 0 0] 1024 | rec[0]["B"]: [6 4 3 2 (null) (null) (null) (null)] 1025 | rec[0]["C"]: [1.7 2.3 2.3 7.8 (null) (null) (null) (null)] 1026 | rec[0]["F"]: [(null) (null) (null) (null) 7 3 5 8] 1027 | ` 1028 | if got != want { 1029 | t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want) 1030 | } 1031 | } 1032 | 1033 | func TestCrossJoin(t *testing.T) { 1034 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 1035 | defer pool.AssertSize(t, 0) 1036 | 1037 | leftDf, err := NewDataFrameFromMem(pool, Dict{ 1038 | "A": []int64{5, 2, 3, 1}, 1039 | "B": []float64{6, 4, 3, 2}, 1040 | "C": []float64{1.7, 2.3, 2.3, 7.8}, 1041 | "D": []float32{5, 1, 0, 0}, 1042 | }) 1043 | if err != nil { 1044 | t.Fatal(err) 1045 | } 1046 | defer leftDf.Release() 1047 | 1048 | rightDf, err := NewDataFrameFromMem(pool, Dict{ 1049 | "A": []int64{5, 4, 2, 5, 10}, 1050 | "F": []int32{7, 3, 5, 8, 11}, 1051 | "D": []float32{5, 0, 0, 0, 12}, 1052 | }) 1053 | if err != nil { 1054 | t.Fatal(err) 1055 | } 1056 | defer rightDf.Release() 1057 | 1058 | joinedDf, err := leftDf.CrossJoin(rightDf) 1059 | if err != nil { 1060 | t.Fatal(err) 1061 | } 1062 | defer joinedDf.Release() 
1063 | 1064 | got := joinedDf.Display(-1) 1065 | want := `rec[0]["A_0"]: [5 5 5 5 5 2 2 2 2 2 3 3 3 3 3 1 1 1 1 1] 1066 | rec[0]["B"]: [6 6 6 6 6 4 4 4 4 4 3 3 3 3 3 2 2 2 2 2] 1067 | rec[0]["C"]: [1.7 1.7 1.7 1.7 1.7 2.3 2.3 2.3 2.3 2.3 2.3 2.3 2.3 2.3 2.3 7.8 7.8 7.8 7.8 7.8] 1068 | rec[0]["D_0"]: [5 5 5 5 5 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0] 1069 | rec[0]["A_1"]: [5 4 2 5 10 5 4 2 5 10 5 4 2 5 10 5 4 2 5 10] 1070 | rec[0]["D_1"]: [5 0 0 0 12 5 0 0 0 12 5 0 0 0 12 5 0 0 0 12] 1071 | rec[0]["F"]: [7 3 5 8 11 7 3 5 8 11 7 3 5 8 11 7 3 5 8 11] 1072 | ` 1073 | if got != want { 1074 | t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want) 1075 | } 1076 | } 1077 | 1078 | func TestJoinSuffix(t *testing.T) { 1079 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 1080 | defer pool.AssertSize(t, 0) 1081 | 1082 | // This test is meant to test RightJoin 1083 | // when there is only one column to match on 1084 | // that would result in duplicate columns. 1085 | leftDf, err := NewDataFrameFromMem(pool, Dict{ 1086 | "A": []float64{5, 2, 3, 1}, 1087 | "B": []float64{6, 4, 3, 2}, 1088 | "C": []float64{1.7, 2.3, 2.3, 7.8}, 1089 | "D": []float64{5, 1, 0, 0}, 1090 | }) 1091 | if err != nil { 1092 | t.Fatal(err) 1093 | } 1094 | defer leftDf.Release() 1095 | 1096 | rightDf, err := NewDataFrameFromMem(pool, Dict{ 1097 | "A": []float64{5, 4, 2, 5, 3, 3}, 1098 | "F": []float64{7, 3, 5, 8, 99, 44}, 1099 | "D": []float64{5, 0, 0, 0, 0, 0}, 1100 | }) 1101 | if err != nil { 1102 | t.Fatal(err) 1103 | } 1104 | defer rightDf.Release() 1105 | 1106 | joinedDf, err := leftDf.RightJoin(rightDf, []string{"A"}, WithLsuffix("_left"), WithRsuffix("_right")) 1107 | if err != nil { 1108 | t.Fatal(err) 1109 | } 1110 | defer joinedDf.Release() 1111 | 1112 | got := joinedDf.Display(-1) 1113 | want := `rec[0]["A"]: [5 4 2 5 3 3] 1114 | rec[0]["D_right"]: [5 0 0 0 0 0] 1115 | rec[0]["F"]: [7 3 5 8 99 44] 1116 | rec[0]["B"]: [6 (null) 4 6 3 3] 1117 | rec[0]["C"]: [1.7 (null) 2.3 1.7 2.3 2.3] 1118 | 
rec[0]["D_left"]: [5 (null) 1 5 0 0] 1119 | ` 1120 | if got != want { 1121 | t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want) 1122 | } 1123 | } 1124 | 1125 | func TestInconsistentDataTypesError(t *testing.T) { 1126 | // When elements are nil at the same location we should not consider them equal as they are unknown. 1127 | // This follows SQL practices. 1128 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 1129 | defer pool.AssertSize(t, 0) 1130 | 1131 | df, err := NewDataFrameFromMem(pool, Dict{ 1132 | "A": []interface{}{nil, 2, 3, 1.2}, 1133 | "B": []float64{6, 4, 3, 2}, 1134 | "C": []float64{1.7, 2.3, 2.3, 7.8}, 1135 | "D": []int64{5, 1, 0, 0}, 1136 | }) 1137 | if err == nil { 1138 | defer df.Release() 1139 | } 1140 | 1141 | var v int 1142 | got := err 1143 | want := errors.Errorf("inconsistent data types for elements, expecting %v to be of type (%T)", 1.2, v) 1144 | if got.Error() != want.Error() { 1145 | t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want) 1146 | } 1147 | } 1148 | 1149 | // multByN takes a DataFrame and multiplies a the column by the provided multipier. 
1150 | func multByN(columnName string, multipier float64) MutationFunc { 1151 | return func(df *DataFrame) (*DataFrame, error) { 1152 | col := df.Column(columnName) 1153 | schema := arrow.NewSchema([]arrow.Field{col.Field()}, nil) 1154 | builder := array.NewRecordBuilder(df.Allocator(), schema) 1155 | defer builder.Release() 1156 | smartBuilder := NewSmartBuilder(builder, schema) 1157 | valueIterator := iterator.NewFloat64ValueIterator(col) 1158 | defer valueIterator.Release() 1159 | for valueIterator.Next() { 1160 | value, isNil := valueIterator.Value() 1161 | if isNil { 1162 | smartBuilder.Append(0, nil) 1163 | } else { 1164 | value *= multipier 1165 | smartBuilder.Append(0, value) 1166 | } 1167 | } 1168 | rec := builder.NewRecord() 1169 | defer rec.Release() 1170 | chunk := array.NewChunked(col.DataType(), rec.Columns()) 1171 | defer chunk.Release() 1172 | newCol := array.NewColumn(col.Field(), chunk) 1173 | defer newCol.Release() 1174 | df2, err := df.Drop(columnName) 1175 | if err != nil { 1176 | return nil, err 1177 | } 1178 | defer df2.Release() 1179 | return df2.AppendColumn(newCol) 1180 | } 1181 | } 1182 | 1183 | func TestApply(t *testing.T) { 1184 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 1185 | defer pool.AssertSize(t, 0) 1186 | 1187 | df, err := NewDataFrameFromMem(pool, Dict{ 1188 | "col1-i32": []int32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, 1189 | "col2-f64": []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, 1190 | }) 1191 | if err != nil { 1192 | t.Fatal(err) 1193 | } 1194 | defer df.Release() 1195 | 1196 | df2, err := df.Apply(multByN("col2-f64", 2.0), multByN("col2-f64", -1.0)) 1197 | if err != nil { 1198 | t.Fatal(err) 1199 | } 1200 | defer df2.Release() 1201 | 1202 | got := df2.Display(-1) 1203 | want := `rec[0]["col1-i32"]: [1 2 3 4 5 6 7 8 9 10] 1204 | rec[0]["col2-f64"]: [-2 -4 -6 -8 -10 -12 -14 -16 -18 -20] 1205 | ` 1206 | if got != want { 1207 | t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want) 1208 | } 1209 | 1210 | if &df == &df2 { 1211 | 
t.Fatalf("references are the same. df is not a copy of df2 (%v) == (%v)", &df, &df2) 1212 | } 1213 | } 1214 | 1215 | func TestApplyToColumn(t *testing.T) { 1216 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 1217 | defer pool.AssertSize(t, 0) 1218 | 1219 | df, err := NewDataFrameFromMem(pool, Dict{ 1220 | "col1-i32": []int32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, 1221 | "col2-f64": []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, 1222 | }) 1223 | if err != nil { 1224 | t.Fatal(err) 1225 | } 1226 | defer df.Release() 1227 | 1228 | df2, err := df.ApplyToColumn("col2-f64", "col2-f64-x2", func(v interface{}) (interface{}, error) { 1229 | // This function will be called for every element in "col2-f64" 1230 | if v == nil { 1231 | // can't multiply nil by anything 1232 | return nil, nil 1233 | } 1234 | value, ok := v.(float64) 1235 | if !ok { 1236 | return nil, errors.New("v is not a float64") 1237 | } 1238 | value *= 2 1239 | return value, nil 1240 | }) 1241 | 1242 | if err != nil { 1243 | t.Fatal(err) 1244 | } 1245 | defer df2.Release() 1246 | 1247 | got := df2.Display(-1) 1248 | want := `rec[0]["col1-i32"]: [1 2 3 4 5 6 7 8 9 10] 1249 | rec[0]["col2-f64"]: [1 2 3 4 5 6 7 8 9 10] 1250 | rec[0]["col2-f64-x2"]: [2 4 6 8 10 12 14 16 18 20] 1251 | ` 1252 | if got != want { 1253 | t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want) 1254 | } 1255 | 1256 | if &df == &df2 { 1257 | t.Fatalf("references are the same. 
df is not a copy of df2 (%v) == (%v)", &df, &df2) 1258 | } 1259 | } 1260 | 1261 | func TestNewDataFrameFromTable(t *testing.T) { 1262 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 1263 | defer pool.AssertSize(t, 0) 1264 | 1265 | records, schema := buildRecords(pool, t, 48) 1266 | for i := range records { 1267 | defer records[i].Release() 1268 | } 1269 | 1270 | table := array.NewTableFromRecords(schema, records) 1271 | defer table.Release() 1272 | 1273 | df, err := NewDataFrameFromTable(pool, table) 1274 | if err != nil { 1275 | t.Fatal(err) 1276 | } 1277 | defer df.Release() 1278 | 1279 | got := df.Display(-1) 1280 | want := `rec[0]["f1-i32"]: [1 2 3 4 5 6 7 8 (null) 10] 1281 | rec[0]["f2-f64"]: [1 2 3 4 5 6 7 8 (null) 10] 1282 | rec[1]["f1-i32"]: [11 12 13 14 15 16 17 18 19 20] 1283 | rec[1]["f2-f64"]: [11 12 13 14 15 16 17 18 19 20] 1284 | rec[2]["f1-i32"]: [31 32 33 34 35 36 37 38 39 48] 1285 | rec[2]["f2-f64"]: [31 32 33 34 35 36 37 38 39 40] 1286 | ` 1287 | if got != want { 1288 | t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want) 1289 | } 1290 | } 1291 | -------------------------------------------------------------------------------- /dataframe/doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package dataframe provides the DataFrame implementation. 3 | 4 | */ 5 | package dataframe 6 | -------------------------------------------------------------------------------- /dataframe/element.go: -------------------------------------------------------------------------------- 1 | package dataframe 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/apache/arrow/go/arrow" 7 | ) 8 | 9 | // Element is an interface for Elements within a Column. 10 | type Element interface { 11 | // Compare methods 12 | // Eq returns true if the left Element is equal to the right Element. 
 13 | // When both are nil Eq returns false because nil actually signifies "unknown"
 14 | // and you can't compare two things when you don't know what they are.
 15 | 	Eq(Element) (bool, error)
 16 | 	// EqStrict returns true if the left Element is equal to the right Element.
 17 | 	// When both are nil EqStrict returns true.
 18 | 	EqStrict(Element) (bool, error)
 19 | 	// Neq returns true when Eq returns false.
 20 | 	Neq(Element) (bool, error)
 21 | 	// Less returns true if the left Element is less than the right Element.
 22 | 	Less(Element) (bool, error)
 23 | 	// LessEq returns true if the left Element is less than or equal to the right Element.
 24 | 	LessEq(Element) (bool, error)
 25 | 	// Greater returns true if the left Element is greater than the right Element.
 26 | 	Greater(Element) (bool, error)
 27 | 	// GreaterEq returns true if the left Element is greater than or equal to the right Element.
 28 | 	GreaterEq(Element) (bool, error)
 29 | 
 30 | 	// Accessor/conversion methods
 31 | 
 32 | 	// Copy returns a copy of this Element.
 33 | 	Copy() Element
 34 | 
 35 | 	// Information methods
 36 | 
 37 | 	// String prints the value of this element as a string.
 38 | 	String() string
 39 | 	// IsNil returns true when the underlying value is nil.
 40 | 	IsNil() bool
 41 | }
 42 | 
 43 | // CastElement returns an Element type for the passed DataType and value v.
44 | func CastElement(dtype arrow.DataType, v interface{}) Element { 45 | switch dtype.(type) { 46 | // case *arrow.NullType: // TODO: implement 47 | // case *arrow.BooleanType: // TODO: implement 48 | case *arrow.Uint8Type: 49 | return NewUint8Element(v) 50 | case *arrow.Int8Type: 51 | return NewInt8Element(v) 52 | case *arrow.Uint16Type: 53 | return NewUint16Element(v) 54 | case *arrow.Int16Type: 55 | return NewInt16Element(v) 56 | case *arrow.Uint32Type: 57 | return NewUint32Element(v) 58 | case *arrow.Int32Type: 59 | return NewInt32Element(v) 60 | case *arrow.Uint64Type: 61 | return NewUint64Element(v) 62 | case *arrow.Int64Type: 63 | return NewInt64Element(v) 64 | // case arrow.HALF_FLOAT: // TODO: implement? 65 | case *arrow.Float32Type: 66 | return NewFloat32Element(v) 67 | case *arrow.Float64Type: 68 | return NewFloat64Element(v) 69 | case *arrow.Date32Type: 70 | return NewDate32Element(v) 71 | case *arrow.Date64Type: 72 | return NewDate64Element(v) 73 | // case *arrow.StringType: // TODO: implement 74 | 75 | } 76 | panic(fmt.Errorf("bullseye/element: unsupported element for %T", dtype)) 77 | } 78 | -------------------------------------------------------------------------------- /dataframe/element_numeric.gen.go.tmpl: -------------------------------------------------------------------------------- 1 | package dataframe 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/apache/arrow/go/arrow" 7 | "github.com/go-bullseye/bullseye" 8 | ) 9 | 10 | {{range .In}} 11 | // {{.Name}}Element has logic to apply to this type. 12 | type {{.Name}}Element struct { 13 | v interface{} 14 | } 15 | 16 | // New{{.Name}}Element creates a new {{.Name}}Element logic wrapper 17 | // from the given value provided as v. 18 | func New{{.Name}}Element(v interface{}) *{{.Name}}Element { 19 | return &{{.Name}}Element{ 20 | v: v, 21 | } 22 | } 23 | 24 | // compare takes the left and right elements and applies the comparator function to them. 
 25 | func (e {{.Name}}Element) compare(r Element, f func(left, right {{or .QualifiedType .Type}}) bool) (bool, error) {
 26 | 	rE, ok := r.(*{{.Name}}Element)
 27 | 	if !ok {
 28 | 		return false, fmt.Errorf("cannot cast %v to {{.Name}}Element", r)
 29 | 	}
 30 | 
 31 | 	// When their nil status isn't the same, we can't compare them.
 32 | 	// Explicit both nil should be handled elsewhere.
 33 | 	if e.IsNil() != rE.IsNil() {
 34 | 		return false, nil
 35 | 	}
 36 | 
 37 | 	lv, lok := e.v.({{or .QualifiedType .Type}})
 38 | 	if !lok {
 39 | 		return false, fmt.Errorf("cannot assert %v is a {{or .QualifiedType .Type}}", e.v)
 40 | 	}
 41 | 	rv, rok := rE.v.({{or .QualifiedType .Type}})
 42 | 	if !rok {
 43 | 		return false, fmt.Errorf("cannot assert %v is a {{or .QualifiedType .Type}}", rE.v)
 44 | 	}
 45 | 
 46 | 	return f(lv, rv), nil
 47 | }
 48 | 
 49 | // Comparison methods
 50 | 
 51 | // Eq returns true if the left {{.Name}}Element is equal to the right {{.Name}}Element.
 52 | // When both are nil Eq returns false because nil actually signifies "unknown"
 53 | // and you can't compare two things when you don't know what they are.
 54 | func (e {{.Name}}Element) Eq(r Element) (bool, error) {
 55 | 	if e.IsNil() && r.IsNil() {
 56 | 		return false, nil
 57 | 	}
 58 | 	return e.compare(r, func(left, right {{or .QualifiedType .Type}}) bool {
 59 | 		return left == right
 60 | 	})
 61 | }
 62 | 
 63 | // EqStrict returns true if the left {{.Name}}Element is equal to the right {{.Name}}Element.
 64 | // When both are nil EqStrict returns true.
 65 | func (e {{.Name}}Element) EqStrict(r Element) (bool, error) {
 66 | 	if e.IsNil() && r.IsNil() {
 67 | 		return true, nil
 68 | 	}
 69 | 	return e.compare(r, func(left, right {{or .QualifiedType .Type}}) bool {
 70 | 		return left == right
 71 | 	})
 72 | }
 73 | 
 74 | // Neq returns true if the left {{.Name}}Element
 75 | // is not equal to the right {{.Name}}Element.
76 | func (e {{.Name}}Element) Neq(r Element) (bool, error) { 77 | v, ok := e.Eq(r) 78 | return !v, ok 79 | } 80 | 81 | // Less returns true if the left {{.Name}}Element 82 | // is less than the right {{.Name}}Element. 83 | func (e {{.Name}}Element) Less(r Element) (bool, error) { 84 | return e.compare(r, func(left, right {{or .QualifiedType .Type}}) bool { 85 | return left < right 86 | }) 87 | } 88 | 89 | // LessEq returns true if the left {{.Name}}Element 90 | // is less than or equal to the right {{.Name}}Element. 91 | func (e {{.Name}}Element) LessEq(r Element) (bool, error) { 92 | return e.compare(r, func(left, right {{or .QualifiedType .Type}}) bool { 93 | return left <= right 94 | }) 95 | } 96 | 97 | // Greater returns true if the left {{.Name}}Element 98 | // is greter than the right {{.Name}}Element. 99 | func (e {{.Name}}Element) Greater(r Element) (bool, error) { 100 | return e.compare(r, func(left, right {{or .QualifiedType .Type}}) bool { 101 | return left > right 102 | }) 103 | } 104 | 105 | // GreaterEq returns true if the left {{.Name}}Element 106 | // is greter than or equal to the right {{.Name}}Element. 107 | func (e {{.Name}}Element) GreaterEq(r Element) (bool, error) { 108 | return e.compare(r, func(left, right {{or .QualifiedType .Type}}) bool { 109 | return left >= right 110 | }) 111 | } 112 | 113 | // Accessor/conversion methods 114 | 115 | // Copy returns a copy of this {{.Name}}Element. 116 | func (e {{.Name}}Element) Copy() Element { 117 | return e 118 | } 119 | 120 | // String prints the value of this element as a string. 121 | func (e {{.Name}}Element) String() string { 122 | return fmt.Sprintf("%v", e.v) 123 | } 124 | 125 | // Information methods 126 | 127 | // IsNil returns true when the underlying value is nil. 
128 | func (e {{.Name}}Element) IsNil() bool { 129 | return e.v == nil 130 | } 131 | 132 | 133 | {{end}} 134 | -------------------------------------------------------------------------------- /dataframe/example_test.go: -------------------------------------------------------------------------------- 1 | package dataframe_test 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/apache/arrow/go/arrow/memory" 7 | "github.com/go-bullseye/bullseye/dataframe" 8 | ) 9 | 10 | // This example demonstrates creating a new DataFrame from memory 11 | // using a Dict and displaying the contents of it. 12 | func Example_newDataFrameFromMemory() { 13 | pool := memory.NewGoAllocator() 14 | df, _ := dataframe.NewDataFrameFromMem(pool, dataframe.Dict{ 15 | "col1": []int32{1, 2, 3, 4, 5}, 16 | "col2": []float64{1.1, 2.2, 3.3, 4.4, 5}, 17 | "col3": []string{"foo", "bar", "ping", "", "pong"}, 18 | "col4": []interface{}{2, 4, 6, nil, 8}, 19 | }) 20 | defer df.Release() 21 | fmt.Printf("DataFrame:\n%s\n", df.Display(0)) 22 | 23 | // Output: 24 | // DataFrame: 25 | // rec[0]["col1"]: [1 2 3 4 5] 26 | // rec[0]["col2"]: [1.1 2.2 3.3 4.4 5] 27 | // rec[0]["col3"]: ["foo" "bar" "ping" "" "pong"] 28 | // rec[0]["col4"]: [2 4 6 (null) 8] 29 | } 30 | -------------------------------------------------------------------------------- /dataframe/mutations.go: -------------------------------------------------------------------------------- 1 | package dataframe 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/apache/arrow/go/arrow" 7 | "github.com/apache/arrow/go/arrow/array" 8 | "github.com/apache/arrow/go/arrow/memory" 9 | "github.com/go-bullseye/bullseye/iterator" 10 | "github.com/pkg/errors" 11 | ) 12 | 13 | // Mutator is a type that has some standard mutations. 14 | type Mutator struct { 15 | // Almost all mutations will require setting up new memory as they create new a DataFrame. 16 | // So we need to provide the ability to set the Allocator. 
17 | mem memory.Allocator 18 | } 19 | 20 | // NewMutator creates a new mutator. 21 | func NewMutator(mem memory.Allocator) *Mutator { 22 | return &Mutator{ 23 | mem: mem, 24 | } 25 | } 26 | 27 | // MutationFunc is a function that mutates an existing DataFrame and returns a new DataFrame or an error. 28 | type MutationFunc func(*DataFrame) (*DataFrame, error) 29 | 30 | // Select the given DataFrame columns by name. 31 | func (m *Mutator) Select(names ...string) MutationFunc { 32 | return func(df *DataFrame) (*DataFrame, error) { 33 | cols := df.SelectColumns(names...) 34 | return NewDataFrameFromShape(m.mem, cols, df.NumRows()) 35 | } 36 | } 37 | 38 | // Drop the given DataFrame columns by name. 39 | func (m *Mutator) Drop(names ...string) MutationFunc { 40 | return func(df *DataFrame) (*DataFrame, error) { 41 | cols := df.RejectColumns(names...) 42 | return NewDataFrameFromShape(m.mem, cols, df.NumRows()) 43 | } 44 | } 45 | 46 | // Slice creates a new DataFrame consisting of rows[beg:end]. 47 | func (m *Mutator) Slice(beg, end int64) MutationFunc { 48 | return func(df *DataFrame) (*DataFrame, error) { 49 | if end > df.NumRows() || beg > end { 50 | return nil, errors.Errorf("mutation: index out of range") 51 | } 52 | 53 | dfCols := df.Columns() 54 | 55 | cols := make([]array.Column, len(dfCols)) 56 | for i, col := range dfCols { 57 | cols[i] = *col.NewSlice(beg, end) 58 | } 59 | 60 | defer func() { 61 | for i := range cols { 62 | cols[i].Release() 63 | } 64 | }() 65 | 66 | rows := end - beg 67 | return NewDataFrameFromShape(m.mem, cols, rows) 68 | } 69 | } 70 | 71 | // leftJoinConfig are the config params for LeftJoin. 72 | type leftJoinConfig struct { 73 | lsuffix string 74 | rsuffix string 75 | } 76 | 77 | // newLeftJoinConfig creates a new config using options and validates it. 
78 | func newLeftJoinConfig(opts ...Option) (*leftJoinConfig, error) { 79 | cfg := defaultLeftJoinConfig() 80 | for _, opt := range opts { 81 | if err := opt(cfg); err != nil { 82 | return cfg, err 83 | } 84 | } 85 | err := cfg.validate() 86 | return cfg, err 87 | } 88 | 89 | func (c *leftJoinConfig) validate() error { 90 | if c.lsuffix == c.rsuffix { 91 | return errors.Errorf("lsuffix (%s) cannot be the same as rsuffix (%s)", c.lsuffix, c.rsuffix) 92 | } 93 | return nil 94 | } 95 | 96 | // defaultLeftJoinConfig returns the default defaultLeftJoinConfig. 97 | func defaultLeftJoinConfig() *leftJoinConfig { 98 | return &leftJoinConfig{ 99 | lsuffix: "_0", 100 | rsuffix: "_1", 101 | } 102 | } 103 | 104 | // WithLsuffix configures a right or left join to use the provided left suffix. 105 | func WithLsuffix(lsuffix string) Option { 106 | return func(p interface{}) error { 107 | o, ok := p.(*leftJoinConfig) 108 | if !ok { 109 | return errors.Errorf("cannot apply WithLsuffix to: %T", p) 110 | } 111 | o.lsuffix = lsuffix 112 | return nil 113 | } 114 | } 115 | 116 | // WithRsuffix configures a right or left join to use the provided left suffix. 117 | func WithRsuffix(rsuffix string) Option { 118 | return func(p interface{}) error { 119 | o, ok := p.(*leftJoinConfig) 120 | if !ok { 121 | return errors.Errorf("cannot apply WithRsuffix to: %T", p) 122 | } 123 | o.rsuffix = rsuffix 124 | return nil 125 | } 126 | } 127 | 128 | // RightJoin returns a DataFrame containing the right join of two DataFrames. 129 | // Acts like SQL in that nil elements are treated as unknown so nil != nil. 130 | func (m *Mutator) RightJoin(rightDf *DataFrame, columnNames []string, opts ...Option) MutationFunc { 131 | // RightJoin is just a LeftJoin in reverse order. 132 | cfg, err := newLeftJoinConfig(opts...) 
133 | if err == nil { 134 | // Swap lsuffix and rsuffix 135 | lsuffix := cfg.lsuffix 136 | cfg.lsuffix = cfg.rsuffix 137 | cfg.rsuffix = lsuffix 138 | } 139 | 140 | return func(leftDf *DataFrame) (*DataFrame, error) { 141 | if err != nil { 142 | return nil, err 143 | } 144 | 145 | // We swap leftDf and rightDf 146 | data, err := m.leftJoin(cfg, rightDf, leftDf, columnNames) 147 | if err != nil { 148 | return nil, err 149 | } 150 | defer data.Release() 151 | // return fn(rightDf) 152 | return data.buildDataFrame() 153 | } 154 | } 155 | 156 | // LeftJoin returns a DataFrame containing the left join of two DataFrames. 157 | // Acts like SQL in that nil elements are treated as unknown so nil != nil. 158 | func (m *Mutator) LeftJoin(rightDf *DataFrame, columnNames []string, opts ...Option) MutationFunc { 159 | cfg, err := newLeftJoinConfig(opts...) 160 | return func(leftDf *DataFrame) (*DataFrame, error) { 161 | if err != nil { 162 | return nil, err 163 | } 164 | 165 | data, err := m.leftJoin(cfg, leftDf, rightDf, columnNames) 166 | if err != nil { 167 | return nil, err 168 | } 169 | defer data.Release() 170 | return data.buildDataFrame() 171 | } 172 | } 173 | 174 | type joinFuncConfig struct { 175 | // Keep a ref to the mutator that created it 176 | mutator *Mutator 177 | 178 | matchingLeftColsLen int 179 | matchingRightColsLen int 180 | additionalLeftColsLen int 181 | additionalRightColsLen int 182 | columnNames []string 183 | leftColumns []array.Column 184 | rightColumns []array.Column 185 | schema *arrow.Schema 186 | recordBuilder *array.RecordBuilder 187 | smartBuilder *SmartBuilder 188 | } 189 | 190 | // newJoinFuncConfig builds up all the data needed to do a join. 
191 | // TODO(nickpoorman): maybe rename leftJoinConfig if this is going to be used for other joins 192 | func (m *Mutator) newJoinFuncConfig(cfg *leftJoinConfig, leftDf *DataFrame, rightDf *DataFrame, columnNames []string, forceNullable bool) (*joinFuncConfig, error) { 193 | jc := &joinFuncConfig{ 194 | mutator: m, 195 | columnNames: columnNames, 196 | leftColumns: make([]array.Column, 0, leftDf.NumCols()), 197 | rightColumns: make([]array.Column, 0, rightDf.NumCols()), 198 | } 199 | 200 | // Start by making sure that both DataFrames have the columns we are looking for. 201 | for _, name := range columnNames { 202 | leftColumn := leftDf.Column(name) 203 | if leftColumn == nil { 204 | return nil, errors.Errorf("bullseye/mutations: column %s is not in left DataFrame: (%v)", name, leftDf.ColumnNames()) 205 | } 206 | rightColumn := rightDf.Column(name) 207 | if rightColumn == nil { 208 | return nil, errors.Errorf("bullseye/mutations: column %s is not in right DataFrame: (%v)", name, rightDf.ColumnNames()) 209 | } 210 | 211 | jc.leftColumns = append(jc.leftColumns, *leftColumn) 212 | jc.rightColumns = append(jc.rightColumns, *rightColumn) 213 | } 214 | // Keep track of the number of matching left and right columns. (They should be the same number) 215 | jc.matchingLeftColsLen = len(jc.leftColumns) 216 | jc.matchingRightColsLen = len(jc.rightColumns) 217 | 218 | // We will end up needing to iterate over the columns for left in step so join them back together. 219 | jc.leftColumns = append(jc.leftColumns, leftDf.RejectColumns(columnNames...)...) 220 | jc.rightColumns = append(jc.rightColumns, rightDf.RejectColumns(columnNames...)...) 221 | 222 | // Keep track of the lengths. Now that we have appended the other columns. 
223 | jc.additionalLeftColsLen = len(jc.leftColumns) - jc.matchingLeftColsLen 224 | jc.additionalRightColsLen = len(jc.rightColumns) - jc.matchingRightColsLen 225 | 226 | // get all the fields that make up the schema 227 | fields := make([]arrow.Field, 0, jc.matchingLeftColsLen+jc.additionalLeftColsLen+jc.additionalRightColsLen) 228 | for i := 0; i < len(jc.leftColumns); i++ { 229 | fields = append(fields, jc.leftColumns[i].Field()) 230 | } 231 | for i := jc.matchingRightColsLen; i < len(jc.rightColumns); i++ { 232 | fcopy := jc.rightColumns[i].Field() 233 | if forceNullable { 234 | // This column's values must be nullable since there may not be any matches. 235 | fcopy.Nullable = true 236 | } 237 | // If there are any existing fields that have this name we must change the names. 238 | name := fcopy.Name 239 | // Start at the end of the matching ones because those clearly wont have a conflict. 240 | for i := jc.matchingLeftColsLen; i < len(jc.leftColumns); i++ { 241 | if fields[i].Name == name { 242 | fields[i].Name = fmt.Sprintf("%s%s", name, cfg.lsuffix) 243 | fcopy.Name = fmt.Sprintf("%s%s", name, cfg.rsuffix) 244 | break 245 | } 246 | } 247 | 248 | fields = append(fields, fcopy) 249 | } 250 | 251 | jc.schema = arrow.NewSchema(fields, nil) 252 | jc.recordBuilder = array.NewRecordBuilder(m.mem, jc.schema) 253 | jc.smartBuilder = NewSmartBuilder(jc.recordBuilder, jc.schema) 254 | 255 | return jc, nil 256 | } 257 | 258 | func (jc *joinFuncConfig) Release() { 259 | jc.recordBuilder.Release() 260 | } 261 | 262 | func (jc *joinFuncConfig) buildDataFrame() (*DataFrame, error) { 263 | rec := jc.recordBuilder.NewRecord() 264 | defer rec.Release() 265 | return NewDataFrame(jc.mutator.mem, jc.schema, rec.Columns()) 266 | } 267 | 268 | // This leftJoin implementation is shared by both LeftJoin and RightJoin. 269 | // Acts like SQL in that nil elements are treated as unknown so nil != nil. 
// leftJoin runs the shared left-join pass over leftDf and rightDf and returns
// the populated joinFuncConfig; the caller (e.g. OuterJoin below) finishes
// building the resulting DataFrame from it and is responsible for Release.
// Acts like SQL in that nil elements are treated as unknown so nil != nil.
func (m *Mutator) leftJoin(cfg *leftJoinConfig, leftDf *DataFrame, rightDf *DataFrame, columnNames []string) (*joinFuncConfig, error) {
	data, err := m.newJoinFuncConfig(cfg, leftDf, rightDf, columnNames, true)
	if err != nil {
		return nil, err
	}

	sharedLeftJoinLogic(data, func(appendEmptyRow bool, leftStepValues *iterator.StepValue) {
		if appendEmptyRow {
			// If nothing matched then we append the row once with nil for additional right columns.
			cIdx := 0

			// Add all the values from left columns.
			for i := range leftStepValues.Values {
				data.smartBuilder.Append(cIdx, leftStepValues.Values[i])
				cIdx++
			}

			for i := 0; i < data.additionalRightColsLen; i++ {
				// cIdx is the offset to the start of the additionalRightCols in smartBuilder.
				data.smartBuilder.Append(cIdx+i, nil)
			}
		}
	})

	return data, nil
}

// sharedLeftJoinLogic is the nested-loop core shared by the left-style joins:
// for every left row it scans every right row (O(left*right)), appending one
// combined output row per match, then calls iterationEndFunc with a flag that
// is true when the left row matched nothing (so LeftJoin can null-pad and
// InnerJoin can ignore it).
// Acts like SQL in that nil elements are treated as unknown so nil != nil.
func sharedLeftJoinLogic(data *joinFuncConfig, iterationEndFunc func(bool, *iterator.StepValue)) {
	// What I want here is a step iterator for the matchingLeftCols.
	leftMatchingIterator := iterator.NewStepIteratorForColumns(data.leftColumns)
	defer leftMatchingIterator.Release()
	for leftMatchingIterator.Next() { // Iterate through every row in the left df.
		leftStepValues := leftMatchingIterator.Values()
		// If we don't find a match, we'll need to append an empty row.
		appendEmptyRow := true

		// The inner scan runs inside its own closure so the right iterator's
		// deferred Release fires once per left row, not at function exit.
		func() {
			// What I want here is a step iterator for the matchingRightCols.
			rightMatchingIterator := iterator.NewStepIteratorForColumns(data.rightColumns)
			defer rightMatchingIterator.Release()
			for rightMatchingIterator.Next() { // Iterate through every row in the right df.
				rightStepValues := rightMatchingIterator.Values()
				match := true

				// For each matching column,
				// check if the row on the left,
				// matches with the rows on the right.
				for columnIndex := range data.columnNames {
					match = match && stepValueEqAt(leftStepValues, rightStepValues, columnIndex)
				}

				if match {
					// For each match, we append a new row with the
					// left columns values and the additional right column values.
					appendEmptyRow = false

					// Keep track of the number of columns we need to offset by so we know what index we are on.
					cIdx := 0

					// Add all the values from left columns.
					for i := range leftStepValues.Values {
						data.smartBuilder.Append(cIdx, leftStepValues.Values[i])
						cIdx++
					}

					// Do the dance we did above and append the elements to each column for additionalRightCols.
					for i := data.matchingRightColsLen; i < len(data.rightColumns); i++ {
						value := rightStepValues.Values[i]
						data.smartBuilder.Append(cIdx, value)
						cIdx++
					}
				}
			}
		}()
		iterationEndFunc(appendEmptyRow, leftStepValues)
	}
}

// InnerJoin returns a DataFrame containing the inner join of two DataFrames.
// Acts like SQL in that nil elements are treated as unknown so nil != nil.
func (m *Mutator) InnerJoin(rightDf *DataFrame, columnNames []string, opts ...Option) MutationFunc {
	cfg, err := newLeftJoinConfig(opts...)
	return func(leftDf *DataFrame) (*DataFrame, error) {
		// The option-parsing error is surfaced when the MutationFunc runs,
		// keeping the outer signature free of an error return.
		if err != nil {
			return nil, err
		}

		data, err := m.newJoinFuncConfig(cfg, leftDf, rightDf, columnNames, false)
		if err != nil {
			return nil, err
		}
		defer data.Release()

		// InnerJoin is basically LeftJoin without appending nulls in iterationEndFunc so we stub that callback.
		sharedLeftJoinLogic(data, func(bool, *iterator.StepValue) {})

		return data.buildDataFrame()
	}
}

// OuterJoin returns a DataFrame containing the outer join of two DataFrames.
// Use union of keys from both frames, similar to a SQL full outer join.
// Acts like SQL in that nil elements are treated as unknown so nil != nil.
func (m *Mutator) OuterJoin(rightDf *DataFrame, columnNames []string, opts ...Option) MutationFunc {
	cfg, err := newLeftJoinConfig(opts...)
	return func(leftDf *DataFrame) (*DataFrame, error) {
		if err != nil {
			return nil, err
		}

		// First do a full left join; that covers every left row.
		data, err := m.leftJoin(cfg, leftDf, rightDf, columnNames)
		if err != nil {
			return nil, err
		}
		defer data.Release()

		// Now we iterate over the right first, appending only the right rows
		// that matched no left row (the left-matched ones are already built).
		rightIterator := iterator.NewStepIteratorForColumns(data.rightColumns)
		defer rightIterator.Release()
		for rightIterator.Next() { // Iterate through every row in the right df.
			rightStepValues := rightIterator.Values()
			// If we don't find a match, we'll need to append an empty row.

			if !outerJoinAnyRowsMatch(rightStepValues, data) {
				// Keep track of the number of columns we need to offset by so we know what index we are on.
				cIdx := 0

				// Add all the values from right matching columns.
				for i := 0; i < data.matchingRightColsLen; i++ {
					value := rightStepValues.Values[i]
					data.smartBuilder.Append(cIdx, value)
					cIdx++
				}

				// Add nil for not matching left columns.
				for i := 0; i < data.additionalLeftColsLen; i++ {
					data.smartBuilder.Append(cIdx, nil)
					cIdx++
				}

				// Add the additional values from the right.
				for i := data.matchingRightColsLen; i < data.matchingRightColsLen+data.additionalRightColsLen; i++ {
					value := rightStepValues.Values[i]
					data.smartBuilder.Append(cIdx, value)
					cIdx++
				}
			}
		}

		return data.buildDataFrame()
	}
}

// outerJoinAnyRowsMatch reports whether any left row matches rightStepValues
// on all of the join columns. Note this rescans the whole left side per right
// row, so OuterJoin's right pass is O(left*right) like the join core itself.
func outerJoinAnyRowsMatch(rightStepValues *iterator.StepValue, data *joinFuncConfig) bool {
	leftIterator := iterator.NewStepIteratorForColumns(data.leftColumns)
	defer leftIterator.Release()
	for leftIterator.Next() { // Iterate through every row in the left df.
		leftStepValues := leftIterator.Values()
		match := true

		// For each matching column,
		// check if the row on the left,
		// matches with the rows on the right.
		for columnIndex := range data.columnNames {
			match = match && stepValueEqAt(leftStepValues, rightStepValues, columnIndex)
		}

		if match {
			return true
		}
	}

	return false
}

// CrossJoin returns a DataFrame containing the cross join of two DataFrames.
// Every left row is paired with every right row; no columnNames are passed to
// newJoinFuncConfig because there is no match condition.
func (m *Mutator) CrossJoin(rightDf *DataFrame, opts ...Option) MutationFunc {
	cfg, err := newLeftJoinConfig(opts...)
	return func(leftDf *DataFrame) (*DataFrame, error) {
		if err != nil {
			return nil, err
		}

		data, err := m.newJoinFuncConfig(cfg, leftDf, rightDf, nil, false)
		if err != nil {
			return nil, err
		}
		defer data.Release()

		leftMatchingIterator := iterator.NewStepIteratorForColumns(data.leftColumns)
		defer leftMatchingIterator.Release()
		for leftMatchingIterator.Next() { // Iterate through every row in the left df.
			leftStepValues := leftMatchingIterator.Values()

			// Closure scoping so the right iterator is released per left row.
			func() {
				rightMatchingIterator := iterator.NewStepIteratorForColumns(data.rightColumns)
				defer rightMatchingIterator.Release()
				for rightMatchingIterator.Next() { // Iterate through every row in the right df.
					rightStepValues := rightMatchingIterator.Values()

					cIdx := 0

					// Add all columns from both frames.
					for i := range leftStepValues.Values {
						data.smartBuilder.Append(cIdx, leftStepValues.Values[i])
						cIdx++
					}
					for i := range rightStepValues.Values {
						data.smartBuilder.Append(cIdx, rightStepValues.Values[i])
						cIdx++
					}
				}
			}()
		}

		return data.buildDataFrame()
	}
}

// stepValueEqAt reports whether the i-th values of left and right are equal,
// comparing them via the Element API. It panics if Eq returns an error (i.e.
// the two elements cannot be compared).
func stepValueEqAt(left *iterator.StepValue, right *iterator.StepValue, i int) bool {
	lElem := StepValueElementAt(left, i)
	rElem := StepValueElementAt(right, i)

	v, err := lElem.Eq(rElem)
	if err != nil {
		panic(err)
	}

	return v
}
22 | func NewSmartBuilder(recordBuilder *array.RecordBuilder, schema *arrow.Schema) *SmartBuilder { 23 | sb := &SmartBuilder{ 24 | recordBuilder: recordBuilder, 25 | schema: schema, 26 | fieldAppenders: make([]AppenderFunc, 0, len(schema.Fields())), 27 | } 28 | 29 | fields := sb.schema.Fields() 30 | for i := range fields { 31 | fn := initFieldAppender(&fields[i]) 32 | sb.fieldAppenders = append(sb.fieldAppenders, fn) 33 | } 34 | 35 | return sb 36 | } 37 | 38 | // Append will append the value to the builder. 39 | func (sb *SmartBuilder) Append(fieldIndex int, v interface{}) { 40 | field := sb.recordBuilder.Field(fieldIndex) 41 | appendFunc := sb.fieldAppenders[fieldIndex] 42 | if appendFunc == nil { 43 | fmt.Fprintln(os.Stderr, "warn: appendFunc is nil") 44 | } 45 | appendFunc(field, v) 46 | } 47 | 48 | func initFieldAppender(field *arrow.Field) AppenderFunc { 49 | switch field.Type.(type) { 50 | case *arrow.BooleanType: 51 | return func(field array.Builder, v interface{}) { 52 | builder := field.(*array.BooleanBuilder) 53 | if v == nil { 54 | builder.AppendNull() 55 | } else { 56 | vT := v.(bool) 57 | builder.Append(vT) 58 | } 59 | } 60 | case *arrow.Int8Type: 61 | return func(field array.Builder, v interface{}) { 62 | builder := field.(*array.Int8Builder) 63 | if v == nil { 64 | builder.AppendNull() 65 | } else { 66 | vT := v.(int8) 67 | builder.Append(vT) 68 | } 69 | } 70 | case *arrow.Int16Type: 71 | return func(field array.Builder, v interface{}) { 72 | builder := field.(*array.Int16Builder) 73 | if v == nil { 74 | builder.AppendNull() 75 | } else { 76 | vT := v.(int16) 77 | builder.Append(vT) 78 | } 79 | } 80 | case *arrow.Int32Type: 81 | return func(field array.Builder, v interface{}) { 82 | builder := field.(*array.Int32Builder) 83 | if v == nil { 84 | builder.AppendNull() 85 | } else { 86 | vT := v.(int32) 87 | builder.Append(vT) 88 | } 89 | } 90 | case *arrow.Int64Type: 91 | return func(field array.Builder, v interface{}) { 92 | builder := 
field.(*array.Int64Builder) 93 | if v == nil { 94 | builder.AppendNull() 95 | } else { 96 | vT := v.(int64) 97 | builder.Append(vT) 98 | } 99 | } 100 | case *arrow.Uint8Type: 101 | return func(field array.Builder, v interface{}) { 102 | builder := field.(*array.Uint8Builder) 103 | if v == nil { 104 | builder.AppendNull() 105 | } else { 106 | vT := v.(uint8) 107 | builder.Append(vT) 108 | } 109 | } 110 | case *arrow.Uint16Type: 111 | return func(field array.Builder, v interface{}) { 112 | builder := field.(*array.Uint16Builder) 113 | if v == nil { 114 | builder.AppendNull() 115 | } else { 116 | vT := v.(uint16) 117 | builder.Append(vT) 118 | } 119 | } 120 | case *arrow.Uint32Type: 121 | return func(field array.Builder, v interface{}) { 122 | builder := field.(*array.Uint32Builder) 123 | if v == nil { 124 | builder.AppendNull() 125 | } else { 126 | vT := v.(uint32) 127 | builder.Append(vT) 128 | } 129 | } 130 | case *arrow.Uint64Type: 131 | return func(field array.Builder, v interface{}) { 132 | builder := field.(*array.Uint64Builder) 133 | if v == nil { 134 | builder.AppendNull() 135 | } else { 136 | vT := v.(uint64) 137 | builder.Append(vT) 138 | } 139 | } 140 | case *arrow.Float32Type: 141 | return func(field array.Builder, v interface{}) { 142 | builder := field.(*array.Float32Builder) 143 | if v == nil { 144 | builder.AppendNull() 145 | } else { 146 | vT := v.(float32) 147 | builder.Append(vT) 148 | } 149 | } 150 | case *arrow.Float64Type: 151 | return func(field array.Builder, v interface{}) { 152 | builder := field.(*array.Float64Builder) 153 | if v == nil { 154 | builder.AppendNull() 155 | } else { 156 | vT := v.(float64) 157 | builder.Append(vT) 158 | } 159 | } 160 | case *arrow.StringType: 161 | return func(field array.Builder, v interface{}) { 162 | builder := field.(*array.StringBuilder) 163 | if v == nil { 164 | builder.AppendNull() 165 | } else { 166 | vT := v.(string) 167 | builder.Append(vT) 168 | } 169 | } 170 | 171 | default: 172 | 
panic(fmt.Errorf("dataframe/smartbuilder: unhandled field type %T", field.Type)) 173 | } 174 | } 175 | -------------------------------------------------------------------------------- /dataframe/smartbuilder_test.go: -------------------------------------------------------------------------------- 1 | package dataframe 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | 7 | "github.com/apache/arrow/go/arrow" 8 | "github.com/apache/arrow/go/arrow/array" 9 | "github.com/apache/arrow/go/arrow/memory" 10 | ) 11 | 12 | func TestNewSmartBuilder(t *testing.T) { 13 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 14 | defer pool.AssertSize(t, 0) 15 | 16 | schema := arrow.NewSchema( 17 | []arrow.Field{ 18 | {Name: COL0NAME, Type: arrow.PrimitiveTypes.Int32}, 19 | {Name: COL1NAME, Type: arrow.PrimitiveTypes.Float64}, 20 | }, 21 | nil, 22 | ) 23 | 24 | b := array.NewRecordBuilder(pool, schema) 25 | defer b.Release() 26 | 27 | smartBuilder := NewSmartBuilder(b, schema) 28 | 29 | int32Vals := []int32{1, 2, 3, 4, 5, 6, 7, 8, 9} 30 | for _, v := range int32Vals { 31 | smartBuilder.Append(0, v) 32 | } 33 | smartBuilder.Append(0, nil) 34 | 35 | float64Vals := []float64{1, 2, 3, 4, 5, 6, 7, 8, 9} 36 | for _, v := range float64Vals { 37 | smartBuilder.Append(1, v) 38 | } 39 | smartBuilder.Append(1, nil) 40 | 41 | rec1 := b.NewRecord() 42 | defer rec1.Release() 43 | 44 | cols := make([]array.Column, 0, len(rec1.Columns())) 45 | for i, cI := range rec1.Columns() { 46 | field := rec1.Schema().Field(i) 47 | chunk := array.NewChunked(field.Type, []array.Interface{cI}) 48 | col := array.NewColumn(field, chunk) 49 | defer col.Release() 50 | cols = append(cols, *col) 51 | chunk.Release() 52 | } 53 | 54 | df, err := NewDataFrameFromColumns(pool, cols) 55 | if err != nil { 56 | t.Fatal(err) 57 | } 58 | defer df.Release() 59 | 60 | got := df.Display(-1) 61 | want := `rec[0]["f1-i32"]: [1 2 3 4 5 6 7 8 9 (null)] 62 | rec[0]["f2-f64"]: [1 2 3 4 5 6 7 8 9 (null)] 63 | ` 64 | 65 | if got != 
want { 66 | t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want) 67 | } 68 | } 69 | 70 | func buildDf(pool *memory.CheckedAllocator, dtype arrow.DataType, vals []interface{}) (*DataFrame, error) { 71 | schema := arrow.NewSchema( 72 | []arrow.Field{ 73 | {Name: fmt.Sprintf("col-%s", dtype.Name()), Type: dtype}, 74 | }, 75 | nil, 76 | ) 77 | 78 | b := array.NewRecordBuilder(pool, schema) 79 | defer b.Release() 80 | 81 | smartBuilder := NewSmartBuilder(b, schema) 82 | for i := range schema.Fields() { 83 | for j := range vals { 84 | smartBuilder.Append(i, vals[j]) 85 | } 86 | smartBuilder.Append(i, nil) 87 | } 88 | 89 | rec1 := b.NewRecord() 90 | defer rec1.Release() 91 | 92 | cols := make([]array.Column, 0, len(rec1.Columns())) 93 | for i, cI := range rec1.Columns() { 94 | field := rec1.Schema().Field(i) 95 | chunk := array.NewChunked(field.Type, []array.Interface{cI}) 96 | col := array.NewColumn(field, chunk) 97 | defer col.Release() 98 | cols = append(cols, *col) 99 | chunk.Release() 100 | } 101 | 102 | return NewDataFrameFromColumns(pool, cols) 103 | } 104 | 105 | func TestNewSmartBuilderBoolean(t *testing.T) { 106 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 107 | defer pool.AssertSize(t, 0) 108 | 109 | vals := make([]interface{}, 9) 110 | for i := range vals { 111 | vals[i] = (i%2 == 0) 112 | } 113 | df, err := buildDf(pool, arrow.FixedWidthTypes.Boolean, vals) 114 | if err != nil { 115 | t.Fatal(err) 116 | } 117 | defer df.Release() 118 | 119 | got := df.Display(-1) 120 | want := `rec[0]["col-bool"]: [true false true false true false true false true (null)] 121 | ` 122 | 123 | if got != want { 124 | t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want) 125 | } 126 | } 127 | 128 | func TestNewSmartBuilderInt8(t *testing.T) { 129 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 130 | defer pool.AssertSize(t, 0) 131 | 132 | vals := make([]interface{}, 9) 133 | for i := range vals { 134 | vals[i] = int8(i) 135 | } 136 | df, err := buildDf(pool, 
arrow.PrimitiveTypes.Int8, vals) 137 | if err != nil { 138 | t.Fatal(err) 139 | } 140 | defer df.Release() 141 | 142 | got := df.Display(-1) 143 | want := `rec[0]["col-int8"]: [0 1 2 3 4 5 6 7 8 (null)] 144 | ` 145 | 146 | if got != want { 147 | t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want) 148 | } 149 | } 150 | 151 | func TestNewSmartBuilderInt16(t *testing.T) { 152 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 153 | defer pool.AssertSize(t, 0) 154 | 155 | vals := make([]interface{}, 9) 156 | for i := range vals { 157 | vals[i] = int16(i) 158 | } 159 | df, err := buildDf(pool, arrow.PrimitiveTypes.Int16, vals) 160 | if err != nil { 161 | t.Fatal(err) 162 | } 163 | defer df.Release() 164 | 165 | got := df.Display(-1) 166 | want := `rec[0]["col-int16"]: [0 1 2 3 4 5 6 7 8 (null)] 167 | ` 168 | 169 | if got != want { 170 | t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want) 171 | } 172 | } 173 | 174 | func TestNewSmartBuilderInt32(t *testing.T) { 175 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 176 | defer pool.AssertSize(t, 0) 177 | 178 | vals := make([]interface{}, 9) 179 | for i := range vals { 180 | vals[i] = int32(i) 181 | } 182 | df, err := buildDf(pool, arrow.PrimitiveTypes.Int32, vals) 183 | if err != nil { 184 | t.Fatal(err) 185 | } 186 | defer df.Release() 187 | 188 | got := df.Display(-1) 189 | want := `rec[0]["col-int32"]: [0 1 2 3 4 5 6 7 8 (null)] 190 | ` 191 | 192 | if got != want { 193 | t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want) 194 | } 195 | } 196 | 197 | func TestNewSmartBuilderInt64(t *testing.T) { 198 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 199 | defer pool.AssertSize(t, 0) 200 | 201 | vals := make([]interface{}, 9) 202 | for i := range vals { 203 | vals[i] = int64(i) 204 | } 205 | df, err := buildDf(pool, arrow.PrimitiveTypes.Int64, vals) 206 | if err != nil { 207 | t.Fatal(err) 208 | } 209 | defer df.Release() 210 | 211 | got := df.Display(-1) 212 | want := `rec[0]["col-int64"]: [0 1 2 3 4 5 6 7 
8 (null)] 213 | ` 214 | 215 | if got != want { 216 | t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want) 217 | } 218 | } 219 | 220 | func TestNewSmartBuilderUint8(t *testing.T) { 221 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 222 | defer pool.AssertSize(t, 0) 223 | 224 | vals := make([]interface{}, 9) 225 | for i := range vals { 226 | vals[i] = uint8(i) 227 | } 228 | df, err := buildDf(pool, arrow.PrimitiveTypes.Uint8, vals) 229 | if err != nil { 230 | t.Fatal(err) 231 | } 232 | defer df.Release() 233 | 234 | got := df.Display(-1) 235 | want := `rec[0]["col-uint8"]: [0 1 2 3 4 5 6 7 8 (null)] 236 | ` 237 | 238 | if got != want { 239 | t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want) 240 | } 241 | } 242 | func TestNewSmartBuilderUint16(t *testing.T) { 243 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 244 | defer pool.AssertSize(t, 0) 245 | 246 | vals := make([]interface{}, 9) 247 | for i := range vals { 248 | vals[i] = uint16(i) 249 | } 250 | df, err := buildDf(pool, arrow.PrimitiveTypes.Uint16, vals) 251 | if err != nil { 252 | t.Fatal(err) 253 | } 254 | defer df.Release() 255 | 256 | got := df.Display(-1) 257 | want := `rec[0]["col-uint16"]: [0 1 2 3 4 5 6 7 8 (null)] 258 | ` 259 | 260 | if got != want { 261 | t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want) 262 | } 263 | } 264 | 265 | func TestNewSmartBuilderUint32(t *testing.T) { 266 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 267 | defer pool.AssertSize(t, 0) 268 | 269 | vals := make([]interface{}, 9) 270 | for i := range vals { 271 | vals[i] = uint32(i) 272 | } 273 | df, err := buildDf(pool, arrow.PrimitiveTypes.Uint32, vals) 274 | if err != nil { 275 | t.Fatal(err) 276 | } 277 | defer df.Release() 278 | 279 | got := df.Display(-1) 280 | want := `rec[0]["col-uint32"]: [0 1 2 3 4 5 6 7 8 (null)] 281 | ` 282 | 283 | if got != want { 284 | t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want) 285 | } 286 | } 287 | 288 | func TestNewSmartBuilderUint64(t *testing.T) { 289 | pool := 
memory.NewCheckedAllocator(memory.NewGoAllocator()) 290 | defer pool.AssertSize(t, 0) 291 | 292 | vals := make([]interface{}, 9) 293 | for i := range vals { 294 | vals[i] = uint64(i) 295 | } 296 | df, err := buildDf(pool, arrow.PrimitiveTypes.Uint64, vals) 297 | if err != nil { 298 | t.Fatal(err) 299 | } 300 | defer df.Release() 301 | 302 | got := df.Display(-1) 303 | want := `rec[0]["col-uint64"]: [0 1 2 3 4 5 6 7 8 (null)] 304 | ` 305 | 306 | if got != want { 307 | t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want) 308 | } 309 | } 310 | 311 | func TestNewSmartBuilderFloat32(t *testing.T) { 312 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 313 | defer pool.AssertSize(t, 0) 314 | 315 | vals := make([]interface{}, 9) 316 | for i := range vals { 317 | vals[i] = float32(i) 318 | } 319 | df, err := buildDf(pool, arrow.PrimitiveTypes.Float32, vals) 320 | if err != nil { 321 | t.Fatal(err) 322 | } 323 | defer df.Release() 324 | 325 | got := df.Display(-1) 326 | want := `rec[0]["col-float32"]: [0 1 2 3 4 5 6 7 8 (null)] 327 | ` 328 | 329 | if got != want { 330 | t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want) 331 | } 332 | } 333 | 334 | func TestNewSmartBuilderFloat64(t *testing.T) { 335 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 336 | defer pool.AssertSize(t, 0) 337 | 338 | vals := make([]interface{}, 9) 339 | for i := range vals { 340 | vals[i] = float64(i) 341 | } 342 | df, err := buildDf(pool, arrow.PrimitiveTypes.Float64, vals) 343 | if err != nil { 344 | t.Fatal(err) 345 | } 346 | defer df.Release() 347 | 348 | got := df.Display(-1) 349 | want := `rec[0]["col-float64"]: [0 1 2 3 4 5 6 7 8 (null)] 350 | ` 351 | 352 | if got != want { 353 | t.Fatalf("\ngot=\n%v\nwant=\n%v", got, want) 354 | } 355 | } 356 | 357 | func TestNewSmartBuilderString(t *testing.T) { 358 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 359 | defer pool.AssertSize(t, 0) 360 | 361 | vals := make([]interface{}, 9) 362 | for i := range vals { 363 | vals[i] = 
// StepValueElementAt gets the value at i from the StepValue and casts it to an
// Element so callers can use the Element comparison API (e.g. Eq).
func StepValueElementAt(stepValue *iterator.StepValue, i int) Element {
	stepValueEl, dtype := stepValue.Value(i)
	return CastElement(dtype, stepValueEl)
}

// I don't want to force the DataFrame API to conform to the TableReader API.
// (i.e. forcing NumCols to return int64 doesn't make sense in Go).
// So this is a facade the DataFrame TableReader expects.

// TableFacade is a simple facade for a TableReader.
// It is just array.Table; the named type documents intent at the call site.
type TableFacade interface {
	array.Table
}

// tableReaderFacade adapts a DataFrame to the array.Table interface by
// delegating every method to the wrapped DataFrame.
type tableReaderFacade struct {
	df *DataFrame
}

// NewTableFacade creates a new TableFacade for a DataFrame.
// It does not retain the DataFrame; lifetime is managed through the facade's
// own Retain/Release, which forward to the DataFrame.
func NewTableFacade(df *DataFrame) TableFacade {
	return &tableReaderFacade{
		df: df,
	}
}

// Schema returns the schema of the underlying DataFrame.
func (f *tableReaderFacade) Schema() *arrow.Schema {
	return f.df.Schema()
}

// NumRows returns the number of rows in the underlying DataFrame.
func (f *tableReaderFacade) NumRows() int64 {
	return f.df.NumRows()
}

// NumCols returns the number of columns, widened to int64 as the
// array.Table interface requires (DataFrame.NumCols returns int).
func (f *tableReaderFacade) NumCols() int64 {
	return int64(f.df.NumCols())
}

// Column is an immutable column data structure consisting of
// a field (type metadata) and a chunked data array.
func (f *tableReaderFacade) Column(i int) *array.Column {
	return f.df.ColumnAt(i)
}

// Retain increments the reference count of the underlying DataFrame.
func (f *tableReaderFacade) Retain() {
	f.df.Retain()
}

// Release decrements the reference count of the underlying DataFrame.
func (f *tableReaderFacade) Release() {
	f.df.Release()
}
-------------------------------------------------------------------------------- 1 | /* 2 | Package bullseye provides an implementation of a DataFrame using Apache Arrow. 3 | 4 | Basics 5 | 6 | The DataFrame is an immutable heterogeneous tabular data structure with labeled columns. 7 | It stores its raw bytes using a provided Arrow Allocator by using the fundamental data 8 | structure of Array (columns), which holds a sequence of values of the same type. An array 9 | consists of memory holding the data and an additional validity bitmap that indicates if 10 | the corresponding entry in the array is valid (not null). 11 | 12 | Any DataFrames created should be released using Release() to decrement the reference 13 | and free up the memory managed by the Arrow implementation. 14 | 15 | Getting Started 16 | 17 | Look in the dataframe package to get started. 18 | */ 19 | package bullseye -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/go-bullseye/bullseye 2 | 3 | go 1.12 4 | 5 | require ( 6 | github.com/apache/arrow/go/arrow v0.0.0-20190615061817-720be32a0bb5 7 | github.com/pkg/errors v0.8.1 8 | ) 9 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/apache/arrow/go/arrow v0.0.0-20190615061817-720be32a0bb5 h1:EGCJTEx+tkmZuz6Wbc0zkA+Dgf7UXKu+126krteiZJQ= 2 | github.com/apache/arrow/go/arrow v0.0.0-20190615061817-720be32a0bb5/go.mod h1:NG5SvIQXIxzJR5lGmoXTX9R/EmkArKbPPFu0DUFSz10= 3 | github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= 4 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 5 | github.com/google/flatbuffers v1.11.0/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= 6 | github.com/pkg/errors v0.8.1
// DenseCollectionToInterface casts a slice of interfaces to an interface of the correct type.
// The target element type is inferred from the first non-nil element; every
// other non-nil element must have that same dynamic type or an error is
// returned. nil elements are left as the zero value of the target slice.
// Returns (nil, nil) for an empty or all-nil input.
//
// NOTE(review): the int case widens to []int64 — presumably because the
// downstream Arrow builders have no machine-sized int variant (confirm) —
// but the uint case returns []uint without widening; verify consumers
// accept []uint or whether it should mirror the int case as []uint64.
func DenseCollectionToInterface(elms []interface{}) (interface{}, error) {
	if len(elms) == 0 {
		return nil, nil
	}

	// find the first one that is not nil
	var first interface{}
	for i := range elms {
		if elms[i] != nil {
			first = elms[i]
			break
		}
	}

	// Every element was nil: there is no type to infer.
	if first == nil {
		return nil, nil
	}

	var ok bool
	switch v := first.(type) {
	case bool:
		arr := make([]bool, len(elms))
		for i, e := range elms {
			if e == nil {
				continue
			}
			if arr[i], ok = e.(bool); !ok {
				return nil, errors.Errorf(inconsistentDataTypesErrMsg, e, v)
			}
		}
		return arr, nil

	case int8:
		arr := make([]int8, len(elms))
		for i, e := range elms {
			if e == nil {
				continue
			}
			if arr[i], ok = e.(int8); !ok {
				return nil, errors.Errorf(inconsistentDataTypesErrMsg, e, v)
			}
		}
		return arr, nil

	case int16:
		arr := make([]int16, len(elms))
		for i, e := range elms {
			if e == nil {
				continue
			}
			if arr[i], ok = e.(int16); !ok {
				return nil, errors.Errorf(inconsistentDataTypesErrMsg, e, v)
			}
		}
		return arr, nil

	case int32:
		arr := make([]int32, len(elms))
		for i, e := range elms {
			if e == nil {
				continue
			}
			if arr[i], ok = e.(int32); !ok {
				return nil, errors.Errorf(inconsistentDataTypesErrMsg, e, v)
			}
		}
		return arr, nil

	case int64:
		arr := make([]int64, len(elms))
		for i, e := range elms {
			if e == nil {
				continue
			}
			if arr[i], ok = e.(int64); !ok {
				return nil, errors.Errorf(inconsistentDataTypesErrMsg, e, v)
			}
		}
		return arr, nil

	case uint8:
		arr := make([]uint8, len(elms))
		for i, e := range elms {
			if e == nil {
				continue
			}
			if arr[i], ok = e.(uint8); !ok {
				return nil, errors.Errorf(inconsistentDataTypesErrMsg, e, v)
			}
		}
		return arr, nil

	case uint16:
		arr := make([]uint16, len(elms))
		for i, e := range elms {
			if e == nil {
				continue
			}
			if arr[i], ok = e.(uint16); !ok {
				return nil, errors.Errorf(inconsistentDataTypesErrMsg, e, v)
			}
		}
		return arr, nil

	case uint32:
		arr := make([]uint32, len(elms))
		for i, e := range elms {
			if e == nil {
				continue
			}
			if arr[i], ok = e.(uint32); !ok {
				return nil, errors.Errorf(inconsistentDataTypesErrMsg, e, v)
			}
		}
		return arr, nil

	case uint64:
		arr := make([]uint64, len(elms))
		for i, e := range elms {
			if e == nil {
				continue
			}
			if arr[i], ok = e.(uint64); !ok {
				return nil, errors.Errorf(inconsistentDataTypesErrMsg, e, v)
			}
		}
		return arr, nil

	case float32:
		arr := make([]float32, len(elms))
		for i, e := range elms {
			if e == nil {
				continue
			}
			if arr[i], ok = e.(float32); !ok {
				return nil, errors.Errorf(inconsistentDataTypesErrMsg, e, v)
			}
		}
		return arr, nil

	case float64:
		arr := make([]float64, len(elms))
		for i, e := range elms {
			if e == nil {
				continue
			}
			if arr[i], ok = e.(float64); !ok {
				return nil, errors.Errorf(inconsistentDataTypesErrMsg, e, v)
			}
		}
		return arr, nil

	case string:
		arr := make([]string, len(elms))
		for i, e := range elms {
			if e == nil {
				continue
			}
			if arr[i], ok = e.(string); !ok {
				return nil, errors.Errorf(inconsistentDataTypesErrMsg, e, v)
			}
		}
		return arr, nil

	case uint:
		// See the NOTE(review) in the doc comment: uint is NOT widened.
		arr := make([]uint, len(elms))
		for i, e := range elms {
			if e == nil {
				continue
			}
			if arr[i], ok = e.(uint); !ok {
				return nil, errors.Errorf(inconsistentDataTypesErrMsg, e, v)
			}
		}
		return arr, nil

	case int:
		// int is widened element-by-element into []int64.
		arr := make([]int64, len(elms))
		for i, e := range elms {
			if e == nil {
				continue
			}
			vv, okk := e.(int)
			if !okk {
				return nil, errors.Errorf(inconsistentDataTypesErrMsg, e, v)
			}
			arr[i] = int64(vv)
		}
		return arr, nil

	default:
		return nil, errors.Errorf("dataframe/dense: invalid data type for %v (%T)", elms, v)
	}
}
3 | 4 | */ 5 | package cast 6 | -------------------------------------------------------------------------------- /internal/cast/sparse.go: -------------------------------------------------------------------------------- 1 | package cast 2 | 3 | import "github.com/pkg/errors" 4 | 5 | const inconsistentDataTypesErrMsg = "inconsistent data types for elements, expecting %v to be of type (%T)" 6 | 7 | // SparseCollectionToInterface casts a slice of interfaces to an interface of the correct type 8 | // for the provided sparse collection. 9 | // This should be used for sparse as it should be faster for larger arrays. 10 | func SparseCollectionToInterface(elms []interface{}, indexes []int, size int) (interface{}, error) { 11 | if len(elms) == 0 { 12 | return nil, nil 13 | } 14 | 15 | first := elms[0] 16 | 17 | var ok bool 18 | switch v := first.(type) { 19 | case bool: 20 | arr := make([]bool, size) 21 | for i, idx := range indexes { 22 | e := elms[i] 23 | if e == nil { 24 | continue 25 | } 26 | if arr[idx], ok = e.(bool); !ok { 27 | return nil, errors.Errorf(inconsistentDataTypesErrMsg, e, v) 28 | } 29 | } 30 | return arr, nil 31 | 32 | case int8: 33 | arr := make([]int8, size) 34 | for i, idx := range indexes { 35 | e := elms[i] 36 | if e == nil { 37 | continue 38 | } 39 | if arr[idx], ok = e.(int8); !ok { 40 | return nil, errors.Errorf(inconsistentDataTypesErrMsg, e, v) 41 | } 42 | } 43 | return arr, nil 44 | 45 | case int16: 46 | arr := make([]int16, size) 47 | for i, idx := range indexes { 48 | e := elms[i] 49 | if e == nil { 50 | continue 51 | } 52 | if arr[idx], ok = e.(int16); !ok { 53 | return nil, errors.Errorf(inconsistentDataTypesErrMsg, e, v) 54 | } 55 | } 56 | return arr, nil 57 | 58 | case int32: 59 | arr := make([]int32, size) 60 | for i, idx := range indexes { 61 | e := elms[i] 62 | if e == nil { 63 | continue 64 | } 65 | if arr[idx], ok = e.(int32); !ok { 66 | return nil, errors.Errorf(inconsistentDataTypesErrMsg, e, v) 67 | } 68 | } 69 | return arr, nil 
70 | 71 | case int64: 72 | arr := make([]int64, size) 73 | for i, idx := range indexes { 74 | e := elms[i] 75 | if e == nil { 76 | continue 77 | } 78 | if arr[idx], ok = e.(int64); !ok { 79 | return nil, errors.Errorf(inconsistentDataTypesErrMsg, e, v) 80 | } 81 | } 82 | return arr, nil 83 | 84 | case uint8: 85 | arr := make([]uint8, size) 86 | for i, idx := range indexes { 87 | e := elms[i] 88 | if e == nil { 89 | continue 90 | } 91 | if arr[idx], ok = e.(uint8); !ok { 92 | return nil, errors.Errorf(inconsistentDataTypesErrMsg, e, v) 93 | } 94 | } 95 | return arr, nil 96 | 97 | case uint16: 98 | arr := make([]uint16, size) 99 | for i, idx := range indexes { 100 | e := elms[i] 101 | if e == nil { 102 | continue 103 | } 104 | if arr[idx], ok = e.(uint16); !ok { 105 | return nil, errors.Errorf(inconsistentDataTypesErrMsg, e, v) 106 | } 107 | } 108 | return arr, nil 109 | 110 | case uint32: 111 | arr := make([]uint32, size) 112 | for i, idx := range indexes { 113 | e := elms[i] 114 | if e == nil { 115 | continue 116 | } 117 | if arr[idx], ok = e.(uint32); !ok { 118 | return nil, errors.Errorf(inconsistentDataTypesErrMsg, e, v) 119 | } 120 | } 121 | return arr, nil 122 | 123 | case uint64: 124 | arr := make([]uint64, size) 125 | for i, idx := range indexes { 126 | e := elms[i] 127 | if e == nil { 128 | continue 129 | } 130 | if arr[idx], ok = e.(uint64); !ok { 131 | return nil, errors.Errorf(inconsistentDataTypesErrMsg, e, v) 132 | } 133 | } 134 | return arr, nil 135 | 136 | case float32: 137 | arr := make([]float32, size) 138 | for i, idx := range indexes { 139 | e := elms[i] 140 | if e == nil { 141 | continue 142 | } 143 | if arr[idx], ok = e.(float32); !ok { 144 | return nil, errors.Errorf(inconsistentDataTypesErrMsg, e, v) 145 | } 146 | } 147 | return arr, nil 148 | 149 | case float64: 150 | arr := make([]float64, size) 151 | for i, idx := range indexes { 152 | e := elms[i] 153 | if e == nil { 154 | continue 155 | } 156 | if arr[idx], ok = e.(float64); !ok { 157 | 
return nil, errors.Errorf(inconsistentDataTypesErrMsg, e, v) 158 | } 159 | } 160 | return arr, nil 161 | 162 | case string: 163 | arr := make([]string, size) 164 | for i, idx := range indexes { 165 | e := elms[i] 166 | if e == nil { 167 | continue 168 | } 169 | if arr[idx], ok = e.(string); !ok { 170 | return nil, errors.Errorf(inconsistentDataTypesErrMsg, e, v) 171 | } 172 | } 173 | return arr, nil 174 | 175 | case uint: 176 | arr := make([]uint, size) 177 | for i, idx := range indexes { 178 | e := elms[i] 179 | if e == nil { 180 | continue 181 | } 182 | if arr[idx], ok = e.(uint); !ok { 183 | return nil, errors.Errorf(inconsistentDataTypesErrMsg, e, v) 184 | } 185 | } 186 | return arr, nil 187 | 188 | case int: 189 | arr := make([]int64, size) 190 | for i, idx := range indexes { 191 | e := elms[i] 192 | if e == nil { 193 | continue 194 | } 195 | vv, okk := e.(int) 196 | if !okk { 197 | return nil, errors.Errorf(inconsistentDataTypesErrMsg, e, v) 198 | } 199 | arr[idx] = int64(vv) 200 | } 201 | return arr, nil 202 | 203 | default: 204 | return nil, errors.Errorf("dataframe/sparse: invalid data type for %v (%T)", elms, v) 205 | } 206 | } 207 | -------------------------------------------------------------------------------- /internal/constructors/doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package constructors provides constructors for arrow types. 
3 | 4 | */ 5 | package constructors 6 | -------------------------------------------------------------------------------- /internal/constructors/interface.go: -------------------------------------------------------------------------------- 1 | package constructors 2 | 3 | import ( 4 | "github.com/apache/arrow/go/arrow" 5 | "github.com/apache/arrow/go/arrow/array" 6 | "github.com/apache/arrow/go/arrow/memory" 7 | "github.com/go-bullseye/bullseye/internal/cast" 8 | "github.com/pkg/errors" 9 | ) 10 | 11 | // NewInterfaceFromMem builds a new column from memory 12 | // valid is an optional array of booleans. If not specified, all values are valid. 13 | func NewInterfaceFromMem(mem memory.Allocator, name string, values interface{}, valid []bool) (array.Interface, *arrow.Field, error) { 14 | var arr array.Interface 15 | 16 | switch v := values.(type) { 17 | case []bool: 18 | bld := array.NewBooleanBuilder(mem) 19 | defer bld.Release() 20 | 21 | bld.AppendValues(v, valid) 22 | arr = bld.NewArray() 23 | 24 | case []int8: 25 | bld := array.NewInt8Builder(mem) 26 | defer bld.Release() 27 | 28 | bld.AppendValues(v, valid) 29 | arr = bld.NewArray() 30 | 31 | case []int16: 32 | bld := array.NewInt16Builder(mem) 33 | defer bld.Release() 34 | 35 | bld.AppendValues(v, valid) 36 | arr = bld.NewArray() 37 | 38 | case []int32: 39 | bld := array.NewInt32Builder(mem) 40 | defer bld.Release() 41 | 42 | bld.AppendValues(v, valid) 43 | arr = bld.NewArray() 44 | 45 | case []int64: 46 | bld := array.NewInt64Builder(mem) 47 | defer bld.Release() 48 | 49 | bld.AppendValues(v, valid) 50 | arr = bld.NewArray() 51 | 52 | case []uint8: 53 | bld := array.NewUint8Builder(mem) 54 | defer bld.Release() 55 | 56 | bld.AppendValues(v, valid) 57 | arr = bld.NewArray() 58 | 59 | case []uint16: 60 | bld := array.NewUint16Builder(mem) 61 | defer bld.Release() 62 | 63 | bld.AppendValues(v, valid) 64 | arr = bld.NewArray() 65 | 66 | case []uint32: 67 | bld := array.NewUint32Builder(mem) 68 | defer bld.Release() 
69 | 70 | bld.AppendValues(v, valid) 71 | arr = bld.NewArray() 72 | 73 | case []uint64: 74 | bld := array.NewUint64Builder(mem) 75 | defer bld.Release() 76 | 77 | bld.AppendValues(v, valid) 78 | arr = bld.NewArray() 79 | 80 | case []float32: 81 | bld := array.NewFloat32Builder(mem) 82 | defer bld.Release() 83 | 84 | bld.AppendValues(v, valid) 85 | arr = bld.NewArray() 86 | 87 | case []float64: 88 | bld := array.NewFloat64Builder(mem) 89 | defer bld.Release() 90 | 91 | bld.AppendValues(v, valid) 92 | arr = bld.NewArray() 93 | 94 | case []string: 95 | bld := array.NewStringBuilder(mem) 96 | defer bld.Release() 97 | 98 | bld.AppendValues(v, valid) 99 | arr = bld.NewArray() 100 | 101 | case []uint: 102 | bld := array.NewUint64Builder(mem) 103 | defer bld.Release() 104 | 105 | vs := make([]uint64, len(v)) 106 | for i, e := range v { 107 | vs[i] = uint64(e) 108 | } 109 | 110 | bld.AppendValues(vs, valid) 111 | arr = bld.NewArray() 112 | 113 | case []int: 114 | bld := array.NewInt64Builder(mem) 115 | defer bld.Release() 116 | 117 | vs := make([]int64, len(v)) 118 | for i, e := range v { 119 | vs[i] = int64(e) 120 | } 121 | 122 | bld.AppendValues(vs, valid) 123 | arr = bld.NewArray() 124 | 125 | case []interface{}: 126 | validDense := valid 127 | if len(validDense) == 0 { 128 | // build valid mask 129 | validDense = make([]bool, len(v)) 130 | for idx, value := range v { 131 | validDense[idx] = value != nil 132 | } 133 | } 134 | ifaceDense, err := cast.DenseCollectionToInterface(v) 135 | if err != nil { 136 | return nil, nil, err 137 | } 138 | return NewInterfaceFromMem(mem, name, ifaceDense, validDense) 139 | 140 | default: 141 | err := errors.Errorf("dataframe/interface: invalid data type for %q (%T)", name, v) 142 | return nil, nil, err 143 | } 144 | 145 | field := &arrow.Field{Name: name, Type: arr.DataType()} 146 | return arr, field, nil 147 | } 148 | -------------------------------------------------------------------------------- /internal/debug/assert_disabled.go: 
--------------------------------------------------------------------------------

// +build !assert

package debug

// Assert will panic with msg if cond is false.
// This is the no-op variant compiled when the "assert" build tag is absent,
// so assertions cost nothing in normal builds.
func Assert(cond bool, msg interface{}) {}

-------------------------------------------------------------------------------- /internal/debug/assert_enabled.go: --------------------------------------------------------------------------------

// +build assert

package debug

// Assert will panic with msg if cond is false.
// Compiled in only when building with the "assert" tag; pairs with the
// no-op variant in assert_disabled.go.
func Assert(cond bool, msg interface{}) {
	if !cond {
		panic(msg)
	}
}

-------------------------------------------------------------------------------- /internal/debug/debug_disabled.go: --------------------------------------------------------------------------------

// +build !debug

package debug

// Debug is the no-op variant of debug-level logging, compiled when the
// "debug" build tag is absent.
func Debug(interface{}) {}

-------------------------------------------------------------------------------- /internal/debug/debug_enabled.go: --------------------------------------------------------------------------------

// +build debug

package debug

import (
	"log"
	"os"
)

var (
	// debug is the debug-level logger, writing to stderr with a "(debug) " prefix.
	debug = log.New(os.Stderr, "(debug) ", log.LstdFlags)
)

// Debug logs msg at debug level. Compiled in only with the "debug" build tag.
func Debug(msg interface{}) {
	debug.Print(msg)
}

-------------------------------------------------------------------------------- /internal/debug/doc.go: --------------------------------------------------------------------------------

/*
Package debug provides compiled assertions, debug and warn level logging.

To enable runtime debug or warn level logging, build with the debug or warn tags
respectively. Building with the debug tag will enable the warn level logger automatically.
When the debug and warn tags are omitted, the code for the logging will be omitted from
the binary.

To enable runtime assertions, build with the assert tag.
When the assert tag is omitted,
the code for the assertions will be omitted from the binary.
*/
package debug

-------------------------------------------------------------------------------- /internal/debug/warn_disabled.go: --------------------------------------------------------------------------------

// +build !debug,!warn

package debug

// Warn is the no-op variant of warn-level logging, compiled when neither
// the "debug" nor the "warn" build tag is set.
func Warn(interface{}) {}

// Warnf is the no-op variant of formatted warn-level logging.
func Warnf(format string, v ...interface{}) {}

-------------------------------------------------------------------------------- /internal/debug/warn_enabled.go: --------------------------------------------------------------------------------

// +build debug warn

package debug

import (
	"log"
	"os"
)

var (
	// warn is the warn-level logger, writing to stderr with a "(warn) " prefix.
	warn = log.New(os.Stderr, "(warn) ", log.LstdFlags)
)

// Warn logs msg at warn level. Compiled in with the "debug" or "warn" tag.
func Warn(msg interface{}) {
	warn.Print(msg)
}

// Warnf logs a formatted message at warn level.
func Warnf(format string, v ...interface{}) {
	warn.Printf(format, v...)
}

-------------------------------------------------------------------------------- /iterator/booleaniterator.go: --------------------------------------------------------------------------------

package iterator

import (
	"sync/atomic"

	"github.com/apache/arrow/go/arrow/array"
	"github.com/go-bullseye/bullseye/internal/debug"
)

// BooleanValueIterator is an iterator for reading an Arrow Column value by value.
type BooleanValueIterator struct {
	refCount      int64
	chunkIterator *ChunkIterator

	// Things we need to maintain for the iterator
	index int            // current value index within ref
	ref   *array.Boolean // the chunk reference (retained; swapped as chunks advance)
	done  bool           // there are no more elements for this iterator
}

// NewBooleanValueIterator creates a new BooleanValueIterator for reading an Arrow Column.
func NewBooleanValueIterator(col *array.Column) *BooleanValueIterator {
	// We need a ChunkIterator to read the chunks
	chunkIterator := NewChunkIterator(col)

	return &BooleanValueIterator{
		refCount:      1,
		chunkIterator: chunkIterator,

		index: 0,
		ref:   nil,
	}
}

// Value will return the current value that the iterator is on and boolean value indicating if the value is actually null.
// NOTE: the second result is true when the value IS null (it is ref.IsNull),
// i.e. the opposite of a comma-ok "valid" flag.
func (vr *BooleanValueIterator) Value() (bool, bool) {
	return vr.ref.Value(vr.index), vr.ref.IsNull(vr.index)
}

// ValuePointer will return a pointer to the current value that the iterator is on. It will return nil if the value is actually null.
func (vr *BooleanValueIterator) ValuePointer() *bool {
	if vr.ref.IsNull(vr.index) {
		return nil
	}
	value := vr.ref.Value(vr.index)
	return &value
}

// ValueInterface returns the value as an interface{}.
// It returns nil (rather than a boxed bool) for null entries.
func (vr *BooleanValueIterator) ValueInterface() interface{} {
	if vr.ref.IsNull(vr.index) {
		return nil
	}
	return vr.ref.Value(vr.index)
}

// Next moves the iterator to the next value. This will return false
// when there are no more values.
// The iterator starts positioned before the first element, so Next must be
// called once before the first Value/ValuePointer/ValueInterface call.
func (vr *BooleanValueIterator) Next() bool {
	if vr.done {
		return false
	}

	// Move the index up
	vr.index++

	// Keep moving the chunk up until we get one with data
	// (zero-length chunks are skipped; nextChunk resets vr.index to 0).
	for vr.ref == nil || vr.index >= vr.ref.Len() {
		if !vr.nextChunk() {
			// There were no more chunks with data in them
			vr.done = true
			return false
		}
	}

	return true
}

// nextChunk advances the underlying chunk iterator, swapping the retained
// chunk reference over to the new chunk and resetting index to 0.
// It returns false when there are no more chunks.
func (vr *BooleanValueIterator) nextChunk() bool {
	// Advance the chunk until we get one with data in it or we are done
	if !vr.chunkIterator.Next() {
		// No more chunks
		return false
	}

	// There was another chunk.
	// We maintain the ref and the values because the ref is going to allow us to retain the memory.
	ref := vr.chunkIterator.Chunk()
	ref.Retain()

	// Release the previous chunk only after retaining the new one.
	if vr.ref != nil {
		vr.ref.Release()
	}

	vr.ref = ref.(*array.Boolean)
	vr.index = 0
	return true
}

// Retain keeps a reference to the BooleanValueIterator
func (vr *BooleanValueIterator) Retain() {
	atomic.AddInt64(&vr.refCount, 1)
}

// Release removes a reference to the BooleanValueIterator.
// When the count reaches zero it releases the chunk iterator and any
// retained chunk; the iterator must not be used afterwards.
func (vr *BooleanValueIterator) Release() {
	debug.Assert(atomic.LoadInt64(&vr.refCount) > 0, "too many releases")

	if atomic.AddInt64(&vr.refCount, -1) == 0 {
		if vr.chunkIterator != nil {
			vr.chunkIterator.Release()
			vr.chunkIterator = nil
		}

		if vr.ref != nil {
			vr.ref.Release()
			vr.ref = nil
		}
	}
}

-------------------------------------------------------------------------------- /iterator/chunkiterator.gen.go: --------------------------------------------------------------------------------

// Code generated by iterator/chunkiterator.gen.go.tmpl. DO NOT EDIT.

package iterator

import (
	"sync/atomic"

	"github.com/apache/arrow/go/arrow"
	"github.com/apache/arrow/go/arrow/array"
	"github.com/go-bullseye/bullseye/internal/debug"
)

// Int64ChunkIterator is an iterator for reading an Arrow Column value by value.
type Int64ChunkIterator struct {
	refCount int64
	col      *array.Column

	// Things Chunked maintains. We're going to maintain it ourselves.
	chunks []*array.Int64 // cache the chunks on this iterator
	length int64          // this isn't set right on Chunked so we won't rely on it there. Instead we keep the correct value here.
21 | nulls int64 22 | dtype arrow.DataType 23 | 24 | // Things we need to maintain for the iterator 25 | currentIndex int // current chunk 26 | currentChunk *array.Int64 // current chunk 27 | } 28 | 29 | // NewInt64ChunkIterator creates a new Int64ChunkIterator for reading an Arrow Column. 30 | func NewInt64ChunkIterator(col *array.Column) *Int64ChunkIterator { 31 | col.Retain() 32 | 33 | // Chunked is not using the correct type to keep track of length so we have to recalculate it. 34 | columnChunks := col.Data().Chunks() 35 | chunks := make([]*array.Int64, len(columnChunks)) 36 | var length int64 37 | var nulls int64 38 | 39 | for i, chunk := range columnChunks { 40 | // Keep our own refs to chunks 41 | chunks[i] = chunk.(*array.Int64) 42 | // Retain the chunk 43 | chunks[i].Retain() 44 | 45 | // Keep our own counters instead of Chunked's 46 | length += int64(chunk.Len()) 47 | nulls += int64(chunk.NullN()) 48 | } 49 | 50 | return &Int64ChunkIterator{ 51 | refCount: 1, 52 | col: col, 53 | 54 | chunks: chunks, 55 | length: length, 56 | nulls: nulls, 57 | dtype: col.DataType(), 58 | 59 | currentIndex: 0, 60 | currentChunk: nil, 61 | } 62 | } 63 | 64 | // Chunk will return the current chunk that the iterator is on. 65 | func (cr *Int64ChunkIterator) Chunk() *array.Int64 { return cr.currentChunk } 66 | 67 | // ChunkValues returns the underlying []int64 chunk values. 68 | // Keep in mind the []int64 type might not be able 69 | // to account for nil values. You must check for those explicitly via the chunk. 70 | func (cr *Int64ChunkIterator) ChunkValues() []int64 { return cr.Chunk().Int64Values() } 71 | 72 | // Next moves the iterator to the next chunk. This will return false 73 | // when there are no more chunks. 
74 | func (cr *Int64ChunkIterator) Next() bool { 75 | if cr.currentIndex >= len(cr.chunks) { 76 | return false 77 | } 78 | 79 | if cr.currentChunk != nil { 80 | cr.currentChunk.Release() 81 | } 82 | 83 | cr.currentChunk = cr.chunks[cr.currentIndex] 84 | cr.currentChunk.Retain() 85 | cr.currentIndex++ 86 | 87 | return true 88 | } 89 | 90 | // Retain keeps a reference to the Int64ChunkIterator 91 | func (cr *Int64ChunkIterator) Retain() { 92 | atomic.AddInt64(&cr.refCount, 1) 93 | } 94 | 95 | // Release removes a reference to the Int64ChunkIterator 96 | func (cr *Int64ChunkIterator) Release() { 97 | debug.Assert(atomic.LoadInt64(&cr.refCount) > 0, "too many releases") 98 | ref := atomic.AddInt64(&cr.refCount, -1) 99 | if ref == 0 { 100 | cr.col.Release() 101 | for i := range cr.chunks { 102 | cr.chunks[i].Release() 103 | } 104 | if cr.currentChunk != nil { 105 | cr.currentChunk.Release() 106 | cr.currentChunk = nil 107 | } 108 | cr.col = nil 109 | cr.chunks = nil 110 | cr.dtype = nil 111 | } 112 | } 113 | 114 | // Uint64ChunkIterator is an iterator for reading an Arrow Column value by value. 115 | type Uint64ChunkIterator struct { 116 | refCount int64 117 | col *array.Column 118 | 119 | // Things Chunked maintains. We're going to maintain it ourselves. 120 | chunks []*array.Uint64 // cache the chunks on this iterator 121 | length int64 // this isn't set right on Chunked so we won't rely on it there. Instead we keep the correct value here. 122 | nulls int64 123 | dtype arrow.DataType 124 | 125 | // Things we need to maintain for the iterator 126 | currentIndex int // current chunk 127 | currentChunk *array.Uint64 // current chunk 128 | } 129 | 130 | // NewUint64ChunkIterator creates a new Uint64ChunkIterator for reading an Arrow Column. 131 | func NewUint64ChunkIterator(col *array.Column) *Uint64ChunkIterator { 132 | col.Retain() 133 | 134 | // Chunked is not using the correct type to keep track of length so we have to recalculate it. 
135 | columnChunks := col.Data().Chunks() 136 | chunks := make([]*array.Uint64, len(columnChunks)) 137 | var length int64 138 | var nulls int64 139 | 140 | for i, chunk := range columnChunks { 141 | // Keep our own refs to chunks 142 | chunks[i] = chunk.(*array.Uint64) 143 | // Retain the chunk 144 | chunks[i].Retain() 145 | 146 | // Keep our own counters instead of Chunked's 147 | length += int64(chunk.Len()) 148 | nulls += int64(chunk.NullN()) 149 | } 150 | 151 | return &Uint64ChunkIterator{ 152 | refCount: 1, 153 | col: col, 154 | 155 | chunks: chunks, 156 | length: length, 157 | nulls: nulls, 158 | dtype: col.DataType(), 159 | 160 | currentIndex: 0, 161 | currentChunk: nil, 162 | } 163 | } 164 | 165 | // Chunk will return the current chunk that the iterator is on. 166 | func (cr *Uint64ChunkIterator) Chunk() *array.Uint64 { return cr.currentChunk } 167 | 168 | // ChunkValues returns the underlying []uint64 chunk values. 169 | // Keep in mind the []uint64 type might not be able 170 | // to account for nil values. You must check for those explicitly via the chunk. 171 | func (cr *Uint64ChunkIterator) ChunkValues() []uint64 { return cr.Chunk().Uint64Values() } 172 | 173 | // Next moves the iterator to the next chunk. This will return false 174 | // when there are no more chunks. 
175 | func (cr *Uint64ChunkIterator) Next() bool { 176 | if cr.currentIndex >= len(cr.chunks) { 177 | return false 178 | } 179 | 180 | if cr.currentChunk != nil { 181 | cr.currentChunk.Release() 182 | } 183 | 184 | cr.currentChunk = cr.chunks[cr.currentIndex] 185 | cr.currentChunk.Retain() 186 | cr.currentIndex++ 187 | 188 | return true 189 | } 190 | 191 | // Retain keeps a reference to the Uint64ChunkIterator 192 | func (cr *Uint64ChunkIterator) Retain() { 193 | atomic.AddInt64(&cr.refCount, 1) 194 | } 195 | 196 | // Release removes a reference to the Uint64ChunkIterator 197 | func (cr *Uint64ChunkIterator) Release() { 198 | debug.Assert(atomic.LoadInt64(&cr.refCount) > 0, "too many releases") 199 | ref := atomic.AddInt64(&cr.refCount, -1) 200 | if ref == 0 { 201 | cr.col.Release() 202 | for i := range cr.chunks { 203 | cr.chunks[i].Release() 204 | } 205 | if cr.currentChunk != nil { 206 | cr.currentChunk.Release() 207 | cr.currentChunk = nil 208 | } 209 | cr.col = nil 210 | cr.chunks = nil 211 | cr.dtype = nil 212 | } 213 | } 214 | 215 | // Float64ChunkIterator is an iterator for reading an Arrow Column value by value. 216 | type Float64ChunkIterator struct { 217 | refCount int64 218 | col *array.Column 219 | 220 | // Things Chunked maintains. We're going to maintain it ourselves. 221 | chunks []*array.Float64 // cache the chunks on this iterator 222 | length int64 // this isn't set right on Chunked so we won't rely on it there. Instead we keep the correct value here. 223 | nulls int64 224 | dtype arrow.DataType 225 | 226 | // Things we need to maintain for the iterator 227 | currentIndex int // current chunk 228 | currentChunk *array.Float64 // current chunk 229 | } 230 | 231 | // NewFloat64ChunkIterator creates a new Float64ChunkIterator for reading an Arrow Column. 
232 | func NewFloat64ChunkIterator(col *array.Column) *Float64ChunkIterator { 233 | col.Retain() 234 | 235 | // Chunked is not using the correct type to keep track of length so we have to recalculate it. 236 | columnChunks := col.Data().Chunks() 237 | chunks := make([]*array.Float64, len(columnChunks)) 238 | var length int64 239 | var nulls int64 240 | 241 | for i, chunk := range columnChunks { 242 | // Keep our own refs to chunks 243 | chunks[i] = chunk.(*array.Float64) 244 | // Retain the chunk 245 | chunks[i].Retain() 246 | 247 | // Keep our own counters instead of Chunked's 248 | length += int64(chunk.Len()) 249 | nulls += int64(chunk.NullN()) 250 | } 251 | 252 | return &Float64ChunkIterator{ 253 | refCount: 1, 254 | col: col, 255 | 256 | chunks: chunks, 257 | length: length, 258 | nulls: nulls, 259 | dtype: col.DataType(), 260 | 261 | currentIndex: 0, 262 | currentChunk: nil, 263 | } 264 | } 265 | 266 | // Chunk will return the current chunk that the iterator is on. 267 | func (cr *Float64ChunkIterator) Chunk() *array.Float64 { return cr.currentChunk } 268 | 269 | // ChunkValues returns the underlying []float64 chunk values. 270 | // Keep in mind the []float64 type might not be able 271 | // to account for nil values. You must check for those explicitly via the chunk. 272 | func (cr *Float64ChunkIterator) ChunkValues() []float64 { return cr.Chunk().Float64Values() } 273 | 274 | // Next moves the iterator to the next chunk. This will return false 275 | // when there are no more chunks. 
276 | func (cr *Float64ChunkIterator) Next() bool { 277 | if cr.currentIndex >= len(cr.chunks) { 278 | return false 279 | } 280 | 281 | if cr.currentChunk != nil { 282 | cr.currentChunk.Release() 283 | } 284 | 285 | cr.currentChunk = cr.chunks[cr.currentIndex] 286 | cr.currentChunk.Retain() 287 | cr.currentIndex++ 288 | 289 | return true 290 | } 291 | 292 | // Retain keeps a reference to the Float64ChunkIterator 293 | func (cr *Float64ChunkIterator) Retain() { 294 | atomic.AddInt64(&cr.refCount, 1) 295 | } 296 | 297 | // Release removes a reference to the Float64ChunkIterator 298 | func (cr *Float64ChunkIterator) Release() { 299 | debug.Assert(atomic.LoadInt64(&cr.refCount) > 0, "too many releases") 300 | ref := atomic.AddInt64(&cr.refCount, -1) 301 | if ref == 0 { 302 | cr.col.Release() 303 | for i := range cr.chunks { 304 | cr.chunks[i].Release() 305 | } 306 | if cr.currentChunk != nil { 307 | cr.currentChunk.Release() 308 | cr.currentChunk = nil 309 | } 310 | cr.col = nil 311 | cr.chunks = nil 312 | cr.dtype = nil 313 | } 314 | } 315 | 316 | // Int32ChunkIterator is an iterator for reading an Arrow Column value by value. 317 | type Int32ChunkIterator struct { 318 | refCount int64 319 | col *array.Column 320 | 321 | // Things Chunked maintains. We're going to maintain it ourselves. 322 | chunks []*array.Int32 // cache the chunks on this iterator 323 | length int64 // this isn't set right on Chunked so we won't rely on it there. Instead we keep the correct value here. 324 | nulls int64 325 | dtype arrow.DataType 326 | 327 | // Things we need to maintain for the iterator 328 | currentIndex int // current chunk 329 | currentChunk *array.Int32 // current chunk 330 | } 331 | 332 | // NewInt32ChunkIterator creates a new Int32ChunkIterator for reading an Arrow Column. 333 | func NewInt32ChunkIterator(col *array.Column) *Int32ChunkIterator { 334 | col.Retain() 335 | 336 | // Chunked is not using the correct type to keep track of length so we have to recalculate it. 
337 | columnChunks := col.Data().Chunks() 338 | chunks := make([]*array.Int32, len(columnChunks)) 339 | var length int64 340 | var nulls int64 341 | 342 | for i, chunk := range columnChunks { 343 | // Keep our own refs to chunks 344 | chunks[i] = chunk.(*array.Int32) 345 | // Retain the chunk 346 | chunks[i].Retain() 347 | 348 | // Keep our own counters instead of Chunked's 349 | length += int64(chunk.Len()) 350 | nulls += int64(chunk.NullN()) 351 | } 352 | 353 | return &Int32ChunkIterator{ 354 | refCount: 1, 355 | col: col, 356 | 357 | chunks: chunks, 358 | length: length, 359 | nulls: nulls, 360 | dtype: col.DataType(), 361 | 362 | currentIndex: 0, 363 | currentChunk: nil, 364 | } 365 | } 366 | 367 | // Chunk will return the current chunk that the iterator is on. 368 | func (cr *Int32ChunkIterator) Chunk() *array.Int32 { return cr.currentChunk } 369 | 370 | // ChunkValues returns the underlying []int32 chunk values. 371 | // Keep in mind the []int32 type might not be able 372 | // to account for nil values. You must check for those explicitly via the chunk. 373 | func (cr *Int32ChunkIterator) ChunkValues() []int32 { return cr.Chunk().Int32Values() } 374 | 375 | // Next moves the iterator to the next chunk. This will return false 376 | // when there are no more chunks. 
377 | func (cr *Int32ChunkIterator) Next() bool { 378 | if cr.currentIndex >= len(cr.chunks) { 379 | return false 380 | } 381 | 382 | if cr.currentChunk != nil { 383 | cr.currentChunk.Release() 384 | } 385 | 386 | cr.currentChunk = cr.chunks[cr.currentIndex] 387 | cr.currentChunk.Retain() 388 | cr.currentIndex++ 389 | 390 | return true 391 | } 392 | 393 | // Retain keeps a reference to the Int32ChunkIterator 394 | func (cr *Int32ChunkIterator) Retain() { 395 | atomic.AddInt64(&cr.refCount, 1) 396 | } 397 | 398 | // Release removes a reference to the Int32ChunkIterator 399 | func (cr *Int32ChunkIterator) Release() { 400 | debug.Assert(atomic.LoadInt64(&cr.refCount) > 0, "too many releases") 401 | ref := atomic.AddInt64(&cr.refCount, -1) 402 | if ref == 0 { 403 | cr.col.Release() 404 | for i := range cr.chunks { 405 | cr.chunks[i].Release() 406 | } 407 | if cr.currentChunk != nil { 408 | cr.currentChunk.Release() 409 | cr.currentChunk = nil 410 | } 411 | cr.col = nil 412 | cr.chunks = nil 413 | cr.dtype = nil 414 | } 415 | } 416 | 417 | // Uint32ChunkIterator is an iterator for reading an Arrow Column value by value. 418 | type Uint32ChunkIterator struct { 419 | refCount int64 420 | col *array.Column 421 | 422 | // Things Chunked maintains. We're going to maintain it ourselves. 423 | chunks []*array.Uint32 // cache the chunks on this iterator 424 | length int64 // this isn't set right on Chunked so we won't rely on it there. Instead we keep the correct value here. 425 | nulls int64 426 | dtype arrow.DataType 427 | 428 | // Things we need to maintain for the iterator 429 | currentIndex int // current chunk 430 | currentChunk *array.Uint32 // current chunk 431 | } 432 | 433 | // NewUint32ChunkIterator creates a new Uint32ChunkIterator for reading an Arrow Column. 434 | func NewUint32ChunkIterator(col *array.Column) *Uint32ChunkIterator { 435 | col.Retain() 436 | 437 | // Chunked is not using the correct type to keep track of length so we have to recalculate it. 
438 | columnChunks := col.Data().Chunks() 439 | chunks := make([]*array.Uint32, len(columnChunks)) 440 | var length int64 441 | var nulls int64 442 | 443 | for i, chunk := range columnChunks { 444 | // Keep our own refs to chunks 445 | chunks[i] = chunk.(*array.Uint32) 446 | // Retain the chunk 447 | chunks[i].Retain() 448 | 449 | // Keep our own counters instead of Chunked's 450 | length += int64(chunk.Len()) 451 | nulls += int64(chunk.NullN()) 452 | } 453 | 454 | return &Uint32ChunkIterator{ 455 | refCount: 1, 456 | col: col, 457 | 458 | chunks: chunks, 459 | length: length, 460 | nulls: nulls, 461 | dtype: col.DataType(), 462 | 463 | currentIndex: 0, 464 | currentChunk: nil, 465 | } 466 | } 467 | 468 | // Chunk will return the current chunk that the iterator is on. 469 | func (cr *Uint32ChunkIterator) Chunk() *array.Uint32 { return cr.currentChunk } 470 | 471 | // ChunkValues returns the underlying []uint32 chunk values. 472 | // Keep in mind the []uint32 type might not be able 473 | // to account for nil values. You must check for those explicitly via the chunk. 474 | func (cr *Uint32ChunkIterator) ChunkValues() []uint32 { return cr.Chunk().Uint32Values() } 475 | 476 | // Next moves the iterator to the next chunk. This will return false 477 | // when there are no more chunks. 
478 | func (cr *Uint32ChunkIterator) Next() bool { 479 | if cr.currentIndex >= len(cr.chunks) { 480 | return false 481 | } 482 | 483 | if cr.currentChunk != nil { 484 | cr.currentChunk.Release() 485 | } 486 | 487 | cr.currentChunk = cr.chunks[cr.currentIndex] 488 | cr.currentChunk.Retain() 489 | cr.currentIndex++ 490 | 491 | return true 492 | } 493 | 494 | // Retain keeps a reference to the Uint32ChunkIterator 495 | func (cr *Uint32ChunkIterator) Retain() { 496 | atomic.AddInt64(&cr.refCount, 1) 497 | } 498 | 499 | // Release removes a reference to the Uint32ChunkIterator 500 | func (cr *Uint32ChunkIterator) Release() { 501 | debug.Assert(atomic.LoadInt64(&cr.refCount) > 0, "too many releases") 502 | ref := atomic.AddInt64(&cr.refCount, -1) 503 | if ref == 0 { 504 | cr.col.Release() 505 | for i := range cr.chunks { 506 | cr.chunks[i].Release() 507 | } 508 | if cr.currentChunk != nil { 509 | cr.currentChunk.Release() 510 | cr.currentChunk = nil 511 | } 512 | cr.col = nil 513 | cr.chunks = nil 514 | cr.dtype = nil 515 | } 516 | } 517 | 518 | // Float32ChunkIterator is an iterator for reading an Arrow Column value by value. 519 | type Float32ChunkIterator struct { 520 | refCount int64 521 | col *array.Column 522 | 523 | // Things Chunked maintains. We're going to maintain it ourselves. 524 | chunks []*array.Float32 // cache the chunks on this iterator 525 | length int64 // this isn't set right on Chunked so we won't rely on it there. Instead we keep the correct value here. 526 | nulls int64 527 | dtype arrow.DataType 528 | 529 | // Things we need to maintain for the iterator 530 | currentIndex int // current chunk 531 | currentChunk *array.Float32 // current chunk 532 | } 533 | 534 | // NewFloat32ChunkIterator creates a new Float32ChunkIterator for reading an Arrow Column. 
535 | func NewFloat32ChunkIterator(col *array.Column) *Float32ChunkIterator { 536 | col.Retain() 537 | 538 | // Chunked is not using the correct type to keep track of length so we have to recalculate it. 539 | columnChunks := col.Data().Chunks() 540 | chunks := make([]*array.Float32, len(columnChunks)) 541 | var length int64 542 | var nulls int64 543 | 544 | for i, chunk := range columnChunks { 545 | // Keep our own refs to chunks 546 | chunks[i] = chunk.(*array.Float32) 547 | // Retain the chunk 548 | chunks[i].Retain() 549 | 550 | // Keep our own counters instead of Chunked's 551 | length += int64(chunk.Len()) 552 | nulls += int64(chunk.NullN()) 553 | } 554 | 555 | return &Float32ChunkIterator{ 556 | refCount: 1, 557 | col: col, 558 | 559 | chunks: chunks, 560 | length: length, 561 | nulls: nulls, 562 | dtype: col.DataType(), 563 | 564 | currentIndex: 0, 565 | currentChunk: nil, 566 | } 567 | } 568 | 569 | // Chunk will return the current chunk that the iterator is on. 570 | func (cr *Float32ChunkIterator) Chunk() *array.Float32 { return cr.currentChunk } 571 | 572 | // ChunkValues returns the underlying []float32 chunk values. 573 | // Keep in mind the []float32 type might not be able 574 | // to account for nil values. You must check for those explicitly via the chunk. 575 | func (cr *Float32ChunkIterator) ChunkValues() []float32 { return cr.Chunk().Float32Values() } 576 | 577 | // Next moves the iterator to the next chunk. This will return false 578 | // when there are no more chunks. 
579 | func (cr *Float32ChunkIterator) Next() bool { 580 | if cr.currentIndex >= len(cr.chunks) { 581 | return false 582 | } 583 | 584 | if cr.currentChunk != nil { 585 | cr.currentChunk.Release() 586 | } 587 | 588 | cr.currentChunk = cr.chunks[cr.currentIndex] 589 | cr.currentChunk.Retain() 590 | cr.currentIndex++ 591 | 592 | return true 593 | } 594 | 595 | // Retain keeps a reference to the Float32ChunkIterator 596 | func (cr *Float32ChunkIterator) Retain() { 597 | atomic.AddInt64(&cr.refCount, 1) 598 | } 599 | 600 | // Release removes a reference to the Float32ChunkIterator 601 | func (cr *Float32ChunkIterator) Release() { 602 | debug.Assert(atomic.LoadInt64(&cr.refCount) > 0, "too many releases") 603 | ref := atomic.AddInt64(&cr.refCount, -1) 604 | if ref == 0 { 605 | cr.col.Release() 606 | for i := range cr.chunks { 607 | cr.chunks[i].Release() 608 | } 609 | if cr.currentChunk != nil { 610 | cr.currentChunk.Release() 611 | cr.currentChunk = nil 612 | } 613 | cr.col = nil 614 | cr.chunks = nil 615 | cr.dtype = nil 616 | } 617 | } 618 | 619 | // Int16ChunkIterator is an iterator for reading an Arrow Column value by value. 620 | type Int16ChunkIterator struct { 621 | refCount int64 622 | col *array.Column 623 | 624 | // Things Chunked maintains. We're going to maintain it ourselves. 625 | chunks []*array.Int16 // cache the chunks on this iterator 626 | length int64 // this isn't set right on Chunked so we won't rely on it there. Instead we keep the correct value here. 627 | nulls int64 628 | dtype arrow.DataType 629 | 630 | // Things we need to maintain for the iterator 631 | currentIndex int // current chunk 632 | currentChunk *array.Int16 // current chunk 633 | } 634 | 635 | // NewInt16ChunkIterator creates a new Int16ChunkIterator for reading an Arrow Column. 636 | func NewInt16ChunkIterator(col *array.Column) *Int16ChunkIterator { 637 | col.Retain() 638 | 639 | // Chunked is not using the correct type to keep track of length so we have to recalculate it. 
640 | columnChunks := col.Data().Chunks() 641 | chunks := make([]*array.Int16, len(columnChunks)) 642 | var length int64 643 | var nulls int64 644 | 645 | for i, chunk := range columnChunks { 646 | // Keep our own refs to chunks 647 | chunks[i] = chunk.(*array.Int16) 648 | // Retain the chunk 649 | chunks[i].Retain() 650 | 651 | // Keep our own counters instead of Chunked's 652 | length += int64(chunk.Len()) 653 | nulls += int64(chunk.NullN()) 654 | } 655 | 656 | return &Int16ChunkIterator{ 657 | refCount: 1, 658 | col: col, 659 | 660 | chunks: chunks, 661 | length: length, 662 | nulls: nulls, 663 | dtype: col.DataType(), 664 | 665 | currentIndex: 0, 666 | currentChunk: nil, 667 | } 668 | } 669 | 670 | // Chunk will return the current chunk that the iterator is on. 671 | func (cr *Int16ChunkIterator) Chunk() *array.Int16 { return cr.currentChunk } 672 | 673 | // ChunkValues returns the underlying []int16 chunk values. 674 | // Keep in mind the []int16 type might not be able 675 | // to account for nil values. You must check for those explicitly via the chunk. 676 | func (cr *Int16ChunkIterator) ChunkValues() []int16 { return cr.Chunk().Int16Values() } 677 | 678 | // Next moves the iterator to the next chunk. This will return false 679 | // when there are no more chunks. 
680 | func (cr *Int16ChunkIterator) Next() bool { 681 | if cr.currentIndex >= len(cr.chunks) { 682 | return false 683 | } 684 | 685 | if cr.currentChunk != nil { 686 | cr.currentChunk.Release() 687 | } 688 | 689 | cr.currentChunk = cr.chunks[cr.currentIndex] 690 | cr.currentChunk.Retain() 691 | cr.currentIndex++ 692 | 693 | return true 694 | } 695 | 696 | // Retain keeps a reference to the Int16ChunkIterator 697 | func (cr *Int16ChunkIterator) Retain() { 698 | atomic.AddInt64(&cr.refCount, 1) 699 | } 700 | 701 | // Release removes a reference to the Int16ChunkIterator 702 | func (cr *Int16ChunkIterator) Release() { 703 | debug.Assert(atomic.LoadInt64(&cr.refCount) > 0, "too many releases") 704 | ref := atomic.AddInt64(&cr.refCount, -1) 705 | if ref == 0 { 706 | cr.col.Release() 707 | for i := range cr.chunks { 708 | cr.chunks[i].Release() 709 | } 710 | if cr.currentChunk != nil { 711 | cr.currentChunk.Release() 712 | cr.currentChunk = nil 713 | } 714 | cr.col = nil 715 | cr.chunks = nil 716 | cr.dtype = nil 717 | } 718 | } 719 | 720 | // Uint16ChunkIterator is an iterator for reading an Arrow Column value by value. 721 | type Uint16ChunkIterator struct { 722 | refCount int64 723 | col *array.Column 724 | 725 | // Things Chunked maintains. We're going to maintain it ourselves. 726 | chunks []*array.Uint16 // cache the chunks on this iterator 727 | length int64 // this isn't set right on Chunked so we won't rely on it there. Instead we keep the correct value here. 728 | nulls int64 729 | dtype arrow.DataType 730 | 731 | // Things we need to maintain for the iterator 732 | currentIndex int // current chunk 733 | currentChunk *array.Uint16 // current chunk 734 | } 735 | 736 | // NewUint16ChunkIterator creates a new Uint16ChunkIterator for reading an Arrow Column. 737 | func NewUint16ChunkIterator(col *array.Column) *Uint16ChunkIterator { 738 | col.Retain() 739 | 740 | // Chunked is not using the correct type to keep track of length so we have to recalculate it. 
741 | columnChunks := col.Data().Chunks() 742 | chunks := make([]*array.Uint16, len(columnChunks)) 743 | var length int64 744 | var nulls int64 745 | 746 | for i, chunk := range columnChunks { 747 | // Keep our own refs to chunks 748 | chunks[i] = chunk.(*array.Uint16) 749 | // Retain the chunk 750 | chunks[i].Retain() 751 | 752 | // Keep our own counters instead of Chunked's 753 | length += int64(chunk.Len()) 754 | nulls += int64(chunk.NullN()) 755 | } 756 | 757 | return &Uint16ChunkIterator{ 758 | refCount: 1, 759 | col: col, 760 | 761 | chunks: chunks, 762 | length: length, 763 | nulls: nulls, 764 | dtype: col.DataType(), 765 | 766 | currentIndex: 0, 767 | currentChunk: nil, 768 | } 769 | } 770 | 771 | // Chunk will return the current chunk that the iterator is on. 772 | func (cr *Uint16ChunkIterator) Chunk() *array.Uint16 { return cr.currentChunk } 773 | 774 | // ChunkValues returns the underlying []uint16 chunk values. 775 | // Keep in mind the []uint16 type might not be able 776 | // to account for nil values. You must check for those explicitly via the chunk. 777 | func (cr *Uint16ChunkIterator) ChunkValues() []uint16 { return cr.Chunk().Uint16Values() } 778 | 779 | // Next moves the iterator to the next chunk. This will return false 780 | // when there are no more chunks. 
781 | func (cr *Uint16ChunkIterator) Next() bool { 782 | if cr.currentIndex >= len(cr.chunks) { 783 | return false 784 | } 785 | 786 | if cr.currentChunk != nil { 787 | cr.currentChunk.Release() 788 | } 789 | 790 | cr.currentChunk = cr.chunks[cr.currentIndex] 791 | cr.currentChunk.Retain() 792 | cr.currentIndex++ 793 | 794 | return true 795 | } 796 | 797 | // Retain keeps a reference to the Uint16ChunkIterator 798 | func (cr *Uint16ChunkIterator) Retain() { 799 | atomic.AddInt64(&cr.refCount, 1) 800 | } 801 | 802 | // Release removes a reference to the Uint16ChunkIterator 803 | func (cr *Uint16ChunkIterator) Release() { 804 | debug.Assert(atomic.LoadInt64(&cr.refCount) > 0, "too many releases") 805 | ref := atomic.AddInt64(&cr.refCount, -1) 806 | if ref == 0 { 807 | cr.col.Release() 808 | for i := range cr.chunks { 809 | cr.chunks[i].Release() 810 | } 811 | if cr.currentChunk != nil { 812 | cr.currentChunk.Release() 813 | cr.currentChunk = nil 814 | } 815 | cr.col = nil 816 | cr.chunks = nil 817 | cr.dtype = nil 818 | } 819 | } 820 | 821 | // Int8ChunkIterator is an iterator for reading an Arrow Column value by value. 822 | type Int8ChunkIterator struct { 823 | refCount int64 824 | col *array.Column 825 | 826 | // Things Chunked maintains. We're going to maintain it ourselves. 827 | chunks []*array.Int8 // cache the chunks on this iterator 828 | length int64 // this isn't set right on Chunked so we won't rely on it there. Instead we keep the correct value here. 829 | nulls int64 830 | dtype arrow.DataType 831 | 832 | // Things we need to maintain for the iterator 833 | currentIndex int // current chunk 834 | currentChunk *array.Int8 // current chunk 835 | } 836 | 837 | // NewInt8ChunkIterator creates a new Int8ChunkIterator for reading an Arrow Column. 838 | func NewInt8ChunkIterator(col *array.Column) *Int8ChunkIterator { 839 | col.Retain() 840 | 841 | // Chunked is not using the correct type to keep track of length so we have to recalculate it. 
842 | columnChunks := col.Data().Chunks() 843 | chunks := make([]*array.Int8, len(columnChunks)) 844 | var length int64 845 | var nulls int64 846 | 847 | for i, chunk := range columnChunks { 848 | // Keep our own refs to chunks 849 | chunks[i] = chunk.(*array.Int8) 850 | // Retain the chunk 851 | chunks[i].Retain() 852 | 853 | // Keep our own counters instead of Chunked's 854 | length += int64(chunk.Len()) 855 | nulls += int64(chunk.NullN()) 856 | } 857 | 858 | return &Int8ChunkIterator{ 859 | refCount: 1, 860 | col: col, 861 | 862 | chunks: chunks, 863 | length: length, 864 | nulls: nulls, 865 | dtype: col.DataType(), 866 | 867 | currentIndex: 0, 868 | currentChunk: nil, 869 | } 870 | } 871 | 872 | // Chunk will return the current chunk that the iterator is on. 873 | func (cr *Int8ChunkIterator) Chunk() *array.Int8 { return cr.currentChunk } 874 | 875 | // ChunkValues returns the underlying []int8 chunk values. 876 | // Keep in mind the []int8 type might not be able 877 | // to account for nil values. You must check for those explicitly via the chunk. 878 | func (cr *Int8ChunkIterator) ChunkValues() []int8 { return cr.Chunk().Int8Values() } 879 | 880 | // Next moves the iterator to the next chunk. This will return false 881 | // when there are no more chunks. 
882 | func (cr *Int8ChunkIterator) Next() bool { 883 | if cr.currentIndex >= len(cr.chunks) { 884 | return false 885 | } 886 | 887 | if cr.currentChunk != nil { 888 | cr.currentChunk.Release() 889 | } 890 | 891 | cr.currentChunk = cr.chunks[cr.currentIndex] 892 | cr.currentChunk.Retain() 893 | cr.currentIndex++ 894 | 895 | return true 896 | } 897 | 898 | // Retain keeps a reference to the Int8ChunkIterator 899 | func (cr *Int8ChunkIterator) Retain() { 900 | atomic.AddInt64(&cr.refCount, 1) 901 | } 902 | 903 | // Release removes a reference to the Int8ChunkIterator 904 | func (cr *Int8ChunkIterator) Release() { 905 | debug.Assert(atomic.LoadInt64(&cr.refCount) > 0, "too many releases") 906 | ref := atomic.AddInt64(&cr.refCount, -1) 907 | if ref == 0 { 908 | cr.col.Release() 909 | for i := range cr.chunks { 910 | cr.chunks[i].Release() 911 | } 912 | if cr.currentChunk != nil { 913 | cr.currentChunk.Release() 914 | cr.currentChunk = nil 915 | } 916 | cr.col = nil 917 | cr.chunks = nil 918 | cr.dtype = nil 919 | } 920 | } 921 | 922 | // Uint8ChunkIterator is an iterator for reading an Arrow Column value by value. 923 | type Uint8ChunkIterator struct { 924 | refCount int64 925 | col *array.Column 926 | 927 | // Things Chunked maintains. We're going to maintain it ourselves. 928 | chunks []*array.Uint8 // cache the chunks on this iterator 929 | length int64 // this isn't set right on Chunked so we won't rely on it there. Instead we keep the correct value here. 930 | nulls int64 931 | dtype arrow.DataType 932 | 933 | // Things we need to maintain for the iterator 934 | currentIndex int // current chunk 935 | currentChunk *array.Uint8 // current chunk 936 | } 937 | 938 | // NewUint8ChunkIterator creates a new Uint8ChunkIterator for reading an Arrow Column. 939 | func NewUint8ChunkIterator(col *array.Column) *Uint8ChunkIterator { 940 | col.Retain() 941 | 942 | // Chunked is not using the correct type to keep track of length so we have to recalculate it. 
943 | columnChunks := col.Data().Chunks() 944 | chunks := make([]*array.Uint8, len(columnChunks)) 945 | var length int64 946 | var nulls int64 947 | 948 | for i, chunk := range columnChunks { 949 | // Keep our own refs to chunks 950 | chunks[i] = chunk.(*array.Uint8) 951 | // Retain the chunk 952 | chunks[i].Retain() 953 | 954 | // Keep our own counters instead of Chunked's 955 | length += int64(chunk.Len()) 956 | nulls += int64(chunk.NullN()) 957 | } 958 | 959 | return &Uint8ChunkIterator{ 960 | refCount: 1, 961 | col: col, 962 | 963 | chunks: chunks, 964 | length: length, 965 | nulls: nulls, 966 | dtype: col.DataType(), 967 | 968 | currentIndex: 0, 969 | currentChunk: nil, 970 | } 971 | } 972 | 973 | // Chunk will return the current chunk that the iterator is on. 974 | func (cr *Uint8ChunkIterator) Chunk() *array.Uint8 { return cr.currentChunk } 975 | 976 | // ChunkValues returns the underlying []uint8 chunk values. 977 | // Keep in mind the []uint8 type might not be able 978 | // to account for nil values. You must check for those explicitly via the chunk. 979 | func (cr *Uint8ChunkIterator) ChunkValues() []uint8 { return cr.Chunk().Uint8Values() } 980 | 981 | // Next moves the iterator to the next chunk. This will return false 982 | // when there are no more chunks. 
983 | func (cr *Uint8ChunkIterator) Next() bool { 984 | if cr.currentIndex >= len(cr.chunks) { 985 | return false 986 | } 987 | 988 | if cr.currentChunk != nil { 989 | cr.currentChunk.Release() 990 | } 991 | 992 | cr.currentChunk = cr.chunks[cr.currentIndex] 993 | cr.currentChunk.Retain() 994 | cr.currentIndex++ 995 | 996 | return true 997 | } 998 | 999 | // Retain keeps a reference to the Uint8ChunkIterator 1000 | func (cr *Uint8ChunkIterator) Retain() { 1001 | atomic.AddInt64(&cr.refCount, 1) 1002 | } 1003 | 1004 | // Release removes a reference to the Uint8ChunkIterator 1005 | func (cr *Uint8ChunkIterator) Release() { 1006 | debug.Assert(atomic.LoadInt64(&cr.refCount) > 0, "too many releases") 1007 | ref := atomic.AddInt64(&cr.refCount, -1) 1008 | if ref == 0 { 1009 | cr.col.Release() 1010 | for i := range cr.chunks { 1011 | cr.chunks[i].Release() 1012 | } 1013 | if cr.currentChunk != nil { 1014 | cr.currentChunk.Release() 1015 | cr.currentChunk = nil 1016 | } 1017 | cr.col = nil 1018 | cr.chunks = nil 1019 | cr.dtype = nil 1020 | } 1021 | } 1022 | 1023 | // TimestampChunkIterator is an iterator for reading an Arrow Column value by value. 1024 | type TimestampChunkIterator struct { 1025 | refCount int64 1026 | col *array.Column 1027 | 1028 | // Things Chunked maintains. We're going to maintain it ourselves. 1029 | chunks []*array.Timestamp // cache the chunks on this iterator 1030 | length int64 // this isn't set right on Chunked so we won't rely on it there. Instead we keep the correct value here. 1031 | nulls int64 1032 | dtype arrow.DataType 1033 | 1034 | // Things we need to maintain for the iterator 1035 | currentIndex int // current chunk 1036 | currentChunk *array.Timestamp // current chunk 1037 | } 1038 | 1039 | // NewTimestampChunkIterator creates a new TimestampChunkIterator for reading an Arrow Column. 
1040 | func NewTimestampChunkIterator(col *array.Column) *TimestampChunkIterator { 1041 | col.Retain() 1042 | 1043 | // Chunked is not using the correct type to keep track of length so we have to recalculate it. 1044 | columnChunks := col.Data().Chunks() 1045 | chunks := make([]*array.Timestamp, len(columnChunks)) 1046 | var length int64 1047 | var nulls int64 1048 | 1049 | for i, chunk := range columnChunks { 1050 | // Keep our own refs to chunks 1051 | chunks[i] = chunk.(*array.Timestamp) 1052 | // Retain the chunk 1053 | chunks[i].Retain() 1054 | 1055 | // Keep our own counters instead of Chunked's 1056 | length += int64(chunk.Len()) 1057 | nulls += int64(chunk.NullN()) 1058 | } 1059 | 1060 | return &TimestampChunkIterator{ 1061 | refCount: 1, 1062 | col: col, 1063 | 1064 | chunks: chunks, 1065 | length: length, 1066 | nulls: nulls, 1067 | dtype: col.DataType(), 1068 | 1069 | currentIndex: 0, 1070 | currentChunk: nil, 1071 | } 1072 | } 1073 | 1074 | // Chunk will return the current chunk that the iterator is on. 1075 | func (cr *TimestampChunkIterator) Chunk() *array.Timestamp { return cr.currentChunk } 1076 | 1077 | // ChunkValues returns the underlying []arrow.Timestamp chunk values. 1078 | // Keep in mind the []arrow.Timestamp type might not be able 1079 | // to account for nil values. You must check for those explicitly via the chunk. 1080 | func (cr *TimestampChunkIterator) ChunkValues() []arrow.Timestamp { return cr.Chunk().TimestampValues() } 1081 | 1082 | // Next moves the iterator to the next chunk. This will return false 1083 | // when there are no more chunks. 
1084 | func (cr *TimestampChunkIterator) Next() bool { 1085 | if cr.currentIndex >= len(cr.chunks) { 1086 | return false 1087 | } 1088 | 1089 | if cr.currentChunk != nil { 1090 | cr.currentChunk.Release() 1091 | } 1092 | 1093 | cr.currentChunk = cr.chunks[cr.currentIndex] 1094 | cr.currentChunk.Retain() 1095 | cr.currentIndex++ 1096 | 1097 | return true 1098 | } 1099 | 1100 | // Retain keeps a reference to the TimestampChunkIterator 1101 | func (cr *TimestampChunkIterator) Retain() { 1102 | atomic.AddInt64(&cr.refCount, 1) 1103 | } 1104 | 1105 | // Release removes a reference to the TimestampChunkIterator 1106 | func (cr *TimestampChunkIterator) Release() { 1107 | debug.Assert(atomic.LoadInt64(&cr.refCount) > 0, "too many releases") 1108 | ref := atomic.AddInt64(&cr.refCount, -1) 1109 | if ref == 0 { 1110 | cr.col.Release() 1111 | for i := range cr.chunks { 1112 | cr.chunks[i].Release() 1113 | } 1114 | if cr.currentChunk != nil { 1115 | cr.currentChunk.Release() 1116 | cr.currentChunk = nil 1117 | } 1118 | cr.col = nil 1119 | cr.chunks = nil 1120 | cr.dtype = nil 1121 | } 1122 | } 1123 | 1124 | // Time32ChunkIterator is an iterator for reading an Arrow Column value by value. 1125 | type Time32ChunkIterator struct { 1126 | refCount int64 1127 | col *array.Column 1128 | 1129 | // Things Chunked maintains. We're going to maintain it ourselves. 1130 | chunks []*array.Time32 // cache the chunks on this iterator 1131 | length int64 // this isn't set right on Chunked so we won't rely on it there. Instead we keep the correct value here. 1132 | nulls int64 1133 | dtype arrow.DataType 1134 | 1135 | // Things we need to maintain for the iterator 1136 | currentIndex int // current chunk 1137 | currentChunk *array.Time32 // current chunk 1138 | } 1139 | 1140 | // NewTime32ChunkIterator creates a new Time32ChunkIterator for reading an Arrow Column. 
1141 | func NewTime32ChunkIterator(col *array.Column) *Time32ChunkIterator { 1142 | col.Retain() 1143 | 1144 | // Chunked is not using the correct type to keep track of length so we have to recalculate it. 1145 | columnChunks := col.Data().Chunks() 1146 | chunks := make([]*array.Time32, len(columnChunks)) 1147 | var length int64 1148 | var nulls int64 1149 | 1150 | for i, chunk := range columnChunks { 1151 | // Keep our own refs to chunks 1152 | chunks[i] = chunk.(*array.Time32) 1153 | // Retain the chunk 1154 | chunks[i].Retain() 1155 | 1156 | // Keep our own counters instead of Chunked's 1157 | length += int64(chunk.Len()) 1158 | nulls += int64(chunk.NullN()) 1159 | } 1160 | 1161 | return &Time32ChunkIterator{ 1162 | refCount: 1, 1163 | col: col, 1164 | 1165 | chunks: chunks, 1166 | length: length, 1167 | nulls: nulls, 1168 | dtype: col.DataType(), 1169 | 1170 | currentIndex: 0, 1171 | currentChunk: nil, 1172 | } 1173 | } 1174 | 1175 | // Chunk will return the current chunk that the iterator is on. 1176 | func (cr *Time32ChunkIterator) Chunk() *array.Time32 { return cr.currentChunk } 1177 | 1178 | // ChunkValues returns the underlying []arrow.Time32 chunk values. 1179 | // Keep in mind the []arrow.Time32 type might not be able 1180 | // to account for nil values. You must check for those explicitly via the chunk. 1181 | func (cr *Time32ChunkIterator) ChunkValues() []arrow.Time32 { return cr.Chunk().Time32Values() } 1182 | 1183 | // Next moves the iterator to the next chunk. This will return false 1184 | // when there are no more chunks. 
1185 | func (cr *Time32ChunkIterator) Next() bool { 1186 | if cr.currentIndex >= len(cr.chunks) { 1187 | return false 1188 | } 1189 | 1190 | if cr.currentChunk != nil { 1191 | cr.currentChunk.Release() 1192 | } 1193 | 1194 | cr.currentChunk = cr.chunks[cr.currentIndex] 1195 | cr.currentChunk.Retain() 1196 | cr.currentIndex++ 1197 | 1198 | return true 1199 | } 1200 | 1201 | // Retain keeps a reference to the Time32ChunkIterator 1202 | func (cr *Time32ChunkIterator) Retain() { 1203 | atomic.AddInt64(&cr.refCount, 1) 1204 | } 1205 | 1206 | // Release removes a reference to the Time32ChunkIterator 1207 | func (cr *Time32ChunkIterator) Release() { 1208 | debug.Assert(atomic.LoadInt64(&cr.refCount) > 0, "too many releases") 1209 | ref := atomic.AddInt64(&cr.refCount, -1) 1210 | if ref == 0 { 1211 | cr.col.Release() 1212 | for i := range cr.chunks { 1213 | cr.chunks[i].Release() 1214 | } 1215 | if cr.currentChunk != nil { 1216 | cr.currentChunk.Release() 1217 | cr.currentChunk = nil 1218 | } 1219 | cr.col = nil 1220 | cr.chunks = nil 1221 | cr.dtype = nil 1222 | } 1223 | } 1224 | 1225 | // Time64ChunkIterator is an iterator for reading an Arrow Column value by value. 1226 | type Time64ChunkIterator struct { 1227 | refCount int64 1228 | col *array.Column 1229 | 1230 | // Things Chunked maintains. We're going to maintain it ourselves. 1231 | chunks []*array.Time64 // cache the chunks on this iterator 1232 | length int64 // this isn't set right on Chunked so we won't rely on it there. Instead we keep the correct value here. 1233 | nulls int64 1234 | dtype arrow.DataType 1235 | 1236 | // Things we need to maintain for the iterator 1237 | currentIndex int // current chunk 1238 | currentChunk *array.Time64 // current chunk 1239 | } 1240 | 1241 | // NewTime64ChunkIterator creates a new Time64ChunkIterator for reading an Arrow Column. 
1242 | func NewTime64ChunkIterator(col *array.Column) *Time64ChunkIterator { 1243 | col.Retain() 1244 | 1245 | // Chunked is not using the correct type to keep track of length so we have to recalculate it. 1246 | columnChunks := col.Data().Chunks() 1247 | chunks := make([]*array.Time64, len(columnChunks)) 1248 | var length int64 1249 | var nulls int64 1250 | 1251 | for i, chunk := range columnChunks { 1252 | // Keep our own refs to chunks 1253 | chunks[i] = chunk.(*array.Time64) 1254 | // Retain the chunk 1255 | chunks[i].Retain() 1256 | 1257 | // Keep our own counters instead of Chunked's 1258 | length += int64(chunk.Len()) 1259 | nulls += int64(chunk.NullN()) 1260 | } 1261 | 1262 | return &Time64ChunkIterator{ 1263 | refCount: 1, 1264 | col: col, 1265 | 1266 | chunks: chunks, 1267 | length: length, 1268 | nulls: nulls, 1269 | dtype: col.DataType(), 1270 | 1271 | currentIndex: 0, 1272 | currentChunk: nil, 1273 | } 1274 | } 1275 | 1276 | // Chunk will return the current chunk that the iterator is on. 1277 | func (cr *Time64ChunkIterator) Chunk() *array.Time64 { return cr.currentChunk } 1278 | 1279 | // ChunkValues returns the underlying []arrow.Time64 chunk values. 1280 | // Keep in mind the []arrow.Time64 type might not be able 1281 | // to account for nil values. You must check for those explicitly via the chunk. 1282 | func (cr *Time64ChunkIterator) ChunkValues() []arrow.Time64 { return cr.Chunk().Time64Values() } 1283 | 1284 | // Next moves the iterator to the next chunk. This will return false 1285 | // when there are no more chunks. 
1286 | func (cr *Time64ChunkIterator) Next() bool { 1287 | if cr.currentIndex >= len(cr.chunks) { 1288 | return false 1289 | } 1290 | 1291 | if cr.currentChunk != nil { 1292 | cr.currentChunk.Release() 1293 | } 1294 | 1295 | cr.currentChunk = cr.chunks[cr.currentIndex] 1296 | cr.currentChunk.Retain() 1297 | cr.currentIndex++ 1298 | 1299 | return true 1300 | } 1301 | 1302 | // Retain keeps a reference to the Time64ChunkIterator 1303 | func (cr *Time64ChunkIterator) Retain() { 1304 | atomic.AddInt64(&cr.refCount, 1) 1305 | } 1306 | 1307 | // Release removes a reference to the Time64ChunkIterator 1308 | func (cr *Time64ChunkIterator) Release() { 1309 | debug.Assert(atomic.LoadInt64(&cr.refCount) > 0, "too many releases") 1310 | ref := atomic.AddInt64(&cr.refCount, -1) 1311 | if ref == 0 { 1312 | cr.col.Release() 1313 | for i := range cr.chunks { 1314 | cr.chunks[i].Release() 1315 | } 1316 | if cr.currentChunk != nil { 1317 | cr.currentChunk.Release() 1318 | cr.currentChunk = nil 1319 | } 1320 | cr.col = nil 1321 | cr.chunks = nil 1322 | cr.dtype = nil 1323 | } 1324 | } 1325 | 1326 | // Date32ChunkIterator is an iterator for reading an Arrow Column value by value. 1327 | type Date32ChunkIterator struct { 1328 | refCount int64 1329 | col *array.Column 1330 | 1331 | // Things Chunked maintains. We're going to maintain it ourselves. 1332 | chunks []*array.Date32 // cache the chunks on this iterator 1333 | length int64 // this isn't set right on Chunked so we won't rely on it there. Instead we keep the correct value here. 1334 | nulls int64 1335 | dtype arrow.DataType 1336 | 1337 | // Things we need to maintain for the iterator 1338 | currentIndex int // current chunk 1339 | currentChunk *array.Date32 // current chunk 1340 | } 1341 | 1342 | // NewDate32ChunkIterator creates a new Date32ChunkIterator for reading an Arrow Column. 
1343 | func NewDate32ChunkIterator(col *array.Column) *Date32ChunkIterator { 1344 | col.Retain() 1345 | 1346 | // Chunked is not using the correct type to keep track of length so we have to recalculate it. 1347 | columnChunks := col.Data().Chunks() 1348 | chunks := make([]*array.Date32, len(columnChunks)) 1349 | var length int64 1350 | var nulls int64 1351 | 1352 | for i, chunk := range columnChunks { 1353 | // Keep our own refs to chunks 1354 | chunks[i] = chunk.(*array.Date32) 1355 | // Retain the chunk 1356 | chunks[i].Retain() 1357 | 1358 | // Keep our own counters instead of Chunked's 1359 | length += int64(chunk.Len()) 1360 | nulls += int64(chunk.NullN()) 1361 | } 1362 | 1363 | return &Date32ChunkIterator{ 1364 | refCount: 1, 1365 | col: col, 1366 | 1367 | chunks: chunks, 1368 | length: length, 1369 | nulls: nulls, 1370 | dtype: col.DataType(), 1371 | 1372 | currentIndex: 0, 1373 | currentChunk: nil, 1374 | } 1375 | } 1376 | 1377 | // Chunk will return the current chunk that the iterator is on. 1378 | func (cr *Date32ChunkIterator) Chunk() *array.Date32 { return cr.currentChunk } 1379 | 1380 | // ChunkValues returns the underlying []arrow.Date32 chunk values. 1381 | // Keep in mind the []arrow.Date32 type might not be able 1382 | // to account for nil values. You must check for those explicitly via the chunk. 1383 | func (cr *Date32ChunkIterator) ChunkValues() []arrow.Date32 { return cr.Chunk().Date32Values() } 1384 | 1385 | // Next moves the iterator to the next chunk. This will return false 1386 | // when there are no more chunks. 
1387 | func (cr *Date32ChunkIterator) Next() bool { 1388 | if cr.currentIndex >= len(cr.chunks) { 1389 | return false 1390 | } 1391 | 1392 | if cr.currentChunk != nil { 1393 | cr.currentChunk.Release() 1394 | } 1395 | 1396 | cr.currentChunk = cr.chunks[cr.currentIndex] 1397 | cr.currentChunk.Retain() 1398 | cr.currentIndex++ 1399 | 1400 | return true 1401 | } 1402 | 1403 | // Retain keeps a reference to the Date32ChunkIterator 1404 | func (cr *Date32ChunkIterator) Retain() { 1405 | atomic.AddInt64(&cr.refCount, 1) 1406 | } 1407 | 1408 | // Release removes a reference to the Date32ChunkIterator 1409 | func (cr *Date32ChunkIterator) Release() { 1410 | debug.Assert(atomic.LoadInt64(&cr.refCount) > 0, "too many releases") 1411 | ref := atomic.AddInt64(&cr.refCount, -1) 1412 | if ref == 0 { 1413 | cr.col.Release() 1414 | for i := range cr.chunks { 1415 | cr.chunks[i].Release() 1416 | } 1417 | if cr.currentChunk != nil { 1418 | cr.currentChunk.Release() 1419 | cr.currentChunk = nil 1420 | } 1421 | cr.col = nil 1422 | cr.chunks = nil 1423 | cr.dtype = nil 1424 | } 1425 | } 1426 | 1427 | // Date64ChunkIterator is an iterator for reading an Arrow Column value by value. 1428 | type Date64ChunkIterator struct { 1429 | refCount int64 1430 | col *array.Column 1431 | 1432 | // Things Chunked maintains. We're going to maintain it ourselves. 1433 | chunks []*array.Date64 // cache the chunks on this iterator 1434 | length int64 // this isn't set right on Chunked so we won't rely on it there. Instead we keep the correct value here. 1435 | nulls int64 1436 | dtype arrow.DataType 1437 | 1438 | // Things we need to maintain for the iterator 1439 | currentIndex int // current chunk 1440 | currentChunk *array.Date64 // current chunk 1441 | } 1442 | 1443 | // NewDate64ChunkIterator creates a new Date64ChunkIterator for reading an Arrow Column. 
1444 | func NewDate64ChunkIterator(col *array.Column) *Date64ChunkIterator { 1445 | col.Retain() 1446 | 1447 | // Chunked is not using the correct type to keep track of length so we have to recalculate it. 1448 | columnChunks := col.Data().Chunks() 1449 | chunks := make([]*array.Date64, len(columnChunks)) 1450 | var length int64 1451 | var nulls int64 1452 | 1453 | for i, chunk := range columnChunks { 1454 | // Keep our own refs to chunks 1455 | chunks[i] = chunk.(*array.Date64) 1456 | // Retain the chunk 1457 | chunks[i].Retain() 1458 | 1459 | // Keep our own counters instead of Chunked's 1460 | length += int64(chunk.Len()) 1461 | nulls += int64(chunk.NullN()) 1462 | } 1463 | 1464 | return &Date64ChunkIterator{ 1465 | refCount: 1, 1466 | col: col, 1467 | 1468 | chunks: chunks, 1469 | length: length, 1470 | nulls: nulls, 1471 | dtype: col.DataType(), 1472 | 1473 | currentIndex: 0, 1474 | currentChunk: nil, 1475 | } 1476 | } 1477 | 1478 | // Chunk will return the current chunk that the iterator is on. 1479 | func (cr *Date64ChunkIterator) Chunk() *array.Date64 { return cr.currentChunk } 1480 | 1481 | // ChunkValues returns the underlying []arrow.Date64 chunk values. 1482 | // Keep in mind the []arrow.Date64 type might not be able 1483 | // to account for nil values. You must check for those explicitly via the chunk. 1484 | func (cr *Date64ChunkIterator) ChunkValues() []arrow.Date64 { return cr.Chunk().Date64Values() } 1485 | 1486 | // Next moves the iterator to the next chunk. This will return false 1487 | // when there are no more chunks. 
1488 | func (cr *Date64ChunkIterator) Next() bool { 1489 | if cr.currentIndex >= len(cr.chunks) { 1490 | return false 1491 | } 1492 | 1493 | if cr.currentChunk != nil { 1494 | cr.currentChunk.Release() 1495 | } 1496 | 1497 | cr.currentChunk = cr.chunks[cr.currentIndex] 1498 | cr.currentChunk.Retain() 1499 | cr.currentIndex++ 1500 | 1501 | return true 1502 | } 1503 | 1504 | // Retain keeps a reference to the Date64ChunkIterator 1505 | func (cr *Date64ChunkIterator) Retain() { 1506 | atomic.AddInt64(&cr.refCount, 1) 1507 | } 1508 | 1509 | // Release removes a reference to the Date64ChunkIterator 1510 | func (cr *Date64ChunkIterator) Release() { 1511 | debug.Assert(atomic.LoadInt64(&cr.refCount) > 0, "too many releases") 1512 | ref := atomic.AddInt64(&cr.refCount, -1) 1513 | if ref == 0 { 1514 | cr.col.Release() 1515 | for i := range cr.chunks { 1516 | cr.chunks[i].Release() 1517 | } 1518 | if cr.currentChunk != nil { 1519 | cr.currentChunk.Release() 1520 | cr.currentChunk = nil 1521 | } 1522 | cr.col = nil 1523 | cr.chunks = nil 1524 | cr.dtype = nil 1525 | } 1526 | } 1527 | -------------------------------------------------------------------------------- /iterator/chunkiterator.gen.go.tmpl: -------------------------------------------------------------------------------- 1 | package iterator 2 | 3 | import ( 4 | "sync/atomic" 5 | 6 | "github.com/go-bullseye/bullseye/internal/debug" 7 | "github.com/apache/arrow/go/arrow" 8 | "github.com/apache/arrow/go/arrow/array" 9 | ) 10 | 11 | {{range .In}} 12 | // {{.Name}}ChunkIterator is an iterator for reading an Arrow Column value by value. 13 | type {{.Name}}ChunkIterator struct { 14 | refCount int64 15 | col *array.Column 16 | 17 | // Things Chunked maintains. We're going to maintain it ourselves. 18 | chunks []*array.{{.Name}} // cache the chunks on this iterator 19 | length int64 // this isn't set right on Chunked so we won't rely on it there. Instead we keep the correct value here. 
20 | nulls int64 21 | dtype arrow.DataType 22 | 23 | // Things we need to maintain for the iterator 24 | currentIndex int // current chunk 25 | currentChunk *array.{{.Name}} // current chunk 26 | } 27 | 28 | // New{{.Name}}ChunkIterator creates a new {{.Name}}ChunkIterator for reading an Arrow Column. 29 | func New{{.Name}}ChunkIterator(col *array.Column) *{{.Name}}ChunkIterator { 30 | col.Retain() 31 | 32 | // Chunked is not using the correct type to keep track of length so we have to recalculate it. 33 | columnChunks := col.Data().Chunks() 34 | chunks := make([]*array.{{.Name}}, len(columnChunks)) 35 | var length int64 36 | var nulls int64 37 | 38 | for i, chunk := range columnChunks { 39 | // Keep our own refs to chunks 40 | chunks[i] = chunk.(*array.{{.Name}}) 41 | // Retain the chunk 42 | chunks[i].Retain() 43 | 44 | // Keep our own counters instead of Chunked's 45 | length += int64(chunk.Len()) 46 | nulls += int64(chunk.NullN()) 47 | } 48 | 49 | return &{{.Name}}ChunkIterator{ 50 | refCount: 1, 51 | col: col, 52 | 53 | chunks: chunks, 54 | length: length, 55 | nulls: nulls, 56 | dtype: col.DataType(), 57 | 58 | currentIndex: 0, 59 | currentChunk: nil, 60 | } 61 | } 62 | 63 | // Chunk will return the current chunk that the iterator is on. 64 | func (cr *{{.Name}}ChunkIterator) Chunk() *array.{{.Name}} { return cr.currentChunk } 65 | 66 | // ChunkValues returns the underlying []{{or .QualifiedType .Type}} chunk values. 67 | // Keep in mind the []{{or .QualifiedType .Type}} type might not be able 68 | // to account for nil values. You must check for those explicitly via the chunk. 69 | func (cr *{{.Name}}ChunkIterator) ChunkValues() []{{or .QualifiedType .Type}} { return cr.Chunk().{{.Name}}Values() } 70 | 71 | // Next moves the iterator to the next chunk. This will return false 72 | // when there are no more chunks. 
73 | func (cr *{{.Name}}ChunkIterator) Next() bool { 74 | if cr.currentIndex >= len(cr.chunks) { 75 | return false 76 | } 77 | 78 | if cr.currentChunk != nil { 79 | cr.currentChunk.Release() 80 | } 81 | 82 | cr.currentChunk = cr.chunks[cr.currentIndex] 83 | cr.currentChunk.Retain() 84 | cr.currentIndex++ 85 | 86 | return true 87 | } 88 | 89 | // Retain keeps a reference to the {{.Name}}ChunkIterator 90 | func (cr *{{.Name}}ChunkIterator) Retain() { 91 | atomic.AddInt64(&cr.refCount, 1) 92 | } 93 | 94 | // Release removes a reference to the {{.Name}}ChunkIterator 95 | func (cr *{{.Name}}ChunkIterator) Release() { 96 | debug.Assert(atomic.LoadInt64(&cr.refCount) > 0, "too many releases") 97 | ref := atomic.AddInt64(&cr.refCount, -1) 98 | if ref == 0 { 99 | cr.col.Release() 100 | for i := range cr.chunks { 101 | cr.chunks[i].Release() 102 | } 103 | if cr.currentChunk != nil { 104 | cr.currentChunk.Release() 105 | cr.currentChunk = nil 106 | } 107 | cr.col = nil 108 | cr.chunks = nil 109 | cr.dtype = nil 110 | } 111 | } 112 | 113 | 114 | {{end}} 115 | -------------------------------------------------------------------------------- /iterator/chunkiterator.go: -------------------------------------------------------------------------------- 1 | package iterator 2 | 3 | import ( 4 | "sync/atomic" 5 | 6 | "github.com/apache/arrow/go/arrow" 7 | "github.com/apache/arrow/go/arrow/array" 8 | "github.com/go-bullseye/bullseye/internal/debug" 9 | ) 10 | 11 | // ChunkIterator is a generic iterator for reading an Arrow Column chunk by chunk. 12 | type ChunkIterator struct { 13 | refCount int64 14 | col *array.Column 15 | 16 | // Things Chunked maintains. We're going to maintain it ourselves. 17 | chunks []array.Interface // cache the chunks on this iterator 18 | length int64 // this isn't set right on Chunked so we won't rely on it there. Instead we keep the correct value here. 
19 | nulls int64 20 | dtype arrow.DataType 21 | 22 | // Things we need to maintain for the iterator 23 | currentIndex int // current chunk 24 | currentChunk array.Interface // current chunk 25 | } 26 | 27 | // NewChunkIterator creates a new ChunkIterator for reading an Arrow Column. 28 | func NewChunkIterator(col *array.Column) *ChunkIterator { 29 | col.Retain() 30 | 31 | // Chunked is not using the correct type to keep track of length so we have to recalculate it. 32 | columnChunks := col.Data().Chunks() 33 | chunks := make([]array.Interface, len(columnChunks)) 34 | var length int64 35 | var nulls int64 36 | 37 | for i, chunk := range columnChunks { 38 | // Retain the chunk 39 | chunk.Retain() 40 | 41 | // Keep our own refs to chunks 42 | chunks[i] = chunk 43 | 44 | // Keep our own counters instead of Chunked's 45 | length += int64(chunk.Len()) 46 | nulls += int64(chunk.NullN()) 47 | } 48 | 49 | return &ChunkIterator{ 50 | refCount: 1, 51 | col: col, 52 | 53 | chunks: chunks, 54 | length: length, 55 | nulls: nulls, 56 | dtype: col.DataType(), 57 | 58 | currentIndex: 0, 59 | currentChunk: nil, 60 | } 61 | } 62 | 63 | // Chunk will return the current chunk that the iterator is on. 64 | func (cr *ChunkIterator) Chunk() array.Interface { return cr.currentChunk } 65 | 66 | // Next moves the iterator to the next chunk. This will return false 67 | // when there are no more chunks. 
68 | func (cr *ChunkIterator) Next() bool { 69 | if cr.currentIndex >= len(cr.chunks) { 70 | return false 71 | } 72 | 73 | if cr.currentChunk != nil { 74 | cr.currentChunk.Release() 75 | } 76 | 77 | cr.currentChunk = cr.chunks[cr.currentIndex] 78 | cr.currentChunk.Retain() 79 | cr.currentIndex++ 80 | 81 | return true 82 | } 83 | 84 | // Retain keeps a reference to the ChunkIterator 85 | func (cr *ChunkIterator) Retain() { 86 | atomic.AddInt64(&cr.refCount, 1) 87 | } 88 | 89 | // Release removes a reference to the ChunkIterator 90 | func (cr *ChunkIterator) Release() { 91 | debug.Assert(atomic.LoadInt64(&cr.refCount) > 0, "too many releases") 92 | ref := atomic.AddInt64(&cr.refCount, -1) 93 | if ref == 0 { 94 | cr.col.Release() 95 | for i := range cr.chunks { 96 | cr.chunks[i].Release() 97 | } 98 | if cr.currentChunk != nil { 99 | cr.currentChunk.Release() 100 | cr.currentChunk = nil 101 | } 102 | cr.col = nil 103 | cr.chunks = nil 104 | cr.dtype = nil 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /iterator/chunkiterator_test.go: -------------------------------------------------------------------------------- 1 | package iterator_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/apache/arrow/go/arrow" 7 | "github.com/apache/arrow/go/arrow/array" 8 | "github.com/apache/arrow/go/arrow/memory" 9 | "github.com/go-bullseye/bullseye/iterator" 10 | ) 11 | 12 | func buildRecords(pool *memory.CheckedAllocator, t *testing.T) ([]array.Record, *arrow.Schema) { 13 | schema := arrow.NewSchema( 14 | []arrow.Field{ 15 | {Name: "f1-i32", Type: arrow.PrimitiveTypes.Int32}, 16 | {Name: "f2-f64", Type: arrow.PrimitiveTypes.Float64}, 17 | }, 18 | nil, 19 | ) 20 | 21 | b := array.NewRecordBuilder(pool, schema) 22 | defer b.Release() 23 | 24 | b.Field(0).(*array.Int32Builder).AppendValues([]int32{1, 2, 3, 4, 5, 6}, nil) 25 | b.Field(0).(*array.Int32Builder).AppendValues([]int32{7, 8, 9, 10}, []bool{true, true, false, true}) 26 
| b.Field(1).(*array.Float64Builder).AppendValues([]float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, nil) 27 | 28 | rec1 := b.NewRecord() 29 | 30 | b.Field(0).(*array.Int32Builder).AppendValues([]int32{11, 12, 13, 14, 15, 16, 17, 18, 19, 20}, nil) 31 | b.Field(1).(*array.Float64Builder).AppendValues([]float64{11, 12, 13, 14, 15, 16, 17, 18, 19, 20}, nil) 32 | 33 | rec2 := b.NewRecord() 34 | 35 | b.Field(0).(*array.Int32Builder).AppendValues([]int32{31, 32, 33, 34, 35, 36, 37, 38, 39, 40}, nil) 36 | b.Field(1).(*array.Float64Builder).AppendValues([]float64{31, 32, 33, 34, 35, 36, 37, 38, 39, 40}, nil) 37 | 38 | rec3 := b.NewRecord() 39 | 40 | return []array.Record{rec1, rec2, rec3}, schema 41 | } 42 | 43 | func TestChunkIterator(t *testing.T) { 44 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 45 | defer pool.AssertSize(t, 0) 46 | 47 | records, schema := buildRecords(pool, t) 48 | defer func() { 49 | for i := range records { 50 | records[i].Release() 51 | } 52 | }() 53 | 54 | expectedPtrs := make([]*int32, len(records)) 55 | for i := range expectedPtrs { 56 | expectedPtrs[i] = &records[i].Column(0).(*array.Int32).Int32Values()[0] 57 | } 58 | 59 | tbl := array.NewTableFromRecords(schema, records) 60 | defer tbl.Release() 61 | 62 | column := tbl.Column(0) 63 | cr := iterator.NewChunkIterator(column) 64 | defer cr.Release() 65 | 66 | n := 0 67 | for cr.Next() { 68 | values := cr.Chunk().(*array.Int32).Int32Values() 69 | if got, want := &values[0], expectedPtrs[n]; got != want { 70 | t.Fatalf("got=%d, want=%d", got, want) 71 | } 72 | n++ 73 | } 74 | } 75 | 76 | func TestInt32ChunkIterator(t *testing.T) { 77 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 78 | defer pool.AssertSize(t, 0) 79 | 80 | records, schema := buildRecords(pool, t) 81 | defer func() { 82 | for i := range records { 83 | records[i].Release() 84 | } 85 | }() 86 | 87 | expectedPtrs := make([]*int32, len(records)) 88 | for i := range expectedPtrs { 89 | expectedPtrs[i] = 
&records[i].Column(0).(*array.Int32).Int32Values()[0] 90 | } 91 | 92 | tbl := array.NewTableFromRecords(schema, records) 93 | defer tbl.Release() 94 | 95 | column := tbl.Column(0) 96 | cr := iterator.NewInt32ChunkIterator(column) 97 | defer cr.Release() 98 | 99 | n := 0 100 | for cr.Next() { 101 | values := cr.ChunkValues() 102 | if got, want := &values[0], expectedPtrs[n]; got != want { 103 | t.Fatalf("got=%d, want=%d", got, want) 104 | } 105 | n++ 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /iterator/doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package iterator provides iterators for chunks and values. 3 | 4 | Since Arrow can store chunks larger than the max int64 (9223372036854775807) due to how it 5 | store chunks, it's best to use iterators to iterate over chunks and their values. 6 | 7 | There are generic ChunkIterator and ValueIterator implementations as well as specific 8 | generated Arrow types for each of them, i.e. Float64ChunkIterator and Float64ValueIterator. 9 | 10 | */ 11 | package iterator 12 | -------------------------------------------------------------------------------- /iterator/stepiterator.go: -------------------------------------------------------------------------------- 1 | package iterator 2 | 3 | import ( 4 | "sync/atomic" 5 | 6 | "github.com/apache/arrow/go/arrow" 7 | "github.com/apache/arrow/go/arrow/array" 8 | "github.com/go-bullseye/bullseye/internal/debug" 9 | ) 10 | 11 | // StepValue holds the value for a given step. 12 | type StepValue struct { 13 | Values []interface{} 14 | Exists []bool 15 | Dtypes []arrow.DataType 16 | } 17 | 18 | // Value returns the value at index i and the data type for that value. 19 | func (sv StepValue) Value(i int) (interface{}, arrow.DataType) { 20 | return sv.Values[i], sv.Dtypes[i] 21 | } 22 | 23 | // StepIterator iterates over multiple iterators in step. 
24 | type StepIterator interface { 25 | Values() *StepValue 26 | Next() bool 27 | Retain() 28 | Release() 29 | } 30 | 31 | // stepIterator has a max number of elements it 32 | // can iterator over that must fit into uint64 33 | // which I doubt anyone is going to go over. 34 | type stepIterator struct { 35 | refCount int64 36 | iterators []ValueIterator 37 | index uint64 38 | stepValue *StepValue 39 | dtypes []arrow.DataType 40 | } 41 | 42 | // NewStepIteratorForColumns creates a new StepIterator given a slice of columns. 43 | func NewStepIteratorForColumns(cols []array.Column) StepIterator { 44 | itrs := make([]ValueIterator, 0, len(cols)) 45 | dtypes := make([]arrow.DataType, 0, len(cols)) 46 | for i := range cols { 47 | itrs = append(itrs, NewValueIterator(&cols[i])) 48 | dtypes = append(dtypes, cols[i].DataType()) 49 | } 50 | // NewStepIterator will retain the value iterators refs 51 | // so we need to remove our ref to them. 52 | for i := range itrs { 53 | defer itrs[i].Release() 54 | } 55 | return NewStepIterator(dtypes, itrs...) 56 | } 57 | 58 | // NewStepIterator creates a new StepIterator given a bunch of ValueIterators. 59 | func NewStepIterator(dtypes []arrow.DataType, iterators ...ValueIterator) StepIterator { 60 | for i := range iterators { 61 | iterators[i].Retain() 62 | } 63 | return &stepIterator{ 64 | refCount: 1, 65 | iterators: iterators, 66 | index: 0, 67 | dtypes: dtypes, 68 | } 69 | } 70 | 71 | // Values returns the values in the current step as a StepValue. 72 | func (s *stepIterator) Values() *StepValue { 73 | return s.stepValue 74 | } 75 | 76 | // Next returns false when there are no more rows in any iterator. 
77 | func (s *stepIterator) Next() bool { 78 | // build the step values 79 | step := &StepValue{ 80 | Values: make([]interface{}, len(s.iterators)), 81 | Exists: make([]bool, len(s.iterators)), 82 | Dtypes: s.dtypes, 83 | } 84 | 85 | next := false 86 | for i, iterator := range s.iterators { 87 | exists := iterator.Next() 88 | next = exists || next 89 | step.Exists[i] = exists 90 | 91 | if exists { 92 | step.Values[i] = iterator.ValueInterface() 93 | } else { 94 | step.Values[i] = nil 95 | } 96 | } 97 | 98 | s.stepValue = step 99 | return next 100 | } 101 | 102 | func (s *stepIterator) Retain() { 103 | atomic.AddInt64(&s.refCount, 1) 104 | } 105 | 106 | func (s *stepIterator) Release() { 107 | refs := atomic.AddInt64(&s.refCount, -1) 108 | debug.Assert(refs >= 0, "too many releases") 109 | if refs == 0 { 110 | for i := range s.iterators { 111 | s.iterators[i].Release() 112 | } 113 | s.iterators = nil 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /iterator/stepiterator_test.go: -------------------------------------------------------------------------------- 1 | package iterator_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/apache/arrow/go/arrow/array" 7 | "github.com/apache/arrow/go/arrow/memory" 8 | "github.com/go-bullseye/bullseye/iterator" 9 | ) 10 | 11 | func TestNewStepIteratorForColumns(t *testing.T) { 12 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 13 | defer pool.AssertSize(t, 0) 14 | 15 | records, schema := buildRecords(pool, t) 16 | for i := range records { 17 | defer records[i].Release() 18 | } 19 | 20 | tbl := array.NewTableFromRecords(schema, records) 21 | defer tbl.Release() 22 | 23 | cols := make([]array.Column, 0, tbl.NumCols()) 24 | for i := 0; i < int(tbl.NumCols()); i++ { 25 | cols = append(cols, *tbl.Column(i)) 26 | } 27 | 28 | it := iterator.NewStepIteratorForColumns(cols) 29 | defer it.Release() 30 | } 31 | 
-------------------------------------------------------------------------------- /iterator/stringiterator.go: -------------------------------------------------------------------------------- 1 | package iterator 2 | 3 | import ( 4 | "sync/atomic" 5 | 6 | "github.com/apache/arrow/go/arrow/array" 7 | "github.com/go-bullseye/bullseye/internal/debug" 8 | ) 9 | 10 | // StringValueIterator is an iterator for reading an Arrow Column 11 | // value by value for variable-length UTF-8 strings. 12 | type StringValueIterator struct { 13 | refCount int64 14 | chunkIterator *ChunkIterator 15 | 16 | // Things we need to maintain for the iterator 17 | index int // current value index 18 | ref *array.String // the chunk reference 19 | done bool // there are no more elements for this iterator 20 | } 21 | 22 | // NewStringValueIterator creates a new StringValueIterator for reading an Arrow Column. 23 | func NewStringValueIterator(col *array.Column) *StringValueIterator { 24 | // We need a ChunkIterator to read the chunks 25 | chunkIterator := NewChunkIterator(col) 26 | 27 | return &StringValueIterator{ 28 | refCount: 1, 29 | chunkIterator: chunkIterator, 30 | 31 | index: 0, 32 | ref: nil, 33 | } 34 | } 35 | 36 | // Value will return the current value that the iterator is on and boolean value indicating if the value is actually null. 37 | func (vr *StringValueIterator) Value() (string, bool) { 38 | return vr.ref.Value(vr.index), vr.ref.IsNull(vr.index) 39 | } 40 | 41 | // ValuePointer will return a pointer to the current value that the iterator is on. It will return nil if the value is actually null. 42 | func (vr *StringValueIterator) ValuePointer() *string { 43 | if vr.ref.IsNull(vr.index) { 44 | return nil 45 | } 46 | value := vr.ref.Value(vr.index) 47 | return &value 48 | } 49 | 50 | // ValueInterface returns the value as an interface{}. 
51 | func (vr *StringValueIterator) ValueInterface() interface{} { 52 | if vr.ref.IsNull(vr.index) { 53 | return nil 54 | } 55 | return vr.ref.Value(vr.index) 56 | } 57 | 58 | // Next moves the iterator to the next value. This will return false 59 | // when there are no more values. 60 | func (vr *StringValueIterator) Next() bool { 61 | if vr.done { 62 | return false 63 | } 64 | 65 | // Move the index up 66 | vr.index++ 67 | 68 | // Keep moving the chunk up until we get one with data 69 | for vr.ref == nil || vr.index >= vr.ref.Len() { 70 | if !vr.nextChunk() { 71 | // There were no more chunks with data in them 72 | vr.done = true 73 | return false 74 | } 75 | } 76 | 77 | return true 78 | } 79 | 80 | func (vr *StringValueIterator) nextChunk() bool { 81 | // Advance the chunk until we get one with data in it or we are done 82 | if !vr.chunkIterator.Next() { 83 | // No more chunks 84 | return false 85 | } 86 | 87 | // There was another chunk. 88 | // We maintain the ref and the values because the ref is going to allow us to retain the memory. 
89 | ref := vr.chunkIterator.Chunk() 90 | ref.Retain() 91 | 92 | if vr.ref != nil { 93 | vr.ref.Release() 94 | } 95 | 96 | vr.ref = ref.(*array.String) 97 | vr.index = 0 98 | return true 99 | } 100 | 101 | // Retain keeps a reference to the StringValueIterator 102 | func (vr *StringValueIterator) Retain() { 103 | atomic.AddInt64(&vr.refCount, 1) 104 | } 105 | 106 | // Release removes a reference to the StringValueIterator 107 | func (vr *StringValueIterator) Release() { 108 | debug.Assert(atomic.LoadInt64(&vr.refCount) > 0, "too many releases") 109 | 110 | if atomic.AddInt64(&vr.refCount, -1) == 0 { 111 | if vr.chunkIterator != nil { 112 | vr.chunkIterator.Release() 113 | vr.chunkIterator = nil 114 | } 115 | 116 | if vr.ref != nil { 117 | vr.ref.Release() 118 | vr.ref = nil 119 | } 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /iterator/valueiterator.gen.go.tmpl: -------------------------------------------------------------------------------- 1 | package iterator 2 | 3 | import ( 4 | "sync/atomic" 5 | 6 | "github.com/go-bullseye/bullseye/internal/debug" 7 | "github.com/apache/arrow/go/arrow/array" 8 | ) 9 | 10 | {{range .In}} 11 | // {{.Name}}ValueIterator is an iterator for reading an Arrow Column value by value. 12 | type {{.Name}}ValueIterator struct { 13 | refCount int64 14 | chunkIterator *{{.Name}}ChunkIterator 15 | 16 | // Things we need to maintain for the iterator 17 | index int // current value index 18 | values []{{or .QualifiedType .Type}} // current chunk values 19 | ref *array.{{.Name}} // the chunk reference 20 | done bool // there are no more elements for this iterator 21 | } 22 | 23 | // New{{.Name}}ValueIterator creates a new {{.Name}}ValueIterator for reading an Arrow Column. 
24 | func New{{.Name}}ValueIterator(col *array.Column) *{{.Name}}ValueIterator { 25 | // We need a ChunkIterator to read the chunks 26 | chunkIterator := New{{.Name}}ChunkIterator(col) 27 | 28 | return &{{.Name}}ValueIterator{ 29 | refCount: 1, 30 | chunkIterator: chunkIterator, 31 | 32 | index: 0, 33 | values: nil, 34 | } 35 | } 36 | 37 | // Value will return the current value that the iterator is on and boolean value indicating if the value is actually null. 38 | func (vr *{{.Name}}ValueIterator) Value() ({{or .QualifiedType .Type}}, bool) { 39 | return vr.values[vr.index], vr.ref.IsNull(vr.index) 40 | } 41 | 42 | // ValuePointer will return a pointer to the current value that the iterator is on. It will return nil if the value is actually null. 43 | func (vr *{{.Name}}ValueIterator) ValuePointer() *{{or .QualifiedType .Type}} { 44 | if vr.ref.IsNull(vr.index) { 45 | return nil 46 | } 47 | return &vr.values[vr.index] 48 | } 49 | 50 | // ValueInterface returns the current value as an interface{}. 51 | func (vr *{{.Name}}ValueIterator) ValueInterface() interface{} { 52 | if vr.ref.IsNull(vr.index) { 53 | return nil 54 | } 55 | return vr.values[vr.index] 56 | } 57 | 58 | // Next moves the iterator to the next value. This will return false 59 | // when there are no more values. 60 | func (vr *{{.Name}}ValueIterator) Next() bool { 61 | if vr.done { 62 | return false 63 | } 64 | 65 | // Move the index up 66 | vr.index++ 67 | 68 | // Keep moving the chunk up until we get one with data 69 | for vr.values == nil || vr.index >= len(vr.values) { 70 | if !vr.nextChunk() { 71 | // There were no more chunks with data in them 72 | vr.done = true 73 | return false 74 | } 75 | } 76 | 77 | return true 78 | } 79 | 80 | func (vr *{{.Name}}ValueIterator) nextChunk() bool { 81 | // Advance the chunk until we get one with data in it or we are done 82 | if !vr.chunkIterator.Next() { 83 | // No more chunks 84 | return false 85 | } 86 | 87 | // There was another chunk. 
88 | // We maintain the ref and the values because the ref is going to allow us to retain the memory. 89 | ref := vr.chunkIterator.Chunk() 90 | ref.Retain() 91 | 92 | if vr.ref != nil { 93 | vr.ref.Release() 94 | } 95 | 96 | vr.ref = ref 97 | vr.values = vr.chunkIterator.ChunkValues() 98 | vr.index = 0 99 | return true 100 | } 101 | 102 | // Retain keeps a reference to the {{.Name}}ValueIterator. 103 | func (vr *{{.Name}}ValueIterator) Retain() { 104 | atomic.AddInt64(&vr.refCount, 1) 105 | } 106 | 107 | // Release removes a reference to the {{.Name}}ValueIterator. 108 | func (vr *{{.Name}}ValueIterator) Release() { 109 | refs := atomic.AddInt64(&vr.refCount, -1) 110 | debug.Assert(refs >= 0, "too many releases") 111 | if refs == 0 { 112 | if vr.chunkIterator != nil { 113 | vr.chunkIterator.Release() 114 | vr.chunkIterator = nil 115 | } 116 | 117 | if vr.ref != nil { 118 | vr.ref.Release() 119 | vr.ref = nil 120 | } 121 | vr.values = nil 122 | } 123 | } 124 | 125 | 126 | {{end}} 127 | -------------------------------------------------------------------------------- /iterator/valueiterator.go: -------------------------------------------------------------------------------- 1 | package iterator 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/apache/arrow/go/arrow" 7 | "github.com/apache/arrow/go/arrow/array" 8 | ) 9 | 10 | // ValueIterator is a generic iterator for scanning over values. 11 | type ValueIterator interface { 12 | // ValueInterface returns the current value as an interface{}. 13 | ValueInterface() interface{} 14 | 15 | // Next moves the iterator to the next value. This will return false when there are no more values. 16 | Next() bool 17 | 18 | // Retain keeps a reference to the ValueIterator. 19 | Retain() 20 | 21 | // Release removes a reference to the ValueIterator. 22 | Release() 23 | } 24 | 25 | // NewValueIterator creates a new generic ValueIterator. 
26 | func NewValueIterator(column *array.Column) ValueIterator { 27 | field := column.Field() 28 | switch field.Type.(type) { 29 | case *arrow.Int8Type: 30 | return NewInt8ValueIterator(column) 31 | case *arrow.Int16Type: 32 | return NewInt16ValueIterator(column) 33 | case *arrow.Int32Type: 34 | return NewInt32ValueIterator(column) 35 | case *arrow.Int64Type: 36 | return NewInt64ValueIterator(column) 37 | case *arrow.Uint8Type: 38 | return NewUint8ValueIterator(column) 39 | case *arrow.Uint16Type: 40 | return NewUint16ValueIterator(column) 41 | case *arrow.Uint32Type: 42 | return NewUint32ValueIterator(column) 43 | case *arrow.Uint64Type: 44 | return NewUint64ValueIterator(column) 45 | case *arrow.Float32Type: 46 | return NewFloat32ValueIterator(column) 47 | case *arrow.Float64Type: 48 | return NewFloat64ValueIterator(column) 49 | case *arrow.Date32Type: 50 | return NewDate32ValueIterator(column) 51 | case *arrow.Date64Type: 52 | return NewDate64ValueIterator(column) 53 | case *arrow.BooleanType: 54 | return NewBooleanValueIterator(column) 55 | case *arrow.StringType: 56 | return NewStringValueIterator(column) 57 | 58 | default: 59 | panic(fmt.Errorf("dataframe/valueiterator: unhandled field type %T", field.Type)) 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /iterator/valueiterator_test.go: -------------------------------------------------------------------------------- 1 | package iterator_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/apache/arrow/go/arrow" 7 | "github.com/apache/arrow/go/arrow/array" 8 | "github.com/apache/arrow/go/arrow/memory" 9 | "github.com/go-bullseye/bullseye/iterator" 10 | ) 11 | 12 | func TestInt32ValueIterator(t *testing.T) { 13 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 14 | defer pool.AssertSize(t, 0) 15 | 16 | records, schema := buildRecords(pool, t) 17 | var numRows int64 18 | for i := range records { 19 | defer records[i].Release() 20 | numRows += 
records[i].NumRows() 21 | } 22 | 23 | expectedValues := make([]int32, 0, numRows) 24 | expectedValuesBool := make([]bool, 0, numRows) 25 | for i := range records { 26 | ref := records[i].Column(0).(*array.Int32) 27 | values := ref.Int32Values() 28 | for j := range values { 29 | expectedValues = append(expectedValues, values[j]) 30 | expectedValuesBool = append(expectedValuesBool, ref.IsNull(j)) 31 | } 32 | } 33 | 34 | tbl := array.NewTableFromRecords(schema, records) 35 | defer tbl.Release() 36 | 37 | column := tbl.Column(0) 38 | cr := iterator.NewInt32ValueIterator(column) 39 | defer cr.Release() 40 | 41 | n := 0 42 | for cr.Next() { 43 | value, null := cr.Value() 44 | if got, want := value, expectedValues[n]; got != want { 45 | t.Fatalf("got=%d, want=%d", got, want) 46 | } 47 | if got, want := null, expectedValuesBool[n]; got != want { 48 | t.Fatalf("got=%v, want=%v", got, want) 49 | } 50 | n++ 51 | } 52 | } 53 | 54 | func TestInt32ValueIteratorPointer(t *testing.T) { 55 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 56 | defer pool.AssertSize(t, 0) 57 | 58 | records, schema := buildRecords(pool, t) 59 | var numRows int64 60 | for i := range records { 61 | defer records[i].Release() 62 | numRows += records[i].NumRows() 63 | } 64 | 65 | expectedPtrs := make([]*int32, 0, numRows) 66 | for i := range records { 67 | ref := records[i].Column(0).(*array.Int32) 68 | values := ref.Int32Values() 69 | for j := range values { 70 | if ref.IsNull(j) { 71 | expectedPtrs = append(expectedPtrs, nil) 72 | } else { 73 | expectedPtrs = append(expectedPtrs, &values[j]) 74 | } 75 | } 76 | } 77 | 78 | tbl := array.NewTableFromRecords(schema, records) 79 | defer tbl.Release() 80 | 81 | column := tbl.Column(0) 82 | cr := iterator.NewInt32ValueIterator(column) 83 | defer cr.Release() 84 | 85 | n := 0 86 | for cr.Next() { 87 | value := cr.ValuePointer() 88 | if got, want := value, expectedPtrs[n]; got != want { 89 | t.Fatalf("got=%d, want=%d", got, want) 90 | } 91 | n++ 92 
| } 93 | } 94 | 95 | func TestFloat64ValueIterator(t *testing.T) { 96 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 97 | defer pool.AssertSize(t, 0) 98 | 99 | schema := arrow.NewSchema( 100 | []arrow.Field{ 101 | {Name: "f2-f64", Type: arrow.PrimitiveTypes.Float64}, 102 | }, 103 | nil, 104 | ) 105 | 106 | b := array.NewRecordBuilder(pool, schema) 107 | defer b.Release() 108 | 109 | expectedValues := []float64{ 110 | 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 111 | 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 112 | 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 113 | } 114 | 115 | expectedValuesBool := []bool{ 116 | true, true, true, true, true, true, true, true, true, true, 117 | true, false, true, false, true, true, true, true, true, false, 118 | true, true, true, true, true, true, true, true, true, true, 119 | } 120 | 121 | b.Field(0).(*array.Float64Builder).AppendValues(expectedValues[0:10], nil) 122 | rec1 := b.NewRecord() 123 | defer rec1.Release() 124 | 125 | b.Field(0).(*array.Float64Builder).AppendValues(expectedValues[10:20], expectedValuesBool[10:20]) 126 | rec2 := b.NewRecord() 127 | defer rec2.Release() 128 | 129 | b.Field(0).(*array.Float64Builder).AppendValues(expectedValues[20:30], nil) 130 | rec3 := b.NewRecord() 131 | defer rec3.Release() 132 | 133 | records := []array.Record{rec1, rec2, rec3} 134 | tbl := array.NewTableFromRecords(schema, records) 135 | defer tbl.Release() 136 | column := tbl.Column(0) 137 | vr := iterator.NewFloat64ValueIterator(column) 138 | defer vr.Release() 139 | 140 | n := 0 141 | for vr.Next() { 142 | value, null := vr.Value() 143 | if got, want := value, expectedValues[n]; got != want { 144 | t.Fatalf("got=%f, want=%f", got, want) 145 | } 146 | if got, want := !null, expectedValuesBool[n]; got != want { 147 | t.Fatalf("got=%v, want=%v (n=%d)", got, want, n) 148 | } 149 | n++ 150 | } 151 | } 152 | 153 | func TestDate32ValueIterator(t *testing.T) { 154 | t.Skip("TODO: Implement.") 155 | } 156 | 157 | func TestDate64ValueIterator(t 
*testing.T) { 158 | t.Skip("TODO: Implement.") 159 | } 160 | 161 | func TestBooleanValueIterator(t *testing.T) { 162 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 163 | defer pool.AssertSize(t, 0) 164 | 165 | schema := arrow.NewSchema( 166 | []arrow.Field{ 167 | {Name: "c1-bools", Type: arrow.FixedWidthTypes.Boolean}, 168 | }, 169 | nil, 170 | ) 171 | 172 | b := array.NewRecordBuilder(pool, schema) 173 | defer b.Release() 174 | 175 | expectedValues := []bool{ 176 | true, true, true, true, true, true, true, true, true, true, 177 | false, false, false, false, false, false, false, false, false, false, 178 | true, true, false, true, true, true, true, true, true, true, 179 | } 180 | 181 | expectedValuesBool := []bool{ 182 | true, true, true, true, true, true, true, true, true, true, 183 | true, false, true, false, true, true, true, true, true, false, 184 | true, true, true, true, true, true, true, true, true, true, 185 | } 186 | 187 | b.Field(0).(*array.BooleanBuilder).AppendValues(expectedValues[0:10], nil) 188 | rec1 := b.NewRecord() 189 | defer rec1.Release() 190 | 191 | b.Field(0).(*array.BooleanBuilder).AppendValues(expectedValues[10:20], expectedValuesBool[10:20]) 192 | rec2 := b.NewRecord() 193 | defer rec2.Release() 194 | 195 | b.Field(0).(*array.BooleanBuilder).AppendValues(expectedValues[20:30], nil) 196 | rec3 := b.NewRecord() 197 | defer rec3.Release() 198 | 199 | records := []array.Record{rec1, rec2, rec3} 200 | tbl := array.NewTableFromRecords(schema, records) 201 | defer tbl.Release() 202 | column := tbl.Column(0) 203 | vr := iterator.NewBooleanValueIterator(column) 204 | defer vr.Release() 205 | 206 | n := 0 207 | for vr.Next() { 208 | value, null := vr.Value() 209 | if got, want := value, expectedValues[n]; got != want { 210 | t.Fatalf("got=%t, want=%t", got, want) 211 | } 212 | if got, want := !null, expectedValuesBool[n]; got != want { 213 | t.Fatalf("got=%v, want=%v (n=%d)", got, want, n) 214 | } 215 | n++ 216 | } 217 | } 218 | 219 | 
func TestStringValueIterator(t *testing.T) { 220 | pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) 221 | defer pool.AssertSize(t, 0) 222 | 223 | schema := arrow.NewSchema( 224 | []arrow.Field{ 225 | {Name: "c1-strings", Type: arrow.BinaryTypes.String}, 226 | }, 227 | nil, 228 | ) 229 | 230 | b := array.NewRecordBuilder(pool, schema) 231 | defer b.Release() 232 | 233 | expectedValues := []string{ 234 | "true", "aaa", "true", "true", "true", "ccc", "true", "d", "true", "e", 235 | "false", "false", "false", "false", "false", "false", "false", "dog", "false", "false", 236 | "true", "true", "bbb", "true", "true", "true", "true", "true", "cat", "true", 237 | } 238 | 239 | expectedValuesBool := []bool{ 240 | true, true, true, true, true, true, true, true, true, true, 241 | true, false, true, false, true, true, true, true, true, false, 242 | true, true, true, true, true, true, true, true, true, true, 243 | } 244 | 245 | b.Field(0).(*array.StringBuilder).AppendValues(expectedValues[0:10], nil) 246 | rec1 := b.NewRecord() 247 | defer rec1.Release() 248 | 249 | b.Field(0).(*array.StringBuilder).AppendValues(expectedValues[10:20], expectedValuesBool[10:20]) 250 | rec2 := b.NewRecord() 251 | defer rec2.Release() 252 | 253 | b.Field(0).(*array.StringBuilder).AppendValues(expectedValues[20:30], nil) 254 | rec3 := b.NewRecord() 255 | defer rec3.Release() 256 | 257 | records := []array.Record{rec1, rec2, rec3} 258 | tbl := array.NewTableFromRecords(schema, records) 259 | defer tbl.Release() 260 | column := tbl.Column(0) 261 | vr := iterator.NewStringValueIterator(column) 262 | defer vr.Release() 263 | 264 | n := 0 265 | for vr.Next() { 266 | value, null := vr.Value() 267 | if got, want := value, expectedValues[n]; got != want { 268 | t.Fatalf("got=%s, want=%s", got, want) 269 | } 270 | if got, want := !null, expectedValuesBool[n]; got != want { 271 | t.Fatalf("got=%v, want=%v (n=%d)", got, want, n) 272 | } 273 | n++ 274 | } 275 | } 276 | 
-------------------------------------------------------------------------------- /numeric.tmpldata: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "Name": "Int64", 4 | "name": "int64", 5 | "Type": "int64", 6 | "Default": "0", 7 | "Size": "8" 8 | }, 9 | { 10 | "Name": "Uint64", 11 | "name": "uint64", 12 | "Type": "uint64", 13 | "Default": "0", 14 | "Size": "8" 15 | }, 16 | { 17 | "Name": "Float64", 18 | "name": "float64", 19 | "Type": "float64", 20 | "Default": "0", 21 | "Size": "8" 22 | }, 23 | { 24 | "Name": "Int32", 25 | "name": "int32", 26 | "Type": "int32", 27 | "Default": "0", 28 | "Size": "4", 29 | "Opt": { 30 | "BufferBuilder": true 31 | } 32 | }, 33 | { 34 | "Name": "Uint32", 35 | "name": "uint32", 36 | "Type": "uint32", 37 | "Default": "0", 38 | "Size": "4" 39 | }, 40 | { 41 | "Name": "Float32", 42 | "name": "float32", 43 | "Type": "float32", 44 | "Default": "0", 45 | "Size": "4" 46 | }, 47 | { 48 | "Name": "Int16", 49 | "name": "int16", 50 | "Type": "int16", 51 | "Default": "0", 52 | "Size": "2" 53 | }, 54 | { 55 | "Name": "Uint16", 56 | "name": "uint16", 57 | "Type": "uint16", 58 | "Default": "0", 59 | "Size": "2" 60 | }, 61 | { 62 | "Name": "Int8", 63 | "name": "int8", 64 | "Type": "int8", 65 | "Default": "0", 66 | "Size": "1" 67 | }, 68 | { 69 | "Name": "Uint8", 70 | "name": "uint8", 71 | "Type": "uint8", 72 | "Default": "0", 73 | "Size": "1" 74 | }, 75 | { 76 | "Name": "Timestamp", 77 | "name": "timestamp", 78 | "Type": "Timestamp", 79 | "QualifiedType": "arrow.Timestamp", 80 | "InternalType": "int64", 81 | "Default": "0", 82 | "Size": "8", 83 | "Opt": { 84 | "Parametric": true 85 | } 86 | }, 87 | { 88 | "Name": "Time32", 89 | "name": "time32", 90 | "Type": "Time32", 91 | "QualifiedType": "arrow.Time32", 92 | "InternalType": "int32", 93 | "Default": "0", 94 | "Size": "4", 95 | "Opt": { 96 | "Parametric": true 97 | } 98 | }, 99 | { 100 | "Name": "Time64", 101 | "name": "time64", 102 | "Type": "Time64", 
103 | "QualifiedType": "arrow.Time64", 104 | "InternalType": "int64", 105 | "Default": "0", 106 | "Size": "8", 107 | "Opt": { 108 | "Parametric": true 109 | } 110 | }, 111 | { 112 | "Name": "Date32", 113 | "name": "date32", 114 | "Type": "Date32", 115 | "QualifiedType": "arrow.Date32", 116 | "InternalType": "int32", 117 | "Default": "0", 118 | "Size": "4" 119 | }, 120 | { 121 | "Name": "Date64", 122 | "name": "date64", 123 | "Type": "Date64", 124 | "QualifiedType": "arrow.Date64", 125 | "InternalType": "int64", 126 | "Default": "0", 127 | "Size": "8" 128 | } 129 | ] 130 | -------------------------------------------------------------------------------- /tools.go: -------------------------------------------------------------------------------- 1 | // +build tools 2 | 3 | package tools 4 | 5 | import ( 6 | _ "github.com/apache/arrow/go/arrow/_tools/tmpl" 7 | ) --------------------------------------------------------------------------------