├── .github └── dependabot.yaml ├── .gitignore ├── LICENCE.txt ├── README.md ├── bodkin.go ├── cmd ├── .gitignore └── main.go ├── go.mod ├── go.sum ├── json2parquet ├── .gitignore ├── cmd │ ├── .gitignore │ ├── cleaner │ │ └── main.go │ └── main.go └── json2parquet.go ├── option.go ├── pq └── parquet_writer.go ├── reader ├── .gitignore ├── encoder.go ├── input.go ├── loader.go ├── option.go ├── reader.go └── recordfactory.go ├── schema.go └── types.go /.github/dependabot.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: gomod 4 | directory: / 5 | schedule: 6 | interval: daily -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # If you prefer the allow list template instead of the deny list, see community template: 2 | # https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore 3 | # 4 | # Binaries for programs and plugins 5 | *.exe 6 | *.exe~ 7 | *.dll 8 | *.so 9 | *.dylib 10 | 11 | # Test binary, built with `go test -c` 12 | *.test 13 | 14 | # Output of the go coverage tool, specifically when used with LiteIDE 15 | *.out 16 | 17 | # Dependency directories (remove the comment below to include it) 18 | # vendor/ 19 | 20 | # Go workspace file 21 | go.work 22 | go.work.sum 23 | 24 | # env file 25 | .env 26 | 27 | internal 28 | avro 29 | pochard 30 | experiments 31 | map.go 32 | *.schema 33 | *.pgo 34 | debug -------------------------------------------------------------------------------- /LICENCE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Bodkin 🏹 2 | =================== 3 | [![Go Reference](https://pkg.go.dev/badge/github.com/loicalleyne/bodkin.svg)](https://pkg.go.dev/github.com/loicalleyne/bodkin) 4 | 5 | Go library for generating schemas and decoding generic map values and native Go structures to Apache Arrow. 
6 | 7 | The goal is to provide a useful toolkit to make it easier to use Arrow, and by extension Parquet, especially on data whose schema is evolving or not strictly defined. 8 | An example would be with working with data retrieved from a 3rd-party API that does not maintain their OpenAPI spec. 9 | 10 | Bodkin enables you to use your _data_ to define and evolve your Arrow Schema. 11 | 12 | ## Features 13 | ### Arrow schema generation from data type inference 14 | - Converts a structured input (json string or []byte, Go struct or map[string]any) into an Apache Arrow schema 15 | - Supports nested types 16 | - Automatically evolves the Arrow schema with new fields when providing [new inputs](https://pkg.go.dev/github.com/loicalleyne/bodkin#Bodkin.Unify) 17 | - Option to merge new infered schema at existing path for composibility ([bodkin.UnifyAtPath](https://pkg.go.dev/github.com/loicalleyne/bodkin#Bodkin.UnifyAtPath)) 18 | - Converts schema field types when unifying schemas to accept evolving input data ([bodkin.WithTypeConversion](https://pkg.go.dev/github.com/loicalleyne/bodkin#WithTypeConversion)) 19 | - Tracks [changes](https://pkg.go.dev/github.com/loicalleyne/bodkin#Bodkin.Changes) to the schema 20 | - [Export](https://pkg.go.dev/github.com/loicalleyne/bodkin#Bodkin.ExportSchemaFile)/[import](https://pkg.go.dev/github.com/loicalleyne/bodkin#Bodkin.ImportSchemaFile) a serialized Arrow schema to/from file or `[]byte` to transmit or persist schema definition 21 | ### Custom data loader 22 | - Load structured data directly to Arrow Records based on inferred schema 23 | - Individual input to Arrow Record with [reader.ReadToRecord](https://pkg.go.dev/github.com/loicalleyne/bodkin/reader#DataReader.ReadToRecord) 24 | - io.Reader stream to Arrow Records ([bodkin.WithIOReader](https://pkg.go.dev/github.com/loicalleyne/bodkin#WithIOReader)) 25 | - retrieve a single `arrow.Record` with [reader.Next](https://pkg.go.dev/github.com/loicalleyne/bodkin/reader#DataReader.Next) 
26 | - retrieve a `[]arrow.Record` with [reader.NextBatch](https://pkg.go.dev/github.com/loicalleyne/bodkin/reader#DataReader.NextBatch) 27 | 28 | ## 🚀 Install 29 | 30 | Using Bodkin is easy. First, use `go get` to install the latest version 31 | of the library. 32 | 33 | ```sh 34 | go get -u github.com/loicalleyne/bodkin@latest 35 | ``` 36 | 37 | ## 💡 Usage 38 | 39 | You can import `bodkin` using: 40 | 41 | ```go 42 | import "github.com/loicalleyne/bodkin" 43 | ``` 44 | 45 | Create a new Bodkin, provide some structured data and print out the resulting Arrow Schema's string representation and any field evaluation errors 46 | ```go 47 | var jsonS1 string = `{ 48 | "count": 89, 49 | "next": "https://sub.domain.com/api/search/?models=thurblig&page=3", 50 | "previous": null, 51 | "results": [{"id":7594}], 52 | "arrayscalar":[], 53 | "datefield":"1979-01-01", 54 | "timefield":"01:02:03" 55 | }` 56 | u, _ := bodkin.NewBodkin(bodkin.WithInferTimeUnits(), bodkin.WithTypeConversion()) 57 | u.Unify(jsonS1) 58 | s, _ := u.OriginSchema() 59 | fmt.Printf("original input %v\n", s.String()) 60 | for _, e := range u.Err() { 61 | fmt.Printf("%v : [%s]\n", e.Issue, e.Dotpath) 62 | } 63 | // original input schema: 64 | // fields: 5 65 | // - results: type=list, nullable>, nullable 66 | // - datefield: type=date32, nullable 67 | // - timefield: type=time64[ns], nullable 68 | // - count: type=float64, nullable 69 | // - next: type=utf8, nullable 70 | // could not determine type of unpopulated field : [$previous] 71 | // could not determine element type of empty array : [$arrayscalar] 72 | ``` 73 | 74 | Provide some more structured data and print out the new merged schema and the list of changes 75 | ```go 76 | var jsonS2 string = `{ 77 | "count": 89.5, 78 | "next": "https://sub.domain.com/api/search/?models=thurblig&page=3", 79 | "previous": "https://sub.domain.com/api/search/?models=thurblig&page=2", 80 | "results": 
[{"id":7594,"scalar":241.5,"nestedObj":{"strscalar":"str1","nestedarray":[123,456]}}], 81 | "arrayscalar":["str"], 82 | "datetime":"2024-10-24 19:03:09", 83 | "event_time":"2024-10-24T19:03:09+00:00", 84 | "datefield":"2024-10-24T19:03:09+00:00", 85 | "timefield":"1970-01-01" 86 | }` 87 | u.Unify(jsonS2) 88 | schema, _ := u.Schema() 89 | fmt.Printf("\nunified %v\n", schema.String()) 90 | fmt.Println(u.Changes()) 91 | // unified schema: 92 | // fields: 9 93 | // - count: type=float64, nullable 94 | // - next: type=utf8, nullable 95 | // - results: type=list>>, nullable>, nullable 96 | // - datefield: type=timestamp[ms, tz=UTC], nullable 97 | // - timefield: type=utf8, nullable 98 | // - previous: type=utf8, nullable 99 | // - datetime: type=timestamp[ms, tz=UTC], nullable 100 | // - arrayscalar: type=list, nullable 101 | // - event_time: type=timestamp[ms, tz=UTC], nullable 102 | // changes: 103 | // added $previous : utf8 104 | // added $datetime : timestamp[ms, tz=UTC] 105 | // changed $datefield : from date32 to timestamp[ms, tz=UTC] 106 | // added $results.results.elem.scalar : float64 107 | // added $results.results.elem.nested : struct> 108 | // added $arrayscalar : list 109 | // added $event_time : timestamp[ms, tz=UTC] 110 | // changed $timefield : from time64[ns] to utf8 111 | ``` 112 | 113 | Also works with nested Go structs and slices 114 | ```go 115 | stu := Student{ 116 | Name: "StudentName", 117 | Age: 25, 118 | ID: 123456, 119 | Day: 123, 120 | } 121 | sch := School{ 122 | Name: "SchoolName", 123 | Address: AddressType{ 124 | Country: "CountryName", 125 | }, 126 | } 127 | e, _ := bodkin.NewBodkin(stu, bodkin.WithInferTimeUnits(), bodkin.WithTypeConversion()) 128 | sc, err := e.OriginSchema() 129 | fmt.Printf("original input %v\n", sc.String()) 130 | // original input schema: 131 | // fields: 5 132 | // - ID: type=int64, nullable 133 | // - Day: type=int32, nullable 134 | // - School: type=struct>, nullable 135 | // - Name: type=utf8, nullable 136 | // 
- Age: type=int32, nullable 137 | e.Unify(sch) 138 | sc, err = e.OriginSchema() 139 | fmt.Printf("unified %v\n", sc.String()) 140 | // unified schema: 141 | // fields: 5 142 | // - ID: type=int64, nullable 143 | // - Day: type=int32, nullable 144 | // - School: type=struct>, nullable 145 | // - Name: type=utf8, nullable 146 | // - Age: type=int32, nullable 147 | ``` 148 | 149 | Export your schema to a file, then import the file to retrieve the schema; or export/import to/from a []byte. 150 | ```go 151 | _ = u.ExportSchemaFile("./test.schema") 152 | imp, _ := u.ImportSchemaFile("./test.schema") 153 | fmt.Printf("imported %v\n", imp.String()) 154 | 155 | bs, _ := u.ExportSchemaBytes() 156 | sc, _ := u.ImportSchemaBytes(bs) 157 | fmt.Printf("imported %v\n", sc.String()) 158 | ``` 159 | 160 | Use a Bodkin Reader to load data to Arrow Records 161 | ```go 162 | u := bodkin.NewBodkin(bodkin.WithInferTimeUnits(), bodkin.WithTypeConversion()) 163 | u.Unify(jsonS1) // feed data for schema generation 164 | rdr, _ := u.NewReader() // infered schema in Bodkin used to create Reader 165 | rec, _ := rdr.ReadToRecord([]byte(jsonS1)) // Reader loads data and returns Arrow Record 166 | ``` 167 | 168 | Provide a Bodkin Reader with an io.Reader to load many records 169 | ```go 170 | import "github.com/loicalleyne/bodkin/reader" 171 | ... 172 | u := bodkin.NewBodkin(bodkin.WithInferTimeUnits(), bodkin.WithTypeConversion()) 173 | // Create Reader attached to Bodkin ... 174 | u.NewReader(schema, 0, reader.WithIOReader(ff, reader.DefaultDelimiter), reader.WithChunk(1024)) 175 | for u.Reader.Next(){ 176 | rec := r.Record() 177 | } 178 | // or create a stand-alone Reader if you have an existing *arrow.Schema 179 | rdr, _ := reader.NewReader(schema, 0, reader.WithIOReader(ff, reader.DefaultDelimiter), reader.WithChunk(1024)) 180 | for rdr.Next() { 181 | rec := r.Record() 182 | ... 
183 | } 184 | ``` 185 | 186 | Use the generated Arrow schema with Arrow's built-in JSON reader to decode JSON data into Arrow records 187 | ```go 188 | rdr = array.NewJSONReader(strings.NewReader(jsonS2), schema) 189 | defer rdr.Release() 190 | for rdr.Next() { 191 | rec := rdr.Record() 192 | rj, _ := rec.MarshalJSON() 193 | fmt.Printf("\nmarshaled record:\n%v\n", string(rj)) 194 | } 195 | // marshaled record: 196 | // [{"arrayscalar":["str"],"count":89.5,"datefield":"2024-10-24 19:03:09Z","datetime":"2024-10-24 19:03:09Z","event_time":"2024-10-24 19:03:09Z","next":"https://sub.domain.com/api/search/?models=thurblig\u0026page=3","previous":"https://sub.domain.com/api/search/?models=thurblig\u0026page=2","results":[{"id":7594,"nested":{"nestedarray":[123,456],"strscalar":"str1"},"scalar":241.5}],"timefield":"1970-01-01"} 197 | // ] 198 | ``` 199 | 200 | ## 💫 Show your support 201 | 202 | Give a ⭐️ if this project helped you! 203 | Feedback and PRs welcome. 204 | 205 | ## License 206 | 207 | Bodkin is released under the Apache 2.0 license. See [LICENCE.txt](LICENCE.txt) -------------------------------------------------------------------------------- /bodkin.go: -------------------------------------------------------------------------------- 1 | // Package bodkin is a Go library for generating schemas and decoding generic map values and native Go structures to Apache Arrow. 2 | // The goal is to provide a useful toolkit to make it easier to use Arrow, and by extension Parquet with data whose shape 3 | // is evolving or not strictly defined. 
package bodkin

import (
	"bufio"
	"errors"
	"fmt"
	"io"
	"math"
	"os"
	"slices"
	"strings"

	"github.com/apache/arrow-go/v18/arrow"
	"github.com/apache/arrow-go/v18/arrow/flight"
	"github.com/apache/arrow-go/v18/arrow/memory"
	"github.com/loicalleyne/bodkin/reader"
	omap "github.com/wk8/go-ordered-map/v2"
)

// Option configures a Bodkin.
type (
	Option func(config)
	config *Bodkin
)

// Field represents an element in the input data.
type Field struct {
	Dotpath string     `json:"dotpath"`
	Type    arrow.Type `json:"arrow_type"`
	// Number of child fields if a nested type
	// NOTE(review): name is misspelled ("Childen"); it is exported, so renaming
	// would break callers — left as-is.
	Childen int `json:"children,omitempty"`
	// Evaluation failure reason
	Issue error `json:"issue,omitempty"`
}

// Evaluation states for a field path: its Arrow type is either not yet
// determinable (unknown) or established (known).
const (
	unknown int = 0
	known   int = 1
)

// Bodkin is a collection of field paths, describing the columns of a structured input(s).
type Bodkin struct {
	// io.Reader input and its buffered wrapper used by UnifyScan; delim separates datums.
	rr    io.Reader
	br    *bufio.Reader
	delim byte
	// original is an immutable copy of the first evaluation; old is the unified
	// (mutable) schema graph; new is the graph of the most recent input.
	original *fieldPos
	old      *fieldPos
	new      *fieldPos
	opts     []Option
	Reader   *reader.DataReader
	// Ordered maps keyed by field dotpath: fields with an established type,
	// and fields whose type could not (yet) be evaluated.
	knownFields      *omap.OrderedMap[string, *fieldPos]
	untypedFields    *omap.OrderedMap[string, *fieldPos]
	unificationCount int
	maxCount         int
	// Type-inference behavior toggles set via Options.
	inferTimeUnits         bool
	quotedValuesAreStrings bool
	typeConversion         bool
	// Accumulated evaluation error and the log of schema changes.
	err     error
	changes error
}

// Opts returns the Options the Bodkin was created with.
func (u *Bodkin) Opts() []Option { return u.opts }

// GetReader returns a DataReader, will return an existing DataReader if it exists, if not it will create a new one.
// If the Reader already exists, the opts are ignored. If you want to create a new Reader with different opts, use NewReader.
func (u *Bodkin) GetReader(opts ...reader.Option) (*reader.DataReader, error) {
	if u.Reader == nil {
		return u.NewReader(opts...)
	}
	return u.Reader, nil
}

// NewReader returns a new DataReader, to be used to read structured input into Arrow records.
// The Reader is built from the Bodkin's current unified schema.
func (u *Bodkin) NewReader(opts ...reader.Option) (*reader.DataReader, error) {
	schema, err := u.Schema()
	if err != nil {
		return nil, err
	}
	if schema == nil {
		return nil, fmt.Errorf("nil schema")
	}
	u.Reader, err = reader.NewReader(schema, 0, opts...)
	if err != nil {
		return nil, err
	}
	return u.Reader, nil
}

// NewBodkin returns a new Bodkin value from a structured input.
// Input must be a json byte slice or string, a Go struct with exported fields or map[string]any.
// Any unpopulated fields, empty objects or empty slices in JSON or map[string]any inputs are skipped as their
// types cannot be evaluated and converted.
func NewBodkin(opts ...Option) *Bodkin {
	return newBodkin(opts...)
}

// newBodkin applies the supplied Options and initialises the field-path maps.
func newBodkin(opts ...Option) *Bodkin {
	b := &Bodkin{}
	b.opts = opts
	for _, opt := range opts {
		opt(b)
	}

	// Ordered map of known fields, keys are field dotpaths.
	b.knownFields = omap.New[string, *fieldPos]()
	b.untypedFields = omap.New[string, *fieldPos]()
	b.maxCount = math.MaxInt
	return b
}

// CountPaths returns count of evaluated field paths.
func (u *Bodkin) CountPaths() int {
	return u.knownFields.Len()
}

// CountPending returns count of unevaluated field paths.
func (u *Bodkin) CountPending() int {
	return u.untypedFields.Len()
}

// Err returns a []Field that could not be evaluated to date.
124 | func (u *Bodkin) Err() []Field { 125 | fp := u.sortMapKeysDesc(unknown) 126 | var paths []Field = make([]Field, len(fp)) 127 | for i, p := range fp { 128 | f, _ := u.untypedFields.Get(p) 129 | d := Field{Dotpath: f.dotPath(), Type: f.arrowType} 130 | switch f.arrowType { 131 | case arrow.STRUCT: 132 | d.Issue = fmt.Errorf("struct : %vs", ErrUndefinedFieldType) 133 | case arrow.LIST: 134 | d.Issue = fmt.Errorf("list : %v", ErrUndefinedArrayElementType) 135 | default: 136 | d.Issue = fmt.Errorf("%w", ErrUndefinedFieldType) 137 | } 138 | paths[i] = d 139 | } 140 | return paths 141 | } 142 | 143 | // Changes returns a list of field additions and field type conversions done 144 | // in the lifetime of the Bodkin object. 145 | func (u *Bodkin) Changes() error { return u.changes } 146 | 147 | // Count returns the number of datum evaluated for schema to date. 148 | func (u *Bodkin) Count() int { return u.unificationCount } 149 | 150 | // MaxCount returns the maximum number of datum to be evaluated for schema. 151 | func (u *Bodkin) MaxCount() int { return u.unificationCount } 152 | 153 | // ResetCount resets the count of datum evaluated for schema to date. 154 | func (u *Bodkin) ResetCount() int { 155 | u.unificationCount = 0 156 | return u.unificationCount 157 | } 158 | 159 | // ResetMaxCount resets the maximum number of datam to be evaluated for schema 160 | // to maxInt64. 161 | // ResetCount resets the count of datum evaluated for schema to date. 162 | func (u *Bodkin) ResetMaxCount() int { 163 | u.maxCount = math.MaxInt 164 | return u.unificationCount 165 | } 166 | 167 | // Paths returns a slice of dotpaths of fields successfully evaluated to date. 
168 | func (u *Bodkin) Paths() []Field { 169 | fp := u.sortMapKeysDesc(known) 170 | var paths []Field = make([]Field, len(fp)) 171 | for i, p := range fp { 172 | f, ok := u.knownFields.Get(p) 173 | if !ok { 174 | continue 175 | } 176 | d := Field{Dotpath: f.dotPath(), Type: f.arrowType} 177 | switch f.arrowType { 178 | case arrow.STRUCT: 179 | d.Childen = len(f.children) 180 | } 181 | paths[i] = d 182 | } 183 | return paths 184 | } 185 | 186 | // ExportSchema exports a serialized Arrow Schema to a file. 187 | func (u *Bodkin) ExportSchemaFile(exportPath string) error { 188 | schema, err := u.Schema() 189 | if err != nil { 190 | return err 191 | } 192 | bs := flight.SerializeSchema(schema, memory.DefaultAllocator) 193 | err = os.WriteFile(exportPath, bs, 0644) 194 | if err != nil { 195 | return err 196 | } 197 | return nil 198 | } 199 | 200 | // ImportSchema imports a serialized Arrow Schema from a file. 201 | func (u *Bodkin) ImportSchemaFile(importPath string) (*arrow.Schema, error) { 202 | dat, err := os.ReadFile(importPath) 203 | if err != nil { 204 | return nil, err 205 | } 206 | return flight.DeserializeSchema(dat, memory.DefaultAllocator) 207 | } 208 | 209 | // ExportSchemaBytes exports a serialized Arrow Schema. 210 | func (u *Bodkin) ExportSchemaBytes() ([]byte, error) { 211 | schema, err := u.Schema() 212 | if err != nil { 213 | return nil, err 214 | } 215 | return flight.SerializeSchema(schema, memory.DefaultAllocator), nil 216 | } 217 | 218 | // ImportSchemaBytes imports a serialized Arrow Schema. 219 | func (u *Bodkin) ImportSchemaBytes(dat []byte) (*arrow.Schema, error) { 220 | return flight.DeserializeSchema(dat, memory.DefaultAllocator) 221 | } 222 | 223 | // Unify merges structured input's column definition with the previously input's schema. 224 | // Any unpopulated fields, empty objects or empty slices in JSON input are skipped. 
225 | func (u *Bodkin) Unify(a any) error { 226 | if u.unificationCount > u.maxCount { 227 | return fmt.Errorf("maxcount exceeded") 228 | } 229 | m, err := reader.InputMap(a) 230 | if err != nil { 231 | u.err = fmt.Errorf("%v : %v", ErrInvalidInput, err) 232 | return fmt.Errorf("%v : %v", ErrInvalidInput, err) 233 | } 234 | if u.old == nil { 235 | // Keep an immutable copy of the initial evaluation. 236 | g := newFieldPos(u) 237 | mapToArrow(g, m) 238 | u.original = g 239 | // Identical to above except this one can be mutated with Unify. 240 | f := newFieldPos(u) 241 | mapToArrow(f, m) 242 | u.old = f 243 | return nil 244 | } 245 | f := newFieldPos(u) 246 | mapToArrow(f, m) 247 | u.new = f 248 | for _, field := range u.new.children { 249 | u.merge(field, nil) 250 | } 251 | u.unificationCount++ 252 | return nil 253 | } 254 | 255 | // UnifyScan reads from a provided io.Reader and merges each datum's structured input's column definition 256 | // with the previously input's schema. Any unpopulated fields, empty objects or empty slices 257 | // in JSON input are skipped. 
258 | func (u *Bodkin) UnifyScan() error { 259 | var err error 260 | if u.rr == nil { 261 | return fmt.Errorf("no io.reader provided") 262 | } 263 | if u.unificationCount > u.maxCount { 264 | return fmt.Errorf("maxcount exceeded") 265 | } 266 | defer func() error { 267 | if rc := recover(); rc != nil { 268 | u.err = errors.Join(u.err, err, fmt.Errorf("panic %v", rc)) 269 | } 270 | return u.err 271 | }() 272 | for { 273 | datumBytes, err := u.br.ReadBytes(u.delim) 274 | if err != nil { 275 | if errors.Is(err, io.EOF) { 276 | u.err = nil 277 | break 278 | } 279 | u.err = err 280 | break 281 | } 282 | m, err := reader.InputMap(datumBytes) 283 | if err != nil { 284 | u.err = errors.Join(u.err, err) 285 | continue 286 | } 287 | u.Unify(m) 288 | } 289 | return u.err 290 | } 291 | 292 | // Unify merges structured input's column definition with the previously input's schema, 293 | // using a specified valid path as the root. An error is returned if the mergeAt path is 294 | // not found. 295 | // Any unpopulated fields, empty objects or empty slices in JSON input are skipped. 
296 | func (u *Bodkin) UnifyAtPath(a any, mergeAt string) error { 297 | if u.old == nil { 298 | return fmt.Errorf("bodkin not initialised") 299 | } 300 | if u.unificationCount > u.maxCount { 301 | return fmt.Errorf("maxcount exceeded") 302 | } 303 | mergePath := make([]string, 0) 304 | if !(len(mergeAt) == 0 || mergeAt == "$") { 305 | mergePath = strings.Split(strings.TrimPrefix(mergeAt, "$"), ".") 306 | } 307 | if _, ok := u.knownFields.Get(mergeAt); !ok { 308 | return fmt.Errorf("unitfyatpath %s : %v", mergeAt, ErrPathNotFound) 309 | } 310 | 311 | m, err := reader.InputMap(a) 312 | if err != nil { 313 | u.err = fmt.Errorf("%v : %v", ErrInvalidInput, err) 314 | return fmt.Errorf("%v : %v", ErrInvalidInput, err) 315 | } 316 | 317 | f := newFieldPos(u) 318 | mapToArrow(f, m) 319 | u.new = f 320 | for _, field := range u.new.children { 321 | u.merge(field, mergePath) 322 | } 323 | u.unificationCount++ 324 | return nil 325 | } 326 | 327 | // Schema returns the original Arrow schema generated from the structure/types of 328 | // the initial input, and a panic recovery error if the schema could not be created. 329 | func (u *Bodkin) OriginSchema() (*arrow.Schema, error) { 330 | if u.old == nil { 331 | return nil, fmt.Errorf("bodkin not initialised") 332 | } 333 | var s *arrow.Schema 334 | defer func(s *arrow.Schema) (*arrow.Schema, error) { 335 | if pErr := recover(); pErr != nil { 336 | return nil, fmt.Errorf("schema problem: %v", pErr) 337 | } 338 | return s, nil 339 | }(s) 340 | var fields []arrow.Field 341 | for _, c := range u.original.children { 342 | fields = append(fields, c.field) 343 | } 344 | s = arrow.NewSchema(fields, nil) 345 | return s, nil 346 | } 347 | 348 | // Schema returns the current merged Arrow schema generated from the structure/types of 349 | // the input(s), and a panic recovery error if the schema could not be created. 
350 | // If the Bodkin has a Reader and the schema has been updated since its creation, the Reader 351 | // will replaced with a new one matching the current schema. Any 352 | func (u *Bodkin) Schema() (*arrow.Schema, error) { 353 | if u.old == nil { 354 | return nil, fmt.Errorf("bodkin not initialised") 355 | } 356 | var s *arrow.Schema 357 | defer func(s *arrow.Schema) (*arrow.Schema, error) { 358 | if pErr := recover(); pErr != nil { 359 | return nil, fmt.Errorf("schema problem: %v", pErr) 360 | } 361 | return s, nil 362 | }(s) 363 | var fields []arrow.Field 364 | for _, c := range u.old.children { 365 | fields = append(fields, c.field) 366 | } 367 | s = arrow.NewSchema(fields, nil) 368 | if u.Reader != nil { 369 | if !u.Reader.Schema().Equal(s) { 370 | u.Reader, _ = reader.NewReader(s, 0, u.Reader.Opts()...) 371 | } 372 | } 373 | return s, nil 374 | } 375 | 376 | // LastSchema returns the Arrow schema generated from the structure/types of 377 | // the most recent input. Any unpopulated fields, empty objects or empty slices are skipped. 378 | // ErrNoLatestSchema if Unify() has never been called. A panic recovery error is returned 379 | // if the schema could not be created. 380 | func (u *Bodkin) LastSchema() (*arrow.Schema, error) { 381 | if u.new == nil { 382 | return nil, ErrNoLatestSchema 383 | } 384 | var s *arrow.Schema 385 | defer func(s *arrow.Schema) (*arrow.Schema, error) { 386 | if pErr := recover(); pErr != nil { 387 | return nil, fmt.Errorf("schema problem: %v", pErr) 388 | } 389 | return s, nil 390 | }(s) 391 | var fields []arrow.Field 392 | for _, c := range u.new.children { 393 | fields = append(fields, c.field) 394 | } 395 | s = arrow.NewSchema(fields, nil) 396 | return s, nil 397 | } 398 | 399 | // merge merges a new or changed field into the unified schema. 400 | // Conflicting TIME, DATE, TIMESTAMP types are upgraded to STRING. 401 | // DATE can upgrade to TIMESTAMP. 402 | // INTEGER can upgrade to FLOAT. 
func (u *Bodkin) merge(n *fieldPos, mergeAt []string) {
	// Resolve n's absolute path (and its parent's) relative to the merge root,
	// so grafts land under mergeAt rather than at the schema root.
	var nPath, nParentPath []string
	if len(mergeAt) > 0 {
		nPath = slices.Concat(mergeAt, n.path)
		nParentPath = slices.Concat(mergeAt, n.parent.path)
	} else {
		nPath = n.path
		nParentPath = n.parent.path
	}
	if kin, err := u.old.getPath(nPath); err == ErrPathNotFound {
		// Field does not exist in the unified schema yet: graft it in.
		// root graft
		if n.root == n.parent {
			u.old.root.graft(n)
		} else {
			// branch graft
			b, _ := u.old.getPath(nParentPath)
			b.graft(n)
		}
	} else {
		// Field already exists. When type conversion is enabled and the types
		// differ, widen the existing field per the rules documented above:
		// the outer switch is the EXISTING (kin) type, the inner switch is the
		// INCOMING (n) type; anything unresolvable falls back to STRING.
		if u.typeConversion && (!kin.field.Equal(n.field) && kin.field.Type.ID() != n.field.Type.ID()) {
			switch kin.field.Type.ID() {
			case arrow.NULL:
				// NULL stays as-is; nothing to widen from.
				break
			case arrow.STRING:
				// STRING is the widest type in this lattice; never downgrade.
				break
			case arrow.INT8, arrow.INT16, arrow.INT32, arrow.INT64, arrow.UINT8, arrow.UINT16, arrow.UINT32, arrow.UINT64:
				switch n.field.Type.ID() {
				case arrow.FLOAT16, arrow.FLOAT32, arrow.FLOAT64:
					// Integer meets float: widen to FLOAT64.
					err := kin.upgradeType(n, arrow.FLOAT64)
					if err != nil {
						kin.err = errors.Join(kin.err, err)
					}
				default:
					err := kin.upgradeType(n, arrow.STRING)
					if err != nil {
						kin.err = errors.Join(kin.err, err)
					}
				}
			case arrow.FLOAT16:
				switch n.field.Type.ID() {
				case arrow.FLOAT32:
					err := kin.upgradeType(n, arrow.FLOAT32)
					if err != nil {
						kin.err = errors.Join(kin.err, err)
					}
				case arrow.FLOAT64:
					err := kin.upgradeType(n, arrow.FLOAT64)
					if err != nil {
						kin.err = errors.Join(kin.err, err)
					}
				default:
					err := kin.upgradeType(n, arrow.STRING)
					if err != nil {
						kin.err = errors.Join(kin.err, err)
					}
				}
			case arrow.FLOAT32:
				switch n.field.Type.ID() {
				case arrow.FLOAT64:
					err := kin.upgradeType(n, arrow.FLOAT64)
					if err != nil {
						kin.err = errors.Join(kin.err, err)
					}
				default:
					err := kin.upgradeType(n, arrow.STRING)
					if err != nil {
						kin.err = errors.Join(kin.err, err)
					}
				}
			case arrow.FLOAT64:
				switch n.field.Type.ID() {
				case arrow.INT8, arrow.INT16, arrow.INT32, arrow.INT64, arrow.UINT8, arrow.UINT16, arrow.UINT32, arrow.UINT64, arrow.FLOAT16, arrow.FLOAT32:
					// FLOAT64 already covers all narrower numerics; keep it.
					break
				default:
					err := kin.upgradeType(n, arrow.STRING)
					if err != nil {
						kin.err = errors.Join(kin.err, err)
					}
				}
			case arrow.TIMESTAMP:
				switch n.field.Type.ID() {
				case arrow.TIME64:
					// TIMESTAMP vs TIME64 conflict: degrade to STRING.
					err := kin.upgradeType(n, arrow.STRING)
					if err != nil {
						kin.err = errors.Join(kin.err, err)
					}
				}
			case arrow.DATE32:
				switch n.field.Type.ID() {
				case arrow.TIMESTAMP:
					// DATE can widen to TIMESTAMP.
					err := kin.upgradeType(n, arrow.TIMESTAMP)
					if err != nil {
						kin.err = errors.Join(kin.err, err)
					}
				// case arrow.TIME64:
				default:
					err := kin.upgradeType(n, arrow.STRING)
					if err != nil {
						kin.err = errors.Join(kin.err, err)
					}
				}
			case arrow.TIME64:
				switch n.field.Type.ID() {
				case arrow.DATE32, arrow.TIMESTAMP:
					err := kin.upgradeType(n, arrow.STRING)
					if err != nil {
						kin.err = errors.Join(kin.err, err)
					}
				}
			}
		}
		// Recurse into children only when the node already existed; a graft
		// above brings its whole subtree along with it.
		for _, v := range n.childmap {
			u.merge(v, mergeAt)
		}
	}
}

// sortMapKeysDesc returns the keys of the selected field map ordered from the
// deepest dotted path to the shallowest. k selects which map: known or unknown
// (untyped) fields; any other value yields an empty (nil) result.
func (u *Bodkin) sortMapKeysDesc(k int) []string {
	var m *omap.OrderedMap[string, *fieldPos]
	var sortedPaths, paths []string
	switch k {
	case known:
		sortedPaths = make([]string, u.knownFields.Len())
		paths = make([]string, u.knownFields.Len())
		m = u.knownFields
	case unknown:
		sortedPaths = make([]string, u.untypedFields.Len())
		paths = make([]string, u.untypedFields.Len())
		m = u.untypedFields
	default:
		return sortedPaths
	}
	if m.Len() == 0 {
		return sortedPaths
	}
	// Walk the ordered map newest-to-oldest to collect the raw paths.
	i := 0
	for pair := m.Newest(); pair !=
nil; pair = pair.Prev() { 540 | paths[i] = pair.Key 541 | i++ 542 | } 543 | maxDepth := 0 544 | for _, p := range paths { 545 | pathDepth := strings.Count(p, ".") 546 | if pathDepth > maxDepth { 547 | maxDepth = pathDepth 548 | } 549 | } 550 | sortIndex := 0 551 | for maxDepth >= 0 { 552 | for _, p := range paths { 553 | pathDepth := strings.Count(p, ".") 554 | if pathDepth == maxDepth { 555 | sortedPaths[sortIndex] = p 556 | sortIndex++ 557 | } 558 | } 559 | maxDepth-- 560 | } 561 | return sortedPaths 562 | } 563 | -------------------------------------------------------------------------------- /cmd/.gitignore: -------------------------------------------------------------------------------- 1 | *.json 2 | <<<<<<< Updated upstream 3 | *.bak 4 | main?.go 5 | ======= 6 | *.bak 7 | >>>>>>> Stashed changes 8 | -------------------------------------------------------------------------------- /cmd/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "log" 7 | "os" 8 | "time" 9 | 10 | "github.com/loicalleyne/bodkin" 11 | "github.com/loicalleyne/bodkin/reader" 12 | ) 13 | 14 | func main() { 15 | start := time.Now() 16 | filepath := "large-file.json" 17 | log.Println("start") 18 | var u *bodkin.Bodkin 19 | if 1 == 1 { 20 | f, err := os.Open(filepath) 21 | if err != nil { 22 | panic(err) 23 | } 24 | defer f.Close() 25 | s := bufio.NewScanner(f) 26 | u = bodkin.NewBodkin(bodkin.WithInferTimeUnits(), bodkin.WithTypeConversion()) 27 | if err != nil { 28 | panic(err) 29 | } 30 | 31 | for s.Scan() { 32 | err = u.Unify(s.Bytes()) 33 | if err != nil { 34 | panic(err) 35 | } 36 | } 37 | f.Close() 38 | err = u.ExportSchemaFile("temp.bak") 39 | if err != nil { 40 | panic(err) 41 | } 42 | } 43 | if 1 == 1 { 44 | schema, err := u.ImportSchemaFile("temp.bak") 45 | if err != nil { 46 | panic(err) 47 | } 48 | ff, err := os.Open(filepath) 49 | if err != nil { 50 | panic(err) 51 | } 52 | defer 
ff.Close() 53 | r, err := reader.NewReader(schema, 0, reader.WithIOReader(ff, reader.DefaultDelimiter), reader.WithChunk(1024*16)) 54 | if err != nil { 55 | panic(err) 56 | } 57 | 58 | log.Printf("union %v\n", schema.String()) 59 | log.Printf("elapsed: %v\n", time.Since(start)) 60 | 61 | i := 0 62 | // for r.Next() { 63 | // rec := r.Record() 64 | // _, err := rec.MarshalJSON() 65 | // if err != nil { 66 | // fmt.Printf("error marshaling record: %v\n", err) 67 | // } 68 | // // fmt.Printf("\nmarshaled record :\n%v\n", string(rj)) 69 | // i++ 70 | // } 71 | for r.NextBatch(1024) { 72 | recs := r.RecordBatch() 73 | for _, rec := range recs { 74 | _, err := rec.MarshalJSON() 75 | if err != nil { 76 | fmt.Printf("error marshaling record: %v\n", err) 77 | } 78 | // fmt.Printf("\nmarshaled record :\n%v\n", string(rj)) 79 | i++ 80 | } 81 | } 82 | log.Println("records", r.Count(), i) 83 | } 84 | log.Printf("elapsed: %v\n", time.Since(start)) 85 | log.Println("end") 86 | } 87 | 88 | var jsonS1 string = `{"location_types":[{"enumeration_id":"702","id":81,"name":"location81"}],"misc_id":"123456789987a"}` 89 | 90 | var jsonS3 string = `{ 91 | "count": 85, 92 | "next": "https://sub.domain.com/api/search/?models=thurblig", 93 | "previous": null, 94 | "results": [ 95 | { 96 | "id": 6328, 97 | "name": "New user SMB check 2310-1", 98 | "external_id": null, 99 | "title": "New user SMB check 2310-1", 100 | "content_type": "new agent", 101 | "model": "Agent", 102 | "emptyobj":{}, 103 | "dataobj": { 104 | "id": 6328, 105 | "nestednullscalar": null, 106 | "dsp": { 107 | "id": 116, 108 | "name": "El Thingy Bueno", 109 | "nullarray":[] 110 | }, 111 | "name": "New user SMB check 2310-1", 112 | "agency":{ 113 | "id": 925, 114 | "name": "New user SMB check 2310-1", 115 | "employees":[{"id":99,"name":"abcd"},{"id":87,"name":"smart"}] 116 | }, 117 | "export_status": { 118 | "status": true 119 | } 120 | } 121 | } 122 | ] 123 | }` 124 | 125 | var jsonS2 string = `{"address":"11540 Foo 
Ave.","allowed_ad_types":[{"id":1,"name":"static"},{"id":2,"name":"video"},{"id":3,"name":"audio"},{"id":4,"name":"HTML"}],"allows_motion":true,"aspect_ratio":{"horizontal":16,"id":5,"name":"16:9","vertical":9},"audience_data_sources":[{"id":3,"name":"GeoPath"},{"id":4,"name":"1st party data"},{"id":7,"name":"Dutch outdoor research"},{"id":10,"name":"COMMB"}],"average_imp_multiplier":21,"average_weekly_impressions":123,"bearing":100,"bearing_direction":"E","bid_floors":[{"currency":{"code":"USD","id":1,"name":"US Dollars","symbol":"$"},"floor":10},{"currency":{"code":"CAD","id":9,"name":"Canadian dollar","symbol":"$"},"floor":0.01},{"currency":{"code":"AUD","id":8,"name":"Australian dollar","symbol":"$"},"floor":0.01}],"connectivity":1,"demography_type":"basic","device_id":"1234.broadsign.com","diagonal_size":88,"diagonal_size_units":"inches","dma":{"code":662,"id":5,"name":"Abilene-Sweetwater, TX"},"export_status":{"status":true},"geo":{"city":{"id":344757,"name":"Acme"},"country":{"id":40,"name":"Canada"},"region":{"id":485,"name":"Alberta"}},"hivestack_id":"abcd1234efgh","id":1,"internal_publisher_screen_id":"1q2w3e","is_active":true,"is_audio":false,"latitude":45.5017,"longitude":73.5673,"max_ad_duration":90,"min_ad_duration":5,"most_recent":1,"name":"Office test screen (Jody) - DO NOT DELETE","ox_enabled":false,"publisher":{"additional_currencies":[{"code":"CAD","id":9,"name":"Canadian dollar","symbol":"$"},{"code":"AUD","id":8,"name":"Australian dollar","symbol":"$"}],"currency":{"code":"USD","id":1,"name":"US Dollars","symbol":"$"},"id":1,"is_hivestack_bidder":true,"is_multi_currency_enabled":true,"is_px_bidder":true,"is_vistar_bidder":true,"name":"Publisher 
Demo"},"resolution":{"height":1080,"id":835,"name":"1920x1080","orientation":"landscape","title":"1920x1080","width":1920},"screen_count":1,"screen_img_url":"https://www.youtube.com/watch?v=8v7KJoGDGwI","screen_type":{"id":105,"name":"LED"},"tags":[{"id":6656,"name":"test"}],"time_zone":{"id":306,"name":"America/Edmonton"},"timestamp":"2024-11-01 05:20:06.642057","total":0,"transact_status":"ok","transact_status_ox":"ok","venue_types":[{"enumeration_id":"602","id":81,"name":"education.colleges"}],"vistar_id":"123456789987a"} 126 | ` 127 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/loicalleyne/bodkin 2 | 3 | go 1.24.1 4 | 5 | require ( 6 | github.com/apache/arrow-go/v18 v18.3.0 7 | github.com/go-viper/mapstructure/v2 v2.2.1 8 | github.com/goccy/go-json v0.10.5 9 | github.com/redpanda-data/benthos/v4 v4.52.0 10 | github.com/wk8/go-ordered-map/v2 v2.1.8 11 | ) 12 | 13 | require ( 14 | github.com/Jeffail/gabs/v2 v2.7.0 // indirect 15 | github.com/OneOfOne/xxhash v1.2.8 // indirect 16 | github.com/andybalholm/brotli v1.1.1 // indirect 17 | github.com/apache/thrift v0.21.0 // indirect 18 | github.com/bahlo/generic-list-go v0.2.0 // indirect 19 | github.com/buger/jsonparser v1.1.1 // indirect 20 | github.com/cespare/xxhash/v2 v2.3.0 // indirect 21 | github.com/go-logr/logr v1.4.2 // indirect 22 | github.com/go-logr/stdr v1.2.2 // indirect 23 | github.com/gofrs/uuid/v5 v5.3.2 // indirect 24 | github.com/golang/snappy v1.0.0 // indirect 25 | github.com/google/flatbuffers v25.2.10+incompatible // indirect 26 | github.com/google/uuid v1.6.0 // indirect 27 | github.com/klauspost/asmfmt v1.3.2 // indirect 28 | github.com/klauspost/compress v1.18.0 // indirect 29 | github.com/klauspost/cpuid/v2 v2.2.10 // indirect 30 | github.com/mailru/easyjson v0.7.7 // indirect 31 | github.com/matoous/go-nanoid/v2 v2.1.0 // indirect 32 | 
github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 // indirect 33 | github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 // indirect 34 | github.com/pierrec/lz4/v4 v4.1.22 // indirect 35 | github.com/segmentio/ksuid v1.0.4 // indirect 36 | github.com/tilinna/z85 v1.0.0 // indirect 37 | github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb // indirect 38 | github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 // indirect 39 | github.com/xeipuuv/gojsonschema v1.2.0 // indirect 40 | github.com/zeebo/xxh3 v1.0.2 // indirect 41 | go.opentelemetry.io/auto/sdk v1.1.0 // indirect 42 | go.opentelemetry.io/otel v1.36.0 // indirect 43 | go.opentelemetry.io/otel/metric v1.36.0 // indirect 44 | go.opentelemetry.io/otel/trace v1.36.0 // indirect 45 | golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 // indirect 46 | golang.org/x/mod v0.24.0 // indirect 47 | golang.org/x/net v0.39.0 // indirect 48 | golang.org/x/sync v0.14.0 // indirect 49 | golang.org/x/sys v0.33.0 // indirect 50 | golang.org/x/text v0.25.0 // indirect 51 | golang.org/x/tools v0.32.0 // indirect 52 | golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da // indirect 53 | google.golang.org/genproto/googleapis/rpc v0.0.0-20250218202821-56aae31c358a // indirect 54 | google.golang.org/grpc v1.72.0 // indirect 55 | google.golang.org/protobuf v1.36.6 // indirect 56 | gopkg.in/yaml.v3 v3.0.1 // indirect 57 | ) 58 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | cuelang.org/go v0.13.0 h1:Z9NQY9RK3zMbjq1ZK67hvOV58pI3FKQgfuu1Znz+akQ= 2 | cuelang.org/go v0.13.0/go.mod h1:8MoQXu+RcXsa2s9mebJN1HJ1orVDc9aI9/yKi6Dzsi4= 3 | github.com/Jeffail/gabs/v2 v2.7.0 h1:Y2edYaTcE8ZpRsR2AtmPu5xQdFDIthFG0jYhu5PY8kg= 4 | github.com/Jeffail/gabs/v2 v2.7.0/go.mod h1:dp5ocw1FvBBQYssgHsG7I1WYsiLRtkUaB1FEtSwvNUw= 5 | github.com/Jeffail/grok v1.1.0 
h1:kiHmZ+0J5w/XUihRgU3DY9WIxKrNQCDjnfAb6bMLFaE= 6 | github.com/Jeffail/grok v1.1.0/go.mod h1:dm0hLksrDwOMa6To7ORXCuLbuNtASIZTfYheavLpsuE= 7 | github.com/Jeffail/shutdown v1.0.0 h1:afYjnY4pksqP/012m3NGJVccDI+WATdSzIMVHZKU8/Y= 8 | github.com/Jeffail/shutdown v1.0.0/go.mod h1:5dT4Y1oe60SJELCkmAB1pr9uQyHBhh6cwDLQTfmuO5U= 9 | github.com/OneOfOne/xxhash v1.2.8 h1:31czK/TI9sNkxIKfaUfGlU47BAxQ0ztGgd9vPyqimf8= 10 | github.com/OneOfOne/xxhash v1.2.8/go.mod h1:eZbhyaAYD41SGSSsnmcpxVoRiQ/MPUTjUdIIOT9Um7Q= 11 | github.com/andybalholm/brotli v1.1.1 h1:PR2pgnyFznKEugtsUo0xLdDop5SKXd5Qf5ysW+7XdTA= 12 | github.com/andybalholm/brotli v1.1.1/go.mod h1:05ib4cKhjx3OQYUY22hTVd34Bc8upXjOLL2rKwwZBoA= 13 | github.com/apache/arrow-go/v18 v18.3.0 h1:Xq4A6dZj9Nu33sqZibzn012LNnewkTUlfKVUFD/RX/I= 14 | github.com/apache/arrow-go/v18 v18.3.0/go.mod h1:eEM1DnUTHhgGAjf/ChvOAQbUQ+EPohtDrArffvUjPg8= 15 | github.com/apache/thrift v0.21.0 h1:tdPmh/ptjE1IJnhbhrcl2++TauVjy242rkV/UzJChnE= 16 | github.com/apache/thrift v0.21.0/go.mod h1:W1H8aR/QRtYNvrPeFXBtobyRkd0/YVhTc6i07XIAgDw= 17 | github.com/bahlo/generic-list-go v0.2.0 h1:5sz/EEAK+ls5wF+NeqDpk5+iNdMDXrh3z3nPnH1Wvgk= 18 | github.com/bahlo/generic-list-go v0.2.0/go.mod h1:2KvAjgMlE5NNynlg/5iLrrCCZ2+5xWbdbCW3pNTGyYg= 19 | github.com/buger/jsonparser v1.1.1 h1:2PnMjfWD7wBILjqQbt530v576A/cAbQvEW9gGIpYMUs= 20 | github.com/buger/jsonparser v1.1.1/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0= 21 | github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= 22 | github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= 23 | github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= 24 | github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= 25 | github.com/cockroachdb/apd/v3 v3.2.1 h1:U+8j7t0axsIgvQUqthuNm82HIrYXodOV2iWLWtEaIwg= 26 | github.com/cockroachdb/apd/v3 v3.2.1/go.mod h1:klXJcjp+FffLTHlhIG69tezTDvdP065naDsHzKhYSqc= 27 | 
github.com/cpuguy83/go-md2man/v2 v2.0.5 h1:ZtcqGrnekaHpVLArFSe4HK5DoKx1T0rq2DwVB0alcyc= 28 | github.com/cpuguy83/go-md2man/v2 v2.0.5/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= 29 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 30 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 31 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 32 | github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= 33 | github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= 34 | github.com/fatih/color v1.18.0 h1:S8gINlzdQ840/4pfAwic/ZE0djQEH3wM94VfqLTZcOM= 35 | github.com/fatih/color v1.18.0/go.mod h1:4FelSpRwEGDpQ12mAdzqdOukCy4u8WUtOY6lkT/6HfU= 36 | github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= 37 | github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= 38 | github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= 39 | github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= 40 | github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= 41 | github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= 42 | github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= 43 | github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= 44 | github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= 45 | github.com/go-viper/mapstructure/v2 v2.2.1 h1:ZAaOCxANMuZx5RCeg0mBdEZk7DZasvvZIxtHqx8aGss= 46 | github.com/go-viper/mapstructure/v2 v2.2.1/go.mod h1:oJDH3BJKyqBA2TXFhDsKDGDTlndYOZ6rGS0BRZIxGhM= 47 | github.com/goccy/go-json v0.10.5 h1:Fq85nIqj+gXn/S5ahsiTlK3TmC85qgirsdTP/+DeaC4= 48 | github.com/goccy/go-json v0.10.5/go.mod 
h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M= 49 | github.com/gofrs/uuid/v5 v5.3.2 h1:2jfO8j3XgSwlz/wHqemAEugfnTlikAYHhnqQ8Xh4fE0= 50 | github.com/gofrs/uuid/v5 v5.3.2/go.mod h1:CDOjlDMVAtN56jqyRUZh58JT31Tiw7/oQyEXZV+9bD8= 51 | github.com/golang-jwt/jwt/v5 v5.2.2 h1:Rl4B7itRWVtYIHFrSNd7vhTiz9UpLdi6gZhZ3wEeDy8= 52 | github.com/golang-jwt/jwt/v5 v5.2.2/go.mod h1:pqrtFR0X4osieyHYxtmOUWsAWrfe1Q5UVIyoH402zdk= 53 | github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= 54 | github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= 55 | github.com/golang/snappy v1.0.0 h1:Oy607GVXHs7RtbggtPBnr2RmDArIsAefDwvrdWvRhGs= 56 | github.com/golang/snappy v1.0.0/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= 57 | github.com/google/flatbuffers v25.2.10+incompatible h1:F3vclr7C3HpB1k9mxCGRMXq6FdUalZ6H/pNX4FP1v0Q= 58 | github.com/google/flatbuffers v25.2.10+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= 59 | github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= 60 | github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= 61 | github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= 62 | github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= 63 | github.com/gorilla/handlers v1.5.2 h1:cLTUSsNkgcwhgRqvCNmdbRWG0A3N4F+M2nWKdScwyEE= 64 | github.com/gorilla/handlers v1.5.2/go.mod h1:dX+xVpaxdSw+q0Qek8SSsl3dfMk3jNddUkMzo0GtH0w= 65 | github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= 66 | github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ= 67 | github.com/govalues/decimal v0.1.36 h1:dojDpsSvrk0ndAx8+saW5h9WDIHdWpIwrH/yhl9olyU= 68 | github.com/govalues/decimal v0.1.36/go.mod h1:Ee7eI3Llf7hfqDZtpj8Q6NCIgJy1iY3kH1pSwDrNqlM= 69 | github.com/hashicorp/golang-lru/arc/v2 v2.0.7 h1:QxkVTxwColcduO+LP7eJO56r2hFiG8zEbfAAzRv52KQ= 70 | 
github.com/hashicorp/golang-lru/arc/v2 v2.0.7/go.mod h1:Pe7gBlGdc8clY5LJ0LpJXMt5AmgmWNH1g+oFFVUHOEc= 71 | github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= 72 | github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= 73 | github.com/influxdata/go-syslog/v3 v3.0.0 h1:jichmjSZlYK0VMmlz+k4WeOQd7z745YLsvGMqwtYt4I= 74 | github.com/influxdata/go-syslog/v3 v3.0.0/go.mod h1:tulsOp+CecTAYC27u9miMgq21GqXRW6VdKbOG+QSP4Q= 75 | github.com/itchyny/gojq v0.12.17 h1:8av8eGduDb5+rvEdaOO+zQUjA04MS0m3Ps8HiD+fceg= 76 | github.com/itchyny/gojq v0.12.17/go.mod h1:WBrEMkgAfAGO1LUcGOckBl5O726KPp+OlkKug0I/FEY= 77 | github.com/itchyny/timefmt-go v0.1.6 h1:ia3s54iciXDdzWzwaVKXZPbiXzxxnv1SPGFfM/myJ5Q= 78 | github.com/itchyny/timefmt-go v0.1.6/go.mod h1:RRDZYC5s9ErkjQvTvvU7keJjxUYzIISJGxm9/mAERQg= 79 | github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= 80 | github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= 81 | github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= 82 | github.com/klauspost/asmfmt v1.3.2 h1:4Ri7ox3EwapiOjCki+hw14RyKk201CN4rzyCJRFLpK4= 83 | github.com/klauspost/asmfmt v1.3.2/go.mod h1:AG8TuvYojzulgDAMCnYn50l/5QV3Bs/tp6j0HLHbNSE= 84 | github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= 85 | github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= 86 | github.com/klauspost/cpuid/v2 v2.2.10 h1:tBs3QSyvjDyFTq3uoc/9xFpCuOsJQFNPiAhYdw2skhE= 87 | github.com/klauspost/cpuid/v2 v2.2.10/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0= 88 | github.com/klauspost/pgzip v1.2.6 h1:8RXeL5crjEUFnR2/Sn6GJNWtSQ3Dk8pq4CL3jvdDyjU= 89 | github.com/klauspost/pgzip v1.2.6/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs= 90 | github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= 91 | github.com/kr/pretty 
v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= 92 | github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= 93 | github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= 94 | github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= 95 | github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= 96 | github.com/matoous/go-nanoid/v2 v2.1.0 h1:P64+dmq21hhWdtvZfEAofnvJULaRR1Yib0+PnU669bE= 97 | github.com/matoous/go-nanoid/v2 v2.1.0/go.mod h1:KlbGNQ+FhrUNIHUxZdL63t7tl4LaPkZNpUULS8H4uVM= 98 | github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= 99 | github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= 100 | github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= 101 | github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= 102 | github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 h1:AMFGa4R4MiIpspGNG7Z948v4n35fFGB3RR3G/ry4FWs= 103 | github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8/go.mod h1:mC1jAcsrzbxHt8iiaC+zU4b1ylILSosueou12R++wfY= 104 | github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 h1:+n/aFZefKZp7spd8DFdX7uMikMLXX4oubIzJF4kv/wI= 105 | github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3/go.mod h1:RagcQ7I8IeTMnF8JTXieKnO4Z6JCsikNEzj0DwauVzE= 106 | github.com/nsf/jsondiff v0.0.0-20210926074059-1e845ec5d249 h1:NHrXEjTNQY7P0Zfx1aMrNhpgxHmow66XQtm0aQLY0AE= 107 | github.com/nsf/jsondiff v0.0.0-20210926074059-1e845ec5d249/go.mod h1:mpRZBD8SJ55OIICQ3iWH0Yz3cjzA61JdqMLoWXeB2+8= 108 | github.com/pierrec/lz4/v4 v4.1.22 h1:cKFw6uJDK+/gfw5BcDL0JL5aBsAFdsIT18eRtLj7VIU= 109 | github.com/pierrec/lz4/v4 v4.1.22/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= 110 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 111 | github.com/pmezard/go-difflib 
v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 112 | github.com/quipo/dependencysolver v0.0.0-20170801134659-2b009cb4ddcc h1:hK577yxEJ2f5s8w2iy2KimZmgrdAUZUNftE1ESmg2/Q= 113 | github.com/quipo/dependencysolver v0.0.0-20170801134659-2b009cb4ddcc/go.mod h1:OQt6Zo5B3Zs+C49xul8kcHo+fZ1mCLPvd0LFxiZ2DHc= 114 | github.com/rcrowley/go-metrics v0.0.0-20201227073835-cf1acfcdf475 h1:N/ElC8H3+5XpJzTSTfLsJV/mx9Q9g7kxmchpfZyxgzM= 115 | github.com/rcrowley/go-metrics v0.0.0-20201227073835-cf1acfcdf475/go.mod h1:bCqnVzQkZxMG4s8nGwiZ5l3QUCyqpo9Y+/ZMZ9VjZe4= 116 | github.com/redpanda-data/benthos/v4 v4.52.0 h1:D47ayRCWxtFN0vrZvQo62T0L5S0+rVyrMj8H0R54UbQ= 117 | github.com/redpanda-data/benthos/v4 v4.52.0/go.mod h1:NGzOuISEVc8wNsf8Xn/6jyBR/ss2J0Okw+ZYNvzl+ak= 118 | github.com/rickb777/period v1.0.14 h1:Ucj/lTa3QwpuXFP9JqOitbmtibCkQsuxq8lLOf3GEBY= 119 | github.com/rickb777/period v1.0.14/go.mod h1:eDPQSeeG0c6g2Fz8/42+VDBttXNCV6TwVe8Magn2IgM= 120 | github.com/rickb777/plural v1.4.4 h1:OpZU8uRr9P2NkYAbkLMwlKNVJyJ5HvRcRBFyXGJtKGI= 121 | github.com/rickb777/plural v1.4.4/go.mod h1:DB19dtrplGS5s6VJVHn7tvmFYPoE83p1xqio3oVnNRM= 122 | github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs= 123 | github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro= 124 | github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= 125 | github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= 126 | github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= 127 | github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= 128 | github.com/segmentio/ksuid v1.0.4 h1:sBo2BdShXjmcugAMwjugoGUdUV0pcxY5mW4xKRn3v4c= 129 | github.com/segmentio/ksuid v1.0.4/go.mod h1:/XUiZBD3kVx5SmUOl55voK5yeAbBNNIed+2O73XgrPE= 130 | github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= 131 | 
github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= 132 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 133 | github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= 134 | github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= 135 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= 136 | github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= 137 | github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= 138 | github.com/tilinna/z85 v1.0.0 h1:uqFnJBlD01dosSeo5sK1G1YGbPuwqVHqR+12OJDRjUw= 139 | github.com/tilinna/z85 v1.0.0/go.mod h1:EfpFU/DUY4ddEy6CRvk2l+UQNEzHbh+bqBQS+04Nkxs= 140 | github.com/urfave/cli/v2 v2.27.6 h1:VdRdS98FNhKZ8/Az8B7MTyGQmpIr36O1EHybx/LaZ4g= 141 | github.com/urfave/cli/v2 v2.27.6/go.mod h1:3Sevf16NykTbInEnD0yKkjDAeZDS0A6bzhBH5hrMvTQ= 142 | github.com/wk8/go-ordered-map/v2 v2.1.8 h1:5h/BUHu93oj4gIdvHHHGsScSTMijfx5PeYkE/fJgbpc= 143 | github.com/wk8/go-ordered-map/v2 v2.1.8/go.mod h1:5nJHM5DyteebpVlHnWMV0rPz6Zp7+xBAnxjb1X5vnTw= 144 | github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f/go.mod h1:N2zxlSyiKSe5eX1tZViRH5QA0qijqEDrYZiPEAiq3wU= 145 | github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb h1:zGWFAtiMcyryUHoUjUJX0/lt1H2+i2Ka2n+D3DImSNo= 146 | github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb/go.mod h1:N2zxlSyiKSe5eX1tZViRH5QA0qijqEDrYZiPEAiq3wU= 147 | github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 h1:EzJWgHovont7NscjpAxXsDA8S8BMYve8Y5+7cuRE7R0= 148 | github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415/go.mod h1:GwrjFmJcFw6At/Gs6z4yjiIwzuJ1/+UwLxMQDVQXShQ= 149 | github.com/xeipuuv/gojsonschema v1.2.0 h1:LhYJRs+L4fBtjZUfuSZIKGeVu0QRy8e5Xi7D17UxZ74= 150 | github.com/xeipuuv/gojsonschema v1.2.0/go.mod 
h1:anYRn/JVcOK2ZgGU+IjEV4nwlhoK5sQluxsYJ78Id3Y= 151 | github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 h1:gEOO8jv9F4OT7lGCjxCBTO/36wtF6j2nSip77qHd4x4= 152 | github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM= 153 | github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU= 154 | github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E= 155 | github.com/youmark/pkcs8 v0.0.0-20201027041543-1326539a0a0a h1:fZHgsYlfvtyqToslyjUt3VOPF4J7aK/3MPcK7xp3PDk= 156 | github.com/youmark/pkcs8 v0.0.0-20201027041543-1326539a0a0a/go.mod h1:ul22v+Nro/R083muKhosV54bj5niojjWZvU8xrevuH4= 157 | github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ= 158 | github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0= 159 | github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= 160 | github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA= 161 | go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= 162 | go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= 163 | go.opentelemetry.io/otel v1.36.0 h1:UumtzIklRBY6cI/lllNZlALOF5nNIzJVb16APdvgTXg= 164 | go.opentelemetry.io/otel v1.36.0/go.mod h1:/TcFMXYjyRNh8khOAO9ybYkqaDBb/70aVwkNML4pP8E= 165 | go.opentelemetry.io/otel/metric v1.36.0 h1:MoWPKVhQvJ+eeXWHFBOPoBOi20jh6Iq2CcCREuTYufE= 166 | go.opentelemetry.io/otel/metric v1.36.0/go.mod h1:zC7Ks+yeyJt4xig9DEw9kuUFe5C3zLbVjV2PzT6qzbs= 167 | go.opentelemetry.io/otel/sdk v1.34.0 h1:95zS4k/2GOy069d321O8jWgYsW3MzVV+KuSPKp7Wr1A= 168 | go.opentelemetry.io/otel/sdk v1.34.0/go.mod h1:0e/pNiaMAqaykJGKbi+tSjWfNNHMTxoC9qANsCzbyxU= 169 | go.opentelemetry.io/otel/sdk/metric v1.34.0 h1:5CeK9ujjbFVL5c1PhLuStg1wxA7vQv7ce1EK0Gyvahk= 170 | go.opentelemetry.io/otel/sdk/metric v1.34.0/go.mod 
h1:jQ/r8Ze28zRKoNRdkjCZxfs6YvBTG1+YIqyFVFYec5w= 171 | go.opentelemetry.io/otel/trace v1.36.0 h1:ahxWNuqZjpdiFAyrIoQ4GIiAIhxAunQR6MUoKrsNd4w= 172 | go.opentelemetry.io/otel/trace v1.36.0/go.mod h1:gQ+OnDZzrybY4k4seLzPAWNwVBBVlF2szhehOBB/tGA= 173 | golang.org/x/crypto v0.38.0 h1:jt+WWG8IZlBnVbomuhg2Mdq0+BBQaHbtqHEFEigjUV8= 174 | golang.org/x/crypto v0.38.0/go.mod h1:MvrbAqul58NNYPKnOra203SB9vpuZW0e+RRZV+Ggqjw= 175 | golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 h1:e66Fs6Z+fZTbFBAxKfP3PALWBtpfqks2bwGcexMxgtk= 176 | golang.org/x/exp v0.0.0-20240909161429-701f63a606c0/go.mod h1:2TbTHSBQa924w8M6Xs1QcRcFwyucIwBGpK1p2f1YFFY= 177 | golang.org/x/mod v0.24.0 h1:ZfthKaKaT4NrhGVZHO1/WDTwGES4De8KtWO0SIbNJMU= 178 | golang.org/x/mod v0.24.0/go.mod h1:IXM97Txy2VM4PJ3gI61r1YEk/gAj6zAHN3AdZt6S9Ww= 179 | golang.org/x/net v0.39.0 h1:ZCu7HMWDxpXpaiKdhzIfaltL9Lp31x/3fCP11bc6/fY= 180 | golang.org/x/net v0.39.0/go.mod h1:X7NRbYVEA+ewNkCNyJ513WmMdQ3BineSwVtN2zD/d+E= 181 | golang.org/x/sync v0.14.0 h1:woo0S4Yywslg6hp4eUFjTVOyKt0RookbpAHG4c1HmhQ= 182 | golang.org/x/sync v0.14.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= 183 | golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= 184 | golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= 185 | golang.org/x/text v0.25.0 h1:qVyWApTSYLk/drJRO5mDlNYskwQznZmkpV2c8q9zls4= 186 | golang.org/x/text v0.25.0/go.mod h1:WEdwpYrmk1qmdHvhkSTNPm3app7v4rsT8F2UD6+VHIA= 187 | golang.org/x/tools v0.32.0 h1:Q7N1vhpkQv7ybVzLFtTjvQya2ewbwNDZzUgfXGqtMWU= 188 | golang.org/x/tools v0.32.0/go.mod h1:ZxrU41P/wAbZD8EDa6dDCa6XfpkhJ7HFMjHJXfBDu8s= 189 | golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da h1:noIWHXmPHxILtqtCOPIhSt0ABwskkZKjD3bXGnZGpNY= 190 | golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da/go.mod h1:NDW/Ps6MPRej6fsCIbMTohpP40sJ/P/vI1MoTEGwX90= 191 | gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= 192 | gonum.org/v1/gonum v0.16.0/go.mod 
h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= 193 | google.golang.org/genproto/googleapis/rpc v0.0.0-20250218202821-56aae31c358a h1:51aaUVRocpvUOSQKM6Q7VuoaktNIaMCLuhZB6DKksq4= 194 | google.golang.org/genproto/googleapis/rpc v0.0.0-20250218202821-56aae31c358a/go.mod h1:uRxBH1mhmO8PGhU89cMcHaXKZqO+OfakD8QQO0oYwlQ= 195 | google.golang.org/grpc v1.72.0 h1:S7UkcVa60b5AAQTaO6ZKamFp1zMZSU0fGDK2WZLbBnM= 196 | google.golang.org/grpc v1.72.0/go.mod h1:wH5Aktxcg25y1I3w7H69nHfXdOG3UiadoBtjh3izSDM= 197 | google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= 198 | google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= 199 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 200 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= 201 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= 202 | gopkg.in/natefinch/lumberjack.v2 v2.2.1 h1:bBRl1b0OH9s/DuPhuXpNl+VtCaJXFZ5/uEFST95x9zc= 203 | gopkg.in/natefinch/lumberjack.v2 v2.2.1/go.mod h1:YD8tP3GAjkrDg1eZH7EGmyESg/lsYskCTPBJVb9jqSc= 204 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 205 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 206 | -------------------------------------------------------------------------------- /json2parquet/.gitignore: -------------------------------------------------------------------------------- 1 | *.json 2 | *.parquet -------------------------------------------------------------------------------- /json2parquet/cmd/.gitignore: -------------------------------------------------------------------------------- 1 | *.json 2 | *.parquet -------------------------------------------------------------------------------- /json2parquet/cmd/cleaner/main.go: 
-------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "flag" 6 | "fmt" 7 | "log" 8 | "os" 9 | "path/filepath" 10 | 11 | "github.com/goccy/go-json" 12 | 13 | "github.com/redpanda-data/benthos/v4/public/bloblang" 14 | ) 15 | 16 | // jcleaner takes as input a JSONL file, and removes all null fields, empty arrays, 17 | // empty objects and empty strings. 18 | func main() { 19 | inputFile := flag.String("in", "", "input file") 20 | outputFile := flag.String("out", "", "output file") 21 | flag.Parse() 22 | if *inputFile == "" { 23 | log.Fatal("no input file specified") 24 | } 25 | if *outputFile == "" { 26 | log.Fatal("no output file specified") 27 | } 28 | problemLines := fileNameWithoutExt(*outputFile) + "_problem.json" 29 | f, err := os.Open(*inputFile) 30 | if err != nil { 31 | panic(err) 32 | } 33 | defer func() { 34 | if r := recover(); r != nil { 35 | fmt.Println(err) 36 | } 37 | }() 38 | defer f.Close() 39 | bloblangMapping := `map remove_null_empty { 40 | root = match { 41 | (this.type() == "object" && this.length() == 0) => deleted() 42 | this.type() == "object" => this.map_each(i -> i.value.apply("remove_null_empty")) 43 | (this.type() == "array" && this.length() == 0) => deleted() 44 | this.type() == "array" => this.map_each(v -> v.apply("remove_null_empty")) 45 | this.type() == "null" => deleted() 46 | this.type() == "string" && this.length() == 0 => deleted() 47 | } 48 | } 49 | root = this.apply("remove_null_empty")` 50 | exe, err := bloblang.Parse(bloblangMapping) 51 | if err != nil { 52 | log.Println(err) 53 | } 54 | 55 | nf, err := os.Create(*outputFile) 56 | if err != nil { 57 | panic(err) 58 | } 59 | defer nf.Close() 60 | w := bufio.NewWriterSize(nf, 1024*4) 61 | 62 | pf, err := os.Create(problemLines) 63 | if err != nil { 64 | panic(err) 65 | } 66 | defer pf.Close() 67 | pw := bufio.NewWriterSize(nf, 1024*4) 68 | 69 | r := bufio.NewReaderSize(f, 1024*4) 70 | s := 
bufio.NewScanner(r) 71 | newline := []byte("\n") 72 | for s.Scan() { 73 | y := s.Bytes() 74 | b, err := ApplyBloblangMapping(y, exe) 75 | if err != nil { 76 | pw.Write(y) 77 | pw.Write(newline) 78 | continue 79 | } 80 | _, err = w.Write(b) 81 | if err != nil { 82 | pw.Write(y) 83 | pw.Write(newline) 84 | continue 85 | } 86 | w.Write(newline) 87 | } 88 | w.Flush() 89 | } 90 | 91 | func ApplyBloblangMapping(jsonInput []byte, exe *bloblang.Executor) ([]byte, error) { 92 | // Parse the JSON input into a map[string]interface{} 93 | var inputMap map[string]interface{} 94 | if err := json.Unmarshal(jsonInput, &inputMap); err != nil { 95 | return nil, err 96 | } 97 | 98 | // Execute the Bloblang mapping 99 | res, err := exe.Query(inputMap) 100 | if err != nil { 101 | return nil, err 102 | } 103 | 104 | // Convert the result back into a JSON string 105 | jsonResult, err := json.Marshal(res) 106 | if err != nil { 107 | return nil, err 108 | } 109 | 110 | return jsonResult, nil 111 | } 112 | 113 | func fileNameWithoutExt(fileName string) string { 114 | return fileName[:len(fileName)-len(filepath.Ext(fileName))] 115 | } 116 | -------------------------------------------------------------------------------- /json2parquet/cmd/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "log" 7 | "os" 8 | "runtime/pprof" 9 | 10 | "github.com/loicalleyne/bodkin" 11 | j2p "github.com/loicalleyne/bodkin/json2parquet" 12 | ) 13 | 14 | var cpuprofile = flag.String("cpuprofile", "default.pgo", "write cpu profile to `file`") 15 | 16 | func main() { 17 | inferMode := flag.Bool("infer_timeunits", true, "Infer date, time and timestamps from strings") 18 | quotedValuesAreStrings := flag.Bool("quoted_values_are_strings", false, "Treat quoted bool, float and integer values as strings") 19 | withTypeConversion := flag.Bool("type_conversion", false, "upgrade field types if data changes") 20 | inputFile := 
flag.String("in", "s.json", "input file") 21 | outputFile := flag.String("out", "t.parquet", "output file") 22 | dryRun := flag.Bool("n", false, "only print the schema") 23 | lines := flag.Int("lines", 0, "number of lines from which to infer schema; 0 means whole file is scanned") 24 | flag.Parse() 25 | if *inputFile == "" { 26 | log.Fatal("no input file specified") 27 | } 28 | log.Println("detecting schema") 29 | if *cpuprofile != "" { 30 | f, err := os.Create(*cpuprofile) 31 | if err != nil { 32 | log.Fatal("could not create CPU profile: ", err) 33 | } 34 | defer f.Close() 35 | if err := pprof.StartCPUProfile(f); err != nil { 36 | log.Fatal("could not start CPU profile: ", err) 37 | } 38 | defer pprof.StopCPUProfile() 39 | defer log.Printf("program ended\nto view profile run 'go tool pprof -http localhost:8080 %s\n", *cpuprofile) 40 | } 41 | var opts []bodkin.Option 42 | if *inferMode { 43 | opts = append(opts, bodkin.WithInferTimeUnits()) 44 | } 45 | if *withTypeConversion { 46 | opts = append(opts, bodkin.WithTypeConversion()) 47 | } 48 | if *quotedValuesAreStrings { 49 | opts = append(opts, bodkin.WithQuotedValuesAreStrings()) 50 | } 51 | if *lines != 0 { 52 | opts = append(opts, bodkin.WithMaxCount(*lines)) 53 | } 54 | arrowSchema, n, err := j2p.SchemaFromFile(*inputFile, opts...) 
55 | if err == bodkin.ErrInvalidInput { 56 | fmt.Printf("schema creation error %v\n", err) 57 | } 58 | if arrowSchema == nil { 59 | log.Fatal("nil schema") 60 | } 61 | log.Printf("schema from %d records\n", n) 62 | fmt.Println(arrowSchema.String()) 63 | if !*dryRun { 64 | if *outputFile == "" { 65 | log.Fatal("no output file specified") 66 | } 67 | log.Println("starting conversion to parquet") 68 | 69 | n, err = j2p.RecordsFromFile(*inputFile, *outputFile, arrowSchema, nil) 70 | log.Printf("%d records written", n) 71 | if err != nil { 72 | log.Printf("parquet error: %v", err) 73 | } 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /json2parquet/json2parquet.go: -------------------------------------------------------------------------------- 1 | package json2parquet 2 | 3 | import ( 4 | "bufio" 5 | "errors" 6 | "fmt" 7 | "io" 8 | "os" 9 | 10 | "github.com/apache/arrow-go/v18/arrow" 11 | "github.com/apache/arrow-go/v18/arrow/array" 12 | "github.com/apache/arrow-go/v18/parquet" 13 | "github.com/loicalleyne/bodkin" 14 | "github.com/loicalleyne/bodkin/pq" 15 | ) 16 | 17 | func FromReader(r io.Reader, opts ...bodkin.Option) (*arrow.Schema, int, error) { 18 | var err error 19 | s := bufio.NewScanner(r) 20 | u := bodkin.NewBodkin(opts...) 21 | for s.Scan() { 22 | u.Unify(s.Bytes()) 23 | if u.Count() > u.MaxCount() { 24 | break 25 | } 26 | } 27 | schema, err := u.Schema() 28 | if err != nil { 29 | return nil, u.Count(), err 30 | } 31 | return schema, u.Count(), err 32 | } 33 | 34 | func SchemaFromFile(inputFile string, opts ...bodkin.Option) (*arrow.Schema, int, error) { 35 | f, err := os.Open(inputFile) 36 | if err != nil { 37 | return nil, 0, err 38 | } 39 | defer f.Close() 40 | 41 | r := bufio.NewReaderSize(f, 1024*32) 42 | return FromReader(r, opts...) 
43 | } 44 | 45 | func RecordsFromFile(inputFile, outputFile string, schema *arrow.Schema, munger func(io.Reader, io.Writer) error, opts ...parquet.WriterProperty) (int, error) { 46 | n := 0 47 | f, err := os.Open(inputFile) 48 | if err != nil { 49 | return 0, err 50 | } 51 | defer func() { 52 | if r := recover(); r != nil { 53 | fmt.Println(err) 54 | fmt.Println("Records:", n) 55 | } 56 | }() 57 | defer f.Close() 58 | var prp *parquet.WriterProperties = pq.DefaultWrtp 59 | if len(opts) != 0 { 60 | prp = parquet.NewWriterProperties(opts...) 61 | } 62 | pw, _, err := pq.NewParquetWriter(schema, prp, outputFile) 63 | if err != nil { 64 | return 0, err 65 | } 66 | defer pw.Close() 67 | 68 | var r io.Reader 69 | var rdr *array.JSONReader 70 | chunk := 1024 71 | munger = nil 72 | r = bufio.NewReaderSize(f, 1024*1024*128) 73 | if munger != nil { 74 | pr, pwr := io.Pipe() 75 | 76 | go func() { 77 | // close the writer, so the reader knows there's no more data 78 | defer pwr.Close() 79 | munger(r, pwr) 80 | }() 81 | rdr = array.NewJSONReader(pr, schema, array.WithChunk(chunk)) 82 | } else { 83 | rdr = array.NewJSONReader(r, schema, array.WithChunk(chunk)) 84 | } 85 | 86 | defer rdr.Release() 87 | 88 | for rdr.Next() { 89 | rec := rdr.Record() 90 | err1 := pw.WriteRecord(rec) 91 | if err != nil { 92 | err = errors.Join(err, fmt.Errorf("failed to write parquet record: %v", err1)) 93 | } 94 | n = n + chunk 95 | } 96 | if err := rdr.Err(); err != nil { 97 | return n, err 98 | } 99 | err = pw.Close() 100 | if err != nil { 101 | return n, err 102 | } 103 | return n, err 104 | } 105 | -------------------------------------------------------------------------------- /option.go: -------------------------------------------------------------------------------- 1 | package bodkin 2 | 3 | import ( 4 | "bufio" 5 | "io" 6 | ) 7 | 8 | // WithInferTimeUnits() enables scanning input string values for time, date and timestamp types. 
9 | // 10 | // Times use a format of HH:MM or HH:MM:SS[.zzz] where the fractions of a second cannot 11 | // exceed the precision allowed by the time unit, otherwise unmarshalling will error. 12 | // 13 | // Dates use YYYY-MM-DD format. 14 | // 15 | // Timestamps use RFC3339Nano format except without a timezone, all of the following are valid: 16 | // 17 | // YYYY-MM-DD 18 | // YYYY-MM-DD[T]HH 19 | // YYYY-MM-DD[T]HH:MM 20 | // YYYY-MM-DD[T]HH:MM:SS[.zzzzzzzzzz] 21 | func WithInferTimeUnits() Option { 22 | return func(cfg config) { 23 | cfg.inferTimeUnits = true 24 | } 25 | } 26 | 27 | // WithTypeConversion enables upgrading the column types to fix compatibilty conflicts. 28 | func WithTypeConversion() Option { 29 | return func(cfg config) { 30 | cfg.typeConversion = true 31 | } 32 | } 33 | 34 | // WithTypeConversion enables upgrading the column types to fix compatibilty conflicts. 35 | func WithQuotedValuesAreStrings() Option { 36 | return func(cfg config) { 37 | cfg.quotedValuesAreStrings = true 38 | } 39 | } 40 | 41 | // WithMaxCount enables capping the number of Unify evaluations. 42 | func WithMaxCount(i int) Option { 43 | return func(cfg config) { 44 | cfg.maxCount = i 45 | } 46 | } 47 | 48 | // WithIOReader provides an io.Reader for a Bodkin to use with UnifyScan(), along 49 | // with a delimiter to use to split datum in the data stream. 50 | // Default delimiter '\n' if delimiter is not provided. 
51 | func WithIOReader(r io.Reader, delim byte) Option { 52 | return func(cfg config) { 53 | cfg.rr = r 54 | cfg.br = bufio.NewReaderSize(cfg.rr, 1024*16) 55 | if delim != '\n' { 56 | cfg.delim = delim 57 | } 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /pq/parquet_writer.go: -------------------------------------------------------------------------------- 1 | package pq 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | 7 | "github.com/apache/arrow-go/v18/arrow" 8 | "github.com/apache/arrow-go/v18/arrow/array" 9 | "github.com/apache/arrow-go/v18/arrow/memory" 10 | "github.com/apache/arrow-go/v18/parquet" 11 | "github.com/apache/arrow-go/v18/parquet/compress" 12 | "github.com/apache/arrow-go/v18/parquet/pqarrow" 13 | "github.com/apache/arrow-go/v18/parquet/schema" 14 | ) 15 | 16 | const ( 17 | defaultRowGroupByteLimit = 10 * 1024 * 1024 18 | ) 19 | 20 | var ( 21 | DefaultWrtp = parquet.NewWriterProperties( 22 | parquet.WithDictionaryDefault(true), 23 | parquet.WithVersion(parquet.V2_LATEST), 24 | parquet.WithCompression(compress.Codecs.Zstd), 25 | parquet.WithStats(true), 26 | parquet.WithRootName("bodkin"), 27 | ) 28 | ) 29 | 30 | type ParquetWriter struct { 31 | destFile *os.File 32 | pqwrt *pqarrow.FileWriter 33 | sc *arrow.Schema 34 | count int 35 | } 36 | 37 | // NewParquetWriter creates a new ParquetWriter. 38 | // 39 | // sc is the Arrow schema to use for writing records. 40 | // wrtp are the Parquet writer properties to use. 41 | // 42 | // Returns a ParquetWriter and an error. The error will be non-nil if: 43 | // - Failed to get the Parquet schema from the Arrow schema. 44 | // - Failed to create the destination file. 45 | // - Failed to create the Parquet file writer. 
46 | // 47 | // Example: 48 | // ```go 49 | // pw, err := NewParquetWriter(schema, parquet.NewWriterProperties(parquet.WithCompression(parquet.CompressionCodec_SNAPPY))) 50 | // 51 | // if err != nil { 52 | // log.Fatal(err) 53 | // } 54 | // 55 | // ``` 56 | func NewParquetWriter(sc *arrow.Schema, wrtp *parquet.WriterProperties, path string) (*ParquetWriter, *schema.Schema, error) { 57 | pqschema, err := pqarrow.ToParquet(sc, wrtp, pqarrow.DefaultWriterProps()) 58 | if err != nil { 59 | return nil, nil, fmt.Errorf("failed to get parquet schema: %w", err) 60 | } 61 | 62 | destFile, err := os.Create(path) 63 | if err != nil { 64 | return nil, nil, fmt.Errorf("failed to create destination file: %w", err) 65 | } 66 | artp := pqarrow.NewArrowWriterProperties(pqarrow.WithStoreSchema()) 67 | pqwrt, err := pqarrow.NewFileWriter(sc, destFile, wrtp, artp) 68 | if err != nil { 69 | return nil, nil, fmt.Errorf("failed to create parquet writer: %w", err) 70 | } 71 | 72 | return &ParquetWriter{destFile: destFile, pqwrt: pqwrt, sc: sc}, pqschema, nil 73 | } 74 | 75 | // Write writes a single record to the Parquet file. 76 | // 77 | // jsonData is the JSON encoded record data. 78 | // 79 | // Returns an error if: 80 | // - Failed to unmarshal the JSON data. 81 | // - Failed to write the record to Parquet. 82 | // 83 | // Increments the record count and creates a new row group if the current 84 | // row group exceeds the default row group byte limit. 
85 | // 86 | // Example: 87 | // ```go 88 | // err := pw.Write([]byte(`{"id":1,"name":"foo"}`)) 89 | // 90 | // if err != nil { 91 | // log.Fatal(err) 92 | // } 93 | // 94 | // ``` 95 | func (pw *ParquetWriter) Write(jsonData []byte) error { 96 | recbld := array.NewRecordBuilder(memory.DefaultAllocator, pw.sc) 97 | defer recbld.Release() 98 | 99 | err := recbld.UnmarshalJSON(jsonData) 100 | if err != nil { 101 | return fmt.Errorf("failed to unmarshal JSON: %w", err) 102 | } 103 | 104 | rec := recbld.NewRecord() 105 | defer rec.Release() 106 | err = pw.pqwrt.WriteBuffered(rec) 107 | if err != nil { 108 | return fmt.Errorf("failed to write to parquet: %w", err) 109 | } 110 | 111 | if pw.pqwrt.RowGroupTotalBytesWritten() >= defaultRowGroupByteLimit { 112 | pw.pqwrt.NewBufferedRowGroup() 113 | } 114 | pw.count++ 115 | 116 | return nil 117 | } 118 | 119 | func (pw *ParquetWriter) WriteRecord(rec arrow.Record) error { 120 | err := pw.pqwrt.WriteBuffered(rec) 121 | if err != nil { 122 | return fmt.Errorf("failed to write to parquet: %w", err) 123 | } 124 | 125 | if pw.pqwrt.RowGroupTotalBytesWritten() >= defaultRowGroupByteLimit { 126 | pw.pqwrt.NewBufferedRowGroup() 127 | } 128 | pw.count++ 129 | 130 | return nil 131 | } 132 | 133 | // RecordCount returns the total number of records written. 134 | func (pw *ParquetWriter) RecordCount() int { 135 | return pw.count 136 | } 137 | 138 | // Close closes the Parquet writer. 139 | // 140 | // Returns an error if failed to close the Parquet file writer. 
141 | func (pw *ParquetWriter) Close() error { 142 | if err := pw.pqwrt.Close(); err != nil { 143 | return fmt.Errorf("failed to close parquet writer: %w", err) 144 | } 145 | 146 | return nil 147 | } 148 | -------------------------------------------------------------------------------- /reader/.gitignore: -------------------------------------------------------------------------------- 1 | avro.go -------------------------------------------------------------------------------- /reader/encoder.go: -------------------------------------------------------------------------------- 1 | package reader 2 | 3 | import ( 4 | "encoding" 5 | "errors" 6 | "fmt" 7 | "reflect" 8 | "strings" 9 | 10 | "github.com/go-viper/mapstructure/v2" 11 | ) 12 | 13 | const ( 14 | tagNameMapStructure = "mapstructure" 15 | optionSeparator = "," 16 | optionOmitEmpty = "omitempty" 17 | optionSquash = "squash" 18 | optionRemain = "remain" 19 | optionSkip = "-" 20 | ) 21 | 22 | var ( 23 | errNonStringEncodedKey = errors.New("non string-encoded key") 24 | ) 25 | 26 | // tagInfo stores the mapstructure tag details. 27 | type tagInfo struct { 28 | name string 29 | omitEmpty bool 30 | squash bool 31 | } 32 | 33 | // An Encoder takes structured data and converts it into an 34 | // interface following the mapstructure tags. 35 | type Encoder struct { 36 | config *EncoderConfig 37 | } 38 | 39 | // EncoderConfig is the configuration used to create a new encoder. 40 | type EncoderConfig struct { 41 | // EncodeHook, if set, is a way to provide custom encoding. It 42 | // will be called before structs and primitive types. 43 | EncodeHook mapstructure.DecodeHookFunc 44 | } 45 | 46 | // New returns a new encoder for the configuration. 47 | func New(cfg *EncoderConfig) *Encoder { 48 | return &Encoder{config: cfg} 49 | } 50 | 51 | // Encode takes the input and uses reflection to encode it to 52 | // an interface based on the mapstructure spec. 
53 | func (e *Encoder) Encode(input any) (any, error) { 54 | return e.encode(reflect.ValueOf(input)) 55 | } 56 | 57 | // encode processes the value based on the reflect.Kind. 58 | func (e *Encoder) encode(value reflect.Value) (any, error) { 59 | if value.IsValid() { 60 | switch value.Kind() { 61 | case reflect.Interface, reflect.Ptr: 62 | return e.encode(value.Elem()) 63 | case reflect.Map: 64 | return e.encodeMap(value) 65 | case reflect.Slice: 66 | return e.encodeSlice(value) 67 | case reflect.Struct: 68 | return e.encodeStruct(value) 69 | default: 70 | return e.encodeHook(value) 71 | } 72 | } 73 | return nil, nil 74 | } 75 | 76 | // encodeHook calls the EncodeHook in the EncoderConfig with the value passed in. 77 | // This is called before processing structs and for primitive data types. 78 | func (e *Encoder) encodeHook(value reflect.Value) (any, error) { 79 | if e.config != nil && e.config.EncodeHook != nil { 80 | out, err := mapstructure.DecodeHookExec(e.config.EncodeHook, value, value) 81 | if err != nil { 82 | return nil, fmt.Errorf("error running encode hook: %w", err) 83 | } 84 | return out, nil 85 | } 86 | return value.Interface(), nil 87 | } 88 | 89 | // encodeStruct encodes the struct by iterating over the fields, getting the 90 | // mapstructure tagInfo for each exported field, and encoding the value. 91 | func (e *Encoder) encodeStruct(value reflect.Value) (any, error) { 92 | if value.Kind() != reflect.Struct { 93 | return nil, &reflect.ValueError{ 94 | Method: "encodeStruct", 95 | Kind: value.Kind(), 96 | } 97 | } 98 | out, err := e.encodeHook(value) 99 | if err != nil { 100 | return nil, err 101 | } 102 | value = reflect.ValueOf(out) 103 | // if the output of encodeHook is no longer a struct, 104 | // call encode against it. 
105 | if value.Kind() != reflect.Struct { 106 | return e.encode(value) 107 | } 108 | result := make(map[string]any) 109 | for i := 0; i < value.NumField(); i++ { 110 | field := value.Field(i) 111 | if field.CanInterface() { 112 | info := getTagInfo(value.Type().Field(i)) 113 | if (info.omitEmpty && field.IsZero()) || info.name == optionSkip { 114 | continue 115 | } 116 | encoded, err := e.encode(field) 117 | if err != nil { 118 | return nil, fmt.Errorf("error encoding field %q: %w", info.name, err) 119 | } 120 | if info.squash { 121 | if m, ok := encoded.(map[string]any); ok { 122 | for k, v := range m { 123 | result[k] = v 124 | } 125 | } 126 | } else { 127 | result[info.name] = encoded 128 | } 129 | } 130 | } 131 | return result, nil 132 | } 133 | 134 | // encodeSlice iterates over the slice and encodes each of the elements. 135 | func (e *Encoder) encodeSlice(value reflect.Value) (any, error) { 136 | if value.Kind() != reflect.Slice { 137 | return nil, &reflect.ValueError{ 138 | Method: "encodeSlice", 139 | Kind: value.Kind(), 140 | } 141 | } 142 | result := make([]any, value.Len()) 143 | for i := 0; i < value.Len(); i++ { 144 | var err error 145 | if result[i], err = e.encode(value.Index(i)); err != nil { 146 | return nil, fmt.Errorf("error encoding element in slice at index %d: %w", i, err) 147 | } 148 | } 149 | return result, nil 150 | } 151 | 152 | // encodeMap encodes a map by encoding the key and value. Returns errNonStringEncodedKey 153 | // if the key is not encoded into a string. 
154 | func (e *Encoder) encodeMap(value reflect.Value) (any, error) { 155 | if value.Kind() != reflect.Map { 156 | return nil, &reflect.ValueError{ 157 | Method: "encodeMap", 158 | Kind: value.Kind(), 159 | } 160 | } 161 | result := make(map[string]any) 162 | iterator := value.MapRange() 163 | for iterator.Next() { 164 | encoded, err := e.encode(iterator.Key()) 165 | if err != nil { 166 | return nil, fmt.Errorf("error encoding key: %w", err) 167 | } 168 | 169 | v := reflect.ValueOf(encoded) 170 | var key string 171 | 172 | switch v.Kind() { 173 | case reflect.String: 174 | key = v.String() 175 | default: 176 | return nil, fmt.Errorf("%w, key: %q, kind: %v, type: %T", errNonStringEncodedKey, iterator.Key().Interface(), iterator.Key().Kind(), encoded) 177 | } 178 | 179 | if _, ok := result[key]; ok { 180 | return nil, fmt.Errorf("duplicate key %q while encoding", key) 181 | } 182 | if result[key], err = e.encode(iterator.Value()); err != nil { 183 | return nil, fmt.Errorf("error encoding map value for key %q: %w", key, err) 184 | } 185 | } 186 | return result, nil 187 | } 188 | 189 | // getTagInfo looks up the mapstructure tag and uses that if available. 190 | // Uses the lowercase field if not found. Checks for omitempty and squash. 191 | func getTagInfo(field reflect.StructField) *tagInfo { 192 | info := tagInfo{} 193 | if tag, ok := field.Tag.Lookup(tagNameMapStructure); ok { 194 | options := strings.Split(tag, optionSeparator) 195 | info.name = options[0] 196 | if len(options) > 1 { 197 | for _, option := range options[1:] { 198 | switch option { 199 | case optionOmitEmpty: 200 | info.omitEmpty = true 201 | case optionSquash, optionRemain: 202 | info.squash = true 203 | } 204 | } 205 | } 206 | } else { 207 | info.name = strings.ToLower(field.Name) 208 | } 209 | return &info 210 | } 211 | 212 | // TextMarshalerHookFunc returns a DecodeHookFuncValue that checks 213 | // for the encoding.TextMarshaler interface and calls the MarshalText 214 | // function if found. 
215 | func TextMarshalerHookFunc() mapstructure.DecodeHookFuncValue { 216 | return func(from reflect.Value, _ reflect.Value) (any, error) { 217 | marshaler, ok := from.Interface().(encoding.TextMarshaler) 218 | if !ok { 219 | return from.Interface(), nil 220 | } 221 | out, err := marshaler.MarshalText() 222 | if err != nil { 223 | return nil, err 224 | } 225 | return string(out), nil 226 | } 227 | } 228 | -------------------------------------------------------------------------------- /reader/input.go: -------------------------------------------------------------------------------- 1 | package reader 2 | 3 | import ( 4 | "bytes" 5 | "errors" 6 | "fmt" 7 | 8 | "github.com/go-viper/mapstructure/v2" 9 | json "github.com/goccy/go-json" 10 | ) 11 | 12 | var ( 13 | ErrUndefinedInput = errors.New("nil input") 14 | ErrInvalidInput = errors.New("invalid input") 15 | ) 16 | 17 | // InputMap takes structured input data and attempts to decode it to 18 | // map[string]any. Input data can be json in string or []byte, or any other 19 | // Go data type which can be decoded by [MapStructure/v2]. 
20 | // [MapStructure/v2]: github.com/go-viper/mapstructure/v2 21 | func InputMap(a any) (map[string]any, error) { 22 | m := map[string]any{} 23 | switch input := a.(type) { 24 | case nil: 25 | return nil, ErrUndefinedInput 26 | case map[string]any: 27 | return input, nil 28 | case []byte: 29 | r := bytes.NewReader(input) 30 | d := json.NewDecoder(r) 31 | d.UseNumber() 32 | err := d.Decode(&m) 33 | if err != nil { 34 | return nil, fmt.Errorf("%v : %v", ErrInvalidInput, err) 35 | } 36 | case string: 37 | r := bytes.NewReader([]byte(input)) 38 | d := json.NewDecoder(r) 39 | d.UseNumber() 40 | err := d.Decode(&m) 41 | if err != nil { 42 | return nil, fmt.Errorf("%v : %v", ErrInvalidInput, err) 43 | } 44 | default: 45 | ms := New(&EncoderConfig{EncodeHook: mapstructure.RecursiveStructToMapHookFunc()}) 46 | enc, err := ms.Encode(a) 47 | if err != nil { 48 | return nil, fmt.Errorf("Error decoding to map[string]interface{}: %v", err) 49 | } 50 | return enc.(map[string]any), nil 51 | } 52 | return m, nil 53 | } 54 | -------------------------------------------------------------------------------- /reader/loader.go: -------------------------------------------------------------------------------- 1 | package reader 2 | 3 | import ( 4 | "bytes" 5 | "encoding/binary" 6 | "encoding/json" 7 | "errors" 8 | "fmt" 9 | "math/big" 10 | "strconv" 11 | "time" 12 | 13 | "github.com/apache/arrow-go/v18/arrow" 14 | "github.com/apache/arrow-go/v18/arrow/array" 15 | "github.com/apache/arrow-go/v18/arrow/decimal128" 16 | "github.com/apache/arrow-go/v18/arrow/decimal256" 17 | "github.com/apache/arrow-go/v18/arrow/extensions" 18 | "github.com/apache/arrow-go/v18/arrow/memory" 19 | ) 20 | 21 | type dataLoader struct { 22 | idx, depth int32 23 | list *fieldPos 24 | item *fieldPos 25 | mapField *fieldPos 26 | mapKey *fieldPos 27 | mapValue *fieldPos 28 | fields []*fieldPos 29 | children []*dataLoader 30 | } 31 | 32 | var ( 33 | ErrNullStructData = errors.New("null struct data") 34 | ) 35 | 36 | 
// newDataLoader returns an empty root loader node.
func newDataLoader() *dataLoader { return &dataLoader{idx: 0, depth: 0} }

// drawTree takes the tree of field builders produced by mapFieldBuilders()
// and produces another tree structure and aggregates fields whose values can
// be retrieved from a `map[string]any` into a slice of builders, and creates a hierarchy to
// deal with nested types (lists and maps).
func (d *dataLoader) drawTree(field *fieldPos) {
	for _, f := range field.children() {
		if f.isList || f.isMap {
			if f.isList {
				// A list gets its own child loader; childrens[0] is the element field.
				c := d.newListChild(f)
				if !f.childrens[0].isList {
					c.item = f.childrens[0]
					c.drawTree(f.childrens[0])
				} else {
					// List of lists: skip the intermediate element and recurse
					// into the inner list's element.
					c.drawTree(f.childrens[0].childrens[0])
				}
			}
			if f.isMap {
				// childrens[0] is the key field, childrens[1] the value field.
				c := d.newMapChild(f)
				if !arrow.IsNested(f.childrens[1].builder.Type().ID()) {
					c.mapKey = f.childrens[0]
					c.mapValue = f.childrens[1]
				} else {
					// Nested map values get their own loader level.
					c.mapKey = f.childrens[0]
					m := c.newChild()
					m.mapValue = f.childrens[1]
					m.drawTree(f.childrens[1])
				}
			}
		} else {
			// Flat (or struct) field: loaded directly at this level; struct
			// children are flattened into the same node's field slice.
			d.fields = append(d.fields, f)
			if len(f.children()) > 0 {
				d.drawTree(f)
			}
		}
	}
}

// loadDatum loads data to the schema fields' builder functions.
// Since array.StructBuilder.AppendNull() will recursively append null to all of the
// struct's fields, in the case of nil being passed to a struct's builderFunc it will
// return a ErrNullStructData error to signal that all its sub-fields can be skipped.
79 | func (d *dataLoader) loadDatum(data any) error { 80 | if d.list == nil && d.mapField == nil { 81 | if d.mapValue != nil { 82 | d.mapValue.appendFunc(data) 83 | } 84 | var NullParent *fieldPos 85 | for _, f := range d.fields { 86 | if f.parent == NullParent { 87 | continue 88 | } 89 | if d.mapValue == nil { 90 | err := f.appendFunc(f.getValue(data)) 91 | if err != nil { 92 | if err == ErrNullStructData { 93 | NullParent = f 94 | continue 95 | } 96 | return err 97 | } 98 | } else { 99 | switch dt := data.(type) { 100 | case nil: 101 | err := f.appendFunc(dt) 102 | if err != nil { 103 | if err == ErrNullStructData { 104 | NullParent = f 105 | continue 106 | } 107 | return err 108 | } 109 | case []any: 110 | if len(d.children) < 1 { 111 | for _, e := range dt { 112 | err := f.appendFunc(e) 113 | if err != nil { 114 | if err == ErrNullStructData { 115 | NullParent = f 116 | continue 117 | } 118 | return err 119 | } 120 | } 121 | } else { 122 | for _, e := range dt { 123 | d.children[0].loadDatum(e) 124 | } 125 | } 126 | case map[string]any: 127 | err := f.appendFunc(f.getValue(dt)) 128 | if err != nil { 129 | if err == ErrNullStructData { 130 | NullParent = f 131 | continue 132 | } 133 | return err 134 | } 135 | } 136 | 137 | } 138 | } 139 | for _, c := range d.children { 140 | if c.list != nil { 141 | c.loadDatum(c.list.getValue(data)) 142 | } 143 | if c.mapField != nil { 144 | switch dt := data.(type) { 145 | case nil: 146 | c.loadDatum(dt) 147 | case map[string]any: 148 | c.loadDatum(c.mapField.getValue(dt)) 149 | default: 150 | c.loadDatum(c.mapField.getValue(data)) 151 | } 152 | } 153 | } 154 | } else { 155 | if d.list != nil { 156 | switch dt := data.(type) { 157 | case nil: 158 | d.list.appendFunc(dt) 159 | case []any: 160 | d.list.appendFunc(dt) 161 | for _, e := range dt { 162 | if d.item != nil { 163 | d.item.appendFunc(e) 164 | } 165 | var NullParent *fieldPos 166 | for _, f := range d.fields { 167 | if f.parent == NullParent { 168 | continue 169 | } 170 
| err := f.appendFunc(f.getValue(e)) 171 | if err != nil { 172 | if err == ErrNullStructData { 173 | NullParent = f 174 | continue 175 | } 176 | return err 177 | } 178 | } 179 | for _, c := range d.children { 180 | if c.list != nil { 181 | c.loadDatum(c.list.getValue(e)) 182 | } 183 | if c.mapField != nil { 184 | c.loadDatum(c.mapField.getValue(e)) 185 | } 186 | } 187 | } 188 | case map[string]any: 189 | d.list.appendFunc(dt) // 190 | for _, e := range dt { // 191 | if d.item != nil { 192 | d.item.appendFunc(e) 193 | } 194 | var NullParent *fieldPos 195 | for _, f := range d.fields { 196 | if f.parent == NullParent { 197 | continue 198 | } 199 | err := f.appendFunc(f.getValue(e)) 200 | if err != nil { 201 | if err == ErrNullStructData { 202 | NullParent = f 203 | continue 204 | } 205 | return err 206 | } 207 | } 208 | for _, c := range d.children { 209 | c.loadDatum(c.list.getValue(e)) 210 | } 211 | } 212 | default: 213 | d.list.appendFunc(data) 214 | d.item.appendFunc(dt) 215 | } 216 | } 217 | if d.mapField != nil { 218 | switch dt := data.(type) { 219 | case nil: 220 | d.mapField.appendFunc(dt) 221 | case map[string]any: 222 | d.mapField.appendFunc(dt) 223 | for k, v := range dt { 224 | d.mapKey.appendFunc(k) 225 | if d.mapValue != nil { 226 | d.mapValue.appendFunc(v) 227 | } else { 228 | d.children[0].loadDatum(v) 229 | } 230 | } 231 | } 232 | } 233 | } 234 | return nil 235 | } 236 | 237 | func (d *dataLoader) newChild() *dataLoader { 238 | var child *dataLoader = &dataLoader{ 239 | depth: d.depth + 1, 240 | } 241 | d.children = append(d.children, child) 242 | return child 243 | } 244 | 245 | func (d *dataLoader) newListChild(list *fieldPos) *dataLoader { 246 | var child *dataLoader = &dataLoader{ 247 | list: list, 248 | item: list.childrens[0], 249 | depth: d.depth + 1, 250 | } 251 | d.children = append(d.children, child) 252 | return child 253 | } 254 | 255 | func (d *dataLoader) newMapChild(mapField *fieldPos) *dataLoader { 256 | var child *dataLoader = 
&dataLoader{ 257 | mapField: mapField, 258 | depth: d.depth + 1, 259 | } 260 | d.children = append(d.children, child) 261 | return child 262 | } 263 | 264 | type fieldPos struct { 265 | parent *fieldPos 266 | fieldName string 267 | builder array.Builder 268 | source DataSource 269 | path []string 270 | isList bool 271 | isItem bool 272 | isStruct bool 273 | isMap bool 274 | typeName string 275 | appendFunc func(val interface{}) error 276 | metadatas arrow.Metadata 277 | childrens []*fieldPos 278 | index, depth int32 279 | } 280 | 281 | func newFieldPos() *fieldPos { return &fieldPos{index: -1} } 282 | 283 | func (f *fieldPos) children() []*fieldPos { return f.childrens } 284 | 285 | func (f *fieldPos) newChild(childName string, childBuilder array.Builder, meta arrow.Metadata) *fieldPos { 286 | var child fieldPos = fieldPos{ 287 | parent: f, 288 | source: f.source, 289 | fieldName: childName, 290 | builder: childBuilder, 291 | metadatas: meta, 292 | index: int32(len(f.childrens)), 293 | depth: f.depth + 1, 294 | } 295 | if f.isList { 296 | child.isItem = true 297 | } 298 | child.path = child.buildNamePath() 299 | f.childrens = append(f.childrens, &child) 300 | return &child 301 | } 302 | 303 | func (f *fieldPos) buildNamePath() []string { 304 | var path []string 305 | 306 | cur := f 307 | for i := f.depth - 1; i >= 0; i-- { 308 | if cur.fieldName != "item" { 309 | path = append([]string{cur.fieldName}, path...) 310 | } else { 311 | break 312 | } 313 | 314 | if !cur.parent.isMap { 315 | cur = cur.parent 316 | } 317 | } 318 | if f.parent.parent != nil && f.parent.parent.isList { 319 | var listPath []string 320 | for i := len(path) - 1; i >= 0; i-- { 321 | if path[i] != "elem" { 322 | listPath = append([]string{path[i]}, listPath...) 
323 | } else { 324 | return listPath 325 | } 326 | } 327 | } 328 | if f.parent != nil && f.parent.fieldName == "item" { 329 | var listPath []string 330 | for i := len(path) - 1; i >= 0; i-- { 331 | if path[i] != "item" { 332 | listPath = append([]string{path[i]}, listPath...) 333 | } else { 334 | return listPath 335 | } 336 | } 337 | } 338 | // avro/arrow Maps ? 339 | // if f.parent != nil && f.parent.fieldName == "value" { 340 | // for i := len(path) - 1; i >= 0; i-- { 341 | // if path[i] != "value" { 342 | // listPath = append([]string{path[i]}, listPath...) 343 | // } else { 344 | // return listPath 345 | // } 346 | // } 347 | // } 348 | return path 349 | } 350 | 351 | // NamePath returns a slice of keys making up the path to the field 352 | func (f *fieldPos) namePath() []string { return f.path } 353 | 354 | // GetValue retrieves the value from the map[string]any 355 | // by following the field's key path 356 | func (f *fieldPos) getValue(m any) any { 357 | if _, ok := m.(map[string]any); !ok { 358 | return m 359 | } 360 | for _, key := range f.namePath() { 361 | valueMap, ok := m.(map[string]any) 362 | if !ok { 363 | if key == "item" { 364 | return m 365 | } 366 | return nil 367 | } 368 | m, ok = valueMap[key] 369 | if !ok { 370 | return nil 371 | } 372 | } 373 | return m 374 | } 375 | 376 | // Data is loaded to Arrow arrays using the following type mapping: 377 | // 378 | // Avro Go Arrow 379 | // null nil Null 380 | // boolean bool Boolean 381 | // bytes []byte Binary 382 | // float float32 Float32 383 | // double float64 Float64 384 | // long int64 Int64 385 | // int int32 Int32 386 | // string string String 387 | // array []interface{} List 388 | // enum string Dictionary 389 | // fixed []byte FixedSizeBinary 390 | // map and record map[string]any Struct 391 | // 392 | // mapFieldBuilders builds a tree of field builders matching the Arrow schema 393 | func mapFieldBuilders(b array.Builder, field arrow.Field, parent *fieldPos) { 394 | f := 
parent.newChild(field.Name, b, field.Metadata) 395 | switch bt := b.(type) { 396 | case *array.BinaryBuilder: 397 | f.appendFunc = func(data interface{}) error { 398 | appendBinaryData(bt, data, f.source) 399 | return nil 400 | } 401 | case *array.BinaryDictionaryBuilder: 402 | // has metadata for Avro enum symbols 403 | f.appendFunc = func(data interface{}) error { 404 | appendBinaryDictData(bt, data, f.source) 405 | return nil 406 | } 407 | // add Avro enum symbols to builder 408 | sb := array.NewStringBuilder(memory.DefaultAllocator) 409 | for _, v := range field.Metadata.Values() { 410 | sb.Append(v) 411 | } 412 | sa := sb.NewStringArray() 413 | bt.InsertStringDictValues(sa) 414 | case *array.BooleanBuilder: 415 | f.appendFunc = func(data interface{}) error { 416 | appendBoolData(bt, data, f.source) 417 | return nil 418 | } 419 | case *array.Date32Builder: 420 | f.appendFunc = func(data interface{}) error { 421 | appendDate32Data(bt, data, f.source) 422 | return nil 423 | } 424 | case *array.Decimal128Builder: 425 | f.appendFunc = func(data interface{}) error { 426 | err := appendDecimal128Data(bt, data, f.source) 427 | if err != nil { 428 | return err 429 | } 430 | return nil 431 | } 432 | case *array.Decimal256Builder: 433 | f.appendFunc = func(data interface{}) error { 434 | err := appendDecimal256Data(bt, data, f.source) 435 | if err != nil { 436 | return err 437 | } 438 | return nil 439 | } 440 | case *extensions.UUIDBuilder: 441 | f.appendFunc = func(data interface{}) error { 442 | switch dt := data.(type) { 443 | case nil: 444 | bt.AppendNull() 445 | case string: 446 | err := bt.AppendValueFromString(dt) 447 | if err != nil { 448 | return err 449 | } 450 | case []byte: 451 | err := bt.AppendValueFromString(string(dt)) 452 | if err != nil { 453 | return err 454 | } 455 | } 456 | return nil 457 | } 458 | case *array.FixedSizeBinaryBuilder: 459 | f.appendFunc = func(data interface{}) error { 460 | appendFixedSizeBinaryData(bt, data, f.source) 461 | return 
nil 462 | } 463 | case *array.Float32Builder: 464 | f.appendFunc = func(data interface{}) error { 465 | appendFloat32Data(bt, data, f.source) 466 | return nil 467 | } 468 | case *array.Float64Builder: 469 | f.appendFunc = func(data interface{}) error { 470 | appendFloat64Data(bt, data, f.source) 471 | return nil 472 | } 473 | case *array.Int32Builder: 474 | f.appendFunc = func(data interface{}) error { 475 | appendInt32Data(bt, data, f.source) 476 | return nil 477 | } 478 | case *array.Int64Builder: 479 | f.appendFunc = func(data interface{}) error { 480 | appendInt64Data(bt, data, f.source) 481 | return nil 482 | } 483 | case *array.LargeListBuilder: 484 | vb := bt.ValueBuilder() 485 | f.isList = true 486 | mapFieldBuilders(vb, field.Type.(*arrow.LargeListType).ElemField(), f) 487 | f.appendFunc = func(data interface{}) error { 488 | switch dt := data.(type) { 489 | case nil: 490 | bt.AppendNull() 491 | case []interface{}: 492 | if len(dt) == 0 { 493 | bt.AppendEmptyValue() 494 | } else { 495 | bt.Append(true) 496 | } 497 | default: 498 | bt.Append(true) 499 | } 500 | return nil 501 | } 502 | case *array.ListBuilder: 503 | vb := bt.ValueBuilder() 504 | f.isList = true 505 | mapFieldBuilders(vb, field.Type.(*arrow.ListType).ElemField(), f) 506 | f.appendFunc = func(data interface{}) error { 507 | switch dt := data.(type) { 508 | case nil: 509 | bt.AppendNull() 510 | case []interface{}: 511 | if len(dt) == 0 { 512 | bt.AppendEmptyValue() 513 | } else { 514 | bt.Append(true) 515 | } 516 | default: 517 | bt.Append(true) 518 | } 519 | return nil 520 | } 521 | case *array.MapBuilder: 522 | // has metadata for objects in values 523 | f.isMap = true 524 | kb := bt.KeyBuilder() 525 | ib := bt.ItemBuilder() 526 | mapFieldBuilders(kb, field.Type.(*arrow.MapType).KeyField(), f) 527 | mapFieldBuilders(ib, field.Type.(*arrow.MapType).ItemField(), f) 528 | f.appendFunc = func(data interface{}) error { 529 | switch data.(type) { 530 | case nil: 531 | bt.AppendNull() 532 | 
default: 533 | bt.Append(true) 534 | } 535 | return nil 536 | } 537 | case *array.MonthDayNanoIntervalBuilder: 538 | f.appendFunc = func(data interface{}) error { 539 | appendDurationData(bt, data, f.source) 540 | return nil 541 | } 542 | case *array.StringBuilder: 543 | f.appendFunc = func(data interface{}) error { 544 | appendStringData(bt, data, f.source) 545 | return nil 546 | } 547 | case *array.StructBuilder: 548 | // has metadata for Avro Union named types 549 | f.typeName, _ = field.Metadata.GetValue("typeName") 550 | f.isStruct = true 551 | // create children 552 | for i, p := range field.Type.(*arrow.StructType).Fields() { 553 | mapFieldBuilders(bt.FieldBuilder(i), p, f) 554 | } 555 | f.appendFunc = func(data interface{}) error { 556 | switch data.(type) { 557 | case nil: 558 | bt.AppendNull() 559 | return ErrNullStructData 560 | default: 561 | bt.Append(true) 562 | } 563 | return nil 564 | } 565 | case *array.Time32Builder: 566 | f.appendFunc = func(data interface{}) error { 567 | appendTime32Data(bt, data, f.source) 568 | return nil 569 | } 570 | case *array.Time64Builder: 571 | f.appendFunc = func(data interface{}) error { 572 | appendTime64Data(bt, data, f.source) 573 | return nil 574 | } 575 | case *array.TimestampBuilder: 576 | f.appendFunc = func(data interface{}) error { 577 | appendTimestampData(bt, data, f.source) 578 | return nil 579 | } 580 | } 581 | } 582 | 583 | func appendBinaryData(b *array.BinaryBuilder, data any, source DataSource) { 584 | switch dt := data.(type) { 585 | case nil: 586 | b.AppendNull() 587 | case []byte: 588 | b.Append(dt) 589 | case map[string]any: 590 | if source == DataSourceAvro { 591 | switch ct := dt["bytes"].(type) { 592 | case nil: 593 | b.AppendNull() 594 | default: 595 | b.Append(ct.([]byte)) 596 | } 597 | } 598 | default: 599 | b.Append(fmt.Append([]byte{}, data)) 600 | } 601 | } 602 | 603 | func appendBinaryDictData(b *array.BinaryDictionaryBuilder, data any, source DataSource) { 604 | switch dt := 
data.(type) { 605 | case nil: 606 | b.AppendNull() 607 | case string: 608 | b.AppendString(dt) 609 | case map[string]any: 610 | if source == DataSourceAvro { 611 | switch v := dt["string"].(type) { 612 | case nil: 613 | b.AppendNull() 614 | case string: 615 | b.AppendString(v) 616 | } 617 | } 618 | } 619 | } 620 | 621 | func appendBoolData(b *array.BooleanBuilder, data any, source DataSource) { 622 | switch dt := data.(type) { 623 | case nil: 624 | b.AppendNull() 625 | case bool: 626 | b.Append(dt) 627 | case map[string]any: 628 | if source == DataSourceAvro { 629 | switch v := dt["boolean"].(type) { 630 | case nil: 631 | b.AppendNull() 632 | case bool: 633 | b.Append(v) 634 | } 635 | } 636 | } 637 | } 638 | 639 | func appendDate32Data(b *array.Date32Builder, data any, source DataSource) { 640 | switch dt := data.(type) { 641 | case nil: 642 | b.AppendNull() 643 | case json.Number: 644 | // TO-DO 645 | case string: 646 | date, _ := time.Parse(time.DateOnly, dt) 647 | b.Append(arrow.Date32FromTime(date)) 648 | case time.Time: 649 | b.Append(arrow.Date32FromTime(dt)) 650 | case int32: 651 | b.Append(arrow.Date32(dt)) 652 | case map[string]any: 653 | if source == DataSourceAvro { 654 | switch v := dt["int"].(type) { 655 | case nil: 656 | b.AppendNull() 657 | case int32: 658 | b.Append(arrow.Date32(v)) 659 | } 660 | } 661 | } 662 | } 663 | 664 | func appendDecimal128Data(b *array.Decimal128Builder, data any, source DataSource) error { 665 | switch dt := data.(type) { 666 | case nil: 667 | b.AppendNull() 668 | case []byte: 669 | // TO-DO 670 | if source == DataSourceAvro { 671 | buf := bytes.NewBuffer(dt) 672 | if len(dt) <= 38 { 673 | var intData int64 674 | err := binary.Read(buf, binary.BigEndian, &intData) 675 | if err != nil { 676 | return err 677 | } 678 | b.Append(decimal128.FromI64(intData)) 679 | } else { 680 | var bigIntData big.Int 681 | b.Append(decimal128.FromBigInt(bigIntData.SetBytes(buf.Bytes()))) 682 | } 683 | } 684 | case map[string]any: 685 | if 
source == DataSourceAvro { 686 | buf := bytes.NewBuffer(dt["bytes"].([]byte)) 687 | if len(dt["bytes"].([]byte)) <= 38 { 688 | var intData int64 689 | err := binary.Read(buf, binary.BigEndian, &intData) 690 | if err != nil { 691 | return err 692 | } 693 | b.Append(decimal128.FromI64(intData)) 694 | } else { 695 | var bigIntData big.Int 696 | b.Append(decimal128.FromBigInt(bigIntData.SetBytes(buf.Bytes()))) 697 | } 698 | } 699 | } 700 | return nil 701 | } 702 | 703 | func appendDecimal256Data(b *array.Decimal256Builder, data any, source DataSource) error { 704 | switch dt := data.(type) { 705 | case nil: 706 | b.AppendNull() 707 | case []byte: 708 | // TO-DO 709 | if source == DataSourceAvro { 710 | var bigIntData big.Int 711 | buf := bytes.NewBuffer(dt) 712 | b.Append(decimal256.FromBigInt(bigIntData.SetBytes(buf.Bytes()))) 713 | } 714 | case map[string]any: 715 | if source == DataSourceAvro { 716 | var bigIntData big.Int 717 | buf := bytes.NewBuffer(dt["bytes"].([]byte)) 718 | b.Append(decimal256.FromBigInt(bigIntData.SetBytes(buf.Bytes()))) 719 | } 720 | } 721 | return nil 722 | } 723 | 724 | // Avro duration logical type annotates Avro fixed type of size 12, which stores three little-endian 725 | // unsigned integers that represent durations at different granularities of time. The first stores 726 | // a number in months, the second stores a number in days, and the third stores a number in milliseconds. 727 | // 728 | // https://pkg.go.dev/time#Duration 729 | // Go time.Duration int64 730 | // A Duration represents the elapsed time between two instants as an int64 nanosecond count. 731 | // The representation limits the largest representable duration to approximately 290 years. 
732 | func appendDurationData(b *array.MonthDayNanoIntervalBuilder, data any, source DataSource) { 733 | switch dt := data.(type) { 734 | case nil: 735 | b.AppendNull() 736 | case []byte: 737 | // TO-DO 738 | if source == DataSourceAvro { 739 | dur := new(arrow.MonthDayNanoInterval) 740 | dur.Months = int32(binary.LittleEndian.Uint16(dt[:3])) 741 | dur.Days = int32(binary.LittleEndian.Uint16(dt[4:7])) 742 | dur.Nanoseconds = int64(binary.LittleEndian.Uint32(dt[8:]) * 1000000) 743 | b.Append(*dur) 744 | } 745 | case map[string]any: 746 | if source == DataSourceAvro { 747 | switch dtb := dt["bytes"].(type) { 748 | case nil: 749 | b.AppendNull() 750 | case []byte: 751 | dur := new(arrow.MonthDayNanoInterval) 752 | dur.Months = int32(binary.LittleEndian.Uint16(dtb[:3])) 753 | dur.Days = int32(binary.LittleEndian.Uint16(dtb[4:7])) 754 | dur.Nanoseconds = int64(binary.LittleEndian.Uint32(dtb[8:]) * 1000000) 755 | b.Append(*dur) 756 | } 757 | } 758 | } 759 | } 760 | 761 | func appendFixedSizeBinaryData(b *array.FixedSizeBinaryBuilder, data any, source DataSource) { 762 | switch dt := data.(type) { 763 | case nil: 764 | b.AppendNull() 765 | case []byte: 766 | b.Append(dt) 767 | case map[string]any: 768 | if source == DataSourceAvro { 769 | switch v := dt["bytes"].(type) { 770 | case nil: 771 | b.AppendNull() 772 | case []byte: 773 | b.Append(v) 774 | } 775 | } 776 | } 777 | } 778 | 779 | func appendFloat32Data(b *array.Float32Builder, data any, source DataSource) { 780 | switch dt := data.(type) { 781 | case nil: 782 | b.AppendNull() 783 | case float32: 784 | b.Append(dt) 785 | case json.Number: 786 | f, _ := dt.Float64() 787 | b.Append(float32(f)) 788 | case string: 789 | i, _ := strconv.ParseFloat(dt, 32) 790 | b.Append(float32(i)) 791 | case map[string]any: 792 | if source == DataSourceAvro { 793 | switch v := dt["float"].(type) { 794 | case nil: 795 | b.AppendNull() 796 | case float32: 797 | b.Append(v) 798 | } 799 | } 800 | } 801 | } 802 | 803 | func 
appendFloat64Data(b *array.Float64Builder, data any, source DataSource) { 804 | switch dt := data.(type) { 805 | case nil: 806 | b.AppendNull() 807 | case float64: 808 | b.Append(dt) 809 | case json.Number: 810 | f, _ := dt.Float64() 811 | b.Append(f) 812 | case string: 813 | i, _ := strconv.ParseFloat(dt, 64) 814 | b.Append(i) 815 | case map[string]any: 816 | if source == DataSourceAvro { 817 | switch v := dt["double"].(type) { 818 | case nil: 819 | b.AppendNull() 820 | case float64: 821 | b.Append(v) 822 | } 823 | } 824 | } 825 | } 826 | 827 | func appendInt8Data(b *array.Int8Builder, data any, source DataSource) { 828 | switch dt := data.(type) { 829 | case nil: 830 | b.AppendNull() 831 | case int: 832 | b.Append(int8(dt)) 833 | case int8: 834 | b.Append(dt) 835 | case json.Number: 836 | i, _ := dt.Int64() 837 | b.Append(int8(i)) 838 | case string: 839 | i, _ := strconv.ParseInt(dt, 10, 8) 840 | b.Append(int8(i)) 841 | case map[string]any: 842 | 843 | } 844 | } 845 | 846 | func appendInt16Data(b *array.Int16Builder, data any, source DataSource) { 847 | switch dt := data.(type) { 848 | case nil: 849 | b.AppendNull() 850 | case int: 851 | b.Append(int16(dt)) 852 | case int16: 853 | b.Append(dt) 854 | case json.Number: 855 | i, _ := dt.Int64() 856 | b.Append(int16(i)) 857 | case string: 858 | i, _ := strconv.ParseInt(dt, 10, 16) 859 | b.Append(int16(i)) 860 | case map[string]any: 861 | 862 | } 863 | } 864 | 865 | func appendInt32Data(b *array.Int32Builder, data any, source DataSource) { 866 | switch dt := data.(type) { 867 | case nil: 868 | b.AppendNull() 869 | case int: 870 | b.Append(int32(dt)) 871 | case int32: 872 | b.Append(dt) 873 | case json.Number: 874 | i, _ := dt.Int64() 875 | b.Append(int32(i)) 876 | case string: 877 | i, _ := strconv.ParseInt(dt, 10, 32) 878 | b.Append(int32(i)) 879 | case map[string]any: 880 | 881 | } 882 | } 883 | 884 | func appendInt64Data(b *array.Int64Builder, data any, source DataSource) { 885 | switch dt := data.(type) { 886 | 
case nil: 887 | b.AppendNull() 888 | case int: 889 | b.Append(int64(dt)) 890 | case int64: 891 | b.Append(dt) 892 | case string: 893 | i, _ := strconv.ParseInt(dt, 10, 64) 894 | b.Append(i) 895 | case json.Number: 896 | i, _ := dt.Int64() 897 | b.Append(i) 898 | case map[string]any: 899 | if source == DataSourceAvro { 900 | switch v := dt["long"].(type) { 901 | case nil: 902 | b.AppendNull() 903 | case int: 904 | b.Append(int64(v)) 905 | case int64: 906 | b.Append(v) 907 | } 908 | } 909 | } 910 | } 911 | 912 | func appendStringData(b *array.StringBuilder, data any, source DataSource) { 913 | switch dt := data.(type) { 914 | case nil: 915 | b.AppendNull() 916 | case string: 917 | b.Append(dt) 918 | case map[string]any: 919 | if source == DataSourceAvro { 920 | switch v := dt["string"].(type) { 921 | case nil: 922 | b.AppendNull() 923 | case string: 924 | b.Append(v) 925 | } 926 | } 927 | default: 928 | b.Append(fmt.Sprint(data)) 929 | } 930 | } 931 | 932 | func appendTime32Data(b *array.Time32Builder, data any, source DataSource) { 933 | switch dt := data.(type) { 934 | case nil: 935 | b.AppendNull() 936 | case string: 937 | t, _ := arrow.Time32FromString(dt, arrow.Microsecond) 938 | b.Append(t) 939 | case int32: 940 | b.Append(arrow.Time32(dt)) 941 | case map[string]any: 942 | if source == DataSourceAvro { 943 | switch v := dt["int"].(type) { 944 | case nil: 945 | b.AppendNull() 946 | case int32: 947 | b.Append(arrow.Time32(v)) 948 | } 949 | } 950 | } 951 | } 952 | 953 | func appendTime64Data(b *array.Time64Builder, data any, source DataSource) { 954 | switch dt := data.(type) { 955 | case nil: 956 | b.AppendNull() 957 | case string: 958 | t, _ := arrow.Time64FromString(dt, arrow.Microsecond) 959 | b.Append(t) 960 | case int64: 961 | b.Append(arrow.Time64(dt)) 962 | case map[string]any: 963 | if source == DataSourceAvro { 964 | switch v := dt["long"].(type) { 965 | case nil: 966 | b.AppendNull() 967 | case int64: 968 | b.Append(arrow.Time64(v)) 969 | } 970 | } 971 
| } 972 | } 973 | 974 | func appendTimestampData(b *array.TimestampBuilder, data any, source DataSource) { 975 | switch dt := data.(type) { 976 | case nil: 977 | b.AppendNull() 978 | case json.Number: 979 | epochSeconds, _ := dt.Int64() 980 | t, _ := arrow.TimestampFromTime(time.Unix(epochSeconds, 0), arrow.Microsecond) 981 | b.Append(t) 982 | case string: 983 | t, _ := arrow.TimestampFromString(dt, arrow.Microsecond) 984 | b.Append(t) 985 | case time.Time: 986 | t, _ := arrow.TimestampFromTime(dt, arrow.Microsecond) 987 | b.Append(t) 988 | case int64: 989 | b.Append(arrow.Timestamp(dt)) 990 | case map[string]any: 991 | switch v := dt["long"].(type) { 992 | case nil: 993 | b.AppendNull() 994 | case int64: 995 | b.Append(arrow.Timestamp(v)) 996 | } 997 | } 998 | } 999 | -------------------------------------------------------------------------------- /reader/option.go: -------------------------------------------------------------------------------- 1 | package reader 2 | 3 | import ( 4 | "bufio" 5 | "context" 6 | "io" 7 | 8 | "github.com/apache/arrow-go/v18/arrow/memory" 9 | ) 10 | 11 | // WithAllocator specifies the Arrow memory allocator used while building records. 12 | func WithAllocator(mem memory.Allocator) Option { 13 | return func(cfg config) { 14 | cfg.mem = mem 15 | } 16 | } 17 | 18 | // WithJSONDecoder specifies whether to use goccy/json-go as the Bodkin Reader's decoder. 19 | // The default is the Bodkin DataLoader, a linked list of builders which reduces recursive lookups 20 | // in maps when loading data. 21 | func WithJSONDecoder() Option { 22 | return func(cfg config) { 23 | cfg.jsonDecode = true 24 | } 25 | } 26 | 27 | // WithChunk specifies the chunk size used while reading data to Arrow records. 28 | // 29 | // If n is zero or 1, no chunking will take place and the reader will create 30 | // one record per row. 31 | // If n is greater than 1, chunks of n rows will be read. 
func WithChunk(n int) Option {
	return func(cfg config) {
		cfg.chunk = n
	}
}

// WithContext specifies the context used while reading data to Arrow records.
// Calling reader.Cancel() will cancel the context and stop reading data.
func WithContext(ctx context.Context) Option {
	return func(cfg config) {
		// Derive a cancellable child context so the reader owns its cancel func.
		cfg.readerCtx, cfg.readCancel = context.WithCancel(ctx)
	}
}

// WithIOReader provides an io.Reader to Bodkin Reader, along with a delimiter
// to use to split datum in the data stream. Default delimiter '\n' if delimiter
// is not provided.
func WithIOReader(r io.Reader, delim byte) Option {
	return func(cfg config) {
		cfg.rr = r
		// 16 MiB buffered reader over the raw input.
		cfg.br = bufio.NewReaderSize(cfg.rr, 1024*1024*16)
		// cfg.delim already holds DefaultDelimiter; only override on change.
		if delim != DefaultDelimiter {
			cfg.delim = delim
		}
	}
}

// WithInputBufferSize specifies the Bodkin Reader's input buffer size.
func WithInputBufferSize(n int) Option {
	return func(cfg config) {
		cfg.inputBufferSize = n
	}
}

// WithRecordBufferSize specifies the Bodkin Reader's record buffer size.
func WithRecordBufferSize(n int) Option {
	return func(cfg config) {
		cfg.recordBufferSize = n
	}
}
-------------------------------------------------------------------------------- /reader/reader.go: --------------------------------------------------------------------------------
// Package reader contains helpers for reading data and loading to Arrow.
2 | package reader 3 | 4 | import ( 5 | "bufio" 6 | "bytes" 7 | "context" 8 | "errors" 9 | "fmt" 10 | "io" 11 | "sync" 12 | "sync/atomic" 13 | 14 | "github.com/apache/arrow-go/v18/arrow" 15 | "github.com/apache/arrow-go/v18/arrow/array" 16 | "github.com/apache/arrow-go/v18/arrow/memory" 17 | json "github.com/goccy/go-json" 18 | ) 19 | 20 | type DataSource int 21 | 22 | const ( 23 | DataSourceGo DataSource = iota 24 | DataSourceJSON 25 | DataSourceAvro 26 | ) 27 | const ( 28 | Manual int = iota 29 | Scanner 30 | ) 31 | const DefaultDelimiter byte = byte('\n') 32 | 33 | // Option configures an Avro reader/writer. 34 | type ( 35 | Option func(config) 36 | config *DataReader 37 | ) 38 | 39 | type DataReader struct { 40 | rr io.Reader 41 | br *bufio.Reader 42 | delim byte 43 | refs int64 44 | source DataSource 45 | schema *arrow.Schema 46 | bld *array.RecordBuilder 47 | mem memory.Allocator 48 | opts []Option 49 | bldMap *fieldPos 50 | ldr *dataLoader 51 | cur arrow.Record 52 | curBatch []arrow.Record 53 | readerCtx context.Context 54 | readCancel func() 55 | err error 56 | anyChan chan any 57 | recChan chan arrow.Record 58 | recReq chan struct{} 59 | bldDone chan struct{} 60 | inputLock atomic.Int32 61 | factoryLock atomic.Int32 62 | wg sync.WaitGroup 63 | jsonDecode bool 64 | chunk int 65 | inputCount int 66 | inputBufferSize int 67 | recordBufferSize int 68 | } 69 | 70 | func NewReader(schema *arrow.Schema, source DataSource, opts ...Option) (*DataReader, error) { 71 | switch source { 72 | case DataSourceGo, DataSourceJSON, DataSourceAvro: 73 | break 74 | default: 75 | source = DataSourceGo 76 | } 77 | r := &DataReader{ 78 | source: source, 79 | schema: schema, 80 | mem: memory.DefaultAllocator, 81 | inputBufferSize: 1024 * 64, 82 | recordBufferSize: 1024 * 64, 83 | chunk: 0, 84 | delim: DefaultDelimiter, 85 | opts: opts, 86 | } 87 | for _, opt := range opts { 88 | opt(r) 89 | } 90 | 91 | r.anyChan = make(chan any, r.inputBufferSize) 92 | r.recChan = make(chan 
arrow.Record, r.recordBufferSize) 93 | r.bldDone = make(chan struct{}) 94 | r.recReq = make(chan struct{}, 100) 95 | if r.readerCtx == nil { 96 | r.readerCtx, r.readCancel = context.WithCancel(context.Background()) 97 | } 98 | if r.rr != nil { 99 | r.wg.Add(1) 100 | go r.decode2Chan() 101 | } 102 | r.bld = array.NewRecordBuilder(memory.DefaultAllocator, schema) 103 | r.bldMap = newFieldPos() 104 | r.bldMap.isStruct = true 105 | r.source = source 106 | r.ldr = newDataLoader() 107 | for idx, fb := range r.bld.Fields() { 108 | mapFieldBuilders(fb, schema.Field(idx), r.bldMap) 109 | } 110 | r.ldr.drawTree(r.bldMap) 111 | r.wg.Add(1) 112 | go r.recordFactory() 113 | 114 | return r, nil 115 | } 116 | 117 | // ReadToRecord decodes a datum directly to an arrow.Record. The record 118 | // should be released by the user when done with it. 119 | func (r *DataReader) ReadToRecord(a any) (arrow.Record, error) { 120 | var err error 121 | defer func() { 122 | if rc := recover(); rc != nil { 123 | fmt.Println(rc, err) 124 | } 125 | }() 126 | m, err := InputMap(a) 127 | if err != nil { 128 | r.err = errors.Join(r.err, err) 129 | } 130 | 131 | switch r.jsonDecode { 132 | case true: 133 | var v []byte 134 | v, err = json.Marshal(m) 135 | if err != nil { 136 | r.err = err 137 | return nil, err 138 | } 139 | d := json.NewDecoder(bytes.NewReader(v)) 140 | d.UseNumber() 141 | err = d.Decode(r.bld) 142 | if err != nil { 143 | return nil, err 144 | } 145 | default: 146 | err = r.ldr.loadDatum(m) 147 | if err != nil { 148 | return nil, err 149 | } 150 | } 151 | 152 | return r.bld.NewRecord(), nil 153 | } 154 | 155 | // NextBatch returns whether a []arrow.Record of a specified size can be received 156 | // from the converted record queue. Will still return true if the queue channel is closed and 157 | // last batch of records available < batch size specified. 158 | // The user should check Err() after a call to NextBatch that returned false to check 159 | // if an error took place. 
160 | func (r *DataReader) NextBatch(batchSize int) bool { 161 | if batchSize < 1 { 162 | batchSize = 1 163 | } 164 | if len(r.curBatch) != 0 { 165 | for _, rec := range r.curBatch { 166 | rec.Release() 167 | } 168 | r.curBatch = []arrow.Record{} 169 | } 170 | r.wg.Wait() 171 | 172 | for len(r.curBatch) <= batchSize { 173 | select { 174 | case rec, ok := <-r.recChan: 175 | if !ok && rec == nil { 176 | if len(r.curBatch) > 0 { 177 | goto jump 178 | } 179 | return false 180 | } 181 | if rec != nil { 182 | r.curBatch = append(r.curBatch, rec) 183 | } 184 | case <-r.bldDone: 185 | if len(r.recChan) > 0 { 186 | rec := <-r.recChan 187 | r.curBatch = append(r.curBatch, rec) 188 | } 189 | case <-r.readerCtx.Done(): 190 | return false 191 | } 192 | } 193 | 194 | jump: 195 | if r.err != nil { 196 | return false 197 | } 198 | 199 | return len(r.curBatch) > 0 200 | } 201 | 202 | // Next returns whether a Record can be received from the converted record queue. 203 | // The user should check Err() after a call to Next that returned false to check 204 | // if an error took place. 
func (r *DataReader) Next() bool {
	var ok bool
	// Release the record handed out on the previous call; it is only valid
	// until the next call to Next.
	if r.cur != nil {
		r.cur.Release()
		r.cur = nil
	}
	// Wait for the producer goroutines to have started.
	r.wg.Wait()
	// On-demand mode (chunk < 1): ask the factory to emit one record.
	if r.chunk < 1 {
		r.recReq <- struct{}{}
	}
	select {
	case r.cur, ok = <-r.recChan:
		if !ok && r.cur == nil {
			// Channel closed and drained.
			return false
		}
	case <-r.bldDone:
		// Factory finished; drain a record still queued, if any.
		if len(r.recChan) > 0 {
			r.cur = <-r.recChan
		}
	case <-r.readerCtx.Done():
		// Canceled: surface a final queued record if one is present.
		if len(r.recChan) > 0 {
			r.cur = <-r.recChan
			break
		}
		return false
	}
	if r.err != nil {
		return false
	}

	return r.cur != nil
}

// Mode reports whether the reader pulls from an io.Reader (Scanner) or is
// fed manually via Read (Manual).
func (r *DataReader) Mode() int {
	switch r.rr {
	case nil:
		return Manual
	default:
		return Scanner
	}
}

// Count returns the number of input datums consumed so far.
func (r *DataReader) Count() int { return r.inputCount }

// ResetCount zeroes the consumed-input counter.
func (r *DataReader) ResetCount() { r.inputCount = 0 }

// InputBufferSize returns the capacity of the input datum channel.
func (r *DataReader) InputBufferSize() int { return r.inputBufferSize }

// RecBufferSize returns the capacity of the Arrow record channel.
func (r *DataReader) RecBufferSize() int { return r.recordBufferSize }

// DataSource returns the reader's configured input format.
func (r *DataReader) DataSource() DataSource { return r.source }

// Opts returns the options the reader was created with.
func (r *DataReader) Opts() []Option { return r.opts }

// Record returns the current Arrow record.
// It is valid until the next call to Next.
func (r *DataReader) Record() arrow.Record { return r.cur }

// RecordBatch returns the current Arrow record batch.
// It is valid until the next call to NextBatch.
func (r *DataReader) RecordBatch() []arrow.Record { return r.curBatch }

// Schema returns the Arrow schema records are built against.
func (r *DataReader) Schema() *arrow.Schema { return r.schema }

// Err returns the last error encountered during the reading of data.
func (r *DataReader) Err() error { return r.err }

// Retain increases the reference count by 1.
// Retain may be called simultaneously from multiple goroutines.
268 | func (r *DataReader) Retain() { 269 | atomic.AddInt64(&r.refs, 1) 270 | } 271 | 272 | // Release decreases the reference count by 1. 273 | // When the reference count goes to zero, the memory is freed. 274 | // Release may be called simultaneously from multiple goroutines. 275 | func (r *DataReader) Release() { 276 | // debug.Assert(atomic.LoadInt64(&r.refs) > 0, "too many releases") 277 | 278 | if atomic.AddInt64(&r.refs, -1) == 0 { 279 | if r.cur != nil { 280 | r.cur.Release() 281 | } 282 | } 283 | } 284 | 285 | // Peek returns the length of the input data and Arrow Record queues. 286 | func (r *DataReader) Peek() (int, int) { 287 | return len(r.anyChan), len(r.recChan) 288 | } 289 | 290 | // Cancel cancels the Reader's io.Reader scan to Arrow. 291 | func (r *DataReader) Cancel() { 292 | r.readCancel() 293 | } 294 | 295 | // Read loads one datum. 296 | // If the Reader has an io.Reader, Read is a no-op. 297 | func (r *DataReader) Read(a any) error { 298 | if r.rr != nil { 299 | return nil 300 | } 301 | var err error 302 | defer func() error { 303 | if rc := recover(); rc != nil { 304 | r.err = errors.Join(r.err, fmt.Errorf("panic %v", rc)) 305 | } 306 | return r.err 307 | }() 308 | m, err := InputMap(a) 309 | if err != nil { 310 | r.err = errors.Join(r.err, err) 311 | return err 312 | } 313 | r.anyChan <- m 314 | r.inputCount++ 315 | return nil 316 | } 317 | 318 | // Reset resets a Reader to its initial state. 
319 | func (r *DataReader) Reset() { 320 | r.readCancel() 321 | r.anyChan = make(chan any, r.inputBufferSize) 322 | r.recChan = make(chan arrow.Record, r.recordBufferSize) 323 | r.bldDone = make(chan struct{}) 324 | r.inputCount = 0 325 | 326 | // DataReader has an io.Reader 327 | if r.rr != nil { 328 | r.br.Reset(r.rr) 329 | go r.decode2Chan() 330 | r.wg.Add(1) 331 | } 332 | go r.recordFactory() 333 | r.wg.Add(1) 334 | } 335 | -------------------------------------------------------------------------------- /reader/recordfactory.go: -------------------------------------------------------------------------------- 1 | package reader 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "io" 7 | ) 8 | 9 | func (r *DataReader) decode2Chan() { 10 | // 1 means running 11 | if r.inputLock.CompareAndSwap(0, 1) { 12 | defer r.inputLock.Store(0) 13 | } else { 14 | return 15 | } 16 | var err error 17 | defer func() { 18 | if rc := recover(); rc != nil { 19 | r.err = errors.Join(r.err, err, fmt.Errorf("panic %v", rc)) 20 | } 21 | }() 22 | defer close(r.anyChan) 23 | b := true 24 | for { 25 | datumBytes, err := r.br.ReadBytes(r.delim) 26 | if err != nil { 27 | if errors.Is(err, io.EOF) { 28 | r.err = nil 29 | return 30 | } 31 | r.err = err 32 | return 33 | } 34 | datum, err := InputMap(datumBytes[:len(datumBytes)-1]) 35 | if err != nil { 36 | r.err = errors.Join(r.err, err) 37 | continue 38 | } 39 | r.anyChan <- datum 40 | r.inputCount++ 41 | if b { 42 | r.wg.Done() // sync.WaitGroup to allow Next() to wait for records to be available 43 | b = false 44 | } 45 | select { 46 | case <-r.readerCtx.Done(): 47 | return 48 | default: 49 | } 50 | } 51 | } 52 | 53 | // recordFactory... 
the hits just keep on coming 54 | func (r *DataReader) recordFactory() { 55 | if r.factoryLock.CompareAndSwap(0, 1) { 56 | defer r.factoryLock.Store(0) 57 | } else { 58 | return 59 | } 60 | defer close(r.recChan) 61 | recChunk := 0 62 | 63 | r.wg.Done() // sync.WaitGroup to allow Next() to wait for records to be available 64 | 65 | switch { 66 | case r.chunk < 1: 67 | for data := range r.anyChan { 68 | err := r.ldr.loadDatum(data) 69 | if err != nil { 70 | r.err = err 71 | return 72 | } 73 | select { 74 | case <-r.readerCtx.Done(): 75 | r.bldDone <- struct{}{} 76 | return 77 | case <-r.recReq: 78 | r.recChan <- r.bld.NewRecord() 79 | default: 80 | } 81 | } 82 | r.recChan <- r.bld.NewRecord() 83 | r.bldDone <- struct{}{} 84 | case r.chunk >= 1: 85 | for data := range r.anyChan { 86 | if recChunk == 0 { 87 | r.bld.Reserve(r.chunk) 88 | } 89 | err := r.ldr.loadDatum(data) 90 | if err != nil { 91 | r.err = err 92 | return 93 | } 94 | recChunk++ 95 | if recChunk >= r.chunk { 96 | r.recChan <- r.bld.NewRecord() 97 | recChunk = 0 98 | } 99 | select { 100 | case <-r.readerCtx.Done(): 101 | r.recChan <- r.bld.NewRecord() 102 | r.bldDone <- struct{}{} 103 | return 104 | default: 105 | } 106 | } 107 | if recChunk != 0 { 108 | r.recChan <- r.bld.NewRecord() 109 | } 110 | r.bldDone <- struct{}{} 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /schema.go: -------------------------------------------------------------------------------- 1 | package bodkin 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "regexp" 7 | "slices" 8 | 9 | "github.com/apache/arrow-go/v18/arrow" 10 | "github.com/apache/arrow-go/v18/arrow/array" 11 | ) 12 | 13 | type fieldPos struct { 14 | root *fieldPos 15 | parent *fieldPos 16 | owner *Bodkin 17 | builder array.Builder 18 | name string 19 | path []string 20 | isList bool 21 | isItem bool 22 | isStruct bool 23 | isMap bool 24 | arrowType arrow.Type 25 | typeName string 26 | field arrow.Field 27 | children 
[]*fieldPos 28 | childmap map[string]*fieldPos 29 | appendFunc func(val interface{}) error 30 | metadatas arrow.Metadata 31 | index, depth int32 32 | err error 33 | } 34 | 35 | // Schema evaluation/evolution errors. 36 | var ( 37 | ErrUndefinedInput = errors.New("nil input") 38 | ErrInvalidInput = errors.New("invalid input") 39 | ErrNoLatestSchema = errors.New("no second input has been provided") 40 | ErrUndefinedFieldType = errors.New("could not determine type of unpopulated field") 41 | ErrUndefinedArrayElementType = errors.New("could not determine element type of empty array") 42 | ErrNotAnUpgradableType = errors.New("is not an upgradable type") 43 | ErrPathNotFound = errors.New("path not found") 44 | ErrFieldTypeChanged = errors.New("changed") 45 | ErrFieldAdded = errors.New("added") 46 | ) 47 | 48 | // UpgradableTypes are scalar types that can be upgraded to a more flexible type. 49 | var UpgradableTypes []arrow.Type = []arrow.Type{arrow.INT8, 50 | arrow.UINT8, 51 | arrow.INT16, 52 | arrow.UINT16, 53 | arrow.INT32, 54 | arrow.UINT64, 55 | arrow.INT64, 56 | arrow.FLOAT16, 57 | arrow.FLOAT32, 58 | arrow.FLOAT64, 59 | arrow.DATE32, 60 | arrow.TIME64, 61 | arrow.TIMESTAMP, 62 | } 63 | 64 | // Regular expressions and variables for type inference. 
65 | var ( 66 | timestampMatchers []*regexp.Regexp 67 | dateMatcher *regexp.Regexp 68 | timeMatcher *regexp.Regexp 69 | integerMatcher *regexp.Regexp 70 | floatMatcher *regexp.Regexp 71 | boolMatcher []string 72 | ) 73 | 74 | func init() { 75 | registerTsMatchers() 76 | registerQuotedStringValueMatchers() 77 | } 78 | 79 | func registerTsMatchers() { 80 | dateMatcher = regexp.MustCompile(`^\d{4}-\d{2}-\d{2}$`) 81 | timeMatcher = regexp.MustCompile(`^\d{1,2}:\d{1,2}:\d{1,2}(\.\d{1,6})?$`) 82 | timestampMatchers = append(timestampMatchers, 83 | regexp.MustCompile(`^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?(Z|[+-]\d{2}:\d{2})$`), // ISO 8601 84 | regexp.MustCompile(`^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(\.\d+)?(Z|[+-]\d{2}:\d{2})$`), // RFC 3339 with space instead of T 85 | regexp.MustCompile(`^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$`), // Datetime format with dashes 86 | regexp.MustCompile(`^\d{4}-\d{1,2}-\d{1,2}[T ]\d{1,2}:\d{1,2}:\d{1,2}(\.\d{1,6})? *(([+-]\d{1,2}(:\d{1,2})?)|Z|UTC)?$`)) 87 | } 88 | 89 | func registerQuotedStringValueMatchers() { 90 | integerMatcher = regexp.MustCompile(`^[-+]?\d+$`) 91 | floatMatcher = regexp.MustCompile(`^[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?$`) 92 | boolMatcher = append(boolMatcher, "true", "false") 93 | } 94 | 95 | func newFieldPos(b *Bodkin) *fieldPos { 96 | f := new(fieldPos) 97 | f.owner = b 98 | f.index = -1 99 | f.root = f 100 | f.childmap = make(map[string]*fieldPos) 101 | f.children = make([]*fieldPos, 0) 102 | return f 103 | } 104 | 105 | func (f *fieldPos) assignChild(child *fieldPos) { 106 | f.children = append(f.children, child) 107 | f.childmap[child.name] = child 108 | f.owner.knownFields.Set(child.dotPath(), child) 109 | f.owner.untypedFields.Delete(child.dotPath()) 110 | } 111 | 112 | func (f *fieldPos) child(index int) (*fieldPos, error) { 113 | if index < len(f.children) { 114 | return f.children[index], nil 115 | } 116 | return nil, fmt.Errorf("%v child index %d not found", f.namePath(), index) 117 | } 118 | 
// error returns the error recorded on this node, if any.
func (f *fieldPos) error() error { return f.err }

// metadata returns the Arrow metadata attached to this node's field.
func (f *fieldPos) metadata() arrow.Metadata { return f.field.Metadata }

// newChild creates a child node for childName but does NOT attach it;
// callers attach with assignChild once the child's field is known.
func (f *fieldPos) newChild(childName string) *fieldPos {
	var child fieldPos = fieldPos{
		root:   f.root,
		parent: f,
		owner:  f.owner,
		name:   childName,
		index:  int32(len(f.children)),
		depth:  f.depth + 1,
	}
	if f.isList {
		child.isItem = true
	}
	child.path = child.namePath()
	child.childmap = make(map[string]*fieldPos)
	// Type is unknown until a value is seen.
	child.arrowType = arrow.NULL
	return &child
}

// mapChildren rebuilds the name->child index from the children slice.
func (f *fieldPos) mapChildren() {
	for i, c := range f.children {
		f.childmap[c.name] = f.children[i]
	}
}

// getPath returns a field found at a defined path, otherwise returns ErrPathNotFound.
func (f *fieldPos) getPath(path []string) (*fieldPos, error) {
	if len(path) == 0 { // degenerate input
		return nil, fmt.Errorf("getPath needs at least one key")
	}
	if node, ok := f.childmap[path[0]]; !ok {
		return nil, ErrPathNotFound
	} else if len(path) == 1 { // we've reached the final key
		return node, nil
	} else { // 1+ more keys
		return node.getPath(path[1:])
	}
}

// namePath returns a slice of keys making up the path to the field.
// The path is computed by walking up to the root only when it has not been
// cached in f.path.
func (f *fieldPos) namePath() []string {
	if len(f.path) == 0 {
		var path []string
		cur := f
		for i := f.depth - 1; i >= 0; i-- {
			path = append([]string{cur.name}, path...)
			cur = cur.parent
		}
		return path
	}
	return f.path
}

// dotPath returns the path to the field in json dot notation, prefixed with
// "$" (e.g. "$a.b.c" for path [a b c]; bare "$" for the root).
func (f *fieldPos) dotPath() string {
	var path string = "$"
	for i, p := range f.path {
		path = path + p
		if i+1 != len(f.path) {
			path = path + "."
		}
	}
	return path
}

// getValue retrieves the value from the map[string]any
// by following the field's key path; returns nil if any key is absent or an
// intermediate value is not a map.
func (f *fieldPos) getValue(m map[string]any) any {
	var value any = m
	for _, key := range f.namePath() {
		valueMap, ok := value.(map[string]any)
		if !ok {
			return nil
		}
		value, ok = valueMap[key]
		if !ok {
			return nil
		}
	}
	return value
}

// graft grafts a new field into the schema tree: a copy of n is attached
// under f, registered with the owner, and f's (and if needed the grandparent
// list's) Arrow field is rebuilt to include it.
func (f *fieldPos) graft(n *fieldPos) {
	graft := f.newChild(n.name)
	graft.arrowType = n.arrowType
	graft.field = n.field
	graft.children = append(graft.children, n.children...)
	graft.mapChildren()
	f.assignChild(graft)
	// NOTE(review): assignChild already performs these two index updates —
	// the repetition looks redundant; confirm before removing.
	f.owner.knownFields.Set(graft.dotPath(), graft)
	f.owner.untypedFields.Delete(graft.dotPath())
	f.owner.changes = errors.Join(f.owner.changes, fmt.Errorf("%w %v : %v", ErrFieldAdded, graft.dotPath(), graft.field.Type.String()))
	if f.field.Type.ID() == arrow.STRUCT {
		// Rebuild this struct's Arrow type with the grafted field appended.
		gf := f.field.Type.(*arrow.StructType)
		var nf []arrow.Field
		nf = append(nf, gf.Fields()...)
		nf = append(nf, graft.field)
		f.field = arrow.Field{Name: f.name, Type: arrow.StructOf(nf...), Nullable: true}
		// A parent list of structs must be rewrapped around the new struct type.
		if (f.parent != nil) && f.parent.field.Type.ID() == arrow.LIST {
			f.parent.field = arrow.Field{Name: f.parent.name, Type: arrow.ListOf(f.field.Type.(*arrow.StructType)), Nullable: true}
		}
	}
}

// Only scalar types in UpgradableTypes[] can be upgraded:
// Supported type upgrades:
//
//	arrow.INT8, arrow.INT16, arrow.INT32, arrow.INT64 => arrow.FLOAT64
//	arrow.FLOAT16 => arrow.FLOAT32
//	arrow.FLOAT32 => arrow.FLOAT64
//	arrow.FLOAT64 => arrow.STRING
//	arrow.TIMESTAMP => arrow.STRING
//	arrow.DATE32 => arrow.TIMESTAMP
//	arrow.DATE32 => arrow.STRING
//	arrow.TIME64 => arrow.STRING
func (o *fieldPos) upgradeType(n *fieldPos, t arrow.Type) error {
	if !slices.Contains(UpgradableTypes, o.field.Type.ID()) {
		return fmt.Errorf("%s %v %v", n.dotPath(), n.field.Type.Name(), ErrNotAnUpgradableType.Error())
	}
	oldType := o.field.Type.String()
	// changes to field
	switch t {
	case arrow.FLOAT32:
		o.arrowType = arrow.FLOAT32
		o.field = arrow.Field{Name: o.name, Type: arrow.PrimitiveTypes.Float32, Nullable: true}
	case arrow.FLOAT64:
		o.arrowType = arrow.FLOAT64
		o.field = arrow.Field{Name: o.name, Type: arrow.PrimitiveTypes.Float64, Nullable: true}
	case arrow.STRING:
		o.arrowType = arrow.STRING
		o.field = arrow.Field{Name: o.name, Type: arrow.BinaryTypes.String, Nullable: true}
	case arrow.TIMESTAMP:
		o.arrowType = arrow.TIMESTAMP
		o.field = arrow.Field{Name: o.name, Type: arrow.FixedWidthTypes.Timestamp_ms, Nullable: true}
	}
	// changes to parent
	// NOTE(review): the LIST case rewraps around n.field.Type (the incoming
	// node) while the STRUCT case rebuilds from o.parent.children (which now
	// carry o's upgraded field) — confirm the asymmetry is intentional.
	switch o.parent.field.Type.ID() {
	case arrow.LIST:
		o.parent.field = arrow.Field{Name: o.parent.name, Type: arrow.ListOf(n.field.Type), Nullable: true}
	case arrow.STRUCT:
		var fields []arrow.Field
		for _, c := range o.parent.children {
			fields = append(fields, c.field)
		}
		o.parent.field = arrow.Field{Name: o.parent.name, Type: arrow.StructOf(fields...), Nullable: true}
	}
	o.owner.changes = errors.Join(o.owner.changes, fmt.Errorf("%w %v : from %v to %v", ErrFieldTypeChanged, o.dotPath(), oldType, o.field.Type.String()))
	return nil
}

// errWrap joins the errors recorded on f and, recursively, on all of its
// descendants into a single error (nil when the subtree is error-free).
func errWrap(f *fieldPos) error {
	var err error
	if f.err != nil {
		err = errors.Join(f.err)
	}
	if len(f.children) > 0 {
		for _, field := range f.children {
			err = errors.Join(err, errWrap(field))
		}
	}
	return err
}

// mapToArrow traverses a map[string]any and creates a fieldPos tree from
// which an Arrow schema can be generated. Fields whose type cannot be
// determined (nil values, empty maps/arrays) are parked in untypedFields.
func mapToArrow(f *fieldPos, m map[string]any) {
	for k, v := range m {
		child := f.newChild(k)
		switch t := v.(type) {
		case map[string]any:
			mapToArrow(child, t)
			var fields []arrow.Field
			for _, c := range child.children {
				fields = append(fields, c.field)
			}
			if len(child.children) != 0 {
				child.field = buildArrowField(k, arrow.StructOf(fields...), arrow.Metadata{}, true)
				f.assignChild(child)
			} else {
				// Empty map: struct shape known, member types are not.
				child.arrowType = arrow.STRUCT
				child.isStruct = true
				f.owner.untypedFields.Set(child.dotPath(), child)
			}
		case []any:
			if len(t) <= 0 {
				// Empty array: element type cannot be inferred yet.
				child.arrowType = arrow.LIST
				child.isList = true
				f.owner.untypedFields.Set(child.dotPath(), child)
				f.err = errors.Join(f.err, fmt.Errorf("%v : %v", ErrUndefinedArrayElementType, child.namePath()))
			} else {
				et := sliceElemType(child, t)
				child.isList = true
				child.field = buildArrowField(k, arrow.ListOf(et), arrow.Metadata{}, true)
				f.assignChild(child)
			}
		case nil:
			child.arrowType = arrow.NULL
			f.owner.untypedFields.Set(child.dotPath(), child)
			f.err = errors.Join(f.err, fmt.Errorf("%v : %v", ErrUndefinedFieldType, child.namePath()))
		default:
			child.field = buildArrowField(k, goType2Arrow(child, v), arrow.Metadata{}, true)
			f.assignChild(child)
		}
	}
	// This node itself is a struct of whatever children were typed.
	var fields []arrow.Field
	for _, c := range f.children {
		fields = append(fields, c.field)
	}
	f.arrowType = arrow.STRUCT
	f.field = arrow.Field{Name: f.name, Type: arrow.StructOf(fields...), Nullable: true}
}

// sliceElemType evaluates the slice type and returns an Arrow DataType
// to be used in building an Arrow Field. Only the first element is inspected.
func sliceElemType(f *fieldPos, v []any) arrow.DataType {
	switch ft := v[0].(type) {
	case map[string]any:
		child := f.newChild(f.name + ".elem")
		mapToArrow(child, ft)
		var fields []arrow.Field
		for _, c := range child.children {
			fields = append(fields, c.field)
		}
		f.assignChild(child)
		return arrow.StructOf(fields...)
	case []any:
		if len(ft) < 1 {
			// NOTE(review): GetExtensionType presumably yields nil for the
			// unregistered name "skip" — confirm callers handle that.
			f.err = errors.Join(f.err, fmt.Errorf("%v : %v", ErrUndefinedArrayElementType, f.namePath()))
			return arrow.GetExtensionType("skip")
		}
		child := f.newChild(f.name + ".elem")
		et := sliceElemType(child, v[0].([]any))
		f.assignChild(child)
		return arrow.ListOf(et)
	default:
		return goType2Arrow(f, v)
	}
	// Unreachable: every case above returns.
	return nil
}

// buildArrowField assembles an arrow.Field from its parts.
func buildArrowField(n string, t arrow.DataType, m arrow.Metadata, nullable bool) arrow.Field {
	return arrow.Field{
		Name:     n,
		Type:     t,
		Metadata: m,
		Nullable: nullable,
	}
}

// buildTypeMetadata builds arrow.Metadata from parallel key/value slices.
func buildTypeMetadata(k, v []string) arrow.Metadata {
	return arrow.NewMetadata(k, v)
}
--------------------------------------------------------------------------------
/types.go:
--------------------------------------------------------------------------------
package bodkin

import (
	"encoding/json"
	"fmt"
	"slices"
	"time"

	"github.com/apache/arrow-go/v18/arrow"
)

// goType2Arrow maps a Go type to an Arrow DataType.
// goType2Arrow maps the concrete Go type of gt to an Arrow DataType, also
// recording the chosen arrow.Type on f.arrowType. Strings may be further
// inferred as temporal or numeric types depending on the owner's
// inferTimeUnits / quotedValuesAreStrings settings. Unsupported types fall
// back to Binary and record ErrUndefinedFieldType on f.err.
func goType2Arrow(f *fieldPos, gt any) arrow.DataType {
	var dt arrow.DataType
	switch t := gt.(type) {
	case []any:
		// Infer from the first element only.
		return goType2Arrow(f, t[0])
	case json.Number:
		// Prefer Int64 when the number parses as an integer.
		if _, err := t.Int64(); err == nil {
			f.arrowType = arrow.INT64
			dt = arrow.PrimitiveTypes.Int64
		} else {
			f.arrowType = arrow.FLOAT64
			dt = arrow.PrimitiveTypes.Float64
		}
	case time.Time:
		f.arrowType = arrow.TIMESTAMP
		dt = arrow.FixedWidthTypes.Timestamp_us
	// either 32 or 64 bits
	case int:
		f.arrowType = arrow.INT64
		dt = arrow.PrimitiveTypes.Int64
	// the set of all signed 8-bit integers (-128 to 127)
	case int8:
		f.arrowType = arrow.INT8
		dt = arrow.PrimitiveTypes.Int8
	// the set of all signed 16-bit integers (-32768 to 32767)
	case int16:
		f.arrowType = arrow.INT16
		dt = arrow.PrimitiveTypes.Int16
	// the set of all signed 32-bit integers (-2147483648 to 2147483647)
	case int32:
		f.arrowType = arrow.INT32
		dt = arrow.PrimitiveTypes.Int32
	// the set of all signed 64-bit integers (-9223372036854775808 to 9223372036854775807)
	case int64:
		f.arrowType = arrow.INT64
		dt = arrow.PrimitiveTypes.Int64
	// either 32 or 64 bits
	case uint:
		f.arrowType = arrow.UINT64
		dt = arrow.PrimitiveTypes.Uint64
	// the set of all unsigned 8-bit integers (0 to 255)
	case uint8:
		f.arrowType = arrow.UINT8
		dt = arrow.PrimitiveTypes.Uint8
	// the set of all unsigned 16-bit integers (0 to 65535)
	case uint16:
		f.arrowType = arrow.UINT16
		dt = arrow.PrimitiveTypes.Uint16
	// the set of all unsigned 32-bit integers (0 to 4294967295)
	case uint32:
		f.arrowType = arrow.UINT32
		dt = arrow.PrimitiveTypes.Uint32
	// the set of all unsigned 64-bit integers (0 to 18446744073709551615)
	case uint64:
		f.arrowType = arrow.UINT64
		dt = arrow.PrimitiveTypes.Uint64
	// the set of all IEEE-754 32-bit floating-point numbers
	case float32:
		f.arrowType = arrow.FLOAT32
		dt = arrow.PrimitiveTypes.Float32
	// the set of all IEEE-754 64-bit floating-point numbers
	case float64:
		f.arrowType = arrow.FLOAT64
		dt = arrow.PrimitiveTypes.Float64
	case bool:
		f.arrowType = arrow.BOOL
		dt = arrow.FixedWidthTypes.Boolean
	case string:
		// Optionally infer temporal types from string shape.
		if f.owner.inferTimeUnits {
			for _, r := range timestampMatchers {
				if r.MatchString(t) {
					f.arrowType = arrow.TIMESTAMP
					return arrow.FixedWidthTypes.Timestamp_us
				}
			}
			if dateMatcher.MatchString(t) {
				f.arrowType = arrow.DATE32
				return arrow.FixedWidthTypes.Date32
			}
			if timeMatcher.MatchString(t) {
				f.arrowType = arrow.TIME64
				return arrow.FixedWidthTypes.Time64ns
			}
		}
		// Optionally infer bool/int/float from quoted values.
		if !f.owner.quotedValuesAreStrings {
			if slices.Contains(boolMatcher, t) {
				f.arrowType = arrow.BOOL
				return arrow.FixedWidthTypes.Boolean
			}
			if integerMatcher.MatchString(t) {
				f.arrowType = arrow.INT64
				return arrow.PrimitiveTypes.Int64
			}
			if floatMatcher.MatchString(t) {
				f.arrowType = arrow.FLOAT64
				return arrow.PrimitiveTypes.Float64
			}
		}
		f.arrowType = arrow.STRING
		dt = arrow.BinaryTypes.String
	case []byte:
		f.arrowType = arrow.BINARY
		dt = arrow.BinaryTypes.Binary
	// the set of all complex numbers with float32 real and imaginary parts
	case complex64:
		// TO-DO
		f.arrowType = arrow.NULL
		f.err = fmt.Errorf("%v : %v", ErrUndefinedFieldType, f.namePath())
		dt = arrow.BinaryTypes.Binary
	// the set of all complex numbers with float64 real and imaginary parts
	case complex128:
		// TO-DO
		f.arrowType = arrow.NULL
		f.err = fmt.Errorf("%v : %v", ErrUndefinedFieldType, f.namePath())
		dt = arrow.BinaryTypes.Binary
	case nil:
		f.arrowType = arrow.NULL
		f.err = fmt.Errorf("%v : %v", ErrUndefinedFieldType, f.namePath())
		dt = arrow.BinaryTypes.Binary
	default:
		// Catch-all for exotic unsupported types - ie. input field is a func
		f.arrowType = arrow.NULL
		f.err = fmt.Errorf("%v : %v", ErrUndefinedFieldType, f.namePath())
		dt = arrow.BinaryTypes.Binary
	}
	return dt
}

// arrowTypeID2Type maps an arrow.Type ID back to a concrete Arrow DataType,
// consulting f's children for nested (STRUCT/LIST) types.
func arrowTypeID2Type(f *fieldPos, t arrow.Type) arrow.DataType {
	var dt arrow.DataType
	switch t {
	// BOOL is a 1 bit, LSB bit-packed ordering
	case arrow.BOOL:
		dt = arrow.FixedWidthTypes.Boolean
	// the set of all signed 8-bit integers (-128 to 127)
	case arrow.INT8:
		dt = arrow.PrimitiveTypes.Int8
	// the set of all unsigned 8-bit integers (0 to 255)
	case arrow.UINT8:
		dt = arrow.PrimitiveTypes.Uint8
	// the set of all signed 16-bit integers (-32768 to 32767)
	case arrow.INT16:
		dt = arrow.PrimitiveTypes.Int16
	// the set of all unsigned 16-bit integers (0 to 65535)
	case arrow.UINT16:
		dt = arrow.PrimitiveTypes.Uint16
	// the set of all signed 32-bit integers (-2147483648 to 2147483647)
	case arrow.INT32:
		dt = arrow.PrimitiveTypes.Int32
	// the set of all unsigned 32-bit integers (0 to 4294967295)
	case arrow.UINT32:
		dt = arrow.PrimitiveTypes.Uint32
	// the set of all signed 64-bit integers (-9223372036854775808 to 9223372036854775807)
	case arrow.INT64:
		dt = arrow.PrimitiveTypes.Int64
	// the set of all unsigned 64-bit integers (0 to 18446744073709551615)
	case arrow.UINT64:
		dt = arrow.PrimitiveTypes.Uint64
	// the set of all IEEE-754 32-bit floating-point numbers
	case arrow.FLOAT32:
		dt = arrow.PrimitiveTypes.Float32
	// the set of all IEEE-754 64-bit floating-point numbers
	case arrow.FLOAT64:
		dt = arrow.PrimitiveTypes.Float64
	// TIMESTAMP is an exact timestamp encoded with int64 since UNIX epoch
	case arrow.TIMESTAMP:
		dt = arrow.FixedWidthTypes.Timestamp_us
	// DATE32 is int32 days since the UNIX epoch
	case arrow.DATE32:
		dt = arrow.FixedWidthTypes.Date32
	// TIME64 is a signed 64-bit integer, representing either microseconds or
	// nanoseconds since midnight
	case arrow.TIME64:
		dt = arrow.FixedWidthTypes.Time64ns
	// STRING is a UTF8 variable-length string
	case arrow.STRING:
		dt = arrow.BinaryTypes.String
	// BINARY is a Variable-length byte type (no guarantee of UTF8-ness)
	case arrow.BINARY:
		dt = arrow.BinaryTypes.Binary
	// NULL type having no physical storage
	case arrow.NULL:
		dt = arrow.BinaryTypes.Binary
	case arrow.STRUCT:
		var fields []arrow.Field
		for _, c := range f.children {
			fields = append(fields, c.field)
		}
		return arrow.StructOf(fields...)
	case arrow.LIST:
		// NOTE(review): this returns a StructOf rather than a ListOf —
		// identical to the STRUCT case above. Looks like a copy-paste slip;
		// confirm against callers before changing.
		var fields []arrow.Field
		for _, c := range f.children {
			fields = append(fields, c.field)
		}
		return arrow.StructOf(fields...)
	}
	return dt
}