├── .github └── dependabot.yaml ├── .gitignore ├── LICENCE.txt ├── README.md ├── bodkin.go ├── cmd ├── .gitignore └── main.go ├── go.mod ├── go.sum ├── json2parquet ├── .gitignore ├── cmd │ ├── .gitignore │ ├── cleaner │ │ └── main.go │ └── main.go └── json2parquet.go ├── option.go ├── pq └── parquet_writer.go ├── reader ├── .gitignore ├── encoder.go ├── input.go ├── loader.go ├── option.go ├── reader.go └── recordfactory.go ├── schema.go └── types.go /.github/dependabot.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: gomod 4 | directory: / 5 | schedule: 6 | interval: daily -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # If you prefer the allow list template instead of the deny list, see community template: 2 | # https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore 3 | # 4 | # Binaries for programs and plugins 5 | *.exe 6 | *.exe~ 7 | *.dll 8 | *.so 9 | *.dylib 10 | 11 | # Test binary, built with `go test -c` 12 | *.test 13 | 14 | # Output of the go coverage tool, specifically when used with LiteIDE 15 | *.out 16 | 17 | # Dependency directories (remove the comment below to include it) 18 | # vendor/ 19 | 20 | # Go workspace file 21 | go.work 22 | go.work.sum 23 | 24 | # env file 25 | .env 26 | 27 | internal 28 | avro 29 | pochard 30 | experiments 31 | map.go 32 | *.schema 33 | *.pgo 34 | debug -------------------------------------------------------------------------------- /LICENCE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Bodkin 🏹 2 | =================== 3 | [![Go Reference](https://pkg.go.dev/badge/github.com/loicalleyne/bodkin.svg)](https://pkg.go.dev/github.com/loicalleyne/bodkin) 4 | 5 | Go library for generating schemas and decoding generic map values and native Go structures to Apache Arrow. 
6 | 7 | The goal is to provide a useful toolkit to make it easier to use Arrow, and by extension Parquet, especially on data whose schema is evolving or not strictly defined. 8 | An example would be with working with data retrieved from a 3rd-party API that does not maintain their OpenAPI spec. 9 | 10 | Bodkin enables you to use your _data_ to define and evolve your Arrow Schema. 11 | 12 | ## Features 13 | ### Arrow schema generation from data type inference 14 | - Converts a structured input (json string or []byte, Go struct or map[string]any) into an Apache Arrow schema 15 | - Supports nested types 16 | - Automatically evolves the Arrow schema with new fields when providing [new inputs](https://pkg.go.dev/github.com/loicalleyne/bodkin#Bodkin.Unify) 17 | - Option to merge new infered schema at existing path for composibility ([bodkin.UnifyAtPath](https://pkg.go.dev/github.com/loicalleyne/bodkin#Bodkin.UnifyAtPath)) 18 | - Converts schema field types when unifying schemas to accept evolving input data ([bodkin.WithTypeConversion](https://pkg.go.dev/github.com/loicalleyne/bodkin#WithTypeConversion)) 19 | - Tracks [changes](https://pkg.go.dev/github.com/loicalleyne/bodkin#Bodkin.Changes) to the schema 20 | - [Export](https://pkg.go.dev/github.com/loicalleyne/bodkin#Bodkin.ExportSchemaFile)/[import](https://pkg.go.dev/github.com/loicalleyne/bodkin#Bodkin.ImportSchemaFile) a serialized Arrow schema to/from file or `[]byte` to transmit or persist schema definition 21 | ### Custom data loader 22 | - Load structured data directly to Arrow Records based on inferred schema 23 | - Individual input to Arrow Record with [reader.ReadToRecord](https://pkg.go.dev/github.com/loicalleyne/bodkin/reader#DataReader.ReadToRecord) 24 | - io.Reader stream to Arrow Records ([bodkin.WithIOReader](https://pkg.go.dev/github.com/loicalleyne/bodkin#WithIOReader)) 25 | - retrieve a single `arrow.Record` with [reader.Next](https://pkg.go.dev/github.com/loicalleyne/bodkin/reader#DataReader.Next) 
26 | - retrieve a `[]arrow.Record` with [reader.NextBatch](https://pkg.go.dev/github.com/loicalleyne/bodkin/reader#DataReader.NextBatch) 27 | 28 | ## 🚀 Install 29 | 30 | Using Bodkin is easy. First, use `go get` to install the latest version 31 | of the library. 32 | 33 | ```sh 34 | go get -u github.com/loicalleyne/bodkin@latest 35 | ``` 36 | 37 | ## 💡 Usage 38 | 39 | You can import `bodkin` using: 40 | 41 | ```go 42 | import "github.com/loicalleyne/bodkin" 43 | ``` 44 | 45 | Create a new Bodkin, provide some structured data and print out the resulting Arrow Schema's string representation and any field evaluation errors 46 | ```go 47 | var jsonS1 string = `{ 48 | "count": 89, 49 | "next": "https://sub.domain.com/api/search/?models=thurblig&page=3", 50 | "previous": null, 51 | "results": [{"id":7594}], 52 | "arrayscalar":[], 53 | "datefield":"1979-01-01", 54 | "timefield":"01:02:03" 55 | }` 56 | u, _ := bodkin.NewBodkin(bodkin.WithInferTimeUnits(), bodkin.WithTypeConversion()) 57 | u.Unify(jsonS1) 58 | s, _ := u.OriginSchema() 59 | fmt.Printf("original input %v\n", s.String()) 60 | for _, e := range u.Err() { 61 | fmt.Printf("%v : [%s]\n", e.Issue, e.Dotpath) 62 | } 63 | // original input schema: 64 | // fields: 5 65 | // - results: type=list, nullable>, nullable 66 | // - datefield: type=date32, nullable 67 | // - timefield: type=time64[ns], nullable 68 | // - count: type=float64, nullable 69 | // - next: type=utf8, nullable 70 | // could not determine type of unpopulated field : [$previous] 71 | // could not determine element type of empty array : [$arrayscalar] 72 | ``` 73 | 74 | Provide some more structured data and print out the new merged schema and the list of changes 75 | ```go 76 | var jsonS2 string = `{ 77 | "count": 89.5, 78 | "next": "https://sub.domain.com/api/search/?models=thurblig&page=3", 79 | "previous": "https://sub.domain.com/api/search/?models=thurblig&page=2", 80 | "results": 
[{"id":7594,"scalar":241.5,"nestedObj":{"strscalar":"str1","nestedarray":[123,456]}}], 81 | "arrayscalar":["str"], 82 | "datetime":"2024-10-24 19:03:09", 83 | "event_time":"2024-10-24T19:03:09+00:00", 84 | "datefield":"2024-10-24T19:03:09+00:00", 85 | "timefield":"1970-01-01" 86 | }` 87 | u.Unify(jsonS2) 88 | schema, _ := u.Schema() 89 | fmt.Printf("\nunified %v\n", schema.String()) 90 | fmt.Println(u.Changes()) 91 | // unified schema: 92 | // fields: 9 93 | // - count: type=float64, nullable 94 | // - next: type=utf8, nullable 95 | // - results: type=list>>, nullable>, nullable 96 | // - datefield: type=timestamp[ms, tz=UTC], nullable 97 | // - timefield: type=utf8, nullable 98 | // - previous: type=utf8, nullable 99 | // - datetime: type=timestamp[ms, tz=UTC], nullable 100 | // - arrayscalar: type=list, nullable 101 | // - event_time: type=timestamp[ms, tz=UTC], nullable 102 | // changes: 103 | // added $previous : utf8 104 | // added $datetime : timestamp[ms, tz=UTC] 105 | // changed $datefield : from date32 to timestamp[ms, tz=UTC] 106 | // added $results.results.elem.scalar : float64 107 | // added $results.results.elem.nested : struct> 108 | // added $arrayscalar : list 109 | // added $event_time : timestamp[ms, tz=UTC] 110 | // changed $timefield : from time64[ns] to utf8 111 | ``` 112 | 113 | Also works with nested Go structs and slices 114 | ```go 115 | stu := Student{ 116 | Name: "StudentName", 117 | Age: 25, 118 | ID: 123456, 119 | Day: 123, 120 | } 121 | sch := School{ 122 | Name: "SchoolName", 123 | Address: AddressType{ 124 | Country: "CountryName", 125 | }, 126 | } 127 | e, _ := bodkin.NewBodkin(stu, bodkin.WithInferTimeUnits(), bodkin.WithTypeConversion()) 128 | sc, err := e.OriginSchema() 129 | fmt.Printf("original input %v\n", sc.String()) 130 | // original input schema: 131 | // fields: 5 132 | // - ID: type=int64, nullable 133 | // - Day: type=int32, nullable 134 | // - School: type=struct>, nullable 135 | // - Name: type=utf8, nullable 136 | // 
- Age: type=int32, nullable 137 | e.Unify(sch) 138 | sc, err = e.OriginSchema() 139 | fmt.Printf("unified %v\n", sc.String()) 140 | // unified schema: 141 | // fields: 5 142 | // - ID: type=int64, nullable 143 | // - Day: type=int32, nullable 144 | // - School: type=struct>, nullable 145 | // - Name: type=utf8, nullable 146 | // - Age: type=int32, nullable 147 | ``` 148 | 149 | Export your schema to a file, then import the file to retrieve the schema; or export/import to/from a []byte. 150 | ```go 151 | _ = u.ExportSchemaFile("./test.schema") 152 | imp, _ := u.ImportSchemaFile("./test.schema") 153 | fmt.Printf("imported %v\n", imp.String()) 154 | 155 | bs, _ := u.ExportSchemaBytes() 156 | sc, _ := u.ImportSchemaBytes(bs) 157 | fmt.Printf("imported %v\n", sc.String()) 158 | ``` 159 | 160 | Use a Bodkin Reader to load data to Arrow Records 161 | ```go 162 | u := bodkin.NewBodkin(bodkin.WithInferTimeUnits(), bodkin.WithTypeConversion()) 163 | u.Unify(jsonS1) // feed data for schema generation 164 | rdr, _ := u.NewReader() // infered schema in Bodkin used to create Reader 165 | rec, _ := rdr.ReadToRecord([]byte(jsonS1)) // Reader loads data and returns Arrow Record 166 | ``` 167 | 168 | Provide a Bodkin Reader with an io.Reader to load many records 169 | ```go 170 | import "github.com/loicalleyne/bodkin/reader" 171 | ... 172 | u := bodkin.NewBodkin(bodkin.WithInferTimeUnits(), bodkin.WithTypeConversion()) 173 | // Create Reader attached to Bodkin ... 174 | u.NewReader(schema, 0, reader.WithIOReader(ff, reader.DefaultDelimiter), reader.WithChunk(1024)) 175 | for u.Reader.Next(){ 176 | rec := r.Record() 177 | } 178 | // or create a stand-alone Reader if you have an existing *arrow.Schema 179 | rdr, _ := reader.NewReader(schema, 0, reader.WithIOReader(ff, reader.DefaultDelimiter), reader.WithChunk(1024)) 180 | for rdr.Next() { 181 | rec := r.Record() 182 | ... 
183 | } 184 | ``` 185 | 186 | Use the generated Arrow schema with Arrow's built-in JSON reader to decode JSON data into Arrow records 187 | ```go 188 | rdr = array.NewJSONReader(strings.NewReader(jsonS2), schema) 189 | defer rdr.Release() 190 | for rdr.Next() { 191 | rec := rdr.Record() 192 | rj, _ := rec.MarshalJSON() 193 | fmt.Printf("\nmarshaled record:\n%v\n", string(rj)) 194 | } 195 | // marshaled record: 196 | // [{"arrayscalar":["str"],"count":89.5,"datefield":"2024-10-24 19:03:09Z","datetime":"2024-10-24 19:03:09Z","event_time":"2024-10-24 19:03:09Z","next":"https://sub.domain.com/api/search/?models=thurblig\u0026page=3","previous":"https://sub.domain.com/api/search/?models=thurblig\u0026page=2","results":[{"id":7594,"nested":{"nestedarray":[123,456],"strscalar":"str1"},"scalar":241.5}],"timefield":"1970-01-01"} 197 | // ] 198 | ``` 199 | 200 | ## 💫 Show your support 201 | 202 | Give a ⭐️ if this project helped you! 203 | Feedback and PRs welcome. 204 | 205 | ## License 206 | 207 | Bodkin is released under the Apache 2.0 license. See [LICENCE.txt](LICENCE.txt) -------------------------------------------------------------------------------- /bodkin.go: -------------------------------------------------------------------------------- 1 | // Package bodkin is a Go library for generating schemas and decoding generic map values and native Go structures to Apache Arrow. 2 | // The goal is to provide a useful toolkit to make it easier to use Arrow, and by extension Parquet with data whose shape 3 | // is evolving or not strictly defined. 
package bodkin

import (
	"bufio"
	"errors"
	"fmt"
	"io"
	"math"
	"os"
	"slices"
	"strings"

	"github.com/apache/arrow-go/v18/arrow"
	"github.com/apache/arrow-go/v18/arrow/flight"
	"github.com/apache/arrow-go/v18/arrow/memory"
	"github.com/loicalleyne/bodkin/reader"
	omap "github.com/wk8/go-ordered-map/v2"
)

// Option configures a Bodkin.
type (
	Option func(config)
	config *Bodkin
)

// Field represents an element in the input data.
type Field struct {
	Dotpath string     `json:"dotpath"`
	Type    arrow.Type `json:"arrow_type"`
	// Number of child fields if a nested type
	// NOTE(review): name is misspelled ("Childen"); it is exported, so renaming
	// would break callers — left as-is.
	Childen int `json:"children,omitempty"`
	// Evaluation failure reason
	Issue error `json:"issue,omitempty"`
}

// Evaluation states for a field path: its Arrow type is either not yet
// determinable (unknown) or established (known).
const (
	unknown int = 0
	known   int = 1
)

// Bodkin is a collection of field paths, describing the columns of a structured input(s).
type Bodkin struct {
	// io.Reader input and its buffered wrapper used by UnifyScan; delim separates datums.
	rr    io.Reader
	br    *bufio.Reader
	delim byte
	// original is an immutable copy of the first evaluation; old is the unified
	// (mutable) schema graph; new is the graph of the most recent input.
	original *fieldPos
	old      *fieldPos
	new      *fieldPos
	opts     []Option
	Reader   *reader.DataReader
	// Ordered maps keyed by field dotpath: fields with an established type,
	// and fields whose type could not (yet) be evaluated.
	knownFields      *omap.OrderedMap[string, *fieldPos]
	untypedFields    *omap.OrderedMap[string, *fieldPos]
	unificationCount int
	maxCount         int
	// Type-inference behavior toggles set via Options.
	inferTimeUnits         bool
	quotedValuesAreStrings bool
	typeConversion         bool
	// Accumulated evaluation error and the log of schema changes.
	err     error
	changes error
}

// Opts returns the Options the Bodkin was created with.
func (u *Bodkin) Opts() []Option { return u.opts }

// GetReader returns a DataReader, will return an existing DataReader if it exists, if not it will create a new one.
// If the Reader already exists, the opts are ignored. If you want to create a new Reader with different opts, use NewReader.
func (u *Bodkin) GetReader(opts ...reader.Option) (*reader.DataReader, error) {
	if u.Reader == nil {
		return u.NewReader(opts...)
	}
	return u.Reader, nil
}

// NewReader returns a new DataReader, to be used to read structured input into Arrow records.
// The Reader is built from the Bodkin's current unified schema.
func (u *Bodkin) NewReader(opts ...reader.Option) (*reader.DataReader, error) {
	schema, err := u.Schema()
	if err != nil {
		return nil, err
	}
	if schema == nil {
		return nil, fmt.Errorf("nil schema")
	}
	u.Reader, err = reader.NewReader(schema, 0, opts...)
	if err != nil {
		return nil, err
	}
	return u.Reader, nil
}

// NewBodkin returns a new Bodkin value from a structured input.
// Input must be a json byte slice or string, a Go struct with exported fields or map[string]any.
// Any unpopulated fields, empty objects or empty slices in JSON or map[string]any inputs are skipped as their
// types cannot be evaluated and converted.
func NewBodkin(opts ...Option) *Bodkin {
	return newBodkin(opts...)
}

// newBodkin applies the supplied Options and initialises the field-path maps.
func newBodkin(opts ...Option) *Bodkin {
	b := &Bodkin{}
	b.opts = opts
	for _, opt := range opts {
		opt(b)
	}

	// Ordered map of known fields, keys are field dotpaths.
	b.knownFields = omap.New[string, *fieldPos]()
	b.untypedFields = omap.New[string, *fieldPos]()
	b.maxCount = math.MaxInt
	return b
}

// CountPaths returns count of evaluated field paths.
func (u *Bodkin) CountPaths() int {
	return u.knownFields.Len()
}

// CountPending returns count of unevaluated field paths.
func (u *Bodkin) CountPending() int {
	return u.untypedFields.Len()
}

// Err returns a []Field that could not be evaluated to date.
124 | func (u *Bodkin) Err() []Field { 125 | fp := u.sortMapKeysDesc(unknown) 126 | var paths []Field = make([]Field, len(fp)) 127 | for i, p := range fp { 128 | f, _ := u.untypedFields.Get(p) 129 | d := Field{Dotpath: f.dotPath(), Type: f.arrowType} 130 | switch f.arrowType { 131 | case arrow.STRUCT: 132 | d.Issue = fmt.Errorf("struct : %vs", ErrUndefinedFieldType) 133 | case arrow.LIST: 134 | d.Issue = fmt.Errorf("list : %v", ErrUndefinedArrayElementType) 135 | default: 136 | d.Issue = fmt.Errorf("%w", ErrUndefinedFieldType) 137 | } 138 | paths[i] = d 139 | } 140 | return paths 141 | } 142 | 143 | // Changes returns a list of field additions and field type conversions done 144 | // in the lifetime of the Bodkin object. 145 | func (u *Bodkin) Changes() error { return u.changes } 146 | 147 | // Count returns the number of datum evaluated for schema to date. 148 | func (u *Bodkin) Count() int { return u.unificationCount } 149 | 150 | // MaxCount returns the maximum number of datum to be evaluated for schema. 151 | func (u *Bodkin) MaxCount() int { return u.unificationCount } 152 | 153 | // ResetCount resets the count of datum evaluated for schema to date. 154 | func (u *Bodkin) ResetCount() int { 155 | u.unificationCount = 0 156 | return u.unificationCount 157 | } 158 | 159 | // ResetMaxCount resets the maximum number of datam to be evaluated for schema 160 | // to maxInt64. 161 | // ResetCount resets the count of datum evaluated for schema to date. 162 | func (u *Bodkin) ResetMaxCount() int { 163 | u.maxCount = math.MaxInt 164 | return u.unificationCount 165 | } 166 | 167 | // Paths returns a slice of dotpaths of fields successfully evaluated to date. 
168 | func (u *Bodkin) Paths() []Field { 169 | fp := u.sortMapKeysDesc(known) 170 | var paths []Field = make([]Field, len(fp)) 171 | for i, p := range fp { 172 | f, ok := u.knownFields.Get(p) 173 | if !ok { 174 | continue 175 | } 176 | d := Field{Dotpath: f.dotPath(), Type: f.arrowType} 177 | switch f.arrowType { 178 | case arrow.STRUCT: 179 | d.Childen = len(f.children) 180 | } 181 | paths[i] = d 182 | } 183 | return paths 184 | } 185 | 186 | // ExportSchema exports a serialized Arrow Schema to a file. 187 | func (u *Bodkin) ExportSchemaFile(exportPath string) error { 188 | schema, err := u.Schema() 189 | if err != nil { 190 | return err 191 | } 192 | bs := flight.SerializeSchema(schema, memory.DefaultAllocator) 193 | err = os.WriteFile(exportPath, bs, 0644) 194 | if err != nil { 195 | return err 196 | } 197 | return nil 198 | } 199 | 200 | // ImportSchema imports a serialized Arrow Schema from a file. 201 | func (u *Bodkin) ImportSchemaFile(importPath string) (*arrow.Schema, error) { 202 | dat, err := os.ReadFile(importPath) 203 | if err != nil { 204 | return nil, err 205 | } 206 | return flight.DeserializeSchema(dat, memory.DefaultAllocator) 207 | } 208 | 209 | // ExportSchemaBytes exports a serialized Arrow Schema. 210 | func (u *Bodkin) ExportSchemaBytes() ([]byte, error) { 211 | schema, err := u.Schema() 212 | if err != nil { 213 | return nil, err 214 | } 215 | return flight.SerializeSchema(schema, memory.DefaultAllocator), nil 216 | } 217 | 218 | // ImportSchemaBytes imports a serialized Arrow Schema. 219 | func (u *Bodkin) ImportSchemaBytes(dat []byte) (*arrow.Schema, error) { 220 | return flight.DeserializeSchema(dat, memory.DefaultAllocator) 221 | } 222 | 223 | // Unify merges structured input's column definition with the previously input's schema. 224 | // Any unpopulated fields, empty objects or empty slices in JSON input are skipped. 
225 | func (u *Bodkin) Unify(a any) error { 226 | if u.unificationCount > u.maxCount { 227 | return fmt.Errorf("maxcount exceeded") 228 | } 229 | m, err := reader.InputMap(a) 230 | if err != nil { 231 | u.err = fmt.Errorf("%v : %v", ErrInvalidInput, err) 232 | return fmt.Errorf("%v : %v", ErrInvalidInput, err) 233 | } 234 | if u.old == nil { 235 | // Keep an immutable copy of the initial evaluation. 236 | g := newFieldPos(u) 237 | mapToArrow(g, m) 238 | u.original = g 239 | // Identical to above except this one can be mutated with Unify. 240 | f := newFieldPos(u) 241 | mapToArrow(f, m) 242 | u.old = f 243 | return nil 244 | } 245 | f := newFieldPos(u) 246 | mapToArrow(f, m) 247 | u.new = f 248 | for _, field := range u.new.children { 249 | u.merge(field, nil) 250 | } 251 | u.unificationCount++ 252 | return nil 253 | } 254 | 255 | // UnifyScan reads from a provided io.Reader and merges each datum's structured input's column definition 256 | // with the previously input's schema. Any unpopulated fields, empty objects or empty slices 257 | // in JSON input are skipped. 
258 | func (u *Bodkin) UnifyScan() error { 259 | var err error 260 | if u.rr == nil { 261 | return fmt.Errorf("no io.reader provided") 262 | } 263 | if u.unificationCount > u.maxCount { 264 | return fmt.Errorf("maxcount exceeded") 265 | } 266 | defer func() error { 267 | if rc := recover(); rc != nil { 268 | u.err = errors.Join(u.err, err, fmt.Errorf("panic %v", rc)) 269 | } 270 | return u.err 271 | }() 272 | for { 273 | datumBytes, err := u.br.ReadBytes(u.delim) 274 | if err != nil { 275 | if errors.Is(err, io.EOF) { 276 | u.err = nil 277 | break 278 | } 279 | u.err = err 280 | break 281 | } 282 | m, err := reader.InputMap(datumBytes) 283 | if err != nil { 284 | u.err = errors.Join(u.err, err) 285 | continue 286 | } 287 | u.Unify(m) 288 | } 289 | return u.err 290 | } 291 | 292 | // Unify merges structured input's column definition with the previously input's schema, 293 | // using a specified valid path as the root. An error is returned if the mergeAt path is 294 | // not found. 295 | // Any unpopulated fields, empty objects or empty slices in JSON input are skipped. 
296 | func (u *Bodkin) UnifyAtPath(a any, mergeAt string) error { 297 | if u.old == nil { 298 | return fmt.Errorf("bodkin not initialised") 299 | } 300 | if u.unificationCount > u.maxCount { 301 | return fmt.Errorf("maxcount exceeded") 302 | } 303 | mergePath := make([]string, 0) 304 | if !(len(mergeAt) == 0 || mergeAt == "$") { 305 | mergePath = strings.Split(strings.TrimPrefix(mergeAt, "$"), ".") 306 | } 307 | if _, ok := u.knownFields.Get(mergeAt); !ok { 308 | return fmt.Errorf("unitfyatpath %s : %v", mergeAt, ErrPathNotFound) 309 | } 310 | 311 | m, err := reader.InputMap(a) 312 | if err != nil { 313 | u.err = fmt.Errorf("%v : %v", ErrInvalidInput, err) 314 | return fmt.Errorf("%v : %v", ErrInvalidInput, err) 315 | } 316 | 317 | f := newFieldPos(u) 318 | mapToArrow(f, m) 319 | u.new = f 320 | for _, field := range u.new.children { 321 | u.merge(field, mergePath) 322 | } 323 | u.unificationCount++ 324 | return nil 325 | } 326 | 327 | // Schema returns the original Arrow schema generated from the structure/types of 328 | // the initial input, and a panic recovery error if the schema could not be created. 329 | func (u *Bodkin) OriginSchema() (*arrow.Schema, error) { 330 | if u.old == nil { 331 | return nil, fmt.Errorf("bodkin not initialised") 332 | } 333 | var s *arrow.Schema 334 | defer func(s *arrow.Schema) (*arrow.Schema, error) { 335 | if pErr := recover(); pErr != nil { 336 | return nil, fmt.Errorf("schema problem: %v", pErr) 337 | } 338 | return s, nil 339 | }(s) 340 | var fields []arrow.Field 341 | for _, c := range u.original.children { 342 | fields = append(fields, c.field) 343 | } 344 | s = arrow.NewSchema(fields, nil) 345 | return s, nil 346 | } 347 | 348 | // Schema returns the current merged Arrow schema generated from the structure/types of 349 | // the input(s), and a panic recovery error if the schema could not be created. 
350 | // If the Bodkin has a Reader and the schema has been updated since its creation, the Reader 351 | // will replaced with a new one matching the current schema. Any 352 | func (u *Bodkin) Schema() (*arrow.Schema, error) { 353 | if u.old == nil { 354 | return nil, fmt.Errorf("bodkin not initialised") 355 | } 356 | var s *arrow.Schema 357 | defer func(s *arrow.Schema) (*arrow.Schema, error) { 358 | if pErr := recover(); pErr != nil { 359 | return nil, fmt.Errorf("schema problem: %v", pErr) 360 | } 361 | return s, nil 362 | }(s) 363 | var fields []arrow.Field 364 | for _, c := range u.old.children { 365 | fields = append(fields, c.field) 366 | } 367 | s = arrow.NewSchema(fields, nil) 368 | if u.Reader != nil { 369 | if !u.Reader.Schema().Equal(s) { 370 | u.Reader, _ = reader.NewReader(s, 0, u.Reader.Opts()...) 371 | } 372 | } 373 | return s, nil 374 | } 375 | 376 | // LastSchema returns the Arrow schema generated from the structure/types of 377 | // the most recent input. Any unpopulated fields, empty objects or empty slices are skipped. 378 | // ErrNoLatestSchema if Unify() has never been called. A panic recovery error is returned 379 | // if the schema could not be created. 380 | func (u *Bodkin) LastSchema() (*arrow.Schema, error) { 381 | if u.new == nil { 382 | return nil, ErrNoLatestSchema 383 | } 384 | var s *arrow.Schema 385 | defer func(s *arrow.Schema) (*arrow.Schema, error) { 386 | if pErr := recover(); pErr != nil { 387 | return nil, fmt.Errorf("schema problem: %v", pErr) 388 | } 389 | return s, nil 390 | }(s) 391 | var fields []arrow.Field 392 | for _, c := range u.new.children { 393 | fields = append(fields, c.field) 394 | } 395 | s = arrow.NewSchema(fields, nil) 396 | return s, nil 397 | } 398 | 399 | // merge merges a new or changed field into the unified schema. 400 | // Conflicting TIME, DATE, TIMESTAMP types are upgraded to STRING. 401 | // DATE can upgrade to TIMESTAMP. 402 | // INTEGER can upgrade to FLOAT. 
func (u *Bodkin) merge(n *fieldPos, mergeAt []string) {
	// Resolve n's absolute path (and its parent's) relative to the merge root,
	// so grafts land under mergeAt rather than at the schema root.
	var nPath, nParentPath []string
	if len(mergeAt) > 0 {
		nPath = slices.Concat(mergeAt, n.path)
		nParentPath = slices.Concat(mergeAt, n.parent.path)
	} else {
		nPath = n.path
		nParentPath = n.parent.path
	}
	if kin, err := u.old.getPath(nPath); err == ErrPathNotFound {
		// Field does not exist in the unified schema yet: graft it in.
		// root graft
		if n.root == n.parent {
			u.old.root.graft(n)
		} else {
			// branch graft
			b, _ := u.old.getPath(nParentPath)
			b.graft(n)
		}
	} else {
		// Field already exists. When type conversion is enabled and the types
		// differ, widen the existing field per the rules documented above:
		// the outer switch is the EXISTING (kin) type, the inner switch is the
		// INCOMING (n) type; anything unresolvable falls back to STRING.
		if u.typeConversion && (!kin.field.Equal(n.field) && kin.field.Type.ID() != n.field.Type.ID()) {
			switch kin.field.Type.ID() {
			case arrow.NULL:
				// NULL stays as-is; nothing to widen from.
				break
			case arrow.STRING:
				// STRING is the widest type in this lattice; never downgrade.
				break
			case arrow.INT8, arrow.INT16, arrow.INT32, arrow.INT64, arrow.UINT8, arrow.UINT16, arrow.UINT32, arrow.UINT64:
				switch n.field.Type.ID() {
				case arrow.FLOAT16, arrow.FLOAT32, arrow.FLOAT64:
					// Integer meets float: widen to FLOAT64.
					err := kin.upgradeType(n, arrow.FLOAT64)
					if err != nil {
						kin.err = errors.Join(kin.err, err)
					}
				default:
					err := kin.upgradeType(n, arrow.STRING)
					if err != nil {
						kin.err = errors.Join(kin.err, err)
					}
				}
			case arrow.FLOAT16:
				switch n.field.Type.ID() {
				case arrow.FLOAT32:
					err := kin.upgradeType(n, arrow.FLOAT32)
					if err != nil {
						kin.err = errors.Join(kin.err, err)
					}
				case arrow.FLOAT64:
					err := kin.upgradeType(n, arrow.FLOAT64)
					if err != nil {
						kin.err = errors.Join(kin.err, err)
					}
				default:
					err := kin.upgradeType(n, arrow.STRING)
					if err != nil {
						kin.err = errors.Join(kin.err, err)
					}
				}
			case arrow.FLOAT32:
				switch n.field.Type.ID() {
				case arrow.FLOAT64:
					err := kin.upgradeType(n, arrow.FLOAT64)
					if err != nil {
						kin.err = errors.Join(kin.err, err)
					}
				default:
					err := kin.upgradeType(n, arrow.STRING)
					if err != nil {
						kin.err = errors.Join(kin.err, err)
					}
				}
			case arrow.FLOAT64:
				switch n.field.Type.ID() {
				case arrow.INT8, arrow.INT16, arrow.INT32, arrow.INT64, arrow.UINT8, arrow.UINT16, arrow.UINT32, arrow.UINT64, arrow.FLOAT16, arrow.FLOAT32:
					// FLOAT64 already covers all narrower numerics; keep it.
					break
				default:
					err := kin.upgradeType(n, arrow.STRING)
					if err != nil {
						kin.err = errors.Join(kin.err, err)
					}
				}
			case arrow.TIMESTAMP:
				switch n.field.Type.ID() {
				case arrow.TIME64:
					// TIMESTAMP vs TIME64 conflict: degrade to STRING.
					err := kin.upgradeType(n, arrow.STRING)
					if err != nil {
						kin.err = errors.Join(kin.err, err)
					}
				}
			case arrow.DATE32:
				switch n.field.Type.ID() {
				case arrow.TIMESTAMP:
					// DATE can widen to TIMESTAMP.
					err := kin.upgradeType(n, arrow.TIMESTAMP)
					if err != nil {
						kin.err = errors.Join(kin.err, err)
					}
				// case arrow.TIME64:
				default:
					err := kin.upgradeType(n, arrow.STRING)
					if err != nil {
						kin.err = errors.Join(kin.err, err)
					}
				}
			case arrow.TIME64:
				switch n.field.Type.ID() {
				case arrow.DATE32, arrow.TIMESTAMP:
					err := kin.upgradeType(n, arrow.STRING)
					if err != nil {
						kin.err = errors.Join(kin.err, err)
					}
				}
			}
		}
		// Recurse into children only when the node already existed; a graft
		// above brings its whole subtree along with it.
		for _, v := range n.childmap {
			u.merge(v, mergeAt)
		}
	}
}

// sortMapKeysDesc returns the keys of the selected field map ordered from the
// deepest dotted path to the shallowest. k selects which map: known or unknown
// (untyped) fields; any other value yields an empty (nil) result.
func (u *Bodkin) sortMapKeysDesc(k int) []string {
	var m *omap.OrderedMap[string, *fieldPos]
	var sortedPaths, paths []string
	switch k {
	case known:
		sortedPaths = make([]string, u.knownFields.Len())
		paths = make([]string, u.knownFields.Len())
		m = u.knownFields
	case unknown:
		sortedPaths = make([]string, u.untypedFields.Len())
		paths = make([]string, u.untypedFields.Len())
		m = u.untypedFields
	default:
		return sortedPaths
	}
	if m.Len() == 0 {
		return sortedPaths
	}
	// Walk the ordered map newest-to-oldest to collect the raw paths.
	i := 0
	for pair := m.Newest(); pair !=
nil; pair = pair.Prev() { 540 | paths[i] = pair.Key 541 | i++ 542 | } 543 | maxDepth := 0 544 | for _, p := range paths { 545 | pathDepth := strings.Count(p, ".") 546 | if pathDepth > maxDepth { 547 | maxDepth = pathDepth 548 | } 549 | } 550 | sortIndex := 0 551 | for maxDepth >= 0 { 552 | for _, p := range paths { 553 | pathDepth := strings.Count(p, ".") 554 | if pathDepth == maxDepth { 555 | sortedPaths[sortIndex] = p 556 | sortIndex++ 557 | } 558 | } 559 | maxDepth-- 560 | } 561 | return sortedPaths 562 | } 563 | -------------------------------------------------------------------------------- /cmd/.gitignore: -------------------------------------------------------------------------------- 1 | *.json 2 | <<<<<<< Updated upstream 3 | *.bak 4 | main?.go 5 | ======= 6 | *.bak 7 | >>>>>>> Stashed changes 8 | -------------------------------------------------------------------------------- /cmd/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "log" 7 | "os" 8 | "time" 9 | 10 | "github.com/loicalleyne/bodkin" 11 | "github.com/loicalleyne/bodkin/reader" 12 | ) 13 | 14 | func main() { 15 | start := time.Now() 16 | filepath := "large-file.json" 17 | log.Println("start") 18 | var u *bodkin.Bodkin 19 | if 1 == 1 { 20 | f, err := os.Open(filepath) 21 | if err != nil { 22 | panic(err) 23 | } 24 | defer f.Close() 25 | s := bufio.NewScanner(f) 26 | u = bodkin.NewBodkin(bodkin.WithInferTimeUnits(), bodkin.WithTypeConversion()) 27 | if err != nil { 28 | panic(err) 29 | } 30 | 31 | for s.Scan() { 32 | err = u.Unify(s.Bytes()) 33 | if err != nil { 34 | panic(err) 35 | } 36 | } 37 | f.Close() 38 | err = u.ExportSchemaFile("temp.bak") 39 | if err != nil { 40 | panic(err) 41 | } 42 | } 43 | if 1 == 1 { 44 | schema, err := u.ImportSchemaFile("temp.bak") 45 | if err != nil { 46 | panic(err) 47 | } 48 | ff, err := os.Open(filepath) 49 | if err != nil { 50 | panic(err) 51 | } 52 | defer 
ff.Close() 53 | r, err := reader.NewReader(schema, 0, reader.WithIOReader(ff, reader.DefaultDelimiter), reader.WithChunk(1024*16)) 54 | if err != nil { 55 | panic(err) 56 | } 57 | 58 | log.Printf("union %v\n", schema.String()) 59 | log.Printf("elapsed: %v\n", time.Since(start)) 60 | 61 | i := 0 62 | // for r.Next() { 63 | // rec := r.Record() 64 | // _, err := rec.MarshalJSON() 65 | // if err != nil { 66 | // fmt.Printf("error marshaling record: %v\n", err) 67 | // } 68 | // // fmt.Printf("\nmarshaled record :\n%v\n", string(rj)) 69 | // i++ 70 | // } 71 | for r.NextBatch(1024) { 72 | recs := r.RecordBatch() 73 | for _, rec := range recs { 74 | _, err := rec.MarshalJSON() 75 | if err != nil { 76 | fmt.Printf("error marshaling record: %v\n", err) 77 | } 78 | // fmt.Printf("\nmarshaled record :\n%v\n", string(rj)) 79 | i++ 80 | } 81 | } 82 | log.Println("records", r.Count(), i) 83 | } 84 | log.Printf("elapsed: %v\n", time.Since(start)) 85 | log.Println("end") 86 | } 87 | 88 | var jsonS1 string = `{"location_types":[{"enumeration_id":"702","id":81,"name":"location81"}],"misc_id":"123456789987a"}` 89 | 90 | var jsonS3 string = `{ 91 | "count": 85, 92 | "next": "https://sub.domain.com/api/search/?models=thurblig", 93 | "previous": null, 94 | "results": [ 95 | { 96 | "id": 6328, 97 | "name": "New user SMB check 2310-1", 98 | "external_id": null, 99 | "title": "New user SMB check 2310-1", 100 | "content_type": "new agent", 101 | "model": "Agent", 102 | "emptyobj":{}, 103 | "dataobj": { 104 | "id": 6328, 105 | "nestednullscalar": null, 106 | "dsp": { 107 | "id": 116, 108 | "name": "El Thingy Bueno", 109 | "nullarray":[] 110 | }, 111 | "name": "New user SMB check 2310-1", 112 | "agency":{ 113 | "id": 925, 114 | "name": "New user SMB check 2310-1", 115 | "employees":[{"id":99,"name":"abcd"},{"id":87,"name":"smart"}] 116 | }, 117 | "export_status": { 118 | "status": true 119 | } 120 | } 121 | } 122 | ] 123 | }` 124 | 125 | var jsonS2 string = `{"address":"11540 Foo 
Ave.","allowed_ad_types":[{"id":1,"name":"static"},{"id":2,"name":"video"},{"id":3,"name":"audio"},{"id":4,"name":"HTML"}],"allows_motion":true,"aspect_ratio":{"horizontal":16,"id":5,"name":"16:9","vertical":9},"audience_data_sources":[{"id":3,"name":"GeoPath"},{"id":4,"name":"1st party data"},{"id":7,"name":"Dutch outdoor research"},{"id":10,"name":"COMMB"}],"average_imp_multiplier":21,"average_weekly_impressions":123,"bearing":100,"bearing_direction":"E","bid_floors":[{"currency":{"code":"USD","id":1,"name":"US Dollars","symbol":"$"},"floor":10},{"currency":{"code":"CAD","id":9,"name":"Canadian dollar","symbol":"$"},"floor":0.01},{"currency":{"code":"AUD","id":8,"name":"Australian dollar","symbol":"$"},"floor":0.01}],"connectivity":1,"demography_type":"basic","device_id":"1234.broadsign.com","diagonal_size":88,"diagonal_size_units":"inches","dma":{"code":662,"id":5,"name":"Abilene-Sweetwater, TX"},"export_status":{"status":true},"geo":{"city":{"id":344757,"name":"Acme"},"country":{"id":40,"name":"Canada"},"region":{"id":485,"name":"Alberta"}},"hivestack_id":"abcd1234efgh","id":1,"internal_publisher_screen_id":"1q2w3e","is_active":true,"is_audio":false,"latitude":45.5017,"longitude":73.5673,"max_ad_duration":90,"min_ad_duration":5,"most_recent":1,"name":"Office test screen (Jody) - DO NOT DELETE","ox_enabled":false,"publisher":{"additional_currencies":[{"code":"CAD","id":9,"name":"Canadian dollar","symbol":"$"},{"code":"AUD","id":8,"name":"Australian dollar","symbol":"$"}],"currency":{"code":"USD","id":1,"name":"US Dollars","symbol":"$"},"id":1,"is_hivestack_bidder":true,"is_multi_currency_enabled":true,"is_px_bidder":true,"is_vistar_bidder":true,"name":"Publisher 
Demo"},"resolution":{"height":1080,"id":835,"name":"1920x1080","orientation":"landscape","title":"1920x1080","width":1920},"screen_count":1,"screen_img_url":"https://www.youtube.com/watch?v=8v7KJoGDGwI","screen_type":{"id":105,"name":"LED"},"tags":[{"id":6656,"name":"test"}],"time_zone":{"id":306,"name":"America/Edmonton"},"timestamp":"2024-11-01 05:20:06.642057","total":0,"transact_status":"ok","transact_status_ox":"ok","venue_types":[{"enumeration_id":"602","id":81,"name":"education.colleges"}],"vistar_id":"123456789987a"} 126 | ` 127 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/loicalleyne/bodkin 2 | 3 | go 1.24.1 4 | 5 | require ( 6 | github.com/apache/arrow-go/v18 v18.3.0 7 | github.com/go-viper/mapstructure/v2 v2.2.1 8 | github.com/goccy/go-json v0.10.5 9 | github.com/redpanda-data/benthos/v4 v4.52.0 10 | github.com/wk8/go-ordered-map/v2 v2.1.8 11 | ) 12 | 13 | require ( 14 | github.com/Jeffail/gabs/v2 v2.7.0 // indirect 15 | github.com/OneOfOne/xxhash v1.2.8 // indirect 16 | github.com/andybalholm/brotli v1.1.1 // indirect 17 | github.com/apache/thrift v0.21.0 // indirect 18 | github.com/bahlo/generic-list-go v0.2.0 // indirect 19 | github.com/buger/jsonparser v1.1.1 // indirect 20 | github.com/cespare/xxhash/v2 v2.3.0 // indirect 21 | github.com/go-logr/logr v1.4.2 // indirect 22 | github.com/go-logr/stdr v1.2.2 // indirect 23 | github.com/gofrs/uuid/v5 v5.3.2 // indirect 24 | github.com/golang/snappy v1.0.0 // indirect 25 | github.com/google/flatbuffers v25.2.10+incompatible // indirect 26 | github.com/google/uuid v1.6.0 // indirect 27 | github.com/klauspost/asmfmt v1.3.2 // indirect 28 | github.com/klauspost/compress v1.18.0 // indirect 29 | github.com/klauspost/cpuid/v2 v2.2.10 // indirect 30 | github.com/mailru/easyjson v0.7.7 // indirect 31 | github.com/matoous/go-nanoid/v2 v2.1.0 // indirect 32 | 
github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 // indirect 33 | github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 // indirect 34 | github.com/pierrec/lz4/v4 v4.1.22 // indirect 35 | github.com/segmentio/ksuid v1.0.4 // indirect 36 | github.com/tilinna/z85 v1.0.0 // indirect 37 | github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb // indirect 38 | github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 // indirect 39 | github.com/xeipuuv/gojsonschema v1.2.0 // indirect 40 | github.com/zeebo/xxh3 v1.0.2 // indirect 41 | go.opentelemetry.io/auto/sdk v1.1.0 // indirect 42 | go.opentelemetry.io/otel v1.36.0 // indirect 43 | go.opentelemetry.io/otel/metric v1.36.0 // indirect 44 | go.opentelemetry.io/otel/trace v1.36.0 // indirect 45 | golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 // indirect 46 | golang.org/x/mod v0.24.0 // indirect 47 | golang.org/x/net v0.39.0 // indirect 48 | golang.org/x/sync v0.14.0 // indirect 49 | golang.org/x/sys v0.33.0 // indirect 50 | golang.org/x/text v0.25.0 // indirect 51 | golang.org/x/tools v0.32.0 // indirect 52 | golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da // indirect 53 | google.golang.org/genproto/googleapis/rpc v0.0.0-20250218202821-56aae31c358a // indirect 54 | google.golang.org/grpc v1.72.0 // indirect 55 | google.golang.org/protobuf v1.36.6 // indirect 56 | gopkg.in/yaml.v3 v3.0.1 // indirect 57 | ) 58 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | cuelang.org/go v0.13.0 h1:Z9NQY9RK3zMbjq1ZK67hvOV58pI3FKQgfuu1Znz+akQ= 2 | cuelang.org/go v0.13.0/go.mod h1:8MoQXu+RcXsa2s9mebJN1HJ1orVDc9aI9/yKi6Dzsi4= 3 | github.com/Jeffail/gabs/v2 v2.7.0 h1:Y2edYaTcE8ZpRsR2AtmPu5xQdFDIthFG0jYhu5PY8kg= 4 | github.com/Jeffail/gabs/v2 v2.7.0/go.mod h1:dp5ocw1FvBBQYssgHsG7I1WYsiLRtkUaB1FEtSwvNUw= 5 | github.com/Jeffail/grok v1.1.0 
h1:kiHmZ+0J5w/XUihRgU3DY9WIxKrNQCDjnfAb6bMLFaE= 6 | github.com/Jeffail/grok v1.1.0/go.mod h1:dm0hLksrDwOMa6To7ORXCuLbuNtASIZTfYheavLpsuE= 7 | github.com/Jeffail/shutdown v1.0.0 h1:afYjnY4pksqP/012m3NGJVccDI+WATdSzIMVHZKU8/Y= 8 | github.com/Jeffail/shutdown v1.0.0/go.mod h1:5dT4Y1oe60SJELCkmAB1pr9uQyHBhh6cwDLQTfmuO5U= 9 | github.com/OneOfOne/xxhash v1.2.8 h1:31czK/TI9sNkxIKfaUfGlU47BAxQ0ztGgd9vPyqimf8= 10 | github.com/OneOfOne/xxhash v1.2.8/go.mod h1:eZbhyaAYD41SGSSsnmcpxVoRiQ/MPUTjUdIIOT9Um7Q= 11 | github.com/andybalholm/brotli v1.1.1 h1:PR2pgnyFznKEugtsUo0xLdDop5SKXd5Qf5ysW+7XdTA= 12 | github.com/andybalholm/brotli v1.1.1/go.mod h1:05ib4cKhjx3OQYUY22hTVd34Bc8upXjOLL2rKwwZBoA= 13 | github.com/apache/arrow-go/v18 v18.3.0 h1:Xq4A6dZj9Nu33sqZibzn012LNnewkTUlfKVUFD/RX/I= 14 | github.com/apache/arrow-go/v18 v18.3.0/go.mod h1:eEM1DnUTHhgGAjf/ChvOAQbUQ+EPohtDrArffvUjPg8= 15 | github.com/apache/thrift v0.21.0 h1:tdPmh/ptjE1IJnhbhrcl2++TauVjy242rkV/UzJChnE= 16 | github.com/apache/thrift v0.21.0/go.mod h1:W1H8aR/QRtYNvrPeFXBtobyRkd0/YVhTc6i07XIAgDw= 17 | github.com/bahlo/generic-list-go v0.2.0 h1:5sz/EEAK+ls5wF+NeqDpk5+iNdMDXrh3z3nPnH1Wvgk= 18 | github.com/bahlo/generic-list-go v0.2.0/go.mod h1:2KvAjgMlE5NNynlg/5iLrrCCZ2+5xWbdbCW3pNTGyYg= 19 | github.com/buger/jsonparser v1.1.1 h1:2PnMjfWD7wBILjqQbt530v576A/cAbQvEW9gGIpYMUs= 20 | github.com/buger/jsonparser v1.1.1/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0= 21 | github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= 22 | github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= 23 | github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= 24 | github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= 25 | github.com/cockroachdb/apd/v3 v3.2.1 h1:U+8j7t0axsIgvQUqthuNm82HIrYXodOV2iWLWtEaIwg= 26 | github.com/cockroachdb/apd/v3 v3.2.1/go.mod h1:klXJcjp+FffLTHlhIG69tezTDvdP065naDsHzKhYSqc= 27 | 
github.com/cpuguy83/go-md2man/v2 v2.0.5 h1:ZtcqGrnekaHpVLArFSe4HK5DoKx1T0rq2DwVB0alcyc= 28 | github.com/cpuguy83/go-md2man/v2 v2.0.5/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= 29 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 30 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 31 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 32 | github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= 33 | github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= 34 | github.com/fatih/color v1.18.0 h1:S8gINlzdQ840/4pfAwic/ZE0djQEH3wM94VfqLTZcOM= 35 | github.com/fatih/color v1.18.0/go.mod h1:4FelSpRwEGDpQ12mAdzqdOukCy4u8WUtOY6lkT/6HfU= 36 | github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= 37 | github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= 38 | github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= 39 | github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= 40 | github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= 41 | github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= 42 | github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= 43 | github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= 44 | github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= 45 | github.com/go-viper/mapstructure/v2 v2.2.1 h1:ZAaOCxANMuZx5RCeg0mBdEZk7DZasvvZIxtHqx8aGss= 46 | github.com/go-viper/mapstructure/v2 v2.2.1/go.mod h1:oJDH3BJKyqBA2TXFhDsKDGDTlndYOZ6rGS0BRZIxGhM= 47 | github.com/goccy/go-json v0.10.5 h1:Fq85nIqj+gXn/S5ahsiTlK3TmC85qgirsdTP/+DeaC4= 48 | github.com/goccy/go-json v0.10.5/go.mod 
h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M= 49 | github.com/gofrs/uuid/v5 v5.3.2 h1:2jfO8j3XgSwlz/wHqemAEugfnTlikAYHhnqQ8Xh4fE0= 50 | github.com/gofrs/uuid/v5 v5.3.2/go.mod h1:CDOjlDMVAtN56jqyRUZh58JT31Tiw7/oQyEXZV+9bD8= 51 | github.com/golang-jwt/jwt/v5 v5.2.2 h1:Rl4B7itRWVtYIHFrSNd7vhTiz9UpLdi6gZhZ3wEeDy8= 52 | github.com/golang-jwt/jwt/v5 v5.2.2/go.mod h1:pqrtFR0X4osieyHYxtmOUWsAWrfe1Q5UVIyoH402zdk= 53 | github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= 54 | github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= 55 | github.com/golang/snappy v1.0.0 h1:Oy607GVXHs7RtbggtPBnr2RmDArIsAefDwvrdWvRhGs= 56 | github.com/golang/snappy v1.0.0/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= 57 | github.com/google/flatbuffers v25.2.10+incompatible h1:F3vclr7C3HpB1k9mxCGRMXq6FdUalZ6H/pNX4FP1v0Q= 58 | github.com/google/flatbuffers v25.2.10+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= 59 | github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= 60 | github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= 61 | github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= 62 | github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= 63 | github.com/gorilla/handlers v1.5.2 h1:cLTUSsNkgcwhgRqvCNmdbRWG0A3N4F+M2nWKdScwyEE= 64 | github.com/gorilla/handlers v1.5.2/go.mod h1:dX+xVpaxdSw+q0Qek8SSsl3dfMk3jNddUkMzo0GtH0w= 65 | github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= 66 | github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ= 67 | github.com/govalues/decimal v0.1.36 h1:dojDpsSvrk0ndAx8+saW5h9WDIHdWpIwrH/yhl9olyU= 68 | github.com/govalues/decimal v0.1.36/go.mod h1:Ee7eI3Llf7hfqDZtpj8Q6NCIgJy1iY3kH1pSwDrNqlM= 69 | github.com/hashicorp/golang-lru/arc/v2 v2.0.7 h1:QxkVTxwColcduO+LP7eJO56r2hFiG8zEbfAAzRv52KQ= 70 | 
github.com/hashicorp/golang-lru/arc/v2 v2.0.7/go.mod h1:Pe7gBlGdc8clY5LJ0LpJXMt5AmgmWNH1g+oFFVUHOEc= 71 | github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= 72 | github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= 73 | github.com/influxdata/go-syslog/v3 v3.0.0 h1:jichmjSZlYK0VMmlz+k4WeOQd7z745YLsvGMqwtYt4I= 74 | github.com/influxdata/go-syslog/v3 v3.0.0/go.mod h1:tulsOp+CecTAYC27u9miMgq21GqXRW6VdKbOG+QSP4Q= 75 | github.com/itchyny/gojq v0.12.17 h1:8av8eGduDb5+rvEdaOO+zQUjA04MS0m3Ps8HiD+fceg= 76 | github.com/itchyny/gojq v0.12.17/go.mod h1:WBrEMkgAfAGO1LUcGOckBl5O726KPp+OlkKug0I/FEY= 77 | github.com/itchyny/timefmt-go v0.1.6 h1:ia3s54iciXDdzWzwaVKXZPbiXzxxnv1SPGFfM/myJ5Q= 78 | github.com/itchyny/timefmt-go v0.1.6/go.mod h1:RRDZYC5s9ErkjQvTvvU7keJjxUYzIISJGxm9/mAERQg= 79 | github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= 80 | github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= 81 | github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= 82 | github.com/klauspost/asmfmt v1.3.2 h1:4Ri7ox3EwapiOjCki+hw14RyKk201CN4rzyCJRFLpK4= 83 | github.com/klauspost/asmfmt v1.3.2/go.mod h1:AG8TuvYojzulgDAMCnYn50l/5QV3Bs/tp6j0HLHbNSE= 84 | github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= 85 | github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= 86 | github.com/klauspost/cpuid/v2 v2.2.10 h1:tBs3QSyvjDyFTq3uoc/9xFpCuOsJQFNPiAhYdw2skhE= 87 | github.com/klauspost/cpuid/v2 v2.2.10/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0= 88 | github.com/klauspost/pgzip v1.2.6 h1:8RXeL5crjEUFnR2/Sn6GJNWtSQ3Dk8pq4CL3jvdDyjU= 89 | github.com/klauspost/pgzip v1.2.6/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs= 90 | github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= 91 | github.com/kr/pretty 
v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= 92 | github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= 93 | github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= 94 | github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= 95 | github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= 96 | github.com/matoous/go-nanoid/v2 v2.1.0 h1:P64+dmq21hhWdtvZfEAofnvJULaRR1Yib0+PnU669bE= 97 | github.com/matoous/go-nanoid/v2 v2.1.0/go.mod h1:KlbGNQ+FhrUNIHUxZdL63t7tl4LaPkZNpUULS8H4uVM= 98 | github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= 99 | github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= 100 | github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= 101 | github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= 102 | github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 h1:AMFGa4R4MiIpspGNG7Z948v4n35fFGB3RR3G/ry4FWs= 103 | github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8/go.mod h1:mC1jAcsrzbxHt8iiaC+zU4b1ylILSosueou12R++wfY= 104 | github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 h1:+n/aFZefKZp7spd8DFdX7uMikMLXX4oubIzJF4kv/wI= 105 | github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3/go.mod h1:RagcQ7I8IeTMnF8JTXieKnO4Z6JCsikNEzj0DwauVzE= 106 | github.com/nsf/jsondiff v0.0.0-20210926074059-1e845ec5d249 h1:NHrXEjTNQY7P0Zfx1aMrNhpgxHmow66XQtm0aQLY0AE= 107 | github.com/nsf/jsondiff v0.0.0-20210926074059-1e845ec5d249/go.mod h1:mpRZBD8SJ55OIICQ3iWH0Yz3cjzA61JdqMLoWXeB2+8= 108 | github.com/pierrec/lz4/v4 v4.1.22 h1:cKFw6uJDK+/gfw5BcDL0JL5aBsAFdsIT18eRtLj7VIU= 109 | github.com/pierrec/lz4/v4 v4.1.22/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= 110 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 111 | github.com/pmezard/go-difflib 
v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 112 | github.com/quipo/dependencysolver v0.0.0-20170801134659-2b009cb4ddcc h1:hK577yxEJ2f5s8w2iy2KimZmgrdAUZUNftE1ESmg2/Q= 113 | github.com/quipo/dependencysolver v0.0.0-20170801134659-2b009cb4ddcc/go.mod h1:OQt6Zo5B3Zs+C49xul8kcHo+fZ1mCLPvd0LFxiZ2DHc= 114 | github.com/rcrowley/go-metrics v0.0.0-20201227073835-cf1acfcdf475 h1:N/ElC8H3+5XpJzTSTfLsJV/mx9Q9g7kxmchpfZyxgzM= 115 | github.com/rcrowley/go-metrics v0.0.0-20201227073835-cf1acfcdf475/go.mod h1:bCqnVzQkZxMG4s8nGwiZ5l3QUCyqpo9Y+/ZMZ9VjZe4= 116 | github.com/redpanda-data/benthos/v4 v4.52.0 h1:D47ayRCWxtFN0vrZvQo62T0L5S0+rVyrMj8H0R54UbQ= 117 | github.com/redpanda-data/benthos/v4 v4.52.0/go.mod h1:NGzOuISEVc8wNsf8Xn/6jyBR/ss2J0Okw+ZYNvzl+ak= 118 | github.com/rickb777/period v1.0.14 h1:Ucj/lTa3QwpuXFP9JqOitbmtibCkQsuxq8lLOf3GEBY= 119 | github.com/rickb777/period v1.0.14/go.mod h1:eDPQSeeG0c6g2Fz8/42+VDBttXNCV6TwVe8Magn2IgM= 120 | github.com/rickb777/plural v1.4.4 h1:OpZU8uRr9P2NkYAbkLMwlKNVJyJ5HvRcRBFyXGJtKGI= 121 | github.com/rickb777/plural v1.4.4/go.mod h1:DB19dtrplGS5s6VJVHn7tvmFYPoE83p1xqio3oVnNRM= 122 | github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs= 123 | github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro= 124 | github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= 125 | github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= 126 | github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= 127 | github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= 128 | github.com/segmentio/ksuid v1.0.4 h1:sBo2BdShXjmcugAMwjugoGUdUV0pcxY5mW4xKRn3v4c= 129 | github.com/segmentio/ksuid v1.0.4/go.mod h1:/XUiZBD3kVx5SmUOl55voK5yeAbBNNIed+2O73XgrPE= 130 | github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= 131 | 
github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= 132 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 133 | github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= 134 | github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= 135 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= 136 | github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= 137 | github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= 138 | github.com/tilinna/z85 v1.0.0 h1:uqFnJBlD01dosSeo5sK1G1YGbPuwqVHqR+12OJDRjUw= 139 | github.com/tilinna/z85 v1.0.0/go.mod h1:EfpFU/DUY4ddEy6CRvk2l+UQNEzHbh+bqBQS+04Nkxs= 140 | github.com/urfave/cli/v2 v2.27.6 h1:VdRdS98FNhKZ8/Az8B7MTyGQmpIr36O1EHybx/LaZ4g= 141 | github.com/urfave/cli/v2 v2.27.6/go.mod h1:3Sevf16NykTbInEnD0yKkjDAeZDS0A6bzhBH5hrMvTQ= 142 | github.com/wk8/go-ordered-map/v2 v2.1.8 h1:5h/BUHu93oj4gIdvHHHGsScSTMijfx5PeYkE/fJgbpc= 143 | github.com/wk8/go-ordered-map/v2 v2.1.8/go.mod h1:5nJHM5DyteebpVlHnWMV0rPz6Zp7+xBAnxjb1X5vnTw= 144 | github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f/go.mod h1:N2zxlSyiKSe5eX1tZViRH5QA0qijqEDrYZiPEAiq3wU= 145 | github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb h1:zGWFAtiMcyryUHoUjUJX0/lt1H2+i2Ka2n+D3DImSNo= 146 | github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb/go.mod h1:N2zxlSyiKSe5eX1tZViRH5QA0qijqEDrYZiPEAiq3wU= 147 | github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 h1:EzJWgHovont7NscjpAxXsDA8S8BMYve8Y5+7cuRE7R0= 148 | github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415/go.mod h1:GwrjFmJcFw6At/Gs6z4yjiIwzuJ1/+UwLxMQDVQXShQ= 149 | github.com/xeipuuv/gojsonschema v1.2.0 h1:LhYJRs+L4fBtjZUfuSZIKGeVu0QRy8e5Xi7D17UxZ74= 150 | github.com/xeipuuv/gojsonschema v1.2.0/go.mod 
h1:anYRn/JVcOK2ZgGU+IjEV4nwlhoK5sQluxsYJ78Id3Y= 151 | github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 h1:gEOO8jv9F4OT7lGCjxCBTO/36wtF6j2nSip77qHd4x4= 152 | github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM= 153 | github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU= 154 | github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E= 155 | github.com/youmark/pkcs8 v0.0.0-20201027041543-1326539a0a0a h1:fZHgsYlfvtyqToslyjUt3VOPF4J7aK/3MPcK7xp3PDk= 156 | github.com/youmark/pkcs8 v0.0.0-20201027041543-1326539a0a0a/go.mod h1:ul22v+Nro/R083muKhosV54bj5niojjWZvU8xrevuH4= 157 | github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ= 158 | github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0= 159 | github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= 160 | github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA= 161 | go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= 162 | go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= 163 | go.opentelemetry.io/otel v1.36.0 h1:UumtzIklRBY6cI/lllNZlALOF5nNIzJVb16APdvgTXg= 164 | go.opentelemetry.io/otel v1.36.0/go.mod h1:/TcFMXYjyRNh8khOAO9ybYkqaDBb/70aVwkNML4pP8E= 165 | go.opentelemetry.io/otel/metric v1.36.0 h1:MoWPKVhQvJ+eeXWHFBOPoBOi20jh6Iq2CcCREuTYufE= 166 | go.opentelemetry.io/otel/metric v1.36.0/go.mod h1:zC7Ks+yeyJt4xig9DEw9kuUFe5C3zLbVjV2PzT6qzbs= 167 | go.opentelemetry.io/otel/sdk v1.34.0 h1:95zS4k/2GOy069d321O8jWgYsW3MzVV+KuSPKp7Wr1A= 168 | go.opentelemetry.io/otel/sdk v1.34.0/go.mod h1:0e/pNiaMAqaykJGKbi+tSjWfNNHMTxoC9qANsCzbyxU= 169 | go.opentelemetry.io/otel/sdk/metric v1.34.0 h1:5CeK9ujjbFVL5c1PhLuStg1wxA7vQv7ce1EK0Gyvahk= 170 | go.opentelemetry.io/otel/sdk/metric v1.34.0/go.mod 
h1:jQ/r8Ze28zRKoNRdkjCZxfs6YvBTG1+YIqyFVFYec5w= 171 | go.opentelemetry.io/otel/trace v1.36.0 h1:ahxWNuqZjpdiFAyrIoQ4GIiAIhxAunQR6MUoKrsNd4w= 172 | go.opentelemetry.io/otel/trace v1.36.0/go.mod h1:gQ+OnDZzrybY4k4seLzPAWNwVBBVlF2szhehOBB/tGA= 173 | golang.org/x/crypto v0.38.0 h1:jt+WWG8IZlBnVbomuhg2Mdq0+BBQaHbtqHEFEigjUV8= 174 | golang.org/x/crypto v0.38.0/go.mod h1:MvrbAqul58NNYPKnOra203SB9vpuZW0e+RRZV+Ggqjw= 175 | golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 h1:e66Fs6Z+fZTbFBAxKfP3PALWBtpfqks2bwGcexMxgtk= 176 | golang.org/x/exp v0.0.0-20240909161429-701f63a606c0/go.mod h1:2TbTHSBQa924w8M6Xs1QcRcFwyucIwBGpK1p2f1YFFY= 177 | golang.org/x/mod v0.24.0 h1:ZfthKaKaT4NrhGVZHO1/WDTwGES4De8KtWO0SIbNJMU= 178 | golang.org/x/mod v0.24.0/go.mod h1:IXM97Txy2VM4PJ3gI61r1YEk/gAj6zAHN3AdZt6S9Ww= 179 | golang.org/x/net v0.39.0 h1:ZCu7HMWDxpXpaiKdhzIfaltL9Lp31x/3fCP11bc6/fY= 180 | golang.org/x/net v0.39.0/go.mod h1:X7NRbYVEA+ewNkCNyJ513WmMdQ3BineSwVtN2zD/d+E= 181 | golang.org/x/sync v0.14.0 h1:woo0S4Yywslg6hp4eUFjTVOyKt0RookbpAHG4c1HmhQ= 182 | golang.org/x/sync v0.14.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= 183 | golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= 184 | golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= 185 | golang.org/x/text v0.25.0 h1:qVyWApTSYLk/drJRO5mDlNYskwQznZmkpV2c8q9zls4= 186 | golang.org/x/text v0.25.0/go.mod h1:WEdwpYrmk1qmdHvhkSTNPm3app7v4rsT8F2UD6+VHIA= 187 | golang.org/x/tools v0.32.0 h1:Q7N1vhpkQv7ybVzLFtTjvQya2ewbwNDZzUgfXGqtMWU= 188 | golang.org/x/tools v0.32.0/go.mod h1:ZxrU41P/wAbZD8EDa6dDCa6XfpkhJ7HFMjHJXfBDu8s= 189 | golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da h1:noIWHXmPHxILtqtCOPIhSt0ABwskkZKjD3bXGnZGpNY= 190 | golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da/go.mod h1:NDW/Ps6MPRej6fsCIbMTohpP40sJ/P/vI1MoTEGwX90= 191 | gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= 192 | gonum.org/v1/gonum v0.16.0/go.mod 
h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= 193 | google.golang.org/genproto/googleapis/rpc v0.0.0-20250218202821-56aae31c358a h1:51aaUVRocpvUOSQKM6Q7VuoaktNIaMCLuhZB6DKksq4= 194 | google.golang.org/genproto/googleapis/rpc v0.0.0-20250218202821-56aae31c358a/go.mod h1:uRxBH1mhmO8PGhU89cMcHaXKZqO+OfakD8QQO0oYwlQ= 195 | google.golang.org/grpc v1.72.0 h1:S7UkcVa60b5AAQTaO6ZKamFp1zMZSU0fGDK2WZLbBnM= 196 | google.golang.org/grpc v1.72.0/go.mod h1:wH5Aktxcg25y1I3w7H69nHfXdOG3UiadoBtjh3izSDM= 197 | google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= 198 | google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= 199 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 200 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= 201 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= 202 | gopkg.in/natefinch/lumberjack.v2 v2.2.1 h1:bBRl1b0OH9s/DuPhuXpNl+VtCaJXFZ5/uEFST95x9zc= 203 | gopkg.in/natefinch/lumberjack.v2 v2.2.1/go.mod h1:YD8tP3GAjkrDg1eZH7EGmyESg/lsYskCTPBJVb9jqSc= 204 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 205 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 206 | -------------------------------------------------------------------------------- /json2parquet/.gitignore: -------------------------------------------------------------------------------- 1 | *.json 2 | *.parquet -------------------------------------------------------------------------------- /json2parquet/cmd/.gitignore: -------------------------------------------------------------------------------- 1 | *.json 2 | *.parquet -------------------------------------------------------------------------------- /json2parquet/cmd/cleaner/main.go: 
-------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "flag" 6 | "fmt" 7 | "log" 8 | "os" 9 | "path/filepath" 10 | 11 | "github.com/goccy/go-json" 12 | 13 | "github.com/redpanda-data/benthos/v4/public/bloblang" 14 | ) 15 | 16 | // jcleaner takes as input a JSONL file, and removes all null fields, empty arrays, 17 | // empty objects and empty strings. 18 | func main() { 19 | inputFile := flag.String("in", "", "input file") 20 | outputFile := flag.String("out", "", "output file") 21 | flag.Parse() 22 | if *inputFile == "" { 23 | log.Fatal("no input file specified") 24 | } 25 | if *outputFile == "" { 26 | log.Fatal("no output file specified") 27 | } 28 | problemLines := fileNameWithoutExt(*outputFile) + "_problem.json" 29 | f, err := os.Open(*inputFile) 30 | if err != nil { 31 | panic(err) 32 | } 33 | defer func() { 34 | if r := recover(); r != nil { 35 | fmt.Println(err) 36 | } 37 | }() 38 | defer f.Close() 39 | bloblangMapping := `map remove_null_empty { 40 | root = match { 41 | (this.type() == "object" && this.length() == 0) => deleted() 42 | this.type() == "object" => this.map_each(i -> i.value.apply("remove_null_empty")) 43 | (this.type() == "array" && this.length() == 0) => deleted() 44 | this.type() == "array" => this.map_each(v -> v.apply("remove_null_empty")) 45 | this.type() == "null" => deleted() 46 | this.type() == "string" && this.length() == 0 => deleted() 47 | } 48 | } 49 | root = this.apply("remove_null_empty")` 50 | exe, err := bloblang.Parse(bloblangMapping) 51 | if err != nil { 52 | log.Println(err) 53 | } 54 | 55 | nf, err := os.Create(*outputFile) 56 | if err != nil { 57 | panic(err) 58 | } 59 | defer nf.Close() 60 | w := bufio.NewWriterSize(nf, 1024*4) 61 | 62 | pf, err := os.Create(problemLines) 63 | if err != nil { 64 | panic(err) 65 | } 66 | defer pf.Close() 67 | pw := bufio.NewWriterSize(nf, 1024*4) 68 | 69 | r := bufio.NewReaderSize(f, 1024*4) 70 | s := 
bufio.NewScanner(r) 71 | newline := []byte("\n") 72 | for s.Scan() { 73 | y := s.Bytes() 74 | b, err := ApplyBloblangMapping(y, exe) 75 | if err != nil { 76 | pw.Write(y) 77 | pw.Write(newline) 78 | continue 79 | } 80 | _, err = w.Write(b) 81 | if err != nil { 82 | pw.Write(y) 83 | pw.Write(newline) 84 | continue 85 | } 86 | w.Write(newline) 87 | } 88 | w.Flush() 89 | } 90 | 91 | func ApplyBloblangMapping(jsonInput []byte, exe *bloblang.Executor) ([]byte, error) { 92 | // Parse the JSON input into a map[string]interface{} 93 | var inputMap map[string]interface{} 94 | if err := json.Unmarshal(jsonInput, &inputMap); err != nil { 95 | return nil, err 96 | } 97 | 98 | // Execute the Bloblang mapping 99 | res, err := exe.Query(inputMap) 100 | if err != nil { 101 | return nil, err 102 | } 103 | 104 | // Convert the result back into a JSON string 105 | jsonResult, err := json.Marshal(res) 106 | if err != nil { 107 | return nil, err 108 | } 109 | 110 | return jsonResult, nil 111 | } 112 | 113 | func fileNameWithoutExt(fileName string) string { 114 | return fileName[:len(fileName)-len(filepath.Ext(fileName))] 115 | } 116 | -------------------------------------------------------------------------------- /json2parquet/cmd/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "log" 7 | "os" 8 | "runtime/pprof" 9 | 10 | "github.com/loicalleyne/bodkin" 11 | j2p "github.com/loicalleyne/bodkin/json2parquet" 12 | ) 13 | 14 | var cpuprofile = flag.String("cpuprofile", "default.pgo", "write cpu profile to `file`") 15 | 16 | func main() { 17 | inferMode := flag.Bool("infer_timeunits", true, "Infer date, time and timestamps from strings") 18 | quotedValuesAreStrings := flag.Bool("quoted_values_are_strings", false, "Treat quoted bool, float and integer values as strings") 19 | withTypeConversion := flag.Bool("type_conversion", false, "upgrade field types if data changes") 20 | inputFile := 
flag.String("in", "s.json", "input file") 21 | outputFile := flag.String("out", "t.parquet", "output file") 22 | dryRun := flag.Bool("n", false, "only print the schema") 23 | lines := flag.Int("lines", 0, "number of lines from which to infer schema; 0 means whole file is scanned") 24 | flag.Parse() 25 | if *inputFile == "" { 26 | log.Fatal("no input file specified") 27 | } 28 | log.Println("detecting schema") 29 | if *cpuprofile != "" { 30 | f, err := os.Create(*cpuprofile) 31 | if err != nil { 32 | log.Fatal("could not create CPU profile: ", err) 33 | } 34 | defer f.Close() 35 | if err := pprof.StartCPUProfile(f); err != nil { 36 | log.Fatal("could not start CPU profile: ", err) 37 | } 38 | defer pprof.StopCPUProfile() 39 | defer log.Printf("program ended\nto view profile run 'go tool pprof -http localhost:8080 %s\n", *cpuprofile) 40 | } 41 | var opts []bodkin.Option 42 | if *inferMode { 43 | opts = append(opts, bodkin.WithInferTimeUnits()) 44 | } 45 | if *withTypeConversion { 46 | opts = append(opts, bodkin.WithTypeConversion()) 47 | } 48 | if *quotedValuesAreStrings { 49 | opts = append(opts, bodkin.WithQuotedValuesAreStrings()) 50 | } 51 | if *lines != 0 { 52 | opts = append(opts, bodkin.WithMaxCount(*lines)) 53 | } 54 | arrowSchema, n, err := j2p.SchemaFromFile(*inputFile, opts...) 
55 | if err == bodkin.ErrInvalidInput { 56 | fmt.Printf("schema creation error %v\n", err) 57 | } 58 | if arrowSchema == nil { 59 | log.Fatal("nil schema") 60 | } 61 | log.Printf("schema from %d records\n", n) 62 | fmt.Println(arrowSchema.String()) 63 | if !*dryRun { 64 | if *outputFile == "" { 65 | log.Fatal("no output file specified") 66 | } 67 | log.Println("starting conversion to parquet") 68 | 69 | n, err = j2p.RecordsFromFile(*inputFile, *outputFile, arrowSchema, nil) 70 | log.Printf("%d records written", n) 71 | if err != nil { 72 | log.Printf("parquet error: %v", err) 73 | } 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /json2parquet/json2parquet.go: -------------------------------------------------------------------------------- 1 | package json2parquet 2 | 3 | import ( 4 | "bufio" 5 | "errors" 6 | "fmt" 7 | "io" 8 | "os" 9 | 10 | "github.com/apache/arrow-go/v18/arrow" 11 | "github.com/apache/arrow-go/v18/arrow/array" 12 | "github.com/apache/arrow-go/v18/parquet" 13 | "github.com/loicalleyne/bodkin" 14 | "github.com/loicalleyne/bodkin/pq" 15 | ) 16 | 17 | func FromReader(r io.Reader, opts ...bodkin.Option) (*arrow.Schema, int, error) { 18 | var err error 19 | s := bufio.NewScanner(r) 20 | u := bodkin.NewBodkin(opts...) 21 | for s.Scan() { 22 | u.Unify(s.Bytes()) 23 | if u.Count() > u.MaxCount() { 24 | break 25 | } 26 | } 27 | schema, err := u.Schema() 28 | if err != nil { 29 | return nil, u.Count(), err 30 | } 31 | return schema, u.Count(), err 32 | } 33 | 34 | func SchemaFromFile(inputFile string, opts ...bodkin.Option) (*arrow.Schema, int, error) { 35 | f, err := os.Open(inputFile) 36 | if err != nil { 37 | return nil, 0, err 38 | } 39 | defer f.Close() 40 | 41 | r := bufio.NewReaderSize(f, 1024*32) 42 | return FromReader(r, opts...) 
43 | } 44 | 45 | func RecordsFromFile(inputFile, outputFile string, schema *arrow.Schema, munger func(io.Reader, io.Writer) error, opts ...parquet.WriterProperty) (int, error) { 46 | n := 0 47 | f, err := os.Open(inputFile) 48 | if err != nil { 49 | return 0, err 50 | } 51 | defer func() { 52 | if r := recover(); r != nil { 53 | fmt.Println(err) 54 | fmt.Println("Records:", n) 55 | } 56 | }() 57 | defer f.Close() 58 | var prp *parquet.WriterProperties = pq.DefaultWrtp 59 | if len(opts) != 0 { 60 | prp = parquet.NewWriterProperties(opts...) 61 | } 62 | pw, _, err := pq.NewParquetWriter(schema, prp, outputFile) 63 | if err != nil { 64 | return 0, err 65 | } 66 | defer pw.Close() 67 | 68 | var r io.Reader 69 | var rdr *array.JSONReader 70 | chunk := 1024 71 | munger = nil 72 | r = bufio.NewReaderSize(f, 1024*1024*128) 73 | if munger != nil { 74 | pr, pwr := io.Pipe() 75 | 76 | go func() { 77 | // close the writer, so the reader knows there's no more data 78 | defer pwr.Close() 79 | munger(r, pwr) 80 | }() 81 | rdr = array.NewJSONReader(pr, schema, array.WithChunk(chunk)) 82 | } else { 83 | rdr = array.NewJSONReader(r, schema, array.WithChunk(chunk)) 84 | } 85 | 86 | defer rdr.Release() 87 | 88 | for rdr.Next() { 89 | rec := rdr.Record() 90 | err1 := pw.WriteRecord(rec) 91 | if err != nil { 92 | err = errors.Join(err, fmt.Errorf("failed to write parquet record: %v", err1)) 93 | } 94 | n = n + chunk 95 | } 96 | if err := rdr.Err(); err != nil { 97 | return n, err 98 | } 99 | err = pw.Close() 100 | if err != nil { 101 | return n, err 102 | } 103 | return n, err 104 | } 105 | -------------------------------------------------------------------------------- /option.go: -------------------------------------------------------------------------------- 1 | package bodkin 2 | 3 | import ( 4 | "bufio" 5 | "io" 6 | ) 7 | 8 | // WithInferTimeUnits() enables scanning input string values for time, date and timestamp types. 
9 | // 10 | // Times use a format of HH:MM or HH:MM:SS[.zzz] where the fractions of a second cannot 11 | // exceed the precision allowed by the time unit, otherwise unmarshalling will error. 12 | // 13 | // Dates use YYYY-MM-DD format. 14 | // 15 | // Timestamps use RFC3339Nano format except without a timezone, all of the following are valid: 16 | // 17 | // YYYY-MM-DD 18 | // YYYY-MM-DD[T]HH 19 | // YYYY-MM-DD[T]HH:MM 20 | // YYYY-MM-DD[T]HH:MM:SS[.zzzzzzzzzz] 21 | func WithInferTimeUnits() Option { 22 | return func(cfg config) { 23 | cfg.inferTimeUnits = true 24 | } 25 | } 26 | 27 | // WithTypeConversion enables upgrading the column types to fix compatibilty conflicts. 28 | func WithTypeConversion() Option { 29 | return func(cfg config) { 30 | cfg.typeConversion = true 31 | } 32 | } 33 | 34 | // WithTypeConversion enables upgrading the column types to fix compatibilty conflicts. 35 | func WithQuotedValuesAreStrings() Option { 36 | return func(cfg config) { 37 | cfg.quotedValuesAreStrings = true 38 | } 39 | } 40 | 41 | // WithMaxCount enables capping the number of Unify evaluations. 42 | func WithMaxCount(i int) Option { 43 | return func(cfg config) { 44 | cfg.maxCount = i 45 | } 46 | } 47 | 48 | // WithIOReader provides an io.Reader for a Bodkin to use with UnifyScan(), along 49 | // with a delimiter to use to split datum in the data stream. 50 | // Default delimiter '\n' if delimiter is not provided. 
51 | func WithIOReader(r io.Reader, delim byte) Option { 52 | return func(cfg config) { 53 | cfg.rr = r 54 | cfg.br = bufio.NewReaderSize(cfg.rr, 1024*16) 55 | if delim != '\n' { 56 | cfg.delim = delim 57 | } 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /pq/parquet_writer.go: -------------------------------------------------------------------------------- 1 | package pq 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | 7 | "github.com/apache/arrow-go/v18/arrow" 8 | "github.com/apache/arrow-go/v18/arrow/array" 9 | "github.com/apache/arrow-go/v18/arrow/memory" 10 | "github.com/apache/arrow-go/v18/parquet" 11 | "github.com/apache/arrow-go/v18/parquet/compress" 12 | "github.com/apache/arrow-go/v18/parquet/pqarrow" 13 | "github.com/apache/arrow-go/v18/parquet/schema" 14 | ) 15 | 16 | const ( 17 | defaultRowGroupByteLimit = 10 * 1024 * 1024 18 | ) 19 | 20 | var ( 21 | DefaultWrtp = parquet.NewWriterProperties( 22 | parquet.WithDictionaryDefault(true), 23 | parquet.WithVersion(parquet.V2_LATEST), 24 | parquet.WithCompression(compress.Codecs.Zstd), 25 | parquet.WithStats(true), 26 | parquet.WithRootName("bodkin"), 27 | ) 28 | ) 29 | 30 | type ParquetWriter struct { 31 | destFile *os.File 32 | pqwrt *pqarrow.FileWriter 33 | sc *arrow.Schema 34 | count int 35 | } 36 | 37 | // NewParquetWriter creates a new ParquetWriter. 38 | // 39 | // sc is the Arrow schema to use for writing records. 40 | // wrtp are the Parquet writer properties to use. 41 | // 42 | // Returns a ParquetWriter and an error. The error will be non-nil if: 43 | // - Failed to get the Parquet schema from the Arrow schema. 44 | // - Failed to create the destination file. 45 | // - Failed to create the Parquet file writer. 
46 | // 47 | // Example: 48 | // ```go 49 | // pw, err := NewParquetWriter(schema, parquet.NewWriterProperties(parquet.WithCompression(parquet.CompressionCodec_SNAPPY))) 50 | // 51 | // if err != nil { 52 | // log.Fatal(err) 53 | // } 54 | // 55 | // ``` 56 | func NewParquetWriter(sc *arrow.Schema, wrtp *parquet.WriterProperties, path string) (*ParquetWriter, *schema.Schema, error) { 57 | pqschema, err := pqarrow.ToParquet(sc, wrtp, pqarrow.DefaultWriterProps()) 58 | if err != nil { 59 | return nil, nil, fmt.Errorf("failed to get parquet schema: %w", err) 60 | } 61 | 62 | destFile, err := os.Create(path) 63 | if err != nil { 64 | return nil, nil, fmt.Errorf("failed to create destination file: %w", err) 65 | } 66 | artp := pqarrow.NewArrowWriterProperties(pqarrow.WithStoreSchema()) 67 | pqwrt, err := pqarrow.NewFileWriter(sc, destFile, wrtp, artp) 68 | if err != nil { 69 | return nil, nil, fmt.Errorf("failed to create parquet writer: %w", err) 70 | } 71 | 72 | return &ParquetWriter{destFile: destFile, pqwrt: pqwrt, sc: sc}, pqschema, nil 73 | } 74 | 75 | // Write writes a single record to the Parquet file. 76 | // 77 | // jsonData is the JSON encoded record data. 78 | // 79 | // Returns an error if: 80 | // - Failed to unmarshal the JSON data. 81 | // - Failed to write the record to Parquet. 82 | // 83 | // Increments the record count and creates a new row group if the current 84 | // row group exceeds the default row group byte limit. 
85 | // 86 | // Example: 87 | // ```go 88 | // err := pw.Write([]byte(`{"id":1,"name":"foo"}`)) 89 | // 90 | // if err != nil { 91 | // log.Fatal(err) 92 | // } 93 | // 94 | // ``` 95 | func (pw *ParquetWriter) Write(jsonData []byte) error { 96 | recbld := array.NewRecordBuilder(memory.DefaultAllocator, pw.sc) 97 | defer recbld.Release() 98 | 99 | err := recbld.UnmarshalJSON(jsonData) 100 | if err != nil { 101 | return fmt.Errorf("failed to unmarshal JSON: %w", err) 102 | } 103 | 104 | rec := recbld.NewRecord() 105 | defer rec.Release() 106 | err = pw.pqwrt.WriteBuffered(rec) 107 | if err != nil { 108 | return fmt.Errorf("failed to write to parquet: %w", err) 109 | } 110 | 111 | if pw.pqwrt.RowGroupTotalBytesWritten() >= defaultRowGroupByteLimit { 112 | pw.pqwrt.NewBufferedRowGroup() 113 | } 114 | pw.count++ 115 | 116 | return nil 117 | } 118 | 119 | func (pw *ParquetWriter) WriteRecord(rec arrow.Record) error { 120 | err := pw.pqwrt.WriteBuffered(rec) 121 | if err != nil { 122 | return fmt.Errorf("failed to write to parquet: %w", err) 123 | } 124 | 125 | if pw.pqwrt.RowGroupTotalBytesWritten() >= defaultRowGroupByteLimit { 126 | pw.pqwrt.NewBufferedRowGroup() 127 | } 128 | pw.count++ 129 | 130 | return nil 131 | } 132 | 133 | // RecordCount returns the total number of records written. 134 | func (pw *ParquetWriter) RecordCount() int { 135 | return pw.count 136 | } 137 | 138 | // Close closes the Parquet writer. 139 | // 140 | // Returns an error if failed to close the Parquet file writer. 
141 | func (pw *ParquetWriter) Close() error { 142 | if err := pw.pqwrt.Close(); err != nil { 143 | return fmt.Errorf("failed to close parquet writer: %w", err) 144 | } 145 | 146 | return nil 147 | } 148 | -------------------------------------------------------------------------------- /reader/.gitignore: -------------------------------------------------------------------------------- 1 | avro.go -------------------------------------------------------------------------------- /reader/encoder.go: -------------------------------------------------------------------------------- 1 | package reader 2 | 3 | import ( 4 | "encoding" 5 | "errors" 6 | "fmt" 7 | "reflect" 8 | "strings" 9 | 10 | "github.com/go-viper/mapstructure/v2" 11 | ) 12 | 13 | const ( 14 | tagNameMapStructure = "mapstructure" 15 | optionSeparator = "," 16 | optionOmitEmpty = "omitempty" 17 | optionSquash = "squash" 18 | optionRemain = "remain" 19 | optionSkip = "-" 20 | ) 21 | 22 | var ( 23 | errNonStringEncodedKey = errors.New("non string-encoded key") 24 | ) 25 | 26 | // tagInfo stores the mapstructure tag details. 27 | type tagInfo struct { 28 | name string 29 | omitEmpty bool 30 | squash bool 31 | } 32 | 33 | // An Encoder takes structured data and converts it into an 34 | // interface following the mapstructure tags. 35 | type Encoder struct { 36 | config *EncoderConfig 37 | } 38 | 39 | // EncoderConfig is the configuration used to create a new encoder. 40 | type EncoderConfig struct { 41 | // EncodeHook, if set, is a way to provide custom encoding. It 42 | // will be called before structs and primitive types. 43 | EncodeHook mapstructure.DecodeHookFunc 44 | } 45 | 46 | // New returns a new encoder for the configuration. 47 | func New(cfg *EncoderConfig) *Encoder { 48 | return &Encoder{config: cfg} 49 | } 50 | 51 | // Encode takes the input and uses reflection to encode it to 52 | // an interface based on the mapstructure spec. 
53 | func (e *Encoder) Encode(input any) (any, error) { 54 | return e.encode(reflect.ValueOf(input)) 55 | } 56 | 57 | // encode processes the value based on the reflect.Kind. 58 | func (e *Encoder) encode(value reflect.Value) (any, error) { 59 | if value.IsValid() { 60 | switch value.Kind() { 61 | case reflect.Interface, reflect.Ptr: 62 | return e.encode(value.Elem()) 63 | case reflect.Map: 64 | return e.encodeMap(value) 65 | case reflect.Slice: 66 | return e.encodeSlice(value) 67 | case reflect.Struct: 68 | return e.encodeStruct(value) 69 | default: 70 | return e.encodeHook(value) 71 | } 72 | } 73 | return nil, nil 74 | } 75 | 76 | // encodeHook calls the EncodeHook in the EncoderConfig with the value passed in. 77 | // This is called before processing structs and for primitive data types. 78 | func (e *Encoder) encodeHook(value reflect.Value) (any, error) { 79 | if e.config != nil && e.config.EncodeHook != nil { 80 | out, err := mapstructure.DecodeHookExec(e.config.EncodeHook, value, value) 81 | if err != nil { 82 | return nil, fmt.Errorf("error running encode hook: %w", err) 83 | } 84 | return out, nil 85 | } 86 | return value.Interface(), nil 87 | } 88 | 89 | // encodeStruct encodes the struct by iterating over the fields, getting the 90 | // mapstructure tagInfo for each exported field, and encoding the value. 91 | func (e *Encoder) encodeStruct(value reflect.Value) (any, error) { 92 | if value.Kind() != reflect.Struct { 93 | return nil, &reflect.ValueError{ 94 | Method: "encodeStruct", 95 | Kind: value.Kind(), 96 | } 97 | } 98 | out, err := e.encodeHook(value) 99 | if err != nil { 100 | return nil, err 101 | } 102 | value = reflect.ValueOf(out) 103 | // if the output of encodeHook is no longer a struct, 104 | // call encode against it. 
105 | if value.Kind() != reflect.Struct { 106 | return e.encode(value) 107 | } 108 | result := make(map[string]any) 109 | for i := 0; i < value.NumField(); i++ { 110 | field := value.Field(i) 111 | if field.CanInterface() { 112 | info := getTagInfo(value.Type().Field(i)) 113 | if (info.omitEmpty && field.IsZero()) || info.name == optionSkip { 114 | continue 115 | } 116 | encoded, err := e.encode(field) 117 | if err != nil { 118 | return nil, fmt.Errorf("error encoding field %q: %w", info.name, err) 119 | } 120 | if info.squash { 121 | if m, ok := encoded.(map[string]any); ok { 122 | for k, v := range m { 123 | result[k] = v 124 | } 125 | } 126 | } else { 127 | result[info.name] = encoded 128 | } 129 | } 130 | } 131 | return result, nil 132 | } 133 | 134 | // encodeSlice iterates over the slice and encodes each of the elements. 135 | func (e *Encoder) encodeSlice(value reflect.Value) (any, error) { 136 | if value.Kind() != reflect.Slice { 137 | return nil, &reflect.ValueError{ 138 | Method: "encodeSlice", 139 | Kind: value.Kind(), 140 | } 141 | } 142 | result := make([]any, value.Len()) 143 | for i := 0; i < value.Len(); i++ { 144 | var err error 145 | if result[i], err = e.encode(value.Index(i)); err != nil { 146 | return nil, fmt.Errorf("error encoding element in slice at index %d: %w", i, err) 147 | } 148 | } 149 | return result, nil 150 | } 151 | 152 | // encodeMap encodes a map by encoding the key and value. Returns errNonStringEncodedKey 153 | // if the key is not encoded into a string. 
154 | func (e *Encoder) encodeMap(value reflect.Value) (any, error) { 155 | if value.Kind() != reflect.Map { 156 | return nil, &reflect.ValueError{ 157 | Method: "encodeMap", 158 | Kind: value.Kind(), 159 | } 160 | } 161 | result := make(map[string]any) 162 | iterator := value.MapRange() 163 | for iterator.Next() { 164 | encoded, err := e.encode(iterator.Key()) 165 | if err != nil { 166 | return nil, fmt.Errorf("error encoding key: %w", err) 167 | } 168 | 169 | v := reflect.ValueOf(encoded) 170 | var key string 171 | 172 | switch v.Kind() { 173 | case reflect.String: 174 | key = v.String() 175 | default: 176 | return nil, fmt.Errorf("%w, key: %q, kind: %v, type: %T", errNonStringEncodedKey, iterator.Key().Interface(), iterator.Key().Kind(), encoded) 177 | } 178 | 179 | if _, ok := result[key]; ok { 180 | return nil, fmt.Errorf("duplicate key %q while encoding", key) 181 | } 182 | if result[key], err = e.encode(iterator.Value()); err != nil { 183 | return nil, fmt.Errorf("error encoding map value for key %q: %w", key, err) 184 | } 185 | } 186 | return result, nil 187 | } 188 | 189 | // getTagInfo looks up the mapstructure tag and uses that if available. 190 | // Uses the lowercase field if not found. Checks for omitempty and squash. 191 | func getTagInfo(field reflect.StructField) *tagInfo { 192 | info := tagInfo{} 193 | if tag, ok := field.Tag.Lookup(tagNameMapStructure); ok { 194 | options := strings.Split(tag, optionSeparator) 195 | info.name = options[0] 196 | if len(options) > 1 { 197 | for _, option := range options[1:] { 198 | switch option { 199 | case optionOmitEmpty: 200 | info.omitEmpty = true 201 | case optionSquash, optionRemain: 202 | info.squash = true 203 | } 204 | } 205 | } 206 | } else { 207 | info.name = strings.ToLower(field.Name) 208 | } 209 | return &info 210 | } 211 | 212 | // TextMarshalerHookFunc returns a DecodeHookFuncValue that checks 213 | // for the encoding.TextMarshaler interface and calls the MarshalText 214 | // function if found. 
215 | func TextMarshalerHookFunc() mapstructure.DecodeHookFuncValue { 216 | return func(from reflect.Value, _ reflect.Value) (any, error) { 217 | marshaler, ok := from.Interface().(encoding.TextMarshaler) 218 | if !ok { 219 | return from.Interface(), nil 220 | } 221 | out, err := marshaler.MarshalText() 222 | if err != nil { 223 | return nil, err 224 | } 225 | return string(out), nil 226 | } 227 | } 228 | -------------------------------------------------------------------------------- /reader/input.go: -------------------------------------------------------------------------------- 1 | package reader 2 | 3 | import ( 4 | "bytes" 5 | "errors" 6 | "fmt" 7 | 8 | "github.com/go-viper/mapstructure/v2" 9 | json "github.com/goccy/go-json" 10 | ) 11 | 12 | var ( 13 | ErrUndefinedInput = errors.New("nil input") 14 | ErrInvalidInput = errors.New("invalid input") 15 | ) 16 | 17 | // InputMap takes structured input data and attempts to decode it to 18 | // map[string]any. Input data can be json in string or []byte, or any other 19 | // Go data type which can be decoded by [MapStructure/v2]. 
20 | // [MapStructure/v2]: github.com/go-viper/mapstructure/v2 21 | func InputMap(a any) (map[string]any, error) { 22 | m := map[string]any{} 23 | switch input := a.(type) { 24 | case nil: 25 | return nil, ErrUndefinedInput 26 | case map[string]any: 27 | return input, nil 28 | case []byte: 29 | r := bytes.NewReader(input) 30 | d := json.NewDecoder(r) 31 | d.UseNumber() 32 | err := d.Decode(&m) 33 | if err != nil { 34 | return nil, fmt.Errorf("%v : %v", ErrInvalidInput, err) 35 | } 36 | case string: 37 | r := bytes.NewReader([]byte(input)) 38 | d := json.NewDecoder(r) 39 | d.UseNumber() 40 | err := d.Decode(&m) 41 | if err != nil { 42 | return nil, fmt.Errorf("%v : %v", ErrInvalidInput, err) 43 | } 44 | default: 45 | ms := New(&EncoderConfig{EncodeHook: mapstructure.RecursiveStructToMapHookFunc()}) 46 | enc, err := ms.Encode(a) 47 | if err != nil { 48 | return nil, fmt.Errorf("Error decoding to map[string]interface{}: %v", err) 49 | } 50 | return enc.(map[string]any), nil 51 | } 52 | return m, nil 53 | } 54 | -------------------------------------------------------------------------------- /reader/loader.go: -------------------------------------------------------------------------------- 1 | package reader 2 | 3 | import ( 4 | "bytes" 5 | "encoding/binary" 6 | "encoding/json" 7 | "errors" 8 | "fmt" 9 | "math/big" 10 | "strconv" 11 | "time" 12 | 13 | "github.com/apache/arrow-go/v18/arrow" 14 | "github.com/apache/arrow-go/v18/arrow/array" 15 | "github.com/apache/arrow-go/v18/arrow/decimal128" 16 | "github.com/apache/arrow-go/v18/arrow/decimal256" 17 | "github.com/apache/arrow-go/v18/arrow/extensions" 18 | "github.com/apache/arrow-go/v18/arrow/memory" 19 | ) 20 | 21 | type dataLoader struct { 22 | idx, depth int32 23 | list *fieldPos 24 | item *fieldPos 25 | mapField *fieldPos 26 | mapKey *fieldPos 27 | mapValue *fieldPos 28 | fields []*fieldPos 29 | children []*dataLoader 30 | } 31 | 32 | var ( 33 | ErrNullStructData = errors.New("null struct data") 34 | ) 35 | 36 | 
// newDataLoader returns an empty root loader node.
func newDataLoader() *dataLoader { return &dataLoader{idx: 0, depth: 0} }

// drawTree takes the tree of field builders produced by mapFieldBuilders()
// and produces another tree structure and aggregates fields whose values can
// be retrieved from a `map[string]any` into a slice of builders, and creates a hierarchy to
// deal with nested types (lists and maps).
func (d *dataLoader) drawTree(field *fieldPos) {
	for _, f := range field.children() {
		if f.isList || f.isMap {
			if f.isList {
				// A list gets its own child loader; childrens[0] is the element field.
				c := d.newListChild(f)
				if !f.childrens[0].isList {
					c.item = f.childrens[0]
					c.drawTree(f.childrens[0])
				} else {
					// List of lists: skip the intermediate element and recurse
					// into the inner list's element.
					c.drawTree(f.childrens[0].childrens[0])
				}
			}
			if f.isMap {
				// childrens[0] is the key field, childrens[1] the value field.
				c := d.newMapChild(f)
				if !arrow.IsNested(f.childrens[1].builder.Type().ID()) {
					c.mapKey = f.childrens[0]
					c.mapValue = f.childrens[1]
				} else {
					// Nested map values get their own loader level.
					c.mapKey = f.childrens[0]
					m := c.newChild()
					m.mapValue = f.childrens[1]
					m.drawTree(f.childrens[1])
				}
			}
		} else {
			// Flat (or struct) field: loaded directly at this level; struct
			// children are flattened into the same node's field slice.
			d.fields = append(d.fields, f)
			if len(f.children()) > 0 {
				d.drawTree(f)
			}
		}
	}
}

// loadDatum loads data to the schema fields' builder functions.
// Since array.StructBuilder.AppendNull() will recursively append null to all of the
// struct's fields, in the case of nil being passed to a struct's builderFunc it will
// return a ErrNullStructData error to signal that all its sub-fields can be skipped.
79 | func (d *dataLoader) loadDatum(data any) error { 80 | if d.list == nil && d.mapField == nil { 81 | if d.mapValue != nil { 82 | d.mapValue.appendFunc(data) 83 | } 84 | var NullParent *fieldPos 85 | for _, f := range d.fields { 86 | if f.parent == NullParent { 87 | continue 88 | } 89 | if d.mapValue == nil { 90 | err := f.appendFunc(f.getValue(data)) 91 | if err != nil { 92 | if err == ErrNullStructData { 93 | NullParent = f 94 | continue 95 | } 96 | return err 97 | } 98 | } else { 99 | switch dt := data.(type) { 100 | case nil: 101 | err := f.appendFunc(dt) 102 | if err != nil { 103 | if err == ErrNullStructData { 104 | NullParent = f 105 | continue 106 | } 107 | return err 108 | } 109 | case []any: 110 | if len(d.children) < 1 { 111 | for _, e := range dt { 112 | err := f.appendFunc(e) 113 | if err != nil { 114 | if err == ErrNullStructData { 115 | NullParent = f 116 | continue 117 | } 118 | return err 119 | } 120 | } 121 | } else { 122 | for _, e := range dt { 123 | d.children[0].loadDatum(e) 124 | } 125 | } 126 | case map[string]any: 127 | err := f.appendFunc(f.getValue(dt)) 128 | if err != nil { 129 | if err == ErrNullStructData { 130 | NullParent = f 131 | continue 132 | } 133 | return err 134 | } 135 | } 136 | 137 | } 138 | } 139 | for _, c := range d.children { 140 | if c.list != nil { 141 | c.loadDatum(c.list.getValue(data)) 142 | } 143 | if c.mapField != nil { 144 | switch dt := data.(type) { 145 | case nil: 146 | c.loadDatum(dt) 147 | case map[string]any: 148 | c.loadDatum(c.mapField.getValue(dt)) 149 | default: 150 | c.loadDatum(c.mapField.getValue(data)) 151 | } 152 | } 153 | } 154 | } else { 155 | if d.list != nil { 156 | switch dt := data.(type) { 157 | case nil: 158 | d.list.appendFunc(dt) 159 | case []any: 160 | d.list.appendFunc(dt) 161 | for _, e := range dt { 162 | if d.item != nil { 163 | d.item.appendFunc(e) 164 | } 165 | var NullParent *fieldPos 166 | for _, f := range d.fields { 167 | if f.parent == NullParent { 168 | continue 169 | } 170 
| err := f.appendFunc(f.getValue(e)) 171 | if err != nil { 172 | if err == ErrNullStructData { 173 | NullParent = f 174 | continue 175 | } 176 | return err 177 | } 178 | } 179 | for _, c := range d.children { 180 | if c.list != nil { 181 | c.loadDatum(c.list.getValue(e)) 182 | } 183 | if c.mapField != nil { 184 | c.loadDatum(c.mapField.getValue(e)) 185 | } 186 | } 187 | } 188 | case map[string]any: 189 | d.list.appendFunc(dt) // 190 | for _, e := range dt { // 191 | if d.item != nil { 192 | d.item.appendFunc(e) 193 | } 194 | var NullParent *fieldPos 195 | for _, f := range d.fields { 196 | if f.parent == NullParent { 197 | continue 198 | } 199 | err := f.appendFunc(f.getValue(e)) 200 | if err != nil { 201 | if err == ErrNullStructData { 202 | NullParent = f 203 | continue 204 | } 205 | return err 206 | } 207 | } 208 | for _, c := range d.children { 209 | c.loadDatum(c.list.getValue(e)) 210 | } 211 | } 212 | default: 213 | d.list.appendFunc(data) 214 | d.item.appendFunc(dt) 215 | } 216 | } 217 | if d.mapField != nil { 218 | switch dt := data.(type) { 219 | case nil: 220 | d.mapField.appendFunc(dt) 221 | case map[string]any: 222 | d.mapField.appendFunc(dt) 223 | for k, v := range dt { 224 | d.mapKey.appendFunc(k) 225 | if d.mapValue != nil { 226 | d.mapValue.appendFunc(v) 227 | } else { 228 | d.children[0].loadDatum(v) 229 | } 230 | } 231 | } 232 | } 233 | } 234 | return nil 235 | } 236 | 237 | func (d *dataLoader) newChild() *dataLoader { 238 | var child *dataLoader = &dataLoader{ 239 | depth: d.depth + 1, 240 | } 241 | d.children = append(d.children, child) 242 | return child 243 | } 244 | 245 | func (d *dataLoader) newListChild(list *fieldPos) *dataLoader { 246 | var child *dataLoader = &dataLoader{ 247 | list: list, 248 | item: list.childrens[0], 249 | depth: d.depth + 1, 250 | } 251 | d.children = append(d.children, child) 252 | return child 253 | } 254 | 255 | func (d *dataLoader) newMapChild(mapField *fieldPos) *dataLoader { 256 | var child *dataLoader = 
&dataLoader{ 257 | mapField: mapField, 258 | depth: d.depth + 1, 259 | } 260 | d.children = append(d.children, child) 261 | return child 262 | } 263 | 264 | type fieldPos struct { 265 | parent *fieldPos 266 | fieldName string 267 | builder array.Builder 268 | source DataSource 269 | path []string 270 | isList bool 271 | isItem bool 272 | isStruct bool 273 | isMap bool 274 | typeName string 275 | appendFunc func(val interface{}) error 276 | metadatas arrow.Metadata 277 | childrens []*fieldPos 278 | index, depth int32 279 | } 280 | 281 | func newFieldPos() *fieldPos { return &fieldPos{index: -1} } 282 | 283 | func (f *fieldPos) children() []*fieldPos { return f.childrens } 284 | 285 | func (f *fieldPos) newChild(childName string, childBuilder array.Builder, meta arrow.Metadata) *fieldPos { 286 | var child fieldPos = fieldPos{ 287 | parent: f, 288 | source: f.source, 289 | fieldName: childName, 290 | builder: childBuilder, 291 | metadatas: meta, 292 | index: int32(len(f.childrens)), 293 | depth: f.depth + 1, 294 | } 295 | if f.isList { 296 | child.isItem = true 297 | } 298 | child.path = child.buildNamePath() 299 | f.childrens = append(f.childrens, &child) 300 | return &child 301 | } 302 | 303 | func (f *fieldPos) buildNamePath() []string { 304 | var path []string 305 | 306 | cur := f 307 | for i := f.depth - 1; i >= 0; i-- { 308 | if cur.fieldName != "item" { 309 | path = append([]string{cur.fieldName}, path...) 310 | } else { 311 | break 312 | } 313 | 314 | if !cur.parent.isMap { 315 | cur = cur.parent 316 | } 317 | } 318 | if f.parent.parent != nil && f.parent.parent.isList { 319 | var listPath []string 320 | for i := len(path) - 1; i >= 0; i-- { 321 | if path[i] != "elem" { 322 | listPath = append([]string{path[i]}, listPath...) 
323 | } else { 324 | return listPath 325 | } 326 | } 327 | } 328 | if f.parent != nil && f.parent.fieldName == "item" { 329 | var listPath []string 330 | for i := len(path) - 1; i >= 0; i-- { 331 | if path[i] != "item" { 332 | listPath = append([]string{path[i]}, listPath...) 333 | } else { 334 | return listPath 335 | } 336 | } 337 | } 338 | // avro/arrow Maps ? 339 | // if f.parent != nil && f.parent.fieldName == "value" { 340 | // for i := len(path) - 1; i >= 0; i-- { 341 | // if path[i] != "value" { 342 | // listPath = append([]string{path[i]}, listPath...) 343 | // } else { 344 | // return listPath 345 | // } 346 | // } 347 | // } 348 | return path 349 | } 350 | 351 | // NamePath returns a slice of keys making up the path to the field 352 | func (f *fieldPos) namePath() []string { return f.path } 353 | 354 | // GetValue retrieves the value from the map[string]any 355 | // by following the field's key path 356 | func (f *fieldPos) getValue(m any) any { 357 | if _, ok := m.(map[string]any); !ok { 358 | return m 359 | } 360 | for _, key := range f.namePath() { 361 | valueMap, ok := m.(map[string]any) 362 | if !ok { 363 | if key == "item" { 364 | return m 365 | } 366 | return nil 367 | } 368 | m, ok = valueMap[key] 369 | if !ok { 370 | return nil 371 | } 372 | } 373 | return m 374 | } 375 | 376 | // Data is loaded to Arrow arrays using the following type mapping: 377 | // 378 | // Avro Go Arrow 379 | // null nil Null 380 | // boolean bool Boolean 381 | // bytes []byte Binary 382 | // float float32 Float32 383 | // double float64 Float64 384 | // long int64 Int64 385 | // int int32 Int32 386 | // string string String 387 | // array []interface{} List 388 | // enum string Dictionary 389 | // fixed []byte FixedSizeBinary 390 | // map and record map[string]any Struct 391 | // 392 | // mapFieldBuilders builds a tree of field builders matching the Arrow schema 393 | func mapFieldBuilders(b array.Builder, field arrow.Field, parent *fieldPos) { 394 | f := 
parent.newChild(field.Name, b, field.Metadata) 395 | switch bt := b.(type) { 396 | case *array.BinaryBuilder: 397 | f.appendFunc = func(data interface{}) error { 398 | appendBinaryData(bt, data, f.source) 399 | return nil 400 | } 401 | case *array.BinaryDictionaryBuilder: 402 | // has metadata for Avro enum symbols 403 | f.appendFunc = func(data interface{}) error { 404 | appendBinaryDictData(bt, data, f.source) 405 | return nil 406 | } 407 | // add Avro enum symbols to builder 408 | sb := array.NewStringBuilder(memory.DefaultAllocator) 409 | for _, v := range field.Metadata.Values() { 410 | sb.Append(v) 411 | } 412 | sa := sb.NewStringArray() 413 | bt.InsertStringDictValues(sa) 414 | case *array.BooleanBuilder: 415 | f.appendFunc = func(data interface{}) error { 416 | appendBoolData(bt, data, f.source) 417 | return nil 418 | } 419 | case *array.Date32Builder: 420 | f.appendFunc = func(data interface{}) error { 421 | appendDate32Data(bt, data, f.source) 422 | return nil 423 | } 424 | case *array.Decimal128Builder: 425 | f.appendFunc = func(data interface{}) error { 426 | err := appendDecimal128Data(bt, data, f.source) 427 | if err != nil { 428 | return err 429 | } 430 | return nil 431 | } 432 | case *array.Decimal256Builder: 433 | f.appendFunc = func(data interface{}) error { 434 | err := appendDecimal256Data(bt, data, f.source) 435 | if err != nil { 436 | return err 437 | } 438 | return nil 439 | } 440 | case *extensions.UUIDBuilder: 441 | f.appendFunc = func(data interface{}) error { 442 | switch dt := data.(type) { 443 | case nil: 444 | bt.AppendNull() 445 | case string: 446 | err := bt.AppendValueFromString(dt) 447 | if err != nil { 448 | return err 449 | } 450 | case []byte: 451 | err := bt.AppendValueFromString(string(dt)) 452 | if err != nil { 453 | return err 454 | } 455 | } 456 | return nil 457 | } 458 | case *array.FixedSizeBinaryBuilder: 459 | f.appendFunc = func(data interface{}) error { 460 | appendFixedSizeBinaryData(bt, data, f.source) 461 | return 
nil 462 | } 463 | case *array.Float32Builder: 464 | f.appendFunc = func(data interface{}) error { 465 | appendFloat32Data(bt, data, f.source) 466 | return nil 467 | } 468 | case *array.Float64Builder: 469 | f.appendFunc = func(data interface{}) error { 470 | appendFloat64Data(bt, data, f.source) 471 | return nil 472 | } 473 | case *array.Int32Builder: 474 | f.appendFunc = func(data interface{}) error { 475 | appendInt32Data(bt, data, f.source) 476 | return nil 477 | } 478 | case *array.Int64Builder: 479 | f.appendFunc = func(data interface{}) error { 480 | appendInt64Data(bt, data, f.source) 481 | return nil 482 | } 483 | case *array.LargeListBuilder: 484 | vb := bt.ValueBuilder() 485 | f.isList = true 486 | mapFieldBuilders(vb, field.Type.(*arrow.LargeListType).ElemField(), f) 487 | f.appendFunc = func(data interface{}) error { 488 | switch dt := data.(type) { 489 | case nil: 490 | bt.AppendNull() 491 | case []interface{}: 492 | if len(dt) == 0 { 493 | bt.AppendEmptyValue() 494 | } else { 495 | bt.Append(true) 496 | } 497 | default: 498 | bt.Append(true) 499 | } 500 | return nil 501 | } 502 | case *array.ListBuilder: 503 | vb := bt.ValueBuilder() 504 | f.isList = true 505 | mapFieldBuilders(vb, field.Type.(*arrow.ListType).ElemField(), f) 506 | f.appendFunc = func(data interface{}) error { 507 | switch dt := data.(type) { 508 | case nil: 509 | bt.AppendNull() 510 | case []interface{}: 511 | if len(dt) == 0 { 512 | bt.AppendEmptyValue() 513 | } else { 514 | bt.Append(true) 515 | } 516 | default: 517 | bt.Append(true) 518 | } 519 | return nil 520 | } 521 | case *array.MapBuilder: 522 | // has metadata for objects in values 523 | f.isMap = true 524 | kb := bt.KeyBuilder() 525 | ib := bt.ItemBuilder() 526 | mapFieldBuilders(kb, field.Type.(*arrow.MapType).KeyField(), f) 527 | mapFieldBuilders(ib, field.Type.(*arrow.MapType).ItemField(), f) 528 | f.appendFunc = func(data interface{}) error { 529 | switch data.(type) { 530 | case nil: 531 | bt.AppendNull() 532 | 
default: 533 | bt.Append(true) 534 | } 535 | return nil 536 | } 537 | case *array.MonthDayNanoIntervalBuilder: 538 | f.appendFunc = func(data interface{}) error { 539 | appendDurationData(bt, data, f.source) 540 | return nil 541 | } 542 | case *array.StringBuilder: 543 | f.appendFunc = func(data interface{}) error { 544 | appendStringData(bt, data, f.source) 545 | return nil 546 | } 547 | case *array.StructBuilder: 548 | // has metadata for Avro Union named types 549 | f.typeName, _ = field.Metadata.GetValue("typeName") 550 | f.isStruct = true 551 | // create children 552 | for i, p := range field.Type.(*arrow.StructType).Fields() { 553 | mapFieldBuilders(bt.FieldBuilder(i), p, f) 554 | } 555 | f.appendFunc = func(data interface{}) error { 556 | switch data.(type) { 557 | case nil: 558 | bt.AppendNull() 559 | return ErrNullStructData 560 | default: 561 | bt.Append(true) 562 | } 563 | return nil 564 | } 565 | case *array.Time32Builder: 566 | f.appendFunc = func(data interface{}) error { 567 | appendTime32Data(bt, data, f.source) 568 | return nil 569 | } 570 | case *array.Time64Builder: 571 | f.appendFunc = func(data interface{}) error { 572 | appendTime64Data(bt, data, f.source) 573 | return nil 574 | } 575 | case *array.TimestampBuilder: 576 | f.appendFunc = func(data interface{}) error { 577 | appendTimestampData(bt, data, f.source) 578 | return nil 579 | } 580 | } 581 | } 582 | 583 | func appendBinaryData(b *array.BinaryBuilder, data any, source DataSource) { 584 | switch dt := data.(type) { 585 | case nil: 586 | b.AppendNull() 587 | case []byte: 588 | b.Append(dt) 589 | case map[string]any: 590 | if source == DataSourceAvro { 591 | switch ct := dt["bytes"].(type) { 592 | case nil: 593 | b.AppendNull() 594 | default: 595 | b.Append(ct.([]byte)) 596 | } 597 | } 598 | default: 599 | b.Append(fmt.Append([]byte{}, data)) 600 | } 601 | } 602 | 603 | func appendBinaryDictData(b *array.BinaryDictionaryBuilder, data any, source DataSource) { 604 | switch dt := 
data.(type) { 605 | case nil: 606 | b.AppendNull() 607 | case string: 608 | b.AppendString(dt) 609 | case map[string]any: 610 | if source == DataSourceAvro { 611 | switch v := dt["string"].(type) { 612 | case nil: 613 | b.AppendNull() 614 | case string: 615 | b.AppendString(v) 616 | } 617 | } 618 | } 619 | } 620 | 621 | func appendBoolData(b *array.BooleanBuilder, data any, source DataSource) { 622 | switch dt := data.(type) { 623 | case nil: 624 | b.AppendNull() 625 | case bool: 626 | b.Append(dt) 627 | case map[string]any: 628 | if source == DataSourceAvro { 629 | switch v := dt["boolean"].(type) { 630 | case nil: 631 | b.AppendNull() 632 | case bool: 633 | b.Append(v) 634 | } 635 | } 636 | } 637 | } 638 | 639 | func appendDate32Data(b *array.Date32Builder, data any, source DataSource) { 640 | switch dt := data.(type) { 641 | case nil: 642 | b.AppendNull() 643 | case json.Number: 644 | // TO-DO 645 | case string: 646 | date, _ := time.Parse(time.DateOnly, dt) 647 | b.Append(arrow.Date32FromTime(date)) 648 | case time.Time: 649 | b.Append(arrow.Date32FromTime(dt)) 650 | case int32: 651 | b.Append(arrow.Date32(dt)) 652 | case map[string]any: 653 | if source == DataSourceAvro { 654 | switch v := dt["int"].(type) { 655 | case nil: 656 | b.AppendNull() 657 | case int32: 658 | b.Append(arrow.Date32(v)) 659 | } 660 | } 661 | } 662 | } 663 | 664 | func appendDecimal128Data(b *array.Decimal128Builder, data any, source DataSource) error { 665 | switch dt := data.(type) { 666 | case nil: 667 | b.AppendNull() 668 | case []byte: 669 | // TO-DO 670 | if source == DataSourceAvro { 671 | buf := bytes.NewBuffer(dt) 672 | if len(dt) <= 38 { 673 | var intData int64 674 | err := binary.Read(buf, binary.BigEndian, &intData) 675 | if err != nil { 676 | return err 677 | } 678 | b.Append(decimal128.FromI64(intData)) 679 | } else { 680 | var bigIntData big.Int 681 | b.Append(decimal128.FromBigInt(bigIntData.SetBytes(buf.Bytes()))) 682 | } 683 | } 684 | case map[string]any: 685 | if 
source == DataSourceAvro { 686 | buf := bytes.NewBuffer(dt["bytes"].([]byte)) 687 | if len(dt["bytes"].([]byte)) <= 38 { 688 | var intData int64 689 | err := binary.Read(buf, binary.BigEndian, &intData) 690 | if err != nil { 691 | return err 692 | } 693 | b.Append(decimal128.FromI64(intData)) 694 | } else { 695 | var bigIntData big.Int 696 | b.Append(decimal128.FromBigInt(bigIntData.SetBytes(buf.Bytes()))) 697 | } 698 | } 699 | } 700 | return nil 701 | } 702 | 703 | func appendDecimal256Data(b *array.Decimal256Builder, data any, source DataSource) error { 704 | switch dt := data.(type) { 705 | case nil: 706 | b.AppendNull() 707 | case []byte: 708 | // TO-DO 709 | if source == DataSourceAvro { 710 | var bigIntData big.Int 711 | buf := bytes.NewBuffer(dt) 712 | b.Append(decimal256.FromBigInt(bigIntData.SetBytes(buf.Bytes()))) 713 | } 714 | case map[string]any: 715 | if source == DataSourceAvro { 716 | var bigIntData big.Int 717 | buf := bytes.NewBuffer(dt["bytes"].([]byte)) 718 | b.Append(decimal256.FromBigInt(bigIntData.SetBytes(buf.Bytes()))) 719 | } 720 | } 721 | return nil 722 | } 723 | 724 | // Avro duration logical type annotates Avro fixed type of size 12, which stores three little-endian 725 | // unsigned integers that represent durations at different granularities of time. The first stores 726 | // a number in months, the second stores a number in days, and the third stores a number in milliseconds. 727 | // 728 | // https://pkg.go.dev/time#Duration 729 | // Go time.Duration int64 730 | // A Duration represents the elapsed time between two instants as an int64 nanosecond count. 731 | // The representation limits the largest representable duration to approximately 290 years. 
732 | func appendDurationData(b *array.MonthDayNanoIntervalBuilder, data any, source DataSource) { 733 | switch dt := data.(type) { 734 | case nil: 735 | b.AppendNull() 736 | case []byte: 737 | // TO-DO 738 | if source == DataSourceAvro { 739 | dur := new(arrow.MonthDayNanoInterval) 740 | dur.Months = int32(binary.LittleEndian.Uint16(dt[:3])) 741 | dur.Days = int32(binary.LittleEndian.Uint16(dt[4:7])) 742 | dur.Nanoseconds = int64(binary.LittleEndian.Uint32(dt[8:]) * 1000000) 743 | b.Append(*dur) 744 | } 745 | case map[string]any: 746 | if source == DataSourceAvro { 747 | switch dtb := dt["bytes"].(type) { 748 | case nil: 749 | b.AppendNull() 750 | case []byte: 751 | dur := new(arrow.MonthDayNanoInterval) 752 | dur.Months = int32(binary.LittleEndian.Uint16(dtb[:3])) 753 | dur.Days = int32(binary.LittleEndian.Uint16(dtb[4:7])) 754 | dur.Nanoseconds = int64(binary.LittleEndian.Uint32(dtb[8:]) * 1000000) 755 | b.Append(*dur) 756 | } 757 | } 758 | } 759 | } 760 | 761 | func appendFixedSizeBinaryData(b *array.FixedSizeBinaryBuilder, data any, source DataSource) { 762 | switch dt := data.(type) { 763 | case nil: 764 | b.AppendNull() 765 | case []byte: 766 | b.Append(dt) 767 | case map[string]any: 768 | if source == DataSourceAvro { 769 | switch v := dt["bytes"].(type) { 770 | case nil: 771 | b.AppendNull() 772 | case []byte: 773 | b.Append(v) 774 | } 775 | } 776 | } 777 | } 778 | 779 | func appendFloat32Data(b *array.Float32Builder, data any, source DataSource) { 780 | switch dt := data.(type) { 781 | case nil: 782 | b.AppendNull() 783 | case float32: 784 | b.Append(dt) 785 | case json.Number: 786 | f, _ := dt.Float64() 787 | b.Append(float32(f)) 788 | case string: 789 | i, _ := strconv.ParseFloat(dt, 32) 790 | b.Append(float32(i)) 791 | case map[string]any: 792 | if source == DataSourceAvro { 793 | switch v := dt["float"].(type) { 794 | case nil: 795 | b.AppendNull() 796 | case float32: 797 | b.Append(v) 798 | } 799 | } 800 | } 801 | } 802 | 803 | func 
appendFloat64Data(b *array.Float64Builder, data any, source DataSource) { 804 | switch dt := data.(type) { 805 | case nil: 806 | b.AppendNull() 807 | case float64: 808 | b.Append(dt) 809 | case json.Number: 810 | f, _ := dt.Float64() 811 | b.Append(f) 812 | case string: 813 | i, _ := strconv.ParseFloat(dt, 64) 814 | b.Append(i) 815 | case map[string]any: 816 | if source == DataSourceAvro { 817 | switch v := dt["double"].(type) { 818 | case nil: 819 | b.AppendNull() 820 | case float64: 821 | b.Append(v) 822 | } 823 | } 824 | } 825 | } 826 | 827 | func appendInt8Data(b *array.Int8Builder, data any, source DataSource) { 828 | switch dt := data.(type) { 829 | case nil: 830 | b.AppendNull() 831 | case int: 832 | b.Append(int8(dt)) 833 | case int8: 834 | b.Append(dt) 835 | case json.Number: 836 | i, _ := dt.Int64() 837 | b.Append(int8(i)) 838 | case string: 839 | i, _ := strconv.ParseInt(dt, 10, 8) 840 | b.Append(int8(i)) 841 | case map[string]any: 842 | 843 | } 844 | } 845 | 846 | func appendInt16Data(b *array.Int16Builder, data any, source DataSource) { 847 | switch dt := data.(type) { 848 | case nil: 849 | b.AppendNull() 850 | case int: 851 | b.Append(int16(dt)) 852 | case int16: 853 | b.Append(dt) 854 | case json.Number: 855 | i, _ := dt.Int64() 856 | b.Append(int16(i)) 857 | case string: 858 | i, _ := strconv.ParseInt(dt, 10, 16) 859 | b.Append(int16(i)) 860 | case map[string]any: 861 | 862 | } 863 | } 864 | 865 | func appendInt32Data(b *array.Int32Builder, data any, source DataSource) { 866 | switch dt := data.(type) { 867 | case nil: 868 | b.AppendNull() 869 | case int: 870 | b.Append(int32(dt)) 871 | case int32: 872 | b.Append(dt) 873 | case json.Number: 874 | i, _ := dt.Int64() 875 | b.Append(int32(i)) 876 | case string: 877 | i, _ := strconv.ParseInt(dt, 10, 32) 878 | b.Append(int32(i)) 879 | case map[string]any: 880 | 881 | } 882 | } 883 | 884 | func appendInt64Data(b *array.Int64Builder, data any, source DataSource) { 885 | switch dt := data.(type) { 886 | 
case nil: 887 | b.AppendNull() 888 | case int: 889 | b.Append(int64(dt)) 890 | case int64: 891 | b.Append(dt) 892 | case string: 893 | i, _ := strconv.ParseInt(dt, 10, 64) 894 | b.Append(i) 895 | case json.Number: 896 | i, _ := dt.Int64() 897 | b.Append(i) 898 | case map[string]any: 899 | if source == DataSourceAvro { 900 | switch v := dt["long"].(type) { 901 | case nil: 902 | b.AppendNull() 903 | case int: 904 | b.Append(int64(v)) 905 | case int64: 906 | b.Append(v) 907 | } 908 | } 909 | } 910 | } 911 | 912 | func appendStringData(b *array.StringBuilder, data any, source DataSource) { 913 | switch dt := data.(type) { 914 | case nil: 915 | b.AppendNull() 916 | case string: 917 | b.Append(dt) 918 | case map[string]any: 919 | if source == DataSourceAvro { 920 | switch v := dt["string"].(type) { 921 | case nil: 922 | b.AppendNull() 923 | case string: 924 | b.Append(v) 925 | } 926 | } 927 | default: 928 | b.Append(fmt.Sprint(data)) 929 | } 930 | } 931 | 932 | func appendTime32Data(b *array.Time32Builder, data any, source DataSource) { 933 | switch dt := data.(type) { 934 | case nil: 935 | b.AppendNull() 936 | case string: 937 | t, _ := arrow.Time32FromString(dt, arrow.Microsecond) 938 | b.Append(t) 939 | case int32: 940 | b.Append(arrow.Time32(dt)) 941 | case map[string]any: 942 | if source == DataSourceAvro { 943 | switch v := dt["int"].(type) { 944 | case nil: 945 | b.AppendNull() 946 | case int32: 947 | b.Append(arrow.Time32(v)) 948 | } 949 | } 950 | } 951 | } 952 | 953 | func appendTime64Data(b *array.Time64Builder, data any, source DataSource) { 954 | switch dt := data.(type) { 955 | case nil: 956 | b.AppendNull() 957 | case string: 958 | t, _ := arrow.Time64FromString(dt, arrow.Microsecond) 959 | b.Append(t) 960 | case int64: 961 | b.Append(arrow.Time64(dt)) 962 | case map[string]any: 963 | if source == DataSourceAvro { 964 | switch v := dt["long"].(type) { 965 | case nil: 966 | b.AppendNull() 967 | case int64: 968 | b.Append(arrow.Time64(v)) 969 | } 970 | } 971 
| } 972 | } 973 | 974 | func appendTimestampData(b *array.TimestampBuilder, data any, source DataSource) { 975 | switch dt := data.(type) { 976 | case nil: 977 | b.AppendNull() 978 | case json.Number: 979 | epochSeconds, _ := dt.Int64() 980 | t, _ := arrow.TimestampFromTime(time.Unix(epochSeconds, 0), arrow.Microsecond) 981 | b.Append(t) 982 | case string: 983 | t, _ := arrow.TimestampFromString(dt, arrow.Microsecond) 984 | b.Append(t) 985 | case time.Time: 986 | t, _ := arrow.TimestampFromTime(dt, arrow.Microsecond) 987 | b.Append(t) 988 | case int64: 989 | b.Append(arrow.Timestamp(dt)) 990 | case map[string]any: 991 | switch v := dt["long"].(type) { 992 | case nil: 993 | b.AppendNull() 994 | case int64: 995 | b.Append(arrow.Timestamp(v)) 996 | } 997 | } 998 | } 999 | -------------------------------------------------------------------------------- /reader/option.go: -------------------------------------------------------------------------------- 1 | package reader 2 | 3 | import ( 4 | "bufio" 5 | "context" 6 | "io" 7 | 8 | "github.com/apache/arrow-go/v18/arrow/memory" 9 | ) 10 | 11 | // WithAllocator specifies the Arrow memory allocator used while building records. 12 | func WithAllocator(mem memory.Allocator) Option { 13 | return func(cfg config) { 14 | cfg.mem = mem 15 | } 16 | } 17 | 18 | // WithJSONDecoder specifies whether to use goccy/json-go as the Bodkin Reader's decoder. 19 | // The default is the Bodkin DataLoader, a linked list of builders which reduces recursive lookups 20 | // in maps when loading data. 21 | func WithJSONDecoder() Option { 22 | return func(cfg config) { 23 | cfg.jsonDecode = true 24 | } 25 | } 26 | 27 | // WithChunk specifies the chunk size used while reading data to Arrow records. 28 | // 29 | // If n is zero or 1, no chunking will take place and the reader will create 30 | // one record per row. 31 | // If n is greater than 1, chunks of n rows will be read. 
func WithChunk(n int) Option {
	return func(cfg config) {
		cfg.chunk = n
	}
}

// WithContext specifies the context used while reading data to Arrow records.
// Calling reader.Cancel() will cancel the context and stop reading data.
func WithContext(ctx context.Context) Option {
	return func(cfg config) {
		// Derive a cancellable child context so the reader owns its cancel func.
		cfg.readerCtx, cfg.readCancel = context.WithCancel(ctx)
	}
}

// WithIOReader provides an io.Reader to Bodkin Reader, along with a delimiter
// to use to split datum in the data stream. Default delimiter '\n' if delimiter
// is not provided.
func WithIOReader(r io.Reader, delim byte) Option {
	return func(cfg config) {
		cfg.rr = r
		// 16 MiB buffered reader over the raw input.
		cfg.br = bufio.NewReaderSize(cfg.rr, 1024*1024*16)
		// cfg.delim already holds DefaultDelimiter; only override on change.
		if delim != DefaultDelimiter {
			cfg.delim = delim
		}
	}
}

// WithInputBufferSize specifies the Bodkin Reader's input buffer size.
func WithInputBufferSize(n int) Option {
	return func(cfg config) {
		cfg.inputBufferSize = n
	}
}

// WithRecordBufferSize specifies the Bodkin Reader's record buffer size.
func WithRecordBufferSize(n int) Option {
	return func(cfg config) {
		cfg.recordBufferSize = n
	}
}
-------------------------------------------------------------------------------- /reader/reader.go: --------------------------------------------------------------------------------
// Package reader contains helpers for reading data and loading to Arrow.
2 | package reader 3 | 4 | import ( 5 | "bufio" 6 | "bytes" 7 | "context" 8 | "errors" 9 | "fmt" 10 | "io" 11 | "sync" 12 | "sync/atomic" 13 | 14 | "github.com/apache/arrow-go/v18/arrow" 15 | "github.com/apache/arrow-go/v18/arrow/array" 16 | "github.com/apache/arrow-go/v18/arrow/memory" 17 | json "github.com/goccy/go-json" 18 | ) 19 | 20 | type DataSource int 21 | 22 | const ( 23 | DataSourceGo DataSource = iota 24 | DataSourceJSON 25 | DataSourceAvro 26 | ) 27 | const ( 28 | Manual int = iota 29 | Scanner 30 | ) 31 | const DefaultDelimiter byte = byte('\n') 32 | 33 | // Option configures an Avro reader/writer. 34 | type ( 35 | Option func(config) 36 | config *DataReader 37 | ) 38 | 39 | type DataReader struct { 40 | rr io.Reader 41 | br *bufio.Reader 42 | delim byte 43 | refs int64 44 | source DataSource 45 | schema *arrow.Schema 46 | bld *array.RecordBuilder 47 | mem memory.Allocator 48 | opts []Option 49 | bldMap *fieldPos 50 | ldr *dataLoader 51 | cur arrow.Record 52 | curBatch []arrow.Record 53 | readerCtx context.Context 54 | readCancel func() 55 | err error 56 | anyChan chan any 57 | recChan chan arrow.Record 58 | recReq chan struct{} 59 | bldDone chan struct{} 60 | inputLock atomic.Int32 61 | factoryLock atomic.Int32 62 | wg sync.WaitGroup 63 | jsonDecode bool 64 | chunk int 65 | inputCount int 66 | inputBufferSize int 67 | recordBufferSize int 68 | } 69 | 70 | func NewReader(schema *arrow.Schema, source DataSource, opts ...Option) (*DataReader, error) { 71 | switch source { 72 | case DataSourceGo, DataSourceJSON, DataSourceAvro: 73 | break 74 | default: 75 | source = DataSourceGo 76 | } 77 | r := &DataReader{ 78 | source: source, 79 | schema: schema, 80 | mem: memory.DefaultAllocator, 81 | inputBufferSize: 1024 * 64, 82 | recordBufferSize: 1024 * 64, 83 | chunk: 0, 84 | delim: DefaultDelimiter, 85 | opts: opts, 86 | } 87 | for _, opt := range opts { 88 | opt(r) 89 | } 90 | 91 | r.anyChan = make(chan any, r.inputBufferSize) 92 | r.recChan = make(chan 
arrow.Record, r.recordBufferSize) 93 | r.bldDone = make(chan struct{}) 94 | r.recReq = make(chan struct{}, 100) 95 | if r.readerCtx == nil { 96 | r.readerCtx, r.readCancel = context.WithCancel(context.Background()) 97 | } 98 | if r.rr != nil { 99 | r.wg.Add(1) 100 | go r.decode2Chan() 101 | } 102 | r.bld = array.NewRecordBuilder(memory.DefaultAllocator, schema) 103 | r.bldMap = newFieldPos() 104 | r.bldMap.isStruct = true 105 | r.source = source 106 | r.ldr = newDataLoader() 107 | for idx, fb := range r.bld.Fields() { 108 | mapFieldBuilders(fb, schema.Field(idx), r.bldMap) 109 | } 110 | r.ldr.drawTree(r.bldMap) 111 | r.wg.Add(1) 112 | go r.recordFactory() 113 | 114 | return r, nil 115 | } 116 | 117 | // ReadToRecord decodes a datum directly to an arrow.Record. The record 118 | // should be released by the user when done with it. 119 | func (r *DataReader) ReadToRecord(a any) (arrow.Record, error) { 120 | var err error 121 | defer func() { 122 | if rc := recover(); rc != nil { 123 | fmt.Println(rc, err) 124 | } 125 | }() 126 | m, err := InputMap(a) 127 | if err != nil { 128 | r.err = errors.Join(r.err, err) 129 | } 130 | 131 | switch r.jsonDecode { 132 | case true: 133 | var v []byte 134 | v, err = json.Marshal(m) 135 | if err != nil { 136 | r.err = err 137 | return nil, err 138 | } 139 | d := json.NewDecoder(bytes.NewReader(v)) 140 | d.UseNumber() 141 | err = d.Decode(r.bld) 142 | if err != nil { 143 | return nil, err 144 | } 145 | default: 146 | err = r.ldr.loadDatum(m) 147 | if err != nil { 148 | return nil, err 149 | } 150 | } 151 | 152 | return r.bld.NewRecord(), nil 153 | } 154 | 155 | // NextBatch returns whether a []arrow.Record of a specified size can be received 156 | // from the converted record queue. Will still return true if the queue channel is closed and 157 | // last batch of records available < batch size specified. 158 | // The user should check Err() after a call to NextBatch that returned false to check 159 | // if an error took place. 
160 | func (r *DataReader) NextBatch(batchSize int) bool { 161 | if batchSize < 1 { 162 | batchSize = 1 163 | } 164 | if len(r.curBatch) != 0 { 165 | for _, rec := range r.curBatch { 166 | rec.Release() 167 | } 168 | r.curBatch = []arrow.Record{} 169 | } 170 | r.wg.Wait() 171 | 172 | for len(r.curBatch) <= batchSize { 173 | select { 174 | case rec, ok := <-r.recChan: 175 | if !ok && rec == nil { 176 | if len(r.curBatch) > 0 { 177 | goto jump 178 | } 179 | return false 180 | } 181 | if rec != nil { 182 | r.curBatch = append(r.curBatch, rec) 183 | } 184 | case <-r.bldDone: 185 | if len(r.recChan) > 0 { 186 | rec := <-r.recChan 187 | r.curBatch = append(r.curBatch, rec) 188 | } 189 | case <-r.readerCtx.Done(): 190 | return false 191 | } 192 | } 193 | 194 | jump: 195 | if r.err != nil { 196 | return false 197 | } 198 | 199 | return len(r.curBatch) > 0 200 | } 201 | 202 | // Next returns whether a Record can be received from the converted record queue. 203 | // The user should check Err() after a call to Next that returned false to check 204 | // if an error took place. 
func (r *DataReader) Next() bool {
	var ok bool
	// Release the record handed out on the previous call; it is only valid
	// until the next call to Next.
	if r.cur != nil {
		r.cur.Release()
		r.cur = nil
	}
	// Wait for the producer goroutines to have started.
	r.wg.Wait()
	// On-demand mode (chunk < 1): ask the factory to emit one record.
	if r.chunk < 1 {
		r.recReq <- struct{}{}
	}
	select {
	case r.cur, ok = <-r.recChan:
		if !ok && r.cur == nil {
			// Channel closed and drained.
			return false
		}
	case <-r.bldDone:
		// Factory finished; drain a record still queued, if any.
		if len(r.recChan) > 0 {
			r.cur = <-r.recChan
		}
	case <-r.readerCtx.Done():
		// Canceled: surface a final queued record if one is present.
		if len(r.recChan) > 0 {
			r.cur = <-r.recChan
			break
		}
		return false
	}
	if r.err != nil {
		return false
	}

	return r.cur != nil
}

// Mode reports whether the reader pulls from an io.Reader (Scanner) or is
// fed manually via Read (Manual).
func (r *DataReader) Mode() int {
	switch r.rr {
	case nil:
		return Manual
	default:
		return Scanner
	}
}

// Count returns the number of input datums consumed so far.
func (r *DataReader) Count() int { return r.inputCount }

// ResetCount zeroes the consumed-input counter.
func (r *DataReader) ResetCount() { r.inputCount = 0 }

// InputBufferSize returns the capacity of the input datum channel.
func (r *DataReader) InputBufferSize() int { return r.inputBufferSize }

// RecBufferSize returns the capacity of the Arrow record channel.
func (r *DataReader) RecBufferSize() int { return r.recordBufferSize }

// DataSource returns the reader's configured input format.
func (r *DataReader) DataSource() DataSource { return r.source }

// Opts returns the options the reader was created with.
func (r *DataReader) Opts() []Option { return r.opts }

// Record returns the current Arrow record.
// It is valid until the next call to Next.
func (r *DataReader) Record() arrow.Record { return r.cur }

// RecordBatch returns the current Arrow record batch.
// It is valid until the next call to NextBatch.
func (r *DataReader) RecordBatch() []arrow.Record { return r.curBatch }

// Schema returns the Arrow schema records are built against.
func (r *DataReader) Schema() *arrow.Schema { return r.schema }

// Err returns the last error encountered during the reading of data.
func (r *DataReader) Err() error { return r.err }

// Retain increases the reference count by 1.
// Retain may be called simultaneously from multiple goroutines.
268 | func (r *DataReader) Retain() { 269 | atomic.AddInt64(&r.refs, 1) 270 | } 271 | 272 | // Release decreases the reference count by 1. 273 | // When the reference count goes to zero, the memory is freed. 274 | // Release may be called simultaneously from multiple goroutines. 275 | func (r *DataReader) Release() { 276 | // debug.Assert(atomic.LoadInt64(&r.refs) > 0, "too many releases") 277 | 278 | if atomic.AddInt64(&r.refs, -1) == 0 { 279 | if r.cur != nil { 280 | r.cur.Release() 281 | } 282 | } 283 | } 284 | 285 | // Peek returns the length of the input data and Arrow Record queues. 286 | func (r *DataReader) Peek() (int, int) { 287 | return len(r.anyChan), len(r.recChan) 288 | } 289 | 290 | // Cancel cancels the Reader's io.Reader scan to Arrow. 291 | func (r *DataReader) Cancel() { 292 | r.readCancel() 293 | } 294 | 295 | // Read loads one datum. 296 | // If the Reader has an io.Reader, Read is a no-op. 297 | func (r *DataReader) Read(a any) error { 298 | if r.rr != nil { 299 | return nil 300 | } 301 | var err error 302 | defer func() error { 303 | if rc := recover(); rc != nil { 304 | r.err = errors.Join(r.err, fmt.Errorf("panic %v", rc)) 305 | } 306 | return r.err 307 | }() 308 | m, err := InputMap(a) 309 | if err != nil { 310 | r.err = errors.Join(r.err, err) 311 | return err 312 | } 313 | r.anyChan <- m 314 | r.inputCount++ 315 | return nil 316 | } 317 | 318 | // Reset resets a Reader to its initial state. 
319 | func (r *DataReader) Reset() { 320 | r.readCancel() 321 | r.anyChan = make(chan any, r.inputBufferSize) 322 | r.recChan = make(chan arrow.Record, r.recordBufferSize) 323 | r.bldDone = make(chan struct{}) 324 | r.inputCount = 0 325 | 326 | // DataReader has an io.Reader 327 | if r.rr != nil { 328 | r.br.Reset(r.rr) 329 | go r.decode2Chan() 330 | r.wg.Add(1) 331 | } 332 | go r.recordFactory() 333 | r.wg.Add(1) 334 | } 335 | -------------------------------------------------------------------------------- /reader/recordfactory.go: -------------------------------------------------------------------------------- 1 | package reader 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "io" 7 | ) 8 | 9 | func (r *DataReader) decode2Chan() { 10 | // 1 means running 11 | if r.inputLock.CompareAndSwap(0, 1) { 12 | defer r.inputLock.Store(0) 13 | } else { 14 | return 15 | } 16 | var err error 17 | defer func() { 18 | if rc := recover(); rc != nil { 19 | r.err = errors.Join(r.err, err, fmt.Errorf("panic %v", rc)) 20 | } 21 | }() 22 | defer close(r.anyChan) 23 | b := true 24 | for { 25 | datumBytes, err := r.br.ReadBytes(r.delim) 26 | if err != nil { 27 | if errors.Is(err, io.EOF) { 28 | r.err = nil 29 | return 30 | } 31 | r.err = err 32 | return 33 | } 34 | datum, err := InputMap(datumBytes[:len(datumBytes)-1]) 35 | if err != nil { 36 | r.err = errors.Join(r.err, err) 37 | continue 38 | } 39 | r.anyChan <- datum 40 | r.inputCount++ 41 | if b { 42 | r.wg.Done() // sync.WaitGroup to allow Next() to wait for records to be available 43 | b = false 44 | } 45 | select { 46 | case <-r.readerCtx.Done(): 47 | return 48 | default: 49 | } 50 | } 51 | } 52 | 53 | // recordFactory... 
the hits just keep on coming 54 | func (r *DataReader) recordFactory() { 55 | if r.factoryLock.CompareAndSwap(0, 1) { 56 | defer r.factoryLock.Store(0) 57 | } else { 58 | return 59 | } 60 | defer close(r.recChan) 61 | recChunk := 0 62 | 63 | r.wg.Done() // sync.WaitGroup to allow Next() to wait for records to be available 64 | 65 | switch { 66 | case r.chunk < 1: 67 | for data := range r.anyChan { 68 | err := r.ldr.loadDatum(data) 69 | if err != nil { 70 | r.err = err 71 | return 72 | } 73 | select { 74 | case <-r.readerCtx.Done(): 75 | r.bldDone <- struct{}{} 76 | return 77 | case <-r.recReq: 78 | r.recChan <- r.bld.NewRecord() 79 | default: 80 | } 81 | } 82 | r.recChan <- r.bld.NewRecord() 83 | r.bldDone <- struct{}{} 84 | case r.chunk >= 1: 85 | for data := range r.anyChan { 86 | if recChunk == 0 { 87 | r.bld.Reserve(r.chunk) 88 | } 89 | err := r.ldr.loadDatum(data) 90 | if err != nil { 91 | r.err = err 92 | return 93 | } 94 | recChunk++ 95 | if recChunk >= r.chunk { 96 | r.recChan <- r.bld.NewRecord() 97 | recChunk = 0 98 | } 99 | select { 100 | case <-r.readerCtx.Done(): 101 | r.recChan <- r.bld.NewRecord() 102 | r.bldDone <- struct{}{} 103 | return 104 | default: 105 | } 106 | } 107 | if recChunk != 0 { 108 | r.recChan <- r.bld.NewRecord() 109 | } 110 | r.bldDone <- struct{}{} 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /schema.go: -------------------------------------------------------------------------------- 1 | package bodkin 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "regexp" 7 | "slices" 8 | 9 | "github.com/apache/arrow-go/v18/arrow" 10 | "github.com/apache/arrow-go/v18/arrow/array" 11 | ) 12 | 13 | type fieldPos struct { 14 | root *fieldPos 15 | parent *fieldPos 16 | owner *Bodkin 17 | builder array.Builder 18 | name string 19 | path []string 20 | isList bool 21 | isItem bool 22 | isStruct bool 23 | isMap bool 24 | arrowType arrow.Type 25 | typeName string 26 | field arrow.Field 27 | children 
[]*fieldPos 28 | childmap map[string]*fieldPos 29 | appendFunc func(val interface{}) error 30 | metadatas arrow.Metadata 31 | index, depth int32 32 | err error 33 | } 34 | 35 | // Schema evaluation/evolution errors. 36 | var ( 37 | ErrUndefinedInput = errors.New("nil input") 38 | ErrInvalidInput = errors.New("invalid input") 39 | ErrNoLatestSchema = errors.New("no second input has been provided") 40 | ErrUndefinedFieldType = errors.New("could not determine type of unpopulated field") 41 | ErrUndefinedArrayElementType = errors.New("could not determine element type of empty array") 42 | ErrNotAnUpgradableType = errors.New("is not an upgradable type") 43 | ErrPathNotFound = errors.New("path not found") 44 | ErrFieldTypeChanged = errors.New("changed") 45 | ErrFieldAdded = errors.New("added") 46 | ) 47 | 48 | // UpgradableTypes are scalar types that can be upgraded to a more flexible type. 49 | var UpgradableTypes []arrow.Type = []arrow.Type{arrow.INT8, 50 | arrow.UINT8, 51 | arrow.INT16, 52 | arrow.UINT16, 53 | arrow.INT32, 54 | arrow.UINT64, 55 | arrow.INT64, 56 | arrow.FLOAT16, 57 | arrow.FLOAT32, 58 | arrow.FLOAT64, 59 | arrow.DATE32, 60 | arrow.TIME64, 61 | arrow.TIMESTAMP, 62 | } 63 | 64 | // Regular expressions and variables for type inference. 
65 | var ( 66 | timestampMatchers []*regexp.Regexp 67 | dateMatcher *regexp.Regexp 68 | timeMatcher *regexp.Regexp 69 | integerMatcher *regexp.Regexp 70 | floatMatcher *regexp.Regexp 71 | boolMatcher []string 72 | ) 73 | 74 | func init() { 75 | registerTsMatchers() 76 | registerQuotedStringValueMatchers() 77 | } 78 | 79 | func registerTsMatchers() { 80 | dateMatcher = regexp.MustCompile(`^\d{4}-\d{2}-\d{2}$`) 81 | timeMatcher = regexp.MustCompile(`^\d{1,2}:\d{1,2}:\d{1,2}(\.\d{1,6})?$`) 82 | timestampMatchers = append(timestampMatchers, 83 | regexp.MustCompile(`^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?(Z|[+-]\d{2}:\d{2})$`), // ISO 8601 84 | regexp.MustCompile(`^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(\.\d+)?(Z|[+-]\d{2}:\d{2})$`), // RFC 3339 with space instead of T 85 | regexp.MustCompile(`^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$`), // Datetime format with dashes 86 | regexp.MustCompile(`^\d{4}-\d{1,2}-\d{1,2}[T ]\d{1,2}:\d{1,2}:\d{1,2}(\.\d{1,6})? *(([+-]\d{1,2}(:\d{1,2})?)|Z|UTC)?$`)) 87 | } 88 | 89 | func registerQuotedStringValueMatchers() { 90 | integerMatcher = regexp.MustCompile(`^[-+]?\d+$`) 91 | floatMatcher = regexp.MustCompile(`^[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?$`) 92 | boolMatcher = append(boolMatcher, "true", "false") 93 | } 94 | 95 | func newFieldPos(b *Bodkin) *fieldPos { 96 | f := new(fieldPos) 97 | f.owner = b 98 | f.index = -1 99 | f.root = f 100 | f.childmap = make(map[string]*fieldPos) 101 | f.children = make([]*fieldPos, 0) 102 | return f 103 | } 104 | 105 | func (f *fieldPos) assignChild(child *fieldPos) { 106 | f.children = append(f.children, child) 107 | f.childmap[child.name] = child 108 | f.owner.knownFields.Set(child.dotPath(), child) 109 | f.owner.untypedFields.Delete(child.dotPath()) 110 | } 111 | 112 | func (f *fieldPos) child(index int) (*fieldPos, error) { 113 | if index < len(f.children) { 114 | return f.children[index], nil 115 | } 116 | return nil, fmt.Errorf("%v child index %d not found", f.namePath(), index) 117 | } 118 | 
// error returns the error recorded on this node, if any.
func (f *fieldPos) error() error { return f.err }

// metadata returns the Arrow metadata attached to this node's field.
func (f *fieldPos) metadata() arrow.Metadata { return f.field.Metadata }

// newChild creates a child node for childName but does NOT attach it;
// callers attach with assignChild once the child's field is known.
func (f *fieldPos) newChild(childName string) *fieldPos {
	var child fieldPos = fieldPos{
		root:   f.root,
		parent: f,
		owner:  f.owner,
		name:   childName,
		index:  int32(len(f.children)),
		depth:  f.depth + 1,
	}
	if f.isList {
		child.isItem = true
	}
	child.path = child.namePath()
	child.childmap = make(map[string]*fieldPos)
	// Type is unknown until a value is seen.
	child.arrowType = arrow.NULL
	return &child
}

// mapChildren rebuilds the name->child index from the children slice.
func (f *fieldPos) mapChildren() {
	for i, c := range f.children {
		f.childmap[c.name] = f.children[i]
	}
}

// getPath returns a field found at a defined path, otherwise returns ErrPathNotFound.
func (f *fieldPos) getPath(path []string) (*fieldPos, error) {
	if len(path) == 0 { // degenerate input
		return nil, fmt.Errorf("getPath needs at least one key")
	}
	if node, ok := f.childmap[path[0]]; !ok {
		return nil, ErrPathNotFound
	} else if len(path) == 1 { // we've reached the final key
		return node, nil
	} else { // 1+ more keys
		return node.getPath(path[1:])
	}
}

// namePath returns a slice of keys making up the path to the field.
// The path is computed by walking up to the root only when it has not been
// cached in f.path.
func (f *fieldPos) namePath() []string {
	if len(f.path) == 0 {
		var path []string
		cur := f
		for i := f.depth - 1; i >= 0; i-- {
			path = append([]string{cur.name}, path...)
			cur = cur.parent
		}
		return path
	}
	return f.path
}

// dotPath returns the path to the field in json dot notation, prefixed with
// "$" (e.g. "$a.b.c" for path [a b c]; bare "$" for the root).
func (f *fieldPos) dotPath() string {
	var path string = "$"
	for i, p := range f.path {
		path = path + p
		if i+1 != len(f.path) {
			path = path + "."
		}
	}
	return path
}

// getValue retrieves the value from the map[string]any
// by following the field's key path; returns nil if any key is absent or an
// intermediate value is not a map.
func (f *fieldPos) getValue(m map[string]any) any {
	var value any = m
	for _, key := range f.namePath() {
		valueMap, ok := value.(map[string]any)
		if !ok {
			return nil
		}
		value, ok = valueMap[key]
		if !ok {
			return nil
		}
	}
	return value
}

// graft grafts a new field into the schema tree: a copy of n is attached
// under f, registered with the owner, and f's (and if needed the grandparent
// list's) Arrow field is rebuilt to include it.
func (f *fieldPos) graft(n *fieldPos) {
	graft := f.newChild(n.name)
	graft.arrowType = n.arrowType
	graft.field = n.field
	graft.children = append(graft.children, n.children...)
	graft.mapChildren()
	f.assignChild(graft)
	// NOTE(review): assignChild already performs these two index updates —
	// the repetition looks redundant; confirm before removing.
	f.owner.knownFields.Set(graft.dotPath(), graft)
	f.owner.untypedFields.Delete(graft.dotPath())
	f.owner.changes = errors.Join(f.owner.changes, fmt.Errorf("%w %v : %v", ErrFieldAdded, graft.dotPath(), graft.field.Type.String()))
	if f.field.Type.ID() == arrow.STRUCT {
		// Rebuild this struct's Arrow type with the grafted field appended.
		gf := f.field.Type.(*arrow.StructType)
		var nf []arrow.Field
		nf = append(nf, gf.Fields()...)
		nf = append(nf, graft.field)
		f.field = arrow.Field{Name: f.name, Type: arrow.StructOf(nf...), Nullable: true}
		// A parent list of structs must be rewrapped around the new struct type.
		if (f.parent != nil) && f.parent.field.Type.ID() == arrow.LIST {
			f.parent.field = arrow.Field{Name: f.parent.name, Type: arrow.ListOf(f.field.Type.(*arrow.StructType)), Nullable: true}
		}
	}
}

// Only scalar types in UpgradableTypes[] can be upgraded:
// Supported type upgrades:
//
//	arrow.INT8, arrow.INT16, arrow.INT32, arrow.INT64 => arrow.FLOAT64
//	arrow.FLOAT16 => arrow.FLOAT32
//	arrow.FLOAT32 => arrow.FLOAT64
//	arrow.FLOAT64 => arrow.STRING
//	arrow.TIMESTAMP => arrow.STRING
//	arrow.DATE32 => arrow.TIMESTAMP
//	arrow.DATE32 => arrow.STRING
//	arrow.TIME64 => arrow.STRING
func (o *fieldPos) upgradeType(n *fieldPos, t arrow.Type) error {
	if !slices.Contains(UpgradableTypes, o.field.Type.ID()) {
		return fmt.Errorf("%s %v %v", n.dotPath(), n.field.Type.Name(), ErrNotAnUpgradableType.Error())
	}
	oldType := o.field.Type.String()
	// changes to field
	switch t {
	case arrow.FLOAT32:
		o.arrowType = arrow.FLOAT32
		o.field = arrow.Field{Name: o.name, Type: arrow.PrimitiveTypes.Float32, Nullable: true}
	case arrow.FLOAT64:
		o.arrowType = arrow.FLOAT64
		o.field = arrow.Field{Name: o.name, Type: arrow.PrimitiveTypes.Float64, Nullable: true}
	case arrow.STRING:
		o.arrowType = arrow.STRING
		o.field = arrow.Field{Name: o.name, Type: arrow.BinaryTypes.String, Nullable: true}
	case arrow.TIMESTAMP:
		o.arrowType = arrow.TIMESTAMP
		o.field = arrow.Field{Name: o.name, Type: arrow.FixedWidthTypes.Timestamp_ms, Nullable: true}
	}
	// changes to parent
	// NOTE(review): the LIST case rewraps around n.field.Type (the incoming
	// node) while the STRUCT case rebuilds from o.parent.children (which now
	// carry o's upgraded field) — confirm the asymmetry is intentional.
	switch o.parent.field.Type.ID() {
	case arrow.LIST:
		o.parent.field = arrow.Field{Name: o.parent.name, Type: arrow.ListOf(n.field.Type), Nullable: true}
	case arrow.STRUCT:
		var fields []arrow.Field
		for _, c := range o.parent.children {
			fields = append(fields, c.field)
		}
		o.parent.field = arrow.Field{Name: o.parent.name, Type: arrow.StructOf(fields...), Nullable: true}
	}
	o.owner.changes = errors.Join(o.owner.changes, fmt.Errorf("%w %v : from %v to %v", ErrFieldTypeChanged, o.dotPath(), oldType, o.field.Type.String()))
	return nil
}

// errWrap joins the errors recorded on f and, recursively, on all of its
// descendants into a single error (nil when the subtree is error-free).
func errWrap(f *fieldPos) error {
	var err error
	if f.err != nil {
		err = errors.Join(f.err)
	}
	if len(f.children) > 0 {
		for _, field := range f.children {
			err = errors.Join(err, errWrap(field))
		}
	}
	return err
}

// mapToArrow traverses a map[string]any and creates a fieldPos tree from
// which an Arrow schema can be generated. Fields whose type cannot be
// determined (nil values, empty maps/arrays) are parked in untypedFields.
func mapToArrow(f *fieldPos, m map[string]any) {
	for k, v := range m {
		child := f.newChild(k)
		switch t := v.(type) {
		case map[string]any:
			mapToArrow(child, t)
			var fields []arrow.Field
			for _, c := range child.children {
				fields = append(fields, c.field)
			}
			if len(child.children) != 0 {
				child.field = buildArrowField(k, arrow.StructOf(fields...), arrow.Metadata{}, true)
				f.assignChild(child)
			} else {
				// Empty map: struct shape known, member types are not.
				child.arrowType = arrow.STRUCT
				child.isStruct = true
				f.owner.untypedFields.Set(child.dotPath(), child)
			}
		case []any:
			if len(t) <= 0 {
				// Empty array: element type cannot be inferred yet.
				child.arrowType = arrow.LIST
				child.isList = true
				f.owner.untypedFields.Set(child.dotPath(), child)
				f.err = errors.Join(f.err, fmt.Errorf("%v : %v", ErrUndefinedArrayElementType, child.namePath()))
			} else {
				et := sliceElemType(child, t)
				child.isList = true
				child.field = buildArrowField(k, arrow.ListOf(et), arrow.Metadata{}, true)
				f.assignChild(child)
			}
		case nil:
			child.arrowType = arrow.NULL
			f.owner.untypedFields.Set(child.dotPath(), child)
			f.err = errors.Join(f.err, fmt.Errorf("%v : %v", ErrUndefinedFieldType, child.namePath()))
		default:
			child.field = buildArrowField(k, goType2Arrow(child, v), arrow.Metadata{}, true)
			f.assignChild(child)
		}
	}
	// This node itself is a struct of whatever children were typed.
	var fields []arrow.Field
	for _, c := range f.children {
		fields = append(fields, c.field)
	}
	f.arrowType = arrow.STRUCT
	f.field = arrow.Field{Name: f.name, Type: arrow.StructOf(fields...), Nullable: true}
}

// sliceElemType evaluates the slice type and returns an Arrow DataType
// to be used in building an Arrow Field. Only the first element is inspected.
func sliceElemType(f *fieldPos, v []any) arrow.DataType {
	switch ft := v[0].(type) {
	case map[string]any:
		child := f.newChild(f.name + ".elem")
		mapToArrow(child, ft)
		var fields []arrow.Field
		for _, c := range child.children {
			fields = append(fields, c.field)
		}
		f.assignChild(child)
		return arrow.StructOf(fields...)
	case []any:
		if len(ft) < 1 {
			// NOTE(review): GetExtensionType presumably yields nil for the
			// unregistered name "skip" — confirm callers handle that.
			f.err = errors.Join(f.err, fmt.Errorf("%v : %v", ErrUndefinedArrayElementType, f.namePath()))
			return arrow.GetExtensionType("skip")
		}
		child := f.newChild(f.name + ".elem")
		et := sliceElemType(child, v[0].([]any))
		f.assignChild(child)
		return arrow.ListOf(et)
	default:
		return goType2Arrow(f, v)
	}
	// Unreachable: every case above returns.
	return nil
}

// buildArrowField assembles an arrow.Field from its parts.
func buildArrowField(n string, t arrow.DataType, m arrow.Metadata, nullable bool) arrow.Field {
	return arrow.Field{
		Name:     n,
		Type:     t,
		Metadata: m,
		Nullable: nullable,
	}
}

// buildTypeMetadata builds arrow.Metadata from parallel key/value slices.
func buildTypeMetadata(k, v []string) arrow.Metadata {
	return arrow.NewMetadata(k, v)
}
--------------------------------------------------------------------------------
/types.go:
--------------------------------------------------------------------------------
package bodkin

import (
	"encoding/json"
	"fmt"
	"slices"
	"time"

	"github.com/apache/arrow-go/v18/arrow"
)

// goType2Arrow maps a Go type to an Arrow DataType.
// goType2Arrow maps the concrete Go type of gt to an Arrow DataType, also
// recording the chosen arrow.Type on f.arrowType. Strings may be further
// inferred as temporal or numeric types depending on the owner's
// inferTimeUnits / quotedValuesAreStrings settings. Unsupported types fall
// back to Binary and record ErrUndefinedFieldType on f.err.
func goType2Arrow(f *fieldPos, gt any) arrow.DataType {
	var dt arrow.DataType
	switch t := gt.(type) {
	case []any:
		// Infer from the first element only.
		return goType2Arrow(f, t[0])
	case json.Number:
		// Prefer Int64 when the number parses as an integer.
		if _, err := t.Int64(); err == nil {
			f.arrowType = arrow.INT64
			dt = arrow.PrimitiveTypes.Int64
		} else {
			f.arrowType = arrow.FLOAT64
			dt = arrow.PrimitiveTypes.Float64
		}
	case time.Time:
		f.arrowType = arrow.TIMESTAMP
		dt = arrow.FixedWidthTypes.Timestamp_us
	// either 32 or 64 bits
	case int:
		f.arrowType = arrow.INT64
		dt = arrow.PrimitiveTypes.Int64
	// the set of all signed 8-bit integers (-128 to 127)
	case int8:
		f.arrowType = arrow.INT8
		dt = arrow.PrimitiveTypes.Int8
	// the set of all signed 16-bit integers (-32768 to 32767)
	case int16:
		f.arrowType = arrow.INT16
		dt = arrow.PrimitiveTypes.Int16
	// the set of all signed 32-bit integers (-2147483648 to 2147483647)
	case int32:
		f.arrowType = arrow.INT32
		dt = arrow.PrimitiveTypes.Int32
	// the set of all signed 64-bit integers (-9223372036854775808 to 9223372036854775807)
	case int64:
		f.arrowType = arrow.INT64
		dt = arrow.PrimitiveTypes.Int64
	// either 32 or 64 bits
	case uint:
		f.arrowType = arrow.UINT64
		dt = arrow.PrimitiveTypes.Uint64
	// the set of all unsigned 8-bit integers (0 to 255)
	case uint8:
		f.arrowType = arrow.UINT8
		dt = arrow.PrimitiveTypes.Uint8
	// the set of all unsigned 16-bit integers (0 to 65535)
	case uint16:
		f.arrowType = arrow.UINT16
		dt = arrow.PrimitiveTypes.Uint16
	// the set of all unsigned 32-bit integers (0 to 4294967295)
	case uint32:
		f.arrowType = arrow.UINT32
		dt = arrow.PrimitiveTypes.Uint32
	// the set of all unsigned 64-bit integers (0 to 18446744073709551615)
	case uint64:
		f.arrowType = arrow.UINT64
		dt = arrow.PrimitiveTypes.Uint64
	// the set of all IEEE-754 32-bit floating-point numbers
	case float32:
		f.arrowType = arrow.FLOAT32
		dt = arrow.PrimitiveTypes.Float32
	// the set of all IEEE-754 64-bit floating-point numbers
	case float64:
		f.arrowType = arrow.FLOAT64
		dt = arrow.PrimitiveTypes.Float64
	case bool:
		f.arrowType = arrow.BOOL
		dt = arrow.FixedWidthTypes.Boolean
	case string:
		// Optionally infer temporal types from string shape.
		if f.owner.inferTimeUnits {
			for _, r := range timestampMatchers {
				if r.MatchString(t) {
					f.arrowType = arrow.TIMESTAMP
					return arrow.FixedWidthTypes.Timestamp_us
				}
			}
			if dateMatcher.MatchString(t) {
				f.arrowType = arrow.DATE32
				return arrow.FixedWidthTypes.Date32
			}
			if timeMatcher.MatchString(t) {
				f.arrowType = arrow.TIME64
				return arrow.FixedWidthTypes.Time64ns
			}
		}
		// Optionally infer bool/int/float from quoted values.
		if !f.owner.quotedValuesAreStrings {
			if slices.Contains(boolMatcher, t) {
				f.arrowType = arrow.BOOL
				return arrow.FixedWidthTypes.Boolean
			}
			if integerMatcher.MatchString(t) {
				f.arrowType = arrow.INT64
				return arrow.PrimitiveTypes.Int64
			}
			if floatMatcher.MatchString(t) {
				f.arrowType = arrow.FLOAT64
				return arrow.PrimitiveTypes.Float64
			}
		}
		f.arrowType = arrow.STRING
		dt = arrow.BinaryTypes.String
	case []byte:
		f.arrowType = arrow.BINARY
		dt = arrow.BinaryTypes.Binary
	// the set of all complex numbers with float32 real and imaginary parts
	case complex64:
		// TO-DO
		f.arrowType = arrow.NULL
		f.err = fmt.Errorf("%v : %v", ErrUndefinedFieldType, f.namePath())
		dt = arrow.BinaryTypes.Binary
	// the set of all complex numbers with float64 real and imaginary parts
	case complex128:
		// TO-DO
		f.arrowType = arrow.NULL
		f.err = fmt.Errorf("%v : %v", ErrUndefinedFieldType, f.namePath())
		dt = arrow.BinaryTypes.Binary
	case nil:
		f.arrowType = arrow.NULL
		f.err = fmt.Errorf("%v : %v", ErrUndefinedFieldType, f.namePath())
		dt = arrow.BinaryTypes.Binary
	default:
		// Catch-all for exotic unsupported types - ie. input field is a func
		f.arrowType = arrow.NULL
		f.err = fmt.Errorf("%v : %v", ErrUndefinedFieldType, f.namePath())
		dt = arrow.BinaryTypes.Binary
	}
	return dt
}

// arrowTypeID2Type maps an arrow.Type ID back to a concrete Arrow DataType,
// consulting f's children for nested (STRUCT/LIST) types.
func arrowTypeID2Type(f *fieldPos, t arrow.Type) arrow.DataType {
	var dt arrow.DataType
	switch t {
	// BOOL is a 1 bit, LSB bit-packed ordering
	case arrow.BOOL:
		dt = arrow.FixedWidthTypes.Boolean
	// the set of all signed 8-bit integers (-128 to 127)
	case arrow.INT8:
		dt = arrow.PrimitiveTypes.Int8
	// the set of all unsigned 8-bit integers (0 to 255)
	case arrow.UINT8:
		dt = arrow.PrimitiveTypes.Uint8
	// the set of all signed 16-bit integers (-32768 to 32767)
	case arrow.INT16:
		dt = arrow.PrimitiveTypes.Int16
	// the set of all unsigned 16-bit integers (0 to 65535)
	case arrow.UINT16:
		dt = arrow.PrimitiveTypes.Uint16
	// the set of all signed 32-bit integers (-2147483648 to 2147483647)
	case arrow.INT32:
		dt = arrow.PrimitiveTypes.Int32
	// the set of all unsigned 32-bit integers (0 to 4294967295)
	case arrow.UINT32:
		dt = arrow.PrimitiveTypes.Uint32
	// the set of all signed 64-bit integers (-9223372036854775808 to 9223372036854775807)
	case arrow.INT64:
		dt = arrow.PrimitiveTypes.Int64
	// the set of all unsigned 64-bit integers (0 to 18446744073709551615)
	case arrow.UINT64:
		dt = arrow.PrimitiveTypes.Uint64
	// the set of all IEEE-754 32-bit floating-point numbers
	case arrow.FLOAT32:
		dt = arrow.PrimitiveTypes.Float32
	// the set of all IEEE-754 64-bit floating-point numbers
	case arrow.FLOAT64:
		dt = arrow.PrimitiveTypes.Float64
	// TIMESTAMP is an exact timestamp encoded with int64 since UNIX epoch
	case arrow.TIMESTAMP:
		dt = arrow.FixedWidthTypes.Timestamp_us
	// DATE32 is int32 days since the UNIX epoch
	case arrow.DATE32:
		dt = arrow.FixedWidthTypes.Date32
	// TIME64 is a signed 64-bit integer, representing either microseconds or
	// nanoseconds since midnight
	case arrow.TIME64:
		dt = arrow.FixedWidthTypes.Time64ns
	// STRING is a UTF8 variable-length string
	case arrow.STRING:
		dt = arrow.BinaryTypes.String
	// BINARY is a Variable-length byte type (no guarantee of UTF8-ness)
	case arrow.BINARY:
		dt = arrow.BinaryTypes.Binary
	// NULL type having no physical storage
	case arrow.NULL:
		dt = arrow.BinaryTypes.Binary
	case arrow.STRUCT:
		var fields []arrow.Field
		for _, c := range f.children {
			fields = append(fields, c.field)
		}
		return arrow.StructOf(fields...)
	case arrow.LIST:
		// NOTE(review): this returns a StructOf rather than a ListOf —
		// identical to the STRUCT case above. Looks like a copy-paste slip;
		// confirm against callers before changing.
		var fields []arrow.Field
		for _, c := range f.children {
			fields = append(fields, c.field)
		}
		return arrow.StructOf(fields...)
	}
	return dt
}