├── .github └── workflows │ └── test.yml ├── .gitignore ├── .npmignore ├── LICENSE ├── README.md ├── docs ├── api │ ├── column.md │ ├── data-types.md │ ├── index.md │ ├── schema.md │ └── table.md ├── assets │ └── logo.svg └── index.md ├── eslint.config.js ├── package-lock.json ├── package.json ├── perf ├── build-perf.js ├── data.js ├── decode-perf.js ├── encode-perf.js ├── run-all.js └── util.js ├── rollup.config.js ├── src ├── batch-type.js ├── batch.js ├── build │ ├── buffer.js │ ├── builder.js │ ├── builders │ │ ├── batch.js │ │ ├── binary.js │ │ ├── bool.js │ │ ├── decimal.js │ │ ├── dictionary.js │ │ ├── fixed-size-binary.js │ │ ├── fixed-size-list.js │ │ ├── interval.js │ │ ├── list.js │ │ ├── map.js │ │ ├── run-end-encoded.js │ │ ├── struct.js │ │ ├── union.js │ │ ├── utf8.js │ │ ├── validity.js │ │ └── values.js │ ├── column-from-array.js │ ├── column-from-values.js │ ├── infer-type.js │ ├── table-from-arrays.js │ └── table-from-columns.js ├── column.js ├── constants.js ├── data-types.js ├── decode │ ├── block.js │ ├── data-type.js │ ├── decode-ipc.js │ ├── dictionary-batch.js │ ├── message.js │ ├── metadata.js │ ├── record-batch.js │ ├── schema.js │ └── table-from-ipc.js ├── encode │ ├── builder.js │ ├── data-type.js │ ├── dictionary-batch.js │ ├── encode-ipc.js │ ├── footer.js │ ├── message.js │ ├── metadata.js │ ├── record-batch.js │ ├── schema.js │ ├── sink.js │ └── table-to-ipc.js ├── index-types.ts ├── index.js ├── table.js ├── types.ts └── util │ ├── arrays.js │ ├── numbers.js │ ├── objects.js │ ├── read.js │ ├── strings.js │ └── struct.js ├── test ├── batch-test.js ├── column-from-array-test.js ├── column-from-values-test.js ├── data │ ├── binaryview.arrows │ ├── convert.arrows │ ├── decimal.arrow │ ├── decimal.arrows │ ├── decimal128.arrows │ ├── decimal256.arrows │ ├── decimal32.arrows │ ├── decimal64.arrows │ ├── empty.arrows │ ├── flights.arrow │ ├── flights.arrows │ ├── largelistview.arrows │ ├── listview.arrows │ ├── runendencoded.arrows │ ├── runendencoded64.arrows │ ├── scrabble.arrows │ └── utf8view.arrows ├── decode-ipc-test.js ├── duckdb-compat-test.js ├── encode-ipc-test.js ├── infer-type-test.js ├── table-from-arrays-test.js ├── table-from-ipc-test.js ├── table-test.js ├── table-to-ipc-test.js └── util │ ├── arrow-from-duckdb.js │ ├── data.js │ └── decimal.js └── tsconfig.json /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | 9 | jobs: 10 | test: 11 | runs-on: ubuntu-latest 12 | 13 | strategy: 14 | matrix: 15 | node: [20, 22] 16 | 17 | name: Node ${{ matrix.node }} 18 | 19 | steps: 20 | - uses: actions/checkout@v4 21 | 22 | - name: Setup Node ${{ matrix.node }} 23 | uses: actions/setup-node@v4 24 | with: 25 | node-version: ${{ matrix.node }} 26 | cache: "npm" 27 | 28 | - name: Install Node dependencies 29 | run: npm ci 30 | 31 | - name: Run linter 32 | run: npm run lint 33 | 34 | - name: Run tests 35 | run: npm run test 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | node_modules 3 | dist 4 | temp 5 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | .github 2 | .temp 3 | .DS_Store 4 | node_modules/ 5 | perf/ 6 | temp/ 7 | test/ 8 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2024, UW Interactive Data Lab 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | --- 31 | 32 | Portions of this software are derived from Apache Arrow JS 33 | (https://github.com/apache/arrow/blob/main/js/). 34 | 35 | Licensed under the Apache License, Version 2.0 (the "License"); 36 | you may not use this file except in compliance with the License. 37 | You may obtain a copy of the License at 38 | 39 | http://www.apache.org/licenses/LICENSE-2.0 40 | 41 | Unless required by applicable law or agreed to in writing, software 42 | distributed under the License is distributed on an "AS IS" BASIS, 43 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 44 | See the License for the specific language governing permissions and 45 | limitations under the License. 46 | 47 | --- 48 | 49 | Portions of this software are derived from FlatBuffers 50 | (https://github.com/google/flatbuffers). 51 | 52 | Licensed under the Apache License, Version 2.0 (the "License"); 53 | you may not use this file except in compliance with the License. 54 | You may obtain a copy of the License at 55 | 56 | http://www.apache.org/licenses/LICENSE-2.0 57 | 58 | Unless required by applicable law or agreed to in writing, software 59 | distributed under the License is distributed on an "AS IS" BASIS, 60 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 61 | See the License for the specific language governing permissions and 62 | limitations under the License. 63 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Flechette 2 | 3 | **Flechette** is a JavaScript library for reading and writing the [Apache Arrow](https://arrow.apache.org/) columnar in-memory data format. 
It provides a faster, lighter, zero-dependency alternative to the [Arrow JS reference implementation](https://github.com/apache/arrow/tree/main/js). 4 | 5 | Flechette performs fast extraction and encoding of data columns in the Arrow binary IPC format, supporting ingestion of Arrow data from sources such as [DuckDB](https://duckdb.org/) and Arrow use in JavaScript data analysis tools like [Arquero](https://github.com/uwdata/arquero), [Mosaic](https://github.com/uwdata/mosaic), [Observable Plot](https://observablehq.com/plot/), and [Vega-Lite](https://vega.github.io/vega-lite/). 6 | 7 | For documentation, see the [**API Reference**](https://idl.uw.edu/flechette/api). For code, see the [**Flechette GitHub repo**](https://github.com/uwdata/flechette). 8 | 9 | ## Why Flechette? 10 | 11 | In the process of developing multiple data analysis packages that consume Arrow data (including Arquero, Mosaic, and Vega), we've had to develop workarounds for the performance and correctness of the Arrow JavaScript reference implementation. Instead of workarounds, Flechette addresses these issues head-on. 12 | 13 | * _Speed_. Flechette provides better performance. Performance tests show 1.3-1.6x faster value iteration, 2-7x faster array extraction, 7-11x faster row object extraction, and 1.5-3.5x faster building of Arrow columns. 14 | 15 | * _Size_. Flechette is smaller: ~43k minified (~14k gzip'd) versus 163k minified (~43k gzip'd) for Arrow JS. Flechette's encoders and decoders also tree-shake cleanly, so you only pay for what you need in custom bundles. 16 | 17 | * _Coverage_. Flechette supports data types unsupported by the reference implementation, including decimal-to-number conversion, month/day/nanosecond time intervals (as used by DuckDB), run-end encoded data, binary views, and list views. 18 | 19 | * _Flexibility_. Flechette includes options to control data value conversion, such as numerical timestamps vs. Date objects for temporal data, and numbers vs. bigint values for 64-bit integer data. 20 | 21 | * _Simplicity_. Our goal is to provide a smaller, simpler code base in the hope that it will make it easier for ourselves and others to improve the library. If you'd like to see support for additional Arrow features, please [file an issue](https://github.com/uwdata/flechette/issues) or [open a pull request](https://github.com/uwdata/flechette/pulls). 22 | 23 | That said, no tool is without limitations or trade-offs. Flechette assumes simpler inputs (byte buffers, no promises or streams), has less strict TypeScript typings, and may have a slightly slower initial parse (as it decodes dictionary data upfront for faster downstream access). 24 | 25 | ## What's with the name? 26 | 27 | The project name stems from the French word [fléchette](https://en.wikipedia.org/wiki/Flechette), which means "little arrow" or "dart". 🎯 28 | 29 | ## Examples 30 | 31 | ### Load and Access Arrow Data 32 | 33 | ```js 34 | import { tableFromIPC } from '@uwdata/flechette'; 35 | 36 | const url = 'https://cdn.jsdelivr.net/npm/vega-datasets@2/data/flights-200k.arrow'; 37 | const ipc = await fetch(url).then(r => r.arrayBuffer()); 38 | const table = tableFromIPC(ipc); 39 | 40 | // print table size: (231083 x 3) 41 | console.log(`${table.numRows} x ${table.numCols}`); 42 | 43 | // inspect schema for column names, data types, etc. 
44 | // [ 45 | // { name: "delay", type: { typeId: 2, bitWidth: 16, signed: true }, ...}, 46 | // { name: "distance", type: { typeId: 2, bitWidth: 16, signed: true }, ...}, 47 | // { name: "time", type: { typeId: 3, precision: 1 }, ...} 48 | // ] 49 | // typeId: 2 === Type.Int, typeId: 3 === Type.Float 50 | console.log(JSON.stringify(table.schema.fields, 0, 2)); 51 | 52 | // convert a single Arrow column to a value array 53 | // when possible, zero-copy access to binary data is used 54 | const delay = table.getChild('delay').toArray(); 55 | 56 | // data columns are iterable 57 | const time = [...table.getChild('time')]; 58 | 59 | // data columns provide random access 60 | const time0 = table.getChild('time').at(0); 61 | 62 | // extract all columns into a { name: array, ... } object 63 | // { delay: Int16Array, distance: Int16Array, time: Float32Array } 64 | const columns = table.toColumns(); 65 | 66 | // convert Arrow data to an array of standard JS objects 67 | // [ { delay: 14, distance: 405, time: 0.01666666753590107 }, ... ] 68 | const objects = table.toArray(); 69 | 70 | // create a new table with a selected subset of columns 71 | // use this first to limit toColumns or toArray to fewer columns 72 | const subtable = table.select(['delay', 'time']); 73 | ``` 74 | 75 | ### Build and Encode Arrow Data 76 | 77 | ```js 78 | import { 79 | bool, dictionary, float32, int32, tableFromArrays, tableToIPC, utf8 80 | } from '@uwdata/flechette'; 81 | 82 | // data defined using standard JS types 83 | // both arrays and typed arrays work well 84 | const arrays = { 85 | ints: [1, 2, null, 4, 5], 86 | floats: [1.1, 2.2, 3.3, 4.4, 5.5], 87 | bools: [true, true, null, false, true], 88 | strings: ['a', 'b', 'c', 'b', 'a'] 89 | }; 90 | 91 | // create table with automatically inferred types 92 | const tableInfer = tableFromArrays(arrays); 93 | 94 | // encode table to bytes in Arrow IPC stream format 95 | const ipcInfer = tableToIPC(tableInfer); 96 | 97 | // create table using explicit types 98 | const tableTyped = tableFromArrays(arrays, { 99 | types: { 100 | ints: int32(), 101 | floats: float32(), 102 | bools: bool(), 103 | strings: dictionary(utf8()) 104 | } 105 | }); 106 | 107 | // encode table to bytes in Arrow IPC file format 108 | const ipcTyped = tableToIPC(tableTyped, { format: 'file' }); 109 | ``` 110 | 111 | ### Customize Data Extraction 112 | 113 | Data extraction can be customized using options provided to table generation methods. By default, temporal data is returned as numeric timestamps, 64-bit integers are coerced to numbers, map-typed data is returned as an array of [key, value] pairs, and struct/row objects are returned as vanilla JS objects with extracted property values. These defaults can be changed via conversion options that push (or remove) transformations to the underlying data batches. 114 | 115 | ```js 116 | const table = tableFromIPC(ipc, { 117 | useDate: true, // map dates and timestamps to Date objects 118 | useDecimalInt: true, // use BigInt for decimals, do not coerce to number 119 | useBigInt: true, // use BigInt for 64-bit ints, do not coerce to number 120 | useMap: true, // create Map objects for [key, value] pair lists 121 | useProxy: true // use zero-copy proxies for struct and table row objects 122 | }); 123 | ``` 124 | 125 | The same extraction options can be passed to `tableFromArrays`. For more, see the [**API Reference**](https://idl.uw.edu/flechette/api). 
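For example (a minimal sketch reusing the `arrays` data and `int32` import from the build example above), explicit types and conversion options can be combined in a single call:

```js
// build-time equivalent: the same conversion options apply here
const tableProxy = tableFromArrays(arrays, {
  types: { ints: int32() },
  useProxy: true // rows extracted from this table are proxy objects
});
```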
126 | 127 | ## Build Instructions 128 | 129 | To build and develop Flechette locally: 130 | 131 | - Clone https://github.com/uwdata/flechette. 132 | - Run `npm i` to install dependencies. 133 | - Run `npm test` to run test cases, `npm run perf` to run performance benchmarks, and `npm run build` to build output files. 134 | -------------------------------------------------------------------------------- /docs/api/column.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Column | API Reference 3 | --- 4 | # Flechette API Reference 5 | 6 | [Top-Level](/flechette/api) | [Data Types](data-types) | [Schema](schema) | [Table](table) | [**Column**](column) 7 | 8 | ## Column Class 9 | 10 | A data column. A column provides a view over one or more value batches, each corresponding to part of an Arrow record batch. The Column class supports random access to column values by integer index using the [`at`](#at) method; however, extracting arrays using [`toArray`](#toArray) may provide more performant means of bulk access and scanning. 11 | 12 | * [constructor](#constructor) 13 | * [type](#type) 14 | * [length](#length) 15 | * [nullCount](#nullCount) 16 | * [data](#data) 17 | * [at](#at) 18 | * [get](#get) 19 | * [toArray](#toArray) 20 | * [Symbol.iterator](#iterator) 21 | 22 |
# 23 | Column.constructor(data[, type]) 24 | 25 | Create a new column with the given data batches. 26 | 27 | * *data* (`Batch[]`): The column data batches. 28 | * *type* (`DataType`): The column [data type](data-types). If not specified, the type is extracted from the data batches. This argument is only needed to ensure correct types for "empty" columns without any data. 29 |
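Most users won't call the constructor directly; a minimal sketch using the top-level `columnFromArray` helper (the same call used in this repo's perf scripts) is shown below:

```js
import { columnFromArray, int32 } from '@uwdata/flechette';

// build a column of 32-bit integers from a standard JS array;
// null entries are tracked in the column validity data
const col = columnFromArray([1, 2, null, 4], int32());
```

30 |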
# 31 | Column.type 32 | 33 | The column [data type](data-types). 34 | 35 |
# 36 | Column.length 37 | 38 | The column length (number of rows). 39 | 40 |
# 41 | Column.nullCount 42 | 43 | The count of null values in the column. 44 | 45 |
# 46 | Column.data 47 | 48 | An array of column data batches. 49 | 50 |
# 51 | Column.at(index) 52 | 53 | Return the column value at the given *index*. The value type is determined by the column data type and extraction options; see the [data types](data-types#data-type-overview) documentation for more. 54 | 55 | If a column has multiple batches, this method performs binary search over the batch lengths to determine the batch from which to retrieve the value. The search makes lookup less efficient than a standard array access. If making multiple full scans of a column, consider extracting an array via `toArray()`. 56 | 57 | * *index* (`number`): The row index. 58 |
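For example (a minimal sketch, assuming a table with a `delay` column as in the README examples):

```js
const col = table.getChild('delay');

// random access: fine for occasional point lookups
const first = col.at(0);

// bulk scan: extract an array once, then index it directly
const values = col.toArray();
let sum = 0;
for (let i = 0; i < values.length; ++i) sum += values[i];
```

59 |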
# 60 | Column.get(index) 61 | 62 | Return the column value at the given *index*. This method is the same as [`at`](#at) and is provided for better compatibility with Apache Arrow JS. 63 | 64 |
# 65 | Column.toArray() 66 | 67 | Extract column values into a single array instance. The value type is determined by the column data type and extraction options; see the [data types](data-types#data-type-overview) documentation for more. When possible, a zero-copy subarray of the input Arrow data is returned; otherwise a typed array is used when the data type allows it. If a column contains `null` values, a standard `Array` is created and populated. 68 |
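For example (assuming the flights table from the README, where `delay` is a 16-bit integer column):

```js
// without nulls, this may be a zero-copy Int16Array view over the
// IPC bytes; with nulls, a standard Array (e.g. [14, null, ...])
const delay = table.getChild('delay').toArray();
```

69 |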
# 70 | Column\[Symbol.iterator\]() 71 | 72 | Return an iterator over the values in this column. The value type is determined by the column data type and extraction options; see the [data types](data-types#data-type-overview) documentation for more. 73 | -------------------------------------------------------------------------------- /docs/api/schema.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Schema | API Reference 3 | --- 4 | # Flechette API Reference 5 | 6 | [Top-Level](/flechette/api) | [Data Types](data-types) | [**Schema**](schema) | [Table](table) | [Column](column) 7 | 8 | ## Schema Object 9 | 10 | A schema describes the contents of a table, including all column names and types as well as any associated metadata. A schema is represented as a standard JavaScript object with the following properties: 11 | 12 | * *version* (`number`): A number indicating the Arrow version of the data, corresponding to the properties of the top-level `Version` object. When creating new tables, Flechette uses `Version.V5`. 13 | * *endianness* (`number`): The binary [endianness](https://en.wikipedia.org/wiki/Endianness) (byte order) of the Arrow data. By default, Flechette assumes the value `Endianness.Little`. 14 | * *fields* (`Field[]`): An array of field specifications. See the [field documentation](#field-object) for more details. 15 | * *metadata* (`Map`): Custom schema-level metadata annotations. 16 | 17 | ## Field Object 18 | 19 | A field describes the name, data type, and metadata for a collection of data values. A field may correspond to either a top-level column or nested data such as the content of a list, struct, or union type. A field is represented as a standard JavaScript object with the following properties: 20 | 21 | * *name* (`string`): The field name. 22 | * *type* (`DataType`): The field data type. See the [Data Types documentation](./data-types) for more. 23 | * *nullable* (`boolean`): Metadata flag indicating if the field values may be set to `null`. 24 | * *metadata* (`Map`): Custom field-level metadata annotations. 25 | 26 |
# 27 | field(name, type[, nullable, metadata]) 28 | 29 | Create a new field instance for use in a schema or type definition. A field represents a field name, data type, and additional metadata. Fields are used to represent child types within nested types like [List](#list), [Struct](#struct), and [Union](#union). 30 | 31 | * *name* (`string`): The field name. 32 | * *type* (`DataType`): The field [data type](./data-types). 33 | * *nullable* (`boolean`): Flag indicating if the field is nullable (default `true`). 34 | * *metadata* (`Map`): Custom field metadata annotations (default `null`). 35 | -------------------------------------------------------------------------------- /docs/api/table.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Table | API Reference 3 | --- 4 | # Flechette API Reference 5 | 6 | [Top-Level](/flechette/api) | [Data Types](data-types) | [Schema](schema) | [**Table**](table) | [Column](column) 7 | 8 | ## Table Class 9 | 10 | A table consists of named data [columns](#column) (or 'children'). To extract table data directly to JavaScript values, use [`toColumns()`](#toColumns) to produce an object that maps column names to extracted value arrays, or [`toArray()`](#toArray) to extract an array of row objects. Tables are [iterable](#iterator), iterating over row objects. While `toArray()` and [table iterators](#iterator) enable convenient use by tools that expect row objects, column-oriented processing is more efficient and thus recommended. Use [`getChild`](#getChild) or [`getChildAt`](#getChildAt) to access a specific [`Column`](column). 11 | 12 | * [constructor](#constructor) 13 | * [numCols](#numCols) 14 | * [numRows](#numRows) 15 | * [getChildAt](#getChildAt) 16 | * [getChild](#getChild) 17 | * [selectAt](#selectAt) 18 | * [select](#select) 19 | * [at](#at) 20 | * [get](#get) 21 | * [toColumns](#toColumns) 22 | * [toArray](#toArray) 23 | * [Symbol.iterator](#iterator) 24 | 25 |
# 26 | Table.constructor(schema, children[, useProxy]) 27 | 28 | Create a new table with the given *schema* and *children* columns. The column types and order *must* be consistent with the given *schema*. The [`tableFromArrays`](/flechette/api/#tableFromArrays) and [`tableFromColumns`](/flechette/api/#tableFromColumns) methods provide more convenient ways to construct a table. 29 | 30 | * *schema* (`Schema`): The table schema. 31 | * *children* (`Column[]`): The table columns. 32 | * *useProxy* (`boolean`): Flag indicating if zero-copy row proxy objects should be used to represent table rows instead of standard JavaScript objects (default `false`). Proxy objects can improve performance and reduce memory usage, but do not support convenient property enumeration (`Object.keys`, `Object.values`, `Object.entries`) or spreading (`{ ...object }`). A proxy object can be converted to a standard object by calling its `toJSON()` method. 33 |
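To illustrate the proxy behavior (a minimal sketch, assuming a table created with `useProxy: true`, e.g. via `tableFromIPC(ipc, { useProxy: true })`):

```js
const row = table.at(0);   // a zero-copy row proxy object
const obj = row.toJSON();  // convert to a standard JS object
const copy = { ...obj };   // spreading works on the converted object
```

34 |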
# 35 | Table.numCols 36 | 37 | The number of columns in the table. 38 | 39 |
# 40 | Table.numRows 41 | 42 | The number of rows in the table. 43 | 44 |
# 45 | Table.getChildAt(index) 46 | 47 | Return the child [column](column) at the given *index* position. 48 | 49 | * *index* (`number`): The column index. 50 | 51 |
# 52 | Table.getChild(name) 53 | 54 | Return the first child [column](column) with the given *name*. 55 | 56 | * *name* (`string`): The column name. 57 | 58 |
# 59 | Table.selectAt(indices[, as]) 60 | 61 | Construct a new table containing only columns at the specified *indices*. The order of columns in the new table matches the order of input *indices*. 62 | 63 | * *indices* (`number[]`): The indices of columns to keep. 64 | * *as* (`string[]`): Optional new names for the selected columns. 65 |
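For example:

```js
// keep the first and third columns, renaming them along the way
const sub = table.selectAt([0, 2], ['first', 'third']);
```

66 |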
# 67 | Table.select(names[, as]) 68 | 69 | Construct a new table containing only columns with the specified *names*. If columns have duplicate names, the first (with lowest index) is used. The order of columns in the new table matches the order of input *names*. 70 | 71 | * *names* (`string[]`): The names of columns to keep. 72 | * *as* (`string[]`): Optional new names for selected columns. 73 |
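For example (assuming the flights table from the README examples):

```js
// select columns by name; output order follows the input names
const sub = table.select(['time', 'delay']);

// optionally rename the selected columns
const renamed = table.select(['time', 'delay'], ['t', 'd']);
```

74 |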
# 75 | Table.at(index) 76 | 77 | Return a row object for the given *index*. The type of object (standard object or row proxy object) is determined by the table `useProxy` constructor argument. The property values of the object are determined by the column data types and extraction options; see the [data types](data-types#data-type-overview) documentation for more. 78 | 79 | * *index* (`number`): The row index. 80 |
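For example:

```js
// read the first row; for the flights data this may yield
// { delay: 14, distance: 405, time: 0.01666666753590107 }
const row = table.at(0);
```

81 |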
# 82 | Table.get(index) 83 | 84 | Return a row object for the given *index*. This method is the same as [`at`](#at) and is provided for better compatibility with Apache Arrow JS. 85 | 86 |
# 87 | Table.toColumns() 88 | 89 | Return an object that maps column names to extracted value arrays. The values in each array are determined by the column data types and extraction options; see the [data types](data-types#data-type-overview) documentation for more. 90 | 91 |
# 92 | Table.toArray() 93 | 94 | Return an array of objects representing the rows of this table. The type of object (standard object or row proxy object) is determined by the table `useProxy` constructor argument. The property values of the object are determined by the column data types and extraction options; see the [data types](data-types#data-type-overview) documentation for more. 95 | 96 |
# 97 | Table\[Symbol.iterator\]() 98 | 99 | Return an iterator over row objects representing the rows of this table. The type of object (standard object or row proxy object) is determined by the table `useProxy` constructor argument. The property values of the object are determined by the column data types and extraction options; see the [data types](data-types#data-type-overview) documentation for more. 100 | -------------------------------------------------------------------------------- /docs/assets/logo.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Flechette 2 | 3 | **Flechette** is a JavaScript library for reading and writing the [Apache Arrow](https://arrow.apache.org/) columnar in-memory data format. It provides a faster, lighter, zero-dependency alternative to the [Arrow JS reference implementation](https://github.com/apache/arrow/tree/main/js). 4 | 5 | Flechette performs fast extraction and encoding of data columns in the Arrow binary IPC format, supporting ingestion of Arrow data from sources such as [DuckDB](https://duckdb.org/) and Arrow use in JavaScript data analysis tools like [Arquero](https://github.com/uwdata/arquero), [Mosaic](https://github.com/uwdata/mosaic), [Observable Plot](https://observablehq.com/plot/), and [Vega-Lite](https://vega.github.io/vega-lite/). 6 | 7 | For documentation, see the [**API Reference**](api). For code, see the [**Flechette GitHub repo**](https://github.com/uwdata/flechette). 8 | 9 | ## Why Flechette? 10 | 11 | In the process of developing multiple data analysis packages that consume Arrow data (including Arquero, Mosaic, and Vega), we've had to develop workarounds for the performance and correctness of the Arrow JavaScript reference implementation. Instead of workarounds, Flechette addresses these issues head-on. 12 | 13 | * _Speed_. Flechette provides better performance. Performance tests show 1.3-1.6x faster value iteration, 2-7x faster array extraction, 7-11x faster row object extraction, and 1.5-3.5x faster building of Arrow columns. 14 | 15 | * _Size_. Flechette is smaller: ~43k minified (~14k gzip'd) versus 163k minified (~43k gzip'd) for Arrow JS. Flechette's encoders and decoders also tree-shake cleanly, so you only pay for what you need in custom bundles. 16 | 17 | * _Coverage_. Flechette supports data types unsupported by the reference implementation, including decimal-to-number conversion, month/day/nanosecond time intervals (as used by DuckDB), run-end encoded data, binary views, and list views. 18 | 19 | * _Flexibility_. Flechette includes options to control data value conversion, such as numerical timestamps vs. Date objects for temporal data, and numbers vs. bigint values for 64-bit integer data. 20 | 21 | * _Simplicity_. Our goal is to provide a smaller, simpler code base in the hope that it will make it easier for ourselves and others to improve the library. If you'd like to see support for additional Arrow features, please [file an issue](https://github.com/uwdata/flechette/issues) or [open a pull request](https://github.com/uwdata/flechette/pulls). 22 | 23 | That said, no tool is without limitations or trade-offs. 
Flechette assumes simpler inputs (byte buffers, no promises or streams), has less strict TypeScript typings, and may have a slightly slower initial parse (as it decodes dictionary data upfront for faster downstream access). 24 | 25 | ## What's with the name? 26 | 27 | The project name stems from the French word [fléchette](https://en.wikipedia.org/wiki/Flechette), which means "little arrow" or "dart". 🎯 28 | 29 | ## Examples 30 | 31 | ### Load and Access Arrow Data 32 | 33 | ```js 34 | import { tableFromIPC } from '@uwdata/flechette'; 35 | 36 | const url = 'https://cdn.jsdelivr.net/npm/vega-datasets@2/data/flights-200k.arrow'; 37 | const ipc = await fetch(url).then(r => r.arrayBuffer()); 38 | const table = tableFromIPC(ipc); 39 | 40 | // print table size: (231083 x 3) 41 | console.log(`${table.numRows} x ${table.numCols}`); 42 | 43 | // inspect schema for column names, data types, etc. 44 | // [ 45 | // { name: "delay", type: { typeId: 2, bitWidth: 16, signed: true }, ...}, 46 | // { name: "distance", type: { typeId: 2, bitWidth: 16, signed: true }, ...}, 47 | // { name: "time", type: { typeId: 3, precision: 1 }, ...} 48 | // ] 49 | // typeId: 2 === Type.Int, typeId: 3 === Type.Float 50 | console.log(JSON.stringify(table.schema.fields, 0, 2)); 51 | 52 | // convert a single Arrow column to a value array 53 | // when possible, zero-copy access to binary data is used 54 | const delay = table.getChild('delay').toArray(); 55 | 56 | // data columns are iterable 57 | const time = [...table.getChild('time')]; 58 | 59 | // data columns provide random access 60 | const time0 = table.getChild('time').at(0); 61 | 62 | // extract all columns into a { name: array, ... } object 63 | // { delay: Int16Array, distance: Int16Array, time: Float32Array } 64 | const columns = table.toColumns(); 65 | 66 | // convert Arrow data to an array of standard JS objects 67 | // [ { delay: 14, distance: 405, time: 0.01666666753590107 }, ... ] 68 | const objects = table.toArray(); 69 | 70 | // create a new table with a selected subset of columns 71 | // use this first to limit toColumns or toArray to fewer columns 72 | const subtable = table.select(['delay', 'time']); 73 | ``` 74 | 75 | ### Build and Encode Arrow Data 76 | 77 | ```js 78 | import { 79 | bool, dictionary, float32, int32, tableFromArrays, tableToIPC, utf8 80 | } from '@uwdata/flechette'; 81 | 82 | // data defined using standard JS types 83 | // both arrays and typed arrays work well 84 | const arrays = { 85 | ints: [1, 2, null, 4, 5], 86 | floats: [1.1, 2.2, 3.3, 4.4, 5.5], 87 | bools: [true, true, null, false, true], 88 | strings: ['a', 'b', 'c', 'b', 'a'] 89 | }; 90 | 91 | // create table with automatically inferred types 92 | const tableInfer = tableFromArrays(arrays); 93 | 94 | // encode table to bytes in Arrow IPC stream format 95 | const ipcInfer = tableToIPC(tableInfer); 96 | 97 | // create table using explicit types 98 | const tableTyped = tableFromArrays(arrays, { 99 | types: { 100 | ints: int32(), 101 | floats: float32(), 102 | bools: bool(), 103 | strings: dictionary(utf8()) 104 | } 105 | }); 106 | 107 | // encode table to bytes in Arrow IPC file format 108 | const ipcTyped = tableToIPC(tableTyped, { format: 'file' }); 109 | ``` 110 | 111 | ### Customize Data Extraction 112 | 113 | Data extraction can be customized using options provided to table generation methods. 
By default, temporal data is returned as numeric timestamps, 64-bit integers are coerced to numbers, map-typed data is returned as an array of [key, value] pairs, and struct/row objects are returned as vanilla JS objects with extracted property values. These defaults can be changed via conversion options that push (or remove) transformations to the underlying data batches. 114 | 115 | ```js 116 | const table = tableFromIPC(ipc, { 117 | useDate: true, // map dates and timestamps to Date objects 118 | useDecimalInt: true, // use scaled ints for decimals, not floating point 119 | useBigInt: true, // use BigInt for 64-bit ints, do not coerce to number 120 | useMap: true, // create Map objects for [key, value] pair lists 121 | useProxy: true // use zero-copy proxies for struct and table row objects 122 | }); 123 | ``` 124 | 125 | The same extraction options can be passed to `tableFromArrays`. For more, see the [**API Reference**](api). 126 | 127 | ## Build Instructions 128 | 129 | To build and develop Flechette locally: 130 | 131 | - Clone https://github.com/uwdata/flechette. 132 | - Run `npm i` to install dependencies. 133 | - Run `npm test` to run test cases, `npm run perf` to run performance benchmarks, and `npm run build` to build output files. 134 | -------------------------------------------------------------------------------- /eslint.config.js: -------------------------------------------------------------------------------- 1 | import js from '@eslint/js'; 2 | import globals from 'globals'; 3 | 4 | /** @type {import('@types/eslint').Linter.Config[]} */ 5 | export default [ 6 | js.configs.recommended, 7 | { 8 | languageOptions: { 9 | ecmaVersion: 2022, 10 | sourceType: "module", 11 | globals: { 12 | ...globals.browser, 13 | ...globals.mocha, 14 | ...globals.node, 15 | ...globals.es6, 16 | globalThis: false 17 | } 18 | }, 19 | rules: { 20 | "no-unexpected-multiline": "off" 21 | } 22 | } 23 | ]; 24 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@uwdata/flechette", 3 | "version": "2.0.0", 4 | "description": "Fast, lightweight access to Apache Arrow data.", 5 | "keywords": [ 6 | "arrow", 7 | "data", 8 | "access" 9 | ], 10 | "license": "BSD-3-Clause", 11 | "author": "Jeffrey Heer (https://idl.uw.edu)", 12 | "type": "module", 13 | "main": "./dist/flechette.cjs", 14 | "module": "./src/index.js", 15 | "jsdelivr": "./dist/flechette.min.js", 16 | "unpkg": "./dist/flechette.min.js", 17 | "types": "./dist/types/index-types.d.ts", 18 | "repository": { 19 | "type": "git", 20 | "url": "https://github.com/uwdata/flechette.git" 21 | }, 22 | "scripts": { 23 | "perf": "node perf/run-all.js", 24 | "perf:build": "node perf/build-perf.js", 25 | "perf:decode": "node perf/decode-perf.js", 26 | "perf:encode": "node perf/encode-perf.js", 27 | "prebuild": "rimraf dist && mkdir dist", 28 | "build": "rollup -c rollup.config.js", 29 | "types": "tsc --project tsconfig.json", 30 | "postbuild": "npm run types", 31 | "lint": "eslint src test", 32 | "test": "mocha 'test/**/*-test.js'", 33 | "prepublishOnly": "npm run test && npm run lint && npm run build" 34 | }, 35 | "devDependencies": { 36 | "@rollup/plugin-terser": "^0.4.4", 37 | "@uwdata/mosaic-duckdb": "^0.12.2", 38 | "apache-arrow": "^19.0.1", 39 | "eslint": "^9.21.0", 40 | "mocha": "^11.1.0", 41 | "rimraf": "^6.0.1", 42 | "rollup": "^4.34.8", 43 | "rollup-plugin-bundle-size": "^1.0.3", 44 | "typescript": "^5.8.2" 45 
| } 46 | } 47 | -------------------------------------------------------------------------------- /perf/build-perf.js: -------------------------------------------------------------------------------- 1 | import { Bool, DateDay, Dictionary, Float64, Int32, Utf8, vectorFromArray } from 'apache-arrow'; 2 | import { bool, columnFromArray, dateDay, dictionary, float64, int32, tableFromColumns, tableToIPC, utf8 } from '../src/index.js'; 3 | import { bools, dates, floats, ints, sample, strings, uniqueStrings } from './data.js'; 4 | import { benchmark } from './util.js'; 5 | 6 | function fl(data, typeKey) { 7 | const type = typeKey === 'bool' ? bool() 8 | : typeKey === 'int' ? int32() 9 | : typeKey === 'float' ? float64() 10 | : typeKey === 'utf8' ? utf8() 11 | : typeKey === 'date' ? dateDay() 12 | : typeKey === 'dict' ? dictionary(utf8(), int32()) 13 | : null; 14 | return columnFromArray(data, type); 15 | } 16 | 17 | function aa(data, typeKey) { 18 | const type = typeKey === 'bool' ? new Bool() 19 | : typeKey === 'int' ? new Int32() 20 | : typeKey === 'float' ? new Float64() 21 | : typeKey === 'utf8' ? new Utf8() 22 | : typeKey === 'date' ? new DateDay() 23 | : typeKey === 'dict' ? new Dictionary(new Utf8(), new Int32()) 24 | : null; 25 | return vectorFromArray(data, type); 26 | } 27 | 28 | function js(data) { 29 | return JSON.stringify(data); 30 | } 31 | 32 | function run(N, nulls, msg, iter = 5) { 33 | const int = ints(N, -10000, 10000, nulls); 34 | const float = floats(N, -10000, 10000, nulls); 35 | const bool = bools(N, nulls); 36 | const utf8 = strings(N, nulls); 37 | const date = dates(N, nulls); 38 | const dict = sample(N, uniqueStrings(100), nulls); 39 | 40 | console.log(`\n** Build performance for ${msg} **\n`); 41 | trial('Int Column', int, 'int', iter); 42 | trial('Float Column', float, 'float', iter); 43 | trial('Bool Column', bool, 'bool', iter); 44 | trial('Date Column', date, 'date', iter); 45 | trial('Utf8 Column', utf8, 'utf8', iter); 46 | trial('Dict Utf8 Column', dict, 'dict', iter); 47 | } 48 | 49 | function trial(task, data, typeKey, iter) { 50 | const jl = new TextEncoder().encode(JSON.stringify(data)).length; 51 | const al = tableToIPC(tableFromColumns({ v: fl(data, typeKey) })).length; 52 | const sz = `json ${(jl/1e6).toFixed(1)} MB, arrow ${(al/1e6).toFixed(1)} MB`; 53 | 54 | console.log(`${task} (${iter} iteration${iter === 1 ? '' : 's'}, ${sz})`); 55 | const j = benchmark(() => js(data, typeKey), iter); 56 | const a = benchmark(() => aa(data, typeKey), iter); 57 | const f = benchmark(() => fl(data, typeKey), iter); 58 | const p = Object.keys(a); 59 | 60 | console.table(p.map(key => ({ 61 | measure: key, 62 | json: +(j[key].toFixed(2)), 63 | 'arrow-js': +(a[key].toFixed(2)), 64 | flechette: +(f[key].toFixed(2)), 65 | ratio: +((a[key] / f[key]).toFixed(2)) 66 | }))); 67 | } 68 | 69 | run(1e6, 0, '1M values'); 70 | run(1e6, 0.05, '1M values, 5% nulls'); 71 | -------------------------------------------------------------------------------- /perf/data.js: -------------------------------------------------------------------------------- 1 | export function rint(min, max) { 2 | let delta = min; 3 | if (max === undefined) { 4 | min = 0; 5 | } else { 6 | delta = max - min; 7 | } 8 | return (min + delta * Math.random()) | 0; 9 | } 10 | 11 | export function ints(n, min, max, nullf) { 12 | const data = []; 13 | for (let i = 0; i < n; ++i) { 14 | const v = nullf && Math.random() < nullf ?
null : rint(min, max); 15 | data.push(v); 16 | } 17 | return data; 18 | } 19 | 20 | export function floats(n, min, max, nullf) { 21 | const data = []; 22 | const delta = max - min; 23 | for (let i = 0; i < n; ++i) { 24 | const v = nullf && Math.random() < nullf 25 | ? null 26 | : (min + delta * Math.random()); 27 | data.push(v); 28 | } 29 | return data; 30 | } 31 | 32 | export function dates(n, nullf) { 33 | const data = []; 34 | for (let i = 0; i < n; ++i) { 35 | const v = nullf && Math.random() < nullf 36 | ? null 37 | : new Date(Date.UTC(1970 + rint(0, 41), 0, rint(1, 366))); 38 | data.push(v); 39 | } 40 | return data; 41 | } 42 | 43 | export function uniqueStrings(n) { 44 | const c = 'bcdfghjlmpqrstvwxyz'; 45 | const v = 'aeiou'; 46 | const cn = c.length; 47 | const vn = v.length; 48 | const data = []; 49 | const map = {}; 50 | while (data.length < n) { 51 | const s = c[rint(cn)] 52 | + v[rint(vn)] + c[rint(cn)] + c[rint(cn)] 53 | + v[rint(vn)] + c[rint(cn)] + c[rint(cn)]; 54 | if (!map[s]) { 55 | data.push(s); 56 | map[s] = 1; 57 | } 58 | } 59 | return data; 60 | } 61 | 62 | export function strings(n, nullf) { 63 | const c = 'bcdfghjlmpqrstvwxyz'; 64 | const v = 'aeiou'; 65 | const cn = c.length; 66 | const vn = v.length; 67 | const data = []; 68 | while (data.length < n) { 69 | const s = nullf && Math.random() < nullf 70 | ? null 71 | : (c[rint(cn)] + v[rint(vn)] + c[rint(cn)] + c[rint(cn)]); 72 | data.push(s); 73 | } 74 | return data; 75 | } 76 | 77 | export function bools(n, nullf) { 78 | const data = []; 79 | for (let i = 0; i < n; ++i) { 80 | const v = nullf && Math.random() < nullf ? null : (Math.random() < 0.5); 81 | data.push(v); 82 | } 83 | return data; 84 | } 85 | 86 | export function sample(n, values, nullf) { 87 | const data = []; 88 | for (let i = 0; i < n; ++i) { 89 | const v = nullf && Math.random() < nullf 90 | ? 
null 91 | : values[~~(values.length * Math.random())]; 92 | data.push(v); 93 | } 94 | return data; 95 | } 96 | -------------------------------------------------------------------------------- /perf/decode-perf.js: -------------------------------------------------------------------------------- 1 | import { readFile } from 'node:fs/promises'; 2 | import { tableFromIPC as aaTable } from 'apache-arrow'; 3 | import { tableFromIPC as flTable } from '../src/index.js'; 4 | import { benchmark } from './util.js'; 5 | 6 | // table creation 7 | const fl = bytes => flTable(bytes, { useBigInt: true, useProxy: true }); 8 | const aa = bytes => aaTable(bytes); 9 | 10 | // decode ipc data to columns 11 | function decodeIPC(table) { 12 | return table.schema.fields.map((f, i) => table.getChildAt(i)); 13 | } 14 | 15 | // extract column arrays directly 16 | function extractArrays(table) { 17 | const n = table.numCols; 18 | const data = []; 19 | for (let i = 0; i < n; ++i) { 20 | data.push(table.getChildAt(i).toArray()); 21 | } 22 | return data; 23 | } 24 | 25 | // iterate over values for each column 26 | function iterateValues(table) { 27 | const names = table.schema.fields.map(f => f.name); 28 | names.forEach(name => Array.from(table.getChild(name))); 29 | } 30 | 31 | // random access to each column value 32 | // this will be slower if there are multiple record batches 33 | // due to the need for binary search over the offsets array 34 | function randomAccess(table) { 35 | const { numRows, numCols } = table; 36 | const vals = Array(numCols); 37 | for (let j = 0; j < numCols; ++j) { 38 | const col = table.getChildAt(j); 39 | for (let i = 0; i < numRows; ++i) { 40 | vals[j] = col.at(i); 41 | } 42 | } 43 | } 44 | 45 | // generate row objects, access each property 46 | function visitObjects(table) { 47 | const nr = table.numRows; 48 | const names = table.schema.fields.map(f => f.name); 49 | const obj = table.toArray(); 50 | for (let i = 0; i < nr; ++i) { 51 | const row = obj[i]; 52 | names.forEach(name => row[name]); 53 | } 54 | } 55 | 56 | function trial(task, name, bytes, method, iter) { 57 | console.log(`${task} (${name}, ${iter} iteration${iter === 1 ? 
'' : 's'})`); 58 | const a = benchmark(() => method(aa(bytes)), iter); 59 | const f = benchmark(() => method(fl(bytes)), iter); 60 | const p = Object.keys(a); 61 | console.table(p.map(key => ({ 62 | measure: key, 63 | 'arrow-js': +(a[key].toFixed(2)), 64 | flechette: +(f[key].toFixed(2)), 65 | ratio: +((a[key] / f[key]).toFixed(2)) 66 | }))); 67 | } 68 | 69 | async function run(file, iter = 5) { 70 | console.log(`\n** Decoding performance using ${file} **\n`); 71 | const bytes = new Uint8Array(await readFile(`test/data/${file}`)); 72 | trial('Decode Table from IPC', file, bytes, decodeIPC, iter); 73 | trial('Extract Arrays', file, bytes, extractArrays, iter); 74 | trial('Iterate Values', file, bytes, iterateValues, iter); 75 | trial('Random Access', file, bytes, randomAccess, iter); 76 | trial('Visit Row Objects', file, bytes, visitObjects, iter); 77 | } 78 | 79 | await run('flights.arrows'); 80 | await run('scrabble.arrows'); 81 | -------------------------------------------------------------------------------- /perf/encode-perf.js: -------------------------------------------------------------------------------- 1 | import { readFile } from 'node:fs/promises'; 2 | import { tableFromIPC as aaTable, tableToIPC as aaToIPC } from 'apache-arrow'; 3 | import { tableFromIPC as flTable, tableToIPC as flToIPC } from '../src/index.js'; 4 | import { benchmark } from './util.js'; 5 | 6 | // table encoding methods 7 | const fl = table => flToIPC(table); 8 | const aa = table => aaToIPC(table); 9 | 10 | function trial(task, name, bytes, iter) { 11 | console.log(`${task} (${name}, ${iter} iteration${iter === 1 ? '' : 's'})`); 12 | const aat = aaTable(bytes); 13 | const flt = flTable(bytes, { useBigInt: true }); 14 | const a = benchmark(() => aa(aat), iter); 15 | const f = benchmark(() => fl(flt), iter); 16 | const p = Object.keys(a); 17 | console.table(p.map(key => ({ 18 | measure: key, 19 | 'arrow-js': +(a[key].toFixed(2)), 20 | flechette: +(f[key].toFixed(2)), 21 | ratio: +((a[key] / f[key]).toFixed(2)) 22 | }))); 23 | } 24 | 25 | async function run(file, iter = 5) { 26 | console.log(`\n** Encoding performance using ${file} **\n`); 27 | const bytes = new Uint8Array(await readFile(`test/data/${file}`)); 28 | trial('Encode Table to IPC', file, bytes, iter); 29 | } 30 | 31 | await run('flights.arrows'); 32 | await run('scrabble.arrows'); 33 | -------------------------------------------------------------------------------- /perf/run-all.js: -------------------------------------------------------------------------------- 1 | import { spawn } from 'child_process'; 2 | 3 | async function node(cmdstr) { 4 | return new Promise((resolve, reject) => { 5 | const child = spawn('node', [cmdstr], { 6 | cwd: process.cwd(), 7 | detached: true, 8 | stdio: 'inherit' 9 | }); 10 | child.on('close', code => { 11 | if (code !== 0) reject(code); 12 | resolve(); 13 | }); 14 | }); 15 | } 16 | 17 | await node('./perf/decode-perf.js'); 18 | await node('./perf/encode-perf.js'); 19 | await node('./perf/build-perf.js'); 20 | -------------------------------------------------------------------------------- /perf/util.js: -------------------------------------------------------------------------------- 1 | import { performance } from 'node:perf_hooks'; 2 | 3 | export function timeit(fn) { 4 | const t0 = performance.now(); 5 | const value = fn(); 6 | const t1 = performance.now(); 7 | return { time: t1 - t0, value }; 8 | } 9 | 10 | export function benchmark(fn, iter = 10) { 11 | const times = []; 12 | for (let i = 0; i < iter; ++i) { 13 | 
times.push(timeit(fn).time); 14 | } 15 | return stats(times); 16 | } 17 | 18 | function stats(times) { 19 | const iter = times.length; 20 | const init = times[0]; 21 | let sum = init; 22 | let min = init; 23 | let max = init; 24 | for (let i = 1; i < iter; ++i) { 25 | const t = times[i]; 26 | sum += t; 27 | if (t < min) min = t; 28 | if (t > max) max = t; 29 | } 30 | const avg = sum / iter; 31 | return { avg, init, min, max }; 32 | } 33 | -------------------------------------------------------------------------------- /rollup.config.js: -------------------------------------------------------------------------------- 1 | import bundleSize from 'rollup-plugin-bundle-size'; 2 | import terser from '@rollup/plugin-terser'; 3 | 4 | export default [ 5 | { 6 | input: 'src/index.js', 7 | plugins: [ bundleSize() ], 8 | output: [ 9 | { 10 | file: 'dist/flechette.cjs', 11 | format: 'cjs' 12 | }, 13 | { 14 | file: 'dist/flechette.mjs', 15 | format: 'esm', 16 | }, 17 | { 18 | file: 'dist/flechette.min.js', 19 | format: 'umd', 20 | sourcemap: true, 21 | plugins: [ terser({ ecma: 2018 }) ], 22 | name: 'fl' 23 | } 24 | ] 25 | } 26 | ]; 27 | -------------------------------------------------------------------------------- /src/batch-type.js: -------------------------------------------------------------------------------- 1 | import { BinaryBatch, BinaryViewBatch, BoolBatch, DateBatch, DateDayBatch, DateDayMillisecondBatch, Decimal32NumberBatch, DecimalBigIntBatch, DecimalNumberBatch, DenseUnionBatch, DictionaryBatch, DirectBatch, FixedBinaryBatch, FixedListBatch, Float16Batch, Int64Batch, IntervalDayTimeBatch, IntervalMonthDayNanoBatch, LargeBinaryBatch, LargeListBatch, LargeListViewBatch, LargeUtf8Batch, ListBatch, ListViewBatch, MapBatch, MapEntryBatch, NullBatch, RunEndEncodedBatch, SparseUnionBatch, StructBatch, StructProxyBatch, TimestampMicrosecondBatch, TimestampMillisecondBatch, TimestampNanosecondBatch, TimestampSecondBatch, Utf8Batch, Utf8ViewBatch } from './batch.js'; 2 | import { DateUnit, IntervalUnit, TimeUnit, Type } from './constants.js'; 3 | import { invalidDataType } from './data-types.js'; 4 | 5 | export function batchType(type, options = {}) { 6 | const { typeId, bitWidth, precision, unit } = type; 7 | const { useBigInt, useDate, useDecimalInt, useMap, useProxy } = options; 8 | 9 | switch (typeId) { 10 | case Type.Null: return NullBatch; 11 | case Type.Bool: return BoolBatch; 12 | case Type.Int: 13 | case Type.Time: 14 | case Type.Duration: 15 | return useBigInt || bitWidth < 64 ? DirectBatch : Int64Batch; 16 | case Type.Float: 17 | return precision ? DirectBatch : Float16Batch; 18 | case Type.Date: 19 | return wrap( 20 | unit === DateUnit.DAY ? DateDayBatch : DateDayMillisecondBatch, 21 | useDate && DateBatch 22 | ); 23 | case Type.Timestamp: 24 | return wrap( 25 | unit === TimeUnit.SECOND ? TimestampSecondBatch 26 | : unit === TimeUnit.MILLISECOND ? TimestampMillisecondBatch 27 | : unit === TimeUnit.MICROSECOND ? TimestampMicrosecondBatch 28 | : TimestampNanosecondBatch, 29 | useDate && DateBatch 30 | ); 31 | case Type.Decimal: 32 | return bitWidth === 32 33 | ? (useDecimalInt ? DirectBatch : Decimal32NumberBatch) 34 | : (useDecimalInt ? DecimalBigIntBatch : DecimalNumberBatch); 35 | case Type.Interval: 36 | return unit === IntervalUnit.DAY_TIME ? IntervalDayTimeBatch 37 | : unit === IntervalUnit.YEAR_MONTH ? 
DirectBatch 38 | : IntervalMonthDayNanoBatch; 39 | case Type.FixedSizeBinary: return FixedBinaryBatch; 40 | case Type.Utf8: return Utf8Batch; 41 | case Type.LargeUtf8: return LargeUtf8Batch; 42 | case Type.Binary: return BinaryBatch; 43 | case Type.LargeBinary: return LargeBinaryBatch; 44 | case Type.BinaryView: return BinaryViewBatch; 45 | case Type.Utf8View: return Utf8ViewBatch; 46 | case Type.List: return ListBatch; 47 | case Type.LargeList: return LargeListBatch; 48 | case Type.Map: return useMap ? MapBatch : MapEntryBatch; 49 | case Type.ListView: return ListViewBatch; 50 | case Type.LargeListView: return LargeListViewBatch; 51 | case Type.FixedSizeList: return FixedListBatch; 52 | case Type.Struct: return useProxy ? StructProxyBatch : StructBatch; 53 | case Type.RunEndEncoded: return RunEndEncodedBatch; 54 | case Type.Dictionary: return DictionaryBatch; 55 | case Type.Union: return type.mode ? DenseUnionBatch : SparseUnionBatch; 56 | } 57 | throw new Error(invalidDataType(typeId)); 58 | } 59 | 60 | function wrap(BaseClass, WrapperClass) { 61 | return WrapperClass 62 | ? class WrapBatch extends WrapperClass { 63 | constructor(options) { 64 | super(new BaseClass(options)); 65 | } 66 | } 67 | : BaseClass; 68 | } 69 | -------------------------------------------------------------------------------- /src/build/buffer.js: -------------------------------------------------------------------------------- 1 | import { align, grow, uint8Array } from '../util/arrays.js'; 2 | 3 | /** 4 | * Create a new resizable buffer instance. 5 | * @param {import('../types.js').TypedArrayConstructor} [arrayType] 6 | * The array type. 7 | * @returns {Buffer} The buffer. 8 | */ 9 | export function buffer(arrayType) { 10 | return new Buffer(arrayType); 11 | } 12 | 13 | /** 14 | * Resizable byte buffer. 15 | */ 16 | export class Buffer { 17 | /** 18 | * Create a new resizable buffer instance. 19 | * @param {import('../types.js').TypedArrayConstructor} arrayType 20 | */ 21 | constructor(arrayType = uint8Array) { 22 | this.buf = new arrayType(512); 23 | } 24 | /** 25 | * Return the underlying data as a 64-bit aligned array of minimum size. 26 | * @param {number} size The desired minimum array size. 27 | * @returns {import('../types.js').TypedArray} The 64-bit aligned array. 28 | */ 29 | array(size) { 30 | return align(this.buf, size); 31 | } 32 | /** 33 | * Prepare for writes to the given index, resizing as necessary. 34 | * @param {number} index The array index to prepare to write to. 35 | */ 36 | prep(index) { 37 | if (index >= this.buf.length) { 38 | this.buf = grow(this.buf, index); 39 | } 40 | } 41 | /** 42 | * Return the value at the given index. 43 | * @param {number} index The array index. 44 | */ 45 | get(index) { 46 | return this.buf[index]; 47 | } 48 | /** 49 | * Set a value at the given index. 50 | * @param {number | bigint} value The value to set. 51 | * @param {number} index The index to write to. 52 | */ 53 | set(value, index) { 54 | this.prep(index); 55 | this.buf[index] = value; 56 | } 57 | /** 58 | * Write a byte array at the given index. The method should be called 59 | * only when the underlying buffer is of type Uint8Array. 60 | * @param {Uint8Array} bytes The byte array. 61 | * @param {number} index The starting index to write to. 62 | */ 63 | write(bytes, index) { 64 | this.prep(index + bytes.length); 65 | /** @type {Uint8Array} */ (this.buf).set(bytes, index); 66 | } 67 | } 68 | 69 | /** 70 | * Create a new resizable bitmap instance. 71 | * @returns {Bitmap} The bitmap buffer. 
72 | */ 73 | export function bitmap() { 74 | return new Bitmap(); 75 | } 76 | 77 | /** 78 | * Resizable bitmap buffer. 79 | */ 80 | export class Bitmap extends Buffer { 81 | /** 82 | * Set a bit to true at the given bitmap index. 83 | * @param {number} index The index to write to. 84 | */ 85 | set(index) { 86 | const i = index >> 3; 87 | this.prep(i); 88 | /** @type {Uint8Array} */ (this.buf)[i] |= (1 << (index % 8)); 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/build/builder.js: -------------------------------------------------------------------------------- 1 | import { batchType } from '../batch-type.js'; 2 | import { IntervalUnit, Type } from '../constants.js'; 3 | import { invalidDataType } from '../data-types.js'; 4 | import { isInt64ArrayType } from '../util/arrays.js'; 5 | import { toBigInt, toDateDay, toDecimal32, toFloat16, toTimestamp } from '../util/numbers.js'; 6 | import { BinaryBuilder } from './builders/binary.js'; 7 | import { BoolBuilder } from './builders/bool.js'; 8 | import { DecimalBuilder } from './builders/decimal.js'; 9 | import { DictionaryBuilder, dictionaryContext } from './builders/dictionary.js'; 10 | import { FixedSizeBinaryBuilder } from './builders/fixed-size-binary.js'; 11 | import { FixedSizeListBuilder } from './builders/fixed-size-list.js'; 12 | import { IntervalDayTimeBuilder, IntervalMonthDayNanoBuilder } from './builders/interval.js'; 13 | import { ListBuilder } from './builders/list.js'; 14 | import { MapBuilder } from './builders/map.js'; 15 | import { RunEndEncodedBuilder } from './builders/run-end-encoded.js'; 16 | import { StructBuilder } from './builders/struct.js'; 17 | import { DenseUnionBuilder, SparseUnionBuilder } from './builders/union.js'; 18 | import { Utf8Builder } from './builders/utf8.js'; 19 | import { DirectBuilder, Int64Builder, TransformBuilder } from './builders/values.js'; 20 | 21 | /** 22 | * Create a context object for shared builder state. 23 | * @param {import('../types.js').ExtractionOptions} [options] 24 | * Batch extraction options. 25 | * @param {ReturnType} [dictionaries] 26 | * Context object for tracking dictionaries. 27 | */ 28 | export function builderContext( 29 | options = {}, 30 | dictionaries = dictionaryContext() 31 | ) { 32 | return { 33 | batchType: type => batchType(type, options), 34 | builder(type) { return builder(type, this); }, 35 | dictionary(type) { return dictionaries.get(type, this); }, 36 | finish: () => dictionaries.finish(options) 37 | }; 38 | } 39 | 40 | /** 41 | * Returns a batch builder for the given type and builder context. 42 | * @param {import('../types.js').DataType} type A data type. 43 | * @param {ReturnType} [ctx] A builder context. 44 | * @returns {import('./builders/batch.js').BatchBuilder} 45 | */ 46 | export function builder(type, ctx = builderContext()) { 47 | const { typeId } = type; 48 | switch (typeId) { 49 | case Type.Int: 50 | case Type.Time: 51 | case Type.Duration: 52 | return isInt64ArrayType(type.values) 53 | ? new Int64Builder(type, ctx) 54 | : new DirectBuilder(type, ctx); 55 | case Type.Float: 56 | return type.precision 57 | ? new DirectBuilder(type, ctx) 58 | : new TransformBuilder(type, ctx, toFloat16) 59 | case Type.Binary: 60 | case Type.LargeBinary: 61 | return new BinaryBuilder(type, ctx); 62 | case Type.Utf8: 63 | case Type.LargeUtf8: 64 | return new Utf8Builder(type, ctx); 65 | case Type.Bool: 66 | return new BoolBuilder(type, ctx); 67 | case Type.Decimal: 68 | return type.bitWidth === 32 69 | ? 
new TransformBuilder(type, ctx, toDecimal32(type.scale)) 70 | : new DecimalBuilder(type, ctx); 71 | case Type.Date: 72 | return new TransformBuilder(type, ctx, type.unit ? toBigInt : toDateDay); 73 | case Type.Timestamp: 74 | return new TransformBuilder(type, ctx, toTimestamp(type.unit)); 75 | case Type.Interval: 76 | switch (type.unit) { 77 | case IntervalUnit.DAY_TIME: 78 | return new IntervalDayTimeBuilder(type, ctx); 79 | case IntervalUnit.MONTH_DAY_NANO: 80 | return new IntervalMonthDayNanoBuilder(type, ctx); 81 | } 82 | // case IntervalUnit.YEAR_MONTH: 83 | return new DirectBuilder(type, ctx); 84 | case Type.List: 85 | case Type.LargeList: 86 | return new ListBuilder(type, ctx); 87 | case Type.Struct: 88 | return new StructBuilder(type, ctx); 89 | case Type.Union: 90 | return type.mode 91 | ? new DenseUnionBuilder(type, ctx) 92 | : new SparseUnionBuilder(type, ctx); 93 | case Type.FixedSizeBinary: 94 | return new FixedSizeBinaryBuilder(type, ctx); 95 | case Type.FixedSizeList: 96 | return new FixedSizeListBuilder(type, ctx); 97 | case Type.Map: 98 | return new MapBuilder(type, ctx); 99 | case Type.RunEndEncoded: 100 | return new RunEndEncodedBuilder(type, ctx); 101 | 102 | case Type.Dictionary: 103 | return new DictionaryBuilder(type, ctx); 104 | } 105 | // case Type.BinaryView: 106 | // case Type.Utf8View: 107 | // case Type.ListView: 108 | // case Type.LargeListView: 109 | throw new Error(invalidDataType(typeId)); 110 | } 111 | -------------------------------------------------------------------------------- /src/build/builders/batch.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Abstract class for building a column data batch. 3 | */ 4 | export class BatchBuilder { 5 | constructor(type, ctx) { 6 | this.type = type; 7 | this.ctx = ctx; 8 | this.batchClass = ctx.batchType(type); 9 | } 10 | 11 | /** 12 | * Initialize the builder state. 13 | * @returns {this} This builder. 14 | */ 15 | init() { 16 | this.index = -1; 17 | return this; 18 | } 19 | 20 | /** 21 | * Write a value to the builder. 22 | * @param {*} value 23 | * @param {number} index 24 | * @returns {boolean | void} 25 | */ 26 | set(value, index) { 27 | this.index = index; 28 | return false; 29 | } 30 | 31 | /** 32 | * Returns a batch constructor options object. 33 | * Used internally to marshal batch data. 34 | * @returns {Record} 35 | */ 36 | done() { 37 | return null; 38 | } 39 | 40 | /** 41 | * Returns a completed batch and reinitializes the builder state. 42 | * @returns {import('../../batch.js').Batch} 43 | */ 44 | batch() { 45 | const b = new this.batchClass(this.done()); 46 | this.init(); 47 | return b; 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/build/builders/binary.js: -------------------------------------------------------------------------------- 1 | import { toOffset } from '../../util/numbers.js'; 2 | import { buffer } from '../buffer.js'; 3 | import { ValidityBuilder } from './validity.js'; 4 | 5 | /** 6 | * Builder for batches of binary-typed data. 
7 | */ 8 | export class BinaryBuilder extends ValidityBuilder { 9 | constructor(type, ctx) { 10 | super(type, ctx); 11 | this.toOffset = toOffset(type.offsets); 12 | } 13 | 14 | init() { 15 | this.offsets = buffer(this.type.offsets); 16 | this.values = buffer(); 17 | this.pos = 0; 18 | return super.init(); 19 | } 20 | 21 | set(value, index) { 22 | const { offsets, values, toOffset } = this; 23 | if (super.set(value, index)) { 24 | values.write(value, this.pos); 25 | this.pos += value.length; 26 | } 27 | offsets.set(toOffset(this.pos), index + 1); 28 | } 29 | 30 | done() { 31 | return { 32 | ...super.done(), 33 | offsets: this.offsets.array(this.index + 2), 34 | values: this.values.array(this.pos + 1) 35 | }; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/build/builders/bool.js: -------------------------------------------------------------------------------- 1 | import { bitmap } from '../buffer.js'; 2 | import { ValidityBuilder } from './validity.js'; 3 | 4 | /** 5 | * Builder for batches of bool-typed data. 6 | */ 7 | export class BoolBuilder extends ValidityBuilder { 8 | constructor(type, ctx) { 9 | super(type, ctx); 10 | } 11 | 12 | init() { 13 | this.values = bitmap(); 14 | return super.init(); 15 | } 16 | 17 | set(value, index) { 18 | super.set(value, index); 19 | if (value) this.values.set(index); 20 | } 21 | 22 | done() { 23 | return { 24 | ...super.done(), 25 | values: this.values.array((this.index >> 3) + 1) 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/build/builders/decimal.js: -------------------------------------------------------------------------------- 1 | import { toDecimal } from '../../util/numbers.js'; 2 | import { buffer } from '../buffer.js'; 3 | import { ValidityBuilder } from './validity.js'; 4 | 5 | /** 6 | * Builder for batches of decimal-typed data (64-bits or more). 7 | */ 8 | export class DecimalBuilder extends ValidityBuilder { 9 | constructor(type, ctx) { 10 | super(type, ctx); 11 | this.scale = 10 ** type.scale; 12 | this.stride = type.bitWidth >> 6; 13 | } 14 | 15 | init() { 16 | this.values = buffer(this.type.values); 17 | return super.init(); 18 | } 19 | 20 | set(value, index) { 21 | const { scale, stride, values } = this; 22 | if (super.set(value, index)) { 23 | values.prep((index + 1) * stride); 24 | // @ts-ignore 25 | toDecimal(value, values.buf, index * stride, stride, scale); 26 | } 27 | } 28 | 29 | done() { 30 | const { index, stride, values } = this; 31 | return { 32 | ...super.done(), 33 | values: values.array((index + 1) * stride) 34 | }; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/build/builders/dictionary.js: -------------------------------------------------------------------------------- 1 | import { Column } from '../../column.js'; 2 | import { keyString } from '../../util/strings.js'; 3 | import { batchType } from '../../batch-type.js'; 4 | import { buffer } from '../buffer.js'; 5 | import { ValidityBuilder } from './validity.js'; 6 | 7 | /** 8 | * Create a context object for managing dictionary builders. 9 | */ 10 | export function dictionaryContext() { 11 | const idMap = new Map; 12 | const dicts = new Set; 13 | return { 14 | /** 15 | * Get a dictionary values builder for the given dictionary type. 16 | * @param {import('../../types.js').DictionaryType} type 17 | * The dictionary type. 18 | * @param {*} ctx The builder context. 
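 * @example
 * // editor's sketch: dictionary types with the same non-negative id
 * // resolve to one shared values builder (signature per data-types.js)
 * const dicts = dictionaryContext();
 * const type = dictionary(utf8(), int32(), false, 1); // explicit id 1
 * dicts.get(type, ctx) === dicts.get(type, ctx); // true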
19 | * @returns {ReturnType} 20 | */ 21 | get(type, ctx) { 22 | // if a dictionary has a non-negative id, assume it was set 23 | // intentionally and track it for potential reuse across columns 24 | // otherwise the dictionary is used for a single column only 25 | const id = type.id; 26 | if (id >= 0 && idMap.has(id)) { 27 | return idMap.get(id); 28 | } else { 29 | const dict = dictionaryValues(type, ctx); 30 | if (id >= 0) idMap.set(id, dict); 31 | dicts.add(dict); 32 | return dict; 33 | } 34 | }, 35 | /** 36 | * Finish building dictionary values columns and assign them to 37 | * their corresponding dictionary batches. 38 | * @param {import('../../types.js').ExtractionOptions} options 39 | */ 40 | finish(options) { 41 | dicts.forEach(dict => dict.finish(options)); 42 | } 43 | }; 44 | } 45 | 46 | /** 47 | * Builder helper for creating dictionary values. 48 | * @param {import('../../types.js').DictionaryType} type 49 | * The dictionary data type. 50 | * @param {ReturnType} ctx 51 | * The builder context. 52 | */ 53 | export function dictionaryValues(type, ctx) { 54 | const keys = Object.create(null); 55 | const values = ctx.builder(type.dictionary); 56 | const batches = []; 57 | 58 | values.init(); 59 | let index = -1; 60 | 61 | return { 62 | type, 63 | values, 64 | 65 | add(batch) { 66 | batches.push(batch); 67 | return batch; 68 | }, 69 | 70 | key(value) { 71 | const v = keyString(value); 72 | let k = keys[v]; 73 | if (k === undefined) { 74 | keys[v] = k = ++index; 75 | values.set(value, k); 76 | } 77 | return k; 78 | }, 79 | 80 | finish(options) { 81 | const valueType = type.dictionary; 82 | const batch = new (batchType(valueType, options))(values.done()); 83 | const dictionary = new Column([batch]); 84 | batches.forEach(batch => batch.setDictionary(dictionary)); 85 | } 86 | }; 87 | } 88 | 89 | /** 90 | * Builder for dictionary-typed data batches. 91 | */ 92 | export class DictionaryBuilder extends ValidityBuilder { 93 | constructor(type, ctx) { 94 | super(type, ctx); 95 | this.dict = ctx.dictionary(type); 96 | } 97 | 98 | init() { 99 | this.values = buffer(this.type.indices.values); 100 | return super.init(); 101 | } 102 | 103 | set(value, index) { 104 | if (super.set(value, index)) { 105 | this.values.set(this.dict.key(value), index); 106 | } 107 | } 108 | 109 | done() { 110 | return { 111 | ...super.done(), 112 | values: this.values.array(this.index + 1) 113 | }; 114 | } 115 | 116 | batch() { 117 | // register batch with dictionary 118 | // batch will be updated when the dictionary is finished 119 | return this.dict.add(super.batch()); 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /src/build/builders/fixed-size-binary.js: -------------------------------------------------------------------------------- 1 | import { buffer } from '../buffer.js'; 2 | import { ValidityBuilder } from './validity.js'; 3 | 4 | /** 5 | * Builder for fixed-size-binary-typed data batches. 
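 *
 * Editor's note: each value occupies a fixed `stride` bytes, so value i
 * is written at byte offset i * stride and no offsets buffer is needed.
 * @example
 * // hypothetical: two-byte values via the builder dispatch
 * builder(fixedSizeBinary(2), builderContext()).init()
 *   .set(Uint8Array.of(0xca, 0xfe), 0); // occupies bytes 0-1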
6 | */ 7 | export class FixedSizeBinaryBuilder extends ValidityBuilder { 8 | constructor(type, ctx) { 9 | super(type, ctx); 10 | this.stride = type.stride; 11 | } 12 | 13 | init() { 14 | this.values = buffer(); 15 | return super.init(); 16 | } 17 | 18 | set(value, index) { 19 | if (super.set(value, index)) { 20 | this.values.write(value, index * this.stride); 21 | } 22 | } 23 | 24 | done() { 25 | const { stride, values } = this; 26 | return { 27 | ...super.done(), 28 | values: values.array(stride * (this.index + 1)) 29 | }; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/build/builders/fixed-size-list.js: -------------------------------------------------------------------------------- 1 | import { ValidityBuilder } from './validity.js'; 2 | 3 | /** 4 | * Builder for fixed-size-list-typed data batches. 5 | */ 6 | export class FixedSizeListBuilder extends ValidityBuilder { 7 | constructor(type, ctx) { 8 | super(type, ctx); 9 | this.child = ctx.builder(this.type.children[0].type); 10 | this.stride = type.stride; 11 | } 12 | 13 | init() { 14 | this.child.init(); 15 | return super.init(); 16 | } 17 | 18 | set(value, index) { 19 | const { child, stride } = this; 20 | const base = index * stride; 21 | if (super.set(value, index)) { 22 | for (let i = 0; i < stride; ++i) { 23 | child.set(value[i], base + i); 24 | } 25 | } else { 26 | child.index = base + stride; 27 | } 28 | } 29 | 30 | done() { 31 | const { child } = this; 32 | return { 33 | ...super.done(), 34 | children: [ child.batch() ] 35 | }; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/build/builders/interval.js: -------------------------------------------------------------------------------- 1 | import { toMonthDayNanoBytes } from '../../util/numbers.js'; 2 | import { buffer } from '../buffer.js'; 3 | import { ValidityBuilder } from './validity.js'; 4 | 5 | /** 6 | * Builder for day/time interval-typed data batches. 7 | */ 8 | export class IntervalDayTimeBuilder extends ValidityBuilder { 9 | init() { 10 | this.values = buffer(this.type.values); 11 | return super.init(); 12 | } 13 | 14 | set(value, index) { 15 | if (super.set(value, index)) { 16 | const i = index << 1; 17 | this.values.set(value[0], i); 18 | this.values.set(value[1], i + 1); 19 | } 20 | } 21 | 22 | done() { 23 | return { 24 | ...super.done(), 25 | values: this.values.array((this.index + 1) << 1) 26 | } 27 | } 28 | } 29 | 30 | /** 31 | * Builder for month/day/nano interval-typed data batches. 32 | */ 33 | export class IntervalMonthDayNanoBuilder extends ValidityBuilder { 34 | init() { 35 | this.values = buffer(); 36 | return super.init(); 37 | } 38 | 39 | set(value, index) { 40 | if (super.set(value, index)) { 41 | this.values.write(toMonthDayNanoBytes(value), index << 4); 42 | } 43 | } 44 | 45 | done() { 46 | return { 47 | ...super.done(), 48 | values: this.values.array((this.index + 1) << 4) 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/build/builders/list.js: -------------------------------------------------------------------------------- 1 | import { toOffset } from '../../util/numbers.js'; 2 | import { buffer } from '../buffer.js'; 3 | import { ValidityBuilder } from './validity.js'; 4 | 5 | /** 6 | * Abstract class for building list data batches. 
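 *
 * Editor's note: child values are appended to a single flat child
 * builder while an offsets buffer marks list boundaries; e.g. building
 * [[1, 2], [3]] writes child values 1, 2, 3 and offsets [0, 2, 3].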
7 | */ 8 | export class AbstractListBuilder extends ValidityBuilder { 9 | constructor(type, ctx, child) { 10 | super(type, ctx); 11 | this.child = child; 12 | } 13 | 14 | init() { 15 | this.child.init(); 16 | const offsetType = this.type.offsets; 17 | this.offsets = buffer(offsetType); 18 | this.toOffset = toOffset(offsetType); 19 | this.pos = 0; 20 | return super.init(); 21 | } 22 | 23 | done() { 24 | return { 25 | ...super.done(), 26 | offsets: this.offsets.array(this.index + 2), 27 | children: [ this.child.batch() ] 28 | }; 29 | } 30 | } 31 | 32 | /** 33 | * Builder for list-typed data batches. 34 | */ 35 | export class ListBuilder extends AbstractListBuilder { 36 | constructor(type, ctx) { 37 | super(type, ctx, ctx.builder(type.children[0].type)); 38 | } 39 | 40 | set(value, index) { 41 | const { child, offsets, toOffset } = this; 42 | if (super.set(value, index)) { 43 | value.forEach(v => child.set(v, this.pos++)); 44 | } 45 | offsets.set(toOffset(this.pos), index + 1); 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/build/builders/map.js: -------------------------------------------------------------------------------- 1 | import { AbstractListBuilder } from './list.js'; 2 | import { AbstractStructBuilder } from './struct.js'; 3 | 4 | /** 5 | * Builder for map-typed data batches. 6 | */ 7 | export class MapBuilder extends AbstractListBuilder { 8 | constructor(type, ctx) { 9 | super(type, ctx, new MapStructBuilder(type.children[0].type, ctx)); 10 | } 11 | 12 | set(value, index) { 13 | const { child, offsets, toOffset } = this; 14 | if (super.set(value, index)) { 15 | for (const keyValuePair of value) { 16 | child.set(keyValuePair, this.pos++); 17 | } 18 | } 19 | offsets.set(toOffset(this.pos), index + 1); 20 | } 21 | } 22 | 23 | /** 24 | * Builder for key-value struct batches within a map. 25 | */ 26 | class MapStructBuilder extends AbstractStructBuilder { 27 | set(value, index) { 28 | super.set(value, index); 29 | const [key, val] = this.children; 30 | key.set(value[0], index); 31 | val.set(value[1], index); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/build/builders/run-end-encoded.js: -------------------------------------------------------------------------------- 1 | import { keyString } from '../../util/strings.js'; 2 | import { BatchBuilder } from './batch.js'; 3 | 4 | const NO_VALUE = {}; // empty object that fails strict equality 5 | 6 | /** 7 | * Builder for run-end-encoded-typed data batches. 
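 *
 * Editor's note: consecutive equal values collapse into runs, compared
 * first by strict identity and then by key string; e.g. the values
 * ['a', 'a', 'b'] produce run ends [2, 3] and run values ['a', 'b'].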
8 | */ 9 | export class RunEndEncodedBuilder extends BatchBuilder { 10 | constructor(type, ctx) { 11 | super(type, ctx); 12 | this.children = type.children.map(c => ctx.builder(c.type)); 13 | } 14 | 15 | init() { 16 | this.pos = 0; 17 | this.key = null; 18 | this.value = NO_VALUE; 19 | this.children.forEach(c => c.init()); 20 | return super.init(); 21 | } 22 | 23 | next() { 24 | const [runs, vals] = this.children; 25 | runs.set(this.index + 1, this.pos); 26 | vals.set(this.value, this.pos++); 27 | } 28 | 29 | set(value, index) { 30 | // perform fast strict equality test 31 | if (value !== this.value) { 32 | // if no match, fallback to key string test 33 | const key = keyString(value); 34 | if (key !== this.key) { 35 | // if key doesn't match, write prior run and update 36 | if (this.key) this.next(); 37 | this.key = key; 38 | this.value = value; 39 | } 40 | } 41 | this.index = index; 42 | } 43 | 44 | done() { 45 | this.next(); 46 | const { children, index, type } = this; 47 | return { 48 | length: index + 1, 49 | nullCount: 0, 50 | type, 51 | children: children.map(c => c.batch()) 52 | }; 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/build/builders/struct.js: -------------------------------------------------------------------------------- 1 | import { ValidityBuilder } from './validity.js'; 2 | 3 | /** 4 | * Abstract class for building struct-typed data batches. 5 | */ 6 | export class AbstractStructBuilder extends ValidityBuilder { 7 | constructor(type, ctx) { 8 | super(type, ctx); 9 | this.children = type.children.map(c => ctx.builder(c.type)); 10 | } 11 | 12 | init() { 13 | this.children.forEach(c => c.init()); 14 | return super.init(); 15 | } 16 | 17 | done() { 18 | const { children } = this; 19 | children.forEach(c => c.index = this.index); 20 | return { 21 | ...super.done(), 22 | children: children.map(c => c.batch()) 23 | }; 24 | } 25 | } 26 | 27 | /** 28 | * Builder for struct-typed data batches. 29 | */ 30 | export class StructBuilder extends AbstractStructBuilder { 31 | constructor(type, ctx) { 32 | super(type, ctx); 33 | this.setters = this.children.map((child, i) => { 34 | const name = type.children[i].name; 35 | return (value, index) => child.set(value?.[name], index); 36 | }); 37 | } 38 | 39 | set(value, index) { 40 | super.set(value, index); 41 | const setters = this.setters; 42 | for (let i = 0; i < setters.length; ++i) { 43 | setters[i](value, index); 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/build/builders/union.js: -------------------------------------------------------------------------------- 1 | import { int8Array } from '../../util/arrays.js'; 2 | import { BatchBuilder } from './batch.js'; 3 | import { buffer } from '../buffer.js'; 4 | 5 | /** 6 | * Abstract class for building union-typed data batches.
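 *
 * Editor's note: each value is mapped to a type id and routed to one
 * child builder; sparse unions also write null to every non-selected
 * child per row, while dense unions append to the selected child only
 * and record the child-relative offset.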
7 | */ 8 | export class AbstractUnionBuilder extends BatchBuilder { 9 | constructor(type, ctx) { 10 | super(type, ctx); 11 | this.children = type.children.map(c => ctx.builder(c.type)); 12 | this.typeMap = type.typeMap; 13 | this.lookup = type.typeIdForValue; 14 | } 15 | 16 | init() { 17 | this.nullCount = 0; 18 | this.typeIds = buffer(int8Array); 19 | this.children.forEach(c => c.init()); 20 | return super.init(); 21 | } 22 | 23 | set(value, index) { 24 | const { children, lookup, typeMap, typeIds } = this; 25 | this.index = index; 26 | const typeId = lookup(value, index); 27 | const child = children[typeMap[typeId]]; 28 | typeIds.set(typeId, index); 29 | if (value == null) ++this.nullCount; 30 | // @ts-ignore 31 | this.update(value, index, child); 32 | } 33 | 34 | done() { 35 | const { children, nullCount, type, typeIds } = this; 36 | const length = this.index + 1; 37 | return { 38 | length, 39 | nullCount, 40 | type, 41 | typeIds: typeIds.array(length), 42 | children: children.map(c => c.batch()) 43 | }; 44 | } 45 | } 46 | 47 | /** 48 | * Builder for sparse union-typed data batches. 49 | */ 50 | export class SparseUnionBuilder extends AbstractUnionBuilder { 51 | update(value, index, child) { 52 | // update selected child with value 53 | // then set all other children to null 54 | child.set(value, index); 55 | this.children.forEach(c => { if (c !== child) c.set(null, index) }); 56 | } 57 | } 58 | 59 | /** 60 | * Builder for dense union-typed data batches. 61 | */ 62 | export class DenseUnionBuilder extends AbstractUnionBuilder { 63 | init() { 64 | this.offsets = buffer(this.type.offsets); 65 | return super.init(); 66 | } 67 | 68 | update(value, index, child) { 69 | const offset = child.index + 1; 70 | child.set(value, offset); 71 | this.offsets.set(offset, index); 72 | } 73 | 74 | done() { 75 | return { 76 | ...super.done(), 77 | offsets: this.offsets.array(this.index + 1) 78 | }; 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/build/builders/utf8.js: -------------------------------------------------------------------------------- 1 | import { encodeUtf8 } from '../../util/strings.js'; 2 | import { BinaryBuilder } from './binary.js'; 3 | 4 | /** 5 | * Builder for utf8-typed data batches. 6 | */ 7 | export class Utf8Builder extends BinaryBuilder { 8 | set(value, index) { 9 | super.set(value && encodeUtf8(value), index); 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/build/builders/validity.js: -------------------------------------------------------------------------------- 1 | import { uint8Array } from '../../util/arrays.js'; 2 | import { bitmap } from '../buffer.js'; 3 | import { BatchBuilder } from './batch.js'; 4 | 5 | /** 6 | * Builder for validity bitmaps within batches. 
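 *
 * Editor's note: set() marks non-null entries in a bitmap; e.g. the
 * values [1, null, 2] yield validity bits 0b101 with nullCount 1, and
 * batches without nulls elide the bitmap (a zero-length array).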
7 | */ 8 | export class ValidityBuilder extends BatchBuilder { 9 | constructor(type, ctx) { 10 | super(type, ctx); 11 | } 12 | 13 | init() { 14 | this.nullCount = 0; 15 | this.validity = bitmap(); 16 | return super.init(); 17 | } 18 | 19 | /** 20 | * @param {*} value 21 | * @param {number} index 22 | * @returns {boolean | void} 23 | */ 24 | set(value, index) { 25 | this.index = index; 26 | const isValid = value != null; 27 | if (isValid) { 28 | this.validity.set(index); 29 | } else { 30 | this.nullCount++; 31 | } 32 | return isValid; 33 | } 34 | 35 | done() { 36 | const { index, nullCount, type, validity } = this; 37 | return { 38 | length: index + 1, 39 | nullCount, 40 | type, 41 | validity: nullCount 42 | ? validity.array((index >> 3) + 1) 43 | : new uint8Array(0) 44 | }; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/build/builders/values.js: -------------------------------------------------------------------------------- 1 | import { toBigInt } from '../../util/numbers.js'; 2 | import { buffer } from '../buffer.js'; 3 | import { ValidityBuilder } from './validity.js'; 4 | 5 | /** 6 | * Builder for data batches that can be accessed directly as typed arrays. 7 | */ 8 | export class DirectBuilder extends ValidityBuilder { 9 | constructor(type, ctx) { 10 | super(type, ctx); 11 | this.values = buffer(type.values); 12 | } 13 | 14 | init() { 15 | this.values = buffer(this.type.values); 16 | return super.init(); 17 | } 18 | 19 | /** 20 | * @param {*} value 21 | * @param {number} index 22 | * @returns {boolean | void} 23 | */ 24 | set(value, index) { 25 | if (super.set(value, index)) { 26 | this.values.set(value, index); 27 | } 28 | } 29 | 30 | done() { 31 | return { 32 | ...super.done(), 33 | values: this.values.array(this.index + 1) 34 | }; 35 | } 36 | } 37 | 38 | /** 39 | * Builder for int64/uint64 data batches written as bigints. 40 | */ 41 | export class Int64Builder extends DirectBuilder { 42 | set(value, index) { 43 | super.set(value == null ? value : toBigInt(value), index); 44 | } 45 | } 46 | 47 | /** 48 | * Builder for data batches whose values must pass through a transform 49 | * function prior to being written to a backing buffer. 50 | */ 51 | export class TransformBuilder extends DirectBuilder { 52 | constructor(type, ctx, transform) { 53 | super(type, ctx); 54 | this.transform = transform; 55 | } 56 | set(value, index) { 57 | super.set(value == null ? value : this.transform(value), index); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/build/column-from-array.js: -------------------------------------------------------------------------------- 1 | import { float32Array, float64Array, int16Array, int32Array, int64Array, int8Array, isInt64ArrayType, isTypedArray, uint16Array, uint32Array, uint64Array, uint8Array } from '../util/arrays.js'; 2 | import { DirectBatch, Int64Batch } from '../batch.js'; 3 | import { Column } from '../column.js'; 4 | import { float32, float64, int16, int32, int64, int8, uint16, uint32, uint64, uint8 } from '../data-types.js'; 5 | import { columnFromValues } from './column-from-values.js'; 6 | 7 | /** 8 | * Create a new column from a provided data array. 9 | * @template T 10 | * @param {Array<T> | import('../types.js').TypedArray} array The input data. 11 | * @param {import('../types.js').DataType} [type] The data type. 12 | * If not specified, type inference is attempted.
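 * @example
 * // editor's sketch
 * columnFromArray([1, 2, 3]);             // type inferred (int8)
 * columnFromArray(Float64Array.of(1, 2)); // zero-copy float64 batches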
13 | * @param {import('../types.js').ColumnBuilderOptions} [options] 14 | * Builder options for the generated column. 15 | * @param {ReturnType<import('./builders/dictionary.js').dictionaryContext>} [dicts] 16 | * Dictionary context object, for internal use only. 17 | * @returns {Column<T>} The generated column. 18 | */ 19 | export function columnFromArray(array, type, options = {}, dicts) { 20 | return !type && isTypedArray(array) 21 | ? columnFromTypedArray(array, options) 22 | : columnFromValues(v => array.forEach(v), type, options, dicts); 23 | } 24 | 25 | /** 26 | * Create a new column from a typed array input. 27 | * @template T 28 | * @param {import('../types.js').TypedArray} values The input data. 29 | * @param {import('../types.js').ColumnBuilderOptions} options 30 | * Builder options for the generated column. 31 | * @returns {Column<T>} The generated column. 32 | */ 33 | function columnFromTypedArray(values, { maxBatchRows, useBigInt }) { 34 | const arrayType = /** @type {import('../types.js').TypedArrayConstructor} */ ( 35 | values.constructor 36 | ); 37 | const type = typeForTypedArray(arrayType); 38 | const length = values.length; 39 | const limit = Math.min(maxBatchRows || Infinity, length); 40 | const numBatches = Math.floor(length / limit); 41 | 42 | const batches = []; 43 | const batchType = isInt64ArrayType(arrayType) && !useBigInt ? Int64Batch : DirectBatch; 44 | const add = (start, end) => batches.push(new batchType({ 45 | length: end - start, 46 | nullCount: 0, 47 | type, 48 | validity: new uint8Array(0), 49 | values: values.subarray(start, end) 50 | })); 51 | 52 | let idx = 0; 53 | for (let i = 0; i < numBatches; ++i) add(idx, idx += limit); 54 | if (idx < length) add(idx, length); 55 | 56 | return new Column(batches); 57 | } 58 | 59 | /** 60 | * Return an Arrow data type for a given typed array type. 61 | * @param {import('../types.js').TypedArrayConstructor} arrayType 62 | * The typed array type. 63 | * @returns {import('../types.js').DataType} The data type. 64 | */ 65 | function typeForTypedArray(arrayType) { 66 | switch (arrayType) { 67 | case float32Array: return float32(); 68 | case float64Array: return float64(); 69 | case int8Array: return int8(); 70 | case int16Array: return int16(); 71 | case int32Array: return int32(); 72 | case int64Array: return int64(); 73 | case uint8Array: return uint8(); 74 | case uint16Array: return uint16(); 75 | case uint32Array: return uint32(); 76 | case uint64Array: return uint64(); 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/build/column-from-values.js: -------------------------------------------------------------------------------- 1 | import { NullBatch } from '../batch.js'; 2 | import { Column } from '../column.js'; 3 | import { inferType } from './infer-type.js'; 4 | import { builder, builderContext } from './builder.js'; 5 | import { Type } from '../constants.js'; 6 | import { isIterable } from '../util/objects.js'; 7 | 8 | /** 9 | * Create a new column by iterating over provided values. 10 | * @template T 11 | * @param {Iterable<T> | ((callback: (value: any) => void) => void)} values 12 | * Either an iterable object or a visitor function that applies a callback 13 | * to successive data values (akin to Array.forEach). 14 | * @param {import('../types.js').DataType} [type] The data type. 15 | * @param {import('../types.js').ColumnBuilderOptions} [options] 16 | * Builder options for the generated column.
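 * @example
 * // editor's sketch: iterable and visitor inputs are equivalent
 * columnFromValues([1, 2, 3]);
 * columnFromValues(cb => [1, 2, 3].forEach(v => cb(v)));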
17 | * @param {ReturnType< 18 | * import('./builders/dictionary.js').dictionaryContext 19 | * >} [dicts] Dictionary context object, for internal use only. 20 | * @returns {Column} The generated column. 21 | */ 22 | export function columnFromValues(values, type, options = {}, dicts) { 23 | const visit = isIterable(values) 24 | ? callback => { for (const value of values) callback(value); } 25 | : values; 26 | 27 | type ??= inferType(visit); 28 | const { maxBatchRows = Infinity, ...opt } = options; 29 | let data; 30 | 31 | if (type.typeId === Type.Null) { 32 | let length = 0; 33 | visit(() => ++length); 34 | data = nullBatches(type, length, maxBatchRows); 35 | } else { 36 | const ctx = builderContext(opt, dicts); 37 | const b = builder(type, ctx).init(); 38 | const next = b => data.push(b.batch()); 39 | data = []; 40 | 41 | let row = 0; 42 | visit(value => { 43 | b.set(value, row++); 44 | if (row >= maxBatchRows) { 45 | next(b); 46 | row = 0; 47 | } 48 | }); 49 | if (row) next(b); 50 | 51 | // resolve dictionaries 52 | ctx.finish(); 53 | } 54 | 55 | return new Column(data, type); 56 | } 57 | 58 | /** 59 | * Create null batches with the given batch size limit. 60 | * @param {import('../types.js').NullType} type The null data type. 61 | * @param {number} length The total column length. 62 | * @param {number} limit The maximum batch size. 63 | * @returns {import('../batch.js').NullBatch[]} The null batches. 64 | */ 65 | function nullBatches(type, length, limit) { 66 | const data = []; 67 | const batch = length => new NullBatch({ length, nullCount: length, type }); 68 | const numBatches = Math.floor(length / limit); 69 | for (let i = 0; i < numBatches; ++i) { 70 | data.push(batch(limit)); 71 | } 72 | const rem = length % limit; 73 | if (rem) data.push(batch(rem)); 74 | return data; 75 | } 76 | -------------------------------------------------------------------------------- /src/build/infer-type.js: -------------------------------------------------------------------------------- 1 | import { bool, dateDay, dictionary, field, fixedSizeList, float64, int16, int32, int64, int8, list, nullType, struct, timestamp, utf8 } from '../data-types.js'; 2 | import { isArray } from '../util/arrays.js'; 3 | 4 | /** 5 | * Infer the data type for a given input array. 6 | * @param {(visitor: (value: any) => void) => void} visit 7 | * A function that applies a callback to successive data values. 8 | * @returns {import('../types.js').DataType} The data type. 
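 * @example
 * // editor's sketch
 * inferType(cb => [1, 2, null].forEach(cb));    // int8
 * inferType(cb => ['a', 'b', 'a'].forEach(cb)); // dictionary(utf8())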
9 | */ 10 | export function inferType(visit) { 11 | const profile = profiler(); 12 | visit(value => profile.add(value)); 13 | return profile.type(); 14 | } 15 | 16 | function profiler() { 17 | let length = 0; 18 | let nullCount = 0; 19 | let boolCount = 0; 20 | let numberCount = 0; 21 | let intCount = 0; 22 | let bigintCount = 0; 23 | let dateCount = 0; 24 | let dayCount = 0; 25 | let stringCount = 0; 26 | let arrayCount = 0; 27 | let structCount = 0; 28 | let min = Infinity; 29 | let max = -Infinity; 30 | let minLength = Infinity; 31 | let maxLength = -Infinity; 32 | let minBigInt; 33 | let maxBigInt; 34 | let arrayProfile; 35 | let structProfiles = {}; 36 | 37 | return { 38 | add(value) { 39 | length++; 40 | if (value == null) { 41 | nullCount++; 42 | return; 43 | } 44 | switch (typeof value) { 45 | case 'string': 46 | stringCount++; 47 | break; 48 | case 'number': 49 | numberCount++; 50 | if (value < min) min = value; 51 | if (value > max) max = value; 52 | if (Number.isInteger(value)) intCount++; 53 | break; 54 | case 'bigint': 55 | bigintCount++; 56 | if (minBigInt === undefined) { 57 | minBigInt = maxBigInt = value; 58 | } else { 59 | if (value < minBigInt) minBigInt = value; 60 | if (value > maxBigInt) maxBigInt = value; 61 | } 62 | break; 63 | case 'boolean': 64 | boolCount++; 65 | break; 66 | case 'object': 67 | if (value instanceof Date) { 68 | dateCount++; 69 | // 1 day = 1000ms * 60s * 60min * 24hr = 86400000 70 | if ((+value % 864e5) === 0) dayCount++; 71 | } else if (isArray(value)) { 72 | arrayCount++; 73 | const len = value.length; 74 | if (len < minLength) minLength = len; 75 | if (len > maxLength) maxLength = len; 76 | arrayProfile ??= profiler(); 77 | value.forEach(arrayProfile.add); 78 | } else { 79 | structCount++; 80 | for (const key in value) { 81 | const fieldProfiler = structProfiles[key] 82 | ?? (structProfiles[key] = profiler()); 83 | fieldProfiler.add(value[key]); 84 | } 85 | } 86 | } 87 | }, 88 | type() { 89 | const valid = length - nullCount; 90 | return valid === 0 ? nullType() 91 | : intCount === valid ? intType(min, max) 92 | : numberCount === valid ? float64() 93 | : bigintCount === valid ? bigintType(minBigInt, maxBigInt) 94 | : boolCount === valid ? bool() 95 | : dayCount === valid ? dateDay() 96 | : dateCount === valid ? timestamp() 97 | : stringCount === valid ? dictionary(utf8()) 98 | : arrayCount === valid ? arrayType(arrayProfile.type(), minLength, maxLength) 99 | : structCount === valid ? struct( 100 | Object.entries(structProfiles).map(_ => field(_[0], _[1].type())) 101 | ) 102 | : unionType(); 103 | } 104 | }; 105 | } 106 | 107 | /** 108 | * Return a list or fixed list type. 109 | * @param {import('../types.js').DataType} type The child data type. 110 | * @param {number} minLength The minimum list length. 111 | * @param {number} maxLength The maximum list length. 112 | * @returns {import('../types.js').DataType} The data type. 113 | */ 114 | function arrayType(type, minLength, maxLength) { 115 | return maxLength === minLength 116 | ? fixedSizeList(type, minLength) 117 | : list(type); 118 | } 119 | 120 | /** 121 | * @param {number} min 122 | * @param {number} max 123 | * @returns {import('../types.js').DataType} 124 | */ 125 | function intType(min, max) { 126 | const v = Math.max(Math.abs(min) - 1, max); 127 | return v < (1 << 7) ? int8() 128 | : v < (1 << 15) ? int16() 129 | : v < (2 ** 31) ?
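/* editor's note: e.g. intType(-129, 0) yields int16 since v = 128; magnitudes at or beyond 2^31 fall back to float64, as 64-bit integer types are not inferred from plain JS numbers */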
int32() 130 | : float64(); 131 | } 132 | 133 | /** 134 | * @param {bigint} min 135 | * @param {bigint} max 136 | * @returns {import('../types.js').IntType} 137 | */ 138 | function bigintType(min, max) { 139 | const v = -min > max ? -min - 1n : max; 140 | if (v >= 2 ** 63) { 141 | throw new Error(`BigInt exceeds 64 bits: ${v}`); 142 | } 143 | return int64(); 144 | } 145 | 146 | /** 147 | * @returns {import('../types.js').UnionType} 148 | */ 149 | function unionType() { 150 | throw new Error('Mixed types detected, please define a union type.'); 151 | } 152 | -------------------------------------------------------------------------------- /src/build/table-from-arrays.js: -------------------------------------------------------------------------------- 1 | import { dictionaryContext } from './builders/dictionary.js'; 2 | import { columnFromArray } from './column-from-array.js'; 3 | import { tableFromColumns } from './table-from-columns.js'; 4 | 5 | /** 6 | * Create a new table from the provided arrays. 7 | * @param {[string, Array | import('../types.js').TypedArray][] 8 | * | Record} data 9 | * The input data as a collection of named arrays. 10 | * @param {import('../types.js').TableBuilderOptions} options 11 | * Table builder options, including an optional type map. 12 | * @returns {import('../table.js').Table} The new table. 13 | */ 14 | export function tableFromArrays(data, options = {}) { 15 | const { types = {}, ...opt } = options; 16 | const dicts = dictionaryContext(); 17 | const entries = Array.isArray(data) ? data : Object.entries(data); 18 | const columns = entries.map(([name, array]) => 19 | /** @type {[string, import('../column.js').Column]} */ ( 20 | [ name, columnFromArray(array, types[name], opt, dicts)] 21 | )); 22 | return tableFromColumns(columns, options.useProxy); 23 | } 24 | -------------------------------------------------------------------------------- /src/build/table-from-columns.js: -------------------------------------------------------------------------------- 1 | import { Endianness, Version } from '../constants.js'; 2 | import { field } from '../data-types.js'; 3 | import { Table } from '../table.js'; 4 | 5 | /** 6 | * Create a new table from a collection of columns. Columns are assumed 7 | * to have the same record batch sizes. 8 | * @param {[string, import('../column.js').Column][] 9 | * | Record} data The columns, 10 | * as an object with name keys, or an array of [name, column] pairs. 11 | * @param {boolean} [useProxy] Flag indicating if row proxy 12 | * objects should be used to represent table rows (default `false`). 13 | * @returns {Table} The new table. 14 | */ 15 | export function tableFromColumns(data, useProxy) { 16 | const fields = []; 17 | const entries = Array.isArray(data) ? 
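/* editor's sketch: both input shapes are accepted, e.g. tableFromColumns({ a: columnFromArray([1, 2]) }) or tableFromColumns([['a', columnFromArray([1, 2])]]) */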
data : Object.entries(data); 18 | const length = entries[0]?.[1].length; 19 | 20 | const columns = entries.map(([name, col]) => { 21 | if (col.length !== length) { 22 | throw new Error('All columns must have the same length.'); 23 | } 24 | fields.push(field(name, col.type)); 25 | return col; 26 | }); 27 | 28 | const schema = { 29 | version: Version.V5, 30 | endianness: Endianness.Little, 31 | fields, 32 | metadata: null 33 | }; 34 | 35 | return new Table(schema, columns, useProxy); 36 | } 37 | -------------------------------------------------------------------------------- /src/column.js: -------------------------------------------------------------------------------- 1 | import { bisect } from './util/arrays.js'; 2 | import { isDirectBatch } from './batch.js'; 3 | 4 | /** 5 | * Build up a column from batches. 6 | */ 7 | export function columnBuilder(type) { 8 | let data = []; 9 | return { 10 | add(batch) { data.push(batch); return this; }, 11 | clear: () => data = [], 12 | done: () => new Column(data, type) 13 | }; 14 | } 15 | 16 | /** 17 | * A data column. A column provides a view over one or more value batches, 18 | * each drawn from an Arrow record batch. This class supports random 19 | * access to column values by integer index; however, extracting arrays using 20 | * `toArray()` or iterating over values (`for (const value of column) {...}`) 21 | * provides more efficient bulk access and scanning. 22 | * @template T 23 | */ 24 | export class Column { 25 | /** 26 | * Create a new column instance. 27 | * @param {import('./batch.js').Batch[]} data The value batches. 28 | * @param {import('./types.js').DataType} [type] The column data type. 29 | * If not specified, the type is extracted from the batches. 30 | */ 31 | constructor(data, type = data[0]?.type) { 32 | /** 33 | * The column data type. 34 | * @type {import('./types.js').DataType} 35 | * @readonly 36 | */ 37 | this.type = type; 38 | /** 39 | * The column length. 40 | * @type {number} 41 | * @readonly 42 | */ 43 | this.length = data.reduce((m, c) => m + c.length, 0); 44 | /** 45 | * The count of null values in the column. 46 | * @type {number} 47 | * @readonly 48 | */ 49 | this.nullCount = data.reduce((m, c) => m + c.nullCount, 0); 50 | /** 51 | * An array of column data batches. 52 | * @type {readonly import('./batch.js').Batch[]} 53 | * @readonly 54 | */ 55 | this.data = data; 56 | 57 | const n = data.length; 58 | const offsets = new Int32Array(n + 1); 59 | if (n === 1) { 60 | const [ batch ] = data; 61 | offsets[1] = batch.length; 62 | // optimize access to single batch 63 | this.at = index => batch.at(index); 64 | } else { 65 | for (let i = 0, s = 0; i < n; ++i) { 66 | offsets[i + 1] = (s += data[i].length); 67 | } 68 | } 69 | 70 | /** 71 | * Index offsets for data batches. 72 | * Used to map a column row index to a batch-specific index. 73 | * @type {Int32Array} 74 | * @readonly 75 | */ 76 | this.offsets = offsets; 77 | } 78 | 79 | /** 80 | * Provide an informative object string tag. 81 | */ 82 | get [Symbol.toStringTag]() { 83 | return 'Column'; 84 | } 85 | 86 | /** 87 | * Return an iterator over the values in this column. 88 | * @returns {Iterator<T>} 89 | */ 90 | [Symbol.iterator]() { 91 | const data = this.data; 92 | return data.length === 1 93 | ? data[0][Symbol.iterator]() 94 | : batchedIterator(data); 95 | } 96 | 97 | /** 98 | * Return the column value at the given index.
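 * For example (editor's note), `column.at(0)` returns the first value,
 * or `null` if that entry is invalid.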
If a column has multiple 99 | * batches, this method performs binary search over the batch lengths to 100 | * determine the batch from which to retrieve the value. The search makes 101 | * lookup less efficient than a standard array access. If making a full 102 | * scan of a column, consider extracting arrays via `toArray()` or using an 103 | * iterator (`for (const value of column) {...}`). 104 | * @param {number} index The row index. 105 | * @returns {T | null} The value. 106 | */ 107 | at(index) { 108 | // NOTE: if there is only one batch, this method is replaced with an 109 | // optimized version in the Column constructor. 110 | const { data, offsets } = this; 111 | const i = bisect(offsets, index) - 1; 112 | return data[i]?.at(index - offsets[i]); // undefined if out of range 113 | } 114 | 115 | /** 116 | * Return the column value at the given index. This method is the same as 117 | * `at()` and is provided for better compatibility with Apache Arrow JS. 118 | * @param {number} index The row index. 119 | * @returns {T | null} The value. 120 | */ 121 | get(index) { 122 | return this.at(index); 123 | } 124 | 125 | /** 126 | * Extract column values into a single array instance. When possible, 127 | * a zero-copy subarray of the input Arrow data is returned. 128 | * @returns {import('./types.js').ValueArray} 129 | */ 130 | toArray() { 131 | const { length, nullCount, data } = this; 132 | const copy = !nullCount && isDirectBatch(data[0]); 133 | const n = data.length; 134 | 135 | if (copy && n === 1) { 136 | // use batch array directly 137 | // @ts-ignore 138 | return data[0].values; 139 | } 140 | 141 | // determine output array type 142 | const ArrayType = !n || nullCount > 0 ? Array 143 | // @ts-ignore 144 | : (data[0].constructor.ArrayType ?? data[0].values.constructor); 145 | 146 | const array = new ArrayType(length); 147 | return copy ? copyArray(array, data) : extractArray(array, data); 148 | } 149 | 150 | /** 151 | * Return an array of cached column values. 152 | * Used internally to accelerate dictionary types. 153 | */ 154 | cache() { 155 | return this._cache ?? (this._cache = this.toArray()); 156 | } 157 | } 158 | 159 | function *batchedIterator(data) { 160 | for (let i = 0; i < data.length; ++i) { 161 | const iter = data[i][Symbol.iterator](); 162 | for (let next = iter.next(); !next.done; next = iter.next()) { 163 | yield next.value; 164 | } 165 | } 166 | } 167 | 168 | function copyArray(array, data) { 169 | for (let i = 0, offset = 0; i < data.length; ++i) { 170 | const { values } = data[i]; 171 | array.set(values, offset); 172 | offset += values.length; 173 | } 174 | return array; 175 | } 176 | 177 | function extractArray(array, data) { 178 | let index = -1; 179 | for (let i = 0; i < data.length; ++i) { 180 | const batch = data[i]; 181 | for (let j = 0; j < batch.length; ++j) { 182 | array[++index] = batch.at(j); 183 | } 184 | } 185 | return array; 186 | } 187 | -------------------------------------------------------------------------------- /src/decode/block.js: -------------------------------------------------------------------------------- 1 | import { readInt32, readInt64, readVector } from '../util/read.js'; 2 | 3 | /** 4 | * Decode a block that points to messages within an Arrow 'file' format. 5 | * @param {Uint8Array} buf A byte buffer of binary Arrow IPC data 6 | * @param {number} index The starting index in the byte buffer 7 | * @returns The file block. 
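 *
 * Editor's note: a block is a fixed 24-byte struct, read as offset
 * (int64) at +0, metadataLength (int32) at +8, and bodyLength (int64)
 * at +16; decodeBlocks below depends on this 24-byte stride.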
8 | */ 9 | export function decodeBlock(buf, index) { 10 | // 0: offset 11 | // 8: metadataLength 12 | // 16: bodyLength 13 | return { 14 | offset: readInt64(buf, index), 15 | metadataLength: readInt32(buf, index + 8), 16 | bodyLength: readInt64(buf, index + 16) 17 | } 18 | } 19 | 20 | /** 21 | * Decode a vector of blocks. 22 | * @param {Uint8Array} buf 23 | * @param {number} index 24 | * @returns An array of file blocks. 25 | */ 26 | export function decodeBlocks(buf, index) { 27 | return readVector(buf, index, 24, decodeBlock); 28 | } 29 | -------------------------------------------------------------------------------- /src/decode/data-type.js: -------------------------------------------------------------------------------- 1 | import { DateUnit, IntervalUnit, Precision, TimeUnit, Type, UnionMode } from '../constants.js'; 2 | import { binary, date, decimal, duration, fixedSizeBinary, fixedSizeList, float, int, interval, invalidDataType, largeBinary, largeList, largeListView, largeUtf8, list, listView, mapType, runEndEncoded, struct, time, timestamp, union, utf8 } from '../data-types.js'; 3 | import { checkOneOf } from '../util/objects.js'; 4 | import { readBoolean, readInt16, readInt32, readObject, readOffset, readString, readVector } from '../util/read.js'; 5 | 6 | /** 7 | * Decode a data type definition for a field. 8 | * @param {Uint8Array} buf A byte buffer of binary Arrow IPC data. 9 | * @param {number} index The starting index in the byte buffer. 10 | * @param {number} typeId The data type id. 11 | * @param {import('../types.js').Field[]} [children] A list of parsed child fields. 12 | * @returns {import('../types.js').DataType} The data type. 13 | */ 14 | export function decodeDataType(buf, index, typeId, children) { 15 | checkOneOf(typeId, Type, invalidDataType); 16 | const get = readObject(buf, index); 17 | 18 | switch (typeId) { 19 | // types without flatbuffer objects 20 | case Type.Binary: return binary(); 21 | case Type.Utf8: return utf8(); 22 | case Type.LargeBinary: return largeBinary(); 23 | case Type.LargeUtf8: return largeUtf8(); 24 | case Type.List: return list(children[0]); 25 | case Type.ListView: return listView(children[0]); 26 | case Type.LargeList: return largeList(children[0]); 27 | case Type.LargeListView: return largeListView(children[0]); 28 | case Type.Struct: return struct(children); 29 | case Type.RunEndEncoded: return runEndEncoded(children[0], children[1]); 30 | 31 | // types with flatbuffer objects 32 | case Type.Int: return int( 33 | // @ts-ignore 34 | get(4, readInt32, 0), // bitwidth 35 | get(6, readBoolean, false) // signed 36 | ); 37 | case Type.Float: return float( 38 | // @ts-ignore 39 | get(4, readInt16, Precision.HALF) // precision 40 | ); 41 | case Type.Decimal: return decimal( 42 | get(4, readInt32, 0), // precision 43 | get(6, readInt32, 0), // scale 44 | // @ts-ignore 45 | get(8, readInt32, 128) // bitwidth 46 | ); 47 | case Type.Date: return date( 48 | // @ts-ignore 49 | get(4, readInt16, DateUnit.MILLISECOND) // unit 50 | ); 51 | case Type.Time: return time( 52 | // @ts-ignore 53 | get(4, readInt16, TimeUnit.MILLISECOND), // unit 54 | get(6, readInt32, 32) // bitWidth 55 | ); 56 | case Type.Timestamp: return timestamp( 57 | // @ts-ignore 58 | get(4, readInt16, TimeUnit.SECOND), // unit 59 | get(6, readString) // timezone 60 | ); 61 | case Type.Interval: return interval( 62 | // @ts-ignore 63 | get(4, readInt16, IntervalUnit.YEAR_MONTH) // unit 64 | ); 65 | case Type.Duration: return duration( 66 | // @ts-ignore 67 | get(4, readInt16, 
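/* editor's note: readObject fields are fetched by flatbuffer vtable slot (4, 6, 8, ...); the third argument is the default used when a slot is absent */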
TimeUnit.MILLISECOND) // unit 68 | ); 69 | 70 | case Type.FixedSizeBinary: return fixedSizeBinary( 71 | get(4, readInt32, 0) // stride 72 | ); 73 | case Type.FixedSizeList: return fixedSizeList( 74 | children[0], 75 | get(4, readInt32, 0), // stride 76 | ); 77 | case Type.Map: return mapType( 78 | get(4, readBoolean, false), // keysSorted 79 | children[0] 80 | ); 81 | 82 | case Type.Union: return union( 83 | // @ts-ignore 84 | get(4, readInt16, UnionMode.Sparse), // mode 85 | children, 86 | readVector(buf, get(6, readOffset), 4, readInt32) // type ids 87 | ); 88 | } 89 | // case Type.NONE: 90 | // case Type.Null: 91 | // case Type.Bool: 92 | // case Type.BinaryView: 93 | // case Type.Utf8View: 94 | // @ts-ignore 95 | return { typeId }; 96 | } 97 | -------------------------------------------------------------------------------- /src/decode/decode-ipc.js: -------------------------------------------------------------------------------- 1 | import { MAGIC, MessageHeader, Version } from '../constants.js'; 2 | import { readInt16, readInt32, readObject } from '../util/read.js'; 3 | import { decodeBlocks } from './block.js'; 4 | import { decodeMessage } from './message.js'; 5 | import { decodeMetadata } from './metadata.js'; 6 | import { decodeSchema } from './schema.js'; 7 | 8 | /** 9 | * Decode [Apache Arrow IPC data][1] and return parsed schema, record batch, 10 | * and dictionary batch definitions. The input binary data may be either 11 | * an `ArrayBuffer` or `Uint8Array`. For Arrow data in the IPC 'stream' format, 12 | * an array of `Uint8Array` instances is also supported. 13 | * 14 | * This method stops short of generating views over field buffers. Use the 15 | * `createData()` method on the result to enable column data access. 16 | * 17 | * [1]: https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc 18 | * @param {ArrayBuffer | Uint8Array | Uint8Array[]} data 19 | * The source byte buffer, or an array of buffers. If an array, each byte 20 | * array may contain one or more self-contained messages. Messages may NOT 21 | * span multiple byte arrays. 22 | * @returns {import('../types.js').ArrowData} 23 | */ 24 | export function decodeIPC(data) { 25 | const source = data instanceof ArrayBuffer 26 | ? new Uint8Array(data) 27 | : data; 28 | return source instanceof Uint8Array && isArrowFileFormat(source) 29 | ? decodeIPCFile(source) 30 | : decodeIPCStream(source); 31 | } 32 | 33 | /** 34 | * @param {Uint8Array} buf 35 | * @returns {boolean} 36 | */ 37 | function isArrowFileFormat(buf) { 38 | if (!buf || buf.length < 4) return false; 39 | for (let i = 0; i < 6; ++i) { 40 | if (MAGIC[i] !== buf[i]) return false; 41 | } 42 | return true; 43 | } 44 | 45 | /** 46 | * Decode data in the [Arrow IPC 'stream' format][1]. 47 | * 48 | * [1]: https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format 49 | * @param {Uint8Array | Uint8Array[]} data The source byte buffer, or an 50 | * array of buffers. If an array, each byte array may contain one or more 51 | * self-contained messages. Messages may NOT span multiple byte arrays. 
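 * @example
 * // editor's sketch; chunkA and chunkB are hypothetical Uint8Arrays
 * const { schema, records } = decodeIPCStream([chunkA, chunkB]);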
52 | * @returns {import('../types.js').ArrowData} 53 | */ 54 | export function decodeIPCStream(data) { 55 | const stream = [data].flat(); 56 | 57 | let schema; 58 | const records = []; 59 | const dictionaries = []; 60 | 61 | // consume each message in the stream 62 | for (const buf of stream) { 63 | if (!(buf instanceof Uint8Array)) { 64 | throw new Error(`IPC data batch was not a Uint8Array.`); 65 | } 66 | let offset = 0; 67 | 68 | // decode all messages in current buffer 69 | while (true) { 70 | const m = decodeMessage(buf, offset); 71 | if (m === null) break; // end of messages 72 | offset = m.index; 73 | if (!m.content) continue; 74 | switch (m.type) { 75 | case MessageHeader.Schema: 76 | // ignore repeated schema messages 77 | if (!schema) schema = m.content; 78 | break; 79 | case MessageHeader.RecordBatch: 80 | records.push(m.content); 81 | break; 82 | case MessageHeader.DictionaryBatch: 83 | dictionaries.push(m.content); 84 | break; 85 | } 86 | } 87 | } 88 | 89 | return /** @type {import('../types.js').ArrowData} */ ( 90 | { schema, dictionaries, records, metadata: null } 91 | ); 92 | } 93 | 94 | /** 95 | * Decode data in the [Arrow IPC 'file' format][1]. 96 | * 97 | * [1]: https://arrow.apache.org/docs/format/Columnar.html#ipc-file-format 98 | * @param {Uint8Array} data The source byte buffer. 99 | * @returns {import('../types.js').ArrowData} 100 | */ 101 | export function decodeIPCFile(data) { 102 | // find footer location 103 | const offset = data.byteLength - (MAGIC.length + 4); 104 | const length = readInt32(data, offset); 105 | 106 | // decode file footer 107 | // 4: version 108 | // 6: schema 109 | // 8: dictionaries (vector) 110 | // 10: batches (vector) 111 | // 12: metadata 112 | const get = readObject(data, offset - length); 113 | const version = /** @type {import('../types.js').Version_} */ 114 | (get(4, readInt16, Version.V1)); 115 | const dicts = get(8, decodeBlocks, []); 116 | const recs = get(10, decodeBlocks, []); 117 | 118 | return /** @type {import('../types.js').ArrowData} */ ({ 119 | schema: get(6, (buf, index) => decodeSchema(buf, index, version)), 120 | dictionaries: dicts.map(({ offset }) => decodeMessage(data, offset).content), 121 | records: recs.map(({ offset }) => decodeMessage(data, offset).content), 122 | metadata: get(12, decodeMetadata) 123 | }); 124 | } 125 | -------------------------------------------------------------------------------- /src/decode/dictionary-batch.js: -------------------------------------------------------------------------------- 1 | import { readBoolean, readInt64, readObject } from '../util/read.js'; 2 | import { decodeRecordBatch } from './record-batch.js'; 3 | 4 | /** 5 | * Decode a dictionary batch. 6 | * @param {Uint8Array} buf A byte buffer of binary Arrow IPC data 7 | * @param {number} index The starting index in the byte buffer 8 | * @param {import('../types.js').Version_} version Arrow version value 9 | * @returns {import('../types.js').DictionaryBatch} The dictionary batch 10 | */ 11 | export function decodeDictionaryBatch(buf, index, version) { 12 | // 4: id 13 | // 6: data 14 | // 8: isDelta 15 | const get = readObject(buf, index); 16 | return { 17 | id: get(4, readInt64, 0), 18 | data: get(6, (buf, off) => decodeRecordBatch(buf, off, version)), 19 | /** 20 | * If isDelta is true the values in the dictionary are to be appended to a 21 | * dictionary with the indicated id. If isDelta is false this dictionary 22 | * should replace the existing dictionary. 
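 * For example (editor's note), a first batch may define values
 * ['a', 'b'] and a later delta batch append ['c'] under the same id,
 * rather than replacing the existing entries.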
23 | */ 24 | isDelta: get(8, readBoolean, false) 25 | }; 26 | } 27 | -------------------------------------------------------------------------------- /src/decode/message.js: -------------------------------------------------------------------------------- 1 | import { MessageHeader, Version } from '../constants.js'; 2 | import { keyFor } from '../util/objects.js'; 3 | import { SIZEOF_INT, readInt16, readInt32, readInt64, readObject, readOffset, readUint8 } from '../util/read.js'; 4 | import { decodeDictionaryBatch } from './dictionary-batch.js'; 5 | import { decodeRecordBatch } from './record-batch.js'; 6 | import { decodeSchema } from './schema.js'; 7 | 8 | const invalidMessageMetadata = (expected, actual) => 9 | `Expected to read ${expected} metadata bytes, but only read ${actual}.`; 10 | 11 | const invalidMessageBodyLength = (expected, actual) => 12 | `Expected to read ${expected} bytes for message body, but only read ${actual}.`; 13 | 14 | const invalidMessageType = (type) => 15 | `Unsupported message type: ${type} (${keyFor(MessageHeader, type)})`; 16 | 17 | /** 18 | * A "message" contains a block of Apache Arrow data, such as a schema, 19 | * record batch, or dictionary batch. This message decodes a single 20 | * message, returning its associated metadata and content. 21 | * @param {Uint8Array} buf A byte buffer of binary Arrow IPC data 22 | * @param {number} index The starting index in the byte buffer 23 | * @returns {import('../types.js').Message} The decoded message. 24 | */ 25 | export function decodeMessage(buf, index) { 26 | // get message start 27 | let metadataLength = readInt32(buf, index) || 0; 28 | index += SIZEOF_INT; 29 | 30 | // ARROW-6313: If the first 4 bytes are continuation indicator (-1), read 31 | // the next 4 for the 32-bit metadata length. Otherwise, assume this is a 32 | // pre-v0.15 message, where the first 4 bytes are the metadata length. 33 | if (metadataLength === -1) { 34 | metadataLength = readInt32(buf, index) || 0; 35 | index += SIZEOF_INT; 36 | } 37 | if (metadataLength === 0) return null; 38 | 39 | const head = buf.subarray(index, index += metadataLength); 40 | if (head.byteLength < metadataLength) { 41 | throw new Error(invalidMessageMetadata(metadataLength, head.byteLength)); 42 | } 43 | 44 | // decode message metadata 45 | // 4: version 46 | // 6: headerType 47 | // 8: headerIndex 48 | // 10: bodyLength 49 | const get = readObject(head, 0); 50 | const version = /** @type {import('../types.js').Version_} */ 51 | (get(4, readInt16, Version.V1)); 52 | const type = /** @type {import('../types.js').MessageHeader_} */ 53 | (get(6, readUint8, MessageHeader.NONE)); 54 | const offset = get(8, readOffset, 0); 55 | const bodyLength = get(10, readInt64, 0); 56 | let content; 57 | 58 | if (offset) { 59 | // decode message header 60 | const decoder = type === MessageHeader.Schema ? decodeSchema 61 | : type === MessageHeader.DictionaryBatch ? decodeDictionaryBatch 62 | : type === MessageHeader.RecordBatch ? 
decodeRecordBatch 63 | : null; 64 | if (!decoder) throw new Error(invalidMessageType(type)); 65 | content = decoder(head, offset, version); 66 | 67 | // extract message body 68 | if (bodyLength > 0) { 69 | const body = buf.subarray(index, index += bodyLength); 70 | if (body.byteLength < bodyLength) { 71 | throw new Error(invalidMessageBodyLength(bodyLength, body.byteLength)); 72 | } 73 | // @ts-ignore 74 | content.body = body; 75 | } 76 | } 77 | 78 | return { version, type, index, content }; 79 | } 80 | -------------------------------------------------------------------------------- /src/decode/metadata.js: -------------------------------------------------------------------------------- 1 | import { readObject, readString, readVector } from '../util/read.js'; 2 | 3 | /** 4 | * Decode custom metadata consisting of key-value string pairs. 5 | * @param {Uint8Array} buf A byte buffer of binary Arrow IPC data 6 | * @param {number} index The starting index in the byte buffer 7 | * @returns {import('../types.js').Metadata | null} The custom metadata map 8 | */ 9 | export function decodeMetadata(buf, index) { 10 | const entries = readVector(buf, index, 4, (buf, pos) => { 11 | const get = readObject(buf, pos); 12 | return /** @type {[string, string]} */ ([ 13 | get(4, readString), // 4: key (string) 14 | get(6, readString) // 6: value (string) 15 | ]); 16 | }); 17 | return entries.length ? new Map(entries) : null; 18 | } 19 | -------------------------------------------------------------------------------- /src/decode/record-batch.js: -------------------------------------------------------------------------------- 1 | import { Version } from '../constants.js'; 2 | import { readInt64, readObject, readOffset, readVector } from '../util/read.js'; 3 | 4 | /** 5 | * Decode a record batch. 6 | * @param {Uint8Array} buf A byte buffer of binary Arrow IPC data 7 | * @param {number} index The starting index in the byte buffer 8 | * @param {import('../types.js').Version_} version Arrow version value 9 | * @returns {import('../types.js').RecordBatch} The record batch 10 | */ 11 | export function decodeRecordBatch(buf, index, version) { 12 | // 4: length 13 | // 6: nodes 14 | // 8: buffers 15 | // 10: compression (not supported) 16 | // 12: variadicBuffers (buffer counts for view-typed fields) 17 | const get = readObject(buf, index); 18 | if (get(10, readOffset, 0)) { 19 | throw new Error('Record batch compression not implemented'); 20 | } 21 | 22 | // If an Arrow buffer was written before version 4, 23 | // advance 8 bytes to skip the now-removed page_id field 24 | const offset = version < Version.V4 ?
8 : 0; 25 | 26 | return { 27 | length: get(4, readInt64, 0), 28 | nodes: readVector(buf, get(6, readOffset), 16, (buf, pos) => ({ 29 | length: readInt64(buf, pos), 30 | nullCount: readInt64(buf, pos + 8) 31 | })), 32 | regions: readVector(buf, get(8, readOffset), 16 + offset, (buf, pos) => ({ 33 | offset: readInt64(buf, pos + offset), 34 | length: readInt64(buf, pos + offset + 8) 35 | })), 36 | variadic: readVector(buf, get(12, readOffset), 8, readInt64) 37 | }; 38 | } 39 | -------------------------------------------------------------------------------- /src/decode/schema.js: -------------------------------------------------------------------------------- 1 | import { Type } from '../constants.js'; 2 | import { dictionary, int32 } from '../data-types.js'; 3 | import { readBoolean, readInt16, readInt64, readObject, readOffset, readString, readUint8, readVector } from '../util/read.js'; 4 | import { decodeDataType } from './data-type.js'; 5 | import { decodeMetadata } from './metadata.js'; 6 | 7 | /** 8 | * Decode a table schema describing the fields and their data types. 9 | * @param {Uint8Array} buf A byte buffer of binary Arrow IPC data 10 | * @param {number} index The starting index in the byte buffer 11 | * @param {import('../types.js').Version_} version Arrow version value 12 | * @returns {import('../types.js').Schema} The schema 13 | */ 14 | export function decodeSchema(buf, index, version) { 15 | // 4: endianness (int16) 16 | // 6: fields (vector) 17 | // 8: metadata (vector) 18 | // 10: features (int64[]) 19 | const get = readObject(buf, index); 20 | return { 21 | version, 22 | endianness: /** @type {import('../types.js').Endianness_} */ (get(4, readInt16, 0)), 23 | fields: get(6, decodeSchemaFields, []), 24 | metadata: get(8, decodeMetadata) 25 | }; 26 | } 27 | 28 | /** 29 | * @returns {import('../types.js').Field[] | null} 30 | */ 31 | function decodeSchemaFields(buf, fieldsOffset) { 32 | return readVector(buf, fieldsOffset, 4, decodeField); 33 | } 34 | 35 | /** 36 | * @returns {import('../types.js').Field} 37 | */ 38 | function decodeField(buf, index) { 39 | // 4: name (string) 40 | // 6: nullable (bool) 41 | // 8: type id (uint8) 42 | // 10: type (union) 43 | // 12: dictionary (table) 44 | // 14: children (vector) 45 | // 16: metadata (vector) 46 | const get = readObject(buf, index); 47 | const typeId = get(8, readUint8, Type.NONE); 48 | const typeOffset = get(10, readOffset, 0); 49 | const dict = get(12, decodeDictionary); 50 | const children = get(14, (buf, off) => decodeFieldChildren(buf, off)); 51 | 52 | let type = decodeDataType(buf, typeOffset, typeId, children); 53 | if (dict) { 54 | dict.dictionary = type; 55 | type = dict; 56 | } 57 | 58 | return { 59 | name: get(4, readString), 60 | type, 61 | nullable: get(6, readBoolean, false), 62 | metadata: get(16, decodeMetadata) 63 | }; 64 | } 65 | 66 | /** 67 | * @returns {import('../types.js').Field[] | null} 68 | */ 69 | function decodeFieldChildren(buf, fieldOffset) { 70 | const children = readVector(buf, fieldOffset, 4, decodeField); 71 | return children.length ? 
children : null; 72 | } 73 | 74 | /** 75 | * @param {Uint8Array} buf 76 | * @param {number} index 77 | * @returns {import('../types.js').DictionaryType | null} 78 | */ 79 | function decodeDictionary(buf, index) { 80 | if (!index) return null; 81 | // 4: id (int64) 82 | // 6: indexType (Int type) 83 | // 8: isOrdered (boolean) 84 | // 10: kind (int16); currently only dense arrays are supported 85 | const get = readObject(buf, index); 86 | return dictionary( 87 | null, // data type will be populated by caller 88 | get(6, decodeInt, int32()), // index type 89 | get(8, readBoolean, false), // ordered 90 | get(4, readInt64, 0), // id 91 | ); 92 | } 93 | 94 | /** 95 | * Decode an integer data type. 96 | * @param {Uint8Array} buf A byte buffer of binary Arrow IPC data. 97 | * @param {number} index The starting index in the byte buffer. 98 | * @returns {import('../types.js').IntType} 99 | */ 100 | function decodeInt(buf, index) { 101 | return /** @type {import('../types.js').IntType} */ ( 102 | decodeDataType(buf, index, Type.Int) 103 | ); 104 | } 105 | -------------------------------------------------------------------------------- /src/decode/table-from-ipc.js: -------------------------------------------------------------------------------- 1 | import { batchType } from '../batch-type.js'; 2 | import { columnBuilder } from '../column.js'; 3 | import { Type, UnionMode, Version } from '../constants.js'; 4 | import { invalidDataType } from '../data-types.js'; 5 | import { Table } from '../table.js'; 6 | import { int8Array } from '../util/arrays.js'; 7 | import { decodeIPC } from './decode-ipc.js'; 8 | 9 | /** 10 | * Decode [Apache Arrow IPC data][1] and return a new Table. The input binary 11 | * data may be either an `ArrayBuffer` or `Uint8Array`. For Arrow data in the 12 | * [IPC 'stream' format][2], an array of `Uint8Array` values is also supported. 13 | * 14 | * [1]: https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc 15 | * [2]: https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format 16 | * @param {ArrayBuffer | Uint8Array | Uint8Array[]} data 17 | * The source byte buffer, or an array of buffers. If an array, each byte 18 | * array may contain one or more self-contained messages. Messages may NOT 19 | * span multiple byte arrays. 20 | * @param {import('../types.js').ExtractionOptions} [options] 21 | * Options for controlling how values are transformed when extracted 22 | * from an Arrow binary representation. 23 | * @returns {Table} A Table instance. 24 | */ 25 | export function tableFromIPC(data, options) { 26 | return createTable(decodeIPC(data), options); 27 | } 28 | 29 | /** 30 | * Create a table from parsed IPC data. 31 | * @param {import('../types.js').ArrowData} data 32 | * The IPC data, as returned by decodeIPC. 33 | * @param {import('../types.js').ExtractionOptions} [options] 34 | * Options for controlling how values are transformed when extracted 35 | * from an Arrow binary representation. 36 | * @returns {Table} A Table instance.
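 * @example
 * // A minimal usage sketch (assuming `bytes` holds Arrow IPC data);
 * // this is equivalent to calling tableFromIPC(bytes, { useProxy: true }).
 * const table = createTable(decodeIPC(bytes), { useProxy: true });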
37 | */ 38 | export function createTable(data, options = {}) { 39 | const { schema = { fields: [] }, dictionaries, records } = data; 40 | const { version, fields } = schema; 41 | const dictionaryMap = new Map; 42 | const context = contextGenerator(options, version, dictionaryMap); 43 | 44 | // build dictionary type map 45 | const dictionaryTypes = new Map; 46 | visitSchemaFields(schema, field => { 47 | const type = field.type; 48 | if (type.typeId === Type.Dictionary) { 49 | dictionaryTypes.set(type.id, type.dictionary); 50 | } 51 | }); 52 | 53 | // decode dictionaries, build dictionary column map 54 | const dicts = new Map; 55 | for (const dict of dictionaries) { 56 | const { id, data, isDelta, body } = dict; 57 | const type = dictionaryTypes.get(id); 58 | const batch = visit(type, context({ ...data, body })); 59 | if (!dicts.has(id)) { 60 | if (isDelta) { 61 | throw new Error('Delta update can not be first dictionary batch.'); 62 | } 63 | dicts.set(id, columnBuilder(type).add(batch)); 64 | } else { 65 | const dict = dicts.get(id); 66 | if (!isDelta) dict.clear(); 67 | dict.add(batch); 68 | } 69 | } 70 | dicts.forEach((value, key) => dictionaryMap.set(key, value.done())); 71 | 72 | // decode column fields 73 | const cols = fields.map(f => columnBuilder(f.type)); 74 | for (const batch of records) { 75 | const ctx = context(batch); 76 | fields.forEach((f, i) => cols[i].add(visit(f.type, ctx))); 77 | } 78 | 79 | return new Table(schema, cols.map(c => c.done()), options.useProxy); 80 | } 81 | 82 | /** 83 | * Visit all fields within a schema. 84 | * @param {import('../types.js').Schema} schema 85 | * @param {(field: import('../types.js').Field) => void} visitor 86 | */ 87 | function visitSchemaFields(schema, visitor) { 88 | schema.fields.forEach(function visitField(field) { 89 | visitor(field); 90 | // @ts-ignore 91 | field.type.dictionary?.children?.forEach(visitField); 92 | // @ts-ignore 93 | field.type.children?.forEach(visitField); 94 | }); 95 | } 96 | 97 | /** 98 | * Context object generator for field visitation and buffer definition. 99 | */ 100 | function contextGenerator(options, version, dictionaryMap) { 101 | const base = { 102 | version, 103 | options, 104 | dictionary: id => dictionaryMap.get(id), 105 | }; 106 | 107 | /** 108 | * Return a context generator. 109 | * @param {import('../types.js').RecordBatch} batch 110 | */ 111 | return batch => { 112 | const { length, nodes, regions, variadic, body } = batch; 113 | let nodeIndex = -1; 114 | let bufferIndex = -1; 115 | let variadicIndex = -1; 116 | return { 117 | ...base, 118 | length, 119 | node: () => nodes[++nodeIndex], 120 | buffer: (ArrayType) => { 121 | const { length, offset } = regions[++bufferIndex]; 122 | return ArrayType 123 | ? new ArrayType(body.buffer, body.byteOffset + offset, length / ArrayType.BYTES_PER_ELEMENT) 124 | : body.subarray(offset, offset + length) 125 | }, 126 | variadic: () => variadic[++variadicIndex], 127 | visit(children) { return children.map(f => visit(f.type, this)); } 128 | }; 129 | }; 130 | } 131 | 132 | /** 133 | * Visit a field, instantiating views of buffer regions. 
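 * (The parameter descriptions below are documentation only, inferred
 * from the call sites in this file.)
 * @param {import('../types.js').DataType} type The field data type.
 * @param {object} ctx The batch context created by contextGenerator.
 * @returns {import('../batch.js').Batch} The instantiated batch.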
134 | */ 135 | function visit(type, ctx) { 136 | const { typeId } = type; 137 | const { length, options, node, buffer, variadic, version } = ctx; 138 | const BatchType = batchType(type, options); 139 | 140 | if (typeId === Type.Null) { 141 | // no field node, no buffers 142 | return new BatchType({ length, nullCount: length, type }); 143 | } 144 | 145 | // extract the next { length, nullCount } field node 146 | const base = { ...node(), type }; 147 | 148 | switch (typeId) { 149 | // validity and data value buffers 150 | case Type.Bool: 151 | case Type.Int: 152 | case Type.Time: 153 | case Type.Duration: 154 | case Type.Float: 155 | case Type.Decimal: 156 | case Type.Date: 157 | case Type.Timestamp: 158 | case Type.Interval: 159 | case Type.FixedSizeBinary: 160 | return new BatchType({ 161 | ...base, 162 | validity: buffer(), 163 | values: buffer(type.values) 164 | }); 165 | 166 | // validity, offset, and value buffers 167 | case Type.Utf8: 168 | case Type.LargeUtf8: 169 | case Type.Binary: 170 | case Type.LargeBinary: 171 | return new BatchType({ 172 | ...base, 173 | validity: buffer(), 174 | offsets: buffer(type.offsets), 175 | values: buffer() 176 | }); 177 | 178 | // views with variadic buffers 179 | case Type.BinaryView: 180 | case Type.Utf8View: 181 | return new BatchType({ 182 | ...base, 183 | validity: buffer(), 184 | values: buffer(), // views buffer 185 | data: Array.from({ length: variadic() }, () => buffer()) // data buffers 186 | }); 187 | 188 | // validity, offset, and list child 189 | case Type.List: 190 | case Type.LargeList: 191 | case Type.Map: 192 | return new BatchType({ 193 | ...base, 194 | validity: buffer(), 195 | offsets: buffer(type.offsets), 196 | children: ctx.visit(type.children) 197 | }); 198 | 199 | // validity, offset, size, and list child 200 | case Type.ListView: 201 | case Type.LargeListView: 202 | return new BatchType({ 203 | ...base, 204 | validity: buffer(), 205 | offsets: buffer(type.offsets), 206 | sizes: buffer(type.offsets), 207 | children: ctx.visit(type.children) 208 | }); 209 | 210 | // validity and children 211 | case Type.FixedSizeList: 212 | case Type.Struct: 213 | return new BatchType({ 214 | ...base, 215 | validity: buffer(), 216 | children: ctx.visit(type.children) 217 | }); 218 | 219 | // children only 220 | case Type.RunEndEncoded: 221 | return new BatchType({ 222 | ...base, 223 | children: ctx.visit(type.children) 224 | }); 225 | 226 | // dictionary 227 | case Type.Dictionary: { 228 | const { id, indices } = type; 229 | return new BatchType({ 230 | ...base, 231 | validity: buffer(), 232 | values: buffer(indices.values), 233 | }).setDictionary(ctx.dictionary(id)); 234 | } 235 | 236 | // union 237 | case Type.Union: { 238 | if (version < Version.V5) { 239 | buffer(); // skip unused null bitmap 240 | } 241 | return new BatchType({ 242 | ...base, 243 | typeIds: buffer(int8Array), 244 | offsets: type.mode === UnionMode.Sparse ? 
null : buffer(type.offsets), 245 | children: ctx.visit(type.children) 246 | }); 247 | } 248 | 249 | // unsupported type 250 | default: 251 | throw new Error(invalidDataType(typeId)); 252 | } 253 | } 254 | -------------------------------------------------------------------------------- /src/encode/data-type.js: -------------------------------------------------------------------------------- 1 | import { DateUnit, IntervalUnit, Precision, TimeUnit, Type, UnionMode } from '../constants.js'; 2 | import { invalidDataType } from '../data-types.js'; 3 | import { checkOneOf } from '../util/objects.js'; 4 | 5 | /** 6 | * Encode a data type into a flatbuffer. 7 | * @param {import('./builder.js').Builder} builder 8 | * @param {import('../types.js').DataType} type 9 | * @returns {number} The offset at which the data type is written. 10 | */ 11 | export function encodeDataType(builder, type) { 12 | const typeId = checkOneOf(type.typeId, Type, invalidDataType); 13 | 14 | switch (typeId) { 15 | case Type.Dictionary: 16 | return encodeDictionary(builder, type); 17 | case Type.Int: 18 | return encodeInt(builder, type); 19 | case Type.Float: 20 | return encodeFloat(builder, type); 21 | case Type.Decimal: 22 | return encodeDecimal(builder, type); 23 | case Type.Date: 24 | return encodeDate(builder, type); 25 | case Type.Time: 26 | return encodeTime(builder, type); 27 | case Type.Timestamp: 28 | return encodeTimestamp(builder, type); 29 | case Type.Interval: 30 | return encodeInterval(builder, type); 31 | case Type.Duration: 32 | return encodeDuration(builder, type); 33 | case Type.FixedSizeBinary: 34 | case Type.FixedSizeList: 35 | return encodeFixedSize(builder, type); 36 | case Type.Map: 37 | return encodeMap(builder, type); 38 | case Type.Union: 39 | return encodeUnion(builder, type); 40 | } 41 | // case Type.Null: 42 | // case Type.Binary: 43 | // case Type.LargeBinary: 44 | // case Type.BinaryView: 45 | // case Type.Bool: 46 | // case Type.Utf8: 47 | // case Type.Utf8View: 48 | // case Type.LargeUtf8: 49 | // case Type.List: 50 | // case Type.ListView: 51 | // case Type.LargeList: 52 | // case Type.LargeListView: 53 | // case Type.RunEndEncoded: 54 | // case Type.Struct: 55 | return builder.addObject(0); 56 | } 57 | 58 | function encodeDate(builder, type) { 59 | return builder.addObject(1, b => { 60 | b.addInt16(0, type.unit, DateUnit.MILLISECOND); 61 | }); 62 | } 63 | 64 | function encodeDecimal(builder, type) { 65 | return builder.addObject(3, b => { 66 | b.addInt32(0, type.precision, 0); 67 | b.addInt32(1, type.scale, 0); 68 | b.addInt32(2, type.bitWidth, 128); 69 | }); 70 | } 71 | 72 | function encodeDuration(builder, type) { 73 | return builder.addObject(1, b => { 74 | b.addInt16(0, type.unit, TimeUnit.MILLISECOND); 75 | }); 76 | } 77 | 78 | function encodeFixedSize(builder, type) { 79 | return builder.addObject(1, b => { 80 | b.addInt32(0, type.stride, 0); 81 | }); 82 | } 83 | 84 | function encodeFloat(builder, type) { 85 | return builder.addObject(1, b => { 86 | b.addInt16(0, type.precision, Precision.HALF); 87 | }); 88 | } 89 | 90 | function encodeInt(builder, type) { 91 | return builder.addObject(2, b => { 92 | b.addInt32(0, type.bitWidth, 0); 93 | b.addInt8(1, +type.signed, 0); 94 | }); 95 | } 96 | 97 | function encodeInterval(builder, type) { 98 | return builder.addObject(1, b => { 99 | b.addInt16(0, type.unit, IntervalUnit.YEAR_MONTH); 100 | }); 101 | } 102 | 103 | function encodeMap(builder, type) { 104 | return builder.addObject(1, b => { 105 | b.addInt8(0, +type.keysSorted, 0); 106 | }); 
107 | } 108 | 109 | function encodeTime(builder, type) { 110 | return builder.addObject(2, b => { 111 | b.addInt16(0, type.unit, TimeUnit.MILLISECOND); 112 | b.addInt32(1, type.bitWidth, 32); 113 | }); 114 | } 115 | 116 | function encodeTimestamp(builder, type) { 117 | const timezoneOffset = builder.addString(type.timezone); 118 | return builder.addObject(2, b => { 119 | b.addInt16(0, type.unit, TimeUnit.SECOND); 120 | b.addOffset(1, timezoneOffset, 0); 121 | }); 122 | } 123 | 124 | function encodeUnion(builder, type) { 125 | const typeIdsOffset = builder.addVector( 126 | type.typeIds, 4, 4, 127 | (builder, value) => builder.addInt32(value) 128 | ); 129 | return builder.addObject(2, b => { 130 | b.addInt16(0, type.mode, UnionMode.Sparse); 131 | b.addOffset(1, typeIdsOffset, 0); 132 | }); 133 | } 134 | 135 | function encodeDictionary(builder, type) { 136 | // The Arrow spec uses signed 32-bit integers as the default index type. 137 | // However, multiple 3rd party tools fail on a null (default) index type, 138 | // so we always encode the index data type explicitly here. 139 | return builder.addObject(4, b => { 140 | b.addInt64(0, type.id, 0); 141 | b.addOffset(1, encodeDataType(builder, type.indices), 0); 142 | b.addInt8(2, +type.ordered, 0); 143 | // NOT SUPPORTED: 3, dictionaryKind (defaults to dense array) 144 | }); 145 | } 146 | -------------------------------------------------------------------------------- /src/encode/dictionary-batch.js: -------------------------------------------------------------------------------- 1 | import { encodeRecordBatch } from './record-batch.js'; 2 | 3 | /** 4 | * @param {import('./builder.js').Builder} builder 5 | * @param {import('../types.js').DictionaryBatch} dictionaryBatch 6 | * @returns {number} 7 | */ 8 | export function encodeDictionaryBatch(builder, dictionaryBatch) { 9 | const dataOffset = encodeRecordBatch(builder, dictionaryBatch.data); 10 | return builder.addObject(3, b => { 11 | b.addInt64(0, dictionaryBatch.id, 0); 12 | b.addOffset(1, dataOffset, 0); 13 | b.addInt8(2, +dictionaryBatch.isDelta, 0); 14 | }); 15 | } 16 | -------------------------------------------------------------------------------- /src/encode/encode-ipc.js: -------------------------------------------------------------------------------- 1 | import { EOS, MAGIC, MessageHeader } from '../constants.js'; 2 | import { Builder } from './builder.js'; 3 | import { encodeDictionaryBatch } from './dictionary-batch.js'; 4 | import { writeFooter } from './footer.js'; 5 | import { encodeRecordBatch } from './record-batch.js'; 6 | import { encodeSchema } from './schema.js'; 7 | import { writeMessage } from './message.js'; 8 | import { MemorySink } from './sink.js'; 9 | 10 | const STREAM = 'stream'; 11 | const FILE = 'file'; 12 | 13 | /** 14 | * Encode assembled data into Arrow IPC binary format. 15 | * @param {any} data Assembled table data. 16 | * @param {object} options Encoding options. 17 | * @param {import('./sink.js').Sink} [options.sink] IPC byte consumer. 18 | * @param {'stream' | 'file'} [options.format] Arrow stream or file format. 19 | * @returns {import('./sink.js').Sink} The sink that was passed in. 
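 * @example
 * // A minimal sketch (assuming `data` holds assembled table data, as
 * // prepared internally by tableToIPC): encode to an in-memory sink,
 * // then collect the concatenated bytes as a Uint8Array.
 * const bytes = encodeIPC(data, { format: 'stream' }).finish();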
20 | */ 21 | export function encodeIPC(data, { sink, format = STREAM } = {}) { 22 | if (format !== STREAM && format !== FILE) { 23 | throw new Error(`Unrecognized Arrow IPC format: ${format}`); 24 | } 25 | const { schema, dictionaries = [], records = [], metadata } = data; 26 | const builder = new Builder(sink || new MemorySink()); 27 | const file = format === FILE; 28 | const dictBlocks = []; 29 | const recordBlocks = []; 30 | 31 | if (file) { 32 | builder.addBuffer(MAGIC); 33 | } 34 | 35 | // both stream and file start with the schema 36 | if (schema) { 37 | writeMessage( 38 | builder, 39 | MessageHeader.Schema, 40 | encodeSchema(builder, schema), 41 | 0 42 | ); 43 | } 44 | 45 | // write dictionary messages 46 | for (const dict of dictionaries) { 47 | const { data } = dict; 48 | writeMessage( 49 | builder, 50 | MessageHeader.DictionaryBatch, 51 | encodeDictionaryBatch(builder, dict), 52 | data.byteLength, 53 | dictBlocks 54 | ); 55 | writeBuffers(builder, data.buffers); 56 | } 57 | 58 | // write record batch messages 59 | for (const batch of records) { 60 | writeMessage( 61 | builder, 62 | MessageHeader.RecordBatch, 63 | encodeRecordBatch(builder, batch), 64 | batch.byteLength, 65 | recordBlocks 66 | ); 67 | writeBuffers(builder, batch.buffers); 68 | } 69 | 70 | // both stream and file include an end-of-stream message 71 | builder.addBuffer(EOS); 72 | 73 | if (file) { 74 | writeFooter(builder, schema, dictBlocks, recordBlocks, metadata); 75 | } 76 | 77 | return builder.sink; 78 | } 79 | 80 | /** 81 | * Write byte buffers to the builder sink. 82 | * Buffers are aligned to 64 bits (8 bytes) as needed. 83 | * @param {import('./builder.js').Builder} builder 84 | * @param {Uint8Array[]} buffers 85 | */ 86 | function writeBuffers(builder, buffers) { 87 | for (let i = 0; i < buffers.length; ++i) { 88 | builder.addBuffer(buffers[i]); // handles alignment for us 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/encode/footer.js: -------------------------------------------------------------------------------- 1 | import { MAGIC, Version } from '../constants.js'; 2 | import { encodeMetadata } from './metadata.js'; 3 | import { encodeSchema } from './schema.js'; 4 | 5 | /** 6 | * Write a file footer. 7 | * @param {import('./builder.js').Builder} builder The binary builder. 8 | * @param {import('../types.js').Schema} schema The table schema. 9 | * @param {import('../types.js').Block[]} dictBlocks Dictionary batch file blocks. 10 | * @param {import('../types.js').Block[]} recordBlocks Record batch file blocks. 11 | * @param {Map<string, string> | null} metadata File-level metadata.
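 * (Per the Arrow IPC file format, the flushed footer content is followed
 * by a 4-byte footer size and the trailing magic bytes; see the file tail
 * writes at the end of this function.)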
12 | */ 13 | export function writeFooter(builder, schema, dictBlocks, recordBlocks, metadata) { 14 | // encode footer flatbuffer 15 | const metadataOffset = encodeMetadata(builder, metadata); 16 | const recsOffset = builder.addVector(recordBlocks, 24, 8, encodeBlock); 17 | const dictsOffset = builder.addVector(dictBlocks, 24, 8, encodeBlock); 18 | const schemaOffset = encodeSchema(builder, schema); 19 | builder.finish( 20 | builder.addObject(5, b => { 21 | b.addInt16(0, Version.V5, Version.V1); 22 | b.addOffset(1, schemaOffset, 0); 23 | b.addOffset(2, dictsOffset, 0); 24 | b.addOffset(3, recsOffset, 0); 25 | b.addOffset(4, metadataOffset, 0); 26 | }) 27 | ); 28 | const size = builder.offset(); 29 | 30 | // add eos with continuation indicator 31 | builder.addInt32(0); 32 | builder.addInt32(-1); 33 | 34 | // write builder contents 35 | builder.flush(); 36 | 37 | // write file tail 38 | builder.sink.write(new Uint8Array(Int32Array.of(size).buffer)); 39 | builder.sink.write(MAGIC); 40 | } 41 | 42 | /** 43 | * Encode a file pointer block. 44 | * @param {import('./builder.js').Builder} builder 45 | * @param {import('../types.js').Block} block 46 | * @returns {number} The current block offset. 47 | */ 48 | function encodeBlock(builder, { offset, metadataLength, bodyLength }) { 49 | builder.writeInt64(bodyLength); 50 | builder.writeInt32(0); 51 | builder.writeInt32(metadataLength); 52 | builder.writeInt64(offset); 53 | return builder.offset(); 54 | } 55 | -------------------------------------------------------------------------------- /src/encode/message.js: -------------------------------------------------------------------------------- 1 | import { MessageHeader, Version } from '../constants.js'; 2 | 3 | /** 4 | * Write an IPC message to the builder sink. 5 | * @param {import('./builder.js').Builder} builder 6 | * @param {import('../types.js').MessageHeader_} headerType 7 | * @param {number} headerOffset 8 | * @param {number} bodyLength 9 | * @param {import('../types.js').Block[]} [blocks] 10 | */ 11 | export function writeMessage(builder, headerType, headerOffset, bodyLength, blocks) { 12 | builder.finish( 13 | builder.addObject(5, b => { 14 | b.addInt16(0, Version.V5, Version.V1); 15 | b.addInt8(1, headerType, MessageHeader.NONE); 16 | b.addOffset(2, headerOffset, 0); 17 | b.addInt64(3, bodyLength, 0); 18 | // NOT SUPPORTED: 4, message-level metadata 19 | }) 20 | ); 21 | 22 | const prefixSize = 8; // continuation indicator + message size 23 | const messageSize = builder.offset(); 24 | const alignedSize = (messageSize + prefixSize + 7) & ~7; 25 | 26 | // track blocks for file footer 27 | blocks?.push({ 28 | offset: builder.outputBytes, 29 | metadataLength: alignedSize, 30 | bodyLength 31 | }); 32 | 33 | // write size prefix (including padding) 34 | builder.addInt32(alignedSize - prefixSize); 35 | 36 | // write the stream continuation indicator 37 | builder.addInt32(-1); 38 | 39 | // flush the builder content 40 | builder.flush(); 41 | 42 | // add alignment padding as needed 43 | builder.addPadding(alignedSize - messageSize - prefixSize); 44 | } 45 | -------------------------------------------------------------------------------- /src/encode/metadata.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @param {import('./builder.js').Builder} builder 3 | * @param {Map<string, string>} metadata 4 | * @returns {number} 5 | */ 6 | export function encodeMetadata(builder, metadata) { 7 | return metadata?.size > 0 8 | ?
builder.addOffsetVector(Array.from(metadata, ([k, v]) => { 9 | const key = builder.addString(`${k}`); 10 | const val = builder.addString(`${v}`); 11 | return builder.addObject(2, b => { 12 | b.addOffset(0, key, 0); 13 | b.addOffset(1, val, 0); 14 | }); 15 | })) 16 | : 0; 17 | } 18 | -------------------------------------------------------------------------------- /src/encode/record-batch.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @param {import('./builder.js').Builder} builder 3 | * @param {import('../types.js').RecordBatch} batch 4 | * @returns {number} 5 | */ 6 | export function encodeRecordBatch(builder, batch) { 7 | const { nodes, regions, variadic } = batch; 8 | const nodeVector = builder.addVector(nodes, 16, 8, 9 | (builder, node) => { 10 | builder.writeInt64(node.nullCount); 11 | builder.writeInt64(node.length); 12 | return builder.offset(); 13 | } 14 | ); 15 | const regionVector = builder.addVector(regions, 16, 8, 16 | (builder, region) => { 17 | builder.writeInt64(region.length); 18 | builder.writeInt64(region.offset); 19 | return builder.offset(); 20 | } 21 | ); 22 | const variadicVector = builder.addVector(variadic, 8, 8, 23 | (builder, count) => builder.addInt64(count) 24 | ); 25 | return builder.addObject(5, b => { 26 | b.addInt64(0, nodes[0].length, 0); 27 | b.addOffset(1, nodeVector, 0); 28 | b.addOffset(2, regionVector, 0); 29 | // NOT SUPPORTED: 3, compression offset 30 | b.addOffset(4, variadicVector, 0); 31 | }); 32 | } 33 | -------------------------------------------------------------------------------- /src/encode/schema.js: -------------------------------------------------------------------------------- 1 | import { Type } from '../constants.js'; 2 | import { encodeDataType } from './data-type.js'; 3 | import { encodeMetadata } from './metadata.js'; 4 | 5 | const isLittleEndian = new Uint16Array(new Uint8Array([1, 0]).buffer)[0] === 1; 6 | 7 | /** 8 | * @param {import('./builder.js').Builder} builder 9 | * @param {import('../types.js').Schema} schema 10 | * @returns {number} 11 | */ 12 | export function encodeSchema(builder, schema) { 13 | const { fields, metadata } = schema; 14 | const fieldOffsets = fields.map(f => encodeField(builder, f)); 15 | const fieldsVectorOffset = builder.addOffsetVector(fieldOffsets); 16 | const metadataOffset = encodeMetadata(builder, metadata); 17 | return builder.addObject(4, b => { 18 | b.addInt16(0, +(!isLittleEndian), 0); 19 | b.addOffset(1, fieldsVectorOffset, 0); 20 | b.addOffset(2, metadataOffset, 0); 21 | // NOT SUPPORTED: 3, features 22 | }); 23 | } 24 | 25 | /** 26 | * @param {import('./builder.js').Builder} builder 27 | * @param {import('../types.js').Field} field 28 | * @returns {number} 29 | */ 30 | function encodeField(builder, field) { 31 | const { name, nullable, type, metadata } = field; 32 | let { typeId } = type; 33 | 34 | // encode field data type 35 | let typeOffset = 0; 36 | let dictionaryOffset = 0; 37 | if (typeId !== Type.Dictionary) { 38 | typeOffset = encodeDataType(builder, type); 39 | } else { 40 | const dict = /** @type {import('../types.js').DictionaryType} */ (type).dictionary; 41 | typeId = dict.typeId; 42 | dictionaryOffset = encodeDataType(builder, type); 43 | typeOffset = encodeDataType(builder, dict); 44 | } 45 | 46 | // encode children, metadata, name, and field object 47 | // @ts-ignore 48 | const childOffsets = (type.children || []).map(f => encodeField(builder, f)); 49 | const childrenVectorOffset = builder.addOffsetVector(childOffsets); 50 | 
const metadataOffset = encodeMetadata(builder, metadata); 51 | const nameOffset = builder.addString(name); 52 | return builder.addObject(7, b => { 53 | b.addOffset(0, nameOffset, 0); 54 | b.addInt8(1, +nullable, +false); 55 | b.addInt8(2, typeId, Type.NONE); 56 | b.addOffset(3, typeOffset, 0); 57 | b.addOffset(4, dictionaryOffset, 0); 58 | b.addOffset(5, childrenVectorOffset, 0); 59 | b.addOffset(6, metadataOffset, 0); 60 | }); 61 | } 62 | -------------------------------------------------------------------------------- /src/encode/sink.js: -------------------------------------------------------------------------------- 1 | export class Sink { 2 | /** 3 | * Write bytes to this sink. 4 | * @param {Uint8Array} bytes The byte buffer to write. 5 | */ 6 | write(bytes) { // eslint-disable-line no-unused-vars 7 | } 8 | 9 | /** 10 | * Write padding bytes (zeroes) to this sink. 11 | * @param {number} byteCount The number of padding bytes. 12 | */ 13 | pad(byteCount) { 14 | this.write(new Uint8Array(byteCount)); 15 | } 16 | 17 | /** 18 | * @returns {Uint8Array | null} 19 | */ 20 | finish() { 21 | return null; 22 | } 23 | } 24 | 25 | export class MemorySink extends Sink { 26 | /** 27 | * A sink that collects bytes in memory. 28 | */ 29 | constructor() { 30 | super(); 31 | this.buffers = []; 32 | } 33 | 34 | /** 35 | * Write bytes. 36 | * @param {Uint8Array} bytes 37 | */ 38 | write(bytes) { 39 | this.buffers.push(bytes); 40 | } 41 | 42 | /** 43 | * @returns {Uint8Array} 44 | */ 45 | finish() { 46 | const bufs = this.buffers; 47 | const size = bufs.reduce((sum, b) => sum + b.byteLength, 0); 48 | const buf = new Uint8Array(size); 49 | for (let i = 0, off = 0; i < bufs.length; ++i) { 50 | buf.set(bufs[i], off); 51 | off += bufs[i].byteLength; 52 | } 53 | return buf; 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/index-types.ts: -------------------------------------------------------------------------------- 1 | export * from './index.js'; 2 | export * from './types.js'; 3 | -------------------------------------------------------------------------------- /src/index.js: -------------------------------------------------------------------------------- 1 | export { 2 | Version, 3 | Endianness, 4 | Type, 5 | Precision, 6 | DateUnit, 7 | TimeUnit, 8 | IntervalUnit, 9 | UnionMode 10 | } from './constants.js'; 11 | 12 | export { 13 | field, 14 | nullType, 15 | int, int8, int16, int32, int64, uint8, uint16, uint32, uint64, 16 | float, float16, float32, float64, 17 | binary, 18 | utf8, 19 | bool, 20 | decimal, decimal32, decimal64, decimal128, decimal256, 21 | date, dateDay, dateMillisecond, 22 | dictionary, 23 | time, timeSecond, timeMillisecond, timeMicrosecond, timeNanosecond, 24 | timestamp, 25 | interval, 26 | list, 27 | struct, 28 | union, 29 | fixedSizeBinary, 30 | fixedSizeList, 31 | map, 32 | duration, 33 | largeBinary, 34 | largeUtf8, 35 | largeList, 36 | runEndEncoded, 37 | binaryView, 38 | utf8View, 39 | listView, 40 | largeListView 41 | } from './data-types.js'; 42 | 43 | export { Batch } from './batch.js'; 44 | export { Column } from './column.js'; 45 | export { Table } from './table.js'; 46 | export { batchType } from './batch-type.js'; 47 | export { tableFromIPC } from './decode/table-from-ipc.js'; 48 | export { tableToIPC } from './encode/table-to-ipc.js'; 49 | export { tableFromArrays } from './build/table-from-arrays.js'; 50 | export { tableFromColumns } from './build/table-from-columns.js'; 51 | export { columnFromArray } from
'./build/column-from-array.js'; 52 | export { columnFromValues } from './build/column-from-values.js'; 53 | export { dictionaryContext } from './build/builders/dictionary.js'; 54 | -------------------------------------------------------------------------------- /src/table.js: -------------------------------------------------------------------------------- 1 | import { bisect } from './util/arrays.js'; 2 | import { objectFactory, proxyFactory } from './util/struct.js'; 3 | 4 | /** 5 | * A table consists of a collection of named columns (or 'children'). 6 | * To work with table data directly in JavaScript, use `toColumns()` 7 | * to extract an object that maps column names to extracted value arrays, 8 | * or `toArray()` to extract an array of row objects. For random access 9 | * by row index, use `getChild()` to access data for a specific column. 10 | */ 11 | export class Table { 12 | /** 13 | * Create a new table with the given schema and columns (children). 14 | * @param {import('./types.js').Schema} schema The table schema. 15 | * @param {import('./column.js').Column[]} children The table columns. 16 | * @param {boolean} [useProxy=false] Flag indicating if row proxy 17 | * objects should be used to represent table rows (default `false`). 18 | */ 19 | constructor(schema, children, useProxy = false) { 20 | const names = schema.fields.map(f => f.name); 21 | 22 | /** @readonly */ 23 | this.schema = schema; 24 | /** @readonly */ 25 | this.names = names; 26 | /** 27 | * @type {import('./column.js').Column[]} 28 | * @readonly 29 | */ 30 | this.children = children; 31 | /** 32 | * @type {import('./types.js').StructFactory} 33 | * @readonly 34 | */ 35 | this.factory = useProxy ? proxyFactory : objectFactory; 36 | 37 | // lazily created row object generators 38 | const gen = []; 39 | 40 | /** 41 | * Returns a row object generator for the given batch index. 42 | * @private 43 | * @readonly 44 | * @param {number} b The batch index. 45 | * @returns {(index: number) => Record<string, any>} 46 | */ 47 | this.getFactory = b => gen[b] 48 | ?? (gen[b] = this.factory(names, children.map(c => c.data[b]))); 49 | } 50 | 51 | /** 52 | * Provide an informative object string tag. 53 | */ 54 | get [Symbol.toStringTag]() { 55 | return 'Table'; 56 | } 57 | 58 | /** 59 | * The number of columns in this table. 60 | * @return {number} The number of columns. 61 | */ 62 | get numCols() { 63 | return this.names.length; 64 | } 65 | 66 | /** 67 | * The number of rows in this table. 68 | * @return {number} The number of rows. 69 | */ 70 | get numRows() { 71 | return this.children[0]?.length ?? 0; 72 | } 73 | 74 | /** 75 | * Return the child column at the given index position. 76 | * @param {number} index The column index. 77 | * @returns {import('./column.js').Column} 78 | */ 79 | getChildAt(index) { 80 | return this.children[index]; 81 | } 82 | 83 | /** 84 | * Return the first child column with the given name. 85 | * @param {string} name The column name. 86 | * @returns {import('./column.js').Column} 87 | */ 88 | getChild(name) { 89 | const i = this.names.findIndex(x => x === name); 90 | return i > -1 ? this.children[i] : undefined; 91 | } 92 | 93 | /** 94 | * Construct a new table containing only columns at the specified indices. 95 | * The order of columns in the new table matches the order of input indices. 96 | * @param {number[]} indices The indices of columns to keep. 97 | * @param {string[]} [as] Optional new names for selected columns. 98 | * @returns {Table} A new table with columns at the specified indices.
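 * @example
 * // A small sketch (assuming a table with at least three columns):
 * // keep columns 2 and 0, renaming column 2 to 'third'.
 * const subset = table.selectAt([2, 0], ['third']);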
99 | */ 100 | selectAt(indices, as = []) { 101 | const { children, factory, schema } = this; 102 | const { fields } = schema; 103 | return new Table( 104 | { 105 | ...schema, 106 | fields: indices.map((i, j) => renameField(fields[i], as[j])) 107 | }, 108 | indices.map(i => children[i]), 109 | factory === proxyFactory 110 | ); 111 | } 112 | 113 | /** 114 | * Construct a new table containing only columns with the specified names. 115 | * If columns have duplicate names, the first (with lowest index) is used. 116 | * The order of columns in the new table matches the order of input names. 117 | * @param {string[]} names Names of columns to keep. 118 | * @param {string[]} [as] Optional new names for selected columns. 119 | * @returns {Table} A new table with columns matching the specified names. 120 | */ 121 | select(names, as) { 122 | const all = this.names; 123 | const indices = names.map(name => all.indexOf(name)); 124 | return this.selectAt(indices, as); 125 | } 126 | 127 | /** 128 | * Return an object mapping column names to extracted value arrays. 129 | * @returns {Record<string, import('./types.js').ValueArray<any>>} 130 | */ 131 | toColumns() { 132 | const { children, names } = this; 133 | /** @type {Record<string, import('./types.js').ValueArray<any>>} */ 134 | const cols = {}; 135 | names.forEach((name, i) => cols[name] = children[i]?.toArray() ?? []); 136 | return cols; 137 | } 138 | 139 | /** 140 | * Return an array of objects representing the rows of this table. 141 | * @returns {Record<string, any>[]} 142 | */ 143 | toArray() { 144 | const { children, getFactory, numRows } = this; 145 | const data = children[0]?.data ?? []; 146 | const output = Array(numRows); 147 | for (let b = 0, row = -1; b < data.length; ++b) { 148 | const f = getFactory(b); 149 | for (let i = 0; i < data[b].length; ++i) { 150 | output[++row] = f(i); 151 | } 152 | } 153 | return output; 154 | } 155 | 156 | /** 157 | * Return an iterator over objects representing the rows of this table. 158 | * @returns {Generator<Record<string, any>, any, null>} 159 | */ 160 | *[Symbol.iterator]() { 161 | const { children, getFactory } = this; 162 | const data = children[0]?.data ?? []; 163 | for (let b = 0; b < data.length; ++b) { 164 | const f = getFactory(b); 165 | for (let i = 0; i < data[b].length; ++i) { 166 | yield f(i); 167 | } 168 | } 169 | } 170 | 171 | /** 172 | * Return a row object for the given index. 173 | * @param {number} index The row index. 174 | * @returns {Record<string, any>} The row object. 175 | */ 176 | at(index) { 177 | const { children, getFactory, numRows } = this; 178 | if (index < 0 || index >= numRows) return null; 179 | const [{ offsets }] = children; 180 | const b = bisect(offsets, index) - 1; 181 | return getFactory(b)(index - offsets[b]); 182 | } 183 | 184 | /** 185 | * Return a row object for the given index. This method is the same as 186 | * `at()` and is provided for better compatibility with Apache Arrow JS. 187 | * @param {number} index The row index. 188 | * @returns {Record<string, any>} The row object. 189 | */ 190 | get(index) { 191 | return this.at(index); 192 | } 193 | } 194 | 195 | function renameField(field, name) { 196 | return (name != null && name !== field.name) 197 | ?
{ ...field, name } 198 | : field; 199 | } 200 | -------------------------------------------------------------------------------- /src/util/arrays.js: -------------------------------------------------------------------------------- 1 | export const uint8Array = Uint8Array; 2 | export const uint16Array = Uint16Array; 3 | export const uint32Array = Uint32Array; 4 | export const uint64Array = BigUint64Array; 5 | export const int8Array = Int8Array; 6 | export const int16Array = Int16Array; 7 | export const int32Array = Int32Array; 8 | export const int64Array = BigInt64Array; 9 | export const float32Array = Float32Array; 10 | export const float64Array = Float64Array; 11 | 12 | /** 13 | * Return the appropriate typed array constructor for the given 14 | * integer type metadata. 15 | * @param {number} bitWidth The integer size in bits. 16 | * @param {boolean} signed Flag indicating if the integer is signed. 17 | * @returns {import('../types.js').IntArrayConstructor} 18 | */ 19 | export function intArrayType(bitWidth, signed) { 20 | const i = Math.log2(bitWidth) - 3; 21 | return ( 22 | signed 23 | ? [int8Array, int16Array, int32Array, int64Array] 24 | : [uint8Array, uint16Array, uint32Array, uint64Array] 25 | )[i]; 26 | } 27 | 28 | /** Shared prototype for typed arrays. */ 29 | const TypedArray = Object.getPrototypeOf(Int8Array); 30 | 31 | /** 32 | * Check if a value is a typed array. 33 | * @param {*} value The value to check. 34 | * @returns {value is import('../types.js').TypedArray} 35 | * True if value is a typed array, false otherwise. 36 | */ 37 | export function isTypedArray(value) { 38 | return value instanceof TypedArray; 39 | } 40 | 41 | /** 42 | * Check if a value is either a standard array or typed array. 43 | * @param {*} value The value to check. 44 | * @returns {value is (Array | import('../types.js').TypedArray)} 45 | * True if value is an array, false otherwise. 46 | */ 47 | export function isArray(value) { 48 | return Array.isArray(value) || isTypedArray(value); 49 | } 50 | 51 | /** 52 | * Check if a value is an array type (constructor) for 64-bit integers, 53 | * one of BigInt64Array or BigUint64Array. 54 | * @param {*} value The value to check. 55 | * @returns {value is import('../types.js').Int64ArrayConstructor} 56 | * True if value is a 64-bit array type, false otherwise. 57 | */ 58 | export function isInt64ArrayType(value) { 59 | return value === int64Array || value === uint64Array; 60 | } 61 | 62 | /** 63 | * Determine the correct index into an offset array for a given 64 | * full column row index. Assumes offset indices can be manipulated 65 | * as 32-bit signed integers. 66 | * @param {import('../types.js').IntegerArray} offsets The offsets array. 67 | * @param {number} index The full column row index. 68 | */ 69 | export function bisect(offsets, index) { 70 | let a = 0; 71 | let b = offsets.length; 72 | if (b <= 2147483648) { // 2 ** 31 73 | // fast version, use unsigned bit shift 74 | // array length fits within 32-bit signed integer 75 | do { 76 | const mid = (a + b) >>> 1; 77 | if (offsets[mid] <= index) a = mid + 1; 78 | else b = mid; 79 | } while (a < b); 80 | } else { 81 | // slow version, use division and truncate 82 | // array length exceeds 32-bit signed integer 83 | do { 84 | const mid = Math.trunc((a + b) / 2); 85 | if (offsets[mid] <= index) a = mid + 1; 86 | else b = mid; 87 | } while (a < b); 88 | } 89 | return a; 90 | } 91 | 92 | /** 93 | * Compute a 64-bit aligned buffer size. 94 | * @param {number} length The starting size. 
95 | * @param {number} bpe Bytes per element. 96 | * @returns {number} The aligned size. 97 | */ 98 | function align64(length, bpe = 1) { 99 | return (((length * bpe) + 7) & ~7) / bpe; 100 | } 101 | 102 | /** 103 | * Return a 64-bit aligned version of the array. 104 | * @template {import('../types.js').TypedArray} T 105 | * @param {T} array The array. 106 | * @param {number} length The current array length. 107 | * @returns {T} The aligned array. 108 | */ 109 | export function align(array, length = array.length) { 110 | const alignedLength = align64(length, array.BYTES_PER_ELEMENT); 111 | return array.length > alignedLength ? /** @type {T} */ (array.subarray(0, alignedLength)) 112 | : array.length < alignedLength ? resize(array, alignedLength) 113 | : array; 114 | } 115 | 116 | /** 117 | * Resize a typed array to exactly the specified length. 118 | * @template {import('../types.js').TypedArray} T 119 | * @param {T} array The array. 120 | * @param {number} newLength The new length. 121 | * @param {number} [offset] The offset at which to copy the old array. 122 | * @returns {T} The resized array. 123 | */ 124 | export function resize(array, newLength, offset = 0) { 125 | // @ts-ignore 126 | const newArray = new array.constructor(newLength); 127 | newArray.set(array, offset); 128 | return newArray; 129 | } 130 | 131 | /** 132 | * Grow a typed array to accommodate a minimum index. The array size is 133 | * doubled until it exceeds the minimum index. 134 | * @template {import('../types.js').TypedArray} T 135 | * @param {T} array The array. 136 | * @param {number} index The minimum index. 137 | * @param {boolean} [shift] Flag to shift copied bytes to back of array. 138 | * @returns {T} The resized array. 139 | */ 140 | export function grow(array, index, shift) { 141 | while (array.length <= index) { 142 | array = resize(array, array.length << 1, shift ? array.length : 0); 143 | } 144 | return array; 145 | } 146 | -------------------------------------------------------------------------------- /src/util/objects.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Check if a value is a Date instance. 3 | * @param {*} value The value to check. 4 | * @returns {value is Date} True if value is a Date, false otherwise. 5 | */ 6 | export function isDate(value) { 7 | return value instanceof Date; 8 | } 9 | 10 | /** 11 | * Check if a value is iterable. 12 | * @param {*} value The value to check. 13 | * @returns {value is Iterable} True if value is iterable, false otherwise. 14 | */ 15 | export function isIterable(value) { 16 | return typeof value[Symbol.iterator] === 'function'; 17 | } 18 | 19 | /** 20 | * Return the input value if it passes a test. 21 | * Otherwise throw an error using the given message generator. 22 | * @template T 23 | * @param {T} value The value to check. 24 | * @param {(value: T) => boolean} test The test function. 25 | * @param {(value: *) => string} message Message generator. 26 | * @returns {T} The input value. 27 | * @throws if the value does not pass the test 28 | */ 29 | export function check(value, test, message) { 30 | if (test(value)) return value; 31 | throw new Error(message(value)); 32 | } 33 | 34 | /** 35 | * Return the input value if it exists in the provided set. 36 | * Otherwise throw an error using the given message generator. 37 | * @template T 38 | * @param {T} value The value to check. 39 | * @param {T[] | Record<string, T>} set The set of valid values. 40 | * @param {(value: *) => string} [message] Message generator.
41 | * @returns {T} The input value. 42 | * @throws if the value is not included in the set 43 | */ 44 | export function checkOneOf(value, set, message) { 45 | set = Array.isArray(set) ? set : Object.values(set); 46 | return check( 47 | value, 48 | (value) => set.includes(value), 49 | message ?? (() => `${value} must be one of ${set}`) 50 | ); 51 | } 52 | 53 | /** 54 | * Return the first object key that pairs with the given value. 55 | * @param {Record<string, any>} object The object to search. 56 | * @param {any} value The value to lookup. 57 | * @returns {string} The first matching key, or '' if not found. 58 | */ 59 | export function keyFor(object, value) { 60 | for (const [key, val] of Object.entries(object)) { 61 | if (val === value) return key; 62 | } 63 | return ''; 64 | } 65 | -------------------------------------------------------------------------------- /src/util/read.js: -------------------------------------------------------------------------------- 1 | import { toNumber } from './numbers.js'; 2 | import { decodeUtf8 } from './strings.js'; 3 | 4 | /** The size in bytes of a 32-bit integer. */ 5 | export const SIZEOF_INT = 4; 6 | 7 | /** The size in bytes of a 16-bit integer. */ 8 | export const SIZEOF_SHORT = 2; 9 | 10 | /** 11 | * Return a boolean for a single bit in a bitmap. 12 | * @param {Uint8Array} bitmap The bitmap. 13 | * @param {number} index The bit index to read. 14 | * @returns {boolean} The boolean bitmap value. 15 | */ 16 | export function decodeBit(bitmap, index) { 17 | return (bitmap[index >> 3] & 1 << (index % 8)) !== 0; 18 | } 19 | 20 | /** 21 | * Lookup helper for flatbuffer object (table) entries. 22 | * @param {Uint8Array} buf The byte buffer. 23 | * @param {number} index The base index of the object. 24 | */ 25 | export function readObject(buf, index) { 26 | const pos = index + readInt32(buf, index); 27 | const vtable = pos - readInt32(buf, pos); 28 | const size = readInt16(buf, vtable); 29 | /** 30 | * Retrieve a value from a flatbuffer table layout. 31 | * @template T 32 | * @param {number} index The table entry index. 33 | * @param {(buf: Uint8Array, offset: number) => T} read Read function to invoke. 34 | * @param {T} [fallback=null] The default fallback value. 35 | * @returns {T} 36 | */ 37 | return (index, read, fallback = null) => { 38 | if (index < size) { 39 | const off = readInt16(buf, vtable + index); 40 | if (off) return read(buf, pos + off); 41 | } 42 | return fallback; 43 | }; 44 | } 45 | 46 | /** 47 | * Return a buffer offset value. 48 | * @param {Uint8Array} buf 49 | * @param {number} offset 50 | * @returns {number} 51 | */ 52 | export function readOffset(buf, offset) { 53 | return offset; 54 | } 55 | 56 | /** 57 | * Return a boolean value. 58 | * @param {Uint8Array} buf 59 | * @param {number} offset 60 | * @returns {boolean} 61 | */ 62 | export function readBoolean(buf, offset) { 63 | return !!readInt8(buf, offset); 64 | } 65 | 66 | /** 67 | * Return a signed 8-bit integer value. 68 | * @param {Uint8Array} buf 69 | * @param {number} offset 70 | * @returns {number} 71 | */ 72 | export function readInt8(buf, offset) { 73 | return readUint8(buf, offset) << 24 >> 24; 74 | } 75 | 76 | /** 77 | * Return an unsigned 8-bit integer value. 78 | * @param {Uint8Array} buf 79 | * @param {number} offset 80 | * @returns {number} 81 | */ 82 | export function readUint8(buf, offset) { 83 | return buf[offset]; 84 | } 85 | 86 | /** 87 | * Return a signed 16-bit integer value.
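 * (Reads the unsigned 16-bit value, then sign-extends it via the
 * `<< 16 >> 16` shift trick; see readInt8 above for the 8-bit analog.)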
88 | * @param {Uint8Array} buf 89 | * @param {number} offset 90 | * @returns {number} 91 | */ 92 | export function readInt16(buf, offset) { 93 | return readUint16(buf, offset) << 16 >> 16; 94 | } 95 | 96 | /** 97 | * Return an unsigned 16-bit integer value. 98 | * @param {Uint8Array} buf 99 | * @param {number} offset 100 | * @returns {number} 101 | */ 102 | export function readUint16(buf, offset) { 103 | return buf[offset] | buf[offset + 1] << 8; 104 | } 105 | 106 | /** 107 | * Return a signed 32-bit integer value. 108 | * @param {Uint8Array} buf 109 | * @param {number} offset 110 | * @returns {number} 111 | */ 112 | export function readInt32(buf, offset) { 113 | return buf[offset] 114 | | buf[offset + 1] << 8 115 | | buf[offset + 2] << 16 116 | | buf[offset + 3] << 24; 117 | } 118 | 119 | /** 120 | * Return an unsigned 32-bit integer value. 121 | * @param {Uint8Array} buf 122 | * @param {number} offset 123 | * @returns {number} 124 | */ 125 | export function readUint32(buf, offset) { 126 | return readInt32(buf, offset) >>> 0; 127 | } 128 | 129 | /** 130 | * Return a signed 64-bit integer value coerced to a JS number. 131 | * Throws an error if the value exceeds what a JS number can represent. 132 | * @param {Uint8Array} buf 133 | * @param {number} offset 134 | * @returns {number} 135 | */ 136 | export function readInt64(buf, offset) { 137 | return toNumber(BigInt.asIntN( 138 | 64, 139 | BigInt(readUint32(buf, offset)) + 140 | (BigInt(readUint32(buf, offset + SIZEOF_INT)) << 32n) 141 | )); 142 | } 143 | 144 | /** 145 | * Create a JavaScript string from UTF-8 data stored inside the FlatBuffer. 146 | * This allocates a new string and converts to wide chars upon each access. 147 | * @param {Uint8Array} buf The byte buffer. 148 | * @param {number} index The index of the string entry. 149 | * @returns {string} The decoded string. 150 | */ 151 | export function readString(buf, index) { 152 | let offset = index + readInt32(buf, index); // get the string offset 153 | const length = readInt32(buf, offset); // get the string length 154 | offset += SIZEOF_INT; // skip length value 155 | return decodeUtf8(buf.subarray(offset, offset + length)); 156 | } 157 | 158 | /** 159 | * Extract a flatbuffer vector to an array. 160 | * @template T 161 | * @param {Uint8Array} buf The byte buffer. 162 | * @param {number} offset The offset location of the vector. 163 | * @param {number} stride The stride between vector entries. 164 | * @param {(buf: Uint8Array, pos: number) => T} extract Vector entry extraction function. 165 | * @returns {T[]} The extracted vector entries. 166 | */ 167 | export function readVector(buf, offset, stride, extract) { 168 | if (!offset) return []; 169 | 170 | // get base position by adding offset delta 171 | const base = offset + readInt32(buf, offset); 172 | 173 | // read vector size, extract entries 174 | return Array.from( 175 | { length: readInt32(buf, base) }, 176 | (_, i) => extract(buf, base + SIZEOF_INT + i * stride) 177 | ); 178 | } 179 | -------------------------------------------------------------------------------- /src/util/strings.js: -------------------------------------------------------------------------------- 1 | import { isArray } from './arrays.js'; 2 | import { isDate } from './objects.js'; 3 | 4 | const textDecoder = new TextDecoder('utf-8'); 5 | const textEncoder = new TextEncoder(); 6 | 7 | /** 8 | * Return a UTF-8 string decoded from a byte buffer. 9 | * @param {Uint8Array} buf The byte buffer. 10 | * @returns {string} The decoded string. 
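 * @example
 * // e.g. decoding two ASCII bytes:
 * decodeUtf8(Uint8Array.of(104, 105)) // => 'hi'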
11 | */ 12 | export function decodeUtf8(buf) { 13 | return textDecoder.decode(buf); 14 | } 15 | 16 | /** 17 | * Return a byte buffer encoded from a UTF-8 string. 18 | * @param {string} str The string to encode. 19 | * @returns {Uint8Array} The encoded byte buffer. 20 | */ 21 | export function encodeUtf8(str) { 22 | return textEncoder.encode(str); 23 | } 24 | 25 | /** 26 | * Return a string-coercible key value that uniquely identifies a value. 27 | * @param {*} value The input value. 28 | * @returns {string} The key string. 29 | */ 30 | export function keyString(value) { 31 | const val = typeof value !== 'object' || !value ? (value ?? null) 32 | : isDate(value) ? +value 33 | // @ts-ignore 34 | : isArray(value) ? `[${value.map(keyString)}]` 35 | : objectKey(value); 36 | return `${val}`; 37 | } 38 | 39 | function objectKey(value) { 40 | let s = ''; 41 | let i = -1; 42 | for (const k in value) { 43 | if (++i > 0) s += ','; 44 | s += `"${k}":${keyString(value[k])}`; 45 | } 46 | return `{${s}}`; 47 | } 48 | -------------------------------------------------------------------------------- /src/util/struct.js: -------------------------------------------------------------------------------- 1 | export const RowIndex = Symbol('rowIndex'); 2 | 3 | /** 4 | * Returns a row proxy object factory. The resulting method takes a 5 | * batch-level row index as input and returns an object that proxies 6 | * access to underlying batches. 7 | * @param {string[]} names The column (property) names. 8 | * @param {import('../batch.js').Batch[]} batches The value batches. 9 | * @returns {(index: number) => Record<string, any>} 10 | */ 11 | export function proxyFactory(names, batches) { 12 | class RowObject { 13 | /** 14 | * Create a new proxy row object representing a struct or table row. 15 | * @param {number} index The record batch row index. 16 | */ 17 | constructor(index) { 18 | this[RowIndex] = index; 19 | } 20 | 21 | /** 22 | * Return a JSON-compatible object representation. 23 | */ 24 | toJSON() { 25 | return structObject(names, batches, this[RowIndex]); 26 | } 27 | }; 28 | 29 | // prototype for row proxy objects 30 | const proto = RowObject.prototype; 31 | 32 | for (let i = 0; i < names.length; ++i) { 33 | // skip duplicated column names 34 | if (Object.hasOwn(proto, names[i])) continue; 35 | 36 | // add a getter method for the current batch 37 | const batch = batches[i]; 38 | Object.defineProperty(proto, names[i], { 39 | get() { return batch.at(this[RowIndex]); }, 40 | enumerable: true 41 | }); 42 | } 43 | 44 | return index => new RowObject(index); 45 | } 46 | 47 | /** 48 | * Returns a row object factory. The resulting method takes a 49 | * batch-level row index as input and returns an object whose property 50 | * values have been extracted from the batches. 51 | * @param {string[]} names The column (property) names. 52 | * @param {import('../batch.js').Batch[]} batches The value batches. 53 | * @returns {(index: number) => Record<string, any>} 54 | */ 55 | export function objectFactory(names, batches) { 56 | return index => structObject(names, batches, index); 57 | } 58 | 59 | /** 60 | * Return a vanilla object representing a struct (row object) type. 61 | * @param {string[]} names The column (property) names. 62 | * @param {import('../batch.js').Batch[]} batches The value batches. 63 | * @param {number} index The record batch row index.
64 | * @returns {Record<string, any>} 65 | */ 66 | export function structObject(names, batches, index) { 67 | const obj = {}; 68 | for (let i = 0; i < names.length; ++i) { 69 | obj[names[i]] = batches[i].at(index); 70 | } 71 | return obj; 72 | } 73 | -------------------------------------------------------------------------------- /test/batch-test.js: -------------------------------------------------------------------------------- 1 | import assert from 'node:assert'; 2 | import { DirectBatch } from '../src/batch.js'; 3 | 4 | describe('DirectBatch', () => { 5 | it('trims the values array', () => { 6 | const b = new DirectBatch({ 7 | length: 4, 8 | nullCount: 0, 9 | values: Int32Array.of(1, 2, 3, 4, 5, 6, 7, 8) 10 | }); 11 | assert.deepEqual(b.length, 4); 12 | assert.deepEqual([...b].length, 4); 13 | assert.deepEqual(b.value(4), undefined); 14 | }); 15 | }); 16 | -------------------------------------------------------------------------------- /test/column-from-values-test.js: -------------------------------------------------------------------------------- 1 | import assert from 'node:assert'; 2 | import { columnFromValues, int32, nullType, utf8 } from '../src/index.js'; 3 | 4 | function test(values, type, options) { 5 | compare(values, values, type, options); 6 | compare(values, callback => values.forEach(callback), type, options); 7 | } 8 | 9 | function compare(array, values, type, options) { 10 | const col = columnFromValues(values, type, options); 11 | if (type) assert.deepEqual(col.type, type); 12 | assert.strictEqual(col.length, array.length); 13 | assert.deepStrictEqual(Array.from(col), array); 14 | } 15 | 16 | describe('columnFromValues', () => { 17 | it('builds null columns', () => { 18 | test([null, null, null], nullType()); 19 | test([null, null, null, null, null], nullType(), { maxBatchRows: 2 }); 20 | }); 21 | 22 | it('builds non-null columns', () => { 23 | test([1, 2, 3]); 24 | test([1, 2, 3], int32()); 25 | test([1, 2, 3, 4, 5], int32(), { maxBatchRows: 2 }); 26 | test(['a', 'b', 'c']); 27 | test(['a', 'b', 'c'], utf8()); 28 | test(['a', 'b', 'c', 'd', 'e'], utf8(), { maxBatchRows: 2 }); 29 | 30 | // create column using only values with odd-numbered indices 31 | const values = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]; 32 | const filter = values.filter((_, i) => i % 2); 33 | compare(filter, callback => { 34 | values.forEach((v, i) => { if (i % 2) callback(v); }) 35 | }); 36 | }); 37 | }); 38 | -------------------------------------------------------------------------------- /test/data/binaryview.arrows: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uwdata/flechette/158477706cb8e0268a081b190989e706f9f6437d/test/data/binaryview.arrows -------------------------------------------------------------------------------- /test/data/convert.arrows: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uwdata/flechette/158477706cb8e0268a081b190989e706f9f6437d/test/data/convert.arrows -------------------------------------------------------------------------------- /test/data/decimal.arrow: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uwdata/flechette/158477706cb8e0268a081b190989e706f9f6437d/test/data/decimal.arrow -------------------------------------------------------------------------------- /test/data/decimal.arrows: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/uwdata/flechette/158477706cb8e0268a081b190989e706f9f6437d/test/data/decimal.arrows -------------------------------------------------------------------------------- /test/data/decimal128.arrows: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uwdata/flechette/158477706cb8e0268a081b190989e706f9f6437d/test/data/decimal128.arrows -------------------------------------------------------------------------------- /test/data/decimal256.arrows: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uwdata/flechette/158477706cb8e0268a081b190989e706f9f6437d/test/data/decimal256.arrows -------------------------------------------------------------------------------- /test/data/decimal32.arrows: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uwdata/flechette/158477706cb8e0268a081b190989e706f9f6437d/test/data/decimal32.arrows -------------------------------------------------------------------------------- /test/data/decimal64.arrows: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uwdata/flechette/158477706cb8e0268a081b190989e706f9f6437d/test/data/decimal64.arrows -------------------------------------------------------------------------------- /test/data/empty.arrows: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uwdata/flechette/158477706cb8e0268a081b190989e706f9f6437d/test/data/empty.arrows -------------------------------------------------------------------------------- /test/data/flights.arrow: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uwdata/flechette/158477706cb8e0268a081b190989e706f9f6437d/test/data/flights.arrow -------------------------------------------------------------------------------- /test/data/flights.arrows: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uwdata/flechette/158477706cb8e0268a081b190989e706f9f6437d/test/data/flights.arrows -------------------------------------------------------------------------------- /test/data/largelistview.arrows: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uwdata/flechette/158477706cb8e0268a081b190989e706f9f6437d/test/data/largelistview.arrows -------------------------------------------------------------------------------- /test/data/listview.arrows: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uwdata/flechette/158477706cb8e0268a081b190989e706f9f6437d/test/data/listview.arrows -------------------------------------------------------------------------------- /test/data/runendencoded.arrows: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uwdata/flechette/158477706cb8e0268a081b190989e706f9f6437d/test/data/runendencoded.arrows -------------------------------------------------------------------------------- /test/data/runendencoded64.arrows: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uwdata/flechette/158477706cb8e0268a081b190989e706f9f6437d/test/data/runendencoded64.arrows 
-------------------------------------------------------------------------------- /test/data/scrabble.arrows: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uwdata/flechette/158477706cb8e0268a081b190989e706f9f6437d/test/data/scrabble.arrows -------------------------------------------------------------------------------- /test/data/utf8view.arrows: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uwdata/flechette/158477706cb8e0268a081b190989e706f9f6437d/test/data/utf8view.arrows -------------------------------------------------------------------------------- /test/decode-ipc-test.js: -------------------------------------------------------------------------------- 1 | import assert from 'node:assert'; 2 | import { readFile } from 'node:fs/promises'; 3 | import { decodeIPC } from '../src/decode/decode-ipc.js'; 4 | import { decimalDataDecoded } from './util/decimal.js'; 5 | 6 | describe('decodeIPC', () => { 7 | it('decodes arrow file format', async () => { 8 | const buffer = await readFile(`test/data/decimal.arrow`); 9 | const bytes = new Uint8Array(buffer); 10 | const expect = decimalDataDecoded(); 11 | assert.deepEqual(decodeIPC(buffer), expect, 'Node Buffer'); 12 | assert.deepStrictEqual(decodeIPC(bytes), expect, 'Uint8Array'); 13 | assert.deepStrictEqual(decodeIPC(bytes.buffer), expect, 'ArrayBuffer'); 14 | }); 15 | 16 | it('decodes arrow stream format', async () => { 17 | const buffer = await readFile(`test/data/decimal.arrows`); 18 | const bytes = new Uint8Array(buffer); 19 | const expect = decimalDataDecoded(); 20 | assert.deepEqual(decodeIPC(buffer), expect, 'Node Buffer'); 21 | assert.deepStrictEqual(decodeIPC(bytes), expect, 'Uint8Array'); 22 | assert.deepStrictEqual(decodeIPC([bytes]), expect, 'Uint8Array[]'); 23 | assert.deepStrictEqual(decodeIPC(bytes.buffer), expect, 'ArrayBuffer'); 24 | }); 25 | 26 | it('decodes arrow stream format from multiple buffers', () => { 27 | // decimal.arrows, divided into separate messages 28 | const array = [ 29 | Uint8Array.of(255,255,255,255,120,0,0,0,16,0,0,0,0,0,10,0,12,0,6,0,5,0,8,0,10,0,0,0,0,1,4,0,12,0,0,0,8,0,8,0,0,0,4,0,8,0,0,0,4,0,0,0,1,0,0,0,20,0,0,0,16,0,20,0,8,0,6,0,7,0,12,0,0,0,16,0,16,0,0,0,0,0,1,7,16,0,0,0,28,0,0,0,4,0,0,0,0,0,0,0,1,0,0,0,100,0,0,0,8,0,12,0,4,0,8,0,8,0,0,0,18,0,0,0,3,0,0,0), 30 | Uint8Array.of(255,255,255,255,136,0,0,0,20,0,0,0,0,0,0,0,12,0,22,0,6,0,5,0,8,0,12,0,12,0,0,0,0,3,4,0,24,0,0,0,48,0,0,0,0,0,0,0,0,0,10,0,24,0,12,0,4,0,8,0,10,0,0,0,60,0,0,0,16,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,48,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,232,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,224,46,0,0,0,0,0,0,0,0,0,0,0,0,0,0,208,132,0,0,0,0,0,0,0,0,0,0,0,0,0,0), 31 | Uint8Array.of(255,255,255,255,0,0,0,0) 32 | ]; 33 | const expect = decimalDataDecoded(); 34 | assert.deepStrictEqual(decodeIPC(array), expect, 'Uint8Array'); 35 | }); 36 | 37 | it('throws on invalid inputs', () => { 38 | assert.throws(() => decodeIPC('foo')); 39 | assert.throws(() => decodeIPC(['foo'])); 40 | }); 41 | }); 42 | -------------------------------------------------------------------------------- /test/duckdb-compat-test.js: -------------------------------------------------------------------------------- 1 | import assert from 'node:assert'; 2 | import { DuckDB } from '@uwdata/mosaic-duckdb'; 3 | import { tableFromArrays, tableFromIPC, tableToIPC } from '../src/index.js'; 
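// note: each generator in data.js resolves to an array of { values, bytes, nullCount }
// fixtures, where `bytes` holds Arrow IPC data from DuckDB queries or test data files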
4 | import * as dataMethods from './util/data.js'; 5 | 6 | // Arrow types not supported by DuckDB 7 | const skip = new Set([ 8 | 'binaryView', 'empty', 'largeListView', 'listView', 9 | 'runEndEncoded32', 'runEndEncoded64', 'utf8View', 10 | 'decimal32', 'decimal64', 'decimal128', 'decimal256' 11 | ]); 12 | 13 | describe('DuckDB compatibility', () => { 14 | for (const [name, method] of Object.entries(dataMethods)) { 15 | if (skip.has(name)) continue; 16 | it(`includes ${name} data`, async () => { 17 | const data = await method(); 18 | const load = await Promise.all( 19 | data.map(({ bytes }) => loadIPC(tableFromIPC(bytes))) 20 | ); 21 | assert.deepStrictEqual(load, Array(data.length).fill(true)); 22 | }); 23 | } 24 | 25 | it('includes default dictionary types', async () => { 26 | const t = tableFromArrays({ foo: ['x', 'y', 'z'] }); 27 | assert.strictEqual(await loadIPC(t), true); 28 | }); 29 | }); 30 | 31 | function loadIPC(table) { 32 | const bytes = tableToIPC(table, { format: 'stream' }); 33 | return new Promise((resolve) => { 34 | const db = new DuckDB(); 35 | db.db.register_buffer('arrow_ipc', [bytes], true, (err) => { 36 | if (err) { 37 | console.error(err); 38 | resolve(false); 39 | } else { 40 | resolve(true); 41 | } 42 | }); 43 | }); 44 | } 45 | -------------------------------------------------------------------------------- /test/encode-ipc-test.js: -------------------------------------------------------------------------------- 1 | import assert from 'node:assert'; 2 | import { tableFromIPC as arrowJSTableFromIPC } from 'apache-arrow'; 3 | import { decodeIPC } from '../src/decode/decode-ipc.js'; 4 | import { encodeIPC } from '../src/encode/encode-ipc.js'; 5 | import { MAGIC } from '../src/constants.js'; 6 | import { decimalDataDecoded, decimalDataToEncode } from './util/decimal.js'; 7 | 8 | function arrowJSCheck(input, bytes) { 9 | // cross-check against arrow-js 10 | const arrowJS = arrowJSTableFromIPC(bytes); 11 | assert.strictEqual(arrowJS.numRows, 3); 12 | assert.strictEqual(arrowJS.numCols, 1); 13 | const arrowCol = arrowJS.getChildAt(0); 14 | const arrowBuf = arrowCol.data[0].values; 15 | assert.strictEqual(arrowCol.type.typeId, 7); 16 | assert.deepStrictEqual( 17 | new Uint8Array( 18 | arrowBuf.buffer, 19 | arrowBuf.byteOffset, 20 | arrowBuf.length * arrowBuf.BYTES_PER_ELEMENT 21 | ), 22 | input.records[0].buffers[0] 23 | ); 24 | } 25 | 26 | describe('encodeIPC', () => { 27 | it('encodes arrow file format', () => { 28 | const input = decimalDataToEncode(); 29 | const expect = decimalDataDecoded(); 30 | const bytes = encodeIPC(input, { format: 'file' }).finish(); 31 | assert.deepStrictEqual(bytes.subarray(0, 6), MAGIC, 'start ARROW1 magic string'); 32 | assert.deepStrictEqual(bytes.slice(-6), MAGIC, 'end ARROW1 magic string'); 33 | assert.deepStrictEqual(decodeIPC(bytes), expect, 'Uint8Array'); 34 | arrowJSCheck(input, bytes); 35 | }); 36 | 37 | it('encodes arrow stream format', () => { 38 | const input = decimalDataToEncode(); 39 | const expect = decimalDataDecoded(); 40 | const bytes = encodeIPC(input, { format: 'stream' }).finish(); 41 | assert.deepStrictEqual(decodeIPC(bytes), expect, 'Uint8Array'); 42 | arrowJSCheck(input, bytes); 43 | }); 44 | }); 45 | -------------------------------------------------------------------------------- /test/infer-type-test.js: -------------------------------------------------------------------------------- 1 | import assert from 'node:assert'; 2 | import { bool, dateDay, dictionary, fixedSizeList, float64, int16, int32, int64, int8, list, 
nullType, struct, timestamp, utf8 } from '../src/index.js'; 3 | import { inferType } from '../src/build/infer-type.js'; 4 | 5 | function matches(actual, expect) { 6 | assert.deepStrictEqual(actual, expect); 7 | } 8 | 9 | function infer(values) { 10 | return inferType(visitor => values.forEach(visitor)); 11 | } 12 | 13 | describe('inferType', () => { 14 | it('infers null types', () => { 15 | matches(infer([null, null, null]), nullType()); 16 | matches(infer([undefined, undefined]), nullType()); 17 | matches(infer([null, undefined]), nullType()); 18 | matches(infer([]), nullType()); 19 | }); 20 | 21 | it('infers integer types', () => { 22 | matches(infer([1, 2, 3]), int8()); 23 | matches(infer([1e3, 2e3, 3e3]), int16()); 24 | matches(infer([1e6, 2e6, 3e6]), int32()); 25 | matches(infer([1n, 2n, 3n]), int64()); 26 | 27 | matches(infer([-1, 2, 3]), int8()); 28 | matches(infer([-1e3, 2e3, 3e3]), int16()); 29 | matches(infer([-1e6, 2e6, 3e6]), int32()); 30 | matches(infer([-1n, 2n, 3n]), int64()); 31 | 32 | matches(infer([1, 2, null, undefined, 3]), int8()); 33 | matches(infer([1e3, 2e3, null, undefined, 3e3]), int16()); 34 | matches(infer([1e6, 2e6, null, undefined, 3e6]), int32()); 35 | matches(infer([1n, 2n, null, undefined, 3n]), int64()); 36 | }); 37 | 38 | it('infers float types', () => { 39 | matches(infer([1.1, 2.2, 3.3]), float64()); 40 | matches(infer([-1.1, 2.2, 3.3]), float64()); 41 | matches(infer([1, 2, 3.3]), float64()); 42 | matches(infer([1, 2, NaN]), float64()); 43 | matches(infer([NaN, null, undefined, NaN]), float64()); 44 | matches(infer([Number.MIN_SAFE_INTEGER, Number.MAX_SAFE_INTEGER]), float64()); 45 | }); 46 | 47 | it('infers utf8 dictionary types', () => { 48 | const type = dictionary(utf8(), int32()); 49 | matches(infer(['foo', 'bar', 'baz']), type); 50 | matches(infer(['foo', 'bar', null, undefined, 'baz']), type); 51 | }); 52 | 53 | it('infers bool types', () => { 54 | matches(infer([true, false, true]), bool()); 55 | matches(infer([true, false, null, undefined, true]), bool()); 56 | }); 57 | 58 | it('infers date day types', () => { 59 | matches(infer([ 60 | new Date(Date.UTC(2000, 1, 2)), 61 | new Date(Date.UTC(2006, 3, 20)), 62 | null, 63 | undefined 64 | ]), dateDay()); 65 | }); 66 | 67 | it('infers timestamp types', () => { 68 | matches( 69 | infer([ 70 | new Date(Date.UTC(2000, 1, 2)), 71 | new Date(Date.UTC(2006, 3, 20)), 72 | null, 73 | undefined, 74 | new Date(1990, 3, 12, 5, 37) 75 | ]), 76 | timestamp() 77 | ); 78 | }); 79 | 80 | it('infers list types', () => { 81 | matches( 82 | infer([[1, 2], [3, 4], [5]]), 83 | list(int8()) 84 | ); 85 | matches( 86 | infer([[true, null, false], null, undefined, [false, undefined]]), 87 | list(bool()) 88 | ); 89 | matches( 90 | infer([['foo', 'bar', null], null, ['bar', 'baz']]), 91 | list(dictionary(utf8(), int32())) 92 | ); 93 | }); 94 | 95 | it('infers fixed size list types', () => { 96 | matches( 97 | infer([[1, 2], [3, 4]]), 98 | fixedSizeList(int8(), 2) 99 | ); 100 | matches( 101 | infer([[true, null, false], null, undefined, [false, true, true]]), 102 | fixedSizeList(bool(), 3) 103 | ); 104 | matches( 105 | infer([['foo', 'bar'], null, ['bar', 'baz']]), 106 | fixedSizeList(dictionary(utf8(), int32()), 2) 107 | ); 108 | }); 109 | 110 | it('infers struct types', () => { 111 | matches( 112 | infer([ 113 | { foo: 1, bar: [1.1, 2.2] }, 114 | { foo: null, bar: [2.2, null, 3.3] }, 115 | null, 116 | undefined, 117 | { foo: 2, bar: null }, 118 | ]), 119 | struct({ foo: int8(), bar: list(float64()) }) 120 | ); 121 | }); 122 
| 123 | it('throws on bigints that exceed 64 bits', () => { 124 | assert.throws(() => infer([(1n << 200n)])); 125 | }); 126 | 127 | it('throws on mixed types', () => { 128 | assert.throws(() => infer([1, true, 'foo'])); 129 | }); 130 | }); 131 | -------------------------------------------------------------------------------- /test/table-from-arrays-test.js: -------------------------------------------------------------------------------- 1 | import assert from 'node:assert'; 2 | import { float64, int8, int32, bool, dictionary, tableFromArrays, utf8, float32, nullType } from '../src/index.js'; 3 | 4 | describe('tableFromArrays', () => { 5 | const values = { 6 | foo: [1, 2, 3, 4, 5], 7 | bar: [1.3, NaN, 1e27, Math.PI, Math.E].map(v => Math.fround(v)), 8 | baz: [true, false, null, false, true], 9 | bop: ['foo', 'bar', 'baz', 'bop', 'bip'] 10 | }; 11 | 12 | const types = { 13 | foo: int8(), 14 | bar: float64(), 15 | baz: bool(), 16 | bop: dictionary(utf8(), int32()) 17 | }; 18 | 19 | function check(table, colTypes = types) { 20 | const { fields } = table.schema; 21 | assert.strictEqual(table.numRows, 5); 22 | assert.strictEqual(table.numCols, 4); 23 | table.children.forEach((c, i) => { 24 | const { name } = fields[i]; 25 | assert.deepStrictEqual(c.type, colTypes[name]); 26 | assert.deepStrictEqual(fields[i].type, colTypes[name]); 27 | assert.deepStrictEqual([...c], values[name]); 28 | }); 29 | return table; 30 | } 31 | 32 | it('creates table from provided types', () => { 33 | // with types that match type inference results 34 | check(tableFromArrays(values, { types })); 35 | check(tableFromArrays(Object.entries(values), { types })); 36 | 37 | // with types that do not match type inference results 38 | const opt = { types: { ...types, foo: int32(), bar: float32() } }; 39 | check(tableFromArrays(values, opt), opt.types); 40 | check(tableFromArrays({ 41 | ...values, 42 | foo: Int16Array.from(values.foo), 43 | bar: Float64Array.from(values.bar) 44 | }, opt), opt.types); 45 | }); 46 | 47 | it('creates table from inferred types', () => { 48 | check(tableFromArrays(values)); 49 | check(tableFromArrays(Object.entries(values))); 50 | 51 | // infer from typed arrays 52 | check(tableFromArrays({ 53 | ...values, 54 | foo: Int8Array.from(values.foo), 55 | bar: Float64Array.from(values.bar) 56 | })); 57 | }); 58 | 59 | it('creates empty table', () => { 60 | const withoutCols = tableFromArrays({}); 61 | assert.strictEqual(withoutCols.numRows, 0); 62 | assert.strictEqual(withoutCols.numCols, 0); 63 | assert.deepStrictEqual(withoutCols.schema.fields, []); 64 | 65 | const withCols = tableFromArrays({ foo: [], bar: [] }); 66 | assert.strictEqual(withCols.numRows, 0); 67 | assert.strictEqual(withCols.numCols, 2); 68 | assert.deepStrictEqual( 69 | withCols.schema.fields.map(f => f.type), 70 | [ nullType(), nullType() ] 71 | ); 72 | 73 | const withTypes = tableFromArrays( 74 | { foo: [], bar: [] }, 75 | { types: { foo: int32(), bar: float32() }} 76 | ); 77 | assert.strictEqual(withTypes.numRows, 0); 78 | assert.strictEqual(withTypes.numCols, 2); 79 | assert.deepStrictEqual( 80 | withTypes.schema.fields.map(f => f.type), 81 | [ int32(), float32() ] 82 | ); 83 | }); 84 | 85 | it('throws when array lengths differ', () => { 86 | assert.throws(() => tableFromArrays({ foo: [1, 2, 3], bar: [1, 2] })); 87 | }); 88 | }); 89 | -------------------------------------------------------------------------------- /test/table-from-ipc-test.js: -------------------------------------------------------------------------------- 1 | import 
assert from 'node:assert'; 2 | import { tableFromIPC } from '../src/index.js'; 3 | import { arrowFromDuckDB } from './util/arrow-from-duckdb.js'; 4 | import { binaryView, bool, dateDay, decimal, decimal32, decimal128, decimal256, decimal64, empty, fixedListInt32, fixedListUtf8, float32, float64, int16, int32, int64, int8, intervalMonthDayNano, largeListView, listInt32, listUtf8, listView, map, runEndEncoded32, runEndEncoded64, struct, timestampMicrosecond, timestampMillisecond, timestampNanosecond, timestampSecond, uint16, uint32, uint64, uint8, union, utf8, utf8View } from './util/data.js'; 5 | import { RowIndex } from '../src/util/struct.js'; 6 | 7 | const toBigInt = v => BigInt(v); 8 | const toDate = v => new Date(v); 9 | const toFloat32 = v => Math.fround(v); 10 | const toDecimalInt = v => Math.round(v * 100); 11 | const toDecimalBigInt = v => BigInt(toDecimalInt(v)); 12 | 13 | async function test(dataMethod, arrayType, opt, transform) { 14 | const data = await dataMethod(); 15 | for (const { bytes, values, nullCount } of data) { 16 | valueTest(bytes, values, nullCount ? Array : arrayType, opt, transform); 17 | } 18 | } 19 | 20 | function valueTest(bytes, values, arrayType, opt = undefined, transform = undefined, name = 'value') { 21 | const array = transform 22 | ? values.map((v, i) => v == null ? v : transform(v, i)) 23 | : Array.from(values); 24 | const column = tableFromIPC(bytes, opt).getChild(name); 25 | compare(column, array, arrayType); 26 | return column; 27 | } 28 | 29 | function compare(column, array, arrayType = Array) { 30 | // test values extracted using toArray 31 | const data = column.toArray(); 32 | assert.ok(data instanceof arrayType, 'toArray type check'); 33 | assert.deepStrictEqual(data, arrayType.from(array), 'toArray equality'); 34 | 35 | // test values extracted using column iterator 36 | assert.deepStrictEqual([...column], array, 'iterator equality'); 37 | 38 | // test values extracted using column at() method 39 | const extract = Array.from(array, (_, i) => column.at(i)); 40 | assert.deepStrictEqual(extract, array, 'at equality'); 41 | } 42 | 43 | describe('tableFromIPC', () => { 44 | it('throws when coercing unsafe int64 values', async () => { 45 | const values = [ 46 | BigInt(Number.MAX_SAFE_INTEGER) - 1n, 47 | BigInt(Number.MAX_SAFE_INTEGER) + 1n 48 | ]; 49 | const bytes = await arrowFromDuckDB(values, 'BIGINT'); 50 | 51 | // coerced to numbers 52 | assert.throws(() => tableFromIPC(bytes).getChild('value').toArray()); 53 | 54 | // as bigints 55 | valueTest(bytes, values, BigInt64Array, { useBigInt: true }); 56 | }); 57 | 58 | it('decodes uint8 data', () => test(uint8, Uint8Array)); 59 | it('decodes uint16 data', () => test(uint16, Uint16Array)); 60 | it('decodes uint32 data', () => test(uint32, Uint32Array)); 61 | it('decodes uint64 data', () => test(uint64, Float64Array)); 62 | it('decodes uint64 data to bigint', () => test(uint64, BigUint64Array, { useBigInt: true }, toBigInt)); 63 | 64 | it('decodes int8 data', () => test(int8, Int8Array)); 65 | it('decodes int16 data', () => test(int16, Int16Array)); 66 | it('decodes int32 data', () => test(int32, Int32Array)); 67 | it('decodes int64 data', () => test(int64, Float64Array)); 68 | it('decodes int64 data to bigint', () => test(int64, BigInt64Array, { useBigInt: true }, toBigInt)); 69 | 70 | it('decodes float32 data', () => test(float32, Float32Array, {}, toFloat32)); 71 | it('decodes float64 data', () => test(float64, Float64Array)); 72 | 73 | it('decodes utf8 data', () => test(utf8)); 74 | 75 | 
it('decodes boolean data', () => test(bool)); 76 | 77 | it('decodes decimal data', () => test(decimal, Float64Array)); 78 | it('decodes decimal32 data', () => test(decimal32, Float64Array)); 79 | it('decodes decimal64 data', () => test(decimal64, Float64Array)); 80 | it('decodes decimal128 data', () => test(decimal128, Float64Array)); 81 | it('decodes decimal256 data', () => test(decimal256, Float64Array)); 82 | it('decodes decimal32 data to int', () => test(decimal32, Int32Array, { useDecimalInt: true }, toDecimalInt)); 83 | it('decodes decimal64 data to bigint', () => test(decimal64, Array, { useDecimalInt: true }, toDecimalBigInt)); 84 | it('decodes decimal128 data to bigint', () => test(decimal128, Array, { useDecimalInt: true }, toDecimalBigInt)); 85 | it('decodes decimal256 data to bigint', () => test(decimal256, Array, { useDecimalInt: true }, toDecimalBigInt)); 86 | 87 | it('decodes date day data', () => test(dateDay, Float64Array)); 88 | it('decodes date day data to dates', () => test(dateDay, Array, { useDate: true }, toDate)); 89 | 90 | it('decodes timestamp nanosecond data', () => test(timestampNanosecond, Float64Array)); 91 | it('decodes timestamp microsecond data', () => test(timestampMicrosecond, Float64Array)); 92 | it('decodes timestamp millisecond data', () => test(timestampMillisecond, Float64Array)); 93 | it('decodes timestamp second data', () => test(timestampSecond, Float64Array)); 94 | it('decodes timestamp nanosecond data to dates', () => test(timestampNanosecond, Array, { useDate: true }, toDate)); 95 | it('decodes timestamp microsecond data to dates', () => test(timestampMicrosecond, Array, { useDate: true }, toDate)); 96 | it('decodes timestamp millisecond data to dates', () => test(timestampMillisecond, Array, { useDate: true }, toDate)); 97 | it('decodes timestamp second data to dates', () => test(timestampSecond, Array, { useDate: true }, toDate)); 98 | 99 | it('decodes interval month/day/nano data', () => test(intervalMonthDayNano)); 100 | 101 | it('decodes list int32 data', () => test(listInt32)); 102 | it('decodes list utf8 data', () => test(listUtf8)); 103 | 104 | it('decodes fixed list int32 data', () => test(fixedListInt32)); 105 | it('decodes fixed list utf8 data', () => test(fixedListUtf8)); 106 | 107 | it('decodes list view data', () => test(listView)); 108 | it('decodes large list view data', () => test(largeListView)); 109 | 110 | it('decodes union data', () => test(union)); 111 | 112 | it('decodes map data', () => test(map, Array, {}, v => Array.from(v.entries()))); 113 | it('decodes map data to maps', () => test(map, Array, { useMap: true })); 114 | 115 | it('decodes struct data', () => test(struct)); 116 | it('decodes struct data with useProxy', async () => { 117 | const data = await struct(); 118 | for (const { bytes, values } of data) { 119 | const column = tableFromIPC(bytes, { useProxy: true }).getChildAt(0); 120 | const proxies = column.toArray(); 121 | assert.strictEqual(proxies.every(p => p === null || p[RowIndex] >= 0), true); 122 | assert.deepStrictEqual(proxies.map(p => p ? 
p.toJSON() : null), values); 123 | } 124 | }); 125 | 126 | it('decodes run-end-encoded data with 32-bit run ends', async () => { 127 | const data = await runEndEncoded32(); 128 | for (const { bytes, runs, values } of data) { 129 | const column = valueTest(bytes, values); 130 | const ree = column.data[0].children; 131 | assert.deepStrictEqual([...ree[0]], runs.counts); 132 | assert.deepStrictEqual([...ree[1]], runs.values); 133 | } 134 | }); 135 | it('decodes run-end-encoded data with 64-bit run ends', async () => { 136 | const data = await runEndEncoded64(); 137 | for (const { bytes, runs, values } of data) { 138 | const column = valueTest(bytes, values); 139 | const ree = column.data[0].children; 140 | assert.deepStrictEqual([...ree[0]], runs.counts); 141 | assert.deepStrictEqual([...ree[1]], runs.values); 142 | } 143 | }); 144 | 145 | it('decodes binary view data', async () => { 146 | const data = await binaryView(); 147 | for (const { bytes, values: { flat, spill } } of data) { 148 | valueTest(bytes, flat, Array, {}, null, 'flat'); 149 | valueTest(bytes, spill, Array, {}, null, 'spill'); 150 | } 151 | }); 152 | 153 | it('decodes utf8 view data', async () => { 154 | const data = await utf8View(); 155 | for (const { bytes, values: { flat, spill } } of data) { 156 | valueTest(bytes, flat, Array, {}, null, 'flat'); 157 | valueTest(bytes, spill, Array, {}, null, 'spill'); 158 | } 159 | }); 160 | 161 | it('decodes empty data', async () => { 162 | const data = await empty(); 163 | for (const { bytes } of data) { 164 | const table = tableFromIPC(bytes); 165 | table.schema.fields.map((f, i) => { 166 | assert.deepStrictEqual(table.getChildAt(i).type, f.type); 167 | }); 168 | assert.strictEqual(table.numRows, 0); 169 | assert.strictEqual(table.numCols, table.schema.fields.length); 170 | assert.deepStrictEqual(table.toArray(), []); 171 | assert.deepStrictEqual([...table], []); 172 | } 173 | }); 174 | }); 175 | -------------------------------------------------------------------------------- /test/table-test.js: -------------------------------------------------------------------------------- 1 | import assert from 'node:assert'; 2 | import { arrowFromDuckDB } from './util/arrow-from-duckdb.js'; 3 | import { tableFromIPC } from '../src/index.js'; 4 | import { Table } from '../src/table.js'; 5 | 6 | const values = [ 7 | {a: 1, b: 'foo', c: [1, null, 3] }, 8 | null, 9 | {a: 2, b: 'baz', c: [null, 5, 6] } 10 | ]; 11 | 12 | const table = tableFromIPC(await arrowFromDuckDB(values)); 13 | 14 | describe('Table', () => { 15 | it('provides row count', () => { 16 | assert.deepStrictEqual(table.numRows, 3); 17 | }); 18 | 19 | it('provides column count', () => { 20 | assert.deepStrictEqual(table.numCols, 1); 21 | }); 22 | 23 | it('provides child column accessors', () => { 24 | const col = table.getChild('value'); 25 | assert.strictEqual(col, table.getChildAt(0)); 26 | assert.deepStrictEqual(col.toArray(), values); 27 | }); 28 | 29 | it('provides object array', () => { 30 | assert.deepStrictEqual(table.toArray(), values.map(value => ({ value }))); 31 | }); 32 | 33 | it('provides column array map', () => { 34 | assert.deepStrictEqual(table.toColumns(), { value: values }); 35 | }); 36 | 37 | it('provides random access via at/get', () => { 38 | const idx = [0, 1, 2]; 39 | 40 | // table object random access 41 | const obj = values.map(value => ({ value })); 42 | assert.deepStrictEqual(idx.map(i => table.at(i)), obj); 43 | assert.deepStrictEqual(idx.map(i => table.get(i)), obj); 44 | 45 | // column value random access 
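// at() and get() are expected to yield identical results for each row index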
46 | const col = table.getChildAt(0); 47 | assert.deepStrictEqual(idx.map(i => col.at(i)), values); 48 | assert.deepStrictEqual(idx.map(i => col.get(i)), values); 49 | }); 50 | 51 | it('provides select by index', async () => { 52 | const sel = table.selectAt([0, 0]); 53 | const col = table.getChild('value'); 54 | assert.strictEqual(sel.schema.fields.length, 2); 55 | assert.strictEqual(sel.getChildAt(0), col); 56 | assert.strictEqual(sel.getChildAt(1), col); 57 | }); 58 | 59 | it('provides select by index with rename', async () => { 60 | const sel = table.selectAt([0, 0], ['foo', 'bar']); 61 | const col = table.getChild('value'); 62 | assert.strictEqual(sel.schema.fields.length, 2); 63 | assert.strictEqual(sel.getChildAt(0), col); 64 | assert.strictEqual(sel.getChildAt(1), col); 65 | assert.strictEqual(sel.getChild('foo'), col); 66 | assert.strictEqual(sel.getChild('bar'), col); 67 | }); 68 | 69 | it('provides select by name', async () => { 70 | const sel = table.select(['value', 'value']); 71 | const col = table.getChild('value'); 72 | assert.strictEqual(sel.schema.fields.length, 2); 73 | assert.strictEqual(sel.getChildAt(0), col); 74 | assert.strictEqual(sel.getChildAt(1), col); 75 | }); 76 | 77 | it('provides select by name with rename', async () => { 78 | const sel = table.select(['value', 'value'], ['foo', 'bar']); 79 | const col = table.getChild('value'); 80 | assert.strictEqual(sel.schema.fields.length, 2); 81 | assert.strictEqual(sel.getChildAt(0), col); 82 | assert.strictEqual(sel.getChildAt(1), col); 83 | assert.strictEqual(sel.getChild('foo'), col); 84 | assert.strictEqual(sel.getChild('bar'), col); 85 | }); 86 | 87 | it('handles empty table with no schema', async () => { 88 | const test = (table) => { 89 | assert.strictEqual(table.numRows, 0); 90 | assert.strictEqual(table.numCols, 0); 91 | assert.deepStrictEqual(table.toColumns(), {}); 92 | assert.deepStrictEqual(table.toArray(), []); 93 | assert.deepStrictEqual([...table], []); 94 | } 95 | test(new Table({ fields: [] }, [])); 96 | test(new Table({ fields: [] }, []).select([])); 97 | test(new Table({ fields: [] }, []).selectAt([])); 98 | }); 99 | 100 | it('handles empty table with schema', async () => { 101 | const fields = [ 102 | { name: 'foo', type: { typeId: 2, bitWidth: 32, signed: true } }, 103 | { name: 'bar', type: { typeId: 5 } } 104 | ]; 105 | const test = (table) => { 106 | assert.strictEqual(table.numRows, 0); 107 | assert.strictEqual(table.numCols, 2); 108 | assert.deepStrictEqual(table.toColumns(), { foo: [], bar: [] }); 109 | assert.deepStrictEqual(table.toArray(), []); 110 | assert.deepStrictEqual([...table], []); 111 | } 112 | test(new Table({ fields }, [])); 113 | test(new Table({ fields }, []).select(['foo', 'bar'])); 114 | test(new Table({ fields }, []).selectAt([0, 1])); 115 | }); 116 | 117 | it('is not concat spreadable', () => { 118 | assert.ok(!table[Symbol.isConcatSpreadable]); 119 | assert.deepStrictEqual([].concat(table), [table]); 120 | }); 121 | }); 122 | -------------------------------------------------------------------------------- /test/table-to-ipc-test.js: -------------------------------------------------------------------------------- 1 | import assert from 'node:assert'; 2 | import { readFile } from 'node:fs/promises'; 3 | import { Version, columnFromArray, tableFromColumns, tableFromIPC, tableToIPC } from '../src/index.js'; 4 | import * as dataMethods from './util/data.js'; 5 | 6 | const files = [ 7 | 'flights.arrows', 8 | 'scrabble.arrows', 9 | 'convert.arrows', 10 | 'decimal.arrows' 11 | 
]; 12 | 13 | describe('tableToIPC', () => { 14 | for (const [name, method] of Object.entries(dataMethods)) { 15 | it(`encodes ${name} data`, async () => { 16 | const data = await method(); 17 | data.forEach(({ bytes }) => testEncode(bytes, 'stream')); 18 | data.forEach(({ bytes }) => testEncode(bytes, 'file')); 19 | }); 20 | } 21 | 22 | for (const file of files) { 23 | it(`encodes ${file}`, async () => { 24 | const bytes = new Uint8Array(await readFile(`test/data/${file}`)); 25 | testEncode(bytes, 'stream'); 26 | testEncode(bytes, 'file'); 27 | }); 28 | } 29 | 30 | it('throws on inconsistent batch sizes', () => { 31 | const a = columnFromArray([1, 2, 3, 4, 5], null); 32 | const b = columnFromArray([1, 2, 3, 4, 5], null, { maxBatchRows: 2 }); 33 | assert.throws(() => tableToIPC(tableFromColumns({ a, b }))); 34 | assert.throws(() => tableToIPC(tableFromColumns({ b, a }))); 35 | }); 36 | }); 37 | 38 | function testEncode(bytes, format = 'stream') { 39 | // load table 40 | const table = tableFromIPC(bytes); 41 | 42 | // ensure complete schema, override version 43 | const schema = { 44 | endianness: 0, 45 | metadata: null, 46 | ...table.schema, 47 | version: Version.V5 48 | }; 49 | 50 | // encode table to ipc bytes 51 | const ipc = tableToIPC(table, { format }); 52 | 53 | // parse ipc bytes to get a "round-trip" table 54 | const round = tableFromIPC(ipc); 55 | 56 | // check schema and shape equality 57 | assert.deepStrictEqual(round.schema, schema); 58 | assert.strictEqual(round.numRows, table.numRows); 59 | assert.strictEqual(round.numCols, table.numCols); 60 | 61 | // check extracted value equality 62 | for (let i = 0; i < table.numCols; ++i) { 63 | assert.deepStrictEqual( 64 | round.getChildAt(i).toArray(), 65 | table.getChildAt(i).toArray() 66 | ); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /test/util/arrow-from-duckdb.js: -------------------------------------------------------------------------------- 1 | import { DuckDB } from '@uwdata/mosaic-duckdb'; 2 | 3 | const db = new DuckDB(); 4 | 5 | export async function arrowQuery(sql, cleanup) { 6 | const ipc = await db.arrowBuffer(sql); 7 | if (cleanup) await db.exec(cleanup); 8 | return ipc; 9 | } 10 | 11 | export function arrowFromDuckDB(values, type) { 12 | const sql = values 13 | .map(stringify) 14 | .map(v => `SELECT ${v}${type ? 
`::${type}` : ''} AS value`).join(' UNION ALL '); 15 | return arrowQuery(sql); 16 | } 17 | 18 | function stringify(value) { 19 | switch (typeof value) { 20 | case 'string': return `'${value}'`; 21 | case 'object': 22 | if (value == null) { 23 | return 'NULL' 24 | } else if (Array.isArray(value)) { 25 | return `[${value.map(stringify).join(', ')}]`; 26 | } else if (value instanceof Date) { 27 | return value.toISOString(); 28 | } else if (value instanceof Map) { 29 | return `MAP ${stringifyObject(Array.from(value.entries()))}`; 30 | } else { 31 | return stringifyObject(Object.entries(value)); 32 | } 33 | default: return `${value}`; 34 | } 35 | } 36 | 37 | function stringifyObject(entries) { 38 | const props = entries.map(([key, value]) => `'${key}': ${stringify(value)}`); 39 | return `{ ${props.join(', ')} }`; 40 | } 41 | -------------------------------------------------------------------------------- /test/util/data.js: -------------------------------------------------------------------------------- 1 | import { readFile } from 'node:fs/promises'; 2 | import { arrowFromDuckDB } from './arrow-from-duckdb.js'; 3 | 4 | const toTimestamp = (v, off = 0) => v == null ? null : (+new Date(v) + off); 5 | const toInt32s = v => v == null ? null : v.some(x => x == null) ? v : Int32Array.of(...v); 6 | 7 | async function dataQuery(data, type, jsValues) { 8 | return Promise.all(data.map(async (array, i) => { 9 | const values = jsValues?.[i] ?? array; 10 | return { 11 | values, 12 | bytes: await arrowFromDuckDB(array, type), 13 | nullCount: values.reduce((nc, v) => v == null ? ++nc : nc, 0) 14 | }; 15 | })); 16 | } 17 | 18 | export function bool() { 19 | return dataQuery([ 20 | [true, false, true], 21 | [true, false, null] 22 | ], 'BOOLEAN'); 23 | } 24 | 25 | export function uint8() { 26 | return dataQuery([ 27 | [1, 2, 3], 28 | [1, null, 3] 29 | ], 'UTINYINT'); 30 | } 31 | 32 | export function uint16() { 33 | return dataQuery([ 34 | [1, 2, 3], 35 | [1, null, 3] 36 | ], 'USMALLINT'); 37 | } 38 | 39 | export function uint32() { 40 | return dataQuery([ 41 | [1, 2, 3], 42 | [1, null, 3] 43 | ], 'UINTEGER'); 44 | } 45 | 46 | export function uint64() { 47 | return dataQuery([ 48 | [1, 2, 3], 49 | [1, null, 3] 50 | ], 'UBIGINT'); 51 | } 52 | 53 | export function int8() { 54 | return dataQuery([ 55 | [1, 2, 3], 56 | [1, null, 3] 57 | ], 'TINYINT'); 58 | } 59 | 60 | export function int16() { 61 | return dataQuery([ 62 | [1, 2, 3], 63 | [1, null, 3] 64 | ], 'SMALLINT'); 65 | } 66 | 67 | export function int32() { 68 | return dataQuery([ 69 | [1, 2, 3], 70 | [1, null, 3] 71 | ], 'INTEGER'); 72 | } 73 | 74 | export function int64() { 75 | return dataQuery([ 76 | [1, 2, 3], 77 | [1, null, 3] 78 | ], 'BIGINT'); 79 | } 80 | 81 | export function float32() { 82 | return dataQuery([ 83 | [1.1, 2.2, 3.3], 84 | [1.1, null, 3.3] 85 | ], 'FLOAT'); 86 | } 87 | 88 | export function float64() { 89 | return dataQuery([ 90 | [1.1, 2.2, 3.3], 91 | [1.1, null, 3.3] 92 | ], 'DOUBLE'); 93 | } 94 | 95 | export function decimal() { 96 | return dataQuery([ 97 | [1.212, 3.443, 5.600], 98 | [1.212, null, 5.600] 99 | ], 'DECIMAL(18,3)'); 100 | } 101 | 102 | async function loadDecimal(bitWidth) { 103 | const bytes = new Uint8Array(await readFile(`test/data/decimal${bitWidth}.arrows`)); 104 | return [{ 105 | values: [123.45, 0, -123.45], 106 | bytes, 107 | nullCount: 0 108 | }]; 109 | } 110 | 111 | export function decimal32() { return loadDecimal(32); } 112 | export function decimal64() { return loadDecimal(64); } 113 | export function 
decimal128() { return loadDecimal(128); } 114 | export function decimal256() { return loadDecimal(256); } 115 | 116 | export function dateDay() { 117 | const data = [ 118 | ['2001-01-01', '2004-02-03', '2006-12-31'], 119 | ['2001-01-01', null, '2006-12-31'] 120 | ]; 121 | const vals = data.map(v => v.map(d => toTimestamp(d))); 122 | return dataQuery(data, 'DATE', vals); 123 | } 124 | 125 | export function timestampNanosecond() { 126 | const ns = [0.4568, 0.7382]; // DuckDB truncates here 127 | const ts = ['1992-09-20T11:30:00.123456789Z', '2002-12-13T07:28:56.564738209Z']; 128 | const data = [ts, ts.concat(null)]; 129 | const vals = data.map(v => v.map((d, i) => toTimestamp(d, ns[i]))); 130 | return dataQuery(data, 'TIMESTAMP_NS', vals); 131 | } 132 | 133 | export function timestampMicrosecond() { 134 | const us = [0.457, 0.738]; // DuckDB rounds here 135 | const ts = ['1992-09-20T11:30:00.123457Z', '2002-12-13T07:28:56.564738Z']; 136 | const data = [ts, ts.concat(null)]; 137 | const vals = data.map(v => v.map((d, i) => toTimestamp(d, us[i]))); 138 | return dataQuery(data, 'TIMESTAMP', vals); 139 | } 140 | 141 | export function timestampMillisecond() { 142 | const ts = ['1992-09-20T11:30:00.123Z', '2002-12-13T07:28:56.565Z']; 143 | const data = [ts, ts.concat(null)]; 144 | const vals = data.map(v => v.map(d => toTimestamp(d))); 145 | return dataQuery(data, 'TIMESTAMP_MS', vals); 146 | } 147 | 148 | export function timestampSecond() { 149 | const ts = ['1992-09-20T11:30:00Z', '2002-12-13T07:28:57Z']; 150 | const data = [ts, ts.concat(null)]; 151 | const vals = data.map(v => v.map(d => toTimestamp(d))); 152 | return dataQuery(data, 'TIMESTAMP_S', vals); 153 | } 154 | 155 | export function intervalMonthDayNano() { 156 | return dataQuery([ 157 | ['2 years', null, '12 years 2 month 1 day 5 seconds', '1 microsecond'] 158 | ], 'INTERVAL', [[ 159 | Float64Array.of(24, 0, 0), 160 | null, 161 | Float64Array.of(146, 1, 5000000000), 162 | Float64Array.of(0, 0, 1000) 163 | ]]); 164 | } 165 | 166 | export function utf8() { 167 | return dataQuery([ 168 | ['foo', 'bar', 'baz'], 169 | ['foo', null, 'baz'] 170 | ], 'VARCHAR'); 171 | } 172 | 173 | export function listInt32() { 174 | const data = [ 175 | [[1, 2, 3, 4], [5, 6], [7, 8, 9]], 176 | [[1, 2, 3, 4], null, [7, 8, 9]], 177 | [[1, 2, null, 4], [5, null, 6], [7, null, 9]] 178 | ]; 179 | const vals = data.map(v => v.map(toInt32s)); 180 | return dataQuery(data, 'INTEGER[]', vals); 181 | } 182 | 183 | export function listUtf8() { 184 | return dataQuery([ 185 | [['a', 'b', 'c', 'd'], ['e', 'f'], ['g', 'h', 'i']], 186 | [['a', 'b', 'c', 'd'], null, ['g', 'h', 'i']], 187 | [['a', 'b', null, 'd'], ['e', null, 'f'], ['g', null, 'i']] 188 | ], 'VARCHAR[]'); 189 | } 190 | 191 | export function fixedListInt32() { 192 | const data = [ 193 | [[1, 2, 3], [4, 5, 6], [7, 8, 9]], 194 | [[1, 2, 3], null, [7, 8, 9]], 195 | [[1, null, 3], [null, 5, 6], [7, 8, null]] 196 | ]; 197 | const vals = data.map(v => v.map(toInt32s)); 198 | return dataQuery(data, 'INTEGER[3]', vals); 199 | } 200 | 201 | export function fixedListUtf8() { 202 | return dataQuery([ 203 | [['a', 'b', 'c'], ['d', 'e', 'f'], ['g', 'h', 'i']], 204 | [['a', 'b', 'c'], null, ['g', 'h', 'i']], 205 | [['a', null, 'c'], [null, 'e', 'f'], ['g', 'h', null]] 206 | ], 'VARCHAR[3]'); 207 | } 208 | 209 | export function union() { 210 | return dataQuery([ 211 | ['a', 2, 'c'], 212 | ['a', null, 'c'] 213 | ], 'UNION(i INTEGER, v VARCHAR)'); 214 | } 215 | 216 | export function map() { 217 | return dataQuery([ 218 | [ 
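// plain JS Map inputs; the null value in the second map exercises nullable map values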
219 | new Map([ ['foo', 1], ['bar', 2] ]), 220 | new Map([ ['foo', null], ['baz', 3] ]) 221 | ] 222 | ]); 223 | } 224 | 225 | export function struct() { 226 | return dataQuery([ 227 | [ {a: 1, b: 'foo'}, {a: 2, b: 'baz'} ], 228 | [ {a: 1, b: 'foo'}, null, {a: 2, b: 'baz'} ], 229 | [ {a: null, b: 'foo'}, {a: 2, b: null} ], 230 | [ {a: ['a', 'b'], b: Math.E}, {a: ['c', 'd'], b: Math.PI} ] 231 | ]); 232 | } 233 | 234 | export async function listView() { 235 | const bytes = await readFile(`test/data/listview.arrows`); 236 | return [{ 237 | values: [ 238 | ['foo', 'bar', 'baz'], 239 | null, 240 | ['baz', null, 'foo'], 241 | ['foo'] 242 | ], 243 | bytes, 244 | nullCount: 1 245 | }]; 246 | } 247 | 248 | export async function largeListView() { 249 | const bytes = await readFile(`test/data/largelistview.arrows`); 250 | return [{ 251 | values: [ 252 | ['foo', 'bar', 'baz'], 253 | null, 254 | ['baz', null, 'foo'], 255 | ['foo'] 256 | ], 257 | bytes, 258 | nullCount: 1 259 | }]; 260 | } 261 | 262 | export async function runEndEncoded32() { 263 | const bytes = new Uint8Array(await readFile(`test/data/runendencoded.arrows`)); 264 | return [{ 265 | runs: { 266 | counts: [2, 3, 4, 6, 8, 9], 267 | values: ['foo', null, 'bar', 'baz', null, 'foo'] 268 | }, 269 | values: ['foo', 'foo', null, 'bar', 'baz', 'baz', null, null, 'foo'], 270 | bytes, 271 | nullCount: 3 272 | }]; 273 | } 274 | 275 | export async function runEndEncoded64() { 276 | const bytes = new Uint8Array(await readFile(`test/data/runendencoded64.arrows`)); 277 | return [{ 278 | runs: { 279 | counts: [2, 3, 4, 6, 8, 9], 280 | values: ['foo', null, 'bar', 'baz', null, 'foo'] 281 | }, 282 | values: ['foo', 'foo', null, 'bar', 'baz', 'baz', null, null, 'foo'], 283 | bytes, 284 | nullCount: 3 285 | }]; 286 | } 287 | 288 | export async function binaryView() { 289 | const encoder = new TextEncoder(); 290 | const toBytes = v => v == null ? 
null : encoder.encode(v); 291 | const bytes = new Uint8Array(await readFile(`test/data/binaryview.arrows`)); 292 | return [{ 293 | values: { 294 | flat: ['foo', 'foo', null, 'bar', 'baz', 'baz', null, null, 'foo'].map(toBytes), 295 | spill: ['foobazbarbipbopboodeedoozoo', 'foo', null, 'bar', 'baz', 'baz', null, null, 'foo'].map(toBytes) 296 | }, 297 | bytes, 298 | nullCount: 3 299 | }]; 300 | } 301 | 302 | export async function utf8View() { 303 | const bytes = new Uint8Array(await readFile(`test/data/utf8view.arrows`)); 304 | return [{ 305 | values: { 306 | flat: ['foo', 'foo', null, 'bar', 'baz', 'baz', null, null, 'foo'], 307 | spill: ['foobazbarbipbopboodeedoozoo', 'foo', null, 'bar', 'baz', 'baz', null, null, 'foo'] 308 | }, 309 | bytes, 310 | nullCount: 3 311 | }]; 312 | } 313 | 314 | // For empty result sets, DuckDB node only returns four zero bytes (a stream end marker) 315 | // Other variants may include a schema message 316 | export async function empty() { 317 | return [ 318 | { 319 | values: [], 320 | bytes: Uint8Array.of(0, 0, 0, 0), 321 | nullCount: 0 322 | }, 323 | { 324 | values: [], 325 | bytes: new Uint8Array(await readFile(`test/data/empty.arrows`)), 326 | nullCount: 0 327 | } 328 | ]; 329 | } 330 | -------------------------------------------------------------------------------- /test/util/decimal.js: -------------------------------------------------------------------------------- 1 | import { Type, Version } from '../../src/index.js'; 2 | 3 | export function decimalDataToEncode() { 4 | return { 5 | schema: { 6 | version: Version.V5, 7 | endianness: 0, 8 | fields: [{ 9 | name: 'd', 10 | type: { typeId: Type.Decimal, precision: 18, scale: 3, bitWidth: 128, values: BigUint64Array }, 11 | nullable: true, 12 | metadata: null 13 | }], 14 | metadata: null 15 | }, 16 | records: [{ 17 | length: 3, 18 | nodes: [ { length: 3, nullCount: 0 } ], 19 | regions: [ 20 | { offset: 0, length: 0 }, 21 | { offset: 0, length: 48 } 22 | ], 23 | variadic: [], 24 | buffers: [ 25 | Uint8Array.of(232,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,224,46,0,0,0,0,0,0,0,0,0,0,0,0,0,0,208,132,0,0,0,0,0,0,0,0,0,0,0,0,0,0) 26 | ], 27 | byteLength: 48 28 | }], 29 | dictionaries: [], 30 | metadata: null 31 | }; 32 | } 33 | 34 | export function decimalDataDecoded() { 35 | const data = decimalDataToEncode(); 36 | const record = data.records[0]; 37 | record.body = record.buffers[0]; 38 | delete record.byteLength; 39 | delete record.buffers; 40 | return data; 41 | } 42 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "include": ["src/index-types.ts"], 3 | "compilerOptions": { 4 | "allowJs": true, 5 | "checkJs": true, 6 | "declaration": true, 7 | "emitDeclarationOnly": true, 8 | "esModuleInterop": true, 9 | "module": "node16", 10 | "moduleResolution": "node16", 11 | "outDir": "dist/types", 12 | "target": "es2022", 13 | "skipLibCheck": true 14 | } 15 | } 16 | --------------------------------------------------------------------------------