├── hyparquet-writer.jpg ├── test ├── files │ ├── listy.parquet │ ├── signs.parquet │ ├── geospatial.parquet │ ├── fixed_length_decimal.parquet │ └── float16_nonzeros_and_nans.parquet ├── write.roundtrip.test.js ├── package.test.js ├── write.file.test.js ├── wkb.test.js ├── write.geospatial.test.js ├── write.list.test.js ├── snappy.test.js ├── encoding.test.js ├── geospatial.test.js ├── filewriter.test.js ├── schema.test.js ├── bytewriter.test.js ├── write.delta.test.js ├── write.splitstream.test.js ├── splitstream.test.js ├── thrift.test.js ├── write.schema.test.js ├── plain.test.js ├── example.js ├── write.multipage.test.js ├── metadata.test.js ├── delta.test.js └── unconvert.test.js ├── .gitignore ├── tsconfig.build.json ├── tsconfig.json ├── .github └── workflows │ └── ci.yml ├── src ├── index.js ├── write.js ├── node.js ├── indexes.js ├── splitstream.js ├── types.d.ts ├── encoding.js ├── dremel.js ├── plain.js ├── bytewriter.js ├── wkb.js ├── geospatial.js ├── parquet-writer.js ├── thrift.js ├── delta.js ├── schema.js ├── snappy.js ├── metadata.js ├── datapage.js ├── unconvert.js └── column.js ├── LICENSE ├── package.json ├── eslint.config.js ├── benchmark.js └── README.md /hyparquet-writer.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet-writer/HEAD/hyparquet-writer.jpg -------------------------------------------------------------------------------- /test/files/listy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet-writer/HEAD/test/files/listy.parquet -------------------------------------------------------------------------------- /test/files/signs.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet-writer/HEAD/test/files/signs.parquet -------------------------------------------------------------------------------- /test/files/geospatial.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet-writer/HEAD/test/files/geospatial.parquet -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | coverage 2 | node_modules 3 | package-lock.json 4 | *.tgz 5 | .DS_Store 6 | /*.parquet 7 | /data 8 | /types 9 | -------------------------------------------------------------------------------- /test/files/fixed_length_decimal.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet-writer/HEAD/test/files/fixed_length_decimal.parquet -------------------------------------------------------------------------------- /test/files/float16_nonzeros_and_nans.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet-writer/HEAD/test/files/float16_nonzeros_and_nans.parquet -------------------------------------------------------------------------------- /tsconfig.build.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "./tsconfig.json", 3 | "compilerOptions": { 4 | "noEmit": false, 5 | "declaration": true, 6 | "emitDeclarationOnly": true, 7 | "outDir": "types", 8 | "declarationMap": true 9 | }, 10 | "include": 
["src"] 11 | } 12 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "allowJs": true, 4 | "checkJs": true, 5 | "lib": ["esnext", "dom"], 6 | "module": "nodenext", 7 | "noEmit": true, 8 | "resolveJsonModule": true, 9 | "strict": true 10 | }, 11 | "include": ["src", "test"] 12 | } 13 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | pull_request: 4 | push: 5 | 6 | jobs: 7 | lint: 8 | runs-on: ubuntu-latest 9 | timeout-minutes: 5 10 | steps: 11 | - uses: actions/checkout@v6 12 | - run: npm i 13 | - run: npm run lint 14 | 15 | typecheck: 16 | runs-on: ubuntu-latest 17 | timeout-minutes: 5 18 | steps: 19 | - uses: actions/checkout@v6 20 | - run: npm i 21 | - run: npx tsc 22 | 23 | test: 24 | runs-on: ubuntu-latest 25 | timeout-minutes: 5 26 | steps: 27 | - uses: actions/checkout@v6 28 | - run: npm i 29 | - run: npm run coverage 30 | -------------------------------------------------------------------------------- /src/index.js: -------------------------------------------------------------------------------- 1 | export { parquetWrite, parquetWriteBuffer } from './write.js' 2 | export { autoSchemaElement, schemaFromColumnData } from './schema.js' 3 | export { ByteWriter } from './bytewriter.js' 4 | export { ParquetWriter } from './parquet-writer.js' 5 | 6 | /** 7 | * @typedef {import('hyparquet').KeyValue} KeyValue 8 | * @typedef {import('hyparquet').SchemaElement} SchemaElement 9 | * @typedef {import('../src/types.d.ts').BasicType} BasicType 10 | * @typedef {import('../src/types.d.ts').ColumnSource} ColumnSource 11 | * @typedef {import('../src/types.d.ts').ParquetWriteOptions} ParquetWriteOptions 12 | * @typedef {import('../src/types.d.ts').Writer} Writer 13 | */ 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
8 | -------------------------------------------------------------------------------- /test/write.roundtrip.test.js: -------------------------------------------------------------------------------- 1 | import fs from 'fs' 2 | import { asyncBufferFromFile, parquetMetadataAsync, parquetReadObjects, parquetSchema } from 'hyparquet' 3 | import { describe, expect, it } from 'vitest' 4 | import { parquetWriteBuffer } from '../src/index.js' 5 | 6 | describe('parquetWrite round-trip', () => { 7 | const files = fs.readdirSync('test/files').filter(f => f.endsWith('.parquet')) 8 | 9 | files.forEach(filename => { 10 | it(`round-trips data from ${filename}`, async () => { 11 | const file = await asyncBufferFromFile(`test/files/${filename}`) 12 | const metadata = await parquetMetadataAsync(file) 13 | const rows = await parquetReadObjects({ file }) 14 | 15 | // transpose the row data 16 | const schemaTree = parquetSchema(metadata) 17 | const columnData = schemaTree.children.map(({ element }) => ({ 18 | name: element.name, 19 | data: new Array(), 20 | })) 21 | for (const row of rows) { 22 | for (const { name, data } of columnData) { 23 | data.push(row[name]) 24 | } 25 | } 26 | 27 | const buffer = parquetWriteBuffer({ columnData, schema: metadata.schema }) 28 | const output = await parquetReadObjects({ file: buffer }) 29 | 30 | expect(output.length).toBe(rows.length) 31 | expect(output).toEqual(rows) 32 | }) 33 | }) 34 | }) 35 | -------------------------------------------------------------------------------- /src/write.js: -------------------------------------------------------------------------------- 1 | import { ByteWriter } from './bytewriter.js' 2 | import { ParquetWriter } from './parquet-writer.js' 3 | import { schemaFromColumnData } from './schema.js' 4 | 5 | /** 6 | * Write data as parquet to a file or stream. 7 | * 8 | * @import {ParquetWriteOptions} from '../src/types.js' 9 | * @param {ParquetWriteOptions} options 10 | */ 11 | export function parquetWrite({ 12 | writer, 13 | columnData, 14 | schema, 15 | codec = 'SNAPPY', 16 | compressors, 17 | statistics = true, 18 | rowGroupSize = [100, 1000, 10000], 19 | kvMetadata, 20 | pageSize = 1048576, 21 | }) { 22 | if (!schema) { 23 | schema = schemaFromColumnData({ columnData }) 24 | } else if (columnData.some(({ type }) => type)) { 25 | throw new Error('cannot provide both schema and columnData type') 26 | } else { 27 | // TODO: validate schema 28 | } 29 | const pq = new ParquetWriter({ 30 | writer, 31 | schema, 32 | codec, 33 | compressors, 34 | statistics, 35 | kvMetadata, 36 | }) 37 | pq.write({ 38 | columnData, 39 | rowGroupSize, 40 | pageSize, 41 | }) 42 | pq.finish() 43 | } 44 | 45 | /** 46 | * Write data as parquet to an ArrayBuffer. 
47 | * 48 | * @param {Omit<ParquetWriteOptions, 'writer'>} options 49 | * @returns {ArrayBuffer} 50 | */ 51 | export function parquetWriteBuffer(options) { 52 | const writer = new ByteWriter() 53 | parquetWrite({ ...options, writer }) 54 | return writer.getBuffer() 55 | } 56 | -------------------------------------------------------------------------------- /test/package.test.js: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from 'vitest' 2 | import packageJson from '../package.json' with { type: 'json' } 3 | 4 | describe('package.json', () => { 5 | it('should have the correct name', () => { 6 | expect(packageJson.name).toBe('hyparquet-writer') 7 | }) 8 | 9 | it('should have a valid version', () => { 10 | expect(packageJson.version).toMatch(/^\d+\.\d+\.\d+$/) 11 | }) 12 | 13 | it('should have MIT license', () => { 14 | expect(packageJson.license).toBe('MIT') 15 | }) 16 | 17 | it('should have precise dev dependency versions', () => { 18 | const { dependencies, devDependencies } = packageJson 19 | const allDependencies = { ...dependencies, ...devDependencies } 20 | Object.values(allDependencies).forEach(version => { 21 | expect(version).toMatch(/^\d+\.\d+\.\d+$/) 22 | }) 23 | }) 24 | 25 | it('should have no peer dependencies', () => { 26 | expect('peerDependencies' in packageJson).toBe(false) 27 | }) 28 | 29 | it('should have exports with types first', () => { 30 | const { exports } = packageJson 31 | expect(Object.keys(exports)).toEqual(['.', './src/*.js']) 32 | // node vs default (browser) 33 | expect(Object.keys(exports['.'])).toEqual(['browser', 'default']) 34 | expect(Object.keys(exports['.'].browser)).toEqual(['types', 'default']) 35 | expect(Object.keys(exports['.'].default)).toEqual(['types', 'default']) 36 | // deep imports 37 | expect(Object.keys(exports['./src/*.js'])).toEqual(['types', 'default']) 38 | }) 39 | }) 40 | -------------------------------------------------------------------------------- /test/write.file.test.js: -------------------------------------------------------------------------------- 1 | import fs from 'fs' 2 | import { asyncBufferFromFile, parquetMetadataAsync, parquetReadObjects } from 'hyparquet' 3 | import { afterEach, beforeEach, describe, expect, it } from 'vitest' 4 | import { parquetWriteFile } from '../src/node.js' 5 | import { exampleData, exampleMetadata } from './example.js' 6 | 7 | const filedir = 'data/' 8 | const filename = 'data/write.file.parquet' 9 | 10 | describe('parquetWriteFile', () => { 11 | beforeEach(() => { 12 | // ensure data directory exists 13 | if (!fs.existsSync(filedir)) { 14 | fs.mkdirSync(filedir) 15 | } 16 | }) 17 | 18 | afterEach(() => { 19 | // remove test file 20 | if (fs.existsSync(filename)) { 21 | fs.unlinkSync(filename) 22 | } 23 | }) 24 | 25 | it('writes parquet file', async () => { 26 | parquetWriteFile({ filename, columnData: exampleData }) 27 | 28 | // check parquet metadata 29 | const file = await asyncBufferFromFile(filename) 30 | const metadata = await parquetMetadataAsync(file) 31 | expect(metadata).toEqual(exampleMetadata) 32 | 33 | // check parquet data 34 | const result = await parquetReadObjects({ file, metadata }) 35 | expect(result).toEqual([ 36 | { bool: true, int: 0, bigint: 0n, float: 0, double: 0, string: 'a', nullable: true }, 37 | { bool: false, int: 127, bigint: 127n, float: 0.00009999999747378752, double: 0.0001, string: 'b', nullable: false }, 38 | { bool: true, int: 0x7fff, bigint: 0x7fffn, float: 123.45600128173828, double: 123.456, string: 'c', nullable:
null }, 39 | { bool: false, int: 0x7fffffff, bigint: 0x7fffffffffffffffn, float: Infinity, double: 1e100, string: 'd', nullable: null }, 40 | ]) 41 | }) 42 | }) 43 | -------------------------------------------------------------------------------- /test/wkb.test.js: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from 'vitest' 2 | import { geojsonToWkb } from '../src/wkb.js' 3 | import { wkbToGeojson } from 'hyparquet/src/wkb.js' 4 | 5 | /** @import {Geometry} from 'hyparquet/src/types.js' */ 6 | 7 | describe('geojsonToWkb', () => { 8 | it('encodes point geometries', () => { 9 | /** @type {Geometry} */ 10 | const geometry = { type: 'Point', coordinates: [30, 10] } 11 | const decoded = decode(geojsonToWkb(geometry)) 12 | expect(decoded).toEqual(geometry) 13 | }) 14 | 15 | it('encodes polygons with holes', () => { 16 | /** @type {Geometry} */ 17 | const geometry = { 18 | type: 'Polygon', 19 | coordinates: [ 20 | [[35, 10], [45, 45], [15, 40], [10, 20], [35, 10]], 21 | [[20, 30], [35, 35], [30, 20], [20, 30]], 22 | ], 23 | } 24 | const decoded = decode(geojsonToWkb(geometry)) 25 | expect(decoded).toEqual(geometry) 26 | }) 27 | 28 | it('encodes geometry collections with mixed dimensions', () => { 29 | /** @type {Geometry} */ 30 | const geometry = { 31 | type: 'GeometryCollection', 32 | geometries: [ 33 | { type: 'Point', coordinates: [30, 10, 5] }, 34 | { type: 'LineString', coordinates: [[30, 10, 5], [40, 40, 5], [20, 40, 5], [10, 20, 5]] }, 35 | ], 36 | } 37 | const decoded = decode(geojsonToWkb(geometry)) 38 | expect(decoded).toEqual(geometry) 39 | }) 40 | }) 41 | 42 | /** 43 | * Decode WKB using the hyparquet reader for verification. 44 | * 45 | * @param {Uint8Array} wkb 46 | * @returns {Geometry} 47 | */ 48 | function decode(wkb) { 49 | const view = new DataView(wkb.buffer, wkb.byteOffset, wkb.byteLength) 50 | const reader = { view, offset: 0 } 51 | return wkbToGeojson(reader) 52 | } 53 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "hyparquet-writer", 3 | "version": "0.11.2", 4 | "description": "Parquet file writer for JavaScript", 5 | "author": "Hyperparam", 6 | "homepage": "https://hyperparam.app", 7 | "keywords": [ 8 | "ai", 9 | "data", 10 | "hyperparam", 11 | "hyparquet", 12 | "ml", 13 | "parquet", 14 | "snappy", 15 | "thrift" 16 | ], 17 | "license": "MIT", 18 | "repository": { 19 | "type": "git", 20 | "url": "git+https://github.com/hyparam/hyparquet-writer.git" 21 | }, 22 | "main": "src/index.js", 23 | "files": [ 24 | "src", 25 | "types" 26 | ], 27 | "type": "module", 28 | "types": "types/index.d.ts", 29 | "exports": { 30 | ".": { 31 | "browser": { 32 | "types": "./types/index.d.ts", 33 | "default": "./src/index.js" 34 | }, 35 | "default": { 36 | "types": "./types/node.d.ts", 37 | "default": "./src/node.js" 38 | } 39 | }, 40 | "./src/*.js": { 41 | "types": "./types/*.d.ts", 42 | "default": "./src/*.js" 43 | } 44 | }, 45 | "sideEffects": false, 46 | "scripts": { 47 | "build:types": "tsc -p ./tsconfig.build.json", 48 | "coverage": "vitest run --coverage --coverage.include=src", 49 | "lint": "eslint", 50 | "lint:fix": "eslint --fix", 51 | "prepare": "npm run build:types", 52 | "test": "vitest run" 53 | }, 54 | "dependencies": { 55 | "hyparquet": "1.23.2" 56 | }, 57 | "devDependencies": { 58 | "@babel/eslint-parser": "7.28.5", 59 | "@types/node": "25.0.3", 60 | 
"@vitest/coverage-v8": "4.0.16", 61 | "eslint": "9.39.2", 62 | "eslint-plugin-jsdoc": "61.5.0", 63 | "hysnappy": "1.1.0", 64 | "typescript": "5.9.3", 65 | "vitest": "4.0.16" 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /test/write.geospatial.test.js: -------------------------------------------------------------------------------- 1 | import { parquetMetadata } from 'hyparquet' 2 | import { describe, expect, it } from 'vitest' 3 | import { parquetWriteBuffer } from '../src/index.js' 4 | 5 | /** 6 | * @import {ColumnSource} from '../src/types.js' 7 | */ 8 | 9 | describe('geospatial statistics', () => { 10 | it('writes geospatial statistics into column metadata', () => { 11 | /** @type {ColumnSource[]} */ 12 | const columnData = [{ 13 | name: 'geometry', 14 | type: 'GEOMETRY', 15 | data: [ 16 | { type: 'Point', coordinates: [10, 5, 100, 2] }, 17 | null, 18 | { 19 | type: 'LineString', 20 | coordinates: [ 21 | [-20, -10, 50, 5], 22 | [40, 30, 75, -5], 23 | ], 24 | }, 25 | { 26 | type: 'GeometryCollection', 27 | geometries: [ 28 | { type: 'Point', coordinates: [5, 15] }, 29 | { 30 | type: 'MultiPoint', 31 | coordinates: [ 32 | [0, -5], 33 | [60, 10], 34 | ], 35 | }, 36 | ], 37 | }, 38 | ], 39 | }] 40 | 41 | const buffer = parquetWriteBuffer({ columnData }) 42 | const metadata = parquetMetadata(buffer) 43 | const columnMeta = metadata.row_groups[0].columns[0].meta_data 44 | 45 | expect(columnMeta?.statistics).toEqual({ null_count: 1n }) 46 | expect(columnMeta?.geospatial_statistics).toEqual({ 47 | bbox: { 48 | xmin: -20, 49 | xmax: 60, 50 | ymin: -10, 51 | ymax: 30, 52 | zmin: 50, 53 | zmax: 100, 54 | mmin: -5, 55 | mmax: 5, 56 | }, 57 | // sort numerically not by string order 58 | geospatial_types: [7, 3001, 3002], 59 | }) 60 | }) 61 | }) 62 | -------------------------------------------------------------------------------- /src/node.js: -------------------------------------------------------------------------------- 1 | import fs from 'fs' 2 | import { ByteWriter } from './bytewriter.js' 3 | import { parquetWrite } from './write.js' 4 | 5 | export * from './index.js' 6 | 7 | /** 8 | * Write data as parquet to a local file. 9 | * 10 | * @import {ParquetWriteOptions, Writer} from '../src/types.js' 11 | * @param {Omit & {filename: string}} options 12 | */ 13 | export function parquetWriteFile(options) { 14 | const { filename, ...rest } = options 15 | const writer = fileWriter(filename) 16 | parquetWrite({ ...rest, writer }) 17 | } 18 | 19 | /** 20 | * Buffered file writer. 21 | * Writes data to a local file in chunks using node fs. 
22 | * 23 | * @param {string} filename 24 | * @returns {Writer} 25 | */ 26 | export function fileWriter(filename) { 27 | const writer = new ByteWriter() 28 | const chunkSize = 1_000_000 // 1mb 29 | 30 | // create a new file or overwrite existing one 31 | fs.writeFileSync(filename, '', { flag: 'w' }) 32 | 33 | function flush() { 34 | const chunk = writer.buffer.slice(0, writer.index) 35 | // TODO: async 36 | fs.writeFileSync(filename, new Uint8Array(chunk), { flag: 'a' }) 37 | writer.index = 0 38 | } 39 | 40 | /** 41 | * Override the ensure method 42 | * @param {number} size 43 | */ 44 | writer.ensure = function(size) { 45 | if (writer.index > chunkSize) { 46 | flush() 47 | } 48 | if (writer.index + size > writer.buffer.byteLength) { 49 | const newSize = Math.max(writer.buffer.byteLength * 2, writer.index + size) 50 | const newBuffer = new ArrayBuffer(newSize) 51 | new Uint8Array(newBuffer).set(new Uint8Array(writer.buffer)) 52 | writer.buffer = newBuffer 53 | writer.view = new DataView(writer.buffer) 54 | } 55 | } 56 | writer.getBuffer = function() { 57 | throw new Error('getBuffer not supported for FileWriter') 58 | } 59 | writer.finish = function() { 60 | flush() 61 | } 62 | return writer 63 | } 64 | -------------------------------------------------------------------------------- /test/write.list.test.js: -------------------------------------------------------------------------------- 1 | import { parquetReadObjects } from 'hyparquet' 2 | import { describe, expect, it } from 'vitest' 3 | import { parquetWriteBuffer } from '../src/index.js' 4 | 5 | describe('parquetWrite lists', () => { 6 | it('writes optional list columns', async () => { 7 | const listy = [ 8 | [1, 2], 9 | null, 10 | [], 11 | [3, null, 4], 12 | [null], 13 | ] 14 | 15 | const buffer = parquetWriteBuffer({ 16 | columnData: [{ name: 'listy', data: listy }], 17 | schema: [ 18 | { name: 'root', num_children: 1 }, 19 | { 20 | name: 'listy', 21 | repetition_type: 'OPTIONAL', 22 | num_children: 1, 23 | converted_type: 'LIST', 24 | }, 25 | { 26 | name: 'list', 27 | repetition_type: 'REPEATED', 28 | num_children: 1, 29 | }, 30 | { 31 | name: 'element', 32 | repetition_type: 'OPTIONAL', 33 | type: 'INT32', 34 | }, 35 | ], 36 | }) 37 | 38 | const rows = await parquetReadObjects({ file: buffer }) 39 | expect(rows).toEqual([ 40 | { listy: [1, 2] }, 41 | { listy: undefined }, 42 | { listy: [] }, 43 | { listy: [3, null, 4] }, 44 | { listy: [null] }, 45 | ]) 46 | }) 47 | 48 | it('throws on null data for required list columns', () => { 49 | /** 50 | * Schema for a required list of required INT32 values. 
51 | * @type {import('hyparquet').SchemaElement[]} 52 | */ 53 | const requiredListSchema = [ 54 | { name: 'root', num_children: 1 }, 55 | { 56 | name: 'numbers', 57 | repetition_type: 'REQUIRED', 58 | num_children: 1, 59 | converted_type: 'LIST', 60 | }, 61 | { 62 | name: 'list', 63 | repetition_type: 'REPEATED', 64 | num_children: 1, 65 | }, 66 | { 67 | name: 'element', 68 | repetition_type: 'REQUIRED', 69 | type: 'INT32', 70 | }, 71 | ] 72 | 73 | expect(() => parquetWriteBuffer({ 74 | columnData: [{ name: 'numbers', data: [[420], null] }], 75 | schema: requiredListSchema, 76 | })).toThrow('parquet required value is undefined') 77 | }) 78 | }) 79 | -------------------------------------------------------------------------------- /src/indexes.js: -------------------------------------------------------------------------------- 1 | import { BoundaryOrders } from 'hyparquet/src/constants.js' 2 | import { serializeTCompactProtocol } from './thrift.js' 3 | 4 | /** 5 | * @import {ColumnChunk, ColumnIndex, OffsetIndex} from 'hyparquet' 6 | * @import {PageIndexes, Writer} from '../src/types.js' 7 | */ 8 | 9 | /** 10 | * Write ColumnIndex and OffsetIndex for the given columns. 11 | * 12 | * @param {Writer} writer 13 | * @param {PageIndexes[]} pageIndexes 14 | */ 15 | export function writeIndexes(writer, pageIndexes) { 16 | for (const { chunk, columnIndex } of pageIndexes) { 17 | writeColumnIndex(writer, chunk, columnIndex) 18 | } 19 | for (const { chunk, offsetIndex } of pageIndexes) { 20 | writeOffsetIndex(writer, chunk, offsetIndex) 21 | } 22 | } 23 | 24 | /** 25 | * @param {Writer} writer 26 | * @param {ColumnChunk} columnChunk 27 | * @param {ColumnIndex} [columnIndex] 28 | */ 29 | function writeColumnIndex(writer, columnChunk, columnIndex) { 30 | // Page indexes only help when multiple pages 31 | if (!columnIndex || columnIndex.min_values.length <= 1) return 32 | const columnIndexOffset = writer.offset 33 | serializeTCompactProtocol(writer, { 34 | field_1: columnIndex.null_pages, 35 | field_2: columnIndex.min_values, 36 | field_3: columnIndex.max_values, 37 | field_4: BoundaryOrders.indexOf(columnIndex.boundary_order), 38 | field_5: columnIndex.null_counts, 39 | }) 40 | columnChunk.column_index_offset = BigInt(columnIndexOffset) 41 | columnChunk.column_index_length = writer.offset - columnIndexOffset 42 | } 43 | 44 | /** 45 | * @param {Writer} writer 46 | * @param {ColumnChunk} columnChunk 47 | * @param {OffsetIndex} [offsetIndex] 48 | */ 49 | function writeOffsetIndex(writer, columnChunk, offsetIndex) { 50 | // Page indexes only help when multiple pages 51 | if (!offsetIndex || offsetIndex.page_locations.length <= 1) return 52 | const offsetIndexOffset = writer.offset 53 | serializeTCompactProtocol(writer, { 54 | field_1: offsetIndex.page_locations.map(p => ({ 55 | field_1: p.offset, 56 | field_2: p.compressed_page_size, 57 | field_3: p.first_row_index, 58 | })), 59 | }) 60 | columnChunk.offset_index_offset = BigInt(offsetIndexOffset) 61 | columnChunk.offset_index_length = writer.offset - offsetIndexOffset 62 | } 63 | -------------------------------------------------------------------------------- /test/snappy.test.js: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from 'vitest' 2 | import { snappyCompress } from '../src/snappy.js' 3 | import { snappyUncompress } from 'hyparquet' 4 | 5 | describe('snappy compress', () => { 6 | 7 | it.for([ 8 | { compressed: [0x00], uncompressed: '' }, 9 | { compressed: [0x01, 0x00, 0x68], 
uncompressed: 'h' }, 10 | { compressed: [0x02, 0x04, 0x68, 0x79], uncompressed: 'hy' }, 11 | { compressed: [0x03, 0x08, 0x68, 0x79, 0x70], uncompressed: 'hyp' }, 12 | { compressed: [0x05, 0x10, 0x68, 0x79, 0x70, 0x65, 0x72], uncompressed: 'hyper' }, 13 | { 14 | compressed: [0x0a, 0x24, 0x68, 0x79, 0x70, 0x65, 0x72, 0x70, 0x61, 0x72, 0x61, 0x6d], 15 | uncompressed: 'hyperparam', 16 | }, 17 | { 18 | compressed: [0x15, 0x08, 0x68, 0x79, 0x70, 0x46, 0x03, 0x00], 19 | uncompressed: 'hyphyphyphyphyphyphyp', 20 | }, 21 | { 22 | // from rowgroups.parquet 23 | compressed: [ 24 | 80, 4, 1, 0, 9, 1, 0, 2, 9, 7, 4, 0, 3, 13, 8, 0, 4, 13, 8, 0, 5, 13, 25 | 8, 0, 6, 13, 8, 0, 7, 13, 8, 0, 8, 13, 8, 60, 9, 0, 0, 0, 0, 0, 0, 0, 26 | 10, 0, 0, 0, 0, 0, 0, 0, 27 | ], 28 | uncompressed: new Uint8Array([ 29 | 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 30 | 0, 4, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 31 | 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 32 | 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 33 | ]), 34 | }, 35 | // from datapage_v2.snappy.parquet 36 | { compressed: [2, 4, 0, 3], uncompressed: new Uint8Array([0, 3]) }, 37 | { compressed: [ 6, 20, 2, 0, 0, 0, 3, 23], uncompressed: new Uint8Array([2, 0, 0, 0, 3, 23]) }, 38 | // from sample data test 39 | { 40 | compressed: [1, 0, 5], 41 | uncompressed: new Uint8Array([5]), 42 | }, 43 | ])('compresses valid input %p', ({ uncompressed }) => { 44 | const encoder = new TextEncoder() 45 | const input = typeof uncompressed === 'string' ? encoder.encode(uncompressed) : new Uint8Array(uncompressed) 46 | const output = snappyCompress(input) 47 | // verify round-trip: decompress and compare to original 48 | const decompressed = new Uint8Array(input.length) 49 | snappyUncompress(output, decompressed) 50 | expect(decompressed).toEqual(input) 51 | }) 52 | }) 53 | -------------------------------------------------------------------------------- /eslint.config.js: -------------------------------------------------------------------------------- 1 | import javascript from '@eslint/js' 2 | import jsdoc from 'eslint-plugin-jsdoc' 3 | 4 | export default [ 5 | { 6 | plugins: { 7 | jsdoc, 8 | }, 9 | 10 | languageOptions: { 11 | globals: { 12 | 'TextDecoder': false, 13 | 'TextEncoder': false, 14 | // for benchmark: 15 | 'console': false, 16 | 'fetch': false, 17 | 'performance': false, 18 | }, 19 | }, 20 | 21 | rules: { 22 | ...javascript.configs.recommended.rules, 23 | 'arrow-spacing': 'error', 24 | camelcase: 'off', 25 | 'comma-spacing': 'error', 26 | 'comma-dangle': ['error', { 27 | arrays: 'always-multiline', 28 | objects: 'always-multiline', 29 | imports: 'always-multiline', 30 | exports: 'always-multiline', 31 | functions: 'never', 32 | }], 33 | 'eol-last': 'error', 34 | eqeqeq: 'error', 35 | 'func-style': ['error', 'declaration'], 36 | indent: ['error', 2], 37 | 'jsdoc/check-param-names': 'error', 38 | 'jsdoc/check-property-names': 'error', 39 | 'jsdoc/check-tag-names': 'error', 40 | 'jsdoc/require-param': 'error', 41 | 'jsdoc/require-param-type': 'error', 42 | 'jsdoc/require-returns': 'error', 43 | 'jsdoc/require-returns-type': 'error', 44 | 'jsdoc/sort-tags': 'error', 45 | 'key-spacing': 'error', 46 | 'no-constant-condition': 'warn', 47 | 'no-extra-parens': 'warn', 48 | 'no-multi-spaces': 'error', 49 | 'no-trailing-spaces': 'error', 50 | 'no-undef': 'error', 51 | 'no-unused-vars': 'error', 52 | 'no-useless-concat': 'error', 53 | 'no-useless-rename': 'error', 54 | 'no-useless-return': 'error', 55 | 'no-var': 
'error', 56 | 'object-curly-spacing': ['error', 'always'], 57 | 'object-shorthand': 'error', 58 | 'prefer-const': 'error', 59 | 'prefer-exponentiation-operator': 'error', 60 | 'prefer-promise-reject-errors': 'error', 61 | quotes: ['error', 'single'], 62 | 'require-await': 'warn', 63 | semi: ['error', 'never'], 64 | 'sort-imports': ['error', { 65 | ignoreDeclarationSort: true, 66 | ignoreMemberSort: false, 67 | memberSyntaxSortOrder: ['none', 'all', 'multiple', 'single'], 68 | }], 69 | 'space-infix-ops': 'error', 70 | }, 71 | }, 72 | ] 73 | -------------------------------------------------------------------------------- /test/encoding.test.js: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from 'vitest' 2 | import { ByteWriter } from '../src/bytewriter.js' 3 | import { writeRleBitPackedHybrid } from '../src/encoding.js' 4 | import { readRleBitPackedHybrid } from 'hyparquet/src/encoding.js' 5 | 6 | /** 7 | * Round-trip serialize and deserialize the given values. 8 | * 9 | * @param {number[]} values 10 | * @returns {number[]} 11 | */ 12 | function roundTripDeserialize(values) { 13 | const bitWidth = Math.ceil(Math.log2(Math.max(...values) + 1)) 14 | 15 | // Serialize the values using writeRleBitPackedHybrid 16 | const writer = new ByteWriter() 17 | writeRleBitPackedHybrid(writer, values, bitWidth) 18 | const buffer = writer.getBuffer() 19 | const reader = { view: new DataView(buffer), offset: 0 } 20 | 21 | // Decode the values using readRleBitPackedHybrid from hyparquet 22 | /** @type {number[]} */ 23 | const output = new Array(values.length) 24 | readRleBitPackedHybrid(reader, bitWidth, output, values.length) 25 | return output 26 | } 27 | 28 | describe('RLE bit-packed hybrid', () => { 29 | it('should round-trip a typical array of values', () => { 30 | const original = [1, 2, 3, 4, 5, 6, 7, 8, 9] 31 | const decoded = roundTripDeserialize(original) 32 | expect(decoded).toEqual(original) 33 | }) 34 | 35 | it('should round-trip an empty array', () => { 36 | const decoded = roundTripDeserialize([]) 37 | expect(decoded).toEqual([]) 38 | }) 39 | 40 | it('should round-trip an array of zeros', () => { 41 | const original = [0, 0, 0, 0, 0, 0, 0, 0] 42 | const decoded = roundTripDeserialize(original) 43 | expect(decoded).toEqual(original) 44 | }) 45 | 46 | it('should round-trip an array with large numbers', () => { 47 | const original = [1023, 511, 255, 127, 63, 31, 15, 7] 48 | const decoded = roundTripDeserialize(original) 49 | expect(decoded).toEqual(original) 50 | }) 51 | 52 | it('should round-trip a random array of values', () => { 53 | const original = Array.from({ length: 20 }, () => 54 | Math.floor(Math.random() * 1000) 55 | ) 56 | const decoded = roundTripDeserialize(original) 57 | expect(decoded).toEqual(original) 58 | }) 59 | 60 | it('should round-trip a sparse array of booleans', () => { 61 | const original = Array(10000).fill(0) 62 | original[10] = 1 63 | original[100] = 1 64 | original[500] = 1 65 | original[9999] = 1 66 | const decoded = roundTripDeserialize(original) 67 | expect(decoded).toEqual(original) 68 | }) 69 | }) 70 | -------------------------------------------------------------------------------- /benchmark.js: -------------------------------------------------------------------------------- 1 | import { createWriteStream, promises as fs } from 'fs' 2 | import { pipeline } from 'stream/promises' 3 | import { asyncBufferFromFile, parquetMetadataAsync, parquetReadObjects, parquetSchema } from 'hyparquet' 4 | 
import { snappyCompressor } from 'hysnappy' 5 | import { parquetWriteFile } from './src/node.js' 6 | 7 | const url = 'https://s3.hyperparam.app/wiki-en-00000-of-00041.parquet' 8 | const filename = 'data/wiki-en-00000-of-00041.parquet' 9 | 10 | // download test parquet file if needed 11 | let stat = await fs.stat(filename).catch(() => undefined) 12 | if (!stat) { 13 | console.log('downloading ' + url) 14 | const res = await fetch(url) 15 | if (!res.ok) throw new Error(res.statusText) 16 | // write to file async 17 | await pipeline(res.body, createWriteStream(filename)) 18 | stat = await fs.stat(filename) 19 | console.log('downloaded example.parquet', stat.size) 20 | } 21 | 22 | // asyncBuffer 23 | const file = await asyncBufferFromFile(filename) 24 | console.log(`parsing ${filename} (${stat.size.toLocaleString()} bytes)`) 25 | let startTime = performance.now() 26 | 27 | // read parquet file 28 | const metadata = await parquetMetadataAsync(file) 29 | const rows = await parquetReadObjects({ 30 | file, 31 | metadata, 32 | // columns: ['l_comment'], 33 | // rowStart: 0, 34 | // rowEnd: 100_000, 35 | }) 36 | let ms = performance.now() - startTime 37 | console.log(`parsed ${filename} ${rows.length.toLocaleString()} rows in ${ms.toFixed(0)} ms`) 38 | 39 | // transpose rows 40 | const schema = parquetSchema(metadata) 41 | const columnData = schema.children.map(({ element }) => ({ 42 | name: element.name, 43 | data: [], 44 | pageIndex: true, 45 | })) // .filter(({ name }) => name === 'l_comment') 46 | for (const row of rows) { 47 | for (const { name, data } of columnData) { 48 | data.push(row[name]) 49 | } 50 | } 51 | 52 | // write parquet file 53 | const outputFilename = 'data/benchmark-output.parquet' 54 | console.log(`writing ${outputFilename} (${rows.length.toLocaleString()} rows)`) 55 | startTime = performance.now() 56 | parquetWriteFile({ 57 | filename: outputFilename, 58 | columnData, 59 | schema: metadata.schema, 60 | compressors: { SNAPPY: snappyCompressor() }, 61 | // rowGroupSize: 200_000, 62 | }) 63 | ms = performance.now() - startTime 64 | stat = await fs.stat(outputFilename) 65 | console.log(`wrote ${outputFilename} (${stat.size.toLocaleString()} bytes) in ${ms.toFixed(0)} ms`) 66 | 67 | // check data is the same 68 | const outputFile = await asyncBufferFromFile(outputFilename) 69 | const outputRows = await parquetReadObjects({ file: outputFile }) 70 | for (let i = 0; i < rows.length; i++) { 71 | const inputRow = JSON.stringify(rows[i]) 72 | const outputRow = JSON.stringify(outputRows[i]) 73 | if (inputRow !== outputRow) { 74 | console.log(`row ${i} mismatch`) 75 | console.log('input ', inputRow) 76 | console.log('output', outputRow) 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /test/geospatial.test.js: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from 'vitest' 2 | import { geospatialStatistics } from '../src/geospatial.js' 3 | 4 | describe('geospatialStatistics', () => { 5 | it('computes bounding boxes and geospatial type codes for nested inputs', () => { 6 | const result = geospatialStatistics([ 7 | null, 8 | undefined, 9 | { type: 'Point', coordinates: [1, 2] }, 10 | { 11 | type: 'LineString', 12 | coordinates: [ 13 | [5, -1, 10], 14 | [0, 3, -5], 15 | [2, 2, undefined], 16 | [6, 1, Infinity], 17 | ], 18 | }, 19 | { 20 | type: 'Polygon', 21 | coordinates: [ 22 | [ 23 | [9, 9, 1, 5], 24 | [9, 10, 3, 5], 25 | [8, 9, -4, 8], 26 | [7, 8, Infinity, Infinity], 27 
| ], 28 | ], 29 | }, 30 | { 31 | type: 'MultiPoint', 32 | coordinates: [ 33 | [-5, -5, 0, -10], 34 | [4, 4, 12, undefined], 35 | ], 36 | }, 37 | { type: 'MultiPolygon', coordinates: [] }, 38 | { 39 | type: 'MultiLineString', 40 | coordinates: [ 41 | [ 42 | [ 43 | [Infinity, 0], 44 | ], 45 | ], 46 | ], 47 | }, 48 | { 49 | type: 'GeometryCollection', 50 | geometries: [ 51 | { type: 'Point', coordinates: [2, -3, 7, 9] }, 52 | { type: 'MultiPoint', coordinates: [[60, 10, 0, 11], [3, 6]] }, 53 | ], 54 | }, 55 | { type: 'GeometryCollection', geometries: [] }, 56 | ]) 57 | 58 | expect(result).toEqual({ 59 | bbox: { 60 | xmin: -5, 61 | xmax: 60, 62 | ymin: -5, 63 | ymax: 10, 64 | zmin: -5, 65 | zmax: 12, 66 | mmin: -10, 67 | mmax: 11, 68 | }, 69 | geospatial_types: [1, 5, 6, 7, 1002, 3003, 3004, 3007], 70 | }) 71 | }) 72 | 73 | it('omits geospatial statistics when only null-like values are present', () => { 74 | const result = geospatialStatistics([null, undefined, null]) 75 | expect(result).toBeUndefined() 76 | }) 77 | 78 | it('tracks type codes even when coordinates are empty', () => { 79 | const result = geospatialStatistics([ 80 | { type: 'Point', coordinates: [] }, 81 | ]) 82 | expect(result).toEqual({ 83 | bbox: undefined, 84 | geospatial_types: [1], 85 | }) 86 | }) 87 | 88 | it('throws on invalid value types and geometry definitions', () => { 89 | expect(() => geospatialStatistics(['oops'])).toThrow('geospatial column expects GeoJSON geometries') 90 | expect(() => geospatialStatistics([{ type: 'Unknown', coordinates: [] }])).toThrow('unknown geometry type: Unknown') 91 | expect(() => geospatialStatistics([{ type: 'Point', coordinates: [0, 0, 0, 0, 0] }])).toThrow('unsupported geometry dimensions: 5') 92 | }) 93 | }) 94 | -------------------------------------------------------------------------------- /src/splitstream.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Write values using BYTE_STREAM_SPLIT encoding. 3 | * This encoding writes all first bytes of values, then all second bytes, etc. 4 | * Can improve compression for floating-point and fixed-width numeric data. 5 | * 6 | * @import {DecodedArray, ParquetType} from 'hyparquet' 7 | * @import {Writer} from '../src/types.js' 8 | * @param {Writer} writer 9 | * @param {DecodedArray} values 10 | * @param {ParquetType} type 11 | * @param {number | undefined} typeLength 12 | */ 13 | export function writeByteStreamSplit(writer, values, type, typeLength) { 14 | const count = values.length 15 | 16 | // Get bytes from values based on type 17 | /** @type {Uint8Array} */ 18 | let bytes 19 | /** @type {number} */ 20 | let width 21 | if (type === 'FLOAT') { 22 | const typed = values instanceof Float32Array ? values : new Float32Array(numberArray(values)) 23 | bytes = new Uint8Array(typed.buffer, typed.byteOffset, typed.byteLength) 24 | width = 4 25 | } else if (type === 'DOUBLE') { 26 | const typed = values instanceof Float64Array ? values : new Float64Array(numberArray(values)) 27 | bytes = new Uint8Array(typed.buffer, typed.byteOffset, typed.byteLength) 28 | width = 8 29 | } else if (type === 'INT32') { 30 | const typed = values instanceof Int32Array ? 
values : new Int32Array(numberArray(values)) 31 | bytes = new Uint8Array(typed.buffer, typed.byteOffset, typed.byteLength) 32 | width = 4 33 | } else if (type === 'INT64') { 34 | const typed = bigIntArray(values) 35 | bytes = new Uint8Array(typed.buffer, typed.byteOffset, typed.byteLength) 36 | width = 8 37 | } else if (type === 'FIXED_LEN_BYTE_ARRAY') { 38 | if (!typeLength) throw new Error('parquet byte_stream_split missing type_length') 39 | width = typeLength 40 | bytes = new Uint8Array(count * width) 41 | for (let i = 0; i < count; i++) { 42 | bytes.set(values[i], i * width) 43 | } 44 | } else { 45 | throw new Error(`parquet byte_stream_split unsupported type: ${type}`) 46 | } 47 | 48 | // Write bytes in column format (all byte 0 from all values, then byte 1, etc.) 49 | for (let b = 0; b < width; b++) { 50 | for (let i = 0; i < count; i++) { 51 | writer.appendUint8(bytes[i * width + b]) 52 | } 53 | } 54 | } 55 | 56 | /** 57 | * @param {DecodedArray} values 58 | * @returns {number[]} 59 | */ 60 | function numberArray(values) { 61 | if (Array.isArray(values) && values.every(v => typeof v === 'number')) { 62 | return values 63 | } 64 | throw new Error('Expected number array for BYTE_STREAM_SPLIT encoding') 65 | } 66 | 67 | /** 68 | * @param {DecodedArray} values 69 | * @returns {BigInt64Array} 70 | */ 71 | function bigIntArray(values) { 72 | if (values instanceof BigInt64Array) return values 73 | if (Array.isArray(values) && values.every(v => typeof v === 'bigint')) { 74 | return new BigInt64Array(values) 75 | } 76 | throw new Error('Expected bigint array for BYTE_STREAM_SPLIT encoding') 77 | } 78 | 79 | -------------------------------------------------------------------------------- /src/types.d.ts: -------------------------------------------------------------------------------- 1 | import type { ColumnChunk, ColumnIndex, CompressionCodec, DecodedArray, Encoding, KeyValue, OffsetIndex, SchemaElement } from 'hyparquet' 2 | 3 | export type Compressor = (input: Uint8Array) => Uint8Array 4 | export type Compressors = { [K in CompressionCodec]?: Compressor } 5 | 6 | // Superset of parquet types with automatic conversions 7 | export type BasicType = 8 | 'BOOLEAN' | 9 | 'INT32' | 10 | 'INT64' | 11 | 'FLOAT' | 12 | 'DOUBLE' | 13 | 'BYTE_ARRAY' | 14 | 'STRING' | 15 | 'JSON' | 16 | 'TIMESTAMP' | 17 | 'UUID' | 18 | 'FLOAT16' | 19 | 'GEOMETRY' | 20 | 'GEOGRAPHY' 21 | 22 | export interface ParquetWriteOptions { 23 | writer: Writer 24 | columnData: ColumnSource[] 25 | schema?: SchemaElement[] 26 | codec?: CompressionCodec // global default codec, default 'SNAPPY' 27 | compressors?: Compressors // custom compressors 28 | statistics?: boolean // enable column statistics, default true 29 | rowGroupSize?: number | number[] // number of rows per row group 30 | pageSize?: number // target uncompressed page size in bytes, default 1048576 31 | kvMetadata?: KeyValue[] 32 | } 33 | 34 | export interface ColumnSource { 35 | name: string 36 | data: DecodedArray 37 | type?: BasicType 38 | nullable?: boolean 39 | encoding?: Encoding 40 | columnIndex?: boolean // write column indexes, default false 41 | offsetIndex?: boolean // write offset indexes, default false 42 | } 43 | 44 | export interface PageData { 45 | values: DecodedArray 46 | definitionLevels: number[] 47 | repetitionLevels: number[] 48 | numNulls: number 49 | } 50 | 51 | export interface ColumnEncoder { 52 | columnName: string 53 | element: SchemaElement 54 | schemaPath: SchemaElement[] 55 | codec: CompressionCodec 56 | compressors: Compressors 57 | 
stats: boolean 58 | pageSize: number 59 | // Spec: If ColumnIndex is present, OffsetIndex must also be present 60 | columnIndex: boolean 61 | offsetIndex: boolean 62 | encoding?: Encoding // user-specified encoding 63 | } 64 | 65 | export interface PageIndexes { 66 | chunk: ColumnChunk 67 | columnIndex?: ColumnIndex 68 | offsetIndex?: OffsetIndex 69 | } 70 | 71 | export interface Writer { 72 | buffer: ArrayBuffer 73 | view: DataView 74 | offset: number 75 | 76 | ensure(size: number): void 77 | finish(): void 78 | getBuffer(): ArrayBuffer 79 | appendUint8(value: number): void 80 | appendUint32(value: number): void 81 | appendInt32(value: number): void 82 | appendInt64(value: bigint): void 83 | appendFloat32(value: number): void 84 | appendFloat64(value: number): void 85 | appendBuffer(buffer: ArrayBuffer): void 86 | appendBytes(value: Uint8Array): void 87 | appendVarInt(value: number): void 88 | appendVarBigInt(value: bigint): void 89 | appendZigZag(value: number | bigint): void 90 | } 91 | 92 | export type ThriftObject = { [ key: `field_${number}` ]: ThriftType } 93 | export type ThriftType = boolean | number | bigint | string | Uint8Array | ThriftType[] | ThriftObject | undefined 94 | -------------------------------------------------------------------------------- /test/filewriter.test.js: -------------------------------------------------------------------------------- 1 | import fs from 'fs' 2 | import { afterEach, beforeEach, describe, expect, it } from 'vitest' 3 | import { fileWriter } from '../src/node.js' 4 | 5 | const filedir = 'data/' 6 | const filename = 'data/filewriter.test.bin' 7 | 8 | describe('FileWriter', () => { 9 | beforeEach(() => { 10 | // ensure data directory exists 11 | if (!fs.existsSync(filedir)) { 12 | fs.mkdirSync(filedir) 13 | } 14 | }) 15 | 16 | afterEach(() => { 17 | // remove test file 18 | if (fs.existsSync(filename)) { 19 | fs.unlinkSync(filename) 20 | } 21 | }) 22 | 23 | it('throws an error when calling getBuffer', () => { 24 | const writer = fileWriter(filename) 25 | expect(() => writer.getBuffer()).toThrowError('getBuffer not supported') 26 | }) 27 | 28 | it('writes single byte and flushes on finish', () => { 29 | const writer = fileWriter(filename) 30 | writer.appendUint8(0xff) 31 | writer.finish() 32 | 33 | // verify file exists and content is correct 34 | expect(fs.existsSync(filename)).toBe(true) 35 | const contents = fs.readFileSync(filename) 36 | expect(new Uint8Array(contents)).toEqual(new Uint8Array([0xff])) 37 | }) 38 | 39 | it('writes multiple data types to file', () => { 40 | const writer = fileWriter(filename) 41 | writer.appendUint8(0xab) 42 | writer.appendUint32(0x12345678) 43 | writer.appendInt32(-1) 44 | writer.appendInt64(0x1122334455667788n) 45 | writer.appendVarInt(300) 46 | writer.finish() 47 | 48 | const contents = new Uint8Array(fs.readFileSync(filename)) 49 | 50 | const expected = new Uint8Array([ 51 | 0xab, 52 | 0x78, 0x56, 0x34, 0x12, 53 | 0xff, 0xff, 0xff, 0xff, 54 | 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11, 55 | 0xac, 0x02, 56 | ]) 57 | expect(contents).toEqual(expected) 58 | }) 59 | 60 | it('auto-flushes when exceeding chunk size', () => { 61 | // default chunkSize = 1_000_000 bytes 62 | const writer = fileWriter(filename) 63 | 64 | // write slightly over 1mb to trigger auto-flush 65 | const largeArray = new Uint8Array(1_100_000).fill(0xaa) 66 | writer.appendBytes(largeArray) 67 | writer.appendBytes(largeArray) 68 | 69 | // expect first flush 70 | expect(fs.statSync(filename).size).toBe(1_100_000) 71 | 72 | 
writer.finish() 73 | 74 | // expect final flush 75 | expect(fs.statSync(filename).size).toBe(2_200_000) 76 | }) 77 | 78 | it('overwrites existing file if new writer is created with same filename', () => { 79 | // first write 80 | let writer = fileWriter(filename) 81 | writer.appendBytes(new Uint8Array([0x11, 0x22])) 82 | writer.finish() 83 | 84 | // verify the file now has [0x11, 0x22] 85 | let contents = fs.readFileSync(filename) 86 | expect(new Uint8Array(contents)).toEqual(new Uint8Array([0x11, 0x22])) 87 | 88 | // second write 89 | writer = fileWriter(filename) 90 | writer.appendBytes(new Uint8Array([0xaa, 0xbb])) 91 | writer.finish() 92 | 93 | // should overwrite the previous content 94 | contents = fs.readFileSync(filename) 95 | expect(new Uint8Array(contents)).toEqual(new Uint8Array([0xaa, 0xbb])) 96 | }) 97 | }) 98 | -------------------------------------------------------------------------------- /src/encoding.js: -------------------------------------------------------------------------------- 1 | import { ByteWriter } from './bytewriter.js' 2 | 3 | /** 4 | * @import {DecodedArray} from 'hyparquet' 5 | * @import {Writer} from '../src/types.js' 6 | * @param {Writer} writer 7 | * @param {DecodedArray} values 8 | * @param {number} bitWidth 9 | * @returns {number} bytes written 10 | */ 11 | export function writeRleBitPackedHybrid(writer, values, bitWidth) { 12 | const offsetStart = writer.offset 13 | 14 | // try both RLE and bit-packed and choose the best 15 | const rle = new ByteWriter() 16 | writeRle(rle, values, bitWidth) 17 | const bitPacked = new ByteWriter() 18 | writeBitPacked(bitPacked, values, bitWidth) 19 | 20 | if (rle.offset < bitPacked.offset) { 21 | writer.appendBuffer(rle.getBuffer()) 22 | } else { 23 | writer.appendBuffer(bitPacked.getBuffer()) 24 | } 25 | 26 | return writer.offset - offsetStart 27 | } 28 | 29 | /** 30 | * @param {Writer} writer 31 | * @param {DecodedArray} values 32 | * @param {number} bitWidth 33 | */ 34 | function writeBitPacked(writer, values, bitWidth) { 35 | // Number of 8-value groups 36 | const numGroups = Math.ceil(values.length / 8) 37 | 38 | // The parquet bitpack header: lower bit = 1 => "bit-packed mode" 39 | // upper bits = number of groups 40 | const header = numGroups << 1 | 1 41 | 42 | // Write the header as a varint 43 | writer.appendVarInt(header) 44 | 45 | // If bitWidth = 0, no data is actually needed 46 | if (bitWidth === 0 || values.length === 0) { 47 | return 48 | } 49 | 50 | const mask = (1 << bitWidth) - 1 51 | let buffer = 0 // accumulates bits 52 | let bitsUsed = 0 // how many bits are in 'buffer' so far 53 | 54 | // Write out each value, bit-packing into buffer 55 | for (let i = 0; i < values.length; i++) { 56 | const v = values[i] & mask // mask off bits exceeding bitWidth 57 | buffer |= v << bitsUsed 58 | bitsUsed += bitWidth 59 | 60 | // Flush full bytes 61 | while (bitsUsed >= 8) { 62 | writer.appendUint8(buffer & 0xFF) 63 | buffer >>>= 8 64 | bitsUsed -= 8 65 | } 66 | } 67 | 68 | // Pad the final partial group with zeros if needed 69 | const totalNeeded = numGroups * 8 70 | for (let padCount = values.length; padCount < totalNeeded; padCount++) { 71 | // Just write a 0 value into the buffer 72 | buffer |= 0 << bitsUsed 73 | bitsUsed += bitWidth 74 | while (bitsUsed >= 8) { 75 | writer.appendUint8(buffer & 0xFF) 76 | buffer >>>= 8 77 | bitsUsed -= 8 78 | } 79 | } 80 | 81 | // Flush any remaining bits 82 | if (bitsUsed > 0) { 83 | writer.appendUint8(buffer & 0xff) 84 | } 85 | } 86 | 87 | /** 88 | * Run-length encoding: 
write repeated values by encoding the value and its count. 89 | * 90 | * @param {Writer} writer 91 | * @param {DecodedArray} values 92 | * @param {number} bitWidth 93 | */ 94 | function writeRle(writer, values, bitWidth) { 95 | if (!values.length) return 96 | 97 | let currentValue = values[0] 98 | let count = 1 99 | 100 | for (let i = 1; i <= values.length; i++) { 101 | if (i < values.length && values[i] === currentValue) { 102 | count++ // continue the run 103 | } else { 104 | // write the count of repeated values 105 | const header = count << 1 106 | writer.appendVarInt(header) 107 | 108 | // write the value 109 | const width = bitWidth + 7 >> 3 // bytes needed 110 | for (let j = 0; j < width; j++) { 111 | writer.appendUint8(currentValue >> (j << 3) & 0xff) 112 | } 113 | 114 | // reset for the next run 115 | if (i < values.length) { 116 | currentValue = values[i] 117 | count = 1 118 | } 119 | } 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /test/schema.test.js: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from 'vitest' 2 | import { autoSchemaElement, getMaxDefinitionLevel, getMaxRepetitionLevel, schemaFromColumnData } from '../src/schema.js' 3 | 4 | /** 5 | * @import {SchemaElement} from 'hyparquet' 6 | */ 7 | 8 | describe('schemaFromColumnData', () => { 9 | it('honours provided type with nullable = false → REQUIRED', () => { 10 | const schema = schemaFromColumnData({ 11 | columnData: [ 12 | { name: 'id', data: new Int32Array([1, 2, 3]), type: 'INT32', nullable: false }, 13 | ], 14 | }) 15 | expect(schema[1]).toEqual({ name: 'id', type: 'INT32', repetition_type: 'REQUIRED' }) 16 | }) 17 | 18 | it('applies valid schema override verbatim', () => { 19 | const schema = schemaFromColumnData({ 20 | columnData: [{ name: 'strings', data: ['a', 'b'] }], 21 | schemaOverrides: { 22 | strings: { 23 | name: 'strings', 24 | type: 'BYTE_ARRAY', 25 | converted_type: 'UTF8', 26 | repetition_type: 'OPTIONAL', 27 | }, 28 | }, 29 | }) 30 | expect(schema[1].name).toBe('strings') 31 | expect(schema[1].type).toBe('BYTE_ARRAY') 32 | expect(schema[1].converted_type).toBe('UTF8') 33 | expect(schema[1].repetition_type).toBe('OPTIONAL') 34 | }) 35 | 36 | it('throws when column lengths differ', () => { 37 | expect(() => 38 | schemaFromColumnData({ 39 | columnData: [ 40 | { name: 'a', data: new Int32Array([1]) }, 41 | { name: 'b', data: new Int32Array([1, 2]) }, 42 | ], 43 | }) 44 | ).toThrow(/columns must have the same length/) 45 | }) 46 | 47 | it('rejects override with mismatched name', () => { 48 | expect(() => 49 | schemaFromColumnData({ 50 | columnData: [{ name: 'x', data: new Int32Array([1]) }], 51 | schemaOverrides: { x: { name: 'y', type: 'INT32' } }, 52 | }) 53 | ).toThrow(/does not match column name/) 54 | }) 55 | }) 56 | 57 | describe('autoSchemaElement', () => { 58 | it.each([ 59 | [new Int32Array([1, 2]), 'INT32'], 60 | [new BigInt64Array([1n, 2n]), 'INT64'], 61 | [new Float32Array([1, 2]), 'FLOAT'], 62 | [new Float64Array([1, 2]), 'DOUBLE'], 63 | ])('detects typed arrays (%#)', (data, expected) => { 64 | const el = autoSchemaElement('col', data) 65 | expect(el.type).toBe(expected) 66 | expect(el.repetition_type).toBe('REQUIRED') 67 | }) 68 | 69 | it('promotes INT32 + DOUBLE mix to DOUBLE', () => { 70 | const el = autoSchemaElement('mix', [1, 2.5]) 71 | expect(el.type).toBe('DOUBLE') 72 | }) 73 | 74 | it('sets repetition_type OPTIONAL when nulls present', () => { 75 | const el 
= autoSchemaElement('maybe', [null, 1]) 76 | expect(el.repetition_type).toBe('OPTIONAL') 77 | }) 78 | 79 | it('falls back to BYTE_ARRAY for empty arrays', () => { 80 | const el = autoSchemaElement('empty', []) 81 | expect(el.type).toBe('BYTE_ARRAY') 82 | expect(el.repetition_type).toBe('OPTIONAL') 83 | }) 84 | 85 | it('throws on incompatible mixed scalar types', () => { 86 | expect(() => autoSchemaElement('bad', [1, 'a'])).toThrow(/mixed types/) 87 | }) 88 | }) 89 | 90 | describe('level helpers', () => { 91 | /** @type {SchemaElement[]} */ 92 | const path = [ 93 | { name: 'root', repetition_type: 'REPEATED' }, 94 | { name: 'child', repetition_type: 'OPTIONAL' }, 95 | { name: 'leaf', repetition_type: 'REPEATED' }, 96 | ] 97 | 98 | it('computes max repetition level', () => { 99 | expect(getMaxRepetitionLevel(path)).toBe(2) 100 | }) 101 | 102 | it('computes max definition level', () => { 103 | expect(getMaxDefinitionLevel(path)).toBe(2) 104 | }) 105 | }) 106 | -------------------------------------------------------------------------------- /test/bytewriter.test.js: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from 'vitest' 2 | import { ByteWriter } from '../src/bytewriter.js' 3 | 4 | describe('ByteWriter', () => { 5 | it('initializes with correct defaults', () => { 6 | const writer = new ByteWriter() 7 | expect(writer.offset).toBe(0) 8 | expect(writer.buffer.byteLength).toBe(1024) 9 | }) 10 | 11 | it('appendUint8 writes single byte', () => { 12 | const writer = new ByteWriter() 13 | writer.appendUint8(255) 14 | expect(new Uint8Array(writer.getBuffer())).toEqual(new Uint8Array([0xff])) 15 | }) 16 | 17 | it('appendUint32 writes a 32-bit integer in little-endian', () => { 18 | const writer = new ByteWriter() 19 | writer.appendUint32(0x12345678) 20 | expect(new Uint8Array(writer.getBuffer())).toEqual( 21 | new Uint8Array([0x78, 0x56, 0x34, 0x12]) 22 | ) 23 | }) 24 | 25 | it('appendInt32 writes signed 32-bit integer in little-endian', () => { 26 | const writer = new ByteWriter() 27 | writer.appendInt32(-1) 28 | expect(new Uint8Array(writer.getBuffer())).toEqual( 29 | new Uint8Array([0xff, 0xff, 0xff, 0xff]) 30 | ) 31 | }) 32 | 33 | it('appendInt64 writes a 64-bit bigint in little-endian', () => { 34 | const writer = new ByteWriter() 35 | writer.appendInt64(0x1122334455667788n) 36 | expect(new Uint8Array(writer.getBuffer())).toEqual( 37 | new Uint8Array([0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11]) 38 | ) 39 | }) 40 | 41 | it('appendFloat64 writes a 64-bit float in little-endian', () => { 42 | const writer = new ByteWriter() 43 | writer.appendFloat64(1.0) 44 | expect(new Uint8Array(writer.getBuffer())).toEqual( 45 | new Uint8Array([0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f]) 46 | ) 47 | }) 48 | 49 | it('appendBytes writes raw Uint8Array data', () => { 50 | const writer = new ByteWriter() 51 | writer.appendBytes(new Uint8Array([1, 2, 3, 4])) 52 | expect(new Uint8Array(writer.getBuffer())).toEqual(new Uint8Array([1, 2, 3, 4])) 53 | }) 54 | 55 | it('appendBuffer writes raw ArrayBuffer data', () => { 56 | const writer = new ByteWriter() 57 | const buf = new Uint8Array([10, 20, 30]).buffer 58 | writer.appendBuffer(buf) 59 | expect(new Uint8Array(writer.getBuffer())).toEqual(new Uint8Array([10, 20, 30])) 60 | }) 61 | 62 | it('appendVarInt encodes 32-bit varint', () => { 63 | const writer = new ByteWriter() 64 | writer.appendVarInt(127) 65 | writer.appendVarInt(128) 66 | writer.appendVarInt(300) 67 | 68 | expect(new 
Uint8Array(writer.getBuffer())).toEqual( 69 | new Uint8Array([ 70 | 0x7f, // 127 71 | 0x80, 0x01, // 128 72 | 0xac, 0x02, // 300 73 | ]) 74 | ) 75 | }) 76 | 77 | it('appendVarBigInt encodes bigint varint', () => { 78 | const writer = new ByteWriter() 79 | writer.appendVarBigInt(127n) 80 | writer.appendVarBigInt(128n) 81 | writer.appendVarBigInt(300n) 82 | 83 | expect(new Uint8Array(writer.getBuffer())).toEqual( 84 | new Uint8Array([ 85 | 0x7f, // 127 86 | 0x80, 0x01, // 128 87 | 0xac, 0x02, // 300 88 | ]) 89 | ) 90 | }) 91 | 92 | it('expands buffer automatically when needed', () => { 93 | const writer = new ByteWriter() 94 | // force expansion by writing more than initial 1024 bytes 95 | const largeArray = new Uint8Array(2000).fill(0xaa) 96 | writer.appendBytes(largeArray) 97 | expect(writer.buffer.byteLength).toBeGreaterThanOrEqual(2000) 98 | expect(new Uint8Array(writer.getBuffer()).length).toBe(2000) 99 | }) 100 | 101 | it('finish does nothing but is callable', () => { 102 | const writer = new ByteWriter() 103 | writer.finish() 104 | expect(writer.getBuffer().byteLength).toBe(0) 105 | }) 106 | }) 107 | -------------------------------------------------------------------------------- /test/write.delta.test.js: -------------------------------------------------------------------------------- 1 | import { parquetMetadata, parquetReadObjects } from 'hyparquet' 2 | import { describe, expect, it } from 'vitest' 3 | import { parquetWriteBuffer } from '../src/index.js' 4 | 5 | describe('DELTA_BINARY_PACKED encoding', () => { 6 | it('writes DELTA_BINARY_PACKED encoding for INT32', async () => { 7 | const data = [1, 2, 3, 100, 200, 300] 8 | const file = parquetWriteBuffer({ 9 | columnData: [{ name: 'int', data, encoding: 'DELTA_BINARY_PACKED' }], 10 | }) 11 | const metadata = parquetMetadata(file) 12 | expect(metadata.row_groups[0].columns[0].meta_data?.encodings).toEqual(['DELTA_BINARY_PACKED']) 13 | const result = await parquetReadObjects({ file }) 14 | expect(result).toEqual(data.map(int => ({ int }))) 15 | }) 16 | 17 | it('writes DELTA_BINARY_PACKED encoding for INT64', async () => { 18 | const data = [1n, 2n, 3n, 100n, 200n, 300n] 19 | const file = parquetWriteBuffer({ 20 | columnData: [{ name: 'bigint', data, encoding: 'DELTA_BINARY_PACKED' }], 21 | }) 22 | const metadata = parquetMetadata(file) 23 | expect(metadata.row_groups[0].columns[0].meta_data?.encodings).toEqual(['DELTA_BINARY_PACKED']) 24 | const result = await parquetReadObjects({ file }) 25 | expect(result).toEqual(data.map(bigint => ({ bigint }))) 26 | }) 27 | }) 28 | 29 | describe('DELTA_LENGTH_BYTE_ARRAY encoding', () => { 30 | it('writes DELTA_LENGTH_BYTE_ARRAY encoding for strings', async () => { 31 | const data = ['hello', 'world', 'foo', 'bar', 'baz', 'qux'] 32 | const file = parquetWriteBuffer({ 33 | columnData: [{ name: 'string', data, encoding: 'DELTA_LENGTH_BYTE_ARRAY' }], 34 | }) 35 | const metadata = parquetMetadata(file) 36 | expect(metadata.row_groups[0].columns[0].meta_data?.encodings).toEqual(['DELTA_LENGTH_BYTE_ARRAY']) 37 | const result = await parquetReadObjects({ file }) 38 | expect(result).toEqual(data.map(string => ({ string }))) 39 | }) 40 | 41 | it('writes DELTA_LENGTH_BYTE_ARRAY encoding for byte arrays', async () => { 42 | const data = [ 43 | Uint8Array.of(1, 2, 3), 44 | Uint8Array.of(4, 5, 6, 7), 45 | Uint8Array.of(8, 9), 46 | Uint8Array.of(10, 11, 12, 13, 14), 47 | ] 48 | const file = parquetWriteBuffer({ 49 | columnData: [{ name: 'bytes', data, encoding: 'DELTA_LENGTH_BYTE_ARRAY' }], 50 | }) 51 | const 
metadata = parquetMetadata(file) 52 | expect(metadata.row_groups[0].columns[0].meta_data?.encodings).toEqual(['DELTA_LENGTH_BYTE_ARRAY']) 53 | const result = await parquetReadObjects({ file, utf8: false }) 54 | expect(result).toEqual(data.map(bytes => ({ bytes }))) 55 | }) 56 | }) 57 | 58 | describe('DELTA_BYTE_ARRAY encoding', () => { 59 | it('writes DELTA_BYTE_ARRAY encoding for strings with common prefixes', async () => { 60 | const data = ['apple', 'application', 'apply', 'banana', 'band', 'bandana'] 61 | const file = parquetWriteBuffer({ 62 | columnData: [{ name: 'string', data, encoding: 'DELTA_BYTE_ARRAY' }], 63 | }) 64 | const metadata = parquetMetadata(file) 65 | expect(metadata.row_groups[0].columns[0].meta_data?.encodings).toEqual(['DELTA_BYTE_ARRAY']) 66 | const result = await parquetReadObjects({ file }) 67 | expect(result).toEqual(data.map(string => ({ string }))) 68 | }) 69 | 70 | it('writes DELTA_BYTE_ARRAY encoding for byte arrays', async () => { 71 | const data = [ 72 | Uint8Array.of(1, 2, 3, 4), 73 | Uint8Array.of(1, 2, 5, 6), 74 | Uint8Array.of(1, 2, 7, 8), 75 | Uint8Array.of(10, 11, 12, 13), 76 | ] 77 | const file = parquetWriteBuffer({ 78 | columnData: [{ name: 'bytes', data, encoding: 'DELTA_BYTE_ARRAY' }], 79 | }) 80 | const metadata = parquetMetadata(file) 81 | expect(metadata.row_groups[0].columns[0].meta_data?.encodings).toEqual(['DELTA_BYTE_ARRAY']) 82 | const result = await parquetReadObjects({ file, utf8: false }) 83 | expect(result).toEqual(data.map(bytes => ({ bytes }))) 84 | }) 85 | }) 86 | -------------------------------------------------------------------------------- /test/write.splitstream.test.js: -------------------------------------------------------------------------------- 1 | import { parquetMetadata, parquetReadObjects } from 'hyparquet' 2 | import { describe, expect, it } from 'vitest' 3 | import { parquetWriteBuffer } from '../src/index.js' 4 | 5 | describe('BYTE_STREAM_SPLIT encoding', () => { 6 | it('writes BYTE_STREAM_SPLIT encoding for FLOAT', async () => { 7 | const data = [1.5, 2.25, 3.125, -4.5, 0.0, 100.75] 8 | const file = parquetWriteBuffer({ 9 | columnData: [{ name: 'float', data, encoding: 'BYTE_STREAM_SPLIT' }], 10 | }) 11 | const metadata = parquetMetadata(file) 12 | const columnMetadata = metadata.row_groups[0].columns[0].meta_data 13 | expect(columnMetadata?.encodings).toEqual(['BYTE_STREAM_SPLIT']) 14 | const result = await parquetReadObjects({ file }) 15 | expect(result).toEqual(data.map(float => ({ float }))) 16 | }) 17 | 18 | it('writes BYTE_STREAM_SPLIT encoding for DOUBLE', async () => { 19 | const data = [1.5, 2.25, 3.125, -4.5, 0.0, 100.75, 1e100, -1e-100] 20 | const file = parquetWriteBuffer({ 21 | columnData: [{ name: 'double', data, type: 'DOUBLE', encoding: 'BYTE_STREAM_SPLIT' }], 22 | }) 23 | const metadata = parquetMetadata(file) 24 | const columnMetadata = metadata.row_groups[0].columns[0].meta_data 25 | expect(columnMetadata?.encodings).toEqual(['BYTE_STREAM_SPLIT']) 26 | const result = await parquetReadObjects({ file }) 27 | expect(result).toEqual(data.map(double => ({ double }))) 28 | }) 29 | 30 | it('writes BYTE_STREAM_SPLIT encoding for INT32', async () => { 31 | const data = [1, 2, 3, -100, 0, 2147483647, -2147483648] 32 | const file = parquetWriteBuffer({ 33 | columnData: [{ name: 'int', data, encoding: 'BYTE_STREAM_SPLIT' }], 34 | }) 35 | const metadata = parquetMetadata(file) 36 | const columnMetadata = metadata.row_groups[0].columns[0].meta_data 37 | 
expect(columnMetadata?.encodings).toEqual(['BYTE_STREAM_SPLIT']) 38 | const result = await parquetReadObjects({ file }) 39 | expect(result).toEqual(data.map(int => ({ int }))) 40 | }) 41 | 42 | it('writes BYTE_STREAM_SPLIT encoding for INT64', async () => { 43 | const data = [1n, 2n, 3n, -100n, 0n, 9223372036854775807n, -9223372036854775808n] 44 | const file = parquetWriteBuffer({ 45 | columnData: [{ name: 'bigint', data, encoding: 'BYTE_STREAM_SPLIT' }], 46 | }) 47 | const metadata = parquetMetadata(file) 48 | const columnMetadata = metadata.row_groups[0].columns[0].meta_data 49 | expect(columnMetadata?.encodings).toEqual(['BYTE_STREAM_SPLIT']) 50 | const result = await parquetReadObjects({ file }) 51 | expect(result).toEqual(data.map(bigint => ({ bigint }))) 52 | }) 53 | 54 | it('writes BYTE_STREAM_SPLIT encoding with nulls', async () => { 55 | const data = [1.5, null, 3.125, null, 0.0, 100.75] 56 | const file = parquetWriteBuffer({ 57 | columnData: [{ name: 'float', data, encoding: 'BYTE_STREAM_SPLIT' }], 58 | }) 59 | const metadata = parquetMetadata(file) 60 | const columnMetadata = metadata.row_groups[0].columns[0].meta_data 61 | expect(columnMetadata?.encodings).toContain('BYTE_STREAM_SPLIT') 62 | const result = await parquetReadObjects({ file }) 63 | expect(result).toEqual(data.map(float => ({ float }))) 64 | }) 65 | 66 | it('writes BYTE_STREAM_SPLIT encoding with compression', async () => { 67 | const data = Array.from({ length: 1000 }, (_, i) => i * 0.1) 68 | const file = parquetWriteBuffer({ 69 | columnData: [{ name: 'float', data, encoding: 'BYTE_STREAM_SPLIT' }], 70 | }) 71 | const metadata = parquetMetadata(file) 72 | const columnMetadata = metadata.row_groups[0].columns[0].meta_data 73 | expect(columnMetadata?.encodings).toEqual(['BYTE_STREAM_SPLIT']) 74 | expect(columnMetadata?.codec).toBe('SNAPPY') 75 | const result = await parquetReadObjects({ file }) 76 | expect(result.length).toBe(1000) 77 | result.forEach((row, i) => { 78 | expect(row.float).toBeCloseTo(i * 0.1, 5) 79 | }) 80 | }) 81 | }) 82 | -------------------------------------------------------------------------------- /test/splitstream.test.js: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from 'vitest' 2 | import { ByteWriter } from '../src/bytewriter.js' 3 | import { writeByteStreamSplit } from '../src/splitstream.js' 4 | import { byteStreamSplit } from 'hyparquet/src/encoding.js' 5 | 6 | /** 7 | * @import {DecodedArray, ParquetType} from 'hyparquet' 8 | * @param {DecodedArray} values 9 | * @param {ParquetType} type 10 | * @param {number} [typeLength] 11 | * @returns {DecodedArray} 12 | */ 13 | function roundTrip(values, type, typeLength) { 14 | const writer = new ByteWriter() 15 | writeByteStreamSplit(writer, values, type, typeLength) 16 | const buffer = writer.getBuffer() 17 | const reader = { view: new DataView(buffer), offset: 0 } 18 | return byteStreamSplit(reader, values.length, type, typeLength) 19 | } 20 | 21 | describe('BYTE_STREAM_SPLIT encoding', () => { 22 | describe('FLOAT', () => { 23 | it('should round-trip float values', () => { 24 | const original = [1.5, 2.25, 3.125, -4.5, 0.0, 100.75] 25 | expect(Array.from(roundTrip(original, 'FLOAT'))).toEqual(original) 26 | }) 27 | 28 | it('should round-trip an empty array', () => { 29 | expect(Array.from(roundTrip([], 'FLOAT'))).toEqual([]) 30 | }) 31 | 32 | it('should round-trip special float values', () => { 33 | const decoded = roundTrip([0.0, -0.0, Infinity, -Infinity], 'FLOAT') 34 | 
expect(decoded[0]).toBe(0.0) 35 | expect(decoded[1]).toBe(-0.0) 36 | expect(decoded[2]).toBe(Infinity) 37 | expect(decoded[3]).toBe(-Infinity) 38 | }) 39 | }) 40 | 41 | describe('DOUBLE', () => { 42 | it('should round-trip double values', () => { 43 | const original = [1.5, 2.25, 3.125, -4.5, 0.0, 100.75, 1e100, -1e-100] 44 | expect(Array.from(roundTrip(original, 'DOUBLE'))).toEqual(original) 45 | }) 46 | 47 | it('should round-trip an empty array', () => { 48 | expect(Array.from(roundTrip([], 'DOUBLE'))).toEqual([]) 49 | }) 50 | }) 51 | 52 | describe('INT32', () => { 53 | it('should round-trip int32 values', () => { 54 | const original = [1, 2, 3, -100, 0, 2147483647, -2147483648] 55 | expect(Array.from(roundTrip(original, 'INT32'))).toEqual(original) 56 | }) 57 | 58 | it('should round-trip an empty array', () => { 59 | expect(Array.from(roundTrip([], 'INT32'))).toEqual([]) 60 | }) 61 | }) 62 | 63 | describe('INT64', () => { 64 | it('should round-trip int64 values', () => { 65 | const original = [1n, 2n, 3n, -100n, 0n, 9223372036854775807n, -9223372036854775808n] 66 | expect(Array.from(roundTrip(original, 'INT64'))).toEqual(original) 67 | }) 68 | 69 | it('should round-trip an empty array', () => { 70 | expect(Array.from(roundTrip([], 'INT64'))).toEqual([]) 71 | }) 72 | }) 73 | 74 | describe('FIXED_LEN_BYTE_ARRAY', () => { 75 | it('should round-trip fixed-length byte arrays', () => { 76 | const original = [ 77 | new Uint8Array([1, 2, 3, 4]), 78 | new Uint8Array([5, 6, 7, 8]), 79 | new Uint8Array([9, 10, 11, 12]), 80 | ] 81 | const decoded = roundTrip(original, 'FIXED_LEN_BYTE_ARRAY', 4) 82 | expect(decoded).toHaveLength(3) 83 | expect(Array.from(decoded[0])).toEqual([1, 2, 3, 4]) 84 | expect(Array.from(decoded[1])).toEqual([5, 6, 7, 8]) 85 | expect(Array.from(decoded[2])).toEqual([9, 10, 11, 12]) 86 | }) 87 | 88 | it('should round-trip an empty array', () => { 89 | const decoded = roundTrip([], 'FIXED_LEN_BYTE_ARRAY', 4) 90 | expect(Array.from(decoded)).toEqual([]) 91 | }) 92 | 93 | it('should throw without typeLength', () => { 94 | const writer = new ByteWriter() 95 | expect(() => writeByteStreamSplit(writer, [], 'FIXED_LEN_BYTE_ARRAY', undefined)) 96 | .toThrow('missing type_length') 97 | }) 98 | }) 99 | 100 | describe('errors', () => { 101 | it('should throw for unsupported type', () => { 102 | const writer = new ByteWriter() 103 | expect(() => writeByteStreamSplit(writer, [], 'BOOLEAN', undefined)) 104 | .toThrow('unsupported type') 105 | }) 106 | }) 107 | }) 108 | -------------------------------------------------------------------------------- /src/dremel.js: -------------------------------------------------------------------------------- 1 | import { getMaxDefinitionLevel } from './schema.js' 2 | 3 | /** 4 | * Encode nested list values into repetition and definition levels. 
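 * Repetition levels mark whether a value starts a new row or continues an enclosing list (0 = new row); definition levels count how many OPTIONAL/REPEATED ancestors are actually present, so nulls and empty lists can be reconstructed when reading.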
5 | * 6 | * @import {SchemaElement} from 'hyparquet' 7 | * @import {PageData} from '../src/types.js' 8 | * @param {SchemaElement[]} schemaPath schema elements from root to leaf 9 | * @param {any[]} rows column data for the current row group 10 | * @returns {PageData} encoded list values 11 | */ 12 | export function encodeListValues(schemaPath, rows) { 13 | if (schemaPath.length < 2) throw new Error('parquet list schema path must include column') 14 | /** @type {any[]} */ 15 | const values = [] 16 | /** @type {number[]} */ 17 | const definitionLevels = [] 18 | /** @type {number[]} */ 19 | const repetitionLevels = [] 20 | 21 | // Track repetition depth prior to each level 22 | const repLevelPrior = new Array(schemaPath.length) 23 | let repeatedCount = 0 24 | for (let i = 0; i < schemaPath.length; i++) { 25 | repLevelPrior[i] = repeatedCount 26 | if (schemaPath[i].repetition_type === 'REPEATED') repeatedCount++ 27 | } 28 | 29 | const leafIndex = schemaPath.length - 1 30 | const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath) 31 | 32 | for (let row = 0; row < rows.length; row++) { 33 | visit(1, rows[row], 0, 0, false) 34 | } 35 | 36 | const numNulls = definitionLevels.reduce( 37 | (count, def) => def === maxDefinitionLevel ? count : count + 1, 38 | 0 39 | ) 40 | 41 | return { values, definitionLevels, repetitionLevels, numNulls } 42 | 43 | /** 44 | * Recursively walk the schema path, emitting definition/repetition pairs. 45 | * 46 | * @param {number} depth index into schemaPath 47 | * @param {any} value value at the current depth 48 | * @param {number} defLevel definition level accumulated so far 49 | * @param {number} repLevel repetition level for the next emitted slot 50 | * @param {boolean} allowNull whether the current value is allowed to be null 51 | */ 52 | function visit(depth, value, defLevel, repLevel, allowNull) { 53 | const element = schemaPath[depth] 54 | const repetition = element.repetition_type || 'REQUIRED' 55 | const isLeaf = depth === leafIndex 56 | 57 | if (isLeaf) { 58 | if (value === null || value === undefined) { 59 | if (repetition === 'REQUIRED' && !allowNull) { 60 | throw new Error('parquet required value is undefined') 61 | } 62 | definitionLevels.push(defLevel) 63 | repetitionLevels.push(repLevel) 64 | values.push(null) 65 | } else { 66 | const finalDef = repetition === 'REQUIRED' ? defLevel : defLevel + 1 67 | definitionLevels.push(finalDef) 68 | repetitionLevels.push(repLevel) 69 | values.push(value) 70 | } 71 | return 72 | } 73 | 74 | if (repetition === 'REPEATED') { 75 | if (value === null || value === undefined) { 76 | if (!allowNull) throw new Error('parquet required value is undefined') 77 | visit(depth + 1, undefined, defLevel, repLevel, true) 78 | return 79 | } 80 | if (!Array.isArray(value)) { 81 | throw new Error(`parquet repeated field ${element.name} must be an array`) 82 | } 83 | if (!value.length) { 84 | visit(depth + 1, undefined, defLevel, repLevel, true) 85 | return 86 | } 87 | for (let i = 0; i < value.length; i++) { 88 | const childRep = i === 0 ? 
repLevel : repLevelPrior[depth] + 1 89 | visit(depth + 1, value[i], defLevel + 1, childRep, false) 90 | } 91 | return 92 | } 93 | 94 | if (repetition === 'OPTIONAL') { 95 | if (value === null || value === undefined) { 96 | visit(depth + 1, undefined, defLevel, repLevel, true) 97 | } else { 98 | visit(depth + 1, value, defLevel + 1, repLevel, false) 99 | } 100 | return 101 | } 102 | 103 | // REQUIRED 104 | if (value === null || value === undefined) { 105 | if (!allowNull) throw new Error('parquet required value is undefined') 106 | visit(depth + 1, undefined, defLevel, repLevel, true) 107 | } else { 108 | visit(depth + 1, value, defLevel, repLevel, false) 109 | } 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /test/thrift.test.js: -------------------------------------------------------------------------------- 1 | import { deserializeTCompactProtocol } from 'hyparquet/src/thrift.js' 2 | import { describe, expect, it } from 'vitest' 3 | import { serializeTCompactProtocol } from '../src/thrift.js' 4 | import { ByteWriter } from '../src/bytewriter.js' 5 | import { logicalType } from '../src/metadata.js' 6 | 7 | /** 8 | * Utility to decode a Thrift-serialized buffer and return the parsed object. 9 | * @param {ArrayBuffer} buf 10 | * @returns {Record} 11 | */ 12 | function roundTripDeserialize(buf) { 13 | const view = new DataView(buf) 14 | const reader = { view, offset: 0 } 15 | return deserializeTCompactProtocol(reader) 16 | } 17 | 18 | describe('serializeTCompactProtocol', () => { 19 | it('serializes basic types correctly', () => { 20 | const data = { 21 | field_1: true, // BOOL -> TRUE 22 | field_2: false, // BOOL -> FALSE 23 | field_3: 127, // BYTE / I32 24 | field_4: 0x7fff, // I16 / I32 25 | field_5: 0x7fffffff, // I32 26 | field_6: BigInt('0x7fffffffffffffff'), // I64 27 | field_7: 123.456, // DOUBLE 28 | field_8: 'Hello, Thrift!', 29 | field_9: new TextEncoder().encode('Hello, Thrift!'), 30 | } 31 | 32 | const writer = new ByteWriter() 33 | serializeTCompactProtocol(writer, data) 34 | const result = roundTripDeserialize(writer.getBuffer()) 35 | 36 | expect(result.field_1).toBe(true) 37 | expect(result.field_2).toBe(false) 38 | expect(result.field_3).toBe(127) 39 | expect(result.field_4).toBe(0x7fff) 40 | expect(result.field_5).toBe(0x7fffffff) 41 | expect(result.field_6).toBe(BigInt('0x7fffffffffffffff')) 42 | expect(result.field_7).toBeCloseTo(123.456) 43 | // Decode the binary back into a string 44 | const decoder = new TextDecoder() 45 | expect(decoder.decode(result.field_8)).toBe('Hello, Thrift!') 46 | expect(decoder.decode(result.field_9)).toBe('Hello, Thrift!') 47 | }) 48 | 49 | it('serializes a nested STRUCT and LIST of booleans', () => { 50 | const data = { 51 | field_1: { 52 | field_1: 42, 53 | field_2: { 54 | field_1: true, 55 | field_2: false, 56 | }, 57 | }, 58 | // List of booleans 59 | field_2: [true, false, true, false], 60 | } 61 | 62 | const writer = new ByteWriter() 63 | serializeTCompactProtocol(writer, data) 64 | const result = roundTripDeserialize(writer.getBuffer()) 65 | 66 | expect(result.field_1.field_1).toBe(42) 67 | expect(result.field_1.field_2.field_1).toBe(true) 68 | expect(result.field_1.field_2.field_2).toBe(false) 69 | expect(result.field_2).toEqual([true, false, true, false]) 70 | }) 71 | 72 | it('handles empty object (only STOP)', () => { 73 | const data = {} 74 | const writer = new ByteWriter() 75 | serializeTCompactProtocol(writer, data) 76 | const arr = new Uint8Array(writer.getBuffer()) 77 | // 
The entire buffer should just be [0x00] = STOP 78 | expect(arr).toEqual(new Uint8Array([0x00])) 79 | 80 | // Round-trip: should deserialize to an empty object 81 | const result = roundTripDeserialize(writer.getBuffer()) 82 | expect(result).toEqual({}) 83 | }) 84 | 85 | it('throws on non-monotonic field IDs', () => { 86 | const invalidData = { 87 | field_2: 2, 88 | field_1: 1, // field_1 is out of order (less than field_2) 89 | } 90 | const writer = new ByteWriter() 91 | expect(() => serializeTCompactProtocol(writer, invalidData)).toThrow() 92 | }) 93 | 94 | it('serializes field IDs with gaps larger than 15', () => { 95 | const data = { field_1: 1, field_17: 17 } 96 | const writer = new ByteWriter() 97 | serializeTCompactProtocol(writer, data) 98 | const result = roundTripDeserialize(writer.getBuffer()) 99 | expect(result.field_1).toBe(1) 100 | expect(result.field_17).toBe(17) 101 | }) 102 | 103 | it('serializes GEOMETRY logicalType struct with field_17', () => { 104 | const data = { field_1: logicalType({ type: 'GEOMETRY' }) } 105 | const writer = new ByteWriter() 106 | serializeTCompactProtocol(writer, data) 107 | const result = roundTripDeserialize(writer.getBuffer()) 108 | expect(result.field_1.field_17).toEqual({}) 109 | }) 110 | }) 111 | -------------------------------------------------------------------------------- /src/plain.js: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @import {DecodedArray, ParquetType} from 'hyparquet' 4 | * @import {Writer} from '../src/types.js' 5 | * @param {Writer} writer 6 | * @param {DecodedArray} values 7 | * @param {ParquetType} type 8 | * @param {number | undefined} fixedLength 9 | */ 10 | export function writePlain(writer, values, type, fixedLength) { 11 | if (type === 'BOOLEAN') { 12 | writePlainBoolean(writer, values) 13 | } else if (type === 'INT32') { 14 | writePlainInt32(writer, values) 15 | } else if (type === 'INT64') { 16 | writePlainInt64(writer, values) 17 | } else if (type === 'FLOAT') { 18 | writePlainFloat(writer, values) 19 | } else if (type === 'DOUBLE') { 20 | writePlainDouble(writer, values) 21 | } else if (type === 'BYTE_ARRAY') { 22 | writePlainByteArray(writer, values) 23 | } else if (type === 'FIXED_LEN_BYTE_ARRAY') { 24 | if (!fixedLength) throw new Error('parquet FIXED_LEN_BYTE_ARRAY expected type_length') 25 | writePlainByteArrayFixed(writer, values, fixedLength) 26 | } else { 27 | throw new Error(`parquet unsupported type: ${type}`) 28 | } 29 | } 30 | 31 | /** 32 | * @param {Writer} writer 33 | * @param {DecodedArray} values 34 | */ 35 | function writePlainBoolean(writer, values) { 36 | let currentByte = 0 37 | 38 | for (let i = 0; i < values.length; i++) { 39 | if (typeof values[i] !== 'boolean') throw new Error('parquet expected boolean value') 40 | const bitOffset = i % 8 41 | 42 | if (values[i]) { 43 | currentByte |= 1 << bitOffset 44 | } 45 | 46 | // once we've packed 8 bits or are at a multiple of 8, we write out the byte 47 | if (bitOffset === 7) { 48 | writer.appendUint8(currentByte) 49 | currentByte = 0 50 | } 51 | } 52 | 53 | // if the array length is not a multiple of 8, write the leftover bits 54 | if (values.length % 8 !== 0) { 55 | writer.appendUint8(currentByte) 56 | } 57 | } 58 | 59 | /** 60 | * @param {Writer} writer 61 | * @param {DecodedArray} values 62 | */ 63 | function writePlainInt32(writer, values) { 64 | for (const value of values) { 65 | if (!Number.isSafeInteger(value)) throw new Error('parquet expected integer value') 66 | 
writer.appendInt32(value) 67 | } 68 | } 69 | 70 | /** 71 | * @param {Writer} writer 72 | * @param {DecodedArray} values 73 | */ 74 | function writePlainInt64(writer, values) { 75 | for (const value of values) { 76 | if (typeof value !== 'bigint') throw new Error('parquet expected bigint value') 77 | writer.appendInt64(value) 78 | } 79 | } 80 | 81 | /** 82 | * @param {Writer} writer 83 | * @param {DecodedArray} values 84 | */ 85 | function writePlainFloat(writer, values) { 86 | for (const value of values) { 87 | if (typeof value !== 'number') throw new Error('parquet expected number value') 88 | writer.appendFloat32(value) 89 | } 90 | } 91 | 92 | /** 93 | * @param {Writer} writer 94 | * @param {DecodedArray} values 95 | */ 96 | function writePlainDouble(writer, values) { 97 | for (const value of values) { 98 | if (typeof value !== 'number') throw new Error('parquet expected number value') 99 | writer.appendFloat64(value) 100 | } 101 | } 102 | 103 | /** 104 | * @param {Writer} writer 105 | * @param {DecodedArray} values 106 | */ 107 | function writePlainByteArray(writer, values) { 108 | for (const value of values) { 109 | let bytes = value 110 | if (typeof bytes === 'string') { 111 | // convert string to Uint8Array 112 | bytes = new TextEncoder().encode(value) 113 | } 114 | if (!(bytes instanceof Uint8Array)) { 115 | throw new Error('parquet expected Uint8Array value') 116 | } 117 | writer.appendUint32(bytes.length) 118 | writer.appendBytes(bytes) 119 | } 120 | } 121 | 122 | /** 123 | * @param {Writer} writer 124 | * @param {DecodedArray} values 125 | * @param {number} fixedLength 126 | */ 127 | function writePlainByteArrayFixed(writer, values, fixedLength) { 128 | for (const value of values) { 129 | if (!(value instanceof Uint8Array)) throw new Error('parquet expected Uint8Array value') 130 | if (value.length !== fixedLength) throw new Error(`parquet expected Uint8Array of length ${fixedLength}`) 131 | writer.appendBytes(value) 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /src/bytewriter.js: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * Generic buffered writer. 4 | * Writes data to an auto-expanding ArrayBuffer. 5 | * 6 | * @import {Writer} from '../src/types.js' 7 | * @returns {Writer} 8 | */ 9 | export function ByteWriter() { 10 | this.buffer = new ArrayBuffer(1024) 11 | this.view = new DataView(this.buffer) 12 | this.offset = 0 // bytes written 13 | this.index = 0 // index in buffer 14 | return this 15 | } 16 | 17 | /** 18 | * @param {number} size 19 | */ 20 | ByteWriter.prototype.ensure = function(size) { 21 | // auto-expanding buffer 22 | if (this.index + size > this.buffer.byteLength) { 23 | const newSize = Math.max(this.buffer.byteLength * 2, this.index + size) 24 | const newBuffer = new ArrayBuffer(newSize) 25 | // TODO: save buffers until later and merge once? 
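// double the capacity (or jump straight to the required size) and copy the existing bytes, so repeated appends stay amortized O(1)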
26 | new Uint8Array(newBuffer).set(new Uint8Array(this.buffer)) 27 | this.buffer = newBuffer 28 | this.view = new DataView(this.buffer) 29 | } 30 | } 31 | 32 | ByteWriter.prototype.finish = function() { 33 | } 34 | 35 | ByteWriter.prototype.getBuffer = function() { 36 | return this.buffer.slice(0, this.index) 37 | } 38 | 39 | /** 40 | * @param {number} value 41 | */ 42 | ByteWriter.prototype.appendUint8 = function(value) { 43 | this.ensure(this.index + 1) 44 | this.view.setUint8(this.index, value) 45 | this.offset++ 46 | this.index++ 47 | } 48 | 49 | /** 50 | * @param {number} value 51 | */ 52 | ByteWriter.prototype.appendUint32 = function(value) { 53 | this.ensure(this.index + 4) 54 | this.view.setUint32(this.index, value, true) 55 | this.offset += 4 56 | this.index += 4 57 | } 58 | 59 | /** 60 | * @param {number} value 61 | */ 62 | ByteWriter.prototype.appendInt32 = function(value) { 63 | this.ensure(this.index + 4) 64 | this.view.setInt32(this.index, value, true) 65 | this.offset += 4 66 | this.index += 4 67 | } 68 | 69 | /** 70 | * @param {bigint} value 71 | */ 72 | ByteWriter.prototype.appendInt64 = function(value) { 73 | this.ensure(this.index + 8) 74 | this.view.setBigInt64(this.index, BigInt(value), true) 75 | this.offset += 8 76 | this.index += 8 77 | } 78 | 79 | /** 80 | * @param {number} value 81 | */ 82 | ByteWriter.prototype.appendFloat32 = function(value) { 83 | this.ensure(this.index + 8) 84 | this.view.setFloat32(this.index, value, true) 85 | this.offset += 4 86 | this.index += 4 87 | } 88 | 89 | /** 90 | * @param {number} value 91 | */ 92 | ByteWriter.prototype.appendFloat64 = function(value) { 93 | this.ensure(this.index + 8) 94 | this.view.setFloat64(this.index, value, true) 95 | this.offset += 8 96 | this.index += 8 97 | } 98 | 99 | /** 100 | * @param {ArrayBuffer} value 101 | */ 102 | ByteWriter.prototype.appendBuffer = function(value) { 103 | this.appendBytes(new Uint8Array(value)) 104 | } 105 | 106 | /** 107 | * @param {Uint8Array} value 108 | */ 109 | ByteWriter.prototype.appendBytes = function(value) { 110 | this.ensure(this.index + value.length) 111 | new Uint8Array(this.buffer, this.index, value.length).set(value) 112 | this.offset += value.length 113 | this.index += value.length 114 | } 115 | 116 | /** 117 | * Convert a 32-bit signed integer to varint (1-5 bytes). 118 | * Writes out groups of 7 bits at a time, setting high bit if more to come. 119 | * 120 | * @param {number} value 121 | */ 122 | ByteWriter.prototype.appendVarInt = function(value) { 123 | while (true) { 124 | if ((value & ~0x7f) === 0) { 125 | // fits in 7 bits 126 | this.appendUint8(value) 127 | return 128 | } else { 129 | // write 7 bits and set high bit 130 | this.appendUint8(value & 0x7f | 0x80) 131 | value >>>= 7 132 | } 133 | } 134 | } 135 | 136 | /** 137 | * Convert a bigint to varint (1-10 bytes for 64-bit range). 138 | * 139 | * @param {bigint} value 140 | */ 141 | ByteWriter.prototype.appendVarBigInt = function(value) { 142 | while (true) { 143 | if ((value & ~0x7fn) === 0n) { 144 | // fits in 7 bits 145 | this.appendUint8(Number(value)) 146 | return 147 | } else { 148 | // write 7 bits and set high bit 149 | this.appendUint8(Number(value & 0x7fn | 0x80n)) 150 | value >>= 7n 151 | } 152 | } 153 | } 154 | 155 | /** 156 | * Convert number to zigzag encoding and write as varint. 
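 * ZigZag interleaves signed values so small magnitudes stay small: 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, 2 -> 4, ...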
157 | * 158 | * @param {number | bigint} value 159 | */ 160 | ByteWriter.prototype.appendZigZag = function(value) { 161 | if (typeof value === 'number') { 162 | this.appendVarInt(value << 1 ^ value >> 31) 163 | } else { 164 | this.appendVarBigInt(value << 1n ^ value >> 63n) 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /src/wkb.js: -------------------------------------------------------------------------------- 1 | import { ByteWriter } from './bytewriter.js' 2 | 3 | /** 4 | * Serialize a GeoJSON geometry into ISO WKB. 5 | * 6 | * @import {Geometry, Position} from 'hyparquet/src/types.js' 7 | * @param {Geometry} geometry 8 | * @returns {Uint8Array} 9 | */ 10 | export function geojsonToWkb(geometry) { 11 | const writer = new ByteWriter() 12 | writeGeometry(writer, geometry) 13 | return new Uint8Array(writer.getBuffer()) 14 | } 15 | 16 | /** 17 | * @param {ByteWriter} writer 18 | * @param {Geometry} geometry 19 | */ 20 | function writeGeometry(writer, geometry) { 21 | const typeCode = geometryTypeCode(geometry.type) 22 | 23 | // infer dimensions 24 | const dim = inferGeometryDimensions(geometry) 25 | let flag = 0 26 | if (dim === 3) flag = 1 27 | else if (dim === 4) flag = 3 28 | else if (dim > 4) throw new Error(`unsupported geometry dimensions: ${dim}`) 29 | 30 | writer.appendUint8(1) // little endian 31 | writer.appendUint32(typeCode + flag * 1000) 32 | 33 | if (geometry.type === 'Point') { 34 | writePosition(writer, geometry.coordinates, dim) 35 | } else if (geometry.type === 'LineString') { 36 | writeLine(writer, geometry.coordinates, dim) 37 | } else if (geometry.type === 'Polygon') { 38 | writer.appendUint32(geometry.coordinates.length) 39 | for (const ring of geometry.coordinates) { 40 | writeLine(writer, ring, dim) 41 | } 42 | } else if (geometry.type === 'MultiPoint') { 43 | writer.appendUint32(geometry.coordinates.length) 44 | for (const coordinates of geometry.coordinates) { 45 | writeGeometry(writer, { type: 'Point', coordinates }) 46 | } 47 | } else if (geometry.type === 'MultiLineString') { 48 | writer.appendUint32(geometry.coordinates.length) 49 | for (const coordinates of geometry.coordinates) { 50 | writeGeometry(writer, { type: 'LineString', coordinates }) 51 | } 52 | } else if (geometry.type === 'MultiPolygon') { 53 | writer.appendUint32(geometry.coordinates.length) 54 | for (const coordinates of geometry.coordinates) { 55 | writeGeometry(writer, { type: 'Polygon', coordinates }) 56 | } 57 | } else if (geometry.type === 'GeometryCollection') { 58 | writer.appendUint32(geometry.geometries.length) 59 | for (const child of geometry.geometries) { 60 | writeGeometry(writer, child) 61 | } 62 | } else { 63 | throw new Error('unsupported geometry type') 64 | } 65 | } 66 | 67 | /** 68 | * @param {ByteWriter} writer 69 | * @param {Position} position 70 | * @param {number} dim 71 | */ 72 | function writePosition(writer, position, dim) { 73 | if (position.length < dim) { 74 | throw new Error('geometry position dimensions mismatch') 75 | } 76 | for (let i = 0; i < dim; i++) { 77 | writer.appendFloat64(position[i]) 78 | } 79 | } 80 | 81 | /** 82 | * @param {ByteWriter} writer 83 | * @param {Position[]} coordinates 84 | * @param {number} dim 85 | */ 86 | function writeLine(writer, coordinates, dim) { 87 | writer.appendUint32(coordinates.length) 88 | for (const position of coordinates) { 89 | writePosition(writer, position, dim) 90 | } 91 | } 92 | 93 | /** 94 | * @param {Geometry['type']} type 95 | * @returns {number} 96 | */ 97 | 
function geometryTypeCode(type) { 98 | if (type === 'Point') return 1 99 | if (type === 'LineString') return 2 100 | if (type === 'Polygon') return 3 101 | if (type === 'MultiPoint') return 4 102 | if (type === 'MultiLineString') return 5 103 | if (type === 'MultiPolygon') return 6 104 | if (type === 'GeometryCollection') return 7 105 | throw new Error(`unknown geometry type: ${type}`) 106 | } 107 | 108 | /** 109 | * Determine the maximum coordinate dimensions for the geometry. 110 | * 111 | * @param {Geometry} geometry 112 | * @returns {number} 113 | */ 114 | function inferGeometryDimensions(geometry) { 115 | if (geometry.type === 'GeometryCollection') { 116 | let maxDim = 0 117 | for (const child of geometry.geometries) { 118 | maxDim = Math.max(maxDim, inferGeometryDimensions(child)) 119 | } 120 | return maxDim || 2 121 | } 122 | return inferCoordinateDimensions(geometry.coordinates) 123 | } 124 | 125 | /** 126 | * @param {any} value 127 | * @returns {number} 128 | */ 129 | function inferCoordinateDimensions(value) { 130 | if (!Array.isArray(value)) return 2 131 | if (!value.length) return 2 132 | if (typeof value[0] === 'number') return value.length 133 | let maxDim = 0 134 | for (const item of value) { 135 | maxDim = Math.max(maxDim, inferCoordinateDimensions(item)) 136 | } 137 | return maxDim || 2 138 | } 139 | -------------------------------------------------------------------------------- /src/geospatial.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Compute geospatial statistics for GEOMETRY and GEOGRAPHY columns. 3 | * 4 | * @import {BoundingBox, DecodedArray, Geometry, GeospatialStatistics} from 'hyparquet/src/types.js' 5 | * @param {DecodedArray} values 6 | * @returns {GeospatialStatistics | undefined} 7 | */ 8 | export function geospatialStatistics(values) { 9 | /** @type {Set} */ 10 | const typeCodes = new Set() 11 | /** @type {BoundingBox | undefined} */ 12 | let bbox 13 | 14 | for (const value of values) { 15 | if (value === null || value === undefined) continue 16 | if (typeof value !== 'object') { 17 | throw new Error('geospatial column expects GeoJSON geometries') 18 | } 19 | bbox = extendBoundsFromGeometry(bbox, value) 20 | typeCodes.add(geometryTypeCodeWithDimension(value)) 21 | } 22 | 23 | if (typeCodes.size || bbox) { 24 | return { 25 | bbox, 26 | // Geospatial type codes of all instances, or an empty list if not known 27 | geospatial_types: typeCodes.size ? Array.from(typeCodes).sort((a, b) => a - b) : [], 28 | } 29 | } 30 | } 31 | 32 | /** 33 | * @param {BoundingBox | undefined} bbox 34 | * @param {Geometry} geometry 35 | * @returns {BoundingBox | undefined} 36 | */ 37 | function extendBoundsFromGeometry(bbox, geometry) { 38 | if (geometry.type === 'GeometryCollection') { 39 | for (const child of geometry.geometries || []) { 40 | bbox = extendBoundsFromGeometry(bbox, child) 41 | } 42 | return bbox 43 | } 44 | return extendBoundsFromCoordinates(bbox, geometry.coordinates) 45 | } 46 | 47 | /** 48 | * @param {BoundingBox | undefined} bbox 49 | * @param {any[]} coordinates 50 | * @returns {BoundingBox | undefined} 51 | */ 52 | function extendBoundsFromCoordinates(bbox, coordinates) { 53 | if (typeof coordinates[0] === 'number') { 54 | return grow(bbox, coordinates) 55 | } 56 | for (const child of coordinates) { 57 | bbox = extendBoundsFromCoordinates(bbox, child) 58 | } 59 | return bbox 60 | } 61 | 62 | /** 63 | * Initialize or expand bbox with a single position [x,y,(z),(m)]. 
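 * Positions with non-finite x or y are ignored; z and m only extend the box when present and finite.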
64 | * @param {BoundingBox | undefined} bbox 65 | * @param {number[]} position 66 | * @returns {BoundingBox | undefined} 67 | */ 68 | function grow(bbox, position) { 69 | const x = position[0] 70 | const y = position[1] 71 | if (!Number.isFinite(x) || !Number.isFinite(y)) return bbox 72 | 73 | if (!bbox) { 74 | bbox = { xmin: x, ymin: y, xmax: x, ymax: y } 75 | } else { 76 | updateAxis(bbox, 'xmin', 'xmax', x) 77 | updateAxis(bbox, 'ymin', 'ymax', y) 78 | } 79 | 80 | if (position.length > 2) updateAxis(bbox, 'zmin', 'zmax', position[2]) 81 | if (position.length > 3) updateAxis(bbox, 'mmin', 'mmax', position[3]) 82 | return bbox 83 | } 84 | 85 | /** 86 | * @param {BoundingBox} bbox 87 | * @param {'xmin' | 'ymin' | 'zmin' | 'mmin'} minKey 88 | * @param {'xmax' | 'ymax' | 'zmax' | 'mmax'} maxKey 89 | * @param {number | undefined} value 90 | */ 91 | function updateAxis(bbox, minKey, maxKey, value) { 92 | if (value === undefined || !Number.isFinite(value)) return 93 | if (bbox[minKey] === undefined || value < bbox[minKey]) bbox[minKey] = value 94 | if (bbox[maxKey] === undefined || value > bbox[maxKey]) bbox[maxKey] = value 95 | } 96 | 97 | /** 98 | * @param {Geometry} geometry 99 | * @returns {number} 100 | */ 101 | function geometryTypeCodeWithDimension(geometry) { 102 | const base = geometryTypeCodes[geometry.type] 103 | if (base === undefined) throw new Error(`unknown geometry type: ${geometry.type}`) 104 | const dim = inferGeometryDimensions(geometry) 105 | if (dim === 2) return base 106 | if (dim === 3) return base + 1000 107 | if (dim === 4) return base + 3000 108 | throw new Error(`unsupported geometry dimensions: ${dim}`) 109 | } 110 | 111 | const geometryTypeCodes = { 112 | Point: 1, 113 | LineString: 2, 114 | Polygon: 3, 115 | MultiPoint: 4, 116 | MultiLineString: 5, 117 | MultiPolygon: 6, 118 | GeometryCollection: 7, 119 | } 120 | 121 | /** 122 | * Determine the maximum coordinate dimensions for the geometry. 
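 * Returns the longest position length found (2, 3, or 4 in practice), defaulting to 2 for empty geometries.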
123 | * @param {Geometry} geometry 124 | * @returns {number} 125 | */ 126 | function inferGeometryDimensions(geometry) { 127 | if (geometry.type === 'GeometryCollection') { 128 | let maxDim = 0 129 | for (const child of geometry.geometries || []) { 130 | maxDim = Math.max(maxDim, inferGeometryDimensions(child)) 131 | } 132 | return maxDim || 2 133 | } 134 | return inferCoordinateDimensions(geometry.coordinates) 135 | } 136 | 137 | /** 138 | * @param {any[]} value 139 | * @returns {number} 140 | */ 141 | function inferCoordinateDimensions(value) { 142 | if (!value.length) return 2 143 | if (typeof value[0] === 'number') return value.length 144 | let maxDim = 0 145 | for (const item of value) { 146 | maxDim = Math.max(maxDim, inferCoordinateDimensions(item)) 147 | } 148 | return maxDim || 2 149 | } 150 | -------------------------------------------------------------------------------- /test/write.schema.test.js: -------------------------------------------------------------------------------- 1 | import { parquetMetadata } from 'hyparquet' 2 | import { describe, expect, it } from 'vitest' 3 | import { parquetWriteBuffer, schemaFromColumnData } from '../src/index.js' 4 | 5 | describe('parquet schema', () => { 6 | it('auto detects types', () => { 7 | const file = parquetWriteBuffer({ columnData: [ 8 | { name: 'strings', data: ['1', '2', '3'] }, 9 | ] }) 10 | const metadata = parquetMetadata(file) 11 | expect(metadata.schema).toEqual([ 12 | { 13 | name: 'root', 14 | num_children: 1, 15 | }, 16 | { 17 | converted_type: 'UTF8', 18 | name: 'strings', 19 | repetition_type: 'REQUIRED', 20 | type: 'BYTE_ARRAY', 21 | }, 22 | ]) 23 | }) 24 | 25 | it('accepts basic type hints', () => { 26 | const file = parquetWriteBuffer({ columnData: [ 27 | { 28 | name: 'timestamps', 29 | data: [new Date(1000000), new Date(2000000), new Date(3000000)], 30 | type: 'TIMESTAMP', 31 | }, 32 | ] }) 33 | const metadata = parquetMetadata(file) 34 | expect(metadata.schema).toEqual([ 35 | { 36 | name: 'root', 37 | num_children: 1, 38 | }, 39 | { 40 | converted_type: 'TIMESTAMP_MILLIS', 41 | name: 'timestamps', 42 | repetition_type: 'OPTIONAL', 43 | type: 'INT64', 44 | }, 45 | ]) 46 | }) 47 | 48 | it('accepts nullable basic type hints', () => { 49 | const file = parquetWriteBuffer({ columnData: [ 50 | { name: 'numbers', data: [1, 2, 3], type: 'FLOAT', nullable: false }, 51 | ] }) 52 | const metadata = parquetMetadata(file) 53 | expect(metadata.schema).toEqual([ 54 | { 55 | name: 'root', 56 | num_children: 1, 57 | }, 58 | { 59 | name: 'numbers', 60 | repetition_type: 'REQUIRED', 61 | type: 'FLOAT', 62 | }, 63 | ]) 64 | }) 65 | 66 | it('allow zero rows to be auto-typed', () => { 67 | const file = parquetWriteBuffer({ columnData: [ 68 | { name: 'numbers', data: [] }, 69 | ] }) 70 | const metadata = parquetMetadata(file) 71 | expect(metadata.schema).toEqual([ 72 | { 73 | name: 'root', 74 | num_children: 1, 75 | }, 76 | { 77 | name: 'numbers', 78 | repetition_type: 'OPTIONAL', 79 | type: 'BYTE_ARRAY', 80 | }, 81 | ]) 82 | }) 83 | 84 | it('allow entirely null columns to be auto-typed', () => { 85 | const file = parquetWriteBuffer({ columnData: [ 86 | { name: 'numbers', data: [null, null, null] }, 87 | ] }) 88 | const metadata = parquetMetadata(file) 89 | expect(metadata.schema).toEqual([ 90 | { 91 | name: 'root', 92 | num_children: 1, 93 | }, 94 | { 95 | name: 'numbers', 96 | repetition_type: 'OPTIONAL', 97 | type: 'BYTE_ARRAY', 98 | }, 99 | ]) 100 | }) 101 | 102 | it('accepts explicit schema', () => { 103 | const file = 
parquetWriteBuffer({ columnData: [ 104 | { name: 'numbers', data: [1, 2, 3] }, 105 | ], schema: [ 106 | { name: 'root', num_children: 1 }, 107 | { name: 'numbers', type: 'FLOAT', repetition_type: 'REQUIRED' }, 108 | ] }) 109 | const metadata = parquetMetadata(file) 110 | expect(metadata.schema).toEqual([ 111 | { 112 | name: 'root', 113 | num_children: 1, 114 | }, 115 | { 116 | name: 'numbers', 117 | repetition_type: 'REQUIRED', 118 | type: 'FLOAT', 119 | }, 120 | ]) 121 | }) 122 | 123 | it('accepts schema override', () => { 124 | const columnData = [ 125 | { name: 'numbers', data: [1, 2, 3] }, 126 | ] 127 | const file = parquetWriteBuffer({ 128 | columnData, 129 | schema: schemaFromColumnData({ 130 | columnData, 131 | schemaOverrides: { 132 | numbers: { 133 | name: 'numbers', 134 | type: 'DOUBLE', 135 | repetition_type: 'OPTIONAL', 136 | field_id: 1, 137 | }, 138 | }, 139 | }), 140 | }) 141 | const metadata = parquetMetadata(file) 142 | expect(metadata.schema).toEqual([ 143 | { 144 | name: 'root', 145 | num_children: 1, 146 | }, 147 | { 148 | field_id: 1, 149 | name: 'numbers', 150 | repetition_type: 'OPTIONAL', 151 | type: 'DOUBLE', 152 | }, 153 | ]) 154 | }) 155 | 156 | it('throws if basic types conflict with schema', () => { 157 | expect(() => { 158 | parquetWriteBuffer({ 159 | columnData: [ 160 | { name: 'numbers', data: [1, 2, 3], type: 'FLOAT' }, 161 | ], 162 | schema: [ 163 | { name: 'root', num_children: 1 }, 164 | { name: 'numbers', type: 'DOUBLE', repetition_type: 'OPTIONAL' }, 165 | ], 166 | }) 167 | }).toThrow('cannot provide both schema and columnData type') 168 | }) 169 | }) 170 | -------------------------------------------------------------------------------- /test/plain.test.js: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from 'vitest' 2 | import { ByteWriter } from '../src/bytewriter.js' 3 | import { writePlain } from '../src/plain.js' 4 | 5 | describe('writePlain', () => { 6 | it('writes BOOLEAN (multiple of 8 bits, plus leftover)', () => { 7 | const writer = new ByteWriter() 8 | const booleans = [true, false, true, true, false, false, false, true, true] 9 | writePlain(writer, booleans, 'BOOLEAN', undefined) 10 | 11 | expect(writer.offset).toBe(2) 12 | expect(writer.view.getUint8(0)).toBe(0b10001101) 13 | expect(writer.view.getUint8(1)).toBe(0b00000001) 14 | }) 15 | 16 | it('writes INT32', () => { 17 | const writer = new ByteWriter() 18 | const ints = [0, 1, 255, 256, 65535, -1, -2147483648, 2147483647] 19 | writePlain(writer, ints, 'INT32', undefined) 20 | 21 | // 4 bytes per int 22 | expect(writer.offset).toBe(4 * ints.length) 23 | 24 | for (let i = 0; i < ints.length; i++) { 25 | const value = writer.view.getInt32(i * 4, true) 26 | expect(value).toBe(ints[i]) 27 | } 28 | }) 29 | 30 | it('writes INT64', () => { 31 | const writer = new ByteWriter() 32 | const bigints = [0n, 1n, 42n, BigInt(2 ** 53 - 1)] 33 | writePlain(writer, bigints, 'INT64', undefined) 34 | 35 | // 8 bytes per int64 36 | expect(writer.offset).toBe(8 * bigints.length) 37 | 38 | for (let i = 0; i < bigints.length; i++) { 39 | const value = writer.view.getBigInt64(i * 8, true) 40 | expect(value).toBe(bigints[i]) 41 | } 42 | }) 43 | 44 | it('writes FLOAT', () => { 45 | const writer = new ByteWriter() 46 | const floats = [0, 300.5, -2.7100000381469727, Infinity, -Infinity, NaN] 47 | writePlain(writer, floats, 'FLOAT', undefined) 48 | 49 | // 4 bytes per float 50 | expect(writer.offset).toBe(4 * floats.length) 51 | 52 | for (let i = 0; 
i < floats.length; i++) { 53 | const val = writer.view.getFloat32(i * 4, true) 54 | if (Number.isNaN(floats[i])) { 55 | expect(Number.isNaN(val)).toBe(true) 56 | } else { 57 | expect(val).toBe(floats[i]) 58 | } 59 | } 60 | }) 61 | 62 | it('writes DOUBLE', () => { 63 | const writer = new ByteWriter() 64 | const doubles = [0, 3.14, -2.71, Infinity, -Infinity, NaN] 65 | writePlain(writer, doubles, 'DOUBLE', undefined) 66 | 67 | // 8 bytes per double 68 | expect(writer.offset).toBe(8 * doubles.length) 69 | 70 | for (let i = 0; i < doubles.length; i++) { 71 | const val = writer.view.getFloat64(i * 8, true) 72 | if (Number.isNaN(doubles[i])) { 73 | expect(Number.isNaN(val)).toBe(true) 74 | } else { 75 | expect(val).toBe(doubles[i]) 76 | } 77 | } 78 | }) 79 | 80 | it('writes BYTE_ARRAY', () => { 81 | const writer = new ByteWriter() 82 | const strings = ['a', 'b', 'c', 'd'] 83 | writePlain(writer, strings, 'BYTE_ARRAY', undefined) 84 | 85 | let offset = 0 86 | for (const s of strings) { 87 | const length = writer.view.getUint32(offset, true) 88 | expect(length).toBe(s.length) 89 | offset += 4 90 | 91 | for (let i = 0; i < s.length; i++) { 92 | expect(writer.view.getUint8(offset)).toBe(s.charCodeAt(i)) 93 | offset += 1 94 | } 95 | } 96 | }) 97 | 98 | it('writes FIXED_LENGTH_BYTE_ARRAY', () => { 99 | const writer = new ByteWriter() 100 | const encoder = new TextEncoder() 101 | const strings = ['abcd', 'efgh', 'ijkl'] 102 | .map(s => encoder.encode(s)) 103 | writePlain(writer, strings, 'FIXED_LEN_BYTE_ARRAY', 4) 104 | 105 | let offset = 0 106 | for (const s of strings) { 107 | for (let i = 0; i < s.length; i++) { 108 | expect(writer.view.getUint8(offset)).toBe(s[i]) 109 | offset += 1 110 | } 111 | } 112 | }) 113 | 114 | it('throws error on unsupported type', () => { 115 | const writer = new ByteWriter() 116 | expect(() => writePlain(writer, [1, 2, 3], 'INT96', undefined)) 117 | .toThrow(/parquet unsupported type/i) 118 | }) 119 | 120 | it('throws error on type mismatch', () => { 121 | const writer = new ByteWriter() 122 | expect(() => writePlain(writer, [1, 2, 3], 'BOOLEAN', undefined)) 123 | .toThrow('parquet expected boolean value') 124 | expect(() => writePlain(writer, [1, 2, 3.5], 'INT32', undefined)) 125 | .toThrow('parquet expected integer value') 126 | expect(() => writePlain(writer, [1n, 2n, 3], 'INT64', undefined)) 127 | .toThrow('parquet expected bigint value') 128 | expect(() => writePlain(writer, [1, 2, 3n], 'FLOAT', undefined)) 129 | .toThrow('parquet expected number value') 130 | expect(() => writePlain(writer, [1, 2, 3n], 'DOUBLE', undefined)) 131 | .toThrow('parquet expected number value') 132 | expect(() => writePlain(writer, [1, 2, 3], 'BYTE_ARRAY', undefined)) 133 | .toThrow('parquet expected Uint8Array value') 134 | expect(() => writePlain(writer, [1, 2, 3], 'FIXED_LEN_BYTE_ARRAY', undefined)) 135 | .toThrow('parquet FIXED_LEN_BYTE_ARRAY expected type_length') 136 | expect(() => writePlain(writer, [1, 2, 3], 'FIXED_LEN_BYTE_ARRAY', 16)) 137 | .toThrow('parquet expected Uint8Array value') 138 | }) 139 | }) 140 | -------------------------------------------------------------------------------- /test/example.js: -------------------------------------------------------------------------------- 1 | /** @type {ColumnSource[]} */ 2 | export const exampleData = [ 3 | { name: 'bool', data: [true, false, true, false] }, 4 | { name: 'int', data: [0, 127, 0x7fff, 0x7fffffff] }, 5 | { name: 'bigint', data: [0n, 127n, 0x7fffn, 0x7fffffffffffffffn] }, 6 | { name: 'float', data: [0, 0.0001, 
123.456, 1e100], type: 'FLOAT', nullable: false }, 7 | { name: 'double', data: [0, 0.0001, 123.456, 1e100] }, 8 | { name: 'string', data: ['a', 'b', 'c', 'd'] }, 9 | { name: 'nullable', data: [true, false, null, null] }, 10 | ] 11 | 12 | /** 13 | * @import {FileMetaData} from 'hyparquet' 14 | * @import {ColumnSource} from '../src/types.js' 15 | * @type {FileMetaData} 16 | */ 17 | export const exampleMetadata = { 18 | version: 2, 19 | created_by: 'hyparquet', 20 | schema: [ 21 | { name: 'root', num_children: 7 }, 22 | { name: 'bool', type: 'BOOLEAN', repetition_type: 'REQUIRED' }, 23 | { name: 'int', type: 'INT32', repetition_type: 'REQUIRED' }, 24 | { name: 'bigint', type: 'INT64', repetition_type: 'REQUIRED' }, 25 | { name: 'float', type: 'FLOAT', repetition_type: 'REQUIRED' }, 26 | { name: 'double', type: 'DOUBLE', repetition_type: 'REQUIRED' }, 27 | { name: 'string', type: 'BYTE_ARRAY', repetition_type: 'REQUIRED', converted_type: 'UTF8' }, 28 | { name: 'nullable', type: 'BOOLEAN', repetition_type: 'OPTIONAL' }, 29 | ], 30 | num_rows: 4n, 31 | row_groups: [{ 32 | columns: [ 33 | { 34 | file_offset: 4n, 35 | meta_data: { 36 | type: 'BOOLEAN', 37 | encodings: ['PLAIN'], 38 | path_in_schema: ['bool'], 39 | codec: 'SNAPPY', 40 | num_values: 4n, 41 | total_uncompressed_size: 24n, 42 | total_compressed_size: 24n, 43 | data_page_offset: 4n, 44 | statistics: { 45 | null_count: 0n, 46 | min_value: false, 47 | max_value: true, 48 | }, 49 | }, 50 | }, 51 | { 52 | file_offset: 28n, 53 | meta_data: { 54 | type: 'INT32', 55 | encodings: ['PLAIN'], 56 | path_in_schema: ['int'], 57 | codec: 'SNAPPY', 58 | num_values: 4n, 59 | total_uncompressed_size: 39n, 60 | total_compressed_size: 39n, 61 | data_page_offset: 28n, 62 | statistics: { 63 | null_count: 0n, 64 | min_value: 0, 65 | max_value: 0x7fffffff, 66 | }, 67 | }, 68 | }, 69 | { 70 | file_offset: 67n, 71 | meta_data: { 72 | type: 'INT64', 73 | encodings: ['PLAIN'], 74 | path_in_schema: ['bigint'], 75 | codec: 'SNAPPY', 76 | num_values: 4n, 77 | total_uncompressed_size: 43n, 78 | total_compressed_size: 43n, 79 | data_page_offset: 67n, 80 | statistics: { 81 | null_count: 0n, 82 | min_value: 0n, 83 | max_value: 0x7fffffffffffffffn, 84 | }, 85 | }, 86 | }, 87 | { 88 | file_offset: 110n, 89 | meta_data: { 90 | type: 'FLOAT', 91 | encodings: ['PLAIN'], 92 | path_in_schema: ['float'], 93 | codec: 'SNAPPY', 94 | num_values: 4n, 95 | total_uncompressed_size: 39n, 96 | total_compressed_size: 39n, 97 | data_page_offset: 110n, 98 | statistics: { 99 | null_count: 0n, 100 | min_value: 0, 101 | max_value: Infinity, 102 | }, 103 | }, 104 | }, 105 | { 106 | file_offset: 149n, 107 | meta_data: { 108 | type: 'DOUBLE', 109 | encodings: ['PLAIN'], 110 | path_in_schema: ['double'], 111 | codec: 'SNAPPY', 112 | num_values: 4n, 113 | total_uncompressed_size: 51n, 114 | total_compressed_size: 51n, 115 | data_page_offset: 149n, 116 | statistics: { 117 | null_count: 0n, 118 | min_value: 0, 119 | max_value: 1e100, 120 | }, 121 | }, 122 | }, 123 | { 124 | file_offset: 200n, 125 | meta_data: { 126 | type: 'BYTE_ARRAY', 127 | encodings: ['PLAIN'], 128 | path_in_schema: ['string'], 129 | codec: 'SNAPPY', 130 | num_values: 4n, 131 | total_uncompressed_size: 42n, 132 | total_compressed_size: 42n, 133 | data_page_offset: 200n, 134 | statistics: { 135 | null_count: 0n, 136 | min_value: 'a', 137 | max_value: 'd', 138 | }, 139 | }, 140 | }, 141 | { 142 | file_offset: 242n, 143 | meta_data: { 144 | type: 'BOOLEAN', 145 | encodings: ['PLAIN'], 146 | path_in_schema: ['nullable'], 147 | 
codec: 'SNAPPY', 148 | num_values: 4n, 149 | total_uncompressed_size: 26n, 150 | total_compressed_size: 26n, 151 | data_page_offset: 242n, 152 | statistics: { 153 | null_count: 2n, 154 | min_value: false, 155 | max_value: true, 156 | }, 157 | }, 158 | }, 159 | ], 160 | total_byte_size: 264n, 161 | num_rows: 4n, 162 | }], 163 | metadata_length: 445, 164 | } 165 | -------------------------------------------------------------------------------- /test/write.multipage.test.js: -------------------------------------------------------------------------------- 1 | import { parquetReadObjects } from 'hyparquet' 2 | import { describe, expect, it } from 'vitest' 3 | import { parquetWriteBuffer } from '../src/index.js' 4 | 5 | /** @import {ColumnSource} from '../src/types.js' */ 6 | 7 | describe('parquetWrite multi-page', () => { 8 | it('writes with small pageSize and data is still readable', async () => { 9 | // Generate enough data to span multiple pages with a small pageSize 10 | const numRows = 1000 11 | /** @type {ColumnSource[]} */ 12 | const columnData = [ 13 | { name: 'id', data: Array.from({ length: numRows }, (_, i) => i), type: 'INT32' }, 14 | { name: 'value', data: Array.from({ length: numRows }, (_, i) => i * 2), type: 'INT32' }, 15 | ] 16 | 17 | // Use a very small page size to force multiple pages 18 | // Each INT32 is 4 bytes, so 100 bytes should hold about 25 values per page 19 | const buffer = parquetWriteBuffer({ 20 | columnData, 21 | pageSize: 100, 22 | }) 23 | 24 | // Read back the data 25 | const rows = await parquetReadObjects({ file: buffer }) 26 | 27 | expect(rows.length).toBe(numRows) 28 | expect(rows[0]).toEqual({ id: 0, value: 0 }) 29 | expect(rows[999]).toEqual({ id: 999, value: 1998 }) 30 | }) 31 | 32 | it('handles various data types with pageSize', async () => { 33 | const numRows = 500 34 | /** @type {ColumnSource[]} */ 35 | const columnData = [ 36 | { name: 'int32', data: Array.from({ length: numRows }, (_, i) => i), type: 'INT32' }, 37 | { name: 'int64', data: Array.from({ length: numRows }, (_, i) => BigInt(i)), type: 'INT64' }, 38 | { name: 'float', data: Array.from({ length: numRows }, (_, i) => i * 0.5), type: 'FLOAT' }, 39 | { name: 'double', data: Array.from({ length: numRows }, (_, i) => i * 0.5), type: 'DOUBLE' }, 40 | { name: 'bool', data: Array.from({ length: numRows }, (_, i) => i % 2 === 0), type: 'BOOLEAN' }, 41 | ] 42 | 43 | const buffer = parquetWriteBuffer({ 44 | columnData, 45 | pageSize: 200, 46 | statistics: true, 47 | }) 48 | 49 | const rows = await parquetReadObjects({ file: buffer }) 50 | 51 | expect(rows.length).toBe(numRows) 52 | expect(rows[0].int32).toBe(0) 53 | expect(rows[0].bool).toBe(true) 54 | expect(rows[1].bool).toBe(false) 55 | }) 56 | 57 | it('handles strings with pageSize', async () => { 58 | const numRows = 100 59 | const strings = Array.from({ length: numRows }, (_, i) => `string_value_${i}`) 60 | /** @type {ColumnSource[]} */ 61 | const columnData = [ 62 | { name: 'id', data: Array.from({ length: numRows }, (_, i) => i), type: 'INT32' }, 63 | { name: 'str', data: strings, type: 'STRING' }, 64 | ] 65 | 66 | const buffer = parquetWriteBuffer({ 67 | columnData, 68 | pageSize: 200, 69 | }) 70 | 71 | const rows = await parquetReadObjects({ file: buffer }) 72 | 73 | expect(rows.length).toBe(numRows) 74 | expect(rows[0].str).toBe('string_value_0') 75 | expect(rows[99].str).toBe('string_value_99') 76 | }) 77 | 78 | it('handles nulls with pageSize', async () => { 79 | const numRows = 200 80 | /** @type {ColumnSource[]} */ 81 | const 
columnData = [ 82 | { name: 'id', data: Array.from({ length: numRows }, (_, i) => i), type: 'INT32' }, 83 | { name: 'nullable', data: Array.from({ length: numRows }, (_, i) => i % 3 === 0 ? null : i), type: 'INT32', nullable: true }, 84 | ] 85 | 86 | const buffer = parquetWriteBuffer({ 87 | columnData, 88 | pageSize: 100, 89 | }) 90 | 91 | const rows = await parquetReadObjects({ file: buffer }) 92 | 93 | expect(rows.length).toBe(numRows) 94 | expect(rows[0].nullable).toBe(null) 95 | expect(rows[1].nullable).toBe(1) 96 | expect(rows[2].nullable).toBe(2) 97 | expect(rows[3].nullable).toBe(null) 98 | }) 99 | 100 | it('works without pageSize (backwards compatibility)', async () => { 101 | const numRows = 100 102 | /** @type {ColumnSource[]} */ 103 | const columnData = [ 104 | { name: 'id', data: Array.from({ length: numRows }, (_, i) => i), type: 'INT32' }, 105 | ] 106 | 107 | // No pageSize specified 108 | const buffer = parquetWriteBuffer({ columnData }) 109 | 110 | const rows = await parquetReadObjects({ file: buffer }) 111 | expect(rows.length).toBe(numRows) 112 | }) 113 | 114 | it('handles single value per page edge case', async () => { 115 | const numRows = 10 116 | /** @type {ColumnSource[]} */ 117 | const columnData = [ 118 | { name: 'id', data: Array.from({ length: numRows }, (_, i) => i), type: 'INT32' }, 119 | ] 120 | 121 | // Very tiny pageSize - should still work 122 | const buffer = parquetWriteBuffer({ 123 | columnData, 124 | pageSize: 4, // exactly one INT32 125 | }) 126 | 127 | const rows = await parquetReadObjects({ file: buffer }) 128 | expect(rows.length).toBe(numRows) 129 | expect(rows.map(r => r.id)).toEqual([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) 130 | }) 131 | 132 | it('handles dictionary encoding with pageSize', async () => { 133 | // Use repeated values to trigger dictionary encoding 134 | const numRows = 500 135 | const values = ['apple', 'banana', 'cherry'] 136 | /** @type {ColumnSource[]} */ 137 | const columnData = [ 138 | { name: 'id', data: Array.from({ length: numRows }, (_, i) => i), type: 'INT32' }, 139 | { name: 'fruit', data: Array.from({ length: numRows }, (_, i) => values[i % 3]), type: 'STRING' }, 140 | ] 141 | 142 | const buffer = parquetWriteBuffer({ 143 | columnData, 144 | pageSize: 100, 145 | }) 146 | 147 | const rows = await parquetReadObjects({ file: buffer }) 148 | 149 | expect(rows.length).toBe(numRows) 150 | expect(rows[0].fruit).toBe('apple') 151 | expect(rows[1].fruit).toBe('banana') 152 | expect(rows[2].fruit).toBe('cherry') 153 | expect(rows[3].fruit).toBe('apple') 154 | }) 155 | }) 156 | -------------------------------------------------------------------------------- /src/parquet-writer.js: -------------------------------------------------------------------------------- 1 | import { getSchemaPath } from 'hyparquet/src/schema.js' 2 | import { writeColumn } from './column.js' 3 | import { writeIndexes } from './indexes.js' 4 | import { writeMetadata } from './metadata.js' 5 | import { snappyCompress } from './snappy.js' 6 | 7 | /** 8 | * ParquetWriter class allows incremental writing of parquet files. 
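 *
 * Typical flow: construct with a Writer and schema, call write() once per batch of column data (each call appends one or more row groups), then call finish() to emit the page indexes, footer metadata, and closing PAR1 magic.
 *
 * @example
 * // Minimal usage sketch (illustrative only; the column name and values are made up):
 * const writer = new ByteWriter()
 * const schema = schemaFromColumnData({ columnData: [{ name: 'id', data: [1, 2, 3] }] })
 * const pq = new ParquetWriter({ writer, schema })
 * pq.write({ columnData: [{ name: 'id', data: [1, 2, 3] }] })
 * pq.finish()
 * const file = writer.getBuffer()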
9 | * 10 | * @import {ColumnChunk, CompressionCodec, FileMetaData, KeyValue, RowGroup, SchemaElement} from 'hyparquet' 11 | * @import {ColumnEncoder, ColumnSource, Compressors, PageIndexes, Writer} from '../src/types.js' 12 | * @param {object} options 13 | * @param {Writer} options.writer 14 | * @param {SchemaElement[]} options.schema 15 | * @param {CompressionCodec} [options.codec] 16 | * @param {Compressors} [options.compressors] 17 | * @param {boolean} [options.statistics] 18 | * @param {KeyValue[]} [options.kvMetadata] 19 | */ 20 | export function ParquetWriter({ writer, schema, codec = 'SNAPPY', compressors, statistics = true, kvMetadata }) { 21 | this.writer = writer 22 | this.schema = schema 23 | this.codec = codec 24 | // Include built-in snappy as fallback 25 | this.compressors = { SNAPPY: snappyCompress, ...compressors } 26 | this.statistics = statistics 27 | this.kvMetadata = kvMetadata 28 | 29 | /** @type {RowGroup[]} */ 30 | this.row_groups = [] 31 | this.num_rows = 0n 32 | 33 | /** @type {PageIndexes[]} */ 34 | this.pendingIndexes = [] 35 | 36 | // write header PAR1 37 | this.writer.appendUint32(0x31524150) 38 | } 39 | 40 | /** 41 | * Write data to the file. 42 | * Will split data into row groups of the specified size. 43 | * 44 | * @param {object} options 45 | * @param {ColumnSource[]} options.columnData 46 | * @param {number | number[]} [options.rowGroupSize] 47 | * @param {number} [options.pageSize] 48 | */ 49 | ParquetWriter.prototype.write = function({ columnData, rowGroupSize = [100, 1000, 10000], pageSize = 1048576 }) { 50 | const columnDataRows = columnData[0]?.data?.length || 0 51 | for (const { groupStartIndex, groupSize } of groupIterator({ columnDataRows, rowGroupSize })) { 52 | const groupStartOffset = this.writer.offset 53 | /** @type {ColumnChunk[]} */ 54 | const columns = [] 55 | 56 | // write columns 57 | for (let j = 0; j < columnData.length; j++) { 58 | const { name, data, encoding, columnIndex = false, offsetIndex = false } = columnData[j] 59 | const groupData = data.slice(groupStartIndex, groupStartIndex + groupSize) 60 | 61 | const schemaTree = getSchemaPath(this.schema, [name]) 62 | // Dive into the leaf element 63 | while (true) { 64 | const child = schemaTree[schemaTree.length - 1] 65 | if (!child.element.num_children) { 66 | break 67 | } else if (child.element.num_children === 1) { 68 | schemaTree.push(child.children[0]) 69 | } else { 70 | throw new Error(`parquet column ${name} struct unsupported`) 71 | } 72 | } 73 | const schemaPath = schemaTree.map(node => node.element) 74 | const element = schemaPath.at(-1) 75 | if (!element) throw new Error(`parquet column ${name} missing schema element`) 76 | /** @type {ColumnEncoder} */ 77 | const column = { 78 | columnName: name, 79 | element, 80 | schemaPath, 81 | codec: this.codec, 82 | compressors: this.compressors, 83 | stats: this.statistics, 84 | pageSize, 85 | columnIndex, 86 | offsetIndex, 87 | encoding, 88 | } 89 | 90 | const result = writeColumn({ 91 | writer: this.writer, 92 | column, 93 | values: groupData, 94 | }) 95 | 96 | columns.push(result.chunk) 97 | this.pendingIndexes.push(result) 98 | } 99 | 100 | this.num_rows += BigInt(groupSize) 101 | this.row_groups.push({ 102 | columns, 103 | total_byte_size: BigInt(this.writer.offset - groupStartOffset), 104 | num_rows: BigInt(groupSize), 105 | }) 106 | } 107 | } 108 | 109 | /** 110 | * Finish writing the file. 
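 * Writes any pending column/offset indexes, then the thrift-encoded footer metadata,
 * and finally the trailing PAR1 magic bytes. Intended to be called once, after all write() calls.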
111 | */ 112 | ParquetWriter.prototype.finish = function() { 113 | // Write all indexes at end of file 114 | writeIndexes(this.writer, this.pendingIndexes) 115 | 116 | // write metadata 117 | /** @type {FileMetaData} */ 118 | const metadata = { 119 | version: 2, 120 | created_by: 'hyparquet', 121 | schema: this.schema, 122 | num_rows: this.num_rows, 123 | row_groups: this.row_groups, 124 | metadata_length: 0, 125 | key_value_metadata: this.kvMetadata, 126 | } 127 | // @ts-ignore don't want to actually serialize metadata_length 128 | delete metadata.metadata_length 129 | writeMetadata(this.writer, metadata) 130 | 131 | // write footer PAR1 132 | this.writer.appendUint32(0x31524150) 133 | this.writer.finish() 134 | } 135 | 136 | /** 137 | * Create an iterator for row groups based on the specified row group size. 138 | * If rowGroupSize is an array, it will return groups based on the sizes in the array. 139 | * When the array runs out, it will continue with the last size. 140 | * 141 | * @param {object} options 142 | * @param {number} options.columnDataRows - Total number of rows in the column data 143 | * @param {number | number[]} options.rowGroupSize - Size of each row group or an array of sizes 144 | * @returns {Array<{groupStartIndex: number, groupSize: number}>} 145 | */ 146 | function groupIterator({ columnDataRows, rowGroupSize }) { 147 | if (Array.isArray(rowGroupSize) && !rowGroupSize.length) { 148 | throw new Error('rowGroupSize array cannot be empty') 149 | } 150 | const groups = [] 151 | let groupIndex = 0 152 | let groupStartIndex = 0 153 | while (groupStartIndex < columnDataRows) { 154 | const size = Array.isArray(rowGroupSize) 155 | ? rowGroupSize[Math.min(groupIndex, rowGroupSize.length - 1)] 156 | : rowGroupSize 157 | const groupSize = Math.min(size, columnDataRows - groupStartIndex) 158 | groups.push({ groupStartIndex, groupSize }) 159 | groupStartIndex += size 160 | groupIndex++ 161 | } 162 | return groups 163 | } 164 | -------------------------------------------------------------------------------- /src/thrift.js: -------------------------------------------------------------------------------- 1 | import { CompactType } from 'hyparquet/src/thrift.js' 2 | 3 | /** 4 | * Serialize a JS object in TCompactProtocol format. 5 | * 6 | * Expects keys named like "field_1", "field_2", etc. in ascending order. 7 | * 8 | * @import {ThriftType} from 'hyparquet/src/types.js' 9 | * @import {Writer} from '../src/types.js' 10 | * @param {Writer} writer 11 | * @param {Record} data 12 | */ 13 | export function serializeTCompactProtocol(writer, data) { 14 | let lastFid = 0 15 | // write each field 16 | for (const [key, value] of Object.entries(data)) { 17 | if (value === undefined) continue 18 | 19 | // we expect key = "field_N" so we can extract N as the field ID 20 | const fid = parseInt(key.replace(/^field_/, ''), 10) 21 | if (Number.isNaN(fid)) { 22 | throw new Error(`thrift invalid field name: ${key}. 
Expected "field_###".`) 23 | } 24 | 25 | // write the field-begin header 26 | const type = getCompactTypeForValue(value) 27 | const delta = fid - lastFid 28 | if (delta <= 0) { 29 | throw new Error(`thrift non-monotonic field ID: fid=${fid}, lastFid=${lastFid}`) 30 | } 31 | // high nibble = delta, low nibble = type < 15 or zigzag 32 | if (delta <= 15) { 33 | writer.appendUint8(delta << 4 | type) 34 | } else { 35 | writer.appendUint8(type) 36 | writer.appendVarInt(fid << 1 ^ fid >> 15) // zigzag 37 | } 38 | 39 | // Write the field content itself 40 | writeElement(writer, type, value) 41 | 42 | lastFid = fid 43 | } 44 | 45 | // Finally write STOP 46 | writer.appendUint8(CompactType.STOP) 47 | } 48 | 49 | /** 50 | * Deduce a TCompactProtocol type from the JS value 51 | * 52 | * @param {any} value 53 | * @returns {number} CompactType 54 | */ 55 | function getCompactTypeForValue(value) { 56 | if (value === true) return CompactType.TRUE 57 | if (value === false) return CompactType.FALSE 58 | if (Number.isInteger(value)) return CompactType.I32 59 | if (typeof value === 'number') return CompactType.DOUBLE 60 | if (typeof value === 'bigint') return CompactType.I64 61 | if (typeof value === 'string') return CompactType.BINARY 62 | if (value instanceof Uint8Array) return CompactType.BINARY 63 | if (Array.isArray(value)) return CompactType.LIST 64 | if (value && typeof value === 'object') return CompactType.STRUCT 65 | throw new Error(`Cannot determine thrift compact type for: ${value}`) 66 | } 67 | 68 | /** 69 | * Write a single value of a given compact type. 70 | * 71 | * @param {Writer} writer 72 | * @param {number} type 73 | * @param {ThriftType} value 74 | */ 75 | function writeElement(writer, type, value) { 76 | // true/false is stored in the type 77 | if (type === CompactType.TRUE) return 78 | if (type === CompactType.FALSE) return 79 | if (type === CompactType.BYTE && typeof value === 'number') { 80 | writer.appendUint8(value) 81 | } else if (type === CompactType.I32 && typeof value === 'number') { 82 | const zigzag = value << 1 ^ value >> 31 83 | writer.appendVarInt(zigzag) 84 | } else if (type === CompactType.I64 && typeof value === 'bigint') { 85 | // For 64-bit (bigint) we do (value << 1n) ^ (value >> 63n) in zigzag 86 | const zigzag = value << 1n ^ value >> 63n 87 | writer.appendVarBigInt(zigzag) 88 | } else if (type === CompactType.DOUBLE && typeof value === 'number') { 89 | writer.appendFloat64(value) 90 | } else if (type === CompactType.BINARY && typeof value === 'string') { 91 | // store length as a varint, then raw bytes 92 | const bytes = new TextEncoder().encode(value) 93 | writer.appendVarInt(bytes.length) 94 | writer.appendBytes(bytes) 95 | } else if (type === CompactType.BINARY && value instanceof Uint8Array) { 96 | // store length as a varint, then raw bytes 97 | writer.appendVarInt(value.byteLength) 98 | writer.appendBytes(value) 99 | } else if (type === CompactType.LIST && Array.isArray(value)) { 100 | // Must store (size << 4) | elementType 101 | // We'll guess the element type from the first element 102 | const size = value.length 103 | if (size === 0) { 104 | // (0 << 4) | type for an empty list – pick BYTE arbitrarily 105 | writer.appendUint8(0 << 4 | CompactType.BYTE) 106 | return 107 | } 108 | 109 | // TODO: Check for heterogeneous lists? 110 | const elemType = getCompactTypeForValue(value[0]) 111 | 112 | const sizeNibble = size > 14 ? 
15 : size 113 | writer.appendUint8(sizeNibble << 4 | elemType) 114 | if (size > 14) { 115 | writer.appendVarInt(size) 116 | } 117 | 118 | // Special trick for booleans in a list 119 | if (elemType === CompactType.TRUE || elemType === CompactType.FALSE) { 120 | // Write each boolean as a single 0 or 1 byte 121 | for (const v of value) { 122 | writer.appendUint8(v ? 1 : 0) 123 | } 124 | } else { 125 | // Otherwise write them out normally 126 | for (const v of value) { 127 | writeElement(writer, elemType, v) 128 | } 129 | } 130 | } else if (type === CompactType.STRUCT && typeof value === 'object') { 131 | // Recursively write sub-fields as "field_N: val", end with STOP 132 | let lastFid = 0 133 | for (const [k, v] of Object.entries(value)) { 134 | if (v === undefined) continue 135 | 136 | const fid = parseInt(k.replace(/^field_/, ''), 10) 137 | if (Number.isNaN(fid)) { 138 | throw new Error(`Invalid sub-field name: ${k}. Expected "field_###"`) 139 | } 140 | const t = getCompactTypeForValue(v) 141 | const delta = fid - lastFid 142 | if (delta <= 0) { 143 | throw new Error(`Non-monotonic fid in struct: fid=${fid}, lastFid=${lastFid}`) 144 | } 145 | if (delta <= 15) { 146 | writer.appendUint8(delta << 4 | t) 147 | } else { 148 | writer.appendUint8(t) 149 | writer.appendVarInt(fid << 1 ^ fid >> 15) 150 | } 151 | writeElement(writer, t, v) 152 | lastFid = fid 153 | } 154 | // Write STOP 155 | writer.appendUint8(CompactType.STOP) 156 | } else { 157 | throw new Error(`unhandled type in writeElement: ${type} for value ${value}`) 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /test/metadata.test.js: -------------------------------------------------------------------------------- 1 | import { parquetMetadata } from 'hyparquet' 2 | import { describe, expect, it } from 'vitest' 3 | import { ByteWriter } from '../src/bytewriter.js' 4 | import { logicalType, writeMetadata } from '../src/metadata.js' 5 | import { exampleMetadata } from './example.js' 6 | 7 | /** 8 | * @import {FileMetaData, LogicalType} from 'hyparquet' 9 | * @import {ThriftObject} from '../src/types.js' 10 | */ 11 | 12 | describe('writeMetadata', () => { 13 | it('writes metadata and parses in hyparquet', () => { 14 | const writer = new ByteWriter() 15 | 16 | // write header PAR1 17 | writer.appendUint32(0x31524150) 18 | 19 | // write metadata 20 | /** @type {FileMetaData} */ 21 | const withKvMetadata = { 22 | ...exampleMetadata, 23 | key_value_metadata: [ 24 | { key: 'key1', value: 'value1' }, 25 | { key: 'key2', value: 'value2' }, 26 | ], 27 | metadata_length: 477, 28 | } 29 | writeMetadata(writer, withKvMetadata) 30 | 31 | // write footer PAR1 32 | writer.appendUint32(0x31524150) 33 | 34 | const file = writer.getBuffer() 35 | const outputMetadata = parquetMetadata(file) 36 | 37 | expect(outputMetadata).toEqual(withKvMetadata) 38 | }) 39 | 40 | it('writes extended column metadata fields', () => { 41 | const writer = new ByteWriter() 42 | writer.appendUint32(0x31524150) 43 | 44 | /** @type {FileMetaData} */ 45 | const extendedMetadata = { 46 | version: 2, 47 | created_by: 'hyparquet', 48 | schema: [ 49 | { name: 'root', num_children: 1 }, 50 | { 51 | name: 'geo', 52 | type: 'BYTE_ARRAY', 53 | repetition_type: 'REQUIRED', 54 | logical_type: { type: 'GEOGRAPHY', crs: 'EPSG:4326', algorithm: 'KARNEY' }, 55 | }, 56 | ], 57 | num_rows: 1n, 58 | row_groups: [{ 59 | columns: [{ 60 | file_path: 'part-0.parquet', 61 | file_offset: 4n, 62 | meta_data: { 63 | type: 'BYTE_ARRAY', 64 | encodings: 
['PLAIN', 'RLE'], 65 | path_in_schema: [], 66 | codec: 'SNAPPY', 67 | num_values: 1n, 68 | total_uncompressed_size: 10n, 69 | total_compressed_size: 8n, 70 | key_value_metadata: [{ key: 'chunk', value: 'value' }], 71 | data_page_offset: 4n, 72 | index_page_offset: 12n, 73 | dictionary_page_offset: 20n, 74 | statistics: { 75 | null_count: 0n, 76 | min_value: 'a', 77 | max_value: 'z', 78 | }, 79 | encoding_stats: [{ page_type: 'DATA_PAGE', encoding: 'PLAIN', count: 1 }], 80 | bloom_filter_offset: 30n, 81 | bloom_filter_length: 4, 82 | size_statistics: { 83 | unencoded_byte_array_data_bytes: 5n, 84 | repetition_level_histogram: [1n, 0n], 85 | definition_level_histogram: [2n, 0n], 86 | }, 87 | geospatial_statistics: { 88 | bbox: { 89 | xmin: 0, 90 | xmax: 10, 91 | ymin: -5, 92 | ymax: 5, 93 | zmin: 1, 94 | zmax: 2, 95 | mmin: 3, 96 | mmax: 4, 97 | }, 98 | geospatial_types: [0, 1], 99 | }, 100 | }, 101 | offset_index_offset: 40n, 102 | offset_index_length: 16, 103 | column_index_offset: 60n, 104 | column_index_length: 24, 105 | encrypted_column_metadata: new Uint8Array([7, 8, 9]), 106 | }], 107 | total_byte_size: 64n, 108 | num_rows: 1n, 109 | sorting_columns: [{ 110 | column_idx: 0, 111 | descending: true, 112 | nulls_first: false, 113 | }], 114 | file_offset: 4n, 115 | total_compressed_size: 8n, 116 | }], 117 | key_value_metadata: [{ key: 'meta', value: 'data' }], 118 | metadata_length: 223, 119 | } 120 | 121 | writeMetadata(writer, extendedMetadata) 122 | writer.appendUint32(0x31524150) 123 | 124 | const outputMetadata = parquetMetadata(writer.getBuffer()) 125 | expect(outputMetadata).toEqual(extendedMetadata) 126 | }) 127 | }) 128 | 129 | describe('logicalType', () => { 130 | it('returns undefined when given undefined', () => { 131 | expect(logicalType(undefined)).toBeUndefined() 132 | }) 133 | 134 | it('returns correct object for known types', () => { 135 | /** @type {{ input: LogicalType, expected: ThriftObject }[]} */ 136 | const testCases = [ 137 | { input: { type: 'STRING' }, expected: { field_1: {} } }, 138 | { input: { type: 'MAP' }, expected: { field_2: {} } }, 139 | { input: { type: 'LIST' }, expected: { field_3: {} } }, 140 | { input: { type: 'ENUM' }, expected: { field_4: {} } }, 141 | { 142 | input: { type: 'DECIMAL', scale: 2, precision: 5 }, 143 | expected: { field_5: { field_1: 2, field_2: 5 } }, 144 | }, 145 | { input: { type: 'DATE' }, expected: { field_6: {} } }, 146 | { 147 | input: { type: 'TIME', isAdjustedToUTC: true, unit: 'MILLIS' }, 148 | expected: { field_7: { field_1: true, field_2: { field_1: {} } } }, 149 | }, 150 | { 151 | input: { type: 'TIMESTAMP', isAdjustedToUTC: false, unit: 'MICROS' }, 152 | expected: { field_8: { field_1: false, field_2: { field_2: {} } } }, 153 | }, 154 | { 155 | input: { type: 'TIMESTAMP', isAdjustedToUTC: false, unit: 'NANOS' }, 156 | expected: { field_8: { field_1: false, field_2: { field_3: {} } } }, 157 | }, 158 | { 159 | input: { type: 'INTEGER', bitWidth: 32, isSigned: true }, 160 | expected: { field_10: { field_1: 32, field_2: true } }, 161 | }, 162 | { input: { type: 'NULL' }, expected: { field_11: {} } }, 163 | { input: { type: 'JSON' }, expected: { field_12: {} } }, 164 | { input: { type: 'BSON' }, expected: { field_13: {} } }, 165 | { input: { type: 'UUID' }, expected: { field_14: {} } }, 166 | { input: { type: 'FLOAT16' }, expected: { field_15: {} } }, 167 | { input: { type: 'VARIANT' }, expected: { field_16: {} } }, 168 | { input: { type: 'GEOMETRY' }, expected: { field_17: {} } }, 169 | { input: { type: 'GEOGRAPHY' }, 
expected: { field_18: {} } }, 170 | ] 171 | 172 | testCases.forEach(({ input, expected }) => { 173 | expect(logicalType(input)).toEqual(expected) 174 | }) 175 | }) 176 | }) 177 | -------------------------------------------------------------------------------- /src/delta.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Delta Binary Packed encoding for parquet. 3 | * Encodes integers as deltas with variable bit-width packing. 4 | * 5 | * @import {DecodedArray} from 'hyparquet' 6 | * @import {Writer} from '../src/types.js' 7 | */ 8 | 9 | const BLOCK_SIZE = 128 10 | const MINIBLOCKS_PER_BLOCK = 4 11 | const VALUES_PER_MINIBLOCK = BLOCK_SIZE / MINIBLOCKS_PER_BLOCK // 32 12 | 13 | /** 14 | * Write values using delta binary packed encoding. 15 | * 16 | * @param {Writer} writer 17 | * @param {DecodedArray} values 18 | */ 19 | export function deltaBinaryPack(writer, values) { 20 | const count = values.length 21 | if (count === 0) { 22 | // Write header with zero count 23 | writer.appendVarInt(BLOCK_SIZE) 24 | writer.appendVarInt(MINIBLOCKS_PER_BLOCK) 25 | writer.appendVarInt(0) 26 | writer.appendVarInt(0) 27 | return 28 | } 29 | if (typeof values[0] !== 'number' && typeof values[0] !== 'bigint') { 30 | throw new Error('deltaBinaryPack only supports number or bigint arrays') 31 | } 32 | 33 | // Write header 34 | writer.appendVarInt(BLOCK_SIZE) 35 | writer.appendVarInt(MINIBLOCKS_PER_BLOCK) 36 | writer.appendVarInt(count) 37 | writer.appendZigZag(values[0]) 38 | 39 | // Process blocks 40 | let index = 1 41 | while (index < count) { 42 | const blockEnd = Math.min(index + BLOCK_SIZE, count) 43 | const blockSize = blockEnd - index 44 | 45 | // Compute deltas for this block 46 | const blockDeltas = new BigInt64Array(blockSize) 47 | let minDelta = BigInt(values[index]) - BigInt(values[index - 1]) 48 | blockDeltas[0] = minDelta 49 | for (let i = 1; i < blockSize; i++) { 50 | const delta = BigInt(values[index + i]) - BigInt(values[index + i - 1]) 51 | blockDeltas[i] = delta 52 | if (delta < minDelta) minDelta = delta 53 | } 54 | writer.appendZigZag(minDelta) 55 | 56 | // Calculate bit widths for each miniblock 57 | const bitWidths = new Uint8Array(MINIBLOCKS_PER_BLOCK) 58 | for (let mb = 0; mb < MINIBLOCKS_PER_BLOCK; mb++) { 59 | const mbStart = mb * VALUES_PER_MINIBLOCK 60 | const mbEnd = Math.min(mbStart + VALUES_PER_MINIBLOCK, blockSize) 61 | 62 | let maxAdjusted = 0n 63 | for (let i = mbStart; i < mbEnd; i++) { 64 | const adjusted = blockDeltas[i] - minDelta 65 | if (adjusted > maxAdjusted) maxAdjusted = adjusted 66 | } 67 | bitWidths[mb] = bitWidth(maxAdjusted) 68 | } 69 | 70 | // Write bit widths 71 | writer.appendBytes(bitWidths) 72 | 73 | // Write packed miniblocks 74 | for (let mb = 0; mb < MINIBLOCKS_PER_BLOCK; mb++) { 75 | const bitWidth = bitWidths[mb] 76 | if (bitWidth === 0) continue // No data needed for zero bit width 77 | 78 | const mbStart = mb * VALUES_PER_MINIBLOCK 79 | const mbEnd = Math.min(mbStart + VALUES_PER_MINIBLOCK, blockSize) 80 | 81 | // Bit pack the adjusted deltas 82 | let buffer = 0n 83 | let bitsUsed = 0 84 | 85 | for (let i = 0; i < VALUES_PER_MINIBLOCK; i++) { 86 | const adjusted = mbStart + i < mbEnd ? 
blockDeltas[mbStart + i] - minDelta : 0n 87 | buffer |= adjusted << BigInt(bitsUsed) 88 | bitsUsed += bitWidth 89 | 90 | // Flush complete bytes 91 | while (bitsUsed >= 8) { 92 | writer.appendUint8(Number(buffer & 0xffn)) 93 | buffer >>= 8n 94 | bitsUsed -= 8 95 | } 96 | } 97 | // assert(bitsUsed === 0) // because multiple of 8 98 | } 99 | 100 | index = blockEnd 101 | } 102 | } 103 | 104 | /** 105 | * Write byte arrays using delta length encoding. 106 | * Encodes lengths using delta binary packed, then writes raw bytes. 107 | * 108 | * @param {Writer} writer 109 | * @param {DecodedArray} values 110 | */ 111 | export function deltaLengthByteArray(writer, values) { 112 | // Extract lengths 113 | const lengths = new Int32Array(values.length) 114 | for (let i = 0; i < values.length; i++) { 115 | const value = values[i] 116 | if (!(value instanceof Uint8Array)) { 117 | throw new Error('deltaLengthByteArray expects Uint8Array values') 118 | } 119 | lengths[i] = value.length 120 | } 121 | 122 | // Write delta-packed lengths 123 | deltaBinaryPack(writer, lengths) 124 | 125 | // Write raw byte data 126 | for (const value of values) { 127 | writer.appendBytes(value) 128 | } 129 | } 130 | 131 | /** 132 | * Write byte arrays using delta encoding with prefix compression. 133 | * Stores common prefixes with previous value to improve compression. 134 | * 135 | * @param {Writer} writer 136 | * @param {DecodedArray} values 137 | */ 138 | export function deltaByteArray(writer, values) { 139 | if (values.length === 0) { 140 | deltaBinaryPack(writer, []) 141 | deltaBinaryPack(writer, []) 142 | return 143 | } 144 | 145 | // Calculate prefix lengths and suffixes 146 | const prefixLengths = new Int32Array(values.length) 147 | const suffixLengths = new Int32Array(values.length) 148 | /** @type {Uint8Array[]} */ 149 | const suffixes = new Array(values.length) 150 | 151 | // First value has no prefix 152 | const value = values[0] 153 | if (!(value instanceof Uint8Array)) { 154 | throw new Error('deltaByteArray expects Uint8Array values') 155 | } 156 | prefixLengths[0] = 0 157 | suffixLengths[0] = values[0].length 158 | suffixes[0] = values[0] 159 | 160 | for (let i = 1; i < values.length; i++) { 161 | const prev = values[i - 1] 162 | const curr = values[i] 163 | if (!(curr instanceof Uint8Array)) { 164 | throw new Error('deltaByteArray expects Uint8Array values') 165 | } 166 | 167 | // Find common prefix length 168 | let prefixLen = 0 169 | const maxPrefix = Math.min(prev.length, curr.length) 170 | while (prefixLen < maxPrefix && prev[prefixLen] === curr[prefixLen]) { 171 | prefixLen++ 172 | } 173 | 174 | prefixLengths[i] = prefixLen 175 | suffixLengths[i] = curr.length - prefixLen 176 | suffixes[i] = curr.subarray(prefixLen) 177 | } 178 | 179 | // Write delta-packed prefix lengths 180 | deltaBinaryPack(writer, prefixLengths) 181 | 182 | // Write delta-packed suffix lengths 183 | deltaBinaryPack(writer, suffixLengths) 184 | 185 | // Write suffix bytes 186 | for (const suffix of suffixes) { 187 | writer.appendBytes(suffix) 188 | } 189 | } 190 | 191 | /** 192 | * Minimum bits needed to store value. 
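 * For example: bitWidth(0n) === 0, bitWidth(1n) === 1, bitWidth(255n) === 8, bitWidth(256n) === 9.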
193 | * 194 | * @param {bigint} value 195 | * @returns {number} 196 | */ 197 | function bitWidth(value) { 198 | if (value === 0n) return 0 199 | let bits = 0 200 | while (value > 0n) { 201 | bits++ 202 | value >>= 1n 203 | } 204 | return bits 205 | } 206 | -------------------------------------------------------------------------------- /src/schema.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Infer a schema from column data. 3 | * Accepts optional schemaOverrides to override the type of columns by name. 4 | * 5 | * @param {object} options 6 | * @param {ColumnSource[]} options.columnData 7 | * @param {Record} [options.schemaOverrides] 8 | * @returns {SchemaElement[]} 9 | */ 10 | export function schemaFromColumnData({ columnData, schemaOverrides }) { 11 | /** @type {SchemaElement[]} */ 12 | const schema = [{ 13 | name: 'root', 14 | num_children: columnData.length, 15 | }] 16 | let num_rows = 0 17 | 18 | for (const { name, data, type, nullable } of columnData) { 19 | // check if all columns have the same length 20 | num_rows = num_rows || data.length 21 | if (num_rows !== data.length) { 22 | throw new Error('columns must have the same length') 23 | } 24 | 25 | if (schemaOverrides?.[name]) { 26 | // use schema override 27 | const override = schemaOverrides[name] 28 | if (override.name !== name) throw new Error('schema override name does not match column name') 29 | schema.push(override) 30 | } else if (type) { 31 | // use provided type 32 | schema.push(basicTypeToSchemaElement(name, type, nullable)) 33 | } else { 34 | // auto-detect type 35 | schema.push(autoSchemaElement(name, data)) 36 | } 37 | } 38 | 39 | return schema 40 | } 41 | 42 | /** 43 | * @import {ConvertedType, DecodedArray, FieldRepetitionType, ParquetType, SchemaElement} from 'hyparquet' 44 | * @import {BasicType, ColumnSource} from '../src/types.js' 45 | * @param {string} name 46 | * @param {BasicType} type 47 | * @param {boolean} [nullable] 48 | * @returns {SchemaElement} 49 | */ 50 | function basicTypeToSchemaElement(name, type, nullable) { 51 | const repetition_type = nullable === false ? 'REQUIRED' : 'OPTIONAL' 52 | if (type === 'STRING') { 53 | return { name, type: 'BYTE_ARRAY', converted_type: 'UTF8', repetition_type } 54 | } 55 | if (type === 'JSON') { 56 | return { name, type: 'BYTE_ARRAY', converted_type: 'JSON', repetition_type } 57 | } 58 | if (type === 'TIMESTAMP') { 59 | return { name, type: 'INT64', converted_type: 'TIMESTAMP_MILLIS', repetition_type } 60 | } 61 | if (type === 'UUID') { 62 | return { name, type: 'FIXED_LEN_BYTE_ARRAY', type_length: 16, logical_type: { type: 'UUID' }, repetition_type } 63 | } 64 | if (type === 'FLOAT16') { 65 | return { name, type: 'FIXED_LEN_BYTE_ARRAY', type_length: 2, logical_type: { type: 'FLOAT16' }, repetition_type } 66 | } 67 | if (type === 'GEOMETRY') { 68 | return { name, type: 'BYTE_ARRAY', logical_type: { type: 'GEOMETRY' }, repetition_type } 69 | } 70 | if (type === 'GEOGRAPHY') { 71 | return { name, type: 'BYTE_ARRAY', logical_type: { type: 'GEOGRAPHY' }, repetition_type } 72 | } 73 | return { name, type, repetition_type } 74 | } 75 | 76 | /** 77 | * Automatically determine a SchemaElement from an array of values. 
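 * Typed arrays map directly to parquet types (Int32Array -> INT32, BigInt64Array -> INT64,
 * Float32Array -> FLOAT, Float64Array -> DOUBLE). Otherwise each value is inspected:
 * null/undefined marks the column OPTIONAL, strings become BYTE_ARRAY/UTF8,
 * Dates become INT64/TIMESTAMP_MILLIS, and mixed INT32/DOUBLE values are widened to DOUBLE.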
78 | * 79 | * @param {string} name 80 | * @param {DecodedArray} values 81 | * @returns {SchemaElement} 82 | */ 83 | export function autoSchemaElement(name, values) { 84 | /** @type {ParquetType | undefined} */ 85 | let type 86 | /** @type {FieldRepetitionType} */ 87 | let repetition_type = 'REQUIRED' 88 | /** @type {ConvertedType | undefined} */ 89 | let converted_type = undefined 90 | 91 | if (values instanceof Int32Array) return { name, type: 'INT32', repetition_type } 92 | if (values instanceof BigInt64Array) return { name, type: 'INT64', repetition_type } 93 | if (values instanceof Float32Array) return { name, type: 'FLOAT', repetition_type } 94 | if (values instanceof Float64Array) return { name, type: 'DOUBLE', repetition_type } 95 | 96 | for (const value of values) { 97 | if (value === null || value === undefined) { 98 | repetition_type = 'OPTIONAL' 99 | } else { 100 | // value is defined 101 | /** @type {ParquetType | undefined} */ 102 | let valueType = undefined 103 | if (value === true || value === false) valueType = 'BOOLEAN' 104 | else if (typeof value === 'bigint') valueType = 'INT64' 105 | else if (Number.isInteger(value)) valueType = 'INT32' 106 | else if (typeof value === 'number') valueType = 'DOUBLE' 107 | else if (value instanceof Uint8Array) valueType = 'BYTE_ARRAY' 108 | else if (typeof value === 'string') { 109 | valueType = 'BYTE_ARRAY' 110 | // make sure they are all strings 111 | if (type && !converted_type) throw new Error('mixed types not supported') 112 | converted_type = 'UTF8' 113 | } 114 | else if (value instanceof Date) { 115 | valueType = 'INT64' 116 | // make sure they are all dates 117 | if (type && !converted_type) throw new Error('mixed types not supported') 118 | converted_type = 'TIMESTAMP_MILLIS' 119 | } 120 | else if (typeof value === 'object') { 121 | // use json (TODO: native list and object types) 122 | converted_type = 'JSON' 123 | valueType = 'BYTE_ARRAY' 124 | } 125 | else if (!valueType) throw new Error(`cannot determine parquet type for: ${value}`) 126 | 127 | // expand type if necessary 128 | if (type === undefined) { 129 | type = valueType 130 | } else if (type === 'INT32' && valueType === 'DOUBLE') { 131 | type = 'DOUBLE' 132 | } else if (type === 'DOUBLE' && valueType === 'INT32') { 133 | valueType = 'DOUBLE' 134 | } 135 | if (type !== valueType) { 136 | throw new Error(`parquet cannot write mixed types: ${type} and ${valueType}`) 137 | } 138 | } 139 | } 140 | if (!type) { 141 | // fallback to nullable BYTE_ARRAY 142 | // TODO: logical_type: 'NULL' 143 | type = 'BYTE_ARRAY' 144 | repetition_type = 'OPTIONAL' 145 | } 146 | return { name, type, repetition_type, converted_type } 147 | } 148 | 149 | /** 150 | * Get the max repetition level for a given schema path. 151 | * 152 | * @param {SchemaElement[]} schemaPath 153 | * @returns {number} max repetition level 154 | */ 155 | export function getMaxRepetitionLevel(schemaPath) { 156 | let maxLevel = 0 157 | for (const element of schemaPath) { 158 | if (element.repetition_type === 'REPEATED') { 159 | maxLevel++ 160 | } 161 | } 162 | return maxLevel 163 | } 164 | 165 | /** 166 | * Get the max definition level for a given schema path. 
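 * Counts the elements in the path (excluding the root) whose repetition_type is not REQUIRED.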
167 | * 168 | * @param {SchemaElement[]} schemaPath 169 | * @returns {number} max definition level 170 | */ 171 | export function getMaxDefinitionLevel(schemaPath) { 172 | let maxLevel = 0 173 | for (const element of schemaPath.slice(1)) { 174 | if (element.repetition_type !== 'REQUIRED') { 175 | maxLevel++ 176 | } 177 | } 178 | return maxLevel 179 | } 180 | -------------------------------------------------------------------------------- /src/snappy.js: -------------------------------------------------------------------------------- 1 | /** 2 | * The MIT License (MIT) 3 | * Copyright (c) 2016 Zhipeng Jia 4 | * https://github.com/zhipeng-jia/snappyjs 5 | */ 6 | 7 | import { ByteWriter } from './bytewriter.js' 8 | 9 | const BLOCK_LOG = 16 10 | const BLOCK_SIZE = 1 << BLOCK_LOG 11 | 12 | const MAX_HASH_TABLE_BITS = 14 13 | const globalHashTables = new Array(MAX_HASH_TABLE_BITS + 1) 14 | 15 | /** 16 | * Compress snappy data. 17 | * Returns Snappy-compressed bytes as Uint8Array. 18 | * 19 | * @param {Uint8Array} input - uncompressed data 20 | * @returns {Uint8Array} 21 | */ 22 | export function snappyCompress(input) { 23 | const writer = new ByteWriter() 24 | // Write uncompressed length as a varint 25 | writer.appendVarInt(input.length) 26 | if (input.length === 0) return new Uint8Array(writer.getBuffer()) 27 | 28 | // Process input in 64K blocks 29 | let pos = 0 30 | while (pos < input.length) { 31 | const fragmentSize = Math.min(input.length - pos, BLOCK_SIZE) 32 | compressFragment(input, pos, fragmentSize, writer) 33 | pos += fragmentSize 34 | } 35 | return new Uint8Array(writer.getBuffer()) 36 | } 37 | 38 | /** 39 | * Hash function used in the reference implementation. 40 | * 41 | * @param {number} key 42 | * @param {number} hashFuncShift 43 | * @returns {number} 44 | */ 45 | function hashFunc(key, hashFuncShift) { 46 | return key * 0x1e35a7bd >>> hashFuncShift 47 | } 48 | 49 | /** 50 | * Load a 32-bit little-endian integer from a byte array. 51 | * 52 | * @param {Uint8Array} array 53 | * @param {number} pos 54 | * @returns {number} 55 | */ 56 | function load32(array, pos) { 57 | // Expects Uint8Array as `array` 58 | return ( 59 | array[pos] + 60 | (array[pos + 1] << 8) + 61 | (array[pos + 2] << 16) + 62 | (array[pos + 3] << 24) 63 | ) 64 | } 65 | 66 | /** 67 | * Compare two 32-bit sequences for equality. 68 | * 69 | * @param {Uint8Array} array 70 | * @param {number} pos1 71 | * @param {number} pos2 72 | * @returns {boolean} 73 | */ 74 | function equals32(array, pos1, pos2) { 75 | return ( 76 | array[pos1] === array[pos2] && 77 | array[pos1 + 1] === array[pos2 + 1] && 78 | array[pos1 + 2] === array[pos2 + 2] && 79 | array[pos1 + 3] === array[pos2 + 3] 80 | ) 81 | } 82 | 83 | /** 84 | * Emit a literal chunk of data. 85 | * @import {Writer} from '../src/types.js' 86 | * @param {Uint8Array} input 87 | * @param {number} ip 88 | * @param {number} len 89 | * @param {Writer} writer 90 | */ 91 | function emitLiteral(input, ip, len, writer) { 92 | // The first byte(s) encode the literal length 93 | if (len <= 60) { 94 | writer.appendUint8(len - 1 << 2) 95 | } else if (len < 256) { 96 | writer.appendUint8(60 << 2) 97 | writer.appendUint8(len - 1) 98 | } else { 99 | writer.appendUint8(61 << 2) 100 | writer.appendUint8(len - 1 & 0xff) 101 | writer.appendUint8(len - 1 >>> 8) 102 | } 103 | 104 | // Then copy the literal bytes 105 | writer.appendBytes(input.subarray(ip, ip + len)) 106 | } 107 | 108 | /** 109 | * Emit a copy of previous data. 
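 * Emits a single copy element: a 2-byte tag when len is 4..11 and offset < 2048,
 * otherwise a 3-byte tag with a 16-bit little-endian offset.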
110 | * @param {Writer} writer 111 | * @param {number} offset 112 | * @param {number} len 113 | */ 114 | function emitCopyLessThan64(writer, offset, len) { 115 | if (len < 12 && offset < 2048) { 116 | // Copy 4..11 bytes, offset < 2048 117 | // --> [ 1 | (len-4)<<2 | (offset>>8)<<5 ] 118 | writer.appendUint8(1 + (len - 4 << 2) + (offset >>> 8 << 5)) 119 | writer.appendUint8(offset & 0xff) 120 | } else { 121 | // Copy len bytes, offset 1..65535 122 | // --> [ 2 | (len-1)<<2 ] 123 | writer.appendUint8(2 + (len - 1 << 2)) 124 | writer.appendUint8(offset & 0xff) 125 | writer.appendUint8(offset >>> 8) 126 | } 127 | } 128 | 129 | /** 130 | * Emit a copy of previous data. 131 | * @param {Writer} writer 132 | * @param {number} offset 133 | * @param {number} len 134 | */ 135 | function emitCopy(writer, offset, len) { 136 | // Emit 64-byte copies as long as we can 137 | while (len >= 68) { 138 | emitCopyLessThan64(writer, offset, 64) 139 | len -= 64 140 | } 141 | // Emit one 60-byte copy if needed 142 | if (len > 64) { 143 | emitCopyLessThan64(writer, offset, 60) 144 | len -= 60 145 | } 146 | // Final copy 147 | emitCopyLessThan64(writer, offset, len) 148 | } 149 | 150 | /** 151 | * Compress a fragment of data. 152 | * @param {Uint8Array} input 153 | * @param {number} ip 154 | * @param {number} inputSize 155 | * @param {Writer} writer 156 | */ 157 | function compressFragment(input, ip, inputSize, writer) { 158 | let hashTableBits = 1 159 | while (1 << hashTableBits <= inputSize && hashTableBits <= MAX_HASH_TABLE_BITS) { 160 | hashTableBits += 1 161 | } 162 | hashTableBits -= 1 163 | const hashFuncShift = 32 - hashTableBits 164 | 165 | // Initialize the hash table 166 | if (typeof globalHashTables[hashTableBits] === 'undefined') { 167 | globalHashTables[hashTableBits] = new Uint16Array(1 << hashTableBits) 168 | } 169 | const hashTable = globalHashTables[hashTableBits] 170 | hashTable.fill(0) 171 | 172 | const ipEnd = ip + inputSize 173 | let ipLimit 174 | const baseIp = ip 175 | let nextEmit = ip 176 | 177 | let hash, nextHash 178 | let nextIp, candidate, skip 179 | let bytesBetweenHashLookups 180 | let base, matched, offset 181 | let prevHash, curHash 182 | let flag = true 183 | 184 | const INPUT_MARGIN = 15 185 | if (inputSize >= INPUT_MARGIN) { 186 | ipLimit = ipEnd - INPUT_MARGIN 187 | ip += 1 188 | nextHash = hashFunc(load32(input, ip), hashFuncShift) 189 | 190 | while (flag) { 191 | skip = 32 192 | nextIp = ip 193 | do { 194 | ip = nextIp 195 | hash = nextHash 196 | bytesBetweenHashLookups = skip >>> 5 197 | skip += 1 198 | nextIp = ip + bytesBetweenHashLookups 199 | if (ip > ipLimit) { 200 | flag = false 201 | break 202 | } 203 | nextHash = hashFunc(load32(input, nextIp), hashFuncShift) 204 | candidate = baseIp + hashTable[hash] 205 | hashTable[hash] = ip - baseIp 206 | } while (!equals32(input, ip, candidate)) 207 | 208 | if (!flag) { 209 | break 210 | } 211 | 212 | // Emit the literal from `nextEmit` to `ip` 213 | emitLiteral(input, nextEmit, ip - nextEmit, writer) 214 | 215 | // We found a match. 
Repeatedly match and emit copies 216 | do { 217 | base = ip 218 | matched = 4 219 | while ( 220 | ip + matched < ipEnd && 221 | input[ip + matched] === input[candidate + matched] 222 | ) { 223 | matched++ 224 | } 225 | ip += matched 226 | offset = base - candidate 227 | emitCopy(writer, offset, matched) 228 | 229 | nextEmit = ip 230 | if (ip >= ipLimit) { 231 | flag = false 232 | break 233 | } 234 | prevHash = hashFunc(load32(input, ip - 1), hashFuncShift) 235 | hashTable[prevHash] = ip - 1 - baseIp 236 | curHash = hashFunc(load32(input, ip), hashFuncShift) 237 | candidate = baseIp + hashTable[curHash] 238 | hashTable[curHash] = ip - baseIp 239 | } while (equals32(input, ip, candidate)) 240 | 241 | if (!flag) { 242 | break 243 | } 244 | 245 | ip += 1 246 | nextHash = hashFunc(load32(input, ip), hashFuncShift) 247 | } 248 | } 249 | 250 | // Emit the last literal (if any) 251 | if (nextEmit < ipEnd) { 252 | emitLiteral(input, nextEmit, ipEnd - nextEmit, writer) 253 | } 254 | } 255 | -------------------------------------------------------------------------------- /src/metadata.js: -------------------------------------------------------------------------------- 1 | import { getSchemaPath } from 'hyparquet/src/schema.js' 2 | import { CompressionCodecs, ConvertedTypes, Encodings, FieldRepetitionTypes, PageTypes, ParquetTypes } from 'hyparquet/src/constants.js' 3 | import { serializeTCompactProtocol } from './thrift.js' 4 | import { unconvertStatistics } from './unconvert.js' 5 | 6 | /** 7 | * @import {FileMetaData, LogicalType, TimeUnit} from 'hyparquet' 8 | * @import {ThriftObject, Writer} from '../src/types.js' 9 | * @param {Writer} writer 10 | * @param {FileMetaData} metadata 11 | */ 12 | export function writeMetadata(writer, metadata) { 13 | /** @type {ThriftObject} */ 14 | const compact = { 15 | field_1: metadata.version, 16 | field_2: metadata.schema && metadata.schema.map(element => ({ 17 | field_1: element.type && ParquetTypes.indexOf(element.type), 18 | field_2: element.type_length, 19 | field_3: element.repetition_type && FieldRepetitionTypes.indexOf(element.repetition_type), 20 | field_4: element.name, 21 | field_5: element.num_children, 22 | field_6: element.converted_type && ConvertedTypes.indexOf(element.converted_type), 23 | field_7: element.scale, 24 | field_8: element.precision, 25 | field_9: element.field_id, 26 | field_10: logicalType(element.logical_type), 27 | })), 28 | field_3: metadata.num_rows, 29 | field_4: metadata.row_groups.map(rg => ({ 30 | field_1: rg.columns.map((c, columnIndex) => ({ 31 | field_1: c.file_path, 32 | field_2: c.file_offset, 33 | field_3: c.meta_data && { 34 | field_1: ParquetTypes.indexOf(c.meta_data.type), 35 | field_2: c.meta_data.encodings.map(e => Encodings.indexOf(e)), 36 | field_3: c.meta_data.path_in_schema, 37 | field_4: CompressionCodecs.indexOf(c.meta_data.codec), 38 | field_5: c.meta_data.num_values, 39 | field_6: c.meta_data.total_uncompressed_size, 40 | field_7: c.meta_data.total_compressed_size, 41 | field_8: c.meta_data.key_value_metadata && c.meta_data.key_value_metadata.map(kv => ({ 42 | field_1: kv.key, 43 | field_2: kv.value, 44 | })), 45 | field_9: c.meta_data.data_page_offset, 46 | field_10: c.meta_data.index_page_offset, 47 | field_11: c.meta_data.dictionary_page_offset, 48 | field_12: c.meta_data.statistics && unconvertStatistics( 49 | c.meta_data.statistics, 50 | schemaElement(metadata.schema, c.meta_data.path_in_schema, columnIndex + 1) 51 | ), 52 | field_13: c.meta_data.encoding_stats && c.meta_data.encoding_stats.map(es => 
({ 53 | field_1: PageTypes.indexOf(es.page_type), 54 | field_2: Encodings.indexOf(es.encoding), 55 | field_3: es.count, 56 | })), 57 | field_14: c.meta_data.bloom_filter_offset, 58 | field_15: c.meta_data.bloom_filter_length, 59 | field_16: c.meta_data.size_statistics && { 60 | field_1: c.meta_data.size_statistics.unencoded_byte_array_data_bytes, 61 | field_2: c.meta_data.size_statistics.repetition_level_histogram, 62 | field_3: c.meta_data.size_statistics.definition_level_histogram, 63 | }, 64 | field_17: c.meta_data.geospatial_statistics && { 65 | field_1: c.meta_data.geospatial_statistics.bbox && { 66 | field_1: c.meta_data.geospatial_statistics.bbox.xmin, 67 | field_2: c.meta_data.geospatial_statistics.bbox.xmax, 68 | field_3: c.meta_data.geospatial_statistics.bbox.ymin, 69 | field_4: c.meta_data.geospatial_statistics.bbox.ymax, 70 | field_5: c.meta_data.geospatial_statistics.bbox.zmin, 71 | field_6: c.meta_data.geospatial_statistics.bbox.zmax, 72 | field_7: c.meta_data.geospatial_statistics.bbox.mmin, 73 | field_8: c.meta_data.geospatial_statistics.bbox.mmax, 74 | }, 75 | field_2: c.meta_data.geospatial_statistics.geospatial_types, 76 | }, 77 | }, 78 | field_4: c.offset_index_offset, 79 | field_5: c.offset_index_length, 80 | field_6: c.column_index_offset, 81 | field_7: c.column_index_length, 82 | // field_8: c.crypto_metadata, 83 | field_9: c.encrypted_column_metadata, 84 | })), 85 | field_2: rg.total_byte_size, 86 | field_3: rg.num_rows, 87 | field_4: rg.sorting_columns && rg.sorting_columns.map(sc => ({ 88 | field_1: sc.column_idx, 89 | field_2: sc.descending, 90 | field_3: sc.nulls_first, 91 | })), 92 | field_5: rg.file_offset, 93 | field_6: rg.total_compressed_size, 94 | // field_7: rg.ordinal, // should be int16 95 | })), 96 | field_5: metadata.key_value_metadata && metadata.key_value_metadata.map(kv => ({ 97 | field_1: kv.key, 98 | field_2: kv.value, 99 | })), 100 | field_6: metadata.created_by, 101 | } 102 | 103 | // write metadata as thrift 104 | const metadataStart = writer.offset 105 | serializeTCompactProtocol(writer, compact) 106 | // write metadata length 107 | const metadataLength = writer.offset - metadataStart 108 | writer.appendUint32(metadataLength) 109 | } 110 | 111 | /** 112 | * Resolve schema element for statistics using the stored path. 
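 * Falls back to the column's positional element in the flat schema when
 * path_in_schema is empty or cannot be resolved.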
113 | * 114 | * @param {import('hyparquet').SchemaElement[]} schema 115 | * @param {string[] | undefined} path 116 | * @param {number} fallbackIndex 117 | * @returns {import('hyparquet').SchemaElement} 118 | */ 119 | function schemaElement(schema, path, fallbackIndex) { 120 | if (path?.length) { 121 | const resolved = getSchemaPath(schema, path).at(-1)?.element 122 | if (resolved) return resolved 123 | } 124 | return schema[fallbackIndex] 125 | } 126 | 127 | /** 128 | * @param {LogicalType | undefined} type 129 | * @returns {ThriftObject | undefined} 130 | */ 131 | export function logicalType(type) { 132 | if (!type) return 133 | if (type.type === 'STRING') return { field_1: {} } 134 | if (type.type === 'MAP') return { field_2: {} } 135 | if (type.type === 'LIST') return { field_3: {} } 136 | if (type.type === 'ENUM') return { field_4: {} } 137 | if (type.type === 'DECIMAL') return { field_5: { 138 | field_1: type.scale, 139 | field_2: type.precision, 140 | } } 141 | if (type.type === 'DATE') return { field_6: {} } 142 | if (type.type === 'TIME') return { field_7: { 143 | field_1: type.isAdjustedToUTC, 144 | field_2: timeUnit(type.unit), 145 | } } 146 | if (type.type === 'TIMESTAMP') return { field_8: { 147 | field_1: type.isAdjustedToUTC, 148 | field_2: timeUnit(type.unit), 149 | } } 150 | if (type.type === 'INTEGER') return { field_10: { 151 | field_1: type.bitWidth, 152 | field_2: type.isSigned, 153 | } } 154 | if (type.type === 'NULL') return { field_11: {} } 155 | if (type.type === 'JSON') return { field_12: {} } 156 | if (type.type === 'BSON') return { field_13: {} } 157 | if (type.type === 'UUID') return { field_14: {} } 158 | if (type.type === 'FLOAT16') return { field_15: {} } 159 | if (type.type === 'VARIANT') return { field_16: {} } 160 | if (type.type === 'GEOMETRY') return { field_17: { 161 | field_1: type.crs, 162 | } } 163 | if (type.type === 'GEOGRAPHY') return { field_18: { 164 | field_1: type.crs, 165 | field_2: type.algorithm && edgeAlgorithm[type.algorithm], 166 | } } 167 | } 168 | 169 | /** 170 | * @param {TimeUnit} unit 171 | * @returns {ThriftObject} 172 | */ 173 | function timeUnit(unit) { 174 | if (unit === 'NANOS') return { field_3: {} } 175 | if (unit === 'MICROS') return { field_2: {} } 176 | return { field_1: {} } 177 | } 178 | 179 | /** 180 | * @import {EdgeInterpolationAlgorithm} from 'hyparquet/src/types.js' 181 | * @type {Record} 182 | */ 183 | const edgeAlgorithm = { 184 | SPHERICAL: 0, 185 | VINCENTY: 1, 186 | THOMAS: 2, 187 | ANDOYER: 3, 188 | KARNEY: 4, 189 | } 190 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hyparquet Writer 2 | 3 | ![hyparquet writer parakeet](hyparquet-writer.jpg) 4 | 5 | [![npm](https://img.shields.io/npm/v/hyparquet-writer)](https://www.npmjs.com/package/hyparquet-writer) 6 | [![minzipped](https://img.shields.io/bundlephobia/minzip/hyparquet-writer)](https://www.npmjs.com/package/hyparquet-writer) 7 | [![workflow status](https://github.com/hyparam/hyparquet-writer/actions/workflows/ci.yml/badge.svg)](https://github.com/hyparam/hyparquet-writer/actions) 8 | [![mit license](https://img.shields.io/badge/License-MIT-orange.svg)](https://opensource.org/licenses/MIT) 9 | ![coverage](https://img.shields.io/badge/Coverage-95-darkred) 10 | [![dependencies](https://img.shields.io/badge/Dependencies-1-blueviolet)](https://www.npmjs.com/package/hyparquet-writer?activeTab=dependencies) 11 | 12 | Hyparquet 
Writer is a JavaScript library for writing [Apache Parquet](https://parquet.apache.org) files. It is designed to be lightweight and fast, and to store data efficiently. It is a companion to the [hyparquet](https://github.com/hyparam/hyparquet) library, which is a JavaScript library for reading parquet files. 13 | 14 | ## Quick Start 15 | 16 | To write a parquet file to an `ArrayBuffer`, use `parquetWriteBuffer` with the argument `columnData`. Each column in `columnData` should contain: 17 | 18 | - `name`: the column name 19 | - `data`: an array of same-type values 20 | - `type`: the parquet schema type (optional) 21 | 22 | ```javascript 23 | import { parquetWriteBuffer } from 'hyparquet-writer' 24 | 25 | const arrayBuffer = parquetWriteBuffer({ 26 | columnData: [ 27 | { name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'STRING' }, 28 | { name: 'age', data: [25, 30, 35], type: 'INT32' }, 29 | ], 30 | }) 31 | ``` 32 | 33 | Note: if `type` is not provided, the type will be guessed from the data. The supported `BasicType` values are a superset of the parquet primitive types: 34 | 35 | | Basic Type | Equivalent Schema Element | 36 | |------|----------------| 37 | | `BOOLEAN` | `{ type: 'BOOLEAN' }` | 38 | | `INT32` | `{ type: 'INT32' }` | 39 | | `INT64` | `{ type: 'INT64' }` | 40 | | `FLOAT` | `{ type: 'FLOAT' }` | 41 | | `DOUBLE` | `{ type: 'DOUBLE' }` | 42 | | `BYTE_ARRAY` | `{ type: 'BYTE_ARRAY' }` | 43 | | `STRING` | `{ type: 'BYTE_ARRAY', converted_type: 'UTF8' }` | 44 | | `JSON` | `{ type: 'BYTE_ARRAY', converted_type: 'JSON' }` | 45 | | `TIMESTAMP` | `{ type: 'INT64', converted_type: 'TIMESTAMP_MILLIS' }` | 46 | | `UUID` | `{ type: 'FIXED_LEN_BYTE_ARRAY', type_length: 16, logical_type: { type: 'UUID' } }` | 47 | | `FLOAT16` | `{ type: 'FIXED_LEN_BYTE_ARRAY', type_length: 2, logical_type: { type: 'FLOAT16' } }` | 48 | | `GEOMETRY` | `{ type: 'BYTE_ARRAY', logical_type: { type: 'GEOMETRY' } }` | 49 | | `GEOGRAPHY` | `{ type: 'BYTE_ARRAY', logical_type: { type: 'GEOGRAPHY' } }` | 50 | 51 | More types are supported but require defining the `schema` explicitly. See the [advanced usage](#advanced-usage) section for more details. 52 | 53 | ### Write to Local Parquet File (nodejs) 54 | 55 | To write a local parquet file in Node.js, use `parquetWriteFile` with arguments `filename` and `columnData`: 56 | 57 | ```javascript 58 | const { parquetWriteFile } = await import('hyparquet-writer') 59 | 60 | parquetWriteFile({ 61 | filename: 'example.parquet', 62 | columnData: [ 63 | { name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'STRING' }, 64 | { name: 'age', data: [25, 30, 35], type: 'INT32' }, 65 | ], 66 | }) 67 | ``` 68 | 69 | Note: hyparquet-writer is published as an ES module, so dynamic `import()` may be required on the command line.
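As a quick sanity check, a buffer written with `parquetWriteBuffer` can be read straight back using [hyparquet](https://github.com/hyparam/hyparquet) (a minimal sketch, assuming hyparquet is also installed):

```javascript
import { parquetReadObjects } from 'hyparquet'
import { parquetWriteBuffer } from 'hyparquet-writer'

const buffer = parquetWriteBuffer({
  columnData: [
    { name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'STRING' },
    { name: 'age', data: [25, 30, 35], type: 'INT32' },
  ],
})
// read the in-memory buffer back into row objects
const rows = await parquetReadObjects({ file: buffer })
// rows[0] is { name: 'Alice', age: 25 }
```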
70 | 71 | ## Advanced Usage 72 | 73 | Options can be passed to `parquetWrite` to adjust parquet file writing behavior: 74 | 75 | - `writer`: a generic writer object 76 | - `schema`: parquet schema object (optional) 77 | - `codec`: compression codec to use (default `SNAPPY`) 78 | - `compressors`: custom compressors, keyed by codec name 79 | - `statistics`: write column statistics (default true) 80 | - `rowGroupSize`: number of rows in each row group (default 100000) 81 | - `kvMetadata`: extra key-value metadata to be stored in the parquet footer 82 | 83 | ```javascript 84 | import { ByteWriter, parquetWrite } from 'hyparquet-writer' 85 | import { snappyCompress } from 'hysnappy' 86 | 87 | const writer = new ByteWriter() 88 | parquetWrite({ 89 | writer, 90 | columnData: [ 91 | { name: 'name', data: ['Alice', 'Bob', 'Charlie'] }, 92 | { name: 'age', data: [25, 30, 35] }, 93 | { name: 'dob', data: [new Date(1000000), new Date(2000000), new Date(3000000)] }, 94 | ], 95 | // explicit schema: 96 | schema: [ 97 | { name: 'root', num_children: 3 }, 98 | { name: 'name', type: 'BYTE_ARRAY', converted_type: 'UTF8' }, 99 | { name: 'age', type: 'FIXED_LEN_BYTE_ARRAY', type_length: 4, converted_type: 'DECIMAL', scale: 2, precision: 4 }, 100 | { name: 'dob', type: 'INT32', converted_type: 'DATE' }, 101 | ], 102 | compressors: { SNAPPY: snappyCompress }, // high performance wasm compressor 103 | statistics: false, 104 | rowGroupSize: 1000, 105 | kvMetadata: [ 106 | { key: 'key1', value: 'value1' }, 107 | { key: 'key2', value: 'value2' }, 108 | ], 109 | }) 110 | const arrayBuffer = writer.getBuffer() 111 | ``` 112 | 113 | ### Types 114 | 115 | Parquet requires an explicit schema to be defined. You can provide schema information in three ways: 116 | 117 | 1. **Type**: You can provide a `type` in the `columnData` elements; it will be used as the schema type. 118 | 2. **Schema**: You can provide a `schema` parameter that explicitly defines the parquet schema. The schema should be an array of `SchemaElement` objects (see [parquet-format](https://github.com/apache/parquet-format)), each containing the following properties: 119 | - `name`: column name 120 | - `type`: parquet type 121 | - `num_children`: number of children in a nested parquet schema (optional) 122 | - `converted_type`: parquet converted type (optional) 123 | - `logical_type`: parquet logical type (optional) 124 | - `repetition_type`: parquet repetition type (optional) 125 | - `type_length`: length for `FIXED_LEN_BYTE_ARRAY` type (optional) 126 | - `scale`: the scale factor for `DECIMAL` converted types (optional) 127 | - `precision`: the precision for `DECIMAL` converted types (optional) 128 | - `field_id`: the field id for the column (optional) 129 | 3. **Auto-detect**: If you provide no type or schema, the type will be auto-detected from the data. However, it is recommended that you provide type information when possible (zero rows would throw an exception, floats might be typed as int, etc.). 130 | 131 | Most converted types will also be auto-detected if you just provide data with no types, but providing explicit type information avoids these pitfalls. 132 | 133 | #### Schema Overrides 134 | 135 | You can use mostly automatic schema detection, but override the schema for specific columns. This is useful if most of the column types can be automatically determined, but you want to use a specific schema element for one particular column. 
136 | 137 | ```javascript 138 | const { ByteWriter, parquetWrite, schemaFromColumnData } = await import("hyparquet-writer") 139 | 140 | const columnData = [ 141 | { name: 'unsigned_int', data: [1000000, 2000000] }, 142 | { name: 'signed_int', data: [1000000, 2000000] }, 143 | ] 144 | const writer = new ByteWriter() 145 | parquetWrite({ 146 | writer, 147 | columnData, 148 | // override schema for uint column 149 | schema: schemaFromColumnData({ 150 | columnData, 151 | schemaOverrides: { 152 | unsigned_int: { 153 | name: 'unsigned_int', 154 | type: 'INT32', 155 | converted_type: 'UINT_32', 156 | repetition_type: 'REQUIRED', 157 | }, 158 | }, 159 | }), 160 | }) 161 | ``` 162 | 163 | ## References 164 | 165 | - https://github.com/hyparam/hyparquet 166 | - https://github.com/hyparam/hyparquet-compressors 167 | - https://github.com/apache/parquet-format 168 | - https://github.com/apache/parquet-testing 169 | -------------------------------------------------------------------------------- /src/datapage.js: -------------------------------------------------------------------------------- 1 | import { Encodings, PageTypes } from 'hyparquet/src/constants.js' 2 | import { ByteWriter } from './bytewriter.js' 3 | import { deltaBinaryPack, deltaByteArray, deltaLengthByteArray } from './delta.js' 4 | import { writeRleBitPackedHybrid } from './encoding.js' 5 | import { writePlain } from './plain.js' 6 | import { writeByteStreamSplit } from './splitstream.js' 7 | import { serializeTCompactProtocol } from './thrift.js' 8 | import { getMaxDefinitionLevel, getMaxRepetitionLevel } from './schema.js' 9 | 10 | /** 11 | * @param {Writer} writer 12 | * @param {DecodedArray} values 13 | * @param {ColumnEncoder} column 14 | * @param {Encoding} encoding 15 | * @param {PageData} [listValues] 16 | */ 17 | export function writeDataPageV2(writer, values, column, encoding, listValues) { 18 | const { columnName, element, codec, compressors } = column 19 | const { type, type_length, repetition_type } = element 20 | 21 | if (!type) throw new Error(`column ${columnName} cannot determine type`) 22 | if (repetition_type === 'REPEATED') throw new Error(`column ${columnName} repeated types not supported`) 23 | 24 | // write levels to temp buffer 25 | const levelWriter = new ByteWriter() 26 | const { 27 | definition_levels_byte_length, 28 | repetition_levels_byte_length, 29 | num_nulls, 30 | num_values, 31 | } = writeLevels(levelWriter, column, values, listValues) 32 | 33 | const nonnull = values.filter(v => v !== null && v !== undefined) 34 | 35 | // write page data to temp buffer 36 | const page = new ByteWriter() 37 | if (encoding === 'PLAIN') { 38 | writePlain(page, nonnull, type, type_length) 39 | } else if (encoding === 'RLE') { 40 | if (type !== 'BOOLEAN') throw new Error('RLE encoding only supported for BOOLEAN type') 41 | const rleData = new ByteWriter() 42 | writeRleBitPackedHybrid(rleData, nonnull, 1) 43 | page.appendUint32(rleData.offset) // prepend byte length 44 | page.appendBuffer(rleData.getBuffer()) 45 | } else if (encoding === 'PLAIN_DICTIONARY' || encoding === 'RLE_DICTIONARY') { 46 | // find max bitwidth 47 | let maxValue = 0 48 | for (const v of values) if (v > maxValue) maxValue = v 49 | const bitWidth = Math.ceil(Math.log2(maxValue + 1)) 50 | page.appendUint8(bitWidth) // prepend bitWidth 51 | writeRleBitPackedHybrid(page, nonnull, bitWidth) 52 | } else if (encoding === 'DELTA_BINARY_PACKED') { 53 | if (type !== 'INT32' && type !== 'INT64') { 54 | throw new Error('DELTA_BINARY_PACKED encoding only supported 
for INT32 and INT64 types') 55 | } 56 | deltaBinaryPack(page, nonnull) 57 | } else if (encoding === 'DELTA_LENGTH_BYTE_ARRAY') { 58 | if (type !== 'BYTE_ARRAY') { 59 | throw new Error('DELTA_LENGTH_BYTE_ARRAY encoding only supported for BYTE_ARRAY type') 60 | } 61 | deltaLengthByteArray(page, nonnull) 62 | } else if (encoding === 'DELTA_BYTE_ARRAY') { 63 | if (type !== 'BYTE_ARRAY') { 64 | throw new Error('DELTA_BYTE_ARRAY encoding only supported for BYTE_ARRAY type') 65 | } 66 | deltaByteArray(page, nonnull) 67 | } else if (encoding === 'BYTE_STREAM_SPLIT') { 68 | writeByteStreamSplit(page, nonnull, type, type_length) 69 | } else { 70 | throw new Error(`parquet unsupported encoding: ${encoding}`) 71 | } 72 | 73 | // compress page data 74 | const pageBuffer = new Uint8Array(page.getBuffer()) 75 | const compressedBytes = compressors[codec]?.(pageBuffer) ?? pageBuffer 76 | 77 | // write page header 78 | writePageHeader(writer, { 79 | type: 'DATA_PAGE_V2', 80 | uncompressed_page_size: levelWriter.offset + page.offset, 81 | compressed_page_size: levelWriter.offset + compressedBytes.length, 82 | data_page_header_v2: { 83 | num_values, 84 | num_nulls, 85 | num_rows: values.length, 86 | encoding, 87 | definition_levels_byte_length, 88 | repetition_levels_byte_length, 89 | is_compressed: !!codec, 90 | }, 91 | }) 92 | 93 | // write levels 94 | writer.appendBuffer(levelWriter.getBuffer()) 95 | 96 | // write page data 97 | writer.appendBytes(compressedBytes) 98 | } 99 | 100 | /** 101 | * @param {Writer} writer 102 | * @param {PageHeader} header 103 | */ 104 | export function writePageHeader(writer, header) { 105 | /** @type {ThriftObject} */ 106 | const compact = { 107 | field_1: PageTypes.indexOf(header.type), 108 | field_2: header.uncompressed_page_size, 109 | field_3: header.compressed_page_size, 110 | field_4: header.crc, 111 | field_5: header.data_page_header && { 112 | field_1: header.data_page_header.num_values, 113 | field_2: Encodings.indexOf(header.data_page_header.encoding), 114 | field_3: Encodings.indexOf(header.data_page_header.definition_level_encoding), 115 | field_4: Encodings.indexOf(header.data_page_header.repetition_level_encoding), 116 | // field_5: header.data_page_header.statistics, 117 | }, 118 | field_7: header.dictionary_page_header && { 119 | field_1: header.dictionary_page_header.num_values, 120 | field_2: Encodings.indexOf(header.dictionary_page_header.encoding), 121 | }, 122 | field_8: header.data_page_header_v2 && { 123 | field_1: header.data_page_header_v2.num_values, 124 | field_2: header.data_page_header_v2.num_nulls, 125 | field_3: header.data_page_header_v2.num_rows, 126 | field_4: Encodings.indexOf(header.data_page_header_v2.encoding), 127 | field_5: header.data_page_header_v2.definition_levels_byte_length, 128 | field_6: header.data_page_header_v2.repetition_levels_byte_length, 129 | field_7: header.data_page_header_v2.is_compressed ? 
undefined : false, // default true 130 | }, 131 | } 132 | serializeTCompactProtocol(writer, compact) 133 | } 134 | 135 | /** 136 | * @import {DecodedArray, Encoding, PageHeader} from 'hyparquet' 137 | * @import {ColumnEncoder, PageData, ThriftObject, Writer} from '../src/types.js' 138 | * @param {Writer} writer 139 | * @param {ColumnEncoder} column 140 | * @param {DecodedArray} values 141 | * @param {PageData} [listValues] 142 | * @returns {{ 143 | * definition_levels_byte_length: number 144 | * repetition_levels_byte_length: number 145 | * num_nulls: number 146 | * num_values: number 147 | * }} 148 | */ 149 | function writeLevels(writer, column, values, listValues) { 150 | const { schemaPath } = column 151 | const definitionLevels = listValues?.definitionLevels 152 | const repetitionLevels = listValues?.repetitionLevels 153 | 154 | let num_nulls = listValues?.numNulls ?? 0 155 | let num_values = definitionLevels?.length ?? values.length 156 | 157 | const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath) 158 | let repetition_levels_byte_length = 0 159 | if (maxRepetitionLevel) { 160 | const bitWidth = Math.ceil(Math.log2(maxRepetitionLevel + 1)) 161 | const reps = repetitionLevels ?? [] 162 | repetition_levels_byte_length = writeRleBitPackedHybrid(writer, reps, bitWidth) 163 | } 164 | 165 | // definition levels 166 | const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath) 167 | let definition_levels_byte_length = 0 168 | if (maxDefinitionLevel) { 169 | const bitWidth = Math.ceil(Math.log2(maxDefinitionLevel + 1)) 170 | const defs = definitionLevels ?? (() => { 171 | const generated = [] 172 | for (const value of values) { 173 | if (value === null || value === undefined) { 174 | generated.push(maxDefinitionLevel - 1) 175 | num_nulls++ 176 | } else { 177 | generated.push(maxDefinitionLevel) 178 | } 179 | } 180 | num_values = generated.length 181 | return generated 182 | })() 183 | 184 | if (definitionLevels && listValues === undefined) { 185 | num_nulls = definitionLevels.reduce( 186 | (count, def) => def === maxDefinitionLevel ? count : count + 1, 187 | 0 188 | ) 189 | } 190 | 191 | definition_levels_byte_length = writeRleBitPackedHybrid(writer, defs, bitWidth) 192 | } else { 193 | num_nulls = values.filter(value => value === null || value === undefined).length 194 | } 195 | return { definition_levels_byte_length, repetition_levels_byte_length, num_nulls, num_values } 196 | } 197 | -------------------------------------------------------------------------------- /test/delta.test.js: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from 'vitest' 2 | import { ByteWriter } from '../src/bytewriter.js' 3 | import { deltaBinaryPack, deltaByteArray, deltaLengthByteArray } from '../src/delta.js' 4 | import { deltaBinaryUnpack, deltaByteArray as deltaByteArrayRead, deltaLengthByteArray as deltaLengthByteArrayRead } from 'hyparquet/src/delta.js' 5 | 6 | /** 7 | * Round-trip test for deltaBinaryPack with Int32Array output. 
8 | * 9 | * @param {number[]} values 10 | * @returns {number[]} 11 | */ 12 | function roundTripInt32(values) { 13 | const writer = new ByteWriter() 14 | deltaBinaryPack(writer, values) 15 | const buffer = writer.getBuffer() 16 | const reader = { view: new DataView(buffer), offset: 0 } 17 | 18 | const output = new Int32Array(values.length) 19 | deltaBinaryUnpack(reader, values.length, output) 20 | return Array.from(output) 21 | } 22 | 23 | /** 24 | * Round-trip test for deltaBinaryPack with BigInt64Array output. 25 | * 26 | * @param {bigint[]} values 27 | * @returns {bigint[]} 28 | */ 29 | function roundTripBigInt(values) { 30 | const writer = new ByteWriter() 31 | deltaBinaryPack(writer, values) 32 | const buffer = writer.getBuffer() 33 | const reader = { view: new DataView(buffer), offset: 0 } 34 | 35 | const output = new BigInt64Array(values.length) 36 | deltaBinaryUnpack(reader, values.length, output) 37 | return Array.from(output) 38 | } 39 | 40 | /** 41 | * Round-trip test for deltaLengthByteArray. 42 | * 43 | * @param {Uint8Array[]} values 44 | * @returns {Uint8Array[]} 45 | */ 46 | function roundTripLengthByteArray(values) { 47 | const writer = new ByteWriter() 48 | deltaLengthByteArray(writer, values) 49 | const buffer = writer.getBuffer() 50 | const reader = { view: new DataView(buffer), offset: 0 } 51 | 52 | /** @type {Uint8Array[]} */ 53 | const output = new Array(values.length) 54 | deltaLengthByteArrayRead(reader, values.length, output) 55 | return output 56 | } 57 | 58 | /** 59 | * Round-trip test for deltaByteArray. 60 | * 61 | * @param {Uint8Array[]} values 62 | * @returns {Uint8Array[]} 63 | */ 64 | function roundTripByteArray(values) { 65 | const writer = new ByteWriter() 66 | deltaByteArray(writer, values) 67 | const buffer = writer.getBuffer() 68 | const reader = { view: new DataView(buffer), offset: 0 } 69 | 70 | /** @type {Uint8Array[]} */ 71 | const output = new Array(values.length) 72 | deltaByteArrayRead(reader, values.length, output) 73 | return output 74 | } 75 | 76 | describe('deltaBinaryPack', () => { 77 | it('should round-trip empty array', () => { 78 | const decoded = roundTripInt32([]) 79 | expect(decoded).toEqual([]) 80 | }) 81 | 82 | it('should round-trip single value', () => { 83 | const decoded = roundTripInt32([42]) 84 | expect(decoded).toEqual([42]) 85 | }) 86 | 87 | it('should round-trip monotonically increasing values', () => { 88 | const original = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 89 | const decoded = roundTripInt32(original) 90 | expect(decoded).toEqual(original) 91 | }) 92 | 93 | it('should round-trip constant values', () => { 94 | const original = Array(100).fill(42) 95 | const decoded = roundTripInt32(original) 96 | expect(decoded).toEqual(original) 97 | }) 98 | 99 | it('should round-trip negative deltas', () => { 100 | const original = [100, 90, 80, 70, 60, 50, 40, 30, 20, 10] 101 | const decoded = roundTripInt32(original) 102 | expect(decoded).toEqual(original) 103 | }) 104 | 105 | it('should round-trip mixed deltas', () => { 106 | const original = [0, 5, 3, 8, 2, 9, 1, 7, 4, 6] 107 | const decoded = roundTripInt32(original) 108 | expect(decoded).toEqual(original) 109 | }) 110 | 111 | it('should round-trip values spanning multiple blocks', () => { 112 | // More than 128 values to test multiple blocks 113 | const original = Array.from({ length: 300 }, (_, i) => i * 2) 114 | const decoded = roundTripInt32(original) 115 | expect(decoded).toEqual(original) 116 | }) 117 | 118 | it('should round-trip large values', () => { 119 | const original = 
[1000000, 1000001, 1000002, 1000003] 120 | const decoded = roundTripInt32(original) 121 | expect(decoded).toEqual(original) 122 | }) 123 | 124 | it('should round-trip negative values', () => { 125 | const original = [-10, -5, 0, 5, 10] 126 | const decoded = roundTripInt32(original) 127 | expect(decoded).toEqual(original) 128 | }) 129 | 130 | it('should round-trip bigint values', () => { 131 | const original = [1n, 2n, 3n, 4n, 5n] 132 | const decoded = roundTripBigInt(original) 133 | expect(decoded).toEqual(original) 134 | }) 135 | 136 | it('should round-trip large bigint values', () => { 137 | const original = [10000000000n, 10000000001n, 10000000002n] 138 | const decoded = roundTripBigInt(original) 139 | expect(decoded).toEqual(original) 140 | }) 141 | 142 | it('should round-trip random values', () => { 143 | const original = Array.from({ length: 200 }, () => Math.floor(Math.random() * 10000)) 144 | const decoded = roundTripInt32(original) 145 | expect(decoded).toEqual(original) 146 | }) 147 | 148 | it('should throw for unsupported types', () => { 149 | const writer = new ByteWriter() 150 | expect(() => deltaBinaryPack(writer, ['string'])).toThrow('deltaBinaryPack only supports number or bigint arrays') 151 | }) 152 | 153 | it('should handle values requiring bit flush at end of miniblock', () => { 154 | // Values with varying bit widths to exercise the bitsUsed > 0 flush path 155 | const original = Array.from({ length: 32 }, (_, i) => i * 7) 156 | const decoded = roundTripInt32(original) 157 | expect(decoded).toEqual(original) 158 | }) 159 | }) 160 | 161 | describe('deltaLengthByteArray', () => { 162 | it('should round-trip empty array', () => { 163 | const decoded = roundTripLengthByteArray([]) 164 | expect(decoded).toEqual([]) 165 | }) 166 | 167 | it('should round-trip single byte array', () => { 168 | const original = [new Uint8Array([1, 2, 3])] 169 | const decoded = roundTripLengthByteArray(original) 170 | expect(decoded.length).toBe(1) 171 | expect(Array.from(decoded[0])).toEqual([1, 2, 3]) 172 | }) 173 | 174 | it('should round-trip multiple byte arrays', () => { 175 | const original = [ 176 | new Uint8Array([1, 2, 3]), 177 | new Uint8Array([4, 5]), 178 | new Uint8Array([6, 7, 8, 9]), 179 | ] 180 | const decoded = roundTripLengthByteArray(original) 181 | expect(decoded.length).toBe(3) 182 | expect(Array.from(decoded[0])).toEqual([1, 2, 3]) 183 | expect(Array.from(decoded[1])).toEqual([4, 5]) 184 | expect(Array.from(decoded[2])).toEqual([6, 7, 8, 9]) 185 | }) 186 | 187 | it('should round-trip strings as byte arrays', () => { 188 | const encoder = new TextEncoder() 189 | const original = ['hello', 'world', 'test'].map(s => encoder.encode(s)) 190 | const decoded = roundTripLengthByteArray(original) 191 | const decoder = new TextDecoder() 192 | expect(decoded.map(d => decoder.decode(d))).toEqual(['hello', 'world', 'test']) 193 | }) 194 | 195 | it('should throw for non-Uint8Array values', () => { 196 | const writer = new ByteWriter() 197 | expect(() => deltaLengthByteArray(writer, ['string'])).toThrow('deltaLengthByteArray expects Uint8Array values') 198 | }) 199 | }) 200 | 201 | describe('deltaByteArray', () => { 202 | it('should round-trip empty array', () => { 203 | const decoded = roundTripByteArray([]) 204 | expect(decoded).toEqual([]) 205 | }) 206 | 207 | it('should round-trip single byte array', () => { 208 | const original = [new Uint8Array([1, 2, 3])] 209 | const decoded = roundTripByteArray(original) 210 | expect(decoded.length).toBe(1) 211 | 
expect(Array.from(decoded[0])).toEqual([1, 2, 3]) 212 | }) 213 | 214 | it('should round-trip arrays with common prefixes', () => { 215 | const encoder = new TextEncoder() 216 | const original = ['prefix_a', 'prefix_b', 'prefix_c'].map(s => encoder.encode(s)) 217 | const decoded = roundTripByteArray(original) 218 | const decoder = new TextDecoder() 219 | expect(decoded.map(d => decoder.decode(d))).toEqual(['prefix_a', 'prefix_b', 'prefix_c']) 220 | }) 221 | 222 | it('should round-trip arrays with no common prefix', () => { 223 | const encoder = new TextEncoder() 224 | const original = ['abc', 'xyz', '123'].map(s => encoder.encode(s)) 225 | const decoded = roundTripByteArray(original) 226 | const decoder = new TextDecoder() 227 | expect(decoded.map(d => decoder.decode(d))).toEqual(['abc', 'xyz', '123']) 228 | }) 229 | 230 | it('should round-trip sorted strings efficiently', () => { 231 | const encoder = new TextEncoder() 232 | const original = ['apple', 'application', 'apply', 'banana', 'bandana'].map(s => encoder.encode(s)) 233 | const decoded = roundTripByteArray(original) 234 | const decoder = new TextDecoder() 235 | expect(decoded.map(d => decoder.decode(d))).toEqual(['apple', 'application', 'apply', 'banana', 'bandana']) 236 | }) 237 | 238 | it('should throw for non-Uint8Array first value', () => { 239 | const writer = new ByteWriter() 240 | expect(() => deltaByteArray(writer, ['string'])).toThrow('deltaByteArray expects Uint8Array values') 241 | }) 242 | 243 | it('should throw for non-Uint8Array subsequent value', () => { 244 | const writer = new ByteWriter() 245 | expect(() => deltaByteArray(writer, [new Uint8Array([1]), 'string'])).toThrow('deltaByteArray expects Uint8Array values') 246 | }) 247 | }) 248 | -------------------------------------------------------------------------------- /src/unconvert.js: -------------------------------------------------------------------------------- 1 | import { toJson } from 'hyparquet' 2 | import { geojsonToWkb } from './wkb.js' 3 | 4 | const dayMillis = 86400000 // 1 day in milliseconds 5 | /** 6 | * @import {DecodedArray, SchemaElement, Statistics} from 'hyparquet' 7 | * @import {MinMaxType} from 'hyparquet/src/types.js' 8 | * @import {ThriftObject} from '../src/types.js' 9 | */ 10 | 11 | /** 12 | * Convert from rich to primitive types. 13 | * 14 | * @param {SchemaElement} element 15 | * @param {DecodedArray} values 16 | * @returns {DecodedArray} 17 | */ 18 | export function unconvert(element, values) { 19 | const { type, converted_type: ctype, logical_type: ltype } = element 20 | if (ctype === 'DECIMAL') { 21 | const factor = 10 ** (element.scale || 0) 22 | return values.map(v => { 23 | if (v === null || v === undefined) return v 24 | if (typeof v !== 'number') throw new Error('DECIMAL must be a number') 25 | return unconvertDecimal(element, BigInt(Math.round(v * factor))) 26 | }) 27 | } 28 | if (ctype === 'DATE') { 29 | return Array.from(values).map(v => v && v.getTime() / dayMillis) 30 | } 31 | if (ctype === 'TIMESTAMP_MILLIS') { 32 | return Array.from(values).map(v => v && BigInt(v.getTime())) 33 | } 34 | if (ctype === 'TIMESTAMP_MICROS') { 35 | return Array.from(values).map(v => v && BigInt(v.getTime() * 1000)) 36 | } 37 | if (ctype === 'JSON') { 38 | if (!Array.isArray(values)) throw new Error('JSON must be an array') 39 | const encoder = new TextEncoder() 40 | return values.map(v => v === undefined ? 
undefined : encoder.encode(JSON.stringify(toJson(v))))
41 |   }
42 |   if (ctype === 'UTF8') {
43 |     if (!Array.isArray(values)) throw new Error('strings must be an array')
44 |     const encoder = new TextEncoder()
45 |     return values.map(v => typeof v === 'string' ? encoder.encode(v) : v)
46 |   }
47 |   if (ltype?.type === 'FLOAT16') {
48 |     if (type !== 'FIXED_LEN_BYTE_ARRAY') throw new Error('FLOAT16 must be FIXED_LEN_BYTE_ARRAY type')
49 |     if (element.type_length !== 2) throw new Error('FLOAT16 expected type_length to be 2 bytes')
50 |     return Array.from(values).map(unconvertFloat16)
51 |   }
52 |   if (ltype?.type === 'UUID') {
53 |     if (!Array.isArray(values)) throw new Error('UUID must be an array')
54 |     if (type !== 'FIXED_LEN_BYTE_ARRAY') throw new Error('UUID must be FIXED_LEN_BYTE_ARRAY type')
55 |     if (element.type_length !== 16) throw new Error('UUID expected type_length to be 16 bytes')
56 |     return values.map(unconvertUuid)
57 |   }
58 |   if (ltype?.type === 'GEOMETRY' || ltype?.type === 'GEOGRAPHY') {
59 |     if (!Array.isArray(values)) throw new Error('geometry must be an array')
60 |     return values.map(v => v && geojsonToWkb(v))
61 |   }
62 |   return values
63 | }
64 | 
65 | /**
66 |  * @param {Uint8Array | string | undefined} value
67 |  * @returns {Uint8Array | undefined}
68 |  */
69 | function unconvertUuid(value) {
70 |   if (value === undefined || value === null) return
71 |   if (value instanceof Uint8Array) return value
72 |   if (typeof value === 'string') {
73 |     const uuidRegex = /^[0-9a-f]{8}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{12}$/i
74 |     if (!uuidRegex.test(value)) {
75 |       throw new Error('UUID must be a valid UUID string')
76 |     }
77 |     value = value.replace(/-/g, '').toLowerCase()
78 |     const bytes = new Uint8Array(16)
79 |     for (let i = 0; i < 16; i++) {
80 |       bytes[i] = parseInt(value.slice(i * 2, i * 2 + 2), 16)
81 |     }
82 |     return bytes
83 |   }
84 |   throw new Error('UUID must be a string or Uint8Array')
85 | }
86 | 
87 | /**
88 |  * Unconvert from rich type to byte array for metadata statistics.
89 |  *
90 |  * @param {MinMaxType | undefined} value
91 |  * @param {SchemaElement} element
92 |  * @returns {Uint8Array | undefined}
93 |  */
94 | export function unconvertMinMax(value, element) {
95 |   if (value === undefined || value === null) return undefined
96 |   const { type, converted_type } = element
97 |   if (type === 'BOOLEAN') return new Uint8Array([value ? 
1 : 0]) 98 | if (converted_type === 'DECIMAL') { 99 | if (typeof value !== 'number') throw new Error('DECIMAL must be a number') 100 | const factor = 10 ** (element.scale || 0) 101 | const out = unconvertDecimal(element, BigInt(Math.round(value * factor))) 102 | if (out instanceof Uint8Array) return out 103 | if (typeof out === 'number') { 104 | const buffer = new ArrayBuffer(4) 105 | new DataView(buffer).setFloat32(0, out, true) 106 | return new Uint8Array(buffer) 107 | } 108 | if (typeof out === 'bigint') { 109 | const buffer = new ArrayBuffer(8) 110 | new DataView(buffer).setBigInt64(0, out, true) 111 | return new Uint8Array(buffer) 112 | } 113 | } 114 | if (type === 'BYTE_ARRAY' || type === 'FIXED_LEN_BYTE_ARRAY') { 115 | // truncate byte arrays to 16 bytes for statistics 116 | if (value instanceof Uint8Array) return value.slice(0, 16) 117 | return new TextEncoder().encode(value.toString().slice(0, 16)) 118 | } 119 | if (type === 'FLOAT' && typeof value === 'number') { 120 | const buffer = new ArrayBuffer(4) 121 | new DataView(buffer).setFloat32(0, value, true) 122 | return new Uint8Array(buffer) 123 | } 124 | if (type === 'DOUBLE' && typeof value === 'number') { 125 | const buffer = new ArrayBuffer(8) 126 | new DataView(buffer).setFloat64(0, value, true) 127 | return new Uint8Array(buffer) 128 | } 129 | if (type === 'INT32' && typeof value === 'number') { 130 | const buffer = new ArrayBuffer(4) 131 | new DataView(buffer).setInt32(0, value, true) 132 | return new Uint8Array(buffer) 133 | } 134 | if (type === 'INT64' && typeof value === 'bigint') { 135 | const buffer = new ArrayBuffer(8) 136 | new DataView(buffer).setBigInt64(0, value, true) 137 | return new Uint8Array(buffer) 138 | } 139 | if (type === 'INT32' && converted_type === 'DATE' && value instanceof Date) { 140 | const buffer = new ArrayBuffer(4) 141 | new DataView(buffer).setInt32(0, Math.floor(value.getTime() / dayMillis), true) 142 | return new Uint8Array(buffer) 143 | } 144 | if (type === 'INT64' && converted_type === 'TIMESTAMP_MILLIS' && value instanceof Date) { 145 | const buffer = new ArrayBuffer(8) 146 | new DataView(buffer).setBigInt64(0, BigInt(value.getTime()), true) 147 | return new Uint8Array(buffer) 148 | } 149 | throw new Error(`unsupported type for statistics: ${type} with value ${value}`) 150 | } 151 | 152 | /** 153 | * @param {Statistics} stats 154 | * @param {SchemaElement} element 155 | * @returns {ThriftObject} 156 | */ 157 | export function unconvertStatistics(stats, element) { 158 | return { 159 | field_1: unconvertMinMax(stats.max, element), 160 | field_2: unconvertMinMax(stats.min, element), 161 | field_3: stats.null_count, 162 | field_4: stats.distinct_count, 163 | field_5: unconvertMinMax(stats.max_value, element), 164 | field_6: unconvertMinMax(stats.min_value, element), 165 | field_7: stats.is_max_value_exact, 166 | field_8: stats.is_min_value_exact, 167 | } 168 | } 169 | 170 | /** 171 | * @param {SchemaElement} element 172 | * @param {bigint} value 173 | * @returns {number | bigint | Uint8Array} 174 | */ 175 | export function unconvertDecimal({ type, type_length }, value) { 176 | if (type === 'INT32') return Number(value) 177 | if (type === 'INT64') return value 178 | if (type === 'FIXED_LEN_BYTE_ARRAY' && !type_length) { 179 | throw new Error('fixed length byte array type_length is required') 180 | } 181 | if (!type_length && !value) return new Uint8Array() 182 | 183 | const bytes = [] 184 | while (true) { 185 | // extract the lowest 8 bits 186 | const byte = Number(value & 0xffn) 187 | 
bytes.unshift(byte) 188 | value >>= 8n 189 | 190 | if (type_length) { 191 | if (bytes.length >= type_length) break // fixed length 192 | } else { 193 | // for nonnegative: stop when top byte has signBit = 0 AND shifted value == 0n 194 | // for negative: stop when top byte has signBit = 1 AND shifted value == -1n 195 | const sign = byte & 0x80 196 | if (!sign && value === 0n || sign && value === -1n) { 197 | break 198 | } 199 | } 200 | } 201 | 202 | return new Uint8Array(bytes) 203 | } 204 | 205 | /** 206 | * @param {number | undefined} value 207 | * @returns {Uint8Array | undefined} 208 | */ 209 | export function unconvertFloat16(value) { 210 | if (value === undefined || value === null) return 211 | if (typeof value !== 'number') throw new Error('parquet float16 expected number value') 212 | if (Number.isNaN(value)) return new Uint8Array([0x00, 0x7e]) 213 | 214 | const sign = value < 0 || Object.is(value, -0) ? 1 : 0 215 | const abs = Math.abs(value) 216 | 217 | // infinities 218 | if (!isFinite(abs)) return new Uint8Array([0x00, sign << 7 | 0x7c]) 219 | 220 | // ±0 221 | if (abs === 0) return new Uint8Array([0x00, sign << 7]) 222 | 223 | // write as f32 to get raw bits 224 | const buf = new ArrayBuffer(4) 225 | new Float32Array(buf)[0] = abs 226 | const bits32 = new Uint32Array(buf)[0] 227 | 228 | let exp32 = bits32 >>> 23 & 0xff 229 | let mant32 = bits32 & 0x7fffff 230 | 231 | // convert 32‑bit exponent to unbiased, then to 16‑bit 232 | exp32 -= 127 233 | 234 | // handle numbers too small for a normal 16‑bit exponent 235 | if (exp32 < -14) { 236 | // sub‑normal: shift mantissa so that result = mant * 2^-14 237 | const shift = -14 - exp32 238 | mant32 = (mant32 | 0x800000) >> shift + 13 239 | 240 | // round‑to‑nearest‑even 241 | if (mant32 & 1) mant32 += 1 242 | 243 | const bits16 = sign << 15 | mant32 244 | return new Uint8Array([bits16 & 0xff, bits16 >> 8]) 245 | } 246 | 247 | // overflow 248 | if (exp32 > 15) return new Uint8Array([0x00, sign << 7 | 0x7c]) 249 | 250 | // normal number 251 | let exp16 = exp32 + 15 252 | mant32 = mant32 + 0x1000 // add rounding bit 253 | 254 | // handle mantissa overflow after rounding 255 | if (mant32 & 0x800000) { 256 | mant32 = 0 257 | if (++exp16 === 31) // became infinity 258 | return new Uint8Array([0x00, sign << 7 | 0x7c]) 259 | } 260 | 261 | const bits16 = sign << 15 | exp16 << 10 | mant32 >> 13 262 | return new Uint8Array([bits16 & 0xff, bits16 >> 8]) 263 | } 264 | -------------------------------------------------------------------------------- /test/unconvert.test.js: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from 'vitest' 2 | import { unconvert, unconvertDecimal, unconvertFloat16, unconvertMinMax } from '../src/unconvert.js' 3 | import { convertMetadata } from 'hyparquet/src/metadata.js' 4 | import { DEFAULT_PARSERS, parseFloat16 } from 'hyparquet/src/convert.js' 5 | 6 | /** 7 | * @import {SchemaElement} from 'hyparquet' 8 | */ 9 | describe('unconvert', () => { 10 | it('should return Date objects when converted_type = DATE', () => { 11 | /** @type {SchemaElement} */ 12 | const schema = { name: 'test', converted_type: 'DATE' } 13 | const input = [new Date('2020-01-01T00:00:00Z'), new Date('2021-01-01T00:00:00Z')] 14 | const result = unconvert(schema, input) 15 | expect(result).toEqual([18262, 18628]) 16 | }) 17 | 18 | it('should convert JSON objects to strings when converted_type = JSON', () => { 19 | /** @type {SchemaElement} */ 20 | const schema = { name: 'test', 
converted_type: 'JSON' } 21 | const input = [{ foo: 'bar' }, { hello: 'world' }] 22 | const result = unconvert(schema, input) 23 | 24 | // We check that result is an array of Uint8Arrays containing the JSON-encoded bytes 25 | expect(result).toHaveLength(2) 26 | expect(result[0]).toBeInstanceOf(Uint8Array) 27 | expect(new TextDecoder().decode(result[0])).toEqual(JSON.stringify({ foo: 'bar' })) 28 | expect(new TextDecoder().decode(result[1])).toEqual(JSON.stringify({ hello: 'world' })) 29 | }) 30 | 31 | it('should handle undefined values in JSON arrays', () => { 32 | /** @type {SchemaElement} */ 33 | const schema = { name: 'test', converted_type: 'JSON' } 34 | const input = [{ foo: 'bar' }, undefined, { hello: 'world' }] 35 | const result = unconvert(schema, input) 36 | 37 | expect(result).toHaveLength(3) 38 | expect(result[0]).toBeInstanceOf(Uint8Array) 39 | expect(result[1]).toBeUndefined() 40 | expect(result[2]).toBeInstanceOf(Uint8Array) 41 | expect(new TextDecoder().decode(result[0])).toEqual(JSON.stringify({ foo: 'bar' })) 42 | expect(new TextDecoder().decode(result[2])).toEqual(JSON.stringify({ hello: 'world' })) 43 | }) 44 | 45 | it('should convert string array to Uint8Array when converted_type = UTF8', () => { 46 | /** @type {SchemaElement} */ 47 | const schema = { name: 'test', converted_type: 'UTF8' } 48 | const input = ['hello', 'world'] 49 | const result = unconvert(schema, input) 50 | 51 | expect(result).toHaveLength(2) 52 | expect(result[0]).toBeInstanceOf(Uint8Array) 53 | expect(new TextDecoder().decode(result[0])).toBe('hello') 54 | expect(new TextDecoder().decode(result[1])).toBe('world') 55 | }) 56 | 57 | it('should throw an error when converted_type = UTF8 and values is not an array', () => { 58 | expect(() => unconvert( 59 | { name: 'test', converted_type: 'UTF8' }, 60 | new Uint8Array([1, 2, 3])) 61 | ).toThrow('strings must be an array') 62 | }) 63 | 64 | it('should throw an error when converted_type = JSON and values is not an array', () => { 65 | expect(() => unconvert( 66 | { name: 'test', converted_type: 'JSON' }, 67 | new Uint8Array([1, 2, 3])) 68 | ).toThrow('JSON must be an array') 69 | }) 70 | 71 | it('should return original values if there is no recognized converted_type', () => { 72 | const input = [1, 2, 3] 73 | const result = unconvert({ name: 'test' }, input) 74 | expect(result).toEqual(input) 75 | }) 76 | }) 77 | 78 | describe('unconvertMinMax', () => { 79 | it('should return undefined if value is undefined or null', () => { 80 | /** @type {SchemaElement} */ 81 | const schema = { name: 'test', type: 'INT32' } 82 | expect(unconvertMinMax(undefined, schema)).toBeUndefined() 83 | }) 84 | 85 | it('should handle BOOLEAN type', () => { 86 | /** @type {SchemaElement} */ 87 | const schema = { name: 'test', type: 'BOOLEAN' } 88 | expect(unconvertMinMax(true, schema)).toEqual(new Uint8Array([1])) 89 | expect(unconvertMinMax(false, schema)).toEqual(new Uint8Array([0])) 90 | }) 91 | 92 | it('should truncate BYTE_ARRAY or FIXED_LEN_BYTE_ARRAY to 16 bytes', () => { 93 | // longer string to test truncation 94 | const longStr = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 95 | const longStrUint8 = new TextEncoder().encode(longStr) 96 | 97 | // value is a Uint8Array 98 | const result1 = unconvertMinMax(longStrUint8, { name: 'test', type: 'BYTE_ARRAY' }) 99 | expect(result1).toBeInstanceOf(Uint8Array) 100 | expect(result1?.length).toBe(16) 101 | 102 | // value is a string 103 | const result2 = unconvertMinMax(longStr, { name: 'test', type: 'FIXED_LEN_BYTE_ARRAY' }) 104 | 
expect(result2).toBeInstanceOf(Uint8Array) 105 | expect(result2?.length).toBe(16) 106 | }) 107 | 108 | it('should correctly encode FLOAT values in little-endian', () => { 109 | /** @type {SchemaElement} */ 110 | const schema = { name: 'test', type: 'FLOAT' } 111 | const value = 1.5 112 | const result = unconvertMinMax(value, schema) 113 | expect(result).toBeInstanceOf(Uint8Array) 114 | const roundtrip = convertMetadata(result, schema, DEFAULT_PARSERS) 115 | expect(roundtrip).toEqual(1.5) 116 | }) 117 | 118 | it('should correctly encode DOUBLE values in little-endian', () => { 119 | /** @type {SchemaElement} */ 120 | const schema = { name: 'test', type: 'DOUBLE' } 121 | const value = 1.123456789 122 | const result = unconvertMinMax(value, schema) 123 | expect(result).toBeInstanceOf(Uint8Array) 124 | const roundtrip = convertMetadata(result, schema, DEFAULT_PARSERS) 125 | expect(roundtrip).toEqual(1.123456789) 126 | }) 127 | 128 | it('should correctly encode INT32 values in little-endian', () => { 129 | /** @type {SchemaElement} */ 130 | const schema = { name: 'test', type: 'INT32' } 131 | const value = 123456 132 | const result = unconvertMinMax(value, schema) 133 | const roundtrip = convertMetadata(result, schema, DEFAULT_PARSERS) 134 | expect(roundtrip).toEqual(123456) 135 | }) 136 | 137 | it('should correctly encode INT64 values when given a bigint', () => { 138 | /** @type {SchemaElement} */ 139 | const schema = { name: 'test', type: 'INT64' } 140 | const value = 1234567890123456789n 141 | const result = unconvertMinMax(value, schema) 142 | const roundtrip = convertMetadata(result, schema, DEFAULT_PARSERS) 143 | expect(roundtrip).toEqual(1234567890123456789n) 144 | }) 145 | 146 | it('should correctly encode a Date as TIMESTAMP_MILLIS for INT64', () => { 147 | /** @type {SchemaElement} */ 148 | const schema = { name: 'test', type: 'INT64', converted_type: 'TIMESTAMP_MILLIS' } 149 | const date = new Date('2023-01-01T00:00:00Z') 150 | const result = unconvertMinMax(date, schema) 151 | const roundtrip = convertMetadata(result, schema, DEFAULT_PARSERS) 152 | expect(roundtrip).toEqual(date) 153 | }) 154 | 155 | it('should throw an error for unsupported types', () => { 156 | /** @type {SchemaElement} */ 157 | const schema = { name: 'test', type: 'INT96' } 158 | expect(() => unconvertMinMax(123, schema)) 159 | .toThrow('unsupported type for statistics: INT96 with value 123') 160 | }) 161 | 162 | it('should throw an error for INT64 if value is a number instead of bigint or Date', () => { 163 | /** @type {SchemaElement} */ 164 | const schema = { name: 'test', type: 'INT64' } 165 | expect(() => unconvertMinMax(123, schema)) 166 | .toThrow('unsupported type for statistics: INT64 with value 123') 167 | }) 168 | }) 169 | 170 | describe('unconvertDecimal', () => { 171 | const examples = [ 172 | { input: 0n, expected: new Uint8Array([]) }, 173 | { input: 1n, expected: new Uint8Array([0x01]) }, 174 | { input: -1n, expected: new Uint8Array([0xff]) }, 175 | { input: 1234n, expected: new Uint8Array([0x04, 0xd2]) }, 176 | { input: -1234n, expected: new Uint8Array([0xfb, 0x2e]) }, 177 | { input: 1234567890123456789n, expected: new Uint8Array([0x11, 0x22, 0x10, 0xf4, 0x7d, 0xe9, 0x81, 0x15]) }, 178 | { input: -1234567890123456789n, expected: new Uint8Array([0xee, 0xdd, 0xef, 0x0b, 0x82, 0x16, 0x7e, 0xeb]) }, 179 | ] 180 | /** @type {SchemaElement} */ 181 | const element = { 182 | name: 'col', 183 | type: 'BYTE_ARRAY', 184 | } 185 | 186 | it.for(examples)('should convert %p', ({ input, expected }) => { 187 | 
expect(parseDecimal(expected)).toEqual(input) 188 | }) 189 | 190 | it.for(examples)('should unconvert %p', ({ input, expected }) => { 191 | expect(unconvertDecimal(element, input)).toEqual(expected) 192 | }) 193 | 194 | it.for(examples)('should roundtrip %p', ({ input }) => { 195 | const byteArray = unconvertDecimal(element, input) 196 | if (!(byteArray instanceof Uint8Array)) throw new Error('expected Uint8Array') 197 | expect(parseDecimal(byteArray)).toEqual(input) 198 | }) 199 | 200 | it.for(examples)('should reverse roundtrip %p', ({ expected }) => { 201 | expect(unconvertDecimal(element, parseDecimal(expected))).toEqual(expected) 202 | }) 203 | 204 | it('convert to INT32', () => { 205 | expect(unconvertDecimal({ name: 'col', type: 'INT32' }, 1234n)).toEqual(1234) 206 | }) 207 | 208 | it('convert to INT64', () => { 209 | expect(unconvertDecimal({ name: 'col', type: 'INT64' }, 1234n)).toEqual(1234n) 210 | }) 211 | 212 | it('throws if fixed length is not specified', () => { 213 | expect(() => unconvertDecimal({ name: 'col', type: 'FIXED_LEN_BYTE_ARRAY' }, 1234n)) 214 | .toThrow('fixed length byte array type_length is required') 215 | }) 216 | }) 217 | 218 | describe('unconvertFloat16', () => { 219 | it('should convert number to Float16 array', () => { 220 | expect(unconvertFloat16(undefined)).toBeUndefined() 221 | expect(unconvertFloat16(0)).toEqual(new Uint8Array([0x00, 0x00])) 222 | expect(unconvertFloat16(-0)).toEqual(new Uint8Array([0x00, 0x80])) 223 | expect(unconvertFloat16(NaN)).toEqual(new Uint8Array([0x00, 0x7e])) 224 | expect(unconvertFloat16(Infinity)).toEqual(new Uint8Array([0x00, 0x7c])) 225 | expect(unconvertFloat16(-Infinity)).toEqual(new Uint8Array([0x00, 0xfc])) 226 | expect(unconvertFloat16(0.5)).toEqual(new Uint8Array([0x00, 0x38])) 227 | expect(unconvertFloat16(-0.5)).toEqual(new Uint8Array([0x00, 0xb8])) 228 | expect(unconvertFloat16(1)).toEqual(new Uint8Array([0x00, 0x3c])) 229 | expect(unconvertFloat16(-1)).toEqual(new Uint8Array([0x00, 0xbc])) 230 | expect(unconvertFloat16(0.000244140625)).toEqual(new Uint8Array([0x00, 0x0c])) 231 | // largest normal 232 | expect(unconvertFloat16(65504)).toEqual(new Uint8Array([0xff, 0x7b])) 233 | expect(unconvertFloat16(65505)).toEqual(new Uint8Array([0xff, 0x7b])) 234 | // subnormal 235 | expect(unconvertFloat16(2 ** -24)).toEqual(new Uint8Array([0x02, 0x00])) 236 | // mantissa overflow 237 | expect(unconvertFloat16(2047.9999)).toEqual(new Uint8Array([0x00, 0x68])) 238 | }) 239 | 240 | it('should round-trip Float16', () => { 241 | expect(parseFloat16(unconvertFloat16(0))).toEqual(0) 242 | expect(parseFloat16(unconvertFloat16(-0))).toEqual(-0) 243 | expect(parseFloat16(unconvertFloat16(NaN))).toEqual(NaN) 244 | expect(parseFloat16(unconvertFloat16(Infinity))).toEqual(Infinity) 245 | expect(parseFloat16(unconvertFloat16(-Infinity))).toEqual(-Infinity) 246 | expect(parseFloat16(unconvertFloat16(0.5))).toEqual(0.5) 247 | expect(parseFloat16(unconvertFloat16(-0.5))).toEqual(-0.5) 248 | expect(parseFloat16(unconvertFloat16(1))).toEqual(1) 249 | expect(parseFloat16(unconvertFloat16(-1))).toEqual(-1) 250 | expect(parseFloat16(unconvertFloat16(65504))).toEqual(65504) 251 | expect(parseFloat16(unconvertFloat16(0.000244140625))).toEqual(0.000244140625) 252 | }) 253 | }) 254 | 255 | /** 256 | * BigInt parseDecimal 257 | * @param {Uint8Array} bytes 258 | * @returns {bigint} 259 | */ 260 | function parseDecimal(bytes) { 261 | let value = 0n 262 | for (const byte of bytes) { 263 | value = value * 256n + BigInt(byte) 264 | } 265 | 266 | // 
handle signed 267 | const bits = BigInt(bytes.length) * 8n 268 | if (bits && value >= 2n ** (bits - 1n)) { 269 | value -= 2n ** bits 270 | } 271 | 272 | return value 273 | } 274 | -------------------------------------------------------------------------------- /src/column.js: -------------------------------------------------------------------------------- 1 | import { ByteWriter } from './bytewriter.js' 2 | import { writeDataPageV2, writePageHeader } from './datapage.js' 3 | import { encodeListValues } from './dremel.js' 4 | import { geospatialStatistics } from './geospatial.js' 5 | import { writePlain } from './plain.js' 6 | import { unconvert, unconvertMinMax } from './unconvert.js' 7 | 8 | /** 9 | * Write a column chunk to the writer. 10 | * 11 | * @param {object} options 12 | * @param {Writer} options.writer 13 | * @param {ColumnEncoder} options.column 14 | * @param {DecodedArray} options.values 15 | * @returns {{ chunk: ColumnChunk, columnIndex?: ColumnIndex, offsetIndex?: OffsetIndex }} 16 | */ 17 | export function writeColumn({ writer, column, values }) { 18 | const { columnName, element, schemaPath, stats, pageSize, encoding: userEncoding } = column 19 | const { type, type_length } = element 20 | if (!type) throw new Error(`column ${columnName} cannot determine type`) 21 | const offsetStart = writer.offset 22 | 23 | /** @type {PageData | undefined} */ 24 | let pageData 25 | if (isListLike(schemaPath)) { 26 | if (!Array.isArray(values)) { 27 | throw new Error(`parquet column ${columnName} expects array values for list encoding`) 28 | } 29 | pageData = encodeListValues(schemaPath, values) 30 | values = pageData.values 31 | } 32 | 33 | const num_values = values.length 34 | /** @type {Encoding[]} */ 35 | const encodings = [] 36 | 37 | const isGeospatial = element?.logical_type?.type === 'GEOMETRY' || element?.logical_type?.type === 'GEOGRAPHY' 38 | 39 | // Compute statistics 40 | const statistics = stats ? getStatistics(values) : undefined 41 | const geospatial_statistics = stats && isGeospatial ? geospatialStatistics(values) : undefined 42 | 43 | // dictionary encoding 44 | let dictionary_page_offset 45 | let data_page_offset = BigInt(writer.offset) 46 | const dictionary = useDictionary(values, type, userEncoding) 47 | 48 | // Determine encoding and prepare values for writing 49 | /** @type {Encoding} */ 50 | let encoding 51 | let writeValues 52 | if (dictionary) { 53 | // replace values with dictionary indices 54 | const indexes = new Array(values.length) 55 | for (let i = 0; i < values.length; i++) { 56 | if (values[i] !== null && values[i] !== undefined) { 57 | indexes[i] = dictionary.indexOf(values[i]) 58 | } 59 | } 60 | writeValues = indexes 61 | encoding = 'RLE_DICTIONARY' 62 | 63 | // write dictionary page first 64 | dictionary_page_offset = BigInt(writer.offset) 65 | const unconverted = unconvert(element, dictionary) 66 | writeDictionaryPage(writer, column, unconverted) 67 | } else { 68 | // unconvert values from rich types to simple 69 | writeValues = unconvert(element, values) 70 | encoding = userEncoding ?? (type === 'BOOLEAN' && values.length > 16 ? 'RLE' : 'PLAIN') 71 | } 72 | encodings.push(encoding) 73 | 74 | // Split values into pages based on pageSize 75 | const pageBoundaries = getPageBoundaries(writeValues, type, type_length, pageSize) 76 | 77 | // Initialize index structures if requested 78 | /** @type {ColumnIndex | undefined} */ 79 | const columnIndex = column.columnIndex ? 
{
80 |     null_pages: [],
81 |     min_values: [],
82 |     max_values: [],
83 |     boundary_order: 'UNORDERED',
84 |     null_counts: [],
85 |   } : undefined
86 |   /** @type {OffsetIndex | undefined} */
87 |   const offsetIndex = column.offsetIndex ? {
88 |     page_locations: [],
89 |   } : undefined
90 | 
91 |   // Write data pages
92 |   data_page_offset = BigInt(writer.offset)
93 |   let firstRowIndex = 0n
94 |   let prevMaxValue
95 |   let ascending = true
96 |   let descending = true
97 | 
98 |   for (const { start, end } of pageBoundaries) {
99 |     const chunk = createPageChunk(writeValues, pageData, start, end)
100 |     const pageOffset = writer.offset
101 | 
102 |     writeDataPageV2(writer, chunk.values, column, encoding, chunk.pageData)
103 | 
104 |     // Track page info for indexes
105 |     const pageRows = BigInt(end - start)
106 |     if (columnIndex) {
107 |       const originalSlice = values.slice(start, end)
108 |       const pageStats = getStatistics(originalSlice)
109 |       const nullCount = pageStats.null_count ?? 0n
110 | 
111 |       columnIndex.null_pages.push(nullCount === pageRows)
112 |       const currMin = unconvertMinMax(pageStats.min_value, element)
113 |       const currMax = unconvertMinMax(pageStats.max_value, element)
114 |       // Spec: for all-null pages the min/max entries are set to "byte[0]"; push 0 as a placeholder
115 |       columnIndex.min_values.push(currMin ?? 0)
116 |       columnIndex.max_values.push(currMax ?? 0)
117 |       columnIndex.null_counts?.push(nullCount)
118 | 
119 |       // Track boundary order
120 |       if (prevMaxValue !== undefined && currMin !== undefined) {
121 |         if (prevMaxValue > currMin) ascending = false
122 |         if (prevMaxValue < currMin) descending = false
123 |       }
124 |       prevMaxValue = currMax
125 |     }
126 |     if (offsetIndex) {
127 |       offsetIndex.page_locations.push({
128 |         offset: BigInt(pageOffset),
129 |         compressed_page_size: writer.offset - pageOffset,
130 |         first_row_index: BigInt(firstRowIndex),
131 |       })
132 |     }
133 |     firstRowIndex += pageRows
134 |   }
135 | 
136 |   // Set boundary order after all pages are written
137 |   if (columnIndex) {
138 |     const numPages = columnIndex.min_values.length
139 |     columnIndex.boundary_order = numPages < 2 ? 'UNORDERED'
140 |       : ascending ? 'ASCENDING' : descending ? 'DESCENDING' : 'UNORDERED'
141 |   }
142 | 
143 |   return {
144 |     chunk: {
145 |       meta_data: {
146 |         type,
147 |         encodings,
148 |         path_in_schema: schemaPath.slice(1).map(s => s.name),
149 |         codec: column.codec ?? 'UNCOMPRESSED',
150 |         num_values: BigInt(num_values),
151 |         total_compressed_size: BigInt(writer.offset - offsetStart),
152 |         total_uncompressed_size: BigInt(writer.offset - offsetStart), // TODO
153 |         data_page_offset,
154 |         dictionary_page_offset,
155 |         statistics,
156 |         geospatial_statistics,
157 |       },
158 |       file_offset: BigInt(offsetStart),
159 |     },
160 |     columnIndex,
161 |     offsetIndex,
162 |   }
163 | }
164 | 
165 | /**
166 |  * Get page boundaries based on estimated byte size.
167 | * 168 | * @param {DecodedArray} values 169 | * @param {ParquetType} type 170 | * @param {number | undefined} type_length 171 | * @param {number | undefined} pageSize 172 | * @returns {Array<{start: number, end: number}>} 173 | */ 174 | function getPageBoundaries(values, type, type_length, pageSize) { 175 | // If no pageSize limit, return single page with all values 176 | if (!pageSize) { 177 | return [{ start: 0, end: values.length }] 178 | } 179 | 180 | const boundaries = [] 181 | let start = 0 182 | let accumulatedBytes = 0 183 | 184 | for (let i = 0; i < values.length; i++) { 185 | const valueSize = estimateValueSize(values[i], type, type_length) 186 | accumulatedBytes += valueSize 187 | 188 | // Check if we should start a new page 189 | if (accumulatedBytes >= pageSize && i > start) { 190 | boundaries.push({ start, end: i }) 191 | start = i 192 | accumulatedBytes = valueSize 193 | } 194 | } 195 | 196 | // Final page with remaining values 197 | if (start < values.length) { 198 | boundaries.push({ start, end: values.length }) 199 | } 200 | 201 | return boundaries 202 | } 203 | 204 | /** 205 | * Create a page chunk with sliced values and pageData. 206 | * 207 | * @param {DecodedArray} values 208 | * @param {PageData | undefined} pageData 209 | * @param {number} start 210 | * @param {number} end 211 | * @returns {{values: DecodedArray, pageData: PageData | undefined}} 212 | */ 213 | function createPageChunk(values, pageData, start, end) { 214 | const chunkValues = values.slice(start, end) 215 | if (!pageData) { 216 | return { values: chunkValues, pageData: undefined } 217 | } 218 | const defLevels = pageData.definitionLevels.slice(start, end) 219 | const maxDefLevel = Math.max(...pageData.definitionLevels) 220 | return { 221 | values: chunkValues, 222 | pageData: { 223 | values: chunkValues, 224 | definitionLevels: defLevels, 225 | repetitionLevels: pageData.repetitionLevels.slice(start, end), 226 | numNulls: defLevels.filter(level => level < maxDefLevel).length, 227 | }, 228 | } 229 | } 230 | 231 | /** 232 | * Estimate the byte size of a value for page size calculation. 233 | * 234 | * @param {any} value 235 | * @param {ParquetType} type 236 | * @param {number | undefined} type_length 237 | * @returns {number} 238 | */ 239 | function estimateValueSize(value, type, type_length) { 240 | if (value === null || value === undefined) return 0 241 | if (type === 'BOOLEAN') return 1 // bit, but count as byte for simplicity 242 | if (type === 'INT32' || type === 'FLOAT') return 4 243 | if (type === 'INT64' || type === 'DOUBLE') return 8 244 | if (type === 'INT96') return 12 245 | if (type === 'FIXED_LEN_BYTE_ARRAY') return type_length ?? 
0 246 | if (type === 'BYTE_ARRAY') { 247 | if (value instanceof Uint8Array) return value.byteLength 248 | if (typeof value === 'string') return value.length 249 | } 250 | return 0 251 | } 252 | 253 | /** 254 | * @param {DecodedArray} values 255 | * @param {ParquetType} type 256 | * @param {Encoding | undefined} encoding 257 | * @returns {any[] | undefined} 258 | */ 259 | function useDictionary(values, type, encoding) { 260 | if (encoding && encoding !== 'RLE_DICTIONARY') return 261 | if (type === 'BOOLEAN') return 262 | const unique = new Set(values) 263 | unique.delete(undefined) 264 | unique.delete(null) 265 | if (values.length / unique.size > 2) { 266 | // TODO: sort by frequency 267 | return Array.from(unique) 268 | } 269 | } 270 | 271 | /** 272 | * @param {Writer} writer 273 | * @param {ColumnEncoder} column 274 | * @param {DecodedArray} dictionary 275 | */ 276 | function writeDictionaryPage(writer, column, dictionary) { 277 | const { element, codec, compressors } = column 278 | const { type, type_length } = element 279 | if (!type) throw new Error(`column ${column.columnName} cannot determine type`) 280 | const dictionaryPage = new ByteWriter() 281 | writePlain(dictionaryPage, dictionary, type, type_length) 282 | 283 | // compress dictionary page data 284 | let compressedDictionaryPage = dictionaryPage 285 | const compressor = compressors?.[codec] 286 | if (compressor) { 287 | const input = new Uint8Array(dictionaryPage.getBuffer()) 288 | const compressedData = compressor(input) 289 | compressedDictionaryPage = new ByteWriter() 290 | compressedDictionaryPage.appendBytes(compressedData) 291 | } 292 | 293 | // write dictionary page header 294 | writePageHeader(writer, { 295 | type: 'DICTIONARY_PAGE', 296 | uncompressed_page_size: dictionaryPage.offset, 297 | compressed_page_size: compressedDictionaryPage.offset, 298 | dictionary_page_header: { 299 | num_values: dictionary.length, 300 | encoding: 'PLAIN', 301 | }, 302 | }) 303 | writer.appendBuffer(compressedDictionaryPage.getBuffer()) 304 | } 305 | 306 | /** 307 | * @import {ColumnChunk, ColumnIndex, DecodedArray, Encoding, OffsetIndex, ParquetType, SchemaElement, Statistics} from 'hyparquet' 308 | * @import {ColumnEncoder, PageData, Writer} from '../src/types.js' 309 | * @param {DecodedArray} values 310 | * @returns {Statistics} 311 | */ 312 | function getStatistics(values) { 313 | let min_value = undefined 314 | let max_value = undefined 315 | let null_count = 0n 316 | for (const value of values) { 317 | if (value === null || value === undefined) { 318 | null_count++ 319 | continue 320 | } 321 | if (typeof value === 'object') continue // skip objects 322 | if (min_value === undefined || value < min_value) min_value = value 323 | if (max_value === undefined || value > max_value) max_value = value 324 | } 325 | return { min_value, max_value, null_count } 326 | } 327 | 328 | /** 329 | * @param {SchemaElement[]} schemaPath 330 | * @returns {boolean} 331 | */ 332 | function isListLike(schemaPath) { 333 | for (let i = 1; i < schemaPath.length; i++) { 334 | const element = schemaPath[i] 335 | if (element?.converted_type === 'LIST') { 336 | const repeatedChild = schemaPath[i + 1] 337 | return repeatedChild?.repetition_type === 'REPEATED' 338 | } 339 | } 340 | return false 341 | } 342 | --------------------------------------------------------------------------------
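The delta and unconvert tests above exercise individual encoders against hyparquet's decoders in isolation. As a rough end-to-end sanity check of the same round-trip idea, a minimal sketch (assuming hyparquet's `parquetReadObjects` accepts the ArrayBuffer returned by `ByteWriter.getBuffer()`) might look like this:

```javascript
// End-to-end round-trip sketch: write with hyparquet-writer, read back with hyparquet.
// Assumption: parquetReadObjects accepts a plain ArrayBuffer as its `file` option.
const { ByteWriter, parquetWrite } = await import('hyparquet-writer')
const { parquetReadObjects } = await import('hyparquet')

// write a tiny two-column file entirely in memory
const writer = new ByteWriter()
parquetWrite({
  writer,
  columnData: [
    { name: 'id', data: [1, 2, 3] },
    { name: 'name', data: ['a', 'b', 'c'] },
  ],
})

// read the buffer back and inspect the decoded rows
const rows = await parquetReadObjects({ file: writer.getBuffer() })
console.log(rows) // expected shape: [{ id: ..., name: 'a' }, ...]
```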