├── hyparquet-writer.jpg ├── test ├── files │ ├── listy.parquet │ ├── signs.parquet │ ├── geospatial.parquet │ ├── fixed_length_decimal.parquet │ └── float16_nonzeros_and_nans.parquet ├── write.roundtrip.test.js ├── package.test.js ├── write.file.test.js ├── wkb.test.js ├── write.geospatial.test.js ├── write.list.test.js ├── snappy.test.js ├── encoding.test.js ├── geospatial.test.js ├── filewriter.test.js ├── schema.test.js ├── bytewriter.test.js ├── write.delta.test.js ├── write.splitstream.test.js ├── splitstream.test.js ├── thrift.test.js ├── write.schema.test.js ├── plain.test.js ├── example.js ├── write.multipage.test.js ├── metadata.test.js ├── delta.test.js └── unconvert.test.js ├── .gitignore ├── tsconfig.build.json ├── tsconfig.json ├── .github └── workflows │ └── ci.yml ├── src ├── index.js ├── write.js ├── node.js ├── indexes.js ├── splitstream.js ├── types.d.ts ├── encoding.js ├── dremel.js ├── plain.js ├── bytewriter.js ├── wkb.js ├── geospatial.js ├── parquet-writer.js ├── thrift.js ├── delta.js ├── schema.js ├── snappy.js ├── metadata.js ├── datapage.js ├── unconvert.js └── column.js ├── LICENSE ├── package.json ├── eslint.config.js ├── benchmark.js └── README.md /hyparquet-writer.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet-writer/HEAD/hyparquet-writer.jpg -------------------------------------------------------------------------------- /test/files/listy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet-writer/HEAD/test/files/listy.parquet -------------------------------------------------------------------------------- /test/files/signs.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet-writer/HEAD/test/files/signs.parquet -------------------------------------------------------------------------------- /test/files/geospatial.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet-writer/HEAD/test/files/geospatial.parquet -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | coverage 2 | node_modules 3 | package-lock.json 4 | *.tgz 5 | .DS_Store 6 | /*.parquet 7 | /data 8 | /types 9 | -------------------------------------------------------------------------------- /test/files/fixed_length_decimal.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet-writer/HEAD/test/files/fixed_length_decimal.parquet -------------------------------------------------------------------------------- /test/files/float16_nonzeros_and_nans.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet-writer/HEAD/test/files/float16_nonzeros_and_nans.parquet -------------------------------------------------------------------------------- /tsconfig.build.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "./tsconfig.json", 3 | "compilerOptions": { 4 | "noEmit": false, 5 | "declaration": true, 6 | "emitDeclarationOnly": true, 7 | "outDir": "types", 8 | "declarationMap": true 9 | }, 10 | "include": 
["src"] 11 | } 12 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "allowJs": true, 4 | "checkJs": true, 5 | "lib": ["esnext", "dom"], 6 | "module": "nodenext", 7 | "noEmit": true, 8 | "resolveJsonModule": true, 9 | "strict": true 10 | }, 11 | "include": ["src", "test"] 12 | } 13 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | pull_request: 4 | push: 5 | 6 | jobs: 7 | lint: 8 | runs-on: ubuntu-latest 9 | timeout-minutes: 5 10 | steps: 11 | - uses: actions/checkout@v6 12 | - run: npm i 13 | - run: npm run lint 14 | 15 | typecheck: 16 | runs-on: ubuntu-latest 17 | timeout-minutes: 5 18 | steps: 19 | - uses: actions/checkout@v6 20 | - run: npm i 21 | - run: npx tsc 22 | 23 | test: 24 | runs-on: ubuntu-latest 25 | timeout-minutes: 5 26 | steps: 27 | - uses: actions/checkout@v6 28 | - run: npm i 29 | - run: npm run coverage 30 | -------------------------------------------------------------------------------- /src/index.js: -------------------------------------------------------------------------------- 1 | export { parquetWrite, parquetWriteBuffer } from './write.js' 2 | export { autoSchemaElement, schemaFromColumnData } from './schema.js' 3 | export { ByteWriter } from './bytewriter.js' 4 | export { ParquetWriter } from './parquet-writer.js' 5 | 6 | /** 7 | * @typedef {import('hyparquet').KeyValue} KeyValue 8 | * @typedef {import('hyparquet').SchemaElement} SchemaElement 9 | * @typedef {import('../src/types.d.ts').BasicType} BasicType 10 | * @typedef {import('../src/types.d.ts').ColumnSource} ColumnSource 11 | * @typedef {import('../src/types.d.ts').ParquetWriteOptions} ParquetWriteOptions 12 | * @typedef {import('../src/types.d.ts').Writer} Writer 13 | */ 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
8 | -------------------------------------------------------------------------------- /test/write.roundtrip.test.js: -------------------------------------------------------------------------------- 1 | import fs from 'fs' 2 | import { asyncBufferFromFile, parquetMetadataAsync, parquetReadObjects, parquetSchema } from 'hyparquet' 3 | import { describe, expect, it } from 'vitest' 4 | import { parquetWriteBuffer } from '../src/index.js' 5 | 6 | describe('parquetWrite round-trip', () => { 7 | const files = fs.readdirSync('test/files').filter(f => f.endsWith('.parquet')) 8 | 9 | files.forEach(filename => { 10 | it(`round-trips data from ${filename}`, async () => { 11 | const file = await asyncBufferFromFile(`test/files/${filename}`) 12 | const metadata = await parquetMetadataAsync(file) 13 | const rows = await parquetReadObjects({ file }) 14 | 15 | // transpose the row data 16 | const schemaTree = parquetSchema(metadata) 17 | const columnData = schemaTree.children.map(({ element }) => ({ 18 | name: element.name, 19 | data: new Array(), 20 | })) 21 | for (const row of rows) { 22 | for (const { name, data } of columnData) { 23 | data.push(row[name]) 24 | } 25 | } 26 | 27 | const buffer = parquetWriteBuffer({ columnData, schema: metadata.schema }) 28 | const output = await parquetReadObjects({ file: buffer }) 29 | 30 | expect(output.length).toBe(rows.length) 31 | expect(output).toEqual(rows) 32 | }) 33 | }) 34 | }) 35 | -------------------------------------------------------------------------------- /src/write.js: -------------------------------------------------------------------------------- 1 | import { ByteWriter } from './bytewriter.js' 2 | import { ParquetWriter } from './parquet-writer.js' 3 | import { schemaFromColumnData } from './schema.js' 4 | 5 | /** 6 | * Write data as parquet to a file or stream. 7 | * 8 | * @import {ParquetWriteOptions} from '../src/types.js' 9 | * @param {ParquetWriteOptions} options 10 | */ 11 | export function parquetWrite({ 12 | writer, 13 | columnData, 14 | schema, 15 | codec = 'SNAPPY', 16 | compressors, 17 | statistics = true, 18 | rowGroupSize = [100, 1000, 10000], 19 | kvMetadata, 20 | pageSize = 1048576, 21 | }) { 22 | if (!schema) { 23 | schema = schemaFromColumnData({ columnData }) 24 | } else if (columnData.some(({ type }) => type)) { 25 | throw new Error('cannot provide both schema and columnData type') 26 | } else { 27 | // TODO: validate schema 28 | } 29 | const pq = new ParquetWriter({ 30 | writer, 31 | schema, 32 | codec, 33 | compressors, 34 | statistics, 35 | kvMetadata, 36 | }) 37 | pq.write({ 38 | columnData, 39 | rowGroupSize, 40 | pageSize, 41 | }) 42 | pq.finish() 43 | } 44 | 45 | /** 46 | * Write data as parquet to an ArrayBuffer. 
47 | * 48 | * @param {Omit<ParquetWriteOptions, 'writer'>} options 49 | * @returns {ArrayBuffer} 50 | */ 51 | export function parquetWriteBuffer(options) { 52 | const writer = new ByteWriter() 53 | parquetWrite({ ...options, writer }) 54 | return writer.getBuffer() 55 | } 56 | -------------------------------------------------------------------------------- /test/package.test.js: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from 'vitest' 2 | import packageJson from '../package.json' with { type: 'json' } 3 | 4 | describe('package.json', () => { 5 | it('should have the correct name', () => { 6 | expect(packageJson.name).toBe('hyparquet-writer') 7 | }) 8 | 9 | it('should have a valid version', () => { 10 | expect(packageJson.version).toMatch(/^\d+\.\d+\.\d+$/) 11 | }) 12 | 13 | it('should have MIT license', () => { 14 | expect(packageJson.license).toBe('MIT') 15 | }) 16 | 17 | it('should have precise dev dependency versions', () => { 18 | const { dependencies, devDependencies } = packageJson 19 | const allDependencies = { ...dependencies, ...devDependencies } 20 | Object.values(allDependencies).forEach(version => { 21 | expect(version).toMatch(/^\d+\.\d+\.\d+$/) 22 | }) 23 | }) 24 | 25 | it('should have no peer dependencies', () => { 26 | expect('peerDependencies' in packageJson).toBe(false) 27 | }) 28 | 29 | it('should have exports with types first', () => { 30 | const { exports } = packageJson 31 | expect(Object.keys(exports)).toEqual(['.', './src/*.js']) 32 | // node vs default (browser) 33 | expect(Object.keys(exports['.'])).toEqual(['browser', 'default']) 34 | expect(Object.keys(exports['.'].browser)).toEqual(['types', 'default']) 35 | expect(Object.keys(exports['.'].default)).toEqual(['types', 'default']) 36 | // deep imports 37 | expect(Object.keys(exports['./src/*.js'])).toEqual(['types', 'default']) 38 | }) 39 | }) 40 | -------------------------------------------------------------------------------- /test/write.file.test.js: -------------------------------------------------------------------------------- 1 | import fs from 'fs' 2 | import { asyncBufferFromFile, parquetMetadataAsync, parquetReadObjects } from 'hyparquet' 3 | import { afterEach, beforeEach, describe, expect, it } from 'vitest' 4 | import { parquetWriteFile } from '../src/node.js' 5 | import { exampleData, exampleMetadata } from './example.js' 6 | 7 | const filedir = 'data/' 8 | const filename = 'data/write.file.parquet' 9 | 10 | describe('parquetWriteFile', () => { 11 | beforeEach(() => { 12 | // ensure data directory exists 13 | if (!fs.existsSync(filedir)) { 14 | fs.mkdirSync(filedir) 15 | } 16 | }) 17 | 18 | afterEach(() => { 19 | // remove test file 20 | if (fs.existsSync(filename)) { 21 | fs.unlinkSync(filename) 22 | } 23 | }) 24 | 25 | it('writes parquet file', async () => { 26 | parquetWriteFile({ filename, columnData: exampleData }) 27 | 28 | // check parquet metadata 29 | const file = await asyncBufferFromFile(filename) 30 | const metadata = await parquetMetadataAsync(file) 31 | expect(metadata).toEqual(exampleMetadata) 32 | 33 | // check parquet data 34 | const result = await parquetReadObjects({ file, metadata }) 35 | expect(result).toEqual([ 36 | { bool: true, int: 0, bigint: 0n, float: 0, double: 0, string: 'a', nullable: true }, 37 | { bool: false, int: 127, bigint: 127n, float: 0.00009999999747378752, double: 0.0001, string: 'b', nullable: false }, 38 | { bool: true, int: 0x7fff, bigint: 0x7fffn, float: 123.45600128173828, double: 123.456, string: 'c', nullable:
null }, 39 | { bool: false, int: 0x7fffffff, bigint: 0x7fffffffffffffffn, float: Infinity, double: 1e100, string: 'd', nullable: null }, 40 | ]) 41 | }) 42 | }) 43 | -------------------------------------------------------------------------------- /test/wkb.test.js: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from 'vitest' 2 | import { geojsonToWkb } from '../src/wkb.js' 3 | import { wkbToGeojson } from 'hyparquet/src/wkb.js' 4 | 5 | /** @import {Geometry} from 'hyparquet/src/types.js' */ 6 | 7 | describe('geojsonToWkb', () => { 8 | it('encodes point geometries', () => { 9 | /** @type {Geometry} */ 10 | const geometry = { type: 'Point', coordinates: [30, 10] } 11 | const decoded = decode(geojsonToWkb(geometry)) 12 | expect(decoded).toEqual(geometry) 13 | }) 14 | 15 | it('encodes polygons with holes', () => { 16 | /** @type {Geometry} */ 17 | const geometry = { 18 | type: 'Polygon', 19 | coordinates: [ 20 | [[35, 10], [45, 45], [15, 40], [10, 20], [35, 10]], 21 | [[20, 30], [35, 35], [30, 20], [20, 30]], 22 | ], 23 | } 24 | const decoded = decode(geojsonToWkb(geometry)) 25 | expect(decoded).toEqual(geometry) 26 | }) 27 | 28 | it('encodes geometry collections with mixed dimensions', () => { 29 | /** @type {Geometry} */ 30 | const geometry = { 31 | type: 'GeometryCollection', 32 | geometries: [ 33 | { type: 'Point', coordinates: [30, 10, 5] }, 34 | { type: 'LineString', coordinates: [[30, 10, 5], [40, 40, 5], [20, 40, 5], [10, 20, 5]] }, 35 | ], 36 | } 37 | const decoded = decode(geojsonToWkb(geometry)) 38 | expect(decoded).toEqual(geometry) 39 | }) 40 | }) 41 | 42 | /** 43 | * Decode WKB using the hyparquet reader for verification. 44 | * 45 | * @param {Uint8Array} wkb 46 | * @returns {Geometry} 47 | */ 48 | function decode(wkb) { 49 | const view = new DataView(wkb.buffer, wkb.byteOffset, wkb.byteLength) 50 | const reader = { view, offset: 0 } 51 | return wkbToGeojson(reader) 52 | } 53 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "hyparquet-writer", 3 | "version": "0.11.2", 4 | "description": "Parquet file writer for JavaScript", 5 | "author": "Hyperparam", 6 | "homepage": "https://hyperparam.app", 7 | "keywords": [ 8 | "ai", 9 | "data", 10 | "hyperparam", 11 | "hyparquet", 12 | "ml", 13 | "parquet", 14 | "snappy", 15 | "thrift" 16 | ], 17 | "license": "MIT", 18 | "repository": { 19 | "type": "git", 20 | "url": "git+https://github.com/hyparam/hyparquet-writer.git" 21 | }, 22 | "main": "src/index.js", 23 | "files": [ 24 | "src", 25 | "types" 26 | ], 27 | "type": "module", 28 | "types": "types/index.d.ts", 29 | "exports": { 30 | ".": { 31 | "browser": { 32 | "types": "./types/index.d.ts", 33 | "default": "./src/index.js" 34 | }, 35 | "default": { 36 | "types": "./types/node.d.ts", 37 | "default": "./src/node.js" 38 | } 39 | }, 40 | "./src/*.js": { 41 | "types": "./types/*.d.ts", 42 | "default": "./src/*.js" 43 | } 44 | }, 45 | "sideEffects": false, 46 | "scripts": { 47 | "build:types": "tsc -p ./tsconfig.build.json", 48 | "coverage": "vitest run --coverage --coverage.include=src", 49 | "lint": "eslint", 50 | "lint:fix": "eslint --fix", 51 | "prepare": "npm run build:types", 52 | "test": "vitest run" 53 | }, 54 | "dependencies": { 55 | "hyparquet": "1.23.2" 56 | }, 57 | "devDependencies": { 58 | "@babel/eslint-parser": "7.28.5", 59 | "@types/node": "25.0.3", 60 | 
"@vitest/coverage-v8": "4.0.16", 61 | "eslint": "9.39.2", 62 | "eslint-plugin-jsdoc": "61.5.0", 63 | "hysnappy": "1.1.0", 64 | "typescript": "5.9.3", 65 | "vitest": "4.0.16" 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /test/write.geospatial.test.js: -------------------------------------------------------------------------------- 1 | import { parquetMetadata } from 'hyparquet' 2 | import { describe, expect, it } from 'vitest' 3 | import { parquetWriteBuffer } from '../src/index.js' 4 | 5 | /** 6 | * @import {ColumnSource} from '../src/types.js' 7 | */ 8 | 9 | describe('geospatial statistics', () => { 10 | it('writes geospatial statistics into column metadata', () => { 11 | /** @type {ColumnSource[]} */ 12 | const columnData = [{ 13 | name: 'geometry', 14 | type: 'GEOMETRY', 15 | data: [ 16 | { type: 'Point', coordinates: [10, 5, 100, 2] }, 17 | null, 18 | { 19 | type: 'LineString', 20 | coordinates: [ 21 | [-20, -10, 50, 5], 22 | [40, 30, 75, -5], 23 | ], 24 | }, 25 | { 26 | type: 'GeometryCollection', 27 | geometries: [ 28 | { type: 'Point', coordinates: [5, 15] }, 29 | { 30 | type: 'MultiPoint', 31 | coordinates: [ 32 | [0, -5], 33 | [60, 10], 34 | ], 35 | }, 36 | ], 37 | }, 38 | ], 39 | }] 40 | 41 | const buffer = parquetWriteBuffer({ columnData }) 42 | const metadata = parquetMetadata(buffer) 43 | const columnMeta = metadata.row_groups[0].columns[0].meta_data 44 | 45 | expect(columnMeta?.statistics).toEqual({ null_count: 1n }) 46 | expect(columnMeta?.geospatial_statistics).toEqual({ 47 | bbox: { 48 | xmin: -20, 49 | xmax: 60, 50 | ymin: -10, 51 | ymax: 30, 52 | zmin: 50, 53 | zmax: 100, 54 | mmin: -5, 55 | mmax: 5, 56 | }, 57 | // sort numerically not by string order 58 | geospatial_types: [7, 3001, 3002], 59 | }) 60 | }) 61 | }) 62 | -------------------------------------------------------------------------------- /src/node.js: -------------------------------------------------------------------------------- 1 | import fs from 'fs' 2 | import { ByteWriter } from './bytewriter.js' 3 | import { parquetWrite } from './write.js' 4 | 5 | export * from './index.js' 6 | 7 | /** 8 | * Write data as parquet to a local file. 9 | * 10 | * @import {ParquetWriteOptions, Writer} from '../src/types.js' 11 | * @param {Omit & {filename: string}} options 12 | */ 13 | export function parquetWriteFile(options) { 14 | const { filename, ...rest } = options 15 | const writer = fileWriter(filename) 16 | parquetWrite({ ...rest, writer }) 17 | } 18 | 19 | /** 20 | * Buffered file writer. 21 | * Writes data to a local file in chunks using node fs. 
22 | * 23 | * @param {string} filename 24 | * @returns {Writer} 25 | */ 26 | export function fileWriter(filename) { 27 | const writer = new ByteWriter() 28 | const chunkSize = 1_000_000 // 1mb 29 | 30 | // create a new file or overwrite existing one 31 | fs.writeFileSync(filename, '', { flag: 'w' }) 32 | 33 | function flush() { 34 | const chunk = writer.buffer.slice(0, writer.index) 35 | // TODO: async 36 | fs.writeFileSync(filename, new Uint8Array(chunk), { flag: 'a' }) 37 | writer.index = 0 38 | } 39 | 40 | /** 41 | * Override the ensure method 42 | * @param {number} size 43 | */ 44 | writer.ensure = function(size) { 45 | if (writer.index > chunkSize) { 46 | flush() 47 | } 48 | if (writer.index + size > writer.buffer.byteLength) { 49 | const newSize = Math.max(writer.buffer.byteLength * 2, writer.index + size) 50 | const newBuffer = new ArrayBuffer(newSize) 51 | new Uint8Array(newBuffer).set(new Uint8Array(writer.buffer)) 52 | writer.buffer = newBuffer 53 | writer.view = new DataView(writer.buffer) 54 | } 55 | } 56 | writer.getBuffer = function() { 57 | throw new Error('getBuffer not supported for FileWriter') 58 | } 59 | writer.finish = function() { 60 | flush() 61 | } 62 | return writer 63 | } 64 | -------------------------------------------------------------------------------- /test/write.list.test.js: -------------------------------------------------------------------------------- 1 | import { parquetReadObjects } from 'hyparquet' 2 | import { describe, expect, it } from 'vitest' 3 | import { parquetWriteBuffer } from '../src/index.js' 4 | 5 | describe('parquetWrite lists', () => { 6 | it('writes optional list columns', async () => { 7 | const listy = [ 8 | [1, 2], 9 | null, 10 | [], 11 | [3, null, 4], 12 | [null], 13 | ] 14 | 15 | const buffer = parquetWriteBuffer({ 16 | columnData: [{ name: 'listy', data: listy }], 17 | schema: [ 18 | { name: 'root', num_children: 1 }, 19 | { 20 | name: 'listy', 21 | repetition_type: 'OPTIONAL', 22 | num_children: 1, 23 | converted_type: 'LIST', 24 | }, 25 | { 26 | name: 'list', 27 | repetition_type: 'REPEATED', 28 | num_children: 1, 29 | }, 30 | { 31 | name: 'element', 32 | repetition_type: 'OPTIONAL', 33 | type: 'INT32', 34 | }, 35 | ], 36 | }) 37 | 38 | const rows = await parquetReadObjects({ file: buffer }) 39 | expect(rows).toEqual([ 40 | { listy: [1, 2] }, 41 | { listy: undefined }, 42 | { listy: [] }, 43 | { listy: [3, null, 4] }, 44 | { listy: [null] }, 45 | ]) 46 | }) 47 | 48 | it('throws on null data for required list columns', () => { 49 | /** 50 | * Schema for a required list of required INT32 values. 
51 | * @type {import('hyparquet').SchemaElement[]} 52 | */ 53 | const requiredListSchema = [ 54 | { name: 'root', num_children: 1 }, 55 | { 56 | name: 'numbers', 57 | repetition_type: 'REQUIRED', 58 | num_children: 1, 59 | converted_type: 'LIST', 60 | }, 61 | { 62 | name: 'list', 63 | repetition_type: 'REPEATED', 64 | num_children: 1, 65 | }, 66 | { 67 | name: 'element', 68 | repetition_type: 'REQUIRED', 69 | type: 'INT32', 70 | }, 71 | ] 72 | 73 | expect(() => parquetWriteBuffer({ 74 | columnData: [{ name: 'numbers', data: [[420], null] }], 75 | schema: requiredListSchema, 76 | })).toThrow('parquet required value is undefined') 77 | }) 78 | }) 79 | -------------------------------------------------------------------------------- /src/indexes.js: -------------------------------------------------------------------------------- 1 | import { BoundaryOrders } from 'hyparquet/src/constants.js' 2 | import { serializeTCompactProtocol } from './thrift.js' 3 | 4 | /** 5 | * @import {ColumnChunk, ColumnIndex, OffsetIndex} from 'hyparquet' 6 | * @import {PageIndexes, Writer} from '../src/types.js' 7 | */ 8 | 9 | /** 10 | * Write ColumnIndex and OffsetIndex for the given columns. 11 | * 12 | * @param {Writer} writer 13 | * @param {PageIndexes[]} pageIndexes 14 | */ 15 | export function writeIndexes(writer, pageIndexes) { 16 | for (const { chunk, columnIndex } of pageIndexes) { 17 | writeColumnIndex(writer, chunk, columnIndex) 18 | } 19 | for (const { chunk, offsetIndex } of pageIndexes) { 20 | writeOffsetIndex(writer, chunk, offsetIndex) 21 | } 22 | } 23 | 24 | /** 25 | * @param {Writer} writer 26 | * @param {ColumnChunk} columnChunk 27 | * @param {ColumnIndex} [columnIndex] 28 | */ 29 | function writeColumnIndex(writer, columnChunk, columnIndex) { 30 | // Page indexes only help when multiple pages 31 | if (!columnIndex || columnIndex.min_values.length <= 1) return 32 | const columnIndexOffset = writer.offset 33 | serializeTCompactProtocol(writer, { 34 | field_1: columnIndex.null_pages, 35 | field_2: columnIndex.min_values, 36 | field_3: columnIndex.max_values, 37 | field_4: BoundaryOrders.indexOf(columnIndex.boundary_order), 38 | field_5: columnIndex.null_counts, 39 | }) 40 | columnChunk.column_index_offset = BigInt(columnIndexOffset) 41 | columnChunk.column_index_length = writer.offset - columnIndexOffset 42 | } 43 | 44 | /** 45 | * @param {Writer} writer 46 | * @param {ColumnChunk} columnChunk 47 | * @param {OffsetIndex} [offsetIndex] 48 | */ 49 | function writeOffsetIndex(writer, columnChunk, offsetIndex) { 50 | // Page indexes only help when multiple pages 51 | if (!offsetIndex || offsetIndex.page_locations.length <= 1) return 52 | const offsetIndexOffset = writer.offset 53 | serializeTCompactProtocol(writer, { 54 | field_1: offsetIndex.page_locations.map(p => ({ 55 | field_1: p.offset, 56 | field_2: p.compressed_page_size, 57 | field_3: p.first_row_index, 58 | })), 59 | }) 60 | columnChunk.offset_index_offset = BigInt(offsetIndexOffset) 61 | columnChunk.offset_index_length = writer.offset - offsetIndexOffset 62 | } 63 | -------------------------------------------------------------------------------- /test/snappy.test.js: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from 'vitest' 2 | import { snappyCompress } from '../src/snappy.js' 3 | import { snappyUncompress } from 'hyparquet' 4 | 5 | describe('snappy compress', () => { 6 | 7 | it.for([ 8 | { compressed: [0x00], uncompressed: '' }, 9 | { compressed: [0x01, 0x00, 0x68], 
uncompressed: 'h' }, 10 | { compressed: [0x02, 0x04, 0x68, 0x79], uncompressed: 'hy' }, 11 | { compressed: [0x03, 0x08, 0x68, 0x79, 0x70], uncompressed: 'hyp' }, 12 | { compressed: [0x05, 0x10, 0x68, 0x79, 0x70, 0x65, 0x72], uncompressed: 'hyper' }, 13 | { 14 | compressed: [0x0a, 0x24, 0x68, 0x79, 0x70, 0x65, 0x72, 0x70, 0x61, 0x72, 0x61, 0x6d], 15 | uncompressed: 'hyperparam', 16 | }, 17 | { 18 | compressed: [0x15, 0x08, 0x68, 0x79, 0x70, 0x46, 0x03, 0x00], 19 | uncompressed: 'hyphyphyphyphyphyphyp', 20 | }, 21 | { 22 | // from rowgroups.parquet 23 | compressed: [ 24 | 80, 4, 1, 0, 9, 1, 0, 2, 9, 7, 4, 0, 3, 13, 8, 0, 4, 13, 8, 0, 5, 13, 25 | 8, 0, 6, 13, 8, 0, 7, 13, 8, 0, 8, 13, 8, 60, 9, 0, 0, 0, 0, 0, 0, 0, 26 | 10, 0, 0, 0, 0, 0, 0, 0, 27 | ], 28 | uncompressed: new Uint8Array([ 29 | 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 30 | 0, 4, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 31 | 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 32 | 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 33 | ]), 34 | }, 35 | // from datapage_v2.snappy.parquet 36 | { compressed: [2, 4, 0, 3], uncompressed: new Uint8Array([0, 3]) }, 37 | { compressed: [ 6, 20, 2, 0, 0, 0, 3, 23], uncompressed: new Uint8Array([2, 0, 0, 0, 3, 23]) }, 38 | // from sample data test 39 | { 40 | compressed: [1, 0, 5], 41 | uncompressed: new Uint8Array([5]), 42 | }, 43 | ])('compresses valid input %p', ({ uncompressed }) => { 44 | const encoder = new TextEncoder() 45 | const input = typeof uncompressed === 'string' ? encoder.encode(uncompressed) : new Uint8Array(uncompressed) 46 | const output = snappyCompress(input) 47 | // verify round-trip: decompress and compare to original 48 | const decompressed = new Uint8Array(input.length) 49 | snappyUncompress(output, decompressed) 50 | expect(decompressed).toEqual(input) 51 | }) 52 | }) 53 | -------------------------------------------------------------------------------- /eslint.config.js: -------------------------------------------------------------------------------- 1 | import javascript from '@eslint/js' 2 | import jsdoc from 'eslint-plugin-jsdoc' 3 | 4 | export default [ 5 | { 6 | plugins: { 7 | jsdoc, 8 | }, 9 | 10 | languageOptions: { 11 | globals: { 12 | 'TextDecoder': false, 13 | 'TextEncoder': false, 14 | // for benchmark: 15 | 'console': false, 16 | 'fetch': false, 17 | 'performance': false, 18 | }, 19 | }, 20 | 21 | rules: { 22 | ...javascript.configs.recommended.rules, 23 | 'arrow-spacing': 'error', 24 | camelcase: 'off', 25 | 'comma-spacing': 'error', 26 | 'comma-dangle': ['error', { 27 | arrays: 'always-multiline', 28 | objects: 'always-multiline', 29 | imports: 'always-multiline', 30 | exports: 'always-multiline', 31 | functions: 'never', 32 | }], 33 | 'eol-last': 'error', 34 | eqeqeq: 'error', 35 | 'func-style': ['error', 'declaration'], 36 | indent: ['error', 2], 37 | 'jsdoc/check-param-names': 'error', 38 | 'jsdoc/check-property-names': 'error', 39 | 'jsdoc/check-tag-names': 'error', 40 | 'jsdoc/require-param': 'error', 41 | 'jsdoc/require-param-type': 'error', 42 | 'jsdoc/require-returns': 'error', 43 | 'jsdoc/require-returns-type': 'error', 44 | 'jsdoc/sort-tags': 'error', 45 | 'key-spacing': 'error', 46 | 'no-constant-condition': 'warn', 47 | 'no-extra-parens': 'warn', 48 | 'no-multi-spaces': 'error', 49 | 'no-trailing-spaces': 'error', 50 | 'no-undef': 'error', 51 | 'no-unused-vars': 'error', 52 | 'no-useless-concat': 'error', 53 | 'no-useless-rename': 'error', 54 | 'no-useless-return': 'error', 55 | 'no-var': 
'error', 56 | 'object-curly-spacing': ['error', 'always'], 57 | 'object-shorthand': 'error', 58 | 'prefer-const': 'error', 59 | 'prefer-exponentiation-operator': 'error', 60 | 'prefer-promise-reject-errors': 'error', 61 | quotes: ['error', 'single'], 62 | 'require-await': 'warn', 63 | semi: ['error', 'never'], 64 | 'sort-imports': ['error', { 65 | ignoreDeclarationSort: true, 66 | ignoreMemberSort: false, 67 | memberSyntaxSortOrder: ['none', 'all', 'multiple', 'single'], 68 | }], 69 | 'space-infix-ops': 'error', 70 | }, 71 | }, 72 | ] 73 | -------------------------------------------------------------------------------- /test/encoding.test.js: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from 'vitest' 2 | import { ByteWriter } from '../src/bytewriter.js' 3 | import { writeRleBitPackedHybrid } from '../src/encoding.js' 4 | import { readRleBitPackedHybrid } from 'hyparquet/src/encoding.js' 5 | 6 | /** 7 | * Round-trip serialize and deserialize the given values. 8 | * 9 | * @param {number[]} values 10 | * @returns {number[]} 11 | */ 12 | function roundTripDeserialize(values) { 13 | const bitWidth = Math.ceil(Math.log2(Math.max(...values) + 1)) 14 | 15 | // Serialize the values using writeRleBitPackedHybrid 16 | const writer = new ByteWriter() 17 | writeRleBitPackedHybrid(writer, values, bitWidth) 18 | const buffer = writer.getBuffer() 19 | const reader = { view: new DataView(buffer), offset: 0 } 20 | 21 | // Decode the values using readRleBitPackedHybrid from hyparquet 22 | /** @type {number[]} */ 23 | const output = new Array(values.length) 24 | readRleBitPackedHybrid(reader, bitWidth, output, values.length) 25 | return output 26 | } 27 | 28 | describe('RLE bit-packed hybrid', () => { 29 | it('should round-trip a typical array of values', () => { 30 | const original = [1, 2, 3, 4, 5, 6, 7, 8, 9] 31 | const decoded = roundTripDeserialize(original) 32 | expect(decoded).toEqual(original) 33 | }) 34 | 35 | it('should round-trip an empty array', () => { 36 | const decoded = roundTripDeserialize([]) 37 | expect(decoded).toEqual([]) 38 | }) 39 | 40 | it('should round-trip an array of zeros', () => { 41 | const original = [0, 0, 0, 0, 0, 0, 0, 0] 42 | const decoded = roundTripDeserialize(original) 43 | expect(decoded).toEqual(original) 44 | }) 45 | 46 | it('should round-trip an array with large numbers', () => { 47 | const original = [1023, 511, 255, 127, 63, 31, 15, 7] 48 | const decoded = roundTripDeserialize(original) 49 | expect(decoded).toEqual(original) 50 | }) 51 | 52 | it('should round-trip a random array of values', () => { 53 | const original = Array.from({ length: 20 }, () => 54 | Math.floor(Math.random() * 1000) 55 | ) 56 | const decoded = roundTripDeserialize(original) 57 | expect(decoded).toEqual(original) 58 | }) 59 | 60 | it('should round-trip a sparse array of booleans', () => { 61 | const original = Array(10000).fill(0) 62 | original[10] = 1 63 | original[100] = 1 64 | original[500] = 1 65 | original[9999] = 1 66 | const decoded = roundTripDeserialize(original) 67 | expect(decoded).toEqual(original) 68 | }) 69 | }) 70 | -------------------------------------------------------------------------------- /benchmark.js: -------------------------------------------------------------------------------- 1 | import { createWriteStream, promises as fs } from 'fs' 2 | import { pipeline } from 'stream/promises' 3 | import { asyncBufferFromFile, parquetMetadataAsync, parquetReadObjects, parquetSchema } from 'hyparquet' 4 | 
import { snappyCompressor } from 'hysnappy' 5 | import { parquetWriteFile } from './src/node.js' 6 | 7 | const url = 'https://s3.hyperparam.app/wiki-en-00000-of-00041.parquet' 8 | const filename = 'data/wiki-en-00000-of-00041.parquet' 9 | 10 | // download test parquet file if needed 11 | let stat = await fs.stat(filename).catch(() => undefined) 12 | if (!stat) { 13 | console.log('downloading ' + url) 14 | const res = await fetch(url) 15 | if (!res.ok) throw new Error(res.statusText) 16 | // write to file async 17 | await pipeline(res.body, createWriteStream(filename)) 18 | stat = await fs.stat(filename) 19 | console.log('downloaded example.parquet', stat.size) 20 | } 21 | 22 | // asyncBuffer 23 | const file = await asyncBufferFromFile(filename) 24 | console.log(`parsing ${filename} (${stat.size.toLocaleString()} bytes)`) 25 | let startTime = performance.now() 26 | 27 | // read parquet file 28 | const metadata = await parquetMetadataAsync(file) 29 | const rows = await parquetReadObjects({ 30 | file, 31 | metadata, 32 | // columns: ['l_comment'], 33 | // rowStart: 0, 34 | // rowEnd: 100_000, 35 | }) 36 | let ms = performance.now() - startTime 37 | console.log(`parsed ${filename} ${rows.length.toLocaleString()} rows in ${ms.toFixed(0)} ms`) 38 | 39 | // transpose rows 40 | const schema = parquetSchema(metadata) 41 | const columnData = schema.children.map(({ element }) => ({ 42 | name: element.name, 43 | data: [], 44 | pageIndex: true, 45 | })) // .filter(({ name }) => name === 'l_comment') 46 | for (const row of rows) { 47 | for (const { name, data } of columnData) { 48 | data.push(row[name]) 49 | } 50 | } 51 | 52 | // write parquet file 53 | const outputFilename = 'data/benchmark-output.parquet' 54 | console.log(`writing ${outputFilename} (${rows.length.toLocaleString()} rows)`) 55 | startTime = performance.now() 56 | parquetWriteFile({ 57 | filename: outputFilename, 58 | columnData, 59 | schema: metadata.schema, 60 | compressors: { SNAPPY: snappyCompressor() }, 61 | // rowGroupSize: 200_000, 62 | }) 63 | ms = performance.now() - startTime 64 | stat = await fs.stat(outputFilename) 65 | console.log(`wrote ${outputFilename} (${stat.size.toLocaleString()} bytes) in ${ms.toFixed(0)} ms`) 66 | 67 | // check data is the same 68 | const outputFile = await asyncBufferFromFile(outputFilename) 69 | const outputRows = await parquetReadObjects({ file: outputFile }) 70 | for (let i = 0; i < rows.length; i++) { 71 | const inputRow = JSON.stringify(rows[i]) 72 | const outputRow = JSON.stringify(outputRows[i]) 73 | if (inputRow !== outputRow) { 74 | console.log(`row ${i} mismatch`) 75 | console.log('input ', inputRow) 76 | console.log('output', outputRow) 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /test/geospatial.test.js: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from 'vitest' 2 | import { geospatialStatistics } from '../src/geospatial.js' 3 | 4 | describe('geospatialStatistics', () => { 5 | it('computes bounding boxes and geospatial type codes for nested inputs', () => { 6 | const result = geospatialStatistics([ 7 | null, 8 | undefined, 9 | { type: 'Point', coordinates: [1, 2] }, 10 | { 11 | type: 'LineString', 12 | coordinates: [ 13 | [5, -1, 10], 14 | [0, 3, -5], 15 | [2, 2, undefined], 16 | [6, 1, Infinity], 17 | ], 18 | }, 19 | { 20 | type: 'Polygon', 21 | coordinates: [ 22 | [ 23 | [9, 9, 1, 5], 24 | [9, 10, 3, 5], 25 | [8, 9, -4, 8], 26 | [7, 8, Infinity, Infinity], 27 
| ], 28 | ], 29 | }, 30 | { 31 | type: 'MultiPoint', 32 | coordinates: [ 33 | [-5, -5, 0, -10], 34 | [4, 4, 12, undefined], 35 | ], 36 | }, 37 | { type: 'MultiPolygon', coordinates: [] }, 38 | { 39 | type: 'MultiLineString', 40 | coordinates: [ 41 | [ 42 | [ 43 | [Infinity, 0], 44 | ], 45 | ], 46 | ], 47 | }, 48 | { 49 | type: 'GeometryCollection', 50 | geometries: [ 51 | { type: 'Point', coordinates: [2, -3, 7, 9] }, 52 | { type: 'MultiPoint', coordinates: [[60, 10, 0, 11], [3, 6]] }, 53 | ], 54 | }, 55 | { type: 'GeometryCollection', geometries: [] }, 56 | ]) 57 | 58 | expect(result).toEqual({ 59 | bbox: { 60 | xmin: -5, 61 | xmax: 60, 62 | ymin: -5, 63 | ymax: 10, 64 | zmin: -5, 65 | zmax: 12, 66 | mmin: -10, 67 | mmax: 11, 68 | }, 69 | geospatial_types: [1, 5, 6, 7, 1002, 3003, 3004, 3007], 70 | }) 71 | }) 72 | 73 | it('omits geospatial statistics when only null-like values are present', () => { 74 | const result = geospatialStatistics([null, undefined, null]) 75 | expect(result).toBeUndefined() 76 | }) 77 | 78 | it('tracks type codes even when coordinates are empty', () => { 79 | const result = geospatialStatistics([ 80 | { type: 'Point', coordinates: [] }, 81 | ]) 82 | expect(result).toEqual({ 83 | bbox: undefined, 84 | geospatial_types: [1], 85 | }) 86 | }) 87 | 88 | it('throws on invalid value types and geometry definitions', () => { 89 | expect(() => geospatialStatistics(['oops'])).toThrow('geospatial column expects GeoJSON geometries') 90 | expect(() => geospatialStatistics([{ type: 'Unknown', coordinates: [] }])).toThrow('unknown geometry type: Unknown') 91 | expect(() => geospatialStatistics([{ type: 'Point', coordinates: [0, 0, 0, 0, 0] }])).toThrow('unsupported geometry dimensions: 5') 92 | }) 93 | }) 94 | -------------------------------------------------------------------------------- /src/splitstream.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Write values using BYTE_STREAM_SPLIT encoding. 3 | * This encoding writes all first bytes of values, then all second bytes, etc. 4 | * Can improve compression for floating-point and fixed-width numeric data. 5 | * 6 | * @import {DecodedArray, ParquetType} from 'hyparquet' 7 | * @import {Writer} from '../src/types.js' 8 | * @param {Writer} writer 9 | * @param {DecodedArray} values 10 | * @param {ParquetType} type 11 | * @param {number | undefined} typeLength 12 | */ 13 | export function writeByteStreamSplit(writer, values, type, typeLength) { 14 | const count = values.length 15 | 16 | // Get bytes from values based on type 17 | /** @type {Uint8Array} */ 18 | let bytes 19 | /** @type {number} */ 20 | let width 21 | if (type === 'FLOAT') { 22 | const typed = values instanceof Float32Array ? values : new Float32Array(numberArray(values)) 23 | bytes = new Uint8Array(typed.buffer, typed.byteOffset, typed.byteLength) 24 | width = 4 25 | } else if (type === 'DOUBLE') { 26 | const typed = values instanceof Float64Array ? values : new Float64Array(numberArray(values)) 27 | bytes = new Uint8Array(typed.buffer, typed.byteOffset, typed.byteLength) 28 | width = 8 29 | } else if (type === 'INT32') { 30 | const typed = values instanceof Int32Array ? 
values : new Int32Array(numberArray(values)) 31 | bytes = new Uint8Array(typed.buffer, typed.byteOffset, typed.byteLength) 32 | width = 4 33 | } else if (type === 'INT64') { 34 | const typed = bigIntArray(values) 35 | bytes = new Uint8Array(typed.buffer, typed.byteOffset, typed.byteLength) 36 | width = 8 37 | } else if (type === 'FIXED_LEN_BYTE_ARRAY') { 38 | if (!typeLength) throw new Error('parquet byte_stream_split missing type_length') 39 | width = typeLength 40 | bytes = new Uint8Array(count * width) 41 | for (let i = 0; i < count; i++) { 42 | bytes.set(values[i], i * width) 43 | } 44 | } else { 45 | throw new Error(`parquet byte_stream_split unsupported type: ${type}`) 46 | } 47 | 48 | // Write bytes in column format (all byte 0 from all values, then byte 1, etc.) 49 | for (let b = 0; b < width; b++) { 50 | for (let i = 0; i < count; i++) { 51 | writer.appendUint8(bytes[i * width + b]) 52 | } 53 | } 54 | } 55 | 56 | /** 57 | * @param {DecodedArray} values 58 | * @returns {number[]} 59 | */ 60 | function numberArray(values) { 61 | if (Array.isArray(values) && values.every(v => typeof v === 'number')) { 62 | return values 63 | } 64 | throw new Error('Expected number array for BYTE_STREAM_SPLIT encoding') 65 | } 66 | 67 | /** 68 | * @param {DecodedArray} values 69 | * @returns {BigInt64Array} 70 | */ 71 | function bigIntArray(values) { 72 | if (values instanceof BigInt64Array) return values 73 | if (Array.isArray(values) && values.every(v => typeof v === 'bigint')) { 74 | return new BigInt64Array(values) 75 | } 76 | throw new Error('Expected bigint array for BYTE_STREAM_SPLIT encoding') 77 | } 78 | 79 | -------------------------------------------------------------------------------- /src/types.d.ts: -------------------------------------------------------------------------------- 1 | import type { ColumnChunk, ColumnIndex, CompressionCodec, DecodedArray, Encoding, KeyValue, OffsetIndex, SchemaElement } from 'hyparquet' 2 | 3 | export type Compressor = (input: Uint8Array) => Uint8Array 4 | export type Compressors = { [K in CompressionCodec]?: Compressor } 5 | 6 | // Superset of parquet types with automatic conversions 7 | export type BasicType = 8 | 'BOOLEAN' | 9 | 'INT32' | 10 | 'INT64' | 11 | 'FLOAT' | 12 | 'DOUBLE' | 13 | 'BYTE_ARRAY' | 14 | 'STRING' | 15 | 'JSON' | 16 | 'TIMESTAMP' | 17 | 'UUID' | 18 | 'FLOAT16' | 19 | 'GEOMETRY' | 20 | 'GEOGRAPHY' 21 | 22 | export interface ParquetWriteOptions { 23 | writer: Writer 24 | columnData: ColumnSource[] 25 | schema?: SchemaElement[] 26 | codec?: CompressionCodec // global default codec, default 'SNAPPY' 27 | compressors?: Compressors // custom compressors 28 | statistics?: boolean // enable column statistics, default true 29 | rowGroupSize?: number | number[] // number of rows per row group 30 | pageSize?: number // target uncompressed page size in bytes, default 1048576 31 | kvMetadata?: KeyValue[] 32 | } 33 | 34 | export interface ColumnSource { 35 | name: string 36 | data: DecodedArray 37 | type?: BasicType 38 | nullable?: boolean 39 | encoding?: Encoding 40 | columnIndex?: boolean // write column indexes, default false 41 | offsetIndex?: boolean // write offset indexes, default false 42 | } 43 | 44 | export interface PageData { 45 | values: DecodedArray 46 | definitionLevels: number[] 47 | repetitionLevels: number[] 48 | numNulls: number 49 | } 50 | 51 | export interface ColumnEncoder { 52 | columnName: string 53 | element: SchemaElement 54 | schemaPath: SchemaElement[] 55 | codec: CompressionCodec 56 | compressors: Compressors 57 | 
stats: boolean 58 | pageSize: number 59 | // Spec: If ColumnIndex is present, OffsetIndex must also be present 60 | columnIndex: boolean 61 | offsetIndex: boolean 62 | encoding?: Encoding // user-specified encoding 63 | } 64 | 65 | export interface PageIndexes { 66 | chunk: ColumnChunk 67 | columnIndex?: ColumnIndex 68 | offsetIndex?: OffsetIndex 69 | } 70 | 71 | export interface Writer { 72 | buffer: ArrayBuffer 73 | view: DataView 74 | offset: number 75 | 76 | ensure(size: number): void 77 | finish(): void 78 | getBuffer(): ArrayBuffer 79 | appendUint8(value: number): void 80 | appendUint32(value: number): void 81 | appendInt32(value: number): void 82 | appendInt64(value: bigint): void 83 | appendFloat32(value: number): void 84 | appendFloat64(value: number): void 85 | appendBuffer(buffer: ArrayBuffer): void 86 | appendBytes(value: Uint8Array): void 87 | appendVarInt(value: number): void 88 | appendVarBigInt(value: bigint): void 89 | appendZigZag(value: number | bigint): void 90 | } 91 | 92 | export type ThriftObject = { [ key: `field_${number}` ]: ThriftType } 93 | export type ThriftType = boolean | number | bigint | string | Uint8Array | ThriftType[] | ThriftObject | undefined 94 | -------------------------------------------------------------------------------- /test/filewriter.test.js: -------------------------------------------------------------------------------- 1 | import fs from 'fs' 2 | import { afterEach, beforeEach, describe, expect, it } from 'vitest' 3 | import { fileWriter } from '../src/node.js' 4 | 5 | const filedir = 'data/' 6 | const filename = 'data/filewriter.test.bin' 7 | 8 | describe('FileWriter', () => { 9 | beforeEach(() => { 10 | // ensure data directory exists 11 | if (!fs.existsSync(filedir)) { 12 | fs.mkdirSync(filedir) 13 | } 14 | }) 15 | 16 | afterEach(() => { 17 | // remove test file 18 | if (fs.existsSync(filename)) { 19 | fs.unlinkSync(filename) 20 | } 21 | }) 22 | 23 | it('throws an error when calling getBuffer', () => { 24 | const writer = fileWriter(filename) 25 | expect(() => writer.getBuffer()).toThrowError('getBuffer not supported') 26 | }) 27 | 28 | it('writes single byte and flushes on finish', () => { 29 | const writer = fileWriter(filename) 30 | writer.appendUint8(0xff) 31 | writer.finish() 32 | 33 | // verify file exists and content is correct 34 | expect(fs.existsSync(filename)).toBe(true) 35 | const contents = fs.readFileSync(filename) 36 | expect(new Uint8Array(contents)).toEqual(new Uint8Array([0xff])) 37 | }) 38 | 39 | it('writes multiple data types to file', () => { 40 | const writer = fileWriter(filename) 41 | writer.appendUint8(0xab) 42 | writer.appendUint32(0x12345678) 43 | writer.appendInt32(-1) 44 | writer.appendInt64(0x1122334455667788n) 45 | writer.appendVarInt(300) 46 | writer.finish() 47 | 48 | const contents = new Uint8Array(fs.readFileSync(filename)) 49 | 50 | const expected = new Uint8Array([ 51 | 0xab, 52 | 0x78, 0x56, 0x34, 0x12, 53 | 0xff, 0xff, 0xff, 0xff, 54 | 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11, 55 | 0xac, 0x02, 56 | ]) 57 | expect(contents).toEqual(expected) 58 | }) 59 | 60 | it('auto-flushes when exceeding chunk size', () => { 61 | // default chunkSize = 1_000_000 bytes 62 | const writer = fileWriter(filename) 63 | 64 | // write slightly over 1mb to trigger auto-flush 65 | const largeArray = new Uint8Array(1_100_000).fill(0xaa) 66 | writer.appendBytes(largeArray) 67 | writer.appendBytes(largeArray) 68 | 69 | // expect first flush 70 | expect(fs.statSync(filename).size).toBe(1_100_000) 71 | 72 | 
writer.finish() 73 | 74 | // expect final flush 75 | expect(fs.statSync(filename).size).toBe(2_200_000) 76 | }) 77 | 78 | it('overwrites existing file if new writer is created with same filename', () => { 79 | // first write 80 | let writer = fileWriter(filename) 81 | writer.appendBytes(new Uint8Array([0x11, 0x22])) 82 | writer.finish() 83 | 84 | // verify the file now has [0x11, 0x22] 85 | let contents = fs.readFileSync(filename) 86 | expect(new Uint8Array(contents)).toEqual(new Uint8Array([0x11, 0x22])) 87 | 88 | // second write 89 | writer = fileWriter(filename) 90 | writer.appendBytes(new Uint8Array([0xaa, 0xbb])) 91 | writer.finish() 92 | 93 | // should overwrite the previous content 94 | contents = fs.readFileSync(filename) 95 | expect(new Uint8Array(contents)).toEqual(new Uint8Array([0xaa, 0xbb])) 96 | }) 97 | }) 98 | -------------------------------------------------------------------------------- /src/encoding.js: -------------------------------------------------------------------------------- 1 | import { ByteWriter } from './bytewriter.js' 2 | 3 | /** 4 | * @import {DecodedArray} from 'hyparquet' 5 | * @import {Writer} from '../src/types.js' 6 | * @param {Writer} writer 7 | * @param {DecodedArray} values 8 | * @param {number} bitWidth 9 | * @returns {number} bytes written 10 | */ 11 | export function writeRleBitPackedHybrid(writer, values, bitWidth) { 12 | const offsetStart = writer.offset 13 | 14 | // try both RLE and bit-packed and choose the best 15 | const rle = new ByteWriter() 16 | writeRle(rle, values, bitWidth) 17 | const bitPacked = new ByteWriter() 18 | writeBitPacked(bitPacked, values, bitWidth) 19 | 20 | if (rle.offset < bitPacked.offset) { 21 | writer.appendBuffer(rle.getBuffer()) 22 | } else { 23 | writer.appendBuffer(bitPacked.getBuffer()) 24 | } 25 | 26 | return writer.offset - offsetStart 27 | } 28 | 29 | /** 30 | * @param {Writer} writer 31 | * @param {DecodedArray} values 32 | * @param {number} bitWidth 33 | */ 34 | function writeBitPacked(writer, values, bitWidth) { 35 | // Number of 8-value groups 36 | const numGroups = Math.ceil(values.length / 8) 37 | 38 | // The parquet bitpack header: lower bit = 1 => "bit-packed mode" 39 | // upper bits = number of groups 40 | const header = numGroups << 1 | 1 41 | 42 | // Write the header as a varint 43 | writer.appendVarInt(header) 44 | 45 | // If bitWidth = 0, no data is actually needed 46 | if (bitWidth === 0 || values.length === 0) { 47 | return 48 | } 49 | 50 | const mask = (1 << bitWidth) - 1 51 | let buffer = 0 // accumulates bits 52 | let bitsUsed = 0 // how many bits are in 'buffer' so far 53 | 54 | // Write out each value, bit-packing into buffer 55 | for (let i = 0; i < values.length; i++) { 56 | const v = values[i] & mask // mask off bits exceeding bitWidth 57 | buffer |= v << bitsUsed 58 | bitsUsed += bitWidth 59 | 60 | // Flush full bytes 61 | while (bitsUsed >= 8) { 62 | writer.appendUint8(buffer & 0xFF) 63 | buffer >>>= 8 64 | bitsUsed -= 8 65 | } 66 | } 67 | 68 | // Pad the final partial group with zeros if needed 69 | const totalNeeded = numGroups * 8 70 | for (let padCount = values.length; padCount < totalNeeded; padCount++) { 71 | // Just write a 0 value into the buffer 72 | buffer |= 0 << bitsUsed 73 | bitsUsed += bitWidth 74 | while (bitsUsed >= 8) { 75 | writer.appendUint8(buffer & 0xFF) 76 | buffer >>>= 8 77 | bitsUsed -= 8 78 | } 79 | } 80 | 81 | // Flush any remaining bits 82 | if (bitsUsed > 0) { 83 | writer.appendUint8(buffer & 0xff) 84 | } 85 | } 86 | 87 | /** 88 | * Run-length encoding: 
write repeated values by encoding the value and its count. 89 | * 90 | * @param {Writer} writer 91 | * @param {DecodedArray} values 92 | * @param {number} bitWidth 93 | */ 94 | function writeRle(writer, values, bitWidth) { 95 | if (!values.length) return 96 | 97 | let currentValue = values[0] 98 | let count = 1 99 | 100 | for (let i = 1; i <= values.length; i++) { 101 | if (i < values.length && values[i] === currentValue) { 102 | count++ // continue the run 103 | } else { 104 | // write the count of repeated values 105 | const header = count << 1 106 | writer.appendVarInt(header) 107 | 108 | // write the value 109 | const width = bitWidth + 7 >> 3 // bytes needed 110 | for (let j = 0; j < width; j++) { 111 | writer.appendUint8(currentValue >> (j << 3) & 0xff) 112 | } 113 | 114 | // reset for the next run 115 | if (i < values.length) { 116 | currentValue = values[i] 117 | count = 1 118 | } 119 | } 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /test/schema.test.js: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from 'vitest' 2 | import { autoSchemaElement, getMaxDefinitionLevel, getMaxRepetitionLevel, schemaFromColumnData } from '../src/schema.js' 3 | 4 | /** 5 | * @import {SchemaElement} from 'hyparquet' 6 | */ 7 | 8 | describe('schemaFromColumnData', () => { 9 | it('honours provided type with nullable = false → REQUIRED', () => { 10 | const schema = schemaFromColumnData({ 11 | columnData: [ 12 | { name: 'id', data: new Int32Array([1, 2, 3]), type: 'INT32', nullable: false }, 13 | ], 14 | }) 15 | expect(schema[1]).toEqual({ name: 'id', type: 'INT32', repetition_type: 'REQUIRED' }) 16 | }) 17 | 18 | it('applies valid schema override verbatim', () => { 19 | const schema = schemaFromColumnData({ 20 | columnData: [{ name: 'strings', data: ['a', 'b'] }], 21 | schemaOverrides: { 22 | strings: { 23 | name: 'strings', 24 | type: 'BYTE_ARRAY', 25 | converted_type: 'UTF8', 26 | repetition_type: 'OPTIONAL', 27 | }, 28 | }, 29 | }) 30 | expect(schema[1].name).toBe('strings') 31 | expect(schema[1].type).toBe('BYTE_ARRAY') 32 | expect(schema[1].converted_type).toBe('UTF8') 33 | expect(schema[1].repetition_type).toBe('OPTIONAL') 34 | }) 35 | 36 | it('throws when column lengths differ', () => { 37 | expect(() => 38 | schemaFromColumnData({ 39 | columnData: [ 40 | { name: 'a', data: new Int32Array([1]) }, 41 | { name: 'b', data: new Int32Array([1, 2]) }, 42 | ], 43 | }) 44 | ).toThrow(/columns must have the same length/) 45 | }) 46 | 47 | it('rejects override with mismatched name', () => { 48 | expect(() => 49 | schemaFromColumnData({ 50 | columnData: [{ name: 'x', data: new Int32Array([1]) }], 51 | schemaOverrides: { x: { name: 'y', type: 'INT32' } }, 52 | }) 53 | ).toThrow(/does not match column name/) 54 | }) 55 | }) 56 | 57 | describe('autoSchemaElement', () => { 58 | it.each([ 59 | [new Int32Array([1, 2]), 'INT32'], 60 | [new BigInt64Array([1n, 2n]), 'INT64'], 61 | [new Float32Array([1, 2]), 'FLOAT'], 62 | [new Float64Array([1, 2]), 'DOUBLE'], 63 | ])('detects typed arrays (%#)', (data, expected) => { 64 | const el = autoSchemaElement('col', data) 65 | expect(el.type).toBe(expected) 66 | expect(el.repetition_type).toBe('REQUIRED') 67 | }) 68 | 69 | it('promotes INT32 + DOUBLE mix to DOUBLE', () => { 70 | const el = autoSchemaElement('mix', [1, 2.5]) 71 | expect(el.type).toBe('DOUBLE') 72 | }) 73 | 74 | it('sets repetition_type OPTIONAL when nulls present', () => { 75 | const el 
= autoSchemaElement('maybe', [null, 1]) 76 | expect(el.repetition_type).toBe('OPTIONAL') 77 | }) 78 | 79 | it('falls back to BYTE_ARRAY for empty arrays', () => { 80 | const el = autoSchemaElement('empty', []) 81 | expect(el.type).toBe('BYTE_ARRAY') 82 | expect(el.repetition_type).toBe('OPTIONAL') 83 | }) 84 | 85 | it('throws on incompatible mixed scalar types', () => { 86 | expect(() => autoSchemaElement('bad', [1, 'a'])).toThrow(/mixed types/) 87 | }) 88 | }) 89 | 90 | describe('level helpers', () => { 91 | /** @type {SchemaElement[]} */ 92 | const path = [ 93 | { name: 'root', repetition_type: 'REPEATED' }, 94 | { name: 'child', repetition_type: 'OPTIONAL' }, 95 | { name: 'leaf', repetition_type: 'REPEATED' }, 96 | ] 97 | 98 | it('computes max repetition level', () => { 99 | expect(getMaxRepetitionLevel(path)).toBe(2) 100 | }) 101 | 102 | it('computes max definition level', () => { 103 | expect(getMaxDefinitionLevel(path)).toBe(2) 104 | }) 105 | }) 106 | -------------------------------------------------------------------------------- /test/bytewriter.test.js: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from 'vitest' 2 | import { ByteWriter } from '../src/bytewriter.js' 3 | 4 | describe('ByteWriter', () => { 5 | it('initializes with correct defaults', () => { 6 | const writer = new ByteWriter() 7 | expect(writer.offset).toBe(0) 8 | expect(writer.buffer.byteLength).toBe(1024) 9 | }) 10 | 11 | it('appendUint8 writes single byte', () => { 12 | const writer = new ByteWriter() 13 | writer.appendUint8(255) 14 | expect(new Uint8Array(writer.getBuffer())).toEqual(new Uint8Array([0xff])) 15 | }) 16 | 17 | it('appendUint32 writes a 32-bit integer in little-endian', () => { 18 | const writer = new ByteWriter() 19 | writer.appendUint32(0x12345678) 20 | expect(new Uint8Array(writer.getBuffer())).toEqual( 21 | new Uint8Array([0x78, 0x56, 0x34, 0x12]) 22 | ) 23 | }) 24 | 25 | it('appendInt32 writes signed 32-bit integer in little-endian', () => { 26 | const writer = new ByteWriter() 27 | writer.appendInt32(-1) 28 | expect(new Uint8Array(writer.getBuffer())).toEqual( 29 | new Uint8Array([0xff, 0xff, 0xff, 0xff]) 30 | ) 31 | }) 32 | 33 | it('appendInt64 writes a 64-bit bigint in little-endian', () => { 34 | const writer = new ByteWriter() 35 | writer.appendInt64(0x1122334455667788n) 36 | expect(new Uint8Array(writer.getBuffer())).toEqual( 37 | new Uint8Array([0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11]) 38 | ) 39 | }) 40 | 41 | it('appendFloat64 writes a 64-bit float in little-endian', () => { 42 | const writer = new ByteWriter() 43 | writer.appendFloat64(1.0) 44 | expect(new Uint8Array(writer.getBuffer())).toEqual( 45 | new Uint8Array([0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f]) 46 | ) 47 | }) 48 | 49 | it('appendBytes writes raw Uint8Array data', () => { 50 | const writer = new ByteWriter() 51 | writer.appendBytes(new Uint8Array([1, 2, 3, 4])) 52 | expect(new Uint8Array(writer.getBuffer())).toEqual(new Uint8Array([1, 2, 3, 4])) 53 | }) 54 | 55 | it('appendBuffer writes raw ArrayBuffer data', () => { 56 | const writer = new ByteWriter() 57 | const buf = new Uint8Array([10, 20, 30]).buffer 58 | writer.appendBuffer(buf) 59 | expect(new Uint8Array(writer.getBuffer())).toEqual(new Uint8Array([10, 20, 30])) 60 | }) 61 | 62 | it('appendVarInt encodes 32-bit varint', () => { 63 | const writer = new ByteWriter() 64 | writer.appendVarInt(127) 65 | writer.appendVarInt(128) 66 | writer.appendVarInt(300) 67 | 68 | expect(new 
Uint8Array(writer.getBuffer())).toEqual( 69 | new Uint8Array([ 70 | 0x7f, // 127 71 | 0x80, 0x01, // 128 72 | 0xac, 0x02, // 300 73 | ]) 74 | ) 75 | }) 76 | 77 | it('appendVarBigInt encodes bigint varint', () => { 78 | const writer = new ByteWriter() 79 | writer.appendVarBigInt(127n) 80 | writer.appendVarBigInt(128n) 81 | writer.appendVarBigInt(300n) 82 | 83 | expect(new Uint8Array(writer.getBuffer())).toEqual( 84 | new Uint8Array([ 85 | 0x7f, // 127 86 | 0x80, 0x01, // 128 87 | 0xac, 0x02, // 300 88 | ]) 89 | ) 90 | }) 91 | 92 | it('expands buffer automatically when needed', () => { 93 | const writer = new ByteWriter() 94 | // force expansion by writing more than initial 1024 bytes 95 | const largeArray = new Uint8Array(2000).fill(0xaa) 96 | writer.appendBytes(largeArray) 97 | expect(writer.buffer.byteLength).toBeGreaterThanOrEqual(2000) 98 | expect(new Uint8Array(writer.getBuffer()).length).toBe(2000) 99 | }) 100 | 101 | it('finish does nothing but is callable', () => { 102 | const writer = new ByteWriter() 103 | writer.finish() 104 | expect(writer.getBuffer().byteLength).toBe(0) 105 | }) 106 | }) 107 | -------------------------------------------------------------------------------- /test/write.delta.test.js: -------------------------------------------------------------------------------- 1 | import { parquetMetadata, parquetReadObjects } from 'hyparquet' 2 | import { describe, expect, it } from 'vitest' 3 | import { parquetWriteBuffer } from '../src/index.js' 4 | 5 | describe('DELTA_BINARY_PACKED encoding', () => { 6 | it('writes DELTA_BINARY_PACKED encoding for INT32', async () => { 7 | const data = [1, 2, 3, 100, 200, 300] 8 | const file = parquetWriteBuffer({ 9 | columnData: [{ name: 'int', data, encoding: 'DELTA_BINARY_PACKED' }], 10 | }) 11 | const metadata = parquetMetadata(file) 12 | expect(metadata.row_groups[0].columns[0].meta_data?.encodings).toEqual(['DELTA_BINARY_PACKED']) 13 | const result = await parquetReadObjects({ file }) 14 | expect(result).toEqual(data.map(int => ({ int }))) 15 | }) 16 | 17 | it('writes DELTA_BINARY_PACKED encoding for INT64', async () => { 18 | const data = [1n, 2n, 3n, 100n, 200n, 300n] 19 | const file = parquetWriteBuffer({ 20 | columnData: [{ name: 'bigint', data, encoding: 'DELTA_BINARY_PACKED' }], 21 | }) 22 | const metadata = parquetMetadata(file) 23 | expect(metadata.row_groups[0].columns[0].meta_data?.encodings).toEqual(['DELTA_BINARY_PACKED']) 24 | const result = await parquetReadObjects({ file }) 25 | expect(result).toEqual(data.map(bigint => ({ bigint }))) 26 | }) 27 | }) 28 | 29 | describe('DELTA_LENGTH_BYTE_ARRAY encoding', () => { 30 | it('writes DELTA_LENGTH_BYTE_ARRAY encoding for strings', async () => { 31 | const data = ['hello', 'world', 'foo', 'bar', 'baz', 'qux'] 32 | const file = parquetWriteBuffer({ 33 | columnData: [{ name: 'string', data, encoding: 'DELTA_LENGTH_BYTE_ARRAY' }], 34 | }) 35 | const metadata = parquetMetadata(file) 36 | expect(metadata.row_groups[0].columns[0].meta_data?.encodings).toEqual(['DELTA_LENGTH_BYTE_ARRAY']) 37 | const result = await parquetReadObjects({ file }) 38 | expect(result).toEqual(data.map(string => ({ string }))) 39 | }) 40 | 41 | it('writes DELTA_LENGTH_BYTE_ARRAY encoding for byte arrays', async () => { 42 | const data = [ 43 | Uint8Array.of(1, 2, 3), 44 | Uint8Array.of(4, 5, 6, 7), 45 | Uint8Array.of(8, 9), 46 | Uint8Array.of(10, 11, 12, 13, 14), 47 | ] 48 | const file = parquetWriteBuffer({ 49 | columnData: [{ name: 'bytes', data, encoding: 'DELTA_LENGTH_BYTE_ARRAY' }], 50 | }) 51 | const 
metadata = parquetMetadata(file) 52 | expect(metadata.row_groups[0].columns[0].meta_data?.encodings).toEqual(['DELTA_LENGTH_BYTE_ARRAY']) 53 | const result = await parquetReadObjects({ file, utf8: false }) 54 | expect(result).toEqual(data.map(bytes => ({ bytes }))) 55 | }) 56 | }) 57 | 58 | describe('DELTA_BYTE_ARRAY encoding', () => { 59 | it('writes DELTA_BYTE_ARRAY encoding for strings with common prefixes', async () => { 60 | const data = ['apple', 'application', 'apply', 'banana', 'band', 'bandana'] 61 | const file = parquetWriteBuffer({ 62 | columnData: [{ name: 'string', data, encoding: 'DELTA_BYTE_ARRAY' }], 63 | }) 64 | const metadata = parquetMetadata(file) 65 | expect(metadata.row_groups[0].columns[0].meta_data?.encodings).toEqual(['DELTA_BYTE_ARRAY']) 66 | const result = await parquetReadObjects({ file }) 67 | expect(result).toEqual(data.map(string => ({ string }))) 68 | }) 69 | 70 | it('writes DELTA_BYTE_ARRAY encoding for byte arrays', async () => { 71 | const data = [ 72 | Uint8Array.of(1, 2, 3, 4), 73 | Uint8Array.of(1, 2, 5, 6), 74 | Uint8Array.of(1, 2, 7, 8), 75 | Uint8Array.of(10, 11, 12, 13), 76 | ] 77 | const file = parquetWriteBuffer({ 78 | columnData: [{ name: 'bytes', data, encoding: 'DELTA_BYTE_ARRAY' }], 79 | }) 80 | const metadata = parquetMetadata(file) 81 | expect(metadata.row_groups[0].columns[0].meta_data?.encodings).toEqual(['DELTA_BYTE_ARRAY']) 82 | const result = await parquetReadObjects({ file, utf8: false }) 83 | expect(result).toEqual(data.map(bytes => ({ bytes }))) 84 | }) 85 | }) 86 | -------------------------------------------------------------------------------- /test/write.splitstream.test.js: -------------------------------------------------------------------------------- 1 | import { parquetMetadata, parquetReadObjects } from 'hyparquet' 2 | import { describe, expect, it } from 'vitest' 3 | import { parquetWriteBuffer } from '../src/index.js' 4 | 5 | describe('BYTE_STREAM_SPLIT encoding', () => { 6 | it('writes BYTE_STREAM_SPLIT encoding for FLOAT', async () => { 7 | const data = [1.5, 2.25, 3.125, -4.5, 0.0, 100.75] 8 | const file = parquetWriteBuffer({ 9 | columnData: [{ name: 'float', data, encoding: 'BYTE_STREAM_SPLIT' }], 10 | }) 11 | const metadata = parquetMetadata(file) 12 | const columnMetadata = metadata.row_groups[0].columns[0].meta_data 13 | expect(columnMetadata?.encodings).toEqual(['BYTE_STREAM_SPLIT']) 14 | const result = await parquetReadObjects({ file }) 15 | expect(result).toEqual(data.map(float => ({ float }))) 16 | }) 17 | 18 | it('writes BYTE_STREAM_SPLIT encoding for DOUBLE', async () => { 19 | const data = [1.5, 2.25, 3.125, -4.5, 0.0, 100.75, 1e100, -1e-100] 20 | const file = parquetWriteBuffer({ 21 | columnData: [{ name: 'double', data, type: 'DOUBLE', encoding: 'BYTE_STREAM_SPLIT' }], 22 | }) 23 | const metadata = parquetMetadata(file) 24 | const columnMetadata = metadata.row_groups[0].columns[0].meta_data 25 | expect(columnMetadata?.encodings).toEqual(['BYTE_STREAM_SPLIT']) 26 | const result = await parquetReadObjects({ file }) 27 | expect(result).toEqual(data.map(double => ({ double }))) 28 | }) 29 | 30 | it('writes BYTE_STREAM_SPLIT encoding for INT32', async () => { 31 | const data = [1, 2, 3, -100, 0, 2147483647, -2147483648] 32 | const file = parquetWriteBuffer({ 33 | columnData: [{ name: 'int', data, encoding: 'BYTE_STREAM_SPLIT' }], 34 | }) 35 | const metadata = parquetMetadata(file) 36 | const columnMetadata = metadata.row_groups[0].columns[0].meta_data 37 | 
expect(columnMetadata?.encodings).toEqual(['BYTE_STREAM_SPLIT']) 38 | const result = await parquetReadObjects({ file }) 39 | expect(result).toEqual(data.map(int => ({ int }))) 40 | }) 41 | 42 | it('writes BYTE_STREAM_SPLIT encoding for INT64', async () => { 43 | const data = [1n, 2n, 3n, -100n, 0n, 9223372036854775807n, -9223372036854775808n] 44 | const file = parquetWriteBuffer({ 45 | columnData: [{ name: 'bigint', data, encoding: 'BYTE_STREAM_SPLIT' }], 46 | }) 47 | const metadata = parquetMetadata(file) 48 | const columnMetadata = metadata.row_groups[0].columns[0].meta_data 49 | expect(columnMetadata?.encodings).toEqual(['BYTE_STREAM_SPLIT']) 50 | const result = await parquetReadObjects({ file }) 51 | expect(result).toEqual(data.map(bigint => ({ bigint }))) 52 | }) 53 | 54 | it('writes BYTE_STREAM_SPLIT encoding with nulls', async () => { 55 | const data = [1.5, null, 3.125, null, 0.0, 100.75] 56 | const file = parquetWriteBuffer({ 57 | columnData: [{ name: 'float', data, encoding: 'BYTE_STREAM_SPLIT' }], 58 | }) 59 | const metadata = parquetMetadata(file) 60 | const columnMetadata = metadata.row_groups[0].columns[0].meta_data 61 | expect(columnMetadata?.encodings).toContain('BYTE_STREAM_SPLIT') 62 | const result = await parquetReadObjects({ file }) 63 | expect(result).toEqual(data.map(float => ({ float }))) 64 | }) 65 | 66 | it('writes BYTE_STREAM_SPLIT encoding with compression', async () => { 67 | const data = Array.from({ length: 1000 }, (_, i) => i * 0.1) 68 | const file = parquetWriteBuffer({ 69 | columnData: [{ name: 'float', data, encoding: 'BYTE_STREAM_SPLIT' }], 70 | }) 71 | const metadata = parquetMetadata(file) 72 | const columnMetadata = metadata.row_groups[0].columns[0].meta_data 73 | expect(columnMetadata?.encodings).toEqual(['BYTE_STREAM_SPLIT']) 74 | expect(columnMetadata?.codec).toBe('SNAPPY') 75 | const result = await parquetReadObjects({ file }) 76 | expect(result.length).toBe(1000) 77 | result.forEach((row, i) => { 78 | expect(row.float).toBeCloseTo(i * 0.1, 5) 79 | }) 80 | }) 81 | }) 82 | -------------------------------------------------------------------------------- /test/splitstream.test.js: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from 'vitest' 2 | import { ByteWriter } from '../src/bytewriter.js' 3 | import { writeByteStreamSplit } from '../src/splitstream.js' 4 | import { byteStreamSplit } from 'hyparquet/src/encoding.js' 5 | 6 | /** 7 | * @import {DecodedArray, ParquetType} from 'hyparquet' 8 | * @param {DecodedArray} values 9 | * @param {ParquetType} type 10 | * @param {number} [typeLength] 11 | * @returns {DecodedArray} 12 | */ 13 | function roundTrip(values, type, typeLength) { 14 | const writer = new ByteWriter() 15 | writeByteStreamSplit(writer, values, type, typeLength) 16 | const buffer = writer.getBuffer() 17 | const reader = { view: new DataView(buffer), offset: 0 } 18 | return byteStreamSplit(reader, values.length, type, typeLength) 19 | } 20 | 21 | describe('BYTE_STREAM_SPLIT encoding', () => { 22 | describe('FLOAT', () => { 23 | it('should round-trip float values', () => { 24 | const original = [1.5, 2.25, 3.125, -4.5, 0.0, 100.75] 25 | expect(Array.from(roundTrip(original, 'FLOAT'))).toEqual(original) 26 | }) 27 | 28 | it('should round-trip an empty array', () => { 29 | expect(Array.from(roundTrip([], 'FLOAT'))).toEqual([]) 30 | }) 31 | 32 | it('should round-trip special float values', () => { 33 | const decoded = roundTrip([0.0, -0.0, Infinity, -Infinity], 'FLOAT') 34 | 
expect(decoded[0]).toBe(0.0) 35 | expect(decoded[1]).toBe(-0.0) 36 | expect(decoded[2]).toBe(Infinity) 37 | expect(decoded[3]).toBe(-Infinity) 38 | }) 39 | }) 40 | 41 | describe('DOUBLE', () => { 42 | it('should round-trip double values', () => { 43 | const original = [1.5, 2.25, 3.125, -4.5, 0.0, 100.75, 1e100, -1e-100] 44 | expect(Array.from(roundTrip(original, 'DOUBLE'))).toEqual(original) 45 | }) 46 | 47 | it('should round-trip an empty array', () => { 48 | expect(Array.from(roundTrip([], 'DOUBLE'))).toEqual([]) 49 | }) 50 | }) 51 | 52 | describe('INT32', () => { 53 | it('should round-trip int32 values', () => { 54 | const original = [1, 2, 3, -100, 0, 2147483647, -2147483648] 55 | expect(Array.from(roundTrip(original, 'INT32'))).toEqual(original) 56 | }) 57 | 58 | it('should round-trip an empty array', () => { 59 | expect(Array.from(roundTrip([], 'INT32'))).toEqual([]) 60 | }) 61 | }) 62 | 63 | describe('INT64', () => { 64 | it('should round-trip int64 values', () => { 65 | const original = [1n, 2n, 3n, -100n, 0n, 9223372036854775807n, -9223372036854775808n] 66 | expect(Array.from(roundTrip(original, 'INT64'))).toEqual(original) 67 | }) 68 | 69 | it('should round-trip an empty array', () => { 70 | expect(Array.from(roundTrip([], 'INT64'))).toEqual([]) 71 | }) 72 | }) 73 | 74 | describe('FIXED_LEN_BYTE_ARRAY', () => { 75 | it('should round-trip fixed-length byte arrays', () => { 76 | const original = [ 77 | new Uint8Array([1, 2, 3, 4]), 78 | new Uint8Array([5, 6, 7, 8]), 79 | new Uint8Array([9, 10, 11, 12]), 80 | ] 81 | const decoded = roundTrip(original, 'FIXED_LEN_BYTE_ARRAY', 4) 82 | expect(decoded).toHaveLength(3) 83 | expect(Array.from(decoded[0])).toEqual([1, 2, 3, 4]) 84 | expect(Array.from(decoded[1])).toEqual([5, 6, 7, 8]) 85 | expect(Array.from(decoded[2])).toEqual([9, 10, 11, 12]) 86 | }) 87 | 88 | it('should round-trip an empty array', () => { 89 | const decoded = roundTrip([], 'FIXED_LEN_BYTE_ARRAY', 4) 90 | expect(Array.from(decoded)).toEqual([]) 91 | }) 92 | 93 | it('should throw without typeLength', () => { 94 | const writer = new ByteWriter() 95 | expect(() => writeByteStreamSplit(writer, [], 'FIXED_LEN_BYTE_ARRAY', undefined)) 96 | .toThrow('missing type_length') 97 | }) 98 | }) 99 | 100 | describe('errors', () => { 101 | it('should throw for unsupported type', () => { 102 | const writer = new ByteWriter() 103 | expect(() => writeByteStreamSplit(writer, [], 'BOOLEAN', undefined)) 104 | .toThrow('unsupported type') 105 | }) 106 | }) 107 | }) 108 | -------------------------------------------------------------------------------- /src/dremel.js: -------------------------------------------------------------------------------- 1 | import { getMaxDefinitionLevel } from './schema.js' 2 | 3 | /** 4 | * Encode nested list values into repetition and definition levels. 
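 * Repetition levels mark whether a value starts a new row or continues an enclosing list (0 = new row); definition levels count how many OPTIONAL/REPEATED ancestors are actually present, so nulls and empty lists can be reconstructed when reading.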
5 | * 6 | * @import {SchemaElement} from 'hyparquet' 7 | * @import {PageData} from '../src/types.js' 8 | * @param {SchemaElement[]} schemaPath schema elements from root to leaf 9 | * @param {any[]} rows column data for the current row group 10 | * @returns {PageData} encoded list values 11 | */ 12 | export function encodeListValues(schemaPath, rows) { 13 | if (schemaPath.length < 2) throw new Error('parquet list schema path must include column') 14 | /** @type {any[]} */ 15 | const values = [] 16 | /** @type {number[]} */ 17 | const definitionLevels = [] 18 | /** @type {number[]} */ 19 | const repetitionLevels = [] 20 | 21 | // Track repetition depth prior to each level 22 | const repLevelPrior = new Array(schemaPath.length) 23 | let repeatedCount = 0 24 | for (let i = 0; i < schemaPath.length; i++) { 25 | repLevelPrior[i] = repeatedCount 26 | if (schemaPath[i].repetition_type === 'REPEATED') repeatedCount++ 27 | } 28 | 29 | const leafIndex = schemaPath.length - 1 30 | const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath) 31 | 32 | for (let row = 0; row < rows.length; row++) { 33 | visit(1, rows[row], 0, 0, false) 34 | } 35 | 36 | const numNulls = definitionLevels.reduce( 37 | (count, def) => def === maxDefinitionLevel ? count : count + 1, 38 | 0 39 | ) 40 | 41 | return { values, definitionLevels, repetitionLevels, numNulls } 42 | 43 | /** 44 | * Recursively walk the schema path, emitting definition/repetition pairs. 45 | * 46 | * @param {number} depth index into schemaPath 47 | * @param {any} value value at the current depth 48 | * @param {number} defLevel definition level accumulated so far 49 | * @param {number} repLevel repetition level for the next emitted slot 50 | * @param {boolean} allowNull whether the current value is allowed to be null 51 | */ 52 | function visit(depth, value, defLevel, repLevel, allowNull) { 53 | const element = schemaPath[depth] 54 | const repetition = element.repetition_type || 'REQUIRED' 55 | const isLeaf = depth === leafIndex 56 | 57 | if (isLeaf) { 58 | if (value === null || value === undefined) { 59 | if (repetition === 'REQUIRED' && !allowNull) { 60 | throw new Error('parquet required value is undefined') 61 | } 62 | definitionLevels.push(defLevel) 63 | repetitionLevels.push(repLevel) 64 | values.push(null) 65 | } else { 66 | const finalDef = repetition === 'REQUIRED' ? defLevel : defLevel + 1 67 | definitionLevels.push(finalDef) 68 | repetitionLevels.push(repLevel) 69 | values.push(value) 70 | } 71 | return 72 | } 73 | 74 | if (repetition === 'REPEATED') { 75 | if (value === null || value === undefined) { 76 | if (!allowNull) throw new Error('parquet required value is undefined') 77 | visit(depth + 1, undefined, defLevel, repLevel, true) 78 | return 79 | } 80 | if (!Array.isArray(value)) { 81 | throw new Error(`parquet repeated field ${element.name} must be an array`) 82 | } 83 | if (!value.length) { 84 | visit(depth + 1, undefined, defLevel, repLevel, true) 85 | return 86 | } 87 | for (let i = 0; i < value.length; i++) { 88 | const childRep = i === 0 ? 
repLevel : repLevelPrior[depth] + 1 89 | visit(depth + 1, value[i], defLevel + 1, childRep, false) 90 | } 91 | return 92 | } 93 | 94 | if (repetition === 'OPTIONAL') { 95 | if (value === null || value === undefined) { 96 | visit(depth + 1, undefined, defLevel, repLevel, true) 97 | } else { 98 | visit(depth + 1, value, defLevel + 1, repLevel, false) 99 | } 100 | return 101 | } 102 | 103 | // REQUIRED 104 | if (value === null || value === undefined) { 105 | if (!allowNull) throw new Error('parquet required value is undefined') 106 | visit(depth + 1, undefined, defLevel, repLevel, true) 107 | } else { 108 | visit(depth + 1, value, defLevel, repLevel, false) 109 | } 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /test/thrift.test.js: -------------------------------------------------------------------------------- 1 | import { deserializeTCompactProtocol } from 'hyparquet/src/thrift.js' 2 | import { describe, expect, it } from 'vitest' 3 | import { serializeTCompactProtocol } from '../src/thrift.js' 4 | import { ByteWriter } from '../src/bytewriter.js' 5 | import { logicalType } from '../src/metadata.js' 6 | 7 | /** 8 | * Utility to decode a Thrift-serialized buffer and return the parsed object. 9 | * @param {ArrayBuffer} buf 10 | * @returns {Record} 11 | */ 12 | function roundTripDeserialize(buf) { 13 | const view = new DataView(buf) 14 | const reader = { view, offset: 0 } 15 | return deserializeTCompactProtocol(reader) 16 | } 17 | 18 | describe('serializeTCompactProtocol', () => { 19 | it('serializes basic types correctly', () => { 20 | const data = { 21 | field_1: true, // BOOL -> TRUE 22 | field_2: false, // BOOL -> FALSE 23 | field_3: 127, // BYTE / I32 24 | field_4: 0x7fff, // I16 / I32 25 | field_5: 0x7fffffff, // I32 26 | field_6: BigInt('0x7fffffffffffffff'), // I64 27 | field_7: 123.456, // DOUBLE 28 | field_8: 'Hello, Thrift!', 29 | field_9: new TextEncoder().encode('Hello, Thrift!'), 30 | } 31 | 32 | const writer = new ByteWriter() 33 | serializeTCompactProtocol(writer, data) 34 | const result = roundTripDeserialize(writer.getBuffer()) 35 | 36 | expect(result.field_1).toBe(true) 37 | expect(result.field_2).toBe(false) 38 | expect(result.field_3).toBe(127) 39 | expect(result.field_4).toBe(0x7fff) 40 | expect(result.field_5).toBe(0x7fffffff) 41 | expect(result.field_6).toBe(BigInt('0x7fffffffffffffff')) 42 | expect(result.field_7).toBeCloseTo(123.456) 43 | // Decode the binary back into a string 44 | const decoder = new TextDecoder() 45 | expect(decoder.decode(result.field_8)).toBe('Hello, Thrift!') 46 | expect(decoder.decode(result.field_9)).toBe('Hello, Thrift!') 47 | }) 48 | 49 | it('serializes a nested STRUCT and LIST of booleans', () => { 50 | const data = { 51 | field_1: { 52 | field_1: 42, 53 | field_2: { 54 | field_1: true, 55 | field_2: false, 56 | }, 57 | }, 58 | // List of booleans 59 | field_2: [true, false, true, false], 60 | } 61 | 62 | const writer = new ByteWriter() 63 | serializeTCompactProtocol(writer, data) 64 | const result = roundTripDeserialize(writer.getBuffer()) 65 | 66 | expect(result.field_1.field_1).toBe(42) 67 | expect(result.field_1.field_2.field_1).toBe(true) 68 | expect(result.field_1.field_2.field_2).toBe(false) 69 | expect(result.field_2).toEqual([true, false, true, false]) 70 | }) 71 | 72 | it('handles empty object (only STOP)', () => { 73 | const data = {} 74 | const writer = new ByteWriter() 75 | serializeTCompactProtocol(writer, data) 76 | const arr = new Uint8Array(writer.getBuffer()) 77 | // 
The entire buffer should just be [0x00] = STOP 78 | expect(arr).toEqual(new Uint8Array([0x00])) 79 | 80 | // Round-trip: should deserialize to an empty object 81 | const result = roundTripDeserialize(writer.getBuffer()) 82 | expect(result).toEqual({}) 83 | }) 84 | 85 | it('throws on non-monotonic field IDs', () => { 86 | const invalidData = { 87 | field_2: 2, 88 | field_1: 1, // field_1 is out of order (less than field_2) 89 | } 90 | const writer = new ByteWriter() 91 | expect(() => serializeTCompactProtocol(writer, invalidData)).toThrow() 92 | }) 93 | 94 | it('serializes field IDs with gaps larger than 15', () => { 95 | const data = { field_1: 1, field_17: 17 } 96 | const writer = new ByteWriter() 97 | serializeTCompactProtocol(writer, data) 98 | const result = roundTripDeserialize(writer.getBuffer()) 99 | expect(result.field_1).toBe(1) 100 | expect(result.field_17).toBe(17) 101 | }) 102 | 103 | it('serializes GEOMETRY logicalType struct with field_17', () => { 104 | const data = { field_1: logicalType({ type: 'GEOMETRY' }) } 105 | const writer = new ByteWriter() 106 | serializeTCompactProtocol(writer, data) 107 | const result = roundTripDeserialize(writer.getBuffer()) 108 | expect(result.field_1.field_17).toEqual({}) 109 | }) 110 | }) 111 | -------------------------------------------------------------------------------- /src/plain.js: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @import {DecodedArray, ParquetType} from 'hyparquet' 4 | * @import {Writer} from '../src/types.js' 5 | * @param {Writer} writer 6 | * @param {DecodedArray} values 7 | * @param {ParquetType} type 8 | * @param {number | undefined} fixedLength 9 | */ 10 | export function writePlain(writer, values, type, fixedLength) { 11 | if (type === 'BOOLEAN') { 12 | writePlainBoolean(writer, values) 13 | } else if (type === 'INT32') { 14 | writePlainInt32(writer, values) 15 | } else if (type === 'INT64') { 16 | writePlainInt64(writer, values) 17 | } else if (type === 'FLOAT') { 18 | writePlainFloat(writer, values) 19 | } else if (type === 'DOUBLE') { 20 | writePlainDouble(writer, values) 21 | } else if (type === 'BYTE_ARRAY') { 22 | writePlainByteArray(writer, values) 23 | } else if (type === 'FIXED_LEN_BYTE_ARRAY') { 24 | if (!fixedLength) throw new Error('parquet FIXED_LEN_BYTE_ARRAY expected type_length') 25 | writePlainByteArrayFixed(writer, values, fixedLength) 26 | } else { 27 | throw new Error(`parquet unsupported type: ${type}`) 28 | } 29 | } 30 | 31 | /** 32 | * @param {Writer} writer 33 | * @param {DecodedArray} values 34 | */ 35 | function writePlainBoolean(writer, values) { 36 | let currentByte = 0 37 | 38 | for (let i = 0; i < values.length; i++) { 39 | if (typeof values[i] !== 'boolean') throw new Error('parquet expected boolean value') 40 | const bitOffset = i % 8 41 | 42 | if (values[i]) { 43 | currentByte |= 1 << bitOffset 44 | } 45 | 46 | // once we've packed 8 bits or are at a multiple of 8, we write out the byte 47 | if (bitOffset === 7) { 48 | writer.appendUint8(currentByte) 49 | currentByte = 0 50 | } 51 | } 52 | 53 | // if the array length is not a multiple of 8, write the leftover bits 54 | if (values.length % 8 !== 0) { 55 | writer.appendUint8(currentByte) 56 | } 57 | } 58 | 59 | /** 60 | * @param {Writer} writer 61 | * @param {DecodedArray} values 62 | */ 63 | function writePlainInt32(writer, values) { 64 | for (const value of values) { 65 | if (!Number.isSafeInteger(value)) throw new Error('parquet expected integer value') 66 | 
writer.appendInt32(value) 67 | } 68 | } 69 | 70 | /** 71 | * @param {Writer} writer 72 | * @param {DecodedArray} values 73 | */ 74 | function writePlainInt64(writer, values) { 75 | for (const value of values) { 76 | if (typeof value !== 'bigint') throw new Error('parquet expected bigint value') 77 | writer.appendInt64(value) 78 | } 79 | } 80 | 81 | /** 82 | * @param {Writer} writer 83 | * @param {DecodedArray} values 84 | */ 85 | function writePlainFloat(writer, values) { 86 | for (const value of values) { 87 | if (typeof value !== 'number') throw new Error('parquet expected number value') 88 | writer.appendFloat32(value) 89 | } 90 | } 91 | 92 | /** 93 | * @param {Writer} writer 94 | * @param {DecodedArray} values 95 | */ 96 | function writePlainDouble(writer, values) { 97 | for (const value of values) { 98 | if (typeof value !== 'number') throw new Error('parquet expected number value') 99 | writer.appendFloat64(value) 100 | } 101 | } 102 | 103 | /** 104 | * @param {Writer} writer 105 | * @param {DecodedArray} values 106 | */ 107 | function writePlainByteArray(writer, values) { 108 | for (const value of values) { 109 | let bytes = value 110 | if (typeof bytes === 'string') { 111 | // convert string to Uint8Array 112 | bytes = new TextEncoder().encode(value) 113 | } 114 | if (!(bytes instanceof Uint8Array)) { 115 | throw new Error('parquet expected Uint8Array value') 116 | } 117 | writer.appendUint32(bytes.length) 118 | writer.appendBytes(bytes) 119 | } 120 | } 121 | 122 | /** 123 | * @param {Writer} writer 124 | * @param {DecodedArray} values 125 | * @param {number} fixedLength 126 | */ 127 | function writePlainByteArrayFixed(writer, values, fixedLength) { 128 | for (const value of values) { 129 | if (!(value instanceof Uint8Array)) throw new Error('parquet expected Uint8Array value') 130 | if (value.length !== fixedLength) throw new Error(`parquet expected Uint8Array of length ${fixedLength}`) 131 | writer.appendBytes(value) 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /src/bytewriter.js: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * Generic buffered writer. 4 | * Writes data to an auto-expanding ArrayBuffer. 5 | * 6 | * @import {Writer} from '../src/types.js' 7 | * @returns {Writer} 8 | */ 9 | export function ByteWriter() { 10 | this.buffer = new ArrayBuffer(1024) 11 | this.view = new DataView(this.buffer) 12 | this.offset = 0 // bytes written 13 | this.index = 0 // index in buffer 14 | return this 15 | } 16 | 17 | /** 18 | * @param {number} size 19 | */ 20 | ByteWriter.prototype.ensure = function(size) { 21 | // auto-expanding buffer 22 | if (this.index + size > this.buffer.byteLength) { 23 | const newSize = Math.max(this.buffer.byteLength * 2, this.index + size) 24 | const newBuffer = new ArrayBuffer(newSize) 25 | // TODO: save buffers until later and merge once? 
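// double the capacity (or jump straight to the required size) and copy the existing bytes, so repeated appends stay amortized O(1)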
26 | new Uint8Array(newBuffer).set(new Uint8Array(this.buffer)) 27 | this.buffer = newBuffer 28 | this.view = new DataView(this.buffer) 29 | } 30 | } 31 | 32 | ByteWriter.prototype.finish = function() { 33 | } 34 | 35 | ByteWriter.prototype.getBuffer = function() { 36 | return this.buffer.slice(0, this.index) 37 | } 38 | 39 | /** 40 | * @param {number} value 41 | */ 42 | ByteWriter.prototype.appendUint8 = function(value) { 43 | this.ensure(this.index + 1) 44 | this.view.setUint8(this.index, value) 45 | this.offset++ 46 | this.index++ 47 | } 48 | 49 | /** 50 | * @param {number} value 51 | */ 52 | ByteWriter.prototype.appendUint32 = function(value) { 53 | this.ensure(this.index + 4) 54 | this.view.setUint32(this.index, value, true) 55 | this.offset += 4 56 | this.index += 4 57 | } 58 | 59 | /** 60 | * @param {number} value 61 | */ 62 | ByteWriter.prototype.appendInt32 = function(value) { 63 | this.ensure(this.index + 4) 64 | this.view.setInt32(this.index, value, true) 65 | this.offset += 4 66 | this.index += 4 67 | } 68 | 69 | /** 70 | * @param {bigint} value 71 | */ 72 | ByteWriter.prototype.appendInt64 = function(value) { 73 | this.ensure(this.index + 8) 74 | this.view.setBigInt64(this.index, BigInt(value), true) 75 | this.offset += 8 76 | this.index += 8 77 | } 78 | 79 | /** 80 | * @param {number} value 81 | */ 82 | ByteWriter.prototype.appendFloat32 = function(value) { 83 | this.ensure(this.index + 8) 84 | this.view.setFloat32(this.index, value, true) 85 | this.offset += 4 86 | this.index += 4 87 | } 88 | 89 | /** 90 | * @param {number} value 91 | */ 92 | ByteWriter.prototype.appendFloat64 = function(value) { 93 | this.ensure(this.index + 8) 94 | this.view.setFloat64(this.index, value, true) 95 | this.offset += 8 96 | this.index += 8 97 | } 98 | 99 | /** 100 | * @param {ArrayBuffer} value 101 | */ 102 | ByteWriter.prototype.appendBuffer = function(value) { 103 | this.appendBytes(new Uint8Array(value)) 104 | } 105 | 106 | /** 107 | * @param {Uint8Array} value 108 | */ 109 | ByteWriter.prototype.appendBytes = function(value) { 110 | this.ensure(this.index + value.length) 111 | new Uint8Array(this.buffer, this.index, value.length).set(value) 112 | this.offset += value.length 113 | this.index += value.length 114 | } 115 | 116 | /** 117 | * Convert a 32-bit signed integer to varint (1-5 bytes). 118 | * Writes out groups of 7 bits at a time, setting high bit if more to come. 119 | * 120 | * @param {number} value 121 | */ 122 | ByteWriter.prototype.appendVarInt = function(value) { 123 | while (true) { 124 | if ((value & ~0x7f) === 0) { 125 | // fits in 7 bits 126 | this.appendUint8(value) 127 | return 128 | } else { 129 | // write 7 bits and set high bit 130 | this.appendUint8(value & 0x7f | 0x80) 131 | value >>>= 7 132 | } 133 | } 134 | } 135 | 136 | /** 137 | * Convert a bigint to varint (1-10 bytes for 64-bit range). 138 | * 139 | * @param {bigint} value 140 | */ 141 | ByteWriter.prototype.appendVarBigInt = function(value) { 142 | while (true) { 143 | if ((value & ~0x7fn) === 0n) { 144 | // fits in 7 bits 145 | this.appendUint8(Number(value)) 146 | return 147 | } else { 148 | // write 7 bits and set high bit 149 | this.appendUint8(Number(value & 0x7fn | 0x80n)) 150 | value >>= 7n 151 | } 152 | } 153 | } 154 | 155 | /** 156 | * Convert number to zigzag encoding and write as varint. 
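 * ZigZag interleaves signed values so small magnitudes stay small: 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, 2 -> 4, ...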
157 | * 158 | * @param {number | bigint} value 159 | */ 160 | ByteWriter.prototype.appendZigZag = function(value) { 161 | if (typeof value === 'number') { 162 | this.appendVarInt(value << 1 ^ value >> 31) 163 | } else { 164 | this.appendVarBigInt(value << 1n ^ value >> 63n) 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /src/wkb.js: -------------------------------------------------------------------------------- 1 | import { ByteWriter } from './bytewriter.js' 2 | 3 | /** 4 | * Serialize a GeoJSON geometry into ISO WKB. 5 | * 6 | * @import {Geometry, Position} from 'hyparquet/src/types.js' 7 | * @param {Geometry} geometry 8 | * @returns {Uint8Array} 9 | */ 10 | export function geojsonToWkb(geometry) { 11 | const writer = new ByteWriter() 12 | writeGeometry(writer, geometry) 13 | return new Uint8Array(writer.getBuffer()) 14 | } 15 | 16 | /** 17 | * @param {ByteWriter} writer 18 | * @param {Geometry} geometry 19 | */ 20 | function writeGeometry(writer, geometry) { 21 | const typeCode = geometryTypeCode(geometry.type) 22 | 23 | // infer dimensions 24 | const dim = inferGeometryDimensions(geometry) 25 | let flag = 0 26 | if (dim === 3) flag = 1 27 | else if (dim === 4) flag = 3 28 | else if (dim > 4) throw new Error(`unsupported geometry dimensions: ${dim}`) 29 | 30 | writer.appendUint8(1) // little endian 31 | writer.appendUint32(typeCode + flag * 1000) 32 | 33 | if (geometry.type === 'Point') { 34 | writePosition(writer, geometry.coordinates, dim) 35 | } else if (geometry.type === 'LineString') { 36 | writeLine(writer, geometry.coordinates, dim) 37 | } else if (geometry.type === 'Polygon') { 38 | writer.appendUint32(geometry.coordinates.length) 39 | for (const ring of geometry.coordinates) { 40 | writeLine(writer, ring, dim) 41 | } 42 | } else if (geometry.type === 'MultiPoint') { 43 | writer.appendUint32(geometry.coordinates.length) 44 | for (const coordinates of geometry.coordinates) { 45 | writeGeometry(writer, { type: 'Point', coordinates }) 46 | } 47 | } else if (geometry.type === 'MultiLineString') { 48 | writer.appendUint32(geometry.coordinates.length) 49 | for (const coordinates of geometry.coordinates) { 50 | writeGeometry(writer, { type: 'LineString', coordinates }) 51 | } 52 | } else if (geometry.type === 'MultiPolygon') { 53 | writer.appendUint32(geometry.coordinates.length) 54 | for (const coordinates of geometry.coordinates) { 55 | writeGeometry(writer, { type: 'Polygon', coordinates }) 56 | } 57 | } else if (geometry.type === 'GeometryCollection') { 58 | writer.appendUint32(geometry.geometries.length) 59 | for (const child of geometry.geometries) { 60 | writeGeometry(writer, child) 61 | } 62 | } else { 63 | throw new Error('unsupported geometry type') 64 | } 65 | } 66 | 67 | /** 68 | * @param {ByteWriter} writer 69 | * @param {Position} position 70 | * @param {number} dim 71 | */ 72 | function writePosition(writer, position, dim) { 73 | if (position.length < dim) { 74 | throw new Error('geometry position dimensions mismatch') 75 | } 76 | for (let i = 0; i < dim; i++) { 77 | writer.appendFloat64(position[i]) 78 | } 79 | } 80 | 81 | /** 82 | * @param {ByteWriter} writer 83 | * @param {Position[]} coordinates 84 | * @param {number} dim 85 | */ 86 | function writeLine(writer, coordinates, dim) { 87 | writer.appendUint32(coordinates.length) 88 | for (const position of coordinates) { 89 | writePosition(writer, position, dim) 90 | } 91 | } 92 | 93 | /** 94 | * @param {Geometry['type']} type 95 | * @returns {number} 96 | */ 97 | 
function geometryTypeCode(type) { 98 | if (type === 'Point') return 1 99 | if (type === 'LineString') return 2 100 | if (type === 'Polygon') return 3 101 | if (type === 'MultiPoint') return 4 102 | if (type === 'MultiLineString') return 5 103 | if (type === 'MultiPolygon') return 6 104 | if (type === 'GeometryCollection') return 7 105 | throw new Error(`unknown geometry type: ${type}`) 106 | } 107 | 108 | /** 109 | * Determine the maximum coordinate dimensions for the geometry. 110 | * 111 | * @param {Geometry} geometry 112 | * @returns {number} 113 | */ 114 | function inferGeometryDimensions(geometry) { 115 | if (geometry.type === 'GeometryCollection') { 116 | let maxDim = 0 117 | for (const child of geometry.geometries) { 118 | maxDim = Math.max(maxDim, inferGeometryDimensions(child)) 119 | } 120 | return maxDim || 2 121 | } 122 | return inferCoordinateDimensions(geometry.coordinates) 123 | } 124 | 125 | /** 126 | * @param {any} value 127 | * @returns {number} 128 | */ 129 | function inferCoordinateDimensions(value) { 130 | if (!Array.isArray(value)) return 2 131 | if (!value.length) return 2 132 | if (typeof value[0] === 'number') return value.length 133 | let maxDim = 0 134 | for (const item of value) { 135 | maxDim = Math.max(maxDim, inferCoordinateDimensions(item)) 136 | } 137 | return maxDim || 2 138 | } 139 | -------------------------------------------------------------------------------- /src/geospatial.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Compute geospatial statistics for GEOMETRY and GEOGRAPHY columns. 3 | * 4 | * @import {BoundingBox, DecodedArray, Geometry, GeospatialStatistics} from 'hyparquet/src/types.js' 5 | * @param {DecodedArray} values 6 | * @returns {GeospatialStatistics | undefined} 7 | */ 8 | export function geospatialStatistics(values) { 9 | /** @type {Set} */ 10 | const typeCodes = new Set() 11 | /** @type {BoundingBox | undefined} */ 12 | let bbox 13 | 14 | for (const value of values) { 15 | if (value === null || value === undefined) continue 16 | if (typeof value !== 'object') { 17 | throw new Error('geospatial column expects GeoJSON geometries') 18 | } 19 | bbox = extendBoundsFromGeometry(bbox, value) 20 | typeCodes.add(geometryTypeCodeWithDimension(value)) 21 | } 22 | 23 | if (typeCodes.size || bbox) { 24 | return { 25 | bbox, 26 | // Geospatial type codes of all instances, or an empty list if not known 27 | geospatial_types: typeCodes.size ? Array.from(typeCodes).sort((a, b) => a - b) : [], 28 | } 29 | } 30 | } 31 | 32 | /** 33 | * @param {BoundingBox | undefined} bbox 34 | * @param {Geometry} geometry 35 | * @returns {BoundingBox | undefined} 36 | */ 37 | function extendBoundsFromGeometry(bbox, geometry) { 38 | if (geometry.type === 'GeometryCollection') { 39 | for (const child of geometry.geometries || []) { 40 | bbox = extendBoundsFromGeometry(bbox, child) 41 | } 42 | return bbox 43 | } 44 | return extendBoundsFromCoordinates(bbox, geometry.coordinates) 45 | } 46 | 47 | /** 48 | * @param {BoundingBox | undefined} bbox 49 | * @param {any[]} coordinates 50 | * @returns {BoundingBox | undefined} 51 | */ 52 | function extendBoundsFromCoordinates(bbox, coordinates) { 53 | if (typeof coordinates[0] === 'number') { 54 | return grow(bbox, coordinates) 55 | } 56 | for (const child of coordinates) { 57 | bbox = extendBoundsFromCoordinates(bbox, child) 58 | } 59 | return bbox 60 | } 61 | 62 | /** 63 | * Initialize or expand bbox with a single position [x,y,(z),(m)]. 
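 * Positions with non-finite x or y are ignored; z and m only extend the box when present and finite.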
64 | * @param {BoundingBox | undefined} bbox 65 | * @param {number[]} position 66 | * @returns {BoundingBox | undefined} 67 | */ 68 | function grow(bbox, position) { 69 | const x = position[0] 70 | const y = position[1] 71 | if (!Number.isFinite(x) || !Number.isFinite(y)) return bbox 72 | 73 | if (!bbox) { 74 | bbox = { xmin: x, ymin: y, xmax: x, ymax: y } 75 | } else { 76 | updateAxis(bbox, 'xmin', 'xmax', x) 77 | updateAxis(bbox, 'ymin', 'ymax', y) 78 | } 79 | 80 | if (position.length > 2) updateAxis(bbox, 'zmin', 'zmax', position[2]) 81 | if (position.length > 3) updateAxis(bbox, 'mmin', 'mmax', position[3]) 82 | return bbox 83 | } 84 | 85 | /** 86 | * @param {BoundingBox} bbox 87 | * @param {'xmin' | 'ymin' | 'zmin' | 'mmin'} minKey 88 | * @param {'xmax' | 'ymax' | 'zmax' | 'mmax'} maxKey 89 | * @param {number | undefined} value 90 | */ 91 | function updateAxis(bbox, minKey, maxKey, value) { 92 | if (value === undefined || !Number.isFinite(value)) return 93 | if (bbox[minKey] === undefined || value < bbox[minKey]) bbox[minKey] = value 94 | if (bbox[maxKey] === undefined || value > bbox[maxKey]) bbox[maxKey] = value 95 | } 96 | 97 | /** 98 | * @param {Geometry} geometry 99 | * @returns {number} 100 | */ 101 | function geometryTypeCodeWithDimension(geometry) { 102 | const base = geometryTypeCodes[geometry.type] 103 | if (base === undefined) throw new Error(`unknown geometry type: ${geometry.type}`) 104 | const dim = inferGeometryDimensions(geometry) 105 | if (dim === 2) return base 106 | if (dim === 3) return base + 1000 107 | if (dim === 4) return base + 3000 108 | throw new Error(`unsupported geometry dimensions: ${dim}`) 109 | } 110 | 111 | const geometryTypeCodes = { 112 | Point: 1, 113 | LineString: 2, 114 | Polygon: 3, 115 | MultiPoint: 4, 116 | MultiLineString: 5, 117 | MultiPolygon: 6, 118 | GeometryCollection: 7, 119 | } 120 | 121 | /** 122 | * Determine the maximum coordinate dimensions for the geometry. 
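 * Returns the longest position length found (2, 3, or 4 in practice), defaulting to 2 for empty geometries.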
123 | * @param {Geometry} geometry 124 | * @returns {number} 125 | */ 126 | function inferGeometryDimensions(geometry) { 127 | if (geometry.type === 'GeometryCollection') { 128 | let maxDim = 0 129 | for (const child of geometry.geometries || []) { 130 | maxDim = Math.max(maxDim, inferGeometryDimensions(child)) 131 | } 132 | return maxDim || 2 133 | } 134 | return inferCoordinateDimensions(geometry.coordinates) 135 | } 136 | 137 | /** 138 | * @param {any[]} value 139 | * @returns {number} 140 | */ 141 | function inferCoordinateDimensions(value) { 142 | if (!value.length) return 2 143 | if (typeof value[0] === 'number') return value.length 144 | let maxDim = 0 145 | for (const item of value) { 146 | maxDim = Math.max(maxDim, inferCoordinateDimensions(item)) 147 | } 148 | return maxDim || 2 149 | } 150 | -------------------------------------------------------------------------------- /test/write.schema.test.js: -------------------------------------------------------------------------------- 1 | import { parquetMetadata } from 'hyparquet' 2 | import { describe, expect, it } from 'vitest' 3 | import { parquetWriteBuffer, schemaFromColumnData } from '../src/index.js' 4 | 5 | describe('parquet schema', () => { 6 | it('auto detects types', () => { 7 | const file = parquetWriteBuffer({ columnData: [ 8 | { name: 'strings', data: ['1', '2', '3'] }, 9 | ] }) 10 | const metadata = parquetMetadata(file) 11 | expect(metadata.schema).toEqual([ 12 | { 13 | name: 'root', 14 | num_children: 1, 15 | }, 16 | { 17 | converted_type: 'UTF8', 18 | name: 'strings', 19 | repetition_type: 'REQUIRED', 20 | type: 'BYTE_ARRAY', 21 | }, 22 | ]) 23 | }) 24 | 25 | it('accepts basic type hints', () => { 26 | const file = parquetWriteBuffer({ columnData: [ 27 | { 28 | name: 'timestamps', 29 | data: [new Date(1000000), new Date(2000000), new Date(3000000)], 30 | type: 'TIMESTAMP', 31 | }, 32 | ] }) 33 | const metadata = parquetMetadata(file) 34 | expect(metadata.schema).toEqual([ 35 | { 36 | name: 'root', 37 | num_children: 1, 38 | }, 39 | { 40 | converted_type: 'TIMESTAMP_MILLIS', 41 | name: 'timestamps', 42 | repetition_type: 'OPTIONAL', 43 | type: 'INT64', 44 | }, 45 | ]) 46 | }) 47 | 48 | it('accepts nullable basic type hints', () => { 49 | const file = parquetWriteBuffer({ columnData: [ 50 | { name: 'numbers', data: [1, 2, 3], type: 'FLOAT', nullable: false }, 51 | ] }) 52 | const metadata = parquetMetadata(file) 53 | expect(metadata.schema).toEqual([ 54 | { 55 | name: 'root', 56 | num_children: 1, 57 | }, 58 | { 59 | name: 'numbers', 60 | repetition_type: 'REQUIRED', 61 | type: 'FLOAT', 62 | }, 63 | ]) 64 | }) 65 | 66 | it('allow zero rows to be auto-typed', () => { 67 | const file = parquetWriteBuffer({ columnData: [ 68 | { name: 'numbers', data: [] }, 69 | ] }) 70 | const metadata = parquetMetadata(file) 71 | expect(metadata.schema).toEqual([ 72 | { 73 | name: 'root', 74 | num_children: 1, 75 | }, 76 | { 77 | name: 'numbers', 78 | repetition_type: 'OPTIONAL', 79 | type: 'BYTE_ARRAY', 80 | }, 81 | ]) 82 | }) 83 | 84 | it('allow entirely null columns to be auto-typed', () => { 85 | const file = parquetWriteBuffer({ columnData: [ 86 | { name: 'numbers', data: [null, null, null] }, 87 | ] }) 88 | const metadata = parquetMetadata(file) 89 | expect(metadata.schema).toEqual([ 90 | { 91 | name: 'root', 92 | num_children: 1, 93 | }, 94 | { 95 | name: 'numbers', 96 | repetition_type: 'OPTIONAL', 97 | type: 'BYTE_ARRAY', 98 | }, 99 | ]) 100 | }) 101 | 102 | it('accepts explicit schema', () => { 103 | const file = 
parquetWriteBuffer({ columnData: [ 104 | { name: 'numbers', data: [1, 2, 3] }, 105 | ], schema: [ 106 | { name: 'root', num_children: 1 }, 107 | { name: 'numbers', type: 'FLOAT', repetition_type: 'REQUIRED' }, 108 | ] }) 109 | const metadata = parquetMetadata(file) 110 | expect(metadata.schema).toEqual([ 111 | { 112 | name: 'root', 113 | num_children: 1, 114 | }, 115 | { 116 | name: 'numbers', 117 | repetition_type: 'REQUIRED', 118 | type: 'FLOAT', 119 | }, 120 | ]) 121 | }) 122 | 123 | it('accepts schema override', () => { 124 | const columnData = [ 125 | { name: 'numbers', data: [1, 2, 3] }, 126 | ] 127 | const file = parquetWriteBuffer({ 128 | columnData, 129 | schema: schemaFromColumnData({ 130 | columnData, 131 | schemaOverrides: { 132 | numbers: { 133 | name: 'numbers', 134 | type: 'DOUBLE', 135 | repetition_type: 'OPTIONAL', 136 | field_id: 1, 137 | }, 138 | }, 139 | }), 140 | }) 141 | const metadata = parquetMetadata(file) 142 | expect(metadata.schema).toEqual([ 143 | { 144 | name: 'root', 145 | num_children: 1, 146 | }, 147 | { 148 | field_id: 1, 149 | name: 'numbers', 150 | repetition_type: 'OPTIONAL', 151 | type: 'DOUBLE', 152 | }, 153 | ]) 154 | }) 155 | 156 | it('throws if basic types conflict with schema', () => { 157 | expect(() => { 158 | parquetWriteBuffer({ 159 | columnData: [ 160 | { name: 'numbers', data: [1, 2, 3], type: 'FLOAT' }, 161 | ], 162 | schema: [ 163 | { name: 'root', num_children: 1 }, 164 | { name: 'numbers', type: 'DOUBLE', repetition_type: 'OPTIONAL' }, 165 | ], 166 | }) 167 | }).toThrow('cannot provide both schema and columnData type') 168 | }) 169 | }) 170 | -------------------------------------------------------------------------------- /test/plain.test.js: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from 'vitest' 2 | import { ByteWriter } from '../src/bytewriter.js' 3 | import { writePlain } from '../src/plain.js' 4 | 5 | describe('writePlain', () => { 6 | it('writes BOOLEAN (multiple of 8 bits, plus leftover)', () => { 7 | const writer = new ByteWriter() 8 | const booleans = [true, false, true, true, false, false, false, true, true] 9 | writePlain(writer, booleans, 'BOOLEAN', undefined) 10 | 11 | expect(writer.offset).toBe(2) 12 | expect(writer.view.getUint8(0)).toBe(0b10001101) 13 | expect(writer.view.getUint8(1)).toBe(0b00000001) 14 | }) 15 | 16 | it('writes INT32', () => { 17 | const writer = new ByteWriter() 18 | const ints = [0, 1, 255, 256, 65535, -1, -2147483648, 2147483647] 19 | writePlain(writer, ints, 'INT32', undefined) 20 | 21 | // 4 bytes per int 22 | expect(writer.offset).toBe(4 * ints.length) 23 | 24 | for (let i = 0; i < ints.length; i++) { 25 | const value = writer.view.getInt32(i * 4, true) 26 | expect(value).toBe(ints[i]) 27 | } 28 | }) 29 | 30 | it('writes INT64', () => { 31 | const writer = new ByteWriter() 32 | const bigints = [0n, 1n, 42n, BigInt(2 ** 53 - 1)] 33 | writePlain(writer, bigints, 'INT64', undefined) 34 | 35 | // 8 bytes per int64 36 | expect(writer.offset).toBe(8 * bigints.length) 37 | 38 | for (let i = 0; i < bigints.length; i++) { 39 | const value = writer.view.getBigInt64(i * 8, true) 40 | expect(value).toBe(bigints[i]) 41 | } 42 | }) 43 | 44 | it('writes FLOAT', () => { 45 | const writer = new ByteWriter() 46 | const floats = [0, 300.5, -2.7100000381469727, Infinity, -Infinity, NaN] 47 | writePlain(writer, floats, 'FLOAT', undefined) 48 | 49 | // 4 bytes per float 50 | expect(writer.offset).toBe(4 * floats.length) 51 | 52 | for (let i = 0; 
i < floats.length; i++) { 53 | const val = writer.view.getFloat32(i * 4, true) 54 | if (Number.isNaN(floats[i])) { 55 | expect(Number.isNaN(val)).toBe(true) 56 | } else { 57 | expect(val).toBe(floats[i]) 58 | } 59 | } 60 | }) 61 | 62 | it('writes DOUBLE', () => { 63 | const writer = new ByteWriter() 64 | const doubles = [0, 3.14, -2.71, Infinity, -Infinity, NaN] 65 | writePlain(writer, doubles, 'DOUBLE', undefined) 66 | 67 | // 8 bytes per double 68 | expect(writer.offset).toBe(8 * doubles.length) 69 | 70 | for (let i = 0; i < doubles.length; i++) { 71 | const val = writer.view.getFloat64(i * 8, true) 72 | if (Number.isNaN(doubles[i])) { 73 | expect(Number.isNaN(val)).toBe(true) 74 | } else { 75 | expect(val).toBe(doubles[i]) 76 | } 77 | } 78 | }) 79 | 80 | it('writes BYTE_ARRAY', () => { 81 | const writer = new ByteWriter() 82 | const strings = ['a', 'b', 'c', 'd'] 83 | writePlain(writer, strings, 'BYTE_ARRAY', undefined) 84 | 85 | let offset = 0 86 | for (const s of strings) { 87 | const length = writer.view.getUint32(offset, true) 88 | expect(length).toBe(s.length) 89 | offset += 4 90 | 91 | for (let i = 0; i < s.length; i++) { 92 | expect(writer.view.getUint8(offset)).toBe(s.charCodeAt(i)) 93 | offset += 1 94 | } 95 | } 96 | }) 97 | 98 | it('writes FIXED_LENGTH_BYTE_ARRAY', () => { 99 | const writer = new ByteWriter() 100 | const encoder = new TextEncoder() 101 | const strings = ['abcd', 'efgh', 'ijkl'] 102 | .map(s => encoder.encode(s)) 103 | writePlain(writer, strings, 'FIXED_LEN_BYTE_ARRAY', 4) 104 | 105 | let offset = 0 106 | for (const s of strings) { 107 | for (let i = 0; i < s.length; i++) { 108 | expect(writer.view.getUint8(offset)).toBe(s[i]) 109 | offset += 1 110 | } 111 | } 112 | }) 113 | 114 | it('throws error on unsupported type', () => { 115 | const writer = new ByteWriter() 116 | expect(() => writePlain(writer, [1, 2, 3], 'INT96', undefined)) 117 | .toThrow(/parquet unsupported type/i) 118 | }) 119 | 120 | it('throws error on type mismatch', () => { 121 | const writer = new ByteWriter() 122 | expect(() => writePlain(writer, [1, 2, 3], 'BOOLEAN', undefined)) 123 | .toThrow('parquet expected boolean value') 124 | expect(() => writePlain(writer, [1, 2, 3.5], 'INT32', undefined)) 125 | .toThrow('parquet expected integer value') 126 | expect(() => writePlain(writer, [1n, 2n, 3], 'INT64', undefined)) 127 | .toThrow('parquet expected bigint value') 128 | expect(() => writePlain(writer, [1, 2, 3n], 'FLOAT', undefined)) 129 | .toThrow('parquet expected number value') 130 | expect(() => writePlain(writer, [1, 2, 3n], 'DOUBLE', undefined)) 131 | .toThrow('parquet expected number value') 132 | expect(() => writePlain(writer, [1, 2, 3], 'BYTE_ARRAY', undefined)) 133 | .toThrow('parquet expected Uint8Array value') 134 | expect(() => writePlain(writer, [1, 2, 3], 'FIXED_LEN_BYTE_ARRAY', undefined)) 135 | .toThrow('parquet FIXED_LEN_BYTE_ARRAY expected type_length') 136 | expect(() => writePlain(writer, [1, 2, 3], 'FIXED_LEN_BYTE_ARRAY', 16)) 137 | .toThrow('parquet expected Uint8Array value') 138 | }) 139 | }) 140 | -------------------------------------------------------------------------------- /test/example.js: -------------------------------------------------------------------------------- 1 | /** @type {ColumnSource[]} */ 2 | export const exampleData = [ 3 | { name: 'bool', data: [true, false, true, false] }, 4 | { name: 'int', data: [0, 127, 0x7fff, 0x7fffffff] }, 5 | { name: 'bigint', data: [0n, 127n, 0x7fffn, 0x7fffffffffffffffn] }, 6 | { name: 'float', data: [0, 0.0001, 
123.456, 1e100], type: 'FLOAT', nullable: false }, 7 | { name: 'double', data: [0, 0.0001, 123.456, 1e100] }, 8 | { name: 'string', data: ['a', 'b', 'c', 'd'] }, 9 | { name: 'nullable', data: [true, false, null, null] }, 10 | ] 11 | 12 | /** 13 | * @import {FileMetaData} from 'hyparquet' 14 | * @import {ColumnSource} from '../src/types.js' 15 | * @type {FileMetaData} 16 | */ 17 | export const exampleMetadata = { 18 | version: 2, 19 | created_by: 'hyparquet', 20 | schema: [ 21 | { name: 'root', num_children: 7 }, 22 | { name: 'bool', type: 'BOOLEAN', repetition_type: 'REQUIRED' }, 23 | { name: 'int', type: 'INT32', repetition_type: 'REQUIRED' }, 24 | { name: 'bigint', type: 'INT64', repetition_type: 'REQUIRED' }, 25 | { name: 'float', type: 'FLOAT', repetition_type: 'REQUIRED' }, 26 | { name: 'double', type: 'DOUBLE', repetition_type: 'REQUIRED' }, 27 | { name: 'string', type: 'BYTE_ARRAY', repetition_type: 'REQUIRED', converted_type: 'UTF8' }, 28 | { name: 'nullable', type: 'BOOLEAN', repetition_type: 'OPTIONAL' }, 29 | ], 30 | num_rows: 4n, 31 | row_groups: [{ 32 | columns: [ 33 | { 34 | file_offset: 4n, 35 | meta_data: { 36 | type: 'BOOLEAN', 37 | encodings: ['PLAIN'], 38 | path_in_schema: ['bool'], 39 | codec: 'SNAPPY', 40 | num_values: 4n, 41 | total_uncompressed_size: 24n, 42 | total_compressed_size: 24n, 43 | data_page_offset: 4n, 44 | statistics: { 45 | null_count: 0n, 46 | min_value: false, 47 | max_value: true, 48 | }, 49 | }, 50 | }, 51 | { 52 | file_offset: 28n, 53 | meta_data: { 54 | type: 'INT32', 55 | encodings: ['PLAIN'], 56 | path_in_schema: ['int'], 57 | codec: 'SNAPPY', 58 | num_values: 4n, 59 | total_uncompressed_size: 39n, 60 | total_compressed_size: 39n, 61 | data_page_offset: 28n, 62 | statistics: { 63 | null_count: 0n, 64 | min_value: 0, 65 | max_value: 0x7fffffff, 66 | }, 67 | }, 68 | }, 69 | { 70 | file_offset: 67n, 71 | meta_data: { 72 | type: 'INT64', 73 | encodings: ['PLAIN'], 74 | path_in_schema: ['bigint'], 75 | codec: 'SNAPPY', 76 | num_values: 4n, 77 | total_uncompressed_size: 43n, 78 | total_compressed_size: 43n, 79 | data_page_offset: 67n, 80 | statistics: { 81 | null_count: 0n, 82 | min_value: 0n, 83 | max_value: 0x7fffffffffffffffn, 84 | }, 85 | }, 86 | }, 87 | { 88 | file_offset: 110n, 89 | meta_data: { 90 | type: 'FLOAT', 91 | encodings: ['PLAIN'], 92 | path_in_schema: ['float'], 93 | codec: 'SNAPPY', 94 | num_values: 4n, 95 | total_uncompressed_size: 39n, 96 | total_compressed_size: 39n, 97 | data_page_offset: 110n, 98 | statistics: { 99 | null_count: 0n, 100 | min_value: 0, 101 | max_value: Infinity, 102 | }, 103 | }, 104 | }, 105 | { 106 | file_offset: 149n, 107 | meta_data: { 108 | type: 'DOUBLE', 109 | encodings: ['PLAIN'], 110 | path_in_schema: ['double'], 111 | codec: 'SNAPPY', 112 | num_values: 4n, 113 | total_uncompressed_size: 51n, 114 | total_compressed_size: 51n, 115 | data_page_offset: 149n, 116 | statistics: { 117 | null_count: 0n, 118 | min_value: 0, 119 | max_value: 1e100, 120 | }, 121 | }, 122 | }, 123 | { 124 | file_offset: 200n, 125 | meta_data: { 126 | type: 'BYTE_ARRAY', 127 | encodings: ['PLAIN'], 128 | path_in_schema: ['string'], 129 | codec: 'SNAPPY', 130 | num_values: 4n, 131 | total_uncompressed_size: 42n, 132 | total_compressed_size: 42n, 133 | data_page_offset: 200n, 134 | statistics: { 135 | null_count: 0n, 136 | min_value: 'a', 137 | max_value: 'd', 138 | }, 139 | }, 140 | }, 141 | { 142 | file_offset: 242n, 143 | meta_data: { 144 | type: 'BOOLEAN', 145 | encodings: ['PLAIN'], 146 | path_in_schema: ['nullable'], 147 | 
codec: 'SNAPPY', 148 | num_values: 4n, 149 | total_uncompressed_size: 26n, 150 | total_compressed_size: 26n, 151 | data_page_offset: 242n, 152 | statistics: { 153 | null_count: 2n, 154 | min_value: false, 155 | max_value: true, 156 | }, 157 | }, 158 | }, 159 | ], 160 | total_byte_size: 264n, 161 | num_rows: 4n, 162 | }], 163 | metadata_length: 445, 164 | } 165 | -------------------------------------------------------------------------------- /test/write.multipage.test.js: -------------------------------------------------------------------------------- 1 | import { parquetReadObjects } from 'hyparquet' 2 | import { describe, expect, it } from 'vitest' 3 | import { parquetWriteBuffer } from '../src/index.js' 4 | 5 | /** @import {ColumnSource} from '../src/types.js' */ 6 | 7 | describe('parquetWrite multi-page', () => { 8 | it('writes with small pageSize and data is still readable', async () => { 9 | // Generate enough data to span multiple pages with a small pageSize 10 | const numRows = 1000 11 | /** @type {ColumnSource[]} */ 12 | const columnData = [ 13 | { name: 'id', data: Array.from({ length: numRows }, (_, i) => i), type: 'INT32' }, 14 | { name: 'value', data: Array.from({ length: numRows }, (_, i) => i * 2), type: 'INT32' }, 15 | ] 16 | 17 | // Use a very small page size to force multiple pages 18 | // Each INT32 is 4 bytes, so 100 bytes should hold about 25 values per page 19 | const buffer = parquetWriteBuffer({ 20 | columnData, 21 | pageSize: 100, 22 | }) 23 | 24 | // Read back the data 25 | const rows = await parquetReadObjects({ file: buffer }) 26 | 27 | expect(rows.length).toBe(numRows) 28 | expect(rows[0]).toEqual({ id: 0, value: 0 }) 29 | expect(rows[999]).toEqual({ id: 999, value: 1998 }) 30 | }) 31 | 32 | it('handles various data types with pageSize', async () => { 33 | const numRows = 500 34 | /** @type {ColumnSource[]} */ 35 | const columnData = [ 36 | { name: 'int32', data: Array.from({ length: numRows }, (_, i) => i), type: 'INT32' }, 37 | { name: 'int64', data: Array.from({ length: numRows }, (_, i) => BigInt(i)), type: 'INT64' }, 38 | { name: 'float', data: Array.from({ length: numRows }, (_, i) => i * 0.5), type: 'FLOAT' }, 39 | { name: 'double', data: Array.from({ length: numRows }, (_, i) => i * 0.5), type: 'DOUBLE' }, 40 | { name: 'bool', data: Array.from({ length: numRows }, (_, i) => i % 2 === 0), type: 'BOOLEAN' }, 41 | ] 42 | 43 | const buffer = parquetWriteBuffer({ 44 | columnData, 45 | pageSize: 200, 46 | statistics: true, 47 | }) 48 | 49 | const rows = await parquetReadObjects({ file: buffer }) 50 | 51 | expect(rows.length).toBe(numRows) 52 | expect(rows[0].int32).toBe(0) 53 | expect(rows[0].bool).toBe(true) 54 | expect(rows[1].bool).toBe(false) 55 | }) 56 | 57 | it('handles strings with pageSize', async () => { 58 | const numRows = 100 59 | const strings = Array.from({ length: numRows }, (_, i) => `string_value_${i}`) 60 | /** @type {ColumnSource[]} */ 61 | const columnData = [ 62 | { name: 'id', data: Array.from({ length: numRows }, (_, i) => i), type: 'INT32' }, 63 | { name: 'str', data: strings, type: 'STRING' }, 64 | ] 65 | 66 | const buffer = parquetWriteBuffer({ 67 | columnData, 68 | pageSize: 200, 69 | }) 70 | 71 | const rows = await parquetReadObjects({ file: buffer }) 72 | 73 | expect(rows.length).toBe(numRows) 74 | expect(rows[0].str).toBe('string_value_0') 75 | expect(rows[99].str).toBe('string_value_99') 76 | }) 77 | 78 | it('handles nulls with pageSize', async () => { 79 | const numRows = 200 80 | /** @type {ColumnSource[]} */ 81 | const 
columnData = [ 82 | { name: 'id', data: Array.from({ length: numRows }, (_, i) => i), type: 'INT32' }, 83 | { name: 'nullable', data: Array.from({ length: numRows }, (_, i) => i % 3 === 0 ? null : i), type: 'INT32', nullable: true }, 84 | ] 85 | 86 | const buffer = parquetWriteBuffer({ 87 | columnData, 88 | pageSize: 100, 89 | }) 90 | 91 | const rows = await parquetReadObjects({ file: buffer }) 92 | 93 | expect(rows.length).toBe(numRows) 94 | expect(rows[0].nullable).toBe(null) 95 | expect(rows[1].nullable).toBe(1) 96 | expect(rows[2].nullable).toBe(2) 97 | expect(rows[3].nullable).toBe(null) 98 | }) 99 | 100 | it('works without pageSize (backwards compatibility)', async () => { 101 | const numRows = 100 102 | /** @type {ColumnSource[]} */ 103 | const columnData = [ 104 | { name: 'id', data: Array.from({ length: numRows }, (_, i) => i), type: 'INT32' }, 105 | ] 106 | 107 | // No pageSize specified 108 | const buffer = parquetWriteBuffer({ columnData }) 109 | 110 | const rows = await parquetReadObjects({ file: buffer }) 111 | expect(rows.length).toBe(numRows) 112 | }) 113 | 114 | it('handles single value per page edge case', async () => { 115 | const numRows = 10 116 | /** @type {ColumnSource[]} */ 117 | const columnData = [ 118 | { name: 'id', data: Array.from({ length: numRows }, (_, i) => i), type: 'INT32' }, 119 | ] 120 | 121 | // Very tiny pageSize - should still work 122 | const buffer = parquetWriteBuffer({ 123 | columnData, 124 | pageSize: 4, // exactly one INT32 125 | }) 126 | 127 | const rows = await parquetReadObjects({ file: buffer }) 128 | expect(rows.length).toBe(numRows) 129 | expect(rows.map(r => r.id)).toEqual([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) 130 | }) 131 | 132 | it('handles dictionary encoding with pageSize', async () => { 133 | // Use repeated values to trigger dictionary encoding 134 | const numRows = 500 135 | const values = ['apple', 'banana', 'cherry'] 136 | /** @type {ColumnSource[]} */ 137 | const columnData = [ 138 | { name: 'id', data: Array.from({ length: numRows }, (_, i) => i), type: 'INT32' }, 139 | { name: 'fruit', data: Array.from({ length: numRows }, (_, i) => values[i % 3]), type: 'STRING' }, 140 | ] 141 | 142 | const buffer = parquetWriteBuffer({ 143 | columnData, 144 | pageSize: 100, 145 | }) 146 | 147 | const rows = await parquetReadObjects({ file: buffer }) 148 | 149 | expect(rows.length).toBe(numRows) 150 | expect(rows[0].fruit).toBe('apple') 151 | expect(rows[1].fruit).toBe('banana') 152 | expect(rows[2].fruit).toBe('cherry') 153 | expect(rows[3].fruit).toBe('apple') 154 | }) 155 | }) 156 | -------------------------------------------------------------------------------- /src/parquet-writer.js: -------------------------------------------------------------------------------- 1 | import { getSchemaPath } from 'hyparquet/src/schema.js' 2 | import { writeColumn } from './column.js' 3 | import { writeIndexes } from './indexes.js' 4 | import { writeMetadata } from './metadata.js' 5 | import { snappyCompress } from './snappy.js' 6 | 7 | /** 8 | * ParquetWriter class allows incremental writing of parquet files. 
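 *
 * Typical flow: construct with a Writer and schema, call write() once per batch of column data (each call appends one or more row groups), then call finish() to emit the page indexes, footer metadata, and closing PAR1 magic.
 *
 * @example
 * // Minimal usage sketch (illustrative only; the column name and values are made up):
 * const writer = new ByteWriter()
 * const schema = schemaFromColumnData({ columnData: [{ name: 'id', data: [1, 2, 3] }] })
 * const pq = new ParquetWriter({ writer, schema })
 * pq.write({ columnData: [{ name: 'id', data: [1, 2, 3] }] })
 * pq.finish()
 * const file = writer.getBuffer()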
9 | * 10 | * @import {ColumnChunk, CompressionCodec, FileMetaData, KeyValue, RowGroup, SchemaElement} from 'hyparquet' 11 | * @import {ColumnEncoder, ColumnSource, Compressors, PageIndexes, Writer} from '../src/types.js' 12 | * @param {object} options 13 | * @param {Writer} options.writer 14 | * @param {SchemaElement[]} options.schema 15 | * @param {CompressionCodec} [options.codec] 16 | * @param {Compressors} [options.compressors] 17 | * @param {boolean} [options.statistics] 18 | * @param {KeyValue[]} [options.kvMetadata] 19 | */ 20 | export function ParquetWriter({ writer, schema, codec = 'SNAPPY', compressors, statistics = true, kvMetadata }) { 21 | this.writer = writer 22 | this.schema = schema 23 | this.codec = codec 24 | // Include built-in snappy as fallback 25 | this.compressors = { SNAPPY: snappyCompress, ...compressors } 26 | this.statistics = statistics 27 | this.kvMetadata = kvMetadata 28 | 29 | /** @type {RowGroup[]} */ 30 | this.row_groups = [] 31 | this.num_rows = 0n 32 | 33 | /** @type {PageIndexes[]} */ 34 | this.pendingIndexes = [] 35 | 36 | // write header PAR1 37 | this.writer.appendUint32(0x31524150) 38 | } 39 | 40 | /** 41 | * Write data to the file. 42 | * Will split data into row groups of the specified size. 43 | * 44 | * @param {object} options 45 | * @param {ColumnSource[]} options.columnData 46 | * @param {number | number[]} [options.rowGroupSize] 47 | * @param {number} [options.pageSize] 48 | */ 49 | ParquetWriter.prototype.write = function({ columnData, rowGroupSize = [100, 1000, 10000], pageSize = 1048576 }) { 50 | const columnDataRows = columnData[0]?.data?.length || 0 51 | for (const { groupStartIndex, groupSize } of groupIterator({ columnDataRows, rowGroupSize })) { 52 | const groupStartOffset = this.writer.offset 53 | /** @type {ColumnChunk[]} */ 54 | const columns = [] 55 | 56 | // write columns 57 | for (let j = 0; j < columnData.length; j++) { 58 | const { name, data, encoding, columnIndex = false, offsetIndex = false } = columnData[j] 59 | const groupData = data.slice(groupStartIndex, groupStartIndex + groupSize) 60 | 61 | const schemaTree = getSchemaPath(this.schema, [name]) 62 | // Dive into the leaf element 63 | while (true) { 64 | const child = schemaTree[schemaTree.length - 1] 65 | if (!child.element.num_children) { 66 | break 67 | } else if (child.element.num_children === 1) { 68 | schemaTree.push(child.children[0]) 69 | } else { 70 | throw new Error(`parquet column ${name} struct unsupported`) 71 | } 72 | } 73 | const schemaPath = schemaTree.map(node => node.element) 74 | const element = schemaPath.at(-1) 75 | if (!element) throw new Error(`parquet column ${name} missing schema element`) 76 | /** @type {ColumnEncoder} */ 77 | const column = { 78 | columnName: name, 79 | element, 80 | schemaPath, 81 | codec: this.codec, 82 | compressors: this.compressors, 83 | stats: this.statistics, 84 | pageSize, 85 | columnIndex, 86 | offsetIndex, 87 | encoding, 88 | } 89 | 90 | const result = writeColumn({ 91 | writer: this.writer, 92 | column, 93 | values: groupData, 94 | }) 95 | 96 | columns.push(result.chunk) 97 | this.pendingIndexes.push(result) 98 | } 99 | 100 | this.num_rows += BigInt(groupSize) 101 | this.row_groups.push({ 102 | columns, 103 | total_byte_size: BigInt(this.writer.offset - groupStartOffset), 104 | num_rows: BigInt(groupSize), 105 | }) 106 | } 107 | } 108 | 109 | /** 110 | * Finish writing the file. 
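 * Writes any pending column/offset indexes, then the thrift-encoded footer metadata,
 * and finally the trailing PAR1 magic bytes. Intended to be called once, after all write() calls.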
111 | */ 112 | ParquetWriter.prototype.finish = function() { 113 | // Write all indexes at end of file 114 | writeIndexes(this.writer, this.pendingIndexes) 115 | 116 | // write metadata 117 | /** @type {FileMetaData} */ 118 | const metadata = { 119 | version: 2, 120 | created_by: 'hyparquet', 121 | schema: this.schema, 122 | num_rows: this.num_rows, 123 | row_groups: this.row_groups, 124 | metadata_length: 0, 125 | key_value_metadata: this.kvMetadata, 126 | } 127 | // @ts-ignore don't want to actually serialize metadata_length 128 | delete metadata.metadata_length 129 | writeMetadata(this.writer, metadata) 130 | 131 | // write footer PAR1 132 | this.writer.appendUint32(0x31524150) 133 | this.writer.finish() 134 | } 135 | 136 | /** 137 | * Create an iterator for row groups based on the specified row group size. 138 | * If rowGroupSize is an array, it will return groups based on the sizes in the array. 139 | * When the array runs out, it will continue with the last size. 140 | * 141 | * @param {object} options 142 | * @param {number} options.columnDataRows - Total number of rows in the column data 143 | * @param {number | number[]} options.rowGroupSize - Size of each row group or an array of sizes 144 | * @returns {Array<{groupStartIndex: number, groupSize: number}>} 145 | */ 146 | function groupIterator({ columnDataRows, rowGroupSize }) { 147 | if (Array.isArray(rowGroupSize) && !rowGroupSize.length) { 148 | throw new Error('rowGroupSize array cannot be empty') 149 | } 150 | const groups = [] 151 | let groupIndex = 0 152 | let groupStartIndex = 0 153 | while (groupStartIndex < columnDataRows) { 154 | const size = Array.isArray(rowGroupSize) 155 | ? rowGroupSize[Math.min(groupIndex, rowGroupSize.length - 1)] 156 | : rowGroupSize 157 | const groupSize = Math.min(size, columnDataRows - groupStartIndex) 158 | groups.push({ groupStartIndex, groupSize }) 159 | groupStartIndex += size 160 | groupIndex++ 161 | } 162 | return groups 163 | } 164 | -------------------------------------------------------------------------------- /src/thrift.js: -------------------------------------------------------------------------------- 1 | import { CompactType } from 'hyparquet/src/thrift.js' 2 | 3 | /** 4 | * Serialize a JS object in TCompactProtocol format. 5 | * 6 | * Expects keys named like "field_1", "field_2", etc. in ascending order. 7 | * 8 | * @import {ThriftType} from 'hyparquet/src/types.js' 9 | * @import {Writer} from '../src/types.js' 10 | * @param {Writer} writer 11 | * @param {Record} data 12 | */ 13 | export function serializeTCompactProtocol(writer, data) { 14 | let lastFid = 0 15 | // write each field 16 | for (const [key, value] of Object.entries(data)) { 17 | if (value === undefined) continue 18 | 19 | // we expect key = "field_N" so we can extract N as the field ID 20 | const fid = parseInt(key.replace(/^field_/, ''), 10) 21 | if (Number.isNaN(fid)) { 22 | throw new Error(`thrift invalid field name: ${key}. 
Expected "field_###".`) 23 | } 24 | 25 | // write the field-begin header 26 | const type = getCompactTypeForValue(value) 27 | const delta = fid - lastFid 28 | if (delta <= 0) { 29 | throw new Error(`thrift non-monotonic field ID: fid=${fid}, lastFid=${lastFid}`) 30 | } 31 | // high nibble = delta, low nibble = type < 15 or zigzag 32 | if (delta <= 15) { 33 | writer.appendUint8(delta << 4 | type) 34 | } else { 35 | writer.appendUint8(type) 36 | writer.appendVarInt(fid << 1 ^ fid >> 15) // zigzag 37 | } 38 | 39 | // Write the field content itself 40 | writeElement(writer, type, value) 41 | 42 | lastFid = fid 43 | } 44 | 45 | // Finally write STOP 46 | writer.appendUint8(CompactType.STOP) 47 | } 48 | 49 | /** 50 | * Deduce a TCompactProtocol type from the JS value 51 | * 52 | * @param {any} value 53 | * @returns {number} CompactType 54 | */ 55 | function getCompactTypeForValue(value) { 56 | if (value === true) return CompactType.TRUE 57 | if (value === false) return CompactType.FALSE 58 | if (Number.isInteger(value)) return CompactType.I32 59 | if (typeof value === 'number') return CompactType.DOUBLE 60 | if (typeof value === 'bigint') return CompactType.I64 61 | if (typeof value === 'string') return CompactType.BINARY 62 | if (value instanceof Uint8Array) return CompactType.BINARY 63 | if (Array.isArray(value)) return CompactType.LIST 64 | if (value && typeof value === 'object') return CompactType.STRUCT 65 | throw new Error(`Cannot determine thrift compact type for: ${value}`) 66 | } 67 | 68 | /** 69 | * Write a single value of a given compact type. 70 | * 71 | * @param {Writer} writer 72 | * @param {number} type 73 | * @param {ThriftType} value 74 | */ 75 | function writeElement(writer, type, value) { 76 | // true/false is stored in the type 77 | if (type === CompactType.TRUE) return 78 | if (type === CompactType.FALSE) return 79 | if (type === CompactType.BYTE && typeof value === 'number') { 80 | writer.appendUint8(value) 81 | } else if (type === CompactType.I32 && typeof value === 'number') { 82 | const zigzag = value << 1 ^ value >> 31 83 | writer.appendVarInt(zigzag) 84 | } else if (type === CompactType.I64 && typeof value === 'bigint') { 85 | // For 64-bit (bigint) we do (value << 1n) ^ (value >> 63n) in zigzag 86 | const zigzag = value << 1n ^ value >> 63n 87 | writer.appendVarBigInt(zigzag) 88 | } else if (type === CompactType.DOUBLE && typeof value === 'number') { 89 | writer.appendFloat64(value) 90 | } else if (type === CompactType.BINARY && typeof value === 'string') { 91 | // store length as a varint, then raw bytes 92 | const bytes = new TextEncoder().encode(value) 93 | writer.appendVarInt(bytes.length) 94 | writer.appendBytes(bytes) 95 | } else if (type === CompactType.BINARY && value instanceof Uint8Array) { 96 | // store length as a varint, then raw bytes 97 | writer.appendVarInt(value.byteLength) 98 | writer.appendBytes(value) 99 | } else if (type === CompactType.LIST && Array.isArray(value)) { 100 | // Must store (size << 4) | elementType 101 | // We'll guess the element type from the first element 102 | const size = value.length 103 | if (size === 0) { 104 | // (0 << 4) | type for an empty list – pick BYTE arbitrarily 105 | writer.appendUint8(0 << 4 | CompactType.BYTE) 106 | return 107 | } 108 | 109 | // TODO: Check for heterogeneous lists? 110 | const elemType = getCompactTypeForValue(value[0]) 111 | 112 | const sizeNibble = size > 14 ? 
15 : size 113 | writer.appendUint8(sizeNibble << 4 | elemType) 114 | if (size > 14) { 115 | writer.appendVarInt(size) 116 | } 117 | 118 | // Special trick for booleans in a list 119 | if (elemType === CompactType.TRUE || elemType === CompactType.FALSE) { 120 | // Write each boolean as a single 0 or 1 byte 121 | for (const v of value) { 122 | writer.appendUint8(v ? 1 : 0) 123 | } 124 | } else { 125 | // Otherwise write them out normally 126 | for (const v of value) { 127 | writeElement(writer, elemType, v) 128 | } 129 | } 130 | } else if (type === CompactType.STRUCT && typeof value === 'object') { 131 | // Recursively write sub-fields as "field_N: val", end with STOP 132 | let lastFid = 0 133 | for (const [k, v] of Object.entries(value)) { 134 | if (v === undefined) continue 135 | 136 | const fid = parseInt(k.replace(/^field_/, ''), 10) 137 | if (Number.isNaN(fid)) { 138 | throw new Error(`Invalid sub-field name: ${k}. Expected "field_###"`) 139 | } 140 | const t = getCompactTypeForValue(v) 141 | const delta = fid - lastFid 142 | if (delta <= 0) { 143 | throw new Error(`Non-monotonic fid in struct: fid=${fid}, lastFid=${lastFid}`) 144 | } 145 | if (delta <= 15) { 146 | writer.appendUint8(delta << 4 | t) 147 | } else { 148 | writer.appendUint8(t) 149 | writer.appendVarInt(fid << 1 ^ fid >> 15) 150 | } 151 | writeElement(writer, t, v) 152 | lastFid = fid 153 | } 154 | // Write STOP 155 | writer.appendUint8(CompactType.STOP) 156 | } else { 157 | throw new Error(`unhandled type in writeElement: ${type} for value ${value}`) 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /test/metadata.test.js: -------------------------------------------------------------------------------- 1 | import { parquetMetadata } from 'hyparquet' 2 | import { describe, expect, it } from 'vitest' 3 | import { ByteWriter } from '../src/bytewriter.js' 4 | import { logicalType, writeMetadata } from '../src/metadata.js' 5 | import { exampleMetadata } from './example.js' 6 | 7 | /** 8 | * @import {FileMetaData, LogicalType} from 'hyparquet' 9 | * @import {ThriftObject} from '../src/types.js' 10 | */ 11 | 12 | describe('writeMetadata', () => { 13 | it('writes metadata and parses in hyparquet', () => { 14 | const writer = new ByteWriter() 15 | 16 | // write header PAR1 17 | writer.appendUint32(0x31524150) 18 | 19 | // write metadata 20 | /** @type {FileMetaData} */ 21 | const withKvMetadata = { 22 | ...exampleMetadata, 23 | key_value_metadata: [ 24 | { key: 'key1', value: 'value1' }, 25 | { key: 'key2', value: 'value2' }, 26 | ], 27 | metadata_length: 477, 28 | } 29 | writeMetadata(writer, withKvMetadata) 30 | 31 | // write footer PAR1 32 | writer.appendUint32(0x31524150) 33 | 34 | const file = writer.getBuffer() 35 | const outputMetadata = parquetMetadata(file) 36 | 37 | expect(outputMetadata).toEqual(withKvMetadata) 38 | }) 39 | 40 | it('writes extended column metadata fields', () => { 41 | const writer = new ByteWriter() 42 | writer.appendUint32(0x31524150) 43 | 44 | /** @type {FileMetaData} */ 45 | const extendedMetadata = { 46 | version: 2, 47 | created_by: 'hyparquet', 48 | schema: [ 49 | { name: 'root', num_children: 1 }, 50 | { 51 | name: 'geo', 52 | type: 'BYTE_ARRAY', 53 | repetition_type: 'REQUIRED', 54 | logical_type: { type: 'GEOGRAPHY', crs: 'EPSG:4326', algorithm: 'KARNEY' }, 55 | }, 56 | ], 57 | num_rows: 1n, 58 | row_groups: [{ 59 | columns: [{ 60 | file_path: 'part-0.parquet', 61 | file_offset: 4n, 62 | meta_data: { 63 | type: 'BYTE_ARRAY', 64 | encodings: 
['PLAIN', 'RLE'], 65 | path_in_schema: [], 66 | codec: 'SNAPPY', 67 | num_values: 1n, 68 | total_uncompressed_size: 10n, 69 | total_compressed_size: 8n, 70 | key_value_metadata: [{ key: 'chunk', value: 'value' }], 71 | data_page_offset: 4n, 72 | index_page_offset: 12n, 73 | dictionary_page_offset: 20n, 74 | statistics: { 75 | null_count: 0n, 76 | min_value: 'a', 77 | max_value: 'z', 78 | }, 79 | encoding_stats: [{ page_type: 'DATA_PAGE', encoding: 'PLAIN', count: 1 }], 80 | bloom_filter_offset: 30n, 81 | bloom_filter_length: 4, 82 | size_statistics: { 83 | unencoded_byte_array_data_bytes: 5n, 84 | repetition_level_histogram: [1n, 0n], 85 | definition_level_histogram: [2n, 0n], 86 | }, 87 | geospatial_statistics: { 88 | bbox: { 89 | xmin: 0, 90 | xmax: 10, 91 | ymin: -5, 92 | ymax: 5, 93 | zmin: 1, 94 | zmax: 2, 95 | mmin: 3, 96 | mmax: 4, 97 | }, 98 | geospatial_types: [0, 1], 99 | }, 100 | }, 101 | offset_index_offset: 40n, 102 | offset_index_length: 16, 103 | column_index_offset: 60n, 104 | column_index_length: 24, 105 | encrypted_column_metadata: new Uint8Array([7, 8, 9]), 106 | }], 107 | total_byte_size: 64n, 108 | num_rows: 1n, 109 | sorting_columns: [{ 110 | column_idx: 0, 111 | descending: true, 112 | nulls_first: false, 113 | }], 114 | file_offset: 4n, 115 | total_compressed_size: 8n, 116 | }], 117 | key_value_metadata: [{ key: 'meta', value: 'data' }], 118 | metadata_length: 223, 119 | } 120 | 121 | writeMetadata(writer, extendedMetadata) 122 | writer.appendUint32(0x31524150) 123 | 124 | const outputMetadata = parquetMetadata(writer.getBuffer()) 125 | expect(outputMetadata).toEqual(extendedMetadata) 126 | }) 127 | }) 128 | 129 | describe('logicalType', () => { 130 | it('returns undefined when given undefined', () => { 131 | expect(logicalType(undefined)).toBeUndefined() 132 | }) 133 | 134 | it('returns correct object for known types', () => { 135 | /** @type {{ input: LogicalType, expected: ThriftObject }[]} */ 136 | const testCases = [ 137 | { input: { type: 'STRING' }, expected: { field_1: {} } }, 138 | { input: { type: 'MAP' }, expected: { field_2: {} } }, 139 | { input: { type: 'LIST' }, expected: { field_3: {} } }, 140 | { input: { type: 'ENUM' }, expected: { field_4: {} } }, 141 | { 142 | input: { type: 'DECIMAL', scale: 2, precision: 5 }, 143 | expected: { field_5: { field_1: 2, field_2: 5 } }, 144 | }, 145 | { input: { type: 'DATE' }, expected: { field_6: {} } }, 146 | { 147 | input: { type: 'TIME', isAdjustedToUTC: true, unit: 'MILLIS' }, 148 | expected: { field_7: { field_1: true, field_2: { field_1: {} } } }, 149 | }, 150 | { 151 | input: { type: 'TIMESTAMP', isAdjustedToUTC: false, unit: 'MICROS' }, 152 | expected: { field_8: { field_1: false, field_2: { field_2: {} } } }, 153 | }, 154 | { 155 | input: { type: 'TIMESTAMP', isAdjustedToUTC: false, unit: 'NANOS' }, 156 | expected: { field_8: { field_1: false, field_2: { field_3: {} } } }, 157 | }, 158 | { 159 | input: { type: 'INTEGER', bitWidth: 32, isSigned: true }, 160 | expected: { field_10: { field_1: 32, field_2: true } }, 161 | }, 162 | { input: { type: 'NULL' }, expected: { field_11: {} } }, 163 | { input: { type: 'JSON' }, expected: { field_12: {} } }, 164 | { input: { type: 'BSON' }, expected: { field_13: {} } }, 165 | { input: { type: 'UUID' }, expected: { field_14: {} } }, 166 | { input: { type: 'FLOAT16' }, expected: { field_15: {} } }, 167 | { input: { type: 'VARIANT' }, expected: { field_16: {} } }, 168 | { input: { type: 'GEOMETRY' }, expected: { field_17: {} } }, 169 | { input: { type: 'GEOGRAPHY' }, 
expected: { field_18: {} } }, 170 | ] 171 | 172 | testCases.forEach(({ input, expected }) => { 173 | expect(logicalType(input)).toEqual(expected) 174 | }) 175 | }) 176 | }) 177 | -------------------------------------------------------------------------------- /src/delta.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Delta Binary Packed encoding for parquet. 3 | * Encodes integers as deltas with variable bit-width packing. 4 | * 5 | * @import {DecodedArray} from 'hyparquet' 6 | * @import {Writer} from '../src/types.js' 7 | */ 8 | 9 | const BLOCK_SIZE = 128 10 | const MINIBLOCKS_PER_BLOCK = 4 11 | const VALUES_PER_MINIBLOCK = BLOCK_SIZE / MINIBLOCKS_PER_BLOCK // 32 12 | 13 | /** 14 | * Write values using delta binary packed encoding. 15 | * 16 | * @param {Writer} writer 17 | * @param {DecodedArray} values 18 | */ 19 | export function deltaBinaryPack(writer, values) { 20 | const count = values.length 21 | if (count === 0) { 22 | // Write header with zero count 23 | writer.appendVarInt(BLOCK_SIZE) 24 | writer.appendVarInt(MINIBLOCKS_PER_BLOCK) 25 | writer.appendVarInt(0) 26 | writer.appendVarInt(0) 27 | return 28 | } 29 | if (typeof values[0] !== 'number' && typeof values[0] !== 'bigint') { 30 | throw new Error('deltaBinaryPack only supports number or bigint arrays') 31 | } 32 | 33 | // Write header 34 | writer.appendVarInt(BLOCK_SIZE) 35 | writer.appendVarInt(MINIBLOCKS_PER_BLOCK) 36 | writer.appendVarInt(count) 37 | writer.appendZigZag(values[0]) 38 | 39 | // Process blocks 40 | let index = 1 41 | while (index < count) { 42 | const blockEnd = Math.min(index + BLOCK_SIZE, count) 43 | const blockSize = blockEnd - index 44 | 45 | // Compute deltas for this block 46 | const blockDeltas = new BigInt64Array(blockSize) 47 | let minDelta = BigInt(values[index]) - BigInt(values[index - 1]) 48 | blockDeltas[0] = minDelta 49 | for (let i = 1; i < blockSize; i++) { 50 | const delta = BigInt(values[index + i]) - BigInt(values[index + i - 1]) 51 | blockDeltas[i] = delta 52 | if (delta < minDelta) minDelta = delta 53 | } 54 | writer.appendZigZag(minDelta) 55 | 56 | // Calculate bit widths for each miniblock 57 | const bitWidths = new Uint8Array(MINIBLOCKS_PER_BLOCK) 58 | for (let mb = 0; mb < MINIBLOCKS_PER_BLOCK; mb++) { 59 | const mbStart = mb * VALUES_PER_MINIBLOCK 60 | const mbEnd = Math.min(mbStart + VALUES_PER_MINIBLOCK, blockSize) 61 | 62 | let maxAdjusted = 0n 63 | for (let i = mbStart; i < mbEnd; i++) { 64 | const adjusted = blockDeltas[i] - minDelta 65 | if (adjusted > maxAdjusted) maxAdjusted = adjusted 66 | } 67 | bitWidths[mb] = bitWidth(maxAdjusted) 68 | } 69 | 70 | // Write bit widths 71 | writer.appendBytes(bitWidths) 72 | 73 | // Write packed miniblocks 74 | for (let mb = 0; mb < MINIBLOCKS_PER_BLOCK; mb++) { 75 | const bitWidth = bitWidths[mb] 76 | if (bitWidth === 0) continue // No data needed for zero bit width 77 | 78 | const mbStart = mb * VALUES_PER_MINIBLOCK 79 | const mbEnd = Math.min(mbStart + VALUES_PER_MINIBLOCK, blockSize) 80 | 81 | // Bit pack the adjusted deltas 82 | let buffer = 0n 83 | let bitsUsed = 0 84 | 85 | for (let i = 0; i < VALUES_PER_MINIBLOCK; i++) { 86 | const adjusted = mbStart + i < mbEnd ? 
blockDeltas[mbStart + i] - minDelta : 0n 87 | buffer |= adjusted << BigInt(bitsUsed) 88 | bitsUsed += bitWidth 89 | 90 | // Flush complete bytes 91 | while (bitsUsed >= 8) { 92 | writer.appendUint8(Number(buffer & 0xffn)) 93 | buffer >>= 8n 94 | bitsUsed -= 8 95 | } 96 | } 97 | // assert(bitsUsed === 0) // because multiple of 8 98 | } 99 | 100 | index = blockEnd 101 | } 102 | } 103 | 104 | /** 105 | * Write byte arrays using delta length encoding. 106 | * Encodes lengths using delta binary packed, then writes raw bytes. 107 | * 108 | * @param {Writer} writer 109 | * @param {DecodedArray} values 110 | */ 111 | export function deltaLengthByteArray(writer, values) { 112 | // Extract lengths 113 | const lengths = new Int32Array(values.length) 114 | for (let i = 0; i < values.length; i++) { 115 | const value = values[i] 116 | if (!(value instanceof Uint8Array)) { 117 | throw new Error('deltaLengthByteArray expects Uint8Array values') 118 | } 119 | lengths[i] = value.length 120 | } 121 | 122 | // Write delta-packed lengths 123 | deltaBinaryPack(writer, lengths) 124 | 125 | // Write raw byte data 126 | for (const value of values) { 127 | writer.appendBytes(value) 128 | } 129 | } 130 | 131 | /** 132 | * Write byte arrays using delta encoding with prefix compression. 133 | * Stores common prefixes with previous value to improve compression. 134 | * 135 | * @param {Writer} writer 136 | * @param {DecodedArray} values 137 | */ 138 | export function deltaByteArray(writer, values) { 139 | if (values.length === 0) { 140 | deltaBinaryPack(writer, []) 141 | deltaBinaryPack(writer, []) 142 | return 143 | } 144 | 145 | // Calculate prefix lengths and suffixes 146 | const prefixLengths = new Int32Array(values.length) 147 | const suffixLengths = new Int32Array(values.length) 148 | /** @type {Uint8Array[]} */ 149 | const suffixes = new Array(values.length) 150 | 151 | // First value has no prefix 152 | const value = values[0] 153 | if (!(value instanceof Uint8Array)) { 154 | throw new Error('deltaByteArray expects Uint8Array values') 155 | } 156 | prefixLengths[0] = 0 157 | suffixLengths[0] = values[0].length 158 | suffixes[0] = values[0] 159 | 160 | for (let i = 1; i < values.length; i++) { 161 | const prev = values[i - 1] 162 | const curr = values[i] 163 | if (!(curr instanceof Uint8Array)) { 164 | throw new Error('deltaByteArray expects Uint8Array values') 165 | } 166 | 167 | // Find common prefix length 168 | let prefixLen = 0 169 | const maxPrefix = Math.min(prev.length, curr.length) 170 | while (prefixLen < maxPrefix && prev[prefixLen] === curr[prefixLen]) { 171 | prefixLen++ 172 | } 173 | 174 | prefixLengths[i] = prefixLen 175 | suffixLengths[i] = curr.length - prefixLen 176 | suffixes[i] = curr.subarray(prefixLen) 177 | } 178 | 179 | // Write delta-packed prefix lengths 180 | deltaBinaryPack(writer, prefixLengths) 181 | 182 | // Write delta-packed suffix lengths 183 | deltaBinaryPack(writer, suffixLengths) 184 | 185 | // Write suffix bytes 186 | for (const suffix of suffixes) { 187 | writer.appendBytes(suffix) 188 | } 189 | } 190 | 191 | /** 192 | * Minimum bits needed to store value. 
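 * For example: bitWidth(0n) === 0, bitWidth(1n) === 1, bitWidth(255n) === 8, bitWidth(256n) === 9.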
193 | * 194 | * @param {bigint} value 195 | * @returns {number} 196 | */ 197 | function bitWidth(value) { 198 | if (value === 0n) return 0 199 | let bits = 0 200 | while (value > 0n) { 201 | bits++ 202 | value >>= 1n 203 | } 204 | return bits 205 | } 206 | -------------------------------------------------------------------------------- /src/schema.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Infer a schema from column data. 3 | * Accepts optional schemaOverrides to override the type of columns by name. 4 | * 5 | * @param {object} options 6 | * @param {ColumnSource[]} options.columnData 7 | * @param {Record} [options.schemaOverrides] 8 | * @returns {SchemaElement[]} 9 | */ 10 | export function schemaFromColumnData({ columnData, schemaOverrides }) { 11 | /** @type {SchemaElement[]} */ 12 | const schema = [{ 13 | name: 'root', 14 | num_children: columnData.length, 15 | }] 16 | let num_rows = 0 17 | 18 | for (const { name, data, type, nullable } of columnData) { 19 | // check if all columns have the same length 20 | num_rows = num_rows || data.length 21 | if (num_rows !== data.length) { 22 | throw new Error('columns must have the same length') 23 | } 24 | 25 | if (schemaOverrides?.[name]) { 26 | // use schema override 27 | const override = schemaOverrides[name] 28 | if (override.name !== name) throw new Error('schema override name does not match column name') 29 | schema.push(override) 30 | } else if (type) { 31 | // use provided type 32 | schema.push(basicTypeToSchemaElement(name, type, nullable)) 33 | } else { 34 | // auto-detect type 35 | schema.push(autoSchemaElement(name, data)) 36 | } 37 | } 38 | 39 | return schema 40 | } 41 | 42 | /** 43 | * @import {ConvertedType, DecodedArray, FieldRepetitionType, ParquetType, SchemaElement} from 'hyparquet' 44 | * @import {BasicType, ColumnSource} from '../src/types.js' 45 | * @param {string} name 46 | * @param {BasicType} type 47 | * @param {boolean} [nullable] 48 | * @returns {SchemaElement} 49 | */ 50 | function basicTypeToSchemaElement(name, type, nullable) { 51 | const repetition_type = nullable === false ? 'REQUIRED' : 'OPTIONAL' 52 | if (type === 'STRING') { 53 | return { name, type: 'BYTE_ARRAY', converted_type: 'UTF8', repetition_type } 54 | } 55 | if (type === 'JSON') { 56 | return { name, type: 'BYTE_ARRAY', converted_type: 'JSON', repetition_type } 57 | } 58 | if (type === 'TIMESTAMP') { 59 | return { name, type: 'INT64', converted_type: 'TIMESTAMP_MILLIS', repetition_type } 60 | } 61 | if (type === 'UUID') { 62 | return { name, type: 'FIXED_LEN_BYTE_ARRAY', type_length: 16, logical_type: { type: 'UUID' }, repetition_type } 63 | } 64 | if (type === 'FLOAT16') { 65 | return { name, type: 'FIXED_LEN_BYTE_ARRAY', type_length: 2, logical_type: { type: 'FLOAT16' }, repetition_type } 66 | } 67 | if (type === 'GEOMETRY') { 68 | return { name, type: 'BYTE_ARRAY', logical_type: { type: 'GEOMETRY' }, repetition_type } 69 | } 70 | if (type === 'GEOGRAPHY') { 71 | return { name, type: 'BYTE_ARRAY', logical_type: { type: 'GEOGRAPHY' }, repetition_type } 72 | } 73 | return { name, type, repetition_type } 74 | } 75 | 76 | /** 77 | * Automatically determine a SchemaElement from an array of values. 
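 * Typed arrays map directly to parquet types (Int32Array -> INT32, BigInt64Array -> INT64,
 * Float32Array -> FLOAT, Float64Array -> DOUBLE). Otherwise each value is inspected:
 * null/undefined marks the column OPTIONAL, strings become BYTE_ARRAY/UTF8,
 * Dates become INT64/TIMESTAMP_MILLIS, and mixed INT32/DOUBLE values are widened to DOUBLE.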
78 | * 79 | * @param {string} name 80 | * @param {DecodedArray} values 81 | * @returns {SchemaElement} 82 | */ 83 | export function autoSchemaElement(name, values) { 84 | /** @type {ParquetType | undefined} */ 85 | let type 86 | /** @type {FieldRepetitionType} */ 87 | let repetition_type = 'REQUIRED' 88 | /** @type {ConvertedType | undefined} */ 89 | let converted_type = undefined 90 | 91 | if (values instanceof Int32Array) return { name, type: 'INT32', repetition_type } 92 | if (values instanceof BigInt64Array) return { name, type: 'INT64', repetition_type } 93 | if (values instanceof Float32Array) return { name, type: 'FLOAT', repetition_type } 94 | if (values instanceof Float64Array) return { name, type: 'DOUBLE', repetition_type } 95 | 96 | for (const value of values) { 97 | if (value === null || value === undefined) { 98 | repetition_type = 'OPTIONAL' 99 | } else { 100 | // value is defined 101 | /** @type {ParquetType | undefined} */ 102 | let valueType = undefined 103 | if (value === true || value === false) valueType = 'BOOLEAN' 104 | else if (typeof value === 'bigint') valueType = 'INT64' 105 | else if (Number.isInteger(value)) valueType = 'INT32' 106 | else if (typeof value === 'number') valueType = 'DOUBLE' 107 | else if (value instanceof Uint8Array) valueType = 'BYTE_ARRAY' 108 | else if (typeof value === 'string') { 109 | valueType = 'BYTE_ARRAY' 110 | // make sure they are all strings 111 | if (type && !converted_type) throw new Error('mixed types not supported') 112 | converted_type = 'UTF8' 113 | } 114 | else if (value instanceof Date) { 115 | valueType = 'INT64' 116 | // make sure they are all dates 117 | if (type && !converted_type) throw new Error('mixed types not supported') 118 | converted_type = 'TIMESTAMP_MILLIS' 119 | } 120 | else if (typeof value === 'object') { 121 | // use json (TODO: native list and object types) 122 | converted_type = 'JSON' 123 | valueType = 'BYTE_ARRAY' 124 | } 125 | else if (!valueType) throw new Error(`cannot determine parquet type for: ${value}`) 126 | 127 | // expand type if necessary 128 | if (type === undefined) { 129 | type = valueType 130 | } else if (type === 'INT32' && valueType === 'DOUBLE') { 131 | type = 'DOUBLE' 132 | } else if (type === 'DOUBLE' && valueType === 'INT32') { 133 | valueType = 'DOUBLE' 134 | } 135 | if (type !== valueType) { 136 | throw new Error(`parquet cannot write mixed types: ${type} and ${valueType}`) 137 | } 138 | } 139 | } 140 | if (!type) { 141 | // fallback to nullable BYTE_ARRAY 142 | // TODO: logical_type: 'NULL' 143 | type = 'BYTE_ARRAY' 144 | repetition_type = 'OPTIONAL' 145 | } 146 | return { name, type, repetition_type, converted_type } 147 | } 148 | 149 | /** 150 | * Get the max repetition level for a given schema path. 151 | * 152 | * @param {SchemaElement[]} schemaPath 153 | * @returns {number} max repetition level 154 | */ 155 | export function getMaxRepetitionLevel(schemaPath) { 156 | let maxLevel = 0 157 | for (const element of schemaPath) { 158 | if (element.repetition_type === 'REPEATED') { 159 | maxLevel++ 160 | } 161 | } 162 | return maxLevel 163 | } 164 | 165 | /** 166 | * Get the max definition level for a given schema path. 
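 * Counts the elements in the path (excluding the root) whose repetition_type is not REQUIRED.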
167 | * 168 | * @param {SchemaElement[]} schemaPath 169 | * @returns {number} max definition level 170 | */ 171 | export function getMaxDefinitionLevel(schemaPath) { 172 | let maxLevel = 0 173 | for (const element of schemaPath.slice(1)) { 174 | if (element.repetition_type !== 'REQUIRED') { 175 | maxLevel++ 176 | } 177 | } 178 | return maxLevel 179 | } 180 | -------------------------------------------------------------------------------- /src/snappy.js: -------------------------------------------------------------------------------- 1 | /** 2 | * The MIT License (MIT) 3 | * Copyright (c) 2016 Zhipeng Jia 4 | * https://github.com/zhipeng-jia/snappyjs 5 | */ 6 | 7 | import { ByteWriter } from './bytewriter.js' 8 | 9 | const BLOCK_LOG = 16 10 | const BLOCK_SIZE = 1 << BLOCK_LOG 11 | 12 | const MAX_HASH_TABLE_BITS = 14 13 | const globalHashTables = new Array(MAX_HASH_TABLE_BITS + 1) 14 | 15 | /** 16 | * Compress snappy data. 17 | * Returns Snappy-compressed bytes as Uint8Array. 18 | * 19 | * @param {Uint8Array} input - uncompressed data 20 | * @returns {Uint8Array} 21 | */ 22 | export function snappyCompress(input) { 23 | const writer = new ByteWriter() 24 | // Write uncompressed length as a varint 25 | writer.appendVarInt(input.length) 26 | if (input.length === 0) return new Uint8Array(writer.getBuffer()) 27 | 28 | // Process input in 64K blocks 29 | let pos = 0 30 | while (pos < input.length) { 31 | const fragmentSize = Math.min(input.length - pos, BLOCK_SIZE) 32 | compressFragment(input, pos, fragmentSize, writer) 33 | pos += fragmentSize 34 | } 35 | return new Uint8Array(writer.getBuffer()) 36 | } 37 | 38 | /** 39 | * Hash function used in the reference implementation. 40 | * 41 | * @param {number} key 42 | * @param {number} hashFuncShift 43 | * @returns {number} 44 | */ 45 | function hashFunc(key, hashFuncShift) { 46 | return key * 0x1e35a7bd >>> hashFuncShift 47 | } 48 | 49 | /** 50 | * Load a 32-bit little-endian integer from a byte array. 51 | * 52 | * @param {Uint8Array} array 53 | * @param {number} pos 54 | * @returns {number} 55 | */ 56 | function load32(array, pos) { 57 | // Expects Uint8Array as `array` 58 | return ( 59 | array[pos] + 60 | (array[pos + 1] << 8) + 61 | (array[pos + 2] << 16) + 62 | (array[pos + 3] << 24) 63 | ) 64 | } 65 | 66 | /** 67 | * Compare two 32-bit sequences for equality. 68 | * 69 | * @param {Uint8Array} array 70 | * @param {number} pos1 71 | * @param {number} pos2 72 | * @returns {boolean} 73 | */ 74 | function equals32(array, pos1, pos2) { 75 | return ( 76 | array[pos1] === array[pos2] && 77 | array[pos1 + 1] === array[pos2 + 1] && 78 | array[pos1 + 2] === array[pos2 + 2] && 79 | array[pos1 + 3] === array[pos2 + 3] 80 | ) 81 | } 82 | 83 | /** 84 | * Emit a literal chunk of data. 85 | * @import {Writer} from '../src/types.js' 86 | * @param {Uint8Array} input 87 | * @param {number} ip 88 | * @param {number} len 89 | * @param {Writer} writer 90 | */ 91 | function emitLiteral(input, ip, len, writer) { 92 | // The first byte(s) encode the literal length 93 | if (len <= 60) { 94 | writer.appendUint8(len - 1 << 2) 95 | } else if (len < 256) { 96 | writer.appendUint8(60 << 2) 97 | writer.appendUint8(len - 1) 98 | } else { 99 | writer.appendUint8(61 << 2) 100 | writer.appendUint8(len - 1 & 0xff) 101 | writer.appendUint8(len - 1 >>> 8) 102 | } 103 | 104 | // Then copy the literal bytes 105 | writer.appendBytes(input.subarray(ip, ip + len)) 106 | } 107 | 108 | /** 109 | * Emit a copy of previous data. 
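 * Emits a single copy element: a 2-byte tag when len is 4..11 and offset < 2048,
 * otherwise a 3-byte tag with a 16-bit little-endian offset.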
110 | * @param {Writer} writer 111 | * @param {number} offset 112 | * @param {number} len 113 | */ 114 | function emitCopyLessThan64(writer, offset, len) { 115 | if (len < 12 && offset < 2048) { 116 | // Copy 4..11 bytes, offset < 2048 117 | // --> [ 1 | (len-4)<<2 | (offset>>8)<<5 ] 118 | writer.appendUint8(1 + (len - 4 << 2) + (offset >>> 8 << 5)) 119 | writer.appendUint8(offset & 0xff) 120 | } else { 121 | // Copy len bytes, offset 1..65535 122 | // --> [ 2 | (len-1)<<2 ] 123 | writer.appendUint8(2 + (len - 1 << 2)) 124 | writer.appendUint8(offset & 0xff) 125 | writer.appendUint8(offset >>> 8) 126 | } 127 | } 128 | 129 | /** 130 | * Emit a copy of previous data. 131 | * @param {Writer} writer 132 | * @param {number} offset 133 | * @param {number} len 134 | */ 135 | function emitCopy(writer, offset, len) { 136 | // Emit 64-byte copies as long as we can 137 | while (len >= 68) { 138 | emitCopyLessThan64(writer, offset, 64) 139 | len -= 64 140 | } 141 | // Emit one 60-byte copy if needed 142 | if (len > 64) { 143 | emitCopyLessThan64(writer, offset, 60) 144 | len -= 60 145 | } 146 | // Final copy 147 | emitCopyLessThan64(writer, offset, len) 148 | } 149 | 150 | /** 151 | * Compress a fragment of data. 152 | * @param {Uint8Array} input 153 | * @param {number} ip 154 | * @param {number} inputSize 155 | * @param {Writer} writer 156 | */ 157 | function compressFragment(input, ip, inputSize, writer) { 158 | let hashTableBits = 1 159 | while (1 << hashTableBits <= inputSize && hashTableBits <= MAX_HASH_TABLE_BITS) { 160 | hashTableBits += 1 161 | } 162 | hashTableBits -= 1 163 | const hashFuncShift = 32 - hashTableBits 164 | 165 | // Initialize the hash table 166 | if (typeof globalHashTables[hashTableBits] === 'undefined') { 167 | globalHashTables[hashTableBits] = new Uint16Array(1 << hashTableBits) 168 | } 169 | const hashTable = globalHashTables[hashTableBits] 170 | hashTable.fill(0) 171 | 172 | const ipEnd = ip + inputSize 173 | let ipLimit 174 | const baseIp = ip 175 | let nextEmit = ip 176 | 177 | let hash, nextHash 178 | let nextIp, candidate, skip 179 | let bytesBetweenHashLookups 180 | let base, matched, offset 181 | let prevHash, curHash 182 | let flag = true 183 | 184 | const INPUT_MARGIN = 15 185 | if (inputSize >= INPUT_MARGIN) { 186 | ipLimit = ipEnd - INPUT_MARGIN 187 | ip += 1 188 | nextHash = hashFunc(load32(input, ip), hashFuncShift) 189 | 190 | while (flag) { 191 | skip = 32 192 | nextIp = ip 193 | do { 194 | ip = nextIp 195 | hash = nextHash 196 | bytesBetweenHashLookups = skip >>> 5 197 | skip += 1 198 | nextIp = ip + bytesBetweenHashLookups 199 | if (ip > ipLimit) { 200 | flag = false 201 | break 202 | } 203 | nextHash = hashFunc(load32(input, nextIp), hashFuncShift) 204 | candidate = baseIp + hashTable[hash] 205 | hashTable[hash] = ip - baseIp 206 | } while (!equals32(input, ip, candidate)) 207 | 208 | if (!flag) { 209 | break 210 | } 211 | 212 | // Emit the literal from `nextEmit` to `ip` 213 | emitLiteral(input, nextEmit, ip - nextEmit, writer) 214 | 215 | // We found a match. 
Repeatedly match and emit copies 216 | do { 217 | base = ip 218 | matched = 4 219 | while ( 220 | ip + matched < ipEnd && 221 | input[ip + matched] === input[candidate + matched] 222 | ) { 223 | matched++ 224 | } 225 | ip += matched 226 | offset = base - candidate 227 | emitCopy(writer, offset, matched) 228 | 229 | nextEmit = ip 230 | if (ip >= ipLimit) { 231 | flag = false 232 | break 233 | } 234 | prevHash = hashFunc(load32(input, ip - 1), hashFuncShift) 235 | hashTable[prevHash] = ip - 1 - baseIp 236 | curHash = hashFunc(load32(input, ip), hashFuncShift) 237 | candidate = baseIp + hashTable[curHash] 238 | hashTable[curHash] = ip - baseIp 239 | } while (equals32(input, ip, candidate)) 240 | 241 | if (!flag) { 242 | break 243 | } 244 | 245 | ip += 1 246 | nextHash = hashFunc(load32(input, ip), hashFuncShift) 247 | } 248 | } 249 | 250 | // Emit the last literal (if any) 251 | if (nextEmit < ipEnd) { 252 | emitLiteral(input, nextEmit, ipEnd - nextEmit, writer) 253 | } 254 | } 255 | -------------------------------------------------------------------------------- /src/metadata.js: -------------------------------------------------------------------------------- 1 | import { getSchemaPath } from 'hyparquet/src/schema.js' 2 | import { CompressionCodecs, ConvertedTypes, Encodings, FieldRepetitionTypes, PageTypes, ParquetTypes } from 'hyparquet/src/constants.js' 3 | import { serializeTCompactProtocol } from './thrift.js' 4 | import { unconvertStatistics } from './unconvert.js' 5 | 6 | /** 7 | * @import {FileMetaData, LogicalType, TimeUnit} from 'hyparquet' 8 | * @import {ThriftObject, Writer} from '../src/types.js' 9 | * @param {Writer} writer 10 | * @param {FileMetaData} metadata 11 | */ 12 | export function writeMetadata(writer, metadata) { 13 | /** @type {ThriftObject} */ 14 | const compact = { 15 | field_1: metadata.version, 16 | field_2: metadata.schema && metadata.schema.map(element => ({ 17 | field_1: element.type && ParquetTypes.indexOf(element.type), 18 | field_2: element.type_length, 19 | field_3: element.repetition_type && FieldRepetitionTypes.indexOf(element.repetition_type), 20 | field_4: element.name, 21 | field_5: element.num_children, 22 | field_6: element.converted_type && ConvertedTypes.indexOf(element.converted_type), 23 | field_7: element.scale, 24 | field_8: element.precision, 25 | field_9: element.field_id, 26 | field_10: logicalType(element.logical_type), 27 | })), 28 | field_3: metadata.num_rows, 29 | field_4: metadata.row_groups.map(rg => ({ 30 | field_1: rg.columns.map((c, columnIndex) => ({ 31 | field_1: c.file_path, 32 | field_2: c.file_offset, 33 | field_3: c.meta_data && { 34 | field_1: ParquetTypes.indexOf(c.meta_data.type), 35 | field_2: c.meta_data.encodings.map(e => Encodings.indexOf(e)), 36 | field_3: c.meta_data.path_in_schema, 37 | field_4: CompressionCodecs.indexOf(c.meta_data.codec), 38 | field_5: c.meta_data.num_values, 39 | field_6: c.meta_data.total_uncompressed_size, 40 | field_7: c.meta_data.total_compressed_size, 41 | field_8: c.meta_data.key_value_metadata && c.meta_data.key_value_metadata.map(kv => ({ 42 | field_1: kv.key, 43 | field_2: kv.value, 44 | })), 45 | field_9: c.meta_data.data_page_offset, 46 | field_10: c.meta_data.index_page_offset, 47 | field_11: c.meta_data.dictionary_page_offset, 48 | field_12: c.meta_data.statistics && unconvertStatistics( 49 | c.meta_data.statistics, 50 | schemaElement(metadata.schema, c.meta_data.path_in_schema, columnIndex + 1) 51 | ), 52 | field_13: c.meta_data.encoding_stats && c.meta_data.encoding_stats.map(es => 
({ 53 | field_1: PageTypes.indexOf(es.page_type), 54 | field_2: Encodings.indexOf(es.encoding), 55 | field_3: es.count, 56 | })), 57 | field_14: c.meta_data.bloom_filter_offset, 58 | field_15: c.meta_data.bloom_filter_length, 59 | field_16: c.meta_data.size_statistics && { 60 | field_1: c.meta_data.size_statistics.unencoded_byte_array_data_bytes, 61 | field_2: c.meta_data.size_statistics.repetition_level_histogram, 62 | field_3: c.meta_data.size_statistics.definition_level_histogram, 63 | }, 64 | field_17: c.meta_data.geospatial_statistics && { 65 | field_1: c.meta_data.geospatial_statistics.bbox && { 66 | field_1: c.meta_data.geospatial_statistics.bbox.xmin, 67 | field_2: c.meta_data.geospatial_statistics.bbox.xmax, 68 | field_3: c.meta_data.geospatial_statistics.bbox.ymin, 69 | field_4: c.meta_data.geospatial_statistics.bbox.ymax, 70 | field_5: c.meta_data.geospatial_statistics.bbox.zmin, 71 | field_6: c.meta_data.geospatial_statistics.bbox.zmax, 72 | field_7: c.meta_data.geospatial_statistics.bbox.mmin, 73 | field_8: c.meta_data.geospatial_statistics.bbox.mmax, 74 | }, 75 | field_2: c.meta_data.geospatial_statistics.geospatial_types, 76 | }, 77 | }, 78 | field_4: c.offset_index_offset, 79 | field_5: c.offset_index_length, 80 | field_6: c.column_index_offset, 81 | field_7: c.column_index_length, 82 | // field_8: c.crypto_metadata, 83 | field_9: c.encrypted_column_metadata, 84 | })), 85 | field_2: rg.total_byte_size, 86 | field_3: rg.num_rows, 87 | field_4: rg.sorting_columns && rg.sorting_columns.map(sc => ({ 88 | field_1: sc.column_idx, 89 | field_2: sc.descending, 90 | field_3: sc.nulls_first, 91 | })), 92 | field_5: rg.file_offset, 93 | field_6: rg.total_compressed_size, 94 | // field_7: rg.ordinal, // should be int16 95 | })), 96 | field_5: metadata.key_value_metadata && metadata.key_value_metadata.map(kv => ({ 97 | field_1: kv.key, 98 | field_2: kv.value, 99 | })), 100 | field_6: metadata.created_by, 101 | } 102 | 103 | // write metadata as thrift 104 | const metadataStart = writer.offset 105 | serializeTCompactProtocol(writer, compact) 106 | // write metadata length 107 | const metadataLength = writer.offset - metadataStart 108 | writer.appendUint32(metadataLength) 109 | } 110 | 111 | /** 112 | * Resolve schema element for statistics using the stored path. 
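 * Falls back to the column's positional element in the flat schema when
 * path_in_schema is empty or cannot be resolved.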
113 | * 114 | * @param {import('hyparquet').SchemaElement[]} schema 115 | * @param {string[] | undefined} path 116 | * @param {number} fallbackIndex 117 | * @returns {import('hyparquet').SchemaElement} 118 | */ 119 | function schemaElement(schema, path, fallbackIndex) { 120 | if (path?.length) { 121 | const resolved = getSchemaPath(schema, path).at(-1)?.element 122 | if (resolved) return resolved 123 | } 124 | return schema[fallbackIndex] 125 | } 126 | 127 | /** 128 | * @param {LogicalType | undefined} type 129 | * @returns {ThriftObject | undefined} 130 | */ 131 | export function logicalType(type) { 132 | if (!type) return 133 | if (type.type === 'STRING') return { field_1: {} } 134 | if (type.type === 'MAP') return { field_2: {} } 135 | if (type.type === 'LIST') return { field_3: {} } 136 | if (type.type === 'ENUM') return { field_4: {} } 137 | if (type.type === 'DECIMAL') return { field_5: { 138 | field_1: type.scale, 139 | field_2: type.precision, 140 | } } 141 | if (type.type === 'DATE') return { field_6: {} } 142 | if (type.type === 'TIME') return { field_7: { 143 | field_1: type.isAdjustedToUTC, 144 | field_2: timeUnit(type.unit), 145 | } } 146 | if (type.type === 'TIMESTAMP') return { field_8: { 147 | field_1: type.isAdjustedToUTC, 148 | field_2: timeUnit(type.unit), 149 | } } 150 | if (type.type === 'INTEGER') return { field_10: { 151 | field_1: type.bitWidth, 152 | field_2: type.isSigned, 153 | } } 154 | if (type.type === 'NULL') return { field_11: {} } 155 | if (type.type === 'JSON') return { field_12: {} } 156 | if (type.type === 'BSON') return { field_13: {} } 157 | if (type.type === 'UUID') return { field_14: {} } 158 | if (type.type === 'FLOAT16') return { field_15: {} } 159 | if (type.type === 'VARIANT') return { field_16: {} } 160 | if (type.type === 'GEOMETRY') return { field_17: { 161 | field_1: type.crs, 162 | } } 163 | if (type.type === 'GEOGRAPHY') return { field_18: { 164 | field_1: type.crs, 165 | field_2: type.algorithm && edgeAlgorithm[type.algorithm], 166 | } } 167 | } 168 | 169 | /** 170 | * @param {TimeUnit} unit 171 | * @returns {ThriftObject} 172 | */ 173 | function timeUnit(unit) { 174 | if (unit === 'NANOS') return { field_3: {} } 175 | if (unit === 'MICROS') return { field_2: {} } 176 | return { field_1: {} } 177 | } 178 | 179 | /** 180 | * @import {EdgeInterpolationAlgorithm} from 'hyparquet/src/types.js' 181 | * @type {Record} 182 | */ 183 | const edgeAlgorithm = { 184 | SPHERICAL: 0, 185 | VINCENTY: 1, 186 | THOMAS: 2, 187 | ANDOYER: 3, 188 | KARNEY: 4, 189 | } 190 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hyparquet Writer 2 | 3 | ![hyparquet writer parakeet](hyparquet-writer.jpg) 4 | 5 | [![npm](https://img.shields.io/npm/v/hyparquet-writer)](https://www.npmjs.com/package/hyparquet-writer) 6 | [![minzipped](https://img.shields.io/bundlephobia/minzip/hyparquet-writer)](https://www.npmjs.com/package/hyparquet-writer) 7 | [![workflow status](https://github.com/hyparam/hyparquet-writer/actions/workflows/ci.yml/badge.svg)](https://github.com/hyparam/hyparquet-writer/actions) 8 | [![mit license](https://img.shields.io/badge/License-MIT-orange.svg)](https://opensource.org/licenses/MIT) 9 | ![coverage](https://img.shields.io/badge/Coverage-95-darkred) 10 | [![dependencies](https://img.shields.io/badge/Dependencies-1-blueviolet)](https://www.npmjs.com/package/hyparquet-writer?activeTab=dependencies) 11 | 12 | Hyparquet 
Writer is a JavaScript library for writing [Apache Parquet](https://parquet.apache.org) files. It is designed to be lightweight and fast, and to store data efficiently. It is a companion to the [hyparquet](https://github.com/hyparam/hyparquet) library, which is a JavaScript library for reading parquet files. 13 | 14 | ## Quick Start 15 | 16 | To write a parquet file to an `ArrayBuffer`, use `parquetWriteBuffer` with the argument `columnData`. Each column in `columnData` should contain: 17 | 18 | - `name`: the column name 19 | - `data`: an array of same-type values 20 | - `type`: the parquet schema type (optional) 21 | 22 | ```javascript 23 | import { parquetWriteBuffer } from 'hyparquet-writer' 24 | 25 | const arrayBuffer = parquetWriteBuffer({ 26 | columnData: [ 27 | { name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'STRING' }, 28 | { name: 'age', data: [25, 30, 35], type: 'INT32' }, 29 | ], 30 | }) 31 | ``` 32 | 33 | Note: if `type` is not provided, the type will be guessed from the data. The supported `BasicType` values are a superset of the parquet primitive types: 34 | 35 | | Basic Type | Equivalent Schema Element | 36 | |------|----------------| 37 | | `BOOLEAN` | `{ type: 'BOOLEAN' }` | 38 | | `INT32` | `{ type: 'INT32' }` | 39 | | `INT64` | `{ type: 'INT64' }` | 40 | | `FLOAT` | `{ type: 'FLOAT' }` | 41 | | `DOUBLE` | `{ type: 'DOUBLE' }` | 42 | | `BYTE_ARRAY` | `{ type: 'BYTE_ARRAY' }` | 43 | | `STRING` | `{ type: 'BYTE_ARRAY', converted_type: 'UTF8' }` | 44 | | `JSON` | `{ type: 'BYTE_ARRAY', converted_type: 'JSON' }` | 45 | | `TIMESTAMP` | `{ type: 'INT64', converted_type: 'TIMESTAMP_MILLIS' }` | 46 | | `UUID` | `{ type: 'FIXED_LEN_BYTE_ARRAY', type_length: 16, logical_type: { type: 'UUID' } }` | 47 | | `FLOAT16` | `{ type: 'FIXED_LEN_BYTE_ARRAY', type_length: 2, logical_type: { type: 'FLOAT16' } }` | 48 | | `GEOMETRY` | `{ type: 'BYTE_ARRAY', logical_type: { type: 'GEOMETRY' } }` | 49 | | `GEOGRAPHY` | `{ type: 'BYTE_ARRAY', logical_type: { type: 'GEOGRAPHY' } }` | 50 | 51 | More types are supported but require defining the `schema` explicitly. See the [advanced usage](#advanced-usage) section for more details. 52 | 53 | ### Write to Local Parquet File (nodejs) 54 | 55 | To write a local parquet file in Node.js, use `parquetWriteFile` with arguments `filename` and `columnData`: 56 | 57 | ```javascript 58 | const { parquetWriteFile } = await import('hyparquet-writer') 59 | 60 | parquetWriteFile({ 61 | filename: 'example.parquet', 62 | columnData: [ 63 | { name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'STRING' }, 64 | { name: 'age', data: [25, 30, 35], type: 'INT32' }, 65 | ], 66 | }) 67 | ``` 68 | 69 | Note: hyparquet-writer is published as an ES module, so dynamic `import()` may be required on the command line.
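As a quick sanity check, a buffer written with `parquetWriteBuffer` can be read straight back using [hyparquet](https://github.com/hyparam/hyparquet) (a minimal sketch, assuming hyparquet is also installed):

```javascript
import { parquetReadObjects } from 'hyparquet'
import { parquetWriteBuffer } from 'hyparquet-writer'

const buffer = parquetWriteBuffer({
  columnData: [
    { name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'STRING' },
    { name: 'age', data: [25, 30, 35], type: 'INT32' },
  ],
})
// read the in-memory buffer back into row objects
const rows = await parquetReadObjects({ file: buffer })
// rows[0] is { name: 'Alice', age: 25 }
```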
70 | 71 | ## Advanced Usage 72 | 73 | Options can be passed to `parquetWrite` to adjust parquet file writing behavior: 74 | 75 | - `writer`: a generic writer object 76 | - `schema`: parquet schema object (optional) 77 | - `codec`: compression codec to use (default `SNAPPY`) 78 | - `compressors`: custom compressors, keyed by codec name 79 | - `statistics`: write column statistics (default true) 80 | - `rowGroupSize`: number of rows in each row group (default 100000) 81 | - `kvMetadata`: extra key-value metadata to be stored in the parquet footer 82 | 83 | ```javascript 84 | import { ByteWriter, parquetWrite } from 'hyparquet-writer' 85 | import { snappyCompress } from 'hysnappy' 86 | 87 | const writer = new ByteWriter() 88 | parquetWrite({ 89 | writer, 90 | columnData: [ 91 | { name: 'name', data: ['Alice', 'Bob', 'Charlie'] }, 92 | { name: 'age', data: [25, 30, 35] }, 93 | { name: 'dob', data: [new Date(1000000), new Date(2000000), new Date(3000000)] }, 94 | ], 95 | // explicit schema: 96 | schema: [ 97 | { name: 'root', num_children: 3 }, 98 | { name: 'name', type: 'BYTE_ARRAY', converted_type: 'UTF8' }, 99 | { name: 'age', type: 'FIXED_LEN_BYTE_ARRAY', type_length: 4, converted_type: 'DECIMAL', scale: 2, precision: 4 }, 100 | { name: 'dob', type: 'INT32', converted_type: 'DATE' }, 101 | ], 102 | compressors: { SNAPPY: snappyCompress }, // high performance wasm compressor 103 | statistics: false, 104 | rowGroupSize: 1000, 105 | kvMetadata: [ 106 | { key: 'key1', value: 'value1' }, 107 | { key: 'key2', value: 'value2' }, 108 | ], 109 | }) 110 | const arrayBuffer = writer.getBuffer() 111 | ``` 112 | 113 | ### Types 114 | 115 | Parquet requires an explicit schema to be defined. You can provide schema information in three ways: 116 | 117 | 1. **Type**: You can provide a `type` in the `columnData` elements; it will be used as the schema type. 118 | 2. **Schema**: You can provide a `schema` parameter that explicitly defines the parquet schema. The schema should be an array of `SchemaElement` objects (see [parquet-format](https://github.com/apache/parquet-format)), each containing the following properties: 119 | - `name`: column name 120 | - `type`: parquet type 121 | - `num_children`: number of children in a nested parquet schema (optional) 122 | - `converted_type`: parquet converted type (optional) 123 | - `logical_type`: parquet logical type (optional) 124 | - `repetition_type`: parquet repetition type (optional) 125 | - `type_length`: length for `FIXED_LEN_BYTE_ARRAY` type (optional) 126 | - `scale`: the scale factor for `DECIMAL` converted types (optional) 127 | - `precision`: the precision for `DECIMAL` converted types (optional) 128 | - `field_id`: the field id for the column (optional) 129 | 3. **Auto-detect**: If you provide no type or schema, the type will be auto-detected from the data. However, it is recommended that you provide type information when possible (zero rows would throw an exception, floats might be typed as int, etc.). 130 | 131 | Most converted types will also be auto-detected if you just provide data with no types, but providing explicit type information avoids these pitfalls. 132 | 133 | #### Schema Overrides 134 | 135 | You can use mostly automatic schema detection, but override the schema for specific columns. This is useful if most of the column types can be automatically determined, but you want to use a specific schema element for one particular column. 
136 | 137 | ```javascript 138 | const { ByteWriter, parquetWrite, schemaFromColumnData } = await import("hyparquet-writer") 139 | 140 | const columnData = [ 141 | { name: 'unsigned_int', data: [1000000, 2000000] }, 142 | { name: 'signed_int', data: [1000000, 2000000] }, 143 | ] 144 | const writer = new ByteWriter() 145 | parquetWrite({ 146 | writer, 147 | columnData, 148 | // override schema for uint column 149 | schema: schemaFromColumnData({ 150 | columnData, 151 | schemaOverrides: { 152 | unsigned_int: { 153 | name: 'unsigned_int', 154 | type: 'INT32', 155 | converted_type: 'UINT_32', 156 | repetition_type: 'REQUIRED', 157 | }, 158 | }, 159 | }), 160 | }) 161 | ``` 162 | 163 | ## References 164 | 165 | - https://github.com/hyparam/hyparquet 166 | - https://github.com/hyparam/hyparquet-compressors 167 | - https://github.com/apache/parquet-format 168 | - https://github.com/apache/parquet-testing 169 | -------------------------------------------------------------------------------- /src/datapage.js: -------------------------------------------------------------------------------- 1 | import { Encodings, PageTypes } from 'hyparquet/src/constants.js' 2 | import { ByteWriter } from './bytewriter.js' 3 | import { deltaBinaryPack, deltaByteArray, deltaLengthByteArray } from './delta.js' 4 | import { writeRleBitPackedHybrid } from './encoding.js' 5 | import { writePlain } from './plain.js' 6 | import { writeByteStreamSplit } from './splitstream.js' 7 | import { serializeTCompactProtocol } from './thrift.js' 8 | import { getMaxDefinitionLevel, getMaxRepetitionLevel } from './schema.js' 9 | 10 | /** 11 | * @param {Writer} writer 12 | * @param {DecodedArray} values 13 | * @param {ColumnEncoder} column 14 | * @param {Encoding} encoding 15 | * @param {PageData} [listValues] 16 | */ 17 | export function writeDataPageV2(writer, values, column, encoding, listValues) { 18 | const { columnName, element, codec, compressors } = column 19 | const { type, type_length, repetition_type } = element 20 | 21 | if (!type) throw new Error(`column ${columnName} cannot determine type`) 22 | if (repetition_type === 'REPEATED') throw new Error(`column ${columnName} repeated types not supported`) 23 | 24 | // write levels to temp buffer 25 | const levelWriter = new ByteWriter() 26 | const { 27 | definition_levels_byte_length, 28 | repetition_levels_byte_length, 29 | num_nulls, 30 | num_values, 31 | } = writeLevels(levelWriter, column, values, listValues) 32 | 33 | const nonnull = values.filter(v => v !== null && v !== undefined) 34 | 35 | // write page data to temp buffer 36 | const page = new ByteWriter() 37 | if (encoding === 'PLAIN') { 38 | writePlain(page, nonnull, type, type_length) 39 | } else if (encoding === 'RLE') { 40 | if (type !== 'BOOLEAN') throw new Error('RLE encoding only supported for BOOLEAN type') 41 | const rleData = new ByteWriter() 42 | writeRleBitPackedHybrid(rleData, nonnull, 1) 43 | page.appendUint32(rleData.offset) // prepend byte length 44 | page.appendBuffer(rleData.getBuffer()) 45 | } else if (encoding === 'PLAIN_DICTIONARY' || encoding === 'RLE_DICTIONARY') { 46 | // find max bitwidth 47 | let maxValue = 0 48 | for (const v of values) if (v > maxValue) maxValue = v 49 | const bitWidth = Math.ceil(Math.log2(maxValue + 1)) 50 | page.appendUint8(bitWidth) // prepend bitWidth 51 | writeRleBitPackedHybrid(page, nonnull, bitWidth) 52 | } else if (encoding === 'DELTA_BINARY_PACKED') { 53 | if (type !== 'INT32' && type !== 'INT64') { 54 | throw new Error('DELTA_BINARY_PACKED encoding only supported 
for INT32 and INT64 types') 55 | } 56 | deltaBinaryPack(page, nonnull) 57 | } else if (encoding === 'DELTA_LENGTH_BYTE_ARRAY') { 58 | if (type !== 'BYTE_ARRAY') { 59 | throw new Error('DELTA_LENGTH_BYTE_ARRAY encoding only supported for BYTE_ARRAY type') 60 | } 61 | deltaLengthByteArray(page, nonnull) 62 | } else if (encoding === 'DELTA_BYTE_ARRAY') { 63 | if (type !== 'BYTE_ARRAY') { 64 | throw new Error('DELTA_BYTE_ARRAY encoding only supported for BYTE_ARRAY type') 65 | } 66 | deltaByteArray(page, nonnull) 67 | } else if (encoding === 'BYTE_STREAM_SPLIT') { 68 | writeByteStreamSplit(page, nonnull, type, type_length) 69 | } else { 70 | throw new Error(`parquet unsupported encoding: ${encoding}`) 71 | } 72 | 73 | // compress page data 74 | const pageBuffer = new Uint8Array(page.getBuffer()) 75 | const compressedBytes = compressors[codec]?.(pageBuffer) ?? pageBuffer 76 | 77 | // write page header 78 | writePageHeader(writer, { 79 | type: 'DATA_PAGE_V2', 80 | uncompressed_page_size: levelWriter.offset + page.offset, 81 | compressed_page_size: levelWriter.offset + compressedBytes.length, 82 | data_page_header_v2: { 83 | num_values, 84 | num_nulls, 85 | num_rows: values.length, 86 | encoding, 87 | definition_levels_byte_length, 88 | repetition_levels_byte_length, 89 | is_compressed: !!codec, 90 | }, 91 | }) 92 | 93 | // write levels 94 | writer.appendBuffer(levelWriter.getBuffer()) 95 | 96 | // write page data 97 | writer.appendBytes(compressedBytes) 98 | } 99 | 100 | /** 101 | * @param {Writer} writer 102 | * @param {PageHeader} header 103 | */ 104 | export function writePageHeader(writer, header) { 105 | /** @type {ThriftObject} */ 106 | const compact = { 107 | field_1: PageTypes.indexOf(header.type), 108 | field_2: header.uncompressed_page_size, 109 | field_3: header.compressed_page_size, 110 | field_4: header.crc, 111 | field_5: header.data_page_header && { 112 | field_1: header.data_page_header.num_values, 113 | field_2: Encodings.indexOf(header.data_page_header.encoding), 114 | field_3: Encodings.indexOf(header.data_page_header.definition_level_encoding), 115 | field_4: Encodings.indexOf(header.data_page_header.repetition_level_encoding), 116 | // field_5: header.data_page_header.statistics, 117 | }, 118 | field_7: header.dictionary_page_header && { 119 | field_1: header.dictionary_page_header.num_values, 120 | field_2: Encodings.indexOf(header.dictionary_page_header.encoding), 121 | }, 122 | field_8: header.data_page_header_v2 && { 123 | field_1: header.data_page_header_v2.num_values, 124 | field_2: header.data_page_header_v2.num_nulls, 125 | field_3: header.data_page_header_v2.num_rows, 126 | field_4: Encodings.indexOf(header.data_page_header_v2.encoding), 127 | field_5: header.data_page_header_v2.definition_levels_byte_length, 128 | field_6: header.data_page_header_v2.repetition_levels_byte_length, 129 | field_7: header.data_page_header_v2.is_compressed ? 
undefined : false, // default true 130 | }, 131 | } 132 | serializeTCompactProtocol(writer, compact) 133 | } 134 | 135 | /** 136 | * @import {DecodedArray, Encoding, PageHeader} from 'hyparquet' 137 | * @import {ColumnEncoder, PageData, ThriftObject, Writer} from '../src/types.js' 138 | * @param {Writer} writer 139 | * @param {ColumnEncoder} column 140 | * @param {DecodedArray} values 141 | * @param {PageData} [listValues] 142 | * @returns {{ 143 | * definition_levels_byte_length: number 144 | * repetition_levels_byte_length: number 145 | * num_nulls: number 146 | * num_values: number 147 | * }} 148 | */ 149 | function writeLevels(writer, column, values, listValues) { 150 | const { schemaPath } = column 151 | const definitionLevels = listValues?.definitionLevels 152 | const repetitionLevels = listValues?.repetitionLevels 153 | 154 | let num_nulls = listValues?.numNulls ?? 0 155 | let num_values = definitionLevels?.length ?? values.length 156 | 157 | const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath) 158 | let repetition_levels_byte_length = 0 159 | if (maxRepetitionLevel) { 160 | const bitWidth = Math.ceil(Math.log2(maxRepetitionLevel + 1)) 161 | const reps = repetitionLevels ?? [] 162 | repetition_levels_byte_length = writeRleBitPackedHybrid(writer, reps, bitWidth) 163 | } 164 | 165 | // definition levels 166 | const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath) 167 | let definition_levels_byte_length = 0 168 | if (maxDefinitionLevel) { 169 | const bitWidth = Math.ceil(Math.log2(maxDefinitionLevel + 1)) 170 | const defs = definitionLevels ?? (() => { 171 | const generated = [] 172 | for (const value of values) { 173 | if (value === null || value === undefined) { 174 | generated.push(maxDefinitionLevel - 1) 175 | num_nulls++ 176 | } else { 177 | generated.push(maxDefinitionLevel) 178 | } 179 | } 180 | num_values = generated.length 181 | return generated 182 | })() 183 | 184 | if (definitionLevels && listValues === undefined) { 185 | num_nulls = definitionLevels.reduce( 186 | (count, def) => def === maxDefinitionLevel ? count : count + 1, 187 | 0 188 | ) 189 | } 190 | 191 | definition_levels_byte_length = writeRleBitPackedHybrid(writer, defs, bitWidth) 192 | } else { 193 | num_nulls = values.filter(value => value === null || value === undefined).length 194 | } 195 | return { definition_levels_byte_length, repetition_levels_byte_length, num_nulls, num_values } 196 | } 197 | -------------------------------------------------------------------------------- /test/delta.test.js: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from 'vitest' 2 | import { ByteWriter } from '../src/bytewriter.js' 3 | import { deltaBinaryPack, deltaByteArray, deltaLengthByteArray } from '../src/delta.js' 4 | import { deltaBinaryUnpack, deltaByteArray as deltaByteArrayRead, deltaLengthByteArray as deltaLengthByteArrayRead } from 'hyparquet/src/delta.js' 5 | 6 | /** 7 | * Round-trip test for deltaBinaryPack with Int32Array output. 
8 | * 9 | * @param {number[]} values 10 | * @returns {number[]} 11 | */ 12 | function roundTripInt32(values) { 13 | const writer = new ByteWriter() 14 | deltaBinaryPack(writer, values) 15 | const buffer = writer.getBuffer() 16 | const reader = { view: new DataView(buffer), offset: 0 } 17 | 18 | const output = new Int32Array(values.length) 19 | deltaBinaryUnpack(reader, values.length, output) 20 | return Array.from(output) 21 | } 22 | 23 | /** 24 | * Round-trip test for deltaBinaryPack with BigInt64Array output. 25 | * 26 | * @param {bigint[]} values 27 | * @returns {bigint[]} 28 | */ 29 | function roundTripBigInt(values) { 30 | const writer = new ByteWriter() 31 | deltaBinaryPack(writer, values) 32 | const buffer = writer.getBuffer() 33 | const reader = { view: new DataView(buffer), offset: 0 } 34 | 35 | const output = new BigInt64Array(values.length) 36 | deltaBinaryUnpack(reader, values.length, output) 37 | return Array.from(output) 38 | } 39 | 40 | /** 41 | * Round-trip test for deltaLengthByteArray. 42 | * 43 | * @param {Uint8Array[]} values 44 | * @returns {Uint8Array[]} 45 | */ 46 | function roundTripLengthByteArray(values) { 47 | const writer = new ByteWriter() 48 | deltaLengthByteArray(writer, values) 49 | const buffer = writer.getBuffer() 50 | const reader = { view: new DataView(buffer), offset: 0 } 51 | 52 | /** @type {Uint8Array[]} */ 53 | const output = new Array(values.length) 54 | deltaLengthByteArrayRead(reader, values.length, output) 55 | return output 56 | } 57 | 58 | /** 59 | * Round-trip test for deltaByteArray. 60 | * 61 | * @param {Uint8Array[]} values 62 | * @returns {Uint8Array[]} 63 | */ 64 | function roundTripByteArray(values) { 65 | const writer = new ByteWriter() 66 | deltaByteArray(writer, values) 67 | const buffer = writer.getBuffer() 68 | const reader = { view: new DataView(buffer), offset: 0 } 69 | 70 | /** @type {Uint8Array[]} */ 71 | const output = new Array(values.length) 72 | deltaByteArrayRead(reader, values.length, output) 73 | return output 74 | } 75 | 76 | describe('deltaBinaryPack', () => { 77 | it('should round-trip empty array', () => { 78 | const decoded = roundTripInt32([]) 79 | expect(decoded).toEqual([]) 80 | }) 81 | 82 | it('should round-trip single value', () => { 83 | const decoded = roundTripInt32([42]) 84 | expect(decoded).toEqual([42]) 85 | }) 86 | 87 | it('should round-trip monotonically increasing values', () => { 88 | const original = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 89 | const decoded = roundTripInt32(original) 90 | expect(decoded).toEqual(original) 91 | }) 92 | 93 | it('should round-trip constant values', () => { 94 | const original = Array(100).fill(42) 95 | const decoded = roundTripInt32(original) 96 | expect(decoded).toEqual(original) 97 | }) 98 | 99 | it('should round-trip negative deltas', () => { 100 | const original = [100, 90, 80, 70, 60, 50, 40, 30, 20, 10] 101 | const decoded = roundTripInt32(original) 102 | expect(decoded).toEqual(original) 103 | }) 104 | 105 | it('should round-trip mixed deltas', () => { 106 | const original = [0, 5, 3, 8, 2, 9, 1, 7, 4, 6] 107 | const decoded = roundTripInt32(original) 108 | expect(decoded).toEqual(original) 109 | }) 110 | 111 | it('should round-trip values spanning multiple blocks', () => { 112 | // More than 128 values to test multiple blocks 113 | const original = Array.from({ length: 300 }, (_, i) => i * 2) 114 | const decoded = roundTripInt32(original) 115 | expect(decoded).toEqual(original) 116 | }) 117 | 118 | it('should round-trip large values', () => { 119 | const original = 
[1000000, 1000001, 1000002, 1000003] 120 | const decoded = roundTripInt32(original) 121 | expect(decoded).toEqual(original) 122 | }) 123 | 124 | it('should round-trip negative values', () => { 125 | const original = [-10, -5, 0, 5, 10] 126 | const decoded = roundTripInt32(original) 127 | expect(decoded).toEqual(original) 128 | }) 129 | 130 | it('should round-trip bigint values', () => { 131 | const original = [1n, 2n, 3n, 4n, 5n] 132 | const decoded = roundTripBigInt(original) 133 | expect(decoded).toEqual(original) 134 | }) 135 | 136 | it('should round-trip large bigint values', () => { 137 | const original = [10000000000n, 10000000001n, 10000000002n] 138 | const decoded = roundTripBigInt(original) 139 | expect(decoded).toEqual(original) 140 | }) 141 | 142 | it('should round-trip random values', () => { 143 | const original = Array.from({ length: 200 }, () => Math.floor(Math.random() * 10000)) 144 | const decoded = roundTripInt32(original) 145 | expect(decoded).toEqual(original) 146 | }) 147 | 148 | it('should throw for unsupported types', () => { 149 | const writer = new ByteWriter() 150 | expect(() => deltaBinaryPack(writer, ['string'])).toThrow('deltaBinaryPack only supports number or bigint arrays') 151 | }) 152 | 153 | it('should handle values requiring bit flush at end of miniblock', () => { 154 | // Values with varying bit widths to exercise the bitsUsed > 0 flush path 155 | const original = Array.from({ length: 32 }, (_, i) => i * 7) 156 | const decoded = roundTripInt32(original) 157 | expect(decoded).toEqual(original) 158 | }) 159 | }) 160 | 161 | describe('deltaLengthByteArray', () => { 162 | it('should round-trip empty array', () => { 163 | const decoded = roundTripLengthByteArray([]) 164 | expect(decoded).toEqual([]) 165 | }) 166 | 167 | it('should round-trip single byte array', () => { 168 | const original = [new Uint8Array([1, 2, 3])] 169 | const decoded = roundTripLengthByteArray(original) 170 | expect(decoded.length).toBe(1) 171 | expect(Array.from(decoded[0])).toEqual([1, 2, 3]) 172 | }) 173 | 174 | it('should round-trip multiple byte arrays', () => { 175 | const original = [ 176 | new Uint8Array([1, 2, 3]), 177 | new Uint8Array([4, 5]), 178 | new Uint8Array([6, 7, 8, 9]), 179 | ] 180 | const decoded = roundTripLengthByteArray(original) 181 | expect(decoded.length).toBe(3) 182 | expect(Array.from(decoded[0])).toEqual([1, 2, 3]) 183 | expect(Array.from(decoded[1])).toEqual([4, 5]) 184 | expect(Array.from(decoded[2])).toEqual([6, 7, 8, 9]) 185 | }) 186 | 187 | it('should round-trip strings as byte arrays', () => { 188 | const encoder = new TextEncoder() 189 | const original = ['hello', 'world', 'test'].map(s => encoder.encode(s)) 190 | const decoded = roundTripLengthByteArray(original) 191 | const decoder = new TextDecoder() 192 | expect(decoded.map(d => decoder.decode(d))).toEqual(['hello', 'world', 'test']) 193 | }) 194 | 195 | it('should throw for non-Uint8Array values', () => { 196 | const writer = new ByteWriter() 197 | expect(() => deltaLengthByteArray(writer, ['string'])).toThrow('deltaLengthByteArray expects Uint8Array values') 198 | }) 199 | }) 200 | 201 | describe('deltaByteArray', () => { 202 | it('should round-trip empty array', () => { 203 | const decoded = roundTripByteArray([]) 204 | expect(decoded).toEqual([]) 205 | }) 206 | 207 | it('should round-trip single byte array', () => { 208 | const original = [new Uint8Array([1, 2, 3])] 209 | const decoded = roundTripByteArray(original) 210 | expect(decoded.length).toBe(1) 211 | 
expect(Array.from(decoded[0])).toEqual([1, 2, 3]) 212 | }) 213 | 214 | it('should round-trip arrays with common prefixes', () => { 215 | const encoder = new TextEncoder() 216 | const original = ['prefix_a', 'prefix_b', 'prefix_c'].map(s => encoder.encode(s)) 217 | const decoded = roundTripByteArray(original) 218 | const decoder = new TextDecoder() 219 | expect(decoded.map(d => decoder.decode(d))).toEqual(['prefix_a', 'prefix_b', 'prefix_c']) 220 | }) 221 | 222 | it('should round-trip arrays with no common prefix', () => { 223 | const encoder = new TextEncoder() 224 | const original = ['abc', 'xyz', '123'].map(s => encoder.encode(s)) 225 | const decoded = roundTripByteArray(original) 226 | const decoder = new TextDecoder() 227 | expect(decoded.map(d => decoder.decode(d))).toEqual(['abc', 'xyz', '123']) 228 | }) 229 | 230 | it('should round-trip sorted strings efficiently', () => { 231 | const encoder = new TextEncoder() 232 | const original = ['apple', 'application', 'apply', 'banana', 'bandana'].map(s => encoder.encode(s)) 233 | const decoded = roundTripByteArray(original) 234 | const decoder = new TextDecoder() 235 | expect(decoded.map(d => decoder.decode(d))).toEqual(['apple', 'application', 'apply', 'banana', 'bandana']) 236 | }) 237 | 238 | it('should throw for non-Uint8Array first value', () => { 239 | const writer = new ByteWriter() 240 | expect(() => deltaByteArray(writer, ['string'])).toThrow('deltaByteArray expects Uint8Array values') 241 | }) 242 | 243 | it('should throw for non-Uint8Array subsequent value', () => { 244 | const writer = new ByteWriter() 245 | expect(() => deltaByteArray(writer, [new Uint8Array([1]), 'string'])).toThrow('deltaByteArray expects Uint8Array values') 246 | }) 247 | }) 248 | -------------------------------------------------------------------------------- /src/unconvert.js: -------------------------------------------------------------------------------- 1 | import { toJson } from 'hyparquet' 2 | import { geojsonToWkb } from './wkb.js' 3 | 4 | const dayMillis = 86400000 // 1 day in milliseconds 5 | /** 6 | * @import {DecodedArray, SchemaElement, Statistics} from 'hyparquet' 7 | * @import {MinMaxType} from 'hyparquet/src/types.js' 8 | * @import {ThriftObject} from '../src/types.js' 9 | */ 10 | 11 | /** 12 | * Convert from rich to primitive types. 13 | * 14 | * @param {SchemaElement} element 15 | * @param {DecodedArray} values 16 | * @returns {DecodedArray} 17 | */ 18 | export function unconvert(element, values) { 19 | const { type, converted_type: ctype, logical_type: ltype } = element 20 | if (ctype === 'DECIMAL') { 21 | const factor = 10 ** (element.scale || 0) 22 | return values.map(v => { 23 | if (v === null || v === undefined) return v 24 | if (typeof v !== 'number') throw new Error('DECIMAL must be a number') 25 | return unconvertDecimal(element, BigInt(Math.round(v * factor))) 26 | }) 27 | } 28 | if (ctype === 'DATE') { 29 | return Array.from(values).map(v => v && v.getTime() / dayMillis) 30 | } 31 | if (ctype === 'TIMESTAMP_MILLIS') { 32 | return Array.from(values).map(v => v && BigInt(v.getTime())) 33 | } 34 | if (ctype === 'TIMESTAMP_MICROS') { 35 | return Array.from(values).map(v => v && BigInt(v.getTime() * 1000)) 36 | } 37 | if (ctype === 'JSON') { 38 | if (!Array.isArray(values)) throw new Error('JSON must be an array') 39 | const encoder = new TextEncoder() 40 | return values.map(v => v === undefined ? 
undefined : encoder.encode(JSON.stringify(toJson(v))))
41 |   }
42 |   if (ctype === 'UTF8') {
43 |     if (!Array.isArray(values)) throw new Error('strings must be an array')
44 |     const encoder = new TextEncoder()
45 |     return values.map(v => typeof v === 'string' ? encoder.encode(v) : v)
46 |   }
47 |   if (ltype?.type === 'FLOAT16') {
48 |     if (type !== 'FIXED_LEN_BYTE_ARRAY') throw new Error('FLOAT16 must be FIXED_LEN_BYTE_ARRAY type')
49 |     if (element.type_length !== 2) throw new Error('FLOAT16 expected type_length to be 2 bytes')
50 |     return Array.from(values).map(unconvertFloat16)
51 |   }
52 |   if (ltype?.type === 'UUID') {
53 |     if (!Array.isArray(values)) throw new Error('UUID must be an array')
54 |     if (type !== 'FIXED_LEN_BYTE_ARRAY') throw new Error('UUID must be FIXED_LEN_BYTE_ARRAY type')
55 |     if (element.type_length !== 16) throw new Error('UUID expected type_length to be 16 bytes')
56 |     return values.map(unconvertUuid)
57 |   }
58 |   if (ltype?.type === 'GEOMETRY' || ltype?.type === 'GEOGRAPHY') {
59 |     if (!Array.isArray(values)) throw new Error('geometry must be an array')
60 |     return values.map(v => v && geojsonToWkb(v))
61 |   }
62 |   return values
63 | }
64 | 
65 | /**
66 |  * @param {Uint8Array | string | undefined} value
67 |  * @returns {Uint8Array | undefined}
68 |  */
69 | function unconvertUuid(value) {
70 |   if (value === undefined || value === null) return
71 |   if (value instanceof Uint8Array) return value
72 |   if (typeof value === 'string') {
73 |     const uuidRegex = /^[0-9a-f]{8}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{12}$/i
74 |     if (!uuidRegex.test(value)) {
75 |       throw new Error('UUID must be a valid UUID string')
76 |     }
77 |     value = value.replace(/-/g, '').toLowerCase()
78 |     const bytes = new Uint8Array(16)
79 |     for (let i = 0; i < 16; i++) {
80 |       bytes[i] = parseInt(value.slice(i * 2, i * 2 + 2), 16)
81 |     }
82 |     return bytes
83 |   }
84 |   throw new Error('UUID must be a string or Uint8Array')
85 | }
86 | 
87 | /**
88 |  * Unconvert from rich type to byte array for metadata statistics.
89 |  *
90 |  * @param {MinMaxType | undefined} value
91 |  * @param {SchemaElement} element
92 |  * @returns {Uint8Array | undefined}
93 |  */
94 | export function unconvertMinMax(value, element) {
95 |   if (value === undefined || value === null) return undefined
96 |   const { type, converted_type } = element
97 |   if (type === 'BOOLEAN') return new Uint8Array([value ? 
1 : 0]) 98 | if (converted_type === 'DECIMAL') { 99 | if (typeof value !== 'number') throw new Error('DECIMAL must be a number') 100 | const factor = 10 ** (element.scale || 0) 101 | const out = unconvertDecimal(element, BigInt(Math.round(value * factor))) 102 | if (out instanceof Uint8Array) return out 103 | if (typeof out === 'number') { 104 | const buffer = new ArrayBuffer(4) 105 | new DataView(buffer).setFloat32(0, out, true) 106 | return new Uint8Array(buffer) 107 | } 108 | if (typeof out === 'bigint') { 109 | const buffer = new ArrayBuffer(8) 110 | new DataView(buffer).setBigInt64(0, out, true) 111 | return new Uint8Array(buffer) 112 | } 113 | } 114 | if (type === 'BYTE_ARRAY' || type === 'FIXED_LEN_BYTE_ARRAY') { 115 | // truncate byte arrays to 16 bytes for statistics 116 | if (value instanceof Uint8Array) return value.slice(0, 16) 117 | return new TextEncoder().encode(value.toString().slice(0, 16)) 118 | } 119 | if (type === 'FLOAT' && typeof value === 'number') { 120 | const buffer = new ArrayBuffer(4) 121 | new DataView(buffer).setFloat32(0, value, true) 122 | return new Uint8Array(buffer) 123 | } 124 | if (type === 'DOUBLE' && typeof value === 'number') { 125 | const buffer = new ArrayBuffer(8) 126 | new DataView(buffer).setFloat64(0, value, true) 127 | return new Uint8Array(buffer) 128 | } 129 | if (type === 'INT32' && typeof value === 'number') { 130 | const buffer = new ArrayBuffer(4) 131 | new DataView(buffer).setInt32(0, value, true) 132 | return new Uint8Array(buffer) 133 | } 134 | if (type === 'INT64' && typeof value === 'bigint') { 135 | const buffer = new ArrayBuffer(8) 136 | new DataView(buffer).setBigInt64(0, value, true) 137 | return new Uint8Array(buffer) 138 | } 139 | if (type === 'INT32' && converted_type === 'DATE' && value instanceof Date) { 140 | const buffer = new ArrayBuffer(4) 141 | new DataView(buffer).setInt32(0, Math.floor(value.getTime() / dayMillis), true) 142 | return new Uint8Array(buffer) 143 | } 144 | if (type === 'INT64' && converted_type === 'TIMESTAMP_MILLIS' && value instanceof Date) { 145 | const buffer = new ArrayBuffer(8) 146 | new DataView(buffer).setBigInt64(0, BigInt(value.getTime()), true) 147 | return new Uint8Array(buffer) 148 | } 149 | throw new Error(`unsupported type for statistics: ${type} with value ${value}`) 150 | } 151 | 152 | /** 153 | * @param {Statistics} stats 154 | * @param {SchemaElement} element 155 | * @returns {ThriftObject} 156 | */ 157 | export function unconvertStatistics(stats, element) { 158 | return { 159 | field_1: unconvertMinMax(stats.max, element), 160 | field_2: unconvertMinMax(stats.min, element), 161 | field_3: stats.null_count, 162 | field_4: stats.distinct_count, 163 | field_5: unconvertMinMax(stats.max_value, element), 164 | field_6: unconvertMinMax(stats.min_value, element), 165 | field_7: stats.is_max_value_exact, 166 | field_8: stats.is_min_value_exact, 167 | } 168 | } 169 | 170 | /** 171 | * @param {SchemaElement} element 172 | * @param {bigint} value 173 | * @returns {number | bigint | Uint8Array} 174 | */ 175 | export function unconvertDecimal({ type, type_length }, value) { 176 | if (type === 'INT32') return Number(value) 177 | if (type === 'INT64') return value 178 | if (type === 'FIXED_LEN_BYTE_ARRAY' && !type_length) { 179 | throw new Error('fixed length byte array type_length is required') 180 | } 181 | if (!type_length && !value) return new Uint8Array() 182 | 183 | const bytes = [] 184 | while (true) { 185 | // extract the lowest 8 bits 186 | const byte = Number(value & 0xffn) 187 | 
bytes.unshift(byte) 188 | value >>= 8n 189 | 190 | if (type_length) { 191 | if (bytes.length >= type_length) break // fixed length 192 | } else { 193 | // for nonnegative: stop when top byte has signBit = 0 AND shifted value == 0n 194 | // for negative: stop when top byte has signBit = 1 AND shifted value == -1n 195 | const sign = byte & 0x80 196 | if (!sign && value === 0n || sign && value === -1n) { 197 | break 198 | } 199 | } 200 | } 201 | 202 | return new Uint8Array(bytes) 203 | } 204 | 205 | /** 206 | * @param {number | undefined} value 207 | * @returns {Uint8Array | undefined} 208 | */ 209 | export function unconvertFloat16(value) { 210 | if (value === undefined || value === null) return 211 | if (typeof value !== 'number') throw new Error('parquet float16 expected number value') 212 | if (Number.isNaN(value)) return new Uint8Array([0x00, 0x7e]) 213 | 214 | const sign = value < 0 || Object.is(value, -0) ? 1 : 0 215 | const abs = Math.abs(value) 216 | 217 | // infinities 218 | if (!isFinite(abs)) return new Uint8Array([0x00, sign << 7 | 0x7c]) 219 | 220 | // ±0 221 | if (abs === 0) return new Uint8Array([0x00, sign << 7]) 222 | 223 | // write as f32 to get raw bits 224 | const buf = new ArrayBuffer(4) 225 | new Float32Array(buf)[0] = abs 226 | const bits32 = new Uint32Array(buf)[0] 227 | 228 | let exp32 = bits32 >>> 23 & 0xff 229 | let mant32 = bits32 & 0x7fffff 230 | 231 | // convert 32‑bit exponent to unbiased, then to 16‑bit 232 | exp32 -= 127 233 | 234 | // handle numbers too small for a normal 16‑bit exponent 235 | if (exp32 < -14) { 236 | // sub‑normal: shift mantissa so that result = mant * 2^-14 237 | const shift = -14 - exp32 238 | mant32 = (mant32 | 0x800000) >> shift + 13 239 | 240 | // round‑to‑nearest‑even 241 | if (mant32 & 1) mant32 += 1 242 | 243 | const bits16 = sign << 15 | mant32 244 | return new Uint8Array([bits16 & 0xff, bits16 >> 8]) 245 | } 246 | 247 | // overflow 248 | if (exp32 > 15) return new Uint8Array([0x00, sign << 7 | 0x7c]) 249 | 250 | // normal number 251 | let exp16 = exp32 + 15 252 | mant32 = mant32 + 0x1000 // add rounding bit 253 | 254 | // handle mantissa overflow after rounding 255 | if (mant32 & 0x800000) { 256 | mant32 = 0 257 | if (++exp16 === 31) // became infinity 258 | return new Uint8Array([0x00, sign << 7 | 0x7c]) 259 | } 260 | 261 | const bits16 = sign << 15 | exp16 << 10 | mant32 >> 13 262 | return new Uint8Array([bits16 & 0xff, bits16 >> 8]) 263 | } 264 | -------------------------------------------------------------------------------- /test/unconvert.test.js: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from 'vitest' 2 | import { unconvert, unconvertDecimal, unconvertFloat16, unconvertMinMax } from '../src/unconvert.js' 3 | import { convertMetadata } from 'hyparquet/src/metadata.js' 4 | import { DEFAULT_PARSERS, parseFloat16 } from 'hyparquet/src/convert.js' 5 | 6 | /** 7 | * @import {SchemaElement} from 'hyparquet' 8 | */ 9 | describe('unconvert', () => { 10 | it('should return Date objects when converted_type = DATE', () => { 11 | /** @type {SchemaElement} */ 12 | const schema = { name: 'test', converted_type: 'DATE' } 13 | const input = [new Date('2020-01-01T00:00:00Z'), new Date('2021-01-01T00:00:00Z')] 14 | const result = unconvert(schema, input) 15 | expect(result).toEqual([18262, 18628]) 16 | }) 17 | 18 | it('should convert JSON objects to strings when converted_type = JSON', () => { 19 | /** @type {SchemaElement} */ 20 | const schema = { name: 'test', 
converted_type: 'JSON' } 21 | const input = [{ foo: 'bar' }, { hello: 'world' }] 22 | const result = unconvert(schema, input) 23 | 24 | // We check that result is an array of Uint8Arrays containing the JSON-encoded bytes 25 | expect(result).toHaveLength(2) 26 | expect(result[0]).toBeInstanceOf(Uint8Array) 27 | expect(new TextDecoder().decode(result[0])).toEqual(JSON.stringify({ foo: 'bar' })) 28 | expect(new TextDecoder().decode(result[1])).toEqual(JSON.stringify({ hello: 'world' })) 29 | }) 30 | 31 | it('should handle undefined values in JSON arrays', () => { 32 | /** @type {SchemaElement} */ 33 | const schema = { name: 'test', converted_type: 'JSON' } 34 | const input = [{ foo: 'bar' }, undefined, { hello: 'world' }] 35 | const result = unconvert(schema, input) 36 | 37 | expect(result).toHaveLength(3) 38 | expect(result[0]).toBeInstanceOf(Uint8Array) 39 | expect(result[1]).toBeUndefined() 40 | expect(result[2]).toBeInstanceOf(Uint8Array) 41 | expect(new TextDecoder().decode(result[0])).toEqual(JSON.stringify({ foo: 'bar' })) 42 | expect(new TextDecoder().decode(result[2])).toEqual(JSON.stringify({ hello: 'world' })) 43 | }) 44 | 45 | it('should convert string array to Uint8Array when converted_type = UTF8', () => { 46 | /** @type {SchemaElement} */ 47 | const schema = { name: 'test', converted_type: 'UTF8' } 48 | const input = ['hello', 'world'] 49 | const result = unconvert(schema, input) 50 | 51 | expect(result).toHaveLength(2) 52 | expect(result[0]).toBeInstanceOf(Uint8Array) 53 | expect(new TextDecoder().decode(result[0])).toBe('hello') 54 | expect(new TextDecoder().decode(result[1])).toBe('world') 55 | }) 56 | 57 | it('should throw an error when converted_type = UTF8 and values is not an array', () => { 58 | expect(() => unconvert( 59 | { name: 'test', converted_type: 'UTF8' }, 60 | new Uint8Array([1, 2, 3])) 61 | ).toThrow('strings must be an array') 62 | }) 63 | 64 | it('should throw an error when converted_type = JSON and values is not an array', () => { 65 | expect(() => unconvert( 66 | { name: 'test', converted_type: 'JSON' }, 67 | new Uint8Array([1, 2, 3])) 68 | ).toThrow('JSON must be an array') 69 | }) 70 | 71 | it('should return original values if there is no recognized converted_type', () => { 72 | const input = [1, 2, 3] 73 | const result = unconvert({ name: 'test' }, input) 74 | expect(result).toEqual(input) 75 | }) 76 | }) 77 | 78 | describe('unconvertMinMax', () => { 79 | it('should return undefined if value is undefined or null', () => { 80 | /** @type {SchemaElement} */ 81 | const schema = { name: 'test', type: 'INT32' } 82 | expect(unconvertMinMax(undefined, schema)).toBeUndefined() 83 | }) 84 | 85 | it('should handle BOOLEAN type', () => { 86 | /** @type {SchemaElement} */ 87 | const schema = { name: 'test', type: 'BOOLEAN' } 88 | expect(unconvertMinMax(true, schema)).toEqual(new Uint8Array([1])) 89 | expect(unconvertMinMax(false, schema)).toEqual(new Uint8Array([0])) 90 | }) 91 | 92 | it('should truncate BYTE_ARRAY or FIXED_LEN_BYTE_ARRAY to 16 bytes', () => { 93 | // longer string to test truncation 94 | const longStr = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 95 | const longStrUint8 = new TextEncoder().encode(longStr) 96 | 97 | // value is a Uint8Array 98 | const result1 = unconvertMinMax(longStrUint8, { name: 'test', type: 'BYTE_ARRAY' }) 99 | expect(result1).toBeInstanceOf(Uint8Array) 100 | expect(result1?.length).toBe(16) 101 | 102 | // value is a string 103 | const result2 = unconvertMinMax(longStr, { name: 'test', type: 'FIXED_LEN_BYTE_ARRAY' }) 104 | 
expect(result2).toBeInstanceOf(Uint8Array) 105 | expect(result2?.length).toBe(16) 106 | }) 107 | 108 | it('should correctly encode FLOAT values in little-endian', () => { 109 | /** @type {SchemaElement} */ 110 | const schema = { name: 'test', type: 'FLOAT' } 111 | const value = 1.5 112 | const result = unconvertMinMax(value, schema) 113 | expect(result).toBeInstanceOf(Uint8Array) 114 | const roundtrip = convertMetadata(result, schema, DEFAULT_PARSERS) 115 | expect(roundtrip).toEqual(1.5) 116 | }) 117 | 118 | it('should correctly encode DOUBLE values in little-endian', () => { 119 | /** @type {SchemaElement} */ 120 | const schema = { name: 'test', type: 'DOUBLE' } 121 | const value = 1.123456789 122 | const result = unconvertMinMax(value, schema) 123 | expect(result).toBeInstanceOf(Uint8Array) 124 | const roundtrip = convertMetadata(result, schema, DEFAULT_PARSERS) 125 | expect(roundtrip).toEqual(1.123456789) 126 | }) 127 | 128 | it('should correctly encode INT32 values in little-endian', () => { 129 | /** @type {SchemaElement} */ 130 | const schema = { name: 'test', type: 'INT32' } 131 | const value = 123456 132 | const result = unconvertMinMax(value, schema) 133 | const roundtrip = convertMetadata(result, schema, DEFAULT_PARSERS) 134 | expect(roundtrip).toEqual(123456) 135 | }) 136 | 137 | it('should correctly encode INT64 values when given a bigint', () => { 138 | /** @type {SchemaElement} */ 139 | const schema = { name: 'test', type: 'INT64' } 140 | const value = 1234567890123456789n 141 | const result = unconvertMinMax(value, schema) 142 | const roundtrip = convertMetadata(result, schema, DEFAULT_PARSERS) 143 | expect(roundtrip).toEqual(1234567890123456789n) 144 | }) 145 | 146 | it('should correctly encode a Date as TIMESTAMP_MILLIS for INT64', () => { 147 | /** @type {SchemaElement} */ 148 | const schema = { name: 'test', type: 'INT64', converted_type: 'TIMESTAMP_MILLIS' } 149 | const date = new Date('2023-01-01T00:00:00Z') 150 | const result = unconvertMinMax(date, schema) 151 | const roundtrip = convertMetadata(result, schema, DEFAULT_PARSERS) 152 | expect(roundtrip).toEqual(date) 153 | }) 154 | 155 | it('should throw an error for unsupported types', () => { 156 | /** @type {SchemaElement} */ 157 | const schema = { name: 'test', type: 'INT96' } 158 | expect(() => unconvertMinMax(123, schema)) 159 | .toThrow('unsupported type for statistics: INT96 with value 123') 160 | }) 161 | 162 | it('should throw an error for INT64 if value is a number instead of bigint or Date', () => { 163 | /** @type {SchemaElement} */ 164 | const schema = { name: 'test', type: 'INT64' } 165 | expect(() => unconvertMinMax(123, schema)) 166 | .toThrow('unsupported type for statistics: INT64 with value 123') 167 | }) 168 | }) 169 | 170 | describe('unconvertDecimal', () => { 171 | const examples = [ 172 | { input: 0n, expected: new Uint8Array([]) }, 173 | { input: 1n, expected: new Uint8Array([0x01]) }, 174 | { input: -1n, expected: new Uint8Array([0xff]) }, 175 | { input: 1234n, expected: new Uint8Array([0x04, 0xd2]) }, 176 | { input: -1234n, expected: new Uint8Array([0xfb, 0x2e]) }, 177 | { input: 1234567890123456789n, expected: new Uint8Array([0x11, 0x22, 0x10, 0xf4, 0x7d, 0xe9, 0x81, 0x15]) }, 178 | { input: -1234567890123456789n, expected: new Uint8Array([0xee, 0xdd, 0xef, 0x0b, 0x82, 0x16, 0x7e, 0xeb]) }, 179 | ] 180 | /** @type {SchemaElement} */ 181 | const element = { 182 | name: 'col', 183 | type: 'BYTE_ARRAY', 184 | } 185 | 186 | it.for(examples)('should convert %p', ({ input, expected }) => { 187 | 
expect(parseDecimal(expected)).toEqual(input) 188 | }) 189 | 190 | it.for(examples)('should unconvert %p', ({ input, expected }) => { 191 | expect(unconvertDecimal(element, input)).toEqual(expected) 192 | }) 193 | 194 | it.for(examples)('should roundtrip %p', ({ input }) => { 195 | const byteArray = unconvertDecimal(element, input) 196 | if (!(byteArray instanceof Uint8Array)) throw new Error('expected Uint8Array') 197 | expect(parseDecimal(byteArray)).toEqual(input) 198 | }) 199 | 200 | it.for(examples)('should reverse roundtrip %p', ({ expected }) => { 201 | expect(unconvertDecimal(element, parseDecimal(expected))).toEqual(expected) 202 | }) 203 | 204 | it('convert to INT32', () => { 205 | expect(unconvertDecimal({ name: 'col', type: 'INT32' }, 1234n)).toEqual(1234) 206 | }) 207 | 208 | it('convert to INT64', () => { 209 | expect(unconvertDecimal({ name: 'col', type: 'INT64' }, 1234n)).toEqual(1234n) 210 | }) 211 | 212 | it('throws if fixed length is not specified', () => { 213 | expect(() => unconvertDecimal({ name: 'col', type: 'FIXED_LEN_BYTE_ARRAY' }, 1234n)) 214 | .toThrow('fixed length byte array type_length is required') 215 | }) 216 | }) 217 | 218 | describe('unconvertFloat16', () => { 219 | it('should convert number to Float16 array', () => { 220 | expect(unconvertFloat16(undefined)).toBeUndefined() 221 | expect(unconvertFloat16(0)).toEqual(new Uint8Array([0x00, 0x00])) 222 | expect(unconvertFloat16(-0)).toEqual(new Uint8Array([0x00, 0x80])) 223 | expect(unconvertFloat16(NaN)).toEqual(new Uint8Array([0x00, 0x7e])) 224 | expect(unconvertFloat16(Infinity)).toEqual(new Uint8Array([0x00, 0x7c])) 225 | expect(unconvertFloat16(-Infinity)).toEqual(new Uint8Array([0x00, 0xfc])) 226 | expect(unconvertFloat16(0.5)).toEqual(new Uint8Array([0x00, 0x38])) 227 | expect(unconvertFloat16(-0.5)).toEqual(new Uint8Array([0x00, 0xb8])) 228 | expect(unconvertFloat16(1)).toEqual(new Uint8Array([0x00, 0x3c])) 229 | expect(unconvertFloat16(-1)).toEqual(new Uint8Array([0x00, 0xbc])) 230 | expect(unconvertFloat16(0.000244140625)).toEqual(new Uint8Array([0x00, 0x0c])) 231 | // largest normal 232 | expect(unconvertFloat16(65504)).toEqual(new Uint8Array([0xff, 0x7b])) 233 | expect(unconvertFloat16(65505)).toEqual(new Uint8Array([0xff, 0x7b])) 234 | // subnormal 235 | expect(unconvertFloat16(2 ** -24)).toEqual(new Uint8Array([0x02, 0x00])) 236 | // mantissa overflow 237 | expect(unconvertFloat16(2047.9999)).toEqual(new Uint8Array([0x00, 0x68])) 238 | }) 239 | 240 | it('should round-trip Float16', () => { 241 | expect(parseFloat16(unconvertFloat16(0))).toEqual(0) 242 | expect(parseFloat16(unconvertFloat16(-0))).toEqual(-0) 243 | expect(parseFloat16(unconvertFloat16(NaN))).toEqual(NaN) 244 | expect(parseFloat16(unconvertFloat16(Infinity))).toEqual(Infinity) 245 | expect(parseFloat16(unconvertFloat16(-Infinity))).toEqual(-Infinity) 246 | expect(parseFloat16(unconvertFloat16(0.5))).toEqual(0.5) 247 | expect(parseFloat16(unconvertFloat16(-0.5))).toEqual(-0.5) 248 | expect(parseFloat16(unconvertFloat16(1))).toEqual(1) 249 | expect(parseFloat16(unconvertFloat16(-1))).toEqual(-1) 250 | expect(parseFloat16(unconvertFloat16(65504))).toEqual(65504) 251 | expect(parseFloat16(unconvertFloat16(0.000244140625))).toEqual(0.000244140625) 252 | }) 253 | }) 254 | 255 | /** 256 | * BigInt parseDecimal 257 | * @param {Uint8Array} bytes 258 | * @returns {bigint} 259 | */ 260 | function parseDecimal(bytes) { 261 | let value = 0n 262 | for (const byte of bytes) { 263 | value = value * 256n + BigInt(byte) 264 | } 265 | 266 | // 
handle signed 267 | const bits = BigInt(bytes.length) * 8n 268 | if (bits && value >= 2n ** (bits - 1n)) { 269 | value -= 2n ** bits 270 | } 271 | 272 | return value 273 | } 274 | -------------------------------------------------------------------------------- /src/column.js: -------------------------------------------------------------------------------- 1 | import { ByteWriter } from './bytewriter.js' 2 | import { writeDataPageV2, writePageHeader } from './datapage.js' 3 | import { encodeListValues } from './dremel.js' 4 | import { geospatialStatistics } from './geospatial.js' 5 | import { writePlain } from './plain.js' 6 | import { unconvert, unconvertMinMax } from './unconvert.js' 7 | 8 | /** 9 | * Write a column chunk to the writer. 10 | * 11 | * @param {object} options 12 | * @param {Writer} options.writer 13 | * @param {ColumnEncoder} options.column 14 | * @param {DecodedArray} options.values 15 | * @returns {{ chunk: ColumnChunk, columnIndex?: ColumnIndex, offsetIndex?: OffsetIndex }} 16 | */ 17 | export function writeColumn({ writer, column, values }) { 18 | const { columnName, element, schemaPath, stats, pageSize, encoding: userEncoding } = column 19 | const { type, type_length } = element 20 | if (!type) throw new Error(`column ${columnName} cannot determine type`) 21 | const offsetStart = writer.offset 22 | 23 | /** @type {PageData | undefined} */ 24 | let pageData 25 | if (isListLike(schemaPath)) { 26 | if (!Array.isArray(values)) { 27 | throw new Error(`parquet column ${columnName} expects array values for list encoding`) 28 | } 29 | pageData = encodeListValues(schemaPath, values) 30 | values = pageData.values 31 | } 32 | 33 | const num_values = values.length 34 | /** @type {Encoding[]} */ 35 | const encodings = [] 36 | 37 | const isGeospatial = element?.logical_type?.type === 'GEOMETRY' || element?.logical_type?.type === 'GEOGRAPHY' 38 | 39 | // Compute statistics 40 | const statistics = stats ? getStatistics(values) : undefined 41 | const geospatial_statistics = stats && isGeospatial ? geospatialStatistics(values) : undefined 42 | 43 | // dictionary encoding 44 | let dictionary_page_offset 45 | let data_page_offset = BigInt(writer.offset) 46 | const dictionary = useDictionary(values, type, userEncoding) 47 | 48 | // Determine encoding and prepare values for writing 49 | /** @type {Encoding} */ 50 | let encoding 51 | let writeValues 52 | if (dictionary) { 53 | // replace values with dictionary indices 54 | const indexes = new Array(values.length) 55 | for (let i = 0; i < values.length; i++) { 56 | if (values[i] !== null && values[i] !== undefined) { 57 | indexes[i] = dictionary.indexOf(values[i]) 58 | } 59 | } 60 | writeValues = indexes 61 | encoding = 'RLE_DICTIONARY' 62 | 63 | // write dictionary page first 64 | dictionary_page_offset = BigInt(writer.offset) 65 | const unconverted = unconvert(element, dictionary) 66 | writeDictionaryPage(writer, column, unconverted) 67 | } else { 68 | // unconvert values from rich types to simple 69 | writeValues = unconvert(element, values) 70 | encoding = userEncoding ?? (type === 'BOOLEAN' && values.length > 16 ? 'RLE' : 'PLAIN') 71 | } 72 | encodings.push(encoding) 73 | 74 | // Split values into pages based on pageSize 75 | const pageBoundaries = getPageBoundaries(writeValues, type, type_length, pageSize) 76 | 77 | // Initialize index structures if requested 78 | /** @type {ColumnIndex | undefined} */ 79 | const columnIndex = column.columnIndex ? 
{
80 |     null_pages: [],
81 |     min_values: [],
82 |     max_values: [],
83 |     boundary_order: 'UNORDERED',
84 |     null_counts: [],
85 |   } : undefined
86 |   /** @type {OffsetIndex | undefined} */
87 |   const offsetIndex = column.offsetIndex ? {
88 |     page_locations: [],
89 |   } : undefined
90 | 
91 |   // Write data pages
92 |   data_page_offset = BigInt(writer.offset)
93 |   let firstRowIndex = 0n
94 |   let prevMaxValue
95 |   let ascending = true
96 |   let descending = true
97 | 
98 |   for (const { start, end } of pageBoundaries) {
99 |     const chunk = createPageChunk(writeValues, pageData, start, end)
100 |     const pageOffset = writer.offset
101 | 
102 |     writeDataPageV2(writer, chunk.values, column, encoding, chunk.pageData)
103 | 
104 |     // Track page info for indexes
105 |     const pageRows = BigInt(end - start)
106 |     if (columnIndex) {
107 |       const originalSlice = values.slice(start, end)
108 |       const pageStats = getStatistics(originalSlice)
109 |       const nullCount = pageStats.null_count ?? 0n
110 | 
111 |       columnIndex.null_pages.push(nullCount === pageRows)
112 |       const currMin = unconvertMinMax(pageStats.min_value, element)
113 |       const currMax = unconvertMinMax(pageStats.max_value, element)
114 |       // Spec: for all-null pages the min/max entries are set to "byte[0]"; push 0 as a placeholder
115 |       columnIndex.min_values.push(currMin ?? 0)
116 |       columnIndex.max_values.push(currMax ?? 0)
117 |       columnIndex.null_counts?.push(nullCount)
118 | 
119 |       // Track boundary order
120 |       if (prevMaxValue !== undefined && currMin !== undefined) {
121 |         if (prevMaxValue > currMin) ascending = false
122 |         if (prevMaxValue < currMin) descending = false
123 |       }
124 |       prevMaxValue = currMax
125 |     }
126 |     if (offsetIndex) {
127 |       offsetIndex.page_locations.push({
128 |         offset: BigInt(pageOffset),
129 |         compressed_page_size: writer.offset - pageOffset,
130 |         first_row_index: BigInt(firstRowIndex),
131 |       })
132 |     }
133 |     firstRowIndex += pageRows
134 |   }
135 | 
136 |   // Set boundary order after all pages are written
137 |   if (columnIndex) {
138 |     const numPages = columnIndex.min_values.length
139 |     columnIndex.boundary_order = numPages < 2 ? 'UNORDERED'
140 |       : ascending ? 'ASCENDING' : descending ? 'DESCENDING' : 'UNORDERED'
141 |   }
142 | 
143 |   return {
144 |     chunk: {
145 |       meta_data: {
146 |         type,
147 |         encodings,
148 |         path_in_schema: schemaPath.slice(1).map(s => s.name),
149 |         codec: column.codec ?? 'UNCOMPRESSED',
150 |         num_values: BigInt(num_values),
151 |         total_compressed_size: BigInt(writer.offset - offsetStart),
152 |         total_uncompressed_size: BigInt(writer.offset - offsetStart), // TODO
153 |         data_page_offset,
154 |         dictionary_page_offset,
155 |         statistics,
156 |         geospatial_statistics,
157 |       },
158 |       file_offset: BigInt(offsetStart),
159 |     },
160 |     columnIndex,
161 |     offsetIndex,
162 |   }
163 | }
164 | 
165 | /**
166 |  * Get page boundaries based on estimated byte size.
167 | * 168 | * @param {DecodedArray} values 169 | * @param {ParquetType} type 170 | * @param {number | undefined} type_length 171 | * @param {number | undefined} pageSize 172 | * @returns {Array<{start: number, end: number}>} 173 | */ 174 | function getPageBoundaries(values, type, type_length, pageSize) { 175 | // If no pageSize limit, return single page with all values 176 | if (!pageSize) { 177 | return [{ start: 0, end: values.length }] 178 | } 179 | 180 | const boundaries = [] 181 | let start = 0 182 | let accumulatedBytes = 0 183 | 184 | for (let i = 0; i < values.length; i++) { 185 | const valueSize = estimateValueSize(values[i], type, type_length) 186 | accumulatedBytes += valueSize 187 | 188 | // Check if we should start a new page 189 | if (accumulatedBytes >= pageSize && i > start) { 190 | boundaries.push({ start, end: i }) 191 | start = i 192 | accumulatedBytes = valueSize 193 | } 194 | } 195 | 196 | // Final page with remaining values 197 | if (start < values.length) { 198 | boundaries.push({ start, end: values.length }) 199 | } 200 | 201 | return boundaries 202 | } 203 | 204 | /** 205 | * Create a page chunk with sliced values and pageData. 206 | * 207 | * @param {DecodedArray} values 208 | * @param {PageData | undefined} pageData 209 | * @param {number} start 210 | * @param {number} end 211 | * @returns {{values: DecodedArray, pageData: PageData | undefined}} 212 | */ 213 | function createPageChunk(values, pageData, start, end) { 214 | const chunkValues = values.slice(start, end) 215 | if (!pageData) { 216 | return { values: chunkValues, pageData: undefined } 217 | } 218 | const defLevels = pageData.definitionLevels.slice(start, end) 219 | const maxDefLevel = Math.max(...pageData.definitionLevels) 220 | return { 221 | values: chunkValues, 222 | pageData: { 223 | values: chunkValues, 224 | definitionLevels: defLevels, 225 | repetitionLevels: pageData.repetitionLevels.slice(start, end), 226 | numNulls: defLevels.filter(level => level < maxDefLevel).length, 227 | }, 228 | } 229 | } 230 | 231 | /** 232 | * Estimate the byte size of a value for page size calculation. 233 | * 234 | * @param {any} value 235 | * @param {ParquetType} type 236 | * @param {number | undefined} type_length 237 | * @returns {number} 238 | */ 239 | function estimateValueSize(value, type, type_length) { 240 | if (value === null || value === undefined) return 0 241 | if (type === 'BOOLEAN') return 1 // bit, but count as byte for simplicity 242 | if (type === 'INT32' || type === 'FLOAT') return 4 243 | if (type === 'INT64' || type === 'DOUBLE') return 8 244 | if (type === 'INT96') return 12 245 | if (type === 'FIXED_LEN_BYTE_ARRAY') return type_length ?? 
0 246 | if (type === 'BYTE_ARRAY') { 247 | if (value instanceof Uint8Array) return value.byteLength 248 | if (typeof value === 'string') return value.length 249 | } 250 | return 0 251 | } 252 | 253 | /** 254 | * @param {DecodedArray} values 255 | * @param {ParquetType} type 256 | * @param {Encoding | undefined} encoding 257 | * @returns {any[] | undefined} 258 | */ 259 | function useDictionary(values, type, encoding) { 260 | if (encoding && encoding !== 'RLE_DICTIONARY') return 261 | if (type === 'BOOLEAN') return 262 | const unique = new Set(values) 263 | unique.delete(undefined) 264 | unique.delete(null) 265 | if (values.length / unique.size > 2) { 266 | // TODO: sort by frequency 267 | return Array.from(unique) 268 | } 269 | } 270 | 271 | /** 272 | * @param {Writer} writer 273 | * @param {ColumnEncoder} column 274 | * @param {DecodedArray} dictionary 275 | */ 276 | function writeDictionaryPage(writer, column, dictionary) { 277 | const { element, codec, compressors } = column 278 | const { type, type_length } = element 279 | if (!type) throw new Error(`column ${column.columnName} cannot determine type`) 280 | const dictionaryPage = new ByteWriter() 281 | writePlain(dictionaryPage, dictionary, type, type_length) 282 | 283 | // compress dictionary page data 284 | let compressedDictionaryPage = dictionaryPage 285 | const compressor = compressors?.[codec] 286 | if (compressor) { 287 | const input = new Uint8Array(dictionaryPage.getBuffer()) 288 | const compressedData = compressor(input) 289 | compressedDictionaryPage = new ByteWriter() 290 | compressedDictionaryPage.appendBytes(compressedData) 291 | } 292 | 293 | // write dictionary page header 294 | writePageHeader(writer, { 295 | type: 'DICTIONARY_PAGE', 296 | uncompressed_page_size: dictionaryPage.offset, 297 | compressed_page_size: compressedDictionaryPage.offset, 298 | dictionary_page_header: { 299 | num_values: dictionary.length, 300 | encoding: 'PLAIN', 301 | }, 302 | }) 303 | writer.appendBuffer(compressedDictionaryPage.getBuffer()) 304 | } 305 | 306 | /** 307 | * @import {ColumnChunk, ColumnIndex, DecodedArray, Encoding, OffsetIndex, ParquetType, SchemaElement, Statistics} from 'hyparquet' 308 | * @import {ColumnEncoder, PageData, Writer} from '../src/types.js' 309 | * @param {DecodedArray} values 310 | * @returns {Statistics} 311 | */ 312 | function getStatistics(values) { 313 | let min_value = undefined 314 | let max_value = undefined 315 | let null_count = 0n 316 | for (const value of values) { 317 | if (value === null || value === undefined) { 318 | null_count++ 319 | continue 320 | } 321 | if (typeof value === 'object') continue // skip objects 322 | if (min_value === undefined || value < min_value) min_value = value 323 | if (max_value === undefined || value > max_value) max_value = value 324 | } 325 | return { min_value, max_value, null_count } 326 | } 327 | 328 | /** 329 | * @param {SchemaElement[]} schemaPath 330 | * @returns {boolean} 331 | */ 332 | function isListLike(schemaPath) { 333 | for (let i = 1; i < schemaPath.length; i++) { 334 | const element = schemaPath[i] 335 | if (element?.converted_type === 'LIST') { 336 | const repeatedChild = schemaPath[i + 1] 337 | return repeatedChild?.repetition_type === 'REPEATED' 338 | } 339 | } 340 | return false 341 | } 342 | --------------------------------------------------------------------------------
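The delta and unconvert tests above exercise individual encoders against hyparquet's decoders in isolation. As a rough end-to-end sanity check of the same round-trip idea, a minimal sketch (assuming hyparquet's `parquetReadObjects` accepts the ArrayBuffer returned by `ByteWriter.getBuffer()`) might look like this:

```javascript
// End-to-end round-trip sketch: write with hyparquet-writer, read back with hyparquet.
// Assumption: parquetReadObjects accepts a plain ArrayBuffer as its `file` option.
const { ByteWriter, parquetWrite } = await import('hyparquet-writer')
const { parquetReadObjects } = await import('hyparquet')

// write a tiny two-column file entirely in memory
const writer = new ByteWriter()
parquetWrite({
  writer,
  columnData: [
    { name: 'id', data: [1, 2, 3] },
    { name: 'name', data: ['a', 'b', 'c'] },
  ],
})

// read the buffer back and inspect the decoded rows
const rows = await parquetReadObjects({ file: writer.getBuffer() })
console.log(rows) // expected shape: [{ id: ..., name: 'a' }, ...]
```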