├── test ├── files │ ├── issue97.json │ ├── issue115decimal.json │ ├── alpha.parquet │ ├── signs.parquet │ ├── issue23.parquet │ ├── issue72.parquet │ ├── issue90.parquet │ ├── issue97.parquet │ ├── strings.parquet │ ├── boolean_rle.parquet │ ├── duckdb2557.parquet │ ├── duckdb3734.parquet │ ├── duckdb4442.parquet │ ├── duckdb5533.parquet │ ├── geoparquet.parquet │ ├── geospatial.parquet │ ├── incorrect_map_schema.json │ ├── rowgroups.parquet │ ├── hyparquet.jpg.snappy │ ├── rowend_struct.parquet │ ├── adam_genotypes.parquet │ ├── continued_page.parquet │ ├── decimal-column.parquet │ ├── delta_byte_array.parquet │ ├── issue115decimal.parquet │ ├── nullable.impala.parquet │ ├── offset_indexed.parquet │ ├── struct_strings.parquet │ ├── brotli_compressed.parquet │ ├── byte_array_decimal.parquet │ ├── datapage_v2.snappy.parquet │ ├── lz4_raw_compressed.parquet │ ├── nonnullable.impala.parquet │ ├── byte_stream_split_v2.parquet │ ├── delta_binary_packed.parquet │ ├── fixed_length_decimal.parquet │ ├── hadoop_lz4_compressed.parquet │ ├── incorrect_map_schema.parquet │ ├── nested_structs.rust.parquet │ ├── rle_boolean_encoding.parquet │ ├── byte_stream_split.zstd.parquet │ ├── delta_length_byte_array.parquet │ ├── repeated_no_annotation.parquet │ ├── byte_stream_split_v2.json │ ├── concatenated_gzip_members.parquet │ ├── duckdb_delta_binary_packed.parquet │ ├── float16_nonzeros_and_nans.parquet │ ├── struct_strings.json │ ├── byte_stream_split_extended.gzip.parquet │ ├── delta_encoding_optional_column.parquet │ ├── delta_encoding_required_column.parquet │ ├── duckdb_delta_length_byte_array.parquet │ ├── plain-dict-uncompressed-checksum.parquet │ ├── float16_nonzeros_and_nans.json │ ├── mostlyempty.parquet │ ├── issue72.json │ ├── strings.json │ ├── mostlyempty.json │ ├── datapage_v2.snappy.json │ ├── rowgroups.json │ ├── decimal-column.json │ ├── boolean_rle.json │ ├── README.md │ ├── brotli_compressed.json │ ├── duckdb4442.column_indexes.json │ ├── 
lz4_raw_compressed.json │ ├── hadoop_lz4_compressed.json │ ├── signs.json │ ├── duckdb4442.json │ ├── fixed_length_decimal.json │ ├── repeated_no_annotation.json │ ├── nonnullable.impala.json │ ├── byte_array_decimal.json │ ├── duckdb5533.json │ ├── incorrect_map_schema.offset_indexes.json │ ├── plain-dict-uncompressed-checksum.offset_indexes.json │ ├── incorrect_map_schema.column_indexes.json │ ├── plain-dict-uncompressed-checksum.column_indexes.json │ ├── adam_genotypes.json │ ├── duckdb5533.offset_indexes.json │ ├── rle_boolean_encoding.json │ ├── mostlyempty.metadata.json │ ├── byte_array_decimal.metadata.json │ ├── rle_boolean_encoding.metadata.json │ ├── duckdb_delta_binary_packed.metadata.json │ ├── duckdb_delta_length_byte_array.metadata.json │ ├── concatenated_gzip_members.metadata.json │ ├── nullable.impala.json │ ├── issue90.metadata.json │ ├── delta_length_byte_array.metadata.json │ ├── duckdb5533.column_indexes.json │ ├── issue72.metadata.json │ ├── boolean_rle.metadata.json │ ├── offset_indexed.offset_indexes.json │ ├── fixed_length_decimal.metadata.json │ ├── float16_nonzeros_and_nans.metadata.json │ ├── duckdb3734.json │ ├── duckdb2557.metadata.json │ ├── struct_strings.metadata.json │ ├── issue115decimal.metadata.json │ ├── strings.metadata.json │ ├── continued_page.metadata.json │ ├── repeated_no_annotation.metadata.json │ ├── byte_stream_split.zstd.metadata.json │ ├── duckdb4442.offset_indexes.json │ ├── delta_encoding_required_column.offset_indexes.json │ ├── concatenated_gzip_members.json │ ├── incorrect_map_schema.metadata.json │ ├── issue97.metadata.json │ ├── offset_indexed.metadata.json │ ├── byte_stream_split_v2.metadata.json │ ├── lz4_raw_compressed.metadata.json │ ├── plain-dict-uncompressed-checksum.metadata.json │ ├── rowgroups.metadata.json │ ├── hadoop_lz4_compressed.metadata.json │ ├── rowend_struct.metadata.json │ ├── datapage_v2.snappy.metadata.json │ ├── delta_encoding_required_column.column_indexes.json │ └── 
decimal-column.metadata.json ├── schemaTree.test.js ├── helpers.js ├── plan.test.js ├── package.test.js ├── readFiles.test.js ├── read.utf8.test.js ├── rowend_struct.test.js ├── indexes.test.js ├── snappy.test.js ├── asyncbuffer.test.js ├── metadata.test.js ├── column.test.js ├── thrift.test.js └── encoding.test.js ├── .gitattributes ├── hyparquet.jpg ├── hyperparam.png ├── .gitignore ├── tsconfig.build.json ├── tsconfig.json ├── .github └── workflows │ └── ci.yml ├── LICENSE ├── src ├── node.js ├── geoparquet.js ├── indexes.js ├── constants.js ├── index.js ├── delta.js ├── wkb.js ├── snappy.js ├── schema.js ├── encoding.js └── filter.js ├── benchmark.js ├── package.json └── eslint.config.js /test/files/issue97.json: -------------------------------------------------------------------------------- 1 | [] -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.min.js -diff 2 | *.min.js.map -diff 3 | -------------------------------------------------------------------------------- /test/files/issue115decimal.json: -------------------------------------------------------------------------------- 1 | [ 2 | [-12345.67] 3 | ] 4 | -------------------------------------------------------------------------------- /hyparquet.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/hyparquet.jpg -------------------------------------------------------------------------------- /hyperparam.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/hyperparam.png -------------------------------------------------------------------------------- /test/files/alpha.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/alpha.parquet -------------------------------------------------------------------------------- /test/files/signs.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/signs.parquet -------------------------------------------------------------------------------- /test/files/issue23.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/issue23.parquet -------------------------------------------------------------------------------- /test/files/issue72.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/issue72.parquet -------------------------------------------------------------------------------- /test/files/issue90.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/issue90.parquet -------------------------------------------------------------------------------- /test/files/issue97.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/issue97.parquet -------------------------------------------------------------------------------- /test/files/strings.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/strings.parquet -------------------------------------------------------------------------------- /test/files/boolean_rle.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/boolean_rle.parquet -------------------------------------------------------------------------------- /test/files/duckdb2557.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/duckdb2557.parquet -------------------------------------------------------------------------------- /test/files/duckdb3734.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/duckdb3734.parquet -------------------------------------------------------------------------------- /test/files/duckdb4442.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/duckdb4442.parquet -------------------------------------------------------------------------------- /test/files/duckdb5533.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/duckdb5533.parquet -------------------------------------------------------------------------------- /test/files/geoparquet.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/geoparquet.parquet -------------------------------------------------------------------------------- /test/files/geospatial.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/geospatial.parquet -------------------------------------------------------------------------------- /test/files/incorrect_map_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | {"name": 
"report", "parent": "another"} 4 | ] 5 | ] 6 | -------------------------------------------------------------------------------- /test/files/rowgroups.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/rowgroups.parquet -------------------------------------------------------------------------------- /test/files/hyparquet.jpg.snappy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/hyparquet.jpg.snappy -------------------------------------------------------------------------------- /test/files/rowend_struct.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/rowend_struct.parquet -------------------------------------------------------------------------------- /test/files/adam_genotypes.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/adam_genotypes.parquet -------------------------------------------------------------------------------- /test/files/continued_page.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/continued_page.parquet -------------------------------------------------------------------------------- /test/files/decimal-column.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/decimal-column.parquet -------------------------------------------------------------------------------- /test/files/delta_byte_array.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/delta_byte_array.parquet -------------------------------------------------------------------------------- /test/files/issue115decimal.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/issue115decimal.parquet -------------------------------------------------------------------------------- /test/files/nullable.impala.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/nullable.impala.parquet -------------------------------------------------------------------------------- /test/files/offset_indexed.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/offset_indexed.parquet -------------------------------------------------------------------------------- /test/files/struct_strings.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/struct_strings.parquet -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | package-lock.json 3 | coverage 4 | *.tgz 5 | .vscode 6 | .DS_Store 7 | /*.parquet 8 | /types 9 | -------------------------------------------------------------------------------- /test/files/brotli_compressed.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/brotli_compressed.parquet -------------------------------------------------------------------------------- /test/files/byte_array_decimal.parquet: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/byte_array_decimal.parquet -------------------------------------------------------------------------------- /test/files/datapage_v2.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/datapage_v2.snappy.parquet -------------------------------------------------------------------------------- /test/files/lz4_raw_compressed.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/lz4_raw_compressed.parquet -------------------------------------------------------------------------------- /test/files/nonnullable.impala.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/nonnullable.impala.parquet -------------------------------------------------------------------------------- /test/files/byte_stream_split_v2.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/byte_stream_split_v2.parquet -------------------------------------------------------------------------------- /test/files/delta_binary_packed.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/delta_binary_packed.parquet -------------------------------------------------------------------------------- /test/files/fixed_length_decimal.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/fixed_length_decimal.parquet 
-------------------------------------------------------------------------------- /test/files/hadoop_lz4_compressed.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/hadoop_lz4_compressed.parquet -------------------------------------------------------------------------------- /test/files/incorrect_map_schema.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/incorrect_map_schema.parquet -------------------------------------------------------------------------------- /test/files/nested_structs.rust.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/nested_structs.rust.parquet -------------------------------------------------------------------------------- /test/files/rle_boolean_encoding.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/rle_boolean_encoding.parquet -------------------------------------------------------------------------------- /test/files/byte_stream_split.zstd.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/byte_stream_split.zstd.parquet -------------------------------------------------------------------------------- /test/files/delta_length_byte_array.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/delta_length_byte_array.parquet -------------------------------------------------------------------------------- /test/files/repeated_no_annotation.parquet: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/repeated_no_annotation.parquet -------------------------------------------------------------------------------- /test/files/byte_stream_split_v2.json: -------------------------------------------------------------------------------- 1 | [ 2 | [1.5, 10.1], 3 | [2.5, 20.2], 4 | [3.5, 30.3], 5 | [4.5, 40.4], 6 | [5.5, 50.5] 7 | ] 8 | -------------------------------------------------------------------------------- /test/files/concatenated_gzip_members.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/concatenated_gzip_members.parquet -------------------------------------------------------------------------------- /test/files/duckdb_delta_binary_packed.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/duckdb_delta_binary_packed.parquet -------------------------------------------------------------------------------- /test/files/float16_nonzeros_and_nans.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/float16_nonzeros_and_nans.parquet -------------------------------------------------------------------------------- /test/files/struct_strings.json: -------------------------------------------------------------------------------- 1 | [ 2 | [{ "f64_field": null, "str_field": "hello" }], 3 | [{ "f64_field": 1.23, "str_field": null }] 4 | ] 5 | -------------------------------------------------------------------------------- /test/files/byte_stream_split_extended.gzip.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/byte_stream_split_extended.gzip.parquet -------------------------------------------------------------------------------- /test/files/delta_encoding_optional_column.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/delta_encoding_optional_column.parquet -------------------------------------------------------------------------------- /test/files/delta_encoding_required_column.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/delta_encoding_required_column.parquet -------------------------------------------------------------------------------- /test/files/duckdb_delta_length_byte_array.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/duckdb_delta_length_byte_array.parquet -------------------------------------------------------------------------------- /test/files/plain-dict-uncompressed-checksum.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/plain-dict-uncompressed-checksum.parquet -------------------------------------------------------------------------------- /test/files/float16_nonzeros_and_nans.json: -------------------------------------------------------------------------------- 1 | [ 2 | [null], 3 | [1], 4 | [-2], 5 | [null], 6 | [0], 7 | [-1], 8 | [0], 9 | [2] 10 | ] 11 | -------------------------------------------------------------------------------- /test/files/mostlyempty.parquet: -------------------------------------------------------------------------------- 1 | PAR1L 2 | \,Hroot %empty& emptyPP&$&6P( hyparquetRPAR1 
-------------------------------------------------------------------------------- /test/files/issue72.json: -------------------------------------------------------------------------------- 1 | [ 2 | ["258d7fff-6418-499f-af07-c6611937d7d8"], 3 | ["086f2968-327b-48a8-8cdf-64f46bcd8173"], 4 | ["258d7fff-6418-499f-af07-c6611937d7d8"] 5 | ] 6 | -------------------------------------------------------------------------------- /test/files/strings.json: -------------------------------------------------------------------------------- 1 | [ 2 | ["alpha", "alpha", "alpha"], 3 | ["bravo", "bravo", "bravo"], 4 | ["charlie", "charlie", "charlie"], 5 | ["delta", "delta", "delta"] 6 | ] 7 | -------------------------------------------------------------------------------- /test/files/mostlyempty.json: -------------------------------------------------------------------------------- 1 | [ 2 | [null], 3 | [null], 4 | [null], 5 | [null], 6 | [null], 7 | [null], 8 | [null], 9 | [null], 10 | [null], 11 | [null] 12 | ] 13 | -------------------------------------------------------------------------------- /test/files/datapage_v2.snappy.json: -------------------------------------------------------------------------------- 1 | [ 2 | ["abc", 1, 2, true, [1, 2, 3]], 3 | ["abc", 2, 3, true, null], 4 | ["abc", 3, 4, true, null], 5 | [null, 4, 5, false, [1, 2, 3]], 6 | ["abc", 5, 2, true, [1, 2]] 7 | ] 8 | -------------------------------------------------------------------------------- /test/files/rowgroups.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 1 ], 3 | [ 2 ], 4 | [ 3 ], 5 | [ 4 ], 6 | [ 5 ], 7 | [ 6 ], 8 | [ 7 ], 9 | [ 8 ], 10 | [ 9 ], 11 | [ 10 ], 12 | [ 11 ], 13 | [ 12 ], 14 | [ 13 ], 15 | [ 14 ], 16 | [ 15 ] 17 | ] 18 | -------------------------------------------------------------------------------- /test/files/decimal-column.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | 40, 4 | 
2015 5 | ], 6 | [ 7 | 74, 8 | 2015 9 | ], 10 | [ 11 | 140, 12 | 2015 13 | ], 14 | [ 15 | 152, 16 | 2015 17 | ], 18 | [ 19 | 190, 20 | 2015 21 | ] 22 | ] 23 | -------------------------------------------------------------------------------- /tsconfig.build.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "./tsconfig.json", 3 | "compilerOptions": { 4 | "noEmit": false, 5 | "declaration": true, 6 | "emitDeclarationOnly": true, 7 | "outDir": "types", 8 | "declarationMap": true 9 | }, 10 | "include": ["src"] 11 | } 12 | -------------------------------------------------------------------------------- /test/files/boolean_rle.json: -------------------------------------------------------------------------------- 1 | [ 2 | [true], 3 | [true], 4 | [true], 5 | [true], 6 | [true], 7 | [null], 8 | [null], 9 | [null], 10 | [null], 11 | [null], 12 | [false], 13 | [false], 14 | [false], 15 | [false], 16 | [false] 17 | ] 18 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "allowJs": true, 4 | "checkJs": true, 5 | "lib": ["esnext", "dom"], 6 | "module": "nodenext", 7 | "noEmit": true, 8 | "resolveJsonModule": true, 9 | "strict": true 10 | }, 11 | "include": ["src", "test"] 12 | } 13 | -------------------------------------------------------------------------------- /test/files/README.md: -------------------------------------------------------------------------------- 1 | # Test Files License 2 | 3 | This directory contains binary test files from [apache/parquet-testing](https://github.com/apache/parquet-testing), under the [Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0). 4 | 5 | Copyright 2004 The Apache Software Foundation (http://www.apache.org/). 
6 | -------------------------------------------------------------------------------- /test/files/brotli_compressed.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | 1593604800, 4 | "abc", 5 | 42 6 | ], 7 | [ 8 | 1593604800, 9 | "def", 10 | 7.7 11 | ], 12 | [ 13 | 1593604801, 14 | "abc", 15 | 42.125 16 | ], 17 | [ 18 | 1593604801, 19 | "def", 20 | 7.7 21 | ] 22 | ] 23 | -------------------------------------------------------------------------------- /test/files/duckdb4442.column_indexes.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | null, 4 | null, 5 | null, 6 | null, 7 | null, 8 | null, 9 | null, 10 | null, 11 | null, 12 | null, 13 | null, 14 | null, 15 | null, 16 | null, 17 | null, 18 | null, 19 | null 20 | ] 21 | ] 22 | -------------------------------------------------------------------------------- /test/files/lz4_raw_compressed.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | 1593604800, 4 | "abc", 5 | 42 6 | ], 7 | [ 8 | 1593604800, 9 | "def", 10 | 7.7 11 | ], 12 | [ 13 | 1593604801, 14 | "abc", 15 | 42.125 16 | ], 17 | [ 18 | 1593604801, 19 | "def", 20 | 7.7 21 | ] 22 | ] 23 | -------------------------------------------------------------------------------- /test/files/hadoop_lz4_compressed.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | 1593604800, 4 | "abc", 5 | 42 6 | ], 7 | [ 8 | 1593604800, 9 | "def", 10 | 7.7 11 | ], 12 | [ 13 | 1593604801, 14 | "abc", 15 | 42.125 16 | ], 17 | [ 18 | 1593604801, 19 | "def", 20 | 7.7 21 | ] 22 | ] 23 | -------------------------------------------------------------------------------- /test/files/signs.json: -------------------------------------------------------------------------------- 1 | [ 2 | [0, 0, 0, 0, -128, -32768, -2147483648, -9223372036854775808], 3 | [127, 32767, 2147483647, 9223372036854775807, 
-1, -1, -1, -1], 4 | [128, 32768, 2147483648, 9223372036854775808, 0, 0, 0, 0], 5 | [255, 65535, 4294967295, 18446744073709551615, 127, 32767, 2147483647, 9223372036854775807] 6 | ] 7 | -------------------------------------------------------------------------------- /test/files/duckdb4442.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | 12, 4 | 5184, 5 | 1, 6 | 22, 7 | "2011-10-06T22:21:49.580Z", 8 | "outbound", 9 | 323020033, 10 | "{}", 11 | 2100, 12 | 33, 13 | 0, 14 | 7, 15 | 10, 16 | 0, 17 | 1317427200000, 18 | 1317939709580, 19 | 11 20 | ] 21 | ] 22 | -------------------------------------------------------------------------------- /test/files/fixed_length_decimal.json: -------------------------------------------------------------------------------- 1 | [ 2 | [1], 3 | [2], 4 | [3], 5 | [4], 6 | [5], 7 | [6], 8 | [7], 9 | [8], 10 | [9], 11 | [10], 12 | [11], 13 | [12], 14 | [13], 15 | [14], 16 | [15], 17 | [16], 18 | [17], 19 | [18], 20 | [19], 21 | [20], 22 | [21], 23 | [22], 24 | [23], 25 | [24] 26 | ] 27 | -------------------------------------------------------------------------------- /test/files/repeated_no_annotation.json: -------------------------------------------------------------------------------- 1 | [ 2 | [1, null], 3 | [2, null], 4 | [3, {"phone": []}], 5 | [4, {"phone": [{"number":5555555555,"kind":null}]}], 6 | [5, {"phone": [{"number":1111111111,"kind":"home"}]}], 7 | [6, {"phone": [{"number":1111111111,"kind":"home"},{"number":2222222222,"kind":null},{"number":3333333333,"kind":"mobile"}]}] 8 | ] 9 | -------------------------------------------------------------------------------- /test/files/nonnullable.impala.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | 8, 4 | [-1], 5 | [[-1, -2], []], 6 | { "k1": -1 }, 7 | [{}, { "k1": 1 }, {}, {}], 8 | { 9 | "a": -1, 10 | "B": [-1], 11 | "c": { 12 | "D": [[{ 13 | "e": -1, 14 | "f": "nonnullable" 15 
| }]] 16 | }, 17 | "G": {} 18 | } 19 | ] 20 | ] 21 | -------------------------------------------------------------------------------- /test/files/byte_array_decimal.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 1 ], 3 | [ 2 ], 4 | [ 3 ], 5 | [ 4 ], 6 | [ 5 ], 7 | [ 6 ], 8 | [ 7 ], 9 | [ 8 ], 10 | [ 9 ], 11 | [ 10 ], 12 | [ 11 ], 13 | [ 12 ], 14 | [ 13 ], 15 | [ 14 ], 16 | [ 15 ], 17 | [ 16 ], 18 | [ 17 ], 19 | [ 18 ], 20 | [ 19 ], 21 | [ 20 ], 22 | [ 21 ], 23 | [ 22 ], 24 | [ 23 ], 25 | [ 24 ] 26 | ] 27 | -------------------------------------------------------------------------------- /test/files/duckdb5533.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | "2022-11-27T17:42:43.514Z", 4 | 2448, 5 | null, 6 | 1, 7 | 343 8 | ], 9 | [ 10 | "2022-11-27T17:42:43.514Z", 11 | 85016, 12 | null, 13 | -1, 14 | 343 15 | ], 16 | [ 17 | "2022-11-27T17:42:44.280Z", 18 | 1184, 19 | null, 20 | 1, 21 | 343 22 | ], 23 | [ 24 | "2022-11-27T17:42:44.280Z", 25 | 85016, 26 | null, 27 | -1, 28 | 343 29 | ] 30 | ] 31 | -------------------------------------------------------------------------------- /test/files/incorrect_map_schema.offset_indexes.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | { 4 | "page_locations": [ 5 | { 6 | "offset": 4, 7 | "compressed_page_size": 69, 8 | "first_row_index": 0 9 | } 10 | ] 11 | }, 12 | { 13 | "page_locations": [ 14 | { 15 | "offset": 73, 16 | "compressed_page_size": 72, 17 | "first_row_index": 0 18 | } 19 | ] 20 | } 21 | ] 22 | ] -------------------------------------------------------------------------------- /test/files/plain-dict-uncompressed-checksum.offset_indexes.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | { 4 | "page_locations": [ 5 | { 6 | "offset": 31, 7 | "compressed_page_size": 27, 8 | "first_row_index": 0 9 | } 10 | ] 11 | }, 
12 | { 13 | "page_locations": [ 14 | { 15 | "offset": 117, 16 | "compressed_page_size": 27, 17 | "first_row_index": 0 18 | } 19 | ] 20 | } 21 | ] 22 | ] -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | pull_request: 4 | push: 5 | 6 | jobs: 7 | lint: 8 | runs-on: ubuntu-latest 9 | timeout-minutes: 5 10 | steps: 11 | - uses: actions/checkout@v6 12 | - run: npm i 13 | - run: npm run lint 14 | 15 | typecheck: 16 | runs-on: ubuntu-latest 17 | timeout-minutes: 5 18 | steps: 19 | - uses: actions/checkout@v6 20 | - run: npm i 21 | - run: npx tsc 22 | 23 | test: 24 | runs-on: ubuntu-latest 25 | timeout-minutes: 5 26 | steps: 27 | - uses: actions/checkout@v6 28 | - run: npm i 29 | - run: npm run coverage 30 | -------------------------------------------------------------------------------- /test/files/incorrect_map_schema.column_indexes.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | { 4 | "null_pages": [ 5 | false 6 | ], 7 | "min_values": [ 8 | "name" 9 | ], 10 | "max_values": [ 11 | "parent" 12 | ], 13 | "boundary_order": "ASCENDING", 14 | "null_counts": [ 15 | 0 16 | ] 17 | }, 18 | { 19 | "null_pages": [ 20 | false 21 | ], 22 | "min_values": [ 23 | "another" 24 | ], 25 | "max_values": [ 26 | "report" 27 | ], 28 | "boundary_order": "ASCENDING", 29 | "null_counts": [ 30 | 0 31 | ] 32 | } 33 | ] 34 | ] -------------------------------------------------------------------------------- /test/files/plain-dict-uncompressed-checksum.column_indexes.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | { 4 | "null_pages": [ 5 | false 6 | ], 7 | "min_values": [ 8 | 0 9 | ], 10 | "max_values": [ 11 | 0 12 | ], 13 | "boundary_order": "ASCENDING", 14 | "null_counts": [ 15 | 0 16 | ] 17 | }, 18 | { 19 | "null_pages": [ 20 | 
false 21 | ], 22 | "min_values": [ 23 | "a655fd0e-9949-4059-bcae-fd6a002a4652" 24 | ], 25 | "max_values": [ 26 | "a655fd0e-9949-4059-bcae-fd6a002a4652" 27 | ], 28 | "boundary_order": "ASCENDING", 29 | "null_counts": [ 30 | 0 31 | ] 32 | } 33 | ] 34 | ] -------------------------------------------------------------------------------- /test/files/adam_genotypes.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | { 4 | "alternateAllele": null, 5 | "end": null, 6 | "filtersApplied": null, 7 | "filtersFailed": null, 8 | "filtersPassed": null, 9 | "names": ["name"], 10 | "quality": null, 11 | "referenceAllele": null, 12 | "referenceName": null, 13 | "splitFromMultiAllelic": false, 14 | "start": null 15 | }, 16 | null, 17 | null, 18 | null, 19 | null, 20 | null, 21 | null, 22 | null, 23 | null, 24 | null, 25 | null, 26 | null, 27 | null, 28 | null, 29 | null, 30 | null, 31 | null, 32 | null, 33 | false, 34 | false, 35 | null, 36 | null 37 | ] 38 | ] 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /test/files/duckdb5533.offset_indexes.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | { 4 | "page_locations": [ 5 | { 6 | "offset": 4, 7 | "compressed_page_size": 73, 8 | "first_row_index": 0 9 | } 10 | ] 11 | }, 12 | { 13 | "page_locations": [ 14 | { 15 | "offset": 132, 16 | "compressed_page_size": 65, 17 | "first_row_index": 0 18 | } 19 | ] 20 | }, 21 | { 22 | "page_locations": [ 23 | { 24 | "offset": 242, 25 | "compressed_page_size": 37, 26 | "first_row_index": 0 27 | } 28 | ] 29 | }, 30 | { 31 | "page_locations": [ 32 | { 33 | "offset": 315, 34 | "compressed_page_size": 68, 35 | "first_row_index": 0 36 | } 37 | ] 38 | }, 39 | { 40 | "page_locations": [ 41 | { 42 | "offset": 435, 43 | "compressed_page_size": 57, 44 | "first_row_index": 0 45 | } 46 | ] 47 | } 48 | ] 49 | ] -------------------------------------------------------------------------------- /test/files/rle_boolean_encoding.json: -------------------------------------------------------------------------------- 1 | [ 2 | [true], 3 | [false], 4 | [null], 5 | [true], 6 | [true], 7 | [false], 8 | [false], 9 | [true], 10 | [true], 11 | [true], 12 | [false], 13 | [false], 14 | [true], 15 | [true], 16 | [false], 17 | [null], 18 | [true], 19 | [true], 20 | [false], 21 | [false], 22 | [true], 23 | [true], 24 | [false], 25 | [null], 26 | [true], 27 | [true], 28 | [false], 29 | [false], 30 | [true], 31 | [true], 32 | [true], 33 | [false], 34 | [false], 35 | [false], 36 | [false], 37 | [true], 38 | [true], 39 | [false], 40 | [null], 41 | [true], 42 | [true], 43 | [false], 44 | [false], 45 | [true], 46 | [true], 47 | [true], 
/**
 * @import {AsyncBuffer} from '../src/types.js'
 */
/**
 * Construct an AsyncBuffer for a local file using node fs package.
 *
 * The returned `slice(start, end)` treats `end` as exclusive, the same way
 * ArrayBuffer.prototype.slice does; when `end` is omitted the slice runs to
 * the end of the file.
 *
 * @param {string} filename path of the file to expose
 * @returns {Promise<AsyncBuffer>} resolves once the file size is known
 */
export async function asyncBufferFromFile(filename) {
  const { size } = await fs.stat(filename)
  return {
    byteLength: size,
    slice(start, end) {
      // An empty (or inverted) range has no bytes to read. Opening a stream
      // for it would either over-read or be rejected as out of range.
      if (end !== undefined && end <= start) {
        return Promise.resolve(new ArrayBuffer(0))
      }
      // createReadStream's `end` option is INCLUSIVE, whereas slice's `end`
      // is exclusive, so the last byte to read is `end - 1`.
      const options = end === undefined ? { start } : { start, end: end - 1 }
      const reader = createReadStream(filename, options)
      return new Promise((resolve, reject) => {
        /** @type {any[]} */
        const chunks = []
        reader.on('data', chunk => chunks.push(chunk))
        reader.on('error', reject)
        reader.on('end', () => {
          const buffer = Buffer.concat(chunks)
          // Buffers may be views into a larger pooled ArrayBuffer, so copy out
          // exactly the bytes that belong to this read.
          resolve(buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength))
        })
      })
    },
  }
}
const metadata = await parquetMetadataAsync(arrayBuffer) 9 | const schemaTree = parquetSchema(metadata) 10 | expect(schemaTree).toEqual(rowgroupsSchema) 11 | }) 12 | }) 13 | 14 | // Parquet v2 from pandas with 2 row groups 15 | const rowgroupsSchema = { 16 | children: [ 17 | { 18 | children: [], 19 | count: 1, 20 | element: { 21 | name: 'numbers', 22 | repetition_type: 'OPTIONAL', 23 | type: 'INT64', 24 | }, 25 | path: ['numbers'], 26 | }, 27 | ], 28 | count: 2, 29 | element: { 30 | name: 'schema', 31 | num_children: 1, 32 | repetition_type: 'REQUIRED', 33 | }, 34 | path: [], 35 | } 36 | -------------------------------------------------------------------------------- /test/files/mostlyempty.metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 2, 3 | "schema": [ 4 | { 5 | "name": "root", 6 | "num_children": 1 7 | }, 8 | { 9 | "type": "BYTE_ARRAY", 10 | "repetition_type": "OPTIONAL", 11 | "name": "empty" 12 | } 13 | ], 14 | "num_rows": 10, 15 | "row_groups": [ 16 | { 17 | "columns": [ 18 | { 19 | "file_offset": 4, 20 | "meta_data": { 21 | "type": "BYTE_ARRAY", 22 | "encodings": ["RLE_DICTIONARY"], 23 | "path_in_schema": ["empty"], 24 | "codec": "SNAPPY", 25 | "num_values": 10, 26 | "total_uncompressed_size": 40, 27 | "total_compressed_size": 40, 28 | "data_page_offset": 18, 29 | "dictionary_page_offset": 4, 30 | "statistics": { 31 | "null_count": 10 32 | } 33 | } 34 | } 35 | ], 36 | "total_byte_size": 40, 37 | "num_rows": 10 38 | } 39 | ], 40 | "created_by": "hyparquet", 41 | "metadata_length": 82 42 | } 43 | -------------------------------------------------------------------------------- /test/files/byte_array_decimal.metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "created_by": "HVR 5.3.0/9 (linux_glibc2.5-x64-64bit)", 4 | "metadata_length": 119, 5 | "num_rows": 24, 6 | "row_groups": [ 7 | { 8 | "columns": [ 9 | { 10 | 
"file_offset": 4, 11 | "meta_data": { 12 | "codec": "UNCOMPRESSED", 13 | "data_page_offset": 4, 14 | "encodings": [], 15 | "num_values": 24, 16 | "path_in_schema": [ "value" ], 17 | "total_compressed_size": 168, 18 | "total_uncompressed_size": 168, 19 | "type": "BYTE_ARRAY" 20 | } 21 | } 22 | ], 23 | "num_rows": 24, 24 | "total_byte_size": 168 25 | } 26 | ], 27 | "schema": [ 28 | { 29 | "name": "schema", 30 | "num_children": 1, 31 | "repetition_type": "REQUIRED" 32 | }, 33 | { 34 | "converted_type": "DECIMAL", 35 | "field_id": 6, 36 | "name": "value", 37 | "precision": 4, 38 | "repetition_type": "OPTIONAL", 39 | "scale": 2, 40 | "type": "BYTE_ARRAY" 41 | } 42 | ] 43 | } 44 | -------------------------------------------------------------------------------- /test/helpers.js: -------------------------------------------------------------------------------- 1 | import fs from 'fs' 2 | 3 | /** 4 | * Read file and parse as JSON 5 | * 6 | * @param {string} filePath 7 | * @returns {any} 8 | */ 9 | export function fileToJson(filePath) { 10 | const buffer = fs.readFileSync(filePath) 11 | return JSON.parse(buffer.toString()) 12 | } 13 | 14 | /** 15 | * Make a DataReader from bytes 16 | * 17 | * @import {DataReader} from '../src/types.d.ts' 18 | * @param {number[]} bytes 19 | * @returns {DataReader} 20 | */ 21 | export function reader(bytes) { 22 | return { view: new DataView(new Uint8Array(bytes).buffer), offset: 0 } 23 | } 24 | 25 | /** 26 | * Wraps an AsyncBuffer to count the number of fetches made 27 | * 28 | * @import {AsyncBuffer} from '../src/types.js' 29 | * @param {AsyncBuffer} asyncBuffer 30 | * @returns {AsyncBuffer & {fetches: number, bytes: number}} 31 | */ 32 | export function countingBuffer(asyncBuffer) { 33 | return { 34 | ...asyncBuffer, 35 | fetches: 0, 36 | bytes: 0, 37 | slice(start, end) { 38 | this.fetches++ 39 | this.bytes += (end ?? 
asyncBuffer.byteLength) - start 40 | return asyncBuffer.slice(start, end) 41 | }, 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /test/files/rle_boolean_encoding.metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "schema": [ 4 | { 5 | "name": "table", 6 | "num_children": 1 7 | }, 8 | { 9 | "type": "BOOLEAN", 10 | "repetition_type": "OPTIONAL", 11 | "name": "datatype_boolean", 12 | "field_id": 1 13 | } 14 | ], 15 | "num_rows": 68, 16 | "row_groups": [ 17 | { 18 | "columns": [ 19 | { 20 | "file_offset": 0, 21 | "meta_data": { 22 | "type": "BOOLEAN", 23 | "encodings": [ 24 | "RLE" 25 | ], 26 | "path_in_schema": [ 27 | "datatype_boolean" 28 | ], 29 | "codec": "GZIP", 30 | "num_values": 68, 31 | "total_uncompressed_size": 49, 32 | "total_compressed_size": 69, 33 | "data_page_offset": 4, 34 | "statistics": { 35 | "max": true, 36 | "min": false, 37 | "null_count": 6, 38 | "max_value": true, 39 | "min_value": false 40 | } 41 | } 42 | } 43 | ], 44 | "total_byte_size": 69, 45 | "num_rows": 68 46 | } 47 | ], 48 | "metadata_length": 111 49 | } 50 | -------------------------------------------------------------------------------- /benchmark.js: -------------------------------------------------------------------------------- 1 | import { createWriteStream, promises as fs } from 'fs' 2 | import { compressors } from 'hyparquet-compressors' 3 | import { pipeline } from 'stream/promises' 4 | import { parquetReadObjects } from './src/index.js' 5 | import { asyncBufferFromFile } from './src/node.js' 6 | 7 | const url = 'https://huggingface.co/datasets/wikimedia/wikipedia/resolve/main/20231101.en/train-00000-of-00041.parquet' 8 | const filename = 'example.parquet' 9 | 10 | // download test parquet file if needed 11 | let stat = await fs.stat(filename).catch(() => undefined) 12 | if (!stat) { 13 | console.log('downloading ' + url) 14 | const res = await fetch(url) 
15 | if (!res.ok) throw new Error(res.statusText) 16 | // write to file async 17 | await pipeline(res.body, createWriteStream(filename)) 18 | stat = await fs.stat(filename) 19 | console.log('downloaded example.parquet', stat.size) 20 | } 21 | 22 | // asyncBuffer 23 | const file = await asyncBufferFromFile(filename) 24 | const startTime = performance.now() 25 | console.log('parsing example.parquet data...') 26 | 27 | // read parquet file 28 | await parquetReadObjects({ 29 | file, 30 | compressors, 31 | }) 32 | const ms = performance.now() - startTime 33 | console.log(`parsed ${stat.size.toLocaleString()} bytes in ${ms.toFixed(0)} ms`) 34 | -------------------------------------------------------------------------------- /test/plan.test.js: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from 'vitest' 2 | import { parquetMetadataAsync } from '../src/index.js' 3 | import { asyncBufferFromFile } from '../src/node.js' 4 | import { parquetPlan } from '../src/plan.js' 5 | 6 | describe('parquetPlan', () => { 7 | it('generates a query plan', async () => { 8 | const file = await asyncBufferFromFile('test/files/offset_indexed.parquet') 9 | const metadata = await parquetMetadataAsync(file) 10 | const plan = parquetPlan({ file, metadata }) 11 | expect(plan).toMatchObject({ 12 | metadata, 13 | rowStart: 0, 14 | rowEnd: 200, 15 | fetches: [ 16 | { startByte: 4, endByte: 14772 }, 17 | { startByte: 14772, endByte: 29507 }, 18 | ], 19 | groups: [ 20 | { 21 | groupRows: 100, 22 | groupStart: 0, 23 | chunks: [ 24 | { range: { startByte: 4, endByte: 438 } }, 25 | { range: { startByte: 438, endByte: 14772 } }, 26 | ], 27 | }, 28 | { 29 | groupRows: 100, 30 | groupStart: 100, 31 | chunks: [ 32 | { range: { startByte: 14772, endByte: 15208 } }, 33 | { range: { startByte: 15208, endByte: 29507 } }, 34 | ], 35 | }, 36 | ], 37 | }) 38 | }) 39 | }) 40 | 
-------------------------------------------------------------------------------- /test/files/duckdb_delta_binary_packed.metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "schema": [ 4 | { 5 | "repetition_type": "REQUIRED", 6 | "name": "duckdb_schema", 7 | "num_children": 1 8 | }, 9 | { 10 | "type": "INT64", 11 | "repetition_type": "OPTIONAL", 12 | "name": "range", 13 | "converted_type": "INT_64" 14 | } 15 | ], 16 | "num_rows": 1250, 17 | "row_groups": [ 18 | { 19 | "columns": [ 20 | { 21 | "file_offset": 0, 22 | "meta_data": { 23 | "type": "INT64", 24 | "encodings": ["DELTA_BINARY_PACKED"], 25 | "path_in_schema": ["range"], 26 | "codec": "SNAPPY", 27 | "num_values": 1250, 28 | "total_uncompressed_size": 40, 29 | "total_compressed_size": 42, 30 | "data_page_offset": 4, 31 | "statistics": { 32 | "max": 1249, 33 | "min": 0, 34 | "null_count": 0, 35 | "max_value": 1249, 36 | "min_value": 0 37 | } 38 | } 39 | } 40 | ], 41 | "total_byte_size": 40, 42 | "num_rows": 1250, 43 | "file_offset": 4 44 | } 45 | ], 46 | "created_by": "DuckDB version v1.2.1 (build 8e52ec4395)", 47 | "metadata_length": 169 48 | } 49 | -------------------------------------------------------------------------------- /test/files/duckdb_delta_length_byte_array.metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "schema": [ 4 | { 5 | "repetition_type": "REQUIRED", 6 | "name": "duckdb_schema", 7 | "num_children": 1 8 | }, 9 | { 10 | "type": "BYTE_ARRAY", 11 | "repetition_type": "OPTIONAL", 12 | "name": "range_varchar", 13 | "converted_type": "UTF8" 14 | } 15 | ], 16 | "num_rows": 1250, 17 | "row_groups": [ 18 | { 19 | "columns": [ 20 | { 21 | "file_offset": 0, 22 | "meta_data": { 23 | "type": "BYTE_ARRAY", 24 | "encodings": ["DELTA_LENGTH_BYTE_ARRAY"], 25 | "path_in_schema": ["range_varchar"], 26 | "codec": "SNAPPY", 27 | "num_values": 1250, 28 | 
"total_uncompressed_size": 3996, 29 | "total_compressed_size": 3390, 30 | "data_page_offset": 4, 31 | "statistics": { 32 | "max": "999", 33 | "min": "0", 34 | "null_count": 0, 35 | "max_value": "999", 36 | "min_value": "0" 37 | } 38 | } 39 | } 40 | ], 41 | "total_byte_size": 3996, 42 | "num_rows": 1250, 43 | "file_offset": 4 44 | } 45 | ], 46 | "created_by": "DuckDB version v1.2.1 (build 8e52ec4395)", 47 | "metadata_length": 164 48 | } 49 | -------------------------------------------------------------------------------- /test/files/concatenated_gzip_members.metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 2, 3 | "metadata_length": 115, 4 | "num_rows": 513, 5 | "row_groups": [ 6 | { 7 | "columns": [ 8 | { 9 | "file_offset": 1471, 10 | "meta_data": { 11 | "codec": "GZIP", 12 | "data_page_offset": 4, 13 | "encodings": [ 14 | "PLAIN", 15 | "RLE" 16 | ], 17 | "num_values": 513, 18 | "path_in_schema": [ 19 | "long_col" 20 | ], 21 | "statistics": { 22 | "max_value": 513, 23 | "min_value": 1 24 | }, 25 | "total_compressed_size": 1467, 26 | "total_uncompressed_size": 4155, 27 | "type": "INT64" 28 | } 29 | } 30 | ], 31 | "num_rows": 513, 32 | "ordinal": 0, 33 | "total_byte_size": 4155, 34 | "total_compressed_size": 1467 35 | } 36 | ], 37 | "schema": [ 38 | { 39 | "name": "root", 40 | "num_children": 1 41 | }, 42 | { 43 | "converted_type": "UINT_64", 44 | "logical_type": { 45 | "type": "INTEGER", 46 | "bitWidth": 64, 47 | "isSigned": false 48 | }, 49 | "name": "long_col", 50 | "repetition_type": "OPTIONAL", 51 | "type": "INT64" 52 | } 53 | ] 54 | } 55 | -------------------------------------------------------------------------------- /test/files/nullable.impala.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | 1, 4 | [1, 2, 3], 5 | [[1, 2], [3, 4]], 6 | {"k1": 1, "k2": 100}, 7 | [{"k1": 1}], 8 | 
{"A":1,"b":[1],"C":{"d":[[{"E":10,"F":"aaa"},{"E":-10,"F":"bbb"}],[{"E":11,"F":"c"}]]},"g":{"foo":{"H":{"i":[1.1]}}}} 9 | ], 10 | [ 11 | 2, 12 | [null, 1, 2, null, 3, null], 13 | [[null, 1, 2, null], [3, null, 4], [], null], 14 | {"k1": 2, "k2": null}, 15 | [{"k1": 1, "k3": null}, null, {}], 16 | {"A":null,"b":[null],"C":{"d":[[{"E":null,"F":null},{"E":10,"F":"aaa"},{"E":null,"F":null},{"E":-10,"F":"bbb"},{"E":null,"F":null}],[{"E":11,"F":"c"},null],[],null]},"g":{"g1":{"H":{"i":[2.2,null]}},"g2":{"H":{"i":[]}},"g3":null,"g4":{"H":{}},"g5":{}}} 17 | ], 18 | [ 19 | 3, 20 | [], 21 | [null], 22 | {}, 23 | [null, null], 24 | {"A":null,"C":{"d":[]},"g":{}} 25 | ], 26 | [ 27 | 4, 28 | null, 29 | [], 30 | {}, 31 | [], 32 | {"A":null,"C":{}} 33 | ], 34 | [ 35 | 5, 36 | null, 37 | null, 38 | {}, 39 | null, 40 | {"A":null,"g":{"foo":{"H":{"i":[2.2,3.3]}}}} 41 | ], 42 | [ 43 | 6, 44 | null, 45 | null, 46 | null, 47 | null, 48 | null 49 | ], 50 | [ 51 | 7, 52 | null, 53 | [null, [5, 6]], 54 | {"k1": null, "k3": null}, 55 | null, 56 | {"A":7,"b":[2,3,null],"C":{"d":[[],[null],null]}} 57 | ] 58 | ] 59 | -------------------------------------------------------------------------------- /test/package.test.js: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from 'vitest' 2 | import packageJson from '../package.json' with { type: 'json' } 3 | 4 | describe('package.json', () => { 5 | it('should have the correct name', () => { 6 | expect(packageJson.name).toBe('hyparquet') 7 | }) 8 | it('should have a valid version', () => { 9 | expect(packageJson.version).toMatch(/^\d+\.\d+\.\d+$/) 10 | }) 11 | it('should have MIT license', () => { 12 | expect(packageJson.license).toBe('MIT') 13 | }) 14 | it('should have precise dev dependency versions', () => { 15 | const { devDependencies } = packageJson 16 | Object.values(devDependencies).forEach(version => { 17 | expect(version).toMatch(/^\d+\.\d+\.\d+$/) 18 | }) 19 | }) 20 | 
it('should have no dependencies', () => { 21 | expect('dependencies' in packageJson).toBe(false) 22 | expect('peerDependencies' in packageJson).toBe(false) 23 | }) 24 | it('should have exports with types first', () => { 25 | const { exports } = packageJson 26 | expect(Object.keys(exports)).toEqual(['.', './src/*.js']) 27 | // node vs default (browser) 28 | expect(Object.keys(exports['.'])).toEqual(['browser', 'default']) 29 | expect(Object.keys(exports['.'].browser)).toEqual(['types', 'import']) 30 | expect(Object.keys(exports['.'].default)).toEqual(['types', 'import']) 31 | // deep imports 32 | expect(Object.keys(exports['./src/*.js'])).toEqual(['types', 'import']) 33 | }) 34 | }) 35 | -------------------------------------------------------------------------------- /test/files/issue90.metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "schema": [ 4 | { 5 | "repetition_type": "REQUIRED", 6 | "name": "duckdb_schema", 7 | "num_children": 1 8 | }, 9 | { 10 | "type": "DOUBLE", 11 | "repetition_type": "OPTIONAL", 12 | "name": "elb_01yr_imp_val" 13 | } 14 | ], 15 | "num_rows": 6144, 16 | "row_groups": [ 17 | { 18 | "columns": [ 19 | { 20 | "file_offset": 0, 21 | "meta_data": { 22 | "type": "DOUBLE", 23 | "encodings": [ 24 | "PLAIN" 25 | ], 26 | "path_in_schema": [ 27 | "elb_01yr_imp_val" 28 | ], 29 | "codec": "SNAPPY", 30 | "num_values": 6144, 31 | "total_uncompressed_size": 45059, 32 | "total_compressed_size": 44650, 33 | "data_page_offset": 4, 34 | "statistics": { 35 | "max": 449097851.5197593, 36 | "min": 0, 37 | "null_count": 610, 38 | "max_value": 449097851.5197593, 39 | "min_value": 0, 40 | "is_max_value_exact": true, 41 | "is_min_value_exact": true 42 | } 43 | } 44 | } 45 | ], 46 | "total_byte_size": 45059, 47 | "num_rows": 6144, 48 | "file_offset": 4 49 | } 50 | ], 51 | "created_by": "DuckDB version v1.3.0 (build 71c5c07cdd)", 52 | "metadata_length": 198 53 | } 54 | 
-------------------------------------------------------------------------------- /test/files/delta_length_byte_array.metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 2, 3 | "schema": [ 4 | { 5 | "repetition_type": "REQUIRED", 6 | "name": "schema", 7 | "num_children": 1, 8 | "field_id": -1 9 | }, 10 | { 11 | "type": "BYTE_ARRAY", 12 | "repetition_type": "OPTIONAL", 13 | "name": "FRUIT", 14 | "converted_type": "UTF8", 15 | "field_id": 1, 16 | "logical_type": { 17 | "type": "STRING" 18 | } 19 | } 20 | ], 21 | "num_rows": 1000, 22 | "row_groups": [ 23 | { 24 | "columns": [ 25 | { 26 | "file_offset": 2629, 27 | "meta_data": { 28 | "type": "BYTE_ARRAY", 29 | "encodings": [ 30 | "RLE", 31 | "DELTA_LENGTH_BYTE_ARRAY" 32 | ], 33 | "path_in_schema": [ 34 | "FRUIT" 35 | ], 36 | "codec": "ZSTD", 37 | "num_values": 1000, 38 | "total_uncompressed_size": 23747, 39 | "total_compressed_size": 2625, 40 | "data_page_offset": 4, 41 | "encoding_stats": [ 42 | { 43 | "page_type": "DATA_PAGE_V2", 44 | "encoding": "DELTA_LENGTH_BYTE_ARRAY", 45 | "count": 1 46 | } 47 | ] 48 | } 49 | } 50 | ], 51 | "total_byte_size": 23747, 52 | "num_rows": 1000, 53 | "file_offset": 0, 54 | "total_compressed_size": 2625, 55 | "ordinal": 0 56 | } 57 | ], 58 | "metadata_length": 105 59 | } 60 | -------------------------------------------------------------------------------- /test/files/duckdb5533.column_indexes.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | { 4 | "boundary_order": "UNORDERED", 5 | "max_values": [ 6 | "2022-11-27T17:42:44.280Z" 7 | ], 8 | "min_values": [ 9 | "2022-11-27T17:42:43.514Z" 10 | ], 11 | "null_counts": [ 12 | 0 13 | ], 14 | "null_pages": [ 15 | false 16 | ] 17 | }, 18 | { 19 | "boundary_order": "UNORDERED", 20 | "max_values": [ 21 | 85016 22 | ], 23 | "min_values": [ 24 | 1184 25 | ], 26 | "null_counts": [ 27 | 0 28 | ], 29 | "null_pages": [ 30 | false 31 | ] 32 
/**
 * Mark root-level GeoParquet columns in a flattened schema with a
 * GEOMETRY / GEOGRAPHY logical type, based on the file's `geo` key-value
 * metadata. The schema elements are modified in place.
 *
 * @import {KeyValue, LogicalType, SchemaElement} from '../src/types.d.ts'
 * @param {SchemaElement[]} schema flattened schema, root element first
 * @param {KeyValue[] | undefined} key_value_metadata file-level key-value metadata
 * @returns {void}
 */
export function markGeoColumns(schema, key_value_metadata) {
  // Prepare the list of GeoParquet columns
  /** @type {Map<string, LogicalType>} */
  const columns = new Map()
  const geo = key_value_metadata?.find(({ key }) => key === 'geo')?.value
  const decodedColumns = (geo && JSON.parse(geo)?.columns) ?? {}
  for (const [name, column] of Object.entries(decodedColumns)) {
    // Only WKB-encoded columns are handled; a null/absent entry is skipped too
    if (column?.encoding !== 'WKB') {
      continue
    }
    const type = column.edges === 'spherical' ? 'GEOGRAPHY' : 'GEOMETRY'
    const id = column.crs?.id ?? column.crs?.ids?.[0]
    const crs = id ? `${id.authority}:${id.code.toString()}` : undefined
    // Note: we can't infer GEOGRAPHY's algorithm from GeoParquet
    columns.set(name, { type, crs })
  }

  // Mark schema elements with logical type
  // Only look at root-level columns of type BYTE_ARRAY without existing logical_type
  let i = 1 // skip root
  while (i < schema.length) {
    const element = schema[i]
    const { logical_type, name, num_children, repetition_type, type } = element
    if (num_children) {
      // Skip the group AND all of its descendants. num_children counts only
      // direct children, so nested groups occupy more than num_children
      // entries in the flattened schema.
      i = subtreeEnd(schema, i)
      continue
    }
    if (type === 'BYTE_ARRAY' && logical_type === undefined && repetition_type !== 'REPEATED') {
      element.logical_type = columns.get(name)
    }
    i++
  }
}

/**
 * Index of the first schema element after the subtree rooted at `start`.
 *
 * @param {{num_children?: number}[]} schema flattened schema
 * @param {number} start index of the subtree root
 * @returns {number}
 */
function subtreeEnd(schema, start) {
  // `pending` = elements still expected in the subtree (the root itself at first)
  let pending = 1
  let index = start
  while (pending > 0 && index < schema.length) {
    pending += (schema[index].num_children ?? 0) - 1
    index++
  }
  return index
}
| "num_rows": 3 52 | } 53 | ], 54 | "created_by": "Parquet.Net version 4.25.0 (build 687fbb462e94eddd1dc5a0aa26f33ba8e53f60e3)", 55 | "metadata_length": 321 56 | } 57 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "hyparquet", 3 | "version": "1.23.3", 4 | "description": "Parquet file parser for JavaScript", 5 | "author": "Hyperparam", 6 | "homepage": "https://hyperparam.app", 7 | "keywords": [ 8 | "ai", 9 | "data", 10 | "dataset", 11 | "hyperparam", 12 | "hyparquet", 13 | "ml", 14 | "parquet", 15 | "parquetjs", 16 | "parser", 17 | "snappy", 18 | "thrift" 19 | ], 20 | "license": "MIT", 21 | "repository": { 22 | "type": "git", 23 | "url": "git+https://github.com/hyparam/hyparquet.git" 24 | }, 25 | "files": [ 26 | "src", 27 | "types" 28 | ], 29 | "type": "module", 30 | "types": "types/index.d.ts", 31 | "main": "src/index.js", 32 | "exports": { 33 | ".": { 34 | "browser": { 35 | "types": "./types/index.d.ts", 36 | "import": "./src/index.js" 37 | }, 38 | "default": { 39 | "types": "./types/node.d.ts", 40 | "import": "./src/node.js" 41 | } 42 | }, 43 | "./src/*.js": { 44 | "types": "./types/*.d.ts", 45 | "import": "./src/*.js" 46 | } 47 | }, 48 | "sideEffects": false, 49 | "scripts": { 50 | "build:types": "tsc -p ./tsconfig.build.json", 51 | "coverage": "vitest run --coverage --coverage.include=src", 52 | "lint": "eslint", 53 | "lint:fix": "eslint --fix", 54 | "prepare": "npm run build:types", 55 | "test": "vitest run" 56 | }, 57 | "devDependencies": { 58 | "@types/node": "25.0.3", 59 | "@vitest/coverage-v8": "4.0.16", 60 | "eslint": "9.39.2", 61 | "eslint-plugin-jsdoc": "61.5.0", 62 | "hyparquet-compressors": "1.1.1", 63 | "typescript": "5.9.3", 64 | "vitest": "4.0.16" 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /test/files/boolean_rle.metadata.json: 
/**
 * Deserialize a column index from the reader and convert its per-page
 * min/max values using the column's schema element.
 *
 * @param {DataReader} reader
 * @param {SchemaElement} schema
 * @param {ParquetParsers | undefined} parsers overrides merged over DEFAULT_PARSERS
 * @returns {ColumnIndex}
 */
export function readColumnIndex(reader, schema, parsers = undefined) {
  parsers = { ...DEFAULT_PARSERS, ...parsers }
  const toValue = (/** @type {any} */ raw) => convertMetadata(raw, schema, parsers)

  const {
    field_1: null_pages,
    field_2: rawMins,
    field_3: rawMaxes,
    field_4: orderCode,
    field_5: null_counts,
    field_6: repetition_level_histograms,
    field_7: definition_level_histograms,
  } = deserializeTCompactProtocol(reader)

  const min_values = rawMins.map(toValue)
  const max_values = rawMaxes.map(toValue)
  return {
    null_pages,
    min_values,
    max_values,
    boundary_order: BoundaryOrders[orderCode],
    null_counts,
    repetition_level_histograms,
    definition_level_histograms,
  }
}

/**
 * Deserialize an offset index from the reader.
 *
 * @param {DataReader} reader
 * @returns {OffsetIndex}
 */
export function readOffsetIndex(reader) {
  const { field_1: locations, field_2: unencoded_byte_array_data_bytes } = deserializeTCompactProtocol(reader)
  return {
    page_locations: locations.map((/** @type {any} */ loc) => pageLocation(loc)),
    unencoded_byte_array_data_bytes,
  }
}

/**
 * Rename the numbered fields of a deserialized page location.
 *
 * @import {ColumnIndex, DataReader, OffsetIndex, PageLocation, ParquetParsers, SchemaElement} from '../src/types.d.ts'
 * @param {any} loc
 * @returns {PageLocation}
 */
function pageLocation(loc) {
  const { field_1: offset, field_2: compressed_page_size, field_3: first_row_index } = loc
  return { offset, compressed_page_size, first_row_index }
}
| "first_row_index": 75 30 | }, 31 | { 32 | "offset": 13942, 33 | "compressed_page_size": 830, 34 | "first_row_index": 95 35 | } 36 | ] 37 | } 38 | ], 39 | [ 40 | null, 41 | { 42 | "page_locations": [ 43 | { 44 | "offset": 15208, 45 | "compressed_page_size": 2784, 46 | "first_row_index": 0 47 | }, 48 | { 49 | "offset": 17992, 50 | "compressed_page_size": 2660, 51 | "first_row_index": 20 52 | }, 53 | { 54 | "offset": 20652, 55 | "compressed_page_size": 2648, 56 | "first_row_index": 40 57 | }, 58 | { 59 | "offset": 23300, 60 | "compressed_page_size": 2721, 61 | "first_row_index": 58 62 | }, 63 | { 64 | "offset": 26021, 65 | "compressed_page_size": 2734, 66 | "first_row_index": 78 67 | }, 68 | { 69 | "offset": 28755, 70 | "compressed_page_size": 752, 71 | "first_row_index": 96 72 | } 73 | ] 74 | } 75 | ] 76 | ] 77 | -------------------------------------------------------------------------------- /test/readFiles.test.js: -------------------------------------------------------------------------------- 1 | import fs from 'fs' 2 | import { compressors } from 'hyparquet-compressors' 3 | import { describe, expect, it } from 'vitest' 4 | import { parquetMetadataAsync, parquetRead, toJson } from '../src/index.js' 5 | import { asyncBufferFromFile } from '../src/node.js' 6 | import { fileToJson } from './helpers.js' 7 | 8 | describe('parquetRead test files', () => { 9 | const files = fs.readdirSync('test/files').filter(f => f.endsWith('.parquet')) 10 | 11 | files.forEach(filename => { 12 | it(`parse data from ${filename}`, async () => { 13 | const file = await asyncBufferFromFile(`test/files/${filename}`) 14 | await parquetRead({ 15 | file, 16 | compressors, 17 | onComplete(rows) { 18 | const base = filename.replace('.parquet', '') 19 | const expected = fileToJson(`test/files/${base}.json`) 20 | // stringify and parse to make legal json (NaN, -0, etc) 21 | expect(JSON.parse(JSON.stringify(toJson(rows)))).toEqual(expected) 22 | }, 23 | }) 24 | }) 25 | 26 | it(`read the last row 
from ${filename}`, async () => { 27 | // this exercises some of the page-skipping optimizations 28 | const file = await asyncBufferFromFile(`test/files/${filename}`) 29 | const metadata = await parquetMetadataAsync(file) 30 | let numRows = Number(metadata.num_rows) 31 | // repeated_no_annotation has wrong num_rows in metadata: 32 | if (filename === 'repeated_no_annotation.parquet') numRows = 6 33 | await parquetRead({ 34 | file, 35 | compressors, 36 | rowStart: numRows - 1, 37 | rowEnd: numRows, 38 | onComplete(rows) { 39 | const base = filename.replace('.parquet', '') 40 | if (rows.length) { 41 | const expected = [fileToJson(`test/files/${base}.json`).at(-1)] 42 | expect(toJson(rows)).toEqual(expected) 43 | } 44 | }, 45 | }) 46 | }) 47 | }) 48 | }) 49 | -------------------------------------------------------------------------------- /test/files/fixed_length_decimal.metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "schema": [ 4 | { 5 | "name": "spark_schema", 6 | "num_children": 1 7 | }, 8 | { 9 | "type": "FIXED_LEN_BYTE_ARRAY", 10 | "type_length": 11, 11 | "repetition_type": "OPTIONAL", 12 | "name": "value", 13 | "converted_type": "DECIMAL", 14 | "scale": 2, 15 | "precision": 25 16 | } 17 | ], 18 | "num_rows": 24, 19 | "row_groups": [ 20 | { 21 | "columns": [ 22 | { 23 | "file_offset": 4, 24 | "meta_data": { 25 | "type": "FIXED_LEN_BYTE_ARRAY", 26 | "encodings": [ 27 | "BIT_PACKED", 28 | "RLE", 29 | "PLAIN" 30 | ], 31 | "path_in_schema": [ 32 | "value" 33 | ], 34 | "codec": "UNCOMPRESSED", 35 | "num_values": 24, 36 | "total_uncompressed_size": 319, 37 | "total_compressed_size": 319, 38 | "data_page_offset": 4, 39 | "statistics": { 40 | "max": 24, 41 | "min": 2, 42 | "null_count": 0 43 | }, 44 | "encoding_stats": [ 45 | { 46 | "page_type": "DATA_PAGE", 47 | "encoding": "PLAIN", 48 | "count": 1 49 | } 50 | ] 51 | } 52 | } 53 | ], 54 | "total_byte_size": 319, 55 | "num_rows": 24 56 | } 57 | ], 
58 | "key_value_metadata": [ 59 | { 60 | "key": "org.apache.spark.sql.parquet.row.metadata", 61 | "value": "{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"decimal(25,2)\",\"nullable\":true,\"metadata\":{}}]}" 62 | } 63 | ], 64 | "created_by": "parquet-mr version 1.8.2 (build c6522788629e590a53eb79874b95f6c3ff11f16c)", 65 | "metadata_length": 346 66 | } 67 | -------------------------------------------------------------------------------- /test/read.utf8.test.js: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from 'vitest' 2 | import { parquetReadObjects } from '../src/index.js' 3 | import { asyncBufferFromFile } from '../src/node.js' 4 | 5 | describe('parquetRead utf8', () => { 6 | it('default utf8 behavior', async () => { 7 | const file = await asyncBufferFromFile('test/files/strings.parquet') 8 | const rows = await parquetReadObjects({ file }) 9 | expect(rows).toEqual([ 10 | { bytes: 'alpha', c_utf8: 'alpha', l_utf8: 'alpha' }, 11 | { bytes: 'bravo', c_utf8: 'bravo', l_utf8: 'bravo' }, 12 | { bytes: 'charlie', c_utf8: 'charlie', l_utf8: 'charlie' }, 13 | { bytes: 'delta', c_utf8: 'delta', l_utf8: 'delta' }, 14 | ]) 15 | }) 16 | 17 | it('utf8 = true', async () => { 18 | const file = await asyncBufferFromFile('test/files/strings.parquet') 19 | const rows = await parquetReadObjects({ file, utf8: true }) 20 | expect(rows).toEqual([ 21 | { bytes: 'alpha', c_utf8: 'alpha', l_utf8: 'alpha' }, 22 | { bytes: 'bravo', c_utf8: 'bravo', l_utf8: 'bravo' }, 23 | { bytes: 'charlie', c_utf8: 'charlie', l_utf8: 'charlie' }, 24 | { bytes: 'delta', c_utf8: 'delta', l_utf8: 'delta' }, 25 | ]) 26 | }) 27 | 28 | it('utf8 = false', async () => { 29 | const file = await asyncBufferFromFile('test/files/strings.parquet') 30 | const rows = await parquetReadObjects({ file, utf8: false }) 31 | expect(rows).toEqual([ 32 | { 33 | bytes: new Uint8Array([97, 108, 112, 104, 97]), 34 | c_utf8: 'alpha', 35 | 
l_utf8: 'alpha', 36 | }, 37 | { 38 | bytes: new Uint8Array([98, 114, 97, 118, 111]), 39 | c_utf8: 'bravo', 40 | l_utf8: 'bravo', 41 | }, 42 | { 43 | bytes: new Uint8Array([99, 104, 97, 114, 108, 105, 101]), 44 | c_utf8: 'charlie', 45 | l_utf8: 'charlie', 46 | }, 47 | { 48 | bytes: new Uint8Array([100, 101, 108, 116, 97]), 49 | c_utf8: 'delta', 50 | l_utf8: 'delta', 51 | }, 52 | ]) 53 | }) 54 | }) 55 | -------------------------------------------------------------------------------- /test/files/float16_nonzeros_and_nans.metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 2, 3 | "schema": [ 4 | { 5 | "repetition_type": "REQUIRED", 6 | "name": "schema", 7 | "num_children": 1 8 | }, 9 | { 10 | "type": "FIXED_LEN_BYTE_ARRAY", 11 | "type_length": 2, 12 | "repetition_type": "OPTIONAL", 13 | "name": "x", 14 | "logical_type": { 15 | "type": "FLOAT16" 16 | } 17 | } 18 | ], 19 | "num_rows": 8, 20 | "row_groups": [ 21 | { 22 | "columns": [ 23 | { 24 | "file_offset": 80, 25 | "meta_data": { 26 | "type": "FIXED_LEN_BYTE_ARRAY", 27 | "encodings": [ 28 | "PLAIN", 29 | "RLE", 30 | "RLE_DICTIONARY" 31 | ], 32 | "path_in_schema": [ 33 | "x" 34 | ], 35 | "codec": "UNCOMPRESSED", 36 | "num_values": 8, 37 | "total_uncompressed_size": 76, 38 | "total_compressed_size": 76, 39 | "data_page_offset": 32, 40 | "dictionary_page_offset": 4, 41 | "statistics": { 42 | "max": 2, 43 | "min": -2, 44 | "null_count": 1, 45 | "max_value": 2, 46 | "min_value": -2 47 | }, 48 | "encoding_stats": [ 49 | { 50 | "page_type": "DICTIONARY_PAGE", 51 | "encoding": "PLAIN", 52 | "count": 1 53 | }, 54 | { 55 | "page_type": "DATA_PAGE", 56 | "encoding": "RLE_DICTIONARY", 57 | "count": 1 58 | } 59 | ] 60 | } 61 | } 62 | ], 63 | "total_byte_size": 76, 64 | "num_rows": 8, 65 | "file_offset": 4, 66 | "total_compressed_size": 76, 67 | "ordinal": 0 68 | } 69 | ], 70 | "key_value_metadata": [ 71 | { 72 | "key": "ARROW:schema", 73 | "value": 
"/////3AAAAAQAAAAAAAKAAwABgAFAAgACgAAAAABBAAMAAAACAAIAAAABAAIAAAABAAAAAEAAAAUAAAAEAAUAAgABgAHAAwAAAAQABAAAAAAAAEDEAAAABgAAAAEAAAAAAAAAAEAAAB4AAAABAAEAAQAAAAAAAAA" 74 | } 75 | ], 76 | "created_by": "parquet-cpp-arrow version 15.0.0-SNAPSHOT", 77 | "metadata_length": 346 78 | } 79 | -------------------------------------------------------------------------------- /src/constants.js: -------------------------------------------------------------------------------- 1 | 2 | /** @type {import('../src/types.d.ts').ParquetType[]} */ 3 | export const ParquetTypes = [ 4 | 'BOOLEAN', 5 | 'INT32', 6 | 'INT64', 7 | 'INT96', // deprecated 8 | 'FLOAT', 9 | 'DOUBLE', 10 | 'BYTE_ARRAY', 11 | 'FIXED_LEN_BYTE_ARRAY', 12 | ] 13 | 14 | /** @type {import('../src/types.d.ts').Encoding[]} */ 15 | export const Encodings = [ 16 | 'PLAIN', 17 | 'GROUP_VAR_INT', // deprecated 18 | 'PLAIN_DICTIONARY', 19 | 'RLE', 20 | 'BIT_PACKED', // deprecated 21 | 'DELTA_BINARY_PACKED', 22 | 'DELTA_LENGTH_BYTE_ARRAY', 23 | 'DELTA_BYTE_ARRAY', 24 | 'RLE_DICTIONARY', 25 | 'BYTE_STREAM_SPLIT', 26 | ] 27 | 28 | /** @type {import('../src/types.d.ts').FieldRepetitionType[]} */ 29 | export const FieldRepetitionTypes = [ 30 | 'REQUIRED', 31 | 'OPTIONAL', 32 | 'REPEATED', 33 | ] 34 | 35 | /** @type {import('../src/types.d.ts').ConvertedType[]} */ 36 | export const ConvertedTypes = [ 37 | 'UTF8', 38 | 'MAP', 39 | 'MAP_KEY_VALUE', 40 | 'LIST', 41 | 'ENUM', 42 | 'DECIMAL', 43 | 'DATE', 44 | 'TIME_MILLIS', 45 | 'TIME_MICROS', 46 | 'TIMESTAMP_MILLIS', 47 | 'TIMESTAMP_MICROS', 48 | 'UINT_8', 49 | 'UINT_16', 50 | 'UINT_32', 51 | 'UINT_64', 52 | 'INT_8', 53 | 'INT_16', 54 | 'INT_32', 55 | 'INT_64', 56 | 'JSON', 57 | 'BSON', 58 | 'INTERVAL', 59 | ] 60 | 61 | /** @type {import('../src/types.d.ts').CompressionCodec[]} */ 62 | export const CompressionCodecs = [ 63 | 'UNCOMPRESSED', 64 | 'SNAPPY', 65 | 'GZIP', 66 | 'LZO', 67 | 'BROTLI', 68 | 'LZ4', 69 | 'ZSTD', 70 | 'LZ4_RAW', 71 | ] 72 | 73 | /** @type 
{import('../src/types.d.ts').PageType[]} */ 74 | export const PageTypes = [ 75 | 'DATA_PAGE', 76 | 'INDEX_PAGE', 77 | 'DICTIONARY_PAGE', 78 | 'DATA_PAGE_V2', 79 | ] 80 | 81 | /** @type {import('../src/types.d.ts').BoundaryOrder[]} */ 82 | export const BoundaryOrders = [ 83 | 'UNORDERED', 84 | 'ASCENDING', 85 | 'DESCENDING', 86 | ] 87 | 88 | /** @type {import('../src/types.d.ts').EdgeInterpolationAlgorithm[]} */ 89 | export const EdgeInterpolationAlgorithms = [ 90 | 'SPHERICAL', 91 | 'VINCENTY', 92 | 'THOMAS', 93 | 'ANDOYER', 94 | 'KARNEY', 95 | ] 96 | -------------------------------------------------------------------------------- /test/files/duckdb3734.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | "tt0000001", 4 | { 5 | "category": "self", 6 | "characters": ["[\"Self\"]"], 7 | "job": "\\N", 8 | "nconst": "nm1588970", 9 | "ordering": 1 10 | } 11 | ], 12 | [ 13 | "tt0000001", 14 | { 15 | "category": "director", 16 | "characters": ["\\N"], 17 | "job": "\\N", 18 | "nconst": "nm0005690", 19 | "ordering": 2 20 | } 21 | ], 22 | [ 23 | "tt0000001", 24 | { 25 | "category": "cinematographer", 26 | "characters": ["\\N"], 27 | "job": "director of photography", 28 | "nconst": "nm0374658", 29 | "ordering": 3 30 | } 31 | ], 32 | [ 33 | "tt0000002", 34 | { 35 | "category": "director", 36 | "characters": ["\\N"], 37 | "job": "\\N", 38 | "nconst": "nm0721526", 39 | "ordering": 1 40 | } 41 | ], 42 | [ 43 | "tt0000002", 44 | { 45 | "category": "composer", 46 | "characters": ["\\N"], 47 | "job": "\\N", 48 | "nconst": "nm1335271", 49 | "ordering": 2 50 | } 51 | ], 52 | [ 53 | "tt0000003", 54 | { 55 | "category": "director", 56 | "characters": ["\\N"], 57 | "job": "\\N", 58 | "nconst": "nm0721526", 59 | "ordering": 1 60 | } 61 | ], 62 | [ 63 | "tt0000003", 64 | { 65 | "category": "producer", 66 | "characters": ["\\N"], 67 | "job": "producer", 68 | "nconst": "nm1770680", 69 | "ordering": 2 70 | } 71 | ], 72 | [ 73 | "tt0000003", 74 
| { 75 | "category": "composer", 76 | "characters": ["\\N"], 77 | "job": "\\N", 78 | "nconst": "nm1335271", 79 | "ordering": 3 80 | } 81 | ], 82 | [ 83 | "tt0000003", 84 | { 85 | "category": "editor", 86 | "characters": ["\\N"], 87 | "job": "\\N", 88 | "nconst": "nm5442200", 89 | "ordering": 4 90 | } 91 | ], 92 | [ 93 | "tt0000004", 94 | { 95 | "category": "director", 96 | "characters": ["\\N"], 97 | "job": "\\N", 98 | "nconst": "nm0721526", 99 | "ordering": 1 100 | } 101 | ] 102 | ] 103 | -------------------------------------------------------------------------------- /test/files/duckdb2557.metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "schema": [ 4 | { 5 | "name": "root", 6 | "num_children": 3 7 | }, 8 | { 9 | "type": "BYTE_ARRAY", 10 | "repetition_type": "REPEATED", 11 | "name": "stringArray", 12 | "converted_type": "UTF8" 13 | }, 14 | { 15 | "type": "INT32", 16 | "repetition_type": "REPEATED", 17 | "name": "intArray" 18 | }, 19 | { 20 | "type": "DOUBLE", 21 | "repetition_type": "REPEATED", 22 | "name": "doubleArray" 23 | } 24 | ], 25 | "num_rows": 100, 26 | "row_groups": [ 27 | { 28 | "columns": [ 29 | { 30 | "file_offset": 4802, 31 | "meta_data": { 32 | "type": "BYTE_ARRAY", 33 | "encodings": [ 34 | "RLE", 35 | "PLAIN" 36 | ], 37 | "path_in_schema": [ 38 | "stringArray" 39 | ], 40 | "codec": "UNCOMPRESSED", 41 | "num_values": 449, 42 | "total_uncompressed_size": 4798, 43 | "total_compressed_size": 4798, 44 | "data_page_offset": 4 45 | } 46 | }, 47 | { 48 | "file_offset": 5874, 49 | "meta_data": { 50 | "type": "INT32", 51 | "encodings": [ 52 | "RLE", 53 | "PLAIN" 54 | ], 55 | "path_in_schema": [ 56 | "intArray" 57 | ], 58 | "codec": "UNCOMPRESSED", 59 | "num_values": 237, 60 | "total_uncompressed_size": 1038, 61 | "total_compressed_size": 1038, 62 | "data_page_offset": 4836 63 | } 64 | }, 65 | { 66 | "file_offset": 7663, 67 | "meta_data": { 68 | "type": "DOUBLE", 69 | "encodings": [ 
70 | "RLE", 71 | "PLAIN" 72 | ], 73 | "path_in_schema": [ 74 | "doubleArray" 75 | ], 76 | "codec": "UNCOMPRESSED", 77 | "num_values": 225, 78 | "total_uncompressed_size": 1757, 79 | "total_compressed_size": 1757, 80 | "data_page_offset": 5906 81 | } 82 | } 83 | ], 84 | "total_byte_size": 7694, 85 | "num_rows": 100 86 | } 87 | ], 88 | "key_value_metadata": [], 89 | "created_by": "parquet.js", 90 | "metadata_length": 211 91 | } 92 | -------------------------------------------------------------------------------- /test/files/struct_strings.metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "schema": [ 4 | { 5 | "name": "test", 6 | "num_children": 1 7 | }, 8 | { 9 | "repetition_type": "REQUIRED", 10 | "name": "inner", 11 | "num_children": 2 12 | }, 13 | { 14 | "type": "BYTE_ARRAY", 15 | "repetition_type": "OPTIONAL", 16 | "name": "str_field", 17 | "converted_type": "UTF8", 18 | "logical_type": { 19 | "type": "STRING" 20 | } 21 | }, 22 | { 23 | "type": "DOUBLE", 24 | "repetition_type": "OPTIONAL", 25 | "name": "f64_field" 26 | } 27 | ], 28 | "num_rows": 2, 29 | "row_groups": [ 30 | { 31 | "columns": [ 32 | { 33 | "file_offset": 4, 34 | "meta_data": { 35 | "type": "BYTE_ARRAY", 36 | "encodings": [ 37 | "RLE", 38 | "PLAIN" 39 | ], 40 | "path_in_schema": [ 41 | "inner", 42 | "str_field" 43 | ], 44 | "codec": "SNAPPY", 45 | "num_values": 2, 46 | "total_uncompressed_size": 32, 47 | "total_compressed_size": 34, 48 | "key_value_metadata": [], 49 | "data_page_offset": 4, 50 | "statistics": { 51 | "null_count": 1, 52 | "distinct_count": 1 53 | } 54 | } 55 | }, 56 | { 57 | "file_offset": 38, 58 | "meta_data": { 59 | "type": "DOUBLE", 60 | "encodings": [ 61 | "RLE", 62 | "PLAIN" 63 | ], 64 | "path_in_schema": [ 65 | "inner", 66 | "f64_field" 67 | ], 68 | "codec": "SNAPPY", 69 | "num_values": 2, 70 | "total_uncompressed_size": 31, 71 | "total_compressed_size": 33, 72 | "key_value_metadata": [], 73 | 
"data_page_offset": 38, 74 | "statistics": { 75 | "null_count": 1, 76 | "distinct_count": 1, 77 | "max_value": 1.23, 78 | "min_value": 1.23 79 | } 80 | } 81 | } 82 | ], 83 | "total_byte_size": 0, 84 | "num_rows": 2 85 | } 86 | ], 87 | "key_value_metadata": [], 88 | "created_by": "parquet-go", 89 | "metadata_length": 203 90 | } 91 | -------------------------------------------------------------------------------- /eslint.config.js: -------------------------------------------------------------------------------- 1 | import javascript from '@eslint/js' 2 | import jsdoc from 'eslint-plugin-jsdoc' 3 | import globals from 'globals' 4 | 5 | export default [ 6 | { 7 | plugins: { 8 | jsdoc, 9 | }, 10 | 11 | languageOptions: { 12 | globals: { 13 | ...globals.browser, 14 | ...globals.node, 15 | }, 16 | }, 17 | 18 | rules: { 19 | ...javascript.configs.recommended.rules, 20 | 'arrow-spacing': 'error', 21 | camelcase: 'off', 22 | 'comma-spacing': 'error', 23 | 'comma-dangle': ['error', { 24 | arrays: 'always-multiline', 25 | objects: 'always-multiline', 26 | imports: 'always-multiline', 27 | exports: 'always-multiline', 28 | functions: 'never', 29 | }], 30 | 'eol-last': 'error', 31 | eqeqeq: 'error', 32 | 'func-style': ['error', 'declaration'], 33 | indent: ['error', 2], 34 | 'jsdoc/check-param-names': 'error', 35 | 'jsdoc/check-property-names': 'error', 36 | 'jsdoc/check-tag-names': 'error', 37 | 'jsdoc/require-param': 'error', 38 | 'jsdoc/require-param-type': 'error', 39 | 'jsdoc/require-returns': 'error', 40 | 'jsdoc/require-returns-type': 'error', 41 | 'jsdoc/sort-tags': 'error', 42 | 'key-spacing': 'error', 43 | 'keyword-spacing': 'error', 44 | 'no-constant-condition': 'off', 45 | 'no-extra-parens': 'error', 46 | 'no-multi-spaces': 'error', 47 | 'no-trailing-spaces': 'error', 48 | 'no-undef': 'error', 49 | 'no-unused-vars': 'error', 50 | 'no-useless-concat': 'error', 51 | 'no-useless-rename': 'error', 52 | 'no-useless-return': 'error', 53 | 'no-var': 'error', 54 | 
'object-curly-spacing': ['error', 'always'], 55 | 'object-shorthand': 'error', 56 | 'prefer-const': 'error', 57 | 'prefer-destructuring': ['warn', { 58 | object: true, 59 | array: false, 60 | }], 61 | 'prefer-exponentiation-operator': 'error', 62 | 'prefer-promise-reject-errors': 'error', 63 | quotes: ['error', 'single'], 64 | 'require-await': 'warn', 65 | semi: ['error', 'never'], 66 | 'sort-imports': ['error', { 67 | ignoreDeclarationSort: true, 68 | ignoreMemberSort: false, 69 | memberSyntaxSortOrder: ['none', 'all', 'multiple', 'single'], 70 | }], 71 | 'space-infix-ops': 'error', 72 | }, 73 | }, 74 | ] 75 | -------------------------------------------------------------------------------- /test/files/issue115decimal.metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 2, 3 | "schema": [ 4 | { 5 | "repetition_type": "REQUIRED", 6 | "name": "schema", 7 | "num_children": 1 8 | }, 9 | { 10 | "type": "FIXED_LEN_BYTE_ARRAY", 11 | "type_length": 13, 12 | "repetition_type": "OPTIONAL", 13 | "name": "amount", 14 | "converted_type": "DECIMAL", 15 | "scale": 2, 16 | "precision": 29, 17 | "logical_type": { 18 | "type": "DECIMAL", 19 | "scale": 2, 20 | "precision": 29 21 | } 22 | } 23 | ], 24 | "num_rows": 1, 25 | "row_groups": [ 26 | { 27 | "columns": [ 28 | { 29 | "file_offset": 0, 30 | "meta_data": { 31 | "type": "FIXED_LEN_BYTE_ARRAY", 32 | "encodings": [ 33 | "PLAIN", 34 | "RLE", 35 | "RLE_DICTIONARY" 36 | ], 37 | "path_in_schema": [ 38 | "amount" 39 | ], 40 | "codec": "SNAPPY", 41 | "num_values": 1, 42 | "total_uncompressed_size": 117, 43 | "total_compressed_size": 121, 44 | "data_page_offset": 33, 45 | "dictionary_page_offset": 4, 46 | "statistics": { 47 | "max": -12345.67, 48 | "min": -12345.67, 49 | "null_count": 0, 50 | "max_value": -12345.67, 51 | "min_value": -12345.67 52 | }, 53 | "encoding_stats": [ 54 | { 55 | "page_type": "DICTIONARY_PAGE", 56 | "encoding": "PLAIN", 57 | "count": 1 58 | }, 59 | { 
60 | "page_type": "DATA_PAGE", 61 | "encoding": "RLE_DICTIONARY", 62 | "count": 1 63 | } 64 | ] 65 | } 66 | } 67 | ], 68 | "total_byte_size": 117, 69 | "num_rows": 1, 70 | "file_offset": 4, 71 | "total_compressed_size": 121, 72 | "ordinal": 0 73 | } 74 | ], 75 | "key_value_metadata": [ 76 | { 77 | "key": "ARROW:schema", 78 | "value": "/////4AAAAAQAAAAAAAKAAwABgAFAAgACgAAAAABBAAMAAAACAAIAAAABAAIAAAABAAAAAEAAAAUAAAAEAAUAAgABgAHAAwAAAAQABAAAAAAAAEHEAAAACAAAAAEAAAAAAAAAAYAAABhbW91bnQAAAgADAAEAAgACAAAAB0AAAACAAAAAAAAAA==" 79 | } 80 | ], 81 | "created_by": "parquet-cpp-arrow version 19.0.1", 82 | "metadata_length": 424 83 | } 84 | -------------------------------------------------------------------------------- /test/rowend_struct.test.js: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from 'vitest' 2 | import { parquetReadObjects } from '../src/index.js' 3 | import { asyncBufferFromFile } from '../src/node.js' 4 | 5 | /** 6 | * Test for issue #147: struct children with different page counts 7 | * cause "parquet struct parsing error" when using rowEnd. 8 | * 9 | * The bug occurs when: 10 | * 1. A struct has multiple child columns 11 | * 2. One child has multiple pages (large data) 12 | * 3. Another child has fewer pages (small/compressible data) 13 | * 4. rowEnd is used to limit the number of rows read 14 | * 15 | * The root cause is in column.js - for non-flat columns, all pages 16 | * are read, but truncation only affects the last chunk. If a column 17 | * has multiple chunks (pages), earlier chunks aren't truncated, 18 | * resulting in mismatched array lengths during struct assembly. 
19 | * 20 | * Test file: rowend_struct.parquet (created with pyarrow) 21 | * - 1050 rows 22 | * - struct column 's' with children: 23 | * - 'a': unique strings (2 data pages due to snappy compression) 24 | * - 'b': same string "x" (1 data page) 25 | */ 26 | describe('rowEnd with struct columns', () => { 27 | it('reads all rows without error', async () => { 28 | const file = await asyncBufferFromFile('test/files/rowend_struct.parquet') 29 | const rows = await parquetReadObjects({ file }) 30 | expect(rows.length).toBe(1050) 31 | expect(rows[0]).toEqual({ s: { a: 'v0000', b: 'x' } }) 32 | expect(rows[1049]).toEqual({ s: { a: 'v1049', b: 'x' } }) 33 | }) 34 | 35 | it('reads partial rows with rowEnd', async () => { 36 | const file = await asyncBufferFromFile('test/files/rowend_struct.parquet') 37 | // This should return 10 rows but currently throws 38 | // "parquet struct parsing error" due to mismatched child array lengths 39 | const rows = await parquetReadObjects({ file, rowEnd: 10 }) 40 | expect(rows.length).toBe(10) 41 | expect(rows[0]).toEqual({ s: { a: 'v0000', b: 'x' } }) 42 | expect(rows[9]).toEqual({ s: { a: 'v0009', b: 'x' } }) 43 | }) 44 | 45 | it('reads middle rows with rowStart and rowEnd', async () => { 46 | const file = await asyncBufferFromFile('test/files/rowend_struct.parquet') 47 | const rows = await parquetReadObjects({ file, rowStart: 100, rowEnd: 110 }) 48 | expect(rows.length).toBe(10) 49 | expect(rows[0]).toEqual({ s: { a: 'v0100', b: 'x' } }) 50 | expect(rows[9]).toEqual({ s: { a: 'v0109', b: 'x' } }) 51 | }) 52 | }) 53 | -------------------------------------------------------------------------------- /test/files/strings.metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 2, 3 | "schema": [ 4 | { 5 | "name": "root", 6 | "num_children": 3 7 | }, 8 | { 9 | "type": "BYTE_ARRAY", 10 | "name": "bytes" 11 | }, 12 | { 13 | "type": "BYTE_ARRAY", 14 | "name": "c_utf8", 15 | 
"converted_type": "UTF8" 16 | }, 17 | { 18 | "type": "BYTE_ARRAY", 19 | "name": "l_utf8", 20 | "logical_type": { 21 | "type": "STRING" 22 | } 23 | } 24 | ], 25 | "num_rows": 4, 26 | "row_groups": [ 27 | { 28 | "columns": [ 29 | { 30 | "file_offset": 4, 31 | "meta_data": { 32 | "type": "BYTE_ARRAY", 33 | "encodings": ["PLAIN"], 34 | "path_in_schema": ["bytes"], 35 | "codec": "UNCOMPRESSED", 36 | "num_values": 4, 37 | "total_uncompressed_size": 62, 38 | "total_compressed_size": 62, 39 | "data_page_offset": 4, 40 | "statistics": { 41 | "null_count": 0, 42 | "max_value": "delta", 43 | "min_value": "alpha" 44 | } 45 | } 46 | }, 47 | { 48 | "file_offset": 66, 49 | "meta_data": { 50 | "type": "BYTE_ARRAY", 51 | "encodings": ["PLAIN"], 52 | "path_in_schema": ["c_utf8"], 53 | "codec": "UNCOMPRESSED", 54 | "num_values": 4, 55 | "total_uncompressed_size": 62, 56 | "total_compressed_size": 62, 57 | "data_page_offset": 66, 58 | "statistics": { 59 | "null_count": 0, 60 | "max_value": "delta", 61 | "min_value": "alpha" 62 | } 63 | } 64 | }, 65 | { 66 | "file_offset": 128, 67 | "meta_data": { 68 | "type": "BYTE_ARRAY", 69 | "encodings": ["PLAIN"], 70 | "path_in_schema": ["l_utf8"], 71 | "codec": "UNCOMPRESSED", 72 | "num_values": 4, 73 | "total_uncompressed_size": 62, 74 | "total_compressed_size": 62, 75 | "data_page_offset": 128, 76 | "statistics": { 77 | "null_count": 0, 78 | "max_value": "delta", 79 | "min_value": "alpha" 80 | } 81 | } 82 | } 83 | ], 84 | "total_byte_size": 186, 85 | "num_rows": 4 86 | } 87 | ], 88 | "created_by": "hyparquet", 89 | "metadata_length": 219 90 | } 91 | -------------------------------------------------------------------------------- /test/files/continued_page.metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 2, 3 | "schema": [ 4 | { 5 | "repetition_type": "REQUIRED", 6 | "name": "schema", 7 | "num_children": 1 8 | }, 9 | { 10 | "repetition_type": "OPTIONAL", 11 | "name": "int_list", 
12 | "num_children": 1, 13 | "converted_type": "LIST", 14 | "logical_type": { 15 | "type": "LIST" 16 | } 17 | }, 18 | { 19 | "repetition_type": "REPEATED", 20 | "name": "list", 21 | "num_children": 1 22 | }, 23 | { 24 | "type": "INT32", 25 | "repetition_type": "OPTIONAL", 26 | "name": "element" 27 | } 28 | ], 29 | "num_rows": 100, 30 | "row_groups": [ 31 | { 32 | "columns": [ 33 | { 34 | "file_offset": 0, 35 | "meta_data": { 36 | "type": "INT32", 37 | "encodings": [ 38 | "PLAIN", 39 | "RLE", 40 | "RLE_DICTIONARY" 41 | ], 42 | "path_in_schema": [ 43 | "int_list", 44 | "list", 45 | "element" 46 | ], 47 | "codec": "SNAPPY", 48 | "num_values": 2000, 49 | "total_uncompressed_size": 2692, 50 | "total_compressed_size": 2338, 51 | "data_page_offset": 426, 52 | "dictionary_page_offset": 4, 53 | "statistics": { 54 | "max": 99, 55 | "min": 0, 56 | "null_count": 0, 57 | "max_value": 99, 58 | "min_value": 0 59 | }, 60 | "encoding_stats": [ 61 | { 62 | "page_type": "DICTIONARY_PAGE", 63 | "encoding": "PLAIN", 64 | "count": 1 65 | }, 66 | { 67 | "page_type": "DATA_PAGE", 68 | "encoding": "RLE_DICTIONARY", 69 | "count": 2 70 | } 71 | ] 72 | } 73 | } 74 | ], 75 | "total_byte_size": 2692, 76 | "num_rows": 100, 77 | "file_offset": 4, 78 | "total_compressed_size": 2338, 79 | "ordinal": 0 80 | } 81 | ], 82 | "key_value_metadata": [ 83 | { 84 | "key": "ARROW:schema", 85 | "value": "/////7AAAAAQAAAAAAAKAAwABgAFAAgACgAAAAABBAAMAAAACAAIAAAABAAIAAAABAAAAAEAAAAEAAAAzP///wAAAQwUAAAAJAAAAAQAAAABAAAALAAAAAgAAABpbnRfbGlzdAAAAAAEAAQABAAAABAAFAAIAAYABwAMAAAAEAAQAAAAAAABAhAAAAAgAAAABAAAAAAAAAAEAAAAaXRlbQAAAAAIAAwACAAHAAgAAAAAAAABIAAAAA==" 86 | } 87 | ], 88 | "created_by": "parquet-cpp-arrow version 19.0.1", 89 | "metadata_length": 488 90 | } 91 | -------------------------------------------------------------------------------- /test/files/repeated_no_annotation.metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "schema": [ 4 | 
{ 5 | "name": "user", 6 | "num_children": 2 7 | }, 8 | { 9 | "type": "INT32", 10 | "repetition_type": "REQUIRED", 11 | "name": "id" 12 | }, 13 | { 14 | "repetition_type": "OPTIONAL", 15 | "name": "phoneNumbers", 16 | "num_children": 1 17 | }, 18 | { 19 | "repetition_type": "REPEATED", 20 | "name": "phone", 21 | "num_children": 2 22 | }, 23 | { 24 | "type": "INT64", 25 | "repetition_type": "REQUIRED", 26 | "name": "number" 27 | }, 28 | { 29 | "type": "BYTE_ARRAY", 30 | "repetition_type": "OPTIONAL", 31 | "name": "kind", 32 | "converted_type": "UTF8" 33 | } 34 | ], 35 | "num_rows": 0, 36 | "row_groups": [ 37 | { 38 | "columns": [ 39 | { 40 | "file_offset": 64, 41 | "meta_data": { 42 | "type": "INT32", 43 | "encodings": [ 44 | "PLAIN", 45 | "RLE_DICTIONARY" 46 | ], 47 | "path_in_schema": [ 48 | "id" 49 | ], 50 | "codec": "UNCOMPRESSED", 51 | "num_values": 6, 52 | "total_uncompressed_size": 60, 53 | "total_compressed_size": 60, 54 | "data_page_offset": 42, 55 | "dictionary_page_offset": 4 56 | } 57 | }, 58 | { 59 | "file_offset": 173, 60 | "meta_data": { 61 | "type": "INT64", 62 | "encodings": [ 63 | "PLAIN", 64 | "RLE_DICTIONARY" 65 | ], 66 | "path_in_schema": [ 67 | "phoneNumbers", 68 | "phone", 69 | "number" 70 | ], 71 | "codec": "UNCOMPRESSED", 72 | "num_values": 8, 73 | "total_uncompressed_size": 80, 74 | "total_compressed_size": 80, 75 | "data_page_offset": 139, 76 | "dictionary_page_offset": 93 77 | } 78 | }, 79 | { 80 | "file_offset": 294, 81 | "meta_data": { 82 | "type": "BYTE_ARRAY", 83 | "encodings": [ 84 | "PLAIN", 85 | "RLE_DICTIONARY" 86 | ], 87 | "path_in_schema": [ 88 | "phoneNumbers", 89 | "phone", 90 | "kind" 91 | ], 92 | "codec": "UNCOMPRESSED", 93 | "num_values": 8, 94 | "total_uncompressed_size": 65, 95 | "total_compressed_size": 65, 96 | "data_page_offset": 261, 97 | "dictionary_page_offset": 229 98 | } 99 | } 100 | ], 101 | "total_byte_size": 205, 102 | "num_rows": 6 103 | } 104 | ], 105 | "created_by": "parquet-rs version 0.3.0 (build 
b45ce7cba2199f22d93269c150d8a83916c69b5e)", 106 | "metadata_length": 306 107 | } 108 | -------------------------------------------------------------------------------- /src/index.js: -------------------------------------------------------------------------------- 1 | export { readColumnIndex, readOffsetIndex } from './indexes.js' 2 | export { parquetMetadata, parquetMetadataAsync, parquetSchema } from './metadata.js' 3 | export { parquetRead, parquetReadObjects } from './read.js' 4 | export { parquetQuery } from './query.js' 5 | export { snappyUncompress } from './snappy.js' 6 | export { asyncBufferFromUrl, byteLengthFromUrl, cachedAsyncBuffer, flatten, toJson } from './utils.js' 7 | 8 | /** 9 | * Explicitly export types for use in downstream typescript projects through 10 | * `import { ParquetReadOptions } from 'hyparquet'` for example. 11 | * 12 | * @template {any} T 13 | * @typedef {import('../src/types.d.ts').Awaitable<T>} Awaitable 14 | */ 15 | /** 16 | * @typedef {import('../src/types.d.ts').AsyncBuffer} AsyncBuffer 17 | * @typedef {import('../src/types.d.ts').AsyncRowGroup} AsyncRowGroup 18 | * @typedef {import('../src/types.d.ts').DataReader} DataReader 19 | * @typedef {import('../src/types.d.ts').FileMetaData} FileMetaData 20 | * @typedef {import('../src/types.d.ts').SchemaTree} SchemaTree 21 | * @typedef {import('../src/types.d.ts').SchemaElement} SchemaElement 22 | * @typedef {import('../src/types.d.ts').ParquetType} ParquetType 23 | * @typedef {import('../src/types.d.ts').FieldRepetitionType} FieldRepetitionType 24 | * @typedef {import('../src/types.d.ts').ConvertedType} ConvertedType 25 | * @typedef {import('../src/types.d.ts').TimeUnit} TimeUnit 26 | * @typedef {import('../src/types.d.ts').LogicalType} LogicalType 27 | * @typedef {import('../src/types.d.ts').RowGroup} RowGroup 28 | * @typedef {import('../src/types.d.ts').ColumnChunk} ColumnChunk 29 | * @typedef {import('../src/types.d.ts').ColumnMetaData} ColumnMetaData 30 | * @typedef 
{import('../src/types.d.ts').Encoding} Encoding 31 | * @typedef {import('../src/types.d.ts').CompressionCodec} CompressionCodec 32 | * @typedef {import('../src/types.d.ts').Compressors} Compressors 33 | * @typedef {import('../src/types.d.ts').KeyValue} KeyValue 34 | * @typedef {import('../src/types.d.ts').Statistics} Statistics 35 | * @typedef {import('../src/types.d.ts').GeospatialStatistics} GeospatialStatistics 36 | * @typedef {import('../src/types.d.ts').BoundingBox} BoundingBox 37 | * @typedef {import('../src/types.d.ts').PageType} PageType 38 | * @typedef {import('../src/types.d.ts').PageHeader} PageHeader 39 | * @typedef {import('../src/types.d.ts').DataPageHeader} DataPageHeader 40 | * @typedef {import('../src/types.d.ts').DictionaryPageHeader} DictionaryPageHeader 41 | * @typedef {import('../src/types.d.ts').DecodedArray} DecodedArray 42 | * @typedef {import('../src/types.d.ts').OffsetIndex} OffsetIndex 43 | * @typedef {import('../src/types.d.ts').ColumnIndex} ColumnIndex 44 | * @typedef {import('../src/types.d.ts').BoundaryOrder} BoundaryOrder 45 | * @typedef {import('../src/types.d.ts').ColumnData} ColumnData 46 | * @typedef {import('../src/types.d.ts').SubColumnData} SubColumnData 47 | * @typedef {import('../src/types.d.ts').ParquetReadOptions} ParquetReadOptions 48 | * @typedef {import('../src/types.d.ts').MetadataOptions} MetadataOptions 49 | * @typedef {import('../src/types.d.ts').ParquetParsers} ParquetParsers 50 | * @typedef {import('../src/types.d.ts').ParquetQueryFilter} ParquetQueryFilter 51 | */ 52 | -------------------------------------------------------------------------------- /test/files/byte_stream_split.zstd.metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 2, 3 | "schema": [ 4 | { 5 | "repetition_type": "REQUIRED", 6 | "name": "schema", 7 | "num_children": 2 8 | }, 9 | { 10 | "type": "FLOAT", 11 | "repetition_type": "OPTIONAL", 12 | "name": "f32" 13 | }, 14 | { 15 | 
"type": "DOUBLE", 16 | "repetition_type": "OPTIONAL", 17 | "name": "f64" 18 | } 19 | ], 20 | "num_rows": 300, 21 | "row_groups": [ 22 | { 23 | "columns": [ 24 | { 25 | "file_offset": 1162, 26 | "meta_data": { 27 | "type": "FLOAT", 28 | "encodings": [ 29 | "RLE", 30 | "BYTE_STREAM_SPLIT" 31 | ], 32 | "path_in_schema": [ 33 | "f32" 34 | ], 35 | "codec": "ZSTD", 36 | "num_values": 300, 37 | "total_uncompressed_size": 1255, 38 | "total_compressed_size": 1158, 39 | "data_page_offset": 4, 40 | "statistics": { 41 | "max": 2.3831448554992676, 42 | "min": -2.772592782974243, 43 | "null_count": 0, 44 | "max_value": 2.3831448554992676, 45 | "min_value": -2.772592782974243 46 | }, 47 | "encoding_stats": [ 48 | { 49 | "page_type": "DATA_PAGE", 50 | "encoding": "BYTE_STREAM_SPLIT", 51 | "count": 1 52 | } 53 | ] 54 | } 55 | }, 56 | { 57 | "file_offset": 3513, 58 | "meta_data": { 59 | "type": "DOUBLE", 60 | "encodings": [ 61 | "RLE", 62 | "BYTE_STREAM_SPLIT" 63 | ], 64 | "path_in_schema": [ 65 | "f64" 66 | ], 67 | "codec": "ZSTD", 68 | "num_values": 300, 69 | "total_uncompressed_size": 2471, 70 | "total_compressed_size": 2283, 71 | "data_page_offset": 1230, 72 | "statistics": { 73 | "max": 2.6962240525635797, 74 | "min": -3.0461430547999266, 75 | "null_count": 0, 76 | "max_value": 2.6962240525635797, 77 | "min_value": -3.0461430547999266 78 | }, 79 | "encoding_stats": [ 80 | { 81 | "page_type": "DATA_PAGE", 82 | "encoding": "BYTE_STREAM_SPLIT", 83 | "count": 1 84 | } 85 | ] 86 | } 87 | } 88 | ], 89 | "total_byte_size": 3726, 90 | "num_rows": 300, 91 | "file_offset": 4, 92 | "total_compressed_size": 3441, 93 | "ordinal": 0 94 | } 95 | ], 96 | "key_value_metadata": [ 97 | { 98 | "key": "ARROW:schema", 99 | "value": "/////6AAAAAQAAAAAAAKAAwABgAFAAgACgAAAAABBAAMAAAACAAIAAAABAAIAAAABAAAAAIAAABAAAAABAAAANj///8AAAEDEAAAABQAAAAEAAAAAAAAAAMAAABmNjQAxv///wAAAgAQABQACAAGAAcADAAAABAAEAAAAAAAAQMQAAAAHAAAAAQAAAAAAAAAAwAAAGYzMgAAAAYACAAGAAYAAAAAAAEA" 100 | } 101 | ], 102 | "created_by": 
"parquet-cpp-arrow version 14.0.2", 103 | "metadata_length": 498 104 | } 105 | -------------------------------------------------------------------------------- /test/indexes.test.js: -------------------------------------------------------------------------------- 1 | import fs from 'fs' 2 | import { describe, expect, it } from 'vitest' 3 | import { parquetMetadata, toJson } from '../src/index.js' 4 | import { readColumnIndex, readOffsetIndex } from '../src/indexes.js' 5 | import { asyncBufferFromFile } from '../src/node.js' 6 | import { getSchemaPath } from '../src/schema.js' 7 | import { fileToJson } from './helpers.js' 8 | 9 | describe('readColumnIndex', () => { 10 | const columnIndexesFiles = fs.readdirSync('test/files').filter(f => f.endsWith('.column_indexes.json')) 11 | const parquetFiles = columnIndexesFiles.map(f => f.replace(/.column_indexes.json$/i, '.parquet')) 12 | 13 | parquetFiles.forEach((file, i) => { 14 | it(`parse column indexes from ${file}`, async () => { 15 | const arrayBuffer = await readFileToArrayBuffer(`test/files/${file}`) 16 | const metadata = parquetMetadata(arrayBuffer) 17 | 18 | const result = metadata.row_groups.map((rowGroup) => rowGroup.columns.map((column) => { 19 | if (column.column_index_offset === undefined || column.column_index_length === undefined) return null 20 | const columnIndexOffset = Number(column.column_index_offset) 21 | const columnIndexLength = Number(column.column_index_length) 22 | const columnIndexArrayBuffer = arrayBuffer.slice(columnIndexOffset, columnIndexOffset + columnIndexLength) 23 | const columnIndexReader = { view: new DataView(columnIndexArrayBuffer), offset: 0 } 24 | const schemaPath = getSchemaPath(metadata.schema, column.meta_data?.path_in_schema ?? 
[]) 25 | return readColumnIndex(columnIndexReader, schemaPath.at(-1)?.element || { name: '' }) 26 | })) 27 | const expected = fileToJson(`test/files/${columnIndexesFiles[i]}`) 28 | expect(toJson(result)).toEqual(expected) 29 | }) 30 | }) 31 | }) 32 | 33 | describe('readOffsetIndex', () => { 34 | const offsetIndexesFiles = fs.readdirSync('test/files').filter(f => f.endsWith('.offset_indexes.json')) 35 | const parquetFiles = offsetIndexesFiles.map(f => f.replace(/.offset_indexes.json$/i, '.parquet')) 36 | 37 | parquetFiles.forEach((file, i) => { 38 | it(`parse offset indexes from ${file}`, async () => { 39 | const arrayBuffer = await readFileToArrayBuffer(`test/files/${file}`) 40 | const metadata = parquetMetadata(arrayBuffer) 41 | 42 | const result = metadata.row_groups.map((rowGroup) => rowGroup.columns.map((column) => { 43 | if (column.offset_index_offset === undefined || column.offset_index_length === undefined) return null 44 | const offsetIndexOffset = Number(column.offset_index_offset) 45 | const offsetIndexLength = Number(column.offset_index_length) 46 | const offsetIndexArrayBuffer = arrayBuffer.slice(offsetIndexOffset, offsetIndexOffset + offsetIndexLength) 47 | const offsetIndexReader = { view: new DataView(offsetIndexArrayBuffer), offset: 0 } 48 | return readOffsetIndex(offsetIndexReader) 49 | })) 50 | const expected = fileToJson(`test/files/${offsetIndexesFiles[i]}`) 51 | expect(toJson(result)).toEqual(expected) 52 | }) 53 | }) 54 | }) 55 | 56 | /** 57 | * @param {string} filename 58 | * @returns {Promise} 59 | */ 60 | function readFileToArrayBuffer(filename) { 61 | return asyncBufferFromFile(filename).then((buffer) => buffer.slice(0)) 62 | } 63 | -------------------------------------------------------------------------------- /test/snappy.test.js: -------------------------------------------------------------------------------- 1 | import fs from 'fs' 2 | import { describe, expect, it } from 'vitest' 3 | import { snappyUncompress } from 
'../src/snappy.js' 4 | 5 | describe('snappy uncompress', () => { 6 | it('decompresses valid input correctly', () => { 7 | const testCases = [ 8 | { compressed: [0x00], expected: '' }, 9 | { compressed: [0x01, 0x00, 0x68], expected: 'h' }, 10 | { compressed: [0x02, 0x04, 0x68, 0x79], expected: 'hy' }, 11 | { compressed: [0x03, 0x08, 0x68, 0x79, 0x70], expected: 'hyp' }, 12 | { compressed: [0x05, 0x10, 0x68, 0x79, 0x70, 0x65, 0x72], expected: 'hyper' }, 13 | { 14 | compressed: [0x0a, 0x24, 0x68, 0x79, 0x70, 0x65, 0x72, 0x70, 0x61, 0x72, 0x61, 0x6d], 15 | expected: 'hyperparam', 16 | }, 17 | { 18 | compressed: [0x15, 0x08, 0x68, 0x79, 0x70, 0x46, 0x03, 0x00], 19 | expected: 'hyphyphyphyphyphyphyp', 20 | }, 21 | { 22 | // from rowgroups.parquet 23 | compressed: [ 24 | 80, 4, 1, 0, 9, 1, 0, 2, 9, 7, 4, 0, 3, 13, 8, 0, 4, 13, 8, 0, 5, 13, 25 | 8, 0, 6, 13, 8, 0, 7, 13, 8, 0, 8, 13, 8, 60, 9, 0, 0, 0, 0, 0, 0, 0, 26 | 10, 0, 0, 0, 0, 0, 0, 0, 27 | ], 28 | expected: new Uint8Array([ 29 | 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 30 | 0, 4, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 31 | 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 32 | 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 33 | ]), 34 | }, 35 | // from datapage_v2.snappy.parquet 36 | { compressed: [2, 4, 0, 3], expected: new Uint8Array([0, 3]) }, 37 | { compressed: [ 6, 20, 2, 0, 0, 0, 3, 23], expected: new Uint8Array([2, 0, 0, 0, 3, 23]) }, 38 | ] 39 | 40 | for (const { compressed, expected } of testCases) { 41 | const output = new Uint8Array(expected.length) 42 | snappyUncompress(new Uint8Array(compressed), output) 43 | if (typeof expected === 'string') { 44 | const outputStr = new TextDecoder().decode(output) 45 | expect(outputStr).toBe(expected) 46 | } else { 47 | expect(output).toEqual(expected) // Uint8Array 48 | } 49 | } 50 | }) 51 | 52 | it('decompress hyparquet.jpg.snappy', async () => { 53 | const compressed = 
fs.readFileSync('test/files/hyparquet.jpg.snappy') 54 | const expected = fs.readFileSync('hyparquet.jpg') 55 | const output = new Uint8Array(expected.length) 56 | await snappyUncompress(compressed, output) 57 | expect(Array.from(output)).toEqual(Array.from(expected)) 58 | }) 59 | 60 | it('throws for invalid input', () => { 61 | const output = new Uint8Array(10) 62 | expect(() => snappyUncompress(new Uint8Array([]), output)) 63 | .toThrow('invalid snappy length header') 64 | expect(() => snappyUncompress(new Uint8Array([0xff]), output)) 65 | .toThrow('invalid snappy length header') 66 | expect(() => snappyUncompress(new Uint8Array([0x03, 0x61]), output)) 67 | .toThrow('missing eof marker') 68 | expect(() => snappyUncompress(new Uint8Array([0x03, 0xf1]), output)) 69 | .toThrow('missing eof marker') 70 | expect(() => snappyUncompress(new Uint8Array([0x02, 0x00, 0x68]), output)) 71 | .toThrow('premature end of input') 72 | }) 73 | }) 74 | -------------------------------------------------------------------------------- /test/asyncbuffer.test.js: -------------------------------------------------------------------------------- 1 | import { describe, expect, it, vi } from 'vitest' 2 | import { cachedAsyncBuffer } from '../src/utils.js' 3 | 4 | describe('cachedAsyncBuffer', () => { 5 | it('caches slices of a file to avoid multiple reads', async () => { 6 | const slice = vi.fn(async (start, end) => { 7 | // Simulate an async slice operation 8 | await new Promise(resolve => setTimeout(resolve, 10)) 9 | if (end === undefined) end = 1000 10 | if (start < 0) start = Math.max(0, 1000 + start) 11 | const buffer = new ArrayBuffer(end - start) 12 | return buffer 13 | }) 14 | const cachedFile = cachedAsyncBuffer( 15 | { byteLength: 1000, slice }, 16 | { minSize: 0 } 17 | ) 18 | 19 | // Test cache miss 20 | const slice1 = await cachedFile.slice(0, 100) 21 | expect(slice).toHaveBeenCalledTimes(1) 22 | expect(slice1.byteLength).toBe(100) 23 | 24 | // Test cache hit for the same range 25 
| const slice2 = await cachedFile.slice(0, 100) 26 | expect(slice).toHaveBeenCalledTimes(1) // No additional call 27 | expect(slice2).toBe(slice1) // Exact same object from cache 28 | 29 | // Test cache with undefined end, should use byteLength as end 30 | const slice3 = await cachedFile.slice(900) 31 | expect(slice).toHaveBeenCalledTimes(2) 32 | expect(slice3.byteLength).toBe(100) 33 | 34 | // Test cache hit for suffix-range 35 | const slice4 = await cachedFile.slice(-100) 36 | expect(slice).toHaveBeenCalledTimes(2) 37 | expect(slice4).toBe(slice3) 38 | 39 | // Verify that asking for the same end implicitly gets from cache 40 | const slice5 = await cachedFile.slice(900, 1000) 41 | expect(slice).toHaveBeenCalledTimes(2) 42 | expect(slice5).toBe(slice3) 43 | }) 44 | 45 | it('caches whole file if it is smaller than minSize', async () => { 46 | const slice = vi.fn(async (start, end) => { 47 | // Simulate an async slice operation 48 | await new Promise(resolve => setTimeout(resolve, 10)) 49 | if (end === undefined) end = 1000 50 | if (start < 0) start = Math.max(0, 1000 + start) 51 | const buffer = new ArrayBuffer(end - start) 52 | return buffer 53 | }) 54 | const cachedFile = cachedAsyncBuffer({ byteLength: 1000, slice }) 55 | 56 | // Test cache miss 57 | const slice1 = await cachedFile.slice(0, 100) 58 | expect(slice).toHaveBeenCalledTimes(1) 59 | expect(slice1.byteLength).toBe(100) 60 | 61 | // Test cache hit for the same range 62 | const slice2 = await cachedFile.slice(0, 100) 63 | expect(slice).toHaveBeenCalledTimes(1) // No additional call 64 | expect(slice2).toEqual(slice1) // Same data 65 | expect(slice2).not.toBe(slice1) // Different object 66 | 67 | // Test cache with undefined end, should use byteLength as end 68 | const slice3 = await cachedFile.slice(900) 69 | expect(slice).toHaveBeenCalledTimes(1) 70 | expect(slice3.byteLength).toBe(100) 71 | 72 | // Test cache hit for suffix-range 73 | const slice4 = await cachedFile.slice(-100) 74 | 
expect(slice).toHaveBeenCalledTimes(1) 75 | expect(slice4).toEqual(slice3) 76 | expect(slice4).not.toBe(slice3) 77 | 78 | // Verify that asking for the same end implicitly gets from cache 79 | const slice5 = await cachedFile.slice(900, 1000) 80 | expect(slice).toHaveBeenCalledTimes(1) 81 | expect(slice5).toEqual(slice3) 82 | expect(slice5).not.toBe(slice3) 83 | }) 84 | }) 85 | -------------------------------------------------------------------------------- /test/files/duckdb4442.offset_indexes.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | { 4 | "page_locations": [ 5 | { 6 | "offset": 4, 7 | "compressed_page_size": 34, 8 | "first_row_index": 0 9 | } 10 | ] 11 | }, 12 | { 13 | "page_locations": [ 14 | { 15 | "offset": 73, 16 | "compressed_page_size": 34, 17 | "first_row_index": 0 18 | } 19 | ] 20 | }, 21 | { 22 | "page_locations": [ 23 | { 24 | "offset": 142, 25 | "compressed_page_size": 34, 26 | "first_row_index": 0 27 | } 28 | ] 29 | }, 30 | { 31 | "page_locations": [ 32 | { 33 | "offset": 207, 34 | "compressed_page_size": 34, 35 | "first_row_index": 0 36 | } 37 | ] 38 | }, 39 | { 40 | "page_locations": [ 41 | { 42 | "offset": 271, 43 | "compressed_page_size": 34, 44 | "first_row_index": 0 45 | } 46 | ] 47 | }, 48 | { 49 | "page_locations": [ 50 | { 51 | "offset": 335, 52 | "compressed_page_size": 38, 53 | "first_row_index": 0 54 | } 55 | ] 56 | }, 57 | { 58 | "page_locations": [ 59 | { 60 | "offset": 403, 61 | "compressed_page_size": 34, 62 | "first_row_index": 0 63 | } 64 | ] 65 | }, 66 | { 67 | "page_locations": [ 68 | { 69 | "offset": 466, 70 | "compressed_page_size": 32, 71 | "first_row_index": 0 72 | } 73 | ] 74 | }, 75 | { 76 | "page_locations": [ 77 | { 78 | "offset": 525, 79 | "compressed_page_size": 34, 80 | "first_row_index": 0 81 | } 82 | ] 83 | }, 84 | { 85 | "page_locations": [ 86 | { 87 | "offset": 586, 88 | "compressed_page_size": 34, 89 | "first_row_index": 0 90 | } 91 | ] 92 | }, 93 | { 94 | 
"page_locations": [ 95 | { 96 | "offset": 659, 97 | "compressed_page_size": 34, 98 | "first_row_index": 0 99 | } 100 | ] 101 | }, 102 | { 103 | "page_locations": [ 104 | { 105 | "offset": 731, 106 | "compressed_page_size": 34, 107 | "first_row_index": 0 108 | } 109 | ] 110 | }, 111 | { 112 | "page_locations": [ 113 | { 114 | "offset": 802, 115 | "compressed_page_size": 34, 116 | "first_row_index": 0 117 | } 118 | ] 119 | }, 120 | { 121 | "page_locations": [ 122 | { 123 | "offset": 870, 124 | "compressed_page_size": 34, 125 | "first_row_index": 0 126 | } 127 | ] 128 | }, 129 | { 130 | "page_locations": [ 131 | { 132 | "offset": 938, 133 | "compressed_page_size": 34, 134 | "first_row_index": 0 135 | } 136 | ] 137 | }, 138 | { 139 | "page_locations": [ 140 | { 141 | "offset": 1009, 142 | "compressed_page_size": 34, 143 | "first_row_index": 0 144 | } 145 | ] 146 | }, 147 | { 148 | "page_locations": [ 149 | { 150 | "offset": 1079, 151 | "compressed_page_size": 30, 152 | "first_row_index": 0 153 | } 154 | ] 155 | } 156 | ] 157 | ] -------------------------------------------------------------------------------- /test/metadata.test.js: -------------------------------------------------------------------------------- 1 | import fs from 'fs' 2 | import { describe, expect, it } from 'vitest' 3 | import { parquetMetadata, parquetMetadataAsync, toJson } from '../src/index.js' 4 | import { asyncBufferFromFile } from '../src/node.js' 5 | import { fileToJson } from './helpers.js' 6 | 7 | const files = fs.readdirSync('test/files').filter(f => f.endsWith('.parquet')) 8 | 9 | describe('parquetMetadata', () => { 10 | files.forEach(file => { 11 | it(`parse metadata from ${file}`, async () => { 12 | const asyncBuffer = await asyncBufferFromFile(`test/files/${file}`) 13 | const arrayBuffer = await asyncBuffer.slice(0) 14 | const result = toJson(parquetMetadata(arrayBuffer)) 15 | const base = file.replace('.parquet', '') 16 | const expected = fileToJson(`test/files/${base}.metadata.json`) 
17 | expect(result, JSON.stringify(result, null, 2)).toEqual(expected) 18 | }) 19 | }) 20 | 21 | it('throws for arrayBuffer undefined', () => { 22 | // @ts-expect-error testing invalid input 23 | expect(() => parquetMetadata(undefined)).toThrow('parquet expected ArrayBuffer') 24 | }) 25 | 26 | it('throws for a too short file', () => { 27 | const arrayBuffer = new ArrayBuffer(0) 28 | expect(() => parquetMetadata(arrayBuffer)).toThrow('parquet file is too short') 29 | }) 30 | 31 | it('throws for invalid metadata length', () => { 32 | const arrayBuffer = new ArrayBuffer(12) 33 | const view = new DataView(arrayBuffer) 34 | view.setUint32(0, 0x31524150, true) // magic number PAR1 35 | view.setUint32(4, 1000, true) // 1000 bytes exceeds buffer 36 | view.setUint32(8, 0x31524150, true) // magic number PAR1 37 | expect(() => parquetMetadata(arrayBuffer)) 38 | .toThrow('parquet metadata length 1000 exceeds available buffer 4') 39 | }) 40 | 41 | it('throws for invalid magic number', () => { 42 | const arrayBuffer = new ArrayBuffer(8) 43 | expect(() => parquetMetadata(arrayBuffer)) 44 | .toThrow('parquet file invalid (footer != PAR1)') 45 | }) 46 | 47 | it('throws for invalid metadata length', () => { 48 | const { buffer } = new Uint8Array([255, 255, 255, 255, 80, 65, 82, 49]) 49 | expect(() => parquetMetadata(buffer)) 50 | .toThrow('parquet metadata length 4294967295 exceeds available buffer 0') 51 | }) 52 | }) 53 | 54 | describe('parquetMetadataAsync', () => { 55 | files.forEach(file => { 56 | it(`parse metadata async from ${file}`, async () => { 57 | const asyncBuffer = await asyncBufferFromFile(`test/files/${file}`) 58 | const result = await parquetMetadataAsync(asyncBuffer) 59 | const base = file.replace('.parquet', '') 60 | const expected = fileToJson(`test/files/${base}.metadata.json`) 61 | expect(toJson(result)).toEqual(expected) 62 | }) 63 | }) 64 | 65 | it('throws for asyncBuffer undefined', async () => { 66 | const arrayBuffer = undefined 67 | // @ts-expect-error 
testing invalid input 68 | await expect(parquetMetadataAsync(arrayBuffer)).rejects 69 | .toThrow('parquet expected AsyncBuffer') 70 | }) 71 | 72 | it('throws for invalid magic number', async () => { 73 | const { buffer } = new Uint8Array([255, 255, 255, 255, 255, 255, 255, 255]) 74 | await expect(parquetMetadataAsync(buffer)).rejects 75 | .toThrow('parquet file invalid (footer != PAR1)') 76 | }) 77 | 78 | it('throws for invalid metadata length', async () => { 79 | const { buffer } = new Uint8Array([255, 255, 255, 255, 80, 65, 82, 49]) 80 | await expect(parquetMetadataAsync(buffer)).rejects 81 | .toThrow('parquet metadata length 4294967295 exceeds available buffer 0') 82 | }) 83 | }) 84 | -------------------------------------------------------------------------------- /test/files/delta_encoding_required_column.offset_indexes.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | { 4 | "page_locations": [ 5 | { 6 | "offset": 4, 7 | "compressed_page_size": 50, 8 | "first_row_index": 0 9 | } 10 | ] 11 | }, 12 | { 13 | "page_locations": [ 14 | { 15 | "offset": 54, 16 | "compressed_page_size": 388, 17 | "first_row_index": 0 18 | } 19 | ] 20 | }, 21 | { 22 | "page_locations": [ 23 | { 24 | "offset": 442, 25 | "compressed_page_size": 261, 26 | "first_row_index": 0 27 | } 28 | ] 29 | }, 30 | { 31 | "page_locations": [ 32 | { 33 | "offset": 703, 34 | "compressed_page_size": 307, 35 | "first_row_index": 0 36 | } 37 | ] 38 | }, 39 | { 40 | "page_locations": [ 41 | { 42 | "offset": 1010, 43 | "compressed_page_size": 247, 44 | "first_row_index": 0 45 | } 46 | ] 47 | }, 48 | { 49 | "page_locations": [ 50 | { 51 | "offset": 1257, 52 | "compressed_page_size": 247, 53 | "first_row_index": 0 54 | } 55 | ] 56 | }, 57 | { 58 | "page_locations": [ 59 | { 60 | "offset": 1504, 61 | "compressed_page_size": 131, 62 | "first_row_index": 0 63 | } 64 | ] 65 | }, 66 | { 67 | "page_locations": [ 68 | { 69 | "offset": 1635, 70 | 
"compressed_page_size": 115, 71 | "first_row_index": 0 72 | } 73 | ] 74 | }, 75 | { 76 | "page_locations": [ 77 | { 78 | "offset": 1750, 79 | "compressed_page_size": 144, 80 | "first_row_index": 0 81 | } 82 | ] 83 | }, 84 | { 85 | "page_locations": [ 86 | { 87 | "offset": 1894, 88 | "compressed_page_size": 933, 89 | "first_row_index": 0 90 | } 91 | ] 92 | }, 93 | { 94 | "page_locations": [ 95 | { 96 | "offset": 2827, 97 | "compressed_page_size": 378, 98 | "first_row_index": 0 99 | } 100 | ] 101 | }, 102 | { 103 | "page_locations": [ 104 | { 105 | "offset": 3205, 106 | "compressed_page_size": 707, 107 | "first_row_index": 0 108 | } 109 | ] 110 | }, 111 | { 112 | "page_locations": [ 113 | { 114 | "offset": 3912, 115 | "compressed_page_size": 751, 116 | "first_row_index": 0 117 | } 118 | ] 119 | }, 120 | { 121 | "page_locations": [ 122 | { 123 | "offset": 4663, 124 | "compressed_page_size": 154, 125 | "first_row_index": 0 126 | } 127 | ] 128 | }, 129 | { 130 | "page_locations": [ 131 | { 132 | "offset": 4817, 133 | "compressed_page_size": 1154, 134 | "first_row_index": 0 135 | } 136 | ] 137 | }, 138 | { 139 | "page_locations": [ 140 | { 141 | "offset": 5971, 142 | "compressed_page_size": 2857, 143 | "first_row_index": 0 144 | } 145 | ] 146 | }, 147 | { 148 | "page_locations": [ 149 | { 150 | "offset": 8828, 151 | "compressed_page_size": 405, 152 | "first_row_index": 0 153 | } 154 | ] 155 | } 156 | ] 157 | ] -------------------------------------------------------------------------------- /test/column.test.js: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from 'vitest' 2 | import { readColumn } from '../src/column.js' 3 | import { DEFAULT_PARSERS } from '../src/convert.js' 4 | import { parquetMetadata } from '../src/index.js' 5 | import { asyncBufferFromFile } from '../src/node.js' 6 | import { getSchemaPath } from '../src/schema.js' 7 | 8 | const values = [null, 1, -2, NaN, 0, -1, -0, 2] 9 | 10 | 
describe('readColumn', () => { 11 | it.for([ 12 | { selectEnd: Infinity, expected: [values] }, 13 | { selectEnd: 2, expected: [values] }, // readColumn does not truncate 14 | { selectEnd: 0, expected: [] }, 15 | ])('readColumn with rowGroupEnd %p', async ({ selectEnd, expected }) => { 16 | const testFile = 'test/files/float16_nonzeros_and_nans.parquet' 17 | const file = await asyncBufferFromFile(testFile) 18 | const arrayBuffer = await file.slice(0) 19 | const metadata = parquetMetadata(arrayBuffer) 20 | 21 | const column = metadata.row_groups[0].columns[0] 22 | if (!column.meta_data) throw new Error(`No column metadata for ${testFile}`) 23 | const { startByte, endByte } = getChunkPlan(column.meta_data) 24 | const columnArrayBuffer = arrayBuffer.slice(startByte, endByte) 25 | const schemaPath = getSchemaPath(metadata.schema, column.meta_data?.path_in_schema ?? []) 26 | const reader = { view: new DataView(columnArrayBuffer), offset: 0 } 27 | const columnDecoder = { 28 | pathInSchema: column.meta_data.path_in_schema, 29 | type: column.meta_data.type, 30 | element: schemaPath[schemaPath.length - 1].element, 31 | schemaPath, 32 | parsers: DEFAULT_PARSERS, 33 | codec: column.meta_data.codec, 34 | } 35 | const rowGroupSelect = { 36 | groupStart: 0, 37 | selectStart: 0, 38 | selectEnd, 39 | groupRows: expected.length, 40 | } 41 | 42 | const result = readColumn(reader, rowGroupSelect, columnDecoder) 43 | expect(result).toEqual(expected) 44 | }) 45 | 46 | it('readColumn should return a typed array', async () => { 47 | const testFile = 'test/files/datapage_v2.snappy.parquet' 48 | const file = await asyncBufferFromFile(testFile) 49 | const arrayBuffer = await file.slice(0) 50 | const metadata = parquetMetadata(arrayBuffer) 51 | 52 | const column = metadata.row_groups[0].columns[1] // second column 53 | if (!column.meta_data) throw new Error(`No column metadata for ${testFile}`) 54 | const { startByte, endByte } = getChunkPlan(column.meta_data) 55 | const columnArrayBuffer = 
arrayBuffer.slice(startByte, endByte) 56 | const schemaPath = getSchemaPath(metadata.schema, column.meta_data?.path_in_schema ?? []) 57 | const reader = { view: new DataView(columnArrayBuffer), offset: 0 } 58 | const columnDecoder = { 59 | pathInSchema: column.meta_data.path_in_schema, 60 | type: column.meta_data.type, 61 | element: schemaPath[schemaPath.length - 1].element, 62 | schemaPath, 63 | parsers: DEFAULT_PARSERS, 64 | codec: column.meta_data.codec, 65 | } 66 | const rowGroupSelect = { 67 | groupStart: 0, 68 | selectStart: 0, 69 | selectEnd: Infinity, 70 | groupRows: Number(column.meta_data.num_values), 71 | } 72 | 73 | const columnData = readColumn(reader, rowGroupSelect, columnDecoder) 74 | expect(columnData[0]).toBeInstanceOf(Int32Array) 75 | }) 76 | }) 77 | 78 | /** 79 | * @import {ByteRange, ColumnMetaData} from '../src/types.js' 80 | * @param {ColumnMetaData} meta 81 | * @returns {ByteRange} 82 | */ 83 | function getChunkPlan(meta) { 84 | const columnOffset = meta.dictionary_page_offset || meta.data_page_offset 85 | return { 86 | startByte: Number(columnOffset), 87 | endByte: Number(columnOffset + meta.total_compressed_size), 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /test/files/concatenated_gzip_members.json: -------------------------------------------------------------------------------- 1 | [ 2 | [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], 3 | [11], [12], [13], [14], [15], [16], [17], [18], [19], [20], 4 | [21], [22], [23], [24], [25], [26], [27], [28], [29], [30], 5 | [31], [32], [33], [34], [35], [36], [37], [38], [39], [40], 6 | [41], [42], [43], [44], [45], [46], [47], [48], [49], [50], 7 | [51], [52], [53], [54], [55], [56], [57], [58], [59], [60], 8 | [61], [62], [63], [64], [65], [66], [67], [68], [69], [70], 9 | [71], [72], [73], [74], [75], [76], [77], [78], [79], [80], 10 | [81], [82], [83], [84], [85], [86], [87], [88], [89], [90], 11 | [91], [92], [93], [94], [95], [96], 
[97], [98], [99], [100], 12 | [101], [102], [103], [104], [105], [106], [107], [108], [109], [110], 13 | [111], [112], [113], [114], [115], [116], [117], [118], [119], [120], 14 | [121], [122], [123], [124], [125], [126], [127], [128], [129], [130], 15 | [131], [132], [133], [134], [135], [136], [137], [138], [139], [140], 16 | [141], [142], [143], [144], [145], [146], [147], [148], [149], [150], 17 | [151], [152], [153], [154], [155], [156], [157], [158], [159], [160], 18 | [161], [162], [163], [164], [165], [166], [167], [168], [169], [170], 19 | [171], [172], [173], [174], [175], [176], [177], [178], [179], [180], 20 | [181], [182], [183], [184], [185], [186], [187], [188], [189], [190], 21 | [191], [192], [193], [194], [195], [196], [197], [198], [199], [200], 22 | [201], [202], [203], [204], [205], [206], [207], [208], [209], [210], 23 | [211], [212], [213], [214], [215], [216], [217], [218], [219], [220], 24 | [221], [222], [223], [224], [225], [226], [227], [228], [229], [230], 25 | [231], [232], [233], [234], [235], [236], [237], [238], [239], [240], 26 | [241], [242], [243], [244], [245], [246], [247], [248], [249], [250], 27 | [251], [252], [253], [254], [255], [256], [257], [258], [259], [260], 28 | [261], [262], [263], [264], [265], [266], [267], [268], [269], [270], 29 | [271], [272], [273], [274], [275], [276], [277], [278], [279], [280], 30 | [281], [282], [283], [284], [285], [286], [287], [288], [289], [290], 31 | [291], [292], [293], [294], [295], [296], [297], [298], [299], [300], 32 | [301], [302], [303], [304], [305], [306], [307], [308], [309], [310], 33 | [311], [312], [313], [314], [315], [316], [317], [318], [319], [320], 34 | [321], [322], [323], [324], [325], [326], [327], [328], [329], [330], 35 | [331], [332], [333], [334], [335], [336], [337], [338], [339], [340], 36 | [341], [342], [343], [344], [345], [346], [347], [348], [349], [350], 37 | [351], [352], [353], [354], [355], [356], [357], [358], [359], [360], 38 | [361], [362], 
[363], [364], [365], [366], [367], [368], [369], [370], 39 | [371], [372], [373], [374], [375], [376], [377], [378], [379], [380], 40 | [381], [382], [383], [384], [385], [386], [387], [388], [389], [390], 41 | [391], [392], [393], [394], [395], [396], [397], [398], [399], [400], 42 | [401], [402], [403], [404], [405], [406], [407], [408], [409], [410], 43 | [411], [412], [413], [414], [415], [416], [417], [418], [419], [420], 44 | [421], [422], [423], [424], [425], [426], [427], [428], [429], [430], 45 | [431], [432], [433], [434], [435], [436], [437], [438], [439], [440], 46 | [441], [442], [443], [444], [445], [446], [447], [448], [449], [450], 47 | [451], [452], [453], [454], [455], [456], [457], [458], [459], [460], 48 | [461], [462], [463], [464], [465], [466], [467], [468], [469], [470], 49 | [471], [472], [473], [474], [475], [476], [477], [478], [479], [480], 50 | [481], [482], [483], [484], [485], [486], [487], [488], [489], [490], 51 | [491], [492], [493], [494], [495], [496], [497], [498], [499], [500], 52 | [501], [502], [503], [504], [505], [506], [507], [508], [509], [510], 53 | [511], [512], [513] 54 | ] 55 | -------------------------------------------------------------------------------- /test/files/incorrect_map_schema.metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "schema": [ 4 | { 5 | "name": "hive_schema", 6 | "num_children": 1 7 | }, 8 | { 9 | "repetition_type": "OPTIONAL", 10 | "name": "my_map", 11 | "num_children": 1, 12 | "converted_type": "MAP", 13 | "logical_type": { 14 | "type": "MAP" 15 | } 16 | }, 17 | { 18 | "repetition_type": "REPEATED", 19 | "name": "key_value", 20 | "num_children": 2, 21 | "converted_type": "MAP_KEY_VALUE" 22 | }, 23 | { 24 | "type": "BYTE_ARRAY", 25 | "repetition_type": "OPTIONAL", 26 | "name": "key", 27 | "converted_type": "UTF8", 28 | "logical_type": { 29 | "type": "STRING" 30 | } 31 | }, 32 | { 33 | "type": "BYTE_ARRAY", 34 | 
"repetition_type": "OPTIONAL", 35 | "name": "value", 36 | "converted_type": "UTF8", 37 | "logical_type": { 38 | "type": "STRING" 39 | } 40 | } 41 | ], 42 | "num_rows": 1, 43 | "row_groups": [ 44 | { 45 | "columns": [ 46 | { 47 | "file_offset": 4, 48 | "meta_data": { 49 | "type": "BYTE_ARRAY", 50 | "encodings": [ 51 | "PLAIN", 52 | "RLE" 53 | ], 54 | "path_in_schema": [ 55 | "my_map", 56 | "key_value", 57 | "key" 58 | ], 59 | "codec": "GZIP", 60 | "num_values": 2, 61 | "total_uncompressed_size": 54, 62 | "total_compressed_size": 69, 63 | "data_page_offset": 4, 64 | "statistics": { 65 | "null_count": 0, 66 | "max_value": "parent", 67 | "min_value": "name" 68 | }, 69 | "encoding_stats": [ 70 | { 71 | "page_type": "DATA_PAGE", 72 | "encoding": "PLAIN", 73 | "count": 1 74 | } 75 | ] 76 | }, 77 | "offset_index_offset": 198, 78 | "offset_index_length": 11, 79 | "column_index_offset": 145, 80 | "column_index_length": 25 81 | }, 82 | { 83 | "file_offset": 73, 84 | "meta_data": { 85 | "type": "BYTE_ARRAY", 86 | "encodings": [ 87 | "PLAIN", 88 | "RLE" 89 | ], 90 | "path_in_schema": [ 91 | "my_map", 92 | "key_value", 93 | "value" 94 | ], 95 | "codec": "GZIP", 96 | "num_values": 2, 97 | "total_uncompressed_size": 57, 98 | "total_compressed_size": 72, 99 | "data_page_offset": 73, 100 | "statistics": { 101 | "null_count": 0, 102 | "max_value": "report", 103 | "min_value": "another" 104 | }, 105 | "encoding_stats": [ 106 | { 107 | "page_type": "DATA_PAGE", 108 | "encoding": "PLAIN", 109 | "count": 1 110 | } 111 | ] 112 | }, 113 | "offset_index_offset": 209, 114 | "offset_index_length": 12, 115 | "column_index_offset": 170, 116 | "column_index_length": 28 117 | } 118 | ], 119 | "total_byte_size": 111, 120 | "num_rows": 1, 121 | "file_offset": 4, 122 | "total_compressed_size": 141, 123 | "ordinal": 0 124 | } 125 | ], 126 | "created_by": "parquet-mr version 1.12.2 (build 77e30c8093386ec52c3cfa6c34b7ef3321322c94)", 127 | "metadata_length": 366 128 | } 129 | 
-------------------------------------------------------------------------------- /test/files/issue97.metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 2, 3 | "schema": [ 4 | { 5 | "repetition_type": "REQUIRED", 6 | "name": "schema", 7 | "num_children": 2 8 | }, 9 | { 10 | "type": "DOUBLE", 11 | "repetition_type": "OPTIONAL", 12 | "name": "a" 13 | }, 14 | { 15 | "type": "BOOLEAN", 16 | "repetition_type": "OPTIONAL", 17 | "name": "b" 18 | } 19 | ], 20 | "num_rows": 0, 21 | "row_groups": [ 22 | { 23 | "columns": [ 24 | { 25 | "file_offset": 0, 26 | "meta_data": { 27 | "type": "DOUBLE", 28 | "encodings": [ 29 | "PLAIN", 30 | "RLE" 31 | ], 32 | "path_in_schema": [ 33 | "a" 34 | ], 35 | "codec": "UNCOMPRESSED", 36 | "num_values": 0, 37 | "total_uncompressed_size": 14, 38 | "total_compressed_size": 14, 39 | "data_page_offset": 0, 40 | "dictionary_page_offset": 4, 41 | "encoding_stats": [ 42 | { 43 | "page_type": "DICTIONARY_PAGE", 44 | "encoding": "PLAIN", 45 | "count": 1 46 | } 47 | ] 48 | } 49 | }, 50 | { 51 | "file_offset": 0, 52 | "meta_data": { 53 | "type": "BOOLEAN", 54 | "encodings": [ 55 | "RLE" 56 | ], 57 | "path_in_schema": [ 58 | "b" 59 | ], 60 | "codec": "UNCOMPRESSED", 61 | "num_values": 0, 62 | "total_uncompressed_size": 0, 63 | "total_compressed_size": 0, 64 | "data_page_offset": 0, 65 | "encoding_stats": [] 66 | } 67 | } 68 | ], 69 | "total_byte_size": 14, 70 | "num_rows": 0, 71 | "file_offset": 4, 72 | "total_compressed_size": 14 73 | } 74 | ], 75 | "key_value_metadata": [ 76 | { 77 | "key": "pandas", 78 | "value": "{\"index_columns\": [{\"kind\": \"range\", \"name\": null, \"start\": 0, \"stop\": 0, \"step\": 1}], \"column_indexes\": [{\"name\": null, \"field_name\": null, \"pandas_type\": \"unicode\", \"numpy_type\": \"object\", \"metadata\": {\"encoding\": \"UTF-8\"}}], \"columns\": [{\"name\": \"a\", \"field_name\": \"a\", \"pandas_type\": \"float64\", \"numpy_type\": \"float64\", 
\"metadata\": null}, {\"name\": \"b\", \"field_name\": \"b\", \"pandas_type\": \"bool\", \"numpy_type\": \"bool\", \"metadata\": null}], \"creator\": {\"library\": \"pyarrow\", \"version\": \"20.0.0\"}, \"pandas_version\": \"2.3.0\"}" 79 | }, 80 | { 81 | "key": "ARROW:schema", 82 | "value": "/////+ACAAAQAAAAAAAKAA4ABgAFAAgACgAAAAABBAAQAAAAAAAKAAwAAAAEAAgACgAAAEACAAAEAAAAAQAAAAwAAAAIAAwABAAIAAgAAAAYAgAABAAAAAsCAAB7ImluZGV4X2NvbHVtbnMiOiBbeyJraW5kIjogInJhbmdlIiwgIm5hbWUiOiBudWxsLCAic3RhcnQiOiAwLCAic3RvcCI6IDAsICJzdGVwIjogMX1dLCAiY29sdW1uX2luZGV4ZXMiOiBbeyJuYW1lIjogbnVsbCwgImZpZWxkX25hbWUiOiBudWxsLCAicGFuZGFzX3R5cGUiOiAidW5pY29kZSIsICJudW1weV90eXBlIjogIm9iamVjdCIsICJtZXRhZGF0YSI6IHsiZW5jb2RpbmciOiAiVVRGLTgifX1dLCAiY29sdW1ucyI6IFt7Im5hbWUiOiAiYSIsICJmaWVsZF9uYW1lIjogImEiLCAicGFuZGFzX3R5cGUiOiAiZmxvYXQ2NCIsICJudW1weV90eXBlIjogImZsb2F0NjQiLCAibWV0YWRhdGEiOiBudWxsfSwgeyJuYW1lIjogImIiLCAiZmllbGRfbmFtZSI6ICJiIiwgInBhbmRhc190eXBlIjogImJvb2wiLCAibnVtcHlfdHlwZSI6ICJib29sIiwgIm1ldGFkYXRhIjogbnVsbH1dLCAiY3JlYXRvciI6IHsibGlicmFyeSI6ICJweWFycm93IiwgInZlcnNpb24iOiAiMjAuMC4wIn0sICJwYW5kYXNfdmVyc2lvbiI6ICIyLjMuMCJ9AAYAAABwYW5kYXMAAAIAAABAAAAABAAAANj///8AAAEGEAAAABgAAAAEAAAAAAAAAAEAAABiAAAABAAEAAQAAAAQABQACAAGAAcADAAAABAAEAAAAAAAAQMQAAAAGAAAAAQAAAAAAAAAAQAAAGEABgAIAAYABgAAAAAAAgAAAAAA" 83 | } 84 | ], 85 | "created_by": "parquet-cpp-arrow version 20.0.0", 86 | "metadata_length": 1700 87 | } -------------------------------------------------------------------------------- /test/files/offset_indexed.metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 2, 3 | "schema": [ 4 | { 5 | "name": "root", 6 | "num_children": 2 7 | }, 8 | { 9 | "type": "INT64", 10 | "repetition_type": "REQUIRED", 11 | "name": "id" 12 | }, 13 | { 14 | "type": "BYTE_ARRAY", 15 | "repetition_type": "REQUIRED", 16 | "name": "content", 17 | "converted_type": "UTF8" 18 | } 19 | ], 20 | "num_rows": 200, 21 | "row_groups": [ 22 | { 23 | "columns": [ 24 | { 25 | 
"file_offset": 4, 26 | "meta_data": { 27 | "type": "INT64", 28 | "encodings": [ 29 | "PLAIN" 30 | ], 31 | "path_in_schema": [ 32 | "id" 33 | ], 34 | "codec": "SNAPPY", 35 | "num_values": 100, 36 | "total_uncompressed_size": 434, 37 | "total_compressed_size": 434, 38 | "data_page_offset": 4, 39 | "statistics": { 40 | "null_count": 0, 41 | "max_value": 100, 42 | "min_value": 1 43 | } 44 | } 45 | }, 46 | { 47 | "file_offset": 438, 48 | "meta_data": { 49 | "type": "BYTE_ARRAY", 50 | "encodings": [ 51 | "PLAIN" 52 | ], 53 | "path_in_schema": [ 54 | "content" 55 | ], 56 | "codec": "SNAPPY", 57 | "num_values": 100, 58 | "total_uncompressed_size": 14334, 59 | "total_compressed_size": 14334, 60 | "data_page_offset": 438, 61 | "statistics": { 62 | "null_count": 0, 63 | "max_value": "the dolor the ju", 64 | "min_value": "adipiscing adipi" 65 | } 66 | }, 67 | "offset_index_offset": 29507, 68 | "offset_index_length": 62 69 | } 70 | ], 71 | "total_byte_size": 14768, 72 | "num_rows": 100 73 | }, 74 | { 75 | "columns": [ 76 | { 77 | "file_offset": 14772, 78 | "meta_data": { 79 | "type": "INT64", 80 | "encodings": [ 81 | "PLAIN" 82 | ], 83 | "path_in_schema": [ 84 | "id" 85 | ], 86 | "codec": "SNAPPY", 87 | "num_values": 100, 88 | "total_uncompressed_size": 436, 89 | "total_compressed_size": 436, 90 | "data_page_offset": 14772, 91 | "statistics": { 92 | "null_count": 0, 93 | "max_value": 200, 94 | "min_value": 101 95 | } 96 | } 97 | }, 98 | { 99 | "file_offset": 15208, 100 | "meta_data": { 101 | "type": "BYTE_ARRAY", 102 | "encodings": [ 103 | "PLAIN" 104 | ], 105 | "path_in_schema": [ 106 | "content" 107 | ], 108 | "codec": "SNAPPY", 109 | "num_values": 100, 110 | "total_uncompressed_size": 14299, 111 | "total_compressed_size": 14299, 112 | "data_page_offset": 15208, 113 | "statistics": { 114 | "null_count": 0, 115 | "max_value": "the storage over", 116 | "min_value": "adipiscing encod" 117 | } 118 | }, 119 | "offset_index_offset": 29569, 120 | "offset_index_length": 65 121 | } 
122 | ], 123 | "total_byte_size": 14735, 124 | "num_rows": 100 125 | } 126 | ], 127 | "created_by": "hyparquet", 128 | "metadata_length": 352 129 | } 130 | -------------------------------------------------------------------------------- /test/files/byte_stream_split_v2.metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 2, 3 | "schema": [ 4 | { 5 | "repetition_type": "REQUIRED", 6 | "name": "schema", 7 | "num_children": 2 8 | }, 9 | { 10 | "type": "DOUBLE", 11 | "repetition_type": "OPTIONAL", 12 | "name": "float_col" 13 | }, 14 | { 15 | "type": "DOUBLE", 16 | "repetition_type": "OPTIONAL", 17 | "name": "double_col" 18 | } 19 | ], 20 | "num_rows": 5, 21 | "row_groups": [ 22 | { 23 | "columns": [ 24 | { 25 | "file_offset": 0, 26 | "meta_data": { 27 | "type": "DOUBLE", 28 | "encodings": [ 29 | "RLE", 30 | "BYTE_STREAM_SPLIT" 31 | ], 32 | "path_in_schema": [ 33 | "float_col" 34 | ], 35 | "codec": "SNAPPY", 36 | "num_values": 5, 37 | "total_uncompressed_size": 110, 38 | "total_compressed_size": 87, 39 | "data_page_offset": 4, 40 | "statistics": { 41 | "max": 5.5, 42 | "min": 1.5, 43 | "null_count": 0, 44 | "max_value": 5.5, 45 | "min_value": 1.5, 46 | "is_max_value_exact": true, 47 | "is_min_value_exact": true 48 | }, 49 | "encoding_stats": [ 50 | { 51 | "page_type": "DATA_PAGE", 52 | "encoding": "BYTE_STREAM_SPLIT", 53 | "count": 1 54 | } 55 | ], 56 | "size_statistics": { 57 | "repetition_level_histogram": [], 58 | "definition_level_histogram": [ 59 | 0, 60 | 5 61 | ] 62 | } 63 | } 64 | }, 65 | { 66 | "file_offset": 0, 67 | "meta_data": { 68 | "type": "DOUBLE", 69 | "encodings": [ 70 | "RLE", 71 | "BYTE_STREAM_SPLIT" 72 | ], 73 | "path_in_schema": [ 74 | "double_col" 75 | ], 76 | "codec": "SNAPPY", 77 | "num_values": 5, 78 | "total_uncompressed_size": 110, 79 | "total_compressed_size": 97, 80 | "data_page_offset": 91, 81 | "statistics": { 82 | "max": 50.5, 83 | "min": 10.1, 84 | "null_count": 0, 85 | 
"max_value": 50.5, 86 | "min_value": 10.1, 87 | "is_max_value_exact": true, 88 | "is_min_value_exact": true 89 | }, 90 | "encoding_stats": [ 91 | { 92 | "page_type": "DATA_PAGE", 93 | "encoding": "BYTE_STREAM_SPLIT", 94 | "count": 1 95 | } 96 | ], 97 | "size_statistics": { 98 | "repetition_level_histogram": [], 99 | "definition_level_histogram": [ 100 | 0, 101 | 5 102 | ] 103 | } 104 | } 105 | } 106 | ], 107 | "total_byte_size": 220, 108 | "num_rows": 5, 109 | "file_offset": 4, 110 | "total_compressed_size": 184 111 | } 112 | ], 113 | "key_value_metadata": [ 114 | { 115 | "key": "ARROW:schema", 116 | "value": "/////7AAAAAQAAAAAAAKAAwABgAFAAgACgAAAAABBAAMAAAACAAIAAAABAAIAAAABAAAAAIAAABIAAAABAAAAND///8AAAEDEAAAABwAAAAEAAAAAAAAAAoAAABkb3VibGVfY29sAADC////AAACABAAFAAIAAYABwAMAAAAEAAQAAAAAAABAxAAAAAgAAAABAAAAAAAAAAJAAAAZmxvYXRfY29sAAYACAAGAAYAAAAAAAIAAAAAAA==" 117 | } 118 | ], 119 | "created_by": "parquet-cpp-arrow version 22.0.0", 120 | "metadata_length": 576 121 | } 122 | -------------------------------------------------------------------------------- /test/files/lz4_raw_compressed.metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "created_by": "parquet-cpp version 1.5.1-SNAPSHOT", 3 | "metadata_length": 330, 4 | "num_rows": 4, 5 | "row_groups": [ 6 | { 7 | "columns": [ 8 | { 9 | "file_offset": 89, 10 | "meta_data": { 11 | "codec": "LZ4_RAW", 12 | "data_page_offset": 4, 13 | "encoding_stats": [ 14 | { 15 | "count": 1, 16 | "encoding": "PLAIN", 17 | "page_type": "DATA_PAGE" 18 | } 19 | ], 20 | "encodings": [ 21 | "PLAIN", 22 | "RLE" 23 | ], 24 | "num_values": 4, 25 | "path_in_schema": [ 26 | "c0" 27 | ], 28 | "statistics": { 29 | "max": 1593604801, 30 | "max_value": 1593604801, 31 | "min": 1593604800, 32 | "min_value": 1593604800, 33 | "null_count": 0 34 | }, 35 | "total_compressed_size": 85, 36 | "total_uncompressed_size": 93, 37 | "type": "INT64" 38 | } 39 | }, 40 | { 41 | "file_offset": 229, 42 | "meta_data": { 
43 | "codec": "LZ4_RAW", 44 | "data_page_offset": 171, 45 | "encoding_stats": [ 46 | { 47 | "count": 1, 48 | "encoding": "PLAIN", 49 | "page_type": "DATA_PAGE" 50 | } 51 | ], 52 | "encodings": [ 53 | "PLAIN", 54 | "RLE" 55 | ], 56 | "num_values": 4, 57 | "path_in_schema": [ 58 | "c1" 59 | ], 60 | "statistics": { 61 | "max_value": "def", 62 | "min_value": "abc", 63 | "null_count": 0 64 | }, 65 | "total_compressed_size": 58, 66 | "total_uncompressed_size": 59, 67 | "type": "BYTE_ARRAY" 68 | } 69 | }, 70 | { 71 | "file_offset": 375, 72 | "meta_data": { 73 | "codec": "LZ4_RAW", 74 | "data_page_offset": 280, 75 | "encoding_stats": [ 76 | { 77 | "count": 1, 78 | "encoding": "PLAIN", 79 | "page_type": "DATA_PAGE" 80 | } 81 | ], 82 | "encodings": [ 83 | "PLAIN", 84 | "RLE" 85 | ], 86 | "num_values": 4, 87 | "path_in_schema": [ 88 | "v11" 89 | ], 90 | "statistics": { 91 | "max": 42.125, 92 | "max_value": 42.125, 93 | "min": 7.7, 94 | "min_value": 7.7, 95 | "null_count": 0 96 | }, 97 | "total_compressed_size": 95, 98 | "total_uncompressed_size": 99, 99 | "type": "DOUBLE" 100 | } 101 | } 102 | ], 103 | "file_offset": 89, 104 | "num_rows": 4, 105 | "ordinal": 0, 106 | "total_byte_size": 251, 107 | "total_compressed_size": 238 108 | } 109 | ], 110 | "schema": [ 111 | { 112 | "name": "schema", 113 | "num_children": 3, 114 | "repetition_type": "REQUIRED" 115 | }, 116 | { 117 | "name": "c0", 118 | "repetition_type": "REQUIRED", 119 | "type": "INT64" 120 | }, 121 | { 122 | "name": "c1", 123 | "repetition_type": "REQUIRED", 124 | "type": "BYTE_ARRAY" 125 | }, 126 | { 127 | "name": "v11", 128 | "repetition_type": "OPTIONAL", 129 | "type": "DOUBLE" 130 | } 131 | ], 132 | "version": 1 133 | } 134 | -------------------------------------------------------------------------------- /src/delta.js: -------------------------------------------------------------------------------- 1 | import { readVarInt, readZigZagBigInt } from './thrift.js' 2 | 3 | /** 4 | * @import {DataReader} from 
/**
 * Decode a run of delta-binary-packed integers into `output`.
 *
 * Data read from `reader`, in order: block size, miniblocks per block and the
 * stored value count (varints), then the zigzag-encoded first value. Every
 * following block holds a zigzag-encoded minimum delta, one bit-width byte per
 * miniblock, and the bit-packed (delta - minDelta) values of each miniblock.
 * `reader.offset` is advanced past everything that was consumed.
 *
 * @import {DataReader} from '../src/types.d.ts'
 * @param {DataReader} reader
 * @param {number} count number of values to read
 * @param {Int32Array | BigInt64Array} output
 * @throws {Error} if more values are expected but the block header is unusable
 */
export function deltaBinaryUnpack(reader, count, output) {
  const int32 = output instanceof Int32Array
  const blockSize = readVarInt(reader)
  const miniblockPerBlock = readVarInt(reader)
  readVarInt(reader) // stored value count; the caller-supplied `count` is used instead
  let value = readZigZagBigInt(reader) // first value
  let outputIndex = 0
  // never write past `count`, even though the header always carries a first value
  if (count > 0) {
    output[outputIndex++] = int32 ? Number(value) : value
  }

  // without a positive block size and miniblock count the loop below cannot
  // produce values and would only walk through the buffer until it overruns
  if (outputIndex < count && !(blockSize > 0 && miniblockPerBlock > 0)) {
    throw new Error(`parquet delta invalid block header: block size ${blockSize}, miniblocks ${miniblockPerBlock}`)
  }
  const valuesPerMiniblock = blockSize / miniblockPerBlock

  while (outputIndex < count) {
    // new block
    const minDelta = readZigZagBigInt(reader)
    const bitWidths = new Uint8Array(miniblockPerBlock)
    for (let i = 0; i < miniblockPerBlock; i++) {
      bitWidths[i] = reader.view.getUint8(reader.offset++)
    }

    for (let i = 0; i < miniblockPerBlock && outputIndex < count; i++) {
      // new miniblock
      const bitWidth = BigInt(bitWidths[i])
      if (bitWidth) {
        let bitpackPos = 0n // bit position inside the byte at reader.offset
        let miniblockCount = valuesPerMiniblock
        const mask = (1n << bitWidth) - 1n
        while (miniblockCount && outputIndex < count) {
          let bits = BigInt(reader.view.getUint8(reader.offset)) >> bitpackPos & mask // TODO: don't re-read value every time
          bitpackPos += bitWidth
          // value crossed one or more byte boundaries: pull in the high bits
          while (bitpackPos >= 8) {
            bitpackPos -= 8n
            reader.offset++
            if (bitpackPos) {
              bits |= BigInt(reader.view.getUint8(reader.offset)) << bitWidth - bitpackPos & mask
            }
          }
          const delta = minDelta + bits
          value += delta
          output[outputIndex++] = int32 ? Number(value) : value
          miniblockCount--
        }
        if (miniblockCount) {
          // consume leftover miniblock
          reader.offset += Math.ceil((miniblockCount * Number(bitWidth) + Number(bitpackPos)) / 8)
        }
      } else {
        // zero bit width: every delta equals minDelta and no packed bytes are stored
        for (let j = 0; j < valuesPerMiniblock && outputIndex < count; j++) {
          value += minDelta
          output[outputIndex++] = int32 ? Number(value) : value
        }
      }
    }
  }
}

/**
 * Decode `count` byte arrays stored as a delta-packed list of lengths
 * followed by the concatenated bytes.
 * Each output entry is a view onto the reader's buffer, not a copy.
 *
 * @param {DataReader} reader
 * @param {number} count
 * @param {Uint8Array[]} output
 */
export function deltaLengthByteArray(reader, count, output) {
  const lengths = new Int32Array(count)
  deltaBinaryUnpack(reader, count, lengths)
  for (let i = 0; i < count; i++) {
    output[i] = new Uint8Array(reader.view.buffer, reader.view.byteOffset + reader.offset, lengths[i])
    reader.offset += lengths[i]
  }
}

/**
 * Decode `count` byte arrays stored as delta-packed prefix lengths,
 * delta-packed suffix lengths, and the concatenated suffix bytes.
 * A value with a non-zero prefix is rebuilt from the previous value's leading
 * bytes plus its own suffix; a value without a prefix is a view onto the
 * reader's buffer.
 *
 * @param {DataReader} reader
 * @param {number} count
 * @param {Uint8Array[]} output
 * @throws {Error} if a prefix length cannot be satisfied by the previous value
 */
export function deltaByteArray(reader, count, output) {
  const prefixData = new Int32Array(count)
  deltaBinaryUnpack(reader, count, prefixData)
  const suffixData = new Int32Array(count)
  deltaBinaryUnpack(reader, count, suffixData)

  for (let i = 0; i < count; i++) {
    const prefixLength = prefixData[i]
    const suffixLength = suffixData[i]
    const suffix = new Uint8Array(reader.view.buffer, reader.view.byteOffset + reader.offset, suffixLength)
    if (prefixLength) {
      const previous = output[i - 1]
      // the first value has nothing to borrow from, and a prefix longer than
      // the previous value would leave a zero-filled gap in the result
      if (!previous || prefixLength < 0 || prefixLength > previous.length) {
        throw new Error(`parquet delta byte array invalid prefix length ${prefixLength} at index ${i}`)
      }
      // copy from previous value
      const joined = new Uint8Array(prefixLength + suffixLength)
      joined.set(previous.subarray(0, prefixLength))
      joined.set(suffix, prefixLength)
      output[i] = joined
    } else {
      output[i] = suffix
    }
    reader.offset += suffixLength
  }
}
"schema": [ 4 | { 5 | "name": "m", 6 | "num_children": 2 7 | }, 8 | { 9 | "type": "INT64", 10 | "repetition_type": "REQUIRED", 11 | "name": "long_field" 12 | }, 13 | { 14 | "type": "BYTE_ARRAY", 15 | "repetition_type": "REQUIRED", 16 | "name": "binary_field" 17 | } 18 | ], 19 | "num_rows": 1000, 20 | "row_groups": [ 21 | { 22 | "columns": [ 23 | { 24 | "file_offset": 31, 25 | "meta_data": { 26 | "type": "INT64", 27 | "encodings": [ 28 | "PLAIN_DICTIONARY", 29 | "BIT_PACKED" 30 | ], 31 | "path_in_schema": [ 32 | "long_field" 33 | ], 34 | "codec": "UNCOMPRESSED", 35 | "num_values": 1000, 36 | "total_uncompressed_size": 54, 37 | "total_compressed_size": 54, 38 | "data_page_offset": 31, 39 | "dictionary_page_offset": 4, 40 | "statistics": { 41 | "max": 0, 42 | "min": 0, 43 | "null_count": 0, 44 | "max_value": 0, 45 | "min_value": 0 46 | }, 47 | "encoding_stats": [ 48 | { 49 | "page_type": "DICTIONARY_PAGE", 50 | "encoding": "PLAIN_DICTIONARY", 51 | "count": 1 52 | }, 53 | { 54 | "page_type": "DATA_PAGE", 55 | "encoding": "PLAIN_DICTIONARY", 56 | "count": 1 57 | } 58 | ] 59 | }, 60 | "offset_index_offset": 262, 61 | "offset_index_length": 10, 62 | "column_index_offset": 144, 63 | "column_index_length": 31 64 | }, 65 | { 66 | "file_offset": 117, 67 | "meta_data": { 68 | "type": "BYTE_ARRAY", 69 | "encodings": [ 70 | "PLAIN_DICTIONARY", 71 | "BIT_PACKED" 72 | ], 73 | "path_in_schema": [ 74 | "binary_field" 75 | ], 76 | "codec": "UNCOMPRESSED", 77 | "num_values": 1000, 78 | "total_uncompressed_size": 86, 79 | "total_compressed_size": 86, 80 | "data_page_offset": 117, 81 | "dictionary_page_offset": 58, 82 | "statistics": { 83 | "max": "a655fd0e-9949-4059-bcae-fd6a002a4652", 84 | "min": "a655fd0e-9949-4059-bcae-fd6a002a4652", 85 | "null_count": 0, 86 | "max_value": "a655fd0e-9949-4059-bcae-fd6a002a4652", 87 | "min_value": "a655fd0e-9949-4059-bcae-fd6a002a4652" 88 | }, 89 | "encoding_stats": [ 90 | { 91 | "page_type": "DICTIONARY_PAGE", 92 | "encoding": "PLAIN_DICTIONARY", 93 
/**
 * Decode one WKB (Well-Known Binary) geometry, starting at `reader.offset`,
 * into a GeoJSON-shaped object. `reader.offset` ends up just past the geometry.
 *
 * @import {DataReader, Geometry} from '../src/types.js'
 * @param {DataReader} reader
 * @returns {Geometry} geometry object
 */
export function wkbToGeojson(reader) {
  const header = getFlags(reader)

  // Multi* geometries: `header.count` children, each carrying its own header
  /** @param {Function} readBody */
  const readChildren = readBody => {
    const items = []
    for (let n = 0; n < header.count; n++) {
      items.push(readBody(reader, getFlags(reader)))
    }
    return items
  }

  switch (header.type) {
    case 1:
      return { type: 'Point', coordinates: readPosition(reader, header) }
    case 2:
      return { type: 'LineString', coordinates: readLine(reader, header) }
    case 3:
      return { type: 'Polygon', coordinates: readPolygon(reader, header) }
    case 4:
      return { type: 'MultiPoint', coordinates: readChildren(readPosition) }
    case 5:
      return { type: 'MultiLineString', coordinates: readChildren(readLine) }
    case 6:
      return { type: 'MultiPolygon', coordinates: readChildren(readPolygon) }
    case 7: {
      // GeometryCollection: members are complete geometries, decoded recursively
      const geometries = []
      for (let n = 0; n < header.count; n++) {
        geometries.push(wkbToGeojson(reader))
      }
      return { type: 'GeometryCollection', geometries }
    }
    default:
      throw new Error(`Unsupported geometry type: ${header.type}`)
  }
}

/**
 * @typedef {object} WkbFlags
 * @property {boolean} littleEndian
 * @property {number} type
 * @property {number} dim
 * @property {number} count
 */

/**
 * Read a geometry header: byte order, type code and, for every type except
 * Point, the element count that follows.
 * The type code is split as `code % 1000` (base type) and `code / 1000`
 * (dimension flag).
 *
 * @param {DataReader} reader
 * @returns {WkbFlags}
 */
function getFlags(reader) {
  const { view } = reader
  const littleEndian = view.getUint8(reader.offset++) === 1
  const code = view.getUint32(reader.offset, littleEndian)
  reader.offset += 4

  const type = code % 1000
  const dimFlag = Math.floor(code / 1000)

  const hasCount = type > 1 && type <= 7
  let count = 0
  if (hasCount) {
    count = view.getUint32(reader.offset, littleEndian)
    reader.offset += 4
  }

  // XY -> 2, XYZ or XYM -> 3, XYZM -> 4
  let dim = 2
  if (dimFlag === 3) {
    dim = 4
  } else if (dimFlag) {
    dim = 3
  }

  return { littleEndian, type, dim, count }
}

/**
 * Read one position: `flags.dim` consecutive float64 values.
 *
 * @param {DataReader} reader
 * @param {WkbFlags} flags
 * @returns {number[]}
 */
function readPosition(reader, flags) {
  const position = []
  let remaining = flags.dim
  while (remaining-- > 0) {
    position.push(reader.view.getFloat64(reader.offset, flags.littleEndian))
    reader.offset += 8
  }
  return position
}

/**
 * Read `flags.count` positions.
 *
 * @param {DataReader} reader
 * @param {WkbFlags} flags
 * @returns {number[][]}
 */
function readLine(reader, flags) {
  const line = []
  for (let n = 0; n < flags.count; n++) {
    line.push(readPosition(reader, flags))
  }
  return line
}

/**
 * Read `flags.count` rings, each prefixed with its own uint32 point count.
 *
 * @param {DataReader} reader
 * @param {WkbFlags} flags
 * @returns {number[][][]}
 */
function readPolygon(reader, flags) {
  const polygon = []
  for (let ring = 0; ring < flags.count; ring++) {
    const pointCount = reader.view.getUint32(reader.offset, flags.littleEndian)
    reader.offset += 4
    polygon.push(readLine(reader, { ...flags, count: pointCount }))
  }
  return polygon
}
| "total_byte_size": 172, 51 | "total_compressed_size": 146 52 | }, 53 | { 54 | "columns": [ 55 | { 56 | "file_offset": 368, 57 | "meta_data": { 58 | "codec": "SNAPPY", 59 | "data_page_offset": 294, 60 | "dictionary_page_offset": 248, 61 | "encoding_stats": [ 62 | { "count": 1, "encoding": "PLAIN", "page_type": "DICTIONARY_PAGE" }, 63 | { "count": 1, "encoding": "RLE_DICTIONARY", "page_type": "DATA_PAGE" } 64 | ], 65 | "encodings": ["PLAIN", "RLE", "RLE_DICTIONARY"], 66 | "num_values": 5, 67 | "path_in_schema": ["numbers"], 68 | "statistics": { 69 | "max": 15, 70 | "min": 11, 71 | "max_value": 15, 72 | "min_value": 11, 73 | "null_count": 0 74 | }, 75 | "total_compressed_size": 120, 76 | "total_uncompressed_size": 126, 77 | "type": "INT64" 78 | } 79 | } 80 | ], 81 | "file_offset": 248, 82 | "num_rows": 5, 83 | "ordinal": 1, 84 | "total_byte_size": 126, 85 | "total_compressed_size": 120 86 | } 87 | ], 88 | "key_value_metadata": [ 89 | { 90 | "key": "pandas", 91 | "value": "{\"index_columns\": [{\"kind\": \"range\", \"name\": null, \"start\": 0, \"stop\": 15, \"step\": 1}], \"column_indexes\": [{\"name\": null, \"field_name\": null, \"pandas_type\": \"unicode\", \"numpy_type\": \"object\", \"metadata\": {\"encoding\": \"UTF-8\"}}], \"columns\": [{\"name\": \"numbers\", \"field_name\": \"numbers\", \"pandas_type\": \"int64\", \"numpy_type\": \"int64\", \"metadata\": null}], \"creator\": {\"library\": \"pyarrow\", \"version\": \"14.0.2\"}, \"pandas_version\": \"2.1.4\"}" 92 | }, 93 | { 94 | "key": "ARROW:schema", 95 | "value": 
/**
 * The MIT License (MIT)
 * Copyright (c) 2016 Zhipeng Jia
 * https://github.com/zhipeng-jia/snappyjs
 */

// WORD_MASK[n] keeps the low n bytes of a 32-bit little-endian word
const WORD_MASK = [0, 0xff, 0xffff, 0xffffff, 0xffffffff]

/**
 * Copy bytes from one array to another, strictly one byte at a time in
 * ascending order. Back references rely on this: when source and destination
 * overlap in the same array, bytes written earlier in the loop are read again
 * later, which repeats the pattern.
 *
 * @param {Uint8Array} fromArray source array
 * @param {number} fromPos source position
 * @param {Uint8Array} toArray destination array
 * @param {number} toPos destination position
 * @param {number} length number of bytes to copy
 */
function copyBytes(fromArray, fromPos, toArray, toPos, length) {
  for (let i = 0; i < length; i++) {
    toArray[toPos + i] = fromArray[fromPos + i]
  }
}

/**
 * Decompress snappy data.
 * Accepts an output buffer to avoid allocating a new buffer for each call.
 * The output buffer must be exactly as long as the uncompressed data: the
 * length stored in the preamble is skipped, and a mismatch between the bytes
 * produced and `output.byteLength` is reported as an error.
 *
 * @param {Uint8Array} input compressed data
 * @param {Uint8Array} output output buffer
 * @throws {Error} if the input is truncated, contains an invalid back
 *   reference, or does not fill the output buffer exactly
 */
export function snappyUncompress(input, output) {
  const inputLength = input.byteLength
  const outputLength = output.byteLength
  let pos = 0
  let outPos = 0

  // skip preamble (contains uncompressed length as varint)
  while (pos < inputLength) {
    const c = input[pos]
    pos++
    if (c < 128) {
      break
    }
  }
  if (outputLength && pos >= inputLength) {
    throw new Error('invalid snappy length header')
  }

  while (pos < inputLength) {
    const c = input[pos]
    let len = 0
    pos++

    if (pos >= inputLength) {
      throw new Error('missing eof marker')
    }

    // There are two types of elements, literals and copies (back references)
    if ((c & 0x3) === 0) {
      // Literals are uncompressed data stored directly in the byte stream
      len = (c >>> 2) + 1
      // Longer literal length is encoded in multiple bytes
      if (len > 60) {
        if (pos + 3 >= inputLength) {
          throw new Error('snappy error literal pos + 3 >= inputLength')
        }
        const lengthSize = len - 60 // length bytes - 1
        len = input[pos]
          + (input[pos + 1] << 8)
          + (input[pos + 2] << 16)
          + (input[pos + 3] << 24)
        // `<< 24` and `&` work on signed 32-bit integers; `>>> 0` restores the
        // unsigned value so a length with the top bit set cannot go negative
        len = ((len & WORD_MASK[lengthSize]) >>> 0) + 1
        pos += lengthSize
      }
      if (pos + len > inputLength) {
        throw new Error('snappy error literal exceeds input length')
      }
      copyBytes(input, pos, output, outPos, len)
      pos += len
      outPos += len
    } else {
      // Copy elements
      let offset = 0 // offset back from current position to read
      switch (c & 0x3) {
        case 1:
          // Copy with 1-byte offset
          len = (c >>> 2 & 0x7) + 4
          offset = input[pos] + (c >>> 5 << 8)
          pos++
          break
        case 2:
          // Copy with 2-byte offset
          if (inputLength <= pos + 1) {
            throw new Error('snappy error end of input')
          }
          len = (c >>> 2) + 1
          offset = input[pos] + (input[pos + 1] << 8)
          pos += 2
          break
        case 3:
          // Copy with 4-byte offset
          if (inputLength <= pos + 3) {
            throw new Error('snappy error end of input')
          }
          len = (c >>> 2) + 1
          // `>>> 0` keeps the offset unsigned; a negative offset would slip
          // past both checks below and copy from beyond the end of the output
          offset = (input[pos]
            + (input[pos + 1] << 8)
            + (input[pos + 2] << 16)
            + (input[pos + 3] << 24)) >>> 0
          pos += 4
          break
        default:
          break
      }
      if (offset === 0 || Number.isNaN(offset)) {
        throw new Error(`invalid offset ${offset} pos ${pos} inputLength ${inputLength}`)
      }
      if (offset > outPos) {
        throw new Error('cannot copy from before start of buffer')
      }
      copyBytes(output, outPos - offset, output, outPos, len)
      outPos += len
    }
  }

  if (outPos !== outputLength) throw new Error('premature end of input')
}
"INT64" 43 | } 44 | }, 45 | { 46 | "file_offset": 116, 47 | "meta_data": { 48 | "codec": "LZ4", 49 | "data_page_offset": 116, 50 | "encoding_stats": [ 51 | { 52 | "count": 1, 53 | "encoding": "PLAIN_DICTIONARY", 54 | "page_type": "DICTIONARY_PAGE" 55 | }, 56 | { 57 | "count": 1, 58 | "encoding": "PLAIN_DICTIONARY", 59 | "page_type": "DATA_PAGE" 60 | } 61 | ], 62 | "encodings": [ 63 | "BIT_PACKED", 64 | "PLAIN_DICTIONARY" 65 | ], 66 | "num_values": 4, 67 | "path_in_schema": [ 68 | "c1" 69 | ], 70 | "statistics": { 71 | "max_value": "def", 72 | "min_value": "abc", 73 | "null_count": 0 74 | }, 75 | "total_compressed_size": 79, 76 | "total_uncompressed_size": 61, 77 | "type": "BYTE_ARRAY" 78 | } 79 | }, 80 | { 81 | "file_offset": 195, 82 | "meta_data": { 83 | "codec": "LZ4", 84 | "data_page_offset": 195, 85 | "encoding_stats": [ 86 | { 87 | "count": 1, 88 | "encoding": "PLAIN_DICTIONARY", 89 | "page_type": "DICTIONARY_PAGE" 90 | }, 91 | { 92 | "count": 1, 93 | "encoding": "PLAIN_DICTIONARY", 94 | "page_type": "DATA_PAGE" 95 | } 96 | ], 97 | "encodings": [ 98 | "BIT_PACKED", 99 | "PLAIN_DICTIONARY", 100 | "RLE" 101 | ], 102 | "num_values": 4, 103 | "path_in_schema": [ 104 | "v11" 105 | ], 106 | "statistics": { 107 | "max": 42.125, 108 | "max_value": 42.125, 109 | "min": 7.7, 110 | "min_value": 7.7, 111 | "null_count": 0 112 | }, 113 | "total_compressed_size": 123, 114 | "total_uncompressed_size": 108, 115 | "type": "DOUBLE" 116 | } 117 | } 118 | ], 119 | "num_rows": 4, 120 | "total_byte_size": 262 121 | } 122 | ], 123 | "schema": [ 124 | { 125 | "name": "", 126 | "num_children": 3 127 | }, 128 | { 129 | "name": "c0", 130 | "repetition_type": "REQUIRED", 131 | "type": "INT64" 132 | }, 133 | { 134 | "name": "c1", 135 | "repetition_type": "REQUIRED", 136 | "type": "BYTE_ARRAY" 137 | }, 138 | { 139 | "name": "v11", 140 | "repetition_type": "OPTIONAL", 141 | "type": "DOUBLE" 142 | } 143 | ], 144 | "version": 1 145 | } 146 | 
-------------------------------------------------------------------------------- /test/files/rowend_struct.metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 2, 3 | "schema": [ 4 | { 5 | "repetition_type": "REQUIRED", 6 | "name": "schema", 7 | "num_children": 1 8 | }, 9 | { 10 | "repetition_type": "OPTIONAL", 11 | "name": "s", 12 | "num_children": 2 13 | }, 14 | { 15 | "type": "BYTE_ARRAY", 16 | "repetition_type": "OPTIONAL", 17 | "name": "a", 18 | "converted_type": "UTF8", 19 | "logical_type": { 20 | "type": "STRING" 21 | } 22 | }, 23 | { 24 | "type": "BYTE_ARRAY", 25 | "repetition_type": "OPTIONAL", 26 | "name": "b", 27 | "converted_type": "UTF8", 28 | "logical_type": { 29 | "type": "STRING" 30 | } 31 | } 32 | ], 33 | "num_rows": 1050, 34 | "row_groups": [ 35 | { 36 | "columns": [ 37 | { 38 | "file_offset": 0, 39 | "meta_data": { 40 | "type": "BYTE_ARRAY", 41 | "encodings": [ 42 | "PLAIN", 43 | "RLE", 44 | "RLE_DICTIONARY" 45 | ], 46 | "path_in_schema": [ 47 | "s", 48 | "a" 49 | ], 50 | "codec": "SNAPPY", 51 | "num_values": 1050, 52 | "total_uncompressed_size": 10884, 53 | "total_compressed_size": 5709, 54 | "data_page_offset": 4290, 55 | "dictionary_page_offset": 4, 56 | "statistics": { 57 | "null_count": 0, 58 | "max_value": "v1049", 59 | "min_value": "v0000" 60 | }, 61 | "encoding_stats": [ 62 | { 63 | "page_type": "DICTIONARY_PAGE", 64 | "encoding": "PLAIN", 65 | "count": 1 66 | }, 67 | { 68 | "page_type": "DATA_PAGE", 69 | "encoding": "RLE_DICTIONARY", 70 | "count": 2 71 | } 72 | ], 73 | "size_statistics": { 74 | "unencoded_byte_array_data_bytes": 5250, 75 | "repetition_level_histogram": [], 76 | "definition_level_histogram": [ 77 | 0, 78 | 0, 79 | 1050 80 | ] 81 | } 82 | } 83 | }, 84 | { 85 | "file_offset": 0, 86 | "meta_data": { 87 | "type": "BYTE_ARRAY", 88 | "encodings": [ 89 | "PLAIN", 90 | "RLE", 91 | "RLE_DICTIONARY" 92 | ], 93 | "path_in_schema": [ 94 | "s", 95 | "b" 96 | ], 97 | 
"codec": "SNAPPY", 98 | "num_values": 1050, 99 | "total_uncompressed_size": 58, 100 | "total_compressed_size": 62, 101 | "data_page_offset": 5734, 102 | "dictionary_page_offset": 5713, 103 | "statistics": { 104 | "null_count": 0, 105 | "max_value": "x", 106 | "min_value": "x" 107 | }, 108 | "encoding_stats": [ 109 | { 110 | "page_type": "DICTIONARY_PAGE", 111 | "encoding": "PLAIN", 112 | "count": 1 113 | }, 114 | { 115 | "page_type": "DATA_PAGE", 116 | "encoding": "RLE_DICTIONARY", 117 | "count": 1 118 | } 119 | ], 120 | "size_statistics": { 121 | "unencoded_byte_array_data_bytes": 1050, 122 | "repetition_level_histogram": [], 123 | "definition_level_histogram": [ 124 | 0, 125 | 0, 126 | 1050 127 | ] 128 | } 129 | } 130 | } 131 | ], 132 | "total_byte_size": 10942, 133 | "num_rows": 1050, 134 | "file_offset": 4, 135 | "total_compressed_size": 5771 136 | } 137 | ], 138 | "key_value_metadata": [ 139 | { 140 | "key": "ARROW:schema", 141 | "value": "/////8AAAAAQAAAAAAAKAAwABgAFAAgACgAAAAABBAAMAAAACAAIAAAABAAIAAAABAAAAAEAAAAEAAAAsP///wAAAQ0YAAAAHAAAAAQAAAACAAAASAAAABAAAAABAAAAcwAAAKj////c////AAABBRAAAAAUAAAABAAAAAAAAAABAAAAYgAAAMz///8QABQACAAGAAcADAAAABAAEAAAAAAAAQUQAAAAGAAAAAQAAAAAAAAAAQAAAGEAAAAEAAQABAAAAAAAAAA=" 142 | } 143 | ], 144 | "created_by": "parquet-cpp-arrow version 21.0.0", 145 | "metadata_length": 558 146 | } 147 | -------------------------------------------------------------------------------- /src/schema.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Build a tree from the schema elements. 
/**
 * Build a tree from the schema elements.
 *
 * The flat schema list is walked depth-first: every element is followed by
 * the subtrees of its `num_children` children.
 *
 * @import {SchemaElement, SchemaTree} from '../src/types.d.ts'
 * @param {SchemaElement[]} schema
 * @param {number} rootIndex index of the root element
 * @param {string[]} path path to the element
 * @returns {SchemaTree} tree of schema elements
 * @throws {Error} if the schema list ends before all declared children are read
 */
function schemaTree(schema, rootIndex, path) {
  const element = schema[rootIndex]
  if (!element) {
    throw new Error(`parquet schema element missing at index ${rootIndex}`)
  }
  const children = []
  // number of flat schema entries consumed by this subtree (including itself)
  let count = 1

  // Read the specified number of children
  if (element.num_children) {
    while (children.length < element.num_children) {
      const childIndex = rootIndex + count
      const childElement = schema[childIndex]
      // num_children promised more elements than the schema list contains
      if (!childElement) {
        throw new Error(`parquet schema element missing at index ${childIndex}`)
      }
      const child = schemaTree(schema, childIndex, [...path, childElement.name])
      count += child.count
      children.push(child)
    }
  }

  return { count, element, children, path }
}

/**
 * Get schema elements from the root to the given element name.
 *
 * The returned list starts with the root tree node and contains one node per
 * part of `name`, in order.
 *
 * @param {SchemaElement[]} schema
 * @param {string[]} name path to the element
 * @returns {SchemaTree[]} list of schema elements
 * @throws {Error} if a part of the path has no matching child
 */
export function getSchemaPath(schema, name) {
  let tree = schemaTree(schema, 0, [])
  const path = [tree]
  for (const part of name) {
    const child = tree.children.find(child => child.element.name === part)
    if (!child) throw new Error(`parquet schema element not found: ${name}`)
    path.push(child)
    tree = child
  }
  return path
}
/**
 * Get all physical (leaf) column names.
 *
 * A leaf is a node without children; its name is the node path joined by '.'.
 *
 * @param {SchemaTree} schemaTree
 * @returns {string[]} list of physical column names
 */
export function getPhysicalColumns(schemaTree) {
  /** @type {string[]} */
  const columns = []
  /** @param {SchemaTree} node */
  function traverse(node) {
    if (node.children.length) {
      for (const child of node.children) {
        traverse(child)
      }
    } else {
      columns.push(node.path.join('.'))
    }
  }
  traverse(schemaTree)
  return columns
}

/**
 * Get the max repetition level for a given schema path.
 *
 * Counts the REPEATED elements along the path.
 *
 * @param {SchemaTree[]} schemaPath
 * @returns {number} max repetition level
 */
export function getMaxRepetitionLevel(schemaPath) {
  let maxLevel = 0
  for (const { element } of schemaPath) {
    if (element.repetition_type === 'REPEATED') {
      maxLevel++
    }
  }
  return maxLevel
}

/**
 * Get the max definition level for a given schema path.
 *
 * Counts the non-REQUIRED elements along the path, skipping the first
 * (root) entry.
 *
 * @param {SchemaTree[]} schemaPath
 * @returns {number} max definition level
 */
export function getMaxDefinitionLevel(schemaPath) {
  let maxLevel = 0
  for (const { element } of schemaPath.slice(1)) {
    if (element.repetition_type !== 'REQUIRED') {
      maxLevel++
    }
  }
  return maxLevel
}

/**
 * Check if a column is list-like.
 *
 * List-like means: annotated as LIST, with exactly one child that is
 * REPEATED and itself has at most one child.
 *
 * @param {SchemaTree} schema
 * @returns {boolean} true if list-like
 */
export function isListLike(schema) {
  if (!schema) return false
  if (schema.element.converted_type !== 'LIST') return false
  // exactly one child: with zero children there is no repeated group to inspect
  if (schema.children.length !== 1) return false

  const firstChild = schema.children[0]
  if (firstChild.children.length > 1) return false
  if (firstChild.element.repetition_type !== 'REPEATED') return false

  return true
}
/**
 * Check if a column is map-like.
 *
 * Map-like means: annotated as MAP, with exactly one child that is REPEATED
 * and has exactly two children, where neither the child named 'key' nor the
 * child named 'value' is REPEATED.
 *
 * @param {SchemaTree} schema
 * @returns {boolean} true if map-like
 */
export function isMapLike(schema) {
  if (!schema) return false
  if (schema.element.converted_type !== 'MAP') return false
  // exactly one child: with zero children there is no key/value group to inspect
  if (schema.children.length !== 1) return false

  const firstChild = schema.children[0]
  if (firstChild.children.length !== 2) return false
  if (firstChild.element.repetition_type !== 'REPEATED') return false

  const keyChild = firstChild.children.find(child => child.element.name === 'key')
  if (keyChild?.element.repetition_type === 'REPEATED') return false

  const valueChild = firstChild.children.find(child => child.element.name === 'value')
  if (valueChild?.element.repetition_type === 'REPEATED') return false

  return true
}

/**
 * Returns true if a column is non-nested.
 *
 * A flat column has a schema path of exactly [root, column], where the
 * column is not REPEATED and has no children.
 *
 * @param {SchemaTree[]} schemaPath
 * @returns {boolean}
 */
export function isFlatColumn(schemaPath) {
  if (schemaPath.length !== 2) return false
  const [, column] = schemaPath
  if (column.element.repetition_type === 'REPEATED') return false
  if (column.children.length) return false
  return true
}
"{\"type\":\"struct\",\"fields\":[{\"name\":\"a\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"b\",\"type\":\"integer\",\"nullable\":false,\"metadata\":{}},{\"name\":\"c\",\"type\":\"double\",\"nullable\":false,\"metadata\":{}},{\"name\":\"d\",\"type\":\"boolean\",\"nullable\":false,\"metadata\":{}},{\"name\":\"e\",\"type\":{\"type\":\"array\",\"elementType\":\"integer\",\"containsNull\":false},\"nullable\":true,\"metadata\":{}}]}" 8 | } 9 | ], 10 | "metadata_length": 836, 11 | "num_rows": 5, 12 | "row_groups": [ 13 | { 14 | "columns": [ 15 | { 16 | "file_offset": 4, 17 | "meta_data": { 18 | "codec": "SNAPPY", 19 | "data_page_offset": 4, 20 | "encodings": ["PLAIN", "RLE_DICTIONARY"], 21 | "num_values": 5, 22 | "path_in_schema": ["a"], 23 | "statistics": { 24 | "max": "abc", 25 | "min": "abc", 26 | "null_count": 1 27 | }, 28 | "total_compressed_size": 63, 29 | "total_uncompressed_size": 59, 30 | "type": "BYTE_ARRAY" 31 | } 32 | }, 33 | { 34 | "file_offset": 67, 35 | "meta_data": { 36 | "codec": "SNAPPY", 37 | "data_page_offset": 67, 38 | "encodings": ["DELTA_BINARY_PACKED"], 39 | "num_values": 5, 40 | "path_in_schema": ["b"], 41 | "statistics": { 42 | "max": 5, 43 | "min": 1, 44 | "null_count": 0 45 | }, 46 | "total_compressed_size": 49, 47 | "total_uncompressed_size": 47, 48 | "type": "INT32" 49 | } 50 | }, 51 | { 52 | "file_offset": 116, 53 | "meta_data": { 54 | "codec": "SNAPPY", 55 | "data_page_offset": 116, 56 | "encodings": ["PLAIN", "RLE_DICTIONARY"], 57 | "num_values": 5, 58 | "path_in_schema": ["c"], 59 | "statistics": { 60 | "max": 5, 61 | "min": 2, 62 | "null_count": 0 63 | }, 64 | "total_compressed_size": 88, 65 | "total_uncompressed_size": 94, 66 | "type": "DOUBLE" 67 | } 68 | }, 69 | { 70 | "file_offset": 204, 71 | "meta_data": { 72 | "codec": "SNAPPY", 73 | "data_page_offset": 204, 74 | "encodings": ["RLE"], 75 | "num_values": 5, 76 | "path_in_schema": ["d"], 77 | "statistics": { 78 | "max": true, 79 | "min": false, 80 | 
"null_count": 0 81 | }, 82 | "total_compressed_size": 39, 83 | "total_uncompressed_size": 37, 84 | "type": "BOOLEAN" 85 | } 86 | }, 87 | { 88 | "file_offset": 243, 89 | "meta_data": { 90 | "codec": "SNAPPY", 91 | "data_page_offset": 243, 92 | "encodings": ["PLAIN", "RLE_DICTIONARY"], 93 | "num_values": 10, 94 | "path_in_schema": [ 95 | "e", 96 | "list", 97 | "element" 98 | ], 99 | "statistics": { 100 | "max": 3, 101 | "min": 1, 102 | "null_count": 2 103 | }, 104 | "total_compressed_size": 78, 105 | "total_uncompressed_size": 74, 106 | "type": "INT32" 107 | } 108 | } 109 | ], 110 | "num_rows": 5, 111 | "total_byte_size": 311 112 | } 113 | ], 114 | "schema": [ 115 | { 116 | "name": "spark_schema", 117 | "num_children": 5 118 | }, 119 | { 120 | "converted_type": "UTF8", 121 | "name": "a", 122 | "repetition_type": "OPTIONAL", 123 | "type": "BYTE_ARRAY" 124 | }, 125 | { 126 | "name": "b", 127 | "repetition_type": "REQUIRED", 128 | "type": "INT32" 129 | }, 130 | { 131 | "name": "c", 132 | "repetition_type": "REQUIRED", 133 | "type": "DOUBLE" 134 | }, 135 | { 136 | "name": "d", 137 | "repetition_type": "REQUIRED", 138 | "type": "BOOLEAN" 139 | }, 140 | { 141 | "converted_type": "LIST", 142 | "name": "e", 143 | "num_children": 1, 144 | "repetition_type": "OPTIONAL" 145 | }, 146 | { 147 | "name": "list", 148 | "num_children": 1, 149 | "repetition_type": "REPEATED" 150 | }, 151 | { 152 | "name": "element", 153 | "repetition_type": "REQUIRED", 154 | "type": "INT32" 155 | } 156 | ] 157 | } 158 | -------------------------------------------------------------------------------- /test/thrift.test.js: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from 'vitest' 2 | import { deserializeTCompactProtocol, readVarInt } from '../src/thrift.js' 3 | import { reader } from './helpers.js' 4 | 5 | describe('deserializeTCompactProtocol function', () => { 6 | 7 | it('parses basic types correctly', () => { 8 | const buffer = 
new ArrayBuffer(128) 9 | const view = new DataView(buffer) 10 | let index = 0 11 | 12 | // Boolean 13 | view.setUint8(index++, 0x11) // Field 1 type TRUE 14 | view.setUint8(index++, 0x12) // Field 2 type FALSE 15 | 16 | // Byte 17 | view.setUint8(index++, 0x13) // Field 3 type BYTE 18 | view.setUint8(index++, 0x7f) // Max value for a signed byte 19 | 20 | // Int16 21 | view.setUint8(index++, 0x14) // Field 4 type int16 22 | view.setUint8(index++, 0xfe) // 0xfffe zigzag => 16-bit max value 0x7fff 23 | view.setUint8(index++, 0xff) 24 | view.setUint8(index++, 0x3) 25 | 26 | // Int32 27 | view.setUint8(index++, 0x15) // Field 5 type int32 28 | view.setUint8(index++, 0xfe) // 0xfffffffe zigzag => 32-bit max value 0x7fffffff 29 | view.setUint8(index++, 0xff) 30 | view.setUint8(index++, 0xff) 31 | view.setUint8(index++, 0xff) 32 | view.setUint8(index++, 0x0f) 33 | 34 | // Int64 35 | view.setUint8(index++, 0x16) // Field 6 type int64 36 | view.setUint8(index++, 0xfe) 37 | view.setUint8(index++, 0xff) 38 | view.setUint8(index++, 0xff) 39 | view.setUint8(index++, 0xff) 40 | view.setUint8(index++, 0xff) 41 | view.setUint8(index++, 0xff) 42 | view.setUint8(index++, 0xff) 43 | view.setUint8(index++, 0xff) 44 | view.setUint8(index++, 0xff) 45 | view.setUint8(index++, 0x01) 46 | 47 | // Double 48 | view.setUint8(index++, 0x17) // Field 7 type DOUBLE 49 | view.setFloat64(index, 123.456, true) 50 | index += 8 51 | 52 | // String 53 | const str = 'Hello, Thrift!' 
54 | view.setUint8(index++, 0x18) // Field 8 type STRING 55 | // write string length as varint 56 | const stringLengthVarInt = toVarInt(str.length) 57 | stringLengthVarInt.forEach(byte => view.setUint8(index++, byte)) 58 | // write string bytes 59 | for (let i = 0; i < str.length; i++) { 60 | view.setUint8(index++, str.charCodeAt(i)) 61 | } 62 | 63 | // Mark the end of the structure 64 | view.setUint8(index, 0x00) // STOP field 65 | 66 | const reader = { view, offset: 0 } 67 | const value = deserializeTCompactProtocol(reader) 68 | expect(reader.offset).toBe(index + 1) 69 | 70 | // Assertions for each basic type 71 | expect(value.field_1).toBe(true) // TRUE 72 | expect(value.field_2).toBe(false) // FALSE 73 | expect(value.field_3).toBe(0x7f) // BYTE 74 | expect(value.field_4).toBe(0x7fff) // I16 75 | expect(value.field_5).toBe(0x7fffffff) // I32 76 | expect(value.field_6).toBe(BigInt('0x7fffffffffffffff')) // I64 77 | expect(value.field_7).toBeCloseTo(123.456) // DOUBLE 78 | expect(new TextDecoder().decode(value.field_8)).toBe('Hello, Thrift!') // STRING 79 | }) 80 | 81 | it('parses rle-dict column index correctly', () => { 82 | const buffer = new Uint8Array([25, 17, 2, 25, 24, 8, 0, 0, 0, 0, 0, 0, 0, 0, 25, 24, 8, 0, 0, 0, 0, 0, 0, 0, 0, 21, 2, 25, 22, 0, 0]) 83 | const view = new DataView(buffer.buffer) 84 | const reader = { view, offset: 0 } 85 | const value = deserializeTCompactProtocol(reader) 86 | expect(value.field_1).toEqual([false]) 87 | expect(value.field_2).toEqual([new Uint8Array([0, 0, 0, 0, 0, 0, 0, 0])]) 88 | expect(value.field_3).toEqual([new Uint8Array([0, 0, 0, 0, 0, 0, 0, 0])]) 89 | expect(value.field_4).toEqual(1) 90 | expect(value.field_5).toEqual([0n]) 91 | expect(value.field_6).toBeUndefined() 92 | expect(value.field_7).toBeUndefined() 93 | expect(value.field_8).toBeUndefined() 94 | }) 95 | 96 | }) 97 | 98 | describe('readVarInt', () => { 99 | it('read single-byte varint', () => { 100 | expect(readVarInt(reader([0x01]))).toBe(1) 101 | 
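expect(readVarInt(reader([0x7f]))).toBe(127) 102 | }) 103 | 104 | it('read multi-byte varint', () => { 105 | // 129 as varint (0b10000001 00000001) 106 | expect(readVarInt(reader([0x81, 0x01]))).toBe(129) 107 | // 16643 as varint (0b10000011 10000010 00000001) 108 | expect(readVarInt(reader([0x83, 0x82, 0x01]))).toBe(16643) 109 | }) 110 | 111 | it('read maximum int32 varint', () => { 112 | // 2147483647 as varint (0b11111111 11111111 11111111 11111111 00000111) 113 | expect(readVarInt(reader([0xff, 0xff, 0xff, 0xff, 0x07]))).toBe(2147483647) 114 | }) 115 | }) 116 | 117 | /** 118 | * Convert int to varint. Outputs 1-5 bytes for int32. 119 | * 120 | * @param {number} n 121 | * @returns {number[]} 122 | */ 123 | function toVarInt(n) { 124 | let idx = 0 125 | const varInt = [] 126 | while (true) { 127 | if ((n & ~0x7f) === 0) { 128 | varInt[idx++] = n 129 | break 130 | } else { 131 | varInt[idx++] = n & 0x7f | 0x80 132 | n >>>= 7 133 | } 134 | } 135 | return varInt 136 | } 137 | -------------------------------------------------------------------------------- /test/files/delta_encoding_required_column.column_indexes.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | { 4 | "null_pages": [ 5 | false 6 | ], 7 | "min_values": [ 8 | 1 9 | ], 10 | "max_values": [ 11 | 105 12 | ], 13 | "boundary_order": "ASCENDING", 14 | "null_counts": [ 15 | 0 16 | ] 17 | }, 18 | { 19 | "null_pages": [ 20 | false 21 | ], 22 | "min_values": [ 23 | 8817 24 | ], 25 | "max_values": [ 26 | 1895444 27 | ], 28 | "boundary_order": "ASCENDING", 29 | "null_counts": [ 30 | 0 31 | ] 32 | }, 33 | { 34 | "null_pages": [ 35 | false 36 | ], 37 | "min_values": [ 38 | 37 39 | ], 40 | "max_values": [ 41 | 7135 42 | ], 43 | "boundary_order": "ASCENDING", 44 | "null_counts": [ 45 | 0 46 | ] 47 | }, 48 | { 49 | "null_pages": [ 50 | false 51 | ], 52 | "min_values": [ 53 | 464 54 | ], 55 | "max_values": [ 56 | 49388 57 | ], 58 | "boundary_order": "ASCENDING", 59 | "null_counts": [ 60 | 0 61 | ] 62 | }, 63 | { 64 | "null_pages": [ 65 | false 66 | ], 67 | "min_values": [ 68 | 2449130 69 | ], 70 | "max_values": [ 71 | 2452641 72 | ], 73 | "boundary_order": "ASCENDING", 74 | "null_counts": [ 75 | 0 76 | ] 77 | }, 78 | { 79 | "null_pages": [ 80 | false 81 | ], 82 | "min_values": [ 83 | 2449100 84 | ], 85 | "max_values": [ 86 | 2452611 87 | ], 88 | "boundary_order": "ASCENDING", 89 | "null_counts": [ 90 | 0 91 | ] 92 | }, 93 | { 94 | "null_pages": [ 95 | false 96 | ], 97 | "min_values": [ 98 | 1 99 | ], 100 | "max_values": [ 101 | 30 102 | ], 103 | "boundary_order": "ASCENDING", 104 | "null_counts": [ 105 | 0 106 | ] 107 | }, 108 | { 109 | "null_pages": [ 110 | false 111 | ], 112 | "min_values": [ 113 | 1 114 | ], 115 | "max_values": [ 116 | 12 117 | ], 118 | "boundary_order": "ASCENDING", 119 | "null_counts": [ 120 | 0 121 | ] 122 | }, 123 | { 124 | "null_pages": [ 125 | false 126 | ], 127 | "min_values": [ 128 | 1925 129 | ], 130 | "max_values": [ 131 | 1991 132 | ], 133 | "boundary_order": "ASCENDING", 134 | "null_counts": [ 135 | 0 136 | ] 137 | }, 138 | { 139 | "null_pages": [ 140 | false 141 | ], 142 | "min_values": [ 143 | "AAAAAAAAABAAAAAA" 144 | ], 145 | "max_values": [ 146 | "AAAAAAAAPFAAAAAA" 147 | ], 148 | "boundary_order": "ASCENDING", 149 | "null_counts": [ 150 | 0 151 | ] 152 | }, 153 | { 154 | "null_pages": [ 155 | false 156 | ], 157 | "min_values": [ 158 | "Dr."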
"null_counts": [ 60 | 0 61 | ] 62 | }, 63 | { 64 | "null_pages": [ 65 | false 66 | ], 67 | "min_values": [ 68 | 2449130 69 | ], 70 | "max_values": [ 71 | 2452641 72 | ], 73 | "boundary_order": "ASCENDING", 74 | "null_counts": [ 75 | 0 76 | ] 77 | }, 78 | { 79 | "null_pages": [ 80 | false 81 | ], 82 | "min_values": [ 83 | 2449100 84 | ], 85 | "max_values": [ 86 | 2452611 87 | ], 88 | "boundary_order": "ASCENDING", 89 | "null_counts": [ 90 | 0 91 | ] 92 | }, 93 | { 94 | "null_pages": [ 95 | false 96 | ], 97 | "min_values": [ 98 | 1 99 | ], 100 | "max_values": [ 101 | 30 102 | ], 103 | "boundary_order": "ASCENDING", 104 | "null_counts": [ 105 | 0 106 | ] 107 | }, 108 | { 109 | "null_pages": [ 110 | false 111 | ], 112 | "min_values": [ 113 | 1 114 | ], 115 | "max_values": [ 116 | 12 117 | ], 118 | "boundary_order": "ASCENDING", 119 | "null_counts": [ 120 | 0 121 | ] 122 | }, 123 | { 124 | "null_pages": [ 125 | false 126 | ], 127 | "min_values": [ 128 | 1925 129 | ], 130 | "max_values": [ 131 | 1991 132 | ], 133 | "boundary_order": "ASCENDING", 134 | "null_counts": [ 135 | 0 136 | ] 137 | }, 138 | { 139 | "null_pages": [ 140 | false 141 | ], 142 | "min_values": [ 143 | "AAAAAAAAABAAAAAA" 144 | ], 145 | "max_values": [ 146 | "AAAAAAAAPFAAAAAA" 147 | ], 148 | "boundary_order": "ASCENDING", 149 | "null_counts": [ 150 | 0 151 | ] 152 | }, 153 | { 154 | "null_pages": [ 155 | false 156 | ], 157 | "min_values": [ 158 | "Dr." 
159 | ], 160 | "max_values": [ 161 | "Sir" 162 | ], 163 | "boundary_order": "ASCENDING", 164 | "null_counts": [ 165 | 0 166 | ] 167 | }, 168 | { 169 | "null_pages": [ 170 | false 171 | ], 172 | "min_values": [ 173 | "Albert" 174 | ], 175 | "max_values": [ 176 | "William" 177 | ], 178 | "boundary_order": "ASCENDING", 179 | "null_counts": [ 180 | 0 181 | ] 182 | }, 183 | { 184 | "null_pages": [ 185 | false 186 | ], 187 | "min_values": [ 188 | "Baker" 189 | ], 190 | "max_values": [ 191 | "Young" 192 | ], 193 | "boundary_order": "ASCENDING", 194 | "null_counts": [ 195 | 0 196 | ] 197 | }, 198 | { 199 | "null_pages": [ 200 | false 201 | ], 202 | "min_values": [ 203 | "N" 204 | ], 205 | "max_values": [ 206 | "Y" 207 | ], 208 | "boundary_order": "ASCENDING", 209 | "null_counts": [ 210 | 0 211 | ] 212 | }, 213 | { 214 | "null_pages": [ 215 | false 216 | ], 217 | "min_values": [ 218 | "AFGHANISTAN" 219 | ], 220 | "max_values": [ 221 | "WALLIS AND FUTUNA" 222 | ], 223 | "boundary_order": "ASCENDING", 224 | "null_counts": [ 225 | 0 226 | ] 227 | }, 228 | { 229 | "null_pages": [ 230 | false 231 | ], 232 | "min_values": [ 233 | "Albert.Brunson@62.com" 234 | ], 235 | "max_values": [ 236 | "William.Warner@zegnrzurU.org" 237 | ], 238 | "boundary_order": "ASCENDING", 239 | "null_counts": [ 240 | 0 241 | ] 242 | }, 243 | { 244 | "null_pages": [ 245 | false 246 | ], 247 | "min_values": [ 248 | "2452293" 249 | ], 250 | "max_values": [ 251 | "2452644" 252 | ], 253 | "boundary_order": "ASCENDING", 254 | "null_counts": [ 255 | 0 256 | ] 257 | } 258 | ] 259 | ] -------------------------------------------------------------------------------- /test/files/decimal-column.metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 2, 3 | "schema": [ 4 | { 5 | "repetition_type": "REQUIRED", 6 | "name": "schema", 7 | "num_children": 2 8 | }, 9 | { 10 | "type": "INT64", 11 | "repetition_type": "OPTIONAL", 12 | "name": "mid" 13 | }, 14 | { 
15 | "type": "FIXED_LEN_BYTE_ARRAY", 16 | "type_length": 6, 17 | "repetition_type": "OPTIONAL", 18 | "name": "value", 19 | "converted_type": "DECIMAL", 20 | "scale": 10, 21 | "precision": 14, 22 | "logical_type": { 23 | "type": "DECIMAL", 24 | "scale": 10, 25 | "precision": 14 26 | } 27 | } 28 | ], 29 | "num_rows": 5, 30 | "row_groups": [ 31 | { 32 | "columns": [ 33 | { 34 | "file_offset": 0, 35 | "meta_data": { 36 | "type": "INT64", 37 | "encodings": [ 38 | "PLAIN", 39 | "RLE", 40 | "RLE_DICTIONARY" 41 | ], 42 | "path_in_schema": [ 43 | "mid" 44 | ], 45 | "codec": "SNAPPY", 46 | "num_values": 5, 47 | "total_uncompressed_size": 126, 48 | "total_compressed_size": 120, 49 | "data_page_offset": 50, 50 | "dictionary_page_offset": 4, 51 | "statistics": { 52 | "max": 190, 53 | "min": 40, 54 | "null_count": 0, 55 | "max_value": 190, 56 | "min_value": 40 57 | }, 58 | "encoding_stats": [ 59 | { 60 | "page_type": "DICTIONARY_PAGE", 61 | "encoding": "PLAIN", 62 | "count": 1 63 | }, 64 | { 65 | "page_type": "DATA_PAGE", 66 | "encoding": "RLE_DICTIONARY", 67 | "count": 1 68 | } 69 | ] 70 | } 71 | }, 72 | { 73 | "file_offset": 0, 74 | "meta_data": { 75 | "type": "FIXED_LEN_BYTE_ARRAY", 76 | "encodings": [ 77 | "PLAIN", 78 | "RLE", 79 | "RLE_DICTIONARY" 80 | ], 81 | "path_in_schema": [ 82 | "value" 83 | ], 84 | "codec": "SNAPPY", 85 | "num_values": 5, 86 | "total_uncompressed_size": 82, 87 | "total_compressed_size": 86, 88 | "data_page_offset": 146, 89 | "dictionary_page_offset": 124, 90 | "statistics": { 91 | "max": 2015, 92 | "min": 2015, 93 | "null_count": 0, 94 | "max_value": 2015, 95 | "min_value": 2015 96 | }, 97 | "encoding_stats": [ 98 | { 99 | "page_type": "DICTIONARY_PAGE", 100 | "encoding": "PLAIN", 101 | "count": 1 102 | }, 103 | { 104 | "page_type": "DATA_PAGE", 105 | "encoding": "RLE_DICTIONARY", 106 | "count": 1 107 | } 108 | ] 109 | } 110 | } 111 | ], 112 | "total_byte_size": 208, 113 | "num_rows": 5, 114 | "file_offset": 4, 115 | "total_compressed_size": 206, 116 
| "ordinal": 0 117 | } 118 | ], 119 | "key_value_metadata": [ 120 | { 121 | "key": "pandas", 122 | "value": "{\"index_columns\": [{\"kind\": \"range\", \"name\": null, \"start\": 0, \"stop\": 5, \"step\": 1}], \"column_indexes\": [{\"name\": null, \"field_name\": null, \"pandas_type\": \"unicode\", \"numpy_type\": \"object\", \"metadata\": {\"encoding\": \"UTF-8\"}}], \"columns\": [{\"name\": \"mid\", \"field_name\": \"mid\", \"pandas_type\": \"int64\", \"numpy_type\": \"int64\", \"metadata\": null}, {\"name\": \"value\", \"field_name\": \"value\", \"pandas_type\": \"decimal\", \"numpy_type\": \"object\", \"metadata\": {\"precision\": 14, \"scale\": 10}}], \"creator\": {\"library\": \"pyarrow\", \"version\": \"19.0.0\"}, \"pandas_version\": \"2.2.3\"}" 123 | }, 124 | { 125 | "key": "ARROW:schema", 126 | "value": "/////xgDAAAQAAAAAAAKAA4ABgAFAAgACgAAAAABBAAQAAAAAAAKAAwAAAAEAAgACgAAAGACAAAEAAAAAQAAAAQAAACA/f//QAIAAAQAAAAyAgAAeyJpbmRleF9jb2x1bW5zIjogW3sia2luZCI6ICJyYW5nZSIsICJuYW1lIjogbnVsbCwgInN0YXJ0IjogMCwgInN0b3AiOiA1LCAic3RlcCI6IDF9XSwgImNvbHVtbl9pbmRleGVzIjogW3sibmFtZSI6IG51bGwsICJmaWVsZF9uYW1lIjogbnVsbCwgInBhbmRhc190eXBlIjogInVuaWNvZGUiLCAibnVtcHlfdHlwZSI6ICJvYmplY3QiLCAibWV0YWRhdGEiOiB7ImVuY29kaW5nIjogIlVURi04In19XSwgImNvbHVtbnMiOiBbeyJuYW1lIjogIm1pZCIsICJmaWVsZF9uYW1lIjogIm1pZCIsICJwYW5kYXNfdHlwZSI6ICJpbnQ2NCIsICJudW1weV90eXBlIjogImludDY0IiwgIm1ldGFkYXRhIjogbnVsbH0sIHsibmFtZSI6ICJ2YWx1ZSIsICJmaWVsZF9uYW1lIjogInZhbHVlIiwgInBhbmRhc190eXBlIjogImRlY2ltYWwiLCAibnVtcHlfdHlwZSI6ICJvYmplY3QiLCAibWV0YWRhdGEiOiB7InByZWNpc2lvbiI6IDE0LCAic2NhbGUiOiAxMH19XSwgImNyZWF0b3IiOiB7ImxpYnJhcnkiOiAicHlhcnJvdyIsICJ2ZXJzaW9uIjogIjE5LjAuMCJ9LCAicGFuZGFzX3ZlcnNpb24iOiAiMi4yLjMifQAABgAAAHBhbmRhcwAAAgAAAFAAAAAEAAAAyP///wAAAQcQAAAAIAAAAAQAAAAAAAAABQAAAHZhbHVlAAAACAAMAAQACAAIAAAADgAAAAoAAAAQABQACAAGAAcADAAAABAAEAAAAAAAAQIQAAAAHAAAAAQAAAAAAAAAAwAAAG1pZAAIAAwACAAHAAgAAAAAAAABQAAAAAAAAAA=" 127 | } 128 | ], 129 | "created_by": "parquet-cpp-arrow version 19.0.0", 130 | "metadata_length": 1959 
/**
 * Minimum bits needed to store value.
 *
 * Computed as 32 minus the number of leading zero bits of the value taken
 * as a 32-bit integer, so bitWidth(0) is 0.
 *
 * @param {number} value
 * @returns {number}
 */
export function bitWidth(value) {
  const leadingZeros = Math.clz32(value)
  return 32 - leadingZeros
}

/**
 * Read values from a run-length encoded/bit-packed hybrid encoding.
 *
 * Runs are decoded until `output` is full. If length is omitted, an
 * unsigned little-endian int32 length is read at the start. Afterwards the
 * reader is positioned `length` bytes past the start of the encoded data,
 * regardless of how many bytes the runs consumed.
 *
 * @param {DataReader} reader
 * @param {number} width - bitwidth
 * @param {DecodedArray} output
 * @param {number} [length] - length of the encoded data
 */
export function readRleBitPackedHybrid(reader, width, output, length) {
  let byteLength = length
  if (byteLength === undefined) {
    byteLength = reader.view.getUint32(reader.offset, true)
    reader.offset += 4
  }
  const endOffset = reader.offset + byteLength

  for (let filled = 0; filled < output.length;) {
    const header = readVarInt(reader)
    // lowest header bit selects the run type
    const isBitPacked = (header & 1) === 1
    if (isBitPacked) {
      filled = readBitPacked(reader, header, width, output, filled)
    } else {
      const runLength = header >>> 1
      readRle(reader, runLength, width, output, filled)
      filled += runLength
    }
  }

  reader.offset = endOffset // duckdb writes an empty block
}
/**
 * Run-length encoding: read value with bitWidth and repeat it count times.
 *
 * The value is stored little-endian in ceil(bitWidth / 8) bytes and is
 * returned as an unsigned number.
 *
 * @param {DataReader} reader
 * @param {number} count
 * @param {number} bitWidth
 * @param {DecodedArray} output
 * @param {number} seen
 */
function readRle(reader, count, bitWidth, output, seen) {
  const width = bitWidth + 7 >> 3 // bytes used to store the repeated value
  let value = 0
  for (let i = 0; i < width; i++) {
    value |= reader.view.getUint8(reader.offset++) << (i << 3)
  }
  // a fourth byte lands on the int32 sign bit: reinterpret as unsigned
  value >>>= 0

  // repeat value count times
  for (let i = 0; i < count; i++) {
    output[seen + i] = value
  }
}

/**
 * Read a bit-packed run of the rle/bitpack hybrid.
 * Supports width > 8 (crossing bytes), up to 32 bits.
 *
 * Values are extracted least-significant-bit first from a sliding bit
 * buffer. Values beyond the end of `output` are consumed but not stored.
 *
 * @param {DataReader} reader
 * @param {number} header - bit-pack header
 * @param {number} bitWidth
 * @param {DecodedArray} output
 * @param {number} seen
 * @returns {number} total output values so far
 * @throws {Error} if the run starts past the end of the view and bitWidth is non-zero
 */
function readBitPacked(reader, header, bitWidth, output, seen) {
  let count = header >> 1 << 3 // values to read
  // The bit buffer holds up to bitWidth + 8 pending bits. 32-bit integer
  // operators are only exact while that fits in 32 bits (bitWidth <= 24);
  // wider values use float arithmetic, which is exact below 2^53.
  const wide = bitWidth > 24
  const modulus = 2 ** bitWidth
  const mask = wide ? modulus - 1 : (1 << bitWidth) - 1

  let data = 0
  if (reader.offset < reader.view.byteLength) {
    data = reader.view.getUint8(reader.offset++)
  } else if (mask) {
    // sometimes out-of-bounds reads are masked out
    throw new Error(`parquet bitpack offset ${reader.offset} out of range`)
  }
  let left = 8 // number of bits currently buffered in data
  let right = 0 // bit position of the next value within data

  // read values
  while (count) {
    // if we have crossed a byte boundary, shift the data
    if (right > 8) {
      right -= 8
      left -= 8
      data = wide ? Math.floor(data / 256) : data >>> 8
    } else if (left - right < bitWidth) {
      // if we don't have bitWidth number of bits to read, read next byte
      const byte = reader.view.getUint8(reader.offset)
      reader.offset++
      // bits at and above `left` are zero, so adding equals or-ing
      data = wide ? data + byte * 2 ** left : data | byte << left
      left += 8
    } else {
      if (seen < output.length) {
        // emit value
        output[seen++] = wide
          ? Math.floor(data / 2 ** right) % modulus
          : data >> right & mask
      }
      count--
      right += bitWidth
    }
  }

  return seen
}
/**
 * Decode BYTE_STREAM_SPLIT data.
 *
 * The input holds one stream per byte position of the value: first byte 0
 * of every value, then byte 1 of every value, and so on. The bytes are
 * re-interleaved and viewed as the array type matching `type`.
 *
 * @param {DataReader} reader
 * @param {number} count
 * @param {ParquetType} type
 * @param {number | undefined} typeLength
 * @returns {DecodedArray}
 */
export function byteStreamSplit(reader, count, type, typeLength) {
  const width = byteWidth(type, typeLength)
  const bytes = new Uint8Array(count * width)
  for (let stream = 0; stream < width; stream++) {
    // byte `stream` of value `row` belongs at row * width + stream
    let target = stream
    for (let row = 0; row < count; row++, target += width) {
      bytes[target] = reader.view.getUint8(reader.offset++)
    }
  }

  // interpret bytes as typed array
  switch (type) {
    case 'FLOAT':
      return new Float32Array(bytes.buffer)
    case 'DOUBLE':
      return new Float64Array(bytes.buffer)
    case 'INT32':
      return new Int32Array(bytes.buffer)
    case 'INT64':
      return new BigInt64Array(bytes.buffer)
    case 'FIXED_LEN_BYTE_ARRAY':
      // split into arrays of typeLength
      return Array.from(
        { length: count },
        (_, row) => bytes.subarray(row * width, (row + 1) * width)
      )
    default:
      throw new Error(`parquet byte_stream_split unsupported type: ${type}`)
  }
}

/**
 * Number of bytes per value for a physical type.
 *
 * @import {DataReader, DecodedArray, ParquetType} from '../src/types.d.ts'
 * @param {ParquetType} type
 * @param {number | undefined} typeLength
 * @returns {number}
 */
function byteWidth(type, typeLength) {
  if (type === 'INT32' || type === 'FLOAT') return 4
  if (type === 'INT64' || type === 'DOUBLE') return 8
  if (type === 'FIXED_LEN_BYTE_ARRAY') {
    if (!typeLength) throw new Error('parquet byteWidth missing type_length')
    return typeLength
  }
  throw new Error(`parquet unsupported type: ${type}`)
}
-------------------------------------------------------------------------------- 1 | import { describe, expect, it } from 'vitest' 2 | import { bitWidth, readRleBitPackedHybrid } from '../src/encoding.js' 3 | 4 | describe('readRle', () => { 5 | it('reads RLE values with explicit length', () => { 6 | const buffer = new ArrayBuffer(4) 7 | const view = new DataView(buffer) 8 | // RLE 3x true 9 | view.setUint8(0, 0b00000110) 10 | view.setUint8(1, 1) 11 | // RLE 3x 100 12 | view.setUint8(2, 0b00000110) 13 | view.setUint8(3, 100) 14 | const reader = { view, offset: 0 } 15 | 16 | const values = new Array(6) 17 | readRleBitPackedHybrid(reader, 1, values, 4) 18 | expect(reader.offset).toBe(4) 19 | expect(values).toEqual([1, 1, 1, 100, 100, 100]) 20 | }) 21 | 22 | it('reads RLE values with bitwidth=16', () => { 23 | const buffer = new ArrayBuffer(6) 24 | const view = new DataView(buffer) 25 | // RLE 3x 65535 26 | view.setUint8(3, 0b00000110) 27 | view.setUint16(4, 65535, true) 28 | const reader = { view, offset: 0 } 29 | 30 | const values = new Array(3) 31 | readRleBitPackedHybrid(reader, 16, values, 6) 32 | expect(reader.offset).toBe(6) 33 | expect(values).toEqual([65535, 65535, 65535]) 34 | }) 35 | 36 | it('reads RLE values with bitwidth=24', () => { 37 | const buffer = new ArrayBuffer(4) 38 | const view = new DataView(buffer) 39 | // RLE 2x 16777215 40 | view.setUint8(0, 0b00000100) 41 | view.setUint8(1, 255) 42 | view.setUint8(2, 255) 43 | view.setUint8(3, 255) 44 | const reader = { view, offset: 0 } 45 | 46 | const values = new Array(2) 47 | readRleBitPackedHybrid(reader, 24, values, 4) 48 | expect(reader.offset).toBe(4) 49 | expect(values).toEqual([16777215, 16777215]) 50 | }) 51 | 52 | it('reads RLE values with bitwidth=32', () => { 53 | const buffer = new ArrayBuffer(5) 54 | const view = new DataView(buffer) 55 | // RLE 3x 234000 56 | view.setUint8(0, 0b00000110) 57 | view.setUint32(1, 234000, true) 58 | const reader = { view, offset: 0 } 59 | 60 | const values = new 
Array(3) 61 | readRleBitPackedHybrid(reader, 32, values, 5) 62 | expect(reader.offset).toBe(5) 63 | expect(values).toEqual([234000, 234000, 234000]) 64 | }) 65 | }) 66 | 67 | describe('readBitPacked', () => { 68 | it('reads bit-packed values with implicit length', () => { 69 | // Bit-packed values: false, false, true 70 | const buffer = new ArrayBuffer(8) 71 | const view = new DataView(buffer) 72 | view.setInt32(0, 2, true) // length 2 little-endian 73 | view.setUint8(4, 0b00000011) // Bit-packed header for 1-8 values 74 | view.setUint8(5, 0b00000100) // Bit-packed values (false, false, true) 75 | const reader = { view, offset: 0 } 76 | 77 | const values = new Array(3) 78 | readRleBitPackedHybrid(reader, 1, values) 79 | expect(reader.offset).toBe(6) 80 | expect(values).toEqual([0, 0, 1]) 81 | }) 82 | 83 | it('reads multi-byte bit-packed values', () => { 84 | // Bit-packed 9x true 85 | const buffer = new ArrayBuffer(3) 86 | const view = new DataView(buffer) 87 | view.setUint8(0, 0b00000101) // Bit-packed header for 9-16 values 88 | view.setUint8(1, 0b11111111) 89 | view.setUint8(2, 0b00000001) 90 | const reader = { view, offset: 0 } 91 | 92 | const values = new Array(9) 93 | readRleBitPackedHybrid(reader, 1, values, 3) 94 | expect(reader.offset).toBe(3) 95 | expect(values).toEqual([1, 1, 1, 1, 1, 1, 1, 1, 1]) 96 | }) 97 | 98 | it('handles bitpack unsigned shifting', () => { 99 | // Bit-packed [131071, 0, ..., 0, 131071, 0, ...] 
100 | // Tests for issue #13 where leftmost bit is set to 1 and shifted 101 | const buffer = new ArrayBuffer(154) 102 | const view = new DataView(buffer) 103 | view.setUint8(0, 0b00010011) // Bit-packed header for 72 values 104 | view.setUint8(1, 0b11111111) 105 | view.setUint8(2, 0b11111111) 106 | view.setUint8(3, 0b00000001) 107 | view.setUint8(139, 0b11111110) 108 | view.setUint8(140, 0b11111111) 109 | view.setUint8(141, 0b0000011) 110 | const reader = { view, offset: 0 } 111 | 112 | const values = new Array(72) 113 | readRleBitPackedHybrid(reader, 17, values, 154) 114 | expect(reader.offset).toBe(154) 115 | expect(values).toEqual([ 116 | 131071, 0, 0, 0, 0, 0, 0, 0, 117 | 0, 0, 0, 0, 0, 0, 0, 0, 118 | 0, 0, 0, 0, 0, 0, 0, 0, 119 | 0, 0, 0, 0, 0, 0, 0, 0, 120 | 0, 0, 0, 0, 0, 0, 0, 0, 121 | 0, 0, 0, 0, 0, 0, 0, 0, 122 | 0, 0, 0, 0, 0, 0, 0, 0, 123 | 0, 0, 0, 0, 0, 0, 0, 0, 124 | 0, 131071, 0, 0, 0, 0, 0, 0, 125 | ]) 126 | }) 127 | 128 | it('throws for invalid bit-packed offset', () => { 129 | const buffer = new ArrayBuffer(1) 130 | const view = new DataView(buffer) 131 | view.setUint8(0, 0b00000011) // Bit-packed header for 3 values 132 | const reader = { view, offset: 0 } 133 | 134 | const values = new Array(3) 135 | expect(() => readRleBitPackedHybrid(reader, 1, values, 3)) 136 | .toThrow('parquet bitpack offset 1 out of range') 137 | }) 138 | }) 139 | 140 | describe('bitWidth', () => { 141 | it('calculates bit widths', () => { 142 | expect(bitWidth(0)).toBe(0) 143 | expect(bitWidth(1)).toBe(1) 144 | expect(bitWidth(7)).toBe(3) 145 | expect(bitWidth(8)).toBe(4) 146 | expect(bitWidth(255)).toBe(8) 147 | expect(bitWidth(256)).toBe(9) 148 | expect(bitWidth(1023)).toBe(10) 149 | expect(bitWidth(1048575)).toBe(20) 150 | }) 151 | }) 152 | -------------------------------------------------------------------------------- /src/filter.js: -------------------------------------------------------------------------------- 1 | import { equals } from './utils.js' 2 | 3 | /** 4 
/**
 * @import {ParquetQueryFilter, RowGroup} from '../src/types.js'
 */

/**
 * Returns an array of column names needed to evaluate the filter.
 *
 * Logical operators ($and, $or, $nor) are expanded recursively, and only the
 * first one present on a filter object is considered. For any other filter
 * object, its own keys are taken as column names. The result is not
 * de-duplicated: a column referenced by several sub-filters appears once per
 * reference.
 *
 * @param {ParquetQueryFilter} [filter]
 * @returns {string[]}
 */
export function columnsNeededForFilter(filter) {
  if (!filter) return []
  /** @type {string[]} */
  const columns = []
  if ('$and' in filter && Array.isArray(filter.$and)) {
    columns.push(...filter.$and.flatMap(columnsNeededForFilter))
  } else if ('$or' in filter && Array.isArray(filter.$or)) {
    columns.push(...filter.$or.flatMap(columnsNeededForFilter))
  } else if ('$nor' in filter && Array.isArray(filter.$nor)) {
    columns.push(...filter.$nor.flatMap(columnsNeededForFilter))
  } else {
    columns.push(...Object.keys(filter))
  }
  return columns
}

/**
 * Match a record against a query filter
 *
 * A filter is either a logical combination ($and / $or / $nor of sub-filters)
 * or a map of field name to condition, where every field must match.
 * A condition that is a primitive, null, or an array is an implicit $eq.
 * Otherwise every operator in the condition object must hold; unrecognized
 * operators are ignored (treated as satisfied).
 *
 * @param {Record<string, any>} record
 * @param {ParquetQueryFilter} filter
 * @param {boolean} [strict] passed through to `equals` for $eq / $ne and implicit equality
 * @returns {boolean}
 */
export function matchFilter(record, filter, strict = true) {
  if ('$and' in filter && Array.isArray(filter.$and)) {
    return filter.$and.every(subQuery => matchFilter(record, subQuery, strict))
  }
  if ('$or' in filter && Array.isArray(filter.$or)) {
    return filter.$or.some(subQuery => matchFilter(record, subQuery, strict))
  }
  if ('$nor' in filter && Array.isArray(filter.$nor)) {
    return !filter.$nor.some(subQuery => matchFilter(record, subQuery, strict))
  }

  return Object.entries(filter).every(([field, condition]) => {
    const value = record[field]

    // implicit $eq for non-object conditions
    if (typeof condition !== 'object' || condition === null || Array.isArray(condition)) {
      return equals(value, condition, strict)
    }

    return Object.entries(condition || {}).every(([operator, target]) => {
      if (operator === '$gt') return value > target
      if (operator === '$gte') return value >= target
      if (operator === '$lt') return value < target
      if (operator === '$lte') return value <= target
      if (operator === '$eq') return equals(value, target, strict)
      if (operator === '$ne') return !equals(value, target, strict)
      if (operator === '$in') return Array.isArray(target) && target.includes(value)
      if (operator === '$nin') return Array.isArray(target) && !target.includes(value)
      // $not negates the nested condition evaluated against the same field
      if (operator === '$not') return !matchFilter({ [field]: value }, { [field]: target }, strict)
      return true
    })
  })
}

/**
 * Check if a row group can be skipped based on filter and column statistics.
 *
 * The check is conservative: it returns true only when the min/max statistics
 * show that no row in the group can satisfy the filter. Columns that are not
 * in `physicalColumns`, or that have no usable statistics, never cause a skip.
 *
 * Min/max statistics describe non-null values only, so:
 * - a null/undefined target of $eq or $in is never treated as out of range
 * - $ne and $nin only allow skipping when `null_count` is present and zero
 *   (null rows would satisfy those operators in matchFilter)
 *
 * @param {object} options
 * @param {RowGroup} options.rowGroup
 * @param {string[]} options.physicalColumns column names, index-aligned with rowGroup.columns
 * @param {ParquetQueryFilter | undefined} options.filter
 * @param {boolean} [options.strict]
 * @returns {boolean} true if the row group can be skipped
 */
export function canSkipRowGroup({ rowGroup, physicalColumns, filter, strict = true }) {
  if (!filter) return false

  // Handle logical operators
  if ('$and' in filter && Array.isArray(filter.$and)) {
    // For AND, we can skip if ANY condition allows skipping
    return filter.$and.some(subFilter => canSkipRowGroup({ rowGroup, physicalColumns, filter: subFilter, strict }))
  }
  if ('$or' in filter && Array.isArray(filter.$or)) {
    // For OR, we can skip only if ALL conditions allow skipping
    return filter.$or.every(subFilter => canSkipRowGroup({ rowGroup, physicalColumns, filter: subFilter, strict }))
  }
  if ('$nor' in filter && Array.isArray(filter.$nor)) {
    // For NOR, skipping would require proving that some sub-filter matches
    // every row in the group. That is not attempted here, so never skip.
    return false
  }

  // Check column filters (multiple fields are an implicit AND)
  for (const [field, condition] of Object.entries(filter)) {
    // Find the column chunk for this field
    const columnIndex = physicalColumns.indexOf(field)
    if (columnIndex === -1) continue

    const stats = rowGroup.columns[columnIndex].meta_data?.statistics
    if (!stats) continue // No statistics available, can't skip

    const { min, max, min_value, max_value, null_count } = stats
    const minVal = min_value !== undefined ? min_value : min
    const maxVal = max_value !== undefined ? max_value : max

    if (minVal === undefined || maxVal === undefined) continue

    // Without a null count we must assume the column may hold nulls.
    // Mixed bigint/number comparison is fine here (e.g. 2n > 0).
    const mayContainNulls = null_count === undefined || null_count === null || null_count > 0

    /**
     * True when v is a non-null value that lies outside [minVal, maxVal].
     * Nullish targets are excluded because `null < 1` is true in JavaScript,
     * while null rows are not covered by min/max at all.
     *
     * @param {any} v
     * @returns {boolean}
     */
    const outsideRange = v => v !== null && v !== undefined && (v < minVal || v > maxVal)

    // Handle operators
    for (const [operator, target] of Object.entries(condition || {})) {
      if (operator === '$gt' && maxVal <= target) return true
      if (operator === '$gte' && maxVal < target) return true
      if (operator === '$lt' && minVal >= target) return true
      if (operator === '$lte' && minVal > target) return true
      if (operator === '$eq' && outsideRange(target)) return true
      if (operator === '$ne' && !mayContainNulls && equals(minVal, maxVal, strict) && equals(minVal, target, strict)) return true
      if (operator === '$in' && Array.isArray(target) && target.every(v => outsideRange(v))) return true
      if (operator === '$nin' && Array.isArray(target) && !mayContainNulls && equals(minVal, maxVal, strict) && target.includes(minVal)) return true
    }
  }

  return false
}