├── test
    ├── files
    │   ├── issue97.json
    │   ├── issue115decimal.json
    │   ├── alpha.parquet
    │   ├── signs.parquet
    │   ├── issue23.parquet
    │   ├── issue72.parquet
    │   ├── issue90.parquet
    │   ├── issue97.parquet
    │   ├── strings.parquet
    │   ├── boolean_rle.parquet
    │   ├── duckdb2557.parquet
    │   ├── duckdb3734.parquet
    │   ├── duckdb4442.parquet
    │   ├── duckdb5533.parquet
    │   ├── geoparquet.parquet
    │   ├── geospatial.parquet
    │   ├── incorrect_map_schema.json
    │   ├── rowgroups.parquet
    │   ├── hyparquet.jpg.snappy
    │   ├── rowend_struct.parquet
    │   ├── adam_genotypes.parquet
    │   ├── continued_page.parquet
    │   ├── decimal-column.parquet
    │   ├── delta_byte_array.parquet
    │   ├── issue115decimal.parquet
    │   ├── nullable.impala.parquet
    │   ├── offset_indexed.parquet
    │   ├── struct_strings.parquet
    │   ├── brotli_compressed.parquet
    │   ├── byte_array_decimal.parquet
    │   ├── datapage_v2.snappy.parquet
    │   ├── lz4_raw_compressed.parquet
    │   ├── nonnullable.impala.parquet
    │   ├── byte_stream_split_v2.parquet
    │   ├── delta_binary_packed.parquet
    │   ├── fixed_length_decimal.parquet
    │   ├── hadoop_lz4_compressed.parquet
    │   ├── incorrect_map_schema.parquet
    │   ├── nested_structs.rust.parquet
    │   ├── rle_boolean_encoding.parquet
    │   ├── byte_stream_split.zstd.parquet
    │   ├── delta_length_byte_array.parquet
    │   ├── repeated_no_annotation.parquet
    │   ├── byte_stream_split_v2.json
    │   ├── concatenated_gzip_members.parquet
    │   ├── duckdb_delta_binary_packed.parquet
    │   ├── float16_nonzeros_and_nans.parquet
    │   ├── struct_strings.json
    │   ├── byte_stream_split_extended.gzip.parquet
    │   ├── delta_encoding_optional_column.parquet
    │   ├── delta_encoding_required_column.parquet
    │   ├── duckdb_delta_length_byte_array.parquet
    │   ├── plain-dict-uncompressed-checksum.parquet
    │   ├── float16_nonzeros_and_nans.json
    │   ├── mostlyempty.parquet
    │   ├── issue72.json
    │   ├── strings.json
    │   ├── mostlyempty.json
    │   ├── datapage_v2.snappy.json
    │   ├── rowgroups.json
    │   ├── decimal-column.json
    │   ├── boolean_rle.json
    │   ├── README.md
    │   ├── brotli_compressed.json
    │   ├── duckdb4442.column_indexes.json
    │   ├── lz4_raw_compressed.json
    │   ├── hadoop_lz4_compressed.json
    │   ├── signs.json
    │   ├── duckdb4442.json
    │   ├── fixed_length_decimal.json
    │   ├── repeated_no_annotation.json
    │   ├── nonnullable.impala.json
    │   ├── byte_array_decimal.json
    │   ├── duckdb5533.json
    │   ├── incorrect_map_schema.offset_indexes.json
    │   ├── plain-dict-uncompressed-checksum.offset_indexes.json
    │   ├── incorrect_map_schema.column_indexes.json
    │   ├── plain-dict-uncompressed-checksum.column_indexes.json
    │   ├── adam_genotypes.json
    │   ├── duckdb5533.offset_indexes.json
    │   ├── rle_boolean_encoding.json
    │   ├── mostlyempty.metadata.json
    │   ├── byte_array_decimal.metadata.json
    │   ├── rle_boolean_encoding.metadata.json
    │   ├── duckdb_delta_binary_packed.metadata.json
    │   ├── duckdb_delta_length_byte_array.metadata.json
    │   ├── concatenated_gzip_members.metadata.json
    │   ├── nullable.impala.json
    │   ├── issue90.metadata.json
    │   ├── delta_length_byte_array.metadata.json
    │   ├── duckdb5533.column_indexes.json
    │   ├── issue72.metadata.json
    │   ├── boolean_rle.metadata.json
    │   ├── offset_indexed.offset_indexes.json
    │   ├── fixed_length_decimal.metadata.json
    │   ├── float16_nonzeros_and_nans.metadata.json
    │   ├── duckdb3734.json
    │   ├── duckdb2557.metadata.json
    │   ├── struct_strings.metadata.json
    │   ├── issue115decimal.metadata.json
    │   ├── strings.metadata.json
    │   ├── continued_page.metadata.json
    │   ├── repeated_no_annotation.metadata.json
    │   ├── byte_stream_split.zstd.metadata.json
    │   ├── duckdb4442.offset_indexes.json
    │   ├── delta_encoding_required_column.offset_indexes.json
    │   ├── concatenated_gzip_members.json
    │   ├── incorrect_map_schema.metadata.json
    │   ├── issue97.metadata.json
    │   ├── offset_indexed.metadata.json
    │   ├── byte_stream_split_v2.metadata.json
    │   ├── lz4_raw_compressed.metadata.json
    │   ├── plain-dict-uncompressed-checksum.metadata.json
    │   ├── rowgroups.metadata.json
    │   ├── hadoop_lz4_compressed.metadata.json
    │   ├── rowend_struct.metadata.json
    │   ├── datapage_v2.snappy.metadata.json
    │   ├── delta_encoding_required_column.column_indexes.json
    │   └── decimal-column.metadata.json
    ├── schemaTree.test.js
    ├── helpers.js
    ├── plan.test.js
    ├── package.test.js
    ├── readFiles.test.js
    ├── read.utf8.test.js
    ├── rowend_struct.test.js
    ├── indexes.test.js
    ├── snappy.test.js
    ├── asyncbuffer.test.js
    ├── metadata.test.js
    ├── column.test.js
    ├── thrift.test.js
    └── encoding.test.js
├── .gitattributes
├── hyparquet.jpg
├── hyperparam.png
├── .gitignore
├── tsconfig.build.json
├── tsconfig.json
├── .github
    └── workflows
    │   └── ci.yml
├── LICENSE
├── src
    ├── node.js
    ├── geoparquet.js
    ├── indexes.js
    ├── constants.js
    ├── index.js
    ├── delta.js
    ├── wkb.js
    ├── snappy.js
    ├── schema.js
    ├── encoding.js
    └── filter.js
├── benchmark.js
├── package.json
└── eslint.config.js


/test/files/issue97.json:
--------------------------------------------------------------------------------
1 | []


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.min.js -diff
2 | *.min.js.map -diff
3 | 


--------------------------------------------------------------------------------
/test/files/issue115decimal.json:
--------------------------------------------------------------------------------
1 | [
2 |   [-12345.67]
3 | ]
4 | 


--------------------------------------------------------------------------------
/hyparquet.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/hyparquet.jpg


--------------------------------------------------------------------------------
/hyperparam.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/hyperparam.png


--------------------------------------------------------------------------------
/test/files/alpha.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/alpha.parquet


--------------------------------------------------------------------------------
/test/files/signs.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/signs.parquet


--------------------------------------------------------------------------------
/test/files/issue23.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/issue23.parquet


--------------------------------------------------------------------------------
/test/files/issue72.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/issue72.parquet


--------------------------------------------------------------------------------
/test/files/issue90.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/issue90.parquet


--------------------------------------------------------------------------------
/test/files/issue97.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/issue97.parquet


--------------------------------------------------------------------------------
/test/files/strings.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/strings.parquet


--------------------------------------------------------------------------------
/test/files/boolean_rle.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/boolean_rle.parquet


--------------------------------------------------------------------------------
/test/files/duckdb2557.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/duckdb2557.parquet


--------------------------------------------------------------------------------
/test/files/duckdb3734.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/duckdb3734.parquet


--------------------------------------------------------------------------------
/test/files/duckdb4442.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/duckdb4442.parquet


--------------------------------------------------------------------------------
/test/files/duckdb5533.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/duckdb5533.parquet


--------------------------------------------------------------------------------
/test/files/geoparquet.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/geoparquet.parquet


--------------------------------------------------------------------------------
/test/files/geospatial.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/geospatial.parquet


--------------------------------------------------------------------------------
/test/files/incorrect_map_schema.json:
--------------------------------------------------------------------------------
1 | [
2 |   [
3 |     {"name": "report", "parent": "another"}
4 |   ]
5 | ]
6 | 


--------------------------------------------------------------------------------
/test/files/rowgroups.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/rowgroups.parquet


--------------------------------------------------------------------------------
/test/files/hyparquet.jpg.snappy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/hyparquet.jpg.snappy


--------------------------------------------------------------------------------
/test/files/rowend_struct.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/rowend_struct.parquet


--------------------------------------------------------------------------------
/test/files/adam_genotypes.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/adam_genotypes.parquet


--------------------------------------------------------------------------------
/test/files/continued_page.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/continued_page.parquet


--------------------------------------------------------------------------------
/test/files/decimal-column.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/decimal-column.parquet


--------------------------------------------------------------------------------
/test/files/delta_byte_array.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/delta_byte_array.parquet


--------------------------------------------------------------------------------
/test/files/issue115decimal.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/issue115decimal.parquet


--------------------------------------------------------------------------------
/test/files/nullable.impala.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/nullable.impala.parquet


--------------------------------------------------------------------------------
/test/files/offset_indexed.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/offset_indexed.parquet


--------------------------------------------------------------------------------
/test/files/struct_strings.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/struct_strings.parquet


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | package-lock.json
3 | coverage
4 | *.tgz
5 | .vscode
6 | .DS_Store
7 | /*.parquet
8 | /types
9 | 


--------------------------------------------------------------------------------
/test/files/brotli_compressed.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/brotli_compressed.parquet


--------------------------------------------------------------------------------
/test/files/byte_array_decimal.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/byte_array_decimal.parquet


--------------------------------------------------------------------------------
/test/files/datapage_v2.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/datapage_v2.snappy.parquet


--------------------------------------------------------------------------------
/test/files/lz4_raw_compressed.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/lz4_raw_compressed.parquet


--------------------------------------------------------------------------------
/test/files/nonnullable.impala.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/nonnullable.impala.parquet


--------------------------------------------------------------------------------
/test/files/byte_stream_split_v2.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/byte_stream_split_v2.parquet


--------------------------------------------------------------------------------
/test/files/delta_binary_packed.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/delta_binary_packed.parquet


--------------------------------------------------------------------------------
/test/files/fixed_length_decimal.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/fixed_length_decimal.parquet


--------------------------------------------------------------------------------
/test/files/hadoop_lz4_compressed.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/hadoop_lz4_compressed.parquet


--------------------------------------------------------------------------------
/test/files/incorrect_map_schema.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/incorrect_map_schema.parquet


--------------------------------------------------------------------------------
/test/files/nested_structs.rust.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/nested_structs.rust.parquet


--------------------------------------------------------------------------------
/test/files/rle_boolean_encoding.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/rle_boolean_encoding.parquet


--------------------------------------------------------------------------------
/test/files/byte_stream_split.zstd.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/byte_stream_split.zstd.parquet


--------------------------------------------------------------------------------
/test/files/delta_length_byte_array.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/delta_length_byte_array.parquet


--------------------------------------------------------------------------------
/test/files/repeated_no_annotation.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/repeated_no_annotation.parquet


--------------------------------------------------------------------------------
/test/files/byte_stream_split_v2.json:
--------------------------------------------------------------------------------
1 | [
2 |   [1.5, 10.1],
3 |   [2.5, 20.2],
4 |   [3.5, 30.3],
5 |   [4.5, 40.4],
6 |   [5.5, 50.5]
7 | ]
8 | 


--------------------------------------------------------------------------------
/test/files/concatenated_gzip_members.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/concatenated_gzip_members.parquet


--------------------------------------------------------------------------------
/test/files/duckdb_delta_binary_packed.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/duckdb_delta_binary_packed.parquet


--------------------------------------------------------------------------------
/test/files/float16_nonzeros_and_nans.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/float16_nonzeros_and_nans.parquet


--------------------------------------------------------------------------------
/test/files/struct_strings.json:
--------------------------------------------------------------------------------
1 | [
2 |   [{ "f64_field": null, "str_field": "hello" }],
3 |   [{ "f64_field": 1.23, "str_field": null }]
4 | ]
5 | 


--------------------------------------------------------------------------------
/test/files/byte_stream_split_extended.gzip.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/byte_stream_split_extended.gzip.parquet


--------------------------------------------------------------------------------
/test/files/delta_encoding_optional_column.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/delta_encoding_optional_column.parquet


--------------------------------------------------------------------------------
/test/files/delta_encoding_required_column.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/delta_encoding_required_column.parquet


--------------------------------------------------------------------------------
/test/files/duckdb_delta_length_byte_array.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/duckdb_delta_length_byte_array.parquet


--------------------------------------------------------------------------------
/test/files/plain-dict-uncompressed-checksum.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hyparam/hyparquet/HEAD/test/files/plain-dict-uncompressed-checksum.parquet


--------------------------------------------------------------------------------
/test/files/float16_nonzeros_and_nans.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   [null],
 3 |   [1],
 4 |   [-2],
 5 |   [null],
 6 |   [0],
 7 |   [-1],
 8 |   [0],
 9 |   [2]
10 | ]
11 | 


--------------------------------------------------------------------------------
/test/files/mostlyempty.parquet:
--------------------------------------------------------------------------------
1 | PAR1 L     
2 | \      ,Hroot %empty &emptyPP&$&6   P (	hyparquet R   PAR1


--------------------------------------------------------------------------------
/test/files/issue72.json:
--------------------------------------------------------------------------------
1 | [
2 |   ["258d7fff-6418-499f-af07-c6611937d7d8"],
3 |   ["086f2968-327b-48a8-8cdf-64f46bcd8173"],
4 |   ["258d7fff-6418-499f-af07-c6611937d7d8"]
5 | ]
6 | 


--------------------------------------------------------------------------------
/test/files/strings.json:
--------------------------------------------------------------------------------
1 | [
2 |   ["alpha", "alpha", "alpha"],
3 |   ["bravo", "bravo", "bravo"],
4 |   ["charlie", "charlie", "charlie"],
5 |   ["delta", "delta", "delta"]
6 | ]
7 | 


--------------------------------------------------------------------------------
/test/files/mostlyempty.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   [null],
 3 |   [null],
 4 |   [null],
 5 |   [null],
 6 |   [null],
 7 |   [null],
 8 |   [null],
 9 |   [null],
10 |   [null],
11 |   [null]
12 | ]
13 | 


--------------------------------------------------------------------------------
/test/files/datapage_v2.snappy.json:
--------------------------------------------------------------------------------
1 | [
2 |   ["abc", 1, 2, true, [1, 2, 3]],
3 |   ["abc", 2, 3, true, null],
4 |   ["abc", 3, 4, true, null],
5 |   [null, 4, 5, false, [1, 2, 3]],
6 |   ["abc", 5, 2, true, [1, 2]]
7 | ]
8 | 


--------------------------------------------------------------------------------
/test/files/rowgroups.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   [ 1 ],
 3 |   [ 2 ],
 4 |   [ 3 ],
 5 |   [ 4 ],
 6 |   [ 5 ],
 7 |   [ 6 ],
 8 |   [ 7 ],
 9 |   [ 8 ],
10 |   [ 9 ],
11 |   [ 10 ],
12 |   [ 11 ],
13 |   [ 12 ],
14 |   [ 13 ],
15 |   [ 14 ],
16 |   [ 15 ]
17 | ]
18 | 


--------------------------------------------------------------------------------
/test/files/decimal-column.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   [
 3 |     40,
 4 |     2015
 5 |   ],
 6 |   [
 7 |     74,
 8 |     2015
 9 |   ],
10 |   [
11 |     140,
12 |     2015
13 |   ],
14 |   [
15 |     152,
16 |     2015
17 |   ],
18 |   [
19 |     190,
20 |     2015
21 |   ]
22 | ]
23 | 


--------------------------------------------------------------------------------
/tsconfig.build.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "extends": "./tsconfig.json",
 3 |   "compilerOptions": {
 4 |     "noEmit": false,
 5 |     "declaration": true,
 6 |     "emitDeclarationOnly": true,
 7 |     "outDir": "types",
 8 |     "declarationMap": true
 9 |   },
10 |   "include": ["src"]
11 | }
12 | 


--------------------------------------------------------------------------------
/test/files/boolean_rle.json:
--------------------------------------------------------------------------------
 1 | [
 2 |     [true],
 3 |     [true],
 4 |     [true],
 5 |     [true],
 6 |     [true],
 7 |     [null],
 8 |     [null],
 9 |     [null],
10 |     [null],
11 |     [null],
12 |     [false],
13 |     [false],
14 |     [false],
15 |     [false],
16 |     [false]
17 | ]
18 | 


--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "compilerOptions": {
 3 |     "allowJs": true,
 4 |     "checkJs": true,
 5 |     "lib": ["esnext", "dom"],
 6 |     "module": "nodenext",
 7 |     "noEmit": true,
 8 |     "resolveJsonModule": true,
 9 |     "strict": true
10 |   },
11 |   "include": ["src", "test"]
12 | }
13 | 


--------------------------------------------------------------------------------
/test/files/README.md:
--------------------------------------------------------------------------------
1 | # Test Files License
2 | 
3 | This directory contains binary test files from [apache/parquet-testing](https://github.com/apache/parquet-testing), under the [Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0).
4 | 
5 | Copyright 2004 The Apache Software Foundation (http://www.apache.org/).
6 | 


--------------------------------------------------------------------------------
/test/files/brotli_compressed.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   [
 3 |     1593604800,
 4 |     "abc",
 5 |     42
 6 |   ],
 7 |   [
 8 |     1593604800,
 9 |     "def",
10 |     7.7
11 |   ],
12 |   [
13 |     1593604801,
14 |     "abc",
15 |     42.125
16 |   ],
17 |   [
18 |     1593604801,
19 |     "def",
20 |     7.7
21 |   ]
22 | ]
23 | 


--------------------------------------------------------------------------------
/test/files/duckdb4442.column_indexes.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   [
 3 |     null,
 4 |     null,
 5 |     null,
 6 |     null,
 7 |     null,
 8 |     null,
 9 |     null,
10 |     null,
11 |     null,
12 |     null,
13 |     null,
14 |     null,
15 |     null,
16 |     null,
17 |     null,
18 |     null,
19 |     null
20 |   ]
21 | ]
22 | 


--------------------------------------------------------------------------------
/test/files/lz4_raw_compressed.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   [
 3 |     1593604800,
 4 |     "abc",
 5 |     42
 6 |   ],
 7 |   [
 8 |     1593604800,
 9 |     "def",
10 |     7.7
11 |   ],
12 |   [
13 |     1593604801,
14 |     "abc",
15 |     42.125
16 |   ],
17 |   [
18 |     1593604801,
19 |     "def",
20 |     7.7
21 |   ]
22 | ]
23 | 


--------------------------------------------------------------------------------
/test/files/hadoop_lz4_compressed.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   [
 3 |     1593604800,
 4 |     "abc",
 5 |     42
 6 |   ],
 7 |   [
 8 |     1593604800,
 9 |     "def",
10 |     7.7
11 |   ],
12 |   [
13 |     1593604801,
14 |     "abc",
15 |     42.125
16 |   ],
17 |   [
18 |     1593604801,
19 |     "def",
20 |     7.7
21 |   ]
22 | ]
23 | 


--------------------------------------------------------------------------------
/test/files/signs.json:
--------------------------------------------------------------------------------
1 | [
2 |   [0, 0, 0, 0, -128, -32768, -2147483648, -9223372036854775808],
3 |   [127, 32767, 2147483647, 9223372036854775807, -1, -1, -1, -1],
4 |   [128, 32768, 2147483648, 9223372036854775808, 0, 0, 0, 0],
5 |   [255, 65535, 4294967295, 18446744073709551615, 127, 32767, 2147483647, 9223372036854775807]
6 | ]
7 | 


--------------------------------------------------------------------------------
/test/files/duckdb4442.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   [
 3 |     12,
 4 |     5184,
 5 |     1,
 6 |     22,
 7 |     "2011-10-06T22:21:49.580Z",
 8 |     "outbound",
 9 |     323020033,
10 |     "{}",
11 |     2100,
12 |     33,
13 |     0,
14 |     7,
15 |     10,
16 |     0,
17 |     1317427200000,
18 |     1317939709580,
19 |     11
20 |   ]
21 | ]
22 | 


--------------------------------------------------------------------------------
/test/files/fixed_length_decimal.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   [1],
 3 |   [2],
 4 |   [3],
 5 |   [4],
 6 |   [5],
 7 |   [6],
 8 |   [7],
 9 |   [8],
10 |   [9],
11 |   [10],
12 |   [11],
13 |   [12],
14 |   [13],
15 |   [14],
16 |   [15],
17 |   [16],
18 |   [17],
19 |   [18],
20 |   [19],
21 |   [20],
22 |   [21],
23 |   [22],
24 |   [23],
25 |   [24]
26 | ]
27 | 


--------------------------------------------------------------------------------
/test/files/repeated_no_annotation.json:
--------------------------------------------------------------------------------
1 | [
2 |   [1, null],
3 |   [2, null],
4 |   [3, {"phone": []}],
5 |   [4, {"phone": [{"number":5555555555,"kind":null}]}],
6 |   [5, {"phone": [{"number":1111111111,"kind":"home"}]}],
7 |   [6, {"phone": [{"number":1111111111,"kind":"home"},{"number":2222222222,"kind":null},{"number":3333333333,"kind":"mobile"}]}]
8 | ]
9 | 


--------------------------------------------------------------------------------
/test/files/nonnullable.impala.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   [
 3 |     8,
 4 |     [-1],
 5 |     [[-1, -2], []],
 6 |     { "k1": -1 },
 7 |     [{}, { "k1": 1 }, {}, {}],
 8 |     {
 9 |       "a": -1,
10 |       "B": [-1],
11 |       "c": {
12 |         "D": [[{
13 |           "e": -1,
14 |           "f": "nonnullable"
15 |         }]]
16 |       },
17 |       "G": {}
18 |     }
19 |   ]
20 | ]
21 | 


--------------------------------------------------------------------------------
/test/files/byte_array_decimal.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   [ 1 ],
 3 |   [ 2 ],
 4 |   [ 3 ],
 5 |   [ 4 ],
 6 |   [ 5 ],
 7 |   [ 6 ],
 8 |   [ 7 ],
 9 |   [ 8 ],
10 |   [ 9 ],
11 |   [ 10 ],
12 |   [ 11 ],
13 |   [ 12 ],
14 |   [ 13 ],
15 |   [ 14 ],
16 |   [ 15 ],
17 |   [ 16 ],
18 |   [ 17 ],
19 |   [ 18 ],
20 |   [ 19 ],
21 |   [ 20 ],
22 |   [ 21 ],
23 |   [ 22 ],
24 |   [ 23 ],
25 |   [ 24 ]
26 | ]
27 | 


--------------------------------------------------------------------------------
/test/files/duckdb5533.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   [
 3 |     "2022-11-27T17:42:43.514Z",
 4 |     2448,
 5 |     null,
 6 |     1,
 7 |     343
 8 |   ],
 9 |   [
10 |     "2022-11-27T17:42:43.514Z",
11 |     85016,
12 |     null,
13 |     -1,
14 |     343
15 |   ],
16 |   [
17 |     "2022-11-27T17:42:44.280Z",
18 |     1184,
19 |     null,
20 |     1,
21 |     343
22 |   ],
23 |   [
24 |     "2022-11-27T17:42:44.280Z",
25 |     85016,
26 |     null,
27 |     -1,
28 |     343
29 |   ]
30 | ]
31 | 


--------------------------------------------------------------------------------
/test/files/incorrect_map_schema.offset_indexes.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   [
 3 |     {
 4 |       "page_locations": [
 5 |         {
 6 |           "offset": 4,
 7 |           "compressed_page_size": 69,
 8 |           "first_row_index": 0
 9 |         }
10 |       ]
11 |     },
12 |     {
13 |       "page_locations": [
14 |         {
15 |           "offset": 73,
16 |           "compressed_page_size": 72,
17 |           "first_row_index": 0
18 |         }
19 |       ]
20 |     }
21 |   ]
22 | ]


--------------------------------------------------------------------------------
/test/files/plain-dict-uncompressed-checksum.offset_indexes.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   [
 3 |     {
 4 |       "page_locations": [
 5 |         {
 6 |           "offset": 31,
 7 |           "compressed_page_size": 27,
 8 |           "first_row_index": 0
 9 |         }
10 |       ]
11 |     },
12 |     {
13 |       "page_locations": [
14 |         {
15 |           "offset": 117,
16 |           "compressed_page_size": 27,
17 |           "first_row_index": 0
18 |         }
19 |       ]
20 |     }
21 |   ]
22 | ]


--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
 1 | name: CI
 2 | on:
 3 |   pull_request:
 4 |   push:
 5 | 
 6 | jobs:
 7 |   lint:
 8 |     runs-on: ubuntu-latest
 9 |     timeout-minutes: 5
10 |     steps:
11 |       - uses: actions/checkout@v6
12 |       - run: npm i
13 |       - run: npm run lint
14 | 
15 |   typecheck:
16 |     runs-on: ubuntu-latest
17 |     timeout-minutes: 5
18 |     steps:
19 |       - uses: actions/checkout@v6
20 |       - run: npm i
21 |       - run: npx tsc
22 | 
23 |   test:
24 |     runs-on: ubuntu-latest
25 |     timeout-minutes: 5
26 |     steps:
27 |       - uses: actions/checkout@v6
28 |       - run: npm i
29 |       - run: npm run coverage
30 | 


--------------------------------------------------------------------------------
/test/files/incorrect_map_schema.column_indexes.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   [
 3 |     {
 4 |       "null_pages": [
 5 |         false
 6 |       ],
 7 |       "min_values": [
 8 |         "name"
 9 |       ],
10 |       "max_values": [
11 |         "parent"
12 |       ],
13 |       "boundary_order": "ASCENDING",
14 |       "null_counts": [
15 |         0
16 |       ]
17 |     },
18 |     {
19 |       "null_pages": [
20 |         false
21 |       ],
22 |       "min_values": [
23 |         "another"
24 |       ],
25 |       "max_values": [
26 |         "report"
27 |       ],
28 |       "boundary_order": "ASCENDING",
29 |       "null_counts": [
30 |         0
31 |       ]
32 |     }
33 |   ]
34 | ]


--------------------------------------------------------------------------------
/test/files/plain-dict-uncompressed-checksum.column_indexes.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   [
 3 |     {
 4 |       "null_pages": [
 5 |         false
 6 |       ],
 7 |       "min_values": [
 8 |         0
 9 |       ],
10 |       "max_values": [
11 |         0
12 |       ],
13 |       "boundary_order": "ASCENDING",
14 |       "null_counts": [
15 |         0
16 |       ]
17 |     },
18 |     {
19 |       "null_pages": [
20 |         false
21 |       ],
22 |       "min_values": [
23 |         "a655fd0e-9949-4059-bcae-fd6a002a4652"
24 |       ],
25 |       "max_values": [
26 |         "a655fd0e-9949-4059-bcae-fd6a002a4652"
27 |       ],
28 |       "boundary_order": "ASCENDING",
29 |       "null_counts": [
30 |         0
31 |       ]
32 |     }
33 |   ]
34 | ]


--------------------------------------------------------------------------------
/test/files/adam_genotypes.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   [
 3 |     {
 4 |       "alternateAllele": null,
 5 |       "end": null,
 6 |       "filtersApplied": null,
 7 |       "filtersFailed": null,
 8 |       "filtersPassed": null,
 9 |       "names": ["name"],
10 |       "quality": null,
11 |       "referenceAllele": null,
12 |       "referenceName": null,
13 |       "splitFromMultiAllelic": false,
14 |       "start": null
15 |     },
16 |     null,
17 |     null,
18 |     null,
19 |     null,
20 |     null,
21 |     null,
22 |     null,
23 |     null,
24 |     null,
25 |     null,
26 |     null,
27 |     null,
28 |     null,
29 |     null,
30 |     null,
31 |     null,
32 |     null,
33 |     false,
34 |     false,
35 |     null,
36 |     null
37 |   ]
38 | ]
39 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 | 
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 | 
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8 | 


--------------------------------------------------------------------------------
/test/files/duckdb5533.offset_indexes.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   [
 3 |     {
 4 |       "page_locations": [
 5 |         {
 6 |           "offset": 4,
 7 |           "compressed_page_size": 73,
 8 |           "first_row_index": 0
 9 |         }
10 |       ]
11 |     },
12 |     {
13 |       "page_locations": [
14 |         {
15 |           "offset": 132,
16 |           "compressed_page_size": 65,
17 |           "first_row_index": 0
18 |         }
19 |       ]
20 |     },
21 |     {
22 |       "page_locations": [
23 |         {
24 |           "offset": 242,
25 |           "compressed_page_size": 37,
26 |           "first_row_index": 0
27 |         }
28 |       ]
29 |     },
30 |     {
31 |       "page_locations": [
32 |         {
33 |           "offset": 315,
34 |           "compressed_page_size": 68,
35 |           "first_row_index": 0
36 |         }
37 |       ]
38 |     },
39 |     {
40 |       "page_locations": [
41 |         {
42 |           "offset": 435,
43 |           "compressed_page_size": 57,
44 |           "first_row_index": 0
45 |         }
46 |       ]
47 |     }
48 |   ]
49 | ]


--------------------------------------------------------------------------------
/test/files/rle_boolean_encoding.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   [true],
 3 |   [false],
 4 |   [null],
 5 |   [true],
 6 |   [true],
 7 |   [false],
 8 |   [false],
 9 |   [true],
10 |   [true],
11 |   [true],
12 |   [false],
13 |   [false],
14 |   [true],
15 |   [true],
16 |   [false],
17 |   [null],
18 |   [true],
19 |   [true],
20 |   [false],
21 |   [false],
22 |   [true],
23 |   [true],
24 |   [false],
25 |   [null],
26 |   [true],
27 |   [true],
28 |   [false],
29 |   [false],
30 |   [true],
31 |   [true],
32 |   [true],
33 |   [false],
34 |   [false],
35 |   [false],
36 |   [false],
37 |   [true],
38 |   [true],
39 |   [false],
40 |   [null],
41 |   [true],
42 |   [true],
43 |   [false],
44 |   [false],
45 |   [true],
46 |   [true],
47 |   [true],
48 |   [false],
49 |   [false],
50 |   [null],
51 |   [true],
52 |   [true],
53 |   [false],
54 |   [false],
55 |   [true],
56 |   [true],
57 |   [true],
58 |   [false],
59 |   [true],
60 |   [true],
61 |   [false],
62 |   [null],
63 |   [true],
64 |   [true],
65 |   [false],
66 |   [false],
67 |   [true],
68 |   [true],
69 |   [true]
70 | ]
71 | 


--------------------------------------------------------------------------------
/src/node.js:
--------------------------------------------------------------------------------
 1 | import { createReadStream, promises as fs } from 'fs'
 2 | 
 3 | export * from './index.js'
 4 | 
 5 | /**
 6 |  * @import {AsyncBuffer} from '../src/types.js'
 7 |  */
 8 | /**
 9 |  * Construct an AsyncBuffer for a local file using node fs package.
10 |  *
11 |  * @param {string} filename
12 |  * @returns {Promise<AsyncBuffer>}
13 |  */
14 | export async function asyncBufferFromFile(filename) {
15 |   const { size } = await fs.stat(filename)
16 |   return {
17 |     byteLength: size,
18 |     slice(start, end) {
19 |       // read file slice
20 |       const reader = createReadStream(filename, { start, end })
21 |       return new Promise((resolve, reject) => {
22 |         /** @type {any[]} */
23 |         const chunks = []
24 |         reader.on('data', chunk => chunks.push(chunk))
25 |         reader.on('error', reject)
26 |         reader.on('end', () => {
27 |           const buffer = Buffer.concat(chunks)
28 |           resolve(buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength))
29 |         })
30 |       })
31 |     },
32 |   }
33 | }
34 | 


--------------------------------------------------------------------------------
/test/schemaTree.test.js:
--------------------------------------------------------------------------------
 1 | import { describe, expect, it } from 'vitest'
 2 | import { parquetMetadataAsync, parquetSchema } from '../src/index.js'
 3 | import { asyncBufferFromFile } from '../src/node.js'
 4 | 
 5 | describe('parquetSchema', () => {
 6 |   it('parse schema tree from rowgroups.parquet', async () => {
 7 |     const arrayBuffer = await asyncBufferFromFile('test/files/rowgroups.parquet')
 8 |     const metadata = await parquetMetadataAsync(arrayBuffer)
 9 |     const schemaTree = parquetSchema(metadata)
10 |     expect(schemaTree).toEqual(rowgroupsSchema)
11 |   })
12 | })
13 | 
14 | // Parquet v2 from pandas with 2 row groups
15 | const rowgroupsSchema = {
16 |   children: [
17 |     {
18 |       children: [],
19 |       count: 1,
20 |       element: {
21 |         name: 'numbers',
22 |         repetition_type: 'OPTIONAL',
23 |         type: 'INT64',
24 |       },
25 |       path: ['numbers'],
26 |     },
27 |   ],
28 |   count: 2,
29 |   element: {
30 |     name: 'schema',
31 |     num_children: 1,
32 |     repetition_type: 'REQUIRED',
33 |   },
34 |   path: [],
35 | }
36 | 


--------------------------------------------------------------------------------
/test/files/mostlyempty.metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 2,
 3 |   "schema": [
 4 |     {
 5 |       "name": "root",
 6 |       "num_children": 1
 7 |     },
 8 |     {
 9 |       "type": "BYTE_ARRAY",
10 |       "repetition_type": "OPTIONAL",
11 |       "name": "empty"
12 |     }
13 |   ],
14 |   "num_rows": 10,
15 |   "row_groups": [
16 |     {
17 |       "columns": [
18 |         {
19 |           "file_offset": 4,
20 |           "meta_data": {
21 |             "type": "BYTE_ARRAY",
22 |             "encodings": ["RLE_DICTIONARY"],
23 |             "path_in_schema": ["empty"],
24 |             "codec": "SNAPPY",
25 |             "num_values": 10,
26 |             "total_uncompressed_size": 40,
27 |             "total_compressed_size": 40,
28 |             "data_page_offset": 18,
29 |             "dictionary_page_offset": 4,
30 |             "statistics": {
31 |               "null_count": 10
32 |             }
33 |           }
34 |         }
35 |       ],
36 |       "total_byte_size": 40,
37 |       "num_rows": 10
38 |     }
39 |   ],
40 |   "created_by": "hyparquet",
41 |   "metadata_length": 82
42 | }
43 | 


--------------------------------------------------------------------------------
/test/files/byte_array_decimal.metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 1,
 3 |   "created_by": "HVR 5.3.0/9 (linux_glibc2.5-x64-64bit)",
 4 |   "metadata_length": 119,
 5 |   "num_rows": 24,
 6 |   "row_groups": [
 7 |     {
 8 |       "columns": [
 9 |         {
10 |           "file_offset": 4,
11 |           "meta_data": {
12 |             "codec": "UNCOMPRESSED",
13 |             "data_page_offset": 4,
14 |             "encodings": [],
15 |             "num_values": 24,
16 |             "path_in_schema": [ "value" ],
17 |             "total_compressed_size": 168,
18 |             "total_uncompressed_size": 168,
19 |             "type": "BYTE_ARRAY"
20 |           }
21 |         }
22 |       ],
23 |       "num_rows": 24,
24 |       "total_byte_size": 168
25 |     }
26 |   ],
27 |   "schema": [
28 |     {
29 |       "name": "schema",
30 |       "num_children": 1,
31 |       "repetition_type": "REQUIRED"
32 |     },
33 |     {
34 |       "converted_type": "DECIMAL",
35 |       "field_id": 6,
36 |       "name": "value",
37 |       "precision": 4,
38 |       "repetition_type": "OPTIONAL",
39 |       "scale": 2,
40 |       "type": "BYTE_ARRAY"
41 |     }
42 |   ]
43 | }
44 | 


--------------------------------------------------------------------------------
/test/helpers.js:
--------------------------------------------------------------------------------
 1 | import fs from 'fs'
 2 | 
 3 | /**
 4 |  * Read file and parse as JSON
 5 |  *
 6 |  * @param {string} filePath
 7 |  * @returns {any}
 8 |  */
 9 | export function fileToJson(filePath) {
10 |   const buffer = fs.readFileSync(filePath)
11 |   return JSON.parse(buffer.toString())
12 | }
13 | 
/**
 * Wrap a list of byte values in a DataReader positioned at the first byte.
 *
 * @import {DataReader} from '../src/types.d.ts'
 * @param {number[]} bytes byte values to expose through the reader's view
 * @returns {DataReader} reader with a DataView over a copy of the bytes and offset 0
 */
export function reader(bytes) {
  const { buffer } = new Uint8Array(bytes)
  const view = new DataView(buffer)
  return { view, offset: 0 }
}
24 | 
/**
 * Decorate an AsyncBuffer with counters for how often, and how much, it is read.
 *
 * Every `slice` call on the returned object increments `fetches`, adds the
 * requested range length to `bytes` (an omitted `end` counts up to the wrapped
 * buffer's byteLength), and then delegates to the wrapped buffer.
 *
 * @import {AsyncBuffer} from '../src/types.js'
 * @param {AsyncBuffer} asyncBuffer buffer to wrap
 * @returns {AsyncBuffer & {fetches: number, bytes: number}}
 */
export function countingBuffer(asyncBuffer) {
  const counted = {
    ...asyncBuffer,
    fetches: 0,
    bytes: 0,
    slice(start, end) {
      this.fetches += 1
      const stop = end ?? asyncBuffer.byteLength
      this.bytes += stop - start
      return asyncBuffer.slice(start, end)
    },
  }
  return counted
}
44 | 


--------------------------------------------------------------------------------
/test/files/rle_boolean_encoding.metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 1,
 3 |   "schema": [
 4 |     {
 5 |       "name": "table",
 6 |       "num_children": 1
 7 |     },
 8 |     {
 9 |       "type": "BOOLEAN",
10 |       "repetition_type": "OPTIONAL",
11 |       "name": "datatype_boolean",
12 |       "field_id": 1
13 |     }
14 |   ],
15 |   "num_rows": 68,
16 |   "row_groups": [
17 |     {
18 |       "columns": [
19 |         {
20 |           "file_offset": 0,
21 |           "meta_data": {
22 |             "type": "BOOLEAN",
23 |             "encodings": [
24 |               "RLE"
25 |             ],
26 |             "path_in_schema": [
27 |               "datatype_boolean"
28 |             ],
29 |             "codec": "GZIP",
30 |             "num_values": 68,
31 |             "total_uncompressed_size": 49,
32 |             "total_compressed_size": 69,
33 |             "data_page_offset": 4,
34 |             "statistics": {
35 |               "max": true,
36 |               "min": false,
37 |               "null_count": 6,
38 |               "max_value": true,
39 |               "min_value": false
40 |             }
41 |           }
42 |         }
43 |       ],
44 |       "total_byte_size": 69,
45 |       "num_rows": 68
46 |     }
47 |   ],
48 |   "metadata_length": 111
49 | }
50 | 


--------------------------------------------------------------------------------
/benchmark.js:
--------------------------------------------------------------------------------
 1 | import { createWriteStream, promises as fs } from 'fs'
 2 | import { compressors } from 'hyparquet-compressors'
 3 | import { pipeline } from 'stream/promises'
 4 | import { parquetReadObjects } from './src/index.js'
 5 | import { asyncBufferFromFile } from './src/node.js'
 6 | 
 7 | const url = 'https://huggingface.co/datasets/wikimedia/wikipedia/resolve/main/20231101.en/train-00000-of-00041.parquet'
 8 | const filename = 'example.parquet'
 9 | 
10 | // download test parquet file if needed
11 | let stat = await fs.stat(filename).catch(() => undefined)
12 | if (!stat) {
13 |   console.log('downloading ' + url)
14 |   const res = await fetch(url)
15 |   if (!res.ok) throw new Error(res.statusText)
16 |   // write to file async
17 |   await pipeline(res.body, createWriteStream(filename))
18 |   stat = await fs.stat(filename)
19 |   console.log('downloaded example.parquet', stat.size)
20 | }
21 | 
22 | // asyncBuffer
23 | const file = await asyncBufferFromFile(filename)
24 | const startTime = performance.now()
25 | console.log('parsing example.parquet data...')
26 | 
27 | // read parquet file
28 | await parquetReadObjects({
29 |   file,
30 |   compressors,
31 | })
32 | const ms = performance.now() - startTime
33 | console.log(`parsed ${stat.size.toLocaleString()} bytes in ${ms.toFixed(0)} ms`)
34 | 


--------------------------------------------------------------------------------
/test/plan.test.js:
--------------------------------------------------------------------------------
 1 | import { describe, expect, it } from 'vitest'
 2 | import { parquetMetadataAsync } from '../src/index.js'
 3 | import { asyncBufferFromFile } from '../src/node.js'
 4 | import { parquetPlan } from '../src/plan.js'
 5 | 
 6 | describe('parquetPlan', () => {
 7 |   it('generates a query plan', async () => {
 8 |     const file = await asyncBufferFromFile('test/files/offset_indexed.parquet')
 9 |     const metadata = await parquetMetadataAsync(file)
10 |     const plan = parquetPlan({ file, metadata })
11 |     expect(plan).toMatchObject({
12 |       metadata,
13 |       rowStart: 0,
14 |       rowEnd: 200,
15 |       fetches: [
16 |         { startByte: 4, endByte: 14772 },
17 |         { startByte: 14772, endByte: 29507 },
18 |       ],
19 |       groups: [
20 |         {
21 |           groupRows: 100,
22 |           groupStart: 0,
23 |           chunks: [
24 |             { range: { startByte: 4, endByte: 438 } },
25 |             { range: { startByte: 438, endByte: 14772 } },
26 |           ],
27 |         },
28 |         {
29 |           groupRows: 100,
30 |           groupStart: 100,
31 |           chunks: [
32 |             { range: { startByte: 14772, endByte: 15208 } },
33 |             { range: { startByte: 15208, endByte: 29507 } },
34 |           ],
35 |         },
36 |       ],
37 |     })
38 |   })
39 | })
40 | 


--------------------------------------------------------------------------------
/test/files/duckdb_delta_binary_packed.metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 1,
 3 |   "schema": [
 4 |     {
 5 |       "repetition_type": "REQUIRED",
 6 |       "name": "duckdb_schema",
 7 |       "num_children": 1
 8 |     },
 9 |     {
10 |       "type": "INT64",
11 |       "repetition_type": "OPTIONAL",
12 |       "name": "range",
13 |       "converted_type": "INT_64"
14 |     }
15 |   ],
16 |   "num_rows": 1250,
17 |   "row_groups": [
18 |     {
19 |       "columns": [
20 |         {
21 |           "file_offset": 0,
22 |           "meta_data": {
23 |             "type": "INT64",
24 |             "encodings": ["DELTA_BINARY_PACKED"],
25 |             "path_in_schema": ["range"],
26 |             "codec": "SNAPPY",
27 |             "num_values": 1250,
28 |             "total_uncompressed_size": 40,
29 |             "total_compressed_size": 42,
30 |             "data_page_offset": 4,
31 |             "statistics": {
32 |               "max": 1249,
33 |               "min": 0,
34 |               "null_count": 0,
35 |               "max_value": 1249,
36 |               "min_value": 0
37 |             }
38 |           }
39 |         }
40 |       ],
41 |       "total_byte_size": 40,
42 |       "num_rows": 1250,
43 |       "file_offset": 4
44 |     }
45 |   ],
46 |   "created_by": "DuckDB version v1.2.1 (build 8e52ec4395)",
47 |   "metadata_length": 169
48 | }
49 | 


--------------------------------------------------------------------------------
/test/files/duckdb_delta_length_byte_array.metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 1,
 3 |   "schema": [
 4 |     {
 5 |       "repetition_type": "REQUIRED",
 6 |       "name": "duckdb_schema",
 7 |       "num_children": 1
 8 |     },
 9 |     {
10 |       "type": "BYTE_ARRAY",
11 |       "repetition_type": "OPTIONAL",
12 |       "name": "range_varchar",
13 |       "converted_type": "UTF8"
14 |     }
15 |   ],
16 |   "num_rows": 1250,
17 |   "row_groups": [
18 |     {
19 |       "columns": [
20 |         {
21 |           "file_offset": 0,
22 |           "meta_data": {
23 |             "type": "BYTE_ARRAY",
24 |             "encodings": ["DELTA_LENGTH_BYTE_ARRAY"],
25 |             "path_in_schema": ["range_varchar"],
26 |             "codec": "SNAPPY",
27 |             "num_values": 1250,
28 |             "total_uncompressed_size": 3996,
29 |             "total_compressed_size": 3390,
30 |             "data_page_offset": 4,
31 |             "statistics": {
32 |               "max": "999",
33 |               "min": "0",
34 |               "null_count": 0,
35 |               "max_value": "999",
36 |               "min_value": "0"
37 |             }
38 |           }
39 |         }
40 |       ],
41 |       "total_byte_size": 3996,
42 |       "num_rows": 1250,
43 |       "file_offset": 4
44 |     }
45 |   ],
46 |   "created_by": "DuckDB version v1.2.1 (build 8e52ec4395)",
47 |   "metadata_length": 164
48 | }
49 | 


--------------------------------------------------------------------------------
/test/files/concatenated_gzip_members.metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 2,
 3 |   "metadata_length": 115,
 4 |   "num_rows": 513,
 5 |   "row_groups": [
 6 |     {
 7 |       "columns": [
 8 |         {
 9 |           "file_offset": 1471,
10 |           "meta_data": {
11 |             "codec": "GZIP",
12 |             "data_page_offset": 4,
13 |             "encodings": [
14 |               "PLAIN",
15 |               "RLE"
16 |             ],
17 |             "num_values": 513,
18 |             "path_in_schema": [
19 |               "long_col"
20 |             ],
21 |             "statistics": {
22 |               "max_value": 513,
23 |               "min_value": 1
24 |             },
25 |             "total_compressed_size": 1467,
26 |             "total_uncompressed_size": 4155,
27 |             "type": "INT64"
28 |           }
29 |         }
30 |       ],
31 |       "num_rows": 513,
32 |       "ordinal": 0,
33 |       "total_byte_size": 4155,
34 |       "total_compressed_size": 1467
35 |     }
36 |   ],
37 |   "schema": [
38 |     {
39 |       "name": "root",
40 |       "num_children": 1
41 |     },
42 |     {
43 |       "converted_type": "UINT_64",
44 |       "logical_type": {
45 |         "type": "INTEGER",
46 |         "bitWidth": 64,
47 |         "isSigned": false
48 |       },
49 |       "name": "long_col",
50 |       "repetition_type": "OPTIONAL",
51 |       "type": "INT64"
52 |     }
53 |   ]
54 | }
55 | 


--------------------------------------------------------------------------------
/test/files/nullable.impala.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   [
 3 |     1,
 4 |     [1, 2, 3],
 5 |     [[1, 2], [3, 4]],
 6 |     {"k1": 1, "k2": 100},
 7 |     [{"k1": 1}],
 8 |     {"A":1,"b":[1],"C":{"d":[[{"E":10,"F":"aaa"},{"E":-10,"F":"bbb"}],[{"E":11,"F":"c"}]]},"g":{"foo":{"H":{"i":[1.1]}}}}
 9 |   ],
10 |   [
11 |     2,
12 |     [null, 1, 2, null, 3, null],
13 |     [[null, 1, 2, null], [3, null, 4], [], null],
14 |     {"k1": 2, "k2": null},
15 |     [{"k1": 1, "k3": null}, null, {}],
16 |     {"A":null,"b":[null],"C":{"d":[[{"E":null,"F":null},{"E":10,"F":"aaa"},{"E":null,"F":null},{"E":-10,"F":"bbb"},{"E":null,"F":null}],[{"E":11,"F":"c"},null],[],null]},"g":{"g1":{"H":{"i":[2.2,null]}},"g2":{"H":{"i":[]}},"g3":null,"g4":{"H":{}},"g5":{}}}
17 |   ],
18 |   [
19 |     3,
20 |     [],
21 |     [null],
22 |     {},
23 |     [null, null],
24 |     {"A":null,"C":{"d":[]},"g":{}}
25 |   ],
26 |   [
27 |     4,
28 |     null,
29 |     [],
30 |     {},
31 |     [],
32 |     {"A":null,"C":{}}
33 |   ],
34 |   [
35 |     5,
36 |     null,
37 |     null,
38 |     {},
39 |     null,
40 |     {"A":null,"g":{"foo":{"H":{"i":[2.2,3.3]}}}}
41 |   ],
42 |   [
43 |     6,
44 |     null,
45 |     null,
46 |     null,
47 |     null,
48 |     null
49 |   ],
50 |   [
51 |     7,
52 |     null,
53 |     [null, [5, 6]],
54 |     {"k1": null, "k3": null},
55 |     null,
56 |     {"A":7,"b":[2,3,null],"C":{"d":[[],[null],null]}}
57 |   ]
58 | ]
59 | 


--------------------------------------------------------------------------------
/test/package.test.js:
--------------------------------------------------------------------------------
 1 | import { describe, expect, it } from 'vitest'
 2 | import packageJson from '../package.json' with { type: 'json' }
 3 | 
 4 | describe('package.json', () => {
 5 |   it('should have the correct name', () => {
 6 |     expect(packageJson.name).toBe('hyparquet')
 7 |   })
 8 |   it('should have a valid version', () => {
 9 |     expect(packageJson.version).toMatch(/^\d+\.\d+\.\d+$/)
10 |   })
11 |   it('should have MIT license', () => {
12 |     expect(packageJson.license).toBe('MIT')
13 |   })
14 |   it('should have precise dev dependency versions', () => {
15 |     const { devDependencies } = packageJson
16 |     Object.values(devDependencies).forEach(version => {
17 |       expect(version).toMatch(/^\d+\.\d+\.\d+$/)
18 |     })
19 |   })
20 |   it('should have no dependencies', () => {
21 |     expect('dependencies' in packageJson).toBe(false)
22 |     expect('peerDependencies' in packageJson).toBe(false)
23 |   })
24 |   it('should have exports with types first', () => {
25 |     const { exports } = packageJson
26 |     expect(Object.keys(exports)).toEqual(['.', './src/*.js'])
27 |     // node vs default (browser)
28 |     expect(Object.keys(exports['.'])).toEqual(['browser', 'default'])
29 |     expect(Object.keys(exports['.'].browser)).toEqual(['types', 'import'])
30 |     expect(Object.keys(exports['.'].default)).toEqual(['types', 'import'])
31 |     // deep imports
32 |     expect(Object.keys(exports['./src/*.js'])).toEqual(['types', 'import'])
33 |   })
34 | })
35 | 


--------------------------------------------------------------------------------
/test/files/issue90.metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 1,
 3 |   "schema": [
 4 |     {
 5 |       "repetition_type": "REQUIRED",
 6 |       "name": "duckdb_schema",
 7 |       "num_children": 1
 8 |     },
 9 |     {
10 |       "type": "DOUBLE",
11 |       "repetition_type": "OPTIONAL",
12 |       "name": "elb_01yr_imp_val"
13 |     }
14 |   ],
15 |   "num_rows": 6144,
16 |   "row_groups": [
17 |     {
18 |       "columns": [
19 |         {
20 |           "file_offset": 0,
21 |           "meta_data": {
22 |             "type": "DOUBLE",
23 |             "encodings": [
24 |               "PLAIN"
25 |             ],
26 |             "path_in_schema": [
27 |               "elb_01yr_imp_val"
28 |             ],
29 |             "codec": "SNAPPY",
30 |             "num_values": 6144,
31 |             "total_uncompressed_size": 45059,
32 |             "total_compressed_size": 44650,
33 |             "data_page_offset": 4,
34 |             "statistics": {
35 |               "max": 449097851.5197593,
36 |               "min": 0,
37 |               "null_count": 610,
38 |               "max_value": 449097851.5197593,
39 |               "min_value": 0,
40 |               "is_max_value_exact": true,
41 |               "is_min_value_exact": true
42 |             }
43 |           }
44 |         }
45 |       ],
46 |       "total_byte_size": 45059,
47 |       "num_rows": 6144,
48 |       "file_offset": 4
49 |     }
50 |   ],
51 |   "created_by": "DuckDB version v1.3.0 (build 71c5c07cdd)",
52 |   "metadata_length": 198
53 | }
54 | 


--------------------------------------------------------------------------------
/test/files/delta_length_byte_array.metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 2,
 3 |   "schema": [
 4 |     {
 5 |       "repetition_type": "REQUIRED",
 6 |       "name": "schema",
 7 |       "num_children": 1,
 8 |       "field_id": -1
 9 |     },
10 |     {
11 |       "type": "BYTE_ARRAY",
12 |       "repetition_type": "OPTIONAL",
13 |       "name": "FRUIT",
14 |       "converted_type": "UTF8",
15 |       "field_id": 1,
16 |       "logical_type": {
17 |         "type": "STRING"
18 |       }
19 |     }
20 |   ],
21 |   "num_rows": 1000,
22 |   "row_groups": [
23 |     {
24 |       "columns": [
25 |         {
26 |           "file_offset": 2629,
27 |           "meta_data": {
28 |             "type": "BYTE_ARRAY",
29 |             "encodings": [
30 |               "RLE",
31 |               "DELTA_LENGTH_BYTE_ARRAY"
32 |             ],
33 |             "path_in_schema": [
34 |               "FRUIT"
35 |             ],
36 |             "codec": "ZSTD",
37 |             "num_values": 1000,
38 |             "total_uncompressed_size": 23747,
39 |             "total_compressed_size": 2625,
40 |             "data_page_offset": 4,
41 |             "encoding_stats": [
42 |               {
43 |                 "page_type": "DATA_PAGE_V2",
44 |                 "encoding": "DELTA_LENGTH_BYTE_ARRAY",
45 |                 "count": 1
46 |               }
47 |             ]
48 |           }
49 |         }
50 |       ],
51 |       "total_byte_size": 23747,
52 |       "num_rows": 1000,
53 |       "file_offset": 0,
54 |       "total_compressed_size": 2625,
55 |       "ordinal": 0
56 |     }
57 |   ],
58 |   "metadata_length": 105
59 | }
60 | 


--------------------------------------------------------------------------------
/test/files/duckdb5533.column_indexes.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   [
 3 |     {
 4 |       "boundary_order": "UNORDERED",
 5 |       "max_values": [
 6 |         "2022-11-27T17:42:44.280Z"
 7 |       ],
 8 |       "min_values": [
 9 |         "2022-11-27T17:42:43.514Z"
10 |       ],
11 |       "null_counts": [
12 |         0
13 |       ],
14 |       "null_pages": [
15 |         false
16 |       ]
17 |     },
18 |     {
19 |       "boundary_order": "UNORDERED",
20 |       "max_values": [
21 |         85016
22 |       ],
23 |       "min_values": [
24 |         1184
25 |       ],
26 |       "null_counts": [
27 |         0
28 |       ],
29 |       "null_pages": [
30 |         false
31 |       ]
32 |     },
33 |     {
34 |       "boundary_order": "UNORDERED",
35 |       "max_values": [
36 |         [
37 |           0
38 |         ]
39 |       ],
40 |       "min_values": [
41 |         [
42 |           0
43 |         ]
44 |       ],
45 |       "null_counts": [
46 |         4
47 |       ],
48 |       "null_pages": [
49 |         true
50 |       ]
51 |     },
52 |     {
53 |       "boundary_order": "UNORDERED",
54 |       "max_values": [
55 |         1
56 |       ],
57 |       "min_values": [
58 |         -1
59 |       ],
60 |       "null_counts": [
61 |         0
62 |       ],
63 |       "null_pages": [
64 |         false
65 |       ]
66 |     },
67 |     {
68 |       "boundary_order": "UNORDERED",
69 |       "max_values": [
70 |         343
71 |       ],
72 |       "min_values": [
73 |         343
74 |       ],
75 |       "null_counts": [
76 |         0
77 |       ],
78 |       "null_pages": [
79 |         false
80 |       ]
81 |     }
82 |   ]
83 | ]
84 | 


--------------------------------------------------------------------------------
/src/geoparquet.js:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @import {KeyValue, LogicalType, SchemaElement} from '../src/types.d.ts'
 3 |  * @param {SchemaElement[]} schema
 4 |  * @param {KeyValue[] | undefined} key_value_metadata
 5 |  * @returns {void}
 6 |  */
 7 | export function markGeoColumns(schema, key_value_metadata) {
 8 |   // Prepare the list of GeoParquet columns
 9 |   /** @type {Map<string, LogicalType>} */
10 |   const columns = new Map()
11 |   const geo = key_value_metadata?.find(({ key }) => key === 'geo')?.value
12 |   const decodedColumns = (geo && JSON.parse(geo)?.columns) ?? {}
13 |   for (const [name, column] of Object.entries(decodedColumns)) {
14 |     if (column.encoding !== 'WKB') {
15 |       continue
16 |     }
17 |     const type = column.edges === 'spherical' ? 'GEOGRAPHY' : 'GEOMETRY'
18 |     const id = column.crs?.id ?? column.crs?.ids?.[0]
19 |     const crs = id ? `${id.authority}:${id.code.toString()}` : undefined
20 |     // Note: we can't infer GEOGRAPHY's algorithm from GeoParquet
21 |     columns.set(name, { type, crs })
22 |   }
23 | 
24 |   // Mark schema elements with logical type
25 |   // Only look at root-level columns of type BYTE_ARRAY without existing logical_type
26 |   for (let i = 1; i < schema.length; i++) { // skip root
27 |     const element = schema[i]
28 |     const { logical_type, name, num_children, repetition_type, type } = element
29 |     if (num_children) {
30 |       i += num_children
31 |       continue // skip the element and its children
32 |     }
33 |     if (type === 'BYTE_ARRAY' && logical_type === undefined && repetition_type !== 'REPEATED') {
34 |       element.logical_type = columns.get(name)
35 |     }
36 |   }
37 | }
38 | 


--------------------------------------------------------------------------------
/test/files/issue72.metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 1,
 3 |   "schema": [
 4 |     {
 5 |       "name": "root",
 6 |       "num_children": 1
 7 |     },
 8 |     {
 9 |       "type": "BYTE_ARRAY",
10 |       "repetition_type": "OPTIONAL",
11 |       "name": "TextColumn",
12 |       "converted_type": "UTF8",
13 |       "logical_type": {
14 |         "type": "STRING"
15 |       }
16 |     }
17 |   ],
18 |   "num_rows": 3,
19 |   "row_groups": [
20 |     {
21 |       "columns": [
22 |         {
23 |           "file_offset": 4,
24 |           "meta_data": {
25 |             "type": "BYTE_ARRAY",
26 |             "encodings": [
27 |               "RLE",
28 |               "BIT_PACKED",
29 |               "PLAIN"
30 |             ],
31 |             "path_in_schema": [
32 |               "TextColumn"
33 |             ],
34 |             "codec": "SNAPPY",
35 |             "num_values": 3,
36 |             "total_uncompressed_size": 283,
37 |             "total_compressed_size": 288,
38 |             "data_page_offset": 4,
39 |             "statistics": {
40 |               "max": "258d7fff-6418-499f-af07-c6611937d7d8",
41 |               "min": "086f2968-327b-48a8-8cdf-64f46bcd8173",
42 |               "null_count": 0,
43 |               "distinct_count": 2,
44 |               "max_value": "258d7fff-6418-499f-af07-c6611937d7d8",
45 |               "min_value": "086f2968-327b-48a8-8cdf-64f46bcd8173"
46 |             }
47 |           }
48 |         }
49 |       ],
50 |       "total_byte_size": 288,
51 |       "num_rows": 3
52 |     }
53 |   ],
54 |   "created_by": "Parquet.Net version 4.25.0 (build 687fbb462e94eddd1dc5a0aa26f33ba8e53f60e3)",
55 |   "metadata_length": 321
56 | }
57 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "hyparquet",
 3 |   "version": "1.23.3",
 4 |   "description": "Parquet file parser for JavaScript",
 5 |   "author": "Hyperparam",
 6 |   "homepage": "https://hyperparam.app",
 7 |   "keywords": [
 8 |     "ai",
 9 |     "data",
10 |     "dataset",
11 |     "hyperparam",
12 |     "hyparquet",
13 |     "ml",
14 |     "parquet",
15 |     "parquetjs",
16 |     "parser",
17 |     "snappy",
18 |     "thrift"
19 |   ],
20 |   "license": "MIT",
21 |   "repository": {
22 |     "type": "git",
23 |     "url": "git+https://github.com/hyparam/hyparquet.git"
24 |   },
25 |   "files": [
26 |     "src",
27 |     "types"
28 |   ],
29 |   "type": "module",
30 |   "types": "types/index.d.ts",
31 |   "main": "src/index.js",
32 |   "exports": {
33 |     ".": {
34 |       "browser": {
35 |         "types": "./types/index.d.ts",
36 |         "import": "./src/index.js"
37 |       },
38 |       "default": {
39 |         "types": "./types/node.d.ts",
40 |         "import": "./src/node.js"
41 |       }
42 |     },
43 |     "./src/*.js": {
44 |       "types": "./types/*.d.ts",
45 |       "import": "./src/*.js"
46 |     }
47 |   },
48 |   "sideEffects": false,
49 |   "scripts": {
50 |     "build:types": "tsc -p ./tsconfig.build.json",
51 |     "coverage": "vitest run --coverage --coverage.include=src",
52 |     "lint": "eslint",
53 |     "lint:fix": "eslint --fix",
54 |     "prepare": "npm run build:types",
55 |     "test": "vitest run"
56 |   },
57 |   "devDependencies": {
58 |     "@types/node": "25.0.3",
59 |     "@vitest/coverage-v8": "4.0.16",
60 |     "eslint": "9.39.2",
61 |     "eslint-plugin-jsdoc": "61.5.0",
62 |     "hyparquet-compressors": "1.1.1",
63 |     "typescript": "5.9.3",
64 |     "vitest": "4.0.16"
65 |   }
66 | }
67 | 


--------------------------------------------------------------------------------
/test/files/boolean_rle.metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "created_by": "Polars",
 3 |   "key_value_metadata": [
 4 |     {
 5 |       "key": "ARROW:schema",
 6 |       "value": "/////3YAAAAEAAAA8v///xQAAAAEAAEAAAAKAAsACAAKAAQA+P///wwAAAAIAAgAAAAEAAEAAAAEAAAA7P///ywAAAAgAAAAGAAAAAEGAAAQABIABAAQABEACAAAAAwAAAAAAPz///8EAAQADQAAAEJvb2xlYW5Db2x1bW4A"
 7 |     }
 8 |   ],
 9 |   "metadata_length": 308,
10 |   "num_rows": 15,
11 |   "row_groups": [
12 |     {
13 |       "columns": [
14 |         {
15 |           "column_index_length": 17,
16 |           "column_index_offset": 89,
17 |           "file_offset": 47,
18 |           "meta_data": {
19 |             "codec": "SNAPPY",
20 |             "data_page_offset": 4,
21 |             "encodings": [
22 |               "RLE"
23 |             ],
24 |             "num_values": 15,
25 |             "path_in_schema": [
26 |               "BooleanColumn"
27 |             ],
28 |             "statistics": {
29 |               "max_value": true,
30 |               "min_value": false,
31 |               "null_count": 5
32 |             },
33 |             "total_compressed_size": 43,
34 |             "total_uncompressed_size": 41,
35 |             "type": "BOOLEAN"
36 |           },
37 |           "offset_index_length": 10,
38 |           "offset_index_offset": 106
39 |         }
40 |       ],
41 |       "file_offset": 4,
42 |       "num_rows": 15,
43 |       "ordinal": 0,
44 |       "total_byte_size": 41,
45 |       "total_compressed_size": 43
46 |     }
47 |   ],
48 |   "schema": [
49 |     {
50 |       "name": "root",
51 |       "num_children": 1
52 |     },
53 |     {
54 |       "name": "BooleanColumn",
55 |       "repetition_type": "OPTIONAL",
56 |       "type": "BOOLEAN"
57 |     }
58 |   ],
59 |   "version": 1
60 | }
61 | 


--------------------------------------------------------------------------------
/src/indexes.js:
--------------------------------------------------------------------------------
 1 | import { BoundaryOrders } from './constants.js'
 2 | import { DEFAULT_PARSERS } from './convert.js'
 3 | import { convertMetadata } from './metadata.js'
 4 | import { deserializeTCompactProtocol } from './thrift.js'
 5 | 
 6 | /**
 7 |  * @param {DataReader} reader
 8 |  * @param {SchemaElement} schema
 9 |  * @param {ParquetParsers | undefined} parsers
10 |  * @returns {ColumnIndex}
11 |  */
12 | export function readColumnIndex(reader, schema, parsers = undefined) {
13 |   parsers = { ...DEFAULT_PARSERS, ...parsers }
14 | 
15 |   const thrift = deserializeTCompactProtocol(reader)
16 |   return {
17 |     null_pages: thrift.field_1,
18 |     min_values: thrift.field_2.map((/** @type {any} */ m) => convertMetadata(m, schema, parsers)),
19 |     max_values: thrift.field_3.map((/** @type {any} */ m) => convertMetadata(m, schema, parsers)),
20 |     boundary_order: BoundaryOrders[thrift.field_4],
21 |     null_counts: thrift.field_5,
22 |     repetition_level_histograms: thrift.field_6,
23 |     definition_level_histograms: thrift.field_7,
24 |   }
25 | }
26 | 
/**
 * Decode a thrift-encoded OffsetIndex from the reader.
 *
 * @param {DataReader} reader
 * @returns {OffsetIndex}
 */
export function readOffsetIndex(reader) {
  const {
    field_1: locations,
    field_2: unencoded_byte_array_data_bytes,
  } = deserializeTCompactProtocol(reader)
  const page_locations = locations.map(pageLocation)
  return { page_locations, unencoded_byte_array_data_bytes }
}
38 | 
/**
 * Give the numbered fields of a decoded page location their PageLocation names.
 *
 * @import {ColumnIndex, DataReader, OffsetIndex, PageLocation, ParquetParsers, SchemaElement} from '../src/types.d.ts'
 * @param {any} loc decoded struct with field_1, field_2 and field_3
 * @returns {PageLocation}
 */
function pageLocation(loc) {
  const {
    field_1: offset,
    field_2: compressed_page_size,
    field_3: first_row_index,
  } = loc
  return { offset, compressed_page_size, first_row_index }
}
51 | 


--------------------------------------------------------------------------------
/test/files/offset_indexed.offset_indexes.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   [
 3 |     null,
 4 |     {
 5 |       "page_locations": [
 6 |         {
 7 |           "offset": 438,
 8 |           "compressed_page_size": 2670,
 9 |           "first_row_index": 0
10 |         },
11 |         {
12 |           "offset": 3108,
13 |           "compressed_page_size": 2680,
14 |           "first_row_index": 18
15 |         },
16 |         {
17 |           "offset": 5788,
18 |           "compressed_page_size": 2704,
19 |           "first_row_index": 37
20 |         },
21 |         {
22 |           "offset": 8492,
23 |           "compressed_page_size": 2660,
24 |           "first_row_index": 54
25 |         },
26 |         {
27 |           "offset": 11152,
28 |           "compressed_page_size": 2790,
29 |           "first_row_index": 75
30 |         },
31 |         {
32 |           "offset": 13942,
33 |           "compressed_page_size": 830,
34 |           "first_row_index": 95
35 |         }
36 |       ]
37 |     }
38 |   ],
39 |   [
40 |     null,
41 |     {
42 |       "page_locations": [
43 |         {
44 |           "offset": 15208,
45 |           "compressed_page_size": 2784,
46 |           "first_row_index": 0
47 |         },
48 |         {
49 |           "offset": 17992,
50 |           "compressed_page_size": 2660,
51 |           "first_row_index": 20
52 |         },
53 |         {
54 |           "offset": 20652,
55 |           "compressed_page_size": 2648,
56 |           "first_row_index": 40
57 |         },
58 |         {
59 |           "offset": 23300,
60 |           "compressed_page_size": 2721,
61 |           "first_row_index": 58
62 |         },
63 |         {
64 |           "offset": 26021,
65 |           "compressed_page_size": 2734,
66 |           "first_row_index": 78
67 |         },
68 |         {
69 |           "offset": 28755,
70 |           "compressed_page_size": 752,
71 |           "first_row_index": 96
72 |         }
73 |       ]
74 |     }
75 |   ]
76 | ]
77 | 


--------------------------------------------------------------------------------
/test/readFiles.test.js:
--------------------------------------------------------------------------------
 1 | import fs from 'fs'
 2 | import { compressors } from 'hyparquet-compressors'
 3 | import { describe, expect, it } from 'vitest'
 4 | import { parquetMetadataAsync, parquetRead, toJson } from '../src/index.js'
 5 | import { asyncBufferFromFile } from '../src/node.js'
 6 | import { fileToJson } from './helpers.js'
 7 | 
 8 | describe('parquetRead test files', () => {
 9 |   const files = fs.readdirSync('test/files').filter(f => f.endsWith('.parquet'))
10 | 
11 |   files.forEach(filename => {
12 |     it(`parse data from ${filename}`, async () => {
13 |       const file = await asyncBufferFromFile(`test/files/${filename}`)
14 |       await parquetRead({
15 |         file,
16 |         compressors,
17 |         onComplete(rows) {
18 |           const base = filename.replace('.parquet', '')
19 |           const expected = fileToJson(`test/files/${base}.json`)
20 |           // stringify and parse to make legal json (NaN, -0, etc)
21 |           expect(JSON.parse(JSON.stringify(toJson(rows)))).toEqual(expected)
22 |         },
23 |       })
24 |     })
25 | 
26 |     it(`read the last row from ${filename}`, async () => {
27 |       // this exercises some of the page-skipping optimizations
28 |       const file = await asyncBufferFromFile(`test/files/${filename}`)
29 |       const metadata = await parquetMetadataAsync(file)
30 |       let numRows = Number(metadata.num_rows)
31 |       // repeated_no_annotation has wrong num_rows in metadata:
32 |       if (filename === 'repeated_no_annotation.parquet') numRows = 6
33 |       await parquetRead({
34 |         file,
35 |         compressors,
36 |         rowStart: numRows - 1,
37 |         rowEnd: numRows,
38 |         onComplete(rows) {
39 |           const base = filename.replace('.parquet', '')
40 |           if (rows.length) {
41 |             const expected = [fileToJson(`test/files/${base}.json`).at(-1)]
42 |             expect(toJson(rows)).toEqual(expected)
43 |           }
44 |         },
45 |       })
46 |     })
47 |   })
48 | })
49 | 


--------------------------------------------------------------------------------
/test/files/fixed_length_decimal.metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "version": 1,
 3 |     "schema": [
 4 |       {
 5 |         "name": "spark_schema",
 6 |         "num_children": 1
 7 |       },
 8 |       {
 9 |         "type": "FIXED_LEN_BYTE_ARRAY",
10 |         "type_length": 11,
11 |         "repetition_type": "OPTIONAL",
12 |         "name": "value",
13 |         "converted_type": "DECIMAL",
14 |         "scale": 2,
15 |         "precision": 25
16 |       }
17 |     ],
18 |     "num_rows": 24,
19 |     "row_groups": [
20 |       {
21 |         "columns": [
22 |           {
23 |             "file_offset": 4,
24 |             "meta_data": {
25 |               "type": "FIXED_LEN_BYTE_ARRAY",
26 |               "encodings": [
27 |                 "BIT_PACKED",
28 |                 "RLE",
29 |                 "PLAIN"
30 |               ],
31 |               "path_in_schema": [
32 |                 "value"
33 |               ],
34 |               "codec": "UNCOMPRESSED",
35 |               "num_values": 24,
36 |               "total_uncompressed_size": 319,
37 |               "total_compressed_size": 319,
38 |               "data_page_offset": 4,
39 |               "statistics": {
40 |                 "max": 24,
41 |                 "min": 2,
42 |                 "null_count": 0
43 |               },
44 |               "encoding_stats": [
45 |                 {
46 |                   "page_type": "DATA_PAGE",
47 |                   "encoding": "PLAIN",
48 |                   "count": 1
49 |                 }
50 |               ]
51 |             }
52 |           }
53 |         ],
54 |         "total_byte_size": 319,
55 |         "num_rows": 24
56 |       }
57 |     ],
58 |     "key_value_metadata": [
59 |       {
60 |         "key": "org.apache.spark.sql.parquet.row.metadata",
61 |         "value": "{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"decimal(25,2)\",\"nullable\":true,\"metadata\":{}}]}"
62 |       }
63 |     ],
64 |     "created_by": "parquet-mr version 1.8.2 (build c6522788629e590a53eb79874b95f6c3ff11f16c)",
65 |     "metadata_length": 346
66 |   }
67 | 


--------------------------------------------------------------------------------
/test/read.utf8.test.js:
--------------------------------------------------------------------------------
 1 | import { describe, expect, it } from 'vitest'
 2 | import { parquetReadObjects } from '../src/index.js'
 3 | import { asyncBufferFromFile } from '../src/node.js'
 4 | 
 5 | describe('parquetRead utf8', () => {
 6 |   it('default utf8 behavior', async () => {
 7 |     const file = await asyncBufferFromFile('test/files/strings.parquet')
 8 |     const rows = await parquetReadObjects({ file })
 9 |     expect(rows).toEqual([
10 |       { bytes: 'alpha', c_utf8: 'alpha', l_utf8: 'alpha' },
11 |       { bytes: 'bravo', c_utf8: 'bravo', l_utf8: 'bravo' },
12 |       { bytes: 'charlie', c_utf8: 'charlie', l_utf8: 'charlie' },
13 |       { bytes: 'delta', c_utf8: 'delta', l_utf8: 'delta' },
14 |     ])
15 |   })
16 | 
17 |   it('utf8 = true', async () => {
18 |     const file = await asyncBufferFromFile('test/files/strings.parquet')
19 |     const rows = await parquetReadObjects({ file, utf8: true })
20 |     expect(rows).toEqual([
21 |       { bytes: 'alpha', c_utf8: 'alpha', l_utf8: 'alpha' },
22 |       { bytes: 'bravo', c_utf8: 'bravo', l_utf8: 'bravo' },
23 |       { bytes: 'charlie', c_utf8: 'charlie', l_utf8: 'charlie' },
24 |       { bytes: 'delta', c_utf8: 'delta', l_utf8: 'delta' },
25 |     ])
26 |   })
27 | 
28 |   it('utf8 = false', async () => {
29 |     const file = await asyncBufferFromFile('test/files/strings.parquet')
30 |     const rows = await parquetReadObjects({ file, utf8: false })
31 |     expect(rows).toEqual([
32 |       {
33 |         bytes: new Uint8Array([97, 108, 112, 104, 97]),
34 |         c_utf8: 'alpha',
35 |         l_utf8: 'alpha',
36 |       },
37 |       {
38 |         bytes: new Uint8Array([98, 114, 97, 118, 111]),
39 |         c_utf8: 'bravo',
40 |         l_utf8: 'bravo',
41 |       },
42 |       {
43 |         bytes: new Uint8Array([99, 104, 97, 114, 108, 105, 101]),
44 |         c_utf8: 'charlie',
45 |         l_utf8: 'charlie',
46 |       },
47 |       {
48 |         bytes: new Uint8Array([100, 101, 108, 116, 97]),
49 |         c_utf8: 'delta',
50 |         l_utf8: 'delta',
51 |       },
52 |     ])
53 |   })
54 | })
55 | 


--------------------------------------------------------------------------------
/test/files/float16_nonzeros_and_nans.metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 2,
 3 |   "schema": [
 4 |     {
 5 |     "repetition_type": "REQUIRED",
 6 |     "name": "schema",
 7 |     "num_children": 1
 8 |     },
 9 |     {
10 |     "type": "FIXED_LEN_BYTE_ARRAY",
11 |     "type_length": 2,
12 |     "repetition_type": "OPTIONAL",
13 |     "name": "x",
14 |     "logical_type": {
15 |       "type": "FLOAT16"
16 |     }
17 |     }
18 |   ],
19 |   "num_rows": 8,
20 |   "row_groups": [
21 |     {
22 |     "columns": [
23 |       {
24 |       "file_offset": 80,
25 |       "meta_data": {
26 |         "type": "FIXED_LEN_BYTE_ARRAY",
27 |         "encodings": [
28 |           "PLAIN",
29 |           "RLE",
30 |           "RLE_DICTIONARY"
31 |         ],
32 |         "path_in_schema": [
33 |           "x"
34 |         ],
35 |         "codec": "UNCOMPRESSED",
36 |         "num_values": 8,
37 |         "total_uncompressed_size": 76,
38 |         "total_compressed_size": 76,
39 |         "data_page_offset": 32,
40 |         "dictionary_page_offset": 4,
41 |         "statistics": {
42 |         "max": 2,
43 |         "min": -2,
44 |         "null_count": 1,
45 |         "max_value": 2,
46 |         "min_value": -2
47 |       },
48 |       "encoding_stats": [
49 |         {
50 |           "page_type": "DICTIONARY_PAGE",
51 |           "encoding": "PLAIN",
52 |           "count": 1
53 |         },
54 |         {
55 |           "page_type": "DATA_PAGE",
56 |           "encoding": "RLE_DICTIONARY",
57 |           "count": 1
58 |         }
59 |         ]
60 |       }
61 |       }
62 |     ],
63 |     "total_byte_size": 76,
64 |     "num_rows": 8,
65 |     "file_offset": 4,
66 |     "total_compressed_size": 76,
67 |     "ordinal": 0
68 |     }
69 |   ],
70 |   "key_value_metadata": [
71 |     {
72 |     "key": "ARROW:schema",
73 |     "value": "/////3AAAAAQAAAAAAAKAAwABgAFAAgACgAAAAABBAAMAAAACAAIAAAABAAIAAAABAAAAAEAAAAUAAAAEAAUAAgABgAHAAwAAAAQABAAAAAAAAEDEAAAABgAAAAEAAAAAAAAAAEAAAB4AAAABAAEAAQAAAAAAAAA"
74 |     }
75 |   ],
76 |   "created_by": "parquet-cpp-arrow version 15.0.0-SNAPSHOT",
77 |   "metadata_length": 346
78 | }
79 | 


--------------------------------------------------------------------------------
/src/constants.js:
--------------------------------------------------------------------------------
 1 | 
/**
 * Names of the parquet physical types.
 * NOTE(review): presumably looked up by numeric thrift enum value, so the
 * order of entries must not change — confirm against callers.
 *
 * @type {import('../src/types.d.ts').ParquetType[]}
 */
export const ParquetTypes = [
  'BOOLEAN',
  'INT32',
  'INT64',
  'INT96', // deprecated
  'FLOAT',
  'DOUBLE',
  'BYTE_ARRAY',
  'FIXED_LEN_BYTE_ARRAY',
]
13 | 
/**
 * Names of the page encodings, including deprecated ones.
 * NOTE(review): presumably looked up by numeric thrift enum value, so the
 * order of entries must not change — confirm against callers.
 *
 * @type {import('../src/types.d.ts').Encoding[]}
 */
export const Encodings = [
  'PLAIN',
  'GROUP_VAR_INT', // deprecated
  'PLAIN_DICTIONARY',
  'RLE',
  'BIT_PACKED', // deprecated
  'DELTA_BINARY_PACKED',
  'DELTA_LENGTH_BYTE_ARRAY',
  'DELTA_BYTE_ARRAY',
  'RLE_DICTIONARY',
  'BYTE_STREAM_SPLIT',
]
27 | 
/**
 * Names of the field repetition types of a schema element.
 * NOTE(review): presumably looked up by numeric thrift enum value, so the
 * order of entries must not change — confirm against callers.
 *
 * @type {import('../src/types.d.ts').FieldRepetitionType[]}
 */
export const FieldRepetitionTypes = [
  'REQUIRED',
  'OPTIONAL',
  'REPEATED',
]
34 | 
/**
 * Names of the converted types of a schema element.
 * NOTE(review): presumably looked up by numeric thrift enum value, so the
 * order of entries must not change — confirm against callers.
 *
 * @type {import('../src/types.d.ts').ConvertedType[]}
 */
export const ConvertedTypes = [
  'UTF8',
  'MAP',
  'MAP_KEY_VALUE',
  'LIST',
  'ENUM',
  'DECIMAL',
  'DATE',
  'TIME_MILLIS',
  'TIME_MICROS',
  'TIMESTAMP_MILLIS',
  'TIMESTAMP_MICROS',
  'UINT_8',
  'UINT_16',
  'UINT_32',
  'UINT_64',
  'INT_8',
  'INT_16',
  'INT_32',
  'INT_64',
  'JSON',
  'BSON',
  'INTERVAL',
]
60 | 
/**
 * Names of the compression codecs.
 * NOTE(review): presumably looked up by numeric thrift enum value, so the
 * order of entries must not change — confirm against callers.
 *
 * @type {import('../src/types.d.ts').CompressionCodec[]}
 */
export const CompressionCodecs = [
  'UNCOMPRESSED',
  'SNAPPY',
  'GZIP',
  'LZO',
  'BROTLI',
  'LZ4',
  'ZSTD',
  'LZ4_RAW',
]
72 | 
/**
 * Names of the page types.
 * NOTE(review): presumably looked up by numeric thrift enum value, so the
 * order of entries must not change — confirm against callers.
 *
 * @type {import('../src/types.d.ts').PageType[]}
 */
export const PageTypes = [
  'DATA_PAGE',
  'INDEX_PAGE',
  'DICTIONARY_PAGE',
  'DATA_PAGE_V2',
]
80 | 
/**
 * Names of the column index boundary orders.
 * readColumnIndex looks entries up by the numeric value decoded from thrift,
 * so the order of entries must not change.
 *
 * @type {import('../src/types.d.ts').BoundaryOrder[]}
 */
export const BoundaryOrders = [
  'UNORDERED',
  'ASCENDING',
  'DESCENDING',
]
87 | 
/**
 * Names of the edge interpolation algorithms.
 * NOTE(review): presumably looked up by numeric thrift enum value, so the
 * order of entries must not change — confirm against callers.
 *
 * @type {import('../src/types.d.ts').EdgeInterpolationAlgorithm[]}
 */
export const EdgeInterpolationAlgorithms = [
  'SPHERICAL',
  'VINCENTY',
  'THOMAS',
  'ANDOYER',
  'KARNEY',
]
96 | 


--------------------------------------------------------------------------------
/test/files/duckdb3734.json:
--------------------------------------------------------------------------------
  1 | [
  2 |   [
  3 |     "tt0000001",
  4 |     {
  5 |       "category": "self",
  6 |       "characters": ["[\"Self\"]"],
  7 |       "job": "\\N",
  8 |       "nconst": "nm1588970",
  9 |       "ordering": 1
 10 |     }
 11 |   ],
 12 |   [
 13 |     "tt0000001",
 14 |     {
 15 |       "category": "director",
 16 |       "characters": ["\\N"],
 17 |       "job": "\\N",
 18 |       "nconst": "nm0005690",
 19 |       "ordering": 2
 20 |     }
 21 |   ],
 22 |   [
 23 |     "tt0000001",
 24 |     {
 25 |       "category": "cinematographer",
 26 |       "characters": ["\\N"],
 27 |       "job": "director of photography",
 28 |       "nconst": "nm0374658",
 29 |       "ordering": 3
 30 |     }
 31 |   ],
 32 |   [
 33 |     "tt0000002",
 34 |     {
 35 |       "category": "director",
 36 |       "characters": ["\\N"],
 37 |       "job": "\\N",
 38 |       "nconst": "nm0721526",
 39 |       "ordering": 1
 40 |     }
 41 |   ],
 42 |   [
 43 |     "tt0000002",
 44 |     {
 45 |       "category": "composer",
 46 |       "characters": ["\\N"],
 47 |       "job": "\\N",
 48 |       "nconst": "nm1335271",
 49 |       "ordering": 2
 50 |     }
 51 |   ],
 52 |   [
 53 |     "tt0000003",
 54 |     {
 55 |       "category": "director",
 56 |       "characters": ["\\N"],
 57 |       "job": "\\N",
 58 |       "nconst": "nm0721526",
 59 |       "ordering": 1
 60 |     }
 61 |   ],
 62 |   [
 63 |     "tt0000003",
 64 |     {
 65 |       "category": "producer",
 66 |       "characters": ["\\N"],
 67 |       "job": "producer",
 68 |       "nconst": "nm1770680",
 69 |       "ordering": 2
 70 |     }
 71 |   ],
 72 |   [
 73 |     "tt0000003",
 74 |     {
 75 |       "category": "composer",
 76 |       "characters": ["\\N"],
 77 |       "job": "\\N",
 78 |       "nconst": "nm1335271",
 79 |       "ordering": 3
 80 |     }
 81 |   ],
 82 |   [
 83 |     "tt0000003",
 84 |     {
 85 |       "category": "editor",
 86 |       "characters": ["\\N"],
 87 |       "job": "\\N",
 88 |       "nconst": "nm5442200",
 89 |       "ordering": 4
 90 |     }
 91 |   ],
 92 |   [
 93 |     "tt0000004",
 94 |     {
 95 |       "category": "director",
 96 |       "characters": ["\\N"],
 97 |       "job": "\\N",
 98 |       "nconst": "nm0721526",
 99 |       "ordering": 1
100 |     }
101 |   ]
102 | ]
103 | 


--------------------------------------------------------------------------------
/test/files/duckdb2557.metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |  "version": 1,
 3 |  "schema": [
 4 |    {
 5 |      "name": "root",
 6 |      "num_children": 3
 7 |    },
 8 |    {
 9 |      "type": "BYTE_ARRAY",
10 |      "repetition_type": "REPEATED",
11 |      "name": "stringArray",
12 |      "converted_type": "UTF8"
13 |    },
14 |    {
15 |      "type": "INT32",
16 |      "repetition_type": "REPEATED",
17 |      "name": "intArray"
18 |    },
19 |    {
20 |      "type": "DOUBLE",
21 |      "repetition_type": "REPEATED",
22 |      "name": "doubleArray"
23 |    }
24 |  ],
25 |  "num_rows": 100,
26 |  "row_groups": [
27 |    {
28 |      "columns": [
29 |        {
30 |          "file_offset": 4802,
31 |          "meta_data": {
32 |            "type": "BYTE_ARRAY",
33 |            "encodings": [
34 |              "RLE",
35 |              "PLAIN"
36 |            ],
37 |            "path_in_schema": [
38 |              "stringArray"
39 |            ],
40 |            "codec": "UNCOMPRESSED",
41 |            "num_values": 449,
42 |            "total_uncompressed_size": 4798,
43 |            "total_compressed_size": 4798,
44 |            "data_page_offset": 4
45 |          }
46 |        },
47 |        {
48 |          "file_offset": 5874,
49 |          "meta_data": {
50 |            "type": "INT32",
51 |            "encodings": [
52 |              "RLE",
53 |              "PLAIN"
54 |            ],
55 |            "path_in_schema": [
56 |              "intArray"
57 |            ],
58 |            "codec": "UNCOMPRESSED",
59 |            "num_values": 237,
60 |            "total_uncompressed_size": 1038,
61 |            "total_compressed_size": 1038,
62 |            "data_page_offset": 4836
63 |          }
64 |        },
65 |        {
66 |          "file_offset": 7663,
67 |          "meta_data": {
68 |            "type": "DOUBLE",
69 |            "encodings": [
70 |              "RLE",
71 |              "PLAIN"
72 |            ],
73 |            "path_in_schema": [
74 |              "doubleArray"
75 |            ],
76 |            "codec": "UNCOMPRESSED",
77 |            "num_values": 225,
78 |            "total_uncompressed_size": 1757,
79 |            "total_compressed_size": 1757,
80 |            "data_page_offset": 5906
81 |          }
82 |        }
83 |      ],
84 |      "total_byte_size": 7694,
85 |      "num_rows": 100
86 |    }
87 |  ],
88 |  "key_value_metadata": [],
89 |  "created_by": "parquet.js",
90 |  "metadata_length": 211
91 | }
92 | 


--------------------------------------------------------------------------------
/test/files/struct_strings.metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 1,
 3 |   "schema": [
 4 |     {
 5 |       "name": "test",
 6 |       "num_children": 1
 7 |     },
 8 |     {
 9 |       "repetition_type": "REQUIRED",
10 |       "name": "inner",
11 |       "num_children": 2
12 |     },
13 |     {
14 |       "type": "BYTE_ARRAY",
15 |       "repetition_type": "OPTIONAL",
16 |       "name": "str_field",
17 |       "converted_type": "UTF8",
18 |       "logical_type": {
19 |         "type": "STRING"
20 |       }
21 |     },
22 |     {
23 |       "type": "DOUBLE",
24 |       "repetition_type": "OPTIONAL",
25 |       "name": "f64_field"
26 |     }
27 |   ],
28 |   "num_rows": 2,
29 |   "row_groups": [
30 |     {
31 |       "columns": [
32 |         {
33 |           "file_offset": 4,
34 |           "meta_data": {
35 |             "type": "BYTE_ARRAY",
36 |             "encodings": [
37 |               "RLE",
38 |               "PLAIN"
39 |             ],
40 |             "path_in_schema": [
41 |               "inner",
42 |               "str_field"
43 |             ],
44 |             "codec": "SNAPPY",
45 |             "num_values": 2,
46 |             "total_uncompressed_size": 32,
47 |             "total_compressed_size": 34,
48 |             "key_value_metadata": [],
49 |             "data_page_offset": 4,
50 |             "statistics": {
51 |               "null_count": 1,
52 |               "distinct_count": 1
53 |             }
54 |           }
55 |         },
56 |         {
57 |           "file_offset": 38,
58 |           "meta_data": {
59 |             "type": "DOUBLE",
60 |             "encodings": [
61 |               "RLE",
62 |               "PLAIN"
63 |             ],
64 |             "path_in_schema": [
65 |               "inner",
66 |               "f64_field"
67 |             ],
68 |             "codec": "SNAPPY",
69 |             "num_values": 2,
70 |             "total_uncompressed_size": 31,
71 |             "total_compressed_size": 33,
72 |             "key_value_metadata": [],
73 |             "data_page_offset": 38,
74 |             "statistics": {
75 |               "null_count": 1,
76 |               "distinct_count": 1,
77 |               "max_value": 1.23,
78 |               "min_value": 1.23
79 |             }
80 |           }
81 |         }
82 |       ],
83 |       "total_byte_size": 0,
84 |       "num_rows": 2
85 |     }
86 |   ],
87 |   "key_value_metadata": [],
88 |   "created_by": "parquet-go",
89 |   "metadata_length": 203
90 | }
91 | 


--------------------------------------------------------------------------------
/eslint.config.js:
--------------------------------------------------------------------------------
 1 | import javascript from '@eslint/js'
 2 | import jsdoc from 'eslint-plugin-jsdoc'
 3 | import globals from 'globals'
 4 | 
 5 | export default [ // ESLint flat config: one config object with no `files`/`ignores` keys
 6 |   {
 7 |     plugins: {
 8 |       jsdoc,
 9 |     },
10 | 
11 |     languageOptions: {
12 |       globals: { // both browser and Node.js globals are predefined
13 |         ...globals.browser,
14 |         ...globals.node,
15 |       },
16 |     },
17 | 
18 |     rules: {
19 |       ...javascript.configs.recommended.rules, // baseline; the entries below add to or override it
20 |       'arrow-spacing': 'error',
21 |       camelcase: 'off', // identifier casing is not enforced
22 |       'comma-spacing': 'error',
23 |       'comma-dangle': ['error', {
24 |         arrays: 'always-multiline',
25 |         objects: 'always-multiline',
26 |         imports: 'always-multiline',
27 |         exports: 'always-multiline',
28 |         functions: 'never', // no trailing comma in function parameter/argument lists
29 |       }],
30 |       'eol-last': 'error',
31 |       eqeqeq: 'error',
32 |       'func-style': ['error', 'declaration'],
33 |       indent: ['error', 2],
34 |       'jsdoc/check-param-names': 'error',
35 |       'jsdoc/check-property-names': 'error',
36 |       'jsdoc/check-tag-names': 'error',
37 |       'jsdoc/require-param': 'error',
38 |       'jsdoc/require-param-type': 'error',
39 |       'jsdoc/require-returns': 'error',
40 |       'jsdoc/require-returns-type': 'error',
41 |       'jsdoc/sort-tags': 'error',
42 |       'key-spacing': 'error',
43 |       'keyword-spacing': 'error',
44 |       'no-constant-condition': 'off', // constant conditions such as `while (true)` are allowed
45 |       'no-extra-parens': 'error',
46 |       'no-multi-spaces': 'error',
47 |       'no-trailing-spaces': 'error',
48 |       'no-undef': 'error',
49 |       'no-unused-vars': 'error',
50 |       'no-useless-concat': 'error',
51 |       'no-useless-rename': 'error',
52 |       'no-useless-return': 'error',
53 |       'no-var': 'error',
54 |       'object-curly-spacing': ['error', 'always'],
55 |       'object-shorthand': 'error',
56 |       'prefer-const': 'error',
57 |       'prefer-destructuring': ['warn', { // warning only, and only for object (not array) destructuring
58 |         object: true,
59 |         array: false,
60 |       }],
61 |       'prefer-exponentiation-operator': 'error',
62 |       'prefer-promise-reject-errors': 'error',
63 |       quotes: ['error', 'single'],
64 |       'require-await': 'warn',
65 |       semi: ['error', 'never'], // semicolon-free style
66 |       'sort-imports': ['error', {
67 |         ignoreDeclarationSort: true, // order of import statements is not checked, only names within one import
68 |         ignoreMemberSort: false,
69 |         memberSyntaxSortOrder: ['none', 'all', 'multiple', 'single'],
70 |       }],
71 |       'space-infix-ops': 'error',
72 |     },
73 |   },
74 | ]
75 | 


--------------------------------------------------------------------------------
/test/files/issue115decimal.metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 2,
 3 |   "schema": [
 4 |     {
 5 |       "repetition_type": "REQUIRED",
 6 |       "name": "schema",
 7 |       "num_children": 1
 8 |     },
 9 |     {
10 |       "type": "FIXED_LEN_BYTE_ARRAY",
11 |       "type_length": 13,
12 |       "repetition_type": "OPTIONAL",
13 |       "name": "amount",
14 |       "converted_type": "DECIMAL",
15 |       "scale": 2,
16 |       "precision": 29,
17 |       "logical_type": {
18 |         "type": "DECIMAL",
19 |         "scale": 2,
20 |         "precision": 29
21 |       }
22 |     }
23 |   ],
24 |   "num_rows": 1,
25 |   "row_groups": [
26 |     {
27 |       "columns": [
28 |         {
29 |           "file_offset": 0,
30 |           "meta_data": {
31 |             "type": "FIXED_LEN_BYTE_ARRAY",
32 |             "encodings": [
33 |               "PLAIN",
34 |               "RLE",
35 |               "RLE_DICTIONARY"
36 |             ],
37 |             "path_in_schema": [
38 |               "amount"
39 |             ],
40 |             "codec": "SNAPPY",
41 |             "num_values": 1,
42 |             "total_uncompressed_size": 117,
43 |             "total_compressed_size": 121,
44 |             "data_page_offset": 33,
45 |             "dictionary_page_offset": 4,
46 |             "statistics": {
47 |               "max": -12345.67,
48 |               "min": -12345.67,
49 |               "null_count": 0,
50 |               "max_value": -12345.67,
51 |               "min_value": -12345.67
52 |             },
53 |             "encoding_stats": [
54 |               {
55 |                 "page_type": "DICTIONARY_PAGE",
56 |                 "encoding": "PLAIN",
57 |                 "count": 1
58 |               },
59 |               {
60 |                 "page_type": "DATA_PAGE",
61 |                 "encoding": "RLE_DICTIONARY",
62 |                 "count": 1
63 |               }
64 |             ]
65 |           }
66 |         }
67 |       ],
68 |       "total_byte_size": 117,
69 |       "num_rows": 1,
70 |       "file_offset": 4,
71 |       "total_compressed_size": 121,
72 |       "ordinal": 0
73 |     }
74 |   ],
75 |   "key_value_metadata": [
76 |     {
77 |       "key": "ARROW:schema",
78 |       "value": "/////4AAAAAQAAAAAAAKAAwABgAFAAgACgAAAAABBAAMAAAACAAIAAAABAAIAAAABAAAAAEAAAAUAAAAEAAUAAgABgAHAAwAAAAQABAAAAAAAAEHEAAAACAAAAAEAAAAAAAAAAYAAABhbW91bnQAAAgADAAEAAgACAAAAB0AAAACAAAAAAAAAA=="
79 |     }
80 |   ],
81 |   "created_by": "parquet-cpp-arrow version 19.0.1",
82 |   "metadata_length": 424
83 | }
84 | 


--------------------------------------------------------------------------------
/test/rowend_struct.test.js:
--------------------------------------------------------------------------------
 1 | import { describe, expect, it } from 'vitest'
 2 | import { parquetReadObjects } from '../src/index.js'
 3 | import { asyncBufferFromFile } from '../src/node.js'
 4 | 
 5 | /**
 6 |  * Test for issue #147: struct children with different page counts
 7 |  * cause "parquet struct parsing error" when using rowEnd.
 8 |  *
 9 |  * The bug occurs when:
10 |  * 1. A struct has multiple child columns
11 |  * 2. One child has multiple pages (large data)
12 |  * 3. Another child has fewer pages (small/compressible data)
13 |  * 4. rowEnd is used to limit the number of rows read
14 |  *
15 |  * The root cause is in column.js - for non-flat columns, all pages
16 |  * are read, but truncation only affects the last chunk. If a column
17 |  * has multiple chunks (pages), earlier chunks aren't truncated,
18 |  * resulting in mismatched array lengths during struct assembly.
19 |  *
20 |  * Test file: rowend_struct.parquet (created with pyarrow)
21 |  * - 1050 rows
22 |  * - struct column 's' with children:
23 |  *   - 'a': unique strings (2 data pages due to snappy compression)
24 |  *   - 'b': same string "x" (1 data page)
25 |  */
26 | describe('rowEnd with struct columns', () => {
27 |   it('reads all rows without error', async () => {
28 |     const file = await asyncBufferFromFile('test/files/rowend_struct.parquet')
29 |     const rows = await parquetReadObjects({ file })
30 |     expect(rows.length).toBe(1050)
31 |     expect(rows[0]).toEqual({ s: { a: 'v0000', b: 'x' } })
32 |     expect(rows[1049]).toEqual({ s: { a: 'v1049', b: 'x' } })
33 |   })
34 | 
35 |   it('reads partial rows with rowEnd', async () => {
36 |     const file = await asyncBufferFromFile('test/files/rowend_struct.parquet')
37 |     // Regression check for issue #147: must return 10 rows; without the fix this call
38 |     // throws "parquet struct parsing error" due to mismatched child array lengths
39 |     const rows = await parquetReadObjects({ file, rowEnd: 10 })
40 |     expect(rows.length).toBe(10)
41 |     expect(rows[0]).toEqual({ s: { a: 'v0000', b: 'x' } })
42 |     expect(rows[9]).toEqual({ s: { a: 'v0009', b: 'x' } })
43 |   })
44 | 
45 |   it('reads middle rows with rowStart and rowEnd', async () => {
46 |     const file = await asyncBufferFromFile('test/files/rowend_struct.parquet')
47 |     const rows = await parquetReadObjects({ file, rowStart: 100, rowEnd: 110 })
48 |     expect(rows.length).toBe(10)
49 |     expect(rows[0]).toEqual({ s: { a: 'v0100', b: 'x' } })
50 |     expect(rows[9]).toEqual({ s: { a: 'v0109', b: 'x' } })
51 |   })
52 | })
53 | 


--------------------------------------------------------------------------------
/test/files/strings.metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 2,
 3 |   "schema": [
 4 |     {
 5 |       "name": "root",
 6 |       "num_children": 3
 7 |     },
 8 |     {
 9 |       "type": "BYTE_ARRAY",
10 |       "name": "bytes"
11 |     },
12 |     {
13 |       "type": "BYTE_ARRAY",
14 |       "name": "c_utf8",
15 |       "converted_type": "UTF8"
16 |     },
17 |     {
18 |       "type": "BYTE_ARRAY",
19 |       "name": "l_utf8",
20 |       "logical_type": {
21 |         "type": "STRING"
22 |       }
23 |     }
24 |   ],
25 |   "num_rows": 4,
26 |   "row_groups": [
27 |     {
28 |       "columns": [
29 |         {
30 |           "file_offset": 4,
31 |           "meta_data": {
32 |             "type": "BYTE_ARRAY",
33 |             "encodings": ["PLAIN"],
34 |             "path_in_schema": ["bytes"],
35 |             "codec": "UNCOMPRESSED",
36 |             "num_values": 4,
37 |             "total_uncompressed_size": 62,
38 |             "total_compressed_size": 62,
39 |             "data_page_offset": 4,
40 |             "statistics": {
41 |               "null_count": 0,
42 |               "max_value": "delta",
43 |               "min_value": "alpha"
44 |             }
45 |           }
46 |         },
47 |         {
48 |           "file_offset": 66,
49 |           "meta_data": {
50 |             "type": "BYTE_ARRAY",
51 |             "encodings": ["PLAIN"],
52 |             "path_in_schema": ["c_utf8"],
53 |             "codec": "UNCOMPRESSED",
54 |             "num_values": 4,
55 |             "total_uncompressed_size": 62,
56 |             "total_compressed_size": 62,
57 |             "data_page_offset": 66,
58 |             "statistics": {
59 |               "null_count": 0,
60 |               "max_value": "delta",
61 |               "min_value": "alpha"
62 |             }
63 |           }
64 |         },
65 |         {
66 |           "file_offset": 128,
67 |           "meta_data": {
68 |             "type": "BYTE_ARRAY",
69 |             "encodings": ["PLAIN"],
70 |             "path_in_schema": ["l_utf8"],
71 |             "codec": "UNCOMPRESSED",
72 |             "num_values": 4,
73 |             "total_uncompressed_size": 62,
74 |             "total_compressed_size": 62,
75 |             "data_page_offset": 128,
76 |             "statistics": {
77 |               "null_count": 0,
78 |               "max_value": "delta",
79 |               "min_value": "alpha"
80 |             }
81 |           }
82 |         }
83 |       ],
84 |       "total_byte_size": 186,
85 |       "num_rows": 4
86 |     }
87 |   ],
88 |   "created_by": "hyparquet",
89 |   "metadata_length": 219
90 | }
91 | 


--------------------------------------------------------------------------------
/test/files/continued_page.metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 2,
 3 |   "schema": [
 4 |     {
 5 |       "repetition_type": "REQUIRED",
 6 |       "name": "schema",
 7 |       "num_children": 1
 8 |     },
 9 |     {
10 |       "repetition_type": "OPTIONAL",
11 |       "name": "int_list",
12 |       "num_children": 1,
13 |       "converted_type": "LIST",
14 |       "logical_type": {
15 |         "type": "LIST"
16 |       }
17 |     },
18 |     {
19 |       "repetition_type": "REPEATED",
20 |       "name": "list",
21 |       "num_children": 1
22 |     },
23 |     {
24 |       "type": "INT32",
25 |       "repetition_type": "OPTIONAL",
26 |       "name": "element"
27 |     }
28 |   ],
29 |   "num_rows": 100,
30 |   "row_groups": [
31 |     {
32 |       "columns": [
33 |         {
34 |           "file_offset": 0,
35 |           "meta_data": {
36 |             "type": "INT32",
37 |             "encodings": [
38 |               "PLAIN",
39 |               "RLE",
40 |               "RLE_DICTIONARY"
41 |             ],
42 |             "path_in_schema": [
43 |               "int_list",
44 |               "list",
45 |               "element"
46 |             ],
47 |             "codec": "SNAPPY",
48 |             "num_values": 2000,
49 |             "total_uncompressed_size": 2692,
50 |             "total_compressed_size": 2338,
51 |             "data_page_offset": 426,
52 |             "dictionary_page_offset": 4,
53 |             "statistics": {
54 |               "max": 99,
55 |               "min": 0,
56 |               "null_count": 0,
57 |               "max_value": 99,
58 |               "min_value": 0
59 |             },
60 |             "encoding_stats": [
61 |               {
62 |                 "page_type": "DICTIONARY_PAGE",
63 |                 "encoding": "PLAIN",
64 |                 "count": 1
65 |               },
66 |               {
67 |                 "page_type": "DATA_PAGE",
68 |                 "encoding": "RLE_DICTIONARY",
69 |                 "count": 2
70 |               }
71 |             ]
72 |           }
73 |         }
74 |       ],
75 |       "total_byte_size": 2692,
76 |       "num_rows": 100,
77 |       "file_offset": 4,
78 |       "total_compressed_size": 2338,
79 |       "ordinal": 0
80 |     }
81 |   ],
82 |   "key_value_metadata": [
83 |     {
84 |       "key": "ARROW:schema",
85 |       "value": "/////7AAAAAQAAAAAAAKAAwABgAFAAgACgAAAAABBAAMAAAACAAIAAAABAAIAAAABAAAAAEAAAAEAAAAzP///wAAAQwUAAAAJAAAAAQAAAABAAAALAAAAAgAAABpbnRfbGlzdAAAAAAEAAQABAAAABAAFAAIAAYABwAMAAAAEAAQAAAAAAABAhAAAAAgAAAABAAAAAAAAAAEAAAAaXRlbQAAAAAIAAwACAAHAAgAAAAAAAABIAAAAA=="
86 |     }
87 |   ],
88 |   "created_by": "parquet-cpp-arrow version 19.0.1",
89 |   "metadata_length": 488
90 | }
91 | 


--------------------------------------------------------------------------------
/test/files/repeated_no_annotation.metadata.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "version": 1,
  3 |   "schema": [
  4 |     {
  5 |       "name": "user",
  6 |       "num_children": 2
  7 |     },
  8 |     {
  9 |       "type": "INT32",
 10 |       "repetition_type": "REQUIRED",
 11 |       "name": "id"
 12 |     },
 13 |     {
 14 |       "repetition_type": "OPTIONAL",
 15 |       "name": "phoneNumbers",
 16 |       "num_children": 1
 17 |     },
 18 |     {
 19 |       "repetition_type": "REPEATED",
 20 |       "name": "phone",
 21 |       "num_children": 2
 22 |     },
 23 |     {
 24 |       "type": "INT64",
 25 |       "repetition_type": "REQUIRED",
 26 |       "name": "number"
 27 |     },
 28 |     {
 29 |       "type": "BYTE_ARRAY",
 30 |       "repetition_type": "OPTIONAL",
 31 |       "name": "kind",
 32 |       "converted_type": "UTF8"
 33 |     }
 34 |   ],
 35 |   "num_rows": 0,
 36 |   "row_groups": [
 37 |     {
 38 |       "columns": [
 39 |         {
 40 |           "file_offset": 64,
 41 |           "meta_data": {
 42 |             "type": "INT32",
 43 |             "encodings": [
 44 |               "PLAIN",
 45 |               "RLE_DICTIONARY"
 46 |             ],
 47 |             "path_in_schema": [
 48 |               "id"
 49 |             ],
 50 |             "codec": "UNCOMPRESSED",
 51 |             "num_values": 6,
 52 |             "total_uncompressed_size": 60,
 53 |             "total_compressed_size": 60,
 54 |             "data_page_offset": 42,
 55 |             "dictionary_page_offset": 4
 56 |           }
 57 |         },
 58 |         {
 59 |           "file_offset": 173,
 60 |           "meta_data": {
 61 |             "type": "INT64",
 62 |             "encodings": [
 63 |               "PLAIN",
 64 |               "RLE_DICTIONARY"
 65 |             ],
 66 |             "path_in_schema": [
 67 |               "phoneNumbers",
 68 |               "phone",
 69 |               "number"
 70 |             ],
 71 |             "codec": "UNCOMPRESSED",
 72 |             "num_values": 8,
 73 |             "total_uncompressed_size": 80,
 74 |             "total_compressed_size": 80,
 75 |             "data_page_offset": 139,
 76 |             "dictionary_page_offset": 93
 77 |           }
 78 |         },
 79 |         {
 80 |           "file_offset": 294,
 81 |           "meta_data": {
 82 |             "type": "BYTE_ARRAY",
 83 |             "encodings": [
 84 |               "PLAIN",
 85 |               "RLE_DICTIONARY"
 86 |             ],
 87 |             "path_in_schema": [
 88 |               "phoneNumbers",
 89 |               "phone",
 90 |               "kind"
 91 |             ],
 92 |             "codec": "UNCOMPRESSED",
 93 |             "num_values": 8,
 94 |             "total_uncompressed_size": 65,
 95 |             "total_compressed_size": 65,
 96 |             "data_page_offset": 261,
 97 |             "dictionary_page_offset": 229
 98 |           }
 99 |         }
100 |       ],
101 |       "total_byte_size": 205,
102 |       "num_rows": 6
103 |     }
104 |   ],
105 |   "created_by": "parquet-rs version 0.3.0 (build b45ce7cba2199f22d93269c150d8a83916c69b5e)",
106 |   "metadata_length": 306
107 | }
108 | 


--------------------------------------------------------------------------------
/src/index.js:
--------------------------------------------------------------------------------
 1 | export { readColumnIndex, readOffsetIndex } from './indexes.js'
 2 | export { parquetMetadata, parquetMetadataAsync, parquetSchema } from './metadata.js'
 3 | export { parquetRead, parquetReadObjects } from './read.js'
 4 | export { parquetQuery } from './query.js'
 5 | export { snappyUncompress } from './snappy.js'
 6 | export { asyncBufferFromUrl, byteLengthFromUrl, cachedAsyncBuffer, flatten, toJson } from './utils.js'
 7 | 
 8 | /**
 9 |  * Explicitly export types for use in downstream typescript projects through
10 |  * `import { ParquetReadOptions } from 'hyparquet'` for example.
11 |  *
12 |  * @template {any} T
13 |  * @typedef {import('../src/types.d.ts').Awaitable<T>} Awaitable<T>
14 |  */
15 | /**
16 |  * @typedef {import('../src/types.d.ts').AsyncBuffer} AsyncBuffer
17 |  * @typedef {import('../src/types.d.ts').AsyncRowGroup} AsyncRowGroup
18 |  * @typedef {import('../src/types.d.ts').DataReader} DataReader
19 |  * @typedef {import('../src/types.d.ts').FileMetaData} FileMetaData
20 |  * @typedef {import('../src/types.d.ts').SchemaTree} SchemaTree
21 |  * @typedef {import('../src/types.d.ts').SchemaElement} SchemaElement
22 |  * @typedef {import('../src/types.d.ts').ParquetType} ParquetType
23 |  * @typedef {import('../src/types.d.ts').FieldRepetitionType} FieldRepetitionType
24 |  * @typedef {import('../src/types.d.ts').ConvertedType} ConvertedType
25 |  * @typedef {import('../src/types.d.ts').TimeUnit} TimeUnit
26 |  * @typedef {import('../src/types.d.ts').LogicalType} LogicalType
27 |  * @typedef {import('../src/types.d.ts').RowGroup} RowGroup
28 |  * @typedef {import('../src/types.d.ts').ColumnChunk} ColumnChunk
29 |  * @typedef {import('../src/types.d.ts').ColumnMetaData} ColumnMetaData
30 |  * @typedef {import('../src/types.d.ts').Encoding} Encoding
31 |  * @typedef {import('../src/types.d.ts').CompressionCodec} CompressionCodec
32 |  * @typedef {import('../src/types.d.ts').Compressors} Compressors
33 |  * @typedef {import('../src/types.d.ts').KeyValue} KeyValue
34 |  * @typedef {import('../src/types.d.ts').Statistics} Statistics
35 |  * @typedef {import('../src/types.d.ts').GeospatialStatistics} GeospatialStatistics
36 |  * @typedef {import('../src/types.d.ts').BoundingBox} BoundingBox
37 |  * @typedef {import('../src/types.d.ts').PageType} PageType
38 |  * @typedef {import('../src/types.d.ts').PageHeader} PageHeader
39 |  * @typedef {import('../src/types.d.ts').DataPageHeader} DataPageHeader
40 |  * @typedef {import('../src/types.d.ts').DictionaryPageHeader} DictionaryPageHeader
41 |  * @typedef {import('../src/types.d.ts').DecodedArray} DecodedArray
42 |  * @typedef {import('../src/types.d.ts').OffsetIndex} OffsetIndex
43 |  * @typedef {import('../src/types.d.ts').ColumnIndex} ColumnIndex
44 |  * @typedef {import('../src/types.d.ts').BoundaryOrder} BoundaryOrder
45 |  * @typedef {import('../src/types.d.ts').ColumnData} ColumnData
46 |  * @typedef {import('../src/types.d.ts').SubColumnData} SubColumnData
47 |  * @typedef {import('../src/types.d.ts').ParquetReadOptions} ParquetReadOptions
48 |  * @typedef {import('../src/types.d.ts').MetadataOptions} MetadataOptions
49 |  * @typedef {import('../src/types.d.ts').ParquetParsers} ParquetParsers
50 |  * @typedef {import('../src/types.d.ts').ParquetQueryFilter} ParquetQueryFilter
51 |  */
52 | 


--------------------------------------------------------------------------------
/test/files/byte_stream_split.zstd.metadata.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "version": 2,
  3 |   "schema": [
  4 |     {
  5 |       "repetition_type": "REQUIRED",
  6 |       "name": "schema",
  7 |       "num_children": 2
  8 |     },
  9 |     {
 10 |       "type": "FLOAT",
 11 |       "repetition_type": "OPTIONAL",
 12 |       "name": "f32"
 13 |     },
 14 |     {
 15 |       "type": "DOUBLE",
 16 |       "repetition_type": "OPTIONAL",
 17 |       "name": "f64"
 18 |     }
 19 |   ],
 20 |   "num_rows": 300,
 21 |   "row_groups": [
 22 |     {
 23 |       "columns": [
 24 |         {
 25 |           "file_offset": 1162,
 26 |           "meta_data": {
 27 |             "type": "FLOAT",
 28 |             "encodings": [
 29 |               "RLE",
 30 |               "BYTE_STREAM_SPLIT"
 31 |             ],
 32 |             "path_in_schema": [
 33 |               "f32"
 34 |             ],
 35 |             "codec": "ZSTD",
 36 |             "num_values": 300,
 37 |             "total_uncompressed_size": 1255,
 38 |             "total_compressed_size": 1158,
 39 |             "data_page_offset": 4,
 40 |             "statistics": {
 41 |               "max": 2.3831448554992676,
 42 |               "min": -2.772592782974243,
 43 |               "null_count": 0,
 44 |               "max_value": 2.3831448554992676,
 45 |               "min_value": -2.772592782974243
 46 |             },
 47 |             "encoding_stats": [
 48 |               {
 49 |                 "page_type": "DATA_PAGE",
 50 |                 "encoding": "BYTE_STREAM_SPLIT",
 51 |                 "count": 1
 52 |               }
 53 |             ]
 54 |           }
 55 |         },
 56 |         {
 57 |           "file_offset": 3513,
 58 |           "meta_data": {
 59 |             "type": "DOUBLE",
 60 |             "encodings": [
 61 |               "RLE",
 62 |               "BYTE_STREAM_SPLIT"
 63 |             ],
 64 |             "path_in_schema": [
 65 |               "f64"
 66 |             ],
 67 |             "codec": "ZSTD",
 68 |             "num_values": 300,
 69 |             "total_uncompressed_size": 2471,
 70 |             "total_compressed_size": 2283,
 71 |             "data_page_offset": 1230,
 72 |             "statistics": {
 73 |               "max": 2.6962240525635797,
 74 |               "min": -3.0461430547999266,
 75 |               "null_count": 0,
 76 |               "max_value": 2.6962240525635797,
 77 |               "min_value": -3.0461430547999266
 78 |             },
 79 |             "encoding_stats": [
 80 |               {
 81 |                 "page_type": "DATA_PAGE",
 82 |                 "encoding": "BYTE_STREAM_SPLIT",
 83 |                 "count": 1
 84 |               }
 85 |             ]
 86 |           }
 87 |         }
 88 |       ],
 89 |       "total_byte_size": 3726,
 90 |       "num_rows": 300,
 91 |       "file_offset": 4,
 92 |       "total_compressed_size": 3441,
 93 |       "ordinal": 0
 94 |     }
 95 |   ],
 96 |   "key_value_metadata": [
 97 |     {
 98 |       "key": "ARROW:schema",
 99 |       "value": "/////6AAAAAQAAAAAAAKAAwABgAFAAgACgAAAAABBAAMAAAACAAIAAAABAAIAAAABAAAAAIAAABAAAAABAAAANj///8AAAEDEAAAABQAAAAEAAAAAAAAAAMAAABmNjQAxv///wAAAgAQABQACAAGAAcADAAAABAAEAAAAAAAAQMQAAAAHAAAAAQAAAAAAAAAAwAAAGYzMgAAAAYACAAGAAYAAAAAAAEA"
100 |     }
101 |   ],
102 |   "created_by": "parquet-cpp-arrow version 14.0.2",
103 |   "metadata_length": 498
104 | }
105 | 


--------------------------------------------------------------------------------
/test/indexes.test.js:
--------------------------------------------------------------------------------
 1 | import fs from 'fs'
 2 | import { describe, expect, it } from 'vitest'
 3 | import { parquetMetadata, toJson } from '../src/index.js'
 4 | import { readColumnIndex, readOffsetIndex } from '../src/indexes.js'
 5 | import { asyncBufferFromFile } from '../src/node.js'
 6 | import { getSchemaPath } from '../src/schema.js'
 7 | import { fileToJson } from './helpers.js'
 8 | 
 9 | describe('readColumnIndex', () => {
10 |   const columnIndexesFiles = fs.readdirSync('test/files').filter(f => f.endsWith('.column_indexes.json'))
11 |   const parquetFiles = columnIndexesFiles.map(f => f.replace(/.column_indexes.json$/i, '.parquet'))
12 | 
13 |   parquetFiles.forEach((file, i) => {
14 |     it(`parse column indexes from ${file}`, async () => {
15 |       const arrayBuffer = await readFileToArrayBuffer(`test/files/${file}`)
16 |       const metadata = parquetMetadata(arrayBuffer)
17 | 
18 |       const result = metadata.row_groups.map((rowGroup) => rowGroup.columns.map((column) => {
19 |         if (column.column_index_offset === undefined || column.column_index_length === undefined) return null
20 |         const columnIndexOffset = Number(column.column_index_offset)
21 |         const columnIndexLength = Number(column.column_index_length)
22 |         const columnIndexArrayBuffer = arrayBuffer.slice(columnIndexOffset, columnIndexOffset + columnIndexLength)
23 |         const columnIndexReader = { view: new DataView(columnIndexArrayBuffer), offset: 0 }
24 |         const schemaPath = getSchemaPath(metadata.schema, column.meta_data?.path_in_schema ?? [])
25 |         return readColumnIndex(columnIndexReader, schemaPath.at(-1)?.element || { name: '' })
26 |       }))
27 |       const expected = fileToJson(`test/files/${columnIndexesFiles[i]}`)
28 |       expect(toJson(result)).toEqual(expected)
29 |     })
30 |   })
31 | })
32 | 
33 | describe('readOffsetIndex', () => {
34 |   const offsetIndexesFiles = fs.readdirSync('test/files').filter(f => f.endsWith('.offset_indexes.json'))
35 |   const parquetFiles = offsetIndexesFiles.map(f => f.replace(/.offset_indexes.json$/i, '.parquet'))
36 | 
37 |   parquetFiles.forEach((file, i) => {
38 |     it(`parse offset indexes from ${file}`, async () => {
39 |       const arrayBuffer = await readFileToArrayBuffer(`test/files/${file}`)
40 |       const metadata = parquetMetadata(arrayBuffer)
41 | 
42 |       const result = metadata.row_groups.map((rowGroup) => rowGroup.columns.map((column) => {
43 |         if (column.offset_index_offset === undefined || column.offset_index_length === undefined) return null
44 |         const offsetIndexOffset = Number(column.offset_index_offset)
45 |         const offsetIndexLength = Number(column.offset_index_length)
46 |         const offsetIndexArrayBuffer = arrayBuffer.slice(offsetIndexOffset, offsetIndexOffset + offsetIndexLength)
47 |         const offsetIndexReader = { view: new DataView(offsetIndexArrayBuffer), offset: 0 }
48 |         return readOffsetIndex(offsetIndexReader)
49 |       }))
50 |       const expected = fileToJson(`test/files/${offsetIndexesFiles[i]}`)
51 |       expect(toJson(result)).toEqual(expected)
52 |     })
53 |   })
54 | })
55 | 
56 | /**
57 |  * @param {string} filename
58 |  * @returns {Promise<ArrayBuffer>}
59 |  */
60 | function readFileToArrayBuffer(filename) {
61 |   return asyncBufferFromFile(filename).then((buffer) => buffer.slice(0))
62 | }
63 | 


--------------------------------------------------------------------------------
/test/snappy.test.js:
--------------------------------------------------------------------------------
 1 | import fs from 'fs'
 2 | import { describe, expect, it } from 'vitest'
 3 | import { snappyUncompress } from '../src/snappy.js'
 4 | 
 5 | describe('snappy uncompress', () => {
 6 |   it('decompresses valid input correctly', () => {
 7 |     const testCases = [
 8 |       { compressed: [0x00], expected: '' },
 9 |       { compressed: [0x01, 0x00, 0x68], expected: 'h' },
10 |       { compressed: [0x02, 0x04, 0x68, 0x79], expected: 'hy' },
11 |       { compressed: [0x03, 0x08, 0x68, 0x79, 0x70], expected: 'hyp' },
12 |       { compressed: [0x05, 0x10, 0x68, 0x79, 0x70, 0x65, 0x72], expected: 'hyper' },
13 |       {
14 |         compressed: [0x0a, 0x24, 0x68, 0x79, 0x70, 0x65, 0x72, 0x70, 0x61, 0x72, 0x61, 0x6d],
15 |         expected: 'hyperparam',
16 |       },
17 |       {
18 |         compressed: [0x15, 0x08, 0x68, 0x79, 0x70, 0x46, 0x03, 0x00],
19 |         expected: 'hyphyphyphyphyphyphyp',
20 |       },
21 |       {
22 |         // from rowgroups.parquet
23 |         compressed: [
24 |           80, 4, 1, 0, 9, 1, 0, 2, 9, 7, 4, 0, 3, 13, 8, 0, 4, 13, 8, 0, 5, 13,
25 |           8, 0, 6, 13, 8, 0, 7, 13, 8, 0, 8, 13, 8, 60, 9, 0, 0, 0, 0, 0, 0, 0,
26 |           10, 0, 0, 0, 0, 0, 0, 0,
27 |         ],
28 |         expected: new Uint8Array([
29 |           1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0,
30 |           0, 4, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0,
31 |           0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0,
32 |           0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0,
33 |         ]),
34 |       },
35 |       // from datapage_v2.snappy.parquet
36 |       { compressed: [2, 4, 0, 3], expected: new Uint8Array([0, 3]) },
37 |       { compressed: [ 6, 20, 2, 0, 0, 0, 3, 23], expected: new Uint8Array([2, 0, 0, 0, 3, 23]) },
38 |     ]
39 | 
40 |     for (const { compressed, expected } of testCases) {
41 |       const output = new Uint8Array(expected.length)
42 |       snappyUncompress(new Uint8Array(compressed), output)
43 |       if (typeof expected === 'string') {
44 |         const outputStr = new TextDecoder().decode(output)
45 |         expect(outputStr).toBe(expected)
46 |       } else {
47 |         expect(output).toEqual(expected) // Uint8Array
48 |       }
49 |     }
50 |   })
51 | 
52 |   it('decompress hyparquet.jpg.snappy', async () => {
53 |     const compressed = fs.readFileSync('test/files/hyparquet.jpg.snappy')
54 |     const expected = fs.readFileSync('hyparquet.jpg')
55 |     const output = new Uint8Array(expected.length)
56 |     await snappyUncompress(compressed, output)
57 |     expect(Array.from(output)).toEqual(Array.from(expected))
58 |   })
59 | 
60 |   it('throws for invalid input', () => { // each malformed input must be rejected with a specific error message
61 |     const output = new Uint8Array(10) // shared 10-byte output buffer for all cases below
62 |     expect(() => snappyUncompress(new Uint8Array([]), output)) // empty input
63 |       .toThrow('invalid snappy length header')
64 |     expect(() => snappyUncompress(new Uint8Array([0xff]), output)) // single byte with its high bit set — presumably an unterminated varint length
65 |       .toThrow('invalid snappy length header')
66 |     expect(() => snappyUncompress(new Uint8Array([0x03, 0x61]), output)) // two bytes: presumably a length of 3 followed by one tag byte — TODO confirm against src/snappy.js
67 |       .toThrow('missing eof marker')
68 |     expect(() => snappyUncompress(new Uint8Array([0x03, 0xf1]), output)) // same shape with a different second byte; same expected error
69 |       .toThrow('missing eof marker')
70 |     expect(() => snappyUncompress(new Uint8Array([0x02, 0x00, 0x68]), output)) // input stops before the decoder is done, per the expected message
71 |       .toThrow('premature end of input')
72 |   })
73 | })
74 | 


--------------------------------------------------------------------------------
/test/asyncbuffer.test.js:
--------------------------------------------------------------------------------
 1 | import { describe, expect, it, vi } from 'vitest'
 2 | import { cachedAsyncBuffer } from '../src/utils.js'
 3 | 
 4 | describe('cachedAsyncBuffer', () => { // exercises the caching wrapper around a mocked 1000-byte async buffer
 5 |   it('caches slices of a file to avoid multiple reads', async () => {
 6 |     const slice = vi.fn(async (start, end) => {
 7 |       // Simulate an async slice operation on a 1000-byte source (10 ms delay per read)
 8 |       await new Promise(resolve => setTimeout(resolve, 10))
 9 |       if (end === undefined) end = 1000 // open-ended slice reads to the end of the source
10 |       if (start < 0) start = Math.max(0, 1000 + start) // negative start counts back from the end
11 |       const buffer = new ArrayBuffer(end - start)
12 |       return buffer
13 |     })
14 |     const cachedFile = cachedAsyncBuffer(
15 |       { byteLength: 1000, slice },
16 |       { minSize: 0 } // NOTE(review): presumably turns off the whole-file caching covered by the second test — confirm in src/utils.js
17 |     )
18 | 
19 |     // Test cache miss: the first request reaches the underlying slice
20 |     const slice1 = await cachedFile.slice(0, 100)
21 |     expect(slice).toHaveBeenCalledTimes(1)
22 |     expect(slice1.byteLength).toBe(100)
23 | 
24 |     // Test cache hit for the same range
25 |     const slice2 = await cachedFile.slice(0, 100)
26 |     expect(slice).toHaveBeenCalledTimes(1) // No additional call
27 |     expect(slice2).toBe(slice1) // Exact same object from cache
28 | 
29 |     // Test cache with undefined end, should use byteLength as end (a new range, so a second read)
30 |     const slice3 = await cachedFile.slice(900)
31 |     expect(slice).toHaveBeenCalledTimes(2)
32 |     expect(slice3.byteLength).toBe(100)
33 | 
34 |     // Test cache hit for suffix-range: slice(-100) must resolve to the entry created by slice(900)
35 |     const slice4 = await cachedFile.slice(-100)
36 |     expect(slice).toHaveBeenCalledTimes(2)
37 |     expect(slice4).toBe(slice3)
38 | 
39 |     // Verify that an explicit end equal to byteLength hits the same entry as the open-ended request
40 |     const slice5 = await cachedFile.slice(900, 1000)
41 |     expect(slice).toHaveBeenCalledTimes(2)
42 |     expect(slice5).toBe(slice3)
43 |   })
44 | 
45 |   it('caches whole file if it is smaller than minSize', async () => {
46 |     const slice = vi.fn(async (start, end) => {
47 |       // Simulate an async slice operation on a 1000-byte source (10 ms delay per read)
48 |       await new Promise(resolve => setTimeout(resolve, 10))
49 |       if (end === undefined) end = 1000 // open-ended slice reads to the end of the source
50 |       if (start < 0) start = Math.max(0, 1000 + start) // negative start counts back from the end
51 |       const buffer = new ArrayBuffer(end - start)
52 |       return buffer
53 |     })
54 |     const cachedFile = cachedAsyncBuffer({ byteLength: 1000, slice }) // no options passed, so the default minSize applies
55 | 
56 |     // Test cache miss: the only call to the underlying slice in this test
57 |     const slice1 = await cachedFile.slice(0, 100)
58 |     expect(slice).toHaveBeenCalledTimes(1)
59 |     expect(slice1.byteLength).toBe(100)
60 | 
61 |     // Test cache hit for the same range
62 |     const slice2 = await cachedFile.slice(0, 100)
63 |     expect(slice).toHaveBeenCalledTimes(1) // No additional call
64 |     expect(slice2).toEqual(slice1) // Same data
65 |     expect(slice2).not.toBe(slice1) // Different object
66 | 
67 |     // Test cache with undefined end, should use byteLength as end (a different range, still no new read)
68 |     const slice3 = await cachedFile.slice(900)
69 |     expect(slice).toHaveBeenCalledTimes(1)
70 |     expect(slice3.byteLength).toBe(100)
71 | 
72 |     // Test cache hit for suffix-range: equal contents, but a distinct buffer object each time
73 |     const slice4 = await cachedFile.slice(-100)
74 |     expect(slice).toHaveBeenCalledTimes(1)
75 |     expect(slice4).toEqual(slice3)
76 |     expect(slice4).not.toBe(slice3)
77 | 
78 |     // Verify that an explicit end equal to byteLength is also served without a new read
79 |     const slice5 = await cachedFile.slice(900, 1000)
80 |     expect(slice).toHaveBeenCalledTimes(1)
81 |     expect(slice5).toEqual(slice3)
82 |     expect(slice5).not.toBe(slice3)
83 |   })
84 | })
85 | 


--------------------------------------------------------------------------------
/test/files/duckdb4442.offset_indexes.json:
--------------------------------------------------------------------------------
  1 | [
  2 |   [
  3 |     {
  4 |       "page_locations": [
  5 |         {
  6 |           "offset": 4,
  7 |           "compressed_page_size": 34,
  8 |           "first_row_index": 0
  9 |         }
 10 |       ]
 11 |     },
 12 |     {
 13 |       "page_locations": [
 14 |         {
 15 |           "offset": 73,
 16 |           "compressed_page_size": 34,
 17 |           "first_row_index": 0
 18 |         }
 19 |       ]
 20 |     },
 21 |     {
 22 |       "page_locations": [
 23 |         {
 24 |           "offset": 142,
 25 |           "compressed_page_size": 34,
 26 |           "first_row_index": 0
 27 |         }
 28 |       ]
 29 |     },
 30 |     {
 31 |       "page_locations": [
 32 |         {
 33 |           "offset": 207,
 34 |           "compressed_page_size": 34,
 35 |           "first_row_index": 0
 36 |         }
 37 |       ]
 38 |     },
 39 |     {
 40 |       "page_locations": [
 41 |         {
 42 |           "offset": 271,
 43 |           "compressed_page_size": 34,
 44 |           "first_row_index": 0
 45 |         }
 46 |       ]
 47 |     },
 48 |     {
 49 |       "page_locations": [
 50 |         {
 51 |           "offset": 335,
 52 |           "compressed_page_size": 38,
 53 |           "first_row_index": 0
 54 |         }
 55 |       ]
 56 |     },
 57 |     {
 58 |       "page_locations": [
 59 |         {
 60 |           "offset": 403,
 61 |           "compressed_page_size": 34,
 62 |           "first_row_index": 0
 63 |         }
 64 |       ]
 65 |     },
 66 |     {
 67 |       "page_locations": [
 68 |         {
 69 |           "offset": 466,
 70 |           "compressed_page_size": 32,
 71 |           "first_row_index": 0
 72 |         }
 73 |       ]
 74 |     },
 75 |     {
 76 |       "page_locations": [
 77 |         {
 78 |           "offset": 525,
 79 |           "compressed_page_size": 34,
 80 |           "first_row_index": 0
 81 |         }
 82 |       ]
 83 |     },
 84 |     {
 85 |       "page_locations": [
 86 |         {
 87 |           "offset": 586,
 88 |           "compressed_page_size": 34,
 89 |           "first_row_index": 0
 90 |         }
 91 |       ]
 92 |     },
 93 |     {
 94 |       "page_locations": [
 95 |         {
 96 |           "offset": 659,
 97 |           "compressed_page_size": 34,
 98 |           "first_row_index": 0
 99 |         }
100 |       ]
101 |     },
102 |     {
103 |       "page_locations": [
104 |         {
105 |           "offset": 731,
106 |           "compressed_page_size": 34,
107 |           "first_row_index": 0
108 |         }
109 |       ]
110 |     },
111 |     {
112 |       "page_locations": [
113 |         {
114 |           "offset": 802,
115 |           "compressed_page_size": 34,
116 |           "first_row_index": 0
117 |         }
118 |       ]
119 |     },
120 |     {
121 |       "page_locations": [
122 |         {
123 |           "offset": 870,
124 |           "compressed_page_size": 34,
125 |           "first_row_index": 0
126 |         }
127 |       ]
128 |     },
129 |     {
130 |       "page_locations": [
131 |         {
132 |           "offset": 938,
133 |           "compressed_page_size": 34,
134 |           "first_row_index": 0
135 |         }
136 |       ]
137 |     },
138 |     {
139 |       "page_locations": [
140 |         {
141 |           "offset": 1009,
142 |           "compressed_page_size": 34,
143 |           "first_row_index": 0
144 |         }
145 |       ]
146 |     },
147 |     {
148 |       "page_locations": [
149 |         {
150 |           "offset": 1079,
151 |           "compressed_page_size": 30,
152 |           "first_row_index": 0
153 |         }
154 |       ]
155 |     }
156 |   ]
157 | ]


--------------------------------------------------------------------------------
/test/metadata.test.js:
--------------------------------------------------------------------------------
 1 | import fs from 'fs'
 2 | import { describe, expect, it } from 'vitest'
 3 | import { parquetMetadata, parquetMetadataAsync, toJson } from '../src/index.js'
 4 | import { asyncBufferFromFile } from '../src/node.js'
 5 | import { fileToJson } from './helpers.js'
 6 | 
 7 | const files = fs.readdirSync('test/files').filter(f => f.endsWith('.parquet')) // every .parquet fixture name; each suite in this file generates one test per entry
 8 | 
 9 | describe('parquetMetadata', () => {
10 |   files.forEach(file => {
11 |     it(`parse metadata from ${file}`, async () => {
12 |       const asyncBuffer = await asyncBufferFromFile(`test/files/${file}`)
13 |       const arrayBuffer = await asyncBuffer.slice(0)
14 |       const result = toJson(parquetMetadata(arrayBuffer))
15 |       const base = file.replace('.parquet', '')
16 |       const expected = fileToJson(`test/files/${base}.metadata.json`)
17 |       expect(result, JSON.stringify(result, null, 2)).toEqual(expected)
18 |     })
19 |   })
20 | 
21 |   it('throws for arrayBuffer undefined', () => {
22 |     // @ts-expect-error testing invalid input
23 |     expect(() => parquetMetadata(undefined)).toThrow('parquet expected ArrayBuffer')
24 |   })
25 | 
26 |   it('throws for a too short file', () => {
27 |     const arrayBuffer = new ArrayBuffer(0)
28 |     expect(() => parquetMetadata(arrayBuffer)).toThrow('parquet file is too short')
29 |   })
30 | 
31 |   it('throws for invalid metadata length', () => {
32 |     const arrayBuffer = new ArrayBuffer(12)
33 |     const view = new DataView(arrayBuffer)
34 |     view.setUint32(0, 0x31524150, true) // magic number PAR1
35 |     view.setUint32(4, 1000, true) // 1000 bytes exceeds buffer
36 |     view.setUint32(8, 0x31524150, true) // magic number PAR1
37 |     expect(() => parquetMetadata(arrayBuffer))
38 |       .toThrow('parquet metadata length 1000 exceeds available buffer 4')
39 |   })
40 | 
41 |   it('throws for invalid magic number', () => {
42 |     const arrayBuffer = new ArrayBuffer(8)
43 |     expect(() => parquetMetadata(arrayBuffer))
44 |       .toThrow('parquet file invalid (footer != PAR1)')
45 |   })
46 | 
47 |   it('throws for invalid metadata length', () => {
48 |     const { buffer } = new Uint8Array([255, 255, 255, 255, 80, 65, 82, 49])
49 |     expect(() => parquetMetadata(buffer))
50 |       .toThrow('parquet metadata length 4294967295 exceeds available buffer 0')
51 |   })
52 | })
53 | 
54 | describe('parquetMetadataAsync', () => { // same fixtures as the sync suite, read through the async buffer returned by asyncBufferFromFile
55 |   files.forEach(file => {
56 |     it(`parse metadata async from ${file}`, async () => {
57 |       const asyncBuffer = await asyncBufferFromFile(`test/files/${file}`)
58 |       const result = await parquetMetadataAsync(asyncBuffer)
59 |       const base = file.replace('.parquet', '')
60 |       const expected = fileToJson(`test/files/${base}.metadata.json`) // expected output is stored next to the fixture
61 |       expect(toJson(result)).toEqual(expected)
62 |     })
63 |   })
64 | 
65 |   it('throws for asyncBuffer undefined', async () => {
66 |     const arrayBuffer = undefined
67 |     // @ts-expect-error testing invalid input
68 |     await expect(parquetMetadataAsync(arrayBuffer)).rejects
69 |       .toThrow('parquet expected AsyncBuffer')
70 |   })
71 | 
72 |   it('throws for invalid magic number', async () => {
73 |     const { buffer } = new Uint8Array([255, 255, 255, 255, 255, 255, 255, 255]) // eight 0xff bytes, so the trailing four are not 'PAR1'
74 |     await expect(parquetMetadataAsync(buffer)).rejects // a plain ArrayBuffer is passed here; it exposes byteLength and slice
75 |       .toThrow('parquet file invalid (footer != PAR1)')
76 |   })
77 | 
78 |   it('throws for invalid metadata length', async () => {
79 |     const { buffer } = new Uint8Array([255, 255, 255, 255, 80, 65, 82, 49]) // length bytes 0xFFFFFFFF followed by ASCII 'PAR1'
80 |     await expect(parquetMetadataAsync(buffer)).rejects
81 |       .toThrow('parquet metadata length 4294967295 exceeds available buffer 0')
82 |   })
83 | })
84 | 


--------------------------------------------------------------------------------
/test/files/delta_encoding_required_column.offset_indexes.json:
--------------------------------------------------------------------------------
  1 | [
  2 |   [
  3 |     {
  4 |       "page_locations": [
  5 |         {
  6 |           "offset": 4,
  7 |           "compressed_page_size": 50,
  8 |           "first_row_index": 0
  9 |         }
 10 |       ]
 11 |     },
 12 |     {
 13 |       "page_locations": [
 14 |         {
 15 |           "offset": 54,
 16 |           "compressed_page_size": 388,
 17 |           "first_row_index": 0
 18 |         }
 19 |       ]
 20 |     },
 21 |     {
 22 |       "page_locations": [
 23 |         {
 24 |           "offset": 442,
 25 |           "compressed_page_size": 261,
 26 |           "first_row_index": 0
 27 |         }
 28 |       ]
 29 |     },
 30 |     {
 31 |       "page_locations": [
 32 |         {
 33 |           "offset": 703,
 34 |           "compressed_page_size": 307,
 35 |           "first_row_index": 0
 36 |         }
 37 |       ]
 38 |     },
 39 |     {
 40 |       "page_locations": [
 41 |         {
 42 |           "offset": 1010,
 43 |           "compressed_page_size": 247,
 44 |           "first_row_index": 0
 45 |         }
 46 |       ]
 47 |     },
 48 |     {
 49 |       "page_locations": [
 50 |         {
 51 |           "offset": 1257,
 52 |           "compressed_page_size": 247,
 53 |           "first_row_index": 0
 54 |         }
 55 |       ]
 56 |     },
 57 |     {
 58 |       "page_locations": [
 59 |         {
 60 |           "offset": 1504,
 61 |           "compressed_page_size": 131,
 62 |           "first_row_index": 0
 63 |         }
 64 |       ]
 65 |     },
 66 |     {
 67 |       "page_locations": [
 68 |         {
 69 |           "offset": 1635,
 70 |           "compressed_page_size": 115,
 71 |           "first_row_index": 0
 72 |         }
 73 |       ]
 74 |     },
 75 |     {
 76 |       "page_locations": [
 77 |         {
 78 |           "offset": 1750,
 79 |           "compressed_page_size": 144,
 80 |           "first_row_index": 0
 81 |         }
 82 |       ]
 83 |     },
 84 |     {
 85 |       "page_locations": [
 86 |         {
 87 |           "offset": 1894,
 88 |           "compressed_page_size": 933,
 89 |           "first_row_index": 0
 90 |         }
 91 |       ]
 92 |     },
 93 |     {
 94 |       "page_locations": [
 95 |         {
 96 |           "offset": 2827,
 97 |           "compressed_page_size": 378,
 98 |           "first_row_index": 0
 99 |         }
100 |       ]
101 |     },
102 |     {
103 |       "page_locations": [
104 |         {
105 |           "offset": 3205,
106 |           "compressed_page_size": 707,
107 |           "first_row_index": 0
108 |         }
109 |       ]
110 |     },
111 |     {
112 |       "page_locations": [
113 |         {
114 |           "offset": 3912,
115 |           "compressed_page_size": 751,
116 |           "first_row_index": 0
117 |         }
118 |       ]
119 |     },
120 |     {
121 |       "page_locations": [
122 |         {
123 |           "offset": 4663,
124 |           "compressed_page_size": 154,
125 |           "first_row_index": 0
126 |         }
127 |       ]
128 |     },
129 |     {
130 |       "page_locations": [
131 |         {
132 |           "offset": 4817,
133 |           "compressed_page_size": 1154,
134 |           "first_row_index": 0
135 |         }
136 |       ]
137 |     },
138 |     {
139 |       "page_locations": [
140 |         {
141 |           "offset": 5971,
142 |           "compressed_page_size": 2857,
143 |           "first_row_index": 0
144 |         }
145 |       ]
146 |     },
147 |     {
148 |       "page_locations": [
149 |         {
150 |           "offset": 8828,
151 |           "compressed_page_size": 405,
152 |           "first_row_index": 0
153 |         }
154 |       ]
155 |     }
156 |   ]
157 | ]


--------------------------------------------------------------------------------
/test/column.test.js:
--------------------------------------------------------------------------------
 1 | import { describe, expect, it } from 'vitest'
 2 | import { readColumn } from '../src/column.js'
 3 | import { DEFAULT_PARSERS } from '../src/convert.js'
 4 | import { parquetMetadata } from '../src/index.js'
 5 | import { asyncBufferFromFile } from '../src/node.js'
 6 | import { getSchemaPath } from '../src/schema.js'
 7 | 
 8 | const values = [null, 1, -2, NaN, 0, -1, -0, 2] // expected column contents for the float16_nonzeros_and_nans.parquet cases in this file
 9 | 
10 | describe('readColumn', () => { // decodes single column chunks sliced out of fixture files
11 |   it.for([
12 |     { selectEnd: Infinity, expected: [values] },
13 |     { selectEnd: 2, expected: [values] }, // readColumn does not truncate
14 |     { selectEnd: 0, expected: [] }, // nothing selected, so no chunks are expected
15 |   ])('readColumn with rowGroupEnd %p', async ({ selectEnd, expected }) => { // NOTE(review): title says rowGroupEnd but the varied field is selectEnd
16 |     const testFile = 'test/files/float16_nonzeros_and_nans.parquet'
17 |     const file = await asyncBufferFromFile(testFile)
18 |     const arrayBuffer = await file.slice(0) // slice from 0 with no end: the whole file
19 |     const metadata = parquetMetadata(arrayBuffer)
20 | 
21 |     const column = metadata.row_groups[0].columns[0]
22 |     if (!column.meta_data) throw new Error(`No column metadata for ${testFile}`)
23 |     const { startByte, endByte } = getChunkPlan(column.meta_data)
24 |     const columnArrayBuffer = arrayBuffer.slice(startByte, endByte) // isolate this column chunk so the reader can start at offset 0
25 |     const schemaPath = getSchemaPath(metadata.schema, column.meta_data?.path_in_schema ?? []) // the ?. is redundant after the guard above
26 |     const reader = { view: new DataView(columnArrayBuffer), offset: 0 }
27 |     const columnDecoder = {
28 |       pathInSchema: column.meta_data.path_in_schema,
29 |       type: column.meta_data.type,
30 |       element: schemaPath[schemaPath.length - 1].element, // last entry of the schema path
31 |       schemaPath,
32 |       parsers: DEFAULT_PARSERS,
33 |       codec: column.meta_data.codec,
34 |     }
35 |     const rowGroupSelect = {
36 |       groupStart: 0,
37 |       selectStart: 0,
38 |       selectEnd,
39 |       groupRows: expected.length, // NOTE(review): this is the number of expected chunks (1 or 0), not a row count — confirm this is intended
40 |     }
41 | 
42 |     const result = readColumn(reader, rowGroupSelect, columnDecoder)
43 |     expect(result).toEqual(expected)
44 |   })
45 | 
46 |   it('readColumn should return a typed array', async () => {
47 |     const testFile = 'test/files/datapage_v2.snappy.parquet'
48 |     const file = await asyncBufferFromFile(testFile)
49 |     const arrayBuffer = await file.slice(0) // slice from 0 with no end: the whole file
50 |     const metadata = parquetMetadata(arrayBuffer)
51 | 
52 |     const column = metadata.row_groups[0].columns[1] // second column
53 |     if (!column.meta_data) throw new Error(`No column metadata for ${testFile}`)
54 |     const { startByte, endByte } = getChunkPlan(column.meta_data)
55 |     const columnArrayBuffer = arrayBuffer.slice(startByte, endByte) // isolate this column chunk so the reader can start at offset 0
56 |     const schemaPath = getSchemaPath(metadata.schema, column.meta_data?.path_in_schema ?? []) // the ?. is redundant after the guard above
57 |     const reader = { view: new DataView(columnArrayBuffer), offset: 0 }
58 |     const columnDecoder = {
59 |       pathInSchema: column.meta_data.path_in_schema,
60 |       type: column.meta_data.type,
61 |       element: schemaPath[schemaPath.length - 1].element, // last entry of the schema path
62 |       schemaPath,
63 |       parsers: DEFAULT_PARSERS,
64 |       codec: column.meta_data.codec,
65 |     }
66 |     const rowGroupSelect = {
67 |       groupStart: 0,
68 |       selectStart: 0,
69 |       selectEnd: Infinity,
70 |       groupRows: Number(column.meta_data.num_values), // here groupRows comes from the column's num_values
71 |     }
72 | 
73 |     const columnData = readColumn(reader, rowGroupSelect, columnDecoder)
74 |     expect(columnData[0]).toBeInstanceOf(Int32Array) // the first returned chunk must be an Int32Array, not a plain Array
75 |   })
76 | })
77 | 
78 | /** Compute the byte range covered by a column chunk from its metadata.
79 |  * @import {ByteRange, ColumnMetaData} from '../src/types.js'
80 |  * @param {ColumnMetaData} meta column chunk metadata; reads dictionary_page_offset, data_page_offset and total_compressed_size
81 |  * @returns {ByteRange} startByte is the dictionary page offset when truthy, otherwise the data page offset; endByte adds total_compressed_size
82 |  */
83 | function getChunkPlan(meta) {
84 |   const columnOffset = meta.dictionary_page_offset || meta.data_page_offset // || (not ??): a zero dictionary offset also falls back to the data page
85 |   return {
86 |     startByte: Number(columnOffset),
87 |     endByte: Number(columnOffset + meta.total_compressed_size), // sum is taken before the Number() conversion
88 |   }
89 | }
90 | 


--------------------------------------------------------------------------------
/test/files/concatenated_gzip_members.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   [1], [2], [3], [4], [5], [6], [7], [8], [9], [10],
 3 |   [11], [12], [13], [14], [15], [16], [17], [18], [19], [20],
 4 |   [21], [22], [23], [24], [25], [26], [27], [28], [29], [30],
 5 |   [31], [32], [33], [34], [35], [36], [37], [38], [39], [40],
 6 |   [41], [42], [43], [44], [45], [46], [47], [48], [49], [50],
 7 |   [51], [52], [53], [54], [55], [56], [57], [58], [59], [60],
 8 |   [61], [62], [63], [64], [65], [66], [67], [68], [69], [70],
 9 |   [71], [72], [73], [74], [75], [76], [77], [78], [79], [80],
10 |   [81], [82], [83], [84], [85], [86], [87], [88], [89], [90],
11 |   [91], [92], [93], [94], [95], [96], [97], [98], [99], [100],
12 |   [101], [102], [103], [104], [105], [106], [107], [108], [109], [110],
13 |   [111], [112], [113], [114], [115], [116], [117], [118], [119], [120],
14 |   [121], [122], [123], [124], [125], [126], [127], [128], [129], [130],
15 |   [131], [132], [133], [134], [135], [136], [137], [138], [139], [140],
16 |   [141], [142], [143], [144], [145], [146], [147], [148], [149], [150],
17 |   [151], [152], [153], [154], [155], [156], [157], [158], [159], [160],
18 |   [161], [162], [163], [164], [165], [166], [167], [168], [169], [170],
19 |   [171], [172], [173], [174], [175], [176], [177], [178], [179], [180],
20 |   [181], [182], [183], [184], [185], [186], [187], [188], [189], [190],
21 |   [191], [192], [193], [194], [195], [196], [197], [198], [199], [200],
22 |   [201], [202], [203], [204], [205], [206], [207], [208], [209], [210],
23 |   [211], [212], [213], [214], [215], [216], [217], [218], [219], [220],
24 |   [221], [222], [223], [224], [225], [226], [227], [228], [229], [230],
25 |   [231], [232], [233], [234], [235], [236], [237], [238], [239], [240],
26 |   [241], [242], [243], [244], [245], [246], [247], [248], [249], [250],
27 |   [251], [252], [253], [254], [255], [256], [257], [258], [259], [260],
28 |   [261], [262], [263], [264], [265], [266], [267], [268], [269], [270],
29 |   [271], [272], [273], [274], [275], [276], [277], [278], [279], [280],
30 |   [281], [282], [283], [284], [285], [286], [287], [288], [289], [290],
31 |   [291], [292], [293], [294], [295], [296], [297], [298], [299], [300],
32 |   [301], [302], [303], [304], [305], [306], [307], [308], [309], [310],
33 |   [311], [312], [313], [314], [315], [316], [317], [318], [319], [320],
34 |   [321], [322], [323], [324], [325], [326], [327], [328], [329], [330],
35 |   [331], [332], [333], [334], [335], [336], [337], [338], [339], [340],
36 |   [341], [342], [343], [344], [345], [346], [347], [348], [349], [350],
37 |   [351], [352], [353], [354], [355], [356], [357], [358], [359], [360],
38 |   [361], [362], [363], [364], [365], [366], [367], [368], [369], [370],
39 |   [371], [372], [373], [374], [375], [376], [377], [378], [379], [380],
40 |   [381], [382], [383], [384], [385], [386], [387], [388], [389], [390],
41 |   [391], [392], [393], [394], [395], [396], [397], [398], [399], [400],
42 |   [401], [402], [403], [404], [405], [406], [407], [408], [409], [410],
43 |   [411], [412], [413], [414], [415], [416], [417], [418], [419], [420],
44 |   [421], [422], [423], [424], [425], [426], [427], [428], [429], [430],
45 |   [431], [432], [433], [434], [435], [436], [437], [438], [439], [440],
46 |   [441], [442], [443], [444], [445], [446], [447], [448], [449], [450],
47 |   [451], [452], [453], [454], [455], [456], [457], [458], [459], [460],
48 |   [461], [462], [463], [464], [465], [466], [467], [468], [469], [470],
49 |   [471], [472], [473], [474], [475], [476], [477], [478], [479], [480],
50 |   [481], [482], [483], [484], [485], [486], [487], [488], [489], [490],
51 |   [491], [492], [493], [494], [495], [496], [497], [498], [499], [500],
52 |   [501], [502], [503], [504], [505], [506], [507], [508], [509], [510],
53 |   [511], [512], [513]
54 | ]
55 | 


--------------------------------------------------------------------------------
/test/files/incorrect_map_schema.metadata.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "version": 1,
  3 |   "schema": [
  4 |     {
  5 |       "name": "hive_schema",
  6 |       "num_children": 1
  7 |     },
  8 |     {
  9 |       "repetition_type": "OPTIONAL",
 10 |       "name": "my_map",
 11 |       "num_children": 1,
 12 |       "converted_type": "MAP",
 13 |       "logical_type": {
 14 |         "type": "MAP"
 15 |       }
 16 |     },
 17 |     {
 18 |       "repetition_type": "REPEATED",
 19 |       "name": "key_value",
 20 |       "num_children": 2,
 21 |       "converted_type": "MAP_KEY_VALUE"
 22 |     },
 23 |     {
 24 |       "type": "BYTE_ARRAY",
 25 |       "repetition_type": "OPTIONAL",
 26 |       "name": "key",
 27 |       "converted_type": "UTF8",
 28 |       "logical_type": {
 29 |         "type": "STRING"
 30 |       }
 31 |     },
 32 |     {
 33 |       "type": "BYTE_ARRAY",
 34 |       "repetition_type": "OPTIONAL",
 35 |       "name": "value",
 36 |       "converted_type": "UTF8",
 37 |       "logical_type": {
 38 |         "type": "STRING"
 39 |       }
 40 |     }
 41 |   ],
 42 |   "num_rows": 1,
 43 |   "row_groups": [
 44 |     {
 45 |       "columns": [
 46 |         {
 47 |           "file_offset": 4,
 48 |           "meta_data": {
 49 |             "type": "BYTE_ARRAY",
 50 |             "encodings": [
 51 |               "PLAIN",
 52 |               "RLE"
 53 |             ],
 54 |             "path_in_schema": [
 55 |               "my_map",
 56 |               "key_value",
 57 |               "key"
 58 |             ],
 59 |             "codec": "GZIP",
 60 |             "num_values": 2,
 61 |             "total_uncompressed_size": 54,
 62 |             "total_compressed_size": 69,
 63 |             "data_page_offset": 4,
 64 |             "statistics": {
 65 |               "null_count": 0,
 66 |               "max_value": "parent",
 67 |               "min_value": "name"
 68 |             },
 69 |             "encoding_stats": [
 70 |               {
 71 |                 "page_type": "DATA_PAGE",
 72 |                 "encoding": "PLAIN",
 73 |                 "count": 1
 74 |               }
 75 |             ]
 76 |           },
 77 |           "offset_index_offset": 198,
 78 |           "offset_index_length": 11,
 79 |           "column_index_offset": 145,
 80 |           "column_index_length": 25
 81 |         },
 82 |         {
 83 |           "file_offset": 73,
 84 |           "meta_data": {
 85 |             "type": "BYTE_ARRAY",
 86 |             "encodings": [
 87 |               "PLAIN",
 88 |               "RLE"
 89 |             ],
 90 |             "path_in_schema": [
 91 |               "my_map",
 92 |               "key_value",
 93 |               "value"
 94 |             ],
 95 |             "codec": "GZIP",
 96 |             "num_values": 2,
 97 |             "total_uncompressed_size": 57,
 98 |             "total_compressed_size": 72,
 99 |             "data_page_offset": 73,
100 |             "statistics": {
101 |               "null_count": 0,
102 |               "max_value": "report",
103 |               "min_value": "another"
104 |             },
105 |             "encoding_stats": [
106 |               {
107 |                 "page_type": "DATA_PAGE",
108 |                 "encoding": "PLAIN",
109 |                 "count": 1
110 |               }
111 |             ]
112 |           },
113 |           "offset_index_offset": 209,
114 |           "offset_index_length": 12,
115 |           "column_index_offset": 170,
116 |           "column_index_length": 28
117 |         }
118 |       ],
119 |       "total_byte_size": 111,
120 |       "num_rows": 1,
121 |       "file_offset": 4,
122 |       "total_compressed_size": 141,
123 |       "ordinal": 0
124 |     }
125 |   ],
126 |   "created_by": "parquet-mr version 1.12.2 (build 77e30c8093386ec52c3cfa6c34b7ef3321322c94)",
127 |   "metadata_length": 366
128 | }
129 | 


--------------------------------------------------------------------------------
/test/files/issue97.metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 2,
 3 |   "schema": [
 4 |     {
 5 |       "repetition_type": "REQUIRED",
 6 |       "name": "schema",
 7 |       "num_children": 2
 8 |     },
 9 |     {
10 |       "type": "DOUBLE",
11 |       "repetition_type": "OPTIONAL",
12 |       "name": "a"
13 |     },
14 |     {
15 |       "type": "BOOLEAN",
16 |       "repetition_type": "OPTIONAL",
17 |       "name": "b"
18 |     }
19 |   ],
20 |   "num_rows": 0,
21 |   "row_groups": [
22 |     {
23 |       "columns": [
24 |         {
25 |           "file_offset": 0,
26 |           "meta_data": {
27 |             "type": "DOUBLE",
28 |             "encodings": [
29 |               "PLAIN",
30 |               "RLE"
31 |             ],
32 |             "path_in_schema": [
33 |               "a"
34 |             ],
35 |             "codec": "UNCOMPRESSED",
36 |             "num_values": 0,
37 |             "total_uncompressed_size": 14,
38 |             "total_compressed_size": 14,
39 |             "data_page_offset": 0,
40 |             "dictionary_page_offset": 4,
41 |             "encoding_stats": [
42 |               {
43 |                 "page_type": "DICTIONARY_PAGE",
44 |                 "encoding": "PLAIN",
45 |                 "count": 1
46 |               }
47 |             ]
48 |           }
49 |         },
50 |         {
51 |           "file_offset": 0,
52 |           "meta_data": {
53 |             "type": "BOOLEAN",
54 |             "encodings": [
55 |               "RLE"
56 |             ],
57 |             "path_in_schema": [
58 |               "b"
59 |             ],
60 |             "codec": "UNCOMPRESSED",
61 |             "num_values": 0,
62 |             "total_uncompressed_size": 0,
63 |             "total_compressed_size": 0,
64 |             "data_page_offset": 0,
65 |             "encoding_stats": []
66 |           }
67 |         }
68 |       ],
69 |       "total_byte_size": 14,
70 |       "num_rows": 0,
71 |       "file_offset": 4,
72 |       "total_compressed_size": 14
73 |     }
74 |   ],
75 |   "key_value_metadata": [
76 |     {
77 |       "key": "pandas",
78 |       "value": "{\"index_columns\": [{\"kind\": \"range\", \"name\": null, \"start\": 0, \"stop\": 0, \"step\": 1}], \"column_indexes\": [{\"name\": null, \"field_name\": null, \"pandas_type\": \"unicode\", \"numpy_type\": \"object\", \"metadata\": {\"encoding\": \"UTF-8\"}}], \"columns\": [{\"name\": \"a\", \"field_name\": \"a\", \"pandas_type\": \"float64\", \"numpy_type\": \"float64\", \"metadata\": null}, {\"name\": \"b\", \"field_name\": \"b\", \"pandas_type\": \"bool\", \"numpy_type\": \"bool\", \"metadata\": null}], \"creator\": {\"library\": \"pyarrow\", \"version\": \"20.0.0\"}, \"pandas_version\": \"2.3.0\"}"
79 |     },
80 |     {
81 |       "key": "ARROW:schema",
82 |       "value": "/////+ACAAAQAAAAAAAKAA4ABgAFAAgACgAAAAABBAAQAAAAAAAKAAwAAAAEAAgACgAAAEACAAAEAAAAAQAAAAwAAAAIAAwABAAIAAgAAAAYAgAABAAAAAsCAAB7ImluZGV4X2NvbHVtbnMiOiBbeyJraW5kIjogInJhbmdlIiwgIm5hbWUiOiBudWxsLCAic3RhcnQiOiAwLCAic3RvcCI6IDAsICJzdGVwIjogMX1dLCAiY29sdW1uX2luZGV4ZXMiOiBbeyJuYW1lIjogbnVsbCwgImZpZWxkX25hbWUiOiBudWxsLCAicGFuZGFzX3R5cGUiOiAidW5pY29kZSIsICJudW1weV90eXBlIjogIm9iamVjdCIsICJtZXRhZGF0YSI6IHsiZW5jb2RpbmciOiAiVVRGLTgifX1dLCAiY29sdW1ucyI6IFt7Im5hbWUiOiAiYSIsICJmaWVsZF9uYW1lIjogImEiLCAicGFuZGFzX3R5cGUiOiAiZmxvYXQ2NCIsICJudW1weV90eXBlIjogImZsb2F0NjQiLCAibWV0YWRhdGEiOiBudWxsfSwgeyJuYW1lIjogImIiLCAiZmllbGRfbmFtZSI6ICJiIiwgInBhbmRhc190eXBlIjogImJvb2wiLCAibnVtcHlfdHlwZSI6ICJib29sIiwgIm1ldGFkYXRhIjogbnVsbH1dLCAiY3JlYXRvciI6IHsibGlicmFyeSI6ICJweWFycm93IiwgInZlcnNpb24iOiAiMjAuMC4wIn0sICJwYW5kYXNfdmVyc2lvbiI6ICIyLjMuMCJ9AAYAAABwYW5kYXMAAAIAAABAAAAABAAAANj///8AAAEGEAAAABgAAAAEAAAAAAAAAAEAAABiAAAABAAEAAQAAAAQABQACAAGAAcADAAAABAAEAAAAAAAAQMQAAAAGAAAAAQAAAAAAAAAAQAAAGEABgAIAAYABgAAAAAAAgAAAAAA"
83 |     }
84 |   ],
85 |   "created_by": "parquet-cpp-arrow version 20.0.0",
86 |   "metadata_length": 1700
87 | }


--------------------------------------------------------------------------------
/test/files/offset_indexed.metadata.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "version": 2,
  3 |   "schema": [
  4 |     {
  5 |       "name": "root",
  6 |       "num_children": 2
  7 |     },
  8 |     {
  9 |       "type": "INT64",
 10 |       "repetition_type": "REQUIRED",
 11 |       "name": "id"
 12 |     },
 13 |     {
 14 |       "type": "BYTE_ARRAY",
 15 |       "repetition_type": "REQUIRED",
 16 |       "name": "content",
 17 |       "converted_type": "UTF8"
 18 |     }
 19 |   ],
 20 |   "num_rows": 200,
 21 |   "row_groups": [
 22 |     {
 23 |       "columns": [
 24 |         {
 25 |           "file_offset": 4,
 26 |           "meta_data": {
 27 |             "type": "INT64",
 28 |             "encodings": [
 29 |               "PLAIN"
 30 |             ],
 31 |             "path_in_schema": [
 32 |               "id"
 33 |             ],
 34 |             "codec": "SNAPPY",
 35 |             "num_values": 100,
 36 |             "total_uncompressed_size": 434,
 37 |             "total_compressed_size": 434,
 38 |             "data_page_offset": 4,
 39 |             "statistics": {
 40 |               "null_count": 0,
 41 |               "max_value": 100,
 42 |               "min_value": 1
 43 |             }
 44 |           }
 45 |         },
 46 |         {
 47 |           "file_offset": 438,
 48 |           "meta_data": {
 49 |             "type": "BYTE_ARRAY",
 50 |             "encodings": [
 51 |               "PLAIN"
 52 |             ],
 53 |             "path_in_schema": [
 54 |               "content"
 55 |             ],
 56 |             "codec": "SNAPPY",
 57 |             "num_values": 100,
 58 |             "total_uncompressed_size": 14334,
 59 |             "total_compressed_size": 14334,
 60 |             "data_page_offset": 438,
 61 |             "statistics": {
 62 |               "null_count": 0,
 63 |               "max_value": "the dolor the ju",
 64 |               "min_value": "adipiscing adipi"
 65 |             }
 66 |           },
 67 |           "offset_index_offset": 29507,
 68 |           "offset_index_length": 62
 69 |         }
 70 |       ],
 71 |       "total_byte_size": 14768,
 72 |       "num_rows": 100
 73 |     },
 74 |     {
 75 |       "columns": [
 76 |         {
 77 |           "file_offset": 14772,
 78 |           "meta_data": {
 79 |             "type": "INT64",
 80 |             "encodings": [
 81 |               "PLAIN"
 82 |             ],
 83 |             "path_in_schema": [
 84 |               "id"
 85 |             ],
 86 |             "codec": "SNAPPY",
 87 |             "num_values": 100,
 88 |             "total_uncompressed_size": 436,
 89 |             "total_compressed_size": 436,
 90 |             "data_page_offset": 14772,
 91 |             "statistics": {
 92 |               "null_count": 0,
 93 |               "max_value": 200,
 94 |               "min_value": 101
 95 |             }
 96 |           }
 97 |         },
 98 |         {
 99 |           "file_offset": 15208,
100 |           "meta_data": {
101 |             "type": "BYTE_ARRAY",
102 |             "encodings": [
103 |               "PLAIN"
104 |             ],
105 |             "path_in_schema": [
106 |               "content"
107 |             ],
108 |             "codec": "SNAPPY",
109 |             "num_values": 100,
110 |             "total_uncompressed_size": 14299,
111 |             "total_compressed_size": 14299,
112 |             "data_page_offset": 15208,
113 |             "statistics": {
114 |               "null_count": 0,
115 |               "max_value": "the storage over",
116 |               "min_value": "adipiscing encod"
117 |             }
118 |           },
119 |           "offset_index_offset": 29569,
120 |           "offset_index_length": 65
121 |         }
122 |       ],
123 |       "total_byte_size": 14735,
124 |       "num_rows": 100
125 |     }
126 |   ],
127 |   "created_by": "hyparquet",
128 |   "metadata_length": 352
129 | }
130 | 


--------------------------------------------------------------------------------
/test/files/byte_stream_split_v2.metadata.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "version": 2,
  3 |   "schema": [
  4 |     {
  5 |       "repetition_type": "REQUIRED",
  6 |       "name": "schema",
  7 |       "num_children": 2
  8 |     },
  9 |     {
 10 |       "type": "DOUBLE",
 11 |       "repetition_type": "OPTIONAL",
 12 |       "name": "float_col"
 13 |     },
 14 |     {
 15 |       "type": "DOUBLE",
 16 |       "repetition_type": "OPTIONAL",
 17 |       "name": "double_col"
 18 |     }
 19 |   ],
 20 |   "num_rows": 5,
 21 |   "row_groups": [
 22 |     {
 23 |       "columns": [
 24 |         {
 25 |           "file_offset": 0,
 26 |           "meta_data": {
 27 |             "type": "DOUBLE",
 28 |             "encodings": [
 29 |               "RLE",
 30 |               "BYTE_STREAM_SPLIT"
 31 |             ],
 32 |             "path_in_schema": [
 33 |               "float_col"
 34 |             ],
 35 |             "codec": "SNAPPY",
 36 |             "num_values": 5,
 37 |             "total_uncompressed_size": 110,
 38 |             "total_compressed_size": 87,
 39 |             "data_page_offset": 4,
 40 |             "statistics": {
 41 |               "max": 5.5,
 42 |               "min": 1.5,
 43 |               "null_count": 0,
 44 |               "max_value": 5.5,
 45 |               "min_value": 1.5,
 46 |               "is_max_value_exact": true,
 47 |               "is_min_value_exact": true
 48 |             },
 49 |             "encoding_stats": [
 50 |               {
 51 |                 "page_type": "DATA_PAGE",
 52 |                 "encoding": "BYTE_STREAM_SPLIT",
 53 |                 "count": 1
 54 |               }
 55 |             ],
 56 |             "size_statistics": {
 57 |               "repetition_level_histogram": [],
 58 |               "definition_level_histogram": [
 59 |                 0,
 60 |                 5
 61 |               ]
 62 |             }
 63 |           }
 64 |         },
 65 |         {
 66 |           "file_offset": 0,
 67 |           "meta_data": {
 68 |             "type": "DOUBLE",
 69 |             "encodings": [
 70 |               "RLE",
 71 |               "BYTE_STREAM_SPLIT"
 72 |             ],
 73 |             "path_in_schema": [
 74 |               "double_col"
 75 |             ],
 76 |             "codec": "SNAPPY",
 77 |             "num_values": 5,
 78 |             "total_uncompressed_size": 110,
 79 |             "total_compressed_size": 97,
 80 |             "data_page_offset": 91,
 81 |             "statistics": {
 82 |               "max": 50.5,
 83 |               "min": 10.1,
 84 |               "null_count": 0,
 85 |               "max_value": 50.5,
 86 |               "min_value": 10.1,
 87 |               "is_max_value_exact": true,
 88 |               "is_min_value_exact": true
 89 |             },
 90 |             "encoding_stats": [
 91 |               {
 92 |                 "page_type": "DATA_PAGE",
 93 |                 "encoding": "BYTE_STREAM_SPLIT",
 94 |                 "count": 1
 95 |               }
 96 |             ],
 97 |             "size_statistics": {
 98 |               "repetition_level_histogram": [],
 99 |               "definition_level_histogram": [
100 |                 0,
101 |                 5
102 |               ]
103 |             }
104 |           }
105 |         }
106 |       ],
107 |       "total_byte_size": 220,
108 |       "num_rows": 5,
109 |       "file_offset": 4,
110 |       "total_compressed_size": 184
111 |     }
112 |   ],
113 |   "key_value_metadata": [
114 |     {
115 |       "key": "ARROW:schema",
116 |       "value": "/////7AAAAAQAAAAAAAKAAwABgAFAAgACgAAAAABBAAMAAAACAAIAAAABAAIAAAABAAAAAIAAABIAAAABAAAAND///8AAAEDEAAAABwAAAAEAAAAAAAAAAoAAABkb3VibGVfY29sAADC////AAACABAAFAAIAAYABwAMAAAAEAAQAAAAAAABAxAAAAAgAAAABAAAAAAAAAAJAAAAZmxvYXRfY29sAAYACAAGAAYAAAAAAAIAAAAAAA=="
117 |     }
118 |   ],
119 |   "created_by": "parquet-cpp-arrow version 22.0.0",
120 |   "metadata_length": 576
121 | }
122 | 


--------------------------------------------------------------------------------
/test/files/lz4_raw_compressed.metadata.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "created_by": "parquet-cpp version 1.5.1-SNAPSHOT",
  3 |   "metadata_length": 330,
  4 |   "num_rows": 4,
  5 |   "row_groups": [
  6 |     {
  7 |       "columns": [
  8 |         {
  9 |           "file_offset": 89,
 10 |           "meta_data": {
 11 |             "codec": "LZ4_RAW",
 12 |             "data_page_offset": 4,
 13 |             "encoding_stats": [
 14 |               {
 15 |                 "count": 1,
 16 |                 "encoding": "PLAIN",
 17 |                 "page_type": "DATA_PAGE"
 18 |               }
 19 |             ],
 20 |             "encodings": [
 21 |               "PLAIN",
 22 |               "RLE"
 23 |             ],
 24 |             "num_values": 4,
 25 |             "path_in_schema": [
 26 |               "c0"
 27 |             ],
 28 |             "statistics": {
 29 |               "max": 1593604801,
 30 |               "max_value": 1593604801,
 31 |               "min": 1593604800,
 32 |               "min_value": 1593604800,
 33 |               "null_count": 0
 34 |             },
 35 |             "total_compressed_size": 85,
 36 |             "total_uncompressed_size": 93,
 37 |             "type": "INT64"
 38 |           }
 39 |         },
 40 |         {
 41 |           "file_offset": 229,
 42 |           "meta_data": {
 43 |             "codec": "LZ4_RAW",
 44 |             "data_page_offset": 171,
 45 |             "encoding_stats": [
 46 |               {
 47 |                 "count": 1,
 48 |                 "encoding": "PLAIN",
 49 |                 "page_type": "DATA_PAGE"
 50 |               }
 51 |             ],
 52 |             "encodings": [
 53 |               "PLAIN",
 54 |               "RLE"
 55 |             ],
 56 |             "num_values": 4,
 57 |             "path_in_schema": [
 58 |               "c1"
 59 |             ],
 60 |             "statistics": {
 61 |               "max_value": "def",
 62 |               "min_value": "abc",
 63 |               "null_count": 0
 64 |             },
 65 |             "total_compressed_size": 58,
 66 |             "total_uncompressed_size": 59,
 67 |             "type": "BYTE_ARRAY"
 68 |           }
 69 |         },
 70 |         {
 71 |           "file_offset": 375,
 72 |           "meta_data": {
 73 |             "codec": "LZ4_RAW",
 74 |             "data_page_offset": 280,
 75 |             "encoding_stats": [
 76 |               {
 77 |                 "count": 1,
 78 |                 "encoding": "PLAIN",
 79 |                 "page_type": "DATA_PAGE"
 80 |               }
 81 |             ],
 82 |             "encodings": [
 83 |               "PLAIN",
 84 |               "RLE"
 85 |             ],
 86 |             "num_values": 4,
 87 |             "path_in_schema": [
 88 |               "v11"
 89 |             ],
 90 |             "statistics": {
 91 |               "max": 42.125,
 92 |               "max_value": 42.125,
 93 |               "min": 7.7,
 94 |               "min_value": 7.7,
 95 |               "null_count": 0
 96 |             },
 97 |             "total_compressed_size": 95,
 98 |             "total_uncompressed_size": 99,
 99 |             "type": "DOUBLE"
100 |           }
101 |         }
102 |       ],
103 |       "file_offset": 89,
104 |       "num_rows": 4,
105 |       "ordinal": 0,
106 |       "total_byte_size": 251,
107 |       "total_compressed_size": 238
108 |     }
109 |   ],
110 |   "schema": [
111 |     {
112 |       "name": "schema",
113 |       "num_children": 3,
114 |       "repetition_type": "REQUIRED"
115 |     },
116 |     {
117 |       "name": "c0",
118 |       "repetition_type": "REQUIRED",
119 |       "type": "INT64"
120 |     },
121 |     {
122 |       "name": "c1",
123 |       "repetition_type": "REQUIRED",
124 |       "type": "BYTE_ARRAY"
125 |     },
126 |     {
127 |       "name": "v11",
128 |       "repetition_type": "OPTIONAL",
129 |       "type": "DOUBLE"
130 |     }
131 |   ],
132 |   "version": 1
133 | }
134 | 


--------------------------------------------------------------------------------
/src/delta.js:
--------------------------------------------------------------------------------
  1 | import { readVarInt, readZigZagBigInt } from './thrift.js'
  2 | 
  3 | /**
  4 |  * @import {DataReader} from '../src/types.d.ts'
  5 |  * @param {DataReader} reader
  6 |  * @param {number} count number of values to read
  7 |  * @param {Int32Array | BigInt64Array} output
  8 |  */
  9 | export function deltaBinaryUnpack(reader, count, output) {
 10 |   const int32 = output instanceof Int32Array
 11 |   const blockSize = readVarInt(reader)
 12 |   const miniblockPerBlock = readVarInt(reader)
 13 |   readVarInt(reader) // assert(=== count)
 14 |   let value = readZigZagBigInt(reader) // first value
 15 |   let outputIndex = 0
 16 |   output[outputIndex++] = int32 ? Number(value) : value
 17 | 
 18 |   const valuesPerMiniblock = blockSize / miniblockPerBlock
 19 | 
 20 |   while (outputIndex < count) {
 21 |     // new block
 22 |     const minDelta = readZigZagBigInt(reader)
 23 |     const bitWidths = new Uint8Array(miniblockPerBlock)
 24 |     for (let i = 0; i < miniblockPerBlock; i++) {
 25 |       bitWidths[i] = reader.view.getUint8(reader.offset++)
 26 |     }
 27 | 
 28 |     for (let i = 0; i < miniblockPerBlock && outputIndex < count; i++) {
 29 |       // new miniblock
 30 |       const bitWidth = BigInt(bitWidths[i])
 31 |       if (bitWidth) {
 32 |         let bitpackPos = 0n
 33 |         let miniblockCount = valuesPerMiniblock
 34 |         const mask = (1n << bitWidth) - 1n
 35 |         while (miniblockCount && outputIndex < count) {
 36 |           let bits = BigInt(reader.view.getUint8(reader.offset)) >> bitpackPos & mask // TODO: don't re-read value every time
 37 |           bitpackPos += bitWidth
 38 |           while (bitpackPos >= 8) {
 39 |             bitpackPos -= 8n
 40 |             reader.offset++
 41 |             if (bitpackPos) {
 42 |               bits |= BigInt(reader.view.getUint8(reader.offset)) << bitWidth - bitpackPos & mask
 43 |             }
 44 |           }
 45 |           const delta = minDelta + bits
 46 |           value += delta
 47 |           output[outputIndex++] = int32 ? Number(value) : value
 48 |           miniblockCount--
 49 |         }
 50 |         if (miniblockCount) {
 51 |           // consume leftover miniblock
 52 |           reader.offset += Math.ceil((miniblockCount * Number(bitWidth) + Number(bitpackPos)) / 8)
 53 |         }
 54 |       } else {
 55 |         for (let j = 0; j < valuesPerMiniblock && outputIndex < count; j++) {
 56 |           value += minDelta
 57 |           output[outputIndex++] = int32 ? Number(value) : value
 58 |         }
 59 |       }
 60 |     }
 61 |   }
 62 | }
 63 | 
 64 | /**
 65 |  * @param {DataReader} reader
 66 |  * @param {number} count
 67 |  * @param {Uint8Array[]} output
 68 |  */
 69 | export function deltaLengthByteArray(reader, count, output) {
 70 |   const lengths = new Int32Array(count)
 71 |   deltaBinaryUnpack(reader, count, lengths)
 72 |   for (let i = 0; i < count; i++) {
 73 |     output[i] = new Uint8Array(reader.view.buffer, reader.view.byteOffset + reader.offset, lengths[i])
 74 |     reader.offset += lengths[i]
 75 |   }
 76 | }
 77 | 
 78 | /**
 79 |  * @param {DataReader} reader
 80 |  * @param {number} count
 81 |  * @param {Uint8Array[]} output
 82 |  */
 83 | export function deltaByteArray(reader, count, output) {
 84 |   const prefixData = new Int32Array(count)
 85 |   deltaBinaryUnpack(reader, count, prefixData)
 86 |   const suffixData = new Int32Array(count)
 87 |   deltaBinaryUnpack(reader, count, suffixData)
 88 | 
 89 |   for (let i = 0; i < count; i++) {
 90 |     const suffix = new Uint8Array(reader.view.buffer, reader.view.byteOffset + reader.offset, suffixData[i])
 91 |     if (prefixData[i]) {
 92 |       // copy from previous value
 93 |       output[i] = new Uint8Array(prefixData[i] + suffixData[i])
 94 |       output[i].set(output[i - 1].subarray(0, prefixData[i]))
 95 |       output[i].set(suffix, prefixData[i])
 96 |     } else {
 97 |       output[i] = suffix
 98 |     }
 99 |     reader.offset += suffixData[i]
100 |   }
101 | }
102 | 


--------------------------------------------------------------------------------
/test/files/plain-dict-uncompressed-checksum.metadata.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "version": 1,
  3 |   "schema": [
  4 |     {
  5 |       "name": "m",
  6 |       "num_children": 2
  7 |     },
  8 |     {
  9 |       "type": "INT64",
 10 |       "repetition_type": "REQUIRED",
 11 |       "name": "long_field"
 12 |     },
 13 |     {
 14 |       "type": "BYTE_ARRAY",
 15 |       "repetition_type": "REQUIRED",
 16 |       "name": "binary_field"
 17 |     }
 18 |   ],
 19 |   "num_rows": 1000,
 20 |   "row_groups": [
 21 |     {
 22 |       "columns": [
 23 |         {
 24 |           "file_offset": 31,
 25 |           "meta_data": {
 26 |             "type": "INT64",
 27 |             "encodings": [
 28 |               "PLAIN_DICTIONARY",
 29 |               "BIT_PACKED"
 30 |             ],
 31 |             "path_in_schema": [
 32 |               "long_field"
 33 |             ],
 34 |             "codec": "UNCOMPRESSED",
 35 |             "num_values": 1000,
 36 |             "total_uncompressed_size": 54,
 37 |             "total_compressed_size": 54,
 38 |             "data_page_offset": 31,
 39 |             "dictionary_page_offset": 4,
 40 |             "statistics": {
 41 |               "max": 0,
 42 |               "min": 0,
 43 |               "null_count": 0,
 44 |               "max_value": 0,
 45 |               "min_value": 0
 46 |             },
 47 |             "encoding_stats": [
 48 |               {
 49 |                 "page_type": "DICTIONARY_PAGE",
 50 |                 "encoding": "PLAIN_DICTIONARY",
 51 |                 "count": 1
 52 |               },
 53 |               {
 54 |                 "page_type": "DATA_PAGE",
 55 |                 "encoding": "PLAIN_DICTIONARY",
 56 |                 "count": 1
 57 |               }
 58 |             ]
 59 |           },
 60 |           "offset_index_offset": 262,
 61 |           "offset_index_length": 10,
 62 |           "column_index_offset": 144,
 63 |           "column_index_length": 31
 64 |         },
 65 |         {
 66 |           "file_offset": 117,
 67 |           "meta_data": {
 68 |             "type": "BYTE_ARRAY",
 69 |             "encodings": [
 70 |               "PLAIN_DICTIONARY",
 71 |               "BIT_PACKED"
 72 |             ],
 73 |             "path_in_schema": [
 74 |               "binary_field"
 75 |             ],
 76 |             "codec": "UNCOMPRESSED",
 77 |             "num_values": 1000,
 78 |             "total_uncompressed_size": 86,
 79 |             "total_compressed_size": 86,
 80 |             "data_page_offset": 117,
 81 |             "dictionary_page_offset": 58,
 82 |             "statistics": {
 83 |               "max": "a655fd0e-9949-4059-bcae-fd6a002a4652",
 84 |               "min": "a655fd0e-9949-4059-bcae-fd6a002a4652",
 85 |               "null_count": 0,
 86 |               "max_value": "a655fd0e-9949-4059-bcae-fd6a002a4652",
 87 |               "min_value": "a655fd0e-9949-4059-bcae-fd6a002a4652"
 88 |             },
 89 |             "encoding_stats": [
 90 |               {
 91 |                 "page_type": "DICTIONARY_PAGE",
 92 |                 "encoding": "PLAIN_DICTIONARY",
 93 |                 "count": 1
 94 |               },
 95 |               {
 96 |                 "page_type": "DATA_PAGE",
 97 |                 "encoding": "PLAIN_DICTIONARY",
 98 |                 "count": 1
 99 |               }
100 |             ]
101 |           },
102 |           "offset_index_offset": 272,
103 |           "offset_index_length": 11,
104 |           "column_index_offset": 175,
105 |           "column_index_length": 87
106 |         }
107 |       ],
108 |       "total_byte_size": 140,
109 |       "num_rows": 1000,
110 |       "file_offset": 4,
111 |       "total_compressed_size": 140,
112 |       "ordinal": 0
113 |     }
114 |   ],
115 |   "key_value_metadata": [
116 |     {
117 |       "key": "writer.model.name",
118 |       "value": "example"
119 |     }
120 |   ],
121 |   "created_by": "parquet-mr version 1.13.0-SNAPSHOT (build 261f7d2679407c833545b56f4c85a4ae8b5c9ed4)",
122 |   "metadata_length": 525
123 | }
124 | 


--------------------------------------------------------------------------------
/src/wkb.js:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * WKB (Well-Known Binary) decoder for geometry objects.
  3 |  *
  4 |  * @import {DataReader, Geometry} from '../src/types.js'
  5 |  * @param {DataReader} reader
  6 |  * @returns {Geometry} geometry object
  7 |  */
  8 | export function wkbToGeojson(reader) {
  9 |   const flags = getFlags(reader)
 10 | 
 11 |   if (flags.type === 1) { // Point
 12 |     return { type: 'Point', coordinates: readPosition(reader, flags) }
 13 |   } else if (flags.type === 2) { // LineString
 14 |     return { type: 'LineString', coordinates: readLine(reader, flags) }
 15 |   } else if (flags.type === 3) { // Polygon
 16 |     return { type: 'Polygon', coordinates: readPolygon(reader, flags) }
 17 |   } else if (flags.type === 4) { // MultiPoint
 18 |     const points = []
 19 |     for (let i = 0; i < flags.count; i++) {
 20 |       points.push(readPosition(reader, getFlags(reader)))
 21 |     }
 22 |     return { type: 'MultiPoint', coordinates: points }
 23 |   } else if (flags.type === 5) { // MultiLineString
 24 |     const lines = []
 25 |     for (let i = 0; i < flags.count; i++) {
 26 |       lines.push(readLine(reader, getFlags(reader)))
 27 |     }
 28 |     return { type: 'MultiLineString', coordinates: lines }
 29 |   } else if (flags.type === 6) { // MultiPolygon
 30 |     const polygons = []
 31 |     for (let i = 0; i < flags.count; i++) {
 32 |       polygons.push(readPolygon(reader, getFlags(reader)))
 33 |     }
 34 |     return { type: 'MultiPolygon', coordinates: polygons }
 35 |   } else if (flags.type === 7) { // GeometryCollection
 36 |     const geometries = []
 37 |     for (let i = 0; i < flags.count; i++) {
 38 |       geometries.push(wkbToGeojson(reader))
 39 |     }
 40 |     return { type: 'GeometryCollection', geometries }
 41 |   } else {
 42 |     throw new Error(`Unsupported geometry type: ${flags.type}`)
 43 |   }
 44 | }
 45 | 
 46 | /**
 47 |  * @typedef {object} WkbFlags
 48 |  * @property {boolean} littleEndian
 49 |  * @property {number} type
 50 |  * @property {number} dim
 51 |  * @property {number} count
 52 |  */
 53 | 
 54 | /**
 55 |  * Extract ISO WKB flags and base geometry type.
 56 |  *
 57 |  * @param {DataReader} reader
 58 |  * @returns {WkbFlags}
 59 |  */
 60 | function getFlags(reader) {
 61 |   const { view } = reader
 62 |   const littleEndian = view.getUint8(reader.offset++) === 1
 63 |   const rawType = view.getUint32(reader.offset, littleEndian)
 64 |   reader.offset += 4
 65 | 
 66 |   const type = rawType % 1000
 67 |   const flags = Math.floor(rawType / 1000)
 68 | 
 69 |   let count = 0
 70 |   if (type > 1 && type <= 7) {
 71 |     count = view.getUint32(reader.offset, littleEndian)
 72 |     reader.offset += 4
 73 |   }
 74 | 
 75 |   // XY, XYZ, XYM, XYZM
 76 |   let dim = 2
 77 |   if (flags) dim++
 78 |   if (flags === 3) dim++
 79 | 
 80 |   return { littleEndian, type, dim, count }
 81 | }
 82 | 
 83 | /**
 84 |  * @param {DataReader} reader
 85 |  * @param {WkbFlags} flags
 86 |  * @returns {number[]}
 87 |  */
 88 | function readPosition(reader, flags) {
 89 |   const points = []
 90 |   for (let i = 0; i < flags.dim; i++) {
 91 |     const coord = reader.view.getFloat64(reader.offset, flags.littleEndian)
 92 |     reader.offset += 8
 93 |     points.push(coord)
 94 |   }
 95 |   return points
 96 | }
 97 | 
 98 | /**
 99 |  * @param {DataReader} reader
100 |  * @param {WkbFlags} flags
101 |  * @returns {number[][]}
102 |  */
103 | function readLine(reader, flags) {
104 |   const points = []
105 |   for (let i = 0; i < flags.count; i++) {
106 |     points.push(readPosition(reader, flags))
107 |   }
108 |   return points
109 | }
110 | 
/**
 * Read `flags.count` rings. Each ring starts with a uint32 point count and is
 * then read as a line using the polygon's flags with that count substituted.
 *
 * @param {DataReader} reader
 * @param {WkbFlags} flags
 * @returns {number[][][]}
 */
function readPolygon(reader, flags) {
  const polygon = []
  let ringsLeft = flags.count
  while (ringsLeft-- > 0) {
    const pointCount = reader.view.getUint32(reader.offset, flags.littleEndian)
    reader.offset += 4
    const ringFlags = Object.assign({}, flags, { count: pointCount })
    polygon.push(readLine(reader, ringFlags))
  }
  return polygon
}
126 | 


--------------------------------------------------------------------------------
/test/files/rowgroups.metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": 2,
 3 |   "created_by": "parquet-cpp-arrow version 14.0.2",
 4 |   "metadata_length": 1602,
 5 |   "schema": [
 6 |     {
 7 |       "repetition_type": "REQUIRED",
 8 |       "name": "schema",
 9 |       "num_children": 1
10 |     },
11 |     {
12 |       "type": "INT64",
13 |       "repetition_type": "OPTIONAL",
14 |       "name": "numbers"
15 |     }
16 |   ],
17 |   "num_rows": 15,
18 |   "row_groups": [
19 |     {
20 |       "columns": [
21 |         {
22 |           "file_offset": 150,
23 |           "meta_data": {
24 |             "codec": "SNAPPY",
25 |             "data_page_offset": 71,
26 |             "dictionary_page_offset": 4,
27 |             "encoding_stats": [
28 |               { "count": 1, "encoding": "PLAIN", "page_type": "DICTIONARY_PAGE" },
29 |               { "count": 1, "encoding": "RLE_DICTIONARY", "page_type": "DATA_PAGE" }
30 |             ],
31 |             "encodings": ["PLAIN", "RLE", "RLE_DICTIONARY"],
32 |             "num_values": 10,
33 |             "path_in_schema": ["numbers"],
34 |             "statistics": {
35 |               "max": 10,
36 |               "min": 1,
37 |               "max_value": 10,
38 |               "min_value": 1,
39 |               "null_count": 0
40 |             },
41 |             "total_compressed_size": 146,
42 |             "total_uncompressed_size": 172,
43 |             "type": "INT64"
44 |           }
45 |         }
46 |       ],
47 |       "file_offset": 4,
48 |       "num_rows": 10,
49 |       "ordinal": 0,
50 |       "total_byte_size": 172,
51 |       "total_compressed_size": 146
52 |     },
53 |     {
54 |       "columns": [
55 |         {
56 |           "file_offset": 368,
57 |           "meta_data": {
58 |             "codec": "SNAPPY",
59 |             "data_page_offset": 294,
60 |             "dictionary_page_offset": 248,
61 |             "encoding_stats": [
62 |               { "count": 1, "encoding": "PLAIN", "page_type": "DICTIONARY_PAGE" },
63 |               { "count": 1, "encoding": "RLE_DICTIONARY", "page_type": "DATA_PAGE" }
64 |             ],
65 |             "encodings": ["PLAIN", "RLE", "RLE_DICTIONARY"],
66 |             "num_values": 5,
67 |             "path_in_schema": ["numbers"],
68 |             "statistics": {
69 |               "max": 15,
70 |               "min": 11,
71 |               "max_value": 15,
72 |               "min_value": 11,
73 |               "null_count": 0
74 |             },
75 |             "total_compressed_size": 120,
76 |             "total_uncompressed_size": 126,
77 |             "type": "INT64"
78 |           }
79 |         }
80 |       ],
81 |       "file_offset": 248,
82 |       "num_rows": 5,
83 |       "ordinal": 1,
84 |       "total_byte_size": 126,
85 |       "total_compressed_size": 120
86 |     }
87 |   ],
88 |   "key_value_metadata": [
89 |     {
90 |       "key": "pandas",
91 |       "value": "{\"index_columns\": [{\"kind\": \"range\", \"name\": null, \"start\": 0, \"stop\": 15, \"step\": 1}], \"column_indexes\": [{\"name\": null, \"field_name\": null, \"pandas_type\": \"unicode\", \"numpy_type\": \"object\", \"metadata\": {\"encoding\": \"UTF-8\"}}], \"columns\": [{\"name\": \"numbers\", \"field_name\": \"numbers\", \"pandas_type\": \"int64\", \"numpy_type\": \"int64\", \"metadata\": null}], \"creator\": {\"library\": \"pyarrow\", \"version\": \"14.0.2\"}, \"pandas_version\": \"2.1.4\"}"
92 |     },
93 |     {
94 |       "key": "ARROW:schema",
95 |       "value": "/////2gCAAAQAAAAAAAKAA4ABgAFAAgACgAAAAABBAAQAAAAAAAKAAwAAAAEAAgACgAAAOgBAAAEAAAAAQAAAAwAAAAIAAwABAAIAAgAAAAIAAAAEAAAAAYAAABwYW5kYXMAALMBAAB7ImluZGV4X2NvbHVtbnMiOiBbeyJraW5kIjogInJhbmdlIiwgIm5hbWUiOiBudWxsLCAic3RhcnQiOiAwLCAic3RvcCI6IDE1LCAic3RlcCI6IDF9XSwgImNvbHVtbl9pbmRleGVzIjogW3sibmFtZSI6IG51bGwsICJmaWVsZF9uYW1lIjogbnVsbCwgInBhbmRhc190eXBlIjogInVuaWNvZGUiLCAibnVtcHlfdHlwZSI6ICJvYmplY3QiLCAibWV0YWRhdGEiOiB7ImVuY29kaW5nIjogIlVURi04In19XSwgImNvbHVtbnMiOiBbeyJuYW1lIjogIm51bWJlcnMiLCAiZmllbGRfbmFtZSI6ICJudW1iZXJzIiwgInBhbmRhc190eXBlIjogImludDY0IiwgIm51bXB5X3R5cGUiOiAiaW50NjQiLCAibWV0YWRhdGEiOiBudWxsfV0sICJjcmVhdG9yIjogeyJsaWJyYXJ5IjogInB5YXJyb3ciLCAidmVyc2lvbiI6ICIxNC4wLjIifSwgInBhbmRhc192ZXJzaW9uIjogIjIuMS40In0AAQAAABQAAAAQABQACAAGAAcADAAAABAAEAAAAAAAAQIQAAAAIAAAAAQAAAAAAAAABwAAAG51bWJlcnMACAAMAAgABwAIAAAAAAAAAUAAAAAAAAAA"
96 |     }
97 |   ]
98 | }
99 | 


--------------------------------------------------------------------------------
/src/snappy.js:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * The MIT License (MIT)
  3 |  * Copyright (c) 2016 Zhipeng Jia
  4 |  * https://github.com/zhipeng-jia/snappyjs
  5 |  */
  6 | 
// WORD_MASK[n] is a bit mask covering the low n bytes (n = 0..4) of a 32-bit value.
const WORD_MASK = [0, 0xff, 0xffff, 0xffffff, 0xffffffff]
  8 | 
  9 | /**
 10 |  * Copy bytes from one array to another
 11 |  *
 12 |  * @param {Uint8Array} fromArray source array
 13 |  * @param {number} fromPos source position
 14 |  * @param {Uint8Array} toArray destination array
 15 |  * @param {number} toPos destination position
 16 |  * @param {number} length number of bytes to copy
 17 |  */
 18 | function copyBytes(fromArray, fromPos, toArray, toPos, length) {
 19 |   for (let i = 0; i < length; i++) {
 20 |     toArray[toPos + i] = fromArray[fromPos + i]
 21 |   }
 22 | }
 23 | 
 24 | /**
 25 |  * Decompress snappy data.
 26 |  * Accepts an output buffer to avoid allocating a new buffer for each call.
 27 |  *
 28 |  * @param {Uint8Array} input compressed data
 29 |  * @param {Uint8Array} output output buffer
 30 |  */
 31 | export function snappyUncompress(input, output) {
 32 |   const inputLength = input.byteLength
 33 |   const outputLength = output.byteLength
 34 |   let pos = 0
 35 |   let outPos = 0
 36 | 
 37 |   // skip preamble (contains uncompressed length as varint)
 38 |   while (pos < inputLength) {
 39 |     const c = input[pos]
 40 |     pos++
 41 |     if (c < 128) {
 42 |       break
 43 |     }
 44 |   }
 45 |   if (outputLength && pos >= inputLength) {
 46 |     throw new Error('invalid snappy length header')
 47 |   }
 48 | 
 49 |   while (pos < inputLength) {
 50 |     const c = input[pos]
 51 |     let len = 0
 52 |     pos++
 53 | 
 54 |     if (pos >= inputLength) {
 55 |       throw new Error('missing eof marker')
 56 |     }
 57 | 
 58 |     // There are two types of elements, literals and copies (back references)
 59 |     if ((c & 0x3) === 0) {
 60 |       // Literals are uncompressed data stored directly in the byte stream
 61 |       let len = (c >>> 2) + 1
 62 |       // Longer literal length is encoded in multiple bytes
 63 |       if (len > 60) {
 64 |         if (pos + 3 >= inputLength) {
 65 |           throw new Error('snappy error literal pos + 3 >= inputLength')
 66 |         }
 67 |         const lengthSize = len - 60 // length bytes - 1
 68 |         len = input[pos]
 69 |           + (input[pos + 1] << 8)
 70 |           + (input[pos + 2] << 16)
 71 |           + (input[pos + 3] << 24)
 72 |         len = (len & WORD_MASK[lengthSize]) + 1
 73 |         pos += lengthSize
 74 |       }
 75 |       if (pos + len > inputLength) {
 76 |         throw new Error('snappy error literal exceeds input length')
 77 |       }
 78 |       copyBytes(input, pos, output, outPos, len)
 79 |       pos += len
 80 |       outPos += len
 81 |     } else {
 82 |       // Copy elements
 83 |       let offset = 0 // offset back from current position to read
 84 |       switch (c & 0x3) {
 85 |       case 1:
 86 |         // Copy with 1-byte offset
 87 |         len = (c >>> 2 & 0x7) + 4
 88 |         offset = input[pos] + (c >>> 5 << 8)
 89 |         pos++
 90 |         break
 91 |       case 2:
 92 |         // Copy with 2-byte offset
 93 |         if (inputLength <= pos + 1) {
 94 |           throw new Error('snappy error end of input')
 95 |         }
 96 |         len = (c >>> 2) + 1
 97 |         offset = input[pos] + (input[pos + 1] << 8)
 98 |         pos += 2
 99 |         break
100 |       case 3:
101 |         // Copy with 4-byte offset
102 |         if (inputLength <= pos + 3) {
103 |           throw new Error('snappy error end of input')
104 |         }
105 |         len = (c >>> 2) + 1
106 |         offset = input[pos]
107 |           + (input[pos + 1] << 8)
108 |           + (input[pos + 2] << 16)
109 |           + (input[pos + 3] << 24)
110 |         pos += 4
111 |         break
112 |       default:
113 |         break
114 |       }
115 |       if (offset === 0 || isNaN(offset)) {
116 |         throw new Error(`invalid offset ${offset} pos ${pos} inputLength ${inputLength}`)
117 |       }
118 |       if (offset > outPos) {
119 |         throw new Error('cannot copy from before start of buffer')
120 |       }
121 |       copyBytes(output, outPos - offset, output, outPos, len)
122 |       outPos += len
123 |     }
124 |   }
125 | 
126 |   if (outPos !== outputLength) throw new Error('premature end of input')
127 | }
128 | 


--------------------------------------------------------------------------------
/test/files/hadoop_lz4_compressed.metadata.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "created_by": "parquet-mr version 1.10.1 (build a89df8f9932b6ef6633d06069e50c9b7970bebd1)",
  3 |   "metadata_length": 376,
  4 |   "num_rows": 4,
  5 |   "row_groups": [
  6 |     {
  7 |       "columns": [
  8 |         {
  9 |           "file_offset": 4,
 10 |           "meta_data": {
 11 |             "codec": "LZ4",
 12 |             "data_page_offset": 4,
 13 |             "encoding_stats": [
 14 |               {
 15 |                 "count": 1,
 16 |                 "encoding": "PLAIN_DICTIONARY",
 17 |                 "page_type": "DICTIONARY_PAGE"
 18 |               },
 19 |               {
 20 |                 "count": 1,
 21 |                 "encoding": "PLAIN_DICTIONARY",
 22 |                 "page_type": "DATA_PAGE"
 23 |               }
 24 |             ],
 25 |             "encodings": [
 26 |               "BIT_PACKED",
 27 |               "PLAIN_DICTIONARY"
 28 |             ],
 29 |             "num_values": 4,
 30 |             "path_in_schema": [
 31 |               "c0"
 32 |             ],
 33 |             "statistics": {
 34 |               "max": 1593604801,
 35 |               "max_value": 1593604801,
 36 |               "min": 1593604800,
 37 |               "min_value": 1593604800,
 38 |               "null_count": 0
 39 |             },
 40 |             "total_compressed_size": 112,
 41 |             "total_uncompressed_size": 93,
 42 |             "type": "INT64"
 43 |           }
 44 |         },
 45 |         {
 46 |           "file_offset": 116,
 47 |           "meta_data": {
 48 |             "codec": "LZ4",
 49 |             "data_page_offset": 116,
 50 |             "encoding_stats": [
 51 |               {
 52 |                 "count": 1,
 53 |                 "encoding": "PLAIN_DICTIONARY",
 54 |                 "page_type": "DICTIONARY_PAGE"
 55 |               },
 56 |               {
 57 |                 "count": 1,
 58 |                 "encoding": "PLAIN_DICTIONARY",
 59 |                 "page_type": "DATA_PAGE"
 60 |               }
 61 |             ],
 62 |             "encodings": [
 63 |               "BIT_PACKED",
 64 |               "PLAIN_DICTIONARY"
 65 |             ],
 66 |             "num_values": 4,
 67 |             "path_in_schema": [
 68 |               "c1"
 69 |             ],
 70 |             "statistics": {
 71 |               "max_value": "def",
 72 |               "min_value": "abc",
 73 |               "null_count": 0
 74 |             },
 75 |             "total_compressed_size": 79,
 76 |             "total_uncompressed_size": 61,
 77 |             "type": "BYTE_ARRAY"
 78 |           }
 79 |         },
 80 |         {
 81 |           "file_offset": 195,
 82 |           "meta_data": {
 83 |             "codec": "LZ4",
 84 |             "data_page_offset": 195,
 85 |             "encoding_stats": [
 86 |               {
 87 |                 "count": 1,
 88 |                 "encoding": "PLAIN_DICTIONARY",
 89 |                 "page_type": "DICTIONARY_PAGE"
 90 |               },
 91 |               {
 92 |                 "count": 1,
 93 |                 "encoding": "PLAIN_DICTIONARY",
 94 |                 "page_type": "DATA_PAGE"
 95 |               }
 96 |             ],
 97 |             "encodings": [
 98 |               "BIT_PACKED",
 99 |               "PLAIN_DICTIONARY",
100 |               "RLE"
101 |             ],
102 |             "num_values": 4,
103 |             "path_in_schema": [
104 |               "v11"
105 |             ],
106 |             "statistics": {
107 |               "max": 42.125,
108 |               "max_value": 42.125,
109 |               "min": 7.7,
110 |               "min_value": 7.7,
111 |               "null_count": 0
112 |             },
113 |             "total_compressed_size": 123,
114 |             "total_uncompressed_size": 108,
115 |             "type": "DOUBLE"
116 |           }
117 |         }
118 |       ],
119 |       "num_rows": 4,
120 |       "total_byte_size": 262
121 |     }
122 |   ],
123 |   "schema": [
124 |     {
125 |       "name": "",
126 |       "num_children": 3
127 |     },
128 |     {
129 |       "name": "c0",
130 |       "repetition_type": "REQUIRED",
131 |       "type": "INT64"
132 |     },
133 |     {
134 |       "name": "c1",
135 |       "repetition_type": "REQUIRED",
136 |       "type": "BYTE_ARRAY"
137 |     },
138 |     {
139 |       "name": "v11",
140 |       "repetition_type": "OPTIONAL",
141 |       "type": "DOUBLE"
142 |     }
143 |   ],
144 |   "version": 1
145 | }
146 | 


--------------------------------------------------------------------------------
/test/files/rowend_struct.metadata.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "version": 2,
  3 |   "schema": [
  4 |     {
  5 |       "repetition_type": "REQUIRED",
  6 |       "name": "schema",
  7 |       "num_children": 1
  8 |     },
  9 |     {
 10 |       "repetition_type": "OPTIONAL",
 11 |       "name": "s",
 12 |       "num_children": 2
 13 |     },
 14 |     {
 15 |       "type": "BYTE_ARRAY",
 16 |       "repetition_type": "OPTIONAL",
 17 |       "name": "a",
 18 |       "converted_type": "UTF8",
 19 |       "logical_type": {
 20 |         "type": "STRING"
 21 |       }
 22 |     },
 23 |     {
 24 |       "type": "BYTE_ARRAY",
 25 |       "repetition_type": "OPTIONAL",
 26 |       "name": "b",
 27 |       "converted_type": "UTF8",
 28 |       "logical_type": {
 29 |         "type": "STRING"
 30 |       }
 31 |     }
 32 |   ],
 33 |   "num_rows": 1050,
 34 |   "row_groups": [
 35 |     {
 36 |       "columns": [
 37 |         {
 38 |           "file_offset": 0,
 39 |           "meta_data": {
 40 |             "type": "BYTE_ARRAY",
 41 |             "encodings": [
 42 |               "PLAIN",
 43 |               "RLE",
 44 |               "RLE_DICTIONARY"
 45 |             ],
 46 |             "path_in_schema": [
 47 |               "s",
 48 |               "a"
 49 |             ],
 50 |             "codec": "SNAPPY",
 51 |             "num_values": 1050,
 52 |             "total_uncompressed_size": 10884,
 53 |             "total_compressed_size": 5709,
 54 |             "data_page_offset": 4290,
 55 |             "dictionary_page_offset": 4,
 56 |             "statistics": {
 57 |               "null_count": 0,
 58 |               "max_value": "v1049",
 59 |               "min_value": "v0000"
 60 |             },
 61 |             "encoding_stats": [
 62 |               {
 63 |                 "page_type": "DICTIONARY_PAGE",
 64 |                 "encoding": "PLAIN",
 65 |                 "count": 1
 66 |               },
 67 |               {
 68 |                 "page_type": "DATA_PAGE",
 69 |                 "encoding": "RLE_DICTIONARY",
 70 |                 "count": 2
 71 |               }
 72 |             ],
 73 |             "size_statistics": {
 74 |               "unencoded_byte_array_data_bytes": 5250,
 75 |               "repetition_level_histogram": [],
 76 |               "definition_level_histogram": [
 77 |                 0,
 78 |                 0,
 79 |                 1050
 80 |               ]
 81 |             }
 82 |           }
 83 |         },
 84 |         {
 85 |           "file_offset": 0,
 86 |           "meta_data": {
 87 |             "type": "BYTE_ARRAY",
 88 |             "encodings": [
 89 |               "PLAIN",
 90 |               "RLE",
 91 |               "RLE_DICTIONARY"
 92 |             ],
 93 |             "path_in_schema": [
 94 |               "s",
 95 |               "b"
 96 |             ],
 97 |             "codec": "SNAPPY",
 98 |             "num_values": 1050,
 99 |             "total_uncompressed_size": 58,
100 |             "total_compressed_size": 62,
101 |             "data_page_offset": 5734,
102 |             "dictionary_page_offset": 5713,
103 |             "statistics": {
104 |               "null_count": 0,
105 |               "max_value": "x",
106 |               "min_value": "x"
107 |             },
108 |             "encoding_stats": [
109 |               {
110 |                 "page_type": "DICTIONARY_PAGE",
111 |                 "encoding": "PLAIN",
112 |                 "count": 1
113 |               },
114 |               {
115 |                 "page_type": "DATA_PAGE",
116 |                 "encoding": "RLE_DICTIONARY",
117 |                 "count": 1
118 |               }
119 |             ],
120 |             "size_statistics": {
121 |               "unencoded_byte_array_data_bytes": 1050,
122 |               "repetition_level_histogram": [],
123 |               "definition_level_histogram": [
124 |                 0,
125 |                 0,
126 |                 1050
127 |               ]
128 |             }
129 |           }
130 |         }
131 |       ],
132 |       "total_byte_size": 10942,
133 |       "num_rows": 1050,
134 |       "file_offset": 4,
135 |       "total_compressed_size": 5771
136 |     }
137 |   ],
138 |   "key_value_metadata": [
139 |     {
140 |       "key": "ARROW:schema",
141 |       "value": "/////8AAAAAQAAAAAAAKAAwABgAFAAgACgAAAAABBAAMAAAACAAIAAAABAAIAAAABAAAAAEAAAAEAAAAsP///wAAAQ0YAAAAHAAAAAQAAAACAAAASAAAABAAAAABAAAAcwAAAKj////c////AAABBRAAAAAUAAAABAAAAAAAAAABAAAAYgAAAMz///8QABQACAAGAAcADAAAABAAEAAAAAAAAQUQAAAAGAAAAAQAAAAAAAAAAQAAAGEAAAAEAAQABAAAAAAAAAA="
142 |     }
143 |   ],
144 |   "created_by": "parquet-cpp-arrow version 21.0.0",
145 |   "metadata_length": 558
146 | }
147 | 


--------------------------------------------------------------------------------
/src/schema.js:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Build a tree from the schema elements.
  3 |  *
  4 |  * @import {SchemaElement, SchemaTree} from '../src/types.d.ts'
  5 |  * @param {SchemaElement[]} schema
  6 |  * @param {number} rootIndex index of the root element
  7 |  * @param {string[]} path path to the element
  8 |  * @returns {SchemaTree} tree of schema elements
  9 |  */
 10 | function schemaTree(schema, rootIndex, path) {
 11 |   const element = schema[rootIndex]
 12 |   const children = []
 13 |   let count = 1
 14 | 
 15 |   // Read the specified number of children
 16 |   if (element.num_children) {
 17 |     while (children.length < element.num_children) {
 18 |       const childElement = schema[rootIndex + count]
 19 |       const child = schemaTree(schema, rootIndex + count, [...path, childElement.name])
 20 |       count += child.count
 21 |       children.push(child)
 22 |     }
 23 |   }
 24 | 
 25 |   return { count, element, children, path }
 26 | }
 27 | 
 28 | /**
 29 |  * Get schema elements from the root to the given element name.
 30 |  *
 31 |  * @param {SchemaElement[]} schema
 32 |  * @param {string[]} name path to the element
 33 |  * @returns {SchemaTree[]} list of schema elements
 34 |  */
 35 | export function getSchemaPath(schema, name) {
 36 |   let tree = schemaTree(schema, 0, [])
 37 |   const path = [tree]
 38 |   for (const part of name) {
 39 |     const child = tree.children.find(child => child.element.name === part)
 40 |     if (!child) throw new Error(`parquet schema element not found: ${name}`)
 41 |     path.push(child)
 42 |     tree = child
 43 |   }
 44 |   return path
 45 | }
 46 | 
 47 | /**
 48 |  * Get all physical (leaf) column names.
 49 |  *
 50 |  * @param {SchemaTree} schemaTree
 51 |  * @returns {string[]} list of physical column names
 52 |  */
 53 | export function getPhysicalColumns(schemaTree) {
 54 |   /** @type {string[]} */
 55 |   const columns = []
 56 |   /** @param {SchemaTree} node */
 57 |   function traverse(node) {
 58 |     if (node.children.length) {
 59 |       for (const child of node.children) {
 60 |         traverse(child)
 61 |       }
 62 |     } else {
 63 |       columns.push(node.path.join('.'))
 64 |     }
 65 |   }
 66 |   traverse(schemaTree)
 67 |   return columns
 68 | }
 69 | 
 70 | /**
 71 |  * Get the max repetition level for a given schema path.
 72 |  *
 73 |  * @param {SchemaTree[]} schemaPath
 74 |  * @returns {number} max repetition level
 75 |  */
 76 | export function getMaxRepetitionLevel(schemaPath) {
 77 |   let maxLevel = 0
 78 |   for (const { element } of schemaPath) {
 79 |     if (element.repetition_type === 'REPEATED') {
 80 |       maxLevel++
 81 |     }
 82 |   }
 83 |   return maxLevel
 84 | }
 85 | 
 86 | /**
 87 |  * Get the max definition level for a given schema path.
 88 |  *
 89 |  * @param {SchemaTree[]} schemaPath
 90 |  * @returns {number} max definition level
 91 |  */
 92 | export function getMaxDefinitionLevel(schemaPath) {
 93 |   let maxLevel = 0
 94 |   for (const { element } of schemaPath.slice(1)) {
 95 |     if (element.repetition_type !== 'REQUIRED') {
 96 |       maxLevel++
 97 |     }
 98 |   }
 99 |   return maxLevel
100 | }
101 | 
/**
 * Check if a column is list-like.
 *
 * A node is list-like when it is annotated with converted type LIST and has
 * exactly one child, which is REPEATED and itself has at most one child.
 *
 * @param {SchemaTree} schema
 * @returns {boolean} true if list-like
 */
export function isListLike(schema) {
  if (!schema) return false
  if (schema.element.converted_type !== 'LIST') return false
  // Exactly one child is required; a LIST node with no children is not
  // list-like and must not be dereferenced below.
  if (schema.children.length !== 1) return false

  const firstChild = schema.children[0]
  if (firstChild.children.length > 1) return false
  if (firstChild.element.repetition_type !== 'REPEATED') return false

  return true
}
119 | 
/**
 * Check if a column is map-like.
 *
 * A node is map-like when it is annotated with converted type MAP and has
 * exactly one child, which is REPEATED and has exactly two children. If
 * children named `key` or `value` are present, neither may be REPEATED.
 *
 * @param {SchemaTree} schema
 * @returns {boolean} true if map-like
 */
export function isMapLike(schema) {
  if (!schema) return false
  if (schema.element.converted_type !== 'MAP') return false
  // Exactly one child is required; a MAP node with no children is not
  // map-like and must not be dereferenced below.
  if (schema.children.length !== 1) return false

  const firstChild = schema.children[0]
  if (firstChild.children.length !== 2) return false
  if (firstChild.element.repetition_type !== 'REPEATED') return false

  const keyChild = firstChild.children.find(child => child.element.name === 'key')
  if (keyChild?.element.repetition_type === 'REPEATED') return false

  const valueChild = firstChild.children.find(child => child.element.name === 'value')
  if (valueChild?.element.repetition_type === 'REPEATED') return false

  return true
}
143 | 
/**
 * Returns true if a column is non-nested: its schema path is exactly the
 * root plus one element, and that element is neither REPEATED nor a group
 * with children.
 *
 * @param {SchemaTree[]} schemaPath
 * @returns {boolean}
 */
export function isFlatColumn(schemaPath) {
  if (schemaPath.length !== 2) return false
  const { element, children } = schemaPath[1]
  return element.repetition_type !== 'REPEATED' && children.length === 0
}
157 | 


--------------------------------------------------------------------------------
/test/files/datapage_v2.snappy.metadata.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "version": 1,
  3 |   "created_by": "parquet-mr version 1.8.1 (build 4aba4dae7bb0d4edbcf7923ae1339f28fd3f7fcf)",
  4 |   "key_value_metadata": [
  5 |     {
  6 |       "key": "org.apache.spark.sql.parquet.row.metadata",
  7 |       "value": "{\"type\":\"struct\",\"fields\":[{\"name\":\"a\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"b\",\"type\":\"integer\",\"nullable\":false,\"metadata\":{}},{\"name\":\"c\",\"type\":\"double\",\"nullable\":false,\"metadata\":{}},{\"name\":\"d\",\"type\":\"boolean\",\"nullable\":false,\"metadata\":{}},{\"name\":\"e\",\"type\":{\"type\":\"array\",\"elementType\":\"integer\",\"containsNull\":false},\"nullable\":true,\"metadata\":{}}]}"
  8 |     }
  9 |   ],
 10 |   "metadata_length": 836,
 11 |   "num_rows": 5,
 12 |   "row_groups": [
 13 |     {
 14 |       "columns": [
 15 |         {
 16 |           "file_offset": 4,
 17 |           "meta_data": {
 18 |             "codec": "SNAPPY",
 19 |             "data_page_offset": 4,
 20 |             "encodings": ["PLAIN", "RLE_DICTIONARY"],
 21 |             "num_values": 5,
 22 |             "path_in_schema": ["a"],
 23 |             "statistics": {
 24 |               "max": "abc",
 25 |               "min": "abc",
 26 |               "null_count": 1
 27 |             },
 28 |             "total_compressed_size": 63,
 29 |             "total_uncompressed_size": 59,
 30 |             "type": "BYTE_ARRAY"
 31 |           }
 32 |         },
 33 |         {
 34 |           "file_offset": 67,
 35 |           "meta_data": {
 36 |             "codec": "SNAPPY",
 37 |             "data_page_offset": 67,
 38 |             "encodings": ["DELTA_BINARY_PACKED"],
 39 |             "num_values": 5,
 40 |             "path_in_schema": ["b"],
 41 |             "statistics": {
 42 |               "max": 5,
 43 |               "min": 1,
 44 |               "null_count": 0
 45 |             },
 46 |             "total_compressed_size": 49,
 47 |             "total_uncompressed_size": 47,
 48 |             "type": "INT32"
 49 |           }
 50 |         },
 51 |         {
 52 |           "file_offset": 116,
 53 |           "meta_data": {
 54 |             "codec": "SNAPPY",
 55 |             "data_page_offset": 116,
 56 |             "encodings": ["PLAIN", "RLE_DICTIONARY"],
 57 |             "num_values": 5,
 58 |             "path_in_schema": ["c"],
 59 |             "statistics": {
 60 |               "max": 5,
 61 |               "min": 2,
 62 |               "null_count": 0
 63 |             },
 64 |             "total_compressed_size": 88,
 65 |             "total_uncompressed_size": 94,
 66 |             "type": "DOUBLE"
 67 |           }
 68 |         },
 69 |         {
 70 |           "file_offset": 204,
 71 |           "meta_data": {
 72 |             "codec": "SNAPPY",
 73 |             "data_page_offset": 204,
 74 |             "encodings": ["RLE"],
 75 |             "num_values": 5,
 76 |             "path_in_schema": ["d"],
 77 |             "statistics": {
 78 |               "max": true,
 79 |               "min": false,
 80 |               "null_count": 0
 81 |             },
 82 |             "total_compressed_size": 39,
 83 |             "total_uncompressed_size": 37,
 84 |             "type": "BOOLEAN"
 85 |           }
 86 |         },
 87 |         {
 88 |           "file_offset": 243,
 89 |           "meta_data": {
 90 |             "codec": "SNAPPY",
 91 |             "data_page_offset": 243,
 92 |             "encodings": ["PLAIN", "RLE_DICTIONARY"],
 93 |             "num_values": 10,
 94 |             "path_in_schema": [
 95 |               "e",
 96 |               "list",
 97 |               "element"
 98 |             ],
 99 |             "statistics": {
100 |               "max": 3,
101 |               "min": 1,
102 |               "null_count": 2
103 |             },
104 |             "total_compressed_size": 78,
105 |             "total_uncompressed_size": 74,
106 |             "type": "INT32"
107 |           }
108 |         }
109 |       ],
110 |       "num_rows": 5,
111 |       "total_byte_size": 311
112 |     }
113 |   ],
114 |   "schema": [
115 |     {
116 |       "name": "spark_schema",
117 |       "num_children": 5
118 |     },
119 |     {
120 |       "converted_type": "UTF8",
121 |       "name": "a",
122 |       "repetition_type": "OPTIONAL",
123 |       "type": "BYTE_ARRAY"
124 |     },
125 |     {
126 |       "name": "b",
127 |       "repetition_type": "REQUIRED",
128 |       "type": "INT32"
129 |     },
130 |     {
131 |       "name": "c",
132 |       "repetition_type": "REQUIRED",
133 |       "type": "DOUBLE"
134 |     },
135 |     {
136 |       "name": "d",
137 |       "repetition_type": "REQUIRED",
138 |       "type": "BOOLEAN"
139 |     },
140 |     {
141 |       "converted_type": "LIST",
142 |       "name": "e",
143 |       "num_children": 1,
144 |       "repetition_type": "OPTIONAL"
145 |     },
146 |     {
147 |       "name": "list",
148 |       "num_children": 1,
149 |       "repetition_type": "REPEATED"
150 |     },
151 |     {
152 |       "name": "element",
153 |       "repetition_type": "REQUIRED",
154 |       "type": "INT32"
155 |     }
156 |   ]
157 | }
158 | 


--------------------------------------------------------------------------------
/test/thrift.test.js:
--------------------------------------------------------------------------------
  1 | import { describe, expect, it } from 'vitest'
  2 | import { deserializeTCompactProtocol, readVarInt } from '../src/thrift.js'
  3 | import { reader } from './helpers.js'
  4 | 
  5 | describe('deserializeTCompactProtocol function', () => {
  6 | 
  7 |   it('parses basic types correctly', () => {
  8 |     const buffer = new ArrayBuffer(128)
  9 |     const view = new DataView(buffer)
 10 |     let index = 0
 11 | 
 12 |     // Boolean
 13 |     view.setUint8(index++, 0x11) // Field 1 type TRUE
 14 |     view.setUint8(index++, 0x12) // Field 2 type FALSE
 15 | 
 16 |     // Byte
 17 |     view.setUint8(index++, 0x13) // Field 3 type BYTE
 18 |     view.setUint8(index++, 0x7f) // Max value for a signed byte
 19 | 
 20 |     // Int16
 21 |     view.setUint8(index++, 0x14) // Field 4 type int16
 22 |     view.setUint8(index++, 0xfe) // 0xfffe zigzag => 16-bit max value 0x7fff
 23 |     view.setUint8(index++, 0xff)
 24 |     view.setUint8(index++, 0x3)
 25 | 
 26 |     // Int32
 27 |     view.setUint8(index++, 0x15) // Field 5 type int32
 28 |     view.setUint8(index++, 0xfe) // 0xfffffffe zigzag => 32-bit max value 0x7fffffff
 29 |     view.setUint8(index++, 0xff)
 30 |     view.setUint8(index++, 0xff)
 31 |     view.setUint8(index++, 0xff)
 32 |     view.setUint8(index++, 0x0f)
 33 | 
 34 |     // Int64
 35 |     view.setUint8(index++, 0x16) // Field 6 type int64
 36 |     view.setUint8(index++, 0xfe)
 37 |     view.setUint8(index++, 0xff)
 38 |     view.setUint8(index++, 0xff)
 39 |     view.setUint8(index++, 0xff)
 40 |     view.setUint8(index++, 0xff)
 41 |     view.setUint8(index++, 0xff)
 42 |     view.setUint8(index++, 0xff)
 43 |     view.setUint8(index++, 0xff)
 44 |     view.setUint8(index++, 0xff)
 45 |     view.setUint8(index++, 0x01)
 46 | 
 47 |     // Double
 48 |     view.setUint8(index++, 0x17) // Field 7 type DOUBLE
 49 |     view.setFloat64(index, 123.456, true)
 50 |     index += 8
 51 | 
 52 |     // String
 53 |     const str = 'Hello, Thrift!'
 54 |     view.setUint8(index++, 0x18) // Field 8 type STRING
 55 |     // write string length as varint
 56 |     const stringLengthVarInt = toVarInt(str.length)
 57 |     stringLengthVarInt.forEach(byte => view.setUint8(index++, byte))
 58 |     // write string bytes
 59 |     for (let i = 0; i < str.length; i++) {
 60 |       view.setUint8(index++, str.charCodeAt(i))
 61 |     }
 62 | 
 63 |     // Mark the end of the structure
 64 |     view.setUint8(index, 0x00) // STOP field
 65 | 
 66 |     const reader = { view, offset: 0 }
 67 |     const value = deserializeTCompactProtocol(reader)
 68 |     expect(reader.offset).toBe(index + 1)
 69 | 
 70 |     // Assertions for each basic type
 71 |     expect(value.field_1).toBe(true) // TRUE
 72 |     expect(value.field_2).toBe(false) // FALSE
 73 |     expect(value.field_3).toBe(0x7f) // BYTE
 74 |     expect(value.field_4).toBe(0x7fff) // I16
 75 |     expect(value.field_5).toBe(0x7fffffff) // I32
 76 |     expect(value.field_6).toBe(BigInt('0x7fffffffffffffff')) // I64
 77 |     expect(value.field_7).toBeCloseTo(123.456) // DOUBLE
 78 |     expect(new TextDecoder().decode(value.field_8)).toBe('Hello, Thrift!') // STRING
 79 |   })
 80 | 
 81 |   it('parses rle-dict column index correctly', () => {
 82 |     const buffer = new Uint8Array([25, 17, 2, 25, 24, 8, 0, 0, 0, 0, 0, 0, 0, 0, 25, 24, 8, 0, 0, 0, 0, 0, 0, 0, 0, 21, 2, 25, 22, 0, 0])
 83 |     const view = new DataView(buffer.buffer)
 84 |     const reader = { view, offset: 0 }
 85 |     const value = deserializeTCompactProtocol(reader)
 86 |     expect(value.field_1).toEqual([false])
 87 |     expect(value.field_2).toEqual([new Uint8Array([0, 0, 0, 0, 0, 0, 0, 0])])
 88 |     expect(value.field_3).toEqual([new Uint8Array([0, 0, 0, 0, 0, 0, 0, 0])])
 89 |     expect(value.field_4).toEqual(1)
 90 |     expect(value.field_5).toEqual([0n])
 91 |     expect(value.field_6).toBeUndefined()
 92 |     expect(value.field_7).toBeUndefined()
 93 |     expect(value.field_8).toBeUndefined()
 94 |   })
 95 | 
 96 | })
 97 | 
 98 | describe('readVarInt', () => {
 99 |   it('read single-byte varint', () => {
100 |     expect(readVarInt(reader([0x01]))).toBe(1)
101 |     expect(readVarInt(reader([0x7f]))).toBe(127)
102 |   })
103 | 
104 |   it('read multi-byte varint', () => {
105 |     // 129 as varint (0b10000001 00000001)
106 |     expect(readVarInt(reader([0x81, 0x01]))).toBe(129)
107 |     // 16515 as varint (0b10000011 10000010 00000001)
108 |     expect(readVarInt(reader([0x83, 0x82, 0x01]))).toBe(16643)
109 |   })
110 | 
111 |   it('read maximum int32 varint', () => {
112 |     // 2147483647 as varint (0b11111111 11111111 11111111 11111111 00000111)
113 |     expect(readVarInt(reader([0xff, 0xff, 0xff, 0xff, 0x07]))).toBe(2147483647)
114 |   })
115 | })
116 | 
/**
 * Encode an integer as a varint: seven payload bits per byte, least
 * significant group first, with the high bit set on every byte except the
 * last. Outputs 1-5 bytes for int32.
 *
 * @param {number} n
 * @returns {number[]}
 */
function toVarInt(n) {
  const bytes = []
  let rest = n
  // Emit continuation bytes while bits above the low seven remain
  while ((rest & ~0x7f) !== 0) {
    bytes.push(rest & 0x7f | 0x80)
    rest >>>= 7
  }
  bytes.push(rest)
  return bytes
}
137 | 


--------------------------------------------------------------------------------
/test/files/delta_encoding_required_column.column_indexes.json:
--------------------------------------------------------------------------------
  1 | [
  2 |   [
  3 |     {
  4 |       "null_pages": [
  5 |         false
  6 |       ],
  7 |       "min_values": [
  8 |         1
  9 |       ],
 10 |       "max_values": [
 11 |         105
 12 |       ],
 13 |       "boundary_order": "ASCENDING",
 14 |       "null_counts": [
 15 |         0
 16 |       ]
 17 |     },
 18 |     {
 19 |       "null_pages": [
 20 |         false
 21 |       ],
 22 |       "min_values": [
 23 |         8817
 24 |       ],
 25 |       "max_values": [
 26 |         1895444
 27 |       ],
 28 |       "boundary_order": "ASCENDING",
 29 |       "null_counts": [
 30 |         0
 31 |       ]
 32 |     },
 33 |     {
 34 |       "null_pages": [
 35 |         false
 36 |       ],
 37 |       "min_values": [
 38 |         37
 39 |       ],
 40 |       "max_values": [
 41 |         7135
 42 |       ],
 43 |       "boundary_order": "ASCENDING",
 44 |       "null_counts": [
 45 |         0
 46 |       ]
 47 |     },
 48 |     {
 49 |       "null_pages": [
 50 |         false
 51 |       ],
 52 |       "min_values": [
 53 |         464
 54 |       ],
 55 |       "max_values": [
 56 |         49388
 57 |       ],
 58 |       "boundary_order": "ASCENDING",
 59 |       "null_counts": [
 60 |         0
 61 |       ]
 62 |     },
 63 |     {
 64 |       "null_pages": [
 65 |         false
 66 |       ],
 67 |       "min_values": [
 68 |         2449130
 69 |       ],
 70 |       "max_values": [
 71 |         2452641
 72 |       ],
 73 |       "boundary_order": "ASCENDING",
 74 |       "null_counts": [
 75 |         0
 76 |       ]
 77 |     },
 78 |     {
 79 |       "null_pages": [
 80 |         false
 81 |       ],
 82 |       "min_values": [
 83 |         2449100
 84 |       ],
 85 |       "max_values": [
 86 |         2452611
 87 |       ],
 88 |       "boundary_order": "ASCENDING",
 89 |       "null_counts": [
 90 |         0
 91 |       ]
 92 |     },
 93 |     {
 94 |       "null_pages": [
 95 |         false
 96 |       ],
 97 |       "min_values": [
 98 |         1
 99 |       ],
100 |       "max_values": [
101 |         30
102 |       ],
103 |       "boundary_order": "ASCENDING",
104 |       "null_counts": [
105 |         0
106 |       ]
107 |     },
108 |     {
109 |       "null_pages": [
110 |         false
111 |       ],
112 |       "min_values": [
113 |         1
114 |       ],
115 |       "max_values": [
116 |         12
117 |       ],
118 |       "boundary_order": "ASCENDING",
119 |       "null_counts": [
120 |         0
121 |       ]
122 |     },
123 |     {
124 |       "null_pages": [
125 |         false
126 |       ],
127 |       "min_values": [
128 |         1925
129 |       ],
130 |       "max_values": [
131 |         1991
132 |       ],
133 |       "boundary_order": "ASCENDING",
134 |       "null_counts": [
135 |         0
136 |       ]
137 |     },
138 |     {
139 |       "null_pages": [
140 |         false
141 |       ],
142 |       "min_values": [
143 |         "AAAAAAAAABAAAAAA"
144 |       ],
145 |       "max_values": [
146 |         "AAAAAAAAPFAAAAAA"
147 |       ],
148 |       "boundary_order": "ASCENDING",
149 |       "null_counts": [
150 |         0
151 |       ]
152 |     },
153 |     {
154 |       "null_pages": [
155 |         false
156 |       ],
157 |       "min_values": [
158 |         "Dr."
159 |       ],
160 |       "max_values": [
161 |         "Sir"
162 |       ],
163 |       "boundary_order": "ASCENDING",
164 |       "null_counts": [
165 |         0
166 |       ]
167 |     },
168 |     {
169 |       "null_pages": [
170 |         false
171 |       ],
172 |       "min_values": [
173 |         "Albert"
174 |       ],
175 |       "max_values": [
176 |         "William"
177 |       ],
178 |       "boundary_order": "ASCENDING",
179 |       "null_counts": [
180 |         0
181 |       ]
182 |     },
183 |     {
184 |       "null_pages": [
185 |         false
186 |       ],
187 |       "min_values": [
188 |         "Baker"
189 |       ],
190 |       "max_values": [
191 |         "Young"
192 |       ],
193 |       "boundary_order": "ASCENDING",
194 |       "null_counts": [
195 |         0
196 |       ]
197 |     },
198 |     {
199 |       "null_pages": [
200 |         false
201 |       ],
202 |       "min_values": [
203 |         "N"
204 |       ],
205 |       "max_values": [
206 |         "Y"
207 |       ],
208 |       "boundary_order": "ASCENDING",
209 |       "null_counts": [
210 |         0
211 |       ]
212 |     },
213 |     {
214 |       "null_pages": [
215 |         false
216 |       ],
217 |       "min_values": [
218 |         "AFGHANISTAN"
219 |       ],
220 |       "max_values": [
221 |         "WALLIS AND FUTUNA"
222 |       ],
223 |       "boundary_order": "ASCENDING",
224 |       "null_counts": [
225 |         0
226 |       ]
227 |     },
228 |     {
229 |       "null_pages": [
230 |         false
231 |       ],
232 |       "min_values": [
233 |         "Albert.Brunson@62.com"
234 |       ],
235 |       "max_values": [
236 |         "William.Warner@zegnrzurU.org"
237 |       ],
238 |       "boundary_order": "ASCENDING",
239 |       "null_counts": [
240 |         0
241 |       ]
242 |     },
243 |     {
244 |       "null_pages": [
245 |         false
246 |       ],
247 |       "min_values": [
248 |         "2452293"
249 |       ],
250 |       "max_values": [
251 |         "2452644"
252 |       ],
253 |       "boundary_order": "ASCENDING",
254 |       "null_counts": [
255 |         0
256 |       ]
257 |     }
258 |   ]
259 | ]


--------------------------------------------------------------------------------
/test/files/decimal-column.metadata.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "version": 2,
  3 |   "schema": [
  4 |     {
  5 |       "repetition_type": "REQUIRED",
  6 |       "name": "schema",
  7 |       "num_children": 2
  8 |     },
  9 |     {
 10 |       "type": "INT64",
 11 |       "repetition_type": "OPTIONAL",
 12 |       "name": "mid"
 13 |     },
 14 |     {
 15 |       "type": "FIXED_LEN_BYTE_ARRAY",
 16 |       "type_length": 6,
 17 |       "repetition_type": "OPTIONAL",
 18 |       "name": "value",
 19 |       "converted_type": "DECIMAL",
 20 |       "scale": 10,
 21 |       "precision": 14,
 22 |       "logical_type": {
 23 |         "type": "DECIMAL",
 24 |         "scale": 10,
 25 |         "precision": 14
 26 |       }
 27 |     }
 28 |   ],
 29 |   "num_rows": 5,
 30 |   "row_groups": [
 31 |     {
 32 |       "columns": [
 33 |         {
 34 |           "file_offset": 0,
 35 |           "meta_data": {
 36 |             "type": "INT64",
 37 |             "encodings": [
 38 |               "PLAIN",
 39 |               "RLE",
 40 |               "RLE_DICTIONARY"
 41 |             ],
 42 |             "path_in_schema": [
 43 |               "mid"
 44 |             ],
 45 |             "codec": "SNAPPY",
 46 |             "num_values": 5,
 47 |             "total_uncompressed_size": 126,
 48 |             "total_compressed_size": 120,
 49 |             "data_page_offset": 50,
 50 |             "dictionary_page_offset": 4,
 51 |             "statistics": {
 52 |               "max": 190,
 53 |               "min": 40,
 54 |               "null_count": 0,
 55 |               "max_value": 190,
 56 |               "min_value": 40
 57 |             },
 58 |             "encoding_stats": [
 59 |               {
 60 |                 "page_type": "DICTIONARY_PAGE",
 61 |                 "encoding": "PLAIN",
 62 |                 "count": 1
 63 |               },
 64 |               {
 65 |                 "page_type": "DATA_PAGE",
 66 |                 "encoding": "RLE_DICTIONARY",
 67 |                 "count": 1
 68 |               }
 69 |             ]
 70 |           }
 71 |         },
 72 |         {
 73 |           "file_offset": 0,
 74 |           "meta_data": {
 75 |             "type": "FIXED_LEN_BYTE_ARRAY",
 76 |             "encodings": [
 77 |               "PLAIN",
 78 |               "RLE",
 79 |               "RLE_DICTIONARY"
 80 |             ],
 81 |             "path_in_schema": [
 82 |               "value"
 83 |             ],
 84 |             "codec": "SNAPPY",
 85 |             "num_values": 5,
 86 |             "total_uncompressed_size": 82,
 87 |             "total_compressed_size": 86,
 88 |             "data_page_offset": 146,
 89 |             "dictionary_page_offset": 124,
 90 |             "statistics": {
 91 |               "max": 2015,
 92 |               "min": 2015,
 93 |               "null_count": 0,
 94 |               "max_value": 2015,
 95 |               "min_value": 2015
 96 |             },
 97 |             "encoding_stats": [
 98 |               {
 99 |                 "page_type": "DICTIONARY_PAGE",
100 |                 "encoding": "PLAIN",
101 |                 "count": 1
102 |               },
103 |               {
104 |                 "page_type": "DATA_PAGE",
105 |                 "encoding": "RLE_DICTIONARY",
106 |                 "count": 1
107 |               }
108 |             ]
109 |           }
110 |         }
111 |       ],
112 |       "total_byte_size": 208,
113 |       "num_rows": 5,
114 |       "file_offset": 4,
115 |       "total_compressed_size": 206,
116 |       "ordinal": 0
117 |     }
118 |   ],
119 |   "key_value_metadata": [
120 |     {
121 |       "key": "pandas",
122 |       "value": "{\"index_columns\": [{\"kind\": \"range\", \"name\": null, \"start\": 0, \"stop\": 5, \"step\": 1}], \"column_indexes\": [{\"name\": null, \"field_name\": null, \"pandas_type\": \"unicode\", \"numpy_type\": \"object\", \"metadata\": {\"encoding\": \"UTF-8\"}}], \"columns\": [{\"name\": \"mid\", \"field_name\": \"mid\", \"pandas_type\": \"int64\", \"numpy_type\": \"int64\", \"metadata\": null}, {\"name\": \"value\", \"field_name\": \"value\", \"pandas_type\": \"decimal\", \"numpy_type\": \"object\", \"metadata\": {\"precision\": 14, \"scale\": 10}}], \"creator\": {\"library\": \"pyarrow\", \"version\": \"19.0.0\"}, \"pandas_version\": \"2.2.3\"}"
123 |     },
124 |     {
125 |       "key": "ARROW:schema",
126 |       "value": "/////xgDAAAQAAAAAAAKAA4ABgAFAAgACgAAAAABBAAQAAAAAAAKAAwAAAAEAAgACgAAAGACAAAEAAAAAQAAAAQAAACA/f//QAIAAAQAAAAyAgAAeyJpbmRleF9jb2x1bW5zIjogW3sia2luZCI6ICJyYW5nZSIsICJuYW1lIjogbnVsbCwgInN0YXJ0IjogMCwgInN0b3AiOiA1LCAic3RlcCI6IDF9XSwgImNvbHVtbl9pbmRleGVzIjogW3sibmFtZSI6IG51bGwsICJmaWVsZF9uYW1lIjogbnVsbCwgInBhbmRhc190eXBlIjogInVuaWNvZGUiLCAibnVtcHlfdHlwZSI6ICJvYmplY3QiLCAibWV0YWRhdGEiOiB7ImVuY29kaW5nIjogIlVURi04In19XSwgImNvbHVtbnMiOiBbeyJuYW1lIjogIm1pZCIsICJmaWVsZF9uYW1lIjogIm1pZCIsICJwYW5kYXNfdHlwZSI6ICJpbnQ2NCIsICJudW1weV90eXBlIjogImludDY0IiwgIm1ldGFkYXRhIjogbnVsbH0sIHsibmFtZSI6ICJ2YWx1ZSIsICJmaWVsZF9uYW1lIjogInZhbHVlIiwgInBhbmRhc190eXBlIjogImRlY2ltYWwiLCAibnVtcHlfdHlwZSI6ICJvYmplY3QiLCAibWV0YWRhdGEiOiB7InByZWNpc2lvbiI6IDE0LCAic2NhbGUiOiAxMH19XSwgImNyZWF0b3IiOiB7ImxpYnJhcnkiOiAicHlhcnJvdyIsICJ2ZXJzaW9uIjogIjE5LjAuMCJ9LCAicGFuZGFzX3ZlcnNpb24iOiAiMi4yLjMifQAABgAAAHBhbmRhcwAAAgAAAFAAAAAEAAAAyP///wAAAQcQAAAAIAAAAAQAAAAAAAAABQAAAHZhbHVlAAAACAAMAAQACAAIAAAADgAAAAoAAAAQABQACAAGAAcADAAAABAAEAAAAAAAAQIQAAAAHAAAAAQAAAAAAAAAAwAAAG1pZAAIAAwACAAHAAgAAAAAAAABQAAAAAAAAAA="
127 |     }
128 |   ],
129 |   "created_by": "parquet-cpp-arrow version 19.0.0",
130 |   "metadata_length": 1959
131 | }
132 | 


--------------------------------------------------------------------------------
/src/encoding.js:
--------------------------------------------------------------------------------
  1 | import { readVarInt } from './thrift.js'
  2 | 
  3 | /**
  4 |  * Minimum bits needed to store value.
  5 |  *
  6 |  * @param {number} value
  7 |  * @returns {number}
  8 |  */
  9 | export function bitWidth(value) {
 10 |   return 32 - Math.clz32(value)
 11 | }
 12 | 
 13 | /**
 14 |  * Read values from a run-length encoded/bit-packed hybrid encoding.
 15 |  *
 16 |  * If length is zero, then read int32 length at the start.
 17 |  *
 18 |  * @param {DataReader} reader
 19 |  * @param {number} width - bitwidth
 20 |  * @param {DecodedArray} output
 21 |  * @param {number} [length] - length of the encoded data
 22 |  */
 23 | export function readRleBitPackedHybrid(reader, width, output, length) {
 24 |   if (length === undefined) {
 25 |     length = reader.view.getUint32(reader.offset, true)
 26 |     reader.offset += 4
 27 |   }
 28 |   const startOffset = reader.offset
 29 |   let seen = 0
 30 |   while (seen < output.length) {
 31 |     const header = readVarInt(reader)
 32 |     if (header & 1) {
 33 |       // bit-packed
 34 |       seen = readBitPacked(reader, header, width, output, seen)
 35 |     } else {
 36 |       // rle
 37 |       const count = header >>> 1
 38 |       readRle(reader, count, width, output, seen)
 39 |       seen += count
 40 |     }
 41 |   }
 42 |   reader.offset = startOffset + length // duckdb writes an empty block
 43 | }
 44 | 
 45 | /**
 46 |  * Run-length encoding: read value with bitWidth and repeat it count times.
 47 |  *
 48 |  * @param {DataReader} reader
 49 |  * @param {number} count
 50 |  * @param {number} bitWidth
 51 |  * @param {DecodedArray} output
 52 |  * @param {number} seen
 53 |  */
 54 | function readRle(reader, count, bitWidth, output, seen) {
 55 |   const width = bitWidth + 7 >> 3
 56 |   let value = 0
 57 |   for (let i = 0; i < width; i++) {
 58 |     value |= reader.view.getUint8(reader.offset++) << (i << 3)
 59 |   }
 60 |   // assert(value < 1 << bitWidth)
 61 | 
 62 |   // repeat value count times
 63 |   for (let i = 0; i < count; i++) {
 64 |     output[seen + i] = value
 65 |   }
 66 | }
 67 | 
 68 | /**
 69 |  * Read a bit-packed run of the rle/bitpack hybrid.
 70 |  * Supports width > 8 (crossing bytes).
 71 |  *
 72 |  * @param {DataReader} reader
 73 |  * @param {number} header - bit-pack header
 74 |  * @param {number} bitWidth
 75 |  * @param {DecodedArray} output
 76 |  * @param {number} seen
 77 |  * @returns {number} total output values so far
 78 |  */
 79 | function readBitPacked(reader, header, bitWidth, output, seen) {
 80 |   let count = header >> 1 << 3 // values to read
 81 |   const mask = (1 << bitWidth) - 1
 82 | 
 83 |   let data = 0
 84 |   if (reader.offset < reader.view.byteLength) {
 85 |     data = reader.view.getUint8(reader.offset++)
 86 |   } else if (mask) {
 87 |     // sometimes out-of-bounds reads are masked out
 88 |     throw new Error(`parquet bitpack offset ${reader.offset} out of range`)
 89 |   }
 90 |   let left = 8
 91 |   let right = 0
 92 | 
 93 |   // read values
 94 |   while (count) {
 95 |     // if we have crossed a byte boundary, shift the data
 96 |     if (right > 8) {
 97 |       right -= 8
 98 |       left -= 8
 99 |       data >>>= 8
100 |     } else if (left - right < bitWidth) {
101 |       // if we don't have bitWidth number of bits to read, read next byte
102 |       data |= reader.view.getUint8(reader.offset) << left
103 |       reader.offset++
104 |       left += 8
105 |     } else {
106 |       if (seen < output.length) {
107 |         // emit value
108 |         output[seen++] = data >> right & mask
109 |       }
110 |       count--
111 |       right += bitWidth
112 |     }
113 |   }
114 | 
115 |   return seen
116 | }
117 | 
/**
 * Decode byte-stream-split data: the reader holds one stream per byte
 * position (all first bytes, then all second bytes, ...). The streams are
 * re-interleaved into contiguous values and returned as a typed array, or
 * as an array of Uint8Array views for FIXED_LEN_BYTE_ARRAY.
 *
 * @param {DataReader} reader
 * @param {number} count - number of values to decode
 * @param {ParquetType} type
 * @param {number | undefined} typeLength - byte length for FIXED_LEN_BYTE_ARRAY
 * @returns {DecodedArray}
 */
export function byteStreamSplit(reader, count, type, typeLength) {
  const stride = byteWidth(type, typeLength)
  const interleaved = new Uint8Array(count * stride)

  // stream k supplies byte k of every value
  for (let stream = 0; stream < stride; stream++) {
    let dst = stream
    for (let n = 0; n < count; n++) {
      interleaved[dst] = reader.view.getUint8(reader.offset++)
      dst += stride
    }
  }

  // reinterpret the reassembled bytes according to the physical type
  const { buffer } = interleaved
  switch (type) {
  case 'FLOAT':
    return new Float32Array(buffer)
  case 'DOUBLE':
    return new Float64Array(buffer)
  case 'INT32':
    return new Int32Array(buffer)
  case 'INT64':
    return new BigInt64Array(buffer)
  case 'FIXED_LEN_BYTE_ARRAY': {
    // one view of typeLength bytes per value
    const pieces = new Array(count)
    for (let n = 0; n < count; n++) {
      const begin = n * stride
      pieces[n] = interleaved.subarray(begin, begin + stride)
    }
    return pieces
  }
  default:
    throw new Error(`parquet byte_stream_split unsupported type: ${type}`)
  }
}
148 | 
/**
 * Size in bytes of a single value of the given physical type.
 *
 * @import {DataReader, DecodedArray, ParquetType} from '../src/types.d.ts'
 * @param {ParquetType} type
 * @param {number | undefined} typeLength - required for FIXED_LEN_BYTE_ARRAY
 * @returns {number}
 * @throws {Error} for unsupported types, or a fixed-length type without a length
 */
function byteWidth(type, typeLength) {
  if (type === 'INT32' || type === 'FLOAT') return 4
  if (type === 'INT64' || type === 'DOUBLE') return 8
  if (type === 'FIXED_LEN_BYTE_ARRAY') {
    if (typeLength) return typeLength
    throw new Error('parquet byteWidth missing type_length')
  }
  throw new Error(`parquet unsupported type: ${type}`)
}
170 | 


--------------------------------------------------------------------------------
/test/encoding.test.js:
--------------------------------------------------------------------------------
  1 | import { describe, expect, it } from 'vitest'
  2 | import { bitWidth, readRleBitPackedHybrid } from '../src/encoding.js'
  3 | 
  4 | describe('readRle', () => {
  5 |   it('reads RLE values with explicit length', () => {
  6 |     const buffer = new ArrayBuffer(4)
  7 |     const view = new DataView(buffer)
  8 |     // RLE 3x true
  9 |     view.setUint8(0, 0b00000110)
 10 |     view.setUint8(1, 1)
 11 |     // RLE 3x 100
 12 |     view.setUint8(2, 0b00000110)
 13 |     view.setUint8(3, 100)
 14 |     const reader = { view, offset: 0 }
 15 | 
 16 |     const values = new Array(6)
 17 |     readRleBitPackedHybrid(reader, 1, values, 4)
 18 |     expect(reader.offset).toBe(4)
 19 |     expect(values).toEqual([1, 1, 1, 100, 100, 100])
 20 |   })
 21 | 
 22 |   it('reads RLE values with bitwidth=16', () => {
 23 |     const buffer = new ArrayBuffer(6)
 24 |     const view = new DataView(buffer)
 25 |     // RLE 3x 65535
 26 |     view.setUint8(3, 0b00000110)
 27 |     view.setUint16(4, 65535, true)
 28 |     const reader = { view, offset: 0 }
 29 | 
 30 |     const values = new Array(3)
 31 |     readRleBitPackedHybrid(reader, 16, values, 6)
 32 |     expect(reader.offset).toBe(6)
 33 |     expect(values).toEqual([65535, 65535, 65535])
 34 |   })
 35 | 
 36 |   it('reads RLE values with bitwidth=24', () => {
 37 |     const buffer = new ArrayBuffer(4)
 38 |     const view = new DataView(buffer)
 39 |     // RLE 2x 16777215
 40 |     view.setUint8(0, 0b00000100)
 41 |     view.setUint8(1, 255)
 42 |     view.setUint8(2, 255)
 43 |     view.setUint8(3, 255)
 44 |     const reader = { view, offset: 0 }
 45 | 
 46 |     const values = new Array(2)
 47 |     readRleBitPackedHybrid(reader, 24, values, 4)
 48 |     expect(reader.offset).toBe(4)
 49 |     expect(values).toEqual([16777215, 16777215])
 50 |   })
 51 | 
 52 |   it('reads RLE values with bitwidth=32', () => {
 53 |     const buffer = new ArrayBuffer(5)
 54 |     const view = new DataView(buffer)
 55 |     // RLE 3x 234000
 56 |     view.setUint8(0, 0b00000110)
 57 |     view.setUint32(1, 234000, true)
 58 |     const reader = { view, offset: 0 }
 59 | 
 60 |     const values = new Array(3)
 61 |     readRleBitPackedHybrid(reader, 32, values, 5)
 62 |     expect(reader.offset).toBe(5)
 63 |     expect(values).toEqual([234000, 234000, 234000])
 64 |   })
 65 | })
 66 | 
 67 | describe('readBitPacked', () => {
 68 |   it('reads bit-packed values with implicit length', () => {
 69 |     // Bit-packed values: false, false, true
 70 |     const buffer = new ArrayBuffer(8)
 71 |     const view = new DataView(buffer)
 72 |     view.setInt32(0, 2, true) // length 2 little-endian
 73 |     view.setUint8(4, 0b00000011) // Bit-packed header for 1-8 values
 74 |     view.setUint8(5, 0b00000100) // Bit-packed values (false, false, true)
 75 |     const reader = { view, offset: 0 }
 76 | 
 77 |     const values = new Array(3)
 78 |     readRleBitPackedHybrid(reader, 1, values)
 79 |     expect(reader.offset).toBe(6)
 80 |     expect(values).toEqual([0, 0, 1])
 81 |   })
 82 | 
 83 |   it('reads multi-byte bit-packed values', () => {
 84 |     // Bit-packed 9x true
 85 |     const buffer = new ArrayBuffer(3)
 86 |     const view = new DataView(buffer)
 87 |     view.setUint8(0, 0b00000101) // Bit-packed header for 9-16 values
 88 |     view.setUint8(1, 0b11111111)
 89 |     view.setUint8(2, 0b00000001)
 90 |     const reader = { view, offset: 0 }
 91 | 
 92 |     const values = new Array(9)
 93 |     readRleBitPackedHybrid(reader, 1, values, 3)
 94 |     expect(reader.offset).toBe(3)
 95 |     expect(values).toEqual([1, 1, 1, 1, 1, 1, 1, 1, 1])
 96 |   })
 97 | 
 98 |   it('handles bitpack unsigned shifting', () => {
 99 |     // Bit-packed [131071, 0, ..., 0, 131071, 0, ...]
100 |     // Tests for issue #13 where leftmost bit is set to 1 and shifted
101 |     const buffer = new ArrayBuffer(154)
102 |     const view = new DataView(buffer)
103 |     view.setUint8(0, 0b00010011) // Bit-packed header for 72 values
104 |     view.setUint8(1, 0b11111111)
105 |     view.setUint8(2, 0b11111111)
106 |     view.setUint8(3, 0b00000001)
107 |     view.setUint8(139, 0b11111110)
108 |     view.setUint8(140, 0b11111111)
109 |     view.setUint8(141, 0b0000011)
110 |     const reader = { view, offset: 0 }
111 | 
112 |     const values = new Array(72)
113 |     readRleBitPackedHybrid(reader, 17, values, 154)
114 |     expect(reader.offset).toBe(154)
115 |     expect(values).toEqual([
116 |       131071, 0, 0, 0, 0, 0, 0, 0,
117 |       0, 0, 0, 0, 0, 0, 0, 0,
118 |       0, 0, 0, 0, 0, 0, 0, 0,
119 |       0, 0, 0, 0, 0, 0, 0, 0,
120 |       0, 0, 0, 0, 0, 0, 0, 0,
121 |       0, 0, 0, 0, 0, 0, 0, 0,
122 |       0, 0, 0, 0, 0, 0, 0, 0,
123 |       0, 0, 0, 0, 0, 0, 0, 0,
124 |       0, 131071, 0, 0, 0, 0, 0, 0,
125 |     ])
126 |   })
127 | 
128 |   it('throws for invalid bit-packed offset', () => {
129 |     const buffer = new ArrayBuffer(1)
130 |     const view = new DataView(buffer)
131 |     view.setUint8(0, 0b00000011) // Bit-packed header for 3 values
132 |     const reader = { view, offset: 0 }
133 | 
134 |     const values = new Array(3)
135 |     expect(() => readRleBitPackedHybrid(reader, 1, values, 3))
136 |       .toThrow('parquet bitpack offset 1 out of range')
137 |   })
138 | })
139 | 
describe('bitWidth', () => {
  it('calculates bit widths', () => {
    // [value, expected number of bits]
    const cases = [
      [0, 0],
      [1, 1],
      [7, 3],
      [8, 4],
      [255, 8],
      [256, 9],
      [1023, 10],
      [1048575, 20],
    ]
    for (const [value, bits] of cases) {
      expect(bitWidth(value)).toBe(bits)
    }
  })
})
152 | 


--------------------------------------------------------------------------------
/src/filter.js:
--------------------------------------------------------------------------------
  1 | import { equals } from './utils.js'
  2 | 
  3 | /**
  4 |  * @import {ParquetQueryFilter, RowGroup} from '../src/types.js'
  5 |  */
  6 | 
  7 | /**
  8 |  * Returns an array of column names needed to evaluate the filter.
  9 |  *
 10 |  * @param {ParquetQueryFilter} [filter]
 11 |  * @returns {string[]}
 12 |  */
 13 | export function columnsNeededForFilter(filter) {
 14 |   if (!filter) return []
 15 |   /** @type {string[]} */
 16 |   const columns = []
 17 |   if ('$and' in filter && Array.isArray(filter.$and)) {
 18 |     columns.push(...filter.$and.flatMap(columnsNeededForFilter))
 19 |   } else if ('$or' in filter && Array.isArray(filter.$or)) {
 20 |     columns.push(...filter.$or.flatMap(columnsNeededForFilter))
 21 |   } else if ('$nor' in filter && Array.isArray(filter.$nor)) {
 22 |     columns.push(...filter.$nor.flatMap(columnsNeededForFilter))
 23 |   } else {
 24 |     columns.push(...Object.keys(filter))
 25 |   }
 26 |   return columns
 27 | }
 28 | 
 29 | /**
 30 |  * Match a record against a query filter
 31 |  *
 32 |  * @param {Record<string, any>} record
 33 |  * @param {ParquetQueryFilter} filter
 34 |  * @param {boolean} [strict]
 35 |  * @returns {boolean}
 36 |  */
 37 | export function matchFilter(record, filter, strict = true) {
 38 |   if ('$and' in filter && Array.isArray(filter.$and)) {
 39 |     return filter.$and.every(subQuery => matchFilter(record, subQuery, strict))
 40 |   }
 41 |   if ('$or' in filter && Array.isArray(filter.$or)) {
 42 |     return filter.$or.some(subQuery => matchFilter(record, subQuery, strict))
 43 |   }
 44 |   if ('$nor' in filter && Array.isArray(filter.$nor)) {
 45 |     return !filter.$nor.some(subQuery => matchFilter(record, subQuery, strict))
 46 |   }
 47 | 
 48 |   return Object.entries(filter).every(([field, condition]) => {
 49 |     const value = record[field]
 50 | 
 51 |     // implicit $eq for non-object conditions
 52 |     if (typeof condition !== 'object' || condition === null || Array.isArray(condition)) {
 53 |       return equals(value, condition, strict)
 54 |     }
 55 | 
 56 |     return Object.entries(condition || {}).every(([operator, target]) => {
 57 |       if (operator === '$gt') return value > target
 58 |       if (operator === '$gte') return value >= target
 59 |       if (operator === '$lt') return value < target
 60 |       if (operator === '$lte') return value <= target
 61 |       if (operator === '$eq') return equals(value, target, strict)
 62 |       if (operator === '$ne') return !equals(value, target, strict)
 63 |       if (operator === '$in') return Array.isArray(target) && target.includes(value)
 64 |       if (operator === '$nin') return Array.isArray(target) && !target.includes(value)
 65 |       if (operator === '$not') return !matchFilter({ [field]: value }, { [field]: target }, strict)
 66 |       return true
 67 |     })
 68 |   })
 69 | }
 70 | 
 71 | /**
 72 |  * Check if a row group can be skipped based on filter and column statistics.
 73 |  *
 74 |  * @param {object} options
 75 |  * @param {RowGroup} options.rowGroup
 76 |  * @param {string[]} options.physicalColumns
 77 |  * @param {ParquetQueryFilter | undefined} options.filter
 78 |  * @param {boolean} [options.strict]
 79 |  * @returns {boolean} true if the row group can be skipped
 80 |  */
 81 | export function canSkipRowGroup({ rowGroup, physicalColumns, filter, strict = true }) {
 82 |   if (!filter) return false
 83 | 
 84 |   // Handle logical operators
 85 |   if ('$and' in filter && Array.isArray(filter.$and)) {
 86 |     // For AND, we can skip if ANY condition allows skipping
 87 |     return filter.$and.some(subFilter => canSkipRowGroup({ rowGroup, physicalColumns, filter: subFilter, strict }))
 88 |   }
 89 |   if ('$or' in filter && Array.isArray(filter.$or)) {
 90 |     // For OR, we can skip only if ALL conditions allow skipping
 91 |     return filter.$or.every(subFilter => canSkipRowGroup({ rowGroup, physicalColumns, filter: subFilter, strict }))
 92 |   }
 93 |   if ('$nor' in filter && Array.isArray(filter.$nor)) {
 94 |     // For NOR, we can skip if none of the conditions allow skipping
 95 |     // This is complex, so we'll be conservative and not skip
 96 |     return false
 97 |   }
 98 | 
 99 |   // Check column filters
100 |   for (const [field, condition] of Object.entries(filter)) {
101 |     // Find the column chunk for this field
102 |     const columnIndex = physicalColumns.indexOf(field)
103 |     if (columnIndex === -1) continue
104 | 
105 |     const stats = rowGroup.columns[columnIndex].meta_data?.statistics
106 |     if (!stats) continue // No statistics available, can't skip
107 | 
108 |     const { min, max, min_value, max_value } = stats
109 |     const minVal = min_value !== undefined ? min_value : min
110 |     const maxVal = max_value !== undefined ? max_value : max
111 | 
112 |     if (minVal === undefined || maxVal === undefined) continue
113 | 
114 |     // Handle operators
115 |     for (const [operator, target] of Object.entries(condition || {})) {
116 |       if (operator === '$gt' && maxVal <= target) return true
117 |       if (operator === '$gte' && maxVal < target) return true
118 |       if (operator === '$lt' && minVal >= target) return true
119 |       if (operator === '$lte' && minVal > target) return true
120 |       if (operator === '$eq' && (target < minVal || target > maxVal)) return true
121 |       if (operator === '$ne' && equals(minVal, maxVal, strict) && equals(minVal, target, strict)) return true
122 |       if (operator === '$in' && Array.isArray(target) && target.every(v => v < minVal || v > maxVal)) return true
123 |       if (operator === '$nin' && Array.isArray(target) && equals(minVal, maxVal, strict) && target.includes(minVal)) return true
124 |     }
125 |   }
126 | 
127 |   return false
128 | }
129 | 


--------------------------------------------------------------------------------