├── tests ├── __init__.py ├── core │ ├── __init__.py │ ├── test_array_reader.py │ ├── test_record_batch_reader.py │ ├── test_list_flatten.py │ ├── test_misc.py │ ├── test_schema.py │ ├── test_record_batch.py │ ├── test_list_offsets.py │ ├── test_struct_field.py │ ├── test_data_type.py │ ├── test_chunked_array.py │ ├── test_buffer_protocol.py │ ├── test_ffi.py │ ├── test_table.py │ └── test_constructors.py ├── io │ ├── __init__.py │ ├── test_parquet.py │ └── test_ipc.py ├── compute │ ├── test_arith.py │ └── test_aggregate.py └── test_dictionary.py ├── .python-version ├── docs ├── index.md └── api │ ├── compute.md │ ├── core │ ├── array.md │ ├── field.md │ ├── scalar.md │ ├── schema.md │ ├── table.md │ ├── types.md │ ├── array-reader.md │ ├── chunked-array.md │ ├── record-batch.md │ ├── datatype.md │ ├── record-batch-reader.md │ ├── constructors.md │ └── accessors.md │ └── io │ ├── csv.md │ ├── parquet.md │ ├── json.md │ └── arrow-ipc.md ├── arro3-io ├── python │ └── arro3 │ │ └── io │ │ ├── py.typed │ │ ├── __init__.py │ │ ├── store.pyi │ │ ├── _io.pyi │ │ ├── _ipc.pyi │ │ ├── _json.pyi │ │ ├── _pyo3_object_store.pyi │ │ └── _csv.pyi ├── README.md ├── pyproject.toml ├── Cargo.toml └── src │ ├── error.rs │ ├── lib.rs │ ├── json.rs │ ├── ipc.rs │ └── utils.rs ├── arro3-core ├── python │ └── arro3 │ │ └── core │ │ ├── py.typed │ │ ├── __init__.py │ │ ├── _buffer.pyi │ │ ├── _field.pyi │ │ ├── _scalar.pyi │ │ ├── _record_batch_reader.pyi │ │ ├── _array_reader.pyi │ │ ├── types.py │ │ ├── _schema.pyi │ │ └── _array.pyi ├── src │ ├── accessors │ │ ├── mod.rs │ │ ├── struct_field.rs │ │ ├── list_flatten.rs │ │ ├── list_offsets.rs │ │ └── dictionary.rs │ └── lib.rs ├── README.md ├── pyproject.toml └── Cargo.toml ├── arro3-compute ├── python │ └── arro3 │ │ └── compute │ │ ├── py.typed │ │ ├── __init__.py │ │ ├── types.py │ │ ├── _aggregate.pyi │ │ ├── _filter.pyi │ │ ├── _dictionary.pyi │ │ ├── _compute.pyi │ │ ├── _arith.pyi │ │ ├── _boolean.pyi │ │ ├── _take.pyi │ │ ├── _cast.pyi │ │ ├── enums.py │ │ └── _temporal.pyi ├── README.md ├── src │ ├── concat.rs │ ├── take.rs │ ├── cast.rs │ ├── boolean.rs │ ├── filter.rs │ ├── arith.rs │ ├── lib.rs │ ├── dictionary.rs │ └── temporal.rs ├── pyproject.toml └── Cargo.toml ├── pyo3-arrow ├── src │ ├── interop │ │ ├── mod.rs │ │ └── numpy │ │ │ ├── mod.rs │ │ │ └── to_numpy.rs │ ├── ffi │ │ ├── to_python │ │ │ ├── mod.rs │ │ │ ├── nanoarrow.rs │ │ │ ├── chunked.rs │ │ │ └── utils.rs │ │ ├── from_python │ │ │ ├── mod.rs │ │ │ ├── field.rs │ │ │ ├── table.rs │ │ │ ├── datatypes.rs │ │ │ ├── schema.rs │ │ │ ├── chunked.rs │ │ │ ├── array_reader.rs │ │ │ ├── scalar.rs │ │ │ ├── record_batch_reader.rs │ │ │ ├── record_batch.rs │ │ │ ├── array.rs │ │ │ ├── input.rs │ │ │ ├── ffi_stream.rs │ │ │ └── utils.rs │ │ └── mod.rs │ ├── lib.rs │ ├── utils.rs │ └── error.rs └── Cargo.toml ├── .github └── workflows │ ├── conventional-commits.yml │ ├── ci.yml │ ├── test-python.yml │ ├── docs.yml │ └── pyodide-wheels.yml ├── .pre-commit-config.yaml ├── pyproject.toml ├── LICENSE_MIT ├── DEVELOP.md ├── Cargo.toml ├── .gitignore └── mkdocs.yml /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/io/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.11 2 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | ../README.md -------------------------------------------------------------------------------- /arro3-io/python/arro3/io/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /arro3-core/python/arro3/core/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /arro3-compute/python/arro3/compute/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pyo3-arrow/src/interop/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod numpy; 2 | -------------------------------------------------------------------------------- /docs/api/compute.md: -------------------------------------------------------------------------------- 1 | # arro3.compute 2 | 3 | ::: arro3.compute 4 | -------------------------------------------------------------------------------- /pyo3-arrow/src/interop/numpy/mod.rs: -------------------------------------------------------------------------------- 1 | pub(crate) mod from_numpy; 2 | pub(crate) mod to_numpy; 3 | -------------------------------------------------------------------------------- /docs/api/core/array.md: -------------------------------------------------------------------------------- 1 | # Array 2 | 3 | ::: arro3.core.Array 4 | options: 5 | members: 6 | -------------------------------------------------------------------------------- /docs/api/core/field.md: -------------------------------------------------------------------------------- 1 | # Field 2 | 3 | ::: arro3.core.Field 4 | options: 5 | members: 6 | -------------------------------------------------------------------------------- /docs/api/core/scalar.md: -------------------------------------------------------------------------------- 1 | # Scalar 2 | 3 | ::: arro3.core.Scalar 4 | options: 5 | members: 6 | -------------------------------------------------------------------------------- /docs/api/core/schema.md: -------------------------------------------------------------------------------- 1 | # Schema 2 | 3 | ::: arro3.core.Schema 4 | options: 5 | members: 6 | -------------------------------------------------------------------------------- /docs/api/core/table.md: -------------------------------------------------------------------------------- 1 | # Table 2 | 3 | ::: arro3.core.Table 4 | options: 5 | members: 6 | -------------------------------------------------------------------------------- /docs/api/core/types.md: -------------------------------------------------------------------------------- 1 | # types 2 | 3 | ::: arro3.core.types 4 | options: 5 | show_if_no_docstring: true 6 | -------------------------------------------------------------------------------- /docs/api/core/array-reader.md: 
-------------------------------------------------------------------------------- 1 | # ArrayReader 2 | 3 | ::: arro3.core.ArrayReader 4 | options: 5 | members: 6 | -------------------------------------------------------------------------------- /docs/api/core/chunked-array.md: -------------------------------------------------------------------------------- 1 | # ChunkedArray 2 | 3 | ::: arro3.core.ChunkedArray 4 | options: 5 | members: 6 | -------------------------------------------------------------------------------- /docs/api/core/record-batch.md: -------------------------------------------------------------------------------- 1 | # RecordBatch 2 | 3 | ::: arro3.core.RecordBatch 4 | options: 5 | members: 6 | -------------------------------------------------------------------------------- /docs/api/io/csv.md: -------------------------------------------------------------------------------- 1 | # CSV 2 | 3 | ::: arro3.io.infer_csv_schema 4 | ::: arro3.io.read_csv 5 | ::: arro3.io.write_csv 6 | -------------------------------------------------------------------------------- /docs/api/core/datatype.md: -------------------------------------------------------------------------------- 1 | # DataType 2 | 3 | ::: arro3.core.DataType 4 | options: 5 | show_if_no_docstring: true 6 | -------------------------------------------------------------------------------- /docs/api/core/record-batch-reader.md: -------------------------------------------------------------------------------- 1 | # RecordBatchReader 2 | 3 | ::: arro3.core.RecordBatchReader 4 | options: 5 | members: 6 | -------------------------------------------------------------------------------- /docs/api/io/parquet.md: -------------------------------------------------------------------------------- 1 | # Parquet 2 | 3 | ::: arro3.io.read_parquet 4 | ::: arro3.io.read_parquet_async 5 | ::: arro3.io.write_parquet 6 | -------------------------------------------------------------------------------- /arro3-core/python/arro3/core/__init__.py: -------------------------------------------------------------------------------- 1 | from ._core import * 2 | from ._core import ___version 3 | 4 | __version__: str = ___version() 5 | -------------------------------------------------------------------------------- /arro3-io/python/arro3/io/__init__.py: -------------------------------------------------------------------------------- 1 | from ._io import * 2 | from ._io import ___version, store 3 | 4 | __version__: str = ___version() 5 | -------------------------------------------------------------------------------- /docs/api/io/json.md: -------------------------------------------------------------------------------- 1 | # JSON 2 | 3 | ::: arro3.io.infer_json_schema 4 | ::: arro3.io.read_json 5 | ::: arro3.io.write_json 6 | ::: arro3.io.write_ndjson 7 | -------------------------------------------------------------------------------- /arro3-core/src/accessors/mod.rs: -------------------------------------------------------------------------------- 1 | pub(crate) mod dictionary; 2 | pub(crate) mod list_flatten; 3 | pub(crate) mod list_offsets; 4 | pub(crate) mod struct_field; 5 | -------------------------------------------------------------------------------- /docs/api/io/arrow-ipc.md: -------------------------------------------------------------------------------- 1 | # Arrow IPC 2 | 3 | ::: arro3.io.read_ipc 4 | ::: arro3.io.read_ipc_stream 5 | ::: arro3.io.write_ipc 6 | ::: arro3.io.write_ipc_stream 7 | 
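A minimal round-trip sketch, not authoritative: it assumes `write_ipc` and `read_ipc` take `(data, path)` arguments and that the returned reader can be consumed by pyarrow, mirroring how `write_parquet`/`read_parquet` are exercised in this repo's tests. Check the signatures rendered above for the actual API.

```py
import pyarrow as pa
from arro3.io import read_ipc, write_ipc

table = pa.table({"a": [1, 2, 3, 4]})
write_ipc(table, "example.arrow")  # assumed (data, path) argument order
round_tripped = pa.table(read_ipc("example.arrow"))
assert table == round_tripped
```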
-------------------------------------------------------------------------------- /arro3-compute/python/arro3/compute/__init__.py: -------------------------------------------------------------------------------- 1 | from . import enums, types 2 | from ._compute import * 3 | from ._compute import ___version 4 | 5 | __version__: str = ___version() 6 | -------------------------------------------------------------------------------- /docs/api/core/constructors.md: -------------------------------------------------------------------------------- 1 | # Constructors 2 | 3 | ::: arro3.core 4 | options: 5 | members: 6 | - fixed_size_list_array 7 | - list_array 8 | - struct_array 9 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/to_python/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod chunked; 2 | pub mod ffi_stream; 3 | pub mod nanoarrow; 4 | mod utils; 5 | 6 | pub use utils::{to_array_pycapsules, to_schema_pycapsule, to_stream_pycapsule}; 7 | -------------------------------------------------------------------------------- /docs/api/core/accessors.md: -------------------------------------------------------------------------------- 1 | # Accessors 2 | 3 | ::: arro3.core 4 | options: 5 | members: 6 | - dictionary_dictionary 7 | - dictionary_indices 8 | - list_flatten 9 | - list_offsets 10 | - struct_field 11 | -------------------------------------------------------------------------------- /arro3-io/README.md: -------------------------------------------------------------------------------- 1 | # arro3-io 2 | 3 | A minimal Python library for [Apache Arrow](https://arrow.apache.org/docs/index.html), binding to the [Rust Arrow implementation](https://github.com/apache/arrow-rs). 4 | 5 | Consult the [documentation](https://kylebarron.dev/arro3/latest/). 6 | -------------------------------------------------------------------------------- /arro3-core/README.md: -------------------------------------------------------------------------------- 1 | # arro3-core 2 | 3 | A minimal Python library for [Apache Arrow](https://arrow.apache.org/docs/index.html), binding to the [Rust Arrow implementation](https://github.com/apache/arrow-rs). 4 | 5 | Consult the [documentation](https://kylebarron.dev/arro3/latest/). 6 | -------------------------------------------------------------------------------- /arro3-compute/README.md: -------------------------------------------------------------------------------- 1 | # arro3-compute 2 | 3 | A minimal Python library for [Apache Arrow](https://arrow.apache.org/docs/index.html), binding to the [Rust Arrow implementation](https://github.com/apache/arrow-rs). 4 | 5 | Consult the [documentation](https://kylebarron.dev/arro3/latest/). 6 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/from_python/mod.rs: -------------------------------------------------------------------------------- 1 | mod array; 2 | mod array_reader; 3 | mod chunked; 4 | mod datatypes; 5 | pub(crate) mod ffi_stream; 6 | mod field; 7 | mod input; 8 | mod record_batch; 9 | mod record_batch_reader; 10 | mod scalar; 11 | mod schema; 12 | mod table; 13 | pub(crate) mod utils; 14 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/mod.rs: -------------------------------------------------------------------------------- 1 | //! Utilities for managing Arrow FFI between Python and Rust. 
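//!
//! Data flows in two directions: the `from_python` submodule imports Arrow data from any
//! Python object implementing the Arrow PyCapsule Interface (`__arrow_c_schema__`,
//! `__arrow_c_array__`, `__arrow_c_stream__`), while the `to_python` submodule exports
//! Rust Arrow data back to Python as those same capsules (including `nanoarrow` wrappers).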
2 | 3 | pub(crate) mod from_python; 4 | pub(crate) mod to_python; 5 | 6 | pub use to_python::chunked::{ArrayIterator, ArrayReader}; 7 | pub use to_python::{to_array_pycapsules, to_schema_pycapsule, to_stream_pycapsule}; 8 | -------------------------------------------------------------------------------- /arro3-io/python/arro3/io/store.pyi: -------------------------------------------------------------------------------- 1 | # TODO: move to reusable types package 2 | from ._pyo3_object_store import AzureStore as AzureStore 3 | from ._pyo3_object_store import GCSStore as GCSStore 4 | from ._pyo3_object_store import HTTPStore as HTTPStore 5 | from ._pyo3_object_store import LocalStore as LocalStore 6 | from ._pyo3_object_store import MemoryStore as MemoryStore 7 | from ._pyo3_object_store import S3Store as S3Store 8 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/from_python/field.rs: -------------------------------------------------------------------------------- 1 | use crate::ffi::from_python::utils::call_arrow_c_schema; 2 | use crate::field::PyField; 3 | use pyo3::prelude::*; 4 | use pyo3::{PyAny, PyResult}; 5 | 6 | impl<'a> FromPyObject<'a> for PyField { 7 | fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult<Self> { 8 | let capsule = call_arrow_c_schema(ob)?; 9 | Self::from_arrow_pycapsule(&capsule) 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/from_python/table.rs: -------------------------------------------------------------------------------- 1 | use crate::ffi::from_python::utils::call_arrow_c_stream; 2 | use crate::table::PyTable; 3 | use pyo3::prelude::*; 4 | use pyo3::{PyAny, PyResult}; 5 | 6 | impl<'a> FromPyObject<'a> for PyTable { 7 | fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult<Self> { 8 | let capsule = call_arrow_c_stream(ob)?; 9 | Self::from_arrow_pycapsule(&capsule) 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/from_python/datatypes.rs: -------------------------------------------------------------------------------- 1 | use crate::ffi::from_python::utils::call_arrow_c_schema; 2 | use crate::PyDataType; 3 | use pyo3::prelude::*; 4 | use pyo3::{PyAny, PyResult}; 5 | 6 | impl<'a> FromPyObject<'a> for PyDataType { 7 | fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult<Self> { 8 | let capsule = call_arrow_c_schema(ob)?; 9 | Self::from_arrow_pycapsule(&capsule) 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/from_python/schema.rs: -------------------------------------------------------------------------------- 1 | use crate::ffi::from_python::utils::call_arrow_c_schema; 2 | use crate::schema::PySchema; 3 | use pyo3::prelude::*; 4 | use pyo3::{PyAny, PyResult}; 5 | 6 | impl<'a> FromPyObject<'a> for PySchema { 7 | fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult<Self> { 8 | let schema_ptr = call_arrow_c_schema(ob)?; 9 | Self::from_arrow_pycapsule(&schema_ptr) 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/from_python/chunked.rs: -------------------------------------------------------------------------------- 1 | use crate::chunked::PyChunkedArray; 2 | use crate::ffi::from_python::utils::call_arrow_c_stream; 3 | use pyo3::prelude::*; 4 | use pyo3::{PyAny, PyResult}; 5 | 6 | impl<'a> FromPyObject<'a> for PyChunkedArray { 7 | fn extract_bound(ob:
&Bound<'a, PyAny>) -> PyResult<Self> { 8 | let capsule = call_arrow_c_stream(ob)?; 9 | Self::from_arrow_pycapsule(&capsule) 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/from_python/array_reader.rs: -------------------------------------------------------------------------------- 1 | use crate::array_reader::PyArrayReader; 2 | use crate::ffi::from_python::utils::call_arrow_c_stream; 3 | use pyo3::prelude::*; 4 | use pyo3::{PyAny, PyResult}; 5 | 6 | impl<'a> FromPyObject<'a> for PyArrayReader { 7 | fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult<Self> { 8 | let capsule = call_arrow_c_stream(ob)?; 9 | Self::from_arrow_pycapsule(&capsule) 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/from_python/scalar.rs: -------------------------------------------------------------------------------- 1 | use crate::array::*; 2 | use crate::PyScalar; 3 | use pyo3::prelude::*; 4 | use pyo3::{PyAny, PyResult}; 5 | 6 | impl<'a> FromPyObject<'a> for PyScalar { 7 | fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult<Self> { 8 | let array = ob.extract::<PyArray>()?; 9 | let (array, field) = array.into_inner(); 10 | Self::try_new(array, field).map_err(|err| err.into()) 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/from_python/record_batch_reader.rs: -------------------------------------------------------------------------------- 1 | use crate::ffi::from_python::utils::call_arrow_c_stream; 2 | use crate::record_batch_reader::PyRecordBatchReader; 3 | use pyo3::prelude::*; 4 | use pyo3::{PyAny, PyResult}; 5 | 6 | impl<'a> FromPyObject<'a> for PyRecordBatchReader { 7 | fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult<Self> { 8 | let capsule = call_arrow_c_stream(ob)?; 9 | Self::from_arrow_pycapsule(&capsule) 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /.github/workflows/conventional-commits.yml: -------------------------------------------------------------------------------- 1 | name: PR Conventional Commit Validation 2 | 3 | on: 4 | pull_request_target: 5 | types: [opened, synchronize, reopened, edited] 6 | 7 | jobs: 8 | validate-pr-title: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: PR Conventional Commit Validation 12 | uses: ytanikin/pr-conventional-commits@1.4.0 13 | with: 14 | task_types: '["feat","fix","docs","test","ci","refactor","perf","chore","revert"]' 15 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/from_python/record_batch.rs: -------------------------------------------------------------------------------- 1 | use crate::ffi::from_python::utils::call_arrow_c_array; 2 | use crate::record_batch::PyRecordBatch; 3 | use pyo3::prelude::*; 4 | use pyo3::{PyAny, PyResult}; 5 | 6 | impl<'a> FromPyObject<'a> for PyRecordBatch { 7 | fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult<Self> { 8 | let (schema_capsule, array_capsule) = call_arrow_c_array(ob)?; 9 | Self::from_arrow_pycapsule(&schema_capsule, &array_capsule) 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /arro3-compute/src/concat.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use pyo3_arrow::error::PyArrowResult; 3 | use pyo3_arrow::{PyArray, PyChunkedArray}; 4 | 5 | #[pyfunction] 6 | pub fn concat(py: Python, input:
PyChunkedArray) -> PyArrowResult<PyObject> { 7 | let (chunks, field) = input.into_inner(); 8 | let array_refs = chunks.iter().map(|arr| arr.as_ref()).collect::<Vec<_>>(); 9 | let concatted = arrow_select::concat::concat(array_refs.as_slice())?; 10 | Ok(PyArray::new(concatted, field).to_arro3(py)?.unbind()) 11 | } 12 | -------------------------------------------------------------------------------- /arro3-core/python/arro3/core/_buffer.pyi: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | if sys.version_info >= (3, 12): 4 | from collections.abc import Buffer as _Buffer 5 | else: 6 | from typing_extensions import Buffer as _Buffer 7 | 8 | class Buffer(_Buffer): 9 | """An Arrow Buffer""" 10 | def __init__(self, buffer) -> None: ... 11 | def __buffer__(self, flags: int) -> memoryview: ... 12 | def __len__(self) -> int: ... 13 | def to_bytes(self) -> bytes: 14 | """Copy this buffer into a Python `bytes` object.""" 15 | -------------------------------------------------------------------------------- /tests/core/test_array_reader.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from arro3.core import ArrayReader 3 | 4 | 5 | class CustomException(Exception): 6 | pass 7 | 8 | 9 | class ArrowCStreamFails: 10 | def __arrow_c_stream__(self, requested_schema=None): 11 | raise CustomException 12 | 13 | 14 | def test_array_reader_import_preserve_exception(): 15 | """https://github.com/kylebarron/arro3/issues/325""" 16 | 17 | c_stream_obj = ArrowCStreamFails() 18 | with pytest.raises(CustomException): 19 | ArrayReader.from_arrow(c_stream_obj) 20 | -------------------------------------------------------------------------------- /arro3-compute/python/arro3/compute/types.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Literal 4 | 5 | DatePartT = Literal[ 6 | "quarter", 7 | "year", 8 | "month", 9 | "week", 10 | "day", 11 | "dayofweeksunday0", 12 | "dayofweekmonday0", 13 | "dayofyear", 14 | "hour", 15 | "minute", 16 | "second", 17 | "millisecond", 18 | "microsecond", 19 | "nanosecond", 20 | ] 21 | """ 22 | Acceptable strings to be passed into the `part` parameter for 23 | [`date_part`][arro3.compute.date_part]. 24 | """ 25 | -------------------------------------------------------------------------------- /arro3-compute/src/take.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use pyo3_arrow::error::PyArrowResult; 3 | use pyo3_arrow::PyArray; 4 | 5 | /// Take elements by index from an Array, creating a new Array from those 6 | /// indexes. 7 | #[pyfunction] 8 | pub fn take(py: Python, values: PyArray, indices: PyArray) -> PyArrowResult<PyObject> { 9 | let output_array = 10 | py.allow_threads(|| arrow_select::take::take(values.as_ref(), indices.as_ref(), None))?; 11 | Ok(PyArray::new(output_array, values.field().clone()) 12 | .to_arro3(py)? 13 | .unbind()) 14 | } 15 | -------------------------------------------------------------------------------- /arro3-compute/python/arro3/compute/_aggregate.pyi: -------------------------------------------------------------------------------- 1 | from arro3.core import Scalar 2 | from arro3.core.types import ArrayInput, ArrowStreamExportable 3 | 4 | def max(input: ArrayInput | ArrowStreamExportable) -> Scalar: 5 | """ 6 | Returns the max of values in the array.
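
    A usage sketch based on this repo's own tests (`tests/compute/test_arith.py` constructs arrays the same way); illustrative, not normative:

        from arro3.compute import max
        from arro3.core import Array, DataType

        assert max(Array([1, 5, 3], DataType.int16())).as_py() == 5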
7 | """ 8 | 9 | def min(input: ArrayInput | ArrowStreamExportable) -> Scalar: 10 | """ 11 | Returns the min of values in the array. 12 | """ 13 | 14 | def sum(input: ArrayInput | ArrowStreamExportable) -> Scalar: 15 | """ 16 | Returns the sum of values in the array. 17 | """ 18 | -------------------------------------------------------------------------------- /tests/core/test_record_batch_reader.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from arro3.core import RecordBatchReader 3 | 4 | 5 | class CustomException(Exception): 6 | pass 7 | 8 | 9 | class ArrowCStreamFails: 10 | def __arrow_c_stream__(self, requested_schema=None): 11 | raise CustomException 12 | 13 | 14 | def test_record_batch_reader_import_preserve_exception(): 15 | """https://github.com/kylebarron/arro3/issues/325""" 16 | 17 | c_stream_obj = ArrowCStreamFails() 18 | with pytest.raises(CustomException): 19 | RecordBatchReader.from_arrow(c_stream_obj) 20 | -------------------------------------------------------------------------------- /arro3-io/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin>=1.4.0,<2.0"] 3 | build-backend = "maturin" 4 | 5 | [project] 6 | name = "arro3-io" 7 | requires-python = ">=3.9" 8 | dependencies = ["arro3-core"] 9 | classifiers = [ 10 | "Programming Language :: Rust", 11 | "Programming Language :: Python :: Implementation :: CPython", 12 | "Programming Language :: Python :: Implementation :: PyPy", 13 | ] 14 | dynamic = ["version"] 15 | 16 | [tool.maturin] 17 | features = ["pyo3/extension-module"] 18 | module-name = "arro3.io._io" 19 | python-source = "python" 20 | strip = true 21 | -------------------------------------------------------------------------------- /tests/compute/test_arith.py: -------------------------------------------------------------------------------- 1 | import arro3.compute as ac 2 | import pyarrow as pa 3 | from arro3.core import Array, DataType 4 | 5 | 6 | def test_add(): 7 | arr1 = Array([1, 2, 3], DataType.int16()) 8 | assert ac.min(arr1).as_py() == 1 9 | 10 | arr2 = Array([3, 2, 0], DataType.int16()) 11 | assert ac.min(arr2).as_py() == 0 12 | 13 | add1 = ac.add(arr1, arr2) 14 | assert pa.array(add1) == pa.array(Array([4, 4, 3], DataType.int16())) 15 | 16 | s = arr1[0] 17 | add2 = ac.add(arr1, s) 18 | assert pa.array(add2) == pa.array(Array([2, 3, 4], DataType.int16())) 19 | -------------------------------------------------------------------------------- /arro3-compute/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin>=1.4.0,<2.0"] 3 | build-backend = "maturin" 4 | 5 | [project] 6 | name = "arro3-compute" 7 | requires-python = ">=3.9" 8 | dependencies = ["arro3-core"] 9 | classifiers = [ 10 | "Programming Language :: Rust", 11 | "Programming Language :: Python :: Implementation :: CPython", 12 | "Programming Language :: Python :: Implementation :: PyPy", 13 | ] 14 | dynamic = ["version"] 15 | 16 | [tool.maturin] 17 | features = ["pyo3/extension-module"] 18 | module-name = "arro3.compute._compute" 19 | python-source = "python" 20 | strip = true 21 | -------------------------------------------------------------------------------- /arro3-core/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin>=1.4.0,<2.0"] 3 | build-backend = "maturin" 4 | 5 | 
[project] 6 | name = "arro3-core" 7 | requires-python = ">=3.9" 8 | dependencies = ["typing-extensions; python_version < '3.12'"] 9 | classifiers = [ 10 | "Programming Language :: Rust", 11 | "Programming Language :: Python :: Implementation :: CPython", 12 | "Programming Language :: Python :: Implementation :: PyPy", 13 | ] 14 | dynamic = ["version"] 15 | 16 | [tool.maturin] 17 | features = ["pyo3/extension-module"] 18 | module-name = "arro3.core._core" 19 | python-source = "python" 20 | strip = true 21 | -------------------------------------------------------------------------------- /arro3-io/python/arro3/io/_io.pyi: -------------------------------------------------------------------------------- 1 | from ._csv import infer_csv_schema, read_csv, write_csv 2 | from ._ipc import read_ipc, read_ipc_stream, write_ipc, write_ipc_stream 3 | from ._json import infer_json_schema, read_json, write_json, write_ndjson 4 | from ._parquet import read_parquet, read_parquet_async, write_parquet 5 | 6 | __all__ = [ 7 | "infer_csv_schema", 8 | "read_csv", 9 | "write_csv", 10 | "infer_json_schema", 11 | "read_json", 12 | "write_json", 13 | "write_ndjson", 14 | "read_ipc", 15 | "read_ipc_stream", 16 | "write_ipc", 17 | "write_ipc_stream", 18 | "read_parquet", 19 | "read_parquet_async", 20 | "write_parquet", 21 | ] 22 | -------------------------------------------------------------------------------- /tests/core/test_list_flatten.py: -------------------------------------------------------------------------------- 1 | import pyarrow as pa 2 | from arro3.core import list_flatten 3 | 4 | 5 | def test_list_flatten(): 6 | list_arr = pa.array([[1, 2], [3, 4]]) 7 | out = pa.array(list_flatten(list_arr)) 8 | assert out == pa.array([1, 2, 3, 4]) 9 | 10 | 11 | def test_list_flatten_sliced_end(): 12 | list_arr = pa.array([[1, 2], [3, 4]]) 13 | sliced = list_arr.slice(1, 2) 14 | out = pa.array(list_flatten(sliced)) 15 | assert out == pa.array([3, 4]) 16 | 17 | 18 | def test_list_flatten_sliced_start(): 19 | list_arr = pa.array([[1, 2], [3, 4]]) 20 | sliced = list_arr.slice(0, 1) 21 | out = pa.array(list_flatten(sliced)) 22 | assert out == pa.array([1, 2]) 23 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | 4 | # Default to Python 3 5 | default_language_version: 6 | python: python3 7 | 8 | # Optionally both commit and push 9 | default_stages: [pre-commit] 10 | 11 | repos: 12 | - repo: https://github.com/pre-commit/pre-commit-hooks 13 | rev: v2.4.0 14 | hooks: 15 | - id: trailing-whitespace 16 | - id: end-of-file-fixer 17 | - id: check-added-large-files 18 | args: ["--maxkb=500"] 19 | 20 | - repo: https://github.com/astral-sh/ruff-pre-commit 21 | rev: v0.12.10 22 | hooks: 23 | - id: ruff 24 | args: ["--fix"] 25 | - id: ruff-format 26 | -------------------------------------------------------------------------------- /tests/core/test_misc.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | 4 | 5 | def test_numpy_backed_array_to_pyarrow(): 6 | # Passing a numpy-backed `arro3.core.Array` to `pyarrow.Array` 7 | # caused a segfault at interpreter shutdown. 
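    # The snippet below runs in a subprocess so that, if the bug regresses, only the child interpreter crashes and `check_call` raises, rather than the crash taking down the whole pytest run.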
8 | # Affected versions: 0.4.0, 0.4.1 9 | # See: [#230](https://github.com/kylebarron/arro3/issues/230) 10 | code = ( 11 | "import numpy as np\n" 12 | "import pyarrow as pa\n" 13 | "from arro3.core import Array\n" 14 | "\n" 15 | "numpy_arr = np.array([0, 1, 2, 3], dtype=np.float64)\n" 16 | "arro3_arr = Array(numpy_arr)\n" 17 | "pyarrow_arr = pa.array(arro3_arr)\n" 18 | ) 19 | subprocess.check_call([sys.executable, "-c", code]) 20 | -------------------------------------------------------------------------------- /pyo3-arrow/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![doc = include_str!("../README.md")] 2 | #![deny(missing_docs)] 3 | 4 | mod array; 5 | mod array_reader; 6 | #[cfg(feature = "buffer_protocol")] 7 | pub mod buffer; 8 | mod chunked; 9 | mod datatypes; 10 | pub mod error; 11 | pub mod export; 12 | pub mod ffi; 13 | mod field; 14 | pub mod input; 15 | mod interop; 16 | mod record_batch; 17 | mod record_batch_reader; 18 | mod scalar; 19 | mod schema; 20 | mod table; 21 | mod utils; 22 | 23 | pub use array::PyArray; 24 | pub use array_reader::PyArrayReader; 25 | pub use chunked::PyChunkedArray; 26 | pub use datatypes::PyDataType; 27 | pub use field::PyField; 28 | pub use record_batch::PyRecordBatch; 29 | pub use record_batch_reader::PyRecordBatchReader; 30 | pub use scalar::PyScalar; 31 | pub use schema::PySchema; 32 | pub use table::PyTable; 33 | -------------------------------------------------------------------------------- /tests/core/test_schema.py: -------------------------------------------------------------------------------- 1 | import pyarrow as pa 2 | import pytest 3 | from arro3.core import Field, Schema, Table 4 | 5 | 6 | def test_schema_iterable(): 7 | a = pa.chunked_array([[1, 2, 3, 4]]) 8 | b = pa.chunked_array([["a", "b", "c", "d"]]) 9 | table = Table.from_pydict({"a": a, "b": b}) 10 | schema = table.schema 11 | for field in schema: 12 | assert isinstance(field, Field) 13 | assert field.name in ["a", "b"] 14 | 15 | 16 | class CustomException(Exception): 17 | pass 18 | 19 | 20 | class ArrowCSchemaFails: 21 | def __arrow_c_schema__(self): 22 | raise CustomException 23 | 24 | 25 | def test_schema_import_preserve_exception(): 26 | """https://github.com/kylebarron/arro3/issues/325""" 27 | 28 | c_stream_obj = ArrowCSchemaFails() 29 | with pytest.raises(CustomException): 30 | Schema.from_arrow(c_stream_obj) 31 | -------------------------------------------------------------------------------- /arro3-compute/python/arro3/compute/_filter.pyi: -------------------------------------------------------------------------------- 1 | from typing import overload 2 | 3 | from arro3.core import Array, ArrayReader 4 | from arro3.core.types import ArrayInput, ArrowStreamExportable 5 | 6 | @overload 7 | def filter( 8 | values: ArrayInput, 9 | predicate: ArrayInput, 10 | ) -> Array: ... 11 | @overload 12 | def filter( 13 | values: ArrowStreamExportable, 14 | predicate: ArrowStreamExportable, 15 | ) -> ArrayReader: ... 16 | def filter( 17 | values: ArrayInput | ArrowStreamExportable, 18 | predicate: ArrayInput | ArrowStreamExportable, 19 | ) -> Array | ArrayReader: 20 | """ 21 | Returns a filtered `values` array where the corresponding elements of 22 | `predicate` are `true`. 23 | 24 | If `values` is an Array, an `Array` will be returned. If `values` is a `ChunkedArray` 25 | or `ArrayReader`, an `ArrayReader` will be returned.
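
    A usage sketch (hedged: pyarrow arrays are used as inputs here, which works because `ArrayInput` accepts any object implementing `__arrow_c_array__`):

        import pyarrow as pa
        from arro3.compute import filter

        values = pa.array([1, 2, 3, 4])
        predicate = pa.array([True, False, True, False])
        out = filter(values, predicate)  # an Array holding [1, 3]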
26 | """ 27 | -------------------------------------------------------------------------------- /pyo3-arrow/src/utils.rs: -------------------------------------------------------------------------------- 1 | use arrow_cast::display::FormatOptions; 2 | use arrow_schema::Schema; 3 | 4 | /// Check whether two schemas are equal 5 | /// 6 | /// This allows schemas to have different top-level metadata, as well as different nested field 7 | /// names and keys. 8 | pub(crate) fn schema_equals(left: &Schema, right: &Schema) -> bool { 9 | left.fields 10 | .iter() 11 | .zip(right.fields.iter()) 12 | .all(|(left_field, right_field)| { 13 | left_field.name() == right_field.name() 14 | && left_field 15 | .data_type() 16 | .equals_datatype(right_field.data_type()) 17 | }) 18 | } 19 | 20 | pub(crate) fn default_repr_options<'a>() -> FormatOptions<'a> { 21 | FormatOptions::new() 22 | .with_display_error(true) 23 | .with_null("null") 24 | .with_types_info(true) 25 | } 26 | -------------------------------------------------------------------------------- /arro3-compute/python/arro3/compute/_dictionary.pyi: -------------------------------------------------------------------------------- 1 | from typing import overload 2 | 3 | from arro3.core import Array, ArrayReader 4 | from arro3.core.types import ArrayInput, ArrowStreamExportable 5 | 6 | @overload 7 | def dictionary_encode(array: ArrayInput) -> Array: ... 8 | @overload 9 | def dictionary_encode(array: ArrowStreamExportable) -> ArrayReader: ... 10 | def dictionary_encode( 11 | array: ArrayInput | ArrowStreamExportable, 12 | ) -> Array | ArrayReader: 13 | """ 14 | Dictionary-encode array. 15 | 16 | Return a dictionary-encoded version of the input array. This function does nothing if the input is already a dictionary array. 17 | 18 | Note: for stream input, each output array will not necessarily have the same dictionary. 19 | 20 | Args: 21 | array: Argument to compute function. 22 | 23 | Returns: 24 | The dictionary-encoded array. 25 | """ 26 | -------------------------------------------------------------------------------- /arro3-core/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "arro3-core" 3 | version = { workspace = true } 4 | authors = { workspace = true } 5 | edition = { workspace = true } 6 | description = "Core library for representing Arrow data in Python." 
7 | readme = "README.md" 8 | repository = { workspace = true } 9 | homepage = { workspace = true } 10 | license = { workspace = true } 11 | keywords = { workspace = true } 12 | categories = { workspace = true } 13 | rust-version = { workspace = true } 14 | 15 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 16 | [lib] 17 | name = "_core" 18 | crate-type = ["cdylib"] 19 | 20 | [dependencies] 21 | arrow-array = { workspace = true } 22 | arrow-buffer = { workspace = true } 23 | arrow-cast = { workspace = true, features = ["prettyprint"] } 24 | arrow-schema = { workspace = true } 25 | pyo3-arrow = { workspace = true } 26 | pyo3 = { workspace = true } 27 | -------------------------------------------------------------------------------- /tests/core/test_record_batch.py: -------------------------------------------------------------------------------- 1 | import pyarrow as pa 2 | import pytest 3 | from arro3.core import RecordBatch 4 | 5 | 6 | def test_nonempty_batch_no_columns(): 7 | batch = pa.record_batch({"a": [1, 2, 3, 4]}).select([]) 8 | assert len(batch) == 4 9 | assert batch.num_columns == 0 10 | arro3_batch = RecordBatch.from_arrow(batch) 11 | retour = pa.record_batch(arro3_batch) 12 | assert batch == retour 13 | 14 | 15 | class CustomException(Exception): 16 | pass 17 | 18 | 19 | class ArrowCArrayFails: 20 | def __arrow_c_array__(self, requested_schema=None): 21 | raise CustomException 22 | 23 | 24 | def test_record_batch_import_preserve_exception(): 25 | """https://github.com/kylebarron/arro3/issues/325""" 26 | 27 | c_stream_obj = ArrowCArrayFails() 28 | with pytest.raises(CustomException): 29 | RecordBatch.from_arrow(c_stream_obj) 30 | 31 | with pytest.raises(CustomException): 32 | RecordBatch(c_stream_obj) 33 | -------------------------------------------------------------------------------- /arro3-compute/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "arro3-compute" 3 | version = { workspace = true } 4 | authors = { workspace = true } 5 | edition = { workspace = true } 6 | description = "Rust-based compute kernels for Arrow in Python." 
7 | readme = "README.md" 8 | repository = { workspace = true } 9 | homepage = { workspace = true } 10 | license = { workspace = true } 11 | keywords = { workspace = true } 12 | categories = { workspace = true } 13 | rust-version = { workspace = true } 14 | 15 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 16 | [lib] 17 | name = "_compute" 18 | crate-type = ["cdylib"] 19 | 20 | [dependencies] 21 | arrow-arith = { workspace = true } 22 | arrow-array = { workspace = true } 23 | arrow-buffer = { workspace = true } 24 | arrow-cast = { workspace = true } 25 | arrow-schema = { workspace = true } 26 | arrow-select = { workspace = true } 27 | pyo3 = { workspace = true } 28 | pyo3-arrow = { workspace = true } 29 | thiserror = { workspace = true } 30 | -------------------------------------------------------------------------------- /tests/core/test_list_offsets.py: -------------------------------------------------------------------------------- 1 | import pyarrow as pa 2 | from arro3.core import list_offsets 3 | 4 | 5 | def test_list_offsets(): 6 | list_arr = pa.array([[1, 2], [3, 4]]) 7 | out = pa.array(list_offsets(list_arr)) 8 | assert out == list_arr.offsets 9 | 10 | 11 | def test_list_offsets_sliced_end(): 12 | list_arr = pa.array([[1, 2], [3, 4]]) 13 | sliced = list_arr.slice(1, 1) 14 | 15 | out = pa.array(list_offsets(sliced, logical=False)) 16 | assert out == pa.array([2, 4], type=pa.int32()) 17 | 18 | out = pa.array(list_offsets(sliced, logical=True)) 19 | assert out == pa.array([0, 2], type=pa.int32()) 20 | 21 | 22 | def test_list_offsets_sliced_start(): 23 | list_arr = pa.array([[1, 2], [3, 4]]) 24 | sliced = list_arr.slice(0, 1) 25 | 26 | out = pa.array(list_offsets(sliced, logical=False)) 27 | assert out == pa.array([0, 2], type=pa.int32()) 28 | 29 | out = pa.array(list_offsets(sliced, logical=True)) 30 | assert out == pa.array([0, 2], type=pa.int32()) 31 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/from_python/array.rs: -------------------------------------------------------------------------------- 1 | use crate::array::*; 2 | #[cfg(feature = "buffer_protocol")] 3 | use crate::buffer::AnyBufferProtocol; 4 | use crate::ffi::from_python::utils::call_arrow_c_array; 5 | use pyo3::exceptions::PyValueError; 6 | use pyo3::prelude::*; 7 | use pyo3::{intern, PyAny, PyResult}; 8 | 9 | impl<'a> FromPyObject<'a> for PyArray { 10 | fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult<Self> { 11 | if ob.hasattr(intern!(ob.py(), "__arrow_c_array__"))? { 12 | let (schema_capsule, array_capsule) = call_arrow_c_array(ob)?; 13 | Self::from_arrow_pycapsule(&schema_capsule, &array_capsule) 14 | } else { 15 | #[cfg(feature = "buffer_protocol")] 16 | if let Ok(buf) = ob.extract::<AnyBufferProtocol>() { 17 | return Ok(buf.try_into()?); 18 | } 19 | 20 | Err(PyValueError::new_err( 21 | "Expected object with __arrow_c_array__ method or implementing buffer protocol.", 22 | )) 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/to_python/nanoarrow.rs: -------------------------------------------------------------------------------- 1 | use pyo3::intern; 2 | use pyo3::prelude::*; 3 | use pyo3::types::{PyCapsule, PyTuple}; 4 | 5 | pub fn to_nanoarrow_schema<'py>( 6 | py: Python<'py>, 7 | capsule: &Bound<'py, PyCapsule>, 8 | ) -> PyResult<Bound<'py, PyAny>> { 9 | let na_mod = py.import(intern!(py, "nanoarrow"))?; 10 | na_mod 11 | .getattr(intern!(py, "Schema"))?
12 | .call1(PyTuple::new(py, vec![capsule])?) 13 | } 14 | 15 | pub fn to_nanoarrow_array<'py>( 16 | py: Python<'py>, 17 | capsules: Bound<'py, PyTuple>, 18 | ) -> PyResult<Bound<'py, PyAny>> { 19 | let na_mod = py.import(intern!(py, "nanoarrow"))?; 20 | na_mod.getattr(intern!(py, "Array"))?.call1(capsules) 21 | } 22 | 23 | pub fn to_nanoarrow_array_stream<'py>( 24 | py: Python<'py>, 25 | capsule: &Bound<'py, PyCapsule>, 26 | ) -> PyResult<Bound<'py, PyAny>> { 27 | let na_mod = py.import(intern!(py, "nanoarrow"))?; 28 | na_mod 29 | .getattr(intern!(py, "ArrayStream"))? 30 | .call1(PyTuple::new(py, vec![capsule])?) 31 | } 32 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "arro3" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.11" 7 | dependencies = [] 8 | 9 | [tool.uv] 10 | dev-dependencies = [ 11 | "black>=24.10.0", 12 | "boto3>=1.35.38", 13 | "geoarrow-types>=0.3.0", 14 | "griffe-inherited-docstrings>=1.0.1", 15 | "ipykernel>=6.29.5", 16 | "maturin>=1.7.4", 17 | "mike>=2.1.3", 18 | "mkdocs-material[imaging]>=9.6.7", 19 | "mkdocs-redirects>=1.2.2", 20 | "mkdocs>=1.6.1", 21 | "mkdocstrings[python]>=0.28.3", 22 | "pandas-stubs>=2.2.3.250527", 23 | "pandas>=2.2.3", 24 | "pip>=24.2", 25 | "pyarrow>=21.0.0", 26 | "pytest>=8.3.3", 27 | ] 28 | 29 | [tool.ruff.lint] 30 | select = [ 31 | # Pyflakes 32 | "F", 33 | # Pycodestyle 34 | # "E", 35 | "W", 36 | # isort 37 | "I", 38 | ] 39 | 40 | [tool.ruff.lint.extend-per-file-ignores] 41 | "__init__.py" = [ 42 | "F401", # Allow unused imports in __init__.py files 43 | "F403", # unable to detect undefined names 44 | ] 45 | -------------------------------------------------------------------------------- /LICENSE_MIT: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Kyle Barron 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /arro3-compute/python/arro3/compute/_compute.pyi: -------------------------------------------------------------------------------- 1 | from arro3.compute._aggregate import max as max 2 | from arro3.compute._aggregate import min as min 3 | from arro3.compute._aggregate import sum as sum 4 | from arro3.compute._arith import add as add 5 | from arro3.compute._arith import add_wrapping as add_wrapping 6 | from arro3.compute._arith import div as div 7 | from arro3.compute._arith import mul as mul 8 | from arro3.compute._arith import mul_wrapping as mul_wrapping 9 | from arro3.compute._arith import neg as neg 10 | from arro3.compute._arith import neg_wrapping as neg_wrapping 11 | from arro3.compute._arith import rem as rem 12 | from arro3.compute._arith import sub as sub 13 | from arro3.compute._arith import sub_wrapping as sub_wrapping 14 | from arro3.compute._boolean import is_not_null as is_not_null 15 | from arro3.compute._boolean import is_null as is_null 16 | from arro3.compute._cast import can_cast_types as can_cast_types 17 | from arro3.compute._cast import cast as cast 18 | from arro3.compute._dictionary import dictionary_encode as dictionary_encode 19 | from arro3.compute._filter import filter as filter 20 | from arro3.compute._take import take as take 21 | from arro3.compute._temporal import date_part as date_part 22 | -------------------------------------------------------------------------------- /arro3-compute/python/arro3/compute/_arith.pyi: -------------------------------------------------------------------------------- 1 | from arro3.core import Array 2 | from arro3.core.types import ArrayInput 3 | 4 | def add(lhs: ArrayInput, rhs: ArrayInput) -> Array: 5 | """Perform `lhs + rhs`, returning an error on overflow""" 6 | 7 | def add_wrapping(lhs: ArrayInput, rhs: ArrayInput) -> Array: 8 | """Perform `lhs + rhs`, wrapping on overflow for integer data types.""" 9 | 10 | def div(lhs: ArrayInput, rhs: ArrayInput) -> Array: 11 | """Perform `lhs / rhs`""" 12 | 13 | def mul(lhs: ArrayInput, rhs: ArrayInput) -> Array: 14 | """Perform `lhs * rhs`, returning an error on overflow""" 15 | 16 | def mul_wrapping(lhs: ArrayInput, rhs: ArrayInput) -> Array: 17 | """Perform `lhs * rhs`, wrapping on overflow for integer data types.""" 18 | 19 | def neg(array: ArrayInput) -> Array: 20 | """Negates each element of array, returning an error on overflow""" 21 | 22 | def neg_wrapping(array: ArrayInput) -> Array: 23 | """Negates each element of array, wrapping on overflow for integer data types.""" 24 | 25 | def rem(lhs: ArrayInput, rhs: ArrayInput) -> Array: 26 | """Perform `lhs % rhs`""" 27 | 28 | def sub(lhs: ArrayInput, rhs: ArrayInput) -> Array: 29 | """Perform `lhs - rhs`, returning an error on overflow""" 30 | 31 | def sub_wrapping(lhs: ArrayInput, rhs: ArrayInput) -> Array: 32 | """Perform `lhs - rhs`, wrapping on overflow for integer data types.""" 33 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Rust 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | 9 | jobs: 10 | lint-test: 11 | name: Lint and Test 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v4 15 | with: 16 | submodules: "recursive" 17 | 18 | - name: Install Rust 19 | uses: dtolnay/rust-toolchain@stable 20 | with: 21 | components: rustfmt, clippy 22 | 23 | - 
uses: Swatinem/rust-cache@v2 24 | 25 | - name: Cargo fmt 26 | run: cargo fmt --all -- --check 27 | 28 | - name: "clippy --all" 29 | run: cargo clippy --all --all-features --tests -- -D warnings 30 | 31 | - name: "cargo check" 32 | run: cargo check --all --all-features 33 | 34 | - name: "cargo test" 35 | run: | 36 | cargo test --all 37 | cargo test --all --all-features 38 | 39 | check-features_pyo3_arrow: 40 | runs-on: ubuntu-latest 41 | strategy: 42 | fail-fast: false 43 | matrix: 44 | args: 45 | - "--no-default-features" 46 | steps: 47 | - uses: actions/checkout@v4 48 | with: 49 | submodules: "recursive" 50 | - uses: dtolnay/rust-toolchain@stable 51 | - uses: Swatinem/rust-cache@v2 52 | - name: Test 53 | run: cd pyo3-arrow && cargo check ${{ matrix.args }} 54 | -------------------------------------------------------------------------------- /arro3-compute/python/arro3/compute/_boolean.pyi: -------------------------------------------------------------------------------- 1 | from typing import overload 2 | 3 | from arro3.core import Array, ArrayReader 4 | from arro3.core.types import ArrayInput, ArrowStreamExportable 5 | 6 | @overload 7 | def is_null(input: ArrayInput) -> Array: ... 8 | @overload 9 | def is_null(input: ArrowStreamExportable) -> ArrayReader: ... 10 | def is_null( 11 | input: ArrayInput | ArrowStreamExportable, 12 | ) -> Array | ArrayReader: 13 | """ 14 | Returns a non-null boolean-typed array with whether each value of the array is null. 15 | 16 | If `input` is an Array, an `Array` will be returned. If `input` is a `ChunkedArray` or `ArrayReader`, an `ArrayReader` will be returned. 17 | 18 | Args: 19 | input: Input data 20 | 21 | Returns: 22 | Output 23 | """ 24 | 25 | @overload 26 | def is_not_null(input: ArrayInput) -> Array: ... 27 | @overload 28 | def is_not_null(input: ArrowStreamExportable) -> ArrayReader: ... 29 | def is_not_null( 30 | input: ArrayInput | ArrowStreamExportable, 31 | ) -> Array | ArrayReader: 32 | """ 33 | Returns a non-null boolean-typed array with whether each value of the array is not null. 34 | 35 | If `input` is an Array, an `Array` will be returned. If `input` is a `ChunkedArray` or `ArrayReader`, an `ArrayReader` will be returned. 36 | 37 | Args: 38 | input: Input data 39 | 40 | Returns: 41 | Output 42 | """ 43 | -------------------------------------------------------------------------------- /pyo3-arrow/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "pyo3-arrow" 3 | version = "0.11.0" 4 | authors = ["Kyle Barron "] 5 | edition = "2021" 6 | description = "Arrow integration for pyo3." 7 | readme = "README.md" 8 | repository = "https://github.com/kylebarron/arro3" 9 | license = "MIT OR Apache-2.0" 10 | keywords = ["python", "arrow"] 11 | categories = [] 12 | rust-version = "1.75" 13 | 14 | [features] 15 | default = ["buffer_protocol"] 16 | 17 | # Support buffer protocol. Requires `abi3-py311` pyo3 feature or non-abi3 18 | # wheels. 
19 | buffer_protocol = [] 20 | 21 | [dependencies] 22 | arrow-array = { version = "56", features = ["chrono-tz", "ffi"] } 23 | arrow-buffer = "56" 24 | arrow-cast = { version = "56", features = ["prettyprint"] } 25 | arrow-data = "56" 26 | arrow-schema = "56" 27 | arrow-select = "56" 28 | pyo3 = { version = "0.26", features = ["chrono", "chrono-tz", "indexmap"] } 29 | half = "2" 30 | indexmap = "2" 31 | # numpy = { version = "0.26", features = ["half"] } 32 | numpy = { git = "https://github.com/Icxolu/rust-numpy", rev = "2480e2c86f6e91dc815b7f8e473b71bb18486bb1", features = [ 33 | "half", 34 | ] } 35 | thiserror = "1" 36 | 37 | [lib] 38 | crate-type = ["rlib"] 39 | 40 | [patch.crates-io] 41 | pyo3 = { version = "0.26.0", features = [ 42 | "macros", 43 | ], git = "https://github.com/pyo3/pyo3.git", tag = "v0.26.0" } 44 | numpy = { git = "https://github.com/Icxolu/rust-numpy", rev = "2480e2c86f6e91dc815b7f8e473b71bb18486bb1" } 45 | -------------------------------------------------------------------------------- /tests/io/test_parquet.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | 3 | import pyarrow as pa 4 | import pyarrow.parquet as pq 5 | from arro3.io import read_parquet, write_parquet 6 | 7 | 8 | def test_parquet_round_trip(): 9 | table = pa.table({"a": [1, 2, 3, 4]}) 10 | write_parquet(table, "test.parquet") 11 | table_retour = pa.table(read_parquet("test.parquet")) 12 | assert table == table_retour 13 | 14 | 15 | def test_parquet_round_trip_bytes_io(): 16 | table = pa.table({"a": [1, 2, 3, 4]}) 17 | with BytesIO() as bio: 18 | write_parquet(table, bio) 19 | bio.seek(0) 20 | table_retour = pa.table(read_parquet(bio)) 21 | assert table == table_retour 22 | 23 | 24 | def test_copy_parquet_kv_metadata(): 25 | metadata = {"hello": "world"} 26 | table = pa.table({"a": [1, 2, 3]}) 27 | write_parquet( 28 | table, 29 | "test.parquet", 30 | key_value_metadata=metadata, 31 | skip_arrow_metadata=True, 32 | ) 33 | 34 | # Assert metadata was written, but arrow schema was not 35 | pq_meta = pq.read_metadata("test.parquet").metadata 36 | assert pq_meta[b"hello"] == b"world" 37 | assert b"ARROW:schema" not in pq_meta.keys() 38 | 39 | # When reading with pyarrow, kv meta gets assigned to table 40 | pa_table = pq.read_table("test.parquet") 41 | assert pa_table.schema.metadata[b"hello"] == b"world" 42 | 43 | reader = read_parquet("test.parquet") 44 | assert reader.schema.metadata[b"hello"] == b"world" 45 | -------------------------------------------------------------------------------- /arro3-compute/python/arro3/compute/_take.pyi: -------------------------------------------------------------------------------- 1 | from arro3.core import Array 2 | from arro3.core.types import ArrayInput 3 | 4 | def take(values: ArrayInput, indices: ArrayInput) -> Array: 5 | """Take elements by index from Array, creating a new Array from those indexes. 6 | 7 | ``` 8 | ┌─────────────────┐ ┌─────────┐ ┌─────────────────┐ 9 | │ A │ │ 0 │ │ A │ 10 | ├─────────────────┤ ├─────────┤ ├─────────────────┤ 11 | │ D │ │ 2 │ │ B │ 12 | ├─────────────────┤ ├─────────┤ take(values, indices) ├─────────────────┤ 13 | │ B │ │ 3 │ ─────────────────────────▶ │ C │ 14 | ├─────────────────┤ ├─────────┤ ├─────────────────┤ 15 | │ C │ │ 1 │ │ D │ 16 | ├─────────────────┤ └─────────┘ └─────────────────┘ 17 | │ E │ 18 | └─────────────────┘ 19 | values array indices array result 20 | ``` 21 | 22 | Args: 23 | values: The input Arrow data to select from. 
24 | indices: The indices within `values` to take. This must be a numeric array. 25 | 26 | Returns: 27 | The selected arrow data. 28 | """ 29 | -------------------------------------------------------------------------------- /arro3-compute/python/arro3/compute/_cast.pyi: -------------------------------------------------------------------------------- 1 | from typing import overload 2 | 3 | from arro3.core import Array, ArrayReader 4 | from arro3.core.types import ( 5 | ArrayInput, 6 | ArrowSchemaExportable, 7 | ArrowStreamExportable, 8 | ) 9 | 10 | @overload 11 | def cast( 12 | input: ArrayInput, 13 | to_type: ArrowSchemaExportable, 14 | ) -> Array: ... 15 | @overload 16 | def cast( 17 | input: ArrowStreamExportable, 18 | to_type: ArrowSchemaExportable, 19 | ) -> ArrayReader: ... 20 | def cast( 21 | input: ArrayInput | ArrowStreamExportable, 22 | to_type: ArrowSchemaExportable, 23 | ) -> Array | ArrayReader: 24 | """ 25 | Cast `input` to the provided data type and return a new Array with type `to_type`, if possible. 26 | 27 | If `input` is an Array, an `Array` will be returned. If `input` is a `ChunkedArray` or `ArrayReader`, an `ArrayReader` will be returned. 28 | 29 | Args: 30 | input: Input data to cast. 31 | to_type: The target data type to cast to. You may pass in a `Field` here if you wish to include Arrow extension metadata on the output array. 32 | 33 | Returns: 34 | The casted Arrow data. 35 | """ 36 | 37 | def can_cast_types( 38 | from_type: ArrowSchemaExportable, to_type: ArrowSchemaExportable 39 | ) -> bool: 40 | """Return true if a value of type `from_type` can be cast into a value of `to_type`. 41 | 42 | Args: 43 | from_type: Source type 44 | to_type: Destination type 45 | 46 | Returns: 47 | True if can be casted. 48 | """ 49 | -------------------------------------------------------------------------------- /tests/core/test_struct_field.py: -------------------------------------------------------------------------------- 1 | import pyarrow as pa 2 | import pytest 3 | from arro3.core import struct_field 4 | 5 | 6 | def test_struct_field(): 7 | a = pa.array([1, 2, 3]) 8 | b = pa.array([3, 4, 5]) 9 | struct_arr = pa.StructArray.from_arrays([a, b], names=["a", "b"]) 10 | assert pa.array(struct_field(struct_arr, [0])) == a 11 | 12 | 13 | def test_struct_field_sliced_end(): 14 | a = pa.array([1, 2, 3]) 15 | b = pa.array([3, 4, 5]) 16 | struct_arr = pa.StructArray.from_arrays([a, b], names=["a", "b"]) 17 | sliced = struct_arr.slice(1, 2) 18 | sliced.offset 19 | out = pa.array(struct_field(sliced, [0])) 20 | assert out == sliced.field(0) 21 | 22 | 23 | def test_struct_field_sliced_start(): 24 | a = pa.array([1, 2, 3]) 25 | b = pa.array([3, 4, 5]) 26 | struct_arr = pa.StructArray.from_arrays([a, b], names=["a", "b"]) 27 | sliced = struct_arr.slice(0, 1) 28 | out = pa.array(struct_field(sliced, [0])) 29 | assert out == sliced.field(0) 30 | 31 | 32 | def test_struct_field_nested(): 33 | a = pa.array([1, 2, 3]) 34 | b = pa.array([3, 4, 5]) 35 | c = pa.array([7, 8, 9]) 36 | inner = pa.StructArray.from_arrays([a, b], names=["a", "b"]) 37 | outer = pa.StructArray.from_arrays([inner, c], names=["inner", "c"]) 38 | assert pa.array(struct_field(outer, [0, 0])) == a 39 | assert pa.array(struct_field(outer, [0, 1])) == b 40 | assert pa.array(struct_field(outer, [1])) == c 41 | 42 | with pytest.raises(Exception): 43 | assert pa.array(struct_field(outer, [2])) 44 | -------------------------------------------------------------------------------- /pyo3-arrow/src/error.rs: 
-------------------------------------------------------------------------------- 1 | //! Contains the [`PyArrowError`], the Error returned by most fallible functions in this crate. 2 | 3 | use numpy::BorrowError; 4 | use pyo3::exceptions::{PyException, PyValueError}; 5 | use pyo3::prelude::*; 6 | use pyo3::DowncastError; 7 | use thiserror::Error; 8 | 9 | /// The Error variants returned by this crate. 10 | #[derive(Error, Debug)] 11 | #[non_exhaustive] 12 | pub enum PyArrowError { 13 | /// A wrapped [arrow::error::ArrowError] 14 | #[error(transparent)] 15 | ArrowError(#[from] arrow_schema::ArrowError), 16 | 17 | /// A wrapped [PyErr] 18 | #[error(transparent)] 19 | PyErr(#[from] PyErr), 20 | 21 | /// Indicates why borrowing an array failed. 22 | #[error(transparent)] 23 | NumpyBorrowError(#[from] BorrowError), 24 | } 25 | 26 | impl From<PyArrowError> for PyErr { 27 | fn from(error: PyArrowError) -> Self { 28 | match error { 29 | PyArrowError::PyErr(err) => err, 30 | PyArrowError::ArrowError(err) => PyException::new_err(err.to_string()), 31 | PyArrowError::NumpyBorrowError(err) => PyException::new_err(err.to_string()), 32 | } 33 | } 34 | } 35 | 36 | impl<'a, 'py> From<DowncastError<'a, 'py>> for PyArrowError { 37 | fn from(other: DowncastError<'a, 'py>) -> Self { 38 | Self::PyErr(PyValueError::new_err(format!( 39 | "Could not downcast: {}", 40 | other 41 | ))) 42 | } 43 | } 44 | 45 | /// A type wrapper around `Result<T, PyArrowError>`. 46 | pub type PyArrowResult<T> = Result<T, PyArrowError>; 47 | -------------------------------------------------------------------------------- /arro3-io/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "arro3-io" 3 | version = { workspace = true } 4 | authors = { workspace = true } 5 | edition = { workspace = true } 6 | description = "Rust-based readers and writers for Arrow in Python." 7 | readme = "README.md" 8 | repository = { workspace = true } 9 | homepage = { workspace = true } 10 | license = { workspace = true } 11 | keywords = { workspace = true } 12 | categories = { workspace = true } 13 | rust-version = { workspace = true } 14 | 15 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 16 | [lib] 17 | name = "_io" 18 | crate-type = ["cdylib"] 19 | 20 | [features] 21 | default = ["async"] 22 | # Include async code. This feature won't compile for pyodide.
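# A pyodide build would presumably drop this feature via `--no-default-features`,
# e.g. (untested sketch): maturin build -m arro3-io/Cargo.toml --target wasm32-unknown-emscripten --no-default-features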
23 | async = [ 24 | "dep:pyo3-object_store", 25 | "dep:pyo3-async-runtimes", 26 | "parquet/object_store", 27 | "dep:object_store", 28 | "dep:futures", 29 | ] 30 | 31 | [dependencies] 32 | arrow-array = { workspace = true } 33 | arrow-buffer = { workspace = true } 34 | arrow-csv = { workspace = true } 35 | arrow-ipc = { workspace = true } 36 | arrow-json = { workspace = true } 37 | arrow-schema = { workspace = true } 38 | bytes = { workspace = true } 39 | futures = { version = "0.3.30", optional = true } 40 | object_store = { workspace = true, optional = true } 41 | parquet = { workspace = true } 42 | pyo3 = { workspace = true } 43 | pyo3-arrow = { workspace = true } 44 | pyo3-async-runtimes = { workspace = true, features = [ 45 | "tokio-runtime", 46 | ], optional = true } 47 | pyo3-file = { workspace = true } 48 | pyo3-object_store = { workspace = true, optional = true } 49 | thiserror = { workspace = true } 50 | -------------------------------------------------------------------------------- /DEVELOP.md: -------------------------------------------------------------------------------- 1 | ## Docs 2 | 3 | ```bash 4 | rm -rf .venv 5 | uv sync 6 | # Note: need to install core first because others depend on core 7 | uv run maturin dev -m arro3-core/Cargo.toml 8 | uv run maturin dev -m arro3-compute/Cargo.toml 9 | uv run maturin dev -m arro3-io/Cargo.toml 10 | uv run mkdocs serve 11 | ``` 12 | 13 | ### Adding a new module 14 | 15 | - Add new module to GitHub Actions matrix in `wheels.yml` 16 | - Update `docs.yml` to include module 17 | 18 | ## Emscripten Python wheels 19 | 20 | Install Rust nightly and add the wasm toolchain 21 | 22 | ```bash 23 | rustup toolchain install nightly 24 | rustup target add --toolchain nightly wasm32-unknown-emscripten 25 | ``` 26 | 27 | Install maturin and pyodide-build (choose a specific version of pyodide-build if desired) 28 | 29 | ```bash 30 | pip install -U maturin 31 | pip install pyodide-build 32 | ``` 33 | 34 | Clone emsdk. I clone this into a specific path at `~/github/emscripten-core/emsdk` so that it can be shared across projects. 35 | 36 | ```bash 37 | mkdir -p ~/github/emscripten-core/ 38 | git clone https://github.com/emscripten-core/emsdk.git ~/github/emscripten-core/emsdk 39 | # Get the emscripten version pyodide targets (or set this manually) 40 | PYODIDE_EMSCRIPTEN_VERSION=$(pyodide config get emscripten_version) 41 | ~/github/emscripten-core/emsdk/emsdk install ${PYODIDE_EMSCRIPTEN_VERSION} 42 | ~/github/emscripten-core/emsdk/emsdk activate ${PYODIDE_EMSCRIPTEN_VERSION} 43 | source ~/github/emscripten-core/emsdk/emsdk_env.sh 44 | ``` 45 | 46 | Build `arro3-core`: 47 | 48 | ```bash 49 | RUSTUP_TOOLCHAIN=nightly \ 50 | maturin build \ 51 | --release \ 52 | -o dist \ 53 | -m arro3-core/Cargo.toml \ 54 | --target wasm32-unknown-emscripten \ 55 | -i python3.11 56 | ``` 57 | -------------------------------------------------------------------------------- /arro3-io/src/error.rs: -------------------------------------------------------------------------------- 1 | //! Contains the [`Arro3IoError`], the Error returned by most fallible functions in this crate. 2 | 3 | use pyo3::exceptions::{PyException, PyValueError}; 4 | use pyo3::prelude::*; 5 | use pyo3::DowncastError; 6 | use thiserror::Error; 7 | 8 | /// The Error variants returned by this crate.
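///
/// Every variant converts into a Python exception through `From<Arro3IoError> for
/// PyErr`, so fallible functions can use `?` at the Python boundary. A minimal
/// sketch with a hypothetical helper:
///
/// ```ignore
/// fn fallible_io() -> Arro3IoResult<()> {
///     // An ArrowError converts automatically via the `#[from]` attribute below.
///     Err(arrow_schema::ArrowError::ComputeError("example".to_string()).into())
/// }
/// ```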
9 | #[derive(Error, Debug)] 10 | #[non_exhaustive] 11 | pub enum Arro3IoError { 12 | /// A wrapped [arrow::error::ArrowError] 13 | #[error(transparent)] 14 | ArrowError(#[from] arrow_schema::ArrowError), 15 | 16 | /// A wrapped [object_store::Error] 17 | #[error(transparent)] 18 | ObjectStoreError(#[from] object_store::Error), 19 | 20 | /// A wrapped [parquet::errors::ParquetError] 21 | #[error(transparent)] 22 | ParquetError(#[from] parquet::errors::ParquetError), 23 | 24 | /// A wrapped [PyErr] 25 | #[error(transparent)] 26 | PyErr(#[from] PyErr), 27 | } 28 | 29 | impl From<Arro3IoError> for PyErr { 30 | fn from(error: Arro3IoError) -> Self { 31 | match error { 32 | Arro3IoError::PyErr(err) => err, 33 | Arro3IoError::ArrowError(err) => PyException::new_err(err.to_string()), 34 | Arro3IoError::ObjectStoreError(err) => PyException::new_err(err.to_string()), 35 | Arro3IoError::ParquetError(err) => PyException::new_err(err.to_string()), 36 | } 37 | } 38 | } 39 | 40 | impl<'a, 'py> From<DowncastError<'a, 'py>> for Arro3IoError { 41 | fn from(other: DowncastError<'a, 'py>) -> Self { 42 | Self::PyErr(PyValueError::new_err(format!( 43 | "Could not downcast: {other}" 44 | ))) 45 | } 46 | } 47 | 48 | /// A type wrapper around `Result<T, Arro3IoError>`. 49 | pub type Arro3IoResult<T> = Result<T, Arro3IoError>; 50 | -------------------------------------------------------------------------------- /tests/core/test_data_type.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from arro3.core import DataType, Field 3 | 4 | 5 | def test_value_type_fixed_size_list_type(): 6 | value_type = DataType.int8() 7 | list_dt = DataType.list(Field("inner", value_type), 2) 8 | assert list_dt.value_type == value_type 9 | 10 | 11 | def test_value_field_list_type(): 12 | value_type = DataType.int8() 13 | value_field = Field("inner", value_type, nullable=True) 14 | list_dt = DataType.list( 15 | value_field, 16 | 2, 17 | ) 18 | assert list_dt.value_field == value_field 19 | 20 | 21 | def test_fields_struct_type(): 22 | field_foo = Field("foo", DataType.int8(), nullable=True) 23 | field_bar = Field("bar", DataType.string(), nullable=False) 24 | struct_type = DataType.struct([field_foo, field_bar]) 25 | assert struct_type.fields == [field_foo, field_bar] 26 | 27 | 28 | @pytest.mark.xfail 29 | def test_list_data_type_construction_with_dt(): 30 | _ = DataType.list(DataType.int16()) 31 | 32 | 33 | def test_hashable(): 34 | # We should be able to use DataType as a key in a dict 35 | _dtype_map = { 36 | DataType.uint8(): DataType.int8(), 37 | DataType.uint16(): DataType.int16(), 38 | DataType.uint32(): DataType.int32(), 39 | DataType.uint64(): DataType.int64(), 40 | } 41 | 42 | 43 | class CustomException(Exception): 44 | pass 45 | 46 | 47 | class ArrowCSchemaFails: 48 | def __arrow_c_schema__(self): 49 | raise CustomException 50 | 51 | 52 | def test_schema_import_preserve_exception(): 53 | """https://github.com/kylebarron/arro3/issues/325""" 54 | 55 | c_stream_obj = ArrowCSchemaFails() 56 | with pytest.raises(CustomException): 57 | DataType.from_arrow(c_stream_obj) 58 | -------------------------------------------------------------------------------- /arro3-compute/python/arro3/compute/enums.py: -------------------------------------------------------------------------------- 1 | from enum import Enum, auto 2 | 3 | 4 | class StrEnum(str, Enum): 5 | def __new__(cls, value, *args, **kwargs): 6 | if not isinstance(value, (str, auto)): 7 | raise TypeError( 8 | f"Values of StrEnums must be strings: {value!r} is a {type(value)}" 9 | ) 10 | return
super().__new__(cls, value, *args, **kwargs) 11 | 12 | def __str__(self): 13 | return str(self.value) 14 | 15 | def _generate_next_value_(name, *_): 16 | return name.lower() 17 | 18 | 19 | class DatePart(StrEnum): 20 | """Valid parts to extract from date/time/timestamp arrays. 21 | 22 | See [`date_part`][arro3.compute.date_part]. 23 | """ 24 | 25 | Quarter = auto() 26 | """Quarter of the year, in range `1..=4`""" 27 | 28 | Year = auto() 29 | """Calendar year""" 30 | 31 | Month = auto() 32 | """Month in the year, in range `1..=12`""" 33 | 34 | Week = auto() 35 | """ISO week of the year, in range `1..=53`""" 36 | 37 | Day = auto() 38 | """Day of the month, in range `1..=31`""" 39 | 40 | DayOfWeekSunday0 = auto() 41 | """Day of the week, in range `0..=6`, where Sunday is `0`""" 42 | 43 | DayOfWeekMonday0 = auto() 44 | """Day of the week, in range `0..=6`, where Monday is `0`""" 45 | 46 | DayOfYear = auto() 47 | """Day of year, in range `1..=366`""" 48 | 49 | Hour = auto() 50 | """Hour of the day, in range `0..=23`""" 51 | 52 | Minute = auto() 53 | """Minute of the hour, in range `0..=59`""" 54 | 55 | Second = auto() 56 | """Second of the minute, in range `0..=59`""" 57 | 58 | Millisecond = auto() 59 | """Millisecond of the second""" 60 | 61 | Microsecond = auto() 62 | """Microsecond of the second""" 63 | 64 | Nanosecond = auto() 65 | """Nanosecond of the second""" 66 | -------------------------------------------------------------------------------- /arro3-core/src/accessors/struct_field.rs: -------------------------------------------------------------------------------- 1 | use arrow_array::cast::AsArray; 2 | use arrow_array::ArrayRef; 3 | use arrow_schema::{ArrowError, DataType, FieldRef}; 4 | use pyo3::prelude::*; 5 | use pyo3_arrow::error::PyArrowResult; 6 | use pyo3_arrow::PyArray; 7 | 8 | #[derive(FromPyObject)] 9 | pub(crate) enum StructIndex { 10 | Int(usize), 11 | ListInt(Vec<usize>), 12 | } 13 | 14 | impl StructIndex { 15 | fn into_list(self) -> Vec<usize> { 16 | match self { 17 | Self::Int(i) => vec![i], 18 | Self::ListInt(i) => i, 19 | } 20 | } 21 | } 22 | 23 | #[pyfunction] 24 | #[pyo3(signature=(values, /, indices))] 25 | pub(crate) fn struct_field( 26 | py: Python, 27 | values: PyArray, 28 | indices: StructIndex, 29 | ) -> PyArrowResult<PyObject> { 30 | let (orig_array, field) = values.into_inner(); 31 | let indices = indices.into_list(); 32 | 33 | let mut array_ref = &orig_array; 34 | let mut field_ref = &field; 35 | for i in indices { 36 | (array_ref, field_ref) = get_child(array_ref, i)?; 37 | } 38 | 39 | Ok(PyArray::new( 40 | array_ref.slice(orig_array.offset(), orig_array.len()), 41 | field_ref.clone(), 42 | ) 43 | .to_arro3(py)?
44 | .unbind()) 45 | } 46 | 47 | fn get_child(array: &ArrayRef, i: usize) -> Result<(&ArrayRef, &FieldRef), ArrowError> { 48 | match array.data_type() { 49 | DataType::Struct(fields) => { 50 | let arr = array.as_struct(); 51 | let inner_arr = arr.columns().get(i).ok_or(ArrowError::SchemaError( 52 | "Out of range for number of fields".into(), 53 | ))?; 54 | let inner_field = &fields[i]; 55 | Ok((inner_arr, inner_field)) 56 | } 57 | _ => Err(ArrowError::SchemaError( 58 | "DataType must be struct.".to_string(), 59 | )), 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /arro3-compute/python/arro3/compute/_temporal.pyi: -------------------------------------------------------------------------------- 1 | from typing import overload 2 | 3 | from arro3.core import Array, ArrayReader 4 | from arro3.core.types import ArrowArrayExportable, ArrowStreamExportable 5 | 6 | from .enums import DatePart 7 | from .types import DatePartT 8 | 9 | # # Examples 10 | 11 | # ``` 12 | # # use arrow_array::{Int32Array, TimestampMicrosecondArray}; 13 | # # use arrow_arith::temporal::{DatePart, date_part}; 14 | # let input: TimestampMicrosecondArray = 15 | #     vec![Some(1612025847000000), None, Some(1722015847000000)].into(); 16 | 17 | # let actual = date_part(&input, DatePart::Week).unwrap(); 18 | # let expected: Int32Array = vec![Some(4), None, Some(30)].into(); 19 | # assert_eq!(actual.as_ref(), &expected); 20 | # ``` 21 | 22 | @overload 23 | def date_part(input: ArrowArrayExportable, part: DatePart | DatePartT) -> Array: ... 24 | @overload 25 | def date_part( 26 | input: ArrowStreamExportable, part: DatePart | DatePartT 27 | ) -> ArrayReader: ... 28 | def date_part( 29 | input: ArrowArrayExportable | ArrowStreamExportable, part: DatePart | DatePartT 30 | ) -> Array | ArrayReader: 31 | """ 32 | Given an array, return a new array with the extracted [`DatePart`] as signed 32-bit 33 | integer values. 34 | 35 | Currently only supports temporal types: 36 | - Date32/Date64 37 | - Time32/Time64 38 | - Timestamp 39 | - Interval 40 | - Duration 41 | 42 | Returns an int32-typed array unless input was a dictionary type, in which case 43 | returns the dictionary but with this function applied onto its values. 44 | 45 | If the array passed in is not of the above listed types (or is a dictionary array where 46 | the values array isn't of the above listed types), then this function will return an 47 | error. 48 | 49 | Args: 50 | input: The temporal array to extract the date part from. 51 | part: The date part to extract. 52 | Returns: 53 | The extracted date part. 54 | """ 55 | -------------------------------------------------------------------------------- /arro3-io/python/arro3/io/_ipc.pyi: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import IO, Literal 3 | 4 | # Note: importing with 5 | # `from arro3.core import Array` 6 | # will cause Array to be included in the generated docs in this module. 7 | import arro3.core as core 8 | import arro3.core.types as types 9 | 10 | def read_ipc(file: IO[bytes] | Path | str) -> core.RecordBatchReader: 11 | """Read an Arrow IPC file into memory 12 | 13 | Args: 14 | file: The input Arrow IPC file path or buffer. 15 | 16 | Returns: 17 | An arrow RecordBatchReader. 18 | """ 19 | 20 | def read_ipc_stream(file: IO[bytes] | Path | str) -> core.RecordBatchReader: 21 | """Read an Arrow IPC stream into memory 22 | 23 | Args: 24 | file: The input Arrow IPC stream path or buffer. 25 | 26 | Returns: 27 | An arrow RecordBatchReader.
28 | """ 29 | 30 | def write_ipc( 31 | data: types.ArrowStreamExportable | types.ArrowArrayExportable, 32 | file: IO[bytes] | Path | str, 33 | *, 34 | compression: Literal["LZ4", "lz4", "ZSTD", "zstd"] | None = None, 35 | ) -> None: 36 | """Write Arrow data to an Arrow IPC file 37 | 38 | Args: 39 | data: the Arrow Table, RecordBatchReader, or RecordBatch to write. 40 | file: the output file or buffer to write to 41 | 42 | Other Args: 43 | compression: Compression to apply to file. 44 | """ 45 | 46 | def write_ipc_stream( 47 | data: types.ArrowStreamExportable | types.ArrowArrayExportable, 48 | file: IO[bytes] | Path | str, 49 | *, 50 | compression: Literal["LZ4", "lz4", "ZSTD", "zstd"] | None = None, 51 | ) -> None: 52 | """Write Arrow data to an Arrow IPC stream 53 | 54 | Args: 55 | data: the Arrow Table, RecordBatchReader, or RecordBatch to write. 56 | file: the output file or buffer to write to 57 | 58 | Other Args: 59 | compression: Compression to apply to file. 60 | """ 61 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/to_python/chunked.rs: -------------------------------------------------------------------------------- 1 | use arrow_array::ArrayRef; 2 | use arrow_schema::{ArrowError, FieldRef}; 3 | 4 | /// Trait for types that can read `ArrayRef`'s. 5 | /// 6 | /// To create from an iterator, see [ArrayIterator]. 7 | pub trait ArrayReader: Iterator> { 8 | /// Returns the field of this `ArrayReader`. 9 | /// 10 | /// Implementation of this trait should guarantee that all `ArrayRef`'s returned by this 11 | /// reader should have the same field as returned from this method. 12 | fn field(&self) -> FieldRef; 13 | } 14 | 15 | impl ArrayReader for Box { 16 | fn field(&self) -> FieldRef { 17 | self.as_ref().field() 18 | } 19 | } 20 | 21 | /// An iterator of [`ArrayRef`] with an attached [`FieldRef`] 22 | pub struct ArrayIterator 23 | where 24 | I: IntoIterator>, 25 | { 26 | inner: I::IntoIter, 27 | inner_field: FieldRef, 28 | } 29 | 30 | impl ArrayIterator 31 | where 32 | I: IntoIterator>, 33 | { 34 | /// Create a new [ArrayIterator]. 35 | /// 36 | /// If `iter` is an infallible iterator, use `.map(Ok)`. 
37 | pub fn new(iter: I, field: FieldRef) -> Self { 38 | Self { 39 | inner: iter.into_iter(), 40 | inner_field: field, 41 | } 42 | } 43 | } 44 | 45 | impl<I> Iterator for ArrayIterator<I> 46 | where 47 | I: IntoIterator<Item = Result<ArrayRef, ArrowError>>, 48 | { 49 | type Item = I::Item; 50 | 51 | fn next(&mut self) -> Option<Self::Item> { 52 | self.inner.next() 53 | } 54 | 55 | fn size_hint(&self) -> (usize, Option<usize>) { 56 | self.inner.size_hint() 57 | } 58 | } 59 | 60 | impl<I> ArrayReader for ArrayIterator<I> 61 | where 62 | I: IntoIterator<Item = Result<ArrayRef, ArrowError>>, 63 | { 64 | fn field(&self) -> FieldRef { 65 | self.inner_field.clone() 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /tests/io/test_ipc.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | from pathlib import Path 3 | 4 | import pyarrow as pa 5 | from arro3.io import read_ipc, read_ipc_stream, write_ipc, write_ipc_stream 6 | 7 | 8 | def test_ipc_round_trip_string(): 9 | table = pa.table({"a": [1, 2, 3, 4]}) 10 | write_ipc(table, "test.arrow") 11 | table_retour = pa.table(read_ipc("test.arrow")) 12 | assert table == table_retour 13 | 14 | write_ipc_stream(table, "test.arrows") 15 | table_retour = pa.table(read_ipc_stream("test.arrows")) 16 | assert table == table_retour 17 | 18 | 19 | def test_ipc_round_trip_path(): 20 | table = pa.table({"a": [1, 2, 3, 4]}) 21 | write_ipc(table, Path("test.arrow")) 22 | table_retour = pa.table(read_ipc(Path("test.arrow"))) 23 | assert table == table_retour 24 | 25 | write_ipc_stream(table, Path("test.arrows")) 26 | table_retour = pa.table(read_ipc_stream(Path("test.arrows"))) 27 | assert table == table_retour 28 | 29 | 30 | def test_ipc_round_trip_buffer(): 31 | table = pa.table({"a": [1, 2, 3, 4]}) 32 | bio = BytesIO() 33 | write_ipc(table, bio) 34 | table_retour = pa.table(read_ipc(bio)) 35 | assert table == table_retour 36 | 37 | bio = BytesIO() 38 | write_ipc_stream(table, bio) 39 | bio.seek(0) 40 | table_retour = pa.table(read_ipc_stream(bio)) 41 | assert table == table_retour 42 | 43 | 44 | def test_ipc_round_trip_compression(): 45 | table = pa.table({"a": [1, 2, 3, 4]}) 46 | write_ipc(table, "test.arrow", compression="lz4") 47 | table_retour = pa.table(read_ipc("test.arrow")) 48 | assert table == table_retour 49 | 50 | table = pa.table({"a": [1, 2, 3, 4]}) 51 | write_ipc(table, "test.arrow", compression="zstd") 52 | table_retour = pa.table(read_ipc("test.arrow")) 53 | assert table == table_retour 54 | 55 | table = pa.table({"a": [1, 2, 3, 4]}) 56 | write_ipc(table, "test.arrow", compression=None) 57 | table_retour = pa.table(read_ipc("test.arrow")) 58 | assert table == table_retour 59 | -------------------------------------------------------------------------------- /tests/compute/test_aggregate.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timezone 2 | 3 | import arro3.compute as ac 4 | import pyarrow as pa 5 | from arro3.core import Array, ChunkedArray, DataType 6 | 7 | 8 | def test_min(): 9 | arr1 = Array([1, 2, 3], DataType.int16()) 10 | assert ac.min(arr1).as_py() == 1 11 | 12 | arr2 = Array([3, 2, 0], DataType.int16()) 13 | assert ac.min(arr2).as_py() == 0 14 | 15 | ca = ChunkedArray([arr1, arr2]) 16 | assert ac.min(ca).as_py() == 0 17 | 18 | arr = Array(["c", "a", "b"], DataType.string()) 19 | assert ac.min(arr).as_py() == "a" 20 | 21 | 22 | def test_max(): 23 | arr1 = Array([1, 2, 3], DataType.int16()) 24 | assert ac.max(arr1).as_py() == 3 25 | 26 | arr2 = Array([4, 2,
0], DataType.int16()) 27 | assert ac.max(arr2).as_py() == 4 28 | 29 | ca = ChunkedArray([arr1, arr2]) 30 | assert ac.max(ca).as_py() == 4 31 | 32 | arr = Array(["c", "a", "b"], DataType.string()) 33 | assert ac.max(arr).as_py() == "c" 34 | 35 | 36 | def test_sum(): 37 | arr1 = Array([1, 2, 3], DataType.int16()) 38 | assert ac.sum(arr1).as_py() == 6 39 | 40 | arr2 = Array([4, 2, 0], DataType.int16()) 41 | assert ac.sum(arr2).as_py() == 6 42 | 43 | ca = ChunkedArray([arr1, arr2]) 44 | assert ac.sum(ca).as_py() == 12 45 | 46 | 47 | def test_min_max_datetime(): 48 | dt1 = datetime.now() 49 | dt2 = datetime.now() 50 | dt3 = datetime.now() 51 | 52 | pa_arr = pa.array([dt1, dt2, dt3], type=pa.timestamp("ns", None)) 53 | arro3_arr = Array(pa_arr) 54 | assert ac.min(arro3_arr).as_py() == dt1 55 | assert ac.max(arro3_arr).as_py() == dt3 56 | 57 | 58 | def test_min_max_datetime_with_timezone(): 59 | dt1 = datetime.now(timezone.utc) 60 | dt2 = datetime.now(timezone.utc) 61 | dt3 = datetime.now(timezone.utc) 62 | arr = pa.array([dt1, dt2, dt3]) 63 | assert arr.type.tz == "UTC" 64 | 65 | assert ac.min(arr).as_py() == dt1 66 | assert ac.min(arr).type.tz == "UTC" 67 | assert ac.max(arr).as_py() == dt3 68 | assert ac.max(arr).type.tz == "UTC" 69 | -------------------------------------------------------------------------------- /arro3-compute/src/cast.rs: -------------------------------------------------------------------------------- 1 | use pyo3::exceptions::PyTypeError; 2 | use pyo3::prelude::*; 3 | use pyo3_arrow::error::PyArrowResult; 4 | use pyo3_arrow::ffi::ArrayIterator; 5 | use pyo3_arrow::input::AnyArray; 6 | use pyo3_arrow::{PyArray, PyArrayReader, PyDataType, PyField}; 7 | 8 | /// Cast `input` to the provided data type and return a new Arrow object with type `to_type`, if 9 | /// possible. 10 | /// 11 | /// Args: 12 | ///     input: an Arrow Array, RecordBatch, ChunkedArray, Table, ArrayReader, or RecordBatchReader 13 | ///     to_type: an Arrow DataType, Field, or Schema describing the output type of the cast. 14 | #[pyfunction] 15 | pub fn cast(py: Python, input: AnyArray, to_type: PyField) -> PyArrowResult<PyObject> { 16 | match input { 17 | AnyArray::Array(arr) => { 18 | let new_field = to_type.into_inner(); 19 | let out = arrow_cast::cast(arr.as_ref(), new_field.data_type())?; 20 | Ok(PyArray::new(out, new_field).to_arro3(py)?.unbind()) 21 | } 22 | AnyArray::Stream(stream) => { 23 | let reader = stream.into_reader()?; 24 | let field = reader.field(); 25 | let from_type = field.data_type(); 26 | 27 | let new_field = to_type.into_inner(); 28 | let to_type = new_field.data_type().clone(); 29 | if !arrow_cast::can_cast_types(from_type, &to_type) { 30 | return Err(PyTypeError::new_err(format!( 31 | "Unable to cast from type {from_type} to {to_type}" 32 | )) 33 | .into()); 34 | } 35 | 36 | let iter = reader 37 | .into_iter() 38 | .map(move |array| arrow_cast::cast(&array?, &to_type)); 39 | Ok( 40 | PyArrayReader::new(Box::new(ArrayIterator::new(iter, new_field))) 41 | .to_arro3(py)?
42 | .unbind(), 43 | ) 44 | } 45 | } 46 | } 47 | 48 | #[pyfunction] 49 | pub fn can_cast_types(from_type: PyDataType, to_type: PyDataType) -> bool { 50 | arrow_cast::can_cast_types(from_type.as_ref(), to_type.as_ref()) 51 | } 52 | -------------------------------------------------------------------------------- /.github/workflows/test-python.yml: -------------------------------------------------------------------------------- 1 | name: Python 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | 9 | concurrency: 10 | group: ${{ github.workflow }}-${{ github.ref }} 11 | cancel-in-progress: true 12 | 13 | jobs: 14 | pre-commit: 15 | name: Run pre-commit on Python code 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | 20 | - uses: actions/setup-python@v5 21 | with: 22 | python-version: "3.11" 23 | 24 | - name: Cache pre-commit virtualenvs 25 | uses: actions/cache@v4 26 | with: 27 | path: ~/.cache/pre-commit 28 | key: pre-commit-3|${{ hashFiles('.pre-commit-config.yaml') }} 29 | 30 | - name: run pre-commit 31 | run: | 32 | python -m pip install pre-commit 33 | pre-commit run --all-files 34 | 35 | test-python: 36 | name: Build and test Python 37 | runs-on: ubuntu-latest 38 | strategy: 39 | fail-fast: true 40 | matrix: 41 | python-version: ["3.9", "3.12"] 42 | steps: 43 | - uses: actions/checkout@v4 44 | 45 | - name: Install Rust 46 | uses: dtolnay/rust-toolchain@stable 47 | 48 | - uses: Swatinem/rust-cache@v2 49 | 50 | - name: Set up Python 51 | id: setup-python 52 | uses: actions/setup-python@v5 53 | with: 54 | python-version: ${{ matrix.python-version }} 55 | 56 | - name: Install a specific version of uv 57 | uses: astral-sh/setup-uv@v3 58 | with: 59 | enable-cache: true 60 | version: "0.4.x" 61 | 62 | - name: Build rust submodules 63 | run: | 64 | # Note: core module must be first, because it's depended on by others 65 | uv run maturin dev -m arro3-core/Cargo.toml 66 | uv run maturin dev -m arro3-compute/Cargo.toml 67 | uv run maturin dev -m arro3-io/Cargo.toml 68 | 69 | - name: Run python tests 70 | run: | 71 | uv run pytest tests 72 | 73 | # Ensure docs build without warnings 74 | - name: Check docs 75 | run: uv run mkdocs build --strict 76 | -------------------------------------------------------------------------------- /arro3-io/src/lib.rs: -------------------------------------------------------------------------------- 1 | use pyo3::exceptions::PyRuntimeWarning; 2 | use pyo3::intern; 3 | use pyo3::prelude::*; 4 | use pyo3::types::PyTuple; 5 | 6 | mod csv; 7 | mod error; 8 | mod ipc; 9 | mod json; 10 | mod parquet; 11 | mod utils; 12 | 13 | const VERSION: &str = env!("CARGO_PKG_VERSION"); 14 | 15 | #[pyfunction] 16 | fn ___version() -> &'static str { 17 | VERSION 18 | } 19 | 20 | /// Raise RuntimeWarning for debug builds 21 | #[pyfunction] 22 | fn check_debug_build(py: Python) -> PyResult<()> { 23 | #[cfg(debug_assertions)] 24 | { 25 | let warnings_mod = py.import(intern!(py, "warnings"))?; 26 | let warning = PyRuntimeWarning::new_err( 27 | "arro3.io has not been compiled in release mode. 
Performance will be degraded.", 28 | ); 29 | let args = PyTuple::new(py, vec![warning])?; 30 | warnings_mod.call_method1(intern!(py, "warn"), args)?; 31 | } 32 | 33 | Ok(()) 34 | } 35 | 36 | #[pymodule] 37 | fn _io(py: Python, m: &Bound<PyModule>) -> PyResult<()> { 38 | check_debug_build(py)?; 39 | 40 | m.add_wrapped(wrap_pyfunction!(___version))?; 41 | 42 | pyo3_object_store::register_store_module(py, m, "arro3.io", "store")?; 43 | pyo3_object_store::register_exceptions_module(py, m, "arro3.io", "exceptions")?; 44 | 45 | m.add_wrapped(wrap_pyfunction!(csv::infer_csv_schema))?; 46 | m.add_wrapped(wrap_pyfunction!(csv::read_csv))?; 47 | m.add_wrapped(wrap_pyfunction!(csv::write_csv))?; 48 | 49 | m.add_wrapped(wrap_pyfunction!(json::infer_json_schema))?; 50 | m.add_wrapped(wrap_pyfunction!(json::read_json))?; 51 | m.add_wrapped(wrap_pyfunction!(json::write_json))?; 52 | m.add_wrapped(wrap_pyfunction!(json::write_ndjson))?; 53 | 54 | m.add_wrapped(wrap_pyfunction!(ipc::read_ipc))?; 55 | m.add_wrapped(wrap_pyfunction!(ipc::read_ipc_stream))?; 56 | m.add_wrapped(wrap_pyfunction!(ipc::write_ipc))?; 57 | m.add_wrapped(wrap_pyfunction!(ipc::write_ipc_stream))?; 58 | 59 | m.add_wrapped(wrap_pyfunction!(parquet::read_parquet))?; 60 | m.add_wrapped(wrap_pyfunction!(parquet::read_parquet_async))?; 61 | m.add_wrapped(wrap_pyfunction!(parquet::write_parquet))?; 62 | 63 | Ok(()) 64 | } 65 | -------------------------------------------------------------------------------- /arro3-compute/src/boolean.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use arrow_array::ArrayRef; 4 | use arrow_schema::{DataType, Field}; 5 | use pyo3::prelude::*; 6 | use pyo3_arrow::error::PyArrowResult; 7 | use pyo3_arrow::ffi::ArrayIterator; 8 | use pyo3_arrow::input::AnyArray; 9 | use pyo3_arrow::{PyArray, PyArrayReader}; 10 | 11 | #[pyfunction] 12 | pub fn is_null(py: Python, input: AnyArray) -> PyArrowResult<PyObject> { 13 | match input { 14 | AnyArray::Array(input) => { 15 | let out = arrow_arith::boolean::is_null(input.as_ref())?; 16 | Ok(PyArray::from_array_ref(Arc::new(out)) 17 | .to_arro3(py)? 18 | .unbind()) 19 | } 20 | AnyArray::Stream(input) => { 21 | let input = input.into_reader()?; 22 | let out_field = Field::new("", DataType::Boolean, true); 23 | 24 | let iter = input.into_iter().map(move |input| { 25 | let out = arrow_arith::boolean::is_null(&input?)?; 26 | Ok(Arc::new(out) as ArrayRef) 27 | }); 28 | Ok( 29 | PyArrayReader::new(Box::new(ArrayIterator::new(iter, out_field.into()))) 30 | .to_arro3(py)? 31 | .unbind(), 32 | ) 33 | } 34 | } 35 | } 36 | 37 | #[pyfunction] 38 | pub fn is_not_null(py: Python, input: AnyArray) -> PyArrowResult<PyObject> { 39 | match input { 40 | AnyArray::Array(input) => { 41 | let out = arrow_arith::boolean::is_not_null(input.as_ref())?; 42 | Ok(PyArray::from_array_ref(Arc::new(out)) 43 | .to_arro3(py)? 44 | .unbind()) 45 | } 46 | AnyArray::Stream(input) => { 47 | let input = input.into_reader()?; 48 | let out_field = Field::new("", DataType::Boolean, true); 49 | 50 | let iter = input.into_iter().map(move |input| { 51 | let out = arrow_arith::boolean::is_not_null(&input?)?; 52 | Ok(Arc::new(out) as ArrayRef) 53 | }); 54 | Ok( 55 | PyArrayReader::new(Box::new(ArrayIterator::new(iter, out_field.into()))) 56 | .to_arro3(py)?
57 | .unbind(), 58 | ) 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /arro3-compute/src/filter.rs: -------------------------------------------------------------------------------- 1 | use arrow_array::cast::AsArray; 2 | use arrow_schema::{ArrowError, DataType}; 3 | use pyo3::exceptions::PyValueError; 4 | use pyo3::prelude::*; 5 | use pyo3_arrow::error::PyArrowResult; 6 | use pyo3_arrow::ffi::ArrayIterator; 7 | use pyo3_arrow::input::AnyArray; 8 | use pyo3_arrow::{PyArray, PyArrayReader}; 9 | 10 | #[pyfunction] 11 | pub fn filter(py: Python, values: AnyArray, predicate: AnyArray) -> PyArrowResult<PyObject> { 12 | match (values, predicate) { 13 | (AnyArray::Array(values), AnyArray::Array(predicate)) => { 14 | let (values, values_field) = values.into_inner(); 15 | let predicate = predicate 16 | .as_ref() 17 | .as_boolean_opt() 18 | .ok_or(ArrowError::ComputeError( 19 | "Expected boolean array for predicate".to_string(), 20 | ))?; 21 | 22 | let filtered = arrow_select::filter::filter(values.as_ref(), predicate)?; 23 | Ok(PyArray::new(filtered, values_field).to_arro3(py)?.unbind()) 24 | } 25 | (AnyArray::Stream(values), AnyArray::Stream(predicate)) => { 26 | let values = values.into_reader()?; 27 | let predicate = predicate.into_reader()?; 28 | 29 | if !predicate 30 | .field() 31 | .data_type() 32 | .equals_datatype(&DataType::Boolean) 33 | { 34 | return Err(PyValueError::new_err("Expected boolean array for predicate").into()); 35 | } 36 | 37 | let values_field = values.field(); 38 | 39 | let iter = values 40 | .into_iter() 41 | .zip(predicate) 42 | .map(move |(values, predicate)| { 43 | let predicate_arr = predicate?; 44 | let filtered = 45 | arrow_select::filter::filter(values?.as_ref(), predicate_arr.as_boolean())?; 46 | Ok(filtered) 47 | }); 48 | Ok( 49 | PyArrayReader::new(Box::new(ArrayIterator::new(iter, values_field))) 50 | .to_arro3(py)? 51 | .unbind(), 52 | ) 53 | } 54 | _ => Err(PyValueError::new_err("Unsupported combination of array and stream").into()), 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /arro3-core/src/lib.rs: -------------------------------------------------------------------------------- 1 | use pyo3::exceptions::PyRuntimeWarning; 2 | use pyo3::intern; 3 | use pyo3::prelude::*; 4 | use pyo3::types::PyTuple; 5 | 6 | mod accessors; 7 | mod constructors; 8 | 9 | const VERSION: &str = env!("CARGO_PKG_VERSION"); 10 | 11 | #[pyfunction] 12 | fn ___version() -> &'static str { 13 | VERSION 14 | } 15 | 16 | /// Raise RuntimeWarning for debug builds 17 | #[pyfunction] 18 | fn check_debug_build(py: Python) -> PyResult<()> { 19 | #[cfg(debug_assertions)] 20 | { 21 | let warnings_mod = py.import(intern!(py, "warnings"))?; 22 | let warning = PyRuntimeWarning::new_err( 23 | "arro3.core has not been compiled in release mode. Performance will be degraded.", 24 | ); 25 | let args = PyTuple::new(py, vec![warning])?; 26 | warnings_mod.call_method1(intern!(py, "warn"), args)?; 27 | } 28 | 29 | Ok(()) 30 | } 31 | 32 | /// A Python module implemented in Rust.
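///
/// From Python, the classes registered here are exposed as `arro3.core`; for
/// example (mirroring the constructors used throughout the test suite):
///
/// ```python
/// from arro3.core import Array, DataType
/// arr = Array([1, 2, 3], DataType.int16())
/// ```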
33 | #[pymodule] 34 | fn _core(py: Python, m: &Bound<PyModule>) -> PyResult<()> { 35 | check_debug_build(py)?; 36 | 37 | m.add_wrapped(wrap_pyfunction!(___version))?; 38 | 39 | m.add_class::<pyo3_arrow::PyArray>()?; 40 | m.add_class::<pyo3_arrow::PyArrayReader>()?; 41 | m.add_class::<pyo3_arrow::buffer::PyArrowBuffer>()?; 42 | m.add_class::<pyo3_arrow::PyChunkedArray>()?; 43 | m.add_class::<pyo3_arrow::PyDataType>()?; 44 | m.add_class::<pyo3_arrow::PyField>()?; 45 | m.add_class::<pyo3_arrow::PyRecordBatch>()?; 46 | m.add_class::<pyo3_arrow::PyRecordBatchReader>()?; 47 | m.add_class::<pyo3_arrow::PyScalar>()?; 48 | m.add_class::<pyo3_arrow::PySchema>()?; 49 | m.add_class::<pyo3_arrow::PyTable>()?; 50 | 51 | m.add_wrapped(wrap_pyfunction!( 52 | accessors::dictionary::dictionary_dictionary 53 | ))?; 54 | m.add_wrapped(wrap_pyfunction!(accessors::dictionary::dictionary_indices))?; 55 | m.add_wrapped(wrap_pyfunction!(accessors::list_flatten::list_flatten))?; 56 | m.add_wrapped(wrap_pyfunction!(accessors::list_offsets::list_offsets))?; 57 | m.add_wrapped(wrap_pyfunction!(accessors::struct_field::struct_field))?; 58 | 59 | m.add_wrapped(wrap_pyfunction!(constructors::fixed_size_list_array))?; 60 | m.add_wrapped(wrap_pyfunction!(constructors::list_array))?; 61 | m.add_wrapped(wrap_pyfunction!(constructors::struct_array))?; 62 | 63 | Ok(()) 64 | } 65 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/from_python/input.rs: -------------------------------------------------------------------------------- 1 | use crate::array_reader::PyArrayReader; 2 | use crate::input::{AnyArray, AnyDatum, AnyRecordBatch}; 3 | use crate::{PyArray, PyScalar}; 4 | use pyo3::exceptions::PyValueError; 5 | use pyo3::prelude::*; 6 | use pyo3::{intern, PyAny, PyResult}; 7 | 8 | impl<'a> FromPyObject<'a> for AnyRecordBatch { 9 | fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult<Self> { 10 | if ob.hasattr(intern!(ob.py(), "__arrow_c_array__"))? { 11 | Ok(Self::RecordBatch(ob.extract()?)) 12 | } else if ob.hasattr(intern!(ob.py(), "__arrow_c_stream__"))? { 13 | Ok(Self::Stream(ob.extract()?)) 14 | } else { 15 | Err(PyValueError::new_err( 16 | "Expected object with __arrow_c_array__ or __arrow_c_stream__ method", 17 | )) 18 | } 19 | } 20 | } 21 | 22 | impl<'a> FromPyObject<'a> for AnyArray { 23 | fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult<Self> { 24 | // First extract directly if the __arrow_c_array__ method is present, so that any exception 25 | // raised in it gets propagated. Also check if PyArray extract works so that Buffer Protocol 26 | // conversion still works. 27 | // Do the same for __arrow_c_stream__ and PyArrayReader below. 28 | if ob.hasattr(intern!(ob.py(), "__arrow_c_array__"))? { 29 | Ok(Self::Array(ob.extract()?)) 30 | } else if let Ok(arr) = ob.extract::<PyArray>() { 31 | Ok(Self::Array(arr)) 32 | } else if ob.hasattr(intern!(ob.py(), "__arrow_c_stream__"))?
{ 33 | Ok(Self::Stream(ob.extract()?)) 34 | } else if let Ok(stream) = ob.extract::<PyArrayReader>() { 35 | Ok(Self::Stream(stream)) 36 | } else { 37 | Err(PyValueError::new_err( 38 | "Expected object with __arrow_c_array__ or __arrow_c_stream__ method or implementing buffer protocol.", 39 | )) 40 | } 41 | } 42 | } 43 | 44 | impl<'a> FromPyObject<'a> for AnyDatum { 45 | fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult<Self> { 46 | let array = ob.extract::<PyArray>()?; 47 | if array.as_ref().len() == 1 { 48 | let (array, field) = array.into_inner(); 49 | Ok(Self::Scalar(PyScalar::try_new(array, field)?)) 50 | } else { 51 | Ok(Self::Array(array)) 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /arro3-compute/src/arith.rs: -------------------------------------------------------------------------------- 1 | use arrow_arith::numeric; 2 | use pyo3::prelude::*; 3 | use pyo3_arrow::error::PyArrowResult; 4 | use pyo3_arrow::input::AnyDatum; 5 | use pyo3_arrow::PyArray; 6 | 7 | #[pyfunction] 8 | pub fn add(py: Python, lhs: AnyDatum, rhs: AnyDatum) -> PyArrowResult<PyObject> { 9 | Ok(PyArray::from_array_ref(numeric::add(&lhs, &rhs)?) 10 | .to_arro3(py)? 11 | .unbind()) 12 | } 13 | 14 | #[pyfunction] 15 | pub fn add_wrapping(py: Python, lhs: AnyDatum, rhs: AnyDatum) -> PyArrowResult<PyObject> { 16 | Ok(PyArray::from_array_ref(numeric::add_wrapping(&lhs, &rhs)?) 17 | .to_arro3(py)? 18 | .unbind()) 19 | } 20 | 21 | #[pyfunction] 22 | pub fn div(py: Python, lhs: AnyDatum, rhs: AnyDatum) -> PyArrowResult<PyObject> { 23 | Ok(PyArray::from_array_ref(numeric::div(&lhs, &rhs)?) 24 | .to_arro3(py)? 25 | .unbind()) 26 | } 27 | 28 | #[pyfunction] 29 | pub fn mul(py: Python, lhs: AnyDatum, rhs: AnyDatum) -> PyArrowResult<PyObject> { 30 | Ok(PyArray::from_array_ref(numeric::mul(&lhs, &rhs)?) 31 | .to_arro3(py)? 32 | .unbind()) 33 | } 34 | 35 | #[pyfunction] 36 | pub fn mul_wrapping(py: Python, lhs: AnyDatum, rhs: AnyDatum) -> PyArrowResult<PyObject> { 37 | Ok(PyArray::from_array_ref(numeric::mul_wrapping(&lhs, &rhs)?) 38 | .to_arro3(py)? 39 | .unbind()) 40 | } 41 | 42 | #[pyfunction] 43 | pub fn neg(py: Python, array: PyArray) -> PyArrowResult<PyObject> { 44 | Ok(PyArray::from_array_ref(numeric::neg(array.as_ref())?) 45 | .to_arro3(py)? 46 | .unbind()) 47 | } 48 | 49 | #[pyfunction] 50 | pub fn neg_wrapping(py: Python, array: PyArray) -> PyArrowResult<PyObject> { 51 | Ok( 52 | PyArray::from_array_ref(numeric::neg_wrapping(array.as_ref())?) 53 | .to_arro3(py)? 54 | .unbind(), 55 | ) 56 | } 57 | 58 | #[pyfunction] 59 | pub fn rem(py: Python, lhs: AnyDatum, rhs: AnyDatum) -> PyArrowResult<PyObject> { 60 | Ok(PyArray::from_array_ref(numeric::rem(&lhs, &rhs)?) 61 | .to_arro3(py)? 62 | .unbind()) 63 | } 64 | 65 | #[pyfunction] 66 | pub fn sub(py: Python, lhs: AnyDatum, rhs: AnyDatum) -> PyArrowResult<PyObject> { 67 | Ok(PyArray::from_array_ref(numeric::sub(&lhs, &rhs)?) 68 | .to_arro3(py)? 69 | .unbind()) 70 | } 71 | 72 | #[pyfunction] 73 | pub fn sub_wrapping(py: Python, lhs: AnyDatum, rhs: AnyDatum) -> PyArrowResult<PyObject> { 74 | Ok(PyArray::from_array_ref(numeric::sub_wrapping(&lhs, &rhs)?) 75 | .to_arro3(py)?
76 | .unbind()) 77 | } 78 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Publish arro3 docs 2 | 3 | # Only run on new tags starting with `py-v` 4 | on: 5 | push: 6 | tags: 7 | - "py-v*" 8 | workflow_dispatch: 9 | 10 | # https://stackoverflow.com/a/77412363 11 | permissions: 12 | contents: write 13 | pages: write 14 | 15 | jobs: 16 | build: 17 | name: Deploy Python docs 18 | runs-on: ubuntu-latest 19 | steps: 20 | - uses: actions/checkout@v4 21 | # We need to additionally fetch the gh-pages branch for mike deploy 22 | with: 23 | fetch-depth: 0 24 | 25 | - name: Install Rust 26 | uses: dtolnay/rust-toolchain@stable 27 | 28 | - uses: Swatinem/rust-cache@v2 29 | 30 | - name: Set up Python 3.11 31 | id: setup-python 32 | uses: actions/setup-python@v4 33 | with: 34 | python-version: "3.11" 35 | 36 | - name: Install a specific version of uv 37 | uses: astral-sh/setup-uv@v3 38 | with: 39 | enable-cache: true 40 | version: "0.4.x" 41 | 42 | - name: Install dependencies 43 | run: uv sync 44 | 45 | - name: Build python packages 46 | run: | 47 | # arro3-core needs to be first 48 | uv run maturin dev -m arro3-core/Cargo.toml 49 | uv run maturin dev -m arro3-compute/Cargo.toml 50 | uv run maturin dev -m arro3-io/Cargo.toml 51 | 52 | - name: Deploy docs 53 | env: 54 | GIT_COMMITTER_NAME: CI 55 | GIT_COMMITTER_EMAIL: ci-bot@example.com 56 | run: | 57 | # Get most recent git tag 58 | # https://stackoverflow.com/a/7261049 59 | # https://stackoverflow.com/a/3867811 60 | # We don't use {{github.ref_name}} because if triggered manually, it 61 | # will be a branch name instead of a tag version. 62 | # Then remove `py-` from the tag 63 | VERSION=$(git describe --tags --match="py-*" --abbrev=0 | cut -c 4-) 64 | 65 | # Only push docs if no letters in git tag after the first character 66 | # (usually the git tag will have v as the first character) 67 | # Note the `cut` index is 1-ordered 68 | if echo $VERSION | cut -c 2- | grep -q "[A-Za-z]"; then 69 | echo "Is beta version" 70 | # For beta versions publish but don't set as latest 71 | uv run mike deploy $VERSION --update-aliases --push 72 | else 73 | echo "Is NOT beta version" 74 | uv run mike deploy $VERSION latest --update-aliases --push 75 | fi 76 | -------------------------------------------------------------------------------- /arro3-compute/src/lib.rs: -------------------------------------------------------------------------------- 1 | use pyo3::exceptions::PyRuntimeWarning; 2 | use pyo3::intern; 3 | use pyo3::prelude::*; 4 | use pyo3::types::PyTuple; 5 | 6 | mod aggregate; 7 | mod arith; 8 | mod boolean; 9 | mod cast; 10 | mod concat; 11 | mod dictionary; 12 | mod filter; 13 | mod take; 14 | mod temporal; 15 | 16 | const VERSION: &str = env!("CARGO_PKG_VERSION"); 17 | 18 | #[pyfunction] 19 | fn ___version() -> &'static str { 20 | VERSION 21 | } 22 | 23 | /// Raise RuntimeWarning for debug builds 24 | #[pyfunction] 25 | fn check_debug_build(py: Python) -> PyResult<()> { 26 | #[cfg(debug_assertions)] 27 | { 28 | let warnings_mod = py.import(intern!(py, "warnings"))?; 29 | let warning = PyRuntimeWarning::new_err( 30 | "arro3.compute has not been compiled in release mode. 
Performance will be degraded.", 31 | ); 32 | let args = PyTuple::new(py, vec![warning])?; 33 | warnings_mod.call_method1(intern!(py, "warn"), args)?; 34 | } 35 | 36 | Ok(()) 37 | } 38 | 39 | #[pymodule] 40 | fn _compute(py: Python, m: &Bound<PyModule>) -> PyResult<()> { 41 | check_debug_build(py)?; 42 | 43 | m.add_wrapped(wrap_pyfunction!(___version))?; 44 | 45 | m.add_wrapped(wrap_pyfunction!(aggregate::max))?; 46 | m.add_wrapped(wrap_pyfunction!(aggregate::min))?; 47 | m.add_wrapped(wrap_pyfunction!(aggregate::sum))?; 48 | m.add_wrapped(wrap_pyfunction!(arith::add_wrapping))?; 49 | m.add_wrapped(wrap_pyfunction!(arith::add))?; 50 | m.add_wrapped(wrap_pyfunction!(arith::div))?; 51 | m.add_wrapped(wrap_pyfunction!(arith::mul_wrapping))?; 52 | m.add_wrapped(wrap_pyfunction!(arith::mul))?; 53 | m.add_wrapped(wrap_pyfunction!(arith::neg_wrapping))?; 54 | m.add_wrapped(wrap_pyfunction!(arith::neg))?; 55 | m.add_wrapped(wrap_pyfunction!(arith::rem))?; 56 | m.add_wrapped(wrap_pyfunction!(arith::sub_wrapping))?; 57 | m.add_wrapped(wrap_pyfunction!(arith::sub))?; 58 | m.add_wrapped(wrap_pyfunction!(boolean::is_not_null))?; 59 | m.add_wrapped(wrap_pyfunction!(boolean::is_null))?; 60 | m.add_wrapped(wrap_pyfunction!(cast::can_cast_types))?; 61 | m.add_wrapped(wrap_pyfunction!(cast::cast))?; 62 | m.add_wrapped(wrap_pyfunction!(concat::concat))?; 63 | m.add_wrapped(wrap_pyfunction!(dictionary::dictionary_encode))?; 64 | m.add_wrapped(wrap_pyfunction!(filter::filter))?; 65 | m.add_wrapped(wrap_pyfunction!(take::take))?; 66 | m.add_wrapped(wrap_pyfunction!(temporal::date_part))?; 67 | 68 | Ok(()) 69 | } 70 | -------------------------------------------------------------------------------- /arro3-core/python/arro3/core/_field.pyi: -------------------------------------------------------------------------------- 1 | from ._data_type import DataType 2 | from .types import ArrowSchemaExportable 3 | 4 | class Field: 5 | """An Arrow Field.""" 6 | def __init__( 7 | self, 8 | name: str, 9 | type: ArrowSchemaExportable, 10 | nullable: bool = True, 11 | *, 12 | metadata: dict[str, str] | dict[bytes, bytes] | None = None, 13 | ) -> None: ... 14 | def __arrow_c_schema__(self) -> object: 15 | """ 16 | An implementation of the [Arrow PyCapsule 17 | Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). 18 | This dunder method should not be called directly, but enables zero-copy data 19 | transfer to other Python libraries that understand Arrow memory. 20 | 21 | For example, you can call [`pyarrow.field()`][pyarrow.field] to convert this 22 | field into a pyarrow field, without copying memory. 23 | """ 24 | def __eq__(self, other) -> bool: ... 25 | def __repr__(self) -> str: ... 26 | @classmethod 27 | def from_arrow(cls, input: ArrowSchemaExportable) -> Field: 28 | """Construct this from an existing Arrow object. 29 | 30 | It can be called on anything that exports the Arrow schema interface 31 | (has an `__arrow_c_schema__` method).
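        For example (a sketch, assuming pyarrow is installed):

            import pyarrow as pa

            field = Field.from_arrow(pa.field("a", pa.int64()))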
32 | """ 33 | @classmethod 34 | def from_arrow_pycapsule(cls, capsule) -> Field: 35 | """Construct this object from a bare Arrow PyCapsule""" 36 | 37 | def equals(self, other: ArrowSchemaExportable) -> bool: 38 | """Test if this field is equal to the other.""" 39 | @property 40 | def metadata(self) -> dict[bytes, bytes]: 41 | """The schema's metadata.""" 42 | @property 43 | def metadata_str(self) -> dict[str, str]: 44 | """The schema's metadata where keys and values are `str`, not `bytes`.""" 45 | @property 46 | def name(self) -> str: 47 | """The field name.""" 48 | @property 49 | def nullable(self) -> bool: 50 | """The field nullability.""" 51 | def remove_metadata(self) -> Field: 52 | """Create new field without metadata, if any.""" 53 | @property 54 | def type(self) -> DataType: 55 | """Access the data type of this field.""" 56 | def with_metadata(self, metadata: dict[str, str] | dict[bytes, bytes]) -> Field: 57 | """Add metadata as dict of string keys and values to Field.""" 58 | def with_name(self, name: str) -> Field: 59 | """A copy of this field with the replaced name.""" 60 | def with_nullable(self, nullable: bool) -> Field: 61 | """A copy of this field with the replaced nullability.""" 62 | def with_type(self, new_type: ArrowSchemaExportable) -> Field: 63 | """A copy of this field with the replaced type""" 64 | -------------------------------------------------------------------------------- /arro3-io/python/arro3/io/_json.pyi: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import IO 3 | 4 | # Note: importing with 5 | # `from arro3.core import Array` 6 | # will cause Array to be included in the generated docs in this module. 7 | import arro3.core as core 8 | import arro3.core.types as types 9 | 10 | def infer_json_schema( 11 | file: IO[bytes] | Path | str, 12 | *, 13 | max_records: int | None = None, 14 | ) -> core.Schema: 15 | """ 16 | Infer the schema of a JSON file by reading the first n records of the buffer, with 17 | `max_records` controlling the maximum number of records to read. 18 | 19 | Args: 20 | file: The input JSON path or buffer. 21 | max_records: The maximum number of records to read to infer schema. If not 22 | provided, will read the entire file to deduce field types. Defaults to None. 23 | 24 | Returns: 25 | Inferred Arrow Schema 26 | """ 27 | 28 | def read_json( 29 | file: IO[bytes] | Path | str, 30 | schema: types.ArrowSchemaExportable, 31 | *, 32 | batch_size: int | None = None, 33 | ) -> core.RecordBatchReader: 34 | """Reads JSON data with a known schema into Arrow 35 | 36 | Args: 37 | file: The JSON file or buffer to read from. 38 | schema: The Arrow schema representing the JSON data. 39 | batch_size: Set the batch size (number of records to load at one time). Defaults 40 | to None. 41 | 42 | Returns: 43 | An arrow RecordBatchReader. 44 | """ 45 | 46 | def write_json( 47 | data: types.ArrowStreamExportable | types.ArrowArrayExportable, 48 | file: IO[bytes] | Path | str, 49 | *, 50 | explicit_nulls: bool | None = None, 51 | ) -> None: 52 | """Write Arrow data to JSON. 53 | 54 | By default the writer will skip writing keys with null values for backward 55 | compatibility. 56 | 57 | Args: 58 | data: the Arrow Table, RecordBatchReader, or RecordBatch to write. 59 | file: the output file or buffer to write to 60 | explicit_nulls: Set whether to keep keys with null values, or to omit writing 61 | them. Defaults to skipping nulls. 
62 | """ 63 | 64 | def write_ndjson( 65 | data: types.ArrowStreamExportable | types.ArrowArrayExportable, 66 | file: IO[bytes] | Path | str, 67 | *, 68 | explicit_nulls: bool | None = None, 69 | ) -> None: 70 | """Write Arrow data to newline-delimited JSON. 71 | 72 | By default the writer will skip writing keys with null values for backward 73 | compatibility. 74 | 75 | Args: 76 | data: the Arrow Table, RecordBatchReader, or RecordBatch to write. 77 | file: the output file or buffer to write to 78 | explicit_nulls: Set whether to keep keys with null values, or to omit writing 79 | them. Defaults to skipping nulls. 80 | """ 81 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = ["arro3-compute", "arro3-core", "arro3-io"] 3 | # Note: we exclude pyo3-arrow from the top-level workspace because we have a 4 | # circular dependency. pyo3-arrow is depended on by obstore to return arrow 5 | # results as a list, which is depended on by arro3-io. This makes it hard to 6 | # upgrade versions. 7 | exclude = ["pyo3-arrow"] 8 | resolver = "2" 9 | 10 | [workspace.package] 11 | # Package version for arro3-*, not for pyo3-arrow 12 | version = "0.6.1" 13 | authors = ["Kyle Barron "] 14 | edition = "2021" 15 | homepage = "https://kylebarron.dev/arro3" 16 | repository = "https://github.com/kylebarron/arro3" 17 | license = "MIT OR Apache-2.0" 18 | keywords = ["python", "arrow"] 19 | categories = [] 20 | rust-version = "1.75" 21 | 22 | [workspace.dependencies] 23 | arrow-arith = "56" 24 | arrow-array = { version = "56", features = ["ffi"] } 25 | arrow-buffer = "56" 26 | arrow-cast = "56" 27 | arrow-csv = "56" 28 | arrow-ipc = { version = "56", features = ["lz4", "zstd"] } 29 | arrow-json = "56" 30 | arrow-schema = "56" 31 | arrow-select = "56" 32 | bytes = "1.7.0" 33 | half = "2" 34 | indexmap = "2" 35 | numpy = "0.25" 36 | object_store = "0.12.1" 37 | parquet = "56" 38 | pyo3 = { version = "0.25", features = ["macros", "indexmap"] } 39 | # pyo3-arrow = "0.11.0" 40 | pyo3-arrow = { git = "https://github.com/kylebarron/arro3", rev = "cb2453bf022d0d8704e56e81a324ab5a772e0247" } 41 | # pyo3-arrow = { path = "./pyo3-arrow" } 42 | pyo3-async-runtimes = { version = "0.25", features = ["tokio-runtime"] } 43 | pyo3-file = "0.13.0" 44 | pyo3-object_store = "0.5" 45 | thiserror = "1.0.63" 46 | 47 | [patch.crates-io] 48 | arrow-arith = { git = "https://github.com/jhorstmann/arrow-rs", rev = "e6ba0869949806f420b94d7cf7b8228ce6d5a369" } 49 | arrow-array = { git = "https://github.com/jhorstmann/arrow-rs", rev = "e6ba0869949806f420b94d7cf7b8228ce6d5a369" } 50 | arrow-buffer = { git = "https://github.com/jhorstmann/arrow-rs", rev = "e6ba0869949806f420b94d7cf7b8228ce6d5a369" } 51 | arrow-cast = { git = "https://github.com/jhorstmann/arrow-rs", rev = "e6ba0869949806f420b94d7cf7b8228ce6d5a369" } 52 | arrow-csv = { git = "https://github.com/jhorstmann/arrow-rs", rev = "e6ba0869949806f420b94d7cf7b8228ce6d5a369" } 53 | arrow-ipc = { git = "https://github.com/jhorstmann/arrow-rs", rev = "e6ba0869949806f420b94d7cf7b8228ce6d5a369" } 54 | arrow-json = { git = "https://github.com/jhorstmann/arrow-rs", rev = "e6ba0869949806f420b94d7cf7b8228ce6d5a369" } 55 | arrow-schema = { git = "https://github.com/jhorstmann/arrow-rs", rev = "e6ba0869949806f420b94d7cf7b8228ce6d5a369" } 56 | arrow-select = { git = "https://github.com/jhorstmann/arrow-rs", rev = "e6ba0869949806f420b94d7cf7b8228ce6d5a369" 
} 57 | parquet = { git = "https://github.com/jhorstmann/arrow-rs", rev = "e6ba0869949806f420b94d7cf7b8228ce6d5a369" } 58 | 59 | [profile.release] 60 | lto = true 61 | codegen-units = 1 62 | -------------------------------------------------------------------------------- /tests/core/test_chunked_array.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | 3 | import pyarrow as pa 4 | import pytest 5 | from arro3.core import Array, ChunkedArray, DataType 6 | 7 | 8 | def test_constructor(): 9 | arr = Array([1, 2, 3], DataType.int16()) 10 | arr2 = Array([4, 5, 6], DataType.int16()) 11 | ca = ChunkedArray([arr, arr2]) 12 | assert pa.chunked_array(ca) == pa.chunked_array([arr, arr2]) 13 | 14 | 15 | def test_repr(): 16 | arr = Array([1, 2, 3], DataType.int16()) 17 | arr2 = Array([4, 5, 6], DataType.int16()) 18 | ca = ChunkedArray([arr, arr2]) 19 | expected = """\ 20 | arro3.core.ChunkedArray 21 | [ 22 | [ 23 | 1, 24 | 2, 25 | 3, 26 | ] 27 | [ 28 | 4, 29 | 5, 30 | 6, 31 | ] 32 | ] 33 | """ 34 | assert repr(ca) == dedent(expected) 35 | 36 | arr = Array([1.0, 2.0, 3.0], DataType.float64()) 37 | arr2 = Array([4.0, 5.0, 6.0], DataType.float64()) 38 | ca = ChunkedArray([arr, arr2]) 39 | expected = """\ 40 | arro3.core.ChunkedArray 41 | [ 42 | [ 43 | 1.0, 44 | 2.0, 45 | 3.0, 46 | ] 47 | [ 48 | 4.0, 49 | 5.0, 50 | 6.0, 51 | ] 52 | ] 53 | """ 54 | assert repr(ca) == dedent(expected) 55 | 56 | arr = Array(["foo"], DataType.string()) 57 | arr2 = Array(["bar"], DataType.string()) 58 | arr3 = Array(["baz"], DataType.string()) 59 | ca = ChunkedArray([arr, arr2, arr3]) 60 | expected = """\ 61 | arro3.core.ChunkedArray 62 | [ 63 | [ 64 | foo, 65 | ] 66 | [ 67 | bar, 68 | ] 69 | [ 70 | baz, 71 | ] 72 | ] 73 | """ 74 | assert repr(ca) == dedent(expected) 75 | 76 | 77 | class CustomException(Exception): 78 | pass 79 | 80 | 81 | class ArrowCStreamFails: 82 | def __arrow_c_stream__(self, requested_schema=None): 83 | raise CustomException 84 | 85 | 86 | class ArrowCArrayFails: 87 | def __arrow_c_array__(self, requested_schema=None): 88 | raise CustomException 89 | 90 | 91 | def test_chunked_array_import_preserve_exception(): 92 | """https://github.com/kylebarron/arro3/issues/325""" 93 | 94 | c_stream_obj = ArrowCStreamFails() 95 | with pytest.raises(CustomException): 96 | ChunkedArray.from_arrow(c_stream_obj) 97 | 98 | with pytest.raises(CustomException): 99 | ChunkedArray(c_stream_obj) 100 | 101 | c_array_obj = ArrowCArrayFails() 102 | with pytest.raises(CustomException): 103 | ChunkedArray.from_arrow(c_array_obj) 104 | 105 | with pytest.raises(CustomException): 106 | ChunkedArray(c_array_obj) 107 | -------------------------------------------------------------------------------- /tests/core/test_buffer_protocol.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | import arro3.compute as ac 4 | import numpy as np 5 | import pyarrow as pa 6 | import pytest 7 | from arro3.core import Array, Buffer 8 | 9 | 10 | def test_from_buffer(): 11 | arr = np.array([1.0, 2.0, 3.0], dtype=np.float64) 12 | mv = memoryview(arr) 13 | assert pa.array(mv) == pa.array(Array.from_buffer(mv)) 14 | 15 | arr = np.array([1, 2, 3], dtype=np.int64) 16 | mv = memoryview(arr) 17 | assert pa.array(mv) == pa.array(Array.from_buffer(mv)) 18 | 19 | # pyarrow applies some casting; this is weird 20 | # According to joris, this may be because pyarrow doesn't implement direct import of 21 | # buffer 
protocol objects, and instead infers from `pa.array(list(memoryview()))` 22 | # float32 -> float64 23 | # int32 -> int64 24 | # uint64 -> int64 25 | 26 | arr = np.array([1.0, 2.0, 3.0], dtype=np.float32) 27 | assert pa.array(Array.from_buffer(memoryview(arr))).type == pa.float32() 28 | 29 | arr = np.array([1, 2, 3], dtype=np.int32) 30 | assert pa.array(Array.from_buffer(memoryview(arr))).type == pa.int32() 31 | 32 | arr = np.array([1, 2, 3], dtype=np.int64) 33 | assert pa.array(Array.from_buffer(memoryview(arr))).type == pa.int64() 34 | 35 | arr = np.array([1, 2, 3], dtype=np.uint64) 36 | assert pa.array(Array.from_buffer(memoryview(arr))).type == pa.uint64() 37 | 38 | # Datetime array 39 | # https://stackoverflow.com/a/34325416 40 | arr = np.arange(datetime(1985, 7, 1), datetime(2015, 7, 1), timedelta(days=1)) 41 | with pytest.raises(ValueError): 42 | Array.from_buffer(arr) 43 | 44 | 45 | def test_operation_on_buffer(): 46 | np_arr = np.arange(1000, dtype=np.uint64) 47 | assert np.max(np_arr) == 999 48 | assert ac.max(np_arr).as_py() == 999 49 | 50 | indices = np.array([2, 3, 4], dtype=np.uint64) 51 | out = ac.take(np_arr, indices) 52 | assert pa.array(out) == pa.array(indices) 53 | 54 | 55 | def test_multi_dimensional(): 56 | np_arr = np.arange(6, dtype=np.uint8).reshape((2, 3)) 57 | arro3_arr = Array(np_arr) 58 | pa_arr = pa.array(arro3_arr) 59 | assert pa_arr.type.list_size == 3 60 | assert pa_arr.type.value_type == pa.uint8() 61 | 62 | np_arr = np.arange(12, dtype=np.uint8).reshape((1, 2, 3, 2)) 63 | arro3_arr = Array(np_arr) 64 | pa_arr = pa.array(arro3_arr) 65 | assert pa_arr.type.list_size == 2 66 | assert pa_arr.type.value_type.list_size == 3 67 | assert pa_arr.type.value_type.value_type.list_size == 2 68 | assert pa_arr.type.value_type.value_type.value_type == pa.uint8() 69 | 70 | 71 | def test_round_trip_buffer(): 72 | arr = np.arange(5, dtype=np.uint8) 73 | buffer = Buffer(arr) 74 | # Restore when upgrading to pyo3-arrow 0.6 75 | # assert len(buffer) == arr.nbytes 76 | retour = np.frombuffer(buffer, dtype=np.uint8) 77 | assert np.array_equal(arr, retour) 78 | 79 | assert np.array_equal(arr, Array(buffer).to_numpy()) 80 | -------------------------------------------------------------------------------- /tests/test_dictionary.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | import pyarrow as pa 4 | import pyarrow.compute as pc 5 | from arro3.compute import dictionary_encode 6 | from arro3.core import ChunkedArray, dictionary_dictionary, dictionary_indices 7 | 8 | 9 | def test_dictionary_encode(): 10 | arr = pa.array([1, 2, 3, 1, 2, 2, 3, 1, 1, 1], type=pa.uint16()) 11 | out = dictionary_encode(arr) 12 | out_pc = pc.dictionary_encode(arr) # type: ignore 13 | assert pa.array(out) == out_pc 14 | 15 | arr = pa.array(["1", "2", "3", "1", "2", "2", "3", "1", "1", "1"], type=pa.utf8()) 16 | out = dictionary_encode(arr) 17 | out_pc = pc.dictionary_encode(arr) # type: ignore 18 | assert pa.array(out) == out_pc 19 | 20 | arr = arr.cast(pa.large_utf8()) 21 | out = dictionary_encode(arr) 22 | out_pc = pc.dictionary_encode(arr) # type: ignore 23 | assert pa.array(out) == out_pc 24 | 25 | arr = arr.cast(pa.binary()) 26 | out = dictionary_encode(arr) 27 | out_pc = pc.dictionary_encode(arr) # type: ignore 28 | assert pa.array(out) == out_pc 29 | 30 | arr = arr.cast(pa.large_binary()) 31 | out = dictionary_encode(arr) 32 | out_pc = pc.dictionary_encode(arr) # type: ignore 33 | assert pa.array(out) == out_pc 34 | 35 | now 
= datetime.now() 36 | later = datetime.now() 37 | arr = pa.array([now, later, now, now, later]) 38 | out = dictionary_encode(arr) 39 | out_pc = pc.dictionary_encode(arr) # type: ignore 40 | assert pa.array(out) == out_pc 41 | 42 | 43 | def test_dictionary_encode_chunked(): 44 | arr = pa.chunked_array([[3, 2, 3], [1, 2, 2], [3, 1, 1, 1]], type=pa.uint16()) 45 | out = ChunkedArray(dictionary_encode(arr)) 46 | 47 | out_retour = pa.chunked_array(out) 48 | out_pc = pc.dictionary_encode(arr) # type: ignore 49 | 50 | # Since these arrays have different dictionaries, array and arrow scalar comparison 51 | # will fail. 52 | assert len(out_retour) == len(out_pc) 53 | for i in range(len(out_retour)): 54 | assert out_retour[i].as_py() == out_pc[i].as_py() 55 | 56 | 57 | def test_dictionary_access(): 58 | arr = pa.array([1, 2, 3, 1, 2, 2, 3, 1, 1, 1], type=pa.uint16()) 59 | out = dictionary_encode(arr) 60 | out_pc = pc.dictionary_encode(arr) # type: ignore 61 | 62 | keys = dictionary_dictionary(out) 63 | assert pa.array(keys) == out_pc.dictionary 64 | 65 | indices = dictionary_indices(out) 66 | assert pa.array(indices) == out_pc.indices 67 | 68 | 69 | def test_dictionary_access_chunked(): 70 | arr = pa.chunked_array([[3, 2, 3], [1, 2, 2], [3, 1, 1, 1]], type=pa.uint16()) 71 | out = ChunkedArray(dictionary_encode(arr)) 72 | out_pa = pa.chunked_array(out) 73 | 74 | dictionary = ChunkedArray(dictionary_dictionary(out)) 75 | assert pa.chunked_array(dictionary).chunks[0] == out_pa.chunks[0].dictionary 76 | 77 | indices = ChunkedArray(dictionary_indices(out)) 78 | assert pa.chunked_array(indices).chunks[0] == out_pa.chunks[0].indices 79 | -------------------------------------------------------------------------------- /tests/core/test_ffi.py: -------------------------------------------------------------------------------- 1 | import pyarrow as pa 2 | from arro3.core import Array, ChunkedArray, DataType, RecordBatchReader, Schema, Table 3 | 4 | 5 | def test_table_stream_export_schema_request(): 6 | a = pa.array(["a", "b", "c"], type=pa.utf8()) 7 | table = Table.from_pydict({"a": a}) 8 | 9 | requested_schema = Schema([pa.field("a", type=pa.large_utf8())]) 10 | requested_schema_capsule = requested_schema.__arrow_c_schema__() 11 | stream_capsule = table.__arrow_c_stream__(requested_schema_capsule) 12 | 13 | retour = Table.from_arrow_pycapsule(stream_capsule) 14 | assert retour.schema.field("a").type == DataType.large_utf8() 15 | 16 | 17 | def test_record_batch_reader_stream_export_schema_request(): 18 | a = pa.array(["a", "b", "c"], type=pa.utf8()) 19 | table = Table.from_pydict({"a": a}) 20 | reader = RecordBatchReader.from_batches(table.schema, table.to_batches()) 21 | 22 | requested_schema = Schema([pa.field("a", type=pa.large_utf8())]) 23 | requested_schema_capsule = requested_schema.__arrow_c_schema__() 24 | stream_capsule = reader.__arrow_c_stream__(requested_schema_capsule) 25 | 26 | retour = Table.from_arrow_pycapsule(stream_capsule) 27 | assert retour.schema.field("a").type == DataType.large_utf8() 28 | 29 | 30 | def test_chunked_array_stream_export_schema_request(): 31 | a = pa.array(["a", "b", "c"], type=pa.utf8()) 32 | ca = ChunkedArray([a, a]) 33 | 34 | requested_schema_capsule = pa.large_utf8().__arrow_c_schema__() 35 | stream_capsule = ca.__arrow_c_stream__(requested_schema_capsule) 36 | 37 | retour = ChunkedArray.from_arrow_pycapsule(stream_capsule) 38 | assert retour.type == DataType.large_utf8() 39 | 40 | 41 | def test_array_export_schema_request(): 42 | a = pa.array(["a", "b", "c"], 
type=pa.utf8()) 43 | arr = Array(a) 44 | 45 | requested_schema_capsule = pa.large_utf8().__arrow_c_schema__() 46 | capsules = arr.__arrow_c_array__(requested_schema_capsule) 47 | 48 | retour = Array.from_arrow_pycapsule(*capsules) 49 | assert retour.type == DataType.large_utf8() 50 | 51 | 52 | def test_table_metadata_preserved(): 53 | metadata = {b"hello": b"world"} 54 | pa_table = pa.table({"a": [1, 2, 3]}) 55 | pa_table = pa_table.replace_schema_metadata(metadata) 56 | 57 | arro3_table = Table(pa_table) 58 | assert arro3_table.schema.metadata == metadata 59 | 60 | pa_table_retour = pa.table(arro3_table) 61 | assert pa_table_retour.schema.metadata == metadata 62 | 63 | 64 | def test_record_batch_reader_metadata_preserved(): 65 | metadata = {b"hello": b"world"} 66 | pa_table = pa.table({"a": [1, 2, 3]}) 67 | pa_table = pa_table.replace_schema_metadata(metadata) 68 | pa_reader = pa.RecordBatchReader.from_stream(pa_table) 69 | 70 | arro3_reader = RecordBatchReader.from_stream(pa_reader) 71 | assert arro3_reader.schema.metadata == metadata 72 | 73 | pa_reader_retour = pa.RecordBatchReader.from_stream(arro3_reader) 74 | assert pa_reader_retour.schema.metadata == metadata 75 | -------------------------------------------------------------------------------- /arro3-core/src/accessors/list_flatten.rs: -------------------------------------------------------------------------------- 1 | use arrow_array::cast::AsArray; 2 | use arrow_array::ArrayRef; 3 | use arrow_schema::{ArrowError, DataType, FieldRef}; 4 | use pyo3::prelude::*; 5 | use pyo3_arrow::error::PyArrowResult; 6 | use pyo3_arrow::ffi::ArrayIterator; 7 | use pyo3_arrow::input::AnyArray; 8 | use pyo3_arrow::{PyArray, PyArrayReader}; 9 | 10 | #[pyfunction] 11 | pub fn list_flatten(py: Python, input: AnyArray) -> PyArrowResult<PyObject> { 12 | match input { 13 | AnyArray::Array(array) => { 14 | let (array, field) = array.into_inner(); 15 | let flat_array = flatten_array(array)?; 16 | let flat_field = flatten_field(field)?; 17 | Ok(PyArray::new(flat_array, flat_field).to_arro3(py)?.unbind()) 18 | } 19 | AnyArray::Stream(stream) => { 20 | let reader = stream.into_reader()?; 21 | let flatten_field = flatten_field(reader.field())?; 22 | 23 | let iter = reader.into_iter().map(move |array| { 24 | let out = flatten_array(array?)?; 25 | Ok(out) 26 | }); 27 | Ok( 28 | PyArrayReader::new(Box::new(ArrayIterator::new(iter, flatten_field))) 29 | .to_arro3(py)?
30 | .unbind(), 31 | ) 32 | } 33 | } 34 | } 35 | 36 | fn flatten_array(array: ArrayRef) -> Result<ArrayRef, ArrowError> { 37 | let offset = array.offset(); 38 | let length = array.len(); 39 | match array.data_type() { 40 | DataType::List(_) => { 41 | let arr = array.as_list::<i32>(); 42 | let start = arr.offsets().get(offset).unwrap(); 43 | let end = arr.offsets().get(offset + length).unwrap(); 44 | Ok(arr 45 | .values() 46 | .slice(*start as usize, (*end - *start) as usize) 47 | .clone()) 48 | } 49 | DataType::LargeList(_) => { 50 | let arr = array.as_list::<i64>(); 51 | let start = arr.offsets().get(offset).unwrap(); 52 | let end = arr.offsets().get(offset + length).unwrap(); 53 | Ok(arr 54 | .values() 55 | .slice(*start as usize, (*end - *start) as usize) 56 | .clone()) 57 | } 58 | DataType::FixedSizeList(_, list_size) => { 59 | let arr = array.as_fixed_size_list(); 60 | Ok(arr.values().clone().slice( 61 | offset * (*list_size as usize), 62 | length * (*list_size as usize), 63 | )) 64 | } 65 | _ => Err(ArrowError::SchemaError( 66 | "Expected list-typed Array".to_string(), 67 | )), 68 | } 69 | } 70 | 71 | fn flatten_field(field: FieldRef) -> Result<FieldRef, ArrowError> { 72 | match field.data_type() { 73 | DataType::List(inner_field) 74 | | DataType::LargeList(inner_field) 75 | | DataType::FixedSizeList(inner_field, _) => Ok(inner_field.clone()), 76 | _ => Err(ArrowError::SchemaError( 77 | "Expected list-typed Array".to_string(), 78 | )), 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /.github/workflows/pyodide-wheels.yml: -------------------------------------------------------------------------------- 1 | name: Build pyodide wheels 2 | 3 | on: 4 | # push: 5 | # tags: 6 | # - "py-v*" 7 | workflow_dispatch: 8 | inputs: 9 | python: 10 | description: "Python version" 11 | required: true 12 | default: "3.12" 13 | type: choice 14 | options: 15 | - 3.12 16 | - 3.13 17 | - 3.14 18 | - 3.15 19 | pyodide: 20 | description: "New Pyodide version to build for" 21 | required: true 22 | type: string 23 | 24 | permissions: 25 | contents: write 26 | 27 | jobs: 28 | build: 29 | runs-on: ubuntu-latest 30 | strategy: 31 | matrix: 32 | module: 33 | - arro3-core 34 | - arro3-compute 35 | - arro3-io 36 | steps: 37 | - uses: actions/checkout@v4 38 | - uses: actions/setup-python@v5 39 | with: 40 | python-version: ${{ inputs.python }} 41 | 42 | - name: Install Rust 43 | uses: dtolnay/rust-toolchain@nightly 44 | with: 45 | targets: wasm32-unknown-emscripten 46 | 47 | - uses: Swatinem/rust-cache@v2 48 | 49 | - name: Install Python build dependencies 50 | run: pip install maturin pyodide-build==${{ inputs.pyodide }} wheel-filename 51 | 52 | - name: Get emscripten version 53 | run: | 54 | echo PYODIDE_EMSCRIPTEN_VERSION=$(pyodide config get emscripten_version) >> $GITHUB_ENV 55 | 56 | - name: Install emsdk & build wheels 57 | run: | 58 | git clone https://github.com/emscripten-core/emsdk.git 59 | cd emsdk 60 | PYODIDE_EMSCRIPTEN_VERSION=$(pyodide config get emscripten_version) 61 | ./emsdk install ${PYODIDE_EMSCRIPTEN_VERSION} 62 | ./emsdk activate ${PYODIDE_EMSCRIPTEN_VERSION} 63 | source emsdk_env.sh 64 | cd ..
65 | RUSTUP_TOOLCHAIN=nightly maturin build --release -o dist --target wasm32-unknown-emscripten -i python${{ inputs.python }} --manifest-path ${{ matrix.module }}/Cargo.toml 66 | 67 | - name: Get info from built wheel file 68 | run: | 69 | # get arrow version and wheel name and make metafile 70 | ARRO3_WHEEL=$(basename dist/*.whl) 71 | ARRO3_VERSION=$(wheel-filename ${ARRO3_WHEEL} | jq -r '.version') 72 | ARROW_SHA256=$(sha256sum dist/*.whl | cut -d ' ' -f 1) 73 | echo ARRO3_WHEEL=${ARRO3_WHEEL}>>$GITHUB_ENV 74 | echo ARRO3_VERSION=${ARRO3_VERSION}>>$GITHUB_ENV 75 | 76 | - name: Upload wheels 77 | uses: actions/upload-artifact@v4 78 | with: 79 | name: wheels-pyodide-${{ matrix.module }} 80 | path: dist 81 | 82 | - name: Create release 83 | uses: ncipollo/release-action@v1 84 | with: 85 | tag: pyodide-v${{ inputs.pyodide }}-arro3-v${{ env.ARRO3_VERSION }} 86 | name: Build of arro3 for pyodide v${{ inputs.pyodide }} and arro3 v${{ env.ARRO3_VERSION }} 87 | artifacts: dist/* 88 | replacesArtifacts: true 89 | allowUpdates: true 90 | updateOnlyUnreleased: true 91 | prerelease: true 92 | -------------------------------------------------------------------------------- /arro3-io/src/json.rs: -------------------------------------------------------------------------------- 1 | use std::io::BufReader; 2 | 3 | use arrow_json::writer::{JsonArray, LineDelimited}; 4 | use arrow_json::{ReaderBuilder, WriterBuilder}; 5 | use pyo3::prelude::*; 6 | use pyo3_arrow::error::PyArrowResult; 7 | use pyo3_arrow::export::{Arro3RecordBatchReader, Arro3Schema}; 8 | use pyo3_arrow::input::AnyRecordBatch; 9 | use pyo3_arrow::{PyRecordBatchReader, PySchema}; 10 | 11 | use crate::utils::{FileReader, FileWriter}; 12 | 13 | /// Infer a JSON file's schema 14 | #[pyfunction] 15 | #[pyo3(signature = ( 16 | file, 17 | *, 18 | max_records=None, 19 | ))] 20 | pub fn infer_json_schema( 21 | file: FileReader, 22 | max_records: Option<usize>, 23 | ) -> PyArrowResult<Arro3Schema> { 24 | let buf_file = BufReader::new(file); 25 | let (schema, _records_read) = arrow_json::reader::infer_json_schema(buf_file, max_records)?; 26 | Ok(schema.into()) 27 | } 28 | 29 | /// Read a JSON file to an Arrow RecordBatchReader 30 | #[pyfunction] 31 | #[pyo3(signature = ( 32 | file, 33 | schema, 34 | *, 35 | batch_size=None, 36 | ))] 37 | pub fn read_json( 38 | file: FileReader, 39 | schema: PySchema, 40 | batch_size: Option<usize>, 41 | ) -> PyArrowResult<Arro3RecordBatchReader> { 42 | let mut builder = ReaderBuilder::new(schema.into()); 43 | 44 | if let Some(batch_size) = batch_size { 45 | builder = builder.with_batch_size(batch_size); 46 | } 47 | 48 | let buf_file = BufReader::new(file); 49 | let reader = builder.build(buf_file)?; 50 | Ok(PyRecordBatchReader::new(Box::new(reader)).into()) 51 | } 52 | 53 | /// Write an Arrow Table or stream to a JSON file 54 | #[pyfunction] 55 | #[pyo3(signature = ( 56 | data, 57 | file, 58 | *, 59 | explicit_nulls=None, 60 | ))] 61 | #[allow(clippy::too_many_arguments)] 62 | pub fn write_json( 63 | data: AnyRecordBatch, 64 | file: FileWriter, 65 | explicit_nulls: Option<bool>, 66 | ) -> PyArrowResult<()> { 67 | let mut builder = WriterBuilder::new(); 68 | 69 | if let Some(explicit_nulls) = explicit_nulls { 70 | builder = builder.with_explicit_nulls(explicit_nulls); 71 | } 72 | 73 | let mut writer = builder.build::<_, JsonArray>(file); 74 | for batch in data.into_reader()?
{ 75 | writer.write(&batch?)?; 76 | } 77 | writer.finish()?; Ok(()) 78 | } 79 | 80 | /// Write an Arrow Table or stream to a newline-delimited JSON file 81 | #[pyfunction] 82 | #[pyo3(signature = ( 83 | data, 84 | file, 85 | *, 86 | explicit_nulls=None, 87 | ))] 88 | #[allow(clippy::too_many_arguments)] 89 | pub fn write_ndjson( 90 | data: AnyRecordBatch, 91 | file: FileWriter, 92 | explicit_nulls: Option<bool>, 93 | ) -> PyArrowResult<()> { 94 | let mut builder = WriterBuilder::new(); 95 | 96 | if let Some(explicit_nulls) = explicit_nulls { 97 | builder = builder.with_explicit_nulls(explicit_nulls); 98 | } 99 | 100 | let mut writer = builder.build::<_, LineDelimited>(file); 101 | for batch in data.into_reader()? { 102 | writer.write(&batch?)?; 103 | } 104 | writer.finish()?; 105 | Ok(()) 106 | } 107 | -------------------------------------------------------------------------------- /arro3-compute/src/dictionary.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use arrow_array::builder::{GenericByteDictionaryBuilder, PrimitiveDictionaryBuilder}; 4 | use arrow_array::cast::AsArray; 5 | use arrow_array::downcast_primitive_array; 6 | use arrow_array::types::{ 7 | BinaryType, ByteArrayType, Int32Type, LargeBinaryType, LargeUtf8Type, Utf8Type, 8 | }; 9 | use arrow_array::{ArrayRef, ArrowPrimitiveType, GenericByteArray, PrimitiveArray}; 10 | use arrow_schema::{ArrowError, DataType, Field}; 11 | use pyo3::prelude::*; 12 | use pyo3_arrow::error::PyArrowResult; 13 | use pyo3_arrow::ffi::ArrayIterator; 14 | use pyo3_arrow::input::AnyArray; 15 | use pyo3_arrow::{PyArray, PyArrayReader}; 16 | 17 | // Note: for chunked array input, each output chunk will not necessarily have the same dictionary 18 | #[pyfunction] 19 | pub(crate) fn dictionary_encode(py: Python, array: AnyArray) -> PyArrowResult<PyObject> { 20 | match array { 21 | AnyArray::Array(array) => { 22 | let (array, _field) = array.into_inner(); 23 | let output_array = dictionary_encode_array(array)?; 24 | Ok(PyArray::from_array_ref(output_array).to_arro3(py)?.unbind()) 25 | } 26 | AnyArray::Stream(stream) => { 27 | let reader = stream.into_reader()?; 28 | 29 | let existing_field = reader.field(); 30 | let output_data_type = DataType::Dictionary( 31 | Box::new(DataType::Int32), 32 | Box::new(existing_field.data_type().clone()), 33 | ); 34 | let output_field = Field::new("", output_data_type, true); 35 | 36 | let iter = reader 37 | .into_iter() 38 | .map(move |array| dictionary_encode_array(array?)); 39 | Ok( 40 | PyArrayReader::new(Box::new(ArrayIterator::new(iter, output_field.into()))) 41 | .to_arro3(py)?
42 | .unbind(), 43 | ) 44 | } 45 | } 46 | } 47 | 48 | fn dictionary_encode_array(array: ArrayRef) -> Result<ArrayRef, ArrowError> { 49 | let array_ref = array.as_ref(); 50 | let array = downcast_primitive_array!( 51 | array_ref => { 52 | primitive_dictionary_encode(array_ref) 53 | } 54 | DataType::Utf8 => bytes_dictionary_encode(array.as_bytes::<Utf8Type>()), 55 | DataType::LargeUtf8 => bytes_dictionary_encode(array.as_bytes::<LargeUtf8Type>()), 56 | DataType::Binary => bytes_dictionary_encode(array.as_bytes::<BinaryType>()), 57 | DataType::LargeBinary => bytes_dictionary_encode(array.as_bytes::<LargeBinaryType>()), 58 | DataType::Dictionary(_, _) => array, 59 | d => return Err(ArrowError::ComputeError(format!("{d:?} not supported in dictionary_encode"))) 60 | ); 61 | Ok(array) 62 | } 63 | 64 | #[inline(never)] 65 | fn primitive_dictionary_encode<T: ArrowPrimitiveType>(array: &PrimitiveArray<T>) -> ArrayRef { 66 | let mut builder = PrimitiveDictionaryBuilder::<Int32Type, T>::new(); 67 | for value in array { 68 | builder.append_option(value); 69 | } 70 | Arc::new(builder.finish()) 71 | } 72 | 73 | #[inline(never)] 74 | fn bytes_dictionary_encode<T: ByteArrayType>(array: &GenericByteArray<T>) -> ArrayRef { 75 | let mut builder = GenericByteDictionaryBuilder::<Int32Type, T>::new(); 76 | for value in array { 77 | builder.append_option(value); 78 | } 79 | Arc::new(builder.finish()) 80 | } 81 | -------------------------------------------------------------------------------- /arro3-core/python/arro3/core/_scalar.pyi: -------------------------------------------------------------------------------- 1 | from typing import Any, overload 2 | 3 | from ._data_type import DataType 4 | from ._field import Field 5 | from .types import ArrayInput, ArrowArrayExportable, ArrowSchemaExportable 6 | 7 | class Scalar: 8 | """An arrow Scalar.""" 9 | @overload 10 | def __init__(self, obj: ArrayInput, /, type: None = None) -> None: ... 11 | @overload 12 | def __init__(self, obj: Any, /, type: ArrowSchemaExportable) -> None: ... 13 | def __init__( 14 | self, 15 | obj: ArrayInput | Any, 16 | /, 17 | type: ArrowSchemaExportable | None = None, 18 | ) -> None: 19 | """Create arro3.Scalar instance from a Python object. 20 | 21 | Args: 22 | obj: An input object. 23 | type: Explicit type to attempt to coerce to. You may pass in a `Field` to `type` in order to associate extension metadata with this array. 24 | """ 25 | def __arrow_c_array__( 26 | self, requested_schema: object | None = None 27 | ) -> tuple[object, object]: 28 | """ 29 | An implementation of the [Arrow PyCapsule 30 | Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). 31 | This dunder method should not be called directly, but enables zero-copy data 32 | transfer to other Python libraries that understand Arrow memory. 33 | 34 | For example, you can call [`pyarrow.array()`][pyarrow.array] to 35 | convert this Scalar into a pyarrow Array, without copying memory. The generated 36 | array is guaranteed to have length 1. 37 | """ 38 | def __eq__(self, other) -> bool: 39 | """Check for equality with other Python objects (`==`) 40 | 41 | If `other` is not an Arrow scalar, `self` will be converted to a Python object 42 | (with `as_py`), and then its `__eq__` method will be called. 43 | """ 44 | def __repr__(self) -> str: ... 45 | @classmethod 46 | def from_arrow(cls, input: ArrowArrayExportable) -> Scalar: 47 | """Construct this from an existing Arrow Scalar. 48 | 49 | It can be called on anything that exports the Arrow data interface (has a 50 | `__arrow_c_array__` method) and returns an array with a single element.
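        A minimal sketch (assumes `pyarrow` is installed; the one-element array is made up): `Scalar.from_arrow(pa.array([1]))`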
51 | 52 | Args: 53 | input: Arrow scalar to use for constructing this object 54 | 55 | Returns: 56 | new Scalar 57 | """ 58 | @classmethod 59 | def from_arrow_pycapsule(cls, schema_capsule, array_capsule) -> Scalar: 60 | """Construct this object from bare Arrow PyCapsules""" 61 | def as_py(self) -> Any: 62 | """Convert this scalar to a pure-Python object.""" 63 | def cast(self, target_type: ArrowSchemaExportable) -> Scalar: 64 | """Cast scalar to another data type 65 | 66 | Args: 67 | target_type: Type to cast to. 68 | """ 69 | 70 | @property 71 | def field(self) -> Field: 72 | """Access the field stored on this Scalar. 73 | 74 | Note that this field usually will not have a name associated, but it may have 75 | metadata that signifies that this scalar is an extension (user-defined typed) 76 | scalar. 77 | """ 78 | 79 | @property 80 | def is_valid(self) -> bool: 81 | """Return `True` if this scalar is not null.""" 82 | @property 83 | def type(self) -> DataType: 84 | """Access the type of this scalar.""" 85 | -------------------------------------------------------------------------------- /arro3-core/src/accessors/list_offsets.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use arrow_array::cast::AsArray; 4 | use arrow_array::{ArrayRef, Int32Array, Int64Array, OffsetSizeTrait}; 5 | use arrow_buffer::{OffsetBuffer, ScalarBuffer}; 6 | use arrow_schema::{ArrowError, DataType, Field}; 7 | use pyo3::prelude::*; 8 | use pyo3_arrow::error::PyArrowResult; 9 | use pyo3_arrow::ffi::ArrayIterator; 10 | use pyo3_arrow::input::AnyArray; 11 | use pyo3_arrow::{PyArray, PyArrayReader}; 12 | 13 | #[pyfunction] 14 | #[pyo3(signature = (input, *, logical=true))] 15 | pub fn list_offsets(py: Python, input: AnyArray, logical: bool) -> PyArrowResult<PyObject> { 16 | match input { 17 | AnyArray::Array(array) => { 18 | let (array, _field) = array.into_inner(); 19 | let offsets = _list_offsets(array, logical)?; 20 | Ok(PyArray::from_array_ref(offsets).to_arro3(py)?.unbind()) 21 | } 22 | AnyArray::Stream(stream) => { 23 | let reader = stream.into_reader()?; 24 | let out_field = match reader.field().data_type() { 25 | DataType::List(_) => Field::new("", DataType::Int32, false), 26 | DataType::LargeList(_) => Field::new("", DataType::Int64, false), 27 | _ => { 28 | return Err( 29 | ArrowError::SchemaError("Expected list-typed Array".to_string()).into(), 30 | ); 31 | } 32 | }; 33 | 34 | let iter = reader 35 | .into_iter() 36 | .map(move |array| _list_offsets(array?, logical)); 37 | Ok( 38 | PyArrayReader::new(Box::new(ArrayIterator::new(iter, out_field.into()))) 39 | .to_arro3(py)?
40 | .unbind(), 41 | ) 42 | } 43 | } 44 | } 45 | 46 | fn _list_offsets(array: ArrayRef, logical: bool) -> Result<ArrayRef, ArrowError> { 47 | let offset = array.offset(); 48 | let length = array.len(); 49 | 50 | match array.data_type() { 51 | DataType::List(_) => { 52 | let arr = array.as_list::<i32>(); 53 | let offsets = arr.offsets(); 54 | let offsets = if logical { 55 | slice_offsets(offsets, offset, length) 56 | } else { 57 | offsets.clone().into_inner() 58 | }; 59 | Ok(Arc::new(Int32Array::new(offsets, None))) 60 | } 61 | DataType::LargeList(_) => { 62 | let arr = array.as_list::<i64>(); 63 | let offsets = arr.offsets(); 64 | let offsets = if logical { 65 | slice_offsets(offsets, offset, length) 66 | } else { 67 | offsets.clone().into_inner() 68 | }; 69 | Ok(Arc::new(Int64Array::new(offsets, None))) 70 | } 71 | _ => Err(ArrowError::SchemaError( 72 | "Expected list-typed Array".to_string(), 73 | )), 74 | } 75 | } 76 | 77 | fn slice_offsets<O: OffsetSizeTrait>( 78 | offsets: &OffsetBuffer<O>, 79 | offset: usize, 80 | length: usize, 81 | ) -> ScalarBuffer<O> { 82 | let sliced = offsets.slice(offset, length); 83 | let first_offset = sliced.first().copied().unwrap_or(O::zero()); 84 | if first_offset.to_usize().unwrap() == 0 { 85 | sliced.into_inner() 86 | } else { 87 | let mut new_offsets = Vec::with_capacity(sliced.len()); 88 | for value in sliced.iter() { 89 | new_offsets.push(*value - first_offset); 90 | } 91 | ScalarBuffer::from(new_offsets) 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /arro3-io/python/arro3/io/_pyo3_object_store.pyi: -------------------------------------------------------------------------------- 1 | # TODO: move this to a standalone package/docs website that can be shared across 2 | # multiple python packages. 3 | 4 | from __future__ import annotations 5 | 6 | from datetime import timedelta 7 | from typing import Dict, TypedDict 8 | 9 | import boto3 10 | import botocore 11 | import botocore.session 12 | 13 | class BackoffConfig(TypedDict): 14 | init_backoff: timedelta 15 | max_backoff: timedelta 16 | base: int | float 17 | 18 | class RetryConfig(TypedDict): 19 | backoff: BackoffConfig 20 | max_retries: int 21 | retry_timeout: timedelta 22 | 23 | class AzureStore: 24 | @classmethod 25 | def from_env( 26 | cls, 27 | container: str, 28 | *, 29 | config: Dict[str, str] | None = None, 30 | client_options: Dict[str, str] | None = None, 31 | retry_config: RetryConfig | None = None, 32 | ) -> AzureStore: ... 33 | @classmethod 34 | def from_url( 35 | cls, 36 | url: str, 37 | *, 38 | config: Dict[str, str] | None = None, 39 | client_options: Dict[str, str] | None = None, 40 | retry_config: RetryConfig | None = None, 41 | ) -> AzureStore: ... 42 | 43 | class GCSStore: 44 | @classmethod 45 | def from_env( 46 | cls, 47 | bucket: str, 48 | *, 49 | config: Dict[str, str] | None = None, 50 | client_options: Dict[str, str] | None = None, 51 | retry_config: RetryConfig | None = None, 52 | ) -> GCSStore: ... 53 | @classmethod 54 | def from_url( 55 | cls, 56 | url: str, 57 | *, 58 | config: Dict[str, str] | None = None, 59 | client_options: Dict[str, str] | None = None, 60 | retry_config: RetryConfig | None = None, 61 | ) -> GCSStore: ... 62 | 63 | class HTTPStore: 64 | @classmethod 65 | def from_url( 66 | cls, 67 | url: str, 68 | *, 69 | client_options: Dict[str, str] | None = None, 70 | retry_config: RetryConfig | None = None, 71 | ) -> HTTPStore: ...
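# A minimal construction sketch (the bucket name and URL below are hypothetical;
# only the classmethods stubbed above are assumed):
#
#     store = GCSStore.from_env("my-bucket")
#     http_store = HTTPStore.from_url("https://example.com/data")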
72 | 73 | class S3Store: 74 | @classmethod 75 | def from_env( 76 | cls, 77 | bucket: str, 78 | *, 79 | config: Dict[str, str] | None = None, 80 | client_options: Dict[str, str] | None = None, 81 | retry_config: RetryConfig | None = None, 82 | ) -> S3Store: ... 83 | @classmethod 84 | def from_session( 85 | cls, 86 | session: boto3.Session | botocore.session.Session, 87 | bucket: str, 88 | *, 89 | config: Dict[str, str] | None = None, 90 | client_options: Dict[str, str] | None = None, 91 | retry_config: RetryConfig | None = None, 92 | ) -> S3Store: ... 93 | @classmethod 94 | def from_url( 95 | cls, 96 | url: str, 97 | *, 98 | config: Dict[str, str] | None = None, 99 | client_options: Dict[str, str] | None = None, 100 | retry_config: RetryConfig | None = None, 101 | ) -> S3Store: ... 102 | 103 | class LocalStore: 104 | """ 105 | Local filesystem storage providing an ObjectStore interface to files on local disk. 106 | Can optionally be created with a directory prefix. 107 | 108 | """ 109 | def __init__(self, prefix: str | None = None) -> None: ... 110 | 111 | class MemoryStore: 112 | """A fully in-memory implementation of ObjectStore.""" 113 | def __init__(self) -> None: ... 114 | 115 | ObjectStore = AzureStore | GCSStore | HTTPStore | S3Store | LocalStore | MemoryStore 116 | -------------------------------------------------------------------------------- /arro3-core/python/arro3/core/_record_batch_reader.pyi: -------------------------------------------------------------------------------- 1 | from typing import Sequence 2 | 3 | from ._record_batch import RecordBatch 4 | from ._schema import Schema 5 | from ._table import Table 6 | from .types import ArrowArrayExportable, ArrowSchemaExportable, ArrowStreamExportable 7 | 8 | class RecordBatchReader: 9 | """An Arrow RecordBatchReader. 10 | 11 | A RecordBatchReader holds a stream of [`RecordBatch`][arro3.core.RecordBatch]. 12 | """ 13 | def __arrow_c_schema__(self) -> object: 14 | """ 15 | An implementation of the [Arrow PyCapsule 16 | Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). 17 | This dunder method should not be called directly, but enables zero-copy data 18 | transfer to other Python libraries that understand Arrow memory. 19 | 20 | This allows Arrow consumers to inspect the data type of this RecordBatchReader. 21 | Then the consumer can ask the producer (in `__arrow_c_stream__`) to cast the 22 | exported data to a supported data type. 23 | """ 24 | def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: 25 | """ 26 | An implementation of the [Arrow PyCapsule 27 | Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). 28 | This dunder method should not be called directly, but enables zero-copy data 29 | transfer to other Python libraries that understand Arrow memory. 30 | 31 | For example, you can call 32 | [`pyarrow.RecordBatchReader.from_stream`][pyarrow.RecordBatchReader.from_stream] 33 | to convert this stream to a pyarrow `RecordBatchReader`. Alternatively, you can 34 | call [`pyarrow.table()`][pyarrow.table] to consume this stream to a pyarrow 35 | table or [`Table.from_arrow()`][arro3.core.Table] to consume this stream to an 36 | arro3 Table. 37 | """ 38 | def __iter__(self) -> RecordBatchReader: ... 39 | def __next__(self) -> RecordBatch: ... 40 | def __repr__(self) -> str: ... 
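    # A minimal usage sketch (assumes pyarrow is installed; the table contents are
    # made up, and `from_stream` is stubbed further down in this class):
    #
    #     import pyarrow as pa
    #     reader = RecordBatchReader.from_stream(pa.table({"a": [1, 2, 3]}))
    #     for batch in reader:
    #         ...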
41 | @classmethod 42 | def from_arrow( 43 | cls, input: ArrowArrayExportable | ArrowStreamExportable 44 | ) -> RecordBatchReader: 45 | """ 46 | Construct this from an existing Arrow object. 47 | 48 | It can be called on anything that exports the Arrow stream interface 49 | (has an `__arrow_c_stream__` method), such as a `Table` or `RecordBatchReader`. 50 | """ 51 | @classmethod 52 | def from_arrow_pycapsule(cls, capsule) -> RecordBatchReader: 53 | """Construct this object from a bare Arrow PyCapsule""" 54 | @classmethod 55 | def from_batches( 56 | cls, schema: ArrowSchemaExportable, batches: Sequence[ArrowArrayExportable] 57 | ) -> RecordBatchReader: 58 | """Construct a new RecordBatchReader from existing data. 59 | 60 | Args: 61 | schema: The schema of the Arrow batches. 62 | batches: The existing batches. 63 | """ 64 | @classmethod 65 | def from_stream(cls, data: ArrowStreamExportable) -> RecordBatchReader: 66 | """Import a RecordBatchReader from an object that exports an Arrow C Stream.""" 67 | @property 68 | def closed(self) -> bool: 69 | """Returns `true` if this reader has already been consumed.""" 70 | def read_all(self) -> Table: 71 | """Read all batches into a Table.""" 72 | def read_next_batch(self) -> RecordBatch: 73 | """Read the next batch in the stream.""" 74 | @property 75 | def schema(self) -> Schema: 76 | """Access the schema of this table.""" 77 | -------------------------------------------------------------------------------- /arro3-core/python/arro3/core/_array_reader.pyi: -------------------------------------------------------------------------------- 1 | from typing import Sequence 2 | 3 | from ._array import Array 4 | from ._chunked_array import ChunkedArray 5 | from ._field import Field 6 | from .types import ArrowArrayExportable, ArrowSchemaExportable, ArrowStreamExportable 7 | 8 | class ArrayReader: 9 | """A stream of Arrow `Array`s. 10 | 11 | This is similar to the [`RecordBatchReader`][arro3.core.RecordBatchReader] but each 12 | item yielded from the stream is an [`Array`][arro3.core.Array], not a 13 | [`RecordBatch`][arro3.core.RecordBatch]. 14 | """ 15 | def __arrow_c_schema__(self) -> object: 16 | """ 17 | An implementation of the [Arrow PyCapsule 18 | Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). 19 | This dunder method should not be called directly, but enables zero-copy data 20 | transfer to other Python libraries that understand Arrow memory. 21 | 22 | This allows Arrow consumers to inspect the data type of this ArrayReader. Then 23 | the consumer can ask the producer (in `__arrow_c_stream__`) to cast the exported 24 | data to a supported data type. 25 | """ 26 | def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: 27 | """ 28 | An implementation of the [Arrow PyCapsule 29 | Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). 30 | This dunder method should not be called directly, but enables zero-copy data 31 | transfer to other Python libraries that understand Arrow memory. 32 | 33 | For example, you can call [`pyarrow.chunked_array()`][pyarrow.chunked_array] to 34 | convert this ArrayReader to a pyarrow ChunkedArray, without copying memory. 35 | """ 36 | def __iter__(self) -> ArrayReader: ... 37 | def __next__(self) -> Array: ... 38 | def __repr__(self) -> str: ... 39 | @classmethod 40 | def from_arrow( 41 | cls, input: ArrowArrayExportable | ArrowStreamExportable 42 | ) -> ArrayReader: 43 | """Construct this from an existing Arrow object. 
44 | 45 | It can be called on anything that exports the Arrow stream interface 46 | (has an `__arrow_c_stream__` method), such as a `Table` or `ArrayReader`. 47 | """ 48 | @classmethod 49 | def from_arrow_pycapsule(cls, capsule) -> ArrayReader: 50 | """Construct this object from a bare Arrow PyCapsule""" 51 | @classmethod 52 | def from_arrays( 53 | cls, field: ArrowSchemaExportable, arrays: Sequence[ArrowArrayExportable] 54 | ) -> ArrayReader: 55 | """Construct an ArrayReader from existing data. 56 | 57 | Args: 58 | field: The Arrow field that describes the sequence of array data. 59 | arrays: A sequence (list or tuple) of Array data. 60 | """ 61 | @classmethod 62 | def from_stream(cls, data: ArrowStreamExportable) -> ArrayReader: 63 | """Construct this from an existing Arrow object. 64 | 65 | This is an alias of and has the same behavior as 66 | [`from_arrow`][arro3.core.ArrayReader.from_arrow], but is included for parity 67 | with [`pyarrow.RecordBatchReader`][pyarrow.RecordBatchReader]. 68 | """ 69 | @property 70 | def closed(self) -> bool: 71 | """Returns `true` if this reader has already been consumed.""" 72 | def read_all(self) -> ChunkedArray: 73 | """Read all batches from this stream into a ChunkedArray.""" 74 | def read_next_array(self) -> Array: 75 | """Read the next array from this stream.""" 76 | @property 77 | def field(self) -> Field: 78 | """Access the field of this reader.""" 79 | -------------------------------------------------------------------------------- /arro3-io/src/ipc.rs: -------------------------------------------------------------------------------- 1 | use std::io::{BufReader, BufWriter}; 2 | 3 | use arrow_ipc::reader::{FileReaderBuilder, StreamReader}; 4 | use arrow_ipc::writer::IpcWriteOptions; 5 | use pyo3::exceptions::PyValueError; 6 | use pyo3::prelude::*; 7 | use pyo3_arrow::error::PyArrowResult; 8 | use pyo3_arrow::export::Arro3RecordBatchReader; 9 | use pyo3_arrow::input::AnyRecordBatch; 10 | use pyo3_arrow::PyRecordBatchReader; 11 | 12 | use crate::utils::{FileReader, FileWriter}; 13 | 14 | /// Read an Arrow IPC file to an Arrow RecordBatchReader 15 | #[pyfunction] 16 | pub fn read_ipc(file: FileReader) -> PyArrowResult<Arro3RecordBatchReader> { 17 | let builder = FileReaderBuilder::new(); 18 | let buf_file = BufReader::new(file); 19 | let reader = builder.build(buf_file)?; 20 | Ok(PyRecordBatchReader::new(Box::new(reader)).into()) 21 | } 22 | 23 | /// Read an Arrow IPC Stream file to an Arrow RecordBatchReader 24 | #[pyfunction] 25 | pub fn read_ipc_stream(file: FileReader) -> PyArrowResult<Arro3RecordBatchReader> { 26 | let reader = StreamReader::try_new(file, None)?; 27 | Ok(PyRecordBatchReader::new(Box::new(reader)).into()) 28 | } 29 | 30 | #[allow(clippy::upper_case_acronyms)] 31 | pub enum IpcCompression { 32 | LZ4, 33 | ZSTD, 34 | } 35 | 36 | impl<'py> FromPyObject<'py> for IpcCompression { 37 | fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult<Self> { 38 | let s: String = ob.extract()?; 39 | match s.to_lowercase().as_str() { 40 | "lz4" | "lz4_frame" | "lz4frame" => Ok(Self::LZ4), 41 | "zstd" => Ok(Self::ZSTD), 42 | _ => Err(PyValueError::new_err( 43 | "Unexpected compression.
Should be one of 'LZ4', 'ZSTD'.", 44 | )), 45 | } 46 | } 47 | } 48 | 49 | impl From<IpcCompression> for arrow_ipc::CompressionType { 50 | fn from(value: IpcCompression) -> Self { 51 | match value { 52 | IpcCompression::LZ4 => Self::LZ4_FRAME, 53 | IpcCompression::ZSTD => Self::ZSTD, 54 | } 55 | } 56 | } 57 | 58 | /// Write an Arrow Table or stream to an IPC File 59 | #[pyfunction] 60 | #[pyo3( 61 | signature = (data, file, *, compression = IpcCompression::LZ4), 62 | text_signature = "(data, file, *, compression = 'LZ4')") 63 | ] 64 | pub fn write_ipc( 65 | data: AnyRecordBatch, 66 | file: FileWriter, 67 | compression: Option<IpcCompression>, 68 | ) -> PyArrowResult<()> { 69 | let buf_writer = BufWriter::new(file); 70 | let reader = data.into_reader()?; 71 | let options = IpcWriteOptions::default().try_with_compression(compression.map(|x| x.into()))?; 72 | let mut writer = 73 | arrow_ipc::writer::FileWriter::try_new_with_options(buf_writer, &reader.schema(), options)?; 74 | for batch in reader { 75 | writer.write(&batch?)?; 76 | } 77 | writer.finish()?; 78 | Ok(()) 79 | } 80 | 81 | /// Write an Arrow Table or stream to an IPC Stream 82 | #[pyfunction] 83 | #[pyo3( 84 | signature = (data, file, *, compression = IpcCompression::LZ4), 85 | text_signature = "(data, file, *, compression = 'LZ4')") 86 | ] 87 | pub fn write_ipc_stream( 88 | data: AnyRecordBatch, 89 | file: FileWriter, 90 | compression: Option<IpcCompression>, 91 | ) -> PyArrowResult<()> { 92 | let buf_writer = BufWriter::new(file); 93 | let reader = data.into_reader()?; 94 | let options = IpcWriteOptions::default().try_with_compression(compression.map(|x| x.into()))?; 95 | let mut writer = arrow_ipc::writer::StreamWriter::try_new_with_options( 96 | buf_writer, 97 | &reader.schema(), 98 | options, 99 | )?; 100 | for batch in reader { 101 | writer.write(&batch?)?; 102 | } 103 | writer.finish()?; 104 | Ok(()) 105 | } 106 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/from_python/ffi_stream.rs: -------------------------------------------------------------------------------- 1 | //! A custom implementation of ArrowArrayStreamReader to support ChunkedArrays: a stream of arrays 2 | //! of any data type that is not expected to represent record batches. 3 | //! 4 | //! This is derived from 5 | //! 6 | 7 | use std::ffi::CStr; 8 | use std::sync::Arc; 9 | 10 | use arrow_array::ffi::{from_ffi_and_data_type, FFI_ArrowArray, FFI_ArrowSchema}; 11 | use arrow_array::ffi_stream::FFI_ArrowArrayStream; 12 | use arrow_array::{make_array, Array}; 13 | use arrow_schema::{ArrowError, Field, FieldRef}; 14 | 15 | use crate::ffi::ArrayReader; 16 | 17 | #[derive(Debug)] 18 | pub struct ArrowArrayStreamReader { 19 | stream: FFI_ArrowArrayStream, 20 | field: FieldRef, 21 | } 22 | 23 | /// Gets schema from a raw pointer of `FFI_ArrowArrayStream`. This is used when constructing 24 | /// `ArrowArrayStreamReader` to cache schema. 25 | fn get_stream_schema(stream_ptr: *mut FFI_ArrowArrayStream) -> Result<FieldRef, ArrowError> { 26 | let mut schema = FFI_ArrowSchema::empty(); 27 | 28 | let ret_code = unsafe { (*stream_ptr).get_schema.unwrap()(stream_ptr, &mut schema) }; 29 | 30 | if ret_code == 0 { 31 | let field = Field::try_from(&schema)?; 32 | Ok(Arc::new(field)) 33 | } else { 34 | Err(ArrowError::CDataInterface(format!( 35 | "Cannot get schema from input stream. Error code: {ret_code:?}" 36 | ))) 37 | } 38 | } 39 | 40 | impl ArrowArrayStreamReader { 41 | /// Creates a new `ArrowArrayStreamReader` from a `FFI_ArrowArrayStream`.
42 | /// This is used to import from the C Stream Interface. 43 | #[allow(dead_code)] 44 | pub fn try_new(mut stream: FFI_ArrowArrayStream) -> Result<Self, ArrowError> { 45 | if stream.release.is_none() { 46 | return Err(ArrowError::CDataInterface( 47 | "input stream is already released".to_string(), 48 | )); 49 | } 50 | 51 | let field = get_stream_schema(&mut stream)?; 52 | 53 | Ok(Self { stream, field }) 54 | } 55 | 56 | pub fn field(&self) -> FieldRef { 57 | self.field.clone() 58 | } 59 | 60 | /// Get the last error from `ArrowArrayStreamReader` 61 | fn get_stream_last_error(&mut self) -> Option<String> { 62 | let get_last_error = self.stream.get_last_error?; 63 | 64 | let error_str = unsafe { get_last_error(&mut self.stream) }; 65 | if error_str.is_null() { 66 | return None; 67 | } 68 | 69 | let error_str = unsafe { CStr::from_ptr(error_str) }; 70 | Some(error_str.to_string_lossy().to_string()) 71 | } 72 | } 73 | 74 | impl Iterator for ArrowArrayStreamReader { 75 | type Item = Result<Arc<dyn Array>, ArrowError>; 76 | 77 | fn next(&mut self) -> Option<Self::Item> { 78 | let mut array = FFI_ArrowArray::empty(); 79 | 80 | let ret_code = unsafe { self.stream.get_next.unwrap()(&mut self.stream, &mut array) }; 81 | 82 | if ret_code == 0 { 83 | // The end of stream has been reached 84 | if array.is_released() { 85 | return None; 86 | } 87 | 88 | let result = unsafe { from_ffi_and_data_type(array, self.field().data_type().clone()) }; 89 | 90 | Some(result.map(make_array)) 91 | } else { 92 | let last_error = self.get_stream_last_error(); 93 | let err = ArrowError::CDataInterface(last_error.unwrap()); 94 | Some(Err(err)) 95 | } 96 | } 97 | } 98 | 99 | impl ArrayReader for ArrowArrayStreamReader { 100 | fn field(&self) -> FieldRef { 101 | self.field.clone() 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/to_python/utils.rs: -------------------------------------------------------------------------------- 1 | use std::ffi::CString; 2 | use std::sync::Arc; 3 | 4 | use arrow_array::ffi::{FFI_ArrowArray, FFI_ArrowSchema}; 5 | use arrow_array::Array; 6 | use arrow_cast::{can_cast_types, cast}; 7 | use arrow_schema::{ArrowError, Field, FieldRef}; 8 | use pyo3::prelude::*; 9 | use pyo3::types::{PyCapsule, PyTuple}; 10 | 11 | use crate::error::PyArrowResult; 12 | use crate::ffi::from_python::utils::import_schema_pycapsule; 13 | use crate::ffi::to_python::ffi_stream::new_stream; 14 | use crate::ffi::{ArrayIterator, ArrayReader}; 15 | 16 | /// Export a [`arrow_schema::Schema`], [`arrow_schema::Field`], or [`arrow_schema::DataType`] to a 17 | /// PyCapsule holding an Arrow C Schema pointer. 18 | pub fn to_schema_pycapsule( 19 | py: Python, 20 | field: impl TryInto<FFI_ArrowSchema, Error = ArrowError>, 21 | ) -> PyArrowResult<Bound<PyCapsule>> { 22 | let ffi_schema: FFI_ArrowSchema = field.try_into()?; 23 | let schema_capsule_name = CString::new("arrow_schema").unwrap(); 24 | let schema_capsule = PyCapsule::new(py, ffi_schema, Some(schema_capsule_name))?; 25 | Ok(schema_capsule) 26 | } 27 | 28 | /// Export an [`Array`] and [`FieldRef`] to a tuple of PyCapsules holding an Arrow C Schema and 29 | /// Arrow C Array pointers.
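///
/// A sketch of what a Python consumer sees for these two capsules (hypothetical caller
/// code; only the Arrow PyCapsule Interface is assumed):
///
/// ```python
/// schema_capsule, array_capsule = obj.__arrow_c_array__()
/// ```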
30 | pub fn to_array_pycapsules<'py>( 31 | py: Python<'py>, 32 | field: FieldRef, 33 | array: &dyn Array, 34 | requested_schema: Option<Bound<'py, PyCapsule>>, 35 | ) -> PyArrowResult<Bound<'py, PyTuple>> { 36 | // Cast array if requested 37 | let (array_data, field) = if let Some(capsule) = requested_schema { 38 | let schema_ptr = import_schema_pycapsule(&capsule)?; 39 | let output_field = 40 | Arc::new(Field::try_from(schema_ptr)?.with_metadata(field.metadata().clone())); 41 | 42 | // Only cast the array if we can cast the types. 43 | if can_cast_types(field.data_type(), output_field.data_type()) { 44 | let casted_array = cast(array, output_field.data_type())?; 45 | (casted_array.to_data(), output_field) 46 | } else { 47 | (array.to_data(), field) 48 | } 49 | } else { 50 | (array.to_data(), field) 51 | }; 52 | 53 | let ffi_schema = FFI_ArrowSchema::try_from(&field)?; 54 | let ffi_array = FFI_ArrowArray::new(&array_data); 55 | 56 | let schema_capsule_name = CString::new("arrow_schema").unwrap(); 57 | let array_capsule_name = CString::new("arrow_array").unwrap(); 58 | 59 | let schema_capsule = PyCapsule::new(py, ffi_schema, Some(schema_capsule_name))?; 60 | let array_capsule = PyCapsule::new(py, ffi_array, Some(array_capsule_name))?; 61 | let tuple = PyTuple::new(py, vec![schema_capsule, array_capsule])?; 62 | 63 | Ok(tuple) 64 | } 65 | 66 | /// Export an [`ArrayIterator`][crate::ffi::ArrayIterator] to a PyCapsule holding an Arrow C Stream 67 | /// pointer. 68 | pub fn to_stream_pycapsule<'py>( 69 | py: Python<'py>, 70 | mut array_reader: Box<dyn ArrayReader + Send>, 71 | requested_schema: Option<Bound<'py, PyCapsule>>, 72 | ) -> PyArrowResult<Bound<'py, PyCapsule>> { 73 | // Cast array if requested 74 | if let Some(capsule) = requested_schema { 75 | let schema_ptr = import_schema_pycapsule(&capsule)?; 76 | 77 | let existing_field = array_reader.field(); 78 | let output_field = 79 | Arc::new(Field::try_from(schema_ptr)?.with_metadata(existing_field.metadata().clone())); 80 | let iter_field = output_field.clone(); 81 | 82 | // Only cast the reader if we can cast the types. 83 | if can_cast_types(existing_field.data_type(), output_field.data_type()) { 84 | let array_iter = array_reader.map(move |array| { 85 | let out = cast(array?.as_ref(), output_field.data_type())?; 86 | Ok(out) 87 | }); 88 | array_reader = Box::new(ArrayIterator::new(array_iter, iter_field)); 89 | } 90 | } 91 | 92 | let ffi_stream = new_stream(array_reader); 93 | let stream_capsule_name = CString::new("arrow_array_stream").unwrap(); 94 | Ok(PyCapsule::new(py, ffi_stream, Some(stream_capsule_name))?)
95 | } 96 | -------------------------------------------------------------------------------- /arro3-core/src/accessors/dictionary.rs: -------------------------------------------------------------------------------- 1 | use arrow_array::cast::AsArray; 2 | use arrow_array::ArrayRef; 3 | use arrow_schema::{ArrowError, DataType, Field}; 4 | use pyo3::prelude::*; 5 | use pyo3_arrow::error::PyArrowResult; 6 | use pyo3_arrow::ffi::ArrayIterator; 7 | use pyo3_arrow::input::AnyArray; 8 | use pyo3_arrow::{PyArray, PyArrayReader}; 9 | 10 | #[pyfunction] 11 | pub(crate) fn dictionary_indices(py: Python, array: AnyArray) -> PyArrowResult<PyObject> { 12 | match array { 13 | AnyArray::Array(array) => { 14 | let (array, _field) = array.into_inner(); 15 | let output_array = _dictionary_indices(array)?; 16 | Ok(PyArray::from_array_ref(output_array).to_arro3(py)?.unbind()) 17 | } 18 | AnyArray::Stream(stream) => { 19 | let reader = stream.into_reader()?; 20 | let existing_field = reader.field(); 21 | let out_field = match existing_field.data_type() { 22 | DataType::Dictionary(key_type, _value_type) => { 23 | Field::new("", *key_type.clone(), true) 24 | } 25 | _ => { 26 | return Err(ArrowError::ComputeError( 27 | "Expected dictionary-typed Array".to_string(), 28 | ) 29 | .into()) 30 | } 31 | }; 32 | let iter = reader 33 | .into_iter() 34 | .map(move |array| _dictionary_indices(array?)); 35 | Ok( 36 | PyArrayReader::new(Box::new(ArrayIterator::new(iter, out_field.into()))) 37 | .to_arro3(py)? 38 | .unbind(), 39 | ) 40 | } 41 | } 42 | } 43 | 44 | /// Access the dictionary of the dictionary array 45 | /// 46 | /// This is equivalent to the `.dictionary` attribute on a PyArrow DictionaryArray. 47 | #[pyfunction] 48 | pub(crate) fn dictionary_dictionary(py: Python, array: AnyArray) -> PyArrowResult<PyObject> { 49 | match array { 50 | AnyArray::Array(array) => { 51 | let (array, _field) = array.into_inner(); 52 | let output_array = _dictionary_dictionary(array)?; 53 | Ok(PyArray::from_array_ref(output_array).to_arro3(py)?.unbind()) 54 | } 55 | AnyArray::Stream(stream) => { 56 | let reader = stream.into_reader()?; 57 | let existing_field = reader.field(); 58 | let out_field = match existing_field.data_type() { 59 | DataType::Dictionary(_key_type, value_type) => { 60 | Field::new("", *value_type.clone(), true) 61 | } 62 | _ => { 63 | return Err(ArrowError::ComputeError( 64 | "Expected dictionary-typed Array".to_string(), 65 | ) 66 | .into()) 67 | } 68 | }; 69 | let iter = reader 70 | .into_iter() 71 | .map(move |array| _dictionary_dictionary(array?)); 72 | Ok( 73 | PyArrayReader::new(Box::new(ArrayIterator::new(iter, out_field.into()))) 74 | .to_arro3(py)?
75 | .unbind(), 76 | ) 77 | } 78 | } 79 | } 80 | 81 | fn _dictionary_indices(array: ArrayRef) -> Result<ArrayRef, ArrowError> { 82 | match array.data_type() { 83 | DataType::Dictionary(_, _) => { 84 | let dict_arr = array.as_any_dictionary(); 85 | let keys_arr = dict_arr.keys(); 86 | let keys_arr_ref = keys_arr.slice(0, keys_arr.len()); 87 | Ok(keys_arr_ref) 88 | } 89 | _ => Err(ArrowError::ComputeError( 90 | "Expected dictionary-typed Array".to_string(), 91 | )), 92 | } 93 | } 94 | 95 | fn _dictionary_dictionary(array: ArrayRef) -> Result<ArrayRef, ArrowError> { 96 | match array.data_type() { 97 | DataType::Dictionary(_, _) => { 98 | let dict_arr = array.as_any_dictionary(); 99 | let values_arr = dict_arr.values().clone(); 100 | Ok(values_arr) 101 | } 102 | _ => Err(ArrowError::ComputeError( 103 | "Expected dictionary-typed Array".to_string(), 104 | )), 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /arro3-core/python/arro3/core/types.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import array as _array 4 | import mmap 5 | import sys 6 | from typing import TYPE_CHECKING, Protocol, Tuple, Union 7 | 8 | if sys.version_info >= (3, 12): 9 | from collections.abc import Buffer as _Buffer 10 | else: 11 | from typing_extensions import Buffer as _Buffer 12 | 13 | if TYPE_CHECKING: 14 | import numpy as np 15 | 16 | 17 | class ArrowSchemaExportable(Protocol): 18 | """ 19 | An object with an `__arrow_c_schema__` method implementing the [Arrow C Data 20 | Interface](https://arrow.apache.org/docs/format/CDataInterface.html) via the 21 | [Arrow PyCapsule 22 | Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). 23 | 24 | Such objects include: 25 | 26 | - arro3 [`Schema`][arro3.core.Schema], [`Field`][arro3.core.Field], or [`DataType`][arro3.core.DataType] objects. 27 | - pyarrow [`Schema`][pyarrow.Schema], [`Field`][pyarrow.Field], or [`DataType`][pyarrow.DataType] objects. 28 | 29 | This allows for zero-copy Arrow data interchange across libraries. 30 | """ 31 | 32 | def __arrow_c_schema__(self) -> object: ... 33 | 34 | 35 | class ArrowArrayExportable(Protocol): 36 | """ 37 | An object with an `__arrow_c_array__` method implementing the [Arrow C Data 38 | Interface](https://arrow.apache.org/docs/format/CDataInterface.html) via the 39 | [Arrow PyCapsule 40 | Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). 41 | 42 | Such objects include: 43 | 44 | - arro3 [`Array`][arro3.core.Array] or [`RecordBatch`][arro3.core.RecordBatch] objects. 45 | - pyarrow [`Array`][pyarrow.Array] or [`RecordBatch`][pyarrow.RecordBatch] objects. 46 | 47 | This allows for zero-copy Arrow data interchange across libraries. 48 | """ 49 | 50 | def __arrow_c_array__( 51 | self, requested_schema: object | None = None 52 | ) -> Tuple[object, object]: ... 53 | 54 | 55 | class ArrowStreamExportable(Protocol): 56 | """ 57 | An object with an `__arrow_c_stream__` method implementing the [Arrow C Stream 58 | interface](https://arrow.apache.org/docs/format/CStreamInterface.html) via the 59 | [Arrow PyCapsule 60 | Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). 61 | 62 | Supported objects include: 63 | 64 | - arro3 [`Table`][arro3.core.Table], [`RecordBatchReader`][arro3.core.RecordBatchReader], [`ChunkedArray`][arro3.core.ChunkedArray], or [`ArrayReader`][arro3.core.ArrayReader] objects.
65 | - Polars `Series` or `DataFrame` objects (polars v1.2 or higher) 66 | - DuckDB table-like objects, such as [`DuckDBPyRelation`][duckdb.DuckDBPyRelation] or [`DuckDBPyConnection`][duckdb.DuckDBPyConnection]. 67 | - pyarrow [`RecordBatchReader`][pyarrow.RecordBatchReader], [`Table`][pyarrow.Table], or [`ChunkedArray`][pyarrow.ChunkedArray] objects (pyarrow v14 or 68 | higher) 69 | - pandas [`DataFrame`][pandas.DataFrame]s (pandas v2.2 or higher) 70 | - ibis `Table` objects. 71 | 72 | This allows for zero-copy Arrow data interchange across libraries. 73 | 74 | For an up to date list of supported objects, see [this 75 | issue](https://github.com/apache/arrow/issues/39195#issuecomment-2245718008). 76 | """ 77 | 78 | def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: ... 79 | 80 | 81 | # From numpy 82 | # https://github.com/numpy/numpy/blob/961b70f6aaeed67147245b56ddb3f12ed1a050b5/numpy/__init__.pyi#L1772C1-L1785C1 83 | if sys.version_info >= (3, 12): 84 | from collections.abc import Buffer as _SupportsBuffer 85 | else: 86 | _SupportsBuffer = Union[ 87 | bytes, 88 | bytearray, 89 | memoryview, 90 | _array.array, 91 | mmap.mmap, 92 | "np.ndarray", 93 | _Buffer, 94 | ] 95 | 96 | 97 | # Numpy arrays don't yet declare `__buffer__` (or maybe just on a very recent version) 98 | ArrayInput = Union[ArrowArrayExportable, _SupportsBuffer] 99 | """Accepted input as an Arrow array. 100 | 101 | Buffer protocol input (such as numpy arrays) will be interpreted zero-copy except in the 102 | case of boolean-typed input, which must be copied to the Arrow format. 103 | """ 104 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.parquet 2 | *.whl 3 | *.arrow 4 | *.arrows 5 | 6 | # Generated by Cargo 7 | # will have compiled files and executables 8 | debug/ 9 | target/ 10 | 11 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 12 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 13 | # Cargo.lock 14 | 15 | # These are backup files generated by rustfmt 16 | **/*.rs.bk 17 | 18 | # MSVC Windows builds of rustc generate these, which store debugging information 19 | *.pdb 20 | 21 | # Byte-compiled / optimized / DLL files 22 | __pycache__/ 23 | *.py[cod] 24 | *$py.class 25 | 26 | # C extensions 27 | *.so 28 | 29 | # Distribution / packaging 30 | .Python 31 | build/ 32 | develop-eggs/ 33 | dist/ 34 | downloads/ 35 | eggs/ 36 | .eggs/ 37 | lib/ 38 | lib64/ 39 | parts/ 40 | sdist/ 41 | var/ 42 | wheels/ 43 | share/python-wheels/ 44 | *.egg-info/ 45 | .installed.cfg 46 | *.egg 47 | MANIFEST 48 | 49 | # PyInstaller 50 | # Usually these files are written by a python script from a template 51 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
52 | *.manifest 53 | *.spec 54 | 55 | # Installer logs 56 | pip-log.txt 57 | pip-delete-this-directory.txt 58 | 59 | # Unit test / coverage reports 60 | htmlcov/ 61 | .tox/ 62 | .nox/ 63 | .coverage 64 | .coverage.* 65 | .cache 66 | nosetests.xml 67 | coverage.xml 68 | *.cover 69 | *.py,cover 70 | .hypothesis/ 71 | .pytest_cache/ 72 | cover/ 73 | 74 | # Translations 75 | *.mo 76 | *.pot 77 | 78 | # Django stuff: 79 | *.log 80 | local_settings.py 81 | db.sqlite3 82 | db.sqlite3-journal 83 | 84 | # Flask stuff: 85 | instance/ 86 | .webassets-cache 87 | 88 | # Scrapy stuff: 89 | .scrapy 90 | 91 | # Sphinx documentation 92 | docs/_build/ 93 | 94 | # PyBuilder 95 | .pybuilder/ 96 | target/ 97 | 98 | # Jupyter Notebook 99 | .ipynb_checkpoints 100 | 101 | # IPython 102 | profile_default/ 103 | ipython_config.py 104 | 105 | # pyenv 106 | # For a library or package, you might want to ignore these files since the code is 107 | # intended to run in multiple environments; otherwise, check them in: 108 | # .python-version 109 | 110 | # pipenv 111 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 112 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 113 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 114 | # install all needed dependencies. 115 | #Pipfile.lock 116 | 117 | # poetry 118 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 119 | # This is especially recommended for binary packages to ensure reproducibility, and is more 120 | # commonly ignored for libraries. 121 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 122 | #poetry.lock 123 | 124 | # pdm 125 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 126 | #pdm.lock 127 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 128 | # in version control. 129 | # https://pdm.fming.dev/#use-with-ide 130 | .pdm.toml 131 | 132 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 133 | __pypackages__/ 134 | 135 | # Celery stuff 136 | celerybeat-schedule 137 | celerybeat.pid 138 | 139 | # SageMath parsed files 140 | *.sage.py 141 | 142 | # Environments 143 | .env 144 | .venv 145 | env/ 146 | venv/ 147 | ENV/ 148 | env.bak/ 149 | venv.bak/ 150 | 151 | # Spyder project settings 152 | .spyderproject 153 | .spyproject 154 | 155 | # Rope project settings 156 | .ropeproject 157 | 158 | # mkdocs documentation 159 | /site 160 | 161 | # mypy 162 | .mypy_cache/ 163 | .dmypy.json 164 | dmypy.json 165 | 166 | # Pyre type checker 167 | .pyre/ 168 | 169 | # pytype static type analyzer 170 | .pytype/ 171 | 172 | # Cython debug symbols 173 | cython_debug/ 174 | 175 | # PyCharm 176 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 177 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 178 | # and can be added to the global gitignore or merged into this file. For a more nuclear 179 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
180 | #.idea/ 181 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/from_python/utils.rs: -------------------------------------------------------------------------------- 1 | use arrow_array::ffi::{FFI_ArrowArray, FFI_ArrowSchema}; 2 | use arrow_array::ffi_stream::FFI_ArrowArrayStream; 3 | use arrow_array::{make_array, ArrayRef}; 4 | use arrow_schema::Field; 5 | use pyo3::exceptions::{PyTypeError, PyValueError}; 6 | use pyo3::prelude::*; 7 | use pyo3::types::{PyCapsule, PyTuple}; 8 | use pyo3::{intern, PyAny, PyResult}; 9 | 10 | /// Validate PyCapsule has provided name 11 | pub fn validate_pycapsule_name(capsule: &Bound<PyCapsule>, expected_name: &str) -> PyResult<()> { 12 | let capsule_name = capsule.name()?; 13 | if let Some(capsule_name) = capsule_name { 14 | let capsule_name = capsule_name.to_str()?; 15 | if capsule_name != expected_name { 16 | return Err(PyValueError::new_err(format!( 17 | "Expected name '{}' in PyCapsule, instead got '{}'", 18 | expected_name, capsule_name 19 | ))); 20 | } 21 | } else { 22 | return Err(PyValueError::new_err( 23 | "Expected schema PyCapsule to have name set.", 24 | )); 25 | } 26 | 27 | Ok(()) 28 | } 29 | 30 | /// Import `__arrow_c_schema__` across Python boundary 31 | pub(crate) fn call_arrow_c_schema<'py>(ob: &'py Bound<PyAny>) -> PyResult<Bound<'py, PyCapsule>> { 32 | let py_str = intern!(ob.py(), "__arrow_c_schema__"); 33 | if !ob.hasattr(py_str)? { 34 | return Err(PyValueError::new_err( 35 | "Expected an object with dunder __arrow_c_schema__", 36 | )); 37 | } 38 | 39 | Ok(ob.getattr(py_str)?.call0()?.downcast_into()?) 40 | } 41 | 42 | pub(crate) fn import_schema_pycapsule<'py>( 43 | capsule: &'py Bound<PyCapsule>, 44 | ) -> PyResult<&'py FFI_ArrowSchema> { 45 | validate_pycapsule_name(capsule, "arrow_schema")?; 46 | 47 | let schema_ptr = unsafe { capsule.reference::<FFI_ArrowSchema>() }; 48 | Ok(schema_ptr) 49 | } 50 | 51 | /// Import `__arrow_c_array__` across Python boundary 52 | pub(crate) fn call_arrow_c_array<'py>( 53 | ob: &'py Bound<PyAny>, 54 | ) -> PyResult<(Bound<'py, PyCapsule>, Bound<'py, PyCapsule>)> { 55 | let py_str = intern!(ob.py(), "__arrow_c_array__"); 56 | if !ob.hasattr(py_str)? { 57 | return Err(PyValueError::new_err( 58 | "Expected an object with dunder __arrow_c_array__", 59 | )); 60 | } 61 | 62 | let tuple = ob.getattr(py_str)?.call0()?; 63 | if !tuple.is_instance_of::<PyTuple>() { 64 | return Err(PyTypeError::new_err( 65 | "Expected __arrow_c_array__ to return a tuple.", 66 | )); 67 | } 68 | 69 | let schema_capsule = tuple.get_item(0)?.downcast_into()?; 70 | let array_capsule = tuple.get_item(1)?.downcast_into()?; 71 | Ok((schema_capsule, array_capsule)) 72 | } 73 | 74 | pub(crate) fn import_array_pycapsules( 75 | schema_capsule: &Bound<PyCapsule>, 76 | array_capsule: &Bound<PyCapsule>, 77 | ) -> PyResult<(ArrayRef, Field)> { 78 | validate_pycapsule_name(schema_capsule, "arrow_schema")?; 79 | validate_pycapsule_name(array_capsule, "arrow_array")?; 80 | 81 | let schema_ptr = unsafe { schema_capsule.reference::<FFI_ArrowSchema>() }; 82 | let array = unsafe { FFI_ArrowArray::from_raw(array_capsule.pointer() as _) }; 83 | 84 | let array_data = unsafe { arrow_array::ffi::from_ffi(array, schema_ptr) } 85 | .map_err(|err| PyTypeError::new_err(err.to_string()))?; 86 | let field = Field::try_from(schema_ptr).map_err(|err| PyTypeError::new_err(err.to_string()))?; 87 | let array = make_array(array_data); 88 | Ok((array, field)) 89 | } 90 | 91 | /// Import `__arrow_c_stream__` across Python boundary.
91 | /// Import `__arrow_c_stream__` across Python boundary.
92 | pub(crate) fn call_arrow_c_stream<'py>(ob: &'py Bound<'py, PyAny>) -> PyResult<Bound<'py, PyCapsule>> {
93 |     let py_str = intern!(ob.py(), "__arrow_c_stream__");
94 |     if !ob.hasattr(py_str)? {
95 |         return Err(PyValueError::new_err(
96 |             "Expected an object with dunder __arrow_c_stream__",
97 |         ));
98 |     }
99 |
100 |     let capsule = ob.getattr(py_str)?.call0()?.downcast_into()?;
101 |     Ok(capsule)
102 | }
103 |
104 | pub(crate) fn import_stream_pycapsule(
105 |     capsule: &Bound<PyCapsule>,
106 | ) -> PyResult<FFI_ArrowArrayStream> {
107 |     validate_pycapsule_name(capsule, "arrow_array_stream")?;
108 |
109 |     let stream = unsafe { FFI_ArrowArrayStream::from_raw(capsule.pointer() as _) };
110 |     Ok(stream)
111 | }
112 |
--------------------------------------------------------------------------------
/pyo3-arrow/src/interop/numpy/to_numpy.rs:
--------------------------------------------------------------------------------
1 | use arrow_array::cast::AsArray;
2 | use arrow_array::types::*;
3 | use arrow_array::{Array, BinaryArrayType, StringArrayType};
4 | use arrow_schema::DataType;
5 | use numpy::ToPyArray;
6 | use pyo3::exceptions::{PyNotImplementedError, PyValueError};
7 | use pyo3::prelude::*;
8 | use pyo3::types::{PyAnyMethods, PyBytes, PyDict, PyList, PyString, PyTuple};
9 | use pyo3::{intern, PyResult, Python};
10 |
11 | pub fn to_numpy<'py>(py: Python<'py>, arr: &'py dyn Array) -> PyResult<Bound<'py, PyAny>> {
12 |     if arr.null_count() > 0 {
13 |         return Err(PyValueError::new_err(
14 |             "Cannot create numpy array from arrow array with nulls.",
15 |         ));
16 |     }
17 |
18 |     macro_rules! impl_primitive {
19 |         ($arrow_type:ty) => {
20 |             arr.as_primitive::<$arrow_type>()
21 |                 .values()
22 |                 .to_pyarray(py)
23 |                 .into_any()
24 |         };
25 |     }
26 |
27 |     let result = match arr.data_type() {
28 |         DataType::Float16 => impl_primitive!(Float16Type),
29 |         DataType::Float32 => impl_primitive!(Float32Type),
30 |         DataType::Float64 => impl_primitive!(Float64Type),
31 |         DataType::UInt8 => impl_primitive!(UInt8Type),
32 |         DataType::UInt16 => impl_primitive!(UInt16Type),
33 |         DataType::UInt32 => impl_primitive!(UInt32Type),
34 |         DataType::UInt64 => impl_primitive!(UInt64Type),
35 |         DataType::Int8 => impl_primitive!(Int8Type),
36 |         DataType::Int16 => impl_primitive!(Int16Type),
37 |         DataType::Int32 => impl_primitive!(Int32Type),
38 |         DataType::Int64 => impl_primitive!(Int64Type),
39 |         DataType::Boolean => {
40 |             let bools = arr.as_boolean().values().iter().collect::<Vec<_>>();
41 |             bools.to_pyarray(py).into_any()
42 |         }
43 |         // For other data types we create Python objects and then create an object-typed numpy
44 |         // array
45 |         DataType::Binary => binary_to_numpy(py, arr.as_binary::<i32>())?,
46 |         DataType::LargeBinary => binary_to_numpy(py, arr.as_binary::<i64>())?,
47 |         DataType::BinaryView => binary_to_numpy(py, arr.as_binary_view())?,
48 |         DataType::Utf8 => string_to_numpy(py, arr.as_string::<i32>())?,
49 |         DataType::LargeUtf8 => string_to_numpy(py, arr.as_string::<i64>())?,
50 |         DataType::Utf8View => string_to_numpy(py, arr.as_string_view())?,
51 |         dt => {
52 |             return Err(PyNotImplementedError::new_err(format!(
53 |                 "Unsupported type in to_numpy {dt}"
54 |             )))
55 |         }
56 |     };
57 |     Ok(result)
58 | }
59 |
60 | fn binary_to_numpy<'a>(
61 |     py: Python<'a>,
62 |     arr: impl BinaryArrayType<'a>,
63 | ) -> PyResult<Bound<'a, PyAny>> {
64 |     let mut py_bytes = Vec::with_capacity(arr.len());
65 |     arr.iter()
66 |         .for_each(|x| py_bytes.push(PyBytes::new(py, x.unwrap())));
67 |     let py_list = PyList::new(py, py_bytes)?;
68 |     let numpy_mod = py.import(intern!(py, "numpy"))?;
69 |     let kwargs = PyDict::new(py);
70 |     kwargs.set_item("dtype", numpy_mod.getattr(intern!(py, "object_"))?)?;
71 |     numpy_mod.call_method(
72 |         intern!(py, "array"),
73 |         PyTuple::new(py, vec![py_list])?,
74 |         Some(&kwargs),
75 |     )
76 | }
77 |
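// Example of the Python-side behavior this module backs (illustrative sketch;
// `Array` is the arro3.core wrapper, and `to_numpy` copies values into the
// returned ndarray; the `DataType.string()` constructor is an assumption here):
//
//     arr = Array([1.0, 2.0, 3.0], DataType.float64())
//     arr.to_numpy()                                   # float64 ndarray
//     Array(["a", "b"], DataType.string()).to_numpy()  # object-dtype ndarray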
78 | fn string_to_numpy<'a>(
79 |     py: Python<'a>,
80 |     arr: impl StringArrayType<'a>,
81 | ) -> PyResult<Bound<'a, PyAny>> {
82 |     let mut py_bytes = Vec::with_capacity(arr.len());
83 |     arr.iter()
84 |         .for_each(|x| py_bytes.push(PyString::new(py, x.unwrap())));
85 |     let py_list = PyList::new(py, py_bytes)?;
86 |     let numpy_mod = py.import(intern!(py, "numpy"))?;
87 |     let kwargs = PyDict::new(py);
88 |     kwargs.set_item("dtype", numpy_mod.getattr(intern!(py, "object_"))?)?;
89 |     numpy_mod.call_method(
90 |         intern!(py, "array"),
91 |         PyTuple::new(py, vec![py_list])?,
92 |         Some(&kwargs),
93 |     )
94 | }
95 |
96 | pub fn chunked_to_numpy<'py>(
97 |     py: Python<'py>,
98 |     arrs: Vec<&'py dyn Array>,
99 | ) -> PyResult<Bound<'py, PyAny>> {
100 |     let py_arrays = arrs
101 |         .iter()
102 |         .map(|arr| to_numpy(py, *arr))
103 |         .collect::<PyResult<Vec<_>>>()?;
104 |
105 |     let numpy_mod = py.import(intern!(py, "numpy"))?;
106 |     numpy_mod.call_method1(intern!(py, "concatenate"), (py_arrays,))
107 | }
108 |
--------------------------------------------------------------------------------
/arro3-compute/src/temporal.rs:
--------------------------------------------------------------------------------
1 | use arrow_schema::{DataType, Field};
2 | use pyo3::exceptions::PyValueError;
3 | use pyo3::prelude::*;
4 | use pyo3_arrow::error::PyArrowResult;
5 | use pyo3_arrow::ffi::ArrayIterator;
6 | use pyo3_arrow::input::AnyArray;
7 | use pyo3_arrow::{PyArray, PyArrayReader};
8 |
9 | pub enum DatePart {
10 |     /// Quarter of the year, in range `1..=4`
11 |     Quarter,
12 |     /// Calendar year
13 |     Year,
14 |     /// Month in the year, in range `1..=12`
15 |     Month,
16 |     /// ISO week of the year, in range `1..=53`
17 |     Week,
18 |     /// Day of the month, in range `1..=31`
19 |     Day,
20 |     /// Day of the week, in range `0..=6`, where Sunday is `0`
21 |     DayOfWeekSunday0,
22 |     /// Day of the week, in range `0..=6`, where Monday is `0`
23 |     DayOfWeekMonday0,
24 |     /// Day of year, in range `1..=366`
25 |     DayOfYear,
26 |     /// Hour of the day, in range `0..=23`
27 |     Hour,
28 |     /// Minute of the hour, in range `0..=59`
29 |     Minute,
30 |     /// Second of the minute, in range `0..=59`
31 |     Second,
32 |     /// Millisecond of the second
33 |     Millisecond,
34 |     /// Microsecond of the second
35 |     Microsecond,
36 |     /// Nanosecond of the second
37 |     Nanosecond,
38 | }
39 |
40 | impl<'a> FromPyObject<'a> for DatePart {
41 |     fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult<Self> {
42 |         let s: String = ob.extract()?;
43 |         match s.to_lowercase().as_str() {
44 |             "quarter" => Ok(Self::Quarter),
45 |             "year" => Ok(Self::Year),
46 |             "month" => Ok(Self::Month),
47 |             "week" => Ok(Self::Week),
48 |             "day" => Ok(Self::Day),
49 |             "dayofweeksunday0" => Ok(Self::DayOfWeekSunday0),
50 |             "dayofweekmonday0" => Ok(Self::DayOfWeekMonday0),
51 |             "dayofyear" => Ok(Self::DayOfYear),
52 |             "hour" => Ok(Self::Hour),
53 |             "minute" => Ok(Self::Minute),
54 |             "second" => Ok(Self::Second),
55 |             "millisecond" => Ok(Self::Millisecond),
56 |             "microsecond" => Ok(Self::Microsecond),
57 |             "nanosecond" => Ok(Self::Nanosecond),
58 |             _ => Err(PyValueError::new_err("Unexpected date part")),
59 |         }
60 |     }
61 | }
62 |
63 | impl From<DatePart> for arrow_arith::temporal::DatePart {
64 |     fn from(value: DatePart) -> Self {
65 |         match value {
66 |             DatePart::Quarter => arrow_arith::temporal::DatePart::Quarter,
67 |             DatePart::Year => arrow_arith::temporal::DatePart::Year,
68 |             DatePart::Month => arrow_arith::temporal::DatePart::Month,
69 |             DatePart::Week => arrow_arith::temporal::DatePart::Week,
70 |             DatePart::Day => arrow_arith::temporal::DatePart::Day,
71 |             DatePart::DayOfWeekSunday0 => arrow_arith::temporal::DatePart::DayOfWeekSunday0,
72 |             DatePart::DayOfWeekMonday0 => arrow_arith::temporal::DatePart::DayOfWeekMonday0,
73 |             DatePart::DayOfYear => arrow_arith::temporal::DatePart::DayOfYear,
74 |             DatePart::Hour => arrow_arith::temporal::DatePart::Hour,
75 |             DatePart::Minute => arrow_arith::temporal::DatePart::Minute,
76 |             DatePart::Second => arrow_arith::temporal::DatePart::Second,
77 |             DatePart::Millisecond => arrow_arith::temporal::DatePart::Millisecond,
78 |             DatePart::Microsecond => arrow_arith::temporal::DatePart::Microsecond,
79 |             DatePart::Nanosecond => arrow_arith::temporal::DatePart::Nanosecond,
80 |         }
81 |     }
82 | }
83 |
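// Example (sketch) of the conversion chain defined above, going from a Python
// string argument to the arrow-rs enum:
//
//     let part: DatePart = ob.extract()?;             // "hour" -> DatePart::Hour
//     let arrow_part: arrow_arith::temporal::DatePart = part.into();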
84 | #[pyfunction]
85 | pub fn date_part(py: Python, input: AnyArray, part: DatePart) -> PyArrowResult<PyObject> {
86 |     match input {
87 |         AnyArray::Array(input) => {
88 |             let out = arrow_arith::temporal::date_part(input.as_ref(), part.into())?;
89 |             Ok(PyArray::from_array_ref(out).to_arro3(py)?.unbind())
90 |         }
91 |         AnyArray::Stream(stream) => {
92 |             let reader = stream.into_reader()?;
93 |             let output_field = Field::new("", DataType::Int32, true);
94 |             let part = part.into();
95 |
96 |             let iter = reader
97 |                 .into_iter()
98 |                 .map(move |array| arrow_arith::temporal::date_part(array?.as_ref(), part));
99 |             Ok(
100 |                 PyArrayReader::new(Box::new(ArrayIterator::new(iter, output_field.into())))
101 |                     .to_arro3(py)?
102 |                     .unbind(),
103 |             )
104 |         }
105 |     }
106 | }
107 |
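// Example usage from Python (illustrative sketch; assumes a timestamp-typed
// input array):
//
//     from arro3.compute import date_part
//     months = date_part(timestamp_array, "month")  # Int32 values in 1..=12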
--------------------------------------------------------------------------------
/tests/core/test_table.py:
--------------------------------------------------------------------------------
1 | import geoarrow.types as gt
2 | import numpy as np
3 | import pandas as pd
4 | import pyarrow as pa
5 | import pytest
6 | from arro3.core import Array, ChunkedArray, DataType, Field, Table
7 |
8 |
9 | def test_table_getitem():
10 |     a = pa.chunked_array([[1, 2, 3, 4]])
11 |     b = pa.chunked_array([["a", "b", "c", "d"]])
12 |     table = Table.from_pydict({"a": a, "b": b})
13 |
14 |     assert a == pa.chunked_array(table["a"])
15 |     assert b == pa.chunked_array(table["b"])
16 |     assert a == pa.chunked_array(table[0])
17 |     assert b == pa.chunked_array(table[1])
18 |
19 |     with pytest.raises(KeyError):
20 |         table["foo"]
21 |
22 |     with pytest.raises(IndexError):
23 |         table[10]
24 |
25 |
26 | def test_table_from_arrays():
27 |     a = pa.array([1, 2, 3, 4])
28 |     b = pa.array(["a", "b", "c", "d"])
29 |     arro3_table = Table.from_arrays([a, b], names=["a", "b"])
30 |     pa_table = pa.Table.from_arrays([a, b], names=["a", "b"])
31 |     assert pa.table(arro3_table) == pa_table
32 |
33 |
34 | def test_table_from_pydict():
35 |     mapping = {"a": pa.array([1, 2, 3, 4]), "b": pa.array(["a", "b", "c", "d"])}
36 |     arro3_table = Table.from_pydict(mapping)
37 |     pa_table = pa.Table.from_pydict(mapping)
38 |     assert pa.table(arro3_table) == pa_table
39 |
40 |
41 | def test_table_constructor_ext_array():
42 |     typ = DataType.uint8()
43 |     metadata = {"ARROW:extension:name": "ext_name"}
44 |     field = Field("", type=typ, nullable=True, metadata=metadata)
45 |     arr = Array([1, 2, 3, 4], field)
46 |     t = Table({"a": arr})
47 |     assert t.schema.field("a").metadata_str["ARROW:extension:name"] == "ext_name"
48 |
49 |     ca = ChunkedArray([arr], field)
50 |     t = Table({"a": ca})
51 |     assert t.schema.field("a").metadata_str["ARROW:extension:name"] == "ext_name"
52 |
53 |
54 | def test_table_append_array_extension_type():
55 |     """
56 |     Test that extension metadata gets propagated from an array to a column on a table.
57 |     """
58 |     # Construct a geoarrow point extension array
59 |     extension_type = gt.point(dimensions="xy", coord_type="interleaved").to_pyarrow()
60 |     coords = np.array([1, 2, 3, 4], dtype=np.float64)
61 |     ext_array = pa.FixedSizeListArray.from_arrays(coords, 2).cast(extension_type)
62 |
63 |     table = Table.from_arrays([pa.array(["a", "b"])], names=["a"])
64 |     geo_table = table.append_column("geometry", ChunkedArray([ext_array]))
65 |
66 |     meta = geo_table.schema["geometry"].metadata
67 |     assert b"ARROW:extension:name" in meta.keys()
68 |     assert meta[b"ARROW:extension:name"] == b"geoarrow.point"
69 |
70 |
71 | def test_table_from_batches_empty_columns_with_len():
72 |     df = pd.DataFrame({"a": [1, 2, 3]})
73 |     no_columns = df[[]]
74 |     pa_table = pa.Table.from_pandas(no_columns)
75 |     table = Table.from_batches(pa_table.to_batches())
76 |     assert table.num_columns == 0
77 |     assert table.num_rows == 3
78 |
79 |
80 | def test_rechunk():
81 |     a = pa.chunked_array([[1, 2, 3, 4]])
82 |     b = pa.chunked_array([["a", "b", "c", "d"]])
83 |     table = Table.from_pydict({"a": a, "b": b})
84 |
85 |     rechunked1 = table.rechunk(max_chunksize=1)
86 |     assert rechunked1.chunk_lengths == [1, 1, 1, 1]
87 |
88 |     rechunked2 = rechunked1.rechunk(max_chunksize=2)
89 |     assert rechunked2.chunk_lengths == [2, 2]
90 |     assert rechunked2.rechunk().chunk_lengths == [4]
91 |
92 |
93 | def test_slice():
94 |     a = pa.chunked_array([[1, 2], [3, 4]])
95 |     b = pa.chunked_array([["a", "b"], ["c", "d"]])
96 |     table = Table.from_pydict({"a": a, "b": b})
97 |
98 |     sliced1 = table.slice(0, 1)
99 |     assert sliced1.num_rows == 1
100 |     assert sliced1.chunk_lengths == [1]
101 |
102 |     sliced2 = table.slice(1, 2)
103 |     assert sliced2.num_rows == 2
104 |     assert sliced2.chunk_lengths == [1, 1]
105 |
106 |
107 | def test_nonempty_table_no_columns():
108 |     table = pa.table({"a": [1, 2, 3, 4]}).select([])
109 |     assert len(table) == 4
110 |     assert table.num_columns == 0
111 |     arro3_table = Table.from_arrow(table)
112 |     retour = pa.table(arro3_table)
113 |     assert table == retour
114 |
115 |
116 | class CustomException(Exception):
117 |     pass
118 |
119 |
120 | class ArrowCStreamFails:
121 |     def __arrow_c_stream__(self, requested_schema=None):
122 |         raise CustomException
123 |
124 |
125 | def test_table_import_preserve_exception():
126 |     """https://github.com/kylebarron/arro3/issues/325"""
127 |
128 |     c_stream_obj = ArrowCStreamFails()
129 |     with pytest.raises(CustomException):
130 |         Table.from_arrow(c_stream_obj)
131 |
132 |     with pytest.raises(CustomException):
133 |         Table(c_stream_obj)
134 |
--------------------------------------------------------------------------------
/arro3-io/python/arro3/io/_csv.pyi:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from typing import IO
3 |
4 | # Note: importing with
5 | # `from arro3.core import Array`
6 | # will cause Array to be included in the generated docs in this module.
7 | import arro3.core as core
8 | import arro3.core.types as types
9 |
10 | __all__ = ["infer_csv_schema", "read_csv", "write_csv"]
11 |
12 | def infer_csv_schema(
13 |     file: IO[bytes] | Path | str,
14 |     *,
15 |     has_header: bool | None = None,
16 |     max_records: int | None = None,
17 |     delimiter: str | None = None,
18 |     escape: str | None = None,
19 |     quote: str | None = None,
20 |     terminator: str | None = None,
21 |     comment: str | None = None,
22 | ) -> core.Schema:
23 |     """Infer a CSV file's schema.
24 |
25 |     If `max_records` is `None`, all records will be read; otherwise, up to
26 |     `max_records` records are read to infer the schema.
27 |
28 |     Args:
29 |         file: The input CSV path or buffer.
30 |         has_header: Set whether the CSV file has a header. Defaults to None.
31 |         max_records: The maximum number of records to read to infer schema. Defaults to
32 |             None.
33 |         delimiter: Set the CSV file's column delimiter as a byte character. Defaults to
34 |             None.
35 |         escape: Set the CSV escape character. Defaults to None.
36 |         quote: Set the CSV quote character. Defaults to None.
37 |         terminator: Set the line terminator. Defaults to None.
38 |         comment: Set the comment character. Defaults to None.
39 |
40 |     Returns:
41 |         The schema inferred from the data.
42 |     """
43 |
44 | def read_csv(
45 |     file: IO[bytes] | Path | str,
46 |     schema: types.ArrowSchemaExportable,
47 |     *,
48 |     has_header: bool | None = None,
49 |     batch_size: int | None = None,
50 |     delimiter: str | None = None,
51 |     escape: str | None = None,
52 |     quote: str | None = None,
53 |     terminator: str | None = None,
54 |     comment: str | None = None,
55 | ) -> core.RecordBatchReader:
56 |     """Read a CSV file to an Arrow RecordBatchReader.
57 |
58 |     Args:
59 |         file: The input CSV path or buffer.
60 |         schema: The Arrow schema for this CSV file. Use
61 |             [infer_csv_schema][arro3.io.infer_csv_schema] to infer an Arrow schema if
62 |             needed.
63 |         has_header: Set whether the CSV file has a header. Defaults to None.
64 |         batch_size: Set the batch size (number of records to load at one time).
65 |             Defaults to None.
66 |         delimiter: Set the CSV file's column delimiter as a byte character. Defaults to
67 |             None.
68 |         escape: Set the CSV escape character. Defaults to None.
69 |         quote: Set the CSV quote character. Defaults to None.
70 |         terminator: Set the line terminator. Defaults to None.
71 |         comment: Set the comment character. Defaults to None.
72 |
73 |     Returns:
74 |         A RecordBatchReader with the read CSV data.
75 |     """
76 |
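# Example usage (illustrative sketch; the file path is hypothetical):
#
#     schema = infer_csv_schema("data.csv", max_records=1000)
#     reader = read_csv("data.csv", schema, batch_size=65536)
#     table = core.Table.from_arrow(reader)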
77 | def write_csv(
78 |     data: types.ArrowStreamExportable | types.ArrowArrayExportable,
79 |     file: IO[bytes] | Path | str,
80 |     *,
81 |     header: bool | None = None,
82 |     delimiter: str | None = None,
83 |     escape: str | None = None,
84 |     quote: str | None = None,
85 |     date_format: str | None = None,
86 |     datetime_format: str | None = None,
87 |     time_format: str | None = None,
88 |     timestamp_format: str | None = None,
89 |     timestamp_tz_format: str | None = None,
90 |     null: str | None = None,
91 | ) -> None:
92 |     """Write an Arrow Table or stream to a CSV file.
93 |
94 |     Args:
95 |         data: The Arrow Table, RecordBatchReader, or RecordBatch to write.
96 |         file: The output buffer or file path for where to write the CSV.
97 |         header: Set whether to write the CSV file with a header. Defaults to None.
98 |         delimiter: Set the CSV file's column delimiter as a byte character. Defaults to
99 |             None.
100 |         escape: Set the CSV file's escape character as a byte character.
101 |
102 |             In some variants of CSV, quotes are escaped using a special escape character
103 |             like `\\` (instead of escaping quotes by doubling them).
104 |
105 |             By default, writing these idiosyncratic escapes is disabled, and is only
106 |             used when `double_quote` is disabled. Defaults to None.
107 |         quote: Set the CSV file's quote character as a byte character. Defaults to None.
108 |         date_format: Set the CSV file's date format. Defaults to None.
109 |         datetime_format: Set the CSV file's datetime format. Defaults to None.
110 |         time_format: Set the CSV file's time format. Defaults to None.
111 |         timestamp_format: Set the CSV file's timestamp format. Defaults to None.
112 |         timestamp_tz_format: Set the CSV file's timestamp tz format. Defaults to None.
113 |         null: Set the value to represent null in output. Defaults to None.
114 |     """
115 |
--------------------------------------------------------------------------------
/arro3-io/src/utils.rs:
--------------------------------------------------------------------------------
1 | use bytes::Bytes;
2 | use parquet::file::reader::{ChunkReader, Length};
3 | use pyo3_file::PyFileLikeObject;
4 |
5 | use pyo3::prelude::*;
6 | use std::fs::File;
7 | use std::io::{BufReader, Read, Seek, SeekFrom, Write};
8 | use std::path::PathBuf;
9 |
10 | /// Represents either a path `File` or a file-like object `FileLike`
11 | #[derive(Debug)]
12 | pub enum FileReader {
13 |     File(File),
14 |     FileLike(PyFileLikeObject),
15 | }
16 |
17 | impl FileReader {
18 |     fn try_clone(&self) -> std::io::Result<Self> {
19 |         match self {
20 |             Self::File(f) => Ok(Self::File(f.try_clone()?)),
21 |             Self::FileLike(f) => Ok(Self::FileLike(f.clone())),
22 |         }
23 |     }
24 | }
25 |
26 | impl<'py> FromPyObject<'py> for FileReader {
27 |     fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult<Self> {
28 |         if let Ok(path) = ob.extract::<PathBuf>() {
29 |             Ok(Self::File(File::open(path)?))
30 |         } else if let Ok(path) = ob.extract::<String>() {
31 |             Ok(Self::File(File::open(path)?))
32 |         } else {
33 |             Ok(Self::FileLike(PyFileLikeObject::py_with_requirements(
34 |                 ob.clone(),
35 |                 true,
36 |                 false,
37 |                 true,
38 |                 false,
39 |             )?))
40 |         }
41 |     }
42 | }
43 |
44 | impl Read for FileReader {
45 |     fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
46 |         match self {
47 |             Self::File(f) => f.read(buf),
48 |             Self::FileLike(f) => f.read(buf),
49 |         }
50 |     }
51 | }
52 |
53 | impl Seek for FileReader {
54 |     fn seek(&mut self, pos: std::io::SeekFrom) -> std::io::Result<u64> {
55 |         match self {
56 |             Self::File(f) => f.seek(pos),
57 |             Self::FileLike(f) => f.seek(pos),
58 |         }
59 |     }
60 | }
61 |
62 | impl Length for FileReader {
63 |     fn len(&self) -> u64 {
64 |         match self {
65 |             Self::File(f) => f.len(),
66 |             Self::FileLike(f) => {
67 |                 let mut file = f.clone();
68 |                 // Keep track of current pos
69 |                 let pos = file.stream_position().unwrap();
70 |
71 |                 // Seek to end of file
72 |                 file.seek(std::io::SeekFrom::End(0)).unwrap();
73 |                 let len = file.stream_position().unwrap();
74 |
75 |                 // Seek back
76 |                 file.seek(std::io::SeekFrom::Start(pos)).unwrap();
77 |                 len
78 |             }
79 |         }
80 |     }
81 | }
82 |
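// Example (illustrative sketch, not part of the original file): because
// `FileReader` implements `FromPyObject`, a #[pyfunction] can accept either a
// path or a Python file-like object directly; the function below is
// hypothetical.
//
//     #[pyfunction]
//     fn read_bytes(mut file: FileReader) -> PyResult<Vec<u8>> {
//         let mut buf = Vec::new();
//         file.read_to_end(&mut buf)?;
//         Ok(buf)
//     }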
83 | impl ChunkReader for FileReader {
84 |     type T = BufReader<FileReader>;
85 |
86 |     fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
87 |         let mut reader = self.try_clone()?;
88 |         reader.seek(SeekFrom::Start(start))?;
89 |         Ok(BufReader::new(self.try_clone()?))
90 |     }
91 |
92 |     fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
93 |         let mut buffer = Vec::with_capacity(length);
94 |         let mut reader = self.try_clone()?;
95 |         reader.seek(SeekFrom::Start(start))?;
96 |         let read = reader.take(length as _).read_to_end(&mut buffer)?;
97 |
98 |         if read != length {
99 |             return Err(parquet::errors::ParquetError::EOF(format!(
100 |                 "Expected to read {length} bytes, read only {read}"
101 |             )));
102 |         }
103 |         Ok(buffer.into())
104 |     }
105 | }
106 |
107 | /// Represents either a path `File` or a file-like object `FileLike`
108 | #[derive(Debug)]
109 | pub enum FileWriter {
110 |     File(File),
111 |     FileLike(PyFileLikeObject),
112 | }
113 |
114 | impl<'py> FromPyObject<'py> for FileWriter {
115 |     fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult<Self> {
116 |         if let Ok(path) = ob.extract::<PathBuf>() {
117 |             Ok(Self::File(File::create(path)?))
118 |         } else if let Ok(path) = ob.extract::<String>() {
119 |             Ok(Self::File(File::create(path)?))
120 |         } else {
121 |             Ok(Self::FileLike(PyFileLikeObject::py_with_requirements(
122 |                 ob.clone(),
123 |                 false,
124 |                 true,
125 |                 true,
126 |                 false,
127 |             )?))
128 |         }
129 |     }
130 | }
131 |
132 | impl Write for FileWriter {
133 |     fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
134 |         match self {
135 |             Self::File(f) => f.write(buf),
136 |             Self::FileLike(f) => f.write(buf),
137 |         }
138 |     }
139 |
140 |     fn flush(&mut self) -> std::io::Result<()> {
141 |         match self {
142 |             Self::File(f) => f.flush(),
143 |             Self::FileLike(f) => f.flush(),
144 |         }
145 |     }
146 | }
147 |
148 | impl Seek for FileWriter {
149 |     fn seek(&mut self, pos: std::io::SeekFrom) -> std::io::Result<u64> {
150 |         match self {
151 |             Self::File(f) => f.seek(pos),
152 |             Self::FileLike(f) => f.seek(pos),
153 |         }
154 |     }
155 | }
156 |
--------------------------------------------------------------------------------
/tests/core/test_constructors.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pyarrow as pa
3 | from arro3.core import (
4 |     Array,
5 |     DataType,
6 |     Field,
7 |     fixed_size_list_array,
8 |     list_array,
9 |     list_offsets,
10 |     struct_array,
11 | )
12 |
13 |
14 | def test_fixed_size_list_array():
15 |     np_arr = np.array([1, 2, 3, 4, 5, 6], dtype=np.float64)
16 |     flat_array = Array.from_numpy(np_arr)
17 |     array = fixed_size_list_array(flat_array, 2)
18 |     pa_array = pa.array(array)
19 |     assert pa.types.is_fixed_size_list(pa_array.type)
20 |     assert pa_array.type.list_size == 2
21 |
22 |
23 | def test_fixed_size_list_array_with_type():
24 |     np_arr = np.array([1, 2, 3, 4, 5, 6], dtype=np.float64)
25 |     flat_array = Array.from_numpy(np_arr)
26 |     list_type = DataType.list(Field("inner", DataType.float64()), 2)
27 |     array = fixed_size_list_array(flat_array, 2, type=list_type)
28 |     pa_array = pa.array(array)
29 |     assert pa.types.is_fixed_size_list(pa_array.type)
30 |     assert pa_array.type.list_size == 2
31 |     assert pa_array.type.field(0).name == "inner"
32 |
33 |
34 | def test_fixed_size_list_array_with_mask():
35 |     np_arr = np.array([1, 2, 3, 4, 5, 6], dtype=np.float64)
36 |     flat_array = Array.from_numpy(np_arr)
37 |
38 |     np_mask = np.array([True, False, True], dtype=bool)
39 |     mask = Array.from_numpy(np_mask)
40 |
41 |     arro3_array = fixed_size_list_array(flat_array, 2, mask=mask)
42 |
43 |     # Note that we don't exactly match the pyarrow array because we still allocate for
44 |     # null values.
45 | pa_arr = pa.array( 46 | [[1, 2], [3, 4], [5, 6]], 47 | type=pa.field(arro3_array.type).type, 48 | mask=np_mask, 49 | ) 50 | 51 | assert arro3_array[0].is_valid == pa_arr[0].is_valid 52 | assert arro3_array[1].is_valid == pa_arr[1].is_valid 53 | assert arro3_array[1] == Array(pa_arr)[1] 54 | assert arro3_array[2].is_valid == pa_arr[2].is_valid 55 | 56 | 57 | def test_list_array(): 58 | np_arr = np.array([1, 2, 3, 4, 5, 6], dtype=np.float64) 59 | flat_array = Array.from_numpy(np_arr) 60 | offsets_array = Array.from_numpy(np.array([0, 2, 5, 6], dtype=np.int32)) 61 | array = list_array(offsets_array, flat_array) 62 | pa_array = pa.array(array) 63 | assert pa.types.is_list(pa_array.type) 64 | assert list_offsets(array) == offsets_array 65 | 66 | 67 | def test_list_array_with_type(): 68 | np_arr = np.array([1, 2, 3, 4, 5, 6], dtype=np.float64) 69 | flat_array = Array.from_numpy(np_arr) 70 | offsets_array = Array.from_numpy(np.array([0, 2, 5, 6], dtype=np.int32)) 71 | 72 | list_type = DataType.list(Field("inner", DataType.float64())) 73 | array = list_array(offsets_array, flat_array, type=list_type) 74 | pa_array = pa.array(array) 75 | assert pa.types.is_list(pa_array.type) 76 | assert list_offsets(array) == offsets_array 77 | assert pa_array.type.field(0).name == "inner" 78 | 79 | 80 | def test_list_array_with_mask(): 81 | np_arr = np.array([1, 2, 3, 4, 5, 6], dtype=np.float64) 82 | flat_array = Array.from_numpy(np_arr) 83 | offsets_array = Array.from_numpy(np.array([0, 2, 5, 6], dtype=np.int32)) 84 | 85 | np_mask = np.array([True, False, True], dtype=bool) 86 | mask = Array.from_numpy(np_mask) 87 | 88 | arro3_array = list_array(offsets_array, flat_array, mask=mask) 89 | 90 | # Note that we don't exactly match the pyarrow array because we still allocate for 91 | # null values. 
92 | pa_arr = pa.array( 93 | [[1, 2], [3, 4, 5], [6]], type=pa.field(arro3_array.type).type, mask=np_mask 94 | ) 95 | 96 | assert arro3_array[0].is_valid == pa_arr[0].is_valid 97 | assert arro3_array[1].is_valid == pa_arr[1].is_valid 98 | assert arro3_array[1] == Array(pa_arr)[1] 99 | assert arro3_array[2].is_valid == pa_arr[2].is_valid 100 | 101 | 102 | def test_struct_array(): 103 | a = pa.array([1, 2, 3, 4]) 104 | b = pa.array(["a", "b", "c", "d"]) 105 | 106 | arr = struct_array([a, b], fields=[Field("a", a.type), Field("b", b.type)]) 107 | pa_type = pa.array(arr).type 108 | assert pa.types.is_struct(pa_type) 109 | assert pa_type.field(0).name == "a" 110 | assert pa_type.field(1).name == "b" 111 | 112 | 113 | def test_struct_array_with_mask(): 114 | a = pa.array([1, 2, 3, 4]) 115 | b = pa.array(["a", "b", "c", "d"]) 116 | 117 | np_mask = np.array([True, False, True, False], dtype=bool) 118 | mask = Array.from_numpy(np_mask) 119 | 120 | arro3_arr = struct_array( 121 | [a, b], 122 | fields=[Field("a", a.type), Field("b", b.type)], 123 | mask=mask, 124 | ) 125 | 126 | pa_arr = pa.array( 127 | [ 128 | {"a": 1, "b": "a"}, 129 | {"a": 2, "b": "b"}, 130 | {"a": 3, "b": "c"}, 131 | {"a": 4, "b": "d"}, 132 | ], 133 | type=pa.field(arro3_arr.type).type, 134 | mask=np_mask, 135 | ) 136 | 137 | for i in range(len(arro3_arr)): 138 | assert arro3_arr[i].is_valid == pa_arr[i].is_valid 139 | assert arro3_arr[i] == Array(pa_arr)[i] 140 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: arro3 2 | repo_name: kylebarron/arro3 3 | repo_url: https://github.com/kylebarron/arro3 4 | site_description: A minimal Python library for Apache Arrow, binding to the Rust Arrow implementation. 
5 | site_author: Kyle Barron 6 | # Note: trailing slash recommended with mike: 7 | # https://squidfunk.github.io/mkdocs-material/setup/setting-up-versioning/#publishing-a-new-version 8 | site_url: https://kylebarron.dev/arro3/ 9 | docs_dir: docs 10 | 11 | extra: 12 | social: 13 | - icon: "fontawesome/brands/github" 14 | link: "https://github.com/kylebarron" 15 | - icon: "fontawesome/brands/twitter" 16 | link: "https://twitter.com/kylebarron2" 17 | version: 18 | provider: mike 19 | 20 | nav: 21 | - "index.md" 22 | - API Reference: 23 | - arro3.core: 24 | - api/core/array.md 25 | - api/core/array-reader.md 26 | - api/core/chunked-array.md 27 | - api/core/datatype.md 28 | - api/core/field.md 29 | - api/core/record-batch.md 30 | - api/core/record-batch-reader.md 31 | - api/core/scalar.md 32 | - api/core/schema.md 33 | - api/core/table.md 34 | - api/core/constructors.md 35 | - api/core/accessors.md 36 | - api/core/types.md 37 | - api/compute.md 38 | - arro3.io: 39 | - api/io/arrow-ipc.md 40 | - api/io/csv.md 41 | - api/io/json.md 42 | - api/io/parquet.md 43 | 44 | watch: 45 | - arro3-compute/python 46 | - arro3-core/python 47 | - arro3-io/python 48 | - docs 49 | 50 | theme: 51 | name: material 52 | palette: 53 | # Palette toggle for automatic mode 54 | - media: "(prefers-color-scheme)" 55 | toggle: 56 | icon: material/brightness-auto 57 | name: Switch to light mode 58 | 59 | # Palette toggle for light mode 60 | - media: "(prefers-color-scheme: light)" 61 | primary: indigo 62 | accent: indigo 63 | toggle: 64 | icon: material/brightness-7 65 | name: Switch to dark mode 66 | 67 | # Palette toggle for dark mode 68 | - media: "(prefers-color-scheme: dark)" 69 | scheme: slate 70 | primary: indigo 71 | accent: indigo 72 | toggle: 73 | icon: material/brightness-4 74 | name: Switch to system preference 75 | 76 | font: 77 | text: Roboto 78 | code: Roboto Mono 79 | 80 | features: 81 | - content.code.annotate 82 | - content.code.copy 83 | - navigation.indexes 84 | - navigation.instant 85 | - navigation.tracking 86 | - search.suggest 87 | - search.share 88 | 89 | plugins: 90 | - search 91 | - social: 92 | enabled: !ENV [CI, false] 93 | - mike: 94 | alias_type: "copy" 95 | canonical_version: "latest" 96 | - mkdocstrings: 97 | enable_inventory: true 98 | handlers: 99 | python: 100 | paths: [arro3-compute/python, arro3-core/python, arro3-io/python] 101 | options: 102 | # We set allow_inspection: false to ensure that all docstrings come 103 | # from the pyi files, not the Rust-facing doc comments. 
104 |             allow_inspection: false
105 |             docstring_section_style: list
106 |             docstring_style: google
107 |             line_length: 80
108 |             separate_signature: true
109 |             show_root_heading: true
110 |             show_signature_annotations: true
111 |             show_source: false
112 |             show_symbol_type_toc: true
113 |             signature_crossrefs: true
114 |             extensions:
115 |               - griffe_inherited_docstrings
116 |
117 |           inventories:
118 |             - https://arrow.apache.org/docs/objects.inv
119 |             - https://docs.pola.rs/api/python/stable/objects.inv
120 |             - https://docs.python.org/3/objects.inv
121 |             - https://duckdb.org/docs/stable/clients/python/reference/objects.inv
122 |             - https://numpy.org/doc/stable/objects.inv
123 |             - https://pandas.pydata.org/pandas-docs/stable/objects.inv
124 |   - redirects:
125 |       redirect_maps:
126 |         "api/io.md": "api/io/parquet.md"
127 |
128 | # https://github.com/developmentseed/titiler/blob/50934c929cca2fa8d3c408d239015f8da429c6a8/docs/mkdocs.yml#L115-L140
129 | markdown_extensions:
130 |   - admonition
131 |   - attr_list
132 |   - codehilite:
133 |       guess_lang: false
134 |   - def_list
135 |   - footnotes
136 |   - md_in_html
137 |   - pymdownx.arithmatex
138 |   - pymdownx.betterem
139 |   - pymdownx.caret:
140 |       insert: false
141 |   - pymdownx.details
142 |   - pymdownx.emoji:
143 |       emoji_index: !!python/name:material.extensions.emoji.twemoji
144 |       emoji_generator: !!python/name:material.extensions.emoji.to_svg
145 |   - pymdownx.escapeall:
146 |       hardbreak: true
147 |       nbsp: true
148 |   - pymdownx.magiclink:
149 |       hide_protocol: true
150 |       repo_url_shortener: true
151 |   - pymdownx.smartsymbols
152 |   - pymdownx.superfences
153 |   - pymdownx.tasklist:
154 |       custom_checkbox: true
155 |   - pymdownx.tilde
156 |   - toc:
157 |       permalink: true
158 |
--------------------------------------------------------------------------------
/arro3-core/python/arro3/core/_schema.pyi:
--------------------------------------------------------------------------------
1 | from typing import Sequence
2 |
3 | from ._data_type import DataType
4 | from ._field import Field
5 | from ._table import Table
6 | from .types import ArrowSchemaExportable
7 |
8 | class Schema:
9 |     """An arrow Schema."""
10 |     def __init__(
11 |         self,
12 |         fields: Sequence[ArrowSchemaExportable],
13 |         *,
14 |         metadata: dict[str, str] | dict[bytes, bytes] | None = None,
15 |     ) -> None: ...
16 |     def __arrow_c_schema__(self) -> object:
17 |         """
18 |         An implementation of the [Arrow PyCapsule
19 |         Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
20 |         This dunder method should not be called directly, but enables zero-copy data
21 |         transfer to other Python libraries that understand Arrow memory.
22 |
23 |         For example, you can call [`pyarrow.schema()`][pyarrow.schema] to convert this
24 |         schema into a pyarrow schema, without copying memory.
25 |         """
26 |
27 |     def __eq__(self, other) -> bool: ...
28 |     def __getitem__(self, key: int | str) -> Field: ...
29 |     def __len__(self) -> int: ...
30 |     def __repr__(self) -> str: ...
31 |     @classmethod
32 |     def from_arrow(cls, input: ArrowSchemaExportable) -> Schema:
33 |         """Construct this from an existing Arrow object.
34 |
35 |         Args:
36 |             input: Arrow schema to use for constructing this object
37 |
38 |         Returns:
39 |             The new Schema.
40 |         """
41 |     @classmethod
42 |     def from_arrow_pycapsule(cls, capsule) -> Schema:
43 |         """Construct this object from a bare Arrow PyCapsule"""
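    # Example usage (illustrative sketch; assumes `Field` and `DataType` from
    # arro3.core, and that `int64`/`string` factory methods exist):
    #
    #     schema = Schema(
    #         [Field("a", DataType.int64()), Field("b", DataType.string())],
    #         metadata={"source": "example"},
    #     )
    #     schema.names  # ["a", "b"]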
44 |     def append(self, field: ArrowSchemaExportable) -> Schema:
45 |         """Append a field at the end of the schema.
46 |
47 |         In contrast to Python's `list.append()`, it returns a new object, leaving the
48 |         original Schema unmodified.
49 |
50 |         Args:
51 |             field: The new field to append.
52 |
53 |         Returns:
54 |             New Schema with the appended field.
55 |         """
56 |     def empty_table(self) -> Table:
57 |         """Provide an empty table according to the schema.
58 |
59 |         Returns:
60 |             An empty Table with this schema.
61 |         """
62 |
63 |     def equals(self, other: ArrowSchemaExportable) -> bool:
64 |         """Test if this schema is equal to the other.
65 |
66 |         Args:
67 |             other: The other schema to compare against.
68 |
69 |         Returns:
70 |             True if the two schemas are equal.
71 |         """
72 |
73 |     def field(self, i: int | str) -> Field:
74 |         """Select a field by its column name or numeric index.
75 |
76 |         Args:
77 |             i: The column name or numeric index of the field.
78 |
79 |         Returns:
80 |             The selected Field.
81 |         """
82 |     def get_all_field_indices(self, name: str) -> list[int]:
83 |         """Return sorted list of indices for the fields with the given name.
84 |
85 |         Args:
86 |             name: The field name to search for.
87 |
88 |         Returns:
89 |             Sorted list of matching field indices.
90 |         """
91 |     def get_field_index(self, name: str) -> int:
92 |         """Return index of the unique field with the given name.
93 |
94 |         Args:
95 |             name: The field name.
96 |
97 |         Returns:
98 |             The index of the field.
99 |         """
100 |     def insert(self, i: int, field: ArrowSchemaExportable) -> Schema:
101 |         """Add a field at position `i` to the schema.
102 |
103 |         Args:
104 |             i: The index at which to insert the field.
105 |             field: The field to insert.
106 |
107 |         Returns:
108 |             New Schema with the field inserted.
109 |         """
110 |     @property
111 |     def metadata(self) -> dict[bytes, bytes]:
112 |         """The schema's metadata.
113 |
114 |         Returns:
115 |             Metadata as a dict with `bytes` keys and values.
116 |         """
117 |
118 |     @property
119 |     def metadata_str(self) -> dict[str, str]:
120 |         """The schema's metadata where keys and values are `str`, not `bytes`.
121 |
122 |         Returns:
123 |             Metadata as a dict with `str` keys and values.
124 |         """
125 |     @property
126 |     def names(self) -> list[str]:
127 |         """The schema's field names."""
128 |
129 |     def remove(self, i: int) -> Schema:
130 |         """Remove the field at index i from the schema.
131 |
132 |         Args:
133 |             i: The index of the field to remove.
134 |
135 |         Returns:
136 |             New Schema without the field.
137 |         """
138 |     def remove_metadata(self) -> Schema:
139 |         """Create new schema without metadata, if any.
140 |
141 |
142 |         Returns:
143 |             New Schema without metadata.
144 |         """
145 |     def set(self, i: int, field: ArrowSchemaExportable) -> Schema:
146 |         """Replace a field at position `i` in the schema.
147 |
148 |         Args:
149 |             i: The index of the field to replace.
150 |             field: The new field.
151 |
152 |         Returns:
153 |             New Schema with the field replaced.
154 |         """
155 |     @property
156 |     def types(self) -> list[DataType]:
157 |         """The schema's field types.
158 |
159 |         Returns:
160 |             List of the schema's DataTypes.
161 |         """
162 |     def with_metadata(self, metadata: dict[str, str] | dict[bytes, bytes]) -> Schema:
163 |         """Add metadata as dict of string keys and values to Schema.
164 |
165 |         Args:
166 |             metadata: The metadata to attach.
167 |
168 |         Returns:
169 |             New Schema with the given metadata.
170 |         """
171 |
--------------------------------------------------------------------------------
/arro3-core/python/arro3/core/_array.pyi:
--------------------------------------------------------------------------------
1 | from typing import Any, Iterable, Sequence, overload
2 |
3 | import numpy as np
4 | from numpy.typing import NDArray
5 |
6 | from ._data_type import DataType
7 | from ._field import Field
8 | from ._scalar import Scalar
9 | from .types import (
10 |     ArrayInput,
11 |     ArrowArrayExportable,
12 |     ArrowSchemaExportable,
13 |     ArrowStreamExportable,
14 |     _SupportsBuffer,
15 | )
16 |
17 | class Array:
18 |     """An Arrow Array."""
19 |     @overload
20 |     def __init__(self, obj: ArrayInput, /, type: None = None) -> None: ...
21 | @overload 22 | def __init__(self, obj: Sequence[Any], /, type: ArrowSchemaExportable) -> None: ... 23 | def __init__( 24 | self, 25 | obj: ArrayInput | Sequence[Any], 26 | /, 27 | type: ArrowSchemaExportable | None = None, 28 | ) -> None: 29 | """Create arro3.Array instance from a sequence of Python objects. 30 | 31 | Args: 32 | obj: A sequence of input objects. 33 | type: Explicit type to attempt to coerce to. You may pass in a `Field` to `type` in order to associate extension metadata with this array. 34 | """ 35 | def __array__(self, dtype=None, copy=None) -> NDArray: 36 | """ 37 | An implementation of the Array interface, for interoperability with numpy and 38 | other array libraries. 39 | """ 40 | def __arrow_c_array__( 41 | self, requested_schema: object | None = None 42 | ) -> tuple[object, object]: 43 | """ 44 | An implementation of the [Arrow PyCapsule 45 | Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). 46 | This dunder method should not be called directly, but enables zero-copy data 47 | transfer to other Python libraries that understand Arrow memory. 48 | 49 | For example, you can call [`pyarrow.array()`][pyarrow.array] to convert this 50 | array into a pyarrow array, without copying memory. 51 | """ 52 | def __arrow_c_schema__(self) -> object: 53 | """ 54 | An implementation of the [Arrow PyCapsule 55 | Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). 56 | This dunder method should not be called directly, but enables zero-copy data 57 | transfer to other Python libraries that understand Arrow memory. 58 | 59 | This allows Arrow consumers to inspect the data type of this array. Then the 60 | consumer can ask the producer (in `__arrow_c_array__`) to cast the exported data 61 | to a supported data type. 62 | """ 63 | def __eq__(self, other) -> bool: ... 64 | def __getitem__(self, i: int) -> Scalar: ... 65 | # Note: we don't actually implement this, but it's inferred by having a __getitem__ 66 | # key 67 | def __iter__(self) -> Iterable[Scalar]: ... 68 | def __len__(self) -> int: ... 69 | def __repr__(self) -> str: ... 70 | @classmethod 71 | def from_arrow(cls, input: ArrowArrayExportable | ArrowStreamExportable) -> Array: 72 | """ 73 | Construct this object from an existing Arrow object. 74 | 75 | It can be called on anything that exports the Arrow data interface 76 | (`__arrow_c_array__`). 77 | 78 | Args: 79 | input: Arrow array to use for constructing this object 80 | 81 | Returns: 82 | Self 83 | """ 84 | 85 | @classmethod 86 | def from_arrow_pycapsule(cls, schema_capsule, array_capsule) -> Array: 87 | """Construct this object from bare Arrow PyCapsules""" 88 | 89 | # We allow Any here because not many types have updated to expose __buffer__ yet 90 | @classmethod 91 | def from_buffer(cls, buffer: _SupportsBuffer) -> Array: 92 | """Construct an Array from an object implementing the Python Buffer Protocol.""" 93 | 94 | @classmethod 95 | def from_numpy(cls, array: np.ndarray) -> Array: 96 | """Construct an Array from a numpy ndarray""" 97 | 98 | def cast(self, target_type: ArrowSchemaExportable) -> Array: 99 | """Cast array values to another data type 100 | 101 | Args: 102 | target_type: Type to cast array to. 103 | """ 104 | 105 | @property 106 | def field(self) -> Field: 107 | """Access the field stored on this Array. 
108 |
109 |         Note that this field usually will not have a name associated, but it may have
110 |         metadata that signifies that this array is an extension (user-defined typed)
111 |         array.
112 |         """
113 |     @property
114 |     def nbytes(self) -> int:
115 |         """The number of bytes in this Array."""
116 |     @property
117 |     def null_count(self) -> int:
118 |         """The number of null values in this Array."""
119 |     def slice(self, offset: int = 0, length: int | None = None) -> Array:
120 |         """Compute zero-copy slice of this array.
121 |
122 |         Args:
123 |             offset: The start index of the slice. Defaults to 0.
124 |             length: The length of the slice. If None, slices to the end of the
125 |                 array. Defaults to None.
126 |
127 |         Returns:
128 |             The sliced array
129 |         """
130 |     def take(self, indices: ArrayInput) -> Array:
131 |         """Take specific indices from this Array."""
132 |     def to_numpy(self) -> NDArray:
133 |         """Return a numpy copy of this array."""
134 |     def to_pylist(self) -> list:
135 |         """Convert to a list of native Python objects."""
136 |
137 |     @property
138 |     def type(self) -> DataType:
139 |         """The data type of this array."""
--------------------------------------------------------------------------------