├── tests ├── __init__.py ├── core │ ├── __init__.py │ ├── test_array_reader.py │ ├── test_record_batch_reader.py │ ├── test_list_flatten.py │ ├── test_misc.py │ ├── test_schema.py │ ├── test_record_batch.py │ ├── test_list_offsets.py │ ├── test_struct_field.py │ ├── test_data_type.py │ ├── test_chunked_array.py │ ├── test_buffer_protocol.py │ ├── test_ffi.py │ ├── test_table.py │ └── test_constructors.py ├── io │ ├── __init__.py │ ├── test_parquet.py │ └── test_ipc.py ├── compute │ ├── test_arith.py │ └── test_aggregate.py └── test_dictionary.py ├── .python-version ├── docs ├── index.md └── api │ ├── compute.md │ ├── core │ ├── array.md │ ├── field.md │ ├── scalar.md │ ├── schema.md │ ├── table.md │ ├── types.md │ ├── array-reader.md │ ├── chunked-array.md │ ├── record-batch.md │ ├── datatype.md │ ├── record-batch-reader.md │ ├── constructors.md │ └── accessors.md │ └── io │ ├── csv.md │ ├── parquet.md │ ├── json.md │ └── arrow-ipc.md ├── arro3-io ├── python │ └── arro3 │ │ └── io │ │ ├── py.typed │ │ ├── __init__.py │ │ ├── store.pyi │ │ ├── _io.pyi │ │ ├── _ipc.pyi │ │ ├── _json.pyi │ │ ├── _pyo3_object_store.pyi │ │ └── _csv.pyi ├── README.md ├── pyproject.toml ├── Cargo.toml └── src │ ├── error.rs │ ├── lib.rs │ ├── json.rs │ ├── ipc.rs │ └── utils.rs ├── arro3-core ├── python │ └── arro3 │ │ └── core │ │ ├── py.typed │ │ ├── __init__.py │ │ ├── _buffer.pyi │ │ ├── _field.pyi │ │ ├── _scalar.pyi │ │ ├── _record_batch_reader.pyi │ │ ├── _array_reader.pyi │ │ ├── types.py │ │ ├── _schema.pyi │ │ └── _array.pyi ├── src │ ├── accessors │ │ ├── mod.rs │ │ ├── struct_field.rs │ │ ├── list_flatten.rs │ │ ├── list_offsets.rs │ │ └── dictionary.rs │ └── lib.rs ├── README.md ├── pyproject.toml └── Cargo.toml ├── arro3-compute ├── python │ └── arro3 │ │ └── compute │ │ ├── py.typed │ │ ├── __init__.py │ │ ├── types.py │ │ ├── _aggregate.pyi │ │ ├── _filter.pyi │ │ ├── _dictionary.pyi │ │ ├── _compute.pyi │ │ ├── _arith.pyi │ │ ├── _boolean.pyi │ │ ├── _take.pyi │ │ ├── _cast.pyi │ │ ├── enums.py │ │ └── _temporal.pyi ├── README.md ├── src │ ├── concat.rs │ ├── take.rs │ ├── cast.rs │ ├── boolean.rs │ ├── filter.rs │ ├── arith.rs │ ├── lib.rs │ ├── dictionary.rs │ └── temporal.rs ├── pyproject.toml └── Cargo.toml ├── pyo3-arrow ├── src │ ├── interop │ │ ├── mod.rs │ │ └── numpy │ │ │ ├── mod.rs │ │ │ └── to_numpy.rs │ ├── ffi │ │ ├── to_python │ │ │ ├── mod.rs │ │ │ ├── nanoarrow.rs │ │ │ ├── chunked.rs │ │ │ └── utils.rs │ │ ├── from_python │ │ │ ├── mod.rs │ │ │ ├── field.rs │ │ │ ├── table.rs │ │ │ ├── datatypes.rs │ │ │ ├── schema.rs │ │ │ ├── chunked.rs │ │ │ ├── array_reader.rs │ │ │ ├── scalar.rs │ │ │ ├── record_batch_reader.rs │ │ │ ├── record_batch.rs │ │ │ ├── array.rs │ │ │ ├── input.rs │ │ │ ├── ffi_stream.rs │ │ │ └── utils.rs │ │ └── mod.rs │ ├── lib.rs │ ├── utils.rs │ └── error.rs └── Cargo.toml ├── .github └── workflows │ ├── conventional-commits.yml │ ├── ci.yml │ ├── test-python.yml │ ├── docs.yml │ └── pyodide-wheels.yml ├── .pre-commit-config.yaml ├── pyproject.toml ├── LICENSE_MIT ├── DEVELOP.md ├── Cargo.toml ├── .gitignore └── mkdocs.yml /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/io/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.11 2 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | ../README.md -------------------------------------------------------------------------------- /arro3-io/python/arro3/io/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /arro3-core/python/arro3/core/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /arro3-compute/python/arro3/compute/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pyo3-arrow/src/interop/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod numpy; 2 | -------------------------------------------------------------------------------- /docs/api/compute.md: -------------------------------------------------------------------------------- 1 | # arro3.compute 2 | 3 | ::: arro3.compute 4 | -------------------------------------------------------------------------------- /pyo3-arrow/src/interop/numpy/mod.rs: -------------------------------------------------------------------------------- 1 | pub(crate) mod from_numpy; 2 | pub(crate) mod to_numpy; 3 | -------------------------------------------------------------------------------- /docs/api/core/array.md: -------------------------------------------------------------------------------- 1 | # Array 2 | 3 | ::: arro3.core.Array 4 | options: 5 | members: 6 | -------------------------------------------------------------------------------- /docs/api/core/field.md: -------------------------------------------------------------------------------- 1 | # Field 2 | 3 | ::: arro3.core.Field 4 | options: 5 | members: 6 | -------------------------------------------------------------------------------- /docs/api/core/scalar.md: -------------------------------------------------------------------------------- 1 | # Scalar 2 | 3 | ::: arro3.core.Scalar 4 | options: 5 | members: 6 | -------------------------------------------------------------------------------- /docs/api/core/schema.md: -------------------------------------------------------------------------------- 1 | # Schema 2 | 3 | ::: arro3.core.Schema 4 | options: 5 | members: 6 | -------------------------------------------------------------------------------- /docs/api/core/table.md: -------------------------------------------------------------------------------- 1 | # Table 2 | 3 | ::: arro3.core.Table 4 | options: 5 | members: 6 | -------------------------------------------------------------------------------- /docs/api/core/types.md: -------------------------------------------------------------------------------- 1 | # types 2 | 3 | ::: arro3.core.types 4 | options: 5 | show_if_no_docstring: true 6 | -------------------------------------------------------------------------------- /docs/api/core/array-reader.md: 
-------------------------------------------------------------------------------- 1 | # ArrayReader 2 | 3 | ::: arro3.core.ArrayReader 4 | options: 5 | members: 6 | -------------------------------------------------------------------------------- /docs/api/core/chunked-array.md: -------------------------------------------------------------------------------- 1 | # ChunkedArray 2 | 3 | ::: arro3.core.ChunkedArray 4 | options: 5 | members: 6 | -------------------------------------------------------------------------------- /docs/api/core/record-batch.md: -------------------------------------------------------------------------------- 1 | # RecordBatch 2 | 3 | ::: arro3.core.RecordBatch 4 | options: 5 | members: 6 | -------------------------------------------------------------------------------- /docs/api/io/csv.md: -------------------------------------------------------------------------------- 1 | # CSV 2 | 3 | ::: arro3.io.infer_csv_schema 4 | ::: arro3.io.read_csv 5 | ::: arro3.io.write_csv 6 | -------------------------------------------------------------------------------- /docs/api/core/datatype.md: -------------------------------------------------------------------------------- 1 | # DataType 2 | 3 | ::: arro3.core.DataType 4 | options: 5 | show_if_no_docstring: true 6 | -------------------------------------------------------------------------------- /docs/api/core/record-batch-reader.md: -------------------------------------------------------------------------------- 1 | # RecordBatchReader 2 | 3 | ::: arro3.core.RecordBatchReader 4 | options: 5 | members: 6 | -------------------------------------------------------------------------------- /docs/api/io/parquet.md: -------------------------------------------------------------------------------- 1 | # Parquet 2 | 3 | ::: arro3.io.read_parquet 4 | ::: arro3.io.read_parquet_async 5 | ::: arro3.io.write_parquet 6 | -------------------------------------------------------------------------------- /arro3-core/python/arro3/core/__init__.py: -------------------------------------------------------------------------------- 1 | from ._core import * 2 | from ._core import ___version 3 | 4 | __version__: str = ___version() 5 | -------------------------------------------------------------------------------- /arro3-io/python/arro3/io/__init__.py: -------------------------------------------------------------------------------- 1 | from ._io import * 2 | from ._io import ___version, store 3 | 4 | __version__: str = ___version() 5 | -------------------------------------------------------------------------------- /docs/api/io/json.md: -------------------------------------------------------------------------------- 1 | # JSON 2 | 3 | ::: arro3.io.infer_json_schema 4 | ::: arro3.io.read_json 5 | ::: arro3.io.write_json 6 | ::: arro3.io.write_ndjson 7 | -------------------------------------------------------------------------------- /arro3-core/src/accessors/mod.rs: -------------------------------------------------------------------------------- 1 | pub(crate) mod dictionary; 2 | pub(crate) mod list_flatten; 3 | pub(crate) mod list_offsets; 4 | pub(crate) mod struct_field; 5 | -------------------------------------------------------------------------------- /docs/api/io/arrow-ipc.md: -------------------------------------------------------------------------------- 1 | # Arrow IPC 2 | 3 | ::: arro3.io.read_ipc 4 | ::: arro3.io.read_ipc_stream 5 | ::: arro3.io.write_ipc 6 | ::: arro3.io.write_ipc_stream 7 | 
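A minimal round-trip sketch, not authoritative: it assumes `write_ipc` and `read_ipc` take `(data, path)` arguments and that the returned reader can be consumed by pyarrow, mirroring how `write_parquet`/`read_parquet` are exercised in this repo's tests. Check the signatures rendered above for the actual API.

```py
import pyarrow as pa
from arro3.io import read_ipc, write_ipc

table = pa.table({"a": [1, 2, 3, 4]})
write_ipc(table, "example.arrow")  # assumed (data, path) argument order
round_tripped = pa.table(read_ipc("example.arrow"))
assert table == round_tripped
```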
-------------------------------------------------------------------------------- /arro3-compute/python/arro3/compute/__init__.py: -------------------------------------------------------------------------------- 1 | from . import enums, types 2 | from ._compute import * 3 | from ._compute import ___version 4 | 5 | __version__: str = ___version() 6 | -------------------------------------------------------------------------------- /docs/api/core/constructors.md: -------------------------------------------------------------------------------- 1 | # Constructors 2 | 3 | ::: arro3.core 4 | options: 5 | members: 6 | - fixed_size_list_array 7 | - list_array 8 | - struct_array 9 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/to_python/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod chunked; 2 | pub mod ffi_stream; 3 | pub mod nanoarrow; 4 | mod utils; 5 | 6 | pub use utils::{to_array_pycapsules, to_schema_pycapsule, to_stream_pycapsule}; 7 | -------------------------------------------------------------------------------- /docs/api/core/accessors.md: -------------------------------------------------------------------------------- 1 | # Accessors 2 | 3 | ::: arro3.core 4 | options: 5 | members: 6 | - dictionary_dictionary 7 | - dictionary_indices 8 | - list_flatten 9 | - list_offsets 10 | - struct_field 11 | -------------------------------------------------------------------------------- /arro3-io/README.md: -------------------------------------------------------------------------------- 1 | # arro3-io 2 | 3 | A minimal Python library for [Apache Arrow](https://arrow.apache.org/docs/index.html), binding to the [Rust Arrow implementation](https://github.com/apache/arrow-rs). 4 | 5 | Consult the [documentation](https://kylebarron.dev/arro3/latest/). 6 | -------------------------------------------------------------------------------- /arro3-core/README.md: -------------------------------------------------------------------------------- 1 | # arro3-core 2 | 3 | A minimal Python library for [Apache Arrow](https://arrow.apache.org/docs/index.html), binding to the [Rust Arrow implementation](https://github.com/apache/arrow-rs). 4 | 5 | Consult the [documentation](https://kylebarron.dev/arro3/latest/). 6 | -------------------------------------------------------------------------------- /arro3-compute/README.md: -------------------------------------------------------------------------------- 1 | # arro3-compute 2 | 3 | A minimal Python library for [Apache Arrow](https://arrow.apache.org/docs/index.html), binding to the [Rust Arrow implementation](https://github.com/apache/arrow-rs). 4 | 5 | Consult the [documentation](https://kylebarron.dev/arro3/latest/). 6 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/from_python/mod.rs: -------------------------------------------------------------------------------- 1 | mod array; 2 | mod array_reader; 3 | mod chunked; 4 | mod datatypes; 5 | pub(crate) mod ffi_stream; 6 | mod field; 7 | mod input; 8 | mod record_batch; 9 | mod record_batch_reader; 10 | mod scalar; 11 | mod schema; 12 | mod table; 13 | pub(crate) mod utils; 14 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/mod.rs: -------------------------------------------------------------------------------- 1 | //! Utilities for managing Arrow FFI between Python and Rust. 
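//!
//! Data flows in two directions: the `from_python` submodule imports Arrow data from any
//! Python object implementing the Arrow PyCapsule Interface (`__arrow_c_schema__`,
//! `__arrow_c_array__`, `__arrow_c_stream__`), while the `to_python` submodule exports
//! Rust Arrow data back to Python as those same capsules (including `nanoarrow` wrappers).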
2 | 3 | pub(crate) mod from_python; 4 | pub(crate) mod to_python; 5 | 6 | pub use to_python::chunked::{ArrayIterator, ArrayReader}; 7 | pub use to_python::{to_array_pycapsules, to_schema_pycapsule, to_stream_pycapsule}; 8 | -------------------------------------------------------------------------------- /arro3-io/python/arro3/io/store.pyi: -------------------------------------------------------------------------------- 1 | # TODO: move to reusable types package 2 | from ._pyo3_object_store import AzureStore as AzureStore 3 | from ._pyo3_object_store import GCSStore as GCSStore 4 | from ._pyo3_object_store import HTTPStore as HTTPStore 5 | from ._pyo3_object_store import LocalStore as LocalStore 6 | from ._pyo3_object_store import MemoryStore as MemoryStore 7 | from ._pyo3_object_store import S3Store as S3Store 8 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/from_python/field.rs: -------------------------------------------------------------------------------- 1 | use crate::ffi::from_python::utils::call_arrow_c_schema; 2 | use crate::field::PyField; 3 | use pyo3::prelude::*; 4 | use pyo3::{PyAny, PyResult}; 5 | 6 | impl<'a> FromPyObject<'a> for PyField { 7 | fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult<Self> { 8 | let capsule = call_arrow_c_schema(ob)?; 9 | Self::from_arrow_pycapsule(&capsule) 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/from_python/table.rs: -------------------------------------------------------------------------------- 1 | use crate::ffi::from_python::utils::call_arrow_c_stream; 2 | use crate::table::PyTable; 3 | use pyo3::prelude::*; 4 | use pyo3::{PyAny, PyResult}; 5 | 6 | impl<'a> FromPyObject<'a> for PyTable { 7 | fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult<Self> { 8 | let capsule = call_arrow_c_stream(ob)?; 9 | Self::from_arrow_pycapsule(&capsule) 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/from_python/datatypes.rs: -------------------------------------------------------------------------------- 1 | use crate::ffi::from_python::utils::call_arrow_c_schema; 2 | use crate::PyDataType; 3 | use pyo3::prelude::*; 4 | use pyo3::{PyAny, PyResult}; 5 | 6 | impl<'a> FromPyObject<'a> for PyDataType { 7 | fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult<Self> { 8 | let capsule = call_arrow_c_schema(ob)?; 9 | Self::from_arrow_pycapsule(&capsule) 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/from_python/schema.rs: -------------------------------------------------------------------------------- 1 | use crate::ffi::from_python::utils::call_arrow_c_schema; 2 | use crate::schema::PySchema; 3 | use pyo3::prelude::*; 4 | use pyo3::{PyAny, PyResult}; 5 | 6 | impl<'a> FromPyObject<'a> for PySchema { 7 | fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult<Self> { 8 | let schema_ptr = call_arrow_c_schema(ob)?; 9 | Self::from_arrow_pycapsule(&schema_ptr) 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/from_python/chunked.rs: -------------------------------------------------------------------------------- 1 | use crate::chunked::PyChunkedArray; 2 | use crate::ffi::from_python::utils::call_arrow_c_stream; 3 | use pyo3::prelude::*; 4 | use pyo3::{PyAny, PyResult}; 5 | 6 | impl<'a> FromPyObject<'a> for PyChunkedArray { 7 | fn extract_bound(ob:
&Bound<'a, PyAny>) -> PyResult<Self> { 8 | let capsule = call_arrow_c_stream(ob)?; 9 | Self::from_arrow_pycapsule(&capsule) 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/from_python/array_reader.rs: -------------------------------------------------------------------------------- 1 | use crate::array_reader::PyArrayReader; 2 | use crate::ffi::from_python::utils::call_arrow_c_stream; 3 | use pyo3::prelude::*; 4 | use pyo3::{PyAny, PyResult}; 5 | 6 | impl<'a> FromPyObject<'a> for PyArrayReader { 7 | fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult<Self> { 8 | let capsule = call_arrow_c_stream(ob)?; 9 | Self::from_arrow_pycapsule(&capsule) 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/from_python/scalar.rs: -------------------------------------------------------------------------------- 1 | use crate::array::*; 2 | use crate::PyScalar; 3 | use pyo3::prelude::*; 4 | use pyo3::{PyAny, PyResult}; 5 | 6 | impl<'a> FromPyObject<'a> for PyScalar { 7 | fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult<Self> { 8 | let array = ob.extract::<PyArray>()?; 9 | let (array, field) = array.into_inner(); 10 | Self::try_new(array, field).map_err(|err| err.into()) 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/from_python/record_batch_reader.rs: -------------------------------------------------------------------------------- 1 | use crate::ffi::from_python::utils::call_arrow_c_stream; 2 | use crate::record_batch_reader::PyRecordBatchReader; 3 | use pyo3::prelude::*; 4 | use pyo3::{PyAny, PyResult}; 5 | 6 | impl<'a> FromPyObject<'a> for PyRecordBatchReader { 7 | fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult<Self> { 8 | let capsule = call_arrow_c_stream(ob)?; 9 | Self::from_arrow_pycapsule(&capsule) 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /.github/workflows/conventional-commits.yml: -------------------------------------------------------------------------------- 1 | name: PR Conventional Commit Validation 2 | 3 | on: 4 | pull_request_target: 5 | types: [opened, synchronize, reopened, edited] 6 | 7 | jobs: 8 | validate-pr-title: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: PR Conventional Commit Validation 12 | uses: ytanikin/pr-conventional-commits@1.4.0 13 | with: 14 | task_types: '["feat","fix","docs","test","ci","refactor","perf","chore","revert"]' 15 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/from_python/record_batch.rs: -------------------------------------------------------------------------------- 1 | use crate::ffi::from_python::utils::call_arrow_c_array; 2 | use crate::record_batch::PyRecordBatch; 3 | use pyo3::prelude::*; 4 | use pyo3::{PyAny, PyResult}; 5 | 6 | impl<'a> FromPyObject<'a> for PyRecordBatch { 7 | fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult<Self> { 8 | let (schema_capsule, array_capsule) = call_arrow_c_array(ob)?; 9 | Self::from_arrow_pycapsule(&schema_capsule, &array_capsule) 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /arro3-compute/src/concat.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use pyo3_arrow::error::PyArrowResult; 3 | use pyo3_arrow::{PyArray, PyChunkedArray}; 4 | 5 | #[pyfunction] 6 | pub fn concat(py: Python, input:
PyChunkedArray) -> PyArrowResult<PyObject> { 7 | let (chunks, field) = input.into_inner(); 8 | let array_refs = chunks.iter().map(|arr| arr.as_ref()).collect::<Vec<_>>(); 9 | let concatted = arrow_select::concat::concat(array_refs.as_slice())?; 10 | Ok(PyArray::new(concatted, field).to_arro3(py)?.unbind()) 11 | } 12 | -------------------------------------------------------------------------------- /arro3-core/python/arro3/core/_buffer.pyi: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | if sys.version_info >= (3, 12): 4 | from collections.abc import Buffer as _Buffer 5 | else: 6 | from typing_extensions import Buffer as _Buffer 7 | 8 | class Buffer(_Buffer): 9 | """An Arrow Buffer""" 10 | def __init__(self, buffer) -> None: ... 11 | def __buffer__(self, flags: int) -> memoryview: ... 12 | def __len__(self) -> int: ... 13 | def to_bytes(self) -> bytes: 14 | """Copy this buffer into a Python `bytes` object.""" 15 | -------------------------------------------------------------------------------- /tests/core/test_array_reader.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from arro3.core import ArrayReader 3 | 4 | 5 | class CustomException(Exception): 6 | pass 7 | 8 | 9 | class ArrowCStreamFails: 10 | def __arrow_c_stream__(self, requested_schema=None): 11 | raise CustomException 12 | 13 | 14 | def test_array_reader_import_preserve_exception(): 15 | """https://github.com/kylebarron/arro3/issues/325""" 16 | 17 | c_stream_obj = ArrowCStreamFails() 18 | with pytest.raises(CustomException): 19 | ArrayReader.from_arrow(c_stream_obj) 20 | -------------------------------------------------------------------------------- /arro3-compute/python/arro3/compute/types.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Literal 4 | 5 | DatePartT = Literal[ 6 | "quarter", 7 | "year", 8 | "month", 9 | "week", 10 | "day", 11 | "dayofweeksunday0", 12 | "dayofweekmonday0", 13 | "dayofyear", 14 | "hour", 15 | "minute", 16 | "second", 17 | "millisecond", 18 | "microsecond", 19 | "nanosecond", 20 | ] 21 | """ 22 | Acceptable strings to be passed into the `part` parameter for 23 | [`date_part`][arro3.compute.date_part]. 24 | """ 25 | -------------------------------------------------------------------------------- /arro3-compute/src/take.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use pyo3_arrow::error::PyArrowResult; 3 | use pyo3_arrow::PyArray; 4 | 5 | /// Take elements by index from an Array, creating a new Array from those 6 | /// indexes. 7 | #[pyfunction] 8 | pub fn take(py: Python, values: PyArray, indices: PyArray) -> PyArrowResult<PyObject> { 9 | let output_array = 10 | py.allow_threads(|| arrow_select::take::take(values.as_ref(), indices.as_ref(), None))?; 11 | Ok(PyArray::new(output_array, values.field().clone()) 12 | .to_arro3(py)? 13 | .unbind()) 14 | } 15 | -------------------------------------------------------------------------------- /arro3-compute/python/arro3/compute/_aggregate.pyi: -------------------------------------------------------------------------------- 1 | from arro3.core import Scalar 2 | from arro3.core.types import ArrayInput, ArrowStreamExportable 3 | 4 | def max(input: ArrayInput | ArrowStreamExportable) -> Scalar: 5 | """ 6 | Returns the max of values in the array.
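
    A usage sketch based on this repo's own tests (`tests/compute/test_arith.py` constructs arrays the same way); illustrative, not normative:

        from arro3.compute import max
        from arro3.core import Array, DataType

        assert max(Array([1, 5, 3], DataType.int16())).as_py() == 5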
7 | """ 8 | 9 | def min(input: ArrayInput | ArrowStreamExportable) -> Scalar: 10 | """ 11 | Returns the min of values in the array. 12 | """ 13 | 14 | def sum(input: ArrayInput | ArrowStreamExportable) -> Scalar: 15 | """ 16 | Returns the sum of values in the array. 17 | """ 18 | -------------------------------------------------------------------------------- /tests/core/test_record_batch_reader.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from arro3.core import RecordBatchReader 3 | 4 | 5 | class CustomException(Exception): 6 | pass 7 | 8 | 9 | class ArrowCStreamFails: 10 | def __arrow_c_stream__(self, requested_schema=None): 11 | raise CustomException 12 | 13 | 14 | def test_record_batch_reader_import_preserve_exception(): 15 | """https://github.com/kylebarron/arro3/issues/325""" 16 | 17 | c_stream_obj = ArrowCStreamFails() 18 | with pytest.raises(CustomException): 19 | RecordBatchReader.from_arrow(c_stream_obj) 20 | -------------------------------------------------------------------------------- /arro3-io/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin>=1.4.0,<2.0"] 3 | build-backend = "maturin" 4 | 5 | [project] 6 | name = "arro3-io" 7 | requires-python = ">=3.9" 8 | dependencies = ["arro3-core"] 9 | classifiers = [ 10 | "Programming Language :: Rust", 11 | "Programming Language :: Python :: Implementation :: CPython", 12 | "Programming Language :: Python :: Implementation :: PyPy", 13 | ] 14 | dynamic = ["version"] 15 | 16 | [tool.maturin] 17 | features = ["pyo3/extension-module"] 18 | module-name = "arro3.io._io" 19 | python-source = "python" 20 | strip = true 21 | -------------------------------------------------------------------------------- /tests/compute/test_arith.py: -------------------------------------------------------------------------------- 1 | import arro3.compute as ac 2 | import pyarrow as pa 3 | from arro3.core import Array, DataType 4 | 5 | 6 | def test_add(): 7 | arr1 = Array([1, 2, 3], DataType.int16()) 8 | assert ac.min(arr1).as_py() == 1 9 | 10 | arr2 = Array([3, 2, 0], DataType.int16()) 11 | assert ac.min(arr2).as_py() == 0 12 | 13 | add1 = ac.add(arr1, arr2) 14 | assert pa.array(add1) == pa.array(Array([4, 4, 3], DataType.int16())) 15 | 16 | s = arr1[0] 17 | add2 = ac.add(arr1, s) 18 | assert pa.array(add2) == pa.array(Array([2, 3, 4], DataType.int16())) 19 | -------------------------------------------------------------------------------- /arro3-compute/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin>=1.4.0,<2.0"] 3 | build-backend = "maturin" 4 | 5 | [project] 6 | name = "arro3-compute" 7 | requires-python = ">=3.9" 8 | dependencies = ["arro3-core"] 9 | classifiers = [ 10 | "Programming Language :: Rust", 11 | "Programming Language :: Python :: Implementation :: CPython", 12 | "Programming Language :: Python :: Implementation :: PyPy", 13 | ] 14 | dynamic = ["version"] 15 | 16 | [tool.maturin] 17 | features = ["pyo3/extension-module"] 18 | module-name = "arro3.compute._compute" 19 | python-source = "python" 20 | strip = true 21 | -------------------------------------------------------------------------------- /arro3-core/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin>=1.4.0,<2.0"] 3 | build-backend = "maturin" 4 | 5 | 
[project] 6 | name = "arro3-core" 7 | requires-python = ">=3.9" 8 | dependencies = ["typing-extensions; python_version < '3.12'"] 9 | classifiers = [ 10 | "Programming Language :: Rust", 11 | "Programming Language :: Python :: Implementation :: CPython", 12 | "Programming Language :: Python :: Implementation :: PyPy", 13 | ] 14 | dynamic = ["version"] 15 | 16 | [tool.maturin] 17 | features = ["pyo3/extension-module"] 18 | module-name = "arro3.core._core" 19 | python-source = "python" 20 | strip = true 21 | -------------------------------------------------------------------------------- /arro3-io/python/arro3/io/_io.pyi: -------------------------------------------------------------------------------- 1 | from ._csv import infer_csv_schema, read_csv, write_csv 2 | from ._ipc import read_ipc, read_ipc_stream, write_ipc, write_ipc_stream 3 | from ._json import infer_json_schema, read_json, write_json, write_ndjson 4 | from ._parquet import read_parquet, read_parquet_async, write_parquet 5 | 6 | __all__ = [ 7 | "infer_csv_schema", 8 | "read_csv", 9 | "write_csv", 10 | "infer_json_schema", 11 | "read_json", 12 | "write_json", 13 | "write_ndjson", 14 | "read_ipc", 15 | "read_ipc_stream", 16 | "write_ipc", 17 | "write_ipc_stream", 18 | "read_parquet", 19 | "read_parquet_async", 20 | "write_parquet", 21 | ] 22 | -------------------------------------------------------------------------------- /tests/core/test_list_flatten.py: -------------------------------------------------------------------------------- 1 | import pyarrow as pa 2 | from arro3.core import list_flatten 3 | 4 | 5 | def test_list_flatten(): 6 | list_arr = pa.array([[1, 2], [3, 4]]) 7 | out = pa.array(list_flatten(list_arr)) 8 | assert out == pa.array([1, 2, 3, 4]) 9 | 10 | 11 | def test_list_flatten_sliced_end(): 12 | list_arr = pa.array([[1, 2], [3, 4]]) 13 | sliced = list_arr.slice(1, 2) 14 | out = pa.array(list_flatten(sliced)) 15 | assert out == pa.array([3, 4]) 16 | 17 | 18 | def test_list_flatten_sliced_start(): 19 | list_arr = pa.array([[1, 2], [3, 4]]) 20 | sliced = list_arr.slice(0, 1) 21 | out = pa.array(list_flatten(sliced)) 22 | assert out == pa.array([1, 2]) 23 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | 4 | # Default to Python 3 5 | default_language_version: 6 | python: python3 7 | 8 | # Optionally both commit and push 9 | default_stages: [pre-commit] 10 | 11 | repos: 12 | - repo: https://github.com/pre-commit/pre-commit-hooks 13 | rev: v2.4.0 14 | hooks: 15 | - id: trailing-whitespace 16 | - id: end-of-file-fixer 17 | - id: check-added-large-files 18 | args: ["--maxkb=500"] 19 | 20 | - repo: https://github.com/astral-sh/ruff-pre-commit 21 | rev: v0.12.10 22 | hooks: 23 | - id: ruff 24 | args: ["--fix"] 25 | - id: ruff-format 26 | -------------------------------------------------------------------------------- /tests/core/test_misc.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | 4 | 5 | def test_numpy_backed_array_to_pyarrow(): 6 | # Passing a numpy-backed `arro3.core.Array` to `pyarrow.Array` 7 | # caused a segfault at interpreter shutdown. 
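    # The snippet below runs in a subprocess so that, if the bug regresses, only the child interpreter crashes and `check_call` raises, rather than the crash taking down the whole pytest run.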
8 | # Affected versions: 0.4.0, 0.4.1 9 | # See: [#230](https://github.com/kylebarron/arro3/issues/230) 10 | code = ( 11 | "import numpy as np\n" 12 | "import pyarrow as pa\n" 13 | "from arro3.core import Array\n" 14 | "\n" 15 | "numpy_arr = np.array([0, 1, 2, 3], dtype=np.float64)\n" 16 | "arro3_arr = Array(numpy_arr)\n" 17 | "pyarrow_arr = pa.array(arro3_arr)\n" 18 | ) 19 | subprocess.check_call([sys.executable, "-c", code]) 20 | -------------------------------------------------------------------------------- /pyo3-arrow/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![doc = include_str!("../README.md")] 2 | #![deny(missing_docs)] 3 | 4 | mod array; 5 | mod array_reader; 6 | #[cfg(feature = "buffer_protocol")] 7 | pub mod buffer; 8 | mod chunked; 9 | mod datatypes; 10 | pub mod error; 11 | pub mod export; 12 | pub mod ffi; 13 | mod field; 14 | pub mod input; 15 | mod interop; 16 | mod record_batch; 17 | mod record_batch_reader; 18 | mod scalar; 19 | mod schema; 20 | mod table; 21 | mod utils; 22 | 23 | pub use array::PyArray; 24 | pub use array_reader::PyArrayReader; 25 | pub use chunked::PyChunkedArray; 26 | pub use datatypes::PyDataType; 27 | pub use field::PyField; 28 | pub use record_batch::PyRecordBatch; 29 | pub use record_batch_reader::PyRecordBatchReader; 30 | pub use scalar::PyScalar; 31 | pub use schema::PySchema; 32 | pub use table::PyTable; 33 | -------------------------------------------------------------------------------- /tests/core/test_schema.py: -------------------------------------------------------------------------------- 1 | import pyarrow as pa 2 | import pytest 3 | from arro3.core import Field, Schema, Table 4 | 5 | 6 | def test_schema_iterable(): 7 | a = pa.chunked_array([[1, 2, 3, 4]]) 8 | b = pa.chunked_array([["a", "b", "c", "d"]]) 9 | table = Table.from_pydict({"a": a, "b": b}) 10 | schema = table.schema 11 | for field in schema: 12 | assert isinstance(field, Field) 13 | assert field.name in ["a", "b"] 14 | 15 | 16 | class CustomException(Exception): 17 | pass 18 | 19 | 20 | class ArrowCSchemaFails: 21 | def __arrow_c_schema__(self): 22 | raise CustomException 23 | 24 | 25 | def test_schema_import_preserve_exception(): 26 | """https://github.com/kylebarron/arro3/issues/325""" 27 | 28 | c_stream_obj = ArrowCSchemaFails() 29 | with pytest.raises(CustomException): 30 | Schema.from_arrow(c_stream_obj) 31 | -------------------------------------------------------------------------------- /arro3-compute/python/arro3/compute/_filter.pyi: -------------------------------------------------------------------------------- 1 | from typing import overload 2 | 3 | from arro3.core import Array, ArrayReader 4 | from arro3.core.types import ArrayInput, ArrowStreamExportable 5 | 6 | @overload 7 | def filter( 8 | values: ArrayInput, 9 | predicate: ArrayInput, 10 | ) -> Array: ... 11 | @overload 12 | def filter( 13 | values: ArrowStreamExportable, 14 | predicate: ArrowStreamExportable, 15 | ) -> ArrayReader: ... 16 | def filter( 17 | values: ArrayInput | ArrowStreamExportable, 18 | predicate: ArrayInput | ArrowStreamExportable, 19 | ) -> Array | ArrayReader: 20 | """ 21 | Returns a filtered `values` array where the corresponding elements of 22 | `predicate` are `true`. 23 | 24 | If `values` is an Array, an `Array` will be returned. If `values` is a `ChunkedArray` 25 | or `ArrayReader`, an `ArrayReader` will be returned.
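
    A usage sketch (hedged: pyarrow arrays are used as inputs here, which works because `ArrayInput` accepts any object implementing `__arrow_c_array__`):

        import pyarrow as pa
        from arro3.compute import filter

        values = pa.array([1, 2, 3, 4])
        predicate = pa.array([True, False, True, False])
        out = filter(values, predicate)  # an Array holding [1, 3]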
26 | """ 27 | -------------------------------------------------------------------------------- /pyo3-arrow/src/utils.rs: -------------------------------------------------------------------------------- 1 | use arrow_cast::display::FormatOptions; 2 | use arrow_schema::Schema; 3 | 4 | /// Check whether two schemas are equal 5 | /// 6 | /// This allows schemas to have different top-level metadata, as well as different nested field 7 | /// names and keys. 8 | pub(crate) fn schema_equals(left: &Schema, right: &Schema) -> bool { 9 | left.fields 10 | .iter() 11 | .zip(right.fields.iter()) 12 | .all(|(left_field, right_field)| { 13 | left_field.name() == right_field.name() 14 | && left_field 15 | .data_type() 16 | .equals_datatype(right_field.data_type()) 17 | }) 18 | } 19 | 20 | pub(crate) fn default_repr_options<'a>() -> FormatOptions<'a> { 21 | FormatOptions::new() 22 | .with_display_error(true) 23 | .with_null("null") 24 | .with_types_info(true) 25 | } 26 | -------------------------------------------------------------------------------- /arro3-compute/python/arro3/compute/_dictionary.pyi: -------------------------------------------------------------------------------- 1 | from typing import overload 2 | 3 | from arro3.core import Array, ArrayReader 4 | from arro3.core.types import ArrayInput, ArrowStreamExportable 5 | 6 | @overload 7 | def dictionary_encode(array: ArrayInput) -> Array: ... 8 | @overload 9 | def dictionary_encode(array: ArrowStreamExportable) -> ArrayReader: ... 10 | def dictionary_encode( 11 | array: ArrayInput | ArrowStreamExportable, 12 | ) -> Array | ArrayReader: 13 | """ 14 | Dictionary-encode array. 15 | 16 | Return a dictionary-encoded version of the input array. This function does nothing if the input is already a dictionary array. 17 | 18 | Note: for stream input, each output array will not necessarily have the same dictionary. 19 | 20 | Args: 21 | array: Argument to compute function. 22 | 23 | Returns: 24 | The dictionary-encoded array. 25 | """ 26 | -------------------------------------------------------------------------------- /arro3-core/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "arro3-core" 3 | version = { workspace = true } 4 | authors = { workspace = true } 5 | edition = { workspace = true } 6 | description = "Core library for representing Arrow data in Python." 
7 | readme = "README.md" 8 | repository = { workspace = true } 9 | homepage = { workspace = true } 10 | license = { workspace = true } 11 | keywords = { workspace = true } 12 | categories = { workspace = true } 13 | rust-version = { workspace = true } 14 | 15 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 16 | [lib] 17 | name = "_core" 18 | crate-type = ["cdylib"] 19 | 20 | [dependencies] 21 | arrow-array = { workspace = true } 22 | arrow-buffer = { workspace = true } 23 | arrow-cast = { workspace = true, features = ["prettyprint"] } 24 | arrow-schema = { workspace = true } 25 | pyo3-arrow = { workspace = true } 26 | pyo3 = { workspace = true } 27 | -------------------------------------------------------------------------------- /tests/core/test_record_batch.py: -------------------------------------------------------------------------------- 1 | import pyarrow as pa 2 | import pytest 3 | from arro3.core import RecordBatch 4 | 5 | 6 | def test_nonempty_batch_no_columns(): 7 | batch = pa.record_batch({"a": [1, 2, 3, 4]}).select([]) 8 | assert len(batch) == 4 9 | assert batch.num_columns == 0 10 | arro3_batch = RecordBatch.from_arrow(batch) 11 | retour = pa.record_batch(arro3_batch) 12 | assert batch == retour 13 | 14 | 15 | class CustomException(Exception): 16 | pass 17 | 18 | 19 | class ArrowCArrayFails: 20 | def __arrow_c_array__(self, requested_schema=None): 21 | raise CustomException 22 | 23 | 24 | def test_record_batch_import_preserve_exception(): 25 | """https://github.com/kylebarron/arro3/issues/325""" 26 | 27 | c_stream_obj = ArrowCArrayFails() 28 | with pytest.raises(CustomException): 29 | RecordBatch.from_arrow(c_stream_obj) 30 | 31 | with pytest.raises(CustomException): 32 | RecordBatch(c_stream_obj) 33 | -------------------------------------------------------------------------------- /arro3-compute/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "arro3-compute" 3 | version = { workspace = true } 4 | authors = { workspace = true } 5 | edition = { workspace = true } 6 | description = "Rust-based compute kernels for Arrow in Python." 
7 | readme = "README.md" 8 | repository = { workspace = true } 9 | homepage = { workspace = true } 10 | license = { workspace = true } 11 | keywords = { workspace = true } 12 | categories = { workspace = true } 13 | rust-version = { workspace = true } 14 | 15 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 16 | [lib] 17 | name = "_compute" 18 | crate-type = ["cdylib"] 19 | 20 | [dependencies] 21 | arrow-arith = { workspace = true } 22 | arrow-array = { workspace = true } 23 | arrow-buffer = { workspace = true } 24 | arrow-cast = { workspace = true } 25 | arrow-schema = { workspace = true } 26 | arrow-select = { workspace = true } 27 | pyo3 = { workspace = true } 28 | pyo3-arrow = { workspace = true } 29 | thiserror = { workspace = true } 30 | -------------------------------------------------------------------------------- /tests/core/test_list_offsets.py: -------------------------------------------------------------------------------- 1 | import pyarrow as pa 2 | from arro3.core import list_offsets 3 | 4 | 5 | def test_list_offsets(): 6 | list_arr = pa.array([[1, 2], [3, 4]]) 7 | out = pa.array(list_offsets(list_arr)) 8 | assert out == list_arr.offsets 9 | 10 | 11 | def test_list_offsets_sliced_end(): 12 | list_arr = pa.array([[1, 2], [3, 4]]) 13 | sliced = list_arr.slice(1, 1) 14 | 15 | out = pa.array(list_offsets(sliced, logical=False)) 16 | assert out == pa.array([2, 4], type=pa.int32()) 17 | 18 | out = pa.array(list_offsets(sliced, logical=True)) 19 | assert out == pa.array([0, 2], type=pa.int32()) 20 | 21 | 22 | def test_list_offsets_sliced_start(): 23 | list_arr = pa.array([[1, 2], [3, 4]]) 24 | sliced = list_arr.slice(0, 1) 25 | 26 | out = pa.array(list_offsets(sliced, logical=False)) 27 | assert out == pa.array([0, 2], type=pa.int32()) 28 | 29 | out = pa.array(list_offsets(sliced, logical=True)) 30 | assert out == pa.array([0, 2], type=pa.int32()) 31 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/from_python/array.rs: -------------------------------------------------------------------------------- 1 | use crate::array::*; 2 | #[cfg(feature = "buffer_protocol")] 3 | use crate::buffer::AnyBufferProtocol; 4 | use crate::ffi::from_python::utils::call_arrow_c_array; 5 | use pyo3::exceptions::PyValueError; 6 | use pyo3::prelude::*; 7 | use pyo3::{intern, PyAny, PyResult}; 8 | 9 | impl<'a> FromPyObject<'a> for PyArray { 10 | fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult<Self> { 11 | if ob.hasattr(intern!(ob.py(), "__arrow_c_array__"))? { 12 | let (schema_capsule, array_capsule) = call_arrow_c_array(ob)?; 13 | Self::from_arrow_pycapsule(&schema_capsule, &array_capsule) 14 | } else { 15 | #[cfg(feature = "buffer_protocol")] 16 | if let Ok(buf) = ob.extract::<AnyBufferProtocol>() { 17 | return Ok(buf.try_into()?); 18 | } 19 | 20 | Err(PyValueError::new_err( 21 | "Expected object with __arrow_c_array__ method or implementing buffer protocol.", 22 | )) 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/to_python/nanoarrow.rs: -------------------------------------------------------------------------------- 1 | use pyo3::intern; 2 | use pyo3::prelude::*; 3 | use pyo3::types::{PyCapsule, PyTuple}; 4 | 5 | pub fn to_nanoarrow_schema<'py>( 6 | py: Python<'py>, 7 | capsule: &Bound<'py, PyCapsule>, 8 | ) -> PyResult<Bound<'py, PyAny>> { 9 | let na_mod = py.import(intern!(py, "nanoarrow"))?; 10 | na_mod 11 | .getattr(intern!(py, "Schema"))?
12 | .call1(PyTuple::new(py, vec![capsule])?) 13 | } 14 | 15 | pub fn to_nanoarrow_array<'py>( 16 | py: Python<'py>, 17 | capsules: Bound<'py, PyTuple>, 18 | ) -> PyResult<Bound<'py, PyAny>> { 19 | let na_mod = py.import(intern!(py, "nanoarrow"))?; 20 | na_mod.getattr(intern!(py, "Array"))?.call1(capsules) 21 | } 22 | 23 | pub fn to_nanoarrow_array_stream<'py>( 24 | py: Python<'py>, 25 | capsule: &Bound<'py, PyCapsule>, 26 | ) -> PyResult<Bound<'py, PyAny>> { 27 | let na_mod = py.import(intern!(py, "nanoarrow"))?; 28 | na_mod 29 | .getattr(intern!(py, "ArrayStream"))? 30 | .call1(PyTuple::new(py, vec![capsule])?) 31 | } 32 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "arro3" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.11" 7 | dependencies = [] 8 | 9 | [tool.uv] 10 | dev-dependencies = [ 11 | "black>=24.10.0", 12 | "boto3>=1.35.38", 13 | "geoarrow-types>=0.3.0", 14 | "griffe-inherited-docstrings>=1.0.1", 15 | "ipykernel>=6.29.5", 16 | "maturin>=1.7.4", 17 | "mike>=2.1.3", 18 | "mkdocs-material[imaging]>=9.6.7", 19 | "mkdocs-redirects>=1.2.2", 20 | "mkdocs>=1.6.1", 21 | "mkdocstrings[python]>=0.28.3", 22 | "pandas-stubs>=2.2.3.250527", 23 | "pandas>=2.2.3", 24 | "pip>=24.2", 25 | "pyarrow>=21.0.0", 26 | "pytest>=8.3.3", 27 | ] 28 | 29 | [tool.ruff.lint] 30 | select = [ 31 | # Pyflakes 32 | "F", 33 | # Pycodestyle 34 | # "E", 35 | "W", 36 | # isort 37 | "I", 38 | ] 39 | 40 | [tool.ruff.lint.extend-per-file-ignores] 41 | "__init__.py" = [ 42 | "F401", # Allow unused imports in __init__.py files 43 | "F403", # unable to detect undefined names 44 | ] 45 | -------------------------------------------------------------------------------- /LICENSE_MIT: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Kyle Barron 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /arro3-compute/python/arro3/compute/_compute.pyi: -------------------------------------------------------------------------------- 1 | from arro3.compute._aggregate import max as max 2 | from arro3.compute._aggregate import min as min 3 | from arro3.compute._aggregate import sum as sum 4 | from arro3.compute._arith import add as add 5 | from arro3.compute._arith import add_wrapping as add_wrapping 6 | from arro3.compute._arith import div as div 7 | from arro3.compute._arith import mul as mul 8 | from arro3.compute._arith import mul_wrapping as mul_wrapping 9 | from arro3.compute._arith import neg as neg 10 | from arro3.compute._arith import neg_wrapping as neg_wrapping 11 | from arro3.compute._arith import rem as rem 12 | from arro3.compute._arith import sub as sub 13 | from arro3.compute._arith import sub_wrapping as sub_wrapping 14 | from arro3.compute._boolean import is_not_null as is_not_null 15 | from arro3.compute._boolean import is_null as is_null 16 | from arro3.compute._cast import can_cast_types as can_cast_types 17 | from arro3.compute._cast import cast as cast 18 | from arro3.compute._dictionary import dictionary_encode as dictionary_encode 19 | from arro3.compute._filter import filter as filter 20 | from arro3.compute._take import take as take 21 | from arro3.compute._temporal import date_part as date_part 22 | -------------------------------------------------------------------------------- /arro3-compute/python/arro3/compute/_arith.pyi: -------------------------------------------------------------------------------- 1 | from arro3.core import Array 2 | from arro3.core.types import ArrayInput 3 | 4 | def add(lhs: ArrayInput, rhs: ArrayInput) -> Array: 5 | """Perform `lhs + rhs`, returning an error on overflow""" 6 | 7 | def add_wrapping(lhs: ArrayInput, rhs: ArrayInput) -> Array: 8 | """Perform `lhs + rhs`, wrapping on overflow for integer data types.""" 9 | 10 | def div(lhs: ArrayInput, rhs: ArrayInput) -> Array: 11 | """Perform `lhs / rhs`""" 12 | 13 | def mul(lhs: ArrayInput, rhs: ArrayInput) -> Array: 14 | """Perform `lhs * rhs`, returning an error on overflow""" 15 | 16 | def mul_wrapping(lhs: ArrayInput, rhs: ArrayInput) -> Array: 17 | """Perform `lhs * rhs`, wrapping on overflow for integer data types.""" 18 | 19 | def neg(array: ArrayInput) -> Array: 20 | """Negates each element of array, returning an error on overflow""" 21 | 22 | def neg_wrapping(array: ArrayInput) -> Array: 23 | """Negates each element of array, wrapping on overflow for integer data types.""" 24 | 25 | def rem(lhs: ArrayInput, rhs: ArrayInput) -> Array: 26 | """Perform `lhs % rhs`""" 27 | 28 | def sub(lhs: ArrayInput, rhs: ArrayInput) -> Array: 29 | """Perform `lhs - rhs`, returning an error on overflow""" 30 | 31 | def sub_wrapping(lhs: ArrayInput, rhs: ArrayInput) -> Array: 32 | """Perform `lhs - rhs`, wrapping on overflow for integer data types.""" 33 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Rust 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | 9 | jobs: 10 | lint-test: 11 | name: Lint and Test 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v4 15 | with: 16 | submodules: "recursive" 17 | 18 | - name: Install Rust 19 | uses: dtolnay/rust-toolchain@stable 20 | with: 21 | components: rustfmt, clippy 22 | 23 | - 
uses: Swatinem/rust-cache@v2 24 | 25 | - name: Cargo fmt 26 | run: cargo fmt --all -- --check 27 | 28 | - name: "clippy --all" 29 | run: cargo clippy --all --all-features --tests -- -D warnings 30 | 31 | - name: "cargo check" 32 | run: cargo check --all --all-features 33 | 34 | - name: "cargo test" 35 | run: | 36 | cargo test --all 37 | cargo test --all --all-features 38 | 39 | check-features_pyo3_arrow: 40 | runs-on: ubuntu-latest 41 | strategy: 42 | fail-fast: false 43 | matrix: 44 | args: 45 | - "--no-default-features" 46 | steps: 47 | - uses: actions/checkout@v4 48 | with: 49 | submodules: "recursive" 50 | - uses: dtolnay/rust-toolchain@stable 51 | - uses: Swatinem/rust-cache@v2 52 | - name: Test 53 | run: cd pyo3-arrow && cargo check ${{ matrix.args }} 54 | -------------------------------------------------------------------------------- /arro3-compute/python/arro3/compute/_boolean.pyi: -------------------------------------------------------------------------------- 1 | from typing import overload 2 | 3 | from arro3.core import Array, ArrayReader 4 | from arro3.core.types import ArrayInput, ArrowStreamExportable 5 | 6 | @overload 7 | def is_null(input: ArrayInput) -> Array: ... 8 | @overload 9 | def is_null(input: ArrowStreamExportable) -> ArrayReader: ... 10 | def is_null( 11 | input: ArrayInput | ArrowStreamExportable, 12 | ) -> Array | ArrayReader: 13 | """ 14 | Returns a non-null boolean-typed array with whether each value of the array is null. 15 | 16 | If `input` is an Array, an `Array` will be returned. If `input` is a `ChunkedArray` or `ArrayReader`, an `ArrayReader` will be returned. 17 | 18 | Args: 19 | input: Input data 20 | 21 | Returns: 22 | Output 23 | """ 24 | 25 | @overload 26 | def is_not_null(input: ArrayInput) -> Array: ... 27 | @overload 28 | def is_not_null(input: ArrowStreamExportable) -> ArrayReader: ... 29 | def is_not_null( 30 | input: ArrayInput | ArrowStreamExportable, 31 | ) -> Array | ArrayReader: 32 | """ 33 | Returns a non-null boolean-typed array with whether each value of the array is not null. 34 | 35 | If `input` is an Array, an `Array` will be returned. If `input` is a `ChunkedArray` or `ArrayReader`, an `ArrayReader` will be returned. 36 | 37 | Args: 38 | input: Input data 39 | 40 | Returns: 41 | Output 42 | """ 43 | -------------------------------------------------------------------------------- /pyo3-arrow/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "pyo3-arrow" 3 | version = "0.11.0" 4 | authors = ["Kyle Barron "] 5 | edition = "2021" 6 | description = "Arrow integration for pyo3." 7 | readme = "README.md" 8 | repository = "https://github.com/kylebarron/arro3" 9 | license = "MIT OR Apache-2.0" 10 | keywords = ["python", "arrow"] 11 | categories = [] 12 | rust-version = "1.75" 13 | 14 | [features] 15 | default = ["buffer_protocol"] 16 | 17 | # Support buffer protocol. Requires `abi3-py311` pyo3 feature or non-abi3 18 | # wheels. 
19 | buffer_protocol = [] 20 | 21 | [dependencies] 22 | arrow-array = { version = "56", features = ["chrono-tz", "ffi"] } 23 | arrow-buffer = "56" 24 | arrow-cast = { version = "56", features = ["prettyprint"] } 25 | arrow-data = "56" 26 | arrow-schema = "56" 27 | arrow-select = "56" 28 | pyo3 = { version = "0.26", features = ["chrono", "chrono-tz", "indexmap"] } 29 | half = "2" 30 | indexmap = "2" 31 | # numpy = { version = "0.26", features = ["half"] } 32 | numpy = { git = "https://github.com/Icxolu/rust-numpy", rev = "2480e2c86f6e91dc815b7f8e473b71bb18486bb1", features = [ 33 | "half", 34 | ] } 35 | thiserror = "1" 36 | 37 | [lib] 38 | crate-type = ["rlib"] 39 | 40 | [patch.crates-io] 41 | pyo3 = { version = "0.26.0", features = [ 42 | "macros", 43 | ], git = "https://github.com/pyo3/pyo3.git", tag = "v0.26.0" } 44 | numpy = { git = "https://github.com/Icxolu/rust-numpy", rev = "2480e2c86f6e91dc815b7f8e473b71bb18486bb1" } 45 | -------------------------------------------------------------------------------- /tests/io/test_parquet.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | 3 | import pyarrow as pa 4 | import pyarrow.parquet as pq 5 | from arro3.io import read_parquet, write_parquet 6 | 7 | 8 | def test_parquet_round_trip(): 9 | table = pa.table({"a": [1, 2, 3, 4]}) 10 | write_parquet(table, "test.parquet") 11 | table_retour = pa.table(read_parquet("test.parquet")) 12 | assert table == table_retour 13 | 14 | 15 | def test_parquet_round_trip_bytes_io(): 16 | table = pa.table({"a": [1, 2, 3, 4]}) 17 | with BytesIO() as bio: 18 | write_parquet(table, bio) 19 | bio.seek(0) 20 | table_retour = pa.table(read_parquet(bio)) 21 | assert table == table_retour 22 | 23 | 24 | def test_copy_parquet_kv_metadata(): 25 | metadata = {"hello": "world"} 26 | table = pa.table({"a": [1, 2, 3]}) 27 | write_parquet( 28 | table, 29 | "test.parquet", 30 | key_value_metadata=metadata, 31 | skip_arrow_metadata=True, 32 | ) 33 | 34 | # Assert metadata was written, but arrow schema was not 35 | pq_meta = pq.read_metadata("test.parquet").metadata 36 | assert pq_meta[b"hello"] == b"world" 37 | assert b"ARROW:schema" not in pq_meta.keys() 38 | 39 | # When reading with pyarrow, kv meta gets assigned to table 40 | pa_table = pq.read_table("test.parquet") 41 | assert pa_table.schema.metadata[b"hello"] == b"world" 42 | 43 | reader = read_parquet("test.parquet") 44 | assert reader.schema.metadata[b"hello"] == b"world" 45 | -------------------------------------------------------------------------------- /arro3-compute/python/arro3/compute/_take.pyi: -------------------------------------------------------------------------------- 1 | from arro3.core import Array 2 | from arro3.core.types import ArrayInput 3 | 4 | def take(values: ArrayInput, indices: ArrayInput) -> Array: 5 | """Take elements by index from Array, creating a new Array from those indexes. 6 | 7 | ``` 8 | ┌─────────────────┐ ┌─────────┐ ┌─────────────────┐ 9 | │ A │ │ 0 │ │ A │ 10 | ├─────────────────┤ ├─────────┤ ├─────────────────┤ 11 | │ D │ │ 2 │ │ B │ 12 | ├─────────────────┤ ├─────────┤ take(values, indices) ├─────────────────┤ 13 | │ B │ │ 3 │ ─────────────────────────▶ │ C │ 14 | ├─────────────────┤ ├─────────┤ ├─────────────────┤ 15 | │ C │ │ 1 │ │ D │ 16 | ├─────────────────┤ └─────────┘ └─────────────────┘ 17 | │ E │ 18 | └─────────────────┘ 19 | values array indices array result 20 | ``` 21 | 22 | Args: 23 | values: The input Arrow data to select from. 
24 | indices: The indices within `values` to take. This must be a numeric array. 25 | 26 | Returns: 27 | The selected arrow data. 28 | """ 29 | -------------------------------------------------------------------------------- /arro3-compute/python/arro3/compute/_cast.pyi: -------------------------------------------------------------------------------- 1 | from typing import overload 2 | 3 | from arro3.core import Array, ArrayReader 4 | from arro3.core.types import ( 5 | ArrayInput, 6 | ArrowSchemaExportable, 7 | ArrowStreamExportable, 8 | ) 9 | 10 | @overload 11 | def cast( 12 | input: ArrayInput, 13 | to_type: ArrowSchemaExportable, 14 | ) -> Array: ... 15 | @overload 16 | def cast( 17 | input: ArrowStreamExportable, 18 | to_type: ArrowSchemaExportable, 19 | ) -> ArrayReader: ... 20 | def cast( 21 | input: ArrayInput | ArrowStreamExportable, 22 | to_type: ArrowSchemaExportable, 23 | ) -> Array | ArrayReader: 24 | """ 25 | Cast `input` to the provided data type and return a new Array with type `to_type`, if possible. 26 | 27 | If `input` is an Array, an `Array` will be returned. If `input` is a `ChunkedArray` or `ArrayReader`, an `ArrayReader` will be returned. 28 | 29 | Args: 30 | input: Input data to cast. 31 | to_type: The target data type to cast to. You may pass in a `Field` here if you wish to include Arrow extension metadata on the output array. 32 | 33 | Returns: 34 | The casted Arrow data. 35 | """ 36 | 37 | def can_cast_types( 38 | from_type: ArrowSchemaExportable, to_type: ArrowSchemaExportable 39 | ) -> bool: 40 | """Return true if a value of type `from_type` can be cast into a value of `to_type`. 41 | 42 | Args: 43 | from_type: Source type 44 | to_type: Destination type 45 | 46 | Returns: 47 | True if can be casted. 48 | """ 49 | -------------------------------------------------------------------------------- /tests/core/test_struct_field.py: -------------------------------------------------------------------------------- 1 | import pyarrow as pa 2 | import pytest 3 | from arro3.core import struct_field 4 | 5 | 6 | def test_struct_field(): 7 | a = pa.array([1, 2, 3]) 8 | b = pa.array([3, 4, 5]) 9 | struct_arr = pa.StructArray.from_arrays([a, b], names=["a", "b"]) 10 | assert pa.array(struct_field(struct_arr, [0])) == a 11 | 12 | 13 | def test_struct_field_sliced_end(): 14 | a = pa.array([1, 2, 3]) 15 | b = pa.array([3, 4, 5]) 16 | struct_arr = pa.StructArray.from_arrays([a, b], names=["a", "b"]) 17 | sliced = struct_arr.slice(1, 2) 18 | sliced.offset 19 | out = pa.array(struct_field(sliced, [0])) 20 | assert out == sliced.field(0) 21 | 22 | 23 | def test_struct_field_sliced_start(): 24 | a = pa.array([1, 2, 3]) 25 | b = pa.array([3, 4, 5]) 26 | struct_arr = pa.StructArray.from_arrays([a, b], names=["a", "b"]) 27 | sliced = struct_arr.slice(0, 1) 28 | out = pa.array(struct_field(sliced, [0])) 29 | assert out == sliced.field(0) 30 | 31 | 32 | def test_struct_field_nested(): 33 | a = pa.array([1, 2, 3]) 34 | b = pa.array([3, 4, 5]) 35 | c = pa.array([7, 8, 9]) 36 | inner = pa.StructArray.from_arrays([a, b], names=["a", "b"]) 37 | outer = pa.StructArray.from_arrays([inner, c], names=["inner", "c"]) 38 | assert pa.array(struct_field(outer, [0, 0])) == a 39 | assert pa.array(struct_field(outer, [0, 1])) == b 40 | assert pa.array(struct_field(outer, [1])) == c 41 | 42 | with pytest.raises(Exception): 43 | assert pa.array(struct_field(outer, [2])) 44 | -------------------------------------------------------------------------------- /pyo3-arrow/src/error.rs: 
-------------------------------------------------------------------------------- 1 | //! Contains the [`PyArrowError`], the Error returned by most fallible functions in this crate. 2 | 3 | use numpy::BorrowError; 4 | use pyo3::exceptions::{PyException, PyValueError}; 5 | use pyo3::prelude::*; 6 | use pyo3::DowncastError; 7 | use thiserror::Error; 8 | 9 | /// The Error variants returned by this crate. 10 | #[derive(Error, Debug)] 11 | #[non_exhaustive] 12 | pub enum PyArrowError { 13 | /// A wrapped [arrow::error::ArrowError] 14 | #[error(transparent)] 15 | ArrowError(#[from] arrow_schema::ArrowError), 16 | 17 | /// A wrapped [PyErr] 18 | #[error(transparent)] 19 | PyErr(#[from] PyErr), 20 | 21 | /// Indicates why borrowing an array failed. 22 | #[error(transparent)] 23 | NumpyBorrowError(#[from] BorrowError), 24 | } 25 | 26 | impl From<PyArrowError> for PyErr { 27 | fn from(error: PyArrowError) -> Self { 28 | match error { 29 | PyArrowError::PyErr(err) => err, 30 | PyArrowError::ArrowError(err) => PyException::new_err(err.to_string()), 31 | PyArrowError::NumpyBorrowError(err) => PyException::new_err(err.to_string()), 32 | } 33 | } 34 | } 35 | 36 | impl<'a, 'py> From<DowncastError<'a, 'py>> for PyArrowError { 37 | fn from(other: DowncastError<'a, 'py>) -> Self { 38 | Self::PyErr(PyValueError::new_err(format!( 39 | "Could not downcast: {}", 40 | other 41 | ))) 42 | } 43 | } 44 | 45 | /// A type wrapper around `Result<T, PyArrowError>`. 46 | pub type PyArrowResult<T> = Result<T, PyArrowError>; 47 | -------------------------------------------------------------------------------- /arro3-io/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "arro3-io" 3 | version = { workspace = true } 4 | authors = { workspace = true } 5 | edition = { workspace = true } 6 | description = "Rust-based readers and writers for Arrow in Python." 7 | readme = "README.md" 8 | repository = { workspace = true } 9 | homepage = { workspace = true } 10 | license = { workspace = true } 11 | keywords = { workspace = true } 12 | categories = { workspace = true } 13 | rust-version = { workspace = true } 14 | 15 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 16 | [lib] 17 | name = "_io" 18 | crate-type = ["cdylib"] 19 | 20 | [features] 21 | default = ["async"] 22 | # Include async code. This feature won't compile for pyodide.
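# A pyodide build would presumably drop this feature via `--no-default-features`,
# e.g. (untested sketch): maturin build -m arro3-io/Cargo.toml --target wasm32-unknown-emscripten --no-default-features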
23 | async = [ 24 | "dep:pyo3-object_store", 25 | "dep:pyo3-async-runtimes", 26 | "parquet/object_store", 27 | "dep:object_store", 28 | "dep:futures", 29 | ] 30 | 31 | [dependencies] 32 | arrow-array = { workspace = true } 33 | arrow-buffer = { workspace = true } 34 | arrow-csv = { workspace = true } 35 | arrow-ipc = { workspace = true } 36 | arrow-json = { workspace = true } 37 | arrow-schema = { workspace = true } 38 | bytes = { workspace = true } 39 | futures = { version = "0.3.30", optional = true } 40 | object_store = { workspace = true, optional = true } 41 | parquet = { workspace = true } 42 | pyo3 = { workspace = true } 43 | pyo3-arrow = { workspace = true } 44 | pyo3-async-runtimes = { workspace = true, features = [ 45 | "tokio-runtime", 46 | ], optional = true } 47 | pyo3-file = { workspace = true } 48 | pyo3-object_store = { workspace = true, optional = true } 49 | thiserror = { workspace = true } 50 | -------------------------------------------------------------------------------- /DEVELOP.md: -------------------------------------------------------------------------------- 1 | ## Docs 2 | 3 | ```bash 4 | rm -rf .venv 5 | uv sync 6 | # Note: need to install core first because others depend on core 7 | uv run maturin dev -m arro3-core/Cargo.toml 8 | uv run maturin dev -m arro3-compute/Cargo.toml 9 | uv run maturin dev -m arro3-io/Cargo.toml 10 | uv run mkdocs serve 11 | ``` 12 | 13 | ### Adding a new module 14 | 15 | - Add new module to GitHub Actions matrix in `wheels.yml` 16 | - Update `docs.yml` to include module 17 | 18 | ## Emscripten Python wheels 19 | 20 | Install Rust nightly and add the wasm toolchain 21 | 22 | ```bash 23 | rustup toolchain install nightly 24 | rustup target add --toolchain nightly wasm32-unknown-emscripten 25 | ``` 26 | 27 | Install maturin and pyodide-build (choose a specific version of pyodide-build if desired) 28 | 29 | ```bash 30 | pip install -U maturin 31 | pip install pyodide-build 32 | ``` 33 | 34 | Clone emsdk. I clone this into a specific path at `~/github/emscripten-core/emsdk` so that it can be shared across projects. 35 | 36 | ```bash 37 | mkdir -p ~/github/emscripten-core/ 38 | git clone https://github.com/emscripten-core/emsdk.git ~/github/emscripten-core/emsdk 39 | # Get the emscripten version pyodide targets (or set this manually) 40 | PYODIDE_EMSCRIPTEN_VERSION=$(pyodide config get emscripten_version) 41 | ~/github/emscripten-core/emsdk/emsdk install ${PYODIDE_EMSCRIPTEN_VERSION} 42 | ~/github/emscripten-core/emsdk/emsdk activate ${PYODIDE_EMSCRIPTEN_VERSION} 43 | source ~/github/emscripten-core/emsdk/emsdk_env.sh 44 | ``` 45 | 46 | Build `arro3-core`: 47 | 48 | ```bash 49 | RUSTUP_TOOLCHAIN=nightly \ 50 | maturin build \ 51 | --release \ 52 | -o dist \ 53 | -m arro3-core/Cargo.toml \ 54 | --target wasm32-unknown-emscripten \ 55 | -i python3.11 56 | ``` 57 | -------------------------------------------------------------------------------- /arro3-io/src/error.rs: -------------------------------------------------------------------------------- 1 | //! Contains the [`Arro3IoError`], the Error returned by most fallible functions in this crate. 2 | 3 | use pyo3::exceptions::{PyException, PyValueError}; 4 | use pyo3::prelude::*; 5 | use pyo3::DowncastError; 6 | use thiserror::Error; 7 | 8 | /// The Error variants returned by this crate.
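///
/// Every variant converts into a Python exception through `From<Arro3IoError> for
/// PyErr`, so fallible functions can use `?` at the Python boundary. A minimal
/// sketch with a hypothetical helper:
///
/// ```ignore
/// fn fallible_io() -> Arro3IoResult<()> {
///     // An ArrowError converts automatically via the `#[from]` attribute below.
///     Err(arrow_schema::ArrowError::ComputeError("example".to_string()).into())
/// }
/// ```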
9 | #[derive(Error, Debug)] 10 | #[non_exhaustive] 11 | pub enum Arro3IoError { 12 | /// A wrapped [arrow::error::ArrowError] 13 | #[error(transparent)] 14 | ArrowError(#[from] arrow_schema::ArrowError), 15 | 16 | /// A wrapped [object_store::Error] 17 | #[error(transparent)] 18 | ObjectStoreError(#[from] object_store::Error), 19 | 20 | /// A wrapped [parquet::errors::ParquetError] 21 | #[error(transparent)] 22 | ParquetError(#[from] parquet::errors::ParquetError), 23 | 24 | /// A wrapped [PyErr] 25 | #[error(transparent)] 26 | PyErr(#[from] PyErr), 27 | } 28 | 29 | impl From<Arro3IoError> for PyErr { 30 | fn from(error: Arro3IoError) -> Self { 31 | match error { 32 | Arro3IoError::PyErr(err) => err, 33 | Arro3IoError::ArrowError(err) => PyException::new_err(err.to_string()), 34 | Arro3IoError::ObjectStoreError(err) => PyException::new_err(err.to_string()), 35 | Arro3IoError::ParquetError(err) => PyException::new_err(err.to_string()), 36 | } 37 | } 38 | } 39 | 40 | impl<'a, 'py> From<DowncastError<'a, 'py>> for Arro3IoError { 41 | fn from(other: DowncastError<'a, 'py>) -> Self { 42 | Self::PyErr(PyValueError::new_err(format!( 43 | "Could not downcast: {other}" 44 | ))) 45 | } 46 | } 47 | 48 | /// A type wrapper around `Result<T, Arro3IoError>`. 49 | pub type Arro3IoResult<T> = Result<T, Arro3IoError>; 50 | -------------------------------------------------------------------------------- /tests/core/test_data_type.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from arro3.core import DataType, Field 3 | 4 | 5 | def test_value_type_fixed_size_list_type(): 6 | value_type = DataType.int8() 7 | list_dt = DataType.list(Field("inner", value_type), 2) 8 | assert list_dt.value_type == value_type 9 | 10 | 11 | def test_value_field_list_type(): 12 | value_type = DataType.int8() 13 | value_field = Field("inner", value_type, nullable=True) 14 | list_dt = DataType.list( 15 | value_field, 16 | 2, 17 | ) 18 | assert list_dt.value_field == value_field 19 | 20 | 21 | def test_fields_struct_type(): 22 | field_foo = Field("foo", DataType.int8(), nullable=True) 23 | field_bar = Field("bar", DataType.string(), nullable=False) 24 | struct_type = DataType.struct([field_foo, field_bar]) 25 | assert struct_type.fields == [field_foo, field_bar] 26 | 27 | 28 | @pytest.mark.xfail 29 | def test_list_data_type_construction_with_dt(): 30 | _ = DataType.list(DataType.int16()) 31 | 32 | 33 | def test_hashable(): 34 | # We should be able to use DataType as a key in a dict 35 | _dtype_map = { 36 | DataType.uint8(): DataType.int8(), 37 | DataType.uint16(): DataType.int16(), 38 | DataType.uint32(): DataType.int32(), 39 | DataType.uint64(): DataType.int64(), 40 | } 41 | 42 | 43 | class CustomException(Exception): 44 | pass 45 | 46 | 47 | class ArrowCSchemaFails: 48 | def __arrow_c_schema__(self): 49 | raise CustomException 50 | 51 | 52 | def test_schema_import_preserve_exception(): 53 | """https://github.com/kylebarron/arro3/issues/325""" 54 | 55 | c_stream_obj = ArrowCSchemaFails() 56 | with pytest.raises(CustomException): 57 | DataType.from_arrow(c_stream_obj) 58 | -------------------------------------------------------------------------------- /arro3-compute/python/arro3/compute/enums.py: -------------------------------------------------------------------------------- 1 | from enum import Enum, auto 2 | 3 | 4 | class StrEnum(str, Enum): 5 | def __new__(cls, value, *args, **kwargs): 6 | if not isinstance(value, (str, auto)): 7 | raise TypeError( 8 | f"Values of StrEnums must be strings: {value!r} is a {type(value)}" 9 | ) 10 | return
super().__new__(cls, value, *args, **kwargs) 11 | 12 | def __str__(self): 13 | return str(self.value) 14 | 15 | def _generate_next_value_(name, *_): 16 | return name.lower() 17 | 18 | 19 | class DatePart(StrEnum): 20 | """Valid parts to extract from date/time/timestamp arrays. 21 | 22 | See [`date_part`][arro3.compute.date_part]. 23 | """ 24 | 25 | Quarter = auto() 26 | """Quarter of the year, in range `1..=4`""" 27 | 28 | Year = auto() 29 | """Calendar year""" 30 | 31 | Month = auto() 32 | """Month in the year, in range `1..=12`""" 33 | 34 | Week = auto() 35 | """ISO week of the year, in range `1..=53`""" 36 | 37 | Day = auto() 38 | """Day of the month, in range `1..=31`""" 39 | 40 | DayOfWeekSunday0 = auto() 41 | """Day of the week, in range `0..=6`, where Sunday is `0`""" 42 | 43 | DayOfWeekMonday0 = auto() 44 | """Day of the week, in range `0..=6`, where Monday is `0`""" 45 | 46 | DayOfYear = auto() 47 | """Day of year, in range `1..=366`""" 48 | 49 | Hour = auto() 50 | """Hour of the day, in range `0..=23`""" 51 | 52 | Minute = auto() 53 | """Minute of the hour, in range `0..=59`""" 54 | 55 | Second = auto() 56 | """Second of the minute, in range `0..=59`""" 57 | 58 | Millisecond = auto() 59 | """Millisecond of the second""" 60 | 61 | Microsecond = auto() 62 | """Microsecond of the second""" 63 | 64 | Nanosecond = auto() 65 | """Nanosecond of the second""" 66 | -------------------------------------------------------------------------------- /arro3-core/src/accessors/struct_field.rs: -------------------------------------------------------------------------------- 1 | use arrow_array::cast::AsArray; 2 | use arrow_array::ArrayRef; 3 | use arrow_schema::{ArrowError, DataType, FieldRef}; 4 | use pyo3::prelude::*; 5 | use pyo3_arrow::error::PyArrowResult; 6 | use pyo3_arrow::PyArray; 7 | 8 | #[derive(FromPyObject)] 9 | pub(crate) enum StructIndex { 10 | Int(usize), 11 | ListInt(Vec<usize>), 12 | } 13 | 14 | impl StructIndex { 15 | fn into_list(self) -> Vec<usize> { 16 | match self { 17 | Self::Int(i) => vec![i], 18 | Self::ListInt(i) => i, 19 | } 20 | } 21 | } 22 | 23 | #[pyfunction] 24 | #[pyo3(signature=(values, /, indices))] 25 | pub(crate) fn struct_field( 26 | py: Python, 27 | values: PyArray, 28 | indices: StructIndex, 29 | ) -> PyArrowResult<PyObject> { 30 | let (orig_array, field) = values.into_inner(); 31 | let indices = indices.into_list(); 32 | 33 | let mut array_ref = &orig_array; 34 | let mut field_ref = &field; 35 | for i in indices { 36 | (array_ref, field_ref) = get_child(array_ref, i)?; 37 | } 38 | 39 | Ok(PyArray::new( 40 | array_ref.slice(orig_array.offset(), orig_array.len()), 41 | field_ref.clone(), 42 | ) 43 | .to_arro3(py)?
44 | .unbind()) 45 | } 46 | 47 | fn get_child(array: &ArrayRef, i: usize) -> Result<(&ArrayRef, &FieldRef), ArrowError> { 48 | match array.data_type() { 49 | DataType::Struct(fields) => { 50 | let arr = array.as_struct(); 51 | let inner_arr = arr.columns().get(i).ok_or(ArrowError::SchemaError( 52 | "Out of range for number of fields".into(), 53 | ))?; 54 | let inner_field = &fields[i]; 55 | Ok((inner_arr, inner_field)) 56 | } 57 | _ => Err(ArrowError::SchemaError( 58 | "DataType must be struct.".to_string(), 59 | )), 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /arro3-compute/python/arro3/compute/_temporal.pyi: -------------------------------------------------------------------------------- 1 | from typing import overload 2 | 3 | from arro3.core import Array, ArrayReader 4 | from arro3.core.types import ArrowArrayExportable, ArrowStreamExportable 5 | 6 | from .enums import DatePart 7 | from .types import DatePartT 8 | 9 | # # Examples 10 | 11 | # ``` 12 | # # use arrow_array::{Int32Array, TimestampMicrosecondArray}; 13 | # # use arrow_arith::temporal::{DatePart, date_part}; 14 | # let input: TimestampMicrosecondArray = 15 | #     vec![Some(1612025847000000), None, Some(1722015847000000)].into(); 16 | 17 | # let actual = date_part(&input, DatePart::Week).unwrap(); 18 | # let expected: Int32Array = vec![Some(4), None, Some(30)].into(); 19 | # assert_eq!(actual.as_ref(), &expected); 20 | # ``` 21 | 22 | @overload 23 | def date_part(input: ArrowArrayExportable, part: DatePart | DatePartT) -> Array: ... 24 | @overload 25 | def date_part( 26 | input: ArrowStreamExportable, part: DatePart | DatePartT 27 | ) -> ArrayReader: ... 28 | def date_part( 29 | input: ArrowArrayExportable | ArrowStreamExportable, part: DatePart | DatePartT 30 | ) -> Array | ArrayReader: 31 | """ 32 | Given an array, return a new array with the extracted [`DatePart`] as signed 32-bit 33 | integer values. 34 | 35 | Currently only supports temporal types: 36 | - Date32/Date64 37 | - Time32/Time64 38 | - Timestamp 39 | - Interval 40 | - Duration 41 | 42 | Returns an int32-typed array unless input was a dictionary type, in which case 43 | returns the dictionary but with this function applied onto its values. 44 | 45 | If the array passed in is not of the above listed types (or is a dictionary array where 46 | the values array isn't of the above listed types), then this function will return an 47 | error. 48 | 49 | Args: 50 | input: The temporal array to extract the date part from. 51 | part: The date part to extract. 52 | Returns: 53 | The extracted date part. 54 | """ 55 | -------------------------------------------------------------------------------- /arro3-io/python/arro3/io/_ipc.pyi: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import IO, Literal 3 | 4 | # Note: importing with 5 | # `from arro3.core import Array` 6 | # will cause Array to be included in the generated docs in this module. 7 | import arro3.core as core 8 | import arro3.core.types as types 9 | 10 | def read_ipc(file: IO[bytes] | Path | str) -> core.RecordBatchReader: 11 | """Read an Arrow IPC file into memory 12 | 13 | Args: 14 | file: The input Arrow IPC file path or buffer. 15 | 16 | Returns: 17 | An arrow RecordBatchReader. 18 | """ 19 | 20 | def read_ipc_stream(file: IO[bytes] | Path | str) -> core.RecordBatchReader: 21 | """Read an Arrow IPC stream into memory 22 | 23 | Args: 24 | file: The input Arrow IPC stream path or buffer. 25 | 26 | Returns: 27 | An arrow RecordBatchReader.
28 | """ 29 | 30 | def write_ipc( 31 | data: types.ArrowStreamExportable | types.ArrowArrayExportable, 32 | file: IO[bytes] | Path | str, 33 | *, 34 | compression: Literal["LZ4", "lz4", "ZSTD", "zstd"] | None = None, 35 | ) -> None: 36 | """Write Arrow data to an Arrow IPC file 37 | 38 | Args: 39 | data: the Arrow Table, RecordBatchReader, or RecordBatch to write. 40 | file: the output file or buffer to write to 41 | 42 | Other Args: 43 | compression: Compression to apply to file. 44 | """ 45 | 46 | def write_ipc_stream( 47 | data: types.ArrowStreamExportable | types.ArrowArrayExportable, 48 | file: IO[bytes] | Path | str, 49 | *, 50 | compression: Literal["LZ4", "lz4", "ZSTD", "zstd"] | None = None, 51 | ) -> None: 52 | """Write Arrow data to an Arrow IPC stream 53 | 54 | Args: 55 | data: the Arrow Table, RecordBatchReader, or RecordBatch to write. 56 | file: the output file or buffer to write to 57 | 58 | Other Args: 59 | compression: Compression to apply to file. 60 | """ 61 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/to_python/chunked.rs: -------------------------------------------------------------------------------- 1 | use arrow_array::ArrayRef; 2 | use arrow_schema::{ArrowError, FieldRef}; 3 | 4 | /// Trait for types that can read `ArrayRef`'s. 5 | /// 6 | /// To create from an iterator, see [ArrayIterator]. 7 | pub trait ArrayReader: Iterator> { 8 | /// Returns the field of this `ArrayReader`. 9 | /// 10 | /// Implementation of this trait should guarantee that all `ArrayRef`'s returned by this 11 | /// reader should have the same field as returned from this method. 12 | fn field(&self) -> FieldRef; 13 | } 14 | 15 | impl ArrayReader for Box { 16 | fn field(&self) -> FieldRef { 17 | self.as_ref().field() 18 | } 19 | } 20 | 21 | /// An iterator of [`ArrayRef`] with an attached [`FieldRef`] 22 | pub struct ArrayIterator 23 | where 24 | I: IntoIterator>, 25 | { 26 | inner: I::IntoIter, 27 | inner_field: FieldRef, 28 | } 29 | 30 | impl ArrayIterator 31 | where 32 | I: IntoIterator>, 33 | { 34 | /// Create a new [ArrayIterator]. 35 | /// 36 | /// If `iter` is an infallible iterator, use `.map(Ok)`. 
37 | pub fn new(iter: I, field: FieldRef) -> Self { 38 | Self { 39 | inner: iter.into_iter(), 40 | inner_field: field, 41 | } 42 | } 43 | } 44 | 45 | impl<I> Iterator for ArrayIterator<I> 46 | where 47 | I: IntoIterator<Item = Result<ArrayRef, ArrowError>>, 48 | { 49 | type Item = I::Item; 50 | 51 | fn next(&mut self) -> Option<Self::Item> { 52 | self.inner.next() 53 | } 54 | 55 | fn size_hint(&self) -> (usize, Option<usize>) { 56 | self.inner.size_hint() 57 | } 58 | } 59 | 60 | impl<I> ArrayReader for ArrayIterator<I> 61 | where 62 | I: IntoIterator<Item = Result<ArrayRef, ArrowError>>, 63 | { 64 | fn field(&self) -> FieldRef { 65 | self.inner_field.clone() 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /tests/io/test_ipc.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | from pathlib import Path 3 | 4 | import pyarrow as pa 5 | from arro3.io import read_ipc, read_ipc_stream, write_ipc, write_ipc_stream 6 | 7 | 8 | def test_ipc_round_trip_string(): 9 | table = pa.table({"a": [1, 2, 3, 4]}) 10 | write_ipc(table, "test.arrow") 11 | table_retour = pa.table(read_ipc("test.arrow")) 12 | assert table == table_retour 13 | 14 | write_ipc_stream(table, "test.arrows") 15 | table_retour = pa.table(read_ipc_stream("test.arrows")) 16 | assert table == table_retour 17 | 18 | 19 | def test_ipc_round_trip_path(): 20 | table = pa.table({"a": [1, 2, 3, 4]}) 21 | write_ipc(table, Path("test.arrow")) 22 | table_retour = pa.table(read_ipc(Path("test.arrow"))) 23 | assert table == table_retour 24 | 25 | write_ipc_stream(table, Path("test.arrows")) 26 | table_retour = pa.table(read_ipc_stream(Path("test.arrows"))) 27 | assert table == table_retour 28 | 29 | 30 | def test_ipc_round_trip_buffer(): 31 | table = pa.table({"a": [1, 2, 3, 4]}) 32 | bio = BytesIO() 33 | write_ipc(table, bio) 34 | table_retour = pa.table(read_ipc(bio)) 35 | assert table == table_retour 36 | 37 | bio = BytesIO() 38 | write_ipc_stream(table, bio) 39 | bio.seek(0) 40 | table_retour = pa.table(read_ipc_stream(bio)) 41 | assert table == table_retour 42 | 43 | 44 | def test_ipc_round_trip_compression(): 45 | table = pa.table({"a": [1, 2, 3, 4]}) 46 | write_ipc(table, "test.arrow", compression="lz4") 47 | table_retour = pa.table(read_ipc("test.arrow")) 48 | assert table == table_retour 49 | 50 | table = pa.table({"a": [1, 2, 3, 4]}) 51 | write_ipc(table, "test.arrow", compression="zstd") 52 | table_retour = pa.table(read_ipc("test.arrow")) 53 | assert table == table_retour 54 | 55 | table = pa.table({"a": [1, 2, 3, 4]}) 56 | write_ipc(table, "test.arrow", compression=None) 57 | table_retour = pa.table(read_ipc("test.arrow")) 58 | assert table == table_retour 59 | -------------------------------------------------------------------------------- /tests/compute/test_aggregate.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timezone 2 | 3 | import arro3.compute as ac 4 | import pyarrow as pa 5 | from arro3.core import Array, ChunkedArray, DataType 6 | 7 | 8 | def test_min(): 9 | arr1 = Array([1, 2, 3], DataType.int16()) 10 | assert ac.min(arr1).as_py() == 1 11 | 12 | arr2 = Array([3, 2, 0], DataType.int16()) 13 | assert ac.min(arr2).as_py() == 0 14 | 15 | ca = ChunkedArray([arr1, arr2]) 16 | assert ac.min(ca).as_py() == 0 17 | 18 | arr = Array(["c", "a", "b"], DataType.string()) 19 | assert ac.min(arr).as_py() == "a" 20 | 21 | 22 | def test_max(): 23 | arr1 = Array([1, 2, 3], DataType.int16()) 24 | assert ac.max(arr1).as_py() == 3 25 | 26 | arr2 = Array([4, 2,
0], DataType.int16()) 27 | assert ac.max(arr2).as_py() == 4 28 | 29 | ca = ChunkedArray([arr1, arr2]) 30 | assert ac.max(ca).as_py() == 4 31 | 32 | arr = Array(["c", "a", "b"], DataType.string()) 33 | assert ac.max(arr).as_py() == "c" 34 | 35 | 36 | def test_sum(): 37 | arr1 = Array([1, 2, 3], DataType.int16()) 38 | assert ac.sum(arr1).as_py() == 6 39 | 40 | arr2 = Array([4, 2, 0], DataType.int16()) 41 | assert ac.sum(arr2).as_py() == 6 42 | 43 | ca = ChunkedArray([arr1, arr2]) 44 | assert ac.sum(ca).as_py() == 12 45 | 46 | 47 | def test_min_max_datetime(): 48 | dt1 = datetime.now() 49 | dt2 = datetime.now() 50 | dt3 = datetime.now() 51 | 52 | pa_arr = pa.array([dt1, dt2, dt3], type=pa.timestamp("ns", None)) 53 | arro3_arr = Array(pa_arr) 54 | assert ac.min(arro3_arr).as_py() == dt1 55 | assert ac.max(arro3_arr).as_py() == dt3 56 | 57 | 58 | def test_min_max_datetime_with_timezone(): 59 | dt1 = datetime.now(timezone.utc) 60 | dt2 = datetime.now(timezone.utc) 61 | dt3 = datetime.now(timezone.utc) 62 | arr = pa.array([dt1, dt2, dt3]) 63 | assert arr.type.tz == "UTC" 64 | 65 | assert ac.min(arr).as_py() == dt1 66 | assert ac.min(arr).type.tz == "UTC" 67 | assert ac.max(arr).as_py() == dt3 68 | assert ac.max(arr).type.tz == "UTC" 69 | -------------------------------------------------------------------------------- /arro3-compute/src/cast.rs: -------------------------------------------------------------------------------- 1 | use pyo3::exceptions::PyTypeError; 2 | use pyo3::prelude::*; 3 | use pyo3_arrow::error::PyArrowResult; 4 | use pyo3_arrow::ffi::ArrayIterator; 5 | use pyo3_arrow::input::AnyArray; 6 | use pyo3_arrow::{PyArray, PyArrayReader, PyDataType, PyField}; 7 | 8 | /// Cast `input` to the provided data type and return a new Arrow object with type `to_type`, if 9 | /// possible. 10 | /// 11 | /// Args: 12 | ///     input: an Arrow Array, RecordBatch, ChunkedArray, Table, ArrayReader, or RecordBatchReader 13 | ///     to_type: an Arrow DataType, Field, or Schema describing the output type of the cast. 14 | #[pyfunction] 15 | pub fn cast(py: Python, input: AnyArray, to_type: PyField) -> PyArrowResult<PyObject> { 16 | match input { 17 | AnyArray::Array(arr) => { 18 | let new_field = to_type.into_inner(); 19 | let out = arrow_cast::cast(arr.as_ref(), new_field.data_type())?; 20 | Ok(PyArray::new(out, new_field).to_arro3(py)?.unbind()) 21 | } 22 | AnyArray::Stream(stream) => { 23 | let reader = stream.into_reader()?; 24 | let field = reader.field(); 25 | let from_type = field.data_type(); 26 | 27 | let new_field = to_type.into_inner(); 28 | let to_type = new_field.data_type().clone(); 29 | if !arrow_cast::can_cast_types(from_type, &to_type) { 30 | return Err(PyTypeError::new_err(format!( 31 | "Unable to cast from type {from_type} to {to_type}" 32 | )) 33 | .into()); 34 | } 35 | 36 | let iter = reader 37 | .into_iter() 38 | .map(move |array| arrow_cast::cast(&array?, &to_type)); 39 | Ok( 40 | PyArrayReader::new(Box::new(ArrayIterator::new(iter, new_field))) 41 | .to_arro3(py)?
42 | .unbind(), 43 | ) 44 | } 45 | } 46 | } 47 | 48 | #[pyfunction] 49 | pub fn can_cast_types(from_type: PyDataType, to_type: PyDataType) -> bool { 50 | arrow_cast::can_cast_types(from_type.as_ref(), to_type.as_ref()) 51 | } 52 | -------------------------------------------------------------------------------- /.github/workflows/test-python.yml: -------------------------------------------------------------------------------- 1 | name: Python 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | 9 | concurrency: 10 | group: ${{ github.workflow }}-${{ github.ref }} 11 | cancel-in-progress: true 12 | 13 | jobs: 14 | pre-commit: 15 | name: Run pre-commit on Python code 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | 20 | - uses: actions/setup-python@v5 21 | with: 22 | python-version: "3.11" 23 | 24 | - name: Cache pre-commit virtualenvs 25 | uses: actions/cache@v4 26 | with: 27 | path: ~/.cache/pre-commit 28 | key: pre-commit-3|${{ hashFiles('.pre-commit-config.yaml') }} 29 | 30 | - name: run pre-commit 31 | run: | 32 | python -m pip install pre-commit 33 | pre-commit run --all-files 34 | 35 | test-python: 36 | name: Build and test Python 37 | runs-on: ubuntu-latest 38 | strategy: 39 | fail-fast: true 40 | matrix: 41 | python-version: ["3.9", "3.12"] 42 | steps: 43 | - uses: actions/checkout@v4 44 | 45 | - name: Install Rust 46 | uses: dtolnay/rust-toolchain@stable 47 | 48 | - uses: Swatinem/rust-cache@v2 49 | 50 | - name: Set up Python 51 | id: setup-python 52 | uses: actions/setup-python@v5 53 | with: 54 | python-version: ${{ matrix.python-version }} 55 | 56 | - name: Install a specific version of uv 57 | uses: astral-sh/setup-uv@v3 58 | with: 59 | enable-cache: true 60 | version: "0.4.x" 61 | 62 | - name: Build rust submodules 63 | run: | 64 | # Note: core module must be first, because it's depended on by others 65 | uv run maturin dev -m arro3-core/Cargo.toml 66 | uv run maturin dev -m arro3-compute/Cargo.toml 67 | uv run maturin dev -m arro3-io/Cargo.toml 68 | 69 | - name: Run python tests 70 | run: | 71 | uv run pytest tests 72 | 73 | # Ensure docs build without warnings 74 | - name: Check docs 75 | run: uv run mkdocs build --strict 76 | -------------------------------------------------------------------------------- /arro3-io/src/lib.rs: -------------------------------------------------------------------------------- 1 | use pyo3::exceptions::PyRuntimeWarning; 2 | use pyo3::intern; 3 | use pyo3::prelude::*; 4 | use pyo3::types::PyTuple; 5 | 6 | mod csv; 7 | mod error; 8 | mod ipc; 9 | mod json; 10 | mod parquet; 11 | mod utils; 12 | 13 | const VERSION: &str = env!("CARGO_PKG_VERSION"); 14 | 15 | #[pyfunction] 16 | fn ___version() -> &'static str { 17 | VERSION 18 | } 19 | 20 | /// Raise RuntimeWarning for debug builds 21 | #[pyfunction] 22 | fn check_debug_build(py: Python) -> PyResult<()> { 23 | #[cfg(debug_assertions)] 24 | { 25 | let warnings_mod = py.import(intern!(py, "warnings"))?; 26 | let warning = PyRuntimeWarning::new_err( 27 | "arro3.io has not been compiled in release mode. 
Performance will be degraded.", 28 | ); 29 | let args = PyTuple::new(py, vec![warning])?; 30 | warnings_mod.call_method1(intern!(py, "warn"), args)?; 31 | } 32 | 33 | Ok(()) 34 | } 35 | 36 | #[pymodule] 37 | fn _io(py: Python, m: &Bound<PyModule>) -> PyResult<()> { 38 | check_debug_build(py)?; 39 | 40 | m.add_wrapped(wrap_pyfunction!(___version))?; 41 | 42 | pyo3_object_store::register_store_module(py, m, "arro3.io", "store")?; 43 | pyo3_object_store::register_exceptions_module(py, m, "arro3.io", "exceptions")?; 44 | 45 | m.add_wrapped(wrap_pyfunction!(csv::infer_csv_schema))?; 46 | m.add_wrapped(wrap_pyfunction!(csv::read_csv))?; 47 | m.add_wrapped(wrap_pyfunction!(csv::write_csv))?; 48 | 49 | m.add_wrapped(wrap_pyfunction!(json::infer_json_schema))?; 50 | m.add_wrapped(wrap_pyfunction!(json::read_json))?; 51 | m.add_wrapped(wrap_pyfunction!(json::write_json))?; 52 | m.add_wrapped(wrap_pyfunction!(json::write_ndjson))?; 53 | 54 | m.add_wrapped(wrap_pyfunction!(ipc::read_ipc))?; 55 | m.add_wrapped(wrap_pyfunction!(ipc::read_ipc_stream))?; 56 | m.add_wrapped(wrap_pyfunction!(ipc::write_ipc))?; 57 | m.add_wrapped(wrap_pyfunction!(ipc::write_ipc_stream))?; 58 | 59 | m.add_wrapped(wrap_pyfunction!(parquet::read_parquet))?; 60 | m.add_wrapped(wrap_pyfunction!(parquet::read_parquet_async))?; 61 | m.add_wrapped(wrap_pyfunction!(parquet::write_parquet))?; 62 | 63 | Ok(()) 64 | } 65 | -------------------------------------------------------------------------------- /arro3-compute/src/boolean.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use arrow_array::ArrayRef; 4 | use arrow_schema::{DataType, Field}; 5 | use pyo3::prelude::*; 6 | use pyo3_arrow::error::PyArrowResult; 7 | use pyo3_arrow::ffi::ArrayIterator; 8 | use pyo3_arrow::input::AnyArray; 9 | use pyo3_arrow::{PyArray, PyArrayReader}; 10 | 11 | #[pyfunction] 12 | pub fn is_null(py: Python, input: AnyArray) -> PyArrowResult<PyObject> { 13 | match input { 14 | AnyArray::Array(input) => { 15 | let out = arrow_arith::boolean::is_null(input.as_ref())?; 16 | Ok(PyArray::from_array_ref(Arc::new(out)) 17 | .to_arro3(py)? 18 | .unbind()) 19 | } 20 | AnyArray::Stream(input) => { 21 | let input = input.into_reader()?; 22 | let out_field = Field::new("", DataType::Boolean, true); 23 | 24 | let iter = input.into_iter().map(move |input| { 25 | let out = arrow_arith::boolean::is_null(&input?)?; 26 | Ok(Arc::new(out) as ArrayRef) 27 | }); 28 | Ok( 29 | PyArrayReader::new(Box::new(ArrayIterator::new(iter, out_field.into()))) 30 | .to_arro3(py)? 31 | .unbind(), 32 | ) 33 | } 34 | } 35 | } 36 | 37 | #[pyfunction] 38 | pub fn is_not_null(py: Python, input: AnyArray) -> PyArrowResult<PyObject> { 39 | match input { 40 | AnyArray::Array(input) => { 41 | let out = arrow_arith::boolean::is_not_null(input.as_ref())?; 42 | Ok(PyArray::from_array_ref(Arc::new(out)) 43 | .to_arro3(py)? 44 | .unbind()) 45 | } 46 | AnyArray::Stream(input) => { 47 | let input = input.into_reader()?; 48 | let out_field = Field::new("", DataType::Boolean, true); 49 | 50 | let iter = input.into_iter().map(move |input| { 51 | let out = arrow_arith::boolean::is_not_null(&input?)?; 52 | Ok(Arc::new(out) as ArrayRef) 53 | }); 54 | Ok( 55 | PyArrayReader::new(Box::new(ArrayIterator::new(iter, out_field.into()))) 56 | .to_arro3(py)?
57 | .unbind(), 58 | ) 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /arro3-compute/src/filter.rs: -------------------------------------------------------------------------------- 1 | use arrow_array::cast::AsArray; 2 | use arrow_schema::{ArrowError, DataType}; 3 | use pyo3::exceptions::PyValueError; 4 | use pyo3::prelude::*; 5 | use pyo3_arrow::error::PyArrowResult; 6 | use pyo3_arrow::ffi::ArrayIterator; 7 | use pyo3_arrow::input::AnyArray; 8 | use pyo3_arrow::{PyArray, PyArrayReader}; 9 | 10 | #[pyfunction] 11 | pub fn filter(py: Python, values: AnyArray, predicate: AnyArray) -> PyArrowResult<PyObject> { 12 | match (values, predicate) { 13 | (AnyArray::Array(values), AnyArray::Array(predicate)) => { 14 | let (values, values_field) = values.into_inner(); 15 | let predicate = predicate 16 | .as_ref() 17 | .as_boolean_opt() 18 | .ok_or(ArrowError::ComputeError( 19 | "Expected boolean array for predicate".to_string(), 20 | ))?; 21 | 22 | let filtered = arrow_select::filter::filter(values.as_ref(), predicate)?; 23 | Ok(PyArray::new(filtered, values_field).to_arro3(py)?.unbind()) 24 | } 25 | (AnyArray::Stream(values), AnyArray::Stream(predicate)) => { 26 | let values = values.into_reader()?; 27 | let predicate = predicate.into_reader()?; 28 | 29 | if !predicate 30 | .field() 31 | .data_type() 32 | .equals_datatype(&DataType::Boolean) 33 | { 34 | return Err(PyValueError::new_err("Expected boolean array for predicate").into()); 35 | } 36 | 37 | let values_field = values.field(); 38 | 39 | let iter = values 40 | .into_iter() 41 | .zip(predicate) 42 | .map(move |(values, predicate)| { 43 | let predicate_arr = predicate?; 44 | let filtered = 45 | arrow_select::filter::filter(values?.as_ref(), predicate_arr.as_boolean())?; 46 | Ok(filtered) 47 | }); 48 | Ok( 49 | PyArrayReader::new(Box::new(ArrayIterator::new(iter, values_field))) 50 | .to_arro3(py)? 51 | .unbind(), 52 | ) 53 | } 54 | _ => Err(PyValueError::new_err("Unsupported combination of array and stream").into()), 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /arro3-core/src/lib.rs: -------------------------------------------------------------------------------- 1 | use pyo3::exceptions::PyRuntimeWarning; 2 | use pyo3::intern; 3 | use pyo3::prelude::*; 4 | use pyo3::types::PyTuple; 5 | 6 | mod accessors; 7 | mod constructors; 8 | 9 | const VERSION: &str = env!("CARGO_PKG_VERSION"); 10 | 11 | #[pyfunction] 12 | fn ___version() -> &'static str { 13 | VERSION 14 | } 15 | 16 | /// Raise RuntimeWarning for debug builds 17 | #[pyfunction] 18 | fn check_debug_build(py: Python) -> PyResult<()> { 19 | #[cfg(debug_assertions)] 20 | { 21 | let warnings_mod = py.import(intern!(py, "warnings"))?; 22 | let warning = PyRuntimeWarning::new_err( 23 | "arro3.core has not been compiled in release mode. Performance will be degraded.", 24 | ); 25 | let args = PyTuple::new(py, vec![warning])?; 26 | warnings_mod.call_method1(intern!(py, "warn"), args)?; 27 | } 28 | 29 | Ok(()) 30 | } 31 | 32 | /// A Python module implemented in Rust.
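///
/// From Python, the classes registered here are exposed as `arro3.core`; for
/// example (mirroring the constructors used throughout the test suite):
///
/// ```python
/// from arro3.core import Array, DataType
/// arr = Array([1, 2, 3], DataType.int16())
/// ```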
33 | #[pymodule] 34 | fn _core(py: Python, m: &Bound<PyModule>) -> PyResult<()> { 35 | check_debug_build(py)?; 36 | 37 | m.add_wrapped(wrap_pyfunction!(___version))?; 38 | 39 | m.add_class::<pyo3_arrow::PyArray>()?; 40 | m.add_class::<pyo3_arrow::PyArrayReader>()?; 41 | m.add_class::<pyo3_arrow::buffer::PyArrowBuffer>()?; 42 | m.add_class::<pyo3_arrow::PyChunkedArray>()?; 43 | m.add_class::<pyo3_arrow::PyDataType>()?; 44 | m.add_class::<pyo3_arrow::PyField>()?; 45 | m.add_class::<pyo3_arrow::PyRecordBatch>()?; 46 | m.add_class::<pyo3_arrow::PyRecordBatchReader>()?; 47 | m.add_class::<pyo3_arrow::PyScalar>()?; 48 | m.add_class::<pyo3_arrow::PySchema>()?; 49 | m.add_class::<pyo3_arrow::PyTable>()?; 50 | 51 | m.add_wrapped(wrap_pyfunction!( 52 | accessors::dictionary::dictionary_dictionary 53 | ))?; 54 | m.add_wrapped(wrap_pyfunction!(accessors::dictionary::dictionary_indices))?; 55 | m.add_wrapped(wrap_pyfunction!(accessors::list_flatten::list_flatten))?; 56 | m.add_wrapped(wrap_pyfunction!(accessors::list_offsets::list_offsets))?; 57 | m.add_wrapped(wrap_pyfunction!(accessors::struct_field::struct_field))?; 58 | 59 | m.add_wrapped(wrap_pyfunction!(constructors::fixed_size_list_array))?; 60 | m.add_wrapped(wrap_pyfunction!(constructors::list_array))?; 61 | m.add_wrapped(wrap_pyfunction!(constructors::struct_array))?; 62 | 63 | Ok(()) 64 | } 65 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/from_python/input.rs: -------------------------------------------------------------------------------- 1 | use crate::array_reader::PyArrayReader; 2 | use crate::input::{AnyArray, AnyDatum, AnyRecordBatch}; 3 | use crate::{PyArray, PyScalar}; 4 | use pyo3::exceptions::PyValueError; 5 | use pyo3::prelude::*; 6 | use pyo3::{intern, PyAny, PyResult}; 7 | 8 | impl<'a> FromPyObject<'a> for AnyRecordBatch { 9 | fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult<Self> { 10 | if ob.hasattr(intern!(ob.py(), "__arrow_c_array__"))? { 11 | Ok(Self::RecordBatch(ob.extract()?)) 12 | } else if ob.hasattr(intern!(ob.py(), "__arrow_c_stream__"))? { 13 | Ok(Self::Stream(ob.extract()?)) 14 | } else { 15 | Err(PyValueError::new_err( 16 | "Expected object with __arrow_c_array__ or __arrow_c_stream__ method", 17 | )) 18 | } 19 | } 20 | } 21 | 22 | impl<'a> FromPyObject<'a> for AnyArray { 23 | fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult<Self> { 24 | // First extract directly if the __arrow_c_array__ method is present, so that any exception 25 | // raised in it gets propagated. Also check if PyArray extract works so that Buffer Protocol 26 | // conversion still works. 27 | // Do the same for __arrow_c_stream__ and PyArrayReader below. 28 | if ob.hasattr(intern!(ob.py(), "__arrow_c_array__"))? { 29 | Ok(Self::Array(ob.extract()?)) 30 | } else if let Ok(arr) = ob.extract::<PyArray>() { 31 | Ok(Self::Array(arr)) 32 | } else if ob.hasattr(intern!(ob.py(), "__arrow_c_stream__"))?
{ 33 | Ok(Self::Stream(ob.extract()?)) 34 | } else if let Ok(stream) = ob.extract::<PyArrayReader>() { 35 | Ok(Self::Stream(stream)) 36 | } else { 37 | Err(PyValueError::new_err( 38 | "Expected object with __arrow_c_array__ or __arrow_c_stream__ method or implementing buffer protocol.", 39 | )) 40 | } 41 | } 42 | } 43 | 44 | impl<'a> FromPyObject<'a> for AnyDatum { 45 | fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult<Self> { 46 | let array = ob.extract::<PyArray>()?; 47 | if array.as_ref().len() == 1 { 48 | let (array, field) = array.into_inner(); 49 | Ok(Self::Scalar(PyScalar::try_new(array, field)?)) 50 | } else { 51 | Ok(Self::Array(array)) 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /arro3-compute/src/arith.rs: -------------------------------------------------------------------------------- 1 | use arrow_arith::numeric; 2 | use pyo3::prelude::*; 3 | use pyo3_arrow::error::PyArrowResult; 4 | use pyo3_arrow::input::AnyDatum; 5 | use pyo3_arrow::PyArray; 6 | 7 | #[pyfunction] 8 | pub fn add(py: Python, lhs: AnyDatum, rhs: AnyDatum) -> PyArrowResult<PyObject> { 9 | Ok(PyArray::from_array_ref(numeric::add(&lhs, &rhs)?) 10 | .to_arro3(py)? 11 | .unbind()) 12 | } 13 | 14 | #[pyfunction] 15 | pub fn add_wrapping(py: Python, lhs: AnyDatum, rhs: AnyDatum) -> PyArrowResult<PyObject> { 16 | Ok(PyArray::from_array_ref(numeric::add_wrapping(&lhs, &rhs)?) 17 | .to_arro3(py)? 18 | .unbind()) 19 | } 20 | 21 | #[pyfunction] 22 | pub fn div(py: Python, lhs: AnyDatum, rhs: AnyDatum) -> PyArrowResult<PyObject> { 23 | Ok(PyArray::from_array_ref(numeric::div(&lhs, &rhs)?) 24 | .to_arro3(py)? 25 | .unbind()) 26 | } 27 | 28 | #[pyfunction] 29 | pub fn mul(py: Python, lhs: AnyDatum, rhs: AnyDatum) -> PyArrowResult<PyObject> { 30 | Ok(PyArray::from_array_ref(numeric::mul(&lhs, &rhs)?) 31 | .to_arro3(py)? 32 | .unbind()) 33 | } 34 | 35 | #[pyfunction] 36 | pub fn mul_wrapping(py: Python, lhs: AnyDatum, rhs: AnyDatum) -> PyArrowResult<PyObject> { 37 | Ok(PyArray::from_array_ref(numeric::mul_wrapping(&lhs, &rhs)?) 38 | .to_arro3(py)? 39 | .unbind()) 40 | } 41 | 42 | #[pyfunction] 43 | pub fn neg(py: Python, array: PyArray) -> PyArrowResult<PyObject> { 44 | Ok(PyArray::from_array_ref(numeric::neg(array.as_ref())?) 45 | .to_arro3(py)? 46 | .unbind()) 47 | } 48 | 49 | #[pyfunction] 50 | pub fn neg_wrapping(py: Python, array: PyArray) -> PyArrowResult<PyObject> { 51 | Ok( 52 | PyArray::from_array_ref(numeric::neg_wrapping(array.as_ref())?) 53 | .to_arro3(py)? 54 | .unbind(), 55 | ) 56 | } 57 | 58 | #[pyfunction] 59 | pub fn rem(py: Python, lhs: AnyDatum, rhs: AnyDatum) -> PyArrowResult<PyObject> { 60 | Ok(PyArray::from_array_ref(numeric::rem(&lhs, &rhs)?) 61 | .to_arro3(py)? 62 | .unbind()) 63 | } 64 | 65 | #[pyfunction] 66 | pub fn sub(py: Python, lhs: AnyDatum, rhs: AnyDatum) -> PyArrowResult<PyObject> { 67 | Ok(PyArray::from_array_ref(numeric::sub(&lhs, &rhs)?) 68 | .to_arro3(py)? 69 | .unbind()) 70 | } 71 | 72 | #[pyfunction] 73 | pub fn sub_wrapping(py: Python, lhs: AnyDatum, rhs: AnyDatum) -> PyArrowResult<PyObject> { 74 | Ok(PyArray::from_array_ref(numeric::sub_wrapping(&lhs, &rhs)?) 75 | .to_arro3(py)?
76 | .unbind()) 77 | } 78 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Publish arro3 docs 2 | 3 | # Only run on new tags starting with `py-v` 4 | on: 5 | push: 6 | tags: 7 | - "py-v*" 8 | workflow_dispatch: 9 | 10 | # https://stackoverflow.com/a/77412363 11 | permissions: 12 | contents: write 13 | pages: write 14 | 15 | jobs: 16 | build: 17 | name: Deploy Python docs 18 | runs-on: ubuntu-latest 19 | steps: 20 | - uses: actions/checkout@v4 21 | # We need to additionally fetch the gh-pages branch for mike deploy 22 | with: 23 | fetch-depth: 0 24 | 25 | - name: Install Rust 26 | uses: dtolnay/rust-toolchain@stable 27 | 28 | - uses: Swatinem/rust-cache@v2 29 | 30 | - name: Set up Python 3.11 31 | id: setup-python 32 | uses: actions/setup-python@v4 33 | with: 34 | python-version: "3.11" 35 | 36 | - name: Install a specific version of uv 37 | uses: astral-sh/setup-uv@v3 38 | with: 39 | enable-cache: true 40 | version: "0.4.x" 41 | 42 | - name: Install dependencies 43 | run: uv sync 44 | 45 | - name: Build python packages 46 | run: | 47 | # arro3-core needs to be first 48 | uv run maturin dev -m arro3-core/Cargo.toml 49 | uv run maturin dev -m arro3-compute/Cargo.toml 50 | uv run maturin dev -m arro3-io/Cargo.toml 51 | 52 | - name: Deploy docs 53 | env: 54 | GIT_COMMITTER_NAME: CI 55 | GIT_COMMITTER_EMAIL: ci-bot@example.com 56 | run: | 57 | # Get most recent git tag 58 | # https://stackoverflow.com/a/7261049 59 | # https://stackoverflow.com/a/3867811 60 | # We don't use {{github.ref_name}} because if triggered manually, it 61 | # will be a branch name instead of a tag version. 62 | # Then remove `py-` from the tag 63 | VERSION=$(git describe --tags --match="py-*" --abbrev=0 | cut -c 4-) 64 | 65 | # Only push docs if no letters in git tag after the first character 66 | # (usually the git tag will have v as the first character) 67 | # Note the `cut` index is 1-ordered 68 | if echo $VERSION | cut -c 2- | grep -q "[A-Za-z]"; then 69 | echo "Is beta version" 70 | # For beta versions publish but don't set as latest 71 | uv run mike deploy $VERSION --update-aliases --push 72 | else 73 | echo "Is NOT beta version" 74 | uv run mike deploy $VERSION latest --update-aliases --push 75 | fi 76 | -------------------------------------------------------------------------------- /arro3-compute/src/lib.rs: -------------------------------------------------------------------------------- 1 | use pyo3::exceptions::PyRuntimeWarning; 2 | use pyo3::intern; 3 | use pyo3::prelude::*; 4 | use pyo3::types::PyTuple; 5 | 6 | mod aggregate; 7 | mod arith; 8 | mod boolean; 9 | mod cast; 10 | mod concat; 11 | mod dictionary; 12 | mod filter; 13 | mod take; 14 | mod temporal; 15 | 16 | const VERSION: &str = env!("CARGO_PKG_VERSION"); 17 | 18 | #[pyfunction] 19 | fn ___version() -> &'static str { 20 | VERSION 21 | } 22 | 23 | /// Raise RuntimeWarning for debug builds 24 | #[pyfunction] 25 | fn check_debug_build(py: Python) -> PyResult<()> { 26 | #[cfg(debug_assertions)] 27 | { 28 | let warnings_mod = py.import(intern!(py, "warnings"))?; 29 | let warning = PyRuntimeWarning::new_err( 30 | "arro3.compute has not been compiled in release mode. 
Performance will be degraded.", 31 | ); 32 | let args = PyTuple::new(py, vec![warning])?; 33 | warnings_mod.call_method1(intern!(py, "warn"), args)?; 34 | } 35 | 36 | Ok(()) 37 | } 38 | 39 | #[pymodule] 40 | fn _compute(py: Python, m: &Bound<PyModule>) -> PyResult<()> { 41 | check_debug_build(py)?; 42 | 43 | m.add_wrapped(wrap_pyfunction!(___version))?; 44 | 45 | m.add_wrapped(wrap_pyfunction!(aggregate::max))?; 46 | m.add_wrapped(wrap_pyfunction!(aggregate::min))?; 47 | m.add_wrapped(wrap_pyfunction!(aggregate::sum))?; 48 | m.add_wrapped(wrap_pyfunction!(arith::add_wrapping))?; 49 | m.add_wrapped(wrap_pyfunction!(arith::add))?; 50 | m.add_wrapped(wrap_pyfunction!(arith::div))?; 51 | m.add_wrapped(wrap_pyfunction!(arith::mul_wrapping))?; 52 | m.add_wrapped(wrap_pyfunction!(arith::mul))?; 53 | m.add_wrapped(wrap_pyfunction!(arith::neg_wrapping))?; 54 | m.add_wrapped(wrap_pyfunction!(arith::neg))?; 55 | m.add_wrapped(wrap_pyfunction!(arith::rem))?; 56 | m.add_wrapped(wrap_pyfunction!(arith::sub_wrapping))?; 57 | m.add_wrapped(wrap_pyfunction!(arith::sub))?; 58 | m.add_wrapped(wrap_pyfunction!(boolean::is_not_null))?; 59 | m.add_wrapped(wrap_pyfunction!(boolean::is_null))?; 60 | m.add_wrapped(wrap_pyfunction!(cast::can_cast_types))?; 61 | m.add_wrapped(wrap_pyfunction!(cast::cast))?; 62 | m.add_wrapped(wrap_pyfunction!(concat::concat))?; 63 | m.add_wrapped(wrap_pyfunction!(dictionary::dictionary_encode))?; 64 | m.add_wrapped(wrap_pyfunction!(filter::filter))?; 65 | m.add_wrapped(wrap_pyfunction!(take::take))?; 66 | m.add_wrapped(wrap_pyfunction!(temporal::date_part))?; 67 | 68 | Ok(()) 69 | } 70 | -------------------------------------------------------------------------------- /arro3-core/python/arro3/core/_field.pyi: -------------------------------------------------------------------------------- 1 | from ._data_type import DataType 2 | from .types import ArrowSchemaExportable 3 | 4 | class Field: 5 | """An Arrow Field.""" 6 | def __init__( 7 | self, 8 | name: str, 9 | type: ArrowSchemaExportable, 10 | nullable: bool = True, 11 | *, 12 | metadata: dict[str, str] | dict[bytes, bytes] | None = None, 13 | ) -> None: ... 14 | def __arrow_c_schema__(self) -> object: 15 | """ 16 | An implementation of the [Arrow PyCapsule 17 | Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). 18 | This dunder method should not be called directly, but enables zero-copy data 19 | transfer to other Python libraries that understand Arrow memory. 20 | 21 | For example, you can call [`pyarrow.field()`][pyarrow.field] to convert this 22 | field into a pyarrow field, without copying memory. 23 | """ 24 | def __eq__(self, other) -> bool: ... 25 | def __repr__(self) -> str: ... 26 | @classmethod 27 | def from_arrow(cls, input: ArrowSchemaExportable) -> Field: 28 | """Construct this from an existing Arrow object. 29 | 30 | It can be called on anything that exports the Arrow schema interface 31 | (has an `__arrow_c_schema__` method).
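        For example (a sketch, assuming pyarrow is installed):

            import pyarrow as pa

            field = Field.from_arrow(pa.field("a", pa.int64()))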
32 | """ 33 | @classmethod 34 | def from_arrow_pycapsule(cls, capsule) -> Field: 35 | """Construct this object from a bare Arrow PyCapsule""" 36 | 37 | def equals(self, other: ArrowSchemaExportable) -> bool: 38 | """Test if this field is equal to the other.""" 39 | @property 40 | def metadata(self) -> dict[bytes, bytes]: 41 | """The schema's metadata.""" 42 | @property 43 | def metadata_str(self) -> dict[str, str]: 44 | """The schema's metadata where keys and values are `str`, not `bytes`.""" 45 | @property 46 | def name(self) -> str: 47 | """The field name.""" 48 | @property 49 | def nullable(self) -> bool: 50 | """The field nullability.""" 51 | def remove_metadata(self) -> Field: 52 | """Create new field without metadata, if any.""" 53 | @property 54 | def type(self) -> DataType: 55 | """Access the data type of this field.""" 56 | def with_metadata(self, metadata: dict[str, str] | dict[bytes, bytes]) -> Field: 57 | """Add metadata as dict of string keys and values to Field.""" 58 | def with_name(self, name: str) -> Field: 59 | """A copy of this field with the replaced name.""" 60 | def with_nullable(self, nullable: bool) -> Field: 61 | """A copy of this field with the replaced nullability.""" 62 | def with_type(self, new_type: ArrowSchemaExportable) -> Field: 63 | """A copy of this field with the replaced type""" 64 | -------------------------------------------------------------------------------- /arro3-io/python/arro3/io/_json.pyi: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import IO 3 | 4 | # Note: importing with 5 | # `from arro3.core import Array` 6 | # will cause Array to be included in the generated docs in this module. 7 | import arro3.core as core 8 | import arro3.core.types as types 9 | 10 | def infer_json_schema( 11 | file: IO[bytes] | Path | str, 12 | *, 13 | max_records: int | None = None, 14 | ) -> core.Schema: 15 | """ 16 | Infer the schema of a JSON file by reading the first n records of the buffer, with 17 | `max_records` controlling the maximum number of records to read. 18 | 19 | Args: 20 | file: The input JSON path or buffer. 21 | max_records: The maximum number of records to read to infer schema. If not 22 | provided, will read the entire file to deduce field types. Defaults to None. 23 | 24 | Returns: 25 | Inferred Arrow Schema 26 | """ 27 | 28 | def read_json( 29 | file: IO[bytes] | Path | str, 30 | schema: types.ArrowSchemaExportable, 31 | *, 32 | batch_size: int | None = None, 33 | ) -> core.RecordBatchReader: 34 | """Reads JSON data with a known schema into Arrow 35 | 36 | Args: 37 | file: The JSON file or buffer to read from. 38 | schema: The Arrow schema representing the JSON data. 39 | batch_size: Set the batch size (number of records to load at one time). Defaults 40 | to None. 41 | 42 | Returns: 43 | An arrow RecordBatchReader. 44 | """ 45 | 46 | def write_json( 47 | data: types.ArrowStreamExportable | types.ArrowArrayExportable, 48 | file: IO[bytes] | Path | str, 49 | *, 50 | explicit_nulls: bool | None = None, 51 | ) -> None: 52 | """Write Arrow data to JSON. 53 | 54 | By default the writer will skip writing keys with null values for backward 55 | compatibility. 56 | 57 | Args: 58 | data: the Arrow Table, RecordBatchReader, or RecordBatch to write. 59 | file: the output file or buffer to write to 60 | explicit_nulls: Set whether to keep keys with null values, or to omit writing 61 | them. Defaults to skipping nulls. 
62 | """ 63 | 64 | def write_ndjson( 65 | data: types.ArrowStreamExportable | types.ArrowArrayExportable, 66 | file: IO[bytes] | Path | str, 67 | *, 68 | explicit_nulls: bool | None = None, 69 | ) -> None: 70 | """Write Arrow data to newline-delimited JSON. 71 | 72 | By default the writer will skip writing keys with null values for backward 73 | compatibility. 74 | 75 | Args: 76 | data: the Arrow Table, RecordBatchReader, or RecordBatch to write. 77 | file: the output file or buffer to write to 78 | explicit_nulls: Set whether to keep keys with null values, or to omit writing 79 | them. Defaults to skipping nulls. 80 | """ 81 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = ["arro3-compute", "arro3-core", "arro3-io"] 3 | # Note: we exclude pyo3-arrow from the top-level workspace because we have a 4 | # circular dependency. pyo3-arrow is depended on by obstore to return arrow 5 | # results as a list, which is depended on by arro3-io. This makes it hard to 6 | # upgrade versions. 7 | exclude = ["pyo3-arrow"] 8 | resolver = "2" 9 | 10 | [workspace.package] 11 | # Package version for arro3-*, not for pyo3-arrow 12 | version = "0.6.1" 13 | authors = ["Kyle Barron "] 14 | edition = "2021" 15 | homepage = "https://kylebarron.dev/arro3" 16 | repository = "https://github.com/kylebarron/arro3" 17 | license = "MIT OR Apache-2.0" 18 | keywords = ["python", "arrow"] 19 | categories = [] 20 | rust-version = "1.75" 21 | 22 | [workspace.dependencies] 23 | arrow-arith = "56" 24 | arrow-array = { version = "56", features = ["ffi"] } 25 | arrow-buffer = "56" 26 | arrow-cast = "56" 27 | arrow-csv = "56" 28 | arrow-ipc = { version = "56", features = ["lz4", "zstd"] } 29 | arrow-json = "56" 30 | arrow-schema = "56" 31 | arrow-select = "56" 32 | bytes = "1.7.0" 33 | half = "2" 34 | indexmap = "2" 35 | numpy = "0.25" 36 | object_store = "0.12.1" 37 | parquet = "56" 38 | pyo3 = { version = "0.25", features = ["macros", "indexmap"] } 39 | # pyo3-arrow = "0.11.0" 40 | pyo3-arrow = { git = "https://github.com/kylebarron/arro3", rev = "cb2453bf022d0d8704e56e81a324ab5a772e0247" } 41 | # pyo3-arrow = { path = "./pyo3-arrow" } 42 | pyo3-async-runtimes = { version = "0.25", features = ["tokio-runtime"] } 43 | pyo3-file = "0.13.0" 44 | pyo3-object_store = "0.5" 45 | thiserror = "1.0.63" 46 | 47 | [patch.crates-io] 48 | arrow-arith = { git = "https://github.com/jhorstmann/arrow-rs", rev = "e6ba0869949806f420b94d7cf7b8228ce6d5a369" } 49 | arrow-array = { git = "https://github.com/jhorstmann/arrow-rs", rev = "e6ba0869949806f420b94d7cf7b8228ce6d5a369" } 50 | arrow-buffer = { git = "https://github.com/jhorstmann/arrow-rs", rev = "e6ba0869949806f420b94d7cf7b8228ce6d5a369" } 51 | arrow-cast = { git = "https://github.com/jhorstmann/arrow-rs", rev = "e6ba0869949806f420b94d7cf7b8228ce6d5a369" } 52 | arrow-csv = { git = "https://github.com/jhorstmann/arrow-rs", rev = "e6ba0869949806f420b94d7cf7b8228ce6d5a369" } 53 | arrow-ipc = { git = "https://github.com/jhorstmann/arrow-rs", rev = "e6ba0869949806f420b94d7cf7b8228ce6d5a369" } 54 | arrow-json = { git = "https://github.com/jhorstmann/arrow-rs", rev = "e6ba0869949806f420b94d7cf7b8228ce6d5a369" } 55 | arrow-schema = { git = "https://github.com/jhorstmann/arrow-rs", rev = "e6ba0869949806f420b94d7cf7b8228ce6d5a369" } 56 | arrow-select = { git = "https://github.com/jhorstmann/arrow-rs", rev = "e6ba0869949806f420b94d7cf7b8228ce6d5a369" 
} 57 | parquet = { git = "https://github.com/jhorstmann/arrow-rs", rev = "e6ba0869949806f420b94d7cf7b8228ce6d5a369" } 58 | 59 | [profile.release] 60 | lto = true 61 | codegen-units = 1 62 | -------------------------------------------------------------------------------- /tests/core/test_chunked_array.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | 3 | import pyarrow as pa 4 | import pytest 5 | from arro3.core import Array, ChunkedArray, DataType 6 | 7 | 8 | def test_constructor(): 9 | arr = Array([1, 2, 3], DataType.int16()) 10 | arr2 = Array([4, 5, 6], DataType.int16()) 11 | ca = ChunkedArray([arr, arr2]) 12 | assert pa.chunked_array(ca) == pa.chunked_array([arr, arr2]) 13 | 14 | 15 | def test_repr(): 16 | arr = Array([1, 2, 3], DataType.int16()) 17 | arr2 = Array([4, 5, 6], DataType.int16()) 18 | ca = ChunkedArray([arr, arr2]) 19 | expected = """\ 20 | arro3.core.ChunkedArray 21 | [ 22 | [ 23 | 1, 24 | 2, 25 | 3, 26 | ] 27 | [ 28 | 4, 29 | 5, 30 | 6, 31 | ] 32 | ] 33 | """ 34 | assert repr(ca) == dedent(expected) 35 | 36 | arr = Array([1.0, 2.0, 3.0], DataType.float64()) 37 | arr2 = Array([4.0, 5.0, 6.0], DataType.float64()) 38 | ca = ChunkedArray([arr, arr2]) 39 | expected = """\ 40 | arro3.core.ChunkedArray 41 | [ 42 | [ 43 | 1.0, 44 | 2.0, 45 | 3.0, 46 | ] 47 | [ 48 | 4.0, 49 | 5.0, 50 | 6.0, 51 | ] 52 | ] 53 | """ 54 | assert repr(ca) == dedent(expected) 55 | 56 | arr = Array(["foo"], DataType.string()) 57 | arr2 = Array(["bar"], DataType.string()) 58 | arr3 = Array(["baz"], DataType.string()) 59 | ca = ChunkedArray([arr, arr2, arr3]) 60 | expected = """\ 61 | arro3.core.ChunkedArray 62 | [ 63 | [ 64 | foo, 65 | ] 66 | [ 67 | bar, 68 | ] 69 | [ 70 | baz, 71 | ] 72 | ] 73 | """ 74 | assert repr(ca) == dedent(expected) 75 | 76 | 77 | class CustomException(Exception): 78 | pass 79 | 80 | 81 | class ArrowCStreamFails: 82 | def __arrow_c_stream__(self, requested_schema=None): 83 | raise CustomException 84 | 85 | 86 | class ArrowCArrayFails: 87 | def __arrow_c_array__(self, requested_schema=None): 88 | raise CustomException 89 | 90 | 91 | def test_chunked_array_import_preserve_exception(): 92 | """https://github.com/kylebarron/arro3/issues/325""" 93 | 94 | c_stream_obj = ArrowCStreamFails() 95 | with pytest.raises(CustomException): 96 | ChunkedArray.from_arrow(c_stream_obj) 97 | 98 | with pytest.raises(CustomException): 99 | ChunkedArray(c_stream_obj) 100 | 101 | c_array_obj = ArrowCArrayFails() 102 | with pytest.raises(CustomException): 103 | ChunkedArray.from_arrow(c_array_obj) 104 | 105 | with pytest.raises(CustomException): 106 | ChunkedArray(c_array_obj) 107 | -------------------------------------------------------------------------------- /tests/core/test_buffer_protocol.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | import arro3.compute as ac 4 | import numpy as np 5 | import pyarrow as pa 6 | import pytest 7 | from arro3.core import Array, Buffer 8 | 9 | 10 | def test_from_buffer(): 11 | arr = np.array([1.0, 2.0, 3.0], dtype=np.float64) 12 | mv = memoryview(arr) 13 | assert pa.array(mv) == pa.array(Array.from_buffer(mv)) 14 | 15 | arr = np.array([1, 2, 3], dtype=np.int64) 16 | mv = memoryview(arr) 17 | assert pa.array(mv) == pa.array(Array.from_buffer(mv)) 18 | 19 | # pyarrow applies some casting; this is weird 20 | # According to joris, this may be because pyarrow doesn't implement direct import of 21 | # buffer 
protocol objects, and instead infers from `pa.array(list(memoryview()))` 22 | # float32 -> float64 23 | # int32 -> int64 24 | # uint64 -> int64 25 | 26 | arr = np.array([1.0, 2.0, 3.0], dtype=np.float32) 27 | assert pa.array(Array.from_buffer(memoryview(arr))).type == pa.float32() 28 | 29 | arr = np.array([1, 2, 3], dtype=np.int32) 30 | assert pa.array(Array.from_buffer(memoryview(arr))).type == pa.int32() 31 | 32 | arr = np.array([1, 2, 3], dtype=np.int64) 33 | assert pa.array(Array.from_buffer(memoryview(arr))).type == pa.int64() 34 | 35 | arr = np.array([1, 2, 3], dtype=np.uint64) 36 | assert pa.array(Array.from_buffer(memoryview(arr))).type == pa.uint64() 37 | 38 | # Datetime array 39 | # https://stackoverflow.com/a/34325416 40 | arr = np.arange(datetime(1985, 7, 1), datetime(2015, 7, 1), timedelta(days=1)) 41 | with pytest.raises(ValueError): 42 | Array.from_buffer(arr) 43 | 44 | 45 | def test_operation_on_buffer(): 46 | np_arr = np.arange(1000, dtype=np.uint64) 47 | assert np.max(np_arr) == 999 48 | assert ac.max(np_arr).as_py() == 999 49 | 50 | indices = np.array([2, 3, 4], dtype=np.uint64) 51 | out = ac.take(np_arr, indices) 52 | assert pa.array(out) == pa.array(indices) 53 | 54 | 55 | def test_multi_dimensional(): 56 | np_arr = np.arange(6, dtype=np.uint8).reshape((2, 3)) 57 | arro3_arr = Array(np_arr) 58 | pa_arr = pa.array(arro3_arr) 59 | assert pa_arr.type.list_size == 3 60 | assert pa_arr.type.value_type == pa.uint8() 61 | 62 | np_arr = np.arange(12, dtype=np.uint8).reshape((1, 2, 3, 2)) 63 | arro3_arr = Array(np_arr) 64 | pa_arr = pa.array(arro3_arr) 65 | assert pa_arr.type.list_size == 2 66 | assert pa_arr.type.value_type.list_size == 3 67 | assert pa_arr.type.value_type.value_type.list_size == 2 68 | assert pa_arr.type.value_type.value_type.value_type == pa.uint8() 69 | 70 | 71 | def test_round_trip_buffer(): 72 | arr = np.arange(5, dtype=np.uint8) 73 | buffer = Buffer(arr) 74 | # Restore when upgrading to pyo3-arrow 0.6 75 | # assert len(buffer) == arr.nbytes 76 | retour = np.frombuffer(buffer, dtype=np.uint8) 77 | assert np.array_equal(arr, retour) 78 | 79 | assert np.array_equal(arr, Array(buffer).to_numpy()) 80 | -------------------------------------------------------------------------------- /tests/test_dictionary.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | import pyarrow as pa 4 | import pyarrow.compute as pc 5 | from arro3.compute import dictionary_encode 6 | from arro3.core import ChunkedArray, dictionary_dictionary, dictionary_indices 7 | 8 | 9 | def test_dictionary_encode(): 10 | arr = pa.array([1, 2, 3, 1, 2, 2, 3, 1, 1, 1], type=pa.uint16()) 11 | out = dictionary_encode(arr) 12 | out_pc = pc.dictionary_encode(arr) # type: ignore 13 | assert pa.array(out) == out_pc 14 | 15 | arr = pa.array(["1", "2", "3", "1", "2", "2", "3", "1", "1", "1"], type=pa.utf8()) 16 | out = dictionary_encode(arr) 17 | out_pc = pc.dictionary_encode(arr) # type: ignore 18 | assert pa.array(out) == out_pc 19 | 20 | arr = arr.cast(pa.large_utf8()) 21 | out = dictionary_encode(arr) 22 | out_pc = pc.dictionary_encode(arr) # type: ignore 23 | assert pa.array(out) == out_pc 24 | 25 | arr = arr.cast(pa.binary()) 26 | out = dictionary_encode(arr) 27 | out_pc = pc.dictionary_encode(arr) # type: ignore 28 | assert pa.array(out) == out_pc 29 | 30 | arr = arr.cast(pa.large_binary()) 31 | out = dictionary_encode(arr) 32 | out_pc = pc.dictionary_encode(arr) # type: ignore 33 | assert pa.array(out) == out_pc 34 | 35 | now 
= datetime.now() 36 | later = datetime.now() 37 | arr = pa.array([now, later, now, now, later]) 38 | out = dictionary_encode(arr) 39 | out_pc = pc.dictionary_encode(arr) # type: ignore 40 | assert pa.array(out) == out_pc 41 | 42 | 43 | def test_dictionary_encode_chunked(): 44 | arr = pa.chunked_array([[3, 2, 3], [1, 2, 2], [3, 1, 1, 1]], type=pa.uint16()) 45 | out = ChunkedArray(dictionary_encode(arr)) 46 | 47 | out_retour = pa.chunked_array(out) 48 | out_pc = pc.dictionary_encode(arr) # type: ignore 49 | 50 | # Since these arrays have different dictionaries, array and arrow scalar comparison 51 | # will fail. 52 | assert len(out_retour) == len(out_pc) 53 | for i in range(len(out_retour)): 54 | assert out_retour[i].as_py() == out_pc[i].as_py() 55 | 56 | 57 | def test_dictionary_access(): 58 | arr = pa.array([1, 2, 3, 1, 2, 2, 3, 1, 1, 1], type=pa.uint16()) 59 | out = dictionary_encode(arr) 60 | out_pc = pc.dictionary_encode(arr) # type: ignore 61 | 62 | keys = dictionary_dictionary(out) 63 | assert pa.array(keys) == out_pc.dictionary 64 | 65 | indices = dictionary_indices(out) 66 | assert pa.array(indices) == out_pc.indices 67 | 68 | 69 | def test_dictionary_access_chunked(): 70 | arr = pa.chunked_array([[3, 2, 3], [1, 2, 2], [3, 1, 1, 1]], type=pa.uint16()) 71 | out = ChunkedArray(dictionary_encode(arr)) 72 | out_pa = pa.chunked_array(out) 73 | 74 | dictionary = ChunkedArray(dictionary_dictionary(out)) 75 | assert pa.chunked_array(dictionary).chunks[0] == out_pa.chunks[0].dictionary 76 | 77 | indices = ChunkedArray(dictionary_indices(out)) 78 | assert pa.chunked_array(indices).chunks[0] == out_pa.chunks[0].indices 79 | -------------------------------------------------------------------------------- /tests/core/test_ffi.py: -------------------------------------------------------------------------------- 1 | import pyarrow as pa 2 | from arro3.core import Array, ChunkedArray, DataType, RecordBatchReader, Schema, Table 3 | 4 | 5 | def test_table_stream_export_schema_request(): 6 | a = pa.array(["a", "b", "c"], type=pa.utf8()) 7 | table = Table.from_pydict({"a": a}) 8 | 9 | requested_schema = Schema([pa.field("a", type=pa.large_utf8())]) 10 | requested_schema_capsule = requested_schema.__arrow_c_schema__() 11 | stream_capsule = table.__arrow_c_stream__(requested_schema_capsule) 12 | 13 | retour = Table.from_arrow_pycapsule(stream_capsule) 14 | assert retour.schema.field("a").type == DataType.large_utf8() 15 | 16 | 17 | def test_record_batch_reader_stream_export_schema_request(): 18 | a = pa.array(["a", "b", "c"], type=pa.utf8()) 19 | table = Table.from_pydict({"a": a}) 20 | reader = RecordBatchReader.from_batches(table.schema, table.to_batches()) 21 | 22 | requested_schema = Schema([pa.field("a", type=pa.large_utf8())]) 23 | requested_schema_capsule = requested_schema.__arrow_c_schema__() 24 | stream_capsule = reader.__arrow_c_stream__(requested_schema_capsule) 25 | 26 | retour = Table.from_arrow_pycapsule(stream_capsule) 27 | assert retour.schema.field("a").type == DataType.large_utf8() 28 | 29 | 30 | def test_chunked_array_stream_export_schema_request(): 31 | a = pa.array(["a", "b", "c"], type=pa.utf8()) 32 | ca = ChunkedArray([a, a]) 33 | 34 | requested_schema_capsule = pa.large_utf8().__arrow_c_schema__() 35 | stream_capsule = ca.__arrow_c_stream__(requested_schema_capsule) 36 | 37 | retour = ChunkedArray.from_arrow_pycapsule(stream_capsule) 38 | assert retour.type == DataType.large_utf8() 39 | 40 | 41 | def test_array_export_schema_request(): 42 | a = pa.array(["a", "b", "c"], 
type=pa.utf8()) 43 | arr = Array(a) 44 | 45 | requested_schema_capsule = pa.large_utf8().__arrow_c_schema__() 46 | capsules = arr.__arrow_c_array__(requested_schema_capsule) 47 | 48 | retour = Array.from_arrow_pycapsule(*capsules) 49 | assert retour.type == DataType.large_utf8() 50 | 51 | 52 | def test_table_metadata_preserved(): 53 | metadata = {b"hello": b"world"} 54 | pa_table = pa.table({"a": [1, 2, 3]}) 55 | pa_table = pa_table.replace_schema_metadata(metadata) 56 | 57 | arro3_table = Table(pa_table) 58 | assert arro3_table.schema.metadata == metadata 59 | 60 | pa_table_retour = pa.table(arro3_table) 61 | assert pa_table_retour.schema.metadata == metadata 62 | 63 | 64 | def test_record_batch_reader_metadata_preserved(): 65 | metadata = {b"hello": b"world"} 66 | pa_table = pa.table({"a": [1, 2, 3]}) 67 | pa_table = pa_table.replace_schema_metadata(metadata) 68 | pa_reader = pa.RecordBatchReader.from_stream(pa_table) 69 | 70 | arro3_reader = RecordBatchReader.from_stream(pa_reader) 71 | assert arro3_reader.schema.metadata == metadata 72 | 73 | pa_reader_retour = pa.RecordBatchReader.from_stream(arro3_reader) 74 | assert pa_reader_retour.schema.metadata == metadata 75 | -------------------------------------------------------------------------------- /arro3-core/src/accessors/list_flatten.rs: -------------------------------------------------------------------------------- 1 | use arrow_array::cast::AsArray; 2 | use arrow_array::ArrayRef; 3 | use arrow_schema::{ArrowError, DataType, FieldRef}; 4 | use pyo3::prelude::*; 5 | use pyo3_arrow::error::PyArrowResult; 6 | use pyo3_arrow::ffi::ArrayIterator; 7 | use pyo3_arrow::input::AnyArray; 8 | use pyo3_arrow::{PyArray, PyArrayReader}; 9 | 10 | #[pyfunction] 11 | pub fn list_flatten(py: Python, input: AnyArray) -> PyArrowResult<PyObject> { 12 | match input { 13 | AnyArray::Array(array) => { 14 | let (array, field) = array.into_inner(); 15 | let flat_array = flatten_array(array)?; 16 | let flat_field = flatten_field(field)?; 17 | Ok(PyArray::new(flat_array, flat_field).to_arro3(py)?.unbind()) 18 | } 19 | AnyArray::Stream(stream) => { 20 | let reader = stream.into_reader()?; 21 | let flatten_field = flatten_field(reader.field())?; 22 | 23 | let iter = reader.into_iter().map(move |array| { 24 | let out = flatten_array(array?)?; 25 | Ok(out) 26 | }); 27 | Ok( 28 | PyArrayReader::new(Box::new(ArrayIterator::new(iter, flatten_field))) 29 | .to_arro3(py)?
30 | .unbind(), 31 | ) 32 | } 33 | } 34 | } 35 | 36 | fn flatten_array(array: ArrayRef) -> Result<ArrayRef, ArrowError> { 37 | let offset = array.offset(); 38 | let length = array.len(); 39 | match array.data_type() { 40 | DataType::List(_) => { 41 | let arr = array.as_list::<i32>(); 42 | let start = arr.offsets().get(offset).unwrap(); 43 | let end = arr.offsets().get(offset + length).unwrap(); 44 | Ok(arr 45 | .values() 46 | .slice(*start as usize, (*end - *start) as usize) 47 | .clone()) 48 | } 49 | DataType::LargeList(_) => { 50 | let arr = array.as_list::<i64>(); 51 | let start = arr.offsets().get(offset).unwrap(); 52 | let end = arr.offsets().get(offset + length).unwrap(); 53 | Ok(arr 54 | .values() 55 | .slice(*start as usize, (*end - *start) as usize) 56 | .clone()) 57 | } 58 | DataType::FixedSizeList(_, list_size) => { 59 | let arr = array.as_fixed_size_list(); 60 | Ok(arr.values().clone().slice( 61 | offset * (*list_size as usize), 62 | length * (*list_size as usize), 63 | )) 64 | } 65 | _ => Err(ArrowError::SchemaError( 66 | "Expected list-typed Array".to_string(), 67 | )), 68 | } 69 | } 70 | 71 | fn flatten_field(field: FieldRef) -> Result<FieldRef, ArrowError> { 72 | match field.data_type() { 73 | DataType::List(inner_field) 74 | | DataType::LargeList(inner_field) 75 | | DataType::FixedSizeList(inner_field, _) => Ok(inner_field.clone()), 76 | _ => Err(ArrowError::SchemaError( 77 | "Expected list-typed Array".to_string(), 78 | )), 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /.github/workflows/pyodide-wheels.yml: -------------------------------------------------------------------------------- 1 | name: Build pyodide wheels 2 | 3 | on: 4 | # push: 5 | # tags: 6 | # - "py-v*" 7 | workflow_dispatch: 8 | inputs: 9 | python: 10 | description: "Python version" 11 | required: true 12 | default: "3.12" 13 | type: choice 14 | options: 15 | - 3.12 16 | - 3.13 17 | - 3.14 18 | - 3.15 19 | pyodide: 20 | description: "New Pyodide version to build for" 21 | required: true 22 | type: string 23 | 24 | permissions: 25 | contents: write 26 | 27 | jobs: 28 | build: 29 | runs-on: ubuntu-latest 30 | strategy: 31 | matrix: 32 | module: 33 | - arro3-core 34 | - arro3-compute 35 | - arro3-io 36 | steps: 37 | - uses: actions/checkout@v4 38 | - uses: actions/setup-python@v5 39 | with: 40 | python-version: ${{ inputs.python }} 41 | 42 | - name: Install Rust 43 | uses: dtolnay/rust-toolchain@nightly 44 | with: 45 | targets: wasm32-unknown-emscripten 46 | 47 | - uses: Swatinem/rust-cache@v2 48 | 49 | - name: Install Python build dependencies 50 | run: pip install maturin pyodide-build==${{ inputs.pyodide }} wheel-filename 51 | 52 | - name: Get emscripten version 53 | run: | 54 | echo PYODIDE_EMSCRIPTEN_VERSION=$(pyodide config get emscripten_version) >> $GITHUB_ENV 55 | 56 | - name: Install emsdk & build wheels 57 | run: | 58 | git clone https://github.com/emscripten-core/emsdk.git 59 | cd emsdk 60 | PYODIDE_EMSCRIPTEN_VERSION=$(pyodide config get emscripten_version) 61 | ./emsdk install ${PYODIDE_EMSCRIPTEN_VERSION} 62 | ./emsdk activate ${PYODIDE_EMSCRIPTEN_VERSION} 63 | source emsdk_env.sh 64 | cd ..
65 | RUSTUP_TOOLCHAIN=nightly maturin build --release -o dist --target wasm32-unknown-emscripten -i python${{ inputs.python }} --manifest-path ${{ matrix.module }}/Cargo.toml 66 | 67 | - name: Get info from built wheel file 68 | run: | 69 | # get arrow version and wheel name and make metafile 70 | ARRO3_WHEEL=$(basename dist/*.whl) 71 | ARRO3_VERSION=$(wheel-filename ${ARRO3_WHEEL} | jq -r '.version') 72 | ARROW_SHA256=$(sha256sum dist/*.whl | cut -d ' ' -f 1) 73 | echo ARRO3_WHEEL=${ARRO3_WHEEL}>>$GITHUB_ENV 74 | echo ARRO3_VERSION=${ARRO3_VERSION}>>$GITHUB_ENV 75 | 76 | - name: Upload wheels 77 | uses: actions/upload-artifact@v4 78 | with: 79 | name: wheels-pyodide-${{ matrix.module }} 80 | path: dist 81 | 82 | - name: Create release 83 | uses: ncipollo/release-action@v1 84 | with: 85 | tag: pyodide-v${{ inputs.pyodide }}-arro3-v${{ env.ARRO3_VERSION }} 86 | name: Build of arro3 for pyodide v${{ inputs.pyodide }} and arro3 v${{ env.ARRO3_VERSION }} 87 | artifacts: dist/* 88 | replacesArtifacts: true 89 | allowUpdates: true 90 | updateOnlyUnreleased: true 91 | prerelease: true 92 | -------------------------------------------------------------------------------- /arro3-io/src/json.rs: -------------------------------------------------------------------------------- 1 | use std::io::BufReader; 2 | 3 | use arrow_json::writer::{JsonArray, LineDelimited}; 4 | use arrow_json::{ReaderBuilder, WriterBuilder}; 5 | use pyo3::prelude::*; 6 | use pyo3_arrow::error::PyArrowResult; 7 | use pyo3_arrow::export::{Arro3RecordBatchReader, Arro3Schema}; 8 | use pyo3_arrow::input::AnyRecordBatch; 9 | use pyo3_arrow::{PyRecordBatchReader, PySchema}; 10 | 11 | use crate::utils::{FileReader, FileWriter}; 12 | 13 | /// Infer a JSON file's schema 14 | #[pyfunction] 15 | #[pyo3(signature = ( 16 | file, 17 | *, 18 | max_records=None, 19 | ))] 20 | pub fn infer_json_schema( 21 | file: FileReader, 22 | max_records: Option<usize>, 23 | ) -> PyArrowResult<Arro3Schema> { 24 | let buf_file = BufReader::new(file); 25 | let (schema, _records_read) = arrow_json::reader::infer_json_schema(buf_file, max_records)?; 26 | Ok(schema.into()) 27 | } 28 | 29 | /// Read a JSON file to an Arrow RecordBatchReader 30 | #[pyfunction] 31 | #[pyo3(signature = ( 32 | file, 33 | schema, 34 | *, 35 | batch_size=None, 36 | ))] 37 | pub fn read_json( 38 | file: FileReader, 39 | schema: PySchema, 40 | batch_size: Option<usize>, 41 | ) -> PyArrowResult<Arro3RecordBatchReader> { 42 | let mut builder = ReaderBuilder::new(schema.into()); 43 | 44 | if let Some(batch_size) = batch_size { 45 | builder = builder.with_batch_size(batch_size); 46 | } 47 | 48 | let buf_file = BufReader::new(file); 49 | let reader = builder.build(buf_file)?; 50 | Ok(PyRecordBatchReader::new(Box::new(reader)).into()) 51 | } 52 | 53 | /// Write an Arrow Table or stream to a JSON file 54 | #[pyfunction] 55 | #[pyo3(signature = ( 56 | data, 57 | file, 58 | *, 59 | explicit_nulls=None, 60 | ))] 61 | #[allow(clippy::too_many_arguments)] 62 | pub fn write_json( 63 | data: AnyRecordBatch, 64 | file: FileWriter, 65 | explicit_nulls: Option<bool>, 66 | ) -> PyArrowResult<()> { 67 | let mut builder = WriterBuilder::new(); 68 | 69 | if let Some(explicit_nulls) = explicit_nulls { 70 | builder = builder.with_explicit_nulls(explicit_nulls); 71 | } 72 | 73 | let mut writer = builder.build::<_, JsonArray>(file); 74 | for batch in data.into_reader()?
{ 75 | writer.write(&batch?)?; 76 | } 77 | writer.finish()?; Ok(()) 78 | } 79 | 80 | /// Write an Arrow Table or stream to a newline-delimited JSON file 81 | #[pyfunction] 82 | #[pyo3(signature = ( 83 | data, 84 | file, 85 | *, 86 | explicit_nulls=None, 87 | ))] 88 | #[allow(clippy::too_many_arguments)] 89 | pub fn write_ndjson( 90 | data: AnyRecordBatch, 91 | file: FileWriter, 92 | explicit_nulls: Option<bool>, 93 | ) -> PyArrowResult<()> { 94 | let mut builder = WriterBuilder::new(); 95 | 96 | if let Some(explicit_nulls) = explicit_nulls { 97 | builder = builder.with_explicit_nulls(explicit_nulls); 98 | } 99 | 100 | let mut writer = builder.build::<_, LineDelimited>(file); 101 | for batch in data.into_reader()? { 102 | writer.write(&batch?)?; 103 | } 104 | writer.finish()?; 105 | Ok(()) 106 | } 107 | -------------------------------------------------------------------------------- /arro3-compute/src/dictionary.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use arrow_array::builder::{GenericByteDictionaryBuilder, PrimitiveDictionaryBuilder}; 4 | use arrow_array::cast::AsArray; 5 | use arrow_array::downcast_primitive_array; 6 | use arrow_array::types::{ 7 | BinaryType, ByteArrayType, Int32Type, LargeBinaryType, LargeUtf8Type, Utf8Type, 8 | }; 9 | use arrow_array::{ArrayRef, ArrowPrimitiveType, GenericByteArray, PrimitiveArray}; 10 | use arrow_schema::{ArrowError, DataType, Field}; 11 | use pyo3::prelude::*; 12 | use pyo3_arrow::error::PyArrowResult; 13 | use pyo3_arrow::ffi::ArrayIterator; 14 | use pyo3_arrow::input::AnyArray; 15 | use pyo3_arrow::{PyArray, PyArrayReader}; 16 | 17 | // Note: for chunked array input, each output chunk will not necessarily have the same dictionary 18 | #[pyfunction] 19 | pub(crate) fn dictionary_encode(py: Python, array: AnyArray) -> PyArrowResult<PyObject> { 20 | match array { 21 | AnyArray::Array(array) => { 22 | let (array, _field) = array.into_inner(); 23 | let output_array = dictionary_encode_array(array)?; 24 | Ok(PyArray::from_array_ref(output_array).to_arro3(py)?.unbind()) 25 | } 26 | AnyArray::Stream(stream) => { 27 | let reader = stream.into_reader()?; 28 | 29 | let existing_field = reader.field(); 30 | let output_data_type = DataType::Dictionary( 31 | Box::new(DataType::Int32), 32 | Box::new(existing_field.data_type().clone()), 33 | ); 34 | let output_field = Field::new("", output_data_type, true); 35 | 36 | let iter = reader 37 | .into_iter() 38 | .map(move |array| dictionary_encode_array(array?)); 39 | Ok( 40 | PyArrayReader::new(Box::new(ArrayIterator::new(iter, output_field.into()))) 41 | .to_arro3(py)?
42 | .unbind(), 43 | ) 44 | } 45 | } 46 | } 47 | 48 | fn dictionary_encode_array(array: ArrayRef) -> Result<ArrayRef, ArrowError> { 49 | let array_ref = array.as_ref(); 50 | let array = downcast_primitive_array!( 51 | array_ref => { 52 | primitive_dictionary_encode(array_ref) 53 | } 54 | DataType::Utf8 => bytes_dictionary_encode(array.as_bytes::<Utf8Type>()), 55 | DataType::LargeUtf8 => bytes_dictionary_encode(array.as_bytes::<LargeUtf8Type>()), 56 | DataType::Binary => bytes_dictionary_encode(array.as_bytes::<BinaryType>()), 57 | DataType::LargeBinary => bytes_dictionary_encode(array.as_bytes::<LargeBinaryType>()), 58 | DataType::Dictionary(_, _) => array, 59 | d => return Err(ArrowError::ComputeError(format!("{d:?} not supported in dictionary_encode"))) 60 | ); 61 | Ok(array) 62 | } 63 | 64 | #[inline(never)] 65 | fn primitive_dictionary_encode<T: ArrowPrimitiveType>(array: &PrimitiveArray<T>) -> ArrayRef { 66 | let mut builder = PrimitiveDictionaryBuilder::<Int32Type, T>::new(); 67 | for value in array { 68 | builder.append_option(value); 69 | } 70 | Arc::new(builder.finish()) 71 | } 72 | 73 | #[inline(never)] 74 | fn bytes_dictionary_encode<T: ByteArrayType>(array: &GenericByteArray<T>) -> ArrayRef { 75 | let mut builder = GenericByteDictionaryBuilder::<Int32Type, T>::new(); 76 | for value in array { 77 | builder.append_option(value); 78 | } 79 | Arc::new(builder.finish()) 80 | } 81 | -------------------------------------------------------------------------------- /arro3-core/python/arro3/core/_scalar.pyi: -------------------------------------------------------------------------------- 1 | from typing import Any, overload 2 | 3 | from ._data_type import DataType 4 | from ._field import Field 5 | from .types import ArrayInput, ArrowArrayExportable, ArrowSchemaExportable 6 | 7 | class Scalar: 8 | """An arrow Scalar.""" 9 | @overload 10 | def __init__(self, obj: ArrayInput, /, type: None = None) -> None: ... 11 | @overload 12 | def __init__(self, obj: Any, /, type: ArrowSchemaExportable) -> None: ... 13 | def __init__( 14 | self, 15 | obj: ArrayInput | Any, 16 | /, 17 | type: ArrowSchemaExportable | None = None, 18 | ) -> None: 19 | """Create arro3.Scalar instance from a Python object. 20 | 21 | Args: 22 | obj: An input object. 23 | type: Explicit type to attempt to coerce to. You may pass in a `Field` to `type` in order to associate extension metadata with this array. 24 | """ 25 | def __arrow_c_array__( 26 | self, requested_schema: object | None = None 27 | ) -> tuple[object, object]: 28 | """ 29 | An implementation of the [Arrow PyCapsule 30 | Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). 31 | This dunder method should not be called directly, but enables zero-copy data 32 | transfer to other Python libraries that understand Arrow memory. 33 | 34 | For example, you can call [`pyarrow.array()`][pyarrow.array] to 35 | convert this Scalar into a pyarrow Array, without copying memory. The generated 36 | array is guaranteed to have length 1. 37 | """ 38 | def __eq__(self, other) -> bool: 39 | """Check for equality with other Python objects (`==`) 40 | 41 | If `other` is not an Arrow scalar, `self` will be converted to a Python object 42 | (with `as_py`), and then its `__eq__` method will be called. 43 | """ 44 | def __repr__(self) -> str: ... 45 | @classmethod 46 | def from_arrow(cls, input: ArrowArrayExportable) -> Scalar: 47 | """Construct this from an existing Arrow Scalar. 48 | 49 | It can be called on anything that exports the Arrow data interface (has a 50 | `__arrow_c_array__` method) and returns an array with a single element.
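        A minimal sketch (assumes `pyarrow` is installed; the one-element array is made up): `Scalar.from_arrow(pa.array([1]))`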
51 | 52 | Args: 53 | input: Arrow scalar to use for constructing this object 54 | 55 | Returns: 56 | new Scalar 57 | """ 58 | @classmethod 59 | def from_arrow_pycapsule(cls, schema_capsule, array_capsule) -> Scalar: 60 | """Construct this object from bare Arrow PyCapsules""" 61 | def as_py(self) -> Any: 62 | """Convert this scalar to a pure-Python object.""" 63 | def cast(self, target_type: ArrowSchemaExportable) -> Scalar: 64 | """Cast scalar to another data type 65 | 66 | Args: 67 | target_type: Type to cast to. 68 | """ 69 | 70 | @property 71 | def field(self) -> Field: 72 | """Access the field stored on this Scalar. 73 | 74 | Note that this field usually will not have a name associated, but it may have 75 | metadata that signifies that this scalar is an extension (user-defined typed) 76 | scalar. 77 | """ 78 | 79 | @property 80 | def is_valid(self) -> bool: 81 | """Return `True` if this scalar is not null.""" 82 | @property 83 | def type(self) -> DataType: 84 | """Access the type of this scalar.""" 85 | -------------------------------------------------------------------------------- /arro3-core/src/accessors/list_offsets.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use arrow_array::cast::AsArray; 4 | use arrow_array::{ArrayRef, Int32Array, Int64Array, OffsetSizeTrait}; 5 | use arrow_buffer::{OffsetBuffer, ScalarBuffer}; 6 | use arrow_schema::{ArrowError, DataType, Field}; 7 | use pyo3::prelude::*; 8 | use pyo3_arrow::error::PyArrowResult; 9 | use pyo3_arrow::ffi::ArrayIterator; 10 | use pyo3_arrow::input::AnyArray; 11 | use pyo3_arrow::{PyArray, PyArrayReader}; 12 | 13 | #[pyfunction] 14 | #[pyo3(signature = (input, *, logical=true))] 15 | pub fn list_offsets(py: Python, input: AnyArray, logical: bool) -> PyArrowResult<PyObject> { 16 | match input { 17 | AnyArray::Array(array) => { 18 | let (array, _field) = array.into_inner(); 19 | let offsets = _list_offsets(array, logical)?; 20 | Ok(PyArray::from_array_ref(offsets).to_arro3(py)?.unbind()) 21 | } 22 | AnyArray::Stream(stream) => { 23 | let reader = stream.into_reader()?; 24 | let out_field = match reader.field().data_type() { 25 | DataType::List(_) => Field::new("", DataType::Int32, false), 26 | DataType::LargeList(_) => Field::new("", DataType::Int64, false), 27 | _ => { 28 | return Err( 29 | ArrowError::SchemaError("Expected list-typed Array".to_string()).into(), 30 | ); 31 | } 32 | }; 33 | 34 | let iter = reader 35 | .into_iter() 36 | .map(move |array| _list_offsets(array?, logical)); 37 | Ok( 38 | PyArrayReader::new(Box::new(ArrayIterator::new(iter, out_field.into()))) 39 | .to_arro3(py)?
40 | .unbind(), 41 | ) 42 | } 43 | } 44 | } 45 | 46 | fn _list_offsets(array: ArrayRef, logical: bool) -> Result<ArrayRef, ArrowError> { 47 | let offset = array.offset(); 48 | let length = array.len(); 49 | 50 | match array.data_type() { 51 | DataType::List(_) => { 52 | let arr = array.as_list::<i32>(); 53 | let offsets = arr.offsets(); 54 | let offsets = if logical { 55 | slice_offsets(offsets, offset, length) 56 | } else { 57 | offsets.clone().into_inner() 58 | }; 59 | Ok(Arc::new(Int32Array::new(offsets, None))) 60 | } 61 | DataType::LargeList(_) => { 62 | let arr = array.as_list::<i64>(); 63 | let offsets = arr.offsets(); 64 | let offsets = if logical { 65 | slice_offsets(offsets, offset, length) 66 | } else { 67 | offsets.clone().into_inner() 68 | }; 69 | Ok(Arc::new(Int64Array::new(offsets, None))) 70 | } 71 | _ => Err(ArrowError::SchemaError( 72 | "Expected list-typed Array".to_string(), 73 | )), 74 | } 75 | } 76 | 77 | fn slice_offsets<O: OffsetSizeTrait>( 78 | offsets: &OffsetBuffer<O>, 79 | offset: usize, 80 | length: usize, 81 | ) -> ScalarBuffer<O> { 82 | let sliced = offsets.slice(offset, length); 83 | let first_offset = sliced.first().copied().unwrap_or(O::zero()); 84 | if first_offset.to_usize().unwrap() == 0 { 85 | sliced.into_inner() 86 | } else { 87 | let mut new_offsets = Vec::with_capacity(sliced.len()); 88 | for value in sliced.iter() { 89 | new_offsets.push(*value - first_offset); 90 | } 91 | ScalarBuffer::from(new_offsets) 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /arro3-io/python/arro3/io/_pyo3_object_store.pyi: -------------------------------------------------------------------------------- 1 | # TODO: move this to a standalone package/docs website that can be shared across 2 | # multiple python packages. 3 | 4 | from __future__ import annotations 5 | 6 | from datetime import timedelta 7 | from typing import Dict, TypedDict 8 | 9 | import boto3 10 | import botocore 11 | import botocore.session 12 | 13 | class BackoffConfig(TypedDict): 14 | init_backoff: timedelta 15 | max_backoff: timedelta 16 | base: int | float 17 | 18 | class RetryConfig(TypedDict): 19 | backoff: BackoffConfig 20 | max_retries: int 21 | retry_timeout: timedelta 22 | 23 | class AzureStore: 24 | @classmethod 25 | def from_env( 26 | cls, 27 | container: str, 28 | *, 29 | config: Dict[str, str] | None = None, 30 | client_options: Dict[str, str] | None = None, 31 | retry_config: RetryConfig | None = None, 32 | ) -> AzureStore: ... 33 | @classmethod 34 | def from_url( 35 | cls, 36 | url: str, 37 | *, 38 | config: Dict[str, str] | None = None, 39 | client_options: Dict[str, str] | None = None, 40 | retry_config: RetryConfig | None = None, 41 | ) -> AzureStore: ... 42 | 43 | class GCSStore: 44 | @classmethod 45 | def from_env( 46 | cls, 47 | bucket: str, 48 | *, 49 | config: Dict[str, str] | None = None, 50 | client_options: Dict[str, str] | None = None, 51 | retry_config: RetryConfig | None = None, 52 | ) -> GCSStore: ... 53 | @classmethod 54 | def from_url( 55 | cls, 56 | url: str, 57 | *, 58 | config: Dict[str, str] | None = None, 59 | client_options: Dict[str, str] | None = None, 60 | retry_config: RetryConfig | None = None, 61 | ) -> GCSStore: ... 62 | 63 | class HTTPStore: 64 | @classmethod 65 | def from_url( 66 | cls, 67 | url: str, 68 | *, 69 | client_options: Dict[str, str] | None = None, 70 | retry_config: RetryConfig | None = None, 71 | ) -> HTTPStore: ...
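# A minimal construction sketch (the bucket name and URL below are hypothetical;
# only the classmethods stubbed above are assumed):
#
#     store = GCSStore.from_env("my-bucket")
#     http_store = HTTPStore.from_url("https://example.com/data")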
72 | 73 | class S3Store: 74 | @classmethod 75 | def from_env( 76 | cls, 77 | bucket: str, 78 | *, 79 | config: Dict[str, str] | None = None, 80 | client_options: Dict[str, str] | None = None, 81 | retry_config: RetryConfig | None = None, 82 | ) -> S3Store: ... 83 | @classmethod 84 | def from_session( 85 | cls, 86 | session: boto3.Session | botocore.session.Session, 87 | bucket: str, 88 | *, 89 | config: Dict[str, str] | None = None, 90 | client_options: Dict[str, str] | None = None, 91 | retry_config: RetryConfig | None = None, 92 | ) -> S3Store: ... 93 | @classmethod 94 | def from_url( 95 | cls, 96 | url: str, 97 | *, 98 | config: Dict[str, str] | None = None, 99 | client_options: Dict[str, str] | None = None, 100 | retry_config: RetryConfig | None = None, 101 | ) -> S3Store: ... 102 | 103 | class LocalStore: 104 | """ 105 | Local filesystem storage providing an ObjectStore interface to files on local disk. 106 | Can optionally be created with a directory prefix. 107 | 108 | """ 109 | def __init__(self, prefix: str | None = None) -> None: ... 110 | 111 | class MemoryStore: 112 | """A fully in-memory implementation of ObjectStore.""" 113 | def __init__(self) -> None: ... 114 | 115 | ObjectStore = AzureStore | GCSStore | HTTPStore | S3Store | LocalStore | MemoryStore 116 | -------------------------------------------------------------------------------- /arro3-core/python/arro3/core/_record_batch_reader.pyi: -------------------------------------------------------------------------------- 1 | from typing import Sequence 2 | 3 | from ._record_batch import RecordBatch 4 | from ._schema import Schema 5 | from ._table import Table 6 | from .types import ArrowArrayExportable, ArrowSchemaExportable, ArrowStreamExportable 7 | 8 | class RecordBatchReader: 9 | """An Arrow RecordBatchReader. 10 | 11 | A RecordBatchReader holds a stream of [`RecordBatch`][arro3.core.RecordBatch]. 12 | """ 13 | def __arrow_c_schema__(self) -> object: 14 | """ 15 | An implementation of the [Arrow PyCapsule 16 | Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). 17 | This dunder method should not be called directly, but enables zero-copy data 18 | transfer to other Python libraries that understand Arrow memory. 19 | 20 | This allows Arrow consumers to inspect the data type of this RecordBatchReader. 21 | Then the consumer can ask the producer (in `__arrow_c_stream__`) to cast the 22 | exported data to a supported data type. 23 | """ 24 | def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: 25 | """ 26 | An implementation of the [Arrow PyCapsule 27 | Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). 28 | This dunder method should not be called directly, but enables zero-copy data 29 | transfer to other Python libraries that understand Arrow memory. 30 | 31 | For example, you can call 32 | [`pyarrow.RecordBatchReader.from_stream`][pyarrow.RecordBatchReader.from_stream] 33 | to convert this stream to a pyarrow `RecordBatchReader`. Alternatively, you can 34 | call [`pyarrow.table()`][pyarrow.table] to consume this stream to a pyarrow 35 | table or [`Table.from_arrow()`][arro3.core.Table] to consume this stream to an 36 | arro3 Table. 37 | """ 38 | def __iter__(self) -> RecordBatchReader: ... 39 | def __next__(self) -> RecordBatch: ... 40 | def __repr__(self) -> str: ... 
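    # A minimal usage sketch (assumes pyarrow is installed; the table contents are
    # made up, and `from_stream` is stubbed further down in this class):
    #
    #     import pyarrow as pa
    #     reader = RecordBatchReader.from_stream(pa.table({"a": [1, 2, 3]}))
    #     for batch in reader:
    #         ...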
41 | @classmethod 42 | def from_arrow( 43 | cls, input: ArrowArrayExportable | ArrowStreamExportable 44 | ) -> RecordBatchReader: 45 | """ 46 | Construct this from an existing Arrow object. 47 | 48 | It can be called on anything that exports the Arrow stream interface 49 | (has an `__arrow_c_stream__` method), such as a `Table` or `RecordBatchReader`. 50 | """ 51 | @classmethod 52 | def from_arrow_pycapsule(cls, capsule) -> RecordBatchReader: 53 | """Construct this object from a bare Arrow PyCapsule""" 54 | @classmethod 55 | def from_batches( 56 | cls, schema: ArrowSchemaExportable, batches: Sequence[ArrowArrayExportable] 57 | ) -> RecordBatchReader: 58 | """Construct a new RecordBatchReader from existing data. 59 | 60 | Args: 61 | schema: The schema of the Arrow batches. 62 | batches: The existing batches. 63 | """ 64 | @classmethod 65 | def from_stream(cls, data: ArrowStreamExportable) -> RecordBatchReader: 66 | """Import a RecordBatchReader from an object that exports an Arrow C Stream.""" 67 | @property 68 | def closed(self) -> bool: 69 | """Returns `true` if this reader has already been consumed.""" 70 | def read_all(self) -> Table: 71 | """Read all batches into a Table.""" 72 | def read_next_batch(self) -> RecordBatch: 73 | """Read the next batch in the stream.""" 74 | @property 75 | def schema(self) -> Schema: 76 | """Access the schema of this table.""" 77 | -------------------------------------------------------------------------------- /arro3-core/python/arro3/core/_array_reader.pyi: -------------------------------------------------------------------------------- 1 | from typing import Sequence 2 | 3 | from ._array import Array 4 | from ._chunked_array import ChunkedArray 5 | from ._field import Field 6 | from .types import ArrowArrayExportable, ArrowSchemaExportable, ArrowStreamExportable 7 | 8 | class ArrayReader: 9 | """A stream of Arrow `Array`s. 10 | 11 | This is similar to the [`RecordBatchReader`][arro3.core.RecordBatchReader] but each 12 | item yielded from the stream is an [`Array`][arro3.core.Array], not a 13 | [`RecordBatch`][arro3.core.RecordBatch]. 14 | """ 15 | def __arrow_c_schema__(self) -> object: 16 | """ 17 | An implementation of the [Arrow PyCapsule 18 | Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). 19 | This dunder method should not be called directly, but enables zero-copy data 20 | transfer to other Python libraries that understand Arrow memory. 21 | 22 | This allows Arrow consumers to inspect the data type of this ArrayReader. Then 23 | the consumer can ask the producer (in `__arrow_c_stream__`) to cast the exported 24 | data to a supported data type. 25 | """ 26 | def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: 27 | """ 28 | An implementation of the [Arrow PyCapsule 29 | Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). 30 | This dunder method should not be called directly, but enables zero-copy data 31 | transfer to other Python libraries that understand Arrow memory. 32 | 33 | For example, you can call [`pyarrow.chunked_array()`][pyarrow.chunked_array] to 34 | convert this ArrayReader to a pyarrow ChunkedArray, without copying memory. 35 | """ 36 | def __iter__(self) -> ArrayReader: ... 37 | def __next__(self) -> Array: ... 38 | def __repr__(self) -> str: ... 39 | @classmethod 40 | def from_arrow( 41 | cls, input: ArrowArrayExportable | ArrowStreamExportable 42 | ) -> ArrayReader: 43 | """Construct this from an existing Arrow object. 
44 | 45 | It can be called on anything that exports the Arrow stream interface 46 | (has an `__arrow_c_stream__` method), such as a `Table` or `ArrayReader`. 47 | """ 48 | @classmethod 49 | def from_arrow_pycapsule(cls, capsule) -> ArrayReader: 50 | """Construct this object from a bare Arrow PyCapsule""" 51 | @classmethod 52 | def from_arrays( 53 | cls, field: ArrowSchemaExportable, arrays: Sequence[ArrowArrayExportable] 54 | ) -> ArrayReader: 55 | """Construct an ArrayReader from existing data. 56 | 57 | Args: 58 | field: The Arrow field that describes the sequence of array data. 59 | arrays: A sequence (list or tuple) of Array data. 60 | """ 61 | @classmethod 62 | def from_stream(cls, data: ArrowStreamExportable) -> ArrayReader: 63 | """Construct this from an existing Arrow object. 64 | 65 | This is an alias of and has the same behavior as 66 | [`from_arrow`][arro3.core.ArrayReader.from_arrow], but is included for parity 67 | with [`pyarrow.RecordBatchReader`][pyarrow.RecordBatchReader]. 68 | """ 69 | @property 70 | def closed(self) -> bool: 71 | """Returns `true` if this reader has already been consumed.""" 72 | def read_all(self) -> ChunkedArray: 73 | """Read all batches from this stream into a ChunkedArray.""" 74 | def read_next_array(self) -> Array: 75 | """Read the next array from this stream.""" 76 | @property 77 | def field(self) -> Field: 78 | """Access the field of this reader.""" 79 | -------------------------------------------------------------------------------- /arro3-io/src/ipc.rs: -------------------------------------------------------------------------------- 1 | use std::io::{BufReader, BufWriter}; 2 | 3 | use arrow_ipc::reader::{FileReaderBuilder, StreamReader}; 4 | use arrow_ipc::writer::IpcWriteOptions; 5 | use pyo3::exceptions::PyValueError; 6 | use pyo3::prelude::*; 7 | use pyo3_arrow::error::PyArrowResult; 8 | use pyo3_arrow::export::Arro3RecordBatchReader; 9 | use pyo3_arrow::input::AnyRecordBatch; 10 | use pyo3_arrow::PyRecordBatchReader; 11 | 12 | use crate::utils::{FileReader, FileWriter}; 13 | 14 | /// Read an Arrow IPC file to an Arrow RecordBatchReader 15 | #[pyfunction] 16 | pub fn read_ipc(file: FileReader) -> PyArrowResult<Arro3RecordBatchReader> { 17 | let builder = FileReaderBuilder::new(); 18 | let buf_file = BufReader::new(file); 19 | let reader = builder.build(buf_file)?; 20 | Ok(PyRecordBatchReader::new(Box::new(reader)).into()) 21 | } 22 | 23 | /// Read an Arrow IPC Stream file to an Arrow RecordBatchReader 24 | #[pyfunction] 25 | pub fn read_ipc_stream(file: FileReader) -> PyArrowResult<Arro3RecordBatchReader> { 26 | let reader = StreamReader::try_new(file, None)?; 27 | Ok(PyRecordBatchReader::new(Box::new(reader)).into()) 28 | } 29 | 30 | #[allow(clippy::upper_case_acronyms)] 31 | pub enum IpcCompression { 32 | LZ4, 33 | ZSTD, 34 | } 35 | 36 | impl<'py> FromPyObject<'py> for IpcCompression { 37 | fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult<Self> { 38 | let s: String = ob.extract()?; 39 | match s.to_lowercase().as_str() { 40 | "lz4" | "lz4_frame" | "lz4frame" => Ok(Self::LZ4), 41 | "zstd" => Ok(Self::ZSTD), 42 | _ => Err(PyValueError::new_err( 43 | "Unexpected compression.
Should be one of 'LZ4', 'ZSTD'.", 44 | )), 45 | } 46 | } 47 | } 48 | 49 | impl From<IpcCompression> for arrow_ipc::CompressionType { 50 | fn from(value: IpcCompression) -> Self { 51 | match value { 52 | IpcCompression::LZ4 => Self::LZ4_FRAME, 53 | IpcCompression::ZSTD => Self::ZSTD, 54 | } 55 | } 56 | } 57 | 58 | /// Write an Arrow Table or stream to an IPC File 59 | #[pyfunction] 60 | #[pyo3( 61 | signature = (data, file, *, compression = IpcCompression::LZ4), 62 | text_signature = "(data, file, *, compression = 'LZ4')") 63 | ] 64 | pub fn write_ipc( 65 | data: AnyRecordBatch, 66 | file: FileWriter, 67 | compression: Option<IpcCompression>, 68 | ) -> PyArrowResult<()> { 69 | let buf_writer = BufWriter::new(file); 70 | let reader = data.into_reader()?; 71 | let options = IpcWriteOptions::default().try_with_compression(compression.map(|x| x.into()))?; 72 | let mut writer = 73 | arrow_ipc::writer::FileWriter::try_new_with_options(buf_writer, &reader.schema(), options)?; 74 | for batch in reader { 75 | writer.write(&batch?)?; 76 | } 77 | writer.finish()?; 78 | Ok(()) 79 | } 80 | 81 | /// Write an Arrow Table or stream to an IPC Stream 82 | #[pyfunction] 83 | #[pyo3( 84 | signature = (data, file, *, compression = IpcCompression::LZ4), 85 | text_signature = "(data, file, *, compression = 'LZ4')") 86 | ] 87 | pub fn write_ipc_stream( 88 | data: AnyRecordBatch, 89 | file: FileWriter, 90 | compression: Option<IpcCompression>, 91 | ) -> PyArrowResult<()> { 92 | let buf_writer = BufWriter::new(file); 93 | let reader = data.into_reader()?; 94 | let options = IpcWriteOptions::default().try_with_compression(compression.map(|x| x.into()))?; 95 | let mut writer = arrow_ipc::writer::StreamWriter::try_new_with_options( 96 | buf_writer, 97 | &reader.schema(), 98 | options, 99 | )?; 100 | for batch in reader { 101 | writer.write(&batch?)?; 102 | } 103 | writer.finish()?; 104 | Ok(()) 105 | } 106 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/from_python/ffi_stream.rs: -------------------------------------------------------------------------------- 1 | //! A custom implementation of ArrowArrayStreamReader to support ChunkedArrays: a stream of arrays 2 | //! of any data type that is not expected to represent record batches. 3 | //! 4 | //! This is derived from 5 | //! 6 | 7 | use std::ffi::CStr; 8 | use std::sync::Arc; 9 | 10 | use arrow_array::ffi::{from_ffi_and_data_type, FFI_ArrowArray, FFI_ArrowSchema}; 11 | use arrow_array::ffi_stream::FFI_ArrowArrayStream; 12 | use arrow_array::{make_array, Array}; 13 | use arrow_schema::{ArrowError, Field, FieldRef}; 14 | 15 | use crate::ffi::ArrayReader; 16 | 17 | #[derive(Debug)] 18 | pub struct ArrowArrayStreamReader { 19 | stream: FFI_ArrowArrayStream, 20 | field: FieldRef, 21 | } 22 | 23 | /// Gets schema from a raw pointer of `FFI_ArrowArrayStream`. This is used when constructing 24 | /// `ArrowArrayStreamReader` to cache schema. 25 | fn get_stream_schema(stream_ptr: *mut FFI_ArrowArrayStream) -> Result<FieldRef, ArrowError> { 26 | let mut schema = FFI_ArrowSchema::empty(); 27 | 28 | let ret_code = unsafe { (*stream_ptr).get_schema.unwrap()(stream_ptr, &mut schema) }; 29 | 30 | if ret_code == 0 { 31 | let field = Field::try_from(&schema)?; 32 | Ok(Arc::new(field)) 33 | } else { 34 | Err(ArrowError::CDataInterface(format!( 35 | "Cannot get schema from input stream. Error code: {ret_code:?}" 36 | ))) 37 | } 38 | } 39 | 40 | impl ArrowArrayStreamReader { 41 | /// Creates a new `ArrowArrayStreamReader` from a `FFI_ArrowArrayStream`.
42 | /// This is used to import from the C Stream Interface. 43 | #[allow(dead_code)] 44 | pub fn try_new(mut stream: FFI_ArrowArrayStream) -> Result<Self, ArrowError> { 45 | if stream.release.is_none() { 46 | return Err(ArrowError::CDataInterface( 47 | "input stream is already released".to_string(), 48 | )); 49 | } 50 | 51 | let field = get_stream_schema(&mut stream)?; 52 | 53 | Ok(Self { stream, field }) 54 | } 55 | 56 | pub fn field(&self) -> FieldRef { 57 | self.field.clone() 58 | } 59 | 60 | /// Get the last error from `ArrowArrayStreamReader` 61 | fn get_stream_last_error(&mut self) -> Option<String> { 62 | let get_last_error = self.stream.get_last_error?; 63 | 64 | let error_str = unsafe { get_last_error(&mut self.stream) }; 65 | if error_str.is_null() { 66 | return None; 67 | } 68 | 69 | let error_str = unsafe { CStr::from_ptr(error_str) }; 70 | Some(error_str.to_string_lossy().to_string()) 71 | } 72 | } 73 | 74 | impl Iterator for ArrowArrayStreamReader { 75 | type Item = Result<Arc<dyn Array>, ArrowError>; 76 | 77 | fn next(&mut self) -> Option<Self::Item> { 78 | let mut array = FFI_ArrowArray::empty(); 79 | 80 | let ret_code = unsafe { self.stream.get_next.unwrap()(&mut self.stream, &mut array) }; 81 | 82 | if ret_code == 0 { 83 | // The end of stream has been reached 84 | if array.is_released() { 85 | return None; 86 | } 87 | 88 | let result = unsafe { from_ffi_and_data_type(array, self.field().data_type().clone()) }; 89 | 90 | Some(result.map(make_array)) 91 | } else { 92 | let last_error = self.get_stream_last_error(); 93 | let err = ArrowError::CDataInterface(last_error.unwrap()); 94 | Some(Err(err)) 95 | } 96 | } 97 | } 98 | 99 | impl ArrayReader for ArrowArrayStreamReader { 100 | fn field(&self) -> FieldRef { 101 | self.field.clone() 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/to_python/utils.rs: -------------------------------------------------------------------------------- 1 | use std::ffi::CString; 2 | use std::sync::Arc; 3 | 4 | use arrow_array::ffi::{FFI_ArrowArray, FFI_ArrowSchema}; 5 | use arrow_array::Array; 6 | use arrow_cast::{can_cast_types, cast}; 7 | use arrow_schema::{ArrowError, Field, FieldRef}; 8 | use pyo3::prelude::*; 9 | use pyo3::types::{PyCapsule, PyTuple}; 10 | 11 | use crate::error::PyArrowResult; 12 | use crate::ffi::from_python::utils::import_schema_pycapsule; 13 | use crate::ffi::to_python::ffi_stream::new_stream; 14 | use crate::ffi::{ArrayIterator, ArrayReader}; 15 | 16 | /// Export a [`arrow_schema::Schema`], [`arrow_schema::Field`], or [`arrow_schema::DataType`] to a 17 | /// PyCapsule holding an Arrow C Schema pointer. 18 | pub fn to_schema_pycapsule( 19 | py: Python, 20 | field: impl TryInto<FFI_ArrowSchema, Error = ArrowError>, 21 | ) -> PyArrowResult<Bound<PyCapsule>> { 22 | let ffi_schema: FFI_ArrowSchema = field.try_into()?; 23 | let schema_capsule_name = CString::new("arrow_schema").unwrap(); 24 | let schema_capsule = PyCapsule::new(py, ffi_schema, Some(schema_capsule_name))?; 25 | Ok(schema_capsule) 26 | } 27 | 28 | /// Export an [`Array`] and [`FieldRef`] to a tuple of PyCapsules holding an Arrow C Schema and 29 | /// Arrow C Array pointers.
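///
/// A sketch of what a Python consumer sees for these two capsules (hypothetical caller
/// code; only the Arrow PyCapsule Interface is assumed):
///
/// ```python
/// schema_capsule, array_capsule = obj.__arrow_c_array__()
/// ```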
30 | pub fn to_array_pycapsules<'py>( 31 | py: Python<'py>, 32 | field: FieldRef, 33 | array: &dyn Array, 34 | requested_schema: Option<Bound<'py, PyCapsule>>, 35 | ) -> PyArrowResult<Bound<'py, PyTuple>> { 36 | // Cast array if requested 37 | let (array_data, field) = if let Some(capsule) = requested_schema { 38 | let schema_ptr = import_schema_pycapsule(&capsule)?; 39 | let output_field = 40 | Arc::new(Field::try_from(schema_ptr)?.with_metadata(field.metadata().clone())); 41 | 42 | // Only cast the array if we can cast the types. 43 | if can_cast_types(field.data_type(), output_field.data_type()) { 44 | let casted_array = cast(array, output_field.data_type())?; 45 | (casted_array.to_data(), output_field) 46 | } else { 47 | (array.to_data(), field) 48 | } 49 | } else { 50 | (array.to_data(), field) 51 | }; 52 | 53 | let ffi_schema = FFI_ArrowSchema::try_from(&field)?; 54 | let ffi_array = FFI_ArrowArray::new(&array_data); 55 | 56 | let schema_capsule_name = CString::new("arrow_schema").unwrap(); 57 | let array_capsule_name = CString::new("arrow_array").unwrap(); 58 | 59 | let schema_capsule = PyCapsule::new(py, ffi_schema, Some(schema_capsule_name))?; 60 | let array_capsule = PyCapsule::new(py, ffi_array, Some(array_capsule_name))?; 61 | let tuple = PyTuple::new(py, vec![schema_capsule, array_capsule])?; 62 | 63 | Ok(tuple) 64 | } 65 | 66 | /// Export an [`ArrayIterator`][crate::ffi::ArrayIterator] to a PyCapsule holding an Arrow C Stream 67 | /// pointer. 68 | pub fn to_stream_pycapsule<'py>( 69 | py: Python<'py>, 70 | mut array_reader: Box<dyn ArrayReader + Send>, 71 | requested_schema: Option<Bound<'py, PyCapsule>>, 72 | ) -> PyArrowResult<Bound<'py, PyCapsule>> { 73 | // Cast array if requested 74 | if let Some(capsule) = requested_schema { 75 | let schema_ptr = import_schema_pycapsule(&capsule)?; 76 | 77 | let existing_field = array_reader.field(); 78 | let output_field = 79 | Arc::new(Field::try_from(schema_ptr)?.with_metadata(existing_field.metadata().clone())); 80 | let iter_field = output_field.clone(); 81 | 82 | // Only cast the reader if we can cast the types. 83 | if can_cast_types(existing_field.data_type(), output_field.data_type()) { 84 | let array_iter = array_reader.map(move |array| { 85 | let out = cast(array?.as_ref(), output_field.data_type())?; 86 | Ok(out) 87 | }); 88 | array_reader = Box::new(ArrayIterator::new(array_iter, iter_field)); 89 | } 90 | } 91 | 92 | let ffi_stream = new_stream(array_reader); 93 | let stream_capsule_name = CString::new("arrow_array_stream").unwrap(); 94 | Ok(PyCapsule::new(py, ffi_stream, Some(stream_capsule_name))?)
95 | } 96 | -------------------------------------------------------------------------------- /arro3-core/src/accessors/dictionary.rs: -------------------------------------------------------------------------------- 1 | use arrow_array::cast::AsArray; 2 | use arrow_array::ArrayRef; 3 | use arrow_schema::{ArrowError, DataType, Field}; 4 | use pyo3::prelude::*; 5 | use pyo3_arrow::error::PyArrowResult; 6 | use pyo3_arrow::ffi::ArrayIterator; 7 | use pyo3_arrow::input::AnyArray; 8 | use pyo3_arrow::{PyArray, PyArrayReader}; 9 | 10 | #[pyfunction] 11 | pub(crate) fn dictionary_indices(py: Python, array: AnyArray) -> PyArrowResult<PyObject> { 12 | match array { 13 | AnyArray::Array(array) => { 14 | let (array, _field) = array.into_inner(); 15 | let output_array = _dictionary_indices(array)?; 16 | Ok(PyArray::from_array_ref(output_array).to_arro3(py)?.unbind()) 17 | } 18 | AnyArray::Stream(stream) => { 19 | let reader = stream.into_reader()?; 20 | let existing_field = reader.field(); 21 | let out_field = match existing_field.data_type() { 22 | DataType::Dictionary(key_type, _value_type) => { 23 | Field::new("", *key_type.clone(), true) 24 | } 25 | _ => { 26 | return Err(ArrowError::ComputeError( 27 | "Expected dictionary-typed Array".to_string(), 28 | ) 29 | .into()) 30 | } 31 | }; 32 | let iter = reader 33 | .into_iter() 34 | .map(move |array| _dictionary_indices(array?)); 35 | Ok( 36 | PyArrayReader::new(Box::new(ArrayIterator::new(iter, out_field.into()))) 37 | .to_arro3(py)? 38 | .unbind(), 39 | ) 40 | } 41 | } 42 | } 43 | 44 | /// Access the dictionary of the dictionary array 45 | /// 46 | /// This is equivalent to the `.dictionary` attribute on a PyArrow DictionaryArray. 47 | #[pyfunction] 48 | pub(crate) fn dictionary_dictionary(py: Python, array: AnyArray) -> PyArrowResult<PyObject> { 49 | match array { 50 | AnyArray::Array(array) => { 51 | let (array, _field) = array.into_inner(); 52 | let output_array = _dictionary_dictionary(array)?; 53 | Ok(PyArray::from_array_ref(output_array).to_arro3(py)?.unbind()) 54 | } 55 | AnyArray::Stream(stream) => { 56 | let reader = stream.into_reader()?; 57 | let existing_field = reader.field(); 58 | let out_field = match existing_field.data_type() { 59 | DataType::Dictionary(_key_type, value_type) => { 60 | Field::new("", *value_type.clone(), true) 61 | } 62 | _ => { 63 | return Err(ArrowError::ComputeError( 64 | "Expected dictionary-typed Array".to_string(), 65 | ) 66 | .into()) 67 | } 68 | }; 69 | let iter = reader 70 | .into_iter() 71 | .map(move |array| _dictionary_dictionary(array?)); 72 | Ok( 73 | PyArrayReader::new(Box::new(ArrayIterator::new(iter, out_field.into()))) 74 | .to_arro3(py)?
75 | .unbind(), 76 | ) 77 | } 78 | } 79 | } 80 | 81 | fn _dictionary_indices(array: ArrayRef) -> Result<ArrayRef, ArrowError> { 82 | match array.data_type() { 83 | DataType::Dictionary(_, _) => { 84 | let dict_arr = array.as_any_dictionary(); 85 | let keys_arr = dict_arr.keys(); 86 | let keys_arr_ref = keys_arr.slice(0, keys_arr.len()); 87 | Ok(keys_arr_ref) 88 | } 89 | _ => Err(ArrowError::ComputeError( 90 | "Expected dictionary-typed Array".to_string(), 91 | )), 92 | } 93 | } 94 | 95 | fn _dictionary_dictionary(array: ArrayRef) -> Result<ArrayRef, ArrowError> { 96 | match array.data_type() { 97 | DataType::Dictionary(_, _) => { 98 | let dict_arr = array.as_any_dictionary(); 99 | let values_arr = dict_arr.values().clone(); 100 | Ok(values_arr) 101 | } 102 | _ => Err(ArrowError::ComputeError( 103 | "Expected dictionary-typed Array".to_string(), 104 | )), 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /arro3-core/python/arro3/core/types.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import array as _array 4 | import mmap 5 | import sys 6 | from typing import TYPE_CHECKING, Protocol, Tuple, Union 7 | 8 | if sys.version_info >= (3, 12): 9 | from collections.abc import Buffer as _Buffer 10 | else: 11 | from typing_extensions import Buffer as _Buffer 12 | 13 | if TYPE_CHECKING: 14 | import numpy as np 15 | 16 | 17 | class ArrowSchemaExportable(Protocol): 18 | """ 19 | An object with an `__arrow_c_schema__` method implementing the [Arrow C Data 20 | Interface](https://arrow.apache.org/docs/format/CDataInterface.html) via the 21 | [Arrow PyCapsule 22 | Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). 23 | 24 | Such objects include: 25 | 26 | - arro3 [`Schema`][arro3.core.Schema], [`Field`][arro3.core.Field], or [`DataType`][arro3.core.DataType] objects. 27 | - pyarrow [`Schema`][pyarrow.Schema], [`Field`][pyarrow.Field], or [`DataType`][pyarrow.DataType] objects. 28 | 29 | This allows for zero-copy Arrow data interchange across libraries. 30 | """ 31 | 32 | def __arrow_c_schema__(self) -> object: ... 33 | 34 | 35 | class ArrowArrayExportable(Protocol): 36 | """ 37 | An object with an `__arrow_c_array__` method implementing the [Arrow C Data 38 | Interface](https://arrow.apache.org/docs/format/CDataInterface.html) via the 39 | [Arrow PyCapsule 40 | Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). 41 | 42 | Such objects include: 43 | 44 | - arro3 [`Array`][arro3.core.Array] or [`RecordBatch`][arro3.core.RecordBatch] objects. 45 | - pyarrow [`Array`][pyarrow.Array] or [`RecordBatch`][pyarrow.RecordBatch] objects. 46 | 47 | This allows for zero-copy Arrow data interchange across libraries. 48 | """ 49 | 50 | def __arrow_c_array__( 51 | self, requested_schema: object | None = None 52 | ) -> Tuple[object, object]: ... 53 | 54 | 55 | class ArrowStreamExportable(Protocol): 56 | """ 57 | An object with an `__arrow_c_stream__` method implementing the [Arrow C Stream 58 | interface](https://arrow.apache.org/docs/format/CStreamInterface.html) via the 59 | [Arrow PyCapsule 60 | Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). 61 | 62 | Supported objects include: 63 | 64 | - arro3 [`Table`][arro3.core.Table], [`RecordBatchReader`][arro3.core.RecordBatchReader], [`ChunkedArray`][arro3.core.ChunkedArray], or [`ArrayReader`][arro3.core.ArrayReader] objects.
65 | - Polars `Series` or `DataFrame` objects (polars v1.2 or higher) 66 | - DuckDB table-like objects, such as [`DuckDBPyRelation`][duckdb.DuckDBPyRelation] or [`DuckDBPyConnection`][duckdb.DuckDBPyConnection]. 67 | - pyarrow [`RecordBatchReader`][pyarrow.RecordBatchReader], [`Table`][pyarrow.Table], or [`ChunkedArray`][pyarrow.ChunkedArray] objects (pyarrow v14 or 68 | higher) 69 | - pandas [`DataFrame`][pandas.DataFrame]s (pandas v2.2 or higher) 70 | - ibis `Table` objects. 71 | 72 | This allows for zero-copy Arrow data interchange across libraries. 73 | 74 | For an up to date list of supported objects, see [this 75 | issue](https://github.com/apache/arrow/issues/39195#issuecomment-2245718008). 76 | """ 77 | 78 | def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: ... 79 | 80 | 81 | # From numpy 82 | # https://github.com/numpy/numpy/blob/961b70f6aaeed67147245b56ddb3f12ed1a050b5/numpy/__init__.pyi#L1772C1-L1785C1 83 | if sys.version_info >= (3, 12): 84 | from collections.abc import Buffer as _SupportsBuffer 85 | else: 86 | _SupportsBuffer = Union[ 87 | bytes, 88 | bytearray, 89 | memoryview, 90 | _array.array, 91 | mmap.mmap, 92 | "np.ndarray", 93 | _Buffer, 94 | ] 95 | 96 | 97 | # Numpy arrays don't yet declare `__buffer__` (or maybe just on a very recent version) 98 | ArrayInput = Union[ArrowArrayExportable, _SupportsBuffer] 99 | """Accepted input as an Arrow array. 100 | 101 | Buffer protocol input (such as numpy arrays) will be interpreted zero-copy except in the 102 | case of boolean-typed input, which must be copied to the Arrow format. 103 | """ 104 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.parquet 2 | *.whl 3 | *.arrow 4 | *.arrows 5 | 6 | # Generated by Cargo 7 | # will have compiled files and executables 8 | debug/ 9 | target/ 10 | 11 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 12 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 13 | # Cargo.lock 14 | 15 | # These are backup files generated by rustfmt 16 | **/*.rs.bk 17 | 18 | # MSVC Windows builds of rustc generate these, which store debugging information 19 | *.pdb 20 | 21 | # Byte-compiled / optimized / DLL files 22 | __pycache__/ 23 | *.py[cod] 24 | *$py.class 25 | 26 | # C extensions 27 | *.so 28 | 29 | # Distribution / packaging 30 | .Python 31 | build/ 32 | develop-eggs/ 33 | dist/ 34 | downloads/ 35 | eggs/ 36 | .eggs/ 37 | lib/ 38 | lib64/ 39 | parts/ 40 | sdist/ 41 | var/ 42 | wheels/ 43 | share/python-wheels/ 44 | *.egg-info/ 45 | .installed.cfg 46 | *.egg 47 | MANIFEST 48 | 49 | # PyInstaller 50 | # Usually these files are written by a python script from a template 51 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
52 | *.manifest 53 | *.spec 54 | 55 | # Installer logs 56 | pip-log.txt 57 | pip-delete-this-directory.txt 58 | 59 | # Unit test / coverage reports 60 | htmlcov/ 61 | .tox/ 62 | .nox/ 63 | .coverage 64 | .coverage.* 65 | .cache 66 | nosetests.xml 67 | coverage.xml 68 | *.cover 69 | *.py,cover 70 | .hypothesis/ 71 | .pytest_cache/ 72 | cover/ 73 | 74 | # Translations 75 | *.mo 76 | *.pot 77 | 78 | # Django stuff: 79 | *.log 80 | local_settings.py 81 | db.sqlite3 82 | db.sqlite3-journal 83 | 84 | # Flask stuff: 85 | instance/ 86 | .webassets-cache 87 | 88 | # Scrapy stuff: 89 | .scrapy 90 | 91 | # Sphinx documentation 92 | docs/_build/ 93 | 94 | # PyBuilder 95 | .pybuilder/ 96 | target/ 97 | 98 | # Jupyter Notebook 99 | .ipynb_checkpoints 100 | 101 | # IPython 102 | profile_default/ 103 | ipython_config.py 104 | 105 | # pyenv 106 | # For a library or package, you might want to ignore these files since the code is 107 | # intended to run in multiple environments; otherwise, check them in: 108 | # .python-version 109 | 110 | # pipenv 111 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 112 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 113 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 114 | # install all needed dependencies. 115 | #Pipfile.lock 116 | 117 | # poetry 118 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 119 | # This is especially recommended for binary packages to ensure reproducibility, and is more 120 | # commonly ignored for libraries. 121 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 122 | #poetry.lock 123 | 124 | # pdm 125 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 126 | #pdm.lock 127 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 128 | # in version control. 129 | # https://pdm.fming.dev/#use-with-ide 130 | .pdm.toml 131 | 132 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 133 | __pypackages__/ 134 | 135 | # Celery stuff 136 | celerybeat-schedule 137 | celerybeat.pid 138 | 139 | # SageMath parsed files 140 | *.sage.py 141 | 142 | # Environments 143 | .env 144 | .venv 145 | env/ 146 | venv/ 147 | ENV/ 148 | env.bak/ 149 | venv.bak/ 150 | 151 | # Spyder project settings 152 | .spyderproject 153 | .spyproject 154 | 155 | # Rope project settings 156 | .ropeproject 157 | 158 | # mkdocs documentation 159 | /site 160 | 161 | # mypy 162 | .mypy_cache/ 163 | .dmypy.json 164 | dmypy.json 165 | 166 | # Pyre type checker 167 | .pyre/ 168 | 169 | # pytype static type analyzer 170 | .pytype/ 171 | 172 | # Cython debug symbols 173 | cython_debug/ 174 | 175 | # PyCharm 176 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 177 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 178 | # and can be added to the global gitignore or merged into this file. For a more nuclear 179 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
180 | #.idea/ 181 | -------------------------------------------------------------------------------- /pyo3-arrow/src/ffi/from_python/utils.rs: -------------------------------------------------------------------------------- 1 | use arrow_array::ffi::{FFI_ArrowArray, FFI_ArrowSchema}; 2 | use arrow_array::ffi_stream::FFI_ArrowArrayStream; 3 | use arrow_array::{make_array, ArrayRef}; 4 | use arrow_schema::Field; 5 | use pyo3::exceptions::{PyTypeError, PyValueError}; 6 | use pyo3::prelude::*; 7 | use pyo3::types::{PyCapsule, PyTuple}; 8 | use pyo3::{intern, PyAny, PyResult}; 9 | 10 | /// Validate PyCapsule has provided name 11 | pub fn validate_pycapsule_name(capsule: &Bound<PyCapsule>, expected_name: &str) -> PyResult<()> { 12 | let capsule_name = capsule.name()?; 13 | if let Some(capsule_name) = capsule_name { 14 | let capsule_name = capsule_name.to_str()?; 15 | if capsule_name != expected_name { 16 | return Err(PyValueError::new_err(format!( 17 | "Expected name '{}' in PyCapsule, instead got '{}'", 18 | expected_name, capsule_name 19 | ))); 20 | } 21 | } else { 22 | return Err(PyValueError::new_err( 23 | "Expected schema PyCapsule to have name set.", 24 | )); 25 | } 26 | 27 | Ok(()) 28 | } 29 | 30 | /// Import `__arrow_c_schema__` across Python boundary 31 | pub(crate) fn call_arrow_c_schema<'py>(ob: &'py Bound<PyAny>) -> PyResult<Bound<'py, PyCapsule>> { 32 | let py_str = intern!(ob.py(), "__arrow_c_schema__"); 33 | if !ob.hasattr(py_str)? { 34 | return Err(PyValueError::new_err( 35 | "Expected an object with dunder __arrow_c_schema__", 36 | )); 37 | } 38 | 39 | Ok(ob.getattr(py_str)?.call0()?.downcast_into()?) 40 | } 41 | 42 | pub(crate) fn import_schema_pycapsule<'py>( 43 | capsule: &'py Bound<PyCapsule>, 44 | ) -> PyResult<&'py FFI_ArrowSchema> { 45 | validate_pycapsule_name(capsule, "arrow_schema")?; 46 | 47 | let schema_ptr = unsafe { capsule.reference::<FFI_ArrowSchema>() }; 48 | Ok(schema_ptr) 49 | } 50 | 51 | /// Import `__arrow_c_array__` across Python boundary 52 | pub(crate) fn call_arrow_c_array<'py>( 53 | ob: &'py Bound<PyAny>, 54 | ) -> PyResult<(Bound<'py, PyCapsule>, Bound<'py, PyCapsule>)> { 55 | let py_str = intern!(ob.py(), "__arrow_c_array__"); 56 | if !ob.hasattr(py_str)? { 57 | return Err(PyValueError::new_err( 58 | "Expected an object with dunder __arrow_c_array__", 59 | )); 60 | } 61 | 62 | let tuple = ob.getattr(py_str)?.call0()?; 63 | if !tuple.is_instance_of::<PyTuple>() { 64 | return Err(PyTypeError::new_err( 65 | "Expected __arrow_c_array__ to return a tuple.", 66 | )); 67 | } 68 | 69 | let schema_capsule = tuple.get_item(0)?.downcast_into()?; 70 | let array_capsule = tuple.get_item(1)?.downcast_into()?; 71 | Ok((schema_capsule, array_capsule)) 72 | } 73 | 74 | pub(crate) fn import_array_pycapsules( 75 | schema_capsule: &Bound<PyCapsule>, 76 | array_capsule: &Bound<PyCapsule>, 77 | ) -> PyResult<(ArrayRef, Field)> { 78 | validate_pycapsule_name(schema_capsule, "arrow_schema")?; 79 | validate_pycapsule_name(array_capsule, "arrow_array")?; 80 | 81 | let schema_ptr = unsafe { schema_capsule.reference::<FFI_ArrowSchema>() }; 82 | let array = unsafe { FFI_ArrowArray::from_raw(array_capsule.pointer() as _) }; 83 | 84 | let array_data = unsafe { arrow_array::ffi::from_ffi(array, schema_ptr) } 85 | .map_err(|err| PyTypeError::new_err(err.to_string()))?; 86 | let field = Field::try_from(schema_ptr).map_err(|err| PyTypeError::new_err(err.to_string()))?; 87 | let array = make_array(array_data); 88 | Ok((array, field)) 89 | } 90 | 91 | /// Import `__arrow_c_stream__` across Python boundary.
91 | /// Import `__arrow_c_stream__` across Python boundary.
92 | pub(crate) fn call_arrow_c_stream<'py>(ob: &'py Bound<'py, PyAny>) -> PyResult<Bound<'py, PyCapsule>> {
93 |     let py_str = intern!(ob.py(), "__arrow_c_stream__");
94 |     if !ob.hasattr(py_str)? {
95 |         return Err(PyValueError::new_err(
96 |             "Expected an object with dunder __arrow_c_stream__",
97 |         ));
98 |     }
99 |
100 |     let capsule = ob.getattr(py_str)?.call0()?.downcast_into()?;
101 |     Ok(capsule)
102 | }
103 |
104 | pub(crate) fn import_stream_pycapsule(
105 |     capsule: &Bound<PyCapsule>,
106 | ) -> PyResult<FFI_ArrowArrayStream> {
107 |     validate_pycapsule_name(capsule, "arrow_array_stream")?;
108 |
109 |     let stream = unsafe { FFI_ArrowArrayStream::from_raw(capsule.pointer() as _) };
110 |     Ok(stream)
111 | }
112 |
--------------------------------------------------------------------------------
/pyo3-arrow/src/interop/numpy/to_numpy.rs:
--------------------------------------------------------------------------------
1 | use arrow_array::cast::AsArray;
2 | use arrow_array::types::*;
3 | use arrow_array::{Array, BinaryArrayType, StringArrayType};
4 | use arrow_schema::DataType;
5 | use numpy::ToPyArray;
6 | use pyo3::exceptions::{PyNotImplementedError, PyValueError};
7 | use pyo3::prelude::*;
8 | use pyo3::types::{PyAnyMethods, PyBytes, PyDict, PyList, PyString, PyTuple};
9 | use pyo3::{intern, PyResult, Python};
10 |
11 | pub fn to_numpy<'py>(py: Python<'py>, arr: &'py dyn Array) -> PyResult<Bound<'py, PyAny>> {
12 |     if arr.null_count() > 0 {
13 |         return Err(PyValueError::new_err(
14 |             "Cannot create numpy array from arrow array with nulls.",
15 |         ));
16 |     }
17 |
18 |     macro_rules! impl_primitive {
19 |         ($arrow_type:ty) => {
20 |             arr.as_primitive::<$arrow_type>()
21 |                 .values()
22 |                 .to_pyarray(py)
23 |                 .into_any()
24 |         };
25 |     }
26 |
27 |     let result = match arr.data_type() {
28 |         DataType::Float16 => impl_primitive!(Float16Type),
29 |         DataType::Float32 => impl_primitive!(Float32Type),
30 |         DataType::Float64 => impl_primitive!(Float64Type),
31 |         DataType::UInt8 => impl_primitive!(UInt8Type),
32 |         DataType::UInt16 => impl_primitive!(UInt16Type),
33 |         DataType::UInt32 => impl_primitive!(UInt32Type),
34 |         DataType::UInt64 => impl_primitive!(UInt64Type),
35 |         DataType::Int8 => impl_primitive!(Int8Type),
36 |         DataType::Int16 => impl_primitive!(Int16Type),
37 |         DataType::Int32 => impl_primitive!(Int32Type),
38 |         DataType::Int64 => impl_primitive!(Int64Type),
39 |         DataType::Boolean => {
40 |             let bools = arr.as_boolean().values().iter().collect::<Vec<_>>();
41 |             bools.to_pyarray(py).into_any()
42 |         }
43 |         // For other data types we create Python objects and then create an object-typed numpy
44 |         // array
45 |         DataType::Binary => binary_to_numpy(py, arr.as_binary::<i32>())?,
46 |         DataType::LargeBinary => binary_to_numpy(py, arr.as_binary::<i64>())?,
47 |         DataType::BinaryView => binary_to_numpy(py, arr.as_binary_view())?,
48 |         DataType::Utf8 => string_to_numpy(py, arr.as_string::<i32>())?,
49 |         DataType::LargeUtf8 => string_to_numpy(py, arr.as_string::<i64>())?,
50 |         DataType::Utf8View => string_to_numpy(py, arr.as_string_view())?,
51 |         dt => {
52 |             return Err(PyNotImplementedError::new_err(format!(
53 |                 "Unsupported type in to_numpy {dt}"
54 |             )))
55 |         }
56 |     };
57 |     Ok(result)
58 | }
59 |
60 | fn binary_to_numpy<'a>(
61 |     py: Python<'a>,
62 |     arr: impl BinaryArrayType<'a>,
63 | ) -> PyResult<Bound<'a, PyAny>> {
64 |     let mut py_bytes = Vec::with_capacity(arr.len());
65 |     arr.iter()
66 |         .for_each(|x| py_bytes.push(PyBytes::new(py, x.unwrap())));
67 |     let py_list = PyList::new(py, py_bytes)?;
68 |     let numpy_mod = py.import(intern!(py, "numpy"))?;
69 |     let kwargs = PyDict::new(py);
70 |     kwargs.set_item("dtype", numpy_mod.getattr(intern!(py, "object_"))?)?;
71 |     numpy_mod.call_method(
72 |         intern!(py, "array"),
73 |         PyTuple::new(py, vec![py_list])?,
74 |         Some(&kwargs),
75 |     )
76 | }
77 |
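// Example of the Python-side behavior this module backs (illustrative sketch;
// `Array` is the arro3.core wrapper, and `to_numpy` copies values into the
// returned ndarray; the `DataType.string()` constructor is an assumption here):
//
//     arr = Array([1.0, 2.0, 3.0], DataType.float64())
//     arr.to_numpy()                                   # float64 ndarray
//     Array(["a", "b"], DataType.string()).to_numpy()  # object-dtype ndarray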
78 | fn string_to_numpy<'a>(
79 |     py: Python<'a>,
80 |     arr: impl StringArrayType<'a>,
81 | ) -> PyResult<Bound<'a, PyAny>> {
82 |     let mut py_bytes = Vec::with_capacity(arr.len());
83 |     arr.iter()
84 |         .for_each(|x| py_bytes.push(PyString::new(py, x.unwrap())));
85 |     let py_list = PyList::new(py, py_bytes)?;
86 |     let numpy_mod = py.import(intern!(py, "numpy"))?;
87 |     let kwargs = PyDict::new(py);
88 |     kwargs.set_item("dtype", numpy_mod.getattr(intern!(py, "object_"))?)?;
89 |     numpy_mod.call_method(
90 |         intern!(py, "array"),
91 |         PyTuple::new(py, vec![py_list])?,
92 |         Some(&kwargs),
93 |     )
94 | }
95 |
96 | pub fn chunked_to_numpy<'py>(
97 |     py: Python<'py>,
98 |     arrs: Vec<&'py dyn Array>,
99 | ) -> PyResult<Bound<'py, PyAny>> {
100 |     let py_arrays = arrs
101 |         .iter()
102 |         .map(|arr| to_numpy(py, *arr))
103 |         .collect::<PyResult<Vec<_>>>()?;
104 |
105 |     let numpy_mod = py.import(intern!(py, "numpy"))?;
106 |     numpy_mod.call_method1(intern!(py, "concatenate"), (py_arrays,))
107 | }
108 |
--------------------------------------------------------------------------------
/arro3-compute/src/temporal.rs:
--------------------------------------------------------------------------------
1 | use arrow_schema::{DataType, Field};
2 | use pyo3::exceptions::PyValueError;
3 | use pyo3::prelude::*;
4 | use pyo3_arrow::error::PyArrowResult;
5 | use pyo3_arrow::ffi::ArrayIterator;
6 | use pyo3_arrow::input::AnyArray;
7 | use pyo3_arrow::{PyArray, PyArrayReader};
8 |
9 | pub enum DatePart {
10 |     /// Quarter of the year, in range `1..=4`
11 |     Quarter,
12 |     /// Calendar year
13 |     Year,
14 |     /// Month in the year, in range `1..=12`
15 |     Month,
16 |     /// ISO week of the year, in range `1..=53`
17 |     Week,
18 |     /// Day of the month, in range `1..=31`
19 |     Day,
20 |     /// Day of the week, in range `0..=6`, where Sunday is `0`
21 |     DayOfWeekSunday0,
22 |     /// Day of the week, in range `0..=6`, where Monday is `0`
23 |     DayOfWeekMonday0,
24 |     /// Day of year, in range `1..=366`
25 |     DayOfYear,
26 |     /// Hour of the day, in range `0..=23`
27 |     Hour,
28 |     /// Minute of the hour, in range `0..=59`
29 |     Minute,
30 |     /// Second of the minute, in range `0..=59`
31 |     Second,
32 |     /// Millisecond of the second
33 |     Millisecond,
34 |     /// Microsecond of the second
35 |     Microsecond,
36 |     /// Nanosecond of the second
37 |     Nanosecond,
38 | }
39 |
40 | impl<'a> FromPyObject<'a> for DatePart {
41 |     fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult<Self> {
42 |         let s: String = ob.extract()?;
43 |         match s.to_lowercase().as_str() {
44 |             "quarter" => Ok(Self::Quarter),
45 |             "year" => Ok(Self::Year),
46 |             "month" => Ok(Self::Month),
47 |             "week" => Ok(Self::Week),
48 |             "day" => Ok(Self::Day),
49 |             "dayofweeksunday0" => Ok(Self::DayOfWeekSunday0),
50 |             "dayofweekmonday0" => Ok(Self::DayOfWeekMonday0),
51 |             "dayofyear" => Ok(Self::DayOfYear),
52 |             "hour" => Ok(Self::Hour),
53 |             "minute" => Ok(Self::Minute),
54 |             "second" => Ok(Self::Second),
55 |             "millisecond" => Ok(Self::Millisecond),
56 |             "microsecond" => Ok(Self::Microsecond),
57 |             "nanosecond" => Ok(Self::Nanosecond),
58 |             _ => Err(PyValueError::new_err("Unexpected date part")),
59 |         }
60 |     }
61 | }
62 |
63 | impl From<DatePart> for arrow_arith::temporal::DatePart {
64 |     fn from(value: DatePart) -> Self {
65 |         match value {
66 |             DatePart::Quarter => arrow_arith::temporal::DatePart::Quarter,
67 |             DatePart::Year => arrow_arith::temporal::DatePart::Year,
68 |             DatePart::Month => arrow_arith::temporal::DatePart::Month,
69 |             DatePart::Week => arrow_arith::temporal::DatePart::Week,
70 |             DatePart::Day => arrow_arith::temporal::DatePart::Day,
71 |             DatePart::DayOfWeekSunday0 => arrow_arith::temporal::DatePart::DayOfWeekSunday0,
72 |             DatePart::DayOfWeekMonday0 => arrow_arith::temporal::DatePart::DayOfWeekMonday0,
73 |             DatePart::DayOfYear => arrow_arith::temporal::DatePart::DayOfYear,
74 |             DatePart::Hour => arrow_arith::temporal::DatePart::Hour,
75 |             DatePart::Minute => arrow_arith::temporal::DatePart::Minute,
76 |             DatePart::Second => arrow_arith::temporal::DatePart::Second,
77 |             DatePart::Millisecond => arrow_arith::temporal::DatePart::Millisecond,
78 |             DatePart::Microsecond => arrow_arith::temporal::DatePart::Microsecond,
79 |             DatePart::Nanosecond => arrow_arith::temporal::DatePart::Nanosecond,
80 |         }
81 |     }
82 | }
83 |
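// Example (sketch) of the conversion chain defined above, going from a Python
// string argument to the arrow-rs enum:
//
//     let part: DatePart = ob.extract()?;             // "hour" -> DatePart::Hour
//     let arrow_part: arrow_arith::temporal::DatePart = part.into();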
84 | #[pyfunction]
85 | pub fn date_part(py: Python, input: AnyArray, part: DatePart) -> PyArrowResult<PyObject> {
86 |     match input {
87 |         AnyArray::Array(input) => {
88 |             let out = arrow_arith::temporal::date_part(input.as_ref(), part.into())?;
89 |             Ok(PyArray::from_array_ref(out).to_arro3(py)?.unbind())
90 |         }
91 |         AnyArray::Stream(stream) => {
92 |             let reader = stream.into_reader()?;
93 |             let output_field = Field::new("", DataType::Int32, true);
94 |             let part = part.into();
95 |
96 |             let iter = reader
97 |                 .into_iter()
98 |                 .map(move |array| arrow_arith::temporal::date_part(array?.as_ref(), part));
99 |             Ok(
100 |                 PyArrayReader::new(Box::new(ArrayIterator::new(iter, output_field.into())))
101 |                     .to_arro3(py)?
102 |                     .unbind(),
103 |             )
104 |         }
105 |     }
106 | }
107 |
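// Example usage from Python (illustrative sketch; assumes a timestamp-typed
// input array):
//
//     from arro3.compute import date_part
//     months = date_part(timestamp_array, "month")  # Int32 values in 1..=12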
--------------------------------------------------------------------------------
/tests/core/test_table.py:
--------------------------------------------------------------------------------
1 | import geoarrow.types as gt
2 | import numpy as np
3 | import pandas as pd
4 | import pyarrow as pa
5 | import pytest
6 | from arro3.core import Array, ChunkedArray, DataType, Field, Table
7 |
8 |
9 | def test_table_getitem():
10 |     a = pa.chunked_array([[1, 2, 3, 4]])
11 |     b = pa.chunked_array([["a", "b", "c", "d"]])
12 |     table = Table.from_pydict({"a": a, "b": b})
13 |
14 |     assert a == pa.chunked_array(table["a"])
15 |     assert b == pa.chunked_array(table["b"])
16 |     assert a == pa.chunked_array(table[0])
17 |     assert b == pa.chunked_array(table[1])
18 |
19 |     with pytest.raises(KeyError):
20 |         table["foo"]
21 |
22 |     with pytest.raises(IndexError):
23 |         table[10]
24 |
25 |
26 | def test_table_from_arrays():
27 |     a = pa.array([1, 2, 3, 4])
28 |     b = pa.array(["a", "b", "c", "d"])
29 |     arro3_table = Table.from_arrays([a, b], names=["a", "b"])
30 |     pa_table = pa.Table.from_arrays([a, b], names=["a", "b"])
31 |     assert pa.table(arro3_table) == pa_table
32 |
33 |
34 | def test_table_from_pydict():
35 |     mapping = {"a": pa.array([1, 2, 3, 4]), "b": pa.array(["a", "b", "c", "d"])}
36 |     arro3_table = Table.from_pydict(mapping)
37 |     pa_table = pa.Table.from_pydict(mapping)
38 |     assert pa.table(arro3_table) == pa_table
39 |
40 |
41 | def test_table_constructor_ext_array():
42 |     typ = DataType.uint8()
43 |     metadata = {"ARROW:extension:name": "ext_name"}
44 |     field = Field("", type=typ, nullable=True, metadata=metadata)
45 |     arr = Array([1, 2, 3, 4], field)
46 |     t = Table({"a": arr})
47 |     assert t.schema.field("a").metadata_str["ARROW:extension:name"] == "ext_name"
48 |
49 |     ca = ChunkedArray([arr], field)
50 |     t = Table({"a": ca})
51 |     assert t.schema.field("a").metadata_str["ARROW:extension:name"] == "ext_name"
52 |
53 |
54 | def test_table_append_array_extension_type():
55 |     """
56 |     Test that extension metadata gets propagated from an array to a column on a table.
57 |     """
58 |     # Construct a geoarrow point extension array
59 |     extension_type = gt.point(dimensions="xy", coord_type="interleaved").to_pyarrow()
60 |     coords = np.array([1, 2, 3, 4], dtype=np.float64)
61 |     ext_array = pa.FixedSizeListArray.from_arrays(coords, 2).cast(extension_type)
62 |
63 |     table = Table.from_arrays([pa.array(["a", "b"])], names=["a"])
64 |     geo_table = table.append_column("geometry", ChunkedArray([ext_array]))
65 |
66 |     meta = geo_table.schema["geometry"].metadata
67 |     assert b"ARROW:extension:name" in meta.keys()
68 |     assert meta[b"ARROW:extension:name"] == b"geoarrow.point"
69 |
70 |
71 | def test_table_from_batches_empty_columns_with_len():
72 |     df = pd.DataFrame({"a": [1, 2, 3]})
73 |     no_columns = df[[]]
74 |     pa_table = pa.Table.from_pandas(no_columns)
75 |     table = Table.from_batches(pa_table.to_batches())
76 |     assert table.num_columns == 0
77 |     assert table.num_rows == 3
78 |
79 |
80 | def test_rechunk():
81 |     a = pa.chunked_array([[1, 2, 3, 4]])
82 |     b = pa.chunked_array([["a", "b", "c", "d"]])
83 |     table = Table.from_pydict({"a": a, "b": b})
84 |
85 |     rechunked1 = table.rechunk(max_chunksize=1)
86 |     assert rechunked1.chunk_lengths == [1, 1, 1, 1]
87 |
88 |     rechunked2 = rechunked1.rechunk(max_chunksize=2)
89 |     assert rechunked2.chunk_lengths == [2, 2]
90 |     assert rechunked2.rechunk().chunk_lengths == [4]
91 |
92 |
93 | def test_slice():
94 |     a = pa.chunked_array([[1, 2], [3, 4]])
95 |     b = pa.chunked_array([["a", "b"], ["c", "d"]])
96 |     table = Table.from_pydict({"a": a, "b": b})
97 |
98 |     sliced1 = table.slice(0, 1)
99 |     assert sliced1.num_rows == 1
100 |     assert sliced1.chunk_lengths == [1]
101 |
102 |     sliced2 = table.slice(1, 2)
103 |     assert sliced2.num_rows == 2
104 |     assert sliced2.chunk_lengths == [1, 1]
105 |
106 |
107 | def test_nonempty_table_no_columns():
108 |     table = pa.table({"a": [1, 2, 3, 4]}).select([])
109 |     assert len(table) == 4
110 |     assert table.num_columns == 0
111 |     arro3_table = Table.from_arrow(table)
112 |     retour = pa.table(arro3_table)
113 |     assert table == retour
114 |
115 |
116 | class CustomException(Exception):
117 |     pass
118 |
119 |
120 | class ArrowCStreamFails:
121 |     def __arrow_c_stream__(self, requested_schema=None):
122 |         raise CustomException
123 |
124 |
125 | def test_table_import_preserve_exception():
126 |     """https://github.com/kylebarron/arro3/issues/325"""
127 |
128 |     c_stream_obj = ArrowCStreamFails()
129 |     with pytest.raises(CustomException):
130 |         Table.from_arrow(c_stream_obj)
131 |
132 |     with pytest.raises(CustomException):
133 |         Table(c_stream_obj)
134 |
--------------------------------------------------------------------------------
/arro3-io/python/arro3/io/_csv.pyi:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from typing import IO
3 |
4 | # Note: importing with
5 | # `from arro3.core import Array`
6 | # will cause Array to be included in the generated docs in this module.
7 | import arro3.core as core
8 | import arro3.core.types as types
9 |
10 | __all__ = ["infer_csv_schema", "read_csv", "write_csv"]
11 |
12 | def infer_csv_schema(
13 |     file: IO[bytes] | Path | str,
14 |     *,
15 |     has_header: bool | None = None,
16 |     max_records: int | None = None,
17 |     delimiter: str | None = None,
18 |     escape: str | None = None,
19 |     quote: str | None = None,
20 |     terminator: str | None = None,
21 |     comment: str | None = None,
22 | ) -> core.Schema:
23 |     """Infer a CSV file's schema.
24 |
25 |     If `max_records` is `None`, all records will be read; otherwise, up to
26 |     `max_records` records are read to infer the schema.
27 |
28 |     Args:
29 |         file: The input CSV path or buffer.
30 |         has_header: Set whether the CSV file has a header. Defaults to None.
31 |         max_records: The maximum number of records to read to infer schema. Defaults to
32 |             None.
33 |         delimiter: Set the CSV file's column delimiter as a byte character. Defaults to
34 |             None.
35 |         escape: Set the CSV escape character. Defaults to None.
36 |         quote: Set the CSV quote character. Defaults to None.
37 |         terminator: Set the line terminator. Defaults to None.
38 |         comment: Set the comment character. Defaults to None.
39 |
40 |     Returns:
41 |         The schema inferred from the data.
42 |     """
43 |
44 | def read_csv(
45 |     file: IO[bytes] | Path | str,
46 |     schema: types.ArrowSchemaExportable,
47 |     *,
48 |     has_header: bool | None = None,
49 |     batch_size: int | None = None,
50 |     delimiter: str | None = None,
51 |     escape: str | None = None,
52 |     quote: str | None = None,
53 |     terminator: str | None = None,
54 |     comment: str | None = None,
55 | ) -> core.RecordBatchReader:
56 |     """Read a CSV file to an Arrow RecordBatchReader.
57 |
58 |     Args:
59 |         file: The input CSV path or buffer.
60 |         schema: The Arrow schema for this CSV file. Use
61 |             [infer_csv_schema][arro3.io.infer_csv_schema] to infer an Arrow schema if
62 |             needed.
63 |         has_header: Set whether the CSV file has a header. Defaults to None.
64 |         batch_size: Set the batch size (number of records to load at one time).
65 |             Defaults to None.
66 |         delimiter: Set the CSV file's column delimiter as a byte character. Defaults to
67 |             None.
68 |         escape: Set the CSV escape character. Defaults to None.
69 |         quote: Set the CSV quote character. Defaults to None.
70 |         terminator: Set the line terminator. Defaults to None.
71 |         comment: Set the comment character. Defaults to None.
72 |
73 |     Returns:
74 |         A RecordBatchReader with the read CSV data.
75 |     """
76 |
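# Example usage (illustrative sketch; the file path is hypothetical):
#
#     schema = infer_csv_schema("data.csv", max_records=1000)
#     reader = read_csv("data.csv", schema, batch_size=65536)
#     table = core.Table.from_arrow(reader)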
77 | def write_csv(
78 |     data: types.ArrowStreamExportable | types.ArrowArrayExportable,
79 |     file: IO[bytes] | Path | str,
80 |     *,
81 |     header: bool | None = None,
82 |     delimiter: str | None = None,
83 |     escape: str | None = None,
84 |     quote: str | None = None,
85 |     date_format: str | None = None,
86 |     datetime_format: str | None = None,
87 |     time_format: str | None = None,
88 |     timestamp_format: str | None = None,
89 |     timestamp_tz_format: str | None = None,
90 |     null: str | None = None,
91 | ) -> None:
92 |     """Write an Arrow Table or stream to a CSV file.
93 |
94 |     Args:
95 |         data: The Arrow Table, RecordBatchReader, or RecordBatch to write.
96 |         file: The output buffer or file path for where to write the CSV.
97 |         header: Set whether to write the CSV file with a header. Defaults to None.
98 |         delimiter: Set the CSV file's column delimiter as a byte character. Defaults to
99 |             None.
100 |         escape: Set the CSV file's escape character as a byte character.
101 |
102 |             In some variants of CSV, quotes are escaped using a special escape character
103 |             like `\\` (instead of escaping quotes by doubling them).
104 |
105 |             By default, writing these idiosyncratic escapes is disabled, and is only
106 |             used when `double_quote` is disabled. Defaults to None.
107 |         quote: Set the CSV file's quote character as a byte character. Defaults to None.
108 |         date_format: Set the CSV file's date format. Defaults to None.
109 |         datetime_format: Set the CSV file's datetime format. Defaults to None.
110 |         time_format: Set the CSV file's time format. Defaults to None.
111 |         timestamp_format: Set the CSV file's timestamp format. Defaults to None.
112 |         timestamp_tz_format: Set the CSV file's timestamp tz format. Defaults to None.
113 |         null: Set the value to represent null in output. Defaults to None.
114 |     """
115 |
--------------------------------------------------------------------------------
/arro3-io/src/utils.rs:
--------------------------------------------------------------------------------
1 | use bytes::Bytes;
2 | use parquet::file::reader::{ChunkReader, Length};
3 | use pyo3_file::PyFileLikeObject;
4 |
5 | use pyo3::prelude::*;
6 | use std::fs::File;
7 | use std::io::{BufReader, Read, Seek, SeekFrom, Write};
8 | use std::path::PathBuf;
9 |
10 | /// Represents either a path `File` or a file-like object `FileLike`
11 | #[derive(Debug)]
12 | pub enum FileReader {
13 |     File(File),
14 |     FileLike(PyFileLikeObject),
15 | }
16 |
17 | impl FileReader {
18 |     fn try_clone(&self) -> std::io::Result<Self> {
19 |         match self {
20 |             Self::File(f) => Ok(Self::File(f.try_clone()?)),
21 |             Self::FileLike(f) => Ok(Self::FileLike(f.clone())),
22 |         }
23 |     }
24 | }
25 |
26 | impl<'py> FromPyObject<'py> for FileReader {
27 |     fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult<Self> {
28 |         if let Ok(path) = ob.extract::<PathBuf>() {
29 |             Ok(Self::File(File::open(path)?))
30 |         } else if let Ok(path) = ob.extract::<String>() {
31 |             Ok(Self::File(File::open(path)?))
32 |         } else {
33 |             Ok(Self::FileLike(PyFileLikeObject::py_with_requirements(
34 |                 ob.clone(),
35 |                 true,
36 |                 false,
37 |                 true,
38 |                 false,
39 |             )?))
40 |         }
41 |     }
42 | }
43 |
44 | impl Read for FileReader {
45 |     fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
46 |         match self {
47 |             Self::File(f) => f.read(buf),
48 |             Self::FileLike(f) => f.read(buf),
49 |         }
50 |     }
51 | }
52 |
53 | impl Seek for FileReader {
54 |     fn seek(&mut self, pos: std::io::SeekFrom) -> std::io::Result<u64> {
55 |         match self {
56 |             Self::File(f) => f.seek(pos),
57 |             Self::FileLike(f) => f.seek(pos),
58 |         }
59 |     }
60 | }
61 |
62 | impl Length for FileReader {
63 |     fn len(&self) -> u64 {
64 |         match self {
65 |             Self::File(f) => f.len(),
66 |             Self::FileLike(f) => {
67 |                 let mut file = f.clone();
68 |                 // Keep track of current pos
69 |                 let pos = file.stream_position().unwrap();
70 |
71 |                 // Seek to end of file
72 |                 file.seek(std::io::SeekFrom::End(0)).unwrap();
73 |                 let len = file.stream_position().unwrap();
74 |
75 |                 // Seek back
76 |                 file.seek(std::io::SeekFrom::Start(pos)).unwrap();
77 |                 len
78 |             }
79 |         }
80 |     }
81 | }
82 |
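// Example (illustrative sketch, not part of the original file): because
// `FileReader` implements `FromPyObject`, a #[pyfunction] can accept either a
// path or a Python file-like object directly; the function below is
// hypothetical.
//
//     #[pyfunction]
//     fn read_bytes(mut file: FileReader) -> PyResult<Vec<u8>> {
//         let mut buf = Vec::new();
//         file.read_to_end(&mut buf)?;
//         Ok(buf)
//     }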
83 | impl ChunkReader for FileReader {
84 |     type T = BufReader<FileReader>;
85 |
86 |     fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
87 |         let mut reader = self.try_clone()?;
88 |         reader.seek(SeekFrom::Start(start))?;
89 |         Ok(BufReader::new(self.try_clone()?))
90 |     }
91 |
92 |     fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
93 |         let mut buffer = Vec::with_capacity(length);
94 |         let mut reader = self.try_clone()?;
95 |         reader.seek(SeekFrom::Start(start))?;
96 |         let read = reader.take(length as _).read_to_end(&mut buffer)?;
97 |
98 |         if read != length {
99 |             return Err(parquet::errors::ParquetError::EOF(format!(
100 |                 "Expected to read {length} bytes, read only {read}"
101 |             )));
102 |         }
103 |         Ok(buffer.into())
104 |     }
105 | }
106 |
107 | /// Represents either a path `File` or a file-like object `FileLike`
108 | #[derive(Debug)]
109 | pub enum FileWriter {
110 |     File(File),
111 |     FileLike(PyFileLikeObject),
112 | }
113 |
114 | impl<'py> FromPyObject<'py> for FileWriter {
115 |     fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult<Self> {
116 |         if let Ok(path) = ob.extract::<PathBuf>() {
117 |             Ok(Self::File(File::create(path)?))
118 |         } else if let Ok(path) = ob.extract::<String>() {
119 |             Ok(Self::File(File::create(path)?))
120 |         } else {
121 |             Ok(Self::FileLike(PyFileLikeObject::py_with_requirements(
122 |                 ob.clone(),
123 |                 false,
124 |                 true,
125 |                 true,
126 |                 false,
127 |             )?))
128 |         }
129 |     }
130 | }
131 |
132 | impl Write for FileWriter {
133 |     fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
134 |         match self {
135 |             Self::File(f) => f.write(buf),
136 |             Self::FileLike(f) => f.write(buf),
137 |         }
138 |     }
139 |
140 |     fn flush(&mut self) -> std::io::Result<()> {
141 |         match self {
142 |             Self::File(f) => f.flush(),
143 |             Self::FileLike(f) => f.flush(),
144 |         }
145 |     }
146 | }
147 |
148 | impl Seek for FileWriter {
149 |     fn seek(&mut self, pos: std::io::SeekFrom) -> std::io::Result<u64> {
150 |         match self {
151 |             Self::File(f) => f.seek(pos),
152 |             Self::FileLike(f) => f.seek(pos),
153 |         }
154 |     }
155 | }
156 |
--------------------------------------------------------------------------------
/tests/core/test_constructors.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pyarrow as pa
3 | from arro3.core import (
4 |     Array,
5 |     DataType,
6 |     Field,
7 |     fixed_size_list_array,
8 |     list_array,
9 |     list_offsets,
10 |     struct_array,
11 | )
12 |
13 |
14 | def test_fixed_size_list_array():
15 |     np_arr = np.array([1, 2, 3, 4, 5, 6], dtype=np.float64)
16 |     flat_array = Array.from_numpy(np_arr)
17 |     array = fixed_size_list_array(flat_array, 2)
18 |     pa_array = pa.array(array)
19 |     assert pa.types.is_fixed_size_list(pa_array.type)
20 |     assert pa_array.type.list_size == 2
21 |
22 |
23 | def test_fixed_size_list_array_with_type():
24 |     np_arr = np.array([1, 2, 3, 4, 5, 6], dtype=np.float64)
25 |     flat_array = Array.from_numpy(np_arr)
26 |     list_type = DataType.list(Field("inner", DataType.float64()), 2)
27 |     array = fixed_size_list_array(flat_array, 2, type=list_type)
28 |     pa_array = pa.array(array)
29 |     assert pa.types.is_fixed_size_list(pa_array.type)
30 |     assert pa_array.type.list_size == 2
31 |     assert pa_array.type.field(0).name == "inner"
32 |
33 |
34 | def test_fixed_size_list_array_with_mask():
35 |     np_arr = np.array([1, 2, 3, 4, 5, 6], dtype=np.float64)
36 |     flat_array = Array.from_numpy(np_arr)
37 |
38 |     np_mask = np.array([True, False, True], dtype=bool)
39 |     mask = Array.from_numpy(np_mask)
40 |
41 |     arro3_array = fixed_size_list_array(flat_array, 2, mask=mask)
42 |
43 |     # Note that we don't exactly match the pyarrow array because we still allocate for
44 |     # null values.
45 | pa_arr = pa.array( 46 | [[1, 2], [3, 4], [5, 6]], 47 | type=pa.field(arro3_array.type).type, 48 | mask=np_mask, 49 | ) 50 | 51 | assert arro3_array[0].is_valid == pa_arr[0].is_valid 52 | assert arro3_array[1].is_valid == pa_arr[1].is_valid 53 | assert arro3_array[1] == Array(pa_arr)[1] 54 | assert arro3_array[2].is_valid == pa_arr[2].is_valid 55 | 56 | 57 | def test_list_array(): 58 | np_arr = np.array([1, 2, 3, 4, 5, 6], dtype=np.float64) 59 | flat_array = Array.from_numpy(np_arr) 60 | offsets_array = Array.from_numpy(np.array([0, 2, 5, 6], dtype=np.int32)) 61 | array = list_array(offsets_array, flat_array) 62 | pa_array = pa.array(array) 63 | assert pa.types.is_list(pa_array.type) 64 | assert list_offsets(array) == offsets_array 65 | 66 | 67 | def test_list_array_with_type(): 68 | np_arr = np.array([1, 2, 3, 4, 5, 6], dtype=np.float64) 69 | flat_array = Array.from_numpy(np_arr) 70 | offsets_array = Array.from_numpy(np.array([0, 2, 5, 6], dtype=np.int32)) 71 | 72 | list_type = DataType.list(Field("inner", DataType.float64())) 73 | array = list_array(offsets_array, flat_array, type=list_type) 74 | pa_array = pa.array(array) 75 | assert pa.types.is_list(pa_array.type) 76 | assert list_offsets(array) == offsets_array 77 | assert pa_array.type.field(0).name == "inner" 78 | 79 | 80 | def test_list_array_with_mask(): 81 | np_arr = np.array([1, 2, 3, 4, 5, 6], dtype=np.float64) 82 | flat_array = Array.from_numpy(np_arr) 83 | offsets_array = Array.from_numpy(np.array([0, 2, 5, 6], dtype=np.int32)) 84 | 85 | np_mask = np.array([True, False, True], dtype=bool) 86 | mask = Array.from_numpy(np_mask) 87 | 88 | arro3_array = list_array(offsets_array, flat_array, mask=mask) 89 | 90 | # Note that we don't exactly match the pyarrow array because we still allocate for 91 | # null values. 
92 | pa_arr = pa.array( 93 | [[1, 2], [3, 4, 5], [6]], type=pa.field(arro3_array.type).type, mask=np_mask 94 | ) 95 | 96 | assert arro3_array[0].is_valid == pa_arr[0].is_valid 97 | assert arro3_array[1].is_valid == pa_arr[1].is_valid 98 | assert arro3_array[1] == Array(pa_arr)[1] 99 | assert arro3_array[2].is_valid == pa_arr[2].is_valid 100 | 101 | 102 | def test_struct_array(): 103 | a = pa.array([1, 2, 3, 4]) 104 | b = pa.array(["a", "b", "c", "d"]) 105 | 106 | arr = struct_array([a, b], fields=[Field("a", a.type), Field("b", b.type)]) 107 | pa_type = pa.array(arr).type 108 | assert pa.types.is_struct(pa_type) 109 | assert pa_type.field(0).name == "a" 110 | assert pa_type.field(1).name == "b" 111 | 112 | 113 | def test_struct_array_with_mask(): 114 | a = pa.array([1, 2, 3, 4]) 115 | b = pa.array(["a", "b", "c", "d"]) 116 | 117 | np_mask = np.array([True, False, True, False], dtype=bool) 118 | mask = Array.from_numpy(np_mask) 119 | 120 | arro3_arr = struct_array( 121 | [a, b], 122 | fields=[Field("a", a.type), Field("b", b.type)], 123 | mask=mask, 124 | ) 125 | 126 | pa_arr = pa.array( 127 | [ 128 | {"a": 1, "b": "a"}, 129 | {"a": 2, "b": "b"}, 130 | {"a": 3, "b": "c"}, 131 | {"a": 4, "b": "d"}, 132 | ], 133 | type=pa.field(arro3_arr.type).type, 134 | mask=np_mask, 135 | ) 136 | 137 | for i in range(len(arro3_arr)): 138 | assert arro3_arr[i].is_valid == pa_arr[i].is_valid 139 | assert arro3_arr[i] == Array(pa_arr)[i] 140 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: arro3 2 | repo_name: kylebarron/arro3 3 | repo_url: https://github.com/kylebarron/arro3 4 | site_description: A minimal Python library for Apache Arrow, binding to the Rust Arrow implementation. 
5 | site_author: Kyle Barron 6 | # Note: trailing slash recommended with mike: 7 | # https://squidfunk.github.io/mkdocs-material/setup/setting-up-versioning/#publishing-a-new-version 8 | site_url: https://kylebarron.dev/arro3/ 9 | docs_dir: docs 10 | 11 | extra: 12 | social: 13 | - icon: "fontawesome/brands/github" 14 | link: "https://github.com/kylebarron" 15 | - icon: "fontawesome/brands/twitter" 16 | link: "https://twitter.com/kylebarron2" 17 | version: 18 | provider: mike 19 | 20 | nav: 21 | - "index.md" 22 | - API Reference: 23 | - arro3.core: 24 | - api/core/array.md 25 | - api/core/array-reader.md 26 | - api/core/chunked-array.md 27 | - api/core/datatype.md 28 | - api/core/field.md 29 | - api/core/record-batch.md 30 | - api/core/record-batch-reader.md 31 | - api/core/scalar.md 32 | - api/core/schema.md 33 | - api/core/table.md 34 | - api/core/constructors.md 35 | - api/core/accessors.md 36 | - api/core/types.md 37 | - api/compute.md 38 | - arro3.io: 39 | - api/io/arrow-ipc.md 40 | - api/io/csv.md 41 | - api/io/json.md 42 | - api/io/parquet.md 43 | 44 | watch: 45 | - arro3-compute/python 46 | - arro3-core/python 47 | - arro3-io/python 48 | - docs 49 | 50 | theme: 51 | name: material 52 | palette: 53 | # Palette toggle for automatic mode 54 | - media: "(prefers-color-scheme)" 55 | toggle: 56 | icon: material/brightness-auto 57 | name: Switch to light mode 58 | 59 | # Palette toggle for light mode 60 | - media: "(prefers-color-scheme: light)" 61 | primary: indigo 62 | accent: indigo 63 | toggle: 64 | icon: material/brightness-7 65 | name: Switch to dark mode 66 | 67 | # Palette toggle for dark mode 68 | - media: "(prefers-color-scheme: dark)" 69 | scheme: slate 70 | primary: indigo 71 | accent: indigo 72 | toggle: 73 | icon: material/brightness-4 74 | name: Switch to system preference 75 | 76 | font: 77 | text: Roboto 78 | code: Roboto Mono 79 | 80 | features: 81 | - content.code.annotate 82 | - content.code.copy 83 | - navigation.indexes 84 | - navigation.instant 85 | - navigation.tracking 86 | - search.suggest 87 | - search.share 88 | 89 | plugins: 90 | - search 91 | - social: 92 | enabled: !ENV [CI, false] 93 | - mike: 94 | alias_type: "copy" 95 | canonical_version: "latest" 96 | - mkdocstrings: 97 | enable_inventory: true 98 | handlers: 99 | python: 100 | paths: [arro3-compute/python, arro3-core/python, arro3-io/python] 101 | options: 102 | # We set allow_inspection: false to ensure that all docstrings come 103 | # from the pyi files, not the Rust-facing doc comments. 
104 |             allow_inspection: false
105 |             docstring_section_style: list
106 |             docstring_style: google
107 |             line_length: 80
108 |             separate_signature: true
109 |             show_root_heading: true
110 |             show_signature_annotations: true
111 |             show_source: false
112 |             show_symbol_type_toc: true
113 |             signature_crossrefs: true
114 |             extensions:
115 |               - griffe_inherited_docstrings
116 |
117 |           inventories:
118 |             - https://arrow.apache.org/docs/objects.inv
119 |             - https://docs.pola.rs/api/python/stable/objects.inv
120 |             - https://docs.python.org/3/objects.inv
121 |             - https://duckdb.org/docs/stable/clients/python/reference/objects.inv
122 |             - https://numpy.org/doc/stable/objects.inv
123 |             - https://pandas.pydata.org/pandas-docs/stable/objects.inv
124 |   - redirects:
125 |       redirect_maps:
126 |         "api/io.md": "api/io/parquet.md"
127 |
128 | # https://github.com/developmentseed/titiler/blob/50934c929cca2fa8d3c408d239015f8da429c6a8/docs/mkdocs.yml#L115-L140
129 | markdown_extensions:
130 |   - admonition
131 |   - attr_list
132 |   - codehilite:
133 |       guess_lang: false
134 |   - def_list
135 |   - footnotes
136 |   - md_in_html
137 |   - pymdownx.arithmatex
138 |   - pymdownx.betterem
139 |   - pymdownx.caret:
140 |       insert: false
141 |   - pymdownx.details
142 |   - pymdownx.emoji:
143 |       emoji_index: !!python/name:material.extensions.emoji.twemoji
144 |       emoji_generator: !!python/name:material.extensions.emoji.to_svg
145 |   - pymdownx.escapeall:
146 |       hardbreak: true
147 |       nbsp: true
148 |   - pymdownx.magiclink:
149 |       hide_protocol: true
150 |       repo_url_shortener: true
151 |   - pymdownx.smartsymbols
152 |   - pymdownx.superfences
153 |   - pymdownx.tasklist:
154 |       custom_checkbox: true
155 |   - pymdownx.tilde
156 |   - toc:
157 |       permalink: true
158 |
--------------------------------------------------------------------------------
/arro3-core/python/arro3/core/_schema.pyi:
--------------------------------------------------------------------------------
1 | from typing import Sequence
2 |
3 | from ._data_type import DataType
4 | from ._field import Field
5 | from ._table import Table
6 | from .types import ArrowSchemaExportable
7 |
8 | class Schema:
9 |     """An arrow Schema."""
10 |     def __init__(
11 |         self,
12 |         fields: Sequence[ArrowSchemaExportable],
13 |         *,
14 |         metadata: dict[str, str] | dict[bytes, bytes] | None = None,
15 |     ) -> None: ...
16 |     def __arrow_c_schema__(self) -> object:
17 |         """
18 |         An implementation of the [Arrow PyCapsule
19 |         Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
20 |         This dunder method should not be called directly, but enables zero-copy data
21 |         transfer to other Python libraries that understand Arrow memory.
22 |
23 |         For example, you can call [`pyarrow.schema()`][pyarrow.schema] to convert this
24 |         schema into a pyarrow schema, without copying memory.
25 |         """
26 |
27 |     def __eq__(self, other) -> bool: ...
28 |     def __getitem__(self, key: int | str) -> Field: ...
29 |     def __len__(self) -> int: ...
30 |     def __repr__(self) -> str: ...
31 |     @classmethod
32 |     def from_arrow(cls, input: ArrowSchemaExportable) -> Schema:
33 |         """Construct this from an existing Arrow object.
34 |
35 |         Args:
36 |             input: Arrow schema to use for constructing this object
37 |
38 |         Returns:
39 |             The new Schema.
40 |         """
41 |     @classmethod
42 |     def from_arrow_pycapsule(cls, capsule) -> Schema:
43 |         """Construct this object from a bare Arrow PyCapsule"""
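    # Example usage (illustrative sketch; assumes `Field` and `DataType` from
    # arro3.core, and that `int64`/`string` factory methods exist):
    #
    #     schema = Schema(
    #         [Field("a", DataType.int64()), Field("b", DataType.string())],
    #         metadata={"source": "example"},
    #     )
    #     schema.names  # ["a", "b"]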
44 |     def append(self, field: ArrowSchemaExportable) -> Schema:
45 |         """Append a field at the end of the schema.
46 |
47 |         In contrast to Python's `list.append()`, it returns a new object, leaving the
48 |         original Schema unmodified.
49 |
50 |         Args:
51 |             field: The new field to append.
52 |
53 |         Returns:
54 |             New Schema with the appended field.
55 |         """
56 |     def empty_table(self) -> Table:
57 |         """Provide an empty table according to the schema.
58 |
59 |         Returns:
60 |             An empty Table with this schema.
61 |         """
62 |
63 |     def equals(self, other: ArrowSchemaExportable) -> bool:
64 |         """Test if this schema is equal to the other.
65 |
66 |         Args:
67 |             other: The other schema to compare against.
68 |
69 |         Returns:
70 |             True if the two schemas are equal.
71 |         """
72 |
73 |     def field(self, i: int | str) -> Field:
74 |         """Select a field by its column name or numeric index.
75 |
76 |         Args:
77 |             i: The column name or numeric index of the field.
78 |
79 |         Returns:
80 |             The selected Field.
81 |         """
82 |     def get_all_field_indices(self, name: str) -> list[int]:
83 |         """Return sorted list of indices for the fields with the given name.
84 |
85 |         Args:
86 |             name: The field name to search for.
87 |
88 |         Returns:
89 |             Sorted list of matching field indices.
90 |         """
91 |     def get_field_index(self, name: str) -> int:
92 |         """Return index of the unique field with the given name.
93 |
94 |         Args:
95 |             name: The field name.
96 |
97 |         Returns:
98 |             The index of the field.
99 |         """
100 |     def insert(self, i: int, field: ArrowSchemaExportable) -> Schema:
101 |         """Add a field at position `i` to the schema.
102 |
103 |         Args:
104 |             i: The index at which to insert the field.
105 |             field: The field to insert.
106 |
107 |         Returns:
108 |             New Schema with the field inserted.
109 |         """
110 |     @property
111 |     def metadata(self) -> dict[bytes, bytes]:
112 |         """The schema's metadata.
113 |
114 |         Returns:
115 |             Metadata as a dict with `bytes` keys and values.
116 |         """
117 |
118 |     @property
119 |     def metadata_str(self) -> dict[str, str]:
120 |         """The schema's metadata where keys and values are `str`, not `bytes`.
121 |
122 |         Returns:
123 |             Metadata as a dict with `str` keys and values.
124 |         """
125 |     @property
126 |     def names(self) -> list[str]:
127 |         """The schema's field names."""
128 |
129 |     def remove(self, i: int) -> Schema:
130 |         """Remove the field at index i from the schema.
131 |
132 |         Args:
133 |             i: The index of the field to remove.
134 |
135 |         Returns:
136 |             New Schema without the field.
137 |         """
138 |     def remove_metadata(self) -> Schema:
139 |         """Create new schema without metadata, if any.
140 |
141 |
142 |         Returns:
143 |             New Schema without metadata.
144 |         """
145 |     def set(self, i: int, field: ArrowSchemaExportable) -> Schema:
146 |         """Replace a field at position `i` in the schema.
147 |
148 |         Args:
149 |             i: The index of the field to replace.
150 |             field: The new field.
151 |
152 |         Returns:
153 |             New Schema with the field replaced.
154 |         """
155 |     @property
156 |     def types(self) -> list[DataType]:
157 |         """The schema's field types.
158 |
159 |         Returns:
160 |             List of the schema's DataTypes.
161 |         """
162 |     def with_metadata(self, metadata: dict[str, str] | dict[bytes, bytes]) -> Schema:
163 |         """Add metadata as dict of string keys and values to Schema.
164 |
165 |         Args:
166 |             metadata: The metadata to attach.
167 |
168 |         Returns:
169 |             New Schema with the given metadata.
170 |         """
171 |
--------------------------------------------------------------------------------
/arro3-core/python/arro3/core/_array.pyi:
--------------------------------------------------------------------------------
1 | from typing import Any, Iterable, Sequence, overload
2 |
3 | import numpy as np
4 | from numpy.typing import NDArray
5 |
6 | from ._data_type import DataType
7 | from ._field import Field
8 | from ._scalar import Scalar
9 | from .types import (
10 |     ArrayInput,
11 |     ArrowArrayExportable,
12 |     ArrowSchemaExportable,
13 |     ArrowStreamExportable,
14 |     _SupportsBuffer,
15 | )
16 |
17 | class Array:
18 |     """An Arrow Array."""
19 |     @overload
20 |     def __init__(self, obj: ArrayInput, /, type: None = None) -> None: ...
21 | @overload 22 | def __init__(self, obj: Sequence[Any], /, type: ArrowSchemaExportable) -> None: ... 23 | def __init__( 24 | self, 25 | obj: ArrayInput | Sequence[Any], 26 | /, 27 | type: ArrowSchemaExportable | None = None, 28 | ) -> None: 29 | """Create arro3.Array instance from a sequence of Python objects. 30 | 31 | Args: 32 | obj: A sequence of input objects. 33 | type: Explicit type to attempt to coerce to. You may pass in a `Field` to `type` in order to associate extension metadata with this array. 34 | """ 35 | def __array__(self, dtype=None, copy=None) -> NDArray: 36 | """ 37 | An implementation of the Array interface, for interoperability with numpy and 38 | other array libraries. 39 | """ 40 | def __arrow_c_array__( 41 | self, requested_schema: object | None = None 42 | ) -> tuple[object, object]: 43 | """ 44 | An implementation of the [Arrow PyCapsule 45 | Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). 46 | This dunder method should not be called directly, but enables zero-copy data 47 | transfer to other Python libraries that understand Arrow memory. 48 | 49 | For example, you can call [`pyarrow.array()`][pyarrow.array] to convert this 50 | array into a pyarrow array, without copying memory. 51 | """ 52 | def __arrow_c_schema__(self) -> object: 53 | """ 54 | An implementation of the [Arrow PyCapsule 55 | Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). 56 | This dunder method should not be called directly, but enables zero-copy data 57 | transfer to other Python libraries that understand Arrow memory. 58 | 59 | This allows Arrow consumers to inspect the data type of this array. Then the 60 | consumer can ask the producer (in `__arrow_c_array__`) to cast the exported data 61 | to a supported data type. 62 | """ 63 | def __eq__(self, other) -> bool: ... 64 | def __getitem__(self, i: int) -> Scalar: ... 65 | # Note: we don't actually implement this, but it's inferred by having a __getitem__ 66 | # key 67 | def __iter__(self) -> Iterable[Scalar]: ... 68 | def __len__(self) -> int: ... 69 | def __repr__(self) -> str: ... 70 | @classmethod 71 | def from_arrow(cls, input: ArrowArrayExportable | ArrowStreamExportable) -> Array: 72 | """ 73 | Construct this object from an existing Arrow object. 74 | 75 | It can be called on anything that exports the Arrow data interface 76 | (`__arrow_c_array__`). 77 | 78 | Args: 79 | input: Arrow array to use for constructing this object 80 | 81 | Returns: 82 | Self 83 | """ 84 | 85 | @classmethod 86 | def from_arrow_pycapsule(cls, schema_capsule, array_capsule) -> Array: 87 | """Construct this object from bare Arrow PyCapsules""" 88 | 89 | # We allow Any here because not many types have updated to expose __buffer__ yet 90 | @classmethod 91 | def from_buffer(cls, buffer: _SupportsBuffer) -> Array: 92 | """Construct an Array from an object implementing the Python Buffer Protocol.""" 93 | 94 | @classmethod 95 | def from_numpy(cls, array: np.ndarray) -> Array: 96 | """Construct an Array from a numpy ndarray""" 97 | 98 | def cast(self, target_type: ArrowSchemaExportable) -> Array: 99 | """Cast array values to another data type 100 | 101 | Args: 102 | target_type: Type to cast array to. 103 | """ 104 | 105 | @property 106 | def field(self) -> Field: 107 | """Access the field stored on this Array. 
108 |
109 |         Note that this field usually will not have a name associated, but it may have
110 |         metadata that signifies that this array is an extension (user-defined typed)
111 |         array.
112 |         """
113 |     @property
114 |     def nbytes(self) -> int:
115 |         """The number of bytes in this Array."""
116 |     @property
117 |     def null_count(self) -> int:
118 |         """The number of null values in this Array."""
119 |     def slice(self, offset: int = 0, length: int | None = None) -> Array:
120 |         """Compute zero-copy slice of this array.
121 |
122 |         Args:
123 |             offset: The start index of the slice. Defaults to 0.
124 |             length: The length of the slice. If None, slices to the end of the
125 |                 array. Defaults to None.
126 |
127 |         Returns:
128 |             The sliced array
129 |         """
130 |     def take(self, indices: ArrayInput) -> Array:
131 |         """Take specific indices from this Array."""
132 |     def to_numpy(self) -> NDArray:
133 |         """Return a numpy copy of this array."""
134 |     def to_pylist(self) -> list:
135 |         """Convert to a list of native Python objects."""
136 |
137 |     @property
138 |     def type(self) -> DataType:
139 |         """The data type of this array."""
--------------------------------------------------------------------------------