├── marrow ├── README.md ├── __init__.mojo ├── module │ ├── __init__.mojo │ ├── arrays │ │ ├── __init__.mojo │ │ └── primitive_api.mojo │ └── dtypes_api.mojo ├── .python-version ├── arrays │ ├── tests │ │ ├── __init__.mojod │ │ ├── test_binary.mojo │ │ ├── test_chunked_array.mojo │ │ ├── test_base.mojo │ │ ├── test_nested.mojo │ │ └── test_primitive.mojo │ ├── __init__.mojo │ ├── chunked_array.mojo │ ├── binary.mojo │ ├── base.mojo │ ├── nested.mojo │ └── primitive.mojo ├── test_fixtures │ ├── __init__.mojo │ ├── bool_array.mojo │ └── arrays.mojo ├── main.py ├── io │ ├── __init__.mojo │ ├── formatter.mojo │ └── tests │ │ └── test_formatter.mojo ├── tabular.mojo ├── MEMORY.md ├── schema.mojo ├── tests │ ├── test_schema.mojo │ ├── test_dtypes.mojo │ ├── test_buffers.mojo │ └── test_c_data.mojo ├── dtypes.mojo ├── buffers.mojo └── c_data.mojo ├── python ├── .gitignore ├── pytest.ini ├── README.md ├── tests │ ├── arrays │ │ ├── __init__.py │ │ └── test_primitive_api.py │ └── test_dtypes_api.py └── lib.mojo ├── package └── .gitignore ├── .envrc ├── .gitattributes ├── .github └── workflows │ └── test.yml ├── run_tests.sh ├── pixi.toml ├── .gitignore ├── CLAUDE.md ├── README.md └── LICENSE.txt /marrow/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /marrow/__init__.mojo: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /marrow/module/__init__.mojo: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python/.gitignore: -------------------------------------------------------------------------------- 1 | *.so 2 | -------------------------------------------------------------------------------- /marrow/.python-version: 
-------------------------------------------------------------------------------- 1 | 3.12 2 | -------------------------------------------------------------------------------- /package/.gitignore: -------------------------------------------------------------------------------- 1 | *.mojopkg 2 | -------------------------------------------------------------------------------- /marrow/arrays/tests/__init__.mojod: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /marrow/module/arrays/__init__.mojo: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /marrow/test_fixtures/__init__.mojo: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | pythonpath=. 3 | -------------------------------------------------------------------------------- /.envrc: -------------------------------------------------------------------------------- 1 | watch_file pixi.lock 2 | eval "$(pixi shell-hook)" 3 | -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- 1 | Allow marrow to be called from python. 
2 | -------------------------------------------------------------------------------- /python/tests/arrays/__init__.py: -------------------------------------------------------------------------------- 1 | """Tests for marrow arrays.""" 2 | -------------------------------------------------------------------------------- /marrow/main.py: -------------------------------------------------------------------------------- 1 | def main(): 2 | print("Hello from marrow!") 3 | 4 | 5 | if __name__ == "__main__": 6 | main() 7 | -------------------------------------------------------------------------------- /marrow/io/__init__.mojo: -------------------------------------------------------------------------------- 1 | """IO utilities for formatting and displaying Marrow arrays.""" 2 | 3 | from .formatter import Formatter 4 | -------------------------------------------------------------------------------- /marrow/tabular.mojo: -------------------------------------------------------------------------------- 1 | from .arrays import * 2 | from .schema import Schema 3 | 4 | 5 | @fieldwise_init 6 | struct RecordBatch: 7 | var schema: Schema 8 | var fields: List[ArrayData] 9 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # SCM syntax highlighting & preventing 3-way merges 2 | pixi.lock merge=binary linguist-language=YAML linguist-generated=true 3 | 4 | magic.lock merge=binary linguist-language=YAML linguist-generated=true 5 | -------------------------------------------------------------------------------- /marrow/arrays/__init__.mojo: -------------------------------------------------------------------------------- 1 | from .base import * 2 | from .binary import * 3 | from .nested import * 4 | from .primitive import * 5 | 6 | 7 | fn array[T: DataType](*values: Scalar[T.native]) -> PrimitiveArray[T]: 8 | """Create a primitive array with the given 
values.""" 9 | var a = PrimitiveArray[T](len(values)) 10 | for value in values: 11 | a.unsafe_append(value) 12 | return a^ 13 | -------------------------------------------------------------------------------- /marrow/test_fixtures/bool_array.mojo: -------------------------------------------------------------------------------- 1 | from marrow.arrays import BoolArray 2 | 3 | 4 | fn as_bool_array_scalar(value: Bool) -> BoolArray.scalar: 5 | """Bool conversion function.""" 6 | return BoolArray.scalar(Scalar[DType.bool](value)) 7 | 8 | 9 | fn bool_array(*values: Bool) -> BoolArray: 10 | var a = BoolArray(len(values)) 11 | for value in values: 12 | a.unsafe_append(as_bool_array_scalar(value)) 13 | return a^ 14 | -------------------------------------------------------------------------------- /python/tests/arrays/test_primitive_api.py: -------------------------------------------------------------------------------- 1 | """Test the primitive array Python API. 2 | 3 | Over time we should implement 4 | https://github.com/apache/arrow/blob/c6ef0fe73cc716d7949e06ca7ba4dfd0931bf10e/python/pyarrow/tests/test_array.py 5 | """ 6 | 7 | import marrow as ma 8 | 9 | 10 | def test_getitem(): 11 | arr = ma.array([1, 2]) 12 | assert arr.__len__() == 2 13 | assert arr.__getitem__(0) == 1 14 | assert arr.__getitem__(1) == 2 15 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Marrow Test 2 | 3 | on: 4 | - push 5 | - pull_request 6 | 7 | jobs: 8 | test: 9 | runs-on: ubuntu-latest 10 | env: 11 | MODULAR_HOME: "/home/runner/.modular" 12 | steps: 13 | - name: Checkout repo 14 | uses: actions/checkout@v2 15 | - uses: prefix-dev/setup-pixi@v0.8.8 16 | with: 17 | cache: true 18 | cache-write: ${{ github.event_name == 'push' && github.ref_name == 'main' }} 19 | - name: checks 20 | run: pixi run fmt 21 | - name: tests 22 | run: | 23 | pixi run 
test 24 | pixi run package 25 | -------------------------------------------------------------------------------- /python/lib.mojo: -------------------------------------------------------------------------------- 1 | from python import PythonObject, Python 2 | from python.bindings import PythonModuleBuilder 3 | import math 4 | from marrow.module.dtypes_api import add_to_module as add_dtypes 5 | from marrow.module.arrays.primitive_api import add_to_module as add_primitive 6 | from os import abort 7 | 8 | 9 | @export 10 | fn PyInit_marrow() -> PythonObject: 11 | try: 12 | var m = PythonModuleBuilder("marrow") 13 | add_dtypes(m) 14 | add_primitive(m) 15 | return m.finalize() 16 | except e: 17 | return abort[PythonObject]( 18 | String("error creating Python Mojo module:", e) 19 | ) 20 | -------------------------------------------------------------------------------- /marrow/arrays/tests/test_binary.mojo: -------------------------------------------------------------------------------- 1 | from testing import assert_equal, assert_true, assert_false, TestSuite 2 | 3 | 4 | from marrow.arrays import * 5 | from marrow.dtypes import * 6 | 7 | 8 | def test_string_builder(): 9 | var a = StringArray() 10 | assert_equal(len(a), 0) 11 | assert_equal(a.capacity, 0) 12 | 13 | a.grow(2) 14 | assert_equal(len(a), 0) 15 | assert_equal(a.capacity, 2) 16 | 17 | a.unsafe_append("hello") 18 | a.unsafe_append("world") 19 | assert_equal(len(a), 2) 20 | assert_equal(a.capacity, 2) 21 | 22 | assert_equal(String(a.unsafe_get(0)), "hello") 23 | assert_equal(String(a.unsafe_get(1)), "world") 24 | 25 | assert_equal( 26 | a.__str__().strip(), 27 | 'StringArray( length=2, data= ["hello", "world", ])', 28 | ) 29 | 30 | 31 | def main(): 32 | TestSuite.discover_tests[__functions_in_module()]().run() 33 | -------------------------------------------------------------------------------- /run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | 
# Run tests, see https://forum.modular.com/t/proposal-deprecating-mojo-test/2371 4 | set -e 5 | 6 | # Get test directory from first argument, default to marrow if not provided 7 | test_dir="${1:-.}" 8 | 9 | echo "### ------------------------------------------------------------- ###" 10 | echo "Running tests in: $test_dir" 11 | 12 | # Find all test files and run them 13 | # Use a temporary file to track failures since pipe creates subshell 14 | tmpfile=$(mktemp) 15 | trap "rm -f $tmpfile" EXIT 16 | 17 | find "$test_dir" -name "test_*.mojo" -type f -not -path "*/.pixi/*" | sort | while IFS= read -r test_file; do 18 | echo "Running: $test_file" 19 | if ! mojo run -I . "$test_file"; then 20 | echo "1" >"$tmpfile" 21 | fi 22 | echo "### ------------------------------------------------------------- ###" 23 | done 24 | 25 | # Check if any tests failed 26 | if [ -f "$tmpfile" ] && [ -s "$tmpfile" ]; then 27 | exit 1 28 | fi 29 | -------------------------------------------------------------------------------- /pixi.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | authors = ["Krisztian Szucs "] 3 | channels = ["conda-forge", "https://conda.modular.com/max-nightly"] 4 | name = "marrow" 5 | platforms = ["osx-arm64", "linux-64"] 6 | version = "0.1.0" 7 | 8 | [tasks] 9 | build_python = {cmd="mojo build -I . python/lib.mojo --emit shared-lib -o python/marrow.so"} 10 | test_mojo = {cmd = "bash run_tests.sh" } 11 | test_python = {cmd = "pytest -s -v python/tests", depends-on=["build_python"]} 12 | test = {depends-on = ["test_mojo", "test_python"]} 13 | fmt_mojo = {cmd = "mojo format marrow"} 14 | fmt_python = {cmd = "ruff format python" } 15 | fmt = {depends-on = ["fmt_mojo", "fmt_python"]} 16 | package = "mojo package marrow -o package/marrow.mojopkg" 17 | 18 | [dependencies] 19 | mojo = "<1.0.0" 20 | # Python 3.14 breaks the pyarrow installation, stay on an older version. 
21 | python = "3.12.*" 22 | pytest = "8.*" 23 | ruff = "0.14.*" 24 | 25 | [pypi-dependencies] 26 | pyarrow = ">=19.0.1, <21" 27 | -------------------------------------------------------------------------------- /python/tests/test_dtypes_api.py: -------------------------------------------------------------------------------- 1 | """Test the DataType Python api.""" 2 | 3 | import marrow as ma 4 | 5 | 6 | def test_factory_functions() -> None: 7 | """Test that all DataType factory functions work and return DataType.""" 8 | assert isinstance(ma.null(), ma.DataType) 9 | assert isinstance(ma.bool_(), ma.DataType) 10 | assert isinstance(ma.int8(), ma.DataType) 11 | assert isinstance(ma.int16(), ma.DataType) 12 | assert isinstance(ma.int32(), ma.DataType) 13 | assert isinstance(ma.int64(), ma.DataType) 14 | assert isinstance(ma.uint8(), ma.DataType) 15 | assert isinstance(ma.uint16(), ma.DataType) 16 | assert isinstance(ma.uint32(), ma.DataType) 17 | assert isinstance(ma.uint64(), ma.DataType) 18 | assert isinstance(ma.float16(), ma.DataType) 19 | assert isinstance(ma.float32(), ma.DataType) 20 | assert isinstance(ma.float64(), ma.DataType) 21 | assert isinstance(ma.string(), ma.DataType) 22 | assert isinstance(ma.binary(), ma.DataType) 23 | -------------------------------------------------------------------------------- /marrow/MEMORY.md: -------------------------------------------------------------------------------- 1 | Starting with the Sept 2025 version of Mojo the compiler is starting to enforce 2 | [lifetimes](https://docs.modular.com/mojo/manual/values/lifetimes/). This document proposes an approach at using memory. 3 | 4 | 1. Low level memory format 5 | 6 | Apache Arrow defines a columnar memory [format](https://arrow.apache.org/docs/format/Columnar.html) that can 7 | be accessed in many languages, including Python. Marrow defines 8 | an API to access this format in Mojo. 
One of the goals is to allow high performance 9 | integration between Python and Mojo when it comes to processing vast amounts of data. 10 | 11 | 1. ArrayData owns the data 12 | 13 | In Marrow ArrayData is the low level API that will access the Arrow memory block. 14 | 15 | As such it should own the data type, bitmap, buffers and children. 16 | 17 | 2. Typed arrays own ArrayData 18 | 19 | The next level in the API are the typed arrays: PrimitiveArray, ListArray, StructArray and so on. 20 | 21 | When constructing a typed array from an ArrayData the typed array will own the ArrayData. 22 | 23 | The typed arrays provide convenient accessors into the ArrayData. For example PrimitiveArray 24 | provides a `bitmap` and a `buffer`. Since Mojo doesn't currently provide properties these 25 | helper accessors will be implemented as functions. 26 | 27 | 3. The Array trait 28 | 29 | All of the typed Arrays are expected to implement the Array trait by providing 2 methods: 30 | 31 | - `fn take_data(deinit self) -> ArrayData` creates a standalone ArrayData by destroying the self. 32 | - `fn data(self) -> ref [self] ArrayData` access a read only copy of the ArrayData in the typed array. 33 | -------------------------------------------------------------------------------- /marrow/arrays/chunked_array.mojo: -------------------------------------------------------------------------------- 1 | """Implements a ChunkedArray class for handling pyarrow.Array-like objects.""" 2 | 3 | from marrow.buffers import Buffer, Bitmap 4 | from marrow.dtypes import DataType, DType 5 | 6 | 7 | struct ChunkedArray: 8 | """An array-like composed from a (possibly empty) collection of pyarrow.Arrays. 9 | 10 | [Reference](https://arrow.apache.org/docs/python/generated/pyarrow.ChunkedArray.html#pyarrow-chunkedarray).
11 | """ 12 | 13 | var dtype: DataType 14 | var length: Int 15 | var chunks: List[ArrayData] 16 | 17 | fn _compute_length(mut self) -> None: 18 | """Update the length of the array from the length of its chunks.""" 19 | var total_length = 0 20 | for chunk in self.chunks: 21 | total_length += chunk.length 22 | self.length = total_length 23 | 24 | fn __init__(out self, var dtype: DataType, var chunks: List[ArrayData]): 25 | self.dtype = dtype^ 26 | self.chunks = chunks^ 27 | self.length = 0 28 | self._compute_length() 29 | 30 | fn chunk(self, index: Int) -> ref [self.chunks] ArrayData: 31 | """Returns the chunk at the given index. 32 | 33 | Args: 34 | index: The desired index. 35 | 36 | Returns: 37 | A reference to the chunk at the given index. 38 | """ 39 | return self.chunks[index] 40 | 41 | fn combine_chunks(var self, out combined: ArrayData): 42 | """Combines all chunks into a single array.""" 43 | var bitmap = ArcPointer(Bitmap.alloc(self.length)) 44 | combined = ArrayData( 45 | dtype=self.dtype.copy(), 46 | length=self.length, 47 | bitmap=bitmap, 48 | buffers=List[ArcPointer[Buffer]](), 49 | children=List[ArcPointer[ArrayData]](), 50 | offset=0, 51 | ) 52 | var start = 0 53 | while self.chunks: 54 | var chunk = self.chunks.pop(0) 55 | start += chunk^.append_to_array(combined, start) 56 | return combined^ 57 | -------------------------------------------------------------------------------- /marrow/arrays/tests/test_chunked_array.mojo: -------------------------------------------------------------------------------- 1 | """Test the chunked array implementation.""" 2 | 3 | from testing import assert_equal, TestSuite 4 | from marrow.arrays.base import ArrayData 5 | from marrow.buffers import Buffer, Bitmap 6 | from marrow.arrays.chunked_array import ChunkedArray 7 | from marrow.dtypes import int8 8 | from memory import ArcPointer 9 | from marrow.test_fixtures.arrays import build_array_data, assert_bitmap_set 10 | 11 | 12 | def test_chunked_array(): 13 | var 
first_array_data = build_array_data(1, 0) 14 | var arrays = List[ArrayData]() 15 | arrays.append(first_array_data^) 16 | 17 | var second_array_data = build_array_data(2, 0) 18 | arrays.append(second_array_data^) 19 | 20 | var chunked_array = ChunkedArray(materialize[int8](), arrays^) 21 | assert_equal(chunked_array.length, 3) 22 | 23 | assert_equal(chunked_array.chunk(0).length, 1) 24 | var second_chunk = chunked_array.chunk(1).copy().as_uint8() 25 | assert_equal(second_chunk.data.length, 2) 26 | assert_equal(second_chunk.unsafe_get(0), 0) 27 | assert_equal(second_chunk.unsafe_get(1), 1) 28 | 29 | 30 | def test_combine_chunked_array(): 31 | var first_array_data = build_array_data(1, 0) 32 | var arrays = List[ArrayData]() 33 | arrays.append(first_array_data^) 34 | 35 | var second_array_data = build_array_data(2, 0) 36 | arrays.append(second_array_data^) 37 | 38 | var chunked_array = ChunkedArray(materialize[int8](), arrays^) 39 | assert_equal(chunked_array.length, 3) 40 | assert_equal(len(chunked_array.chunks), 2) 41 | assert_equal(chunked_array.chunk(1).copy().as_uint8().unsafe_get(1), 1) 42 | 43 | var combined_array = chunked_array^.combine_chunks() 44 | assert_equal(combined_array.length, 3) 45 | assert_equal(combined_array.dtype, materialize[int8]()) 46 | # Ensure that the last element of the last buffer has the expected value. 47 | assert_equal(combined_array.buffers[1][].unsafe_get(1), 1) 48 | 49 | 50 | def main(): 51 | TestSuite.discover_tests[__functions_in_module()]().run() 52 | -------------------------------------------------------------------------------- /marrow/schema.mojo: -------------------------------------------------------------------------------- 1 | """Define the Mojo representation of the Arrow Schema. 
2 | 3 | [Reference](https://arrow.apache.org/docs/python/generated/pyarrow.Schema.html#pyarrow.Schema) 4 | """ 5 | from .dtypes import Field 6 | from .c_data import CArrowSchema 7 | from collections import Dict 8 | from collections.string import StringSlice 9 | 10 | 11 | struct Schema(Copyable, Movable): 12 | var fields: List[Field] 13 | var metadata: Dict[String, String] 14 | 15 | fn __init__( 16 | out self, 17 | *, 18 | var fields: List[Field] = List[Field](), 19 | var metadata: Dict[String, String] = Dict[String, String](), 20 | ): 21 | """Initializes a schema with the given fields, if provided.""" 22 | self.fields = fields^ 23 | self.metadata = metadata^ 24 | 25 | @staticmethod 26 | fn from_c(c_arrow_schema: CArrowSchema) raises -> Schema: 27 | """Initializes a schema from a CArrowSchema.""" 28 | var fields = List[Field]() 29 | for i in range(c_arrow_schema.n_children): 30 | var child = c_arrow_schema.children[i] 31 | var field = child[].to_field() 32 | fields.append(field^) 33 | 34 | return Schema(fields=fields^) 35 | 36 | fn append(mut self, var field: Field): 37 | """Appends a field to the schema.""" 38 | self.fields.append(field^) 39 | 40 | fn names(self) -> List[String]: 41 | """Returns the names of the fields in the schema.""" 42 | var names = List[String]() 43 | for field in self.fields: 44 | names.append(field.name) 45 | return names^ 46 | 47 | fn field( 48 | self, 49 | *, 50 | index: Optional[Int] = None, 51 | name: Optional[StringSlice[mut=False, origin=ImmutAnyOrigin]] = None, 52 | ) raises -> ref [self.fields] Field: 53 | """Returns the field at the given index or with the given name.""" 54 | if index and name: 55 | raise Error("Either an index or a name must be provided, not both.") 56 | if index: 57 | return self.fields[index.value()] 58 | if not name: 59 | raise Error("Either an index or a name must be provided.") 60 | for field in self.fields: 61 | if field.name.as_string_slice() == name.value(): 62 | return field 63 | raise Error( 64 | 
StringSlice("Field with name `{}` not found.").format(name.value()) 65 | ) 66 | -------------------------------------------------------------------------------- /marrow/module/arrays/primitive_api.mojo: -------------------------------------------------------------------------------- 1 | """Python interface for primitive array.""" 2 | 3 | from os import abort 4 | from python.bindings import PythonModuleBuilder, PythonObject 5 | from marrow.dtypes import DataType 6 | from marrow.arrays.base import ArrayData 7 | from marrow.arrays import primitive 8 | from python import Python 9 | 10 | 11 | @fieldwise_init 12 | struct PrimitiveArray(Movable, Representable): 13 | """Type erased PrimitiveArray so that we can return to python.""" 14 | 15 | var data: ArrayData 16 | var offset: Int 17 | var capacity: Int 18 | 19 | fn __repr__(self) -> String: 20 | return "PrimitiveArray" 21 | 22 | @staticmethod 23 | fn __len__(py_self: PythonObject) raises -> PythonObject: 24 | """Return the length of the underlying ArrayData.""" 25 | var self_ptr = py_self.downcast_value_ptr[Self]() 26 | return self_ptr[].data.length 27 | 28 | @staticmethod 29 | fn __getitem__( 30 | py_self: PythonObject, index: PythonObject 31 | ) raises -> PythonObject: 32 | """Access the element at the given index.""" 33 | var self_ptr = py_self.downcast_value_ptr[Self]() 34 | return primitive.Int64Array(self_ptr[].data.copy()).unsafe_get( 35 | Int(index) 36 | ) 37 | 38 | 39 | fn array(content: PythonObject) raises -> PythonObject: 40 | """Create a primitive array, only Int64 implemented so far. 41 | 42 | Args: 43 | content: An iterable of Ints. 44 | 45 | Returns: 46 | A PrimitiveArray wrapped in a PythonObject.
47 | 48 | """ 49 | var actual = primitive.Int64Array() 50 | 51 | var iter = content.__iter__() 52 | while iter.__has_next__(): 53 | var next = iter.__next__() 54 | var value = Int(next) 55 | actual.append(value) 56 | 57 | var result = PrimitiveArray( 58 | data=actual.data.copy(), 59 | offset=actual.offset, 60 | capacity=actual.capacity, 61 | ) 62 | return PythonObject(alloc=result^) 63 | 64 | 65 | def add_to_module(mut builder: PythonModuleBuilder) -> None: 66 | """Add primitive array support to the python API.""" 67 | 68 | _ = ( 69 | builder.add_type[PrimitiveArray]("PrimitiveArray") 70 | .def_method[PrimitiveArray.__len__]("__len__") 71 | .def_method[PrimitiveArray.__getitem__]("__getitem__") 72 | ) 73 | builder.def_function[array]( 74 | "array", 75 | docstring="Build a primitive array with the given data and datatype", 76 | ) 77 | -------------------------------------------------------------------------------- /marrow/arrays/tests/test_base.mojo: -------------------------------------------------------------------------------- 1 | """Test the base module.""" 2 | from testing import assert_true, assert_false, assert_equal, TestSuite 3 | from memory import LegacyUnsafePointer, ArcPointer 4 | from marrow.arrays.base import ArrayData 5 | from marrow.buffers import Buffer, Bitmap 6 | from marrow.dtypes import DType, int8, uint8, int64 7 | from marrow.test_fixtures.arrays import build_array_data, assert_bitmap_set 8 | 9 | 10 | def test_array_data_with_offset(): 11 | """Test ArrayData with offset functionality.""" 12 | # Create ArrayData with offset 13 | var bitmap = ArcPointer(Bitmap.alloc(10)) 14 | var buffer = ArcPointer(Buffer.alloc[int8.native](10)) 15 | 16 | # Set some data in the buffer 17 | buffer[].unsafe_set[int8.native](2, 100) 18 | buffer[].unsafe_set[int8.native](3, 200) 19 | buffer[].unsafe_set[int8.native](4, 300) 20 | 21 | # Set validity bits 22 | bitmap[].unsafe_set(2, True) 23 | bitmap[].unsafe_set(3, True) 24 | bitmap[].unsafe_set(4, True) 25 | 26 | # 
Create ArrayData with offset=2 27 | var array_data = ArrayData( 28 | dtype=materialize[int8](), 29 | length=3, 30 | bitmap=bitmap, 31 | buffers=List(buffer), 32 | children=List[ArcPointer[ArrayData]](), 33 | offset=2, 34 | ) 35 | 36 | assert_equal(array_data.offset, 2) 37 | 38 | # Test is_valid with offset 39 | assert_true(array_data.is_valid(0)) # Should check bitmap[2] 40 | assert_true(array_data.is_valid(1)) # Should check bitmap[3] 41 | assert_true(array_data.is_valid(2)) # Should check bitmap[4] 42 | 43 | 44 | def test_array_data_fieldwise_init(): 45 | """Test that @fieldwise_init decorator works with offset field.""" 46 | var bitmap = ArcPointer(Bitmap.alloc(5)) 47 | var buffer = ArcPointer(Buffer.alloc[int8.native](5)) 48 | 49 | # Test creating ArrayData with all fields specified including offset 50 | var array_data = ArrayData( 51 | dtype=materialize[int8](), 52 | length=5, 53 | bitmap=bitmap, 54 | buffers=List(buffer), 55 | children=List[ArcPointer[ArrayData]](), 56 | offset=3, 57 | ) 58 | 59 | assert_equal(array_data.dtype, materialize[int8]()) 60 | assert_equal(array_data.length, 5) 61 | assert_equal(array_data.offset, 3) 62 | 63 | 64 | def test_array_data_write_to_with_offset(): 65 | """Test ArrayData write_to method respects offset.""" 66 | 67 | var bitmap = ArcPointer(Bitmap.alloc(10)) 68 | var buffer = ArcPointer(Buffer.alloc[DType.uint8](10)) 69 | 70 | @parameter 71 | for dtype in [uint8, int64]: 72 | # Set up data with values at positions 1,2,3 73 | buffer[].unsafe_set[dtype.native](1, 10) 74 | buffer[].unsafe_set[dtype.native](2, 11) 75 | buffer[].unsafe_set[dtype.native](3, 12) 76 | 77 | # Set validity for positions 1,2,3 78 | bitmap[].unsafe_set(1, True) 79 | bitmap[].unsafe_set(2, True) 80 | bitmap[].unsafe_set(3, True) 81 | 82 | # Create ArrayData with offset=1, so logical indices 0,1,2 map to physical indices 1,2,3 83 | var array_data = ArrayData( 84 | dtype=materialize[dtype](), 85 | length=3, 86 | bitmap=bitmap, 87 | buffers=List(buffer), 
88 | children=List[ArcPointer[ArrayData]](), 89 | offset=1, 90 | ) 91 | 92 | var writer = String() 93 | writer.write(array_data) 94 | assert_equal(writer.strip(), "10 11 12") 95 | 96 | 97 | def main(): 98 | TestSuite.discover_tests[__functions_in_module()]().run() 99 | -------------------------------------------------------------------------------- /marrow/tests/test_schema.mojo: -------------------------------------------------------------------------------- 1 | """Test the schema.mojo file.""" 2 | from testing import assert_equal, assert_true, TestSuite 3 | from python import Python, PythonObject 4 | from marrow.schema import Schema 5 | from marrow.dtypes import ( 6 | int8, 7 | int16, 8 | int32, 9 | int64, 10 | uint8, 11 | uint16, 12 | uint32, 13 | uint64, 14 | ) 15 | from marrow.dtypes import float16, float32, float64, binary, string, list_ 16 | from marrow.c_data import Field, CArrowSchema 17 | 18 | 19 | def test_schema_primitive_fields(): 20 | """Test the schema with primitive fields.""" 21 | 22 | # Create a schema with different data types 23 | fields = List[Field]( 24 | Field("field1", materialize[int8]()), 25 | Field("field2", materialize[int16]()), 26 | Field("field3", materialize[int32]()), 27 | Field("field4", materialize[int64]()), 28 | Field("field5", materialize[uint8]()), 29 | Field("field6", materialize[uint16]()), 30 | Field("field7", materialize[uint32]()), 31 | Field("field8", materialize[uint64]()), 32 | Field("field9", materialize[float16]()), 33 | Field("field10", materialize[float32]()), 34 | Field("field11", materialize[float64]()), 35 | Field("field12", materialize[binary]()), 36 | Field("field13", materialize[string]()), 37 | ) 38 | var nb_fields = len(fields) 39 | 40 | var schema = Schema(fields=fields^) 41 | 42 | # Check the number of fields in the schema 43 | assert_equal(len(schema.fields), nb_fields) 44 | 45 | # Check the names of the fields in the schema 46 | for i in range(nb_fields): 47 | assert_equal(schema.field(index=i).name, 
"field" + String(i + 1)) 48 | 49 | 50 | def test_schema_names() -> None: 51 | fields = List[Field]( 52 | Field("field1", materialize[int8](), False), 53 | Field("field2", materialize[int16](), False), 54 | ) 55 | 56 | var schema = Schema(fields=fields^) 57 | assert_equal( 58 | schema.names(), 59 | List[String](["field{}".format(i + 1) for i in range(2)]), 60 | ) 61 | 62 | schema.append(Field("field3", materialize[int32]())) 63 | assert_equal( 64 | schema.names(), 65 | List[String](["field{}".format(i + 1) for i in range(3)]), 66 | ) 67 | 68 | 69 | def test_from_c_schema() -> None: 70 | var pa = Python.import_module("pyarrow") 71 | var pa_schema = pa.schema( 72 | Python.list( 73 | pa.field("field1", pa.list_(pa.int32())), 74 | pa.field( 75 | "field2", 76 | pa.`struct`( 77 | Python.list( 78 | pa.field("field_a", pa.int32()), 79 | pa.field("field_b", pa.float64()), 80 | ) 81 | ), 82 | ), 83 | ) 84 | ) 85 | 86 | var c_schema = CArrowSchema.from_pyarrow(pa_schema) 87 | var schema = Schema.from_c(c_schema) 88 | 89 | assert_equal(len(schema.fields), 2) 90 | 91 | # Test first field. 92 | ref field_0 = schema.field(index=0) 93 | assert_true(field_0.dtype.is_list()) 94 | assert_true(field_0.dtype.fields[0].dtype.is_integer()) 95 | 96 | # Test second field. 97 | ref field_1 = schema.field(index=1) 98 | assert_true(field_1.dtype.is_struct()) 99 | assert_equal(field_1.dtype.fields[0].name, "field_a") 100 | assert_equal(field_1.dtype.fields[1].name, "field_b") 101 | 102 | 103 | def main(): 104 | TestSuite.discover_tests[__functions_in_module()]().run() 105 | -------------------------------------------------------------------------------- /marrow/io/formatter.mojo: -------------------------------------------------------------------------------- 1 | from marrow.arrays import * 2 | from marrow.dtypes import * 3 | 4 | 5 | struct Formatter: 6 | """Recursively formats and prints ArrayData and typed arrays.""" 7 | 8 | # How many elements to print. 
9 | var limit: Int 10 | 11 | fn __init__(out self, limit: Int = 3): 12 | self.limit = limit 13 | 14 | fn format[ 15 | T: DataType, //, W: Writer 16 | ](self, mut writer: W, value: PrimitiveArray[T]) raises: 17 | """Output a PrimitiveArray to the given Writer.""" 18 | writer.write("PrimitiveArray[") 19 | writer.write(materialize[value.dtype]()) 20 | writer.write("]([") 21 | 22 | for i in range(value.data.length): 23 | if i > 0: 24 | writer.write(", ") 25 | if i >= self.limit: 26 | writer.write("...") 27 | break 28 | 29 | if value.is_valid(i): 30 | writer.write(value.unsafe_get(i)) 31 | else: 32 | writer.write("NULL") 33 | writer.write("])") 34 | 35 | fn format[W: Writer](self, mut writer: W, value: ListArray) raises: 36 | """Output a ListArray to the given Writer.""" 37 | writer.write("ListArray([") 38 | for i in range(value.data.length): 39 | if i > 0: 40 | writer.write(", ") 41 | if i >= self.limit: 42 | writer.write("...") 43 | break 44 | 45 | if value.is_valid(i): 46 | self.format(writer, value.unsafe_get(i)) 47 | else: 48 | writer.write("NULL") 49 | writer.write("])") 50 | 51 | fn format[W: Writer](self, mut writer: W, value: StringArray) raises: 52 | """Output a StringArray to the given Writer.""" 53 | writer.write("StringArray([") 54 | for i in range(value.data.length): 55 | if i > 0: 56 | writer.write(", ") 57 | if i >= self.limit: 58 | writer.write("...") 59 | break 60 | 61 | if value.is_valid(i): 62 | writer.write(value.unsafe_get(UInt(i))) 63 | else: 64 | writer.write("NULL") 65 | writer.write("])") 66 | 67 | fn format[W: Writer](self, mut writer: W, array_data: ArrayData) raises: 68 | """Output a dynamic ArrayData to the given writer.""" 69 | if array_data.dtype.is_numeric(): 70 | 71 | @parameter 72 | for dtype in all_numeric_dtypes: 73 | if array_data.dtype == materialize[dtype](): 74 | self.format( 75 | writer, 76 | PrimitiveArray[dtype](data=array_data.copy()), 77 | ) 78 | return 79 | elif array_data.dtype.is_list(): 80 | self.format(writer, 
ListArray(array_data.copy())) 81 | return 82 | elif array_data.dtype.is_struct(): 83 | self.format(writer, StructArray(data=array_data.copy())) 84 | return 85 | elif array_data.dtype.is_string(): 86 | self.format(writer, StringArray(data=array_data.copy())) 87 | return 88 | raise Error("Unknown dtype {} in format.".format(array_data.dtype)) 89 | 90 | fn format[W: Writer](self, mut writer: W, value: StructArray) raises: 91 | """Output a StructArray to the Writer.""" 92 | writer.write("StructArray({") 93 | if len(value.data.children) > 0: 94 | for i in range(len(value.fields)): 95 | if i > 0: 96 | writer.write(", ") 97 | ref field = value.fields[i] 98 | writer.write("'") 99 | writer.write(field.name) 100 | writer.write("': ") 101 | ref field_value = value.unsafe_get(field.name) 102 | self.format(writer, field_value) 103 | writer.write("})") 104 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
162 | #.idea/ 163 | 164 | 165 | # magic environments 166 | .magic 167 | 168 | # pixi environments 169 | .pixi 170 | *.egg-info 171 | 172 | # Mojo build files 173 | kgen.trace.* 174 | -------------------------------------------------------------------------------- /marrow/module/dtypes_api.mojo: -------------------------------------------------------------------------------- 1 | """Python interface for dtypes.""" 2 | 3 | from python.bindings import PythonModuleBuilder, PythonObject 4 | from marrow import dtypes 5 | 6 | 7 | fn null() raises -> PythonObject: 8 | """Create a null DataType.""" 9 | var result = materialize[dtypes.null]() 10 | return PythonObject(alloc=result^) 11 | 12 | 13 | fn bool_() raises -> PythonObject: 14 | """Create a boolean DataType.""" 15 | var result = materialize[dtypes.bool_]() 16 | return PythonObject(alloc=result^) 17 | 18 | 19 | fn int8() raises -> PythonObject: 20 | """Create an int8 DataType.""" 21 | var result = materialize[dtypes.int8]() 22 | return PythonObject(alloc=result^) 23 | 24 | 25 | fn int16() raises -> PythonObject: 26 | """Create an int16 DataType.""" 27 | var result = materialize[dtypes.int16]() 28 | return PythonObject(alloc=result^) 29 | 30 | 31 | fn int32() raises -> PythonObject: 32 | """Create an int32 DataType.""" 33 | var result = materialize[dtypes.int32]() 34 | return PythonObject(alloc=result^) 35 | 36 | 37 | fn int64() raises -> PythonObject: 38 | """Create an int64 DataType.""" 39 | var result = materialize[dtypes.int64]() 40 | return PythonObject(alloc=result^) 41 | 42 | 43 | fn uint8() raises -> PythonObject: 44 | """Create a uint8 DataType.""" 45 | var result = materialize[dtypes.uint8]() 46 | return PythonObject(alloc=result^) 47 | 48 | 49 | fn uint16() raises -> PythonObject: 50 | """Create a uint16 DataType.""" 51 | var result = materialize[dtypes.uint16]() 52 | return PythonObject(alloc=result^) 53 | 54 | 55 | fn uint32() raises -> PythonObject: 56 | """Create a uint32 DataType.""" 57 | var result = 
materialize[dtypes.uint32]() 58 | return PythonObject(alloc=result^) 59 | 60 | 61 | fn uint64() raises -> PythonObject: 62 | """Create a uint64 DataType.""" 63 | var result = materialize[dtypes.uint64]() 64 | return PythonObject(alloc=result^) 65 | 66 | 67 | fn float16() raises -> PythonObject: 68 | """Create a float16 DataType.""" 69 | var result = materialize[dtypes.float16]() 70 | return PythonObject(alloc=result^) 71 | 72 | 73 | fn float32() raises -> PythonObject: 74 | """Create a float32 DataType.""" 75 | var result = materialize[dtypes.float32]() 76 | return PythonObject(alloc=result^) 77 | 78 | 79 | fn float64() raises -> PythonObject: 80 | """Create a float64 DataType.""" 81 | var result = materialize[dtypes.float64]() 82 | return PythonObject(alloc=result^) 83 | 84 | 85 | fn string() raises -> PythonObject: 86 | """Create a string DataType.""" 87 | var result = materialize[dtypes.string]() 88 | return PythonObject(alloc=result^) 89 | 90 | 91 | fn binary() raises -> PythonObject: 92 | """Create a binary DataType.""" 93 | var result = materialize[dtypes.binary]() 94 | return PythonObject(alloc=result^) 95 | 96 | 97 | def add_to_module(mut builder: PythonModuleBuilder) -> None: 98 | """Add DataType related data to the Python API.""" 99 | 100 | _ = builder.add_type[dtypes.DataType]("DataType") 101 | builder.def_function[null]("null", docstring="Create a null DataType.") 102 | builder.def_function[bool_]("bool_", docstring="Create a boolean DataType.") 103 | builder.def_function[int8]("int8", docstring="Create an int8 DataType.") 104 | builder.def_function[int16]("int16", docstring="Create an int16 DataType.") 105 | builder.def_function[int32]("int32", docstring="Create an int32 DataType.") 106 | builder.def_function[int64]("int64", docstring="Create an int64 DataType.") 107 | builder.def_function[uint8]("uint8", docstring="Create a uint8 DataType.") 108 | builder.def_function[uint16]( 109 | "uint16", docstring="Create a uint16 DataType." 
110 | ) 111 | builder.def_function[uint32]( 112 | "uint32", docstring="Create a uint32 DataType." 113 | ) 114 | builder.def_function[uint64]( 115 | "uint64", docstring="Create a uint64 DataType." 116 | ) 117 | builder.def_function[float16]( 118 | "float16", docstring="Create a float16 DataType." 119 | ) 120 | builder.def_function[float32]( 121 | "float32", docstring="Create a float32 DataType." 122 | ) 123 | builder.def_function[float64]( 124 | "float64", docstring="Create a float64 DataType." 125 | ) 126 | builder.def_function[string]( 127 | "string", docstring="Create a string DataType." 128 | ) 129 | builder.def_function[binary]( 130 | "binary", docstring="Create a binary DataType." 131 | ) 132 | -------------------------------------------------------------------------------- /marrow/arrays/tests/test_nested.mojo: -------------------------------------------------------------------------------- 1 | from testing import assert_equal, assert_true, assert_false, TestSuite 2 | 3 | 4 | from marrow.arrays import * 5 | from marrow.dtypes import * 6 | from marrow.test_fixtures.bool_array import as_bool_array_scalar 7 | from marrow.test_fixtures.arrays import build_list_of_list, build_struct 8 | 9 | 10 | def test_list_int_array(): 11 | var ints = Int64Array( 12 | ArrayData.from_buffer[int64]( 13 | Buffer.from_values[DType.int64](1, 2, 3), 3 14 | ) 15 | ) 16 | var lists = ListArray(ints^) 17 | assert_equal(lists.data.dtype, list_(materialize[int64]())) 18 | 19 | var first_value = lists.unsafe_get(0) 20 | assert_equal(first_value.__str__().strip(), "1 2 3") 21 | 22 | assert_equal(len(lists), 1) 23 | 24 | var data = lists^.take_data() 25 | assert_equal(data.length, 1) 26 | 27 | var arr = data^.as_list() 28 | assert_equal(len(arr), 1) 29 | 30 | 31 | def test_list_bool_array(): 32 | var bools = BoolArray() 33 | 34 | bools.append(as_bool_array_scalar(True)) 35 | bools.append(as_bool_array_scalar(False)) 36 | bools.append(as_bool_array_scalar(True)) 37 | 38 | var lists = 
ListArray(bools^) 39 | assert_equal(len(lists), 1) 40 | var first_value = lists.unsafe_get(0) 41 | var buffer = first_value.buffers[0] 42 | 43 | fn get(index: Int) -> Bool: 44 | return buffer[].unsafe_get[DType.bool](index) 45 | 46 | assert_equal(get(0), True) 47 | assert_equal(get(1), False) 48 | assert_equal(get(2), True) 49 | 50 | 51 | def test_list_str(): 52 | var strings = StringArray() 53 | strings.unsafe_append("hello") 54 | strings.unsafe_append("world") 55 | 56 | var lists = ListArray(strings^) 57 | var first_value = StringArray(lists.unsafe_get(0)) 58 | 59 | assert_equal(first_value.unsafe_get(0), "hello") 60 | assert_equal(first_value.unsafe_get(1), "world") 61 | 62 | 63 | def test_list_of_list(): 64 | list2 = build_list_of_list[int64]() 65 | top = ListArray(list2.unsafe_get(0)) 66 | middle_0 = top.unsafe_get(0) 67 | bottom = Int64Array(middle_0^) 68 | assert_equal(bottom.unsafe_get(1), 2) 69 | assert_equal(bottom.unsafe_get(0), 1) 70 | middle_1 = top.unsafe_get(1) 71 | bottom = Int64Array(middle_1^) 72 | assert_equal(bottom.unsafe_get(0), 3) 73 | assert_equal(bottom.unsafe_get(1), 4) 74 | 75 | 76 | def test_struct_array(): 77 | var fields = List[Field]( 78 | Field("id", materialize[int64]()), 79 | Field("name", materialize[string]()), 80 | Field("active", materialize[bool_]()), 81 | ) 82 | 83 | var struct_arr = StructArray(fields^, capacity=10) 84 | assert_equal(len(struct_arr), 0) 85 | assert_equal(struct_arr.capacity, 10) 86 | 87 | var data = struct_arr^.take_data() 88 | assert_equal(data.length, 0) 89 | assert_true(data.dtype.is_struct()) 90 | assert_equal(len(data.dtype.fields), 3) 91 | assert_equal(data.dtype.fields[0].name, "id") 92 | assert_equal(data.dtype.fields[1].name, "name") 93 | assert_equal(data.dtype.fields[2].name, "active") 94 | 95 | 96 | def test_list_array_str_repr(): 97 | var ints = Int64Array() 98 | var lists = ListArray(ints^) 99 | 100 | var str_repr = lists.__str__() 101 | var repr_repr = lists.__repr__() 102 | 103 | 
assert_equal(str_repr, "ListArray(length=1)") 104 | assert_equal(repr_repr, "ListArray(length=1)") 105 | assert_equal(str_repr, repr_repr) 106 | 107 | 108 | def test_struct_array_str_repr(): 109 | var fields = List[Field]( 110 | Field("id", materialize[int64]()), 111 | Field("name", materialize[string]()), 112 | ) 113 | 114 | var struct_arr = StructArray(fields^, capacity=5) 115 | 116 | var str_repr = struct_arr.__str__() 117 | var repr_repr = struct_arr.__repr__() 118 | 119 | assert_equal(str_repr, "StructArray(length=0)") 120 | assert_equal(repr_repr, "StructArray(length=0)") 121 | assert_equal(str_repr, repr_repr) 122 | 123 | 124 | def test_struct_array_unsafe_get(): 125 | var struct_array = build_struct() 126 | ref int_data_a = struct_array.unsafe_get("int_data_a") 127 | var int_a = Int32Array(int_data_a.copy()) 128 | assert_equal(int_a.unsafe_get(0), 1) 129 | assert_equal(int_a.unsafe_get(4), 5) 130 | ref int_data_b = struct_array.unsafe_get("int_data_b") 131 | var int_b = Int32Array(int_data_b.copy()) 132 | assert_equal(int_b.unsafe_get(0), 10) 133 | assert_equal(int_b.unsafe_get(2), 30) 134 | 135 | 136 | def main(): 137 | TestSuite.discover_tests[__functions_in_module()]().run() 138 | -------------------------------------------------------------------------------- /marrow/io/tests/test_formatter.mojo: -------------------------------------------------------------------------------- 1 | from testing import assert_equal, assert_true, assert_false, TestSuite 2 | 3 | from marrow.arrays import * 4 | from marrow.dtypes import * 5 | from marrow.io.formatter import Formatter 6 | from marrow.test_fixtures.arrays import ( 7 | build_list_of_list, 8 | build_list_of_int, 9 | build_struct, 10 | ) 11 | 12 | 13 | def test_primitive_array(): 14 | """Test the formatter for a primitive array.""" 15 | var arr = array[int32](42, 84, 126) 16 | 17 | var output = String() 18 | var formatter = Formatter() 19 | formatter.format(output, arr) 20 | assert_equal( 21 | output, 22 | 
"""PrimitiveArray[DataType(code=int32)]([42, 84, 126])""", 23 | ) 24 | 25 | 26 | def test_list_int_array(): 27 | var arr = build_list_of_int[int64]() 28 | var output = String() 29 | var formatter = Formatter() 30 | formatter.format(output, arr) 31 | assert_equal( 32 | output, 33 | ( 34 | "ListArray([PrimitiveArray[DataType(code=int64)]([1, 2])," 35 | " PrimitiveArray[DataType(code=int64)]([3, 4])," 36 | " PrimitiveArray[DataType(code=int64)]([5, 6, 7]), ...])" 37 | ), 38 | ) 39 | 40 | 41 | def test_list_list_array(): 42 | var arr = build_list_of_list[int16]() 43 | var output = String() 44 | var formatter = Formatter() 45 | formatter.format(output, arr) 46 | assert_equal( 47 | output, 48 | ( 49 | "ListArray([ListArray([PrimitiveArray[DataType(code=int16)]([1," 50 | " 2]), PrimitiveArray[DataType(code=int16)]([3, 4])])," 51 | " ListArray([PrimitiveArray[DataType(code=int16)]([5, 6, 7])," 52 | " PrimitiveArray[DataType(code=int16)]([])," 53 | " PrimitiveArray[DataType(code=int16)]([8])])," 54 | " ListArray([PrimitiveArray[DataType(code=int16)]([9, 10])])," 55 | " ...])" 56 | ), 57 | ) 58 | 59 | 60 | def test_empty_struct(): 61 | var fields = List[Field]( 62 | Field("id", materialize[int64]()), 63 | Field("name", materialize[string]()), 64 | Field("active", materialize[bool_]()), 65 | ) 66 | 67 | var struct_arr = StructArray(fields^, capacity=10) 68 | 69 | var output = String() 70 | var formatter = Formatter() 71 | formatter.format(output, struct_arr) 72 | assert_equal( 73 | output, 74 | "StructArray({})", 75 | ) 76 | 77 | 78 | def test_struct(): 79 | var struct_arr = build_struct() 80 | 81 | var output = String() 82 | var formatter = Formatter() 83 | formatter.format(output, struct_arr) 84 | assert_equal( 85 | output, 86 | ( 87 | "StructArray({'int_data_a': " 88 | "PrimitiveArray[DataType(code=int32)]([1, 2, 3, ...]), " 89 | "'int_data_b': PrimitiveArray[DataType(code=int32)]([10, 20, " 90 | "30])})" 91 | ), 92 | ) 93 | 94 | 95 | def 
test_formatter_with_different_limits(): 96 | """Test formatter with various limit values.""" 97 | var arr = array[int32](1, 2, 3, 4, 5, 6, 7, 8, 9, 10) 98 | 99 | # Test limit=0 100 | var output0 = String() 101 | var formatter0 = Formatter(limit=0) 102 | formatter0.format(output0, arr) 103 | assert_equal(output0, "PrimitiveArray[DataType(code=int32)]([...])") 104 | 105 | # Test limit=1 106 | var output1 = String() 107 | var formatter1 = Formatter(limit=1) 108 | formatter1.format(output1, arr) 109 | assert_equal(output1, "PrimitiveArray[DataType(code=int32)]([1, ...])") 110 | 111 | # Test limit=10 (should show all elements) 112 | var output10 = String() 113 | var formatter10 = Formatter(limit=10) 114 | formatter10.format(output10, arr) 115 | assert_equal( 116 | output10, 117 | "PrimitiveArray[DataType(code=int32)]([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])", 118 | ) 119 | 120 | 121 | def test_empty_array(): 122 | """Test formatter with an empty array.""" 123 | var arr = Int32Array(0) 124 | 125 | var output = String() 126 | var formatter = Formatter() 127 | formatter.format(output, arr) 128 | assert_equal(output, "PrimitiveArray[DataType(code=int32)]([])") 129 | 130 | 131 | def test_all_null_array(): 132 | """Test formatter with an array of all NULL values.""" 133 | var arr = Int32Array(3) 134 | arr.data.length = 3 135 | arr.data.bitmap[].unsafe_range_set(0, 3, False) 136 | 137 | var output = String() 138 | var formatter = Formatter() 139 | formatter.format(output, arr) 140 | assert_equal( 141 | output, "PrimitiveArray[DataType(code=int32)]([NULL, NULL, NULL])" 142 | ) 143 | 144 | 145 | def test_array_with_nulls(): 146 | """Test formatter with an array containing some NULL values.""" 147 | var arr = Int32Array(5) 148 | arr.append(1) 149 | arr.append(2) 150 | arr.data.bitmap[].unsafe_set(2, False) # Make third element NULL 151 | arr.data.length = 3 152 | arr.append(4) 153 | 154 | var output = String() 155 | var formatter = Formatter() 156 | formatter.format(output, arr) 157 | 
assert_equal( 158 | output, "PrimitiveArray[DataType(code=int32)]([1, 2, NULL, ...])" 159 | ) 160 | 161 | 162 | def main(): 163 | TestSuite.discover_tests[__functions_in_module()]().run() 164 | -------------------------------------------------------------------------------- /marrow/arrays/binary.mojo: -------------------------------------------------------------------------------- 1 | from memory import ArcPointer, memcpy, Span 2 | from collections.string import StringSlice 3 | from ..buffers import Buffer 4 | from ..dtypes import * 5 | 6 | 7 | struct StringArray(Array): 8 | var data: ArrayData 9 | var capacity: Int 10 | 11 | fn __init__(out self, var data: ArrayData) raises: 12 | if data.dtype != materialize[string](): 13 | raise Error( 14 | "Unexpected dtype '{}' instead of 'string'.".format(data.dtype) 15 | ) 16 | elif len(data.buffers) != 2: 17 | raise Error("StringArray requires exactly two buffers") 18 | 19 | self.capacity = data.length 20 | self.data = data^ 21 | 22 | fn bitmap(self) -> ref [self.data.bitmap] ArcPointer[Bitmap]: 23 | return self.data.bitmap 24 | 25 | fn offsets(self) -> ref [self.data.buffers] ArcPointer[Buffer]: 26 | return self.data.buffers[0] 27 | 28 | fn values(self) -> ref [self.data.buffers] ArcPointer[Buffer]: 29 | return self.data.buffers[1] 30 | 31 | fn __init__(out self, capacity: Int = 0): 32 | var bitmap = Bitmap.alloc(capacity) 33 | # TODO(kszucs): initial values capacity should be either 0 or some value received from the user 34 | var values = Buffer.alloc[DType.uint8](capacity) 35 | var offsets = Buffer.alloc[DType.uint32](capacity + 1) 36 | offsets.unsafe_set[DType.uint32](0, 0) 37 | 38 | self.capacity = capacity 39 | self.data = ArrayData( 40 | dtype=materialize[string](), 41 | length=0, 42 | bitmap=ArcPointer(bitmap^), 43 | buffers=List(ArcPointer(offsets^), ArcPointer(values^)), 44 | children=List[ArcPointer[ArrayData]](), 45 | offset=0, 46 | ) 47 | 48 | fn __moveinit__(out self, deinit existing: Self): 49 | self.data = 
existing.data^ 50 | self.capacity = existing.capacity 51 | 52 | fn __len__(self) -> Int: 53 | return self.data.length 54 | 55 | fn as_data[ 56 | self_origin: ImmutOrigin 57 | ](ref [self_origin]self) -> LegacyUnsafePointer[ArrayData, mut=False]: 58 | return LegacyUnsafePointer(to=self.data) 59 | 60 | fn take_data(deinit self) -> ArrayData: 61 | return self.data^ 62 | 63 | fn grow(mut self, capacity: Int): 64 | self.bitmap()[].grow(capacity) 65 | self.offsets()[].grow[DType.uint32](capacity + 1) 66 | self.capacity = capacity 67 | 68 | # fn shrink_to_fit(out self): 69 | 70 | fn is_valid(self, index: Int) -> Bool: 71 | return self.bitmap()[].unsafe_get(index) 72 | 73 | fn unsafe_append(mut self, value: String): 74 | # todo(kszucs): use unsafe set 75 | var index = self.data.length 76 | var last_offset = self.offsets()[].unsafe_get[DType.uint32](index) 77 | var next_offset = last_offset + len(value) 78 | self.data.length += 1 79 | self.bitmap()[].unsafe_set(index, True) 80 | self.offsets()[].unsafe_set[DType.uint32](index + 1, next_offset) 81 | self.values()[].grow[DType.uint8](next_offset) 82 | var dst_address = self.values()[].get_ptr_at(Int(last_offset)) 83 | var src_address = value.unsafe_ptr() 84 | memcpy(dest=dst_address, src=src_address, count=len(value)) 85 | 86 | fn unsafe_get(self, index: UInt) -> StringSlice[ImmutAnyOrigin]: 87 | var offset_idx = Int(index) + self.data.offset 88 | var start_offset = self.offsets()[].unsafe_get[DType.uint32](offset_idx) 89 | var end_offset = self.offsets()[].unsafe_get[DType.uint32]( 90 | offset_idx + 1 91 | ) 92 | var address = self.values()[].get_ptr_at(Int(start_offset)) 93 | var length = Int(end_offset) - Int(start_offset) 94 | return StringSlice( 95 | unsafe_from_utf8=Span[Byte]( 96 | ptr=UnsafePointer(address).mut_cast[False](), length=length 97 | ) 98 | ) 99 | 100 | fn unsafe_set(mut self, index: Int, value: String) raises: 101 | var start_offset = self.offsets()[].unsafe_get[DType.int32](index) 102 | var end_offset = 
self.offsets()[].unsafe_get[DType.int32](index + 1) 103 | var length = Int(end_offset - start_offset) 104 | 105 | if length != len(value): 106 | raise Error( 107 | "String length mismatch, inplace update must have the same" 108 | " length" 109 | ) 110 | 111 | var dst_address = self.values()[].get_ptr_at(Int(start_offset)) 112 | var src_address = value.unsafe_ptr() 113 | memcpy(dest=dst_address, src=src_address, count=length) 114 | 115 | fn write_to[W: Writer](self, mut writer: W): 116 | """ 117 | Formats this StringArray to the provided Writer. 118 | 119 | Parameters: 120 | W: A type conforming to the Writable trait. 121 | 122 | Args: 123 | writer: The object to write to. 124 | """ 125 | 126 | writer.write("StringArray( length=") 127 | writer.write(self.data.length) 128 | writer.write(", data= [") 129 | for i in range(self.data.length): 130 | writer.write('"') 131 | writer.write(self.unsafe_get(UInt(i))) 132 | writer.write('", ') 133 | if i > 1: 134 | break 135 | writer.write(" ])") 136 | 137 | fn __str__(self) -> String: 138 | return String.write(self) 139 | 140 | fn __repr__(self) -> String: 141 | return String.write(self) 142 | -------------------------------------------------------------------------------- /marrow/tests/test_dtypes.mojo: -------------------------------------------------------------------------------- 1 | from testing import assert_equal, assert_true, assert_false, TestSuite 2 | import marrow.dtypes as dt 3 | 4 | 5 | def test_bool_type(): 6 | assert_true(materialize[dt.bool_]() == materialize[dt.bool_]()) 7 | assert_false(materialize[dt.bool_]() == materialize[dt.int64]()) 8 | assert_true(materialize[dt.bool_]() is materialize[dt.bool_]()) 9 | assert_false(materialize[dt.bool_]() is materialize[dt.int64]()) 10 | 11 | 12 | def test_list_type(): 13 | assert_true( 14 | dt.list_(materialize[dt.int64]()) == dt.list_(materialize[dt.int64]()) 15 | ) 16 | assert_false( 17 | dt.list_(materialize[dt.int64]()) == dt.list_(materialize[dt.int32]()) 18 | ) 19 
| 20 | 21 | def test_field(): 22 | var field = dt.Field("a", materialize[dt.int64](), False) 23 | var writer = String() 24 | writer.write(field) 25 | var expected = ( 26 | 'Field(name="a", dtype=DataType(code=int64), nullable=False, )' 27 | ) 28 | assert_equal(writer, expected) 29 | assert_equal(String(field), expected) 30 | assert_equal(field.__repr__(), expected) 31 | 32 | 33 | def test_struct_type(): 34 | s1 = dt.struct_( 35 | dt.Field("a", materialize[dt.int64](), False), 36 | dt.Field("b", materialize[dt.int32](), False), 37 | ) 38 | s2 = dt.struct_( 39 | dt.Field("a", materialize[dt.int64](), False), 40 | dt.Field("b", materialize[dt.int32](), False), 41 | ) 42 | s3 = dt.struct_( 43 | dt.Field("a", materialize[dt.int64](), False), 44 | dt.Field("b", materialize[dt.int32](), False), 45 | dt.Field("c", materialize[dt.int8](), False), 46 | ) 47 | assert_true(s1 == s2) 48 | assert_false(s1 == s3) 49 | 50 | 51 | def test_is_integer(): 52 | assert_true(materialize[dt.int8]().is_integer()) 53 | assert_true(materialize[dt.int16]().is_integer()) 54 | assert_true(materialize[dt.int32]().is_integer()) 55 | assert_true(materialize[dt.int64]().is_integer()) 56 | assert_true(materialize[dt.uint8]().is_integer()) 57 | assert_true(materialize[dt.uint16]().is_integer()) 58 | assert_true(materialize[dt.uint32]().is_integer()) 59 | assert_true(materialize[dt.uint64]().is_integer()) 60 | assert_false(materialize[dt.bool_]().is_integer()) 61 | assert_false(materialize[dt.float32]().is_integer()) 62 | assert_false(materialize[dt.float64]().is_integer()) 63 | assert_false(dt.list_(materialize[dt.int64]()).is_integer()) 64 | 65 | 66 | def test_is_signed_integer(): 67 | assert_true(materialize[dt.int8]().is_signed_integer()) 68 | assert_true(materialize[dt.int16]().is_signed_integer()) 69 | assert_true(materialize[dt.int32]().is_signed_integer()) 70 | assert_true(materialize[dt.int64]().is_signed_integer()) 71 | assert_false(materialize[dt.uint8]().is_signed_integer()) 72 | 
assert_false(materialize[dt.uint16]().is_signed_integer()) 73 | assert_false(materialize[dt.uint32]().is_signed_integer()) 74 | assert_false(materialize[dt.uint64]().is_signed_integer()) 75 | assert_false(materialize[dt.bool_]().is_signed_integer()) 76 | assert_false(materialize[dt.float32]().is_signed_integer()) 77 | assert_false(materialize[dt.float64]().is_signed_integer()) 78 | 79 | 80 | def test_is_unsigned_integer(): 81 | assert_false(materialize[dt.int8]().is_unsigned_integer()) 82 | assert_false(materialize[dt.int16]().is_unsigned_integer()) 83 | assert_false(materialize[dt.int32]().is_unsigned_integer()) 84 | assert_false(materialize[dt.int64]().is_unsigned_integer()) 85 | assert_true(materialize[dt.uint8]().is_unsigned_integer()) 86 | assert_true(materialize[dt.uint16]().is_unsigned_integer()) 87 | assert_true(materialize[dt.uint32]().is_unsigned_integer()) 88 | assert_true(materialize[dt.uint64]().is_unsigned_integer()) 89 | assert_false(materialize[dt.bool_]().is_unsigned_integer()) 90 | assert_false(materialize[dt.float32]().is_unsigned_integer()) 91 | assert_false(materialize[dt.float64]().is_unsigned_integer()) 92 | 93 | 94 | def test_is_floating_point(): 95 | assert_false(materialize[dt.int8]().is_floating_point()) 96 | assert_false(materialize[dt.int16]().is_floating_point()) 97 | assert_false(materialize[dt.int32]().is_floating_point()) 98 | assert_false(materialize[dt.int64]().is_floating_point()) 99 | assert_false(materialize[dt.uint8]().is_floating_point()) 100 | assert_false(materialize[dt.uint16]().is_floating_point()) 101 | assert_false(materialize[dt.uint32]().is_floating_point()) 102 | assert_false(materialize[dt.uint64]().is_floating_point()) 103 | assert_false(materialize[dt.bool_]().is_floating_point()) 104 | assert_true(materialize[dt.float32]().is_floating_point()) 105 | assert_true(materialize[dt.float64]().is_floating_point()) 106 | 107 | 108 | def test_bitwidth(): 109 | assert_equal(materialize[dt.int8]().bitwidth(), 8) 110 | 
assert_equal(materialize[dt.int16]().bitwidth(), 16) 111 | assert_equal(materialize[dt.int32]().bitwidth(), 32) 112 | assert_equal(materialize[dt.int64]().bitwidth(), 64) 113 | assert_equal(materialize[dt.uint8]().bitwidth(), 8) 114 | assert_equal(materialize[dt.uint16]().bitwidth(), 16) 115 | assert_equal(materialize[dt.uint32]().bitwidth(), 32) 116 | assert_equal(materialize[dt.uint64]().bitwidth(), 64) 117 | assert_equal(materialize[dt.bool_]().bitwidth(), 1) 118 | assert_equal(materialize[dt.float32]().bitwidth(), 32) 119 | assert_equal(materialize[dt.float64]().bitwidth(), 64) 120 | assert_equal(dt.list_(materialize[dt.int64]()).bitwidth(), 0) 121 | 122 | 123 | def main(): 124 | TestSuite.discover_tests[__functions_in_module()]().run() 125 | -------------------------------------------------------------------------------- /CLAUDE.md: -------------------------------------------------------------------------------- 1 | # CLAUDE.md 2 | 3 | This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. 4 | 5 | ## Project Overview 6 | 7 | Marrow is an implementation of Apache Arrow in Mojo. Apache Arrow is a cross-language development platform for in-memory data with a standardized columnar memory format. This implementation is in early/experimental stages as Mojo itself is under heavy development. 8 | 9 | For information about the Mojo programming language and the standard library see https://github.com/modular/modular 10 | 11 | ## Build System & Commands 12 | 13 | This project uses **pixi** as the package manager. All commands are run through pixi: 14 | 15 | ```bash 16 | # Run all tests 17 | pixi run test 18 | 19 | # Format code 20 | pixi run fmt 21 | 22 | # Build package 23 | pixi run package 24 | ``` 25 | 26 | ### Running Individual Tests 27 | 28 | To run tests for a specific module: 29 | ```bash 30 | mojo test marrow/tests/test_dtypes.mojo -I . 31 | mojo test marrow/arrays/tests/test_primitive.mojo -I . 
32 | ``` 33 | 34 | The `-I .` flag is important as it adds the current directory to the import path. 35 | 36 | ## Core Architecture 37 | 38 | ### Memory Ownership Model 39 | 40 | The codebase follows a strict ownership hierarchy (documented in `marrow/MEMORY.md`): 41 | 42 | 1. **ArrayData** - Low-level structure that owns: 43 | - Data type (`dtype`) 44 | - Validity bitmap (`bitmap`) 45 | - Data buffers (`buffers`) 46 | - Child arrays (`children` for nested types) 47 | 48 | 2. **Typed Arrays** - High-level views that own an `ArrayData`: 49 | - `PrimitiveArray[T]` for numeric/boolean types 50 | - `StringArray` for UTF-8 strings 51 | - `ListArray` for nested lists 52 | - `StructArray` for nested structs 53 | - `ChunkedArray` for arrays split across multiple chunks 54 | 55 | 3. **Array Trait** - All typed arrays implement: 56 | - `fn take_data(deinit self) -> ArrayData` - Consumes self to create standalone ArrayData 57 | - `fn as_data[self_origin](ref [self_origin]self) -> LegacyUnsafePointer[ArrayData]` - Read-only reference to wrapped ArrayData 58 | 59 | ### Key Abstractions 60 | 61 | **Buffer & Bitmap** (`marrow/buffers.mojo`): 62 | - `Buffer` - Manages contiguous memory regions with reference counting via `ArcPointer` 63 | - `Bitmap` - Tracks null/validity for array elements using bit-packing 64 | - Both use 64-byte alignment for SIMD optimization 65 | 66 | **DataType** (`marrow/dtypes.mojo`): 67 | - Enum-based type system matching Arrow specification 68 | - Supports primitive types (bool, int8-64, uint8-64, float32/64) 69 | - Nested types via `list_(DataType)` and `struct_(Field, ...)` 70 | - Uses `code` field for type identification and optional `native` field for DType mapping 71 | 72 | **C Data Interface** (`marrow/c_data.mojo`): 73 | - `CArrowSchema` and `CArrowArray` for zero-copy data exchange 74 | - Primary use case: interop with PyArrow via `from_pyarrow()` and `to_pyarrow()` 75 | - Release callbacks not yet fully implemented (Mojo limitation with C 
function callbacks) 76 | 77 | ### Directory Structure 78 | 79 | ``` 80 | marrow/ 81 | ├── dtypes.mojo # Type system (DataType, Field) 82 | ├── buffers.mojo # Memory management (Buffer, Bitmap) 83 | ├── arrays/ 84 | │ ├── base.mojo # ArrayData & Array trait 85 | │ ├── primitive.mojo # PrimitiveArray[T] for fixed-size types 86 | │ ├── binary.mojo # StringArray, BinaryArray (variable-length) 87 | │ ├── nested.mojo # ListArray, StructArray 88 | │ └── chunked_array.mojo # ChunkedArray (multiple ArrayData) 89 | |-- module # Export functions for the python module 90 | ├── c_data.mojo # Arrow C Data Interface 91 | ├── schema.mojo # Schema with Fields and metadata 92 | ├── tests/ # Core module tests 93 | └── test_fixtures/ # Shared test utilities 94 | python/ # The Python module top level 95 | ``` 96 | 97 | ## Implementation Patterns 98 | 99 | ### Creating Arrays 100 | 101 | ```mojo 102 | # From values (primitive) 103 | from marrow.arrays import array 104 | var a = array[int8](1, 2, 3, 4) 105 | 106 | # Building incrementally (string) 107 | var s = StringArray() 108 | s.unsafe_append("hello") 109 | s.unsafe_append("world") 110 | 111 | # From PyArrow (zero-copy) 112 | var c_array = CArrowArray.from_pyarrow(pyarrow_array) 113 | var c_schema = CArrowSchema.from_pyarrow(pyarrow_array.type) 114 | var dtype = c_schema.to_dtype() 115 | var data = c_array.to_array(dtype) 116 | ``` 117 | 118 | ### Null Handling 119 | 120 | Arrays use a validity bitmap where `True` = valid, `False` = null: 121 | - Check validity: `array.is_valid(index)` or `array.data.is_valid(index)` 122 | - Access values: `unsafe_get(index)` (no bounds/null checking for performance) 123 | - Set values: `unsafe_set(index, value)` 124 | 125 | ### Type Constraints 126 | 127 | Mojo lacks dynamic dispatch, so the codebase uses: 128 | - Compile-time parameterization (`PrimitiveArray[int64]`) 129 | - Common `ArrayData` layout with specialized typed views 130 | - Runtime type checking via `DataType.code` comparison 131 | 
├── module # Export functions for the python module
19 | 20 | 21 | @fieldwise_init 22 | struct ArrayData(Copyable, Movable, Representable, Stringable, Writable): 23 | """ArrayData is the lower level abstraction directly usable by the library consumer. 24 | 25 | Equivalent with https://github.com/apache/arrow/blob/7184439dea96cd285e6de00e07c5114e4919a465/cpp/src/arrow/array/data.h#L62-L84. 26 | """ 27 | 28 | var dtype: DataType 29 | var length: Int 30 | var bitmap: ArcPointer[Bitmap] 31 | var buffers: List[ArcPointer[Buffer]] 32 | var children: List[ArcPointer[ArrayData]] 33 | var offset: Int 34 | 35 | @staticmethod 36 | fn from_buffer[ 37 | dtype: DataType 38 | ](var buffer: Buffer, length: Int) -> ArrayData: 39 | """Build an ArrayData from a buffer where all the values are not null. 40 | """ 41 | var bitmap = Bitmap.alloc(length) 42 | bitmap.unsafe_range_set(0, length, True) 43 | return ArrayData( 44 | dtype=materialize[dtype](), 45 | length=length, 46 | bitmap=ArcPointer(bitmap^), 47 | buffers=List(ArcPointer(buffer^)), 48 | children=List[ArcPointer[ArrayData]](), 49 | offset=0, 50 | ) 51 | 52 | fn __copyinit__(out self, existing: Self): 53 | self.dtype = existing.dtype.copy() 54 | self.length = existing.length 55 | self.bitmap = existing.bitmap 56 | self.buffers = existing.buffers.copy() 57 | self.children = existing.children.copy() 58 | self.offset = existing.offset 59 | 60 | fn is_valid(self, index: Int) -> Bool: 61 | return self.bitmap[].unsafe_get(index + self.offset) 62 | 63 | fn as_primitive[T: DataType](var self) raises -> PrimitiveArray[T]: 64 | return PrimitiveArray[T](self^) 65 | 66 | fn as_int8(var self) raises -> Int8Array: 67 | return Int8Array(self^) 68 | 69 | fn as_int16(var self) raises -> Int16Array: 70 | return Int16Array(self^) 71 | 72 | fn as_int32(var self) raises -> Int32Array: 73 | return Int32Array(self^) 74 | 75 | fn as_int64(var self) raises -> Int64Array: 76 | return Int64Array(self^) 77 | 78 | fn as_uint8(var self) raises -> UInt8Array: 79 | return UInt8Array(self^) 80 | 81 | fn 
as_uint16(var self) raises -> UInt16Array: 82 | return UInt16Array(self^) 83 | 84 | fn as_uint32(var self) raises -> UInt32Array: 85 | return UInt32Array(self^) 86 | 87 | fn as_uint64(var self) raises -> UInt64Array: 88 | return UInt64Array(self^) 89 | 90 | fn as_float32(var self) raises -> Float32Array: 91 | return Float32Array(self^) 92 | 93 | fn as_float64(var self) raises -> Float64Array: 94 | return Float64Array(self^) 95 | 96 | fn as_string(var self) raises -> StringArray: 97 | return StringArray(self^) 98 | 99 | fn as_list(var self) raises -> ListArray: 100 | return ListArray(self^) 101 | 102 | fn _dynamic_write[W: Writer](self, index: Int, mut writer: W): 103 | """Write to the given stream dispatching on the dtype.""" 104 | 105 | @parameter 106 | for known_type in [ 107 | DType.bool, 108 | DType.int16, 109 | DType.int32, 110 | DType.int64, 111 | DType.int8, 112 | DType.float32, 113 | DType.float64, 114 | DType.uint16, 115 | DType.uint32, 116 | DType.uint64, 117 | DType.uint8, 118 | ]: 119 | if self.dtype.native == known_type: 120 | writer.write(self.buffers[0][].unsafe_get[known_type](index)) 121 | return 122 | if self.dtype.is_string(): 123 | # Should print a StringArray through the element specific write_to. 124 | writer.write("") 125 | return 126 | writer.write("dtype=") 127 | writer.write(self.dtype) 128 | 129 | fn write_to[W: Writer](self, mut writer: W): 130 | """ 131 | Formats this ArrayData to the provided Writer. 132 | 133 | Parameters: 134 | W: A type conforming to the Writable trait. 135 | 136 | Args: 137 | writer: The object to write to. 
138 | """ 139 | 140 | for i in range(self.length): 141 | if self.is_valid(i): 142 | var real_index = i + self.offset 143 | self._dynamic_write(real_index, writer) 144 | else: 145 | writer.write("-") 146 | writer.write(" ") 147 | if i > 10: 148 | break 149 | 150 | fn __str__(self) -> String: 151 | return String.write(self) 152 | 153 | fn __repr__(self) -> String: 154 | return String.write(self) 155 | 156 | fn append_to_array( 157 | deinit self: ArrayData, mut combined: ArrayData, start: Int 158 | ) -> Int: 159 | """Append the content self to the combined array, consumes self. 160 | 161 | Args: 162 | combined: Array to append to. 163 | start: Position where to append. 164 | 165 | Returns: 166 | The new start position. 167 | """ 168 | combined.bitmap[].extend(self.bitmap[], start, self.length) 169 | combined.buffers.extend(self.buffers^) 170 | combined.children.extend(self.children^) 171 | return start + self.length 172 | -------------------------------------------------------------------------------- /marrow/test_fixtures/arrays.mojo: -------------------------------------------------------------------------------- 1 | from marrow.arrays import ( 2 | BoolArray, 3 | ArrayData, 4 | ListArray, 5 | StructArray, 6 | ) 7 | from memory import ArcPointer 8 | from marrow.buffers import Buffer, Bitmap 9 | from marrow.dtypes import uint8, DataType, list_, int32, Field, struct_ 10 | from testing import assert_equal 11 | from builtin._location import __call_location 12 | 13 | 14 | fn as_bool_array_scalar(value: Bool) -> BoolArray.scalar: 15 | """Bool conversion function.""" 16 | return BoolArray.scalar(Scalar[DType.bool](value)) 17 | 18 | 19 | fn bool_array(*values: Bool) -> BoolArray: 20 | var a = BoolArray(len(values)) 21 | for value in values: 22 | a.unsafe_append(as_bool_array_scalar(value)) 23 | return a^ 24 | 25 | 26 | def build_array_data(length: Int, nulls: Int) -> ArrayData: 27 | """Builds an ArrayData object with nulls. 
@always_inline
def assert_bitmap_set(
    bitmap: Bitmap, expected_true_pos: List[Int], message: StringLiteral
) -> None:
    """Assert that exactly the given positions of the bitmap are set to True.

    Every position listed in `expected_true_pos` must be True and every
    other position must be False.

    Args:
        bitmap: The bitmap to check.
        expected_true_pos: Positions expected to be True. Must be sorted in
            increasing order: a single forward cursor is used, so an
            out-of-order entry would never be matched.
        message: Prefix included in the failure message to identify the call.
    """
    var list_pos = 0
    for i in range(bitmap.length()):
        var expected_value = False
        if list_pos < len(expected_true_pos):
            if expected_true_pos[list_pos] == i:
                expected_value = True
                list_pos += 1
        var current_value = bitmap.unsafe_get(i)
        assert_equal(
            current_value,
            expected_value,
            String(
                "{}: Bitmap index {} is {}, expected {} as per list position {}"
            ).format(message, i, current_value, expected_value, list_pos),
            location=__call_location(),
        )
80 | var bitmap = Bitmap.alloc(10) 81 | bitmap.unsafe_range_set(0, 10, True) 82 | var buffer = ArcPointer(Buffer.alloc[data_type.native](10)) 83 | for i in range(10): 84 | buffer[].unsafe_set[data_type.native](i, i + 1) 85 | 86 | var value_data = ArrayData( 87 | dtype=materialize[data_type](), 88 | length=10, 89 | bitmap=ArcPointer(bitmap^), 90 | buffers=List(buffer), 91 | children=List[ArcPointer[ArrayData]](), 92 | offset=0, 93 | ) 94 | 95 | # Define the PrimitiveArrays. 96 | var value_offset = ArcPointer( 97 | Buffer.from_values[DType.int32](0, 2, 4, 7, 7, 8, 10) 98 | ) 99 | 100 | var list_bitmap = ArcPointer(Bitmap.alloc(6)) 101 | list_bitmap[].unsafe_range_set(0, 6, True) 102 | list_bitmap[].unsafe_set(3, False) 103 | var list_data = ArrayData( 104 | dtype=list_(materialize[data_type]()), 105 | length=6, 106 | buffers=List(value_offset), 107 | children=List(ArcPointer(value_data^)), 108 | bitmap=list_bitmap, 109 | offset=0, 110 | ) 111 | return ListArray(list_data^) 112 | 113 | 114 | fn build_list_of_list[data_type: DataType]() raises -> ListArray: 115 | """Build a test ListArray that itself contains a ListArray of IntArrays. 116 | 117 | See: https://elferherrera.github.io/arrow_guide/arrays_nested.html 118 | """ 119 | 120 | # Define all the values. 121 | var bitmap = ArcPointer(Bitmap.alloc(10)) 122 | bitmap[].unsafe_range_set(0, 10, True) 123 | var buffer = ArcPointer(Buffer.alloc[data_type.native](10)) 124 | for i in range(10): 125 | buffer[].unsafe_set[data_type.native](i, i + 1) 126 | 127 | var value_data = ArrayData( 128 | dtype=materialize[data_type](), 129 | length=10, 130 | bitmap=bitmap, 131 | buffers=List(buffer), 132 | children=List[ArcPointer[ArrayData]](), 133 | offset=0, 134 | ) 135 | 136 | # Define the PrimitiveArrays. 
def build_struct() -> StructArray:
    """Build a small two-field StructArray fixture for tests.

    NOTE(review): the children have lengths 5 and 3 while the struct length
    is 2; the Arrow spec expects struct children to match the parent length —
    confirm this mismatch is intentional for the tests using this fixture.
    """
    # First child column: five int32 values, all valid.
    var int_data_a = ArrayData.from_buffer[int32](
        Buffer.from_values[DType.int32](1, 2, 3, 4, 5), 5
    )
    var field_1 = Field("int_data_a", materialize[int32]())

    # Second child column: three int32 values, all valid.
    var int_data_b = ArrayData.from_buffer[int32](
        Buffer.from_values[DType.int32](10, 20, 30), 3
    )
    var field_2 = Field("int_data_b", materialize[int32]())
    # Struct-level validity bitmap: both entries marked valid.
    bitmap = Bitmap.alloc(2)
    bitmap.unsafe_range_set(0, 2, True)
    var struct_array_data = ArrayData(
        dtype=struct_(List(field_1^, field_2^)),
        length=2,
        bitmap=ArcPointer(bitmap^),
        offset=0,
        # This struct carries no buffers of its own; data lives in children.
        buffers=List[ArcPointer[Buffer]](),
        children=List(ArcPointer(int_data_a^), ArcPointer(int_data_b^)),
    )
    return StructArray(data=struct_array_data^)
The implementation is far from being complete or usable in practice, but I prefer to share it in its early stage so others can join the effort.
26 | 27 | ## Examples 28 | 29 | ### Creating a primitive array 30 | 31 | ```mojo 32 | from marrow.arrays import array, StringArray, ListArray, Int64Array 33 | from marrow.dtypes import int8, bool_, list_ 34 | 35 | var a = array[int8](1, 2, 3, 4) 36 | var b = array[bool_](True, False, True) 37 | ``` 38 | 39 | ### Creating a string array 40 | 41 | ```mojo 42 | var s = StringArray() 43 | s.unsafe_append("hello") 44 | s.unsafe_append("world") 45 | ``` 46 | 47 | More convenient APIs are planned to be added in the future. 48 | 49 | ### Creating a list array 50 | 51 | ```mojo 52 | var ints = Int64Array() 53 | var lists = ListArray(ints) 54 | 55 | ints.append(1) 56 | ints.append(2) 57 | ints.append(3) 58 | lists.unsafe_append(True) 59 | assert_equal(len(lists), 1) 60 | assert_equal(lists.data.dtype, list_(int64)) 61 | ``` 62 | 63 | ### Formatting arrays for display 64 | 65 | ```mojo 66 | from marrow.io import Formatter 67 | from marrow.arrays import array 68 | from marrow.dtypes import int32 69 | 70 | var arr = array[int32](1, 2, 3, 4, 5, 6, 7, 8, 9, 10) 71 | var output = String() 72 | var formatter = Formatter(limit=3) # Show first 3 elements 73 | formatter.format(output, arr) 74 | print(output) 75 | # Output: PrimitiveArray[DataType(code=int32)]([1, 2, 3, ...]) 76 | ``` 77 | 78 | The formatter supports all array types including nested structures like lists and structs, and automatically handles NULL values. 79 | 80 | ### Zero-copy access of a PyArrow array in Mojo 81 | 82 | For more details see the [Arrow C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html). 
83 | 84 | ```mojo 85 | var pa = Python.import_module("pyarrow") 86 | var pyarr = pa.array( 87 | [1, 2, 3, 4, 5], mask=[False, False, False, False, True] 88 | ) 89 | 90 | var c_array = CArrowArray.from_pyarrow(pyarr) 91 | var c_schema = CArrowSchema.from_pyarrow(pyarr.type) 92 | 93 | var dtype = c_schema.to_dtype() 94 | assert_equal(dtype, int64) 95 | assert_equal(c_array.length, 5) 96 | assert_equal(c_array.null_count, 1) 97 | assert_equal(c_array.offset, 0) 98 | assert_equal(c_array.n_buffers, 2) 99 | assert_equal(c_array.n_children, 0) 100 | 101 | var data = c_array.to_array(dtype) 102 | var array = data.as_int64() 103 | assert_equal(array.bitmap[].size, 64) 104 | assert_equal(array.is_valid(0), True) 105 | assert_equal(array.is_valid(1), True) 106 | assert_equal(array.is_valid(2), True) 107 | assert_equal(array.is_valid(3), True) 108 | assert_equal(array.is_valid(4), False) 109 | assert_equal(array.unsafe_get(0), 1) 110 | assert_equal(array.unsafe_get(1), 2) 111 | assert_equal(array.unsafe_get(2), 3) 112 | assert_equal(array.unsafe_get(3), 4) 113 | assert_equal(array.unsafe_get(4), 0) 114 | 115 | array.unsafe_set(0, 10) 116 | assert_equal(array.unsafe_get(0), 10) 117 | assert_equal(str(pyarr), "[\n 10,\n 2,\n 3,\n 4,\n null\n]") 118 | ``` 119 | 120 | ## Rough edges and limitations 121 | 122 | So far the implementation has been focused to provide a solid foundation for further development, not for memory efficiency, performance or completeness. 123 | 124 | A couple of notable limitations: 125 | 126 | 1. The chosen abstractions may not be ideal, but: 127 | - mojo lacks support for dynamic dispatch at the moment 128 | - variant elements must be copyable 129 | - references and lifetimes are not hardened yet 130 | - expressing nested data types is not straightforward 131 | 132 | Due to these reasons polymorphism is achieved by defining a common layout for type hierarchies and providing specialized views for each child type. 
I shared the implementation in its current state so others can join the effort.
struct ListArray(Array):
    """Typed view over an ArrayData holding an Arrow variable-size list.

    Layout per the Arrow spec: one offsets buffer of signed 32-bit integers
    (length + 1 entries) and one child array with the flattened values.
    """

    var data: ArrayData
    var capacity: Int

    fn __init__(out self, var data: ArrayData) raises:
        """Wrap an existing ArrayData, validating the list layout.

        Args:
            data: The ArrayData to wrap; must be a list dtype with exactly
                one buffer (offsets) and one child (values).
        """
        if not data.dtype.is_list():
            raise Error(
                "Unexpected dtype {} instead of 'list'".format(data.dtype)
            )
        elif len(data.buffers) != 1:
            raise Error("ListArray requires exactly one buffer")
        elif len(data.children) != 1:
            raise Error("ListArray requires exactly one child array")

        self.capacity = data.length
        self.data = data^

    fn bitmap(self) -> ArcPointer[Bitmap]:
        """Validity bitmap of the list entries."""
        return self.data.bitmap

    fn offsets(self) -> ArcPointer[Buffer]:
        """Offsets buffer: int32 values, one more entry than the length."""
        return self.data.buffers[0]

    fn values(self) -> ArcPointer[ArrayData]:
        """Flattened child array holding the values of every list entry."""
        return self.data.children[0]

    fn __init__[T: Array](out self, var values: T, capacity: Int = 1):
        """Initialize a list with the given values.

        Default capacity is at least 1 to accommodate the values.

        Args:
            values: Array to use as the first element in the ListArray.
            capacity: The capacity of the ListArray.
        """
        var values_data = values^.take_data()
        var list_dtype = list_(values_data.dtype.copy())

        var bitmap = Bitmap.alloc(capacity)
        bitmap.unsafe_set(0, True)
        # Arrow list offsets are signed 32-bit. Use DType.int32 so the writes
        # here match unsafe_get, which reads the offsets back as int32.
        var offsets = Buffer.alloc[DType.int32](capacity + 1)
        offsets.unsafe_set[DType.int32](0, 0)
        offsets.unsafe_set[DType.int32](1, values_data.length)

        self.capacity = capacity
        self.data = ArrayData(
            dtype=list_dtype^,
            length=1,
            bitmap=ArcPointer(bitmap^),
            buffers=List(ArcPointer(offsets^)),
            children=List(ArcPointer(values_data^)),
            offset=0,
        )

    fn __moveinit__(out self, deinit existing: Self):
        self.data = existing.data^
        self.capacity = existing.capacity

    fn __len__(self) -> Int:
        return self.data.length

    fn as_data[
        self_origin: ImmutOrigin
    ](ref [self_origin]self) -> LegacyUnsafePointer[ArrayData, mut=False]:
        return LegacyUnsafePointer(to=self.data)

    fn take_data(deinit self) -> ArrayData:
        return self.data^

    fn is_valid(self, index: Int) -> Bool:
        # NOTE(review): does not add self.data.offset, unlike
        # ArrayData.is_valid — confirm behavior for sliced arrays.
        return self.bitmap()[].unsafe_get(index)

    fn unsafe_append(mut self, is_valid: Bool):
        """Close the current list entry; no bounds or capacity checking."""
        self.bitmap()[].unsafe_set(self.data.length, is_valid)
        # Record the end offset of the new entry as int32, matching both the
        # Arrow spec and the int32 reads in unsafe_get.
        self.offsets()[].unsafe_set[DType.int32](
            self.data.length + 1, self.values()[].length
        )
        self.data.length += 1

    fn unsafe_get(self, index: Int, out array_data: ArrayData) raises:
        """Access the value at a given index in the list array.

        Use an out argument to allow the caller to re-use memory while iterating over a pyarrow structure.
        """
        var start = Int(
            self.offsets()[].unsafe_get[DType.int32](self.data.offset + index)
        )
        var end = Int(
            self.offsets()[].unsafe_get[DType.int32](
                self.data.offset + index + 1
            )
        )
        ref first_child = self.data.children[0][]
        return ArrayData(
            dtype=first_child.dtype.copy(),
            bitmap=first_child.bitmap,
            buffers=first_child.buffers.copy(),
            offset=start,
            length=end - start,
            children=first_child.children.copy(),
        )

    fn write_to[W: Writer](self, mut writer: W):
        """
        Formats this ListArray to the provided Writer.

        Parameters:
            W: A type conforming to the Writable trait.

        Args:
            writer: The object to write to.
        """

        writer.write("ListArray(")
        writer.write("length=")
        writer.write(self.data.length)
        writer.write(")")

    fn __str__(self) -> String:
        return String.write(self)

    fn __repr__(self) -> String:
        return String.write(self)
fn drop_nulls[
    T: DType
](
    mut buffer: ArcPointer[Buffer],
    mut bitmap: ArcPointer[Bitmap],
    buffer_start: Int,
    buffer_end: Int,
) -> None:
    """Drop nulls from a region in the buffer by compacting valid runs left.

    Walks the region as alternating runs of null and valid entries, moves
    each valid run left over the preceding nulls, and updates the validity
    bitmap so the valid prefix grows and the vacated tail becomes null.

    Parameters:
        T: The element DType stored in the buffer.

    Args:
        buffer: The buffer to drop nulls from.
        bitmap: The validity bitmap.
        buffer_start: The start index of the buffer region.
        buffer_end: The end index of the buffer region.
    """
    var start = buffer_start
    # Find the end of a run of valid bits.
    start = start + bitmap[].count_leading_bits(start, value=True)
    while start < buffer_end:
        # Find the end of the run of nulls, could be just one null.
        var leading = bitmap[].count_leading_bits(start, value=False)
        var end_nulls = start + leading
        end_nulls = min(end_nulls, buffer_end)

        # Find the end of the run of values after the end of nulls.
        var end_values = end_nulls + bitmap[].count_leading_bits(
            end_nulls, value=True
        )
        end_values = min(end_values, buffer_end)
        var values_len = end_values - end_nulls
        if values_len == 0:
            # No valid entries to move, just skip.
            start = end_nulls
            continue

        # Compact the data.
        # NOTE(review): count is values_len * size_of[T](), i.e. a byte
        # count — assumes get_ptr_at returns a byte-addressed pointer and
        # memcpy counts bytes here; confirm against Buffer.get_ptr_at.
        memcpy(
            dest=buffer[].get_ptr_at(start),
            src=buffer[].get_ptr_at(end_nulls),
            count=values_len * size_of[T](),
        )
        # Adjust the bitmap: the moved values become valid, the tail of the
        # region they vacated becomes null.
        var new_values_start = start
        var new_values_end = start + values_len
        var new_nulls_end = end_values
        bitmap[].unsafe_range_set(
            new_values_start, new_values_end - new_values_start, True
        )
        bitmap[].unsafe_range_set(
            new_values_end, new_nulls_end - new_values_end, False
        )

        # Get ready for next iteration.
        start = new_values_end
self.capacity = capacity 125 | 126 | @always_inline 127 | fn __len__(self) -> Int: 128 | return self.data.length 129 | 130 | @always_inline 131 | fn is_valid(self, index: Int) -> Bool: 132 | return self.bitmap()[].unsafe_get(index + self.offset) 133 | 134 | @always_inline 135 | fn unsafe_get(self, index: Int) -> Self.scalar: 136 | return self.buffer()[].unsafe_get[Self.T.native](index + self.offset) 137 | 138 | @always_inline 139 | fn unsafe_set(mut self, index: Int, value: Self.scalar): 140 | self.bitmap()[].unsafe_set(index + self.offset, True) 141 | self.buffer()[].unsafe_set[Self.T.native](index + self.offset, value) 142 | 143 | @always_inline 144 | fn unsafe_append(mut self, value: Self.scalar): 145 | self.unsafe_set(self.data.length, value) 146 | self.data.length += 1 147 | 148 | @staticmethod 149 | fn nulls(size: Int) raises -> PrimitiveArray[Self.T]: 150 | """Creates a new PrimitiveArray filled with null values.""" 151 | var bitmap = Bitmap.alloc(size) 152 | bitmap.unsafe_range_set(0, size, False) 153 | var buffer = Buffer.alloc[Self.T.native](size) 154 | return PrimitiveArray[Self.T]( 155 | data=ArrayData( 156 | dtype=materialize[Self.T](), 157 | length=size, 158 | bitmap=ArcPointer(bitmap^), 159 | buffers=List(ArcPointer(buffer^)), 160 | children=List[ArcPointer[ArrayData]](), 161 | offset=0, 162 | ), 163 | ) 164 | 165 | fn append(mut self, value: Self.scalar): 166 | if self.data.length >= self.capacity: 167 | self.grow(max(self.capacity * 2, self.data.length + 1)) 168 | self.unsafe_append(value) 169 | 170 | # fn append(mut self, value: Optional[Self.scalar]): 171 | 172 | fn extend(mut self, values: List[self.scalar]): 173 | if self.__len__() + len(values) >= self.capacity: 174 | self.grow(self.capacity + len(values)) 175 | for value in values: 176 | self.unsafe_append(value) 177 | 178 | fn drop_nulls[dtype: DType](mut self) -> None: 179 | """Drops null values from the Array. 
180 | 181 | Currently we drop nulls from individual buffers, we do not delete buffers. 182 | """ 183 | drop_nulls[dtype]( 184 | self.data.buffers[0], self.data.bitmap, 0, self.data.length 185 | ) 186 | self.data.length = self.bitmap()[].buffer.bit_count() 187 | 188 | fn null_count(self) -> Int: 189 | """Returns the number of null values in the array.""" 190 | var valid_count = self.bitmap()[].buffer.bit_count() 191 | return self.data.length - valid_count 192 | 193 | fn write_to[W: Writer](self, mut writer: W): 194 | """ 195 | Formats this PrimitiveArray to the provided Writer. 196 | 197 | Parameters: 198 | W: A type conforming to the Writable trait. 199 | 200 | Args: 201 | writer: The object to write to. 202 | """ 203 | 204 | writer.write("PrimitiveArray( dtype=") 205 | writer.write(materialize[Self.dtype]()) 206 | writer.write(", offset=") 207 | writer.write(self.offset) 208 | writer.write(", capacity=") 209 | writer.write(self.capacity) 210 | writer.write(", buffer=[") 211 | for i in range(self.capacity): 212 | if self.is_valid(i): 213 | writer.write( 214 | self.buffer()[].unsafe_get[Self.T.native](i + self.offset) 215 | ) 216 | else: 217 | writer.write("NULL") 218 | writer.write(", ") 219 | if i > 10: 220 | writer.write("...") 221 | break 222 | writer.write("])") 223 | 224 | fn __str__(self) -> String: 225 | return String.write(self) 226 | 227 | fn __repr__(self) -> String: 228 | return String.write(self) 229 | 230 | 231 | comptime BoolArray = PrimitiveArray[bool_] 232 | comptime Int8Array = PrimitiveArray[int8] 233 | comptime Int16Array = PrimitiveArray[int16] 234 | comptime Int32Array = PrimitiveArray[int32] 235 | comptime Int64Array = PrimitiveArray[int64] 236 | comptime UInt8Array = PrimitiveArray[uint8] 237 | comptime UInt16Array = PrimitiveArray[uint16] 238 | comptime UInt32Array = PrimitiveArray[uint32] 239 | comptime UInt64Array = PrimitiveArray[uint64] 240 | comptime Float32Array = PrimitiveArray[float32] 241 | comptime Float64Array = 
PrimitiveArray[float64]
--------------------------------------------------------------------------------
/marrow/arrays/tests/test_primitive.mojo:
--------------------------------------------------------------------------------
from testing import assert_equal, assert_true, assert_false, TestSuite


from marrow.arrays import *
from marrow.test_fixtures.bool_array import as_bool_array_scalar
from marrow.test_fixtures.arrays import build_array_data, assert_bitmap_set


def test_boolean_array():
    """Tests grow/append/is_valid/take_data on a BoolArray end to end."""
    var a = BoolArray()
    assert_equal(len(a), 0)
    assert_equal(a.capacity, 0)

    a.grow(3)
    assert_equal(len(a), 0)
    assert_equal(a.capacity, 3)

    a.append(as_bool_array_scalar(True))
    a.append(as_bool_array_scalar(False))
    a.append(as_bool_array_scalar(True))
    assert_equal(len(a), 3)
    assert_equal(a.capacity, 3)

    # Appending past capacity must double it (3 -> 6).
    a.append(as_bool_array_scalar(True))
    assert_equal(len(a), 4)
    assert_equal(a.capacity, 6)
    assert_true(a.is_valid(0))
    assert_true(a.is_valid(1))
    assert_true(a.is_valid(2))
    assert_true(a.is_valid(3))

    var d = a^.take_data()
    assert_equal(d.length, 4)

    # Exercises the ArrayData -> typed-array conversion path.
    var b = d^.as_primitive[bool_]()


def test_append():
    """Tests that append grows capacity at least as fast as length."""
    var a = Int8Array()
    assert_equal(len(a), 0)
    assert_equal(a.capacity, 0)
    a.append(1)
    a.append(2)
    a.append(3)
    assert_equal(len(a), 3)
    assert_true(a.capacity >= len(a))


def test_array_from_ints():
    """Tests the array[...] convenience constructor from int literals."""
    var g = array[int8](1, 2)
    assert_equal(len(g), 2)
    assert_equal(materialize[g.dtype](), materialize[int8]())
    assert_equal(g.unsafe_get(0), 1)
    assert_equal(g.unsafe_get(1), 2)


def test_drop_null() -> None:
    """Test the drop null function."""
    # build_array_data(10, 5): 10 elements with 5 nulls at even indices.
    var array_data = build_array_data(10, 5)

    var primitive_array = PrimitiveArray[uint8](array_data^)
    #
    # Check the setup.
    assert_equal(primitive_array.null_count(), 5)
    assert_bitmap_set(
        primitive_array.bitmap()[], List[Int](1, 3, 5, 7, 9), "check setup"
    )

    # After dropping nulls the valid values are compacted to the front.
    primitive_array.drop_nulls[DType.uint8]()
    assert_equal(primitive_array.unsafe_get(0), 1)
    assert_equal(primitive_array.unsafe_get(1), 3)
    assert_equal(primitive_array.null_count(), 0)
    assert_bitmap_set(
        primitive_array.bitmap()[], List[Int](0, 1, 2, 3, 4), "after drop"
    )


def test_primitive_array_with_offset():
    """Test PrimitiveArray with offset functionality."""
    # Create a regular array first
    var arr = Int32Array(10)
    arr.unsafe_set(0, 100)
    arr.unsafe_set(1, 200)
    arr.unsafe_set(2, 300)
    arr.unsafe_set(3, 400)
    arr.unsafe_set(4, 500)

    # Default offset should be 0
    assert_equal(arr.offset, 0)
    assert_equal(arr.unsafe_get(0), 100)
    assert_equal(arr.unsafe_get(1), 200)

    # Create a copy of array with offset, should point to the same buffers.
    var arr_data = arr.as_data()
    var arr_with_offset = PrimitiveArray[int32](arr_data[].copy(), offset=2)
    assert_equal(arr_with_offset.offset, 2)

    # Test that offset affects get operations
    assert_equal(arr_with_offset.unsafe_get(0), 300)  # Should get arr[2]
    assert_equal(arr_with_offset.unsafe_get(1), 400)  # Should get arr[3]
    assert_equal(arr_with_offset.unsafe_get(2), 500)  # Should get arr[4]

    # Test that offset affects set operations
    arr_with_offset.unsafe_set(3, 999)  # Should set arr[5]
    assert_equal(arr.unsafe_get(5), 999)


def test_primitive_array_moveinit_with_offset():
    """Test __moveinit__ preserves offset."""
    var arr = Int16Array(5, offset=3)
    arr.unsafe_set(0, 123)

    var moved_arr = arr^
    assert_equal(moved_arr.offset, 3)
    assert_equal(moved_arr.unsafe_get(0), 123)


def test_primitive_array_constructor_with_offset():
    """Test PrimitiveArray constructor with offset parameter."""
    var arr1 = Int8Array(10)  # Default offset=0
    assert_equal(arr1.offset, 0)

    var arr2 = Int8Array(10, offset=5)  # Explicit offset
    assert_equal(arr2.offset, 5)

    # Test that data.offset is also set correctly
    assert_equal(arr2.data.offset, 5)


def test_primitive_array_offset_with_validity():
    """Test that offset works correctly with validity bitmap."""
    var arr = UInt8Array(10, offset=1)

    # Set some values with validity
    arr.unsafe_set(0, 42)  # This should set buffer[1] and bitmap[1]
    arr.unsafe_set(1, 43)  # This should set buffer[2] and bitmap[2]

    # Verify values are accessible through offset
    assert_equal(arr.unsafe_get(0), 42)
    assert_equal(arr.unsafe_get(1), 43)

    # Verify bitmap is also offset correctly
    assert_true(arr.is_valid(0))  # Should check bitmap[1]
    assert_true(arr.is_valid(1))  # Should check bitmap[2]


def test_primitive_array_nulls_with_offset():
    """Test PrimitiveArray.nulls static method creates array with default offset.
    """
    var null_arr = Int64Array.nulls(5)
    assert_equal(null_arr.offset, 0)
    assert_equal(null_arr.data.offset, 0)

    # All elements should be invalid (null)
    for i in range(5):
        assert_false(null_arr.is_valid(i))


def test_primitive_array_write_to():
    """Test write_to method formats PrimitiveArray correctly."""
    var arr = Int32Array(5)
    arr.append(10)
    arr.append(20)
    arr.append(30)

    var output = String()
    arr.write_to(output)

    # Check that output contains expected format
    var result = String(output)
    assert_true("PrimitiveArray(" in result)
    assert_true("dtype=" in result)
    assert_true("offset=" in result)
    assert_true("capacity=" in result)
    assert_true("buffer=" in result)
    assert_true("10" in result)  # At least first value should work


def test_primitive_array_write_to_with_nulls():
    """Test write_to method handles null values correctly."""
    var array_data = build_array_data(5, 2)
    var arr = PrimitiveArray[uint8](array_data^)

    var output = String()
    arr.write_to(output)

    # Check that output contains NULL for invalid entries
    var result = String(output)
    assert_true("PrimitiveArray(" in result)
    assert_true("NULL" in result)


def test_primitive_array_write_to_with_offset():
    """Test write_to method works correctly with offset."""
    var arr = Int16Array(10, offset=2)
    arr.append(100)
    arr.append(200)

    var output = String()
    arr.write_to(output)

    var result = String(output)
    assert_true("PrimitiveArray(" in result)
    assert_true("offset=2" in result)
    # Note: Due to offset bug in write_to, values may not appear correctly


def test_primitive_array_write_to_large_array():
    """Test write_to method truncates large arrays with ellipsis."""
    var arr = Int8Array(20)  # Use capacity > 10 to trigger truncation
    # Fill with values 0, 1, 2, ..., 14
    for i in range(15):
        arr.append(i)

    var output = String()
    arr.write_to(output)

    var result = String(output)
    assert_true("PrimitiveArray(" in result)
    assert_true("..." in result)  # Should truncate after 10 elements


def test_primitive_array_str():
    """Test __str__ method returns formatted string representation."""
    var arr = array[int32](42, 84, 126)

    var result = arr.__str__()
    assert_true("PrimitiveArray(" in result)
    assert_true("42" in result)  # At least first value should work


def test_primitive_array_str_empty():
    """Test __str__ method on empty array."""
    var arr = Float32Array(0)

    var result = arr.__str__()
    assert_true("PrimitiveArray(" in result)
    assert_true("capacity=0" in result)


def test_primitive_array_repr():
    """Test __repr__ method returns same as __str__."""
    var arr = UInt8Array(5)
    arr.append(255)
    arr.append(128)

    var str_result = arr.__str__()
    var repr_result = arr.__repr__()

    # Both should be identical
    assert_equal(str_result, repr_result)
    assert_equal(
        repr_result,
        (
            "PrimitiveArray( dtype=DataType(code=uint8), offset=0, capacity=5,"
            " buffer=[255, 128, NULL, NULL, NULL, ])"
        ),
    )

    var arr64 = Int64Array()
    arr64.append(1)
    arr64.append(3)
    arr64.append(5)
    assert_equal(
        arr64.__repr__(),
        (
            "PrimitiveArray( dtype=DataType(code=int64), offset=0, capacity=4,"
            " buffer=[1, 3, 5, NULL, ])"
        ),
    )


def main():
    TestSuite.discover_tests[__functions_in_module()]().run()
--------------------------------------------------------------------------------
/marrow/tests/test_buffers.mojo:
--------------------------------------------------------------------------------
from testing import assert_equal, assert_true, assert_false, TestSuite
from marrow.test_fixtures.arrays import assert_bitmap_set

from marrow.buffers import *


def is_aligned[T: AnyType](ptr: LegacyUnsafePointer[T], alignment: Int) -> Bool:
    """Returns True if `ptr` is aligned to `alignment` bytes."""
    return (Int(ptr) % alignment) == 0


def test_buffer_init():
    """Tests that allocations round up to 64 bytes and are 64-byte aligned."""
    var b = Buffer.alloc(10)
    assert_equal(b.size, 64)
    assert_true(is_aligned(b.ptr, 64))

    var b1 = Buffer.alloc[DType.bool](10)
    assert_equal(b1.size, 64)
    assert_true(is_aligned(b1.ptr, 64))

    # Bool buffers are bit-packed: 64*8 + 1 bits needs a second 64-byte block.
    var b2 = Buffer.alloc[DType.bool](64 * 8 + 1)
    assert_equal(b2.size, 128)
    assert_true(is_aligned(b2.ptr, 64))


def test_buffer_grow():
    """Tests that grow preserves contents and only reallocates when needed."""
    var b = Buffer.alloc(10)
    b.unsafe_set(0, 111)
    assert_equal(b.size, 64)
    b.grow(20)
    assert_equal(b.size, 64)
    assert_equal(b.unsafe_get(0), 111)
    b.grow(80)
    assert_equal(b.size, 128)
    assert_equal(b.unsafe_get(0), 111)


def test_buffer_set_get():
    """Tests element access plus reinterpretation at a different dtype."""
    var buf = Buffer.alloc(10)
    assert_equal(buf.size, 64)

    buf.unsafe_set(0, 42)
    buf.unsafe_set(1, 43)
    buf.unsafe_set(2, 44)
    assert_equal(buf.unsafe_get(0), 42)
    assert_equal(buf.unsafe_get(1), 43)
    assert_equal(buf.unsafe_get(2), 44)

    assert_equal(buf.size, 64)
    assert_equal(
        buf.length[DType.uint16](), 32
    )  # 64 bytes / 2 bytes per element
    # reinterpreting the underlying bits as uint16
    assert_equal(buf.unsafe_get[DType.uint16](0), 42 + (43 << 8))
    assert_equal(buf.unsafe_get[DType.uint16](1), 44)


def test_buffer_from_values():
    """Tests the variadic from_values constructor."""
    var buf = Buffer.from_values[DType.int64](-3, 9, 81)

    assert_equal(buf.unsafe_get[DType.int64](0), -3)
    assert_equal(buf.unsafe_get[DType.int64](1), 9)
    assert_equal(buf.unsafe_get[DType.int64](2), 81)


def test_buffer_swap():
    """Tests that swap exchanges buffer contents."""
    var one = Buffer.alloc(10)
    one.unsafe_set(0, 111)
    var two = Buffer.alloc(10)
    two.unsafe_set(0, 222)

    one.swap(two)

    assert_equal(one.unsafe_get(0), 222)
    assert_equal(two.unsafe_get(0), 111)


def test_bitmap():
    """Tests basic bit get/set and the running bit_count."""
    var b = Bitmap.alloc(10)
    assert_equal(b.size(), 64)
    assert_equal(b.length(), 64 * 8)
    assert_equal(b.bit_count(), 0)

    assert_false(b.unsafe_get(0))
    b.unsafe_set(0, True)
    assert_true(b.unsafe_get(0))
    assert_equal(b.bit_count(), 1)
    assert_false(b.unsafe_get(1))
    b.unsafe_set(1, True)
    assert_true(b.unsafe_get(1))
    assert_equal(b.bit_count(), 2)


def test_count_leading_zeros():
    """Tests count_leading_zeros with and without a start position."""
    var b = Bitmap.alloc(10)
    var expected_bits = b.length()
    assert_equal(b.count_leading_zeros(), expected_bits)
    assert_equal(b.count_leading_zeros(10), expected_bits - 10)

    b.unsafe_set(0, True)
    assert_equal(b.count_leading_zeros(), 0)
    assert_equal(b.count_leading_zeros(1), expected_bits - 1)
    b.unsafe_set(0, False)

    # Positions chosen to straddle byte and word boundaries.
    var to_test = [0, 1, 7, 8, 10, 16, 31]
    for i in range(len(to_test)):
        bit_position = to_test[i]
        b.unsafe_set(bit_position, True)
        assert_equal(b.count_leading_zeros(), bit_position)
        if bit_position > 4:
            # Count with start position.
            assert_equal(b.count_leading_bits(4), bit_position - 4)
        b.unsafe_set(bit_position, False)


def test_count_leading_ones():
    """Tests count_leading_ones with and without a start position."""
    var b = Bitmap.alloc(10)
    assert_equal(b.count_leading_ones(), 0)
    b.unsafe_set(0, True)
    assert_equal(b.count_leading_ones(), 1)
    assert_equal(b.count_leading_ones(1), 0)

    b.unsafe_set(1, True)
    assert_equal(b.count_leading_ones(), 2)
    assert_equal(b.count_leading_ones(1), 1)


def _reset(mut bitmap: Bitmap):
    """Helper: clears every bit and asserts the bitmap is empty."""
    bitmap.unsafe_range_set(0, bitmap.length(), False)
    assert_bitmap_set(bitmap, [], "after _reset")


def test_unsafe_range_set():
    """Tests range setting at byte-boundary edge cases."""
    var bitmap = Bitmap.alloc(16)

    bitmap.unsafe_range_set(0, 10, True)
    assert_bitmap_set(bitmap, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], "range 0-10")
    bitmap.unsafe_range_set(0, 10, False)
    assert_bitmap_set(bitmap, [], "reset")

    # Zero-length range is a no-op.
    bitmap.unsafe_range_set(0, 0, True)
    assert_bitmap_set(bitmap, [], "range 0")

    var to_test = [0, 1, 7, 8, 15]
    for pos in range(len(to_test)):
        _reset(bitmap)
        var start_bit = to_test[pos]
        bitmap.unsafe_range_set(start_bit, 1, True)
        assert_bitmap_set(bitmap, List(start_bit), "range 1")
        if to_test[pos] < bitmap.length() - 1:
            _reset(bitmap)
            bitmap.unsafe_range_set(start_bit, 2, True)
            assert_bitmap_set(bitmap, List(start_bit, start_bit + 1), "range 2")


def test_partial_byte_set():
    """Tests setting sub-byte bit ranges within a single byte."""
    var bitmap = Bitmap.alloc(16)

    bitmap.unsafe_range_set(0, 0, True)
    assert_bitmap_set(bitmap, [], "range 0")

    # Set one bit to True.
    bitmap.partial_byte_set(0, 0, 1, True)
    assert_bitmap_set(bitmap, [0], "set bit 0")

    # Set one bit to False.
    bitmap.partial_byte_set(0, 0, 1, False)
    assert_bitmap_set(bitmap, [], "reset bit 0")

    # Set multiple bits to True.
    bitmap.partial_byte_set(1, 2, 5, True)
    assert_bitmap_set(bitmap, [10, 11, 12], "set multiple bits")

    # Set multiple bits to False.
    bitmap.partial_byte_set(1, 3, 5, False)
    assert_bitmap_set(bitmap, [10], "reset multiple bits")


def test_expand_bitmap() -> None:
    """Tests extending a bitmap with the contents of another bitmap."""
    var bitmap = Bitmap.alloc(6)
    bitmap.unsafe_set(0, True)
    bitmap.unsafe_set(5, True)
    assert_bitmap_set(bitmap, [0, 5], "initial setup")

    # Create a new bitmap with 2 bits
    var new_bitmap = Bitmap.alloc(2)
    new_bitmap.unsafe_set(0, True)

    # Expand the bitmap
    bitmap.extend(new_bitmap, 6, 2)
    assert_bitmap_set(bitmap, [0, 5, 6], "after expand")


def test_buffer_with_offset():
    # Test Buffer with offset functionality
    var buf = Buffer.alloc(10)
    assert_equal(buf.offset, 0)  # Default offset should be 0

    # Set values in buffer without offset
    buf.unsafe_set(0, 42)
    buf.unsafe_set(1, 43)
    buf.unsafe_set(2, 44)

    # Create buffer with offset
    var buf_with_offset = Buffer(buf.ptr, buf.size, buf.owns, offset=2)
    assert_equal(buf_with_offset.offset, 2)

    # Test that offset affects get operations
    assert_equal(buf_with_offset.unsafe_get(0), 44)  # Should get buf[2]

    # Test that offset affects set operations
    buf_with_offset.unsafe_set(1, 99)  # Should set buf[3]
    assert_equal(buf.unsafe_get(3), 99)

    # Test offset with boolean data type - simplified test
    var buf_bool = Buffer.alloc[DType.bool](16)
    # Test basic functionality first
    buf_bool.unsafe_set[DType.bool](0, True)
    assert_true(buf_bool.unsafe_get[DType.bool](0))

    # Now test with offset - use a simple offset of 1 bit
    var buf_bool_offset = Buffer(buf_bool.ptr, buf_bool.size, False, offset=1)
    buf_bool_offset.unsafe_set[DType.bool](0, True)  # Should set buf[1]
    assert_true(buf_bool.unsafe_get[DType.bool](1))  # Check if buf[1] was set


def test_buffer_moveinit_with_offset():
    # Test __moveinit__ preserves offset
    var buf = Buffer.alloc(5)
    buf.offset = 3
    buf.unsafe_set(0, 123)

    var moved_buf = buf^
    assert_equal(moved_buf.offset, 3)
    assert_equal(moved_buf.unsafe_get(0), 123)


def test_buffer_swap_with_offset():
    # Test swap preserves offsets correctly
    var buf1 = Buffer.alloc(5)
    buf1.offset = 2
    buf1.unsafe_set(0, 111)

    var buf2 = Buffer.alloc(5)
    buf2.offset = 4
    buf2.unsafe_set(0, 222)

    buf1.swap(buf2)

    # After swap, buf1 should have buf2's original offset and data
    assert_equal(buf1.offset, 4)
    assert_equal(buf1.unsafe_get(0), 222)

    # And buf2 should have buf1's original offset and data
    assert_equal(buf2.offset, 2)
    assert_equal(buf2.unsafe_get(0), 111)


def test_bitmap_with_offset():
    # Test Bitmap with offset functionality
    var buffer = Buffer.alloc[DType.bool](16)
    # Set some bits in the underlying buffer
    buffer.unsafe_set[DType.bool](3, True)
    buffer.unsafe_set[DType.bool](4, False)
    buffer.unsafe_set[DType.bool](5, True)
    buffer.unsafe_set[DType.bool](6, True)

    var bitmap = Bitmap(buffer^, offset=3)
    assert_equal(bitmap.offset, 3)

    # Test that offset affects get operations
    assert_true(bitmap.unsafe_get(0))  # Should get buffer[3]
    assert_false(bitmap.unsafe_get(1))  # Should get buffer[4]
    assert_true(bitmap.unsafe_get(2))  # Should get buffer[5]
    assert_true(bitmap.unsafe_get(3))  # Should get buffer[6]

    # Test that offset affects set operations
    bitmap.unsafe_set(4, True)  # Should set buffer[7]
    assert_true(bitmap.buffer.unsafe_get[DType.bool](7))


def test_bitmap_moveinit_with_offset():
    # Test 
__moveinit__ preserves offset
    var buffer = Buffer.alloc[DType.bool](8)
    var bitmap = Bitmap(buffer^, offset=2)
    bitmap.unsafe_set(0, True)

    var moved_bitmap = bitmap^
    assert_equal(moved_bitmap.offset, 2)
    assert_true(moved_bitmap.unsafe_get(0))


def main():
    TestSuite.discover_tests[__functions_in_module()]().run()
--------------------------------------------------------------------------------
/marrow/tests/test_c_data.mojo:
--------------------------------------------------------------------------------
from testing import assert_equal, assert_true, assert_false, TestSuite
from python import Python, PythonObject
from marrow.c_data import *


def test_schema_from_pyarrow():
    """Tests importing a pyarrow schema through the Arrow C data interface."""
    var pa = Python.import_module("pyarrow")
    var pyint = pa.field("int_field", pa.int32())
    var pystring = pa.field("string_field", pa.string())
    var pyschema = pa.schema(Python.list())
    pyschema = pyschema.append(pyint)
    pyschema = pyschema.append(pystring)

    var c_schema = CArrowSchema.from_pyarrow(pyschema)
    var schema = c_schema.to_dtype()

    assert_equal(schema.fields[0].name, "int_field")
    assert_equal(schema.fields[0].dtype, materialize[int32]())
    assert_equal(schema.fields[1].name, "string_field")
    assert_equal(schema.fields[1].dtype, materialize[string]())
    var writer = String()
    writer.write(c_schema)
    assert_equal(writer, 'CArrowSchema(name="", format="+s", n_children=2)')


def test_primitive_array_from_pyarrow():
    """Tests zero-copy import of a pyarrow int array, including write-back."""
    var pa = Python.import_module("pyarrow")
    var pyarr = pa.array(
        Python.list(1, 2, 3, 4, 5),
        mask=Python.list(False, False, False, False, True),
    )

    var c_array = CArrowArray.from_pyarrow(pyarr)
    var c_schema = CArrowSchema.from_pyarrow(pyarr.type)

    var dtype = c_schema.to_dtype()
    assert_equal(dtype, materialize[int64]())
    assert_equal(c_array.length, 5)
    assert_equal(c_array.null_count, 1)
    assert_equal(c_array.offset, 0)
    assert_equal(c_array.n_buffers, 2)
    assert_equal(c_array.n_children, 0)

    var data = c_array.to_array(dtype)
    var array = data^.as_int64()
    assert_equal(array.bitmap()[].size(), 64)
    assert_equal(array.is_valid(0), True)
    assert_equal(array.is_valid(1), True)
    assert_equal(array.is_valid(2), True)
    assert_equal(array.is_valid(3), True)
    assert_equal(array.is_valid(4), False)
    assert_equal(array.unsafe_get(0), 1)
    assert_equal(array.unsafe_get(1), 2)
    assert_equal(array.unsafe_get(2), 3)
    assert_equal(array.unsafe_get(3), 4)
    assert_equal(array.unsafe_get(4), 0)

    # Mutating the imported buffer must be visible on the pyarrow side
    # (shared memory, zero copy).
    array.unsafe_set(0, 10)
    assert_equal(array.unsafe_get(0), 10)
    assert_equal(String(pyarr), "[\n  10,\n  2,\n  3,\n  4,\n  null\n]")


def test_binary_array_from_pyarrow():
    """Tests zero-copy import of a pyarrow string array, including write-back."""
    var pa = Python.import_module("pyarrow")

    var pyarr = pa.array(
        Python.list("foo", "bar", "baz"),
        mask=Python.list(False, False, True),
    )

    var c_array = CArrowArray.from_pyarrow(pyarr)
    var c_schema = CArrowSchema.from_pyarrow(pyarr.type)

    var dtype = c_schema.to_dtype()
    assert_equal(dtype, materialize[string]())

    assert_equal(c_array.length, 3)
    assert_equal(c_array.null_count, 1)
    assert_equal(c_array.offset, 0)
    assert_equal(c_array.n_buffers, 3)
    assert_equal(c_array.n_children, 0)

    var data = c_array.to_array(dtype)
    var array = data^.as_string()

    assert_equal(array.bitmap()[].size(), 64)
    assert_equal(array.is_valid(0), True)
    assert_equal(array.is_valid(1), True)
    assert_equal(array.is_valid(2), False)

    assert_equal(String(array.unsafe_get(0)), "foo")
    assert_equal(String(array.unsafe_get(1)), "bar")
    assert_equal(String(array.unsafe_get(2)), "")

    # Same-length overwrite is visible on the pyarrow side (shared memory).
    array.unsafe_set(0, "qux")
    assert_equal(String(array.unsafe_get(0)), "qux")
    assert_equal(String(pyarr), '[\n  "qux",\n  "bar",\n  null\n]')


def test_list_array_from_pyarrow():
    """Tests zero-copy import of a pyarrow list array and child mutation."""
    var pa = Python.import_module("pyarrow")

    var pylist1 = Python.list(1, 2, 3)
    var pylist2 = Python.list(4, 5)
    var pylist3 = Python.list(6, 7)
    var pyarr = pa.array(
        Python.list(pylist1, pylist2, pylist3),
        mask=Python.list(False, True, False),
    )

    var c_array = CArrowArray.from_pyarrow(pyarr)
    var c_schema = CArrowSchema.from_pyarrow(pyarr.type)

    var dtype = c_schema.to_dtype()
    assert_equal(dtype, list_(materialize[int64]()))

    assert_equal(c_array.length, 3)
    assert_equal(c_array.null_count, 1)
    assert_equal(c_array.offset, 0)
    assert_equal(c_array.n_buffers, 2)
    assert_equal(c_array.n_children, 1)

    var data = c_array.to_array(dtype)
    var array = data^.as_list()

    assert_equal(array.bitmap()[].size(), 64)
    assert_equal(array.is_valid(0), True)
    assert_equal(array.is_valid(1), False)
    assert_equal(array.is_valid(2), True)

    var values = array.unsafe_get(0).as_int64()
    assert_equal(values.unsafe_get(0), 1)
    assert_equal(values.unsafe_get(1), 2)
    # Mutations of the child values buffer are visible on the pyarrow side.
    values.unsafe_set(0, 10)
    values.unsafe_set(2, 30)

    assert_equal(
        String(pyarr),
        (
            "[\n  [\n    10,\n    2,\n    30\n  ],\n  null,\n  [\n    6,\n  "
            "  7\n  ]\n]"
        ),
    )


def test_schema_from_dtype():
    """Tests the dtype -> CArrowSchema -> dtype round trip."""
    var c_schema = CArrowSchema.from_dtype(materialize[int32]())
    var dtype = c_schema.to_dtype()
    assert_equal(dtype, materialize[int32]())

    var c_schema_str = CArrowSchema.from_dtype(materialize[string]())
    var dtype_str = c_schema_str.to_dtype()
    assert_equal(dtype_str, materialize[string]())

    var c_schema_bool = CArrowSchema.from_dtype(materialize[bool_]())
    var dtype_bool = c_schema_bool.to_dtype()
    assert_equal(dtype_bool, materialize[bool_]())

    var c_schema_float64 = CArrowSchema.from_dtype(materialize[float64]())
    var dtype_float64 = c_schema_float64.to_dtype()
    assert_equal(dtype_float64, materialize[float64]())


def test_schema_to_field():
    """Tests converting a CArrowSchema to a Field, including nullability."""
    var pa = Python.import_module("pyarrow")
    var pyfield = pa.field("test_field", pa.int32(), nullable=True)
    var c_schema = CArrowSchema.from_pyarrow(pyfield)
    var field = c_schema.to_field()
    assert_equal(field.name, "test_field")
    assert_equal(field.dtype, materialize[int32]())
    assert_equal(field.nullable, True)

    var pyfield_str = pa.field("string_field", pa.string(), nullable=False)
    var c_schema_str = CArrowSchema.from_pyarrow(pyfield_str)
    var field_str = c_schema_str.to_field()
    assert_equal(field_str.name, "string_field")
    assert_equal(field_str.dtype, materialize[string]())
    assert_equal(field_str.nullable, False)


def test_arrow_array_stream():
    """Tests consuming a pyarrow Table through the ArrowArrayStream interface."""
    var pa = Python.import_module("pyarrow")
    var python = Python()
    ref cpython = python.cpython()

    var data = Python.dict(
        col1=Python.list(1.0, 2.0, 3.0, 4.0, 5.0),
        col2=Python.list("a", "b", "c", "d", "e"),
    )
    var pyschema = pa.schema(
        python.list(
            pa.field("col1", pa.int64()),
            pa.field("col2", pa.string()),
        )
    )
    var table = pa.table(data, schema=pyschema)

    var array_stream = ArrowArrayStream.from_pyarrow(table, cpython)

    var c_schema = array_stream.c_schema()
    var schema = c_schema.to_dtype()
    assert_equal(len(schema.fields), 2)
    assert_equal(schema.fields[0].name, "col1")
    assert_equal(schema.fields[0].dtype, materialize[int64]())
    assert_equal(schema.fields[1].name, "col2")
    assert_equal(schema.fields[1].dtype, materialize[string]())

    var c_array = array_stream.c_next()
    assert_equal(c_array.length, 5)
    assert_equal(c_array.null_count, 0)

    var array_data = c_array.to_array(schema)
    assert_equal(array_data.length, 5)
    assert_equal(len(array_data.children), 2)

    var col1_array = array_data.children[0][].copy().as_int64()
    assert_equal(col1_array.unsafe_get(0), 1)
    assert_equal(col1_array.unsafe_get(4), 5)

    var col2_array = array_data.children[1][].copy().as_string()
    assert_equal(String(col2_array.unsafe_get(0)), "a")
    assert_equal(String(col2_array.unsafe_get(4)), "e")


def test_struct_dtype_conversion():
    """Tests importing a pyarrow struct type as a struct dtype."""
    var pa = Python.import_module("pyarrow")

    var struct_fields = Python.list(
        Python.tuple("x", pa.int32()), Python.tuple("y", pa.float64())
    )
    var struct_type = pa.`struct`(struct_fields)
    var c_schema = CArrowSchema.from_pyarrow(struct_type)
    var dtype = c_schema.to_dtype()

    assert_true(dtype.is_struct())
    assert_equal(len(dtype.fields), 2)
    assert_equal(dtype.fields[0].name, "x")
    assert_equal(dtype.fields[0].dtype, materialize[int32]())
    assert_equal(dtype.fields[1].name, "y")
    assert_equal(dtype.fields[1].dtype, materialize[float64]())


def test_list_dtype_conversion():
    """Tests importing a pyarrow list type as a list dtype."""
    var pa = Python.import_module("pyarrow")

    var list_type = pa.list_(pa.int32())
    var c_schema = CArrowSchema.from_pyarrow(list_type)
    var dtype = c_schema.to_dtype()

    assert_true(dtype.is_list())
    assert_equal(dtype.fields[0].dtype, materialize[int32]())


def test_numeric_dtypes():
    """Tests the pyarrow -> marrow dtype mapping for all numeric types."""
    var pa = Python.import_module("pyarrow")

    var types_to_test = [
        (pa.int8(), materialize[int8]()),
        (pa.uint8(), materialize[uint8]()),
        (pa.int16(), materialize[int16]()),
        (pa.uint16(), materialize[uint16]()),
        (pa.int32(), materialize[int32]()),
        (pa.uint32(), materialize[uint32]()),
        (pa.int64(), materialize[int64]()),
        (pa.uint64(), materialize[uint64]()),
        (pa.float32(), materialize[float32]()),
        (pa.float64(), materialize[float64]()),
    ]

    for i in range(len(types_to_test)):
        var type_pair = types_to_test[i]
        var py_type = type_pair[0]
        ref expected_mojo_type = type_pair[1]

        var c_schema = CArrowSchema.from_pyarrow(py_type)
        var dtype = c_schema.to_dtype()
        assert_equal(dtype, expected_mojo_type)


# def test_schema_to_pyarrow():
#     var pa = Python.import_module("pyarrow")

#     var struct_type = struct_(
#         Field("int_field", int32),
#         Field("string_field", string),
#     )

#     try:
#         # mojo->python direction is not working yet
#         var c_schema = CArrowSchema.from_dtype(int32)
#     except Error:
#         pass


def main():
    TestSuite.discover_tests[__functions_in_module()]().run()
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. 
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2024 Szűcs Krisztián 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /marrow/dtypes.mojo: -------------------------------------------------------------------------------- 1 | from io.write import Writable, Writer 2 | from sys import size_of 3 | 4 | # The following enum codes are copied from the C++ implementation of Arrow 5 | 6 | # A NULL type having no physical storage 7 | comptime NA = 0 8 | 9 | # Boolean as 1 bit, LSB bit-packed ordering 10 | comptime BOOL = 1 11 | 12 | # Unsigned 8-bit little-endian integer 13 | comptime UINT8 = 2 14 | 15 | # Signed 8-bit little-endian integer 16 | comptime INT8 = 3 17 | 18 | # Unsigned 16-bit little-endian integer 19 | comptime UINT16 = 4 20 | 21 | # Signed 16-bit little-endian integer 22 | comptime INT16 = 5 23 | 24 | # Unsigned 32-bit little-endian integer 25 | comptime UINT32 = 6 26 | 27 | # Signed 32-bit little-endian integer 28 | comptime INT32 = 7 29 | 30 | # Unsigned 64-bit little-endian integer 31 | comptime UINT64 = 8 32 | 33 | # Signed 64-bit little-endian integer 34 | comptime INT64 = 9 35 | 36 | # 2-byte floating point value 37 | comptime FLOAT16 = 10 38 | 39 | # 4-byte 
floating point value 40 | comptime FLOAT32 = 11 41 | 42 | # 8-byte floating point value 43 | comptime FLOAT64 = 12 44 | 45 | # UTF8 variable-length string as List 46 | comptime STRING = 13 47 | 48 | # Variable-length bytes (no guarantee of UTF8-ness) 49 | comptime BINARY = 14 50 | 51 | # Fixed-size binary. Each value occupies the same number of bytes 52 | comptime FIXED_SIZE_BINARY = 15 53 | 54 | # int32_t days since the UNIX epoch 55 | comptime DATE32 = 16 56 | 57 | # int64_t milliseconds since the UNIX epoch 58 | comptime DATE64 = 17 59 | 60 | # Exact timestamp encoded with int64 since UNIX epoch 61 | # Default unit millisecond 62 | comptime TIMESTAMP = 18 63 | 64 | # Time as signed 32-bit integer, representing either seconds or 65 | # milliseconds since midnight 66 | comptime TIME32 = 19 67 | 68 | # Time as signed 64-bit integer, representing either microseconds or 69 | # nanoseconds since midnight 70 | comptime TIME64 = 20 71 | 72 | # YEAR_MONTH interval in SQL style 73 | comptime INTERVAL_MONTHS = 21 74 | 75 | # DAY_TIME interval in SQL style 76 | comptime INTERVAL_DAY_TIME = 22 77 | 78 | # Precision- and scale-based decimal type with 128 bits. 79 | comptime DECIMAL128 = 23 80 | 81 | # Defined for backward-compatibility. 82 | comptime DECIMAL = DECIMAL128 83 | 84 | # Precision- and scale-based decimal type with 256 bits. 85 | comptime DECIMAL256 = 24 86 | 87 | # A list of some logical data type 88 | comptime LIST = 25 89 | 90 | # Struct of logical types 91 | comptime STRUCT = 26 92 | 93 | # Sparse unions of logical types 94 | comptime SPARSE_UNION = 27 95 | 96 | # Dense unions of logical types 97 | comptime DENSE_UNION = 28 98 | 99 | # Dictionary-encoded type, also called "categorical" or "factor" 100 | # in other programming languages. 
Holds the dictionary value 101 | # type but not the dictionary itself, which is part of the 102 | # ArrayData struct 103 | comptime DICTIONARY = 29 104 | 105 | # Map, a repeated struct logical type 106 | comptime MAP = 30 107 | 108 | # Custom data type, implemented by user 109 | comptime EXTENSION = 31 110 | 111 | # Fixed size list of some logical type 112 | comptime FIXED_SIZE_LIST = 32 113 | 114 | # Measure of elapsed time in either seconds, milliseconds, microseconds 115 | # or nanoseconds. 116 | comptime DURATION = 33 117 | 118 | # Like STRING, but with 64-bit offsets 119 | comptime LARGE_STRING = 34 120 | 121 | # Like BINARY, but with 64-bit offsets 122 | comptime LARGE_BINARY = 35 123 | 124 | # Like LIST, but with 64-bit offsets 125 | comptime LARGE_LIST = 36 126 | 127 | # Calendar interval type with three fields. 128 | comptime INTERVAL_MONTH_DAY_NANO = 37 129 | 130 | # Run-end encoded data. 131 | comptime RUN_END_ENCODED = 38 132 | 133 | # String (UTF8) view type with 4-byte prefix and inline small string 134 | # optimization 135 | comptime STRING_VIEW = 39 136 | 137 | # Bytes view type with 4-byte prefix and inline small string optimization 138 | comptime BINARY_VIEW = 40 139 | 140 | # A list of some logical data type represented by offset and size. 
comptime LIST_VIEW = 41

# Like LIST_VIEW, but with 64-bit offsets and sizes
comptime LARGE_LIST_VIEW = 42


struct Field(Copyable, Equatable, Movable, Representable, Stringable, Writable):
    """A named, optionally nullable slot of a DataType (an Arrow field)."""

    var name: String
    var dtype: DataType
    var nullable: Bool

    fn __init__(
        out self, name: String, var dtype: DataType, nullable: Bool = False
    ):
        self.name = name
        self.dtype = dtype^
        self.nullable = nullable

    fn __eq__(self, other: Field) -> Bool:
        return (
            self.name == other.name
            and self.dtype == other.dtype
            and self.nullable == other.nullable
        )

    fn __ne__(self, other: Field) -> Bool:
        return not self == other

    fn write_to[W: Writer](self, mut writer: W):
        """
        Formats this Field to the provided Writer.

        Parameters:
            W: A type conforming to the Writable trait.

        Args:
            writer: The object to write to.
        """
        writer.write("Field(")
        writer.write('name="')
        writer.write(self.name)
        writer.write('", ')
        writer.write("dtype=")
        writer.write(self.dtype)
        writer.write(", ")
        writer.write("nullable=")
        writer.write(self.nullable)
        # Fixed: previously a dangling `", "` was written before the closing
        # parenthesis, producing `Field(..., nullable=True, )`.
        writer.write(")")

    fn __str__(self) -> String:
        return String.write(self)

    fn __repr__(self) -> String:
        return String.write(self)


struct DataType(
    Copyable, Equatable, Movable, Representable, Stringable, Writable
):
    """An Arrow logical type: a type code, an optional native DType, and child fields
    (children are used by nested types such as LIST and STRUCT)."""

    var code: UInt8
    var native: DType
    var fields: List[Field]

    fn __init__(out self, *, code: UInt8):
        self.code = code
        self.native = DType.invalid
        self.fields = List[Field]()

    fn __init__(out self, native: DType):
        # Map a Mojo native DType onto the corresponding Arrow type code;
        # unknown DTypes fall back to NA.
        if native is DType.bool:
            self.code = BOOL
        elif native is DType.int8:
            self.code = INT8
        elif native is DType.int16:
            self.code = INT16
        elif native is DType.int32:
            self.code = INT32
        elif native is DType.int64:
            self.code = INT64
        elif native is DType.uint8:
            self.code = UINT8
        elif native is DType.uint16:
            self.code = UINT16
        elif native is DType.uint32:
            self.code = UINT32
        elif native is DType.uint64:
            self.code = UINT64
        elif native is DType.float16:
            # Fixed: DType.float16 previously fell through to NA even though
            # the module-level `float16` constant is built with FLOAT16.
            self.code = FLOAT16
        elif native is DType.float32:
            self.code = FLOAT32
        elif native is DType.float64:
            self.code = FLOAT64
        else:
            self.code = NA
        self.native = native
        self.fields = List[Field]()

    fn __init__(out self, *, code: UInt8, native: DType):
        self.code = code
        self.native = native
        self.fields = List[Field]()

    fn __init__(out self, *, code: UInt8, fields: List[Field]):
        self.code = code
        self.native = DType.invalid
        self.fields = fields.copy()

    fn __copyinit__(out self, value: Self):
        self.code = value.code
        self.native = value.native
        self.fields = value.fields.copy()

    fn __moveinit__(out self, deinit value: Self):
        self.code = value.code
        self.native = value.native
        self.fields = value.fields^

    fn __is__(self, other: DataType) -> Bool:
        return self == other

    fn __isnot__(self, other: DataType) -> Bool:
        return self != other

    fn __eq__(self, other: DataType) -> Bool:
        # Equal when the codes match and the child fields match pairwise.
        if self.code != other.code:
            return False
        if len(self.fields) != len(other.fields):
            return False
        for i in range(len(self.fields)):
            if self.fields[i] != other.fields[i]:
                return False
        return True

    fn __ne__(self, other: DataType) -> Bool:
        return not self == other

    fn write_to[W: Writer](self, mut writer: W):
        """
        Formats this DataType to the provided Writer.

        Parameters:
            W: A type conforming to the Writable trait.

        Args:
            writer: The object to write to.
        """
        writer.write("DataType(code=")
        if self.code == NA:
            writer.write("null")
        elif self.code == BOOL:
            writer.write("bool")
        elif self.code == UINT8:
            writer.write("uint8")
        elif self.code == INT8:
            writer.write("int8")
        elif self.code == INT16:
            writer.write("int16")
        elif self.code == INT32:
            writer.write("int32")
        elif self.code == INT64:
            writer.write("int64")
        # Fixed: the unsigned, floating-point and binary codes used by the
        # module-level constants previously printed as "unknown N".
        elif self.code == UINT16:
            writer.write("uint16")
        elif self.code == UINT32:
            writer.write("uint32")
        elif self.code == UINT64:
            writer.write("uint64")
        elif self.code == FLOAT16:
            writer.write("float16")
        elif self.code == FLOAT32:
            writer.write("float32")
        elif self.code == FLOAT64:
            writer.write("float64")
        elif self.code == STRING:
            writer.write("string")
        elif self.code == BINARY:
            writer.write("binary")
        elif self.code == LIST:
            writer.write("list")
        elif self.code == STRUCT:
            writer.write("struct")
        else:
            writer.write("unknown " + String(self.code))
        writer.write(")")

    fn __str__(self) -> String:
        return String.write(self)

    fn __repr__(self) -> String:
        return String.write(self)

    fn is_bool(self) -> Bool:
        return self.code == BOOL

    fn bitwidth(self) -> UInt8:
        """Bit width of a fixed-size type; 0 for variable-size or unknown types."""
        if self.code == BOOL:
            return 1
        elif self.code == INT8:
            return 8
        elif self.code == INT16:
            return 16
        elif self.code == INT32:
            return 32
        elif self.code == INT64:
            return 64
        elif self.code == UINT8:
            return 8
        elif self.code == UINT16:
            return 16
        elif self.code == UINT32:
            return 32
        elif self.code == UINT64:
            return 64
        elif self.code == FLOAT16:
            # Fixed: FLOAT16 previously returned 0, making is_fixed_size()
            # false for half-precision floats.
            return 16
        elif self.code == FLOAT32:
            return 32
        elif self.code == FLOAT64:
            return 64
        else:
            return 0

    @always_inline
    fn is_boolean(self) -> Bool:
        return self.code == BOOL

    @always_inline
    fn is_fixed_size(self) -> Bool:
        return self.bitwidth() > 0

    @always_inline
    fn is_integer(self) -> Bool:
        # TODO(kszucs): cannot use the following because ListLiteral.__contains__ is not implemented
        # return self.code in [INT8, INT16, INT32, INT64, UINT8, UINT16, UINT32, UINT64]
        return self.is_signed_integer() or self.is_unsigned_integer()

    @always_inline
    fn is_signed_integer(self) -> Bool:
        return (
            self.code == INT8
            or self.code == INT16
            or self.code == INT32
            or self.code == INT64
        )

    @always_inline
    fn is_unsigned_integer(self) -> Bool:
        return (
            self.code == UINT8
            or self.code == UINT16
            or self.code == UINT32
            or self.code == UINT64
        )

    @always_inline
    fn is_floating_point(self) -> Bool:
        return self.code == FLOAT32 or self.code == FLOAT64

    @always_inline
    fn is_numeric(self) -> Bool:
        return self.is_integer() or self.is_floating_point()

    @always_inline
    fn is_string(self) -> Bool:
        return self.code == STRING

    @always_inline
    fn is_list(self) -> Bool:
        return self.code == LIST

    @always_inline
    fn is_struct(self) -> Bool:
        return self.code == STRUCT


fn list_(var value_type: DataType) -> DataType:
    """Construct a LIST DataType with a single child field named "value"."""
    return DataType(code=LIST, fields=List(Field("value", value_type^)))


fn struct_(fields: List[Field]) -> DataType:
    """Construct a STRUCT DataType from a list of child fields."""
    return DataType(code=STRUCT, fields=fields)


fn struct_(var *fields: Field) -> DataType:
    """Construct a STRUCT DataType from variadic child fields."""
    return DataType(code=STRUCT, fields=List(elements=fields^))


comptime null = DataType(code=NA)
comptime bool_ = DataType(code=BOOL, native=DType.bool)
comptime int8 = DataType(code=INT8, native=DType.int8)
comptime int16 = DataType(code=INT16, native=DType.int16)
comptime int32 = DataType(code=INT32, native=DType.int32)
comptime int64 = DataType(code=INT64, native=DType.int64)
comptime uint8 = DataType(code=UINT8, native=DType.uint8)
comptime uint16 = DataType(code=UINT16, native=DType.uint16)
comptime uint32 = DataType(code=UINT32, native=DType.uint32)
comptime uint64 =
DataType(code=UINT64, native=DType.uint64)
comptime float16 = DataType(code=FLOAT16, native=DType.float16)
comptime float32 = DataType(code=FLOAT32, native=DType.float32)
comptime float64 = DataType(code=FLOAT64, native=DType.float64)
comptime string = DataType(code=STRING)
comptime binary = DataType(code=BINARY)

comptime all_numeric_dtypes = [
    int8,
    int16,
    int32,
    int64,
    uint8,
    uint16,
    uint32,
    uint64,
    float16,
    float32,
    float64,
]


fn dynamic_size_of(dtype: DType) -> Int:
    """Get size of a dtype by dispatching to compile-time size_of.

    Returns 0 (after a debug_assert) for dtypes with no branch below.
    """
    if dtype == DType.bool:
        return size_of[DType.bool]()
    elif dtype == DType.int8:
        return size_of[DType.int8]()
    elif dtype == DType.int16:
        return size_of[DType.int16]()
    elif dtype == DType.int32:
        return size_of[DType.int32]()
    elif dtype == DType.int64:
        return size_of[DType.int64]()
    elif dtype == DType.uint8:
        return size_of[DType.uint8]()
    elif dtype == DType.uint16:
        return size_of[DType.uint16]()
    elif dtype == DType.uint32:
        return size_of[DType.uint32]()
    elif dtype == DType.uint64:
        return size_of[DType.uint64]()
    elif dtype == DType.float16:
        # Fixed: float16 is part of all_numeric_dtypes above but previously
        # fell through to the debug_assert and returned 0.
        return size_of[DType.float16]()
    elif dtype == DType.float32:
        return size_of[DType.float32]()
    elif dtype == DType.float64:
        return size_of[DType.float64]()
    debug_assert(False, "Can't get the size of ", dtype)
    return 0
--------------------------------------------------------------------------------
/marrow/buffers.mojo:
--------------------------------------------------------------------------------
from memory import (
    LegacyUnsafePointer,
    memset_zero,
    memcpy,
    ArcPointer,
    Span,
    memset,
)
from sys.info import simd_byte_width
from sys import size_of
from marrow.dtypes import dynamic_size_of
import math
from bit import pop_count,
count_trailing_zeros


fn _required_bytes(length: Int, T: DType) -> Int:
    """Number of bytes needed to hold `length` elements of T, rounded up to a
    multiple of 64. Booleans are bit-packed (8 values per byte)."""
    var size: Int
    if T is DType.bool:
        size = math.ceildiv(length, 8)
    else:
        size = length * dynamic_size_of(T)
    return math.align_up(size, 64)


comptime simd_width = simd_byte_width()

comptime simd_widths = (simd_width, simd_width // 2, 1)


struct Buffer(Movable):
    """A 64-byte-aligned byte buffer that optionally owns its allocation.

    `offset` is an element offset applied by unsafe_get/unsafe_set.
    """

    var ptr: LegacyUnsafePointer[UInt8]
    var size: Int
    var owns: Bool
    var offset: Int

    fn __init__(
        out self,
        ptr: LegacyUnsafePointer[UInt8],
        size: Int,
        owns: Bool = True,
        offset: Int = 0,
    ):
        self.ptr = ptr
        self.size = size
        self.owns = owns
        self.offset = offset

    fn __moveinit__(out self, deinit existing: Self):
        self.ptr = existing.ptr
        self.size = existing.size
        self.owns = existing.owns
        self.offset = existing.offset

    fn swap(mut self, mut other: Self):
        """Swap the content of this buffer with another buffer."""

        var tmp_ptr = self.ptr
        var tmp_size = self.size
        var tmp_owns = self.owns
        var tmp_offset = self.offset
        self.ptr = other.ptr
        self.size = other.size
        self.owns = other.owns
        self.offset = other.offset
        other.ptr = tmp_ptr
        other.size = tmp_size
        other.owns = tmp_owns
        other.offset = tmp_offset

    @staticmethod
    fn alloc[I: Intable, //, T: DType = DType.uint8](length: I) -> Buffer:
        """Allocate a zero-initialized, 64-byte-aligned buffer sized for
        `length` elements of T."""
        var size = _required_bytes(Int(length), T)
        var ptr = alloc[UInt8](size, alignment=64)
        memset_zero(ptr.bitcast[UInt8](), size)
        return Buffer(ptr, size)

    @staticmethod
    fn from_values[dtype: DType](*values: Scalar[dtype]) -> Buffer:
        """Build a buffer from a list of values."""
        var buffer = Self.alloc[dtype](len(values))

        for i in range(len(values)):
            buffer.unsafe_set[dtype](i, values[i])

        return buffer^

    @staticmethod
    fn view[
        I: Intable, //
    ](
        ptr: LegacyUnsafePointer[NoneType],
        length: I,
        dtype: DType = DType.uint8,
    ) raises -> Buffer:
        """Wrap foreign memory as a non-owning Buffer (no copy, no free)."""
        var size = _required_bytes(Int(length), dtype)
        return Buffer(ptr.bitcast[UInt8](), size, owns=False)

    @always_inline
    fn get_ptr_at(self, index: Int) -> LegacyUnsafePointer[UInt8]:
        # NOTE(review): `index` here is a raw byte index — `self.offset` is
        # deliberately not applied, matching how bit_count uses it.
        return (self.ptr + index).bitcast[UInt8]()

    fn grow[I: Intable, //, T: DType = DType.uint8](mut self, target_length: I):
        """Grow the buffer to hold at least `target_length` elements of T,
        copying existing bytes; no-op when already large enough."""
        if self.length[T]() >= Int(target_length):
            return

        var new = Buffer.alloc[T](target_length)
        memcpy(
            dest=new.ptr.bitcast[UInt8](),
            src=self.ptr.bitcast[UInt8](),
            count=self.size,
        )
        self.swap(new)

    fn __del__(deinit self):
        # Only free memory this buffer allocated itself (see `view`).
        if self.owns:
            self.ptr.free()

    @always_inline
    fn length[T: DType = DType.uint8](self) -> Int:
        """Capacity of the buffer in elements of T (bits for bool)."""

        @parameter
        if T is DType.bool:
            return self.size * 8
        else:
            return self.size // size_of[T]()

    @always_inline
    fn unsafe_get[T: DType = DType.uint8](self, index: Int) -> Scalar[T]:
        """Read element `index` (offset-adjusted); bools are read as single
        bits with LSB-first ordering. No bounds checking."""
        comptime output = Scalar[T]

        @parameter
        if T is DType.bool:
            var adjusted_index = index + self.offset
            var byte_index = adjusted_index // 8
            var bit_index = adjusted_index % 8
            var byte = self.ptr[byte_index]
            var wanted_bit = (byte >> bit_index) & 1
            return Scalar[T](wanted_bit)
        else:
            return self.ptr.bitcast[output]()[index + self.offset]

    @always_inline
    fn unsafe_set[
        T: DType = DType.uint8
    ](mut self, index: Int, value: Scalar[T]):
        """Write element `index` (offset-adjusted); bools are written as single
        bits with LSB-first ordering. No bounds checking."""

        @parameter
        if T is DType.bool:
            var adjusted_index = index + self.offset
            var byte_index = adjusted_index // 8
            var bit_index = adjusted_index % 8
            var byte = self.ptr[byte_index]
            if value:
                self.ptr[byte_index] = byte | (1 << bit_index)
            else:
                self.ptr[byte_index] = byte & ~(1 << bit_index)
        else:
            comptime output = Scalar[T]
            self.ptr.bitcast[output]()[index + self.offset] = value

    fn bit_count(self) -> Int:
        """The number of bits with value 1 in the buffer."""
        # SIMD popcount over full vectors, falling back to byte-at-a-time for
        # the tail. Allocation is zero-initialized, so padding bytes are 0.
        var start = 0
        var count = 0
        while start < self.size:
            if self.size - start > simd_width:
                count += (
                    self.get_ptr_at(start)
                    .load[width=simd_width]()
                    .reduce_bit_count()
                )
                start += simd_width
            else:
                count += (
                    self.get_ptr_at(start).load[width=1]().reduce_bit_count()
                )
                start += 1
        return count


struct Bitmap(Movable, Representable, Stringable, Writable):

    """Hold information about the null records in an array."""

    var buffer: Buffer
    var offset: Int

    @staticmethod
    fn alloc[I: Intable](length: I) -> Bitmap:
        """Allocate a zero-initialized (all-invalid) bitmap for `length` bits."""
        return Bitmap(Buffer.alloc[DType.bool](length))

    fn __init__(out self, var buffer: Buffer, offset: Int = 0):
        self.buffer = buffer^
        self.offset = offset

    fn __moveinit__(out self, deinit existing: Self):
        self.buffer = existing.buffer^
        self.offset = existing.offset

    fn write_to[W: Writer](self, mut writer: W):
        """
        Formats this bitmap to the provided Writer.

        Parameters:
            W: A type conforming to the Writable trait.

        Args:
            writer: The object to write to.
        """
        # NOTE(review): set bits print as uppercase "T" but clear bits as
        # lowercase "f"; looks unintentional — confirm before changing output.
        for i in range(self.length()):
            var value = self.unsafe_get(i)
            if value:
                writer.write("T")
            else:
                writer.write("f")
            # Truncate long bitmaps after the first handful of bits.
            if i > 16:
                writer.write("...")
                break

    fn __str__(self) -> String:
        var output = String()
        output.write(self)
        return output

    fn __repr__(self) -> String:
        var output = String()
        output.write(self)
        return output

    fn unsafe_get(self, index: Int) -> Bool:
        return self.buffer.unsafe_get[DType.bool](index + self.offset)

    fn unsafe_set(mut self, index: Int, value: Bool) -> None:
        self.buffer.unsafe_set[DType.bool](index + self.offset, value)

    @always_inline
    fn length(self) -> Int:
        return self.buffer.length[DType.bool]()

    @always_inline
    fn size(self) -> Int:
        return self.buffer.size

    fn grow[I: Intable](mut self, target_length: I):
        return self.buffer.grow[DType.bool](target_length)

    fn bit_count(self) -> Int:
        """The number of bits with value 1 in the Bitmap."""
        # Consistency: delegate to Buffer.bit_count — the previous body was a
        # verbatim copy of that loop. Note both versions count over the whole
        # buffer and ignore `self.offset` — presumably offset bitmaps are
        # always offset 0 here; verify with callers.
        return self.buffer.bit_count()

    fn count_leading_bits(self, start: Int = 0, value: Bool = False) -> Int:
        """Count the number of leading bits with the given value in the bitmap, starting at a given position.

        Note that index 0 in the bitmap translates to right most bit in the first byte of the buffer.
        So when we are looking for leading zeros from a bitmap standpoint we need to look at
        trailing zeros in the bitmap's associated buffer.

        The SIMD API available looks at leading zeros only, we negate the input when needed.

        Args:
            start: The position where we should start counting.
            value: The value of the bits we want to count.

        Returns:
            The number of leading bits with the given value in the bitmap.
        """

        var count = 0
        var index = start // 8
        var bit_in_first_byte = start % 8

        if bit_in_first_byte != 0:
            # Process the partial first byte by applying a mask.
            var loaded = self.buffer.get_ptr_at(index).load[width=1]()
            if value:
                loaded = ~loaded
            var mask = (1 << bit_in_first_byte) - 1
            loaded &= ~mask
            leading_zeros = Int(count_trailing_zeros(loaded))
            if leading_zeros == 0:
                return count
            count = leading_zeros - bit_in_first_byte
            if leading_zeros != 8:
                # The first byte has some bits of the other value, just return the count.
                return count

            index += 1

        # Process full bytes, preferring the widest SIMD load that still fits.
        while index < self.size():

            @parameter
            for width_index in range(len(simd_widths)):
                comptime width = simd_widths[width_index]
                if self.size() - index >= width:
                    var loaded = self.buffer.get_ptr_at(index).load[
                        width=width
                    ]()
                    if value:
                        loaded = ~loaded
                    var leading_zeros = count_trailing_zeros(loaded)
                    for i in range(width):
                        count += Int(leading_zeros[i])
                        if leading_zeros[i] != 8:
                            return count
                    index += width
                    # break from the simd widths loop
                    break
        return count

    fn count_leading_zeros(self, start: Int = 0) -> Int:
        """Count the number of leading 0s in the given value in the bitmap, starting at a given position.

        Note that index 0 in the bitmap translates to right most bit in the first byte of the buffer.
331 | So when we are looking for leading zeros from a bitmap standpoing we need to look at 332 | trailing zeros in the bitmap's associated buffer. 333 | 334 | Args: 335 | start: The position where we should start counting. 336 | 337 | Returns: 338 | The number of leading zeros in the bitmap. 339 | """ 340 | return self.count_leading_bits(start, value=False) 341 | 342 | fn count_leading_ones(self, start: Int = 0) -> Int: 343 | """Count the number of leading 1s in the given value in the bitmap, starting at a given posiion. 344 | 345 | Note that index 0 in the bitmap translates to right most bit in the first byte of the buffer. 346 | So when we are looking for leading zeros from a bitmap standpoing we need to look at 347 | trailing zeros in the bitmap's associated buffer. 348 | 349 | Args: 350 | start: The position where we should start counting. 351 | 352 | Returns: 353 | The number of leading ones in the bitmap. 354 | """ 355 | return self.count_leading_bits(start, value=True) 356 | 357 | fn extend( 358 | mut self, 359 | other: Bitmap, 360 | start: Int, 361 | length: Int, 362 | ) -> None: 363 | """Extends the bitmap with the other's array's bitmap. 364 | 365 | Args: 366 | other: The bitmap to take content from. 367 | start: The starting index in the destination array. 368 | length: The number of elements to copy from the source array. 369 | """ 370 | var desired_size = _required_bytes(start + length, DType.bool) 371 | self.buffer.grow[DType.bool](desired_size) 372 | 373 | for i in range(length): 374 | self.unsafe_set(i + start, other.unsafe_get(i)) 375 | 376 | fn partial_byte_set( 377 | mut self, 378 | byte_index: Int, 379 | bit_pos_start: Int, 380 | bit_pos_end: Int, 381 | value: Bool, 382 | ) -> None: 383 | """Set a range of bits in one specific byte of the bitmap to the specified value. 
384 | """ 385 | 386 | debug_assert( 387 | bit_pos_start >= 0 388 | and bit_pos_end <= 8 389 | and bit_pos_start <= bit_pos_end, 390 | "Invalid range: ", 391 | bit_pos_start, 392 | " to ", 393 | bit_pos_end, 394 | ) 395 | 396 | # Process the partial byte at the start, if appropriate. 397 | var mask = (1 << (bit_pos_end - bit_pos_start)) - 1 398 | mask = mask << bit_pos_start 399 | var initial_value = self.buffer.unsafe_get[DType.uint8](byte_index) 400 | var buffer_value = initial_value 401 | if value: 402 | buffer_value = buffer_value | mask 403 | else: 404 | buffer_value = buffer_value & ~mask 405 | self.buffer.unsafe_set[DType.uint8](byte_index, buffer_value) 406 | 407 | fn unsafe_range_set[ 408 | T: Intable, U: Intable, // 409 | ](mut self, start: T, length: U, value: Bool) -> None: 410 | """Set a range of bits in the bitmap to the specified value. 411 | 412 | Args: 413 | start: The starting index in the bitmap. 414 | length: The number of bits to set. 415 | value: The value to set the bits to. 416 | """ 417 | 418 | # Process the partial byte at the ends. 419 | var start_int = Int(start) 420 | var end_int = start_int + Int(length) 421 | var start_index = start_int // 8 422 | var bit_pos_start = start_int % 8 423 | var end_index = end_int // 8 424 | var bit_pos_end = end_int % 8 425 | 426 | if bit_pos_start != 0 or bit_pos_end != 0: 427 | if start_index == end_index: 428 | self.partial_byte_set( 429 | start_index, bit_pos_start, bit_pos_end, value 430 | ) 431 | else: 432 | if bit_pos_start != 0: 433 | self.partial_byte_set(start_index, bit_pos_start, 8, value) 434 | start_index += 1 435 | if bit_pos_end != 0: 436 | self.partial_byte_set(end_index, 0, bit_pos_end, value) 437 | 438 | # Now take care of the full bytes. 
439 | if end_index > start_index: 440 | var byte_value = 255 if value else 0 441 | memset( 442 | self.buffer.get_ptr_at(start_index), 443 | value=byte_value, 444 | count=end_index - start_index, 445 | ) 446 | -------------------------------------------------------------------------------- /marrow/c_data.mojo: -------------------------------------------------------------------------------- 1 | from sys.ffi import external_call, c_char 2 | from memory import LegacyUnsafePointer, ArcPointer, memcpy 3 | from sys import size_of 4 | 5 | import math 6 | from python import Python, PythonObject 7 | from python._cpython import CPython, PyObjectPtr 8 | from sys.ffi import c_char 9 | from io.write import Writable, Writer 10 | 11 | from .dtypes import * 12 | from .arrays import * 13 | 14 | comptime ARROW_FLAG_NULLABLE = 2 15 | 16 | # This type of the function argument is really CArrowSchema but we are getting errors with: recursive reference to declaration. 17 | comptime CSchemaReleaseFunction = fn ( 18 | schema: LegacyUnsafePointer[UInt64] 19 | ) -> NoneType 20 | # This type of the function argument is really CArrowArray but we are getting errors with: recursive reference to declaration. 
comptime CArrayReleaseFunction = fn (
    schema: LegacyUnsafePointer[UInt64]
) -> NoneType


@fieldwise_init
struct CArrowSchema(Copyable, Movable, Representable, Stringable, Writable):
    """Mojo mirror of the ArrowSchema struct of the Arrow C data interface.

    The field order and types must stay byte-compatible with the C ABI —
    do not reorder or change them.
    """

    var format: LegacyUnsafePointer[c_char]
    var name: LegacyUnsafePointer[c_char]
    var metadata: LegacyUnsafePointer[c_char]
    var flags: Int64
    var n_children: Int64
    var children: LegacyUnsafePointer[LegacyUnsafePointer[CArrowSchema]]
    var dictionary: LegacyUnsafePointer[CArrowSchema]
    # TODO(kszucs): release callback must be called otherwise memory gets leaked
    var release: LegacyUnsafePointer[CSchemaReleaseFunction]
    var private_data: LegacyUnsafePointer[NoneType]

    fn __del__(deinit self):
        # `this` is kept for the (currently disabled) release call below.
        var this = LegacyUnsafePointer(to=self).bitcast[UInt64]()
        if self.release:
            # Calling the function leads to a crash.
            # self.release[](this)
            pass

    @staticmethod
    fn from_pyarrow(pyobj: PythonObject) raises -> CArrowSchema:
        """Import a schema from a pyarrow object via its C data interface."""
        var ptr = LegacyUnsafePointer[CArrowSchema].alloc(1)
        pyobj._export_to_c(Int(ptr))
        # Move the struct out of the temporary allocation and release the
        # allocation itself; previously the 1-element buffer was leaked.
        var schema = ptr.take_pointee()
        ptr.free()
        return schema^

    fn to_pyarrow(self) raises -> PythonObject:
        """Export this schema to a pyarrow.Schema via the C data interface."""
        var pa = Python.import_module("pyarrow")
        var ptr = LegacyUnsafePointer(to=self)
        return pa.Schema._import_from_c(Int(ptr))

    @staticmethod
    fn from_dtype(dtype: DataType) -> CArrowSchema:
        """Build a CArrowSchema describing `dtype` via Arrow format strings.

        Struct types recurse into their fields and allocate a children array;
        all other supported types are encoded as a single format character.
        """
        var fmt: String
        var n_children: Int64 = 0
        var children = LegacyUnsafePointer[LegacyUnsafePointer[CArrowSchema]]()

        if dtype == materialize[null]():
            fmt = "n"
        elif dtype == materialize[bool_]():
            fmt = "b"
        elif dtype == materialize[int8]():
            fmt = "c"
        elif dtype == materialize[uint8]():
            fmt = "C"
        elif dtype == materialize[int16]():
            fmt = "s"
        elif dtype == materialize[uint16]():
            fmt = "S"
        elif dtype == materialize[int32]():
            fmt = "i"
        elif dtype == materialize[uint32]():
            fmt = "I"
        elif dtype == materialize[int64]():
            fmt = "l"
        elif dtype == materialize[uint64]():
            fmt = "L"
        elif dtype == materialize[float16]():
            fmt = "e"
        elif dtype == materialize[float32]():
            fmt = "f"
        elif dtype == materialize[float64]():
            fmt = "g"
        elif dtype == materialize[binary]():
            fmt = "z"
        elif dtype == materialize[string]():
            fmt = "u"
        elif dtype.is_struct():
            # (A leftover debug `print("EEE")` was removed here.)
            fmt = "+s"
            n_children = Int(len(dtype.fields))
            children = LegacyUnsafePointer[
                LegacyUnsafePointer[CArrowSchema]
            ].alloc(Int(n_children))

            for i in range(n_children):
                var child = CArrowSchema.from_field(dtype.fields[i])
                children[i].init_pointee_move(child^)
        else:
            # Unsupported dtypes fall through with an empty format string;
            # consumers will reject it.
            fmt = ""
            # constrained[False, "Unknown dtype"]()

        # NOTE(review): `fmt` is a local String, so the pointer produced by
        # unsafe_cstr_ptr() dangles once this function returns. The format
        # string needs storage that outlives the returned schema — confirm
        # before relying on the exported schema from C consumers.
        return CArrowSchema(
            format=fmt.unsafe_cstr_ptr(),
            name=LegacyUnsafePointer[c_char](),
            metadata=LegacyUnsafePointer[c_char](),
            flags=0,
            n_children=n_children,
            children=children,
            dictionary=LegacyUnsafePointer[CArrowSchema](),
            # TODO(kszucs): currently there is no way to pass a mojo callback to C
            release=LegacyUnsafePointer[CSchemaReleaseFunction](),
            private_data=LegacyUnsafePointer[NoneType](),
        )

    @staticmethod
    fn from_field(field: Field) -> CArrowSchema:
        """Build a CArrowSchema carrying only the field's name and flags.

        NOTE(review): the format string is left empty and `field.dtype` is
        ignored, so struct children exported through from_dtype carry no type
        information — presumably incomplete; verify against consumers.
        """
        var flags: Int64 = 0  # TODO: nullable

        var field_name = field.name
        # NOTE(review): same lifetime issue as in from_dtype — `field_name`
        # is a local copy, so its cstr pointer dangles after return.
        return CArrowSchema(
            format="".unsafe_cstr_ptr(),
            name=field_name.unsafe_cstr_ptr(),
            metadata="".unsafe_cstr_ptr(),
            flags=flags,
            n_children=0,
            children=LegacyUnsafePointer[LegacyUnsafePointer[CArrowSchema]](),
            dictionary=LegacyUnsafePointer[CArrowSchema](),
            # TODO(kszucs): currently there is no way to pass a mojo callback to C
            release=LegacyUnsafePointer[CSchemaReleaseFunction](),
            private_data=LegacyUnsafePointer[NoneType](),
        )

    fn to_dtype(self) raises -> DataType:
        """Decode this schema's Arrow format string into a marrow DataType.

        Raises:
            Error: When the format string is not recognized.
        """
        var fmt = StringSlice(unsafe_from_utf8_ptr=UnsafePointer(self.format))
        # TODO(kszucs): not the nicest, but dictionary literals are not supported yet
        if fmt == "n":
            return materialize[null]()
        elif fmt == "b":
            return materialize[bool_]()
        elif fmt == "c":
            return materialize[int8]()
        elif fmt == "C":
            return materialize[uint8]()
        elif fmt == "s":
            return materialize[int16]()
        elif fmt == "S":
            return materialize[uint16]()
        elif fmt == "i":
            return materialize[int32]()
        elif fmt == "I":
            return materialize[uint32]()
        elif fmt == "l":
            return materialize[int64]()
        elif fmt == "L":
            return materialize[uint64]()
        elif fmt == "e":
            return materialize[float16]()
        elif fmt == "f":
            return materialize[float32]()
        elif fmt == "g":
            return materialize[float64]()
        elif fmt == "z":
            return materialize[binary]()
        elif fmt == "u":
            return materialize[string]()
        elif fmt == "+l":
            var field = self.children[0][].to_field()
            return list_(field.dtype.copy())
        elif fmt == "+s":
            var fields = List[Field]()
            for i in range(self.n_children):
                fields.append(self.children[i][].to_field())
            return struct_(fields)
        else:
            raise Error("Unknown format: " + fmt)

    fn to_field(self) raises -> Field:
        """Convert this schema node into a marrow Field (name, dtype, nullability)."""
        var name = StringSlice(unsafe_from_utf8_ptr=UnsafePointer(self.name))
        var dtype = self.to_dtype()
        var nullable = self.flags & ARROW_FLAG_NULLABLE
        return Field(String(name), dtype^, nullable != 0)

    fn write_to[W: Writer](self, mut writer: W):
        """
        Formats this CArrowSchema to the provided Writer.

        Parameters:
            W: A type conforming to the Writable trait.

        Args:
            writer: The object to write to.
199 | """ 200 | writer.write("CArrowSchema(") 201 | writer.write('name="') 202 | writer.write(StringSlice(unsafe_from_utf8_ptr=UnsafePointer(self.name))) 203 | writer.write('", ') 204 | writer.write('format="') 205 | writer.write( 206 | StringSlice(unsafe_from_utf8_ptr=UnsafePointer(self.format)) 207 | ) 208 | writer.write('", ') 209 | if self.metadata: 210 | writer.write('metadata="') 211 | writer.write(self.metadata) 212 | writer.write('", ') 213 | writer.write("n_children=") 214 | writer.write(self.n_children) 215 | writer.write(")") 216 | 217 | fn __str__(self) -> String: 218 | return String.write(self) 219 | 220 | fn __repr__(self) -> String: 221 | return String.write(self) 222 | 223 | 224 | @fieldwise_init 225 | struct CArrowArray(Copyable, Movable): 226 | var length: Int64 227 | var null_count: Int64 228 | var offset: Int64 229 | var n_buffers: Int64 230 | var n_children: Int64 231 | var buffers: LegacyUnsafePointer[LegacyUnsafePointer[NoneType]] 232 | var children: LegacyUnsafePointer[LegacyUnsafePointer[CArrowArray]] 233 | var dictionary: LegacyUnsafePointer[CArrowArray] 234 | var release: LegacyUnsafePointer[CArrayReleaseFunction] 235 | var private_data: LegacyUnsafePointer[NoneType] 236 | 237 | @staticmethod 238 | fn from_pyarrow(pyobj: PythonObject) raises -> CArrowArray: 239 | var ptr = LegacyUnsafePointer[CArrowArray].alloc(1) 240 | pyobj._export_to_c(Int(ptr)) 241 | return ptr.take_pointee() 242 | 243 | fn to_array(self, dtype: DataType) raises -> ArrayData: 244 | var bitmap: ArcPointer[Bitmap] 245 | if self.buffers[0]: 246 | bitmap = ArcPointer( 247 | Bitmap(Buffer.view(self.buffers[0], self.length, DType.bool)) 248 | ) 249 | else: 250 | # bitmaps are allowed to be nullptrs by the specification, in this 251 | # case we allocate a new buffer to hold the null bitmap 252 | bitmap = ArcPointer(Bitmap.alloc(self.length)) 253 | bitmap[].unsafe_range_set(0, self.length, True) 254 | 255 | var buffers = List[ArcPointer[Buffer]]() 256 | if dtype.is_numeric() 
or dtype == materialize[bool_](): 257 | var buffer = Buffer.view(self.buffers[1], self.length, dtype.native) 258 | buffers.append(ArcPointer(buffer^)) 259 | elif dtype == materialize[string](): 260 | var offsets = Buffer.view( 261 | self.buffers[1], self.length + 1, DType.uint32 262 | ) 263 | var values_size = Int(offsets.unsafe_get(Int(self.length))) 264 | var values = Buffer.view(self.buffers[2], values_size, DType.uint8) 265 | buffers.append(ArcPointer(offsets^)) 266 | buffers.append(ArcPointer(values^)) 267 | elif dtype.is_list(): 268 | var offsets = Buffer.view( 269 | self.buffers[1], self.length + 1, DType.uint32 270 | ) 271 | buffers.append(ArcPointer(offsets^)) 272 | elif dtype.is_struct(): 273 | # Since the children buffers are handled below there is nothing to do here. 274 | pass 275 | else: 276 | raise Error("Unknown dtype: " + String(dtype)) 277 | 278 | var children = List[ArcPointer[ArrayData]]() 279 | for i in range(self.n_children): 280 | var child_field = dtype.fields[i].copy() 281 | var child_array = self.children[i][].to_array(child_field.dtype) 282 | children.append(ArcPointer(child_array^)) 283 | 284 | return ArrayData( 285 | dtype=dtype.copy(), 286 | length=Int(self.length), 287 | bitmap=bitmap, 288 | buffers=buffers^, 289 | children=children^, 290 | offset=Int(self.offset), 291 | ) 292 | 293 | 294 | # See: https://arrow.apache.org/docs/format/CStreamInterface.html 295 | # 296 | # We are getting some compilation errors with many "recursive" function definitions, i.e. functions in 297 | # CArrowArrayStream that take a CArrowArrayStream as an argument. The error is: recursive reference to declaration. 298 | # 299 | # As a workaround we define twp versions 300 | # of the CArrowArrayStream: 301 | # - CArrowArrayStreamOpaque defines the overall shape of the struct using opaque function prototypes. 302 | # - CArrowArrayStream defines the struct with the actual function signatures defined in terms of the Opaque variant above. 
#
# An alternative could be to define `get_schema` and friends as methods on the struct and self would have the right type.
# It is not clear if the resulting ABI would be guaranteed to be compatible with C.
comptime AnyFunction = fn (LegacyUnsafePointer[NoneType]) -> UInt


@fieldwise_init
@register_passable("trivial")
struct CArrowArrayStreamOpaque(Copyable, Movable):
    # Callbacks providing stream functionality
    var get_schema: AnyFunction
    var get_next: AnyFunction
    var get_last_error: AnyFunction

    # Release callback
    var release: AnyFunction

    # Opaque producer-specific data
    var private_data: LegacyUnsafePointer[NoneType]


comptime get_schema_fn = fn (
    stream: LegacyUnsafePointer[CArrowArrayStreamOpaque],
    out_schema: LegacyUnsafePointer[CArrowSchema],
) -> UInt

comptime get_next_fn = fn (
    stream: LegacyUnsafePointer[CArrowArrayStreamOpaque],
    out_array: LegacyUnsafePointer[CArrowArray],
) -> UInt


@fieldwise_init
@register_passable("trivial")
struct CArrowArrayStream(Copyable, Movable):
    # Callbacks providing stream functionality
    var get_schema: get_schema_fn
    var get_next: fn (
        stream: LegacyUnsafePointer[CArrowArrayStreamOpaque],
        out_array: LegacyUnsafePointer[CArrowArray],
    ) -> UInt
    var get_last_error: fn (
        stream: LegacyUnsafePointer[CArrowArrayStreamOpaque]
    ) -> LegacyUnsafePointer[c_char]

    # Release callback
    var release: fn (
        stream: LegacyUnsafePointer[CArrowArrayStreamOpaque]
    ) -> None

    # Opaque producer-specific data
    var private_data: LegacyUnsafePointer[NoneType]


@fieldwise_init
struct ArrowArrayStream(Copyable, Movable):
    """Provide a friendly interface to the C Arrow Array Stream."""

    var c_arrow_array_stream: LegacyUnsafePointer[CArrowArrayStreamOpaque]

    @staticmethod
    fn from_pyarrow(
        pyobj: PythonObject, cpython: CPython
    ) raises -> ArrowArrayStream:
        """Ask a PyArrow table for its arrow array stream interface."""
        var stream = pyobj.__arrow_c_stream__()
        var ptr = cpython.PyCapsule_GetPointer(
            stream.steal_data(), "arrow_array_stream"
        )
        if not ptr:
            raise Error("Failed to get the arrow array stream pointer")

        var alt = LegacyUnsafePointer(ptr.bitcast[CArrowArrayStreamOpaque]())
        return ArrowArrayStream(alt)

    fn c_schema(self) raises -> CArrowSchema:
        """Return the C variant of the Arrow Schema.

        Raises:
            Error: When the producer's get_schema callback reports an error.
        """
        var schema = LegacyUnsafePointer[CArrowSchema].alloc(1)
        # The stored callback is opaque; reinterpret it with its real
        # signature before calling.
        var function = LegacyUnsafePointer(
            to=self.c_arrow_array_stream[].get_schema
        ).bitcast[get_schema_fn]()[]
        var err = function(self.c_arrow_array_stream, schema)
        if err != 0:
            # Free the temporary allocation on the error path too; it was
            # previously leaked.
            schema.free()
            raise Error("Failed to get schema " + String(err))
        if not schema:
            raise Error("The schema pointer is null")
        # Move the struct out and release the 1-element allocation, which
        # was previously leaked on the success path as well.
        var result = schema.take_pointee()
        schema.free()
        return result^

    fn c_next(self) raises -> CArrowArray:
        """Return the next buffer in the stream.

        Raises:
            Error: When the producer's get_next callback reports an error.
        """
        var arrow_array = LegacyUnsafePointer[CArrowArray].alloc(1)
        # The stored callback is opaque; reinterpret it with its real
        # signature before calling.
        var function = LegacyUnsafePointer(
            to=self.c_arrow_array_stream[].get_next
        ).bitcast[get_next_fn]()[]
        var err = function(self.c_arrow_array_stream, arrow_array)
        if err != 0:
            # Free the temporary allocation on the error path too; it was
            # previously leaked.
            arrow_array.free()
            raise Error("Failed to get next arrow array " + String(err))
        if not arrow_array:
            raise Error("The arrow array pointer is null")
        # Move the struct out and release the 1-element allocation, which
        # was previously leaked on the success path as well.
        var result = arrow_array.take_pointee()
        arrow_array.free()
        return result^