├── .github
│   └── workflows
│       └── python-ci.yml
├── .gitignore
├── .gitmodules
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── docs
│   ├── design.md
│   ├── performance.md
│   └── pics
│       ├── hybrid_storage.png
│       └── overview.png
├── notebooks
│   ├── huggingface_conversion.ipynb
│   ├── incremental_embedding_index.ipynb
│   ├── label_studio_tutorial.ipynb
│   ├── pics
│   │   └── space_segment_anything_example.png
│   ├── segment_anything_tutorial.ipynb
│   ├── tfds_coco_tutorial.ipynb
│   └── webdataset_ingestion.ipynb
└── python
    ├── README.md
    ├── build_proto.sh
    ├── pyproject.toml
    ├── src
    │   ├── space
    │   │   ├── __init__.py
    │   │   ├── catalogs
    │   │   │   ├── __init__.py
    │   │   │   ├── base.py
    │   │   │   └── directory.py
    │   │   ├── core
    │   │   │   ├── __init__.py
    │   │   │   ├── datasets.py
    │   │   │   ├── fs
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── array_record.py
    │   │   │   │   ├── arrow.py
    │   │   │   │   ├── base.py
    │   │   │   │   ├── factory.py
    │   │   │   │   └── parquet.py
    │   │   │   ├── jobs.py
    │   │   │   ├── loaders
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── array_record.py
    │   │   │   │   └── parquet.py
    │   │   │   ├── manifests
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── falsifiable_filters.py
    │   │   │   │   ├── index.py
    │   │   │   │   └── record.py
    │   │   │   ├── ops
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── append.py
    │   │   │   │   ├── base.py
    │   │   │   │   ├── change_data.py
    │   │   │   │   ├── delete.py
    │   │   │   │   ├── insert.py
    │   │   │   │   ├── read.py
    │   │   │   │   └── utils.py
    │   │   │   ├── options.py
    │   │   │   ├── proto
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── metadata.proto
    │   │   │   │   ├── metadata_pb2.py
    │   │   │   │   ├── metadata_pb2.pyi
    │   │   │   │   ├── runtime.proto
    │   │   │   │   ├── runtime_pb2.py
    │   │   │   │   └── runtime_pb2.pyi
    │   │   │   ├── random_access.py
    │   │   │   ├── runners.py
    │   │   │   ├── schema
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── arrow.py
    │   │   │   │   ├── constants.py
    │   │   │   │   ├── field_ids.py
    │   │   │   │   ├── substrait.py
    │   │   │   │   ├── types
    │   │   │   │   │   ├── __init__.py
    │   │   │   │   │   ├── files.py
    │   │   │   │   │   └── tf_features.py
    │   │   │   │   └── utils.py
    │   │   │   ├── serializers
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── base.py
    │   │   │   ├── storage.py
    │   │   │   ├── transform
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── join.py
    │   │   │   │   ├── plans.py
    │   │   │   │   ├── udfs.py
    │   │   │   │   └── utils.py
    │   │   │   ├── utils
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── constants.py
    │   │   │   │   ├── errors.py
    │   │   │   │   ├── lazy_imports_utils.py
    │   │   │   │   ├── paths.py
    │   │   │   │   ├── protos.py
    │   │   │   │   └── uuids.py
    │   │   │   └── views.py
    │   │   └── ray
    │   │       ├── __init__.py
    │   │       ├── data_sources.py
    │   │       ├── ops
    │   │       │   ├── __init__.py
    │   │       │   ├── append.py
    │   │       │   ├── change_data.py
    │   │       │   ├── delete.py
    │   │       │   ├── insert.py
    │   │       │   ├── join.py
    │   │       │   └── utils.py
    │   │       ├── options.py
    │   │       └── runners.py
    │   └── substrait
    │       ├── __init__.py
    │       ├── algebra_pb2.py
    │       ├── algebra_pb2.pyi
    │       ├── capabilities_pb2.py
    │       ├── capabilities_pb2.pyi
    │       ├── extended_expression_pb2.py
    │       ├── extended_expression_pb2.pyi
    │       ├── extensions
    │       │   ├── __init__.py
    │       │   ├── extensions_pb2.py
    │       │   └── extensions_pb2.pyi
    │       ├── function_pb2.py
    │       ├── function_pb2.pyi
    │       ├── parameterized_types_pb2.py
    │       ├── parameterized_types_pb2.pyi
    │       ├── plan_pb2.py
    │       ├── plan_pb2.pyi
    │       ├── type_expressions_pb2.py
    │       ├── type_expressions_pb2.pyi
    │       ├── type_pb2.py
    │       └── type_pb2.pyi
    └── tests
        ├── catalogs
        │   └── test_directory.py
        ├── core
        │   ├── conftest.py
        │   ├── fs
        │   │   └── test_arrow.py
        │   ├── loaders
        │   │   ├── test_array_record.py
        │   │   └── test_parquet.py
        │   ├── manifests
        │   │   ├── test_falsifiable_filters.py
        │   │   ├── test_index.py
        │   │   └── test_record.py
        │   ├── ops
        │   │   ├── conftest.py
        │   │   ├── test_append.py
        │   │   ├── test_change_data.py
        │   │   ├── test_delete.py
        │   │   ├── test_insert.py
        │   │   ├── test_read.py
        │   │   └── test_utils.py
        │   ├── schema
        │   │   ├── __init__.py
        │   │   ├── conftest.py
        │   │   ├── test_arrow.py
        │   │   ├── test_field_ids.py
        │   │   ├── test_substrait.py
        │   │   └── types
        │   │       ├── test_files.py
        │   │       └── test_tf_features.py
        │   ├── test_random_access.py
        │   ├── test_runners.py
        │   ├── test_storage.py
        │   ├── test_views.py
        │   └── utils
        │       ├── test_paths.py
        │       ├── test_protos.py
        │       └── test_uuids.py
        └── ray
            └── test_runners.py
/.github/workflows/python-ci.yml:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | name: Python CI
16 |
17 | on:
18 | push:
19 | branches:
20 | - 'main'
21 | pull_request:
22 |
23 | jobs:
24 | build:
25 | runs-on: ubuntu-latest
26 | strategy:
27 | matrix:
28 | python-version: ["3.9", "3.10", "3.11"]
29 | steps:
30 | - uses: actions/checkout@v4
31 | - name: Set up Python ${{ matrix.python-version }}
32 | uses: actions/setup-python@v3
33 | with:
34 | python-version: ${{ matrix.python-version }}
35 | - name: Install test dependencies
36 | run: |
37 | python -m pip install --upgrade pip
38 | pip install mypy pylint pytest pytest-xdist mock
39 | - name: Install runtime dependencies and Space
40 | working-directory: ./python
41 | run: |
42 | pip install .[dev]
43 | - name: Analysing code with pylint
44 | working-directory: ./python/src
45 | run: |
46 | pylint space
47 | - name: Analysing test code with pylint
48 | working-directory: ./python
49 | run: |
50 | pylint tests/**/* \
51 | --disable="missing-module-docstring,missing-function-docstring,\
52 | missing-class-docstring,duplicate-code,\
53 | redefined-outer-name,too-many-arguments"
54 | - name: Checking type with mypy
55 | working-directory: ./python/src
56 | run: |
57 | mypy -p space
58 | - name: Running tests
59 | working-directory: ./python
60 | run: |
61 | pytest -n auto
62 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .cache
3 |
4 | # Python build
5 | __pycache__/
6 | *.egg-info/
7 | build/
8 | dist/
9 | .pytype/
10 | out/
11 | .mypy_cache/
12 | .pytest_cache/
13 |
14 | # VSCode files
15 | .vscode/
16 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "substrait"]
2 | path = substrait
3 | url = https://github.com/substrait-io/substrait.git
4 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to contribute
2 |
3 | We'd love to accept your patches and contributions to this project.
4 |
5 | ## Before you begin
6 |
7 | ### Sign our Contributor License Agreement
8 |
9 | Contributions to this project must be accompanied by a
10 | [Contributor License Agreement](https://cla.developers.google.com/about) (CLA).
11 | You (or your employer) retain the copyright to your contribution; this simply
12 | gives us permission to use and redistribute your contributions as part of the
13 | project.
14 |
15 | If you or your current employer have already signed the Google CLA (even if it
16 | was for a different project), you probably don't need to do it again.
17 |
18 | Visit <https://cla.developers.google.com/> to see your current agreements or
19 | to sign a new one.
20 |
21 | ### Review our community guidelines
22 |
23 | This project follows
24 | [Google's Open Source Community Guidelines](https://opensource.google/conduct/).
25 |
26 | ## Contribution process
27 |
28 | ### Code reviews
29 |
30 | All submissions, including submissions by project members, require review. We
31 | use GitHub pull requests for this purpose. Consult
32 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
33 | information on using pull requests.
34 |
--------------------------------------------------------------------------------
/docs/design.md:
--------------------------------------------------------------------------------
1 | ## Space Storage Design
2 |
3 | ### Data Files
4 |
 5 | Space has a hybrid column/row oriented storage layer. For each row of data, Space stores bulky unstructured fields in a random-access, row-oriented format (record fields), and stores their addresses (pairs of file path and row ID) together with the other fields in columnar files (index fields). By decoupling unstructured data and processing only addresses, it can efficiently support OLAP/columnar-style data operations, e.g., sort and join. Its APIs automatically read data from the addresses when needed, e.g., when feeding data into training frameworks.
6 |
7 |
8 |
9 | Space supports the following file formats:
10 |
11 | - [Parquet](https://parquet.apache.org/) for storing columnar data.
12 | - [ArrayRecord](https://github.com/google/array_record), a high-performance random-access row format for ML training. ArrayRecord is the [successor format](https://www.tensorflow.org/datasets/tfless_tfds) to [TFRecord](https://www.tensorflow.org/tutorials/load_data/tfrecord) in [TensorFlow Datasets](https://www.tensorflow.org/datasets).
13 |
14 | We expect to support more file formats in the future (e.g., [TFRecord](https://www.tensorflow.org/tutorials/load_data/tfrecord), [Lance](https://github.com/lancedb/lance)).
15 |
16 | ## Metadata Design
17 |
18 | ### Open Table Format
19 |
20 | Data warehouse/lake features are powered by a simple, copy-on-write open table format. Its metadata is stored in [Protobuf](https://protobuf.dev/) and Parquet files. The metadata Parquet files (aka manifest files) store information about data files, i.e., file path, storage statistics, and column statistics (min/max). One row represents one data file. There are two types of manifest files, for index and record fields respectively.
21 |
22 | Users can query the manifest files as Arrow tables to get insights into the storage (method `index_manifest`), as sketched below. See more details in the [Segment Anything example](/notebooks/segment_anything_tutorial.ipynb).
23 |
24 | ### Relative Paths
25 |
26 | Space uses **relative file paths** everywhere in metadata files, which gives datasets superior portability. A Space dataset stored in Cloud Storage can be mapped to local files using [FUSE](https://en.wikipedia.org/wiki/Filesystem_in_Userspace), and it is immediately usable after being downloaded or moved. This makes it well suited for incrementally publishing or sharing datasets.
27 |
--------------------------------------------------------------------------------
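
A minimal sketch of the manifest query described above, assuming the `index_manifest` method is exposed on the local runner and using a hypothetical dataset location; see the Segment Anything example for the authoritative usage.

```py
from space import Dataset

ds = Dataset.load("/space/datasets/images")  # hypothetical location

# Each manifest row describes one index (Parquet) file: its path, row count,
# file size, and per-column min/max statistics.
manifest = ds.local().index_manifest()  # assumed entry point for the method
print(manifest)
```
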
/docs/performance.md:
--------------------------------------------------------------------------------
1 | ## Cluster Setup and Performance Tuning
2 |
 3 | Data operations in Space can run in a distributed manner on a Ray cluster. Ray nodes access the same Space dataset files via Cloud Storage or distributed file systems.
4 |
5 | ## Setup
6 |
7 | ### Cloud Storage
8 |
9 | Setup [GCS FUSE](https://cloud.google.com/storage/docs/gcs-fuse) to use files on Google Cloud Storage (GCS) (or [S3](https://github.com/s3fs-fuse/s3fs-fuse), [Azure](https://github.com/Azure/azure-storage-fuse)):
10 |
11 | ```bash
12 | gcsfuse "<bucket_name>" "/path/to/mount"
13 | ```
14 |
15 | Space has not yet implemented Cloud Storage file systems natively; FUSE is currently the suggested approach.
16 |
17 | ### Cluster Setup
18 |
19 | On the Ray cluster head/worker nodes:
20 | ```bash
21 | # Start a Ray head node (IP 123.45.67.89, for example).
22 | # See https://docs.ray.io/en/latest/ray-core/starting-ray.html for details.
23 | ray start --head --port=6379
24 | ```
25 |
26 | Using [Cloud Storage + FUSE](#cloud-storage) is required in distributed mode, because the Ray cluster and the client machine must operate on the same directory of files. The mapped local directory paths **must be the same** on all machines.
27 |
28 | Run the following code on the client machine to connect to the Ray cluster:
29 | ```py
30 | import ray
31 |
32 | # Connect to the Ray cluster.
33 | ray.init(address="ray://123.45.67.89:10001")
34 | ```
35 |
36 | ## Configure Space Options
37 |
38 | Create a Ray runner linking to a Space dataset or view to run operations in the Ray cluster. Use options to tune the performance.
39 |
40 | ### Data Ingestion
41 |
42 | The [WebDataset ingestion example](/notebooks/webdataset_ingestion.ipynb) describes the setup in detail. The options to tune include:
43 |
44 | - `max_parallelism`: the ingestion workload runs in parallel on Ray nodes, capped by this parallelism.
45 |
46 | - `array_record_options`: set the [options of the ArrayRecord lib](https://github.com/google/array_record/blob/2ac1d904f6be31e5aa2f09549774af65d84bff5a/cpp/array_record_writer.h#L83); group size is the number of records serialized together in one chunk. A smaller value improves random access latency, while a larger value is preferred on Cloud Storage, which performs better for batch reads. A larger group size also reduces the ArrayRecord file size.
47 |
48 | ```py
49 | # `ds_or_view` is a Space dataset or (materialized) view.
50 | runner = ds_or_view.ray(
51 | ray_options=RayOptions(max_parallelism=4),
52 | file_options=FileOptions(
53 | array_record_options=ArrayRecordOptions(options="group_size:64")
54 | ))
55 | ```
56 |
57 | ### Data Read
58 |
59 | Data read in Ray runner has the following steps:
60 |
61 | - Obtain a list of index files to read, based on the filter and version. If a read `batch size` is provided, each file is further split into row ranges. Each row range becomes a [Ray data block](https://docs.ray.io/en/latest/data/api/doc/ray.data.block.Block.html).
62 |
63 | - When reading a block, first read the index file as an Arrow table. If there are record fields, read these fields from ArrayRecord files.
64 |
65 | The options to tune include:
66 |
67 | - `max_parallelism`: Ray read parallelism; controls the `parallelism` of [Datasource.get_read_tasks](https://docs.ray.io/en/latest/data/api/doc/ray.data.Datasource.get_read_tasks.html#ray.data.Datasource.get_read_tasks)
68 |
69 | - `batch_size`: a batch size that is too small produces too many Ray blocks, which hurts performance; a batch size that is too large requires reading many records from ArrayRecord files for each Ray block, which can be slow.
70 |
71 | Examples of setting read batch size in different scenarios:
72 |
73 | ```py
74 | ray_option = RayOptions(max_parallelism=4)
75 |
76 | iterator = ds.ray(ray_option).read(batch_size=64)
77 |
78 | mv.ray(ray_option).refresh(batch_size=64)
79 |
80 | ray_ds = ds.ray_dataset(ray_option, ReadOptions(batch_size=64))
81 | ```
82 |
83 | #### Read Data for Training
84 |
85 | Users can choose to store data fields in **Parquet** or **ArrayRecord** files (record fields). Space performance is similar to other Parquet based datasets when all fields are in Parquet.
86 |
87 | The ArrayRecord reader uses random access reads at the granularity of `group size` records. Random access reads perform well on local or high-performance distributed file systems attached to the reader node (e.g., training VMs). However, read performance degrades drastically on Cloud Storage, because too many read RPCs are issued (e.g., one per record). Tips for **Cloud Storage**:
88 |
89 | - For quasi-sequential reads, use a larger `group size` when ingesting data into Space. It can effectively improve read throughput by reducing the number of RPCs, but it helps only when adjacent records are read together.
90 |
91 | - For fully randomized reads (the order of record reads is shuffled), the Space dataset files should first be cached in a high-performance file system.
92 |
93 | Training with random access reads has the benefits of lightweight global shuffling, deterministic training, and checkpointable training state. See the [Grain](https://github.com/google/grain) framework for more details. Integration with Grain is a TODO.
94 |
--------------------------------------------------------------------------------
/docs/pics/hybrid_storage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/space/a97f09132bb716a4038ee686e0de3a68fb9d6b3b/docs/pics/hybrid_storage.png
--------------------------------------------------------------------------------
/docs/pics/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/space/a97f09132bb716a4038ee686e0de3a68fb9d6b3b/docs/pics/overview.png
--------------------------------------------------------------------------------
/notebooks/incremental_embedding_index.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Incrementally build embedding vector indexes\n",
8 | "\n",
9 | "Space's transform and materialized view are powerful tools to incrementally process changing data. It is useful in LLM applications for incrementally generating vector embedding indexes for data in any format (text, audio, images, and videos). The vector indexes can be further used for vector search and Retrieval-Augmented Generation (RAG) in LLMs.\n",
10 | "\n",
11 | "First create a simple dataset containing input texts."
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "import pyarrow as pa\n",
21 | "from space import Dataset\n",
22 | "\n",
23 | "schema = pa.schema([(\"id\", pa.string()), (\"text\", pa.string())])\n",
24 | "\n",
25 | "text_ds = Dataset.create(\"/space/datasets/text_db\", schema,\n",
26 | " primary_keys=[\"id\"], record_fields=[])"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "Create a materialized view that builds embedding indexes:"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "from typing import Any, Dict\n",
43 | "\n",
44 | "# Example of a local embedder.\n",
45 | "# pip install spacy\n",
46 | "# python -m spacy download en_core_web_sm\n",
47 | "from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings\n",
48 | "\n",
49 | "# Example of a Cloud embedder.\n",
50 | "# pip install google-cloud-aiplatform\n",
51 | "# from langchain_community.embeddings import VertexAIEmbeddings\n",
52 | "\n",
53 | "\n",
54 | "def build_embeddings(data: Dict[str, Any]) -> Dict[str, Any]:\n",
55 | " return {\n",
56 | " \"id\": data[\"id\"],\n",
57 | " # Or, VertexAIEmbeddings()\n",
58 | " \"embeddings\": SpacyEmbeddings().embed_documents(data[\"text\"])\n",
59 | " }\n",
60 | "\n",
61 | "\n",
62 | "embeddings_view = text_ds.map_batches(\n",
63 | " fn=build_embeddings,\n",
64 | " output_schema=pa.schema([\n",
65 | " (\"id\", pa.string()),\n",
66 | " (\"embeddings\", pa.list_(pa.float64())) # output embeddings\n",
67 | " ]),\n",
68 | " # This example stores embeddings in Parquet files; we can also serialize\n",
69 | " # embeddings to bytes, and store them in ArrayRecord files.\n",
70 | " output_record_fields=[])\n",
71 | "\n",
72 | "embeddings_mv = embeddings_view.materialize(\"/space/datasets/embeddings_mv\")"
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "metadata": {},
78 | "source": [
79 | "Add data into the source dataset, and refresh the MV to build indexes."
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {},
86 | "outputs": [],
87 | "source": [
88 | "text_ds.local().append({\n",
89 | " \"id\": [\"record_1\", \"record_2\"],\n",
90 | " \"text\": [\"This is a test string\", \"This is not a string\"],\n",
91 | "})\n",
92 | "\n",
93 | "embeddings_mv.ray().refresh()\n",
94 | "\n",
95 | "# Check the embeddings.\n",
96 | "print(embeddings_mv.local().read_all())"
97 | ]
98 | },
99 | {
100 | "cell_type": "markdown",
101 | "metadata": {},
102 | "source": [
103 | "Update the source text dataset, and refresh the embeddings."
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "text_ds.local().upsert({\n",
113 | " \"id\": [\"record_1\", \"record_3\"],\n",
114 | " \"text\": [\n",
115 | " \"This is the modified 1st test string\", # Override `record_1`\n",
116 | " \"The 3rd string\"],\n",
117 | "})\n",
118 | "\n",
119 | "embeddings_mv.ray().refresh()"
120 | ]
121 | },
122 | {
123 | "cell_type": "markdown",
124 | "metadata": {},
125 | "source": [
126 | "Use the embedding indexes in a vector DB:"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "metadata": {},
133 | "outputs": [],
134 | "source": [
135 | "# pip install faiss-cpu\n",
136 | "from langchain_community.vectorstores import FAISS\n",
137 | "\n",
138 | "# Convert the embeddings to (id, embeddings) pairs.\n",
139 | "embeddings = map(\n",
140 | " lambda row: (row[\"id\"], row[\"embeddings\"]),\n",
141 | " embeddings_mv.local().read_all().to_pylist())\n",
142 | "\n",
143 | "db = FAISS.from_embeddings(text_embeddings=embeddings,\n",
144 | " embedding=SpacyEmbeddings())\n",
145 | "\n",
146 | "db.similarity_search(\"3rd string\")"
147 | ]
148 | }
149 | ],
150 | "metadata": {
151 | "language_info": {
152 | "name": "python"
153 | }
154 | },
155 | "nbformat": 4,
156 | "nbformat_minor": 2
157 | }
158 |
--------------------------------------------------------------------------------
/notebooks/pics/space_segment_anything_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/space/a97f09132bb716a4038ee686e0de3a68fb9d6b3b/notebooks/pics/space_segment_anything_example.png
--------------------------------------------------------------------------------
/python/README.md:
--------------------------------------------------------------------------------
1 | # Space: Unified Storage for Machine Learning
2 |
3 | Unify data in your entire machine learning lifecycle with **Space**, a comprehensive storage solution that seamlessly handles data from ingestion to training.
4 |
5 | **Key Features:**
6 | - **Ground Truth Database**
 7 |   - Store and manage multimodal data in open source file formats, row or columnar, locally or in the cloud.
8 | - Ingest from various sources, including ML datasets, files, and labeling tools.
9 | - Support data manipulation (append, insert, update, delete) and version control.
10 | - **OLAP Database and Lakehouse**
11 | - [Iceberg](https://github.com/apache/iceberg) style [open table format](/docs/design.md#metadata-design).
12 |   - Optimized for unstructured data via [reference](/docs/design.md#data-files) operations.
13 | - Quickly analyze data using SQL engines like [DuckDB](https://github.com/duckdb/duckdb).
14 | - **Distributed Data Processing Pipelines**
15 | - Integrate with processing frameworks like [Ray](https://github.com/ray-project/ray) for efficient data transformation.
16 |   - Store processed results as Materialized Views (MVs); incrementally update MVs when their sources change.
17 | - **Seamless Training Framework Integration**
18 | - Access Space datasets and MVs directly via random access interfaces.
19 | - Convert to popular ML dataset formats (e.g., [TFDS](https://github.com/tensorflow/datasets), [HuggingFace](https://github.com/huggingface/datasets), [Ray](https://github.com/ray-project/ray)).
20 |
--------------------------------------------------------------------------------
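
A minimal quick-start sketch tying the features above together, based on the `Dataset` and local runner APIs shown elsewhere in this repository; the location, schema, and field names are illustrative.

```py
import pyarrow as pa
from space import Dataset

# "image" is a bulky field stored in row-oriented ArrayRecord files; the other
# fields are stored in columnar Parquet files.
schema = pa.schema([("id", pa.int64()),
                    ("label", pa.string()),
                    ("image", pa.binary())])

ds = Dataset.create("/space/datasets/demo",  # hypothetical location
                    schema,
                    primary_keys=["id"],
                    record_fields=["image"])

# Append a batch locally and read it back.
runner = ds.local()
runner.append({
    "id": [1, 2],
    "label": ["cat", "dog"],
    "image": [b"bytes1", b"bytes2"],
})
print(runner.read_all())
```
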
/python/build_proto.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Copyright 2023 Google LLC
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | # https://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | set -e
18 |
19 | PY_FOLDER=`pwd`
20 | SRC_FOLDER="${PY_FOLDER}/src"
21 |
22 | # Build Substrait protos.
23 | cd "${PY_FOLDER}/../substrait/proto"
24 | protoc --python_out="${SRC_FOLDER}" \
25 | --mypy_out="${SRC_FOLDER}" \
26 | substrait/*.proto substrait/extensions/*.proto \
27 | --proto_path=.
28 |
29 | # Build Space protos.
30 | cd "${SRC_FOLDER}"
31 | protoc --python_out=. \
32 | --mypy_out=. \
33 | space/core/proto/*.proto \
34 | --proto_path=. \
35 | --proto_path=../../substrait/proto
36 |
--------------------------------------------------------------------------------
/python/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "space-datasets"
3 | version = "0.0.11"
4 | authors = [{ name = "Space team", email = "no-reply@google.com" }]
5 | description = "Unified storage framework for machine learning datasets"
6 | readme = "README.md"
7 | license = { text = "Apache-2.0" }
8 | classifiers = [
9 | "License :: OSI Approved :: Apache Software License",
10 | "Operating System :: OS Independent",
11 | "Programming Language :: Python :: 3.9",
12 | "Programming Language :: Python :: 3.10",
13 | "Programming Language :: Python :: 3.11",
14 | ]
15 | requires-python = ">=3.9"
16 | dependencies = [
17 | "absl-py",
18 | "array-record",
19 | "cloudpickle",
20 | "numpy",
21 | "protobuf",
22 | "pyarrow >= 14.0.0",
23 | "pyroaring",
24 | "tensorflow_datasets",
25 | "typing_extensions",
26 | ]
27 |
28 | [project.optional-dependencies]
29 | dev = [
30 | "pandas == 2.1.4",
31 | "pyarrow-stubs",
32 | "ray == 2.9.1",
33 | "tensorflow",
34 | "types-protobuf",
35 | ]
36 |
37 | [project.urls]
38 | Homepage = "https://github.com/google/space"
39 | Issues = "https://github.com/google/space/issues"
40 |
41 | [build-system]
42 | requires = ["setuptools"]
43 | build-backend = "setuptools.build_meta"
44 |
45 | [tool.pytest.ini_options]
46 | addopts = ["--import-mode=importlib"]
47 | pythonpath = ["src"]
48 |
49 | [tool.pylint.format]
50 | max-line-length = 80
51 | indent-string = " "
52 | disable = [
53 | "duplicate-code",
54 | "fixme",
55 | "no-else-return",
56 | "too-few-public-methods",
57 | "too-many-instance-attributes",
58 | "too-many-locals"
59 | ]
60 |
61 | [tool.pylint.MAIN]
62 | ignore = "space/core/proto"
63 | ignored-modules = [
64 | "array_record",
65 | "datasets",
66 | "google.protobuf",
67 | "pyroaring",
68 | "space.core.proto",
69 | "substrait"
70 | ]
71 |
--------------------------------------------------------------------------------
/python/src/space/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Space is a storage framework for ML datasets."""
16 |
17 | from space.catalogs.base import DatasetInfo
18 | from space.catalogs.directory import DirCatalog
19 | from space.core.datasets import Dataset
20 | from space.core.options import (ArrayRecordOptions, FileOptions, JoinOptions,
21 | ParquetWriterOptions, Range, ReadOptions)
22 | from space.core.runners import LocalRunner
23 | from space.core.random_access import RandomAccessDataSource
24 | from space.core.schema.types import File, TfFeatures
25 | from space.core.views import MaterializedView
26 | from space.ray.options import RayOptions
27 |
--------------------------------------------------------------------------------
/python/src/space/catalogs/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/python/src/space/catalogs/base.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Catalogs of Space datasets."""
16 |
17 | from __future__ import annotations
18 | from abc import ABC, abstractmethod
19 | from dataclasses import dataclass
20 | from typing import List, Union
21 |
22 | import pyarrow as pa
23 |
24 | from space.core.datasets import Dataset
25 | from space.core.views import MaterializedView, View
26 |
27 |
28 | class BaseCatalog(ABC):
29 | """A catalog is a container of datasets.
30 |
31 | Datasets in a catalog scope can be referenced by a dataset name uniquely.
32 | """
33 |
34 | @abstractmethod
35 | def create_dataset(self, name: str, schema: pa.Schema,
36 | primary_keys: List[str],
37 | record_fields: List[str]) -> Dataset:
38 | """Create a new empty dataset.
39 |
40 | Args:
41 | name: the dataset name.
42 | schema: the schema of the storage.
43 | primary_keys: un-enforced primary keys.
44 | record_fields: fields stored in row format (ArrayRecord).
45 | """
46 |
47 | def materialize(self, name: str, view: View):
48 | """Create a new materialized view.
49 |
50 | Args:
51 | name: the materialized view name.
52 | view: the view to be materialized.
53 | """
54 |
55 | @abstractmethod
56 | def delete_dataset(self, name: str) -> None:
57 | """Delete an existing dataset or materialized view.
58 |
59 | Args:
60 | name: the dataset name.
61 | """
62 |
63 | @abstractmethod
64 | def dataset(self, name: str) -> Union[Dataset, MaterializedView]:
65 | """Get an existing dataset or materialized view.
66 |
67 | Args:
68 | name: the dataset name.
69 | """
70 |
71 | @abstractmethod
72 | def datasets(self) -> List[DatasetInfo]:
73 | """List all datasets and materialized views in the catalog."""
74 |
75 |
76 | @dataclass
77 | class DatasetInfo:
78 | """Basic information of a dataset or materialized view."""
79 |
80 | # Dataset name.
81 | name: str
82 | # Dataset storage location.
83 | location: str
84 | # TODO: to include create time, type; it requires us to store these fields in
85 |   # entry point file to avoid opening the metadata file.
86 |
--------------------------------------------------------------------------------
/python/src/space/catalogs/directory.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Directory catalog implementation."""
16 |
17 | import os
18 |
19 | from typing import List, Union
20 | import pyarrow as pa
21 |
22 | from space.catalogs.base import BaseCatalog, DatasetInfo
23 | from space.core.datasets import Dataset
24 | import space.core.proto.metadata_pb2 as meta
25 | from space.core.storage import Storage
26 | from space.core.utils import errors, paths
27 | from space.core.views import MaterializedView, View, load_materialized_view
28 |
29 |
30 | class DirCatalog(BaseCatalog):
31 | """A directory catalog consists of datasets with location under the same
32 | directory.
33 |
34 | TODO: to build file system abstraction instead of directly using `os.path`,
35 | for extension to more file system types.
36 | """
37 |
38 | def __init__(self, location):
39 | self._location = location
40 |
41 | def create_dataset(self, name: str, schema: pa.Schema,
42 | primary_keys: List[str],
43 | record_fields: List[str]) -> Dataset:
44 | return Dataset.create(self._dataset_location(name), schema, primary_keys,
45 | record_fields)
46 |
47 | def materialize(self, name: str, view: View):
48 | return view.materialize(self._dataset_location(name))
49 |
50 | def delete_dataset(self, name: str) -> None:
51 | raise NotImplementedError("delete_dataset has not been implemented")
52 |
53 | def dataset(self, name: str) -> Union[Dataset, MaterializedView]:
54 | try:
55 | storage = Storage.load(self._dataset_location(name))
56 | except FileNotFoundError as e:
57 | raise errors.StorageNotFoundError(str(e)) from None
58 |
59 | if storage.metadata.type == meta.StorageMetadata.DATASET:
60 | return Dataset(storage)
61 | elif storage.metadata.type == meta.StorageMetadata.MATERIALIZED_VIEW:
62 | return load_materialized_view(storage)
63 |
64 | raise errors.SpaceRuntimeError(
65 | f"Storage type {storage.metadata.type} is not supported")
66 |
67 | def datasets(self) -> List[DatasetInfo]:
68 | results = []
69 | for ds_name in os.listdir(self._location):
70 | ds_location = self._dataset_location(ds_name)
71 | if os.path.isdir(ds_location) and os.path.isfile(
72 | paths.entry_point_path(ds_location)):
73 | results.append(DatasetInfo(ds_name, ds_location))
74 |
75 | return results
76 |
77 | def _dataset_location(self, name: str) -> str:
78 | return os.path.join(self._location, name)
79 |
--------------------------------------------------------------------------------
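
A short usage sketch of the `DirCatalog` above; the catalog root directory and dataset schema are illustrative.

```py
import pyarrow as pa
from space import DirCatalog

# Each sub-directory of the catalog root holds one dataset.
catalog = DirCatalog("/space/datasets")  # hypothetical root directory

events = catalog.create_dataset(
    "events",
    pa.schema([("id", pa.int64()), ("payload", pa.string())]),
    primary_keys=["id"],
    record_fields=[])

# List all datasets, then load one back by name.
print(catalog.datasets())
same_dataset = catalog.dataset("events")
```
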
/python/src/space/core/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/python/src/space/core/datasets.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Space dataset is the interface to interact with underlying storage."""
16 |
17 | from __future__ import annotations
18 | from typing import Dict, List, Optional
19 |
20 | import pyarrow as pa
21 | from substrait.algebra_pb2 import ReadRel, Rel
22 |
23 | from space.core.options import FileOptions, JoinOptions, ReadOptions
24 | from space.core.runners import LocalRunner
25 | from space.core.storage import Storage, Version
26 | from space.core.transform.plans import LogicalPlanBuilder
27 | from space.core.utils.lazy_imports_utils import ray, ray_runners # pylint: disable=unused-import
28 | from space.core.views import View
29 | from space.ray.options import RayOptions
30 |
31 |
32 | class Dataset(View):
33 | """Dataset is the interface to interact with Space storage."""
34 |
35 | def __init__(self, storage: Storage):
36 | self._storage = storage
37 |
38 | @property
39 | def storage(self) -> Storage:
40 | """Return storage of the dataset."""
41 | return self._storage
42 |
43 | @classmethod
44 | def create(cls, location: str, schema: pa.Schema, primary_keys: List[str],
45 | record_fields: List[str]) -> Dataset:
46 | """Create a new empty dataset.
47 |
48 | Args:
49 | location: the directory path to the storage.
50 | schema: the schema of the storage.
51 | primary_keys: un-enforced primary keys.
52 | record_fields: fields stored in row format (ArrayRecord).
53 | """
54 | return Dataset(Storage.create(location, schema, primary_keys,
55 | record_fields))
56 |
57 | @classmethod
58 | def load(cls, location: str) -> Dataset:
59 | """Load an existing dataset from the given location."""
60 | return Dataset(Storage.load(location))
61 |
62 | @property
63 | def schema(self) -> pa.Schema:
64 | """Return the dataset schema."""
65 | return self._storage.logical_schema
66 |
67 | @property
68 | def primary_keys(self) -> List[str]:
69 | return self._storage.primary_keys
70 |
71 | @property
72 | def record_fields(self) -> List[str]:
73 | return self._storage.record_fields
74 |
75 | def add_tag(self, tag: str, snapshot_id: Optional[int] = None):
76 | """Add tag to a dataset."""
77 | self._storage.add_tag(tag, snapshot_id)
78 |
79 | def remove_tag(self, tag: str):
80 | """Remove tag from a dataset."""
81 | self._storage.remove_tag(tag)
82 |
83 | def add_branch(self, branch: str):
84 | """Add branch to a dataset."""
85 | self._storage.add_branch(branch)
86 |
87 | def remove_branch(self, branch: str):
88 | """Remove branch for a dataset."""
89 | self._storage.remove_branch(branch)
90 |
91 | def set_current_branch(self, branch: str):
92 | """Set current branch for the dataset."""
93 | self._storage.set_current_branch(branch)
94 |
95 | def local(self, file_options: Optional[FileOptions] = None) -> LocalRunner:
96 | """Get a runner that runs operations locally."""
97 | return LocalRunner(self._storage, file_options)
98 |
99 | def index_files(self, version: Optional[Version] = None) -> List[str]:
100 | """A list of full path of index files."""
101 | snapshot_id = (None if version is None else
102 | self._storage.version_to_snapshot_id(version))
103 | data_files = self._storage.data_files(snapshot_id=snapshot_id)
104 | return [self._storage.full_path(f.path) for f in data_files.index_files]
105 |
106 | def versions(self) -> pa.Table:
107 | """Return a table of versions (snapshot, tag, branch) in the storage."""
108 | return self._storage.versions()
109 |
110 | @property
111 | def sources(self) -> Dict[str, Dataset]:
112 | return {self._storage.location: self}
113 |
114 | def to_relation(self, builder: LogicalPlanBuilder) -> Rel:
115 | # TODO: using location as table name is a limitation, because the location
116 | # could be mapped from Cloud Storage. The solution is external catalog
117 | # service integration, and using a unique identifier registered in the
118 | # catalog instead.
119 | location = self._storage.location
120 | return Rel(read=ReadRel(named_table=ReadRel.NamedTable(names=[location]),
121 | base_schema=self._storage.metadata.schema.fields))
122 |
123 | def process_source(self, data: ray.data.Dataset) -> ray.data.Dataset:
124 | # Dataset is the source, there is no transform, so simply return the data.
125 | return data
126 |
127 | def _ray_dataset(self, ray_options: RayOptions, read_options: ReadOptions,
128 | join_options: JoinOptions) -> ray.data.Dataset:
129 | """Return a Ray dataset for a Space dataset."""
130 | return self._storage.ray_dataset(ray_options, read_options)
131 |
132 | def ray(
133 | self,
134 | ray_options: Optional[RayOptions] = None,
135 | file_options: Optional[FileOptions] = None
136 | ) -> ray_runners.RayReadWriterRunner:
137 | """Get a Ray runner."""
138 | return ray_runners.RayReadWriterRunner(self, ray_options, file_options)
139 |
--------------------------------------------------------------------------------
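
A short sketch of the version management methods on the `Dataset` class above; the dataset location and version names are illustrative, and tagging without a snapshot ID is assumed to apply to the current snapshot.

```py
from space import Dataset

ds = Dataset.load("/space/datasets/demo")  # hypothetical location

# Tag a snapshot so it can be referenced by name later.
ds.add_tag("v1.0")

# Create a branch and switch the dataset to it.
ds.add_branch("experimental")
ds.set_current_branch("experimental")

# Inspect versions (snapshots, tags, branches) and the index file paths.
print(ds.versions())
print(ds.index_files())
```
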
/python/src/space/core/fs/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/python/src/space/core/fs/array_record.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """ArrayRecord file utilities."""
16 |
17 | from typing import List, Optional
18 |
19 | from space.core.utils.lazy_imports_utils import array_record_module as ar
20 |
21 |
22 | def read_record_file(file_path: str,
23 | positions: Optional[List[int]] = None) -> List[bytes]:
24 | """Read records of an ArrayRecord file.
25 |
26 | Args:
27 | file_path: full file path.
28 |     positions: the positions inside the file of the records to read.
29 |
30 | """
31 | record_reader = ar.ArrayRecordReader(file_path)
32 | if positions is not None:
33 | records = record_reader.read(positions)
34 | else:
35 | records = record_reader.read_all()
36 |
37 | record_reader.close()
38 | return records
39 |
--------------------------------------------------------------------------------
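
A tiny usage sketch of `read_record_file` above; the file path is illustrative.

```py
from space.core.fs.array_record import read_record_file

# Read every record in the file, then only the records at two positions.
all_records = read_record_file("/data/shard-00000.array_record")
some_records = read_record_file("/data/shard-00000.array_record",
                                positions=[0, 42])
```
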
/python/src/space/core/fs/arrow.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Arrow file system implementation."""
16 |
17 | from abc import abstractmethod
18 |
19 | from google.protobuf import text_format
20 | from pyarrow import fs
21 |
22 | from space.core.fs.base import BaseFileSystem, ProtoT
23 | from space.core.utils import errors
24 | from space.core.utils.protos import proto_to_text
25 | from space.core.utils.uuids import random_id
26 |
27 |
28 | class ArrowFileSystem(BaseFileSystem):
29 | """Abstract Arrow file system."""
30 |
31 | def __init__(self):
32 | super().__init__()
33 | self._fs = self.create_fs()
34 |
35 | @abstractmethod
36 | def create_fs(self) -> fs.FileSystem:
37 | """Create a new underlying Arrow file system."""
38 |
39 | def create_dir(self, dir_path: str) -> None:
40 | self._fs.create_dir(dir_path)
41 |
42 | def write_proto(self,
43 | file_path: str,
44 | msg: ProtoT,
45 | fail_if_exists: bool = False) -> None:
46 |     # TODO: this check is not atomic; a file-system-specific implementation
47 |     # is needed.
48 | if fail_if_exists and self._fs.get_file_info(
49 | file_path).type != fs.FileType.NotFound:
50 | raise errors.FileExistError(f"File {file_path} already exists")
51 |
52 | tmp_file_path = f"{file_path}.{random_id()}.tmp"
53 |
54 | with self._fs.open_output_stream(tmp_file_path) as f:
55 | f.write(proto_to_text(msg))
56 |
57 | self._fs.move(tmp_file_path, file_path)
58 |
59 | def read_proto(self, file_path: str, empty_msg: ProtoT) -> ProtoT:
60 | with self._fs.open_input_file(file_path) as f:
61 | result = text_format.Parse(f.readall(), empty_msg)
62 | return result
63 |
64 |
65 | class ArrowLocalFileSystem(ArrowFileSystem):
66 | """Arrow local file system implementation."""
67 |
68 | def create_fs(self) -> fs.FileSystem:
69 | return fs.LocalFileSystem()
70 |
--------------------------------------------------------------------------------
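
A short sketch of the text-format proto round trip implemented above, using `ArrowLocalFileSystem` with a `StorageMetadata` message; the directory and file names are illustrative.

```py
import space.core.proto.metadata_pb2 as meta
from space.core.fs.arrow import ArrowLocalFileSystem

local_fs = ArrowLocalFileSystem()
local_fs.create_dir("/tmp/space_fs_demo")

# write_proto stages the message in a temporary file and then moves it into
# place; fail_if_exists=True refuses to overwrite an existing file.
metadata = meta.StorageMetadata()
local_fs.write_proto("/tmp/space_fs_demo/metadata.txtpb", metadata,
                     fail_if_exists=True)

# read_proto parses the text-format message into the provided empty instance.
loaded = local_fs.read_proto("/tmp/space_fs_demo/metadata.txtpb",
                             meta.StorageMetadata())
```
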
/python/src/space/core/fs/base.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Abstract base file system."""
16 |
17 | from abc import ABC, abstractmethod
18 | from typing import TypeVar
19 |
20 | from google.protobuf import message
21 |
22 | ProtoT = TypeVar("ProtoT", bound=message.Message)
23 |
24 |
25 | class BaseFileSystem(ABC):
26 | """Abstract file system."""
27 |
28 | @abstractmethod
29 | def create_dir(self, dir_path: str) -> None:
30 | """Create a new directory."""
31 |
32 | @abstractmethod
33 | def write_proto(self,
34 | file_path: str,
35 | msg: ProtoT,
36 | fail_if_exists: bool = False) -> None:
37 | """Write a proto message in text format to a file.
38 |
39 | Args:
40 | file_path: full path of the file to write to
41 | msg: the proto message to write
42 | fail_if_exists: if true, fail when the file already exists; otherwise
43 | truncate the file
44 | """
45 |
46 | @abstractmethod
47 | def read_proto(self, file_path: str, empty_msg: ProtoT) -> ProtoT:
48 | """Read a proto message in text format from a file."""
49 |
--------------------------------------------------------------------------------
/python/src/space/core/fs/factory.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """File system factory implementation."""
16 |
17 | from space.core.fs.arrow import ArrowLocalFileSystem
18 | from space.core.fs.base import BaseFileSystem
19 |
20 |
21 | def create_fs(path: str) -> BaseFileSystem: # pylint: disable=unused-argument
22 | """Create a file system based on the path."""
23 | # TODO: to support more file systems.
24 | return ArrowLocalFileSystem()
25 |
--------------------------------------------------------------------------------
/python/src/space/core/fs/parquet.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Parquet file utilities."""
16 |
17 | from typing import List
18 |
19 | import pyarrow as pa
20 | import pyarrow.parquet as pq
21 |
22 |
23 | def write_parquet_file(file_path: str, schema: pa.Schema,
24 | data: List[pa.Table]) -> pq.FileMetaData:
25 | """Materialize a single Parquet file."""
26 | # TODO: currently assume this file is small, so always write a single file.
27 | writer = pq.ParquetWriter(file_path, schema)
28 | for batch in data:
29 | writer.write_table(batch)
30 |
31 | writer.close()
32 | return writer.writer.metadata
33 |
--------------------------------------------------------------------------------
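
A tiny usage sketch of `write_parquet_file` above; the output path and schema are illustrative.

```py
import pyarrow as pa
from space.core.fs.parquet import write_parquet_file

schema = pa.schema([("id", pa.int64()), ("value", pa.float64())])
table = pa.table({"id": [1, 2, 3], "value": [0.1, 0.2, 0.3]}, schema=schema)

# Writes all input tables into a single Parquet file and returns the file
# metadata, which carries row counts and column statistics.
file_metadata = write_parquet_file("/tmp/example.parquet", schema, [table])
print(file_metadata.num_rows)
```
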
/python/src/space/core/jobs.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Jobs of Space operations."""
16 |
17 | from typing import Optional
18 |
19 | from dataclasses import dataclass
20 | from enum import Enum
21 |
22 | import space.core.proto.metadata_pb2 as meta
23 |
24 |
25 | @dataclass
26 | class JobResult:
27 | """The result of a job."""
28 |
29 | class State(Enum):
30 | """The job state."""
31 |     # The job has succeeded.
32 | SUCCEEDED = 1
33 | # The job has failed.
34 | FAILED = 2
35 | # The job is a no-op.
36 | SKIPPED = 3
37 |
38 | # The job state.
39 | state: State
40 |
41 | # The update to storage statistics as the result of the job.
42 | storage_statistics_update: Optional[meta.StorageStatistics] = None
43 |
44 | # Error message if the job failed.
45 | error_message: Optional[str] = None
46 |
--------------------------------------------------------------------------------
/python/src/space/core/loaders/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/python/src/space/core/loaders/array_record.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Load ArrayRecord files into Space datasets."""
16 |
17 | from typing import Any, Callable, Dict, List, Optional, Tuple
18 | import glob
19 |
20 | import pyarrow as pa
21 | from typing_extensions import TypeAlias
22 |
23 | from space.core.fs.array_record import read_record_file
24 | from space.core.proto import metadata_pb2 as meta
25 | from space.core.proto import runtime_pb2 as rt
26 | from space.core.ops import utils
27 | from space.core.ops.append import LocalAppendOp
28 | from space.core.options import FileOptions
29 | from space.core.schema import arrow
30 | from space.core.serializers import DictSerializer
31 | from space.core.utils.paths import StoragePathsMixin
32 |
33 | ArrayRecordIndexFn: TypeAlias = Callable[[Dict[str, Any]], Dict[str, Any]]
34 |
35 |
36 | class LocalArrayRecordLoadOp(StoragePathsMixin):
37 | """Load ArrayRecord files into Space without copying data."""
38 |
39 | # pylint: disable=too-many-arguments
40 | def __init__(self, location: str, metadata: meta.StorageMetadata,
41 | pattern: str, index_fn: ArrayRecordIndexFn,
42 | file_options: FileOptions):
43 | """
44 | Args:
45 | pattern: file path pattern of the input ArrayRecord files, e.g.,
46 | "/directory/*.array_record"
47 | """
48 | StoragePathsMixin.__init__(self, location)
49 | self._file_options = file_options
50 |
51 | self._metadata = metadata
52 | self._index_fn = index_fn
53 |
54 | record_fields = set(self._metadata.schema.record_fields)
55 | logical_schema = arrow.arrow_schema(self._metadata.schema.fields,
56 | record_fields,
57 | physical=False)
58 | self._physical_schema = arrow.logical_to_physical_schema(
59 | logical_schema, record_fields)
60 |
61 | _, self._record_fields = arrow.classify_fields(self._physical_schema,
62 | record_fields,
63 | selected_fields=None)
64 |
65 | assert len(self._record_fields) == 1, "Support only one record field"
66 | self._record_field = self._record_fields[0]
67 |
68 | self._serializer = DictSerializer.create(logical_schema)
69 | self._input_files = glob.glob(pattern)
70 |
71 | def write(self) -> Optional[rt.Patch]:
72 | """Write index files to load ArrayRecord files to Space dataset."""
73 | append_op = LocalAppendOp(self._location,
74 | self._metadata,
75 | self._file_options,
76 | record_address_input=True)
77 |
78 | total_record_bytes = 0
79 | for f in self._input_files:
80 | index_data, record_bytes = self._build_index_for_array_record(f)
81 | total_record_bytes += record_bytes
82 | append_op.write(index_data)
83 |
84 | patch = append_op.finish()
85 | if patch is not None:
86 | patch.storage_statistics_update.record_uncompressed_bytes += total_record_bytes # pylint: disable=line-too-long
87 |
88 | return patch
89 |
90 | def _build_index_for_array_record(self,
91 | file_path: str) -> Tuple[pa.Table, int]:
92 | record_field = self._record_field.name
93 | # TODO: to avoid loading all data into memory at once.
94 | serialized_records = read_record_file(file_path)
95 |
 96 |     indexes: List[Dict[str, Any]] = []
97 | record_uncompressed_bytes = 0
98 | for sr in serialized_records:
99 | record_uncompressed_bytes += len(sr)
100 | record = self._serializer.deserialize({record_field: [sr]})
101 | indxes.append(self._index_fn(record))
102 |
103 | index_data = pa.Table.from_pylist(indxes, schema=self._physical_schema)
104 | index_data = index_data.drop(record_field) # type: ignore[attr-defined]
105 | index_data = index_data.append_column(
106 | record_field,
107 | utils.address_column(file_path, start_row=0, num_rows=len(indxes)))
108 |
109 | return index_data, record_uncompressed_bytes
110 |
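Usage note: a rough sketch of driving the loader above, assuming a Space storage already exists at `location` with an int64 index field `"id"` and a single record field `"features"` (these names, paths, and the `storage` object are illustrative, not part of the source):

```python
from space.core.loaders.array_record import LocalArrayRecordLoadOp
from space.core.options import FileOptions

def index_fn(record):
  # `record` is {"features": [<deserialized numpy-like dict>]} for one row.
  features = record["features"][0]
  # Return the index field values of this row; the record field itself is
  # replaced by a file address column in _build_index_for_array_record.
  return {"id": int(features["id"])}

load_op = LocalArrayRecordLoadOp(
    location="/path/to/space_storage",   # hypothetical storage location
    metadata=storage.metadata,           # meta.StorageMetadata of that storage
    pattern="/path/to/input/*.array_record",
    index_fn=index_fn,
    file_options=FileOptions())
patch = load_op.write()  # rt.Patch to commit to the storage, or None
```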
--------------------------------------------------------------------------------
/python/src/space/core/loaders/parquet.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Load Parquet files into Space datasets."""
16 |
17 | from typing import Optional
18 | import glob
19 |
20 | import pyarrow.parquet as pq
21 |
22 | from space.core.manifests import IndexManifestWriter
23 | from space.core.proto import metadata_pb2 as meta
24 | from space.core.proto import runtime_pb2 as rt
25 | from space.core.ops import utils
26 | from space.core.schema import arrow
27 | from space.core.utils.paths import StoragePathsMixin
28 |
29 |
30 | class LocalParquetLoadOp(StoragePathsMixin):
31 | """Load ArrayRecord files into Space without copying data."""
32 |
33 | def __init__(self, location: str, metadata: meta.StorageMetadata,
34 | pattern: str):
35 | """
36 | Args:
37 | pattern: file path pattern of the input Parquet files, e.g.,
38 | "/directory/*.parquet"
39 | """
40 | StoragePathsMixin.__init__(self, location)
41 |
42 | self._metadata = metadata
43 |
44 | assert len(self._metadata.schema.record_fields) == 0
45 | self._physical_schema = arrow.arrow_schema(self._metadata.schema.fields,
46 | set(),
47 | physical=True)
48 | self._input_files = glob.glob(pattern)
49 |
50 | def write(self) -> Optional[rt.Patch]:
51 | """Write metadata files to load Parquet files to Space dataset."""
52 | index_manifest_writer = IndexManifestWriter(
53 | self._metadata_dir, self._physical_schema,
54 | self._metadata.schema.primary_keys) # type: ignore[arg-type]
55 | patch = rt.Patch()
56 |
57 | for f in self._input_files:
58 | stats = _write_index_manifest(index_manifest_writer, f)
59 | utils.update_index_storage_stats(base=patch.storage_statistics_update,
60 | update=stats)
61 |
62 | index_manifest_full_path = index_manifest_writer.finish()
63 | if index_manifest_full_path is not None:
64 | patch.addition.index_manifest_files.append(
65 | self.short_path(index_manifest_full_path))
66 |
67 | return patch
68 |
69 |
70 | def _write_index_manifest(manifest_writer: IndexManifestWriter,
71 | file_path: str) -> meta.StorageStatistics:
72 | # TODO: to verify that file schemas are compatible with dataset.
73 | metadata = pq.read_metadata(file_path)
74 | return manifest_writer.write(file_path, metadata)
75 |
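Similarly, a minimal sketch of the Parquet loader, assuming an existing index-only Space storage (no record fields); the paths and the `storage` object are illustrative:

```python
from space.core.loaders.parquet import LocalParquetLoadOp

load_op = LocalParquetLoadOp(
    location="/path/to/space_storage",   # hypothetical storage location
    metadata=storage.metadata,           # metadata of an index-only dataset
    pattern="/path/to/input/*.parquet")
patch = load_op.write()  # index manifests referencing the external Parquet files
```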
--------------------------------------------------------------------------------
/python/src/space/core/manifests/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Manifest files writer and reader implementation."""
16 |
17 | from space.core.manifests.index import IndexManifestWriter
18 | from space.core.manifests.record import RecordManifestWriter
19 |
--------------------------------------------------------------------------------
/python/src/space/core/manifests/record.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Record manifest files writer and reader implementation."""
16 |
17 | from typing import List, Optional
18 |
19 | import pyarrow as pa
20 |
21 | from space.core.fs.parquet import write_parquet_file
22 | import space.core.proto.metadata_pb2 as meta
23 | from space.core.utils import paths
24 | from space.core.schema import constants
25 |
26 |
27 | def _manifest_schema() -> pa.Schema:
28 | fields = [(constants.FILE_PATH_FIELD, pa.utf8()),
29 | (constants.FIELD_ID_FIELD, pa.int32()),
30 | (constants.NUM_ROWS_FIELD, pa.int64()),
31 | (constants.UNCOMPRESSED_BYTES_FIELD, pa.int64())]
32 | return pa.schema(fields) # type: ignore[arg-type]
33 |
34 |
35 | class RecordManifestWriter:
36 | """Writer of record manifest files."""
37 |
38 | def __init__(self, metadata_dir: str):
39 | self._metadata_dir = metadata_dir
40 | self._manifest_schema = _manifest_schema()
41 |
42 | self._file_paths: List[str] = []
43 | self._field_ids: List[int] = []
44 | self._num_rows: List[int] = []
45 | self._uncompressed_bytes: List[int] = []
46 |
47 | def write(self, file_path: str, field_id: int,
48 | storage_statistics: meta.StorageStatistics) -> None:
49 | """Write a new manifest row.
50 |
51 | Args:
52 | file_path: a relative file path of the index file.
53 | field_id: the field ID of the associated field for this ArrayRecord file.
54 | storage_statistics: storage statistics of the file.
55 | """
56 | self._file_paths.append(file_path)
57 | self._field_ids.append(field_id)
58 | self._num_rows.append(storage_statistics.num_rows)
59 | self._uncompressed_bytes.append(
60 | storage_statistics.record_uncompressed_bytes)
61 |
62 | def finish(self) -> Optional[str]:
63 | """Materialize the manifest file and return the file path."""
64 | if not self._file_paths:
65 | return None
66 |
67 | arrays = [
68 | self._file_paths, self._field_ids, self._num_rows,
69 | self._uncompressed_bytes
70 | ]
71 | manifest_data = pa.Table.from_arrays(
72 | arrays, # type: ignore[arg-type]
73 | schema=self._manifest_schema) # type: ignore[call-arg]
74 |
75 | file_path = paths.new_record_manifest_path(self._metadata_dir)
76 | write_parquet_file(file_path, self._manifest_schema, [manifest_data])
77 | return file_path
78 |
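For illustration, writing one record manifest row and materializing the manifest file might look like the sketch below (the paths and field ID are made up):

```python
import space.core.proto.metadata_pb2 as meta
from space.core.manifests import RecordManifestWriter

writer = RecordManifestWriter(metadata_dir="/path/to/storage/metadata")
writer.write(
    file_path="data/records-0.array_record",   # relative record file path
    field_id=2,                                 # field ID of the record field
    storage_statistics=meta.StorageStatistics(
        num_rows=1000, record_uncompressed_bytes=1 << 20))
manifest_path = writer.finish()  # None if no rows were written
```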
--------------------------------------------------------------------------------
/python/src/space/core/ops/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/python/src/space/core/ops/base.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Abstract base operation."""
16 |
17 | from __future__ import annotations
18 | from abc import ABC
19 | from typing import Any, Callable, Dict, Iterator, Union
20 | from typing_extensions import TypeAlias
21 |
22 | import pyarrow as pa
23 |
24 | # Input data can be either nested Py dict or Arrow table.
25 | InputData: TypeAlias = Union[Dict[str, Any], pa.Table]
26 |
27 | # A no args function that returns an iterator.
28 | InputIteratorFn: TypeAlias = Callable[[], Iterator[InputData]]
29 |
30 |
31 | class BaseOp(ABC):
32 | """Abstract base operation class."""
33 |
--------------------------------------------------------------------------------
/python/src/space/core/ops/insert.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Local insert operation implementation."""
16 |
17 | from __future__ import annotations
18 | from dataclasses import dataclass
19 | from enum import Enum
20 | from typing import List, Optional
21 |
22 | import pyarrow as pa
23 | import pyarrow.compute as pc
24 |
25 | from space.core.ops import utils
26 | from space.core.ops.append import LocalAppendOp
27 | from space.core.ops.base import BaseOp, InputData
28 | from space.core.ops.delete import FileSetDeleteOp
29 | from space.core.ops.read import FileSetReadOp
30 | from space.core.options import FileOptions, ReadOptions
31 | import space.core.proto.metadata_pb2 as meta
32 | import space.core.proto.runtime_pb2 as rt
33 | from space.core.storage import Storage
34 | from space.core.utils import errors
35 | from space.core.utils.paths import StoragePathsMixin
36 |
37 |
38 | @dataclass
39 | class InsertOptions:
40 | """Options of inserting data."""
41 |
42 | class Mode(Enum):
43 | """Mode of insert operation."""
44 | # Fail if duplicated primary key is found.
45 | INSERT = 1
46 | # Update the existing row if duplicated primary key is found.
47 | UPSERT = 2
48 |
49 | # The insert mode.
50 | mode: Mode = Mode.INSERT
51 |
52 |
53 | class BaseInsertOp(BaseOp):
54 | """Abstract base insert operation class."""
55 |
56 | def write(self, data: InputData) -> Optional[rt.Patch]:
57 | """Insert data into storage."""
58 |
59 |
60 | class LocalInsertOp(BaseInsertOp, StoragePathsMixin):
61 | """Insert data to a dataset."""
62 |
63 | def __init__(self, storage: Storage, options: InsertOptions,
64 | file_options: FileOptions):
65 | StoragePathsMixin.__init__(self, storage.location)
66 |
67 | self._storage = storage
68 | self._metadata = self._storage.metadata
69 |
70 | self._options = options or InsertOptions()
71 | self._file_options = file_options
72 |
73 | def write(self, data: InputData) -> Optional[rt.Patch]:
74 | if not isinstance(data, pa.Table):
75 | data = pa.Table.from_pydict(data)
76 |
77 | return self._write_arrow(data)
78 |
79 | def _write_arrow(self, data: pa.Table) -> Optional[rt.Patch]:
80 | if data.num_rows == 0:
81 | return None
82 |
83 | filter_ = utils.primary_key_filter(self._storage.primary_keys, data)
84 | assert filter_ is not None
85 |
86 | data_files = self._storage.data_files(filter_)
87 |
88 | mode = self._options.mode
89 | patches: List[Optional[rt.Patch]] = []
90 | if data_files.index_files:
91 | if mode == InsertOptions.Mode.INSERT:
92 | self._check_duplication(data_files, filter_)
93 | elif mode == InsertOptions.Mode.UPSERT:
94 | self._delete(filter_, data_files, patches)
95 | else:
96 | raise errors.SpaceRuntimeError(f"Insert mode {mode} not supported")
97 |
98 | self._append(data, patches)
99 | return utils.merge_patches(patches)
100 |
101 | def _check_duplication(self, data_files: rt.FileSet, filter_: pc.Expression):
102 | if filter_matched(self._location, self._metadata, data_files, filter_,
103 | self._storage.primary_keys):
104 | raise errors.SpaceRuntimeError("Primary key to insert already exist")
105 |
106 | def _delete(self, filter_: pc.Expression, data_files: rt.FileSet,
107 | patches: List[Optional[rt.Patch]]) -> None:
108 | delete_op = FileSetDeleteOp(self._location, self._metadata, data_files,
109 | filter_, self._file_options)
110 | patches.append(delete_op.delete())
111 |
112 | def _append(self, data: pa.Table, patches: List[Optional[rt.Patch]]) -> None:
113 | append_op = LocalAppendOp(self._location, self._metadata,
114 | self._file_options)
115 | append_op.write(data)
116 | patches.append(append_op.finish())
117 |
118 |
119 | def filter_matched(location: str, metadata: meta.StorageMetadata,
120 | data_files: rt.FileSet, filter_: pc.Expression,
121 | primary_keys: List[str]) -> bool:
122 | """Return True if there are data matching the provided filter."""
123 | op = FileSetReadOp(location,
124 | metadata,
125 | data_files,
126 | options=ReadOptions(filter_=filter_, fields=primary_keys))
127 |
128 | for data in iter(op):
129 | if data.num_rows > 0:
130 | # TODO: to customize the error and convert it to JobResult failed
131 | # status.
132 | return True
133 |
134 | return False
135 |
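The two modes only differ in how duplicated primary keys are handled; a brief sketch (the `storage` object, an opened `space.core.storage.Storage`, is assumed here):

```python
from space.core.ops.insert import InsertOptions, LocalInsertOp
from space.core.options import FileOptions

# INSERT fails when a primary key already exists; UPSERT deletes the old rows
# and appends the new ones.
options = InsertOptions(mode=InsertOptions.Mode.UPSERT)
insert_op = LocalInsertOp(storage, options, FileOptions())
patch = insert_op.write({"id": [1, 2], "name": ["a", "b"]})  # rt.Patch or None
```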
--------------------------------------------------------------------------------
/python/src/space/core/ops/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Utilities for operation classes."""
16 |
17 | from typing import List, Optional, Set
18 |
19 | import numpy as np
20 | import pyarrow as pa
21 | import pyarrow.compute as pc
22 |
23 | from space.core.schema import arrow
24 | from space.core.proto import metadata_pb2 as meta
25 | from space.core.proto import runtime_pb2 as rt
26 | from space.core.utils import errors
27 |
28 |
29 | def update_index_storage_stats(
30 | base: meta.StorageStatistics,
31 | update: meta.StorageStatistics,
32 | ) -> None:
33 | """Update index storage statistics."""
34 | base.num_rows += update.num_rows
35 | base.index_compressed_bytes += update.index_compressed_bytes
36 | base.index_uncompressed_bytes += update.index_uncompressed_bytes
37 |
38 |
39 | def update_record_stats_bytes(base: meta.StorageStatistics,
40 | update: meta.StorageStatistics) -> None:
41 | """Update record storage statistics."""
42 | base.record_uncompressed_bytes += update.record_uncompressed_bytes
43 |
44 |
45 | def address_column(file_path: str, start_row: int,
46 | num_rows: int) -> pa.StructArray:
47 | """Construct an record address column by a file path and row ID range."""
48 | return pa.StructArray.from_arrays(
49 | [
50 | [file_path] * num_rows, # type: ignore[arg-type]
51 | np.arange(start_row, start_row + num_rows, dtype=np.int32)
52 | ],
53 | fields=arrow.record_address_types()) # type: ignore[arg-type]
54 |
55 |
56 | def primary_key_filter(primary_keys: List[str],
57 | data: pa.Table) -> Optional[pc.Expression]:
58 | """Return a filter that match the given primary keys in the input data.
59 |
60 | Raise an error if data contain duplicated primary keys.
61 | """
62 | columns = []
63 | for key in primary_keys:
64 | columns.append(data.column(key).combine_chunks())
65 |
66 | filter_ = None
67 | filter_strs: Set[str] = set()
68 | for i_row in range(data.num_rows):
69 | row_filter = None
70 | for i_col, key in enumerate(primary_keys):
71 | new_filter = pc.field(key) == columns[i_col][i_row]
72 | if row_filter is None:
73 | row_filter = new_filter
74 | else:
75 | row_filter &= new_filter
76 |
77 | # TODO: a simple method of detecting duplicated primary keys. To find a
78 | # more efficient method.
79 | filter_str = str(row_filter)
80 | if filter_str in filter_strs:
81 | raise errors.PrimaryKeyExistError(
82 | f"Found duplicated primary key: {filter_str}")
83 |
84 | filter_strs.add(filter_str)
85 |
86 | if filter_ is None:
87 | filter_ = row_filter
88 | else:
89 | filter_ |= row_filter
90 |
91 | return filter_
92 |
93 |
94 | def merge_patches(patches: List[Optional[rt.Patch]]) -> Optional[rt.Patch]:
95 | """Merge multiple patches into one."""
96 | patch = rt.Patch()
97 | stats_update = meta.StorageStatistics()
98 |
99 | empty = True
100 | for p in patches:
101 | if p is None:
102 | continue
103 |
104 | if empty:
105 | empty = False
106 |
107 | # TODO: to manually merge patches when it gets more complex.
108 | patch.MergeFrom(p)
109 |
110 | # Update statistics.
111 | update_index_storage_stats(stats_update, p.storage_statistics_update)
112 | update_record_stats_bytes(stats_update, p.storage_statistics_update)
113 |
114 | if empty:
115 | return None
116 |
117 | patch.storage_statistics_update.CopyFrom(stats_update)
118 | return patch
119 |
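Two of the helpers above can be exercised standalone; a small sketch:

```python
import pyarrow as pa
from space.core.ops import utils

data = pa.Table.from_pydict({"id": [1, 2], "name": ["a", "b"]})
# Builds (id == 1) | (id == 2); raises PrimaryKeyExistError on duplicates.
filter_ = utils.primary_key_filter(["id"], data)

# Address struct column for 3 consecutive rows stored in one ArrayRecord file.
addresses = utils.address_column("data/records-0.array_record",
                                 start_row=0, num_rows=3)
```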
--------------------------------------------------------------------------------
/python/src/space/core/options.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Options of Space core lib."""
16 |
17 | from dataclasses import dataclass, field as dataclass_field
18 | from typing import Any, Callable, List, Optional
19 |
20 | import pyarrow.compute as pc
21 |
22 | # Default number of rows per batch in read result.
23 | DEFAULT_READ_BATCH_SIZE = 16
24 |
25 |
26 | @dataclass
27 | class ReadOptions:
28 | """Options of reading data."""
29 | # Filters on index fields.
30 | filter_: Optional[pc.Expression] = None
31 |
32 | # When specified, only read the given fields instead of all fields.
33 | fields: Optional[List[str]] = None
34 |
35 | # The snapshot to read.
36 | # TODO: to change it to version.
37 | snapshot_id: Optional[int] = None
38 |
39 | # If true, read the references (e.g., address) of read record fields instead
40 | # of values.
41 | reference_read: bool = False
42 |
43 | # The max number of rows per batch in read result.
44 | #
45 | # `None` does not enforce a batch size; data is read one row group at a
46 | # time. For large row groups, loading all record fields at once can be
47 | # expensive and slow, so choosing a proper batch size helps.
48 | #
49 | # A batch size that is too small creates too many Ray blocks in the Ray
50 | # runner and hurts performance.
51 | #
52 | # TODO: currently a batch can be smaller than batch_size (e.g., at boundary
53 | # of row groups), to enforce size to be equal to batch_size.
54 | batch_size: Optional[int] = None
55 |
56 | def __post_init__(self):
57 | self.batch_size = self.batch_size or DEFAULT_READ_BATCH_SIZE
58 |
59 |
60 | @dataclass
61 | class ParquetWriterOptions:
62 | """Options of Parquet file writer."""
63 | # Max uncompressed bytes per row group.
64 | max_uncompressed_row_group_bytes: int = 100 * 1024
65 |
66 | # Max uncompressed bytes per file.
67 | max_uncompressed_file_bytes: int = 1 * 1024 * 1024
68 |
69 |
70 | # pylint: disable=line-too-long
71 | @dataclass
72 | class ArrayRecordOptions:
73 | """Options of ArrayRecord file writer."""
74 | # Max uncompressed bytes per file.
75 | max_uncompressed_file_bytes: int = 100 * 1024 * 1024
76 |
77 | # ArrayRecord lib options.
78 | #
79 | # See https://github.com/google/array_record/blob/2ac1d904f6be31e5aa2f09549774af65d84bff5a/cpp/array_record_writer.h#L83
80 | # Default group size 1 maximizes random read performance.
81 | # It matches the options of TFDS:
82 | # https://github.com/tensorflow/datasets/blob/92ebd18102b62cf85557ba4b905c970203d8914d/tensorflow_datasets/core/sequential_writer.py#L108
83 | #
84 | # A larger group size improves read throughput from Cloud Storage, because
85 | # each RPC reads a larger chunk of data, which Cloud Storage serves more
86 | # efficiently.
87 | options: str = "group_size:1"
88 |
89 |
90 | @dataclass
91 | class FileOptions:
92 | """Options of file IO."""
93 | # Parquet file options.
94 | parquet_options: ParquetWriterOptions = dataclass_field(
95 | default_factory=ParquetWriterOptions)
96 |
97 | # ArrayRecord file options.
98 | array_record_options: ArrayRecordOptions = dataclass_field(
99 | default_factory=ArrayRecordOptions)
100 |
101 |
102 | @dataclass
103 | class Range:
104 | """A range of a field."""
105 | # Always inclusive.
106 | min_: Any
107 |
108 | # Default exclusive.
109 | max_: Any
110 |
111 | # Max is inclusive when true.
112 | include_max: bool = False
113 |
114 |
115 | @dataclass
116 | class JoinOptions:
117 | """Options of joining data."""
118 | # Partition the join key range into multiple ranges for parallel processing.
119 | partition_fn: Optional[Callable[[Range], List[Range]]] = None
120 |
121 | # TODO: to support ReadOptions for left and right views, e.g., filter_,
122 | # snapshot_id
123 | # TODO: to support join type in PyArrow, only `inner` is supported now.
124 |
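A short sketch of configuring these options (all values are illustrative):

```python
import pyarrow.compute as pc
from space.core.options import (ArrayRecordOptions, FileOptions, JoinOptions,
                                ParquetWriterOptions, Range, ReadOptions)

read_options = ReadOptions(filter_=pc.field("id") > 100,
                           fields=["id", "feature"],
                           batch_size=64)

file_options = FileOptions(
    parquet_options=ParquetWriterOptions(max_uncompressed_file_bytes=4 << 20),
    array_record_options=ArrayRecordOptions(options="group_size:64"))

def split_in_two(r: Range):
  """Partition a join key range into two halves for parallel joins."""
  mid = (r.min_ + r.max_) // 2
  return [Range(r.min_, mid), Range(mid, r.max_, include_max=r.include_max)]

join_options = JoinOptions(partition_fn=split_in_two)
```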
--------------------------------------------------------------------------------
/python/src/space/core/proto/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/python/src/space/core/proto/runtime.proto:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // https://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | //
15 | // Proto messages used by Space runtime.
16 | //
17 | // Unlike metadata.proto, the protos here are not persisted in metadata
18 | // files. We use protos instead of Python classes because they can be
19 | // serialized to bytes for cross-machine/cross-language messaging. For example,
20 | // `FileSet` is sent to a worker machine for processing, and `Patch` is sent
21 | // back to the coordinator machine to commit to storage. Pickling Python
22 | // classes may work, but it has more restrictions, especially when crossing
23 | // languages.
24 |
25 | syntax = "proto3";
26 |
27 | import "space/core/proto/metadata.proto";
28 |
29 | package space.proto;
30 |
31 | // Information of a data file.
32 | // NEXT_ID: 6
33 | message DataFile {
34 | // Data file path.
35 | string path = 1;
36 |
37 | // Storage statistics of data in the file.
38 | StorageStatistics storage_statistics = 2;
39 |
40 | // Locally assigned manifest file IDs.
41 | int64 manifest_file_id = 3;
42 |
43 | message Range {
44 | // Inclusive.
45 | int64 start = 1;
46 | // Exclusive.
47 | int64 end = 2;
48 | }
49 |
50 | // Optional, a range of selected rows in the data file.
51 | // Used for partially reading an index file and its records.
52 | Range row_slice = 4;
53 |
54 | // Optional, bitmap masking rows to read; can be used together with
55 | // `row_slice`. `path` in RowBitmap is not used.
56 | RowBitmap row_bitmap = 5;
57 | }
58 |
59 | // A set of associated data and manifest files.
60 | // NEXT_ID: 2
61 | message FileSet {
62 | // Index data files.
63 | repeated DataFile index_files = 1;
64 |
65 | // Keys are manifest file IDs locally assigned by an operation.
66 | map<int64, string> index_manifest_files = 2;
67 | }
68 |
69 | // A patch describing metadata changes to the storage for a data operation.
70 | // NEXT_ID: 5
71 | message Patch {
72 | // Manifest files to add to the storage.
73 | ManifestFiles addition = 1;
74 |
75 | // Manifest files to remove from the storage.
76 | ManifestFiles deletion = 2;
77 |
78 | // The change of the storage statistics.
79 | StorageStatistics storage_statistics_update = 3;
80 |
81 | // The change log describing the changes made by the patch.
82 | ChangeLog change_log = 4;
83 | }
84 |
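For illustration, these messages can be built directly through the generated Python bindings; a minimal sketch with made-up file paths:

```python
from space.core.proto import metadata_pb2 as meta
from space.core.proto import runtime_pb2 as rt

data_file = rt.DataFile(
    path="data/index-0.parquet",
    storage_statistics=meta.StorageStatistics(num_rows=100),
    manifest_file_id=1)
data_file.row_slice.start = 0   # read rows [0, 50) of the file
data_file.row_slice.end = 50

file_set = rt.FileSet(index_files=[data_file])
file_set.index_manifest_files[1] = "metadata/index-manifest-0.parquet"

patch = rt.Patch()
patch.storage_statistics_update.num_rows += 100
```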
--------------------------------------------------------------------------------
/python/src/space/core/proto/runtime_pb2.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Generated by the protocol buffer compiler. DO NOT EDIT!
3 | # source: space/core/proto/runtime.proto
4 | """Generated protocol buffer code."""
5 | from google.protobuf.internal import builder as _builder
6 | from google.protobuf import descriptor as _descriptor
7 | from google.protobuf import descriptor_pool as _descriptor_pool
8 | from google.protobuf import symbol_database as _symbol_database
9 | # @@protoc_insertion_point(imports)
10 |
11 | _sym_db = _symbol_database.Default()
12 |
13 |
14 | from space.core.proto import metadata_pb2 as space_dot_core_dot_proto_dot_metadata__pb2
15 |
16 |
17 | DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x1espace/core/proto/runtime.proto\x12\x0bspace.proto\x1a\x1fspace/core/proto/metadata.proto\"\xef\x01\n\x08\x44\x61taFile\x12\x0c\n\x04path\x18\x01 \x01(\t\x12:\n\x12storage_statistics\x18\x02 \x01(\x0b\x32\x1e.space.proto.StorageStatistics\x12\x18\n\x10manifest_file_id\x18\x03 \x01(\x03\x12.\n\trow_slice\x18\x04 \x01(\x0b\x32\x1b.space.proto.DataFile.Range\x12*\n\nrow_bitmap\x18\x05 \x01(\x0b\x32\x16.space.proto.RowBitmap\x1a#\n\x05Range\x12\r\n\x05start\x18\x01 \x01(\x03\x12\x0b\n\x03\x65nd\x18\x02 \x01(\x03\"\xbc\x01\n\x07\x46ileSet\x12*\n\x0bindex_files\x18\x01 \x03(\x0b\x32\x15.space.proto.DataFile\x12J\n\x14index_manifest_files\x18\x02 \x03(\x0b\x32,.space.proto.FileSet.IndexManifestFilesEntry\x1a\x39\n\x17IndexManifestFilesEntry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"\xd2\x01\n\x05Patch\x12,\n\x08\x61\x64\x64ition\x18\x01 \x01(\x0b\x32\x1a.space.proto.ManifestFiles\x12,\n\x08\x64\x65letion\x18\x02 \x01(\x0b\x32\x1a.space.proto.ManifestFiles\x12\x41\n\x19storage_statistics_update\x18\x03 \x01(\x0b\x32\x1e.space.proto.StorageStatistics\x12*\n\nchange_log\x18\x04 \x01(\x0b\x32\x16.space.proto.ChangeLogb\x06proto3')
18 |
19 | _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals())
20 | _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'space.core.proto.runtime_pb2', globals())
21 | if _descriptor._USE_C_DESCRIPTORS == False:
22 |
23 | DESCRIPTOR._options = None
24 | _FILESET_INDEXMANIFESTFILESENTRY._options = None
25 | _FILESET_INDEXMANIFESTFILESENTRY._serialized_options = b'8\001'
26 | _DATAFILE._serialized_start=81
27 | _DATAFILE._serialized_end=320
28 | _DATAFILE_RANGE._serialized_start=285
29 | _DATAFILE_RANGE._serialized_end=320
30 | _FILESET._serialized_start=323
31 | _FILESET._serialized_end=511
32 | _FILESET_INDEXMANIFESTFILESENTRY._serialized_start=454
33 | _FILESET_INDEXMANIFESTFILESENTRY._serialized_end=511
34 | _PATCH._serialized_start=514
35 | _PATCH._serialized_end=724
36 | # @@protoc_insertion_point(module_scope)
37 |
--------------------------------------------------------------------------------
/python/src/space/core/schema/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Utilities for schema."""
16 |
17 | from space.core.schema.field_ids import FieldIdManager
18 |
--------------------------------------------------------------------------------
/python/src/space/core/schema/constants.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Constants for schema."""
16 |
17 | FILE_PATH_FIELD = "_FILE"
18 | ROW_ID_FIELD = "_ROW_ID"
19 | FIELD_ID_FIELD = "_FIELD_ID"
20 |
21 | NUM_ROWS_FIELD = "_NUM_ROWS"
22 | UNCOMPRESSED_BYTES_FIELD = "_UNCOMPRESSED_BYTES"
23 |
24 | # Constants for building column statistics field name.
25 | STATS_FIELD = "_STATS"
26 | MIN_FIELD = "_MIN"
27 | MAX_FIELD = "_MAX"
28 |
29 | # Manifest file fields.
30 | INDEX_COMPRESSED_BYTES_FIELD = '_INDEX_COMPRESSED_BYTES'
31 | INDEX_UNCOMPRESSED_BYTES_FIELD = '_INDEX_UNCOMPRESSED_BYTES'
32 |
--------------------------------------------------------------------------------
/python/src/space/core/schema/field_ids.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Utilities for schema field IDs."""
16 |
17 | from typing import List, Optional
18 | import pyarrow as pa
19 |
20 | from space.core.schema import arrow
21 |
22 | # The start value of field IDs.
23 | _START_FIELD_ID = 0
24 |
25 |
26 | class FieldIdManager:
27 | """Assign field IDs to schema fields using Depth First Search.
28 |
29 | Rules for nested fields:
30 | - For a list field with ID i, its element is assigned i+1.
31 | - For a struct field with ID i, its fields are assigned starting from i+1.
32 |
33 | Not thread safe.
34 | """
35 |
36 | def __init__(self, next_field_id: Optional[int] = None):
37 | if next_field_id is None:
38 | self._next_field_id = _START_FIELD_ID
39 | else:
40 | assert next_field_id >= _START_FIELD_ID
41 | self._next_field_id = next_field_id
42 |
43 | def assign_field_ids(self, schema: pa.Schema) -> pa.Schema:
44 | """Return a new schema with field IDs assigned."""
45 | return pa.schema(self._assign_field_ids(list(schema)))
46 |
47 | def _assign_field_ids(self, fields: List[pa.Field]) -> List[pa.Field]:
48 | return [self._assign_field_id(f) for f in fields]
49 |
50 | def _assign_field_id(self, field: pa.Field) -> pa.Field:
51 | metadata = arrow.field_metadata(self._next_field_id)
52 | self._next_field_id += 1
53 |
54 | name, type_ = field.name, field.type
55 |
56 | if pa.types.is_list(type_):
57 | return pa.field(
58 | name,
59 | pa.list_(self._assign_field_id(
60 | type_.value_field)), # type: ignore[attr-defined]
61 | metadata=metadata)
62 |
63 | if pa.types.is_struct(type_):
64 | struct_type = pa.struct(
65 | self._assign_field_ids(
66 | [type_.field(i) for i in range(type_.num_fields)]))
67 | return pa.field(name, struct_type, metadata=metadata)
68 |
69 | # TODO: to support more types, e.g., fixed_size_list, map.
70 |
71 | return field.with_metadata(metadata) # type: ignore[arg-type]
72 |
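A quick sketch of the DFS assignment rules described in the class docstring:

```python
import pyarrow as pa
from space.core.schema.field_ids import FieldIdManager

schema = pa.schema([
    ("id", pa.int64()),
    ("tags", pa.list_(pa.string())),
    ("point", pa.struct([("x", pa.float64()), ("y", pa.float64())]))
])
# "id" -> 0; "tags" -> 1 and its list element -> 2;
# "point" -> 3, "x" -> 4, "y" -> 5.
schema_with_ids = FieldIdManager().assign_field_ids(schema)
```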
--------------------------------------------------------------------------------
/python/src/space/core/schema/substrait.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Utilities for schemas in the Substrait format."""
16 |
17 | from __future__ import annotations
18 | from typing import Any, List
19 |
20 | import pyarrow as pa
21 | from substrait.type_pb2 import NamedStruct, Type
22 |
23 | import space.core.schema.arrow as arrow_schema
24 | from space.core.schema.types import File, TfFeatures
25 |
26 |
27 | def substrait_fields(schema: pa.Schema) -> NamedStruct:
28 | """Convert Arrow schema to equivalent Substrait fields.
29 |
30 | According to the Substrait spec, traverse schema fields in the Depth First
31 | Search order. The field names are persisted in `mutable_names` in the same
32 | order.
33 | """
34 | mutable_names: List[str] = []
35 | types = _substrait_fields(list(schema), mutable_names)
36 | return NamedStruct(names=mutable_names, struct=Type.Struct(types=types))
37 |
38 |
39 | def _substrait_fields(fields: List[pa.Field],
40 | mutable_names: List[str]) -> List[Type]:
41 | return [_substrait_field(f, mutable_names) for f in fields]
42 |
43 |
44 | # pylint: disable=too-many-branches
45 | def _substrait_field(field: pa.Field,
46 | mutable_names: List[str],
47 | is_list_item=False) -> Type:
48 | if not is_list_item:
49 | mutable_names.append(field.name)
50 |
51 | type_ = Type()
52 | field_id = arrow_schema.field_id(field)
53 |
54 | # TODO: to support more types in Substrait, e.g., fixed_size_list, map.
55 | if pa.types.is_int64(field.type):
56 | _set_field_id(type_.i64, field_id)
57 | elif pa.types.is_int32(field.type):
58 | _set_field_id(type_.i32, field_id)
59 | elif pa.types.is_string(field.type):
60 | _set_field_id(type_.string, field_id)
61 | elif pa.types.is_binary(field.type):
62 | _set_field_id(type_.binary, field_id)
63 | elif pa.types.is_boolean(field.type):
64 | _set_field_id(type_.bool, field_id)
65 | elif pa.types.is_float64(field.type):
66 | _set_field_id(type_.fp64, field_id)
67 | elif pa.types.is_float32(field.type):
68 | _set_field_id(type_.fp32, field_id)
69 | elif pa.types.is_list(field.type):
70 | _set_field_id(type_.list, field_id)
71 | type_.list.type.CopyFrom(
72 | _substrait_field(
73 | field.type.value_field, # type: ignore[attr-defined]
74 | mutable_names,
75 | is_list_item=True))
76 | elif pa.types.is_struct(field.type):
77 | _set_field_id(type_.struct, field_id)
78 | subfields = list(field.type) # type: ignore[call-overload]
79 | type_.struct.types.extend(_substrait_fields(subfields, mutable_names))
80 | elif isinstance(field.type, TfFeatures):
81 | # TfFeatures is persisted in Substrait as a user defined type, with
82 | # parameters [TF_FEATURES_TYPE, __arrow_ext_serialize__()].
83 | _set_field_id(type_.user_defined, field_id)
84 | type_.user_defined.type_parameters.extend([
85 | Type.Parameter(string=TfFeatures.EXTENSION_NAME),
86 | _serialized_ext_type(field.type)
87 | ])
88 | elif isinstance(field.type, File):
89 | # File is persisted in Substrait as a user defined type, with
90 | # parameters [FILE_TYPE, __arrow_ext_serialize__()].
91 | _set_field_id(type_.user_defined, field_id)
92 | type_.user_defined.type_parameters.extend([
93 | Type.Parameter(string=File.EXTENSION_NAME),
94 | _serialized_ext_type(field.type)
95 | ])
96 | else:
97 | raise TypeError(f"Type {field.type} of field {field.name} is not supported")
98 |
99 | return type_
100 |
101 |
102 | def _set_field_id(msg: Any, field_id: int) -> None:
103 | msg.type_variation_reference = field_id
104 |
105 |
106 | def _serialized_ext_type(type_: pa.ExtensionType) -> Type.Parameter:
107 | return Type.Parameter(
108 | string=type_.__arrow_ext_serialize__()) # type: ignore[arg-type]
109 |
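As a sketch, converting an Arrow schema into a Substrait `NamedStruct` (field IDs must be assigned first so that `arrow_schema.field_id` can read them from the field metadata):

```python
import pyarrow as pa
from space.core.schema.field_ids import FieldIdManager
from space.core.schema.substrait import substrait_fields

schema = FieldIdManager().assign_field_ids(
    pa.schema([("id", pa.int64()), ("name", pa.string())]))
named_struct = substrait_fields(schema)
# named_struct.names == ["id", "name"]; each type carries its field ID in
# type_variation_reference.
```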
--------------------------------------------------------------------------------
/python/src/space/core/schema/types/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Defines custom types."""
16 |
17 | from space.core.schema.types.files import File
18 | from space.core.schema.types.tf_features import TfFeatures
19 |
--------------------------------------------------------------------------------
/python/src/space/core/schema/types/files.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Define a custom Arrow type for files."""
16 |
17 | from __future__ import annotations
18 | import json
19 | from os import path
20 | from typing import Union
21 |
22 | from google.protobuf import json_format
23 | import pyarrow as pa
24 |
25 | import space.core.proto.metadata_pb2 as meta
26 | from space.core.utils.constants import UTF_8
27 |
28 |
29 | class File(pa.ExtensionType):
30 | """A custom Arrow type representing data in a standalone file.
31 |
32 | TODO: several features to add, e.g., auto read file content, write a new file
33 | at data write time, serializer/deserializer.
34 | """
35 |
36 | EXTENSION_NAME = "space.file"
37 |
38 | def __init__(self, directory: str = ""):
39 | """
40 | Args:
41 | directory: a directory to add as a prefix of file paths
42 | """
43 | # TODO: managed is not supported yet.
44 | self._file_type = meta.FileType(directory=directory)
45 | pa.ExtensionType.__init__(self, pa.string(), self.EXTENSION_NAME)
46 |
47 | def __arrow_ext_serialize__(self) -> bytes:
48 | return json.dumps(json_format.MessageToJson(self._file_type)).encode(UTF_8)
49 |
50 | @classmethod
51 | def __arrow_ext_deserialize__(
52 | cls,
53 | storage_type: pa.DataType, # pylint: disable=unused-argument
54 | serialized: Union[bytes, str]) -> File:
55 | if isinstance(serialized, bytes):
56 | serialized = serialized.decode(UTF_8)
57 |
58 | file_type = json_format.Parse(json.loads(serialized), meta.FileType())
59 |
60 | return File(directory=file_type.directory)
61 |
62 | def full_path(self, file_path: str) -> str:
63 | """Return the full path of file."""
64 | return path.join(self._file_type.directory, file_path)
65 |
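For illustration, `File` stores relative paths as plain strings and resolves them against the configured directory (the paths below are made up):

```python
import pyarrow as pa
from space.core.schema.types import File

file_type = File(directory="/datasets/images")
paths = pa.array(["cat.png", "dog.png"], type=pa.string())
file_column = pa.ExtensionArray.from_storage(file_type, paths)

print(file_type.full_path("cat.png"))  # /datasets/images/cat.png
```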
--------------------------------------------------------------------------------
/python/src/space/core/schema/types/tf_features.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Define a custom Arrow type for Tensorflow Dataset Features."""
16 |
17 | from __future__ import annotations
18 | from typing import Any, Union
19 |
20 | import json
21 | import pyarrow as pa
22 | import tensorflow_datasets as tfds # type: ignore[import-untyped]
23 | from tensorflow_datasets import features as f # type: ignore[import-untyped]
24 |
25 | from space.core.serializers import DeserializedData, FieldSerializer
26 | from space.core.utils.constants import UTF_8
27 |
28 |
29 | class TfFeatures(pa.ExtensionType, FieldSerializer):
30 | """A custom Arrow type for Tensorflow Dataset Features."""
31 |
32 | EXTENSION_NAME = "space.tf_features"
33 |
34 | def __init__(self, features_dict: f.FeaturesDict):
35 | """
36 | Args:
37 | features_dict: a Tensorflow Dataset features dict providing serializers
38 | for a nested dict of Tensors or Numpy arrays, see
39 | https://www.tensorflow.org/datasets/api_docs/python/tfds/features/FeaturesDict
40 | """
41 | self._features_dict = features_dict
42 | pa.ExtensionType.__init__(self, pa.binary(), self.EXTENSION_NAME)
43 |
44 | def __arrow_ext_serialize__(self) -> bytes:
45 | return json.dumps(self._features_dict.to_json()).encode(UTF_8)
46 |
47 | @classmethod
48 | def __arrow_ext_deserialize__(
49 | cls,
50 | storage_type: pa.DataType, # pylint: disable=unused-argument
51 | serialized: Union[bytes, str]
52 | ) -> TfFeatures:
53 | if isinstance(serialized, bytes):
54 | serialized = serialized.decode(UTF_8)
55 |
56 | return TfFeatures(f.FeaturesDict.from_json(json.loads(serialized)))
57 |
58 | def serialize(self, value: Any) -> bytes:
59 | """Serialize value using the provided features_dict."""
60 | return self._features_dict.serialize_example(value)
61 |
62 | def deserialize(self, value_bytes: bytes) -> DeserializedData:
63 | """Deserialize value using the provided features_dict."""
64 | return tfds.as_numpy(self._features_dict.deserialize_example(value_bytes))
65 |
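A small sketch of `TfFeatures` acting as a serializer (the features dict is illustrative):

```python
from tensorflow_datasets import features as f
from space.core.schema.types import TfFeatures

tf_features = TfFeatures(f.FeaturesDict({"caption": f.Text()}))
value_bytes = tf_features.serialize({"caption": "a dog on a skateboard"})
decoded = tf_features.deserialize(value_bytes)  # numpy-like nested dict
```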
--------------------------------------------------------------------------------
/python/src/space/core/schema/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Common utilities for schemas."""
16 |
17 | from dataclasses import dataclass
18 | from typing import List
19 |
20 | import pyarrow as pa
21 |
22 | from space.core.schema import constants
23 | from space.core.schema.types import TfFeatures
24 | from space.core.utils import errors
25 |
26 |
27 | @dataclass
28 | class Field:
29 | """Information of a field."""
30 | name: str
31 | field_id: int
32 |
33 |
34 | def field_names(fields: List[Field]) -> List[str]:
35 | """Extract field names from a list of fields."""
36 | return list(map(lambda f: f.name, fields))
37 |
38 |
39 | def field_ids(fields: List[Field]) -> List[int]:
40 | """Extract field IDs from a list of fields."""
41 | return list(map(lambda f: f.field_id, fields))
42 |
43 |
44 | def stats_field_name(field_id_: int) -> str:
45 | """Column stats struct field name.
46 |
47 | It uses the field ID instead of the name. A manifest file covers all Parquet
48 | files and is not tied to a single Parquet schema, so we cannot project table
49 | field names to file field names. Using the field ID ensures that we can
50 | always uniquely identify a field.
51 | """
52 | return f"{constants.STATS_FIELD}_f{field_id_}"
53 |
54 |
55 | def file_path_field_name(field: str) -> str:
56 | """File path field name in flatten addresses."""
57 | return f"{field}.{constants.FILE_PATH_FIELD}"
58 |
59 |
60 | def row_id_field_name(field: str) -> str:
61 | """Row ID field name in flatten addresses."""
62 | return f"{field}.{constants.ROW_ID_FIELD}"
63 |
64 |
65 | def validate_logical_schema(schema: pa.Schema, primary_keys: List[str],
66 | record_fields: List[str]) -> None:
67 | """Validate the logical schema of a Space storage."""
68 | if not primary_keys:
69 | raise errors.UserInputError("Must specify at least one primary key")
70 |
71 | all_fields = set(schema.names)
72 |
73 | for name in primary_keys:
74 | if name not in all_fields:
75 | raise errors.UserInputError(f"Primary key {name} not found in schema")
76 |
77 | field = schema.field(name) # type: ignore[arg-type]
78 | if pa.types.is_list(field.type) or pa.types.is_struct(
79 | field.type) or isinstance(field.type, TfFeatures):
80 | raise errors.UserInputError(f"Primary key type not supported: {field}")
81 |
82 | all_primary_keys = set(primary_keys)
83 | for name in record_fields:
84 | if name not in all_fields:
85 | raise errors.UserInputError(f"Record field {name} not found in schema")
86 |
87 | if name in all_primary_keys:
88 | raise errors.UserInputError(
89 | f"Record field {name} cannot be a primary key")
90 |
91 | field = schema.field(name) # type: ignore[arg-type]
92 | if not (pa.types.is_binary(field.type)
93 | or isinstance(field.type, TfFeatures)):
94 | raise errors.UserInputError(f"Record field type not supported: {field}")
95 |
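For example, a schema with an int64 primary key and a binary record field passes validation, while violating any rule above raises `UserInputError`:

```python
import pyarrow as pa
from space.core.schema import utils as schema_utils

schema = pa.schema([("id", pa.int64()), ("payload", pa.binary())])
schema_utils.validate_logical_schema(schema,
                                     primary_keys=["id"],
                                     record_fields=["payload"])  # passes

# Raises UserInputError("Must specify at least one primary key"):
# schema_utils.validate_logical_schema(schema, [], ["payload"])
```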
--------------------------------------------------------------------------------
/python/src/space/core/serializers/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Serializers (and deserializers) for unstructured record fields."""
16 |
17 | from space.core.serializers.base import DeserializedData
18 | from space.core.serializers.base import DictSerializer
19 | from space.core.serializers.base import FieldSerializer
20 |
--------------------------------------------------------------------------------
/python/src/space/core/serializers/base.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Serializers (and deserializers) for unstructured record fields."""
16 |
17 | from __future__ import annotations
18 | from abc import ABC, abstractmethod
19 | from typing import Any, Dict, List, Optional
20 | from typing_extensions import TypeAlias
21 |
22 | import pyarrow as pa
23 | # pylint: disable=line-too-long
24 | from tensorflow_datasets.core.dataset_utils import NumpyElem, Tree # type: ignore[import-untyped]
25 |
26 | DeserializedData: TypeAlias = Tree[NumpyElem]
27 | DictData: TypeAlias = Dict[str, List[DeserializedData]]
28 |
29 |
30 | class FieldSerializer(ABC):
31 | """Abstract serializer of a field.
32 |
33 | Used for serializing record fields into bytes to be stored in Space.
34 | """
35 |
36 | @abstractmethod
37 | def serialize(self, value: Any) -> bytes:
38 | """Serialize a value.
39 |
40 | Args:
41 | value: numpy-like nested dict.
42 | """
43 |
44 | @abstractmethod
45 | def deserialize(self, value_bytes: bytes) -> DeserializedData:
46 | """Deserialize bytes to a value.
47 |
48 | Returns:
49 | Numpy-like nested dict.
50 | """
51 |
52 |
53 | class DictSerializer:
54 | """A serializer (deserializer) of a dict of fields.
55 |
56 | The fields are serialized by FieldSerializer. The dict is usually a Py dict
57 | converted from an Arrow table, e.g., {"field": [values, ...], ...}
58 | """
59 |
60 | def __init__(self, serializers: Dict[str, FieldSerializer]):
61 | self._serializers = serializers
62 |
63 | @classmethod
64 | def create(cls, logical_schema: pa.Schema) -> DictSerializer:
65 | """Create a new dictionary serializer."""
66 | serializers: Dict[str, FieldSerializer] = {}
67 | for field in logical_schema:
68 | if isinstance(field.type, FieldSerializer):
69 | serializers[field.name] = field.type
70 |
71 | return DictSerializer(serializers)
72 |
73 | def field_serializer(self, field: str) -> Optional[FieldSerializer]:
74 | """Return the FieldSerializer of a given field, or None if not found."""
75 | if field not in self._serializers:
76 | return None
77 |
78 | return self._serializers[field]
79 |
80 | def serialize(self, value: DictData) -> DictData:
81 | """Serialize a value.
82 |
83 | Args:
84 | value: a dict of numpy-like nested dicts.
85 | """
86 | return self._process_dict(value, serialize=True)
87 |
88 | def deserialize(self, value_bytes: DictData) -> DictData:
89 | """Deserialize a dict of bytes to a dict of values.
90 |
91 | Returns:
92 | A dict of numpy-like nested dicts.
93 | """
94 | return self._process_dict(value_bytes, serialize=False)
95 |
96 | def _process_dict(self, value: DictData, serialize: bool) -> DictData:
97 | result = {}
98 | for field_name, value_batch in value.items():
99 | if field_name in self._serializers:
100 | ser = self._serializers[field_name]
101 | result[field_name] = [
102 | ser.serialize(v) if serialize else ser.deserialize(v)
103 | for v in value_batch
104 | ]
105 | else:
106 | result[field_name] = value_batch
107 |
108 | return result
109 |
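A sketch of wiring `DictSerializer` to a schema that contains a `FieldSerializer`-typed field, using `TfFeatures` as the example serializer:

```python
import pyarrow as pa
from tensorflow_datasets import features as f
from space.core.schema.types import TfFeatures
from space.core.serializers import DictSerializer

tf_type = TfFeatures(f.FeaturesDict({"caption": f.Text()}))
schema = pa.schema([("id", pa.int64()), ("example", tf_type)])

serializer = DictSerializer.create(schema)
serialized = serializer.serialize({"id": [1], "example": [{"caption": "hi"}]})
# "id" passes through unchanged; "example" values become serialized bytes.
```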
--------------------------------------------------------------------------------
/python/src/space/core/transform/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/python/src/space/core/transform/join.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Classes for transforming datasets using join."""
16 |
17 | from __future__ import annotations
18 | from dataclasses import dataclass, field as dataclass_field
19 | from typing import Dict, List, TYPE_CHECKING
20 |
21 | import pyarrow as pa
22 | from substrait.algebra_pb2 import Rel
23 |
24 | from space.core.options import JoinOptions, ReadOptions
25 | from space.core.schema.arrow import record_address_types
26 | from space.core.transform.plans import LogicalPlanBuilder
27 | from space.core.utils import errors
28 | from space.core.utils.lazy_imports_utils import ray
29 | from space.core.views import View
30 | from space.ray.ops.join import JoinInput, RayJoinOp
31 | from space.ray.options import RayOptions
32 |
33 | if TYPE_CHECKING:
34 | from space.core.datasets import Dataset
35 |
36 |
37 | @dataclass
38 | class JoinTransform(View):
39 | """Transform that joins two views/datasets."""
40 |
41 | # Join keys must be parts of primary keys.
42 | join_keys: List[str]
43 | # The input views/datasets of the join.
44 | left: JoinInput
45 | right: JoinInput
46 |
47 | output_schema: pa.Schema = dataclass_field(init=False)
48 |
49 | def __post_init__(self):
50 | self.output_schema = self._output_schema()
51 |
52 | @property
53 | def primary_keys(self) -> List[str]:
54 | return self.join_keys
55 |
56 | @property
57 | def sources(self) -> Dict[str, Dataset]:
58 | return {**self.left.view.sources, **self.right.view.sources}
59 |
60 | @property
61 | def schema(self) -> pa.Schema:
62 | return self.output_schema
63 |
64 | def _output_schema(self) -> pa.Schema:
65 | assert len(self.join_keys) == 1
66 | join_key = self.join_keys[0]
67 | record_fields = set(self.record_fields)
68 |
69 | def _fields(input_: JoinInput) -> List[pa.Field]:
70 | nonlocal join_key, record_fields
71 | results = []
72 | for f in (input_.fields or input_.view.schema.names):
73 | if f == join_key:
74 | continue
75 |
76 | if input_.reference_read and f in record_fields:
77 | results.append(pa.field(f, pa.struct(
78 | record_address_types()))) # type: ignore[arg-type]
79 | else:
80 | results.append(input_.view.schema.field(
81 | f).remove_metadata()) # type: ignore[arg-type]
82 |
83 | return results
84 |
85 | # TODO: to handle reference read. If true, use the address field schema.
86 | try:
87 | left_fields = _fields(self.left)
88 | right_fields = _fields(self.right)
89 |
90 | # TODO: to check field names that are the same in left and right; add a
91 | # validation first, and then support field rename.
92 | return pa.schema([
93 | self.left.view.schema.field(
94 | join_key).remove_metadata() # type: ignore[arg-type]
95 | ] + left_fields + right_fields)
96 | except KeyError as e:
97 | raise errors.UserInputError(repr(e))
98 |
99 | @property
100 | def record_fields(self) -> List[str]:
101 | # TODO: For now just inherit record fields from input, to allow updating.
102 | left_record_fields = set(self.left.view.record_fields).intersection(
103 | set(self.left.fields or self.left.view.schema.names))
104 | right_record_fields = set(self.right.view.record_fields).intersection(
105 | set(self.right.fields or self.right.view.schema.names))
106 | return list(left_record_fields) + list(right_record_fields)
107 |
108 | def process_source(self, data: ray.data.Dataset) -> ray.data.Dataset:
109 | raise NotImplementedError("Processing change data in join is not supported")
110 |
111 | def _ray_dataset(self, ray_options: RayOptions, read_options: ReadOptions,
112 | join_options: JoinOptions) -> ray.data.Dataset:
113 | # TODO: to use parallelism specified by ray_options. Today parallelism is
114 | # controlled by join_options.partition_fn.
115 | if read_options.fields is not None:
116 | raise errors.UserInputError(
117 | "`fields` is not supported for join, use `left_fields` and"
118 | " `right_fields` of join instead")
119 |
120 | if read_options.reference_read:
121 | # TODO: need such options for both left and right, will be supported
122 | # after refactoring the arguments.
123 | raise errors.UserInputError("`reference_read` is not supported for join")
124 |
125 | return RayJoinOp(self.left, self.right, self.join_keys, self.schema,
126 | join_options, ray_options).ray_dataset()
127 |
128 | def to_relation(self, builder: LogicalPlanBuilder) -> Rel:
129 | raise NotImplementedError("Materialized view of join is not supported")
130 |
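# Illustrative sketch (not from this module; the schemas below are made up):
# the output-schema rule implemented by _output_schema above, reproduced with
# plain pyarrow. The single join key comes first, followed by the remaining
# left fields and then the remaining right fields, with field metadata
# stripped.
import pyarrow as pa

left_schema = pa.schema([("id", pa.int64()), ("feature", pa.float64())])
right_schema = pa.schema([("id", pa.int64()), ("label", pa.string())])
join_key = "id"

joined_schema = pa.schema(
    [left_schema.field(join_key).remove_metadata()] +
    [left_schema.field(n).remove_metadata()
     for n in left_schema.names if n != join_key] +
    [right_schema.field(n).remove_metadata()
     for n in right_schema.names if n != join_key])
# joined_schema: id: int64, feature: double, label: string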
--------------------------------------------------------------------------------
/python/src/space/core/transform/plans.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Plans for view/dataset transforms."""
16 |
17 | from __future__ import annotations
18 | from dataclasses import dataclass
19 | from typing import Callable, Dict, List, Optional
20 |
21 | import cloudpickle # type: ignore[import-untyped]
22 | import pyarrow as pa
23 | from space.core.utils.uuids import random_id
24 |
25 | from space.core.utils import errors
26 | from substrait.algebra_pb2 import Rel, RelRoot
27 | from substrait.extensions.extensions_pb2 import SimpleExtensionDeclaration
28 | from substrait.extensions.extensions_pb2 import SimpleExtensionURI
29 | from substrait.plan_pb2 import Plan, PlanRel
30 |
31 | # Substrait URI representing user defined functions.
32 | # When constructing a materialized view from logical plan, the UDF is loaded
33 | # from a pickle file path in the storage metadata's UDF registry.
34 | SIMPLE_UDF_URI = "urn:space:substrait_simple_extension_function"
35 |
36 |
37 | @dataclass
38 | class UserDefinedFn:
39 | """A user defined function in the logical plan.
40 |
41 | The class object is persisted in the storage metadata's UDF registry.
42 | """
43 | # A callable provided by users. The requirement on signature varies depending
44 | # on the transform type.
45 | fn: Callable
46 | # The output schema after applying fn on the input view.
47 | output_schema: pa.Schema
48 | # The record fields in the output schema.
49 | output_record_fields: List[str]
50 | # If reading the input view by batches, number of rows per input batch.
51 | batch_size: Optional[int] = None
52 |
53 | # TODO: file operations need to be through the FileSystem interface.
54 |
55 | @classmethod
56 | def load(cls, file_path: str) -> UserDefinedFn:
57 | """Load a UDF from a file."""
58 | with open(file_path, "rb") as f:
59 | udf = cloudpickle.load(f)
60 |
61 | return udf
62 |
63 | def dump(self, file_path: str) -> None:
64 | """Dump UDF into a file."""
65 | with open(file_path, 'wb') as f:
66 | cloudpickle.dump(self, f)
67 |
68 |
69 | class LogicalPlanBuilder:
70 | """A builder of logical plan in the Substrait format."""
71 |
72 | def __init__(self):
73 | self._plan = Plan()
74 | self._udfs: Dict[str, UserDefinedFn] = {}
75 |
76 | self._extension_uri_anchor = 1
77 | self._function_anchor = 1
78 |
79 | def next_ext_uri_anchor(self) -> int:
80 | """Return the next extension URI anchor."""
81 | result = self._extension_uri_anchor
82 | self._extension_uri_anchor += 1
83 | return result
84 |
85 | def next_function_anchor(self) -> int:
86 | """Return the next function anchor."""
87 | result = self._function_anchor
88 | self._function_anchor += 1
89 | return result
90 |
91 | def append_ext_uri(self, uri: SimpleExtensionURI) -> None:
92 | """Append an extension URI in the plan."""
93 | self._plan.extension_uris.append(uri)
94 |
95 | def append_ext(self, ext: SimpleExtensionDeclaration) -> None:
96 | """Append an extension in the plan."""
97 | self._plan.extensions.append(ext)
98 |
99 | def build(self, relation: Rel) -> Plan:
100 | """Build the plan."""
101 | self._plan.relations.append(PlanRel(root=RelRoot(input=relation)))
102 | return self._plan
103 |
104 | def add_udf(self, name: str, fn: UserDefinedFn) -> None:
105 | """Add a new user defined function to the plan."""
106 | self._udfs[name] = fn
107 |
108 | def new_udf_name(self) -> str:
109 | """Return a random UDF name, unique in the plan scope."""
110 | retry_count = 0
111 | while retry_count < 10:
112 | retry_count += 1
113 | name = f"udf_{random_id()}"
114 | if name not in self._udfs:
115 | return name
116 |
117 | raise errors.SpaceRuntimeError("Failed to generate an unused UDF name")
118 |
119 | @property
120 | def udfs(self) -> Dict[str, UserDefinedFn]:
121 | """Return user defined functions in the plan."""
122 | return self._udfs
123 |
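# Illustrative sketch (not from this module): registering a UDF as a Substrait
# simple-extension function and building a plan with LogicalPlanBuilder. The
# identity lambda, the single-field schema, and the empty placeholder Rel are
# made up for illustration.
import pyarrow as pa

builder = LogicalPlanBuilder()

udf = UserDefinedFn(fn=lambda batch: batch,
                    output_schema=pa.schema([("f", pa.int64())]),
                    output_record_fields=[])
udf_name = builder.new_udf_name()
builder.add_udf(udf_name, udf)

uri_anchor = builder.next_ext_uri_anchor()
builder.append_ext_uri(
    SimpleExtensionURI(extension_uri_anchor=uri_anchor, uri=SIMPLE_UDF_URI))
builder.append_ext(
    SimpleExtensionDeclaration(
        extension_function=SimpleExtensionDeclaration.ExtensionFunction(
            extension_uri_reference=uri_anchor,
            function_anchor=builder.next_function_anchor(),
            name=udf_name)))

plan = builder.build(Rel())  # an empty relation, used here only as a placeholder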
--------------------------------------------------------------------------------
/python/src/space/core/transform/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Utilities for transforms."""
16 |
17 | from __future__ import annotations
18 | from typing import TYPE_CHECKING
19 |
20 | from space.core.datasets import Dataset
21 | from space.core.options import JoinOptions, ReadOptions
22 | from space.core.utils.lazy_imports_utils import ray
23 | from space.ray.options import RayOptions
24 |
25 | if TYPE_CHECKING:
26 | from space.core.views import View
27 |
28 |
29 | def ray_dataset(view: View, ray_options: RayOptions,
30 | read_options: ReadOptions) -> ray.data.Dataset:
31 | """A wrapper for creating a Ray dataset from datasets and views."""
32 | empty_join_options = JoinOptions()
33 |
34 | if isinstance(view, Dataset):
35 | # Push input_fields down to the dataset to read less data.
36 | return view._ray_dataset(ray_options, read_options, empty_join_options) # pylint: disable=protected-access
37 |
38 | # For non-dataset views, fields can't be pushed down to storage.
39 | fields = read_options.fields
40 | read_options.fields = None
41 | ds = view._ray_dataset(ray_options, read_options, empty_join_options) # pylint: disable=protected-access
42 |
43 | return ds if fields is None else ds.select_columns(fields)
44 |
--------------------------------------------------------------------------------
/python/src/space/core/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/python/src/space/core/utils/constants.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Define module level constants."""
16 |
17 | UTF_8 = "utf-8"
18 |
--------------------------------------------------------------------------------
/python/src/space/core/utils/errors.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Define errors thrown by Space runtime."""
16 |
17 |
18 | class UserInputError(ValueError):
19 | """Errors caused by invalid user input."""
20 |
21 |
22 | class VersionNotFoundError(UserInputError):
23 | """The version is not found in metadata."""
24 |
25 |
26 | class VersionAlreadyExistError(UserInputError):
27 | """Errors caused by adding a version that already exists."""
28 |
29 |
30 | class PrimaryKeyExistError(UserInputError):
31 | """Errors caused by duplicated primary keys."""
32 |
33 |
34 | class FileExistError(UserInputError):
35 | """Errors caused by creating a file that already exists."""
36 |
37 |
38 | class StorageExistError(UserInputError):
39 | """Errors caused by creating a storage that already exists."""
40 |
41 |
42 | class StorageNotFoundError(UserInputError):
43 | """The storage to load is not found."""
44 |
45 |
46 | class SpaceRuntimeError(RuntimeError):
47 | """Base class of errors thrown from the Space runtime."""
48 |
49 |
50 | class TransactionError(SpaceRuntimeError):
51 | """Errors from Space transaction commit."""
52 |
53 |
54 | class LogicalPlanError(SpaceRuntimeError):
55 | """Errors from parsing logical plan."""
56 |
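# Illustrative sketch (not from this module; the version string is
# hypothetical): all user-input problems derive from UserInputError, itself a
# ValueError, so callers can handle the whole family with one except clause.
try:
  raise VersionNotFoundError("version 'v7' is not found")
except UserInputError as e:
  print(f"Invalid user input: {e}")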
--------------------------------------------------------------------------------
/python/src/space/core/utils/paths.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Utility methods for file paths."""
16 |
17 | from os import path
18 |
19 | from space.core.utils.uuids import uuid_
20 |
21 | # Folders of storage metadata.
22 | _ENTRY_POINT_FILE = "entrypoint.txtpb"
23 | _DATA_DIR = "data"
24 | _METADATA_DIR = "metadata"
25 | _CHANGE_DATA_DIR = "changes"
26 | # Folder of user defined functions for materialized views.
27 | UDF_DIR = "udfs"
28 |
29 |
30 | def new_index_file_path(data_dir_: str):
31 | """Return a random index file path in a given data directory."""
32 | return path.join(data_dir_, f"index_{uuid_()}.parquet")
33 |
34 |
35 | def new_record_file_path(data_dir_: str, field_name: str):
36 | """Return a random record file path in a given data directory."""
37 | return path.join(data_dir_, f"{field_name}_{uuid_()}.array_record")
38 |
39 |
40 | def new_index_manifest_path(metadata_dir_: str):
41 | """Return a random index manifest file path in a given metadata directory."""
42 | return path.join(metadata_dir_, f"index_manifest_{uuid_()}.parquet")
43 |
44 |
45 | def new_record_manifest_path(metadata_dir_: str):
46 | """Return a random record manifest file path in a given metadata directory."""
47 | return path.join(metadata_dir_, f"record_manifest_{uuid_()}.parquet")
48 |
49 |
50 | def data_dir(location: str) -> str:
51 | """Return the data directory path in a given location."""
52 | return path.join(location, _DATA_DIR)
53 |
54 |
55 | def metadata_dir(location: str) -> str:
56 | """Return the metadata directory path in a given location."""
57 | return path.join(location, _METADATA_DIR)
58 |
59 |
60 | def entry_point_path(location: str) -> str:
61 | """Return the static entry point file path in a given location."""
62 | return path.join(location, _METADATA_DIR, _ENTRY_POINT_FILE)
63 |
64 |
65 | def new_metadata_path(metadata_dir_: str) -> str:
66 | """Return a random metadata file path in a given metadata directory."""
67 | return path.join(metadata_dir_, f"metadata_{uuid_()}.txtpb")
68 |
69 |
70 | class StoragePathsMixin:
71 | """Provides util methods for file and directory paths."""
72 |
73 | def __init__(self, location: str):
74 | self._location = location
75 |
76 | self._data_dir = data_dir(self._location)
77 | self._metadata_dir = metadata_dir(self._location)
78 | self._change_data_dir = path.join(self._metadata_dir, _CHANGE_DATA_DIR)
79 | self._entry_point_file = entry_point_path(self._location)
80 |
81 | @property
82 | def location(self) -> str:
83 | """Return the storage base folder location."""
84 | return self._location
85 |
86 | @property
87 | def data_dir(self) -> str:
88 | """Return the data directory."""
89 | return self._data_dir
90 |
91 | @property
92 | def metadata_dir(self) -> str:
93 | """Return the metadata directory."""
94 | return self._metadata_dir
95 |
96 | def short_path(self, full_path: str) -> str:
97 | """Return the short relative path from a full path."""
98 | return path.relpath(full_path, self._location)
99 |
100 | def full_path(self, short_path: str) -> str:
101 | """Return the full path from a full or short path."""
102 | return path.join(self._location, short_path)
103 |
104 | def new_metadata_path(self) -> str:
105 | """Return a random metadata file path."""
106 | return new_metadata_path(self._metadata_dir)
107 |
108 | def new_change_log_path(self) -> str:
109 | """Return a random change log file path."""
110 | return path.join(self._change_data_dir, f"change_{uuid_()}.txtpb")
111 |
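# Illustrative sketch (not from this module; the location is made up): the
# directory layout produced by the helpers above, and the short/full path
# round trip provided by StoragePathsMixin.
#   data_dir("/tmp/my_dataset")         == "/tmp/my_dataset/data"
#   metadata_dir("/tmp/my_dataset")     == "/tmp/my_dataset/metadata"
#   entry_point_path("/tmp/my_dataset") == "/tmp/my_dataset/metadata/entrypoint.txtpb"
paths = StoragePathsMixin("/tmp/my_dataset")
index_file = new_index_file_path(paths.data_dir)  # .../data/index_<uuid>.parquet
short = paths.short_path(index_file)              # "data/index_<uuid>.parquet"
assert paths.full_path(short) == index_file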
--------------------------------------------------------------------------------
/python/src/space/core/utils/protos.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Utility methods for protos."""
16 |
17 | from google.protobuf import message
18 | from google.protobuf import text_format
19 | from google.protobuf.timestamp_pb2 import Timestamp
20 |
21 | from space.core.utils.constants import UTF_8
22 |
23 |
24 | def proto_to_text(msg: message.Message) -> bytes:
25 | """Return the text format of a proto."""
26 | return text_format.MessageToString(msg).encode(UTF_8)
27 |
28 |
29 | def proto_now() -> Timestamp:
30 | """Return the current time in the proto format."""
31 | timestamp = Timestamp()
32 | timestamp.GetCurrentTime()
33 | return timestamp
34 |
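# Illustrative sketch (not from this module): capture the current time and
# serialize it in the protobuf text format.
now = proto_now()                # google.protobuf.Timestamp set to the current time
text_bytes = proto_to_text(now)  # e.g. b"seconds: 1700000000\nnanos: ...\n"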
--------------------------------------------------------------------------------
/python/src/space/core/utils/uuids.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Utility methods for UUIDs."""
16 |
17 | import uuid
18 |
19 |
20 | def uuid_() -> str:
21 | """Return a new UUID."""
22 | return str(uuid.uuid4())
23 |
24 |
25 | def random_id() -> str:
26 | """Return a short random ID."""
27 | return uuid_().split("-", maxsplit=1)[0]
28 |
--------------------------------------------------------------------------------
/python/src/space/ray/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/python/src/space/ray/ops/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/python/src/space/ray/ops/append.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Distributed append operation using Ray."""
16 |
17 | from __future__ import annotations
18 | from typing import List, Optional
19 |
20 | import pyarrow as pa
21 | import ray
22 |
23 | from space.core.ops import utils
24 | from space.core.ops.append import BaseAppendOp, LocalAppendOp
25 | from space.core.ops.base import InputData, InputIteratorFn
26 | from space.core.options import FileOptions
27 | from space.core.proto import metadata_pb2 as meta
28 | from space.core.proto import runtime_pb2 as rt
29 | from space.ray.options import RayOptions
30 |
31 |
32 | class RayAppendOp(BaseAppendOp):
33 | """Ray append operation that writes files in a distributed manner."""
34 |
35 | # pylint: disable=too-many-arguments
36 | def __init__(self,
37 | location: str,
38 | metadata: meta.StorageMetadata,
39 | ray_options: RayOptions,
40 | file_options: FileOptions,
41 | record_address_input: bool = False):
42 | """
43 | Args:
44 | record_address_input: if true, input record fields are addresses.
45 | """
46 | self._ray_options = ray_options
47 | self._actors = [
48 | _AppendActor.remote( # type: ignore[attr-defined] # pylint: disable=no-member
49 | location, metadata, file_options, record_address_input)
50 | for _ in range(self._ray_options.max_parallelism)
51 | ]
52 |
53 | def write(self, data: InputData) -> None:
54 | if not isinstance(data, pa.Table):
55 | data = pa.Table.from_pydict(data)
56 |
57 | num_shards = self._ray_options.max_parallelism
58 |
59 | shard_size = data.num_rows // num_shards
60 | if shard_size == 0:
61 | shard_size = 1
62 |
63 | responses = []
64 | offset = 0
65 | for i in range(num_shards):
66 | shard = data.slice(offset=offset, length=shard_size)
67 | responses.append(self._actors[i].write.remote(shard))
68 |
69 | offset += shard_size
70 | if offset >= data.num_rows:
71 | break
72 |
73 | if offset < data.num_rows:
74 | shard = data.slice(offset=offset)
75 | responses.append(self._actors[0].write.remote(shard))
76 |
77 | ray.get(responses)
78 |
79 | def write_from(self, source_fns: List[InputIteratorFn]) -> None:
80 | """Append data into the dataset from multiple iterator sources in
81 | parallel.
82 | """
83 | num_actors = len(self._actors)
84 | responses = []
85 | for i, source_fn in enumerate(source_fns):
86 | responses.append(self._actors[i %
87 | num_actors].write_from.remote(source_fn))
88 |
89 | ray.get(responses)
90 |
91 | def finish(self) -> Optional[rt.Patch]:
92 | patches = ray.get([actor.finish.remote() for actor in self._actors])
93 | return utils.merge_patches(patches)
94 |
95 |
96 | @ray.remote
97 | class _AppendActor:
98 | """A stateful Ray actor for appending data."""
99 |
100 | def __init__(self,
101 | location: str,
102 | metadata: meta.StorageMetadata,
103 | file_options: FileOptions,
104 | record_address_input: bool = False):
105 | self._op = LocalAppendOp(location, metadata, file_options,
106 | record_address_input)
107 |
108 | def write_from(self, source_fn: InputIteratorFn) -> None:
109 | """Append data into the dataset from an iterator source."""
110 | for data in source_fn():
111 | self._op.write(data)
112 |
113 | def write(self, data: InputData) -> bool:
114 | """Append data into storage."""
115 | self._op.write(data)
116 | return True
117 |
118 | def finish(self) -> Optional[rt.Patch]:
119 | """Complete the append operation and return a metadata patch."""
120 | return self._op.finish()
121 |
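# Illustrative sketch (not from this module; the table and shard count are made
# up): the sharding performed by RayAppendOp.write, reproduced with plain
# pyarrow and no Ray. Rows are split into contiguous slices of roughly
# num_rows // max_parallelism, and any remainder goes back to the first actor.
import pyarrow as pa

data = pa.table({"f": list(range(10))})
num_shards = 3
shard_size = max(data.num_rows // num_shards, 1)

shards, offset = [], 0
for _ in range(num_shards):
  shards.append(data.slice(offset=offset, length=shard_size))
  offset += shard_size
  if offset >= data.num_rows:
    break
if offset < data.num_rows:
  shards.append(data.slice(offset=offset))  # remainder, written by the first actor

assert [s.num_rows for s in shards] == [3, 3, 3, 1]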
--------------------------------------------------------------------------------
/python/src/space/ray/ops/change_data.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Change data feed that computes delta between two snapshots by Ray."""
16 |
17 | import math
18 | from typing import Iterable, Iterator
19 |
20 | import ray
21 |
22 | from space.core.ops.change_data import (ChangeData, ChangeType,
23 | LocalChangeDataReadOp,
24 | ordered_snapshot_ids)
25 | from space.core.options import ReadOptions
26 | import space.core.proto.metadata_pb2 as meta
27 | from space.core.storage import Storage
28 | from space.ray import data_sources as ray_data_sources
29 | from space.ray.options import RayOptions
30 |
31 |
32 | def read_change_data(storage: Storage, start_snapshot_id: int,
33 | end_snapshot_id: int, ray_options: RayOptions,
34 | read_options: ReadOptions) -> Iterator[ChangeData]:
35 | """Read change data from a start to an end snapshot.
36 |
37 | start_snapshot_id is excluded; end_snapshot_id is included.
38 | """
39 | for snapshot_id in ordered_snapshot_ids(storage, start_snapshot_id,
40 | end_snapshot_id):
41 | yield from _RayChangeDataReadOp(storage, snapshot_id, ray_options,
42 | read_options)
43 |
44 |
45 | class _RayChangeDataReadOp(LocalChangeDataReadOp):
46 | """Read changes of data from a given snapshot of a dataset."""
47 |
48 | def __init__(self, storage: Storage, snapshot_id: int,
49 | ray_options: RayOptions, read_options: ReadOptions):
50 | LocalChangeDataReadOp.__init__(self, storage, snapshot_id, read_options)
51 | self._ray_options = ray_options
52 |
53 | def __iter__(self) -> Iterator[ChangeData]:
54 | # Must return deletions first, otherwise when the upstream re-applies
55 | # deletions and additions, it may delete newly added data.
56 | # TODO: to enforce this check upstream, or merge deletion+addition as an
57 | # update.
58 | if self._change_log.deleted_rows:
59 | # Only read primary keys for deletions. The data to read is relatively
60 | # small. In addition, currently deletion has to aggregate primary keys
61 | # to delete (can't parallelize two sets of keys to delete). So we don't
62 | # split it into parallel read streams.
63 | ds = self._ray_dataset(self._change_log.deleted_rows,
64 | self._pk_only_read_option,
65 | self._ray_options.max_parallelism)
66 | yield ChangeData(self._snapshot_id, ChangeType.DELETE, [ds])
67 |
68 | if self._change_log.added_rows:
69 | # Split added data into parallel read streams.
70 | num_files = len(self._change_log.added_rows)
71 | num_streams = self._ray_options.max_parallelism
72 | shard_size = math.ceil(num_files / num_streams)
73 |
74 | shards = []
75 | for i in range(num_streams):
76 | start = i * shard_size
77 | end = min((i + 1) * shard_size, num_files)
78 | shards.append(self._change_log.added_rows[start:end])
79 |
80 | # Parallelism 1 means one reader for each read stream.
81 | # There are `ray_options.max_parallelism` read streams.
82 | # TODO: to measure performance and adjust.
83 | yield ChangeData(self._snapshot_id, ChangeType.ADD, [
84 | self._ray_dataset(s, self._read_options, parallelism=1)
85 | for s in shards
86 | ])
87 |
88 | def _ray_dataset(self, bitmaps: Iterable[meta.RowBitmap],
89 | read_options: ReadOptions,
90 | parallelism: int) -> ray.data.Dataset:
91 | return ray.data.read_datasource(ray_data_sources.SpaceDataSource(),
92 | storage=self._storage,
93 | ray_options=self._ray_options,
94 | read_options=read_options,
95 | file_set=self._bitmaps_to_file_set(bitmaps),
96 | parallelism=parallelism)
97 |
--------------------------------------------------------------------------------
/python/src/space/ray/ops/delete.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Distributed delete operation using Ray."""
16 |
17 | from __future__ import annotations
18 | from typing import Optional
19 |
20 | import pyarrow.compute as pc
21 | import ray
22 |
23 | from space.core.ops import utils
24 | from space.core.ops.delete import BaseDeleteOp, FileSetDeleteOp
25 | from space.core.options import FileOptions
26 | from space.core.proto import metadata_pb2 as meta
27 | from space.core.proto import runtime_pb2 as rt
28 | from space.core.storage import Storage
29 | from space.core.utils.paths import StoragePathsMixin
30 |
31 |
32 | class RayDeleteOp(BaseDeleteOp, StoragePathsMixin):
33 | """Ray delete operation that processes files in a distributed manner."""
34 |
35 | def __init__(self, storage: Storage, filter_: pc.Expression,
36 | file_options: FileOptions):
37 | StoragePathsMixin.__init__(self, storage.location)
38 |
39 | self._storage = storage
40 | self._filter = filter_
41 | self._file_options = file_options
42 |
43 | def delete(self) -> Optional[rt.Patch]:
44 | """Delete data matching the filter from the dataset."""
45 | metadata = self._storage.metadata
46 | matched_file_set = self._storage.data_files(self._filter)
47 |
48 | remote_delete_patches = []
49 | for index_file in matched_file_set.index_files:
50 | # Deletion only needs index file information (no record file information).
51 | file_set = rt.FileSet(
52 | index_files=[index_file],
53 | # TODO: attach all manifest files here, to select related manifests.
54 | index_manifest_files=matched_file_set.index_manifest_files)
55 |
56 | result = _delete.options( # type: ignore[attr-defined]
57 | num_returns=1).remote(self._storage.location, metadata, file_set,
58 | self._filter, self._file_options)
59 | remote_delete_patches.append(result)
60 |
61 | patches = ray.get(remote_delete_patches)
62 | return utils.merge_patches(patches)
63 |
64 |
65 | @ray.remote
66 | def _delete(location: str, metadata: meta.StorageMetadata, file_set: rt.FileSet,
67 | filter_: pc.Expression,
68 | file_options: FileOptions) -> Optional[rt.Patch]:
69 | return FileSetDeleteOp(location, metadata, file_set, filter_,
70 | file_options).delete()
71 |
--------------------------------------------------------------------------------
/python/src/space/ray/ops/insert.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Distributed insert operation using Ray."""
16 |
17 | from typing import List, Optional
18 |
19 | import pyarrow as pa
20 | import pyarrow.compute as pc
21 | import ray
22 |
23 | from space.ray.ops.append import RayAppendOp
24 | from space.core.ops.insert import InsertOptions, LocalInsertOp
25 | from space.core.ops.insert import filter_matched
26 | from space.core.options import FileOptions
27 | import space.core.proto.metadata_pb2 as meta
28 | import space.core.proto.runtime_pb2 as rt
29 | from space.core.storage import Storage
30 | from space.core.utils import errors
31 | from space.ray.options import RayOptions
32 |
33 |
34 | class RayInsertOp(LocalInsertOp):
35 | """Insert data into a dataset with a distributed duplication check."""
36 |
37 | def __init__(self, storage: Storage, options: InsertOptions,
38 | ray_options: RayOptions, file_options: FileOptions):
39 | LocalInsertOp.__init__(self, storage, options, file_options)
40 | self._ray_options = ray_options
41 |
42 | def _check_duplication(self, data_files: rt.FileSet, filter_: pc.Expression):
43 | remote_duplicated_values = []
44 | for index_file in data_files.index_files:
45 | # pylint: disable=line-too-long
46 | remote_duplicated = _remote_filter_matched.options( # type: ignore[attr-defined]
47 | num_returns=1).remote(self._storage.location, self._metadata,
48 | rt.FileSet(index_files=[index_file]), filter_,
49 | self._storage.primary_keys)
50 | remote_duplicated_values.append(remote_duplicated)
51 |
52 | for duplicated in ray.get(remote_duplicated_values):
53 | if duplicated:
54 | raise errors.PrimaryKeyExistError("Primary key to insert already exists")
55 |
56 | def _append(self, data: pa.Table, patches: List[Optional[rt.Patch]]) -> None:
57 | append_op = RayAppendOp(self._location, self._metadata, self._ray_options,
58 | self._file_options)
59 | append_op.write(data)
60 | patches.append(append_op.finish())
61 |
62 |
63 | @ray.remote
64 | def _remote_filter_matched(location: str, metadata: meta.StorageMetadata,
65 | data_files: rt.FileSet, pk_filter: pc.Expression,
66 | primary_keys: List[str]) -> bool:
67 | return filter_matched(location, metadata, data_files, pk_filter, primary_keys)
68 |
--------------------------------------------------------------------------------
/python/src/space/ray/ops/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Utilities for Ray operations."""
16 |
17 | from __future__ import annotations
18 | from typing import Iterator, TYPE_CHECKING
19 |
20 | import ray
21 | import pyarrow as pa
22 |
23 | from space.core.storage import Storage
24 | from space.core.utils import errors
25 |
26 | if TYPE_CHECKING:
27 | from space.core.views import View
28 |
29 |
30 | def singleton_storage(view: View) -> Storage:
31 | """Return the singleton source storage for views with only one source
32 | dataset.
33 | """
34 | if len(view.sources) == 0:
35 | raise errors.UserInputError("Source of view not found")
36 |
37 | if len(view.sources) != 1:
38 | raise errors.UserInputError("Joining results of joins is not supported")
39 |
40 | return list(view.sources.values())[0].storage
41 |
42 |
43 | def iter_batches(ds: ray.data.Dataset) -> Iterator[pa.Table]:
44 | """Return an iterator of PyArrow tables from a Ray dataset."""
45 | # batch_size is None to use entire Ray blocks.
46 | for data in ds.iter_batches(batch_size=None,
47 | batch_format="pyarrow",
48 | drop_last=False):
49 | if data.num_rows > 0:
50 | yield data
51 |
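# Illustrative sketch (not from this module): stream a Ray dataset as pyarrow
# tables with iter_batches. ray.data.range(5) is just a stand-in demo dataset.
demo = ray.data.range(5)
total_rows = sum(batch.num_rows for batch in iter_batches(demo))
assert total_rows == 5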
--------------------------------------------------------------------------------
/python/src/space/ray/options.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | """Options of Space Ray lib."""
16 |
17 | from dataclasses import dataclass
18 |
19 |
20 | @dataclass
21 | class RayOptions:
22 | """Options of Ray runners."""
23 | # The max parallelism of computing resources to use in a Ray cluster.
24 | max_parallelism: int = 8
25 |
26 | # Enable using a row range of an index file as a Ray data block, in the Ray
27 | # datasource.
28 | #
29 | # When disabled, the minimal Ray block is data from one index file and
30 | # the records it references. Read batch size is achieved by repartitioning the
31 | # dataset. For an index Parquet file with 1 million rows, loading the block
32 | # needs to read all 1 million records, which is too expensive.
33 | #
34 | # If enabled, a Ray block size is capped by the provided read batch size.
35 | # The cost is possibly duplicated reads of index files. It should be disabled
36 | # when most data are stored in index files.
37 | enable_row_range_block: bool = True
38 |
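# Illustrative sketch (not from this module; the values are made up): a typical
# configuration. Lower max_parallelism for small clusters, and disable
# row-range blocks when most data lives in index (Parquet) files, per the
# trade-off described above.
options = RayOptions(max_parallelism=4, enable_row_range_block=False)
# Elsewhere in this repository (e.g. the tests), such options are passed to a
# dataset's Ray runner via `ds.ray(RayOptions(...))`.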
--------------------------------------------------------------------------------
/python/src/substrait/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/python/src/substrait/capabilities_pb2.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Generated by the protocol buffer compiler. DO NOT EDIT!
3 | # source: substrait/capabilities.proto
4 | """Generated protocol buffer code."""
5 | from google.protobuf.internal import builder as _builder
6 | from google.protobuf import descriptor as _descriptor
7 | from google.protobuf import descriptor_pool as _descriptor_pool
8 | from google.protobuf import symbol_database as _symbol_database
9 | # @@protoc_insertion_point(imports)
10 |
11 | _sym_db = _symbol_database.Default()
12 |
13 |
14 |
15 |
16 | DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x1csubstrait/capabilities.proto\x12\tsubstrait\"\xfb\x01\n\x0c\x43\x61pabilities\x12\x1a\n\x12substrait_versions\x18\x01 \x03(\t\x12$\n\x1c\x61\x64vanced_extension_type_urls\x18\x02 \x03(\t\x12\x42\n\x11simple_extensions\x18\x03 \x03(\x0b\x32\'.substrait.Capabilities.SimpleExtension\x1a\x65\n\x0fSimpleExtension\x12\x0b\n\x03uri\x18\x01 \x01(\t\x12\x15\n\rfunction_keys\x18\x02 \x03(\t\x12\x11\n\ttype_keys\x18\x03 \x03(\t\x12\x1b\n\x13type_variation_keys\x18\x04 \x03(\tBW\n\x12io.substrait.protoP\x01Z*github.com/substrait-io/substrait-go/proto\xaa\x02\x12Substrait.Protobufb\x06proto3')
17 |
18 | _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals())
19 | _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'substrait.capabilities_pb2', globals())
20 | if _descriptor._USE_C_DESCRIPTORS == False:
21 |
22 | DESCRIPTOR._options = None
23 | DESCRIPTOR._serialized_options = b'\n\022io.substrait.protoP\001Z*github.com/substrait-io/substrait-go/proto\252\002\022Substrait.Protobuf'
24 | _CAPABILITIES._serialized_start=44
25 | _CAPABILITIES._serialized_end=295
26 | _CAPABILITIES_SIMPLEEXTENSION._serialized_start=194
27 | _CAPABILITIES_SIMPLEEXTENSION._serialized_end=295
28 | # @@protoc_insertion_point(module_scope)
29 |
--------------------------------------------------------------------------------
/python/src/substrait/capabilities_pb2.pyi:
--------------------------------------------------------------------------------
1 | """
2 | @generated by mypy-protobuf. Do not edit manually!
3 | isort:skip_file
4 | SPDX-License-Identifier: Apache-2.0"""
5 | import builtins
6 | import collections.abc
7 | import google.protobuf.descriptor
8 | import google.protobuf.internal.containers
9 | import google.protobuf.message
10 | import sys
11 |
12 | if sys.version_info >= (3, 8):
13 | import typing as typing_extensions
14 | else:
15 | import typing_extensions
16 |
17 | DESCRIPTOR: google.protobuf.descriptor.FileDescriptor
18 |
19 | @typing_extensions.final
20 | class Capabilities(google.protobuf.message.Message):
21 | """Defines a set of Capabilities that a system (producer or consumer) supports."""
22 |
23 | DESCRIPTOR: google.protobuf.descriptor.Descriptor
24 |
25 | @typing_extensions.final
26 | class SimpleExtension(google.protobuf.message.Message):
27 | DESCRIPTOR: google.protobuf.descriptor.Descriptor
28 |
29 | URI_FIELD_NUMBER: builtins.int
30 | FUNCTION_KEYS_FIELD_NUMBER: builtins.int
31 | TYPE_KEYS_FIELD_NUMBER: builtins.int
32 | TYPE_VARIATION_KEYS_FIELD_NUMBER: builtins.int
33 | uri: builtins.str
34 | @property
35 | def function_keys(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]: ...
36 | @property
37 | def type_keys(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]: ...
38 | @property
39 | def type_variation_keys(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]: ...
40 | def __init__(
41 | self,
42 | *,
43 | uri: builtins.str = ...,
44 | function_keys: collections.abc.Iterable[builtins.str] | None = ...,
45 | type_keys: collections.abc.Iterable[builtins.str] | None = ...,
46 | type_variation_keys: collections.abc.Iterable[builtins.str] | None = ...,
47 | ) -> None: ...
48 | def ClearField(self, field_name: typing_extensions.Literal["function_keys", b"function_keys", "type_keys", b"type_keys", "type_variation_keys", b"type_variation_keys", "uri", b"uri"]) -> None: ...
49 |
50 | SUBSTRAIT_VERSIONS_FIELD_NUMBER: builtins.int
51 | ADVANCED_EXTENSION_TYPE_URLS_FIELD_NUMBER: builtins.int
52 | SIMPLE_EXTENSIONS_FIELD_NUMBER: builtins.int
53 | @property
54 | def substrait_versions(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]:
55 | """List of Substrait versions this system supports"""
56 | @property
57 | def advanced_extension_type_urls(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]:
58 | """list of com.google.Any message types this system supports for advanced
59 | extensions.
60 | """
61 | @property
62 | def simple_extensions(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___Capabilities.SimpleExtension]:
63 | """list of simple extensions this system supports."""
64 | def __init__(
65 | self,
66 | *,
67 | substrait_versions: collections.abc.Iterable[builtins.str] | None = ...,
68 | advanced_extension_type_urls: collections.abc.Iterable[builtins.str] | None = ...,
69 | simple_extensions: collections.abc.Iterable[global___Capabilities.SimpleExtension] | None = ...,
70 | ) -> None: ...
71 | def ClearField(self, field_name: typing_extensions.Literal["advanced_extension_type_urls", b"advanced_extension_type_urls", "simple_extensions", b"simple_extensions", "substrait_versions", b"substrait_versions"]) -> None: ...
72 |
73 | global___Capabilities = Capabilities
74 |
--------------------------------------------------------------------------------
/python/src/substrait/extended_expression_pb2.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Generated by the protocol buffer compiler. DO NOT EDIT!
3 | # source: substrait/extended_expression.proto
4 | """Generated protocol buffer code."""
5 | from google.protobuf.internal import builder as _builder
6 | from google.protobuf import descriptor as _descriptor
7 | from google.protobuf import descriptor_pool as _descriptor_pool
8 | from google.protobuf import symbol_database as _symbol_database
9 | # @@protoc_insertion_point(imports)
10 |
11 | _sym_db = _symbol_database.Default()
12 |
13 |
14 | from substrait import algebra_pb2 as substrait_dot_algebra__pb2
15 | from substrait.extensions import extensions_pb2 as substrait_dot_extensions_dot_extensions__pb2
16 | from substrait import plan_pb2 as substrait_dot_plan__pb2
17 | from substrait import type_pb2 as substrait_dot_type__pb2
18 |
19 |
20 | DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n#substrait/extended_expression.proto\x12\tsubstrait\x1a\x17substrait/algebra.proto\x1a%substrait/extensions/extensions.proto\x1a\x14substrait/plan.proto\x1a\x14substrait/type.proto\"\x96\x01\n\x13\x45xpressionReference\x12+\n\nexpression\x18\x01 \x01(\x0b\x32\x15.substrait.ExpressionH\x00\x12/\n\x07measure\x18\x02 \x01(\x0b\x32\x1c.substrait.AggregateFunctionH\x00\x12\x14\n\x0coutput_names\x18\x03 \x03(\tB\x0b\n\texpr_type\"\x87\x03\n\x12\x45xtendedExpression\x12#\n\x07version\x18\x07 \x01(\x0b\x32\x12.substrait.Version\x12@\n\x0e\x65xtension_uris\x18\x01 \x03(\x0b\x32(.substrait.extensions.SimpleExtensionURI\x12\x44\n\nextensions\x18\x02 \x03(\x0b\x32\x30.substrait.extensions.SimpleExtensionDeclaration\x12\x35\n\rreferred_expr\x18\x03 \x03(\x0b\x32\x1e.substrait.ExpressionReference\x12+\n\x0b\x62\x61se_schema\x18\x04 \x01(\x0b\x32\x16.substrait.NamedStruct\x12\x44\n\x13\x61\x64vanced_extensions\x18\x05 \x01(\x0b\x32\'.substrait.extensions.AdvancedExtension\x12\x1a\n\x12\x65xpected_type_urls\x18\x06 \x03(\tBW\n\x12io.substrait.protoP\x01Z*github.com/substrait-io/substrait-go/proto\xaa\x02\x12Substrait.Protobufb\x06proto3')
21 |
22 | _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals())
23 | _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'substrait.extended_expression_pb2', globals())
24 | if _descriptor._USE_C_DESCRIPTORS == False:
25 |
26 | DESCRIPTOR._options = None
27 | DESCRIPTOR._serialized_options = b'\n\022io.substrait.protoP\001Z*github.com/substrait-io/substrait-go/proto\252\002\022Substrait.Protobuf'
28 | _EXPRESSIONREFERENCE._serialized_start=159
29 | _EXPRESSIONREFERENCE._serialized_end=309
30 | _EXTENDEDEXPRESSION._serialized_start=312
31 | _EXTENDEDEXPRESSION._serialized_end=703
32 | # @@protoc_insertion_point(module_scope)
33 |
--------------------------------------------------------------------------------
/python/src/substrait/extensions/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/python/src/substrait/extensions/extensions_pb2.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Generated by the protocol buffer compiler. DO NOT EDIT!
3 | # source: substrait/extensions/extensions.proto
4 | """Generated protocol buffer code."""
5 | from google.protobuf.internal import builder as _builder
6 | from google.protobuf import descriptor as _descriptor
7 | from google.protobuf import descriptor_pool as _descriptor_pool
8 | from google.protobuf import symbol_database as _symbol_database
9 | # @@protoc_insertion_point(imports)
10 |
11 | _sym_db = _symbol_database.Default()
12 |
13 |
14 | from google.protobuf import any_pb2 as google_dot_protobuf_dot_any__pb2
15 |
16 |
17 | DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n%substrait/extensions/extensions.proto\x12\x14substrait.extensions\x1a\x19google/protobuf/any.proto\"?\n\x12SimpleExtensionURI\x12\x1c\n\x14\x65xtension_uri_anchor\x18\x01 \x01(\r\x12\x0b\n\x03uri\x18\x02 \x01(\t\"\xef\x04\n\x1aSimpleExtensionDeclaration\x12X\n\x0e\x65xtension_type\x18\x01 \x01(\x0b\x32>.substrait.extensions.SimpleExtensionDeclaration.ExtensionTypeH\x00\x12k\n\x18\x65xtension_type_variation\x18\x02 \x01(\x0b\x32G.substrait.extensions.SimpleExtensionDeclaration.ExtensionTypeVariationH\x00\x12`\n\x12\x65xtension_function\x18\x03 \x01(\x0b\x32\x42.substrait.extensions.SimpleExtensionDeclaration.ExtensionFunctionH\x00\x1aS\n\rExtensionType\x12\x1f\n\x17\x65xtension_uri_reference\x18\x01 \x01(\r\x12\x13\n\x0btype_anchor\x18\x02 \x01(\r\x12\x0c\n\x04name\x18\x03 \x01(\t\x1a\x66\n\x16\x45xtensionTypeVariation\x12\x1f\n\x17\x65xtension_uri_reference\x18\x01 \x01(\r\x12\x1d\n\x15type_variation_anchor\x18\x02 \x01(\r\x12\x0c\n\x04name\x18\x03 \x01(\t\x1a[\n\x11\x45xtensionFunction\x12\x1f\n\x17\x65xtension_uri_reference\x18\x01 \x01(\r\x12\x17\n\x0f\x66unction_anchor\x18\x02 \x01(\r\x12\x0c\n\x04name\x18\x03 \x01(\tB\x0e\n\x0cmapping_type\"j\n\x11\x41\x64vancedExtension\x12*\n\x0coptimization\x18\x01 \x01(\x0b\x32\x14.google.protobuf.Any\x12)\n\x0b\x65nhancement\x18\x02 \x01(\x0b\x32\x14.google.protobuf.AnyBb\n\x12io.substrait.protoP\x01Z5github.com/substrait-io/substrait-go/proto/extensions\xaa\x02\x12Substrait.Protobufb\x06proto3')
18 |
19 | _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals())
20 | _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'substrait.extensions.extensions_pb2', globals())
21 | if _descriptor._USE_C_DESCRIPTORS == False:
22 |
23 | DESCRIPTOR._options = None
24 | DESCRIPTOR._serialized_options = b'\n\022io.substrait.protoP\001Z5github.com/substrait-io/substrait-go/proto/extensions\252\002\022Substrait.Protobuf'
25 | _SIMPLEEXTENSIONURI._serialized_start=90
26 | _SIMPLEEXTENSIONURI._serialized_end=153
27 | _SIMPLEEXTENSIONDECLARATION._serialized_start=156
28 | _SIMPLEEXTENSIONDECLARATION._serialized_end=779
29 | _SIMPLEEXTENSIONDECLARATION_EXTENSIONTYPE._serialized_start=483
30 | _SIMPLEEXTENSIONDECLARATION_EXTENSIONTYPE._serialized_end=566
31 | _SIMPLEEXTENSIONDECLARATION_EXTENSIONTYPEVARIATION._serialized_start=568
32 | _SIMPLEEXTENSIONDECLARATION_EXTENSIONTYPEVARIATION._serialized_end=670
33 | _SIMPLEEXTENSIONDECLARATION_EXTENSIONFUNCTION._serialized_start=672
34 | _SIMPLEEXTENSIONDECLARATION_EXTENSIONFUNCTION._serialized_end=763
35 | _ADVANCEDEXTENSION._serialized_start=781
36 | _ADVANCEDEXTENSION._serialized_end=887
37 | # @@protoc_insertion_point(module_scope)
38 |
--------------------------------------------------------------------------------
/python/src/substrait/plan_pb2.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Generated by the protocol buffer compiler. DO NOT EDIT!
3 | # source: substrait/plan.proto
4 | """Generated protocol buffer code."""
5 | from google.protobuf.internal import builder as _builder
6 | from google.protobuf import descriptor as _descriptor
7 | from google.protobuf import descriptor_pool as _descriptor_pool
8 | from google.protobuf import symbol_database as _symbol_database
9 | # @@protoc_insertion_point(imports)
10 |
11 | _sym_db = _symbol_database.Default()
12 |
13 |
14 | from substrait import algebra_pb2 as substrait_dot_algebra__pb2
15 | from substrait.extensions import extensions_pb2 as substrait_dot_extensions_dot_extensions__pb2
16 |
17 |
18 | DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x14substrait/plan.proto\x12\tsubstrait\x1a\x17substrait/algebra.proto\x1a%substrait/extensions/extensions.proto\"X\n\x07PlanRel\x12\x1d\n\x03rel\x18\x01 \x01(\x0b\x32\x0e.substrait.RelH\x00\x12\"\n\x04root\x18\x02 \x01(\x0b\x32\x12.substrait.RelRootH\x00\x42\n\n\x08rel_type\"\xbc\x02\n\x04Plan\x12#\n\x07version\x18\x06 \x01(\x0b\x32\x12.substrait.Version\x12@\n\x0e\x65xtension_uris\x18\x01 \x03(\x0b\x32(.substrait.extensions.SimpleExtensionURI\x12\x44\n\nextensions\x18\x02 \x03(\x0b\x32\x30.substrait.extensions.SimpleExtensionDeclaration\x12%\n\trelations\x18\x03 \x03(\x0b\x32\x12.substrait.PlanRel\x12\x44\n\x13\x61\x64vanced_extensions\x18\x04 \x01(\x0b\x32\'.substrait.extensions.AdvancedExtension\x12\x1a\n\x12\x65xpected_type_urls\x18\x05 \x03(\t\"2\n\x0bPlanVersion\x12#\n\x07version\x18\x06 \x01(\x0b\x32\x12.substrait.Version\"o\n\x07Version\x12\x14\n\x0cmajor_number\x18\x01 \x01(\r\x12\x14\n\x0cminor_number\x18\x02 \x01(\r\x12\x14\n\x0cpatch_number\x18\x03 \x01(\r\x12\x10\n\x08git_hash\x18\x04 \x01(\t\x12\x10\n\x08producer\x18\x05 \x01(\tBW\n\x12io.substrait.protoP\x01Z*github.com/substrait-io/substrait-go/proto\xaa\x02\x12Substrait.Protobufb\x06proto3')
19 |
20 | _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals())
21 | _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'substrait.plan_pb2', globals())
22 | if _descriptor._USE_C_DESCRIPTORS == False:
23 |
24 | DESCRIPTOR._options = None
25 | DESCRIPTOR._serialized_options = b'\n\022io.substrait.protoP\001Z*github.com/substrait-io/substrait-go/proto\252\002\022Substrait.Protobuf'
26 | _PLANREL._serialized_start=99
27 | _PLANREL._serialized_end=187
28 | _PLAN._serialized_start=190
29 | _PLAN._serialized_end=506
30 | _PLANVERSION._serialized_start=508
31 | _PLANVERSION._serialized_end=558
32 | _VERSION._serialized_start=560
33 | _VERSION._serialized_end=671
34 | # @@protoc_insertion_point(module_scope)
35 |
--------------------------------------------------------------------------------
/python/tests/catalogs/test_directory.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import os
16 | from typing import Dict
17 |
18 | import numpy as np
19 | import pyarrow as pa
20 | import pytest
21 |
22 | from space import DatasetInfo, DirCatalog, RayOptions
23 | from space.core.utils import errors
24 |
25 |
26 | # A sample UDF for testing.
27 | def _sample_map_udf(batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
28 | batch["float64"] = batch["float64"] + 1
29 | return batch
30 |
31 |
32 | class TestDirectoryCatalog:
33 |
34 | def test_dataset_crud(self, tmp_path):
35 | schema = pa.schema([("f", pa.int64())])
36 | pks = ["f"]
37 | records = []
38 |
39 | location = str(tmp_path / "cat")
40 | cat = DirCatalog(location)
41 |
42 | with pytest.raises(FileNotFoundError):
43 | cat.datasets()
44 |
45 | os.mkdir(location)
46 | assert not cat.datasets()
47 |
48 | ds1 = cat.create_dataset("ds1", schema, pks, records)
49 | ds1_data = {"f": [1, 2, 3]}
50 | ds1.local().append(ds1_data)
51 |
52 | ds1_loaded = cat.dataset("ds1")
53 | assert ds1_loaded.local().read_all().to_pydict() == ds1_data
54 |
55 | ds1_info = DatasetInfo("ds1", ds1_loaded.storage.location)
56 | assert cat.datasets() == [ds1_info]
57 |
58 | ds2 = cat.create_dataset("ds2", schema, pks, records)
59 |
60 | key_fn = lambda ds: ds.location # pylint: disable=unnecessary-lambda-assignment
61 | assert sorted(cat.datasets(), key=key_fn) == sorted(
62 | [ds1_info, DatasetInfo("ds2", ds2.storage.location)], key=key_fn)
63 |
64 | with pytest.raises(errors.StorageExistError) as excinfo:
65 | cat.create_dataset("ds2", schema, pks, records)
66 |
67 | assert "already exists" in str(excinfo.value)
68 |
69 | with pytest.raises(errors.StorageNotFoundError) as excinfo:
70 | cat.dataset("ds_not_exist")
71 |
72 | assert "Failed to open local file" in str(excinfo.value)
73 |
74 | def test_materialized_view_crud(self, tmp_path):
75 | schema = pa.schema([("f", pa.int64()), ("float64", pa.float64())])
76 | pks = ["f"]
77 | records = []
78 |
79 | location = str(tmp_path / "cat")
80 | cat = DirCatalog(location)
81 |
82 | ds = cat.create_dataset("ds", schema, pks, records)
83 | view = ds.map_batches(fn=_sample_map_udf,
84 | input_fields=["f", "float64"],
85 | output_schema=schema,
86 | output_record_fields=[])
87 |
88 | mv1 = cat.materialize("mv1", view)
89 |
90 | ds.local().append({"f": [1, 2, 3], "float64": [0.1, 0.2, 0.3]})
91 | mv1.ray(RayOptions(max_parallelism=1)).refresh()
92 | expected_data = {"f": [1, 2, 3], "float64": [1.1, 1.2, 1.3]}
93 | assert mv1.local().read_all().to_pydict() == expected_data
94 |
95 | mv1_loaded = cat.dataset("mv1")
96 | assert mv1_loaded.local().read_all().to_pydict() == expected_data
97 |
98 | with pytest.raises(errors.StorageExistError):
99 | cat.materialize("mv1", view)
100 |
101 | with pytest.raises(errors.StorageExistError):
102 | cat.materialize("ds", view)
103 |
104 | key_fn = lambda ds: ds.location # pylint: disable=unnecessary-lambda-assignment
105 | assert sorted(cat.datasets(), key=key_fn) == sorted([
106 | DatasetInfo("ds", ds.storage.location),
107 | DatasetInfo("mv1", mv1.storage.location)
108 | ],
109 | key=key_fn)
110 |
--------------------------------------------------------------------------------
/python/tests/core/conftest.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 |
17 |
18 | @pytest.fixture
19 | def sample_map_batch_plan() -> str:
20 | return """extension_uris {
21 | extension_uri_anchor: 1
22 | uri: "urn:space:substrait_simple_extension_function"
23 | }
24 | extensions {
25 | extension_function {
26 | extension_uri_reference: 1
27 | function_anchor: 1
28 | name: ""
29 | }
30 | }
31 | relations {
32 | root {
33 | input {
34 | project {
35 | input {
36 | read {
37 | base_schema {
38 | names: "int64"
39 | names: "float64"
40 | names: "binary"
41 | struct {
42 | types {
43 | i64 {
44 | }
45 | }
46 | types {
47 | fp64 {
48 | type_variation_reference: 1
49 | }
50 | }
51 | types {
52 | binary {
53 | type_variation_reference: 2
54 | }
55 | }
56 | }
57 | }
58 | named_table {
59 | names: ""
60 | }
61 | }
62 | }
63 | expressions {
64 | scalar_function {
65 | function_reference: 1
66 | arguments {
67 | value {
68 | selection {
69 | direct_reference {
70 | struct_field {
71 | }
72 | }
73 | }
74 | }
75 | }
76 | arguments {
77 | value {
78 | selection {
79 | direct_reference {
80 | struct_field {
81 | field: 2
82 | }
83 | }
84 | }
85 | }
86 | }
87 | }
88 | }
89 | }
90 | }
91 | }
92 | }
93 | """
94 |
95 |
96 | @pytest.fixture
97 | def sample_filter_plan() -> str:
98 | return """extension_uris {
99 | extension_uri_anchor: 1
100 | uri: "urn:space:substrait_simple_extension_function"
101 | }
102 | extensions {
103 | extension_function {
104 | extension_uri_reference: 1
105 | function_anchor: 1
106 | name: ""
107 | }
108 | }
109 | relations {
110 | root {
111 | input {
112 | filter {
113 | input {
114 | read {
115 | base_schema {
116 | names: "int64"
117 | names: "float64"
118 | names: "binary"
119 | struct {
120 | types {
121 | i64 {
122 | }
123 | }
124 | types {
125 | fp64 {
126 | type_variation_reference: 1
127 | }
128 | }
129 | types {
130 | binary {
131 | type_variation_reference: 2
132 | }
133 | }
134 | }
135 | }
136 | named_table {
137 | names: ""
138 | }
139 | }
140 | }
141 | condition {
142 | scalar_function {
143 | function_reference: 1
144 | output_type {
145 | bool {
146 | }
147 | }
148 | arguments {
149 | value {
150 | selection {
151 | direct_reference {
152 | struct_field {
153 | }
154 | }
155 | }
156 | }
157 | }
158 | arguments {
159 | value {
160 | selection {
161 | direct_reference {
162 | struct_field {
163 | field: 1
164 | }
165 | }
166 | }
167 | }
168 | }
169 | }
170 | }
171 | }
172 | }
173 | }
174 | }
175 | """
176 |
--------------------------------------------------------------------------------
/python/tests/core/fs/test_arrow.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 |
17 | from space.core.fs.arrow import ArrowLocalFileSystem
18 | import space.core.proto.metadata_pb2 as meta
19 | from space.core.utils import errors
20 |
21 |
22 | class TestArrowLocalFileSystem:
23 |
24 | @pytest.fixture
25 | def fs(self):
26 | return ArrowLocalFileSystem()
27 |
28 | def test_create_dir(self, tmp_path, fs):
29 | dir_path = tmp_path / "test_create_dir"
30 | fs.create_dir(str(dir_path))
31 | assert dir_path.exists()
32 |
33 | def _read_proto(self, fs, file_path):
34 | read_msg = meta.StorageMetadata()
35 | fs.read_proto(file_path, read_msg)
36 | return read_msg
37 |
38 | def test_write_read_proto(self, tmp_path, fs):
39 | dir_path = tmp_path / "test_write_read_proto"
40 | fs.create_dir(str(dir_path))
41 |
42 | file_path = str(dir_path / "output.txtpb")
43 | write_msg = meta.StorageMetadata(current_snapshot_id=100)
44 | fs.write_proto(file_path, write_msg)
45 | assert dir_path.exists()
46 |
47 | assert self._read_proto(fs, file_path) == write_msg
48 |
49 | def test_overwrite_proto_file(self, tmp_path, fs):
50 | dir_path = tmp_path / "test_overwrite_proto_file"
51 | fs.create_dir(str(dir_path))
52 |
53 | file_path = str(dir_path / "output.txtpb")
54 | write_msg = meta.StorageMetadata(current_snapshot_id=100)
55 | fs.write_proto(file_path, write_msg)
56 | assert self._read_proto(fs, file_path).current_snapshot_id == 100
57 |
58 | write_msg = meta.StorageMetadata(current_snapshot_id=200)
59 | fs.write_proto(file_path, write_msg)
60 | assert self._read_proto(fs, file_path).current_snapshot_id == 200
61 |
62 | with pytest.raises(errors.FileExistError) as excinfo:
63 | fs.write_proto(file_path, write_msg, fail_if_exists=True)
64 |
65 | assert "already exists" in str(excinfo.value)
66 |
--------------------------------------------------------------------------------
/python/tests/core/loaders/test_array_record.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from typing import List
16 |
17 | import numpy as np
18 | import pyarrow as pa
19 | import pytest
20 | from tensorflow_datasets import features as f # type: ignore[import-untyped]
21 |
22 | from space import Dataset, TfFeatures
23 | import space.core.proto.metadata_pb2 as meta
24 | from space.core.utils.lazy_imports_utils import array_record_module as ar
25 | from space.core.utils.uuids import uuid_
26 |
27 |
28 | class TestLocalArrayRecordLoadOp:
29 |
30 | @pytest.fixture
31 | def tf_features(self):
32 | features_dict = f.FeaturesDict({
33 | "image_id": np.int64,
34 | "objects": f.Sequence({"bbox": f.BBoxFeature()}),
35 | })
36 | return TfFeatures(features_dict)
37 |
38 | def test_append_array_record(self, tmp_path, tf_features):
39 | schema = pa.schema([("id", pa.int64()), ("num_objects", pa.int64()),
40 | ("features", tf_features)])
41 | ds = Dataset.create(str(tmp_path / "dataset"),
42 | schema,
43 | primary_keys=["id"],
44 | record_fields=["features"])
45 |
46 | features_data = [{
47 | "image_id": 123,
48 | "objects": {
49 | "bbox": np.array([[0.3, 0.8, 0.5, 1.0]], np.float32)
50 | }
51 | }, {
52 | "image_id": 456,
53 | "objects": {
54 | "bbox":
55 | np.array([[0.1, 0.2, 0.3, 0.4], [0.1, 0.2, 0.3, 0.4]],
56 | np.float32)
57 | }
58 | }]
59 |
60 | # Create dummy ArrayRecord files.
61 | input_dir = tmp_path / "array_record"
62 | input_dir.mkdir(parents=True)
63 | _write_array_record_files(input_dir,
64 | [tf_features.serialize(r) for r in features_data])
65 |
66 | def index_fn(record):
67 | assert len(record['features']) == 1
68 | features = record['features'][0]
69 | return {
70 | "id": features["image_id"],
71 | 'num_objects': features["objects"]["bbox"].shape[0]
72 | }
73 |
74 | runner = ds.local()
75 | response = runner.append_array_record(f"{input_dir}/*.array_record",
76 | index_fn)
77 | assert response.storage_statistics_update == meta.StorageStatistics(
78 | num_rows=2,
79 | index_compressed_bytes=104,
80 | index_uncompressed_bytes=100,
81 | record_uncompressed_bytes=135)
82 |
83 | index_data = runner.read_all().select(["id", "num_objects"])
84 | assert index_data == pa.Table.from_pydict({
85 | "id": [123, 456],
86 | "num_objects": [1, 2]
87 | })
88 |
89 |
90 | def _write_array_record_files(input_dir, records: List[bytes]):
91 | file_path = f"{uuid_()}.array_record"
92 | writer = ar.ArrayRecordWriter(str(input_dir / file_path), options="")
93 | for r in records:
94 | writer.write(r)
95 |
96 | writer.close()
97 |
--------------------------------------------------------------------------------
/python/tests/core/loaders/test_parquet.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pyarrow as pa
16 |
17 | from space import Dataset
18 | import space.core.proto.metadata_pb2 as meta
19 | from space.core.fs.parquet import write_parquet_file
20 |
21 |
22 | class TestLocalParquetLoadOp:
23 |
24 | def test_append_parquet(self, tmp_path):
25 | schema = pa.schema([
26 | pa.field("int64", pa.int64()),
27 | pa.field("float64", pa.float64()),
28 | pa.field("bool", pa.bool_()),
29 | pa.field("string", pa.string())
30 | ])
31 | ds = Dataset.create(str(tmp_path / "dataset"),
32 | schema,
33 | primary_keys=["int64"],
34 | record_fields=[])
35 | ds.add_tag("empty")
36 |
37 | input_data = [{
38 | "int64": [1, 2, 3],
39 | "float64": [0.1, 0.2, 0.3],
40 | "bool": [True, False, False],
41 | "string": ["a", "b", "c"]
42 | }, {
43 | "int64": [0, 10],
44 | "float64": [-0.1, 100.0],
45 | "bool": [False, False],
46 | "string": ["A", "z"]
47 | }]
48 |
49 | # Create dummy Parquet files.
50 | input_dir = tmp_path / "parquet"
51 | input_dir.mkdir(parents=True)
52 |
53 | file0 = str(input_dir / "file0.parquet")
54 | file1 = str(input_dir / "file1.parquet")
55 | write_parquet_file(file0, schema, [pa.Table.from_pydict(input_data[0])])
56 | write_parquet_file(file1, schema, [pa.Table.from_pydict(input_data[1])])
57 |
58 | runner = ds.local()
59 | response = runner.append_parquet(f"{input_dir}/*.parquet")
60 | ds.add_tag("after_append")
61 | assert response.storage_statistics_update == meta.StorageStatistics(
62 | num_rows=5,
63 | index_compressed_bytes=214,
64 | index_uncompressed_bytes=209,
65 | record_uncompressed_bytes=0)
66 |
67 | index_data = pa.concat_tables(
68 | (list(runner.read()))).combine_chunks().sort_by("int64")
69 | assert index_data == pa.concat_tables([
70 | pa.Table.from_pydict(d) for d in input_data
71 | ]).combine_chunks().sort_by("int64")
72 |
73 | assert not ds.index_files(version="empty")
74 | assert sorted(ds.index_files(version="after_append")) == [file0, file1]
75 |
--------------------------------------------------------------------------------
/python/tests/core/manifests/test_falsifiable_filters.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pyarrow as pa
16 | import pyarrow.compute as pc
17 | import pytest
18 |
19 | from space.core.manifests import falsifiable_filters as ff
20 |
21 |
22 | @pytest.mark.parametrize(
23 | "filter_,expected_falsifiable_filter",
24 | [
25 | ((pc.field("a") < 10) | (pc.field("b") > 1),
26 | (pc.field("_STATS_f0", "_MIN") >= 10) &
27 | (pc.field("_STATS_f1", "_MAX") <= 1)),
28 | ((10 > pc.field("a")) | (1 < pc.field("b")),
29 | (pc.field("_STATS_f0", "_MIN") >= 10) &
30 | (pc.field("_STATS_f1", "_MAX") <= 1)),
31 | ((pc.field("a") > 10) & (pc.field("b") == 1),
32 | (pc.field("_STATS_f0", "_MAX") <= 10) |
33 | ((pc.field("_STATS_f1", "_MIN") > 1) |
34 | (pc.field("_STATS_f1", "_MAX") < 1))),
35 | ((pc.field("a") != 10), (pc.field("_STATS_f0", "_MIN") == 10) &
36 | (pc.field("_STATS_f0", "_MAX") == 10)),
37 | # Only primary keys are used.
38 | ((pc.field("a") < 10) &
39 | (pc.field("c") > "a"), pc.field("_STATS_f0", "_MIN") >= 10),
40 | # Corner cases.
41 | (pc.scalar(False), ~pc.scalar(False)),
42 | ((pc.scalar(False) | (pc.field("a") <= 10)),
43 | (~pc.scalar(False) & (pc.field("_STATS_f0", "_MIN") > 10))),
44 | (~(pc.field("a") >= 10), ~(pc.field("_STATS_f0", "_MAX") < 10)),
45 | (~(10 <= pc.field("a")), ~(pc.field("_STATS_f0", "_MAX") < 10)),
46 | (pc.field("a") > pc.field("a"), pc.field("_STATS_f0", "_MAX")
47 | <= pc.field("_STATS_f0", "_MIN")),
48 | (pc.scalar(1) < pc.scalar(2), pc.scalar(1) >= pc.scalar(2))
49 | ])
50 | def test_build_manifest_filter(filter_, expected_falsifiable_filter):
51 | arrow_schema = pa.schema([("a", pa.int64()), ("b", pa.float64()),
52 | ("c", pa.string())])
53 | field_name_ids = {"a": 0, "b": 1, "c": 2}
54 |
55 | manifest_filter = ff.build_manifest_filter(arrow_schema, set(["a", "b"]),
56 | field_name_ids, filter_)
57 | assert str(manifest_filter) == str(~expected_falsifiable_filter)
58 |
59 |
60 | @pytest.mark.parametrize("filter_", [
61 | pc.field("a"), (pc.field("a") < 10) | (pc.field("c") > "a"),
62 | pc.field("a") + 1 < pc.field("b")
63 | ])
64 | def test_build_manifest_filter_not_supported_return_none(filter_):
65 | arrow_schema = pa.schema([("a", pa.int64()), ("b", pa.int64()),
66 | ("c", pa.string())])
67 | field_name_ids = {"a": 0, "b": 1, "c": 2}
68 |
69 | assert ff.build_manifest_filter(arrow_schema, set(["a", "b"]),
70 | field_name_ids, filter_) is None
71 |
--------------------------------------------------------------------------------
/python/tests/core/manifests/test_record.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pyarrow.parquet as pq
16 |
17 | from space.core.manifests import RecordManifestWriter
18 | import space.core.proto.metadata_pb2 as meta
19 |
20 |
21 | class TestRecordManifestWriter:
22 |
23 | def test_write(self, tmp_path):
24 | metadata_dir = tmp_path / "dataset" / "metadata"
25 | metadata_dir.mkdir(parents=True)
26 |
27 | manifest_writer = RecordManifestWriter(metadata_dir=str(metadata_dir))
28 |
29 | manifest_writer.write(
30 | "data/file0.array_record", 0,
31 | meta.StorageStatistics(num_rows=123,
32 | index_compressed_bytes=10,
33 | index_uncompressed_bytes=20,
34 | record_uncompressed_bytes=30))
35 | manifest_writer.write(
36 | "data/file1.array_record", 1,
37 | meta.StorageStatistics(num_rows=456,
38 | index_compressed_bytes=10,
39 | index_uncompressed_bytes=20,
40 | record_uncompressed_bytes=100))
41 |
42 | manifest_path = manifest_writer.finish()
43 |
44 | assert manifest_path is not None
45 | assert pq.read_table(manifest_path).to_pydict() == {
46 | "_FILE": ["data/file0.array_record", "data/file1.array_record"],
47 | "_FIELD_ID": [0, 1],
48 | "_NUM_ROWS": [123, 456],
49 | "_UNCOMPRESSED_BYTES": [30, 100]
50 | }
51 |
52 | def test_empty_manifest_should_return_none(self, tmp_path):
53 | metadata_dir = tmp_path / "dataset" / "metadata"
54 | manifest_writer = RecordManifestWriter(metadata_dir=str(metadata_dir))
55 |
56 | assert manifest_writer.finish() is None
57 |
--------------------------------------------------------------------------------
/python/tests/core/ops/conftest.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import numpy as np
16 | import pyarrow as pa
17 | import pytest
18 | from tensorflow_datasets import features # type: ignore[import-untyped]
19 |
20 | from space.core.schema.types import TfFeatures
21 |
22 |
23 | # TODO: the test should cover all types supported by column stats.
24 | @pytest.fixture
25 | def all_types_schema():
26 | return pa.schema([
27 | pa.field("int64", pa.int64()),
28 | pa.field("float64", pa.float64()),
29 | pa.field("bool", pa.bool_()),
30 | pa.field("string", pa.string())
31 | ])
32 |
33 |
34 | @pytest.fixture
35 | def all_types_input_data():
36 | return [{
37 | "int64": [1, 2, 3],
38 | "float64": [0.1, 0.2, 0.3],
39 | "bool": [True, False, False],
40 | "string": ["a", "b", "c"]
41 | }, {
42 | "int64": [0, 10],
43 | "float64": [-0.1, 100.0],
44 | "bool": [False, False],
45 | "string": ["A", "z"]
46 | }]
47 |
48 |
49 | @pytest.fixture
50 | def record_fields_schema():
51 | tf_features_images = features.FeaturesDict(
52 | {"images": features.Image(shape=(None, None, 3), dtype=np.uint8)})
53 | tf_features_objects = features.FeaturesDict({
54 | "objects":
55 | features.Sequence({
56 | "bbox": features.BBoxFeature(),
57 | "id": np.int64
58 | }),
59 | })
60 |
61 | return pa.schema([
62 | pa.field("int64", pa.int64()),
63 | pa.field("string", pa.string()),
64 | pa.field("images", TfFeatures(tf_features_images)),
65 | pa.field("objects", TfFeatures(tf_features_objects))
66 | ])
67 |
68 |
69 | @pytest.fixture
70 | def record_fields_input_data():
71 | return [{
72 | "int64": [1, 2, 3],
73 | "string": ["a", "b", "c"],
74 | "images": [b"images0", b"images1", b"images2"],
75 | "objects": [b"objects0", b"objects1", b"objects2"]
76 | }, {
77 | "int64": [0, 10],
78 | "string": ["A", "z"],
79 | "images": [b"images3", b"images4"],
80 | "objects": [b"objects3", b"objects4"]
81 | }]
82 |
--------------------------------------------------------------------------------
/python/tests/core/ops/test_append.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from typing import List
16 | import pyarrow as pa
17 | import pyarrow.parquet as pq
18 |
19 | from space.core.ops.append import LocalAppendOp
20 | from space.core.options import FileOptions
21 | import space.core.proto.metadata_pb2 as meta
22 | from space.core.storage import Storage
23 |
24 | _default_file_options = FileOptions()
25 |
26 |
27 | class TestLocalAppendOp:
28 |
29 | # TODO: add tests using Arrow table input.
30 |
31 | def test_write_pydict_all_types(self, tmp_path, all_types_schema,
32 | all_types_input_data):
33 | location = tmp_path / "dataset"
34 | storage = Storage.create(location=str(location),
35 | schema=all_types_schema,
36 | primary_keys=["int64"],
37 | record_fields=[])
38 |
39 | op = LocalAppendOp(str(location), storage.metadata, _default_file_options)
40 | for batch in all_types_input_data:
41 | op.write(batch)
42 |
43 | patch = op.finish()
44 | assert patch is not None
45 |
46 | index_manifests = []
47 | for f in patch.addition.index_manifest_files:
48 | index_manifests.append(pq.read_table(storage.full_path(f)))
49 |
50 | index_manifest = pa.concat_tables(index_manifests).to_pydict()
51 | assert "_FILE" in index_manifest
52 |
53 | assert index_manifest == {
54 | "_FILE": index_manifest["_FILE"],
55 | "_INDEX_COMPRESSED_BYTES": [114],
56 | "_INDEX_UNCOMPRESSED_BYTES": [126],
57 | "_NUM_ROWS": [5],
58 | "_STATS_f0": [{
59 | "_MAX": 10,
60 | "_MIN": 0
61 | }]
62 | }
63 |
64 | assert patch.storage_statistics_update == meta.StorageStatistics(
65 | num_rows=5, index_compressed_bytes=114, index_uncompressed_bytes=126)
66 |
67 | def test_write_pydict_with_record_fields(self, tmp_path, record_fields_schema,
68 | record_fields_input_data):
69 | location = tmp_path / "dataset"
70 | storage = Storage.create(location=str(location),
71 | schema=record_fields_schema,
72 | primary_keys=["int64"],
73 | record_fields=["images", "objects"])
74 |
75 | op = LocalAppendOp(str(location), storage.metadata, _default_file_options)
76 | for batch in record_fields_input_data:
77 | op.write(batch)
78 |
79 | patch = op.finish()
80 | assert patch is not None
81 |
82 | # Validate index manifest files.
83 | index_manifest = self._read_manifests(
84 | storage, list(patch.addition.index_manifest_files))
85 | assert index_manifest == {
86 | "_FILE": index_manifest["_FILE"],
87 | "_INDEX_COMPRESSED_BYTES": [114],
88 | "_INDEX_UNCOMPRESSED_BYTES": [126],
89 | "_NUM_ROWS": [5],
90 | "_STATS_f0": [{
91 | "_MAX": 10,
92 | "_MIN": 0
93 | }]
94 | }
95 |
96 | # Validate record manifest files.
97 | record_manifest = self._read_manifests(
98 | storage, list(patch.addition.record_manifest_files))
99 | assert record_manifest == {
100 | "_FILE": record_manifest["_FILE"],
101 | "_FIELD_ID": [2, 3],
102 | "_NUM_ROWS": [5, 5],
103 | "_UNCOMPRESSED_BYTES": [55, 60]
104 | }
105 |
106 | # Data files exist.
107 | self._check_file_exists(location, index_manifest["_FILE"])
108 | self._check_file_exists(location, record_manifest["_FILE"])
109 | assert self._file_schema(
110 | location, index_manifest["_FILE"][0]) == storage.physical_schema
111 |
112 | # Validate statistics.
113 | assert patch.storage_statistics_update == meta.StorageStatistics(
114 | num_rows=5,
115 | index_compressed_bytes=114,
116 | index_uncompressed_bytes=126,
117 | record_uncompressed_bytes=115)
118 |
119 | def test_empty_op_return_none(self, tmp_path):
120 | location = tmp_path / "dataset"
121 | schema = pa.schema([pa.field("int64", pa.int64())])
122 | storage = Storage.create(location=str(location),
123 | schema=schema,
124 | primary_keys=["int64"],
125 | record_fields=[])
126 |
127 | op = LocalAppendOp(str(location), storage.metadata, _default_file_options)
128 | assert op.finish() is None
129 |
130 | def _read_manifests(self, storage: Storage,
131 | file_paths: List[str]) -> dict:
132 | manifests = []
133 | for f in file_paths:
134 | manifests.append(pq.read_table(storage.full_path(f)))
135 |
136 | return pa.concat_tables(manifests).to_pydict()
137 |
138 | def _check_file_exists(self, location, file_paths: List[str]):
139 | for f in file_paths:
140 | assert (location / f).exists()
141 |
142 | def _file_schema(self, location, file_path: str) -> pa.Schema:
143 | return pq.read_schema(str(location / file_path))
144 |
--------------------------------------------------------------------------------
/python/tests/core/ops/test_change_data.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pyarrow as pa
16 | import pyarrow.compute as pc
17 |
18 | import pytest
19 |
20 | from space.core.datasets import Dataset
21 | from space.core.ops.change_data import (ChangeData, ChangeType,
22 | ordered_snapshot_ids)
23 | from space.core.utils import errors
24 | from space.core.utils.uuids import random_id
25 |
26 |
27 | def test_read_change_data(tmp_path, all_types_schema, all_types_input_data):
28 | location = tmp_path / "dataset"
29 | ds = Dataset.create(location=str(location),
30 | schema=all_types_schema,
31 | primary_keys=["int64"],
32 | record_fields=[])
33 |
34 | # Validate ADD changes.
35 | runner = ds.local()
36 | runner.append_from(lambda: iter(all_types_input_data))
37 |
38 | changes = list(runner.diff(0, 1))
39 | assert len(changes) == 1
40 | expected_change0 = ChangeData(ds.storage.metadata.current_snapshot_id,
41 | ChangeType.ADD, runner.read_all())
42 | assert changes[0] == expected_change0
43 |
44 | # Validate DELETE changes.
45 | runner.delete((pc.field("string") == "a") | (pc.field("string") == "A"))
46 | changes = list(runner.diff(1, 2))
47 | assert len(changes) == 1
48 | expected_change1 = ChangeData(ds.storage.metadata.current_snapshot_id,
49 | ChangeType.DELETE,
50 | pa.Table.from_pydict({"int64": [1, 0]}))
51 | assert changes[0] == expected_change1
52 |
53 | # Validate Upsert operation's changes.
54 | upsert_data = {
55 | "int64": [2, 3, 4, 5],
56 | "float64": [0.1, -0.1, 0.4, 0.5],
57 | "bool": [True, False, True, False],
58 | "string": ["a", "A", "4", "5"]
59 | }
60 | runner.upsert(upsert_data)
61 | changes = list(runner.diff(2, 3))
62 | assert len(changes) == 2
63 | expected_change2 = ChangeData(ds.storage.metadata.current_snapshot_id,
64 | ChangeType.DELETE,
65 | pa.Table.from_pydict({"int64": [2, 3]}))
66 | expected_change3 = ChangeData(ds.storage.metadata.current_snapshot_id,
67 | ChangeType.ADD,
68 | pa.Table.from_pydict(upsert_data))
69 | assert changes == [expected_change2, expected_change3]
70 |
71 | # Validate diff with several snapshots in between.
72 | changes = list(runner.diff(0, 3))
73 | assert len(changes) == 4
74 | assert changes == [
75 | expected_change0, expected_change1, expected_change2, expected_change3
76 | ]
77 |
78 |
79 | def test_ordered_snapshot_ids(tmp_path):
80 | schema = pa.schema([
81 | pa.field("int64", pa.int64()),
82 | pa.field("float64", pa.float64()),
83 | pa.field("binary", pa.binary())
84 | ])
85 | ds = Dataset.create(str(tmp_path / f"dataset_{random_id()}"),
86 | schema,
87 | primary_keys=["int64"],
88 | record_fields=["binary"])
89 |
90 | runner = ds.local()
91 | runner.append({"int64": [1], "float64": [0.1], "binary": [b"b1"]})
92 | runner.append({"int64": [2], "float64": [0.2], "binary": [b"b2"]})
93 | runner.append({"int64": [3], "float64": [0.3], "binary": [b"b3"]})
94 |
95 | with pytest.raises(
96 | errors.UserInputError,
97 | match=r".*End snapshot ID 0 should be higher than start snapshot ID 0.*"):
98 | ordered_snapshot_ids(ds.storage, 0, 0)
99 |
100 | assert ordered_snapshot_ids(ds.storage, 0, 1) == [1]
101 | assert ordered_snapshot_ids(ds.storage, 0, 3) == [1, 2, 3]
102 |
--------------------------------------------------------------------------------
/python/tests/core/ops/test_delete.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pyarrow as pa
16 | import pyarrow.compute as pc
17 |
18 | from space.core.ops.append import LocalAppendOp
19 | from space.core.ops.delete import FileSetDeleteOp
20 | from space.core.ops.read import FileSetReadOp
21 | from space.core.options import FileOptions
22 | from space.core.storage import Storage
23 |
24 | _default_file_options = FileOptions()
25 |
26 |
27 | class TestFileSetDeleteOp:
28 |
29 | # TODO: add tests using Arrow table input.
30 | def test_delete_all_types(self, tmp_path, all_types_schema,
31 | all_types_input_data):
32 | location = tmp_path / "dataset"
33 | storage = Storage.create(location=str(location),
34 | schema=all_types_schema,
35 | primary_keys=["int64"],
36 | record_fields=[])
37 |
38 | append_op = LocalAppendOp(str(location), storage.metadata,
39 | _default_file_options)
40 | # TODO: the test should cover all types supported by column stats.
41 | input_data = [pa.Table.from_pydict(d) for d in all_types_input_data]
42 | for batch in input_data:
43 | append_op.write(batch)
44 |
45 | storage.commit(append_op.finish(), "main")
46 | old_data_files = storage.data_files()
47 |
48 | delete_op = FileSetDeleteOp(
49 | str(location),
50 | storage.metadata,
51 | storage.data_files(),
52 | # pylint: disable=singleton-comparison
53 | pc.field("bool") == False,
54 | _default_file_options)
55 | patch = delete_op.delete()
56 | assert patch is not None
57 | storage.commit(patch, "main")
58 |
59 | # Verify storage metadata after patch.
60 | new_data_files = storage.data_files()
61 |
62 | def validate_data_files(data_files, patch_manifests):
63 | assert len(data_files.index_manifest_files) == 1
64 | assert len(patch_manifests.index_manifest_files) == 1
65 | assert data_files.index_manifest_files[
66 | 1] == patch_manifests.index_manifest_files[0]
67 |
68 | validate_data_files(old_data_files, patch.deletion)
69 | validate_data_files(new_data_files, patch.addition)
70 |
71 | read_op = FileSetReadOp(str(location), storage.metadata,
72 | storage.data_files())
73 | results = list(iter(read_op))
74 | assert len(results) == 1
75 | assert list(iter(read_op))[0] == pa.Table.from_pydict({
76 | "int64": [1],
77 | "float64": [0.1],
78 | "bool": [True],
79 | "string": ["a"]
80 | })
81 |
--------------------------------------------------------------------------------
/python/tests/core/ops/test_insert.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pyarrow as pa
16 | import pyarrow.compute as pc
17 |
18 | from space import Dataset
19 | from space.core.jobs import JobResult
20 |
21 |
22 | class TestLocalInsertOp:
23 |
24 | # TODO: add tests using Arrow table input.
25 | def test_insert_and_upsert(self, tmp_path, all_types_schema,
26 | all_types_input_data):
27 | location = tmp_path / "dataset"
28 | ds = Dataset.create(location=str(location),
29 | schema=all_types_schema,
30 | primary_keys=["int64"],
31 | record_fields=[])
32 |
33 | runner = ds.local()
34 | runner.append_from(lambda: iter(all_types_input_data))
35 |
36 | # Test insert.
37 | result = runner.insert({
38 | "int64": [3, 4],
39 | "float64": [0.3, 0.4],
40 | "bool": [False, False],
41 | "string": ["d", "e"]
42 | })
43 | assert result.state == JobResult.State.FAILED
44 | assert "Primary key to insert already exist" in result.error_message
45 |
46 | input_data = {
47 | "int64": [4, 5],
48 | "float64": [0.4, 0.5],
49 | "bool": [False, False],
50 | "string": ["e", "f"]
51 | }
52 | runner.insert(input_data)
53 |
54 | filter_ = (pc.field("int64") == 4) | (pc.field("int64") == 5)
55 | assert runner.read_all(filter_=filter_) == pa.Table.from_pydict(input_data)
56 |
57 | # Test upsert.
58 | input_data = {
59 | "int64": [4, 5, 6],
60 | "float64": [1.4, 1.5, 1.6],
61 | "bool": [True, True, True],
62 | "string": ["e", "f", "g"]
63 | }
64 | runner.upsert(input_data)
65 |
66 | filter_ |= (pc.field("int64") == 6)
67 | assert runner.read_all(filter_=filter_) == pa.Table.from_pydict(input_data)
68 |
--------------------------------------------------------------------------------
/python/tests/core/ops/test_read.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pyarrow as pa
16 | import pyarrow.compute as pc
17 |
18 | from space.core.ops.append import LocalAppendOp
19 | from space.core.ops.read import FileSetReadOp
20 | from space.core.options import FileOptions, ReadOptions
21 | from space.core.storage import Storage
22 |
23 | _default_file_options = FileOptions()
24 |
25 |
26 | class TestFileSetReadOp:
27 |
28 | # TODO: add tests using Arrow table input.
29 | def test_read_all_types(self, tmp_path, all_types_schema,
30 | all_types_input_data):
31 | location = tmp_path / "dataset"
32 | storage = Storage.create(location=str(location),
33 | schema=all_types_schema,
34 | primary_keys=["int64"],
35 | record_fields=[])
36 |
37 | append_op = LocalAppendOp(str(location), storage.metadata,
38 | _default_file_options)
39 | # TODO: the test should cover all types supported by column stats.
40 | input_data = [pa.Table.from_pydict(d) for d in all_types_input_data]
41 | for batch in input_data:
42 | append_op.write(batch)
43 |
44 | storage.commit(append_op.finish(), "main")
45 |
46 | read_op = FileSetReadOp(str(location), storage.metadata,
47 | storage.data_files())
48 | results = list(iter(read_op))
49 | assert len(results) == 1
50 | assert list(iter(read_op))[0] == pa.concat_tables(input_data)
51 |
52 | # Test FileSetReadOp with filters.
53 | read_op = FileSetReadOp(
54 | str(location),
55 | storage.metadata,
56 | storage.data_files(),
57 | # pylint: disable=singleton-comparison
58 | options=ReadOptions(filter_=pc.field("bool") == True))
59 | results = list(iter(read_op))
60 | assert len(results) == 1
61 | assert list(iter(read_op))[0] == pa.Table.from_pydict({
62 | "int64": [1],
63 | "float64": [0.1],
64 | "bool": [True],
65 | "string": ["a"]
66 | })
67 |
68 | def test_read_with_record_filters(self, tmp_path, record_fields_schema,
69 | record_fields_input_data):
70 | location = tmp_path / "dataset"
71 | storage = Storage.create(location=str(location),
72 | schema=record_fields_schema,
73 | primary_keys=["int64"],
74 | record_fields=["images", "objects"])
75 |
76 | append_op = LocalAppendOp(str(location), storage.metadata,
77 | _default_file_options)
78 | input_data = [pa.Table.from_pydict(d) for d in record_fields_input_data]
79 | for batch in input_data:
80 | append_op.write(batch)
81 |
82 | storage.commit(append_op.finish(), "main")
83 | data_files = storage.data_files()
84 |
85 | read_op = FileSetReadOp(str(location), storage.metadata, data_files)
86 | results = list(iter(read_op))
87 | assert len(results) == 1
88 | assert list(iter(read_op))[0] == pa.concat_tables(input_data)
89 |
--------------------------------------------------------------------------------
/python/tests/core/schema/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/space/a97f09132bb716a4038ee686e0de3a68fb9d6b3b/python/tests/core/schema/__init__.py
--------------------------------------------------------------------------------
/python/tests/core/schema/test_arrow.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pyarrow as pa
16 |
17 | from space.core.schema import arrow
18 | from space.core.schema import utils
19 | from space.core.schema.arrow import field_metadata
20 |
21 |
22 | def test_field_metadata():
23 | assert arrow.field_metadata(123) == {b"PARQUET:field_id": b"123"}
24 |
25 |
26 | def test_field_id():
27 | assert arrow.field_id(
28 | pa.field("name", pa.int64(),
29 | metadata={b"PARQUET:field_id": b"123"})) == 123
30 |
31 |
32 | def test_arrow_schema_logical_without_records(sample_substrait_fields,
33 | sample_arrow_schema):
34 | assert arrow.arrow_schema(sample_substrait_fields, [],
35 | False) == sample_arrow_schema
36 |
37 |
38 | def test_arrow_schema_logical_with_records(tf_features_substrait_fields,
39 | tf_features_arrow_schema):
40 | assert arrow.arrow_schema(tf_features_substrait_fields, [],
41 | False) == tf_features_arrow_schema
42 |
43 |
44 | def test_arrow_schema_physical_without_records(sample_substrait_fields,
45 | sample_arrow_schema):
46 | assert arrow.arrow_schema(sample_substrait_fields, [],
47 | True) == sample_arrow_schema
48 |
49 |
50 | def test_arrow_schema_logical_with_files(file_substrait_fields,
51 | file_arrow_schema):
52 | assert arrow.arrow_schema(file_substrait_fields, [],
53 | False) == file_arrow_schema
54 |
55 |
56 | def test_arrow_schema_physical_with_files(file_substrait_fields):
57 | assert arrow.arrow_schema(file_substrait_fields, [], True) == pa.schema([
58 | pa.field("int64", pa.int64(), metadata=field_metadata(0)),
59 | pa.field("files", pa.string(), metadata=field_metadata(1))
60 | ])
61 |
62 |
63 | def test_arrow_schema_physical_with_records(tf_features_substrait_fields):
64 | arrow_schema = pa.schema([
65 | pa.field("int64", pa.int64(), metadata=field_metadata(0)),
66 | pa.field("features",
67 | pa.struct([("_FILE", pa.string()), ("_ROW_ID", pa.int32())]),
68 | metadata=field_metadata(1))
69 | ])
70 | assert arrow.arrow_schema(tf_features_substrait_fields, ["features"],
71 | True) == arrow_schema
72 |
73 |
74 | def test_field_name_to_id_dict(sample_arrow_schema):
75 | assert arrow.field_name_to_id_dict(sample_arrow_schema) == {
76 | "float32": 100,
77 | "list": 120,
78 | "struct": 150,
79 | "list_struct": 220,
80 | "struct_list": 260
81 | }
82 |
83 |
84 | def test_field_id_to_column_id_dict(sample_arrow_schema):
85 | assert arrow.field_id_to_column_id_dict(sample_arrow_schema) == {
86 | 100: 0,
87 | 120: 1,
88 | 150: 2,
89 | 220: 3,
90 | 260: 4
91 | }
92 |
93 |
94 | def test_classify_fields(sample_arrow_schema):
95 | index_fields, record_fields = arrow.classify_fields(sample_arrow_schema,
96 | ["float32", "list"])
97 |
98 | assert index_fields == [
99 | utils.Field("struct", 150),
100 | utils.Field("list_struct", 220),
101 | utils.Field("struct_list", 260)
102 | ]
103 | assert record_fields == [
104 | utils.Field("float32", 100),
105 | utils.Field("list", 120)
106 | ]
107 |
108 |
109 | def test_classify_fields_with_selected_fields(sample_arrow_schema):
110 | index_fields, record_fields = arrow.classify_fields(sample_arrow_schema,
111 | ["float32", "list"],
112 | ["list", "struct"])
113 |
114 | assert index_fields == [utils.Field("struct", 150)]
115 | assert record_fields == [utils.Field("list", 120)]
116 |
117 |
118 | def test_field_names():
119 | assert utils.field_names([
120 | utils.Field("struct", 150),
121 | utils.Field("list_struct", 220),
122 | utils.Field("struct_list", 260)
123 | ]) == ["struct", "list_struct", "struct_list"]
124 |
125 |
126 | def test_logical_to_physical_schema(tf_features_arrow_schema):
127 | physical_schema = pa.schema([
128 | pa.field("int64", pa.int64(), metadata=field_metadata(0)),
129 | pa.field("features",
130 | pa.struct([("_FILE", pa.string()), ("_ROW_ID", pa.int32())]),
131 | metadata=field_metadata(1))
132 | ])
133 | assert arrow.logical_to_physical_schema(tf_features_arrow_schema,
134 | set(["features"])) == physical_schema
135 |
--------------------------------------------------------------------------------
/python/tests/core/schema/test_substrait.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from space.core.schema.substrait import substrait_fields
16 |
17 |
18 | def test_substrait_fields(sample_arrow_schema, sample_substrait_fields):
19 | assert substrait_fields(sample_arrow_schema) == sample_substrait_fields
20 |
21 |
22 | def test_substrait_fields_tf_features(tf_features_arrow_schema,
23 | tf_features_substrait_fields):
24 | assert substrait_fields(
25 | tf_features_arrow_schema) == tf_features_substrait_fields
26 |
27 |
28 | def test_substrait_fields_file(file_arrow_schema, file_substrait_fields):
29 | assert substrait_fields(file_arrow_schema) == file_substrait_fields
30 |
--------------------------------------------------------------------------------
/python/tests/core/schema/types/test_files.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pyarrow as pa
16 |
17 | import space.core.proto.metadata_pb2 as meta
18 | from space.core.schema.types import File
19 | from space.core.utils.constants import UTF_8
20 |
21 |
22 | class TestFile:
23 |
24 | def test_arrow_ext_serialize_deserialize(self):
25 | file_type = File(directory="test_folder")
26 | serialized = file_type.__arrow_ext_serialize__()
27 | assert serialized.decode(
28 | UTF_8) == '"{\\n \\"directory\\": \\"test_folder\\"\\n}"'
29 |
30 | # Bytes input.
31 | deserialized_file_type = File.__arrow_ext_deserialize__(
32 | storage_type=None, serialized=serialized)
33 | assert deserialized_file_type._file_type == meta.FileType(
34 | directory="test_folder")
35 |
36 | def test_full_path(self):
37 | file_type = File(directory="")
38 | assert file_type.full_path("") == ""
39 | assert file_type.full_path("123") == "123"
40 |
41 | file_type = File(directory="test_folder")
42 | assert file_type.full_path("") == "test_folder/"
43 | assert file_type.full_path("123") == "test_folder/123"
44 |
--------------------------------------------------------------------------------
/python/tests/core/schema/types/test_tf_features.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import json
16 | import numpy as np
17 | from numpy.testing import assert_array_equal, assert_equal
18 | import pyarrow as pa
19 | import pytest
20 | import tensorflow_datasets as tfds # type: ignore[import-untyped]
21 | from tensorflow_datasets import features as f # type: ignore[import-untyped]
22 |
23 | from space.core.schema.types import TfFeatures
24 | from space.core.serializers import DictSerializer
25 | from space.core.utils.constants import UTF_8
26 |
27 |
28 | class TestTfFeatures:
29 |
30 | @pytest.fixture
31 | def tf_features(self):
32 | features_dict = f.FeaturesDict({
33 | "objects":
34 | f.Sequence({
35 | "bbox": f.BBoxFeature(),
36 | "id": np.int64
37 | }),
38 | })
39 | return TfFeatures(features_dict)
40 |
41 | @pytest.fixture
42 | def sample_objects(self):
43 | return {
44 | "objects": [{
45 | "bbox":
46 | tfds.features.BBox(ymin=0.3, xmin=0.8, ymax=0.5, xmax=1.0),
47 | "id":
48 | 123
49 | }]
50 | }
51 |
52 | def test_arrow_ext_serialize_deserialize(self, tf_features, sample_objects):
53 | serialized = tf_features.__arrow_ext_serialize__()
54 | features_dict = json.loads(serialized.decode(UTF_8))
55 | assert features_dict[
56 | "type"] == "tensorflow_datasets.core.features.features_dict.FeaturesDict" # pylint: disable=line-too-long
57 | assert "sequence" in features_dict["content"]["features"]["objects"]
58 |
59 | # Bytes input.
60 | tf_features = TfFeatures.__arrow_ext_deserialize__(storage_type=None,
61 | serialized=serialized)
62 | assert len(tf_features.serialize(sample_objects)) > 0
63 |
64 | # String input.
65 | tf_features = TfFeatures.__arrow_ext_deserialize__(
66 | storage_type=None, serialized=serialized.decode(UTF_8))
67 | assert len(tf_features.serialize(sample_objects)) > 0
68 |
69 | def test_serialize_deserialize(self, tf_features, sample_objects):
70 | value_bytes = tf_features.serialize(sample_objects)
71 | assert len(value_bytes) > 0
72 |
73 | objects = tf_features.deserialize(value_bytes)["objects"]
74 | assert_array_equal(objects["bbox"],
75 | np.array([[0.3, 0.8, 0.5, 1.]], dtype=np.float32))
76 | assert_array_equal(objects["id"], np.array([123]))
77 |
78 | def test_dict_serialize_deserialize(self, tf_features):
79 | schema = pa.schema([("int64", pa.int64()), ("features", tf_features)])
80 | serializer = DictSerializer.create(schema)
81 |
82 | features_data = [{
83 | "objects": {
84 | "bbox": np.array([[0.3, 0.8, 0.5, 1.0]], np.float32),
85 | "id": np.array([123]),
86 | }
87 | }, {
88 | "objects": {
89 | "bbox": np.array([[0.1, 0.2, 0.3, 0.4]], np.float32),
90 | "id": np.array([456]),
91 | }
92 | }]
93 |
94 | data = {"int64": [1, 2], "features": features_data}
95 | serialized_data = serializer.serialize(data)
96 | assert serialized_data["int64"] == [1, 2]
97 | assert len(serialized_data["features"]) == 2
98 |
99 | objects = tf_features.deserialize(
100 | serialized_data["features"][0])["objects"]
101 | assert_array_equal(objects["bbox"],
102 | np.array([[0.3, 0.8, 0.5, 1.]], dtype=np.float32))
103 | assert_array_equal(objects["id"], np.array([123]))
104 |
105 | assert_equal(serializer.deserialize(serialized_data), data)
106 |
--------------------------------------------------------------------------------
/python/tests/core/utils/test_paths.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from mock import patch
16 | import pytest
17 |
18 | from space.core.utils import paths
19 |
20 | _UUID_PATH = "space.core.utils.paths.uuid_"
21 |
22 |
23 | def _mocked_uuid() -> str:
24 | return ""
25 |
26 |
27 | @patch(_UUID_PATH, side_effect=_mocked_uuid)
28 | def test_new_index_file_path(mock_uuid): # pylint: disable=unused-argument
29 | assert paths.new_index_file_path("data") == "data/index_.parquet"
30 |
31 |
32 | @patch(_UUID_PATH, side_effect=_mocked_uuid)
33 | def test_new_record_file_path(mock_uuid): # pylint: disable=unused-argument
34 | assert paths.new_record_file_path(
35 | "data", "field") == "data/field_.array_record"
36 |
37 |
38 | @patch(_UUID_PATH, side_effect=_mocked_uuid)
39 | def test_new_index_manifest_path(mock_uuid): # pylint: disable=unused-argument
40 | assert paths.new_index_manifest_path(
41 | "metadata") == "metadata/index_manifest_.parquet"
42 |
43 |
44 | @patch(_UUID_PATH, side_effect=_mocked_uuid)
45 | def test_new_record_manifest_path(mock_uuid): # pylint: disable=unused-argument
46 | assert paths.new_record_manifest_path(
47 | "metadata") == "metadata/record_manifest_.parquet"
48 |
49 |
50 | @patch(_UUID_PATH, side_effect=_mocked_uuid)
51 | def test_data_dir(mock_uuid): # pylint: disable=unused-argument
52 | assert paths.data_dir("location") == "location/data"
53 |
54 |
55 | @patch(_UUID_PATH, side_effect=_mocked_uuid)
56 | def test_metadata_dir(mock_uuid): # pylint: disable=unused-argument
57 | assert paths.metadata_dir("location") == "location/metadata"
58 |
59 |
60 | @patch(_UUID_PATH, side_effect=_mocked_uuid)
61 | def test_entry_point_path(mock_uuid): # pylint: disable=unused-argument
62 | assert paths.entry_point_path(
63 | "location") == "location/metadata/entrypoint.txtpb"
64 |
65 |
66 | @patch(_UUID_PATH, side_effect=_mocked_uuid)
67 | def test_new_metadata_path(mock_uuid): # pylint: disable=unused-argument
68 | assert paths.new_metadata_path(
69 | "metadata") == "metadata/metadata_.txtpb"
70 |
71 |
72 | class TestStoragePathsMixin:
73 |
74 | _LOCATION = "location"
75 |
76 | @pytest.fixture
77 | def storage_paths(self):
78 | return paths.StoragePathsMixin(self._LOCATION)
79 |
80 | def test_data_dir(self, storage_paths):
81 | assert storage_paths.data_dir == f"{self._LOCATION}/data"
82 |
83 | def test_short_path(self, storage_paths):
84 | assert storage_paths.short_path(
85 | f"{self._LOCATION}/metadata/file.parquet") == "metadata/file.parquet"
86 |
87 | def test_full_path(self, storage_paths):
88 | assert storage_paths.full_path(
89 | "data/file.parquet") == f"{self._LOCATION}/data/file.parquet"
90 |
91 | def test_new_metadata_path(self, storage_paths):
92 | with patch(_UUID_PATH, side_effect=_mocked_uuid):
93 | assert storage_paths.new_metadata_path(
94 | ) == f"{self._LOCATION}/metadata/metadata_.txtpb"
95 |
--------------------------------------------------------------------------------
/python/tests/core/utils/test_protos.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from space.core.utils import protos
16 | import space.core.proto.metadata_pb2 as meta
17 |
18 |
19 | def test_proto_to_text():
20 | text = protos.proto_to_text(meta.StorageMetadata(current_snapshot_id=100))
21 | assert text.decode("utf-8") == "current_snapshot_id: 100\n"
22 |
--------------------------------------------------------------------------------
/python/tests/core/utils/test_uuids.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import uuid
16 |
17 | from space.core.utils import uuids
18 |
19 |
20 | def test_uuid_():
21 | assert uuid.UUID(uuids.uuid_()).version == 4
22 |
23 |
24 | def test_random_id():
25 | assert len(uuids.random_id()) == 8
26 |
--------------------------------------------------------------------------------