├── src └── mdio │ ├── py.typed │ ├── api │ ├── __init__.py │ └── io.py │ ├── builder │ ├── __init__.py │ ├── templates │ │ ├── __init__.py │ │ ├── types.py │ │ ├── seismic_2d_poststack.py │ │ ├── seismic_3d_poststack.py │ │ ├── seismic_2d_streamer_shot.py │ │ ├── seismic_2d_cdp.py │ │ ├── seismic_3d_coca.py │ │ ├── seismic_3d_streamer_shot.py │ │ ├── seismic_3d_cdp.py │ │ └── seismic_3d_streamer_field.py │ └── schemas │ │ ├── __init__.py │ │ ├── v1 │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── stats.py │ │ ├── variable.py │ │ └── units.py │ │ ├── dimension.py │ │ ├── core.py │ │ ├── chunk_grid.py │ │ ├── base.py │ │ ├── dtype.py │ │ ├── units.py │ │ └── compressors.py │ ├── segy │ ├── __init__.py │ ├── helpers_segy.py │ ├── _raw_trace_wrapper.py │ ├── scalar.py │ ├── exceptions.py │ ├── parsers.py │ └── compat.py │ ├── commands │ ├── __init__.py │ ├── copy.py │ └── info.py │ ├── core │ ├── __init__.py │ ├── zarr_io.py │ ├── utils_write.py │ ├── config.py │ ├── dimension.py │ ├── indexing.py │ └── grid.py │ ├── converters │ ├── __init__.py │ ├── exceptions.py │ └── type_converter.py │ ├── __init__.py │ ├── constants.py │ ├── exceptions.py │ └── __main__.py ├── .gitattributes ├── tests ├── __init__.py ├── unit │ ├── __init__.py │ ├── v1 │ │ ├── __init__.py │ │ ├── templates │ │ │ ├── conftest.py │ │ │ ├── test_seismic_templates.py │ │ │ └── test_seismic_2d_poststack.py │ │ ├── test_dataset_builder_helpers.py │ │ ├── test_dataset_builder_add_dimension.py │ │ └── test_dataset_builder_add_coordinate.py │ ├── test_dimension.py │ ├── test_environment.py │ ├── test_auto_chunking.py │ ├── test_coordinate_scalar.py │ ├── test_segy_spec_validation.py │ ├── test_type_converter.py │ ├── test_segy_grid_overrides.py │ └── test_indexing.py ├── integration │ └── testing_helpers.py ├── conftest.py └── test_main.py ├── docs ├── codeofconduct.md ├── license.md ├── contributing.md ├── data_models │ ├── index.md │ ├── dimensions.md │ ├── compressors.md │ ├── version_1.md │ └── chunk_grids.md ├── requirements.txt ├── tutorials │ └── index.md ├── index.md ├── api_reference.md ├── conf.py ├── installation.md └── template_registry.md ├── .darglint ├── .github ├── workflows │ ├── constraints.txt │ ├── labeler.yml │ ├── release.yml │ └── tests.yml ├── dependabot.yml ├── release-drafter.yml └── labels.yml ├── .readthedocs.yml ├── .editorconfig ├── .devcontainer ├── post-create.sh └── devcontainer.json ├── .pre-commit-config.yaml ├── .dockerignore ├── .gitignore ├── CONTRIBUTING.md ├── pyproject.toml └── CODE_OF_CONDUCT.md /src/mdio/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto eol=lf 2 | -------------------------------------------------------------------------------- /src/mdio/api/__init__.py: -------------------------------------------------------------------------------- 1 | """Public API.""" 2 | -------------------------------------------------------------------------------- /src/mdio/builder/__init__.py: -------------------------------------------------------------------------------- 1 | """MDIO building utilities.""" 2 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Test suite for the MDIO package.""" 2 | 
-------------------------------------------------------------------------------- /docs/codeofconduct.md: -------------------------------------------------------------------------------- 1 | ```{include} ../CODE_OF_CONDUCT.md 2 | 3 | ``` 4 | -------------------------------------------------------------------------------- /src/mdio/segy/__init__.py: -------------------------------------------------------------------------------- 1 | """SEG-Y specific implementation module.""" 2 | -------------------------------------------------------------------------------- /.darglint: -------------------------------------------------------------------------------- 1 | [darglint] 2 | docstring_style = google 3 | strictness = long 4 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | """Unit tests for parts of the MDIO package.""" 2 | -------------------------------------------------------------------------------- /.github/workflows/constraints.txt: -------------------------------------------------------------------------------- 1 | bump-my-version==1.2.4 2 | nox==2025.10.16 3 | -------------------------------------------------------------------------------- /src/mdio/builder/templates/__init__.py: -------------------------------------------------------------------------------- 1 | """MDIO templates for known dataset kinds.""" 2 | -------------------------------------------------------------------------------- /tests/unit/v1/__init__.py: -------------------------------------------------------------------------------- 1 | """Unit tests for parts of the MDIO package related to the v1 schema.""" 2 | -------------------------------------------------------------------------------- /src/mdio/builder/schemas/__init__.py: -------------------------------------------------------------------------------- 1 | """MDIO schemas for different data types.""" 2 | 3 | __all__ = [] 4 | -------------------------------------------------------------------------------- /docs/license.md: -------------------------------------------------------------------------------- 1 | # License 2 | 3 | ```{literalinclude} ../LICENSE 4 | --- 5 | language: none 6 | --- 7 | ``` 8 | -------------------------------------------------------------------------------- /src/mdio/commands/__init__.py: -------------------------------------------------------------------------------- 1 | """Plugins for MDIO CLI commands. 2 | 3 | Default Plugins: 4 | 5 | * SEG-Y: CLI commands to ingest / export SEG-Y. 6 | """ 7 | -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | ```{include} ../CONTRIBUTING.md 2 | --- 3 | end-before: 4 | --- 5 | ``` 6 | 7 | [code of conduct]: codeofconduct 8 | -------------------------------------------------------------------------------- /src/mdio/builder/schemas/v1/__init__.py: -------------------------------------------------------------------------------- 1 | """Schema specific to MDIO v1.""" 2 | 3 | from mdio.builder.schemas.v1.dataset import Dataset 4 | 5 | __all__ = ["Dataset"] 6 | -------------------------------------------------------------------------------- /docs/data_models/index.md: -------------------------------------------------------------------------------- 1 | # Dataset Models 2 | 3 | This section contains the data models for the MDIO format. 
4 | 5 | ```{toctree} 6 | :maxdepth: 2 7 | 8 | version_1 9 | ``` 10 | -------------------------------------------------------------------------------- /src/mdio/core/__init__.py: -------------------------------------------------------------------------------- 1 | """MDIO core functionalities.""" 2 | 3 | from mdio.core.dimension import Dimension 4 | from mdio.core.grid import Grid 5 | 6 | __all__ = ["Dimension", "Grid"] 7 | -------------------------------------------------------------------------------- /src/mdio/converters/__init__.py: -------------------------------------------------------------------------------- 1 | """MDIO Data conversion API.""" 2 | 3 | from mdio.converters.mdio import mdio_to_segy 4 | from mdio.converters.segy import segy_to_mdio 5 | 6 | __all__ = ["mdio_to_segy", "segy_to_mdio"] 7 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | build: 3 | os: ubuntu-22.04 4 | tools: 5 | python: "3.13" 6 | sphinx: 7 | configuration: docs/conf.py 8 | formats: all 9 | python: 10 | install: 11 | - requirements: docs/requirements.txt 12 | - path: . 13 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.13.2 2 | autodoc-pydantic==2.2.0 3 | furo==2025.9.25 4 | linkify-it-py==2.0.3 5 | matplotlib==3.10.7 6 | myst-nb==1.3.0 7 | sphinx==8.2.3 8 | sphinx-click==6.1.0 9 | sphinx-copybutton==0.5.2 10 | sphinx-design==0.6.1 11 | ipywidgets==8.1.7 12 | -------------------------------------------------------------------------------- /src/mdio/builder/templates/types.py: -------------------------------------------------------------------------------- 1 | """Module that contains type aliases for templates.""" 2 | 3 | from typing import Literal 4 | from typing import TypeAlias 5 | 6 | SeismicDataDomain: TypeAlias = Literal["depth", "time"] 7 | 8 | CdpGatherDomain: TypeAlias = Literal["offset", "angle"] 9 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | charset = utf-8 5 | end_of_line = lf 6 | insert_final_newline = true 7 | trim_trailing_whitespace = true 8 | 9 | [*.{py,toml}] 10 | indent_style = space 11 | indent_size = 4 12 | 13 | [*.{yml,yaml,json}] 14 | indent_style = space 15 | indent_size = 2 16 | -------------------------------------------------------------------------------- /docs/tutorials/index.md: -------------------------------------------------------------------------------- 1 | # Tutorials 2 | 3 | Welcome to the tutorials. This section collects hands‑on guides that walk you through common MDIO workflows. 4 | 5 | Pick a topic from the list below to get started. 
6 | 7 | ```{toctree} 8 | :maxdepth: 1 9 | :titlesonly: 10 | 11 | quickstart 12 | creation 13 | compression 14 | rechunking 15 | corrupt_files 16 | custom_template 17 | ``` 18 | -------------------------------------------------------------------------------- /.github/workflows/labeler.yml: -------------------------------------------------------------------------------- 1 | name: Labeler 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | 9 | jobs: 10 | labeler: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Check out the repository 14 | uses: actions/checkout@v5 15 | 16 | - name: Run Labeler 17 | uses: crazy-max/ghaction-github-labeler@v5 18 | with: 19 | skip-delete: true 20 | -------------------------------------------------------------------------------- /src/mdio/builder/schemas/dimension.py: -------------------------------------------------------------------------------- 1 | """Dimension schema.""" 2 | 3 | from pydantic import Field 4 | 5 | from mdio.builder.schemas.core import CamelCaseStrictModel 6 | 7 | 8 | class NamedDimension(CamelCaseStrictModel): 9 | """Represents a single dimension with a name and size.""" 10 | 11 | name: str = Field(..., description="Unique identifier for the dimension.") 12 | size: int = Field(..., gt=0, description="Total size of the dimension.") 13 | -------------------------------------------------------------------------------- /src/mdio/__init__.py: -------------------------------------------------------------------------------- 1 | """MDIO library.""" 2 | 3 | from importlib import metadata 4 | 5 | from mdio.api.io import open_mdio 6 | from mdio.api.io import to_mdio 7 | from mdio.converters import mdio_to_segy 8 | from mdio.converters import segy_to_mdio 9 | 10 | try: 11 | __version__ = metadata.version("multidimio") 12 | except metadata.PackageNotFoundError: 13 | __version__ = "unknown" 14 | 15 | 16 | __all__ = [ 17 | "__version__", 18 | "open_mdio", 19 | "to_mdio", 20 | "mdio_to_segy", 21 | "segy_to_mdio", 22 | ] 23 | -------------------------------------------------------------------------------- /.devcontainer/post-create.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | # Install useful developer tools used by mdio development 6 | uv tool install nox 7 | uv tool install bump-my-version 8 | 9 | # Sync the environment, installing the project editable and including dev dependencies 10 | uv sync 11 | 12 | # Set Git safe directory to avoid ownership issues 13 | git config --global --add safe.directory "$PWD" 14 | 15 | # Optional: If you need to reset GitHub host key for SSH (uncomment if necessary) 16 | # ssh-keygen -f "/home/vscode/.ssh/known_hosts" -R "github.com" 17 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: github-actions 4 | directory: "/" 5 | schedule: 6 | interval: monthly 7 | - package-ecosystem: pip 8 | directory: "/.github/workflows" 9 | schedule: 10 | interval: monthly 11 | - package-ecosystem: pip 12 | directory: "/docs" 13 | schedule: 14 | interval: monthly 15 | - package-ecosystem: pip 16 | directory: "/" 17 | schedule: 18 | interval: monthly 19 | versioning-strategy: lockfile-only 20 | allow: 21 | - dependency-type: "all" 22 | -------------------------------------------------------------------------------- /src/mdio/builder/schemas/core.py: 
-------------------------------------------------------------------------------- 1 | """This module implements the core components of the MDIO schemas.""" 2 | 3 | from __future__ import annotations 4 | 5 | from pydantic import BaseModel 6 | from pydantic import ConfigDict 7 | from pydantic.alias_generators import to_camel 8 | 9 | 10 | class CamelCaseStrictModel(BaseModel): 11 | """A model with forbidden extras and camel case aliases.""" 12 | 13 | model_config = ConfigDict( 14 | alias_generator=to_camel, 15 | validate_by_name=True, 16 | serialize_by_alias=True, 17 | validate_assignment=True, 18 | extra="forbid", 19 | ) 20 | -------------------------------------------------------------------------------- /docs/data_models/dimensions.md: -------------------------------------------------------------------------------- 1 | ```{eval-rst} 2 | :tocdepth: 3 3 | ``` 4 | 5 | ```{currentModule} mdio.builder.schemas.dimension 6 | 7 | ``` 8 | 9 | # Dimensions 10 | 11 | ```{article-info} 12 | :author: Altay Sansal 13 | :date: "{sub-ref}`today`" 14 | :read-time: "{sub-ref}`wordcount-minutes` min read" 15 | :class-container: sd-p-0 sd-outline-muted sd-rounded-3 sd-font-weight-light 16 | ``` 17 | 18 | ## Intro 19 | 20 | ```{eval-rst} 21 | .. autosummary:: NamedDimension 22 | ``` 23 | 24 | ## Reference 25 | 26 | :::{dropdown} Dimension 27 | :open: 28 | 29 | ```{eval-rst} 30 | .. autopydantic_model:: NamedDimension 31 | ``` 32 | 33 | ::: 34 | -------------------------------------------------------------------------------- /src/mdio/core/zarr_io.py: -------------------------------------------------------------------------------- 1 | """Utilities to open/write Zarr files.""" 2 | 3 | from __future__ import annotations 4 | 5 | import warnings 6 | from contextlib import contextmanager 7 | from typing import TYPE_CHECKING 8 | 9 | from zarr.errors import UnstableSpecificationWarning 10 | 11 | if TYPE_CHECKING: 12 | from collections.abc import Generator 13 | 14 | 15 | @contextmanager 16 | def zarr_warnings_suppress_unstable_structs_v3() -> Generator[None, None, None]: 17 | """Context manager to suppress Zarr V3 unstable structured array warning.""" 18 | warn = r"The data type \((.*?)\) does not have a Zarr V3 specification\." 
19 | warnings.filterwarnings("ignore", message=warn, category=UnstableSpecificationWarning) 20 | try: 21 | yield 22 | finally: 23 | pass 24 | -------------------------------------------------------------------------------- /tests/unit/v1/templates/conftest.py: -------------------------------------------------------------------------------- 1 | """Shared pytest fixtures for the template unit tests.""" 2 | 3 | # conftest.py 4 | import pytest 5 | 6 | from mdio.builder.schemas.dtype import ScalarType 7 | from mdio.builder.schemas.dtype import StructuredField 8 | from mdio.builder.schemas.dtype import StructuredType 9 | 10 | 11 | @pytest.fixture(scope="session") 12 | def structured_headers() -> StructuredType: 13 | """Fixture to provide structured headers for testing.""" 14 | return StructuredType( 15 | fields=[ 16 | StructuredField(name="cdp_x", format=ScalarType.INT32), 17 | StructuredField(name="cdp_y", format=ScalarType.INT32), 18 | StructuredField(name="elevation", format=ScalarType.FLOAT16), 19 | StructuredField(name="some_scalar", format=ScalarType.FLOAT16), 20 | ] 21 | ) 22 | -------------------------------------------------------------------------------- /.github/release-drafter.yml: -------------------------------------------------------------------------------- 1 | categories: 2 | - title: ":boom: Breaking Changes" 3 | label: "breaking" 4 | - title: ":rocket: Features" 5 | label: "enhancement" 6 | - title: ":fire: Removals and Deprecations" 7 | label: "removal" 8 | - title: ":beetle: Fixes" 9 | label: "bug" 10 | - title: ":racehorse: Performance" 11 | label: "performance" 12 | - title: ":rotating_light: Testing" 13 | label: "testing" 14 | - title: ":construction_worker: Continuous Integration" 15 | label: "ci" 16 | - title: ":books: Documentation" 17 | label: "documentation" 18 | - title: ":hammer: Refactoring" 19 | label: "refactoring" 20 | - title: ":lipstick: Style" 21 | label: "style" 22 | - title: ":package: Dependencies" 23 | labels: 24 | - "dependencies" 25 | - "build" 26 | template: | 27 | ## Changes 28 | 29 | $CHANGES 30 | -------------------------------------------------------------------------------- /src/mdio/builder/templates/seismic_2d_poststack.py: -------------------------------------------------------------------------------- 1 | """Seismic2DPostStackTemplate MDIO v1 dataset templates.""" 2 | 3 | from typing import Any 4 | 5 | from mdio.builder.templates.base import AbstractDatasetTemplate 6 | from mdio.builder.templates.types import SeismicDataDomain 7 | 8 | 9 | class Seismic2DPostStackTemplate(AbstractDatasetTemplate): 10 | """Seismic post-stack 2D time or depth Dataset template.""" 11 | 12 | def __init__(self, data_domain: SeismicDataDomain): 13 | super().__init__(data_domain=data_domain) 14 | 15 | self._dim_names = ("cdp", self._data_domain) 16 | self._physical_coord_names = ("cdp_x", "cdp_y") 17 | self._var_chunk_shape = (1024, 1024) 18 | 19 | @property 20 | def _name(self) -> str: 21 | return f"PostStack2D{self._data_domain.capitalize()}" 22 | 23 | def _load_dataset_attributes(self) -> dict[str, Any]: 24 | return {"surveyType": "2D", "gatherType": "stacked"} 25 | -------------------------------------------------------------------------------- /src/mdio/builder/templates/seismic_3d_poststack.py: -------------------------------------------------------------------------------- 1 | """Seismic3DPostStackTemplate MDIO v1 dataset templates.""" 2 | 3 | from typing import Any 4 | 5 | from mdio.builder.templates.base import AbstractDatasetTemplate 6 | from 
mdio.builder.templates.types import SeismicDataDomain 7 | 8 | 9 | class Seismic3DPostStackTemplate(AbstractDatasetTemplate): 10 | """Seismic post-stack 3D time or depth Dataset template.""" 11 | 12 | def __init__(self, data_domain: SeismicDataDomain): 13 | super().__init__(data_domain=data_domain) 14 | 15 | self._dim_names = ("inline", "crossline", self._data_domain) 16 | self._physical_coord_names = ("cdp_x", "cdp_y") 17 | self._var_chunk_shape = (128, 128, 128) 18 | 19 | @property 20 | def _name(self) -> str: 21 | domain_suffix = self._data_domain.capitalize() 22 | return f"PostStack3D{domain_suffix}" 23 | 24 | def _load_dataset_attributes(self) -> dict[str, Any]: 25 | return {"surveyType": "3D", "gatherType": "stacked"} 26 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | ```{include} ../README.md 2 | --- 3 | end-before: 4 | --- 5 | ``` 6 | 7 | [apache 2.0 license]: license 8 | [contributor guide]: contributing 9 | [command-line usage]: cli_usage 10 | [api reference]: api_reference 11 | [installation instructions]: installation 12 | 13 | ```{toctree} 14 | :hidden: 15 | :caption: Getting Started 16 | 17 | installation 18 | cli_usage 19 | configuration 20 | ``` 21 | 22 | ```{toctree} 23 | :hidden: 24 | :caption: Learning and Support 25 | 26 | tutorials/index 27 | api_reference 28 | ``` 29 | 30 | ```{toctree} 31 | :hidden: 32 | :caption: Core Concepts and Structures 33 | 34 | data_models/index 35 | data_models/dimensions 36 | data_models/chunk_grids 37 | data_models/data_types 38 | data_models/compressors 39 | template_registry 40 | ``` 41 | 42 | ```{toctree} 43 | :hidden: 44 | :caption: Community and Contribution 45 | 46 | contributing 47 | Code of Conduct 48 | ``` 49 | 50 | ```{toctree} 51 | :hidden: 52 | :caption: Additional Resources 53 | 54 | License 55 | Changelog 56 | ``` 57 | -------------------------------------------------------------------------------- /docs/api_reference.md: -------------------------------------------------------------------------------- 1 | # API Reference 2 | 3 | ## Data Converters 4 | 5 | ### Seismic Data 6 | 7 | ````{note} 8 | By default, the SEG-Y ingestion tool uses Python's multiprocessing 9 | to speed up parsing the data. This almost always requires a `__main__` 10 | guard on any other Python code that is executed directly like 11 | `python file.py`. When running inside Jupyter, this is **NOT** needed. 12 | 13 | ```python 14 | if __name__ == "__main__": 15 | segy_to_mdio(...) 16 | ``` 17 | 18 | When the CLI is invoked, this is already handled. 19 | 20 | See the official `multiprocessing` documentation 21 | [here](https://docs.python.org/3/library/multiprocessing.html#the-process-class) 22 | and 23 | [here](https://docs.python.org/3/library/multiprocessing.html#multiprocessing-programming). 24 | ```` 25 | 26 | ```{eval-rst} 27 | .. automodule:: mdio.converters.segy 28 | :members: 29 | :exclude-members: grid_density_qc, parse_index_types, get_compressor, populate_dim_coordinates, populate_non_dim_coordinates 30 | 31 | .. automodule:: mdio.converters.mdio 32 | :members: 33 | ``` 34 | 35 | ## Core Functionality 36 | 37 | ### Dimensions 38 | 39 | ```{eval-rst} 40 | .. 
automodule:: mdio.core.dimension 41 | :members: 42 | ``` 43 | -------------------------------------------------------------------------------- /tests/unit/v1/test_dataset_builder_helpers.py: -------------------------------------------------------------------------------- 1 | """Tests the schema v1 dataset_builder internal methods.""" 2 | 3 | import pytest 4 | 5 | from mdio.builder.dataset_builder import _get_named_dimension 6 | from mdio.builder.schemas.dimension import NamedDimension 7 | 8 | 9 | def test__get_named_dimension() -> None: 10 | """Test getting a dimension by name from the list of dimensions.""" 11 | dimensions = [NamedDimension(name="inline", size=2), NamedDimension(name="crossline", size=3)] 12 | 13 | assert _get_named_dimension([], "inline") is None 14 | assert _get_named_dimension(dimensions, "inline") == NamedDimension(name="inline", size=2) 15 | assert _get_named_dimension(dimensions, "crossline") == NamedDimension(name="crossline", size=3) 16 | assert _get_named_dimension(dimensions, "time") is None 17 | 18 | with pytest.raises(TypeError, match="Expected str, got NoneType"): 19 | _get_named_dimension(dimensions, None) 20 | with pytest.raises(TypeError, match="Expected str, got int"): 21 | _get_named_dimension(dimensions, 42) 22 | with pytest.raises(ValueError, match="Dimension 'inline' found but size 2 does not match expected size 200"): 23 | _get_named_dimension(dimensions, "inline", size=200) 24 | -------------------------------------------------------------------------------- /src/mdio/segy/helpers_segy.py: -------------------------------------------------------------------------------- 1 | """Helper functions for tinkering with SEG-Y related Zarr.""" 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | from zarr.errors import ContainsGroupError 6 | 7 | from mdio.exceptions import MDIOAlreadyExistsError 8 | 9 | if TYPE_CHECKING: 10 | from zarr import Group 11 | 12 | 13 | def create_zarr_hierarchy(root_group: "Group", overwrite: bool) -> "Group": 14 | """Create `zarr` hierarchy for SEG-Y files. 15 | 16 | Args: 17 | root_group: Output root group where data will be written. 18 | overwrite: Toggle for overwriting existing store. 19 | 20 | Returns: 21 | Zarr Group instance for root of the file. 22 | 23 | Raises: 24 | MDIOAlreadyExistsError: If a file with data already exists. 25 | """ 26 | try: 27 | root_group.create_group(name="data", overwrite=overwrite) 28 | root_group.create_group(name="metadata", overwrite=overwrite) 29 | except ContainsGroupError as e: 30 | msg = ( 31 | f"An MDIO file with data already exists at {root_group.store_path}. " 32 | "If this is intentional, please specify 'overwrite=True'." 
33 | ) 34 | raise MDIOAlreadyExistsError(msg) from e 35 | 36 | return root_group 37 | -------------------------------------------------------------------------------- /src/mdio/builder/schemas/v1/dataset.py: -------------------------------------------------------------------------------- 1 | """Dataset model for MDIO V1.""" 2 | 3 | from typing import Any 4 | 5 | from pydantic import AwareDatetime 6 | from pydantic import Field 7 | 8 | from mdio.builder.schemas.base import BaseDataset 9 | from mdio.builder.schemas.core import CamelCaseStrictModel 10 | from mdio.builder.schemas.v1.variable import Variable 11 | 12 | 13 | class DatasetMetadata(CamelCaseStrictModel): 14 | """Contains information about a dataset.""" 15 | 16 | name: str = Field(..., description="Name or identifier for the dataset.") 17 | 18 | api_version: str = Field( 19 | ..., 20 | description="The version of the MDIO API that the dataset complies with.", 21 | ) 22 | 23 | created_on: AwareDatetime = Field( 24 | ..., 25 | description=( 26 | "The timestamp indicating when the dataset was first created, " 27 | "including timezone information. Expressed in ISO 8601 format." 28 | ), 29 | ) 30 | 31 | attributes: dict[str, Any] | None = Field(default=None, description="User defined attributes as key/value pairs.") 32 | 33 | 34 | class Dataset(BaseDataset): 35 | """Represents an MDIO v1 dataset. 36 | 37 | A dataset consists of variables and metadata. 38 | """ 39 | 40 | variables: list[Variable] = Field(..., description="Variables in MDIO dataset") 41 | metadata: DatasetMetadata = Field(..., description="Dataset metadata.") 42 | -------------------------------------------------------------------------------- /src/mdio/core/utils_write.py: -------------------------------------------------------------------------------- 1 | """Convenience utilities for writing to Zarr.""" 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | from dask.array.core import normalize_chunks 6 | from dask.array.rechunk import _balance_chunksizes 7 | 8 | if TYPE_CHECKING: 9 | from numpy.typing import DTypeLike 10 | 11 | 12 | MAX_SIZE_LIVE_MASK = 256 * 1024**2 13 | MAX_COORDINATES_BYTES = 32 * 1024**2 14 | 15 | 16 | def get_constrained_chunksize( 17 | shape: tuple[int, ...], 18 | dtype: "DTypeLike", 19 | max_bytes: int, 20 | ) -> tuple[int, ...]: 21 | """Calculate the optimal chunk size for N-D array based on max_bytes. 22 | 23 | Args: 24 | shape: The shape of the array. 25 | dtype: The data dtype to be used in calculation. 26 | max_bytes: The maximum allowed number of bytes per chunk. 27 | 28 | Returns: 29 | A sequence of integers of calculated chunk sizes. 30 | """ 31 | chunks = normalize_chunks("auto", shape, dtype=dtype, limit=max_bytes) 32 | return tuple(_balance_chunksizes(chunk)[0] for chunk in chunks) 33 | 34 | 35 | def get_live_mask_chunksize(shape: tuple[int, ...]) -> tuple[int, ...]: 36 | """Given a live_mask shape, calculate the optimal write chunk size. 37 | 38 | Args: 39 | shape: The shape of the array. 40 | 41 | Returns: 42 | A sequence of integers of calculated chunk sizes. 
43 | """ 44 | return get_constrained_chunksize(shape, "bool", MAX_SIZE_LIVE_MASK) 45 | -------------------------------------------------------------------------------- /src/mdio/builder/schemas/chunk_grid.py: -------------------------------------------------------------------------------- 1 | """This module contains data models for Zarr's chunk grid.""" 2 | 3 | from __future__ import annotations 4 | 5 | from pydantic import Field 6 | 7 | from mdio.builder.schemas.core import CamelCaseStrictModel 8 | 9 | 10 | class RegularChunkShape(CamelCaseStrictModel): 11 | """Represents regular chunk sizes along each dimension.""" 12 | 13 | chunk_shape: tuple[int, ...] = Field(..., description="Lengths of the chunk along each dimension of the array.") 14 | 15 | 16 | class RectilinearChunkShape(CamelCaseStrictModel): 17 | """Represents irregular chunk sizes along each dimension.""" 18 | 19 | chunk_shape: tuple[tuple[int, ...], ...] = Field( 20 | ..., 21 | description="Lengths of the chunk along each dimension of the array.", 22 | ) 23 | 24 | 25 | class RegularChunkGrid(CamelCaseStrictModel): 26 | """Represents a rectangular and regularly spaced chunk grid.""" 27 | 28 | name: str = Field(default="regular", description="The name of the chunk grid.") 29 | 30 | configuration: RegularChunkShape = Field(..., description="Configuration of the regular chunk grid.") 31 | 32 | 33 | class RectilinearChunkGrid(CamelCaseStrictModel): 34 | """Represents a rectangular and irregularly spaced chunk grid.""" 35 | 36 | name: str = Field(default="rectilinear", description="The name of the chunk grid.") 37 | 38 | configuration: RectilinearChunkShape = Field(..., description="Configuration of the irregular chunk grid.") 39 | -------------------------------------------------------------------------------- /src/mdio/converters/exceptions.py: -------------------------------------------------------------------------------- 1 | """Custom exceptions for MDIO converters.""" 2 | 3 | 4 | class EnvironmentFormatError(Exception): 5 | """Raised when environment variable is of the wrong format.""" 6 | 7 | def __init__(self, name: str, format: str, msg: str = ""): # noqa: A002 8 | self.message = f"Environment variable: {name} not of expected format: {format}. " 9 | self.message += f"\n{msg}" if msg else "" 10 | super().__init__(self.message) 11 | 12 | 13 | class GridTraceCountError(Exception): 14 | """Raised when grid trace counts don't match the SEG-Y trace count.""" 15 | 16 | def __init__(self, grid_traces: int, segy_traces: int): 17 | self.message = ( 18 | f"{grid_traces} != {segy_traces}. Scanned grid trace count ({grid_traces}) doesn't " 19 | f"match SEG-Y file ({segy_traces}). Either indexing parameters are wrong (not unique) " 20 | "or SEG-Y file has duplicate traces." 21 | ) 22 | 23 | super().__init__(self.message) 24 | 25 | 26 | class GridTraceSparsityError(Exception): 27 | """Raised when mdio grid will be sparsely populated from SEG-Y traces.""" 28 | 29 | def __init__(self, shape: tuple[int, ...], num_traces: int, msg: str = ""): 30 | self.message = ( 31 | f"Grid shape: {shape} but SEG-Y tracecount: {num_traces}. This grid is very sparse " 32 | "and most likely user error with indexing." 
33 | ) 34 | self.message += f"\n{msg}" if msg else "" 35 | super().__init__(self.message) 36 | -------------------------------------------------------------------------------- /src/mdio/builder/schemas/base.py: -------------------------------------------------------------------------------- 1 | """Base models to subclass from.""" 2 | 3 | from pydantic import ConfigDict 4 | from pydantic import Field 5 | from pydantic.json_schema import GenerateJsonSchema 6 | 7 | from mdio.builder.schemas.compressors import ZFP 8 | from mdio.builder.schemas.compressors import Blosc 9 | from mdio.builder.schemas.core import CamelCaseStrictModel 10 | from mdio.builder.schemas.dimension import NamedDimension 11 | from mdio.builder.schemas.dtype import DataTypeModel 12 | 13 | JSON_SCHEMA_DIALECT = GenerateJsonSchema.schema_dialect 14 | 15 | 16 | class BaseDataset(CamelCaseStrictModel): 17 | """A base class for MDIO datasets. 18 | 19 | We add schema dialect to extend the config of `StrictCamelBaseModel`. 20 | We use the default Pydantic schema generator `GenerateJsonSchema` to 21 | define the JSON schema dialect accurately. 22 | """ 23 | 24 | model_config = ConfigDict(json_schema_extra={"$schema": JSON_SCHEMA_DIALECT}) 25 | 26 | 27 | class BaseArray(DataTypeModel, CamelCaseStrictModel): 28 | """A base array schema.""" 29 | 30 | dimensions: list[NamedDimension] | list[str] = Field( 31 | ..., description="List of Dimension collection or reference to dimension names." 32 | ) 33 | compressor: Blosc | ZFP | None = Field(default=None, description="Compression settings.") 34 | 35 | 36 | class NamedArray(BaseArray): 37 | """An array with a name.""" 38 | 39 | name: str = Field(..., description="Name of the array.") 40 | long_name: str | None = Field(default=None, description="Fully descriptive name.") 41 | -------------------------------------------------------------------------------- /src/mdio/builder/schemas/dtype.py: -------------------------------------------------------------------------------- 1 | """Schemas for scalar types. 2 | 3 | We take booleans, unsigned and signed integers, floats, and 4 | complex numbers from numpy data types and allow those. 5 | """ 6 | 7 | from __future__ import annotations 8 | 9 | from enum import StrEnum 10 | 11 | from pydantic import Field 12 | 13 | from mdio.builder.schemas.core import CamelCaseStrictModel 14 | 15 | 16 | class ScalarType(StrEnum): 17 | """Scalar array data type.""" 18 | 19 | BOOL = "bool" 20 | INT8 = "int8" 21 | INT16 = "int16" 22 | INT32 = "int32" 23 | INT64 = "int64" 24 | UINT8 = "uint8" 25 | UINT16 = "uint16" 26 | UINT32 = "uint32" 27 | UINT64 = "uint64" 28 | FLOAT16 = "float16" 29 | FLOAT32 = "float32" 30 | FLOAT64 = "float64" 31 | FLOAT128 = "float128" 32 | COMPLEX64 = "complex64" 33 | COMPLEX128 = "complex128" 34 | COMPLEX256 = "complex256" 35 | BYTES240 = "V240" # fixed-width 240-byte string, used for raw v0/1/2 trace headers 36 | 37 | 38 | class StructuredField(CamelCaseStrictModel): 39 | """Structured array field with name, format.""" 40 | 41 | format: ScalarType = Field(...) 42 | name: str = Field(...) 
43 | 44 | 45 | class StructuredType(CamelCaseStrictModel): 46 | """Structured array type with packed fields.""" 47 | 48 | fields: list[StructuredField] = Field() 49 | 50 | 51 | class DataTypeModel(CamelCaseStrictModel): 52 | """Structured array type with fields and total item size.""" 53 | 54 | data_type: ScalarType | StructuredType = Field(..., description="Type of the array.") 55 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | # should be replaced in the future ref https://github.com/astral-sh/ruff/issues/458 3 | - repo: https://github.com/jsh9/pydoclint 4 | rev: 0.6.6 5 | hooks: 6 | - id: pydoclint 7 | - repo: local 8 | hooks: 9 | - id: ruff-format 10 | name: Format code with Ruff 11 | entry: ruff format 12 | language: system 13 | types_or: [python, pyi, jupyter] 14 | - id: ruff 15 | name: Lint code with Ruff 16 | entry: ruff check 17 | language: system 18 | types_or: [python, pyi, jupyter] 19 | args: [--fix] 20 | - id: check-added-large-files 21 | name: Check for added large files 22 | entry: check-added-large-files 23 | language: system 24 | args: ["--maxkb=1100"] 25 | - id: check-toml 26 | name: Check Toml 27 | entry: check-toml 28 | language: system 29 | types: [toml] 30 | - id: check-yaml 31 | name: Check Yaml 32 | entry: check-yaml 33 | language: system 34 | types: [yaml] 35 | - id: end-of-file-fixer 36 | name: Fix End of Files 37 | entry: end-of-file-fixer 38 | language: system 39 | types: [text] 40 | stages: [pre-commit, pre-push, manual] 41 | - id: trailing-whitespace 42 | name: Trim Trailing Whitespace 43 | entry: trailing-whitespace-fixer 44 | language: system 45 | types: [text] 46 | stages: [pre-commit, pre-push, manual] 47 | args: [--markdown-linebreak-ext=md] 48 | - repo: https://github.com/pre-commit/mirrors-prettier 49 | rev: v3.1.0 50 | hooks: 51 | - id: prettier 52 | -------------------------------------------------------------------------------- /src/mdio/builder/schemas/units.py: -------------------------------------------------------------------------------- 1 | """Common units for resource assessment data.""" 2 | 3 | from __future__ import annotations 4 | 5 | from enum import Enum 6 | from enum import unique 7 | 8 | from pydantic import Field 9 | from pydantic import create_model 10 | 11 | from mdio.builder.schemas.core import CamelCaseStrictModel 12 | 13 | 14 | @unique 15 | class UnitEnum(str, Enum): 16 | """An Enum representing units as strings, from pint.""" 17 | 18 | 19 | def create_unit_model( 20 | unit_enum: type[UnitEnum], 21 | model_name: str, 22 | quantity: str, 23 | module: str, 24 | ) -> type[CamelCaseStrictModel]: 25 | """Dynamically creates a pydantic model from a unit Enum. 26 | 27 | Args: 28 | unit_enum: UnitEnum representing the units for a specific quantity. 29 | model_name: The name of the model to be created. 30 | quantity: String representing the quantity for which the unit model is created. 31 | module: Name of the module in which the model is to be created. 32 | This should be the `__name__` attribute of the module. 33 | 34 | Returns: 35 | A Pydantic Model representing the unit model derived from the BaseModel. 
36 | 37 | Example: 38 | unit_enum = UnitEnum 39 | model_name = "LengthUnitModel" 40 | quantity = "length" 41 | create_unit_model(unit_enum, model_name, quantity) 42 | """ 43 | fields = {quantity: (unit_enum, Field(..., description=f"Unit of {quantity}."))} 44 | 45 | return create_model( 46 | model_name, 47 | **fields, 48 | __base__=CamelCaseStrictModel, 49 | __doc__=f"Model representing units of {quantity}.", 50 | __module__=module, 51 | ) 52 | -------------------------------------------------------------------------------- /tests/unit/v1/test_dataset_builder_add_dimension.py: -------------------------------------------------------------------------------- 1 | """Tests the schema v1 dataset_builder.add_dimension() public API.""" 2 | 3 | import pytest 4 | 5 | from mdio.builder.dataset_builder import MDIODatasetBuilder 6 | from mdio.builder.dataset_builder import _BuilderState 7 | from mdio.builder.dataset_builder import _get_named_dimension 8 | 9 | from .helpers import validate_builder 10 | 11 | 12 | def test_add_dimension() -> None: 13 | """Test adding dimension. Check the state transition and validate required parameters.""" 14 | builder = MDIODatasetBuilder("test_dataset") 15 | assert builder._state == _BuilderState.INITIAL 16 | 17 | # Validate required parameters 18 | bad_name = None 19 | with pytest.raises(ValueError, match="'name' must be a non-empty string"): 20 | builder.add_dimension(bad_name, 200) 21 | with pytest.raises(ValueError, match="'name' must be a non-empty string"): 22 | builder.add_dimension("", 200) 23 | 24 | # First dimension should change state to HAS_DIMENSIONS and create a variable 25 | builder.add_dimension("x", 100) 26 | validate_builder(builder, _BuilderState.HAS_DIMENSIONS, n_dims=1, n_coords=0, n_var=0) 27 | assert _get_named_dimension(builder._dimensions, "x", 100) is not None 28 | 29 | # Validate that we can't add a dimension with the same name twice 30 | with pytest.raises( 31 | ValueError, 32 | match="Adding dimension with the same name twice is not allowed", 33 | ): 34 | builder.add_dimension("x", 200) 35 | 36 | # Adding dimension with the same name twice 37 | msg = "Adding dimension with the same name twice is not allowed" 38 | with pytest.raises(ValueError, match=msg): 39 | builder.add_dimension("x", 200) 40 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "image": "mcr.microsoft.com/devcontainers/python:1-3.13-bookworm", 3 | // Configure tool-specific properties. 
4 | "features": { 5 | "ghcr.io/va-h/devcontainers-features/uv:1": { "version": "0.8.17" } 6 | }, 7 | "customizations": { 8 | "vscode": { 9 | "settings": { 10 | "python.defaultInterpreterPath": "${containerWorkspaceFolder}/.venv/bin/python", 11 | "python.testing.pytestArgs": ["tests"], 12 | "python.testing.unittestEnabled": false, 13 | "python.testing.pytestEnabled": true 14 | }, 15 | "extensions": [ 16 | "ms-python.python", 17 | "ms-python.vscode-pylance", 18 | "ms-toolsai.jupyter", 19 | "ms-toolsai.jupyter-keymap", 20 | "ms-toolsai.jupyter-renderers", 21 | "vscode-icons-team.vscode-icons", 22 | "wayou.vscode-todo-highlight", 23 | "streetsidesoftware.code-spell-checker", 24 | "eamodio.gitlens", 25 | "visualstudioexptteam.vscodeintellicode", 26 | "richie5um2.vscode-sort-json" 27 | ] 28 | }, 29 | "jetbrains": { 30 | "plugins": ["com.koxudaxi.pydantic", "com.koxudaxi.ruff"] 31 | } 32 | }, 33 | "workspaceMount": "source=${localWorkspaceFolder},target=/workspaces/mdio-python,type=bind", 34 | "workspaceFolder": "/workspaces/mdio-python", 35 | // This installs the MDIO dev environment in the container. Put any customizations there. 36 | "postCreateCommand": "bash .devcontainer/post-create.sh", 37 | // Forward 8787 to enable us to view the dask dashboard 38 | "forwardPorts": [8787] 39 | // Add any mounts you want to use here. 40 | //"mounts": [ "source=${localWorkspaceFolder}/../DATA/,target=/DATA/,type=bind,consistency=cached" ] 41 | } 42 | -------------------------------------------------------------------------------- /src/mdio/constants.py: -------------------------------------------------------------------------------- 1 | """Constant values used across MDIO.""" 2 | 3 | from enum import IntEnum 4 | 5 | import numpy as np 6 | 7 | from mdio.builder.schemas.dtype import ScalarType 8 | 9 | 10 | class ZarrFormat(IntEnum): 11 | """Zarr version enum.""" 12 | 13 | V2 = 2 14 | V3 = 3 15 | 16 | 17 | FLOAT16_MAX = np.finfo("float16").max 18 | FLOAT16_MIN = np.finfo("float16").min 19 | 20 | FLOAT32_MAX = np.finfo("float32").max 21 | FLOAT32_MIN = np.finfo("float32").min 22 | 23 | FLOAT64_MIN = np.finfo("float64").min 24 | FLOAT64_MAX = np.finfo("float64").max 25 | 26 | INT8_MIN = np.iinfo("int8").min 27 | INT8_MAX = np.iinfo("int8").max 28 | 29 | INT16_MIN = np.iinfo("int16").min 30 | INT16_MAX = np.iinfo("int16").max 31 | 32 | INT32_MIN = np.iinfo("int32").min 33 | INT32_MAX = np.iinfo("int32").max 34 | 35 | INT64_MIN = np.iinfo("int64").min 36 | INT64_MAX = np.iinfo("int64").max 37 | 38 | UINT8_MIN = 0 39 | UINT8_MAX = np.iinfo("uint8").max 40 | 41 | UINT16_MIN = 0 42 | UINT16_MAX = np.iinfo("uint16").max 43 | 44 | UINT32_MIN = 0 45 | UINT32_MAX = np.iinfo("uint32").max 46 | 47 | UINT64_MIN = 0 48 | UINT64_MAX = np.iinfo("uint64").max 49 | 50 | # Zarr fill values for different scalar types 51 | fill_value_map = { 52 | ScalarType.BOOL: None, 53 | ScalarType.FLOAT16: np.nan, 54 | ScalarType.FLOAT32: np.nan, 55 | ScalarType.FLOAT64: np.nan, 56 | ScalarType.UINT8: UINT8_MAX, 57 | ScalarType.UINT16: UINT16_MAX, 58 | ScalarType.UINT32: UINT32_MAX, 59 | ScalarType.UINT64: UINT64_MAX, 60 | ScalarType.INT8: INT8_MAX, 61 | ScalarType.INT16: INT16_MAX, 62 | ScalarType.INT32: INT32_MAX, 63 | ScalarType.INT64: INT64_MAX, 64 | ScalarType.COMPLEX64: complex(np.nan, np.nan), 65 | ScalarType.COMPLEX128: complex(np.nan, np.nan), 66 | ScalarType.COMPLEX256: complex(np.nan, np.nan), 67 | ScalarType.BYTES240: b"\x00" * 240, 68 | } 69 | -------------------------------------------------------------------------------- 
/tests/integration/testing_helpers.py: -------------------------------------------------------------------------------- 1 | """This module provides testing helpers for integration testing.""" 2 | 3 | from collections.abc import Callable 4 | 5 | import numpy as np 6 | import xarray as xr 7 | from numpy.typing import DTypeLike 8 | 9 | 10 | def get_values(arr: xr.DataArray) -> np.ndarray: 11 | """Extract actual values from an Xarray DataArray.""" 12 | return arr.values 13 | 14 | 15 | def get_inline_header_values(dataset: xr.Dataset) -> np.ndarray: 16 | """Extract a specific header value from an Xarray DataArray.""" 17 | return dataset["inline"].values 18 | 19 | 20 | def validate_variable( # noqa PLR0913 21 | dataset: xr.Dataset, 22 | name: str, 23 | shape: tuple[int, ...], 24 | dims: tuple[str, ...], 25 | data_type: DTypeLike, 26 | expected_values: range | None, 27 | actual_value_generator: Callable[[xr.DataArray], np.ndarray] | None = None, 28 | ) -> None: 29 | """Validate the properties of a variable in an Xarray dataset.""" 30 | arr = dataset[name] 31 | assert shape == arr.shape 32 | assert set(dims) == set(arr.dims) 33 | if hasattr(data_type, "fields") and data_type.fields is not None: 34 | # The following assertion will fail because of differences in offsets 35 | # assert data_type == arr.dtype 36 | 37 | # Compare field names 38 | expected_names = list(data_type.names) 39 | actual_names = list(arr.dtype.names) 40 | assert expected_names == actual_names 41 | 42 | # Compare field types 43 | expected_types = [data_type[name] for name in data_type.names] 44 | actual_types = [arr.dtype[name] for name in arr.dtype.names] 45 | assert expected_types == actual_types 46 | else: 47 | assert data_type == arr.dtype 48 | 49 | if expected_values is not None and actual_value_generator is not None: 50 | actual_values = actual_value_generator(arr) 51 | assert np.array_equal(expected_values, actual_values) 52 | -------------------------------------------------------------------------------- /src/mdio/segy/_raw_trace_wrapper.py: -------------------------------------------------------------------------------- 1 | """Consumer-side utility to get both raw and transformed header data with single filesystem read.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import TYPE_CHECKING 6 | 7 | import numpy as np 8 | 9 | if TYPE_CHECKING: 10 | from numpy.typing import NDArray 11 | from segy import SegyFile 12 | 13 | 14 | class SegyFileRawTraceWrapper: 15 | def __init__(self, segy_file: SegyFile, indices: int | list[int] | NDArray | slice): 16 | self.segy_file = segy_file 17 | self.indices = indices 18 | 19 | self.idx = self.segy_file.trace.normalize_and_validate_query(self.indices) 20 | self.trace_buffer_array = self.segy_file.trace.fetch(self.idx, raw=True) 21 | 22 | self.trace_view = self.trace_buffer_array.view(self.segy_file.spec.trace.dtype) 23 | 24 | self.trace_decode_pipeline = self.segy_file.accessors.trace_decode_pipeline 25 | self.decoded_traces = None # decode later when not-raw header/sample is called 26 | 27 | def _ensure_decoded(self) -> None: 28 | """Apply trace decoding pipeline if not already done.""" 29 | if self.decoded_traces is not None: # already done 30 | return 31 | self.decoded_traces = self.trace_decode_pipeline.apply(self.trace_view.copy()) 32 | 33 | @property 34 | def raw_header(self) -> NDArray: 35 | """Get byte array view of the raw headers.""" 36 | header_itemsize = self.segy_file.spec.trace.header.itemsize # should be 240 37 | return 
self.trace_view.header.view(np.dtype((np.void, header_itemsize))) 38 | 39 | @property 40 | def header(self) -> NDArray: 41 | """Get decoded header.""" 42 | self._ensure_decoded() # decode when needed in-place to avoid copy. 43 | return self.decoded_traces.header 44 | 45 | @property 46 | def sample(self) -> NDArray: 47 | """Get decoded trace samples.""" 48 | self._ensure_decoded() # decode when needed in-place to avoid copy. 49 | return self.decoded_traces.sample 50 | -------------------------------------------------------------------------------- /src/mdio/segy/scalar.py: -------------------------------------------------------------------------------- 1 | """Utilities to read, parse, and apply coordinate scalars.""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | from typing import TYPE_CHECKING 7 | 8 | from segy.schema import SegyStandard 9 | from segy.standards.fields import trace as trace_header_fields 10 | 11 | if TYPE_CHECKING: 12 | from numpy.typing import NDArray 13 | from segy import SegyFile 14 | 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | COORD_SCALAR_KEY = trace_header_fields.Rev0.COORDINATE_SCALAR.model.name 20 | VALID_COORD_SCALAR = {1, 10, 100, 1000, 10000} 21 | SCALE_COORDINATE_KEYS = [ 22 | "cdp_x", 23 | "cdp_y", 24 | "source_coord_x", 25 | "source_coord_y", 26 | "group_coord_x", 27 | "group_coord_y", 28 | ] 29 | 30 | 31 | def _get_coordinate_scalar(segy_file: SegyFile) -> int: 32 | """Get and parse the coordinate scalar from the first SEG-Y trace header.""" 33 | file_revision = segy_file.spec.segy_standard 34 | first_header = segy_file.header[0] 35 | coord_scalar = int(first_header[COORD_SCALAR_KEY]) 36 | 37 | # Per Rev2, standardize 0 to 1 if a file is 2+. 38 | if coord_scalar == 0 and file_revision >= SegyStandard.REV2: 39 | logger.warning("Coordinate scalar is 0 and file is %s. Setting to 1.", file_revision) 40 | return 1 41 | 42 | def validate_segy_scalar(scalar: int) -> bool: 43 | """Validate if coord scalar matches the seg-y standard.""" 44 | logger.debug("Coordinate scalar is %s", scalar) 45 | return abs(scalar) in VALID_COORD_SCALAR # valid values 46 | 47 | is_valid = validate_segy_scalar(coord_scalar) 48 | if not is_valid: 49 | msg = f"Invalid coordinate scalar: {coord_scalar} for file revision {file_revision}." 50 | raise ValueError(msg) 51 | 52 | logger.info("Coordinate scalar is parsed as %s", coord_scalar) 53 | return coord_scalar 54 | 55 | 56 | def _apply_coordinate_scalar(data: NDArray, scalar: int) -> NDArray: 57 | if scalar < 0: 58 | scalar = 1 / scalar 59 | return data * abs(scalar) 60 | -------------------------------------------------------------------------------- /.github/labels.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Labels names are important as they are used by Release Drafter to decide 3 | # regarding where to record them in changelog or if to skip them. 4 | # 5 | # The repository labels will be automatically configured using this file and 6 | # the GitHub Action https://github.com/marketplace/actions/github-labeler. 
7 | - name: breaking 8 | description: Breaking Changes 9 | color: bfd4f2 10 | - name: bug 11 | description: Something isn't working 12 | color: d73a4a 13 | - name: build 14 | description: Build System and Dependencies 15 | color: bfdadc 16 | - name: ci 17 | description: Continuous Integration 18 | color: 4a97d6 19 | - name: dependencies 20 | description: Pull requests that update a dependency file 21 | color: 0366d6 22 | - name: documentation 23 | description: Improvements or additions to documentation 24 | color: 0075ca 25 | - name: duplicate 26 | description: This issue or pull request already exists 27 | color: cfd3d7 28 | - name: enhancement 29 | description: New feature or request 30 | color: a2eeef 31 | - name: github_actions 32 | description: Pull requests that update Github_actions code 33 | color: "000000" 34 | - name: good first issue 35 | description: Good for newcomers 36 | color: 7057ff 37 | - name: help wanted 38 | description: Extra attention is needed 39 | color: 008672 40 | - name: invalid 41 | description: This doesn't seem right 42 | color: e4e669 43 | - name: performance 44 | description: Performance 45 | color: "016175" 46 | - name: python 47 | description: Pull requests that update Python code 48 | color: 2b67c6 49 | - name: question 50 | description: Further information is requested 51 | color: d876e3 52 | - name: refactoring 53 | description: Refactoring 54 | color: ef67c4 55 | - name: removal 56 | description: Removals and Deprecations 57 | color: 9ae7ea 58 | - name: style 59 | description: Style 60 | color: c120e5 61 | - name: testing 62 | description: Testing 63 | color: b1fc6f 64 | - name: wontfix 65 | description: This will not be worked on 66 | color: ffffff 67 | -------------------------------------------------------------------------------- /tests/unit/test_dimension.py: -------------------------------------------------------------------------------- 1 | """Dimension tests.""" 2 | 3 | import pytest 4 | 5 | from mdio.core import Dimension 6 | from mdio.exceptions import ShapeError 7 | 8 | 9 | @pytest.fixture 10 | def my_dimension() -> Dimension: 11 | """Mock dimension.""" 12 | return Dimension(coords=range(10, 18, 2), name="dim_0") 13 | 14 | 15 | class TestDimension: 16 | """Basic tests for reading or manipulating dimensions.""" 17 | 18 | def test_len(self, my_dimension: Dimension) -> None: 19 | """Test length method.""" 20 | assert len(my_dimension) == 4 21 | 22 | @pytest.mark.parametrize(("index", "expected"), [(1, 12), (-1, 16), (2, 14)]) 23 | def test_getitem(self, my_dimension: Dimension, index: int, expected: int) -> None: 24 | """Test getter (integer indexing).""" 25 | assert my_dimension[index] == expected 26 | 27 | @pytest.mark.parametrize(("index", "expected"), [(1, 12), (-1, 16), (2, 14)]) 28 | def test_setitem(self, index: int, expected: int) -> None: 29 | """Test setter (integer indexing).""" 30 | other_dim = Dimension(coords=range(4), name="dim_6") 31 | other_dim[index] = expected 32 | assert other_dim[index] == expected 33 | 34 | def test_hash_equality(self, my_dimension: Dimension) -> None: 35 | """Test hashing (and equality checks).""" 36 | other_dim1 = Dimension(coords=range(10, 18, 2), name="dim_0") 37 | other_dim2 = Dimension(coords=range(15), name="dim_1") 38 | assert my_dimension == other_dim1 39 | assert my_dimension != other_dim2 40 | 41 | 42 | class TestExceptions: 43 | """Test custom exceptions and if they're raised properly.""" 44 | 45 | def test_shape_error(self) -> None: 46 | """Wrong shape.""" 47 | with pytest.raises(ShapeError): 
48 | Dimension(coords=[range(10, 18, 2)] * 2, name="dim_0") 49 | 50 | def test_wrong_type_equals(self, my_dimension: Dimension) -> None: 51 | """Wrong type.""" 52 | with pytest.raises(TypeError): 53 | assert my_dimension == ("not", "a", "Dimension") 54 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """Test configuration before everything runs.""" 2 | 3 | from __future__ import annotations 4 | 5 | import warnings 6 | from typing import TYPE_CHECKING 7 | from urllib.request import urlretrieve 8 | 9 | import pytest 10 | 11 | if TYPE_CHECKING: 12 | from pathlib import Path 13 | 14 | # Suppress Dask's chunk balancing warning 15 | warnings.filterwarnings( 16 | "ignore", 17 | message="Could not balance chunks to be equal", 18 | category=UserWarning, 19 | module="dask.array.rechunk", 20 | ) 21 | 22 | 23 | @pytest.fixture(scope="session") 24 | def fake_segy_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path: 25 | """Make a temp file for the fake SEG-Y files we are going to create.""" 26 | return tmp_path_factory.mktemp(r"fake_segy") 27 | 28 | 29 | @pytest.fixture(scope="session") 30 | def segy_input_uri() -> str: 31 | """Path to dome dataset for cloud testing.""" 32 | return "http://s3.amazonaws.com/teapot/filt_mig.sgy" 33 | 34 | 35 | @pytest.fixture(scope="session") 36 | def segy_input(segy_input_uri: str, tmp_path_factory: pytest.TempPathFactory) -> Path: 37 | """Download teapot dome dataset for testing.""" 38 | tmp_dir = tmp_path_factory.mktemp("segy") 39 | tmp_file = tmp_dir / "teapot.segy" 40 | urlretrieve(segy_input_uri, tmp_file) # noqa: S310 41 | return tmp_file 42 | 43 | 44 | @pytest.fixture(scope="module") 45 | def zarr_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path: 46 | """Make a temp file for the output MDIO.""" 47 | return tmp_path_factory.mktemp(r"mdio") 48 | 49 | 50 | @pytest.fixture(scope="module") 51 | def zarr_tmp2(tmp_path_factory: pytest.TempPathFactory) -> Path: # pragma: no cover - used by disabled test 52 | """Make a temp file for the output MDIO.""" 53 | return tmp_path_factory.mktemp(r"mdio2") 54 | 55 | 56 | @pytest.fixture(scope="session") 57 | def segy_export_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path: 58 | """Make a temp file for the round-trip IBM SEG-Y.""" 59 | tmp_dir = tmp_path_factory.mktemp("segy") 60 | return tmp_dir / "teapot_roundtrip.segy" 61 | -------------------------------------------------------------------------------- /src/mdio/exceptions.py: -------------------------------------------------------------------------------- 1 | """Custom exceptions related to MDIO functionality.""" 2 | 3 | from __future__ import annotations 4 | 5 | 6 | class MDIOError(Exception): 7 | """Base exceptions class.""" 8 | 9 | 10 | class ShapeError(MDIOError): 11 | """Raised when shapes of two or more things don't match. 12 | 13 | Args: 14 | message: Message to show with the exception. 15 | names: Names of the variables for the `message`. 16 | shapes: Shapes of the variables for the `message`. 
17 | """ 18 | 19 | def __init__( 20 | self, 21 | message: str, 22 | names: tuple[str, str] | None = None, 23 | shapes: tuple[int, int] | None = None, 24 | ): 25 | if names is not None and shapes is not None: 26 | shape_dict = zip(names, shapes, strict=True) 27 | extras = [f"{name}: {shape}" for name, shape in shape_dict] 28 | extras = " <> ".join(extras) 29 | 30 | message = f"{message} - {extras}" 31 | 32 | super().__init__(message) 33 | 34 | 35 | class WrongTypeError(MDIOError): 36 | """Raised when types of two or things don't match. 37 | 38 | Args: 39 | message: Message to show with the exception. 40 | name: String form of variable's type for the `message`. 41 | expected: String form of expected type for the `message`. 42 | """ 43 | 44 | def __init__(self, message: str, name: str = None, expected: str = None): 45 | if name is not None and expected is not None: 46 | extras = f"Got: {name} Expected: {expected}" 47 | message = f"{message} - {extras}" 48 | 49 | super().__init__(message) 50 | 51 | 52 | class InvalidMDIOError(MDIOError): 53 | """Raised when an invalid MDIO file is encountered.""" 54 | 55 | 56 | class MDIOAlreadyExistsError(MDIOError): 57 | """Raised when MDIO file already exists.""" 58 | 59 | 60 | class MDIONotFoundError(MDIOError): 61 | """Raised when MDIO file doesn't exist.""" 62 | 63 | 64 | class MDIOMissingVariableError(MDIOError): 65 | """Raised when a variable is missing from the MDIO dataset.""" 66 | -------------------------------------------------------------------------------- /src/mdio/core/config.py: -------------------------------------------------------------------------------- 1 | """Environment variable management for MDIO operations.""" 2 | 3 | from psutil import cpu_count 4 | from pydantic import Field 5 | from pydantic_settings import BaseSettings 6 | from pydantic_settings import SettingsConfigDict 7 | 8 | 9 | class MDIOSettings(BaseSettings): 10 | """MDIO environment configuration settings.""" 11 | 12 | # CPU configuration 13 | export_cpus: int = Field( 14 | default_factory=lambda: cpu_count(logical=True), 15 | description="Number of CPUs to use for export operations", 16 | alias="MDIO__EXPORT__CPU_COUNT", 17 | ) 18 | import_cpus: int = Field( 19 | default_factory=lambda: cpu_count(logical=True), 20 | description="Number of CPUs to use for import operations", 21 | alias="MDIO__IMPORT__CPU_COUNT", 22 | ) 23 | 24 | # Grid sparsity configuration 25 | grid_sparsity_ratio_warn: float = Field( 26 | default=2.0, 27 | description="Sparsity ratio threshold for warnings", 28 | alias="MDIO__GRID__SPARSITY_RATIO_WARN", 29 | ) 30 | grid_sparsity_ratio_limit: float = Field( 31 | default=10.0, 32 | description="Sparsity ratio threshold for errors", 33 | alias="MDIO__GRID__SPARSITY_RATIO_LIMIT", 34 | ) 35 | 36 | # Import configuration 37 | save_segy_file_header: bool = Field( 38 | default=False, 39 | description="Whether to save SEG-Y file headers", 40 | alias="MDIO__IMPORT__SAVE_SEGY_FILE_HEADER", 41 | ) 42 | raw_headers: bool = Field( 43 | default=False, 44 | description="Whether to preserve raw headers", 45 | alias="MDIO__IMPORT__RAW_HEADERS", 46 | ) 47 | cloud_native: bool = Field( 48 | default=False, 49 | description="Whether to use cloud-native mode for SEG-Y processing", 50 | alias="MDIO__IMPORT__CLOUD_NATIVE", 51 | ) 52 | 53 | # General configuration 54 | ignore_checks: bool = Field( 55 | default=False, 56 | description="Whether to ignore validation checks", 57 | alias="MDIO_IGNORE_CHECKS", 58 | ) 59 | 60 | model_config = SettingsConfigDict(case_sensitive=True) 
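# Illustrative usage (kept as comments so nothing runs at import time): the aliased
# environment variables are read when a settings object is constructed, e.g.
#
#     os.environ["MDIO__GRID__SPARSITY_RATIO_WARN"] = "3.5"
#     assert MDIOSettings().grid_sparsity_ratio_warn == 3.5
#
# Variables left unset fall back to the defaults declared on the fields above.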
61 | -------------------------------------------------------------------------------- /tests/unit/test_environment.py: -------------------------------------------------------------------------------- 1 | """Tests for the MDIO Environment API.""" 2 | 3 | import os 4 | from unittest.mock import patch 5 | 6 | import pytest 7 | 8 | from mdio.core.config import MDIOSettings 9 | 10 | 11 | class TestEnvironment: 12 | """Test the Environment API module functions.""" 13 | 14 | @pytest.mark.parametrize( 15 | ("env_var", "value", "property_name", "expected"), 16 | [ 17 | ("MDIO__EXPORT__CPU_COUNT", "8", "export_cpus", 8), 18 | ("MDIO__IMPORT__CPU_COUNT", "4", "import_cpus", 4), 19 | ("MDIO__GRID__SPARSITY_RATIO_WARN", "3.5", "grid_sparsity_ratio_warn", 3.5), 20 | ("MDIO__GRID__SPARSITY_RATIO_LIMIT", "15.0", "grid_sparsity_ratio_limit", 15.0), 21 | ], 22 | ) 23 | def test_env_var_overrides(self, env_var: str, value: str, property_name: str, expected: object) -> None: 24 | """Test environment variables override defaults.""" 25 | with patch.dict(os.environ, {env_var: value}): 26 | settings = MDIOSettings() 27 | result = getattr(settings, property_name) 28 | assert result == expected 29 | 30 | def test_environment_isolation(self) -> None: 31 | """Test that environment changes don't affect other tests.""" 32 | original_values = { 33 | "cpus": MDIOSettings().export_cpus, 34 | "ratio": MDIOSettings().grid_sparsity_ratio_warn, 35 | "bool": MDIOSettings().save_segy_file_header, 36 | } 37 | 38 | with patch.dict( 39 | os.environ, 40 | { 41 | "MDIO__EXPORT__CPU_COUNT": "99", 42 | "MDIO__GRID__SPARSITY_RATIO_WARN": "99.9", 43 | "MDIO__IMPORT__SAVE_SEGY_FILE_HEADER": "true", 44 | }, 45 | ): 46 | assert MDIOSettings().export_cpus == 99 47 | assert MDIOSettings().grid_sparsity_ratio_warn == 99.9 48 | assert MDIOSettings().save_segy_file_header is True 49 | 50 | # Values should be restored after context 51 | assert MDIOSettings().export_cpus == original_values["cpus"] 52 | assert MDIOSettings().grid_sparsity_ratio_warn == original_values["ratio"] 53 | assert MDIOSettings().save_segy_file_header == original_values["bool"] 54 | -------------------------------------------------------------------------------- /src/mdio/builder/schemas/v1/stats.py: -------------------------------------------------------------------------------- 1 | """Statistics schema for MDIO v1 arrays. 2 | 3 | This module provides two Histogram classes (CenteredBinHistogram and 4 | EdgeDefinedHistogram),a summary statistics class, and a summary statistics 5 | metadata class. 6 | 7 | SummaryStatistics: a class that represents the minimum summary statistics 8 | of an array consisting of count, sum, sum of squares, min, max, and a histogram. 9 | 10 | SummaryStatisticsMetadata: represents metadata for statistics, with a field 11 | for v1 of the stats. 12 | 13 | CenteredBinHistogram takes the center points of each bin in a histogram, 14 | while EdgeDefinedHistogram takes the left edges and widths of each bin. 15 | Both classes extend from the base class BaseHistogram, which represents 16 | a histogram with count of each bin. 
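Example:
    A minimal, illustrative construction (all numbers below are made up):

        hist = CenteredBinHistogram(counts=[2, 5, 3], bin_centers=[-1.0, 0.0, 1.0])
        stats = SummaryStatistics(
            count=10, sum=4.2, sum_squares=8.1, min=-1.5, max=2.0, histogram=hist
        )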
17 | """ 18 | 19 | from __future__ import annotations 20 | 21 | from typing import TypeAlias 22 | 23 | from pydantic import Field 24 | 25 | from mdio.builder.schemas.core import CamelCaseStrictModel 26 | 27 | 28 | class BaseHistogram(CamelCaseStrictModel): 29 | """Represents a histogram with bin counts.""" 30 | 31 | counts: list[int] = Field(..., description="Count of each bin.") 32 | 33 | 34 | class CenteredBinHistogram(BaseHistogram): 35 | """Class representing a center bin histogram.""" 36 | 37 | bin_centers: list[float | int] = Field(..., description="List of bin centers.") 38 | 39 | 40 | class EdgeDefinedHistogram(BaseHistogram): 41 | """A class representing an edge-defined histogram.""" 42 | 43 | bin_edges: list[float | int] = Field(..., description="The left edges of the histogram bins.") 44 | bin_widths: list[float | int] = Field(..., description="The widths of the histogram bins.") 45 | 46 | 47 | Histogram: TypeAlias = CenteredBinHistogram | EdgeDefinedHistogram 48 | 49 | 50 | class SummaryStatistics(CamelCaseStrictModel): 51 | """Data model for some statistics in MDIO v1 arrays.""" 52 | 53 | count: int = Field(..., description="The number of data points.") 54 | sum: float = Field(..., description="The total of all data values.") 55 | sum_squares: float = Field(..., description="The total of all data values squared.") 56 | min: float = Field(..., description="The smallest value in the variable.") 57 | max: float = Field(..., description="The largest value in the variable.") 58 | histogram: Histogram = Field(..., description="Binned frequency distribution.") 59 | -------------------------------------------------------------------------------- /docs/data_models/compressors.md: -------------------------------------------------------------------------------- 1 | ```{eval-rst} 2 | :tocdepth: 3 3 | ``` 4 | 5 | ```{currentModule} mdio.builder.schemas.compressors 6 | 7 | ``` 8 | 9 | # Compressors 10 | 11 | ```{article-info} 12 | :author: Altay Sansal 13 | :date: "{sub-ref}`today`" 14 | :read-time: "{sub-ref}`wordcount-minutes` min read" 15 | :class-container: sd-p-0 sd-outline-muted sd-rounded-3 sd-font-weight-light 16 | ``` 17 | 18 | ## Dataset Compression 19 | 20 | MDIO relies on [numcodecs] for data compression. We provide good defaults based 21 | on opinionated and limited heuristics for each compressor for various energy datasets. 22 | However, using these data models, the compression can be customized. 23 | 24 | [Numcodecs] is a project that provides a convenient interface to different compression 25 | libraries. We selected the [Blosc] and [ZFP] compressors for lossless and lossy 26 | compression of energy data. 27 | 28 | ## Blosc 29 | 30 | A high-performance compressor optimized for binary data, combining fast compression 31 | with a byte-shuffle filter for enhanced efficiency, particularly effective with 32 | numerical arrays in multi-threaded environments. 33 | 34 | For more details about compression modes, see [Blosc Documentation]. 35 | 36 | ```{eval-rst} 37 | .. autosummary:: 38 | Blosc 39 | ``` 40 | 41 | ## ZFP 42 | 43 | ZFP is a compression algorithm tailored for floating-point and integer arrays, offering 44 | lossy and lossless compression with customizable precision, well-suited for large 45 | scientific datasets with a focus on balancing data fidelity and compression ratio. 46 | 47 | For more details about compression modes, see [ZFP Documentation]. 48 | 49 | ```{eval-rst} 50 | ..
autosummary:: 51 | ZFP 52 | ``` 53 | 54 | [numcodecs]: https://github.com/zarr-developers/numcodecs 55 | [blosc]: https://github.com/Blosc/c-blosc 56 | [blosc documentation]: https://www.blosc.org/python-blosc/python-blosc.html 57 | [zfp]: https://github.com/LLNL/zfp 58 | [zfp documentation]: https://computing.llnl.gov/projects/zfp 59 | 60 | ## Model Reference 61 | 62 | ::: 63 | :::{dropdown} Blosc 64 | :animate: fade-in-slide-down 65 | 66 | ```{eval-rst} 67 | .. autopydantic_model:: Blosc 68 | ``` 69 | 70 | ::: 71 | 72 | :::{dropdown} ZFP 73 | :animate: fade-in-slide-down 74 | 75 | ```{eval-rst} 76 | .. autopydantic_model:: ZFP 77 | 78 | ---------- 79 | 80 | .. autoclass:: ZFPMode() 81 | :members: 82 | :undoc-members: 83 | :member-order: bysource 84 | ``` 85 | 86 | ::: 87 | -------------------------------------------------------------------------------- /src/mdio/commands/copy.py: -------------------------------------------------------------------------------- 1 | """MDIO Dataset copy command.""" 2 | 3 | from __future__ import annotations 4 | 5 | from click import argument 6 | from click import command 7 | from click import option 8 | from click_params import JSON 9 | 10 | 11 | @command(name="copy") 12 | @argument("source-mdio-path", type=str) 13 | @argument("target-mdio-path", type=str) 14 | @option( 15 | "-traces", 16 | "--with-traces", 17 | is_flag=True, 18 | help="Flag to also copy trace data to the target MDIO file", 19 | show_default=True, 20 | ) 21 | @option( 22 | "-headers", 23 | "--with-headers", 24 | is_flag=True, 25 | help="Flag to also copy headers to the target MDIO file", 26 | show_default=True, 27 | ) 28 | @option( 29 | "-storage-input", 30 | "--storage-options-input", 31 | required=False, 32 | help="Storage options for input MDIO file.", 33 | type=JSON, 34 | ) 35 | @option( 36 | "-storage-output", 37 | "--storage-options-output", 38 | required=False, 39 | help="Storage options for output MDIO file.", 40 | type=JSON, 41 | ) 42 | @option( 43 | "-overwrite", 44 | "--overwrite", 45 | is_flag=True, 46 | help="Flag to overwrite the MDIO file if it exists", 47 | show_default=True, 48 | ) 49 | def copy( # noqa: PLR0913 50 | source_mdio_path: str, 51 | target_mdio_path: str, 52 | with_traces: bool = False, 53 | with_headers: bool = False, 54 | storage_options_input: dict | None = None, 55 | storage_options_output: dict | None = None, 56 | overwrite: bool = False, 57 | ) -> None: 58 | """Copy an MDIO dataset to another MDIO dataset. 59 | 60 | This command copies an MDIO file from a source path to a target path, optionally including 61 | trace data, headers, or both, for all access patterns. It creates a new MDIO file at the target 62 | path with the same structure as the source, and selectively copies data based on the provided 63 | flags. The function supports custom storage options for both input and output, enabling 64 | compatibility with various filesystems via FSSpec.
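    Example:
        An illustrative invocation through the module entry point (paths are placeholders):

            python -m mdio copy input.mdio output.mdio -traces -headers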
65 | """ 66 | # Lazy import to reduce CLI startup time 67 | from mdio.api.convenience import copy_mdio # noqa: PLC0415 68 | 69 | copy_mdio( 70 | source_mdio_path, 71 | target_mdio_path, 72 | overwrite, 73 | with_traces, 74 | with_headers, 75 | storage_options_input, 76 | storage_options_output, 77 | ) 78 | 79 | 80 | cli = copy 81 | -------------------------------------------------------------------------------- /src/mdio/segy/exceptions.py: -------------------------------------------------------------------------------- 1 | """Custom exceptions for SEG-Y.""" 2 | 3 | from mdio.exceptions import MDIOError 4 | 5 | 6 | class GridOverrideInputError(MDIOError): 7 | """Raised when grid override parameters are not correct.""" 8 | 9 | 10 | class GridOverrideUnknownError(GridOverrideInputError): 11 | """Raised when grid override parameter is unknown. 12 | 13 | Args: 14 | command_name: Name of the unknown grid override parameter. 15 | """ 16 | 17 | def __init__(self, command_name: str): 18 | self.command_name = command_name 19 | self.message = f"Unknown grid override: {command_name}" 20 | super().__init__(self.message) 21 | 22 | 23 | class GridOverrideKeysError(GridOverrideInputError): 24 | """Raised when grid override is not compatible with required keys. 25 | 26 | Args: 27 | command_name: Name of the grid override command. 28 | required_keys: Set of required keys for the grid override. 29 | """ 30 | 31 | def __init__(self, command_name: str, required_keys: set[str]): 32 | self.command_name = command_name 33 | self.required_keys = required_keys 34 | self.message = f"{command_name} can only be used with {required_keys} keys." 35 | super().__init__(self.message) 36 | 37 | 38 | class GridOverrideMissingParameterError(GridOverrideInputError): 39 | """Raised when grid override parameters are not correct. 40 | 41 | Args: 42 | command_name: Name of the grid override command. 43 | missing_parameter: Set of missing parameters required by the command. 44 | """ 45 | 46 | def __init__(self, command_name: str, missing_parameter: set[str]): 47 | self.command_name = command_name 48 | self.missing_parameter = missing_parameter 49 | self.message = f"{command_name} requires {missing_parameter} parameter." 50 | super().__init__(self.message) 51 | 52 | 53 | class GridOverrideIncompatibleError(GridOverrideInputError): 54 | """Raised when two grid overrides are incompatible. 55 | 56 | Args: 57 | first_command: Name of the first grid override command. 58 | second_command: Name of the second grid override command. 59 | """ 60 | 61 | def __init__(self, first_command: str, second_command: str): 62 | self.first_command = first_command 63 | self.second_command = second_command 64 | self.message = f"{first_command} can't be used together with {second_command}." 
65 | super().__init__(self.message) 66 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | 9 | jobs: 10 | release: 11 | name: Release 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Check out the repository 15 | uses: actions/checkout@v5 16 | with: 17 | fetch-depth: 2 18 | 19 | - name: Install the pinned version of uv 20 | uses: astral-sh/setup-uv@v7 21 | with: 22 | python-version: 3.13 23 | working-directory: ${{ github.workspace }} 24 | 25 | - name: Install bumpversion 26 | run: | 27 | uv tool install --constraint=.github/workflows/constraints.txt bump-my-version 28 | bump-my-version --version 29 | 30 | - name: Check if there is a parent commit 31 | id: check-parent-commit 32 | run: | 33 | echo "sha=$(git rev-parse --verify --quiet HEAD^)" >> $GITHUB_OUTPUT 34 | 35 | - name: Detect and tag new version 36 | id: check-version 37 | if: steps.check-parent-commit.outputs.sha 38 | uses: salsify/action-detect-and-tag-new-version@v2.0.3 39 | with: 40 | version-command: | 41 | bump-my-version show current_version 42 | 43 | - name: Bump version for developmental release 44 | if: "! steps.check-version.outputs.tag" 45 | run: | 46 | bump-my-version bump patch && 47 | version=$(bump-my-version show current_version) && 48 | bump-my-version bump --new-version $version.dev$(date +%s) 49 | 50 | - name: Build package 51 | run: | 52 | uv build 53 | 54 | - name: Publish package on PyPI 55 | if: steps.check-version.outputs.tag 56 | uses: pypa/gh-action-pypi-publish@v1.13.0 57 | with: 58 | user: __token__ 59 | password: ${{ secrets.PYPI_TOKEN }} 60 | 61 | - name: Publish package on TestPyPI 62 | if: "! steps.check-version.outputs.tag" 63 | uses: pypa/gh-action-pypi-publish@v1.13.0 64 | with: 65 | user: __token__ 66 | password: ${{ secrets.TEST_PYPI_TOKEN }} 67 | repository_url: https://test.pypi.org/legacy/ 68 | 69 | - name: Publish the release notes 70 | uses: release-drafter/release-drafter@v6.1.0 71 | with: 72 | publish: ${{ steps.check-version.outputs.tag != '' }} 73 | tag: ${{ steps.check-version.outputs.tag }} 74 | env: 75 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 76 | -------------------------------------------------------------------------------- /src/mdio/core/dimension.py: -------------------------------------------------------------------------------- 1 | """Dimension (grid) abstraction and serializers.""" 2 | 3 | from __future__ import annotations 4 | 5 | from dataclasses import dataclass 6 | from typing import TYPE_CHECKING 7 | 8 | import numpy as np 9 | 10 | from mdio.exceptions import ShapeError 11 | 12 | if TYPE_CHECKING: 13 | from numpy.typing import NDArray 14 | 15 | 16 | @dataclass(eq=False, order=False, slots=True) 17 | class Dimension: 18 | """Dimension class. 19 | 20 | Dimension has a name and coordinates associated with it. The Dimension coordinates can only 21 | be a vector. 22 | 23 | Args: 24 | coords: Vector of coordinates. 25 | name: Name of the dimension. 26 | 27 | Attributes: 28 | coords: Vector of coordinates. 29 | name: Name of the dimension. 
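    Example:
        An illustrative construction (coordinate values are arbitrary):

            inline = Dimension(coords=range(10, 18, 2), name="inline")
            len(inline)  # 4
            inline[1]  # 12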
30 | """ 31 | 32 | coords: list | tuple | NDArray | range 33 | name: str 34 | 35 | def __post_init__(self) -> None: 36 | """Post process and validation.""" 37 | self.coords = np.asarray(self.coords) 38 | if self.coords.ndim != 1: 39 | msg = "Dimensions can only have vector coordinates" 40 | raise ShapeError(msg, ("# Dim", "Expected"), (self.coords.ndim, 1)) 41 | 42 | @property 43 | def size(self) -> int: 44 | """Size of the dimension.""" 45 | return len(self.coords) 46 | 47 | def __len__(self) -> int: 48 | """Length magic.""" 49 | return self.size 50 | 51 | def __getitem__(self, item: int | slice | list[int]) -> NDArray: 52 | """Gets a specific coordinate value by index.""" 53 | return self.coords[item] 54 | 55 | def __setitem__(self, key: int, value: NDArray) -> None: 56 | """Sets a specific coordinate value by index.""" 57 | self.coords[key] = value 58 | 59 | def __hash__(self) -> int: 60 | """Hashing magic.""" 61 | return hash(tuple(self.coords) + (self.name,)) 62 | 63 | def __eq__(self, other: Dimension) -> bool: 64 | """Compares if the dimension has same properties.""" 65 | if not isinstance(other, Dimension): 66 | other_type = type(other).__name__ 67 | msg = f"Can't compare Dimension with {other_type}" 68 | raise TypeError(msg) 69 | 70 | return hash(self) == hash(other) 71 | 72 | def min(self) -> NDArray[float]: 73 | """Get minimum value of dimension.""" 74 | return np.min(self.coords) 75 | 76 | def max(self) -> NDArray[float]: 77 | """Get maximum value of dimension.""" 78 | return np.max(self.coords) 79 | -------------------------------------------------------------------------------- /src/mdio/builder/schemas/v1/variable.py: -------------------------------------------------------------------------------- 1 | """This module defines `LabeledArray`, `Coordinate`, and `Variable`. 2 | 3 | `LabeledArray` is a basic array unit which includes basic properties like 4 | name, dimension, data type, compressor etc. 5 | 6 | `Coordinate` extends the `LabeledArray` class, it represents the Coordinate 7 | array in the MDIO format. It has dimensions which are fully defined and can hold 8 | additional metadata. 9 | 10 | `Variable` is another class that extends the `LabeledArray`. It represents a 11 | variable in MDIO format. It can have coordinates and can also hold metadata. 
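Example:
    A hedged sketch only; it assumes dimensions may be referenced by name as plain
    strings and that ``ScalarType`` comes from ``mdio.builder.schemas.dtype``:

        cdp_x = Coordinate(name="cdp_x", dimensions=["cdp"], data_type=ScalarType.FLOAT64)
        amplitude = Variable(
            name="amplitude",
            dimensions=["cdp", "time"],
            data_type=ScalarType.FLOAT32,
            coordinates=[cdp_x],
        )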
12 | """ 13 | 14 | from typing import Any 15 | 16 | from pydantic import Field 17 | 18 | from mdio.builder.schemas.base import NamedArray 19 | from mdio.builder.schemas.chunk_grid import RectilinearChunkGrid 20 | from mdio.builder.schemas.chunk_grid import RegularChunkGrid 21 | from mdio.builder.schemas.core import CamelCaseStrictModel 22 | from mdio.builder.schemas.dtype import ScalarType 23 | from mdio.builder.schemas.v1.stats import SummaryStatistics 24 | from mdio.builder.schemas.v1.units import AllUnitModel 25 | 26 | 27 | class CoordinateMetadata(CamelCaseStrictModel): 28 | """Reduced Metadata, perfect for simple Coordinates.""" 29 | 30 | units_v1: AllUnitModel | None = Field(default=None) 31 | attributes: dict[str, Any] | None = Field(default=None) 32 | 33 | 34 | class VariableMetadata(CoordinateMetadata): 35 | """Complete Metadata for Variables and complex or large Coordinates.""" 36 | 37 | chunk_grid: RegularChunkGrid | RectilinearChunkGrid | None = Field( 38 | default=None, 39 | description="Chunk grid specification for the array.", 40 | ) 41 | 42 | stats_v1: SummaryStatistics | list[SummaryStatistics] | None = Field( 43 | default=None, 44 | description="Minimal summary statistics.", 45 | ) 46 | 47 | 48 | class Coordinate(NamedArray): 49 | """A simple MDIO Coordinate array with metadata. 50 | 51 | For large or complex Coordinates, define a Variable instead. 52 | """ 53 | 54 | data_type: ScalarType = Field(..., description="Data type of Coordinate.") 55 | metadata: CoordinateMetadata | None = Field(default=None, description="Coordinate Metadata.") 56 | 57 | 58 | class Variable(NamedArray): 59 | """An MDIO Variable that has coordinates and metadata.""" 60 | 61 | coordinates: list[Coordinate] | list[str] | None = Field( 62 | default=None, 63 | description="Coordinates of the MDIO Variable dimensions.", 64 | ) 65 | metadata: VariableMetadata | None = Field(default=None, description="Variable Metadata.") 66 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | """Sphinx configuration.""" 2 | 3 | # -- Project information ----------------------------------------------------- 4 | 5 | project = "MDIO" 6 | author = "TGS" 7 | copyright = "2023, TGS" # noqa: A001 8 | 9 | # -- General configuration --------------------------------------------------- 10 | 11 | # Add any Sphinx extension module names here, as strings. They can be 12 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 13 | # ones. 14 | 15 | extensions = [ 16 | "sphinx.ext.autodoc", 17 | "sphinx.ext.napoleon", 18 | "sphinx.ext.intersphinx", 19 | "sphinx.ext.autosummary", 20 | "sphinxcontrib.autodoc_pydantic", 21 | "sphinx.ext.autosectionlabel", 22 | "sphinx_click", 23 | "sphinx_copybutton", 24 | "myst_nb", 25 | "sphinx_design", 26 | ] 27 | 28 | # List of patterns, relative to source directory, that match files and 29 | # directories to ignore when looking for source files. 30 | # This pattern also affects html_static_path and html_extra_path. 
31 | exclude_patterns = [ 32 | "_build", 33 | "Thumbs.db", 34 | "jupyter_execute", 35 | ".DS_Store", 36 | "**.ipynb_checkpoints", 37 | ] 38 | 39 | intersphinx_mapping = { 40 | "python": ("https://docs.python.org/3", None), 41 | "numpy": ("https://numpy.org/doc/stable/", None), 42 | "pydantic": ("https://docs.pydantic.dev/latest/", None), 43 | "zarr": ("https://zarr.readthedocs.io/en/stable/", None), 44 | } 45 | 46 | pygments_style = "vs" 47 | pygments_dark_style = "material" 48 | 49 | autodoc_typehints = "description" 50 | autodoc_typehints_format = "short" 51 | autodoc_member_order = "groupwise" 52 | autoclass_content = "class" 53 | autosectionlabel_prefix_document = True 54 | 55 | autodoc_pydantic_field_list_validators = False 56 | autodoc_pydantic_field_swap_name_and_alias = True 57 | autodoc_pydantic_field_show_alias = False 58 | autodoc_pydantic_model_show_config_summary = False 59 | autodoc_pydantic_model_show_validator_summary = False 60 | autodoc_pydantic_model_show_validator_members = False 61 | autodoc_pydantic_model_show_field_summary = False 62 | 63 | html_theme = "furo" 64 | 65 | myst_number_code_blocks = ["python"] 66 | myst_heading_anchors = 2 67 | myst_words_per_minute = 80 68 | myst_enable_extensions = [ 69 | "colon_fence", 70 | "linkify", 71 | "replacements", 72 | "smartquotes", 73 | "attrs_inline", 74 | ] 75 | 76 | # sphinx-copybutton configurations 77 | copybutton_prompt_text = r">>> |\.\.\. |\$ |In \[\d*\]: | {2,5}\.\.\.: | {5,8}: " 78 | copybutton_line_continuation_character = "\\" 79 | copybutton_prompt_is_regexp = True 80 | 81 | nb_execution_mode = "off" 82 | -------------------------------------------------------------------------------- /src/mdio/segy/parsers.py: -------------------------------------------------------------------------------- 1 | """Parsers for sections of SEG-Y files.""" 2 | 3 | from __future__ import annotations 4 | 5 | import multiprocessing as mp 6 | from concurrent.futures import ProcessPoolExecutor 7 | from itertools import repeat 8 | from math import ceil 9 | from typing import TYPE_CHECKING 10 | 11 | import numpy as np 12 | from tqdm.auto import tqdm 13 | 14 | from mdio.core.config import MDIOSettings 15 | from mdio.segy._workers import header_scan_worker 16 | 17 | if TYPE_CHECKING: 18 | from segy.arrays import HeaderArray 19 | 20 | from mdio.segy.file import SegyFileArguments 21 | 22 | 23 | def parse_headers( 24 | segy_file_kwargs: SegyFileArguments, 25 | num_traces: int, 26 | subset: tuple[str, ...] | None = None, 27 | block_size: int = 10000, 28 | progress_bar: bool = True, 29 | ) -> HeaderArray: 30 | """Read and parse given `byte_locations` from SEG-Y file. 31 | 32 | Args: 33 | segy_file_kwargs: SEG-Y file arguments. 34 | num_traces: Total number of traces in the SEG-Y file. 35 | subset: Tuple of header names to filter and keep. 36 | block_size: Number of traces to read for each block. 37 | progress_bar: Enable or disable progress bar. Default is True. 38 | 39 | Returns: 40 | HeaderArray. Keys are the index names, values are numpy arrays of parsed headers for the 41 | current block. Array is of type byte_type except IBM32 which is mapped to FLOAT32. 
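    Example:
        A hedged sketch (``segy_kwargs`` is a placeholder for the arguments used to open
        the SEG-Y file, and the header names are illustrative):

            headers = parse_headers(segy_kwargs, num_traces=250_000, subset=("inline", "crossline"))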
42 | """ 43 | settings = MDIOSettings() 44 | 45 | trace_count = num_traces 46 | n_blocks = int(ceil(trace_count / block_size)) 47 | 48 | trace_ranges = [] 49 | for idx in range(n_blocks): 50 | start, stop = idx * block_size, (idx + 1) * block_size 51 | stop = min(stop, trace_count) 52 | 53 | trace_ranges.append((start, stop)) 54 | 55 | num_workers = min(n_blocks, settings.import_cpus) 56 | 57 | tqdm_kw = {"unit": "block", "dynamic_ncols": True} 58 | # For Unix async writes with s3fs/fsspec & multiprocessing, use 'spawn' instead of default 59 | # 'fork' to avoid deadlocks on cloud stores. Slower but necessary. Default on Windows. 60 | context = mp.get_context("spawn") 61 | with ProcessPoolExecutor(num_workers, mp_context=context) as executor: 62 | lazy_work = executor.map(header_scan_worker, repeat(segy_file_kwargs), trace_ranges, repeat(subset)) 63 | 64 | if progress_bar is True: 65 | lazy_work = tqdm( 66 | iterable=lazy_work, 67 | total=n_blocks, 68 | desc="Scanning SEG-Y for geometry attributes", 69 | **tqdm_kw, 70 | ) 71 | 72 | # This executes the lazy work. 73 | headers: list[HeaderArray] = list(lazy_work) 74 | 75 | # Merge blocks before return 76 | return np.concatenate(headers) 77 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | debugging/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # ruff 133 | .ruff_cache/ 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | 138 | # pytype static type analyzer 139 | .pytype/ 140 | 141 | # Cython debug symbols 142 | cython_debug/ 143 | 144 | # IDE settings 145 | .vscode/ 146 | .idea/ 147 | 148 | # tests 149 | mdio1/* 150 | */mdio1/* 151 | pytest-of-* 152 | -------------------------------------------------------------------------------- /src/mdio/builder/templates/seismic_2d_streamer_shot.py: -------------------------------------------------------------------------------- 1 | """Seismic2DStreamerShotGathersTemplate MDIO v1 dataset templates.""" 2 | 3 | from typing import Any 4 | 5 | from mdio.builder.schemas import compressors 6 | from mdio.builder.schemas.dtype import ScalarType 7 | from mdio.builder.schemas.v1.variable import CoordinateMetadata 8 | from mdio.builder.templates.base import AbstractDatasetTemplate 9 | from mdio.builder.templates.types import SeismicDataDomain 10 | 11 | 12 | class Seismic2DStreamerShotGathersTemplate(AbstractDatasetTemplate): 13 | """Seismic Shot pre-stack 2D time or depth Dataset template.""" 14 | 15 | def __init__(self, data_domain: SeismicDataDomain = "time"): 16 | super().__init__(data_domain=data_domain) 17 | 18 | self._dim_names = ("shot_point", "channel", self._data_domain) 19 | self._physical_coord_names = ("source_coord_x", "source_coord_y", "group_coord_x", "group_coord_y") 20 | self._var_chunk_shape = (16, 32, 2048) 21 | 22 | @property 23 | def _name(self) -> str: 24 | return "StreamerShotGathers2D" 25 | 26 | def _load_dataset_attributes(self) -> dict[str, Any]: 27 | return {"surveyType": "2D", "gatherType": "common_source"} 28 | 29 | def _add_coordinates(self) -> None: 30 | # Add dimension coordinates 31 | for name in self._dim_names: 32 | self._builder.add_coordinate( 33 | name, 34 | dimensions=(name,), 35 | data_type=ScalarType.INT32, 36 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(name)), 37 | ) 38 | 39 | # Add non-dimension coordinates 40 | compressor = compressors.Blosc(cname=compressors.BloscCname.zstd) 41 | self._builder.add_coordinate( 42 | "source_coord_x", 43 | dimensions=("shot_point",), 44 | data_type=ScalarType.FLOAT64, 45 | compressor=compressor, 46 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("source_coord_x")), 47 | ) 48 | self._builder.add_coordinate( 49 | "source_coord_y", 50 | dimensions=("shot_point",), 51 | data_type=ScalarType.FLOAT64, 52 | compressor=compressor, 53 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("source_coord_y")), 54 | ) 55 | self._builder.add_coordinate( 56 | "group_coord_x", 57 | dimensions=("shot_point", "channel"), 58 | data_type=ScalarType.FLOAT64, 59 | compressor=compressor, 60 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("group_coord_x")), 61 | ) 62 | self._builder.add_coordinate( 63 | "group_coord_y", 64 | dimensions=("shot_point", "channel"), 65 | data_type=ScalarType.FLOAT64, 66 | compressor=compressor, 67 | 
metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("group_coord_y")), 68 | ) 69 | -------------------------------------------------------------------------------- /tests/test_main.py: -------------------------------------------------------------------------------- 1 | """Test cases for the __main__ module.""" 2 | 3 | import os 4 | from pathlib import Path 5 | 6 | import pytest 7 | from click.testing import CliRunner 8 | 9 | from mdio import __main__ 10 | 11 | 12 | @pytest.fixture 13 | def runner() -> CliRunner: 14 | """Fixture for invoking command-line interfaces.""" 15 | return CliRunner() 16 | 17 | 18 | # TODO(Altay): Redesign and implement the new v1 CLI 19 | # https://github.com/TGSAI/mdio-python/issues/646 20 | @pytest.mark.skip(reason="CLI hasn't been updated to work with v1 yet.") 21 | @pytest.mark.dependency 22 | def test_main_succeeds( 23 | runner: CliRunner, segy_input: Path, zarr_tmp: Path 24 | ) -> None: # pragma: no cover - test is skipped 25 | """It exits with a status code of zero.""" 26 | cli_args = ["segy", "import", str(segy_input), str(zarr_tmp)] 27 | cli_args.extend(["--header-locations", "181,185"]) 28 | cli_args.extend(["--header-names", "inline,crossline"]) 29 | 30 | result = runner.invoke(__main__.main, args=cli_args) 31 | assert result.exit_code == 0 32 | 33 | 34 | @pytest.mark.skip(reason="CLI hasn't been updated to work with v1 yet.") 35 | @pytest.mark.dependency(depends=["test_main_succeeds"]) 36 | def test_main_cloud( 37 | runner: CliRunner, segy_input_uri: str, zarr_tmp: Path 38 | ) -> None: # pragma: no cover - tests is skipped 39 | """It exits with a status code of zero.""" 40 | os.environ["MDIO__IMPORT__CLOUD_NATIVE"] = "true" 41 | cli_args = ["segy", "import", segy_input_uri, str(zarr_tmp)] 42 | cli_args.extend(["--header-locations", "181,185"]) 43 | cli_args.extend(["--header-names", "inline,crossline"]) 44 | cli_args.extend(["--overwrite"]) 45 | 46 | result = runner.invoke(__main__.main, args=cli_args) 47 | assert result.exit_code == 0 48 | 49 | 50 | @pytest.mark.skip(reason="CLI hasn't been updated to work with v1 yet.") 51 | @pytest.mark.dependency(depends=["test_main_succeeds"]) 52 | def test_main_info_succeeds(runner: CliRunner, zarr_tmp: Path) -> None: # pragma: no cover - tests is skipped 53 | """It exits with a status code of zero.""" 54 | cli_args = ["info"] 55 | cli_args.extend([str(zarr_tmp)]) 56 | 57 | result = runner.invoke(__main__.main, args=cli_args) 58 | assert result.exit_code == 0 59 | 60 | 61 | @pytest.mark.skip(reason="CLI hasn't been updated to work with v1 yet.") 62 | @pytest.mark.dependency(depends=["test_main_succeeds"]) 63 | def test_main_copy(runner: CliRunner, zarr_tmp: Path, zarr_tmp2: Path) -> None: # pragma: no cover - tests is skipped 64 | """It exits with a status code of zero.""" 65 | cli_args = ["copy", str(zarr_tmp), str(zarr_tmp2), "-headers", "-traces"] 66 | 67 | result = runner.invoke(__main__.main, args=cli_args) 68 | assert result.exit_code == 0 69 | 70 | 71 | def test_cli_version(runner: CliRunner) -> None: 72 | """Check if version prints without error.""" 73 | cli_args = ["--version"] 74 | result = runner.invoke(__main__.main, args=cli_args) 75 | assert result.exit_code == 0 76 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 
10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | pip-* 29 | tmp* 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | docs/jupyter_execute/ 75 | 76 | # PyBuilder 77 | .pybuilder/ 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | # For a library or package, you might want to ignore these files since the code is 89 | # intended to run in multiple environments; otherwise, check them in: 90 | # .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 100 | __pypackages__/ 101 | 102 | # Celery stuff 103 | celerybeat-schedule 104 | celerybeat.pid 105 | 106 | # SageMath parsed files 107 | *.sage.py 108 | 109 | # Environments 110 | .env 111 | .venv 112 | env/ 113 | venv/ 114 | ENV/ 115 | env.bak/ 116 | venv.bak/ 117 | venv*/ 118 | 119 | # Spyder project settings 120 | .spyderproject 121 | .spyproject 122 | 123 | # Rope project settings 124 | .ropeproject 125 | 126 | # mkdocs documentation 127 | /site 128 | 129 | # mypy 130 | .mypy_cache/ 131 | .dmypy.json 132 | dmypy.json 133 | 134 | # ruff 135 | .ruff_cache/ 136 | 137 | # Pyre type checker 138 | .pyre/ 139 | 140 | # pytype static type analyzer 141 | .pytype/ 142 | 143 | # Cython debug symbols 144 | cython_debug/ 145 | 146 | # IDE settings 147 | .vscode/ 148 | .idea/ 149 | 150 | # tests 151 | mdio1/* 152 | */mdio1/* 153 | pytest-of-* 154 | tmp 155 | debugging/* 156 | -------------------------------------------------------------------------------- /src/mdio/builder/templates/seismic_2d_cdp.py: -------------------------------------------------------------------------------- 1 | """Seismic2DCDPGathersTemplate MDIO v1 dataset templates.""" 2 | 3 | from typing import Any 4 | 5 | from mdio.builder.schemas.compressors import Blosc 6 | from mdio.builder.schemas.compressors import BloscCname 7 | from mdio.builder.schemas.dtype import ScalarType 8 | from mdio.builder.schemas.v1.variable import CoordinateMetadata 9 | from mdio.builder.templates.base import AbstractDatasetTemplate 10 | from mdio.builder.templates.types import CdpGatherDomain 11 | from mdio.builder.templates.types import SeismicDataDomain 12 | 13 | 14 | class Seismic2DCdpGathersTemplate(AbstractDatasetTemplate): 15 | """Seismic CDP pre-stack 2D time or depth Dataset template.""" 16 | 17 | def __init__(self, data_domain: SeismicDataDomain, gather_domain: CdpGatherDomain): 18 | super().__init__(data_domain=data_domain) 19 | self._gather_domain = gather_domain.lower() 20 | 21 | if self._gather_domain not in ["offset", "angle"]: 22 | msg = "gather_type must be 'offset' or 'angle'" 23 | raise ValueError(msg) 24 | 25 | self._dim_names = ("cdp", self._gather_domain, self._data_domain) 26 | self._physical_coord_names = ("cdp_x", "cdp_y") 27 | self._var_chunk_shape = (16, 64, 1024) 28 | 29 | @property 30 | def _name(self) -> str: 31 | gather_domain_suffix = self._gather_domain.capitalize() 32 | data_domain_suffix = self._data_domain.capitalize() 33 | return f"Cdp{gather_domain_suffix}Gathers2D{data_domain_suffix}" 34 | 35 | def _load_dataset_attributes(self) -> dict[str, Any]: 36 | return {"surveyType": "2D", "gatherType": "cdp"} 37 | 38 | def _add_coordinates(self) -> None: 39 | # Add dimension coordinates 40 | self._builder.add_coordinate( 41 | "cdp", 42 | dimensions=("cdp",), 43 | data_type=ScalarType.INT32, 44 | ) 45 | self._builder.add_coordinate( 46 | self._gather_domain, 47 | dimensions=(self._gather_domain,), 48 | data_type=ScalarType.INT32, 49 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(self._gather_domain)), 50 | ) 51 | self._builder.add_coordinate( 52 | self.trace_domain, 53 | dimensions=(self.trace_domain,), 54 | data_type=ScalarType.INT32, 55 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(self.trace_domain)), 56 | ) 57 | 58 | # Add non-dimension coordinates 59 | compressor = Blosc(cname=BloscCname.zstd) 60 | self._builder.add_coordinate( 61 | "cdp_x", 62 | dimensions=("cdp",), 63 | data_type=ScalarType.FLOAT64, 64 | compressor=compressor, 65 | 
metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("cdp_x")), 66 | ) 67 | self._builder.add_coordinate( 68 | "cdp_y", 69 | dimensions=("cdp",), 70 | data_type=ScalarType.FLOAT64, 71 | compressor=compressor, 72 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("cdp_y")), 73 | ) 74 | -------------------------------------------------------------------------------- /src/mdio/builder/templates/seismic_3d_coca.py: -------------------------------------------------------------------------------- 1 | """Seismic3DCocaTemplate MDIO v1 dataset templates.""" 2 | 3 | from typing import Any 4 | 5 | from mdio.builder.schemas import compressors 6 | from mdio.builder.schemas.dtype import ScalarType 7 | from mdio.builder.schemas.v1.variable import CoordinateMetadata 8 | from mdio.builder.templates.base import AbstractDatasetTemplate 9 | from mdio.builder.templates.types import SeismicDataDomain 10 | 11 | 12 | class Seismic3DCocaGathersTemplate(AbstractDatasetTemplate): 13 | """Seismic CoCA (common offset, common azimuth) pre-stack 3D Dataset template.""" 14 | 15 | def __init__(self, data_domain: SeismicDataDomain): 16 | super().__init__(data_domain=data_domain) 17 | 18 | self._dim_names = ("inline", "crossline", "offset", "azimuth", self._data_domain) 19 | self._physical_coord_names = ("cdp_x", "cdp_y") 20 | self._var_chunk_shape = (8, 8, 32, 1, 1024) 21 | 22 | @property 23 | def _name(self) -> str: 24 | return f"CocaGathers3D{self._data_domain.capitalize()}" 25 | 26 | def _load_dataset_attributes(self) -> dict[str, Any]: 27 | return {"surveyType": "3D", "gatherType": "common_offset_common_azimuth"} 28 | 29 | def _add_coordinates(self) -> None: 30 | # Add dimension coordinates 31 | self._builder.add_coordinate( 32 | "inline", 33 | dimensions=("inline",), 34 | data_type=ScalarType.INT32, 35 | ) 36 | self._builder.add_coordinate( 37 | "crossline", 38 | dimensions=("crossline",), 39 | data_type=ScalarType.INT32, 40 | ) 41 | self._builder.add_coordinate( 42 | "offset", 43 | dimensions=("offset",), 44 | data_type=ScalarType.INT32, 45 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("offset")), # same unit as X/Y 46 | ) 47 | self._builder.add_coordinate( 48 | "azimuth", 49 | dimensions=("azimuth",), 50 | data_type=ScalarType.FLOAT32, 51 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("azimuth")), 52 | ) 53 | self._builder.add_coordinate( 54 | self.trace_domain, 55 | dimensions=(self.trace_domain,), 56 | data_type=ScalarType.INT32, 57 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(self.trace_domain)), 58 | ) 59 | 60 | # Add non-dimension coordinates 61 | compressor = compressors.Blosc(cname=compressors.BloscCname.zstd) 62 | self._builder.add_coordinate( 63 | "cdp_x", 64 | dimensions=("inline", "crossline"), 65 | data_type=ScalarType.FLOAT64, 66 | compressor=compressor, 67 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("cdp_x")), 68 | ) 69 | self._builder.add_coordinate( 70 | "cdp_y", 71 | dimensions=("inline", "crossline"), 72 | data_type=ScalarType.FLOAT64, 73 | compressor=compressor, 74 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("cdp_y")), 75 | ) 76 | -------------------------------------------------------------------------------- /src/mdio/builder/schemas/v1/units.py: -------------------------------------------------------------------------------- 1 | """Unit schemas specific to MDIO v1.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import TypeAlias 6 | 7 | from pint import UnitRegistry 8 | 9 | from 
mdio.builder.schemas.units import UnitEnum 10 | from mdio.builder.schemas.units import create_unit_model 11 | 12 | ureg = UnitRegistry() 13 | ureg.formatter.default_format = "~C" # compact, abbreviated (symbol). 14 | 15 | 16 | class LengthUnitEnum(UnitEnum): 17 | """Enum class representing metric units of length.""" 18 | 19 | MILLIMETER = ureg.millimeter 20 | CENTIMETER = ureg.centimeter 21 | METER = ureg.meter 22 | KILOMETER = ureg.kilometer 23 | 24 | INCH = ureg.inch 25 | FOOT = ureg.foot 26 | YARD = ureg.yard 27 | MILE = ureg.mile 28 | 29 | 30 | LengthUnitModel = create_unit_model(LengthUnitEnum, "LengthUnitModel", "length", __name__) 31 | 32 | 33 | class TimeUnitEnum(UnitEnum): 34 | """Enum class representing units of time.""" 35 | 36 | NANOSECOND = ureg.nanosecond 37 | MICROSECOND = ureg.microsecond 38 | MILLISECOND = ureg.millisecond 39 | SECOND = ureg.second 40 | MINUTE = ureg.minute 41 | HOUR = ureg.hour 42 | DAY = ureg.day 43 | 44 | 45 | TimeUnitModel = create_unit_model(TimeUnitEnum, "TimeUnitModel", "time", __name__) 46 | 47 | 48 | class DensityUnitEnum(UnitEnum): 49 | """Enum class representing units of density.""" 50 | 51 | GRAMS_PER_CC = ureg.gram / ureg.centimeter**3 52 | KILOGRAMS_PER_M3 = ureg.kilogram / ureg.meter**3 53 | POUNDS_PER_GAL = ureg.pounds / ureg.gallon 54 | 55 | 56 | DensityUnitModel = create_unit_model(DensityUnitEnum, "DensityUnitModel", "density", __name__) 57 | 58 | 59 | class SpeedUnitEnum(UnitEnum): 60 | """Enum class representing units of speed.""" 61 | 62 | METERS_PER_SECOND = ureg.meter / ureg.second 63 | FEET_PER_SECOND = ureg.feet / ureg.second 64 | 65 | 66 | SpeedUnitModel = create_unit_model(SpeedUnitEnum, "SpeedUnitModel", "speed", __name__) 67 | 68 | 69 | class AngleUnitEnum(UnitEnum): 70 | """Enum class representing units of angle.""" 71 | 72 | DEGREES = ureg.degree 73 | RADIANS = ureg.radian 74 | 75 | 76 | AngleUnitModel = create_unit_model(AngleUnitEnum, "AngleUnitModel", "angle", __name__) 77 | 78 | 79 | class FrequencyUnitEnum(UnitEnum): 80 | """Enum class representing units of frequency.""" 81 | 82 | HERTZ = ureg.hertz 83 | 84 | 85 | FrequencyUnitModel = create_unit_model(FrequencyUnitEnum, "FrequencyUnitModel", "frequency", __name__) 86 | 87 | 88 | class VoltageUnitEnum(UnitEnum): 89 | """Enum class representing units of voltage.""" 90 | 91 | MICROVOLT = ureg.microvolt 92 | MILLIVOLT = ureg.millivolt 93 | VOLT = ureg.volt 94 | 95 | 96 | VoltageUnitModel = create_unit_model(VoltageUnitEnum, "VoltageUnitModel", "voltage", __name__) 97 | 98 | 99 | # Composite model types 100 | AllUnitModel: TypeAlias = ( 101 | LengthUnitModel 102 | | TimeUnitModel 103 | | AngleUnitModel 104 | | DensityUnitModel 105 | | SpeedUnitModel 106 | | FrequencyUnitModel 107 | | VoltageUnitModel 108 | ) 109 | -------------------------------------------------------------------------------- /src/mdio/builder/templates/seismic_3d_streamer_shot.py: -------------------------------------------------------------------------------- 1 | """Seismic3DStreamerShotGathersTemplate MDIO v1 dataset templates.""" 2 | 3 | from typing import Any 4 | 5 | from mdio.builder.schemas import compressors 6 | from mdio.builder.schemas.dtype import ScalarType 7 | from mdio.builder.schemas.v1.variable import CoordinateMetadata 8 | from mdio.builder.templates.base import AbstractDatasetTemplate 9 | from mdio.builder.templates.types import SeismicDataDomain 10 | 11 | 12 | class Seismic3DStreamerShotGathersTemplate(AbstractDatasetTemplate): 13 | """Seismic Shot pre-stack 3D time or depth Dataset 
template.""" 14 | 15 | def __init__(self, data_domain: SeismicDataDomain = "time"): 16 | super().__init__(data_domain=data_domain) 17 | 18 | self._dim_names = ("shot_point", "cable", "channel", self._data_domain) 19 | self._physical_coord_names = ("source_coord_x", "source_coord_y", "group_coord_x", "group_coord_y") 20 | self._logical_coord_names = ("gun",) 21 | self._var_chunk_shape = (8, 1, 128, 2048) 22 | 23 | @property 24 | def _name(self) -> str: 25 | return "StreamerShotGathers3D" 26 | 27 | def _load_dataset_attributes(self) -> dict[str, Any]: 28 | return {"surveyType": "3D", "gatherType": "common_source"} 29 | 30 | def _add_coordinates(self) -> None: 31 | # Add dimension coordinates 32 | for name in self._dim_names: 33 | self._builder.add_coordinate( 34 | name, 35 | dimensions=(name,), 36 | data_type=ScalarType.INT32, 37 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(name)), 38 | ) 39 | 40 | # Add non-dimension coordinates 41 | compressor = compressors.Blosc(cname=compressors.BloscCname.zstd) 42 | self._builder.add_coordinate( 43 | "gun", 44 | dimensions=("shot_point",), 45 | data_type=ScalarType.UINT8, 46 | compressor=compressor, 47 | ) 48 | self._builder.add_coordinate( 49 | "source_coord_x", 50 | dimensions=("shot_point",), 51 | data_type=ScalarType.FLOAT64, 52 | compressor=compressor, 53 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("source_coord_x")), 54 | ) 55 | self._builder.add_coordinate( 56 | "source_coord_y", 57 | dimensions=("shot_point",), 58 | data_type=ScalarType.FLOAT64, 59 | compressor=compressor, 60 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("source_coord_y")), 61 | ) 62 | self._builder.add_coordinate( 63 | "group_coord_x", 64 | dimensions=("shot_point", "cable", "channel"), 65 | data_type=ScalarType.FLOAT64, 66 | compressor=compressor, 67 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("group_coord_x")), 68 | ) 69 | self._builder.add_coordinate( 70 | "group_coord_y", 71 | dimensions=("shot_point", "cable", "channel"), 72 | data_type=ScalarType.FLOAT64, 73 | compressor=compressor, 74 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("group_coord_y")), 75 | ) 76 | -------------------------------------------------------------------------------- /src/mdio/converters/type_converter.py: -------------------------------------------------------------------------------- 1 | """A module for converting numpy dtypes to MDIO scalar and structured types.""" 2 | 3 | from numpy import dtype as np_dtype 4 | 5 | from mdio.builder.schemas.dtype import ScalarType 6 | from mdio.builder.schemas.dtype import StructuredField 7 | from mdio.builder.schemas.dtype import StructuredType 8 | 9 | 10 | def to_scalar_type(data_type: np_dtype) -> ScalarType: 11 | """Convert numpy dtype to MDIO ScalarType. 
12 | 13 | Out of the 24 built-in numpy scalar type objects 14 | (see https://numpy.org/doc/stable/reference/arrays.dtypes.html) 15 | this function supports only a limited subset: 16 | ScalarType.INT8 <-> int8 17 | ScalarType.INT16 <-> int16 18 | ScalarType.INT32 <-> int32 19 | ScalarType.INT64 <-> int64 20 | ScalarType.UINT8 <-> uint8 21 | ScalarType.UINT16 <-> uint16 22 | ScalarType.UINT32 <-> uint32 23 | ScalarType.UINT64 <-> uint64 24 | ScalarType.FLOAT32 <-> float32 25 | ScalarType.FLOAT64 <-> float64 26 | ScalarType.COMPLEX64 <-> complex64 27 | ScalarType.COMPLEX128 <-> complex128 28 | ScalarType.BOOL <-> bool 29 | 30 | Args: 31 | data_type: numpy dtype to convert 32 | 33 | Returns: 34 | ScalarType: corresponding MDIO scalar type 35 | 36 | Raises: 37 | ValueError: if dtype is not supported 38 | """ 39 | try: 40 | return ScalarType(data_type.name) 41 | except ValueError as exc: 42 | err = f"Unsupported numpy dtype '{data_type.name}' for conversion to ScalarType." 43 | raise ValueError(err) from exc 44 | 45 | 46 | def to_structured_type(data_type: np_dtype) -> StructuredType: 47 | """Convert numpy dtype to MDIO StructuredType. 48 | 49 | This function supports only a limited subset of structured types. 50 | In particular: 51 | It does not support nested structured types. 52 | It supports fields of only 13 out of 24 built-in numpy scalar types. 53 | (see `to_scalar_type` for details). 54 | 55 | Args: 56 | data_type: numpy dtype to convert 57 | 58 | Returns: 59 | StructuredType: corresponding MDIO structured type 60 | 61 | Raises: 62 | ValueError: if dtype is not structured or has no fields 63 | 64 | """ 65 | if data_type is None or len(data_type.names or []) == 0: 66 | err = "None or empty dtype provided, cannot convert to StructuredType." 67 | raise ValueError(err) 68 | 69 | fields = [] 70 | for field_name in data_type.names: 71 | field_dtype = data_type.fields[field_name][0] 72 | scalar_type = to_scalar_type(field_dtype) 73 | structured_field = StructuredField(name=field_name, format=scalar_type) 74 | fields.append(structured_field) 75 | return StructuredType(fields=fields) 76 | 77 | 78 | def to_numpy_dtype(data_type: ScalarType | StructuredType) -> np_dtype: 79 | """Get the numpy dtype for a variable.""" 80 | if isinstance(data_type, ScalarType): 81 | return np_dtype(data_type.value) 82 | if isinstance(data_type, StructuredType): 83 | return np_dtype([(f.name, f.format.value) for f in data_type.fields]) 84 | msg = f"Expected ScalarType or StructuredType, got '{type(data_type).__name__}'" 85 | raise ValueError(msg) 86 | -------------------------------------------------------------------------------- /docs/data_models/version_1.md: -------------------------------------------------------------------------------- 1 | ```{eval-rst} 2 | :tocdepth: 3 3 | ``` 4 | 5 | ```{currentModule} mdio.builder.schemas.v1.dataset 6 | 7 | ``` 8 | 9 | # MDIO v1 10 | 11 | ```{article-info} 12 | :author: Altay Sansal 13 | :date: "{sub-ref}`today`" 14 | :read-time: "{sub-ref}`wordcount-minutes` min read" 15 | :class-container: sd-p-0 sd-outline-muted sd-rounded-3 sd-font-weight-light 16 | ``` 17 | 18 | ## Intro 19 | 20 | ```{eval-rst} 21 | .. autosummary:: Dataset 22 | .. autosummary:: DatasetMetadata 23 | ``` 24 | 25 | ## Reference 26 | 27 | :::{dropdown} Dataset 28 | :open: 29 | 30 | ```{eval-rst} 31 | .. autopydantic_model:: Dataset 32 | :inherited-members: BaseModel 33 | 34 | .. 
autopydantic_model:: DatasetMetadata 35 | :inherited-members: BaseModel 36 | ``` 37 | 38 | ::: 39 | :::{dropdown} Variable 40 | 41 | ```{eval-rst} 42 | .. autopydantic_model:: mdio.builder.schemas.v1.variable.Variable 43 | :inherited-members: BaseModel 44 | 45 | .. autopydantic_model:: mdio.builder.schemas.v1.variable.Coordinate 46 | :inherited-members: BaseModel 47 | 48 | .. autopydantic_model:: mdio.builder.schemas.v1.variable.CoordinateMetadata 49 | :inherited-members: BaseModel 50 | 51 | .. autopydantic_model:: mdio.builder.schemas.v1.variable.VariableMetadata 52 | :inherited-members: BaseModel 53 | ``` 54 | 55 | ::: 56 | 57 | :::{dropdown} Units 58 | 59 | ```{eval-rst} 60 | .. automodule:: mdio.builder.schemas.v1.units 61 | :members: LengthUnitModel, 62 | TimeUnitModel, 63 | AngleUnitModel, 64 | DensityUnitModel, 65 | SpeedUnitModel, 66 | FrequencyUnitModel, 67 | VoltageUnitModel 68 | ``` 69 | 70 | ::: 71 | 72 | :::{dropdown} Stats 73 | 74 | ```{eval-rst} 75 | .. autopydantic_model:: mdio.builder.schemas.v1.stats.SummaryStatistics 76 | 77 | .. autopydantic_model:: mdio.builder.schemas.v1.stats.EdgeDefinedHistogram 78 | :inherited-members: BaseModel 79 | 80 | .. autopydantic_model:: mdio.builder.schemas.v1.stats.CenteredBinHistogram 81 | :inherited-members: BaseModel 82 | ``` 83 | 84 | ::: 85 | 86 | :::{dropdown} Enums 87 | 88 | ```{eval-rst} 89 | .. autoclass:: mdio.builder.schemas.v1.units.AngleUnitEnum() 90 | :members: 91 | :undoc-members: 92 | :member-order: bysource 93 | 94 | .. autoclass:: mdio.builder.schemas.v1.units.DensityUnitEnum() 95 | :members: 96 | :undoc-members: 97 | :member-order: bysource 98 | 99 | .. autoclass:: mdio.builder.schemas.v1.units.FrequencyUnitEnum() 100 | :members: 101 | :undoc-members: 102 | :member-order: bysource 103 | 104 | .. autoclass:: mdio.builder.schemas.v1.units.LengthUnitEnum() 105 | :members: 106 | :undoc-members: 107 | :member-order: bysource 108 | 109 | .. autoclass:: mdio.builder.schemas.v1.units.SpeedUnitEnum() 110 | :members: 111 | :undoc-members: 112 | :member-order: bysource 113 | 114 | .. autoclass:: mdio.builder.schemas.v1.units.TimeUnitEnum() 115 | :members: 116 | :undoc-members: 117 | :member-order: bysource 118 | 119 | .. 
autoclass:: mdio.builder.schemas.v1.units.VoltageUnitEnum() 120 | :members: 121 | :undoc-members: 122 | :member-order: bysource 123 | ``` 124 | 125 | ::: 126 | -------------------------------------------------------------------------------- /tests/unit/test_auto_chunking.py: -------------------------------------------------------------------------------- 1 | """Test live mask chunk size calculation.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import TYPE_CHECKING 6 | 7 | import numpy as np 8 | import pytest 9 | 10 | from mdio.core.utils_write import MAX_SIZE_LIVE_MASK 11 | from mdio.core.utils_write import get_constrained_chunksize 12 | from mdio.core.utils_write import get_live_mask_chunksize 13 | 14 | if TYPE_CHECKING: 15 | from numpy.typing import DTypeLike 16 | 17 | 18 | @pytest.mark.parametrize( 19 | ("shape", "dtype", "limit", "expected_chunks"), 20 | [ 21 | ((100,), "int8", 100, (100,)), # 1D full chunk 22 | ((8, 6), "int8", 20, (4, 4)), # 2D adjusted int8 23 | ((6, 8), "int16", 96, (6, 8)), # 2D small int16 24 | ((9, 6, 4), "int8", 100, (5, 5, 4)), # 3D adjusted 25 | ((4, 5), "int32", 4, (1, 1)), # test minimum edge case 26 | ((10, 10), "int8", 1000, (10, 10)), # big limit 27 | ((7, 5), "int8", 35, (7, 5)), # test full primes 28 | ((7, 5), "int8", 23, (4, 4)), # test adjusted primes 29 | ], 30 | ) 31 | @pytest.mark.filterwarnings("ignore:chunk size balancing not possible:UserWarning") 32 | def test_auto_chunking( 33 | shape: tuple[int, ...], 34 | dtype: DTypeLike, 35 | limit: int, 36 | expected_chunks: tuple[int, ...], 37 | ) -> None: 38 | """Test automatic chunking based on size limit and an array spec.""" 39 | result = get_constrained_chunksize(shape, dtype, limit) 40 | assert result == expected_chunks 41 | 42 | 43 | class TestAutoChunkLiveMask: 44 | """Test class for live mask auto chunking.""" 45 | 46 | @pytest.mark.parametrize( 47 | ("shape", "expected_chunks"), 48 | [ 49 | ((100,), (100,)), # small 1d 50 | ((100, 100), (100, 100)), # small 2d 51 | ((50000, 50000), (16667, 16667)), # large 2d 52 | ((1500, 1500, 1500), (750, 750, 750)), # large 3d 53 | ((1000, 1000, 100, 36), (250, 250, 100, 36)), # large 4d 54 | ], 55 | ) 56 | def test_auto_chunk_live_mask( 57 | self, 58 | shape: tuple[int, ...], 59 | expected_chunks: tuple[int, ...], 60 | ) -> None: 61 | """Test auto chunked live mask is within expected number of bytes.""" 62 | result = get_live_mask_chunksize(shape) 63 | assert result == expected_chunks 64 | 65 | @pytest.mark.parametrize( 66 | "shape", 67 | [ 68 | # Below are >250MiB. 
Smaller ones tested above 69 | (32768, 32768), 70 | (46341, 46341), 71 | (86341, 96341), 72 | (55000, 97500), 73 | (100000, 100000), 74 | (512, 216, 512, 400), 75 | (64, 128, 64, 32, 64), 76 | (512, 17, 43, 200, 50), 77 | ], 78 | ) 79 | @pytest.mark.filterwarnings("ignore:chunk size balancing not possible:UserWarning") 80 | def test_auto_chunk_live_mask_nbytes(self, shape: tuple[int, ...]) -> None: 81 | """Test auto chunked live mask is within expected number of bytes.""" 82 | result = get_live_mask_chunksize(shape) 83 | chunk_elements = np.prod(result) 84 | 85 | # We want them to be 250MB +/- 50% 86 | assert chunk_elements > MAX_SIZE_LIVE_MASK * 0.75 87 | assert chunk_elements < MAX_SIZE_LIVE_MASK * 1.25 88 | -------------------------------------------------------------------------------- /src/mdio/builder/templates/seismic_3d_cdp.py: -------------------------------------------------------------------------------- 1 | """Seismic3DCDPGathersTemplate MDIO v1 dataset templates.""" 2 | 3 | from typing import Any 4 | 5 | from mdio.builder.schemas.compressors import Blosc 6 | from mdio.builder.schemas.compressors import BloscCname 7 | from mdio.builder.schemas.dtype import ScalarType 8 | from mdio.builder.schemas.v1.variable import CoordinateMetadata 9 | from mdio.builder.templates.base import AbstractDatasetTemplate 10 | from mdio.builder.templates.types import CdpGatherDomain 11 | from mdio.builder.templates.types import SeismicDataDomain 12 | 13 | 14 | class Seismic3DCdpGathersTemplate(AbstractDatasetTemplate): 15 | """Seismic CDP pre-stack 3D gathers Dataset template.""" 16 | 17 | def __init__(self, data_domain: SeismicDataDomain, gather_domain: CdpGatherDomain): 18 | super().__init__(data_domain=data_domain) 19 | self._gather_domain = gather_domain.lower() 20 | 21 | if self._gather_domain not in ["offset", "angle"]: 22 | msg = "gather_type must be 'offset' or 'angle'" 23 | raise ValueError(msg) 24 | 25 | self._dim_names = ("inline", "crossline", self._gather_domain, self._data_domain) 26 | self._physical_coord_names = ("cdp_x", "cdp_y") 27 | self._var_chunk_shape = (8, 8, 32, 512) 28 | 29 | @property 30 | def _name(self) -> str: 31 | gather_domain_suffix = self._gather_domain.capitalize() 32 | data_domain_suffix = self._data_domain.capitalize() 33 | return f"Cdp{gather_domain_suffix}Gathers3D{data_domain_suffix}" 34 | 35 | def _load_dataset_attributes(self) -> dict[str, Any]: 36 | return {"surveyType": "3D", "gatherType": "cdp"} 37 | 38 | def _add_coordinates(self) -> None: 39 | # Add dimension coordinates 40 | self._builder.add_coordinate( 41 | "inline", 42 | dimensions=("inline",), 43 | data_type=ScalarType.INT32, 44 | ) 45 | self._builder.add_coordinate( 46 | "crossline", 47 | dimensions=("crossline",), 48 | data_type=ScalarType.INT32, 49 | ) 50 | self._builder.add_coordinate( 51 | self._gather_domain, 52 | dimensions=(self._gather_domain,), 53 | data_type=ScalarType.INT32, 54 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(self._gather_domain)), 55 | ) 56 | self._builder.add_coordinate( 57 | self.trace_domain, 58 | dimensions=(self.trace_domain,), 59 | data_type=ScalarType.INT32, 60 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(self.trace_domain)), 61 | ) 62 | 63 | # Add non-dimension coordinates 64 | compressor = Blosc(cname=BloscCname.zstd) 65 | self._builder.add_coordinate( 66 | "cdp_x", 67 | dimensions=("inline", "crossline"), 68 | data_type=ScalarType.FLOAT64, 69 | compressor=compressor, 70 | 
metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("cdp_x")), 71 | ) 72 | self._builder.add_coordinate( 73 | "cdp_y", 74 | dimensions=("inline", "crossline"), 75 | data_type=ScalarType.FLOAT64, 76 | compressor=compressor, 77 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("cdp_y")), 78 | ) 79 | -------------------------------------------------------------------------------- /docs/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | There are different ways to install MDIO: 4 | 5 | - Install the latest release via [`pip`](#using-pip-and-virtualenv) or [`conda`](#using-conda). 6 | - Build the package [from source](#building-from-source). 7 | 8 | ```{note} 9 | We strongly recommend using a virtual environment `venv` or `conda` 10 | to avoid potential conflicts with other Python packages. 11 | ``` 12 | 13 | ## Using `pip` and `virtualenv` 14 | 15 | Install the 64-bit version of Python 3 from https://www.python.org. 16 | 17 | Then we can create a `venv` and install _MDIO_. 18 | 19 | ```shell 20 | $ python -m venv mdio-venv 21 | $ mdio-venv/Scripts/activate 22 | $ pip install -U multidimio 23 | ``` 24 | 25 | To check if the installation was successful, see [checking installation](#checking-installation). 26 | 27 | You can also install some optional dependencies (extras) like this: 28 | 29 | ```shell 30 | $ pip install multidimio[distributed] 31 | $ pip install multidimio[cloud] 32 | $ pip install multidimio[lossy] 33 | ``` 34 | 35 | `distributed` installs [Dask][dask] for parallel, distributed processing.\ 36 | `cloud` installs [fsspec][fsspec] backed I/O libraries for [AWS' S3][s3fs], 37 | [Google's GCS][gcsfs], and [Azure ABS][adlfs].\ 38 | `lossy` will install the [ZFPY][zfp] library for lossy chunk compression. 39 | 40 | [dask]: https://www.dask.org/ 41 | [fsspec]: https://filesystem-spec.readthedocs.io/en/latest/ 42 | [s3fs]: https://s3fs.readthedocs.io/ 43 | [gcsfs]: https://gcsfs.readthedocs.io/ 44 | [adlfs]: https://github.com/fsspec/adlfs 45 | [zfp]: https://computing.llnl.gov/projects/zfp 46 | 47 | ## Using `conda` 48 | 49 | MDIO can also be installed in a `conda` environment. 50 | 51 | ```{note} 52 | _MDIO_ is hosted in the `conda-forge` channel. Make sure to always provide the 53 | `-c conda-forge` flag when running `conda install`, or else it won't be able to find 54 | the package. 55 | ``` 56 | 57 | We first run the following to create and activate an environment: 58 | 59 | ```shell 60 | $ conda create -n mdio-env 61 | $ conda activate mdio-env 62 | ``` 63 | 64 | Then we can install with `conda`: 65 | 66 | ```shell 67 | $ conda install -c conda-forge multidimio 68 | ``` 69 | 70 | The above command will install MDIO into your `conda` environment. 71 | 72 | ```{note} 73 | _MDIO_ extras must be installed separately when using `conda`. 74 | ``` 75 | 76 | ## Checking Installation 77 | 78 | After installing MDIO, run the following: 79 | 80 | ```shell 81 | $ python -c "import mdio; print(mdio.__version__)" 82 | ``` 83 | 84 | You should see the version of MDIO printed to the screen. 85 | 86 | ## Building from Source 87 | 88 | All dependencies of _MDIO_ are Python packages, so the build process is very simple. 89 | To install from source, we need to clone the repo first and then install locally via `pip`. 90 | 91 | ```shell 92 | $ git clone https://github.com/TGSAI/mdio-python.git 93 | $ cd mdio-python 94 | $ pip install .
95 | ``` 96 | 97 | We can also install the extras in a similar way, for example: 98 | 99 | ```shell 100 | $ pip install .[cloud] 101 | ``` 102 | 103 | If you want an editable version of _MDIO_, you can install it with the command below. 104 | This allows you to make code changes on the fly. 105 | 106 | ```shell 107 | $ pip install --editable .[cloud] 108 | ``` 109 | 110 | To check if the installation was successful, see [checking installation](#checking-installation). 111 | -------------------------------------------------------------------------------- /src/mdio/__main__.py: -------------------------------------------------------------------------------- 1 | """Command-line interface.""" 2 | 3 | from __future__ import annotations 4 | 5 | import importlib 6 | from importlib import metadata 7 | from pathlib import Path 8 | from typing import TYPE_CHECKING 9 | 10 | if TYPE_CHECKING: 11 | from collections.abc import Callable 12 | from typing import Any 13 | 14 | import click 15 | 16 | KNOWN_MODULES = ["segy.py", "copy.py", "info.py"] 17 | 18 | 19 | class MyCLI(click.Group): 20 | """CLI generator via plugin design pattern. 21 | 22 | This class dynamically loads command modules from the specified `plugin_folder`. If the 23 | command is another CLI group, the command module must define a `cli = click.Group(...)` and 24 | subsequent commands must be added to this CLI. If it is a single utility, it must have a 25 | variable named `cli` for the command to be exposed. 26 | 27 | Args: 28 | plugin_folder: Path to the directory containing command modules. 29 | *args: Variable length argument list passed to the click.Group. 30 | **kwargs: Arbitrary keyword arguments passed to the click.Group. 31 | """ 32 | 33 | def __init__(self, plugin_folder: Path, *args: Any, **kwargs: Any): # noqa: ANN401 34 | super().__init__(*args, **kwargs) 35 | self.plugin_folder = plugin_folder 36 | self.known_modules = KNOWN_MODULES 37 | 38 | def list_commands(self, _ctx: click.Context) -> list[str]: 39 | """List commands available under `commands` module.""" 40 | rv = [] 41 | for filename in self.plugin_folder.iterdir(): 42 | is_known = filename.name in self.known_modules 43 | is_python = filename.suffix == ".py" 44 | if is_known and is_python: 45 | rv.append(filename.stem) 46 | rv.sort() 47 | return rv 48 | 49 | def get_command(self, _ctx: click.Context, name: str) -> Callable | None: 50 | """Get command implementation from `commands` module.""" 51 | try: 52 | filepath = self.plugin_folder / f"{name}.py" 53 | if filepath.name not in self.known_modules: 54 | click.echo(f"Command {name} is not safe to execute.") 55 | return None 56 | 57 | module_name = f"mdio.commands.{name}" 58 | spec = importlib.util.spec_from_file_location(module_name, str(filepath)) 59 | if spec and spec.loader: 60 | module = importlib.util.module_from_spec(spec) 61 | spec.loader.exec_module(module) 62 | return module.cli 63 | except Exception as e: 64 | click.echo(f"Error loading command {name}: {e}") 65 | return None 66 | 67 | 68 | def get_package_version(package_name: str, default: str = "unknown") -> str: 69 | """Safely fetch the package version, providing a default if not found.""" 70 | try: 71 | return metadata.version(package_name) 72 | except metadata.PackageNotFoundError: 73 | return default 74 | 75 | 76 | @click.command(cls=MyCLI, plugin_folder=Path(__file__).parent / "commands") 77 | @click.version_option(get_package_version("multidimio")) 78 | def main() -> None: 79 | """Welcome to MDIO!
80 | 81 | MDIO is an open source, cloud-native, and scalable storage engine 82 | for various types of energy data. 83 | 84 | MDIO supports importing or exporting various data containers, 85 | hence we allow plugins as subcommands. 86 | 87 | From this main command, we can see the MDIO version. 88 | """ 89 | -------------------------------------------------------------------------------- /tests/unit/test_coordinate_scalar.py: -------------------------------------------------------------------------------- 1 | """Tests for coordinate scalar getters and apply functions.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import TYPE_CHECKING 6 | from unittest.mock import MagicMock 7 | 8 | import numpy as np 9 | import pytest 10 | from segy import SegyFile 11 | from segy.standards import SegyStandard 12 | from segy.standards.fields import trace as trace_header_fields 13 | 14 | from mdio.segy.scalar import _apply_coordinate_scalar 15 | from mdio.segy.scalar import _get_coordinate_scalar 16 | 17 | if TYPE_CHECKING: 18 | from numpy.typing import NDArray 19 | 20 | COORD_SCALAR_KEY = trace_header_fields.Rev0.COORDINATE_SCALAR.model.name 21 | 22 | 23 | @pytest.fixture 24 | def mock_segy_file() -> SegyFile: 25 | """Mock SegyFile object.""" 26 | segy_file = MagicMock(spec=SegyFile) 27 | segy_file.spec = MagicMock() 28 | segy_file.header = [MagicMock()] 29 | return segy_file 30 | 31 | 32 | @pytest.mark.parametrize("scalar", [1, 100, 10000, -10, -1000]) 33 | def test_get_coordinate_scalar_valid(mock_segy_file: SegyFile, scalar: int) -> None: 34 | """Test valid options when getting coordinate scalar.""" 35 | mock_segy_file.spec.segy_standard = SegyStandard.REV1 36 | mock_segy_file.header[0].__getitem__.return_value = scalar 37 | 38 | result = _get_coordinate_scalar(mock_segy_file) 39 | 40 | assert result == scalar 41 | 42 | 43 | @pytest.mark.parametrize( 44 | "revision", 45 | [SegyStandard.REV2, SegyStandard.REV21], 46 | ) 47 | def test_get_coordinate_scalar_zero_rev2_plus(mock_segy_file: SegyFile, revision: SegyStandard) -> None: 48 | """Test when scalar is normalized to 1 (from 0) in Rev2+.""" 49 | mock_segy_file.spec.segy_standard = revision 50 | mock_segy_file.header[0].__getitem__.return_value = 0 51 | 52 | result = _get_coordinate_scalar(mock_segy_file) 53 | 54 | assert result == 1 55 | 56 | 57 | @pytest.mark.parametrize( 58 | ("scalar", "revision", "error_msg"), 59 | [ 60 | (0, SegyStandard.REV0, "Invalid coordinate scalar: 0 for file revision SegyStandard.REV0."), 61 | (110, SegyStandard.REV1, "Invalid coordinate scalar: 110 for file revision SegyStandard.REV1."), 62 | (32768, SegyStandard.REV1, "Invalid coordinate scalar: 32768 for file revision SegyStandard.REV1."), 63 | ], 64 | ) 65 | def test_get_coordinate_scalar_invalid( 66 | mock_segy_file: SegyFile, scalar: int, revision: SegyStandard, error_msg: str 67 | ) -> None: 68 | """Test invalid options when getting coordinate scalar.""" 69 | mock_segy_file.spec.segy_standard = revision 70 | mock_segy_file.header[0].__getitem__.return_value = scalar 71 | 72 | with pytest.raises(ValueError, match=error_msg): 73 | _get_coordinate_scalar(mock_segy_file) 74 | 75 | 76 | @pytest.mark.parametrize( 77 | ("data", "scalar", "expected"), 78 | [ 79 | # POSITIVE 80 | (np.array([1, 2, 3]), 1, np.array([1, 2, 3])), 81 | (np.array([1, 2, 3]), 10, np.array([10, 20, 30])), 82 | (np.array([[1, 2], [3, 4]]), 1000, np.array([[1000, 2000], [3000, 4000]])), 83 | # NEGATIVE 84 | (np.array([1, 2, 3]), -1, np.array([1, 2, 3])), 85 | (np.array([10, 20, 30]), 
-10, np.array([1, 2, 3])), 86 | (np.array([[1000, 2000], [3000, 4000]]), -1000, np.array([[1, 2], [3, 4]])), 87 | ], 88 | ) 89 | def test_apply_coordinate_scalar(data: NDArray, scalar: int, expected: NDArray) -> None: 90 | """Test applying coordinate scalar with negative and positive code.""" 91 | result = _apply_coordinate_scalar(data, scalar) 92 | assert np.allclose(result, expected) 93 | -------------------------------------------------------------------------------- /docs/template_registry.md: -------------------------------------------------------------------------------- 1 | # Template Registry 2 | 3 | A simple, thread-safe place to discover and fetch dataset templates for MDIO. 4 | 5 | ## Why use it 6 | 7 | - One place to find all available templates 8 | - Safe to use across threads and the whole app (singleton) 9 | - Every fetch gives you your own editable copy (no side effects) 10 | - Comes preloaded with common seismic templates 11 | 12 | ```{note} 13 | Fetching a template with `get_template()` returns a deep copy. Editing it will not change the 14 | registry or anyone else’s copy. 15 | ``` 16 | 17 | ## Quick start 18 | 19 | ```python 20 | from mdio.builder.template_registry import get_template, list_templates 21 | 22 | # See what's available 23 | print(list_templates()) 24 | # e.g. ["Seismic2DPostStackTime", "Seismic3DPostStackDepth", ...] 25 | 26 | # Grab a template by name 27 | template = get_template("Seismic3DPostStackTime") 28 | 29 | # Customize your copy (safe) 30 | template.add_units({"amplitude": "unitless"}) 31 | ``` 32 | 33 | ## Common tasks 34 | 35 | ### Fetch a template you can edit 36 | 37 | ```python 38 | from mdio.builder.template_registry import get_template 39 | 40 | template = get_template("Seismic2DPostStackDepth") 41 | # Use/modify template freely — it’s your copy 42 | ``` 43 | 44 | ### List available templates 45 | 46 | ```python 47 | from mdio.builder.template_registry import list_templates 48 | 49 | names = list_templates() 50 | for name in names: 51 | print(name) 52 | ``` 53 | 54 | ### Check if a template exists 55 | 56 | ```python 57 | from mdio.builder.template_registry import is_template_registered 58 | 59 | if is_template_registered("Seismic3DPostStackTime"): 60 | ... # safe to fetch 61 | ``` 62 | 63 | ### Register your own template (optional) 64 | 65 | If you have a custom template class, register an instance so others can fetch it by name: 66 | 67 | ```python 68 | from typing import Any 69 | from mdio.builder.template_registry import register_template 70 | from mdio.builder.templates.base import AbstractDatasetTemplate 71 | from mdio.builder.templates.types import SeismicDataDomain 72 | 73 | 74 | class MyTemplate(AbstractDatasetTemplate): 75 | def __init__(self, domain: SeismicDataDomain = "time"): 76 | super().__init__(domain) 77 | 78 | @property 79 | def _name(self) -> str: 80 | # The public name becomes something like "MyTemplateTime" 81 | return f"MyTemplate{self._data_domain.capitalize()}" 82 | 83 | def _load_dataset_attributes(self) -> dict[str, Any]: 84 | return {"surveyType": "2D", "gatherType": "custom"} 85 | 86 | 87 | # Make it available globally 88 | registered_name = register_template(MyTemplate("time")) 89 | print(registered_name) # "MyTemplateTime" 90 | ``` 91 | 92 | ```{tip} 93 | Use `list_templates()` to discover the exact names to pass to `get_template()`. 94 | ``` 95 | 96 | ## Troubleshooting 97 | 98 | - KeyError: “Template 'XYZ' is not registered.” 99 | - The name is wrong or not registered yet. 
100 | - Call `list_templates()` to see valid names, or `is_template_registered(name)` to check first. 101 | 102 | ## FAQ 103 | 104 | - Do I need to create a TemplateRegistry instance? 105 | No. Use the global helpers: `get_template`, `list_templates`, `register_template`, and `is_template_registered`. 106 | - Are templates shared between callers or threads? 107 | No. Each `get_template()` call returns a deep-copied instance that is safe to modify independently. 108 | 109 | ## API reference 110 | 111 | ```{eval-rst} 112 | .. automodule:: mdio.builder.template_registry 113 | :members: 114 | ``` 115 | -------------------------------------------------------------------------------- /tests/unit/test_segy_spec_validation.py: -------------------------------------------------------------------------------- 1 | """Tests for SEG-Y spec validation against MDIO templates.""" 2 | 3 | from __future__ import annotations 4 | 5 | from unittest.mock import MagicMock 6 | 7 | import pytest 8 | from segy.schema import HeaderField 9 | from segy.standards import get_segy_standard 10 | 11 | from mdio.builder.templates.base import AbstractDatasetTemplate 12 | from mdio.converters.segy import _validate_spec_in_template 13 | 14 | 15 | class TestValidateSpecInTemplate: 16 | """Test cases for _validate_spec_in_template function.""" 17 | 18 | def test_validation_passes_with_all_required_fields(self) -> None: 19 | """Test that validation passes when all required fields are present.""" 20 | template = MagicMock(spec=AbstractDatasetTemplate) 21 | template.spatial_dimension_names = ("inline", "crossline") 22 | template.coordinate_names = ("cdp_x", "cdp_y") 23 | 24 | # Use base SEG-Y standard which includes coordinate_scalar at byte 71 25 | segy_spec = get_segy_standard(1.0) 26 | 27 | # Should not raise any exception 28 | _validate_spec_in_template(segy_spec, template) 29 | 30 | def test_validation_fails_with_missing_fields(self) -> None: 31 | """Test that validation fails when required fields are missing.""" 32 | # Template requiring custom fields not in standard spec 33 | template = MagicMock(spec=AbstractDatasetTemplate) 34 | template.name = "CustomTemplate" 35 | template.spatial_dimension_names = ("custom_dim1", "custom_dim2") 36 | template.coordinate_names = ("custom_coord_x", "custom_coord_y") 37 | 38 | # SegySpec with only one of the required custom fields 39 | spec = get_segy_standard(1.0) 40 | header_fields = [ 41 | HeaderField(name="custom_dim1", byte=189, format="int32"), 42 | ] 43 | segy_spec = spec.customize(trace_header_fields=header_fields) 44 | 45 | # Should raise ValueError listing the missing fields 46 | with pytest.raises(ValueError, match=r"Required fields.*not found in.*segy_spec") as exc_info: 47 | _validate_spec_in_template(segy_spec, template) 48 | 49 | error_message = str(exc_info.value) 50 | assert "custom_dim2" in error_message 51 | assert "custom_coord_x" in error_message 52 | assert "custom_coord_y" in error_message 53 | assert "CustomTemplate" in error_message 54 | 55 | def test_validation_fails_with_missing_coordinate_scalar(self) -> None: 56 | """Test that validation fails when coordinate_scalar is missing, even with all other fields.""" 57 | template = MagicMock(spec=AbstractDatasetTemplate) 58 | template.name = "TestTemplate" 59 | template.spatial_dimension_names = ("inline", "crossline") 60 | template.coordinate_names = ("cdp_x", "cdp_y") 61 | 62 | # Create SegySpec with all standard fields except coordinate_scalar 63 | spec = get_segy_standard(1.0) 64 | # Remove coordinate_scalar from the 
standard fields 65 | standard_fields = [field for field in spec.trace.header.fields if field.name != "coordinate_scalar"] 66 | standard_fields.append(HeaderField(name="not_coordinate_scalar", byte=71, format="int16")) 67 | segy_spec = spec.customize(trace_header_fields=standard_fields) 68 | 69 | # Should raise ValueError for missing coordinate_scalar 70 | with pytest.raises(ValueError, match=r"Required fields.*not found in.*segy_spec") as exc_info: 71 | _validate_spec_in_template(segy_spec, template) 72 | 73 | error_message = str(exc_info.value) 74 | assert "coordinate_scalar" in error_message 75 | assert "TestTemplate" in error_message 76 | -------------------------------------------------------------------------------- /tests/unit/test_type_converter.py: -------------------------------------------------------------------------------- 1 | """Unit tests for the type converter module.""" 2 | 3 | import numpy as np 4 | import pytest 5 | 6 | from mdio.builder.schemas.dtype import ScalarType 7 | from mdio.builder.schemas.dtype import StructuredField 8 | from mdio.builder.schemas.dtype import StructuredType 9 | from mdio.converters.type_converter import to_numpy_dtype 10 | from mdio.converters.type_converter import to_scalar_type 11 | from mdio.converters.type_converter import to_structured_type 12 | 13 | 14 | @pytest.fixture 15 | def supported_scalar_types_map() -> tuple[tuple[ScalarType, str], ...]: 16 | """Supported scalar types and their numpy equivalents.""" 17 | return ( 18 | (ScalarType.INT8, "int8"), 19 | (ScalarType.INT16, "int16"), 20 | (ScalarType.INT32, "int32"), 21 | (ScalarType.INT64, "int64"), 22 | (ScalarType.UINT8, "uint8"), 23 | (ScalarType.UINT16, "uint16"), 24 | (ScalarType.UINT32, "uint32"), 25 | (ScalarType.UINT64, "uint64"), 26 | (ScalarType.FLOAT32, "float32"), 27 | (ScalarType.FLOAT64, "float64"), 28 | (ScalarType.COMPLEX64, "complex64"), 29 | (ScalarType.COMPLEX128, "complex128"), 30 | (ScalarType.BOOL, "bool"), 31 | ) 32 | 33 | 34 | @pytest.fixture 35 | def a_structured_type() -> StructuredType: 36 | """Sample structured type. 37 | 38 | Returns a structured type. 
39 | """ 40 | return StructuredType( 41 | fields=[ 42 | StructuredField(name="x", format=ScalarType.FLOAT64), 43 | StructuredField(name="y", format=ScalarType.FLOAT64), 44 | StructuredField(name="z", format=ScalarType.FLOAT64), 45 | StructuredField(name="id", format=ScalarType.INT32), 46 | StructuredField(name="valid", format=ScalarType.BOOL), 47 | ] 48 | ) 49 | 50 | 51 | def test_to_numpy_dtype(supported_scalar_types_map: tuple[ScalarType, str], a_structured_type: StructuredType) -> None: 52 | """Comprehensive test for to_numpy_dtype function.""" 53 | # Test 0: invalid input 54 | err = "Expected ScalarType or StructuredType, got 'str'" 55 | with pytest.raises(ValueError, match=err): 56 | to_numpy_dtype("parameter of invalid type") 57 | 58 | # Test 1: ScalarType cases - all supported scalar types 59 | for scalar_type, expected_numpy_type in supported_scalar_types_map: 60 | result = to_numpy_dtype(scalar_type) 61 | expected = np.dtype(expected_numpy_type) 62 | assert result == expected 63 | assert isinstance(result, np.dtype) 64 | assert result.name == expected.name 65 | 66 | # Test 2: StructuredType with multiple fields 67 | result_multi = to_numpy_dtype(a_structured_type) 68 | expected_multi = np.dtype( 69 | [("x", "float64"), ("y", "float64"), ("z", "float64"), ("id", "int32"), ("valid", "bool")] 70 | ) 71 | 72 | assert result_multi == expected_multi 73 | assert isinstance(result_multi, np.dtype) 74 | assert len(result_multi.names) == 5 75 | assert set(result_multi.names) == {"x", "y", "z", "id", "valid"} 76 | 77 | 78 | def test_to_scalar_type(supported_scalar_types_map: tuple[ScalarType, str]) -> None: 79 | """Test for to_scalar_type function.""" 80 | for expected_mdio_type, numpy_type in supported_scalar_types_map: 81 | result = to_scalar_type(np.dtype(numpy_type)) 82 | assert result == expected_mdio_type 83 | 84 | 85 | def test_to_structured_type(a_structured_type: StructuredType) -> None: 86 | """Test for to_structured_type function.""" 87 | dtype = np.dtype([("x", "float64"), ("y", "float64"), ("z", "float64"), ("id", "int32"), ("valid", "bool")]) 88 | assert a_structured_type == to_structured_type(dtype) 89 | 90 | dtype = np.dtype([("x", " int: 49 | """Return the integer code of ZFP mode.""" 50 | return zfp_mode_map[self.value] 51 | 52 | 53 | class ZFP(CamelCaseStrictModel): 54 | """Data Model for ZFP options.""" 55 | 56 | name: str = Field(default="zfp", description="Name of the compressor.") 57 | mode: ZFPMode = Field() 58 | 59 | tolerance: float | None = Field( 60 | default=None, 61 | description="Fixed accuracy in terms of absolute error tolerance.", 62 | ) 63 | 64 | rate: float | None = Field( 65 | default=None, 66 | description="Fixed rate in terms of number of compressed bits per value.", 67 | ) 68 | 69 | precision: int | None = Field( 70 | default=None, 71 | description="Fixed precision in terms of number of uncompressed bits per value.", 72 | ) 73 | 74 | @model_validator(mode="after") 75 | def check_requirements(self) -> ZFP: 76 | """Check if ZFP parameters make sense.""" 77 | mode = self.mode 78 | 79 | # Check if reversible mode is provided without other parameters. 
80 | if mode == ZFPMode.REVERSIBLE and any( 81 | getattr(self, key) is not None for key in ["tolerance", "rate", "precision"] 82 | ): 83 | msg = "Other fields must be None in REVERSIBLE mode" 84 | raise ValueError(msg) 85 | 86 | if mode == ZFPMode.FIXED_ACCURACY and self.tolerance is None: 87 | msg = "Tolerance required for FIXED_ACCURACY mode" 88 | raise ValueError(msg) 89 | 90 | if mode == ZFPMode.FIXED_RATE and self.rate is None: 91 | msg = "Rate required for FIXED_RATE mode" 92 | raise ValueError(msg) 93 | 94 | if mode == ZFPMode.FIXED_PRECISION and self.precision is None: 95 | msg = "Precision required for FIXED_PRECISION mode" 96 | raise ValueError(msg) 97 | 98 | return self 99 | 100 | 101 | class CompressorModel(CamelCaseStrictModel): 102 | """Model representing compressor configuration.""" 103 | 104 | compressor: Blosc | ZFP | None = Field(default=None, description="Compression settings.") 105 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributor Guide 2 | 3 | Thank you for your interest in improving this project. 4 | This project is open-source under the [Apache 2.0 license] and 5 | welcomes contributions in the form of bug reports, feature requests, and pull requests. 6 | 7 | Here is a list of important resources for contributors: 8 | 9 | - [Source Code] 10 | - [Documentation] 11 | - [Issue Tracker] 12 | - [Code of Conduct] 13 | 14 | [apache 2.0 license]: https://opensource.org/licenses/Apache-2.0 15 | [source code]: https://github.com/TGSAI/mdio-python 16 | [documentation]: https://mdio-python.readthedocs.io/ 17 | [issue tracker]: https://github.com/TGSAI/mdio-python/issues 18 | 19 | ## How to report a bug 20 | 21 | Report bugs on the [Issue Tracker]. 22 | 23 | When filing an issue, make sure to answer these questions: 24 | 25 | - Which operating system and Python version are you using? 26 | - Which version of this project are you using? 27 | - What did you do? 28 | - What did you expect to see? 29 | - What did you see instead? 30 | 31 | The best way to get your bug fixed is to provide a test case, 32 | and/or steps to reproduce the issue. 33 | 34 | ## How to request a feature 35 | 36 | Request features on the [Issue Tracker]. 37 | 38 | ## How to set up your development environment 39 | 40 | You need Python 3.11+ and the following tools (an example of installing them is shown at the end of this section): 41 | 42 | - [uv] 43 | - [Nox] 44 | 45 | Alternatively, a [Development Container] has been set up to provide 46 | an environment with the required dependencies. This facilitates development on 47 | different systems. 48 | 49 | This should seamlessly enable development for users of [VS Code] on systems with Docker installed. 50 | 51 | ### Known Issues: 52 | 53 | - `git config --global --add safe.directory $(pwd)` might be needed inside the container.
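If you are not using the development container, one possible way to get these tools locally is shown below. This is only an example; any installation method from the [uv] and [Nox] documentation works equally well (the second command mirrors how the CI workflow in `.github/workflows/tests.yml` installs Nox).

```console
$ curl -LsSf https://astral.sh/uv/install.sh | sh
$ uv tool install nox
```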
54 | 55 | ## How to Install and Run MDIO 56 | 57 | Install the package with development and documentation generation requirements: 58 | 59 | ```console 60 | $ uv sync --all-groups 61 | ``` 62 | 63 | You can now run an interactive Python session, 64 | or the command-line interface: 65 | 66 | ```console 67 | $ uv run python 68 | $ uv run mdio 69 | ``` 70 | 71 | [uv]: https://docs.astral.sh/uv/ 72 | [nox]: https://nox.thea.codes/ 73 | [development container]: https://containers.dev/ 74 | [vs code]: https://code.visualstudio.com/docs/devcontainers/containers/ 75 | 76 | ## How to test the project 77 | 78 | Run the full test suite: 79 | 80 | ```console 81 | $ nox 82 | ``` 83 | 84 | List the available Nox sessions: 85 | 86 | ```console 87 | $ nox --list-sessions 88 | ``` 89 | 90 | You can also run a specific Nox session. 91 | For example, invoke the unit test suite like this: 92 | 93 | ```console 94 | $ nox --session=tests 95 | ``` 96 | 97 | Unit tests are located in the _tests_ directory, 98 | and are written using the [pytest] testing framework. 99 | 100 | [pytest]: https://pytest.readthedocs.io/ 101 | 102 | ## How to submit changes 103 | 104 | Open a [pull request] to submit changes to this project. 105 | 106 | Your pull request needs to meet the following guidelines for acceptance: 107 | 108 | - The Nox test suite must pass without errors and warnings. 109 | - Include unit tests. This project currently maintains 90%+ code coverage. 110 | - If your changes add functionality, update the documentation accordingly. 111 | 112 | Feel free to submit early, though—we can always iterate on this. 113 | 114 | To run linting and code formatting checks before committing your change, you can install pre-commit as a Git hook by running the following command: 115 | 116 | ```console 117 | $ nox --session=pre-commit -- install 118 | ``` 119 | 120 | It is recommended to open an issue before starting work on anything. 121 | This will allow a chance to talk it over with the owners and validate your approach. 
122 | 123 | [pull request]: https://github.com/TGSAI/mdio-python/pulls 124 | 125 | 126 | 127 | [code of conduct]: CODE_OF_CONDUCT.md 128 | -------------------------------------------------------------------------------- /tests/unit/v1/templates/test_seismic_templates.py: -------------------------------------------------------------------------------- 1 | """Unit tests for concrete seismic dataset template implementations.""" 2 | 3 | import pytest 4 | 5 | from mdio.builder.template_registry import TemplateRegistry 6 | from mdio.builder.templates.base import AbstractDatasetTemplate 7 | from mdio.builder.templates.seismic_2d_poststack import Seismic2DPostStackTemplate 8 | 9 | 10 | class TestSeismicTemplates: 11 | """Test cases for Seismic2DPostStackTemplate.""" 12 | 13 | def test_chunk_shape_assignment(self) -> None: 14 | """Test that chunk shape is assigned correctly.""" 15 | template = Seismic2DPostStackTemplate("time") 16 | template.build_dataset("test", (50, 50)) 17 | template.full_chunk_shape = (32, 32) 18 | 19 | assert template._var_chunk_shape == (32, 32) 20 | 21 | def test_chunk_shape_assignment_exception(self) -> None: 22 | """Test that chunk shape assignment raises exception for invalid dimensions.""" 23 | template = Seismic2DPostStackTemplate("time") 24 | template.build_dataset("test", (50, 50)) 25 | 26 | with pytest.raises(ValueError, match="Chunk shape.*has.*dimensions, expected"): 27 | template.full_chunk_shape = (32, 32, 32) 28 | 29 | def test_chunk_shape_with_minus_one_before_build(self) -> None: 30 | """Test that chunk shape can be set with -1 before build_dataset.""" 31 | template = Seismic2DPostStackTemplate("time") 32 | 33 | # Should be able to set chunk shape with -1 before build_dataset 34 | template.full_chunk_shape = (32, -1) 35 | 36 | # Before build_dataset, getter should return unexpanded values 37 | assert template.full_chunk_shape == (32, -1) 38 | assert template._var_chunk_shape == (32, -1) 39 | 40 | def test_chunk_shape_with_minus_one_after_build(self) -> None: 41 | """Test that -1 values are expanded after build_dataset.""" 42 | template = Seismic2DPostStackTemplate("time") 43 | template.full_chunk_shape = (32, -1) 44 | 45 | # Build dataset with specific sizes 46 | template.build_dataset("test", (100, 200)) 47 | 48 | # After build_dataset, getter should expand -1 to dimension size 49 | assert template.full_chunk_shape == (32, 200) 50 | assert template._var_chunk_shape == (32, -1) # Internal storage unchanged 51 | 52 | def test_chunk_shape_validation_invalid_values(self) -> None: 53 | """Test that chunk shape setter rejects invalid values.""" 54 | template = Seismic2DPostStackTemplate("time") 55 | template.build_dataset("test", (50, 50)) 56 | 57 | # Test rejection of 0 58 | with pytest.raises(ValueError, match="Chunk size must be positive integer or -1"): 59 | template.full_chunk_shape = (32, 0) 60 | 61 | # Test rejection of negative values other than -1 62 | with pytest.raises(ValueError, match="Chunk size must be positive integer or -1"): 63 | template.full_chunk_shape = (32, -2) 64 | 65 | # Test that positive values and -1 are accepted 66 | template.full_chunk_shape = (32, -1) # Should not raise 67 | template.full_chunk_shape = (32, 16) # Should not raise 68 | 69 | def test_all_templates_inherit_from_abstract(self) -> None: 70 | """Test that all concrete templates inherit from AbstractDatasetTemplate.""" 71 | registry = TemplateRegistry() 72 | template_names = registry.list_all_templates() 73 | 74 | for template_name in template_names: 75 | template = 
registry.get(template_name) 76 | assert isinstance(template, AbstractDatasetTemplate) 77 | # That each template has the required properties and methods 78 | assert hasattr(template, "name") 79 | assert hasattr(template, "default_variable_name") 80 | assert hasattr(template, "trace_domain") 81 | assert hasattr(template, "dimension_names") 82 | assert hasattr(template, "coordinate_names") 83 | assert hasattr(template, "build_dataset") 84 | 85 | assert len(template_names) == len(set(template_names)), f"Duplicate template names found: {template_names}" 86 | -------------------------------------------------------------------------------- /src/mdio/core/indexing.py: -------------------------------------------------------------------------------- 1 | """Indexing logic.""" 2 | 3 | import itertools 4 | from math import ceil 5 | 6 | import numpy as np 7 | 8 | 9 | class ChunkIterator: 10 | """Chunk iterator for multi-dimensional arrays. 11 | 12 | This iterator takes an array shape and chunks and every time it is iterated, it returns 13 | a dictionary (if dimensions are provided) or a tuple of slices that align with 14 | chunk boundaries. When dimensions are provided, they are used as the dictionary keys. 15 | 16 | Args: 17 | shape: The shape of the array. 18 | chunks: The chunk sizes for each dimension. 19 | dim_names: The names of the array dimensions, to be used with DataArray.isel(). 20 | If the dim_names are not provided, a tuple of the slices will be returned. 21 | 22 | Attributes: # noqa: DOC602 23 | arr_shape: Shape of the array. 24 | len_chunks: Length of chunks in each dimension. 25 | dim_chunks: Number of chunks in each dimension. 26 | num_chunks: Total number of chunks. 27 | 28 | Examples: 29 | >> chunks = (3, 4, 5) 30 | >> shape = (5, 11, 19) 31 | >> dims = ["inline", "crossline", "depth"] 32 | >> 33 | >> iter = ChunkIterator(shape=shape, chunks=chunks, dim_names=dims) 34 | >> for i in range(13): 35 | >> region = iter.__next__() 36 | >> print(region) 37 | { "inline": slice(3,6, None), "crossline": slice(0,4, None), "depth": slice(0,5, None) } 38 | 39 | >> iter = ChunkIterator(shape=shape, chunks=chunks, dim_names=None) 40 | >> for i in range(13): 41 | >> region = iter.__next__() 42 | >> print(region) 43 | (slice(3,6,None), slice(0,4,None), slice(0,5,None)) 44 | """ 45 | 46 | def __init__(self, shape: tuple[int, ...], chunks: tuple[int, ...], dim_names: tuple[str, ...] = None): 47 | self.arr_shape = tuple(shape) # Deep copy to ensure immutability 48 | self.len_chunks = tuple(chunks) # Deep copy to ensure immutability 49 | self.dims = dim_names 50 | 51 | # Compute number of chunks per dimension, and total number of chunks 52 | self.dim_chunks = tuple( 53 | [ceil(len_dim / chunk) for len_dim, chunk in zip(self.arr_shape, self.len_chunks, strict=True)] 54 | ) 55 | self.num_chunks = np.prod(self.dim_chunks) 56 | 57 | # Under the hood stuff for the iterator. This generates C-ordered 58 | # permutation of chunk indices. 59 | dim_ranges = [range(dim_len) for dim_len in self.dim_chunks] 60 | self._ranges = itertools.product(*dim_ranges) 61 | self._idx = 0 62 | 63 | def __iter__(self) -> "ChunkIterator": 64 | """Iteration context.""" 65 | return self 66 | 67 | def __len__(self) -> int: 68 | """Get total number of chunks.""" 69 | return self.num_chunks 70 | 71 | def __next__(self) -> dict[str, slice]: 72 | """Iteration logic.""" 73 | if self._idx <= self.num_chunks: 74 | # We build slices here. 
It is dimension agnostic 75 | current_start = next(self._ranges) 76 | 77 | start_indices = tuple(dim * chunk for dim, chunk in zip(current_start, self.len_chunks, strict=True)) 78 | 79 | # Calculate stop indices, making the last slice fit the data exactly 80 | stop_indices = tuple( 81 | min((dim + 1) * chunk, self.arr_shape[i]) 82 | for i, (dim, chunk) in enumerate(zip(current_start, self.len_chunks, strict=True)) 83 | ) 84 | 85 | slices = tuple(slice(start, stop) for start, stop in zip(start_indices, stop_indices, strict=True)) 86 | 87 | if self.dims: # noqa SIM108 88 | # Example 89 | # {"inline":slice(3,6,None), "crossline":slice(0,4,None), "depth":slice(0,5,None)} 90 | result = dict(zip(self.dims, slices, strict=False)) 91 | else: 92 | # Example 93 | # (slice(3,6,None), slice(0,4,None), slice(0,5,None)) 94 | result = slices 95 | 96 | self._idx += 1 97 | 98 | return result 99 | 100 | raise StopIteration 101 | -------------------------------------------------------------------------------- /src/mdio/builder/templates/seismic_3d_streamer_field.py: -------------------------------------------------------------------------------- 1 | """Seismic3DStreamerFieldRecordsTemplate MDIO v1 dataset templates.""" 2 | 3 | from typing import Any 4 | 5 | from mdio.builder.schemas.dtype import ScalarType 6 | from mdio.builder.schemas.v1.variable import CoordinateMetadata 7 | from mdio.builder.templates.base import AbstractDatasetTemplate 8 | from mdio.builder.templates.types import SeismicDataDomain 9 | 10 | 11 | class Seismic3DStreamerFieldRecordsTemplate(AbstractDatasetTemplate): 12 | """Seismic 3D streamer shot field records template. 13 | 14 | A generalized template for streamer field records that are optimized for: 15 | - Common-shot access 16 | - Common-channel access 17 | 18 | It can also store all the shot-lines of a survey in one MDIO if needed. 19 | 20 | Args: 21 | data_domain: The domain of the dataset. 
22 | """ 23 | 24 | def __init__(self, data_domain: SeismicDataDomain = "time"): 25 | super().__init__(data_domain=data_domain) 26 | 27 | self._spatial_dim_names = ("sail_line", "gun", "shot_index", "cable", "channel") 28 | self._calculated_dims = ("shot_index",) 29 | self._dim_names = (*self._spatial_dim_names, self._data_domain) 30 | self._physical_coord_names = ("source_coord_x", "source_coord_y", "group_coord_x", "group_coord_y") 31 | self._logical_coord_names = ("shot_point", "orig_field_record_num") # ffid 32 | self._var_chunk_shape = (1, 1, 16, 1, 32, 1024) 33 | 34 | @property 35 | def _name(self) -> str: 36 | return "StreamerFieldRecords3D" 37 | 38 | def _load_dataset_attributes(self) -> dict[str, Any]: 39 | return {"surveyDimensionality": "3D", "gatherType": "common_source"} 40 | 41 | def _add_coordinates(self) -> None: 42 | # Add dimension coordinates 43 | # EXCLUDE: `shot_index` since its 0-N 44 | self._builder.add_coordinate( 45 | "sail_line", 46 | dimensions=("sail_line",), 47 | data_type=ScalarType.UINT32, 48 | ) 49 | self._builder.add_coordinate( 50 | "gun", 51 | dimensions=("gun",), 52 | data_type=ScalarType.UINT8, 53 | ) 54 | self._builder.add_coordinate( 55 | "cable", 56 | dimensions=("cable",), 57 | data_type=ScalarType.UINT8, 58 | ) 59 | self._builder.add_coordinate( 60 | "channel", 61 | dimensions=("channel",), 62 | data_type=ScalarType.UINT16, 63 | ) 64 | self._builder.add_coordinate( 65 | self._data_domain, 66 | dimensions=(self._data_domain,), 67 | data_type=ScalarType.INT32, 68 | ) 69 | 70 | # Add non-dimension coordinates 71 | self._builder.add_coordinate( 72 | "orig_field_record_num", 73 | dimensions=("sail_line", "gun", "shot_index"), 74 | data_type=ScalarType.UINT32, 75 | ) 76 | self._builder.add_coordinate( 77 | "shot_point", 78 | dimensions=("sail_line", "gun", "shot_index"), 79 | data_type=ScalarType.UINT32, 80 | ) 81 | self._builder.add_coordinate( 82 | "source_coord_x", 83 | dimensions=("sail_line", "gun", "shot_index"), 84 | data_type=ScalarType.FLOAT64, 85 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("source_coord_x")), 86 | ) 87 | self._builder.add_coordinate( 88 | "source_coord_y", 89 | dimensions=("sail_line", "gun", "shot_index"), 90 | data_type=ScalarType.FLOAT64, 91 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("source_coord_y")), 92 | ) 93 | self._builder.add_coordinate( 94 | "group_coord_x", 95 | dimensions=("sail_line", "gun", "shot_index", "cable", "channel"), 96 | data_type=ScalarType.FLOAT64, 97 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("group_coord_x")), 98 | ) 99 | self._builder.add_coordinate( 100 | "group_coord_y", 101 | dimensions=("sail_line", "gun", "shot_index", "cable", "channel"), 102 | data_type=ScalarType.FLOAT64, 103 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("group_coord_y")), 104 | ) 105 | -------------------------------------------------------------------------------- /src/mdio/api/io.py: -------------------------------------------------------------------------------- 1 | """Utils for reading MDIO dataset.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import TYPE_CHECKING 6 | from typing import Any 7 | from typing import Literal 8 | 9 | import zarr 10 | from upath import UPath 11 | from xarray import Dataset as xr_Dataset 12 | from xarray import open_zarr as xr_open_zarr 13 | from xarray.backends.writers import to_zarr as xr_to_zarr 14 | 15 | from mdio.constants import ZarrFormat 16 | from mdio.core.zarr_io import 
zarr_warnings_suppress_unstable_structs_v3 17 | 18 | if TYPE_CHECKING: 19 | from collections.abc import Mapping 20 | from pathlib import Path 21 | 22 | from xarray import Dataset 23 | from xarray.core.types import T_Chunks 24 | from xarray.core.types import ZarrWriteModes 25 | 26 | 27 | def _normalize_path(path: UPath | Path | str) -> UPath: 28 | return UPath(path) 29 | 30 | 31 | def _normalize_storage_options(path: UPath) -> dict[str, Any] | None: 32 | return None if len(path.storage_options) == 0 else path.storage_options 33 | 34 | 35 | def open_mdio(input_path: UPath | Path | str, chunks: T_Chunks = None) -> xr_Dataset: 36 | """Open a Zarr dataset from the specified universal file path. 37 | 38 | Args: 39 | input_path: Universal input path for the MDIO dataset. 40 | chunks: If provided, loads data into dask arrays with new chunking. 41 | - ``chunks="auto"`` will use dask ``auto`` chunking 42 | - ``chunks=None`` skips using dask, which is generally faster for small arrays. 43 | - ``chunks=-1`` loads the data with dask using a single chunk for all arrays. 44 | - ``chunks={}`` loads the data with dask using the engine's preferred chunk size (on disk). 45 | - ``chunks={dim: chunk, ...}`` loads the data with dask using the specified chunk size for each dimension. 46 | 47 | See dask chunking for more details. 48 | 49 | Returns: 50 | An Xarray dataset opened from the input path. 51 | """ 52 | input_path = _normalize_path(input_path) 53 | storage_options = _normalize_storage_options(input_path) 54 | zarr_format = zarr.config.get("default_zarr_format") 55 | 56 | return xr_open_zarr( 57 | input_path.as_posix(), 58 | chunks=chunks, 59 | storage_options=storage_options, 60 | mask_and_scale=zarr_format == ZarrFormat.V3, # off for v2, on for v3 61 | consolidated=zarr_format == ZarrFormat.V2, # on for v2, off for v3 62 | ) 63 | 64 | 65 | def to_mdio( # noqa: PLR0913 66 | dataset: Dataset, 67 | output_path: UPath | Path | str, 68 | mode: ZarrWriteModes | None = None, 69 | *, 70 | compute: bool = True, 71 | region: Mapping[str, slice | Literal["auto"]] | Literal["auto"] | None = None, 72 | ) -> None: 73 | """Write dataset contents to an MDIO output_path. 74 | 75 | Args: 76 | dataset: The dataset to write. 77 | output_path: The universal path of the output MDIO file. 78 | mode: Persistence mode: "w" means create (overwrite if exists) 79 | "w-" means create (fail if exists) 80 | "a" means override all existing variables including dimension coordinates (create if does not exist) 81 | "a-" means only append those variables that have ``append_dim``. 82 | "r+" means modify existing array *values* only (raise an error if any metadata or shapes would change). 83 | The default mode is "r+" if ``region`` is set and ``w-`` otherwise. 84 | compute: If True write array data immediately; otherwise return a ``dask.delayed.Delayed`` object that 85 | can be computed to write array data later. Metadata is always updated eagerly. 86 | region: Optional mapping from dimension names to either a) ``"auto"``, or b) integer slices, indicating 87 | the region of existing MDIO array(s) in which to write this dataset's data. 
88 | """ 89 | output_path = _normalize_path(output_path) 90 | storage_options = _normalize_storage_options(output_path) 91 | zarr_format = zarr.config.get("default_zarr_format") 92 | 93 | with zarr_warnings_suppress_unstable_structs_v3(): 94 | xr_to_zarr( 95 | dataset, 96 | store=output_path.as_posix(), # xarray doesn't like URI when file:// is protocol 97 | mode=mode, 98 | compute=compute, 99 | consolidated=zarr_format == ZarrFormat.V2, # on for v2, off for v3 100 | region=region, 101 | storage_options=storage_options, 102 | write_empty_chunks=False, 103 | ) 104 | -------------------------------------------------------------------------------- /src/mdio/segy/compat.py: -------------------------------------------------------------------------------- 1 | """Generate SEG-Y spec MDIO backward compatibility. 2 | 3 | We were limited to fixed field names and byte locations due to using the segyio library. Since 4 | MDIO 0.8.0 we have a more powerful SEG-Y parser and it gives more flexibility. To support older 5 | files, we need to open them with the old SEG-Y spec. This is where we define it. 6 | """ 7 | 8 | from __future__ import annotations 9 | 10 | import logging 11 | from importlib import metadata 12 | 13 | from packaging import version 14 | from segy.alias.segyio import SEGYIO_BIN_FIELD_MAP 15 | from segy.alias.segyio import SEGYIO_TRACE_FIELD_MAP 16 | from segy.schema import HeaderField 17 | from segy.schema import HeaderSpec 18 | from segy.schema import ScalarType 19 | from segy.schema import SegySpec 20 | from segy.schema import TextHeaderSpec 21 | from segy.schema import TraceDataSpec 22 | from segy.schema import TraceSpec 23 | from segy.standards.fields import binary 24 | 25 | from mdio.exceptions import InvalidMDIOError 26 | 27 | MDIO_VERSION = metadata.version("multidimio") 28 | 29 | 30 | logger = logging.getLogger(__name__) 31 | 32 | 33 | def get_binary_fields() -> list[HeaderField]: 34 | """Generate binary header fields from equinor/segyio fields.""" 35 | revision_field = binary.Rev1.SEGY_REVISION.model 36 | mdio_v0_bin_fields = [] 37 | 38 | # Replace min/max (rev2-ish) with rev1 like parsing. Ignore minor one, and add the 39 | # revision as 4-byte. 40 | for alias, field in SEGYIO_BIN_FIELD_MAP.items(): 41 | if alias == "SEGYRevision": 42 | mdio_v0_bin_fields.append(revision_field) 43 | elif alias != "SEGYRevisionMinor": 44 | mdio_v0_bin_fields.append(field.model) 45 | return mdio_v0_bin_fields 46 | 47 | 48 | def get_trace_fields(version_str: str) -> list[HeaderField]: 49 | """Generate trace header fields. 50 | 51 | This part allows us to configure custom rules for different MDIO versions. 52 | 53 | For instance, since MDIO 0.8.0 we also save the unassigned parts of the trace header (after 54 | byte 233 / offset 232). To be able to ingest/export new MDIO files and also support exporting 55 | older MDIO files, we conditionally add the new field based on MDIO version specified above. 56 | 57 | Current rules: 58 | * mdio<=0.7.4 use the segyio mappings directly. 59 | * mdio>=0.8.0 adds an extra field to the end to fill the last 8 bytes 60 | 61 | Args: 62 | version_str: MDIO version to generate the trace fields for. 63 | 64 | Returns: 65 | List of header fields for specified MDIO version trace header encoding. 
66 | """ 67 | trace_fields = [field.model for field in SEGYIO_TRACE_FIELD_MAP.values()] 68 | version_obj = version.parse(version_str) 69 | if version_obj > version.parse("0.7.4"): 70 | trace_fields.append(HeaderField(name="unassigned", byte=233, format="int64")) 71 | return trace_fields 72 | 73 | 74 | def mdio_segy_spec(version_str: str | None = None) -> SegySpec: 75 | """Get a SEG-Y encoding spec for MDIO based on version.""" 76 | version_str = MDIO_VERSION if version_str is None else version_str 77 | 78 | binary_fields = get_binary_fields() 79 | trace_fields = get_trace_fields(version_str) 80 | 81 | return SegySpec( 82 | segy_standard=None, 83 | text_header=TextHeaderSpec(), 84 | binary_header=HeaderSpec(fields=binary_fields, item_size=400, offset=3200), 85 | trace=TraceSpec( 86 | header=HeaderSpec(fields=trace_fields, item_size=240), 87 | data=TraceDataSpec(format=ScalarType.IBM32), # placeholder 88 | ), 89 | ) 90 | 91 | 92 | def encode_segy_revision(binary_header: dict) -> dict: 93 | """Encode revision code to binary header. 94 | 95 | Return the correctly Rev1-like encoded revision code, ready to write to SEG-Y. 96 | 97 | Args: 98 | binary_header: Dictionary representing the SEG-Y binary header. Contains keys for major 99 | and minor revision numbers. 100 | 101 | Returns: 102 | The updated binary header with the encoded revision. 103 | 104 | Raises: 105 | InvalidMDIOError: Raised when binary header in MDIO is broken. 106 | """ 107 | major_key, minor_key = "segy_revision_major", "segy_revision_minor" 108 | 109 | try: 110 | major = binary_header.pop(major_key) 111 | minor = binary_header.pop(minor_key) 112 | except KeyError: 113 | msg = "Missing revision keys from binary header." 114 | logger.error(msg) 115 | raise InvalidMDIOError(msg) from KeyError 116 | 117 | code = (major << 8) | minor 118 | code_hex = f"0x{code:04x}" 119 | binary_header["segy_revision"] = code 120 | logger.info("Encoded revision %s.%s to code=%s ~ %s", major, minor, code, code_hex) 121 | return binary_header 122 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | - push 5 | - pull_request 6 | 7 | jobs: 8 | tests: 9 | name: ${{ matrix.session }} ${{ matrix.python }} / ${{ matrix.os }} 10 | runs-on: ${{ matrix.os }} 11 | strategy: 12 | fail-fast: false 13 | matrix: 14 | include: 15 | - { python: "3.13", os: "ubuntu-latest", session: "pre-commit" } 16 | # - { python: "3.13", os: "ubuntu-latest", session: "mypy" } 17 | # - { python: "3.12", os: "ubuntu-latest", session: "mypy" } 18 | # - { python: "3.11", os: "ubuntu-latest", session: "mypy" } 19 | - { python: "3.13", os: "ubuntu-latest", session: "tests" } 20 | - { python: "3.12", os: "ubuntu-latest", session: "tests" } 21 | - { python: "3.11", os: "ubuntu-latest", session: "tests" } 22 | - { python: "3.13", os: "windows-latest", session: "tests" } 23 | - { python: "3.13", os: "macos-latest", session: "tests" } 24 | # - { python: "3.13", os: "ubuntu-latest", session: "typeguard" } 25 | # - { python: "3.12", os: "ubuntu-latest", session: "typeguard" } 26 | # - { python: "3.11", os: "ubuntu-latest", session: "typeguard" } 27 | # - { python: "3.13", os: "ubuntu-latest", session: "xdoctest" } 28 | - { python: "3.13", os: "ubuntu-latest", session: "docs-build" } 29 | 30 | env: 31 | NOXSESSION: ${{ matrix.session }} 32 | FORCE_COLOR: "1" 33 | PRE_COMMIT_COLOR: "always" 34 | 35 | steps: 36 | - name: Check 
out the repository 37 | uses: actions/checkout@v5 38 | 39 | - name: Set up Python ${{ matrix.python }} 40 | uses: actions/setup-python@v6 41 | with: 42 | python-version: ${{ matrix.python }} 43 | 44 | - name: Install the pinned version of uv 45 | uses: astral-sh/setup-uv@v7 46 | with: 47 | python-version: ${{ matrix.python }} 48 | working-directory: ${{ github.workspace }} 49 | 50 | - name: Install Nox 51 | run: | 52 | uv tool install -c "${{ github.workspace }}/.github/workflows/constraints.txt" nox 53 | nox --version 54 | 55 | - name: Compute pre-commit cache key 56 | if: matrix.session == 'pre-commit' 57 | id: pre-commit-cache 58 | shell: python 59 | run: | 60 | import hashlib 61 | import sys 62 | import os 63 | 64 | python = "py{}.{}".format(*sys.version_info[:2]) 65 | payload = sys.version.encode() + sys.executable.encode() 66 | digest = hashlib.sha256(payload).hexdigest() 67 | result = "${{ runner.os }}-{}-{}-pre-commit".format(python, digest[:8]) 68 | 69 | print("result={}".format(result), file=open(os.environ['GITHUB_OUTPUT'], 'a')) 70 | 71 | - name: Restore pre-commit cache 72 | uses: actions/cache@v4 73 | if: matrix.session == 'pre-commit' 74 | with: 75 | path: ~/.cache/pre-commit 76 | key: ${{ steps.pre-commit-cache.outputs.result }}-${{ hashFiles('.pre-commit-config.yaml') }} 77 | restore-keys: | 78 | ${{ steps.pre-commit-cache.outputs.result }}- 79 | 80 | - name: Run Nox 81 | run: | 82 | nox --python=${{ matrix.python }} 83 | 84 | - name: Upload coverage data 85 | if: always() && matrix.session == 'tests' 86 | uses: actions/upload-artifact@v5 87 | with: 88 | name: coverage-data-${{ matrix.os }}-${{ matrix.python }} 89 | include-hidden-files: true 90 | path: ".coverage.*" 91 | 92 | - name: Upload documentation 93 | if: matrix.session == 'docs-build' 94 | uses: actions/upload-artifact@v5 95 | with: 96 | name: docs 97 | path: docs/_build 98 | 99 | coverage: 100 | runs-on: ubuntu-latest 101 | needs: tests 102 | steps: 103 | - name: Check out the repository 104 | uses: actions/checkout@v5 105 | 106 | - name: Install the pinned version of uv 107 | uses: astral-sh/setup-uv@v7 108 | with: 109 | python-version: 3.13 110 | working-directory: ${{ github.workspace }} 111 | 112 | - name: Install Nox 113 | run: | 114 | uv tool install -c "${{ github.workspace }}/.github/workflows/constraints.txt" nox 115 | nox --version 116 | 117 | - name: Download coverage data 118 | uses: actions/download-artifact@v6 119 | with: 120 | pattern: coverage-data-* 121 | merge-multiple: true 122 | 123 | - name: Combine coverage data and display human readable report 124 | run: | 125 | nox --session=coverage 126 | 127 | - name: Create coverage report 128 | run: | 129 | nox --session=coverage -- xml 130 | 131 | - name: Upload coverage report 132 | uses: codecov/codecov-action@v5.5.1 133 | env: 134 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 135 | -------------------------------------------------------------------------------- /docs/data_models/chunk_grids.md: -------------------------------------------------------------------------------- 1 | ```{eval-rst} 2 | :tocdepth: 3 3 | ``` 4 | 5 | ```{currentModule} mdio.builder.schemas.chunk_grid 6 | 7 | ``` 8 | 9 | # Chunk Grid Models 10 | 11 | ```{article-info} 12 | :author: Altay Sansal 13 | :date: "{sub-ref}`today`" 14 | :read-time: "{sub-ref}`wordcount-minutes` min read" 15 | :class-container: sd-p-0 sd-outline-muted sd-rounded-3 sd-font-weight-light 16 | ``` 17 | 18 | The variables in MDIO data model can represent different types of chunk grids. 
19 | These grids are essential for managing multi-dimensional data arrays efficiently. 20 | In this breakdown, we will explore four distinct data models within the MDIO schema, 21 | each serving a specific purpose in data handling and organization. 22 | 23 | MDIO implements data models following the guidelines of the Zarr v3 spec and ZEPs: 24 | 25 | - [Zarr core specification (version 3)](https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html) 26 | - [ZEP 1 — Zarr specification version 3](https://zarr.dev/zeps/accepted/ZEP0001.html) 27 | - [ZEP 3 — Variable chunking](https://zarr.dev/zeps/draft/ZEP0003.html) 28 | 29 | ## Regular Grid 30 | 31 | The regular grid models are designed to represent a rectangular and regularly 32 | spaced chunk grid. 33 | 34 | ```{eval-rst} 35 | .. autosummary:: 36 | RegularChunkGrid 37 | RegularChunkShape 38 | ``` 39 | 40 | For a 1D array with `size = 31`{l=python}, we can divide it into 5 equally sized 41 | chunks. Note that the last chunk will be truncated to match the size of the array. 42 | 43 | `{ "name": "regular", "configuration": { "chunkShape": [7] } }`{l=json} 44 | 45 | Using the above schema, the resulting array chunks will look like this: 46 | 47 | ```bash 48 | ←─ 7 ─→ ←─ 7 ─→ ←─ 7 ─→ ←─ 7 ─→ ↔ 3 49 | ┌───────┬───────┬───────┬───────┬───┐ 50 | └───────┴───────┴───────┴───────┴───┘ 51 | ``` 52 | 53 | For a 2D array with shape `rows, cols = (7, 17)`{l=python}, we can divide it into 9 54 | chunks. As in the 1D case, the trailing chunks are truncated to fit the array. 55 | 56 | `{ "name": "regular", "configuration": { "chunkShape": [3, 7] } }`{l=json} 57 | 58 | Using the above schema, the resulting 2D array chunks will look like below. 59 | Note that the rows and columns are conceptual and visually not to scale. 60 | 61 | ```bash 62 | ←─ 7 ─→ ←─ 7 ─→ ↔ 3 63 | ┌───────┬───────┬───┐ 64 | │ ╎ ╎ │ ↑ 65 | │ ╎ ╎ │ 3 66 | │ ╎ ╎ │ ↓ 67 | ├╶╶╶╶╶╶╶┼╶╶╶╶╶╶╶┼╶╶╶┤ 68 | │ ╎ ╎ │ ↑ 69 | │ ╎ ╎ │ 3 70 | │ ╎ ╎ │ ↓ 71 | ├╶╶╶╶╶╶╶┼╶╶╶╶╶╶╶┼╶╶╶┤ 72 | │ ╎ ╎ │ ↕ 1 73 | └───────┴───────┴───┘ 74 | ``` 75 | 76 | ## Rectilinear Grid 77 | 78 | The [RectilinearChunkGrid](RectilinearChunkGrid) model extends 79 | the concept of chunk grids to accommodate rectangular and irregularly spaced chunks. 80 | This model is useful in data structures where non-uniform chunk sizes are necessary. 81 | [RectilinearChunkShape](RectilinearChunkShape) specifies the chunk sizes for each 82 | dimension as a list, allowing for irregular intervals. 83 | 84 | ```{eval-rst} 85 | .. autosummary:: 86 | RectilinearChunkGrid 87 | RectilinearChunkShape 88 | ``` 89 | 90 | :::{note} 91 | It's important to ensure that the sum of the irregular spacings specified 92 | in the `chunkShape` matches the size of the respective array dimension. 93 | ::: 94 | 95 | For a 1D array with `size = 39`{l=python}, we can divide it into 5 irregularly sized 96 | chunks. 97 | 98 | `{ "name": "rectilinear", "configuration": { "chunkShape": [[10, 7, 5, 7, 10]] } }`{l=json} 99 | 100 | Using the above schema, the resulting array chunks will look like this: 101 | 102 | ```bash 103 | ←── 10 ──→ ←─ 7 ─→ ← 5 → ←─ 7 ─→ ←── 10 ──→ 104 | ┌──────────┬───────┬─────┬───────┬──────────┐ 105 | └──────────┴───────┴─────┴───────┴──────────┘ 106 | ``` 107 | 108 | For a 2D array with shape `rows, cols = (7, 25)`{l=python}, we can divide it into 12 109 | rectilinear (rectangular but irregular) chunks. Note that the rows and columns are 110 | conceptual and visually not to scale. 
111 | 112 | `{ "name": "rectilinear", "configuration": { "chunkShape": [[3, 1, 3], [10, 5, 7, 3]] } }`{l=json} 113 | 114 | ```bash 115 | ←── 10 ──→ ← 5 → ←─ 7 ─→ ↔ 3 116 | ┌──────────┬─────┬───────┬───┐ 117 | │ ╎ ╎ ╎ │ ↑ 118 | │ ╎ ╎ ╎ │ 3 119 | │ ╎ ╎ ╎ │ ↓ 120 | ├╶╶╶╶╶╶╶╶╶╶┼╶╶╶╶╶┼╶╶╶╶╶╶╶┼╶╶╶┤ 121 | │ ╎ ╎ ╎ │ ↕ 1 122 | ├╶╶╶╶╶╶╶╶╶╶┼╶╶╶╶╶┼╶╶╶╶╶╶╶┼╶╶╶┤ 123 | │ ╎ ╎ ╎ │ ↑ 124 | │ ╎ ╎ ╎ │ 3 125 | │ ╎ ╎ ╎ │ ↓ 126 | └──────────┴─────┴───────┴───┘ 127 | ``` 128 | 129 | ## Model Reference 130 | 131 | :::{dropdown} RegularChunkGrid 132 | :animate: fade-in-slide-down 133 | 134 | ```{eval-rst} 135 | .. autopydantic_model:: RegularChunkGrid 136 | 137 | ---------- 138 | 139 | .. autopydantic_model:: RegularChunkShape 140 | ``` 141 | 142 | ::: 143 | :::{dropdown} RectilinearChunkGrid 144 | :animate: fade-in-slide-down 145 | 146 | ```{eval-rst} 147 | .. autopydantic_model:: RectilinearChunkGrid 148 | 149 | ---------- 150 | 151 | .. autopydantic_model:: RectilinearChunkShape 152 | ``` 153 | 154 | ::: 155 | -------------------------------------------------------------------------------- /tests/unit/test_segy_grid_overrides.py: -------------------------------------------------------------------------------- 1 | """Check grid overrides.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import Any 6 | 7 | import numpy as np 8 | import numpy.typing as npt 9 | import pytest 10 | from numpy import arange 11 | from numpy import column_stack 12 | from numpy import meshgrid 13 | from numpy import unique 14 | from numpy.testing import assert_array_equal 15 | 16 | from mdio.core import Dimension 17 | from mdio.segy.exceptions import GridOverrideUnknownError 18 | from mdio.segy.geometry import GridOverrider 19 | 20 | SHOTS = arange(100, 104, dtype="int32") 21 | CABLES = arange(11, 15, dtype="int32") 22 | RECEIVERS = arange(1, 6, dtype="int32") 23 | 24 | 25 | def run_override( 26 | grid_overrides: dict[str, Any], 27 | index_names: tuple[str, ...], 28 | headers: npt.NDArray, 29 | chunksize: tuple[int, ...] 
| None = None, 30 | ) -> tuple[dict[str, Any], tuple[str], tuple[int]]: 31 | """Initialize and run overrider.""" 32 | overrider = GridOverrider() 33 | return overrider.run(headers, index_names, grid_overrides, chunksize) 34 | 35 | 36 | def get_dims(headers: npt.NDArray) -> list[Dimension]: 37 | """Get list of Dimensions from headers.""" 38 | dims = [] 39 | for index_name in headers.dtype.names: 40 | index_coords = headers[index_name] 41 | dim_unique = unique(index_coords) 42 | dims.append(Dimension(coords=dim_unique, name=index_name)) 43 | 44 | return dims 45 | 46 | 47 | @pytest.fixture 48 | def mock_streamer_headers() -> npt.NDArray: 49 | """Generate dictionary of mocked streamer index headers.""" 50 | grids = meshgrid(SHOTS, CABLES, RECEIVERS, indexing="ij") 51 | permutations = column_stack([grid.ravel() for grid in grids]) 52 | 53 | # Make channel from receiver ids 54 | for shot in SHOTS: 55 | shot_mask = permutations[:, 0] == shot 56 | permutations[shot_mask, -1] = arange(1, len(CABLES) * len(RECEIVERS) + 1) 57 | 58 | hdr_dtype = np.dtype( 59 | { 60 | "names": ["shot_point", "cable", "channel"], 61 | "formats": ["int32", "int32", "int32"], 62 | } 63 | ) 64 | 65 | n_traces = permutations.shape[0] 66 | result = np.ndarray(dtype=hdr_dtype, shape=n_traces) 67 | 68 | result["shot_point"] = permutations[:, 0] 69 | result["cable"] = permutations[:, 1] 70 | result["channel"] = permutations[:, 2] 71 | 72 | return result 73 | 74 | 75 | class TestAutoGridOverrides: 76 | """Check grid overrides works with auto indexing.""" 77 | 78 | def test_duplicates(self, mock_streamer_headers: dict[str, npt.NDArray]) -> None: 79 | """Test the HasDuplicates Grid Override command.""" 80 | index_names = ("shot_point", "cable") 81 | grid_overrides = {"HasDuplicates": True} 82 | 83 | # Remove channel header 84 | streamer_headers = mock_streamer_headers[list(index_names)] 85 | chunksize = (4, 4, 8) 86 | 87 | new_headers, new_names, new_chunks = run_override( 88 | grid_overrides, 89 | index_names, 90 | streamer_headers, 91 | chunksize, 92 | ) 93 | 94 | assert new_names == ("shot_point", "cable", "trace") 95 | assert new_chunks == (4, 4, 1, 8) 96 | 97 | dims = get_dims(new_headers) 98 | 99 | assert_array_equal(dims[0].coords, SHOTS) 100 | assert_array_equal(dims[1].coords, CABLES) 101 | assert_array_equal(dims[2].coords, RECEIVERS) 102 | 103 | def test_non_binned(self, mock_streamer_headers: dict[str, npt.NDArray]) -> None: 104 | """Test the NonBinned Grid Override command.""" 105 | index_names = ("shot_point", "cable") 106 | grid_overrides = {"NonBinned": True, "chunksize": 4, "non_binned_dims": ["channel"]} 107 | 108 | # Keep channel header for non-binned processing 109 | streamer_headers = mock_streamer_headers 110 | chunksize = (4, 4, 8) 111 | 112 | new_headers, new_names, new_chunks = run_override( 113 | grid_overrides, 114 | index_names, 115 | streamer_headers, 116 | chunksize, 117 | ) 118 | 119 | assert new_names == ("shot_point", "cable", "trace") 120 | assert new_chunks == (4, 4, 4, 8) 121 | 122 | dims = get_dims(new_headers) 123 | 124 | assert_array_equal(dims[0].coords, SHOTS) 125 | assert_array_equal(dims[1].coords, CABLES) 126 | # Trace coords are the unique channel values (1-20) 127 | expected_trace_coords = np.arange(1, 21, dtype="int32") 128 | assert_array_equal(dims[2].coords, expected_trace_coords) 129 | 130 | 131 | class TestStreamerGridOverrides: 132 | """Check grid overrides for shot data with streamer acquisition.""" 133 | 134 | def test_unknown_override( 135 | self, 136 | mock_streamer_headers: 
dict[str, npt.NDArray], 137 | ) -> None: 138 | """Test exception if user provides a command that's not allowed.""" 139 | index_names = ("shot_point", "cable", "channel") 140 | chunksize = None 141 | overrider = GridOverrider() 142 | with pytest.raises(GridOverrideUnknownError): 143 | overrider.run(mock_streamer_headers, index_names, {"WrongCommand": True}, chunksize) 144 | -------------------------------------------------------------------------------- /tests/unit/test_indexing.py: -------------------------------------------------------------------------------- 1 | """Unit tests for the indexing module.""" 2 | 3 | import numpy as np 4 | from xarray import DataArray as xr_DataArray 5 | from xarray import Dataset as xr_Dataset 6 | 7 | from mdio.core.indexing import ChunkIterator 8 | 9 | 10 | def test_chunk_iterator_returning_dict() -> None: 11 | """Test the ChunkIterator class.""" 12 | dims = ["inline", "crossline", "depth"] 13 | chunks = (3, 4, 5) 14 | 15 | shape = (6, 12, 20) 16 | iter1 = ChunkIterator(shape=shape, chunks=chunks, dim_names=dims) 17 | assert iter1.arr_shape == shape 18 | assert iter1.dims == dims 19 | assert iter1.len_chunks == chunks 20 | assert iter1.dim_chunks == (2, 3, 4) 21 | assert iter1.num_chunks == 24 22 | 23 | shape = (5, 11, 19) 24 | iter2 = ChunkIterator(shape=shape, chunks=chunks, dim_names=dims) 25 | assert iter2.dim_chunks == (2, 3, 4) 26 | assert iter2.num_chunks == 24 27 | 28 | # This loop confirms that the last slice is adjusted to fit the data exactly 29 | # when the array size doesn't align perfectly with chunk boundaries. 30 | for _ in range(13): # element index 12 31 | region = iter1.__next__() 32 | assert region == { 33 | "inline": slice(3, 6, None), 34 | "crossline": slice(0, 4, None), 35 | "depth": slice(0, 5, None), 36 | } 37 | 38 | for _ in range(13): # element index 12 39 | region = iter2.__next__() 40 | assert region == { 41 | "inline": slice(3, 5, None), 42 | "crossline": slice(0, 4, None), 43 | "depth": slice(0, 5, None), 44 | } 45 | 46 | 47 | def test_chunk_iterator_returning_tuple() -> None: 48 | """Test the ChunkIterator class.""" 49 | chunks = (3, 4, 5) 50 | 51 | shape = (6, 12, 20) 52 | iter1 = ChunkIterator(shape=shape, chunks=chunks) 53 | assert iter1.arr_shape == shape 54 | assert iter1.dims is None 55 | assert iter1.len_chunks == chunks 56 | assert iter1.dim_chunks == (2, 3, 4) 57 | assert iter1.num_chunks == 24 58 | 59 | shape = (5, 11, 19) 60 | iter2 = ChunkIterator(shape=shape, chunks=chunks) 61 | assert iter2.dim_chunks == (2, 3, 4) 62 | assert iter2.num_chunks == 24 63 | 64 | # This loop confirms that the last slice is adjusted to fit the data exactly 65 | # when the array size doesn't align perfectly with chunk boundaries. 66 | for _ in range(13): # element index 12 67 | region = iter1.__next__() 68 | assert region == (slice(3, 6, None), slice(0, 4, None), slice(0, 5, None)) 69 | 70 | for _ in range(13): # element index 12 71 | region = iter2.__next__() 72 | assert region == (slice(3, 5, None), slice(0, 4, None), slice(0, 5, None)) 73 | 74 | 75 | def val(shape: tuple[int, int, int], i: int, j: int, k: int) -> int: 76 | """Calculate the linear index in a 3D array.""" 77 | return i * (shape[1] * shape[2]) + j * shape[2] + k 78 | 79 | 80 | def mock_trace_worker( 81 | shape: tuple[int, int, int], region: dict[str, slice], dataset: xr_Dataset, grid_map: np.ndarray 82 | ) -> None: 83 | """Mock trace worker function. 
84 | 85 | Note: 86 | Xarray, Zarr, and NumPy automatically truncates the slice to the valid bounds of the array 87 | (see the test above, where the last chunk is always of the same size) 88 | and does not raise an error. However, if one attempts to access an element at an index 89 | that is out of bounds, you will get an IndexError 90 | """ 91 | # We used a 2D selection with 2D index_slices 92 | assert grid_map.shape == (3, 4, 20) 93 | # We used a 3D selection with isel() 94 | assert tuple(dataset.sizes[d] for d in region) == (3, 4, 5) 95 | 96 | dimension_names = list(dataset.sizes) 97 | 98 | slice0 = region[dimension_names[0]] 99 | slice1 = region[dimension_names[1]] 100 | slice2 = region[dimension_names[2]] 101 | for ii, i in enumerate(range(slice0.start, min(slice0.stop, shape[0]))): 102 | for jj, j in enumerate(range(slice1.start, min(slice1.stop, shape[1]))): 103 | for kk, k in enumerate(range(slice2.start, min(slice2.stop, shape[2]))): 104 | # Validate that we've got the sample indexing right 105 | assert dataset["amplitude"].values[ii, jj, kk] == val(shape, i, j, k) 106 | # NOTE: grid_map is 2D, so we need to use k for the depth dimension 107 | assert dataset["amplitude"].values[ii, jj, kk] == grid_map[ii, jj, k] 108 | 109 | 110 | def test_chunk_iterator_with_dataset() -> None: 111 | """Test the ChunkIterator with a dataset.""" 112 | shape = (6, 12, 20) 113 | dims = ["inline", "crossline", "depth"] 114 | chunks = (3, 4, 5) 115 | 116 | data3 = np.arange(shape[0] * shape[1] * shape[2]).reshape(shape) 117 | amplitude = xr_DataArray(data3, dims=dims, name="amplitude") 118 | ds = xr_Dataset({"amplitude": amplitude}) 119 | 120 | chunk_iter = ChunkIterator(shape, chunks, dims) 121 | for region in chunk_iter: 122 | # If one needs both a dict and a tuple of slices, 123 | # one can use the following line an example to strip dim names out 124 | index_slices = tuple(region[key] for key in dims[:-1]) 125 | # The .isel() method takes keyword arguments, region, where each keyword corresponds 126 | # to a dimension name and the value is an integer, a slice object (our case), 127 | # or an array-like object 128 | mock_trace_worker(shape, region, ds.isel(region), amplitude[index_slices]) 129 | -------------------------------------------------------------------------------- /tests/unit/v1/templates/test_seismic_2d_poststack.py: -------------------------------------------------------------------------------- 1 | """Unit tests for Seismic2DPostStackTemplate.""" 2 | 3 | import pytest 4 | from tests.unit.v1.helpers import validate_variable 5 | 6 | from mdio.builder.schemas.chunk_grid import RegularChunkGrid 7 | from mdio.builder.schemas.dtype import ScalarType 8 | from mdio.builder.schemas.dtype import StructuredType 9 | from mdio.builder.schemas.v1.dataset import Dataset 10 | from mdio.builder.schemas.v1.units import LengthUnitEnum 11 | from mdio.builder.schemas.v1.units import LengthUnitModel 12 | from mdio.builder.schemas.v1.units import TimeUnitEnum 13 | from mdio.builder.schemas.v1.units import TimeUnitModel 14 | from mdio.builder.templates.seismic_2d_poststack import Seismic2DPostStackTemplate 15 | from mdio.builder.templates.types import SeismicDataDomain 16 | 17 | UNITS_METER = LengthUnitModel(length=LengthUnitEnum.METER) 18 | UNITS_SECOND = TimeUnitModel(time=TimeUnitEnum.SECOND) 19 | 20 | 21 | def _validate_coordinates_headers_trace_mask(dataset: Dataset, headers: StructuredType, domain: str) -> None: 22 | """Validate the coordinate, headers, trace_mask variables in the dataset.""" 23 | # Verify 
variables 24 | # 2 dim coords + 2 non-dim coords + 1 data + 1 trace mask + 1 headers = 7 variables 25 | assert len(dataset.variables) == 7 26 | 27 | # Verify trace headers 28 | validate_variable(dataset, name="headers", dims=[("cdp", 2048)], coords=["cdp_x", "cdp_y"], dtype=headers) 29 | 30 | validate_variable( 31 | dataset, 32 | name="trace_mask", 33 | dims=[("cdp", 2048)], 34 | coords=["cdp_x", "cdp_y"], 35 | dtype=ScalarType.BOOL, 36 | ) 37 | 38 | # Verify dimension coordinate variables 39 | validate_variable( 40 | dataset, 41 | name="cdp", 42 | dims=[("cdp", 2048)], 43 | coords=["cdp"], 44 | dtype=ScalarType.INT32, 45 | ) 46 | 47 | domain = validate_variable( 48 | dataset, 49 | name=domain, 50 | dims=[(domain, 4096)], 51 | coords=[domain], 52 | dtype=ScalarType.INT32, 53 | ) 54 | assert domain.metadata.units_v1 in (UNITS_METER, UNITS_SECOND) 55 | 56 | # Verify non-dimension coordinate variables 57 | cdp_x = validate_variable( 58 | dataset, 59 | name="cdp_x", 60 | dims=[("cdp", 2048)], 61 | coords=["cdp_x"], 62 | dtype=ScalarType.FLOAT64, 63 | ) 64 | assert cdp_x.metadata.units_v1 == UNITS_METER 65 | 66 | cdp_y = validate_variable( 67 | dataset, 68 | name="cdp_y", 69 | dims=[("cdp", 2048)], 70 | coords=["cdp_y"], 71 | dtype=ScalarType.FLOAT64, 72 | ) 73 | assert cdp_y.metadata.units_v1 == UNITS_METER 74 | 75 | 76 | @pytest.mark.parametrize("data_domain", ["time", "depth"]) 77 | class TestSeismic2DPostStackTemplate: 78 | """Unit tests for Seismic2DPostStackTemplate.""" 79 | 80 | def test_configuration(self, data_domain: SeismicDataDomain) -> None: 81 | """Test configuration of Seismic2DPostStackTemplate.""" 82 | t = Seismic2DPostStackTemplate(data_domain=data_domain) 83 | 84 | # Template attributes 85 | assert t._data_domain == data_domain 86 | assert t._dim_names == ("cdp", data_domain) 87 | assert t._physical_coord_names == ("cdp_x", "cdp_y") 88 | assert t.full_chunk_shape == (1024, 1024) 89 | 90 | # Variables instantiated when build_dataset() is called 91 | assert t._builder is None 92 | assert t._dim_sizes == () 93 | 94 | # Verify dataset attributes 95 | attrs = t._load_dataset_attributes() 96 | assert attrs == {"surveyType": "2D", "gatherType": "stacked"} 97 | 98 | assert t.default_variable_name == "amplitude" 99 | 100 | def test_build_dataset_time(self, data_domain: SeismicDataDomain, structured_headers: StructuredType) -> None: 101 | """Test building a complete 2D time dataset.""" 102 | t = Seismic2DPostStackTemplate(data_domain=data_domain) 103 | t.add_units({"cdp_x": UNITS_METER, "cdp_y": UNITS_METER}) # spatial domain units 104 | t.add_units({"time": UNITS_SECOND, "depth": UNITS_METER}) # data domain units 105 | 106 | dataset = t.build_dataset("Seismic 2D Time Line 001", sizes=(2048, 4096), header_dtype=structured_headers) 107 | 108 | # Verify dataset metadata 109 | assert dataset.metadata.name == "Seismic 2D Time Line 001" 110 | assert dataset.metadata.attributes["surveyType"] == "2D" 111 | assert dataset.metadata.attributes["gatherType"] == "stacked" 112 | 113 | _validate_coordinates_headers_trace_mask(dataset, structured_headers, data_domain) 114 | 115 | # Verify seismic variable 116 | v = validate_variable( 117 | dataset, 118 | name="amplitude", 119 | dims=[("cdp", 2048), (data_domain, 4096)], 120 | coords=["cdp_x", "cdp_y"], 121 | dtype=ScalarType.FLOAT32, 122 | ) 123 | assert isinstance(v.metadata.chunk_grid, RegularChunkGrid) 124 | assert v.metadata.chunk_grid.configuration.chunk_shape == (1024, 1024) 125 | assert v.metadata.stats_v1 is None 126 | 127 | 128 | 
@pytest.mark.parametrize("data_domain", ["Time", "DePTh"]) 129 | def test_domain_case_handling(data_domain: str) -> None: 130 | """Test that domain parameter handles different cases correctly.""" 131 | template = Seismic2DPostStackTemplate(data_domain=data_domain) 132 | assert template._data_domain == data_domain.lower() 133 | 134 | data_domain_suffix = data_domain.lower().capitalize() 135 | assert template.name == f"PostStack2D{data_domain_suffix}" 136 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "multidimio" 3 | version = "1.1.1" 4 | description = "Cloud-native, scalable, and user-friendly multi dimensional energy data!" 5 | authors = [{ name = "Altay Sansal", email = "altay.sansal@tgs.com" }] 6 | requires-python = ">=3.11,<3.14" 7 | readme = "README.md" 8 | license = "Apache-2.0" 9 | license-files = ["LICEN[CS]E*"] 10 | keywords = ["mdio", "multidimio", "seismic", "wind", "data"] 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "Programming Language :: Python :: 3.11", 14 | "Programming Language :: Python :: 3.12", 15 | "Programming Language :: Python :: 3.13", 16 | "License :: OSI Approved :: Apache Software License", 17 | "Development Status :: 4 - Beta", 18 | ] 19 | 20 | dependencies = [ 21 | "click>=8.3.0", 22 | "click-params>=0.5.0", 23 | "dask>=2025.9.1", 24 | "fsspec>=2025.9.0", 25 | "pint>=0.25.0", 26 | "psutil>=7.1.0", 27 | "pydantic>=2.12.0", 28 | "pydantic-settings>=2.6.1", 29 | "rich>=14.1.0", 30 | "segy>=0.5.3", 31 | "tqdm>=4.67.1", 32 | "universal-pathlib>=0.3.3", 33 | "xarray>=2025.10.1", 34 | "zarr>=3.1.3", 35 | ] 36 | 37 | [project.optional-dependencies] 38 | cloud = ["s3fs>=2025.9.0", "gcsfs>=2025.9.0", "adlfs>=2025.8.0"] 39 | distributed = ["distributed>=2025.9.1", "bokeh>=3.8.0"] 40 | lossy = ["zfpy>=1.0.1"] 41 | 42 | [project.urls] 43 | homepage = "https://mdio.dev/" 44 | repository = "https://github.com/TGSAI/mdio-python" 45 | documentation = "https://mdio-python.readthedocs.io" 46 | 47 | [project.scripts] 48 | mdio = "mdio.__main__:main" 49 | 50 | [dependency-groups] 51 | dev = [ 52 | "ruff>=0.14.0", 53 | "coverage[toml]>=7.10.7", 54 | "mypy>=1.18.2", 55 | "pre-commit>=4.3.0", 56 | "pre-commit-hooks>=6.0.0", 57 | "pytest>=8.4.2", 58 | "pytest-dependency>=0.6.0", 59 | "typeguard>=4.4.4", 60 | "xdoctest[colors]>=1.3.0", 61 | "Pygments>=2.19.2" 62 | ] 63 | 64 | docs = [ 65 | "aiohttp>=3.13.2", 66 | "autodoc-pydantic>=2.2.0", 67 | "furo>=2025.9.25", 68 | "linkify-it-py>=2.0.3", 69 | "matplotlib>=3.10.7", 70 | "myst-nb>=1.3.0", 71 | "sphinx>=8.2.3", 72 | "sphinx-autobuild>=2025.8.25", 73 | "sphinx-click>=6.1.0", 74 | "sphinx-copybutton>=0.5.2", 75 | "sphinx-design>=0.6.1", 76 | "ipywidgets>=8.1.7", 77 | ] 78 | 79 | [tool.uv] 80 | required-version = ">=0.8.17" 81 | 82 | [tool.ruff] 83 | target-version = "py311" 84 | src = ["src"] 85 | line-length = 120 86 | 87 | [tool.ruff.lint] 88 | select = [ 89 | "E", # pycodestyle 90 | "F", # pyflakes 91 | "B", # bugbear 92 | "I", # isort 93 | "UP", # pyupgrade 94 | "N", # pep8-naming 95 | "D", # pydocstyle 96 | "ANN", # annotations 97 | "S", # bandit 98 | "A", # builtins 99 | "C4", # comprehensions 100 | "DTZ", # datetimez 101 | "EM", # errmsg 102 | "ICN", # import-conventions 103 | "PIE", # pie 104 | "PT", # pytest-style 105 | "RSE", # raise 106 | "RET", # return 107 | "SIM", # simplify 108 | "TID", # tidy-imports 109 | "TC", # type-checking 110 | 
"ARG", # unused-arguments 111 | "PTH", # use-pathlib 112 | "TD", # todos 113 | "PL", # pylint 114 | "FLY", # flynt 115 | "NPY", # numpy 116 | "LOG", # logging 117 | "G", # logging-format 118 | "PERF", # perflint 119 | "FA", # flake8-future-annotations 120 | ] 121 | 122 | ignore = [ 123 | "D107", # Missing docstring in __init__ ; should be in class docstring 124 | ] 125 | 126 | [tool.ruff.lint.per-file-ignores] 127 | "tests/*" = ["S101", "PLR2004"] 128 | "tests/integration/test_segy_import_export_masked.py" = ["E501"] 129 | "docs/tutorials/*.ipynb" = ["S101"] 130 | 131 | [tool.ruff.lint.flake8-annotations] 132 | mypy-init-return = true 133 | 134 | [tool.ruff.lint.pydocstyle] 135 | convention = "google" 136 | 137 | [tool.ruff.lint.isort] 138 | force-single-line = true 139 | 140 | [tool.ruff.lint.pycodestyle] 141 | max-line-length = 120 142 | ignore-overlong-task-comments = true 143 | 144 | [tool.pydoclint] 145 | style = "google" 146 | arg-type-hints-in-docstring = false 147 | check-return-types = false 148 | check-yield-types = false 149 | 150 | [tool.coverage.paths] 151 | source = ["src", "*/site-packages"] 152 | tests = ["tests", "*/tests"] 153 | 154 | [tool.coverage.run] 155 | branch = true 156 | source = ["src/mdio", "tests"] 157 | relative_files = true 158 | 159 | [tool.coverage.report] 160 | show_missing = true 161 | fail_under = 85 162 | exclude_also = [ 163 | "if __name__ == __main__:", 164 | "if TYPE_CHECKING:", 165 | "raise NotImplementedError", 166 | ] 167 | 168 | [tool.mypy] 169 | strict = true 170 | warn_unreachable = true 171 | warn_redundant_casts = true 172 | warn_unused_ignores = true 173 | pretty = true 174 | show_column_numbers = true 175 | show_error_codes = true 176 | show_error_context = true 177 | disallow_untyped_defs = true # for strict mypy: (this is the tricky one) 178 | plugins = ["pydantic.mypy", "numpy.typing.mypy_plugin"] 179 | 180 | [tool.pydantic-mypy] 181 | init_forbid_extra = true 182 | init_typed = true 183 | warn_required_dynamic_aliases = true 184 | 185 | [tool.bumpversion] 186 | current_version = "1.1.1" 187 | allow_dirty = true 188 | commit = false 189 | tag = false 190 | parse = "(?P\\d+)\\.(?P\\d+)\\.(?P\\d+)(\\.dev(?P\\d+))?" 191 | serialize = [ 192 | "{major}.{minor}.{patch}.dev{dev}", # For dev releases 193 | "{major}.{minor}.{patch}", # For stable releases 194 | ] 195 | 196 | [tool.uv.build-backend] 197 | module-name = "mdio" 198 | 199 | [build-system] 200 | requires = ["uv_build>=0.8.17,<0.9.0"] 201 | build-backend = "uv_build" 202 | -------------------------------------------------------------------------------- /src/mdio/commands/info.py: -------------------------------------------------------------------------------- 1 | """MDIO Dataset information command.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import TYPE_CHECKING 6 | from typing import Any 7 | 8 | from click import STRING 9 | from click import Choice 10 | from click import argument 11 | from click import command 12 | from click import option 13 | 14 | if TYPE_CHECKING: 15 | from mdio import MDIOReader 16 | from mdio.core import Grid 17 | 18 | 19 | @command(name="info") 20 | @argument("mdio-path", type=STRING) 21 | @option( 22 | "-access", 23 | "--access-pattern", 24 | required=False, 25 | default="012", 26 | help="Access pattern of the file", 27 | type=STRING, 28 | show_default=True, 29 | ) 30 | @option( 31 | "-format", 32 | "--output-format", 33 | required=False, 34 | default="pretty", 35 | help="Output format. 
Pretty console or JSON.", 36 | type=Choice(["pretty", "json"]), 37 | show_default=True, 38 | show_choices=True, 39 | ) 40 | def info(mdio_path: str, output_format: str, access_pattern: str) -> None: 41 | """Provide information on a MDIO dataset. 42 | 43 | By default, this returns human-readable information about the grid and stats for the dataset. 44 | If output-format is set to 'json' then a JSON is returned to facilitate parsing. 45 | """ 46 | # Lazy import to reduce CLI startup time 47 | from mdio import MDIOReader # noqa: PLC0415 48 | 49 | reader = MDIOReader(mdio_path, access_pattern=access_pattern, return_metadata=True) 50 | 51 | grid_dict = parse_grid(reader.grid) 52 | stats_dict = cast_stats(reader.stats) 53 | access_pattern_dict = parse_access_patterns(reader) 54 | 55 | mdio_info = { 56 | "path": mdio_path, 57 | "stats": stats_dict, 58 | "grid": grid_dict, 59 | "access_patterns": access_pattern_dict, 60 | } 61 | 62 | if output_format == "pretty": 63 | pretty_print(mdio_info) 64 | 65 | if output_format == "json": 66 | json_print(mdio_info) 67 | 68 | 69 | def cast_stats(stats_dict: dict[str, Any]) -> dict[str, float]: 70 | """Normalize all floats to JSON serializable floats.""" 71 | return {k: float(v) for k, v in stats_dict.items()} 72 | 73 | 74 | def parse_grid(grid: Grid) -> dict[str, dict[str, int | str]]: 75 | """Extract grid information per dimension.""" 76 | grid_dict = {} 77 | for dim_name in grid.dim_names: 78 | dim = grid.select_dim(dim_name) 79 | min_ = str(dim.coords[0]) 80 | max_ = str(dim.coords[-1]) 81 | size = str(dim.coords.shape[0]) 82 | grid_dict[dim_name] = {"name": dim_name, "min": min_, "max": max_, "size": size} 83 | return grid_dict 84 | 85 | 86 | def parse_access_patterns(reader: MDIOReader) -> dict[str, Any]: 87 | """Extract access patterns and their info.""" 88 | access_pattern_dict = {} 89 | for name, array in reader._data_group.arrays(): 90 | pattern = name.replace("chunked_", "") 91 | chunks = str(array.chunks) 92 | format_ = str(array.dtype) 93 | compressors = str(array.compressors) 94 | access_pattern_dict[pattern] = { 95 | "chunks": chunks, 96 | "format": format_, 97 | "compressor(s)": compressors, 98 | } 99 | 100 | return access_pattern_dict 101 | 102 | 103 | def json_print(mdio_info: dict[str, Any]) -> None: 104 | """Convert MDIO Info to JSON and pretty print.""" 105 | # Lazy import to reduce CLI startup time 106 | from json import dumps as json_dumps # noqa: PLC0415 107 | 108 | from rich import print # noqa: A004, PLC0415 109 | 110 | print(json_dumps(mdio_info, indent=2)) 111 | 112 | 113 | def pretty_print(mdio_info: dict[str, Any]) -> None: 114 | """Print pretty MDIO Info table to console.""" 115 | # Lazy import to reduce CLI startup time 116 | from rich.console import Console # noqa: PLC0415 117 | from rich.table import Table # noqa: PLC0415 118 | 119 | console = Console() 120 | 121 | grid_table = Table(show_edge=False) 122 | grid_table.add_column("Dimension", justify="right", style="cyan", no_wrap=True) 123 | grid_table.add_column("Min", justify="left", style="magenta") 124 | grid_table.add_column("Max", justify="left", style="magenta") 125 | grid_table.add_column("Size", justify="left", style="green") 126 | 127 | for axis_dict in mdio_info["grid"].values(): 128 | name, min_, max_, size = axis_dict.values() 129 | grid_table.add_row(name, min_, max_, size) 130 | 131 | stat_table = Table(show_edge=False) 132 | stat_table.add_column("Stat", justify="right", style="cyan", no_wrap=True) 133 | stat_table.add_column("Value", justify="left", 
style="magenta") 134 | 135 | for stat, value in mdio_info["stats"].items(): 136 | stat_table.add_row(stat, f"{value:.4f}") 137 | 138 | access_patter_table = Table(show_edge=False) 139 | access_patter_table.add_column("Pattern", justify="right", style="cyan", no_wrap=True) 140 | access_patter_table.add_column("Chunks", justify="left", style="magenta") 141 | access_patter_table.add_column("Format", justify="left", style="magenta") 142 | access_patter_table.add_column("Compressor", justify="left", style="magenta") 143 | 144 | for name, pattern_info in mdio_info["access_patterns"].items(): 145 | chunks, format_, compressor = pattern_info.values() 146 | access_patter_table.add_row(name, chunks, format_, compressor) 147 | 148 | master_table = Table(title=f"File Information for {mdio_info['path']}") 149 | master_table.add_column("Grid", justify="center") 150 | master_table.add_column("Statistics", justify="center") 151 | master_table.add_column("Access Patterns", justify="center") 152 | master_table.add_row(grid_table, stat_table, access_patter_table) 153 | 154 | console.print(master_table) 155 | 156 | 157 | cli = info 158 | -------------------------------------------------------------------------------- /src/mdio/core/grid.py: -------------------------------------------------------------------------------- 1 | """Grid abstraction with serializers.""" 2 | 3 | from __future__ import annotations 4 | 5 | from dataclasses import dataclass 6 | from typing import TYPE_CHECKING 7 | 8 | import numpy as np 9 | import zarr 10 | from numcodecs.zarr3 import Blosc 11 | from zarr.codecs import BloscCodec 12 | 13 | from mdio.constants import UINT32_MAX 14 | from mdio.constants import ZarrFormat 15 | from mdio.core.utils_write import get_constrained_chunksize 16 | 17 | if TYPE_CHECKING: 18 | from segy.arrays import HeaderArray 19 | from zarr import Array as ZarrArray 20 | 21 | from mdio.core import Dimension 22 | 23 | 24 | @dataclass 25 | class Grid: 26 | """N-dimensional grid class for managing bounds and increments. 27 | 28 | This class encapsulates an N-dimensional grid, storing dimension information and optional 29 | mapping and live mask arrays for trace indexing. It provides access to dimension names, shape, 30 | and number of dimensions as computed attributes. 31 | 32 | Args: 33 | dims: List of Dimension instances defining the grid axes. 34 | map: Optional Zarr array for trace mapping. Defaults to None. 35 | live_mask: Optional Zarr array indicating live traces. Defaults to None. 36 | 37 | Attributes: 38 | dims: List of Dimension instances defining the grid axes. 39 | map: Optional Zarr array for trace mapping, or None if not set. 40 | live_mask: Optional Zarr array indicating live traces, or None if not set. 41 | 42 | Notes: 43 | Computed attributes available after initialization: 44 | - `dim_names`: Tuple of dimension names. 45 | - `shape`: Tuple of dimension sizes. 46 | - `ndim`: Number of dimensions. 
47 | 48 | Example: 49 | >>> from mdio.core import Dimension 50 | >>> dims = [Dimension(name="x", min=0, max=100, step=10)] 51 | >>> grid = Grid(dims) 52 | >>> grid.dim_names 53 | ('x',) 54 | >>> grid.shape 55 | (11,) 56 | """ 57 | 58 | dims: list[Dimension] 59 | map: ZarrArray | None = None 60 | live_mask: ZarrArray | None = None 61 | 62 | _TARGET_MEMORY_PER_BATCH = 1 * 1024**3 # 1GB target for batch processing 63 | _INTERNAL_CHUNK_SIZE_TARGET = 10 * 1024**2 # 10MB target for chunks 64 | 65 | def __post_init__(self) -> None: 66 | """Initialize derived attributes.""" 67 | self.dim_names = tuple(dim.name for dim in self.dims) 68 | self.shape = tuple(dim.size for dim in self.dims) 69 | self.ndim = len(self.dims) 70 | 71 | def __getitem__(self, item: int) -> Dimension: 72 | """Get a dimension by index.""" 73 | return self.dims[item] 74 | 75 | def __setitem__(self, key: int, value: Dimension) -> None: 76 | """Set a dimension by index.""" 77 | self.dims[key] = value 78 | 79 | def select_dim(self, name: str) -> Dimension: 80 | """Get a dimension by name.""" 81 | if name not in self.dim_names: 82 | msg = f"Invalid dimension name '{name}'. Available dimensions: {self.dim_names}." 83 | raise ValueError(msg) 84 | index = self.dim_names.index(name) 85 | return self.dims[index] 86 | 87 | def get_min(self, name: str) -> float: 88 | """Get minimum value of a dimension by name.""" 89 | return self.select_dim(name).min().item() 90 | 91 | def get_max(self, name: str) -> float: 92 | """Get maximum value of a dimension by name.""" 93 | return self.select_dim(name).max().item() 94 | 95 | def build_map(self, index_headers: HeaderArray) -> None: 96 | """Build trace mapping and live mask from header indices. 97 | 98 | Args: 99 | index_headers: Header array containing dimension indices. 
100 | """ 101 | # Determine data type for map based on grid size 102 | grid_size = np.prod(self.shape[:-1], dtype=np.uint64) 103 | map_dtype = np.uint64 if grid_size > UINT32_MAX else np.uint32 104 | fill_value = np.iinfo(map_dtype).max 105 | 106 | # Initialize Zarr arrays 107 | live_shape = self.shape[:-1] 108 | chunks = get_constrained_chunksize( 109 | shape=live_shape, 110 | dtype=map_dtype, 111 | max_bytes=self._INTERNAL_CHUNK_SIZE_TARGET, 112 | ) 113 | 114 | zarr_format = zarr.config.get("default_zarr_format") 115 | 116 | common_kwargs = {"shape": live_shape, "chunks": chunks, "store": None} 117 | if zarr_format == ZarrFormat.V2: 118 | common_kwargs["compressors"] = Blosc(cname="zstd") 119 | else: 120 | common_kwargs["compressors"] = BloscCodec(cname="zstd") 121 | 122 | self.map = zarr.create_array(fill_value=fill_value, dtype=map_dtype, **common_kwargs) 123 | self.live_mask = zarr.create_array(fill_value=0, dtype=bool, **common_kwargs) 124 | 125 | # Calculate batch size 126 | memory_per_trace_index = index_headers.itemsize 127 | batch_size = max(1, int(self._TARGET_MEMORY_PER_BATCH / memory_per_trace_index)) 128 | total_live_traces = index_headers.size 129 | 130 | # Process headers in batches 131 | for start in range(0, total_live_traces, batch_size): 132 | end = min(start + batch_size, total_live_traces) 133 | live_dim_indices = [] 134 | 135 | # Compute indices for the batch 136 | for dim in self.dims[:-1]: 137 | dim_hdr = index_headers[dim.name][start:end] 138 | indices = np.searchsorted(dim, dim_hdr).astype(np.uint32) 139 | live_dim_indices.append(indices) 140 | live_dim_indices = tuple(live_dim_indices) 141 | 142 | # Assign trace indices 143 | trace_indices = np.arange(start, end, dtype=np.uint64) 144 | 145 | self.map.vindex[live_dim_indices] = trace_indices 146 | self.live_mask.vindex[live_dim_indices] = True 147 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, caste, color, religion, or sexual 10 | identity and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | - Demonstrating empathy and kindness toward other people 21 | - Being respectful of differing opinions, viewpoints, and experiences 22 | - Giving and gracefully accepting constructive feedback 23 | - Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | - Focusing on what is best not just for us as individuals, but for the overall 26 | community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | - The use of sexualized language or imagery, and sexual attention or advances of 31 | any kind 32 | - Trolling, insulting or derogatory comments, and personal or political attacks 33 | - Public or private harassment 34 | - Publishing others' private information, such as a physical or email address, 35 | without their explicit permission 36 | - Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | [opensource@tgs.com](mailto:opensource@tgs.com). 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series of 86 | actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. 
This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or permanent 93 | ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within the 113 | community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.1, available at 119 | [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. 120 | 121 | Community Impact Guidelines were inspired by 122 | [Mozilla's code of conduct enforcement ladder][mozilla coc]. 123 | 124 | For answers to common questions about this code of conduct, see the FAQ at 125 | [https://www.contributor-covenant.org/faq][faq]. Translations are available at 126 | [https://www.contributor-covenant.org/translations][translations]. 127 | 128 | [homepage]: https://www.contributor-covenant.org 129 | [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html 130 | [mozilla coc]: https://github.com/mozilla/diversity 131 | [faq]: https://www.contributor-covenant.org/faq 132 | [translations]: https://www.contributor-covenant.org/translations 133 | -------------------------------------------------------------------------------- /tests/unit/v1/test_dataset_builder_add_coordinate.py: -------------------------------------------------------------------------------- 1 | """Tests the schema v1 dataset_builder.add_coordinate() public API.""" 2 | 3 | import pytest 4 | from zarr.codecs import BloscCname 5 | 6 | from mdio.builder.dataset_builder import MDIODatasetBuilder 7 | from mdio.builder.dataset_builder import _BuilderState 8 | from mdio.builder.schemas.compressors import Blosc 9 | from mdio.builder.schemas.dtype import ScalarType 10 | from mdio.builder.schemas.v1.units import LengthUnitEnum 11 | from mdio.builder.schemas.v1.units import LengthUnitModel 12 | from mdio.builder.schemas.v1.variable import CoordinateMetadata 13 | from mdio.builder.schemas.v1.variable import VariableMetadata 14 | 15 | from .helpers import validate_builder 16 | from .helpers import validate_coordinate 17 | from .helpers import validate_variable 18 | 19 | 20 | def test_add_coordinate() -> None: 21 | """Test adding coordinates. 
Check the state transition and validate required parameters.""" 22 | builder = MDIODatasetBuilder("test_dataset") 23 | assert builder._state == _BuilderState.INITIAL 24 | 25 | msg = "Must add at least one dimension before adding coordinates" 26 | with pytest.raises(ValueError, match=msg): 27 | builder.add_coordinate("cdp", dimensions=("inline", "crossline"), data_type=ScalarType.FLOAT32) 28 | 29 | builder.add_dimension("inline", 100) 30 | builder.add_dimension("crossline", 200) 31 | 32 | # Validate required parameters 33 | bad_name = None 34 | with pytest.raises(ValueError, match="'name' must be a non-empty string"): 35 | builder.add_coordinate(bad_name, dimensions=("speed",), data_type=ScalarType.FLOAT32) 36 | with pytest.raises(ValueError, match="'name' must be a non-empty string"): 37 | builder.add_coordinate("", dimensions=("speed",), data_type=ScalarType.FLOAT32) 38 | with pytest.raises(ValueError, match="'dimensions' must be a non-empty list"): 39 | builder.add_coordinate("cdp_x", dimensions=None, data_type=ScalarType.FLOAT32) 40 | with pytest.raises(ValueError, match="'dimensions' must be a non-empty list"): 41 | builder.add_coordinate("cdp_x", dimensions=(), data_type=ScalarType.FLOAT32) 42 | 43 | # Add a variable using non-existent dimensions 44 | msg = "Pre-existing dimension named 'xline' is not found" 45 | with pytest.raises(ValueError, match=msg): 46 | builder.add_coordinate("bad_cdp-x", dimensions=("inline", "xline"), data_type=ScalarType.FLOAT32) 47 | 48 | # Validate state transition 49 | builder.add_coordinate("cdp_x", dimensions=("inline", "crossline"), data_type=ScalarType.FLOAT32) 50 | validate_builder(builder, _BuilderState.HAS_COORDINATES, n_dims=2, n_coords=1, n_var=1) 51 | validate_variable( 52 | builder, 53 | name="cdp_x", 54 | dims=[("inline", 100), ("crossline", 200)], 55 | coords=["cdp_x"], 56 | dtype=ScalarType.FLOAT32, 57 | ) 58 | 59 | # Adding coordinate with the same name twice 60 | msg = "Adding coordinate with the same name twice is not allowed" 61 | with pytest.raises(ValueError, match=msg): 62 | builder.add_coordinate("cdp_x", dimensions=("inline", "crossline"), data_type=ScalarType.FLOAT32) 63 | 64 | 65 | def test_add_coordinate_with_defaults() -> None: 66 | """Test adding coordinates with default arguments.""" 67 | builder = MDIODatasetBuilder("test_dataset") 68 | builder.add_dimension("inline", 100) 69 | builder.add_dimension("crossline", 200) 70 | 71 | # Add coordinate using defaults 72 | builder.add_coordinate("cdp", dimensions=("inline", "crossline"), data_type=ScalarType.FLOAT32) 73 | validate_builder(builder, _BuilderState.HAS_COORDINATES, n_dims=2, n_coords=1, n_var=1) 74 | validate_coordinate(builder, name="cdp", dims=[("inline", 100), ("crossline", 200)], dtype=ScalarType.FLOAT32) 75 | v = validate_variable( 76 | builder, 77 | name="cdp", 78 | dims=[("inline", 100), ("crossline", 200)], 79 | coords=["cdp"], 80 | dtype=ScalarType.FLOAT32, 81 | ) 82 | assert v.long_name is None # Default value 83 | assert v.compressor is None # Default value 84 | assert v.metadata is None # Default value 85 | 86 | 87 | def test_coordinate_with_full_parameters() -> None: 88 | """Test adding coordinates with all metadata.""" 89 | builder = MDIODatasetBuilder("test_dataset") 90 | builder.add_dimension("inline", 100) 91 | builder.add_dimension("crossline", 200) 92 | 93 | # Add coordinate with all metadata 94 | metadata = CoordinateMetadata( 95 | units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT), 96 | attributes={"MGA": 51, "UnitSystem": "Imperial"}, 97 | ) 98 | 
builder.add_coordinate( 99 | "cdp", 100 | long_name="Common Depth Point", 101 | dimensions=("inline", "crossline"), 102 | data_type=ScalarType.FLOAT16, 103 | compressor=Blosc(cname=BloscCname.zstd), 104 | metadata=metadata, 105 | ) 106 | validate_builder(builder, _BuilderState.HAS_COORDINATES, n_dims=2, n_coords=1, n_var=1) 107 | c = validate_coordinate(builder, name="cdp", dims=[("inline", 100), ("crossline", 200)], dtype=ScalarType.FLOAT16) 108 | assert c.long_name == "Common Depth Point" 109 | assert isinstance(c.compressor, Blosc) 110 | assert c.compressor.cname == BloscCname.zstd 111 | assert c.metadata.attributes["MGA"] == 51 112 | assert c.metadata.attributes["UnitSystem"] == "Imperial" 113 | assert c.metadata.units_v1.length == LengthUnitEnum.FOOT 114 | v = validate_variable( 115 | builder, 116 | name="cdp", 117 | dims=[("inline", 100), ("crossline", 200)], 118 | coords=["cdp"], 119 | dtype=ScalarType.FLOAT16, 120 | ) 121 | assert isinstance(v.compressor, Blosc) 122 | assert v.compressor.cname == BloscCname.zstd 123 | assert isinstance(v.metadata, VariableMetadata) 124 | assert v.metadata.units_v1.length == LengthUnitEnum.FOOT 125 | assert v.metadata.attributes["MGA"] == 51 126 | assert v.metadata.attributes["UnitSystem"] == "Imperial" 127 | --------------------------------------------------------------------------------
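The tests above exercise the `MDIODatasetBuilder.add_coordinate()` contract: dimensions must be registered first, coordinates then reference them, and duplicate names are rejected. A minimal usage sketch distilled from those same calls (illustrative only, not a file in this repository; the dataset name is arbitrary) looks like:

```python
# Minimal sketch based on the API calls shown in the unit tests above.
from mdio.builder.dataset_builder import MDIODatasetBuilder
from mdio.builder.schemas.dtype import ScalarType

builder = MDIODatasetBuilder("example_dataset")
builder.add_dimension("inline", 100)  # dimensions must exist before any coordinate
builder.add_dimension("crossline", 200)
builder.add_coordinate(  # a non-dimension coordinate defined over both dimensions
    "cdp_x",
    dimensions=("inline", "crossline"),
    data_type=ScalarType.FLOAT32,
)
```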