├── src └── mdio │ ├── py.typed │ ├── api │ ├── __init__.py │ └── io.py │ ├── builder │ ├── __init__.py │ ├── templates │ │ ├── __init__.py │ │ ├── types.py │ │ ├── seismic_2d_poststack.py │ │ ├── seismic_3d_poststack.py │ │ ├── seismic_2d_streamer_shot.py │ │ ├── seismic_2d_cdp.py │ │ ├── seismic_3d_coca.py │ │ ├── seismic_3d_streamer_shot.py │ │ ├── seismic_3d_cdp.py │ │ └── seismic_3d_streamer_field.py │ └── schemas │ │ ├── __init__.py │ │ ├── v1 │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── stats.py │ │ ├── variable.py │ │ └── units.py │ │ ├── dimension.py │ │ ├── core.py │ │ ├── chunk_grid.py │ │ ├── base.py │ │ ├── dtype.py │ │ ├── units.py │ │ └── compressors.py │ ├── segy │ ├── __init__.py │ ├── helpers_segy.py │ ├── _raw_trace_wrapper.py │ ├── scalar.py │ ├── exceptions.py │ ├── parsers.py │ └── compat.py │ ├── commands │ ├── __init__.py │ ├── copy.py │ └── info.py │ ├── core │ ├── __init__.py │ ├── zarr_io.py │ ├── utils_write.py │ ├── config.py │ ├── dimension.py │ ├── indexing.py │ └── grid.py │ ├── converters │ ├── __init__.py │ ├── exceptions.py │ └── type_converter.py │ ├── __init__.py │ ├── constants.py │ ├── exceptions.py │ └── __main__.py ├── .gitattributes ├── tests ├── __init__.py ├── unit │ ├── __init__.py │ ├── v1 │ │ ├── __init__.py │ │ ├── templates │ │ │ ├── conftest.py │ │ │ ├── test_seismic_templates.py │ │ │ └── test_seismic_2d_poststack.py │ │ ├── test_dataset_builder_helpers.py │ │ ├── test_dataset_builder_add_dimension.py │ │ └── test_dataset_builder_add_coordinate.py │ ├── test_dimension.py │ ├── test_environment.py │ ├── test_auto_chunking.py │ ├── test_coordinate_scalar.py │ ├── test_segy_spec_validation.py │ ├── test_type_converter.py │ ├── test_segy_grid_overrides.py │ └── test_indexing.py ├── integration │ └── testing_helpers.py ├── conftest.py └── test_main.py ├── docs ├── codeofconduct.md ├── license.md ├── contributing.md ├── data_models │ ├── index.md │ ├── dimensions.md │ ├── compressors.md │ ├── version_1.md │ └── chunk_grids.md ├── requirements.txt ├── tutorials │ └── index.md ├── index.md ├── api_reference.md ├── conf.py ├── installation.md └── template_registry.md ├── .darglint ├── .github ├── workflows │ ├── constraints.txt │ ├── labeler.yml │ ├── release.yml │ └── tests.yml ├── dependabot.yml ├── release-drafter.yml └── labels.yml ├── .readthedocs.yml ├── .editorconfig ├── .devcontainer ├── post-create.sh └── devcontainer.json ├── .pre-commit-config.yaml ├── .dockerignore ├── .gitignore ├── CONTRIBUTING.md ├── pyproject.toml └── CODE_OF_CONDUCT.md /src/mdio/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto eol=lf 2 | -------------------------------------------------------------------------------- /src/mdio/api/__init__.py: -------------------------------------------------------------------------------- 1 | """Public API.""" 2 | -------------------------------------------------------------------------------- /src/mdio/builder/__init__.py: -------------------------------------------------------------------------------- 1 | """MDIO building utilities.""" 2 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Test suite for the MDIO package.""" 2 | 
-------------------------------------------------------------------------------- /docs/codeofconduct.md: -------------------------------------------------------------------------------- 1 | ```{include} ../CODE_OF_CONDUCT.md 2 | 3 | ``` 4 | -------------------------------------------------------------------------------- /src/mdio/segy/__init__.py: -------------------------------------------------------------------------------- 1 | """SEG-Y specific implementation module.""" 2 | -------------------------------------------------------------------------------- /.darglint: -------------------------------------------------------------------------------- 1 | [darglint] 2 | docstring_style = google 3 | strictness = long 4 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | """Unit tests for parts of the MDIO package.""" 2 | -------------------------------------------------------------------------------- /.github/workflows/constraints.txt: -------------------------------------------------------------------------------- 1 | bump-my-version==1.2.4 2 | nox==2025.10.16 3 | -------------------------------------------------------------------------------- /src/mdio/builder/templates/__init__.py: -------------------------------------------------------------------------------- 1 | """MDIO templates for known dataset kinds.""" 2 | -------------------------------------------------------------------------------- /tests/unit/v1/__init__.py: -------------------------------------------------------------------------------- 1 | """Unit tests for parts of the MDIO package related to the v1 schema.""" 2 | -------------------------------------------------------------------------------- /src/mdio/builder/schemas/__init__.py: -------------------------------------------------------------------------------- 1 | """MDIO schemas for different data types.""" 2 | 3 | __all__ = [] 4 | -------------------------------------------------------------------------------- /docs/license.md: -------------------------------------------------------------------------------- 1 | # License 2 | 3 | ```{literalinclude} ../LICENSE 4 | --- 5 | language: none 6 | --- 7 | ``` 8 | -------------------------------------------------------------------------------- /src/mdio/commands/__init__.py: -------------------------------------------------------------------------------- 1 | """Plugins for MDIO CLI commands. 2 | 3 | Default Plugins: 4 | 5 | * SEG-Y: CLI commands to ingest / export SEG-Y. 6 | """ 7 | -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | ```{include} ../CONTRIBUTING.md 2 | --- 3 | end-before: 4 | --- 5 | ``` 6 | 7 | [code of conduct]: codeofconduct 8 | -------------------------------------------------------------------------------- /src/mdio/builder/schemas/v1/__init__.py: -------------------------------------------------------------------------------- 1 | """Schema specific to MDIO v1.""" 2 | 3 | from mdio.builder.schemas.v1.dataset import Dataset 4 | 5 | __all__ = ["Dataset"] 6 | -------------------------------------------------------------------------------- /docs/data_models/index.md: -------------------------------------------------------------------------------- 1 | # Dataset Models 2 | 3 | This section contains the data models for the MDIO format. 
4 | 5 | ```{toctree} 6 | :maxdepth: 2 7 | 8 | version_1 9 | ``` 10 | -------------------------------------------------------------------------------- /src/mdio/core/__init__.py: -------------------------------------------------------------------------------- 1 | """MDIO core functionalities.""" 2 | 3 | from mdio.core.dimension import Dimension 4 | from mdio.core.grid import Grid 5 | 6 | __all__ = ["Dimension", "Grid"] 7 | -------------------------------------------------------------------------------- /src/mdio/converters/__init__.py: -------------------------------------------------------------------------------- 1 | """MDIO Data conversion API.""" 2 | 3 | from mdio.converters.mdio import mdio_to_segy 4 | from mdio.converters.segy import segy_to_mdio 5 | 6 | __all__ = ["mdio_to_segy", "segy_to_mdio"] 7 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | build: 3 | os: ubuntu-22.04 4 | tools: 5 | python: "3.13" 6 | sphinx: 7 | configuration: docs/conf.py 8 | formats: all 9 | python: 10 | install: 11 | - requirements: docs/requirements.txt 12 | - path: . 13 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.13.2 2 | autodoc-pydantic==2.2.0 3 | furo==2025.9.25 4 | linkify-it-py==2.0.3 5 | matplotlib==3.10.7 6 | myst-nb==1.3.0 7 | sphinx==8.2.3 8 | sphinx-click==6.1.0 9 | sphinx-copybutton==0.5.2 10 | sphinx-design==0.6.1 11 | ipywidgets==8.1.7 12 | -------------------------------------------------------------------------------- /src/mdio/builder/templates/types.py: -------------------------------------------------------------------------------- 1 | """Module that contains type aliases for templates.""" 2 | 3 | from typing import Literal 4 | from typing import TypeAlias 5 | 6 | SeismicDataDomain: TypeAlias = Literal["depth", "time"] 7 | 8 | CdpGatherDomain: TypeAlias = Literal["offset", "angle"] 9 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | charset = utf-8 5 | end_of_line = lf 6 | insert_final_newline = true 7 | trim_trailing_whitespace = true 8 | 9 | [*.{py,toml}] 10 | indent_style = space 11 | indent_size = 4 12 | 13 | [*.{yml,yaml,json}] 14 | indent_style = space 15 | indent_size = 2 16 | -------------------------------------------------------------------------------- /docs/tutorials/index.md: -------------------------------------------------------------------------------- 1 | # Tutorials 2 | 3 | Welcome to the tutorials. This section collects hands‑on guides that walk you through common MDIO workflows. 4 | 5 | Pick a topic from the list below to get started. 
6 | 7 | ```{toctree} 8 | :maxdepth: 1 9 | :titlesonly: 10 | 11 | quickstart 12 | creation 13 | compression 14 | rechunking 15 | corrupt_files 16 | custom_template 17 | ``` 18 | -------------------------------------------------------------------------------- /.github/workflows/labeler.yml: -------------------------------------------------------------------------------- 1 | name: Labeler 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | 9 | jobs: 10 | labeler: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Check out the repository 14 | uses: actions/checkout@v5 15 | 16 | - name: Run Labeler 17 | uses: crazy-max/ghaction-github-labeler@v5 18 | with: 19 | skip-delete: true 20 | -------------------------------------------------------------------------------- /src/mdio/builder/schemas/dimension.py: -------------------------------------------------------------------------------- 1 | """Dimension schema.""" 2 | 3 | from pydantic import Field 4 | 5 | from mdio.builder.schemas.core import CamelCaseStrictModel 6 | 7 | 8 | class NamedDimension(CamelCaseStrictModel): 9 | """Represents a single dimension with a name and size.""" 10 | 11 | name: str = Field(..., description="Unique identifier for the dimension.") 12 | size: int = Field(..., gt=0, description="Total size of the dimension.") 13 | -------------------------------------------------------------------------------- /src/mdio/__init__.py: -------------------------------------------------------------------------------- 1 | """MDIO library.""" 2 | 3 | from importlib import metadata 4 | 5 | from mdio.api.io import open_mdio 6 | from mdio.api.io import to_mdio 7 | from mdio.converters import mdio_to_segy 8 | from mdio.converters import segy_to_mdio 9 | 10 | try: 11 | __version__ = metadata.version("multidimio") 12 | except metadata.PackageNotFoundError: 13 | __version__ = "unknown" 14 | 15 | 16 | __all__ = [ 17 | "__version__", 18 | "open_mdio", 19 | "to_mdio", 20 | "mdio_to_segy", 21 | "segy_to_mdio", 22 | ] 23 | -------------------------------------------------------------------------------- /.devcontainer/post-create.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | # Install useful developer tools used by mdio development 6 | uv tool install nox 7 | uv tool install bump-my-version 8 | 9 | # Sync the environment, installing the project editable and including dev dependencies 10 | uv sync 11 | 12 | # Set Git safe directory to avoid ownership issues 13 | git config --global --add safe.directory "$PWD" 14 | 15 | # Optional: If you need to reset GitHub host key for SSH (uncomment if necessary) 16 | # ssh-keygen -f "/home/vscode/.ssh/known_hosts" -R "github.com" 17 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: github-actions 4 | directory: "/" 5 | schedule: 6 | interval: monthly 7 | - package-ecosystem: pip 8 | directory: "/.github/workflows" 9 | schedule: 10 | interval: monthly 11 | - package-ecosystem: pip 12 | directory: "/docs" 13 | schedule: 14 | interval: monthly 15 | - package-ecosystem: pip 16 | directory: "/" 17 | schedule: 18 | interval: monthly 19 | versioning-strategy: lockfile-only 20 | allow: 21 | - dependency-type: "all" 22 | -------------------------------------------------------------------------------- /src/mdio/builder/schemas/core.py: 
-------------------------------------------------------------------------------- 1 | """This module implements the core components of the MDIO schemas.""" 2 | 3 | from __future__ import annotations 4 | 5 | from pydantic import BaseModel 6 | from pydantic import ConfigDict 7 | from pydantic.alias_generators import to_camel 8 | 9 | 10 | class CamelCaseStrictModel(BaseModel): 11 | """A model with forbidden extras and camel case aliases.""" 12 | 13 | model_config = ConfigDict( 14 | alias_generator=to_camel, 15 | validate_by_name=True, 16 | serialize_by_alias=True, 17 | validate_assignment=True, 18 | extra="forbid", 19 | ) 20 | -------------------------------------------------------------------------------- /docs/data_models/dimensions.md: -------------------------------------------------------------------------------- 1 | ```{eval-rst} 2 | :tocdepth: 3 3 | ``` 4 | 5 | ```{currentModule} mdio.builder.schemas.dimension 6 | 7 | ``` 8 | 9 | # Dimensions 10 | 11 | ```{article-info} 12 | :author: Altay Sansal 13 | :date: "{sub-ref}`today`" 14 | :read-time: "{sub-ref}`wordcount-minutes` min read" 15 | :class-container: sd-p-0 sd-outline-muted sd-rounded-3 sd-font-weight-light 16 | ``` 17 | 18 | ## Intro 19 | 20 | ```{eval-rst} 21 | .. autosummary:: NamedDimension 22 | ``` 23 | 24 | ## Reference 25 | 26 | :::{dropdown} Dimension 27 | :open: 28 | 29 | ```{eval-rst} 30 | .. autopydantic_model:: NamedDimension 31 | ``` 32 | 33 | ::: 34 | -------------------------------------------------------------------------------- /src/mdio/core/zarr_io.py: -------------------------------------------------------------------------------- 1 | """Utilities to open/write Zarr files.""" 2 | 3 | from __future__ import annotations 4 | 5 | import warnings 6 | from contextlib import contextmanager 7 | from typing import TYPE_CHECKING 8 | 9 | from zarr.errors import UnstableSpecificationWarning 10 | 11 | if TYPE_CHECKING: 12 | from collections.abc import Generator 13 | 14 | 15 | @contextmanager 16 | def zarr_warnings_suppress_unstable_structs_v3() -> Generator[None, None, None]: 17 | """Context manager to suppress Zarr V3 unstable structured array warning.""" 18 | warn = r"The data type \((.*?)\) does not have a Zarr V3 specification\." 
19 | warnings.filterwarnings("ignore", message=warn, category=UnstableSpecificationWarning) 20 | try: 21 | yield 22 | finally: 23 | pass 24 | -------------------------------------------------------------------------------- /tests/unit/v1/templates/conftest.py: -------------------------------------------------------------------------------- 1 | """Shared pytest fixtures for the template unit tests.""" 2 | 3 | # conftest.py 4 | import pytest 5 | 6 | from mdio.builder.schemas.dtype import ScalarType 7 | from mdio.builder.schemas.dtype import StructuredField 8 | from mdio.builder.schemas.dtype import StructuredType 9 | 10 | 11 | @pytest.fixture(scope="session") 12 | def structured_headers() -> StructuredType: 13 | """Fixture to provide structured headers for testing.""" 14 | return StructuredType( 15 | fields=[ 16 | StructuredField(name="cdp_x", format=ScalarType.INT32), 17 | StructuredField(name="cdp_y", format=ScalarType.INT32), 18 | StructuredField(name="elevation", format=ScalarType.FLOAT16), 19 | StructuredField(name="some_scalar", format=ScalarType.FLOAT16), 20 | ] 21 | ) 22 | -------------------------------------------------------------------------------- /.github/release-drafter.yml: -------------------------------------------------------------------------------- 1 | categories: 2 | - title: ":boom: Breaking Changes" 3 | label: "breaking" 4 | - title: ":rocket: Features" 5 | label: "enhancement" 6 | - title: ":fire: Removals and Deprecations" 7 | label: "removal" 8 | - title: ":beetle: Fixes" 9 | label: "bug" 10 | - title: ":racehorse: Performance" 11 | label: "performance" 12 | - title: ":rotating_light: Testing" 13 | label: "testing" 14 | - title: ":construction_worker: Continuous Integration" 15 | label: "ci" 16 | - title: ":books: Documentation" 17 | label: "documentation" 18 | - title: ":hammer: Refactoring" 19 | label: "refactoring" 20 | - title: ":lipstick: Style" 21 | label: "style" 22 | - title: ":package: Dependencies" 23 | labels: 24 | - "dependencies" 25 | - "build" 26 | template: | 27 | ## Changes 28 | 29 | $CHANGES 30 | -------------------------------------------------------------------------------- /src/mdio/builder/templates/seismic_2d_poststack.py: -------------------------------------------------------------------------------- 1 | """Seismic2DPostStackTemplate MDIO v1 dataset templates.""" 2 | 3 | from typing import Any 4 | 5 | from mdio.builder.templates.base import AbstractDatasetTemplate 6 | from mdio.builder.templates.types import SeismicDataDomain 7 | 8 | 9 | class Seismic2DPostStackTemplate(AbstractDatasetTemplate): 10 | """Seismic post-stack 2D time or depth Dataset template.""" 11 | 12 | def __init__(self, data_domain: SeismicDataDomain): 13 | super().__init__(data_domain=data_domain) 14 | 15 | self._dim_names = ("cdp", self._data_domain) 16 | self._physical_coord_names = ("cdp_x", "cdp_y") 17 | self._var_chunk_shape = (1024, 1024) 18 | 19 | @property 20 | def _name(self) -> str: 21 | return f"PostStack2D{self._data_domain.capitalize()}" 22 | 23 | def _load_dataset_attributes(self) -> dict[str, Any]: 24 | return {"surveyType": "2D", "gatherType": "stacked"} 25 | -------------------------------------------------------------------------------- /src/mdio/builder/templates/seismic_3d_poststack.py: -------------------------------------------------------------------------------- 1 | """Seismic3DPostStackTemplate MDIO v1 dataset templates.""" 2 | 3 | from typing import Any 4 | 5 | from mdio.builder.templates.base import AbstractDatasetTemplate 6 | from 
mdio.builder.templates.types import SeismicDataDomain 7 | 8 | 9 | class Seismic3DPostStackTemplate(AbstractDatasetTemplate): 10 | """Seismic post-stack 3D time or depth Dataset template.""" 11 | 12 | def __init__(self, data_domain: SeismicDataDomain): 13 | super().__init__(data_domain=data_domain) 14 | 15 | self._dim_names = ("inline", "crossline", self._data_domain) 16 | self._physical_coord_names = ("cdp_x", "cdp_y") 17 | self._var_chunk_shape = (128, 128, 128) 18 | 19 | @property 20 | def _name(self) -> str: 21 | domain_suffix = self._data_domain.capitalize() 22 | return f"PostStack3D{domain_suffix}" 23 | 24 | def _load_dataset_attributes(self) -> dict[str, Any]: 25 | return {"surveyType": "3D", "gatherType": "stacked"} 26 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | ```{include} ../README.md 2 | --- 3 | end-before: 4 | --- 5 | ``` 6 | 7 | [apache 2.0 license]: license 8 | [contributor guide]: contributing 9 | [command-line usage]: cli_usage 10 | [api reference]: api_reference 11 | [installation instructions]: installation 12 | 13 | ```{toctree} 14 | :hidden: 15 | :caption: Getting Started 16 | 17 | installation 18 | cli_usage 19 | configuration 20 | ``` 21 | 22 | ```{toctree} 23 | :hidden: 24 | :caption: Learning and Support 25 | 26 | tutorials/index 27 | api_reference 28 | ``` 29 | 30 | ```{toctree} 31 | :hidden: 32 | :caption: Core Concepts and Structures 33 | 34 | data_models/index 35 | data_models/dimensions 36 | data_models/chunk_grids 37 | data_models/data_types 38 | data_models/compressors 39 | template_registry 40 | ``` 41 | 42 | ```{toctree} 43 | :hidden: 44 | :caption: Community and Contribution 45 | 46 | contributing 47 | Code of Conduct 48 | ``` 49 | 50 | ```{toctree} 51 | :hidden: 52 | :caption: Additional Resources 53 | 54 | License 55 | Changelog 56 | ``` 57 | -------------------------------------------------------------------------------- /docs/api_reference.md: -------------------------------------------------------------------------------- 1 | # API Reference 2 | 3 | ## Data Converters 4 | 5 | ### Seismic Data 6 | 7 | ````{note} 8 | By default, the SEG-Y ingestion tool uses Python's multiprocessing 9 | to speed up parsing the data. This almost always requires a `__main__` 10 | guard on any other Python code that is executed directly like 11 | `python file.py`. When running inside Jupyter, this is **NOT** needed. 12 | 13 | ```python 14 | if __name__ == "__main__": 15 | segy_to_mdio(...) 16 | ``` 17 | 18 | When the CLI is invoked, this is already handled. 19 | 20 | See the official `multiprocessing` documentation 21 | [here](https://docs.python.org/3/library/multiprocessing.html#the-process-class) 22 | and 23 | [here](https://docs.python.org/3/library/multiprocessing.html#multiprocessing-programming). 24 | ```` 25 | 26 | ```{eval-rst} 27 | .. automodule:: mdio.converters.segy 28 | :members: 29 | :exclude-members: grid_density_qc, parse_index_types, get_compressor, populate_dim_coordinates, populate_non_dim_coordinates 30 | 31 | .. automodule:: mdio.converters.mdio 32 | :members: 33 | ``` 34 | 35 | ## Core Functionality 36 | 37 | ### Dimensions 38 | 39 | ```{eval-rst} 40 | .. 
automodule:: mdio.core.dimension 41 | :members: 42 | ``` 43 | -------------------------------------------------------------------------------- /tests/unit/v1/test_dataset_builder_helpers.py: -------------------------------------------------------------------------------- 1 | """Tests the schema v1 dataset_builder internal methods.""" 2 | 3 | import pytest 4 | 5 | from mdio.builder.dataset_builder import _get_named_dimension 6 | from mdio.builder.schemas.dimension import NamedDimension 7 | 8 | 9 | def test__get_named_dimension() -> None: 10 | """Test getting a dimension by name from the list of dimensions.""" 11 | dimensions = [NamedDimension(name="inline", size=2), NamedDimension(name="crossline", size=3)] 12 | 13 | assert _get_named_dimension([], "inline") is None 14 | assert _get_named_dimension(dimensions, "inline") == NamedDimension(name="inline", size=2) 15 | assert _get_named_dimension(dimensions, "crossline") == NamedDimension(name="crossline", size=3) 16 | assert _get_named_dimension(dimensions, "time") is None 17 | 18 | with pytest.raises(TypeError, match="Expected str, got NoneType"): 19 | _get_named_dimension(dimensions, None) 20 | with pytest.raises(TypeError, match="Expected str, got int"): 21 | _get_named_dimension(dimensions, 42) 22 | with pytest.raises(ValueError, match="Dimension 'inline' found but size 2 does not match expected size 200"): 23 | _get_named_dimension(dimensions, "inline", size=200) 24 | -------------------------------------------------------------------------------- /src/mdio/segy/helpers_segy.py: -------------------------------------------------------------------------------- 1 | """Helper functions for tinkering with SEG-Y related Zarr.""" 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | from zarr.errors import ContainsGroupError 6 | 7 | from mdio.exceptions import MDIOAlreadyExistsError 8 | 9 | if TYPE_CHECKING: 10 | from zarr import Group 11 | 12 | 13 | def create_zarr_hierarchy(root_group: "Group", overwrite: bool) -> "Group": 14 | """Create `zarr` hierarchy for SEG-Y files. 15 | 16 | Args: 17 | root_group: Output root group where data will be written. 18 | overwrite: Toggle for overwriting existing store. 19 | 20 | Returns: 21 | Zarr Group instance for root of the file. 22 | 23 | Raises: 24 | MDIOAlreadyExistsError: If a file with data already exists. 25 | """ 26 | try: 27 | root_group.create_group(name="data", overwrite=overwrite) 28 | root_group.create_group(name="metadata", overwrite=overwrite) 29 | except ContainsGroupError as e: 30 | msg = ( 31 | f"An MDIO file with data already exists at {root_group.store_path}. " 32 | "If this is intentional, please specify 'overwrite=True'." 
33 | ) 34 | raise MDIOAlreadyExistsError(msg) from e 35 | 36 | return root_group 37 | -------------------------------------------------------------------------------- /src/mdio/builder/schemas/v1/dataset.py: -------------------------------------------------------------------------------- 1 | """Dataset model for MDIO V1.""" 2 | 3 | from typing import Any 4 | 5 | from pydantic import AwareDatetime 6 | from pydantic import Field 7 | 8 | from mdio.builder.schemas.base import BaseDataset 9 | from mdio.builder.schemas.core import CamelCaseStrictModel 10 | from mdio.builder.schemas.v1.variable import Variable 11 | 12 | 13 | class DatasetMetadata(CamelCaseStrictModel): 14 | """Contains information about a dataset.""" 15 | 16 | name: str = Field(..., description="Name or identifier for the dataset.") 17 | 18 | api_version: str = Field( 19 | ..., 20 | description="The version of the MDIO API that the dataset complies with.", 21 | ) 22 | 23 | created_on: AwareDatetime = Field( 24 | ..., 25 | description=( 26 | "The timestamp indicating when the dataset was first created, " 27 | "including timezone information. Expressed in ISO 8601 format." 28 | ), 29 | ) 30 | 31 | attributes: dict[str, Any] | None = Field(default=None, description="User defined attributes as key/value pairs.") 32 | 33 | 34 | class Dataset(BaseDataset): 35 | """Represents an MDIO v1 dataset. 36 | 37 | A dataset consists of variables and metadata. 38 | """ 39 | 40 | variables: list[Variable] = Field(..., description="Variables in MDIO dataset") 41 | metadata: DatasetMetadata = Field(..., description="Dataset metadata.") 42 | -------------------------------------------------------------------------------- /src/mdio/core/utils_write.py: -------------------------------------------------------------------------------- 1 | """Convenience utilities for writing to Zarr.""" 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | from dask.array.core import normalize_chunks 6 | from dask.array.rechunk import _balance_chunksizes 7 | 8 | if TYPE_CHECKING: 9 | from numpy.typing import DTypeLike 10 | 11 | 12 | MAX_SIZE_LIVE_MASK = 256 * 1024**2 13 | MAX_COORDINATES_BYTES = 32 * 1024**2 14 | 15 | 16 | def get_constrained_chunksize( 17 | shape: tuple[int, ...], 18 | dtype: "DTypeLike", 19 | max_bytes: int, 20 | ) -> tuple[int, ...]: 21 | """Calculate the optimal chunk size for N-D array based on max_bytes. 22 | 23 | Args: 24 | shape: The shape of the array. 25 | dtype: The data dtype to be used in calculation. 26 | max_bytes: The maximum allowed number of bytes per chunk. 27 | 28 | Returns: 29 | A sequence of integers of calculated chunk sizes. 30 | """ 31 | chunks = normalize_chunks("auto", shape, dtype=dtype, limit=max_bytes) 32 | return tuple(_balance_chunksizes(chunk)[0] for chunk in chunks) 33 | 34 | 35 | def get_live_mask_chunksize(shape: tuple[int, ...]) -> tuple[int, ...]: 36 | """Given a live_mask shape, calculate the optimal write chunk size. 37 | 38 | Args: 39 | shape: The shape of the array. 40 | 41 | Returns: 42 | A sequence of integers of calculated chunk sizes. 
43 | """ 44 | return get_constrained_chunksize(shape, "bool", MAX_SIZE_LIVE_MASK) 45 | -------------------------------------------------------------------------------- /src/mdio/builder/schemas/chunk_grid.py: -------------------------------------------------------------------------------- 1 | """This module contains data models for Zarr's chunk grid.""" 2 | 3 | from __future__ import annotations 4 | 5 | from pydantic import Field 6 | 7 | from mdio.builder.schemas.core import CamelCaseStrictModel 8 | 9 | 10 | class RegularChunkShape(CamelCaseStrictModel): 11 | """Represents regular chunk sizes along each dimension.""" 12 | 13 | chunk_shape: tuple[int, ...] = Field(..., description="Lengths of the chunk along each dimension of the array.") 14 | 15 | 16 | class RectilinearChunkShape(CamelCaseStrictModel): 17 | """Represents irregular chunk sizes along each dimension.""" 18 | 19 | chunk_shape: tuple[tuple[int, ...], ...] = Field( 20 | ..., 21 | description="Lengths of the chunk along each dimension of the array.", 22 | ) 23 | 24 | 25 | class RegularChunkGrid(CamelCaseStrictModel): 26 | """Represents a rectangular and regularly spaced chunk grid.""" 27 | 28 | name: str = Field(default="regular", description="The name of the chunk grid.") 29 | 30 | configuration: RegularChunkShape = Field(..., description="Configuration of the regular chunk grid.") 31 | 32 | 33 | class RectilinearChunkGrid(CamelCaseStrictModel): 34 | """Represents a rectangular and irregularly spaced chunk grid.""" 35 | 36 | name: str = Field(default="rectilinear", description="The name of the chunk grid.") 37 | 38 | configuration: RectilinearChunkShape = Field(..., description="Configuration of the irregular chunk grid.") 39 | -------------------------------------------------------------------------------- /src/mdio/converters/exceptions.py: -------------------------------------------------------------------------------- 1 | """Custom exceptions for MDIO converters.""" 2 | 3 | 4 | class EnvironmentFormatError(Exception): 5 | """Raised when environment variable is of the wrong format.""" 6 | 7 | def __init__(self, name: str, format: str, msg: str = ""): # noqa: A002 8 | self.message = f"Environment variable: {name} not of expected format: {format}. " 9 | self.message += f"\n{msg}" if msg else "" 10 | super().__init__(self.message) 11 | 12 | 13 | class GridTraceCountError(Exception): 14 | """Raised when grid trace counts don't match the SEG-Y trace count.""" 15 | 16 | def __init__(self, grid_traces: int, segy_traces: int): 17 | self.message = ( 18 | f"{grid_traces} != {segy_traces}. Scanned grid trace count ({grid_traces}) doesn't " 19 | f"match SEG-Y file ({segy_traces}). Either indexing parameters are wrong (not unique) " 20 | "or SEG-Y file has duplicate traces." 21 | ) 22 | 23 | super().__init__(self.message) 24 | 25 | 26 | class GridTraceSparsityError(Exception): 27 | """Raised when mdio grid will be sparsely populated from SEG-Y traces.""" 28 | 29 | def __init__(self, shape: tuple[int, ...], num_traces: int, msg: str = ""): 30 | self.message = ( 31 | f"Grid shape: {shape} but SEG-Y tracecount: {num_traces}. This grid is very sparse " 32 | "and most likely user error with indexing." 
33 | ) 34 | self.message += f"\n{msg}" if msg else "" 35 | super().__init__(self.message) 36 | -------------------------------------------------------------------------------- /src/mdio/builder/schemas/base.py: -------------------------------------------------------------------------------- 1 | """Base models to subclass from.""" 2 | 3 | from pydantic import ConfigDict 4 | from pydantic import Field 5 | from pydantic.json_schema import GenerateJsonSchema 6 | 7 | from mdio.builder.schemas.compressors import ZFP 8 | from mdio.builder.schemas.compressors import Blosc 9 | from mdio.builder.schemas.core import CamelCaseStrictModel 10 | from mdio.builder.schemas.dimension import NamedDimension 11 | from mdio.builder.schemas.dtype import DataTypeModel 12 | 13 | JSON_SCHEMA_DIALECT = GenerateJsonSchema.schema_dialect 14 | 15 | 16 | class BaseDataset(CamelCaseStrictModel): 17 | """A base class for MDIO datasets. 18 | 19 | We add schema dialect to extend the config of `StrictCamelBaseModel`. 20 | We use the default Pydantic schema generator `GenerateJsonSchema` to 21 | define the JSON schema dialect accurately. 22 | """ 23 | 24 | model_config = ConfigDict(json_schema_extra={"$schema": JSON_SCHEMA_DIALECT}) 25 | 26 | 27 | class BaseArray(DataTypeModel, CamelCaseStrictModel): 28 | """A base array schema.""" 29 | 30 | dimensions: list[NamedDimension] | list[str] = Field( 31 | ..., description="List of Dimension collection or reference to dimension names." 32 | ) 33 | compressor: Blosc | ZFP | None = Field(default=None, description="Compression settings.") 34 | 35 | 36 | class NamedArray(BaseArray): 37 | """An array with a name.""" 38 | 39 | name: str = Field(..., description="Name of the array.") 40 | long_name: str | None = Field(default=None, description="Fully descriptive name.") 41 | -------------------------------------------------------------------------------- /src/mdio/builder/schemas/dtype.py: -------------------------------------------------------------------------------- 1 | """Schemas for scalar types. 2 | 3 | We take booleans, unsigned and signed integers, floats, and 4 | complex numbers from numpy data types and allow those. 5 | """ 6 | 7 | from __future__ import annotations 8 | 9 | from enum import StrEnum 10 | 11 | from pydantic import Field 12 | 13 | from mdio.builder.schemas.core import CamelCaseStrictModel 14 | 15 | 16 | class ScalarType(StrEnum): 17 | """Scalar array data type.""" 18 | 19 | BOOL = "bool" 20 | INT8 = "int8" 21 | INT16 = "int16" 22 | INT32 = "int32" 23 | INT64 = "int64" 24 | UINT8 = "uint8" 25 | UINT16 = "uint16" 26 | UINT32 = "uint32" 27 | UINT64 = "uint64" 28 | FLOAT16 = "float16" 29 | FLOAT32 = "float32" 30 | FLOAT64 = "float64" 31 | FLOAT128 = "float128" 32 | COMPLEX64 = "complex64" 33 | COMPLEX128 = "complex128" 34 | COMPLEX256 = "complex256" 35 | BYTES240 = "V240" # fixed-width 240-byte string, used for raw v0/1/2 trace headers 36 | 37 | 38 | class StructuredField(CamelCaseStrictModel): 39 | """Structured array field with name, format.""" 40 | 41 | format: ScalarType = Field(...) 42 | name: str = Field(...) 
43 | 44 | 45 | class StructuredType(CamelCaseStrictModel): 46 | """Structured array type with packed fields.""" 47 | 48 | fields: list[StructuredField] = Field() 49 | 50 | 51 | class DataTypeModel(CamelCaseStrictModel): 52 | """Structured array type with fields and total item size.""" 53 | 54 | data_type: ScalarType | StructuredType = Field(..., description="Type of the array.") 55 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | # should be replaced in the future ref https://github.com/astral-sh/ruff/issues/458 3 | - repo: https://github.com/jsh9/pydoclint 4 | rev: 0.6.6 5 | hooks: 6 | - id: pydoclint 7 | - repo: local 8 | hooks: 9 | - id: ruff-format 10 | name: Format code with Ruff 11 | entry: ruff format 12 | language: system 13 | types_or: [python, pyi, jupyter] 14 | - id: ruff 15 | name: Lint code with Ruff 16 | entry: ruff check 17 | language: system 18 | types_or: [python, pyi, jupyter] 19 | args: [--fix] 20 | - id: check-added-large-files 21 | name: Check for added large files 22 | entry: check-added-large-files 23 | language: system 24 | args: ["--maxkb=1100"] 25 | - id: check-toml 26 | name: Check Toml 27 | entry: check-toml 28 | language: system 29 | types: [toml] 30 | - id: check-yaml 31 | name: Check Yaml 32 | entry: check-yaml 33 | language: system 34 | types: [yaml] 35 | - id: end-of-file-fixer 36 | name: Fix End of Files 37 | entry: end-of-file-fixer 38 | language: system 39 | types: [text] 40 | stages: [pre-commit, pre-push, manual] 41 | - id: trailing-whitespace 42 | name: Trim Trailing Whitespace 43 | entry: trailing-whitespace-fixer 44 | language: system 45 | types: [text] 46 | stages: [pre-commit, pre-push, manual] 47 | args: [--markdown-linebreak-ext=md] 48 | - repo: https://github.com/pre-commit/mirrors-prettier 49 | rev: v3.1.0 50 | hooks: 51 | - id: prettier 52 | -------------------------------------------------------------------------------- /src/mdio/builder/schemas/units.py: -------------------------------------------------------------------------------- 1 | """Common units for resource assessment data.""" 2 | 3 | from __future__ import annotations 4 | 5 | from enum import Enum 6 | from enum import unique 7 | 8 | from pydantic import Field 9 | from pydantic import create_model 10 | 11 | from mdio.builder.schemas.core import CamelCaseStrictModel 12 | 13 | 14 | @unique 15 | class UnitEnum(str, Enum): 16 | """An Enum representing units as strings, from pint.""" 17 | 18 | 19 | def create_unit_model( 20 | unit_enum: type[UnitEnum], 21 | model_name: str, 22 | quantity: str, 23 | module: str, 24 | ) -> type[CamelCaseStrictModel]: 25 | """Dynamically creates a pydantic model from a unit Enum. 26 | 27 | Args: 28 | unit_enum: UnitEnum representing the units for a specific quantity. 29 | model_name: The name of the model to be created. 30 | quantity: String representing the quantity for which the unit model is created. 31 | module: Name of the module in which the model is to be created. 32 | This should be the `__name__` attribute of the module. 33 | 34 | Returns: 35 | A Pydantic Model representing the unit model derived from the BaseModel. 
36 | 37 | Example: 38 | unit_enum = UnitEnum 39 | model_name = "LengthUnitModel" 40 | quantity = "length" 41 | create_unit_model(unit_enum, model_name, quantity) 42 | """ 43 | fields = {quantity: (unit_enum, Field(..., description=f"Unit of {quantity}."))} 44 | 45 | return create_model( 46 | model_name, 47 | **fields, 48 | __base__=CamelCaseStrictModel, 49 | __doc__=f"Model representing units of {quantity}.", 50 | __module__=module, 51 | ) 52 | -------------------------------------------------------------------------------- /tests/unit/v1/test_dataset_builder_add_dimension.py: -------------------------------------------------------------------------------- 1 | """Tests the schema v1 dataset_builder.add_dimension() public API.""" 2 | 3 | import pytest 4 | 5 | from mdio.builder.dataset_builder import MDIODatasetBuilder 6 | from mdio.builder.dataset_builder import _BuilderState 7 | from mdio.builder.dataset_builder import _get_named_dimension 8 | 9 | from .helpers import validate_builder 10 | 11 | 12 | def test_add_dimension() -> None: 13 | """Test adding dimension. Check the state transition and validate required parameters.""" 14 | builder = MDIODatasetBuilder("test_dataset") 15 | assert builder._state == _BuilderState.INITIAL 16 | 17 | # Validate required parameters 18 | bad_name = None 19 | with pytest.raises(ValueError, match="'name' must be a non-empty string"): 20 | builder.add_dimension(bad_name, 200) 21 | with pytest.raises(ValueError, match="'name' must be a non-empty string"): 22 | builder.add_dimension("", 200) 23 | 24 | # First dimension should change state to HAS_DIMENSIONS and create a variable 25 | builder.add_dimension("x", 100) 26 | validate_builder(builder, _BuilderState.HAS_DIMENSIONS, n_dims=1, n_coords=0, n_var=0) 27 | assert _get_named_dimension(builder._dimensions, "x", 100) is not None 28 | 29 | # Validate that we can't add a dimension with the same name twice 30 | with pytest.raises( 31 | ValueError, 32 | match="Adding dimension with the same name twice is not allowed", 33 | ): 34 | builder.add_dimension("x", 200) 35 | 36 | # Adding dimension with the same name twice 37 | msg = "Adding dimension with the same name twice is not allowed" 38 | with pytest.raises(ValueError, match=msg): 39 | builder.add_dimension("x", 200) 40 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "image": "mcr.microsoft.com/devcontainers/python:1-3.13-bookworm", 3 | // Configure tool-specific properties. 
4 | "features": { 5 | "ghcr.io/va-h/devcontainers-features/uv:1": { "version": "0.8.17" } 6 | }, 7 | "customizations": { 8 | "vscode": { 9 | "settings": { 10 | "python.defaultInterpreterPath": "${containerWorkspaceFolder}/.venv/bin/python", 11 | "python.testing.pytestArgs": ["tests"], 12 | "python.testing.unittestEnabled": false, 13 | "python.testing.pytestEnabled": true 14 | }, 15 | "extensions": [ 16 | "ms-python.python", 17 | "ms-python.vscode-pylance", 18 | "ms-toolsai.jupyter", 19 | "ms-toolsai.jupyter-keymap", 20 | "ms-toolsai.jupyter-renderers", 21 | "vscode-icons-team.vscode-icons", 22 | "wayou.vscode-todo-highlight", 23 | "streetsidesoftware.code-spell-checker", 24 | "eamodio.gitlens", 25 | "visualstudioexptteam.vscodeintellicode", 26 | "richie5um2.vscode-sort-json" 27 | ] 28 | }, 29 | "jetbrains": { 30 | "plugins": ["com.koxudaxi.pydantic", "com.koxudaxi.ruff"] 31 | } 32 | }, 33 | "workspaceMount": "source=${localWorkspaceFolder},target=/workspaces/mdio-python,type=bind", 34 | "workspaceFolder": "/workspaces/mdio-python", 35 | // This installs the MDIO dev environment in the container. Put any customizations there. 36 | "postCreateCommand": "bash .devcontainer/post-create.sh", 37 | // Forward 8787 to enable us to view the dask dashboard 38 | "forwardPorts": [8787] 39 | // Add any mounts you want to use here. 40 | //"mounts": [ "source=${localWorkspaceFolder}/../DATA/,target=/DATA/,type=bind,consistency=cached" ] 41 | } 42 | -------------------------------------------------------------------------------- /src/mdio/constants.py: -------------------------------------------------------------------------------- 1 | """Constant values used across MDIO.""" 2 | 3 | from enum import IntEnum 4 | 5 | import numpy as np 6 | 7 | from mdio.builder.schemas.dtype import ScalarType 8 | 9 | 10 | class ZarrFormat(IntEnum): 11 | """Zarr version enum.""" 12 | 13 | V2 = 2 14 | V3 = 3 15 | 16 | 17 | FLOAT16_MAX = np.finfo("float16").max 18 | FLOAT16_MIN = np.finfo("float16").min 19 | 20 | FLOAT32_MAX = np.finfo("float32").max 21 | FLOAT32_MIN = np.finfo("float32").min 22 | 23 | FLOAT64_MIN = np.finfo("float64").min 24 | FLOAT64_MAX = np.finfo("float64").max 25 | 26 | INT8_MIN = np.iinfo("int8").min 27 | INT8_MAX = np.iinfo("int8").max 28 | 29 | INT16_MIN = np.iinfo("int16").min 30 | INT16_MAX = np.iinfo("int16").max 31 | 32 | INT32_MIN = np.iinfo("int32").min 33 | INT32_MAX = np.iinfo("int32").max 34 | 35 | INT64_MIN = np.iinfo("int64").min 36 | INT64_MAX = np.iinfo("int64").max 37 | 38 | UINT8_MIN = 0 39 | UINT8_MAX = np.iinfo("uint8").max 40 | 41 | UINT16_MIN = 0 42 | UINT16_MAX = np.iinfo("uint16").max 43 | 44 | UINT32_MIN = 0 45 | UINT32_MAX = np.iinfo("uint32").max 46 | 47 | UINT64_MIN = 0 48 | UINT64_MAX = np.iinfo("uint64").max 49 | 50 | # Zarr fill values for different scalar types 51 | fill_value_map = { 52 | ScalarType.BOOL: None, 53 | ScalarType.FLOAT16: np.nan, 54 | ScalarType.FLOAT32: np.nan, 55 | ScalarType.FLOAT64: np.nan, 56 | ScalarType.UINT8: UINT8_MAX, 57 | ScalarType.UINT16: UINT16_MAX, 58 | ScalarType.UINT32: UINT32_MAX, 59 | ScalarType.UINT64: UINT64_MAX, 60 | ScalarType.INT8: INT8_MAX, 61 | ScalarType.INT16: INT16_MAX, 62 | ScalarType.INT32: INT32_MAX, 63 | ScalarType.INT64: INT64_MAX, 64 | ScalarType.COMPLEX64: complex(np.nan, np.nan), 65 | ScalarType.COMPLEX128: complex(np.nan, np.nan), 66 | ScalarType.COMPLEX256: complex(np.nan, np.nan), 67 | ScalarType.BYTES240: b"\x00" * 240, 68 | } 69 | -------------------------------------------------------------------------------- 
/tests/integration/testing_helpers.py: -------------------------------------------------------------------------------- 1 | """This module provides testing helpers for integration testing.""" 2 | 3 | from collections.abc import Callable 4 | 5 | import numpy as np 6 | import xarray as xr 7 | from numpy.typing import DTypeLike 8 | 9 | 10 | def get_values(arr: xr.DataArray) -> np.ndarray: 11 | """Extract actual values from an Xarray DataArray.""" 12 | return arr.values 13 | 14 | 15 | def get_inline_header_values(dataset: xr.Dataset) -> np.ndarray: 16 | """Extract a specific header value from an Xarray DataArray.""" 17 | return dataset["inline"].values 18 | 19 | 20 | def validate_variable( # noqa PLR0913 21 | dataset: xr.Dataset, 22 | name: str, 23 | shape: tuple[int, ...], 24 | dims: tuple[str, ...], 25 | data_type: DTypeLike, 26 | expected_values: range | None, 27 | actual_value_generator: Callable[[xr.DataArray], np.ndarray] | None = None, 28 | ) -> None: 29 | """Validate the properties of a variable in an Xarray dataset.""" 30 | arr = dataset[name] 31 | assert shape == arr.shape 32 | assert set(dims) == set(arr.dims) 33 | if hasattr(data_type, "fields") and data_type.fields is not None: 34 | # The following assertion will fail because of differences in offsets 35 | # assert data_type == arr.dtype 36 | 37 | # Compare field names 38 | expected_names = list(data_type.names) 39 | actual_names = list(arr.dtype.names) 40 | assert expected_names == actual_names 41 | 42 | # Compare field types 43 | expected_types = [data_type[name] for name in data_type.names] 44 | actual_types = [arr.dtype[name] for name in arr.dtype.names] 45 | assert expected_types == actual_types 46 | else: 47 | assert data_type == arr.dtype 48 | 49 | if expected_values is not None and actual_value_generator is not None: 50 | actual_values = actual_value_generator(arr) 51 | assert np.array_equal(expected_values, actual_values) 52 | -------------------------------------------------------------------------------- /src/mdio/segy/_raw_trace_wrapper.py: -------------------------------------------------------------------------------- 1 | """Consumer-side utility to get both raw and transformed header data with single filesystem read.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import TYPE_CHECKING 6 | 7 | import numpy as np 8 | 9 | if TYPE_CHECKING: 10 | from numpy.typing import NDArray 11 | from segy import SegyFile 12 | 13 | 14 | class SegyFileRawTraceWrapper: 15 | def __init__(self, segy_file: SegyFile, indices: int | list[int] | NDArray | slice): 16 | self.segy_file = segy_file 17 | self.indices = indices 18 | 19 | self.idx = self.segy_file.trace.normalize_and_validate_query(self.indices) 20 | self.trace_buffer_array = self.segy_file.trace.fetch(self.idx, raw=True) 21 | 22 | self.trace_view = self.trace_buffer_array.view(self.segy_file.spec.trace.dtype) 23 | 24 | self.trace_decode_pipeline = self.segy_file.accessors.trace_decode_pipeline 25 | self.decoded_traces = None # decode later when not-raw header/sample is called 26 | 27 | def _ensure_decoded(self) -> None: 28 | """Apply trace decoding pipeline if not already done.""" 29 | if self.decoded_traces is not None: # already done 30 | return 31 | self.decoded_traces = self.trace_decode_pipeline.apply(self.trace_view.copy()) 32 | 33 | @property 34 | def raw_header(self) -> NDArray: 35 | """Get byte array view of the raw headers.""" 36 | header_itemsize = self.segy_file.spec.trace.header.itemsize # should be 240 37 | return 
self.trace_view.header.view(np.dtype((np.void, header_itemsize))) 38 | 39 | @property 40 | def header(self) -> NDArray: 41 | """Get decoded header.""" 42 | self._ensure_decoded() # decode when needed in-place to avoid copy. 43 | return self.decoded_traces.header 44 | 45 | @property 46 | def sample(self) -> NDArray: 47 | """Get decoded trace samples.""" 48 | self._ensure_decoded() # decode when needed in-place to avoid copy. 49 | return self.decoded_traces.sample 50 | -------------------------------------------------------------------------------- /src/mdio/segy/scalar.py: -------------------------------------------------------------------------------- 1 | """Utilities to read, parse, and apply coordinate scalars.""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | from typing import TYPE_CHECKING 7 | 8 | from segy.schema import SegyStandard 9 | from segy.standards.fields import trace as trace_header_fields 10 | 11 | if TYPE_CHECKING: 12 | from numpy.typing import NDArray 13 | from segy import SegyFile 14 | 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | COORD_SCALAR_KEY = trace_header_fields.Rev0.COORDINATE_SCALAR.model.name 20 | VALID_COORD_SCALAR = {1, 10, 100, 1000, 10000} 21 | SCALE_COORDINATE_KEYS = [ 22 | "cdp_x", 23 | "cdp_y", 24 | "source_coord_x", 25 | "source_coord_y", 26 | "group_coord_x", 27 | "group_coord_y", 28 | ] 29 | 30 | 31 | def _get_coordinate_scalar(segy_file: SegyFile) -> int: 32 | """Get and parse the coordinate scalar from the first SEG-Y trace header.""" 33 | file_revision = segy_file.spec.segy_standard 34 | first_header = segy_file.header[0] 35 | coord_scalar = int(first_header[COORD_SCALAR_KEY]) 36 | 37 | # Per Rev2, standardize 0 to 1 if a file is 2+. 38 | if coord_scalar == 0 and file_revision >= SegyStandard.REV2: 39 | logger.warning("Coordinate scalar is 0 and file is %s. Setting to 1.", file_revision) 40 | return 1 41 | 42 | def validate_segy_scalar(scalar: int) -> bool: 43 | """Validate if coord scalar matches the seg-y standard.""" 44 | logger.debug("Coordinate scalar is %s", scalar) 45 | return abs(scalar) in VALID_COORD_SCALAR # valid values 46 | 47 | is_valid = validate_segy_scalar(coord_scalar) 48 | if not is_valid: 49 | msg = f"Invalid coordinate scalar: {coord_scalar} for file revision {file_revision}." 50 | raise ValueError(msg) 51 | 52 | logger.info("Coordinate scalar is parsed as %s", coord_scalar) 53 | return coord_scalar 54 | 55 | 56 | def _apply_coordinate_scalar(data: NDArray, scalar: int) -> NDArray: 57 | if scalar < 0: 58 | scalar = 1 / scalar 59 | return data * abs(scalar) 60 | -------------------------------------------------------------------------------- /.github/labels.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Labels names are important as they are used by Release Drafter to decide 3 | # regarding where to record them in changelog or if to skip them. 4 | # 5 | # The repository labels will be automatically configured using this file and 6 | # the GitHub Action https://github.com/marketplace/actions/github-labeler. 
7 | - name: breaking 8 | description: Breaking Changes 9 | color: bfd4f2 10 | - name: bug 11 | description: Something isn't working 12 | color: d73a4a 13 | - name: build 14 | description: Build System and Dependencies 15 | color: bfdadc 16 | - name: ci 17 | description: Continuous Integration 18 | color: 4a97d6 19 | - name: dependencies 20 | description: Pull requests that update a dependency file 21 | color: 0366d6 22 | - name: documentation 23 | description: Improvements or additions to documentation 24 | color: 0075ca 25 | - name: duplicate 26 | description: This issue or pull request already exists 27 | color: cfd3d7 28 | - name: enhancement 29 | description: New feature or request 30 | color: a2eeef 31 | - name: github_actions 32 | description: Pull requests that update Github_actions code 33 | color: "000000" 34 | - name: good first issue 35 | description: Good for newcomers 36 | color: 7057ff 37 | - name: help wanted 38 | description: Extra attention is needed 39 | color: 008672 40 | - name: invalid 41 | description: This doesn't seem right 42 | color: e4e669 43 | - name: performance 44 | description: Performance 45 | color: "016175" 46 | - name: python 47 | description: Pull requests that update Python code 48 | color: 2b67c6 49 | - name: question 50 | description: Further information is requested 51 | color: d876e3 52 | - name: refactoring 53 | description: Refactoring 54 | color: ef67c4 55 | - name: removal 56 | description: Removals and Deprecations 57 | color: 9ae7ea 58 | - name: style 59 | description: Style 60 | color: c120e5 61 | - name: testing 62 | description: Testing 63 | color: b1fc6f 64 | - name: wontfix 65 | description: This will not be worked on 66 | color: ffffff 67 | -------------------------------------------------------------------------------- /tests/unit/test_dimension.py: -------------------------------------------------------------------------------- 1 | """Dimension tests.""" 2 | 3 | import pytest 4 | 5 | from mdio.core import Dimension 6 | from mdio.exceptions import ShapeError 7 | 8 | 9 | @pytest.fixture 10 | def my_dimension() -> Dimension: 11 | """Mock dimension.""" 12 | return Dimension(coords=range(10, 18, 2), name="dim_0") 13 | 14 | 15 | class TestDimension: 16 | """Basic tests for reading or manipulating dimensions.""" 17 | 18 | def test_len(self, my_dimension: Dimension) -> None: 19 | """Test length method.""" 20 | assert len(my_dimension) == 4 21 | 22 | @pytest.mark.parametrize(("index", "expected"), [(1, 12), (-1, 16), (2, 14)]) 23 | def test_getitem(self, my_dimension: Dimension, index: int, expected: int) -> None: 24 | """Test getter (integer indexing).""" 25 | assert my_dimension[index] == expected 26 | 27 | @pytest.mark.parametrize(("index", "expected"), [(1, 12), (-1, 16), (2, 14)]) 28 | def test_setitem(self, index: int, expected: int) -> None: 29 | """Test setter (integer indexing).""" 30 | other_dim = Dimension(coords=range(4), name="dim_6") 31 | other_dim[index] = expected 32 | assert other_dim[index] == expected 33 | 34 | def test_hash_equality(self, my_dimension: Dimension) -> None: 35 | """Test hashing (and equality checks).""" 36 | other_dim1 = Dimension(coords=range(10, 18, 2), name="dim_0") 37 | other_dim2 = Dimension(coords=range(15), name="dim_1") 38 | assert my_dimension == other_dim1 39 | assert my_dimension != other_dim2 40 | 41 | 42 | class TestExceptions: 43 | """Test custom exceptions and if they're raised properly.""" 44 | 45 | def test_shape_error(self) -> None: 46 | """Wrong shape.""" 47 | with pytest.raises(ShapeError): 
48 | Dimension(coords=[range(10, 18, 2)] * 2, name="dim_0") 49 | 50 | def test_wrong_type_equals(self, my_dimension: Dimension) -> None: 51 | """Wrong type.""" 52 | with pytest.raises(TypeError): 53 | assert my_dimension == ("not", "a", "Dimension") 54 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """Test configuration before everything runs.""" 2 | 3 | from __future__ import annotations 4 | 5 | import warnings 6 | from typing import TYPE_CHECKING 7 | from urllib.request import urlretrieve 8 | 9 | import pytest 10 | 11 | if TYPE_CHECKING: 12 | from pathlib import Path 13 | 14 | # Suppress Dask's chunk balancing warning 15 | warnings.filterwarnings( 16 | "ignore", 17 | message="Could not balance chunks to be equal", 18 | category=UserWarning, 19 | module="dask.array.rechunk", 20 | ) 21 | 22 | 23 | @pytest.fixture(scope="session") 24 | def fake_segy_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path: 25 | """Make a temp file for the fake SEG-Y files we are going to create.""" 26 | return tmp_path_factory.mktemp(r"fake_segy") 27 | 28 | 29 | @pytest.fixture(scope="session") 30 | def segy_input_uri() -> str: 31 | """Path to dome dataset for cloud testing.""" 32 | return "http://s3.amazonaws.com/teapot/filt_mig.sgy" 33 | 34 | 35 | @pytest.fixture(scope="session") 36 | def segy_input(segy_input_uri: str, tmp_path_factory: pytest.TempPathFactory) -> Path: 37 | """Download teapot dome dataset for testing.""" 38 | tmp_dir = tmp_path_factory.mktemp("segy") 39 | tmp_file = tmp_dir / "teapot.segy" 40 | urlretrieve(segy_input_uri, tmp_file) # noqa: S310 41 | return tmp_file 42 | 43 | 44 | @pytest.fixture(scope="module") 45 | def zarr_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path: 46 | """Make a temp file for the output MDIO.""" 47 | return tmp_path_factory.mktemp(r"mdio") 48 | 49 | 50 | @pytest.fixture(scope="module") 51 | def zarr_tmp2(tmp_path_factory: pytest.TempPathFactory) -> Path: # pragma: no cover - used by disabled test 52 | """Make a temp file for the output MDIO.""" 53 | return tmp_path_factory.mktemp(r"mdio2") 54 | 55 | 56 | @pytest.fixture(scope="session") 57 | def segy_export_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path: 58 | """Make a temp file for the round-trip IBM SEG-Y.""" 59 | tmp_dir = tmp_path_factory.mktemp("segy") 60 | return tmp_dir / "teapot_roundtrip.segy" 61 | -------------------------------------------------------------------------------- /src/mdio/exceptions.py: -------------------------------------------------------------------------------- 1 | """Custom exceptions related to MDIO functionality.""" 2 | 3 | from __future__ import annotations 4 | 5 | 6 | class MDIOError(Exception): 7 | """Base exceptions class.""" 8 | 9 | 10 | class ShapeError(MDIOError): 11 | """Raised when shapes of two or more things don't match. 12 | 13 | Args: 14 | message: Message to show with the exception. 15 | names: Names of the variables for the `message`. 16 | shapes: Shapes of the variables for the `message`. 
17 | """ 18 | 19 | def __init__( 20 | self, 21 | message: str, 22 | names: tuple[str, str] | None = None, 23 | shapes: tuple[int, int] | None = None, 24 | ): 25 | if names is not None and shapes is not None: 26 | shape_dict = zip(names, shapes, strict=True) 27 | extras = [f"{name}: {shape}" for name, shape in shape_dict] 28 | extras = " <> ".join(extras) 29 | 30 | message = f"{message} - {extras}" 31 | 32 | super().__init__(message) 33 | 34 | 35 | class WrongTypeError(MDIOError): 36 | """Raised when types of two or things don't match. 37 | 38 | Args: 39 | message: Message to show with the exception. 40 | name: String form of variable's type for the `message`. 41 | expected: String form of expected type for the `message`. 42 | """ 43 | 44 | def __init__(self, message: str, name: str = None, expected: str = None): 45 | if name is not None and expected is not None: 46 | extras = f"Got: {name} Expected: {expected}" 47 | message = f"{message} - {extras}" 48 | 49 | super().__init__(message) 50 | 51 | 52 | class InvalidMDIOError(MDIOError): 53 | """Raised when an invalid MDIO file is encountered.""" 54 | 55 | 56 | class MDIOAlreadyExistsError(MDIOError): 57 | """Raised when MDIO file already exists.""" 58 | 59 | 60 | class MDIONotFoundError(MDIOError): 61 | """Raised when MDIO file doesn't exist.""" 62 | 63 | 64 | class MDIOMissingVariableError(MDIOError): 65 | """Raised when a variable is missing from the MDIO dataset.""" 66 | -------------------------------------------------------------------------------- /src/mdio/core/config.py: -------------------------------------------------------------------------------- 1 | """Environment variable management for MDIO operations.""" 2 | 3 | from psutil import cpu_count 4 | from pydantic import Field 5 | from pydantic_settings import BaseSettings 6 | from pydantic_settings import SettingsConfigDict 7 | 8 | 9 | class MDIOSettings(BaseSettings): 10 | """MDIO environment configuration settings.""" 11 | 12 | # CPU configuration 13 | export_cpus: int = Field( 14 | default_factory=lambda: cpu_count(logical=True), 15 | description="Number of CPUs to use for export operations", 16 | alias="MDIO__EXPORT__CPU_COUNT", 17 | ) 18 | import_cpus: int = Field( 19 | default_factory=lambda: cpu_count(logical=True), 20 | description="Number of CPUs to use for import operations", 21 | alias="MDIO__IMPORT__CPU_COUNT", 22 | ) 23 | 24 | # Grid sparsity configuration 25 | grid_sparsity_ratio_warn: float = Field( 26 | default=2.0, 27 | description="Sparsity ratio threshold for warnings", 28 | alias="MDIO__GRID__SPARSITY_RATIO_WARN", 29 | ) 30 | grid_sparsity_ratio_limit: float = Field( 31 | default=10.0, 32 | description="Sparsity ratio threshold for errors", 33 | alias="MDIO__GRID__SPARSITY_RATIO_LIMIT", 34 | ) 35 | 36 | # Import configuration 37 | save_segy_file_header: bool = Field( 38 | default=False, 39 | description="Whether to save SEG-Y file headers", 40 | alias="MDIO__IMPORT__SAVE_SEGY_FILE_HEADER", 41 | ) 42 | raw_headers: bool = Field( 43 | default=False, 44 | description="Whether to preserve raw headers", 45 | alias="MDIO__IMPORT__RAW_HEADERS", 46 | ) 47 | cloud_native: bool = Field( 48 | default=False, 49 | description="Whether to use cloud-native mode for SEG-Y processing", 50 | alias="MDIO__IMPORT__CLOUD_NATIVE", 51 | ) 52 | 53 | # General configuration 54 | ignore_checks: bool = Field( 55 | default=False, 56 | description="Whether to ignore validation checks", 57 | alias="MDIO_IGNORE_CHECKS", 58 | ) 59 | 60 | model_config = SettingsConfigDict(case_sensitive=True) 
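# Illustrative usage (kept as comments so nothing runs at import time): the aliased
# environment variables are read when a settings object is constructed, e.g.
#
#     os.environ["MDIO__GRID__SPARSITY_RATIO_WARN"] = "3.5"
#     assert MDIOSettings().grid_sparsity_ratio_warn == 3.5
#
# Variables left unset fall back to the defaults declared on the fields above.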
61 | -------------------------------------------------------------------------------- /tests/unit/test_environment.py: -------------------------------------------------------------------------------- 1 | """Tests for the MDIO Environment API.""" 2 | 3 | import os 4 | from unittest.mock import patch 5 | 6 | import pytest 7 | 8 | from mdio.core.config import MDIOSettings 9 | 10 | 11 | class TestEnvironment: 12 | """Test the Environment API module functions.""" 13 | 14 | @pytest.mark.parametrize( 15 | ("env_var", "value", "property_name", "expected"), 16 | [ 17 | ("MDIO__EXPORT__CPU_COUNT", "8", "export_cpus", 8), 18 | ("MDIO__IMPORT__CPU_COUNT", "4", "import_cpus", 4), 19 | ("MDIO__GRID__SPARSITY_RATIO_WARN", "3.5", "grid_sparsity_ratio_warn", 3.5), 20 | ("MDIO__GRID__SPARSITY_RATIO_LIMIT", "15.0", "grid_sparsity_ratio_limit", 15.0), 21 | ], 22 | ) 23 | def test_env_var_overrides(self, env_var: str, value: str, property_name: str, expected: object) -> None: 24 | """Test environment variables override defaults.""" 25 | with patch.dict(os.environ, {env_var: value}): 26 | settings = MDIOSettings() 27 | result = getattr(settings, property_name) 28 | assert result == expected 29 | 30 | def test_environment_isolation(self) -> None: 31 | """Test that environment changes don't affect other tests.""" 32 | original_values = { 33 | "cpus": MDIOSettings().export_cpus, 34 | "ratio": MDIOSettings().grid_sparsity_ratio_warn, 35 | "bool": MDIOSettings().save_segy_file_header, 36 | } 37 | 38 | with patch.dict( 39 | os.environ, 40 | { 41 | "MDIO__EXPORT__CPU_COUNT": "99", 42 | "MDIO__GRID__SPARSITY_RATIO_WARN": "99.9", 43 | "MDIO__IMPORT__SAVE_SEGY_FILE_HEADER": "true", 44 | }, 45 | ): 46 | assert MDIOSettings().export_cpus == 99 47 | assert MDIOSettings().grid_sparsity_ratio_warn == 99.9 48 | assert MDIOSettings().save_segy_file_header is True 49 | 50 | # Values should be restored after context 51 | assert MDIOSettings().export_cpus == original_values["cpus"] 52 | assert MDIOSettings().grid_sparsity_ratio_warn == original_values["ratio"] 53 | assert MDIOSettings().save_segy_file_header == original_values["bool"] 54 | -------------------------------------------------------------------------------- /src/mdio/builder/schemas/v1/stats.py: -------------------------------------------------------------------------------- 1 | """Statistics schema for MDIO v1 arrays. 2 | 3 | This module provides two Histogram classes (CenteredBinHistogram and 4 | EdgeDefinedHistogram),a summary statistics class, and a summary statistics 5 | metadata class. 6 | 7 | SummaryStatistics: a class that represents the minimum summary statistics 8 | of an array consisting of count, sum, sum of squares, min, max, and a histogram. 9 | 10 | SummaryStatisticsMetadata: represents metadata for statistics, with a field 11 | for v1 of the stats. 12 | 13 | CenteredBinHistogram takes the center points of each bin in a histogram, 14 | while EdgeDefinedHistogram takes the left edges and widths of each bin. 15 | Both classes extend from the base class BaseHistogram, which represents 16 | a histogram with count of each bin. 
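Example:
    A minimal, illustrative construction (all numbers below are made up):

        hist = CenteredBinHistogram(counts=[2, 5, 3], bin_centers=[-1.0, 0.0, 1.0])
        stats = SummaryStatistics(
            count=10, sum=4.2, sum_squares=8.1, min=-1.5, max=2.0, histogram=hist
        )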
17 | """ 18 | 19 | from __future__ import annotations 20 | 21 | from typing import TypeAlias 22 | 23 | from pydantic import Field 24 | 25 | from mdio.builder.schemas.core import CamelCaseStrictModel 26 | 27 | 28 | class BaseHistogram(CamelCaseStrictModel): 29 | """Represents a histogram with bin counts.""" 30 | 31 | counts: list[int] = Field(..., description="Count of each bin.") 32 | 33 | 34 | class CenteredBinHistogram(BaseHistogram): 35 | """Class representing a center bin histogram.""" 36 | 37 | bin_centers: list[float | int] = Field(..., description="List of bin centers.") 38 | 39 | 40 | class EdgeDefinedHistogram(BaseHistogram): 41 | """A class representing an edge-defined histogram.""" 42 | 43 | bin_edges: list[float | int] = Field(..., description="The left edges of the histogram bins.") 44 | bin_widths: list[float | int] = Field(..., description="The widths of the histogram bins.") 45 | 46 | 47 | Histogram: TypeAlias = CenteredBinHistogram | EdgeDefinedHistogram 48 | 49 | 50 | class SummaryStatistics(CamelCaseStrictModel): 51 | """Data model for some statistics in MDIO v1 arrays.""" 52 | 53 | count: int = Field(..., description="The number of data points.") 54 | sum: float = Field(..., description="The total of all data values.") 55 | sum_squares: float = Field(..., description="The total of all data values squared.") 56 | min: float = Field(..., description="The smallest value in the variable.") 57 | max: float = Field(..., description="The largest value in the variable.") 58 | histogram: Histogram = Field(..., description="Binned frequency distribution.") 59 | -------------------------------------------------------------------------------- /docs/data_models/compressors.md: -------------------------------------------------------------------------------- 1 | ```{eval-rst} 2 | :tocdepth: 3 3 | ``` 4 | 5 | ```{currentModule} mdio.builder.schemas.compressors 6 | 7 | ``` 8 | 9 | # Compressors 10 | 11 | ```{article-info} 12 | :author: Altay Sansal 13 | :date: "{sub-ref}`today`" 14 | :read-time: "{sub-ref}`wordcount-minutes` min read" 15 | :class-container: sd-p-0 sd-outline-muted sd-rounded-3 sd-font-weight-light 16 | ``` 17 | 18 | ## Dataset Compression 19 | 20 | MDIO relies on [numcodecs] for data compression. We provide good defaults based 21 | on opinionated and limited heuristics for each compressor for various energy datasets. 22 | However, using these data models, the compression can be customized. 23 | 24 | [Numcodecs] is a project that provides a convenient interface to different compression 25 | libraries. We selected the [Blosc] and [ZFP] compressors for lossless and lossy 26 | compression of energy data. 27 | 28 | ## Blosc 29 | 30 | A high-performance compressor optimized for binary data, combining fast compression 31 | with a byte-shuffle filter for enhanced efficiency, particularly effective with 32 | numerical arrays in multi-threaded environments. 33 | 34 | For more details about compression modes, see [Blosc Documentation]. 35 | 36 | ```{eval-rst} 37 | .. autosummary:: 38 | Blosc 39 | ``` 40 | 41 | ## ZFP 42 | 43 | ZFP is a compression algorithm tailored for floating-point and integer arrays, offering 44 | lossy and lossless compression with customizable precision, well-suited for large 45 | scientific datasets with a focus on balancing data fidelity and compression ratio. 46 | 47 | For more details about compression modes, see [ZFP Documentation]. 48 | 49 | ```{eval-rst} 50 | ..
autosummary:: 51 | ZFP 52 | ``` 53 | 54 | [numcodecs]: https://github.com/zarr-developers/numcodecs 55 | [blosc]: https://github.com/Blosc/c-blosc 56 | [blosc documentation]: https://www.blosc.org/python-blosc/python-blosc.html 57 | [zfp]: https://github.com/LLNL/zfp 58 | [zfp documentation]: https://computing.llnl.gov/projects/zfp 59 | 60 | ## Model Reference 61 | 62 | ::: 63 | :::{dropdown} Blosc 64 | :animate: fade-in-slide-down 65 | 66 | ```{eval-rst} 67 | .. autopydantic_model:: Blosc 68 | ``` 69 | 70 | ::: 71 | 72 | :::{dropdown} ZFP 73 | :animate: fade-in-slide-down 74 | 75 | ```{eval-rst} 76 | .. autopydantic_model:: ZFP 77 | 78 | ---------- 79 | 80 | .. autoclass:: ZFPMode() 81 | :members: 82 | :undoc-members: 83 | :member-order: bysource 84 | ``` 85 | 86 | ::: 87 | -------------------------------------------------------------------------------- /src/mdio/commands/copy.py: -------------------------------------------------------------------------------- 1 | """MDIO Dataset copy command.""" 2 | 3 | from __future__ import annotations 4 | 5 | from click import argument 6 | from click import command 7 | from click import option 8 | from click_params import JSON 9 | 10 | 11 | @command(name="copy") 12 | @argument("source-mdio-path", type=str) 13 | @argument("target-mdio-path", type=str) 14 | @option( 15 | "-traces", 16 | "--with-traces", 17 | is_flag=True, 18 | help="Flag to also copy trace data to the target MDIO file", 19 | show_default=True, 20 | ) 21 | @option( 22 | "-headers", 23 | "--with-headers", 24 | is_flag=True, 25 | help="Flag to also copy headers to the target MDIO file", 26 | show_default=True, 27 | ) 28 | @option( 29 | "-storage-input", 30 | "--storage-options-input", 31 | required=False, 32 | help="Storage options for input MDIO file.", 33 | type=JSON, 34 | ) 35 | @option( 36 | "-storage-output", 37 | "--storage-options-output", 38 | required=False, 39 | help="Storage options for output MDIO file.", 40 | type=JSON, 41 | ) 42 | @option( 43 | "-overwrite", 44 | "--overwrite", 45 | is_flag=True, 46 | help="Flag to overwrite the MDIO file if it exists", 47 | show_default=True, 48 | ) 49 | def copy( # noqa: PLR0913 50 | source_mdio_path: str, 51 | target_mdio_path: str, 52 | with_traces: bool = False, 53 | with_headers: bool = False, 54 | storage_options_input: dict | None = None, 55 | storage_options_output: dict | None = None, 56 | overwrite: bool = False, 57 | ) -> None: 58 | """Copy an MDIO dataset to another MDIO dataset. 59 | 60 | This command copies an MDIO file from a source path to a target path, optionally including 61 | trace data, headers, or both, for all access patterns. It creates a new MDIO file at the target 62 | path with the same structure as the source, and selectively copies data based on the provided 63 | flags. The function supports custom storage options for both input and output, enabling 64 | compatibility with various filesystems via FSSpec.
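    Example:
        An illustrative invocation through the module entry point (paths are placeholders):

            python -m mdio copy input.mdio output.mdio -traces -headers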
65 | """ 66 | # Lazy import to reduce CLI startup time 67 | from mdio.api.convenience import copy_mdio # noqa: PLC0415 68 | 69 | copy_mdio( 70 | source_mdio_path, 71 | target_mdio_path, 72 | overwrite, 73 | with_traces, 74 | with_headers, 75 | storage_options_input, 76 | storage_options_output, 77 | ) 78 | 79 | 80 | cli = copy 81 | -------------------------------------------------------------------------------- /src/mdio/segy/exceptions.py: -------------------------------------------------------------------------------- 1 | """Custom exceptions for SEG-Y.""" 2 | 3 | from mdio.exceptions import MDIOError 4 | 5 | 6 | class GridOverrideInputError(MDIOError): 7 | """Raised when grid override parameters are not correct.""" 8 | 9 | 10 | class GridOverrideUnknownError(GridOverrideInputError): 11 | """Raised when grid override parameter is unknown. 12 | 13 | Args: 14 | command_name: Name of the unknown grid override parameter. 15 | """ 16 | 17 | def __init__(self, command_name: str): 18 | self.command_name = command_name 19 | self.message = f"Unknown grid override: {command_name}" 20 | super().__init__(self.message) 21 | 22 | 23 | class GridOverrideKeysError(GridOverrideInputError): 24 | """Raised when grid override is not compatible with required keys. 25 | 26 | Args: 27 | command_name: Name of the grid override command. 28 | required_keys: Set of required keys for the grid override. 29 | """ 30 | 31 | def __init__(self, command_name: str, required_keys: set[str]): 32 | self.command_name = command_name 33 | self.required_keys = required_keys 34 | self.message = f"{command_name} can only be used with {required_keys} keys." 35 | super().__init__(self.message) 36 | 37 | 38 | class GridOverrideMissingParameterError(GridOverrideInputError): 39 | """Raised when grid override parameters are not correct. 40 | 41 | Args: 42 | command_name: Name of the grid override command. 43 | missing_parameter: Set of missing parameters required by the command. 44 | """ 45 | 46 | def __init__(self, command_name: str, missing_parameter: set[str]): 47 | self.command_name = command_name 48 | self.missing_parameter = missing_parameter 49 | self.message = f"{command_name} requires {missing_parameter} parameter." 50 | super().__init__(self.message) 51 | 52 | 53 | class GridOverrideIncompatibleError(GridOverrideInputError): 54 | """Raised when two grid overrides are incompatible. 55 | 56 | Args: 57 | first_command: Name of the first grid override command. 58 | second_command: Name of the second grid override command. 59 | """ 60 | 61 | def __init__(self, first_command: str, second_command: str): 62 | self.first_command = first_command 63 | self.second_command = second_command 64 | self.message = f"{first_command} can't be used together with {second_command}." 
65 | super().__init__(self.message) 66 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | 9 | jobs: 10 | release: 11 | name: Release 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Check out the repository 15 | uses: actions/checkout@v5 16 | with: 17 | fetch-depth: 2 18 | 19 | - name: Install the pinned version of uv 20 | uses: astral-sh/setup-uv@v7 21 | with: 22 | python-version: 3.13 23 | working-directory: ${{ github.workspace }} 24 | 25 | - name: Install bumpversion 26 | run: | 27 | uv tool install --constraint=.github/workflows/constraints.txt bump-my-version 28 | bump-my-version --version 29 | 30 | - name: Check if there is a parent commit 31 | id: check-parent-commit 32 | run: | 33 | echo "sha=$(git rev-parse --verify --quiet HEAD^)" >> $GITHUB_OUTPUT 34 | 35 | - name: Detect and tag new version 36 | id: check-version 37 | if: steps.check-parent-commit.outputs.sha 38 | uses: salsify/action-detect-and-tag-new-version@v2.0.3 39 | with: 40 | version-command: | 41 | bump-my-version show current_version 42 | 43 | - name: Bump version for developmental release 44 | if: "! steps.check-version.outputs.tag" 45 | run: | 46 | bump-my-version bump patch && 47 | version=$(bump-my-version show current_version) && 48 | bump-my-version bump --new-version $version.dev$(date +%s) 49 | 50 | - name: Build package 51 | run: | 52 | uv build 53 | 54 | - name: Publish package on PyPI 55 | if: steps.check-version.outputs.tag 56 | uses: pypa/gh-action-pypi-publish@v1.13.0 57 | with: 58 | user: __token__ 59 | password: ${{ secrets.PYPI_TOKEN }} 60 | 61 | - name: Publish package on TestPyPI 62 | if: "! steps.check-version.outputs.tag" 63 | uses: pypa/gh-action-pypi-publish@v1.13.0 64 | with: 65 | user: __token__ 66 | password: ${{ secrets.TEST_PYPI_TOKEN }} 67 | repository_url: https://test.pypi.org/legacy/ 68 | 69 | - name: Publish the release notes 70 | uses: release-drafter/release-drafter@v6.1.0 71 | with: 72 | publish: ${{ steps.check-version.outputs.tag != '' }} 73 | tag: ${{ steps.check-version.outputs.tag }} 74 | env: 75 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 76 | -------------------------------------------------------------------------------- /src/mdio/core/dimension.py: -------------------------------------------------------------------------------- 1 | """Dimension (grid) abstraction and serializers.""" 2 | 3 | from __future__ import annotations 4 | 5 | from dataclasses import dataclass 6 | from typing import TYPE_CHECKING 7 | 8 | import numpy as np 9 | 10 | from mdio.exceptions import ShapeError 11 | 12 | if TYPE_CHECKING: 13 | from numpy.typing import NDArray 14 | 15 | 16 | @dataclass(eq=False, order=False, slots=True) 17 | class Dimension: 18 | """Dimension class. 19 | 20 | Dimension has a name and coordinates associated with it. The Dimension coordinates can only 21 | be a vector. 22 | 23 | Args: 24 | coords: Vector of coordinates. 25 | name: Name of the dimension. 26 | 27 | Attributes: 28 | coords: Vector of coordinates. 29 | name: Name of the dimension. 
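    Example:
        An illustrative construction (coordinate values are arbitrary):

            inline = Dimension(coords=range(10, 18, 2), name="inline")
            len(inline)  # 4
            inline[1]  # 12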
30 | """ 31 | 32 | coords: list | tuple | NDArray | range 33 | name: str 34 | 35 | def __post_init__(self) -> None: 36 | """Post process and validation.""" 37 | self.coords = np.asarray(self.coords) 38 | if self.coords.ndim != 1: 39 | msg = "Dimensions can only have vector coordinates" 40 | raise ShapeError(msg, ("# Dim", "Expected"), (self.coords.ndim, 1)) 41 | 42 | @property 43 | def size(self) -> int: 44 | """Size of the dimension.""" 45 | return len(self.coords) 46 | 47 | def __len__(self) -> int: 48 | """Length magic.""" 49 | return self.size 50 | 51 | def __getitem__(self, item: int | slice | list[int]) -> NDArray: 52 | """Gets a specific coordinate value by index.""" 53 | return self.coords[item] 54 | 55 | def __setitem__(self, key: int, value: NDArray) -> None: 56 | """Sets a specific coordinate value by index.""" 57 | self.coords[key] = value 58 | 59 | def __hash__(self) -> int: 60 | """Hashing magic.""" 61 | return hash(tuple(self.coords) + (self.name,)) 62 | 63 | def __eq__(self, other: Dimension) -> bool: 64 | """Compares if the dimension has same properties.""" 65 | if not isinstance(other, Dimension): 66 | other_type = type(other).__name__ 67 | msg = f"Can't compare Dimension with {other_type}" 68 | raise TypeError(msg) 69 | 70 | return hash(self) == hash(other) 71 | 72 | def min(self) -> NDArray[float]: 73 | """Get minimum value of dimension.""" 74 | return np.min(self.coords) 75 | 76 | def max(self) -> NDArray[float]: 77 | """Get maximum value of dimension.""" 78 | return np.max(self.coords) 79 | -------------------------------------------------------------------------------- /src/mdio/builder/schemas/v1/variable.py: -------------------------------------------------------------------------------- 1 | """This module defines `LabeledArray`, `Coordinate`, and `Variable`. 2 | 3 | `LabeledArray` is a basic array unit which includes basic properties like 4 | name, dimension, data type, compressor etc. 5 | 6 | `Coordinate` extends the `LabeledArray` class, it represents the Coordinate 7 | array in the MDIO format. It has dimensions which are fully defined and can hold 8 | additional metadata. 9 | 10 | `Variable` is another class that extends the `LabeledArray`. It represents a 11 | variable in MDIO format. It can have coordinates and can also hold metadata. 
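Example:
    A hedged sketch only; it assumes dimensions may be referenced by name as plain
    strings and that ``ScalarType`` comes from ``mdio.builder.schemas.dtype``:

        cdp_x = Coordinate(name="cdp_x", dimensions=["cdp"], data_type=ScalarType.FLOAT64)
        amplitude = Variable(
            name="amplitude",
            dimensions=["cdp", "time"],
            data_type=ScalarType.FLOAT32,
            coordinates=[cdp_x],
        )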
12 | """ 13 | 14 | from typing import Any 15 | 16 | from pydantic import Field 17 | 18 | from mdio.builder.schemas.base import NamedArray 19 | from mdio.builder.schemas.chunk_grid import RectilinearChunkGrid 20 | from mdio.builder.schemas.chunk_grid import RegularChunkGrid 21 | from mdio.builder.schemas.core import CamelCaseStrictModel 22 | from mdio.builder.schemas.dtype import ScalarType 23 | from mdio.builder.schemas.v1.stats import SummaryStatistics 24 | from mdio.builder.schemas.v1.units import AllUnitModel 25 | 26 | 27 | class CoordinateMetadata(CamelCaseStrictModel): 28 | """Reduced Metadata, perfect for simple Coordinates.""" 29 | 30 | units_v1: AllUnitModel | None = Field(default=None) 31 | attributes: dict[str, Any] | None = Field(default=None) 32 | 33 | 34 | class VariableMetadata(CoordinateMetadata): 35 | """Complete Metadata for Variables and complex or large Coordinates.""" 36 | 37 | chunk_grid: RegularChunkGrid | RectilinearChunkGrid | None = Field( 38 | default=None, 39 | description="Chunk grid specification for the array.", 40 | ) 41 | 42 | stats_v1: SummaryStatistics | list[SummaryStatistics] | None = Field( 43 | default=None, 44 | description="Minimal summary statistics.", 45 | ) 46 | 47 | 48 | class Coordinate(NamedArray): 49 | """A simple MDIO Coordinate array with metadata. 50 | 51 | For large or complex Coordinates, define a Variable instead. 52 | """ 53 | 54 | data_type: ScalarType = Field(..., description="Data type of Coordinate.") 55 | metadata: CoordinateMetadata | None = Field(default=None, description="Coordinate Metadata.") 56 | 57 | 58 | class Variable(NamedArray): 59 | """An MDIO Variable that has coordinates and metadata.""" 60 | 61 | coordinates: list[Coordinate] | list[str] | None = Field( 62 | default=None, 63 | description="Coordinates of the MDIO Variable dimensions.", 64 | ) 65 | metadata: VariableMetadata | None = Field(default=None, description="Variable Metadata.") 66 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | """Sphinx configuration.""" 2 | 3 | # -- Project information ----------------------------------------------------- 4 | 5 | project = "MDIO" 6 | author = "TGS" 7 | copyright = "2023, TGS" # noqa: A001 8 | 9 | # -- General configuration --------------------------------------------------- 10 | 11 | # Add any Sphinx extension module names here, as strings. They can be 12 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 13 | # ones. 14 | 15 | extensions = [ 16 | "sphinx.ext.autodoc", 17 | "sphinx.ext.napoleon", 18 | "sphinx.ext.intersphinx", 19 | "sphinx.ext.autosummary", 20 | "sphinxcontrib.autodoc_pydantic", 21 | "sphinx.ext.autosectionlabel", 22 | "sphinx_click", 23 | "sphinx_copybutton", 24 | "myst_nb", 25 | "sphinx_design", 26 | ] 27 | 28 | # List of patterns, relative to source directory, that match files and 29 | # directories to ignore when looking for source files. 30 | # This pattern also affects html_static_path and html_extra_path. 
31 | exclude_patterns = [ 32 | "_build", 33 | "Thumbs.db", 34 | "jupyter_execute", 35 | ".DS_Store", 36 | "**.ipynb_checkpoints", 37 | ] 38 | 39 | intersphinx_mapping = { 40 | "python": ("https://docs.python.org/3", None), 41 | "numpy": ("https://numpy.org/doc/stable/", None), 42 | "pydantic": ("https://docs.pydantic.dev/latest/", None), 43 | "zarr": ("https://zarr.readthedocs.io/en/stable/", None), 44 | } 45 | 46 | pygments_style = "vs" 47 | pygments_dark_style = "material" 48 | 49 | autodoc_typehints = "description" 50 | autodoc_typehints_format = "short" 51 | autodoc_member_order = "groupwise" 52 | autoclass_content = "class" 53 | autosectionlabel_prefix_document = True 54 | 55 | autodoc_pydantic_field_list_validators = False 56 | autodoc_pydantic_field_swap_name_and_alias = True 57 | autodoc_pydantic_field_show_alias = False 58 | autodoc_pydantic_model_show_config_summary = False 59 | autodoc_pydantic_model_show_validator_summary = False 60 | autodoc_pydantic_model_show_validator_members = False 61 | autodoc_pydantic_model_show_field_summary = False 62 | 63 | html_theme = "furo" 64 | 65 | myst_number_code_blocks = ["python"] 66 | myst_heading_anchors = 2 67 | myst_words_per_minute = 80 68 | myst_enable_extensions = [ 69 | "colon_fence", 70 | "linkify", 71 | "replacements", 72 | "smartquotes", 73 | "attrs_inline", 74 | ] 75 | 76 | # sphinx-copybutton configurations 77 | copybutton_prompt_text = r">>> |\.\.\. |\$ |In \[\d*\]: | {2,5}\.\.\.: | {5,8}: " 78 | copybutton_line_continuation_character = "\\" 79 | copybutton_prompt_is_regexp = True 80 | 81 | nb_execution_mode = "off" 82 | -------------------------------------------------------------------------------- /src/mdio/segy/parsers.py: -------------------------------------------------------------------------------- 1 | """Parsers for sections of SEG-Y files.""" 2 | 3 | from __future__ import annotations 4 | 5 | import multiprocessing as mp 6 | from concurrent.futures import ProcessPoolExecutor 7 | from itertools import repeat 8 | from math import ceil 9 | from typing import TYPE_CHECKING 10 | 11 | import numpy as np 12 | from tqdm.auto import tqdm 13 | 14 | from mdio.core.config import MDIOSettings 15 | from mdio.segy._workers import header_scan_worker 16 | 17 | if TYPE_CHECKING: 18 | from segy.arrays import HeaderArray 19 | 20 | from mdio.segy.file import SegyFileArguments 21 | 22 | 23 | def parse_headers( 24 | segy_file_kwargs: SegyFileArguments, 25 | num_traces: int, 26 | subset: tuple[str, ...] | None = None, 27 | block_size: int = 10000, 28 | progress_bar: bool = True, 29 | ) -> HeaderArray: 30 | """Read and parse given `byte_locations` from SEG-Y file. 31 | 32 | Args: 33 | segy_file_kwargs: SEG-Y file arguments. 34 | num_traces: Total number of traces in the SEG-Y file. 35 | subset: Tuple of header names to filter and keep. 36 | block_size: Number of traces to read for each block. 37 | progress_bar: Enable or disable progress bar. Default is True. 38 | 39 | Returns: 40 | HeaderArray. Keys are the index names, values are numpy arrays of parsed headers for the 41 | current block. Array is of type byte_type except IBM32 which is mapped to FLOAT32. 
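    Example:
        A hedged sketch (``segy_kwargs`` is a placeholder for the arguments used to open
        the SEG-Y file, and the header names are illustrative):

            headers = parse_headers(segy_kwargs, num_traces=250_000, subset=("inline", "crossline"))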
42 | """ 43 | settings = MDIOSettings() 44 | 45 | trace_count = num_traces 46 | n_blocks = int(ceil(trace_count / block_size)) 47 | 48 | trace_ranges = [] 49 | for idx in range(n_blocks): 50 | start, stop = idx * block_size, (idx + 1) * block_size 51 | stop = min(stop, trace_count) 52 | 53 | trace_ranges.append((start, stop)) 54 | 55 | num_workers = min(n_blocks, settings.import_cpus) 56 | 57 | tqdm_kw = {"unit": "block", "dynamic_ncols": True} 58 | # For Unix async writes with s3fs/fsspec & multiprocessing, use 'spawn' instead of default 59 | # 'fork' to avoid deadlocks on cloud stores. Slower but necessary. Default on Windows. 60 | context = mp.get_context("spawn") 61 | with ProcessPoolExecutor(num_workers, mp_context=context) as executor: 62 | lazy_work = executor.map(header_scan_worker, repeat(segy_file_kwargs), trace_ranges, repeat(subset)) 63 | 64 | if progress_bar is True: 65 | lazy_work = tqdm( 66 | iterable=lazy_work, 67 | total=n_blocks, 68 | desc="Scanning SEG-Y for geometry attributes", 69 | **tqdm_kw, 70 | ) 71 | 72 | # This executes the lazy work. 73 | headers: list[HeaderArray] = list(lazy_work) 74 | 75 | # Merge blocks before return 76 | return np.concatenate(headers) 77 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | debugging/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # ruff 133 | .ruff_cache/ 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | 138 | # pytype static type analyzer 139 | .pytype/ 140 | 141 | # Cython debug symbols 142 | cython_debug/ 143 | 144 | # IDE settings 145 | .vscode/ 146 | .idea/ 147 | 148 | # tests 149 | mdio1/* 150 | */mdio1/* 151 | pytest-of-* 152 | -------------------------------------------------------------------------------- /src/mdio/builder/templates/seismic_2d_streamer_shot.py: -------------------------------------------------------------------------------- 1 | """Seismic2DStreamerShotGathersTemplate MDIO v1 dataset templates.""" 2 | 3 | from typing import Any 4 | 5 | from mdio.builder.schemas import compressors 6 | from mdio.builder.schemas.dtype import ScalarType 7 | from mdio.builder.schemas.v1.variable import CoordinateMetadata 8 | from mdio.builder.templates.base import AbstractDatasetTemplate 9 | from mdio.builder.templates.types import SeismicDataDomain 10 | 11 | 12 | class Seismic2DStreamerShotGathersTemplate(AbstractDatasetTemplate): 13 | """Seismic Shot pre-stack 2D time or depth Dataset template.""" 14 | 15 | def __init__(self, data_domain: SeismicDataDomain = "time"): 16 | super().__init__(data_domain=data_domain) 17 | 18 | self._dim_names = ("shot_point", "channel", self._data_domain) 19 | self._physical_coord_names = ("source_coord_x", "source_coord_y", "group_coord_x", "group_coord_y") 20 | self._var_chunk_shape = (16, 32, 2048) 21 | 22 | @property 23 | def _name(self) -> str: 24 | return "StreamerShotGathers2D" 25 | 26 | def _load_dataset_attributes(self) -> dict[str, Any]: 27 | return {"surveyType": "2D", "gatherType": "common_source"} 28 | 29 | def _add_coordinates(self) -> None: 30 | # Add dimension coordinates 31 | for name in self._dim_names: 32 | self._builder.add_coordinate( 33 | name, 34 | dimensions=(name,), 35 | data_type=ScalarType.INT32, 36 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(name)), 37 | ) 38 | 39 | # Add non-dimension coordinates 40 | compressor = compressors.Blosc(cname=compressors.BloscCname.zstd) 41 | self._builder.add_coordinate( 42 | "source_coord_x", 43 | dimensions=("shot_point",), 44 | data_type=ScalarType.FLOAT64, 45 | compressor=compressor, 46 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("source_coord_x")), 47 | ) 48 | self._builder.add_coordinate( 49 | "source_coord_y", 50 | dimensions=("shot_point",), 51 | data_type=ScalarType.FLOAT64, 52 | compressor=compressor, 53 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("source_coord_y")), 54 | ) 55 | self._builder.add_coordinate( 56 | "group_coord_x", 57 | dimensions=("shot_point", "channel"), 58 | data_type=ScalarType.FLOAT64, 59 | compressor=compressor, 60 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("group_coord_x")), 61 | ) 62 | self._builder.add_coordinate( 63 | "group_coord_y", 64 | dimensions=("shot_point", "channel"), 65 | data_type=ScalarType.FLOAT64, 66 | compressor=compressor, 67 | 
metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("group_coord_y")), 68 | ) 69 | -------------------------------------------------------------------------------- /tests/test_main.py: -------------------------------------------------------------------------------- 1 | """Test cases for the __main__ module.""" 2 | 3 | import os 4 | from pathlib import Path 5 | 6 | import pytest 7 | from click.testing import CliRunner 8 | 9 | from mdio import __main__ 10 | 11 | 12 | @pytest.fixture 13 | def runner() -> CliRunner: 14 | """Fixture for invoking command-line interfaces.""" 15 | return CliRunner() 16 | 17 | 18 | # TODO(Altay): Redesign and implement the new v1 CLI 19 | # https://github.com/TGSAI/mdio-python/issues/646 20 | @pytest.mark.skip(reason="CLI hasn't been updated to work with v1 yet.") 21 | @pytest.mark.dependency 22 | def test_main_succeeds( 23 | runner: CliRunner, segy_input: Path, zarr_tmp: Path 24 | ) -> None: # pragma: no cover - test is skipped 25 | """It exits with a status code of zero.""" 26 | cli_args = ["segy", "import", str(segy_input), str(zarr_tmp)] 27 | cli_args.extend(["--header-locations", "181,185"]) 28 | cli_args.extend(["--header-names", "inline,crossline"]) 29 | 30 | result = runner.invoke(__main__.main, args=cli_args) 31 | assert result.exit_code == 0 32 | 33 | 34 | @pytest.mark.skip(reason="CLI hasn't been updated to work with v1 yet.") 35 | @pytest.mark.dependency(depends=["test_main_succeeds"]) 36 | def test_main_cloud( 37 | runner: CliRunner, segy_input_uri: str, zarr_tmp: Path 38 | ) -> None: # pragma: no cover - tests is skipped 39 | """It exits with a status code of zero.""" 40 | os.environ["MDIO__IMPORT__CLOUD_NATIVE"] = "true" 41 | cli_args = ["segy", "import", segy_input_uri, str(zarr_tmp)] 42 | cli_args.extend(["--header-locations", "181,185"]) 43 | cli_args.extend(["--header-names", "inline,crossline"]) 44 | cli_args.extend(["--overwrite"]) 45 | 46 | result = runner.invoke(__main__.main, args=cli_args) 47 | assert result.exit_code == 0 48 | 49 | 50 | @pytest.mark.skip(reason="CLI hasn't been updated to work with v1 yet.") 51 | @pytest.mark.dependency(depends=["test_main_succeeds"]) 52 | def test_main_info_succeeds(runner: CliRunner, zarr_tmp: Path) -> None: # pragma: no cover - tests is skipped 53 | """It exits with a status code of zero.""" 54 | cli_args = ["info"] 55 | cli_args.extend([str(zarr_tmp)]) 56 | 57 | result = runner.invoke(__main__.main, args=cli_args) 58 | assert result.exit_code == 0 59 | 60 | 61 | @pytest.mark.skip(reason="CLI hasn't been updated to work with v1 yet.") 62 | @pytest.mark.dependency(depends=["test_main_succeeds"]) 63 | def test_main_copy(runner: CliRunner, zarr_tmp: Path, zarr_tmp2: Path) -> None: # pragma: no cover - tests is skipped 64 | """It exits with a status code of zero.""" 65 | cli_args = ["copy", str(zarr_tmp), str(zarr_tmp2), "-headers", "-traces"] 66 | 67 | result = runner.invoke(__main__.main, args=cli_args) 68 | assert result.exit_code == 0 69 | 70 | 71 | def test_cli_version(runner: CliRunner) -> None: 72 | """Check if version prints without error.""" 73 | cli_args = ["--version"] 74 | result = runner.invoke(__main__.main, args=cli_args) 75 | assert result.exit_code == 0 76 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 
10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | pip-* 29 | tmp* 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | docs/jupyter_execute/ 75 | 76 | # PyBuilder 77 | .pybuilder/ 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | # For a library or package, you might want to ignore these files since the code is 89 | # intended to run in multiple environments; otherwise, check them in: 90 | # .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 100 | __pypackages__/ 101 | 102 | # Celery stuff 103 | celerybeat-schedule 104 | celerybeat.pid 105 | 106 | # SageMath parsed files 107 | *.sage.py 108 | 109 | # Environments 110 | .env 111 | .venv 112 | env/ 113 | venv/ 114 | ENV/ 115 | env.bak/ 116 | venv.bak/ 117 | venv*/ 118 | 119 | # Spyder project settings 120 | .spyderproject 121 | .spyproject 122 | 123 | # Rope project settings 124 | .ropeproject 125 | 126 | # mkdocs documentation 127 | /site 128 | 129 | # mypy 130 | .mypy_cache/ 131 | .dmypy.json 132 | dmypy.json 133 | 134 | # ruff 135 | .ruff_cache/ 136 | 137 | # Pyre type checker 138 | .pyre/ 139 | 140 | # pytype static type analyzer 141 | .pytype/ 142 | 143 | # Cython debug symbols 144 | cython_debug/ 145 | 146 | # IDE settings 147 | .vscode/ 148 | .idea/ 149 | 150 | # tests 151 | mdio1/* 152 | */mdio1/* 153 | pytest-of-* 154 | tmp 155 | debugging/* 156 | -------------------------------------------------------------------------------- /src/mdio/builder/templates/seismic_2d_cdp.py: -------------------------------------------------------------------------------- 1 | """Seismic2DCDPGathersTemplate MDIO v1 dataset templates.""" 2 | 3 | from typing import Any 4 | 5 | from mdio.builder.schemas.compressors import Blosc 6 | from mdio.builder.schemas.compressors import BloscCname 7 | from mdio.builder.schemas.dtype import ScalarType 8 | from mdio.builder.schemas.v1.variable import CoordinateMetadata 9 | from mdio.builder.templates.base import AbstractDatasetTemplate 10 | from mdio.builder.templates.types import CdpGatherDomain 11 | from mdio.builder.templates.types import SeismicDataDomain 12 | 13 | 14 | class Seismic2DCdpGathersTemplate(AbstractDatasetTemplate): 15 | """Seismic CDP pre-stack 2D time or depth Dataset template.""" 16 | 17 | def __init__(self, data_domain: SeismicDataDomain, gather_domain: CdpGatherDomain): 18 | super().__init__(data_domain=data_domain) 19 | self._gather_domain = gather_domain.lower() 20 | 21 | if self._gather_domain not in ["offset", "angle"]: 22 | msg = "gather_type must be 'offset' or 'angle'" 23 | raise ValueError(msg) 24 | 25 | self._dim_names = ("cdp", self._gather_domain, self._data_domain) 26 | self._physical_coord_names = ("cdp_x", "cdp_y") 27 | self._var_chunk_shape = (16, 64, 1024) 28 | 29 | @property 30 | def _name(self) -> str: 31 | gather_domain_suffix = self._gather_domain.capitalize() 32 | data_domain_suffix = self._data_domain.capitalize() 33 | return f"Cdp{gather_domain_suffix}Gathers2D{data_domain_suffix}" 34 | 35 | def _load_dataset_attributes(self) -> dict[str, Any]: 36 | return {"surveyType": "2D", "gatherType": "cdp"} 37 | 38 | def _add_coordinates(self) -> None: 39 | # Add dimension coordinates 40 | self._builder.add_coordinate( 41 | "cdp", 42 | dimensions=("cdp",), 43 | data_type=ScalarType.INT32, 44 | ) 45 | self._builder.add_coordinate( 46 | self._gather_domain, 47 | dimensions=(self._gather_domain,), 48 | data_type=ScalarType.INT32, 49 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(self._gather_domain)), 50 | ) 51 | self._builder.add_coordinate( 52 | self.trace_domain, 53 | dimensions=(self.trace_domain,), 54 | data_type=ScalarType.INT32, 55 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(self.trace_domain)), 56 | ) 57 | 58 | # Add non-dimension coordinates 59 | compressor = Blosc(cname=BloscCname.zstd) 60 | self._builder.add_coordinate( 61 | "cdp_x", 62 | dimensions=("cdp",), 63 | data_type=ScalarType.FLOAT64, 64 | compressor=compressor, 65 | 
metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("cdp_x")), 66 | ) 67 | self._builder.add_coordinate( 68 | "cdp_y", 69 | dimensions=("cdp",), 70 | data_type=ScalarType.FLOAT64, 71 | compressor=compressor, 72 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("cdp_y")), 73 | ) 74 | -------------------------------------------------------------------------------- /src/mdio/builder/templates/seismic_3d_coca.py: -------------------------------------------------------------------------------- 1 | """Seismic3DCocaTemplate MDIO v1 dataset templates.""" 2 | 3 | from typing import Any 4 | 5 | from mdio.builder.schemas import compressors 6 | from mdio.builder.schemas.dtype import ScalarType 7 | from mdio.builder.schemas.v1.variable import CoordinateMetadata 8 | from mdio.builder.templates.base import AbstractDatasetTemplate 9 | from mdio.builder.templates.types import SeismicDataDomain 10 | 11 | 12 | class Seismic3DCocaGathersTemplate(AbstractDatasetTemplate): 13 | """Seismic CoCA (common offset, common azimuth) pre-stack 3D Dataset template.""" 14 | 15 | def __init__(self, data_domain: SeismicDataDomain): 16 | super().__init__(data_domain=data_domain) 17 | 18 | self._dim_names = ("inline", "crossline", "offset", "azimuth", self._data_domain) 19 | self._physical_coord_names = ("cdp_x", "cdp_y") 20 | self._var_chunk_shape = (8, 8, 32, 1, 1024) 21 | 22 | @property 23 | def _name(self) -> str: 24 | return f"CocaGathers3D{self._data_domain.capitalize()}" 25 | 26 | def _load_dataset_attributes(self) -> dict[str, Any]: 27 | return {"surveyType": "3D", "gatherType": "common_offset_common_azimuth"} 28 | 29 | def _add_coordinates(self) -> None: 30 | # Add dimension coordinates 31 | self._builder.add_coordinate( 32 | "inline", 33 | dimensions=("inline",), 34 | data_type=ScalarType.INT32, 35 | ) 36 | self._builder.add_coordinate( 37 | "crossline", 38 | dimensions=("crossline",), 39 | data_type=ScalarType.INT32, 40 | ) 41 | self._builder.add_coordinate( 42 | "offset", 43 | dimensions=("offset",), 44 | data_type=ScalarType.INT32, 45 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("offset")), # same unit as X/Y 46 | ) 47 | self._builder.add_coordinate( 48 | "azimuth", 49 | dimensions=("azimuth",), 50 | data_type=ScalarType.FLOAT32, 51 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("azimuth")), 52 | ) 53 | self._builder.add_coordinate( 54 | self.trace_domain, 55 | dimensions=(self.trace_domain,), 56 | data_type=ScalarType.INT32, 57 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(self.trace_domain)), 58 | ) 59 | 60 | # Add non-dimension coordinates 61 | compressor = compressors.Blosc(cname=compressors.BloscCname.zstd) 62 | self._builder.add_coordinate( 63 | "cdp_x", 64 | dimensions=("inline", "crossline"), 65 | data_type=ScalarType.FLOAT64, 66 | compressor=compressor, 67 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("cdp_x")), 68 | ) 69 | self._builder.add_coordinate( 70 | "cdp_y", 71 | dimensions=("inline", "crossline"), 72 | data_type=ScalarType.FLOAT64, 73 | compressor=compressor, 74 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("cdp_y")), 75 | ) 76 | -------------------------------------------------------------------------------- /src/mdio/builder/schemas/v1/units.py: -------------------------------------------------------------------------------- 1 | """Unit schemas specific to MDIO v1.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import TypeAlias 6 | 7 | from pint import UnitRegistry 8 | 9 | from 
mdio.builder.schemas.units import UnitEnum 10 | from mdio.builder.schemas.units import create_unit_model 11 | 12 | ureg = UnitRegistry() 13 | ureg.formatter.default_format = "~C" # compact, abbreviated (symbol). 14 | 15 | 16 | class LengthUnitEnum(UnitEnum): 17 | """Enum class representing metric units of length.""" 18 | 19 | MILLIMETER = ureg.millimeter 20 | CENTIMETER = ureg.centimeter 21 | METER = ureg.meter 22 | KILOMETER = ureg.kilometer 23 | 24 | INCH = ureg.inch 25 | FOOT = ureg.foot 26 | YARD = ureg.yard 27 | MILE = ureg.mile 28 | 29 | 30 | LengthUnitModel = create_unit_model(LengthUnitEnum, "LengthUnitModel", "length", __name__) 31 | 32 | 33 | class TimeUnitEnum(UnitEnum): 34 | """Enum class representing units of time.""" 35 | 36 | NANOSECOND = ureg.nanosecond 37 | MICROSECOND = ureg.microsecond 38 | MILLISECOND = ureg.millisecond 39 | SECOND = ureg.second 40 | MINUTE = ureg.minute 41 | HOUR = ureg.hour 42 | DAY = ureg.day 43 | 44 | 45 | TimeUnitModel = create_unit_model(TimeUnitEnum, "TimeUnitModel", "time", __name__) 46 | 47 | 48 | class DensityUnitEnum(UnitEnum): 49 | """Enum class representing units of density.""" 50 | 51 | GRAMS_PER_CC = ureg.gram / ureg.centimeter**3 52 | KILOGRAMS_PER_M3 = ureg.kilogram / ureg.meter**3 53 | POUNDS_PER_GAL = ureg.pounds / ureg.gallon 54 | 55 | 56 | DensityUnitModel = create_unit_model(DensityUnitEnum, "DensityUnitModel", "density", __name__) 57 | 58 | 59 | class SpeedUnitEnum(UnitEnum): 60 | """Enum class representing units of speed.""" 61 | 62 | METERS_PER_SECOND = ureg.meter / ureg.second 63 | FEET_PER_SECOND = ureg.feet / ureg.second 64 | 65 | 66 | SpeedUnitModel = create_unit_model(SpeedUnitEnum, "SpeedUnitModel", "speed", __name__) 67 | 68 | 69 | class AngleUnitEnum(UnitEnum): 70 | """Enum class representing units of angle.""" 71 | 72 | DEGREES = ureg.degree 73 | RADIANS = ureg.radian 74 | 75 | 76 | AngleUnitModel = create_unit_model(AngleUnitEnum, "AngleUnitModel", "angle", __name__) 77 | 78 | 79 | class FrequencyUnitEnum(UnitEnum): 80 | """Enum class representing units of frequency.""" 81 | 82 | HERTZ = ureg.hertz 83 | 84 | 85 | FrequencyUnitModel = create_unit_model(FrequencyUnitEnum, "FrequencyUnitModel", "frequency", __name__) 86 | 87 | 88 | class VoltageUnitEnum(UnitEnum): 89 | """Enum class representing units of voltage.""" 90 | 91 | MICROVOLT = ureg.microvolt 92 | MILLIVOLT = ureg.millivolt 93 | VOLT = ureg.volt 94 | 95 | 96 | VoltageUnitModel = create_unit_model(VoltageUnitEnum, "VoltageUnitModel", "voltage", __name__) 97 | 98 | 99 | # Composite model types 100 | AllUnitModel: TypeAlias = ( 101 | LengthUnitModel 102 | | TimeUnitModel 103 | | AngleUnitModel 104 | | DensityUnitModel 105 | | SpeedUnitModel 106 | | FrequencyUnitModel 107 | | VoltageUnitModel 108 | ) 109 | -------------------------------------------------------------------------------- /src/mdio/builder/templates/seismic_3d_streamer_shot.py: -------------------------------------------------------------------------------- 1 | """Seismic3DStreamerShotGathersTemplate MDIO v1 dataset templates.""" 2 | 3 | from typing import Any 4 | 5 | from mdio.builder.schemas import compressors 6 | from mdio.builder.schemas.dtype import ScalarType 7 | from mdio.builder.schemas.v1.variable import CoordinateMetadata 8 | from mdio.builder.templates.base import AbstractDatasetTemplate 9 | from mdio.builder.templates.types import SeismicDataDomain 10 | 11 | 12 | class Seismic3DStreamerShotGathersTemplate(AbstractDatasetTemplate): 13 | """Seismic Shot pre-stack 3D time or depth Dataset 
template.""" 14 | 15 | def __init__(self, data_domain: SeismicDataDomain = "time"): 16 | super().__init__(data_domain=data_domain) 17 | 18 | self._dim_names = ("shot_point", "cable", "channel", self._data_domain) 19 | self._physical_coord_names = ("source_coord_x", "source_coord_y", "group_coord_x", "group_coord_y") 20 | self._logical_coord_names = ("gun",) 21 | self._var_chunk_shape = (8, 1, 128, 2048) 22 | 23 | @property 24 | def _name(self) -> str: 25 | return "StreamerShotGathers3D" 26 | 27 | def _load_dataset_attributes(self) -> dict[str, Any]: 28 | return {"surveyType": "3D", "gatherType": "common_source"} 29 | 30 | def _add_coordinates(self) -> None: 31 | # Add dimension coordinates 32 | for name in self._dim_names: 33 | self._builder.add_coordinate( 34 | name, 35 | dimensions=(name,), 36 | data_type=ScalarType.INT32, 37 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(name)), 38 | ) 39 | 40 | # Add non-dimension coordinates 41 | compressor = compressors.Blosc(cname=compressors.BloscCname.zstd) 42 | self._builder.add_coordinate( 43 | "gun", 44 | dimensions=("shot_point",), 45 | data_type=ScalarType.UINT8, 46 | compressor=compressor, 47 | ) 48 | self._builder.add_coordinate( 49 | "source_coord_x", 50 | dimensions=("shot_point",), 51 | data_type=ScalarType.FLOAT64, 52 | compressor=compressor, 53 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("source_coord_x")), 54 | ) 55 | self._builder.add_coordinate( 56 | "source_coord_y", 57 | dimensions=("shot_point",), 58 | data_type=ScalarType.FLOAT64, 59 | compressor=compressor, 60 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("source_coord_y")), 61 | ) 62 | self._builder.add_coordinate( 63 | "group_coord_x", 64 | dimensions=("shot_point", "cable", "channel"), 65 | data_type=ScalarType.FLOAT64, 66 | compressor=compressor, 67 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("group_coord_x")), 68 | ) 69 | self._builder.add_coordinate( 70 | "group_coord_y", 71 | dimensions=("shot_point", "cable", "channel"), 72 | data_type=ScalarType.FLOAT64, 73 | compressor=compressor, 74 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("group_coord_y")), 75 | ) 76 | -------------------------------------------------------------------------------- /src/mdio/converters/type_converter.py: -------------------------------------------------------------------------------- 1 | """A module for converting numpy dtypes to MDIO scalar and structured types.""" 2 | 3 | from numpy import dtype as np_dtype 4 | 5 | from mdio.builder.schemas.dtype import ScalarType 6 | from mdio.builder.schemas.dtype import StructuredField 7 | from mdio.builder.schemas.dtype import StructuredType 8 | 9 | 10 | def to_scalar_type(data_type: np_dtype) -> ScalarType: 11 | """Convert numpy dtype to MDIO ScalarType. 
12 | 13 | Out of the 24 built-in numpy scalar type objects 14 | (see https://numpy.org/doc/stable/reference/arrays.dtypes.html) 15 | this function supports only a limited subset: 16 | ScalarType.INT8 <-> int8 17 | ScalarType.INT16 <-> int16 18 | ScalarType.INT32 <-> int32 19 | ScalarType.INT64 <-> int64 20 | ScalarType.UINT8 <-> uint8 21 | ScalarType.UINT16 <-> uint16 22 | ScalarType.UINT32 <-> uint32 23 | ScalarType.UINT64 <-> uint64 24 | ScalarType.FLOAT32 <-> float32 25 | ScalarType.FLOAT64 <-> float64 26 | ScalarType.COMPLEX64 <-> complex64 27 | ScalarType.COMPLEX128 <-> complex128 28 | ScalarType.BOOL <-> bool 29 | 30 | Args: 31 | data_type: numpy dtype to convert 32 | 33 | Returns: 34 | ScalarType: corresponding MDIO scalar type 35 | 36 | Raises: 37 | ValueError: if dtype is not supported 38 | """ 39 | try: 40 | return ScalarType(data_type.name) 41 | except ValueError as exc: 42 | err = f"Unsupported numpy dtype '{data_type.name}' for conversion to ScalarType." 43 | raise ValueError(err) from exc 44 | 45 | 46 | def to_structured_type(data_type: np_dtype) -> StructuredType: 47 | """Convert numpy dtype to MDIO StructuredType. 48 | 49 | This function supports only a limited subset of structured types. 50 | In particular: 51 | It does not support nested structured types. 52 | It supports fields of only 13 out of 24 built-in numpy scalar types. 53 | (see `to_scalar_type` for details). 54 | 55 | Args: 56 | data_type: numpy dtype to convert 57 | 58 | Returns: 59 | StructuredType: corresponding MDIO structured type 60 | 61 | Raises: 62 | ValueError: if dtype is not structured or has no fields 63 | 64 | """ 65 | if data_type is None or len(data_type.names or []) == 0: 66 | err = "None or empty dtype provided, cannot convert to StructuredType." 67 | raise ValueError(err) 68 | 69 | fields = [] 70 | for field_name in data_type.names: 71 | field_dtype = data_type.fields[field_name][0] 72 | scalar_type = to_scalar_type(field_dtype) 73 | structured_field = StructuredField(name=field_name, format=scalar_type) 74 | fields.append(structured_field) 75 | return StructuredType(fields=fields) 76 | 77 | 78 | def to_numpy_dtype(data_type: ScalarType | StructuredType) -> np_dtype: 79 | """Get the numpy dtype for a variable.""" 80 | if isinstance(data_type, ScalarType): 81 | return np_dtype(data_type.value) 82 | if isinstance(data_type, StructuredType): 83 | return np_dtype([(f.name, f.format.value) for f in data_type.fields]) 84 | msg = f"Expected ScalarType or StructuredType, got '{type(data_type).__name__}'" 85 | raise ValueError(msg) 86 | -------------------------------------------------------------------------------- /docs/data_models/version_1.md: -------------------------------------------------------------------------------- 1 | ```{eval-rst} 2 | :tocdepth: 3 3 | ``` 4 | 5 | ```{currentModule} mdio.builder.schemas.v1.dataset 6 | 7 | ``` 8 | 9 | # MDIO v1 10 | 11 | ```{article-info} 12 | :author: Altay Sansal 13 | :date: "{sub-ref}`today`" 14 | :read-time: "{sub-ref}`wordcount-minutes` min read" 15 | :class-container: sd-p-0 sd-outline-muted sd-rounded-3 sd-font-weight-light 16 | ``` 17 | 18 | ## Intro 19 | 20 | ```{eval-rst} 21 | .. autosummary:: Dataset 22 | .. autosummary:: DatasetMetadata 23 | ``` 24 | 25 | ## Reference 26 | 27 | :::{dropdown} Dataset 28 | :open: 29 | 30 | ```{eval-rst} 31 | .. autopydantic_model:: Dataset 32 | :inherited-members: BaseModel 33 | 34 | .. 
autopydantic_model:: DatasetMetadata 35 | :inherited-members: BaseModel 36 | ``` 37 | 38 | ::: 39 | :::{dropdown} Variable 40 | 41 | ```{eval-rst} 42 | .. autopydantic_model:: mdio.builder.schemas.v1.variable.Variable 43 | :inherited-members: BaseModel 44 | 45 | .. autopydantic_model:: mdio.builder.schemas.v1.variable.Coordinate 46 | :inherited-members: BaseModel 47 | 48 | .. autopydantic_model:: mdio.builder.schemas.v1.variable.CoordinateMetadata 49 | :inherited-members: BaseModel 50 | 51 | .. autopydantic_model:: mdio.builder.schemas.v1.variable.VariableMetadata 52 | :inherited-members: BaseModel 53 | ``` 54 | 55 | ::: 56 | 57 | :::{dropdown} Units 58 | 59 | ```{eval-rst} 60 | .. automodule:: mdio.builder.schemas.v1.units 61 | :members: LengthUnitModel, 62 | TimeUnitModel, 63 | AngleUnitModel, 64 | DensityUnitModel, 65 | SpeedUnitModel, 66 | FrequencyUnitModel, 67 | VoltageUnitModel 68 | ``` 69 | 70 | ::: 71 | 72 | :::{dropdown} Stats 73 | 74 | ```{eval-rst} 75 | .. autopydantic_model:: mdio.builder.schemas.v1.stats.SummaryStatistics 76 | 77 | .. autopydantic_model:: mdio.builder.schemas.v1.stats.EdgeDefinedHistogram 78 | :inherited-members: BaseModel 79 | 80 | .. autopydantic_model:: mdio.builder.schemas.v1.stats.CenteredBinHistogram 81 | :inherited-members: BaseModel 82 | ``` 83 | 84 | ::: 85 | 86 | :::{dropdown} Enums 87 | 88 | ```{eval-rst} 89 | .. autoclass:: mdio.builder.schemas.v1.units.AngleUnitEnum() 90 | :members: 91 | :undoc-members: 92 | :member-order: bysource 93 | 94 | .. autoclass:: mdio.builder.schemas.v1.units.DensityUnitEnum() 95 | :members: 96 | :undoc-members: 97 | :member-order: bysource 98 | 99 | .. autoclass:: mdio.builder.schemas.v1.units.FrequencyUnitEnum() 100 | :members: 101 | :undoc-members: 102 | :member-order: bysource 103 | 104 | .. autoclass:: mdio.builder.schemas.v1.units.LengthUnitEnum() 105 | :members: 106 | :undoc-members: 107 | :member-order: bysource 108 | 109 | .. autoclass:: mdio.builder.schemas.v1.units.SpeedUnitEnum() 110 | :members: 111 | :undoc-members: 112 | :member-order: bysource 113 | 114 | .. autoclass:: mdio.builder.schemas.v1.units.TimeUnitEnum() 115 | :members: 116 | :undoc-members: 117 | :member-order: bysource 118 | 119 | .. 
autoclass:: mdio.builder.schemas.v1.units.VoltageUnitEnum() 120 | :members: 121 | :undoc-members: 122 | :member-order: bysource 123 | ``` 124 | 125 | ::: 126 | -------------------------------------------------------------------------------- /tests/unit/test_auto_chunking.py: -------------------------------------------------------------------------------- 1 | """Test live mask chunk size calculation.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import TYPE_CHECKING 6 | 7 | import numpy as np 8 | import pytest 9 | 10 | from mdio.core.utils_write import MAX_SIZE_LIVE_MASK 11 | from mdio.core.utils_write import get_constrained_chunksize 12 | from mdio.core.utils_write import get_live_mask_chunksize 13 | 14 | if TYPE_CHECKING: 15 | from numpy.typing import DTypeLike 16 | 17 | 18 | @pytest.mark.parametrize( 19 | ("shape", "dtype", "limit", "expected_chunks"), 20 | [ 21 | ((100,), "int8", 100, (100,)), # 1D full chunk 22 | ((8, 6), "int8", 20, (4, 4)), # 2D adjusted int8 23 | ((6, 8), "int16", 96, (6, 8)), # 2D small int16 24 | ((9, 6, 4), "int8", 100, (5, 5, 4)), # 3D adjusted 25 | ((4, 5), "int32", 4, (1, 1)), # test minimum edge case 26 | ((10, 10), "int8", 1000, (10, 10)), # big limit 27 | ((7, 5), "int8", 35, (7, 5)), # test full primes 28 | ((7, 5), "int8", 23, (4, 4)), # test adjusted primes 29 | ], 30 | ) 31 | @pytest.mark.filterwarnings("ignore:chunk size balancing not possible:UserWarning") 32 | def test_auto_chunking( 33 | shape: tuple[int, ...], 34 | dtype: DTypeLike, 35 | limit: int, 36 | expected_chunks: tuple[int, ...], 37 | ) -> None: 38 | """Test automatic chunking based on size limit and an array spec.""" 39 | result = get_constrained_chunksize(shape, dtype, limit) 40 | assert result == expected_chunks 41 | 42 | 43 | class TestAutoChunkLiveMask: 44 | """Test class for live mask auto chunking.""" 45 | 46 | @pytest.mark.parametrize( 47 | ("shape", "expected_chunks"), 48 | [ 49 | ((100,), (100,)), # small 1d 50 | ((100, 100), (100, 100)), # small 2d 51 | ((50000, 50000), (16667, 16667)), # large 2d 52 | ((1500, 1500, 1500), (750, 750, 750)), # large 3d 53 | ((1000, 1000, 100, 36), (250, 250, 100, 36)), # large 4d 54 | ], 55 | ) 56 | def test_auto_chunk_live_mask( 57 | self, 58 | shape: tuple[int, ...], 59 | expected_chunks: tuple[int, ...], 60 | ) -> None: 61 | """Test auto chunked live mask is within expected number of bytes.""" 62 | result = get_live_mask_chunksize(shape) 63 | assert result == expected_chunks 64 | 65 | @pytest.mark.parametrize( 66 | "shape", 67 | [ 68 | # Below are >250MiB. 
Smaller ones tested above 69 | (32768, 32768), 70 | (46341, 46341), 71 | (86341, 96341), 72 | (55000, 97500), 73 | (100000, 100000), 74 | (512, 216, 512, 400), 75 | (64, 128, 64, 32, 64), 76 | (512, 17, 43, 200, 50), 77 | ], 78 | ) 79 | @pytest.mark.filterwarnings("ignore:chunk size balancing not possible:UserWarning") 80 | def test_auto_chunk_live_mask_nbytes(self, shape: tuple[int, ...]) -> None: 81 | """Test auto chunked live mask is within expected number of bytes.""" 82 | result = get_live_mask_chunksize(shape) 83 | chunk_elements = np.prod(result) 84 | 85 | # We want them to be 250MB +/- 50% 86 | assert chunk_elements > MAX_SIZE_LIVE_MASK * 0.75 87 | assert chunk_elements < MAX_SIZE_LIVE_MASK * 1.25 88 | -------------------------------------------------------------------------------- /src/mdio/builder/templates/seismic_3d_cdp.py: -------------------------------------------------------------------------------- 1 | """Seismic3DCDPGathersTemplate MDIO v1 dataset templates.""" 2 | 3 | from typing import Any 4 | 5 | from mdio.builder.schemas.compressors import Blosc 6 | from mdio.builder.schemas.compressors import BloscCname 7 | from mdio.builder.schemas.dtype import ScalarType 8 | from mdio.builder.schemas.v1.variable import CoordinateMetadata 9 | from mdio.builder.templates.base import AbstractDatasetTemplate 10 | from mdio.builder.templates.types import CdpGatherDomain 11 | from mdio.builder.templates.types import SeismicDataDomain 12 | 13 | 14 | class Seismic3DCdpGathersTemplate(AbstractDatasetTemplate): 15 | """Seismic CDP pre-stack 3D gathers Dataset template.""" 16 | 17 | def __init__(self, data_domain: SeismicDataDomain, gather_domain: CdpGatherDomain): 18 | super().__init__(data_domain=data_domain) 19 | self._gather_domain = gather_domain.lower() 20 | 21 | if self._gather_domain not in ["offset", "angle"]: 22 | msg = "gather_type must be 'offset' or 'angle'" 23 | raise ValueError(msg) 24 | 25 | self._dim_names = ("inline", "crossline", self._gather_domain, self._data_domain) 26 | self._physical_coord_names = ("cdp_x", "cdp_y") 27 | self._var_chunk_shape = (8, 8, 32, 512) 28 | 29 | @property 30 | def _name(self) -> str: 31 | gather_domain_suffix = self._gather_domain.capitalize() 32 | data_domain_suffix = self._data_domain.capitalize() 33 | return f"Cdp{gather_domain_suffix}Gathers3D{data_domain_suffix}" 34 | 35 | def _load_dataset_attributes(self) -> dict[str, Any]: 36 | return {"surveyType": "3D", "gatherType": "cdp"} 37 | 38 | def _add_coordinates(self) -> None: 39 | # Add dimension coordinates 40 | self._builder.add_coordinate( 41 | "inline", 42 | dimensions=("inline",), 43 | data_type=ScalarType.INT32, 44 | ) 45 | self._builder.add_coordinate( 46 | "crossline", 47 | dimensions=("crossline",), 48 | data_type=ScalarType.INT32, 49 | ) 50 | self._builder.add_coordinate( 51 | self._gather_domain, 52 | dimensions=(self._gather_domain,), 53 | data_type=ScalarType.INT32, 54 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(self._gather_domain)), 55 | ) 56 | self._builder.add_coordinate( 57 | self.trace_domain, 58 | dimensions=(self.trace_domain,), 59 | data_type=ScalarType.INT32, 60 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(self.trace_domain)), 61 | ) 62 | 63 | # Add non-dimension coordinates 64 | compressor = Blosc(cname=BloscCname.zstd) 65 | self._builder.add_coordinate( 66 | "cdp_x", 67 | dimensions=("inline", "crossline"), 68 | data_type=ScalarType.FLOAT64, 69 | compressor=compressor, 70 | 
metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("cdp_x")), 71 | ) 72 | self._builder.add_coordinate( 73 | "cdp_y", 74 | dimensions=("inline", "crossline"), 75 | data_type=ScalarType.FLOAT64, 76 | compressor=compressor, 77 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("cdp_y")), 78 | ) 79 | -------------------------------------------------------------------------------- /docs/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | There are different ways to install MDIO: 4 | 5 | - Install the latest release via [`pip`](#using-pip-and-virtualenv) or [`conda`](#using-conda). 6 | - Build the package [from source](#building-from-source). 7 | 8 | ```{note} 9 | We strongly recommend using a virtual environment `venv` or `conda` 10 | to avoid potential conflicts with other Python packages. 11 | ``` 12 | 13 | ## Using `pip` and `virtualenv` 14 | 15 | Install the 64-bit version of Python 3 from https://www.python.org. 16 | 17 | Then we can create a `venv` and install _MDIO_. 18 | 19 | ```shell 20 | $ python -m venv mdio-venv 21 | $ mdio-venv/Scripts/activate 22 | $ pip install -U multidimio 23 | ``` 24 | 25 | To check if the installation was successful, see [checking installation](#checking-installation). 26 | 27 | You can also install some optional dependencies (extras) like this: 28 | 29 | ```shell 30 | $ pip install multidimio[distributed] 31 | $ pip install multidimio[cloud] 32 | $ pip install multidimio[lossy] 33 | ``` 34 | 35 | `distributed` installs [Dask][dask] for parallel, distributed processing.\ 36 | `cloud` installs [fsspec][fsspec] backed I/O libraries for [AWS' S3][s3fs], 37 | [Google's GCS][gcsfs], and [Azure ABS][adlfs].\ 38 | `lossy` will install the [ZFPY][zfp] library for lossy chunk compression. 39 | 40 | [dask]: https://www.dask.org/ 41 | [fsspec]: https://filesystem-spec.readthedocs.io/en/latest/ 42 | [s3fs]: https://s3fs.readthedocs.io/ 43 | [gcsfs]: https://gcsfs.readthedocs.io/ 44 | [adlfs]: https://github.com/fsspec/adlfs 45 | [zfp]: https://computing.llnl.gov/projects/zfp 46 | 47 | ## Using `conda` 48 | 49 | MDIO can also be installed in a `conda` environment. 50 | 51 | ```{note} 52 | _MDIO_ is hosted in the `conda-forge` channel. Make sure to always provide the 53 | `-c conda-forge` flag when running `conda install`, or else it won't be able to find 54 | the package. 55 | ``` 56 | 57 | We first run the following to create and activate an environment: 58 | 59 | ```shell 60 | $ conda create -n mdio-env 61 | $ conda activate mdio-env 62 | ``` 63 | 64 | Then we can install with `conda`: 65 | 66 | ```shell 67 | $ conda install -c conda-forge multidimio 68 | ``` 69 | 70 | The above command will install MDIO into your `conda` environment. 71 | 72 | ```{note} 73 | _MDIO_ extras must be installed separately when using `conda`. 74 | ``` 75 | 76 | ## Checking Installation 77 | 78 | After installing MDIO, run the following: 79 | 80 | ```shell 81 | $ python -c "import mdio; print(mdio.__version__)" 82 | ``` 83 | 84 | You should see the version of MDIO printed to the screen. 85 | 86 | ## Building from Source 87 | 88 | All dependencies of _MDIO_ are Python packages, so the build process is very simple. 89 | To install from source, we need to clone the repo first and then install locally via `pip`. 90 | 91 | ```shell 92 | $ git clone https://github.com/TGSAI/mdio-python.git 93 | $ cd mdio-python 94 | $ pip install .
95 | ``` 96 | 97 | We can also install the extras in a similar way, for example: 98 | 99 | ```shell 100 | $ pip install .[cloud] 101 | ``` 102 | 103 | If you want an editable version of _MDIO_, you can install it with the command below. 104 | This allows you to make code changes on the fly. 105 | 106 | ```shell 107 | $ pip install --editable .[cloud] 108 | ``` 109 | 110 | To check if the installation was successful, see [checking installation](#checking-installation). 111 | -------------------------------------------------------------------------------- /src/mdio/__main__.py: -------------------------------------------------------------------------------- 1 | """Command-line interface.""" 2 | 3 | from __future__ import annotations 4 | 5 | import importlib 6 | from importlib import metadata 7 | from pathlib import Path 8 | from typing import TYPE_CHECKING 9 | 10 | if TYPE_CHECKING: 11 | from collections.abc import Callable 12 | from typing import Any 13 | 14 | import click 15 | 16 | KNOWN_MODULES = ["segy.py", "copy.py", "info.py"] 17 | 18 | 19 | class MyCLI(click.Group): 20 | """CLI generator via plugin design pattern. 21 | 22 | This class dynamically loads command modules from the specified `plugin_folder`. If the 23 | command is another CLI group, the command module must define a `cli = click.Group(...)` and 24 | subsequent commands must be added to this CLI. If it is a single utility, it must have a 25 | variable named `cli` for the command to be exposed. 26 | 27 | Args: 28 | plugin_folder: Path to the directory containing command modules. 29 | *args: Variable length argument list passed to the click.Group. 30 | **kwargs: Arbitrary keyword arguments passed to the click.Group. 31 | """ 32 | 33 | def __init__(self, plugin_folder: Path, *args: Any, **kwargs: Any): # noqa: ANN401 34 | super().__init__(*args, **kwargs) 35 | self.plugin_folder = plugin_folder 36 | self.known_modules = KNOWN_MODULES 37 | 38 | def list_commands(self, _ctx: click.Context) -> list[str]: 39 | """List commands available under `commands` module.""" 40 | rv = [] 41 | for filename in self.plugin_folder.iterdir(): 42 | is_known = filename.name in self.known_modules 43 | is_python = filename.suffix == ".py" 44 | if is_known and is_python: 45 | rv.append(filename.stem) 46 | rv.sort() 47 | return rv 48 | 49 | def get_command(self, _ctx: click.Context, name: str) -> Callable | None: 50 | """Get command implementation from `commands` module.""" 51 | try: 52 | filepath = self.plugin_folder / f"{name}.py" 53 | if filepath.name not in self.known_modules: 54 | click.echo(f"Command {name} is not safe to execute.") 55 | return None 56 | 57 | module_name = f"mdio.commands.{name}" 58 | spec = importlib.util.spec_from_file_location(module_name, str(filepath)) 59 | if spec and spec.loader: 60 | module = importlib.util.module_from_spec(spec) 61 | spec.loader.exec_module(module) 62 | return module.cli 63 | except Exception as e: 64 | click.echo(f"Error loading command {name}: {e}") 65 | return None 66 | 67 | 68 | def get_package_version(package_name: str, default: str = "unknown") -> str: 69 | """Safely fetch the package version, providing a default if not found.""" 70 | try: 71 | return metadata.version(package_name) 72 | except metadata.PackageNotFoundError: 73 | return default 74 | 75 | 76 | @click.command(cls=MyCLI, plugin_folder=Path(__file__).parent / "commands") 77 | @click.version_option(get_package_version("multidimio")) 78 | def main() -> None: 79 | """Welcome to MDIO!
80 | 81 | MDIO is an open source, cloud-native, and scalable storage engine 82 | for various types of energy data. 83 | 84 | MDIO supports importing or exporting various data containers, 85 | hence we allow plugins as subcommands. 86 | 87 | From this main command, we can see the MDIO version. 88 | """ 89 | -------------------------------------------------------------------------------- /tests/unit/test_coordinate_scalar.py: -------------------------------------------------------------------------------- 1 | """Tests for coordinate scalar getters and apply functions.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import TYPE_CHECKING 6 | from unittest.mock import MagicMock 7 | 8 | import numpy as np 9 | import pytest 10 | from segy import SegyFile 11 | from segy.standards import SegyStandard 12 | from segy.standards.fields import trace as trace_header_fields 13 | 14 | from mdio.segy.scalar import _apply_coordinate_scalar 15 | from mdio.segy.scalar import _get_coordinate_scalar 16 | 17 | if TYPE_CHECKING: 18 | from numpy.typing import NDArray 19 | 20 | COORD_SCALAR_KEY = trace_header_fields.Rev0.COORDINATE_SCALAR.model.name 21 | 22 | 23 | @pytest.fixture 24 | def mock_segy_file() -> SegyFile: 25 | """Mock SegyFile object.""" 26 | segy_file = MagicMock(spec=SegyFile) 27 | segy_file.spec = MagicMock() 28 | segy_file.header = [MagicMock()] 29 | return segy_file 30 | 31 | 32 | @pytest.mark.parametrize("scalar", [1, 100, 10000, -10, -1000]) 33 | def test_get_coordinate_scalar_valid(mock_segy_file: SegyFile, scalar: int) -> None: 34 | """Test valid options when getting coordinate scalar.""" 35 | mock_segy_file.spec.segy_standard = SegyStandard.REV1 36 | mock_segy_file.header[0].__getitem__.return_value = scalar 37 | 38 | result = _get_coordinate_scalar(mock_segy_file) 39 | 40 | assert result == scalar 41 | 42 | 43 | @pytest.mark.parametrize( 44 | "revision", 45 | [SegyStandard.REV2, SegyStandard.REV21], 46 | ) 47 | def test_get_coordinate_scalar_zero_rev2_plus(mock_segy_file: SegyFile, revision: SegyStandard) -> None: 48 | """Test when scalar is normalized to 1 (from 0) in Rev2+.""" 49 | mock_segy_file.spec.segy_standard = revision 50 | mock_segy_file.header[0].__getitem__.return_value = 0 51 | 52 | result = _get_coordinate_scalar(mock_segy_file) 53 | 54 | assert result == 1 55 | 56 | 57 | @pytest.mark.parametrize( 58 | ("scalar", "revision", "error_msg"), 59 | [ 60 | (0, SegyStandard.REV0, "Invalid coordinate scalar: 0 for file revision SegyStandard.REV0."), 61 | (110, SegyStandard.REV1, "Invalid coordinate scalar: 110 for file revision SegyStandard.REV1."), 62 | (32768, SegyStandard.REV1, "Invalid coordinate scalar: 32768 for file revision SegyStandard.REV1."), 63 | ], 64 | ) 65 | def test_get_coordinate_scalar_invalid( 66 | mock_segy_file: SegyFile, scalar: int, revision: SegyStandard, error_msg: str 67 | ) -> None: 68 | """Test invalid options when getting coordinate scalar.""" 69 | mock_segy_file.spec.segy_standard = revision 70 | mock_segy_file.header[0].__getitem__.return_value = scalar 71 | 72 | with pytest.raises(ValueError, match=error_msg): 73 | _get_coordinate_scalar(mock_segy_file) 74 | 75 | 76 | @pytest.mark.parametrize( 77 | ("data", "scalar", "expected"), 78 | [ 79 | # POSITIVE 80 | (np.array([1, 2, 3]), 1, np.array([1, 2, 3])), 81 | (np.array([1, 2, 3]), 10, np.array([10, 20, 30])), 82 | (np.array([[1, 2], [3, 4]]), 1000, np.array([[1000, 2000], [3000, 4000]])), 83 | # NEGATIVE 84 | (np.array([1, 2, 3]), -1, np.array([1, 2, 3])), 85 | (np.array([10, 20, 30]), 
-10, np.array([1, 2, 3])), 86 | (np.array([[1000, 2000], [3000, 4000]]), -1000, np.array([[1, 2], [3, 4]])), 87 | ], 88 | ) 89 | def test_apply_coordinate_scalar(data: NDArray, scalar: int, expected: NDArray) -> None: 90 | """Test applying coordinate scalar with negative and positive code.""" 91 | result = _apply_coordinate_scalar(data, scalar) 92 | assert np.allclose(result, expected) 93 | -------------------------------------------------------------------------------- /docs/template_registry.md: -------------------------------------------------------------------------------- 1 | # Template Registry 2 | 3 | A simple, thread-safe place to discover and fetch dataset templates for MDIO. 4 | 5 | ## Why use it 6 | 7 | - One place to find all available templates 8 | - Safe to use across threads and the whole app (singleton) 9 | - Every fetch gives you your own editable copy (no side effects) 10 | - Comes preloaded with common seismic templates 11 | 12 | ```{note} 13 | Fetching a template with `get_template()` returns a deep copy. Editing it will not change the 14 | registry or anyone else’s copy. 15 | ``` 16 | 17 | ## Quick start 18 | 19 | ```python 20 | from mdio.builder.template_registry import get_template, list_templates 21 | 22 | # See what's available 23 | print(list_templates()) 24 | # e.g. ["Seismic2DPostStackTime", "Seismic3DPostStackDepth", ...] 25 | 26 | # Grab a template by name 27 | template = get_template("Seismic3DPostStackTime") 28 | 29 | # Customize your copy (safe) 30 | template.add_units({"amplitude": "unitless"}) 31 | ``` 32 | 33 | ## Common tasks 34 | 35 | ### Fetch a template you can edit 36 | 37 | ```python 38 | from mdio.builder.template_registry import get_template 39 | 40 | template = get_template("Seismic2DPostStackDepth") 41 | # Use/modify template freely — it’s your copy 42 | ``` 43 | 44 | ### List available templates 45 | 46 | ```python 47 | from mdio.builder.template_registry import list_templates 48 | 49 | names = list_templates() 50 | for name in names: 51 | print(name) 52 | ``` 53 | 54 | ### Check if a template exists 55 | 56 | ```python 57 | from mdio.builder.template_registry import is_template_registered 58 | 59 | if is_template_registered("Seismic3DPostStackTime"): 60 | ... # safe to fetch 61 | ``` 62 | 63 | ### Register your own template (optional) 64 | 65 | If you have a custom template class, register an instance so others can fetch it by name: 66 | 67 | ```python 68 | from typing import Any 69 | from mdio.builder.template_registry import register_template 70 | from mdio.builder.templates.base import AbstractDatasetTemplate 71 | from mdio.builder.templates.types import SeismicDataDomain 72 | 73 | 74 | class MyTemplate(AbstractDatasetTemplate): 75 | def __init__(self, domain: SeismicDataDomain = "time"): 76 | super().__init__(domain) 77 | 78 | @property 79 | def _name(self) -> str: 80 | # The public name becomes something like "MyTemplateTime" 81 | return f"MyTemplate{self._data_domain.capitalize()}" 82 | 83 | def _load_dataset_attributes(self) -> dict[str, Any]: 84 | return {"surveyType": "2D", "gatherType": "custom"} 85 | 86 | 87 | # Make it available globally 88 | registered_name = register_template(MyTemplate("time")) 89 | print(registered_name) # "MyTemplateTime" 90 | ``` 91 | 92 | ```{tip} 93 | Use `list_templates()` to discover the exact names to pass to `get_template()`. 94 | ``` 95 | 96 | ## Troubleshooting 97 | 98 | - KeyError: “Template 'XYZ' is not registered.” 99 | - The name is wrong or not registered yet. 
100 | - Call `list_templates()` to see valid names, or `is_template_registered(name)` to check first. 101 | 102 | ## FAQ 103 | 104 | - Do I need to create a TemplateRegistry instance? 105 | No. Use the global helpers: `get_template`, `list_templates`, `register_template`, and `is_template_registered`. 106 | - Are templates shared between callers or threads? 107 | No. Each `get_template()` call returns a deep-copied instance that is safe to modify independently. 108 | 109 | ## API reference 110 | 111 | ```{eval-rst} 112 | .. automodule:: mdio.builder.template_registry 113 | :members: 114 | ``` 115 | -------------------------------------------------------------------------------- /tests/unit/test_segy_spec_validation.py: -------------------------------------------------------------------------------- 1 | """Tests for SEG-Y spec validation against MDIO templates.""" 2 | 3 | from __future__ import annotations 4 | 5 | from unittest.mock import MagicMock 6 | 7 | import pytest 8 | from segy.schema import HeaderField 9 | from segy.standards import get_segy_standard 10 | 11 | from mdio.builder.templates.base import AbstractDatasetTemplate 12 | from mdio.converters.segy import _validate_spec_in_template 13 | 14 | 15 | class TestValidateSpecInTemplate: 16 | """Test cases for _validate_spec_in_template function.""" 17 | 18 | def test_validation_passes_with_all_required_fields(self) -> None: 19 | """Test that validation passes when all required fields are present.""" 20 | template = MagicMock(spec=AbstractDatasetTemplate) 21 | template.spatial_dimension_names = ("inline", "crossline") 22 | template.coordinate_names = ("cdp_x", "cdp_y") 23 | 24 | # Use base SEG-Y standard which includes coordinate_scalar at byte 71 25 | segy_spec = get_segy_standard(1.0) 26 | 27 | # Should not raise any exception 28 | _validate_spec_in_template(segy_spec, template) 29 | 30 | def test_validation_fails_with_missing_fields(self) -> None: 31 | """Test that validation fails when required fields are missing.""" 32 | # Template requiring custom fields not in standard spec 33 | template = MagicMock(spec=AbstractDatasetTemplate) 34 | template.name = "CustomTemplate" 35 | template.spatial_dimension_names = ("custom_dim1", "custom_dim2") 36 | template.coordinate_names = ("custom_coord_x", "custom_coord_y") 37 | 38 | # SegySpec with only one of the required custom fields 39 | spec = get_segy_standard(1.0) 40 | header_fields = [ 41 | HeaderField(name="custom_dim1", byte=189, format="int32"), 42 | ] 43 | segy_spec = spec.customize(trace_header_fields=header_fields) 44 | 45 | # Should raise ValueError listing the missing fields 46 | with pytest.raises(ValueError, match=r"Required fields.*not found in.*segy_spec") as exc_info: 47 | _validate_spec_in_template(segy_spec, template) 48 | 49 | error_message = str(exc_info.value) 50 | assert "custom_dim2" in error_message 51 | assert "custom_coord_x" in error_message 52 | assert "custom_coord_y" in error_message 53 | assert "CustomTemplate" in error_message 54 | 55 | def test_validation_fails_with_missing_coordinate_scalar(self) -> None: 56 | """Test that validation fails when coordinate_scalar is missing, even with all other fields.""" 57 | template = MagicMock(spec=AbstractDatasetTemplate) 58 | template.name = "TestTemplate" 59 | template.spatial_dimension_names = ("inline", "crossline") 60 | template.coordinate_names = ("cdp_x", "cdp_y") 61 | 62 | # Create SegySpec with all standard fields except coordinate_scalar 63 | spec = get_segy_standard(1.0) 64 | # Remove coordinate_scalar from the 
standard fields 65 | standard_fields = [field for field in spec.trace.header.fields if field.name != "coordinate_scalar"] 66 | standard_fields.append(HeaderField(name="not_coordinate_scalar", byte=71, format="int16")) 67 | segy_spec = spec.customize(trace_header_fields=standard_fields) 68 | 69 | # Should raise ValueError for missing coordinate_scalar 70 | with pytest.raises(ValueError, match=r"Required fields.*not found in.*segy_spec") as exc_info: 71 | _validate_spec_in_template(segy_spec, template) 72 | 73 | error_message = str(exc_info.value) 74 | assert "coordinate_scalar" in error_message 75 | assert "TestTemplate" in error_message 76 | -------------------------------------------------------------------------------- /tests/unit/test_type_converter.py: -------------------------------------------------------------------------------- 1 | """Unit tests for the type converter module.""" 2 | 3 | import numpy as np 4 | import pytest 5 | 6 | from mdio.builder.schemas.dtype import ScalarType 7 | from mdio.builder.schemas.dtype import StructuredField 8 | from mdio.builder.schemas.dtype import StructuredType 9 | from mdio.converters.type_converter import to_numpy_dtype 10 | from mdio.converters.type_converter import to_scalar_type 11 | from mdio.converters.type_converter import to_structured_type 12 | 13 | 14 | @pytest.fixture 15 | def supported_scalar_types_map() -> tuple[tuple[ScalarType, str], ...]: 16 | """Supported scalar types and their numpy equivalents.""" 17 | return ( 18 | (ScalarType.INT8, "int8"), 19 | (ScalarType.INT16, "int16"), 20 | (ScalarType.INT32, "int32"), 21 | (ScalarType.INT64, "int64"), 22 | (ScalarType.UINT8, "uint8"), 23 | (ScalarType.UINT16, "uint16"), 24 | (ScalarType.UINT32, "uint32"), 25 | (ScalarType.UINT64, "uint64"), 26 | (ScalarType.FLOAT32, "float32"), 27 | (ScalarType.FLOAT64, "float64"), 28 | (ScalarType.COMPLEX64, "complex64"), 29 | (ScalarType.COMPLEX128, "complex128"), 30 | (ScalarType.BOOL, "bool"), 31 | ) 32 | 33 | 34 | @pytest.fixture 35 | def a_structured_type() -> StructuredType: 36 | """Sample structured type. 37 | 38 | Returns a structured type. 
39 | """ 40 | return StructuredType( 41 | fields=[ 42 | StructuredField(name="x", format=ScalarType.FLOAT64), 43 | StructuredField(name="y", format=ScalarType.FLOAT64), 44 | StructuredField(name="z", format=ScalarType.FLOAT64), 45 | StructuredField(name="id", format=ScalarType.INT32), 46 | StructuredField(name="valid", format=ScalarType.BOOL), 47 | ] 48 | ) 49 | 50 | 51 | def test_to_numpy_dtype(supported_scalar_types_map: tuple[ScalarType, str], a_structured_type: StructuredType) -> None: 52 | """Comprehensive test for to_numpy_dtype function.""" 53 | # Test 0: invalid input 54 | err = "Expected ScalarType or StructuredType, got 'str'" 55 | with pytest.raises(ValueError, match=err): 56 | to_numpy_dtype("parameter of invalid type") 57 | 58 | # Test 1: ScalarType cases - all supported scalar types 59 | for scalar_type, expected_numpy_type in supported_scalar_types_map: 60 | result = to_numpy_dtype(scalar_type) 61 | expected = np.dtype(expected_numpy_type) 62 | assert result == expected 63 | assert isinstance(result, np.dtype) 64 | assert result.name == expected.name 65 | 66 | # Test 2: StructuredType with multiple fields 67 | result_multi = to_numpy_dtype(a_structured_type) 68 | expected_multi = np.dtype( 69 | [("x", "float64"), ("y", "float64"), ("z", "float64"), ("id", "int32"), ("valid", "bool")] 70 | ) 71 | 72 | assert result_multi == expected_multi 73 | assert isinstance(result_multi, np.dtype) 74 | assert len(result_multi.names) == 5 75 | assert set(result_multi.names) == {"x", "y", "z", "id", "valid"} 76 | 77 | 78 | def test_to_scalar_type(supported_scalar_types_map: tuple[ScalarType, str]) -> None: 79 | """Test for to_scalar_type function.""" 80 | for expected_mdio_type, numpy_type in supported_scalar_types_map: 81 | result = to_scalar_type(np.dtype(numpy_type)) 82 | assert result == expected_mdio_type 83 | 84 | 85 | def test_to_structured_type(a_structured_type: StructuredType) -> None: 86 | """Test for to_structured_type function.""" 87 | dtype = np.dtype([("x", "float64"), ("y", "float64"), ("z", "float64"), ("id", "int32"), ("valid", "bool")]) 88 | assert a_structured_type == to_structured_type(dtype) 89 | 90 | dtype = np.dtype([("x", " int: 49 | """Return the integer code of ZFP mode.""" 50 | return zfp_mode_map[self.value] 51 | 52 | 53 | class ZFP(CamelCaseStrictModel): 54 | """Data Model for ZFP options.""" 55 | 56 | name: str = Field(default="zfp", description="Name of the compressor.") 57 | mode: ZFPMode = Field() 58 | 59 | tolerance: float | None = Field( 60 | default=None, 61 | description="Fixed accuracy in terms of absolute error tolerance.", 62 | ) 63 | 64 | rate: float | None = Field( 65 | default=None, 66 | description="Fixed rate in terms of number of compressed bits per value.", 67 | ) 68 | 69 | precision: int | None = Field( 70 | default=None, 71 | description="Fixed precision in terms of number of uncompressed bits per value.", 72 | ) 73 | 74 | @model_validator(mode="after") 75 | def check_requirements(self) -> ZFP: 76 | """Check if ZFP parameters make sense.""" 77 | mode = self.mode 78 | 79 | # Check if reversible mode is provided without other parameters. 
80 | if mode == ZFPMode.REVERSIBLE and any( 81 | getattr(self, key) is not None for key in ["tolerance", "rate", "precision"] 82 | ): 83 | msg = "Other fields must be None in REVERSIBLE mode" 84 | raise ValueError(msg) 85 | 86 | if mode == ZFPMode.FIXED_ACCURACY and self.tolerance is None: 87 | msg = "Tolerance required for FIXED_ACCURACY mode" 88 | raise ValueError(msg) 89 | 90 | if mode == ZFPMode.FIXED_RATE and self.rate is None: 91 | msg = "Rate required for FIXED_RATE mode" 92 | raise ValueError(msg) 93 | 94 | if mode == ZFPMode.FIXED_PRECISION and self.precision is None: 95 | msg = "Precision required for FIXED_PRECISION mode" 96 | raise ValueError(msg) 97 | 98 | return self 99 | 100 | 101 | class CompressorModel(CamelCaseStrictModel): 102 | """Model representing compressor configuration.""" 103 | 104 | compressor: Blosc | ZFP | None = Field(default=None, description="Compression settings.") 105 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributor Guide 2 | 3 | Thank you for your interest in improving this project. 4 | This project is open-source under the [Apache 2.0 license] and 5 | welcomes contributions in the form of bug reports, feature requests, and pull requests. 6 | 7 | Here is a list of important resources for contributors: 8 | 9 | - [Source Code] 10 | - [Documentation] 11 | - [Issue Tracker] 12 | - [Code of Conduct] 13 | 14 | [apache 2.0 license]: https://opensource.org/licenses/Apache-2.0 15 | [source code]: https://github.com/TGSAI/mdio-python 16 | [documentation]: https://mdio-python.readthedocs.io/ 17 | [issue tracker]: https://github.com/TGSAI/mdio-python/issues 18 | 19 | ## How to report a bug 20 | 21 | Report bugs on the [Issue Tracker]. 22 | 23 | When filing an issue, make sure to answer these questions: 24 | 25 | - Which operating system and Python version are you using? 26 | - Which version of this project are you using? 27 | - What did you do? 28 | - What did you expect to see? 29 | - What did you see instead? 30 | 31 | The best way to get your bug fixed is to provide a test case, 32 | and/or steps to reproduce the issue. 33 | 34 | ## How to request a feature 35 | 36 | Request features on the [Issue Tracker]. 37 | 38 | ## How to set up your development environment 39 | 40 | You need Python 3.11+ and the following tools (an example of installing them is shown at the end of this section): 41 | 42 | - [uv] 43 | - [Nox] 44 | 45 | Alternatively, a [Development Container] has been set up to provide 46 | an environment with the required dependencies. This facilitates development on 47 | different systems. 48 | 49 | This should seamlessly enable development for users of [VS Code] on systems with Docker installed. 50 | 51 | ### Known Issues: 52 | 53 | - `git config --global --add safe.directory $(pwd)` might be needed inside the container.
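If you are not using the development container, one possible way to get these tools locally is shown below. This is only an example; any installation method from the [uv] and [Nox] documentation works equally well (the second command mirrors how the CI workflow in `.github/workflows/tests.yml` installs Nox).

```console
$ curl -LsSf https://astral.sh/uv/install.sh | sh
$ uv tool install nox
```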
54 | 55 | ## How to Install and Run MDIO 56 | 57 | Install the package with development and documentation generation requirements: 58 | 59 | ```console 60 | $ uv sync --all-groups 61 | ``` 62 | 63 | You can now run an interactive Python session, 64 | or the command-line interface: 65 | 66 | ```console 67 | $ uv run python 68 | $ uv run mdio 69 | ``` 70 | 71 | [uv]: https://docs.astral.sh/uv/ 72 | [nox]: https://nox.thea.codes/ 73 | [development container]: https://containers.dev/ 74 | [vs code]: https://code.visualstudio.com/docs/devcontainers/containers/ 75 | 76 | ## How to test the project 77 | 78 | Run the full test suite: 79 | 80 | ```console 81 | $ nox 82 | ``` 83 | 84 | List the available Nox sessions: 85 | 86 | ```console 87 | $ nox --list-sessions 88 | ``` 89 | 90 | You can also run a specific Nox session. 91 | For example, invoke the unit test suite like this: 92 | 93 | ```console 94 | $ nox --session=tests 95 | ``` 96 | 97 | Unit tests are located in the _tests_ directory, 98 | and are written using the [pytest] testing framework. 99 | 100 | [pytest]: https://pytest.readthedocs.io/ 101 | 102 | ## How to submit changes 103 | 104 | Open a [pull request] to submit changes to this project. 105 | 106 | Your pull request needs to meet the following guidelines for acceptance: 107 | 108 | - The Nox test suite must pass without errors and warnings. 109 | - Include unit tests. This project currently maintains 90%+ code coverage. 110 | - If your changes add functionality, update the documentation accordingly. 111 | 112 | Feel free to submit early, though—we can always iterate on this. 113 | 114 | To run linting and code formatting checks before committing your change, you can install pre-commit as a Git hook by running the following command: 115 | 116 | ```console 117 | $ nox --session=pre-commit -- install 118 | ``` 119 | 120 | It is recommended to open an issue before starting work on anything. 121 | This will allow a chance to talk it over with the owners and validate your approach. 
122 | 123 | [pull request]: https://github.com/TGSAI/mdio-python/pulls 124 | 125 | 126 | 127 | [code of conduct]: CODE_OF_CONDUCT.md 128 | -------------------------------------------------------------------------------- /tests/unit/v1/templates/test_seismic_templates.py: -------------------------------------------------------------------------------- 1 | """Unit tests for concrete seismic dataset template implementations.""" 2 | 3 | import pytest 4 | 5 | from mdio.builder.template_registry import TemplateRegistry 6 | from mdio.builder.templates.base import AbstractDatasetTemplate 7 | from mdio.builder.templates.seismic_2d_poststack import Seismic2DPostStackTemplate 8 | 9 | 10 | class TestSeismicTemplates: 11 | """Test cases for Seismic2DPostStackTemplate.""" 12 | 13 | def test_chunk_shape_assignment(self) -> None: 14 | """Test that chunk shape is assigned correctly.""" 15 | template = Seismic2DPostStackTemplate("time") 16 | template.build_dataset("test", (50, 50)) 17 | template.full_chunk_shape = (32, 32) 18 | 19 | assert template._var_chunk_shape == (32, 32) 20 | 21 | def test_chunk_shape_assignment_exception(self) -> None: 22 | """Test that chunk shape assignment raises exception for invalid dimensions.""" 23 | template = Seismic2DPostStackTemplate("time") 24 | template.build_dataset("test", (50, 50)) 25 | 26 | with pytest.raises(ValueError, match="Chunk shape.*has.*dimensions, expected"): 27 | template.full_chunk_shape = (32, 32, 32) 28 | 29 | def test_chunk_shape_with_minus_one_before_build(self) -> None: 30 | """Test that chunk shape can be set with -1 before build_dataset.""" 31 | template = Seismic2DPostStackTemplate("time") 32 | 33 | # Should be able to set chunk shape with -1 before build_dataset 34 | template.full_chunk_shape = (32, -1) 35 | 36 | # Before build_dataset, getter should return unexpanded values 37 | assert template.full_chunk_shape == (32, -1) 38 | assert template._var_chunk_shape == (32, -1) 39 | 40 | def test_chunk_shape_with_minus_one_after_build(self) -> None: 41 | """Test that -1 values are expanded after build_dataset.""" 42 | template = Seismic2DPostStackTemplate("time") 43 | template.full_chunk_shape = (32, -1) 44 | 45 | # Build dataset with specific sizes 46 | template.build_dataset("test", (100, 200)) 47 | 48 | # After build_dataset, getter should expand -1 to dimension size 49 | assert template.full_chunk_shape == (32, 200) 50 | assert template._var_chunk_shape == (32, -1) # Internal storage unchanged 51 | 52 | def test_chunk_shape_validation_invalid_values(self) -> None: 53 | """Test that chunk shape setter rejects invalid values.""" 54 | template = Seismic2DPostStackTemplate("time") 55 | template.build_dataset("test", (50, 50)) 56 | 57 | # Test rejection of 0 58 | with pytest.raises(ValueError, match="Chunk size must be positive integer or -1"): 59 | template.full_chunk_shape = (32, 0) 60 | 61 | # Test rejection of negative values other than -1 62 | with pytest.raises(ValueError, match="Chunk size must be positive integer or -1"): 63 | template.full_chunk_shape = (32, -2) 64 | 65 | # Test that positive values and -1 are accepted 66 | template.full_chunk_shape = (32, -1) # Should not raise 67 | template.full_chunk_shape = (32, 16) # Should not raise 68 | 69 | def test_all_templates_inherit_from_abstract(self) -> None: 70 | """Test that all concrete templates inherit from AbstractDatasetTemplate.""" 71 | registry = TemplateRegistry() 72 | template_names = registry.list_all_templates() 73 | 74 | for template_name in template_names: 75 | template = 
registry.get(template_name) 76 | assert isinstance(template, AbstractDatasetTemplate) 77 | # That each template has the required properties and methods 78 | assert hasattr(template, "name") 79 | assert hasattr(template, "default_variable_name") 80 | assert hasattr(template, "trace_domain") 81 | assert hasattr(template, "dimension_names") 82 | assert hasattr(template, "coordinate_names") 83 | assert hasattr(template, "build_dataset") 84 | 85 | assert len(template_names) == len(set(template_names)), f"Duplicate template names found: {template_names}" 86 | -------------------------------------------------------------------------------- /src/mdio/core/indexing.py: -------------------------------------------------------------------------------- 1 | """Indexing logic.""" 2 | 3 | import itertools 4 | from math import ceil 5 | 6 | import numpy as np 7 | 8 | 9 | class ChunkIterator: 10 | """Chunk iterator for multi-dimensional arrays. 11 | 12 | This iterator takes an array shape and chunks and every time it is iterated, it returns 13 | a dictionary (if dimensions are provided) or a tuple of slices that align with 14 | chunk boundaries. When dimensions are provided, they are used as the dictionary keys. 15 | 16 | Args: 17 | shape: The shape of the array. 18 | chunks: The chunk sizes for each dimension. 19 | dim_names: The names of the array dimensions, to be used with DataArray.isel(). 20 | If the dim_names are not provided, a tuple of the slices will be returned. 21 | 22 | Attributes: # noqa: DOC602 23 | arr_shape: Shape of the array. 24 | len_chunks: Length of chunks in each dimension. 25 | dim_chunks: Number of chunks in each dimension. 26 | num_chunks: Total number of chunks. 27 | 28 | Examples: 29 | >> chunks = (3, 4, 5) 30 | >> shape = (5, 11, 19) 31 | >> dims = ["inline", "crossline", "depth"] 32 | >> 33 | >> iter = ChunkIterator(shape=shape, chunks=chunks, dim_names=dims) 34 | >> for i in range(13): 35 | >> region = iter.__next__() 36 | >> print(region) 37 | { "inline": slice(3,6, None), "crossline": slice(0,4, None), "depth": slice(0,5, None) } 38 | 39 | >> iter = ChunkIterator(shape=shape, chunks=chunks, dim_names=None) 40 | >> for i in range(13): 41 | >> region = iter.__next__() 42 | >> print(region) 43 | (slice(3,6,None), slice(0,4,None), slice(0,5,None)) 44 | """ 45 | 46 | def __init__(self, shape: tuple[int, ...], chunks: tuple[int, ...], dim_names: tuple[str, ...] = None): 47 | self.arr_shape = tuple(shape) # Deep copy to ensure immutability 48 | self.len_chunks = tuple(chunks) # Deep copy to ensure immutability 49 | self.dims = dim_names 50 | 51 | # Compute number of chunks per dimension, and total number of chunks 52 | self.dim_chunks = tuple( 53 | [ceil(len_dim / chunk) for len_dim, chunk in zip(self.arr_shape, self.len_chunks, strict=True)] 54 | ) 55 | self.num_chunks = np.prod(self.dim_chunks) 56 | 57 | # Under the hood stuff for the iterator. This generates C-ordered 58 | # permutation of chunk indices. 59 | dim_ranges = [range(dim_len) for dim_len in self.dim_chunks] 60 | self._ranges = itertools.product(*dim_ranges) 61 | self._idx = 0 62 | 63 | def __iter__(self) -> "ChunkIterator": 64 | """Iteration context.""" 65 | return self 66 | 67 | def __len__(self) -> int: 68 | """Get total number of chunks.""" 69 | return self.num_chunks 70 | 71 | def __next__(self) -> dict[str, slice]: 72 | """Iteration logic.""" 73 | if self._idx <= self.num_chunks: 74 | # We build slices here. 
It is dimension agnostic 75 | current_start = next(self._ranges) 76 | 77 | start_indices = tuple(dim * chunk for dim, chunk in zip(current_start, self.len_chunks, strict=True)) 78 | 79 | # Calculate stop indices, making the last slice fit the data exactly 80 | stop_indices = tuple( 81 | min((dim + 1) * chunk, self.arr_shape[i]) 82 | for i, (dim, chunk) in enumerate(zip(current_start, self.len_chunks, strict=True)) 83 | ) 84 | 85 | slices = tuple(slice(start, stop) for start, stop in zip(start_indices, stop_indices, strict=True)) 86 | 87 | if self.dims: # noqa SIM108 88 | # Example 89 | # {"inline":slice(3,6,None), "crossline":slice(0,4,None), "depth":slice(0,5,None)} 90 | result = dict(zip(self.dims, slices, strict=False)) 91 | else: 92 | # Example 93 | # (slice(3,6,None), slice(0,4,None), slice(0,5,None)) 94 | result = slices 95 | 96 | self._idx += 1 97 | 98 | return result 99 | 100 | raise StopIteration 101 | -------------------------------------------------------------------------------- /src/mdio/builder/templates/seismic_3d_streamer_field.py: -------------------------------------------------------------------------------- 1 | """Seismic3DStreamerFieldRecordsTemplate MDIO v1 dataset templates.""" 2 | 3 | from typing import Any 4 | 5 | from mdio.builder.schemas.dtype import ScalarType 6 | from mdio.builder.schemas.v1.variable import CoordinateMetadata 7 | from mdio.builder.templates.base import AbstractDatasetTemplate 8 | from mdio.builder.templates.types import SeismicDataDomain 9 | 10 | 11 | class Seismic3DStreamerFieldRecordsTemplate(AbstractDatasetTemplate): 12 | """Seismic 3D streamer shot field records template. 13 | 14 | A generalized template for streamer field records that are optimized for: 15 | - Common-shot access 16 | - Common-channel access 17 | 18 | It can also store all the shot-lines of a survey in one MDIO if needed. 19 | 20 | Args: 21 | data_domain: The domain of the dataset. 
22 | """ 23 | 24 | def __init__(self, data_domain: SeismicDataDomain = "time"): 25 | super().__init__(data_domain=data_domain) 26 | 27 | self._spatial_dim_names = ("sail_line", "gun", "shot_index", "cable", "channel") 28 | self._calculated_dims = ("shot_index",) 29 | self._dim_names = (*self._spatial_dim_names, self._data_domain) 30 | self._physical_coord_names = ("source_coord_x", "source_coord_y", "group_coord_x", "group_coord_y") 31 | self._logical_coord_names = ("shot_point", "orig_field_record_num") # ffid 32 | self._var_chunk_shape = (1, 1, 16, 1, 32, 1024) 33 | 34 | @property 35 | def _name(self) -> str: 36 | return "StreamerFieldRecords3D" 37 | 38 | def _load_dataset_attributes(self) -> dict[str, Any]: 39 | return {"surveyDimensionality": "3D", "gatherType": "common_source"} 40 | 41 | def _add_coordinates(self) -> None: 42 | # Add dimension coordinates 43 | # EXCLUDE: `shot_index` since its 0-N 44 | self._builder.add_coordinate( 45 | "sail_line", 46 | dimensions=("sail_line",), 47 | data_type=ScalarType.UINT32, 48 | ) 49 | self._builder.add_coordinate( 50 | "gun", 51 | dimensions=("gun",), 52 | data_type=ScalarType.UINT8, 53 | ) 54 | self._builder.add_coordinate( 55 | "cable", 56 | dimensions=("cable",), 57 | data_type=ScalarType.UINT8, 58 | ) 59 | self._builder.add_coordinate( 60 | "channel", 61 | dimensions=("channel",), 62 | data_type=ScalarType.UINT16, 63 | ) 64 | self._builder.add_coordinate( 65 | self._data_domain, 66 | dimensions=(self._data_domain,), 67 | data_type=ScalarType.INT32, 68 | ) 69 | 70 | # Add non-dimension coordinates 71 | self._builder.add_coordinate( 72 | "orig_field_record_num", 73 | dimensions=("sail_line", "gun", "shot_index"), 74 | data_type=ScalarType.UINT32, 75 | ) 76 | self._builder.add_coordinate( 77 | "shot_point", 78 | dimensions=("sail_line", "gun", "shot_index"), 79 | data_type=ScalarType.UINT32, 80 | ) 81 | self._builder.add_coordinate( 82 | "source_coord_x", 83 | dimensions=("sail_line", "gun", "shot_index"), 84 | data_type=ScalarType.FLOAT64, 85 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("source_coord_x")), 86 | ) 87 | self._builder.add_coordinate( 88 | "source_coord_y", 89 | dimensions=("sail_line", "gun", "shot_index"), 90 | data_type=ScalarType.FLOAT64, 91 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("source_coord_y")), 92 | ) 93 | self._builder.add_coordinate( 94 | "group_coord_x", 95 | dimensions=("sail_line", "gun", "shot_index", "cable", "channel"), 96 | data_type=ScalarType.FLOAT64, 97 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("group_coord_x")), 98 | ) 99 | self._builder.add_coordinate( 100 | "group_coord_y", 101 | dimensions=("sail_line", "gun", "shot_index", "cable", "channel"), 102 | data_type=ScalarType.FLOAT64, 103 | metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("group_coord_y")), 104 | ) 105 | -------------------------------------------------------------------------------- /src/mdio/api/io.py: -------------------------------------------------------------------------------- 1 | """Utils for reading MDIO dataset.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import TYPE_CHECKING 6 | from typing import Any 7 | from typing import Literal 8 | 9 | import zarr 10 | from upath import UPath 11 | from xarray import Dataset as xr_Dataset 12 | from xarray import open_zarr as xr_open_zarr 13 | from xarray.backends.writers import to_zarr as xr_to_zarr 14 | 15 | from mdio.constants import ZarrFormat 16 | from mdio.core.zarr_io import 
zarr_warnings_suppress_unstable_structs_v3 17 | 18 | if TYPE_CHECKING: 19 | from collections.abc import Mapping 20 | from pathlib import Path 21 | 22 | from xarray import Dataset 23 | from xarray.core.types import T_Chunks 24 | from xarray.core.types import ZarrWriteModes 25 | 26 | 27 | def _normalize_path(path: UPath | Path | str) -> UPath: 28 | return UPath(path) 29 | 30 | 31 | def _normalize_storage_options(path: UPath) -> dict[str, Any] | None: 32 | return None if len(path.storage_options) == 0 else path.storage_options 33 | 34 | 35 | def open_mdio(input_path: UPath | Path | str, chunks: T_Chunks = None) -> xr_Dataset: 36 | """Open a Zarr dataset from the specified universal file path. 37 | 38 | Args: 39 | input_path: Universal input path for the MDIO dataset. 40 | chunks: If provided, loads data into dask arrays with new chunking. 41 | - ``chunks="auto"`` will use dask ``auto`` chunking 42 | - ``chunks=None`` skips using dask, which is generally faster for small arrays. 43 | - ``chunks=-1`` loads the data with dask using a single chunk for all arrays. 44 | - ``chunks={}`` loads the data with dask using the engine's preferred chunk size (on disk). 45 | - ``chunks={dim: chunk, ...}`` loads the data with dask using the specified chunk size for each dimension. 46 | 47 | See dask chunking for more details. 48 | 49 | Returns: 50 | An Xarray dataset opened from the input path. 51 | """ 52 | input_path = _normalize_path(input_path) 53 | storage_options = _normalize_storage_options(input_path) 54 | zarr_format = zarr.config.get("default_zarr_format") 55 | 56 | return xr_open_zarr( 57 | input_path.as_posix(), 58 | chunks=chunks, 59 | storage_options=storage_options, 60 | mask_and_scale=zarr_format == ZarrFormat.V3, # off for v2, on for v3 61 | consolidated=zarr_format == ZarrFormat.V2, # on for v2, off for v3 62 | ) 63 | 64 | 65 | def to_mdio( # noqa: PLR0913 66 | dataset: Dataset, 67 | output_path: UPath | Path | str, 68 | mode: ZarrWriteModes | None = None, 69 | *, 70 | compute: bool = True, 71 | region: Mapping[str, slice | Literal["auto"]] | Literal["auto"] | None = None, 72 | ) -> None: 73 | """Write dataset contents to an MDIO output_path. 74 | 75 | Args: 76 | dataset: The dataset to write. 77 | output_path: The universal path of the output MDIO file. 78 | mode: Persistence mode: "w" means create (overwrite if exists) 79 | "w-" means create (fail if exists) 80 | "a" means override all existing variables including dimension coordinates (create if does not exist) 81 | "a-" means only append those variables that have ``append_dim``. 82 | "r+" means modify existing array *values* only (raise an error if any metadata or shapes would change). 83 | The default mode is "r+" if ``region`` is set and ``w-`` otherwise. 84 | compute: If True write array data immediately; otherwise return a ``dask.delayed.Delayed`` object that 85 | can be computed to write array data later. Metadata is always updated eagerly. 86 | region: Optional mapping from dimension names to either a) ``"auto"``, or b) integer slices, indicating 87 | the region of existing MDIO array(s) in which to write this dataset's data. 
88 | """ 89 | output_path = _normalize_path(output_path) 90 | storage_options = _normalize_storage_options(output_path) 91 | zarr_format = zarr.config.get("default_zarr_format") 92 | 93 | with zarr_warnings_suppress_unstable_structs_v3(): 94 | xr_to_zarr( 95 | dataset, 96 | store=output_path.as_posix(), # xarray doesn't like URI when file:// is protocol 97 | mode=mode, 98 | compute=compute, 99 | consolidated=zarr_format == ZarrFormat.V2, # on for v2, off for v3 100 | region=region, 101 | storage_options=storage_options, 102 | write_empty_chunks=False, 103 | ) 104 | -------------------------------------------------------------------------------- /src/mdio/segy/compat.py: -------------------------------------------------------------------------------- 1 | """Generate SEG-Y spec MDIO backward compatibility. 2 | 3 | We were limited to fixed field names and byte locations due to using the segyio library. Since 4 | MDIO 0.8.0 we have a more powerful SEG-Y parser and it gives more flexibility. To support older 5 | files, we need to open them with the old SEG-Y spec. This is where we define it. 6 | """ 7 | 8 | from __future__ import annotations 9 | 10 | import logging 11 | from importlib import metadata 12 | 13 | from packaging import version 14 | from segy.alias.segyio import SEGYIO_BIN_FIELD_MAP 15 | from segy.alias.segyio import SEGYIO_TRACE_FIELD_MAP 16 | from segy.schema import HeaderField 17 | from segy.schema import HeaderSpec 18 | from segy.schema import ScalarType 19 | from segy.schema import SegySpec 20 | from segy.schema import TextHeaderSpec 21 | from segy.schema import TraceDataSpec 22 | from segy.schema import TraceSpec 23 | from segy.standards.fields import binary 24 | 25 | from mdio.exceptions import InvalidMDIOError 26 | 27 | MDIO_VERSION = metadata.version("multidimio") 28 | 29 | 30 | logger = logging.getLogger(__name__) 31 | 32 | 33 | def get_binary_fields() -> list[HeaderField]: 34 | """Generate binary header fields from equinor/segyio fields.""" 35 | revision_field = binary.Rev1.SEGY_REVISION.model 36 | mdio_v0_bin_fields = [] 37 | 38 | # Replace min/max (rev2-ish) with rev1 like parsing. Ignore minor one, and add the 39 | # revision as 4-byte. 40 | for alias, field in SEGYIO_BIN_FIELD_MAP.items(): 41 | if alias == "SEGYRevision": 42 | mdio_v0_bin_fields.append(revision_field) 43 | elif alias != "SEGYRevisionMinor": 44 | mdio_v0_bin_fields.append(field.model) 45 | return mdio_v0_bin_fields 46 | 47 | 48 | def get_trace_fields(version_str: str) -> list[HeaderField]: 49 | """Generate trace header fields. 50 | 51 | This part allows us to configure custom rules for different MDIO versions. 52 | 53 | For instance, since MDIO 0.8.0 we also save the unassigned parts of the trace header (after 54 | byte 233 / offset 232). To be able to ingest/export new MDIO files and also support exporting 55 | older MDIO files, we conditionally add the new field based on MDIO version specified above. 56 | 57 | Current rules: 58 | * mdio<=0.7.4 use the segyio mappings directly. 59 | * mdio>=0.8.0 adds an extra field to the end to fill the last 8 bytes 60 | 61 | Args: 62 | version_str: MDIO version to generate the trace fields for. 63 | 64 | Returns: 65 | List of header fields for specified MDIO version trace header encoding. 
66 | """ 67 | trace_fields = [field.model for field in SEGYIO_TRACE_FIELD_MAP.values()] 68 | version_obj = version.parse(version_str) 69 | if version_obj > version.parse("0.7.4"): 70 | trace_fields.append(HeaderField(name="unassigned", byte=233, format="int64")) 71 | return trace_fields 72 | 73 | 74 | def mdio_segy_spec(version_str: str | None = None) -> SegySpec: 75 | """Get a SEG-Y encoding spec for MDIO based on version.""" 76 | version_str = MDIO_VERSION if version_str is None else version_str 77 | 78 | binary_fields = get_binary_fields() 79 | trace_fields = get_trace_fields(version_str) 80 | 81 | return SegySpec( 82 | segy_standard=None, 83 | text_header=TextHeaderSpec(), 84 | binary_header=HeaderSpec(fields=binary_fields, item_size=400, offset=3200), 85 | trace=TraceSpec( 86 | header=HeaderSpec(fields=trace_fields, item_size=240), 87 | data=TraceDataSpec(format=ScalarType.IBM32), # placeholder 88 | ), 89 | ) 90 | 91 | 92 | def encode_segy_revision(binary_header: dict) -> dict: 93 | """Encode revision code to binary header. 94 | 95 | Return the correctly Rev1-like encoded revision code, ready to write to SEG-Y. 96 | 97 | Args: 98 | binary_header: Dictionary representing the SEG-Y binary header. Contains keys for major 99 | and minor revision numbers. 100 | 101 | Returns: 102 | The updated binary header with the encoded revision. 103 | 104 | Raises: 105 | InvalidMDIOError: Raised when binary header in MDIO is broken. 106 | """ 107 | major_key, minor_key = "segy_revision_major", "segy_revision_minor" 108 | 109 | try: 110 | major = binary_header.pop(major_key) 111 | minor = binary_header.pop(minor_key) 112 | except KeyError: 113 | msg = "Missing revision keys from binary header." 114 | logger.error(msg) 115 | raise InvalidMDIOError(msg) from KeyError 116 | 117 | code = (major << 8) | minor 118 | code_hex = f"0x{code:04x}" 119 | binary_header["segy_revision"] = code 120 | logger.info("Encoded revision %s.%s to code=%s ~ %s", major, minor, code, code_hex) 121 | return binary_header 122 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | - push 5 | - pull_request 6 | 7 | jobs: 8 | tests: 9 | name: ${{ matrix.session }} ${{ matrix.python }} / ${{ matrix.os }} 10 | runs-on: ${{ matrix.os }} 11 | strategy: 12 | fail-fast: false 13 | matrix: 14 | include: 15 | - { python: "3.13", os: "ubuntu-latest", session: "pre-commit" } 16 | # - { python: "3.13", os: "ubuntu-latest", session: "mypy" } 17 | # - { python: "3.12", os: "ubuntu-latest", session: "mypy" } 18 | # - { python: "3.11", os: "ubuntu-latest", session: "mypy" } 19 | - { python: "3.13", os: "ubuntu-latest", session: "tests" } 20 | - { python: "3.12", os: "ubuntu-latest", session: "tests" } 21 | - { python: "3.11", os: "ubuntu-latest", session: "tests" } 22 | - { python: "3.13", os: "windows-latest", session: "tests" } 23 | - { python: "3.13", os: "macos-latest", session: "tests" } 24 | # - { python: "3.13", os: "ubuntu-latest", session: "typeguard" } 25 | # - { python: "3.12", os: "ubuntu-latest", session: "typeguard" } 26 | # - { python: "3.11", os: "ubuntu-latest", session: "typeguard" } 27 | # - { python: "3.13", os: "ubuntu-latest", session: "xdoctest" } 28 | - { python: "3.13", os: "ubuntu-latest", session: "docs-build" } 29 | 30 | env: 31 | NOXSESSION: ${{ matrix.session }} 32 | FORCE_COLOR: "1" 33 | PRE_COMMIT_COLOR: "always" 34 | 35 | steps: 36 | - name: Check 
out the repository 37 | uses: actions/checkout@v5 38 | 39 | - name: Set up Python ${{ matrix.python }} 40 | uses: actions/setup-python@v6 41 | with: 42 | python-version: ${{ matrix.python }} 43 | 44 | - name: Install the pinned version of uv 45 | uses: astral-sh/setup-uv@v7 46 | with: 47 | python-version: ${{ matrix.python }} 48 | working-directory: ${{ github.workspace }} 49 | 50 | - name: Install Nox 51 | run: | 52 | uv tool install -c "${{ github.workspace }}/.github/workflows/constraints.txt" nox 53 | nox --version 54 | 55 | - name: Compute pre-commit cache key 56 | if: matrix.session == 'pre-commit' 57 | id: pre-commit-cache 58 | shell: python 59 | run: | 60 | import hashlib 61 | import sys 62 | import os 63 | 64 | python = "py{}.{}".format(*sys.version_info[:2]) 65 | payload = sys.version.encode() + sys.executable.encode() 66 | digest = hashlib.sha256(payload).hexdigest() 67 | result = "${{ runner.os }}-{}-{}-pre-commit".format(python, digest[:8]) 68 | 69 | print("result={}".format(result), file=open(os.environ['GITHUB_OUTPUT'], 'a')) 70 | 71 | - name: Restore pre-commit cache 72 | uses: actions/cache@v4 73 | if: matrix.session == 'pre-commit' 74 | with: 75 | path: ~/.cache/pre-commit 76 | key: ${{ steps.pre-commit-cache.outputs.result }}-${{ hashFiles('.pre-commit-config.yaml') }} 77 | restore-keys: | 78 | ${{ steps.pre-commit-cache.outputs.result }}- 79 | 80 | - name: Run Nox 81 | run: | 82 | nox --python=${{ matrix.python }} 83 | 84 | - name: Upload coverage data 85 | if: always() && matrix.session == 'tests' 86 | uses: actions/upload-artifact@v5 87 | with: 88 | name: coverage-data-${{ matrix.os }}-${{ matrix.python }} 89 | include-hidden-files: true 90 | path: ".coverage.*" 91 | 92 | - name: Upload documentation 93 | if: matrix.session == 'docs-build' 94 | uses: actions/upload-artifact@v5 95 | with: 96 | name: docs 97 | path: docs/_build 98 | 99 | coverage: 100 | runs-on: ubuntu-latest 101 | needs: tests 102 | steps: 103 | - name: Check out the repository 104 | uses: actions/checkout@v5 105 | 106 | - name: Install the pinned version of uv 107 | uses: astral-sh/setup-uv@v7 108 | with: 109 | python-version: 3.13 110 | working-directory: ${{ github.workspace }} 111 | 112 | - name: Install Nox 113 | run: | 114 | uv tool install -c "${{ github.workspace }}/.github/workflows/constraints.txt" nox 115 | nox --version 116 | 117 | - name: Download coverage data 118 | uses: actions/download-artifact@v6 119 | with: 120 | pattern: coverage-data-* 121 | merge-multiple: true 122 | 123 | - name: Combine coverage data and display human readable report 124 | run: | 125 | nox --session=coverage 126 | 127 | - name: Create coverage report 128 | run: | 129 | nox --session=coverage -- xml 130 | 131 | - name: Upload coverage report 132 | uses: codecov/codecov-action@v5.5.1 133 | env: 134 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 135 | -------------------------------------------------------------------------------- /docs/data_models/chunk_grids.md: -------------------------------------------------------------------------------- 1 | ```{eval-rst} 2 | :tocdepth: 3 3 | ``` 4 | 5 | ```{currentModule} mdio.builder.schemas.chunk_grid 6 | 7 | ``` 8 | 9 | # Chunk Grid Models 10 | 11 | ```{article-info} 12 | :author: Altay Sansal 13 | :date: "{sub-ref}`today`" 14 | :read-time: "{sub-ref}`wordcount-minutes` min read" 15 | :class-container: sd-p-0 sd-outline-muted sd-rounded-3 sd-font-weight-light 16 | ``` 17 | 18 | The variables in MDIO data model can represent different types of chunk grids. 
19 | These grids are essential for managing multi-dimensional data arrays efficiently. 20 | In this breakdown, we will explore four distinct data models within the MDIO schema, 21 | each serving a specific purpose in data handling and organization. 22 | 23 | MDIO implements data models following the guidelines of the Zarr v3 spec and ZEPs: 24 | 25 | - [Zarr core specification (version 3)](https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html) 26 | - [ZEP 1 — Zarr specification version 3](https://zarr.dev/zeps/accepted/ZEP0001.html) 27 | - [ZEP 3 — Variable chunking](https://zarr.dev/zeps/draft/ZEP0003.html) 28 | 29 | ## Regular Grid 30 | 31 | The regular grid models are designed to represent a rectangular and regularly 32 | spaced chunk grid. 33 | 34 | ```{eval-rst} 35 | .. autosummary:: 36 | RegularChunkGrid 37 | RegularChunkShape 38 | ``` 39 | 40 | For a 1D array with `size = 31`{l=python}, we can divide it into 5 equally sized 41 | chunks. Note that the last chunk will be truncated to match the size of the array. 42 | 43 | `{ "name": "regular", "configuration": { "chunkShape": [7] } }`{l=json} 44 | 45 | Using the above schema, the resulting array chunks will look like this: 46 | 47 | ```bash 48 | ←─ 7 ─→ ←─ 7 ─→ ←─ 7 ─→ ←─ 7 ─→ ↔ 3 49 | ┌───────┬───────┬───────┬───────┬───┐ 50 | └───────┴───────┴───────┴───────┴───┘ 51 | ``` 52 | 53 | For a 2D array with shape `rows, cols = (7, 17)`{l=python}, we can divide it into 9 54 | chunks. As in the 1D case, the trailing chunks are truncated to fit the array. 55 | 56 | `{ "name": "regular", "configuration": { "chunkShape": [3, 7] } }`{l=json} 57 | 58 | Using the above schema, the resulting 2D array chunks will look like below. 59 | Note that the rows and columns are conceptual and visually not to scale. 60 | 61 | ```bash 62 | ←─ 7 ─→ ←─ 7 ─→ ↔ 3 63 | ┌───────┬───────┬───┐ 64 | │ ╎ ╎ │ ↑ 65 | │ ╎ ╎ │ 3 66 | │ ╎ ╎ │ ↓ 67 | ├╶╶╶╶╶╶╶┼╶╶╶╶╶╶╶┼╶╶╶┤ 68 | │ ╎ ╎ │ ↑ 69 | │ ╎ ╎ │ 3 70 | │ ╎ ╎ │ ↓ 71 | ├╶╶╶╶╶╶╶┼╶╶╶╶╶╶╶┼╶╶╶┤ 72 | │ ╎ ╎ │ ↕ 1 73 | └───────┴───────┴───┘ 74 | ``` 75 | 76 | ## Rectilinear Grid 77 | 78 | The [RectilinearChunkGrid](RectilinearChunkGrid) model extends 79 | the concept of chunk grids to accommodate rectangular and irregularly spaced chunks. 80 | This model is useful in data structures where non-uniform chunk sizes are necessary. 81 | [RectilinearChunkShape](RectilinearChunkShape) specifies the chunk sizes for each 82 | dimension as a list, allowing for irregular intervals. 83 | 84 | ```{eval-rst} 85 | .. autosummary:: 86 | RectilinearChunkGrid 87 | RectilinearChunkShape 88 | ``` 89 | 90 | :::{note} 91 | It's important to ensure that the sum of the irregular spacings specified 92 | in the `chunkShape` matches the size of the respective array dimension. 93 | ::: 94 | 95 | For a 1D array with `size = 39`{l=python}, we can divide it into 5 irregularly sized 96 | chunks. 97 | 98 | `{ "name": "rectilinear", "configuration": { "chunkShape": [[10, 7, 5, 7, 10]] } }`{l=json} 99 | 100 | Using the above schema, the resulting array chunks will look like this: 101 | 102 | ```bash 103 | ←── 10 ──→ ←─ 7 ─→ ← 5 → ←─ 7 ─→ ←── 10 ──→ 104 | ┌──────────┬───────┬─────┬───────┬──────────┐ 105 | └──────────┴───────┴─────┴───────┴──────────┘ 106 | ``` 107 | 108 | For a 2D array with shape `rows, cols = (7, 25)`{l=python}, we can divide it into 12 109 | rectilinear (rectangular but irregular) chunks. Note that the rows and columns are 110 | conceptual and visually not to scale. 
111 | 112 | `{ "name": "rectilinear", "configuration": { "chunkShape": [[3, 1, 3], [10, 5, 7, 3]] } }`{l=json} 113 | 114 | ```bash 115 | ←── 10 ──→ ← 5 → ←─ 7 ─→ ↔ 3 116 | ┌──────────┬─────┬───────┬───┐ 117 | │ ╎ ╎ ╎ │ ↑ 118 | │ ╎ ╎ ╎ │ 3 119 | │ ╎ ╎ ╎ │ ↓ 120 | ├╶╶╶╶╶╶╶╶╶╶┼╶╶╶╶╶┼╶╶╶╶╶╶╶┼╶╶╶┤ 121 | │ ╎ ╎ ╎ │ ↕ 1 122 | ├╶╶╶╶╶╶╶╶╶╶┼╶╶╶╶╶┼╶╶╶╶╶╶╶┼╶╶╶┤ 123 | │ ╎ ╎ ╎ │ ↑ 124 | │ ╎ ╎ ╎ │ 3 125 | │ ╎ ╎ ╎ │ ↓ 126 | └──────────┴─────┴───────┴───┘ 127 | ``` 128 | 129 | ## Model Reference 130 | 131 | :::{dropdown} RegularChunkGrid 132 | :animate: fade-in-slide-down 133 | 134 | ```{eval-rst} 135 | .. autopydantic_model:: RegularChunkGrid 136 | 137 | ---------- 138 | 139 | .. autopydantic_model:: RegularChunkShape 140 | ``` 141 | 142 | ::: 143 | :::{dropdown} RectilinearChunkGrid 144 | :animate: fade-in-slide-down 145 | 146 | ```{eval-rst} 147 | .. autopydantic_model:: RectilinearChunkGrid 148 | 149 | ---------- 150 | 151 | .. autopydantic_model:: RectilinearChunkShape 152 | ``` 153 | 154 | ::: 155 | -------------------------------------------------------------------------------- /tests/unit/test_segy_grid_overrides.py: -------------------------------------------------------------------------------- 1 | """Check grid overrides.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import Any 6 | 7 | import numpy as np 8 | import numpy.typing as npt 9 | import pytest 10 | from numpy import arange 11 | from numpy import column_stack 12 | from numpy import meshgrid 13 | from numpy import unique 14 | from numpy.testing import assert_array_equal 15 | 16 | from mdio.core import Dimension 17 | from mdio.segy.exceptions import GridOverrideUnknownError 18 | from mdio.segy.geometry import GridOverrider 19 | 20 | SHOTS = arange(100, 104, dtype="int32") 21 | CABLES = arange(11, 15, dtype="int32") 22 | RECEIVERS = arange(1, 6, dtype="int32") 23 | 24 | 25 | def run_override( 26 | grid_overrides: dict[str, Any], 27 | index_names: tuple[str, ...], 28 | headers: npt.NDArray, 29 | chunksize: tuple[int, ...] 
| None = None, 30 | ) -> tuple[dict[str, Any], tuple[str], tuple[int]]: 31 | """Initialize and run overrider.""" 32 | overrider = GridOverrider() 33 | return overrider.run(headers, index_names, grid_overrides, chunksize) 34 | 35 | 36 | def get_dims(headers: npt.NDArray) -> list[Dimension]: 37 | """Get list of Dimensions from headers.""" 38 | dims = [] 39 | for index_name in headers.dtype.names: 40 | index_coords = headers[index_name] 41 | dim_unique = unique(index_coords) 42 | dims.append(Dimension(coords=dim_unique, name=index_name)) 43 | 44 | return dims 45 | 46 | 47 | @pytest.fixture 48 | def mock_streamer_headers() -> npt.NDArray: 49 | """Generate dictionary of mocked streamer index headers.""" 50 | grids = meshgrid(SHOTS, CABLES, RECEIVERS, indexing="ij") 51 | permutations = column_stack([grid.ravel() for grid in grids]) 52 | 53 | # Make channel from receiver ids 54 | for shot in SHOTS: 55 | shot_mask = permutations[:, 0] == shot 56 | permutations[shot_mask, -1] = arange(1, len(CABLES) * len(RECEIVERS) + 1) 57 | 58 | hdr_dtype = np.dtype( 59 | { 60 | "names": ["shot_point", "cable", "channel"], 61 | "formats": ["int32", "int32", "int32"], 62 | } 63 | ) 64 | 65 | n_traces = permutations.shape[0] 66 | result = np.ndarray(dtype=hdr_dtype, shape=n_traces) 67 | 68 | result["shot_point"] = permutations[:, 0] 69 | result["cable"] = permutations[:, 1] 70 | result["channel"] = permutations[:, 2] 71 | 72 | return result 73 | 74 | 75 | class TestAutoGridOverrides: 76 | """Check grid overrides works with auto indexing.""" 77 | 78 | def test_duplicates(self, mock_streamer_headers: dict[str, npt.NDArray]) -> None: 79 | """Test the HasDuplicates Grid Override command.""" 80 | index_names = ("shot_point", "cable") 81 | grid_overrides = {"HasDuplicates": True} 82 | 83 | # Remove channel header 84 | streamer_headers = mock_streamer_headers[list(index_names)] 85 | chunksize = (4, 4, 8) 86 | 87 | new_headers, new_names, new_chunks = run_override( 88 | grid_overrides, 89 | index_names, 90 | streamer_headers, 91 | chunksize, 92 | ) 93 | 94 | assert new_names == ("shot_point", "cable", "trace") 95 | assert new_chunks == (4, 4, 1, 8) 96 | 97 | dims = get_dims(new_headers) 98 | 99 | assert_array_equal(dims[0].coords, SHOTS) 100 | assert_array_equal(dims[1].coords, CABLES) 101 | assert_array_equal(dims[2].coords, RECEIVERS) 102 | 103 | def test_non_binned(self, mock_streamer_headers: dict[str, npt.NDArray]) -> None: 104 | """Test the NonBinned Grid Override command.""" 105 | index_names = ("shot_point", "cable") 106 | grid_overrides = {"NonBinned": True, "chunksize": 4, "non_binned_dims": ["channel"]} 107 | 108 | # Keep channel header for non-binned processing 109 | streamer_headers = mock_streamer_headers 110 | chunksize = (4, 4, 8) 111 | 112 | new_headers, new_names, new_chunks = run_override( 113 | grid_overrides, 114 | index_names, 115 | streamer_headers, 116 | chunksize, 117 | ) 118 | 119 | assert new_names == ("shot_point", "cable", "trace") 120 | assert new_chunks == (4, 4, 4, 8) 121 | 122 | dims = get_dims(new_headers) 123 | 124 | assert_array_equal(dims[0].coords, SHOTS) 125 | assert_array_equal(dims[1].coords, CABLES) 126 | # Trace coords are the unique channel values (1-20) 127 | expected_trace_coords = np.arange(1, 21, dtype="int32") 128 | assert_array_equal(dims[2].coords, expected_trace_coords) 129 | 130 | 131 | class TestStreamerGridOverrides: 132 | """Check grid overrides for shot data with streamer acquisition.""" 133 | 134 | def test_unknown_override( 135 | self, 136 | mock_streamer_headers: 
dict[str, npt.NDArray], 137 | ) -> None: 138 | """Test exception if user provides a command that's not allowed.""" 139 | index_names = ("shot_point", "cable", "channel") 140 | chunksize = None 141 | overrider = GridOverrider() 142 | with pytest.raises(GridOverrideUnknownError): 143 | overrider.run(mock_streamer_headers, index_names, {"WrongCommand": True}, chunksize) 144 | -------------------------------------------------------------------------------- /tests/unit/test_indexing.py: -------------------------------------------------------------------------------- 1 | """Unit tests for the indexing module.""" 2 | 3 | import numpy as np 4 | from xarray import DataArray as xr_DataArray 5 | from xarray import Dataset as xr_Dataset 6 | 7 | from mdio.core.indexing import ChunkIterator 8 | 9 | 10 | def test_chunk_iterator_returning_dict() -> None: 11 | """Test the ChunkIterator class.""" 12 | dims = ["inline", "crossline", "depth"] 13 | chunks = (3, 4, 5) 14 | 15 | shape = (6, 12, 20) 16 | iter1 = ChunkIterator(shape=shape, chunks=chunks, dim_names=dims) 17 | assert iter1.arr_shape == shape 18 | assert iter1.dims == dims 19 | assert iter1.len_chunks == chunks 20 | assert iter1.dim_chunks == (2, 3, 4) 21 | assert iter1.num_chunks == 24 22 | 23 | shape = (5, 11, 19) 24 | iter2 = ChunkIterator(shape=shape, chunks=chunks, dim_names=dims) 25 | assert iter2.dim_chunks == (2, 3, 4) 26 | assert iter2.num_chunks == 24 27 | 28 | # This loop confirms that the last slice is adjusted to fit the data exactly 29 | # when the array size doesn't align perfectly with chunk boundaries. 30 | for _ in range(13): # element index 12 31 | region = iter1.__next__() 32 | assert region == { 33 | "inline": slice(3, 6, None), 34 | "crossline": slice(0, 4, None), 35 | "depth": slice(0, 5, None), 36 | } 37 | 38 | for _ in range(13): # element index 12 39 | region = iter2.__next__() 40 | assert region == { 41 | "inline": slice(3, 5, None), 42 | "crossline": slice(0, 4, None), 43 | "depth": slice(0, 5, None), 44 | } 45 | 46 | 47 | def test_chunk_iterator_returning_tuple() -> None: 48 | """Test the ChunkIterator class.""" 49 | chunks = (3, 4, 5) 50 | 51 | shape = (6, 12, 20) 52 | iter1 = ChunkIterator(shape=shape, chunks=chunks) 53 | assert iter1.arr_shape == shape 54 | assert iter1.dims is None 55 | assert iter1.len_chunks == chunks 56 | assert iter1.dim_chunks == (2, 3, 4) 57 | assert iter1.num_chunks == 24 58 | 59 | shape = (5, 11, 19) 60 | iter2 = ChunkIterator(shape=shape, chunks=chunks) 61 | assert iter2.dim_chunks == (2, 3, 4) 62 | assert iter2.num_chunks == 24 63 | 64 | # This loop confirms that the last slice is adjusted to fit the data exactly 65 | # when the array size doesn't align perfectly with chunk boundaries. 66 | for _ in range(13): # element index 12 67 | region = iter1.__next__() 68 | assert region == (slice(3, 6, None), slice(0, 4, None), slice(0, 5, None)) 69 | 70 | for _ in range(13): # element index 12 71 | region = iter2.__next__() 72 | assert region == (slice(3, 5, None), slice(0, 4, None), slice(0, 5, None)) 73 | 74 | 75 | def val(shape: tuple[int, int, int], i: int, j: int, k: int) -> int: 76 | """Calculate the linear index in a 3D array.""" 77 | return i * (shape[1] * shape[2]) + j * shape[2] + k 78 | 79 | 80 | def mock_trace_worker( 81 | shape: tuple[int, int, int], region: dict[str, slice], dataset: xr_Dataset, grid_map: np.ndarray 82 | ) -> None: 83 | """Mock trace worker function. 
84 | 85 | Note: 86 | Xarray, Zarr, and NumPy automatically truncates the slice to the valid bounds of the array 87 | (see the test above, where the last chunk is always of the same size) 88 | and does not raise an error. However, if one attempts to access an element at an index 89 | that is out of bounds, you will get an IndexError 90 | """ 91 | # We used a 2D selection with 2D index_slices 92 | assert grid_map.shape == (3, 4, 20) 93 | # We used a 3D selection with isel() 94 | assert tuple(dataset.sizes[d] for d in region) == (3, 4, 5) 95 | 96 | dimension_names = list(dataset.sizes) 97 | 98 | slice0 = region[dimension_names[0]] 99 | slice1 = region[dimension_names[1]] 100 | slice2 = region[dimension_names[2]] 101 | for ii, i in enumerate(range(slice0.start, min(slice0.stop, shape[0]))): 102 | for jj, j in enumerate(range(slice1.start, min(slice1.stop, shape[1]))): 103 | for kk, k in enumerate(range(slice2.start, min(slice2.stop, shape[2]))): 104 | # Validate that we've got the sample indexing right 105 | assert dataset["amplitude"].values[ii, jj, kk] == val(shape, i, j, k) 106 | # NOTE: grid_map is 2D, so we need to use k for the depth dimension 107 | assert dataset["amplitude"].values[ii, jj, kk] == grid_map[ii, jj, k] 108 | 109 | 110 | def test_chunk_iterator_with_dataset() -> None: 111 | """Test the ChunkIterator with a dataset.""" 112 | shape = (6, 12, 20) 113 | dims = ["inline", "crossline", "depth"] 114 | chunks = (3, 4, 5) 115 | 116 | data3 = np.arange(shape[0] * shape[1] * shape[2]).reshape(shape) 117 | amplitude = xr_DataArray(data3, dims=dims, name="amplitude") 118 | ds = xr_Dataset({"amplitude": amplitude}) 119 | 120 | chunk_iter = ChunkIterator(shape, chunks, dims) 121 | for region in chunk_iter: 122 | # If one needs both a dict and a tuple of slices, 123 | # one can use the following line an example to strip dim names out 124 | index_slices = tuple(region[key] for key in dims[:-1]) 125 | # The .isel() method takes keyword arguments, region, where each keyword corresponds 126 | # to a dimension name and the value is an integer, a slice object (our case), 127 | # or an array-like object 128 | mock_trace_worker(shape, region, ds.isel(region), amplitude[index_slices]) 129 | -------------------------------------------------------------------------------- /tests/unit/v1/templates/test_seismic_2d_poststack.py: -------------------------------------------------------------------------------- 1 | """Unit tests for Seismic2DPostStackTemplate.""" 2 | 3 | import pytest 4 | from tests.unit.v1.helpers import validate_variable 5 | 6 | from mdio.builder.schemas.chunk_grid import RegularChunkGrid 7 | from mdio.builder.schemas.dtype import ScalarType 8 | from mdio.builder.schemas.dtype import StructuredType 9 | from mdio.builder.schemas.v1.dataset import Dataset 10 | from mdio.builder.schemas.v1.units import LengthUnitEnum 11 | from mdio.builder.schemas.v1.units import LengthUnitModel 12 | from mdio.builder.schemas.v1.units import TimeUnitEnum 13 | from mdio.builder.schemas.v1.units import TimeUnitModel 14 | from mdio.builder.templates.seismic_2d_poststack import Seismic2DPostStackTemplate 15 | from mdio.builder.templates.types import SeismicDataDomain 16 | 17 | UNITS_METER = LengthUnitModel(length=LengthUnitEnum.METER) 18 | UNITS_SECOND = TimeUnitModel(time=TimeUnitEnum.SECOND) 19 | 20 | 21 | def _validate_coordinates_headers_trace_mask(dataset: Dataset, headers: StructuredType, domain: str) -> None: 22 | """Validate the coordinate, headers, trace_mask variables in the dataset.""" 23 | # Verify 
variables 24 | # 2 dim coords + 2 non-dim coords + 1 data + 1 trace mask + 1 headers = 7 variables 25 | assert len(dataset.variables) == 7 26 | 27 | # Verify trace headers 28 | validate_variable(dataset, name="headers", dims=[("cdp", 2048)], coords=["cdp_x", "cdp_y"], dtype=headers) 29 | 30 | validate_variable( 31 | dataset, 32 | name="trace_mask", 33 | dims=[("cdp", 2048)], 34 | coords=["cdp_x", "cdp_y"], 35 | dtype=ScalarType.BOOL, 36 | ) 37 | 38 | # Verify dimension coordinate variables 39 | validate_variable( 40 | dataset, 41 | name="cdp", 42 | dims=[("cdp", 2048)], 43 | coords=["cdp"], 44 | dtype=ScalarType.INT32, 45 | ) 46 | 47 | domain = validate_variable( 48 | dataset, 49 | name=domain, 50 | dims=[(domain, 4096)], 51 | coords=[domain], 52 | dtype=ScalarType.INT32, 53 | ) 54 | assert domain.metadata.units_v1 in (UNITS_METER, UNITS_SECOND) 55 | 56 | # Verify non-dimension coordinate variables 57 | cdp_x = validate_variable( 58 | dataset, 59 | name="cdp_x", 60 | dims=[("cdp", 2048)], 61 | coords=["cdp_x"], 62 | dtype=ScalarType.FLOAT64, 63 | ) 64 | assert cdp_x.metadata.units_v1 == UNITS_METER 65 | 66 | cdp_y = validate_variable( 67 | dataset, 68 | name="cdp_y", 69 | dims=[("cdp", 2048)], 70 | coords=["cdp_y"], 71 | dtype=ScalarType.FLOAT64, 72 | ) 73 | assert cdp_y.metadata.units_v1 == UNITS_METER 74 | 75 | 76 | @pytest.mark.parametrize("data_domain", ["time", "depth"]) 77 | class TestSeismic2DPostStackTemplate: 78 | """Unit tests for Seismic2DPostStackTemplate.""" 79 | 80 | def test_configuration(self, data_domain: SeismicDataDomain) -> None: 81 | """Test configuration of Seismic2DPostStackTemplate.""" 82 | t = Seismic2DPostStackTemplate(data_domain=data_domain) 83 | 84 | # Template attributes 85 | assert t._data_domain == data_domain 86 | assert t._dim_names == ("cdp", data_domain) 87 | assert t._physical_coord_names == ("cdp_x", "cdp_y") 88 | assert t.full_chunk_shape == (1024, 1024) 89 | 90 | # Variables instantiated when build_dataset() is called 91 | assert t._builder is None 92 | assert t._dim_sizes == () 93 | 94 | # Verify dataset attributes 95 | attrs = t._load_dataset_attributes() 96 | assert attrs == {"surveyType": "2D", "gatherType": "stacked"} 97 | 98 | assert t.default_variable_name == "amplitude" 99 | 100 | def test_build_dataset_time(self, data_domain: SeismicDataDomain, structured_headers: StructuredType) -> None: 101 | """Test building a complete 2D time dataset.""" 102 | t = Seismic2DPostStackTemplate(data_domain=data_domain) 103 | t.add_units({"cdp_x": UNITS_METER, "cdp_y": UNITS_METER}) # spatial domain units 104 | t.add_units({"time": UNITS_SECOND, "depth": UNITS_METER}) # data domain units 105 | 106 | dataset = t.build_dataset("Seismic 2D Time Line 001", sizes=(2048, 4096), header_dtype=structured_headers) 107 | 108 | # Verify dataset metadata 109 | assert dataset.metadata.name == "Seismic 2D Time Line 001" 110 | assert dataset.metadata.attributes["surveyType"] == "2D" 111 | assert dataset.metadata.attributes["gatherType"] == "stacked" 112 | 113 | _validate_coordinates_headers_trace_mask(dataset, structured_headers, data_domain) 114 | 115 | # Verify seismic variable 116 | v = validate_variable( 117 | dataset, 118 | name="amplitude", 119 | dims=[("cdp", 2048), (data_domain, 4096)], 120 | coords=["cdp_x", "cdp_y"], 121 | dtype=ScalarType.FLOAT32, 122 | ) 123 | assert isinstance(v.metadata.chunk_grid, RegularChunkGrid) 124 | assert v.metadata.chunk_grid.configuration.chunk_shape == (1024, 1024) 125 | assert v.metadata.stats_v1 is None 126 | 127 | 128 | 
@pytest.mark.parametrize("data_domain", ["Time", "DePTh"]) 129 | def test_domain_case_handling(data_domain: str) -> None: 130 | """Test that domain parameter handles different cases correctly.""" 131 | template = Seismic2DPostStackTemplate(data_domain=data_domain) 132 | assert template._data_domain == data_domain.lower() 133 | 134 | data_domain_suffix = data_domain.lower().capitalize() 135 | assert template.name == f"PostStack2D{data_domain_suffix}" 136 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "multidimio" 3 | version = "1.1.1" 4 | description = "Cloud-native, scalable, and user-friendly multi dimensional energy data!" 5 | authors = [{ name = "Altay Sansal", email = "altay.sansal@tgs.com" }] 6 | requires-python = ">=3.11,<3.14" 7 | readme = "README.md" 8 | license = "Apache-2.0" 9 | license-files = ["LICEN[CS]E*"] 10 | keywords = ["mdio", "multidimio", "seismic", "wind", "data"] 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "Programming Language :: Python :: 3.11", 14 | "Programming Language :: Python :: 3.12", 15 | "Programming Language :: Python :: 3.13", 16 | "License :: OSI Approved :: Apache Software License", 17 | "Development Status :: 4 - Beta", 18 | ] 19 | 20 | dependencies = [ 21 | "click>=8.3.0", 22 | "click-params>=0.5.0", 23 | "dask>=2025.9.1", 24 | "fsspec>=2025.9.0", 25 | "pint>=0.25.0", 26 | "psutil>=7.1.0", 27 | "pydantic>=2.12.0", 28 | "pydantic-settings>=2.6.1", 29 | "rich>=14.1.0", 30 | "segy>=0.5.3", 31 | "tqdm>=4.67.1", 32 | "universal-pathlib>=0.3.3", 33 | "xarray>=2025.10.1", 34 | "zarr>=3.1.3", 35 | ] 36 | 37 | [project.optional-dependencies] 38 | cloud = ["s3fs>=2025.9.0", "gcsfs>=2025.9.0", "adlfs>=2025.8.0"] 39 | distributed = ["distributed>=2025.9.1", "bokeh>=3.8.0"] 40 | lossy = ["zfpy>=1.0.1"] 41 | 42 | [project.urls] 43 | homepage = "https://mdio.dev/" 44 | repository = "https://github.com/TGSAI/mdio-python" 45 | documentation = "https://mdio-python.readthedocs.io" 46 | 47 | [project.scripts] 48 | mdio = "mdio.__main__:main" 49 | 50 | [dependency-groups] 51 | dev = [ 52 | "ruff>=0.14.0", 53 | "coverage[toml]>=7.10.7", 54 | "mypy>=1.18.2", 55 | "pre-commit>=4.3.0", 56 | "pre-commit-hooks>=6.0.0", 57 | "pytest>=8.4.2", 58 | "pytest-dependency>=0.6.0", 59 | "typeguard>=4.4.4", 60 | "xdoctest[colors]>=1.3.0", 61 | "Pygments>=2.19.2" 62 | ] 63 | 64 | docs = [ 65 | "aiohttp>=3.13.2", 66 | "autodoc-pydantic>=2.2.0", 67 | "furo>=2025.9.25", 68 | "linkify-it-py>=2.0.3", 69 | "matplotlib>=3.10.7", 70 | "myst-nb>=1.3.0", 71 | "sphinx>=8.2.3", 72 | "sphinx-autobuild>=2025.8.25", 73 | "sphinx-click>=6.1.0", 74 | "sphinx-copybutton>=0.5.2", 75 | "sphinx-design>=0.6.1", 76 | "ipywidgets>=8.1.7", 77 | ] 78 | 79 | [tool.uv] 80 | required-version = ">=0.8.17" 81 | 82 | [tool.ruff] 83 | target-version = "py311" 84 | src = ["src"] 85 | line-length = 120 86 | 87 | [tool.ruff.lint] 88 | select = [ 89 | "E", # pycodestyle 90 | "F", # pyflakes 91 | "B", # bugbear 92 | "I", # isort 93 | "UP", # pyupgrade 94 | "N", # pep8-naming 95 | "D", # pydocstyle 96 | "ANN", # annotations 97 | "S", # bandit 98 | "A", # builtins 99 | "C4", # comprehensions 100 | "DTZ", # datetimez 101 | "EM", # errmsg 102 | "ICN", # import-conventions 103 | "PIE", # pie 104 | "PT", # pytest-style 105 | "RSE", # raise 106 | "RET", # return 107 | "SIM", # simplify 108 | "TID", # tidy-imports 109 | "TC", # type-checking 110 | 
"ARG", # unused-arguments 111 | "PTH", # use-pathlib 112 | "TD", # todos 113 | "PL", # pylint 114 | "FLY", # flynt 115 | "NPY", # numpy 116 | "LOG", # logging 117 | "G", # logging-format 118 | "PERF", # perflint 119 | "FA", # flake8-future-annotations 120 | ] 121 | 122 | ignore = [ 123 | "D107", # Missing docstring in __init__ ; should be in class docstring 124 | ] 125 | 126 | [tool.ruff.lint.per-file-ignores] 127 | "tests/*" = ["S101", "PLR2004"] 128 | "tests/integration/test_segy_import_export_masked.py" = ["E501"] 129 | "docs/tutorials/*.ipynb" = ["S101"] 130 | 131 | [tool.ruff.lint.flake8-annotations] 132 | mypy-init-return = true 133 | 134 | [tool.ruff.lint.pydocstyle] 135 | convention = "google" 136 | 137 | [tool.ruff.lint.isort] 138 | force-single-line = true 139 | 140 | [tool.ruff.lint.pycodestyle] 141 | max-line-length = 120 142 | ignore-overlong-task-comments = true 143 | 144 | [tool.pydoclint] 145 | style = "google" 146 | arg-type-hints-in-docstring = false 147 | check-return-types = false 148 | check-yield-types = false 149 | 150 | [tool.coverage.paths] 151 | source = ["src", "*/site-packages"] 152 | tests = ["tests", "*/tests"] 153 | 154 | [tool.coverage.run] 155 | branch = true 156 | source = ["src/mdio", "tests"] 157 | relative_files = true 158 | 159 | [tool.coverage.report] 160 | show_missing = true 161 | fail_under = 85 162 | exclude_also = [ 163 | "if __name__ == __main__:", 164 | "if TYPE_CHECKING:", 165 | "raise NotImplementedError", 166 | ] 167 | 168 | [tool.mypy] 169 | strict = true 170 | warn_unreachable = true 171 | warn_redundant_casts = true 172 | warn_unused_ignores = true 173 | pretty = true 174 | show_column_numbers = true 175 | show_error_codes = true 176 | show_error_context = true 177 | disallow_untyped_defs = true # for strict mypy: (this is the tricky one) 178 | plugins = ["pydantic.mypy", "numpy.typing.mypy_plugin"] 179 | 180 | [tool.pydantic-mypy] 181 | init_forbid_extra = true 182 | init_typed = true 183 | warn_required_dynamic_aliases = true 184 | 185 | [tool.bumpversion] 186 | current_version = "1.1.1" 187 | allow_dirty = true 188 | commit = false 189 | tag = false 190 | parse = "(?P\\d+)\\.(?P\\d+)\\.(?P\\d+)(\\.dev(?P\\d+))?" 191 | serialize = [ 192 | "{major}.{minor}.{patch}.dev{dev}", # For dev releases 193 | "{major}.{minor}.{patch}", # For stable releases 194 | ] 195 | 196 | [tool.uv.build-backend] 197 | module-name = "mdio" 198 | 199 | [build-system] 200 | requires = ["uv_build>=0.8.17,<0.9.0"] 201 | build-backend = "uv_build" 202 | -------------------------------------------------------------------------------- /src/mdio/commands/info.py: -------------------------------------------------------------------------------- 1 | """MDIO Dataset information command.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import TYPE_CHECKING 6 | from typing import Any 7 | 8 | from click import STRING 9 | from click import Choice 10 | from click import argument 11 | from click import command 12 | from click import option 13 | 14 | if TYPE_CHECKING: 15 | from mdio import MDIOReader 16 | from mdio.core import Grid 17 | 18 | 19 | @command(name="info") 20 | @argument("mdio-path", type=STRING) 21 | @option( 22 | "-access", 23 | "--access-pattern", 24 | required=False, 25 | default="012", 26 | help="Access pattern of the file", 27 | type=STRING, 28 | show_default=True, 29 | ) 30 | @option( 31 | "-format", 32 | "--output-format", 33 | required=False, 34 | default="pretty", 35 | help="Output format. 
Pretty console or JSON.", 36 | type=Choice(["pretty", "json"]), 37 | show_default=True, 38 | show_choices=True, 39 | ) 40 | def info(mdio_path: str, output_format: str, access_pattern: str) -> None: 41 | """Provide information on a MDIO dataset. 42 | 43 | By default, this returns human-readable information about the grid and stats for the dataset. 44 | If output-format is set to 'json' then a JSON is returned to facilitate parsing. 45 | """ 46 | # Lazy import to reduce CLI startup time 47 | from mdio import MDIOReader # noqa: PLC0415 48 | 49 | reader = MDIOReader(mdio_path, access_pattern=access_pattern, return_metadata=True) 50 | 51 | grid_dict = parse_grid(reader.grid) 52 | stats_dict = cast_stats(reader.stats) 53 | access_pattern_dict = parse_access_patterns(reader) 54 | 55 | mdio_info = { 56 | "path": mdio_path, 57 | "stats": stats_dict, 58 | "grid": grid_dict, 59 | "access_patterns": access_pattern_dict, 60 | } 61 | 62 | if output_format == "pretty": 63 | pretty_print(mdio_info) 64 | 65 | if output_format == "json": 66 | json_print(mdio_info) 67 | 68 | 69 | def cast_stats(stats_dict: dict[str, Any]) -> dict[str, float]: 70 | """Normalize all floats to JSON serializable floats.""" 71 | return {k: float(v) for k, v in stats_dict.items()} 72 | 73 | 74 | def parse_grid(grid: Grid) -> dict[str, dict[str, int | str]]: 75 | """Extract grid information per dimension.""" 76 | grid_dict = {} 77 | for dim_name in grid.dim_names: 78 | dim = grid.select_dim(dim_name) 79 | min_ = str(dim.coords[0]) 80 | max_ = str(dim.coords[-1]) 81 | size = str(dim.coords.shape[0]) 82 | grid_dict[dim_name] = {"name": dim_name, "min": min_, "max": max_, "size": size} 83 | return grid_dict 84 | 85 | 86 | def parse_access_patterns(reader: MDIOReader) -> dict[str, Any]: 87 | """Extract access patterns and their info.""" 88 | access_pattern_dict = {} 89 | for name, array in reader._data_group.arrays(): 90 | pattern = name.replace("chunked_", "") 91 | chunks = str(array.chunks) 92 | format_ = str(array.dtype) 93 | compressors = str(array.compressors) 94 | access_pattern_dict[pattern] = { 95 | "chunks": chunks, 96 | "format": format_, 97 | "compressor(s)": compressors, 98 | } 99 | 100 | return access_pattern_dict 101 | 102 | 103 | def json_print(mdio_info: dict[str, Any]) -> None: 104 | """Convert MDIO Info to JSON and pretty print.""" 105 | # Lazy import to reduce CLI startup time 106 | from json import dumps as json_dumps # noqa: PLC0415 107 | 108 | from rich import print # noqa: A004, PLC0415 109 | 110 | print(json_dumps(mdio_info, indent=2)) 111 | 112 | 113 | def pretty_print(mdio_info: dict[str, Any]) -> None: 114 | """Print pretty MDIO Info table to console.""" 115 | # Lazy import to reduce CLI startup time 116 | from rich.console import Console # noqa: PLC0415 117 | from rich.table import Table # noqa: PLC0415 118 | 119 | console = Console() 120 | 121 | grid_table = Table(show_edge=False) 122 | grid_table.add_column("Dimension", justify="right", style="cyan", no_wrap=True) 123 | grid_table.add_column("Min", justify="left", style="magenta") 124 | grid_table.add_column("Max", justify="left", style="magenta") 125 | grid_table.add_column("Size", justify="left", style="green") 126 | 127 | for axis_dict in mdio_info["grid"].values(): 128 | name, min_, max_, size = axis_dict.values() 129 | grid_table.add_row(name, min_, max_, size) 130 | 131 | stat_table = Table(show_edge=False) 132 | stat_table.add_column("Stat", justify="right", style="cyan", no_wrap=True) 133 | stat_table.add_column("Value", justify="left", 
style="magenta") 134 | 135 | for stat, value in mdio_info["stats"].items(): 136 | stat_table.add_row(stat, f"{value:.4f}") 137 | 138 | access_patter_table = Table(show_edge=False) 139 | access_patter_table.add_column("Pattern", justify="right", style="cyan", no_wrap=True) 140 | access_patter_table.add_column("Chunks", justify="left", style="magenta") 141 | access_patter_table.add_column("Format", justify="left", style="magenta") 142 | access_patter_table.add_column("Compressor", justify="left", style="magenta") 143 | 144 | for name, pattern_info in mdio_info["access_patterns"].items(): 145 | chunks, format_, compressor = pattern_info.values() 146 | access_patter_table.add_row(name, chunks, format_, compressor) 147 | 148 | master_table = Table(title=f"File Information for {mdio_info['path']}") 149 | master_table.add_column("Grid", justify="center") 150 | master_table.add_column("Statistics", justify="center") 151 | master_table.add_column("Access Patterns", justify="center") 152 | master_table.add_row(grid_table, stat_table, access_patter_table) 153 | 154 | console.print(master_table) 155 | 156 | 157 | cli = info 158 | -------------------------------------------------------------------------------- /src/mdio/core/grid.py: -------------------------------------------------------------------------------- 1 | """Grid abstraction with serializers.""" 2 | 3 | from __future__ import annotations 4 | 5 | from dataclasses import dataclass 6 | from typing import TYPE_CHECKING 7 | 8 | import numpy as np 9 | import zarr 10 | from numcodecs.zarr3 import Blosc 11 | from zarr.codecs import BloscCodec 12 | 13 | from mdio.constants import UINT32_MAX 14 | from mdio.constants import ZarrFormat 15 | from mdio.core.utils_write import get_constrained_chunksize 16 | 17 | if TYPE_CHECKING: 18 | from segy.arrays import HeaderArray 19 | from zarr import Array as ZarrArray 20 | 21 | from mdio.core import Dimension 22 | 23 | 24 | @dataclass 25 | class Grid: 26 | """N-dimensional grid class for managing bounds and increments. 27 | 28 | This class encapsulates an N-dimensional grid, storing dimension information and optional 29 | mapping and live mask arrays for trace indexing. It provides access to dimension names, shape, 30 | and number of dimensions as computed attributes. 31 | 32 | Args: 33 | dims: List of Dimension instances defining the grid axes. 34 | map: Optional Zarr array for trace mapping. Defaults to None. 35 | live_mask: Optional Zarr array indicating live traces. Defaults to None. 36 | 37 | Attributes: 38 | dims: List of Dimension instances defining the grid axes. 39 | map: Optional Zarr array for trace mapping, or None if not set. 40 | live_mask: Optional Zarr array indicating live traces, or None if not set. 41 | 42 | Notes: 43 | Computed attributes available after initialization: 44 | - `dim_names`: Tuple of dimension names. 45 | - `shape`: Tuple of dimension sizes. 46 | - `ndim`: Number of dimensions. 
47 | 48 | Example: 49 | >>> from mdio.core import Dimension 50 | >>> dims = [Dimension(name="x", min=0, max=100, step=10)] 51 | >>> grid = Grid(dims) 52 | >>> grid.dim_names 53 | ('x',) 54 | >>> grid.shape 55 | (11,) 56 | """ 57 | 58 | dims: list[Dimension] 59 | map: ZarrArray | None = None 60 | live_mask: ZarrArray | None = None 61 | 62 | _TARGET_MEMORY_PER_BATCH = 1 * 1024**3 # 1GB target for batch processing 63 | _INTERNAL_CHUNK_SIZE_TARGET = 10 * 1024**2 # 10MB target for chunks 64 | 65 | def __post_init__(self) -> None: 66 | """Initialize derived attributes.""" 67 | self.dim_names = tuple(dim.name for dim in self.dims) 68 | self.shape = tuple(dim.size for dim in self.dims) 69 | self.ndim = len(self.dims) 70 | 71 | def __getitem__(self, item: int) -> Dimension: 72 | """Get a dimension by index.""" 73 | return self.dims[item] 74 | 75 | def __setitem__(self, key: int, value: Dimension) -> None: 76 | """Set a dimension by index.""" 77 | self.dims[key] = value 78 | 79 | def select_dim(self, name: str) -> Dimension: 80 | """Get a dimension by name.""" 81 | if name not in self.dim_names: 82 | msg = f"Invalid dimension name '{name}'. Available dimensions: {self.dim_names}." 83 | raise ValueError(msg) 84 | index = self.dim_names.index(name) 85 | return self.dims[index] 86 | 87 | def get_min(self, name: str) -> float: 88 | """Get minimum value of a dimension by name.""" 89 | return self.select_dim(name).min().item() 90 | 91 | def get_max(self, name: str) -> float: 92 | """Get maximum value of a dimension by name.""" 93 | return self.select_dim(name).max().item() 94 | 95 | def build_map(self, index_headers: HeaderArray) -> None: 96 | """Build trace mapping and live mask from header indices. 97 | 98 | Args: 99 | index_headers: Header array containing dimension indices. 
100 | """ 101 | # Determine data type for map based on grid size 102 | grid_size = np.prod(self.shape[:-1], dtype=np.uint64) 103 | map_dtype = np.uint64 if grid_size > UINT32_MAX else np.uint32 104 | fill_value = np.iinfo(map_dtype).max 105 | 106 | # Initialize Zarr arrays 107 | live_shape = self.shape[:-1] 108 | chunks = get_constrained_chunksize( 109 | shape=live_shape, 110 | dtype=map_dtype, 111 | max_bytes=self._INTERNAL_CHUNK_SIZE_TARGET, 112 | ) 113 | 114 | zarr_format = zarr.config.get("default_zarr_format") 115 | 116 | common_kwargs = {"shape": live_shape, "chunks": chunks, "store": None} 117 | if zarr_format == ZarrFormat.V2: 118 | common_kwargs["compressors"] = Blosc(cname="zstd") 119 | else: 120 | common_kwargs["compressors"] = BloscCodec(cname="zstd") 121 | 122 | self.map = zarr.create_array(fill_value=fill_value, dtype=map_dtype, **common_kwargs) 123 | self.live_mask = zarr.create_array(fill_value=0, dtype=bool, **common_kwargs) 124 | 125 | # Calculate batch size 126 | memory_per_trace_index = index_headers.itemsize 127 | batch_size = max(1, int(self._TARGET_MEMORY_PER_BATCH / memory_per_trace_index)) 128 | total_live_traces = index_headers.size 129 | 130 | # Process headers in batches 131 | for start in range(0, total_live_traces, batch_size): 132 | end = min(start + batch_size, total_live_traces) 133 | live_dim_indices = [] 134 | 135 | # Compute indices for the batch 136 | for dim in self.dims[:-1]: 137 | dim_hdr = index_headers[dim.name][start:end] 138 | indices = np.searchsorted(dim, dim_hdr).astype(np.uint32) 139 | live_dim_indices.append(indices) 140 | live_dim_indices = tuple(live_dim_indices) 141 | 142 | # Assign trace indices 143 | trace_indices = np.arange(start, end, dtype=np.uint64) 144 | 145 | self.map.vindex[live_dim_indices] = trace_indices 146 | self.live_mask.vindex[live_dim_indices] = True 147 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, caste, color, religion, or sexual 10 | identity and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | - Demonstrating empathy and kindness toward other people 21 | - Being respectful of differing opinions, viewpoints, and experiences 22 | - Giving and gracefully accepting constructive feedback 23 | - Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | - Focusing on what is best not just for us as individuals, but for the overall 26 | community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | - The use of sexualized language or imagery, and sexual attention or advances of 31 | any kind 32 | - Trolling, insulting or derogatory comments, and personal or political attacks 33 | - Public or private harassment 34 | - Publishing others' private information, such as a physical or email address, 35 | without their explicit permission 36 | - Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | [opensource@tgs.com](mailto:opensource@tgs.com). 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series of 86 | actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. 
This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or permanent 93 | ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within the 113 | community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.1, available at 119 | [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. 120 | 121 | Community Impact Guidelines were inspired by 122 | [Mozilla's code of conduct enforcement ladder][mozilla coc]. 123 | 124 | For answers to common questions about this code of conduct, see the FAQ at 125 | [https://www.contributor-covenant.org/faq][faq]. Translations are available at 126 | [https://www.contributor-covenant.org/translations][translations]. 127 | 128 | [homepage]: https://www.contributor-covenant.org 129 | [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html 130 | [mozilla coc]: https://github.com/mozilla/diversity 131 | [faq]: https://www.contributor-covenant.org/faq 132 | [translations]: https://www.contributor-covenant.org/translations 133 | -------------------------------------------------------------------------------- /tests/unit/v1/test_dataset_builder_add_coordinate.py: -------------------------------------------------------------------------------- 1 | """Tests the schema v1 dataset_builder.add_coordinate() public API.""" 2 | 3 | import pytest 4 | from zarr.codecs import BloscCname 5 | 6 | from mdio.builder.dataset_builder import MDIODatasetBuilder 7 | from mdio.builder.dataset_builder import _BuilderState 8 | from mdio.builder.schemas.compressors import Blosc 9 | from mdio.builder.schemas.dtype import ScalarType 10 | from mdio.builder.schemas.v1.units import LengthUnitEnum 11 | from mdio.builder.schemas.v1.units import LengthUnitModel 12 | from mdio.builder.schemas.v1.variable import CoordinateMetadata 13 | from mdio.builder.schemas.v1.variable import VariableMetadata 14 | 15 | from .helpers import validate_builder 16 | from .helpers import validate_coordinate 17 | from .helpers import validate_variable 18 | 19 | 20 | def test_add_coordinate() -> None: 21 | """Test adding coordinates. 
Check the state transition and validate required parameters.""" 22 | builder = MDIODatasetBuilder("test_dataset") 23 | assert builder._state == _BuilderState.INITIAL 24 | 25 | msg = "Must add at least one dimension before adding coordinates" 26 | with pytest.raises(ValueError, match=msg): 27 | builder.add_coordinate("cdp", dimensions=("inline", "crossline"), data_type=ScalarType.FLOAT32) 28 | 29 | builder.add_dimension("inline", 100) 30 | builder.add_dimension("crossline", 200) 31 | 32 | # Validate required parameters 33 | bad_name = None 34 | with pytest.raises(ValueError, match="'name' must be a non-empty string"): 35 | builder.add_coordinate(bad_name, dimensions=("speed",), data_type=ScalarType.FLOAT32) 36 | with pytest.raises(ValueError, match="'name' must be a non-empty string"): 37 | builder.add_coordinate("", dimensions=("speed",), data_type=ScalarType.FLOAT32) 38 | with pytest.raises(ValueError, match="'dimensions' must be a non-empty list"): 39 | builder.add_coordinate("cdp_x", dimensions=None, data_type=ScalarType.FLOAT32) 40 | with pytest.raises(ValueError, match="'dimensions' must be a non-empty list"): 41 | builder.add_coordinate("cdp_x", dimensions=(), data_type=ScalarType.FLOAT32) 42 | 43 | # Add a variable using non-existent dimensions 44 | msg = "Pre-existing dimension named 'xline' is not found" 45 | with pytest.raises(ValueError, match=msg): 46 | builder.add_coordinate("bad_cdp-x", dimensions=("inline", "xline"), data_type=ScalarType.FLOAT32) 47 | 48 | # Validate state transition 49 | builder.add_coordinate("cdp_x", dimensions=("inline", "crossline"), data_type=ScalarType.FLOAT32) 50 | validate_builder(builder, _BuilderState.HAS_COORDINATES, n_dims=2, n_coords=1, n_var=1) 51 | validate_variable( 52 | builder, 53 | name="cdp_x", 54 | dims=[("inline", 100), ("crossline", 200)], 55 | coords=["cdp_x"], 56 | dtype=ScalarType.FLOAT32, 57 | ) 58 | 59 | # Adding coordinate with the same name twice 60 | msg = "Adding coordinate with the same name twice is not allowed" 61 | with pytest.raises(ValueError, match=msg): 62 | builder.add_coordinate("cdp_x", dimensions=("inline", "crossline"), data_type=ScalarType.FLOAT32) 63 | 64 | 65 | def test_add_coordinate_with_defaults() -> None: 66 | """Test adding coordinates with default arguments.""" 67 | builder = MDIODatasetBuilder("test_dataset") 68 | builder.add_dimension("inline", 100) 69 | builder.add_dimension("crossline", 200) 70 | 71 | # Add coordinate using defaults 72 | builder.add_coordinate("cdp", dimensions=("inline", "crossline"), data_type=ScalarType.FLOAT32) 73 | validate_builder(builder, _BuilderState.HAS_COORDINATES, n_dims=2, n_coords=1, n_var=1) 74 | validate_coordinate(builder, name="cdp", dims=[("inline", 100), ("crossline", 200)], dtype=ScalarType.FLOAT32) 75 | v = validate_variable( 76 | builder, 77 | name="cdp", 78 | dims=[("inline", 100), ("crossline", 200)], 79 | coords=["cdp"], 80 | dtype=ScalarType.FLOAT32, 81 | ) 82 | assert v.long_name is None # Default value 83 | assert v.compressor is None # Default value 84 | assert v.metadata is None # Default value 85 | 86 | 87 | def test_coordinate_with_full_parameters() -> None: 88 | """Test adding coordinates with all metadata.""" 89 | builder = MDIODatasetBuilder("test_dataset") 90 | builder.add_dimension("inline", 100) 91 | builder.add_dimension("crossline", 200) 92 | 93 | # Add coordinate with all metadata 94 | metadata = CoordinateMetadata( 95 | units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT), 96 | attributes={"MGA": 51, "UnitSystem": "Imperial"}, 97 | ) 98 | 
builder.add_coordinate( 99 | "cdp", 100 | long_name="Common Depth Point", 101 | dimensions=("inline", "crossline"), 102 | data_type=ScalarType.FLOAT16, 103 | compressor=Blosc(cname=BloscCname.zstd), 104 | metadata=metadata, 105 | ) 106 | validate_builder(builder, _BuilderState.HAS_COORDINATES, n_dims=2, n_coords=1, n_var=1) 107 | c = validate_coordinate(builder, name="cdp", dims=[("inline", 100), ("crossline", 200)], dtype=ScalarType.FLOAT16) 108 | assert c.long_name == "Common Depth Point" 109 | assert isinstance(c.compressor, Blosc) 110 | assert c.compressor.cname == BloscCname.zstd 111 | assert c.metadata.attributes["MGA"] == 51 112 | assert c.metadata.attributes["UnitSystem"] == "Imperial" 113 | assert c.metadata.units_v1.length == LengthUnitEnum.FOOT 114 | v = validate_variable( 115 | builder, 116 | name="cdp", 117 | dims=[("inline", 100), ("crossline", 200)], 118 | coords=["cdp"], 119 | dtype=ScalarType.FLOAT16, 120 | ) 121 | assert isinstance(v.compressor, Blosc) 122 | assert v.compressor.cname == BloscCname.zstd 123 | assert isinstance(v.metadata, VariableMetadata) 124 | assert v.metadata.units_v1.length == LengthUnitEnum.FOOT 125 | assert v.metadata.attributes["MGA"] == 51 126 | assert v.metadata.attributes["UnitSystem"] == "Imperial" 127 | --------------------------------------------------------------------------------
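The tests above exercise the `MDIODatasetBuilder.add_coordinate()` contract: dimensions must be registered first, coordinates then reference them, and duplicate names are rejected. A minimal usage sketch distilled from those same calls (illustrative only, not a file in this repository; the dataset name is arbitrary) looks like:

```python
# Minimal sketch based on the API calls shown in the unit tests above.
from mdio.builder.dataset_builder import MDIODatasetBuilder
from mdio.builder.schemas.dtype import ScalarType

builder = MDIODatasetBuilder("example_dataset")
builder.add_dimension("inline", 100)  # dimensions must exist before any coordinate
builder.add_dimension("crossline", 200)
builder.add_coordinate(  # a non-dimension coordinate defined over both dimensions
    "cdp_x",
    dimensions=("inline", "crossline"),
    data_type=ScalarType.FLOAT32,
)
```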