├── requirements ├── 3d.txt ├── opencv.txt ├── depth_estimation.txt ├── torch.txt ├── base.txt ├── test.txt ├── embedding_estimation.txt └── preparation.txt ├── src └── nicr_scene_analysis_datasets │ ├── utils │ ├── __init__.py │ ├── misc.py │ ├── testing.py │ ├── img.py │ ├── io.py │ └── _colormaps.py │ ├── datasets │ ├── __init__.py │ ├── nyuv2 │ │ ├── __init__.py │ │ ├── splits.mat │ │ ├── class13Mapping.mat │ │ ├── classMapping40.mat │ │ └── README.md │ ├── scannet │ │ ├── __init__.py │ │ ├── scannetv2_test.txt │ │ ├── README.md │ │ └── scannetv2_val.txt │ ├── sunrgbd │ │ ├── __init__.py │ │ ├── legacy_emsanet_version │ │ │ ├── __init__.py │ │ │ ├── nyu_weak_box_3d_mapping.json │ │ │ └── nyu_additional_class_mapping.json │ │ ├── nyu_weak_box_3d_mapping.json │ │ ├── nyu_additional_class_mapping.json │ │ └── README.md │ ├── cityscapes │ │ ├── __init__.py │ │ ├── README.md │ │ └── cityscapes.py │ ├── hypersim │ │ ├── __init__.py │ │ └── README.md │ ├── scenenetrgbd │ │ ├── __init__.py │ │ ├── README.md │ │ ├── scenenetrgbd.py │ │ ├── scenenet.proto │ │ └── dataset.py │ ├── ade20k │ │ ├── __init__.py │ │ ├── README.md │ │ └── _class_mappings.py │ └── coco │ │ ├── __init__.py │ │ ├── README.md │ │ └── dataset.py │ ├── scripts │ ├── __init__.py │ ├── prepare_dataset.py │ └── common.py │ ├── mira │ ├── __init__.py │ └── utils.py │ ├── dataset_base │ ├── _meta.py │ ├── _rgbd_dataset.py │ ├── _rgb_dataset.py │ ├── __init__.py │ ├── _config.py │ ├── _annotation.py │ ├── _class_weighting.py │ └── _concat_dataset.py │ ├── auxiliary_data │ ├── __init__.py │ ├── embedding_estimation │ │ ├── __init__.py │ │ └── _base.py │ ├── depth_estimation │ │ ├── __init__.py │ │ └── _base.py │ ├── _config.py │ └── _base.py │ ├── d2 │ ├── __init__.py │ └── _auto_init.py │ ├── version.py │ ├── __init__.py │ └── pytorch.py ├── .coveragerc ├── .vscode ├── settings.json └── launch.json ├── tests ├── conftest.py ├── test_scenenetrgbd.py ├── test_embedding_estimation.py ├── test_cityscapes.py ├── test_coco.py ├── test_d2.py ├── test_depth_estimation.py ├── test_ade20k.py ├── test_nyuv2.py ├── test_concat.py ├── test_sunrgbd.py ├── test_hypersim.py └── test_scannet.py ├── .gitignore ├── pyproject.toml └── .gitlab-ci.yml /requirements/3d.txt: -------------------------------------------------------------------------------- 1 | open3d 2 | plyfile 3 | -------------------------------------------------------------------------------- /requirements/opencv.txt: -------------------------------------------------------------------------------- 1 | opencv-python 2 | -------------------------------------------------------------------------------- /requirements/depth_estimation.txt: -------------------------------------------------------------------------------- 1 | transformers>=4.45.0 2 | -------------------------------------------------------------------------------- /requirements/torch.txt: -------------------------------------------------------------------------------- 1 | torch>=2.3.1 # DepthAnythingV2 requires torch.nn.RMSNorm 2 | -------------------------------------------------------------------------------- /requirements/base.txt: -------------------------------------------------------------------------------- 1 | cityscapesScripts==1.5.0 2 | numpy 3 | pillow 4 | scipy 5 | tqdm>=4.42.0 6 | -------------------------------------------------------------------------------- /requirements/test.txt: -------------------------------------------------------------------------------- 1 | panopticapi @ 
git+https://github.com/cocodataset/panopticapi.git 2 | pytest>=3.0.2 3 | -------------------------------------------------------------------------------- /requirements/embedding_estimation.txt: -------------------------------------------------------------------------------- 1 | loralib 2 | alpha-clip @ git+https://github.com/SunzeY/AlphaCLIP.git 3 | gdown 4 | -------------------------------------------------------------------------------- /requirements/preparation.txt: -------------------------------------------------------------------------------- 1 | h5py 2 | numba 3 | pandas 4 | panopticapi @ git+https://github.com/cocodataset/panopticapi.git 5 | protobuf 6 | termcolor 7 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Daniel Seichter 4 | """ 5 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Daniel Seichter 4 | """ 5 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Daniel Seichter 4 | """ 5 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/nyuv2/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Daniel Seichter 4 | """ 5 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/scannet/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Daniel Seichter 4 | """ 5 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/sunrgbd/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Daniel Seichter 4 | """ 5 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/cityscapes/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Daniel Seichter 4 | """ 5 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/hypersim/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Marius Engelhardt 4 | """ 5 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/scenenetrgbd/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. 
codeauthor:: Daniel Seichter 4 | """ 5 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/ade20k/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Soehnke Fischedick 4 | """ 5 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/coco/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Soehnke Fischedick 4 | """ 5 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/nyuv2/splits.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TUI-NICR/nicr-scene-analysis-datasets/HEAD/src/nicr_scene_analysis_datasets/datasets/nyuv2/splits.mat -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/sunrgbd/legacy_emsanet_version/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Daniel Seichter 4 | """ 5 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/nyuv2/class13Mapping.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TUI-NICR/nicr-scene-analysis-datasets/HEAD/src/nicr_scene_analysis_datasets/datasets/nyuv2/class13Mapping.mat -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/nyuv2/classMapping40.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TUI-NICR/nicr-scene-analysis-datasets/HEAD/src/nicr_scene_analysis_datasets/datasets/nyuv2/classMapping40.mat -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit= 3 | */prepare_dataset.py 4 | */datasets/scannet/SensorData.py 5 | */datasets/scenenetrgbd/scenenet_pb2.py 6 | */datasets/sunrgbd/match_nyuv2_instances.py 7 | */datasets/sunrgbd/prepare_instances.py 8 | */mira/* 9 | */scripts/* 10 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/mira/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Daniel Seichter 4 | """ 5 | from ._hypersim_reader import HypersimReaderBase # noqa: F401 6 | from ._scannet_reader import ScanNetReaderBase # noqa: F401 7 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/dataset_base/_meta.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. 
codeauthor:: Daniel Seichter 4 | """ 5 | from dataclasses import dataclass 6 | 7 | 8 | @dataclass(frozen=True) 9 | class DepthStats: 10 | min: float 11 | max: float 12 | mean: float 13 | std: float 14 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/sunrgbd/nyu_weak_box_3d_mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "books": ["bookshelf"], 3 | "shower curtain": ["curtain"], 4 | 5 | "counter": ["sink"], 6 | "shelves": ["bookshelf", "cabinet"], 7 | "floor mat": ["otherprop"], 8 | "ceiling": ["otherstructure"], 9 | "paper": ["box"], 10 | "bag": ["bag"] 11 | } -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/sunrgbd/legacy_emsanet_version/nyu_weak_box_3d_mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "books": ["bookshelf"], 3 | "shower curtain": ["curtain"], 4 | 5 | "counter": ["sink"], 6 | "shelves": ["bookshelf", "cabinet"], 7 | "floor mat": ["otherprop"], 8 | "ceiling": ["otherstructure"], 9 | "paper": ["box"], 10 | "bag": ["bag"] 11 | } -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/auxiliary_data/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Soehnke Fischedick 4 | """ 5 | from ._config import DatasetConfigWithAuxiliary 6 | from ._config import build_dataset_config_with_auxiliary 7 | from ._dataset import _AuxiliaryDataset 8 | from ._dataset import wrap_dataset_with_auxiliary_data 9 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/d2/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Soehnke Fischedick 4 | """ 5 | 6 | from . import _auto_init # noqa: F401 7 | from .utils import NICRChainedDatasetMapper # noqa: F401 8 | from .utils import NICRSceneAnalysisDatasetMapper # noqa: F401 9 | from .utils import register_dataset_to_d2 # noqa: F401 10 | from .utils import set_dataset_path # noqa: F401 11 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/utils/misc.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Daniel Seichter 4 | """ 5 | from functools import lru_cache 6 | from functools import partialmethod 7 | 8 | 9 | @lru_cache() 10 | def partial_class(cls, *args, **kwargs): 11 | # modified version of: https://stackoverflow.com/a/38911383 12 | if args or kwargs: 13 | 14 | class PartialClass(cls): 15 | __init__ = partialmethod(cls.__init__, *args, **kwargs) 16 | 17 | return PartialClass 18 | else: 19 | return cls 20 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/utils/testing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Daniel Seichter 4 | """ 5 | import os 6 | 7 | from .. 
import KNOWN_DATASETS 8 | from ..version import get_version 9 | 10 | 11 | DATASET_BASEPATH = os.environ.get( 12 | 'NICR_SA_DATASET_BASEPATH', 13 | os.path.join('/datasets_nas/nicr_scene_analysis_datasets/', 14 | 'version_{}{}{}'.format(*get_version(with_suffix=False))) 15 | ) 16 | 17 | DATASET_PATH_DICT = { 18 | key: os.path.join(DATASET_BASEPATH, key) 19 | for key in KNOWN_DATASETS 20 | } 21 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.testing.pytestArgs": [ 3 | ".", 4 | "-vvs", 5 | ], 6 | "python.testing.unittestEnabled": false, 7 | "python.testing.pytestEnabled": true, 8 | "python.linting.pycodestyleEnabled": true, 9 | "python.linting.enabled": true, 10 | // disable annoying top-level source code modification indication 11 | "gitlens.codeLens.authors.enabled": false, 12 | "gitlens.codeLens.recentChange.enabled": false, 13 | "files.trimTrailingWhitespace": true, 14 | "[markdown]": { 15 | "files.trimTrailingWhitespace": false 16 | }, 17 | "cSpell.words": [ 18 | "codeauthor" 19 | ], 20 | } -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/dataset_base/_rgbd_dataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Daniel Seichter 4 | .. codeauthor:: Soehnke Fischedick 5 | """ 6 | from typing import Tuple 7 | 8 | from ._depth_dataset import DepthDataset 9 | from ._rgb_dataset import RGBDataset 10 | 11 | 12 | class RGBDDataset(RGBDataset, DepthDataset): 13 | def __init__( 14 | self, 15 | depth_mode: str = 'raw', 16 | sample_keys: Tuple[str] = ('rgb', 'depth', 'semantic'), 17 | use_cache: bool = False, 18 | **kwargs 19 | ) -> None: 20 | super().__init__( 21 | depth_mode=depth_mode, 22 | sample_keys=sample_keys, 23 | use_cache=use_cache, 24 | **kwargs 25 | ) 26 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Daniel Seichter 4 | """ 5 | import os 6 | import shutil 7 | 8 | import pytest 9 | 10 | 11 | def pytest_addoption(parser): 12 | parser.addoption('--keep-files', action='store_true') 13 | 14 | 15 | @pytest.fixture(scope='session') 16 | def keep_files(request): 17 | return request.config.getoption('--keep-files') 18 | 19 | 20 | @pytest.fixture(scope='session') 21 | def tmp_path(tmpdir_factory, keep_files): 22 | # see: https://docs.pytest.org/en/6.2.x/reference.html#tmpdir-factory 23 | # use '--basetemp' to change default path 24 | # -> BE AWARE <- --basetemp is cleared on start !!! 25 | 26 | path = tmpdir_factory.mktemp('nicr_scene_analysis_datasets') 27 | print(f"\nWriting temporary files to '{path}'") 28 | if keep_files: 29 | print("Files are kept and require to be deleted manually!") 30 | 31 | yield path 32 | 33 | # teardown (delete if it was created) 34 | if os.path.exists(path) and not keep_files: 35 | shutil.rmtree(path) 36 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/dataset_base/_rgb_dataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Daniel Seichter 4 | .. 
codeauthor:: Soehnke Fischedick 5 | """ 6 | from typing import Tuple 7 | 8 | import abc 9 | 10 | import numpy as np 11 | 12 | from ._annotation import IntrinsicCameraParametersNormalized 13 | from ._base_dataset import DatasetBase 14 | 15 | 16 | class RGBDataset(DatasetBase): 17 | def __init__( 18 | self, 19 | sample_keys: Tuple[str] = ('rgb', 'semantic'), 20 | use_cache: bool = False, 21 | **kwargs 22 | ) -> None: 23 | super().__init__( 24 | sample_keys=sample_keys, 25 | use_cache=use_cache, 26 | **kwargs 27 | ) 28 | 29 | @abc.abstractmethod 30 | def _load_rgb(self, idx) -> np.ndarray: 31 | pass 32 | 33 | def _load_rgb_intrinsics(self, idx) -> IntrinsicCameraParametersNormalized: 34 | # so far, only few datasets support intrinsics, thus, we define a 35 | # default here 36 | raise NotImplementedError() 37 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/scripts/prepare_dataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Daniel Seichter 4 | """ 5 | import argparse as ap 6 | import importlib 7 | 8 | from .. import KNOWN_DATASETS 9 | 10 | 11 | def main(): 12 | # parse args 13 | parser = ap.ArgumentParser( 14 | formatter_class=ap.ArgumentDefaultsHelpFormatter, 15 | description="Prepare a dataset for scene analysis." 16 | ) 17 | subparsers = parser.add_subparsers( 18 | help='Dataset to prepare.', 19 | dest='dataset', 20 | required=True 21 | ) 22 | for dataset in KNOWN_DATASETS: 23 | subparsers.add_parser(dataset, add_help=False) # redirect help 24 | 25 | parsed_args, remaining_args = parser.parse_known_args() 26 | 27 | # import dataset module 28 | dataset_module = importlib.import_module( 29 | name=f'..datasets.{parsed_args.dataset}.prepare_dataset', 30 | package=__package__ 31 | ) 32 | 33 | # run prepare function 34 | dataset_module.main(remaining_args) 35 | 36 | 37 | if __name__ == '__main__': 38 | main() 39 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/version.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Daniel Seichter 4 | """ 5 | import os 6 | import subprocess 7 | import warnings 8 | 9 | _VERSION_MAJOR = 0 10 | _VERSION_MINOR = 8 11 | _VERSION_MICRO = 3 12 | 13 | 14 | def get_version(with_suffix=False): # pragma no cover 15 | if with_suffix: 16 | try: 17 | suffix = subprocess.check_output( 18 | ['git', 'describe', '--always', '--dirty'], 19 | cwd=os.path.abspath(os.path.dirname(__file__)) 20 | ) 21 | suffix = suffix.decode().strip() 22 | # replace - with . to be PEP440 compliant, 23 | # e.g., d2c4396-dirty -> d2c4396.dirty 24 | suffix = suffix.replace('-', '.') 25 | except Exception: 26 | warnings.warn("Cannot determine version suffix using git.") 27 | suffix = '' 28 | 29 | return _VERSION_MAJOR, _VERSION_MINOR, _VERSION_MICRO, suffix 30 | 31 | else: 32 | return _VERSION_MAJOR, _VERSION_MINOR, _VERSION_MICRO 33 | 34 | 35 | __version__ = '{}.{}.{}'.format(*get_version(with_suffix=False)) 36 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/auxiliary_data/embedding_estimation/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. 
codeauthor:: Soehnke Fischedick 4 | """ 5 | from typing import Union 6 | 7 | from ...utils.misc import partial_class 8 | from .alpha_clip import AlphaCLIPEmbeddingEstimator 9 | 10 | 11 | _EMBEDDING_ESTIMATORS = { 12 | # Alpha-CLIP 13 | AlphaCLIPEmbeddingEstimator.NAME: AlphaCLIPEmbeddingEstimator, 14 | } 15 | # add all variants of each base class as well 16 | for cls in list(_EMBEDDING_ESTIMATORS.values()): 17 | if issubclass(cls, AlphaCLIPEmbeddingEstimator): 18 | for model_name in cls.MODEL_LOOKUP_DICT.keys(): 19 | n = f"{cls.NAME}__{model_name}" 20 | _EMBEDDING_ESTIMATORS[n] = partial_class(cls, model_name=model_name) 21 | 22 | KNOWN_EMBEDDING_ESTIMATORS = tuple(sorted(_EMBEDDING_ESTIMATORS.keys())) 23 | 24 | EMBEDDING_ESTIMATOR_TYPE = Union[ 25 | AlphaCLIPEmbeddingEstimator, 26 | ] 27 | 28 | 29 | def get_embedding_estimator_class(name: str) -> EMBEDDING_ESTIMATOR_TYPE: 30 | # force lowercase 31 | name = name.lower() 32 | 33 | cls = _EMBEDDING_ESTIMATORS.get(name, None) 34 | if cls is None: 35 | raise ValueError(f"Unknown embedding estimator: '{name}'") 36 | 37 | return cls 38 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/dataset_base/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Daniel Seichter 4 | """ 5 | from ._annotation import ExtrinsicCameraParametersNormalized # noqa: F401 6 | from ._annotation import IntrinsicCameraParametersNormalized # noqa: F401 7 | from ._annotation import MetaDict # noqa: F401 8 | from ._annotation import OrientationDict # noqa: F401 9 | from ._annotation import PanopticEmbeddingDict # noqa: F401 10 | from ._annotation import SampleIdentifier # noqa: F401 11 | from ._annotation import SceneLabel # noqa: F401 12 | from ._annotation import SceneLabelList # noqa: F401 13 | from ._annotation import SemanticLabel # noqa: F401 14 | from ._annotation import SemanticLabelList # noqa: F401 15 | 16 | from ._class_weighting import KNOWN_CLASS_WEIGHTINGS # noqa: F401 17 | from ._class_weighting import compute_class_weights # noqa: F401 18 | 19 | from ._config import build_dataset_config # noqa: F401 20 | from ._config import DatasetConfig # noqa: F401 21 | 22 | from ._meta import DepthStats # noqa: F401 23 | 24 | from ._base_dataset import DatasetBase # noqa: F401 25 | from ._concat_dataset import ConcatDataset # noqa: F401 26 | from ._depth_dataset import DepthDataset # noqa: F401 27 | from ._rgb_dataset import RGBDataset # noqa: F401 28 | from ._rgbd_dataset import RGBDDataset # noqa: F401 29 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/auxiliary_data/depth_estimation/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. 
codeauthor:: Daniel Seichter 4 | """ 5 | from typing import Union 6 | 7 | from ...utils.misc import partial_class 8 | from .hugging_face import DepthAnythingV2DepthEstimator 9 | from .hugging_face import DinoV2DPTDepthEstimator 10 | from .hugging_face import ZoeDepthDepthEstimator 11 | from .hugging_face import _HuggingFaceDepthEstimator 12 | 13 | 14 | _DEPTH_ESTIMATORS = { 15 | # DepthAnything V2 16 | DepthAnythingV2DepthEstimator.NAME: DepthAnythingV2DepthEstimator, 17 | # ZoeDepth 18 | ZoeDepthDepthEstimator.NAME: ZoeDepthDepthEstimator, 19 | # Dino V2 with Dense Prediction Transformer (DPT) head for depth estimation 20 | DinoV2DPTDepthEstimator.NAME: DinoV2DPTDepthEstimator 21 | } 22 | # add all variants of each base class as well 23 | for cls in list(_DEPTH_ESTIMATORS.values()): 24 | if issubclass(cls, _HuggingFaceDepthEstimator): 25 | for model_name in cls.MODEL_LOOKUP_DICT.keys(): 26 | n = f"{cls.NAME}__{model_name}" 27 | _DEPTH_ESTIMATORS[n] = partial_class(cls, model_name=model_name) 28 | 29 | KNOWN_DEPTH_ESTIMATORS = tuple(sorted(_DEPTH_ESTIMATORS.keys())) 30 | 31 | DEPTH_ESTIMATOR_TYPE = Union[ 32 | DepthAnythingV2DepthEstimator, 33 | ZoeDepthDepthEstimator, 34 | DinoV2DPTDepthEstimator 35 | ] 36 | 37 | 38 | def get_depth_estimator_class(name: str) -> DEPTH_ESTIMATOR_TYPE: 39 | # force lowercase 40 | name = name.lower() 41 | 42 | cls = _DEPTH_ESTIMATORS.get(name, None) 43 | if cls is None: 44 | raise ValueError(f"Unknown depth estimator: '{name}'") 45 | 46 | return cls 47 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/auxiliary_data/embedding_estimation/_base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Soehnke Fischedick 4 | """ 5 | from typing import Union 6 | 7 | import numpy as np 8 | import torch 9 | 10 | from .._base import AuxiliaryDataEstimatorBase 11 | 12 | 13 | UINT16_MAX = np.iinfo('uint16').max 14 | 15 | 16 | class EmbeddingEstimatorBase(AuxiliaryDataEstimatorBase): 17 | NAME: str 18 | 19 | def predict( 20 | self, 21 | rgb_img: Union[torch.Tensor, np.ndarray], 22 | mask_img: Union[torch.Tensor, np.ndarray], 23 | ) -> Union[torch.Tensor, np.ndarray]: 24 | # store input type and original shape for later postprocessing 25 | rgb_is_numpy = isinstance(rgb_img, np.ndarray) 26 | rgb_h, rgb_w = self._get_height_width(rgb_img) 27 | # Ensure that mask only has 0 and 1 values 28 | assert np.all(np.isin(mask_img, [0, 1])) 29 | 30 | mask_h, mask_w = self._get_height_width(mask_img) 31 | assert rgb_h == mask_h and rgb_w == mask_w, \ 32 | f"Input image and mask must have the same shape. " \ 33 | f"Got '{rgb_h}x{rgb_w}' and '{mask_h}x{mask_w}'." 
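        # Illustrative caller-side sketch (hedged; it mirrors
        # tests/test_embedding_estimation.py, and the estimator name is just
        # one entry of KNOWN_EMBEDDING_ESTIMATORS, not a recommendation):
        #
        #   estimator_cls = get_embedding_estimator_class('alpha_clip__l14-336-grit-20m')
        #   estimator = estimator_cls(device='cpu', auto_set_up=True)
        #   mask = np.ones_like(rgb, dtype=np.uint8)[:, :, 0][:, :, None]  # full-image binary mask (H, W, 1)
        #   embedding = estimator.predict(rgb, mask)  # -> array of shape (1, embedding_dim)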
34 | 35 | # prepare the input to have the correct shape 36 | rgb_img = self.prepare_input(rgb_img) 37 | mask_img = self.prepare_input(mask_img) 38 | 39 | # apply estimator 40 | rgb_img = rgb_img.to(self._device).to(torch.float32) 41 | mask_img = mask_img.to(self._device).to(torch.float32) 42 | 43 | predicted_embeddings = self._estimator_predict(rgb_img, mask_img).cpu() 44 | 45 | # convert to numpy 2d array if input was numpy 46 | if rgb_is_numpy: 47 | predicted_embeddings = predicted_embeddings.numpy() 48 | 49 | return predicted_embeddings 50 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/scannet/scannetv2_test.txt: -------------------------------------------------------------------------------- 1 | scene0707_00 2 | scene0708_00 3 | scene0709_00 4 | scene0710_00 5 | scene0711_00 6 | scene0712_00 7 | scene0713_00 8 | scene0714_00 9 | scene0715_00 10 | scene0716_00 11 | scene0717_00 12 | scene0718_00 13 | scene0719_00 14 | scene0720_00 15 | scene0721_00 16 | scene0722_00 17 | scene0723_00 18 | scene0724_00 19 | scene0725_00 20 | scene0726_00 21 | scene0727_00 22 | scene0728_00 23 | scene0729_00 24 | scene0730_00 25 | scene0731_00 26 | scene0732_00 27 | scene0733_00 28 | scene0734_00 29 | scene0735_00 30 | scene0736_00 31 | scene0737_00 32 | scene0738_00 33 | scene0739_00 34 | scene0740_00 35 | scene0741_00 36 | scene0742_00 37 | scene0743_00 38 | scene0744_00 39 | scene0745_00 40 | scene0746_00 41 | scene0747_00 42 | scene0748_00 43 | scene0749_00 44 | scene0750_00 45 | scene0751_00 46 | scene0752_00 47 | scene0753_00 48 | scene0754_00 49 | scene0755_00 50 | scene0756_00 51 | scene0757_00 52 | scene0758_00 53 | scene0759_00 54 | scene0760_00 55 | scene0761_00 56 | scene0762_00 57 | scene0763_00 58 | scene0764_00 59 | scene0765_00 60 | scene0766_00 61 | scene0767_00 62 | scene0768_00 63 | scene0769_00 64 | scene0770_00 65 | scene0771_00 66 | scene0772_00 67 | scene0773_00 68 | scene0774_00 69 | scene0775_00 70 | scene0776_00 71 | scene0777_00 72 | scene0778_00 73 | scene0779_00 74 | scene0780_00 75 | scene0781_00 76 | scene0782_00 77 | scene0783_00 78 | scene0784_00 79 | scene0785_00 80 | scene0786_00 81 | scene0787_00 82 | scene0788_00 83 | scene0789_00 84 | scene0790_00 85 | scene0791_00 86 | scene0792_00 87 | scene0793_00 88 | scene0794_00 89 | scene0795_00 90 | scene0796_00 91 | scene0797_00 92 | scene0798_00 93 | scene0799_00 94 | scene0800_00 95 | scene0801_00 96 | scene0802_00 97 | scene0803_00 98 | scene0804_00 99 | scene0805_00 100 | scene0806_00 101 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/auxiliary_data/_config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. 
codeauthor:: Soehnke Fischedick 4 | """ 5 | import dataclasses 6 | 7 | from ..dataset_base import DatasetConfig 8 | 9 | 10 | @dataclasses.dataclass(frozen=True) 11 | class DatasetConfigWithAuxiliary(DatasetConfig): 12 | semantic_text_embeddings: list = None 13 | scene_text_embeddings: list = None 14 | mean_embedding_per_semantic_class: dict = None 15 | mean_image_embedding_per_semantic_class: dict = None 16 | 17 | 18 | def build_dataset_config_with_auxiliary( 19 | original_config: DatasetConfig, 20 | semantic_text_embeddings: list, 21 | scene_text_embeddings: list, 22 | mean_embedding_per_semantic_class: dict, 23 | mean_image_embedding_per_semantic_class: dict 24 | ) -> DatasetConfigWithAuxiliary: 25 | """ 26 | Creates a new DatasetConfigWithAuxiliary instance by copying attributes 27 | from the original config and adding auxiliary fields. 28 | """ 29 | # Create a new instance of DatasetConfigWithAuxiliary 30 | # Note: We didn't just use dataclasses.asdict(original_config) as it would 31 | # also convert its members to a dict, which is not what we want. 32 | new_config = DatasetConfigWithAuxiliary( 33 | semantic_label_list=original_config.semantic_label_list, 34 | semantic_label_list_without_void=original_config.semantic_label_list_without_void, 35 | scene_label_list=original_config.scene_label_list, 36 | scene_label_list_without_void=original_config.scene_label_list_without_void, 37 | depth_stats=original_config.depth_stats, 38 | semantic_text_embeddings=semantic_text_embeddings, 39 | scene_text_embeddings=scene_text_embeddings, 40 | mean_embedding_per_semantic_class=mean_embedding_per_semantic_class, 41 | mean_image_embedding_per_semantic_class=mean_image_embedding_per_semantic_class 42 | ) 43 | return new_config 44 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. 
codeauthor:: Daniel Seichter 4 | """ 5 | from typing import Type, Union 6 | 7 | 8 | from .utils.imports import install_nicr_scene_analysis_datasets_dependency_import_hooks 9 | 10 | install_nicr_scene_analysis_datasets_dependency_import_hooks() 11 | 12 | 13 | from .auxiliary_data import wrap_dataset_with_auxiliary_data 14 | from .dataset_base import KNOWN_CLASS_WEIGHTINGS 15 | from .dataset_base import ConcatDataset 16 | from .datasets.ade20k.dataset import ADE20K 17 | from .datasets.cityscapes.dataset import Cityscapes 18 | from .datasets.coco.dataset import COCO 19 | from .datasets.hypersim.dataset import Hypersim 20 | from .datasets.nyuv2.dataset import NYUv2 21 | from .datasets.scannet.dataset import ScanNet 22 | from .datasets.scenenetrgbd.dataset import SceneNetRGBD 23 | from .datasets.sunrgbd.dataset import SUNRGBD 24 | 25 | 26 | _DATASETS = { 27 | 'ade20k': ADE20K, 28 | 'cityscapes': Cityscapes, 29 | 'coco': COCO, 30 | 'hypersim': Hypersim, 31 | 'nyuv2': NYUv2, 32 | 'scannet': ScanNet, 33 | 'scenenetrgbd': SceneNetRGBD, 34 | 'sunrgbd': SUNRGBD, 35 | } 36 | KNOWN_DATASETS = tuple(_DATASETS.keys()) 37 | 38 | DatasetType = Union[ 39 | ADE20K, 40 | Cityscapes, 41 | COCO, 42 | Hypersim, 43 | NYUv2, 44 | ScanNet, 45 | SceneNetRGBD, 46 | SUNRGBD, 47 | ConcatDataset 48 | ] 49 | 50 | 51 | def get_dataset_class(name: str, with_auxiliary_data: bool = False) -> Type[DatasetType]: 52 | name = name.lower() 53 | if name not in KNOWN_DATASETS: 54 | raise ValueError(f"Unknown dataset: '{name}'") 55 | original_dataset_class = _DATASETS[name] 56 | if with_auxiliary_data: 57 | current_dataset_class = \ 58 | wrap_dataset_with_auxiliary_data(original_dataset_class) 59 | else: 60 | current_dataset_class = original_dataset_class 61 | 62 | return current_dataset_class 63 | 64 | 65 | from .version import __version__ 66 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/auxiliary_data/depth_estimation/_base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Daniel Seichter 4 | .. codeauthor:: Soehnke Fischedick 5 | """ 6 | from typing import Union 7 | 8 | 9 | import numpy as np 10 | import torch 11 | 12 | from .._base import AuxiliaryDataEstimatorBase 13 | 14 | 15 | UINT16_MAX = np.iinfo('uint16').max 16 | 17 | 18 | class DepthEstimatorBase(AuxiliaryDataEstimatorBase): 19 | NAME: str 20 | 21 | def predict( 22 | self, 23 | rgb_img: Union[torch.Tensor, np.ndarray], 24 | ) -> Union[torch.Tensor, np.ndarray]: 25 | # store input type and original shape for later postprocessing 26 | is_numpy = isinstance(rgb_img, np.ndarray) 27 | h, w = self._get_height_width(rgb_img) 28 | 29 | # prepare the input to have the correct shape 30 | rgb_img = self.prepare_input(rgb_img) 31 | 32 | # apply estimator 33 | rgb_img = rgb_img.to(self._device).to(torch.float32) 34 | 35 | predicted_depth = self._estimator_predict(rgb_img).cpu() 36 | 37 | # resize to original shape 38 | predicted_depth = self._resize_image( 39 | predicted_depth[:, None, ...], # (B, H, W) -> (B, C, H, W) 40 | height=h, width=w, mode='nearest' 41 | ) 42 | 43 | # convert to numpy 2d array if input was numpy 44 | if is_numpy: 45 | predicted_depth = predicted_depth.numpy()[0, 0] 46 | n_above_max = (predicted_depth > UINT16_MAX).sum() 47 | if n_above_max > 0: 48 | print( 49 | f"Warning: Detected {n_above_max} values above " 50 | f"{UINT16_MAX} in predicted depth." 
51 | ) 52 | predicted_depth = np.clip(predicted_depth, 0, UINT16_MAX) 53 | predicted_depth = np.asarray(predicted_depth, dtype='uint16') 54 | 55 | assert 2 == predicted_depth.ndim 56 | 57 | return predicted_depth 58 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/cityscapes/README.md: -------------------------------------------------------------------------------- 1 | # Cityscapes dataset 2 | 3 | The Cityscapes dataset contains a diverse set of stereo video sequences recorded in street scenes from 50 different cities, with high quality pixel-level annotations of 5 000 frames in addition to a larger set of 20 000 weakly annotated frames. 4 | The dataset is thus an order of magnitude larger than similar previous attempts. Details on [annotated classes](https://www.cityscapes-dataset.com/dataset-overview/#class-definitions) and [examples of our annotations](https://www.cityscapes-dataset.com/examples/#dense-pixel-annotations) are available at this webpage. 5 | 6 | For more details, see: [Cityscapes Dataset](https://www.cityscapes-dataset.com/) and [Cityscapes Dataset at GitHub](https://github.com/mcordts/cityscapesScripts). 7 | 8 | ## Prepare dataset 9 | 10 | 1. Download and unzip dataset files: 11 | Use `csDownload` or download the files mentioned below manually from: [Cityscapes Dataset Downloads](https://www.cityscapes-dataset.com/downloads/) 12 | 13 | ```bash 14 | CITYSCAPES_DOWNLOAD_DIR="/path/where/to/store/cityscapes_downloads" 15 | 16 | # using cityscapesScripts 17 | # use "csDownload -l" to list available packages 18 | 19 | # labels (semantic, instance) 20 | csDownload gtFine_trainvaltest.zip -d $CITYSCAPES_DOWNLOAD_DIR # -> 241MB 21 | # rgb images 22 | csDownload leftImg8bit_trainvaltest.zip -d $CITYSCAPES_DOWNLOAD_DIR # -> 11GB 23 | # disparity images (only upon request) 24 | csDownload disparity_trainvaltest.zip -d $CITYSCAPES_DOWNLOAD_DIR # -> 3.5GB 25 | # intrinsic and extrinsic camera parameter to calculate depth 26 | csDownload camera_trainvaltest.zip -d $CITYSCAPES_DOWNLOAD_DIR # -> 2MB 27 | 28 | # unzip files 29 | find $CITYSCAPES_DOWNLOAD_DIR -name '*.zip' -exec unzip -o {} -d $CITYSCAPES_DOWNLOAD_DIR \; 30 | ``` 31 | 32 | 2. Convert dataset: 33 | ```bash 34 | # general usage 35 | nicr_sa_prepare_dataset cityscapes \ 36 | /path/where/to/store/cityscapes \ 37 | $CITYSCAPES_DOWNLOAD_DIR 38 | ``` 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *.cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # Jupyter Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # SageMath parsed files 79 | *.sage.py 80 | 81 | # Environments 82 | .env 83 | .venv 84 | env/ 85 | venv/ 86 | ENV/ 87 | 88 | # Spyder project settings 89 | .spyderproject 90 | .spyproject 91 | 92 | # Rope project settings 93 | .ropeproject 94 | 95 | # mkdocs documentation 96 | /site 97 | 98 | # mypy 99 | .mypy_cache/ 100 | 101 | # PyCharm 102 | .idea 103 | 104 | # MacOS 105 | .DS_Store 106 | 107 | # Binaries 108 | .npz 109 | .npy 110 | .h5 111 | .hdf5 112 | core 113 | 114 | inference_time*.pdf 115 | inference_time*.json 116 | onnx_models/*.onnx 117 | *.onnx 118 | onnx_models/*.trt 119 | *.pth 120 | *.tar 121 | *.pickle 122 | datasets/* 123 | !datasets/.gitkeep 124 | trained_models/* 125 | !trained_models/.gitkeep 126 | todo.txt 127 | *.pdf 128 | results 129 | *.svg 130 | .pytest_cache 131 | .vscode 132 | 133 | stuff/hypersim_instance_semantic_overlaps/hypersim_instances -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/d2/_auto_init.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Soehnke Fischedick 4 | """ 5 | from ..pytorch import ADE20K 6 | from ..pytorch import COCO 7 | from ..pytorch import Cityscapes 8 | from ..pytorch import Hypersim 9 | from ..pytorch import NYUv2 10 | from ..pytorch import ScanNet 11 | from ..pytorch import SceneNetRGBD 12 | from ..pytorch import SUNRGBD 13 | from .utils import register_dataset_to_d2 14 | 15 | # Automatically register all datasets with some default keys so that they 16 | # are available through Detectron2's DatasetCatalog. 17 | # Note that they are just registered so that the stats can be access. 18 | # For using the dataset, the 'set_dataset_path' function should be called first. 19 | # Moreover, we currently do not load the 'depth' sample key for any dataset. 20 | # If your interested in another sample key, remove the dataset and call 21 | # 'register_dataset_to_d2' yourself. 
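# A minimal sketch of that manual route (hedged: 'nyuv2_rgbd' is a made-up
# name prefix, and set_dataset_path is assumed to take the dataset base path;
# both helpers are re-exported via nicr_scene_analysis_datasets.d2):
#
#   from nicr_scene_analysis_datasets.d2 import register_dataset_to_d2
#   from nicr_scene_analysis_datasets.d2 import set_dataset_path
#   from nicr_scene_analysis_datasets.pytorch import NYUv2
#
#   set_dataset_path('/path/to/prepared/datasets')
#   register_dataset_to_d2(
#       name_prefix='nyuv2_rgbd',
#       dataset_class=NYUv2,
#       sample_keys=('identifier', 'rgb', 'depth', 'semantic', 'instance')
#   )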
22 | register_dataset_to_d2( 23 | name_prefix='ade20k', 24 | dataset_class=ADE20K, 25 | sample_keys=('identifier', 'rgb', 'semantic', 'instance') 26 | ) 27 | register_dataset_to_d2( 28 | name_prefix='cityscapes', 29 | dataset_class=Cityscapes, 30 | sample_keys=('identifier', 'rgb', 'semantic', 'instance') 31 | ) 32 | register_dataset_to_d2( 33 | name_prefix='coco', 34 | dataset_class=COCO, 35 | sample_keys=('identifier', 'rgb', 'semantic', 'instance') 36 | ) 37 | register_dataset_to_d2( 38 | name_prefix='hypersim', 39 | dataset_class=Hypersim, 40 | sample_keys=('identifier', 'rgb', 'semantic', 'instance') 41 | ) 42 | register_dataset_to_d2( 43 | name_prefix='nyuv2', 44 | dataset_class=NYUv2, 45 | sample_keys=('identifier', 'rgb', 'semantic', 'instance') 46 | ) 47 | register_dataset_to_d2( 48 | name_prefix='scannet', 49 | dataset_class=ScanNet, 50 | sample_keys=('identifier', 'rgb', 'semantic', 'instance') 51 | ) 52 | register_dataset_to_d2( 53 | name_prefix='scenenetrgbd', 54 | dataset_class=SceneNetRGBD, 55 | sample_keys=('identifier', 'rgb', 'semantic', 'instance') 56 | ) 57 | register_dataset_to_d2( 58 | name_prefix='sunrgbd', 59 | dataset_class=SUNRGBD, 60 | sample_keys=('identifier', 'rgb', 'semantic', 'instance') 61 | ) 62 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/dataset_base/_config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Soehnke Fischedick 4 | .. codeauthor:: Daniel Seichter 5 | """ 6 | import dataclasses 7 | from typing import Union 8 | 9 | from ._annotation import SceneLabelList 10 | from ._annotation import SemanticLabelList 11 | from ._meta import DepthStats 12 | 13 | 14 | @dataclasses.dataclass(frozen=True) 15 | class DatasetConfig: 16 | semantic_label_list: SemanticLabelList 17 | semantic_label_list_without_void: SemanticLabelList 18 | scene_label_list: SceneLabelList 19 | scene_label_list_without_void: SceneLabelList 20 | depth_stats: Union[DepthStats, None] 21 | 22 | 23 | def build_dataset_config( 24 | semantic_label_list: SemanticLabelList, 25 | scene_label_list: Union[SceneLabelList, None] = None, 26 | depth_stats: Union[DepthStats, None] = None 27 | ) -> DatasetConfig: 28 | """ 29 | Builds a dataset config from a semantic and scene label list and known 30 | depth stats. 31 | 32 | Notes 33 | ----- 34 | The function assumes that the first element in the semantic label list has 35 | the void label. 
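
    Example (an illustrative sketch of how the void entry is split off; the
    'wall' label and the depth values below are made-up placeholders)::

        labels = SemanticLabelList((
            SemanticLabel('void', False, False, (0, 0, 0)),
            SemanticLabel('wall', False, False, (120, 120, 120)),
        ))
        config = build_dataset_config(
            semantic_label_list=labels,
            depth_stats=DepthStats(min=300.0, max=10000.0, mean=3000.0, std=1500.0)
        )
        # config.semantic_label_list_without_void then contains only 'wall'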
36 | """ 37 | scene_label_list = scene_label_list or SceneLabelList(()) 38 | 39 | # build semantic label list without void 40 | semantic_label_list_without_void = SemanticLabelList(()) 41 | for idx, label in enumerate(semantic_label_list): 42 | # skip void 43 | if idx == 0: 44 | # we always have 0 as void 45 | continue 46 | semantic_label_list_without_void.add_label(label) 47 | 48 | # build scene label list without void 49 | scene_label_list_without_void = SceneLabelList(()) 50 | for label in scene_label_list: 51 | # skip void 52 | if 'void' == label.class_name.lower(): 53 | # indoor domestic class labels contain a void class 54 | continue 55 | scene_label_list_without_void.add_label(label) 56 | 57 | # create dataset config 58 | config = DatasetConfig( 59 | semantic_label_list, 60 | semantic_label_list_without_void, 61 | scene_label_list, 62 | scene_label_list_without_void, 63 | depth_stats 64 | ) 65 | 66 | return config 67 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/coco/README.md: -------------------------------------------------------------------------------- 1 | # COCO dataset 2 | 3 | COCO is a large-scale object detection, segmentation, and captioning dataset. 4 | It contains over 200.000 labeled images with 80 object and 91 stuff categories 5 | for panoptic segmentation. 6 | 7 | For more details, see: [COCO dataset](https://cocodataset.org/#home) 8 | 9 | ## Prepare dataset 10 | 1. Convert the dataset 11 | ```bash 12 | # general usage 13 | nicr_sa_prepare_dataset coco \ 14 | /path/where/to/store/coco/ 15 | ``` 16 | 17 | 2. (Optional) Generate auxiliary data 18 | > **Note**: To use auxiliary data generation, the package must be installed with the `withauxiliarydata` option: 19 | > ```bash 20 | > pip install -e .[withauxiliarydata] 21 | > ``` 22 | 23 | ```bash 24 | # for auxiliary data such as synthetic depth and rgb/panoptic embeddings 25 | nicr_sa_generate_auxiliary_data \ 26 | --dataset coco \ 27 | --dataset-path /path/to/already/prepared/coco/dataset \ 28 | --auxiliary-data depth image-embedding panoptic-embedding \ 29 | --embedding-estimator-device cuda \ 30 | --embedding-estimators alpha_clip__l14-336-grit-20m \ 31 | --depth-estimator-device cuda \ 32 | --depth-estimators depthanything_v2__indoor_large \ 33 | --cache-models 34 | ``` 35 | 36 | With arguments: 37 | - `--dataset-path`: 38 | Path to the prepared COCO dataset. 39 | - `--auxiliary-data`: 40 | Types of auxiliary data to generate: 41 | - `depth`: Generates synthetic depth images from RGB. 42 | - `image-embedding`: Uses Alpha-CLIP to generate an embedding for the entire image. 43 | - `panoptic-embedding`: Uses Alpha-CLIP to generate an embedding for each panoptic mask. 44 | - `--depth-estimator-device`: 45 | Device to use for depth estimation (`cpu` or `cuda`). 46 | - `--depth-estimators`: 47 | Depth estimator(s) to use. Use `depthanything_v2__indoor_large` to match DVEFormer. 48 | - `--embedding-estimator-device`: 49 | Device to use for embedding estimation (`cpu` or `cuda`). 50 | - `--embedding-estimators`: 51 | Embedding estimator(s) to use. Use `alpha_clip__l14-336-grit-20m` to match DVEFormer. 52 | - `--cache-models`: 53 | Cache models locally to avoid reloading them in future runs. 54 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/mira/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. 
codeauthor:: Daniel Seichter 4 | """ 5 | import cv2 6 | 7 | from PythonImageWrapper import Img 8 | from PythonImageWrapper import Img8U1 9 | 10 | 11 | def to_mira_img(img, rgb2bgr=False): 12 | if 3 == img.ndim: 13 | h, w, n = img.shape 14 | elif 2 == img.ndim: 15 | h, w = img.shape 16 | n = 1 17 | else: 18 | raise ValueError(f"Unknown shape: {img.shape}") 19 | 20 | if rgb2bgr: 21 | img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) 22 | 23 | if img.dtype == 'uint8': 24 | t = '8U' 25 | elif img.dtype == 'uint16': 26 | t = '16U' 27 | elif img.dtype == 'float32': 28 | t = '32F' 29 | else: 30 | raise ValueError(f"Unknown dtype: {img.dtype}") 31 | cv_type = getattr(cv2, f'CV_{t}C{n}') 32 | 33 | img_mira = Img(w, h, cv_type, n) 34 | img_mira.setMat(img) 35 | 36 | return img_mira 37 | 38 | 39 | def to_mira_img8u1(img): 40 | assert img.dtype == 'uint8' 41 | assert img.ndim == 2 42 | 43 | h, w = img.shape 44 | img_mira = Img8U1(w, h) 45 | img_mira.setMat(img) 46 | 47 | return img_mira 48 | 49 | 50 | def parse_list(comma_sep_str, cast_to=str): 51 | if cast_to is bool: 52 | cast_to = lambda x: x.lower() in ['true', '1'] 53 | 54 | return [cast_to(e.strip()) 55 | for e in comma_sep_str.strip().split(',') 56 | if e.strip()] 57 | 58 | 59 | class AutoGetterSetter: 60 | def __getattr__(self, name): 61 | """Generic getter and setter methods for reflection""" 62 | if name.startswith(('_rget', '_rset')): 63 | member = name[5:] 64 | if member not in self.__dict__: 65 | raise AttributeError( 66 | "{} has no attribute '{}'".format(self, name) 67 | ) 68 | 69 | if name.startswith('_rset'): 70 | # make setter 71 | def _cb_set(value): 72 | setattr(self, member, value) 73 | return _cb_set 74 | elif name.startswith('_rget'): 75 | # make getter 76 | def _cb_get(): 77 | return getattr(self, member) 78 | return _cb_get 79 | else: 80 | return super().__getattr__(name) 81 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/scripts/common.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Daniel Seichter 4 | """ 5 | from typing import List, Optional, Tuple, Union 6 | 7 | import numpy as np 8 | 9 | from .. 
import get_dataset_class 10 | from ..utils import img as img_utils 11 | 12 | 13 | DATASET_COLORMAPS = { 14 | 'auto_n': {}, 15 | 'ade20k': {}, 16 | 'cityscapes_19': {'semantic_n_classes': 19}, 17 | 'cityscapes_33': {'semantic_n_classes': 33}, 18 | 'coco': {}, 19 | 'hypersim': {}, 20 | 'nyuv2_13': {'semantic_n_classes': 13}, 21 | 'nyuv2_40': {'semantic_n_classes': 40}, 22 | 'nyuv2_894': {'semantic_n_classes': 894}, 23 | 'scannet_20': {'semantic_n_classes': 20}, 24 | 'scannet_40': {'semantic_n_classes': 40}, 25 | 'scannet_200': {'semantic_n_classes': 200}, 26 | 'scannet_549': {'semantic_n_classes': 549}, 27 | 'scenenetrgbd': {}, 28 | 'sunrgbd': {}, 29 | 'visual_distinct': {} 30 | } 31 | 32 | AVAILABLE_COLORMAPS = tuple(DATASET_COLORMAPS.keys()) 33 | 34 | 35 | def get_colormap( 36 | name: str, 37 | n: Optional[int] = 256, 38 | return_names: bool = False 39 | ) -> Union[np.ndarray, Tuple[List[str], np.ndarray]]: 40 | if 'auto_n' == name: 41 | # generate color map with n colors 42 | colors = np.array( 43 | img_utils.get_colormap(n) 44 | ) 45 | names = [f'{i}' for i in range(n)] 46 | elif 'visual_distinct' == name: 47 | # use visually distinct colors (useful for visualizing instances) 48 | colors = np.array( 49 | img_utils.get_visual_distinct_colormap(with_void=True) 50 | ) 51 | names = [f'{i}' for i in range(colors.shape[0])] 52 | else: 53 | # use colors from dataset 54 | dataset_name = name.split('_')[0] 55 | dataset = get_dataset_class(dataset_name)( 56 | disable_prints=True, 57 | **DATASET_COLORMAPS[name] 58 | ) 59 | # with void class 60 | colors = dataset.config.semantic_label_list.colors_array 61 | names = dataset.config.semantic_label_list.class_names 62 | 63 | if not return_names: 64 | return colors 65 | 66 | return colors, names 67 | 68 | 69 | def print_section(section_name: str, section_content: str = ''): 70 | print(f"===== {section_name.upper()} =====") 71 | if section_content: 72 | print(section_content+"\n") 73 | -------------------------------------------------------------------------------- /tests/test_scenenetrgbd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Simple (interface) tests for SceneNet RGB-D dataset 4 | 5 | .. 
codeauthor:: Daniel Seichter 6 | """ 7 | import pytest 8 | 9 | from nicr_scene_analysis_datasets import SceneNetRGBD 10 | from nicr_scene_analysis_datasets.dataset_base import SampleIdentifier 11 | from nicr_scene_analysis_datasets.utils.testing import DATASET_PATH_DICT 12 | 13 | N_CLASSES_WITH_VOID = 13 + 1 14 | N_SAMPLES = {'train': 50595, 'valid': 6000} 15 | N_SCENE_CLASSES = 5 16 | 17 | 18 | @pytest.mark.parametrize('split', ('train', 'valid')) 19 | def test_dataset(split): 20 | dataset = SceneNetRGBD( 21 | dataset_path=DATASET_PATH_DICT['scenenetrgbd'], 22 | split=split, 23 | sample_keys=SceneNetRGBD.get_available_sample_keys(split), 24 | depth_mode='refined', 25 | ) 26 | 27 | assert dataset.depth_mode == 'refined' 28 | assert dataset.split == split 29 | 30 | assert len(dataset) == N_SAMPLES[split] 31 | 32 | assert dataset.semantic_n_classes == N_CLASSES_WITH_VOID 33 | assert dataset.semantic_n_classes_without_void == N_CLASSES_WITH_VOID - 1 34 | assert len(dataset.semantic_class_names) == dataset.semantic_n_classes 35 | assert len(dataset.semantic_class_names_without_void) == dataset.semantic_n_classes_without_void 36 | 37 | assert len(dataset.scene_class_names) == N_SCENE_CLASSES 38 | 39 | assert len(dataset.semantic_class_colors) == dataset.semantic_n_classes 40 | assert len(dataset.semantic_class_colors_without_void) == dataset.semantic_n_classes_without_void 41 | 42 | assert len(dataset.cameras) == 1 43 | 44 | assert isinstance(dataset.depth_min, float) 45 | assert isinstance(dataset.depth_max, float) 46 | assert isinstance(dataset.depth_mean, float) 47 | assert isinstance(dataset.depth_std, float) 48 | assert isinstance(dataset.depth_stats, dict) 49 | 50 | # test first 10 samples sample 51 | for i, sample in enumerate(dataset): 52 | assert isinstance(sample, dict) 53 | assert isinstance(sample['identifier'], SampleIdentifier) 54 | # inputs: rgb and depth 55 | assert sample['rgb'].ndim == 3 56 | assert sample['depth'].ndim == 2 57 | # semantic 58 | assert sample['semantic'].ndim == 2 59 | # instance 60 | assert sample['instance'].ndim == 2 61 | # scene 62 | assert isinstance(sample['scene'], int) 63 | 64 | if i >= 9: 65 | break 66 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/nyuv2/README.md: -------------------------------------------------------------------------------- 1 | # NYUv2 dataset 2 | 3 | The NYU-Depth V2 dataset is comprised of video sequences from a variety of indoor scenes as recorded by both the RGB and Depth cameras from the Microsoft Kinect. 4 | It contains 1449 densely labeled pairs of aligned RGB and depth images. 5 | 6 | For more details, see: [NYU Depth Dataset V2](https://cs.nyu.edu/~fergus/datasets/nyu_depth_v2.html) 7 | 8 | > As of Nov 2022, [precomputed normals](https://cs.nyu.edu/~deigen/dnl/normals_gt.tgz) are not publicly available any longer. 9 | We are trying to reach the authors. 10 | Normal extraction is optional for now. 11 | 12 | ## Prepare dataset 13 | 14 | 1. Download and convert the dataset to the desired format: 15 | 16 | ```bash 17 | # general usage 18 | nicr_sa_prepare_dataset nyuv2 \ 19 | /path/where/to/store/nyuv2 20 | ``` 21 | 22 | 2. 
(Optional) Generate auxiliary data 23 | > **Note**: To use auxiliary data generation, the package must be installed with the `withauxiliarydata` option: 24 | > ```bash 25 | > pip install -e .[withauxiliarydata] 26 | > ``` 27 | 28 | ```bash 29 | # for auxiliary data such as synthetic depth and rgb/panoptic embeddings 30 | nicr_sa_generate_auxiliary_data \ 31 | --dataset nyuv2 \ 32 | --dataset-path /path/to/already/prepared/nyuv2/dataset\ 33 | --auxiliary-data depth image-embedding panoptic-embedding \ 34 | --embedding-estimator-device cuda \ 35 | --embedding-estimators alpha_clip__l14-336-grit-20m \ 36 | --depth-estimator-device cuda \ 37 | --depth-estimators depthanything_v2__indoor_large \ 38 | --cache-models 39 | ``` 40 | With arguments: 41 | - `--dataset-path`: 42 | Path to the prepared NYUv2 dataset. 43 | - `--auxiliary-data`: 44 | Types of auxiliary data to generate: 45 | - `depth`: Generates synthetic depth images from RGB. 46 | - `image-embedding`: Uses Alpha-CLIP to generate an embedding for the entire image. 47 | - `panoptic-embedding`: Uses Alpha-CLIP to generate an embedding for each panoptic mask. 48 | - `--depth-estimator-device`: 49 | Device to use for depth estimation (`cpu` or `cuda`). 50 | - `--depth-estimators`: 51 | Depth estimator(s) to use. Use `depthanything_v2__indoor_large` to match DVEFormer. 52 | - `--embedding-estimator-device`: 53 | Device to use for embedding estimation (`cpu` or `cuda`). 54 | - `--embedding-estimators`: 55 | Embedding estimator(s) to use. Use `alpha_clip__l14-336-grit-20m` to match DVEFormer. 56 | - `--cache-models`: 57 | Cache models locally to avoid reloading them in future runs. 58 | -------------------------------------------------------------------------------- /tests/test_embedding_estimation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Simple (interface) tests for embedding estimation 4 | 5 | .. 
codeauthor:: Soehnke Fischedick 6 | """ 7 | import os 8 | import shutil 9 | 10 | import cv2 11 | import numpy as np 12 | import pytest 13 | 14 | from nicr_scene_analysis_datasets.auxiliary_data.embedding_estimation import get_embedding_estimator_class 15 | from nicr_scene_analysis_datasets.utils.io import download_file 16 | 17 | 18 | EXAMPLE_IMAGE = 'https://dl.fbaipublicfiles.com/dinov2/images/example.jpg' 19 | 20 | # true: always use the same default transformers path and, thus, speed up 21 | # consecutive test runs 22 | USE_DEFAULT_CACHE_PATH = True 23 | 24 | 25 | def _get_example_img(tmp_path): 26 | fn = 'example.jpg' 27 | fp = os.path.join(tmp_path, fn) 28 | if not os.path.exists(fp): 29 | if EXAMPLE_IMAGE.startswith('http'): 30 | download_file(EXAMPLE_IMAGE, fp) 31 | else: 32 | shutil.copy(EXAMPLE_IMAGE, fp) 33 | 34 | img = cv2.imread(fp, cv2.IMREAD_UNCHANGED) 35 | assert img is not None 36 | assert img.ndim == 3 37 | 38 | return img 39 | 40 | 41 | @pytest.mark.parametrize( 42 | 'estimator__model', ( 43 | 'alpha_clip__b16-grit-1m', 44 | 'alpha_clip__l14-grit-1m', 45 | 'alpha_clip__l14-336-grit-1m', 46 | 'alpha_clip__b16-grit-20m', 47 | 'alpha_clip__l14-grit-20m', 48 | 'alpha_clip__l14-336-grit-20m', 49 | 'alpha_clip__b16-combined', 50 | 'alpha_clip__l14-combined', 51 | ) 52 | ) 53 | def test_embedding_estimator(estimator__model, tmp_path): 54 | # Get example image 55 | img = _get_example_img(tmp_path) 56 | 57 | # Initialize the embedding estimator 58 | Estimator = get_embedding_estimator_class(estimator__model) 59 | estimator = Estimator( 60 | device='cpu', 61 | auto_set_up=True, 62 | cache_basepath=tmp_path if not USE_DEFAULT_CACHE_PATH else None, 63 | ) 64 | 65 | # Generate mask for the whole image, same size as input 66 | mask = np.ones_like(img, dtype=np.uint8) 67 | # The mask should only have one channel 68 | mask = mask[:, :, 0][:, :, None] 69 | 70 | # Get embeddings 71 | embeddings = estimator.predict(img, mask) 72 | 73 | # Basic assertions 74 | assert embeddings is not None 75 | assert isinstance(embeddings, np.ndarray) 76 | 77 | # Embedding should have batch dimension and embedding dimension 78 | assert embeddings.ndim == 2 79 | 80 | # We only have on input, so the batch dimension should be 1 81 | assert embeddings.shape[0] == 1 82 | -------------------------------------------------------------------------------- /tests/test_cityscapes.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Simple (interface) tests for Cityscapes dataset 4 | 5 | .. 
codeauthor:: Daniel Seichter 6 | """ 7 | import pytest 8 | 9 | from nicr_scene_analysis_datasets import Cityscapes 10 | from nicr_scene_analysis_datasets.dataset_base import SampleIdentifier 11 | from nicr_scene_analysis_datasets.utils.testing import DATASET_PATH_DICT 12 | 13 | N_SAMPLES = {'train': 2975, 'valid': 500, 'test': 1525} 14 | 15 | 16 | @pytest.mark.parametrize('split', ('train', 'valid', 'test')) 17 | @pytest.mark.parametrize('semantic_n_classes', (19, 33)) 18 | @pytest.mark.parametrize('disparity_instead_of_depth', (False, True)) 19 | def test_dataset(split, 20 | semantic_n_classes, 21 | disparity_instead_of_depth): 22 | dataset = Cityscapes( 23 | dataset_path=DATASET_PATH_DICT['cityscapes'], 24 | split=split, 25 | sample_keys=Cityscapes.get_available_sample_keys(split), 26 | depth_mode='raw', 27 | disparity_instead_of_depth=disparity_instead_of_depth, 28 | semantic_n_classes=semantic_n_classes 29 | ) 30 | 31 | assert dataset.depth_mode == 'raw' 32 | assert dataset.split == split 33 | 34 | assert len(dataset) == N_SAMPLES[split] 35 | 36 | assert dataset.semantic_n_classes == semantic_n_classes + 1 37 | assert dataset.semantic_n_classes_without_void == semantic_n_classes 38 | assert len(dataset.semantic_class_names) == dataset.semantic_n_classes 39 | assert len(dataset.semantic_class_names_without_void) == dataset.semantic_n_classes_without_void 40 | assert len(dataset.semantic_class_colors) == dataset.semantic_n_classes 41 | assert len(dataset.semantic_class_colors_without_void) == dataset.semantic_n_classes_without_void 42 | 43 | assert len(dataset.cameras) == 1 44 | 45 | assert isinstance(dataset.depth_min, float) 46 | assert isinstance(dataset.depth_max, float) 47 | assert isinstance(dataset.depth_mean, float) 48 | assert isinstance(dataset.depth_std, float) 49 | assert isinstance(dataset.depth_stats, dict) 50 | 51 | # test first 10 samples 52 | for i, sample in enumerate(dataset): 53 | assert isinstance(sample, dict) 54 | assert isinstance(sample['identifier'], SampleIdentifier) 55 | # inputs: rgb and depth 56 | assert sample['rgb'].ndim == 3 57 | assert sample['depth'].ndim == 2 58 | 59 | if 'test' != split: 60 | # note that there are annotation files for test but they are empty 61 | # semantic 62 | assert sample['semantic'].ndim == 2 63 | # instance 64 | assert sample['instance'].ndim == 2 65 | 66 | if i >= 9: 67 | break 68 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/cityscapes/cityscapes.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Daniel Seichter 4 | .. codeauthor:: Soehnke Fischedick 5 | .. 
codeauthor:: Leonard Rabes 6 | """ 7 | from cityscapesscripts.helpers.labels import labels 8 | 9 | from ...dataset_base import DepthStats 10 | from ...dataset_base import SemanticLabel 11 | from ...dataset_base import SemanticLabelList 12 | 13 | 14 | class CityscapesMeta: 15 | SPLITS = ('train', 'valid', 'test') 16 | 17 | _DATA_SAMPLE_KEYS = ('identifier', 'meta', 'rgb', 'depth') 18 | _ANNOTATION_SAMPLE_KEYS = ('semantic', 'instance') 19 | SPLIT_SAMPLE_KEYS = { 20 | SPLITS[0]: _DATA_SAMPLE_KEYS+_ANNOTATION_SAMPLE_KEYS, 21 | SPLITS[1]: _DATA_SAMPLE_KEYS+_ANNOTATION_SAMPLE_KEYS, 22 | SPLITS[2]: _DATA_SAMPLE_KEYS, 23 | } 24 | 25 | # calculated over the whole train split 26 | # see: my_dataset.depth_compute_stats() for calculation 27 | TRAIN_SPLIT_DEPTH_STATS = DepthStats( 28 | min=3.7578125, 29 | max=300.0, # see _load_depth() in dataset.py 30 | mean=31.715617493177906, 31 | std=38.70280704877372, 32 | ) 33 | TRAIN_SPLIT_DEPTH_STATS_DISPARITY = DepthStats( 34 | min=1.0, 35 | max=32257.0, 36 | mean=9069.706336834102, 37 | std=7178.335960071306 38 | ) 39 | 40 | DEPTH_MODES = ('raw',) 41 | 42 | CAMERAS = ('camera1',) # just a dummy camera name 43 | 44 | # number of semantic classes without void/unlabeled and 45 | # license plate (class 34) 46 | SEMANTIC_N_CLASSES = (19, 33) 47 | 48 | SEMANTIC_LABEL_LIST_REDUCED = SemanticLabelList(( 49 | SemanticLabel('void', False, False, (0, 0, 0)), 50 | )) 51 | SEMANTIC_LABEL_LIST_FULL = SemanticLabelList(( 52 | SemanticLabel('void', False, False, (0, 0, 0)), 53 | )) 54 | 55 | SEMANTIC_CLASS_MAPPING_REDUCED = { 56 | c: labels[c].trainId+1 if not labels[c].ignoreInEval else 0 57 | for c in range(1+33) 58 | } 59 | 60 | for idx, label in enumerate(labels): 61 | semantic_label = SemanticLabel( 62 | class_name=label.name, 63 | is_thing=label.hasInstances, 64 | use_orientations=False, 65 | color=label.color 66 | ) 67 | 68 | if not label.ignoreInEval: 69 | SEMANTIC_LABEL_LIST_REDUCED.add_label(semantic_label) 70 | # 1+33 classes (0: unlabeled), ignore license plate 71 | if idx < 33: 72 | SEMANTIC_LABEL_LIST_FULL.add_label(semantic_label) 73 | 74 | # DEPTH_DIR = 'depth' # refined depth does not exist 75 | DEPTH_RAW_DIR = 'depth_raw' 76 | DISPARITY_RAW_DIR = 'disparity_raw' 77 | RGB_DIR = 'rgb' 78 | SEMANTIC_FULL_DIR = 'semantic_33' 79 | SEMANTIC_REDUCED_DIR = 'semantic_19' 80 | INSTANCE_DIR = 'instance' 81 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "create hypersim ply", 6 | "type": "debugpy", 7 | "request": "launch", 8 | "module": "nicr_scene_analysis_datasets.scripts.create_labeled_point_clouds", 9 | "console": "integratedTerminal", 10 | "args": [ 11 | "hypersim", 12 | "/datasets_nas/nicr_scene_analysis_datasets/version_052/hypersim", 13 | "./test", 14 | "--split", "train", 15 | "--voxel-size", "0.05", 16 | "--max-depth", "20", 17 | "--write-scannet-label", 18 | ], 19 | "env": {}, 20 | }, 21 | { 22 | "name": "create hypersim", 23 | "type": "debugpy", 24 | "request": "launch", 25 | "module": "nicr_scene_analysis_datasets.datasets.hypersim.prepare_dataset", 26 | "console": "integratedTerminal", 27 | "args": [ 28 | "/datasets_nas/nicr_scene_analysis_datasets/version_052/hypersim", 29 | "/datasets_nas/segmentation/hypersim/apple-hypersim", 30 | "--additional-subsamples", "2", "5", "10", "20", 31 | "--n-processes", "16", 32 | ], 33 | "env": { 34 | 
//"VERSION": "052" 35 | }, 36 | }, 37 | { 38 | "name": "create scennetrgbd", 39 | "type": "debugpy", 40 | "request": "launch", 41 | "module": "nicr_scene_analysis_datasets.datasets.scenenetrgbd.prepare_dataset", 42 | "console": "integratedTerminal", 43 | "args": [ 44 | "/datasets_nas/nicr_scene_analysis_datasets/version_test/scenenetrgbd", 45 | "/datasets_nas/segmentation/SceneNetRGBD", 46 | "--n-random-views-to-include-train", "3", 47 | "--n-random-views-to-include-valid", "6", 48 | "--force-at-least-n-classes-in-view", "4", 49 | ], 50 | }, 51 | { 52 | "name": "create sunrgbd v060", 53 | "type": "debugpy", 54 | "request": "launch", 55 | "module": "nicr_scene_analysis_datasets.datasets.sunrgbd.prepare_dataset", 56 | "console": "integratedTerminal", 57 | "args": [ 58 | "/local/datasets/sunrgbd_test", 59 | "--toolbox-filepath", "/local/datasets/raw/sunrgbd/SUNRGBDtoolbox.zip", 60 | "--data-filepath", "/local/datasets/raw/sunrgbd/SUNRGBD.zip", 61 | "--box-filepath", "/local/datasets/raw/sunrgbd/SUNRGBDMeta3DBB_v2.mat", 62 | "--create-instances", 63 | "--copy-instances-from-nyuv2", 64 | "--nyuv2-path", "/datasets_nas/nicr_scene_analysis_datasets/version_060/nyuv2" 65 | ], 66 | }, 67 | ] 68 | } -------------------------------------------------------------------------------- /tests/test_coco.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Simple (interface) tests for COCO dataset 4 | 5 | .. codeauthor:: Soehnke Fischedick 6 | .. codeauthor:: Daniel Seichter 7 | """ 8 | import pytest 9 | 10 | from nicr_scene_analysis_datasets import COCO 11 | from nicr_scene_analysis_datasets.dataset_base import SampleIdentifier 12 | from nicr_scene_analysis_datasets.utils.testing import DATASET_PATH_DICT 13 | 14 | N_SAMPLES = {'train': 118287, 'valid': 5000} 15 | N_CLASSES_WITH_VOID = 133 + 1 16 | N_CAMERAS = {'train': 2477, 'valid': 603} 17 | 18 | 19 | @pytest.mark.parametrize('split', ('train', 'valid')) 20 | def test_dataset(split): 21 | dataset = COCO( 22 | dataset_path=DATASET_PATH_DICT['coco'], 23 | split=split, 24 | sample_keys=COCO.get_available_sample_keys(split) 25 | ) 26 | assert dataset.split == split 27 | 28 | assert len(dataset) == N_SAMPLES[split] 29 | 30 | assert dataset.semantic_n_classes == N_CLASSES_WITH_VOID 31 | assert dataset.semantic_n_classes_without_void == N_CLASSES_WITH_VOID - 1 32 | assert len(dataset.semantic_class_names) == dataset.semantic_n_classes 33 | assert len(dataset.semantic_class_names_without_void) == dataset.semantic_n_classes_without_void 34 | assert len(dataset.semantic_class_colors) == dataset.semantic_n_classes 35 | assert len(dataset.semantic_class_colors_without_void) == dataset.semantic_n_classes_without_void 36 | 37 | # test first 10 samples sample 38 | for i, sample in enumerate(dataset): 39 | assert isinstance(sample, dict) 40 | assert isinstance(sample['identifier'], SampleIdentifier) 41 | # inputs: rgb 42 | assert sample['rgb'].ndim == 3 43 | # semantic 44 | assert sample['semantic'].ndim == 2 45 | # instance 46 | assert sample['instance'].ndim == 2 47 | 48 | if i >= 9: 49 | break 50 | 51 | # test camera filtering 52 | assert len(dataset.cameras) == N_CAMERAS[split], len(dataset.cameras) 53 | for camera in dataset.cameras[::10]: # test only every 10th camera 54 | with dataset.filter_camera(camera): 55 | # get shape of first sample 56 | h, w, _ = dataset[0]['rgb'].shape 57 | 58 | assert f'{w}x{h}' == camera 59 | 60 | 61 | @pytest.mark.parametrize('split', ('train', 'valid')) 62 | def 
test_filter_camera(split): 63 | # just some random cameras and counts that we know 64 | sample_cameras = { 65 | 'train': {'480x640': 8411, '426x640': 1660}, 66 | 'valid': {'640x480': 1061, '480x640': 336, '500x335': 9} 67 | } 68 | 69 | cameras = tuple(sample_cameras[split].keys()) 70 | n_samples = tuple(sample_cameras[split].values()) 71 | 72 | # create dataset with specified cameras 73 | dataset = COCO( 74 | dataset_path=DATASET_PATH_DICT['coco'], 75 | split=split, 76 | sample_keys=COCO.get_available_sample_keys(split), 77 | cameras=cameras 78 | ) 79 | 80 | assert dataset.cameras == cameras 81 | assert len(dataset) == sum(n_samples) 82 | 83 | # test filtering 84 | dataset.filter_camera(cameras[0]) 85 | assert dataset.camera == cameras[0] 86 | assert len(dataset) == n_samples[0] 87 | 88 | # reset filtering 89 | dataset.filter_camera(None) 90 | assert dataset.camera is None 91 | assert len(dataset) == sum(n_samples) 92 | -------------------------------------------------------------------------------- /tests/test_d2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Some common dataset tests for the d2 interface 4 | 5 | .. codeauthor:: Soehnke Fischedick 6 | """ 7 | import pytest 8 | 9 | from detectron2.data import DatasetCatalog 10 | from detectron2.data import MetadataCatalog 11 | 12 | # The import registers the datasets to d2 13 | from nicr_scene_analysis_datasets import d2 as nicr_d2 14 | from nicr_scene_analysis_datasets import KNOWN_DATASETS 15 | from nicr_scene_analysis_datasets.utils.testing import DATASET_PATH_DICT 16 | 17 | 18 | @pytest.mark.parametrize('dataset_name', KNOWN_DATASETS) 19 | @pytest.mark.parametrize('dataset_split', ('test', 'valid', 'train')) 20 | def test_d2_dataset(dataset_name, dataset_split): 21 | invalid_names = set({ 22 | 'coco_test', 23 | 'nyuv2_valid', 24 | 'scenenetrgbd_test', 25 | 'sunrgbd_valid', 26 | 'ade20k_test_panoptic_2017', 27 | }) 28 | # Get the path of the dataset 29 | dataset_path = DATASET_PATH_DICT[dataset_name] 30 | # Set the path for the dataset, so that d2 can use it 31 | nicr_d2.set_dataset_path(dataset_path) 32 | # Get the correct name for using the dataset from the DatasetCatalog 33 | dataset_name_d2 = f'{dataset_name}_{dataset_split}' 34 | # Get the correct split for the ade20k dataset 35 | if dataset_name == 'ade20k': 36 | dataset_name_d2 = f'{dataset_name}_{dataset_split}_panoptic_2017' 37 | 38 | if dataset_name_d2 in invalid_names: 39 | return 40 | dataset = DatasetCatalog.get(dataset_name_d2) 41 | assert MetadataCatalog.get(dataset_name_d2).dataset_config 42 | 43 | for i, sample in enumerate(dataset): 44 | assert isinstance(sample, dict) 45 | assert 'identifier' in sample 46 | assert 'rgb' in sample 47 | assert 'semantic' in sample or 'semantic' not in dataset.get_available_sample_keys(dataset_split) 48 | assert 'instance' in sample or 'instance' not in dataset.get_available_sample_keys(dataset_split) 49 | 50 | if i >= 9: 51 | break 52 | 53 | 54 | @pytest.mark.parametrize('dataset_name', KNOWN_DATASETS) 55 | def test_d2_helper_functions(dataset_name): 56 | 57 | class DummyMapper: 58 | def __call__(self, data): 59 | data['test'] = True 60 | return data 61 | 62 | valid_datasets_for_test = set({ 63 | 'nyuv2', 64 | 'hypersim', 65 | 'sunrgbd' 66 | }) 67 | if dataset_name not in valid_datasets_for_test: 68 | return 69 | 70 | # Get the path of the dataset 71 | dataset_path = DATASET_PATH_DICT[dataset_name] 72 | # Set the path for the dataset, so that d2 can use it 73 | 
nicr_d2.set_dataset_path(dataset_path) 74 | # Get the correct name for using the dataset from the DatasetCatalog 75 | dataset_name_d2 = f'{dataset_name}_test' 76 | dataset = DatasetCatalog.get(dataset_name_d2) 77 | dataset_config = MetadataCatalog.get(dataset_name_d2).dataset_config 78 | 79 | data_mapper = nicr_d2.NICRSceneAnalysisDatasetMapper(dataset_config) 80 | dummy_mapper = DummyMapper() 81 | chained_mapper = nicr_d2.NICRChainedDatasetMapper( 82 | [data_mapper, dummy_mapper] 83 | ) 84 | 85 | for i, data in enumerate(dataset): 86 | mapped_data = chained_mapper(data) 87 | assert 'test' in mapped_data 88 | assert mapped_data['test'] 89 | if i >= 9: 90 | break 91 | -------------------------------------------------------------------------------- /tests/test_depth_estimation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Simple (interface) tests for depth estimation 4 | 5 | .. codeauthor:: Daniel Seichter 6 | """ 7 | import os 8 | import shutil 9 | 10 | import cv2 11 | import pytest 12 | 13 | from nicr_scene_analysis_datasets.auxiliary_data.depth_estimation import get_depth_estimator_class 14 | from nicr_scene_analysis_datasets.utils.io import download_file 15 | from nicr_scene_analysis_datasets.scripts import viewer_depth 16 | 17 | 18 | # EXAMPLE_IMAGE = 'http://images.cocodataset.org/val2017/000000039769.jpg' 19 | EXAMPLE_IMAGE = 'https://dl.fbaipublicfiles.com/dinov2/images/example.jpg' 20 | # EXAMPLE_IMAGE = '/local/dase6070/datasets/ade20k/tmp/ADEChallengeData2016/images/training/ADE_train_00006921.jpg' # 2100x2100 image 21 | SHOW_RESULTS = False 22 | 23 | # true: always use the same default transformers path and, thus, speed up 24 | # consecutive test runs 25 | USE_DEFAULT_CACHE_PATH = True 26 | 27 | 28 | def _get_example_img(tmp_path): 29 | fn = 'example.jpg' 30 | fp = os.path.join(tmp_path, fn) 31 | if not os.path.exists(fp): 32 | if EXAMPLE_IMAGE.startswith('http'): 33 | download_file(EXAMPLE_IMAGE, fp) 34 | else: 35 | shutil.copy(EXAMPLE_IMAGE, fp) 36 | 37 | img = cv2.imread(fp, cv2.IMREAD_UNCHANGED) 38 | assert img is not None 39 | assert img.ndim == 3 40 | 41 | return img 42 | 43 | 44 | def _show_result(img, tmp_path): 45 | if not SHOW_RESULTS: 46 | return 47 | 48 | # dump file and show 49 | output_path = os.path.join(tmp_path, 'prediction') 50 | os.makedirs(output_path, exist_ok=True) 51 | cv2.imwrite(os.path.join(output_path, 'example.png'), img) 52 | args = [ 53 | output_path, 54 | '--color-path', str(tmp_path), 55 | '--color-alpha', '0.9', 56 | ] 57 | viewer_depth.main(args) 58 | 59 | 60 | @pytest.mark.parametrize( 61 | 'estimator__model', ( 62 | 'depthanything_v2__indoor_small', 63 | 'depthanything_v2__indoor_base', 64 | 'depthanything_v2__indoor_large', 65 | 'depthanything_v2__outdoor_small', 66 | 'depthanything_v2__outdoor_base', 67 | 'depthanything_v2__outdoor_large', 68 | 'zoedepth__indoor', 69 | 'zoedepth__outdoor', 70 | 'zoedepth__indoor_outdoor', 71 | 'dino_v2_dpt__indoor_small', 72 | 'dino_v2_dpt__indoor_base', 73 | 'dino_v2_dpt__indoor_large', 74 | 'dino_v2_dpt__indoor_giant', 75 | 'dino_v2_dpt__outdoor_small', 76 | 'dino_v2_dpt__outdoor_base', 77 | 'dino_v2_dpt__outdoor_large', 78 | 'dino_v2_dpt__outdoor_giant', 79 | ) 80 | ) 81 | @pytest.mark.parametrize('max_pixels', (1920 * 1080, None)) 82 | def test_depth_estimator(estimator__model, max_pixels, tmp_path): 83 | # get image 84 | img = _get_example_img(tmp_path) 85 | 86 | # get model 87 | Estimator = 
get_depth_estimator_class(estimator__model) 88 | estimator = Estimator( 89 | device='cpu', 90 | max_pixels=max_pixels, 91 | auto_set_up=True, 92 | cache_basepath=tmp_path if not USE_DEFAULT_CACHE_PATH else None, 93 | ) 94 | 95 | # predict 96 | depth = estimator.predict(img) 97 | 98 | # perform some basic tests 99 | assert depth.ndim == 2 100 | assert depth.shape == img.shape[:2] 101 | 102 | # optional: show result 103 | _show_result(depth, tmp_path) 104 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/scannet/README.md: -------------------------------------------------------------------------------- 1 | # ScanNet dataset 2 | 3 | ScanNet is an RGB-D video dataset containing 2.5 million views in more than 1500 scans, annotated with 3D camera poses, surface reconstructions, and instance-level semantic segmentations. 4 | For more details, see: [ScanNet v2](http://www.scan-net.org/) 5 | 6 | Note: 3D meshes and surface reconstructions are not included in the preparation of the dataset. 7 | 8 | 9 | ## Prepare dataset 10 | 1. Download the Dataset: 11 | 12 | To be able to download the dataset, fill out the [ScanNet Terms of Use](http://kaldir.vc.in.tum.de/scannet/ScanNet_TOS.pdf) and send it to scannet@googlegroups.com. Once your request is approved, you will receive a `download-scannet.py` script. 13 | 14 | Execute it with: 15 | ```bash 16 | # general usage 17 | python download-scannet.py -o /path/where/to/download/ScanNet 18 | ``` 19 | 20 | 2. Convert dataset: 21 | 22 | ```bash 23 | # general usage (note that one process might use more than 3GB RAM) 24 | nicr_sa_prepare_dataset scannet \ 25 | /path/where/to/download/ScanNet \ 26 | /path/where/to/convert/ScanNet \ 27 | [--n-processes N] \ 28 | [--subsample N0] \ 29 | [--additional-subsamples N1 N2] \ 30 | [--label-map-file /path/to/scannet-labels.combined.tsv] 31 | ``` 32 | With arguments: 33 | - `--n-processes`: 34 | The number of worker processes to spawn. 35 | - `--subsample`: 36 | The subsample that is exported to the output folder. 37 | - `--additional-subsamples`: 38 | For additional subsampled versions of the dataset. 39 | - `--label-map-file`: 40 | Path to scannet-labels.combined.tsv; if not specified, it is assumed to be located 41 | in the source directory. 42 | 43 | 44 | 3. (Optional) Generate auxiliary data: 45 | > **Note**: To use auxiliary data generation, the package must be installed with the `withauxiliarydata` option: 46 | > ```bash 47 | > pip install -e .[withauxiliarydata] 48 | > ``` 49 | 50 | ```bash 51 | # for auxiliary data such as synthetic depth and rgb/panoptic embeddings 52 | nicr_sa_generate_auxiliary_data \ 53 | --dataset scannet \ 54 | --dataset-path /path/to/already/prepared/ScanNet/dataset \ 55 | --auxiliary-data depth image-embedding panoptic-embedding \ 56 | --embedding-estimator-device cuda \ 57 | --embedding-estimators alpha_clip__l14-336-grit-20m \ 58 | --depth-estimator-device cuda \ 59 | --depth-estimators depthanything_v2__indoor_large \ 60 | --cache-models 61 | ``` 62 | With arguments: 63 | - `--dataset-path`: 64 | Path to the prepared ScanNet dataset. 65 | - `--auxiliary-data`: 66 | Types of auxiliary data to generate: 67 | - `depth`: Generates synthetic depth images from RGB. 68 | - `image-embedding`: Uses Alpha-CLIP to generate an embedding for the entire image. 69 | - `panoptic-embedding`: Uses Alpha-CLIP to generate an embedding for each panoptic mask. 
70 | - `--depth-estimator-device`: 71 | Device to use for depth estimation (`cpu` or `cuda`). 72 | - `--depth-estimators`: 73 | Depth estimator(s) to use. Use `depthanything_v2__indoor_large` to match DVEFormer. 74 | - `--embedding-estimator-device`: 75 | Device to use for embedding estimation (`cpu` or `cuda`). 76 | - `--embedding-estimators`: 77 | Embedding estimator(s) to use. Use `alpha_clip__l14-336-grit-20m` to match DVEFormer. 78 | - `--cache-models`: 79 | Cache models locally to avoid reloading them in future runs. -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/scenenetrgbd/README.md: -------------------------------------------------------------------------------- 1 | # SceneNet RGB-D dataset 2 | 3 | SceneNet RGB-D expands the previous work of SceneNet to enable large scale photorealistic rendering of indoor scene trajectories. It provides pixel-perfect ground truth for scene understanding problems such as semantic segmentation, instance segmentation, and object detection, and also for geometric computer vision problems such as optical flow, depth estimation, camera pose estimation, and 3D reconstruction. Random sampling permits virtually unlimited scene configurations, and here we provide a set of 5M rendered RGB-D images from over 15K trajectories in synthetic layouts with random but physically simulated object poses. Each layout also has random lighting, camera trajectories, and textures. The scale of this dataset is well suited for pre-training data-driven computer vision techniques from scratch with RGB-D inputs, which previously has been limited by relatively small labelled datasets in NYUv2 and SUN RGB-D. It also provides a basis for investigating 3D scene labelling tasks by providing perfect camera poses and depth data as proxy for a SLAM system. 4 | 5 | For more details, see: [SceneNet RGB-D](https://robotvault.bitbucket.io/scenenet-rgbd.html) and [pySceneNetRGBD](https://github.com/jmccormac/pySceneNetRGBD). 6 | 7 | ## Prepare dataset 8 | 1. Download and untar dataset files: 9 | ```bash 10 | # see: https://robotvault.bitbucket.io/scenenet-rgbd.html 11 | 12 | SCENENETRGBD_DOWNLOAD_DIR="/path/where/to/store/scenenetrgbd_dowloads" 13 | 14 | # train 15 | wget https://www.doc.ic.ac.uk/~bjm113/scenenet_data/SceneNet-train.tar.gz -P ${SCENENETRGBD_DOWNLOAD_DIR} # -> 263GB 16 | wget https://www.doc.ic.ac.uk/~bjm113/scenenet_data/train_protobufs.tar.gz -P ${SCENENETRGBD_DOWNLOAD_DIR} # -> 323MB 17 | 18 | # valid 19 | wget http://www.doc.ic.ac.uk/~bjm113/scenenet_data/SceneNetRGBD-val.tar.gz -P ${SCENENETRGBD_DOWNLOAD_DIR} # -> 15GB 20 | wget http://www.doc.ic.ac.uk/~bjm113/scenenet_data/scenenet_rgbd_val.pb -P ${SCENENETRGBD_DOWNLOAD_DIR} # -> 31MB 21 | 22 | # untar files 23 | find ${SCENENETRGBD_DOWNLOAD_DIR} -name '*.tar.gz' -exec tar xfvz {} \; 24 | 25 | # move train protobuf files 26 | mv ${SCENENETRGBD_DOWNLOAD_DIR}/train_protobufs/* ${SCENENETRGBD_DOWNLOAD_DIR} 27 | rm -rf ${SCENENETRGBD_DOWNLOAD_DIR}/train_protobufs 28 | ``` 29 | 30 | 2. Build protobuf python source file: 31 | ``` 32 | protoc --python_out=./ scenenet.proto 33 | ``` 34 | 35 | 3. 
Convert dataset: 36 | ```bash 37 | # general usage 38 | 39 | # full dataset: 40 | # - train: 16x1000 + 1x865 trajectories with 300 views per trajectory -> 5,059,500 samples 41 | # - valid: 1x1000 trajectories with 300 views per trajectory -> 300,000 samples 42 | nicr_sa_prepare_dataset scenenetrgbd \ 43 | /path/where/to/store/scenenetrgbd \ 44 | ${SCENENETRGBD_DOWNLOAD_DIR} 45 | 46 | # subsampled dataset 47 | # -> randomly pick 3 views from each trajectory for training 48 | # -> randomly pick 6 views from each trajectory for validation 49 | # -> pick only views with >= 4 different classes 50 | # - train: 16x1000 + 1x865 trajectories with 3 views per trajectory -> 50,595 samples 51 | # - valid: 1x1000 trajectories with 6 views per trajectory -> 6,000 samples 52 | nicr_sa_prepare_dataset scenenetrgbd \ 53 | /path/where/to/store/scenenetrgbd \ 54 | ${SCENENETRGBD_DOWNLOAD_DIR} \ 55 | --n-random-views-to-include-train 3 \ 56 | --n-random-views-to-include-valid 6 \ 57 | --force-at-least-n-classes-in-view 4 58 | ``` 59 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/sunrgbd/nyu_additional_class_mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "night_stand": 32, 3 | "tv_stand": 39, 4 | "vacuum_cleaner": 40, 5 | "coffee_table": 39, 6 | "piano_bench": 39, 7 | "garbage_bin": 39, 8 | "shower_curtain": 28, 9 | "paper_towel_dispenser": 40, 10 | "toilet_paper": 40, 11 | "water_dispenser": 40, 12 | "toaster_oven": 40, 13 | "fire_extinguisher": 40, 14 | "shopping_cart": 40, 15 | "tissue_box": 40, 16 | "wine_glass": 40, 17 | "door_knob": 40, 18 | "door_lock": 40, 19 | "display_case": 39, 20 | "plant_pot": 40, 21 | "bunk_bed": 39, 22 | "water_heater": 40, 23 | "air_conditioner": 38, 24 | "water_fountain": 38, 25 | "frying_pan": 40, 26 | "mouse_pad": 40, 27 | "pen_stand": 40, 28 | "flower_pot": 40, 29 | "washing_machine": 39, 30 | "projector_screen": 38, 31 | "lazy_susan": 40, 32 | "remote_control": 40, 33 | "shoe_rack": 40, 34 | "flower_box": 40, 35 | "hole_puncher": 40, 36 | "baby_chair": 39, 37 | "hair_brush": 40, 38 | "cordless_phone": 40, 39 | "bean_bag": 39, 40 | "paper_towel": 40, 41 | "fax_machine": 40, 42 | "plastic_box": 40, 43 | "hand_sanitizer": 40, 44 | "music_stand": 39, 45 | "dish_rack": 40, 46 | "ping_pong_table": 39, 47 | "pool_table": 39, 48 | "foosball_table": 39, 49 | "drying_rack": 39, 50 | "glass_container": 40, 51 | "paper_cutter": 40, 52 | "fire_alarm": 40, 53 | "plastic_rack": 40, 54 | "plastic_tub": 40, 55 | "toy_plane": 40, 56 | "display_board": 39, 57 | "flower_basket": 40, 58 | "toy_car": 40, 59 | "show_piece": 40, 60 | "dvd_player": 40, 61 | "tea_pot": 40, 62 | "plastic_bowl": 40, 63 | "toy_house": 40, 64 | "back_pack": 40, 65 | "stack_of_chairs": 39, 66 | "flower_vase": 40, 67 | "plants": 40, 68 | "towel_bar": 38, 69 | "suits_case": 40, 70 | "plastic_container": 40, 71 | "shoes": 40, 72 | "flowers": 40, 73 | "bed_sheet": 40, 74 | "dresser_mirror": 3, 75 | "sofa_chair": 5, 76 | "tv": 25, 77 | "shelf": 3, 78 | "endtable": 7, 79 | "fridge": 24, 80 | "recycle_bin": 39, 81 | "bathroom_vanity": 3, 82 | "painting": 11, 83 | "island": 38, 84 | "kitchen_counter": 12, 85 | "kitchen_cabinet": 3, 86 | "rack": 15, 87 | "cubby": 15, 88 | "cupboard": 15, 89 | "tripod": 40, 90 | "scanner": 40, 91 | "poster": 11, 92 | "information_board": 40, 93 | "dining_table": 7, 94 | "bulletin_board": 39, 95 | "coffee_maker": 40, 96 | "file_cabinet": 3, 97 | "decor": 40, 98 | 
"locker": 39, 99 | "hanging_cabinet": 3, 100 | "kitchen": 39, 101 | "portrait": 11, 102 | "organizer": 40, 103 | "switch": 38, 104 | "mug": 40, 105 | "cpu": 40, 106 | "soap_dispenser": 40, 107 | "thermos": 40, 108 | "microwave_oven": 40, 109 | "electric_fan": 40, 110 | "paper_bag": 40, 111 | "rice_cooker": 40, 112 | "magazine_rack": 15, 113 | "armoire": 15, 114 | "podium": 39, 115 | "grab_bar": 39, 116 | "toilet_paper_dispenser": 40, 117 | "urinal": 33, 118 | "basin": 34, 119 | "stuffed_toy": 40, 120 | "cartoon": 40, 121 | "plastic_bottle": 40, 122 | "plastic_bag": 40, 123 | "computer_keyboard": 40, 124 | "water_bottle": 40, 125 | "kettle": 40, 126 | "desktop": 40, 127 | "tissue_paper": 40, 128 | "food_tray": 40, 129 | "end_table": 7, 130 | "battery": 40, 131 | "helmet": 40, 132 | "saucer_chair": 5, 133 | "fume_hood": 38, 134 | "water_jug": 40, 135 | "frige": 24, 136 | "packet": 40, 137 | "child_chair": 5 138 | } -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/sunrgbd/legacy_emsanet_version/nyu_additional_class_mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "night_stand": 32, 3 | "tv_stand": 39, 4 | "vacuum_cleaner": 40, 5 | "coffee_table": 39, 6 | "piano_bench": 39, 7 | "garbage_bin": 39, 8 | "shower_curtain": 28, 9 | "paper_towel_dispenser": 40, 10 | "toilet_paper": 40, 11 | "water_dispenser": 40, 12 | "toaster_oven": 40, 13 | "fire_extinguisher": 40, 14 | "shopping_cart": 40, 15 | "tissue_box": 40, 16 | "wine_glass": 40, 17 | "door_knob": 40, 18 | "door_lock": 40, 19 | "display_case": 39, 20 | "plant_pot": 40, 21 | "bunk_bed": 39, 22 | "water_heater": 40, 23 | "air_conditioner": 38, 24 | "water_fountain": 38, 25 | "frying_pan": 40, 26 | "mouse_pad": 40, 27 | "pen_stand": 40, 28 | "flower_pot": 40, 29 | "washing_machine": 39, 30 | "projector_screen": 38, 31 | "lazy_susan": 40, 32 | "remote_control": 40, 33 | "shoe_rack": 40, 34 | "flower_box": 40, 35 | "hole_puncher": 40, 36 | "baby_chair": 39, 37 | "hair_brush": 40, 38 | "cordless_phone": 40, 39 | "bean_bag": 39, 40 | "paper_towel": 40, 41 | "fax_machine": 40, 42 | "plastic_box": 40, 43 | "hand_sanitizer": 40, 44 | "music_stand": 39, 45 | "dish_rack": 40, 46 | "ping_pong_table": 39, 47 | "pool_table": 39, 48 | "foosball_table": 39, 49 | "drying_rack": 39, 50 | "glass_container": 40, 51 | "paper_cutter": 40, 52 | "fire_alarm": 40, 53 | "plastic_rack": 40, 54 | "plastic_tub": 40, 55 | "toy_plane": 40, 56 | "display_board": 39, 57 | "flower_basket": 40, 58 | "toy_car": 40, 59 | "show_piece": 40, 60 | "dvd_player": 40, 61 | "tea_pot": 40, 62 | "plastic_bowl": 40, 63 | "toy_house": 40, 64 | "back_pack": 40, 65 | "stack_of_chairs": 39, 66 | "flower_vase": 40, 67 | "plants": 40, 68 | "towel_bar": 38, 69 | "suits_case": 40, 70 | "plastic_container": 40, 71 | "shoes": 40, 72 | "flowers": 40, 73 | "bed_sheet": 40, 74 | "dresser_mirror": 3, 75 | "sofa_chair": 5, 76 | "tv": 25, 77 | "shelf": 3, 78 | "endtable": 7, 79 | "fridge": 24, 80 | "recycle_bin": 39, 81 | "bathroom_vanity": 3, 82 | "painting": 11, 83 | "island": 38, 84 | "kitchen_counter": 12, 85 | "kitchen_cabinet": 3, 86 | "rack": 15, 87 | "cubby": 15, 88 | "cupboard": 15, 89 | "tripod": 40, 90 | "scanner": 40, 91 | "poster": 11, 92 | "information_board": 40, 93 | "dining_table": 7, 94 | "bulletin_board": 39, 95 | "coffee_maker": 40, 96 | "file_cabinet": 3, 97 | "decor": 40, 98 | "locker": 39, 99 | "hanging_cabinet": 3, 100 | "kitchen": 39, 101 | "portrait": 
11, 102 | "organizer": 40, 103 | "switch": 38, 104 | "mug": 40, 105 | "cpu": 40, 106 | "soap_dispenser": 40, 107 | "thermos": 40, 108 | "microwave_oven": 40, 109 | "electric_fan": 40, 110 | "paper_bag": 40, 111 | "rice_cooker": 40, 112 | "magazine_rack": 15, 113 | "armoire": 15, 114 | "podium": 39, 115 | "grab_bar": 39, 116 | "toilet_paper_dispenser": 40, 117 | "urinal": 33, 118 | "basin": 34, 119 | "stuffed_toy": 40, 120 | "cartoon": 40, 121 | "plastic_bottle": 40, 122 | "plastic_bag": 40, 123 | "computer_keyboard": 40, 124 | "water_bottle": 40, 125 | "kettle": 40, 126 | "desktop": 40, 127 | "tissue_paper": 40, 128 | "food_tray": 40, 129 | "end_table": 7, 130 | "battery": 40, 131 | "helmet": 40, 132 | "saucer_chair": 5, 133 | "fume_hood": 38, 134 | "water_jug": 40, 135 | "frige": 24, 136 | "packet": 40, 137 | "child_chair": 5 138 | } -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/sunrgbd/README.md: -------------------------------------------------------------------------------- 1 | # SUNRGB-D Dataset 2 | 3 | The SUNRGB-D dataset is comprised of images from four different cameras, i.e., 4 | Intel Realsense, Asus Xtion, and Microsoft Kinect v1 and v2. 5 | It contains all images from NYUv2, manually selected images from Berkeley 6 | B3DO and SUN3D, as well as newly shot images. 7 | 8 | It contains 10,335 densely labeled pairs of aligned RGB and depth images. 9 | 10 | For more details, see: [SUNRGB-D dataset](https://rgbd.cs.princeton.edu/) 11 | 12 | We further extracted dense 2d instance annotations from annotated 3d boxes to 13 | enable panoptic segmentation on SUNRGB-D. Over time, we created two versions 14 | of additional instance annotations: 15 | - 'emsanet': this initial version was created for training EMSANet (efficient 16 | panoptic segmentation) - see IJCNN 2022 paper - and was also used for 17 | EMSAFormer (efficient panoptic segmentation) - see IJCNN 2023 paper 18 | - 'panopticndt': this revised version was created along with the work for 19 | PanopticNDT (panoptic mapping) - see IROS 2023 paper, it refines large parts 20 | of the instance extraction (see changelog for v0.6.0 of this package). 21 | 22 | 23 | ## Prepare dataset 24 | 1. Download and convert the dataset to the desired format: 25 | 26 | ```bash 27 | # general usage (latest PanopticNDT version) 28 | nicr_sa_prepare_dataset sunrgbd \ 29 | /path/where/to/store/sunrgbd \ 30 | --create-instances \ 31 | --copy-instances-from-nyuv2 \ 32 | --nyuv2-path /path/to/already/prepared/nyuv2/ 33 | 34 | # general usage (EMSANet version - use this version to reproduce results 35 | # reported in EMSANet or EMSAFormer paper) 36 | nicr_sa_prepare_dataset sunrgbd \ 37 | /path/where/to/store/sunrgbd \ 38 | --create-instances \ 39 | --instances-version emsanet \ 40 | --copy-instances-from-nyuv2 \ 41 | --nyuv2-path /path/to/already/prepared/nyuv2/ 42 | ``` 43 | > Note: NYUv2 matching requires the NYUv2 dataset to be prepared first. 44 | 45 | With arguments: 46 | - `--create-instances`: 47 | Whether instances should be created by matching 3D boxes with point clouds. 48 | - `--instances-version`: 49 | Version of instance annotations to extract, see notes above. 50 | - `--copy-instances-from-nyuv2`: 51 | Whether instances and orientations should be copied from the (already prepared!) 52 | NYUv2 dataset. 53 | - `--nyuv2-path /path/to/datasets/nyuv2`: 54 | Path to the (already prepared!) NYUv2 dataset when using 55 | `--copy-instances-from-nyuv2`. 56 | 57 | 2. 
(Optional) Generate auxiliary data 58 | ```bash 59 | # for auxiliary data such as synthetic depth and rgb/panoptic embeddings 60 | nicr_sa_generate_auxiliary_data \ 61 | --dataset sunrgbd \ 62 | --dataset-path /path/to/already/prepared/sunrgbd/dataset \ 63 | --auxiliary-data depth image-embedding panoptic-embedding \ 64 | --embedding-estimator-device cuda \ 65 | --embedding-estimators alpha_clip__l14-336-grit-20m \ 66 | --depth-estimator-device cuda \ 67 | --depth-estimators depthanything_v2__indoor_large \ 68 | --cache-models 69 | ``` 70 | 71 | With arguments: 72 | - `--dataset-path`: 73 | Path to the prepared SUNRGB-D dataset. 74 | - `--auxiliary-data`: 75 | Types of auxiliary data to generate: 76 | - `depth`: Generates synthetic depth images from RGB. 77 | - `image-embedding`: Uses Alpha-CLIP to generate an embedding for the entire image. 78 | - `panoptic-embedding`: Uses Alpha-CLIP to generate an embedding for each panoptic mask. 79 | - `--depth-estimator-device`: 80 | Device to use for depth estimation (`cpu` or `cuda`). 81 | - `--depth-estimators`: 82 | Depth estimator(s) to use. Use `depthanything_v2__indoor_large` to match DVEFormer. 83 | - `--embedding-estimator-device`: 84 | Device to use for embedding estimation (`cpu` or `cuda`). 85 | - `--embedding-estimators`: 86 | Embedding estimator(s) to use. Use `alpha_clip__l14-336-grit-20m` to match DVEFormer. 87 | - `--cache-models`: 88 | Cache models locally to avoid reloading them in future runs. 89 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/dataset_base/_annotation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Soehnke Fischedick 4 | .. 
codeauthor:: Daniel Seichter 5 | """ 6 | from dataclasses import dataclass 7 | from typing import Tuple, Union 8 | 9 | import numpy as np 10 | 11 | 12 | class ExtrinsicCameraParametersNormalized(dict): 13 | """Enables a custom pytorch collate function ignore this dict.""" 14 | pass 15 | 16 | 17 | class IntrinsicCameraParametersNormalized(dict): 18 | """Enables a custom pytorch collate function ignore this dict.""" 19 | pass 20 | 21 | 22 | class MetaDict(dict): 23 | """Enables a custom pytorch collate function ignore the meta data.""" 24 | pass 25 | 26 | 27 | class OrientationDict(dict): 28 | """Enables a custom pytorch collate function ignore the orientations.""" 29 | pass 30 | 31 | 32 | class PanopticEmbeddingDict(dict): 33 | """Enables a custom pytorch collate function ignore the embeddings.""" 34 | pass 35 | 36 | 37 | class SampleIdentifier(tuple): 38 | """Enables a custom pytorch collate function ignore the identifier.""" 39 | pass 40 | 41 | 42 | @dataclass(frozen=True) 43 | class _LabelBase: 44 | class_name: str 45 | 46 | 47 | @dataclass(frozen=True) 48 | class SemanticLabel(_LabelBase): 49 | is_thing: Union[bool, None] 50 | use_orientations: Union[bool, None] 51 | color: Tuple[int] 52 | 53 | 54 | @dataclass(frozen=True) 55 | class SceneLabel(_LabelBase): 56 | # maybe add color for scene labels 57 | pass 58 | 59 | 60 | class _LabelListBase: 61 | def __init__( 62 | self, 63 | label_list: Tuple[_LabelBase] = () 64 | ) -> None: 65 | self.label_list = list(label_list) 66 | # a copy of a the class names list for faster name to idx lookup 67 | self._class_names = () 68 | self._update_internal_lists() 69 | # for iterator 70 | self._idx = 0 71 | 72 | def __len__(self): 73 | return len(self.label_list) 74 | 75 | def __getitem__(self, idx): 76 | return self.label_list[idx] 77 | 78 | def __iter__(self): 79 | return self 80 | 81 | def __next__(self): 82 | try: 83 | el = self[self._idx] 84 | self._idx += 1 85 | return el 86 | except IndexError: 87 | self._idx = 0 88 | raise StopIteration # done iterating 89 | 90 | def add_label(self, label: _LabelBase): 91 | self.label_list.append(label) 92 | self._update_internal_lists() 93 | 94 | def _update_internal_lists(self): 95 | self._class_names = tuple(item.class_name for item in self.label_list) 96 | 97 | def _name_to_idx(self, name: str) -> int: 98 | return self._class_names.index(name) 99 | 100 | def index(self, value: Union[_LabelBase, str]) -> int: 101 | if isinstance(value, _LabelBase): 102 | return self.label_list.index(value) 103 | else: 104 | return self._name_to_idx(value) 105 | 106 | def __contains__(self, value: Union[_LabelBase, str]) -> bool: 107 | if isinstance(value, _LabelBase): 108 | return value in self.label_list 109 | else: 110 | return value in self._class_names 111 | 112 | @property 113 | def class_names(self) -> Tuple[str]: 114 | return self._class_names 115 | 116 | 117 | class SemanticLabelList(_LabelListBase): 118 | @property 119 | def colors(self) -> Tuple[Tuple[int]]: 120 | return tuple(item.color for item in self.label_list) 121 | 122 | @property 123 | def colors_array(self) -> np.ndarray: 124 | return np.array(self.colors, dtype=np.uint8) 125 | 126 | @property 127 | def classes_is_thing(self) -> Tuple[bool]: 128 | return tuple(item.is_thing for item in self.label_list) 129 | 130 | @property 131 | def classes_use_orientations(self) -> Tuple[bool]: 132 | return [item.use_orientations for item in self.label_list] 133 | 134 | 135 | class SceneLabelList(_LabelListBase): 136 | pass 137 | 
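For reference, a minimal usage sketch of the `SemanticLabel`/`SemanticLabelList` API defined above; the label names and colors here are illustrative only and do not correspond to a real dataset:

```python
from nicr_scene_analysis_datasets.dataset_base import SemanticLabel, SemanticLabelList

# build a small label list; entries are illustrative, not from a real dataset
label_list = SemanticLabelList((
    SemanticLabel('void', False, False, (0, 0, 0)),
))
label_list.add_label(SemanticLabel('chair', True, False, (148, 0, 240)))

# lookups work with label objects as well as plain class names
assert 'chair' in label_list
assert label_list.index('chair') == 1
assert label_list.class_names == ('void', 'chair')

# per-class properties
colors = label_list.colors_array        # np.ndarray of shape (2, 3), dtype uint8
is_thing = label_list.classes_is_thing  # (False, True)
```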
-------------------------------------------------------------------------------- /tests/test_ade20k.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Simple (interface) tests for ADE20k dataset 4 | 5 | .. codeauthor:: Soehnke Fischedick 6 | """ 7 | 8 | import numpy as np 9 | import pytest 10 | 11 | from nicr_scene_analysis_datasets import ADE20K 12 | from nicr_scene_analysis_datasets.dataset_base import SampleIdentifier 13 | from nicr_scene_analysis_datasets.utils.testing import DATASET_PATH_DICT 14 | 15 | # Constants based on ADE20K dataset details 16 | SPLITS = ADE20K.SPLITS 17 | # SEMANTIC_N_CLASSES = (150, 3688) 18 | CAMERAS = ('683x512', '674x512') 19 | 20 | 21 | @pytest.mark.parametrize('split', SPLITS) 22 | def test_dataset_initialization(split): 23 | # Initialize dataset 24 | dataset = ADE20K( 25 | dataset_path=DATASET_PATH_DICT['ade20k'], 26 | split=split, 27 | sample_keys=ADE20K.get_available_sample_keys(split), 28 | ) 29 | 30 | # Check basic properties 31 | assert dataset.split == split 32 | # +1 because of void class 33 | assert dataset.semantic_n_classes == 151 34 | assert len(dataset.semantic_class_names) == dataset.semantic_n_classes 35 | assert len(dataset.semantic_class_colors) == dataset.semantic_n_classes 36 | 37 | # Check config based on semantic_n_classes 38 | assert dataset.config.semantic_label_list == ADE20K.SEMANTIC_LABEL_LIST_CHALLENGE_150 39 | 40 | # Check sample keys 41 | available_keys = ADE20K.get_available_sample_keys(split) 42 | for key in available_keys: 43 | assert key in dataset.sample_keys 44 | 45 | # Test loading first few samples 46 | for i in range(min(10, len(dataset))): 47 | sample = dataset[i] 48 | assert isinstance(sample, dict) 49 | assert isinstance(sample['identifier'], SampleIdentifier) 50 | 51 | if 'rgb' in sample: 52 | assert sample['rgb'].ndim == 3 53 | assert sample['rgb'].shape[2] == 3 # RGB channels 54 | 55 | if 'semantic' in sample: 56 | assert sample['semantic'].ndim == 2 57 | assert sample['semantic'].dtype == np.uint8 58 | 59 | if 'instance' in sample: 60 | assert sample['instance'].ndim == 2 61 | assert sample['instance'].dtype == np.uint16 62 | 63 | if 'scene' in sample: 64 | assert isinstance(sample['scene'], int) 65 | # +1 because of void class 66 | assert 0 <= sample['scene'] < len(dataset.scene_class_names) + 1 67 | 68 | 69 | @pytest.mark.parametrize('split', SPLITS) 70 | def test_scene_class_loading(split): 71 | if 'scene' not in ADE20K.get_available_sample_keys(split): 72 | pytest.skip(f"Split {split} does not contain scene labels") 73 | 74 | dataset = ADE20K( 75 | dataset_path=DATASET_PATH_DICT['ade20k'], 76 | split=split, 77 | sample_keys=('scene',) 78 | ) 79 | 80 | for i in range(min(10, len(dataset))): 81 | scene_class = dataset[i]['scene'] 82 | assert isinstance(scene_class, int) 83 | assert 0 <= scene_class < len(dataset.scene_class_names) 84 | 85 | 86 | @pytest.mark.parametrize('split', SPLITS) 87 | def test_camera_filtering(split): 88 | # Test with first camera (assuming multiple exist) 89 | test_cameras = (CAMERAS[0],) 90 | dataset = ADE20K( 91 | dataset_path=DATASET_PATH_DICT['ade20k'], 92 | split=split, 93 | cameras=test_cameras, 94 | sample_keys=ADE20K.get_available_sample_keys(split) 95 | ) 96 | 97 | # Check all samples are from specified cameras 98 | for i in range(min(10, len(dataset))): 99 | identifier = dataset[i]['identifier'] 100 | assert identifier[0] in test_cameras 101 | 102 | # Test filtering after initialization 103 | if 
len(dataset.cameras) > 1: 104 | dataset.filter_camera(CAMERAS[1]) 105 | assert len(dataset) <= len(dataset._filenames_per_camera[CAMERAS[1]]) 106 | for i in range(min(10, len(dataset))): 107 | assert dataset[i]['identifier'][0] == CAMERAS[1] 108 | 109 | 110 | def test_debug_mode(): 111 | # Test dataset without dataset_path 112 | dataset = ADE20K(dataset_path=None) 113 | assert len(dataset) == 0 114 | assert dataset.cameras == ADE20K.CAMERAS # Single dummy camera 115 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/pytorch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Daniel Seichter 4 | """ 5 | from typing import Callable, Type, Union 6 | 7 | from torch.utils.data import Dataset 8 | 9 | from .auxiliary_data import wrap_dataset_with_auxiliary_data 10 | from .dataset_base import KNOWN_CLASS_WEIGHTINGS # noqa: F401 11 | from .dataset_base._base_dataset import DatasetBase 12 | 13 | 14 | class _PytorchDatasetWrapper(DatasetBase, Dataset): 15 | def __init__(self, *args, **kwargs) -> None: 16 | super().__init__(*args, **kwargs) 17 | self._preprocessor = None 18 | 19 | @property 20 | def transform(self) -> Union[Callable, None]: 21 | # just to be compatible with VisionDataset from torchvision 22 | return self.preprocessor 23 | 24 | @transform.setter 25 | def transform(self, value: Union[Callable, None]): 26 | # just to be compatible with VisionDataset from torchvision 27 | self.preprocessor = value 28 | 29 | @property 30 | def preprocessor(self) -> Union[Callable, None]: 31 | return self._preprocessor 32 | 33 | @preprocessor.setter 34 | def preprocessor(self, value: Union[Callable, None]): 35 | self._preprocessor = value 36 | 37 | def __getitem__(self, idx): 38 | sample = super().__getitem__(idx) 39 | 40 | # apply preprocessing 41 | if self._preprocessor is not None: 42 | sample = self._preprocessor(sample) 43 | 44 | return sample 45 | 46 | 47 | from . import ADE20K as _ADE20K 48 | from . import COCO as _COCO 49 | from . import Cityscapes as _Cityscapes 50 | from . import Hypersim as _Hypersim 51 | from . import NYUv2 as _NYUv2 52 | from . import ScanNet as _ScanNet 53 | from . import SceneNetRGBD as _SceneNetRGBD 54 | from . 
import SUNRGBD as _SUNRGBD 55 | 56 | 57 | class Cityscapes(_Cityscapes, _PytorchDatasetWrapper): 58 | pass 59 | 60 | 61 | class COCO(_COCO, _PytorchDatasetWrapper): 62 | pass 63 | 64 | 65 | class Hypersim(_Hypersim, _PytorchDatasetWrapper): 66 | pass 67 | 68 | 69 | class NYUv2(_NYUv2, _PytorchDatasetWrapper): 70 | pass 71 | 72 | 73 | class ScanNet(_ScanNet, _PytorchDatasetWrapper): 74 | pass 75 | 76 | 77 | class SceneNetRGBD(_SceneNetRGBD, _PytorchDatasetWrapper): 78 | pass 79 | 80 | 81 | class SUNRGBD(_SUNRGBD, _PytorchDatasetWrapper): 82 | pass 83 | 84 | 85 | class ADE20K(_ADE20K, _PytorchDatasetWrapper): 86 | pass 87 | 88 | 89 | from .dataset_base import ConcatDataset as _ConcatDataset 90 | 91 | 92 | class _PytorchConcatDatasetWrapper: 93 | def __init__(self, *args, **kwargs) -> None: 94 | super().__init__(*args, **kwargs) 95 | self._preprocessor = None 96 | 97 | @property 98 | def transform(self) -> Union[Callable, None]: 99 | # just to be compatible with VisionDataset from torchvision 100 | return self.preprocessor 101 | 102 | @transform.setter 103 | def transform(self, value: Union[Callable, None]): 104 | # just to be compatible with VisionDataset from torchvision 105 | self.preprocessor = value 106 | 107 | @property 108 | def preprocessor(self) -> Union[Callable, None]: 109 | return self._preprocessor 110 | 111 | @preprocessor.setter 112 | def preprocessor(self, value: Union[Callable, None]): 113 | self._preprocessor = value 114 | # apply preprocessor to all datasets 115 | for ds in self._datasets: 116 | ds.preprocessor = value 117 | 118 | 119 | class ConcatDataset(_ConcatDataset, _PytorchConcatDatasetWrapper): 120 | pass 121 | 122 | 123 | _DATASETS = { 124 | 'ade20k': ADE20K, 125 | 'cityscapes': Cityscapes, 126 | 'coco': COCO, 127 | 'hypersim': Hypersim, 128 | 'nyuv2': NYUv2, 129 | 'scannet': ScanNet, 130 | 'scenenetrgbd': SceneNetRGBD, 131 | 'sunrgbd': SUNRGBD, 132 | } 133 | KNOWN_DATASETS = tuple(_DATASETS.keys()) 134 | 135 | DatasetType = Union[ 136 | ADE20K, 137 | Cityscapes, 138 | COCO, 139 | Hypersim, 140 | NYUv2, 141 | ScanNet, 142 | SceneNetRGBD, 143 | SUNRGBD, 144 | ConcatDataset 145 | ] 146 | 147 | 148 | def get_dataset_class(name: str, with_auxiliary_data: bool = False) -> Type[DatasetType]: 149 | name = name.lower() 150 | if name not in KNOWN_DATASETS: 151 | raise ValueError(f"Unknown dataset: '{name}'") 152 | 153 | original_dataset = _DATASETS[name] 154 | if with_auxiliary_data: 155 | current_dataset = wrap_dataset_with_auxiliary_data(original_dataset) 156 | else: 157 | current_dataset = original_dataset 158 | 159 | return current_dataset 160 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/scenenetrgbd/scenenetrgbd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Daniel Seichter 4 | .. 
codeauthor:: Söhnke Fischedick 5 | """ 6 | from ...dataset_base import DepthStats 7 | from ...dataset_base import SceneLabel 8 | from ...dataset_base import SceneLabelList 9 | from ...dataset_base import SemanticLabel 10 | from ...dataset_base import SemanticLabelList 11 | 12 | 13 | class SceneNetRGBDMeta: 14 | SPLITS = ('train', 'valid') 15 | SPLIT_FILELIST_FILENAMES = {SPLITS[0]: 'train.txt', SPLITS[1]: 'valid.txt'} 16 | 17 | _DATA_SAMPLE_KEYS = ('identifier', 'meta', 'rgb', 'depth') 18 | _ANNOTATION_SAMPLE_KEYS = ('semantic', 'instance', 'scene') 19 | SPLIT_SAMPLE_KEYS = { 20 | SPLITS[0]: _DATA_SAMPLE_KEYS+_ANNOTATION_SAMPLE_KEYS, 21 | SPLITS[1]: _DATA_SAMPLE_KEYS+_ANNOTATION_SAMPLE_KEYS, 22 | } 23 | 24 | # calculated using a subsampled dataset (see prepare_dataset.py): 25 | # --n_random_views_to_include_train 3 26 | # --n_random_views_to_include_valid 6 27 | # --force_at_least_n_classes_in_view 4 28 | # see: my_dataset.depth_compute_stats() for calculation 29 | TRAIN_SPLIT_DEPTH_STATS = DepthStats( 30 | min=0.0, 31 | max=20076.0, 32 | mean=4006.9281155769777, 33 | std=2459.7763971709933, 34 | ) 35 | 36 | DEPTH_MODES = ('refined',) 37 | 38 | CAMERAS = ('camera1',) # just a dummy camera name 39 | 40 | DEPTH_DIR = 'depth' 41 | RGB_DIR = 'rgb' 42 | SEMANTIC_13_DIR = 'semantic_13' 43 | INSTANCES_DIR = 'instance' 44 | SCENE_CLASS_DIR = 'scene' 45 | 46 | # number of classes without void (NYUv2 classes) 47 | SEMANTIC_N_CLASSES = 13 48 | # there are no orientations, thus, it is set to None 49 | SEMANTIC_LABEL_LIST = SemanticLabelList(( 50 | # class_name, is_thing, use orientations, color 51 | SemanticLabel('void', False, None, (0, 0, 0)), 52 | SemanticLabel('bed', True, None, (0, 0, 255)), 53 | SemanticLabel('books', True, None, (232, 88, 47)), 54 | SemanticLabel('ceiling', False, None, (0, 217, 0)), 55 | SemanticLabel('chair', True, None, (148, 0, 240)), 56 | SemanticLabel('floor', False, None, (222, 241, 23)), 57 | SemanticLabel('furniture', True, None, (255, 205, 205)), 58 | SemanticLabel('objects', True, None, (0, 223, 228)), 59 | SemanticLabel('picture', True, None, (106, 135, 204)), 60 | SemanticLabel('sofa', True, None, (116, 28, 41)), 61 | SemanticLabel('table', True, None, (240, 35, 235)), 62 | SemanticLabel('tv', True, None, (0, 166, 156)), 63 | SemanticLabel('wall', False, None, (249, 139, 0)), 64 | SemanticLabel('window', True, None, (225, 228, 194)), 65 | )) 66 | 67 | # original scene labels 68 | SCENE_LABEL_LIST = SceneLabelList(( 69 | SceneLabel('bathroom'), 70 | SceneLabel('bedroom'), 71 | SceneLabel('kitchen'), 72 | SceneLabel('living_room'), 73 | SceneLabel('office') 74 | )) 75 | 76 | # scene labels for indoor domestic environments 77 | # mapping dict with new labels as keys and tuple of old labels as values 78 | SCENE_LABEL_MAPPING_INDOOR_DOMESTIC = { 79 | SceneLabel('void'): ( 80 | ), 81 | SceneLabel('bathroom'): ( 82 | SceneLabel('bathroom'), 83 | ), 84 | SceneLabel('bedroom'): ( 85 | SceneLabel('bedroom'), 86 | ), 87 | SceneLabel('dining room'): ( 88 | ), 89 | SceneLabel('discussion room'): ( 90 | ), 91 | SceneLabel('hallway'): ( 92 | ), 93 | SceneLabel('kitchen'): ( 94 | SceneLabel('kitchen'), 95 | ), 96 | SceneLabel('living room'): ( 97 | SceneLabel('living_room'), 98 | ), 99 | SceneLabel('office'): ( 100 | SceneLabel('office'), 101 | ), 102 | SceneLabel('other indoor'): ( 103 | ), 104 | SceneLabel('stairs'): ( 105 | ) 106 | } 107 | 108 | SCENE_LABEL_LIST_INDOOR_DOMESTIC = SceneLabelList( 109 | tuple(SCENE_LABEL_MAPPING_INDOOR_DOMESTIC.keys()) 110 | ) 111 | # create 
index mapping 112 | SCENE_LABEL_IDX_TO_SCENE_LABEL_INDOOR_DOMESTIC_IDX = {} 113 | for new_label, old_labels in SCENE_LABEL_MAPPING_INDOOR_DOMESTIC.items(): 114 | for old_label in old_labels: 115 | old_idx = SCENE_LABEL_LIST.index(old_label) 116 | new_idx = SCENE_LABEL_LIST_INDOOR_DOMESTIC.index(new_label) 117 | SCENE_LABEL_IDX_TO_SCENE_LABEL_INDOOR_DOMESTIC_IDX[old_idx] = new_idx 118 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/auxiliary_data/_base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Daniel Seichter 4 | .. codeauthor:: Soehnke Fischedick 5 | """ 6 | from typing import Optional, Tuple, Union 7 | 8 | import abc 9 | import os 10 | 11 | import numpy as np 12 | import torch 13 | 14 | DEFAULT_CACHE_BASEPATH = os.getenv( 15 | "DEFAULT_CACHE_BASEPATH", 16 | "~/.cache/nicr_scene_analysis_datasets/auxiliary_data" 17 | ) 18 | 19 | 20 | class AuxiliaryDataEstimatorBase(abc.ABC): 21 | NAME: str 22 | 23 | def __init__( 24 | self, 25 | device: Union[str, torch.device] = 'cpu', 26 | max_pixels: Optional[int] = None, 27 | input_interpolation: str = 'bilinear', 28 | auto_set_up: bool = True, 29 | cache_basepath: Optional[str] = None, # None -> DEFAULT_CACHE_BASEPATH 30 | ) -> None: 31 | self._device = device 32 | self._max_pixels = max_pixels 33 | self._input_interpolation = input_interpolation 34 | 35 | # cache path 36 | self._cache_basepath = cache_basepath 37 | if self._cache_basepath is None: 38 | self._cache_basepath = os.path.expanduser(DEFAULT_CACHE_BASEPATH) 39 | 40 | self._cache_path = os.path.join(self._cache_basepath, self.NAME) 41 | os.makedirs(self._cache_path, exist_ok=True) 42 | 43 | if auto_set_up: 44 | self.set_up_estimator(self._device) 45 | 46 | @property 47 | def cache_path(self) -> str: 48 | return self._cache_path 49 | 50 | @abc.abstractmethod 51 | def set_up_estimator( 52 | self, 53 | device: Union[str, torch.device] = 'cpu' 54 | ) -> None: 55 | pass 56 | 57 | @staticmethod 58 | def _get_height_width( 59 | img: Union[torch.Tensor, np.ndarray] 60 | ) -> Tuple[int, int]: 61 | if 2 == img.ndim: 62 | # assume single channel: (H, W) 63 | return img.shape[0], img.shape[1] 64 | elif 3 == img.ndim: 65 | if isinstance(img, np.ndarray): 66 | # assume channels last: (H, W, C) 67 | return img.shape[0], img.shape[1] 68 | else: 69 | # assume channels first: (C, H, W) 70 | return img.shape[1], img.shape[2] 71 | elif 4 == img.ndim: 72 | # assume channels first with batch axis (B, C, H, W) 73 | return img.shape[2], img.shape[3] 74 | 75 | @staticmethod 76 | def _resize_image( 77 | img: torch.Tensor, 78 | height: int, 79 | width: int, 80 | mode: str = 'nearest' 81 | ) -> torch.Tensor: 82 | 83 | if AuxiliaryDataEstimatorBase._get_height_width(img) == (height, width): 84 | # nothing to do 85 | return img 86 | 87 | # resize 88 | return torch.nn.functional.interpolate( 89 | img, size=(height, width), mode=mode 90 | ) 91 | 92 | def prepare_input( 93 | self, 94 | image: Union[torch.Tensor, np.ndarray], 95 | ) -> torch.Tensor: 96 | # check input 97 | assert image.ndim in (3, 4) 98 | 99 | # store input type and original shape for later postprocessing 100 | is_numpy = isinstance(image, np.ndarray) 101 | h, w = self._get_height_width(image) 102 | 103 | # ensure torch tensor with channels first 104 | if is_numpy: 105 | # assume image is channels last, i.e., (H, W, C) 106 | assert image.ndim == 3 and \ 107 | (image.shape[-1] == 3 or 
image.shape[-1] == 1) 108 | image = torch.from_numpy(image) 109 | image = image.permute(2, 0, 1) # (H, W, C) -> (C, H, W) 110 | 111 | # ensure (B, C, H, W) 112 | if 3 == image.ndim: 113 | image = image[None, ...] 114 | 115 | if self._max_pixels is not None: 116 | # resize image to have at most max_pixels while keeping the aspect 117 | # ratio (uniform scale factor sqrt(max_pixels / n_pixels)) 118 | n_pixels = h * w 119 | if n_pixels > self._max_pixels: 120 | image = self._resize_image( 121 | image, 122 | height=int(np.round(h * np.sqrt(self._max_pixels / n_pixels))), 123 | width=int(np.round(w * np.sqrt(self._max_pixels / n_pixels))), 124 | mode=self._input_interpolation 125 | ) 126 | 127 | return image 128 | 129 | @abc.abstractmethod 130 | def predict( 131 | self, 132 | rgb_img: Union[torch.Tensor, np.ndarray], 133 | ) -> Union[torch.Tensor, np.ndarray]: 134 | pass 135 | 136 | @abc.abstractmethod 137 | def _estimator_predict(self, rgb_image: torch.Tensor) -> torch.Tensor: 138 | """(B,C,H,W) -> (B,H,W)""" 139 | pass 140 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/dataset_base/_class_weighting.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Daniel Seichter 4 | """ 5 | import warnings 6 | 7 | import numpy as np 8 | from tqdm import tqdm 9 | from tqdm.contrib.concurrent import thread_map 10 | 11 | 12 | KNOWN_CLASS_WEIGHTINGS = ( 13 | 'median-frequency', # median frequency balancing 14 | 'logarithmic', # logarithmic weighting with: 1 / ln(c+p_class) 15 | 'linear', # 1 - p_class 16 | 'none' # no weighting (ones for all classes) 17 | ) 18 | 19 | 20 | def compute_class_weights( 21 | dataset, 22 | sample_key, 23 | n_classes, 24 | ignore_first_class: bool = True, # ignore void class 25 | weight_mode: str = 'median-frequency', 26 | c: float = 1.02, 27 | n_threads: int = 1, 28 | debug: bool = False, 29 | verbose: bool = True 30 | ) -> np.ndarray: 31 | assert weight_mode in KNOWN_CLASS_WEIGHTINGS 32 | 33 | if verbose: 34 | print_ = print 35 | else: 36 | def print_(*args, **kwargs): 37 | pass 38 | 39 | if debug: 40 | warnings.warn( 41 | "Weight mode 'none' is forced as debug mode is enabled, i.e., " 42 | "ones are used as class weights." 43 | ) 44 | weight_mode = 'none' 45 | 46 | print_(f"Computing '{weight_mode}' class weights for '{sample_key}' ...") 47 | 48 | if 'none' == weight_mode: 49 | # equal weights for all classes -> disables class weighting 50 | if ignore_first_class: 51 | return np.ones(n_classes-1) 52 | else: 53 | return np.ones(n_classes) 54 | 55 | def count_helper(sample_idx): 56 | data = dataset.load(sample_key, sample_idx) 57 | h, w = data.shape 58 | n_pixels_per_class_sample = np.bincount( 59 | data.flatten(), 60 | minlength=n_classes 61 | ) 62 | 63 | # for median frequency, we need the pixel sum of the images where 64 | # the specific class is present. (it only matters if the class is 65 | # present in the image and not how many pixels it occupies.) 
66 | class_in_image = n_pixels_per_class_sample > 0 67 | n_image_pixels_with_class_sample = class_in_image * h * w 68 | 69 | return n_pixels_per_class_sample, n_image_pixels_with_class_sample 70 | 71 | n_pixels_per_class = np.zeros(n_classes, dtype=np.int64) 72 | n_image_pixels_with_class = np.zeros(n_classes, dtype=np.int64) 73 | 74 | if n_threads == 1: 75 | for i in tqdm(range(len(dataset)), 76 | total=len(dataset), 77 | disable=not verbose): 78 | # process current image at index i 79 | cur_n_pixels_per_class, cur_n_image_pixels_with_class = \ 80 | count_helper(i) 81 | 82 | # update stats 83 | n_pixels_per_class += cur_n_pixels_per_class 84 | n_image_pixels_with_class += cur_n_image_pixels_with_class 85 | else: 86 | # process images using multiple threads 87 | res = thread_map(count_helper, range(len(dataset)), 88 | total=len(dataset), 89 | max_workers=n_threads, 90 | disable=not verbose) 91 | # update stats 92 | for cur_n_pixels_per_class, cur_n_image_pixels_with_class in res: 93 | n_pixels_per_class += cur_n_pixels_per_class 94 | n_image_pixels_with_class += cur_n_image_pixels_with_class 95 | 96 | # remove first class (void) 97 | if ignore_first_class: 98 | n_pixels_per_class = n_pixels_per_class[1:] 99 | n_image_pixels_with_class = n_image_pixels_with_class[1:] 100 | 101 | if weight_mode == 'linear': 102 | probabilities = n_pixels_per_class / np.sum(n_pixels_per_class) 103 | class_weights = 1 - probabilities 104 | 105 | elif weight_mode == 'median-frequency': 106 | frequency = n_pixels_per_class / n_image_pixels_with_class 107 | class_weights = np.nanmedian(frequency) / frequency 108 | 109 | elif weight_mode == 'logarithmic': 110 | probabilities = n_pixels_per_class / np.sum(n_pixels_per_class) 111 | class_weights = 1 / np.log(c + probabilities) 112 | 113 | nan_indices = np.argwhere(np.isnan(class_weights)) 114 | if len(nan_indices) != 0: 115 | print_(f"class_weights:\n{class_weights}") 116 | warnings.warn( 117 | f"Class weights contain NaNs at positions: {nan_indices}, " 118 | "setting NaNs to 0." 119 | ) 120 | print_(f"n_pixels_per_class:\n{n_pixels_per_class}") 121 | print_(f"n_image_pixels_with_class:\n{n_image_pixels_with_class}") 122 | class_weights[nan_indices] = 0 123 | print_(f"fixed class_weights:\n{class_weights}") 124 | 125 | return class_weights 126 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/utils/img.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. 
codeauthor:: Daniel Seichter 4 | """ 5 | import numpy as np 6 | from PIL import Image 7 | 8 | 9 | from ._colormaps import COLORMAP_VISUALLY_DISTINCT_VOID_PLUS_256 10 | from ._colormaps import COLORMAP_VISUALLY_DISTINCT_256 11 | 12 | 13 | def dimshuffle(input_img, from_axes, to_axes): 14 | # check axes parameter 15 | if from_axes.find('0') == -1 or from_axes.find('1') == -1: 16 | raise ValueError("`from_axes` must contain both axis0 ('0') and" 17 | "axis 1 ('1')") 18 | if to_axes.find('0') == -1 or to_axes.find('1') == -1: 19 | raise ValueError("`to_axes` must contain both axis0 ('0') and" 20 | "axis 1 ('1')") 21 | if len(from_axes) != len(input_img.shape): 22 | raise ValueError("Number of axis given by `from_axes` does not match " 23 | "the number of axis in `input_img`") 24 | 25 | # handle special cases for channel axis 26 | to_axes_c = to_axes.find('c') 27 | from_axes_c = from_axes.find('c') 28 | # remove channel axis (only grayscale image) 29 | if to_axes_c == -1 and from_axes_c >= 0: 30 | if input_img.shape[from_axes_c] != 1: 31 | raise ValueError('Cannot remove channel axis because size is not ' 32 | 'equal to 1') 33 | input_img = input_img.squeeze(axis=from_axes_c) 34 | from_axes = from_axes.replace('c', '') 35 | 36 | # handle special cases for batch axis 37 | to_axes_b = to_axes.find('b') 38 | from_axes_b = from_axes.find('b') 39 | # remove batch axis 40 | if to_axes_b == -1 and from_axes_b >= 0: 41 | if input_img.shape[from_axes_b] != 1: 42 | raise ValueError('Cannot remove batch axis because size is not ' 43 | 'equal to 1') 44 | input_img = input_img.squeeze(axis=from_axes_b) 45 | from_axes = from_axes.replace('b', '') 46 | 47 | # add new batch axis (in front) 48 | if to_axes_b >= 0 and from_axes_b == -1: 49 | input_img = input_img[np.newaxis] 50 | from_axes = 'b' + from_axes 51 | 52 | # add new channel axis (in front) 53 | if to_axes_c >= 0 and from_axes_c == -1: 54 | input_img = input_img[np.newaxis] 55 | from_axes = 'c' + from_axes 56 | 57 | return np.transpose(input_img, [from_axes.find(a) for a in to_axes]) 58 | 59 | 60 | def get_colormap(n): 61 | def bitget(byteval, idx): 62 | return (byteval & (1 << idx)) != 0 63 | 64 | cmap = np.zeros((n, 3), dtype='uint8') 65 | for i in range(n): 66 | r = g = b = 0 67 | c = i 68 | for j in range(8): 69 | r = r | (bitget(c, 0) << 7-j) 70 | g = g | (bitget(c, 1) << 7-j) 71 | b = b | (bitget(c, 2) << 7-j) 72 | c = c >> 3 73 | 74 | cmap[i] = np.array([r, g, b]) 75 | 76 | return cmap 77 | 78 | 79 | def get_visual_distinct_colormap(with_void: bool = True) -> np.ndarray: 80 | # useful for visualizing instances 81 | if with_void: 82 | return COLORMAP_VISUALLY_DISTINCT_VOID_PLUS_256 83 | else: 84 | return COLORMAP_VISUALLY_DISTINCT_256 85 | 86 | 87 | def save_indexed_png(filepath, label, colormap): 88 | # note that OpenCV is not able to handle indexed pngs correctly. 89 | img = Image.fromarray(np.asarray(label, dtype='uint8')) 90 | img.putpalette(list(np.asarray(colormap, dtype='uint8').flatten())) 91 | img.save(filepath, 'PNG') 92 | 93 | 94 | def blend_images(img1, img2, alpha=0.5): 95 | """ 96 | Function to alpha composite two images. The output image is calculated 97 | by img_out = ( 1 - ( alpha*( img2 > 0 ) ) )*img1 + alpha*img2. 98 | 99 | Parameters 100 | ---------- 101 | img1 : {numpy.ndarray, list, tuple} 102 | The first image with axes '01' or '01c' and of dtype 'uintX' or 103 | 'floatX'. (background image). 
104 | img2 : {numpy.ndarray, list, tuple} 105 | The second image with axes '01' or '01c' and of dtype 'uintX' or 106 | 'floatX' (foreground image). 107 | alpha : {float} 108 | The alpha value to use: 0.0 <= alpha <= 1.0. 109 | 110 | Returns 111 | ------- 112 | img_out : numpy.ndarray 113 | The resulting image. 114 | 115 | """ 116 | # ensure that img is a numpy object 117 | img1 = np.asanyarray(img1) 118 | img2 = np.asanyarray(img2) 119 | assert img1.dtype == img2.dtype 120 | assert img1.ndim == img2.ndim 121 | 122 | # alpha composite images 123 | if img2.ndim == 3: 124 | mask = np.any(img2 > 0, axis=2) 125 | else: 126 | mask = img2 > 0 127 | 128 | result = img1.copy() 129 | result[mask, ...] = \ 130 | ((1-alpha)*img1[mask, ...] + alpha*img2[mask, ...]).astype(img1.dtype) 131 | 132 | return result 133 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # build system ----------------------------------------------------------------- 2 | [build-system] 3 | build-backend = "setuptools.build_meta" 4 | requires = [ 5 | "setuptools>=61.0", 6 | "wheel", 7 | ] 8 | 9 | # project metadata ------------------------------------------------------------- 10 | [project] 11 | name = "nicr-scene-analysis-datasets" 12 | description = "Package to prepare and use common datasets for scene analysis." 13 | authors = [ 14 | { name = "Daniel Seichter", email = "daniel.seichter@tu-ilmenau.de" }, 15 | { name = "Soehnke Fischedick", email = "soehnke.fischedick@tu-ilmenau.de" }, 16 | ] 17 | license = { file = "LICENSE" } 18 | readme = "README.md" 19 | requires-python = ">=3.8" 20 | dynamic = [ "version", "dependencies", "optional-dependencies" ] 21 | 22 | [project.urls] 23 | Homepage = "https://www.tu-ilmenau.de/neurob" 24 | Repository = "https://github.com/TUI-NICR/nicr-scene-analysis-datasets" 25 | 26 | # package configuration ------------------------------------------------------- 27 | [tool.setuptools.packages.find] 28 | where = ["src"] 29 | 30 | # additional package data 31 | [tool.setuptools.package-data] 32 | nicr_scene_analysis_datasets = [ 33 | # ADE20K 34 | "datasets/ade20k/README.md", 35 | # Cityscapes 36 | "datasets/cityscapes/README.md", 37 | # COCO 38 | "datasets/coco/README.md", 39 | # Hypersim 40 | "datasets/hypersim/README.md", 41 | # NYUv2 42 | "datasets/nyuv2/README.md", 43 | "datasets/nyuv2/class13Mapping.mat", 44 | "datasets/nyuv2/classMapping40.mat", 45 | "datasets/nyuv2/splits.mat", 46 | "datasets/nyuv2/manual_orientations_test.json", 47 | "datasets/nyuv2/manual_orientations_train.json", 48 | # SceneNet RGB-D 49 | "datasets/scenenetrgbd/README.md", 50 | "datasets/scenenetrgbd/scenenet.proto", 51 | # SUN RGB-D 52 | "datasets/sunrgbd/README.md", 53 | "datasets/sunrgbd/nyu_additional_class_mapping.json", 54 | "datasets/sunrgbd/nyu_weak_box_3d_mapping.json", 55 | # ScanNet 56 | "datasets/scannet/README.md", 57 | "datasets/scannet/scannetv2_train.txt", 58 | "datasets/scannet/scannetv2_val.txt", 59 | "datasets/scannet/scannetv2_test.txt", 60 | ] 61 | 62 | # version and dependencies 63 | [tool.setuptools.dynamic] 64 | version = { attr = "nicr_scene_analysis_datasets.version.__version__" } 65 | dependencies = { file = [ 66 | "requirements/base.txt" 67 | ] } 68 | optional-dependencies.withpreparation = { file = [ 69 | "requirements/preparation.txt" 70 | ] } 71 | optional-dependencies.withopencv = { file = [ 72 | "requirements/opencv.txt" 73 | ] } 74 | 
optional-dependencies.withtorch = { file = [ 75 | "requirements/torch.txt" 76 | ] } 77 | optional-dependencies.with3d = { file = [ 78 | "requirements/3d.txt" 79 | ] } 80 | optional-dependencies.withauxiliarydata = { file = [ 81 | "requirements/torch.txt", 82 | "requirements/depth_estimation.txt", 83 | "requirements/embedding_estimation.txt", 84 | ] } 85 | optional-dependencies.withdepthestimation = { file = [ 86 | "requirements/torch.txt", 87 | "requirements/depth_estimation.txt", 88 | ] } 89 | optional-dependencies.withembeddingestimation = { file = [ 90 | "requirements/torch.txt", 91 | "requirements/embedding_estimation.txt" 92 | ] } 93 | optional-dependencies.test = { file = [ 94 | "requirements/torch.txt", 95 | "requirements/depth_estimation.txt", 96 | "requirements/embedding_estimation.txt", 97 | "requirements/test.txt", 98 | ] } 99 | 100 | # entry points 101 | [project.scripts] 102 | nicr_sa_prepare_dataset = "nicr_scene_analysis_datasets.scripts.prepare_dataset:main" 103 | nicr_sa_prepare_labeled_point_clouds = "nicr_scene_analysis_datasets.scripts.prepare_labeled_point_clouds:main" 104 | nicr_sa_depth_viewer = "nicr_scene_analysis_datasets.scripts.viewer_depth:main" 105 | nicr_sa_semantic_instance_viewer = "nicr_scene_analysis_datasets.scripts.viewer_semantic_instance:main" 106 | nicr_sa_labeled_pc_viewer = "nicr_scene_analysis_datasets.scripts.viewer_labeled_point_cloud:main" 107 | nicr_sa_generate_auxiliary_data = "nicr_scene_analysis_datasets.scripts.generate_auxiliary_data:main" 108 | 109 | # linting ---------------------------------------------------------------------- 110 | [tool.ruff] 111 | exclude = [ 112 | # we are not the authors of these files 113 | "src/*scannet/scannet200_constants.py", 114 | "src/*scannet/SensorData.py", 115 | # stuff 116 | "stuff/*", 117 | ] 118 | 119 | [tool.ruff.lint] 120 | ignore = [ 121 | # E501 line too long (82 > 79 characters) 122 | "E501", 123 | # E402 module level import not at top of file 124 | "E402", 125 | # E731 do not assign a lambda expression, use a def 126 | "E731", 127 | # [not implemented in ruff] line breaks W503 vs. W504 128 | # "W504" 129 | ] 130 | 131 | [tool.ruff.lint.per-file-ignores] 132 | "__init__.py" = [ 133 | # allow unused imports in __init__.py files 134 | "F401" 135 | ] 136 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/scenenetrgbd/scenenet.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | package scenenet; 4 | 5 | message SceneLayout { 6 | enum LayoutType { 7 | BATHROOM = 1; 8 | BEDROOM = 2; 9 | KITCHEN = 3; 10 | LIVING_ROOM = 4; 11 | OFFICE = 5; 12 | } 13 | optional LayoutType layout_type = 1; 14 | // This is the name of the SceneNet model used for the layout 15 | optional string model = 2; 16 | } 17 | 18 | message LightInfo { 19 | enum LightType { 20 | SPHERE = 1; 21 | PARALLELOGRAM = 2; 22 | } 23 | optional LightType light_type = 1; 24 | // Light intensity 25 | optional Power light_output = 2; 26 | // This is the center for sphere type lights. 
And corner for others 27 | optional Position position = 3; 28 | // This is only for SPHERE lights 29 | optional float radius = 4; 30 | // This is only for PARALLELOGRAM lights 31 | optional Position v1 = 5; 32 | optional Position v2 = 6; 33 | } 34 | 35 | message RandomObjectInfo { 36 | optional string shapenet_hash = 1; 37 | optional float height_meters = 2; 38 | message Transformation { 39 | // The 3x4 matrix is as follows: 40 | // rotation_mat11 rotation_mat12 rotation_mat13 translation_x 41 | // rotation_mat21 rotation_mat22 rotation_mat23 translation_y 42 | // rotation_mat31 rotation_mat32 rotation_mat33 translation_y 43 | optional float translation_x = 1; 44 | optional float translation_y = 2; 45 | optional float translation_z = 3; 46 | optional float rotation_mat11 = 4; 47 | optional float rotation_mat12 = 5; 48 | optional float rotation_mat13 = 6; 49 | optional float rotation_mat21 = 7; 50 | optional float rotation_mat22 = 8; 51 | optional float rotation_mat23 = 9; 52 | optional float rotation_mat31 = 10; 53 | optional float rotation_mat32 = 11; 54 | optional float rotation_mat33 = 12; 55 | } 56 | // The transformation gives the transformation applies to an object, about 57 | // the center of the base plane of its axis-aligned bounding box. 58 | optional Transformation object_pose = 3; 59 | } 60 | 61 | message Instance { 62 | optional int32 instance_id = 1; 63 | optional string semantic_wordnet_id = 2; 64 | optional string semantic_english = 3; 65 | enum InstanceType { 66 | // This is the instance type when no object is present, e.g. because of 67 | // looking out a window into nothingness 68 | BACKGROUND = 1; 69 | // This is an object that is hard coded into the layout and does not 70 | // move. This type does not have a transformation or shapenet hash 71 | LAYOUT_OBJECT = 2; 72 | // This is a randomly positioned light source 73 | LIGHT_OBJECT = 3; 74 | // This means the object is a randomly positioned shapenet object. The 75 | // object has a transformation and scale parameter in the object_info 76 | // variable. 77 | RANDOM_OBJECT = 4; 78 | } 79 | optional InstanceType instance_type = 4; 80 | // This information is only filled in for the respective type 81 | optional LightInfo light_info = 5; 82 | optional RandomObjectInfo object_info = 6; 83 | } 84 | 85 | message Power { 86 | optional float r = 1; 87 | optional float g = 2; 88 | optional float b = 3; 89 | } 90 | 91 | message Position { 92 | optional float x = 1; 93 | optional float y = 2; 94 | optional float z = 3; 95 | } 96 | 97 | message Pose { 98 | // The position of these two points define the camera view. The y vector is 99 | // defined as [0,1,0]. For an example of how to calculate the camera view 100 | // coordinate system, see the python codebase. 101 | optional Position camera = 1; 102 | optional Position lookat = 2; 103 | optional float timestamp = 3; 104 | } 105 | 106 | message View { 107 | // These increment by the number of skip frames, i.e. 0,25,50...7475. 
108 | optional int32 frame_num = 1; 109 | // The photo is rendered by integrating uniformly sampled 110 | // exposures between the following two poses 111 | optional Pose shutter_open = 2; 112 | optional Pose shutter_close = 3; 113 | } 114 | 115 | message Trajectory { 116 | optional SceneLayout layout = 1; 117 | // The first instances[0] is always the 'background' and 118 | // undefined class when for example looking out windows 119 | repeated Instance instances = 2; 120 | // These are ordered sequentially for a trajectory 121 | repeated View views = 3; 122 | // This stores the path from the root data directory to the trajectory data 123 | // folder. If the trajectories are stored as: 124 | // /path/i/extracted/{val/train}/0/123/photo/0.jpg 125 | // then this path will be '0/123' designating the trajectories folder 126 | optional string render_path = 4; 127 | } 128 | 129 | message Trajectories { 130 | // This is the root list which stores all of the available trajectories 131 | repeated Trajectory trajectories = 1; 132 | } 133 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/dataset_base/_concat_dataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Daniel Seichter 4 | """ 5 | from typing import Any, Tuple, Union 6 | 7 | from collections import OrderedDict 8 | import warnings 9 | 10 | from ._base_dataset import DatasetBase 11 | 12 | 13 | class ConcatDataset: 14 | def __init__( 15 | self, 16 | main_dataset: DatasetBase, 17 | *additional_datasets: DatasetBase 18 | ) -> None: 19 | self._main_dataset = main_dataset 20 | self._additional_datasets = additional_datasets 21 | self._datasets = (main_dataset,) + additional_datasets 22 | self._active_datasets = (main_dataset,) + additional_datasets 23 | 24 | # catch common misconfiguration 25 | for ds in self._datasets: 26 | if hasattr(ds, 'depth_force_mm') and not ds.depth_force_mm: 27 | # actually SUNRGB-D 28 | warnings.warn( 29 | f"Detected '{ds.__class__.__name__}' dataset with " 30 | "deviating depth scale, consider setting " 31 | "`depth_force_mm` to 'True'." 
32 | ) 33 | 34 | # extract information from main dataset 35 | self._sample_keys = main_dataset.sample_keys 36 | # ensure that all additional datasets provide the sample keys of the 37 | # main dataset 38 | for ds in self._additional_datasets: 39 | assert all(sk in ds.sample_keys for sk in self._sample_keys) 40 | 41 | # handle cameras (create ordered union of all cameras) 42 | # note, we use dicts instead of sets to preserve the order (sets use a 43 | # random seed while hashing and, thus, do not guarantee insertion order) 44 | assert all(ds.camera is None for ds in self._datasets) 45 | cameras = [] 46 | for ds in self._datasets: 47 | cameras.extend(ds.cameras) 48 | # note as of python 3.7, dicts guarantee insertion order, so we might 49 | # use dict in future 50 | self._cameras = tuple(OrderedDict.fromkeys(cameras).keys()) 51 | 52 | self._camera = None 53 | 54 | def filter_camera(self, camera: Union[None, str]): 55 | assert camera is None or camera in self.cameras 56 | 57 | # apply filter to all datasets 58 | # note, not all datasets may support given camera, filter them using 59 | # active_datasets 60 | active_datasets = [] 61 | for ds in self._datasets: 62 | if camera is None or camera in ds.cameras: 63 | ds.filter_camera(camera) 64 | active_datasets.append(ds) 65 | 66 | self._active_datasets = tuple(active_datasets) 67 | self._camera = camera 68 | 69 | return self 70 | 71 | def __enter__(self): 72 | # handles context stuff, e.g., with dataset.filter_camera('xy') as ds 73 | return self 74 | 75 | def __exit__(self, *exc: Any): 76 | # handles context stuff, e.g., with dataset.filter_camera('xy') as ds 77 | # reset camera filter 78 | self.filter_camera(None) 79 | 80 | def __len__(self) -> int: 81 | return sum(len(ds) for ds in self._active_datasets) 82 | 83 | @property 84 | def datasets(self) -> Tuple[DatasetBase]: 85 | return self._active_datasets 86 | 87 | def _determine_dataset_and_idx(self, idx: int) -> Tuple[DatasetBase, int]: 88 | length = len(self) 89 | 90 | # ensure that idx is in valid range 91 | if not (-length <= idx < length): 92 | raise IndexError(f"Index {idx} out of range (length: {length}).") 93 | 94 | # handle negative indices 95 | if idx < 0: 96 | idx += length 97 | 98 | # note that the lengths may change if filter_dataset is called outside 99 | for ds in self._active_datasets: 100 | if idx < len(ds): 101 | return ds, idx 102 | idx -= len(ds) 103 | 104 | def load(self, sample_key: str, idx: int) -> Any: 105 | ds, ds_idx = self._determine_dataset_and_idx(idx) 106 | return ds._sample_key_loaders.get(sample_key.lower())(ds_idx) 107 | 108 | def __getitem__(self, idx: int): 109 | # note, we also reimplement __getitem__ to do index mapping stuff only 110 | # once per sample 111 | ds, ds_idx = self._determine_dataset_and_idx(idx) 112 | return ds[ds_idx] 113 | 114 | @property 115 | def cameras(self) -> Tuple[str]: 116 | return self._cameras 117 | 118 | @property 119 | def camera(self) -> Union[None, str]: 120 | return self._camera 121 | 122 | def __getstate__(self): 123 | # important for copying 124 | return self.__dict__ 125 | 126 | def __setstate__(self, state): 127 | # important for copying 128 | self.__dict__ = state 129 | 130 | def __getattr__(self, name): 131 | if name not in self.__dict__ and '_main_dataset' in self.__dict__: 132 | # redirect all other attributes/calls to main dataset 133 | return getattr(self._main_dataset, name) 134 | 135 | return super().__getattr__(name) 136 | -------------------------------------------------------------------------------- 
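The `ConcatDataset` above is exercised in `tests/test_concat.py` further below; as a quick orientation, here is a minimal usage sketch (the dataset paths are placeholders and assume datasets already prepared with `nicr_sa_prepare_dataset`):

```python
from nicr_scene_analysis_datasets import ConcatDataset, get_dataset_class

# two already prepared datasets sharing a common set of sample keys
sample_keys = ('identifier', 'depth')
sunrgbd = get_dataset_class('sunrgbd')(
    dataset_path='/path/to/prepared/sunrgbd',   # placeholder path
    split='train',
    sample_keys=sample_keys,
    depth_force_mm=True,   # align depth scale with the other dataset
)
nyuv2 = get_dataset_class('nyuv2')(
    dataset_path='/path/to/prepared/nyuv2',     # placeholder path
    split='train',
    sample_keys=sample_keys,
)

# the first argument is the main dataset, further datasets are appended
dataset = ConcatDataset(sunrgbd, nyuv2)
print(len(dataset))      # sum of both lengths
print(dataset.cameras)   # ordered union of the cameras of both datasets

# temporarily restrict to a single camera; datasets that do not provide it
# are excluded while the filter is active
with dataset.filter_camera('kv1') as ds:
    sample = ds[0]       # dict with 'identifier' and 'depth'
# leaving the context resets the filter (camera is None again)
```

Note that the first argument acts as the main dataset: attribute accesses not handled by the concatenated object itself are redirected to it (see `__getattr__` above).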
/.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | stages: 2 | - stylecheck 3 | - test 4 | - deploy 5 | 6 | .conda_env: &conda_env 7 | before_script: 8 | # update conda 9 | - conda config --set always_yes yes 10 | - conda update -q conda 11 | 12 | # create and activate environment 13 | - conda create -q -n testenv_${CI_PIPELINE_ID}_${CI_JOB_NAME}_py${PYTHON_VERSION_TO_USE//./} python=${PYTHON_VERSION_TO_USE} 14 | - source activate testenv_${CI_PIPELINE_ID}_${CI_JOB_NAME}_py${PYTHON_VERSION_TO_USE//./} 15 | after_script: 16 | # remove environment 17 | - conda env remove --name testenv_${CI_PIPELINE_ID}_${CI_JOB_NAME}_py${PYTHON_VERSION_TO_USE//./} 18 | 19 | .test_template: &test_template 20 | <<: *conda_env 21 | stage: test 22 | rules: 23 | - if: $CI_MERGE_REQUEST_TITLE =~ /^(Draft:|WIP:|\[Draft\]|\[WIP\])/ 24 | when: manual 25 | - if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_TITLE !~ /^(Draft:|WIP:|\[Draft\]|\[WIP\])/ 26 | - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH 27 | - if: $CI_PIPELINE_SOURCE == "schedule" 28 | script: 29 | # for debugging, list all environment variables 30 | # - export 31 | 32 | # install packages (use conda to avoid time-consuming installations) 33 | - conda install -q pytest pytest-cov 34 | - python -m pip install -q pytest-html 35 | 36 | # pytorch and detectron 37 | - | 38 | if [ "${PYTHON_VERSION_TO_USE}" == "3.6" ]; then 39 | conda install pytorch=1.10.1 torchvision=0.11.2 cpuonly -c pytorch 40 | python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cpu/torch1.10/index.html 41 | elif [ "${PYTHON_VERSION_TO_USE}" == "3.8" ]; then 42 | # EMSANet 43 | conda install pytorch=1.13.0 torchvision=0.14.0 cpuonly -c pytorch 44 | python -m pip install 'git+https://github.com/facebookresearch/detectron2.git' 45 | else 46 | conda install pytorch=2.0.1 torchvision=0.15.2 cpuonly -c pytorch 47 | python -m pip install 'git+https://github.com/facebookresearch/detectron2.git' 48 | fi 49 | 50 | # panopticapi 51 | - pip install git+https://github.com/cocodataset/panopticapi.git 52 | # check conda installation 53 | - conda info 54 | - conda list 55 | 56 | # install package (and all missing dependencies) 57 | - python -m pip install -q --editable .[test] 58 | 59 | # test package (opt: get coverage) 60 | - | 61 | if [ "${REPORT_COVERAGE}" == "true" ]; then 62 | py.test tests/ -rx -s -vv --cov=${CI_PROJECT_NAME//-/_} --cov-report html --cov-report term --html=report_py${PYTHON_VERSION_TO_USE//./}.html --self-contained-html 63 | else 64 | py.test tests/ -rx -s -vv --html=report_py${PYTHON_VERSION_TO_USE//./}.html --self-contained-html 65 | fi 66 | coverage: '/^TOTAL.*\s+(\d+\%)$/' 67 | artifacts: 68 | when: always 69 | paths: 70 | - report_py${PYTHON_VERSION_TO_USE//./}.html 71 | - htmlcov 72 | 73 | style_check: 74 | <<: *conda_env 75 | stage: stylecheck 76 | rules: 77 | - if: $CI_PIPELINE_SOURCE == "merge_request_event" 78 | - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH 79 | - if: $CI_PIPELINE_SOURCE == "schedule" 80 | variables: 81 | PYTHON_VERSION_TO_USE: "3.6" # ubuntu 18.04 / jetson default 82 | script: 83 | # install packages 84 | - conda install -q pycodestyle pylint 85 | 86 | # check style using pep8 87 | - find ./ -name "*.py" -not -path "*/stuff/*" | xargs pycodestyle --show-source --show-pep8 88 | 89 | # check style using pylint (without taking into account) 90 | - pylint ${CI_PROJECT_NAME//-/_} --rcfile=${CI_PROJECT_DIR}/.pylintrc || true 91 | 92 | # tests_py36: # 
ubuntu18 93 | # <<: *test_template 94 | # variables: 95 | # PYTHON_VERSION_TO_USE: "3.6" 96 | # REPORT_COVERAGE: "false" 97 | 98 | tests_py38: # ubuntu20 99 | <<: *test_template 100 | variables: 101 | PYTHON_VERSION_TO_USE: "3.8" 102 | REPORT_COVERAGE: "true" 103 | 104 | tests_py310: # ubuntu22 105 | <<: *test_template 106 | variables: 107 | PYTHON_VERSION_TO_USE: "3.10" 108 | REPORT_COVERAGE: "false" 109 | 110 | tests_py311: # current 111 | <<: *test_template 112 | variables: 113 | PYTHON_VERSION_TO_USE: "3.11" 114 | REPORT_COVERAGE: "false" 115 | 116 | update_pip_package: 117 | stage: deploy 118 | rules: 119 | - if: $CI_PIPELINE_SOURCE == "schedule" 120 | when: never 121 | - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH 122 | - if: $CI_PIPELINE_SOURCE == "merge_request_event" 123 | when: manual 124 | script: 125 | - pip install twine 126 | 127 | # build package 128 | - python setup.py sdist bdist_wheel 129 | 130 | # upload package 131 | - export TWINE_USERNAME=${TWINE_USERNAME} 132 | - export TWINE_PASSWORD=${TWINE_PASSWORD} 133 | - python -m twine upload --skip-existing --repository-url ${CI_API_V4_URL}/projects/${PACKAGE_REGISTRY}/packages/pypi dist/* 134 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/ade20k/README.md: -------------------------------------------------------------------------------- 1 | # ADE20K dataset 2 | This dataset provides access to the ADE20K-based semantic segmentation benchmarks, primarily focusing on the MIT Scene Parse Benchmark Challenge 2016 and Places Challenge 2017. 3 | The 2016 Challenge data contains over 20K scene-centric images with pixel-level semantic annotations for 150 object categories, including both stuff classes (sky, road, grass) and thing classes (person, car, bed). 4 | The 2017 Places Challenge adds instance segmentation annotations for the same images. 5 | 6 | For more details about the challenges, see: [MIT Scene Parse Benchmark Challenge 2016](http://sceneparsing.csail.mit.edu/) 7 | 8 | ## Dataset versions 9 | There are two main versions of the dataset which are currently supported in this package: 10 | 11 | 1. [MIT Scene Parse Benchmark Challenge 2016 - MIT SceneParse150](http://sceneparsing.csail.mit.edu/): 12 | - see [GitHub](https://github.com/CSAILVision/sceneparsing) 13 | - benchmark that contains a subset of images and labels of the ADE20K dataset 14 | - is often misleadingly referred to as 'ADE20K' in literature as the data come from the ADE20K dataset 15 | - contains 20,210 training images and 2,000 images for validation, for which semantic annotations and scene classes are available; in addition to that, there is a test set containing 3,352 images 16 | - 150 semantic classes - result from selecting the 150 most frequent object classes from ADE20K (which contains 3600+ classes) - annotations that are not part of this 150 class subset are ignored (treated as background/void) 17 | - usually used for semantic segmentation tasks 18 | - images taken from ADE20K are rescaled so that their longer side is at most 512px 19 | 20 | 2. [Places Challenge 2017](http://placeschallenge.csail.mit.edu/) 21 | - see [GitHub](https://github.com/CSAILVision/placeschallenge) 22 | - adds instance segmentation annotations to the challenge data from 2016 (above) 23 | - can be combined to panoptic segmentation data (100 thing classes and 50 stuff classes) 24 | 25 | ### Additional dataset versions (currently not supported) 26 | 3. 
[ADE20K Dataset 2021](https://groups.csail.mit.edu/vision/datasets/ADE20K/): 27 | - version from January 17, 2021 - most recent version of the dataset 28 | - 27,574 images (25,574 for training and 2,000 for testing/validation) - therefore, containing about 5K additional training images: 29 | - images come in various sizes, some of them larger than the 512px mentioned above 30 | - semantic, instance, scene and part annotations 31 | - primarily used for open vocabulary segmentation 32 | 33 | ## Prepare dataset 34 | 35 | 1. Download and convert the dataset to the desired format: 36 | The Challenge data can be downloaded without having to register. 37 | Therefore, the prepare script can handle the download. 38 | If you prefer to download the files yourself, you may provide the path to the archives of the challenge data with `--challenge-2016-filepath` AND `--challenge-2017-instances-filepath` (instance annotations are given as a separate download). 39 | 40 | ```bash 41 | # general usage 42 | nicr_sa_prepare_dataset ade20k \ 43 | /path/where/to/store/ade20k/ \ 44 | [--challenge-2016-filepath] \ 45 | [--challenge-2017-instances-filepath] \ 46 | [--n-processes N] 47 | ``` 48 | 49 | With arguments: 50 | - `--challenge-2016-filepath`: 51 | Path to the '2016 Scene Parse Benchmark Challenge' zip file (ADEChallengeData2016.zip). 52 | - `--challenge-2017-instances-filepath`: 53 | Path to the tar file containing the instance annotations of the '2017 Places Challenge' tar file (annotations_instance.tar). 54 | - `--n-processes`: 55 | The number of worker processes to spawn. 56 | 57 | 2. (Optional) Generate auxiliary data 58 | > **Note**: To use auxiliary data generation, the package must be installed with the `withauxiliarydata` option: 59 | > ```bash 60 | > pip install -e .[withauxiliarydata] 61 | > ``` 62 | 63 | ```bash 64 | # for auxiliary data such as synthetic depth and rgb/panoptic embeddings 65 | nicr_sa_generate_auxiliary_data \ 66 | --dataset ade20k \ 67 | --dataset-path /path/to/already/prepared/ade20k/dataset \ 68 | --auxiliary-data depth image-embedding panoptic-embedding \ 69 | --embedding-estimator-device cuda \ 70 | --embedding-estimators alpha_clip__l14-336-grit-20m \ 71 | --depth-estimator-device cuda \ 72 | --depth-estimators depthanything_v2__indoor_large \ 73 | --cache-models 74 | ``` 75 | With arguments: 76 | - `--dataset-path`: 77 | Path to the prepared ADE20k dataset. 78 | - `--auxiliary-data`: 79 | Types of auxiliary data to generate: 80 | - `depth`: Generates synthetic depth images from RGB. 81 | - `image-embedding`: Uses Alpha-CLIP to generate an embedding for the entire image. 82 | - `panoptic-embedding`: Uses Alpha-CLIP to generate an embedding for each panoptic mask. 83 | - `--depth-estimator-device`: 84 | Device to use for depth estimation (`cpu` or `cuda`). 85 | - `--depth-estimators`: 86 | Depth estimator(s) to use. Use `depthanything_v2__indoor_large` to match DVEFormer. 87 | - `--embedding-estimator-device`: 88 | Device to use for embedding estimation (`cpu` or `cuda`). 89 | - `--embedding-estimators`: 90 | Embedding estimator(s) to use. Use `alpha_clip__l14-336-grit-20m` to match DVEFormer. 91 | - `--cache-models`: 92 | Cache models locally to avoid reloading them in future runs. 93 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/utils/io.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Daniel Seichter 4 | .. 
codeauthor:: Soehnke Fischedick 5 | """ 6 | from typing import Any, Dict, List, Union 7 | 8 | from collections import OrderedDict 9 | from datetime import datetime 10 | import getpass 11 | import hashlib 12 | import json 13 | import os 14 | import sys 15 | from time import time 16 | import urllib.request 17 | import zipfile 18 | 19 | from tqdm import tqdm 20 | 21 | 22 | from ..version import get_version 23 | 24 | 25 | CREATION_META_FILENAME = 'creation_meta.json' 26 | 27 | 28 | class DownloadProgressBar(tqdm): 29 | def update_to(self, b=1, bsize=1, tsize=None): 30 | if tsize is not None: 31 | self.total = tsize 32 | self.update(b * bsize - self.n) 33 | 34 | 35 | def extract_zip(zip_filepath: str, output_dirpath: str) -> None: 36 | with zipfile.ZipFile(zip_filepath, 'r') as zip_file: 37 | for m in tqdm(zip_file.infolist(), desc='Extracting'): 38 | zip_file.extract(m, output_dirpath) 39 | 40 | 41 | def download_file( 42 | url: str, 43 | output_filepath: str, 44 | display_progressbar: bool = False 45 | ) -> None: 46 | with DownloadProgressBar(unit='B', unit_scale=True, 47 | miniters=1, desc=url.split('/')[-1], 48 | disable=not display_progressbar) as t: 49 | urllib.request.urlretrieve(url, 50 | filename=output_filepath, 51 | reporthook=t.update_to) 52 | 53 | 54 | def create_dir(path: str) -> None: 55 | if not os.path.isdir(path): 56 | os.makedirs(path, exist_ok=True) 57 | 58 | 59 | def get_files_by_extension( 60 | path: str, 61 | extension: str = '.png', 62 | flat_structure: bool = False, 63 | recursive: bool = False, 64 | follow_links: bool = True 65 | ) -> Union[List, Dict]: 66 | # check input args 67 | if not os.path.exists(path): 68 | raise IOError("No such file or directory: '{}'".format(path)) 69 | 70 | if flat_structure: 71 | filelist = [] 72 | else: 73 | filelist = {} 74 | 75 | # path is a file 76 | if os.path.isfile(path): 77 | basename = os.path.basename(path) 78 | if extension is None or basename.lower().endswith(extension): 79 | if flat_structure: 80 | filelist.append(path) 81 | else: 82 | filelist[os.path.dirname(path)] = [basename] 83 | return filelist 84 | 85 | # get filelist 86 | filter_func = lambda f: extension is None or f.lower().endswith(extension) 87 | for root, _, filenames in os.walk(path, topdown=True, 88 | followlinks=follow_links): 89 | filenames = list(filter(filter_func, filenames)) 90 | if filenames: 91 | if flat_structure: 92 | filelist.extend((os.path.join(root, f) for f in filenames)) 93 | else: 94 | filelist[root] = sorted(filenames) 95 | if not recursive: 96 | break 97 | 98 | # return 99 | if flat_structure: 100 | return sorted(filelist) 101 | else: 102 | return OrderedDict(sorted(filelist.items())) 103 | 104 | 105 | def create_or_update_creation_metafile( 106 | dataset_basepath: str, 107 | **additional_meta 108 | ) -> None: 109 | filepath = os.path.join(dataset_basepath, CREATION_META_FILENAME) 110 | 111 | # load existing file 112 | if os.path.exists(filepath): 113 | with open(filepath) as f: 114 | meta = json.load(f) 115 | else: 116 | meta = [] 117 | 118 | # update file 119 | ts = time() 120 | meta.append({ 121 | 'executable': sys.executable, 122 | 'command': ' '.join(sys.argv), 123 | 'timestamp': int(ts), 124 | 'local_time': datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S'), 125 | 'user': getpass.getuser(), 126 | 'version': '{}.{}.{}+{}'.format(*get_version(with_suffix=True)), 127 | 'additional_meta': additional_meta or None 128 | }) 129 | with open(filepath, 'w') as f: 130 | json.dump(meta, f, indent=4) 131 | 132 | 133 | def 
_normalize_version(version: str) -> str: 134 | # ensure PEP 440 compliant version 135 | # older versions might use: 136 | # (1) 1.2.3-a2c4e6-dirty -> 1.2.3+a2c4e6.dirty 137 | # (2) 1.2.3- -> 1.2.3 138 | version = version.strip('-') # (2) 139 | version = version.replace('-', '+', 1).replace('-', '.') # (1) 140 | return version 141 | 142 | 143 | def load_creation_metafile(dataset_basepath: str) -> Dict[str, Any]: 144 | filepath = os.path.join(dataset_basepath, CREATION_META_FILENAME) 145 | 146 | if not os.path.exists(filepath): 147 | # file does not exist, dataset might be created before metafile was 148 | # introduced, so do not raise an error here 149 | return 150 | 151 | with open(filepath) as f: 152 | metas = json.load(f) 153 | 154 | # ensure PEP 440 compliant version, we messed it up in the past 155 | for meta in metas: 156 | meta['version'] = _normalize_version(meta['version']) 157 | 158 | return metas 159 | 160 | 161 | def get_sha256_hash(filepath: str) -> str: 162 | sha256_hash = hashlib.sha256() 163 | with open(filepath, "rb") as f: 164 | # Read and update hash string value in blocks of 4K 165 | for byte_block in iter(lambda: f.read(4096), b""): 166 | sha256_hash.update(byte_block) 167 | # Get the hexadecimal digest of the hash 168 | return sha256_hash.hexdigest() 169 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/scannet/scannetv2_val.txt: -------------------------------------------------------------------------------- 1 | scene0568_00 2 | scene0568_01 3 | scene0568_02 4 | scene0304_00 5 | scene0488_00 6 | scene0488_01 7 | scene0412_00 8 | scene0412_01 9 | scene0217_00 10 | scene0019_00 11 | scene0019_01 12 | scene0414_00 13 | scene0575_00 14 | scene0575_01 15 | scene0575_02 16 | scene0426_00 17 | scene0426_01 18 | scene0426_02 19 | scene0426_03 20 | scene0549_00 21 | scene0549_01 22 | scene0578_00 23 | scene0578_01 24 | scene0578_02 25 | scene0665_00 26 | scene0665_01 27 | scene0050_00 28 | scene0050_01 29 | scene0050_02 30 | scene0257_00 31 | scene0025_00 32 | scene0025_01 33 | scene0025_02 34 | scene0583_00 35 | scene0583_01 36 | scene0583_02 37 | scene0701_00 38 | scene0701_01 39 | scene0701_02 40 | scene0580_00 41 | scene0580_01 42 | scene0565_00 43 | scene0169_00 44 | scene0169_01 45 | scene0655_00 46 | scene0655_01 47 | scene0655_02 48 | scene0063_00 49 | scene0221_00 50 | scene0221_01 51 | scene0591_00 52 | scene0591_01 53 | scene0591_02 54 | scene0678_00 55 | scene0678_01 56 | scene0678_02 57 | scene0462_00 58 | scene0427_00 59 | scene0595_00 60 | scene0193_00 61 | scene0193_01 62 | scene0164_00 63 | scene0164_01 64 | scene0164_02 65 | scene0164_03 66 | scene0598_00 67 | scene0598_01 68 | scene0598_02 69 | scene0599_00 70 | scene0599_01 71 | scene0599_02 72 | scene0328_00 73 | scene0300_00 74 | scene0300_01 75 | scene0354_00 76 | scene0458_00 77 | scene0458_01 78 | scene0423_00 79 | scene0423_01 80 | scene0423_02 81 | scene0307_00 82 | scene0307_01 83 | scene0307_02 84 | scene0606_00 85 | scene0606_01 86 | scene0606_02 87 | scene0432_00 88 | scene0432_01 89 | scene0608_00 90 | scene0608_01 91 | scene0608_02 92 | scene0651_00 93 | scene0651_01 94 | scene0651_02 95 | scene0430_00 96 | scene0430_01 97 | scene0689_00 98 | scene0357_00 99 | scene0357_01 100 | scene0574_00 101 | scene0574_01 102 | scene0574_02 103 | scene0329_00 104 | scene0329_01 105 | scene0329_02 106 | scene0153_00 107 | scene0153_01 108 | scene0616_00 109 | scene0616_01 110 | scene0671_00 111 | scene0671_01 112 | scene0618_00 113 
| scene0382_00 114 | scene0382_01 115 | scene0490_00 116 | scene0621_00 117 | scene0607_00 118 | scene0607_01 119 | scene0149_00 120 | scene0695_00 121 | scene0695_01 122 | scene0695_02 123 | scene0695_03 124 | scene0389_00 125 | scene0377_00 126 | scene0377_01 127 | scene0377_02 128 | scene0342_00 129 | scene0139_00 130 | scene0629_00 131 | scene0629_01 132 | scene0629_02 133 | scene0496_00 134 | scene0633_00 135 | scene0633_01 136 | scene0518_00 137 | scene0652_00 138 | scene0406_00 139 | scene0406_01 140 | scene0406_02 141 | scene0144_00 142 | scene0144_01 143 | scene0494_00 144 | scene0278_00 145 | scene0278_01 146 | scene0316_00 147 | scene0609_00 148 | scene0609_01 149 | scene0609_02 150 | scene0609_03 151 | scene0084_00 152 | scene0084_01 153 | scene0084_02 154 | scene0696_00 155 | scene0696_01 156 | scene0696_02 157 | scene0351_00 158 | scene0351_01 159 | scene0643_00 160 | scene0644_00 161 | scene0645_00 162 | scene0645_01 163 | scene0645_02 164 | scene0081_00 165 | scene0081_01 166 | scene0081_02 167 | scene0647_00 168 | scene0647_01 169 | scene0535_00 170 | scene0353_00 171 | scene0353_01 172 | scene0353_02 173 | scene0559_00 174 | scene0559_01 175 | scene0559_02 176 | scene0593_00 177 | scene0593_01 178 | scene0246_00 179 | scene0653_00 180 | scene0653_01 181 | scene0064_00 182 | scene0064_01 183 | scene0356_00 184 | scene0356_01 185 | scene0356_02 186 | scene0030_00 187 | scene0030_01 188 | scene0030_02 189 | scene0222_00 190 | scene0222_01 191 | scene0338_00 192 | scene0338_01 193 | scene0338_02 194 | scene0378_00 195 | scene0378_01 196 | scene0378_02 197 | scene0660_00 198 | scene0553_00 199 | scene0553_01 200 | scene0553_02 201 | scene0527_00 202 | scene0663_00 203 | scene0663_01 204 | scene0663_02 205 | scene0664_00 206 | scene0664_01 207 | scene0664_02 208 | scene0334_00 209 | scene0334_01 210 | scene0334_02 211 | scene0046_00 212 | scene0046_01 213 | scene0046_02 214 | scene0203_00 215 | scene0203_01 216 | scene0203_02 217 | scene0088_00 218 | scene0088_01 219 | scene0088_02 220 | scene0088_03 221 | scene0086_00 222 | scene0086_01 223 | scene0086_02 224 | scene0670_00 225 | scene0670_01 226 | scene0256_00 227 | scene0256_01 228 | scene0256_02 229 | scene0249_00 230 | scene0441_00 231 | scene0658_00 232 | scene0704_00 233 | scene0704_01 234 | scene0187_00 235 | scene0187_01 236 | scene0131_00 237 | scene0131_01 238 | scene0131_02 239 | scene0207_00 240 | scene0207_01 241 | scene0207_02 242 | scene0461_00 243 | scene0011_00 244 | scene0011_01 245 | scene0343_00 246 | scene0251_00 247 | scene0077_00 248 | scene0077_01 249 | scene0684_00 250 | scene0684_01 251 | scene0550_00 252 | scene0686_00 253 | scene0686_01 254 | scene0686_02 255 | scene0208_00 256 | scene0500_00 257 | scene0500_01 258 | scene0552_00 259 | scene0552_01 260 | scene0648_00 261 | scene0648_01 262 | scene0435_00 263 | scene0435_01 264 | scene0435_02 265 | scene0435_03 266 | scene0690_00 267 | scene0690_01 268 | scene0693_00 269 | scene0693_01 270 | scene0693_02 271 | scene0700_00 272 | scene0700_01 273 | scene0700_02 274 | scene0699_00 275 | scene0231_00 276 | scene0231_01 277 | scene0231_02 278 | scene0697_00 279 | scene0697_01 280 | scene0697_02 281 | scene0697_03 282 | scene0474_00 283 | scene0474_01 284 | scene0474_02 285 | scene0474_03 286 | scene0474_04 287 | scene0474_05 288 | scene0355_00 289 | scene0355_01 290 | scene0146_00 291 | scene0146_01 292 | scene0146_02 293 | scene0196_00 294 | scene0702_00 295 | scene0702_01 296 | scene0702_02 297 | scene0314_00 298 | scene0277_00 299 | scene0277_01 300 | 
scene0277_02 301 | scene0095_00 302 | scene0095_01 303 | scene0015_00 304 | scene0100_00 305 | scene0100_01 306 | scene0100_02 307 | scene0558_00 308 | scene0558_01 309 | scene0558_02 310 | scene0685_00 311 | scene0685_01 312 | scene0685_02 313 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/scenenetrgbd/dataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Daniel Seichter 4 | """ 5 | from typing import Any, Optional, Tuple, Union 6 | 7 | import os 8 | 9 | import cv2 10 | import numpy as np 11 | 12 | from ...dataset_base import build_dataset_config 13 | from ...dataset_base import DatasetConfig 14 | from ...dataset_base import RGBDDataset 15 | from ...dataset_base import SampleIdentifier 16 | from .scenenetrgbd import SceneNetRGBDMeta 17 | 18 | 19 | class SceneNetRGBD(SceneNetRGBDMeta, RGBDDataset): 20 | def __init__( 21 | self, 22 | *, 23 | dataset_path: Optional[str] = None, 24 | split: str = 'train', 25 | sample_keys: Tuple[str] = ('rgb', 'depth', 'semantic'), 26 | use_cache: bool = False, 27 | cameras: Optional[Tuple[str]] = None, 28 | depth_mode: str = 'refined', 29 | scene_use_indoor_domestic_labels: bool = False, 30 | **kwargs: Any 31 | ) -> None: 32 | super().__init__( 33 | dataset_path=dataset_path, 34 | depth_mode=depth_mode, 35 | sample_keys=sample_keys, 36 | use_cache=use_cache, 37 | **kwargs 38 | ) 39 | 40 | assert split in self.SPLITS 41 | assert depth_mode in self.DEPTH_MODES 42 | assert all(sk in self.get_available_sample_keys(split) for sk in sample_keys) 43 | self._semantic_n_classes = 13 44 | self._split = split 45 | self._depth_mode = depth_mode 46 | self._cameras = self.CAMERAS 47 | self._scene_use_indoor_domestic_labels = scene_use_indoor_domestic_labels 48 | 49 | # cameras 50 | if cameras is None: 51 | # use all available cameras (=default dummy camera) 52 | self._cameras = self.CAMERAS 53 | else: 54 | # use subset of cameras (does not really apply to this dataset) 55 | assert all(c in self.CAMERAS for c in cameras) 56 | self._cameras = cameras 57 | 58 | # load file list 59 | if dataset_path is not None: 60 | # load file list 61 | fp = os.path.join(self.dataset_path, 62 | self.SPLIT_FILELIST_FILENAMES[self._split]) 63 | with open(fp, 'r') as f: 64 | self._files = f.read().splitlines() 65 | else: 66 | self.debug_print("Loaded SceneNetRGBD dataset without files") 67 | 68 | if self._scene_use_indoor_domestic_labels: 69 | # use remapped scene labels 70 | scene_label_list = self.SCENE_LABEL_LIST_INDOOR_DOMESTIC 71 | else: 72 | # use original scene labels 73 | scene_label_list = self.SCENE_LABEL_LIST 74 | 75 | # build config object 76 | self._config = build_dataset_config( 77 | semantic_label_list=self.SEMANTIC_LABEL_LIST, 78 | scene_label_list=scene_label_list, 79 | depth_stats=self.TRAIN_SPLIT_DEPTH_STATS 80 | ) 81 | 82 | # register loader functions 83 | self.auto_register_sample_key_loaders() 84 | 85 | @property 86 | def cameras(self) -> Tuple[str]: 87 | return self._cameras 88 | 89 | @property 90 | def config(self) -> DatasetConfig: 91 | return self._config 92 | 93 | @property 94 | def split(self) -> str: 95 | return self._split 96 | 97 | def _get_filename(self, idx: int) -> str: 98 | return self._files[idx] 99 | 100 | def __len__(self) -> int: 101 | return len(self._files) 102 | 103 | @staticmethod 104 | def get_available_sample_keys(split: str) -> Tuple[str]: 105 | return 
SceneNetRGBDMeta.SPLIT_SAMPLE_KEYS[split] 106 | 107 | def _load( 108 | self, 109 | directory: str, 110 | idx: int, 111 | extension: str = '.png' 112 | ) -> Union[str, np.ndarray]: 113 | # determine filepath 114 | fp = os.path.join(self.dataset_path, 115 | self.split, 116 | directory, 117 | f'{self._files[idx]}{extension}') 118 | 119 | # load data 120 | if '.txt' == extension: 121 | with open(fp, 'r') as f: 122 | data = f.readline() 123 | else: 124 | # default load using OpenCV 125 | data = cv2.imread(fp, cv2.IMREAD_UNCHANGED) 126 | if data is None: 127 | raise IOError(f"Unable to load image: '{fp}'") 128 | if data.ndim == 3: 129 | data = cv2.cvtColor(data, cv2.COLOR_BGR2RGB) 130 | 131 | return data 132 | 133 | def _load_rgb(self, idx: int) -> np.ndarray: 134 | return self._load(self.RGB_DIR, idx, '.jpg') 135 | 136 | def _load_depth(self, idx: int) -> np.ndarray: 137 | return self._load(self.DEPTH_DIR, idx) 138 | 139 | def _load_identifier(self, idx: int) -> Tuple[str]: 140 | fn = self._files[idx] 141 | return SampleIdentifier(os.path.normpath(fn).split(os.sep)) 142 | 143 | def _load_semantic(self, idx: int) -> np.ndarray: 144 | return self._load(self.SEMANTIC_13_DIR, idx).astype('uint8') 145 | 146 | def _load_instance(self, idx: int) -> np.ndarray: 147 | return self._load(self.INSTANCES_DIR, idx).astype('uint16') 148 | 149 | def _load_scene(self, idx: int) -> int: 150 | class_str = self._load(self.SCENE_CLASS_DIR, idx, '.txt') 151 | 152 | class_idx = self.SCENE_LABEL_LIST.index(class_str) 153 | 154 | if self._scene_use_indoor_domestic_labels: 155 | # map class to indoor domestic environment labels 156 | mapping = self.SCENE_LABEL_IDX_TO_SCENE_LABEL_INDOOR_DOMESTIC_IDX 157 | class_idx = mapping[class_idx] 158 | 159 | return class_idx 160 | -------------------------------------------------------------------------------- /tests/test_nyuv2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Simple (interface) tests for NYUv2 dataset 4 | 5 | .. codeauthor:: Daniel Seichter 6 | .. 
codeauthor:: Soehnke Fischedick 7 | """ 8 | import numpy as np 9 | from numpy.testing import assert_almost_equal 10 | import pytest 11 | 12 | from nicr_scene_analysis_datasets import NYUv2 13 | from nicr_scene_analysis_datasets.dataset_base import OrientationDict 14 | from nicr_scene_analysis_datasets.dataset_base import SampleIdentifier 15 | from nicr_scene_analysis_datasets.utils.testing import DATASET_PATH_DICT 16 | 17 | 18 | N_SAMPLES = {'train': 795, 'test': 654} 19 | N_SCENE_CLASSES = 27 20 | 21 | 22 | @pytest.mark.parametrize('split', ('train', 'test')) 23 | @pytest.mark.parametrize('semantic_n_classes', (894, 40, 13)) 24 | @pytest.mark.parametrize('depth_mode', ('refined', 'raw')) 25 | def test_dataset(split, semantic_n_classes, depth_mode): 26 | dataset = NYUv2( 27 | dataset_path=DATASET_PATH_DICT['nyuv2'], 28 | split=split, 29 | depth_mode=depth_mode, 30 | sample_keys=NYUv2.get_available_sample_keys(split), 31 | semantic_n_classes=semantic_n_classes 32 | ) 33 | 34 | assert dataset.depth_mode == depth_mode 35 | assert dataset.split == split 36 | 37 | assert len(dataset) == N_SAMPLES[split] 38 | 39 | assert dataset.semantic_n_classes == semantic_n_classes + 1 40 | assert dataset.semantic_n_classes_without_void == semantic_n_classes 41 | assert len(dataset.semantic_class_names) == dataset.semantic_n_classes 42 | assert len(dataset.semantic_class_names_without_void) == dataset.semantic_n_classes_without_void 43 | assert len(dataset.semantic_class_colors) == dataset.semantic_n_classes 44 | assert len(dataset.semantic_class_colors_without_void) == dataset.semantic_n_classes_without_void 45 | 46 | assert len(dataset.scene_class_names) == N_SCENE_CLASSES 47 | assert len(dataset.cameras) == 1 48 | 49 | assert isinstance(dataset.depth_min, float) 50 | assert isinstance(dataset.depth_max, float) 51 | assert isinstance(dataset.depth_mean, float) 52 | assert isinstance(dataset.depth_std, float) 53 | assert isinstance(dataset.depth_stats, dict) 54 | 55 | # test first 10 samples 56 | for i, sample in enumerate(dataset): 57 | assert isinstance(sample, dict) 58 | assert isinstance(sample['identifier'], SampleIdentifier) 59 | # inputs: rgb and depth 60 | assert sample['rgb'].ndim == 3 61 | assert sample['depth'].ndim == 2 62 | # semantic 63 | assert sample['semantic'].ndim == 2 64 | # instance 65 | assert sample['instance'].ndim == 2 66 | # normal 67 | normal = sample['normal'] 68 | assert normal.ndim == 3 69 | assert normal.dtype == 'float32' 70 | norms = np.linalg.norm(normal, ord=2, axis=-1) 71 | mask = norms > 1e-7 # filter invalid pixels 72 | assert_almost_equal(norms[mask], 1, decimal=4) 73 | # scene 74 | assert isinstance(sample['scene'], int) 75 | # orientation 76 | assert isinstance(sample['orientations'], OrientationDict) 77 | for key, value in sample['orientations'].items(): 78 | # check if orientation with key exists in instance 79 | assert (sample['instance'] == key).sum() > 0 80 | 81 | assert isinstance(key, int) 82 | assert isinstance(value, float) 83 | # assert that the encoding is in radians 84 | assert 0 <= value <= 2*np.pi 85 | 86 | if i >= 9: 87 | break 88 | 89 | 90 | def test_dataset_computing(): 91 | # as NYUv2 is quite small, we additionally test some functions 92 | dataset = NYUv2( 93 | dataset_path=DATASET_PATH_DICT['nyuv2'], 94 | sample_keys=('rgb', 'depth', 'semantic') 95 | ) 96 | weights_1 = dataset.semantic_compute_class_weights( 97 | 'median-frequency', n_threads=1 98 | ) 99 | weights_10 = dataset.semantic_compute_class_weights( 100 | 'median-frequency', 101 | 
n_threads=10 102 | ) 103 | assert np.array_equal(weights_1, weights_10) 104 | assert np.array_equal(dataset.semantic_compute_class_weights(debug=True), 105 | np.ones(dataset.semantic_n_classes_without_void)) 106 | 107 | assert dataset.depth_compute_stats(n_threads=10) 108 | 109 | 110 | @pytest.mark.parametrize('split', ('train', 'test')) 111 | def test_scene_class_mapping(split): 112 | sample_keys = ('scene',) 113 | 114 | # create datasets 115 | dataset_original = NYUv2( 116 | dataset_path=DATASET_PATH_DICT['nyuv2'], 117 | split=split, 118 | sample_keys=sample_keys, 119 | scene_use_indoor_domestic_labels=False 120 | ) 121 | 122 | dataset_remapped = NYUv2( 123 | dataset_path=DATASET_PATH_DICT['nyuv2'], 124 | split=split, 125 | sample_keys=sample_keys, 126 | scene_use_indoor_domestic_labels=True 127 | ) 128 | 129 | # count samples 130 | def count(dataset): 131 | class_names = dataset.config.scene_label_list.class_names 132 | counts = {n: 0 for n in class_names} 133 | for sample in dataset: 134 | counts[class_names[sample['scene']]] += 1 135 | 136 | return counts 137 | 138 | counts_original = count(dataset_original) 139 | counts_remapped = count(dataset_remapped) 140 | 141 | # perform simple some checks 142 | assert sum(counts_remapped.values()) == N_SAMPLES[split] 143 | assert sum(counts_remapped.values()) == sum(counts_original.values()) 144 | assert len(counts_remapped) == dataset_remapped.scene_n_classes 145 | 146 | assert dataset_original.scene_n_classes == dataset_original.scene_n_classes_without_void 147 | assert dataset_remapped.scene_n_classes == dataset_remapped.scene_n_classes_without_void + 1 148 | -------------------------------------------------------------------------------- /tests/test_concat.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Some common dataset tests 4 | 5 | .. 
codeauthor:: Daniel Seichter 6 | """ 7 | from copy import deepcopy 8 | 9 | import cv2 10 | 11 | import pytest 12 | 13 | from nicr_scene_analysis_datasets import ConcatDataset 14 | from nicr_scene_analysis_datasets import get_dataset_class 15 | from nicr_scene_analysis_datasets.pytorch import ConcatDataset as ConcatDatasetPyTorch 16 | from nicr_scene_analysis_datasets.pytorch import get_dataset_class as get_dataset_class_pytorch 17 | from nicr_scene_analysis_datasets.utils.testing import DATASET_PATH_DICT 18 | 19 | 20 | class SimpleDepthPreprocessor: 21 | def __call__(self, sample): 22 | sample['depth'] = cv2.resize(sample['depth'], (10, 10)) 23 | return sample 24 | 25 | 26 | @pytest.mark.parametrize('dataset_factory_and_class', 27 | ((get_dataset_class, ConcatDataset), 28 | (get_dataset_class_pytorch, ConcatDatasetPyTorch))) 29 | def test_concatenated_dataset(dataset_factory_and_class): 30 | """Dataset concatenation""" 31 | dataset_factory, ConcatDatasetClass = dataset_factory_and_class 32 | 33 | sample_keys = ('identifier', 'depth') 34 | 35 | main_dataset = dataset_factory('sunrgbd')( 36 | dataset_path=DATASET_PATH_DICT['sunrgbd'], 37 | sample_keys=sample_keys, 38 | depth_force_mm=True, 39 | split='train', 40 | cameras=('kv1', 'kv2', 'xtion') # remove realsense samples 41 | ) 42 | 43 | dataset2 = dataset_factory('scannet')( 44 | dataset_path=DATASET_PATH_DICT['scannet'], 45 | sample_keys=sample_keys, 46 | split='train' 47 | ) 48 | 49 | dataset3 = dataset_factory('nyuv2')( 50 | dataset_path=DATASET_PATH_DICT['nyuv2'], 51 | sample_keys=sample_keys, 52 | split='train' 53 | ) 54 | 55 | dataset = ConcatDatasetClass(main_dataset, dataset2, dataset3) 56 | 57 | if issubclass(ConcatDatasetClass, ConcatDatasetPyTorch): 58 | # it is a pytorch dataset class, set a simple preprocessor 59 | preprocessor = SimpleDepthPreprocessor() 60 | dataset.preprocessor = preprocessor 61 | 62 | assert dataset.preprocessor == preprocessor 63 | assert main_dataset.preprocessor == preprocessor 64 | assert dataset2.preprocessor == preprocessor 65 | assert dataset3.preprocessor == preprocessor 66 | 67 | # simple tests 68 | n_samples_total = len(main_dataset) + len(dataset2) + len(dataset3) 69 | assert n_samples_total == len(main_dataset) + len(dataset2) + len(dataset3) 70 | 71 | # check that main_dataset is present 72 | offset = 0 73 | assert dataset[offset]['identifier'] == main_dataset[0]['identifier'] 74 | assert (dataset[offset]['depth'] == main_dataset[0]['depth']).all() 75 | if issubclass(ConcatDatasetClass, ConcatDatasetPyTorch): 76 | # it is a pytorch dataset class, check that preprocessor was applied 77 | assert dataset[offset]['depth'].shape == (10, 10) 78 | 79 | # check that dataset2 is present 80 | offset += len(main_dataset) 81 | assert dataset[offset]['identifier'] == dataset2[0]['identifier'] 82 | assert (dataset[offset]['depth'] == dataset2[0]['depth']).all() 83 | if issubclass(ConcatDatasetClass, ConcatDatasetPyTorch): 84 | # it is a pytorch dataset class, check that preprocessor was applied 85 | assert dataset[offset]['depth'].shape == (10, 10) 86 | 87 | # check that dataset3 is present 88 | offset += len(dataset2) 89 | assert dataset[offset]['identifier'] == dataset3[0]['identifier'] 90 | assert (dataset[offset]['depth'] == dataset3[0]['depth']).all() 91 | if issubclass(ConcatDatasetClass, ConcatDatasetPyTorch): 92 | # it is a pytorch dataset class, check that preprocessor was applied 93 | assert dataset[offset]['depth'].shape == (10, 10) 94 | 95 | # check that negative indices work 96 | assert 
dataset[-n_samples_total]['identifier'] == dataset[0]['identifier'] 97 | assert dataset[-1]['identifier'] == dataset3[-1]['identifier'] 98 | 99 | # test with camera filter from outside 100 | n_samples_sunrgbd = len(main_dataset) 101 | with main_dataset.filter_camera('kv1') as ds: 102 | n_samples_sunrgbd_kv1 = len(ds) 103 | # concatenated dataset should change as well 104 | assert len(dataset) == n_samples_total - n_samples_sunrgbd + n_samples_sunrgbd_kv1 105 | # everything should be back to normal 106 | assert len(dataset) == n_samples_total 107 | assert len(main_dataset) == n_samples_sunrgbd 108 | 109 | # test with camera filter 110 | with dataset.filter_camera('kv1') as ds: 111 | assert len(ds) == n_samples_sunrgbd_kv1 + len(dataset3) 112 | assert ds.camera == 'kv1' 113 | # everything should be back to normal 114 | assert len(dataset) == n_samples_total 115 | assert dataset.camera is None 116 | 117 | # test with camera filter 118 | with dataset.filter_camera('structureio_480x640') as ds: 119 | assert len(ds) == len(dataset2) 120 | assert ds.camera == 'structureio_480x640' 121 | # everything should be back to normal 122 | assert len(dataset) == n_samples_total 123 | assert dataset.camera is None 124 | 125 | # test with camera filter without context 126 | dataset.filter_camera('kv1') 127 | assert len(dataset) == n_samples_sunrgbd_kv1 + len(dataset3) 128 | dataset.filter_camera(None) 129 | # everything should be back to normal 130 | assert len(dataset) == n_samples_total 131 | 132 | # test copying 133 | dataset_copy = deepcopy(dataset) 134 | assert id(dataset_copy._main_dataset) != id(dataset._main_dataset) 135 | assert id(dataset_copy._additional_datasets[0]) != id(dataset._additional_datasets[0]) 136 | dataset.filter_camera('kv1') 137 | assert len(dataset_copy) == n_samples_total 138 | assert dataset_copy.camera is None 139 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/coco/dataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Soehnke Fischedick 4 | ..
codeauthor:: Daniel Seichter 5 | """ 6 | from typing import Any, Optional, Tuple 7 | 8 | import os 9 | 10 | import cv2 11 | import numpy as np 12 | 13 | from ...dataset_base import build_dataset_config 14 | from ...dataset_base import DatasetConfig 15 | from ...dataset_base import RGBDataset 16 | from ...dataset_base import SampleIdentifier 17 | from .coco import COCOMeta 18 | 19 | 20 | class COCO(COCOMeta, RGBDataset): 21 | def __init__( 22 | self, 23 | *, 24 | dataset_path: Optional[str] = None, 25 | split: str = 'train', 26 | sample_keys: Tuple[str] = ('rgb', 'semantic'), 27 | use_cache: bool = False, 28 | cameras: Optional[Tuple[str]] = None, 29 | **kwargs: Any 30 | ) -> None: 31 | super().__init__( 32 | dataset_path=dataset_path, 33 | sample_keys=sample_keys, 34 | use_cache=use_cache, 35 | **kwargs 36 | ) 37 | 38 | assert split in self.SPLITS 39 | assert all(sk in self.get_available_sample_keys(split) for sk in sample_keys) 40 | self._split = split 41 | 42 | if dataset_path is not None: 43 | # load filenames 44 | fp = os.path.join(self.dataset_path, 45 | self.SPLIT_FILELIST_FILENAMES[self._split]) 46 | self._filenames = list(np.loadtxt(fp, dtype=str)) 47 | 48 | # COCO is comprised of images of various cameras and spatial 49 | # dimensions, so we do not know the actual cameras, however, in the 50 | # dataset class, we use the camera property to split the dataset 51 | # in virtual cameras with images of same spatial dimensions 52 | 53 | # get filelist for each camera 54 | self._filenames_per_camera = {} 55 | for fn in self._filenames: 56 | camera = os.path.dirname(fn) 57 | if camera not in self._filenames_per_camera: 58 | self._filenames_per_camera[camera] = [] 59 | self._filenames_per_camera[camera].append(fn) 60 | 61 | available_cameras = tuple(self._filenames_per_camera.keys()) 62 | 63 | if cameras is None: 64 | # use all available cameras 65 | self._cameras = available_cameras 66 | else: 67 | # use subset of cameras 68 | assert all(c in available_cameras for c in cameras) 69 | self._cameras = cameras 70 | 71 | # filter dict 72 | for camera in list(self._filenames_per_camera.keys()): 73 | if camera not in self._cameras: 74 | # remove from dict 75 | del self._filenames_per_camera[camera] 76 | # recreate filelist 77 | self._filenames = [] 78 | for camera, filenames in self._filenames_per_camera.items(): 79 | self._filenames.extend( 80 | os.path.join(camera, fn) for fn in filenames 81 | ) 82 | else: 83 | self.debug_print("Loaded COCO dataset without files") 84 | self._cameras = self.CAMERAS # single dummy camera 85 | 86 | # build config object 87 | self._config = build_dataset_config( 88 | semantic_label_list=self.SEMANTIC_LABEL_LIST, 89 | ) 90 | 91 | # register loader functions 92 | self.auto_register_sample_key_loaders() 93 | 94 | @property 95 | def cameras(self) -> Tuple[str]: 96 | return self._cameras 97 | 98 | @property 99 | def config(self) -> DatasetConfig: 100 | return self._config 101 | 102 | @property 103 | def split(self) -> str: 104 | return self._split 105 | 106 | def __len__(self) -> int: 107 | if self.camera is None or self.CAMERAS[0] == self.camera: 108 | return len(self._filenames) 109 | return len(self._filenames_per_camera[self.camera]) 110 | 111 | @staticmethod 112 | def get_available_sample_keys(split: str) -> Tuple[str]: 113 | return COCOMeta.SPLIT_SAMPLE_KEYS[split] 114 | 115 | def _get_filename(self, idx: int) -> str: 116 | if self.camera is None or self.CAMERAS[0] == self.camera: 117 | return self._filenames[idx] 118 | else: 119 | return 
self._filenames_per_camera[self.camera][idx] 120 | 121 | def _load( 122 | self, 123 | directory: str, 124 | idx: int, 125 | ext: str = '.png' 126 | ) -> np.ndarray: 127 | # get filename depending on current camera 128 | filename = self._get_filename(idx) 129 | fp = os.path.join(self.dataset_path, 130 | self.split, 131 | directory, 132 | f'{filename}{ext}') 133 | img = cv2.imread(fp, cv2.IMREAD_UNCHANGED) 134 | if img is None: 135 | raise IOError(f"Unable to load image: '{fp}'") 136 | if 3 == img.ndim: 137 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 138 | 139 | return img 140 | 141 | def _load_rgb(self, idx) -> np.ndarray: 142 | img = self._load(self.IMAGE_DIR, idx, '.jpg') 143 | 144 | # force RGB if the image is grayscale 145 | if 2 == img.ndim: 146 | img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB) 147 | 148 | return img 149 | 150 | def _load_identifier(self, idx: int) -> Tuple[str]: 151 | # get filename depending on current camera 152 | filename = self._get_filename(idx) 153 | return SampleIdentifier(os.path.normpath(filename).split(os.sep)) 154 | 155 | def _load_semantic(self, idx: int) -> np.ndarray: 156 | return self._load(self.SEMANTIC_DIR, idx).astype('uint8') 157 | 158 | def _load_instance(self, idx: int) -> np.ndarray: 159 | instance = self._load(self.INSTANCES_DIR, idx) 160 | return instance.astype('uint16') 161 | -------------------------------------------------------------------------------- /tests/test_sunrgbd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Simple (interface) tests for SUNRGBD dataset 4 | 5 | .. codeauthor:: Daniel Seichter 6 | .. codeauthor:: Soehnke Fischedick 7 | """ 8 | import numpy as np 9 | import pytest 10 | 11 | from nicr_scene_analysis_datasets import SUNRGBD 12 | from nicr_scene_analysis_datasets.dataset_base import ExtrinsicCameraParametersNormalized 13 | from nicr_scene_analysis_datasets.dataset_base import IntrinsicCameraParametersNormalized 14 | from nicr_scene_analysis_datasets.dataset_base import OrientationDict 15 | from nicr_scene_analysis_datasets.dataset_base import SampleIdentifier 16 | from nicr_scene_analysis_datasets.utils.testing import DATASET_PATH_DICT 17 | 18 | 19 | N_CLASSES_WITH_VOID = 37 + 1 20 | N_SCENE_CLASSES = 45 21 | N_SAMPLES = {'train': 5285, 'test': 5050} 22 | 23 | 24 | @pytest.mark.parametrize('split', ('train', 'test')) 25 | @pytest.mark.parametrize('depth_mode', ('refined', 'raw')) 26 | def test_dataset(split, depth_mode): 27 | dataset = SUNRGBD( 28 | dataset_path=DATASET_PATH_DICT['sunrgbd'], 29 | split=split, 30 | depth_mode=depth_mode, 31 | sample_keys=SUNRGBD.get_available_sample_keys(split) 32 | ) 33 | 34 | assert dataset.depth_mode == depth_mode 35 | assert dataset.split == split 36 | 37 | assert len(dataset) == N_SAMPLES[split] 38 | 39 | assert dataset.semantic_n_classes == N_CLASSES_WITH_VOID 40 | assert dataset.semantic_n_classes_without_void == N_CLASSES_WITH_VOID - 1 41 | assert len(dataset.semantic_class_names) == dataset.semantic_n_classes 42 | assert len(dataset.semantic_class_names_without_void) == dataset.semantic_n_classes_without_void 43 | 44 | assert len(dataset.scene_class_names) == N_SCENE_CLASSES 45 | 46 | assert len(dataset.semantic_class_colors) == dataset.semantic_n_classes 47 | assert len(dataset.semantic_class_colors_without_void) == dataset.semantic_n_classes_without_void 48 | 49 | assert len(dataset.cameras) == 4 50 | 51 | assert isinstance(dataset.depth_min, float) 52 | assert isinstance(dataset.depth_max,
float) 53 | assert isinstance(dataset.depth_mean, float) 54 | assert isinstance(dataset.depth_std, float) 55 | assert isinstance(dataset.depth_stats, dict) 56 | 57 | # test first 10 samples 58 | for i, sample in enumerate(dataset): 59 | assert isinstance(sample, dict) 60 | assert isinstance(sample['identifier'], SampleIdentifier) 61 | assert isinstance(sample['extrinsics'], 62 | ExtrinsicCameraParametersNormalized) 63 | assert (3+4) == len(sample['extrinsics']) 64 | # inputs: rgb and depth 65 | assert sample['rgb'].ndim == 3 66 | assert isinstance(sample['rgb_intrinsics'], 67 | IntrinsicCameraParametersNormalized) 68 | assert (2+2+6+2) == len(sample['rgb_intrinsics']) 69 | assert sample['depth'].ndim == 2 70 | assert isinstance(sample['depth_intrinsics'], 71 | IntrinsicCameraParametersNormalized) 72 | assert (2+2+6+2+2) == len(sample['depth_intrinsics']) 73 | # semantic 74 | assert sample['semantic'].ndim == 2 75 | # instance 76 | assert sample['instance'].ndim == 2 77 | # scene 78 | assert isinstance(sample['scene'], int) 79 | # orientation 80 | assert isinstance(sample['orientations'], OrientationDict) 81 | for key, value in sample['orientations'].items(): 82 | 83 | # check if orientation with key exists in instance 84 | assert (sample['instance'] == key).sum() > 0 85 | 86 | assert isinstance(key, int) 87 | assert isinstance(value, float) 88 | # assert that the encoding is in radians 89 | assert 0 <= value <= 2*np.pi 90 | # 3d boxes 91 | assert isinstance(sample['3d_boxes'], list) 92 | 93 | if i >= 9: 94 | break 95 | 96 | 97 | @pytest.mark.parametrize('split', ('train', 'test')) 98 | def test_scene_class_mapping(split): 99 | sample_keys = ('scene',) 100 | 101 | # create datasets 102 | dataset_original = SUNRGBD( 103 | dataset_path=DATASET_PATH_DICT['sunrgbd'], 104 | split=split, 105 | sample_keys=sample_keys, 106 | scene_use_indoor_domestic_labels=False 107 | ) 108 | 109 | dataset_remapped = SUNRGBD( 110 | dataset_path=DATASET_PATH_DICT['sunrgbd'], 111 | split=split, 112 | sample_keys=sample_keys, 113 | scene_use_indoor_domestic_labels=True 114 | ) 115 | 116 | # count samples 117 | def count(dataset): 118 | class_names = dataset.config.scene_label_list.class_names 119 | counts = {n: 0 for n in class_names} 120 | for sample in dataset: 121 | counts[class_names[sample['scene']]] += 1 122 | 123 | return counts 124 | 125 | counts_original = count(dataset_original) 126 | counts_remapped = count(dataset_remapped) 127 | 128 | # perform some simple checks 129 | assert sum(counts_remapped.values()) == N_SAMPLES[split] 130 | assert sum(counts_remapped.values()) == sum(counts_original.values()) 131 | assert len(counts_remapped) == dataset_remapped.scene_n_classes 132 | 133 | assert dataset_original.scene_n_classes == dataset_original.scene_n_classes_without_void 134 | assert dataset_remapped.scene_n_classes == dataset_remapped.scene_n_classes_without_void + 1 135 | 136 | 137 | @pytest.mark.parametrize('split', ('train', 'test')) 138 | def test_filter_camera(split): 139 | # just some random cameras and counts that we know 140 | sample_cameras = { 141 | 'train': {'xtion': 1701, 'realsense': 587}, 142 | 'test': {'kv2': 1860, 'kv1': 930} 143 | } 144 | 145 | cameras = tuple(sample_cameras[split].keys()) 146 | n_samples = tuple(sample_cameras[split].values()) 147 | 148 | # create dataset with specified cameras 149 | dataset = SUNRGBD( 150 | dataset_path=DATASET_PATH_DICT['sunrgbd'], 151 | split=split, 152 | sample_keys=SUNRGBD.get_available_sample_keys(split), 153 | cameras=cameras 154 | ) 155 |
156 | assert dataset.cameras == cameras 157 | assert len(dataset) == sum(n_samples) 158 | 159 | # test filtering 160 | dataset.filter_camera(cameras[0]) 161 | assert dataset.camera == cameras[0] 162 | assert len(dataset) == n_samples[0] 163 | 164 | # reset filtering 165 | dataset.filter_camera(None) 166 | assert dataset.camera is None 167 | assert len(dataset) == sum(n_samples) 168 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/hypersim/README.md: -------------------------------------------------------------------------------- 1 | # Hypersim dataset 2 | 3 | For many fundamental scene understanding tasks, it is difficult or impossible 4 | to obtain per-pixel ground truth labels from real images. We address this 5 | challenge with Hypersim, a photorealistic synthetic dataset for holistic indoor 6 | scene understanding. To create our dataset, we leverage a large repository of 7 | synthetic scenes created by professional artists, and we generate 77,400 images 8 | of 461 indoor scenes with detailed per-pixel labels and corresponding ground 9 | truth geometry. Our dataset: (1) relies exclusively on publicly available 3D 10 | assets; (2) includes complete scene geometry, material information, and 11 | lighting information for every scene; (3) includes dense per-pixel semantic 12 | instance segmentations for every image; and (4) factors every image into 13 | diffuse reflectance, diffuse illumination, and a non-diffuse residual term 14 | that captures view-dependent lighting effects. Together, these features make 15 | our dataset well-suited for geometric learning problems that require direct 3D 16 | supervision, multi-task learning problems that require reasoning jointly over 17 | multiple input and output modalities, and inverse rendering problems. 18 | 19 | For more details, see: [Hypersim](https://machinelearning.apple.com/research/hypersim) 20 | 21 | ## Notes 22 | 23 | > Hypersim uses non-standard perspective projection matrices (with 24 | tilt-shift photography parameters) in most scenes. As common frameworks, such as 25 | MIRA or ROS, do not support this projection, we convert the camera parameters if 26 | possible or project the data/annotations back to a standard camera ignoring the 27 | tilt-shift parameters. Note that this is not a perfect conversion and introduces 28 | some artifacts, i.e., void pixels, as we only back-project points without 29 | contradictions. Void is assigned to ~5% of the pixels. 30 | However, rendering full images with a standard perspective projection 31 | requires buying the dataset meshes. 32 | For more details, see [this issue](https://github.com/apple/ml-hypersim/issues/24). 33 | To disable this conversion and to stick to original images, pass the 34 | `--no-tilt-shift-conversion` parameter to the prepare script. 35 | 36 | > We observed that merging semantic and instance labels in order to derive 37 | panoptic labels might slightly change the semantic labels in a few images. This is 38 | because there are some pixels that belong to a thing class but are not assigned 39 | to any instance (instance=0), e.g., in scene ai_052_001, a lamp is labeled as 40 | lamp but is not annotated as an instance. Panoptic merging assigns void for those 41 | pixels. There is no workaround for this issue. Affected scenes: 42 | valid: ai_023_003, ai_041_003, ai_052_001, ai_052_003 -> 1576566 pixels (0.03%); 43 | test: ai_005_001, ai_008_005, ai_008_005, ai_022_001 -> 801359 pixels (0.01%).
44 | Computing mIoU in [0, 1] with semantic / panoptic_semantic as ground truth 45 | changes the result by only ~0.0001-0.0002, so the issue is negligible. 46 | 47 | > We further observed that some images are not correctly annotated. There are 48 | instances that are assigned to multiple semantic classes. While most overlaps 49 | are with void (unlabeled textures -> void label), we observed other issues for: 50 | ai_017_004: semantic classes 35 + 40: lamp + otherprop -> some small stuff in 51 | the background; ai_021_008: semantic classes 12 + 35 -> kitchen counter + lamp 52 | belong to the same instance -> might be an annotation error; ai_022_009: semantic 53 | classes 1 + 8 -> door frame labeled as wall, but door instance contains both 54 | the door frame and the door. 55 | 56 | ## Prepare dataset 57 | 58 | 1. Download and unzip dataset files: 59 | 60 | ```bash 61 | wget https://raw.githubusercontent.com/apple/ml-hypersim/6cbaa80207f44a312654e288cf445016c84658a1/code/python/tools/dataset_download_images.py 62 | 63 | # general usage 64 | python dataset_download_images.py \ 65 | --downloads_dir /path/to/download \ 66 | --decompress_dir /path/to/uncompressed/hypersim 67 | ``` 68 | 69 | 2. Convert dataset: 70 | 71 | ```bash 72 | # general usage 73 | nicr_sa_prepare_dataset hypersim \ 74 | /path/where/to/store/hypersim \ 75 | /path/to/uncompressed/hypersim \ 76 | [--additional-subsamples N1 N2] \ 77 | [--n-processes N] 78 | ``` 79 | With arguments: 80 | - `--additional-subsamples`: 81 | For additional subsampled versions of the dataset. 82 | - `--n-processes`: 83 | Number of worker processes to spawn. 84 | - `--no-tilt-shift-conversion`: 85 | Disable projecting the data/annotations back to a standard camera ignoring the 86 | tilt-shift parameters (use this to create a dataset compatible with < v050). 87 | 88 | 3. (Optional) Generate auxiliary data: 89 | > **Note**: To use auxiliary data generation, the package must be installed with the `withauxiliarydata` option: 90 | > ```bash 91 | > pip install -e .[withauxiliarydata] 92 | > ``` 93 | 94 | ```bash 95 | # for auxiliary data such as synthetic depth and rgb/panoptic embeddings 96 | nicr_sa_generate_auxiliary_data \ 97 | --dataset hypersim \ 98 | --dataset-path /path/to/already/prepared/hypersim/dataset \ 99 | --auxiliary-data depth image-embedding panoptic-embedding \ 100 | --embedding-estimator-device cuda \ 101 | --embedding-estimators alpha_clip__l14-336-grit-20m \ 102 | --depth-estimator-device cuda \ 103 | --depth-estimators depthanything_v2__indoor_large \ 104 | --cache-models 105 | ``` 106 | 107 | With arguments: 108 | - `--dataset-path`: 109 | Path to the prepared Hypersim dataset. 110 | - `--auxiliary-data`: 111 | Types of auxiliary data to generate: 112 | - `depth`: Generates synthetic depth images from RGB. 113 | - `image-embedding`: Uses Alpha-CLIP to generate an embedding for the entire image. 114 | - `panoptic-embedding`: Uses Alpha-CLIP to generate an embedding for each panoptic mask. 115 | - `--depth-estimator-device`: 116 | Device to use for depth estimation (`cpu` or `cuda`). 117 | - `--depth-estimators`: 118 | Depth estimator(s) to use. Use `depthanything_v2__indoor_large` to match DVEFormer. 119 | - `--embedding-estimator-device`: 120 | Device to use for embedding estimation (`cpu` or `cuda`). 121 | - `--embedding-estimators`: 122 | Embedding estimator(s) to use. Use `alpha_clip__l14-336-grit-20m` to match DVEFormer. 123 | - `--cache-models`: 124 | Cache models locally to avoid reloading them in future runs.
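4. (Optional) Verify the prepared dataset from Python. A minimal sketch, assuming the dataset was prepared to the output directory used above (the path below is a placeholder) and using only constructor arguments and sample keys that also appear in this package's tests:

```python
from nicr_scene_analysis_datasets import Hypersim

# placeholder path: the output directory passed to nicr_sa_prepare_dataset
dataset = Hypersim(
    dataset_path='/path/where/to/store/hypersim',
    split='valid',
    sample_keys=('rgb', 'depth', 'semantic'),
    depth_mode='raw',
)

print(len(dataset))   # number of samples in the chosen split
sample = dataset[0]   # dict with one entry per requested sample key
print(sample['rgb'].shape, sample['depth'].shape, sample['semantic'].shape)
```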
125 | 126 | 127 | -------------------------------------------------------------------------------- /tests/test_hypersim.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Simple (interface) tests for Hypersim dataset 4 | 5 | .. codeauthor:: Daniel Seichter 6 | .. codeauthor:: Soehnke Fischedick 7 | """ 8 | import numpy as np 9 | from numpy.testing import assert_almost_equal 10 | import pytest 11 | 12 | from nicr_scene_analysis_datasets import Hypersim 13 | from nicr_scene_analysis_datasets.dataset_base import ExtrinsicCameraParametersNormalized 14 | from nicr_scene_analysis_datasets.dataset_base import IntrinsicCameraParametersNormalized 15 | from nicr_scene_analysis_datasets.dataset_base import OrientationDict 16 | from nicr_scene_analysis_datasets.dataset_base import SampleIdentifier 17 | from nicr_scene_analysis_datasets.utils.testing import DATASET_PATH_DICT 18 | 19 | 20 | N_CLASSES_WITH_VOID = 40 + 1 21 | N_SAMPLES = { 22 | None: {'train': 57443, 'valid': 7286, 'test': 7690}, 23 | 1: {'train': 57443, 'valid': 7286, 'test': 7690}, 24 | 2: {'train': 28722, 'valid': 3643, 'test': 3845}, 25 | 5: {'train': 11489, 'valid': 1458, 'test': 1538}, 26 | 10: {'train': 5745, 'valid': 729, 'test': 769}, 27 | 20: {'train': 2873, 'valid': 365, 'test': 385} 28 | } 29 | N_SCENE_CLASSES = 22 30 | 31 | 32 | @pytest.mark.parametrize('split', ('train', 'valid', 'test')) 33 | @pytest.mark.parametrize('depth_mode', ('raw', )) 34 | @pytest.mark.parametrize('subsample', (None, 1, 2, 5, 10, 20)) 35 | @pytest.mark.parametrize('orientations_use', (True, False)) 36 | def test_dataset(split, depth_mode, subsample, orientations_use): 37 | dataset = Hypersim( 38 | dataset_path=DATASET_PATH_DICT['hypersim'], 39 | split=split, 40 | subsample=subsample, 41 | sample_keys=Hypersim.get_available_sample_keys(split), 42 | depth_mode=depth_mode, 43 | orientations_use=orientations_use, 44 | ) 45 | 46 | assert dataset.depth_mode == depth_mode 47 | assert dataset.split == split 48 | 49 | assert len(dataset) == N_SAMPLES[subsample][split] 50 | 51 | assert dataset.semantic_n_classes == N_CLASSES_WITH_VOID 52 | assert dataset.semantic_n_classes_without_void == N_CLASSES_WITH_VOID - 1 53 | assert len(dataset.semantic_class_names) == dataset.semantic_n_classes 54 | assert len(dataset.semantic_class_names_without_void) == dataset.semantic_n_classes_without_void 55 | assert len(dataset.semantic_class_colors) == dataset.semantic_n_classes 56 | assert len(dataset.semantic_class_colors_without_void) == dataset.semantic_n_classes_without_void 57 | assert len(dataset.scene_class_names) == N_SCENE_CLASSES 58 | assert len(dataset.cameras) == 1 59 | 60 | assert isinstance(dataset.depth_min, float) 61 | assert isinstance(dataset.depth_max, float) 62 | assert isinstance(dataset.depth_mean, float) 63 | assert isinstance(dataset.depth_std, float) 64 | assert isinstance(dataset.depth_stats, dict) 65 | 66 | # test first 10 samples 67 | for i, sample in enumerate(dataset): 68 | assert isinstance(sample, dict) 69 | assert isinstance(sample['identifier'], SampleIdentifier) 70 | assert isinstance(sample['extrinsics'], 71 | ExtrinsicCameraParametersNormalized) 72 | assert (3+4) == len(sample['extrinsics']) 73 | # inputs: rgb and depth 74 | assert sample['rgb'].ndim == 3 75 | assert isinstance(sample['rgb_intrinsics'], 76 | IntrinsicCameraParametersNormalized) 77 | assert (2+2+6+2) == len(sample['rgb_intrinsics']) 78 | assert sample['depth'].ndim == 2 79 | assert 
isinstance(sample['depth_intrinsics'], 80 | IntrinsicCameraParametersNormalized) 81 | assert (2+2+6+2+2) == len(sample['depth_intrinsics']) 82 | # semantic 83 | assert sample['semantic'].ndim == 2 84 | # instance 85 | assert sample['instance'].ndim == 2 86 | # normal 87 | normal = sample['normal'] 88 | assert normal.ndim == 3 89 | assert normal.dtype == 'float32' 90 | norms = np.linalg.norm(normal, ord=2, axis=-1) 91 | mask = norms > 1e-7 # filter invalid pixels 92 | assert_almost_equal(norms[mask], 1, decimal=4) 93 | # scene 94 | assert isinstance(sample['scene'], int) 95 | # orientation 96 | assert isinstance(sample['orientations'], OrientationDict) 97 | for key, value in sample['orientations'].items(): 98 | # Check if orientation with key exists in instance 99 | assert (sample['instance'] == key).sum() > 0 100 | assert isinstance(key, int) 101 | assert isinstance(value, float) 102 | # assert that the encoding is in radians 103 | assert 0 <= value <= 2*np.pi 104 | # 3d boxes 105 | assert isinstance(sample['3d_boxes'], dict) 106 | 107 | # verify that every instance has an orientation 108 | for instance_id in np.unique(sample['instance']): 109 | # void 110 | if instance_id == 0: 111 | continue 112 | if orientations_use: 113 | assert (instance_id in sample['orientations']) 114 | 115 | if i >= 9: 116 | break 117 | 118 | 119 | @pytest.mark.parametrize('split', ('train', 'valid', 'test')) 120 | def test_scene_class_mapping(split): 121 | sample_keys = ('scene',) 122 | 123 | # create datasets 124 | dataset_original = Hypersim( 125 | dataset_path=DATASET_PATH_DICT['hypersim'], 126 | split=split, 127 | sample_keys=sample_keys, 128 | scene_use_indoor_domestic_labels=False 129 | ) 130 | 131 | dataset_remapped = Hypersim( 132 | dataset_path=DATASET_PATH_DICT['hypersim'], 133 | split=split, 134 | sample_keys=sample_keys, 135 | scene_use_indoor_domestic_labels=True 136 | ) 137 | 138 | # count samples 139 | def count(dataset): 140 | class_names = dataset.config.scene_label_list.class_names 141 | counts = {n: 0 for n in class_names} 142 | for sample in dataset: 143 | counts[class_names[sample['scene']]] += 1 144 | 145 | return counts 146 | 147 | counts_original = count(dataset_original) 148 | counts_remapped = count(dataset_remapped) 149 | 150 | # perform some simple checks 151 | assert sum(counts_remapped.values()) == N_SAMPLES[None][split] 152 | assert sum(counts_remapped.values()) == sum(counts_original.values()) 153 | assert len(counts_remapped) == dataset_remapped.scene_n_classes 154 | 155 | assert dataset_original.scene_n_classes == dataset_original.scene_n_classes_without_void 156 | assert dataset_remapped.scene_n_classes == dataset_remapped.scene_n_classes_without_void + 1 157 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/datasets/ade20k/_class_mappings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. codeauthor:: Daniel Seichter 4 | 5 | Mapping tables to convert semantic classes in ADE20K.
6 | 7 | Based on: 8 | - [1] https://github.com/CSAILVision/sceneparsing/blob/master/convertFromADE/mapFromADE.txt 9 | - [2] https://github.com/CSAILVision/ADE20K/blob/main/utils/ade20k_instance_catid_mapping.txt 10 | """ 11 | 12 | # ----------------------------------------------------------------------------- 13 | # mapping from 2021 full ADE20K to 150 classes used in 2016 scene parsing 14 | # challenge (from [1]) 15 | MAPPING_FULL_ADE20K_TO_SCENE_PARSE_150 = { 16 | 2978: 1, 17 | 312: 2, 18 | 2420: 3, 19 | 976: 4, 20 | 2855: 5, 21 | 447: 6, 22 | 2131: 7, 23 | 165: 8, 24 | 3055: 9, 25 | 1125: 10, 26 | 350: 11, 27 | 2377: 12, 28 | 1831: 13, 29 | 838: 14, 30 | 774: 15, 31 | 783: 15, 32 | 2684: 16, 33 | 1610: 17, 34 | 1910: 18, 35 | 687: 19, 36 | 471: 20, 37 | 401: 21, 38 | 2994: 22, 39 | 1735: 23, 40 | 2473: 24, 41 | 2329: 25, 42 | 1276: 26, 43 | 2264: 27, 44 | 1564: 28, 45 | 2178: 29, 46 | 913: 30, 47 | 57: 31, 48 | 2272: 32, 49 | 907: 33, 50 | 724: 34, 51 | 2138: 35, 52 | 2985: 36, 53 | 533: 36, 54 | 1395: 37, 55 | 155: 38, 56 | 2053: 39, 57 | 689: 40, 58 | 137: 41, 59 | 266: 42, 60 | 581: 43, 61 | 2380: 44, 62 | 491: 45, 63 | 627: 46, 64 | 2212: 47, 65 | 2388: 48, 66 | 2423: 49, 67 | 943: 50, 68 | 2096: 51, 69 | 1121: 52, 70 | 1788: 53, 71 | 2530: 54, 72 | 2185: 55, 73 | 420: 56, 74 | 1948: 57, 75 | 1869: 58, 76 | 2251: 59, 77 | 2531: 60, 78 | 2128: 61, 79 | 294: 62, 80 | 239: 63, 81 | 212: 64, 82 | 571: 65, 83 | 2793: 66, 84 | 978: 67, 85 | 236: 68, 86 | 1240: 69, 87 | 181: 70, 88 | 629: 71, 89 | 2598: 72, 90 | 1744: 73, 91 | 1374: 74, 92 | 591: 75, 93 | 2679: 76, 94 | 223: 77, 95 | 123: 78, 96 | 47: 79, 97 | 1282: 80, 98 | 327: 81, 99 | 2821: 82, 100 | 1451: 83, 101 | 2880: 84, 102 | 2828: 85, 103 | 480: 86, 104 | 77: 87, 105 | 2616: 88, 106 | 246: 89, 107 | 247: 89, 108 | 2733: 90, 109 | 14: 91, 110 | 738: 92, 111 | 38: 93, 112 | 1936: 94, 113 | 1401: 95, 114 | 120: 96, 115 | 868: 97, 116 | 1702: 98, 117 | 249: 99, 118 | 308: 100, 119 | 1969: 101, 120 | 2526: 102, 121 | 2928: 103, 122 | 2337: 104, 123 | 1023: 105, 124 | 609: 106, 125 | 389: 107, 126 | 2989: 108, 127 | 1930: 109, 128 | 2668: 110, 129 | 2586: 111, 130 | 131: 112, 131 | 146: 113, 132 | 3016: 114, 133 | 2739: 115, 134 | 95: 116, 135 | 1563: 117, 136 | 642: 118, 137 | 1708: 119, 138 | 103: 120, 139 | 1002: 121, 140 | 2569: 122, 141 | 2704: 123, 142 | 2833: 124, 143 | 1551: 125, 144 | 1981: 126, 145 | 29: 127, 146 | 187: 128, 147 | 1393: 129, 148 | 747: 130, 149 | 2254: 131, 150 | 206: 132, 151 | 2262: 133, 152 | 1260: 134, 153 | 2243: 135, 154 | 2932: 136, 155 | 2836: 137, 156 | 2850: 138, 157 | 64: 139, 158 | 894: 140, 159 | 1858: 141, 160 | 3109: 142, 161 | 1919: 143, 162 | 1583: 144, 163 | 318: 145, 164 | 2356: 146, 165 | 2046: 147, 166 | 1098: 148, 167 | 530: 149, 168 | 954: 150 169 | } 170 | # double mapping for 15, 36, and 89, thus, vice versa is not possible 171 | assert len(MAPPING_FULL_ADE20K_TO_SCENE_PARSE_150) == 153 172 | 173 | # ----------------------------------------------------------------------------- 174 | # mapping from 2021 full ADE20K to 100 classes used in the instance part of the 175 | # 2017 places challenge (from [2]) 176 | MAPPING_FULL_ADE20K_TO_INSTANCE_100 = { 177 | 165: 1, 178 | 3055: 2, 179 | 350: 3, 180 | 1831: 4, 181 | 774: 5, 182 | 783: 5, 183 | 2684: 6, 184 | 687: 7, 185 | 471: 8, 186 | 401: 9, 187 | 1735: 10, 188 | 2473: 11, 189 | 2329: 12, 190 | 1564: 13, 191 | 57: 14, 192 | 2272: 15, 193 | 907: 16, 194 | 724: 17, 195 | 2985: 18, 196 | 533: 18, 197 | 1395: 19, 198 | 155: 20, 199 | 2053: 
21, 200 | 689: 22, 201 | 266: 23, 202 | 581: 24, 203 | 2380: 25, 204 | 491: 26, 205 | 627: 27, 206 | 2388: 28, 207 | 943: 29, 208 | 2096: 30, 209 | 2530: 31, 210 | 420: 32, 211 | 1948: 33, 212 | 1869: 34, 213 | 2251: 35, 214 | 239: 36, 215 | 571: 37, 216 | 2793: 38, 217 | 978: 39, 218 | 236: 40, 219 | 181: 41, 220 | 629: 42, 221 | 2598: 43, 222 | 1744: 44, 223 | 1374: 45, 224 | 591: 46, 225 | 2679: 47, 226 | 223: 48, 227 | 47: 49, 228 | 327: 50, 229 | 2821: 51, 230 | 1451: 52, 231 | 2880: 53, 232 | 480: 54, 233 | 77: 55, 234 | 2616: 56, 235 | 246: 57, 236 | 247: 57, 237 | 2733: 58, 238 | 14: 59, 239 | 38: 60, 240 | 1936: 61, 241 | 120: 62, 242 | 1702: 63, 243 | 249: 64, 244 | 2928: 65, 245 | 2337: 66, 246 | 1023: 67, 247 | 2989: 68, 248 | 1930: 69, 249 | 2586: 70, 250 | 131: 71, 251 | 146: 72, 252 | 95: 73, 253 | 1563: 74, 254 | 1708: 75, 255 | 103: 76, 256 | 1002: 77, 257 | 2569: 78, 258 | 2833: 79, 259 | 1551: 80, 260 | 1981: 81, 261 | 29: 82, 262 | 187: 83, 263 | 747: 84, 264 | 2254: 85, 265 | 2262: 86, 266 | 1260: 87, 267 | 2243: 88, 268 | 2932: 89, 269 | 2836: 90, 270 | 2850: 91, 271 | 64: 92, 272 | 894: 93, 273 | 1919: 94, 274 | 1583: 95, 275 | 318: 96, 276 | 2046: 97, 277 | 1098: 98, 278 | 530: 99, 279 | 954: 100 280 | } 281 | # double mapping for 5, 18, and 57, thus, vice versa is not possible 282 | assert len(MAPPING_FULL_ADE20K_TO_INSTANCE_100) == 103 283 | 284 | # ----------------------------------------------------------------------------- 285 | # mapping from 150 classes used in 2016 scene parsing challenge to 100 classes 286 | # used in the instance part of the 2017 places challenge and vice versa 287 | # (from [2]) 288 | MAPPING_SCENE_PARSE_150_TO_INSTANCE_100 = { 289 | MAPPING_FULL_ADE20K_TO_SCENE_PARSE_150[k]: v 290 | for k, v in MAPPING_FULL_ADE20K_TO_INSTANCE_100.items() 291 | } 292 | 293 | # no multiple mappings, as the 100 classes are a subset of the 150 classes 294 | assert len(MAPPING_SCENE_PARSE_150_TO_INSTANCE_100) == 100 295 | 296 | # vice versa 297 | MAPPING_INSTANCE_100_TO_SCENE_PARSE_150 = { 298 | v: k 299 | for k, v in MAPPING_SCENE_PARSE_150_TO_INSTANCE_100.items() 300 | } 301 | -------------------------------------------------------------------------------- /src/nicr_scene_analysis_datasets/utils/_colormaps.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | .. 
codeauthor:: Daniel Seichter 4 | """ 5 | import numpy as np 6 | 7 | # see MIRA/toolboxes/ColorMaps/include/SemanticColormaps.h 8 | _COLORMAP_VISUALLY_DISTINCT_256_PLUS_1 = ( 9 | (0, 0, 0), 10 | (22, 254, 25), 11 | (255, 0, 255), 12 | (0, 127, 255), 13 | (255, 127, 0), 14 | (127, 63, 127), 15 | (88, 251, 192), 16 | (194, 249, 49), 17 | (56, 1, 254), 18 | (239, 127, 216), 19 | (251, 2, 45), 20 | (0, 127, 0), 21 | (12, 156, 128), 22 | (0, 0, 127), 23 | (131, 155, 48), 24 | (134, 24, 6), 25 | (131, 86, 253), 26 | (133, 163, 180), 27 | (248, 185, 115), 28 | (233, 81, 107), 29 | (159, 0, 198), 30 | (21, 78, 82), 31 | (0, 255, 127), 32 | (0, 255, 255), 33 | (18, 61, 193), 34 | (186, 242, 160), 35 | (100, 236, 96), 36 | (157, 219, 251), 37 | (53, 191, 239), 38 | (240, 6, 153), 39 | (179, 93, 22), 40 | (113, 229, 4), 41 | (78, 83, 11), 42 | (253, 205, 8), 43 | (12, 183, 50), 44 | (90, 2, 90), 45 | (213, 65, 250), 46 | (176, 8, 89), 47 | (187, 141, 113), 48 | (72, 113, 174), 49 | (237, 200, 207), 50 | (254, 252, 107), 51 | (72, 6, 175), 52 | (175, 97, 177), 53 | (175, 150, 251), 54 | (250, 59, 0), 55 | (94, 176, 114), 56 | (12, 205, 178), 57 | (90, 115, 95), 58 | (194, 166, 1), 59 | (66, 164, 1), 60 | (2, 61, 12), 61 | (84, 133, 244), 62 | (168, 198, 78), 63 | (9, 20, 61), 64 | (104, 58, 195), 65 | (250, 59, 177), 66 | (57, 82, 247), 67 | (188, 41, 160), 68 | (0, 140, 191), 69 | (134, 27, 253), 70 | (196, 39, 29), 71 | (23, 130, 62), 72 | (33, 208, 105), 73 | (127, 214, 156), 74 | (250, 134, 77), 75 | (50, 43, 129), 76 | (2, 101, 145), 77 | (2, 5, 205), 78 | (85, 53, 72), 79 | (245, 248, 180), 80 | (60, 28, 8), 81 | (193, 189, 157), 82 | (116, 180, 248), 83 | (84, 197, 55), 84 | (173, 66, 83), 85 | (127, 2, 143), 86 | (3, 42, 251), 87 | (68, 172, 175), 88 | (251, 253, 30), 89 | (71, 253, 252), 90 | (240, 137, 147), 91 | (132, 122, 143), 92 | (8, 254, 194), 93 | (199, 254, 233), 94 | (123, 121, 3), 95 | (199, 15, 246), 96 | (7, 210, 3), 97 | (173, 212, 9), 98 | (251, 35, 102), 99 | (150, 112, 68), 100 | (40, 254, 85), 101 | (244, 168, 253), 102 | (192, 142, 183), 103 | (63, 246, 141), 104 | (196, 137, 50), 105 | (167, 253, 104), 106 | (222, 183, 58), 107 | (215, 221, 91), 108 | (124, 110, 203), 109 | (136, 70, 41), 110 | (233, 96, 46), 111 | (168, 61, 218), 112 | (137, 252, 51), 113 | (180, 184, 215), 114 | (80, 126, 41), 115 | (80, 248, 43), 116 | (183, 104, 253), 117 | (129, 254, 232), 118 | (130, 184, 6), 119 | (103, 206, 204), 120 | (131, 30, 63), 121 | (75, 83, 128), 122 | (230, 98, 171), 123 | (1, 99, 216), 124 | (1, 167, 235), 125 | (213, 1, 5), 126 | (58, 41, 234), 127 | (104, 1, 220), 128 | (173, 98, 120), 129 | (254, 94, 249), 130 | (1, 225, 64), 131 | (208, 208, 253), 132 | (146, 179, 132), 133 | (161, 252, 2), 134 | (0, 60, 145), 135 | (0, 0, 255), 136 | (61, 0, 46), 137 | (4, 214, 226), 138 | (42, 169, 89), 139 | (63, 138, 130), 140 | (165, 0, 38), 141 | (236, 23, 207), 142 | (27, 95, 35), 143 | (69, 213, 168), 144 | (140, 249, 181), 145 | (66, 207, 4), 146 | (1, 41, 102), 147 | (130, 144, 100), 148 | (236, 51, 54), 149 | (188, 3, 143), 150 | (236, 221, 140), 151 | (16, 24, 165), 152 | (133, 128, 254), 153 | (108, 223, 254), 154 | (54, 142, 205), 155 | (56, 225, 212), 156 | (209, 121, 2), 157 | (250, 165, 12), 158 | (252, 172, 170), 159 | (37, 48, 43), 160 | (170, 214, 125), 161 | (12, 166, 5), 162 | (139, 39, 169), 163 | (204, 39, 104), 164 | (212, 248, 1), 165 | (52, 119, 1), 166 | (166, 217, 193), 167 | (225, 2, 102), 168 | (23, 115, 105), 169 | (202, 178, 108), 170 | (72, 90, 58), 
171 | (113, 254, 137), 172 | (114, 87, 166), 173 | (252, 219, 61), 174 | (162, 56, 2), 175 | (217, 84, 0), 176 | (207, 110, 87), 177 | (58, 4, 131), 178 | (86, 151, 79), 179 | (145, 30, 113), 180 | (96, 2, 10), 181 | (137, 212, 41), 182 | (65, 253, 0), 183 | (97, 34, 141), 184 | (61, 66, 165), 185 | (39, 186, 144), 186 | (254, 49, 253), 187 | (56, 165, 47), 188 | (117, 85, 85), 189 | (130, 178, 84), 190 | (213, 252, 123), 191 | (149, 150, 5), 192 | (48, 23, 91), 193 | (1, 216, 138), 194 | (98, 49, 15), 195 | (101, 156, 212), 196 | (214, 218, 31), 197 | (69, 82, 203), 198 | (97, 52, 254), 199 | (42, 116, 239), 200 | (216, 94, 220), 201 | (166, 123, 218), 202 | (41, 150, 250), 203 | (251, 208, 254), 204 | (222, 167, 205), 205 | (211, 232, 198), 206 | (214, 61, 201), 207 | (26, 170, 195), 208 | (45, 223, 38), 209 | (39, 224, 252), 210 | (174, 169, 44), 211 | (207, 114, 137), 212 | (100, 141, 170), 213 | (1, 80, 251), 214 | (139, 86, 2), 215 | (196, 71, 133), 216 | (208, 4, 59), 217 | (253, 18, 0), 218 | (192, 23, 200), 219 | (76, 210, 124), 220 | (57, 36, 192), 221 | (162, 149, 150), 222 | (252, 106, 131), 223 | (109, 101, 36), 224 | (118, 207, 114), 225 | (30, 129, 160), 226 | (120, 0, 44), 227 | (145, 226, 81), 228 | (178, 43, 254), 229 | (62, 222, 78), 230 | (0, 151, 90), 231 | (25, 100, 181), 232 | (218, 164, 151), 233 | (214, 136, 249), 234 | (227, 40, 145), 235 | (1, 147, 35), 236 | (29, 81, 121), 237 | (164, 145, 74), 238 | (11, 28, 20), 239 | (154, 71, 159), 240 | (57, 2, 213), 241 | (119, 9, 184), 242 | (34, 251, 230), 243 | (2, 184, 110), 244 | (203, 174, 254), 245 | (206, 76, 62), 246 | (224, 149, 106), 247 | (141, 189, 204), 248 | (161, 2, 254), 249 | (135, 39, 213), 250 | (104, 185, 155), 251 | (91, 102, 245), 252 | (109, 154, 12), 253 | (214, 196, 0), 254 | (30, 235, 165), 255 | (106, 34, 96), 256 | (233, 139, 34), 257 | (252, 228, 218), 258 | (139, 76, 198), 259 | (231, 251, 68), 260 | (162, 123, 30), 261 | (11, 0, 93), 262 | (78, 29, 47), 263 | (0, 64, 52), 264 | (156, 155, 212), 265 | (151, 245, 141) 266 | ) 267 | 268 | COLORMAP_VISUALLY_DISTINCT_VOID_PLUS_256 = \ 269 | np.array(_COLORMAP_VISUALLY_DISTINCT_256_PLUS_1, dtype='uint8') 270 | 271 | COLORMAP_VISUALLY_DISTINCT_256 = \ 272 | np.array(_COLORMAP_VISUALLY_DISTINCT_256_PLUS_1[1:], dtype='uint8') 273 | -------------------------------------------------------------------------------- /tests/test_scannet.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Simple (interface) tests for ScanNet dataset 4 | 5 | .. codeauthor:: Daniel Seichter 6 | """ 7 | import pytest 8 | 9 | from nicr_scene_analysis_datasets import ScanNet 10 | from nicr_scene_analysis_datasets.dataset_base import ExtrinsicCameraParametersNormalized 11 | from nicr_scene_analysis_datasets.dataset_base import IntrinsicCameraParametersNormalized 12 | from nicr_scene_analysis_datasets.dataset_base import SampleIdentifier 13 | from nicr_scene_analysis_datasets.utils.testing import DATASET_PATH_DICT 14 | 15 | 16 | N_SAMPLES = { # subsample is applied to each trajectory in a scene (folder) 17 | None: {'train': 1893422, 'valid': 530449, 'test': 208862}, # not used so far 18 | 1: {'train': 1893422, 'valid': 530449, 'test': 208862}, # not used so far 19 | 5: {'train': 379221, 'valid': 106217, 'test': 41827}, # used for mapping 20 | 10: {'train': 189916, 'valid': 53193, 'test': 20942}, # used for mapping 21 | 50: {'train': 38474, 'valid': 10767, 'test': 4223}, # default subsample ! 
22 | 100: {'train': 19559, 'valid': 5465, 'test': 2135}, 23 | 200: {'train': 10098, 'valid': 2814, 'test': 1089}, 24 | 500: {'train': 4403, 'valid': 1222, 'test': 468} 25 | } 26 | 27 | N_SCENE_CLASSES = 21 28 | 29 | 30 | @pytest.mark.parametrize('split', ('train', 'valid', 'test')) 31 | @pytest.mark.parametrize('subsample', (50, 100, 200, 500)) 32 | @pytest.mark.parametrize('semantic_n_classes', (20, 40, 200, 549)) 33 | @pytest.mark.parametrize('instance_semantic_mode', ('raw', 'refined')) 34 | def test_dataset(split, subsample, semantic_n_classes, instance_semantic_mode): 35 | dataset = ScanNet( 36 | dataset_path=DATASET_PATH_DICT['scannet'], 37 | split=split, 38 | subsample=subsample, 39 | depth_mode='raw', 40 | sample_keys=ScanNet.get_available_sample_keys(split), 41 | semantic_n_classes=semantic_n_classes, 42 | instance_semantic_mode=instance_semantic_mode 43 | ) 44 | 45 | assert dataset.split == split 46 | 47 | assert len(dataset) == N_SAMPLES[subsample][split] 48 | 49 | assert dataset.semantic_n_classes == semantic_n_classes + 1 50 | assert dataset.semantic_n_classes_without_void == semantic_n_classes 51 | assert len(dataset.semantic_class_names) == dataset.semantic_n_classes 52 | assert len(dataset.semantic_class_names_without_void) == dataset.semantic_n_classes_without_void 53 | assert len(dataset.semantic_class_colors) == dataset.semantic_n_classes 54 | assert len(dataset.semantic_class_colors_without_void) == dataset.semantic_n_classes_without_void 55 | 56 | assert len(dataset.scene_class_names) == N_SCENE_CLASSES 57 | assert len(dataset.cameras) == 2 58 | 59 | assert isinstance(dataset.depth_min, float) 60 | assert isinstance(dataset.depth_max, float) 61 | assert isinstance(dataset.depth_mean, float) 62 | assert isinstance(dataset.depth_std, float) 63 | assert isinstance(dataset.depth_stats, dict) 64 | 65 | # test first 10 samples 66 | for i, sample in enumerate(dataset): 67 | assert isinstance(sample, dict) 68 | assert isinstance(sample['identifier'], SampleIdentifier) 69 | assert isinstance(sample['extrinsics'], 70 | ExtrinsicCameraParametersNormalized) 71 | assert (3+4) == len(sample['extrinsics']) 72 | # inputs: rgb and depth 73 | assert sample['rgb'].ndim == 3 74 | assert isinstance(sample['rgb_intrinsics'], 75 | IntrinsicCameraParametersNormalized) 76 | assert (2+2+6+2) == len(sample['rgb_intrinsics']) 77 | assert sample['depth'].ndim == 2 78 | assert isinstance(sample['depth_intrinsics'], 79 | IntrinsicCameraParametersNormalized) 80 | assert (2+2+6+2+2) == len(sample['depth_intrinsics']) 81 | 82 | if 'test' != split: 83 | # semantic 84 | assert sample['semantic'].ndim == 2 85 | # instance 86 | assert sample['instance'].ndim == 2 87 | # scene 88 | assert isinstance(sample['scene'], int) 89 | 90 | if i >= 9: 91 | break 92 | 93 | 94 | @pytest.mark.parametrize('split', ('train', 'valid')) 95 | def test_scene_class_mapping(split): 96 | sample_keys = ('scene',) 97 | 98 | # create datasets (with default subsample!) 
99 | dataset_original = ScanNet( 100 | dataset_path=DATASET_PATH_DICT['scannet'], 101 | split=split, 102 | sample_keys=sample_keys, 103 | scene_use_indoor_domestic_labels=False 104 | ) 105 | 106 | dataset_remapped = ScanNet( 107 | dataset_path=DATASET_PATH_DICT['scannet'], 108 | split=split, 109 | sample_keys=sample_keys, 110 | scene_use_indoor_domestic_labels=True 111 | ) 112 | 113 | # count samples 114 | def count(dataset): 115 | class_names = dataset.config.scene_label_list.class_names 116 | counts = {n: 0 for n in class_names} 117 | for sample in dataset: 118 | counts[class_names[sample['scene']]] += 1 119 | 120 | return counts 121 | 122 | counts_original = count(dataset_original) 123 | counts_remapped = count(dataset_remapped) 124 | 125 | # perform some simple checks 126 | assert sum(counts_remapped.values()) == N_SAMPLES[50][split] 127 | assert sum(counts_remapped.values()) == sum(counts_original.values()) 128 | assert len(counts_remapped) == dataset_remapped.scene_n_classes 129 | 130 | assert dataset_original.scene_n_classes == dataset_original.scene_n_classes_without_void 131 | assert dataset_remapped.scene_n_classes == dataset_remapped.scene_n_classes_without_void + 1 132 | 133 | 134 | @pytest.mark.parametrize('split', ('train', 'valid', 'test')) 135 | def test_filter_camera(split): 136 | # just some random cameras and counts that we know 137 | sample_cameras = { # for default subsample of 50 138 | 'train': { 139 | 'structureio_480x640': 688 140 | }, 141 | 'valid': { 142 | 'structureio_968x1296': 10492, 143 | }, 144 | 'test': {'structureio_968x1296': 4223}, 145 | } 146 | 147 | cameras = tuple(sample_cameras[split].keys()) 148 | n_samples = tuple(sample_cameras[split].values()) 149 | 150 | # create dataset with specified cameras 151 | dataset = ScanNet( 152 | dataset_path=DATASET_PATH_DICT['scannet'], 153 | split=split, 154 | sample_keys=ScanNet.get_available_sample_keys(split), 155 | cameras=cameras 156 | ) 157 | 158 | assert dataset.cameras == cameras 159 | assert len(dataset) == sum(n_samples) 160 | 161 | # test filtering 162 | dataset.filter_camera(cameras[0]) 163 | assert dataset.camera == cameras[0] 164 | assert len(dataset) == n_samples[0] 165 | 166 | # reset filtering 167 | dataset.filter_camera(None) 168 | assert dataset.camera is None 169 | assert len(dataset) == sum(n_samples) 170 | --------------------------------------------------------------------------------