├── ocl ├── __init__.py ├── models │ ├── __init__.py │ ├── savi.py │ └── savi_with_memory.py ├── config │ ├── memory.py │ ├── predictor.py │ ├── perceptual_groupings.py │ ├── __init__.py │ ├── neural_networks.py │ ├── optimizers.py │ ├── feature_extractors.py │ ├── datasets.py │ ├── conditioning.py │ ├── utils.py │ ├── metrics.py │ └── plugins.py ├── cli │ ├── eval_utils.py │ ├── cli_utils.py │ ├── compute_dataset_size.py │ ├── eval.py │ └── train.py ├── path_defaults.py ├── matching.py ├── hooks.py ├── consistency.py ├── visualization_types.py ├── distillation.py ├── scheduling.py ├── tree_utils.py ├── combined_model.py ├── memory_rollout.py ├── neural_networks.py ├── losses.py ├── mha.py ├── base.py ├── predictor.py ├── conditioning.py └── visualizations.py ├── NOTICE ├── configs ├── .DS_Store ├── experiment │ ├── .DS_Store │ ├── _output_path.yaml │ ├── OC-MOT │ │ ├── _cater_bbox_mot_preprocessing.yaml │ │ ├── cater_eval.yaml │ │ └── cater.yaml │ └── SAVi │ │ ├── _cater_bbox_mot_preprocessing.yaml │ │ └── cater.yaml └── dataset │ └── cater.yaml ├── srcs ├── cater_demo.gif ├── framework.png └── real_world_demo.gif ├── CODE_OF_CONDUCT.md ├── setup.cfg ├── pyproject.toml ├── requirements.txt ├── CONTRIBUTING.md ├── README.md └── LICENSE /ocl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /configs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/object-centric-multiple-object-tracking/HEAD/configs/.DS_Store -------------------------------------------------------------------------------- /srcs/cater_demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/object-centric-multiple-object-tracking/HEAD/srcs/cater_demo.gif -------------------------------------------------------------------------------- /srcs/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/object-centric-multiple-object-tracking/HEAD/srcs/framework.png -------------------------------------------------------------------------------- /srcs/real_world_demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/object-centric-multiple-object-tracking/HEAD/srcs/real_world_demo.gif -------------------------------------------------------------------------------- /configs/experiment/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/object-centric-multiple-object-tracking/HEAD/configs/experiment/.DS_Store -------------------------------------------------------------------------------- /ocl/models/__init__.py: -------------------------------------------------------------------------------- 1 | """Models defined in code.""" 2 | from ocl.models.savi import SAVi 3 | from ocl.models.savi_with_memory import SAVi_mem 4 | __all__ = ["SAVi", "SAVi_mem"] 5 | -------------------------------------------------------------------------------- 
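For orientation, the two model classes exported above are assembled from the sub-modules listed in the tree (conditioning, feature extractor, perceptual grouping, decoder, transition model; see `ocl/models/savi.py` further down). The following is a minimal, illustrative sketch of that composition only: the `nn.Identity` placeholders are stand-ins, and the component names in the comments are taken from the experiment configs later in this dump, not a runnable training setup.

```python
# Minimal sketch of how ocl.models.SAVi is assembled from sub-modules. The
# nn.Identity placeholders are stand-ins for illustration only; in the
# experiments these slots are filled from the Hydra configs (e.g.
# ocl.conditioning.LearntConditioning, ocl.feature_extractors.SAViFeatureExtractor,
# ocl.perceptual_grouping.SlotAttentionGrouping, ocl.decoding.SlotAttentionDecoder).
from torch import nn

from ocl.models import SAVi

model = SAVi(
    conditioning=nn.Identity(),
    feature_extractor=nn.Identity(),
    perceptual_grouping=nn.Identity(),
    decoder=nn.Identity(),
    transition_model=nn.Identity(),
)
print(type(model).__name__)  # -> "SAVi"
```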
/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /configs/experiment/_output_path.yaml: -------------------------------------------------------------------------------- 1 | # @package hydra 2 | 3 | run: 4 | dir: ${oc.select:experiment.root_output_folder,outputs}/${hydra:runtime.choices.experiment}/${now:%Y-%m-%d_%H-%M-%S} 5 | sweep: 6 | dir: ${oc.select:experiment.root_output_folder,multirun} 7 | subdir: ${hydra:runtime.choices.experiment}/${now:%Y-%m-%d_%H-%M-%S} 8 | output_subdir: config 9 | -------------------------------------------------------------------------------- /configs/dataset/cater.yaml: -------------------------------------------------------------------------------- 1 | # Video dataset CATER based on https://github.com/deepmind/multi_object_datasets . 2 | defaults: 3 | - webdataset 4 | train_shards: ${dataset_prefix:"cater_with_masks/train/shard-{000000..000152}.tar"} 5 | train_size: 35427 6 | val_shards: ${dataset_prefix:"cater_with_masks/val/shard-{000000..000016}.tar"} 7 | val_size: 50 #3937 8 | test_shards: ${dataset_prefix:"cater_with_masks/test/shard-{000000..000073}.tar"} 9 | test_size: 17100 10 | -------------------------------------------------------------------------------- /ocl/config/memory.py: -------------------------------------------------------------------------------- 1 | """Perceptual grouping models.""" 2 | import dataclasses 3 | 4 | from hydra_zen import builds 5 | 6 | from ocl import memory 7 | 8 | 9 | @dataclasses.dataclass 10 | class MemoryConfig: 11 | """Configuration class of Predictor.""" 12 | 13 | 14 | TransitionConfig = builds( 15 | memory.SelfSupervisedMemory, 16 | builds_bases=(MemoryConfig,), 17 | populate_full_signature=True, 18 | ) 19 | 20 | 21 | def register_configs(config_store): 22 | config_store.store(group="schemas", name="memory", node=MemoryConfig) 23 | config_store.store(group="memory", name="mem", node=TransitionConfig) 24 | -------------------------------------------------------------------------------- /ocl/cli/eval_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import pytorch_lightning 4 | 5 | from ocl.cli import train 6 | 7 | 8 | def build_from_train_config( 9 | config: train.TrainingConfig, checkpoint_path: Optional[str], seed: bool = True 10 | ): 11 | if seed: 12 | pytorch_lightning.seed_everything(config.seed, workers=True) 13 | 14 | pm = train.create_plugin_manager() 15 | datamodule = train.build_and_register_datamodule_from_config(config, pm.hook, pm) 16 | train.build_and_register_plugins_from_config(config, pm) 17 | model = train.build_model_from_config(config, pm.hook, checkpoint_path) 18 | 19 | return datamodule, model, pm 20 | -------------------------------------------------------------------------------- /ocl/config/predictor.py: -------------------------------------------------------------------------------- 1 | """Perceptual grouping models.""" 2 | import dataclasses 3 | 4 | from hydra_zen import builds 5 | 6 | from ocl import predictor 7 | 8 | 9 | @dataclasses.dataclass 10 | 
class PredictorConfig: 11 | """Configuration class of Predictor.""" 12 | 13 | 14 | TransitionConfig = builds( 15 | predictor.TransformerBlock, 16 | builds_bases=(PredictorConfig,), 17 | populate_full_signature=True, 18 | ) 19 | 20 | 21 | 22 | def register_configs(config_store): 23 | config_store.store(group="schemas", name="perceptual_grouping", node=PredictorConfig) 24 | config_store.store(group="perceptual_grouping", name="slot_attention", node=TransitionConfig) 25 | 26 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | select= 3 | # F: errors from pyflake 4 | F, 5 | # W, E: warnings/errors from pycodestyle (PEP8) 6 | W, E, 7 | # I: problems with imports 8 | I, 9 | # B: bugbear warnings ("likely bugs and design problems") 10 | B, 11 | # D: docstring warnings from pydocstyle 12 | D 13 | ignore= 14 | # E203: whitespace before ':' (incompatible with black) 15 | E203, 16 | # E731: do not use a lambda expression, use a def (local def is often ugly) 17 | E731, 18 | # W503: line break before binary operator (incompatible with black) 19 | W503, 20 | # D1: docstring warnings related to missing documentation 21 | D1 22 | max-line-length = 101 23 | ban-relative-imports = true 24 | docstring-convention = google 25 | exclude = .*,__pycache__,./outputs 26 | -------------------------------------------------------------------------------- /ocl/path_defaults.py: -------------------------------------------------------------------------------- 1 | """Default paths for different types of inputs. 2 | 3 | These are only defined for convenience and can also be overwritten using the appropriate *_path 4 | constructor variables of RoutableMixin subclasses. 5 | """ 6 | INPUT = "input" 7 | VIDEO = f"{INPUT}.image" 8 | BOX = f"{INPUT}.instance_bbox" 9 | MASK = f"{INPUT}.mask" 10 | ID = f"{INPUT}.instance_id" 11 | BATCH_SIZE = f"{INPUT}.batch_size" 12 | GLOBAL_STEP = "global_step" 13 | FEATURES = "feature_extractor" 14 | CONDITIONING = "conditioning" 15 | # TODO(hornmax): Currently decoders are nested in the task and accept PerceptualGroupingOutput as 16 | # input. In the future this will change and decoders should just be regular parts of the model. 
17 | OBJECTS = "perceptual_grouping.objects" 18 | FEATURE_ATTRIBUTIONS = "perceptual_grouping.feature_attributions" 19 | OBJECT_DECODER = "object_decoder" 20 | -------------------------------------------------------------------------------- /ocl/config/perceptual_groupings.py: -------------------------------------------------------------------------------- 1 | """Perceptual grouping models.""" 2 | import dataclasses 3 | 4 | from hydra_zen import builds 5 | 6 | from ocl import perceptual_grouping 7 | 8 | 9 | @dataclasses.dataclass 10 | class PerceptualGroupingConfig: 11 | """Configuration class of perceptual grouping models.""" 12 | 13 | 14 | SlotAttentionConfig = builds( 15 | perceptual_grouping.SlotAttentionGrouping, 16 | builds_bases=(PerceptualGroupingConfig,), 17 | populate_full_signature=True, 18 | ) 19 | StickBreakingGroupingConfig = builds( 20 | perceptual_grouping.StickBreakingGrouping, 21 | builds_bases=(PerceptualGroupingConfig,), 22 | populate_full_signature=True, 23 | ) 24 | 25 | 26 | def register_configs(config_store): 27 | config_store.store(group="schemas", name="perceptual_grouping", node=PerceptualGroupingConfig) 28 | config_store.store(group="perceptual_grouping", name="slot_attention", node=SlotAttentionConfig) 29 | config_store.store( 30 | group="perceptual_grouping", name="stick_breaking", node=StickBreakingGroupingConfig 31 | ) 32 | -------------------------------------------------------------------------------- /ocl/config/__init__.py: -------------------------------------------------------------------------------- 1 | from hydra.core.config_store import ConfigStore 2 | from omegaconf import OmegaConf 3 | 4 | from ocl.config import ( 5 | conditioning, 6 | datasets, 7 | feature_extractors, 8 | metrics, 9 | neural_networks, 10 | optimizers, 11 | perceptual_groupings, 12 | plugins, 13 | predictor, 14 | memory, 15 | utils, 16 | ) 17 | 18 | config_store = ConfigStore.instance() 19 | 20 | conditioning.register_configs(config_store) 21 | 22 | datasets.register_configs(config_store) 23 | datasets.register_resolvers(OmegaConf) 24 | 25 | feature_extractors.register_configs(config_store) 26 | 27 | metrics.register_configs(config_store) 28 | 29 | neural_networks.register_configs(config_store) 30 | 31 | optimizers.register_configs(config_store) 32 | 33 | perceptual_groupings.register_configs(config_store) 34 | predictor.register_configs(config_store) 35 | memory.register_configs(config_store) 36 | 37 | plugins.register_configs(config_store) 38 | plugins.register_resolvers(OmegaConf) 39 | 40 | utils.register_configs(config_store) 41 | utils.register_resolvers(OmegaConf) 42 | -------------------------------------------------------------------------------- /ocl/cli/cli_utils.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | 4 | from hydra.core.hydra_config import HydraConfig 5 | 6 | 7 | def get_commandline_config_path(): 8 | """Get the path of a config path specified on the command line.""" 9 | hydra_cfg = HydraConfig.get() 10 | config_sources = hydra_cfg.runtime.config_sources 11 | config_path = None 12 | for source in config_sources: 13 | if source.schema == "file" and source.provider == "command-line": 14 | config_path = source.path 15 | break 16 | return config_path 17 | 18 | 19 | def find_checkpoint(path): 20 | """Find checkpoint in output path of previous run.""" 21 | checkpoints = glob.glob( 22 | os.path.join(path, "lightning_logs", "version_*", "checkpoints", "*.ckpt") 23 | ) 24 | checkpoints.sort() 25 | # 
Return the last checkpoint. 26 | # TODO (hornmax): If more than one checkpoint is stored this might not lead to the most recent 27 | # checkpoint being loaded. Generally, I think this is ok as we still allow people to set the 28 | # checkpoint manually. 29 | return checkpoints[-1] 30 | -------------------------------------------------------------------------------- /ocl/config/neural_networks.py: -------------------------------------------------------------------------------- 1 | """Configs for neural networks.""" 2 | import omegaconf 3 | from hydra_zen import builds 4 | 5 | from ocl import neural_networks 6 | 7 | MLPBuilderConfig = builds( 8 | neural_networks.build_mlp, 9 | features=omegaconf.MISSING, 10 | zen_partial=True, 11 | populate_full_signature=True, 12 | ) 13 | TransformerEncoderBuilderConfig = builds( 14 | neural_networks.build_transformer_encoder, 15 | n_layers=omegaconf.MISSING, 16 | n_heads=omegaconf.MISSING, 17 | zen_partial=True, 18 | populate_full_signature=True, 19 | ) 20 | TransformerDecoderBuilderConfig = builds( 21 | neural_networks.build_transformer_decoder, 22 | n_layers=omegaconf.MISSING, 23 | n_heads=omegaconf.MISSING, 24 | zen_partial=True, 25 | populate_full_signature=True, 26 | ) 27 | 28 | 29 | def register_configs(config_store): 30 | config_store.store(group="neural_networks", name="mlp", node=MLPBuilderConfig) 31 | config_store.store( 32 | group="neural_networks", name="transformer_encoder", node=TransformerEncoderBuilderConfig 33 | ) 34 | config_store.store( 35 | group="neural_networks", name="transformer_decoder", node=TransformerDecoderBuilderConfig 36 | ) 37 | -------------------------------------------------------------------------------- /ocl/config/optimizers.py: -------------------------------------------------------------------------------- 1 | """Pytorch optimizers.""" 2 | import dataclasses 3 | 4 | import torch.optim 5 | from hydra_zen import make_custom_builds_fn 6 | 7 | 8 | @dataclasses.dataclass 9 | class OptimizerConfig: 10 | pass 11 | 12 | 13 | # TODO(hornmax): We cannot automatically extract type information from the torch SGD implementation, 14 | # thus we define it manually here. 
15 | @dataclasses.dataclass 16 | class SGDConfig(OptimizerConfig): 17 | learning_rate: float 18 | momentum: float = 0.0 19 | dampening: float = 0.0 20 | nestov: bool = False 21 | _target_: str = "hydra_zen.funcs.zen_processing" 22 | _zen_target: str = "torch.optim.SGD" 23 | _zen_partial: bool = True 24 | 25 | 26 | pbuilds = make_custom_builds_fn( 27 | zen_partial=True, 28 | populate_full_signature=True, 29 | ) 30 | 31 | AdamConfig = pbuilds(torch.optim.Adam, builds_bases=(OptimizerConfig,)) 32 | AdamWConfig = pbuilds(torch.optim.AdamW, builds_bases=(OptimizerConfig,)) 33 | 34 | 35 | def register_configs(config_store): 36 | config_store.store(group="optimizers", name="sgd", node=SGDConfig) 37 | config_store.store(group="optimizers", name="adam", node=AdamConfig) 38 | config_store.store(group="optimizers", name="adamw", node=AdamWConfig) 39 | -------------------------------------------------------------------------------- /ocl/matching.py: -------------------------------------------------------------------------------- 1 | """Methods for matching between sets of elements.""" 2 | from typing import Tuple, Type 3 | 4 | import numpy as np 5 | import torch 6 | from scipy.optimize import linear_sum_assignment 7 | from torchtyping import TensorType 8 | 9 | # Avoid errors due to flake: 10 | batch_size = None 11 | n_elements = None 12 | 13 | CostMatrix = Type[TensorType["batch_size", "n_elements", "n_elements"]] 14 | AssignmentMatrix = Type[TensorType["batch_size", "n_elements", "n_elements"]] 15 | CostVector = Type[TensorType["batch_size"]] 16 | 17 | 18 | class Matcher(torch.nn.Module): 19 | """Matcher base class to define consistent interface.""" 20 | 21 | def forward(self, C: CostMatrix) -> Tuple[AssignmentMatrix, CostVector]: 22 | pass 23 | 24 | 25 | class CPUHungarianMatcher(Matcher): 26 | """Implementaiton of a cpu hungarian matcher using scipy.optimize.linear_sum_assignment.""" 27 | 28 | def forward(self, C: CostMatrix) -> Tuple[AssignmentMatrix, CostVector]: 29 | X = torch.zeros_like(C) 30 | C_cpu: np.ndarray = C.detach().cpu().numpy() 31 | for i, cost_matrix in enumerate(C_cpu): 32 | row_ind, col_ind = linear_sum_assignment(cost_matrix) 33 | X[i][row_ind, col_ind] = 1.0 34 | return X, (C * X).sum(dim=(1, 2)) 35 | -------------------------------------------------------------------------------- /configs/experiment/OC-MOT/_cater_bbox_mot_preprocessing.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - /plugins/data_preprocessing@plugins.03a_preprocessing 4 | - /plugins/multi_element_preprocessing@plugins.03b_preprocessing 5 | 6 | plugins: 7 | 03a_preprocessing: 8 | training_fields: 9 | - image 10 | - mask 11 | - object_positions 12 | - __key__ 13 | training_transform: 14 | _target_: torchvision.transforms.Compose 15 | transforms: 16 | - _target_: ocl.preprocessing.AddBBoxFromInstanceMasks 17 | evaluation_fields: 18 | - image 19 | - mask 20 | - object_positions 21 | - __key__ 22 | evaluation_transform: 23 | _target_: torchvision.transforms.Compose 24 | transforms: 25 | - _target_: ocl.preprocessing.AddBBoxFromInstanceMasks 26 | 27 | 03b_preprocessing: 28 | training_transforms: 29 | image: 30 | _target_: ocl.preprocessing.VideoToTensor 31 | instance_bbox: 32 | _target_: ocl.preprocessing.BBoxToTensor 33 | instance_cls: 34 | _target_: ocl.preprocessing.ClsToTensor 35 | instance_id: 36 | _target_: ocl.preprocessing.IDToTensor 37 | mask: 38 | _target_: ocl.preprocessing.MultiMaskToTensor 39 | evaluation_transforms: 40 | 
image: 41 | _target_: ocl.preprocessing.VideoToTensor 42 | instance_bbox: 43 | _target_: ocl.preprocessing.BBoxToTensor 44 | instance_cls: 45 | _target_: ocl.preprocessing.ClsToTensor 46 | instance_id: 47 | _target_: ocl.preprocessing.IDToTensor 48 | mask: 49 | _target_: ocl.preprocessing.MultiMaskToTensor 50 | -------------------------------------------------------------------------------- /configs/experiment/SAVi/_cater_bbox_mot_preprocessing.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - /plugins/data_preprocessing@plugins.03a_preprocessing 4 | - /plugins/multi_element_preprocessing@plugins.03b_preprocessing 5 | 6 | plugins: 7 | 03a_preprocessing: 8 | training_fields: 9 | - image 10 | - mask 11 | - object_positions 12 | - __key__ 13 | training_transform: 14 | _target_: torchvision.transforms.Compose 15 | transforms: 16 | - _target_: ocl.preprocessing.AddBBoxFromInstanceMasks 17 | evaluation_fields: 18 | - image 19 | - mask 20 | - object_positions 21 | - __key__ 22 | evaluation_transform: 23 | _target_: torchvision.transforms.Compose 24 | transforms: 25 | - _target_: ocl.preprocessing.AddBBoxFromInstanceMasks 26 | 27 | 03b_preprocessing: 28 | training_transforms: 29 | image: 30 | _target_: ocl.preprocessing.VideoToTensor 31 | instance_bbox: 32 | _target_: ocl.preprocessing.BBoxToTensor 33 | instance_cls: 34 | _target_: ocl.preprocessing.ClsToTensor 35 | instance_id: 36 | _target_: ocl.preprocessing.IDToTensor 37 | mask: 38 | _target_: ocl.preprocessing.MultiMaskToTensor 39 | evaluation_transforms: 40 | image: 41 | _target_: ocl.preprocessing.VideoToTensor 42 | instance_bbox: 43 | _target_: ocl.preprocessing.BBoxToTensor 44 | instance_cls: 45 | _target_: ocl.preprocessing.ClsToTensor 46 | instance_id: 47 | _target_: ocl.preprocessing.IDToTensor 48 | mask: 49 | _target_: ocl.preprocessing.MultiMaskToTensor 50 | -------------------------------------------------------------------------------- /ocl/config/feature_extractors.py: -------------------------------------------------------------------------------- 1 | """Configurations for feature extractors.""" 2 | import dataclasses 3 | 4 | from hydra_zen import make_custom_builds_fn 5 | 6 | from ocl import feature_extractors 7 | 8 | 9 | @dataclasses.dataclass 10 | class FeatureExtractorConfig: 11 | """Base class for PyTorch Lightning DataModules. 12 | 13 | This class does not actually do anything but ensures that feature extractors give outputs of 14 | a defined structure. 
15 | """ 16 | 17 | pass 18 | 19 | 20 | builds_feature_extractor = make_custom_builds_fn( 21 | populate_full_signature=True, 22 | ) 23 | 24 | TimmFeatureExtractorConfig = builds_feature_extractor( 25 | feature_extractors.TimmFeatureExtractor, 26 | builds_bases=(FeatureExtractorConfig,), 27 | ) 28 | SlotAttentionFeatureExtractorConfig = builds_feature_extractor( 29 | feature_extractors.SlotAttentionFeatureExtractor, 30 | builds_bases=(FeatureExtractorConfig,), 31 | ) 32 | SAViFeatureExtractorConfig = builds_feature_extractor( 33 | feature_extractors.SAViFeatureExtractor, 34 | builds_bases=(FeatureExtractorConfig,), 35 | ) 36 | 37 | 38 | def register_configs(config_store): 39 | config_store.store(group="schemas", name="feature_extractor", node=FeatureExtractorConfig) 40 | config_store.store( 41 | group="feature_extractor", 42 | name="timm_model", 43 | node=TimmFeatureExtractorConfig, 44 | ) 45 | config_store.store( 46 | group="feature_extractor", 47 | name="slot_attention", 48 | node=SlotAttentionFeatureExtractorConfig, 49 | ) 50 | config_store.store( 51 | group="feature_extractor", 52 | name="savi", 53 | node=SAViFeatureExtractorConfig, 54 | ) 55 | -------------------------------------------------------------------------------- /ocl/hooks.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, Dict, Tuple 2 | 3 | import webdataset 4 | from pluggy import HookimplMarker, HookspecMarker 5 | 6 | from ocl.combined_model import CombinedModel 7 | 8 | hook_specification = HookspecMarker("ocl") 9 | hook_implementation = HookimplMarker("ocl") 10 | 11 | 12 | class FakeHooks: 13 | """Class that mimics the behavior of the plugin manager hooks property.""" 14 | 15 | def __getattr__(self, attribute): 16 | """Return a fake hook handler for any attribute query.""" 17 | 18 | def fake_hook_handler(*args, **kwargs): 19 | return tuple() 20 | 21 | return fake_hook_handler 22 | 23 | 24 | # @transform_hooks 25 | # def input_dependencies() -> Tuple[str, ...]: 26 | # """Provide list of variables that are required for the plugin to function.""" 27 | # 28 | # 29 | # @transform_hooks 30 | # def provided_inputs() -> Tuple[str, ...]: 31 | # """Provide list of variables that are provided by the plugin.""" 32 | 33 | 34 | @hook_specification 35 | def training_transform() -> Callable[[webdataset.Processor], webdataset.Processor]: 36 | """Provide a transformation which processes a component of a webdataset pipeline.""" 37 | 38 | 39 | @hook_specification 40 | def training_fields() -> Tuple[str]: 41 | """Provide list of fields that are required to be decoded during training.""" 42 | 43 | 44 | @hook_specification 45 | def evaluation_transform() -> Callable[[webdataset.Processor], webdataset.Processor]: 46 | """Provide a transformation which processes a component of a webdataset pipeline.""" 47 | 48 | 49 | @hook_specification 50 | def evaluation_fields() -> Tuple[str]: 51 | """Provide list of fields that are required to be decoded during evaluation.""" 52 | 53 | 54 | @hook_specification 55 | def configure_optimizers(model: CombinedModel) -> Dict[str, Any]: 56 | """Return optimizers in the format of pytorch lightning.""" 57 | 58 | 59 | @hook_specification 60 | def on_train_epoch_start(model: CombinedModel) -> None: 61 | """Hook called when starting training epoch.""" 62 | -------------------------------------------------------------------------------- /ocl/consistency.py: -------------------------------------------------------------------------------- 1 | """Modules 
to compute the IoU matching cost and solve the corresponding LSAP.""" 2 | import numpy as np 3 | import torch 4 | from scipy.optimize import linear_sum_assignment 5 | from torch import nn 6 | 7 | 8 | class HungarianMatcher(nn.Module): 9 | """This class computes an assignment between the targets and the predictions of the network.""" 10 | 11 | @torch.no_grad() 12 | def forward(self, mask_preds, mask_targets): 13 | """Performs the matching. 14 | 15 | Params: 16 | mask_preds: Tensor of dim [batch_size, n_objects, N, N] with the predicted masks 17 | mask_targets: Tensor of dim [batch_size, n_objects, N, N] 18 | with the target masks from another augmentation 19 | 20 | Returns: 21 | A list of size batch_size, containing tuples of (index_i, index_j) where: 22 | - index_i is the indices of the selected predictions 23 | - index_j is the indices of the corresponding selected targets 24 | """ 25 | bs, n_objects, _, _ = mask_preds.shape 26 | # Compute the iou cost betwen masks 27 | cost_iou = -get_iou_matrix(mask_preds, mask_targets) 28 | cost_iou = cost_iou.reshape(bs, n_objects, bs, n_objects).cpu() 29 | self.costs = torch.stack([cost_iou[i, :, i, :][None] for i in range(bs)]) 30 | indices = [linear_sum_assignment(c[0]) for c in self.costs] 31 | return torch.as_tensor(np.array(indices)) 32 | 33 | 34 | def get_iou_matrix(preds, targets): 35 | 36 | bs, n_objects, H, W = targets.shape 37 | targets = targets.reshape(bs * n_objects, H * W).float() 38 | preds = preds.reshape(bs * n_objects, H * W).float() 39 | 40 | intersection = torch.matmul(targets, preds.t()) 41 | targets_area = targets.sum(dim=1).view(1, -1) 42 | preds_area = preds.sum(dim=1).view(1, -1) 43 | union = (targets_area.t() + preds_area) - intersection 44 | 45 | return torch.where( 46 | union == 0, 47 | torch.tensor(0.0, device=targets.device), 48 | intersection / union, 49 | ) 50 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "ocl" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Max Horn "] 6 | 7 | [tool.poetry.scripts] 8 | ocl_train = "ocl.cli.train:train" 9 | ocl_eval = "ocl.cli.eval:evaluate" 10 | ocl_compute_dataset_size = "ocl.cli.compute_dataset_size:compute_size" 11 | 12 | [tool.poetry.dependencies] 13 | python = ">=3.7.1,<3.9" 14 | webdataset = "^0.1.103" 15 | # There seems to be an issue in torch 1.12.x with masking and multi-head 16 | # attention. This prevents the usage of makes without a batch dimension. 17 | # Staying with torch 1.11.x version for now. 18 | torch = "1.12.*" 19 | pytorch-lightning = "^1.5.10" 20 | hydra-zen = "^0.7.0" 21 | torchtyping = "^0.1.4" 22 | hydra-core = "^1.2.0" 23 | pluggy = "^1.0.0" 24 | importlib-metadata = "4.2" 25 | torchvision = "0.13.*" 26 | Pillow = "9.0.1" # Newer versions of pillow seem to result in segmentation faults. 
27 | torchmetrics = "^0.8.1" 28 | matplotlib = "^3.5.1" 29 | moviepy = "^1.0.3" 30 | scipy = "<=1.8" 31 | awscli = "^1.22.90" 32 | scikit-learn = "^1.0.2" 33 | pyamg = "^4.2.3" 34 | botocore = { extras = ["crt"], version = "^1.27.22" } 35 | timm = {version = "0.6.7", optional = true} 36 | hydra-submitit-launcher = { version = "^1.2.0", optional = true } 37 | decord = "0.6.0" 38 | motmetrics = "^1.2.5" 39 | clip = {git = "https://github.com/openai/CLIP.git", rev = "main", optional = true} 40 | ftfy = {version = "^6.1.1", optional = true} 41 | regex = {version = "^2022.7.9", optional = true} 42 | 43 | [tool.poetry.dev-dependencies] 44 | black = "^22.1.0" 45 | pytest = "^7.0.1" 46 | flake8 = "^4.0.1" 47 | flake8-isort = "^4.1.1" 48 | pre-commit = "^2.17.0" 49 | flake8-tidy-imports = "^4.6.0" 50 | flake8-bugbear = "^22.1.11" 51 | flake8-docstrings = "^1.6.0" 52 | 53 | [tool.poetry.extras] 54 | timm = ["timm"] 55 | clip = ["clip", "ftfy", "regex"] 56 | submitit = ["hydra-submitit-launcher"] 57 | 58 | [build-system] 59 | requires = ["poetry-core<=1.0.4"] 60 | build-backend = "poetry.core.masonry.api" 61 | 62 | [tool.black] 63 | line-length = 101 64 | target-version = ["py38"] 65 | 66 | [tool.isort] 67 | profile = "black" 68 | line_length = 101 69 | skip_gitignore = true 70 | remove_redundant_aliases = true 71 | -------------------------------------------------------------------------------- /ocl/config/datasets.py: -------------------------------------------------------------------------------- 1 | """Register all dataset related configs.""" 2 | import dataclasses 3 | import os 4 | 5 | from hydra_zen import builds 6 | 7 | from ocl import datasets 8 | 9 | 10 | def get_region(): 11 | """Determine the region this EC2 instance is running in. 12 | 13 | Returns None if not running on an EC2 instance. 14 | """ 15 | import requests 16 | 17 | try: 18 | r = requests.get( 19 | "http://169.254.169.254/latest/dynamic/instance-identity/document", timeout=0.5 20 | ) 21 | response_json = r.json() 22 | return response_json.get("region") 23 | except Exception: 24 | # Not running on an ec2 instance. 25 | return None 26 | 27 | 28 | # Detemine region name and select bucket accordingly. 29 | AWS_REGION = get_region() 30 | if AWS_REGION in ["us-east-2", "us-west-2", "eu-west-1"]: 31 | # Select bucket in same region. 32 | DEFAULT_S3_PATH = f"s3://object-centric-datasets-{AWS_REGION}" 33 | else: 34 | # Use MRAP to find closest bucket. 35 | DEFAULT_S3_PATH = "s3://arn:aws:s3::436622332146:accesspoint/m6p4hmmybeu97.mrap" 36 | 37 | 38 | @dataclasses.dataclass 39 | class DataModuleConfig: 40 | """Base class for PyTorch Lightning DataModules. 41 | 42 | This class does not actually do anything but ensures that datasets behave like pytorch lightning 43 | datamodules. 44 | """ 45 | 46 | 47 | def dataset_prefix(path): 48 | # prefix = os.environ.get("DATASET_PREFIX") 49 | prefix = '/home/ubuntu/data' 50 | if prefix: 51 | return f"{prefix}/{path}" 52 | # Use the path to the multi-region bucket if no override is specified. 
53 | return f"pipe:aws s3 cp --quiet {DEFAULT_S3_PATH}/{path} -" 54 | 55 | 56 | WebdatasetDataModuleConfig = builds( 57 | datasets.WebdatasetDataModule, populate_full_signature=True, builds_bases=(DataModuleConfig,) 58 | ) 59 | DummyDataModuleConfig = builds( 60 | datasets.DummyDataModule, populate_full_signature=True, builds_bases=(DataModuleConfig,) 61 | ) 62 | 63 | 64 | def register_configs(config_store): 65 | config_store.store(group="schemas", name="dataset", node=DataModuleConfig) 66 | config_store.store(group="dataset", name="webdataset", node=WebdatasetDataModuleConfig) 67 | config_store.store(group="dataset", name="dummy_dataset", node=DummyDataModuleConfig) 68 | 69 | 70 | def register_resolvers(omegaconf): 71 | omegaconf.register_new_resolver("dataset_prefix", dataset_prefix) 72 | -------------------------------------------------------------------------------- /ocl/config/conditioning.py: -------------------------------------------------------------------------------- 1 | """Configuration of slot conditionings.""" 2 | import dataclasses 3 | 4 | from hydra_zen import builds 5 | from omegaconf import SI 6 | 7 | from ocl import conditioning 8 | 9 | 10 | @dataclasses.dataclass 11 | class ConditioningConfig: 12 | """Base class for conditioning module configuration.""" 13 | 14 | 15 | # Unfortunately, we cannot define object_dim as part of the base config class as this prevents using 16 | # required positional arguments in all subclasses. We thus instead pass them here. 17 | LearntConditioningConfig = builds( 18 | conditioning.LearntConditioning, 19 | object_dim=SI("${perceptual_grouping.object_dim}"), 20 | builds_bases=(ConditioningConfig,), 21 | populate_full_signature=True, 22 | ) 23 | 24 | RandomConditioningConfig = builds( 25 | conditioning.RandomConditioning, 26 | object_dim=SI("${perceptual_grouping.object_dim}"), 27 | builds_bases=(ConditioningConfig,), 28 | populate_full_signature=True, 29 | ) 30 | 31 | RandomConditioningWithQMCSamplingConfig = builds( 32 | conditioning.RandomConditioningWithQMCSampling, 33 | object_dim=SI("${perceptual_grouping.object_dim}"), 34 | builds_bases=(ConditioningConfig,), 35 | populate_full_signature=True, 36 | ) 37 | 38 | SlotwiseLearntConditioningConfig = builds( 39 | conditioning.SlotwiseLearntConditioning, 40 | object_dim=SI("${perceptual_grouping.object_dim}"), 41 | builds_bases=(ConditioningConfig,), 42 | populate_full_signature=True, 43 | ) 44 | CoordinateEncoderStateInitConfig = builds( 45 | conditioning.CoordinateEncoderStateInit, 46 | object_dim=SI("${perceptual_grouping.object_dim}"), 47 | builds_bases=(ConditioningConfig,), 48 | populate_full_signature=True, 49 | ) 50 | 51 | def register_configs(config_store): 52 | config_store.store(group="schemas", name="conditioning", node=ConditioningConfig) 53 | 54 | config_store.store(group="conditioning", name="learnt", node=LearntConditioningConfig) 55 | config_store.store(group="conditioning", name="random", node=RandomConditioningConfig) 56 | config_store.store( 57 | group="conditioning", 58 | name="random_with_qmc_sampling", 59 | node=RandomConditioningWithQMCSamplingConfig, 60 | ) 61 | config_store.store( 62 | group="conditioning", name="slotwise_learnt_random", node=SlotwiseLearntConditioningConfig 63 | ) 64 | config_store.store(group="conditioning", name="boxhint", node=CoordinateEncoderStateInitConfig) -------------------------------------------------------------------------------- /ocl/cli/compute_dataset_size.py: 
-------------------------------------------------------------------------------- 1 | """Script to compute the size of a dataset. 2 | 3 | This is useful when subsampling data using transformations in order to determine the final dataset 4 | size. The size of the dataset is typically need when running distributed training in order to 5 | ensure that all nodes and gpu training processes are presented with the same number of batches. 6 | """ 7 | import dataclasses 8 | import logging 9 | import os 10 | from typing import Dict 11 | 12 | import hydra 13 | import hydra_zen 14 | import tqdm 15 | from pluggy import PluginManager 16 | 17 | import ocl.hooks 18 | from ocl.config.datasets import DataModuleConfig 19 | from ocl.config.plugins import PluginConfig 20 | 21 | 22 | @dataclasses.dataclass 23 | class ComputeSizeConfig: 24 | """Configuration of a training run.""" 25 | 26 | dataset: DataModuleConfig 27 | plugins: Dict[str, PluginConfig] = dataclasses.field(default_factory=dict) 28 | 29 | 30 | hydra.core.config_store.ConfigStore.instance().store( 31 | name="compute_size_config", 32 | node=ComputeSizeConfig, 33 | ) 34 | 35 | 36 | @hydra.main(config_name="compute_size_config", config_path="../../configs", version_base="1.1") 37 | def compute_size(config: ComputeSizeConfig): 38 | pm = PluginManager("ocl") 39 | pm.add_hookspecs(ocl.hooks) 40 | 41 | datamodule = hydra_zen.instantiate(config.dataset, hooks=pm.hook) 42 | pm.register(datamodule) 43 | 44 | plugins = hydra_zen.instantiate(config.plugins) 45 | for plugin in plugins.values(): 46 | pm.register(plugin) 47 | 48 | # Compute dataset sizes 49 | # TODO(hornmax): This is needed for webdataset shuffling, is there a way to make this more 50 | # elegant and less specific? 51 | os.environ["WDS_EPOCH"] = str(0) 52 | train_size = sum( 53 | 1 54 | for _ in tqdm.tqdm( 55 | datamodule.train_data_iterator(), desc="Reading train split", unit="samples" 56 | ) 57 | ) 58 | logging.info("Train split size: %d", train_size) 59 | val_size = sum( 60 | 1 61 | for _ in tqdm.tqdm( 62 | datamodule.val_data_iterator(), desc="Reading validation split", unit="samples" 63 | ) 64 | ) 65 | logging.info("Validation split size: %d", val_size) 66 | test_size = sum( 67 | 1 68 | for _ in tqdm.tqdm( 69 | datamodule.test_data_iterator(), desc="Reading test split", unit="samples" 70 | ) 71 | ) 72 | logging.info("Test split size: %d", test_size) 73 | 74 | 75 | if __name__ == "__main__": 76 | compute_size() 77 | -------------------------------------------------------------------------------- /ocl/visualization_types.py: -------------------------------------------------------------------------------- 1 | """Classes for handling different types of visualizations.""" 2 | import dataclasses 3 | from typing import Any, List, Optional, Union 4 | 5 | import matplotlib.pyplot 6 | import torch 7 | from torch.utils.tensorboard import SummaryWriter 8 | from torchtyping import TensorType 9 | 10 | 11 | def dataclass_to_dict(d): 12 | return {field.name: getattr(d, field.name) for field in dataclasses.fields(d)} 13 | 14 | 15 | @dataclasses.dataclass 16 | class Visualization: 17 | def add_to_experiment(self, experiment: SummaryWriter, tag: str, global_step: int): 18 | pass 19 | 20 | 21 | @dataclasses.dataclass 22 | class Figure(Visualization): 23 | """Matplotlib figure.""" 24 | 25 | figure: matplotlib.pyplot.figure 26 | close: bool = True 27 | 28 | def add_to_experiment(self, experiment: SummaryWriter, tag: str, global_step: int): 29 | experiment.add_figure(**dataclass_to_dict(self), tag=tag, 
global_step=global_step) 30 | 31 | 32 | @dataclasses.dataclass 33 | class Image(Visualization): 34 | """Single image.""" 35 | 36 | img_tensor: torch.Tensor 37 | dataformats: str = "CHW" 38 | 39 | def add_to_experiment(self, experiment: SummaryWriter, tag: str, global_step: int): 40 | experiment.add_image(**dataclass_to_dict(self), tag=tag, global_step=global_step) 41 | 42 | 43 | @dataclasses.dataclass 44 | class Images(Visualization): 45 | """Batch of images.""" 46 | 47 | img_tensor: torch.Tensor 48 | dataformats: str = "NCHW" 49 | 50 | def add_to_experiment(self, experiment: SummaryWriter, tag: str, global_step: int): 51 | experiment.add_images(**dataclass_to_dict(self), tag=tag, global_step=global_step) 52 | 53 | 54 | @dataclasses.dataclass 55 | class Video(Visualization): 56 | """Batch of videos.""" 57 | 58 | vid_tensor: TensorType["batch_size", "frames", "channels", "height", "width"] # noqa: F821 59 | fps: Union[int, float] = 4 60 | 61 | def add_to_experiment(self, experiment: SummaryWriter, tag: str, global_step: int): 62 | experiment.add_video(**dataclass_to_dict(self), tag=tag, global_step=global_step) 63 | 64 | 65 | class Embedding(Visualization): 66 | """Batch of embeddings.""" 67 | 68 | mat: TensorType["batch_size", "feature_dim"] # noqa: F821 69 | metadata: Optional[List[Any]] = None 70 | label_img: Optional[TensorType["batch_size", "channels", "height", "width"]] = None # noqa: F821 71 | metadata_header: Optional[List[str]] = None 72 | 73 | def add_to_experiment(self, experiment: SummaryWriter, tag: str, global_step: int): 74 | experiment.add_embedding(**dataclass_to_dict(self), tag=tag, global_step=global_step) 75 | -------------------------------------------------------------------------------- /ocl/models/savi.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | from typing import Any, Dict 3 | import copy 4 | import torch 5 | from torch import nn 6 | 7 | from ocl.path_defaults import VIDEO, BOX 8 | from ocl.tree_utils import get_tree_element, reduce_tree 9 | 10 | 11 | class SAVi(nn.Module): 12 | def __init__( 13 | self, 14 | conditioning: nn.Module, 15 | feature_extractor: nn.Module, 16 | perceptual_grouping: nn.Module, 17 | decoder: nn.Module, 18 | transition_model: nn.Module, 19 | ): 20 | super().__init__() 21 | self.conditioning = conditioning 22 | self.feature_extractor = feature_extractor 23 | self.perceptual_grouping = perceptual_grouping 24 | self.decoder = decoder 25 | self.transition_model = transition_model 26 | self.batched_input = None 27 | 28 | def forward(self, inputs: Dict[str, Any], phase = 'train'): 29 | # if self.batched_input is None: 30 | # video = get_tree_element(inputs, VIDEO.split(".")) 31 | # # if video.shape[1] == 6: 32 | # self.batched_input = copy.deepcopy(inputs) 33 | # else: 34 | # print ('use catched') 35 | # inputs = self.batched_input 36 | 37 | output = inputs 38 | video = get_tree_element(inputs, VIDEO.split(".")) 39 | box = get_tree_element(inputs, BOX.split(".")) 40 | batch_size = video.shape[0] 41 | 42 | features = self.feature_extractor(video=video) 43 | output["feature_extractor"] = features 44 | conditioning = self.conditioning(batch_size=batch_size) 45 | # conditioning = self.conditioning(batch_size=batch_size) 46 | output["initial_conditioning"] = conditioning 47 | 48 | # Loop over time. 
49 | perceptual_grouping_outputs = [] 50 | decoder_outputs = [] 51 | transition_model_outputs = [] 52 | trackers = [] 53 | for frame_features in features: 54 | perceptual_grouping_output = self.perceptual_grouping( 55 | extracted_features=frame_features, conditioning=conditioning 56 | ) 57 | slots = perceptual_grouping_output.objects 58 | decoder_output = self.decoder(object_features=slots) 59 | 60 | # remove background 61 | masks = decoder_output.masks_eval 62 | valid_idx = [0,1,2,4,5,6,7,8,9,10] 63 | masks_obj = masks[:, valid_idx] 64 | 65 | conditioning = self.transition_model(slots) 66 | # Store outputs. 67 | perceptual_grouping_outputs.append(slots) 68 | decoder_outputs.append(decoder_output) 69 | transition_model_outputs.append(conditioning) 70 | trackers.append(masks_obj) 71 | 72 | # Stack all recurrent outputs. 73 | stacking_fn = partial(torch.stack, dim=1) 74 | output["perceptual_grouping"] = reduce_tree(perceptual_grouping_outputs, stacking_fn) 75 | output["decoder"] = reduce_tree(decoder_outputs, stacking_fn) 76 | output["transition_model"] = reduce_tree(transition_model_outputs, stacking_fn) 77 | output["tracks"] = reduce_tree(trackers, stacking_fn) 78 | return output 79 | 80 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==1.1.0 2 | aiohttp==3.8.1 3 | aiosignal==1.2.0 4 | antlr4-python3-runtime==4.9.3 5 | async-timeout==4.0.2 6 | attrs==21.4.0 7 | awscli==1.25.22 8 | awscrt==0.13.8 9 | black==22.6.0 10 | botocore==1.27.22 11 | braceexpand==0.1.7 12 | CacheControl==0.12.11 13 | cachetools==5.2.0 14 | cachy==0.3.0 15 | certifi==2022.6.15 16 | cfgv==3.3.1 17 | charset-normalizer==2.1.0 18 | cleo==0.8.1 19 | click==8.1.3 20 | clikit==0.6.2 21 | colorama==0.4.4 22 | crashtest==0.3.1 23 | cycler==0.11.0 24 | decorator==4.4.2 25 | decord==0.6.0 26 | distlib==0.3.4 27 | docutils==0.16 28 | filelock==3.7.1 29 | filterpy==1.4.5 30 | flake8==4.0.1 31 | flake8-bugbear==22.7.1 32 | flake8-docstrings==1.6.0 33 | flake8-isort==4.1.1 34 | flake8-tidy-imports==4.8.0 35 | fonttools==4.33.3 36 | frozenlist==1.3.0 37 | fsspec==2022.7.1 38 | google-auth==2.9.0 39 | google-auth-oauthlib==0.4.6 40 | grpcio==1.47.0 41 | html5lib==1.1 42 | hydra-core==1.2.0 43 | hydra-zen==0.7.1 44 | identify==2.5.1 45 | idna==3.3 46 | imageio==2.19.3 47 | imageio-ffmpeg==0.4.7 48 | importlib-metadata==4.2.0 49 | importlib-resources==5.8.0 50 | iniconfig==1.1.1 51 | isort==5.10.1 52 | jeepney==0.8.0 53 | jmespath==1.0.1 54 | joblib==1.1.0 55 | keyring==23.8.2 56 | kiwisolver==1.4.3 57 | lap==0.4.0 58 | llvmlite==0.39.1 59 | lockfile==0.12.2 60 | Markdown==3.3.5 61 | matplotlib==3.5.2 62 | mccabe==0.6.1 63 | motmetrics==1.2.5 64 | moviepy==1.0.3 65 | msgpack==1.0.4 66 | multidict==6.0.2 67 | mypy-extensions==0.4.3 68 | nodeenv==1.7.0 69 | numba==0.56.4 70 | numpy==1.21.6 71 | oauthlib==3.2.0 72 | # Editable install with no version control (ocl==0.1.0) 73 | -e /home/ubuntu/object-centric-learning-models-mainline 74 | omegaconf==2.2.2 75 | packaging==21.3 76 | pandas==1.3.5 77 | pastel==0.2.1 78 | pathspec==0.9.0 79 | pexpect==4.8.0 80 | Pillow==9.0.1 81 | pkginfo==1.8.3 82 | platformdirs==2.5.2 83 | pluggy==1.0.0 84 | poetry==1.1.14 85 | poetry-core==1.0.8 86 | pre-commit==2.19.0 87 | proglog==0.1.10 88 | protobuf==3.20.1 89 | ptyprocess==0.7.0 90 | py==1.11.0 91 | pyamg==4.2.3 92 | pyasn1==0.4.8 93 | pyasn1-modules==0.2.8 94 | pycodestyle==2.8.0 95 | pyDeprecate==0.3.2 96 | 
pydocstyle==6.1.1 97 | pyflakes==2.4.0 98 | pylev==1.4.0 99 | pyparsing==3.0.9 100 | pytest==7.1.2 101 | python-dateutil==2.8.2 102 | pytorch-lightning==1.6.4 103 | pytz==2022.4 104 | PyYAML==5.4.1 105 | requests==2.28.1 106 | requests-oauthlib==1.3.1 107 | requests-toolbelt==0.9.1 108 | rsa==4.7.2 109 | s3transfer==0.6.0 110 | scikit-learn==1.0.2 111 | scipy==1.7.3 112 | SecretStorage==3.3.2 113 | setuptools-scm==7.0.4 114 | shellingham==1.5.0 115 | six==1.16.0 116 | snowballstemmer==2.2.0 117 | tensorboard==2.9.0 118 | tensorboard-data-server==0.6.1 119 | tensorboard-plugin-wit==1.8.1 120 | testfixtures==6.18.5 121 | threadpoolctl==3.1.0 122 | toml==0.10.2 123 | tomli==2.0.1 124 | tomlkit==0.11.3 125 | torch==1.12.1 126 | torchmetrics==0.8.2 127 | torchtyping==0.1.4 128 | torchvision==0.13.1 129 | tqdm==4.64.0 130 | typeguard==2.13.3 131 | typing_extensions==4.3.0 132 | urllib3==1.26.9 133 | virtualenv==20.15.1 134 | webdataset==0.1.103 135 | webencodings==0.5.1 136 | Werkzeug==2.1.2 137 | xmltodict==0.13.0 138 | yarl==1.7.2 139 | zipp==3.8.0 140 | 141 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 
41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /ocl/config/utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions useful for configuration.""" 2 | import ast 3 | from typing import Any, Callable 4 | 5 | from hydra_zen import builds 6 | 7 | from ocl.config.feature_extractors import FeatureExtractorConfig 8 | from ocl.config.perceptual_groupings import PerceptualGroupingConfig 9 | from ocl.config.predictor import PredictorConfig 10 | from ocl.distillation import EMASelfDistillation 11 | from ocl.utils import Combined, CreateSlotMask, Recurrent 12 | 13 | 14 | def lambda_string_to_function(function_string: str) -> Callable[..., Any]: 15 | """Convert string of the form "lambda x: x" into a callable Python function.""" 16 | # This is a bit hacky but ensures that the syntax of the input is correct and contains 17 | # a valid lambda function definition without requiring to run `eval`. 18 | parsed = ast.parse(function_string) 19 | is_lambda = isinstance(parsed.body[0], ast.Expr) and isinstance(parsed.body[0].value, ast.Lambda) 20 | if not is_lambda: 21 | raise ValueError(f"'{function_string}' is not a valid lambda definition.") 22 | 23 | return eval(function_string) 24 | 25 | 26 | class ConfigDefinedLambda: 27 | """Lambda function defined in the config. 28 | 29 | This allows lambda functions defined in the config to be pickled. 30 | """ 31 | 32 | def __init__(self, function_string: str): 33 | self.__setstate__(function_string) 34 | 35 | def __getstate__(self) -> str: 36 | return self.function_string 37 | 38 | def __setstate__(self, function_string: str): 39 | self.function_string = function_string 40 | self._fn = lambda_string_to_function(function_string) 41 | 42 | def __call__(self, *args, **kwargs): 43 | return self._fn(*args, **kwargs) 44 | 45 | 46 | def eval_lambda(function_string, *args): 47 | lambda_fn = lambda_string_to_function(function_string) 48 | return lambda_fn(*args) 49 | 50 | 51 | FunctionConfig = builds(ConfigDefinedLambda, populate_full_signature=True) 52 | 53 | # Inherit from all so it can be used in place of any module. 
54 | CombinedConfig = builds( 55 | Combined, 56 | populate_full_signature=True, 57 | builds_bases=(FeatureExtractorConfig, PerceptualGroupingConfig, PredictorConfig), 58 | ) 59 | RecurrentConfig = builds( 60 | Recurrent, 61 | populate_full_signature=True, 62 | builds_bases=(FeatureExtractorConfig, PerceptualGroupingConfig, PredictorConfig), 63 | ) 64 | CreateSlotMaskConfig = builds(CreateSlotMask, populate_full_signature=True) 65 | 66 | 67 | EMASelfDistillationConfig = builds( 68 | EMASelfDistillation, 69 | populate_full_signature=True, 70 | builds_bases=(FeatureExtractorConfig, PerceptualGroupingConfig, PredictorConfig), 71 | ) 72 | 73 | 74 | def register_configs(config_store): 75 | config_store.store(group="schemas", name="lambda_fn", node=FunctionConfig) 76 | config_store.store(group="utils", name="combined", node=CombinedConfig) 77 | config_store.store(group="utils", name="selfdistillation", node=EMASelfDistillationConfig) 78 | config_store.store(group="utils", name="recurrent", node=RecurrentConfig) 79 | config_store.store(group="utils", name="create_slot_mask", node=CreateSlotMaskConfig) 80 | 81 | 82 | def register_resolvers(omegaconf): 83 | omegaconf.register_new_resolver("lambda_fn", ConfigDefinedLambda) 84 | omegaconf.register_new_resolver("eval_lambda", eval_lambda) 85 | -------------------------------------------------------------------------------- /ocl/distillation.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Any, Dict, List, Optional, Union 3 | 4 | import torch 5 | from torch import nn 6 | 7 | from ocl import scheduling, tree_utils, utils 8 | 9 | 10 | class EMASelfDistillation(nn.Module): 11 | def __init__( 12 | self, 13 | student: Union[nn.Module, Dict[str, nn.Module]], 14 | schedule: scheduling.HPScheduler, 15 | student_remapping: Optional[Dict[str, str]] = None, 16 | teacher_remapping: Optional[Dict[str, str]] = None, 17 | ): 18 | super().__init__() 19 | # Do this for convenience to reduce crazy amount of nesting. 20 | if isinstance(student, dict): 21 | student = utils.Combined(student) 22 | if student_remapping is None: 23 | student_remapping = {} 24 | if teacher_remapping is None: 25 | teacher_remapping = {} 26 | 27 | self.student = student 28 | self.teacher = copy.deepcopy(student) 29 | self.schedule = schedule 30 | self.student_remapping = {key: value.split(".") for key, value in student_remapping.items()} 31 | self.teacher_remapping = {key: value.split(".") for key, value in teacher_remapping.items()} 32 | 33 | def build_input_dict(self, inputs, remapping): 34 | if not remapping: 35 | return inputs 36 | # This allows us to bing the initial input and previous_output into a similar format. 37 | output_dict = {} 38 | for output_path, input_path in remapping.items(): 39 | source = tree_utils.get_tree_element(inputs, input_path) 40 | 41 | output_path = output_path.split(".") 42 | cur_search = output_dict 43 | for path_part in output_path[:-1]: 44 | # Iterate along path and create nodes that do not exist yet. 45 | try: 46 | # Get element prior to last. 47 | cur_search = tree_utils.get_tree_element(cur_search, [path_part]) 48 | except ValueError: 49 | # Element does not yet exist. 
50 | cur_search[path_part] = {} 51 | cur_search = cur_search[path_part] 52 | 53 | cur_search[output_path[-1]] = source 54 | return output_dict 55 | 56 | def forward(self, inputs: Dict[str, Any]): 57 | if self.training: 58 | with torch.no_grad(): 59 | m = self.schedule(inputs["global_step"]) # momentum parameter 60 | for param_q, param_k in zip(self.student.parameters(), self.teacher.parameters()): 61 | param_k.data.mul_(m).add_((1 - m) * param_q.detach().data) 62 | 63 | # prefix variable similar to combined module. 64 | prefix: List[str] 65 | if "prefix" in inputs.keys(): 66 | prefix = inputs["prefix"] 67 | else: 68 | prefix = [] 69 | inputs["prefix"] = prefix 70 | 71 | outputs = tree_utils.get_tree_element(inputs, prefix) 72 | 73 | # Forward pass student. 74 | prefix.append("student") 75 | outputs["student"] = {} 76 | student_inputs = self.build_input_dict(inputs, self.student_remapping) 77 | outputs["student"] = self.student(inputs={**inputs, **student_inputs}) 78 | # Teacher and student share the same code, thus paths also need to be the same. To ensure 79 | # that we save the student outputs and run the teacher as if it where the student. 80 | student_output = outputs["student"] 81 | 82 | # Forward pass teacher, but pretending to be student. 83 | outputs["student"] = {} 84 | teacher_inputs = self.build_input_dict(inputs, self.teacher_remapping) 85 | 86 | with torch.no_grad(): 87 | outputs["teacher"] = self.teacher(inputs={**inputs, **teacher_inputs}) 88 | prefix.pop() 89 | 90 | # Set correct outputs again. 91 | outputs["student"] = student_output 92 | 93 | return outputs 94 | -------------------------------------------------------------------------------- /configs/experiment/SAVi/cater.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - /experiment/_output_path 3 | - /training_config 4 | - /dataset: cater 5 | - /plugins/optimization@plugins.optimize_parameters 6 | - /plugins/random_strided_window@plugins.02_random_strided_window # Used during training. 7 | - /plugins/multi_element_preprocessing@plugins.03_preprocessing 8 | - /optimizers/adam@plugins.optimize_parameters.optimizer 9 | - /lr_schedulers/cosine_annealing@plugins.optimize_parameters.lr_scheduler 10 | - /experiment/SAVi/_cater_bbox_mot_preprocessing 11 | # - /metrics/three_d_iou@evaluation_metrics.iou 12 | # - /metrics/mot_metric@evaluation_metrics.mot 13 | - /metrics/ari_metric@evaluation_metrics.ari 14 | - _self_ 15 | 16 | 17 | 18 | load_checkpoint: outputs/SAVi/savi/2023-02-20_23-49-54/checkpoints/epoch=18-step=1064.ckpt 19 | 20 | trainer: 21 | gpus: 8 22 | gradient_clip_val: 0.05 23 | gradient_clip_algorithm: "norm" 24 | max_epochs: null 25 | max_steps: 2000005 26 | strategy: 'ddp' 27 | callbacks: 28 | - _target_: pytorch_lightning.callbacks.LearningRateMonitor 29 | logging_interval: "step" 30 | 31 | dataset: 32 | num_workers: 4 33 | batch_size: 30 34 | 35 | models: 36 | _target_: ocl.models.SAVi 37 | conditioning: 38 | _target_: ocl.conditioning.LearntConditioning 39 | n_slots: 11 40 | object_dim: 128 41 | 42 | feature_extractor: 43 | # Use the smaller verion of the feature extractor architecture. 
44 | _target_: ocl.feature_extractors.SAViFeatureExtractor 45 | larger_input_arch: False 46 | 47 | perceptual_grouping: 48 | _target_: ocl.perceptual_grouping.SlotAttentionGrouping 49 | feature_dim: 32 50 | object_dim: ${models.conditioning.object_dim} 51 | iters: 2 52 | kvq_dim: 128 53 | use_projection_bias: false 54 | positional_embedding: 55 | _target_: ocl.utils.Sequential 56 | _args_: 57 | - _target_: ocl.utils.SoftPositionEmbed 58 | n_spatial_dims: 2 59 | feature_dim: 32 60 | savi_style: true 61 | - _target_: ocl.neural_networks.build_two_layer_mlp 62 | input_dim: 32 63 | output_dim: 32 64 | hidden_dim: 64 65 | initial_layer_norm: true 66 | ff_mlp: null 67 | 68 | decoder: 69 | _target_: ocl.decoding.SlotAttentionDecoder 70 | decoder: 71 | _target_: ocl.decoding.get_savi_decoder_backbone 72 | object_dim: ${models.perceptual_grouping.object_dim} 73 | larger_input_arch: False 74 | positional_embedding: 75 | _target_: ocl.utils.SoftPositionEmbed 76 | n_spatial_dims: 2 77 | feature_dim: ${models.perceptual_grouping.object_dim} 78 | cnn_channel_order: true 79 | savi_style: true 80 | 81 | transition_model: 82 | _target_: torch.nn.Identity 83 | 84 | losses: 85 | mse: 86 | _target_: ocl.losses.ReconstructionLoss 87 | loss_type: mse_sum 88 | input_path: decoder.reconstruction 89 | target_path: input.image 90 | 91 | plugins: 92 | optimize_parameters: 93 | optimizer: 94 | lr: 0.0001 95 | lr_scheduler: 96 | T_max: 200000 97 | eta_min: 0.0 98 | warmup_steps: 0 99 | 02_random_strided_window: 100 | n_consecutive_frames: 6 101 | training_fields: 102 | - image 103 | evaluation_fields: [] 104 | 105 | visualizations: 106 | input: 107 | _target_: ocl.visualizations.Video 108 | denormalization: null 109 | video_path: input.image 110 | reconstruction: 111 | _target_: ocl.visualizations.Video 112 | denormalization: ${..input.denormalization} 113 | video_path: decoder.reconstruction 114 | objects: 115 | _target_: ocl.visualizations.VisualObject 116 | denormalization: ${..input.denormalization} 117 | object_path: decoder.object_reconstructions 118 | mask_path: decoder.masks_eval 119 | objectmot: 120 | _target_: ocl.visualizations.ObjectMOT 121 | n_clips: 5 122 | denormalization: null 123 | video_path: input.image 124 | mask_path: tracks 125 | 126 | evaluation_metrics: 127 | ari: 128 | prediction_path: decoder.masks 129 | target_path: input.mask 130 | 131 | 132 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Object-Centric Multiple Object Tracking (OC-MOT) 2 | This is the official implementation of the ICCV'23 paper [Object-Centric Multiple Object Tracking](https://arxiv.org/abs/2309.00233). The code was implemented by [Zixu Zhao](https://github.com/zxzhaoeric), [Jiaze Wang](https://jiazewang.com/), [Max Horn](https://github.com/ExpectationMax) and [Tianjun Xiao](http://tianjunxiao.com/). 3 | 4 | ## Introduction 5 | 6 | ![framework](srcs/framework.png) 7 | 8 | OC-MOT is a framework designed to perform multiple object tracking on object-centric representations without object ID labels. It consists of an index-merge module that adapts the object-centric slots into detection outputs and an unsupervised memory module that builds complete object prototypes to handle occlusions. Benefited from object-centric learning, we only requires sparse detection labels for object localization and feature binding. 
Our experiments significantly narrow the gap between existing object-centric models and the fully supervised state of the art, and outperform several unsupervised trackers. 9 | 10 | 11 | ## Development Setup 12 | Installing OC-MOT requires at least Python 3.8. Installation can be done using [poetry](https://python-poetry.org/docs/#installation). After installing `poetry`, check out the repo and set up a development environment: 13 | 14 | ```bash 15 | git clone https://github.com/amazon-science/object-centric-multiple-object-tracking.git 16 | cd object-centric-multiple-object-tracking 17 | poetry install 18 | ``` 19 | 20 | This installs the `ocl` package and the CLI scripts used for running experiments in a poetry-managed virtual environment. Activate the virtual environment with `poetry shell` before running experiments. 21 | 22 | ## Running experiments 23 | 24 | Experiments are defined in the folder `configs/experiment` and can be run 25 | by setting the `experiment` variable. For example, to train and evaluate OC-MOT on the CATER dataset: 26 | 27 | ```bash 28 | poetry run python -m ocl.cli.train +experiment=OC-MOT/cater 29 | poetry run python -m ocl.cli.eval +experiment=OC-MOT/cater_eval 30 | ``` 31 | 32 | Results are saved in a timestamped subdirectory of `outputs/`, e.g. `outputs/OC-MOT/cater/_
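As an illustration of how a finished run can also be inspected programmatically (rather than through `ocl.cli.eval`), the helper `build_from_train_config` from `ocl/cli/eval_utils.py` above can be used roughly as follows. This is a sketch only: it assumes the training schema is registered under the name `training_config` (as the experiment defaults above suggest), that it is run from the repository root, and the checkpoint path is a placeholder.

```python
# Sketch only: compose the same Hydra config an experiment uses, then rebuild the
# datamodule and model from it together with a checkpoint. The config name
# "training_config" and the checkpoint path below are assumptions/placeholders.
from hydra import compose, initialize

from ocl.cli import eval_utils

with initialize(config_path="configs", version_base="1.1"):
    config = compose(config_name="training_config", overrides=["+experiment=OC-MOT/cater"])

datamodule, model, pm = eval_utils.build_from_train_config(
    config,
    checkpoint_path="outputs/OC-MOT/cater/<timestamp>/checkpoints/last.ckpt",  # placeholder
)
```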