├── config ├── loss │ ├── mapping │ │ ├── l1.yaml │ │ ├── l2.yaml │ │ └── huber.yaml │ ├── cc.yaml │ ├── dynamic_area.yaml │ ├── flow_3d.yaml │ ├── tracking_3d.yaml │ └── shape.yaml ├── model │ ├── intrinsics │ │ ├── model.yaml │ │ ├── ground_truth.yaml │ │ ├── regressed.yaml │ │ └── softmin.yaml │ ├── extrinsics │ │ ├── regressed.yaml │ │ ├── procrustes_ransac.yaml │ │ └── procrustes_flow.yaml │ └── backbone │ │ ├── explicit_depth.yaml │ │ ├── midas.yaml │ │ ├── unidepth.yaml │ │ └── nvds_unidepth.yaml ├── flow │ └── gmflow.yaml ├── visualizer │ ├── summary.yaml │ └── trajectory.yaml ├── tracking │ └── cotracker.yaml ├── dataset │ ├── fpha.yaml │ ├── h2o.yaml │ ├── egopat3d.yaml │ ├── pov_surgery.yaml │ ├── epic_kitchen.yaml │ ├── arctic.yaml │ └── hoi4d.yaml ├── datagen_egopat3d.yaml ├── pretrain.yaml ├── datagen_arctic.yaml ├── datagen_fpha.yaml ├── datagen_h2o.yaml ├── datagen_hoi4d.yaml ├── pretrain_eval_h2o.yaml ├── pretrain_eval_hoi4d.yaml ├── pretrain_eval_arctic.yaml ├── pretrain_eval_pov_surgery.yaml ├── datagen_pov_surgery.yaml └── datagen_epic_kitchen.yaml ├── egomono4d ├── repo │ └── gmflow │ │ ├── gmflow │ │ ├── __init__.py │ │ ├── position.py │ │ ├── utils.py │ │ ├── geometry.py │ │ ├── trident_conv.py │ │ └── matching.py │ │ ├── demo │ │ └── davis_breakdance-flare │ │ │ ├── 00000.jpg │ │ │ ├── 00001.jpg │ │ │ └── 00002.jpg │ │ ├── data │ │ └── __init__.py │ │ ├── loss.py │ │ ├── utils │ │ ├── misc.py │ │ ├── utils.py │ │ ├── logger.py │ │ └── dist_utils.py │ │ ├── scripts │ │ ├── submission.sh │ │ ├── demo.sh │ │ ├── evaluate.sh │ │ └── train_gmflow.sh │ │ └── .gitignore ├── model │ ├── backbone │ │ ├── modules │ │ │ ├── __init__.py │ │ │ └── transformer.py │ │ ├── backbone.py │ │ ├── __init__.py │ │ └── backbone_explicit_depth.py │ ├── model_pretrain_cfg.py │ ├── intrinsics │ │ ├── common.py │ │ ├── __init__.py │ │ ├── intrinsics.py │ │ ├── intrinsics_ground_truth.py │ │ ├── intrinsics_regressed.py │ │ └── intrinsics_model.py │ └── extrinsics │ │ ├── __init__.py │ │ ├── extrinsics.py │ │ ├── extrinsics_procrustes_ransac.py │ │ ├── extrinsics_procrustes_flow.py │ │ └── extrinsics_regressed.py ├── visualization │ ├── drawing │ │ ├── __init__.py │ │ ├── coordinate_conversion.py │ │ ├── types.py │ │ ├── points.py │ │ └── lines.py │ ├── __init__.py │ ├── depth.py │ ├── color.py │ └── visualizer.py ├── eval │ ├── __init__.py │ ├── eval_depth.py │ ├── eval_extrinsic.py │ └── eval_pointcloud.py ├── dataset │ ├── data_module_pretrain_cfg.py │ ├── dataset_merged.py │ ├── types.py │ ├── __init__.py │ └── data_module_pretrain.py ├── loss │ ├── mapping │ │ ├── __init__.py │ │ ├── mapping_l1.py │ │ ├── mapping_l2.py │ │ ├── mapping_huber.py │ │ └── mapping.py │ ├── __init__.py │ ├── loss_cc.py │ ├── loss.py │ ├── loss_shape.py │ └── loss_dynamic_area.py ├── frame_sampler │ ├── __init__.py │ ├── frame_sampler.py │ └── frame_sampler_pretrain.py ├── config │ ├── pretrain.py │ ├── tools.py │ └── common.py ├── misc │ ├── nn_module_tools.py │ ├── ate.py │ ├── local_logger.py │ ├── disk_cache.py │ ├── config_tools.py │ ├── wandb_tools.py │ ├── image_io.py │ ├── common_training_setup.py │ ├── depth.py │ ├── fly.py │ └── data_util.py ├── flow │ ├── common.py │ ├── __init__.py │ └── flow_predictor_gmflow.py ├── utils.py ├── tracking │ ├── track_predictor.py │ └── track_predictor_cotracker.py └── datagen.py ├── lightning_logs ├── version_2 │ └── hparams.yaml ├── version_3 │ └── hparams.yaml └── version_4 │ └── hparams.yaml ├── cache.zip ├── assets ├── teaser.png └── vis-result.png ├── examples ├── 
example_hoi4d │ ├── 00140.jpg │ ├── 00144.jpg │ ├── 00148.jpg │ ├── 00152.jpg │ ├── 00156.jpg │ ├── 00160.jpg │ ├── 00164.jpg │ ├── 00168.jpg │ ├── 00172.jpg │ ├── 00176.jpg │ ├── 00180.jpg │ ├── 00184.jpg │ ├── 00188.jpg │ ├── 00192.jpg │ ├── 00196.jpg │ ├── 00200.jpg │ ├── 00204.jpg │ ├── 00208.jpg │ ├── 00212.jpg │ ├── 00216.jpg │ ├── 00220.jpg │ ├── 00224.jpg │ ├── 00228.jpg │ ├── 00232.jpg │ ├── 00236.jpg │ ├── 00240.jpg │ └── 00244.jpg └── example_epic_kitchen │ ├── frame_0000015420.jpg │ ├── frame_0000015421.jpg │ ├── frame_0000015422.jpg │ ├── frame_0000015423.jpg │ ├── frame_0000015424.jpg │ ├── frame_0000015425.jpg │ ├── frame_0000015426.jpg │ ├── frame_0000015427.jpg │ ├── frame_0000015428.jpg │ ├── frame_0000015429.jpg │ ├── frame_0000015430.jpg │ ├── frame_0000015431.jpg │ ├── frame_0000015432.jpg │ ├── frame_0000015433.jpg │ ├── frame_0000015434.jpg │ ├── frame_0000015435.jpg │ ├── frame_0000015436.jpg │ ├── frame_0000015437.jpg │ ├── frame_0000015438.jpg │ ├── frame_0000015439.jpg │ ├── frame_0000015440.jpg │ ├── frame_0000015441.jpg │ ├── frame_0000015442.jpg │ ├── frame_0000015443.jpg │ ├── frame_0000015444.jpg │ ├── frame_0000015445.jpg │ ├── frame_0000015446.jpg │ ├── frame_0000015447.jpg │ ├── frame_0000015448.jpg │ ├── frame_0000015449.jpg │ ├── frame_0000015450.jpg │ ├── frame_0000015451.jpg │ ├── frame_0000015452.jpg │ ├── frame_0000015453.jpg │ ├── frame_0000015454.jpg │ ├── frame_0000015455.jpg │ ├── frame_0000015456.jpg │ ├── frame_0000015457.jpg │ ├── frame_0000015458.jpg │ ├── frame_0000015459.jpg │ └── frame_0000015460.jpg ├── .gitmodules ├── pyproject.toml ├── LICENSE └── .gitignore /config/loss/mapping/l1.yaml: -------------------------------------------------------------------------------- 1 | name: l1 2 | -------------------------------------------------------------------------------- /config/loss/mapping/l2.yaml: -------------------------------------------------------------------------------- 1 | name: l2 2 | -------------------------------------------------------------------------------- /egomono4d/repo/gmflow/gmflow/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /egomono4d/model/backbone/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lightning_logs/version_2/hparams.yaml: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /lightning_logs/version_3/hparams.yaml: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /lightning_logs/version_4/hparams.yaml: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /config/model/intrinsics/model.yaml: -------------------------------------------------------------------------------- 1 | name: model 2 | -------------------------------------------------------------------------------- /config/model/extrinsics/regressed.yaml: -------------------------------------------------------------------------------- 1 | name: regressed 2 | 
-------------------------------------------------------------------------------- /config/loss/cc.yaml: -------------------------------------------------------------------------------- 1 | cc: 2 | weight: 10.0 3 | enable_after: 0 -------------------------------------------------------------------------------- /config/flow/gmflow.yaml: -------------------------------------------------------------------------------- 1 | name: gmflow 2 | 3 | cache_dir: null 4 | -------------------------------------------------------------------------------- /config/loss/mapping/huber.yaml: -------------------------------------------------------------------------------- 1 | name: huber 2 | 3 | delta: 0.01 4 | -------------------------------------------------------------------------------- /config/model/intrinsics/ground_truth.yaml: -------------------------------------------------------------------------------- 1 | name: ground_truth 2 | -------------------------------------------------------------------------------- /config/visualizer/summary.yaml: -------------------------------------------------------------------------------- 1 | summary: 2 | num_vis_frames: 5 3 | -------------------------------------------------------------------------------- /cache.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/cache.zip -------------------------------------------------------------------------------- /config/loss/dynamic_area.yaml: -------------------------------------------------------------------------------- 1 | dynamic_area: 2 | weight: 10.0 3 | enable_after: 0 -------------------------------------------------------------------------------- /assets/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/assets/teaser.png -------------------------------------------------------------------------------- /assets/vis-result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/assets/vis-result.png -------------------------------------------------------------------------------- /config/loss/flow_3d.yaml: -------------------------------------------------------------------------------- 1 | flow_3d: 2 | weight: 10.0 3 | enable_after: 0 4 | # upper_threshold: 0.5 -------------------------------------------------------------------------------- /config/visualizer/trajectory.yaml: -------------------------------------------------------------------------------- 1 | trajectory: 2 | generate_plot: true 3 | ate_save_root: null 4 | -------------------------------------------------------------------------------- /config/loss/tracking_3d.yaml: -------------------------------------------------------------------------------- 1 | tracking_3d: 2 | weight: 10.0 3 | enable_after: 0 4 | # upper_threshold: 0.5 5 | -------------------------------------------------------------------------------- /config/model/extrinsics/procrustes_ransac.yaml: -------------------------------------------------------------------------------- 1 | name: procrustes_ransac 2 | 3 | max_iter: 5 4 | num_points: 4800 5 | -------------------------------------------------------------------------------- /config/tracking/cotracker.yaml: -------------------------------------------------------------------------------- 1 | name: cotracker 2 | 3 | grid_size: 35 4 | similarity_threshold: 
0.75 5 | cache_dir: null -------------------------------------------------------------------------------- /examples/example_hoi4d/00140.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_hoi4d/00140.jpg -------------------------------------------------------------------------------- /examples/example_hoi4d/00144.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_hoi4d/00144.jpg -------------------------------------------------------------------------------- /examples/example_hoi4d/00148.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_hoi4d/00148.jpg -------------------------------------------------------------------------------- /examples/example_hoi4d/00152.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_hoi4d/00152.jpg -------------------------------------------------------------------------------- /examples/example_hoi4d/00156.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_hoi4d/00156.jpg -------------------------------------------------------------------------------- /examples/example_hoi4d/00160.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_hoi4d/00160.jpg -------------------------------------------------------------------------------- /examples/example_hoi4d/00164.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_hoi4d/00164.jpg -------------------------------------------------------------------------------- /examples/example_hoi4d/00168.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_hoi4d/00168.jpg -------------------------------------------------------------------------------- /examples/example_hoi4d/00172.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_hoi4d/00172.jpg -------------------------------------------------------------------------------- /examples/example_hoi4d/00176.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_hoi4d/00176.jpg -------------------------------------------------------------------------------- /examples/example_hoi4d/00180.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_hoi4d/00180.jpg -------------------------------------------------------------------------------- /examples/example_hoi4d/00184.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_hoi4d/00184.jpg 
-------------------------------------------------------------------------------- /examples/example_hoi4d/00188.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_hoi4d/00188.jpg -------------------------------------------------------------------------------- /examples/example_hoi4d/00192.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_hoi4d/00192.jpg -------------------------------------------------------------------------------- /examples/example_hoi4d/00196.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_hoi4d/00196.jpg -------------------------------------------------------------------------------- /examples/example_hoi4d/00200.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_hoi4d/00200.jpg -------------------------------------------------------------------------------- /examples/example_hoi4d/00204.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_hoi4d/00204.jpg -------------------------------------------------------------------------------- /examples/example_hoi4d/00208.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_hoi4d/00208.jpg -------------------------------------------------------------------------------- /examples/example_hoi4d/00212.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_hoi4d/00212.jpg -------------------------------------------------------------------------------- /examples/example_hoi4d/00216.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_hoi4d/00216.jpg -------------------------------------------------------------------------------- /examples/example_hoi4d/00220.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_hoi4d/00220.jpg -------------------------------------------------------------------------------- /examples/example_hoi4d/00224.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_hoi4d/00224.jpg -------------------------------------------------------------------------------- /examples/example_hoi4d/00228.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_hoi4d/00228.jpg -------------------------------------------------------------------------------- /examples/example_hoi4d/00232.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_hoi4d/00232.jpg 
-------------------------------------------------------------------------------- /examples/example_hoi4d/00236.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_hoi4d/00236.jpg -------------------------------------------------------------------------------- /examples/example_hoi4d/00240.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_hoi4d/00240.jpg -------------------------------------------------------------------------------- /examples/example_hoi4d/00244.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_hoi4d/00244.jpg -------------------------------------------------------------------------------- /config/model/backbone/explicit_depth.yaml: -------------------------------------------------------------------------------- 1 | name: explicit_depth 2 | 3 | initial_depth: 0.1 4 | weight_sensitivity: 100.0 5 | -------------------------------------------------------------------------------- /config/model/extrinsics/procrustes_flow.yaml: -------------------------------------------------------------------------------- 1 | name: procrustes_flow 2 | 3 | num_points: 1000 4 | randomize_points: false 5 | -------------------------------------------------------------------------------- /egomono4d/visualization/drawing/__init__.py: -------------------------------------------------------------------------------- 1 | from .lines import draw_lines as draw_lines 2 | from .points import draw_points as draw_points 3 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "flowcam/third_party/gmflow"] 2 | path = flowmap/third_party/gmflow 3 | url = https://github.com/haofeixu/gmflow.git 4 | -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015420.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015420.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015421.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015421.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015422.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015422.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015423.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015423.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015424.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015424.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015425.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015425.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015426.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015426.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015427.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015427.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015428.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015428.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015429.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015429.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015430.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015430.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015431.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015431.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015432.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015432.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015433.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015433.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015434.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015434.jpg 
-------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015435.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015435.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015436.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015436.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015437.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015437.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015438.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015438.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015439.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015439.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015440.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015440.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015441.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015441.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015442.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015442.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015443.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015443.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015444.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015444.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015445.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015445.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015446.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015446.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015447.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015447.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015448.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015448.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015449.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015449.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015450.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015450.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015451.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015451.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015452.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015452.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015453.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015453.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015454.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015454.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015455.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015455.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015456.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015456.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015457.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015457.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015458.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015458.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015459.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015459.jpg -------------------------------------------------------------------------------- /examples/example_epic_kitchen/frame_0000015460.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/examples/example_epic_kitchen/frame_0000015460.jpg -------------------------------------------------------------------------------- /config/loss/shape.yaml: -------------------------------------------------------------------------------- 1 | shape: 2 | weight: 10.0 3 | enable_after: 0 4 | dynamic_coef: 1.0 5 | decay_end_epochs: -1 # [-1] close 6 | decay_low_weight: 0.0 -------------------------------------------------------------------------------- /egomono4d/repo/gmflow/demo/davis_breakdance-flare/00000.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/egomono4d/repo/gmflow/demo/davis_breakdance-flare/00000.jpg -------------------------------------------------------------------------------- /egomono4d/repo/gmflow/demo/davis_breakdance-flare/00001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/egomono4d/repo/gmflow/demo/davis_breakdance-flare/00001.jpg -------------------------------------------------------------------------------- /egomono4d/repo/gmflow/demo/davis_breakdance-flare/00002.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelyuancb/egomono4d/HEAD/egomono4d/repo/gmflow/demo/davis_breakdance-flare/00002.jpg -------------------------------------------------------------------------------- /egomono4d/model/model_pretrain_cfg.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | @dataclass 4 | class ModelWrapperPretrainCfg: 5 | lr: float = 5e-5 6 | cache_track: bool = "" 7 | 8 | -------------------------------------------------------------------------------- /config/model/backbone/midas.yaml: -------------------------------------------------------------------------------- 1 | name: midas 2 | 3 | pretrained: true 4 | weight_sensitivity: null 5 | mapping: original 6 | model: MiDaS_small 7 | local_dir: 
/home/ycb/hub/torch/hub/intel-isl_MiDaS_master -------------------------------------------------------------------------------- /config/model/intrinsics/regressed.yaml: -------------------------------------------------------------------------------- 1 | name: regressed 2 | 3 | # This is roughly in the middle of the focal length distribution for Tanks & Temples, 4 | # LLFF, and MipNeRF 360. 5 | initial_focal_length: 0.85 6 | -------------------------------------------------------------------------------- /config/model/backbone/unidepth.yaml: -------------------------------------------------------------------------------- 1 | name: unidepth 2 | 3 | cache_dir: null 4 | estimator: unidepth_v2_large # unidepth_v2_[large, small] 5 | finetune_head: true # whether to only finetune dpt head of depth-anything-v2 -------------------------------------------------------------------------------- /egomono4d/eval/__init__.py: -------------------------------------------------------------------------------- 1 | from .eval_depth import eval_depth_conductor 2 | from .eval_extrinsic import eval_extrinsic_conductor 3 | from .eval_track import eval_track_conductor 4 | from .eval_track_hoi import eval_track_hoi_conductor 5 | from .eval_pointcloud import eval_pointcloud_conductor -------------------------------------------------------------------------------- /egomono4d/repo/gmflow/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .datasets import build_train_dataset 2 | from .datasets import (FlyingChairs, 3 | FlyingThings3D, 4 | MpiSintel, 5 | KITTI, 6 | HD1K, 7 | ) 8 | -------------------------------------------------------------------------------- /egomono4d/dataset/data_module_pretrain_cfg.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from dataclasses import dataclass 3 | 4 | @dataclass 5 | class DataLoaderStageCfg: 6 | batch_size: int = 1 7 | num_workers: int = 1 8 | persistent_workers: bool = True 9 | seed: Optional[int] = None 10 | 11 | 12 | @dataclass 13 | class DataModulePretrainCfg: 14 | train: DataLoaderStageCfg 15 | val: DataLoaderStageCfg -------------------------------------------------------------------------------- /config/model/intrinsics/softmin.yaml: -------------------------------------------------------------------------------- 1 | name: softmin 2 | 3 | num_procrustes_points: 8192 4 | 5 | min_focal_length: 0.5 6 | max_focal_length: 2.0 7 | num_candidates: 60 8 | 9 | # If this is non-null, the intrinsics will be regressed after the specified number of 10 | # steps. The initial regressed value will be the mean of the last n non-regressed 11 | # intrinsics estimates, where n is window. 
12 | regression: 13 | after_step: 1000 14 | window: 100 15 | -------------------------------------------------------------------------------- /config/model/backbone/nvds_unidepth.yaml: -------------------------------------------------------------------------------- 1 | name: nvds_unidepth 2 | 3 | cache_dir: null 4 | estimator: unidepth_v2_large # unidepth_v2_[large, small] 5 | finetune_head: true # whether to only finetune dpt head of depth-anything-v2 6 | 7 | unet_num: 1 8 | unet_channels: [256, 256, 384, 384] # torch.Size([20, 42, 56, 1024]) 9 | unet_kernel_size: 3 10 | unet_groups: 1 11 | 12 | transformer_depth: 2 13 | transformer_heads: 4 14 | transformer_dim_head: 64 15 | transformer_mlp_dim: 256 16 | -------------------------------------------------------------------------------- /egomono4d/loss/mapping/__init__.py: -------------------------------------------------------------------------------- 1 | from .mapping import Mapping 2 | from .mapping_huber import MappingHuber, MappingHuberCfg 3 | from .mapping_l1 import MappingL1, MappingL1Cfg 4 | from .mapping_l2 import MappingL2, MappingL2Cfg 5 | 6 | MAPPINGS = { 7 | "huber": MappingHuber, 8 | "l1": MappingL1, 9 | "l2": MappingL2, 10 | } 11 | 12 | MappingCfg = MappingHuberCfg | MappingL1Cfg | MappingL2Cfg 13 | 14 | 15 | def get_mapping(cfg: MappingCfg) -> Mapping: 16 | return MAPPINGS[cfg.name](cfg) 17 | -------------------------------------------------------------------------------- /egomono4d/loss/mapping/mapping_l1.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Literal 3 | 4 | from jaxtyping import Float 5 | from torch import Tensor 6 | 7 | from .mapping import Mapping 8 | 9 | 10 | @dataclass 11 | class MappingL1Cfg: 12 | name: Literal["l1"] 13 | 14 | 15 | class MappingL1(Mapping[MappingL1Cfg]): 16 | def forward_undistorted( 17 | self, 18 | delta: Float[Tensor, "*batch 2"], 19 | ) -> Float[Tensor, " *batch"]: 20 | return delta.norm(dim=-1) 21 | -------------------------------------------------------------------------------- /egomono4d/frame_sampler/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from .frame_sampler import FrameSampler 4 | from .frame_sampler_pretrain import FrameSamplerPretrainNeighbor, FrameSamplerPretrainInterval 5 | 6 | FRAME_SAMPLER = { 7 | "pretrain_neighbor": FrameSamplerPretrainNeighbor, # pick num_frames neighborhood 8 | "pretrain_interval": FrameSamplerPretrainInterval, # pick random index (with random interval) 9 | } 10 | 11 | 12 | def get_frame_sampler(fs_name, num_frames, stage) -> FrameSampler[Any]: 13 | return FRAME_SAMPLER[fs_name](num_frames, stage) 14 | -------------------------------------------------------------------------------- /egomono4d/loss/mapping/mapping_l2.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Literal 3 | 4 | from jaxtyping import Float 5 | from torch import Tensor 6 | 7 | from .mapping import Mapping 8 | 9 | 10 | @dataclass 11 | class MappingL2Cfg: 12 | name: Literal["l2"] 13 | 14 | 15 | class MappingL2(Mapping[MappingL2Cfg]): 16 | def forward_undistorted( 17 | self, 18 | delta: Float[Tensor, "*batch 2"], 19 | ) -> Float[Tensor, " *batch"]: 20 | # Multiply by 0.5 to match torch.nn.functional.huber_loss. 
21 | return 0.5 * (delta * delta).sum(dim=-1) 22 | -------------------------------------------------------------------------------- /config/dataset/fpha.yaml: -------------------------------------------------------------------------------- 1 | fpha: 2 | # Common configuration items (all datasets have these) 3 | scene: null 4 | cache_dir: null 5 | resize_shape: null 6 | patch_size: null 7 | num_frames: null 8 | all_frames: false 9 | use_gt_depth: false 10 | 11 | mask_estimation: ['egohos'] 12 | mask_flow_model: null 13 | mask_binary_open_value: null 14 | 15 | frame_sampler: pretrain_interval 16 | frame_max_interval: 4 17 | 18 | # Dataset-specific configuration items 19 | clip_frame: 20 20 | original_base_root: ./cache/original_datasets/FPHA 21 | pre_save_root: ./cache/processed_datasets -------------------------------------------------------------------------------- /egomono4d/config/pretrain.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from ..misc.data_util import PreProcessingCfg 4 | from ..dataset.data_module_pretrain_cfg import DataModulePretrainCfg 5 | from ..model.model_pretrain_cfg import ModelWrapperPretrainCfg 6 | from .common import CommonCfg 7 | 8 | 9 | @dataclass 10 | class StageCfg: 11 | batch_size: int = 1 12 | num_workers: int = 1 13 | 14 | 15 | @dataclass 16 | class PretrainCfg(CommonCfg): 17 | model_wrapper: ModelWrapperPretrainCfg = None 18 | data_module: DataModulePretrainCfg = None 19 | preprocess: PreProcessingCfg = None 20 | -------------------------------------------------------------------------------- /config/dataset/h2o.yaml: -------------------------------------------------------------------------------- 1 | h2o: 2 | # Common configuration items (all datasets have these) 3 | scene: null 4 | cache_dir: null 5 | resize_shape: null 6 | patch_size: null 7 | num_frames: null 8 | all_frames: false 9 | use_gt_depth: false 10 | 11 | mask_estimation: ['egohos'] 12 | mask_flow_model: null 13 | mask_binary_open_value: null 14 | 15 | frame_sampler: pretrain_interval 16 | frame_max_interval: 4 17 | 18 | # Dataset-specific configuration items 19 | clip_frame: 20 20 | original_base_root: ./cache/original_datasets/H2O/downloads 21 | pre_save_root: ./cache/processed_datasets -------------------------------------------------------------------------------- /egomono4d/visualization/__init__.py: -------------------------------------------------------------------------------- 1 | from .visualizer import Visualizer 2 | from .visualizer_summary import VisualizerSummary, VisualizerSummaryCfg 3 | from .visualizer_trajectory import VisualizerTrajectory, VisualizerTrajectoryCfg 4 | from .visualizer_cotracker import VisualizerCoTracker 5 | 6 | VISUALIZERS = { 7 | "summary": VisualizerSummary, 8 | "trajectory": VisualizerTrajectory, 9 | } 10 | 11 | VisualizerCfg = VisualizerSummaryCfg | VisualizerTrajectoryCfg 12 | 13 | 14 | def get_visualizers(cfgs: list[VisualizerCfg]) -> list[Visualizer]: 15 | return [VISUALIZERS[cfg.name](cfg) for cfg in cfgs] 16 | -------------------------------------------------------------------------------- /config/dataset/egopat3d.yaml: -------------------------------------------------------------------------------- 1 | egopat3d: 2 | # Common configuration items (all datasets have these) 3 | scene: null 4 | cache_dir: null 5 | resize_shape: null 6 | patch_size: null 7 | num_frames: null 8 | all_frames: false 9 | use_gt_depth: false 10 | 11 | mask_estimation: ['egohos'] 12 | 
mask_flow_model: null 13 | mask_binary_open_value: null 14 | 15 | frame_sampler: pretrain_interval 16 | frame_max_interval: 4 17 | 18 | # Dataset-specific configuration items 19 | clip_frame: 20 20 | original_base_root: ./cache/original_datasets/EgoPAT3D 21 | pre_save_root: ./cache/processed_datasets -------------------------------------------------------------------------------- /config/dataset/pov_surgery.yaml: -------------------------------------------------------------------------------- 1 | pov_surgery: 2 | # Common configuration items (all datasets have these) 3 | scene: null 4 | cache_dir: null 5 | resize_shape: null 6 | patch_size: null 7 | num_frames: null 8 | all_frames: false 9 | use_gt_depth: false 10 | 11 | mask_estimation: ['egohos'] 12 | mask_flow_model: null 13 | mask_binary_open_value: null 14 | 15 | frame_sampler: pretrain_interval 16 | frame_max_interval: 4 17 | 18 | # Dataset-specific configuration items 19 | clip_frame: 40 20 | original_base_root: ./cache/original_datasets/POV_Surgery/POV_Surgery_data 21 | pre_save_root: ./cache/processed_datasets -------------------------------------------------------------------------------- /egomono4d/misc/nn_module_tools.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | 4 | def convert_to_buffer(module: nn.Module, persistent: bool = True): 5 | # Recurse over child modules. 6 | for name, child in list(module.named_children()): 7 | convert_to_buffer(child, persistent) 8 | 9 | # Also re-save buffers to change persistence. 10 | for name, parameter_or_buffer in ( 11 | *module.named_parameters(recurse=False), 12 | *module.named_buffers(recurse=False), 13 | ): 14 | value = parameter_or_buffer.detach().clone() 15 | delattr(module, name) 16 | module.register_buffer(name, value, persistent=persistent) 17 | -------------------------------------------------------------------------------- /egomono4d/flow/common.py: -------------------------------------------------------------------------------- 1 | from einops import rearrange 2 | from jaxtyping import Float 3 | from torch import Tensor 4 | 5 | 6 | def split_videos( 7 | videos: Float[Tensor, "batch frame 3 height width"], 8 | ) -> tuple[ 9 | Float[Tensor, "batch*(frame-1) 3 height width"], # source (flattened batch dims) 10 | Float[Tensor, "batch*(frame-1) 3 height width"], # target (flattened batch dims) 11 | int, # batch 12 | int, # frame 13 | ]: 14 | b, f, _, _, _ = videos.shape 15 | return ( 16 | rearrange(videos[:, :-1], "b f c h w -> (b f) c h w"), 17 | rearrange(videos[:, 1:], "b f c h w -> (b f) c h w"), 18 | b, 19 | f, 20 | ) 21 | -------------------------------------------------------------------------------- /config/dataset/epic_kitchen.yaml: -------------------------------------------------------------------------------- 1 | epic_kitchen: 2 | # Common configuration items (all datasets have these) 3 | scene: null 4 | cache_dir: null 5 | resize_shape: null 6 | patch_size: null 7 | num_frames: null 8 | all_frames: false 9 | use_gt_depth: false 10 | 11 | mask_estimation: null 12 | mask_flow_model: null 13 | mask_binary_open_value: null 14 | 15 | frame_sampler: pretrain_interval 16 | frame_max_interval: 4 17 | 18 | # Dataset-specific configuration items 19 | max_clip_per_video: 1200 20 | clip_frame: 20 21 | original_base_root: ./cache/original_datasets/EpicKitchen/EPIC-KITCHENS 22 | intrinsic_root: none 23 | pre_save_root: ./cache/processed_datasets 
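As a minimal, hypothetical sketch (not a file from this repository — the project's own config loading under egomono4d/config/tools.py may differ), the dataset configs above can be read and queried with OmegaConf; each one shares a common block of keys (scene, cache_dir, resize_shape, patch_size, num_frames, mask and frame-sampler settings) plus a few dataset-specific keys:

from omegaconf import OmegaConf  # assumed dependency, used here only for illustration

# Load one of the dataset configs shown above (path relative to the repository root).
cfg = OmegaConf.load("config/dataset/epic_kitchen.yaml")
ek = cfg.epic_kitchen

# Keys common to every dataset config.
print(ek.frame_sampler, ek.frame_max_interval)  # pretrain_interval 4

# Dataset-specific keys.
print(ek.clip_frame, ek.pre_save_root)          # 20 ./cache/processed_datasets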
-------------------------------------------------------------------------------- /config/dataset/arctic.yaml: -------------------------------------------------------------------------------- 1 | arctic: 2 | # Common configuration items (all datasets have these) 3 | scene: null 4 | cache_dir: null 5 | resize_shape: null 6 | patch_size: null 7 | num_frames: null 8 | all_frames: false 9 | use_gt_depth: false 10 | 11 | mask_estimation: ['egohos'] 12 | mask_flow_model: null 13 | mask_binary_open_value: null 14 | 15 | frame_sampler: pretrain_interval 16 | frame_max_interval: 4 17 | 18 | # Dataset-specific configuration items 19 | clip_frame: 40 20 | original_data_root: ./cache/original_datasets/ARCTIC/arctic/data/ 21 | original_render_root: ./cache/original_datasets/ARCTIC/arctic/render_out 22 | pre_save_root: ./cache/processed_datasets -------------------------------------------------------------------------------- /egomono4d/model/intrinsics/common.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from jaxtyping import Float 3 | from torch import Tensor 4 | 5 | 6 | def focal_lengths_to_intrinsics( 7 | focal_lengths: Float[Tensor, " *batch"], 8 | image_shape: tuple[int, int], 9 | ) -> Float[Tensor, "*batch 3 3"]: 10 | device = focal_lengths.device 11 | h, w = image_shape 12 | focal_lengths = focal_lengths * (h * w) ** 0.5 13 | 14 | intrinsics = torch.eye(3, dtype=torch.float32, device=device) 15 | intrinsics[:2, 2] = 0.5 16 | intrinsics = intrinsics.broadcast_to((*focal_lengths.shape, 3, 3)).contiguous() 17 | intrinsics[..., 0, 0] = focal_lengths / w # fx 18 | intrinsics[..., 1, 1] = focal_lengths / h # fy 19 | 20 | return intrinsics 21 | -------------------------------------------------------------------------------- /egomono4d/loss/__init__.py: -------------------------------------------------------------------------------- 1 | from .loss import Loss 2 | from .loss_dynamic_area import LossDynamicArea, LossDynamicAreaCfg 3 | from .loss_shape import LossShape, LossShapeCfg #, loss_shape_func 4 | from .loss_flow_3d import LossFlow3D, LossFlow3DCfg #, loss_flow_3d_func 5 | from .loss_tracking_3d import LossTracking3D, LossTracking3DCfg #, loss_tracking_3d_func 6 | from .loss_cc import LossCC, LossCCCfg 7 | 8 | LOSSES = { 9 | "dynamic_area": LossDynamicArea, 10 | "tracking_3d": LossTracking3D, 11 | "flow_3d": LossFlow3D, 12 | "shape": LossShape, 13 | "cc": LossCC, 14 | } 15 | 16 | LossCfg = LossDynamicAreaCfg | LossTracking3DCfg | LossCCCfg | LossShapeCfg | LossFlow3DCfg 17 | 18 | 19 | def get_losses(cfgs: list[LossCfg]) -> list[Loss]: 20 | return [LOSSES[cfg.name](cfg) for cfg in cfgs] 21 | -------------------------------------------------------------------------------- /egomono4d/model/extrinsics/__init__.py: -------------------------------------------------------------------------------- 1 | from .extrinsics import Extrinsics 2 | from .extrinsics_procrustes_flow import ExtrinsicsProcrustesFlow, ExtrinsicsProcrustesFlowCfg 3 | from .extrinsics_regressed import ExtrinsicsRegressed, ExtrinsicsRegressedCfg 4 | from .extrinsics_procrustes_ransac import ExtrinsicsProcrustesRANSAC, ExtrinsicsProcrustesRANSACCfg 5 | 6 | EXTRINSICS = { 7 | "regressed": ExtrinsicsRegressed, 8 | "procrustes_flow": ExtrinsicsProcrustesFlow, 9 | "procrustes_ransac": ExtrinsicsProcrustesRANSAC 10 | } 11 | 12 | ExtrinsicsCfg = ExtrinsicsRegressedCfg | ExtrinsicsProcrustesFlowCfg | ExtrinsicsProcrustesRANSACCfg 13 | 14 | 15 | def get_extrinsics( 16 | cfg: ExtrinsicsCfg, 
17 | num_frames: int | None, 18 | ) -> Extrinsics: 19 | return EXTRINSICS[cfg.name](cfg, num_frames) 20 | -------------------------------------------------------------------------------- /egomono4d/model/intrinsics/__init__.py: -------------------------------------------------------------------------------- 1 | from .intrinsics import Intrinsics 2 | from .intrinsics_ground_truth import IntrinsicsGroundTruth, IntrinsicsGroundTruthCfg 3 | from .intrinsics_regressed import IntrinsicsRegressed, IntrinsicsRegressedCfg 4 | from .intrinsics_softmin import IntrinsicsSoftmin, IntrinsicsSoftminCfg 5 | from .intrinsics_model import IntrinsicsModel, IntrinsicsModelCfg 6 | 7 | INTRINSICS = { 8 | "ground_truth": IntrinsicsGroundTruth, 9 | "regressed": IntrinsicsRegressed, 10 | "softmin": IntrinsicsSoftmin, 11 | "model": IntrinsicsModel 12 | } 13 | 14 | IntrinsicsCfg = IntrinsicsRegressedCfg | IntrinsicsGroundTruthCfg | IntrinsicsSoftminCfg | \ 15 | IntrinsicsModelCfg 16 | 17 | 18 | def get_intrinsics(cfg: IntrinsicsCfg) -> Intrinsics: 19 | return INTRINSICS[cfg.name](cfg) 20 | -------------------------------------------------------------------------------- /egomono4d/model/intrinsics/intrinsics.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Generic, TypeVar 3 | 4 | from jaxtyping import Float 5 | from torch import Tensor, nn 6 | 7 | from ...dataset.types import Batch 8 | from ...flow.flow_predictor import Flows 9 | from ...tracking.track_predictor import Tracks 10 | from ..backbone.backbone import BackboneOutput 11 | 12 | T = TypeVar("T") 13 | 14 | 15 | class Intrinsics(nn.Module, ABC, Generic[T]): 16 | cfg: T 17 | 18 | def __init__(self, cfg: T) -> None: 19 | super().__init__() 20 | self.cfg = cfg 21 | 22 | @abstractmethod 23 | def forward( 24 | self, 25 | batch: Batch, 26 | flows: Flows | list[Tracks], 27 | backbone_output: BackboneOutput, 28 | global_step: int, 29 | ) -> Float[Tensor, "batch frame 3 3"]: 30 | pass 31 | -------------------------------------------------------------------------------- /egomono4d/visualization/depth.py: -------------------------------------------------------------------------------- 1 | from jaxtyping import Float 2 | from torch import Tensor 3 | 4 | from .color import apply_color_map_to_image 5 | 6 | 7 | def color_map_depth( 8 | depth: Float[Tensor, "batch height width"], 9 | cmap: str = "inferno", 10 | invert: bool = True, 11 | log_first: bool = False 12 | ) -> Float[Tensor, "batch 3 height width"]: 13 | mask = (depth == 0) 14 | if log_first is True: 15 | # For depth estimation, we first take the log for convenient visualization. 16 | depth = depth.log() 17 | # Normalize the depth. 
18 | far = depth.max() 19 | depth = depth + mask * 1e9 20 | near = depth.min() 21 | depth = (depth - near) / (far - near) 22 | depth = depth.clip(min=0, max=1) 23 | depth[mask] = 0 24 | if invert: 25 | depth = 1 - depth 26 | return apply_color_map_to_image(depth, cmap) 27 | -------------------------------------------------------------------------------- /egomono4d/misc/ate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from jaxtyping import Float 3 | from scipy import spatial 4 | from torch import Tensor 5 | 6 | 7 | def compute_ate( 8 | gt: Float[Tensor, "point 3"], 9 | predicted: Float[Tensor, "point 3"], 10 | ) -> tuple[ 11 | Float[Tensor, ""], # ate 12 | Float[Tensor, "point 3"], # aligned gt 13 | Float[Tensor, "point 3"], # aligned predicted 14 | ]: 15 | aligned_gt, aligned_predicted, _ = spatial.procrustes( 16 | gt.detach().cpu().numpy(), 17 | predicted.cpu().numpy(), 18 | ) 19 | aligned_gt = torch.tensor(aligned_gt, dtype=torch.float32, device=gt.device) 20 | aligned_predicted = torch.tensor( 21 | aligned_predicted, dtype=torch.float32, device=predicted.device 22 | ) 23 | 24 | ate = ((aligned_gt - aligned_predicted) ** 2).mean() ** 0.5 25 | return ate, aligned_gt, aligned_predicted 26 | -------------------------------------------------------------------------------- /egomono4d/model/intrinsics/intrinsics_ground_truth.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Literal 3 | 4 | from jaxtyping import Float 5 | from torch import Tensor 6 | 7 | from ...dataset.types import Batch 8 | from ...flow.flow_predictor import Flows 9 | from ..backbone.backbone import BackboneOutput 10 | from .intrinsics import Intrinsics 11 | from ...tracking.track_predictor import Tracks 12 | 13 | 14 | @dataclass 15 | class IntrinsicsGroundTruthCfg: 16 | name: Literal["ground_truth"] 17 | 18 | 19 | class IntrinsicsGroundTruth(Intrinsics[IntrinsicsGroundTruthCfg]): 20 | def forward( 21 | self, 22 | batch: Batch, 23 | flows: Flows | list[Tracks], 24 | backbone_output: BackboneOutput, 25 | global_step: int, 26 | ) -> Float[Tensor, "batch frame 3 3"]: 27 | # Just return the ground-truth intrinsics. 28 | return batch.intrinsics 29 | -------------------------------------------------------------------------------- /egomono4d/visualization/color.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from einops import rearrange 3 | from jaxtyping import Float 4 | from matplotlib import cm 5 | from torch import Tensor 6 | 7 | 8 | def apply_color_map( 9 | x: Float[Tensor, " *batch"], 10 | color_map: str = "inferno", 11 | ) -> Float[Tensor, "*batch 3"]: 12 | cmap = cm.get_cmap(color_map) 13 | 14 | # Convert to NumPy so that Matplotlib color maps can be used. 15 | mapped = cmap(x.detach().clip(min=0, max=1).cpu().numpy())[..., :3] 16 | 17 | # Convert back to the original format. 18 | return torch.tensor(mapped, device=x.device, dtype=torch.float32) 19 | 20 | 21 | def apply_color_map_to_image( 22 | image: Float[Tensor, "*batch height width"], 23 | color_map: str = "inferno", 24 | ) -> Float[Tensor, "*batch 3 height width"]: 25 | image = apply_color_map(image, color_map) 26 | return rearrange(image, "... h w c -> ... 
c h w") 27 | -------------------------------------------------------------------------------- /egomono4d/model/extrinsics/extrinsics.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Generic, TypeVar 3 | 4 | from jaxtyping import Float 5 | from torch import Tensor, nn 6 | 7 | from ...dataset.types import Batch 8 | from ...flow.flow_predictor import Flows 9 | from ..backbone.backbone import BackboneOutput 10 | 11 | T = TypeVar("T") 12 | 13 | 14 | class Extrinsics(nn.Module, ABC, Generic[T]): 15 | cfg: T 16 | num_frames: int | None 17 | 18 | def __init__(self, cfg: T, num_frames: int | None) -> None: 19 | super().__init__() 20 | self.cfg = cfg 21 | self.num_frames = num_frames 22 | 23 | @abstractmethod 24 | def forward( 25 | self, 26 | batch: Batch, 27 | flows: Flows, 28 | backbone_output: BackboneOutput, 29 | surfaces: Float[Tensor, "batch frame height width 3"], 30 | ) -> Float[Tensor, "batch frame 4 4"]: 31 | pass 32 | -------------------------------------------------------------------------------- /egomono4d/frame_sampler/frame_sampler.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Generic, TypeVar 3 | 4 | import torch 5 | from jaxtyping import Int64 6 | from torch import Tensor 7 | 8 | T = TypeVar("T") 9 | 10 | 11 | class FrameSampler(ABC, Generic[T]): 12 | """A frame sampler picks the frames that should be sampled from a dataset's video. 13 | It makes sense to break the logic for frame sampling into an interface because 14 | pre-training and fine-tuning require different frame sampling strategies (generally, 15 | whole video vs. batch of video segments of same length). 16 | """ 17 | 18 | 19 | def __init__(self, num_frames, stage) -> None: 20 | self.num_frames = num_frames 21 | self.stage = stage 22 | 23 | @abstractmethod 24 | def sample( 25 | self, 26 | num_frames_in_video: int, 27 | device: torch.device, 28 | ) -> Int64[Tensor, " frame"]: # frame indices 29 | pass 30 | -------------------------------------------------------------------------------- /egomono4d/loss/mapping/mapping_huber.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Literal 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | from jaxtyping import Float 7 | from torch import Tensor 8 | 9 | from .mapping import Mapping 10 | 11 | 12 | @dataclass 13 | class MappingHuberCfg: 14 | name: Literal["huber"] 15 | delta: float 16 | 17 | 18 | class MappingHuber(Mapping[MappingHuberCfg]): 19 | def forward_undistorted( 20 | self, 21 | delta: Float[Tensor, "*batch 2"], 22 | ) -> Float[Tensor, " *batch"]: 23 | norm = delta.norm(dim=-1) 24 | 25 | mapped = F.huber_loss( 26 | norm, 27 | torch.zeros_like(norm), 28 | reduction="none", 29 | delta=self.cfg.delta, 30 | ) 31 | 32 | # Divide by the delta so that the gradient magnitude in the linear region 33 | # matches that of a regular L1 loss. 
34 | return mapped / self.cfg.delta 35 | -------------------------------------------------------------------------------- /egomono4d/visualization/visualizer.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Generic, TypeVar 3 | 4 | from jaxtyping import Float 5 | from torch import Tensor 6 | 7 | from ..dataset.types import Batch 8 | from ..flow import Flows 9 | from ..model.model import Model, ModelOutput 10 | from ..tracking import Tracks 11 | 12 | T = TypeVar("T") 13 | 14 | 15 | class Visualizer(ABC, Generic[T]): 16 | cfg: T 17 | 18 | def __init__(self, cfg: T) -> None: 19 | super().__init__() 20 | self.cfg = cfg 21 | self.select_indices = None 22 | self.select_scenes = None 23 | 24 | @abstractmethod 25 | def visualize( 26 | self, 27 | batch: Batch, 28 | flows: Flows, 29 | tracks: list[Tracks] | None, 30 | model_output: ModelOutput, 31 | model: Model, 32 | global_step: int, 33 | current_epoch: int 34 | ) -> dict[str, Float[Tensor, "3 _ _"] | Float[Tensor, ""]]: 35 | pass 36 | -------------------------------------------------------------------------------- /egomono4d/flow/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from ..dataset.types import Batch 4 | from ..misc.nn_module_tools import convert_to_buffer 5 | from .flow_predictor import FlowPredictor, Flows 6 | from .flow_predictor_gmflow import FlowPredictorGMFlow, FlowPredictorGMFlowCfg 7 | 8 | FLOW_PREDICTORS = { 9 | "gmflow": FlowPredictorGMFlow, 10 | } 11 | 12 | FlowPredictorCfg = FlowPredictorGMFlowCfg 13 | 14 | 15 | def get_flow_predictor(cfg: FlowPredictorCfg) -> FlowPredictor: 16 | flow_predictor = FLOW_PREDICTORS[cfg.name](cfg) 17 | convert_to_buffer(flow_predictor, persistent=False) 18 | return flow_predictor 19 | 20 | 21 | @torch.no_grad() 22 | def compute_flows( 23 | batch: Batch, 24 | flow_shape: tuple[int, int], 25 | device: torch.device, 26 | cfg: FlowPredictorCfg, 27 | ) -> Flows: 28 | print("Precomputing optical flow.") 29 | flow_predictor = get_flow_predictor(cfg) 30 | flow_predictor.to(device) 31 | return flow_predictor.compute_bidirectional_flow(batch.to(device), flow_shape) 32 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff] 2 | # Enable Pyflakes `E` and `F` codes by default. 3 | select = ["E", "F", "I"] 4 | ignore = ["F722"] # Ignore F722 for jaxtyping compatibility. 5 | 6 | # Allow autofix for all enabled rules (when `--fix`) is provided. 7 | fixable = ["A", "B", "C", "D", "E", "F", "I"] 8 | unfixable = [] 9 | 10 | # Exclude a variety of commonly ignored directories. 11 | exclude = [ 12 | ".bzr", 13 | ".direnv", 14 | ".eggs", 15 | ".git", 16 | ".hg", 17 | ".mypy_cache", 18 | ".nox", 19 | ".pants.d", 20 | ".ruff_cache", 21 | ".svn", 22 | ".tox", 23 | ".venv", 24 | "__pypackages__", 25 | "_build", 26 | "buck-out", 27 | "build", 28 | "dist", 29 | "node_modules", 30 | "venv", 31 | ] 32 | per-file-ignores = {} 33 | 34 | # Same as Black. 35 | line-length = 88 36 | 37 | # Allow unused variables when underscore-prefixed. 38 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 39 | 40 | # Assume Python 3.10. 41 | target-version = "py310" 42 | 43 | [tool.ruff.mccabe] 44 | # Unlike Flake8, default to a complexity level of 10. 
45 | max-complexity = 10 46 | -------------------------------------------------------------------------------- /config/dataset/hoi4d.yaml: -------------------------------------------------------------------------------- 1 | hoi4d: 2 | # Common configuration items (all datasets have these) 3 | scene: null 4 | cache_dir: null 5 | resize_shape: null 6 | patch_size: null 7 | num_frames: null 8 | all_frames: false 9 | use_gt_depth: false 10 | 11 | mask_estimation: null 12 | mask_flow_model: null 13 | mask_binary_open_value: null 14 | 15 | frame_sampler: pretrain_interval 16 | frame_max_interval: 4 17 | 18 | # Dataset-specific configuration items 19 | mask_blur_radius: 10.0 20 | clip_t: 2.0 # duration of each clip (sec) 21 | clip_interval: 1.0 # interval of each clip (sec) 22 | clip_max_n: 20 # max number of frame of each clip 23 | meta_file: ./cache/original_datasets/HOI4D/hoi4d_release.txt 24 | rgb_root: ./cache/original_datasets/HOI4D/HOI4D_release 25 | depth_root: ./cache/original_datasets/HOI4D/HOI4D_depth_video 26 | anno_root: ./cache/original_datasets/HOI4D/HOI4D_annotations 27 | cam_root: ./cache/original_datasets/HOI4D/camera_params 28 | pre_save_root: ./cache/processed_datasets 29 | -------------------------------------------------------------------------------- /egomono4d/repo/gmflow/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def flow_loss_func(flow_preds, flow_gt, valid, 5 | gamma=0.9, 6 | max_flow=400, 7 | **kwargs, 8 | ): 9 | n_predictions = len(flow_preds) 10 | flow_loss = 0.0 11 | 12 | # exlude invalid pixels and extremely large diplacements 13 | mag = torch.sum(flow_gt ** 2, dim=1).sqrt() # [B, H, W] 14 | valid = (valid >= 0.5) & (mag < max_flow) 15 | 16 | for i in range(n_predictions): 17 | i_weight = gamma ** (n_predictions - i - 1) 18 | 19 | i_loss = (flow_preds[i] - flow_gt).abs() 20 | 21 | flow_loss += i_weight * (valid[:, None] * i_loss).mean() 22 | 23 | epe = torch.sum((flow_preds[-1] - flow_gt) ** 2, dim=1).sqrt() 24 | 25 | if valid.max() < 0.5: 26 | pass 27 | 28 | epe = epe.view(-1)[valid.view(-1)] 29 | 30 | metrics = { 31 | 'epe': epe.mean().item(), 32 | '1px': (epe > 1).float().mean().item(), 33 | '3px': (epe > 3).float().mean().item(), 34 | '5px': (epe > 5).float().mean().item(), 35 | } 36 | 37 | return flow_loss, metrics 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Cameron Smith, David Charatan, Ayush Tewari, and Vincent Sitzmann 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /egomono4d/model/backbone/backbone.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from dataclasses import dataclass 3 | from typing import Generic, TypeVar 4 | 5 | from jaxtyping import Float 6 | from torch import Tensor, nn 7 | 8 | from ...dataset.types import Batch 9 | from ...flow.flow_predictor import Flows 10 | from ...tracking.track_predictor import Tracks 11 | 12 | T = TypeVar("T") 13 | 14 | 15 | @dataclass 16 | class BackboneOutput: 17 | depths: Float[Tensor, "batch frame height width"] 18 | weights: Float[Tensor, "batch frame-1 height width"] 19 | intrinsics: tuple[Float[Tensor, "batch 2"], Float[Tensor, "batch 2"]] | None # (focal, principle) 20 | 21 | 22 | class Backbone(nn.Module, ABC, Generic[T]): 23 | cfg: T 24 | 25 | def __init__( 26 | self, 27 | cfg: T, 28 | num_frames: int | None, 29 | image_shape: tuple[int, int] | None, 30 | patch_size: tuple[int, int] | None, 31 | ) -> None: 32 | super().__init__() 33 | self.cfg = cfg 34 | self.num_frames = num_frames 35 | self.image_shape = image_shape 36 | self.patch_size = patch_size 37 | 38 | @abstractmethod 39 | def forward(self, batch: Batch, flows: Flows | list[Tracks]) -> BackboneOutput: 40 | pass 41 | -------------------------------------------------------------------------------- /egomono4d/repo/gmflow/utils/misc.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import sys 4 | import json 5 | 6 | 7 | def read_text_lines(filepath): 8 | with open(filepath, 'r') as f: 9 | lines = f.readlines() 10 | lines = [l.rstrip() for l in lines] 11 | return lines 12 | 13 | 14 | def check_path(path): 15 | if not os.path.exists(path): 16 | os.makedirs(path, exist_ok=True) # explicitly set exist_ok when multi-processing 17 | 18 | 19 | def save_command(save_path, filename='command_train.txt'): 20 | check_path(save_path) 21 | command = sys.argv 22 | save_file = os.path.join(save_path, filename) 23 | # Save all training commands when resuming training 24 | with open(save_file, 'a') as f: 25 | f.write(' '.join(command)) 26 | f.write('\n\n') 27 | 28 | 29 | def save_args(args, filename='args.json'): 30 | args_dict = vars(args) 31 | check_path(args.checkpoint_dir) 32 | save_path = os.path.join(args.checkpoint_dir, filename) 33 | 34 | # Save all training args when resuming training 35 | with open(save_path, 'a') as f: 36 | json.dump(args_dict, f, indent=4, sort_keys=False) 37 | f.write('\n\n') 38 | 39 | 40 | def int_list(s): 41 | """Convert string to int list""" 42 | return [int(x) for x in s.split(',')] 43 | -------------------------------------------------------------------------------- /egomono4d/model/intrinsics/intrinsics_regressed.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Literal 3 | 4 | import torch 5 | from einops import repeat 6 | from jaxtyping import Float 7 | from torch import Tensor, nn 8 | 9 | from ...dataset.types import Batch 10 | from ...flow.flow_predictor import Flows 11 | from ..backbone.backbone import BackboneOutput 12 | 
from .common import focal_lengths_to_intrinsics 13 | from .intrinsics import Intrinsics 14 | 15 | 16 | @dataclass 17 | class IntrinsicsRegressedCfg: 18 | name: Literal["regressed"] 19 | initial_focal_length: float 20 | 21 | 22 | class IntrinsicsRegressed(Intrinsics[IntrinsicsRegressedCfg]): 23 | def __init__(self, cfg: IntrinsicsRegressedCfg) -> None: 24 | super().__init__(cfg) 25 | focal_length = torch.full( 26 | tuple(), 27 | cfg.initial_focal_length, 28 | dtype=torch.float32, 29 | ) 30 | self.focal_length = nn.Parameter(focal_length) 31 | 32 | def forward( 33 | self, 34 | batch: Batch, 35 | flows: Flows, 36 | backbone_output: BackboneOutput, 37 | global_step: int, 38 | ) -> Float[Tensor, "batch frame 3 3"]: 39 | b, f, _, h, w = batch.videos.shape 40 | intrinsics = focal_lengths_to_intrinsics(self.focal_length, (h, w)) 41 | return repeat(intrinsics, "i j -> b f i j", b=b, f=f) 42 | -------------------------------------------------------------------------------- /egomono4d/misc/local_logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from typing import Any, Optional 4 | 5 | from lightning.pytorch.loggers import Logger 6 | from lightning.pytorch.utilities import rank_zero_only 7 | from PIL import Image 8 | 9 | LOG_PATH = Path("outputs/local") 10 | 11 | 12 | class LocalLogger(Logger): 13 | def __init__(self) -> None: 14 | super().__init__() 15 | self.experiment = None 16 | os.system(f"rm -r {LOG_PATH}") 17 | 18 | @property 19 | def name(self): 20 | return "LocalLogger" 21 | 22 | @property 23 | def version(self): 24 | return 0 25 | 26 | @rank_zero_only 27 | def log_hyperparams(self, params): 28 | pass 29 | 30 | @rank_zero_only 31 | def log_metrics(self, metrics, step): 32 | pass 33 | 34 | @rank_zero_only 35 | def log_image( 36 | self, 37 | key: str, 38 | images: list[Any], 39 | step: Optional[int] = None, 40 | **kwargs, 41 | ): 42 | # The function signature is the same as the wandb logger's, but the step is 43 | # actually required. 
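# The zero-padded pattern {index:0>2}_{step:0>6}.png below keeps the saved
# images ordered by image index and then by step when the directory is listed.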
44 | assert step is not None 45 | for index, image in enumerate(images): 46 | path = LOG_PATH / f"{key}/{index:0>2}_{step:0>6}.png" 47 | path.parent.mkdir(exist_ok=True, parents=True) 48 | Image.fromarray(image).save(path) 49 | -------------------------------------------------------------------------------- /egomono4d/dataset/dataset_merged.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import random 3 | import pdb 4 | from typing import List 5 | from .types import Stage 6 | from torch.utils.data import Dataset 7 | 8 | 9 | class DatasetMerged(Dataset): 10 | 11 | def __init__(self, 12 | datasets: List[Dataset], 13 | stage: Stage, 14 | global_rank: int, 15 | world_size: int, 16 | data_ratio: float=1.0 17 | ) -> None: 18 | self.datasets = datasets 19 | self.stage = stage 20 | self.global_rank = global_rank 21 | self.world_size = world_size 22 | index_list = [] 23 | 24 | for ids, dataset in enumerate(self.datasets): 25 | index_list = index_list + [(ids, i) for i in range(int(len(dataset)*data_ratio))] 26 | 27 | random.seed(0) 28 | random.shuffle(index_list) 29 | self.index_list = index_list 30 | 31 | print(f"################### [Stage {stage}: Num Data = {len(self.index_list)}] ###################") 32 | 33 | 34 | def __len__(self): 35 | return len(self.index_list) 36 | 37 | def __getitem__(self, index): 38 | dataset_id, data_id = self.index_list[index] 39 | # print(f"[Data Go] global_rank={self.global_rank} | dataloader_index={index} | data_index={(dataset_id, data_id)}") 40 | return self.datasets[dataset_id][data_id] 41 | -------------------------------------------------------------------------------- /egomono4d/model/intrinsics/intrinsics_model.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Literal 3 | 4 | from jaxtyping import Float 5 | from torch import Tensor 6 | import pdb 7 | import torch 8 | 9 | from ...dataset.types import Batch, BatchInference 10 | from ...flow.flow_predictor import Flows 11 | from ..backbone.backbone import BackboneOutput 12 | from .intrinsics import Intrinsics 13 | from ...tracking.track_predictor import Tracks 14 | 15 | 16 | @dataclass 17 | class IntrinsicsModelCfg: 18 | name: Literal["model"] 19 | 20 | 21 | class IntrinsicsModel(Intrinsics[IntrinsicsModelCfg]): 22 | def forward( 23 | self, 24 | batch: Batch | BatchInference, 25 | flows: Flows | list[Tracks], 26 | backbone_output: BackboneOutput, 27 | global_step: int, 28 | ) -> Float[Tensor, "batch frame 3 3"]: 29 | # Build the intrinsics from the focal length and principal point regressed by the backbone.
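# The backbone appears to predict focal lengths normalized by sqrt(h * w) and a
# principal point in [0, 1] image coordinates, so the matrix assembled below is a
# normalized K with fx / w and fy / h on its diagonal.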
30 | # pdb.set_trace() 31 | b, f, _, h, w = batch.videos.shape 32 | focal, principle = backbone_output.intrinsics 33 | focal = focal * (h * w) ** 0.5 34 | intrinsics = torch.stack([torch.eye(3, dtype=torch.float32, device=focal.device)]*b, dim=0) 35 | intrinsics = torch.stack([intrinsics]*f, dim=1) 36 | intrinsics[..., :2, 2] = principle.unsqueeze(-2) 37 | intrinsics[..., 0, 0] = focal[..., 0].unsqueeze(-1) / w 38 | intrinsics[..., 1, 1] = focal[..., 1].unsqueeze(-1) / h 39 | return intrinsics -------------------------------------------------------------------------------- /egomono4d/dataset/types.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | # from typing import Literal 3 | from typing_extensions import Literal 4 | from typing import List, Union, Optional 5 | 6 | from jaxtyping import Float, Int64, Int32 7 | from torch import Tensor 8 | 9 | from ..misc.manipulable import Manipulable 10 | 11 | Stage = Literal["train", "test", "val"] 12 | 13 | 14 | @dataclass 15 | class Batch(Manipulable): 16 | videos: Float[Tensor, "batch frame 3 height width"] 17 | depths: Float[Tensor, "batch frame height width"] 18 | pcds: Float[Tensor, "batch frame height width 3"] 19 | flys: Float[Tensor, "batch frame height width"] 20 | masks: Float[Tensor, "batch frame height width"] 21 | indices: Int64[Tensor, "batch frame"] 22 | 23 | scenes: Union[List[str], str] 24 | datasets: Union[List[str], str] 25 | use_gt_depth: bool 26 | 27 | intrinsics: Optional[Float[Tensor, "batch frame 3 3"]] = None 28 | 29 | gt_depths: Optional[Float[Tensor, "batch frame height width"]] = None 30 | gt_intrinsics: Optional[Float[Tensor, "batch frame 3 3"]] = None 31 | gt_extrinsics: Optional[Float[Tensor, "batch frame 4 4"]] = None 32 | hoi_masks: Optional[Float[Tensor, "batch frame height width"]] = None 33 | 34 | 35 | @dataclass 36 | class BatchInference(Manipulable): 37 | videos: Float[Tensor, "batch frame 3 height width"] 38 | start_indice: Int32 39 | aux_masks: Optional[Float[Tensor, "batch frame height width"]] = None 40 | -------------------------------------------------------------------------------- /egomono4d/utils.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from einops import einsum, rearrange 3 | from .model.projection import sample_image_grid, unproject, homogenize_points 4 | 5 | 6 | def load_pickle(pickle_file): 7 | try: 8 | with open(pickle_file, 'rb') as f: 9 | pickle_data = pickle.load(f) 10 | except UnicodeDecodeError as e: 11 | with open(pickle_file, 'rb') as f: 12 | pickle_data = pickle.load(f, encoding='latin1') 13 | except Exception as e: 14 | print('Unable to load data ', pickle_file, ':', e) 15 | raise 16 | return pickle_data 17 | 18 | 19 | def save_pickle(pickle_file, data): 20 | with open(pickle_file, 'wb') as pfile: 21 | pickle.dump(data, pfile) 22 | 23 | 24 | def batch_recover_pointclouds_sequence(depths, intrinsics, extrinsics, target_frame=0): 25 | b, f, h, w = depths.shape 26 | xy, _ = sample_image_grid((h, w), device=depths.device) 27 | gt_pcds_unp = unproject(xy, depths, rearrange(intrinsics, "b f i j -> b f () () i j")) 28 | 29 | extrinsics_source = rearrange(extrinsics, "b fs i j -> b fs () () i j") 30 | extrinsics_target = rearrange(extrinsics[:, target_frame:target_frame+1], "b ft i j -> b () ft () i j") 31 | relative_transformations = extrinsics_target.inverse() @ extrinsics_source 32 | 33 | pcds = einsum( 34 | relative_transformations, 35 | 
homogenize_points(gt_pcds_unp), 36 | "... i j, ... j -> ... i", 37 | )[..., :3] 38 | 39 | return pcds -------------------------------------------------------------------------------- /egomono4d/model/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbone import Backbone 2 | from .backbone_explicit_depth import BackboneExplicitDepth, BackboneExplicitDepthCfg 3 | from .backbone_midas import BackboneMidas, BackboneMidasCfg 4 | from .backbone_unidepth import BackboneUniDepth, BackboneUniDepthCfg 5 | from .backbone_nvds_unidepth import BackboneNvdsUniDepth, BackboneNvdsUniDepthCfg 6 | 7 | try: 8 | from .backbone_depthanythingv2 import BackboneDepthanythingV2, BackboneDepthanythingV2Cfg 9 | from .backbone_nvds_unet_dpt import BackboneNvdsUnetDPT, BackboneNvdsUnetDPTCfg 10 | except: 11 | BackboneDepthanythingV2 = None 12 | BackboneDepthanythingV2Cfg = None 13 | BackboneNvdsUnetDPT = None 14 | BackboneNvdsUnetDPTCfg = None 15 | 16 | BACKBONES = { 17 | "explicit_depth": BackboneExplicitDepth, 18 | "midas": BackboneMidas, 19 | "unidepth": BackboneUniDepth, 20 | "depthanythingv2": BackboneDepthanythingV2, 21 | "nvds_unet_dpt": BackboneNvdsUnetDPT, 22 | "nvds_unidepth": BackboneNvdsUniDepth 23 | } 24 | 25 | BackboneCfg = BackboneExplicitDepthCfg | BackboneMidasCfg | BackboneNvdsUniDepthCfg | \ 26 | BackboneNvdsUnetDPTCfg | BackboneDepthanythingV2Cfg | BackboneUniDepthCfg 27 | 28 | 29 | def get_backbone( 30 | cfg: BackboneCfg, 31 | num_frames: int | None, 32 | image_shape: tuple[int, int] | None, 33 | patch_size: tuple[int, int] | None = None, 34 | ) -> Backbone: 35 | return BACKBONES[cfg.name](cfg, num_frames, image_shape, patch_size) 36 | -------------------------------------------------------------------------------- /egomono4d/model/backbone/backbone_explicit_depth.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Literal 3 | 4 | import torch 5 | from torch import nn 6 | 7 | from ...dataset.types import Batch 8 | from ...flow.flow_predictor import Flows 9 | from .backbone import Backbone, BackboneOutput 10 | 11 | 12 | @dataclass 13 | class BackboneExplicitDepthCfg: 14 | name: Literal["explicit_depth"] 15 | initial_depth: float 16 | weight_sensitivity: float 17 | 18 | 19 | class BackboneExplicitDepth(Backbone[BackboneExplicitDepthCfg]): 20 | def __init__( 21 | self, 22 | cfg: BackboneExplicitDepthCfg, 23 | num_frames: int | None, 24 | image_shape: tuple[int, int] | None, 25 | patch_size: tuple[int, int] | None, 26 | ) -> None: 27 | super().__init__(cfg, num_frames=num_frames, image_shape=image_shape, patch_size=patch_size) 28 | depth = torch.full( 29 | (num_frames, *image_shape), cfg.initial_depth, dtype=torch.float32 30 | ) 31 | self.depth = nn.Parameter(depth) 32 | weights = torch.full((num_frames - 1, *image_shape), 0, dtype=torch.float32) 33 | self.weights = nn.Parameter(weights) 34 | 35 | def forward(self, batch: Batch, flows: Flows) -> BackboneOutput: 36 | b, _, _, _, _ = batch.videos.shape 37 | assert b == 1 38 | 39 | return BackboneOutput( 40 | self.depth[None], 41 | (self.cfg.weight_sensitivity * self.weights).sigmoid()[None], 42 | ) 43 | -------------------------------------------------------------------------------- /egomono4d/misc/disk_cache.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import json 3 | import os 4 | from typing import Optional 5 | from pathlib import 
Path 6 | from typing import Any, Callable, TypeVar 7 | 8 | import torch 9 | 10 | T = TypeVar("T") 11 | 12 | 13 | def make_cache(location: Optional[Path] = None): 14 | if location is not None and not os.path.exists(location): 15 | os.makedirs(location, exist_ok=True) 16 | def cache(key: Any, device, fallback: Callable[[], T]) -> T: 17 | # If there's no cache location, the cache is disabled. 18 | if location is None: 19 | return fallback() 20 | 21 | key_str = hashlib.sha256(json.dumps(key).encode("utf-8")).digest().hex() 22 | 23 | path = Path(location) / f"{key_str}.torch" 24 | try: 25 | # Attempt to load the cached item. 26 | key_loaded, value = torch.load(path, map_location=device) 27 | 28 | # If there was a hash collision and the keys don't actually match, throw an 29 | # error so that the fallback can be used. 30 | if key != key_loaded: 31 | raise ValueError("Keys did not match!") 32 | 33 | return value 34 | except (FileNotFoundError, ValueError): 35 | # Use the fallback to compute the value. 36 | value = fallback() 37 | 38 | # Cache the value. 39 | # path.parent.mkdir(exist_ok=True, parents=True) 40 | torch.save((key, value), path) 41 | 42 | return value 43 | 44 | return cache 45 | -------------------------------------------------------------------------------- /egomono4d/loss/loss_cc.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Literal 3 | import torch 4 | import pdb 5 | 6 | from jaxtyping import Float 7 | from torch import Tensor 8 | import torch.nn.functional as F 9 | from torchvision.utils import save_image 10 | 11 | from ..dataset.types import Batch 12 | from ..flow import Flows 13 | from ..model.model import ModelOutput 14 | from ..tracking import Tracks 15 | from .loss import Loss, LossCfgCommon 16 | 17 | @dataclass 18 | class LossCCCfg(LossCfgCommon): # CC: Clip Consistency 19 | name: Literal["cc"] 20 | 21 | 22 | class LossCC(Loss[LossCCCfg]): 23 | def __init__(self, cfg: LossCCCfg) -> None: 24 | super().__init__(cfg) 25 | self.loss = torch.nn.L1Loss(reduction="none") 26 | 27 | def compute_unweighted_loss( 28 | self, 29 | batch: Batch, 30 | flows: Flows, 31 | tracks: list[Tracks] | None, 32 | model_output: ModelOutput, 33 | current_epoch: int, 34 | return_val: bool 35 | ) -> tuple[Float[Tensor, ""], dict]: 36 | 37 | intrinsics = model_output.intrinsics 38 | b, f, _, _ = intrinsics.shape 39 | assert b % 2 == 0 40 | rb = b // 2 41 | 42 | intrinsics_subclip_1 = intrinsics[::2] # (b//2, f, 3, 3) 43 | intrinsics_subclip_2 = intrinsics[1::2] # (b//2, f, 3, 3) 44 | loss = self.loss(intrinsics_subclip_1, intrinsics_subclip_2) 45 | loss = loss.sum() / (rb * f) # fx, fy, cx, cy 46 | 47 | return loss, {"cc": loss} -------------------------------------------------------------------------------- /egomono4d/visualization/drawing/coordinate_conversion.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Protocol, runtime_checkable 2 | 3 | import torch 4 | from jaxtyping import Float 5 | from torch import Tensor 6 | 7 | from .types import Pair, sanitize_pair 8 | 9 | 10 | @runtime_checkable 11 | class ConversionFunction(Protocol): 12 | def __call__( 13 | self, 14 | xy: Float[Tensor, "*batch 2"], 15 | ) -> Float[Tensor, "*batch 2"]: 16 | pass 17 | 18 | 19 | def generate_conversions( 20 | shape: tuple[int, int], 21 | device: torch.device, 22 | x_range: Optional[Pair] = None, 23 | y_range: Optional[Pair] = None, 24 | ) -> tuple[ 25 | ConversionFunction, #
conversion from world coordinates to pixel coordinates 26 | ConversionFunction, # conversion from pixel coordinates to world coordinates 27 | ]: 28 | h, w = shape 29 | x_range = sanitize_pair((0, w) if x_range is None else x_range, device) 30 | y_range = sanitize_pair((0, h) if y_range is None else y_range, device) 31 | minima, maxima = torch.stack((x_range, y_range), dim=-1) 32 | wh = torch.tensor((w, h), dtype=torch.float32, device=device) 33 | 34 | def convert_world_to_pixel( 35 | xy: Float[Tensor, "*batch 2"], 36 | ) -> Float[Tensor, "*batch 2"]: 37 | return (xy - minima) / (maxima - minima) * wh 38 | 39 | def convert_pixel_to_world( 40 | xy: Float[Tensor, "*batch 2"], 41 | ) -> Float[Tensor, "*batch 2"]: 42 | return xy / wh * (maxima - minima) + minima 43 | 44 | return convert_world_to_pixel, convert_pixel_to_world 45 | -------------------------------------------------------------------------------- /egomono4d/misc/config_tools.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from pathlib import Path 3 | from typing import Type, TypeVar 4 | 5 | from dacite import Config, from_dict 6 | from omegaconf import DictConfig, OmegaConf 7 | 8 | TYPE_HOOKS = { 9 | Path: Path, 10 | } 11 | 12 | 13 | T = TypeVar("T") 14 | 15 | 16 | def get_typed_config( 17 | data_class: Type[T], 18 | cfg: DictConfig, 19 | extra_type_hooks: dict = {}, 20 | ) -> T: 21 | return from_dict( 22 | data_class, 23 | OmegaConf.to_container(cfg), 24 | config=Config(type_hooks={**TYPE_HOOKS, **extra_type_hooks}, cast=[tuple]), 25 | ) 26 | 27 | 28 | def separate_multiple_defaults(data_class_union): 29 | """Return a function that will pull individual configurations out of a merged dict. 30 | For example, the merged dict might look like this: 31 | 32 | { 33 | a: ... 34 | b: ... 35 | } 36 | 37 | The returned function will generate this: 38 | 39 | [{ name: a, ... }, { name: b, ... }] 40 | 41 | In other words, this function makes the types for default lists with single and 42 | multiple items be typed identically. 43 | """ 44 | 45 | def separate_fn(joined: dict) -> list: 46 | # The dummy allows the union to be converted. 47 | @dataclass 48 | class Dummy: 49 | dummy: data_class_union 50 | 51 | return [ 52 | get_typed_config(Dummy, DictConfig({"dummy": {"name": name, **cfg}})).dummy 53 | for name, cfg in joined.items() 54 | ] 55 | 56 | return separate_fn 57 | -------------------------------------------------------------------------------- /egomono4d/config/tools.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from pathlib import Path 3 | from typing import Type, TypeVar 4 | 5 | from dacite import Config, from_dict 6 | from omegaconf import DictConfig, OmegaConf 7 | 8 | TYPE_HOOKS = { 9 | Path: Path, 10 | } 11 | 12 | 13 | T = TypeVar("T") 14 | 15 | 16 | def get_typed_config( 17 | data_class: Type[T], 18 | cfg: DictConfig, 19 | extra_type_hooks: dict = {}, 20 | ) -> T: 21 | cfg_res = from_dict(data_class, OmegaConf.to_container(cfg), config=Config(type_hooks={**TYPE_HOOKS, **extra_type_hooks}, cast=[tuple]),) 22 | return cfg_res 23 | 24 | 25 | def separate_multiple_defaults(data_class_union): 26 | """Return a function that will pull individual configurations out of a merged dict. 27 | For example, the merged dict might look like this: 28 | 29 | { 30 | a: ... 31 | b: ... 32 | } 33 | 34 | The returned function will generate this: 35 | 36 | [{ name: a, ... }, { name: b, ... 
}] 37 | 38 | In other words, this function makes the types for default lists with single and 39 | multiple items be typed identically. 40 | """ 41 | 42 | def separate_fn(joined: dict) -> list: 43 | # The dummy allows the union to be converted. 44 | @dataclass 45 | class Dummy: 46 | dummy: data_class_union 47 | 48 | dummy_dict = [ 49 | get_typed_config(Dummy, DictConfig({"dummy": {"name": name, **cfg}})).dummy 50 | for name, cfg in joined.items() 51 | ] 52 | return dummy_dict 53 | 54 | return separate_fn 55 | -------------------------------------------------------------------------------- /egomono4d/loss/mapping/mapping.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Generic, TypeVar 3 | 4 | import torch 5 | from jaxtyping import Float 6 | from torch import Tensor, nn 7 | 8 | 9 | def fix_aspect_ratio( 10 | points: Float[Tensor, "*batch 2"], 11 | image_shape: tuple[int, int], 12 | ) -> Float[Tensor, "*batch 2"]: 13 | """When computing losses on normalized image coordinates (width in range [0, 1] and 14 | height in range [0, 1]), distances are skewed based on the aspect ratio. This 15 | function scales space based on the aspect ratio to correct for this skew. 16 | """ 17 | h, w = image_shape 18 | scale = (h * w) ** 0.5 19 | correction = torch.tensor( 20 | (w / scale, h / scale), 21 | dtype=points.dtype, 22 | device=points.device, 23 | ) 24 | return points * correction 25 | 26 | 27 | T = TypeVar("T") 28 | 29 | 30 | class Mapping(nn.Module, ABC, Generic[T]): 31 | def __init__(self, cfg: T) -> None: 32 | super().__init__() 33 | self.cfg = cfg 34 | 35 | def forward( 36 | self, 37 | a: Float[Tensor, "*#batch 2"], 38 | b: Float[Tensor, "*#batch 2"], 39 | image_shape: tuple[int, int], 40 | ) -> Float[Tensor, " *batch"]: 41 | a = fix_aspect_ratio(a, image_shape) 42 | b = fix_aspect_ratio(b, image_shape) 43 | return self.forward_undistorted(a - b) 44 | 45 | @abstractmethod 46 | def forward_undistorted( 47 | self, 48 | delta: Float[Tensor, "*batch 2"], 49 | ) -> Float[Tensor, " *batch"]: 50 | pass 51 | -------------------------------------------------------------------------------- /egomono4d/eval/eval_depth.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pdb 3 | from ..loss.loss_midas import compute_scale_and_shift 4 | 5 | EPS = 1e-6 6 | 7 | def eval_depth_conductor(pred_depth, gt_depth, gt_flys): # (b, f, h, w) * 3 8 | 9 | b, f, h, w = pred_depth.shape 10 | 11 | pred_depth_align = pred_depth.reshape(b,f*h,w) 12 | gt_depth_align = gt_depth.reshape(b,f*h,w) 13 | gt_flys_align = gt_flys.reshape(b,f*h,w) 14 | scale_video, shift_video = compute_scale_and_shift(pred_depth_align, gt_depth_align, gt_flys_align) 15 | 16 | pred_depth_align = scale_video.view(-1, 1, 1) * pred_depth_align + shift_video.view(-1, 1, 1) 17 | err = torch.abs(pred_depth_align - gt_depth_align) 18 | err_rel = err / (gt_depth_align + EPS) 19 | 20 | err_sq = err ** 2 21 | thresh = torch.maximum((gt_depth_align / (pred_depth_align + EPS)), (pred_depth_align / (gt_depth_align + EPS))) 22 | gt_flys = gt_flys.reshape(b,f*h,w) 23 | 24 | return { 25 | 'DEPTH_AbsRel(%)': 100 * ((err_rel*gt_flys).sum()/(gt_flys.sum())).item(), 26 | 'DEPTH_RMSE(mm)': 1000 * torch.sqrt((err_sq*gt_flys).sum()/(gt_flys.sum())).item(), 27 | 'DEPTH_Delta_[.025](%)': (100*(((thresh < 1.025).float()*gt_flys).sum())/(gt_flys.sum())).item(), 28 | 'DEPTH_Delta_[.05](%)': (100*(((thresh < 
1.05).float()*gt_flys).sum())/(gt_flys.sum())).item(), 29 | 'DEPTH_Delta_[.1](%)': (100*(((thresh < 1.1).float()*gt_flys).sum())/(gt_flys.sum())).item(), 30 | 'DEPTH_Delta_[.25](%)': (100*(((thresh < 1.25).float()*gt_flys).sum())/(gt_flys.sum())).item(), 31 | 'DEPTH_Delta_[.25]^2(%)': (100*(((thresh < 1.25**2).float()*gt_flys).sum())/(gt_flys.sum())).item(), 32 | } -------------------------------------------------------------------------------- /egomono4d/model/extrinsics/extrinsics_procrustes_ransac.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Literal 3 | import numpy as np 4 | 5 | import torch 6 | from jaxtyping import Float 7 | from torch import Tensor 8 | 9 | from ...dataset.types import Batch 10 | from ...flow.flow_predictor import Flows 11 | from ..backbone.backbone import BackboneOutput 12 | from ..projection import align_surfaces_eval 13 | from .extrinsics import Extrinsics 14 | 15 | 16 | @dataclass 17 | class ExtrinsicsProcrustesRANSACCfg: 18 | name: Literal["procrustes_ransac"] 19 | max_iter: int | None 20 | num_points: int | None 21 | 22 | 23 | class ExtrinsicsProcrustesRANSAC(Extrinsics[ExtrinsicsProcrustesRANSACCfg]): 24 | def forward( 25 | self, 26 | batch: Batch, 27 | flows: Flows, 28 | backbone_output: BackboneOutput, 29 | surfaces: Float[Tensor, "batch frame height width 3"], 30 | ) -> Float[Tensor, "batch frame 4 4"]: 31 | device = surfaces.device 32 | _, _, h, w, _ = surfaces.shape 33 | 34 | indices = torch.linspace(0, h*w-1, self.cfg.num_points, dtype=torch.int64, device=device,) 35 | best_extrinsics, best_score = align_surfaces_eval(surfaces, flows.backward, backbone_output.weights, batch.flys, indices) 36 | for i in range(self.cfg.max_iter): 37 | maybe_inliers = np.random.choice(h*w, size=self.cfg.num_points, replace=False) 38 | extrinsics, score = align_surfaces_eval(surfaces, flows.backward, backbone_output.weights, batch.flys, maybe_inliers) 39 | if score > best_score: 40 | # print(f"update score: {score} > {best_score}") 41 | best_score = score 42 | best_extrinsics = extrinsics 43 | 44 | return best_extrinsics 45 | -------------------------------------------------------------------------------- /egomono4d/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from ..frame_sampler import get_frame_sampler 2 | from typing import Union, Optional, List 3 | import os 4 | 5 | from .dataset_merged import DatasetMerged 6 | from .types import Stage 7 | from .dataset_arctic import DatasetArctic, DatasetArcticCfg 8 | from .dataset_pov_surgery import DatasetPOVSurgery, DatasetPOVSurgeryCfg 9 | from .dataset_hoi4d import DatasetHOI4D, DatasetHOI4DCfg 10 | from .dataset_fpha import DatasetFPHA, DatasetFPHACfg 11 | from .dataset_h2o import DatasetH2O, DatasetH2OCfg 12 | from .dataset_egopat3d import DatasetEgoPAT3D, DatasetEgoPAT3DCfg 13 | from .dataset_epic_kitchen import DatasetEpicKitchen, DatasetEpicKitchenCfg 14 | 15 | DATASETS = { 16 | "arctic": DatasetArctic, 17 | "pov_surgery": DatasetPOVSurgery, 18 | "hoi4d": DatasetHOI4D, 19 | "h2o": DatasetH2O, 20 | "fpha": DatasetFPHA, 21 | "egopat3d": DatasetEgoPAT3D, 22 | "epic_kitchen": DatasetEpicKitchen 23 | } 24 | 25 | DatasetCfg = Union[ 26 | DatasetArcticCfg, 27 | DatasetPOVSurgeryCfg, 28 | DatasetHOI4DCfg, 29 | DatasetH2OCfg, 30 | DatasetEgoPAT3DCfg, 31 | DatasetFPHACfg, 32 | DatasetEpicKitchenCfg 33 | ] 34 | 35 | def get_dataset( 36 | dataset_cfgs: List[DatasetCfg], 37 | stage:
Stage, 38 | global_rank: int, 39 | world_size: int, 40 | data_ratio: Optional[float]=1.0, 41 | debug: Optional[bool]=False, 42 | ) -> DatasetMerged: 43 | 44 | datasets = [] 45 | for cfg in dataset_cfgs: 46 | frame_sampler = get_frame_sampler(cfg.frame_sampler, cfg.num_frames, stage) 47 | dataset = DATASETS[cfg.name](cfg, stage, frame_sampler, global_rank, world_size, debug) 48 | datasets.append(dataset) 49 | 50 | return DatasetMerged(datasets, stage=stage, global_rank=global_rank, world_size=world_size, data_ratio=data_ratio) 51 | -------------------------------------------------------------------------------- /egomono4d/repo/gmflow/scripts/submission.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | 4 | # generate prediction results for submission on sintel and kitti online servers 5 | 6 | 7 | # GMFlow without refinement 8 | 9 | # submission to sintel 10 | CUDA_VISIBLE_DEVICES=0 python main.py \ 11 | --submission \ 12 | --output_path submission/sintel-gmflow-norefine \ 13 | --val_dataset sintel \ 14 | --resume pretrained/gmflow_sintel-0c07dcb3.pth 15 | 16 | # submission to kitti 17 | CUDA_VISIBLE_DEVICES=0 python main.py \ 18 | --submission \ 19 | --output_path submission/kitti-gmflow-norefine \ 20 | --val_dataset kitti \ 21 | --resume pretrained/gmflow_kitti-285701a8.pth 22 | 23 | 24 | # you can also visualize the predictions before submission 25 | # CUDA_VISIBLE_DEVICES=0 python main.py \ 26 | # --submission \ 27 | # --output_path submission/sintel-gmflow-norefine-vis \ 28 | # --save_vis_flow \ 29 | # --no_save_flo \ 30 | # --val_dataset sintel \ 31 | # --resume pretrained/gmflow_sintel.pth 32 | 33 | 34 | 35 | 36 | # GMFlow with refinement 37 | 38 | # submission to sintel 39 | CUDA_VISIBLE_DEVICES=0 python main.py \ 40 | --submission \ 41 | --output_path submission/sintel-gmflow-withrefine \ 42 | --val_dataset sintel \ 43 | --resume pretrained/gmflow_with_refine_sintel-3ed1cf48.pth \ 44 | --padding_factor 32 \ 45 | --upsample_factor 4 \ 46 | --num_scales 2 \ 47 | --attn_splits_list 2 8 \ 48 | --corr_radius_list -1 4 \ 49 | --prop_radius_list -1 1 50 | 51 | # submission to kitti 52 | CUDA_VISIBLE_DEVICES=0 python main.py \ 53 | --submission \ 54 | --output_path submission/kitti-gmflow-withrefine \ 55 | --val_dataset kitti \ 56 | --resume pretrained/gmflow_with_refine_kitti-8d3b9786.pth \ 57 | --padding_factor 32 \ 58 | --upsample_factor 4 \ 59 | --num_scales 2 \ 60 | --attn_splits_list 2 8 \ 61 | --corr_radius_list -1 4 \ 62 | --prop_radius_list -1 1 63 | 64 | 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /egomono4d/visualization/drawing/types.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, Union 2 | 3 | import torch 4 | from einops import repeat 5 | from jaxtyping import Float, Shaped 6 | from torch import Tensor 7 | 8 | Real = Union[float, int] 9 | 10 | Vector = Union[ 11 | Real, 12 | Iterable[Real], 13 | Shaped[Tensor, "3"], 14 | Shaped[Tensor, "batch 3"], 15 | ] 16 | 17 | 18 | def sanitize_vector( 19 | vector: Vector, 20 | dim: int, 21 | device: torch.device, 22 | ) -> Float[Tensor, "*#batch dim"]: 23 | if isinstance(vector, Tensor): 24 | vector = vector.type(torch.float32).to(device) 25 | else: 26 | vector = torch.tensor(vector, dtype=torch.float32, device=device) 27 | while vector.ndim < 2: 28 | vector = vector[None] 29 | if vector.shape[-1] == 1: 30 | vector = repeat(vector, "... () -> ... 
c", c=dim) 31 | assert vector.shape[-1] == dim 32 | assert vector.ndim == 2 33 | return vector 34 | 35 | 36 | Scalar = Union[ 37 | Real, 38 | Iterable[Real], 39 | Shaped[Tensor, ""], 40 | Shaped[Tensor, " batch"], 41 | ] 42 | 43 | 44 | def sanitize_scalar(scalar: Scalar, device: torch.device) -> Float[Tensor, "*#batch"]: 45 | if isinstance(scalar, Tensor): 46 | scalar = scalar.type(torch.float32).to(device) 47 | else: 48 | scalar = torch.tensor(scalar, dtype=torch.float32, device=device) 49 | while scalar.ndim < 1: 50 | scalar = scalar[None] 51 | assert scalar.ndim == 1 52 | return scalar 53 | 54 | 55 | Pair = Union[ 56 | Iterable[Real], 57 | Shaped[Tensor, "2"], 58 | ] 59 | 60 | 61 | def sanitize_pair(pair: Pair, device: torch.device) -> Float[Tensor, "2"]: 62 | if isinstance(pair, Tensor): 63 | pair = pair.type(torch.float32).to(device) 64 | else: 65 | pair = torch.tensor(pair, dtype=torch.float32, device=device) 66 | assert pair.shape == (2,) 67 | return pair 68 | -------------------------------------------------------------------------------- /config/datagen_egopat3d.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - dataset: [egopat3d] 3 | - flow: gmflow 4 | - tracking: cotracker 5 | - model/backbone: nvds_unidepth 6 | - model/intrinsics: model 7 | - model/extrinsics: procrustes_flow 8 | - loss: [dynamic_area, cc, tracking_3d, flow_3d, shape] 9 | - visualizer: [summary] 10 | - _self_ 11 | 12 | base_cache_dir: ./cache 13 | save_dir: ./cache/models 14 | 15 | preprocess: 16 | resize_shape: [300, 400] # First resize the image into resize_shape. 17 | patch_size: 32 # Then conduct center_crop with w&h divided by patch_size equal to 0. 18 | num_frames: 5 19 | 20 | dataset: 21 | egopat3d: 22 | clip_frame: 20 23 | 24 | wandb: 25 | project: egomono4d 26 | mode: online 27 | name: placeholder 28 | group: null 29 | tags: null 30 | 31 | checkpoint: 32 | load: null 33 | 34 | trainer: 35 | val_check_interval: 0.1 36 | gradient_clip_val: 10.0 37 | max_epochs: 25 38 | accumulate_grad_batches: 1 39 | num_nodes: 1 40 | gpus: 8 41 | 42 | loss: 43 | dynamic_area: 44 | weight: 0.005 45 | enable_after: 0 46 | cc: 47 | weight: 1.0 48 | enable_after: 0 49 | tracking_3d: 50 | weight: 5.0 51 | enable_after: 0 52 | flow_3d: 53 | weight: 5.0 54 | enable_after: 0 55 | shape: 56 | weight: 4.0 57 | enable_after: 0 58 | dynamic_coef: 1.0 59 | decay_end_epochs: -1 60 | decay_low_weight: 1.0 61 | 62 | model_wrapper: 63 | lr: 5e-5 64 | cache_track: false 65 | 66 | model: 67 | use_correspondence_weights: true 68 | 69 | data_module: 70 | train: 71 | num_workers: 4 72 | persistent_workers: true 73 | batch_size: 2 # batch-size of per-gpu 74 | seed: 233 75 | val: 76 | num_workers: 4 77 | persistent_workers: true 78 | batch_size: 2 79 | seed: 233 80 | 81 | hydra: 82 | run: 83 | dir: ${save_dir}/${now:%Y-%m-%d}/${now:%H-%M-%S} -------------------------------------------------------------------------------- /config/pretrain.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - dataset: [h2o, hoi4d, fpha, egopat3d, epic_kitchen] 3 | - flow: gmflow 4 | - tracking: cotracker 5 | - model/backbone: nvds_unidepth 6 | - model/intrinsics: model 7 | - model/extrinsics: procrustes_flow 8 | - loss: [dynamic_area, cc, tracking_3d, flow_3d, shape] 9 | - visualizer: [summary] 10 | - _self_ 11 | 12 | base_cache_dir: ./cache 13 | save_dir: ./cache/models 14 | 15 | preprocess: 16 | resize_shape: [300, 400] # First resize the image into 
resize_shape. 17 | patch_size: 32 # Then conduct center_crop with w&h divided by patch_size equal to 0. 18 | num_frames: 4 19 | 20 | wandb: 21 | project: egomono4d 22 | mode: online 23 | name: placeholder 24 | group: null 25 | tags: null 26 | 27 | checkpoint: 28 | load: null 29 | 30 | trainer: 31 | val_check_interval: 0.1 32 | # check_val_every_n_epoch: 1 33 | gradient_clip_val: 10.0 34 | max_epochs: 25 35 | accumulate_grad_batches: 1 36 | num_nodes: 1 37 | gpus: 8 38 | 39 | loss: 40 | dynamic_area: 41 | weight: 0.005 42 | enable_after: 0 43 | cc: 44 | weight: 1.0 45 | enable_after: 0 46 | tracking_3d: 47 | weight: 5.0 48 | enable_after: 0 49 | flow_3d: 50 | weight: 5.0 51 | enable_after: 0 52 | shape: 53 | weight: 4.0 54 | enable_after: 0 55 | dynamic_coef: 1.0 56 | decay_end_epochs: -1 57 | decay_low_weight: 1.0 58 | 59 | model_wrapper: 60 | lr: 5e-5 61 | cache_track: false 62 | 63 | model: 64 | use_correspondence_weights: true 65 | 66 | data_module: 67 | train: 68 | num_workers: 4 69 | persistent_workers: true 70 | batch_size: 2 # batch-size of per-gpu 71 | seed: 233 72 | val: 73 | num_workers: 4 74 | persistent_workers: true 75 | batch_size: 2 76 | seed: 233 77 | 78 | hydra: 79 | run: 80 | dir: ${save_dir}/${now:%Y-%m-%d}/${now:%H-%M-%S} -------------------------------------------------------------------------------- /egomono4d/tracking/track_predictor.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from dataclasses import dataclass 3 | import torch 4 | from typing import Generic, TypeVar, Optional 5 | 6 | from jaxtyping import Bool, Float 7 | from torch import Tensor, nn 8 | 9 | from ..misc.manipulable import Manipulable 10 | 11 | T = TypeVar("T") 12 | 13 | 14 | def sample_image_grid_tracker( 15 | shape, 16 | device: torch.device = torch.device("cpu"), 17 | ): 18 | """Get normalized (range 0 to 1) coordinates and integer indices for an image.""" 19 | indices = [torch.arange(length, device=device) for length in shape] 20 | stacked_indices = torch.stack(torch.meshgrid(*indices, indexing="ij"), dim=-1) 21 | coordinates = [(idx + 0.5) / length for idx, length in zip(indices, shape)] 22 | coordinates = reversed(coordinates) 23 | coordinates = torch.stack(torch.meshgrid(*coordinates, indexing="xy"), dim=-1) 24 | return coordinates 25 | 26 | 27 | @dataclass 28 | class Tracks(Manipulable): 29 | xy: Optional[Float[Tensor, "batch frame point 2"]] = None 30 | visibility: Optional[Bool[Tensor, "batch frame point"]] = None 31 | 32 | # This is the first frame in the track sequence, not the query frame used to 33 | # generate the sequence, which is often different. 
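# (E.g., tracks covering frames [k, k + T) store start_frame = k, even when the
# query frame used by the tracker lies somewhere inside that window.)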
34 | start_frame: int = 0 35 | 36 | def build_from_track_list(self, track_list, device='cpu'): 37 | self.xy = torch.concatenate([track.xy for track in track_list], dim=0) 38 | self.visibility = torch.concatenate([track.visibility for track in track_list], dim=0) 39 | 40 | 41 | class TrackPredictor(nn.Module, ABC, Generic[T]): 42 | def __init__(self, cfg: T) -> None: 43 | super().__init__() 44 | self.cfg = cfg 45 | 46 | @abstractmethod 47 | def forward( 48 | self, 49 | videos: Float[Tensor, "batch frame 3 height width"], 50 | query_frame: int, 51 | ) -> Tracks: 52 | pass 53 | -------------------------------------------------------------------------------- /egomono4d/misc/wandb_tools.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import wandb 4 | 5 | from ..config.common import WandbCfg 6 | 7 | 8 | def version_to_int(artifact) -> int: 9 | """Convert versions of the form vX to X. For example, v12 to 12.""" 10 | return int(artifact.version[1:]) 11 | 12 | 13 | def download_checkpoint( 14 | run_id: str, 15 | download_dir: Path, 16 | version: str | None, 17 | ) -> Path: 18 | api = wandb.Api() 19 | run = api.run(run_id) 20 | 21 | # Find the latest saved model checkpoint. 22 | chosen = None 23 | for artifact in run.logged_artifacts(): 24 | if artifact.type != "model" or artifact.state != "COMMITTED": 25 | continue 26 | 27 | # If no version is specified, use the latest. 28 | if version is None: 29 | if chosen is None or version_to_int(artifact) > version_to_int(chosen): 30 | chosen = artifact 31 | 32 | # If a specific version is specified, look for it. 33 | elif version == artifact.version: 34 | chosen = artifact 35 | break 36 | 37 | # Download the checkpoint. 38 | download_dir.mkdir(exist_ok=True, parents=True) 39 | root = download_dir / run_id 40 | chosen.download(root=root) 41 | return root / "model.ckpt" 42 | 43 | 44 | def update_checkpoint_path(path: str | None, cfg: WandbCfg) -> Path | None: 45 | if path is None: 46 | return None 47 | 48 | if not str(path).startswith("wandb://"): 49 | return Path(path) 50 | 51 | run_id, *version = path[len("wandb://") :].split(":") 52 | if len(version) == 0: 53 | version = None 54 | elif len(version) == 1: 55 | version = version[0] 56 | else: 57 | raise ValueError("Invalid version specifier!") 58 | 59 | project = cfg.project 60 | return download_checkpoint( 61 | f"{project}/{run_id}", 62 | Path("checkpoints"), 63 | version, 64 | ) 65 | -------------------------------------------------------------------------------- /egomono4d/repo/gmflow/scripts/demo.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # inference GMFlow without refinement 4 | 5 | # sintel 6 | 7 | # only predict forward flow 8 | CUDA_VISIBLE_DEVICES=0 python main.py \ 9 | --inference_dir demo/sintel_market_1 \ 10 | --output_path output/gmflow-norefine-sintel_market_1 \ 11 | --resume pretrained/gmflow_sintel-0c07dcb3.pth 12 | 13 | # predict forward & backward flow 14 | CUDA_VISIBLE_DEVICES=0 python main.py \ 15 | --inference_dir demo/sintel_market_1 \ 16 | --output_path output/gmflow-norefine-sintel_market_1 \ 17 | --pred_bidir_flow \ 18 | --resume pretrained/gmflow_sintel-0c07dcb3.pth 19 | 20 | 21 | # predict forward & backward flow with forward-backward consistency check 22 | CUDA_VISIBLE_DEVICES=0 python main.py \ 23 | --inference_dir demo/sintel_market_1 \ 24 | --output_path output/gmflow-norefine-sintel_market_1 \ 25 | --pred_bidir_flow \ 26 |
--fwd_bwd_consistency_check \ 27 | --resume pretrained/gmflow_sintel-0c07dcb3.pth 28 | 29 | 30 | # davis 31 | 32 | CUDA_VISIBLE_DEVICES=0 python main.py \ 33 | --inference_dir demo/davis_breakdance-flare \ 34 | --output_path output/gmflow-norefine-davis_breakdance-flare \ 35 | --resume pretrained/gmflow_sintel-0c07dcb3.pth 36 | 37 | 38 | 39 | 40 | # inference GMFlow with refinement 41 | 42 | CUDA_VISIBLE_DEVICES=0 python main.py \ 43 | --inference_dir demo/davis_breakdance-flare \ 44 | --output_path output/gmflow-withrefine-davis_breakdance-flare \ 45 | --resume pretrained/gmflow_with_refine_sintel-3ed1cf48.pth \ 46 | --padding_factor 32 \ 47 | --upsample_factor 4 \ 48 | --num_scales 2 \ 49 | --attn_splits_list 2 8 \ 50 | --corr_radius_list -1 4 \ 51 | --prop_radius_list -1 1 52 | 53 | 54 | 55 | 56 | CUDA_VISIBLE_DEVICES=0 python main.py \ 57 | --inference_dir demo/sintel_test_clean_market_1 \ 58 | --output_path output/gmflow-norefine-sintel_test_clean_market_1 \ 59 | --pred_bidir_flow \ 60 | --fwd_bwd_consistency_check \ 61 | --resume pretrained/gmflow_sintel-0c07dcb3.pth 62 | 63 | 64 | -------------------------------------------------------------------------------- /egomono4d/loss/loss.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from dataclasses import dataclass 3 | from typing import Generic, TypeVar 4 | 5 | import torch 6 | import pdb 7 | from jaxtyping import Float 8 | from torch import Tensor, nn 9 | 10 | from ..dataset.types import Batch 11 | from ..flow import Flows 12 | from ..model.model import ModelOutput 13 | from ..tracking import Tracks 14 | 15 | 16 | @dataclass 17 | class LossCfgCommon: 18 | enable_after: int 19 | weight: float 20 | 21 | 22 | T = TypeVar("T", bound=LossCfgCommon) 23 | 24 | 25 | class Loss(nn.Module, ABC, Generic[T]): 26 | cfg: T 27 | 28 | def __init__(self, cfg: T) -> None: 29 | super().__init__() 30 | self.cfg = cfg 31 | 32 | def forward( 33 | self, 34 | batch: Batch, 35 | flows: Flows | None, 36 | tracks: list[Tracks] | None, 37 | model_output: ModelOutput, 38 | current_epoch: int, 39 | return_unweighted=False 40 | ) -> Float[Tensor, ""]: 41 | 42 | if current_epoch < self.cfg.enable_after: 43 | zr_loss = torch.tensor(0, dtype=torch.float32, device=batch.videos.device) 44 | if return_unweighted is True: 45 | return (zr_loss, zr_loss), None 46 | else: 47 | return zr_loss, None 48 | 49 | loss, loss_package = self.compute_unweighted_loss( 50 | batch, flows, tracks, model_output, current_epoch, return_unweighted 51 | ) 52 | if return_unweighted is True: 53 | return (self.cfg.weight * loss, 100 * loss), loss_package 54 | else: 55 | return self.cfg.weight * loss, loss_package 56 | 57 | @abstractmethod 58 | def compute_unweighted_loss( 59 | self, 60 | batch: Batch, 61 | flows: Flows, 62 | tracks: list[Tracks] | None, 63 | model_output: ModelOutput, 64 | global_step: int, 65 | ) -> tuple[Float[Tensor, ""], dict]: 66 | pass 67 | -------------------------------------------------------------------------------- /config/datagen_arctic.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - dataset: [arctic] 3 | - flow: gmflow 4 | - tracking: cotracker 5 | - model/backbone: nvds_unidepth 6 | - model/intrinsics: model 7 | - model/extrinsics: procrustes_flow 8 | - loss: [dynamic_area, cc, tracking_3d, flow_3d, shape] 9 | - visualizer: [summary] 10 | - _self_ 11 | 12 | base_cache_dir: ./cache 13 | save_dir: ./cache/models 14 | 15 | 
preprocess: 16 | resize_shape: [300, 400] # First resize the image into resize_shape. 17 | patch_size: 32 # Then conduct center_crop with w&h divided by patch_size equal to 0. 18 | num_frames: 4 19 | 20 | dataset: 21 | arctic: 22 | clip_frame: 40 23 | frame_sampler: pretrain_interval 24 | frame_max_interval: 4 25 | 26 | wandb: 27 | project: egomono4d 28 | mode: online 29 | name: placeholder 30 | group: null 31 | tags: null 32 | 33 | checkpoint: 34 | load: null 35 | 36 | trainer: 37 | val_check_interval: 0.1 38 | gradient_clip_val: 10.0 39 | max_epochs: 25 40 | accumulate_grad_batches: 1 41 | num_nodes: 1 42 | gpus: 8 43 | 44 | loss: 45 | dynamic_area: 46 | weight: 0.005 47 | enable_after: 0 48 | cc: 49 | weight: 1.0 50 | enable_after: 0 51 | tracking_3d: 52 | weight: 5.0 53 | enable_after: 0 54 | flow_3d: 55 | weight: 5.0 56 | enable_after: 0 57 | shape: 58 | weight: 4.0 59 | enable_after: 0 60 | dynamic_coef: 1.0 61 | decay_end_epochs: -1 62 | decay_low_weight: 1.0 63 | 64 | model_wrapper: 65 | lr: 5e-5 66 | cache_track: false 67 | 68 | model: 69 | use_correspondence_weights: true 70 | 71 | data_module: 72 | train: 73 | num_workers: 4 74 | persistent_workers: true 75 | batch_size: 2 # batch-size of per-gpu 76 | seed: 233 77 | val: 78 | num_workers: 4 79 | persistent_workers: true 80 | batch_size: 2 81 | seed: 233 82 | 83 | hydra: 84 | run: 85 | dir: ${save_dir}/${now:%Y-%m-%d}/${now:%H-%M-%S} -------------------------------------------------------------------------------- /config/datagen_fpha.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - dataset: [fpha] 3 | - flow: gmflow 4 | - tracking: cotracker 5 | - model/backbone: nvds_unidepth 6 | - model/intrinsics: model 7 | - model/extrinsics: procrustes_flow 8 | - loss: [dynamic_area, cc, tracking_3d, flow_3d, shape] 9 | - visualizer: [summary] 10 | - _self_ 11 | 12 | base_cache_dir: ./cache 13 | save_dir: ./cache/models 14 | 15 | preprocess: 16 | resize_shape: [300, 400] # First resize the image into resize_shape. 17 | patch_size: 32 # Then conduct center_crop with w&h divided by patch_size equal to 0. 
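  # e.g., with the values above, a 300x400 frame would be center-cropped to 288x384
  # (the largest height and width divisible by 32).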
18 | num_frames: 5 19 | 20 | 21 | dataset: 22 | fpha: 23 | clip_frame: 20 24 | frame_sampler: pretrain_interval 25 | frame_max_interval: 3 26 | 27 | wandb: 28 | project: egomono4d 29 | mode: online 30 | name: placeholder 31 | group: null 32 | tags: null 33 | 34 | checkpoint: 35 | load: null 36 | 37 | trainer: 38 | val_check_interval: 0.1 39 | gradient_clip_val: 10.0 40 | max_epochs: 25 41 | accumulate_grad_batches: 1 42 | num_nodes: 1 43 | gpus: 8 44 | 45 | loss: 46 | dynamic_area: 47 | weight: 0.005 48 | enable_after: 0 49 | cc: 50 | weight: 1.0 51 | enable_after: 0 52 | tracking_3d: 53 | weight: 5.0 54 | enable_after: 0 55 | flow_3d: 56 | weight: 5.0 57 | enable_after: 0 58 | shape: 59 | weight: 4.0 60 | enable_after: 0 61 | dynamic_coef: 1.0 62 | decay_end_epochs: -1 63 | decay_low_weight: 1.0 64 | 65 | model_wrapper: 66 | lr: 5e-5 67 | cache_track: false 68 | 69 | model: 70 | use_correspondence_weights: true 71 | 72 | data_module: 73 | train: 74 | num_workers: 4 75 | persistent_workers: true 76 | batch_size: 2 # batch-size of per-gpu 77 | seed: 233 78 | val: 79 | num_workers: 4 80 | persistent_workers: true 81 | batch_size: 2 82 | seed: 233 83 | 84 | hydra: 85 | run: 86 | dir: ${save_dir}/${now:%Y-%m-%d}/${now:%H-%M-%S} -------------------------------------------------------------------------------- /config/datagen_h2o.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - dataset: [h2o] 3 | - flow: gmflow 4 | - tracking: cotracker 5 | - model/backbone: nvds_unidepth 6 | - model/intrinsics: model 7 | - model/extrinsics: procrustes_flow 8 | - loss: [dynamic_area, cc, tracking_3d, flow_3d, shape] 9 | - visualizer: [summary] 10 | - _self_ 11 | 12 | base_cache_dir: ./cache 13 | save_dir: ./cache/models 14 | 15 | preprocess: 16 | resize_shape: [300, 400] # First resize the image into resize_shape. 17 | patch_size: 32 # Then conduct center_crop with w&h divided by patch_size equal to 0. 
18 | num_frames: 5 19 | 20 | dataset: 21 | h2o: 22 | clip_frame: 20 23 | frame_sampler: pretrain_interval 24 | frame_max_interval: 4 25 | 26 | 27 | wandb: 28 | project: egomono4d 29 | mode: online 30 | name: placeholder 31 | group: null 32 | tags: null 33 | 34 | checkpoint: 35 | load: null 36 | 37 | trainer: 38 | val_check_interval: 0.1 39 | gradient_clip_val: 10.0 40 | max_epochs: 25 41 | accumulate_grad_batches: 1 42 | num_nodes: 1 43 | gpus: 8 44 | 45 | loss: 46 | dynamic_area: 47 | weight: 0.005 48 | enable_after: 0 49 | cc: 50 | weight: 1.0 51 | enable_after: 0 52 | tracking_3d: 53 | weight: 5.0 54 | enable_after: 0 55 | flow_3d: 56 | weight: 5.0 57 | enable_after: 0 58 | shape: 59 | weight: 4.0 60 | enable_after: 0 61 | dynamic_coef: 1.0 62 | decay_end_epochs: -1 63 | decay_low_weight: 1.0 64 | 65 | model_wrapper: 66 | lr: 5e-5 67 | cache_track: false 68 | 69 | model: 70 | use_correspondence_weights: true 71 | 72 | data_module: 73 | train: 74 | num_workers: 4 75 | persistent_workers: true 76 | batch_size: 2 # batch-size of per-gpu 77 | seed: 233 78 | val: 79 | num_workers: 4 80 | persistent_workers: true 81 | batch_size: 2 82 | seed: 233 83 | 84 | hydra: 85 | run: 86 | dir: ${save_dir}/${now:%Y-%m-%d}/${now:%H-%M-%S} -------------------------------------------------------------------------------- /config/datagen_hoi4d.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - dataset: [hoi4d] 3 | - flow: gmflow 4 | - tracking: cotracker 5 | - model/backbone: nvds_unidepth 6 | - model/intrinsics: model 7 | - model/extrinsics: procrustes_flow 8 | - loss: [dynamic_area, cc, tracking_3d, flow_3d, shape] 9 | - visualizer: [summary] 10 | - _self_ 11 | 12 | base_cache_dir: ./cache 13 | save_dir: ./cache/models 14 | 15 | preprocess: 16 | resize_shape: [300, 400] # First resize the image into resize_shape. 17 | patch_size: 32 # Then conduct center_crop with w&h divided by patch_size equal to 0. 
18 | num_frames: 5 19 | 20 | # frame_sampler: egomono4d/frame_sampler/__init__.py 21 | 22 | dataset: 23 | hoi4d: 24 | frame_sampler: pretrain_interval 25 | frame_max_interval: 4 26 | 27 | wandb: 28 | project: egomono4d 29 | mode: online 30 | name: placeholder 31 | group: null 32 | tags: null 33 | 34 | checkpoint: 35 | load: null 36 | 37 | trainer: 38 | val_check_interval: 0.1 39 | gradient_clip_val: 10.0 40 | max_epochs: 25 41 | accumulate_grad_batches: 1 42 | num_nodes: 1 43 | gpus: 8 44 | 45 | loss: 46 | dynamic_area: 47 | weight: 0.005 48 | enable_after: 0 49 | cc: 50 | weight: 1.0 51 | enable_after: 0 52 | tracking_3d: 53 | weight: 5.0 54 | enable_after: 0 55 | flow_3d: 56 | weight: 5.0 57 | enable_after: 0 58 | shape: 59 | weight: 4.0 60 | enable_after: 0 61 | dynamic_coef: 1.0 62 | decay_end_epochs: -1 63 | decay_low_weight: 1.0 64 | 65 | model_wrapper: 66 | lr: 5e-5 67 | cache_track: false 68 | 69 | model: 70 | use_correspondence_weights: true 71 | 72 | data_module: 73 | train: 74 | num_workers: 4 75 | persistent_workers: true 76 | batch_size: 2 # batch-size of per-gpu 77 | seed: 233 78 | val: 79 | num_workers: 4 80 | persistent_workers: true 81 | batch_size: 2 82 | seed: 233 83 | 84 | hydra: 85 | run: 86 | dir: ${save_dir}/${now:%Y-%m-%d}/${now:%H-%M-%S} -------------------------------------------------------------------------------- /config/pretrain_eval_h2o.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - dataset: [h2o] 3 | - flow: gmflow 4 | - tracking: cotracker 5 | - model/backbone: nvds_unidepth 6 | - model/intrinsics: model 7 | - model/extrinsics: procrustes_flow 8 | - loss: [dynamic_area, cc, tracking_3d, flow_3d, shape] 9 | - visualizer: [summary] 10 | - _self_ 11 | 12 | base_cache_dir: ./cache 13 | save_dir: ./cache/models 14 | 15 | preprocess: 16 | resize_shape: [300, 400] # First resize the image into resize_shape. 17 | patch_size: 32 # Then conduct center_crop with w&h divided by patch_size equal to 0. 
18 | num_frames: 4 19 | 20 | dataset: 21 | h2o: 22 | clip_frame: 40 23 | 24 | wandb: 25 | project: egomono4d 26 | mode: online 27 | name: placeholder 28 | group: null 29 | tags: null 30 | 31 | checkpoint: 32 | load: ./cache/processed_datasets/egomono4d_result/2024-09-11/14-12-41/ptr_all_01_dp2/egomono4d/tdqluu5w/checkpoints/last.ckpt 33 | 34 | trainer: 35 | val_check_interval: 0.1 36 | gradient_clip_val: 10.0 37 | max_epochs: 25 38 | accumulate_grad_batches: 1 39 | num_nodes: 1 40 | gpus: 8 41 | 42 | loss: 43 | dynamic_area: 44 | weight: 0.005 45 | enable_after: 0 46 | cc: 47 | weight: 1.0 48 | enable_after: 0 49 | tracking_3d: 50 | weight: 5.0 51 | enable_after: 0 52 | flow_3d: 53 | weight: 5.0 54 | enable_after: 0 55 | shape: 56 | weight: 4.0 57 | enable_after: 0 58 | dynamic_coef: 1.0 59 | decay_end_epochs: -1 60 | decay_low_weight: 1.0 61 | 62 | model_wrapper: 63 | lr: 5e-5 64 | cache_track: false 65 | 66 | model: 67 | use_correspondence_weights: true 68 | 69 | data_module: 70 | train: 71 | num_workers: 4 72 | persistent_workers: true 73 | batch_size: 2 # batch-size of per-gpu 74 | seed: 233 75 | val: 76 | num_workers: 4 77 | persistent_workers: true 78 | batch_size: 2 79 | seed: 233 80 | 81 | hydra: 82 | run: 83 | dir: ${save_dir}/${now:%Y-%m-%d}/${now:%H-%M-%S} -------------------------------------------------------------------------------- /config/pretrain_eval_hoi4d.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - dataset: [hoi4d] 3 | - flow: gmflow 4 | - tracking: cotracker 5 | - model/backbone: nvds_unidepth 6 | - model/intrinsics: model 7 | - model/extrinsics: procrustes_flow 8 | - loss: [dynamic_area, cc, tracking_3d, flow_3d, shape] 9 | - visualizer: [summary] 10 | - _self_ 11 | 12 | base_cache_dir: ./cache 13 | save_dir: ./cache/models 14 | 15 | preprocess: 16 | resize_shape: [300, 400] # First resize the image into resize_shape. 17 | patch_size: 32 # Then conduct center_crop with w&h divided by patch_size equal to 0. 
18 | num_frames: 4 19 | 20 | dataset: 21 | hoi4d: 22 | clip_frame: 40 23 | 24 | wandb: 25 | project: egomono4d 26 | mode: online 27 | name: placeholder 28 | group: null 29 | tags: null 30 | 31 | checkpoint: 32 | load: ./cache/processed_datasets/egomono4d_result/2024-09-11/14-12-41/ptr_all_01_dp2/egomono4d/tdqluu5w/checkpoints/last.ckpt 33 | 34 | trainer: 35 | val_check_interval: 0.1 36 | gradient_clip_val: 10.0 37 | max_epochs: 25 38 | accumulate_grad_batches: 1 39 | num_nodes: 1 40 | gpus: 8 41 | 42 | loss: 43 | dynamic_area: 44 | weight: 0.005 45 | enable_after: 0 46 | cc: 47 | weight: 1.0 48 | enable_after: 0 49 | tracking_3d: 50 | weight: 5.0 51 | enable_after: 0 52 | flow_3d: 53 | weight: 5.0 54 | enable_after: 0 55 | shape: 56 | weight: 4.0 57 | enable_after: 0 58 | dynamic_coef: 1.0 59 | decay_end_epochs: -1 60 | decay_low_weight: 1.0 61 | 62 | model_wrapper: 63 | lr: 5e-5 64 | cache_track: false 65 | 66 | model: 67 | use_correspondence_weights: true 68 | 69 | data_module: 70 | train: 71 | num_workers: 4 72 | persistent_workers: true 73 | batch_size: 2 # batch-size of per-gpu 74 | seed: 233 75 | val: 76 | num_workers: 4 77 | persistent_workers: true 78 | batch_size: 2 79 | seed: 233 80 | 81 | hydra: 82 | run: 83 | dir: ${save_dir}/${now:%Y-%m-%d}/${now:%H-%M-%S} -------------------------------------------------------------------------------- /config/pretrain_eval_arctic.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - dataset: [arctic] 3 | - flow: gmflow 4 | - tracking: cotracker 5 | - model/backbone: nvds_unidepth 6 | - model/intrinsics: model 7 | - model/extrinsics: procrustes_flow 8 | - loss: [dynamic_area, cc, tracking_3d, flow_3d, shape] 9 | - visualizer: [summary] 10 | - _self_ 11 | 12 | base_cache_dir: ./cache 13 | save_dir: ./cache/models 14 | 15 | preprocess: 16 | resize_shape: [300, 400] # First resize the image into resize_shape. 17 | patch_size: 32 # Then conduct center_crop with w&h divided by patch_size equal to 0. 
18 | num_frames: 4 19 | 20 | dataset: 21 | arctic: 22 | clip_frame: 40 23 | 24 | wandb: 25 | project: egomono4d 26 | mode: online 27 | name: placeholder 28 | group: null 29 | tags: null 30 | 31 | checkpoint: 32 | load: ./cache/processed_datasets/egomono4d_result/2024-09-11/14-12-41/ptr_all_01_dp2/egomono4d/tdqluu5w/checkpoints/last.ckpt 33 | 34 | trainer: 35 | val_check_interval: 0.1 36 | gradient_clip_val: 10.0 37 | max_epochs: 25 38 | accumulate_grad_batches: 1 39 | num_nodes: 1 40 | gpus: 8 41 | 42 | loss: 43 | dynamic_area: 44 | weight: 0.005 45 | enable_after: 0 46 | cc: 47 | weight: 1.0 48 | enable_after: 0 49 | tracking_3d: 50 | weight: 5.0 51 | enable_after: 0 52 | flow_3d: 53 | weight: 5.0 54 | enable_after: 0 55 | shape: 56 | weight: 4.0 57 | enable_after: 0 58 | dynamic_coef: 1.0 59 | decay_end_epochs: -1 60 | decay_low_weight: 1.0 61 | 62 | model_wrapper: 63 | lr: 5e-5 64 | cache_track: false 65 | 66 | model: 67 | use_correspondence_weights: true 68 | 69 | data_module: 70 | train: 71 | num_workers: 4 72 | persistent_workers: true 73 | batch_size: 2 # batch-size of per-gpu 74 | seed: 233 75 | val: 76 | num_workers: 4 77 | persistent_workers: true 78 | batch_size: 2 79 | seed: 233 80 | 81 | hydra: 82 | run: 83 | dir: ${save_dir}/${now:%Y-%m-%d}/${now:%H-%M-%S} -------------------------------------------------------------------------------- /config/pretrain_eval_pov_surgery.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - dataset: [pov_surgery] 3 | - flow: gmflow 4 | - tracking: cotracker 5 | - model/backbone: nvds_unidepth 6 | - model/intrinsics: model 7 | - model/extrinsics: procrustes_flow 8 | - loss: [dynamic_area, cc, tracking_3d, flow_3d, shape] 9 | - visualizer: [summary] 10 | - _self_ 11 | 12 | base_cache_dir: ./cache 13 | save_dir: ./cache/models 14 | 15 | preprocess: 16 | resize_shape: [300, 400] # First resize the image into resize_shape. 17 | patch_size: 32 # Then conduct center_crop with w&h divided by patch_size equal to 0. 
18 | num_frames: 4 19 | 20 | dataset: 21 | pov_surgery: 22 | clip_frame: 40 23 | 24 | wandb: 25 | project: egomono4d 26 | mode: online 27 | name: placeholder 28 | group: null 29 | tags: null 30 | 31 | checkpoint: 32 | load: ./cache/processed_datasets/egomono4d_result/2024-09-11/14-12-41/ptr_all_01_dp2/egomono4d/tdqluu5w/checkpoints/last.ckpt 33 | 34 | trainer: 35 | val_check_interval: 0.1 36 | gradient_clip_val: 10.0 37 | max_epochs: 25 38 | accumulate_grad_batches: 1 39 | num_nodes: 1 40 | gpus: 8 41 | 42 | loss: 43 | dynamic_area: 44 | weight: 0.005 45 | enable_after: 0 46 | cc: 47 | weight: 1.0 48 | enable_after: 0 49 | tracking_3d: 50 | weight: 5.0 51 | enable_after: 0 52 | flow_3d: 53 | weight: 5.0 54 | enable_after: 0 55 | shape: 56 | weight: 4.0 57 | enable_after: 0 58 | dynamic_coef: 1.0 59 | decay_end_epochs: -1 60 | decay_low_weight: 1.0 61 | 62 | model_wrapper: 63 | lr: 5e-5 64 | cache_track: false 65 | 66 | model: 67 | use_correspondence_weights: true 68 | 69 | data_module: 70 | train: 71 | num_workers: 4 72 | persistent_workers: true 73 | batch_size: 2 # batch-size of per-gpu 74 | seed: 233 75 | val: 76 | num_workers: 4 77 | persistent_workers: true 78 | batch_size: 2 79 | seed: 233 80 | 81 | hydra: 82 | run: 83 | dir: ${save_dir}/${now:%Y-%m-%d}/${now:%H-%M-%S} -------------------------------------------------------------------------------- /config/datagen_pov_surgery.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - dataset: [pov_surgery] 3 | - flow: gmflow 4 | - tracking: cotracker 5 | - model/backbone: midas 6 | - model/intrinsics: softmin 7 | - model/extrinsics: procrustes_flow 8 | - loss: [dynamic_area, midas, tracking, flow, tracking_3d] # simple_arap 9 | - visualizer: [summary] 10 | - _self_ 11 | 12 | base_cache_dir: ./cache 13 | save_dir: ./cache/models 14 | 15 | preprocess: 16 | resize_shape: [300, 400] # First resize the image into resize_shape. 17 | patch_size: 32 # Then conduct center_crop with w&h divided by patch_size equal to 0. 
18 | num_frames: 4 19 | 20 | dataset: 21 | pov_surgery: 22 | clip_frame: 40 23 | frame_sampler: pretrain_interval 24 | frame_max_interval: 4 25 | 26 | wandb: 27 | project: egomono4d 28 | mode: online 29 | name: placeholder 30 | group: null 31 | tags: null 32 | 33 | checkpoint: 34 | load: null 35 | 36 | trainer: 37 | val_check_interval: 0.1 38 | # check_val_every_n_epoch: 1 39 | gradient_clip_val: 10.0 40 | max_epochs: 25 41 | accumulate_grad_batches: 1 42 | num_nodes: 1 43 | gpus: 8 44 | 45 | loss: 46 | dynamic_area: 47 | weight: 0.005 48 | enable_after: 0 49 | cc: 50 | weight: 1.0 51 | enable_after: 0 52 | tracking_3d: 53 | weight: 5.0 54 | enable_after: 0 55 | flow_3d: 56 | weight: 5.0 57 | enable_after: 0 58 | shape: 59 | weight: 4.0 60 | enable_after: 0 61 | dynamic_coef: 1.0 62 | decay_end_epochs: -1 63 | decay_low_weight: 1.0 64 | 65 | model_wrapper: 66 | lr: 5e-5 67 | cache_track: false 68 | 69 | model: 70 | use_correspondence_weights: true 71 | 72 | data_module: 73 | train: 74 | num_workers: 4 75 | persistent_workers: true 76 | batch_size: 2 # batch-size of per-gpu 77 | seed: 233 78 | val: 79 | num_workers: 4 80 | persistent_workers: true 81 | batch_size: 2 82 | seed: 233 83 | 84 | hydra: 85 | run: 86 | dir: ${save_dir}/${now:%Y-%m-%d}/${now:%H-%M-%S} -------------------------------------------------------------------------------- /egomono4d/model/extrinsics/extrinsics_procrustes_flow.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Literal 3 | 4 | import torch 5 | from jaxtyping import Float 6 | from torch import Tensor 7 | 8 | from ...dataset.types import Batch 9 | from ...flow.flow_predictor import Flows 10 | from ..backbone.backbone import BackboneOutput 11 | from ..projection import align_surfaces 12 | from .extrinsics import Extrinsics 13 | 14 | 15 | @dataclass 16 | class ExtrinsicsProcrustesFlowCfg: 17 | name: Literal["procrustes_flow"] 18 | num_points: int | None 19 | randomize_points: bool 20 | 21 | 22 | class ExtrinsicsProcrustesFlow(Extrinsics[ExtrinsicsProcrustesFlowCfg]): 23 | def forward( 24 | self, 25 | batch: Batch, 26 | flows: Flows, 27 | backbone_output: BackboneOutput, 28 | surfaces: Float[Tensor, "batch frame height width 3"], 29 | ) -> Float[Tensor, "batch frame 4 4"]: 30 | device = surfaces.device 31 | _, _, h, w, _ = surfaces.shape 32 | 33 | # Select the subset of points used for the alignment. 34 | if self.cfg.num_points is None: 35 | indices = torch.arange(h * w, dtype=torch.int64, device=device) 36 | elif self.cfg.randomize_points: 37 | indices = torch.randint( 38 | 0, 39 | h * w, 40 | (self.cfg.num_points,), 41 | dtype=torch.int64, 42 | device=device, 43 | ) 44 | else: 45 | indices = torch.linspace( 46 | 0, 47 | h * w - 1, 48 | self.cfg.num_points, 49 | dtype=torch.int64, 50 | device=device, 51 | ) 52 | 53 | # Align the depth maps using a Procrustes fit. 
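        # A sketch of the assumed behaviour of align_surfaces (defined in ..projection):
        # the backward flow pairs each pixel in frame t with a location in frame t-1, the
        # sampled `indices` select a subset of these correspondences, and a weighted rigid
        # (Procrustes) fit between the paired 3D points gives the relative pose of each
        # adjacent frame pair, which is then chained into per-frame extrinsics.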
54 | return align_surfaces( 55 | surfaces, # (B, F, H, W, 3) 56 | flows.backward, # (B, F-1, H, W, 2) 57 | backbone_output.weights, # (B, F-1, H, W) 58 | indices, # rand-index (H*W) 59 | ) 60 | # (B, F, 4, 4) -------------------------------------------------------------------------------- /config/datagen_epic_kitchen.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - dataset: [epic_kitchen] 3 | - flow: gmflow 4 | - tracking: cotracker 5 | - model/backbone: nvds_unidepth 6 | - model/intrinsics: model 7 | - model/extrinsics: procrustes_flow 8 | - loss: [dynamic_area, cc, tracking_3d, flow_3d, shape] 9 | - visualizer: [summary] 10 | - _self_ 11 | 12 | base_cache_dir: ./cache 13 | save_dir: ./cache/models 14 | 15 | preprocess: 16 | resize_shape: [300, 400] # First resize the image into resize_shape. 17 | patch_size: 32 # Then conduct center_crop with w&h divided by patch_size equal to 0. 18 | num_frames: 5 19 | 20 | dataset: 21 | epic_kitchen: 22 | clip_frame: 20 23 | mask_estimation: ['egohos', 'epipolar'] 24 | mask_binary_open_value: 5000 25 | frame_sampler: pretrain_interval 26 | frame_max_interval: 3 27 | 28 | wandb: 29 | project: egomono4d 30 | mode: online 31 | name: placeholder 32 | group: null 33 | tags: null 34 | 35 | checkpoint: 36 | load: null 37 | 38 | trainer: 39 | val_check_interval: 0.1 40 | gradient_clip_val: 10.0 41 | max_epochs: 25 42 | accumulate_grad_batches: 1 43 | num_nodes: 1 44 | gpus: 8 45 | 46 | loss: 47 | dynamic_area: 48 | weight: 0.005 49 | enable_after: 0 50 | cc: 51 | weight: 1.0 52 | enable_after: 0 53 | tracking_3d: 54 | weight: 5.0 55 | enable_after: 0 56 | flow_3d: 57 | weight: 5.0 58 | enable_after: 0 59 | shape: 60 | weight: 4.0 61 | enable_after: 0 62 | dynamic_coef: 1.0 63 | decay_end_epochs: -1 64 | decay_low_weight: 1.0 65 | 66 | model_wrapper: 67 | lr: 5e-5 68 | cache_track: false 69 | 70 | model: 71 | use_correspondence_weights: true 72 | 73 | data_module: 74 | train: 75 | num_workers: 4 76 | persistent_workers: true 77 | batch_size: 2 # batch-size of per-gpu 78 | seed: 233 79 | val: 80 | num_workers: 4 81 | persistent_workers: true 82 | batch_size: 2 83 | seed: 233 84 | 85 | hydra: 86 | run: 87 | dir: ${save_dir}/${now:%Y-%m-%d}/${now:%H-%M-%S} -------------------------------------------------------------------------------- /egomono4d/repo/gmflow/gmflow/position.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | # https://github.com/facebookresearch/detr/blob/main/models/position_encoding.py 3 | 4 | import torch 5 | import torch.nn as nn 6 | import math 7 | 8 | 9 | class PositionEmbeddingSine(nn.Module): 10 | """ 11 | This is a more standard version of the position embedding, very similar to the one 12 | used by the Attention is all you need paper, generalized to work on images. 
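    In short: row/column indices are normalized to (0, 2*pi] and each coordinate p is
    encoded as interleaved sin(p / T^(2i/d)) and cos(p / T^(2i/d)) features with
    temperature T = 10000 and d = num_pos_feats; the y- and x-encodings are concatenated,
    giving 2 * num_pos_feats output channels.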
13 | """ 14 | 15 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=True, scale=None): 16 | super().__init__() 17 | self.num_pos_feats = num_pos_feats 18 | self.temperature = temperature 19 | self.normalize = normalize 20 | if scale is not None and normalize is False: 21 | raise ValueError("normalize should be True if scale is passed") 22 | if scale is None: 23 | scale = 2 * math.pi 24 | self.scale = scale 25 | 26 | def forward(self, x): 27 | # x = tensor_list.tensors # [B, C, H, W] 28 | # mask = tensor_list.mask # [B, H, W], input with padding, valid as 0 29 | b, c, h, w = x.size() 30 | mask = torch.ones((b, h, w), device=x.device) # [B, H, W] 31 | y_embed = mask.cumsum(1, dtype=torch.float32) 32 | x_embed = mask.cumsum(2, dtype=torch.float32) 33 | if self.normalize: 34 | eps = 1e-6 35 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 36 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 37 | 38 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 39 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 40 | 41 | pos_x = x_embed[:, :, :, None] / dim_t 42 | pos_y = y_embed[:, :, :, None] / dim_t 43 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 44 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 45 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 46 | return pos 47 | -------------------------------------------------------------------------------- /egomono4d/loss/loss_shape.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Literal 3 | import torch 4 | import pdb 5 | 6 | from jaxtyping import Float 7 | from torch import Tensor 8 | import torch.nn.functional as F 9 | from torchvision.utils import save_image 10 | 11 | from ..dataset.types import Batch 12 | from ..flow import Flows 13 | from ..model.model import ModelOutput 14 | from ..tracking import Tracks 15 | from .loss import Loss, LossCfgCommon 16 | 17 | from ..model.procrustes import align_scaled_rigid 18 | 19 | @dataclass 20 | class LossShapeCfg(LossCfgCommon): 21 | name: Literal["shape"] 22 | dynamic_coef: float 23 | decay_end_epochs: int 24 | decay_low_weight: float 25 | 26 | 27 | def loss_shape_func(ref_pcds, surfaces, flys, loss_func, return_val=False, inf_mode=False, cfg=None): 28 | b, f, h, w, _ = ref_pcds.shape 29 | device = ref_pcds.device 30 | 31 | surfaces = surfaces.reshape(b*f, h*w, 3) 32 | pcd_r = ref_pcds.reshape(b*f, h*w, 3) 33 | 34 | # we keep all points the same weight to conduct constraint on shape rather than points. 
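    # Concretely, only the validity mask `flys` acts as a per-point weight (no extra
    # correspondence weighting), and the weighted scaled-rigid alignment below removes any
    # global pose/scale offset, so the residual measures shape error only. Reading
    # align_scaled_rigid as an Umeyama-style similarity fit is an assumption based on its
    # name and how it is used here.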
35 | weights = flys.reshape(b*f,h*w) 36 | transform, scale = align_scaled_rigid(surfaces, pcd_r, weights=weights) 37 | 38 | surfaces_transformed = torch.matmul(transform[..., :3,:3], surfaces.mT).mT + transform[..., None, :3, 3] 39 | 40 | loss_map = loss_func(surfaces_transformed, pcd_r).sum(dim=-1) * weights 41 | loss_map = loss_map.reshape(b, f, h, w) 42 | loss = loss_map.sum() / weights.sum() 43 | 44 | return loss, {"shape": loss} 45 | 46 | 47 | class LossShape(Loss[LossShapeCfg]): 48 | def __init__(self, cfg: LossShapeCfg) -> None: 49 | super().__init__(cfg) 50 | self.loss = torch.nn.MSELoss(reduction="none") 51 | 52 | def compute_unweighted_loss( 53 | self, 54 | batch: Batch, 55 | flows: Flows, 56 | tracks: list[Tracks] | None, 57 | model_output: ModelOutput, 58 | current_epoch: int, 59 | return_val: bool, 60 | ) -> tuple[Float[Tensor, ""], dict]: 61 | return loss_shape_func(batch.pcds, model_output.surfaces, batch.flys, self.loss, return_val=return_val, cfg=self.cfg) 62 | 63 | -------------------------------------------------------------------------------- /egomono4d/eval/eval_extrinsic.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pdb 3 | import numpy as np 4 | from evo.core.trajectory import PoseTrajectory3D 5 | from evo.core.transformations import quaternion_from_matrix 6 | from evo.core import metrics 7 | from evo.tools import file_interface 8 | import evo.main_ape as main_ape 9 | import evo.main_rpe as main_rpe 10 | 11 | 12 | def tensor_to_trajectory(tensor): 13 | # Tensor(seq_length, 4, 4) --> PoseTrajectory3D 14 | seq_length = len(tensor) 15 | timestamps = np.arange(seq_length) 16 | return PoseTrajectory3D(poses_se3=list(tensor.cpu().numpy()), timestamps=timestamps) 17 | 18 | 19 | def eval_extrinsic_conductor(pred_extrinsic, gt_extrinsic, correct_scale=True): # for mono-slam, correct_scale=True 20 | 21 | ate_list, rpe_trans_list, rpe_rot_list = [], [], [] 22 | for i in range(len(pred_extrinsic)): 23 | 24 | pred = pred_extrinsic[i] 25 | gt = gt_extrinsic[i] 26 | traj_est = tensor_to_trajectory(pred) 27 | traj_ref = tensor_to_trajectory(gt) 28 | 29 | ate_result = main_ape.ape(traj_ref, traj_est, est_name='ate', 30 | pose_relation=metrics.PoseRelation.translation_part, align=True, correct_scale=correct_scale) 31 | 32 | rpe_trans_result = main_rpe.rpe(traj_ref, traj_est, est_name='rpe_t', delta=1.0, delta_unit=metrics.Unit.frames, 33 | pose_relation=metrics.PoseRelation.translation_part, align=True, rel_delta_tol=0.1, correct_scale=correct_scale) 34 | 35 | rpe_rot_result = main_rpe.rpe(traj_ref, traj_est, est_name='rpe_r', delta=1.0, delta_unit=metrics.Unit.frames, 36 | pose_relation=metrics.PoseRelation.rotation_angle_deg, align=True, rel_delta_tol=0.1, correct_scale=correct_scale) 37 | ate_list.append(ate_result.stats["mean"]) 38 | rpe_trans_list.append(rpe_trans_result.stats["mean"]) 39 | rpe_rot_list.append(rpe_rot_result.stats["mean"]) 40 | 41 | return { 42 | 'CAM_ATE(mm)': 1000.0 * sum(ate_list) / len(ate_list), 43 | 'CAM_RPE_Trans(mm)': 1000.0 * sum(rpe_trans_list) / len(rpe_trans_list), 44 | 'CAM_RPE_Rot(deg)': sum(rpe_rot_list) / len(rpe_rot_list) 45 | } 46 | 47 | -------------------------------------------------------------------------------- /egomono4d/repo/gmflow/utils/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | class InputPadder: 6 | """ Pads images such that dimensions are divisible by 8 """ 
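    # Note: the actual divisor is the `padding_factor` argument (default 8); the
    # GMFlow-with-refinement commands in the repo scripts pass --padding_factor 32.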
7 | 8 | def __init__(self, dims, mode='sintel', padding_factor=8): 9 | self.ht, self.wd = dims[-2:] 10 | pad_ht = (((self.ht // padding_factor) + 1) * padding_factor - self.ht) % padding_factor 11 | pad_wd = (((self.wd // padding_factor) + 1) * padding_factor - self.wd) % padding_factor 12 | if mode == 'sintel': 13 | self._pad = [pad_wd // 2, pad_wd - pad_wd // 2, pad_ht // 2, pad_ht - pad_ht // 2] 14 | else: 15 | self._pad = [pad_wd // 2, pad_wd - pad_wd // 2, 0, pad_ht] 16 | 17 | def pad(self, *inputs): 18 | return [F.pad(x, self._pad, mode='replicate') for x in inputs] 19 | 20 | def unpad(self, x): 21 | ht, wd = x.shape[-2:] 22 | c = [self._pad[2], ht - self._pad[3], self._pad[0], wd - self._pad[1]] 23 | return x[..., c[0]:c[1], c[2]:c[3]] 24 | 25 | 26 | def coords_grid(batch, ht, wd, normalize=False): 27 | if normalize: # [-1, 1] 28 | coords = torch.meshgrid(2 * torch.arange(ht) / (ht - 1) - 1, 29 | 2 * torch.arange(wd) / (wd - 1) - 1) 30 | else: 31 | coords = torch.meshgrid(torch.arange(ht), torch.arange(wd)) 32 | coords = torch.stack(coords[::-1], dim=0).float() 33 | return coords[None].repeat(batch, 1, 1, 1) # [B, 2, H, W] 34 | 35 | 36 | def compute_out_of_boundary_mask(flow): 37 | # flow: [B, 2, H, W] 38 | assert flow.dim() == 4 and flow.size(1) == 2 39 | b, _, h, w = flow.shape 40 | init_coords = coords_grid(b, h, w).to(flow.device) 41 | corres = init_coords + flow # [B, 2, H, W] 42 | 43 | max_w = w - 1 44 | max_h = h - 1 45 | 46 | valid_mask = (corres[:, 0] >= 0) & (corres[:, 0] <= max_w) & (corres[:, 1] >= 0) & (corres[:, 1] <= max_h) 47 | 48 | # in case very large flow 49 | flow_mask = (flow[:, 0].abs() <= max_w) & (flow[:, 1].abs() <= max_h) 50 | 51 | valid_mask = valid_mask & flow_mask 52 | 53 | return valid_mask # [B, H, W] 54 | 55 | 56 | def count_parameters(model): 57 | num = sum(p.numel() for p in model.parameters() if p.requires_grad) 58 | return num 59 | -------------------------------------------------------------------------------- /egomono4d/visualization/drawing/points.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import torch 4 | from einops import repeat 5 | from jaxtyping import Float 6 | from torch import Tensor 7 | 8 | from .coordinate_conversion import generate_conversions 9 | from .rendering import render_over_image 10 | from .types import Pair, Scalar, Vector, sanitize_scalar, sanitize_vector 11 | 12 | 13 | def draw_points( 14 | image: Float[Tensor, "3 height width"] | Float[Tensor, "4 height width"], 15 | points: Vector, 16 | color: Vector = [1, 1, 1], 17 | radius: Scalar = 1, 18 | inner_radius: Scalar = 0, 19 | num_msaa_passes: int = 1, 20 | x_range: Optional[Pair] = None, 21 | y_range: Optional[Pair] = None, 22 | ) -> Float[Tensor, "3 height width"] | Float[Tensor, "4 height width"]: 23 | device = image.device 24 | points = sanitize_vector(points, 2, device) 25 | color = sanitize_vector(color, 3, device) 26 | radius = sanitize_scalar(radius, device) 27 | inner_radius = sanitize_scalar(inner_radius, device) 28 | (num_points,) = torch.broadcast_shapes( 29 | points.shape[0], 30 | color.shape[0], 31 | radius.shape, 32 | inner_radius.shape, 33 | ) 34 | 35 | # Convert world-space points to pixel space. 
36 | _, h, w = image.shape 37 | world_to_pixel, _ = generate_conversions((h, w), device, x_range, y_range) 38 | points = world_to_pixel(points) 39 | 40 | def color_function( 41 | xy: Float[Tensor, "point 2"], 42 | ) -> Float[Tensor, "point 4"]: 43 | # Define a vector between the start and end points. 44 | delta = xy[:, None] - points[None] 45 | delta_norm = delta.norm(dim=-1) 46 | mask = (delta_norm >= inner_radius[None]) & (delta_norm <= radius[None]) 47 | 48 | # Determine the sample's color. 49 | selectable_color = color.broadcast_to((num_points, 3)) 50 | arrangement = mask * torch.arange(num_points, device=device) 51 | top_color = selectable_color.gather( 52 | dim=0, 53 | index=repeat(arrangement.argmax(dim=1), "s -> s c", c=3), 54 | ) 55 | rgba = torch.cat((top_color, mask.any(dim=1).float()[:, None]), dim=-1) 56 | 57 | return rgba 58 | 59 | return render_over_image(image, color_function, device, num_passes=num_msaa_passes) 60 | -------------------------------------------------------------------------------- /egomono4d/datagen.py: -------------------------------------------------------------------------------- 1 | import hydra 2 | import torch 3 | from jaxtyping import install_import_hook 4 | from lightning import Trainer 5 | import pdb 6 | from lightning.pytorch.plugins.environments import SLURMEnvironment 7 | from omegaconf import DictConfig 8 | 9 | # Configure beartype and jaxtyping. 10 | with install_import_hook( 11 | ("flowmap",), 12 | ("beartype", "beartype"), 13 | ): 14 | from .config.common import get_typed_root_config 15 | from .config.pretrain import PretrainCfg 16 | from .dataset.data_module_pretrain import DataModulePretrain 17 | from .loss import get_losses 18 | from .misc.common_training_setup import run_common_training_setup 19 | from .model.model import Model 20 | from .model.model_wrapper_pretrain import ModelWrapperPretrain 21 | from .visualization import get_visualizers 22 | 23 | from .dataset import get_dataset 24 | 25 | @hydra.main( 26 | version_base=None, 27 | config_path="../config", 28 | config_name="datagen_pov_surgery", 29 | ) 30 | def pretrain(cfg_dict: DictConfig) -> None: 31 | cfg = get_typed_root_config(cfg_dict, PretrainCfg) 32 | cfg.flow.cache_dir = cfg.base_cache_dir 33 | loss_name_list = [cfg_item.name for cfg_item in cfg.loss] 34 | 35 | for dataset_cfg in cfg.dataset: 36 | dataset_cfg.resize_shape = cfg.preprocess.resize_shape 37 | dataset_cfg.patch_size = cfg.preprocess.patch_size 38 | dataset_cfg.num_frames = cfg.preprocess.num_frames 39 | dataset_cfg.cache_dir = cfg.base_cache_dir 40 | dataset_cfg.use_consistency_loss = ('cc' in loss_name_list) 41 | if hasattr(dataset_cfg, "mask_flow_model"): 42 | dataset_cfg.mask_flow_model = cfg.flow 43 | 44 | cfg.trainer.gpus = 1 45 | 46 | dataset_train = get_dataset(cfg.dataset, 'train', debug=False, global_rank=0, world_size=1) 47 | dataset_val = get_dataset(cfg.dataset, 'val', debug=False, global_rank=0, world_size=1) 48 | dataset_test = get_dataset(cfg.dataset, 'test', debug=False, global_rank=0, world_size=1) 49 | train = iter(dataset_train).__next__() 50 | val = iter(dataset_val).__next__() 51 | test = iter(dataset_test).__next__() 52 | pdb.set_trace() 53 | 54 | 55 | if __name__ == "__main__": 56 | pretrain() 57 | 58 | # CUDA_VISIBLE_DEVICES=0,1 python -m egomono4d.data -------------------------------------------------------------------------------- /egomono4d/loss/loss_dynamic_area.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 
| from typing import Literal 3 | import torch 4 | import pdb 5 | from einops import einsum, rearrange 6 | 7 | from jaxtyping import Float 8 | from torch import Tensor 9 | import torch.nn.functional as F 10 | from torchvision.utils import save_image 11 | 12 | from ..dataset.types import Batch 13 | from ..flow import Flows 14 | from ..model.model import ModelOutput 15 | from ..model.projection import sample_image_grid 16 | 17 | from ..tracking import Tracks 18 | from .loss import Loss, LossCfgCommon 19 | from .mapping import MappingCfg, get_mapping 20 | 21 | earlier = lambda x: x[:, :-1] # noqa 22 | later = lambda x: x[:, 1:] # noqa 23 | 24 | 25 | @dataclass 26 | class LossDynamicAreaCfg(LossCfgCommon): 27 | name: Literal["dynamic_area"] 28 | 29 | 30 | class LossDynamicArea(Loss[LossDynamicAreaCfg]): 31 | def __init__(self, cfg: LossDynamicAreaCfg) -> None: 32 | super().__init__(cfg) 33 | self.bce_loss = torch.nn.BCELoss(reduction="none") 34 | 35 | def compute_unweighted_loss( 36 | self, 37 | batch: Batch, 38 | flows: Flows, 39 | tracks: list[Tracks] | None, 40 | model_output: ModelOutput, 41 | current_epoch: int, 42 | return_val: bool 43 | ) -> tuple[Float[Tensor, ""], dict]: 44 | 45 | surfaces = model_output.surfaces 46 | device = surfaces.device 47 | b, f, h, w, _ = surfaces.shape 48 | xy, _ = sample_image_grid((h, w), device=device) 49 | 50 | later_mask = later(batch.masks) # (b, f-1, h, w) 51 | b_xy_earlier = rearrange(xy + flows.backward, "b f h w xy -> (b f) h w xy") 52 | earlier_mask = F.grid_sample( 53 | rearrange(earlier(batch.masks), "b f h w -> (b f) () h w"), 54 | b_xy_earlier * 2 - 1, 55 | align_corners=True, 56 | mode='bilinear', 57 | padding_mode="zeros" 58 | ) 59 | earlier_mask = rearrange(earlier_mask, "(b f) () h w -> b f h w", b=b, f=f-1) 60 | gt_mask = later_mask * earlier_mask 61 | 62 | loss = self.bce_loss(model_output.backward_correspondence_weights, gt_mask) 63 | valid = h * w * (f-1) * b 64 | 65 | 66 | loss = loss.sum() / (valid or 1) 67 | return loss, {"dynamic_area": loss} -------------------------------------------------------------------------------- /egomono4d/misc/image_io.py: -------------------------------------------------------------------------------- 1 | import io 2 | from pathlib import Path 3 | from typing import Union 4 | 5 | import numpy as np 6 | import torch 7 | import torchvision.transforms as tf 8 | from einops import rearrange, repeat 9 | from jaxtyping import Float, UInt8 10 | from matplotlib.figure import Figure 11 | from PIL import Image 12 | from torch import Tensor 13 | 14 | FloatImage = Union[ 15 | Float[Tensor, "height width"], 16 | Float[Tensor, "channel height width"], 17 | Float[Tensor, "batch channel height width"], 18 | ] 19 | 20 | 21 | def fig_to_image( 22 | fig: Figure, 23 | dpi: int = 100, 24 | device: torch.device = torch.device("cpu"), 25 | ) -> Float[Tensor, "3 height width"]: 26 | buffer = io.BytesIO() 27 | fig.savefig(buffer, format="raw", dpi=dpi) 28 | buffer.seek(0) 29 | data = np.frombuffer(buffer.getvalue(), dtype=np.uint8) 30 | h = int(fig.bbox.bounds[3]) 31 | w = int(fig.bbox.bounds[2]) 32 | data = rearrange(data, "(h w c) -> c h w", h=h, w=w, c=4) 33 | buffer.close() 34 | return (torch.tensor(data, device=device, dtype=torch.float32) / 255)[:3] 35 | 36 | 37 | def prep_image(image: FloatImage) -> UInt8[np.ndarray, "height width channel"]: 38 | # Handle batched images. 39 | if image.ndim == 4: 40 | image = rearrange(image, "b c h w -> c h (b w)") 41 | 42 | # Handle single-channel images. 
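    # A 2D (height, width) tensor first gains a channel axis; a true single-channel image
    # is then repeated to three identical channels so it can be saved as RGB.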
43 | if image.ndim == 2: 44 | image = rearrange(image, "h w -> () h w") 45 | 46 | # Ensure that there are 3 or 4 channels. 47 | channel, _, _ = image.shape 48 | if channel == 1: 49 | image = repeat(image, "() h w -> c h w", c=3) 50 | assert image.shape[0] in (3, 4) 51 | 52 | image = (image.detach().clip(min=0, max=1) * 255).type(torch.uint8) 53 | return rearrange(image, "c h w -> h w c").cpu().numpy() 54 | 55 | 56 | def save_image( 57 | image: FloatImage, 58 | path: Union[Path, str], 59 | ) -> None: 60 | """Save an image. Assumed to be in range 0-1.""" 61 | 62 | # Create the parent directory if it doesn't already exist. 63 | path = Path(path) 64 | path.parent.mkdir(exist_ok=True, parents=True) 65 | 66 | # Save the image. 67 | Image.fromarray(prep_image(image)).save(path) 68 | 69 | 70 | def load_image( 71 | path: Union[Path, str], 72 | ) -> Float[Tensor, "3 height width"]: 73 | return tf.ToTensor()(Image.open(path))[:3] 74 | -------------------------------------------------------------------------------- /egomono4d/misc/common_training_setup.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import hydra 4 | import torch 5 | import wandb 6 | import os 7 | from lightning.pytorch.callbacks import Callback, LearningRateMonitor, ModelCheckpoint 8 | from lightning.pytorch.loggers import Logger 9 | from lightning.pytorch.loggers.wandb import WandbLogger 10 | from omegaconf import DictConfig, OmegaConf 11 | 12 | from ..config.common import CommonCfg 13 | from .local_logger import LOG_PATH, LocalLogger 14 | from .wandb_tools import update_checkpoint_path 15 | 16 | 17 | def run_common_training_setup( 18 | cfg: CommonCfg, 19 | cfg_dict: DictConfig 20 | ) -> tuple[list[Callback], Logger, Path | None, Path]: 21 | torch.set_float32_matmul_precision("highest") 22 | 23 | # Set up callbacks. 24 | callbacks = [ 25 | LearningRateMonitor("step", True), 26 | ModelCheckpoint( 27 | monitor="val/loss/total_loss", 28 | mode="min", 29 | dirpath=(LOG_PATH / "checkpoints") if cfg.wandb.mode == "disabled" else None, 30 | save_top_k=1, 31 | save_last=True, 32 | filename="best-{epoch}-{step}", 33 | ) 34 | ] 35 | 36 | # Set up logging. 37 | if cfg.wandb.mode == "disabled": 38 | logger = LocalLogger() 39 | output_dir = LOG_PATH 40 | os.makedirs(output_dir, exist_ok=True) 41 | else: 42 | output_dir = Path( 43 | hydra.core.hydra_config.HydraConfig.get()["runtime"]["output_dir"] 44 | ) 45 | output_dir = output_dir / cfg.wandb.name 46 | os.makedirs(output_dir, exist_ok=True) 47 | logger = WandbLogger( 48 | project=cfg.wandb.project, 49 | name=cfg.wandb.name, 50 | mode=cfg.wandb.mode, 51 | group=cfg.wandb.group, 52 | tags=cfg.wandb.tags, 53 | config=OmegaConf.to_container(cfg_dict), 54 | log_model=False, # disabled artifact logging for storage saving 55 | save_dir=output_dir, 56 | ) 57 | 58 | # Log code to wandb if rank is 0. On rank != 0, wandb.run is None. 59 | if wandb.run is not None: 60 | wandb.run.log_code("egomono4d") 61 | 62 | # Prepare the checkpoint for loading. 
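    # update_checkpoint_path is assumed to resolve wandb:// references (see
    # CheckpointCfg.load in config/common.py) to a local checkpoint file and to pass
    # ordinary filesystem paths through unchanged.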
63 | checkpoint_path = update_checkpoint_path(cfg.checkpoint.load, cfg.wandb) 64 | 65 | return callbacks, logger, checkpoint_path, output_dir 66 | -------------------------------------------------------------------------------- /egomono4d/model/backbone/modules/transformer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from einops import rearrange, repeat 5 | from einops.layers.torch import Rearrange 6 | 7 | 8 | class FeedForward(nn.Module): 9 | def __init__(self, dim, hidden_dim, dropout = 0.): 10 | super().__init__() 11 | self.net = nn.Sequential( 12 | nn.LayerNorm(dim), 13 | nn.Linear(dim, hidden_dim), 14 | nn.GELU(), 15 | nn.Dropout(dropout), 16 | nn.Linear(hidden_dim, dim), 17 | nn.Dropout(dropout) 18 | ) 19 | def forward(self, x): 20 | return self.net(x) 21 | 22 | 23 | class Attention(nn.Module): 24 | def __init__(self, dim, heads = 4, dim_head = 64, dropout = 0.): 25 | super().__init__() 26 | inner_dim = dim_head * heads 27 | project_out = not (heads == 1 and dim_head == dim) 28 | 29 | self.heads = heads 30 | self.scale = dim_head ** -0.5 31 | 32 | self.norm = nn.LayerNorm(dim) 33 | self.attend = nn.Softmax(dim = -1) 34 | self.dropout = nn.Dropout(dropout) 35 | 36 | self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False) 37 | 38 | self.to_out = nn.Sequential( 39 | nn.Linear(inner_dim, dim), 40 | nn.Dropout(dropout) 41 | ) if project_out else nn.Identity() 42 | 43 | def forward(self, x): 44 | x = self.norm(x) 45 | qkv = self.to_qkv(x).chunk(3, dim = -1) 46 | q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.heads), qkv) 47 | 48 | dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale 49 | 50 | attn = self.attend(dots) 51 | attn = self.dropout(attn) 52 | 53 | out = torch.matmul(attn, v) 54 | out = rearrange(out, 'b h n d -> b n (h d)') 55 | return self.to_out(out) 56 | 57 | 58 | class Transformer(nn.Module): 59 | def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0.): 60 | super().__init__() 61 | self.layers = nn.ModuleList([]) 62 | for _ in range(depth): 63 | self.layers.append(nn.ModuleList([ 64 | Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout), 65 | FeedForward(dim, mlp_dim, dropout = dropout) 66 | ])) 67 | def forward(self, x): 68 | for attn, ff in self.layers: 69 | x = attn(x) + x 70 | x = ff(x) + x 71 | return x 72 | -------------------------------------------------------------------------------- /egomono4d/frame_sampler/frame_sampler_pretrain.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | # from typing import Literal 3 | from typing_extensions import Literal 4 | 5 | import torch 6 | from jaxtyping import Int64 7 | from torch import Tensor 8 | import random 9 | 10 | from .frame_sampler import FrameSampler 11 | 12 | 13 | class FrameSamplerPretrainNeighbor(FrameSampler): 14 | def sample( 15 | self, 16 | num_frames_in_video: int, 17 | device: torch.device, 18 | ) -> Int64[Tensor, " frame"]: 19 | # If the video doesn't have enough frames, just repeat the last frame. 20 | if num_frames_in_video < self.num_frames: 21 | indices = torch.arange(self.num_frames, device=device) 22 | indices[indices >= num_frames_in_video] = num_frames_in_video - 1 23 | return indices 24 | 25 | # If the video has enough frames, pick a random starting point. 
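        # Random window starts are used only for training (as data augmentation);
        # validation and test always start at frame 0 so sampled clips are deterministic.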
26 | if self.stage == 'train': 27 | start = torch.randint(0, num_frames_in_video - self.num_frames + 1, tuple()) 28 | else: 29 | start = 0 30 | return torch.arange(start, start + self.num_frames, device=device) 31 | 32 | 33 | class FrameSamplerPretrainInterval(FrameSampler): 34 | def sample( 35 | self, 36 | num_frames_in_video: int, 37 | device: torch.device, 38 | max_interval: int=1 39 | ) -> Int64[Tensor, " frame"]: 40 | # If the video doesn't have enough frames, just repeat the last frame. 41 | if num_frames_in_video < self.num_frames: 42 | indices = torch.arange(self.num_frames, device=device) 43 | indices[indices >= num_frames_in_video] = num_frames_in_video - 1 44 | return indices 45 | 46 | if num_frames_in_video - 1 < max_interval * (self.num_frames-1): 47 | max_interval = (num_frames_in_video - 1) // (self.num_frames-1) 48 | 49 | if self.stage == 'train': 50 | interval = random.randint(1, max_interval) 51 | start = torch.randint(0, (num_frames_in_video-1)-interval*(self.num_frames-1), tuple()) 52 | else: 53 | interval = (max_interval + 1) // 2 # we test the middle state as representative performance (between easiest and hardest). 54 | start = ((num_frames_in_video-1)-interval*(self.num_frames-1)) // 2 # fixed it to eliminate uncertainty. 55 | # print(f"interval: {interval}") 56 | res_idx = torch.tensor([start+i*interval for i in range(self.num_frames)], device=device) 57 | # print(f"max_interval={max_interval}, interval={interval}, res={res_idx}") 58 | return res_idx -------------------------------------------------------------------------------- /egomono4d/config/common.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | import os 3 | import pdb 4 | from typing import Type, TypeVar, Optional, List, Union # , Literal 5 | 6 | from omegaconf import DictConfig 7 | 8 | from ..dataset import DatasetCfg 9 | from ..misc.cropping import CroppingCfg 10 | from .tools import get_typed_config, separate_multiple_defaults 11 | 12 | 13 | try: 14 | EVAL = os.environ['EVAL_MODE'] 15 | except: 16 | EVAL = 'False' 17 | try: 18 | INFER = os.environ['INFER_MODE'] 19 | except: 20 | INFER = 'False' 21 | if (EVAL not in ['True']): 22 | from ..flow import FlowPredictorCfg 23 | from ..loss import LossCfg 24 | from ..model.model import ModelCfg 25 | from ..visualization import VisualizerCfg 26 | from ..tracking import TrackPredictorCfg 27 | print("Install Training Cfg.") 28 | else: 29 | FlowPredictorCfg, TrackPredictorCfg = None, None 30 | LossCfg, ModelCfg, VisualizerCfg = None, None, None 31 | 32 | @dataclass 33 | class WandbCfg: 34 | project: str = "egomono4d" 35 | mode: str = "disabled" 36 | name: Optional[str] = None 37 | group: Optional[str] = None 38 | tags: Optional[List[str]] = None 39 | 40 | 41 | @dataclass 42 | class CheckpointCfg: 43 | load: Optional[str] = None # str instead of Path, since it could be wandb://... 
44 | 45 | 46 | @dataclass 47 | class TrainerCfg: 48 | val_check_interval: Union[int, float] = 1.0 49 | # check_val_every_n_epoch: int 50 | gradient_clip_val: float = 10.0 51 | max_steps: Optional[int] = None 52 | max_epochs: Optional[int] = None 53 | accumulate_grad_batches: Optional[int] = None 54 | num_nodes: int = 1 55 | gpus: int = 1 56 | 57 | 58 | @dataclass 59 | class CommonCfg: 60 | base_cache_dir: str = None 61 | save_dir: str = None 62 | data_ratio: float = None 63 | use_gt_depth: bool = False 64 | wandb: WandbCfg = None 65 | checkpoint: CheckpointCfg = None 66 | trainer: TrainerCfg = None 67 | flow: Optional[FlowPredictorCfg] = None 68 | tracking: Optional[TrackPredictorCfg] = None 69 | dataset: List[DatasetCfg] = None 70 | model: ModelCfg = None 71 | loss: List[LossCfg] = None 72 | visualizer: List[VisualizerCfg] = None 73 | cropping: Optional[CroppingCfg] = None 74 | 75 | 76 | T = TypeVar("T") 77 | 78 | 79 | def get_typed_root_config(cfg_dict: DictConfig, cfg_type: Type[T]) -> T: 80 | return get_typed_config( 81 | cfg_type, 82 | cfg_dict, 83 | { 84 | List[DatasetCfg]: separate_multiple_defaults(DatasetCfg), 85 | List[LossCfg]: separate_multiple_defaults(LossCfg), 86 | List[VisualizerCfg]: separate_multiple_defaults(VisualizerCfg), 87 | }, 88 | ) 89 | -------------------------------------------------------------------------------- /egomono4d/repo/gmflow/utils/logger.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from utils.flow_viz import flow_tensor_to_image 4 | 5 | 6 | class Logger: 7 | def __init__(self, lr_scheduler, 8 | summary_writer, 9 | summary_freq=100, 10 | start_step=0, 11 | ): 12 | self.lr_scheduler = lr_scheduler 13 | self.total_steps = start_step 14 | self.running_loss = {} 15 | self.summary_writer = summary_writer 16 | self.summary_freq = summary_freq 17 | 18 | def print_training_status(self, mode='train'): 19 | 20 | print('step: %06d \t epe: %.3f' % (self.total_steps, self.running_loss['epe'] / self.summary_freq)) 21 | 22 | for k in self.running_loss: 23 | self.summary_writer.add_scalar(mode + '/' + k, 24 | self.running_loss[k] / self.summary_freq, self.total_steps) 25 | self.running_loss[k] = 0.0 26 | 27 | def lr_summary(self): 28 | lr = self.lr_scheduler.get_last_lr()[0] 29 | self.summary_writer.add_scalar('lr', lr, self.total_steps) 30 | 31 | def add_image_summary(self, img1, img2, flow_preds, flow_gt, mode='train', 32 | ): 33 | if self.total_steps % self.summary_freq == 0: 34 | img_concat = torch.cat((img1[0].detach().cpu(), img2[0].detach().cpu()), dim=-1) 35 | img_concat = img_concat.type(torch.uint8) # convert to uint8 to visualize in tensorboard 36 | 37 | flow_pred = flow_tensor_to_image(flow_preds[-1][0]) 38 | forward_flow_gt = flow_tensor_to_image(flow_gt[0]) 39 | flow_concat = torch.cat((torch.from_numpy(flow_pred), 40 | torch.from_numpy(forward_flow_gt)), dim=-1) 41 | 42 | concat = torch.cat((img_concat, flow_concat), dim=-2) 43 | 44 | self.summary_writer.add_image(mode + '/img_pred_gt', concat, self.total_steps) 45 | 46 | def push(self, metrics, mode='train'): 47 | self.total_steps += 1 48 | 49 | self.lr_summary() 50 | 51 | for key in metrics: 52 | if key not in self.running_loss: 53 | self.running_loss[key] = 0.0 54 | 55 | self.running_loss[key] += metrics[key] 56 | 57 | if self.total_steps % self.summary_freq == 0: 58 | self.print_training_status(mode) 59 | self.running_loss = {} 60 | 61 | def write_dict(self, results): 62 | for key in results: 63 | tag = key.split('_')[0] 64 | tag = 
tag + '/' + key 65 | self.summary_writer.add_scalar(tag, results[key], self.total_steps) 66 | 67 | def close(self): 68 | self.summary_writer.close() 69 | -------------------------------------------------------------------------------- /egomono4d/repo/gmflow/scripts/evaluate.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # evaluate GMFlow without refinement 4 | 5 | # evaluate chairs & things trained model on things and sintel (Table 3 of GMFlow paper) 6 | # the output should be: 7 | # Number of validation image pairs: 1024 8 | # Validation Things test set (things_clean) EPE: 3.475 9 | # Validation Things test (things_clean) s0_10: 0.666, s10_40: 1.310, s40+: 8.968 10 | # Number of validation image pairs: 1041 11 | # Validation Sintel (clean) EPE: 1.495, 1px: 0.161, 3px: 0.059, 5px: 0.040 12 | # Validation Sintel (clean) s0_10: 0.457, s10_40: 1.770, s40+: 8.257 13 | # Number of validation image pairs: 1041 14 | # Validation Sintel (final) EPE: 2.955, 1px: 0.209, 3px: 0.098, 5px: 0.071 15 | # Validation Sintel (final) s0_10: 0.725, s10_40: 3.446, s40+: 17.701 16 | 17 | CUDA_VISIBLE_DEVICES=0 python main.py \ 18 | --eval \ 19 | --resume pretrained/gmflow_things-e9887eda.pth \ 20 | --val_dataset things sintel \ 21 | --with_speed_metric 22 | 23 | 24 | 25 | # evaluate GMFlow with refinement 26 | 27 | # evaluate chairs & things trained model on things and sintel (Table 3 of GMFlow paper) 28 | # the output should be: 29 | # Validation Things test set (things_clean) EPE: 2.804 30 | # Validation Things test (things_clean) s0_10: 0.527, s10_40: 1.009, s40+: 7.314 31 | # Number of validation image pairs: 1041 32 | # Validation Sintel (clean) EPE: 1.084, 1px: 0.092, 3px: 0.040, 5px: 0.028 33 | # Validation Sintel (clean) s0_10: 0.303, s10_40: 1.252, s40+: 6.261 34 | # Number of validation image pairs: 1041 35 | # Validation Sintel (final) EPE: 2.475, 1px: 0.147, 3px: 0.077, 5px: 0.058 36 | # Validation Sintel (final) s0_10: 0.511, s10_40: 2.810, s40+: 15.669 37 | 38 | CUDA_VISIBLE_DEVICES=0 python main.py \ 39 | --eval \ 40 | --resume pretrained/gmflow_with_refine_things-36579974.pth \ 41 | --val_dataset things sintel \ 42 | --with_speed_metric \ 43 | --padding_factor 32 \ 44 | --upsample_factor 4 \ 45 | --num_scales 2 \ 46 | --attn_splits_list 2 8 \ 47 | --corr_radius_list -1 4 \ 48 | --prop_radius_list -1 1 49 | 50 | 51 | 52 | # evaluate matched & matched on sintel 53 | 54 | # evaluate GMFlow without refinement 55 | 56 | CUDA_VISIBLE_DEVICES=0 python main.py \ 57 | --eval \ 58 | --evaluate_matched_unmatched \ 59 | --resume pretrained/gmflow_things-e9887eda.pth \ 60 | --val_dataset sintel 61 | 62 | # evaluate GMFlow with refinement 63 | 64 | CUDA_VISIBLE_DEVICES=0 python main.py \ 65 | --eval \ 66 | --evaluate_matched_unmatched \ 67 | --resume pretrained/gmflow_with_refine_things-36579974.pth \ 68 | --val_dataset sintel \ 69 | --with_speed_metric \ 70 | --padding_factor 32 \ 71 | --upsample_factor 4 \ 72 | --num_scales 2 \ 73 | --attn_splits_list 2 8 \ 74 | --corr_radius_list -1 4 \ 75 | --prop_radius_list -1 1 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /egomono4d/misc/depth.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import cv2 3 | import pdb 4 | import os 5 | import json 6 | import numpy as np 7 | from PIL import Image 8 | from unidepth.models import UniDepthV2 9 | 10 | def 
get_depth_estimator(estimator_name="unidepth_v2_large", cache_dir='.cache', device='cuda'): 11 | # pdb.set_trace() 12 | if estimator_name in ["unidepth_v2_large", "unidepth_v2_small"]: 13 | version = 'v2' 14 | if estimator_name.endswith('large'): 15 | backbone = 'vitl' 16 | elif estimator_name.endswith('small'): 17 | backbone = 'vits' 18 | with open(os.path.join(cache_dir, "unidepth_v2_checkpoints", f"unidepth-{version}-{backbone}14.json")) as f: 19 | config = json.load(f) 20 | model = UniDepthV2(config) 21 | model_dir = os.path.join(cache_dir, "unidepth_v2_checkpoints", f"unidepth-{version}-{backbone}14.bin") 22 | model.load_state_dict(torch.load(model_dir, map_location='cpu')) 23 | model = model.to(device).eval() 24 | return model 25 | else: 26 | raise ValueError(f"Unsupport Depth Estimator: {estimator_name}. Supportion: [depth_anything_v2_large].") 27 | 28 | 29 | 30 | def estimate_relative_depth(pil_image: Image.Image, 31 | model, 32 | estimator_name="unidepth_v2_large"): 33 | if estimator_name in ['unidepth_v2_large', 'unidepth_v2_small']: 34 | rgb = torch.from_numpy(np.array(pil_image)).permute(2, 0, 1) # C, H, W 35 | predictions = model.infer(rgb) 36 | predictions['depth'] = predictions['depth'].cpu().detach().numpy()[0,0] 37 | predictions['intrinsics'] = predictions['intrinsics'].cpu().detach().numpy()[0] 38 | predictions['points'] = predictions['points'].cpu().detach().numpy()[0].transpose(1,2,0) 39 | return predictions 40 | else: 41 | raise ValueError("Unsupport Disparity-Depth Estimator: {estimator_name}. Supportion: [depth_anything_v2_large, depth_anything_v2_large_indoor].") 42 | 43 | 44 | def save_estimate_disparity_png(e_dep, e_dep_fp_img): 45 | # black: 0 <----> white: 1 46 | # we follow that more closer to camera, more closer to white color. 47 | e_dep = (255 * (e_dep - e_dep.min()) / (e_dep.max() - e_dep.min())).astype(np.uint8) 48 | e_dep_img = Image.fromarray(e_dep, mode='L') 49 | e_dep_img.save(e_dep_fp_img) 50 | 51 | 52 | def save_estimate_depth_png(e_dep, e_dep_fp_img): 53 | # black: 1 (deeper) <----> white: 0 (closer) 54 | e_dep = np.log(e_dep) # for depth we first conduct log for it. 55 | dp_norm = (e_dep - e_dep.min()) / (e_dep.max() - e_dep.min()) 56 | e_dep = 255 * (1.0 - dp_norm) 57 | if e_dep.dtype != np.uint8: 58 | e_dep = e_dep.astype(np.uint8) 59 | e_dep_img = Image.fromarray(e_dep, mode='L') 60 | e_dep_img.save(e_dep_fp_img) 61 | -------------------------------------------------------------------------------- /egomono4d/model/extrinsics/extrinsics_regressed.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Literal 3 | 4 | import torch 5 | from einops import rearrange 6 | from jaxtyping import Float 7 | from torch import Tensor, nn 8 | 9 | from ...dataset.types import Batch 10 | from ...flow.flow_predictor import Flows 11 | from ..backbone.backbone import BackboneOutput 12 | from ..projection import get_extrinsics 13 | from .extrinsics import Extrinsics 14 | 15 | 16 | # https://github.com/facebookresearch/pytorch3d/blob/main/pytorch3d/transforms/rotation_conversions.py 17 | def quaternion_to_matrix( 18 | quaternions: Float[Tensor, "*batch 4"], 19 | eps: float = 1e-8, 20 | ) -> Float[Tensor, "*batch 3 3"]: 21 | # Order changed to match scipy format! 
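    # i.e. quaternions are expected in scalar-last (x, y, z, w) order, matching
    # scipy.spatial.transform.Rotation, whereas the original PyTorch3D helper takes
    # scalar-first (w, x, y, z); hence the (i, j, k, r) unbind order below.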
22 | i, j, k, r = torch.unbind(quaternions, dim=-1) 23 | two_s = 2 / ((quaternions * quaternions).sum(dim=-1) + eps) 24 | 25 | o = torch.stack( 26 | ( 27 | 1 - two_s * (j * j + k * k), 28 | two_s * (i * j - k * r), 29 | two_s * (i * k + j * r), 30 | two_s * (i * j + k * r), 31 | 1 - two_s * (i * i + k * k), 32 | two_s * (j * k - i * r), 33 | two_s * (i * k - j * r), 34 | two_s * (j * k + i * r), 35 | 1 - two_s * (i * i + j * j), 36 | ), 37 | -1, 38 | ) 39 | return rearrange(o, "... (i j) -> ... i j", i=3, j=3) 40 | 41 | 42 | @dataclass 43 | class ExtrinsicsRegressedCfg: 44 | name: Literal["regressed"] 45 | 46 | 47 | class ExtrinsicsRegressed(Extrinsics[ExtrinsicsRegressedCfg]): 48 | def __init__( 49 | self, 50 | cfg: ExtrinsicsRegressedCfg, 51 | num_frames: int, 52 | ) -> None: 53 | super().__init__(cfg, num_frames) 54 | 55 | assert num_frames >= 2 56 | 57 | # Initialize identity translations and rotations. 58 | self.translations = nn.Parameter( 59 | torch.zeros((num_frames - 1, 3), dtype=torch.float32) 60 | ) 61 | rotations = torch.zeros((num_frames - 1, 4), dtype=torch.float32) 62 | rotations[:, -1] = 1 63 | self.rotations = nn.Parameter(rotations) 64 | 65 | def forward( 66 | self, 67 | batch: Batch, 68 | flows: Flows, 69 | backbone_output: BackboneOutput, 70 | surfaces: Float[Tensor, "batch frame height width 3"], 71 | ) -> Float[Tensor, "batch frame 4 4"]: 72 | device = surfaces.device 73 | b, f, _, _, _ = surfaces.shape 74 | 75 | # Regressing the extrinsics only makes sense during overfitting. 76 | assert b == 1 77 | 78 | tf = torch.eye(4, dtype=torch.float32, device=device) 79 | tf = tf.broadcast_to((f - 1, 4, 4)).contiguous() 80 | tf[:, :3, :3] = quaternion_to_matrix(self.rotations) 81 | tf[:, :3, 3] = self.translations 82 | 83 | return get_extrinsics(tf)[None] 84 | -------------------------------------------------------------------------------- /egomono4d/flow/flow_predictor_gmflow.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import urllib.request 4 | from dataclasses import dataclass 5 | from pathlib import Path 6 | from typing import Literal 7 | 8 | import torch 9 | from einops import rearrange 10 | from jaxtyping import Float 11 | from torch import Tensor 12 | 13 | try: 14 | from ..repo.gmflow.gmflow.gmflow import GMFlow 15 | except ImportError: 16 | GMFlow = None 17 | 18 | from .common import split_videos 19 | from .flow_predictor import FlowPredictor 20 | 21 | 22 | @dataclass 23 | class FlowPredictorGMFlowCfg: 24 | name: Literal["gmflow"] 25 | cache_dir: str | None 26 | 27 | 28 | class FlowPredictorGMFlow(FlowPredictor[FlowPredictorGMFlowCfg]): 29 | def __init__(self, cfg: FlowPredictorGMFlowCfg) -> None: 30 | super().__init__(cfg) 31 | 32 | # Warn that GMFlow isn't installed. 33 | if GMFlow is None: 34 | print( 35 | "Warning: GMFlow could not be imported. Did you forget to initialize " 36 | "the git submodules?" 37 | ) 38 | sys.exit(1) 39 | 40 | # Ensure that the checkpoint exists. 41 | checkpoint = "gmflow-scale1-mixdata-train320x576-4c3a6e9a.pth" 42 | checkpoint_path = cfg.cache_dir + "/gmflow_checkpoints/" + checkpoint 43 | if not os.path.exists(checkpoint_path): 44 | os.makedirs(checkpoint_path, exist_ok=True) 45 | print("Downloading GMFlow checkpoint.") 46 | urllib.request.urlretrieve( 47 | f"https://s3.eu-central-1.amazonaws.com/avg-projects/unimatch/pretrained/{checkpoint}", 48 | checkpoint_path, 49 | ) 50 | 51 | # Set up the model. 
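        # These hyperparameters match the single-scale gmflow-scale1 checkpoint downloaded
        # above (no refinement); the refinement variant used in the repo's demo/eval scripts
        # (num_scales=2, attn_splits_list 2 8, padding_factor 32) is not used here.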
52 | self.model = GMFlow( 53 | feature_channels=128, 54 | num_scales=1, 55 | upsample_factor=8, 56 | num_head=1, 57 | attention_type="swin", 58 | ffn_dim_expansion=4, 59 | num_transformer_layers=6, 60 | ) 61 | 62 | # Load the pre-trained checkpoint. 63 | checkpoint = torch.load(checkpoint_path) 64 | weights = checkpoint["model"] if "model" in checkpoint else checkpoint 65 | self.model.load_state_dict(weights, strict=False) 66 | 67 | def forward( 68 | self, 69 | videos: Float[Tensor, "batch frame 3 height width"], 70 | ) -> Float[Tensor, "batch frame-1 height width 2"]: 71 | source, target, b, f = split_videos(videos) 72 | 73 | result = self.model( 74 | source * 255, 75 | target * 255, 76 | attn_splits_list=[2], 77 | corr_radius_list=[-1], 78 | prop_radius_list=[-1], 79 | pred_bidir_flow=False, 80 | ) 81 | flow = result["flow_preds"][-1] 82 | 83 | # Normalize the optical flow. 84 | _, _, h, w = source.shape 85 | wh = torch.tensor((w-1, h-1), dtype=torch.float32, device=flow.device) 86 | return rearrange(flow, "(b f) xy h w -> b f h w xy", b=b, f=f - 1) / wh 87 | -------------------------------------------------------------------------------- /egomono4d/dataset/data_module_pretrain.py: -------------------------------------------------------------------------------- 1 | import random 2 | import os 3 | from typing import Callable, Optional, List 4 | 5 | import numpy as np 6 | import torch 7 | from torch import Generator 8 | from torch.utils.data import DataLoader, Dataset, IterableDataset, DistributedSampler 9 | 10 | from . import DatasetCfg, get_dataset 11 | from .types import Stage 12 | from lightning.pytorch import LightningDataModule as LightningDataModule 13 | from .data_module_pretrain_cfg import DataLoaderStageCfg, DataModulePretrainCfg 14 | 15 | DatasetShim = Callable[[Dataset, Stage], Dataset] 16 | 17 | 18 | def worker_init_fn(worker_id: int) -> None: 19 | random.seed(int(torch.utils.data.get_worker_info().seed) % (2**32 - 1)) 20 | np.random.seed(int(torch.utils.data.get_worker_info().seed) % (2**32 - 1)) 21 | 22 | 23 | class DataModulePretrain(LightningDataModule): 24 | def __init__( 25 | self, 26 | dataset_cfgs: List[DatasetCfg], 27 | data_module_cfg: DataModulePretrainCfg, 28 | global_rank: int, 29 | world_size: int, 30 | data_ratio: Optional[float]=1.0 31 | ) -> None: 32 | super().__init__() 33 | self.dataset_cfgs = dataset_cfgs 34 | self.data_module_cfg = data_module_cfg 35 | self.global_rank = global_rank 36 | self.world_size = world_size 37 | self.data_ratio = data_ratio 38 | 39 | def get_persistent(self, loader_cfg: DataLoaderStageCfg): 40 | return None if loader_cfg.num_workers == 0 else loader_cfg.persistent_workers 41 | 42 | def get_generator(self, loader_cfg: DataLoaderStageCfg): 43 | if loader_cfg.seed is None: 44 | return None 45 | generator = Generator() 46 | generator.manual_seed(loader_cfg.seed + self.global_rank) 47 | return generator 48 | 49 | def train_dataloader(self): 50 | dataset = get_dataset(self.dataset_cfgs, "train", global_rank=self.global_rank, world_size=self.world_size, data_ratio=self.data_ratio) 51 | print(f"train_batch_size = {self.data_module_cfg.train.batch_size}") 52 | return DataLoader( 53 | dataset, 54 | self.data_module_cfg.train.batch_size, 55 | shuffle=not isinstance(dataset, IterableDataset), 56 | num_workers=self.data_module_cfg.train.num_workers, 57 | generator=self.get_generator(self.data_module_cfg.train), 58 | worker_init_fn=worker_init_fn, 59 | persistent_workers=self.get_persistent(self.data_module_cfg.train), 60 | ) 61 | 62 | def 
val_dataloader(self): 63 | dataset = get_dataset(self.dataset_cfgs, "val", global_rank=self.global_rank, world_size=self.world_size, data_ratio=self.data_ratio) 64 | print(f"validation_batch_size = {self.data_module_cfg.val.batch_size}") 65 | return DataLoader( 66 | dataset, 67 | self.data_module_cfg.val.batch_size, 68 | num_workers=self.data_module_cfg.val.num_workers, 69 | generator=self.get_generator(self.data_module_cfg.val), 70 | worker_init_fn=worker_init_fn, 71 | persistent_workers=self.get_persistent(self.data_module_cfg.val), 72 | ) 73 | -------------------------------------------------------------------------------- /egomono4d/repo/gmflow/gmflow/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .position import PositionEmbeddingSine 3 | 4 | 5 | def split_feature(feature, 6 | num_splits=2, 7 | channel_last=False, 8 | ): 9 | if channel_last: # [B, H, W, C] 10 | b, h, w, c = feature.size() 11 | assert h % num_splits == 0 and w % num_splits == 0 12 | 13 | b_new = b * num_splits * num_splits 14 | h_new = h // num_splits 15 | w_new = w // num_splits 16 | 17 | feature = feature.view(b, num_splits, h // num_splits, num_splits, w // num_splits, c 18 | ).permute(0, 1, 3, 2, 4, 5).reshape(b_new, h_new, w_new, c) # [B*K*K, H/K, W/K, C] 19 | else: # [B, C, H, W] 20 | b, c, h, w = feature.size() 21 | assert h % num_splits == 0 and w % num_splits == 0 22 | 23 | b_new = b * num_splits * num_splits 24 | h_new = h // num_splits 25 | w_new = w // num_splits 26 | 27 | feature = feature.view(b, c, num_splits, h // num_splits, num_splits, w // num_splits 28 | ).permute(0, 2, 4, 1, 3, 5).reshape(b_new, c, h_new, w_new) # [B*K*K, C, H/K, W/K] 29 | 30 | return feature 31 | 32 | 33 | def merge_splits(splits, 34 | num_splits=2, 35 | channel_last=False, 36 | ): 37 | if channel_last: # [B*K*K, H/K, W/K, C] 38 | b, h, w, c = splits.size() 39 | new_b = b // num_splits // num_splits 40 | 41 | splits = splits.view(new_b, num_splits, num_splits, h, w, c) 42 | merge = splits.permute(0, 1, 3, 2, 4, 5).contiguous().view( 43 | new_b, num_splits * h, num_splits * w, c) # [B, H, W, C] 44 | else: # [B*K*K, C, H/K, W/K] 45 | b, c, h, w = splits.size() 46 | new_b = b // num_splits // num_splits 47 | 48 | splits = splits.view(new_b, num_splits, num_splits, c, h, w) 49 | merge = splits.permute(0, 3, 1, 4, 2, 5).contiguous().view( 50 | new_b, c, num_splits * h, num_splits * w) # [B, C, H, W] 51 | 52 | return merge 53 | 54 | 55 | def normalize_img(img0, img1): 56 | # loaded images are in [0, 255] 57 | # normalize by ImageNet mean and std 58 | mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1).to(img1.device) 59 | std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1).to(img1.device) 60 | img0 = (img0 / 255. - mean) / std 61 | img1 = (img1 / 255. 
- mean) / std 62 | 63 | return img0, img1 64 | 65 | 66 | def feature_add_position(feature0, feature1, attn_splits, feature_channels): 67 | pos_enc = PositionEmbeddingSine(num_pos_feats=feature_channels // 2) 68 | 69 | if attn_splits > 1: # add position in splited window 70 | feature0_splits = split_feature(feature0, num_splits=attn_splits) 71 | feature1_splits = split_feature(feature1, num_splits=attn_splits) 72 | 73 | position = pos_enc(feature0_splits) 74 | 75 | feature0_splits = feature0_splits + position 76 | feature1_splits = feature1_splits + position 77 | 78 | feature0 = merge_splits(feature0_splits, num_splits=attn_splits) 79 | feature1 = merge_splits(feature1_splits, num_splits=attn_splits) 80 | else: 81 | position = pos_enc(feature0) 82 | 83 | feature0 = feature0 + position 84 | feature1 = feature1 + position 85 | 86 | return feature0, feature1 87 | -------------------------------------------------------------------------------- /egomono4d/visualization/drawing/lines.py: -------------------------------------------------------------------------------- 1 | from typing import Literal, Optional 2 | 3 | import torch 4 | from einops import einsum, repeat 5 | from jaxtyping import Float 6 | from torch import Tensor 7 | 8 | from .coordinate_conversion import generate_conversions 9 | from .rendering import render_over_image 10 | from .types import Pair, Scalar, Vector, sanitize_scalar, sanitize_vector 11 | 12 | 13 | def draw_lines( 14 | image: Float[Tensor, "3 height width"] | Float[Tensor, "4 height width"], 15 | start: Vector, 16 | end: Vector, 17 | color: Vector, 18 | width: Scalar, 19 | cap: Literal["butt", "round", "square"] = "round", 20 | num_msaa_passes: int = 1, 21 | x_range: Optional[Pair] = None, 22 | y_range: Optional[Pair] = None, 23 | ) -> Float[Tensor, "3 height width"] | Float[Tensor, "4 height width"]: 24 | device = image.device 25 | start = sanitize_vector(start, 2, device) 26 | end = sanitize_vector(end, 2, device) 27 | color = sanitize_vector(color, 3, device) 28 | width = sanitize_scalar(width, device) 29 | (num_lines,) = torch.broadcast_shapes( 30 | start.shape[0], 31 | end.shape[0], 32 | color.shape[0], 33 | width.shape, 34 | ) 35 | 36 | # Convert world-space points to pixel space. 37 | _, h, w = image.shape 38 | world_to_pixel, _ = generate_conversions((h, w), device, x_range, y_range) 39 | start = world_to_pixel(start) 40 | end = world_to_pixel(end) 41 | 42 | def color_function( 43 | xy: Float[Tensor, "point 2"], 44 | ) -> Float[Tensor, "point 4"]: 45 | # Define a vector between the start and end points. 46 | delta = end - start 47 | delta_norm = delta.norm(dim=-1, keepdim=True) 48 | u_delta = delta / delta_norm 49 | 50 | # Define a vector between each sample and the start point. 51 | indicator = xy - start[:, None] 52 | 53 | # Determine whether each sample is inside the line in the parallel direction. 54 | extra = 0.5 * width[:, None] if cap == "square" else 0 55 | parallel = einsum(u_delta, indicator, "l xy, l s xy -> l s") 56 | parallel_inside_line = (parallel <= delta_norm + extra) & (parallel > -extra) 57 | 58 | # Determine whether each sample is inside the line perpendicularly. 59 | perpendicular = indicator - parallel[..., None] * u_delta[:, None] 60 | perpendicular_inside_line = perpendicular.norm(dim=-1) < 0.5 * width[:, None] 61 | 62 | inside_line = parallel_inside_line & perpendicular_inside_line 63 | 64 | # Compute round caps. 
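# For "round" caps, a sample also counts as inside if it lies within half the line
# width of either endpoint (a half-disc glued onto each end); "butt" caps add nothing,
# and "square" caps were already handled through `extra` above.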
65 | if cap == "round": 66 | near_start = indicator.norm(dim=-1) < 0.5 * width[:, None] 67 | inside_line |= near_start 68 | end_indicator = indicator = xy - end[:, None] 69 | near_end = end_indicator.norm(dim=-1) < 0.5 * width[:, None] 70 | inside_line |= near_end 71 | 72 | # Determine the sample's color. 73 | selectable_color = color.broadcast_to((num_lines, 3)) 74 | arrangement = inside_line * torch.arange(num_lines, device=device)[:, None] 75 | top_color = selectable_color.gather( 76 | dim=0, 77 | index=repeat(arrangement.argmax(dim=0), "s -> s c", c=3), 78 | ) 79 | rgba = torch.cat((top_color, inside_line.any(dim=0).float()[:, None]), dim=-1) 80 | 81 | return rgba 82 | 83 | return render_over_image(image, color_function, device, num_passes=num_msaa_passes) 84 | -------------------------------------------------------------------------------- /egomono4d/eval/eval_pointcloud.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import open3d as o3d 4 | import pdb 5 | import copy 6 | import torch.nn.functional as F 7 | from einops import einsum, rearrange 8 | from ..model.procrustes import align_scaled_rigid 9 | import kaolin as kal 10 | 11 | FLY_THRESHOLD = 0.05 12 | 13 | def eval_pointcloud_conductor(pred_pcd, gt_pcd, gt_flys, rgbs, commit=""): # (b, f, h, w, 3) * 2, (b, f, h, w) 14 | """ An implementation based on Kaolin library. """ 15 | 16 | b, f, h, w, _ = pred_pcd.shape 17 | 18 | pred_pcd_align = pred_pcd.reshape(b, f*h*w, 3) 19 | gt_pcd_align = gt_pcd.reshape(b, f*h*w, 3) 20 | gt_flys_align = gt_flys.reshape(b, f*h*w) 21 | 22 | delta_ext_scale, scale = align_scaled_rigid(pred_pcd_align, gt_pcd_align, gt_flys_align) 23 | pred_pcd_align = torch.matmul(delta_ext_scale[:, :3,:3], pred_pcd_align.permute(0,2,1)).permute(0,2,1) + delta_ext_scale[:, :3, -1][:, None] 24 | 25 | pred_pcd_align = pred_pcd_align.reshape(b, f, h*w, 3) 26 | gt_pcd_align = gt_pcd_align.reshape(b, f, h*w, 3) 27 | gt_flys_align = gt_flys_align.reshape(b, f, h*w) 28 | 29 | ######################################## VIS ############################################### 30 | # vis_flys = gt_flys_align.reshape(b, -1) 31 | # rgbss = rgbs.permute(0,1,3,4,2) 32 | # rgbss = rgbss.reshape(b, -1, 3)[vis_flys == 1] 33 | # ppa = pred_pcd_align.reshape(b, -1, 3)[vis_flys == 1].reshape(-1, 3) 34 | # pcd = o3d.geometry.PointCloud() 35 | # pcd.points = o3d.utility.Vector3dVector(np.array(ppa.cpu().detach().reshape(-1, 3))) 36 | # pcd.colors = o3d.utility.Vector3dVector(np.array(rgbss.cpu().detach().reshape(-1, 3))) 37 | # o3d.io.write_point_cloud(f"pcd_pred_{commit}"+".ply", pcd) 38 | 39 | # pcd = o3d.geometry.PointCloud() 40 | # gpa = gt_pcd_align.reshape(b, -1, 3)[vis_flys == 1].reshape(-1, 3) 41 | # pcd.points = o3d.utility.Vector3dVector(np.array(gpa.cpu().detach().reshape(-1, 3))) 42 | # pcd.colors = o3d.utility.Vector3dVector(np.array(rgbss.cpu().detach().reshape(-1, 3))) 43 | # o3d.io.write_point_cloud(f"pcd_gt_{commit}"+".ply", pcd) 44 | ######################################## VIS ############################################### 45 | 46 | cds, f001, f0025, f005, f01 = [], [], [], [], [] 47 | n_f = pred_pcd_align.shape[0] 48 | for i in range(n_f): 49 | gt_fly_f = gt_flys_align[:, i] 50 | pred_pcd_f = pred_pcd_align[:, i][gt_fly_f == 1][None] 51 | gt_pcd_f = gt_pcd_align[:, i][gt_fly_f == 1][None] 52 | cd = kal.metrics.pointcloud.chamfer_distance(pred_pcd_f, gt_pcd_f) 53 | cds.append(cd) 54 | f001.append(kal.metrics.pointcloud.f_score(pred_pcd_f, gt_pcd_f, 
radius=0.01)) 55 | f0025.append(kal.metrics.pointcloud.f_score(pred_pcd_f, gt_pcd_f, radius=0.025)) 56 | f005.append(kal.metrics.pointcloud.f_score(pred_pcd_f, gt_pcd_f, radius=0.05)) 57 | f01.append(kal.metrics.pointcloud.f_score(pred_pcd_f, gt_pcd_f, radius=0.1)) 58 | 59 | cd = sum(cds) / (b*n_f) 60 | f_score_001 = sum(f001) / (b*n_f) 61 | f_score_0025 = sum(f0025) / (b*n_f) 62 | f_score_005 = sum(f005) / (b*n_f) 63 | f_score_01 = sum(f01) / (b*n_f) 64 | 65 | return { 66 | "PCD_ChamferDistance(mm)": 1000.0 * cd.item(), 67 | "PCD_FScore_[.01]": 100.0*f_score_001.item(), 68 | "PCD_FScore_[.025]": 100.0*f_score_0025.item(), 69 | "PCD_FScore_[.05]": 100.0*f_score_005.item(), 70 | "PCD_FScore_[.1]": 100.0*f_score_01.item(), 71 | } 72 | -------------------------------------------------------------------------------- /egomono4d/repo/gmflow/gmflow/geometry.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | def coords_grid(b, h, w, homogeneous=False, device=None): 6 | y, x = torch.meshgrid(torch.arange(h), torch.arange(w)) # [H, W] 7 | 8 | stacks = [x, y] 9 | 10 | if homogeneous: 11 | ones = torch.ones_like(x) # [H, W] 12 | stacks.append(ones) 13 | 14 | grid = torch.stack(stacks, dim=0).float() # [2, H, W] or [3, H, W] 15 | 16 | grid = grid[None].repeat(b, 1, 1, 1) # [B, 2, H, W] or [B, 3, H, W] 17 | 18 | if device is not None: 19 | grid = grid.to(device) 20 | 21 | return grid 22 | 23 | 24 | def generate_window_grid(h_min, h_max, w_min, w_max, len_h, len_w, device=None): 25 | assert device is not None 26 | 27 | x, y = torch.meshgrid([torch.linspace(w_min, w_max, len_w, device=device), 28 | torch.linspace(h_min, h_max, len_h, device=device)], 29 | ) 30 | grid = torch.stack((x, y), -1).transpose(0, 1).float() # [H, W, 2] 31 | 32 | return grid 33 | 34 | 35 | def normalize_coords(coords, h, w): 36 | # coords: [B, H, W, 2] 37 | c = torch.Tensor([(w - 1) / 2., (h - 1) / 2.]).float().to(coords.device) 38 | return (coords - c) / c # [-1, 1] 39 | 40 | 41 | def bilinear_sample(img, sample_coords, mode='bilinear', padding_mode='zeros', return_mask=False): 42 | # img: [B, C, H, W] 43 | # sample_coords: [B, 2, H, W] in image scale 44 | if sample_coords.size(1) != 2: # [B, H, W, 2] 45 | sample_coords = sample_coords.permute(0, 3, 1, 2) 46 | 47 | b, _, h, w = sample_coords.shape 48 | 49 | # Normalize to [-1, 1] 50 | x_grid = 2 * sample_coords[:, 0] / (w - 1) - 1 51 | y_grid = 2 * sample_coords[:, 1] / (h - 1) - 1 52 | 53 | grid = torch.stack([x_grid, y_grid], dim=-1) # [B, H, W, 2] 54 | 55 | img = F.grid_sample(img, grid, mode=mode, padding_mode=padding_mode, align_corners=True) 56 | 57 | if return_mask: 58 | mask = (x_grid >= -1) & (y_grid >= -1) & (x_grid <= 1) & (y_grid <= 1) # [B, H, W] 59 | 60 | return img, mask 61 | 62 | return img 63 | 64 | 65 | def flow_warp(feature, flow, mask=False, padding_mode='zeros'): 66 | b, c, h, w = feature.size() 67 | assert flow.size(1) == 2 68 | 69 | grid = coords_grid(b, h, w).to(flow.device) + flow # [B, 2, H, W] 70 | 71 | return bilinear_sample(feature, grid, padding_mode=padding_mode, 72 | return_mask=mask) 73 | 74 | 75 | def forward_backward_consistency_check(fwd_flow, bwd_flow, 76 | alpha=0.01, 77 | beta=0.5 78 | ): 79 | # fwd_flow, bwd_flow: [B, 2, H, W] 80 | # alpha and beta values are following UnFlow (https://arxiv.org/abs/1711.07837) 81 | assert fwd_flow.dim() == 4 and bwd_flow.dim() == 4 82 | assert fwd_flow.size(1) == 2 and bwd_flow.size(1) == 2 83 | flow_mag = 
torch.norm(fwd_flow, dim=1) + torch.norm(bwd_flow, dim=1) # [B, H, W] 84 | 85 | warped_bwd_flow = flow_warp(bwd_flow, fwd_flow) # [B, 2, H, W] 86 | warped_fwd_flow = flow_warp(fwd_flow, bwd_flow) # [B, 2, H, W] 87 | 88 | diff_fwd = torch.norm(fwd_flow + warped_bwd_flow, dim=1) # [B, H, W] 89 | diff_bwd = torch.norm(bwd_flow + warped_fwd_flow, dim=1) 90 | 91 | threshold = alpha * flow_mag + beta 92 | 93 | fwd_occ = (diff_fwd > threshold).float() # [B, H, W] 94 | bwd_occ = (diff_bwd > threshold).float() 95 | 96 | return fwd_occ, bwd_occ 97 | -------------------------------------------------------------------------------- /egomono4d/repo/gmflow/gmflow/trident_conv.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # https://github.com/facebookresearch/detectron2/blob/main/projects/TridentNet/tridentnet/trident_conv.py 3 | 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as F 7 | from torch.nn.modules.utils import _pair 8 | 9 | 10 | class MultiScaleTridentConv(nn.Module): 11 | def __init__( 12 | self, 13 | in_channels, 14 | out_channels, 15 | kernel_size, 16 | stride=1, 17 | strides=1, 18 | paddings=0, 19 | dilations=1, 20 | dilation=1, 21 | groups=1, 22 | num_branch=1, 23 | test_branch_idx=-1, 24 | bias=False, 25 | norm=None, 26 | activation=None, 27 | ): 28 | super(MultiScaleTridentConv, self).__init__() 29 | self.in_channels = in_channels 30 | self.out_channels = out_channels 31 | self.kernel_size = _pair(kernel_size) 32 | self.num_branch = num_branch 33 | self.stride = _pair(stride) 34 | self.groups = groups 35 | self.with_bias = bias 36 | self.dilation = dilation 37 | if isinstance(paddings, int): 38 | paddings = [paddings] * self.num_branch 39 | if isinstance(dilations, int): 40 | dilations = [dilations] * self.num_branch 41 | if isinstance(strides, int): 42 | strides = [strides] * self.num_branch 43 | self.paddings = [_pair(padding) for padding in paddings] 44 | self.dilations = [_pair(dilation) for dilation in dilations] 45 | self.strides = [_pair(stride) for stride in strides] 46 | self.test_branch_idx = test_branch_idx 47 | self.norm = norm 48 | self.activation = activation 49 | 50 | assert len({self.num_branch, len(self.paddings), len(self.strides)}) == 1 51 | 52 | self.weight = nn.Parameter( 53 | torch.Tensor(out_channels, in_channels // groups, *self.kernel_size) 54 | ) 55 | if bias: 56 | self.bias = nn.Parameter(torch.Tensor(out_channels)) 57 | else: 58 | self.bias = None 59 | 60 | nn.init.kaiming_uniform_(self.weight, nonlinearity="relu") 61 | if self.bias is not None: 62 | nn.init.constant_(self.bias, 0) 63 | 64 | def forward(self, inputs): 65 | num_branch = self.num_branch if self.training or self.test_branch_idx == -1 else 1 66 | assert len(inputs) == num_branch 67 | 68 | if self.training or self.test_branch_idx == -1: 69 | outputs = [ 70 | F.conv2d(input, self.weight, self.bias, stride, padding, self.dilation, self.groups) 71 | for input, stride, padding in zip(inputs, self.strides, self.paddings) 72 | ] 73 | else: 74 | outputs = [ 75 | F.conv2d( 76 | inputs[0], 77 | self.weight, 78 | self.bias, 79 | self.strides[self.test_branch_idx] if self.test_branch_idx == -1 else self.strides[-1], 80 | self.paddings[self.test_branch_idx] if self.test_branch_idx == -1 else self.paddings[-1], 81 | self.dilation, 82 | self.groups, 83 | ) 84 | ] 85 | 86 | if self.norm is not None: 87 | outputs = [self.norm(x) for x in outputs] 88 | if self.activation is not None: 89 | 
outputs = [self.activation(x) for x in outputs] 90 | return outputs 91 | -------------------------------------------------------------------------------- /egomono4d/misc/fly.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | import torch 4 | 5 | 6 | def detect_flying_pixels(depth_map, threshold=10): 7 | # # depth_map: (h, w) 8 | 9 | depth_dx, depth_dy = np.gradient(depth_map) 10 | depth_grad = np.sqrt(depth_dx**2 + depth_dy**2) 11 | flying_pixels = depth_grad > threshold 12 | 13 | return flying_pixels 14 | 15 | 16 | def detect_sequence_flying_pixels(depth_sequence, threshold=10): 17 | """ 18 | Process a sequence of depth maps to detect flying pixels. 19 | 20 | Parameters: 21 | depth_sequence (numpy.ndarray): The input depth sequence with shape (f, h, w). 22 | threshold (int): The threshold for detecting depth discontinuities. 23 | 24 | Returns: 25 | numpy.ndarray: A binary sequence where flying pixels are marked as 1. 26 | """ 27 | f, h, w = depth_sequence.shape 28 | flying_pixels_sequence = np.zeros((f, h, w), dtype=np.uint8) 29 | 30 | for i in range(f): 31 | flying_pixels_sequence[i] = detect_flying_pixels(depth_sequence[i], threshold) 32 | 33 | return flying_pixels_sequence 34 | 35 | 36 | def calculate_edge_scale_torch(surfaces, fly_masks): 37 | """ 38 | Get the scale of the point cloud defined with mean edge distance. 39 | scale = \sigma_{(i,j) in edges} ||p_i - p_j|| 40 | 41 | Inputs: 42 | surfaces: torch.Tensor[batch*, h, w, 3] 43 | fly_masks: torch.Tensor[batch*, h, w] 44 | 45 | Return: 46 | scale: torch.Tensor[batch*] 47 | """ 48 | 49 | dist_right = torch.norm(surfaces[..., :, 1:, :] - surfaces[..., :, :-1, :], dim=-1) 50 | dist_down = torch.norm(surfaces[..., 1:, :, :] - surfaces[..., :-1, :, :], dim=-1) 51 | mask_right = fly_masks[..., :, 1:] * fly_masks[..., :, :-1] 52 | mask_down = fly_masks[..., 1:, :] * fly_masks[..., :-1, :] 53 | 54 | scale_right = (dist_right * mask_right).sum(dim=[-1,-2]) / mask_right.sum(dim=[-1,-2]) 55 | scale_left = (dist_down * mask_down).sum(dim=[-1,-2]) / mask_down.sum(dim=[-1,-2]) 56 | 57 | scale_edge = (scale_right + scale_left) * 0.5 58 | return scale_edge 59 | 60 | 61 | def calculate_scale_pts(pts): # (n, 3) 62 | 63 | n, _ = pts.shape 64 | surfaces_flat = pts[None] 65 | 66 | centroids = torch.mean(surfaces_flat, dim=1, keepdim=True) 67 | centered_points = surfaces_flat - centroids 68 | 69 | cov_matrices = torch.bmm(centered_points.transpose(1, 2), centered_points) / n 70 | eigenvalues, _ = torch.linalg.eigh(cov_matrices) # (batch*, 3) 71 | scale = torch.sqrt(eigenvalues[:, -1]) # pick the largest PCA item 72 | 73 | return scale[0] 74 | 75 | 76 | def calculate_scale_torch(surfaces): 77 | """ 78 | Get the scale of the point cloud defined with PCA analysis. 
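Concretely, scale = sqrt(lambda_max), the square root of the largest eigenvalue of the
covariance matrix of the flattened surface points, rather than the mean edge length used
by calculate_edge_scale_torch above, i.e.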
79 | scale = \sigma_{(i,j) in edges} ||p_i - p_j|| 80 | 81 | Inputs: 82 | surfaces: torch.Tensor[batch*, h, w, 3] 83 | fly_masks: torch.Tensor[batch*, h, w] 84 | 85 | Return: 86 | scale: torch.Tensor[batch*] 87 | """ 88 | 89 | batch_shape = surfaces.shape[:-3] 90 | h, w = surfaces.shape[-3:-1] 91 | surfaces_flat = surfaces.view(-1, h * w, 3) # (batch*, h*w, 3) 92 | 93 | centroids = torch.mean(surfaces_flat, dim=1, keepdim=True) 94 | centered_points = surfaces_flat - centroids 95 | 96 | cov_matrices = torch.bmm(centered_points.transpose(1, 2), centered_points) / (h * w) 97 | eigenvalues, _ = torch.linalg.eigh(cov_matrices) # (batch*, 3) 98 | scale = torch.sqrt(eigenvalues[:, -1]) # pick the largest PCA item 99 | scale = scale.reshape(batch_shape) 100 | 101 | return scale 102 | 103 | 104 | 105 | -------------------------------------------------------------------------------- /egomono4d/repo/gmflow/gmflow/matching.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | from .geometry import coords_grid, generate_window_grid, normalize_coords 5 | 6 | 7 | def global_correlation_softmax(feature0, feature1, 8 | pred_bidir_flow=False, 9 | ): 10 | # global correlation 11 | b, c, h, w = feature0.shape 12 | feature0 = feature0.view(b, c, -1).permute(0, 2, 1) # [B, H*W, C] 13 | feature1 = feature1.view(b, c, -1) # [B, C, H*W] 14 | 15 | correlation = torch.matmul(feature0, feature1).view(b, h, w, h, w) / (c ** 0.5) # [B, H, W, H, W] 16 | 17 | # flow from softmax 18 | init_grid = coords_grid(b, h, w).to(correlation.device) # [B, 2, H, W] 19 | grid = init_grid.view(b, 2, -1).permute(0, 2, 1) # [B, H*W, 2] 20 | 21 | correlation = correlation.view(b, h * w, h * w) # [B, H*W, H*W] 22 | 23 | if pred_bidir_flow: 24 | correlation = torch.cat((correlation, correlation.permute(0, 2, 1)), dim=0) # [2*B, H*W, H*W] 25 | init_grid = init_grid.repeat(2, 1, 1, 1) # [2*B, 2, H, W] 26 | grid = grid.repeat(2, 1, 1) # [2*B, H*W, 2] 27 | b = b * 2 28 | 29 | prob = F.softmax(correlation, dim=-1) # [B, H*W, H*W] 30 | 31 | correspondence = torch.matmul(prob, grid).view(b, h, w, 2).permute(0, 3, 1, 2) # [B, 2, H, W] 32 | 33 | # when predicting bidirectional flow, flow is the concatenation of forward flow and backward flow 34 | flow = correspondence - init_grid 35 | 36 | return flow, prob 37 | 38 | 39 | def local_correlation_softmax(feature0, feature1, local_radius, 40 | padding_mode='zeros', 41 | ): 42 | b, c, h, w = feature0.size() 43 | coords_init = coords_grid(b, h, w).to(feature0.device) # [B, 2, H, W] 44 | coords = coords_init.view(b, 2, -1).permute(0, 2, 1) # [B, H*W, 2] 45 | 46 | local_h = 2 * local_radius + 1 47 | local_w = 2 * local_radius + 1 48 | 49 | window_grid = generate_window_grid(-local_radius, local_radius, 50 | -local_radius, local_radius, 51 | local_h, local_w, device=feature0.device) # [2R+1, 2R+1, 2] 52 | window_grid = window_grid.reshape(-1, 2).repeat(b, 1, 1, 1) # [B, 1, (2R+1)^2, 2] 53 | sample_coords = coords.unsqueeze(-2) + window_grid # [B, H*W, (2R+1)^2, 2] 54 | 55 | sample_coords_softmax = sample_coords 56 | 57 | # exclude coords that are out of image space 58 | valid_x = (sample_coords[:, :, :, 0] >= 0) & (sample_coords[:, :, :, 0] < w) # [B, H*W, (2R+1)^2] 59 | valid_y = (sample_coords[:, :, :, 1] >= 0) & (sample_coords[:, :, :, 1] < h) # [B, H*W, (2R+1)^2] 60 | 61 | valid = valid_x & valid_y # [B, H*W, (2R+1)^2], used to mask out invalid values when softmax 62 | 63 | # normalize coordinates to [-1, 1] 64 | 
sample_coords_norm = normalize_coords(sample_coords, h, w) # [-1, 1] 65 | window_feature = F.grid_sample(feature1, sample_coords_norm, 66 | padding_mode=padding_mode, align_corners=True 67 | ).permute(0, 2, 1, 3) # [B, H*W, C, (2R+1)^2] 68 | feature0_view = feature0.permute(0, 2, 3, 1).view(b, h * w, 1, c) # [B, H*W, 1, C] 69 | 70 | corr = torch.matmul(feature0_view, window_feature).view(b, h * w, -1) / (c ** 0.5) # [B, H*W, (2R+1)^2] 71 | 72 | # mask invalid locations 73 | corr[~valid] = -1e9 74 | 75 | prob = F.softmax(corr, -1) # [B, H*W, (2R+1)^2] 76 | 77 | correspondence = torch.matmul(prob.unsqueeze(-2), sample_coords_softmax).squeeze(-2).view( 78 | b, h, w, 2).permute(0, 3, 1, 2) # [B, 2, H, W] 79 | 80 | flow = correspondence - coords_init 81 | match_prob = prob 82 | 83 | return flow, match_prob 84 | -------------------------------------------------------------------------------- /egomono4d/repo/gmflow/utils/dist_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | # https://github.com/open-mmlab/mmcv/blob/7540cf73ac7e5d1e14d0ffbd9b6759e83929ecfc/mmcv/runner/dist_utils.py 3 | 4 | import os 5 | import subprocess 6 | 7 | import torch 8 | import torch.multiprocessing as mp 9 | from torch import distributed as dist 10 | 11 | 12 | def init_dist(launcher, backend='nccl', **kwargs): 13 | if mp.get_start_method(allow_none=True) is None: 14 | mp.set_start_method('spawn') 15 | if launcher == 'pytorch': 16 | _init_dist_pytorch(backend, **kwargs) 17 | elif launcher == 'mpi': 18 | _init_dist_mpi(backend, **kwargs) 19 | elif launcher == 'slurm': 20 | _init_dist_slurm(backend, **kwargs) 21 | else: 22 | raise ValueError(f'Invalid launcher type: {launcher}') 23 | 24 | 25 | def _init_dist_pytorch(backend, **kwargs): 26 | # TODO: use local_rank instead of rank % num_gpus 27 | rank = int(os.environ['RANK']) 28 | num_gpus = torch.cuda.device_count() 29 | torch.cuda.set_device(rank % num_gpus) 30 | dist.init_process_group(backend=backend, **kwargs) 31 | 32 | 33 | def _init_dist_mpi(backend, **kwargs): 34 | rank = int(os.environ['OMPI_COMM_WORLD_RANK']) 35 | num_gpus = torch.cuda.device_count() 36 | torch.cuda.set_device(rank % num_gpus) 37 | dist.init_process_group(backend=backend, **kwargs) 38 | 39 | 40 | def _init_dist_slurm(backend, port=None): 41 | """Initialize slurm distributed training environment. 42 | If argument ``port`` is not specified, then the master port will be system 43 | environment variable ``MASTER_PORT``. If ``MASTER_PORT`` is not in system 44 | environment variable, then a default port ``29500`` will be used. 45 | Args: 46 | backend (str): Backend of torch.distributed. 47 | port (int, optional): Master port. Defaults to None. 
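Example (hypothetical launch; the exact entry point and flags depend on the training
script in use):
    srun -N 2 --ntasks-per-node=8 python main.py --launcher slurm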
48 | """ 49 | proc_id = int(os.environ['SLURM_PROCID']) 50 | ntasks = int(os.environ['SLURM_NTASKS']) 51 | node_list = os.environ['SLURM_NODELIST'] 52 | num_gpus = torch.cuda.device_count() 53 | torch.cuda.set_device(proc_id % num_gpus) 54 | addr = subprocess.getoutput( 55 | f'scontrol show hostname {node_list} | head -n1') 56 | # specify master port 57 | if port is not None: 58 | os.environ['MASTER_PORT'] = str(port) 59 | elif 'MASTER_PORT' in os.environ: 60 | pass # use MASTER_PORT in the environment variable 61 | else: 62 | # 29500 is torch.distributed default port 63 | os.environ['MASTER_PORT'] = '29500' 64 | # use MASTER_ADDR in the environment variable if it already exists 65 | if 'MASTER_ADDR' not in os.environ: 66 | os.environ['MASTER_ADDR'] = addr 67 | os.environ['WORLD_SIZE'] = str(ntasks) 68 | os.environ['LOCAL_RANK'] = str(proc_id % num_gpus) 69 | os.environ['RANK'] = str(proc_id) 70 | dist.init_process_group(backend=backend) 71 | 72 | 73 | def get_dist_info(): 74 | if dist.is_available(): 75 | initialized = dist.is_initialized() 76 | else: 77 | initialized = False 78 | if initialized: 79 | rank = dist.get_rank() 80 | world_size = dist.get_world_size() 81 | else: 82 | rank = 0 83 | world_size = 1 84 | return rank, world_size 85 | 86 | 87 | def setup_for_distributed(is_master): 88 | """ 89 | This function disables printing when not in master process 90 | """ 91 | import builtins as __builtin__ 92 | builtin_print = __builtin__.print 93 | 94 | def print(*args, **kwargs): 95 | force = kwargs.pop('force', False) 96 | if is_master or force: 97 | builtin_print(*args, **kwargs) 98 | 99 | __builtin__.print = print 100 | -------------------------------------------------------------------------------- /egomono4d/repo/gmflow/scripts/train_gmflow.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # GMFlow without refinement 4 | 5 | # number of gpus for training, please set according to your hardware 6 | # by default use all gpus on a machine 7 | # can be trained on 4x 16GB V100 or 2x 32GB V100 or 2x 40GB A100 gpus 8 | NUM_GPUS=4 9 | 10 | # chairs 11 | CHECKPOINT_DIR=checkpoints/chairs-gmflow && \ 12 | mkdir -p ${CHECKPOINT_DIR} && \ 13 | python -m torch.distributed.launch --nproc_per_node=${NUM_GPUS} --master_port=9989 main.py \ 14 | --launcher pytorch \ 15 | --checkpoint_dir ${CHECKPOINT_DIR} \ 16 | --batch_size 16 \ 17 | --val_dataset chairs sintel kitti \ 18 | --lr 4e-4 \ 19 | --image_size 384 512 \ 20 | --padding_factor 16 \ 21 | --upsample_factor 8 \ 22 | --with_speed_metric \ 23 | --val_freq 10000 \ 24 | --save_ckpt_freq 10000 \ 25 | --num_steps 100000 \ 26 | 2>&1 | tee -a ${CHECKPOINT_DIR}/train.log 27 | 28 | # things (our final model is trained for 800K iterations, for ablation study, you can train for 200K) 29 | CHECKPOINT_DIR=checkpoints/things-gmflow && \ 30 | mkdir -p ${CHECKPOINT_DIR} && \ 31 | python -m torch.distributed.launch --nproc_per_node=${NUM_GPUS} --master_port=9989 main.py \ 32 | --launcher pytorch \ 33 | --checkpoint_dir ${CHECKPOINT_DIR} \ 34 | --resume checkpoints/chairs-gmflow/step_100000.pth \ 35 | --stage things \ 36 | --batch_size 8 \ 37 | --val_dataset things sintel kitti \ 38 | --lr 2e-4 \ 39 | --image_size 384 768 \ 40 | --padding_factor 16 \ 41 | --upsample_factor 8 \ 42 | --with_speed_metric \ 43 | --val_freq 40000 \ 44 | --save_ckpt_freq 50000 \ 45 | --num_steps 800000 \ 46 | 2>&1 | tee -a ${CHECKPOINT_DIR}/train.log 47 | 48 | # sintel 49 | CHECKPOINT_DIR=checkpoints/sintel-gmflow && \ 50 
| mkdir -p ${CHECKPOINT_DIR} && \ 51 | python -m torch.distributed.launch --nproc_per_node=${NUM_GPUS} --master_port=9989 main.py \ 52 | --launcher pytorch \ 53 | --checkpoint_dir ${CHECKPOINT_DIR} \ 54 | --resume checkpoints/things-gmflow/step_800000.pth \ 55 | --stage sintel \ 56 | --batch_size 8 \ 57 | --val_dataset sintel kitti \ 58 | --lr 2e-4 \ 59 | --image_size 320 896 \ 60 | --padding_factor 16 \ 61 | --upsample_factor 8 \ 62 | --with_speed_metric \ 63 | --val_freq 20000 \ 64 | --save_ckpt_freq 20000 \ 65 | --num_steps 200000 \ 66 | 2>&1 | tee -a ${CHECKPOINT_DIR}/train.log 67 | 68 | # kitti 69 | CHECKPOINT_DIR=checkpoints/kitti-gmflow && \ 70 | mkdir -p ${CHECKPOINT_DIR} && \ 71 | python -m torch.distributed.launch --nproc_per_node=${NUM_GPUS} --master_port=9989 main.py \ 72 | --launcher pytorch \ 73 | --checkpoint_dir ${CHECKPOINT_DIR} \ 74 | --resume checkpoints/sintel-gmflow/step_200000.pth \ 75 | --stage kitti \ 76 | --batch_size 8 \ 77 | --val_dataset kitti \ 78 | --lr 2e-4 \ 79 | --image_size 320 1152 \ 80 | --padding_factor 16 \ 81 | --upsample_factor 8 \ 82 | --with_speed_metric \ 83 | --val_freq 10000 \ 84 | --save_ckpt_freq 10000 \ 85 | --num_steps 100000 \ 86 | 2>&1 | tee -a ${CHECKPOINT_DIR}/train.log 87 | 88 | 89 | # a final note: if your training is terminated unexpectedly, you can resume from the latest checkpoint 90 | # an example: resume chairs training 91 | # CHECKPOINT_DIR=checkpoints/chairs-gmflow && \ 92 | # mkdir -p ${CHECKPOINT_DIR} && \ 93 | # python -m torch.distributed.launch --nproc_per_node=${NUM_GPUS} --master_port=9989 main.py \ 94 | # --launcher pytorch \ 95 | # --checkpoint_dir ${CHECKPOINT_DIR} \ 96 | # --resume checkpoints/chairs-gmflow/checkpoint_latest.pth \ 97 | # --batch_size 16 \ 98 | # --val_dataset chairs sintel kitti \ 99 | # --lr 4e-4 \ 100 | # --image_size 384 512 \ 101 | # --padding_factor 16 \ 102 | # --upsample_factor 8 \ 103 | # --with_speed_metric \ 104 | # --val_freq 10000 \ 105 | # --save_ckpt_freq 10000 \ 106 | # --num_steps 100000 \ 107 | # 2>&1 | tee -a ${CHECKPOINT_DIR}/train.log 108 | 109 | -------------------------------------------------------------------------------- /egomono4d/repo/gmflow/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | -------------------------------------------------------------------------------- /egomono4d/tracking/track_predictor_cotracker.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | # from typing import Literal 3 | from typing_extensions import Literal 4 | import pdb 5 | import os 6 | 7 | import torch 8 | import torch.nn.functional as F 9 | from einops import rearrange 10 | from jaxtyping import Float 11 | from torch import Tensor 12 | 13 | from .track_predictor import TrackPredictor, Tracks, sample_image_grid_tracker 14 | import cotracker 15 | from cotracker.predictor import CoTrackerPredictor 16 | 17 | 18 | @dataclass 19 | class TrackPredictorCoTrackerCfg: 20 | name: Literal["cotracker"] 21 | grid_size: int 22 | similarity_threshold: float 23 | cache_dir: str | None 24 | # cache_path: str | None 25 | 26 | 27 | class TrackPredictorCoTracker(TrackPredictor[TrackPredictorCoTrackerCfg]): 28 | def __init__(self, cfg: TrackPredictorCoTrackerCfg) -> None: 29 | super().__init__(cfg) 30 | self.cache_dir = cfg.cache_dir 31 | # checkpoint = "scaled_offline.pth" 32 | checkpoint = "cotracker2.pth" 33 | self.tracker = CoTrackerPredictor(checkpoint=cfg.cache_dir+"/cotracker_checkpoints/"+checkpoint) 34 | grid_size = self.cfg.grid_size 35 | self.grid_queries = sample_image_grid_tracker((grid_size, grid_size))[None] 36 | self.grid_queries_init = False 37 | 38 | 39 | def calc_tracking( 40 | self, 41 | videos: Float[Tensor, "batch frame 3 height width"], 42 | query_frame: int, 43 | backward_tracking: bool=True 44 | ) -> Tracks: 45 | 46 | # (Michael) Ensuring that the coordinates of tracking points is INT for loss_tracking_robust. 47 | b, _, _, h, w = videos.shape 48 | if self.grid_queries_init is False: 49 | gs = self.grid_queries.clone() 50 | gs[..., 0] = gs[..., 0] * (w - 1) 51 | gs[..., 1] = gs[..., 1] * (h - 1) 52 | gs = torch.round(gs).to(videos.device) 53 | self.grid_queries = gs.reshape(1, -1, 2) 54 | self.grid_queries_init = True 55 | 56 | queries = torch.cat([torch.zeros_like(self.grid_queries[:, :, :1], device=videos.device) * query_frame, self.grid_queries], dim=-1) 57 | 58 | # pdb.set_trace() 59 | xy, visibility = self.tracker(videos*255, queries=queries.repeat(b, 1, 1), grid_query_frame=query_frame, backward_tracking=backward_tracking) 60 | xy, visibility = self.tracker( 61 | videos * 255, 62 | queries=queries.repeat(b, 1, 1), 63 | # grid_size=self.cfg.grid_size, 64 | grid_query_frame=query_frame, 65 | backward_tracking=backward_tracking, 66 | ) 67 | 68 | # Normalize the coordinates. 69 | b, f, _, h, w = videos.shape 70 | wh = torch.tensor((w-1, h-1), dtype=torch.float32, device=videos.device) 71 | xy = xy / wh 72 | 73 | # Filter visibility based on RGB values. 
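# Bilinearly sample the video at every predicted track location, compare each frame's
# colour to the colour seen at the query frame, and mark a track as occluded once the
# per-point colour difference exceeds cfg.similarity_threshold.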
74 | rgb = F.grid_sample( 75 | rearrange(videos, "b f c h w -> (b f) c h w"), 76 | rearrange(xy, "b f p xy -> (b f) p () xy"), 77 | mode="bilinear", 78 | padding_mode="zeros", 79 | align_corners=False, 80 | ) 81 | rgb = rearrange(rgb, "(b f) c p () -> b f p c", b=b, f=f) 82 | rgb_delta = (rgb[:, [query_frame]] - rgb).abs().norm(dim=-1) 83 | visibility = visibility & (rgb_delta < self.cfg.similarity_threshold) 84 | 85 | return Tracks(xy, visibility, 0) 86 | 87 | def forward( 88 | self, 89 | videos: Float[Tensor, "batch frame 3 height width"], 90 | query_frame: int, 91 | ) -> Tracks: 92 | 93 | if query_frame > 1: 94 | return self.calc_tracking(videos, query_frame, backward_tracking=True) 95 | elif query_frame == 0: 96 | return self.calc_tracking(videos, query_frame, backward_tracking=False) 97 | else: 98 | raise ValueError(f"Unsupport query_frame for co-trackerr, query_frame={query_frame}") 99 | 100 | -------------------------------------------------------------------------------- /egomono4d/misc/data_util.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, replace 2 | 3 | import torch.nn.functional as F 4 | from einops import rearrange 5 | from jaxtyping import Float 6 | from PIL import Image 7 | import numpy as np 8 | import pdb 9 | import os 10 | import torch 11 | from torch import Tensor 12 | from typing import Union 13 | 14 | try: 15 | EVAL = os.environ['EVAL_MODE'] 16 | except: 17 | EVAL = 'False' 18 | if EVAL not in ['True']: 19 | import open3d as o3d 20 | 21 | 22 | @dataclass 23 | class PreProcessingCfg: 24 | resize_shape: Union[tuple, int] = (300, 400) 25 | patch_size: int = 14 26 | num_frames: int = 4 27 | 28 | 29 | def compute_patch_cropped_shape( 30 | shape: tuple, 31 | patch_size: int, 32 | ) -> tuple: 33 | h, w = shape 34 | 35 | h_new = (h // patch_size) * patch_size 36 | w_new = (w // patch_size) * patch_size 37 | return h_new, w_new 38 | 39 | 40 | def pil_resize_to_center_crop( 41 | image: Image.Image, 42 | resize_shape: tuple, 43 | cropped_shape: tuple, 44 | depth_process=False 45 | ): # -> tuple[ 46 | # Image.Image, # the image itself 47 | # tuple[int, int], # image shape after scaling, before cropping 48 | 49 | w_old, h_old = image.size 50 | h_new, w_new = resize_shape 51 | h_crp, w_crp = cropped_shape 52 | 53 | # Figure out the scale factor needed to cover the desired shape with a uniformly 54 | # scaled version of the input image. Then, resize the input image. 55 | scale_factor = max(h_new / h_old, w_new / w_old) 56 | h_scaled = round(h_old * scale_factor) 57 | w_scaled = round(w_old * scale_factor) 58 | if depth_process is True: 59 | image_scaled = image.resize((w_scaled, h_scaled), Image.NEAREST) 60 | else: 61 | image_scaled = image.resize((w_scaled, h_scaled), Image.LANCZOS) 62 | 63 | # Center-crop the image. 
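# Split the leftover margin evenly to centre the (h_crp, w_crp) window; this assumes the
# crop is no larger than the scaled image, which holds whenever cropped_shape fits inside
# resize_shape, since the scale factor above takes the max over both axes.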
64 | x = (w_scaled - w_crp) // 2 65 | y = (h_scaled - h_crp) // 2 66 | image_cropped = image_scaled.crop((x, y, x + w_crp, y + h_crp)) 67 | return image_cropped, (h_scaled, w_scaled) 68 | 69 | 70 | def resize_crop_intrinisic( 71 | intrinsics: Float[Tensor, "*#batch 3 3"], 72 | origin_shape: tuple, 73 | scaled_shape: tuple, 74 | croped_shape: tuple 75 | ): 76 | h_old, w_old = origin_shape 77 | h_scl, w_scl = scaled_shape 78 | h_new, w_new = croped_shape 79 | 80 | # reshape updatation 81 | sx = w_scl / w_old 82 | sy = h_scl / h_old 83 | new_intrinsics = intrinsics.clone() 84 | new_intrinsics[..., 0, 0] *= sx 85 | new_intrinsics[..., 0, 2] *= sx 86 | new_intrinsics[..., 1, 1] *= sy 87 | new_intrinsics[..., 1, 2] *= sy 88 | 89 | # center_crop updataion 90 | offset_x = (w_scl - w_new) / 2 91 | offset_y = (h_scl - h_new) / 2 92 | new_intrinsics[0, 2] -= offset_x 93 | new_intrinsics[1, 2] -= offset_y 94 | 95 | return new_intrinsics 96 | 97 | 98 | def canonicalize_intrinisic( 99 | intrinsics: Float[Tensor, "*#batch 3 3"], 100 | shape: tuple 101 | ): 102 | # NOTE: (michael) Intrinsic Canonicalization to (1,1) size space for mixture dataset training. 103 | h, w = shape 104 | new_intrinsics = intrinsics.clone() 105 | new_intrinsics[..., 0, 0] = new_intrinsics[..., 0, 0] / w 106 | new_intrinsics[..., 0, 2] = new_intrinsics[..., 0, 2] / w 107 | new_intrinsics[..., 1, 1] = new_intrinsics[..., 1, 1] / h 108 | new_intrinsics[..., 1, 2] = new_intrinsics[..., 1, 2] / h 109 | return new_intrinsics 110 | 111 | 112 | def visualize_pcd_from_rgbd_fp(rgb_fp, depth_fp, intrinsic): 113 | color = o3d.io.read_image(rgb_fp) 114 | depth = o3d.io.read_image(depth_fp) 115 | rgbd_image = o3d.geometry.RGBDImage.create_from_color_and_depth( 116 | color, depth, convert_rgb_to_intensity=False) 117 | camera = o3d.camera.PinholeCameraIntrinsic() 118 | H, W, _ = np.asarray(color).shape 119 | camera.set_intrinsics(W, H, intrinsic[0,0], intrinsic[1,1], intrinsic[0,2], intrinsic[1,2]) 120 | pcd = o3d.geometry.PointCloud.create_from_rgbd_image(rgbd_image, camera) 121 | voxel_down_pcd = pcd.voxel_down_sample(voxel_size=0.025) 122 | return voxel_down_pcd -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .DS_Store 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | cache/* 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | cover/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | .pybuilder/ 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # poetry 101 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 102 | # This is especially recommended for binary packages to ensure reproducibility, and is more 103 | # commonly ignored for libraries. 104 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 105 | #poetry.lock 106 | 107 | # pdm 108 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 109 | #pdm.lock 110 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 111 | # in version control. 112 | # https://pdm.fming.dev/#use-with-ide 113 | .pdm.toml 114 | 115 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 116 | __pypackages__/ 117 | 118 | # Celery stuff 119 | celerybeat-schedule 120 | celerybeat.pid 121 | 122 | # SageMath parsed files 123 | *.sage.py 124 | 125 | # Environments 126 | .env 127 | .venv 128 | env/ 129 | venv/ 130 | ENV/ 131 | env.bak/ 132 | venv.bak/ 133 | 134 | # Spyder project settings 135 | .spyderproject 136 | .spyproject 137 | 138 | # Rope project settings 139 | .ropeproject 140 | 141 | # mkdocs documentation 142 | /site 143 | 144 | # mypy 145 | .mypy_cache/ 146 | .dmypy.json 147 | dmypy.json 148 | 149 | # Pyre type checker 150 | .pyre/ 151 | 152 | # pytype static type analyzer 153 | .pytype/ 154 | 155 | # Cython debug symbols 156 | cython_debug/ 157 | 158 | # PyCharm 159 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 160 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 161 | # and can be added to the global gitignore or merged into this file. For a more nuclear 162 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
163 | #.idea/ 164 | 165 | 166 | outputs 167 | datasets 168 | wandb 169 | checkpoints 170 | *.ckpt 171 | *.pt 172 | figures 173 | tables 174 | results 175 | events.out.tfevents* 176 | *.ply 177 | *.mp4 178 | *.pkl 179 | 180 | *.npy 181 | *.json 182 | *.tar.gz 183 | 184 | # cache/cotracker_checkpoints/* 185 | # cache/data_custom/* 186 | # cache/ego_hos_checkpoints/* 187 | # cache/gmflow_checkpoints/* 188 | # cache/models/* 189 | # cache/original_datasets/* 190 | # cache/processed_datasets/* 191 | # cache/unidepth_v2_checkpoints/* 192 | --------------------------------------------------------------------------------