├── utils ├── __init__.py ├── losess.py ├── ransac.py └── frame_utils.py ├── isaacsim ├── .gitignore ├── requirements.txt ├── pattern.png ├── replicate │ ├── __init__.py │ └── std_object.py ├── render.py ├── README.md ├── config │ └── hssd.yaml ├── utils_func.py ├── replicator.py └── custom_writer.py ├── datasets ├── .gitignore ├── Real │ └── xiaomeng │ │ ├── 0000_ir_l.png │ │ ├── 0000_ir_r.png │ │ ├── 0000_rgb.png │ │ ├── 0000_depth.png │ │ └── 0000_raw_disparity.png └── README.md ├── raw_aligned.png ├── assets ├── in-the-wild.png └── examples │ ├── 0000_ir_l.png │ ├── 0000_ir_r.png │ ├── 0000_rgb.png │ └── 0000_depth.png ├── .gitignore ├── conf ├── config.yaml └── task │ ├── eval_ldm_his.yaml │ ├── eval_his_sim.yaml │ ├── eval_ldm_mixed.yaml │ ├── eval_dreds_reprod.yaml │ ├── eval_ldm_mixed_rgb+raw.yaml │ ├── eval_ldm_mixed_cond_rgbd.yaml │ ├── eval_clearpose.yaml │ ├── eval_syntodd_rgbd.yaml │ ├── eval_sceneflow.yaml │ ├── eval_ldm_mono.yaml │ ├── eval_ldm.yaml │ ├── train_ldm_mixed.yaml │ ├── train_ldm_mono.yaml │ ├── train_sceneflow.yaml │ ├── train_hiss.yaml │ ├── train_ldm_mixed_rgb+raw.yaml │ ├── train_dreds_reprod.yaml │ ├── train_ldm_mixed_left+right+raw.yaml │ ├── train_ldm_mixed_cond_rgbd.yaml │ ├── train_clearpose.yaml │ ├── train_ldm_mixed_gapartnet.yaml │ └── train_syntodd_rgbd.yaml ├── pyrightconfig.json ├── scripts ├── check_sceneflow.py └── check_stereo.py ├── data ├── dataset.py ├── data_loader.py └── augmentor.py ├── core ├── praser.py └── resample.py ├── distributed_evaluate.py ├── README.md ├── config.py └── inference.py /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /isaacsim/.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | output_ir 3 | **/*.pyc -------------------------------------------------------------------------------- /isaacsim/requirements.txt: -------------------------------------------------------------------------------- 1 | hydra-core==1.3.2 2 | transforms3d -------------------------------------------------------------------------------- /datasets/.gitignore: -------------------------------------------------------------------------------- 1 | clearpose** 2 | DREDS** 3 | HISS** 4 | sceneflow** -------------------------------------------------------------------------------- /raw_aligned.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/songlin/d3roma/HEAD/raw_aligned.png -------------------------------------------------------------------------------- /isaacsim/pattern.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/songlin/d3roma/HEAD/isaacsim/pattern.png -------------------------------------------------------------------------------- /assets/in-the-wild.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/songlin/d3roma/HEAD/assets/in-the-wild.png -------------------------------------------------------------------------------- /assets/examples/0000_ir_l.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/songlin/d3roma/HEAD/assets/examples/0000_ir_l.png -------------------------------------------------------------------------------- /assets/examples/0000_ir_r.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/songlin/d3roma/HEAD/assets/examples/0000_ir_r.png -------------------------------------------------------------------------------- /assets/examples/0000_rgb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/songlin/d3roma/HEAD/assets/examples/0000_rgb.png -------------------------------------------------------------------------------- /assets/examples/0000_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/songlin/d3roma/HEAD/assets/examples/0000_depth.png -------------------------------------------------------------------------------- /datasets/Real/xiaomeng/0000_ir_l.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/songlin/d3roma/HEAD/datasets/Real/xiaomeng/0000_ir_l.png -------------------------------------------------------------------------------- /datasets/Real/xiaomeng/0000_ir_r.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/songlin/d3roma/HEAD/datasets/Real/xiaomeng/0000_ir_r.png -------------------------------------------------------------------------------- /datasets/Real/xiaomeng/0000_rgb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/songlin/d3roma/HEAD/datasets/Real/xiaomeng/0000_rgb.png -------------------------------------------------------------------------------- /datasets/Real/xiaomeng/0000_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/songlin/d3roma/HEAD/datasets/Real/xiaomeng/0000_depth.png -------------------------------------------------------------------------------- /isaacsim/replicate/__init__.py: -------------------------------------------------------------------------------- 1 | from .scene_replicator import Replicator 2 | from .std_object import STDObjectReplicator 3 | 4 | -------------------------------------------------------------------------------- /datasets/Real/xiaomeng/0000_raw_disparity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/songlin/d3roma/HEAD/datasets/Real/xiaomeng/0000_raw_disparity.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/*.pyc 2 | experiments* 3 | checkpoint 4 | _outputs* 5 | _outputs/** 6 | checkpoint/** 7 | test_* 8 | backup 9 | bad_sim* 10 | .vscode 11 | -------------------------------------------------------------------------------- /conf/config.yaml: -------------------------------------------------------------------------------- 1 | 2 | defaults: 3 | - _self_ 4 | - task: train_ldm 5 | 6 | debug: false 7 | seed: -1 8 | 9 | hydra: 10 | run: 11 | dir: _outputs/${hydra.job.name} 12 | -------------------------------------------------------------------------------- /conf/task/eval_ldm_his.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - train_ldm_his 3 | 4 | name: ldm_his 5 | resume_pretrained: 6 | camera_resolution: 640x360 # W,H 7 | image_size: [180, 320] # H,W 8 | eval_dataset: [HISS] 9 | eval_num_batch: -1 10 | 
eval_batch_size: 4 11 | num_inference_timesteps: 10 12 | num_intermediate_images: 5 13 | num_inference_rounds: 1 14 | 15 | -------------------------------------------------------------------------------- /conf/task/eval_his_sim.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - train_his_sim 3 | 4 | resume_pretrained: 5 | camera_resolution: 224x126 # WxH 6 | image_size: [126, 224] # H,W 7 | safe_ssi: true 8 | eval_dataset: [HISS] 9 | eval_num_batch: -1 10 | eval_batch_size: 32 11 | sampler: my_ddpm 12 | num_inference_timesteps: 128 13 | num_intermediate_images: 8 14 | num_inference_rounds: 1 15 | write_pcd: true -------------------------------------------------------------------------------- /conf/task/eval_ldm_mixed.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - train_ldm_mixed 3 | 4 | # was the best version in real during the submition to CoRL 2024 5 | name: eval_ldm_sf 6 | resume_pretrained: 7 | camera_resolution: 480x270 # W,H 8 | image_size: [180,320] # H,W 9 | eval_dataset: [Real_xiaomeng_fxm] 10 | eval_num_batch: -1 11 | eval_batch_size: 4 12 | num_inference_timesteps: 10 13 | num_intermediate_images: 5 14 | num_inference_rounds: 1 15 | -------------------------------------------------------------------------------- /conf/task/eval_dreds_reprod.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - train_dreds_reprod 3 | 4 | name: dreds 5 | resume_pretrained: 6 | cond_channels: left+right+raw 7 | camera_resolution: 224x126 # WxH 8 | image_size: [126, 224] # H,W 9 | safe_ssi: true 10 | train_dataset: [Dreds] 11 | eval_dataset: [Dreds] 12 | eval_num_batch: -1 13 | eval_batch_size: 32 14 | save_model_epochs: 5 15 | num_inference_timesteps: 128 16 | num_intermediate_images: 8 17 | sampler: my_ddpm 18 | -------------------------------------------------------------------------------- /conf/task/eval_ldm_mixed_rgb+raw.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - train_ldm_mixed_rgb+raw 3 | 4 | # was the best version in real during the submition to CoRL 2024 5 | name: eval_ldm_mixed_rgb+raw 6 | resume_pretrained: 7 | camera_resolution: 480x270 # W,H 8 | image_size: [180,320] # H,W 9 | eval_dataset: [Real_xiaomeng_fxm] 10 | eval_num_batch: -1 11 | eval_batch_size: 4 12 | num_inference_timesteps: 10 13 | num_intermediate_images: 5 14 | num_inference_rounds: 1 15 | -------------------------------------------------------------------------------- /conf/task/eval_ldm_mixed_cond_rgbd.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - train_ldm_mixed_cond_rgbd 3 | 4 | name: ldm_sf 5 | resume_pretrained: experiments/ldm_sf-0807.dep4.lr3e-05.v_prediction.nossi.scaled_linear.randn.ddpm1000.ClearPose_Dreds_HISS.240x320.rgb+raw.w0.0/best 6 | camera_resolution: 320x240 # WxH 7 | image_size: [240,320] # H,W 8 | eval_dataset: [ClearPose] 9 | eval_num_batch: -1 10 | sampler: ddim 11 | num_inference_timesteps: 10 12 | num_intermediate_images: 5 13 | num_inference_rounds: 1 -------------------------------------------------------------------------------- /conf/task/eval_clearpose.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - train_clearpose 3 | 4 | name: clearpose 5 | resume_pretrained: 
experiments/clearpose-0809.dep1.lr1e-04.sample.ssi.squaredcos_cap_v2.pyramid.my_ddpm128.ClearPose_Dreds_HISS.240x320.rgb+raw.w0.0/best 6 | eval_num_batch: -1 7 | camera_resolution: 320x240 # WxH 8 | image_size: [240,320] # H,W 9 | eval_dataset: [ClearPose] 10 | num_intermediate_images: 8 11 | sampler: my_ddpm 12 | plot_error_map: false 13 | plot_denoised_images: false 14 | eval_batch_size: 96 15 | eval_split: "test" 16 | safe_ssi: false 17 | -------------------------------------------------------------------------------- /conf/task/eval_syntodd_rgbd.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - train_syntodd_rgbd 3 | 4 | name: clearpose 5 | resume_pretrained: experiments/syntodd_rgbd-0810.dep1.lr1e-04.sample.ssi.squaredcos_cap_v2.pyramid.my_ddpm128.SynTODDRgbd.240x320.rgb+raw.w0.0/best 6 | eval_num_batch: -1 7 | camera_resolution: 320x240 # WxH 8 | image_size: [240,320] # H,W 9 | eval_dataset: [SynTODDRgbd] 10 | num_intermediate_images: 8 11 | sampler: my_ddpm 12 | plot_error_map: false 13 | plot_denoised_images: false 14 | eval_batch_size: 12 15 | eval_split: "test" 16 | safe_ssi: false 17 | -------------------------------------------------------------------------------- /conf/task/eval_sceneflow.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - train_sceneflow 3 | 4 | name: eval_sceneflow 5 | resume_pretrained: 6 | eval_dataset: [SceneFlow] 7 | eval_split: val 8 | camera_resolution: 960x540 9 | image_size: [540, 960] 10 | eval_num_batch: -1 11 | eval_batch_size: 3 12 | eval_output: "" # use default 13 | prediction_type: sample 14 | flow_guidance_mode: imputation 15 | flow_guidance_weights: [0] 16 | num_inference_rounds: 1 17 | num_inference_timesteps: 10 18 | num_intermediate_images: 5 19 | plot_denoised_images: true 20 | plot_intermediate_metrics: false 21 | write_pcd: false 22 | plot_error_map: true 23 | ensemble: false 24 | ssi: false -------------------------------------------------------------------------------- /pyrightconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "exclude": ["datasets", "experiments", "experiments.corl24", "checkpoint", "_outputs"], 3 | "reportPrivateImportUsage": false, 4 | "reportOptionalMemberAccess": false, 5 | "reportCallIssue": false, 6 | "reportPossiblyUnboundVariable": false, 7 | "reportArgumentType": false, 8 | "reportOptionalSubscript": false, 9 | "reportAttributeAccessIssue": false, 10 | "reportOptionalOperand": false, 11 | "reportIndexIssue": false, 12 | "reportAssignmentType": false, 13 | "reportOperatorIssue": false, 14 | "reportReturnType": false, 15 | "reportGeneralTypeIssues": false 16 | } -------------------------------------------------------------------------------- /conf/task/eval_ldm_mono.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - train_ldm_mono 3 | 4 | name: eval_ldm_mono 5 | resume_pretrained: experiments/ldm_mono-0809.dep4.lr3e-05.v_prediction.ssi.scaled_linear.randn.ddpm1000.SynTODD.240x320.rgb.w0.0/best 6 | eval_dataset: [SynTODD] 7 | eval_split: test 8 | # camera_resolution: 640x480 9 | # image_size: [480, 640] 10 | eval_num_batch: -1 11 | eval_batch_size: 16 12 | num_inference_rounds: 1 13 | num_inference_timesteps: 10 14 | num_intermediate_images: 5 15 | plot_denoised_images: false 16 | plot_error_map: true 17 | write_pcd: false 18 | # ensemble: false 19 | # safe_ssi: true 20 | # 
ransac_error_threshold: 0.6 # rmse error, 0.6 for nyu 21 | 22 | -------------------------------------------------------------------------------- /conf/task/eval_ldm.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - train_ldm 3 | 4 | name: eval_ft_sd2_hypersim 5 | resume_pretrained: experiments/d.fixed.lr3e-05.v_prediction.ssi.scaled_linear.randn.ssi.my_ddpm1000.HyperSim.240x320.cond4.w0.0/epoch_0038 6 | # train_dataset: [HyperSim] 7 | eval_dataset: [NYUv2] 8 | eval_split: val 9 | camera_resolution: 640x480 10 | image_size: [480, 640] 11 | eval_num_batch: -1 12 | eval_batch_size: 3 13 | eval_output: "" # use default 14 | flow_guidance_mode: imputation 15 | flow_guidance_weights: [0] 16 | num_inference_rounds: 1 17 | num_inference_timesteps: 10 18 | num_intermediate_images: 5 19 | plot_denoised_images: true 20 | write_pcd: false 21 | plot_error_map: true 22 | ensemble: false 23 | # safe_ssi: true 24 | # ransac_error_threshold: 0.6 # rmse error, 0.6 for nyu 25 | 26 | -------------------------------------------------------------------------------- /conf/task/train_ldm_mixed.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - cfg 3 | 4 | name: ldm_sf 5 | resume_pretrained: 6 | ldm: true 7 | depth_channels: 4 8 | divis_by: 8 9 | prediction_space: disp 10 | camera_resolution: 480x270 # W,H 11 | image_size: [180,320] # H,W 12 | train_dataset: [SceneFlow] 13 | eval_dataset: [SceneFlow] 14 | train_batch_size: 16 15 | gradient_accumulation_steps: 1 16 | eval_num_batch: -1 17 | eval_batch_size: 4 18 | lr_warmup_steps: 0 19 | learning_rate: 3e-5 20 | lr_scheduler: constant # linear: almost the same as constant 21 | val_every_global_steps: 1000 22 | save_model_epochs: 3 23 | num_train_timesteps: 1000 24 | num_inference_timesteps: 10 25 | num_intermediate_images: 5 26 | num_inference_rounds: 1 27 | ssi: false 28 | normalize_mode: average 29 | num_chs: 1 30 | ch_bounds: [128.] 31 | ch_gammas: [1.] 32 | noise_strategy: randn 33 | loss_type: mse 34 | prediction_type: v_prediction 35 | sampler: ddpm 36 | num_epochs: 200 37 | cond_channels: left+right+raw 38 | beta_schedule: scaled_linear 39 | beta_start: 0.00085 40 | beta_end: 0.012 41 | mixed_precision: "no" 42 | thresholding: false 43 | clip_sample: false 44 | block_out_channels: [0] # N/A 45 | -------------------------------------------------------------------------------- /conf/task/train_ldm_mono.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - cfg 3 | 4 | name: ldm_mono 5 | resume_pretrained: 6 | ldm: true 7 | depth_channels: 4 8 | divis_by: 8 9 | prediction_space: depth 10 | camera_resolution: 320x240 # WxH 11 | image_size: [240,320] # H,W 12 | train_dataset: [SynTODD] 13 | eval_dataset: [SynTODD] 14 | dataset_weight: [1] 15 | train_batch_size: 12 16 | gradient_accumulation_steps: 1 17 | eval_num_batch: -1 18 | eval_batch_size: 4 19 | lr_warmup_steps: 5000 20 | learning_rate: 3e-5 21 | lr_scheduler: constant # linear: almost the same as constant 22 | val_every_global_steps: 1000 23 | save_model_epochs: 3 24 | num_train_timesteps: 1000 25 | num_inference_timesteps: 10 26 | num_intermediate_images: 5 27 | num_inference_rounds: 1 28 | ssi: true 29 | normalize_mode: average 30 | num_chs: 1 31 | ch_bounds: [1.] 32 | ch_gammas: [1.] 
33 | noise_strategy: randn 34 | loss_type: mse 35 | prediction_type: v_prediction 36 | sampler: ddpm 37 | num_epochs: 200 38 | cond_channels: rgb 39 | beta_schedule: scaled_linear 40 | beta_start: 0.00085 41 | beta_end: 0.012 42 | mixed_precision: "no" 43 | thresholding: false 44 | clip_sample: false 45 | block_out_channels: [0] # N/A 46 | -------------------------------------------------------------------------------- /conf/task/train_sceneflow.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - cfg 3 | 4 | name: sceneflow 5 | ldm: false 6 | prediction_space: disp 7 | ssi: false 8 | normalize_mode: average 9 | ch_bounds: [128] 10 | ch_gammas: [1.0] 11 | resume_pretrained: 12 | camera_resolution: 480x270 #960x540 # W,H 13 | image_size: [270, 480] # H,W 14 | train_dataset: [SceneFlow] 15 | eval_dataset: [SceneFlow] 16 | train_batch_size: 4 17 | eval_num_batch: -1 18 | eval_batch_size: 8 19 | lr_warmup_steps: 1000 20 | learning_rate: 1e-4 21 | lr_scheduler: linear 22 | gradient_accumulation_steps: 1 23 | val_every_global_steps: 2000 24 | save_model_epochs: 5 25 | num_train_timesteps: 128 26 | num_inference_timesteps: 10 27 | num_intermediate_images: 5 28 | num_inference_rounds: 1 29 | block_out_channels: [128, 128, 256, 256, 512, 512] 30 | noise_strategy: pyramid 31 | loss_type: l1 32 | prediction_type: sample 33 | num_epochs: 600 34 | cond_channels: left+right+raw 35 | depth_channels: 3 36 | beta_schedule: squaredcos_cap_v2 37 | beta_start: 1e-4 38 | beta_end: 2e-2 39 | sampler: my_ddpm 40 | mixed_precision: "no" 41 | thresholding: true 42 | dynamic_thresholding_ratio: 0.995 43 | clip_sample: true 44 | clip_sample_range: 1.0 -------------------------------------------------------------------------------- /conf/task/train_hiss.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - cfg 3 | 4 | name: hiss 5 | ldm: false 6 | prediction_space: disp 7 | resume_pretrained: 8 | cond_channels: left+right+raw 9 | camera_resolution: 224x126 # WxH 10 | image_size: [126, 224] # H,W 11 | ssi: true 12 | safe_ssi: true 13 | train_dataset: [HISS] 14 | eval_dataset: [HISS] 15 | normalize_mode: average 16 | ch_bounds: [64.] 17 | ch_gammas: [1.] 
18 | num_chs: 1 19 | norm_s: 2 20 | norm_t: 0.5 21 | train_batch_size: 32 22 | eval_num_batch: -1 23 | eval_batch_size: 32 24 | lr_warmup_steps: 1000 25 | learning_rate: 0.0001 26 | lr_scheduler: constant 27 | gradient_accumulation_steps: 1 28 | val_every_global_steps: 5000 29 | save_model_epochs: 5 30 | num_train_timesteps: 128 31 | num_inference_timesteps: 8 32 | num_intermediate_images: 4 33 | num_inference_rounds: 1 34 | block_out_channels: [128, 128, 256, 256, 512, 512] 35 | noise_strategy: pyramid 36 | loss_type: mse 37 | prediction_type: sample 38 | num_epochs: 200 39 | depth_channels: 1 40 | beta_schedule: squaredcos_cap_v2 41 | beta_start: 0.0001 42 | beta_end: 0.02 43 | sampler: my_ddpm 44 | mixed_precision: "no" 45 | thresholding: true 46 | dynamic_thresholding_ratio: 0.995 47 | clip_sample: true 48 | clip_sample_range: 1.0 -------------------------------------------------------------------------------- /conf/task/train_ldm_mixed_rgb+raw.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - cfg 3 | 4 | name: ldm_sf 5 | resume_pretrained: 6 | ldm: true 7 | depth_channels: 4 8 | divis_by: 8 9 | prediction_space: disp 10 | camera_resolution: 480x270 # W,H 11 | image_size: [180,320] # H,W 12 | train_dataset: [Dreds, HISS, ClearPose] 13 | dataset_weight: [1,1,1] 14 | eval_dataset: [Dreds, HISS, Real_xiaomeng_fxm] 15 | train_batch_size: 16 16 | gradient_accumulation_steps: 1 17 | eval_num_batch: -1 18 | eval_batch_size: 4 19 | lr_warmup_steps: 0 20 | learning_rate: 3e-5 21 | lr_scheduler: constant # linear: almost the same as constant 22 | val_every_global_steps: 1000 23 | save_model_epochs: 3 24 | num_train_timesteps: 1000 25 | num_inference_timesteps: 10 26 | num_intermediate_images: 5 27 | num_inference_rounds: 1 28 | ssi: false 29 | normalize_mode: average 30 | num_chs: 1 31 | ch_bounds: [128.] 32 | ch_gammas: [1.] 33 | noise_strategy: randn 34 | loss_type: mse 35 | prediction_type: v_prediction 36 | sampler: ddpm 37 | num_epochs: 200 38 | cond_channels: rgb+raw 39 | beta_schedule: scaled_linear 40 | beta_start: 0.00085 41 | beta_end: 0.012 42 | mixed_precision: "no" 43 | thresholding: false 44 | clip_sample: false 45 | block_out_channels: [0] # N/A 46 | -------------------------------------------------------------------------------- /conf/task/train_dreds_reprod.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - cfg 3 | 4 | name: dreds 5 | ldm: false 6 | prediction_space: disp 7 | resume_pretrained: 8 | cond_channels: left+right+raw 9 | camera_resolution: 224x126 # WxH 10 | image_size: [126, 224] # H,W 11 | ssi: true 12 | safe_ssi: true 13 | train_dataset: [Dreds] 14 | eval_dataset: [Dreds] 15 | normalize_mode: average 16 | ch_bounds: [64.] 17 | ch_gammas: [1.] 
18 | num_chs: 1
19 | norm_s: 2
20 | norm_t: 0.5
21 | train_batch_size: 32
22 | eval_num_batch: -1
23 | eval_batch_size: 32
24 | lr_warmup_steps: 1000
25 | learning_rate: 0.0001
26 | lr_scheduler: constant
27 | gradient_accumulation_steps: 1
28 | val_every_global_steps: 5000
29 | save_model_epochs: 5
30 | num_train_timesteps: 128
31 | num_inference_timesteps: 8
32 | num_intermediate_images: 4
33 | num_inference_rounds: 1
34 | block_out_channels: [128, 128, 256, 256, 512, 512]
35 | noise_strategy: pyramid
36 | loss_type: mse
37 | prediction_type: sample
38 | num_epochs: 200
39 | depth_channels: 1
40 | beta_schedule: squaredcos_cap_v2
41 | beta_start: 0.0001
42 | beta_end: 0.02
43 | sampler: my_ddpm
44 | mixed_precision: "no"
45 | thresholding: true
46 | dynamic_thresholding_ratio: 0.995
47 | clip_sample: true
48 | clip_sample_range: 1.0
--------------------------------------------------------------------------------
/conf/task/train_ldm_mixed_left+right+raw.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 |   - cfg
3 | 
4 | name: ldm_sf
5 | resume_pretrained: 
6 | ldm: true
7 | depth_channels: 4
8 | divis_by: 8
9 | prediction_space: disp
10 | camera_resolution: 480x270 # W,H
11 | image_size: [180,320] # H,W
12 | train_dataset: [SceneFlow, Dreds, HISS]
13 | dataset_weight: [1,1,1]
14 | eval_dataset: [SceneFlow, Dreds, HISS, Real_xiaomeng_fxm]
15 | train_batch_size: 16
16 | gradient_accumulation_steps: 1
17 | eval_num_batch: -1
18 | eval_batch_size: 4
19 | lr_warmup_steps: 0
20 | learning_rate: 3e-5
21 | lr_scheduler: constant # linear: almost the same as constant
22 | val_every_global_steps: 1000
23 | save_model_epochs: 3
24 | num_train_timesteps: 1000
25 | num_inference_timesteps: 10
26 | num_intermediate_images: 5
27 | num_inference_rounds: 1
28 | ssi: false
29 | normalize_mode: average
30 | num_chs: 1
31 | ch_bounds: [128.]
32 | ch_gammas: [1.]
33 | noise_strategy: randn
34 | loss_type: mse
35 | prediction_type: v_prediction
36 | sampler: ddpm
37 | num_epochs: 200
38 | cond_channels: left+right+raw
39 | beta_schedule: scaled_linear
40 | beta_start: 0.00085
41 | beta_end: 0.012
42 | mixed_precision: "no"
43 | thresholding: false
44 | clip_sample: false
45 | block_out_channels: [0] # N/A
46 | 
--------------------------------------------------------------------------------
/conf/task/train_ldm_mixed_cond_rgbd.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 |   - cfg
3 | 
4 | name: ldm_sf
5 | resume_pretrained: 
6 | ldm: true
7 | depth_channels: 4
8 | divis_by: 8
9 | prediction_space: disp
10 | camera_resolution: 320x240 # WxH
11 | image_size: [240,320] # H,W
12 | train_dataset: [ClearPose, Dreds, HISS] # [Dreds] # 
13 | eval_dataset: [ClearPose, Dreds, HISS] # [Dreds] # 
14 | dataset_weight: [1, 1, 1] # [1] # 
15 | train_batch_size: 16
16 | gradient_accumulation_steps: 1
17 | eval_num_batch: -1
18 | eval_batch_size: 4
19 | lr_warmup_steps: 5000
20 | learning_rate: 3e-5
21 | lr_scheduler: constant # linear: almost the same as constant
22 | val_every_global_steps: 1000
23 | save_model_epochs: 3
24 | num_train_timesteps: 1000
25 | num_inference_timesteps: 10
26 | num_intermediate_images: 5
27 | num_inference_rounds: 1
28 | ssi: false
29 | normalize_mode: average
30 | num_chs: 1
31 | ch_bounds: [64.0]
32 | ch_gammas: [1.]
33 | noise_strategy: randn 34 | loss_type: mse 35 | prediction_type: v_prediction 36 | sampler: ddpm 37 | num_epochs: 200 38 | cond_channels: rgb+raw 39 | beta_schedule: scaled_linear 40 | beta_start: 0.00085 41 | beta_end: 0.012 42 | mixed_precision: "no" 43 | thresholding: false 44 | clip_sample: false 45 | block_out_channels: [0] # N/A 46 | -------------------------------------------------------------------------------- /conf/task/train_clearpose.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - cfg 3 | 4 | name: clearpose 5 | ldm: false 6 | prediction_space: disp 7 | resume_pretrained: 8 | cond_channels: rgb+raw 9 | camera_resolution: 320x240 # WxH 10 | image_size: [240, 320] # H,W 11 | ssi: true 12 | safe_ssi: false 13 | train_dataset: [ClearPose, Dreds, HISS] # [Dreds] # 14 | eval_dataset: [ClearPose, Dreds, HISS] # [Dreds] # 15 | dataset_weight: [1, 1, 1] # [1] # 16 | normalize_mode: average 17 | ch_bounds: [64.] 18 | ch_gammas: [1.] 19 | num_chs: 1 20 | norm_s: 2 21 | norm_t: 0.5 22 | train_batch_size: 12 # 32 works for 224x126 23 | eval_num_batch: -1 24 | eval_batch_size: 32 25 | lr_warmup_steps: 5000 26 | learning_rate: 0.0001 27 | lr_scheduler: constant 28 | gradient_accumulation_steps: 1 29 | val_every_global_steps: 5000 30 | save_model_epochs: 5 31 | num_train_timesteps: 128 32 | num_inference_timesteps: 8 33 | num_intermediate_images: 4 34 | num_inference_rounds: 1 35 | block_out_channels: [128, 128, 256, 256, 512, 512] 36 | noise_strategy: pyramid 37 | loss_type: mse 38 | prediction_type: sample 39 | num_epochs: 200 40 | depth_channels: 1 41 | beta_schedule: squaredcos_cap_v2 42 | beta_start: 0.0001 43 | beta_end: 0.02 44 | sampler: my_ddpm 45 | mixed_precision: "no" 46 | thresholding: true 47 | dynamic_thresholding_ratio: 0.995 48 | clip_sample: true 49 | clip_sample_range: 1.0 -------------------------------------------------------------------------------- /conf/task/train_ldm_mixed_gapartnet.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - cfg 3 | 4 | name: ldm_mixed_gapartnet 5 | resume_pretrained: 6 | ldm: true 7 | depth_channels: 4 8 | divis_by: 8 9 | prediction_space: disp 10 | camera_resolution: 320x180 # W,H 11 | # camera_resolution: 480x270 # W,H 12 | image_size: [180,320] # H,W 13 | # image_size: [270,480] # H,W 14 | train_dataset: [SceneFlow, Dreds, HISS, Gapartnet2] 15 | eval_dataset: [SceneFlow, Dreds, HISS, Gapartnet2, Real] 16 | dataset_weight: [1, 1, 1, 1] 17 | train_batch_size: 16 18 | gradient_accumulation_steps: 1 19 | eval_num_batch: 10 20 | eval_batch_size: 4 21 | lr_warmup_steps: 0 22 | learning_rate: 3e-5 23 | lr_scheduler: constant # linear: almost the same as constant 24 | val_every_global_steps: 1000 25 | save_model_epochs: 3 26 | num_train_timesteps: 1000 27 | num_inference_timesteps: 10 28 | num_intermediate_images: 5 29 | num_inference_rounds: 1 30 | ssi: false 31 | normalize_mode: average 32 | num_chs: 1 33 | ch_bounds: [128.] 34 | ch_gammas: [1.] 
35 | noise_strategy: randn 36 | loss_type: mse 37 | prediction_type: v_prediction 38 | sampler: ddpm 39 | num_epochs: 200 40 | cond_channels: left+right+raw 41 | beta_schedule: scaled_linear 42 | beta_start: 0.00085 43 | beta_end: 0.012 44 | mixed_precision: "no" 45 | thresholding: false 46 | clip_sample: false 47 | block_out_channels: [0] # N/A 48 | -------------------------------------------------------------------------------- /conf/task/train_syntodd_rgbd.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - cfg 3 | 4 | name: syntodd_rgbd 5 | ldm: false 6 | prediction_space: disp 7 | resume_pretrained: 8 | cond_channels: rgb+raw 9 | camera_resolution: 320x240 # WxH 10 | image_size: [240, 320] # H,W 11 | ssi: true 12 | safe_ssi: false 13 | train_dataset: [SynTODDRgbd] # 14 | eval_dataset: [SynTODDRgbd] # 15 | dataset_variant: simdepth # "simdepth", "erodedepth", "dilatedepth" 16 | dataset_weight: [1] # [1] # 17 | normalize_mode: average 18 | ch_bounds: [64.] 19 | ch_gammas: [1.] 20 | num_chs: 1 21 | norm_s: 2 22 | norm_t: 0.5 23 | train_batch_size: 12 # 32 works for 224x126 24 | eval_num_batch: -1 25 | eval_batch_size: 32 26 | lr_warmup_steps: 5000 27 | learning_rate: 0.0001 28 | lr_scheduler: constant 29 | gradient_accumulation_steps: 1 30 | val_every_global_steps: 5000 31 | save_model_epochs: 5 32 | num_train_timesteps: 128 33 | num_inference_timesteps: 8 34 | num_intermediate_images: 4 35 | num_inference_rounds: 1 36 | block_out_channels: [128, 128, 256, 256, 512, 512] 37 | noise_strategy: pyramid 38 | loss_type: mse 39 | prediction_type: sample 40 | num_epochs: 200 41 | depth_channels: 1 42 | beta_schedule: squaredcos_cap_v2 43 | beta_start: 0.0001 44 | beta_end: 0.02 45 | sampler: my_ddpm 46 | mixed_precision: "no" 47 | thresholding: true 48 | dynamic_thresholding_ratio: 0.995 49 | clip_sample: true 50 | clip_sample_range: 1.0 -------------------------------------------------------------------------------- /isaacsim/render.py: -------------------------------------------------------------------------------- 1 | """Generate infrared rendering using replicator 2 | """ 3 | import json 4 | import math 5 | import os 6 | import random 7 | import sys 8 | 9 | import carb 10 | import yaml 11 | from omni.isaac.kit import SimulationApp 12 | 13 | from omegaconf import DictConfig, OmegaConf 14 | from hydra import compose, initialize 15 | import hydra 16 | 17 | # hydra: load config 18 | with initialize(version_base=None, config_path="config", job_name="replicator_ir"): 19 | cfg = compose(config_name="hssd.yaml" , overrides=sys.argv[1:]) 20 | 21 | if cfg["seed"] >= 0: 22 | random.seed(cfg["seed"]) 23 | 24 | # start simulation 25 | _app = SimulationApp(launch_config=cfg['launch_config']) 26 | _Log = _app.app.print_and_log 27 | 28 | from omni.isaac.core import World 29 | from replicator import IRReplicator 30 | 31 | # main program 32 | def run(cfg: DictConfig) -> None: 33 | _Log("start running") 34 | _world = World(set_defaults=True) #**cfg['world'], 35 | _world.set_simulation_dt(**cfg["world"]) 36 | 37 | # start replicator 38 | rep = IRReplicator(_app, _world, cfg) 39 | rep.start() 40 | 41 | _Log("keep GUI running if headless is False") 42 | while _app.is_running() and not cfg['launch_config']['headless']: 43 | _world.step(render=True) 44 | 45 | _app.close() 46 | 47 | if __name__ == "__main__": 48 | run(cfg) 49 | -------------------------------------------------------------------------------- /isaacsim/README.md: 
--------------------------------------------------------------------------------
1 | 
2 | ## Data Generation in Simulation
3 | 
4 | Although we do not plan to release all the sources for generating the `HISS` dataset, I want to share example code for generating IR renderings using [IsaacSim 4.0.0](https://docs.isaacsim.omniverse.nvidia.com/4.0.0/installation/install_container.html).
5 | 
6 | > This code should also work on newer versions of Isaac Sim with very few changes. If you encounter any problems, please feel free to contact me.
7 | 
8 | 
9 | ### 1. Prepare data
10 | 
11 | + Download [HSSD scenes](https://huggingface.co/datasets/hssd/hssd-scenes)
12 | 
13 | Note that the HSSD scenes are very large; you can download just the ones you need.
14 | 
15 | E.g., I set [107734119_175999932](https://huggingface.co/datasets/hssd/hssd-scenes/blob/main/scenes/107734119_175999932.glb) as the default scene in `config/hssd.yaml`.
16 | 
17 | Please first convert it to a USD file using [USD Composer](https://docs.omniverse.nvidia.com/composer/latest/index.html).
18 | 
19 | + Download the object CAD models from DREDS: [link](https://mirrors.pku.edu.cn/dl-release/DREDS_ECCV2022/data/cad_model/)
20 | 
21 | + Download NVIDIA Omniverse [vMaterials_2](https://developer.nvidia.com/vmaterials)
22 | 
23 | 
24 | Put them all in the `data` folder; example folder structure:
25 | 
26 | ```
27 | data
28 | ├── dreds
29 | │   ├── cad_model
30 | │   │   ├── 00000000
31 | │   │   ├── 02691156
32 | │   │   ├── 02876657
33 | │   │   ├── 02880940
34 | │   │   ├── 02942699
35 | │   │   ├── 02946921
36 | │   │   ├── 02954340
37 | │   │   ├── 02958343
38 | │   │   ├── 02992529
39 | │   │   └── 03797390
40 | │   └── output
41 | ├── hssd
42 | │   └── scenes
43 | │       └── 107734119_175999932
44 | └── vMaterials_2
45 |     ├── Carpet
46 | .....
47 | ```
48 | 
49 | ### 2. Start the Isaac Sim 4.0.0 container
50 | 
51 | Change the project directory to yours and start the isaac-sim container:
52 | 
53 | ```
54 | docker run --name isaac-sim --entrypoint bash -it --runtime=nvidia --gpus all -e "ACCEPT_EULA=Y" --rm --network=host \
55 |     -e "PRIVACY_CONSENT=Y" \
56 |     -v ~/workspace/projects/d3roma/isaacsim:/root/d3roma:rw \
57 |     -v ~/docker/isaac-sim/cache/kit:/isaac-sim/kit/cache:rw \
58 |     -v ~/docker/isaac-sim/cache/ov:/root/.cache/ov:rw \
59 |     -v ~/docker/isaac-sim/cache/pip:/root/.cache/pip:rw \
60 |     -v ~/docker/isaac-sim/cache/glcache:/root/.cache/nvidia/GLCache:rw \
61 |     -v ~/docker/isaac-sim/cache/computecache:/root/.nv/ComputeCache:rw \
62 |     -v ~/docker/isaac-sim/logs:/root/.nvidia-omniverse/logs:rw \
63 |     -v ~/docker/isaac-sim/data:/root/.local/share/ov/data:rw \
64 |     -v ~/docker/isaac-sim/documents:/root/Documents:rw \
65 |     nvcr.io/nvidia/isaac-sim:4.0.0
66 | ```
67 | 
68 | ### 3. Install Python packages into isaac-sim
69 | 
70 | ```
71 | /isaac-sim/python.sh -m pip install -r requirements.txt
72 | ```
73 | 
74 | ### 4. Generate IR renderings
75 | ```
76 | cd /root/d3roma
77 | /isaac-sim/python.sh render.py
78 | ```
79 | 
80 | 
81 | 
82 | 
--------------------------------------------------------------------------------
/utils/losess.py:
--------------------------------------------------------------------------------
1 | """
2 | Helpers for various likelihood-based losses. These are ported from the original
3 | Ho et al.
diffusion models codebase: 4 | https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/utils.py 5 | """ 6 | 7 | import numpy as np 8 | import torch as th 9 | 10 | def mse_to_vlb(t, mse, logvar_clipped): 11 | """ t: bs 12 | mse: bs 13 | """ 14 | if t == 0: 15 | return discretized_gaussian_log_likelihood() 16 | else: 17 | return 0.5 * ( 18 | # -1.0 19 | # + logvar2 20 | # - logvar1 21 | # + th.exp(logvar1 - logvar2) 22 | + mse * th.exp(-logvar_clipped[t]) / np.log(2.0) 23 | ) 24 | 25 | def normal_kl(mean1, logvar1, mean2, logvar2): 26 | """ 27 | Compute the KL divergence between two gaussians. 28 | 29 | Shapes are automatically broadcasted, so batches can be compared to 30 | scalars, among other use cases. 31 | """ 32 | tensor = None 33 | for obj in (mean1, logvar1, mean2, logvar2): 34 | if isinstance(obj, th.Tensor): 35 | tensor = obj 36 | break 37 | assert tensor is not None, "at least one argument must be a Tensor" 38 | 39 | # Force variances to be Tensors. Broadcasting helps convert scalars to 40 | # Tensors, but it does not work for th.exp(). 41 | logvar1, logvar2 = [ 42 | x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor) 43 | for x in (logvar1, logvar2) 44 | ] 45 | 46 | return 0.5 * ( 47 | -1.0 48 | + logvar2 49 | - logvar1 50 | + th.exp(logvar1 - logvar2) 51 | + ((mean1 - mean2) ** 2) * th.exp(-logvar2) 52 | ) 53 | 54 | 55 | def approx_standard_normal_cdf(x): 56 | """ 57 | A fast approximation of the cumulative distribution function of the 58 | standard normal. 59 | """ 60 | return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3)))) 61 | 62 | 63 | def discretized_gaussian_log_likelihood(x, *, means, log_scales): 64 | """ 65 | Compute the log-likelihood of a Gaussian distribution discretizing to a 66 | given image. 67 | 68 | :param x: the target images. It is assumed that this was uint8 values, 69 | rescaled to the range [-1, 1]. 70 | :param means: the Gaussian mean Tensor. 71 | :param log_scales: the Gaussian log stddev Tensor. 72 | :return: a tensor like x of log probabilities (in nats). 
73 | """ 74 | assert x.shape == means.shape == log_scales.shape 75 | centered_x = x - means 76 | inv_stdv = th.exp(-log_scales) 77 | plus_in = inv_stdv * (centered_x + 1.0 / 255.0) 78 | cdf_plus = approx_standard_normal_cdf(plus_in) 79 | min_in = inv_stdv * (centered_x - 1.0 / 255.0) 80 | cdf_min = approx_standard_normal_cdf(min_in) 81 | log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12)) 82 | log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12)) 83 | cdf_delta = cdf_plus - cdf_min 84 | log_probs = th.where( 85 | x < -0.999, 86 | log_cdf_plus, 87 | th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))), 88 | ) 89 | assert log_probs.shape == x.shape 90 | return log_probs 91 | -------------------------------------------------------------------------------- /scripts/check_sceneflow.py: -------------------------------------------------------------------------------- 1 | import hydra 2 | from omegaconf import DictConfig, OmegaConf 3 | from hydra.core.config_store import ConfigStore 4 | from config import Config, TrainingConfig, setup_hydra_configurations 5 | from data.data_loader import fetch_dataloader 6 | from utils.utils import seed_everything 7 | from accelerate import Accelerator 8 | from accelerate.logging import get_logger 9 | from tqdm import tqdm 10 | from utils.utils import Normalizer 11 | from utils.frame_utils import read_gen 12 | import torch.nn.functional as F 13 | import shutil 14 | 15 | import torch 16 | import numpy as np 17 | from PIL import Image 18 | 19 | import os 20 | logger = get_logger(__name__, log_level="INFO") # multi-process logging 21 | 22 | Accelerator() # hack: enable logging 23 | 24 | @hydra.main(version_base=None, config_path="conf", config_name="config.yaml") 25 | def check(config: Config): 26 | cfg = config.task 27 | logger.info(cfg.train_dataset) 28 | 29 | train_dataloader, val_dataloader_lst = fetch_dataloader(cfg) 30 | logger.info(val_dataloader_lst[0].dataset.__class__.__name__) 31 | 32 | all_dataloaders = [train_dataloader] 33 | all_dataloaders.extend(val_dataloader_lst) 34 | 35 | count = 0 36 | bads = {} 37 | 38 | for i, dataloader in enumerate([train_dataloader]): # all_dataloaders, val_dataloader_lst 39 | pbar = tqdm(total=len(dataloader)) 40 | for j, data in enumerate(dataloader): 41 | # print(data.keys()) 42 | B = data['mask'].shape[0] 43 | for b in range(B): 44 | # rgb = data['normalized_rgb'][b] 45 | index = data['index'][b] 46 | path = data['path'][b] 47 | 48 | raw_left = path.replace("disparity", "raw_cleanpass").replace("pfm", "png").replace("right", "left") 49 | # raw_right= path.replace("disparity", "raw_finalpass").replace("pfm", "png").replace("left", "right") 50 | 51 | raw_left = np.array(read_gen(raw_left)) 52 | gt_left = np.array(read_gen(path)) 53 | 54 | TP = ((raw_left > 0) & (np.abs(gt_left - raw_left) <= 2)).sum() 55 | FP = ((raw_left > 0) & (np.abs(gt_left - raw_left) > 2)).sum() 56 | FN = ((raw_left == 0) & (np.abs(gt_left - raw_left) <= 2)).sum() 57 | precision = TP / (TP + FP) 58 | recall = TP / (TP + FN) # biased 59 | 60 | # raw_right = read_gen(raw_right) 61 | 62 | # if precision < 0.6 and recall < 0.7: 63 | if precision < 0.2: 64 | bads[path] = precision 65 | logger.info(f"bad image {index}: {path}") 66 | 67 | if True: 68 | dump_dir = "./bad_sim" 69 | shutil.copy2(path, f"{dump_dir}/{j}_{b}_disp.pfm") 70 | shutil.copy2(path.replace("disparity", "raw_finalpass").replace("pfm", "png"), f"{dump_dir}/{j}_{b}_raw.png") 71 | shutil.copy2(path.replace("disparity", "raw_cleanpass").replace("pfm", "png"), 
f"{dump_dir}/{j}_{b}_raw_clean.png") 72 | shutil.copy2(path.replace("disparity", "frames_finalpass").replace("pfm", "png"), f"{dump_dir}/{j}_{b}_left.png") 73 | shutil.copy2(path.replace("disparity", "frames_finalpass").replace("pfm", "png").replace("left", "right"), f"{dump_dir}/{j}_{b}_right.png") 74 | 75 | count += 1 76 | 77 | pbar.update(1) 78 | 79 | logger.info(f"how many bad images? {len(bads.items())}") 80 | with open(f'bad_his.txt', 'w') as f: 81 | for path,epe in bads.items(): 82 | f.write(f"{path} {epe}\n") 83 | 84 | if __name__ == "__main__": 85 | seed_everything(0) 86 | setup_hydra_configurations() 87 | check() -------------------------------------------------------------------------------- /isaacsim/config/hssd.yaml: -------------------------------------------------------------------------------- 1 | launch_config: 2 | renderer: PathTracing #RayTracedLighting # 3 | headless: true # false # 4 | 5 | # Controls lightings for rendering images, 6 | # rgb: color image only 7 | # ir: ir depth image only 8 | # rgb+ir: iteratively render rgb and ir images 9 | # na: don't render images with replicators 10 | render_mode: rgb+ir # gt+rgb+ir # rgb+ir # rgb # ir # 11 | 12 | # Controls the simulation mode 13 | # layout_n_capture: init scene and capture images then quit 14 | # load_n_render: TODO load scene and render images 15 | # simulate: normal simulation mode 16 | 17 | sim_mode: load_n_render # layout_n_capture # simulate # 18 | 19 | resume_scene: 20 | 21 | robot: 22 | name: "franka.yml" #"galbot_zero_lefthand.yml" # 23 | init_pose: [-0.2, 0., 0., 1, 0, 0, 0] #[0.0, 0.5, 0.0] # usually look at, , 0.707, 0.0, 0.0, -0.707 24 | 25 | scene: empty #hssd # 26 | layout: part # dreds # graspnet # 27 | 28 | dreds: 29 | cad_model_dir: data/dreds 30 | layout_offset: [0.2, 0.0, 0.0] 31 | 32 | graspnet: 33 | root_path: data/graspnet 34 | layout_offset: [0.5, 0.2, 0.0] 35 | 36 | hssd: 37 | data_dir: data/hssd/scenes 38 | name: "107734119_175999932" 39 | default_prim_path: "/World/scene" 40 | scale: 1 41 | hide_ceilings: true 42 | hide_walls: false 43 | center_offset: [0.0, 0.0, 0.0] # [0.0, 0.0, 0.0] 44 | surface: 45 | category: teatable 46 | prim_path: /World/furniture/node_b914fb6bcc81386bfa1ff7a3eb8412b7ac581ff 47 | stt: false # specular or transparent, translucent surface 48 | 49 | seed: -1 # set to >= 0 to disable domain randomization 50 | rt_subframes: 8 51 | num_frames_per_surface: 3 52 | visualize: false 53 | render_after_quiet: true 54 | shadow: off 55 | 56 | viewport: 57 | record: false 58 | 59 | world: 60 | physics_dt: 0.016666667 # 0.01 # 61 | rendering_dt: 0.016666667 #0.005 # 62 | 63 | depth_sensor: 64 | name: realsense 65 | clipping_range: [0.1, 5] 66 | focal_length: 1.88 67 | # horizontal_aperture: 26.42033 68 | # vertical_aperture: 14.86144 69 | fov: 71.28 70 | resolution: [640, 360] # [1280, 720] # 71 | placement: # baseline = 0.055 72 | rgb_to_left_ir: 0.0 # 0.015 # 73 | rgb_to_right_ir: 0.055 # 0.070 # 74 | rgb_to_projector: 0.0410 # 0.0425 # 75 | projector: 76 | intensity: 5 77 | exposure: -1.0 78 | 79 | replicator: std_obj # graspnet # glass, articulated_obj 80 | domain_randomization: true 81 | 82 | lighting: 83 | light_type: [Sphere] # Rect # Disk # disk_light # 84 | range: #@see https://zh.wikipedia.org/zh-cn/%E7%90%83%E5%BA%A7%E6%A8%99%E7%B3%BB 85 | theta: [30, 90] 86 | phi: [-60, 60] 87 | radius: [1, 2] 88 | 89 | Distant_light: 90 | intensity: 0 91 | 92 | Sphere_light: 93 | radius: [1, 1] #[0.5, 1.0] 94 | height: [2.5, 2.5] #[1.5, 2] 95 | intensity: 96 | "on": [10000, 10000] 
# [7500, 11000] 97 | "off": [500, 500] # [200, 400] 98 | treatAsPoint: true 99 | 100 | Disk_light: 101 | radius: [1,1] # [0.5, 1.0] 102 | height: [1.5,1.5] #[1.5, 2] 103 | intensity: 104 | "on": [10000, 10000] #[6000, 9000] 105 | "off": [200, 400] 106 | 107 | Rect_light: 108 | width: [100, 100] 109 | height: [100, 100] 110 | intensity: 111 | "on": [50000, 50000] 112 | "off": [2000, 2000] 113 | 114 | specular: 115 | reflection_roughness_constant: [0.05, 0.2] # < 0.4 116 | metallic_constant: [0.8, 0.99] # > 0.9 117 | reflection_color: [0.0, 1.0] 118 | 119 | transparent: 120 | roughness_constant: [0.1, 0.1] # 0.05 121 | cutout_opacity: [0.1, 0.2] # [0.6, 0.7] # [0.2, 0.3] # < 0.4 122 | thin_walled: false #true 123 | glass_ior: [1.4, 1.6] # ~3, default: 1.491 124 | frosting_roughness: [0.2, 0.3] # < 0.1, grayscale only 125 | 126 | glass: 127 | base_alpha: [0.0, 1.0] 128 | ior: [1.4, 1.6] 129 | metallic_factor: [0.0, 0.35] 130 | roughness_factor: [0.0, 0.1] 131 | 132 | scope_name: /MyScope 133 | writer: on # off # BasicWriter 134 | writer_config: 135 | output_dir: output_ir 136 | start_sequence_id: -1 # -1 means continue from the existing frames, otherwise start with specified frame id 137 | rgb: true 138 | disparity: true 139 | normals: true # TODO 140 | # disparity: true 141 | # bounding_box_2d_tight: false 142 | semantic_segmentation: true 143 | distance_to_image_plane: true 144 | pointcloud: false 145 | # bounding_box_3d: false 146 | # occlusion: false 147 | clear_previous_semantics: true 148 | 149 | hydra: 150 | run: 151 | dir: _outputs/${hydra.job.name} 152 | job: 153 | chdir: true 154 | 155 | -------------------------------------------------------------------------------- /data/dataset.py: -------------------------------------------------------------------------------- 1 | from torchvision.transforms import RandomResizedCrop, InterpolationMode 2 | import torchvision.transforms.functional as TF 3 | import torch 4 | import functools 5 | 6 | class WarpDataset(torch.utils.data.Dataset): 7 | def __init__(self, image_size, augment): 8 | self.augment = augment 9 | self.rgb_list = [] 10 | self.depth_list = [] 11 | self.lr_list = [] 12 | self.mask_list = [] 13 | 14 | if self.augment is None: 15 | self.augment = dict() 16 | if type(image_size) == int: 17 | self.image_size = (image_size, image_size) # H x W 18 | elif type(image_size) == tuple: 19 | self.image_size = image_size 20 | else: 21 | raise ValueError("image_size must be int or tuple") 22 | return 23 | 24 | def data_aug(self, rgb, depth, mask, img1=None, img2=None, raw_depth=None): 25 | # random crop and resize. 
26 | safe_apply = lambda func, x: func(x) if x is not None else None 27 | if 'resizedcrop' in self.augment.keys(): 28 | param = self.augment['resizedcrop'] 29 | i, j, h, w = RandomResizedCrop.get_params(rgb, scale=param['scale'], ratio=param['ratio']) 30 | resized_crop = lambda i, j, h, w, size, interp, x: TF.resized_crop(x, i, j, h, w, size=size, interpolation=interp) 31 | resized_crop_fn = functools.partial(resized_crop, i,j,h,w,self.image_size, InterpolationMode.NEAREST) 32 | rgb, mask, depth, img1, img2 = map(lambda x: safe_apply(resized_crop_fn, x), [rgb, mask, depth, img1, img2]) 33 | 34 | """ rgb = TF.resized_crop(rgb, i, j, h, w, size=self.image_size, interpolation=InterpolationMode.NEAREST) 35 | mask = TF.resized_crop(mask, i, j, h, w, size=self.image_size, interpolation=InterpolationMode.NEAREST) 36 | depth = TF.resized_crop(depth, i, j, h, w, size=self.image_size, interpolation=InterpolationMode.NEAREST) 37 | if img1 is not None: 38 | img1 = TF.resized_crop(img1, i, j, h, w, size=self.image_size, interpolation=InterpolationMode.NEAREST) 39 | img2 = TF.resized_crop(img2, i, j, h, w, size=self.image_size, interpolation=InterpolationMode.NEAREST) """ 40 | else: # only resize when eval and test 41 | resize = lambda size, interp, x: TF.resize(x, size=size, interpolation=interp) 42 | resize_fn = functools.partial(resize, self.image_size, InterpolationMode.NEAREST) 43 | rgb, mask, depth, img1, img2 = map(lambda x: safe_apply(resize_fn, x), [rgb, mask, depth, img1, img2]) 44 | 45 | # rgb = TF.resize(rgb, size=self.image_size, interpolation=InterpolationMode.NEAREST) 46 | # mask = TF.resize(mask, size=self.image_size, interpolation=InterpolationMode.NEAREST) 47 | # depth = TF.resize(depth, size=self.image_size, interpolation=InterpolationMode.NEAREST) 48 | # if img1 is not None: 49 | # img1 = TF.resize(img1, size=self.image_size, interpolation=InterpolationMode.NEAREST) 50 | # img2 = TF.resize(img2, size=self.image_size, interpolation=InterpolationMode.NEAREST) 51 | 52 | # Random hflip 53 | if 'hflip' in self.augment.keys(): 54 | param = self.augment['hflip'] 55 | if torch.rand(1) < 0.5: #param['prob']: 56 | rgb, mask, depth, img1, img2 = map(lambda x: safe_apply(TF.hflip, x), [rgb, mask, depth, img1, img2]) 57 | """ rgb = TF.hflip(rgb) 58 | mask = TF.hflip(mask) 59 | depth = TF.hflip(depth) 60 | if img1 is not None: 61 | img1 = TF.hflip(img1) 62 | img2 = TF.hflip(img2) """ 63 | 64 | # TODO add color augmentation such as changing the lighting 65 | 66 | if img1 is None: 67 | return rgb, depth, mask 68 | else: 69 | return rgb, depth, mask, img1, img2 70 | 71 | 72 | def normalize_depth(self, depth, mask, low_p=0.00, high_p=1.00): 73 | """ low_p, high_p: low and high percentile to normalize the depth""" 74 | mask = mask.bool() 75 | masked_depth = depth[mask] 76 | low, high = torch.quantile(masked_depth, torch.tensor((low_p, high_p))) 77 | 78 | depth = (depth - low) / (high - low) 79 | depth = (depth - 0.5) * 2 # [0,1] -> [-1, 1] 80 | return depth 81 | 82 | def normalize_rgb(self, rgb): 83 | return (rgb / 255 - 0.5) * 2 # [0,1] -> [-1, 1] 84 | 85 | def __mul__(self, v): 86 | self.rgb_list = v * self.rgb_list 87 | self.depth_list = v * self.depth_list 88 | self.lr_list = v * self.lr_list 89 | self.mask_list = v * self.mask_list 90 | return self 91 | 92 | def __len__(self): 93 | return len(self.rgb_list) 94 | -------------------------------------------------------------------------------- /core/praser.py: -------------------------------------------------------------------------------- 1 | 
import os 2 | from collections import OrderedDict 3 | import json 4 | from pathlib import Path 5 | from datetime import datetime 6 | from functools import partial 7 | import importlib 8 | from types import FunctionType 9 | import shutil 10 | def init_obj(opt, logger, *args, default_file_name='default file', given_module=None, init_type='Network', **modify_kwargs): 11 | """ 12 | finds a function handle with the name given as 'name' in config, 13 | and returns the instance initialized with corresponding args. 14 | """ 15 | if opt is None or len(opt)<1: 16 | logger.info('Option is None when initialize {}'.format(init_type)) 17 | return None 18 | 19 | ''' default format is dict with name key ''' 20 | if isinstance(opt, str): 21 | opt = {'name': opt} 22 | logger.warning('Config is a str, converts to a dict {}'.format(opt)) 23 | 24 | name = opt['name'] 25 | ''' name can be list, indicates the file and class name of function ''' 26 | if isinstance(name, list): 27 | file_name, class_name = name[0], name[1] 28 | else: 29 | file_name, class_name = default_file_name, name 30 | try: 31 | if given_module is not None: 32 | module = given_module 33 | else: 34 | module = importlib.import_module(file_name) 35 | 36 | attr = getattr(module, class_name) 37 | kwargs = opt.get('args', {}) 38 | kwargs.update(modify_kwargs) 39 | ''' import class or function with args ''' 40 | if isinstance(attr, type): 41 | ret = attr(*args, **kwargs) 42 | ret.__name__ = ret.__class__.__name__ 43 | elif isinstance(attr, FunctionType): 44 | ret = partial(attr, *args, **kwargs) 45 | ret.__name__ = attr.__name__ 46 | # ret = attr 47 | logger.info('{} [{:s}() form {:s}] is created.'.format(init_type, class_name, file_name)) 48 | except: 49 | raise NotImplementedError('{} [{:s}() form {:s}] not recognized.'.format(init_type, class_name, file_name)) 50 | return ret 51 | 52 | 53 | def mkdirs(paths): 54 | if isinstance(paths, str): 55 | os.makedirs(paths, exist_ok=True) 56 | else: 57 | for path in paths: 58 | os.makedirs(path, exist_ok=True) 59 | 60 | def get_timestamp(): 61 | return datetime.now().strftime('%y%m%d_%H%M%S') 62 | 63 | 64 | def write_json(content, fname): 65 | fname = Path(fname) 66 | with fname.open('wt') as handle: 67 | json.dump(content, handle, indent=4, sort_keys=False) 68 | 69 | class NoneDict(dict): 70 | def __missing__(self, key): 71 | return None 72 | 73 | def dict_to_nonedict(opt): 74 | """ convert to NoneDict, which return None for missing key. 
""" 75 | if isinstance(opt, dict): 76 | new_opt = dict() 77 | for key, sub_opt in opt.items(): 78 | new_opt[key] = dict_to_nonedict(sub_opt) 79 | return NoneDict(**new_opt) 80 | elif isinstance(opt, list): 81 | return [dict_to_nonedict(sub_opt) for sub_opt in opt] 82 | else: 83 | return opt 84 | 85 | def dict2str(opt, indent_l=1): 86 | """ dict to string for logger """ 87 | msg = '' 88 | for k, v in opt.items(): 89 | if isinstance(v, dict): 90 | msg += ' ' * (indent_l * 2) + k + ':[\n' 91 | msg += dict2str(v, indent_l + 1) 92 | msg += ' ' * (indent_l * 2) + ']\n' 93 | else: 94 | msg += ' ' * (indent_l * 2) + k + ': ' + str(v) + '\n' 95 | return msg 96 | 97 | def parse(args): 98 | json_str = '' 99 | with open(args.config, 'r') as f: 100 | for line in f: 101 | line = line.split('//')[0] + '\n' 102 | json_str += line 103 | opt = json.loads(json_str, object_pairs_hook=OrderedDict) 104 | 105 | ''' replace the config context using args ''' 106 | opt['phase'] = args.phase 107 | if args.gpu_ids is not None: 108 | opt['gpu_ids'] = [int(id) for id in args.gpu_ids.split(',')] 109 | if args.batch is not None: 110 | opt['datasets'][opt['phase']]['dataloader']['args']['batch_size'] = args.batch 111 | 112 | ''' set cuda environment ''' 113 | if len(opt['gpu_ids']) > 1: 114 | opt['distributed'] = True 115 | else: 116 | opt['distributed'] = False 117 | 118 | ''' update name ''' 119 | if args.debug: 120 | opt['name'] = 'debug_{}'.format(opt['name']) 121 | elif opt['finetune_norm']: 122 | opt['name'] = 'finetune_{}'.format(opt['name']) 123 | else: 124 | opt['name'] = '{}_{}'.format(opt['phase'], opt['name']) 125 | 126 | ''' set log directory ''' 127 | experiments_root = os.path.join(opt['path']['base_dir'], '{}_{}'.format(opt['name'], get_timestamp())) 128 | mkdirs(experiments_root) 129 | 130 | ''' save json ''' 131 | write_json(opt, '{}/config.json'.format(experiments_root)) 132 | 133 | ''' change folder relative hierarchy ''' 134 | opt['path']['experiments_root'] = experiments_root 135 | for key, path in opt['path'].items(): 136 | if 'resume' not in key and 'base' not in key and 'root' not in key: 137 | opt['path'][key] = os.path.join(experiments_root, path) 138 | mkdirs(opt['path'][key]) 139 | 140 | ''' debug mode ''' 141 | if 'debug' in opt['name']: 142 | opt['train'].update(opt['debug']) 143 | 144 | ''' code backup ''' 145 | for name in os.listdir('.'): 146 | if name in ['config', 'models', 'core', 'slurm', 'data']: 147 | shutil.copytree(name, os.path.join(opt['path']['code'], name), ignore=shutil.ignore_patterns("*.pyc", "__pycache__")) 148 | if '.py' in name or '.sh' in name: 149 | shutil.copy(name, opt['path']['code']) 150 | return dict_to_nonedict(opt) 151 | 152 | 153 | 154 | 155 | 156 | -------------------------------------------------------------------------------- /core/resample.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | import numpy as np 4 | import torch as th 5 | import torch.distributed as dist 6 | 7 | 8 | def create_named_schedule_sampler(name, T, *args): 9 | """ 10 | Create a ScheduleSampler from a library of pre-defined samplers. 11 | 12 | :param name: the name of the sampler. 13 | :param diffusion: the diffusion object to sample for. 
14 | """ 15 | if name == "uniform": 16 | return UniformSampler(T) 17 | elif name == "snr": 18 | return SNRSampler(T, *args) 19 | elif name == "loss-second-moment": 20 | return LossSecondMomentResampler(T) 21 | else: 22 | raise NotImplementedError(f"unknown schedule sampler: {name}") 23 | 24 | 25 | class ScheduleSampler(ABC): 26 | 27 | """ 28 | A distribution over timesteps in the diffusion process, intended to reduce 29 | variance of the objective. 30 | 31 | By default, samplers perform unbiased importance sampling, in which the 32 | objective's mean is unchanged. 33 | However, subclasses may override sample() to change how the resampled 34 | terms are reweighted, allowing for actual changes in the objective. 35 | """ 36 | 37 | @abstractmethod 38 | def weights(self): 39 | """ 40 | Get a numpy array of weights, one per diffusion step. 41 | 42 | The weights needn't be normalized, but must be positive. 43 | """ 44 | 45 | def sample(self, batch_size, device): 46 | """ 47 | Importance-sample timesteps for a batch. 48 | 49 | :param batch_size: the number of timesteps. 50 | :param device: the torch device to save to. 51 | :return: a tuple (timesteps, weights): 52 | - timesteps: a tensor of timestep indices. 53 | - weights: a tensor of weights to scale the resulting losses. 54 | """ 55 | w = self.weights() 56 | p = w / np.sum(w) 57 | indices_np = np.random.choice(len(p), size=(batch_size,), p=p) 58 | indices = th.from_numpy(indices_np).long().to(device) 59 | weights_np = 1 / (len(p) * p[indices_np]) 60 | weights = th.from_numpy(weights_np).float().to(device) 61 | return indices, weights 62 | 63 | 64 | class UniformSampler(ScheduleSampler): 65 | def __init__(self, T): 66 | self.T = T 67 | self._weights = np.ones([T]) 68 | 69 | def weights(self): 70 | return self._weights 71 | 72 | class SNRSampler(ScheduleSampler): 73 | def __init__(self, snr): 74 | self._snr = snr 75 | 76 | def weights(self): 77 | return self._snr 78 | 79 | class LossAwareSampler(ScheduleSampler): 80 | def update_with_local_losses(self, local_ts, local_losses): 81 | """ 82 | Update the reweighting using losses from a model. 83 | 84 | Call this method from each rank with a batch of timesteps and the 85 | corresponding losses for each of those timesteps. 86 | This method will perform synchronization to make sure all of the ranks 87 | maintain the exact same reweighting. 88 | 89 | :param local_ts: an integer Tensor of timesteps. 90 | :param local_losses: a 1D Tensor of losses. 91 | """ 92 | batch_sizes = [ 93 | th.tensor([0], dtype=th.int32, device=local_ts.device) 94 | for _ in range(dist.get_world_size()) 95 | ] 96 | dist.all_gather( 97 | batch_sizes, 98 | th.tensor([len(local_ts)], dtype=th.int32, device=local_ts.device), 99 | ) 100 | 101 | # Pad all_gather batches to be the maximum batch size. 102 | batch_sizes = [x.item() for x in batch_sizes] 103 | max_bs = max(batch_sizes) 104 | 105 | timestep_batches = [th.zeros(max_bs).to(local_ts) for bs in batch_sizes] 106 | loss_batches = [th.zeros(max_bs).to(local_losses) for bs in batch_sizes] 107 | dist.all_gather(timestep_batches, local_ts) 108 | dist.all_gather(loss_batches, local_losses) 109 | timesteps = [ 110 | x.item() for y, bs in zip(timestep_batches, batch_sizes) for x in y[:bs] 111 | ] 112 | losses = [x.item() for y, bs in zip(loss_batches, batch_sizes) for x in y[:bs]] 113 | self.update_with_all_losses(timesteps, losses) 114 | 115 | @abstractmethod 116 | def update_with_all_losses(self, ts, losses): 117 | """ 118 | Update the reweighting using losses from a model. 
119 | 120 | Sub-classes should override this method to update the reweighting 121 | using losses from the model. 122 | 123 | This method directly updates the reweighting without synchronizing 124 | between workers. It is called by update_with_local_losses from all 125 | ranks with identical arguments. Thus, it should have deterministic 126 | behavior to maintain state across workers. 127 | 128 | :param ts: a list of int timesteps. 129 | :param losses: a list of float losses, one per timestep. 130 | """ 131 | 132 | 133 | class LossSecondMomentResampler(LossAwareSampler): 134 | def __init__(self, T, history_per_term=10, uniform_prob=0.001): 135 | self.T = T 136 | self.history_per_term = history_per_term 137 | self.uniform_prob = uniform_prob 138 | self._loss_history = np.zeros( 139 | [T, history_per_term], dtype=np.float64 140 | ) 141 | self._loss_counts = np.zeros([T], dtype=np.int32) 142 | 143 | def weights(self): 144 | if not self._warmed_up(): 145 | return np.ones([self.T], dtype=np.float64) 146 | weights = np.sqrt(np.mean(self._loss_history ** 2, axis=-1)) 147 | weights /= np.sum(weights) 148 | weights *= 1 - self.uniform_prob 149 | weights += self.uniform_prob / len(weights) 150 | return weights 151 | 152 | def update_with_all_losses(self, ts, losses): 153 | for t, loss in zip(ts, losses): 154 | if self._loss_counts[t] == self.history_per_term: 155 | # Shift out the oldest loss term. 156 | self._loss_history[t, :-1] = self._loss_history[t, 1:] 157 | self._loss_history[t, -1] = loss 158 | else: 159 | self._loss_history[t, self._loss_counts[t]] = loss 160 | self._loss_counts[t] += 1 161 | 162 | def _warmed_up(self): 163 | return (self._loss_counts == self.history_per_term).all() 164 | -------------------------------------------------------------------------------- /utils/ransac.py: -------------------------------------------------------------------------------- 1 | from copy import copy 2 | import numpy as np 3 | from numpy.random import default_rng 4 | rng = default_rng() 5 | import torch 6 | import time 7 | from utils.utils import compute_scale_and_shift 8 | 9 | def square_error_loss(y_true, y_pred): 10 | return (y_true - y_pred) ** 2 11 | 12 | def mean_square_error(y_true, y_pred): 13 | return torch.sum(square_error_loss(y_true, y_pred)) / y_true.shape[0] 14 | 15 | def mean_absolute_error(y_true, y_pred): 16 | # return np.abs(y_true - y_pred).mean() 17 | return torch.abs(y_true - y_pred).mean(1) 18 | 19 | def mean_accuracy_inverse(y_true, y_pred): 20 | thresh = torch.maximum(y_true / y_pred, y_pred / y_true) 21 | return 1 / torch.mean((thresh < 1.25).float()) 22 | 23 | 24 | class ScaleShiftEstimator: 25 | def __init__(self): 26 | self.params = (1, 0) # s,t 27 | 28 | def fit(self, X: np.ndarray, Y: np.ndarray): 29 | """ X & Y: Nx1 """ 30 | start = time.time() 31 | self.params = compute_scale_and_shift(X, Y) 32 | end = time.time() 33 | print(f"ssi: {end - start:.5f}") 34 | return self 35 | 36 | def predict(self, X: np.ndarray): 37 | return X * self.params[0] + self.params[1] 38 | 39 | class RANSAC: 40 | def __init__(self, n=0.1, k=100, t=0.05, d=0.5, model=ScaleShiftEstimator(), loss=square_error_loss, metric=mean_accuracy_inverse): 41 | self.n = n # `n`: (percent) Minimum number of data points to estimate parameters 42 | self.k = k # `k`: Maximum iterations allowed 43 | self.t = t # `t`: Threshold value to determine if points are fit well 44 | self.d = d # `d`: (percent)Number of close data points required to assert model fits well 45 | self.model = model # `model`: class implementing 
`fit` and `predict` 46 | self.loss = loss # `loss`: function of `y_true` and `y_pred` that returns a vector 47 | self.metric = metric # `metric`: function of `y_true` and `y_pred` and returns a float 48 | self.best_fit = None 49 | self.best_error = None 50 | 51 | def fit(self, X, Y, mask): 52 | """ X: source 53 | Y: target 54 | """ 55 | assert X.shape == Y.shape == mask.shape 56 | B, HW = X.shape 57 | 58 | X = X.clone() 59 | Y = Y.clone() 60 | mask = mask.clone() 61 | N = int(self.n * HW) 62 | T = self.t 63 | # T = self.t * torch.abs(Y[mask.bool()]).mean() 64 | D = int(self.d * HW) 65 | 66 | assert D < HW and N < HW, "N, D must be less than HW" 67 | 68 | self.best_num_inlier = torch.zeros((B, 1), device=X.device).to(torch.int32) 69 | self.best_mask_inlier = torch.zeros((B, HW), device=X.device).to(torch.bool) 70 | self.best_error = torch.full((B, 1), torch.inf, device=X.device) 71 | self.best_fit = torch.empty((B, 2), device=X.device) 72 | self.best_fit[:,0] = 1.0 # init s=1, t=0 73 | self.best_fit[:,1] = 0.0 74 | 75 | for _ in range(self.k): 76 | ids = torch.randperm(HW, device=X.device).repeat(B, 1) # torch.arange(HW, device=X.device).repeat(B, 1) # 77 | maybe_inliers = ids[:, :N] 78 | maybe_model = compute_scale_and_shift( 79 | torch.gather(X, 1, maybe_inliers), 80 | torch.gather(Y, 1, maybe_inliers), 81 | torch.gather(mask, 1, maybe_inliers)) 82 | 83 | X_ = X * maybe_model[:, 0:1] + maybe_model[:,1:] 84 | threshold = torch.where(self.loss(Y, X_,) < T, 1, 0).to(torch.bool) & mask.bool() 85 | 86 | better_model = compute_scale_and_shift(X, Y, threshold) 87 | X__ = X * better_model[:, 0:1] + better_model[:, 1:] 88 | this_error = self.metric(Y, X__)[...,None] 89 | this_num_inlier = torch.sum(threshold, 1)[...,None] 90 | select = (this_num_inlier > D) & (this_error < self.best_error) 91 | 92 | self.best_num_inlier = torch.where(select, this_num_inlier, self.best_num_inlier) 93 | self.best_mask_inlier = torch.where(select, threshold, self.best_mask_inlier) 94 | self.best_fit = torch.where(select, better_model, self.best_fit) 95 | self.best_error = torch.where(select, this_error, self.best_error) 96 | return self 97 | 98 | def predict(self, X): 99 | return self.best_fit.predict(X) 100 | 101 | class LinearRegressor: 102 | def __init__(self): 103 | self.params = None 104 | 105 | def fit(self, X: np.ndarray, y: np.ndarray): 106 | r, _ = X.shape 107 | X = np.hstack([np.ones((r, 1)), X]) 108 | self.params = np.linalg.inv(X.T @ X) @ X.T @ y 109 | return self 110 | 111 | def predict(self, X: np.ndarray): 112 | r, _ = X.shape 113 | X = np.hstack([np.ones((r, 1)), X]) 114 | return X @ self.params 115 | 116 | 117 | if __name__ == "__main__": 118 | 119 | regressor = RANSAC(model=LinearRegressor(), loss=square_error_loss, metric=mean_square_error) 120 | 121 | X = np.array([-0.848,-0.800,-0.704,-0.632,-0.488,-0.472,-0.368,-0.336,-0.280,-0.200,-0.00800,-0.0840,0.0240,0.100,0.124,0.148,0.232,0.236,0.324,0.356,0.368,0.440,0.512,0.548,0.660,0.640,0.712,0.752,0.776,0.880,0.920,0.944,-0.108,-0.168,-0.720,-0.784,-0.224,-0.604,-0.740,-0.0440,0.388,-0.0200,0.752,0.416,-0.0800,-0.348,0.988,0.776,0.680,0.880,-0.816,-0.424,-0.932,0.272,-0.556,-0.568,-0.600,-0.716,-0.796,-0.880,-0.972,-0.916,0.816,0.892,0.956,0.980,0.988,0.992,0.00400]).reshape(-1,1) 122 | y = 
np.array([-0.917,-0.833,-0.801,-0.665,-0.605,-0.545,-0.509,-0.433,-0.397,-0.281,-0.205,-0.169,-0.0531,-0.0651,0.0349,0.0829,0.0589,0.175,0.179,0.191,0.259,0.287,0.359,0.395,0.483,0.539,0.543,0.603,0.667,0.679,0.751,0.803,-0.265,-0.341,0.111,-0.113,0.547,0.791,0.551,0.347,0.975,0.943,-0.249,-0.769,-0.625,-0.861,-0.749,-0.945,-0.493,0.163,-0.469,0.0669,0.891,0.623,-0.609,-0.677,-0.721,-0.745,-0.885,-0.897,-0.969,-0.949,0.707,0.783,0.859,0.979,0.811,0.891,-0.137]).reshape(-1,1) 123 | 124 | regressor.fit(X, y) 125 | 126 | import matplotlib.pyplot as plt 127 | plt.style.use("seaborn-darkgrid") 128 | fig, ax = plt.subplots(1, 1) 129 | ax.set_box_aspect(1) 130 | 131 | plt.scatter(X, y) 132 | 133 | line = np.linspace(-1, 1, num=100).reshape(-1, 1) 134 | plt.plot(line, regressor.predict(line), c="peru") 135 | # plt.show() 136 | plt.savefig("ransac.png") 137 | plt.close() -------------------------------------------------------------------------------- /datasets/README.md: -------------------------------------------------------------------------------- 1 | link all the datasets here, example folder structures: 2 | 3 | ``` 4 | datasets 5 | ├── clearpose -> /raid/songlin/Data/clearpose 6 | │   ├── clearpose_downsample_100 7 | │   │   ├── downsample.py 8 | │   │   ├── model 9 | │   │   ├── set1 10 | │   │   ├── set2 11 | │   │   ├── set3 12 | │   │   ├── set4 13 | │   │   ├── set5 14 | │   │   ├── set6 15 | │   │   ├── set7 16 | │   │   ├── set8 17 | │   │   └── set9 18 | │   ├── metadata 19 | │   │   ├── set1 20 | │   │   ├── set2 21 | │   │   ├── set3 22 | │   │   ├── set4 23 | │   │   ├── set5 24 | │   │   ├── set6 25 | │   │   ├── set7 26 | │   │   ├── set8 27 | │   │   └── set9 28 | │   ├── model 29 | │   │   ├── 003_cracker_box 30 | │   │   ├── 005_tomato_soup_can 31 | │   │   ├── 006_mustard_bottle 32 | │   │   ├── 007_tuna_fish_can 33 | │   │   ├── 009_gelatin_box 34 | │   │   ├── BBQSauce 35 | │   │   ├── beaker_1 36 | │   │   ├── bottle_1 37 | │   │   ├── bottle_2 38 | │   │   ├── bottle_3 39 | │   │   ├── bottle_4 40 | │   │   ├── bottle_5 41 | │   │   ├── bowl_1 42 | │   │   ├── bowl_2 43 | │   │   ├── bowl_3 44 | │   │   ├── bowl_4 45 | │   │   ├── bowl_5 46 | │   │   ├── bowl_6 47 | │   │   ├── container_1 48 | │   │   ├── container_2 49 | │   │   ├── container_3 50 | │   │   ├── container_4 51 | │   │   ├── container_5 52 | │   │   ├── create_keypoints.py 53 | │   │   ├── dropper_1 54 | │   │   ├── dropper_2 55 | │   │   ├── flask_1 56 | │   │   ├── fork_1 57 | │   │   ├── funnel_1 58 | │   │   ├── graduated_cylinder_1 59 | │   │   ├── graduated_cylinder_2 60 | │   │   ├── knife_1 61 | │   │   ├── knife_2 62 | │   │   ├── Mayo 63 | │   │   ├── mug_1 64 | │   │   ├── mug_2 65 | │   │   ├── OrangeJuice 66 | │   │   ├── pan_1 67 | │   │   ├── pan_2 68 | │   │   ├── pan_3 69 | │   │   ├── pitcher_1 70 | │   │   ├── plate_1 71 | │   │   ├── plate_2 72 | │   │   ├── reagent_bottle_1 73 | │   │   ├── reagent_bottle_2 74 | │   │   ├── round_table 75 | │   │   ├── spoon_1 76 | │   │   ├── spoon_2 77 | │   │   ├── stick_1 78 | │   │   ├── syringe_1 79 | │   │   ├── trans_models.blend 80 | │   │   ├── trans_models_keypoint.blend 81 | │   │   ├── trans_models_keypoint.blend1 82 | │   │   ├── trans_models_keypoint (copy).blend 83 | │   │   ├── trans_models_kp.blend 84 | │   │   ├── water_cup_1 85 | │   │   ├── water_cup_10 86 | │   │   ├── water_cup_11 87 | │   │   ├── water_cup_12 88 | │   │   ├── water_cup_13 89 | │   │   ├── water_cup_14 90 | │   │   ├── water_cup_2 91 | │   │   ├── water_cup_3 92 
| │   │   ├── water_cup_4 93 | │   │   ├── water_cup_5 94 | │   │   ├── water_cup_6 95 | │   │   ├── water_cup_7 96 | │   │   ├── water_cup_8 97 | │   │   ├── water_cup_9 98 | │   │   ├── wine_cup_1 99 | │   │   ├── wine_cup_2 100 | │   │   ├── wine_cup_3 101 | │   │   ├── wine_cup_4 102 | │   │   ├── wine_cup_5 103 | │   │   ├── wine_cup_6 104 | │   │   ├── wine_cup_7 105 | │   │   ├── wine_cup_8 106 | │   │   └── wine_cup_9 107 | │   ├── set1 108 | │   │   ├── scene1 109 | │   │   ├── scene2 110 | │   │   ├── scene3 111 | │   │   ├── scene4 112 | │   │   └── scene5 113 | │   ├── set2 114 | │   │   ├── scene1 115 | │   │   ├── scene3 116 | │   │   ├── scene4 117 | │   │   ├── scene5 118 | │   │   └── scene6 119 | │   ├── set3 120 | │   │   ├── scene1 121 | │   │   ├── scene11 122 | │   │   ├── scene3 123 | │   │   ├── scene4 124 | │   │   └── scene8 125 | │   ├── set4 126 | │   │   ├── scene1 127 | │   │   ├── scene2 128 | │   │   ├── scene3 129 | │   │   ├── scene4 130 | │   │   ├── scene5 131 | │   │   └── scene6 132 | │   ├── set5 133 | │   │   ├── scene1 134 | │   │   ├── scene2 135 | │   │   ├── scene3 136 | │   │   ├── scene4 137 | │   │   ├── scene5 138 | │   │   └── scene6 139 | │   ├── set6 140 | │   │   ├── scene1 141 | │   │   ├── scene2 142 | │   │   ├── scene3 143 | │   │   ├── scene4 144 | │   │   ├── scene5 145 | │   │   └── scene6 146 | │   ├── set7 147 | │   │   ├── scene1 148 | │   │   ├── scene2 149 | │   │   ├── scene3 150 | │   │   ├── scene4 151 | │   │   ├── scene5 152 | │   │   └── scene6 153 | │   ├── set8 154 | │   │   ├── scene1 155 | │   │   ├── scene2 156 | │   │   ├── scene3 157 | │   │   ├── scene4 158 | │   │   ├── scene5 159 | │   │   └── scene6 160 | │   └── set9 161 | │   ├── scene10 162 | │   ├── scene11 163 | │   ├── scene12 164 | │   ├── scene7 165 | │   ├── scene8 166 | │   └── scene9 167 | ├── DREDS 168 | │   ├── test -> /raid/songlin/Data/DREDS_ECCV2022/DREDS-CatKnown/test 169 | │   │   └── shapenet_generate_1216_val_novel 170 | │   ├── test_std_catknown -> /raid/songlin/Data/DREDS_ECCV2022/STD-CatKnown 171 | │   │   ├── test_0 172 | │   │   ├── test_14-1 173 | │   │   ├── test_18-1 174 | │   │   ├── test_19 175 | │   │   ├── test_20-3 176 | │   │   ├── test_3-2 177 | │   │   ├── test_4-2 178 | │   │   ├── test_5-2 179 | │   │   ├── test_6-1 180 | │   │   ├── test_7-1 181 | │   │   ├── test_8 182 | │   │   ├── test_9-2 183 | │   │   ├── train_0-5 184 | │   │   ├── train_10-1 185 | │   │   ├── train_12 186 | │   │   ├── train_1-4 187 | │   │   ├── train_14-1 188 | │   │   ├── train_16-2 189 | │   │   ├── train_17-1 190 | │   │   ├── train_19-1 191 | │   │   ├── train_3 192 | │   │   ├── train_4-1 193 | │   │   ├── train_7-1 194 | │   │   ├── train_8 195 | │   │   └── train_9-3 196 | │   ├── test_std_catnovel -> /raid/songlin/Data/DREDS_ECCV2022/STD-CatNovel 197 | │   │   └── real_data_novel 198 | │   ├── train -> /raid/songlin/Data/DREDS_ECCV2022/DREDS-CatKnown/train 199 | │   │   ├── part0 200 | │   │   ├── part1 201 | │   │   ├── part2 202 | │   │   ├── part3 203 | │   │   └── part4 204 | │   └── val -> /raid/songlin/Data/DREDS_ECCV2022/DREDS-CatKnown/val 205 | │   └── shapenet_generate_1216 206 | ├── HISS 207 | │   ├── train -> /raid/songlin/Data/hssd-isaac-sim-100k 208 | │   │   ├── 102344049 209 | │   │   ├── 102344280 210 | │   │   ├── 103997586_171030666 211 | │   │   ├── 107734119_175999932 212 | │   │   └── bad_his.txt 213 | │   └── val -> /raid/songlin/Data/hssd-isaac-sim-300hq 214 | │   ├── 102344049 215 | │   ├── 102344280 216 | │   ├── 
103997586_171030666 217 | │   ├── 107734119_175999932 218 | │   ├── bad_his.txt 219 | │   └── simulation2 220 | ├── README.md 221 | ├── Real 222 | │   └── xiaomeng 223 | │   ├── 0000_depth.png 224 | │   ├── 0000_ir_l.png 225 | │   ├── 0000_ir_r.png 226 | │   ├── 0000_raw_disparity.png 227 | │   ├── 0000_rgb.png 228 | │   └── intrinsics.txt 229 | └── sceneflow -> /raid/songlin/Data/sceneflow 230 | ├── bad_sceneflow_test.txt 231 | ├── bad_sceneflow_train.txt 232 | ├── Driving 233 | │   ├── disparity 234 | │   ├── frames_cleanpass 235 | │   ├── frames_finalpass 236 | │   ├── raw_cleanpass 237 | │   └── raw_finalpass 238 | ├── FlyingThings3D 239 | │   ├── disparity 240 | │   ├── frames_cleanpass 241 | │   ├── frames_finalpass 242 | │   ├── raw_cleanpass 243 | │   └── raw_finalpass 244 | └── Monkaa 245 | ├── disparity 246 | ├── frames_cleanpass 247 | ├── frames_finalpass 248 | ├── raw_cleanpass 249 | └── raw_finalpass 250 | 251 | 227 directories, 18 files 252 | 253 | ``` 254 | -------------------------------------------------------------------------------- /distributed_evaluate.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import math 3 | import argparse 4 | import torch 5 | import logging 6 | from tqdm import tqdm 7 | 8 | from core.custom_pipelines import GuidedLatentDiffusionPipeline 9 | from accelerate import Accelerator, PartialState 10 | from core.guidance import FlowGuidance 11 | import numpy as np 12 | from utils.utils import seed_everything 13 | from config import TrainingConfig, create_sampler 14 | from diffusers import UNet2DModel, DDIMScheduler 15 | from utils.utils import InputPadder, metrics_to_dict, pretty_json 16 | from accelerate.logging import get_logger 17 | from utils.camera import plot_error_map 18 | from evaluate import eval_batch 19 | from data.stereo_datasets import * 20 | from data.mono_datasets import * 21 | 22 | import hydra 23 | from config import Config, TrainingConfig, create_sampler, setup_hydra_configurations 24 | 25 | logger = get_logger(__name__, log_level="INFO") 26 | 27 | @hydra.main(version_base=None, config_path="conf", config_name="config.yaml") 28 | def run_distributed_eval(base_cfg: Config): 29 | if base_cfg.seed != -1: 30 | seed_everything(base_cfg.seed) # for reproducing 31 | 32 | accelerator = Accelerator() # hack: enable logging 33 | 34 | config = base_cfg.task 35 | assert len(config.eval_dataset) == 1, "only support single dataset for evaluation" 36 | 37 | inputPadder = InputPadder(config.image_size, divis_by=8) 38 | # config.camera # hack init default camera 39 | 40 | patrained_path = f"{config.resume_pretrained}" 41 | if os.path.exists(patrained_path): 42 | logger.info(f"load weights from {patrained_path}") 43 | """ pipeline = GuidedLatentDiffusionPipeline.from_pretrained(patrained_path).to("cuda") 44 | # model = UNet2DConditionModel.from_pretrained(patrained_path) 45 | 46 | from diffusers import DDIMScheduler 47 | ddim = DDIMScheduler.from_config(dict( 48 | beta_schedule = config.beta_schedule, # "scaled_linear", 49 | beta_start = config.beta_start, # 0.00085, 50 | beta_end = config.beta_end, # 0.012, 51 | clip_sample = config.clip_sample, # False, 52 | num_train_timesteps = config.num_train_timesteps, # 1000, 53 | prediction_type = config.prediction_type, # #"v_prediction", 54 | set_alpha_to_one = False, 55 | skip_prk_steps = True, 56 | steps_offset = 1, 57 | trained_betas = None 58 | )) 59 | pipeline.scheduler = ddim """ 60 | 61 | from core.custom_pipelines import 
GuidedDiffusionPipeline, GuidedLatentDiffusionPipeline 62 | clazz_pipeline = GuidedLatentDiffusionPipeline if config.ldm else GuidedDiffusionPipeline 63 | pipeline = clazz_pipeline.from_pretrained(patrained_path).to("cuda") 64 | pipeline.guidance.flow_guidance_mode=config.flow_guidance_mode 65 | 66 | pipeline.scheduler = create_sampler(config, train=False) 67 | else: 68 | raise ValueError(f"patrained path not exists: {patrained_path}") 69 | 70 | if config.eval_output: 71 | eval_output_dir = f"{config.resume_pretrained}/{config.eval_output}" 72 | else: 73 | eval_output_dir = f"{config.resume_pretrained}/dist.{config.eval_dataset[0]}.g.{config.guide_source}.b{config.eval_num_batch}.{config.eval_split}" 74 | 75 | if not os.path.exists(eval_output_dir): 76 | os.makedirs(eval_output_dir, exist_ok=True) 77 | 78 | logger.logger.addHandler(logging.FileHandler(f"{eval_output_dir}/eval.log")) 79 | logger.logger.addHandler(logging.StreamHandler(sys.stdout)) 80 | logger.info(f"eval output dir: {eval_output_dir}") 81 | 82 | from data.data_loader import create_dataset 83 | val_dataset = create_dataset(config, config.eval_dataset[0], split = config.eval_split) 84 | # print(f"eval_batch_size={config.eval_batch_size}"); exit(0) 85 | val_dataloader = torch.utils.data.DataLoader(val_dataset, 86 | batch_size=config.eval_batch_size, 87 | shuffle=True, 88 | pin_memory=False, 89 | drop_last=False) 90 | 91 | """ if type(model.sample_size) == list: 92 | model.sample_size[0] = inputPadder.padded_size[0] 93 | model.sample_size[1] = inputPadder.padded_size[1] """ 94 | 95 | # distributed evaluation 96 | val_dataloader = accelerator.prepare(val_dataloader) 97 | 98 | pbar = tqdm(total=len(val_dataloader), desc="Eval", disable=not accelerator.is_local_main_process, position=0) 99 | disable_bar = not accelerator.is_local_main_process 100 | distributed_state = PartialState() 101 | 102 | w = config.flow_guidance_weights[0] 103 | if accelerator.is_local_main_process: 104 | logger.info(f"guided by {config.guide_source}") 105 | 106 | disp_metrics = [] 107 | depth_metrics = [] 108 | total = 0 109 | for i, batch in enumerate(val_dataloader): 110 | if config.eval_num_batch > 0 and i >= config.eval_num_batch: 111 | break 112 | 113 | normalized_rgbs = batch["normalized_rgb"] 114 | gt_images = batch["normalized_disp"] 115 | raw_disps = batch["raw_disp"] 116 | left_images = batch["left_image"] if "left_image" in batch else None 117 | right_images = batch["right_image"] if "right_image" in batch else None 118 | depth_images = batch["depth"] if "depth" in batch else None 119 | gt_masks = batch["mask"] 120 | fxb = batch["fxb"] 121 | sim_disps = batch["sim_disp"] if "sim_disp" in batch else None 122 | 123 | B = normalized_rgbs.shape[0] 124 | # assert not torch.any(gt_images[gt_masks.to(torch.bool)] == 0.0), "dataset bug" 125 | if config.guide_source is None: 126 | pass 127 | 128 | elif config.guide_source == "raft-stereo": 129 | pass 130 | 131 | elif config.guide_source == "stereo-match": 132 | pass 133 | 134 | elif config.guide_source == "raw-depth": 135 | guidance_image = batch["raw_depth"] # raw 136 | valid = guidance_image > 0 137 | 138 | elif config.guide_source == "gt": 139 | guidance_image = batch["depth"] # gt 140 | valid = guidance_image > 0 141 | else: 142 | raise ValueError(f"Unknown guidance mode: {config.guide_source}") 143 | 144 | if config.guide_source is not None: 145 | pipeline.guidance.prepare(guidance_image, valid, "depth") # disp 146 | pipeline.guidance.flow_guidance_weight = w 147 | 148 | pred_disps, metrics_, 
uncertainties, error, intermediates = eval_batch(config, pipeline, disable_bar, fxb, normalized_rgbs, 149 | raw_disps, gt_masks, left_images, right_images, sim_disps) 150 | metrics = metrics_to_dict(*metrics_) 151 | logger.info(f"metrics(w={w}):{pretty_json(metrics)}") 152 | 153 | disp_err = torch.from_numpy(metrics_[0]).to(distributed_state.device) # to be gathered 154 | depth_err = torch.from_numpy(metrics_[1]).to(distributed_state.device) 155 | 156 | if config.plot_error_map: 157 | fname = lambda name: f"{eval_output_dir}/idx{i}_w{w}_pid{distributed_state.process_index}_{name}" 158 | error_map = plot_error_map(error) 159 | error_map.save(fname("error.png")) 160 | 161 | # gather all batch results 162 | gathered_disp_err = accelerator.gather_for_metrics(disp_err) 163 | gathered_depth_err = accelerator.gather_for_metrics(depth_err) 164 | 165 | disp_metrics.extend(gathered_disp_err) 166 | depth_metrics.extend(gathered_depth_err) 167 | total += gathered_disp_err.shape[0] 168 | 169 | pbar.update(1) 170 | 171 | # whole val set results 172 | gathered_metrics = metrics_to_dict(torch.vstack(disp_metrics).cpu().numpy(), torch.vstack(depth_metrics).cpu().numpy()) 173 | logger.info(f"final metrics:{pretty_json(gathered_metrics)}") 174 | logger.info(f"total evaluated {total} samples, please check if correct") 175 | 176 | if __name__ == "__main__": 177 | setup_hydra_configurations() 178 | run_distributed_eval() -------------------------------------------------------------------------------- /isaacsim/utils_func.py: -------------------------------------------------------------------------------- 1 | import os, re, math 2 | import numpy as np 3 | from typing import Union, Type, List, Tuple 4 | from pxr import Gf, Sdf, Usd, UsdGeom 5 | from omni.isaac.core.utils.prims import get_prim_at_path 6 | import transforms3d 7 | import omni 8 | 9 | def find_next_sequence_id(output_dir): 10 | import glob 11 | import os 12 | files = sorted(glob.glob(os.path.join(output_dir, "*.png")), reverse=True) 13 | if len(files) == 0: 14 | return 0 15 | return int(files[0].split("/")[-1].split("_")[0]) + 1 16 | 17 | def get_visibility_attribute( 18 | stage: Usd.Stage, prim_path: str 19 | ) -> Union[Usd.Attribute, None]: 20 | #Return the visibility attribute of a prim 21 | path = Sdf.Path(prim_path) 22 | prim = stage.GetPrimAtPath(path) 23 | if not prim.IsValid(): 24 | return None 25 | visibility_attribute = prim.GetAttribute("visibility") 26 | return visibility_attribute 27 | 28 | def get_all_child_mesh(parent_prim: Usd.Prim) -> Usd.Prim: 29 | # Iterates only active, loaded, defined, non-abstract children 30 | mesh_prims = [] 31 | for model_prim in parent_prim.GetChildren(): 32 | if "model" in model_prim.GetPath().pathString: 33 | for child_prim in model_prim.GetChildren(): 34 | if child_prim.IsA(UsdGeom.Mesh): 35 | mesh_prims.append(child_prim) 36 | return mesh_prims 37 | 38 | def create_materials(self, stage, num, opacity): 39 | MDL = "OmniPBR.mdl" 40 | # MDL = "OmniGlass.mdl" 41 | mtl_name, _ = os.path.splitext(MDL) 42 | MAT_PATH = "/World/Looks" 43 | materials = [] 44 | for _ in range(num): 45 | prim_path = omni.usd.get_stage_next_free_path(stage, f"{MAT_PATH}/{mtl_name}", False) 46 | mat = self.create_omnipbr_material(mtl_url=MDL, mtl_name=mtl_name, mtl_path=prim_path, cutout_opacity=opacity) 47 | materials.append(mat) 48 | return materials 49 | 50 | def parse_quadrant(q): 51 | """ x+-y+-z+-, in isaac sim hssd coordinate system """ 52 | x_, y_, z_ = q.split(',') 53 | if y_[1:] == '+': 54 | theta = [0, np.pi/2] 55 | elif 
y_[1:] == '-': 56 | theta = [np.pi/2, np.pi] 57 | else: 58 | theta = [0, np.pi] 59 | 60 | if z_[1:] == '+': 61 | phi = [0, np.pi/2] 62 | elif z_[1:] == '-': 63 | phi = [np.pi/2, np.pi] 64 | else: 65 | phi = [0, np.pi] 66 | 67 | return theta, phi 68 | 69 | def grasp_pose_in_robot(target_grasp, graspnet_offset = np.array([0,0,0])): 70 | T_table_grasp = np.eye(4) 71 | T_table_grasp[:3, :3] = transforms3d.quaternions.quat2mat(target_grasp['orientation']) 72 | T_table_grasp[:3, 3] = target_grasp['position'] 73 | 74 | T_world_table = np.eye(4) 75 | # TODO random table rotation around z 76 | T_world_table[:3, 3] = graspnet_offset 77 | 78 | T_grasp_ee = np.array([ 79 | [0, 0, 1, 0], 80 | [0, -1, 0, 0], 81 | [1, 0, 0, 0], 82 | [0, 0, 0, 1] 83 | ]) 84 | 85 | T_robot_world = np.eye(4) # should be always be identity due to curobo limitation 86 | T_ee_hand = np.eye(4) 87 | T_ee_hand[:3, 3] = np.array([0, 0, -0.10]) 88 | 89 | """ T_robot_hand: base_link -> panda_hand """ 90 | T_robot_hand = T_robot_world @ T_world_table @ T_table_grasp @ T_grasp_ee @ T_ee_hand 91 | target_pose = { 92 | 'position' : T_robot_hand[:3, 3], 93 | 'orientation' : transforms3d.quaternions.mat2quat(T_robot_hand[:3, :3]) 94 | } 95 | return target_pose 96 | 97 | def compute_obb(bbox_cache: UsdGeom.BBoxCache, prim_path: str) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: 98 | """Computes the Oriented Bounding Box (OBB) of a prim 99 | 100 | .. note:: 101 | 102 | * The OBB does not guarantee the smallest possible bounding box, it rotates and scales the default AABB. 103 | * The rotation matrix incorporates any scale factors applied to the object. 104 | * The `half_extent` values do not include these scaling effects. 105 | 106 | Args: 107 | bbox_cache (UsdGeom.BBoxCache): USD Bounding Box Cache object to use for computation 108 | prim_path (str): Prim path to compute OBB for 109 | 110 | Returns: 111 | Tuple[np.ndarray, np.ndarray, np.ndarray]: A tuple containing the following OBB information: 112 | - The centroid of the OBB as a NumPy array. 113 | - The axes of the OBB as a 2D NumPy array, where each row represents a different axis. 114 | - The half extent of the OBB as a NumPy array. 115 | 116 | Example: 117 | 118 | .. code-block:: python 119 | 120 | >>> import omni.isaac.core.utils.bounds as bounds_utils 121 | >>> 122 | >>> # 1 stage unit length cube centered at (0.0, 0.0, 0.0) 123 | >>> cache = bounds_utils.create_bbox_cache() 124 | >>> centroid, axes, half_extent = bounds_utils.compute_obb(cache, prim_path="/World/Cube") 125 | >>> centroid 126 | [0. 0. 0.] 127 | >>> axes 128 | [[1. 0. 0.] 129 | [0. 1. 0.] 130 | [0. 0. 1.]] 131 | >>> half_extent 132 | [0.5 0.5 0.5] 133 | >>> 134 | >>> # the same cube rotated 45 degrees around the z-axis 135 | >>> cache = bounds_utils.create_bbox_cache() 136 | >>> centroid, axes, half_extent = bounds_utils.compute_obb(cache, prim_path="/World/Cube") 137 | >>> centroid 138 | [0. 0. 0.] 139 | >>> axes 140 | [[ 0.70710678 0.70710678 0. ] 141 | [-0.70710678 0.70710678 0. ] 142 | [ 0. 0. 1. 
]] 143 | >>> half_extent 144 | [0.5 0.5 0.5] 145 | """ 146 | # Compute the BBox3d for the prim 147 | prim = get_prim_at_path(prim_path) 148 | bound = bbox_cache.ComputeWorldBound(prim) 149 | 150 | # Compute the translated centroid of the world bound 151 | centroid = bound.ComputeCentroid() 152 | 153 | # Compute the axis vectors of the OBB 154 | # NOTE: The rotation matrix incorporates the scale factors applied to the object 155 | rotation_matrix = bound.GetMatrix().ExtractRotationMatrix() 156 | x_axis = rotation_matrix.GetRow(0) 157 | y_axis = rotation_matrix.GetRow(1) 158 | z_axis = rotation_matrix.GetRow(2) 159 | 160 | # Compute the half-lengths of the OBB along each axis 161 | # NOTE the size/extent values do not include any scaling effects 162 | half_extent = bound.GetRange().GetSize() * 0.5 163 | 164 | return np.array([*centroid]), np.array([[*x_axis], [*y_axis], [*z_axis]]), np.array(half_extent) 165 | 166 | def get_obb_corners(centroid: np.ndarray, axes: np.ndarray, half_extent: np.ndarray) -> np.ndarray: 167 | """Computes the corners of the Oriented Bounding Box (OBB) from the given OBB information 168 | 169 | Args: 170 | centroid (np.ndarray): The centroid of the OBB as a NumPy array. 171 | axes (np.ndarray): The axes of the OBB as a 2D NumPy array, where each row represents a different axis. 172 | half_extent (np.ndarray): The half extent of the OBB as a NumPy array. 173 | 174 | Returns: 175 | np.ndarray: NumPy array of shape (8, 3) containing each corner location of the OBB 176 | 177 | :math:`c_0 = (x_{min}, y_{min}, z_{min})` 178 | |br| :math:`c_1 = (x_{min}, y_{min}, z_{max})` 179 | |br| :math:`c_2 = (x_{min}, y_{max}, z_{min})` 180 | |br| :math:`c_3 = (x_{min}, y_{max}, z_{max})` 181 | |br| :math:`c_4 = (x_{max}, y_{min}, z_{min})` 182 | |br| :math:`c_5 = (x_{max}, y_{min}, z_{max})` 183 | |br| :math:`c_6 = (x_{max}, y_{max}, z_{min})` 184 | |br| :math:`c_7 = (x_{max}, y_{max}, z_{max})` 185 | 186 | Example: 187 | 188 | .. 
code-block:: python 189 | 190 | >>> import omni.isaac.core.utils.bounds as bounds_utils 191 | >>> 192 | >>> cache = bounds_utils.create_bbox_cache() 193 | >>> centroid, axes, half_extent = bounds_utils.compute_obb(cache, prim_path="/World/Cube") 194 | >>> bounds_utils.get_obb_corners(centroid, axes, half_extent) 195 | [[-0.5 -0.5 -0.5] 196 | [-0.5 -0.5 0.5] 197 | [-0.5 0.5 -0.5] 198 | [-0.5 0.5 0.5] 199 | [ 0.5 -0.5 -0.5] 200 | [ 0.5 -0.5 0.5] 201 | [ 0.5 0.5 -0.5] 202 | [ 0.5 0.5 0.5]] 203 | """ 204 | corners = [ 205 | centroid - axes[0] * half_extent[0] - axes[1] * half_extent[1] - axes[2] * half_extent[2], 206 | centroid - axes[0] * half_extent[0] - axes[1] * half_extent[1] + axes[2] * half_extent[2], 207 | centroid - axes[0] * half_extent[0] + axes[1] * half_extent[1] - axes[2] * half_extent[2], 208 | centroid - axes[0] * half_extent[0] + axes[1] * half_extent[1] + axes[2] * half_extent[2], 209 | centroid + axes[0] * half_extent[0] - axes[1] * half_extent[1] - axes[2] * half_extent[2], 210 | centroid + axes[0] * half_extent[0] - axes[1] * half_extent[1] + axes[2] * half_extent[2], 211 | centroid + axes[0] * half_extent[0] + axes[1] * half_extent[1] - axes[2] * half_extent[2], 212 | centroid + axes[0] * half_extent[0] + axes[1] * half_extent[1] + axes[2] * half_extent[2], 213 | ] 214 | return np.array(corners) 215 | -------------------------------------------------------------------------------- /scripts/check_stereo.py: -------------------------------------------------------------------------------- 1 | import hydra 2 | from omegaconf import DictConfig, OmegaConf 3 | from hydra.core.config_store import ConfigStore 4 | from config import Config, TrainingConfig, setup_hydra_configurations 5 | from data.data_loader import fetch_dataloader 6 | from utils.utils import seed_everything 7 | from accelerate import Accelerator 8 | from accelerate.logging import get_logger 9 | from tqdm import tqdm 10 | from utils.utils import Normalizer 11 | import torch.nn.functional as F 12 | 13 | import torch 14 | import numpy as np 15 | from PIL import Image 16 | 17 | logger = get_logger(__name__, log_level="INFO") # multi-process logging 18 | 19 | Accelerator() # hack: enable logging 20 | 21 | @hydra.main(version_base=None, config_path="conf", config_name="config.yaml") 22 | def check(config: Config): 23 | cfg = config.task 24 | logger.info(cfg.train_dataset) 25 | 26 | from utils.camera import DepthCamera, Realsense 27 | from functools import partial 28 | from utils import frame_utils 29 | sim_camera = DepthCamera.from_device("sim") 30 | # sim_camera.change_resolution(f"{config.image_size[1]}x{config.image_size[0]}") 31 | sim_camera.change_resolution(cfg.camera_resolution) 32 | disp_reader = partial(frame_utils.readDispReal, sim_camera) 33 | 34 | # sim_disp, sim_valid, min_disp, max_disp = disp_reader("datasets/HssdIsaacStd/train/102344049/kitchentable/1500_simDepthImage.exr") 35 | # sim_disp, sim_valid, min_disp, max_disp = disp_reader("datasets/HssdIsaacStd/train/102344049/kitchentable/1500_simDispImage.png") 36 | # raw_disp, raw_valid, min_disp, max_disp = disp_reader("datasets/HssdIsaacStd/train/102344049/kitchentable/1500_depth.exr") 37 | 38 | # epe = np.abs(sim_disp[sim_valid] - raw_disp[sim_valid]).mean() 39 | # assert epe < 1, f"bad quality sim disp, epe={epe}" 40 | 41 | train_dataloader, val_dataloader_lst = fetch_dataloader(cfg) 42 | logger.info(val_dataloader_lst[0].dataset.__class__.__name__) 43 | 44 | all_dataloaders = [train_dataloader] 45 | all_dataloaders.extend(val_dataloader_lst) 46 | bad 
= [] 47 | 48 | stats = { 49 | 'mean': [], 50 | 'med': [], 51 | 'min': [], 52 | 'max': [], 53 | 'std': [] 54 | } 55 | 56 | stats_norm = { 57 | 'mean': [], 58 | 'med': [], 59 | 'min': [], 60 | 'max': [], 61 | 'std': [] 62 | } 63 | count = 0 64 | 65 | norm = Normalizer.from_config(cfg) 66 | 67 | bads = {} 68 | 69 | for i, dataloader in enumerate(val_dataloader_lst): # all_dataloaders, [train_dataloader] 70 | pbar = tqdm(total=len(dataloader)) 71 | for j, data in enumerate(dataloader): 72 | # print(data.keys()) 73 | B = data['mask'].shape[0] 74 | for b in range(B): 75 | mask = data['mask'][b] 76 | # sim_mask = data['sim_mask'][b] 77 | 78 | disp = data['raw_disp'][b] 79 | disp_norm = data["normalized_disp"][b] 80 | # rgb = data['normalized_rgb'][b] 81 | index = data['index'][b] 82 | path = data['path'][b] 83 | 84 | # sim_disp = data["sim_disp_unnorm"][b] 85 | # sim_valid = data["sim_mask"][b].bool() 86 | 87 | stats['mean'].append(disp.mean().item()) 88 | stats['med'].append(disp.median().item()) 89 | stats['min'].append(disp.min().item()) 90 | stats['max'].append(disp.max().item()) 91 | stats['std'].append(disp.std().item()) 92 | 93 | stats_norm['mean'].append(disp_norm.mean().item()) 94 | stats_norm['med'].append(disp_norm.median().item()) 95 | stats_norm['min'].append(disp_norm.min().item()) 96 | stats_norm['max'].append(disp_norm.max().item()) 97 | stats_norm['std'].append(disp_norm.std().item()) 98 | 99 | # sim_disp, sim_valid, min_disp, max_disp = disp_reader("datasets/HssdIsaacStd/train/102344049/kitchentable/1500_simDepthImage.exr") 100 | # sim_disp, sim_valid, min_disp, max_disp = disp_reader("datasets/HssdIsaacStd/train/102344049/kitchentable/1500_simDispImage.png") 101 | # raw_disp, raw_valid, min_disp, max_disp = disp_reader("datasets/HssdIsaacStd/train/102344049/kitchentable/1500_depth.exr") 102 | 103 | # epe = torch.abs(sim_disp[sim_valid] - disp[sim_valid]).mean() 104 | if True: #&epe > 2.: 105 | # print(f"bad quality sim disp, epe={epe}, {data['path']}") 106 | # bads[data['path'][b]] = epe 107 | 108 | if "normalized_rgb" in data: 109 | rgb = data['normalized_rgb'][b:b+1] 110 | Image.fromarray(((rgb[0]+1) * 127.5).cpu().numpy().astype(np.uint8).transpose(1,2,0)).save(f"{index}_{j}_rgb.png") 111 | 112 | if True: 113 | left = data['left_image'][b:b+1] 114 | Image.fromarray(((left[0]+1) * 127.5).cpu().numpy().astype(np.uint8).transpose(1,2,0)).save(f"{index}_{j}_left.png") 115 | 116 | right = data['right_image'][b:b+1] 117 | Image.fromarray(((right[0]+1) * 127.5).cpu().numpy().astype(np.uint8).transpose(1,2,0)).save(f"{index}_{j}_right.png") 118 | 119 | H, W = disp.shape[-2:] 120 | device = left.device 121 | 122 | xx, yy = torch.meshgrid(torch.arange(W), torch.arange(H), indexing='xy') 123 | xx = xx.unsqueeze(0).repeat(1, 1, 1).to(device) 124 | yy = yy.unsqueeze(0).repeat(1, 1, 1).to(device) 125 | 126 | # raw_disp = data['raw_disp'][b] 127 | xx = (xx - disp) / ((W - 1) / 2.) - 1 128 | yy = yy / ((H - 1) / 2.) 
- 1 129 | grid = torch.stack((xx, yy), dim=-1) 130 | warp_left_image = F.grid_sample(right, grid, align_corners=True, mode="bilinear", padding_mode="border") 131 | warp_left_image[0][mask.repeat(3,1,1)<1.0] = -1 132 | Image.fromarray(((warp_left_image[0]+1) * 127.5).cpu().numpy().astype(np.uint8).transpose(1,2,0)).save(f"{index}_{j}_warped_right.png") 133 | loss = F.l1_loss(left[..., 0:], warp_left_image, reduction='mean') 134 | logger.info(f"raw disp loss: {loss.item()}") 135 | 136 | sim_disp = norm.denormalize(data["sim_disp"])[b] 137 | xx, yy = torch.meshgrid(torch.arange(W), torch.arange(H), indexing='xy') 138 | xx = xx.unsqueeze(0).repeat(B, 1, 1).to(device) 139 | yy = yy.unsqueeze(0).repeat(B, 1, 1).to(device) 140 | xx = (xx - sim_disp) / ((W - 1) / 2.) - 1 141 | yy = yy / ((H - 1) / 2.) - 1 142 | sim_grid = torch.stack((xx, yy), dim=-1) 143 | warp_left_image_sim = F.grid_sample(right, sim_grid, align_corners=True, mode="bilinear", padding_mode="border") 144 | # warp_left_image_sim[0][mask.repeat(3,1,1)<1.0] = -1 for sparse dataset 145 | warp_left_image_sim[0][mask.repeat(3,1,1)<1.0] = -1 146 | Image.fromarray(((warp_left_image_sim[0]+1) * 127.5).cpu().numpy().astype(np.uint8).transpose(1,2,0)).save(f"{index}_{j}_warped_right_sim.png") 147 | loss_sim = F.l1_loss(left[..., 0:], warp_left_image_sim, reduction='mean') 148 | logger.info(f"sim disp loss: {loss_sim.item()}") 149 | 150 | """ if True or mask.sum() / mask.numel() < 0.98: 151 | bad.append(path) 152 | logger.info(f"bad image {index}: {path}") 153 | 154 | if True: 155 | # low, high = torch.quantile(data['depth'][b], torch.tensor((0.02, 0.98))) # gt depth 156 | # d = (data['depth'][b] - low) / (high - low) 157 | # Image.fromarray(mask[0].cpu().numpy().astype(np.uint8)*255).save(f"{index}_mask.png") 158 | # Image.fromarray((d[0].clamp(0,1)*255).cpu().numpy().astype(np.uint8)).save(f"{index}_depth_p.png") 159 | Image.fromarray(((rgb+1) * 127.5).cpu().numpy().astype(np.uint8).transpose(1,2,0)).save(f"{index}_rgb.png") """ 160 | 161 | count += 1 162 | if count % 1000 == 0: 163 | print("stats_raw...") 164 | print(f"tatal={len(stats['mean'])}") 165 | for k, vals in stats.items(): 166 | print(f"{k}: {np.mean(vals)}") 167 | print("stats_norm...") 168 | for k, vals in stats_norm.items(): 169 | print(f"{k}: {np.mean(vals)}") 170 | 171 | # break 172 | # break 173 | pbar.update(1) 174 | 175 | print(f"tatal={len(stats['mean'])}") 176 | print("stats_raw...") 177 | for k, vals in stats.items(): 178 | print(f"{k}: {np.mean(vals)}") 179 | print("stats_norm...") 180 | for k, vals in stats_norm.items(): 181 | print(f"{k}: {np.mean(vals)}") 182 | 183 | # print("stats:", stats) 184 | logger.info(f"how many bad images? 
{len(bads.items())}") 185 | with open(f'bad_his.txt', 'w') as f: 186 | for path,epe in bads.items(): 187 | f.write(f"{path} {epe}\n") 188 | 189 | if __name__ == "__main__": 190 | 191 | seed_everything(0) 192 | setup_hydra_configurations() 193 | check() -------------------------------------------------------------------------------- /data/data_loader.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from .mono_datasets import * 3 | from .stereo_datasets import * 4 | from config import TrainingConfig 5 | from omegaconf import OmegaConf 6 | from torch.utils.data.dataset import ConcatDataset 7 | from utils.camera import Realsense, RGBDCamera 8 | 9 | def create_dataset(config: TrainingConfig, dataset_name, split = "train"): 10 | mono_lst = ['NYUv2', 'ScanNet', 'HyperSim', 'SceneNet', 'ScanNetpp', 'VK2', 'KITTI', "Middlebury", "InStereo2K", "Tartenair", "HRWSI", "SynTODD"] 11 | stereo_lst = ["Dreds", "Middlebury", "SceneFlow", "Real", "HISS", "ClearPose", "SynTODDRgbd", "Gapartnet2"] 12 | image_size = tuple(config.image_size) 13 | 14 | if len(dataset_name.split("_")) > 1: # Real_split_device 15 | dataset_name, split, device = dataset_name.split("_") 16 | 17 | from utils.utils import Normalizer 18 | normalizer = Normalizer.from_config(config) 19 | 20 | if dataset_name in stereo_lst: 21 | cam_res = [int(x) for x in config.camera_resolution.split("x")[::-1]] 22 | 23 | if split == "train": 24 | # dataset = eval(dataset_name)(f"datasets/{dataset_name}", split="train", image_size=config.image_size, augment=config.augment, camera = config.camera) 25 | aug_params = {"crop_size": image_size, 26 | "min_scale": config.augment["min_scale"], 27 | "max_scale": config.augment["max_scale"], 28 | "yjitter": config.augment["yjitter"]} 29 | aug_params["saturation_range"] = tuple(config.augment["saturation_range"]) 30 | aug_params["gamma"] = config.augment["gamma"] 31 | aug_params["do_flip"] = config.augment["hflip"] #config.augment["hflip"]["prob"] > 0.0 32 | # aug_params["camera_resolution"] = cam_res 33 | if dataset_name == 'SceneFlow': # BUG? min disp=0.5, max disp=192.0? 34 | disp_reader = partial(frame_utils.read_sceneflow, cam_res) 35 | clean_dataset = SceneFlow(aug_params=aug_params, root="datasets/sceneflow", dstype='frames_cleanpass', 36 | reader=disp_reader, normalizer=normalizer) 37 | final_dataset = SceneFlow(aug_params=aug_params, root="datasets/sceneflow", dstype='frames_finalpass', 38 | reader=disp_reader, normalizer=normalizer) 39 | dataset = clean_dataset + final_dataset 40 | elif dataset_name == 'HISS': 41 | sim_camera = DepthCamera.from_device("sim") # BUG? max depth=5. 42 | # sim_camera.change_resolution(f"{config.image_size[1]}x{config.image_size[0]}") 43 | sim_camera.change_resolution(config.camera_resolution) 44 | disp_reader = partial(frame_utils.readDispReal, sim_camera) 45 | dataset = HISS(sim_camera, normalizer, image_size, split, config.prediction_space, aug_params, reader=disp_reader) 46 | elif dataset_name == "Dreds": 47 | sim_camera = Realsense.default_sim() # BUG? max depth=2. 48 | # sim_camera.change_resolution(f"{image_size[1]}x{image_size[0]}") 49 | sim_camera.change_resolution(config.camera_resolution) 50 | # assert image_size == (126, 224) 51 | # disp_reader = partial(frame_utils.readDispDreds_exr, sim_camera) 52 | dataset = Dreds(sim_camera, normalizer, image_size, split, config.prediction_space, aug_params) 53 | elif dataset_name == "ClearPose": 54 | camera = RGBDCamera.default_clearpose() # BUG? max depth=5. 
55 | camera.change_resolution(config.camera_resolution) 56 | disp_reader = partial(frame_utils.readDispReal, camera) 57 | dataset = ClearPose(camera, normalizer, image_size, split, config.prediction_space, reader=disp_reader) 58 | elif dataset_name == "SynTODDRgbd": 59 | camera = RGBDCamera.default_syntodd() 60 | camera.change_resolution(config.camera_resolution) 61 | disp_reader = partial(frame_utils.readDispReal, camera) 62 | dataset = SynTODDRgbd(config.dataset_variant, camera, normalizer, image_size, split, config.prediction_space, reader=disp_reader) 63 | elif dataset_name == "Gapartnet2": 64 | sim_camera = Realsense.from_device("sim") 65 | sim_camera.change_resolution(config.camera_resolution) 66 | disp_reader = partial(frame_utils.readDispReal, sim_camera) 67 | dataset = Gapartnet2(sim_camera, normalizer, image_size, split, config.prediction_space, aug_params, reader=disp_reader) 68 | else: 69 | raise NotImplementedError 70 | 71 | else: 72 | if dataset_name == 'SceneFlow': 73 | disp_reader = partial(frame_utils.read_sceneflow, cam_res) 74 | dataset = SceneFlow(root="datasets/sceneflow", dstype='frames_cleanpass', things_test=True, 75 | reader=disp_reader, normalizer=normalizer) 76 | elif dataset_name == "HISS": 77 | sim_camera = Realsense.from_device("sim") 78 | sim_camera.change_resolution(f"{config.image_size[1]}x{config.image_size[0]}") 79 | disp_reader = partial(frame_utils.readDispReal, sim_camera) 80 | dataset = HISS(sim_camera, normalizer, image_size, split, space=config.prediction_space, reader=disp_reader) 81 | elif dataset_name == "Dreds": 82 | sim_camera = Realsense.default_sim() 83 | sim_camera.change_resolution(f"{image_size[1]}x{image_size[0]}") 84 | # assert image_size == (126, 224) # reprod dreds-1.0 85 | # disp_reader = partial(frame_utils.readDispDreds_exr, sim_camera) 86 | dataset = Dreds(sim_camera, normalizer, image_size, split, space=config.prediction_space) 87 | elif dataset_name == "Real": 88 | real_cam = Realsense.default_real("fxm") 89 | real_cam.change_resolution(f"{config.image_size[1]}x{config.image_size[0]}") 90 | dataset = Real(camera=real_cam, normalizer=normalizer, 91 | image_size=image_size, scene=split, space=config.prediction_space) 92 | elif dataset_name == "ClearPose": 93 | camera = RGBDCamera.default_clearpose() 94 | camera.change_resolution(f"{image_size[1]}x{image_size[0]}") 95 | disp_reader = partial(frame_utils.readDispReal, camera) 96 | dataset = ClearPose(camera, normalizer, image_size, split, config.prediction_space, reader=disp_reader) 97 | elif dataset_name == "SynTODDRgbd": 98 | camera = RGBDCamera.default_syntodd() 99 | camera.change_resolution(f"{image_size[1]}x{image_size[0]}") 100 | disp_reader = partial(frame_utils.readDispReal, camera) 101 | dataset = SynTODDRgbd(config.dataset_variant, camera, normalizer, image_size, split, config.prediction_space, reader=disp_reader) 102 | elif dataset_name == "Gapartnet2": 103 | sim_camera = Realsense.from_device("sim") 104 | sim_camera.change_resolution(f"{config.image_size[1]}x{config.image_size[0]}") 105 | disp_reader = partial(frame_utils.readDispReal, sim_camera) 106 | dataset = Gapartnet2(sim_camera, normalizer, image_size, split, space=config.prediction_space, reader=disp_reader) 107 | 108 | else: 109 | raise NotImplementedError 110 | 111 | elif dataset_name in mono_lst: 112 | if split == "train": 113 | dataset= eval(dataset_name)(f"datasets/{dataset_name}", split="train", image_size=image_size, augment=config.augment) 114 | else: 115 | dataset = 
eval(dataset_name)(f"datasets/{dataset_name}", split=split, image_size=image_size, augment=None) 116 | else: 117 | raise NotImplementedError 118 | return dataset 119 | 120 | def fetch_dataloader(config: TrainingConfig): 121 | """ Create the data loader for the corresponding trainign set """ 122 | 123 | """ if not isinstance(config.dataset, List): 124 | dataset_lst = [config.dataset] 125 | else: 126 | dataset_lst = config.dataset 127 | 128 | if not isinstance(config.dataset_weight, List): 129 | weight_lst = [config.dataset_weight] 130 | else: 131 | weight_lst = config.dataset_weight """ 132 | 133 | assert len(config.train_dataset) == len(config.dataset_weight) 134 | 135 | val_loader_lst = [] 136 | train_dataset = None 137 | for i, dataset_name in enumerate(config.train_dataset): 138 | new_dataset = create_dataset(config, dataset_name, split = "train") 139 | 140 | # multiple dataset weights 141 | if type(new_dataset) == ConcatDataset: 142 | # hack: unsupported operand type(s) for *: 'ConcatDataset' and 'int' 143 | for i in range(max(0, int(config.dataset_weight[i])-1)): 144 | new_dataset += new_dataset 145 | else: 146 | new_dataset = new_dataset * config.dataset_weight[i] 147 | 148 | # add train dataset together 149 | train_dataset = new_dataset if train_dataset is None else train_dataset + new_dataset 150 | 151 | for i, dataset_name in enumerate(config.eval_dataset): 152 | # saperately evaluate each dataset 153 | val_dataset = create_dataset(config, dataset_name, split = "val") 154 | val_dataloader = torch.utils.data.DataLoader(val_dataset, 155 | batch_size=config.eval_batch_size, 156 | shuffle=True, 157 | pin_memory=False, 158 | drop_last=False) 159 | val_loader_lst.append(val_dataloader) 160 | 161 | train_dataloader = torch.utils.data.DataLoader(train_dataset, 162 | batch_size=config.train_batch_size, 163 | shuffle=True, 164 | pin_memory=False, 165 | num_workers=int(os.environ.get('SLURM_CPUS_PER_TASK', 6))-2, 166 | drop_last=True) 167 | 168 | logging.info('Training with %d image pairs' % len(train_dataset)) 169 | return train_dataloader, val_loader_lst 170 | 171 | -------------------------------------------------------------------------------- /isaacsim/replicator.py: -------------------------------------------------------------------------------- 1 | 2 | import os, sys 3 | import csv, copy, math 4 | import time, json 5 | import numpy as np 6 | import random 7 | import transforms3d as t3d 8 | # from scipy.spatial.transform import Rotation 9 | from typing import Union, Type, List 10 | from functools import partial 11 | from PIL import Image 12 | 13 | import carb 14 | import omni.replicator.core as rep 15 | import omni.usd 16 | from omni.isaac.kit import SimulationApp 17 | 18 | from omni.isaac.core.utils.nucleus import get_assets_root_path 19 | 20 | from omni.isaac.core.utils.bounds import compute_combined_aabb, create_bbox_cache 21 | from omni.isaac.core import World 22 | from omni.isaac.core.utils.stage import add_reference_to_stage 23 | from omni.replicator.core import Writer, AnnotatorRegistry 24 | from omni.isaac.core.utils.rotations import euler_angles_to_quat, quat_to_euler_angles 25 | from omni.isaac.core.objects import DynamicCuboid 26 | from pxr import Gf, Sdf, Usd, PhysxSchema, UsdGeom, UsdLux, UsdPhysics, UsdShade 27 | 28 | # import offline_generation_utils 29 | from hydra.utils import get_original_cwd, to_absolute_path 30 | from omegaconf import DictConfig 31 | 32 | from custom_writer import ColorWriter, GtWriter, IRWriter 33 | from omni.replicator.core import WriterRegistry 
34 | from replicate import Replicator 35 | 36 | scene_prim_path = "/World/scene" #!! 37 | 38 | class IRReplicator: 39 | def __init__(self, app: SimulationApp, world: World, config:DictConfig) -> None: 40 | self._app = app 41 | self._world = world 42 | self._config = config 43 | self._log = self._app.app.print_and_log 44 | 45 | # Get server path 46 | # self.assets_root_path = get_assets_root_path() 47 | # if self.assets_root_path is None: 48 | # carb.log_error("Could not get nucleus server path, closing application..") 49 | # app.close() 50 | 51 | # load different scene replicator according to configuration 52 | self.replicator = Replicator.factory(world, config) 53 | 54 | # self._light: Usd.Prim = self.setup_lighting() 55 | 56 | self._scene: Usd.Prim = self.load_scene() 57 | # self._world.scene.add_default_ground_plane() 58 | """ self.scene = UsdPhysics.Scene.Define(self._world.stage, Sdf.Path("/physicsScene")) 59 | self.scene.CreateGravityDirectionAttr().Set(Gf.Vec3f(0.0, 0.0, -1.0)) 60 | self.scene.CreateGravityMagnitudeAttr().Set(9.81) 61 | omni.kit.commands.execute( 62 | "AddGroundPlaneCommand", 63 | stage=self._world.stage, 64 | planePath="/groundPlane", 65 | axis="Z", 66 | size=10.000, 67 | position=Gf.Vec3f(0, 0, -0.01), # hack to hide ground mesh 68 | color=Gf.Vec3f(0.5), 69 | ) """ 70 | 71 | # self._mats = self.load_materials() 72 | 73 | # Disable capture on play and async rendering 74 | carb.settings.get_settings().set("/omni/replicator/captureOnPlay", False) 75 | carb.settings.get_settings().set("/omni/replicator/asyncRendering", False) 76 | carb.settings.get_settings().set("/app/asyncRendering", False) 77 | 78 | # https://forums.developer.nvidia.com/t/replicator-images-contain-artifacts-from-other-frames/220837 79 | # carb.settings.get_settings().set("/rtx/ambientOcclusion/enabled", False) 80 | # rep.settings.set_render_rtx_realtime(antialiasing="FXAA") 81 | 82 | # start replicator 83 | if self._config["rt_subframes"] > 1: 84 | rep.settings.carb_settings("/omni/replicator/RTSubframes", self._config["rt_subframes"]) 85 | else: 86 | carb.log_warn("RTSubframes is set to 1, consider increasing it if materials are not loaded on time") 87 | 88 | self.clear_previous_semantics() 89 | 90 | self.output_dir = os.path.join(os.path.dirname(__file__), config["writer_config"]["output_dir"]) 91 | if not os.path.exists(self.output_dir): 92 | os.makedirs(self.output_dir) 93 | 94 | self.replicator.setup_depth_sensor() 95 | 96 | WriterRegistry.register(ColorWriter) 97 | WriterRegistry.register(GtWriter) 98 | WriterRegistry.register(IRWriter) 99 | 100 | self.dr = self.replicator.setup_domain_randomization() 101 | self._log(json.dumps(self.dr)) 102 | 103 | def clear_previous_semantics(self): 104 | return 105 | if self._config["clear_previous_semantics"]: 106 | offline_generation_utils.remove_previous_semantics(self._world.stage) 107 | 108 | 109 | def setup_lighting(self): 110 | # prim_path = "/World/DiskLight" 111 | # diskLight = UsdLux.DiskLight.Define(self._world.stage, Sdf.Path(prim_path)) 112 | # diskLight.CreateIntensityAttr(15000) 113 | 114 | # light = self._world.stage.GetPrimAtPath(prim_path) 115 | # if not light.GetAttribute("xformOp:translate"): 116 | # UsdGeom.Xformable(light).AddTranslateOp() 117 | # return light 118 | pass 119 | 120 | # def setup_projector_lighting(self): 121 | # prim_path = "/World/RectLight" 122 | # rectLight = UsdLux.RectLight.Define(self._world.stage, Sdf.Path(prim_path)) 123 | # rectLight.CreateIntensityAttr(500) 124 | # rectLight.Create 125 | 126 | def 
load_scene(self): 127 | scene_name = self._config["hssd"]["name"] 128 | data_dir = os.path.abspath(self._config.hssd["data_dir"]) 129 | env_url = f"{data_dir}/{scene_name}/{scene_name}.usd" 130 | assert os.path.exists(env_url), f"Scene file {env_url} does not exist" 131 | add_reference_to_stage(usd_path=env_url, prim_path=scene_prim_path) 132 | 133 | hssd_env = self._world.stage.GetPrimAtPath(scene_prim_path) 134 | if not hssd_env.GetAttribute("xformOp:translate"): 135 | UsdGeom.Xformable(hssd_env).AddTranslateOp() 136 | if not hssd_env.GetAttribute("xformOp:rotateXYZ"): 137 | UsdGeom.Xformable(hssd_env).AddRotateXYZOp() 138 | if not hssd_env.GetAttribute("xformOp:scale"): 139 | UsdGeom.Xformable(hssd_env).AddScaleOp() 140 | 141 | hssd_env.GetAttribute("xformOp:rotateXYZ").Set((90, 0, 0)) 142 | scale = self._config["hssd"]["scale"] 143 | hssd_env.GetAttribute("xformOp:scale").Set((scale, scale, scale)) 144 | 145 | if self._config["hssd"]["hide_ceilings"]: 146 | ceiling = hssd_env.GetPrimAtPath(f"{scene_prim_path}/ceilings") 147 | ceiling.GetAttribute("visibility").Set("invisible") 148 | 149 | if self._config["hssd"]["hide_walls"]: # an ugly hack 150 | walls = hssd_env.GetPrimAtPath(f"{scene_prim_path}/walls") 151 | walls.GetAttribute("visibility").Set("invisible") 152 | 153 | return hssd_env 154 | 155 | # deprecated 156 | def load_materials(self): 157 | #https://forums.developer.nvidia.com/t/how-can-i-change-material-of-the-existing-object-in-runtime/161253 158 | # path_mat_glass_clear = assets_root_path + "/NVIDIA/Materials/vMaterials_2/Glass/Glass_Clear.mdl" 159 | path_mat_glass_clear = "omniverse://localhost/NVIDIA/Materials/vMaterials_2/Glass/Glass_Clear.mdl" 160 | # load more 161 | success, result = omni.kit.commands.execute('CreateMdlMaterialPrimCommand', 162 | mtl_url=path_mat_glass_clear, # This can be path to local or remote MDL 163 | mtl_name='Glass_Clear', # sourceAsset:subIdentifier (i.e. the name of the material within the MDL) 164 | mtl_path="/World/Looks/Glass_Clear" # Prim path for the Material to create. 165 | ) 166 | t = UsdShade.Material(self._world.stage.GetPrimAtPath("/World/Looks/Glass_Clear")) 167 | 168 | path_mat_metal_aluminum = "omniverse://localhost/NVIDIA/Materials/vMaterials_2/Metal/Aluminum.mdl" 169 | success, result = omni.kit.commands.execute('CreateMdlMaterialPrimCommand', 170 | mtl_url=path_mat_glass_clear, # This can be path to local or remote MDL 171 | mtl_name='Aluminum', 172 | mtl_path="/World/Looks/Aluminum" # Prim path for the Material to create. 
173 | ) 174 | s = UsdShade.Material(self._world.stage.GetPrimAtPath("/World/Looks/Aluminum")) 175 | 176 | return { 177 | 'transparent': [t], # TODO add more 178 | 'specular': [s] # TODO add more 179 | } 180 | 181 | # deprecated 182 | def create_rep_object(self, surface_center_pos): 183 | test_model = rep.create.from_usd(f"file:///home/songlin/Projects/DREDS/DepthSensorSimulator/cad_model/02691156/1c93b0eb9c313f5d9a6e43b878d5b335_converted/model_obj.usd", 184 | semantics=[("class", "test")]) 185 | 186 | test_ball = rep.create.sphere(name="test_ball", position=surface_center_pos, scale=(0.1, 0.1, 0.1)) 187 | with test_model: 188 | rep.physics.collider() 189 | rep.physics.rigid_body( 190 | # velocity=rep.distribution.uniform((-0,0,-0),(0,0,1)), 191 | # angular_velocity=rep.distribution.uniform((-0,0,-100),(0,0,0)) 192 | ) 193 | 194 | 195 | 196 | def start(self): 197 | # self.debug = 0 198 | # Find the desired surface 199 | # for surface_config in self._config["hssd"]['surfaces']: 200 | # surface = self._config["hssd"]['surface'] 201 | self.replicator.render() 202 | 203 | """ def randomize_texture(self, dred_models): 204 | materials = create_materials(self._world.stage, len(dred_models)) 205 | assets_root_path = get_assets_root_path() 206 | textures = [ 207 | assets_root_path + "/NVIDIA/Materials/vMaterials_2/Ground/textures/aggregate_exposed_diff.jpg", 208 | assets_root_path + "/NVIDIA/Materials/vMaterials_2/Ground/textures/gravel_track_ballast_diff.jpg", 209 | assets_root_path + "/NVIDIA/Materials/vMaterials_2/Ground/textures/gravel_track_ballast_multi_R_rough_G_ao.jpg", 210 | assets_root_path + "/NVIDIA/Materials/vMaterials_2/Ground/textures/rough_gravel_rough.jpg", 211 | ] 212 | 213 | delay=0.2 214 | initial_materials = {} 215 | for i, shape in dred_models.items(): #enumerate(): 216 | cur_mat, _ = UsdShade.MaterialBindingAPI(shape).ComputeBoundMaterial() 217 | initial_materials[shape] = cur_mat 218 | UsdShade.MaterialBindingAPI(shape).Bind(materials[i-1], UsdShade.Tokens.strongerThanDescendants) 219 | 220 | for mat in materials: 221 | shader = UsdShade.Shader(omni.usd.get_shader_from_material(mat, get_prim=True)) 222 | # diffuse_texture = np.random.choice(textures) 223 | # shader.GetInput("diffuse_texture").Set(diffuse_texture) 224 | 225 | # project_uvw = np.random.choice([True, False], p=[0.9, 0.1]) 226 | # shader.GetInput("project_uvw").Set(bool(project_uvw)) 227 | 228 | # texture_scale = np.random.uniform(0.1, 1) 229 | # shader.GetInput("texture_scale").Set((texture_scale, texture_scale)) 230 | 231 | # texture_rotate = np.random.uniform(0, 45) 232 | # shader.GetInput("texture_rotate").Set(texture_rotate) 233 | 234 | shader.GetInput("metallic_constant").Set(1.0) 235 | shader.GetInput("reflection_roughness_constant").Set(0.0) """ 236 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

 2 | # D3RoMa: Disparity Diffusion-based Depth Sensing for Material-Agnostic Robotic Manipulation
 3 | 
 4 | CoRL 2024, Munich, Germany.
 5 | 
 6 | 
 7 | 
 8 | 
 9 | [Paper arXiv](https://arxiv.org/abs/2409.14365)
 10 | 
 11 | [Project Page](https://pku-epic.github.io/D3RoMa/)
 12 | 
 13 | [Open Review](https://openreview.net/forum?id=7E3JAys1xO)
15 |

 16 | 
 17 | This is the official repository of [**D3RoMa: Disparity Diffusion-based Depth Sensing for Material-Agnostic Robotic Manipulation**](https://arxiv.org/abs/2409.14365).
 18 | 
 19 | For more information, please visit our [**project page**](https://pku-epic.github.io/D3RoMa/).
 20 | 
 21 | [Songlin Wei](https://songlin.github.io/),
 22 | [Haoran Geng](https://geng-haoran.github.io/),
 23 | [Jiayi Chen](https://jychen18.github.io/),
 24 | [Congyue Deng](https://cs.stanford.edu/~congyue/),
 25 | [Wenbo Cui](#),
 26 | [Chengyang Zhao](https://chengyzhao.github.io/),
 27 | [Xiaomeng Fang](#),
 28 | [Leonidas Guibas](https://geometry.stanford.edu/member/guibas/), and
 29 | [He Wang](https://hughw19.github.io/)
 30 | 
 31 | 
 32 | 
 33 | ## 💡 Updates (Feb 27, 2025)
 34 | - [x] Released example code for generating IR stereo images with Isaac Sim 4.0.0
 35 | - [x] Released a new model variant (Cond. on RGB+Raw); please check out the updated `inference.py`
 36 | - [x] Released training protocols and datasets
 37 | 
 38 | 
 39 | 
 40 | Our method robustly predicts transparent (bottles) and specular (basin and cups) object depths in tabletop environments and beyond.
 41 | ![teaser](assets/in-the-wild.png)
 42 | 
 43 | 
 44 | 
 45 | ## INSTALLATION
 46 | ```
 47 | conda create --name d3roma python=3.8
 48 | conda activate d3roma
 49 | 
 50 | # install dependencies with pip
 51 | pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113
 52 | pip install huggingface_hub==0.24.5
 53 | pip install diffusers opencv-python scikit-image matplotlib transformers datasets accelerate tensorboard imageio open3d kornia
 54 | pip install hydra-core --upgrade
 55 | ```
 56 | 
 57 | 
 58 | ## DOWNLOAD PRE-TRAINED WEIGHTS
 59 | 
 60 | + For model variant Cond. Left+Right+Raw: [Google drive](https://drive.google.com/file/d/12BLB7mKDbLPhW2UuJSmYnwBFokOjDvC9/view?usp=sharing), [百度云](https://pan.baidu.com/s/1u7n4wstGpqwAswp8ZbTNlw?pwd=o9nk)
 61 | + For model variant Cond. RGB+Raw: [Google drive](https://drive.google.com/file/d/1cTAUZ2lXBXe4-peHLUneJ6ufQTqFr6E9/view?usp=drive_link), [百度云](https://pan.baidu.com/s/1zWwdMQ2_6-CViaC2JUGsFA?pwd=bwwb)
 62 | ```
 63 | # Download the pretrained weights from Google Drive
 64 | # Extract them under the project folder
 65 | ```
 66 | 
 67 | ## RUN INFERENCE
 68 | You can run the following script to test our model. We provide two variants: `left+right+raw` for stereo cameras and `rgb+raw` for any RGBD camera:
 69 | ```
 70 | python inference.py
 71 | ```
 72 | This will generate three files under the folder `_outputs.{variant}`:
 73 | 
 74 | `_outputs.{variant}/pred.png`: the pseudo-colored depth map
 75 | 
 76 | `_outputs.{variant}/pred.ply`: the point cloud obtained by back-projecting the predicted depth
 77 | 
 78 | `_outputs.{variant}/raw.ply`: the point cloud obtained by back-projecting the raw camera depth
 79 | 
 80 | 
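For reference, both `.ply` files come from back-projecting a depth map through the camera intrinsics (internally this is done by `viz_cropped_pointcloud` from `utils/utils.py`). Below is a minimal sketch of the same idea, assuming generic pinhole intrinsics `fx, fy, cx, cy`; the function name and parameters are placeholders, not part of the repository's API:

```
import numpy as np

def backproject_depth(depth_m, fx, fy, cx, cy):
    """Back-project an HxW metric depth map into an Nx3 point cloud in the camera frame."""
    h, w = depth_m.shape
    u, v = np.meshgrid(np.arange(w), np.arange(h))
    x = (u - cx) * depth_m / fx
    y = (v - cy) * depth_m / fy
    pts = np.stack([x, y, depth_m], axis=-1).reshape(-1, 3)
    return pts[pts[:, 2] > 0]  # drop pixels without a valid depth
```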
 81 | ## Training
 82 | 
 83 | #### 1. Preparing Datasets
 84 | 
 85 | All datasets are linked under the folder `datasets`.
 86 | 
 87 | + Download [SceneFlow stereo](https://lmb.informatik.uni-freiburg.de/resources/datasets/SceneFlowDatasets.en.html)
 88 | 
 89 | + Download [DREDS](https://github.com/PKU-EPIC/DREDS#dataset)
 90 | 
 91 | + Download [HISS](https://drive.google.com/drive/folders/1BTbiHWIM_zQC85pz-NMVnYBdZvt1oxaV?usp=sharing)
 92 | 
 93 | + Download [Clearpose](https://github.com/opipari/ClearPose)
 94 | 
 95 | Example datasets folder structure:
 96 | 
 97 | ```
 98 | datasets
 99 | ├── clearpose -> /raid/songlin/Data/clearpose
 100 | │   ├── clearpose_downsample_100
 101 | │   │   ├── downsample.py
 102 | │   │   ├── model
 103 | │   │   ├── set1
 104 | │   │   ├── ...
 105 | │   ├── metadata
 106 | │   │   ├── set1
 107 | │   │   ├── ...
 108 | │   ├── model
 109 | │   │   ├── 003_cracker_box
 110 | │   │   ├── ...
 111 | │   ├── set1
 112 | │   │   ├── scene1
 113 | │   │   ├── ...
 114 | │   ├── ...
 115 | ├── DREDS
 116 | │   ├── test -> /raid/songlin/Data/DREDS_ECCV2022/DREDS-CatKnown/test
 117 | │   │   └── shapenet_generate_1216_val_novel
 118 | │   ├── test_std_catknown -> /raid/songlin/Data/DREDS_ECCV2022/STD-CatKnown
 119 | │   │   ├── test_0
 120 | │   │   ├── ...
 121 | │   ├── test_std_catnovel -> /raid/songlin/Data/DREDS_ECCV2022/STD-CatNovel
 122 | │   │   └── real_data_novel
 123 | │   ├── train -> /raid/songlin/Data/DREDS_ECCV2022/DREDS-CatKnown/train
 124 | │   │   ├── part0
 125 | │   │   ├── ...
 126 | │   └── val -> /raid/songlin/Data/DREDS_ECCV2022/DREDS-CatKnown/val
 127 | │   └── shapenet_generate_1216
 128 | ├── HISS
 129 | │   ├── train -> /raid/songlin/Data/hssd-isaac-sim-100k
 130 | │   │   ├── 102344049
 131 | │   │   ├── 102344280
 132 | │   │   ├── 103997586_171030666
 133 | │   │   ├── 107734119_175999932
 134 | │   │   └── bad_his.txt
 135 | │   └── val -> /raid/songlin/Data/hssd-isaac-sim-300hq
 136 | │   ├── 102344049
 137 | │   ├── 102344280
 138 | │   ├── 103997586_171030666
 139 | │   ├── 107734119_175999932
 140 | │   ├── 300hq.tar.gz
 141 | │   ├── bad_his.txt
 142 | │   └── simulation2
 143 | ├── sceneflow -> /raid/songlin/Data/sceneflow
 144 | │ ├── bad_sceneflow_test.txt
 145 | │ ├── bad_sceneflow_train.txt
 146 | │ ├── Driving
 147 | │ │   ├── disparity
 148 | │ │   ├── frames_cleanpass
 149 | │ │   ├── frames_finalpass
 150 | │ │   ├── raw_cleanpass
 151 | │ │   └── raw_finalpass
 152 | │ ├── FlyingThings3D
 153 | │ │   ├── disparity
 154 | │ │   ├── frames_cleanpass
 155 | │ │   ├── frames_finalpass
 156 | │ │   ├── raw_cleanpass
 157 | │ │   └── raw_finalpass
 158 | │ └── Monkaa
 159 | │ ├── disparity
 160 | │ ├── frames_cleanpass
 161 | │ ├── frames_finalpass
 162 | │ ├── raw_cleanpass
 163 | │ └── raw_finalpass
 164 | ├── README.md
 165 | ```
 166 | 
 167 | #### 2. Data Preprocessing - resize, compute raw disparity, and filter bad images
 168 | 
 169 | - We resize the `DREDS` dataset from `1270x720` to `640x360`, and convert raw depth to raw disparity using the resized resolution (see the depth-to-disparity sketch below).
 170 | 
 171 | - If a dataset does not provide **raw disparity**, we pre-compute it by running a stereo matching algorithm:
 172 | ```
 173 | # please make necessary changes to file paths, focal lengths and baselines etc.
 174 | # we adapted this file from DREDS.
 175 | python scripts/stereo_matching.py
 176 | ```
 177 | 
 178 | We also tried using [libSGM](https://github.com/fixstars/libSGM) to precompute disparity maps for SceneFlow.
 179 | The precomputed raw disparities are put under `raw_cleanpass` and `raw_finalpass` with the same sub-folder paths.
 180 | You can also download the [precomputed sceneflow raw disparities here](https://drive.google.com/file/d/1CZQvR-61IQ8o4n4ewNkVO9M3VCIIGHgr/view?usp=sharing).
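The conversion itself is the standard stereo relation `disparity = fx * baseline / depth`, where `fx` is the focal length in pixels at the resized resolution. A minimal sketch is shown below; `fx_orig`, `baseline_m`, and the widths are placeholders, not the actual DREDS/HISS calibration:

```
import numpy as np

def depth_to_raw_disparity(depth_m, fx_orig, baseline_m, orig_w, new_w):
    """Convert a metric depth map to disparity at the resized resolution."""
    fx = fx_orig * new_w / orig_w          # the focal length scales with the horizontal resize
    disp = np.zeros_like(depth_m, dtype=np.float32)
    valid = depth_m > 0                    # leave invalid depth pixels at zero disparity
    disp[valid] = fx * baseline_m / depth_m[valid]
    return disp
```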
 181 | 
 182 | - Sometimes the source stereo images are too challenging for computing raw disparities, so we filter them out during training.
 183 | We run the following scripts to filter out very bad raw disparities and exclude them in the dataloader:
 184 | 
 185 | ```
 186 | python scripts/check_sceneflow.py
 187 | python scripts/check_stereo.py
 188 | ```
 189 | 
 190 | #### 3. Download pre-trained Stable Diffusion
 191 | 
 192 | We use the v2.1 (resolution 768) version of Stable Diffusion.
 193 | 
 194 | Download the [stablediffusion v2.1-768 checkpoints](https://huggingface.co/stabilityai/stable-diffusion-2-1/tree/main) and put them under `checkpoint/stable-diffusion`.
 195 | 
 196 | Example folder structure after downloading (the checkpoint files were downloaded manually):
 197 | ```
 198 | checkpoint
 199 | └── stable-diffusion -> /home/songlin/Projects/diff-stereo/checkpoint/stable-diffusion
 200 | ├── feature_extractor
 201 | │ └── preprocessor_config.json
 202 | ├── model_index.json
 203 | ├── scheduler
 204 | │ └── scheduler_config.json
 205 | ├── text_encoder
 206 | │ ├── config.json
 207 | │ └── model.safetensors
 208 | ├── tokenizer
 209 | │ ├── merges.txt
 210 | │ ├── special_tokens_map.json
 211 | │ ├── tokenizer_config.json
 212 | │ └── vocab.json
 213 | ├── unet
 214 | │ ├── config.json
 215 | │ └── diffusion_pytorch_model.safetensors
 216 | ├── v2-1_768-nonema-pruned.safetensors
 217 | └── vae
 218 | ├── config.json
 219 | └── diffusion_pytorch_model.safetensors
 220 | 
 221 | ```
 222 | 
 223 | #### 4. Train
 224 | 
 225 | ```
 226 | # Because we already downloaded StableDiffusion's pretrained weights
 227 | export HF_HUB_OFFLINE=True
 228 | ```
 229 | 
 230 | We use Hugging Face Accelerate and train on 8 A100-40G GPUs:
 231 | ```
 232 | cd 
 233 | conda activate d3roma
 234 | accelerate config
 235 | ```
 236 | 
 237 | We train the variant `left+right+raw` using the datasets `SceneFlow`, `DREDS`, and `HISS`. This variant is suitable for working with stereo cameras.
 238 | ```
 239 | accelerate launch train.py \
 240 | task=train_ldm_mixed_left+right+raw \
 241 | task.tag=release \
 242 | task.eval_num_batch=10 \
 243 | task.val_every_global_steps=5000
 244 | ```
 245 | 
 246 | We train the variant `rgb+raw` using the datasets `DREDS`, `HISS` and `ClearPose`. This variant is suitable for working with RGBD cameras.
 247 | 
 248 | ```
 249 | accelerate launch train.py \
 250 | task=train_ldm_mixed_rgb+raw \
 251 | task.tag=release \
 252 | task.eval_num_batch=10 \
 253 | task.val_every_global_steps=5000
 254 | ```
 255 | 
 256 | #### 5. Run TensorBoard to monitor the training process
 257 | 
 258 | ```
 259 | tensorboard --logdir experiments --port 20000
 260 | ```
 261 | 
 262 | #### 6. Distributed Evaluation
 263 | 
 264 | If you want to run evaluation in parallel on the test datasets:
 265 | ```
 266 | accelerate launch distributed_evaluate.py task=...
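# for example (hypothetical run directory; pick any eval task from conf/task and point it at your checkpoint):
# accelerate launch distributed_evaluate.py task=eval_ldm_mixed_rgb+raw task.resume_pretrained=experiments/<your_run>/epoch_XXXX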
267 | ``` 268 | 269 | ## Reproducing results in Paper 270 | ``` 271 | accelerate launch train.py task=train_dreds_reprod 272 | 273 | accelerate launch train.py task=train_clearpose 274 | 275 | accelerate launch train.py task=train_syntodd_rgbd 276 | 277 | accelerate launch train.py task=train_sceneflow 278 | ``` 279 | 280 | ## Contact 281 | If you have any questions please contact us: 282 | 283 | Songlin Wei: slwei@stu.pku.edu.cn, Haoran Geng: ghr@berkeley.edu, He Wang: hewang@pku.edu.cn 284 | 285 | ## Citation 286 | ``` 287 | @inproceedings{ 288 | wei2024droma, 289 | title={D3RoMa: Disparity Diffusion-based Depth Sensing for Material-Agnostic Robotic Manipulation}, 290 | author={Songlin Wei and Haoran Geng and Jiayi Chen and Congyue Deng and Cui Wenbo and Chengyang Zhao and Xiaomeng Fang and Leonidas Guibas and He Wang}, 291 | booktitle={8th Annual Conference on Robot Learning}, 292 | year={2024}, 293 | url={https://openreview.net/forum?id=7E3JAys1xO} 294 | } 295 | ``` 296 | 297 | 298 | ## License 299 | 300 | This work and the dataset are licensed under [CC BY-NC 4.0][cc-by-nc]. 301 | 302 | [![CC BY-NC 4.0][cc-by-nc-image]][cc-by-nc] 303 | 304 | [cc-by-nc]: https://creativecommons.org/licenses/by-nc/4.0/ 305 | [cc-by-nc-image]: https://licensebuttons.net/l/by-nc/4.0/88x31.png -------------------------------------------------------------------------------- /isaacsim/custom_writer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import io 3 | import json, math, copy 4 | import numpy as np 5 | # import cv2 6 | import warp as wp 7 | os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1" 8 | # import open3d as o3d 9 | from omni.replicator.core import AnnotatorRegistry, BackendDispatch, Writer, BasicWriter, WriterRegistry 10 | 11 | def rgb2gray(rgb): 12 | return np.dot(rgb[...,:3], [0.2989, 0.5870, 0.1140]) 13 | 14 | def colorize_normals(data): 15 | colored_data = ((data * 0.5 + 0.5) * 255).astype(np.uint8) 16 | return colored_data 17 | 18 | class ColorWriter(BasicWriter): 19 | def __init__( 20 | self, 21 | **kwargs 22 | ): 23 | self.version = "0.0.1" 24 | if "semantic_segmentation" in kwargs: 25 | del kwargs["semantic_segmentation"] 26 | if "distance_to_image_plane" in kwargs: 27 | del kwargs["distance_to_image_plane"] 28 | if "pointcloud" in kwargs: 29 | del kwargs["pointcloud"] 30 | 31 | if "disparity" in kwargs: 32 | del kwargs["disparity"] 33 | 34 | interval = kwargs.pop("interval", 1) 35 | ticker = kwargs.pop("ticker", None) 36 | 37 | if "start_sequence_id" in kwargs: # keep it simple here 38 | start_sequence_id = kwargs["start_sequence_id"] 39 | assert start_sequence_id >= 0, "start_sequence_id must be >= 0" 40 | del kwargs["start_sequence_id"] 41 | 42 | super().__init__(**kwargs) 43 | 44 | self._frame_id = 0 45 | self._sequence_id = start_sequence_id 46 | self._start_sequence_id = start_sequence_id 47 | self._interval = interval 48 | self._ticker = ticker 49 | if self._ticker is None: 50 | self._ticker = lambda: self._frame_id 51 | 52 | def write(self, data: dict): 53 | if self._ticker()[0] == "rgb": 54 | for annotator, val in data["annotators"].items(): 55 | if annotator.startswith("rgb"): 56 | file_path = f"{self._output_dir}/{self._sequence_id:04d}_color.png" 57 | self._backend.write_image(file_path, val["RenderProduct_CameraRGB"]["data"]) 58 | # print(f"rendered color {self._sequence_id:04d}") 59 | self._sequence_id += 1 60 | self._frame_id += 1 61 | 62 | def _write_rgb(self, data: dict, render_product_path: str, annotator: str): 63 | 
file_path = f"{render_product_path}rgb_{self._sequence_id}{self._frame_id:0{self._frame_padding}}.{self._image_output_format}" 64 | self._backend.write_image(file_path, data[annotator]) 65 | 66 | def on_final_frame(self): # reset 67 | self._frame_id = 0 68 | self._sequence_id = self._start_sequence_id 69 | 70 | class GtWriter(BasicWriter): 71 | """ not only render depth, but also render semantic / masks / pointcloud, etc. """ 72 | 73 | def __init__(self, interval=1, depth_sensor_cfg=dict(), **kwargs): 74 | self.version = "0.0.1" 75 | ticker = kwargs.pop("ticker") 76 | 77 | conifg = copy.copy(kwargs) 78 | # kwargs = dict(conifg["writer_config"]) 79 | 80 | if "rgb" in kwargs: 81 | del kwargs["rgb"] 82 | 83 | if "disparity" in kwargs: # hack 84 | self.render_disparity = kwargs["disparity"] 85 | self.depth_sensor_cfg = depth_sensor_cfg 86 | self.set_render_disparity() 87 | del kwargs["disparity"] 88 | else: 89 | self.render_disparity = False 90 | # kwargs["pointcloud_include_unlabelled"] = True 91 | 92 | if "start_sequence_id" in kwargs: 93 | start_sequence_id = kwargs["start_sequence_id"] 94 | assert start_sequence_id >= 0, "start_sequence_id must be >= 0" 95 | del kwargs["start_sequence_id"] 96 | 97 | 98 | 99 | super().__init__(**kwargs) 100 | self._frame_id = 0 101 | self._sequence_id = start_sequence_id 102 | self._start_sequence_id = start_sequence_id 103 | self._interval = interval 104 | self._ticker = ticker 105 | self._last_tick = None 106 | 107 | def set_render_disparity(self): 108 | FOV = np.deg2rad(self.depth_sensor_cfg["fov"]) 109 | W = self.depth_sensor_cfg["resolution"][0] 110 | # H = cfg["depth_sensor"]["resolution"][1] 111 | focal = W / (2 * math.tan(FOV / 2)) 112 | # assert np.allclose(focal, 446.31), "do you have the correct focal length?" 
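        # note: `focal` is the horizontal focal length in pixels derived from the configured FOV;
        # multiplied by the IR baseline below it gives `fxb`, and the raw disparity written later is fxb / depth.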
113 | 114 | baseline = self.depth_sensor_cfg["placement"]["rgb_to_right_ir"] - self.depth_sensor_cfg["placement"]["rgb_to_left_ir"] 115 | assert np.isclose(baseline, 0.055), "wrong baseline" 116 | self.fxb = focal * baseline 117 | 118 | def write(self, data: dict): 119 | 120 | def write_exr(path, data, exr_flag=None): 121 | """ fix for isaac-sim 2022.2.1 """ 122 | import imageio 123 | if isinstance(data, wp.array): 124 | data = data.numpy() 125 | 126 | # Download freeimage dll, will only download once if not present 127 | # from https://imageio.readthedocs.io/en/v2.8.0/format_exr-fi.html#exr-fi 128 | imageio.plugins.freeimage.download() 129 | if exr_flag == None: 130 | exr_flag = imageio.plugins.freeimage.IO_FLAGS.EXR_ZIP 131 | 132 | exr_bytes = imageio.imwrite( 133 | imageio.RETURN_BYTES, 134 | data, 135 | format="exr", 136 | flags=exr_flag, 137 | ) 138 | self._backend.write_blob(path, exr_bytes) 139 | 140 | if self._ticker()[0] == "gt": 141 | if self._last_tick is not None and self._ticker()[1] == self._last_tick: 142 | return # hack to avoid duplicate frames (only happens for GT writer on isaac-sim 2023 hotfix) 143 | for annotator, val in data["annotators"].items(): 144 | if annotator.startswith("distance_to_image_plane"): 145 | # file_path = f"{self._output_dir}/{self._sequence_id:04d}_depth.png" 146 | # self._backend.write_image(file_path, (data[annotator]*1000).astype(np.uint16)) 147 | # file_path = f"{self._output_dir}/{self._sequence_id:04d}_depth.npy" 148 | # self._backend.write_array(file_path, data[annotator]) 149 | 150 | file_path_exr = f"{self._output_dir}/{self._sequence_id:04d}_depth.exr" 151 | # self._backend.write_exr(file_path_exr, data[annotator]) 152 | # cv2.imwrite(file_path_exr, data[annotator]) 153 | write_exr(file_path_exr, val["RenderProduct_CameraDepth"]["data"]) 154 | 155 | if self.render_disparity: 156 | assert self.fxb is not None, "please call set_render_disparity() first" 157 | disparity = self.fxb / val["RenderProduct_CameraDepth"]["data"] 158 | # file_path = f"{self._output_dir}/{self._sequence_id:04d}_disp.npy" 159 | file_path_exr = f"{self._output_dir}/{self._sequence_id:04d}_disp.exr" 160 | # self._backend.write_array(file_path, disparity) 161 | # self._backend.write_exr(file_path_exr, disparity) 162 | # cv2.imwrite(file_path_exr, disparity) 163 | write_exr(file_path_exr, disparity) 164 | 165 | if annotator.startswith("semantic_segmentation"): 166 | semantic_seg_data = val["RenderProduct_CameraDepth"]["data"] 167 | height, width = semantic_seg_data.shape[:2] 168 | 169 | file_path = (f"{self._output_dir}/{self._sequence_id:04d}_mask.png") 170 | if self.colorize_semantic_segmentation: 171 | semantic_seg_data = semantic_seg_data.view(np.uint8).reshape(height, width, -1) 172 | self._backend.write_image(file_path, semantic_seg_data) 173 | else: 174 | semantic_seg_data = semantic_seg_data.view(np.uint32).reshape(height, width) 175 | self._backend.write_image(file_path, semantic_seg_data) 176 | 177 | id_to_labels = val["RenderProduct_CameraDepth"]["idToLabels"] 178 | file_path = f"{self._output_dir}/{self._sequence_id:04d}_mask.json" 179 | buf = io.BytesIO() 180 | buf.write(json.dumps({str(k): v for k, v in id_to_labels.items()}).encode()) 181 | self._backend.write_blob(file_path, buf.getvalue()) 182 | 183 | if annotator.startswith("normals"): 184 | normals_data = val["RenderProduct_CameraDepth"]["data"] 185 | file_path_normal = f"{self._output_dir}/{self._sequence_id:04d}_normal.png" 186 | colorized_normals_data = colorize_normals(normals_data) 187 | 
self._backend.write_image(file_path_normal, colorized_normals_data) 188 | 189 | if annotator.startswith("pointcloud"): 190 | pointcloud_data = data[annotator]["data"] 191 | file_path = f"{self._output_dir}/{self._sequence_id:04d}_pcd.npy" 192 | self._backend.write_array(file_path, pointcloud_data) 193 | 194 | pointcloud_rgb = data[annotator]["info"]["pointRgb"].reshape(-1, 4) 195 | rgb_file_path = f"{self._output_dir}/{self._sequence_id:04d}_pcd_rgb.npy" 196 | self._backend.write_array(rgb_file_path, pointcloud_rgb) 197 | 198 | """ pcd = o3d.geometry.PointCloud() 199 | pcd.points = o3d.utility.Vector3dVector(pointcloud_data.astype(np.float32).reshape(-1, 3)) 200 | o3d.io.write_point_cloud(file_path, pcd) """ 201 | self._last_tick = self._ticker()[1] 202 | # print(f"rendered gt {self._sequence_id:04d}") 203 | self._sequence_id += 1 204 | self._frame_id += 1 205 | 206 | def on_final_frame(self): 207 | self._frame_id = 0 208 | self._sequence_id = self._start_sequence_id 209 | 210 | class IRWriter(Writer): 211 | def __init__( 212 | self, 213 | output_dir, 214 | start_sequence_id=0, 215 | interval=1, 216 | ticker=None, 217 | ): 218 | self.version = "0.0.1" 219 | self.backend = BackendDispatch({"paths": {"out_dir": output_dir}}) 220 | self.annotators.append(AnnotatorRegistry.get_annotator("rgb")) 221 | self._output_dir = output_dir 222 | self._interval = interval 223 | 224 | assert start_sequence_id >= 0, "start_sequence_id must be >= 0" 225 | self._frame_id = 0 226 | self._sequence_id = start_sequence_id 227 | self._start_sequence_id = start_sequence_id 228 | self._ticker = ticker 229 | if self._ticker is None: 230 | self._ticker = lambda: self._frame_id 231 | 232 | def write(self, data: dict): 233 | if self._ticker()[0] == "ir": 234 | for annotator in data.keys(): 235 | if annotator.startswith("rgb"): 236 | # ir_name = 'ir_l' if 'Left' in annotator else 'ir_r' 237 | ir_name = 'ir_l' if '01' in annotator else 'ir_r' # HACK 238 | filename = f"{self._output_dir}/{self._sequence_id:04d}_{ir_name}.png" 239 | self.backend.write_image(filename, rgb2gray(data[annotator]).astype(np.uint8)) 240 | # print(f"rendered ir {self._sequence_id:04d}") 241 | self._sequence_id += 1 242 | self._frame_id += 1 243 | 244 | def on_final_frame(self): 245 | self._frame_id = 0 246 | self._sequence_id = self._start_sequence_id 247 | 248 | 249 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from utils.camera import DepthCamera, RGBDCamera, Realsense 3 | 4 | from diffusers import DDPMScheduler, HeunDiscreteScheduler, EulerDiscreteScheduler, DDIMScheduler 5 | from core.scheduler_ddpm import MyDDPMScheduler 6 | from core.scheduler_ddim import MyDDIMScheduler 7 | from typing import List, Union, Optional, Tuple 8 | from omegaconf import MISSING, OmegaConf 9 | from omegaconf import DictConfig, OmegaConf, ValidationError 10 | from hydra.core.config_store import ConfigStore 11 | 12 | supported_samplers = { 13 | 'ddpm': DDPMScheduler, 14 | 'euler': EulerDiscreteScheduler, 15 | 'heun': HeunDiscreteScheduler, 16 | 'ddim': DDIMScheduler, 17 | 'my_ddim': MyDDIMScheduler, 18 | 'my_ddpm': MyDDPMScheduler 19 | } 20 | 21 | @dataclass 22 | class Augment: 23 | resizedcrop: dict = field(default_factory=lambda: { 24 | 'scale': [2, 2], 25 | 'ratio': [1.33333333333333,1.33333333333333333333] 26 | }) 27 | hflip: str = "h" # off 28 | #==== raft stereo augmentation ====# 29 
| min_scale: float = 0 # -0.2 30 | max_scale: float = 0 # 0.4 31 | saturation_range: List[float] = field(default_factory=lambda: [0, 1.4]) 32 | gamma: List[float] = field(default_factory=lambda: [1,1,1,1]) 33 | yjitter: bool =False 34 | 35 | @dataclass 36 | class TrainingConfig: 37 | name: Optional[str] = "your task name here" 38 | tag: str = "" # your tag here 39 | camera_resolution: str = "320x256" # "224x128" # WxH dataset camera resolution, default "640x360" 40 | image_size: Tuple[int] = field(default_factory=lambda: (256, 320)) # (128, 224) #(352, 640) # [h,w] training image size 41 | divis_by: int = 32 42 | # image_size: tuple = (126, 224) # (128, 224) #(352, 640) # [h,w] training image size 43 | depth_channels: int = 1 44 | cond_channels: str = "rgb" # "rgb+raw" # "left+right" # "rgb+left+right" # "left+right+raw" # "left+right+raw" 45 | train_batch_size: int = 12 # 16 46 | eval_batch_size: int = 12 47 | eval_num_batch: int = 2 # if set to -1, will evaluate whole val set 48 | 49 | num_epochs: int = 1000 50 | gradient_accumulation_steps: int = 3 51 | clip_grad_norm: float = 1.0 52 | 53 | lr_warmup_steps: int = 500 54 | val_every_global_steps: int = 1000 55 | save_model_epochs: int = 10 56 | mixed_precision: str = "no" # `no` for float32, `fp16` for automatic mixed precision 57 | 58 | push_to_hub: bool = False # whether to upload the saved model to the HF Hub 59 | hub_model_id: str = "/" # the name of the repository to create on the HF Hub 60 | hub_private_repo: bool = False 61 | overwrite_output_dir: bool = True # overwrite the old model when re-running the notebook 62 | # seed: int = 0 63 | 64 | train_dataset: List[str] = field(default_factory=lambda: ['NYUv2']) #"std_100k" # 65 | eval_dataset: List[str] = field(default_factory=lambda: ['NYUv2']) #"std_100k" # 66 | dataset_weight: List[int] = field(default_factory=lambda: [1]) 67 | dataset_variant: str = "default" 68 | 69 | #### training settings 70 | ldm: bool = True 71 | prediction_space: str = "depth" # or "disp" ? 72 | ssi: bool = False 73 | # data normalizer 74 | normalize_mode: str = "average" 75 | num_chs: int = 3 76 | ch_bounds: List[float] = field(default_factory=lambda: [256, 256, 256])#[64, 64, 128] 77 | ch_gammas: List[float] = field(default_factory=lambda: [1/3., 1/3., 1/3. 
])#[1., 1/3, 1/3] 78 | norm_t: float = 0.5 79 | norm_s: float = 2.0 80 | 81 | num_train_timesteps: int = 128 #1000 # diff-11 82 | num_inference_timesteps: int = 128 #1000 # diff-11 83 | num_inference_rounds: int = 1 84 | noise_strategy: str = 'randn' # ['randn', 'pyramid'] 85 | loss_type: str = "l1" # "mse" 86 | learning_rate: float = 1e-4 87 | clip_gradient: bool = False 88 | 89 | #### scheduler 90 | clip_sample: bool = True 91 | clip_sample_range: float = 1.0 92 | thresholding: bool = False 93 | dynamic_thresholding_ratio: float = 0.995 94 | num_cycles: int = 1 95 | beta_schedule: str = "squaredcos_cap_v2" # "linear" 96 | beta_start: float = 1e-4 97 | beta_end: float = 2e-2 98 | noise_rgb: bool = False 99 | 100 | sampler: str = "my_ddpm" 101 | prediction_type: str = "v_prediction" # "sample" # "epsilon" # 102 | 103 | #### guidance settings 104 | flow_guidance_weights: List[float] = field(default_factory=lambda: [0.0]) 105 | perturb_start_ratio: float = 1.0 # @deprecated 106 | guide_source: Optional[Union[str, None]] = None # "raw|stereo-match" 107 | flow_guidance_mode: str = "imputation" 108 | 109 | #### evaluation settings 110 | eval_output: str = "" 111 | eval_split: str = "val" # "test" 112 | write_pcd: bool = False 113 | num_intermediate_images: int = 8 114 | plot_mask: bool = False 115 | plot_error_map: bool = True 116 | plot_denoised_images: bool = True 117 | plot_intermediate_images: bool = False 118 | plot_intermediate_metrics: bool = False 119 | experiment_dir: str = "experiments" 120 | safe_ssi: bool = False # do ransac when align scales, only valid when ssi is on, should be turn off when training 121 | ransac_error_threshold: float = 0.6 # squared error, 0.6 works for nyu 122 | ensemble: bool = False 123 | coarse_to_fine: bool = False 124 | 125 | #### resume checkpoints 126 | resume_pretrained: Optional[str] = "" 127 | resume_ckpt: Optional[str] = "" 128 | 129 | #### experiment output directory, will be overriden automatically 130 | output_dir: Optional[str] = "" 131 | 132 | augment: Augment=field(default_factory=Augment) #Augment= MISSING # 133 | 134 | ### networks 135 | block_out_channels: Tuple[int] = field(default_factory=lambda: (128, 128, 256, 256, 512, 512)) 136 | lr_scheduler: Optional[str] = "cosine" 137 | 138 | @dataclass 139 | class Config: 140 | debug: bool = False 141 | seed: int = -1 142 | task: TrainingConfig = MISSING 143 | 144 | def setup_hydra_configurations(): 145 | # setup hydra configurations 146 | cs = ConfigStore.instance() 147 | cs.store(name="base_config", node=Config) 148 | 149 | cs = ConfigStore.instance() 150 | cs.store( 151 | group="task", 152 | name="cfg", 153 | node=TrainingConfig 154 | ) 155 | 156 | def get_output_dir(base_config: Config): 157 | config = base_config.task 158 | ssi = "ssi" if config.ssi else "nossi" 159 | datasets = "_".join(config.train_dataset) 160 | weights = "_".join(format(x, ".1f") for x in config.flow_guidance_weights) 161 | tag = "" if config.tag=="" else f"-{config.tag}" 162 | 163 | return f"{config.experiment_dir}/{config.name}{tag}.dep{config.depth_channels}.lr{config.learning_rate:.0e}.{config.prediction_type}.{ssi}.{config.beta_schedule}.{config.noise_strategy}." + \ 164 | f"{config.sampler}{config.num_train_timesteps}." + \ 165 | f"{datasets}.{config.image_size[0]}x{config.image_size[1]}.{config.cond_channels}." 
+ \ 166 | f"w{weights}" + ("_debug" if base_config.debug else "") 167 | 168 | def set_debug(config: TrainingConfig): 169 | config.val_every_global_steps = 10 #1000# 170 | config.save_model_epochs = 1 171 | config.train_batch_size = 1 172 | config.eval_batch_size = 1 173 | config.beta_schedule = "linear" 174 | config.beta_start = 1e-4 175 | config.beta_end = 2e-1 176 | # config.dataset = "nyu_depth_v2" # "std_debug" #720x360 177 | config.num_train_timesteps = 128 # 128# 178 | config.num_inference_timesteps = 128 # 128# 179 | config.num_intermediate_images = 4 180 | # config.output_dir = f"{config.output_dir}_debug" 181 | 182 | def create_sampler(config, train=True): 183 | if config.sampler not in supported_samplers.keys(): 184 | raise ValueError("Sampler not found") 185 | 186 | opt = { 187 | "num_train_timesteps": config.num_train_timesteps if train else config.num_inference_timesteps 188 | } 189 | 190 | if train: 191 | assert "ddim" not in config.sampler, "DDIM should not be used for training" 192 | 193 | opt["clip_sample"] = config.clip_sample 194 | opt["prediction_type"] = config.prediction_type 195 | opt["beta_schedule"] = config.beta_schedule 196 | opt["beta_start"] = config.beta_start 197 | opt["beta_end"] = config.beta_end 198 | opt["num_train_timesteps"] = config.num_train_timesteps 199 | 200 | if config.sampler == "my_ddpm" or config.sampler == "ddpm": 201 | opt["clip_sample_range"] = config.clip_sample_range 202 | opt["thresholding"] = config.thresholding 203 | opt["dynamic_thresholding_ratio"] = config.dynamic_thresholding_ratio 204 | elif config.sampler == "my_ddim" or config.sampler == "ddim": 205 | opt["set_alpha_to_one"] = False 206 | opt["skip_prk_steps"] = True 207 | opt["steps_offset"] = 1 208 | opt["trained_betas"] = None 209 | else: 210 | raise ValueError("Sampler may not be configured properly?!") 211 | 212 | return supported_samplers[config.sampler].from_config(opt) 213 | 214 | ########### TESTING BELOW, INGNORE ############# 215 | 216 | def plot_iddpm_figure_1(): 217 | def distortion(delta, sqared_err): 218 | # return ( math.log(1/math.sqrt(2*math.pi)) - math.log(delta) - 0.5 * sqared_err / delta**2) 219 | log_scales = th.FloatTensor([0.5 * math.log(delta)]) # 0.5 * log_variance 220 | centered_x = 0.95/256/256 221 | x = th.FloatTensor([0.5]) 222 | 223 | inv_stdv = th.exp(-log_scales) 224 | plus_in = inv_stdv * (centered_x + 1.0 / 255.0) 225 | cdf_plus = approx_standard_normal_cdf(plus_in) 226 | min_in = inv_stdv * (centered_x - 1.0 / 255.0) 227 | cdf_min = approx_standard_normal_cdf(min_in) 228 | log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12)) 229 | log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12)) 230 | cdf_delta = cdf_plus - cdf_min 231 | log_probs = th.where( 232 | x < -0.999, 233 | log_cdf_plus, 234 | th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))), 235 | ) 236 | assert log_probs.shape == x.shape 237 | return log_probs 238 | 239 | # config.set_debug() 240 | T = 4000 241 | config.num_train_timesteps = config.num_inference_timesteps = T 242 | config.beta_schedule = "squaredcos_cap_v2" 243 | scheduler = create_sampler(config) 244 | print(distortion(scheduler.betas[0], 0.95**2)) 245 | print(normal_kl(scheduler.alphas_cumprod[-1]**0.5*2, math.log(1-scheduler.alphas_cumprod[-1]), 0, 0)) # Section 4 ddpm: keep SNR at X_T ~= 1e-5 (=10**-5) 246 | 247 | vlb = [] 248 | for t in range(T): 249 | vlb.append( 250 | normal_kl(scheduler.alphas_cumprod[t]**0.5*2, math.log(1-scheduler.alphas_cumprod[t]), 0, 0) 251 | ) 252 | 253 | x = 
np.linspace(0, 1, T) 254 | y = scheduler.betas_tilde / scheduler.betas 255 | plt.plot(x, y, label="4000") 256 | 257 | T = 1000 258 | config.num_train_timesteps = config.num_inference_timesteps = T 259 | scheduler = create_sampler(config) 260 | print(distortion(scheduler.betas[0], 0.95**2)) 261 | x2 = np.linspace(0, 1, T) 262 | y2 = scheduler.betas_tilde / scheduler.betas 263 | 264 | plt.plot(x2, y2, label="1000") 265 | print(normal_kl(scheduler.alphas_cumprod[-1]**0.5, math.log(1-scheduler.alphas_cumprod[-1]), 0, 0)) # Section 4 ddpm: keep SNR at X_T ~= 1e-5 (=10**-5) 266 | 267 | T = 128 268 | config.num_train_timesteps = config.num_inference_timesteps = T 269 | scheduler = create_sampler(config) 270 | alphas_cumprod_128 = scheduler.alphas_cumprod 271 | print(distortion(scheduler.betas[0], 0.95**2)) 272 | x3 = np.linspace(0, 1, T) 273 | y3 = scheduler.betas_tilde / scheduler.betas 274 | plt.plot(x3, y3, label="128") 275 | plt.xlabel("t/T") 276 | plt.ylabel("~beta_t/beta_t") 277 | plt.legend(loc="upper right") 278 | plt.savefig("Figure 1.png") # Figure 1 in iDDPM 279 | print(normal_kl(scheduler.alphas_cumprod[-1], math.log(1-scheduler.alphas_cumprod[-1]), 0, 0)) # Section 4 ddpm: keep SNR at X_T ~= 1e-5 (=10**-5) 280 | 281 | def plot_iddpm_figure_2(): 282 | T = 128 283 | config.num_train_timesteps = config.num_inference_timesteps = T 284 | scheduler = create_sampler(config) 285 | x = np.linspace(0, 1, T) 286 | vlbs = [] 287 | for t in range(T): 288 | vlbs.append( 289 | normal_kl(0, math.log(1-scheduler.alphas_cumprod[t]), 0, 0) 290 | ) 291 | 292 | def plot_iddpm_figure_5(): 293 | T = 1000 294 | config.num_train_timesteps = config.num_inference_timesteps = T 295 | config.beta_schedule = "linear" 296 | scheduler = create_sampler(config) 297 | alphas_cumprod_linear = scheduler.alphas_cumprod 298 | 299 | T = 1000 300 | config.num_train_timesteps = config.num_inference_timesteps = T 301 | config.beta_schedule = "squaredcos_cap_v2" 302 | scheduler = create_sampler(config) 303 | alphas_cumprod_cosine = scheduler.alphas_cumprod 304 | 305 | x = np.linspace(0, 1, T) 306 | plt.figure() 307 | plt.plot(x, alphas_cumprod_linear, label="linear") 308 | plt.plot(x, alphas_cumprod_cosine, label="cosine") 309 | plt.legend(loc="upper right") 310 | plt.xlabel("diffusion step t/T") 311 | plt.ylabel("alpha bar") 312 | plt.savefig("Figure 5.png") 313 | 314 | def plot_snr(): 315 | T = 128 316 | config.num_train_timesteps = T 317 | config.beta_schedule = "linear" 318 | scheduler = create_sampler(config) 319 | plt.figure() 320 | 321 | x = np.linspace(0, T, T) 322 | snr_linear = scheduler.alphas_cumprod / ( 1-scheduler.alphas_cumprod) 323 | # plt.plot(x, snr_linear, label="SNR Linear") 324 | plt.plot(x, snr_linear ** 0.5, label="sqrt SNR Linear") 325 | # plt.plot(x, th.log(snr_linear), label="log SNR Linear") 326 | 327 | config.beta_schedule = "squaredcos_cap_v2" 328 | scheduler = create_sampler(config) 329 | 330 | x = np.linspace(0, T, T) 331 | snr_cosine = scheduler.alphas_cumprod / ( 1-scheduler.alphas_cumprod) 332 | # plt.plot(x, snr_cosine, label="SNR cosine") 333 | plt.plot(x, snr_cosine ** 0.5, label="sqrt SNR cosine") 334 | # plt.plot(x, th.log(snr_cosine), label="log SNR cosine") 335 | plt.xlabel("t/T") 336 | plt.ylabel("SNR") 337 | plt.legend(loc="upper right") 338 | plt.savefig("Figure_SNR.png") 339 | 340 | def plot_sample_t(): 341 | T = 128 342 | config.num_train_timesteps = T 343 | config.beta_schedule = "squaredcos_cap_v2" 344 | scheduler = create_sampler(config) 345 | snr_cosine = 
scheduler.alphas_cumprod / ( 1-scheduler.alphas_cumprod) 346 | from core.resample import create_named_schedule_sampler 347 | t_sampler = create_named_schedule_sampler("snr", (snr_cosine ** 0.5 + 1).cpu().numpy()) 348 | timestemps, weights = t_sampler.sample(128, "cpu") 349 | # print(timestemps, weights) 350 | plt.figure() 351 | fig, axs = plt.subplots(1, 2, sharey=True, tight_layout=True) 352 | axs[0].hist(timestemps, bins=T) 353 | axs[1].hist(weights, bins=T) 354 | # print(weights.mean()) 355 | plt.savefig("Figure_sampled_t.png") 356 | 357 | if __name__ == "__main__": # DEBUG & PLOT schdulers 358 | config = TrainingConfig() 359 | 360 | from utils.losess import * 361 | 362 | import matplotlib.pyplot as plt 363 | import numpy as np 364 | import torch as th 365 | import math 366 | 367 | # plot_iddpm_figure_1() 368 | # plot_iddpm_figure_2() 369 | # plot_iddpm_figure_5() 370 | plot_snr() 371 | plot_sample_t() 372 | 373 | """ 374 | # resolution is irrelanvent for predicting depth 375 | print(config.camera.resolution_str) 376 | print(config.camera.resolution) 377 | print(config.camera.fxb) 378 | 379 | fxb = config.camera.fxb #* 2.5 380 | disp = fxb / 0.75 381 | print(disp) 382 | disp_2 = fxb / (0.75 + 0.001) 383 | print(disp_2 - disp) 384 | print(f"{(disp_2 - disp) / disp * 100} %") """ 385 | 386 | 387 | 388 | 389 | 390 | 391 | -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import torch 4 | import numpy as np 5 | from functools import partial 6 | from utils.camera import Realsense 7 | 8 | def denormalize(config, pred_disps, raw_disp=None, mask=None): 9 | from utils.utils import Normalizer 10 | norm = Normalizer.from_config(config) 11 | 12 | if config.ssi: 13 | # assert config.depth_channels == 1, "fixme" 14 | B, R, H, W = pred_disps.shape 15 | # scale-shift invariant evaluation, consider using config.safe_ssi if the ssi computation is not stable 16 | batch_pred = pred_disps.reshape(-1, H*W) # BR, HW 17 | batch_gt = raw_disp.repeat(1, R, 1, 1).reshape(-1, H*W) # BR, HW 18 | batch_mask = mask.repeat(1, R, 1, 1).reshape(-1, H*W) 19 | if config.safe_ssi: 20 | from utils.ransac import RANSAC 21 | regressor = RANSAC(n=0.1, k=10, d=0.2, t=config.ransac_error_threshold) 22 | regressor.fit(batch_pred, batch_gt, batch_mask) 23 | st = regressor.best_fit 24 | print(f"safe ssi in on: n=0.1, k=10, d=0.2, t={config.ransac_error_threshold}") 25 | else: 26 | print("directly compute ssi") 27 | from utils.utils import compute_scale_and_shift 28 | st = compute_scale_and_shift(batch_pred, batch_gt, batch_mask) # BR, HW 29 | 30 | s, t = torch.split(st.view(B, R, 1, 2), 1, dim=-1) 31 | pred_disps_unnormalized = pred_disps * s + t 32 | else: 33 | pred_disps_unnormalized = norm.denormalize(pred_disps) 34 | 35 | return pred_disps_unnormalized 36 | 37 | class D3RoMa(): 38 | def __init__(self, overrides=[], camera=None, variant="left+right+raw"): 39 | assert variant in ["left+right+raw", "rgb+raw"], "not released yet" 40 | 41 | from config import TrainingConfig, setup_hydra_configurations 42 | self.camera: Realsense = camera 43 | 44 | setup_hydra_configurations() 45 | from hydra import compose, initialize 46 | with initialize(version_base=None, config_path="conf", job_name="inference"): 47 | base_cfg = compose(config_name="config.yaml", overrides=overrides) 48 | 49 | if base_cfg.seed != -1: 50 | from utils.utils import seed_everything 51 | 
seed_everything(base_cfg.seed) # for reproducing 52 | 53 | config: TrainingConfig = base_cfg.task 54 | self.camera.change_resolution(f"{config.image_size[1]}x{config.image_size[0]}") 55 | self.pipeline = self._load_pipeline(config) 56 | 57 | self.eval_output_dir = f"_outputs.{variant}" 58 | if not os.path.exists(self.eval_output_dir): 59 | os.makedirs(self.eval_output_dir, exist_ok=True) 60 | 61 | from utils.utils import Normalizer 62 | self.normer = Normalizer.from_config(config) 63 | self.config = config 64 | self.variant = variant 65 | 66 | def _load_pipeline(self, config): 67 | patrained_path = f"{config.resume_pretrained}" 68 | if os.path.exists(patrained_path): 69 | print(f"load weights from {patrained_path}") 70 | 71 | from core.custom_pipelines import GuidedDiffusionPipeline, GuidedLatentDiffusionPipeline 72 | clazz_pipeline = GuidedLatentDiffusionPipeline if config.ldm else GuidedDiffusionPipeline 73 | pipeline = clazz_pipeline.from_pretrained(patrained_path).to("cuda") 74 | # model = UNet2DConditionModel.from_pretrained(patrained_path) 75 | pipeline.guidance.flow_guidance_mode=config.flow_guidance_mode 76 | 77 | if config.sampler == "my_ddim": 78 | from core.scheduler_ddim import MyDDIMScheduler 79 | my_ddim = MyDDIMScheduler.from_config(dict( 80 | beta_schedule = config.beta_schedule, 81 | beta_start = config.beta_start, 82 | beta_end = config.beta_end, 83 | clip_sample = config.clip_sample, 84 | num_train_timesteps = config.num_train_timesteps, 85 | prediction_type = config.prediction_type, 86 | set_alpha_to_one = False, 87 | skip_prk_steps = True, 88 | steps_offset = 1, 89 | trained_betas = None 90 | )) 91 | pipeline.scheduler = my_ddim 92 | print(f"Careful! sampler is overriden to {config.sampler}") 93 | else: 94 | raise ValueError(f"patrained path not exists: {patrained_path}") 95 | 96 | return pipeline 97 | 98 | @torch.no_grad() 99 | def infer_with_rgb_raw(self, rgb: np.ndarray, raw_depth: np.ndarray): 100 | """Depth restoration with RGB and raw depth (RGB and depth SHOULD be aligned!) 
101 | 102 | Args: 103 | rgb (np.ndarray): RGB image or gray image 104 | raw (np.ndarray): raw depth image from camera sensors, unit is meter 105 | 106 | Returns: 107 | np.ndarray: restored depth image, unit is meter 108 | """ 109 | 110 | assert rgb.dtype == np.uint8 111 | if len(rgb.shape[:2]) != len(raw_depth.shape[:2]): 112 | rgb = cv2.resize(rgb, dsize=raw_depth.shape[:2][::-1], interpolation=cv2.INTER_LINEAR) 113 | 114 | if len(rgb.shape) == 2: 115 | # grayscale images 116 | rgb = np.tile(rgb[...,None], (1, 1, 3)) 117 | else: 118 | rgb = rgb[..., :3] 119 | 120 | rgb = cv2.resize(rgb, self.camera.resolution[::-1], interpolation=cv2.INTER_LINEAR) 121 | rgb = torch.from_numpy(rgb).permute(2, 0, 1).float() 122 | 123 | if len(raw_depth.shape) == 2: 124 | raw_depth = raw_depth[...,None] 125 | raw_depth = torch.from_numpy(raw_depth).permute(2, 0, 1).float() 126 | 127 | assert self.config.prediction_space == "disp", "not implemented" 128 | raw_disp = torch.zeros_like(raw_depth) 129 | raw_valid = (raw_depth > 0) 130 | raw_disp[raw_valid] = self.camera.fxb_depth / raw_depth[raw_valid] 131 | 132 | # normalized_raw_disp = self.normer.normalize(raw_disp)[0] 133 | return self.run_pipeline(None, None, raw_disp, rgb) 134 | 135 | @torch.no_grad() 136 | def infer(self, left: np.ndarray, right: np.ndarray, raw_depth: np.ndarray=None, rgb:np.ndarray=None): 137 | """Depth restoration with left, right and raw depth 138 | 139 | Args: 140 | left (np.ndarray): left (IR) image 141 | right (np.ndarray): right (IR) image 142 | raw (np.ndarray): raw depth image from camera sensors, unit is meter (optional) 143 | rgb (np.ndarray): RGB image (optional) for point cloud visualization only 144 | 145 | Returns: 146 | np.ndarray: restored depth image, unit is meter 147 | """ 148 | assert len(left.shape) == len(right.shape) 149 | assert left.dtype == right.dtype == np.uint8 150 | 151 | if raw_depth is None or rgb is None: 152 | raise NotImplementedError("no worry, i will implement this soon") 153 | 154 | # assert raw.dtype == np.float32 155 | # if len(raw.shape) == 2: 156 | # raw = raw[...,None] 157 | 158 | if len(left.shape) == 2: 159 | # grayscale images 160 | left = np.tile(left[...,None], (1, 1, 3)) 161 | right = np.tile(right[...,None], (1, 1, 3)) 162 | else: 163 | left = left[..., :3] 164 | right = right[..., :3] 165 | 166 | left = cv2.resize(left, self.camera.resolution[::-1], interpolation=cv2.INTER_LINEAR) 167 | right = cv2.resize(right, self.camera.resolution[::-1], interpolation=cv2.INTER_LINEAR) 168 | 169 | left = torch.from_numpy(left).permute(2, 0, 1).float() 170 | right = torch.from_numpy(right).permute(2, 0, 1).float() 171 | 172 | if rgb is not None: 173 | rgb = cv2.resize(rgb, self.camera.resolution[::-1], interpolation=cv2.INTER_LINEAR) 174 | rgb = torch.from_numpy(rgb).permute(2, 0, 1).float() 175 | 176 | raw_depth = cv2.resize(raw_depth, dsize=self.camera.resolution[::-1], interpolation=cv2.INTER_NEAREST) 177 | if len(raw_depth.shape) == 3 and raw_depth.shape[-1] == 3: 178 | raw_depth = raw_depth [...,0] 179 | if len(raw_depth.shape) == 2: 180 | raw_depth = raw_depth[...,None] 181 | raw_depth = torch.from_numpy(raw_depth).permute(2, 0, 1).float() 182 | 183 | assert self.config.prediction_space == "disp", "not implemented" 184 | raw_disp = torch.zeros_like(raw_depth) 185 | raw_valid = (raw_depth > 0) 186 | raw_disp[raw_valid] = self.camera.fxb_depth / raw_depth[raw_valid] 187 | 188 | assert left.shape[1] % 8 == 0 and left.shape[2] % 8 == 0, "image size must be multiple of 8" 189 | return 
self.run_pipeline(left, right, raw_disp, rgb) 190 | 191 | def run_pipeline(self, left_image, right_image, raw_disp, rgb): 192 | device = "cuda" if torch.cuda.is_available() else "cpu" # "cpu" # 193 | normalize_rgb_fn = lambda x: (x / 255. - 0.5) * 2 194 | 195 | # batchify 196 | if rgb is not None: 197 | normalized_rgb = normalize_rgb_fn(rgb).to(device) 198 | normalized_rgb = normalized_rgb.unsqueeze(0).repeat(self.config.num_inference_rounds, 1, 1, 1) 199 | 200 | if left_image is not None and right_image is not None: 201 | left_image = normalize_rgb_fn(left_image).to(device) 202 | right_image = normalize_rgb_fn(right_image).to(device) 203 | 204 | left_image = left_image.unsqueeze(0).repeat(self.config.num_inference_rounds, 1, 1, 1) 205 | right_image = right_image.unsqueeze(0).repeat(self.config.num_inference_rounds, 1, 1, 1) 206 | 207 | raw_disp = raw_disp.to(device) 208 | normalized_raw_disp = self.normer.normalize(raw_disp)[0] # normalized sim disp 209 | normalized_raw_disp = normalized_raw_disp.unsqueeze(0).repeat(self.config.num_inference_rounds, 1, 1, 1) 210 | 211 | raw_disp = raw_disp.unsqueeze(0).repeat(self.config.num_inference_rounds, 1, 1, 1) 212 | mask = (raw_disp > 0).float() 213 | 214 | denorm = partial(denormalize, self.config) 215 | self.pipeline.set_progress_bar_config(desc=f"Denoising") 216 | 217 | pred_disps = self.pipeline(normalized_rgb, left_image, right_image, normalized_raw_disp, raw_disp, mask, 218 | num_inference_steps=self.config.num_inference_timesteps, 219 | num_intermediate_images=self.config.num_intermediate_images, # T 220 | add_noise_rgb=self.config.noise_rgb, 221 | depth_channels=self.config.depth_channels, 222 | cond_channels=self.config.cond_channels, 223 | denorm = denorm 224 | ).images 225 | 226 | if pred_disps.shape[0] > 1: # B is actually num_inference_rounds 227 | uncertainties = np.zeros_like(raw_disp) 228 | uncertainties[mask] = np.std(pred_disps.cpu().numpy(), axis=0)[mask] 229 | else: 230 | uncertainties = None 231 | 232 | pred_disps_unnormalized = denormalize(self.config, pred_disps, raw_disp, mask) 233 | pred_disps_unnormalized = pred_disps_unnormalized.mean(dim=0) 234 | 235 | if True: 236 | from utils.utils import compute_errors, metrics_to_dict, pretty_json 237 | metrics = compute_errors(raw_disp[0].cpu().numpy(), 238 | pred_disps_unnormalized.cpu().numpy(), 239 | self.config.prediction_space, 240 | mask[0].cpu().numpy().astype(bool), 241 | [self.camera.fxb_depth]) 242 | 243 | metrics = metrics_to_dict(*metrics) 244 | print((f"metrics:{pretty_json(metrics)}")) 245 | 246 | pred_disps_unnormalized = pred_disps_unnormalized[0].cpu().numpy() 247 | pred_depth = np.zeros_like(pred_disps_unnormalized) 248 | pred_mask = (pred_disps_unnormalized > 0) 249 | pred_depth[pred_mask] = self.camera.fxb_depth / pred_disps_unnormalized[pred_mask] 250 | return pred_depth 251 | 252 | 253 | if __name__ == "__main__": 254 | from utils.camera import Realsense 255 | camera = Realsense.default_real("fxm") 256 | overrides = [ 257 | # uncomment if you choose variant left+right+raw 258 | # "task=eval_ldm_mixed", 259 | # "task.resume_pretrained=experiments/ldm_sf-mixed.dep4.lr3e-05.v_prediction.nossi.scaled_linear.randn.nossi.my_ddpm1000.SceneFlow_Dreds_HssdIsaacStd.180x320.cond7-raw+left+right.w0.0/epoch_0199", 260 | 261 | # uncomment if you choose variant rgb+raw 262 | "task=eval_ldm_mixed_rgb+raw", 263 | 
"task.resume_pretrained=experiments/ldm_sf-241212.2.dep4.lr3e-05.v_prediction.nossi.scaled_linear.randn.ddpm1000.Dreds_HssdIsaacStd_ClearPose.180x320.rgb+raw.w0.0/epoch_0056", 264 | 265 | # rest of the configurations 266 | "task.eval_num_batch=1", 267 | "task.image_size=[360,640]", 268 | "task.eval_batch_size=1", 269 | "task.num_inference_rounds=1", 270 | "task.num_inference_timesteps=10", "task.num_intermediate_images=5", 271 | "task.write_pcd=true" 272 | ] 273 | """ if False: # turn on guidance 274 | overrides += [ 275 | "task.sampler=my_ddim", 276 | "task.guide_source=raw-depth", 277 | "task.flow_guidance_mode=gradient", 278 | "task.flow_guidance_weights=[1.0]" 279 | ] """ 280 | 281 | droma = D3RoMa(overrides, camera, variant="rgb+raw") 282 | 283 | from PIL import Image 284 | from hydra.utils import to_absolute_path 285 | left = np.array(Image.open(to_absolute_path("./assets/examples/0000_ir_l.png"))) 286 | right = np.array(Image.open(to_absolute_path("./assets/examples/0000_ir_r.png"))) 287 | raw = np.array(Image.open(to_absolute_path("./assets/examples/0000_depth.png"))) * 1e-3 288 | rgb = np.array(Image.open(to_absolute_path("./assets/examples/0000_rgb.png"))) 289 | 290 | if droma.variant == "rgb+raw": 291 | depth_aligned = camera.transform_depth_to_rgb_frame(raw) #if not alreay aligned 292 | if True: # visualize aligned depth for realsense d415 293 | valid = (depth_aligned > 0.2) & (depth_aligned < 5) 294 | import matplotlib.pyplot as plt 295 | cmap_spectral = plt.get_cmap('Spectral') 296 | raw_depth_normalized = np.zeros_like(depth_aligned) 297 | raw_depth_normalized[valid] = (depth_aligned[valid] - depth_aligned[valid].min()) / (depth_aligned[valid].max() - depth_aligned[valid].min()) 298 | Image.fromarray((cmap_spectral(raw_depth_normalized)*255.)[...,:3].astype(np.uint8)).save(f"raw_aligned.png") 299 | 300 | pred_depth = droma.infer_with_rgb_raw(rgb, depth_aligned) 301 | # if droma.config.write_pcd: 302 | elif droma.variant == "left+right+raw": 303 | pred_depth = droma.infer(left, right, raw, rgb) 304 | else: 305 | raise NotImplementedError 306 | 307 | import matplotlib.pyplot as plt 308 | cmap_spectral = plt.get_cmap('Spectral') 309 | pred_depth_normalized = (pred_depth - pred_depth.min()) / (pred_depth.max() - pred_depth.min()) 310 | Image.fromarray((cmap_spectral(pred_depth_normalized)*255.)[...,:3].astype(np.uint8)).save(f"{droma.eval_output_dir}/pred.png") 311 | 312 | if droma.config.write_pcd: 313 | from utils.utils import viz_cropped_pointcloud 314 | gt_depth_np = raw # [H,W] 315 | gt_masks_np = raw > 0 316 | gt_depth_np[~gt_masks_np] = 0.0 317 | gt_depth_np = camera.transform_depth_to_rgb_frame(gt_depth_np) #if not alreay aligned 318 | viz_cropped_pointcloud(camera.K.arr, rgb, gt_depth_np, fname=f"{droma.eval_output_dir}/raw.ply") 319 | 320 | if droma.variant == "left+right+raw": 321 | pred_depth = camera.transform_depth_to_rgb_frame(pred_depth) 322 | viz_cropped_pointcloud(camera.K.arr, rgb, pred_depth, fname=f"{droma.eval_output_dir}/pred.ply") 323 | -------------------------------------------------------------------------------- /isaacsim/replicate/std_object.py: -------------------------------------------------------------------------------- 1 | import os, random, time, json, math, copy 2 | import numpy as np 3 | 4 | import omni 5 | import omni.replicator.core as rep 6 | from omni.isaac.core.utils import prims 7 | from omni.isaac.core.prims.rigid_prim import RigidPrim 8 | from omni.isaac.core.utils.rotations import euler_angles_to_quat 9 | from 
omni.isaac.core.utils.stage import get_current_stage, open_stage, create_new_stage 10 | from pxr import Gf, Sdf, Usd, PhysxSchema, UsdGeom, UsdLux, UsdPhysics, UsdShade 11 | 12 | from replicate.scene_replicator import Replicator 13 | from dreds_renderer import DredsRenderer, generate_material_type, g_synset_name_scale_pairs 14 | from utils_func import get_all_child_mesh, get_visibility_attribute 15 | 16 | scene_prim_path = "/World/scene" #!! 17 | 18 | class STDObjectReplicator(Replicator): 19 | 20 | def __init__(self, world, config) -> None: 21 | super().__init__(world, config) 22 | 23 | self.dr = {} 24 | 25 | def setup_domain_randomization(self): 26 | self.domain_randomization = self._config["domain_randomization"] 27 | assert self.domain_randomization, "not implemented yet!" 28 | 29 | # domain randomization of lighting 30 | light_type_dr = self._config["lighting"]["light_type"] 31 | self.light_type = light_type_dr[random.randint(0, len(light_type_dr))-1] 32 | light_conf_dr = self._config["lighting"][f"{self.light_type}_light"] 33 | 34 | self.dr['lighting'] = {} 35 | self.dr['lighting']['type'] = self.light_type 36 | 37 | light_conf = { 38 | 'radius': random.uniform(*light_conf_dr['radius']), 39 | 'height': random.uniform(*light_conf_dr['height']), 40 | 'intensity': [ 41 | random.uniform(*light_conf_dr['intensity']['on']), 42 | random.uniform(*light_conf_dr['intensity']['off']) 43 | ] 44 | } 45 | # self.dr['lighting'][f'{self.light_type}_light'] = light_conf 46 | self.dr['lighting'].update(light_conf) 47 | 48 | # scene disk light 49 | self._light = rep.create.light( 50 | light_type = self.light_type, #"Sphere", #"Disk", 51 | intensity = self.dr['lighting']["intensity"][0], 52 | color = (1.0, 1.0, 1.0), 53 | position = (0.0, 0.0, 0.0), 54 | name= f"{self.light_type}Light" 55 | ) 56 | 57 | # prim_path_disk = "/Replicator/DiskLight_Xform/DiskLight" 58 | # rect_light = self._world.stage.GetPrimAtPath(prim_path_disk) 59 | # rect_light.GetAttribute("inputs:radius").Set(self._config["lighting"]["disk_light"]["radius"]) 60 | 61 | prim_path_light = f"/Replicator/{self.light_type}Light_Xform/{self.light_type}Light" 62 | prim_light = self._world.stage.GetPrimAtPath(prim_path_light) 63 | prim_light.GetAttribute("inputs:radius").Set(self.dr["lighting"]["radius"]) 64 | 65 | if self.dr["lighting"]["type"] == "Sphere": 66 | prim_light.GetAttribute("treatAsPoint").Set(True) 67 | 68 | # domain randomization of materials 69 | self.dr["std"] = {} 70 | transparent_dr = self._config["transparent"] 71 | transparent_conf = { 72 | "roughness_constant": random.uniform(*transparent_dr["roughness_constant"]), 73 | "cutout_opacity": random.uniform(*transparent_dr["cutout_opacity"]), 74 | "thin_walled": transparent_dr["thin_walled"], 75 | "glass_ior": random.uniform(*transparent_dr["glass_ior"]), 76 | "frosting_roughness": random.uniform(*transparent_dr["frosting_roughness"]) 77 | } 78 | self.dr["std"]["transparent"] = transparent_conf 79 | 80 | specular_dr = self._config["specular"] 81 | specular_conf = { 82 | "reflection_roughness_constant": random.uniform(*specular_dr["reflection_roughness_constant"]), 83 | "metallic_constant": random.uniform(*specular_dr["metallic_constant"]), 84 | "reflection_color": random.uniform(*specular_dr["reflection_color"]), 85 | } 86 | self.dr["std"]["specular"] = specular_conf 87 | return self.dr 88 | 89 | def render(self) -> None: 90 | self._log("start std_obj render on surface") 91 | 92 | surface_config = self._config["hssd"]['surface'] 93 | origin_prim_path = 
surface_config['prim_path'] 94 | prim_path = origin_prim_path.replace("/World", scene_prim_path) 95 | surface_prim = self._world.stage.GetPrimAtPath(prim_path) 96 | self.enable_physics(surface_prim) 97 | 98 | surface_center_pos = self.calc_surface_center(surface_prim) 99 | # move disk light 1m above the surface center 100 | # self._light.GetAttribute("xformOp:translate").Set((surface_center_pos[0], surface_center_pos[1], surface_center_pos[2] + 1.0)) 101 | with self._light: 102 | rep.modify.pose(position=(surface_center_pos[0], 103 | surface_center_pos[1], 104 | surface_center_pos[2] + self.dr["lighting"]["height"])) 105 | 106 | # domain randomization 107 | root_dir = os.path.abspath(self._config.dreds.cad_model_dir) 108 | renderer = DredsRenderer(root_dir) 109 | select_model_list, cam_q_list, cam_p_list = renderer.domain_randomize(self._config["num_frames_per_surface"]) 110 | surface_center_pos = self.calc_surface_center(surface_prim) 111 | 112 | # load object 113 | all_rigid_objects = [] 114 | # last_object_name = None 115 | # model_prims = {} 116 | # material_prims = [] 117 | initial_materials = {} 118 | for model in select_model_list: 119 | prim_name = f"model_{model['instance_id']}_{model['class_name']}" 120 | self._log(f"{model['material_type']}, {model['class_name']}, {model['instance_path']}") 121 | 122 | model_prim = prims.create_prim( 123 | prim_path=f"/World/{model['class_name']}_{model['instance_id']}", 124 | usd_path=f"file://{model['instance_path']}", 125 | semantic_label=prim_name, 126 | scale=[model['scale']]*3 127 | ) 128 | # Wrap the prim into a rigid prim to be able to simulate it 129 | box_rigid_prim = RigidPrim( 130 | prim_path=str(model_prim.GetPrimPath()), 131 | name=model['instance_name'], 132 | position=surface_center_pos + Gf.Vec3d(random.uniform(-0.3, 0.3), random.uniform(-0.3, 0.3), model['instance_id'] * 0.05), 133 | orientation=euler_angles_to_quat([random.uniform(0, math.pi/2), random.uniform(0, math.pi/2), random.uniform(0, math.pi)]), 134 | ) 135 | # set object as rigid body 136 | box_rigid_prim.enable_rigid_body_physics() 137 | # Enable collision 138 | UsdPhysics.CollisionAPI.Apply(model_prim) 139 | # Register rigid prim with the scene 140 | self._world.scene.add(box_rigid_prim) 141 | # last_object_name = model['instance_name'] 142 | all_rigid_objects.append(model['instance_name']) 143 | # model_prims[model['instance_id']] = model_prim 144 | 145 | # disable opacity for ground truth depth rendering, tested in PathRendering mode. 
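            # Forcing enable_opacity to False on each child mesh below makes transparent
            # objects render as solid surfaces, so the ground-truth depth pass records their
            # geometry instead of the background behind them. This assumes the bound material
            # is an OmniPBR/MDL-style shader exposing an "enable_opacity" input, which is the
            # input created on the shader a few lines further down.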
146 | for prim in get_all_child_mesh(model_prim): 147 | cur_mat, _ = UsdShade.MaterialBindingAPI(prim).ComputeBoundMaterial() 148 | shader = UsdShade.Shader(omni.usd.get_shader_from_material(cur_mat, get_prim=True)) 149 | 150 | shader.CreateInput("enable_opacity", Sdf.ValueTypeNames.Bool) 151 | shader.GetInput("enable_opacity").Set(False) 152 | 153 | # change material 154 | 155 | if model["material_type"] == "transparent" or model['class_name'] in ["cup", "bottle"]: # hack transparent cup and bottle 156 | mat_type = model["material_type"] 157 | MDL = "OmniGlass.mdl" 158 | mtl_name, _ = os.path.splitext(MDL) 159 | MAT_PATH = "/World/Looks" 160 | 161 | prim_path = omni.usd.get_stage_next_free_path(self._world.stage, f"{MAT_PATH}/{mtl_name}", False) 162 | mat = self.create_omnipbr_material(mtl_url=MDL, mtl_name=mtl_name, mtl_path=prim_path) 163 | 164 | initial_materials[model_prim] = mat 165 | # material_prims.append(prim_path) 166 | 167 | elif model["material_type"] == "specular": 168 | mat_type = model["material_type"] 169 | for prim in get_all_child_mesh(model_prim): 170 | 171 | if len(prim.GetChildren()) >1 : 172 | # hot fix: multi-materials 173 | self._log(f"multi-materials: {prim.GetPath()}") 174 | for subp in prim.GetChildren(): 175 | mat, _ = UsdShade.MaterialBindingAPI(subp).ComputeBoundMaterial() 176 | shader = UsdShade.Shader(omni.usd.get_shader_from_material(mat, get_prim=False)) 177 | 178 | shader.CreateInput("metallic", Sdf.ValueTypeNames.Float) 179 | shader.CreateInput("roughness", Sdf.ValueTypeNames.Float) 180 | 181 | shader.GetInput("metallic").Set(self.dr["std"]["specular"]["metallic_constant"]) 182 | shader.GetInput("roughness").Set(self.dr["std"]["specular"]["reflection_roughness_constant"]) 183 | continue 184 | 185 | cur_mat, _ = UsdShade.MaterialBindingAPI(prim).ComputeBoundMaterial() 186 | shader = UsdShade.Shader(omni.usd.get_shader_from_material(cur_mat, get_prim=True)) 187 | 188 | # Add value inputs 189 | shader.CreateInput("diffuse_color_constant", Sdf.ValueTypeNames.Color3f) 190 | shader.CreateInput("reflection_roughness_constant", Sdf.ValueTypeNames.Float) 191 | shader.CreateInput("metallic_constant", Sdf.ValueTypeNames.Float) 192 | 193 | # Add texture inputs 194 | shader.CreateInput("diffuse_texture", Sdf.ValueTypeNames.Asset) 195 | shader.CreateInput("reflectionroughness_texture", Sdf.ValueTypeNames.Asset) 196 | shader.CreateInput("metallic_texture", Sdf.ValueTypeNames.Asset) 197 | 198 | # Add other attributes 199 | shader.CreateInput("project_uvw", Sdf.ValueTypeNames.Bool) 200 | 201 | # Add texture scale and rotate 202 | shader.CreateInput("texture_scale", Sdf.ValueTypeNames.Float2) 203 | shader.CreateInput("texture_rotate", Sdf.ValueTypeNames.Float) 204 | 205 | shader.GetInput("metallic_constant").Set(self.dr["std"]["specular"]["metallic_constant"]) 206 | shader.GetInput("reflection_roughness_constant").Set(self.dr["std"]["specular"]["reflection_roughness_constant"]) 207 | 208 | UsdShade.MaterialBindingAPI(prim).Bind(cur_mat, UsdShade.Tokens.strongerThanDescendants) 209 | 210 | elif model["material_type"] == "diffuse": 211 | mat_type = model["material_type"] 212 | pass 213 | 214 | # randomize camera 215 | surface_center = self.calc_surface_center(surface_prim) 216 | self.rep_randomize_camera(None, surface_center, cam_p_list, cam_q_list) 217 | 218 | # output_dir = f"{self.output_dir}/{self._config["hssd"]['name']}/{surface_config['category']}" 219 | # os.makedirs(output_dir, exist_ok=True) 220 | # self.writer._output_dir = output_dir 221 | # output_dir = 
self._config.writer_config.output_dir 222 | with open(f"{self.output_dir}/meta_{self.next_seq_id}.json", 'w') as f: 223 | meta = { 224 | "models": select_model_list, 225 | "domain_randomization": self.dr 226 | } 227 | f.write(json.dumps(meta, indent=4, sort_keys=True)) 228 | 229 | # replicate texture 230 | # self.randomize_texture(model_prims) 231 | # Setup the writer 232 | 233 | cfg = copy.deepcopy(self._config["writer_config"]) 234 | cfg["output_dir"] = self.output_dir 235 | cfg["start_sequence_id"] = self.next_seq_id 236 | 237 | _config = copy.copy(self._config) 238 | _config["writer_config"]["output_dir"] = self.output_dir 239 | _config["writer_config"]["start_sequence_id"] = self.next_seq_id 240 | 241 | # self._config["writer_config"]["output_dir"] 242 | 243 | resolution = np.array(self._config["depth_sensor"]["resolution"]).astype(np.uint32).tolist() 244 | dep_res = (resolution[0], resolution[1]) 245 | self.writer_gt = rep.WriterRegistry.get("GtWriter") 246 | self.writer_gt.initialize(ticker=self.ticker, depth_sensor_cfg=_config["depth_sensor"], **_config["writer_config"]) 247 | cam_gt_rp = rep.create.render_product(self.cam_rgb, dep_res, name="CameraDepth") 248 | self.writer_gt.attach([cam_gt_rp]) 249 | 250 | # start simulation 251 | self._world.reset() 252 | 253 | if self._config["render_after_quiet"]: 254 | # wait for objects to fall 255 | # last_box = self._world.scene.get_object(last_object_name) 256 | max_tried = 0 257 | while True and max_tried < 10: 258 | max_sim_steps = 250 259 | for i in range(max_sim_steps): 260 | self._world.step(render=False) 261 | quited = True 262 | for rigid_object in all_rigid_objects: 263 | obj = self._world.scene.get_object(rigid_object) 264 | if obj is None: 265 | self._log(f"{rigid_object} is not found!") 266 | continue 267 | if np.linalg.norm(obj.get_linear_velocity()) > 0.001: 268 | quited = False 269 | break 270 | if quited: 271 | self._log("all objects quited") 272 | break # stop physics simulation, start rendering 273 | max_tried += 1 274 | self._log("still waiting for objects to fall") 275 | 276 | rep.settings.set_render_rtx_realtime() 277 | start_time = time.time() 278 | # rep.orchestrator.run_until_complete(num_frames=2*self._config['num_frames_per_surface']) 279 | for _ in range(2*self._config['num_frames_per_surface']): 280 | self._writer_tick = "gt" 281 | if _ % 2 == 0: 282 | self._step_tick += 1 283 | rep.orchestrator.step(rt_subframes=self._config['rt_subframes'], pause_timeline=True) 284 | 285 | end_time = time.time() 286 | 287 | # log running time 288 | runtime = end_time - start_time 289 | fps = runtime / self._config['num_frames_per_surface'] 290 | self._log(f"Replicator finished in {round(runtime, 2)} seconds, FPS={round(fps, 2)}") 291 | 292 | # change materials 293 | for model_prim_, mat_ in initial_materials.items(): 294 | UsdShade.MaterialBindingAPI(model_prim_).Bind(mat_, UsdShade.Tokens.strongerThanDescendants) 295 | 296 | self.writer_gt.detach() 297 | 298 | self.writer_rgb = rep.WriterRegistry.get("ColorWriter") 299 | self.writer_rgb.initialize(ticker=self.ticker, **cfg) 300 | cam_rgb_rp = rep.create.render_product(self.cam_rgb, resolution, name="CameraRGB") 301 | self.writer_rgb.attach([cam_rgb_rp]) 302 | 303 | self.writer_ir = rep.WriterRegistry.get("IRWriter") 304 | self.writer_ir.initialize(output_dir = self.output_dir, start_sequence_id = self.next_seq_id, ticker=self.ticker) 305 | cam_left_ir_rp = rep.create.render_product(self.cam_ir_left, resolution, name="Camera01") 306 | cam_right_ir_rp = 
rep.create.render_product(self.cam_ir_right, resolution, name="Camera02") 307 | self.writer_ir.attach([cam_left_ir_rp, cam_right_ir_rp]) 308 | 309 | if self._config["launch_config"]["renderer"] == "PathTracing": # hack 310 | rep.settings.set_render_pathtraced() 311 | start_time = time.time() 312 | # rep.orchestrator.run_until_complete(num_frames=2*self._config['num_frames_per_surface']) 313 | for _ in range(2*self._config['num_frames_per_surface']): 314 | if _ % 2 == 0: 315 | self._writer_tick = "rgb" 316 | else: 317 | self._writer_tick = "ir" 318 | self._step_tick += 1 319 | rep.orchestrator.step(rt_subframes=self._config['rt_subframes'], pause_timeline=True) 320 | end_time = time.time() 321 | runtime = end_time - start_time 322 | fps = runtime / self._config['num_frames_per_surface'] 323 | self._log(f"Replicator finished in {round(runtime, 2)} seconds, FPS={round(fps, 2)}") -------------------------------------------------------------------------------- /data/augmentor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import warnings 4 | import os 5 | import time 6 | from glob import glob 7 | from skimage import color, io 8 | from PIL import Image 9 | 10 | import cv2 11 | cv2.setNumThreads(0) 12 | cv2.ocl.setUseOpenCL(False) 13 | 14 | import torch 15 | from torchvision.transforms import ColorJitter, functional, Compose 16 | import torch.nn.functional as F 17 | 18 | def get_middlebury_images(): 19 | root = "datasets/Middlebury/MiddEval3" 20 | with open(os.path.join(root, "official_train.txt"), 'r') as f: 21 | lines = f.read().splitlines() 22 | return sorted([os.path.join(root, 'trainingQ', f'{name}/im0.png') for name in lines]) 23 | 24 | def get_eth3d_images(): 25 | return sorted(glob('datasets/ETH3D/two_view_training/*/im0.png')) 26 | 27 | def get_kitti_images(): 28 | return sorted(glob('datasets/KITTI/training/image_2/*_10.png')) 29 | 30 | def transfer_color(image, style_mean, style_stddev): 31 | reference_image_lab = color.rgb2lab(image) 32 | reference_stddev = np.std(reference_image_lab, axis=(0,1), keepdims=True)# + 1 33 | reference_mean = np.mean(reference_image_lab, axis=(0,1), keepdims=True) 34 | 35 | reference_image_lab = reference_image_lab - reference_mean 36 | lamb = style_stddev/reference_stddev 37 | style_image_lab = lamb * reference_image_lab 38 | output_image_lab = style_image_lab + style_mean 39 | l, a, b = np.split(output_image_lab, 3, axis=2) 40 | l = l.clip(0, 100) 41 | output_image_lab = np.concatenate((l,a,b), axis=2) 42 | with warnings.catch_warnings(): 43 | warnings.simplefilter("ignore", category=UserWarning) 44 | output_image_rgb = color.lab2rgb(output_image_lab) * 255 45 | return output_image_rgb 46 | 47 | class AdjustGamma(object): 48 | 49 | def __init__(self, gamma_min, gamma_max, gain_min=1.0, gain_max=1.0): 50 | self.gamma_min, self.gamma_max, self.gain_min, self.gain_max = gamma_min, gamma_max, gain_min, gain_max 51 | 52 | def __call__(self, sample): 53 | gain = random.uniform(self.gain_min, self.gain_max) 54 | gamma = random.uniform(self.gamma_min, self.gamma_max) 55 | return functional.adjust_gamma(sample, gamma, gain) 56 | 57 | def __repr__(self): 58 | return f"Adjust Gamma {self.gamma_min}, ({self.gamma_max}) and Gain ({self.gain_min}, {self.gain_max})" 59 | 60 | class FlowAugmentor: 61 | def __init__(self, crop_size, min_scale=-0.2, max_scale=0.5, do_flip=True, yjitter=False, saturation_range=[0.6,1.4], gamma=[1,1,1,1], stretch=False): 62 | 63 | # spatial augmentation 
params 64 | self.crop_size = crop_size 65 | self.min_scale = min_scale 66 | self.max_scale = max_scale 67 | if stretch: 68 | self.spatial_aug_prob = 1.0 69 | self.stretch_prob = 0.8 70 | self.max_stretch = 0.2 71 | else: 72 | self.spatial_aug_prob = 0.0 73 | self.stretch_prob = 0.0 74 | self.max_stretch = 0.0 75 | 76 | # flip augmentation params 77 | self.yjitter = yjitter 78 | self.do_flip = do_flip 79 | self.h_flip_prob = 0.5 80 | self.v_flip_prob = 0.1 81 | 82 | # photometric augmentation params 83 | self.photo_aug = Compose([ColorJitter(brightness=0.4, contrast=0.4, saturation=saturation_range, hue=0.5/3.14), AdjustGamma(*gamma)]) 84 | self.asymmetric_color_aug_prob = 0.2 85 | self.eraser_aug_prob = 0.5 86 | 87 | def color_transform(self, img1, img2): 88 | """ Photometric augmentation """ 89 | 90 | # asymmetric 91 | if np.random.rand() < self.asymmetric_color_aug_prob: 92 | img1 = np.array(self.photo_aug(Image.fromarray(img1)), dtype=np.uint8) 93 | img2 = np.array(self.photo_aug(Image.fromarray(img2)), dtype=np.uint8) 94 | 95 | # symmetric 96 | else: 97 | image_stack = np.concatenate([img1, img2], axis=0) 98 | image_stack = np.array(self.photo_aug(Image.fromarray(image_stack)), dtype=np.uint8) 99 | img1, img2 = np.split(image_stack, 2, axis=0) 100 | 101 | return img1, img2 102 | 103 | def eraser_transform(self, img1, img2, bounds=[50, 100]): 104 | """ Occlusion augmentation """ 105 | 106 | ht, wd = img1.shape[:2] 107 | if np.random.rand() < self.eraser_aug_prob: 108 | mean_color = np.mean(img2.reshape(-1, 3), axis=0) 109 | for _ in range(np.random.randint(1, 3)): 110 | x0 = np.random.randint(0, wd) 111 | y0 = np.random.randint(0, ht) 112 | dx = np.random.randint(bounds[0], bounds[1]) 113 | dy = np.random.randint(bounds[0], bounds[1]) 114 | img2[y0:y0+dy, x0:x0+dx, :] = mean_color 115 | 116 | return img1, img2 117 | 118 | def resize_sparse_flow_map(self, flow, valid, fx=1.0, fy=1.0): 119 | ht, wd = flow.shape[:2] 120 | coords = np.meshgrid(np.arange(wd), np.arange(ht)) 121 | coords = np.stack(coords, axis=-1) 122 | 123 | coords = coords.reshape(-1, 2).astype(np.float32) 124 | flow = flow.reshape(-1, 2).astype(np.float32) 125 | valid = valid.reshape(-1).astype(np.float32) 126 | 127 | coords0 = coords[valid>=1] 128 | flow0 = flow[valid>=1] 129 | 130 | ht1 = int(round(ht * fy)) 131 | wd1 = int(round(wd * fx)) 132 | 133 | coords1 = coords0 * [fx, fy] 134 | flow1 = flow0 * [fx, fy] 135 | 136 | xx = np.round(coords1[:,0]).astype(np.int32) 137 | yy = np.round(coords1[:,1]).astype(np.int32) 138 | 139 | v = (xx > 0) & (xx < wd1) & (yy > 0) & (yy < ht1) 140 | xx = xx[v] 141 | yy = yy[v] 142 | flow1 = flow1[v] 143 | 144 | flow_img = np.zeros([ht1, wd1, 2], dtype=np.float32) 145 | valid_img = np.zeros([ht1, wd1], dtype=np.int32) 146 | 147 | flow_img[yy, xx] = flow1 148 | valid_img[yy, xx] = 1 149 | 150 | return flow_img, valid_img 151 | 152 | def spatial_transform(self, img1, img2, flow, sim_flow, sim_valid): 153 | # randomly sample scale 154 | ht, wd = img1.shape[:2] 155 | min_scale = np.maximum( 156 | (self.crop_size[0] + 8) / float(ht), 157 | (self.crop_size[1] + 8) / float(wd)) 158 | 159 | scale = 2 ** np.random.uniform(self.min_scale, self.max_scale) 160 | scale_x = scale 161 | scale_y = scale 162 | if np.random.rand() < self.stretch_prob: 163 | scale_x *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch) 164 | scale_y *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch) 165 | 166 | scale_x = np.clip(scale_x, min_scale, None) 167 | scale_y = np.clip(scale_y, min_scale, 
None) 168 | 169 | if np.random.rand() < self.spatial_aug_prob: 170 | # rescale the images 171 | img1 = cv2.resize(img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) 172 | img2 = cv2.resize(img2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) 173 | flow = cv2.resize(flow, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) 174 | flow = flow * [scale_x, scale_y] 175 | 176 | sim_flow, sim_valid = self.resize_sparse_flow_map(sim_flow, sim_valid, fx=scale_x, fy=scale_y) 177 | 178 | if self.do_flip: 179 | if np.random.rand() < self.h_flip_prob and self.do_flip == 'hf': # h-flip 180 | img1 = img1[:, ::-1] 181 | img2 = img2[:, ::-1] 182 | flow = flow[:, ::-1] * [-1.0, 1.0] 183 | sim_flow = sim_flow[:, ::-1] * [-1.0, 1.0] 184 | 185 | if np.random.rand() < self.h_flip_prob and self.do_flip == 'h': # h-flip for stereo 186 | tmp = img1[:, ::-1] 187 | img1 = img2[:, ::-1] 188 | img2 = tmp 189 | 190 | if np.random.rand() < self.v_flip_prob and self.do_flip == 'v': # v-flip 191 | img1 = img1[::-1, :] 192 | img2 = img2[::-1, :] 193 | flow = flow[::-1, :] * [1.0, -1.0] 194 | sim_flow = sim_flow[::-1, :] * [1.0, -1.0] 195 | 196 | if self.yjitter: 197 | y0 = np.random.randint(2, img1.shape[0] - self.crop_size[0] - 2) 198 | x0 = np.random.randint(2, img1.shape[1] - self.crop_size[1] - 2) 199 | 200 | y1 = y0 + np.random.randint(-2, 2 + 1) 201 | img1 = img1[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] 202 | img2 = img2[y1:y1+self.crop_size[0], x0:x0+self.crop_size[1]] 203 | flow = flow[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] 204 | sim_flow = sim_flow[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] 205 | sim_valid = sim_valid[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] 206 | 207 | else: 208 | y0 = 0 if img1.shape[0] == self.crop_size[0] else np.random.randint(0, img1.shape[0] - self.crop_size[0]) 209 | x0 = 0 if img1.shape[1] == self.crop_size[1] else np.random.randint(0, img1.shape[1] - self.crop_size[1]) 210 | 211 | img1 = img1[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] 212 | img2 = img2[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] 213 | flow = flow[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] 214 | sim_flow = sim_flow[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] 215 | sim_valid = sim_valid[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] 216 | 217 | return img1, img2, flow, sim_flow, sim_valid 218 | 219 | 220 | def __call__(self, img1, img2, flow, sim_flow, sim_valid): 221 | img1, img2 = self.color_transform(img1, img2) 222 | img1, img2 = self.eraser_transform(img1, img2) 223 | img1, img2, flow, sim_flow, sim_valid = self.spatial_transform(img1, img2, flow, sim_flow, sim_valid) 224 | 225 | img1 = np.ascontiguousarray(img1) 226 | img2 = np.ascontiguousarray(img2) 227 | flow = np.ascontiguousarray(flow) 228 | sim_flow = np.ascontiguousarray(sim_flow) 229 | sim_valid = np.ascontiguousarray(sim_valid) 230 | 231 | return img1, img2, flow, sim_flow, sim_valid 232 | 233 | class SparseFlowAugmentor: 234 | def __init__(self, crop_size, min_scale=-0.2, max_scale=0.5, do_flip=False, yjitter=False, saturation_range=[0.7,1.3], gamma=[1,1,1,1]): 235 | # spatial augmentation params 236 | self.crop_size = crop_size 237 | self.min_scale = min_scale 238 | self.max_scale = max_scale 239 | self.spatial_aug_prob = 0.8 240 | self.stretch_prob = 0.8 241 | self.max_stretch = 0.2 242 | 243 | # flip augmentation params 244 | self.do_flip = do_flip 245 | self.h_flip_prob = 0.5 246 | self.v_flip_prob = 0.1 247 | 248 | # photometric augmentation 
params 249 | self.photo_aug = Compose([ColorJitter(brightness=0.3, contrast=0.3, saturation=saturation_range, hue=0.3/3.14), AdjustGamma(*gamma)]) 250 | self.asymmetric_color_aug_prob = 0.2 251 | self.eraser_aug_prob = 0.5 252 | 253 | def color_transform(self, img1, img2): 254 | image_stack = np.concatenate([img1, img2], axis=0) 255 | image_stack = np.array(self.photo_aug(Image.fromarray(image_stack)), dtype=np.uint8) 256 | img1, img2 = np.split(image_stack, 2, axis=0) 257 | return img1, img2 258 | 259 | def eraser_transform(self, img1, img2): 260 | ht, wd = img1.shape[:2] 261 | if np.random.rand() < self.eraser_aug_prob: 262 | mean_color = np.mean(img2.reshape(-1, 3), axis=0) 263 | for _ in range(np.random.randint(1, 3)): 264 | x0 = np.random.randint(0, wd) 265 | y0 = np.random.randint(0, ht) 266 | dx = np.random.randint(50, 100) 267 | dy = np.random.randint(50, 100) 268 | img2[y0:y0+dy, x0:x0+dx, :] = mean_color 269 | 270 | return img1, img2 271 | 272 | def resize_sparse_flow_map(self, flow, valid, sim_flow, sim_valid, fx=1.0, fy=1.0): 273 | ht, wd = flow.shape[:2] 274 | coords = np.meshgrid(np.arange(wd), np.arange(ht)) 275 | coords = np.stack(coords, axis=-1) 276 | 277 | coords = coords.reshape(-1, 2).astype(np.float32) 278 | flow = flow.reshape(-1, 2).astype(np.float32) 279 | valid = valid.reshape(-1).astype(np.float32) 280 | 281 | sim_flow = sim_flow.reshape(-1, 2).astype(np.float32) 282 | sim_valid = sim_valid.reshape(-1).astype(np.float32) 283 | 284 | coords0 = coords[valid>=1] 285 | flow0 = flow[valid>=1] 286 | 287 | coords0_sim = coords[sim_valid>=1] 288 | flow0_sim = sim_flow[sim_valid>=1] 289 | 290 | ht1 = int(round(ht * fy)) 291 | wd1 = int(round(wd * fx)) 292 | 293 | coords1 = coords0 * [fx, fy] 294 | flow1 = flow0 * [fx, fy] 295 | 296 | coords1_sim = coords0_sim * [fx, fy] 297 | flow1_sim = flow0_sim * [fx, fy] 298 | 299 | xx = np.round(coords1[:,0]).astype(np.int32) 300 | yy = np.round(coords1[:,1]).astype(np.int32) 301 | 302 | xx_sim = np.round(coords1_sim[:,0]).astype(np.int32) 303 | yy_sim = np.round(coords1_sim[:,1]).astype(np.int32) 304 | 305 | v = (xx > 0) & (xx < wd1) & (yy > 0) & (yy < ht1) 306 | xx = xx[v] 307 | yy = yy[v] 308 | flow1 = flow1[v] 309 | 310 | v_sim = (xx_sim > 0) & (xx_sim < wd1) & (yy_sim > 0) & (yy_sim < ht1) 311 | xx_sim = xx_sim[v_sim] 312 | yy_sim = yy_sim[v_sim] 313 | flow1_sim = flow1_sim[v_sim] 314 | 315 | flow_img = np.zeros([ht1, wd1, 2], dtype=np.float32) 316 | valid_img = np.zeros([ht1, wd1], dtype=np.int32) 317 | 318 | sim_flow_img = np.zeros([ht1, wd1, 2], dtype=np.float32) 319 | sim_valid_img = np.zeros([ht1, wd1], dtype=np.int32) 320 | 321 | flow_img[yy, xx] = flow1 322 | valid_img[yy, xx] = 1 323 | 324 | sim_flow_img[yy_sim, xx_sim] = flow1_sim 325 | sim_valid_img[yy_sim, xx_sim] = 1 326 | 327 | return flow_img, valid_img, sim_flow_img, sim_valid_img 328 | 329 | def spatial_transform(self, img1, img2, flow, valid, sim_flow, sim_valid): 330 | # randomly sample scale 331 | 332 | ht, wd = img1.shape[:2] 333 | min_scale = np.maximum( 334 | (self.crop_size[0]) / float(ht), #+1 335 | (self.crop_size[1]) / float(wd)) #+1 336 | 337 | scale = 2 ** np.random.uniform(self.min_scale, self.max_scale) # default [0.87 ~ 1.32] 338 | scale_x = np.clip(scale, min_scale, None) 339 | scale_y = np.clip(scale, min_scale, None) 340 | 341 | if True or np.random.rand() < self.spatial_aug_prob: 342 | # rescale the images 343 | img1 = cv2.resize(img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) 344 | img2 = cv2.resize(img2, None, fx=scale_x, 
fy=scale_y, interpolation=cv2.INTER_LINEAR) 345 | flow, valid, sim_flow, sim_valid = self.resize_sparse_flow_map(flow, valid, sim_flow, sim_valid, fx=scale_x, fy=scale_y) 346 | 347 | if self.do_flip: 348 | rand1 = np.random.rand() 349 | if rand1 < self.h_flip_prob and self.do_flip == 'hf': # h-flip 350 | img1 = img1[:, ::-1] 351 | img2 = img2[:, ::-1] 352 | flow = flow[:, ::-1] * [-1.0, 1.0] 353 | sim_flow = sim_flow[:, ::-1] * [-1.0, 1.0] 354 | 355 | rand2 = np.random.rand() 356 | if rand2 < self.h_flip_prob and self.do_flip == 'h': # h-flip for stereo 357 | tmp = img1[:, ::-1] 358 | img1 = img2[:, ::-1] 359 | img2 = tmp 360 | 361 | rand3 = np.random.rand() 362 | if rand3 < self.v_flip_prob and self.do_flip == 'v': # v-flip 363 | img1 = img1[::-1, :] 364 | img2 = img2[::-1, :] 365 | flow = flow[::-1, :] * [1.0, -1.0] 366 | sim_flow = sim_flow[::-1, :] * [1.0, -1.0] 367 | 368 | margin_y = 20 369 | margin_x = 50 370 | 371 | y0 = np.random.randint(0, img1.shape[0] - self.crop_size[0] + margin_y) 372 | x0 = np.random.randint(-margin_x, img1.shape[1] - self.crop_size[1] + margin_x) 373 | 374 | y0 = np.clip(y0, 0, img1.shape[0] - self.crop_size[0]) 375 | x0 = np.clip(x0, 0, img1.shape[1] - self.crop_size[1]) 376 | 377 | img1 = img1[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] 378 | img2 = img2[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] 379 | flow = flow[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] 380 | valid = valid[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] 381 | sim_flow = sim_flow[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] 382 | sim_valid = sim_valid[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] 383 | return img1, img2, flow, valid, sim_flow, sim_valid 384 | 385 | 386 | def __call__(self, img1, img2, flow, valid, sim_flow, sim_valid): 387 | img1, img2 = self.color_transform(img1, img2) 388 | img1, img2 = self.eraser_transform(img1, img2) 389 | img1, img2, flow, valid, sim_flow, sim_valid = self.spatial_transform(img1, img2, flow, valid, sim_flow, sim_valid) 390 | 391 | img1 = np.ascontiguousarray(img1) 392 | img2 = np.ascontiguousarray(img2) 393 | flow = np.ascontiguousarray(flow) 394 | valid = np.ascontiguousarray(valid) 395 | sim_flow = np.ascontiguousarray(sim_flow) 396 | sim_valid = np.ascontiguousarray(sim_valid) 397 | 398 | return img1, img2, flow, valid, sim_flow, sim_valid 399 | -------------------------------------------------------------------------------- /utils/frame_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PIL import Image 3 | from os.path import * 4 | import re 5 | import json 6 | import imageio 7 | import os 8 | import cv2 9 | import torch 10 | import torch.nn.functional as F 11 | from scipy import interpolate 12 | 13 | cv2.setNumThreads(0) 14 | cv2.ocl.setUseOpenCL(False) 15 | 16 | TAG_CHAR = np.array([202021.25], np.float32) 17 | 18 | def readFlow(fn): 19 | """ Read .flo file in Middlebury format""" 20 | # Code adapted from: 21 | # http://stackoverflow.com/questions/28013200/reading-middlebury-flow-files-with-python-bytes-array-numpy 22 | 23 | # WARNING: this will work on little-endian architectures (eg Intel x86) only! 24 | # print 'fn = %s'%(fn) 25 | with open(fn, 'rb') as f: 26 | magic = np.fromfile(f, np.float32, count=1) 27 | if 202021.25 != magic: 28 | print('Magic number incorrect. 
Invalid .flo file') 29 | return None 30 | else: 31 | w = np.fromfile(f, np.int32, count=1) 32 | h = np.fromfile(f, np.int32, count=1) 33 | # print 'Reading %d x %d flo file\n' % (w, h) 34 | data = np.fromfile(f, np.float32, count=2*int(w)*int(h)) 35 | # Reshape data into 3D array (columns, rows, bands) 36 | # The reshape here is for visualization, the original code is (w,h,2) 37 | return np.resize(data, (int(h), int(w), 2)) 38 | 39 | def readPFM(file): 40 | file = open(file, 'rb') 41 | 42 | color = None 43 | width = None 44 | height = None 45 | scale = None 46 | endian = None 47 | 48 | header = file.readline().rstrip() 49 | if header == b'PF': 50 | color = True 51 | elif header == b'Pf': 52 | color = False 53 | else: 54 | raise Exception('Not a PFM file.') 55 | 56 | dim_match = re.match(rb'^(\d+)\s(\d+)\s$', file.readline()) 57 | if dim_match: 58 | width, height = map(int, dim_match.groups()) 59 | else: 60 | raise Exception('Malformed PFM header.') 61 | 62 | scale = float(file.readline().rstrip()) 63 | if scale < 0: # little-endian 64 | endian = '<' 65 | scale = -scale 66 | else: 67 | endian = '>' # big-endian 68 | 69 | data = np.fromfile(file, endian + 'f') 70 | shape = (height, width, 3) if color else (height, width) 71 | 72 | data = np.reshape(data, shape) 73 | data = np.flipud(data) 74 | return data 75 | 76 | def writePFM(file, array): 77 | import os 78 | assert type(file) is str and type(array) is np.ndarray and \ 79 | os.path.splitext(file)[1] == ".pfm" 80 | with open(file, 'wb') as f: 81 | H, W = array.shape 82 | headers = ["Pf\n", f"{W} {H}\n", "-1\n"] 83 | for header in headers: 84 | f.write(str.encode(header)) 85 | array = np.flip(array, axis=0).astype(np.float32) 86 | f.write(array.tobytes()) 87 | 88 | 89 | 90 | def writeFlow(filename,uv,v=None): 91 | """ Write optical flow to file. 92 | 93 | If v is None, uv is assumed to contain both u and v channels, 94 | stacked in depth. 95 | Original code by Deqing Sun, adapted from Daniel Scharstein. 
96 | """ 97 | nBands = 2 98 | 99 | if v is None: 100 | assert(uv.ndim == 3) 101 | assert(uv.shape[2] == 2) 102 | u = uv[:,:,0] 103 | v = uv[:,:,1] 104 | else: 105 | u = uv 106 | 107 | assert(u.shape == v.shape) 108 | height,width = u.shape 109 | f = open(filename,'wb') 110 | # write the header 111 | f.write(TAG_CHAR) 112 | np.array(width).astype(np.int32).tofile(f) 113 | np.array(height).astype(np.int32).tofile(f) 114 | # arrange into matrix form 115 | tmp = np.zeros((height, width*nBands)) 116 | tmp[:,np.arange(width)*2] = u 117 | tmp[:,np.arange(width)*2 + 1] = v 118 | tmp.astype(np.float32).tofile(f) 119 | f.close() 120 | 121 | 122 | def readFlowKITTI(filename): 123 | flow = cv2.imread(filename, cv2.IMREAD_ANYDEPTH|cv2.IMREAD_COLOR) 124 | flow = flow[:,:,::-1].astype(np.float32) 125 | flow, valid = flow[:, :, :2], flow[:, :, 2] 126 | flow = (flow - 2**15) / 64.0 127 | return flow, valid 128 | 129 | def readDispKITTI(filename): 130 | disp = cv2.imread(filename, cv2.IMREAD_ANYDEPTH) / 256.0 131 | valid = disp > 0.0 132 | return disp, valid 133 | 134 | # Method taken from /n/fs/raft-depth/RAFT-Stereo/datasets/SintelStereo/sdk/python/sintel_io.py 135 | def readDispSintelStereo(file_name): 136 | a = np.array(Image.open(file_name)) 137 | d_r, d_g, d_b = np.split(a, axis=2, indices_or_sections=3) 138 | disp = (d_r * 4 + d_g / (2**6) + d_b / (2**14))[..., 0] 139 | mask = np.array(Image.open(file_name.replace('disparities', 'occlusions'))) 140 | valid = ((mask == 0) & (disp > 0)) 141 | return disp, valid 142 | 143 | # Method taken from https://research.nvidia.com/sites/default/files/pubs/2018-06_Falling-Things/readme_0.txt 144 | def readDispFallingThings(file_name): 145 | a = np.array(Image.open(file_name)) 146 | with open('/'.join(file_name.split('/')[:-1] + ['_camera_settings.json']), 'r') as f: 147 | intrinsics = json.load(f) 148 | fx = intrinsics['camera_settings'][0]['intrinsic_settings']['fx'] 149 | disp = (fx * 6.0 * 100) / a.astype(np.float32) 150 | valid = disp > 0 151 | return disp, valid 152 | 153 | # Method taken from https://github.com/castacks/tartanair_tools/blob/master/data_type.md 154 | def readDispTartanAir(file_name): 155 | depth = np.load(file_name) 156 | disp = 80.0 / depth 157 | valid = disp > 0 158 | return disp, valid 159 | 160 | def readDispSTD_np(filename): 161 | disp = np.load(filename) 162 | valid = (disp > 0) & ~ np.isinf(disp) 163 | return disp, valid 164 | 165 | def readDispReal(camera, filename): 166 | """ 167 | read disparity either ground truth depth or simulated disparity 168 | resize here aligns the file resolution with desired camera resolution 169 | """ 170 | if not os.path.exists(filename): 171 | # hack: prevent dataset errors 172 | return np.ones(camera.resolution), np.ones(camera.resolution, dtype=bool), 0, 1 173 | 174 | ext = splitext(filename)[-1] 175 | if ext == ".png": 176 | data = cv2.imread(filename, cv2.IMREAD_ANYDEPTH) 177 | elif ext == ".npy": 178 | data = np.load(filename) 179 | elif ext == ".exr": 180 | data = cv2.imread(filename, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH) 181 | if data is None: 182 | print(f"bug: {filename}") 183 | if len(data.shape) == 3 and data.shape[-1] == 3: 184 | data = data[...,0] 185 | else: 186 | raise NotImplementedError 187 | 188 | scale = data.shape[1] / camera.resolution[1] 189 | data = cv2.resize(data, dsize=camera.resolution[::-1], interpolation=cv2.INTER_NEAREST) 190 | valid = ~ np.isinf(data) & ~ np.isnan(data) & (data > 0) 191 | 192 | if "depth" in filename or "Depth" in filename: 193 | # depth = 
camera.transform_depth_to_rgb_frame(depth) #if not alreay aligned 194 | disp = np.zeros_like(data, dtype=np.float32) 195 | # FIXME: hack 196 | depth_unit = 1 197 | if camera.device == "fxm" or camera.device == "jav" or camera.device == "d435": 198 | depth_unit = 1e-3 199 | valid = valid & (data > 200) & (data < 3000) 200 | data = np.clip(data, a_min=0.0, a_max=3000) # only clip large depth values 201 | elif camera.device == "clearpose": 202 | depth_unit = 1e-3 203 | min_depth = camera.min_depth / depth_unit 204 | max_depth = camera.max_depth / depth_unit 205 | valid = valid & (data > min_depth ) & (data < max_depth) # [0.2~10] 206 | data = np.clip(data, a_min = 0.0, a_max = max_depth) # only clip large depth values 207 | 208 | disp[valid] = camera.fxb_depth / (data[valid] * depth_unit) 209 | else: 210 | # disparity scales with resolution 211 | disp = data / scale 212 | 213 | valid = (disp > camera.min_disp) & (disp < camera.max_disp) & valid 214 | # disp[valid] = np.clip(disp[valid], camera.min_disp, camera.max_disp) # DEBUG: * 1.333333 215 | # disp[~valid] = 0.0 216 | return disp, valid, camera.min_disp, camera.max_disp 217 | 218 | def readDispDreds_exr(camera, filename): 219 | depth = cv2.imread(filename, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH) 220 | if len(depth.shape) == 3 and depth.shape[-1] == 3: 221 | depth = depth [...,0] 222 | 223 | if depth.shape[:2] != camera.resolution: 224 | # be very carefull here !!! only resize in depth space 225 | depth = cv2.resize(depth, dsize=camera.resolution[::-1], interpolation=cv2.INTER_NEAREST) # same with DREDS 226 | 227 | valid = (~ (np.isinf(depth) | np.isnan(depth))) & (depth > 0.2) & (depth < 2) 228 | disp = np.zeros_like(depth) 229 | disp[valid] = camera.fxb / depth[valid] 230 | # disp[valid] = np.clip(disp[valid], camera.min_disp, camera.max_disp) 231 | return disp, valid, camera.min_disp, camera.max_disp 232 | 233 | def readDispSTD_exr(filename): 234 | disp = cv2.imread(filename, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH) 235 | valid = (~ (np.isinf(disp) | np.isnan(disp))) & (disp != 0) 236 | return disp, valid 237 | 238 | def readDispSTD(file_name): 239 | # depth_rgb = np.load(file_name) 240 | gt_depth = cv2.imread(str(file_name), cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH) 241 | gt_depth = cv2.resize(gt_depth, (640*2, 360*2), interpolation=cv2.INTER_NEAREST) 242 | valid = ~ (np.isnan(gt_depth) | np.isinf(gt_depth)) 243 | gt_depth[~valid] = 0 244 | 245 | fx = 446.31 246 | focal_length = fx * 2 # original ir size 247 | baseline = 0.055 248 | T_lc = np.eye(4) # color to left ir 249 | T_lc[0,3] = -0.015 250 | H, W = 360*2, 640*2 251 | K = np.array([[fx*2, 0, W/2-0.5], [0, fx*2, H/2-0.5], [0, 0, 1]]) 252 | inv_K = np.linalg.inv(K) 253 | 254 | meshgrid = np.meshgrid(range(W), range(H), indexing='xy') 255 | id_coords = np.stack(meshgrid, axis=0).astype(np.float32) 256 | ones = np.ones((1, H * W), dtype=np.float32) 257 | pix_coords = np.concatenate((id_coords[0].reshape(1, -1), id_coords[1].reshape(1, -1), ones), axis=0) 258 | 259 | gt_depth = gt_depth.reshape(1, H*W) 260 | cam_points_ir = (inv_K @ pix_coords) * gt_depth 261 | valid_mask = cam_points_ir[2] > 0. 
# filter out invalid points 262 | 263 | cam_points_ir = cam_points_ir[:, valid_mask] 264 | cam_points_color = T_lc[:3,:3] @ cam_points_ir + T_lc[:3,3:] # convert to ir frame 265 | 266 | pix_coords_color = (K @ cam_points_color) # project to ir frame 267 | pix_coords_color[:2] /= pix_coords_color[2:3] # normalize 268 | 269 | ir_depth = np.zeros((H, W), dtype=np.float32)# * np.inf 270 | u, v = pix_coords_color[:2] 271 | u_left, u_right = np.floor(u).astype(np.uint32), np.ceil(u).astype(np.uint32) 272 | v_up, v_bottom = np.floor(v).astype(np.uint32), np.ceil(v).astype(np.uint32) 273 | 274 | def fill(depth_map, pred_depth, u, v): 275 | u, v = u.astype(np.uint32), v.astype(np.uint32) 276 | uv = np.vstack([u,v]) 277 | valid_color = (uv[0] >= 0) & (uv[0] < W) & (uv[1] >= 0) & (uv[1] < H) 278 | u, v = uv[:, valid_color] 279 | depth_map[v, u] = pred_depth[0, valid_mask][valid_color] 280 | 281 | # an ugly HACK 282 | fill(ir_depth, gt_depth, u_left, v_up) 283 | fill(ir_depth, gt_depth, u_left, v_bottom) 284 | fill(ir_depth, gt_depth, u_right, v_up) 285 | fill(ir_depth, gt_depth, u_right, v_bottom) 286 | 287 | uv = np.rint(pix_coords_color).astype(np.uint32) 288 | valid_color = (uv[0] >= 0) & (uv[0] < W) & (uv[1] >= 0) & (uv[1] < H) 289 | u, v = uv[:2, valid_color] 290 | ir_depth[v, u] = gt_depth[0, valid_mask][valid_color] 291 | 292 | # fill holes 293 | ir_depth_torch = torch.from_numpy(ir_depth).unsqueeze(0).unsqueeze(0) 294 | holes_mask = (ir_depth == 0) #np.isinf(ir_depth) # exclude occ-in/occ-out? 295 | holes_mask[:, -20:] = False # another ugly hack exclude the right 10 cols 296 | holes_coords = id_coords[:2, holes_mask][(1,0),:] 297 | holes_coords_normal = holes_coords / np.array(([[H],[W]])) * 2 - 1 298 | grid = torch.from_numpy(holes_coords_normal, ).transpose(1,0).reshape(1,1,-1,2) 299 | interp = F.grid_sample(ir_depth_torch, grid.to(torch.float32), mode='nearest', padding_mode='zeros') 300 | ir_depth[holes_mask] = interp[0,0,0,:].numpy() 301 | 302 | disp = np.zeros_like(ir_depth) 303 | valid = valid & (ir_depth > 0) 304 | disp[valid] = focal_length * baseline / ir_depth[valid] 305 | 306 | valid = disp > 0 307 | return disp, valid 308 | 309 | def readDispMiddlebury(file_name, extra_info=None): #, image_size 310 | import os 311 | if basename(file_name) == 'disp0GT.pfm': 312 | disp = readPFM(file_name).astype(np.float32) 313 | # disp = cv2.resize(disp, image_size[::-1], cv2.INTER_NEAREST) 314 | assert len(disp.shape) == 2 315 | nocc_pix = file_name.replace('disp0GT.pfm', 'mask0nocc.png') 316 | assert exists(nocc_pix) 317 | nocc_pix = imageio.imread(nocc_pix) == 255 318 | # nocc_pix = cv2.resize(nocc_pix, image_size[::-1], cv2.INTER_NEAREST) 319 | assert np.any(nocc_pix) 320 | calib_file = file_name.replace('disp0GT.pfm', 'calib.txt') 321 | if exists(calib_file): 322 | calib = {} 323 | with open(calib_file, "r") as f: 324 | # read line by line 325 | lines = f.readlines() 326 | for line in lines: 327 | name, var = line.partition("=")[::2] 328 | if name.startswith("cam"): 329 | # parse matlab mat? 
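                        # calib.txt stores the camera matrices in MATLAB-style syntax,
                        # e.g. cam0=[f 0 cx; 0 f cy; 0 0 1]; stripping the surrounding
                        # brackets, splitting rows on ';' and row entries on whitespace
                        # recovers a nested list of floats (Middlebury calibration format).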
330 | arr = var[1:-2].split(';') 331 | to_list = lambda str_arr: list(map(float, str_arr.strip().split(' '))) 332 | calib[name] = [to_list(a) for a in arr] 333 | else: 334 | calib[name] = eval(var) 335 | 336 | # convert disp to depth 337 | depth = np.zeros_like(disp) 338 | depth[nocc_pix] = calib['baseline'] * calib['cam0'][0][0] / (calib['doffs'] + disp[nocc_pix]) * 1e-3 # meter 339 | 340 | if os.path.exists(file_name.replace("disp0GT.pfm", "im0.png_flow_pred.npy")): 341 | raft_disp = np.load(file_name.replace("disp0GT.pfm", "im0.png_flow_pred.npy")) 342 | raw_depth = calib['baseline'] * calib['cam0'][0][0] / (calib['doffs'] + -raft_disp) * 1e-3 # meter 343 | else: 344 | raw_depth = depth 345 | return disp, nocc_pix, depth, np.array(calib["cam0"]), raw_depth 346 | 347 | return disp, nocc_pix, np.zeros_like(disp) 348 | 349 | elif basename(file_name) == 'disp0.pfm': 350 | disp = readPFM(file_name).astype(np.float32) 351 | valid = disp < 1e3 352 | return disp, valid 353 | 354 | def writeFlowKITTI(filename, uv): 355 | uv = 64.0 * uv + 2**15 356 | valid = np.ones([uv.shape[0], uv.shape[1], 1]) 357 | uv = np.concatenate([uv, valid], axis=-1).astype(np.uint16) 358 | cv2.imwrite(filename, uv[..., ::-1]) 359 | 360 | def read_sceneflow(resolution, file_name, pil=False): 361 | """ 362 | train sceneflow with different resolution 363 | resolution: HxW 364 | """ 365 | try: 366 | disp = np.array(read_gen(file_name, pil)).astype(np.float32) 367 | except: 368 | print(f"invalid ground truth file, {file_name}") 369 | 370 | assert len(disp.shape) == 2 371 | scale, min_disp, max_disp = 1., 0.5, 256. 372 | if resolution is not None and disp.shape != tuple(resolution): 373 | scale = disp.shape[0] / resolution[0] 374 | disp = cv2.resize(disp, resolution[::-1], cv2.INTER_NEAREST) #cv2.INTER_LINEAR 375 | disp = disp / scale 376 | max_disp = max_disp / scale 377 | min_disp = min_disp / scale 378 | return disp, (disp < max_disp) & (disp > min_disp), min_disp, max_disp 379 | 380 | def read_gen(file_name, pil=False): 381 | ext = splitext(file_name)[-1] 382 | if ext == '.png' or ext == '.jpeg' or ext == '.ppm' or ext == '.jpg': 383 | return Image.open(file_name) 384 | elif ext == '.bin' or ext == '.raw': 385 | return np.load(file_name) 386 | elif ext == '.flo': 387 | return readFlow(file_name).astype(np.float32) 388 | elif ext == '.pfm': 389 | flow = readPFM(file_name).astype(np.float32) 390 | if len(flow.shape) == 2: 391 | return flow 392 | else: 393 | return flow[:, :, :-1] 394 | elif ext == ".npy": 395 | return np.load(file_name).astype(np.float32) 396 | elif ext == ".exr": 397 | return cv2.imread(file_name, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH) 398 | return [] 399 | 400 | 401 | #https://stackoverflow.com/questions/37662180/interpolate-missing-values-2d-python 402 | def interpolate_missing_pixels( 403 | image: np.ndarray, 404 | mask: np.ndarray, 405 | method: str = 'nearest', 406 | fill_value: int = 0 407 | ): 408 | """ 409 | :param image: a 2D image 410 | :param mask: a 2D boolean image, True indicates missing values 411 | :param method: interpolation method, one of 412 | 'nearest', 'linear', 'cubic'. 413 | :param fill_value: which value to use for filling up data outside the 414 | convex hull of known pixel values. 415 | Default is 0, Has no effect for 'nearest'. 
416 | :return: the image with missing values interpolated 417 | """ 418 | assert len(image.shape) == 2, "should pass a 2D image" 419 | h, w = image.shape[:2] 420 | xx, yy = np.meshgrid(np.arange(w), np.arange(h)) 421 | 422 | known_x = xx[~mask] 423 | known_y = yy[~mask] 424 | known_v = image[~mask] 425 | missing_x = xx[mask] 426 | missing_y = yy[mask] 427 | 428 | interp_values = interpolate.griddata( 429 | (known_x, known_y), known_v, (missing_x, missing_y), 430 | method=method, fill_value=fill_value 431 | ) 432 | 433 | interp_image = image.copy() 434 | interp_image[missing_y, missing_x] = interp_values 435 | 436 | return interp_image --------------------------------------------------------------------------------
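A minimal usage sketch for the hole-filling helper interpolate_missing_pixels defined at the end of utils/frame_utils.py above. The toy depth array, the hole region, and running from the repository root (so that the utils package is importable) are illustrative assumptions, not taken from the repository:

    import numpy as np
    from utils.frame_utils import interpolate_missing_pixels

    # toy raw depth map in meters with a rectangular hole (0 marks missing measurements)
    depth = np.random.uniform(0.5, 2.0, size=(360, 640)).astype(np.float32)
    depth[100:140, 300:360] = 0.0

    missing = depth <= 0  # True where the sensor returned no value
    filled = interpolate_missing_pixels(depth, missing, method="nearest")

    # every previously-missing pixel now carries the value of its nearest valid neighbour
    assert np.all(filled[missing] > 0)

With method="nearest" the fill_value argument has no effect (as noted in the docstring); switching to "linear" or "cubic" interpolates smoothly inside the convex hull of valid pixels and uses fill_value outside it.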