├── utils_p ├── __init__.py ├── __pycache__ │ ├── losses.cpython-37.pyc │ ├── memory.cpython-37.pyc │ ├── prompt.cpython-37.pyc │ ├── __init__.cpython-37.pyc │ ├── convert.cpython-37.pyc │ └── metrics.cpython-37.pyc ├── prompt.py ├── losses.py ├── convert.py └── memory.py ├── vlnce_baselines ├── config │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ └── default.cpython-37.pyc │ ├── r2r_configs │ │ ├── test_set_inference.yaml │ │ ├── cma_sf.yaml │ │ ├── seq2seq_da.yaml │ │ ├── seq2seq_aug.yaml │ │ ├── seq2seq_pm.yaml │ │ ├── cma_aug.yaml │ │ ├── seq2seq_pm_aug.yaml │ │ ├── cma_da.yaml │ │ ├── cma_pm.yaml │ │ ├── cma_ss.yaml │ │ ├── cma_pm_da.yaml │ │ ├── cma_pm_aug.yaml │ │ ├── seq2seq_aug_tune.yaml │ │ ├── seq2seq.yaml │ │ ├── cma_aug_tune.yaml │ │ ├── cma_pm_aug_tune.yaml │ │ ├── cma_da_aug_tune.yaml │ │ ├── cma.yaml │ │ ├── seq2seq_pm_da_aug_tune.yaml │ │ └── cma_pm_da_aug_tune.yaml │ ├── nonlearning.yaml │ └── default.py ├── models │ ├── __init__.py │ ├── __pycache__ │ │ ├── policy.cpython-37.pyc │ │ ├── utils.cpython-37.pyc │ │ ├── __init__.cpython-37.pyc │ │ ├── graph_utils.cpython-37.pyc │ │ └── Policy_ViewSelection_ETP.cpython-37.pyc │ ├── etp │ │ ├── __pycache__ │ │ │ ├── nerf.cpython-37.pyc │ │ │ ├── vilmodel_cmt.cpython-37.pyc │ │ │ └── vlnbert_init.cpython-37.pyc │ │ ├── vlnbert_init.py │ │ └── nerf.py │ ├── encoders │ │ ├── __pycache__ │ │ │ ├── clip.cpython-37.pyc │ │ │ ├── resnet_encoders.cpython-37.pyc │ │ │ └── instruction_encoder.cpython-37.pyc │ │ ├── instruction_encoder.py │ │ ├── clip.py │ │ └── resnet_encoders.py │ ├── policy.py │ └── graph_utils.py ├── bert-base-uncased │ └── instruction ├── common │ ├── __pycache__ │ │ ├── ops.cpython-37.pyc │ │ ├── utils.cpython-37.pyc │ │ ├── aux_losses.cpython-37.pyc │ │ ├── env_utils.cpython-37.pyc │ │ ├── transformer.cpython-37.pyc │ │ ├── environments.cpython-37.pyc │ │ └── base_il_trainer.cpython-37.pyc │ ├── aux_losses.py │ ├── utils.py │ ├── ops.py │ ├── env_utils.py │ └── recollection_dataset.py ├── waypoint_networks │ ├── __pycache__ │ │ ├── utils.cpython-37.pyc │ │ ├── __init__.cpython-37.pyc │ │ ├── resnetUnet.cpython-37.pyc │ │ ├── viz_utils.cpython-37.pyc │ │ ├── semantic_grid.cpython-37.pyc │ │ └── img_segmentor_model.cpython-37.pyc │ ├── __init__.py │ ├── img_segmentor_model.py │ ├── resnetUnet.py │ ├── semantic_grid.py │ └── viz_utils.py ├── __init__.py └── utils.py ├── data └── instruction ├── img └── EWM.png ├── pretrained └── instruction ├── run_r2r ├── r2r_vlnce.yaml ├── iter_train.yaml └── main.bash ├── run.py ├── README.md └── environment.yaml /utils_p/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vlnce_baselines/config/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vlnce_baselines/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/instruction: -------------------------------------------------------------------------------- 1 | Please download the file 'data' and put it here. 
2 | -------------------------------------------------------------------------------- /img/EWM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/img/EWM.png -------------------------------------------------------------------------------- /pretrained/instruction: -------------------------------------------------------------------------------- 1 | Please download the file 'pretrained' and put it here. 2 | -------------------------------------------------------------------------------- /vlnce_baselines/bert-base-uncased/instruction: -------------------------------------------------------------------------------- 1 | Please download the file 'bert-base-uncased' and put it here. 2 | -------------------------------------------------------------------------------- /utils_p/__pycache__/losses.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/utils_p/__pycache__/losses.cpython-37.pyc -------------------------------------------------------------------------------- /utils_p/__pycache__/memory.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/utils_p/__pycache__/memory.cpython-37.pyc -------------------------------------------------------------------------------- /utils_p/__pycache__/prompt.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/utils_p/__pycache__/prompt.cpython-37.pyc -------------------------------------------------------------------------------- /utils_p/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/utils_p/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /utils_p/__pycache__/convert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/utils_p/__pycache__/convert.cpython-37.pyc -------------------------------------------------------------------------------- /utils_p/__pycache__/metrics.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/utils_p/__pycache__/metrics.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/common/__pycache__/ops.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/common/__pycache__/ops.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/common/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/common/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/models/__pycache__/policy.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/models/__pycache__/policy.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/models/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/models/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/config/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/config/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/config/__pycache__/default.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/config/__pycache__/default.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/models/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/models/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/models/etp/__pycache__/nerf.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/models/etp/__pycache__/nerf.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/common/__pycache__/aux_losses.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/common/__pycache__/aux_losses.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/common/__pycache__/env_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/common/__pycache__/env_utils.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/common/__pycache__/transformer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/common/__pycache__/transformer.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/models/__pycache__/graph_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/models/__pycache__/graph_utils.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/common/__pycache__/environments.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/common/__pycache__/environments.cpython-37.pyc 
-------------------------------------------------------------------------------- /vlnce_baselines/models/encoders/__pycache__/clip.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/models/encoders/__pycache__/clip.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/common/__pycache__/base_il_trainer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/common/__pycache__/base_il_trainer.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/models/etp/__pycache__/vilmodel_cmt.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/models/etp/__pycache__/vilmodel_cmt.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/models/etp/__pycache__/vlnbert_init.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/models/etp/__pycache__/vlnbert_init.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/waypoint_networks/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/waypoint_networks/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/waypoint_networks/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/waypoint_networks/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/waypoint_networks/__pycache__/resnetUnet.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/waypoint_networks/__pycache__/resnetUnet.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/waypoint_networks/__pycache__/viz_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/waypoint_networks/__pycache__/viz_utils.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/models/__pycache__/Policy_ViewSelection_ETP.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/models/__pycache__/Policy_ViewSelection_ETP.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/models/encoders/__pycache__/resnet_encoders.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/models/encoders/__pycache__/resnet_encoders.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/waypoint_networks/__pycache__/semantic_grid.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/waypoint_networks/__pycache__/semantic_grid.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/models/encoders/__pycache__/instruction_encoder.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/models/encoders/__pycache__/instruction_encoder.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/waypoint_networks/__pycache__/img_segmentor_model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/waypoint_networks/__pycache__/img_segmentor_model.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/__init__.py: -------------------------------------------------------------------------------- 1 | from vlnce_baselines import ss_trainer_ETP, dagger_trainer 2 | from vlnce_baselines.common import environments 3 | 4 | from vlnce_baselines.models import ( 5 | Policy_ViewSelection_ETP, 6 | ) 7 | -------------------------------------------------------------------------------- /vlnce_baselines/waypoint_networks/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from .resnetUnet import ResNetUNet 4 | from .img_segmentor_model import ImgSegmentor 5 | 6 | def get_img_segmentor_from_options(n_object_classes,img_segm_loss_scale): 7 | return ImgSegmentor(segmentation_model=ResNetUNet(n_channel_in=3, n_class_out=n_object_classes), 8 | loss_scale=img_segm_loss_scale) 9 | 10 | ''' 11 | Model ResNetUnet taken from: 12 | https://github.com/usuyama/pytorch-unet 13 | ''' -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/test_set_inference.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_PROCESSES: 1 5 | 6 | INFERENCE: 7 | SPLIT: test 8 | USE_CKPT_CONFIG: False 9 | SAMPLE: False 10 | CKPT_PATH: data/checkpoints/CMA_PM_DA_Aug.pth 11 | PREDICTIONS_FILE: predictions.json 12 | 13 | MODEL: 14 | policy_name: CMAPolicy 15 | 16 | INSTRUCTION_ENCODER: 17 | bidirectional: True 18 | 19 | CMA: 20 | use: True 21 | 22 | PROGRESS_MONITOR: 23 | use: True 24 | -------------------------------------------------------------------------------- /vlnce_baselines/config/nonlearning.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | EVAL: 3 | SPLIT: val_unseen 4 | # any num greater than the actual episode count evals every episode 5 | EPISODE_COUNT: 10 6 | EVAL_NONLEARNING: True 7 | NONLEARNING: 8 | # RandomAgent or HandcraftedAgent 9 | AGENT: RandomAgent 10 | 11 | INFERENCE: 12 | SPLIT: val_unseen 13 
| PREDICTIONS_FILE: predictions.json 14 | INFERENCE_NONLEARNING: True 15 | NONLEARNING: 16 | # RandomAgent or HandcraftedAgent 17 | AGENT: "RandomAgent" 18 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/cma_sf.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | SIMULATOR_GPU_IDS: [0] 3 | TORCH_GPU_ID: 0 4 | TORCH_GPU_IDS: [0] 5 | GPU_NUMBERS: 1 6 | NUM_ENVIRONMENTS: 1 7 | TENSORBOARD_DIR: data/tensorboard_dirs/cma_sf 8 | CHECKPOINT_FOLDER: data/checkpoints/cma_sf 9 | EVAL_CKPT_PATH_DIR: data/checkpoints/cma_sf 10 | 11 | EVAL: 12 | USE_CKPT_CONFIG: False 13 | SPLIT: val_unseen 14 | EPISODE_COUNT: -1 15 | 16 | IL: 17 | epochs: 50 18 | batch_size: 8 19 | schedule_ratio: 0.75 20 | decay_time: 10 21 | 22 | max_traj_len: 130 23 | 24 | MODEL: 25 | policy_name: CMAPolicyO 26 | 27 | INSTRUCTION_ENCODER: 28 | bidirectional: True 29 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/seq2seq_da.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_ENVIRONMENTS: 1 5 | TENSORBOARD_DIR: data/tensorboard_dirs/seq2seq_da 6 | CHECKPOINT_FOLDER: data/checkpoints/seq2seq_da 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/seq2seq_da 8 | 9 | EVAL: 10 | USE_CKPT_CONFIG: False 11 | SPLIT: val_unseen 12 | EPISODE_COUNT: -1 13 | 14 | IL: 15 | epochs: 4 16 | batch_size: 5 17 | 18 | DAGGER: 19 | iterations: 10 20 | update_size: 5000 21 | p: 0.75 22 | preload_lmdb_features: False 23 | lmdb_features_dir: data/trajectories_dirs/seq2seq_da/trajectories.lmdb 24 | 25 | MODEL: 26 | policy_name: Seq2SeqPolicy 27 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/seq2seq_aug.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task_aug.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_ENVIRONMENTS: 1 5 | TENSORBOARD_DIR: data/tensorboard_dirs/seq2seq_aug 6 | CHECKPOINT_FOLDER: data/checkpoints/seq2seq_aug 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/seq2seq_aug 8 | 9 | EVAL: 10 | USE_CKPT_CONFIG: False 11 | SPLIT: val_unseen 12 | EPISODE_COUNT: -1 13 | 14 | IL: 15 | epochs: 15 16 | batch_size: 5 17 | 18 | DAGGER: 19 | iterations: 1 20 | update_size: 157232 21 | p: 1.0 22 | preload_lmdb_features: False 23 | lmdb_features_dir: data/trajectories_dirs/seq2seq_aug/trajectories.lmdb 24 | 25 | MODEL: 26 | policy_name: Seq2SeqPolicy 27 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/seq2seq_pm.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_ENVIRONMENTS: 1 5 | TENSORBOARD_DIR: data/tensorboard_dirs/seq2seq_pm 6 | CHECKPOINT_FOLDER: data/checkpoints/seq2seq_pm 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/seq2seq_pm 8 | 9 | EVAL: 10 | USE_CKPT_CONFIG: False 11 | SPLIT: val_unseen 12 | EPISODE_COUNT: -1 13 | 14 | IL: 15 | epochs: 15 16 | batch_size: 5 17 | 18 | DAGGER: 19 | iterations: 1 20 | update_size: 10819 21 | p: 1.0 22 | preload_lmdb_features: False 23 | 
lmdb_features_dir: data/trajectories_dirs/seq2seq/trajectories.lmdb 24 | 25 | MODEL: 26 | policy_name: Seq2SeqPolicy 27 | 28 | PROGRESS_MONITOR: 29 | use: True 30 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/cma_aug.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task_aug.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_ENVIRONMENTS: 1 5 | TENSORBOARD_DIR: data/tensorboard_dirs/cma_aug 6 | CHECKPOINT_FOLDER: data/checkpoints/cma_aug 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/cma_aug 8 | 9 | EVAL: 10 | USE_CKPT_CONFIG: False 11 | SPLIT: val_unseen 12 | EPISODE_COUNT: -1 13 | 14 | IL: 15 | epochs: 45 16 | batch_size: 5 17 | 18 | DAGGER: 19 | iterations: 1 20 | update_size: 157232 21 | p: 1.0 22 | preload_lmdb_features: False 23 | lmdb_features_dir: data/trajectories_dirs/cma_aug/trajectories.lmdb 24 | 25 | MODEL: 26 | policy_name: CMAPolicy 27 | 28 | INSTRUCTION_ENCODER: 29 | bidirectional: True 30 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/seq2seq_pm_aug.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task_aug.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_ENVIRONMENTS: 1 5 | TENSORBOARD_DIR: data/tensorboard_dirs/seq2seq_pm_aug 6 | CHECKPOINT_FOLDER: data/checkpoints/seq2seq_pm_aug 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/seq2seq_pm_aug 8 | 9 | EVAL: 10 | USE_CKPT_CONFIG: False 11 | SPLIT: val_unseen 12 | EPISODE_COUNT: -1 13 | 14 | IL: 15 | epochs: 15 16 | batch_size: 5 17 | 18 | DAGGER: 19 | iterations: 1 20 | update_size: 157232 21 | p: 1.0 22 | preload_lmdb_features: False 23 | lmdb_features_dir: data/trajectories_dirs/seq2seq_aug/trajectories.lmdb 24 | 25 | MODEL: 26 | policy_name: Seq2SeqPolicy 27 | 28 | PROGRESS_MONITOR: 29 | use: True 30 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/cma_da.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | SIMULATOR_GPU_IDS: [0] 3 | TORCH_GPU_ID: 0 4 | TORCH_GPU_IDS: [0] 5 | GPU_NUMBERS: 1 6 | NUM_ENVIRONMENTS: 1 7 | TENSORBOARD_DIR: data/tensorboard_dirs/cma_da 8 | CHECKPOINT_FOLDER: data/checkpoints/cma_da 9 | EVAL_CKPT_PATH_DIR: data/checkpoints/cma_da 10 | 11 | EVAL: 12 | USE_CKPT_CONFIG: False 13 | SPLIT: val_unseen 14 | EPISODE_COUNT: -1 15 | 16 | IL: 17 | epochs: 4 18 | batch_size: 5 19 | 20 | DAGGER: 21 | iterations: 10 22 | update_size: 5000 23 | p: 0.75 24 | preload_lmdb_features: False 25 | lmdb_features_dir: data/trajectories_dirs/cma_da/trajectories.lmdb 26 | 27 | MODEL: 28 | policy_name: CMAPolicy 29 | 30 | INSTRUCTION_ENCODER: 31 | bidirectional: True 32 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/cma_pm.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_ENVIRONMENTS: 1 5 | TENSORBOARD_DIR: data/tensorboard_dirs/cma_pm 6 | CHECKPOINT_FOLDER: data/checkpoints/cma_pm 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/cma_pm 8 | 9 | EVAL: 10 | USE_CKPT_CONFIG: False 11 | SPLIT: 
val_unseen 12 | EPISODE_COUNT: -1 13 | 14 | IL: 15 | epochs: 45 16 | batch_size: 5 17 | 18 | DAGGER: 19 | iterations: 1 20 | update_size: 10819 21 | p: 1.0 22 | preload_lmdb_features: False 23 | lmdb_features_dir: data/trajectories_dirs/cma/trajectories.lmdb 24 | 25 | MODEL: 26 | policy_name: CMAPolicy 27 | 28 | INSTRUCTION_ENCODER: 29 | bidirectional: True 30 | 31 | PROGRESS_MONITOR: 32 | use: True 33 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/cma_ss.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | SIMULATOR_GPU_IDS: [0] 3 | TORCH_GPU_ID: 0 4 | TORCH_GPU_IDS: [0] 5 | TRAINER_NAME: ss 6 | GPU_NUMBERS: 1 7 | NUM_ENVIRONMENTS: 1 8 | TENSORBOARD_DIR: data/tensorboard_dirs/cma_ss 9 | CHECKPOINT_FOLDER: data/checkpoints/cma_ss 10 | EVAL_CKPT_PATH_DIR: data/checkpoints/cma_ss 11 | 12 | EVAL: 13 | USE_CKPT_CONFIG: False 14 | SPLIT: val_unseen 15 | EPISODE_COUNT: -1 16 | 17 | #RL: 18 | # POLICY: 19 | # OBS_TRANSFORMS: 20 | # ENABLED_TRANSFORMS: [Resize] 21 | 22 | IL: 23 | epochs: 50 24 | batch_size: 8 25 | schedule_ratio: 0.75 26 | decay_time: 10 27 | 28 | max_traj_len: 130 29 | 30 | MODEL: 31 | policy_name: CMAPolicyO 32 | 33 | INSTRUCTION_ENCODER: 34 | bidirectional: True 35 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/cma_pm_da.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_ENVIRONMENTS: 1 5 | TENSORBOARD_DIR: data/tensorboard_dirs/cma_pm_da 6 | CHECKPOINT_FOLDER: data/checkpoints/cma_pm_da 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/cma_pm_da 8 | 9 | EVAL: 10 | USE_CKPT_CONFIG: False 11 | SPLIT: val_unseen 12 | EPISODE_COUNT: -1 13 | 14 | IL: 15 | epochs: 4 16 | batch_size: 5 17 | 18 | DAGGER: 19 | iterations: 10 20 | update_size: 5000 21 | p: 0.75 22 | preload_lmdb_features: False 23 | lmdb_features_dir: data/trajectories_dirs/cma_pm_da/trajectories.lmdb 24 | 25 | MODEL: 26 | policy_name: CMAPolicy 27 | 28 | INSTRUCTION_ENCODER: 29 | bidirectional: True 30 | 31 | PROGRESS_MONITOR: 32 | use: True 33 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/cma_pm_aug.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task_aug.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_ENVIRONMENTS: 1 5 | TENSORBOARD_DIR: data/tensorboard_dirs/cma_pm_aug 6 | CHECKPOINT_FOLDER: data/checkpoints/cma_pm_aug 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/cma_pm_aug 8 | 9 | EVAL: 10 | USE_CKPT_CONFIG: False 11 | SPLIT: val_unseen 12 | EPISODE_COUNT: -1 13 | 14 | IL: 15 | epochs: 45 16 | batch_size: 5 17 | 18 | DAGGER: 19 | iterations: 1 20 | update_size: 157232 21 | p: 1.0 22 | preload_lmdb_features: False 23 | lmdb_features_dir: data/trajectories_dirs/cma_aug/trajectories.lmdb 24 | 25 | MODEL: 26 | policy_name: CMAPolicy 27 | 28 | INSTRUCTION_ENCODER: 29 | bidirectional: True 30 | 31 | PROGRESS_MONITOR: 32 | use: True 33 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/seq2seq_aug_tune.yaml: -------------------------------------------------------------------------------- 1 | 
BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_ENVIRONMENTS: 1 5 | TENSORBOARD_DIR: data/tensorboard_dirs/seq2seq_aug_tune 6 | CHECKPOINT_FOLDER: data/checkpoints/seq2seq_aug_tune 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/seq2seq_aug_tune 8 | 9 | EVAL: 10 | USE_CKPT_CONFIG: False 11 | SPLIT: val_unseen 12 | EPISODE_COUNT: -1 13 | 14 | IL: 15 | epochs: 15 16 | batch_size: 5 17 | 18 | DAGGER: 19 | iterations: 1 20 | update_size: 10819 21 | p: 1.0 22 | preload_lmdb_features: False 23 | lmdb_features_dir: data/trajectories_dirs/seq2seq/trajectories.lmdb 24 | load_from_ckpt: True 25 | ckpt_to_load: data/checkpoints/seq2seq_aug/best_checkpoint.pth # REPLACE 26 | 27 | MODEL: 28 | policy_name: Seq2SeqPolicy 29 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/seq2seq.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | TRAINER_NAME: dagger # recollect_trainer 3 | SIMULATOR_GPU_ID: 0 4 | TORCH_GPU_ID: 0 5 | NUM_ENVIRONMENTS: 1 6 | TENSORBOARD_DIR: data/tensorboard_dirs/seq2seq 7 | CHECKPOINT_FOLDER: data/checkpoints/seq2seq 8 | EVAL_CKPT_PATH_DIR: data/checkpoints/seq2seq 9 | 10 | EVAL: 11 | USE_CKPT_CONFIG: False 12 | SPLIT: val_unseen 13 | EPISODE_COUNT: -1 14 | 15 | IL: 16 | epochs: 15 17 | batch_size: 5 18 | 19 | RECOLLECT_TRAINER: 20 | gt_file: 21 | data/datasets/R2R_VLNCE_v1-2_preprocessed/{split}/{split}_gt.json.gz 22 | 23 | DAGGER: 24 | iterations: 1 25 | update_size: 10819 26 | p: 1.0 27 | preload_lmdb_features: False 28 | lmdb_features_dir: data/trajectories_dirs/seq2seq/trajectories.lmdb 29 | 30 | MODEL: 31 | policy_name: Seq2SeqPolicy 32 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/cma_aug_tune.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_ENVIRONMENTS: 1 5 | TENSORBOARD_DIR: data/tensorboard_dirs/cma_aug_tune 6 | CHECKPOINT_FOLDER: data/checkpoints/cma_aug_tune 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/cma_aug_tune 8 | 9 | EVAL: 10 | USE_CKPT_CONFIG: False 11 | SPLIT: val_unseen 12 | EPISODE_COUNT: -1 13 | 14 | IL: 15 | epochs: 45 16 | batch_size: 5 17 | 18 | DAGGER: 19 | iterations: 1 20 | update_size: 10819 21 | p: 1.0 22 | preload_lmdb_features: False 23 | lmdb_features_dir: data/trajectories_dirs/cma/trajectories.lmdb 24 | load_from_ckpt: True 25 | ckpt_to_load: data/checkpoints/cma_aug/best_checkpoint.pth # REPLACE 26 | 27 | MODEL: 28 | policy_name: CMAPolicy 29 | 30 | INSTRUCTION_ENCODER: 31 | bidirectional: True 32 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/cma_pm_aug_tune.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_ENVIRONMENTS: 1 5 | TENSORBOARD_DIR: data/tensorboard_dirs/cma_pm_aug_tune 6 | CHECKPOINT_FOLDER: data/checkpoints/cma_pm_aug_tune 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/cma_pm_aug_tune 8 | 9 | EVAL: 10 | USE_CKPT_CONFIG: False 11 | SPLIT: val_unseen 12 | EPISODE_COUNT: -1 13 | 14 | IL: 15 | epochs: 45 16 | batch_size: 5 17 | 18 | DAGGER: 19 | iterations: 1 
20 | update_size: 10819 21 | p: 1.0 22 | preload_lmdb_features: False 23 | lmdb_features_dir: data/trajectories_dirs/cma/trajectories.lmdb 24 | load_from_ckpt: True 25 | ckpt_to_load: data/checkpoints/cma_pm_aug/best_checkpoint.pth # REPLACE 26 | 27 | MODEL: 28 | policy_name: CMAPolicy 29 | 30 | INSTRUCTION_ENCODER: 31 | bidirectional: True 32 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/cma_da_aug_tune.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_ENVIRONMENTS: 1 5 | TENSORBOARD_DIR: data/tensorboard_dirs/cma_da_aug_tune 6 | CHECKPOINT_FOLDER: data/checkpoints/cma_da_aug_tune 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/cma_da_aug_tune 8 | 9 | EVAL: 10 | USE_CKPT_CONFIG: False 11 | SPLIT: val_unseen 12 | EPISODE_COUNT: -1 13 | 14 | IL: 15 | epochs: 4 16 | batch_size: 5 17 | 18 | DAGGER: 19 | iterations: 10 20 | update_size: 5000 21 | p: 0.5 22 | preload_lmdb_features: False 23 | lmdb_features_dir: data/trajectories_dirs/cma_da_aug_tune/trajectories.lmdb 24 | load_from_ckpt: True 25 | ckpt_to_load: data/checkpoints/cma_aug_tune/best_checkpoint.pth # REPLACE 26 | 27 | MODEL: 28 | policy_name: CMAPolicy 29 | 30 | INSTRUCTION_ENCODER: 31 | bidirectional: True 32 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/cma.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | TRAINER_NAME: dagger # recollect_trainer 3 | SIMULATOR_GPU_IDS: [0] 4 | TORCH_GPU_ID: 0 5 | GPU_NUMBERS: 1 6 | NUM_ENVIRONMENTS: 1 7 | TENSORBOARD_DIR: data/tensorboard_dirs/cma 8 | CHECKPOINT_FOLDER: data/checkpoints/cma 9 | EVAL_CKPT_PATH_DIR: data/checkpoints/cma 10 | 11 | EVAL: 12 | USE_CKPT_CONFIG: False 13 | SPLIT: val_unseen 14 | EPISODE_COUNT: -1 15 | 16 | IL: 17 | epochs: 45 18 | batch_size: 5 19 | 20 | RECOLLECT_TRAINER: 21 | gt_file: 22 | data/datasets/R2R_VLNCE_v1-2_preprocessed/{split}/{split}_gt.json.gz 23 | 24 | DAGGER: 25 | iterations: 1 26 | update_size: 10819 27 | p: 1.0 28 | preload_lmdb_features: False 29 | lmdb_features_dir: data/trajectories_dirs/cma/trajectories.lmdb 30 | 31 | MODEL: 32 | policy_name: CMAPolicy 33 | 34 | INSTRUCTION_ENCODER: 35 | bidirectional: True 36 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/seq2seq_pm_da_aug_tune.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_ENVIRONMENTS: 1 5 | TENSORBOARD_DIR: data/tensorboard_dirs/seq2seq_pm_da_aug_tune 6 | CHECKPOINT_FOLDER: data/checkpoints/seq2seq_pm_da_aug_tune 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/seq2seq_pm_da_aug_tune 8 | 9 | EVAL: 10 | USE_CKPT_CONFIG: False 11 | SPLIT: val_unseen 12 | EPISODE_COUNT: -1 13 | 14 | IL: 15 | epochs: 4 16 | batch_size: 5 17 | 18 | DAGGER: 19 | iterations: 10 20 | update_size: 5000 21 | p: 0.75 22 | preload_lmdb_features: False 23 | lmdb_features_dir: data/trajectories_dirs/seq2seq_pm_da_aug_tune/trajectories.lmdb 24 | load_from_ckpt: True 25 | ckpt_to_load: data/checkpoints/seq2seq_pm_aug/best_checkpoint.pth # REPLACE 26 | 27 | MODEL: 28 | policy_name: Seq2SeqPolicy 29 | 30 | 
PROGRESS_MONITOR: 31 | use: True 32 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/cma_pm_da_aug_tune.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_ENVIRONMENTS: 1 5 | TENSORBOARD_DIR: data/tensorboard_dirs/cma_pm_da_aug_tune 6 | CHECKPOINT_FOLDER: data/checkpoints/cma_pm_da_aug_tune 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/cma_pm_da_aug_tune 8 | 9 | EVAL: 10 | USE_CKPT_CONFIG: False 11 | SPLIT: val_unseen 12 | EPISODE_COUNT: -1 13 | 14 | IL: 15 | epochs: 4 16 | batch_size: 5 17 | 18 | DAGGER: 19 | iterations: 10 20 | update_size: 5000 21 | p: 0.5 22 | preload_lmdb_features: False 23 | lmdb_features_dir: data/trajectories_dirs/cma_pm_da_aug_tune/trajectories.lmdb 24 | load_from_ckpt: True 25 | ckpt_to_load: data/checkpoints/cma_pm_aug/best_checkpoint.pth # REPLACE 26 | 27 | MODEL: 28 | policy_name: CMAPolicy 29 | 30 | INSTRUCTION_ENCODER: 31 | bidirectional: True 32 | 33 | PROGRESS_MONITOR: 34 | use: True 35 | -------------------------------------------------------------------------------- /vlnce_baselines/common/aux_losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class _AuxLosses: 5 | def __init__(self): 6 | self._losses = {} 7 | self._loss_alphas = {} 8 | self._is_active = False 9 | 10 | def clear(self): 11 | self._losses.clear() 12 | self._loss_alphas.clear() 13 | 14 | def register_loss(self, name, loss, alpha=1.0): 15 | assert self.is_active() 16 | assert name not in self._losses 17 | 18 | self._losses[name] = loss 19 | self._loss_alphas[name] = alpha 20 | 21 | def get_loss(self, name): 22 | return self._losses[name] 23 | 24 | def reduce(self, mask): 25 | assert self.is_active() 26 | total = torch.tensor(0.0).cuda() 27 | 28 | for k in self._losses.keys(): 29 | k_loss = torch.masked_select(self._losses[k], mask).mean() 30 | total = total + self._loss_alphas[k] * k_loss 31 | 32 | return total 33 | 34 | def is_active(self): 35 | return self._is_active 36 | 37 | def activate(self): 38 | self._is_active = True 39 | 40 | def deactivate(self): 41 | self._is_active = False 42 | 43 | 44 | AuxLosses = _AuxLosses() 45 | -------------------------------------------------------------------------------- /vlnce_baselines/waypoint_networks/img_segmentor_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class ImgSegmentor(nn.Module): 7 | 8 | def __init__(self, segmentation_model, loss_scale): 9 | super(ImgSegmentor, self).__init__() 10 | self._segmentation_model = segmentation_model 11 | self.loss_scale = loss_scale 12 | 13 | self.cel_loss = nn.CrossEntropyLoss() 14 | 15 | 16 | 17 | def forward(self, batch): 18 | 19 | imgs = batch['images'] 20 | B, T, _, H, W = imgs.shape 21 | 22 | pred_segm_raw = self._segmentation_model(imgs) 23 | 24 | C = pred_segm_raw.shape[1] 25 | 26 | # Get a prob distribution over the labels 27 | pred_segm_raw = pred_segm_raw.view(B,T,C,H,W) 28 | pred_segm = F.softmax(pred_segm_raw, dim=2) 29 | 30 | output = {'pred_segm_raw':pred_segm_raw, 31 | 'pred_segm':pred_segm} 32 | 33 | return output 34 | 35 | 36 | def loss_cel(self, batch, pred_outputs): 37 | pred_segm_raw = pred_outputs['pred_segm_raw'] 38 | B, T, C, H, W = pred_segm_raw.shape 39 | 40 | 
gt_segm = batch['gt_segm'] 41 | pred_segm_loss = self.cel_loss(input=pred_segm_raw.view(B*T,C,H,W), target=gt_segm.view(B*T,H,W)) 42 | 43 | pred_segm_err = pred_segm_loss.clone().detach() 44 | 45 | output={} 46 | output['pred_segm_err'] = pred_segm_err 47 | output['pred_segm_loss'] = self.loss_scale * pred_segm_loss 48 | return output -------------------------------------------------------------------------------- /run_r2r/r2r_vlnce.yaml: -------------------------------------------------------------------------------- 1 | ENVIRONMENT: 2 | MAX_EPISODE_STEPS: 5000 3 | 4 | SIMULATOR: 5 | ACTION_SPACE_CONFIG: v0 6 | AGENT_0: 7 | SENSORS: [RGB_SENSOR, DEPTH_SENSOR] 8 | FORWARD_STEP_SIZE: 0.25 9 | TURN_ANGLE: 15 10 | HABITAT_SIM_V0: 11 | GPU_DEVICE_ID: 0 12 | ALLOW_SLIDING: True 13 | RGB_SENSOR: 14 | WIDTH: 224 15 | HEIGHT: 224 16 | HFOV: 90 17 | TYPE: HabitatSimRGBSensor 18 | DEPTH_SENSOR: 19 | WIDTH: 256 # pretrained DDPPO resnet needs 256x256 20 | HEIGHT: 256 21 | HFOV: 90 22 | TYPE: 23 | Sim-v1 24 | 25 | TASK: 26 | TYPE: VLN-v0 27 | POSSIBLE_ACTIONS: [STOP, MOVE_FORWARD, TURN_LEFT, TURN_RIGHT, HIGHTOLOW] 28 | SUCCESS_DISTANCE: 3.0 29 | SENSORS: [ 30 | INSTRUCTION_SENSOR, 31 | # SHORTEST_PATH_SENSOR, 32 | # VLN_ORACLE_PROGRESS_SENSOR 33 | ] 34 | INSTRUCTION_SENSOR_UUID: instruction 35 | MEASUREMENTS: [ 36 | # DISTANCE_TO_GOAL, 37 | # SUCCESS, 38 | # SPL, 39 | # NDTW, 40 | # PATH_LENGTH, 41 | # ORACLE_SUCCESS, 42 | # STEPS_TAKEN 43 | ] 44 | SUCCESS: 45 | SUCCESS_DISTANCE: 3.0 46 | SPL: 47 | SUCCESS_DISTANCE: 3.0 48 | NDTW: 49 | SUCCESS_DISTANCE: 3.0 50 | GT_PATH: data/datasets/R2R_VLNCE_v1-2_preprocessed/{split}/{split}_gt.json.gz 51 | SDTW: 52 | SUCCESS_DISTANCE: 3.0 53 | GT_PATH: data/datasets/R2R_VLNCE_v1-2_preprocessed/{split}/{split}_gt.json.gz 54 | ORACLE_SUCCESS: 55 | SUCCESS_DISTANCE: 3.0 56 | 57 | DATASET: 58 | TYPE: VLN-CE-v1 59 | SPLIT: train 60 | DATA_PATH: data/datasets/R2R_VLNCE_v1-2_preprocessed_BERTidx/{split}/{split}_bertidx.json.gz 61 | SCENES_DIR: data/scene_datasets/ 62 | -------------------------------------------------------------------------------- /run_r2r/iter_train.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: run_r2r/r2r_vlnce.yaml 2 | SIMULATOR_GPU_IDS: [0] 3 | TORCH_GPU_ID: 0 4 | TORCH_GPU_IDS: [0] 5 | TRAINER_NAME: SS-ETP 6 | GPU_NUMBERS: 1 7 | NUM_ENVIRONMENTS: 1 8 | TENSORBOARD_DIR: data/logs/tensorboard_dirs/ 9 | CHECKPOINT_FOLDER: data/logs/checkpoints/ 10 | EVAL_CKPT_PATH_DIR: data/logs/checkpoints/ 11 | RESULTS_DIR: data/logs/eval_results/ 12 | VIDEO_DIR: data/logs/video/ 13 | 14 | INFERENCE: 15 | SPLIT: test 16 | USE_CKPT_CONFIG: False 17 | SAMPLE: False 18 | CKPT_PATH: '' # REPLACE THIS 19 | PREDICTIONS_FILE: '' 20 | FORMAT: r2r 21 | EPISODE_COUNT: -1 22 | 23 | EVAL: 24 | USE_CKPT_CONFIG: False 25 | SPLIT: val_unseen 26 | EPISODE_COUNT: -1 27 | CKPT_PATH_DIR: '' 28 | fast_eval: False 29 | 30 | IL: 31 | iters: 30000 32 | log_every: 500 33 | lr: 1e-5 34 | batch_size: 1 # equal to NUM_ENVIRONMENTS 35 | ml_weight: 1.0 36 | expert_policy: spl 37 | 38 | sample_ratio: 0.75 39 | decay_interval: 3000 40 | 41 | max_traj_len: 30 42 | max_text_len: 80 43 | loc_noise: 0.5 44 | waypoint_aug: False 45 | ghost_aug: 0.0 46 | back_algo: teleport 47 | # back_algo: control 48 | tryout: True 49 | 50 | MODEL: 51 | task_type: r2r 52 | 53 | policy_name: PolicyViewSelectionETP 54 | NUM_ANGLES: 12 55 | pretrained_path: pretrained/model_step_82500.pt 56 | fix_lang_embedding: False 57 | fix_pano_embedding: False 58 | 
use_depth_embedding: True 59 | use_sprels: True 60 | merge_ghost: True 61 | consume_ghost: True 62 | 63 | spatial_output: False 64 | RGB_ENCODER: 65 | output_size: 512 66 | DEPTH_ENCODER: 67 | output_size: 256 68 | VISUAL_DIM: 69 | vis_hidden: 768 70 | directional: 128 71 | INSTRUCTION_ENCODER: 72 | bidirectional: True 73 | 74 | -------------------------------------------------------------------------------- /run_r2r/main.bash: -------------------------------------------------------------------------------- 1 | export GLOG_minloglevel=2 2 | export MAGNUM_LOG=quiet 3 | 4 | flag1="--exp_name release_r2r 5 | --run-type train 6 | --exp-config run_r2r/iter_train.yaml 7 | SIMULATOR_GPU_IDS [0] 8 | TORCH_GPU_IDS [0] 9 | GPU_NUMBERS 1 10 | NUM_ENVIRONMENTS 1 11 | IL.iters 29000 12 | IL.lr 1e-5 13 | IL.log_every 500 14 | IL.ml_weight 1.0 15 | IL.sample_ratio 0.75 16 | IL.decay_interval 4000 17 | IL.load_from_ckpt True 18 | IL.is_requeue True 19 | IL.waypoint_aug True 20 | IL.ckpt_to_load data/checkpoints/ckpt.iter25000.pth 21 | TASK_CONFIG.SIMULATOR.HABITAT_SIM_V0.ALLOW_SLIDING True 22 | MODEL.pretrained_path pretrained/model_step_100000.pt 23 | " 24 | 25 | flag2=" --exp_name release_r2r 26 | --run-type eval 27 | --exp-config run_r2r/iter_train.yaml 28 | SIMULATOR_GPU_IDS [0] 29 | TORCH_GPU_IDS [0] 30 | GPU_NUMBERS 1 31 | NUM_ENVIRONMENTS 1 32 | TASK_CONFIG.SIMULATOR.HABITAT_SIM_V0.ALLOW_SLIDING True 33 | EVAL.CKPT_PATH_DIR data/checkpoints/ckpt.pth 34 | MODEL.pretrained_path pretrained/model_step_100000.pt 35 | IL.back_algo control 36 | " 37 | 38 | flag3="--exp_name release_r2r 39 | --run-type inference 40 | --exp-config run_r2r/iter_train.yaml 41 | SIMULATOR_GPU_IDS [0] 42 | TORCH_GPU_IDS [0] 43 | GPU_NUMBERS 1 44 | NUM_ENVIRONMENTS 1 45 | TASK_CONFIG.SIMULATOR.HABITAT_SIM_V0.ALLOW_SLIDING True 46 | INFERENCE.CKPT_PATH data/checkpoints/ckpt.pth 47 | INFERENCE.PREDICTIONS_FILE preds.json 48 | MODEL.pretrained_path pretrained/model_step_100000.pt 49 | IL.back_algo control 50 | " 51 | 52 | mode=$1 53 | case $mode in 54 | train) 55 | echo "###### train mode ######" 56 | CUDA_VISIBLE_DEVICES='7' python run.py $flag1 57 | ;; 58 | eval) 59 | echo "###### eval mode ######" 60 | #CUDA_VISIBLE_DEVICES='5' python -m pdb run.py $flag2 61 | CUDA_VISIBLE_DEVICES='0' python run.py $flag2 62 | ;; 63 | infer) 64 | echo "###### infer mode ######" 65 | CUDA_VISIBLE_DEVICES='7' python run.py $flag3 66 | ;; 67 | esac -------------------------------------------------------------------------------- /vlnce_baselines/common/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List 2 | import torch 3 | import torch.distributed as dist 4 | import numpy as np 5 | import copy 6 | import math 7 | 8 | def extract_instruction_tokens( 9 | observations: List[Dict], 10 | instruction_sensor_uuid: str, 11 | tokens_uuid: str = "tokens", 12 | max_length: int = 512, 13 | pad_id: int = 0, 14 | ): 15 | """Extracts instruction tokens from an instruction sensor if the tokens 16 | exist and are in a dict structure.""" 17 | if instruction_sensor_uuid not in observations[0]: 18 | return observations 19 | 20 | for i in range(len(observations)): 21 | if ( 22 | isinstance(observations[i][instruction_sensor_uuid], dict) 23 | and tokens_uuid in observations[i][instruction_sensor_uuid] 24 | ): 25 | token = observations[i][instruction_sensor_uuid]["tokens"][:max_length] 26 | if len(token) < max_length: 27 | token += [pad_id] * (max_length - len(token)) 28 | 
observations[i][instruction_sensor_uuid] = token 29 | else: 30 | break 31 | return observations 32 | 33 | def gather_list_and_concat(list_of_nums,world_size): 34 | if not torch.is_tensor(list_of_nums): 35 | tensor = torch.Tensor(list_of_nums).cuda() 36 | else: 37 | if list_of_nums.is_cuda == False: 38 | tensor = list_of_nums.cuda() 39 | else: 40 | tensor = list_of_nums 41 | gather_t = [torch.ones_like(tensor) for _ in 42 | range(world_size)] 43 | dist.all_gather(gather_t, tensor) 44 | return gather_t 45 | 46 | def dis_to_con(path, amount=0.25): 47 | starts = path[:-1] 48 | ends = path[1:] 49 | new_path = [path[0]] 50 | for s, e in zip(starts,ends): 51 | vec = np.array(e) - np.array(s) 52 | ratio = amount/np.linalg.norm(vec[[0,2]]) 53 | unit = vec*ratio 54 | times = int(1/ratio) 55 | for i in range(times): 56 | if i != times - 1: 57 | location = np.array(new_path[-1])+unit 58 | new_path.append(location.tolist()) 59 | new_path.append(e) 60 | 61 | return new_path 62 | 63 | def get_camera_orientations12(): 64 | base_angle_deg = 30 65 | base_angle_rad = math.pi / 6 66 | orient_dict = {} 67 | for k in range(1,12): 68 | orient_dict[str(base_angle_deg*k)] = [0.0, base_angle_rad*k, 0.0] 69 | return orient_dict -------------------------------------------------------------------------------- /vlnce_baselines/common/ops.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .transformer import TransformerEncoder, TransformerEncoderLayer 4 | 5 | try: 6 | from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm 7 | except (ImportError, AttributeError) as e: 8 | # logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .") 9 | BertLayerNorm = torch.nn.LayerNorm 10 | 11 | def create_transformer_encoder(config, num_layers, norm=False): 12 | enc_layer = TransformerEncoderLayer( 13 | config.hidden_size, config.num_attention_heads, 14 | dim_feedforward=config.intermediate_size, 15 | dropout=config.hidden_dropout_prob, 16 | activation=config.hidden_act, 17 | normalize_before=True 18 | ) 19 | if norm: 20 | norm_layer = BertLayerNorm(config.hidden_size, eps=1e-12) 21 | else: 22 | norm_layer = None 23 | return TransformerEncoder(enc_layer, num_layers, norm=norm_layer, batch_first=True) 24 | 25 | def extend_neg_masks(masks, dtype=None): 26 | """ 27 | mask from (N, L) into (N, 1(H), 1(L), L) and make it negative 28 | """ 29 | if dtype is None: 30 | dtype = torch.float 31 | extended_masks = masks.unsqueeze(1).unsqueeze(2) 32 | extended_masks = extended_masks.to(dtype=dtype) 33 | extended_masks = (1.0 - extended_masks) * -10000.0 34 | return extended_masks 35 | 36 | def gen_seq_masks(seq_lens, max_len=None): 37 | if max_len is None: 38 | max_len = max(seq_lens) 39 | batch_size = len(seq_lens) 40 | device = seq_lens.device 41 | 42 | masks = torch.arange(max_len).unsqueeze(0).repeat(batch_size, 1).to(device) 43 | masks = masks < seq_lens.unsqueeze(1) 44 | return masks 45 | 46 | def pad_tensors_wgrad(tensors, lens=None): 47 | """B x [T, ...] 
torch tensors""" 48 | if lens is None: 49 | lens = [t.size(0) for t in tensors] 50 | max_len = max(lens) 51 | batch_size = len(tensors) 52 | hid = list(tensors[0].size()[1:]) 53 | 54 | device = tensors[0].device 55 | dtype = tensors[0].dtype 56 | 57 | output = [] 58 | for i in range(batch_size): 59 | if lens[i] < max_len: 60 | tmp = torch.cat( 61 | [tensors[i], torch.zeros([max_len-lens[i]]+hid, dtype=dtype).to(device)], 62 | dim=0 63 | ) 64 | else: 65 | tmp = tensors[i] 66 | output.append(tmp) 67 | output = torch.stack(output, 0) 68 | return output 69 | -------------------------------------------------------------------------------- /vlnce_baselines/models/etp/vlnbert_init.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_tokenizer(args): 5 | from transformers import AutoTokenizer 6 | if args.dataset == 'rxr' or args.tokenizer == 'xlm': 7 | cfg_name = 'bert_config/xlm-roberta-base' 8 | else: 9 | cfg_name = 'bert_config/bert-base-uncased' 10 | tokenizer = AutoTokenizer.from_pretrained(cfg_name) 11 | return tokenizer 12 | 13 | def get_vlnbert_models(config=None): 14 | 15 | from transformers import PretrainedConfig, BertTokenizer, BertModel 16 | from vlnce_baselines.models.etp.vilmodel_cmt import GlocalTextPathNavCMT 17 | 18 | model_class = GlocalTextPathNavCMT 19 | 20 | model_name_or_path = config.pretrained_path 21 | new_ckpt_weights = {} 22 | if model_name_or_path is not None: 23 | ckpt_weights = torch.load(model_name_or_path, map_location='cpu') 24 | for k, v in ckpt_weights.items(): 25 | if k.startswith('module'): 26 | new_ckpt_weights[k[7:]] = v 27 | if 'sap_head' in k: 28 | new_ckpt_weights['bert.' + k] = v 29 | else: 30 | new_ckpt_weights[k] = v 31 | 32 | if config.task_type == 'r2r': 33 | #cfg_name = 'bert_config/bert-base-uncased' 34 | cfg_name = 'vlnce_baselines/bert-base-uncased' 35 | elif config.task_type == 'rxr': 36 | cfg_name = 'bert_config/xlm-roberta-base' 37 | vis_config = PretrainedConfig.from_pretrained(cfg_name) 38 | 39 | if config.task_type == 'rxr': 40 | vis_config.type_vocab_size = 2 41 | 42 | vis_config.max_action_steps = 100 43 | vis_config.image_feat_size = 512 44 | vis_config.use_depth_embedding = config.use_depth_embedding 45 | vis_config.depth_feat_size = 128 46 | vis_config.angle_feat_size = 4 47 | 48 | vis_config.num_l_layers = 9 49 | vis_config.num_pano_layers = 2 50 | vis_config.num_x_layers = 4 51 | vis_config.graph_sprels = config.use_sprels 52 | vis_config.glocal_fuse = 'global' 53 | 54 | vis_config.fix_lang_embedding = config.fix_lang_embedding 55 | vis_config.fix_pano_embedding = config.fix_pano_embedding 56 | 57 | vis_config.update_lang_bert = not vis_config.fix_lang_embedding 58 | vis_config.output_attentions = True 59 | vis_config.pred_head_dropout_prob = 0.1 60 | vis_config.use_lang2visn_attn = False 61 | 62 | visual_model = model_class.from_pretrained( 63 | pretrained_model_name_or_path=None, 64 | config=vis_config, 65 | state_dict=new_ckpt_weights) 66 | 67 | return visual_model 68 | -------------------------------------------------------------------------------- /vlnce_baselines/models/policy.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from typing import Any 3 | 4 | from habitat_baselines.rl.ppo.policy import Policy 5 | from habitat_baselines.utils.common import ( 6 | CategoricalNet, 7 | CustomFixedCategorical, 8 | ) 9 | from torch.distributions import Categorical 10 | 11 | 12 | class ILPolicy(Policy, 
metaclass=abc.ABCMeta): 13 | def __init__(self, net, dim_actions): 14 | r"""Defines an imitation learning policy as having functions act() and 15 | build_distribution(). 16 | """ 17 | super(Policy, self).__init__() 18 | self.net = net 19 | self.dim_actions = dim_actions 20 | 21 | # self.action_distribution = CategoricalNet( 22 | # self.net.output_size, self.dim_actions 23 | # ) 24 | 25 | def forward(self, *x): 26 | raise NotImplementedError 27 | 28 | def act( 29 | self, 30 | observations, 31 | rnn_hidden_states, 32 | prev_actions, 33 | masks, 34 | deterministic=False, 35 | ): 36 | 37 | print('need to revise for CMA and VLNBERT') 38 | import pdb; pdb.set_trace() 39 | 40 | features, rnn_hidden_states = self.net( 41 | observations, rnn_hidden_states, prev_actions, masks 42 | ) 43 | distribution = self.action_distribution(features) 44 | 45 | # if distribution.logit 46 | if deterministic: 47 | action = distribution.mode() 48 | else: 49 | action = distribution.sample() 50 | 51 | return action, rnn_hidden_states 52 | 53 | def get_value(self, *args: Any, **kwargs: Any): 54 | raise NotImplementedError 55 | 56 | def evaluate_actions(self, *args: Any, **kwargs: Any): 57 | raise NotImplementedError 58 | 59 | def build_distribution( 60 | self, observations, rnn_hidden_states, prev_actions, masks 61 | ) -> CustomFixedCategorical: 62 | features, rnn_hidden_states = self.net( 63 | observations, rnn_hidden_states, prev_actions, masks 64 | ) 65 | return self.action_distribution(features) 66 | 67 | def act2( 68 | self, 69 | observations, 70 | rnn_hidden_states, 71 | prev_actions, 72 | masks, 73 | deterministic=False, 74 | ): 75 | 76 | print('need to revise for CMA and VLNBERT') 77 | import pdb; pdb.set_trace() 78 | 79 | feature_rgb, feature_depth, rnn_hidden_states = self.net( 80 | observations, rnn_hidden_states, prev_actions, masks 81 | ) 82 | distribution_rgb = self.action_distribution(feature_rgb) 83 | distribution_depth = self.action_distribution(feature_depth) 84 | 85 | probs = (distribution_rgb.probs + distribution_depth.probs)/2 86 | # if distribution.logit 87 | if deterministic: 88 | action = probs.argmax(dim=-1, keepdim=True) 89 | else: 90 | action = Categorical(probs).sample().unsqueeze(-1) 91 | 92 | return action, rnn_hidden_states 93 | -------------------------------------------------------------------------------- /utils_p/prompt.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import time 5 | 6 | 7 | class Prompt(nn.Module): 8 | def __init__(self, prompt_alpha=0.01, image_size=224): 9 | super().__init__() 10 | self.prompt_size = int(image_size * prompt_alpha) if int(image_size * prompt_alpha) > 1 else 1 11 | self.padding_size = (image_size - self.prompt_size)//2 12 | self.init_para = torch.ones((1, 3, self.prompt_size, self.prompt_size)) 13 | self.data_prompt = nn.Parameter(self.init_para, requires_grad=True) 14 | self.pre_prompt = self.data_prompt.detach().cpu().data 15 | 16 | def update(self, init_data): 17 | with torch.no_grad(): 18 | self.data_prompt.copy_(init_data) 19 | 20 | def iFFT(self, amp_src_, pha_src, imgH, imgW): 21 | # recompose fft 22 | real = torch.cos(pha_src) * amp_src_ 23 | imag = torch.sin(pha_src) * amp_src_ 24 | fft_src_ = torch.complex(real=real, imag=imag) 25 | 26 | src_in_trg = torch.fft.ifft2(fft_src_, dim=(-2, -1), s=[imgH, imgW]).real 27 | return src_in_trg 28 | 29 | def forward(self, x): 30 | start = time.time() 31 | _, _, imgH, imgW = x.size() # 
image size 32 | 33 | fft = torch.fft.fft2(x.clone(), dim=(-2, -1)) # fft for image 34 | 35 | # extract amplitude and phase of both ffts 36 | amp_src, pha_src = torch.abs(fft), torch.angle(fft) # amp: amplitude spectrum (frequency magnitudes); pha: phase spectrum (phase offsets) 37 | amp_src = torch.fft.fftshift(amp_src) # shift the low-frequency components of the amplitude spectrum to the image center 38 | 39 | # F.pad: pad self.data_prompt so it is centered within the input image size 40 | prompt = F.pad(self.data_prompt, [self.padding_size, imgH - self.padding_size - self.prompt_size, 41 | self.padding_size, imgW - self.padding_size - self.prompt_size], 42 | mode='constant', value=1.0).contiguous() # self.data_prompt: the predefined prompt, padded to match the input image size; the padding ensures only a specific (low-frequency) part of the spectrum is processed 43 | 44 | amp_src_ = amp_src * prompt # multiply the amplitude spectrum by the prompt to focus on the selected low frequencies 45 | amp_src_ = torch.fft.ifftshift(amp_src_) # after prompt modulation, shift the frequency components back to their original arrangement 46 | 47 | amp_low_ = amp_src[:, :, self.padding_size:self.padding_size+self.prompt_size, self.padding_size:self.padding_size+self.prompt_size] # extract the low-frequency components 48 | 49 | src_in_trg = self.iFFT(amp_src_, pha_src, imgH, imgW) # reconstruct the image 50 | end = time.time() 51 | T = end - start 52 | #print(T) 53 | 54 | return src_in_trg, amp_low_ 55 | 56 | 57 | 58 | def enhance(self, x, retrieve_p): 59 | 60 | _, _, imgH, imgW = x.size() # image size 61 | 62 | fft = torch.fft.fft2(x.clone(), dim=(-2, -1)) # fft for image 63 | 64 | # extract amplitude and phase of both ffts 65 | amp_src, pha_src = torch.abs(fft), torch.angle(fft) # amp: amplitude spectrum (frequency magnitudes); pha: phase spectrum (phase offsets) 66 | amp_src = torch.fft.fftshift(amp_src) # shift the low-frequency components of the amplitude spectrum to the image center 67 | 68 | # F.pad: pad the retrieved prompt so it is centered within the input image size 69 | prompt = F.pad(retrieve_p.cuda(), [self.padding_size, imgH - self.padding_size - self.prompt_size, 70 | self.padding_size, imgW - self.padding_size - self.prompt_size], 71 | mode='constant', value=1.0).contiguous() # the retrieved prompt, padded to match the input image size; the padding ensures only a specific (low-frequency) part of the spectrum is processed 72 | 73 | amp_src_ = amp_src * prompt # multiply the amplitude spectrum by the prompt to focus on the selected low frequencies 74 | amp_src_ = torch.fft.ifftshift(amp_src_) # after prompt modulation, shift the frequency components back to their original arrangement 75 | 76 | # amp_low_ = amp_src[:, :, self.padding_size:self.padding_size+self.prompt_size, self.padding_size:self.padding_size+self.prompt_size] # extract the low-frequency components 77 | 78 | src_in_trg = self.iFFT(amp_src_, pha_src, imgH, imgW) # reconstruct the image 79 | #end = time.time() 80 | #T = end - start 81 | #print(T) 82 | 83 | return src_in_trg #, amp_low_ -------------------------------------------------------------------------------- /utils_p/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class RegressionLoss(nn.Module): 7 | def __init__(self, norm, channel_dim=-1): 8 | super().__init__() 9 | self.norm = norm 10 | self.channel_dim = channel_dim 11 | 12 | if norm == 1: 13 | self.loss_fn = F.l1_loss 14 | elif norm == 2: 15 | self.loss_fn = F.mse_loss 16 | else: 17 | raise ValueError(f'Expected norm 1 or 2, but got norm={norm}') 18 | 19 | def forward(self, prediction, target): 20 | loss = self.loss_fn(prediction.float(), target.float(), reduction='none') 21 | 22 | # Sum channel dimension 23 | loss = torch.sum(loss, dim=self.channel_dim, keepdims=True) 24 | return loss.mean() 25 | 26 | 27 | class SpatialRegressionLoss(nn.Module): 28 | def __init__(self, norm, ignore_index=255): 29 | super(SpatialRegressionLoss, self).__init__() 30 | self.norm = norm 31 | self.ignore_index = ignore_index 32 | 33 | if norm == 1: 34 | self.loss_fn = F.l1_loss 35 | elif norm == 2: 36 | self.loss_fn = F.mse_loss 37 | else: 38 | raise ValueError(f'Expected norm 1 or 2, but got norm={norm}') 39 | 40 | def forward(self, prediction, 
target): 41 | assert len(prediction.shape) == 5, 'Must be a 5D tensor' 42 | # ignore_index is the same across all channels 43 | mask = target[:, :, :1] != self.ignore_index 44 | if mask.sum() == 0: 45 | return prediction.new_zeros(1)[0].float() 46 | 47 | loss = self.loss_fn(prediction, target, reduction='none') 48 | 49 | # Sum channel dimension 50 | loss = torch.sum(loss, dim=-3, keepdims=True) 51 | 52 | return loss[mask].mean() 53 | 54 | 55 | class ProbabilisticLoss(nn.Module): 56 | """ Given a prior distribution and a posterior distribution, this module computes KL(posterior, prior)""" 57 | def __init__(self, remove_first_timestamp=True): 58 | super().__init__() 59 | self.remove_first_timestamp = remove_first_timestamp 60 | 61 | def forward(self, prior_mu, prior_sigma, posterior_mu, posterior_sigma): 62 | posterior_var = posterior_sigma[:, 1:] ** 2 63 | prior_var = prior_sigma[:, 1:] ** 2 64 | 65 | posterior_log_sigma = torch.log(posterior_sigma[:, 1:]) 66 | prior_log_sigma = torch.log(prior_sigma[:, 1:]) 67 | 68 | kl_div = ( 69 | prior_log_sigma - posterior_log_sigma - 0.5 70 | + (posterior_var + (posterior_mu[:, 1:] - prior_mu[:, 1:]) ** 2) / (2 * prior_var) 71 | ) 72 | first_kl = - posterior_log_sigma[:, :1] - 0.5 + (posterior_var[:, :1] + posterior_mu[:, :1] ** 2) / 2 73 | kl_div = torch.cat([first_kl, kl_div], dim=1) 74 | 75 | # Sum across channel dimension 76 | # Average across batch dimension, keep time dimension for monitoring 77 | kl_loss = torch.mean(torch.sum(kl_div, dim=-1)) 78 | return kl_loss 79 | 80 | 81 | class KLLoss(nn.Module): 82 | def __init__(self, alpha): 83 | super().__init__() 84 | self.alpha = alpha 85 | self.loss = ProbabilisticLoss(remove_first_timestamp=True) 86 | 87 | def forward(self, prior, posterior): 88 | prior_mu, prior_sigma = prior['mu'], prior['sigma'] 89 | posterior_mu, posterior_sigma = posterior['mu'], posterior['sigma'] 90 | prior_mu = prior_mu.float() 91 | prior_sigma = prior_sigma.float() 92 | posterior_mu = posterior_mu.float() 93 | posterior_sigma = posterior_sigma.float() 94 | 95 | prior_loss = self.loss(prior_mu, prior_sigma, posterior_mu.detach(), posterior_sigma.detach()) 96 | posterior_loss = self.loss(prior_mu.detach(), prior_sigma.detach(), posterior_mu, posterior_sigma) 97 | 98 | return self.alpha * prior_loss + (1 - self.alpha) * posterior_loss -------------------------------------------------------------------------------- /vlnce_baselines/waypoint_networks/resnetUnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torchvision import models 4 | 5 | def convrelu(in_channels, out_channels, kernel, padding): 6 | return nn.Sequential( 7 | nn.Conv2d(in_channels, out_channels, kernel, padding=padding), 8 | nn.BatchNorm2d(num_features=out_channels), 9 | nn.ReLU(inplace=True), 10 | ) 11 | 12 | 13 | class ResNetUNet(nn.Module): 14 | def __init__(self, n_channel_in, n_class_out, has_positional_embedding = False): 15 | super().__init__() 16 | 17 | self.has_positional_embedding = has_positional_embedding 18 | if self.has_positional_embedding == True: 19 | positional_embedding_dim = 16 20 | n_channel_in += positional_embedding_dim 21 | 22 | scale = positional_embedding_dim ** -0.5 23 | self.positional_embedding = nn.Parameter(scale * torch.randn((positional_embedding_dim,192,192))) 24 | 25 | self.base_model = models.resnet18(pretrained=False) 26 | self.base_model.load_state_dict(torch.load("pretrained/resnet18-f37072fd.pth")) 27 | self.base_model.conv1 = 
nn.Conv2d(n_channel_in, 64, kernel_size=7, stride=2, padding=3,bias=False) 28 | self.base_layers = list(self.base_model.children()) 29 | 30 | self.layer0 = nn.Sequential(*self.base_layers[:3]) # size=(N, 64, x.H/2, x.W/2) 31 | self.layer0_1x1 = convrelu(64, 64, 1, 0) 32 | self.layer1 = nn.Sequential(*self.base_layers[3:5]) # size=(N, 64, x.H/4, x.W/4) 33 | self.layer1_1x1 = convrelu(64, 64, 1, 0) 34 | self.layer2 = self.base_layers[5] # size=(N, 128, x.H/8, x.W/8) 35 | self.layer2_1x1 = convrelu(128, 128, 1, 0) 36 | self.layer3 = self.base_layers[6] # size=(N, 256, x.H/16, x.W/16) 37 | self.layer3_1x1 = convrelu(256, 256, 1, 0) 38 | self.layer4 = self.base_layers[7] # size=(N, 512, x.H/32, x.W/32) 39 | self.layer4_1x1 = convrelu(512, 512, 1, 0) 40 | 41 | self.upsample = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True) 42 | 43 | self.conv_up3 = convrelu(256 + 512, 512, 3, 1) 44 | self.conv_up2 = convrelu(128 + 512, 256, 3, 1) 45 | self.conv_up1 = convrelu(64 + 256, 256, 3, 1) 46 | self.conv_up0 = convrelu(64 + 256, 128, 3, 1) 47 | 48 | self.conv_original_size0 = convrelu(n_channel_in, 64, 3, 1) 49 | self.conv_original_size1 = convrelu(64, 64, 3, 1) 50 | self.conv_original_size2 = convrelu(64 + 128, 64, 3, 1) 51 | 52 | self.conv_last = nn.Conv2d(64, n_class_out, 1) 53 | 54 | def forward(self, input): 55 | B, T, C, cH, cW = input.shape 56 | input = input.view(B*T,C,cH,cW) 57 | 58 | if self.has_positional_embedding: 59 | input = torch.cat((self.positional_embedding.unsqueeze(0).repeat(B*T,1,1,1),input),dim=1) 60 | 61 | x_original = self.conv_original_size0(input) 62 | x_original = self.conv_original_size1(x_original) 63 | 64 | layer0 = self.layer0(input) 65 | layer1 = self.layer1(layer0) 66 | layer2 = self.layer2(layer1) 67 | layer3 = self.layer3(layer2) 68 | layer4 = self.layer4(layer3) 69 | 70 | layer4 = self.layer4_1x1(layer4) 71 | x = self.upsample(layer4) 72 | 73 | layer3 = self.layer3_1x1(layer3) 74 | x = torch.cat([x, layer3], dim=1) 75 | x = self.conv_up3(x) 76 | 77 | x = self.upsample(x) 78 | layer2 = self.layer2_1x1(layer2) 79 | x = torch.cat([x, layer2], dim=1) 80 | x = self.conv_up2(x) 81 | 82 | x = self.upsample(x) 83 | layer1 = self.layer1_1x1(layer1) 84 | x = torch.cat([x, layer1], dim=1) 85 | x = self.conv_up1(x) 86 | 87 | x = self.upsample(x) 88 | layer0 = self.layer0_1x1(layer0) 89 | x = torch.cat([x, layer0], dim=1) 90 | x = self.conv_up0(x) 91 | 92 | x = self.upsample(x) 93 | x = torch.cat([x, x_original], dim=1) 94 | x = self.conv_original_size2(x) 95 | 96 | out = self.conv_last(x) 97 | 98 | return out -------------------------------------------------------------------------------- /vlnce_baselines/models/encoders/instruction_encoder.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | 4 | import torch 5 | import torch.nn as nn 6 | from habitat import Config 7 | 8 | 9 | class InstructionEncoder(nn.Module): 10 | def __init__(self, config: Config): 11 | r"""An encoder that uses RNN to encode an instruction. Returns 12 | the final hidden state after processing the instruction sequence. 13 | 14 | Args: 15 | config: must have 16 | embedding_size: The dimension of each embedding vector 17 | hidden_size: The hidden (output) size 18 | rnn_type: The RNN cell type. 
Must be GRU or LSTM 19 | final_state_only: Whether or not to return just the final state 20 | """ 21 | super().__init__() 22 | 23 | self.config = config 24 | 25 | # lang_drop_ratio = 0.50 26 | # self.drop = nn.Dropout(p=lang_drop_ratio) 27 | 28 | rnn = nn.GRU if self.config.rnn_type == "GRU" else nn.LSTM 29 | self.encoder_rnn = rnn( 30 | input_size=config.embedding_size, 31 | hidden_size=config.hidden_size, 32 | bidirectional=config.bidirectional, 33 | ) 34 | 35 | if config.sensor_uuid == "instruction": 36 | if self.config.use_pretrained_embeddings: 37 | self.embedding_layer = nn.Embedding.from_pretrained( 38 | embeddings=self._load_embeddings(), 39 | freeze=not self.config.fine_tune_embeddings, 40 | ) 41 | else: # each embedding initialized to sampled Gaussian 42 | self.embedding_layer = nn.Embedding( 43 | num_embeddings=config.vocab_size, 44 | embedding_dim=config.embedding_size, 45 | padding_idx=0, 46 | ) 47 | 48 | @property 49 | def output_size(self): 50 | return self.config.hidden_size * (1 + int(self.config.bidirectional)) 51 | 52 | def _load_embeddings(self): 53 | """Loads word embeddings from a pretrained embeddings file. 54 | PAD: index 0. [0.0, ... 0.0] 55 | UNK: index 1. mean of all R2R word embeddings: [mean_0, ..., mean_n] 56 | why UNK is averaged: https://bit.ly/3u3hkYg 57 | Returns: 58 | embeddings tensor of size [num_words x embedding_dim] 59 | """ 60 | with gzip.open(self.config.embedding_file, "rt") as f: 61 | embeddings = torch.tensor(json.load(f)) 62 | return embeddings 63 | 64 | def forward(self, observations): 65 | """ 66 | Tensor sizes after computation: 67 | instruction: [batch_size x seq_length] 68 | lengths: [batch_size] 69 | hidden_state: [batch_size x hidden_size] 70 | """ 71 | 72 | if self.config.sensor_uuid == "instruction": 73 | instruction = observations["instruction"].long() 74 | lengths = (instruction != 0.0).long().sum(dim=1) 75 | instruction = self.embedding_layer(instruction) 76 | # instruction = self.drop(instruction) 77 | else: 78 | instruction = observations["rxr_instruction"] 79 | 80 | lengths = (instruction != 0.0).long().sum(dim=2) 81 | lengths = (lengths != 0.0).long().sum(dim=1) 82 | 83 | packed_seq = nn.utils.rnn.pack_padded_sequence( 84 | instruction, lengths.cpu(), batch_first=True, enforce_sorted=False 85 | ) 86 | output, final_state = self.encoder_rnn(packed_seq) 87 | 88 | if self.config.rnn_type == "LSTM": 89 | final_state = final_state[0] 90 | 91 | if self.config.final_state_only: # default False 92 | return final_state.squeeze(0) 93 | else: 94 | ctx = nn.utils.rnn.pad_packed_sequence(output, 95 | batch_first=True)[0].permute(0, 2, 1) 96 | all_lang_masks = (ctx == 0.0).all(dim=1) 97 | ctx = ctx.permute(0, 2, 1) 98 | 99 | # ctx = self.drop(ctx) 100 | 101 | return ctx, all_lang_masks 102 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import random 5 | import os 6 | import numpy as np 7 | import torch 8 | from habitat import logger 9 | from habitat_baselines.common.baseline_registry import baseline_registry 10 | 11 | import habitat_extensions # noqa: F401 12 | import vlnce_baselines # noqa: F401 13 | from vlnce_baselines.config.default import get_config 14 | # from vlnce_baselines.nonlearning_agents import ( 15 | # evaluate_agent, 16 | # nonlearning_inference, 17 | # ) 18 | os.environ["CUDA_VISIBLE_DEVICES"] = "4" 19 | 20 | 21 | def main(): 22 | parser = 
argparse.ArgumentParser() 23 | parser.add_argument( 24 | "--exp_name", 25 | type=str, 26 | default="release_r2r", 27 | #required=True, 28 | help="experiment id that matches to exp-id in Notion log", 29 | ) 30 | parser.add_argument( 31 | "--run-type", 32 | choices=["train", "eval", "inference"], 33 | default="eval", 34 | #required=True, 35 | help="run type of the experiment (train, eval, inference)", 36 | ) 37 | parser.add_argument( 38 | "--exp-config", 39 | type=str, 40 | default="run_r2r/iter_train.yaml", 41 | # required=True, 42 | help="path to config yaml containing info about experiment", 43 | ) 44 | parser.add_argument( 45 | "opts", 46 | default=None, 47 | nargs=argparse.REMAINDER, 48 | help="Modify config options from command line", 49 | ) 50 | parser.add_argument('--local_rank', type=int, default=0, help="local gpu id") 51 | 52 | #Prompt 53 | parser.add_argument('--memory_size', type=int, default=1000) 54 | parser.add_argument('--neighbor', type=int, default=5) 55 | parser.add_argument('--prompt_alpha', type=float, default=0.1) 56 | parser.add_argument('--warm_n', type=int, default=5) 57 | parser.add_argument('--image_size', type=int, default=224) 58 | parser.add_argument('--imagine_T', type=int, default=2) 59 | 60 | args = parser.parse_args() 61 | run_exp(**vars(args)) 62 | 63 | 64 | def run_exp(exp_name: str, exp_config: str, 65 | run_type: str, memory_size: int, neighbor: int, prompt_alpha: float, warm_n: int, image_size: int, imagine_T: int, 66 | opts=None, local_rank=None) -> None: 67 | r"""Runs experiment given mode and config 68 | 69 | Args: 70 | exp_config: path to config file. 71 | run_type: "train" or "eval. 72 | opts: list of strings of additional config options. 73 | 74 | Returns: 75 | None. 76 | """ 77 | config = get_config(exp_config, opts) 78 | config.defrost() 79 | 80 | config.TENSORBOARD_DIR += exp_name 81 | config.CHECKPOINT_FOLDER += exp_name 82 | if os.path.isdir(config.EVAL_CKPT_PATH_DIR): 83 | config.EVAL_CKPT_PATH_DIR += exp_name 84 | config.RESULTS_DIR += exp_name 85 | config.VIDEO_DIR += exp_name 86 | # config.TASK_CONFIG.TASK.RXR_INSTRUCTION_SENSOR.max_text_len = config.IL.max_text_len 87 | config.LOG_FILE = exp_name + '_' + config.LOG_FILE 88 | 89 | if 'CMA' in config.MODEL.policy_name and 'r2r' in config.BASE_TASK_CONFIG_PATH: 90 | config.TASK_CONFIG.DATASET.DATA_PATH = 'data/datasets/R2R_VLNCE_v1-2_preprocessed/{split}/{split}.json.gz' 91 | 92 | config.local_rank = local_rank 93 | 94 | #prompt 95 | config.memory_size = memory_size 96 | config.neighbor = neighbor 97 | config.prompt_alpha = prompt_alpha 98 | config.warm_n = warm_n 99 | config.image_size = image_size 100 | config.imagine_T = imagine_T 101 | 102 | config.freeze() 103 | os.system("mkdir -p data/logs/running_log") 104 | logger.add_filehandler('data/logs/running_log/'+config.LOG_FILE) 105 | 106 | random.seed(config.TASK_CONFIG.SEED) 107 | np.random.seed(config.TASK_CONFIG.SEED) 108 | torch.manual_seed(config.TASK_CONFIG.SEED) 109 | torch.backends.cudnn.benchmark = False 110 | torch.backends.cudnn.deterministic = False 111 | if torch.cuda.is_available(): 112 | torch.set_num_threads(1) 113 | 114 | # if run_type == "eval" and config.EVAL.EVAL_NONLEARNING: 115 | # evaluate_agent(config) 116 | # return 117 | 118 | # if run_type == "inference" and config.INFERENCE.INFERENCE_NONLEARNING: 119 | # nonlearning_inference(config) 120 | # return 121 | 122 | trainer_init = baseline_registry.get_trainer(config.TRAINER_NAME) 123 | assert trainer_init is not None, f"{config.TRAINER_NAME} is not supported" 124 | 
trainer = trainer_init(config) 125 | 126 | # import pdb; pdb.set_trace() 127 | if run_type == "train": 128 | trainer.train() 129 | elif run_type == "eval": 130 | trainer.eval() 131 | elif run_type == "inference": 132 | trainer.inference() 133 | 134 | if __name__ == "__main__": 135 | main() 136 | -------------------------------------------------------------------------------- /utils_p/convert.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import numpy as np 3 | 4 | 5 | class AdaBN(nn.BatchNorm2d): 6 | def __init__(self, in_ch, warm_n=5): 7 | super(AdaBN, self).__init__(in_ch) 8 | self.warm_n = warm_n 9 | self.sample_num = 0 10 | self.new_sample = False 11 | 12 | def get_mu_var(self, x): 13 | if self.new_sample: 14 | self.sample_num += 1 15 | C = x.shape[1] 16 | 17 | cur_mu = x.mean((0, 2, 3), keepdims=True).detach() 18 | cur_var = x.var((0, 2, 3), keepdims=True).detach() 19 | 20 | src_mu = self.running_mean.view(1, C, 1, 1) 21 | src_var = self.running_var.view(1, C, 1, 1) 22 | 23 | moment = 1 / ((np.sqrt(self.sample_num) / self.warm_n) + 1) 24 | 25 | new_mu = moment * cur_mu + (1 - moment) * src_mu 26 | new_var = moment * cur_var + (1 - moment) * src_var 27 | return new_mu, new_var 28 | 29 | def forward(self, x): 30 | N, C, H, W = x.shape 31 | 32 | new_mu, new_var = self.get_mu_var(x) 33 | 34 | cur_mu = x.mean((2, 3), keepdims=True) 35 | cur_std = x.std((2, 3), keepdims=True) 36 | self.bn_loss = ( 37 | (new_mu - cur_mu).abs().mean() + (new_var.sqrt() - cur_std).abs().mean() 38 | ) 39 | 40 | # Normalization with new statistics 41 | new_sig = (new_var + self.eps).sqrt() 42 | new_x = ((x - new_mu) / new_sig) * self.weight.view(1, C, 1, 1) + self.bias.view(1, C, 1, 1) 43 | return new_x 44 | 45 | 46 | def convert_encoder_to_target(net, norm, start=0, end=5, verbose=True, bottleneck=False, input_size=512, warm_n=5): 47 | def convert_norm(old_norm, new_norm, num_features, idx, fea_size): 48 | norm_layer = new_norm(num_features, warm_n).to(net.conv1.weight.device) 49 | if hasattr(norm_layer, 'load_old_dict'): 50 | info = 'Converted to : {}'.format(norm) 51 | norm_layer.load_old_dict(old_norm) 52 | elif hasattr(norm_layer, 'load_state_dict'): 53 | state_dict = old_norm.state_dict() 54 | info = norm_layer.load_state_dict(state_dict, strict=False) 55 | else: 56 | info = 'No load_old_dict() found!!!' 
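        # At this point the replacement norm layer has been created; for AdaBN (a BatchNorm2d subclass) the load_state_dict branch above copies the old layer's affine weights and running statistics with strict=False, and `info` records how that load went before it is optionally printed below.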
57 | if verbose: 58 | print(info) 59 | return norm_layer 60 | 61 | layers = [0, net.layer1, net.layer2, net.layer3, net.layer4] 62 | 63 | idx = 0 64 | for i, layer in enumerate(layers): 65 | if not (start <= i < end): 66 | continue 67 | if i == 0: 68 | net.bn1 = convert_norm(net.bn1, norm, net.bn1.num_features, idx, fea_size=input_size // 2) 69 | idx += 1 70 | else: 71 | down_sample = 2 ** (1 + i) 72 | 73 | for j, block in enumerate(layer): 74 | block.bn1 = convert_norm(block.bn1, norm, block.bn1.num_features, idx, fea_size=input_size // down_sample) 75 | idx += 1 76 | block.bn2 = convert_norm(block.bn2, norm, block.bn2.num_features, idx, fea_size=input_size // down_sample) 77 | idx += 1 78 | if bottleneck: 79 | block.bn3 = convert_norm(block.bn3, norm, block.bn3.num_features, idx, fea_size=input_size // down_sample) 80 | idx += 1 81 | if block.downsample is not None: 82 | block.downsample[1] = convert_norm(block.downsample[1], norm, block.downsample[1].num_features, idx, fea_size=input_size // down_sample) 83 | idx += 1 84 | return net 85 | 86 | 87 | def convert_decoder_to_target(net, norm, start=0, end=5, verbose=True, input_size=512, warm_n=5): 88 | def convert_norm(old_norm, new_norm, num_features, idx, fea_size): 89 | norm_layer = new_norm(num_features, warm_n).to(old_norm.weight.device) 90 | if hasattr(norm_layer, 'load_old_dict'): 91 | info = 'Converted to : {}'.format(norm) 92 | norm_layer.load_old_dict(old_norm) 93 | elif hasattr(norm_layer, 'load_state_dict'): 94 | state_dict = old_norm.state_dict() 95 | info = norm_layer.load_state_dict(state_dict, strict=False) 96 | else: 97 | info = 'No load_old_dict() found!!!' 98 | if verbose: 99 | print(info) 100 | return norm_layer 101 | 102 | layers = [net[0], net[1], net[2], net[3], net[4]] 103 | 104 | idx = 0 105 | for i, layer in enumerate(layers): 106 | if not (start <= i < end): 107 | continue 108 | if i == 4: 109 | net[4] = convert_norm(layer, norm, layer.num_features, idx, input_size) 110 | idx += 1 111 | else: 112 | down_sample = 2 ** (4 - i) 113 | layer.bn = convert_norm(layer.bn, norm, layer.bn.num_features, idx, input_size // down_sample) 114 | idx += 1 115 | return net 116 | 117 | -------------------------------------------------------------------------------- /vlnce_baselines/models/encoders/clip.py: -------------------------------------------------------------------------------- 1 | """ 2 | Adapted from: https://github.com/openai/CLIP/blob/main/clip/clip.py 3 | """ 4 | from collections import OrderedDict 5 | from typing import Tuple, Union 6 | 7 | import hashlib 8 | import os 9 | import urllib 10 | import warnings 11 | from tqdm import tqdm 12 | import numpy as np 13 | import torch 14 | import torch.nn.functional as F 15 | from torch import nn 16 | 17 | class LayerNorm(nn.LayerNorm): 18 | """Subclass torch's LayerNorm to handle fp16.""" 19 | 20 | def forward(self, x: torch.Tensor): 21 | orig_type = x.dtype 22 | ret = super().forward(x.type(torch.float32)) 23 | return ret.type(orig_type) 24 | 25 | 26 | class QuickGELU(nn.Module): 27 | def forward(self, x: torch.Tensor): 28 | return x * torch.sigmoid(1.702 * x) 29 | 30 | 31 | class ResidualAttentionBlock(nn.Module): 32 | def __init__(self, d_model: int, n_head: int, attn_mask=None): 33 | super().__init__() 34 | 35 | self.attn = nn.MultiheadAttention(d_model, n_head) 36 | self.ln_1 = LayerNorm(d_model) 37 | self.mlp = nn.Sequential(OrderedDict([ 38 | ("c_fc", nn.Linear(d_model, d_model * 4)), 39 | ("gelu", QuickGELU()), 40 | ("c_proj", nn.Linear(d_model * 4, d_model)) 41 | ])) 
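        # Pre-norm (pre-LN) layout: in forward() below, ln_1 normalizes the attention input and ln_2 (defined next) normalizes the MLP input; the LayerNorm subclass above upcasts to fp32 so the block stays stable under fp16.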
42 | self.ln_2 = LayerNorm(d_model) 43 | self.attn_mask = attn_mask 44 | 45 | def attention(self, x: torch.Tensor): 46 | attn_mask_ = self.attn_mask 47 | if self.attn_mask is not None and hasattr(self.attn_mask, '__call__'): 48 | attn_mask_ = self.attn_mask(x.size(0)) # LND 49 | 50 | attn_mask_ = attn_mask_.to(dtype=x.dtype, device=x.device) if attn_mask_ is not None else None 51 | return self.attn(x, x, x, need_weights=False, attn_mask=attn_mask_)[0] 52 | 53 | def forward(self, x): 54 | x = x + self.attention(self.ln_1(x)) 55 | x = x + self.mlp(self.ln_2(x)) 56 | return x 57 | 58 | 59 | class Transformer(nn.Module): 60 | def __init__(self, width: int, layers: int, heads: int, attn_mask = None): 61 | super().__init__() 62 | self.width = width 63 | self.layers = layers 64 | self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)]) 65 | 66 | def forward(self, x: torch.Tensor): 67 | return self.resblocks(x) 68 | 69 | class VisionTransformer(nn.Module): 70 | def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int): 71 | super().__init__() 72 | self.input_resolution = input_resolution 73 | self.layers = layers 74 | self.patch_size = patch_size 75 | 76 | self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False) 77 | scale = width ** -0.5 78 | self.class_embedding = nn.Parameter(scale * torch.randn(width)) 79 | self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width)) 80 | self.ln_pre = LayerNorm(width) 81 | self.transformer = Transformer(width, layers, heads) 82 | self.proj = nn.Parameter(scale * torch.randn(width, 512)) 83 | self.ln_post = LayerNorm(width) 84 | 85 | 86 | 87 | def forward(self, x: torch.Tensor): 88 | 89 | x = self.conv1(x) # shape = [*, width, grid, grid] 90 | x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2] 91 | x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] 92 | 93 | x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width] 94 | x = x + self.positional_embedding.to(x.dtype) 95 | x = self.ln_pre(x) 96 | 97 | x = x.permute(1, 0, 2) # NLD -> LND 98 | x = self.transformer(x) 99 | x = x.permute(1, 0, 2) # LND -> NLD 100 | x = self.ln_post(x) 101 | x = x @ self.proj 102 | return x 103 | 104 | class CLIP(nn.Module): 105 | def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int): 106 | super().__init__() 107 | 108 | self.visual = VisionTransformer(input_resolution, patch_size, width, layers, heads) 109 | 110 | transformer_width = 512 111 | self.vocab_size = 49408 112 | self.context_length = 77 113 | 114 | self.transformer = Transformer( 115 | width=512, 116 | layers=12, 117 | heads=8, 118 | attn_mask=self.build_attention_mask() 119 | ) 120 | self.token_embedding = nn.Embedding(self.vocab_size, transformer_width) 121 | self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width)) 122 | self.ln_final = LayerNorm(transformer_width) 123 | 124 | self.text_projection = nn.Parameter(torch.empty(transformer_width, 512)) 125 | self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) 126 | 127 | def build_attention_mask(self): 128 | # lazily create causal attention mask, with full attention between the vision tokens 129 | # pytorch uses additive attention mask; fill with -inf 130 
| mask = torch.empty(self.context_length, self.context_length) 131 | mask.fill_(float("-inf")) 132 | mask.triu_(1) # zero out the lower diagonal 133 | return mask 134 | 135 | def forward(self, x: torch.Tensor): 136 | 137 | return self.visual(x) 138 | 139 | def encode_image(self, x: torch.Tensor): 140 | return self.visual(x)[:,0] 141 | 142 | 143 | def encode_text(self, text): 144 | x = self.token_embedding(text) # [batch_size, n_ctx, d_model] 145 | x = x + self.positional_embedding 146 | x = x.permute(1, 0, 2) # NLD -> LND 147 | x = self.transformer(x) 148 | x = x.permute(1, 0, 2) # LND -> NLD 149 | x = self.ln_final(x) 150 | 151 | # x.shape = [batch_size, n_ctx, transformer.width] 152 | # take features from the eot embedding (eot_token is the highest number in each sequence) 153 | x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection 154 | 155 | return x 156 | 157 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NavMorph: A Self-Evolving World Model for Vision-and-Language Navigation in Continuous Environments 2 | 3 | **Xuan Yao, Junyu Gao, and Changsheng Xu** 4 | 5 | This repository is the official implementation of [NavMorph: A Self-Evolving World Model for Vision-and-Language Navigation in Continuous Environments](https://arxiv.org/abs/2506.23468). 6 | 7 | > Vision-and-Language Navigation in Continuous Environments (VLN-CE) requires agents to execute sequential navigation actions in complex environments guided by natural language instructions. Current approaches often struggle with generalizing to novel environments and adapting to ongoing changes during navigation. 8 | Inspired by human cognition, we present NavMorph, a self-evolving world model framework that enhances environmental understanding and decision-making in VLN-CE tasks. NavMorph employs compact latent representations to model environmental dynamics, equipping agents with foresight for adaptive planning and policy refinement. By integrating a novel Contextual Evolution Memory, NavMorph leverages scene-contextual information to support effective navigation while maintaining online adaptability. Extensive experiments demonstrate that our method achieves notable performance improvements on popular VLN-CE benchmarks. 9 | 10 | ![image](img/EWM.png) 11 | 12 | 13 | ## 🌍 Usage 14 | 15 | ### Prerequisites 16 | 17 | 1. Follow the [Habitat Installation Guide](https://github.com/facebookresearch/habitat-lab#installation) and [VLN-CE](https://github.com/jacobkrantz/VLN-CE) to install [`habitat-lab`](https://github.com/facebookresearch/habitat-lab) and [`habitat-sim`](https://github.com/facebookresearch/habitat-sim). We use version `v0.2.1` in our experiments. 18 | 19 | 2. Install `torch_kdtree` and `tinycudann`: follow instructions [here](https://github.com/MrZihan/Sim2Real-VLN-3DFF). 20 | 21 | 3. Install requirements: 22 | ```setup 23 | conda create --name morph python=3.7.11 24 | conda activate morph 25 | ``` 26 | * Required packages are listed in `environment.yaml`. You can install by running: 27 | 28 | ``` 29 | conda env create -f environment.yaml 30 | ``` 31 | 32 | 33 | ### Dataset Preparation 34 | 35 | 1. **Scenes for Matterport3D** 36 | 37 | > Instructions copied from [VLN-CE](https://github.com/jacobkrantz/VLN-CE) 38 | 39 | Matterport3D (MP3D) scene reconstructions are used. 
The official Matterport3D download script (`download_mp.py`) can be accessed by following the instructions on their [project webpage](https://niessner.github.io/Matterport/). The scene data can then be downloaded: 40 | 41 | ```bash 42 | # requires running with python 2.7 43 | python download_mp.py --task habitat -o data/scene_datasets/mp3d/ 44 | ``` 45 | 46 | Extract such that it has the form `scene_datasets/mp3d/{scene}/{scene}.glb`. There should be 90 scenes. Place the `scene_datasets` folder in `data/`. 47 | 48 | 2. **Data and Trained Models** 49 | 50 | Please download the pretrained models and checkpoints from [GoogleDrive](https://drive.google.com/file/d/1x01wods-LUA6EyAD8C3ahiEaO8lKD6jy/view?usp=sharing). 51 | 52 | ``` 53 | unzip NavMorph-8324.zip 54 | ``` 55 | Overall, files and folders should be organized as follows: 56 | 57 | ``` 58 | NavMorph 59 | ├── data 60 | │ ├── checkpoints 61 | │ │ └── ckpt.pth 62 | │ ├── vpm_1000_wm_im.pkl 63 | │ ├── datasets 64 | │ │ ├── R2R_VLNCE_v1-2 65 | │ │ ├── R2R_VLNCE_v1-2_preprocessed 66 | │ │ ├── R2R_VLNCE_v1-2_preprocessed_BERTidx 67 | │ │ └── RxR_VLNCE_v0_enc_xlmr 68 | │ ├── logs 69 | │ ├── scene_datasets 70 | │ └── wp_pred 71 | │ ├── check_cwp_bestdist_hfov90 72 | │ └── check_cwp_bestdist_hfov63 73 | ├── pretrained 74 | │ ├── NeRF_p16_8x8.pth 75 | │ ├── ViT-B-32.pt 76 | │ ├── segm.pt 77 | │ ├── resnet18-f37072fd.pth 78 | │ ├── cwp_predictor.pth 79 | │ └── model_step_100000.pt 80 | └── bert_config 81 | └── bert-base-uncased 82 | ``` 83 | 84 | 🧑‍💻 We will soon provide a clean, organized compressed package matching this structure for easy download. 85 | 86 | 3. **Supplementary Notes** 📌 87 | 88 | - **2025-11-28 Update:** → See [Issue #11](https://github.com/Feliciaxyao/NavMorph/issues/11) for details. 89 | 90 | Clarified missing pretrained files (e.g., the waypoint prediction models under `data/wp_pred/` and the vision backbone weights `data/pretrained/ViT-B-32.pth`) and provided external download links. 91 | 92 | - **2025-11-28 Update:** → See [Issue #12](https://github.com/Feliciaxyao/NavMorph/issues/12) for details. 93 | 94 | Clarified missing BERT model weights required by NavMorph (`data/bert_config/bert-base-uncased`) and provided external download links. 95 | 96 | - **2025-12-01 Update:** → See [Issue #13](https://github.com/Feliciaxyao/NavMorph/issues/13) for details. 97 | 98 | Clarified the absence of the datasets (`R2R_VLNCE_v1-2_preprocessed_BERTidx` and `RxR_VLNCE_v0_enc_xlmr`) and provided external download links. 99 | 100 | 101 | 102 | ### Training for R2R-CE / RxR-CE 103 | 104 | Use the pseudo interactive demonstrator to train the NavMorph world model: 105 | ``` 106 | bash run_r2r/main.bash train # (run_rxr/main.bash) 107 | ``` 108 | 109 | ### Online Evaluation on R2R-CE / RxR-CE 110 | 111 | Use the pseudo interactive demonstrator to evaluate the model equipped with our NavMorph: 112 | ``` 113 | bash run_r2r/main.bash eval # (run_rxr/main.bash) 114 | ``` 115 | 116 | ### Notes❗ 117 | 118 | When transitioning from the R2R dataset to the RxR dataset based on the baseline code, you need to adjust the camera settings in three places to prevent simulation issues. 119 | 120 | 1. **Camera HFOV and VFOV Adjustment**: 121 | In [vlnce_baselines/models/etp/nerf.py](https://github.com/Feliciaxyao/NavMorph/blob/ae3246b902cdedf8533211ff62b2062cb9ed0e39/vlnce_baselines/models/etp/nerf.py#L57-L60), update the camera's **HFOV** and **VFOV**: 122 | - Set `HFOV = 90` for R2R. 123 | - Set `HFOV = 79` for RxR. 124 | 125 | 2.
**Dataset Setting**: 126 | In [vlnce_baselines/models/Policy_ViewSelection_ETP.py](https://github.com/Feliciaxyao/NavMorph/blob/ae3246b902cdedf8533211ff62b2062cb9ed0e39/vlnce_baselines/models/Policy_ViewSelection_ETP.py#L41), modify the `DATASET` variable: 127 | - Set `DATASET = 'R2R'` for R2R. 128 | - Set `DATASET = 'RxR'` for RxR. 129 | 130 | 3. **Camera Configuration**: 131 | In [vlnce_baselines/ss_trainer_ETP.py](https://github.com/Feliciaxyao/NavMorph/blob/ae3246b902cdedf8533211ff62b2062cb9ed0e39/vlnce_baselines/ss_trainer_ETP.py#L181), ensure the camera configuration is updated: 132 | - Set `camera.config.HFOV = 90` for R2R. 133 | - Set `camera.config.HFOV = 79` for RxR. 134 | 135 | These adjustments are essential for proper camera calibration and to avoid discrepancies during simulation. 136 | 137 | ## 📢 TODO list: 138 | 139 | - ◻️ Checkpoints for RxR-CE release 140 | 141 | - ◻️ Pre-trained CEM for RxR-CE release 142 | 143 | - ◻️ Real-world Verification 144 | 145 | ## Acknowledgements 146 | Our implementations are partially based on [VLN-3DFF](https://github.com/MrZihan/Sim2Real-VLN-3DFF) and [ETPNav](https://github.com/MarSaKi/ETPNav). Thanks to the authors for sharing their code. 147 | 148 | 149 | ## Related Work 150 | * [Beyond the Nav-Graph: Vision-and-Language Navigation in Continuous Environments](https://arxiv.org/pdf/2004.02857) 151 | 152 | ## 📝 Citation 153 | 154 | If you find this project useful in your research, please consider citing: 155 | ``` 156 | @inproceedings{yao2025navmorph, 157 | title={NavMorph: A Self-Evolving World Model for Vision-and-Language Navigation in Continuous Environments}, 158 | author={Xuan Yao, Junyu Gao and Changsheng Xu}, 159 | booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, 160 | pages={5536-5546}, 161 | year={2025} 162 | } 163 | ``` -------------------------------------------------------------------------------- /vlnce_baselines/waypoint_networks/semantic_grid.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | 7 | class SemanticGrid(object): 8 | 9 | def __init__(self, batch_size, grid_dim, crop_size, cell_size, spatial_labels, object_labels): 10 | self.grid_dim = grid_dim 11 | self.cell_size = cell_size 12 | self.spatial_labels = spatial_labels 13 | self.object_labels = object_labels 14 | self.batch_size = batch_size 15 | self.crop_size = crop_size 16 | 17 | self.crop_start = int( (self.grid_dim[0] / 2) - (self.crop_size / 2) ) 18 | self.crop_end = int( (self.grid_dim[0] / 2) + (self.crop_size / 2) ) 19 | 20 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 21 | 22 | # observed ground-projected sem grid over entire scene 23 | self.occupancy_grid = torch.ones((self.batch_size, self.spatial_labels, self.grid_dim[0], self.grid_dim[1]), dtype=torch.float32, device=self.device) 24 | self.occupancy_grid = self.occupancy_grid*(1/self.spatial_labels) 25 | 26 | self.semantic_grid = torch.ones((self.batch_size, self.object_labels, self.grid_dim[0], self.grid_dim[1]), dtype=torch.float32, device=self.device) 27 | self.semantic_grid = self.semantic_grid*(1/self.object_labels) 28 | 29 | 30 | def pop(self, batch_id): 31 | self.batch_size -= 1 32 | self.occupancy_grid = torch.cat([self.occupancy_grid[:batch_id],self.occupancy_grid[batch_id+1:]],dim=0) 33 | self.semantic_grid = torch.cat([self.semantic_grid[:batch_id],self.semantic_grid[batch_id+1:]],dim=0) 34 | 35 | 36 | # Transform
each ground-projected grid into geocentric coordinates 37 | def spatialTransformer(self, grid, pose, abs_pose): 38 | # Input: 39 | # grid -- sequence len x number of classes x grid_dim x grid_dim 40 | # pose -- sequence len x 3 41 | # abs_pose -- same as pose 42 | 43 | geo_grid_out = torch.zeros((grid.shape[0], grid.shape[1], self.grid_dim[0], self.grid_dim[1]), dtype=torch.float32).to(grid.device) 44 | 45 | 46 | for j in range(grid.shape[0]): # sequence length 47 | 48 | init_pose = abs_pose[j,0,:] # init absolute pose of each sequence 49 | init_rot_mat = torch.tensor([[torch.cos(init_pose[2]), -torch.sin(init_pose[2])], 50 | [torch.sin(init_pose[2]),torch.cos(init_pose[2])]], dtype=torch.float32).to(grid.device) 51 | 52 | grid_step = grid[j,:,:,:].unsqueeze(0) 53 | pose_step = pose[j,:] 54 | 55 | rel_coord = torch.tensor([pose_step[1],pose_step[0]], dtype=torch.float32).to(grid.device) 56 | rel_coord = rel_coord.reshape((2,1)) 57 | rel_coord = torch.matmul(init_rot_mat,rel_coord) 58 | 59 | x = 2*(rel_coord[0]/self.cell_size)/(self.grid_dim[0]) 60 | z = 2*(rel_coord[1]/self.cell_size)/(self.grid_dim[1]) 61 | 62 | angle = pose_step[2] 63 | 64 | trans_theta = torch.tensor( [ [1, -0, x], [0, 1, z] ], dtype=torch.float32 ).unsqueeze(0) 65 | rot_theta = torch.tensor( [ [torch.cos(angle), -1.0*torch.sin(angle), 0], [torch.sin(angle), torch.cos(angle), 0] ], dtype=torch.float32 ).unsqueeze(0) 66 | trans_theta = trans_theta.to(grid.device) 67 | rot_theta = rot_theta.to(grid.device) 68 | 69 | trans_disp_grid = F.affine_grid(trans_theta, grid_step.size(), align_corners=False) # get grid translation displacement 70 | rot_disp_grid = F.affine_grid(rot_theta, grid_step.size(), align_corners=False) # get grid rotation displacement 71 | 72 | rot_geo_grid = F.grid_sample(grid_step, rot_disp_grid.float(), align_corners=False ) # apply rotation 73 | geo_grid = F.grid_sample(rot_geo_grid, trans_disp_grid.float(), align_corners=False) # apply translation 74 | 75 | geo_grid = geo_grid + 1e-12 76 | geo_grid_out[j,:,:,:] = geo_grid 77 | 78 | return geo_grid_out 79 | 80 | 81 | # Transform a geocentric map back to egocentric view 82 | def rotate_map(self, grid, rel_pose, abs_pose): 83 | # grid -- sequence len x number of classes x grid_dim x grid_dim 84 | # rel_pose -- sequence len x 3 85 | ego_grid_out = torch.zeros((grid.shape[0], grid.shape[1], self.grid_dim[0], self.grid_dim[1]), dtype=torch.float32).to(grid.device) 86 | 87 | for i in range(grid.shape[0]): # sequence length 88 | 89 | init_pose = abs_pose[i,0,:] # init absolute pose of each sequence 90 | init_rot_mat = torch.tensor([[torch.cos(init_pose[2]), -torch.sin(init_pose[2])], 91 | [torch.sin(init_pose[2]),torch.cos(init_pose[2])]], dtype=torch.float32).to(grid.device) 92 | 93 | grid_step = grid[i,:,:,:].unsqueeze(0) 94 | rel_pose_step = rel_pose[i,:] 95 | rel_coord = torch.tensor([rel_pose_step[1],rel_pose_step[0]], dtype=torch.float32).to(grid.device) 96 | rel_coord = rel_coord.reshape((2,1)) 97 | rel_coord = torch.matmul(init_rot_mat,rel_coord) 98 | x = -2*(rel_coord[0]/self.cell_size)/(self.grid_dim[0]) 99 | z = -2*(rel_coord[1]/self.cell_size)/(self.grid_dim[1]) 100 | angle = -rel_pose_step[2] 101 | 102 | trans_theta = torch.tensor( [ [1, -0, x], [0, 1, z] ], dtype=torch.float32 ).unsqueeze(0) 103 | rot_theta = torch.tensor( [ [torch.cos(angle), -1.0*torch.sin(angle), 0], [torch.sin(angle), torch.cos(angle), 0] ], dtype=torch.float32 ).unsqueeze(0) 104 | trans_theta = trans_theta.to(grid.device) 105 | rot_theta = rot_theta.to(grid.device) 106 | 107 
| trans_disp_grid = F.affine_grid(trans_theta, grid_step.size(), align_corners=False) # get grid translation displacement 108 | rot_disp_grid = F.affine_grid(rot_theta, grid_step.size(), align_corners=False) # get grid rotation displacement 109 | trans_ego_grid = F.grid_sample(grid_step, trans_disp_grid.float(), align_corners=False ) # apply translation 110 | ego_grid = F.grid_sample(trans_ego_grid, rot_disp_grid.float(), align_corners=False) # apply rotation 111 | ego_grid_out[i,:,:,:] = ego_grid 112 | return ego_grid_out 113 | 114 | 115 | def update_proj_grid_bayes(self, occup_grid=None, segm_grid=None): 116 | # Input geo_grid -- B x T (or 1) x num_of_classes x grid_dim x grid_dim 117 | # Update the ground-projected grid at each location4 118 | 119 | step_occup_grid = torch.zeros((occup_grid.shape[0], occup_grid.shape[1], self.spatial_labels, 120 | self.grid_dim[0], self.grid_dim[1]), dtype=torch.float32).to(occup_grid.device) 121 | occup_grid = occup_grid.to(self.device) 122 | for i in range(occup_grid.shape[1]): # sequence 123 | new_proj_grid = occup_grid[:,i,:,:,:] 124 | mul_proj_grid = new_proj_grid * self.occupancy_grid 125 | normalization_grid = torch.sum(mul_proj_grid, dim=1, keepdim=True) 126 | self.occupancy_grid = mul_proj_grid / normalization_grid.repeat(1, occup_grid.shape[2], 1, 1) 127 | step_occup_grid[:,i,:,:,:] = self.occupancy_grid.clone() 128 | 129 | 130 | step_segm_grid = torch.zeros((segm_grid.shape[0], segm_grid.shape[1], self.object_labels, 131 | self.grid_dim[0], self.grid_dim[1]), dtype=torch.float32).to(segm_grid.device) 132 | segm_grid = segm_grid.to(self.device) 133 | for i in range(segm_grid.shape[1]): # sequence 134 | new_proj_grid = segm_grid[:,i,:,:,:] 135 | mul_proj_grid = new_proj_grid * self.semantic_grid 136 | normalization_grid = torch.sum(mul_proj_grid, dim=1, keepdim=True) 137 | self.semantic_grid = mul_proj_grid / normalization_grid.repeat(1, segm_grid.shape[2], 1, 1) 138 | step_segm_grid[:,i,:,:,:] = self.semantic_grid.clone() 139 | 140 | return step_occup_grid, step_segm_grid 141 | 142 | -------------------------------------------------------------------------------- /vlnce_baselines/common/env_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import sys 4 | from typing import List, Optional, Type, Union 5 | 6 | import habitat 7 | from habitat import logger 8 | from habitat import Config, Env, RLEnv, VectorEnv, make_dataset 9 | from habitat_baselines.utils.env_utils import make_env_fn 10 | 11 | random.seed(0) 12 | 13 | SLURM_JOBID = os.environ.get("SLURM_JOB_ID", None) 14 | 15 | 16 | def is_slurm_job() -> bool: 17 | return SLURM_JOBID is not None 18 | 19 | 20 | def is_slurm_batch_job() -> bool: 21 | r"""Heuristic to determine if a slurm job is a batch job or not. Batch jobs 22 | will have a job name that is not a shell unless the user specifically set the job 23 | name to that of a shell. Interactive jobs have a shell name as their job name. 24 | """ 25 | return is_slurm_job() and os.environ.get("SLURM_JOB_NAME", None) not in ( 26 | None, 27 | "bash", 28 | "zsh", 29 | "fish", 30 | "tcsh", 31 | "sh", 32 | ) 33 | 34 | 35 | def construct_envs( 36 | config: Config, 37 | env_class: Type[Union[Env, RLEnv]], 38 | workers_ignore_signals: bool = False, 39 | auto_reset_done: bool = True, 40 | episodes_allowed: Optional[List[str]] = None, 41 | ) -> VectorEnv: 42 | r"""Create VectorEnv object with specified config and env class type. 
43 | To allow better performance, dataset are split into small ones for 44 | each individual env, grouped by scenes. 45 | :param config: configs that contain num_environments as well as information 46 | :param necessary to create individual environments. 47 | :param env_class: class type of the envs to be created. 48 | :param workers_ignore_signals: Passed to :ref:`habitat.VectorEnv`'s constructor 49 | :param auto_reset_done: Whether or not to automatically reset the env on done 50 | :return: VectorEnv object created according to specification. 51 | """ 52 | 53 | num_envs_per_gpu = config.NUM_ENVIRONMENTS 54 | if isinstance(config.SIMULATOR_GPU_IDS, list): 55 | gpus = config.SIMULATOR_GPU_IDS 56 | else: 57 | gpus = [config.SIMULATOR_GPU_IDS] 58 | num_gpus = len(gpus) 59 | num_envs = num_gpus * num_envs_per_gpu 60 | 61 | if episodes_allowed is not None: 62 | config.defrost() 63 | config.TASK_CONFIG.DATASET.EPISODES_ALLOWED = episodes_allowed 64 | config.freeze() 65 | 66 | configs = [] 67 | env_classes = [env_class for _ in range(num_envs)] 68 | dataset = make_dataset(config.TASK_CONFIG.DATASET.TYPE) 69 | scenes = config.TASK_CONFIG.DATASET.CONTENT_SCENES 70 | if "*" in config.TASK_CONFIG.DATASET.CONTENT_SCENES: 71 | scenes = dataset.get_scenes_to_load(config.TASK_CONFIG.DATASET) 72 | logger.info(f"SPLTI: {config.TASK_CONFIG.DATASET.SPLIT}, NUMBER OF SCENES: {len(scenes)}") 73 | 74 | if num_envs > 1: 75 | if len(scenes) == 0: 76 | raise RuntimeError( 77 | "No scenes to load, multi-process logic relies on being able" 78 | " to split scenes uniquely between processes" 79 | ) 80 | 81 | if len(scenes) < num_envs and len(scenes) != 1: 82 | raise RuntimeError( 83 | "reduce the number of GPUs or envs as there" 84 | " aren't enough number of scenes" 85 | ) 86 | 87 | random.shuffle(scenes) 88 | 89 | if len(scenes) == 1: 90 | scene_splits = [[scenes[0]] for _ in range(num_envs)] 91 | else: 92 | scene_splits = [[] for _ in range(num_envs)] 93 | for idx, scene in enumerate(scenes): 94 | scene_splits[idx % len(scene_splits)].append(scene) 95 | 96 | assert sum(map(len, scene_splits)) == len(scenes) 97 | 98 | for i in range(num_gpus): 99 | for j in range(num_envs_per_gpu): 100 | proc_config = config.clone() 101 | proc_config.defrost() 102 | proc_id = (i * num_envs_per_gpu) + j 103 | 104 | task_config = proc_config.TASK_CONFIG 105 | task_config.SEED += proc_id 106 | if len(scenes) > 0: 107 | task_config.DATASET.CONTENT_SCENES = scene_splits[proc_id] 108 | 109 | task_config.SIMULATOR.HABITAT_SIM_V0.GPU_DEVICE_ID = gpus[i] 110 | 111 | task_config.SIMULATOR.AGENT_0.SENSORS = config.SENSORS 112 | 113 | proc_config.freeze() 114 | configs.append(proc_config) 115 | 116 | is_debug = True if sys.gettrace() else False 117 | # env_entry = habitat.ThreadedVectorEnv if is_debug else habitat.VectorEnv 118 | env_entry = habitat.ThreadedVectorEnv 119 | envs = env_entry( 120 | make_env_fn=make_env_fn, 121 | env_fn_args=tuple(zip(configs, env_classes)), 122 | auto_reset_done=auto_reset_done, 123 | workers_ignore_signals=workers_ignore_signals, 124 | ) 125 | return envs 126 | 127 | 128 | def construct_envs_auto_reset_false( 129 | config: Config, env_class: Type[Union[Env, RLEnv]] 130 | ) -> VectorEnv: 131 | return construct_envs(config, env_class, auto_reset_done=False) 132 | 133 | def construct_envs_for_rl( 134 | config: Config, 135 | env_class: Type[Union[Env, RLEnv]], 136 | workers_ignore_signals: bool = False, 137 | auto_reset_done: bool = True, 138 | episodes_allowed: Optional[List[str]] = None, 139 | ) -> VectorEnv: 140 
| r"""Create VectorEnv object with specified config and env class type. 141 | To allow better performance, dataset are split into small ones for 142 | each individual env, grouped by scenes. 143 | :param config: configs that contain num_environments as well as information 144 | :param necessary to create individual environments. 145 | :param env_class: class type of the envs to be created. 146 | :param workers_ignore_signals: Passed to :ref:`habitat.VectorEnv`'s constructor 147 | :param auto_reset_done: Whether or not to automatically reset the env on done 148 | :return: VectorEnv object created according to specification. 149 | """ 150 | 151 | num_envs_per_gpu = config.NUM_ENVIRONMENTS 152 | if isinstance(config.SIMULATOR_GPU_IDS, list): 153 | gpus = config.SIMULATOR_GPU_IDS 154 | else: 155 | gpus = [config.SIMULATOR_GPU_IDS] 156 | num_gpus = len(gpus) 157 | num_envs = num_gpus * num_envs_per_gpu 158 | 159 | if episodes_allowed is not None: 160 | config.defrost() 161 | config.TASK_CONFIG.DATASET.EPISODES_ALLOWED = episodes_allowed 162 | config.freeze() 163 | 164 | configs = [] 165 | env_classes = [env_class for _ in range(num_envs)] 166 | dataset = make_dataset(config.TASK_CONFIG.DATASET.TYPE) 167 | scenes = config.TASK_CONFIG.DATASET.CONTENT_SCENES 168 | if "*" in config.TASK_CONFIG.DATASET.CONTENT_SCENES: 169 | scenes = dataset.get_scenes_to_load(config.TASK_CONFIG.DATASET) 170 | 171 | if num_envs > 1: 172 | if len(scenes) == 0: 173 | raise RuntimeError( 174 | "No scenes to load, multi-process logic relies on being able" 175 | " to split scenes uniquely between processes" 176 | ) 177 | 178 | if len(scenes) < num_envs and len(scenes) != 1: 179 | raise RuntimeError( 180 | "reduce the number of GPUs or envs as there" 181 | " aren't enough number of scenes" 182 | ) 183 | random.shuffle(scenes) 184 | 185 | if len(scenes) == 1: 186 | scene_splits = [[scenes[0]] for _ in range(num_envs)] 187 | else: 188 | scene_splits = [[] for _ in range(num_envs)] 189 | for idx, scene in enumerate(scenes): 190 | scene_splits[idx % len(scene_splits)].append(scene) 191 | 192 | assert sum(map(len, scene_splits)) == len(scenes) 193 | 194 | for i in range(num_gpus): 195 | for j in range(num_envs_per_gpu): 196 | proc_config = config.clone() 197 | proc_config.defrost() 198 | proc_id = (i * num_envs_per_gpu) + j 199 | 200 | task_config = proc_config.TASK_CONFIG 201 | task_config.SEED += proc_id 202 | if len(scenes) > 0: 203 | task_config.DATASET.CONTENT_SCENES = scene_splits[proc_id] 204 | 205 | task_config.SIMULATOR.HABITAT_SIM_V0.GPU_DEVICE_ID = gpus[i] 206 | 207 | task_config.SIMULATOR.AGENT_0.SENSORS = config.SENSORS 208 | 209 | proc_config.freeze() 210 | configs.append(proc_config) 211 | 212 | is_debug = True if sys.gettrace() else False 213 | env_entry = habitat.ThreadedVectorEnv if is_debug else habitat.VectorEnv 214 | envs = env_entry( 215 | make_env_fn=make_env_fn, 216 | env_fn_args=tuple(zip(configs, env_classes)), 217 | auto_reset_done=auto_reset_done, 218 | workers_ignore_signals=workers_ignore_signals, 219 | ) 220 | return envs 221 | -------------------------------------------------------------------------------- /vlnce_baselines/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | import numpy as np 4 | import math 5 | import copy 6 | 7 | class ARGS(): 8 | def __init__(self): 9 | self.local_rank = 0 10 | 11 | def reduce_loss(tensor, rank, world_size): 12 | with torch.no_grad(): 13 | dist.reduce(tensor, 
dst=0) 14 | if rank == 0: 15 | # print(tensor) 16 | tensor /= world_size 17 | 18 | def gather_list_and_concat(list_of_nums,world_size): 19 | if not torch.is_tensor(list_of_nums): 20 | tensor = torch.Tensor(list_of_nums).cuda() 21 | else: 22 | if list_of_nums.is_cuda == False: 23 | tensor = list_of_nums.cuda() 24 | else: 25 | tensor = list_of_nums 26 | gather_t = [torch.ones_like(tensor) for _ in 27 | range(world_size)] 28 | dist.all_gather(gather_t, tensor) 29 | return gather_t 30 | 31 | def repeat_allocation(allocations, max_number): 32 | if torch.is_tensor(max_number): 33 | max_number = max_number.long().item() 34 | else: 35 | max_number = max_number.long() 36 | allocation_number = len(allocations) 37 | repeat_time, res = max_number // allocation_number, max_number % allocation_number 38 | allocations_ = [] 39 | for i in range(repeat_time): 40 | allocations_ += copy.deepcopy(allocations) 41 | allocations_ += copy.deepcopy(allocations)[:res] 42 | 43 | return allocations_ 44 | 45 | 46 | def allocate(number, ep_length, size_per_time): 47 | length_to_indexes = {ep_length[i]: [] for i in 48 | range(len(ep_length))} 49 | for i in range(len(ep_length)): 50 | length_to_indexes[ep_length[i]] += [i]*number[i] 51 | 52 | values = [] 53 | for i in range(len(number)): 54 | values += [ep_length[i]] * number[i] 55 | 56 | groups = int((len(values) - 0.01) // size_per_time + 1) 57 | 58 | values.sort(reverse=True) 59 | 60 | load_balance_groups = [[] for grp in range(groups)] 61 | 62 | for v in values: 63 | load_balance_groups.sort(key=lambda x: sum(x)) 64 | load_balance_groups[0].append(v) 65 | 66 | indexes = [] 67 | set_length = list(set(ep_length)) 68 | for i in range(groups): 69 | index = np.zeros(len(load_balance_groups[i]),dtype=int) 70 | for j in range(len(set_length)): 71 | length_indexes = length_to_indexes[set_length[j]] 72 | position = np.where(np.array(load_balance_groups[i]) == 73 | set_length[j])[0] 74 | position_length = len(position) 75 | # print(position_length,j) 76 | index[position] = length_indexes[:position_length] 77 | # print(length_indexes) 78 | length_to_indexes[set_length[j]] = length_indexes[position_length:] 79 | indexes.append((index).tolist()) 80 | 81 | return indexes 82 | 83 | def allocate_instructions(instruction_lengths, allocations,ep_length, instruction_ids): 84 | instruction_ids_copy = copy.deepcopy(instruction_ids) 85 | allocations_copy = copy.deepcopy(allocations) 86 | instruction_lengths_copy = copy.deepcopy(instruction_lengths) 87 | values = [] 88 | value_indexes = [] 89 | weights = [] 90 | for i in range(len(instruction_lengths)): 91 | instruction_length = instruction_lengths[i] 92 | values += instruction_length 93 | value_indexes += len(instruction_length)*[i] 94 | weights += [ep_length[i]] * len(instruction_length) 95 | # values = np.array(values) 96 | # value_indexes = np.array(value_indexes) 97 | values = np.array(values) 98 | weights = np.array(weights) 99 | value_indexes = np.array(value_indexes) 100 | sorted_index = np.argsort(values*weights)[::-1] 101 | values = values[sorted_index] 102 | value_indexes = value_indexes[sorted_index] 103 | weights = weights[sorted_index] 104 | 105 | groups = len(allocations) 106 | load_balance_groups = [[] for grp in range(groups)] 107 | group_weights = [[] for grp in range(groups)] 108 | instruction_allocations = [[] for grp in range(groups)] 109 | for j in range(len(values)): 110 | summation = np.array([np.sum(np.array(load_balance_groups[i])*np.array(group_weights[i])) for i in range(groups)]) 111 | sorted_index = 
np.argsort(summation) 112 | for i in sorted_index: 113 | index = value_indexes[j] 114 | value = values[j] 115 | if index in allocations_copy[i]: 116 | allocations_copy[i].remove(index) 117 | load_balance_groups[i].append(value) 118 | group_weights[i].append(weights[j]) 119 | # check[i].append(index) 120 | index_in_length = np.where(np.array(instruction_lengths_copy[index]) == value)[0][0] 121 | instruction_lengths_copy[index].pop(index_in_length) 122 | instruction_allocations[i].append(instruction_ids_copy[index].pop(index_in_length)) 123 | break 124 | 125 | return instruction_allocations 126 | 127 | 128 | def allocate_by_scene_for_ddp(number, ep_length, size_per_time): 129 | length_to_indexes = {ep_length[i]: [] for i in 130 | range(len(ep_length))} 131 | for i in range(len(ep_length)): 132 | length_to_indexes[ep_length[i]] += [i]*number[i] 133 | 134 | values = [] 135 | for i in range(len(number)): 136 | values += [ep_length[i]] * number[i] 137 | 138 | groups = int((len(values) - 0.01) // size_per_time + 1) 139 | 140 | values.sort(reverse=True) 141 | 142 | load_balance_groups = [[] for grp in range(groups)] 143 | 144 | for v in values: 145 | load_balance_groups.sort(key=lambda x: sum(x)) 146 | load_balance_groups[0].append(v) 147 | 148 | indexes = [] 149 | set_length = list(set(ep_length)) 150 | for i in range(groups): 151 | index = np.zeros(len(load_balance_groups[i]),dtype=int) 152 | for j in range(len(set_length)): 153 | length_indexes = length_to_indexes[set_length[j]] 154 | position = np.where(np.array(load_balance_groups[i]) == 155 | set_length[j])[0] 156 | position_length = len(position) 157 | # print(position_length,j) 158 | index[position] = length_indexes[:position_length] 159 | # print(length_indexes) 160 | length_to_indexes[set_length[j]] = length_indexes[position_length:] 161 | indexes.append((index).tolist()) 162 | 163 | return indexes 164 | 165 | 166 | def get_camera_orientations12(): 167 | base_angle_deg = 30 168 | base_angle_rad = math.pi / 6 169 | orient_dict = {} 170 | for k in range(1,12): 171 | orient_dict[str(base_angle_deg*k)] = [0.0, base_angle_rad*k, 0.0] 172 | return orient_dict 173 | 174 | 175 | def get_camera_orientations24(): 176 | base_angle_deg = 15 177 | base_angle_rad = math.pi / 12 178 | orient_dict = {} 179 | for k in range(1,24): 180 | orient_dict[str(base_angle_deg*k)] = [0.0, base_angle_rad*k, 0.0] 181 | return orient_dict 182 | 183 | 184 | def length2mask(length, size=None): 185 | batch_size = len(length) 186 | size = int(max(length)) if size is None else size 187 | mask = (torch.arange(size, dtype=torch.int64).unsqueeze(0).repeat(batch_size, 1) 188 | > (torch.LongTensor(length) - 1).unsqueeze(1)).cuda() 189 | return mask 190 | 191 | 192 | def dir_angle_feature(angle_list, device=None): 193 | feature_dim = 64 194 | batch_size = len(angle_list) 195 | max_leng = max([len(k) for k in angle_list]) + 1 # +1 for stop 196 | heading_enc = torch.zeros( 197 | batch_size, max_leng, feature_dim, dtype=torch.float32) 198 | 199 | for i in range(batch_size): 200 | for j, angle_rad in enumerate(angle_list[i]): 201 | heading_enc[i][j] = torch.tensor( 202 | [math.sin(angle_rad), 203 | math.cos(angle_rad)] * (feature_dim // 2)) 204 | 205 | return heading_enc 206 | 207 | 208 | def dir_angle_feature_with_ele(angle_list, device=None): 209 | feature_dim = 128 210 | batch_size = len(angle_list) 211 | max_leng = max([len(k) for k in angle_list]) + 1 # +1 for stop 212 | heading_enc = torch.zeros( 213 | batch_size, max_leng, feature_dim, dtype=torch.float32) 214 | 215 | for i 
in range(batch_size): 216 | for j, angle_rad in enumerate(angle_list[i]): 217 | heading_enc[i][j] = torch.tensor( 218 | [ 219 | math.sin(angle_rad), math.cos(angle_rad), 220 | math.sin(0.0), math.cos(0.0), # elevation 221 | ] * (128 // 4)) 222 | 223 | return heading_enc 224 | 225 | 226 | 227 | -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | name: morph 2 | channels: 3 | - aihabitat 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=conda_forge 8 | - _openmp_mutex=4.5=1_llvm 9 | - attrs=21.4.0=pyhd8ed1ab_0 10 | - bzip2=1.0.8=h7b6447c_0 11 | - c-ares=1.18.1=h7f8727e_0 12 | - ca-certificates=2024.7.2=h06a4308_0 13 | - certifi=2022.12.7=py37h06a4308_0 14 | - cmake=3.14.0=h52cb24c_0 15 | - colorama=0.4.4=pyh9f0ad1d_0 16 | - cycler=0.11.0=pyhd8ed1ab_0 17 | - expat=2.4.4=h295c915_0 18 | - ffmpeg=4.3.2=h37c90e5_3 19 | - freetype=2.10.4=h0708190_1 20 | - fribidi=1.0.10=h36c2ea0_0 21 | - giflib=5.2.1=h36c2ea0_2 22 | - gitdb=4.0.9=pyhd8ed1ab_0 23 | - gitpython=3.1.27=pyhd8ed1ab_0 24 | - gmp=6.2.1=h58526e2_0 25 | - gnutls=3.6.13=h85f3911_1 26 | - habitat-sim-challenge-2022=0.2.1=py3.7_headless_bullet_linux_f02ce8317b4bcbccf1ed92a6436f6aabc9aecec6 27 | - habitat-sim-mutex=1.0=headless_bullet 28 | - headless=2.0=0 29 | - icu=67.1=he1b5a44_0 30 | - imageio=2.16.1=pyhcf75d05_0 31 | - imageio-ffmpeg=0.4.5=pyhd8ed1ab_0 32 | - jbig=2.1=h7f98852_2003 33 | - jpeg=9e=h7f98852_0 34 | - kiwisolver=1.4.0=py37h7cecad7_0 35 | - krb5=1.19.2=hac12032_0 36 | - lame=3.100=h7f98852_1001 37 | - lcms2=2.12=hddcbb42_0 38 | - ld_impl_linux-64=2.35.1=h7274673_9 39 | - lerc=3.0=h9c3ff4c_0 40 | - libblas=3.9.0=13_linux64_openblas 41 | - libcblas=3.9.0=13_linux64_openblas 42 | - libcurl=7.80.0=h0b77cf5_0 43 | - libdeflate=1.10=h7f98852_0 44 | - libedit=3.1.20210910=h7f8727e_0 45 | - libev=4.33=h7f8727e_1 46 | - libffi=3.3=he6710b0_2 47 | - libgcc-ng=11.2.0=h1d223b6_14 48 | - libgfortran-ng=11.2.0=h69a702a_14 49 | - libgfortran5=11.2.0=h5c6108e_14 50 | - libimagequant=2.17.0=h7f98852_1 51 | - liblapack=3.9.0=13_linux64_openblas 52 | - libllvm11=11.1.0=hf817b99_3 53 | - libnghttp2=1.46.0=hce63b2e_0 54 | - libopenblas=0.3.18=pthreads_h8fe5266_0 55 | - libpng=1.6.37=h21135ba_2 56 | - libssh2=1.9.0=h1ba5d50_1 57 | - libstdcxx-ng=11.2.0=he4da1e4_14 58 | - libtiff=4.3.0=h542a066_3 59 | - libwebp=1.2.2=h3452ae3_0 60 | - libwebp-base=1.2.2=h7f98852_1 61 | - libxcb=1.13=h7f98852_1004 62 | - libzlib=1.2.11=h36c2ea0_1013 63 | - llvm-openmp=13.0.1=he0ac6c6_1 64 | - lz4-c=1.9.3=h9c3ff4c_1 65 | - matplotlib=3.2.2=1 66 | - matplotlib-base=3.2.2=py37h1d35a4c_1 67 | - ncurses=6.3=h7f8727e_2 68 | - nettle=3.6=he412f7d_0 69 | - openh264=2.1.1=h780b84a_0 70 | - openjpeg=2.4.0=hb52868f_1 71 | - openssl=1.1.1w=h7f8727e_0 72 | - pip=21.2.2=py37h06a4308_0 73 | - pthread-stubs=0.4=h36c2ea0_1001 74 | - pyparsing=3.0.7=pyhd8ed1ab_0 75 | - python=3.7.11=h12debd9_0 76 | - python-dateutil=2.8.2=pyhd8ed1ab_0 77 | - python_abi=3.7=2_cp37m 78 | - quaternion=2022.2.10.14.20.39=py37h5e8e339_0 79 | - readline=8.1.2=h7f8727e_1 80 | - rhash=1.4.1=h3c74f83_1 81 | - scipy=1.7.3=py37hf2a6cf1_0 82 | - seaborn=0.12.2=py37h06a4308_0 83 | - six=1.16.0=pyh6c4a22f_0 84 | - smmap=3.0.5=pyh44b312d_0 85 | - sqlite=3.37.2=hc218d9a_0 86 | - tk=8.6.11=h1ccaba5_0 87 | - tornado=6.2=py37h5eee18b_0 88 | - tqdm=4.63.0=pyhd8ed1ab_0 89 | - typing_extensions=4.3.0=py37h06a4308_0 90 | - x264=1!161.3030=h7f98852_1 91 | - 
xorg-fixesproto=5.0=h7f98852_1002 92 | - xorg-inputproto=2.3.2=h7f98852_1002 93 | - xorg-kbproto=1.0.7=h7f98852_1002 94 | - xorg-libx11=1.7.2=h7f98852_0 95 | - xorg-libxau=1.0.9=h7f98852_0 96 | - xorg-libxcursor=1.2.0=h7f98852_0 97 | - xorg-libxdmcp=1.1.3=h7f98852_0 98 | - xorg-libxext=1.3.4=h7f98852_1 99 | - xorg-libxfixes=5.0.3=h7f98852_1004 100 | - xorg-libxi=1.7.10=h7f98852_0 101 | - xorg-libxinerama=1.1.4=h9c3ff4c_1001 102 | - xorg-libxrandr=1.5.2=h7f98852_1 103 | - xorg-libxrender=0.9.10=h7f98852_1003 104 | - xorg-randrproto=1.5.0=h7f98852_1001 105 | - xorg-renderproto=0.11.1=h7f98852_1002 106 | - xorg-xextproto=7.3.0=h7f98852_1002 107 | - xorg-xproto=7.0.31=h7f98852_1007 108 | - xz=5.2.5=h7b6447c_0 109 | - zlib=1.2.11=h36c2ea0_1013 110 | - zstd=1.5.2=ha95c52a_0 111 | - pip: 112 | - absl-py==1.0.0 113 | - addict==2.4.0 114 | - appdirs==1.4.4 115 | - astor==0.8.1 116 | - astunparse==1.6.3 117 | - backcall==0.2.0 118 | - backports-cached-property==1.0.2 119 | - beautifulsoup4==4.12.3 120 | - boto3==1.33.13 121 | - botocore==1.33.13 122 | - braceexpand==0.1.7 123 | - cachetools==5.0.0 124 | - charset-normalizer==2.0.12 125 | - click==8.0.4 126 | - clip==1.0 127 | - cloudpickle==2.0.0 128 | - comm==0.1.4 129 | - configargparse==1.7 130 | - dash==2.15.0 131 | - dash-core-components==2.0.0 132 | - dash-html-components==2.0.0 133 | - dash-table==5.0.0 134 | - decorator==4.4.2 135 | - distlib==0.3.8 136 | - docker-pycreds==0.4.0 137 | - docstring-parser==0.14.1 138 | - dtw==1.4.0 139 | - fastdtw==0.3.4 140 | - fastjsonschema==2.20.0 141 | - filelock==3.12.2 142 | - flask==2.2.5 143 | - flatbuffers==24.3.25 144 | - frozendict==2.4.4 145 | - ftfy==6.1.1 146 | - gast==0.4.0 147 | - gdown==4.7.3 148 | - google-auth==2.6.0 149 | - google-auth-oauthlib==0.4.6 150 | - google-pasta==0.2.0 151 | - grpcio==1.45.0rc1 152 | - gym==0.23.1 153 | - gym-notices==0.0.6 154 | - h5py==3.8.0 155 | - habitat-sim==0.2.1 156 | - huggingface-hub==0.0.12 157 | - idna==3.3 158 | - ifcfg==0.22 159 | - importlib-metadata==6.7.0 160 | - importlib-resources==5.12.0 161 | - ipython==7.34.0 162 | - ipywidgets==8.1.3 163 | - itsdangerous==2.1.2 164 | - jedi==0.19.1 165 | - jinja2==3.1.4 166 | - jmespath==1.0.1 167 | - joblib==1.3.2 168 | - jsonlines==3.1.0 169 | - jsonschema==4.17.3 170 | - jupyter-core==4.12.0 171 | - jupyterlab-widgets==3.0.11 172 | - keras==2.11.0 173 | - keras-applications==1.0.8 174 | - keras-preprocessing==1.1.2 175 | - libclang==18.1.1 176 | - llvmlite==0.31.0 177 | - lmdb==1.3.0 178 | - markdown==3.3.6 179 | - markdown-it-py==2.2.0 180 | - markupsafe==2.1.5 181 | - matplotlib-inline==0.1.6 182 | - mdurl==0.1.2 183 | - mock==5.1.0 184 | - moviepy==2.0.0.dev2 185 | - msgpack==1.0.3 186 | - msgpack-numpy==0.4.7.1 187 | - nbformat==5.7.0 188 | - nest-asyncio==1.6.0 189 | - networkx==2.6.3 190 | - numba==0.48.0 191 | - numpy==1.21.6 192 | - oauthlib==3.2.0 193 | - objectio==0.2.29 194 | - open3d==0.17.0 195 | - opencv-python==4.5.5.64 196 | - opt-einsum==3.3.0 197 | - packaging==24.0 198 | - pandas==1.3.0 199 | - pandocfilters==1.5.0 200 | - parso==0.8.4 201 | - pathtools==0.1.2 202 | - pexpect==4.9.0 203 | - pickle5==0.0.12 204 | - pickleshare==0.7.5 205 | - pillow==9.5.0 206 | - pkgutil-resolve-name==1.3.10 207 | - platformdirs==3.11.0 208 | - plotly==5.18.0 209 | - prompt-toolkit==3.0.47 210 | - protobuf==3.19.6 211 | - psutil==6.0.0 212 | - ptyprocess==0.7.0 213 | - pyasn1==0.4.8 214 | - pyasn1-modules==0.2.8 215 | - pygments==2.17.2 216 | - pyliblzfse==0.4.1 217 | - pyquaternion==0.9.9 218 | - 
pyrsistent==0.19.3 219 | - pysocks==1.7.1 220 | - pytorch-transformers==1.2.0 221 | - pytz==2024.1 222 | - pywavelets==1.3.0 223 | - pyyaml==6.0 224 | - regex==2024.4.16 225 | - requests==2.27.1 226 | - requests-oauthlib==1.3.1 227 | - retrying==1.3.4 228 | - rich==13.7.1 229 | - rsa==4.8 230 | - s3transfer==0.8.2 231 | - sacremoses==0.0.53 232 | - scikit-image==0.19.3 233 | - scikit-learn==1.0.2 234 | - sentencepiece==0.2.0 235 | - sentry-sdk==2.7.1 236 | - setproctitle==1.3.3 237 | - setuptools==68.0.0 238 | - shtab==1.7.1 239 | - simplejson==3.17.6 240 | - soupsieve==2.4.1 241 | - tenacity==8.2.3 242 | - tensorboard==2.11.2 243 | - tensorboard-data-server==0.6.1 244 | - tensorboard-plugin-wit==1.8.1 245 | - tensorflow==2.11.0 246 | - tensorflow-estimator==2.11.0 247 | - tensorflow-io-gcs-filesystem==0.34.0 248 | - termcolor==2.3.0 249 | - threadpoolctl==3.1.0 250 | - tifffile==2021.11.2 251 | - tinycudann==1.7 252 | - tokenizers==0.10.3 253 | - torch==1.12.1+cu113 254 | - torch-kdtree==1.0 255 | - torchvision==0.13.1+cu113 256 | - traitlets==5.8.0 257 | - transformers==4.9.2 258 | - trimesh==3.9.1 259 | - trove-classifiers==2023.7.6 260 | - typeguard==3.0.2 261 | - typer==0.4.0 262 | - typing-extensions==4.7.1 263 | - tyro==0.5.2 264 | - urllib3==1.26.15 265 | - virtualenv==20.23.1 266 | - viser==0.0.10 267 | - wandb==0.15.2 268 | - wcwidth==0.2.5 269 | - webdataset==0.1.40 270 | - webencodings==0.5.1 271 | - websocket-client==1.4.2 272 | - websockets==11.0.3 273 | - werkzeug==2.2.3 274 | - wheel==0.37.1 275 | - widgetsnbextension==4.0.11 276 | - wrapt==1.14.1 277 | - xatlas==0.0.7 278 | - xxhash==3.2.0 279 | - yacs==0.1.8 280 | - zipp==3.7.0 281 | prefix: /opt/conda/envs/morph 282 | -------------------------------------------------------------------------------- /vlnce_baselines/config/default.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Union 2 | 3 | import habitat_baselines.config.default 4 | from habitat.config.default import CONFIG_FILE_SEPARATOR 5 | from habitat.config.default import Config as CN 6 | 7 | from habitat_extensions.config.default import ( 8 | get_extended_config as get_task_config, 9 | ) 10 | 11 | # ----------------------------------------------------------------------------- 12 | # EXPERIMENT CONFIG 13 | # ----------------------------------------------------------------------------- 14 | _C = CN() 15 | _C.BASE_TASK_CONFIG_PATH = "habitat_extensions/config/vlnce_task.yaml" 16 | _C.TASK_CONFIG = CN() # task_config will be stored as a config node 17 | _C.TRAINER_NAME = "dagger" 18 | _C.ENV_NAME = "VLNCEDaggerEnv" 19 | _C.SIMULATOR_GPU_IDS = [0] 20 | _C.VIDEO_OPTION = [] # options: "disk", "tensorboard" 21 | _C.VIDEO_DIR = "videos/debug" 22 | _C.TENSORBOARD_DIR = "data/tensorboard_dirs/debug" 23 | _C.RESULTS_DIR = "data/checkpoints/pretrained/evals" 24 | 25 | # ----------------------------------------------------------------------------- 26 | # EVAL CONFIG 27 | # ----------------------------------------------------------------------------- 28 | _C.EVAL = CN() 29 | # The split to evaluate on 30 | _C.EVAL.SPLIT = "val_seen" 31 | _C.EVAL.EPISODE_COUNT = -1 32 | _C.EVAL.LANGUAGES = ["en-US", "en-IN"] 33 | _C.EVAL.SAMPLE = False 34 | _C.EVAL.SAVE_RESULTS = True 35 | _C.EVAL.EVAL_NONLEARNING = False 36 | _C.EVAL.NONLEARNING = CN() 37 | _C.EVAL.NONLEARNING.AGENT = "RandomAgent" 38 | 39 | # ----------------------------------------------------------------------------- 40 | # INFERENCE CONFIG 41 | # 
----------------------------------------------------------------------------- 42 | _C.INFERENCE = CN() 43 | _C.INFERENCE.SPLIT = "test" 44 | _C.INFERENCE.LANGUAGES = ["en-US", "en-IN"] 45 | _C.INFERENCE.SAMPLE = False 46 | _C.INFERENCE.USE_CKPT_CONFIG = True 47 | _C.INFERENCE.CKPT_PATH = "data/checkpoints/CMA_PM_DA_Aug.pth" 48 | _C.INFERENCE.PREDICTIONS_FILE = "predictions.json" 49 | _C.INFERENCE.INFERENCE_NONLEARNING = False 50 | _C.INFERENCE.NONLEARNING = CN() 51 | _C.INFERENCE.NONLEARNING.AGENT = "RandomAgent" 52 | _C.INFERENCE.FORMAT = "rxr" # either 'rxr' or 'r2r' 53 | # ----------------------------------------------------------------------------- 54 | # IMITATION LEARNING CONFIG 55 | # ----------------------------------------------------------------------------- 56 | _C.IL = CN() 57 | _C.IL.lr = 2.5e-4 58 | _C.IL.batch_size = 5 59 | _C.IL.epochs = 4 60 | _C.IL.use_iw = True 61 | # inflection coefficient for RxR training set GT trajectories (guide): 1.9 62 | # inflection coefficient for R2R training set GT trajectories: 3.2 63 | _C.IL.inflection_weight_coef = 3.2 64 | # load an already trained model for fine tuning 65 | _C.IL.waypoint_aug = False 66 | _C.IL.load_from_ckpt = False 67 | _C.IL.ckpt_to_load = "data/checkpoints/ckpt.0.pth" 68 | # if True, loads the optimizer state, epoch, and step_id from the ckpt dict. 69 | _C.IL.is_requeue = False 70 | # if True, start training from the saved epoch 71 | # ----------------------------------------------------------------------------- 72 | # IL: RXR TRAINER CONFIG 73 | # ----------------------------------------------------------------------------- 74 | _C.IL.RECOLLECT_TRAINER = CN() 75 | _C.IL.RECOLLECT_TRAINER.preload_trajectories_file = True 76 | _C.IL.RECOLLECT_TRAINER.trajectories_file = ( 77 | "data/trajectories_dirs/debug/trajectories.json.gz" 78 | ) 79 | # if set to a positive int, episodes with longer paths are ignored in training 80 | _C.IL.RECOLLECT_TRAINER.max_traj_len = -1 81 | # if set to a positive int, effective_batch_size must be some multiple of 82 | # IL.batch_size. Gradient accumulation enables an arbitrarily high "effective" 83 | # batch size. 84 | _C.IL.RECOLLECT_TRAINER.effective_batch_size = -1 85 | _C.IL.RECOLLECT_TRAINER.preload_size = 30 86 | _C.IL.RECOLLECT_TRAINER.use_iw = True 87 | _C.IL.RECOLLECT_TRAINER.gt_file = ( 88 | "data/datasets/RxR_VLNCE_v0_enc_xlmr/{split}/{split}_{role}_gt.json.gz" 89 | ) 90 | # ----------------------------------------------------------------------------- 91 | # IL: DAGGER CONFIG 92 | # ----------------------------------------------------------------------------- 93 | _C.IL.DAGGER = CN() 94 | _C.IL.DAGGER.iterations = 10 95 | _C.IL.DAGGER.update_size = 5000 96 | _C.IL.DAGGER.p = 0.75 97 | _C.IL.DAGGER.expert_policy_sensor = "SHORTEST_PATH_SENSOR" 98 | _C.IL.DAGGER.expert_policy_sensor_uuid = "shortest_path_sensor" 99 | _C.IL.DAGGER.load_space = False 100 | # if True, load saved observation space and action space 101 | _C.IL.DAGGER.lmdb_map_size = 1.0e12 102 | # if True, saves data to disk in fp16 and converts back to fp32 when loading. 103 | _C.IL.DAGGER.lmdb_fp16 = False 104 | # How often to commit the writes to the DB; fewer commits are 105 | # better, but everything must be in memory until a commit happens. 106 | _C.IL.DAGGER.lmdb_commit_frequency = 500 107 | # If True, load precomputed features directly from lmdb_features_dir. 
108 | _C.IL.DAGGER.preload_lmdb_features = False 109 | _C.IL.DAGGER.lmdb_features_dir = ( 110 | "data/trajectories_dirs/debug/trajectories.lmdb" 111 | ) 112 | # ----------------------------------------------------------------------------- 113 | # RL CONFIG 114 | # ----------------------------------------------------------------------------- 115 | _C.RL = CN() 116 | _C.RL.POLICY = CN() 117 | _C.RL.POLICY.OBS_TRANSFORMS = CN() 118 | _C.RL.POLICY.OBS_TRANSFORMS.ENABLED_TRANSFORMS = [ 119 | "CenterCropperPerSensor", 120 | ] 121 | _C.RL.POLICY.OBS_TRANSFORMS.CENTER_CROPPER_PER_SENSOR = CN() 122 | _C.RL.POLICY.OBS_TRANSFORMS.CENTER_CROPPER_PER_SENSOR.SENSOR_CROPS = [ 123 | ("rgb", (224, 224)), 124 | ("depth", (256, 256)), 125 | ] 126 | _C.RL.POLICY.OBS_TRANSFORMS.RESIZER_PER_SENSOR = CN() 127 | _C.RL.POLICY.OBS_TRANSFORMS.RESIZER_PER_SENSOR.SIZES = [ 128 | ("rgb", (224, 298)), 129 | ("depth", (256, 341)), 130 | ] 131 | # ----------------------------------------------------------------------------- 132 | # MODELING CONFIG 133 | # ----------------------------------------------------------------------------- 134 | _C.MODEL = CN() 135 | _C.MODEL.policy_name = "CMAPolicy" # or "Seq2SeqPolicy" 136 | _C.MODEL.ablate_depth = False 137 | _C.MODEL.ablate_rgb = False 138 | _C.MODEL.ablate_instruction = False 139 | 140 | _C.MODEL.INSTRUCTION_ENCODER = CN() 141 | _C.MODEL.INSTRUCTION_ENCODER.sensor_uuid = "instruction" 142 | _C.MODEL.INSTRUCTION_ENCODER.vocab_size = 2504 143 | _C.MODEL.INSTRUCTION_ENCODER.use_pretrained_embeddings = True 144 | _C.MODEL.INSTRUCTION_ENCODER.embedding_file = ( 145 | "data/datasets/R2R_VLNCE_v1-2_preprocessed/embeddings.json.gz" 146 | ) 147 | _C.MODEL.INSTRUCTION_ENCODER.dataset_vocab = ( 148 | "data/datasets/R2R_VLNCE_v1-2_preprocessed/train/train.json.gz" 149 | ) 150 | _C.MODEL.INSTRUCTION_ENCODER.fine_tune_embeddings = False 151 | _C.MODEL.INSTRUCTION_ENCODER.embedding_size = 50 152 | _C.MODEL.INSTRUCTION_ENCODER.hidden_size = 128 153 | _C.MODEL.INSTRUCTION_ENCODER.rnn_type = "LSTM" 154 | _C.MODEL.INSTRUCTION_ENCODER.final_state_only = True 155 | _C.MODEL.INSTRUCTION_ENCODER.bidirectional = False 156 | 157 | _C.MODEL.spatial_output = True 158 | _C.MODEL.RGB_ENCODER = CN() 159 | _C.MODEL.RGB_ENCODER.cnn_type = "TorchVisionResNet50" 160 | _C.MODEL.RGB_ENCODER.output_size = 256 161 | 162 | _C.MODEL.DEPTH_ENCODER = CN() 163 | _C.MODEL.DEPTH_ENCODER.cnn_type = "VlnResnetDepthEncoder" 164 | _C.MODEL.DEPTH_ENCODER.output_size = 128 165 | # type of resnet to use 166 | _C.MODEL.DEPTH_ENCODER.backbone = "resnet50" 167 | # path to DDPPO resnet weights 168 | _C.MODEL.DEPTH_ENCODER.ddppo_checkpoint = ( 169 | "data/ddppo-models/gibson-2plus-resnet50.pth" 170 | ) 171 | 172 | _C.MODEL.STATE_ENCODER = CN() 173 | _C.MODEL.STATE_ENCODER.hidden_size = 512 174 | _C.MODEL.STATE_ENCODER.rnn_type = "GRU" 175 | 176 | _C.MODEL.SEQ2SEQ = CN() 177 | _C.MODEL.SEQ2SEQ.use_prev_action = False 178 | 179 | _C.MODEL.PROGRESS_MONITOR = CN() 180 | _C.MODEL.PROGRESS_MONITOR.use = False 181 | _C.MODEL.PROGRESS_MONITOR.alpha = 1.0 # loss multiplier 182 | 183 | 184 | def purge_keys(config: CN, keys: List[str]) -> None: 185 | for k in keys: 186 | del config[k] 187 | config.register_deprecated_key(k) 188 | 189 | 190 | def get_config( 191 | config_paths: Optional[Union[List[str], str]] = None, 192 | opts: Optional[list] = None, 193 | ) -> CN: 194 | r"""Create a unified config with default values. Initialized from the 195 | habitat_baselines default config. 
Overwritten by values from 196 | `config_paths` and overwritten by options from `opts`. 197 | Args: 198 | config_paths: List of config paths or string that contains comma 199 | separated list of config paths. 200 | opts: Config options (keys, values) in a list (e.g., passed from 201 | command line into the config. For example, `opts = ['FOO.BAR', 202 | 0.5]`. Argument can be used for parameter sweeping or quick tests. 203 | """ 204 | config = CN() 205 | config.merge_from_other_cfg(habitat_baselines.config.default._C) 206 | purge_keys(config, ["SIMULATOR_GPU_ID", "TEST_EPISODE_COUNT"]) 207 | config.merge_from_other_cfg(_C.clone()) 208 | 209 | if config_paths: 210 | if isinstance(config_paths, str): 211 | if CONFIG_FILE_SEPARATOR in config_paths: 212 | config_paths = config_paths.split(CONFIG_FILE_SEPARATOR) 213 | else: 214 | config_paths = [config_paths] 215 | 216 | prev_task_config = "" 217 | for config_path in config_paths: 218 | config.merge_from_file(config_path) 219 | if config.BASE_TASK_CONFIG_PATH != prev_task_config: 220 | config.TASK_CONFIG = get_task_config( 221 | config.BASE_TASK_CONFIG_PATH 222 | ) 223 | prev_task_config = config.BASE_TASK_CONFIG_PATH 224 | 225 | if opts: 226 | config.CMD_TRAILING_OPTS = opts 227 | config.merge_from_list(opts) 228 | 229 | config.freeze() 230 | return config 231 | -------------------------------------------------------------------------------- /vlnce_baselines/common/recollection_dataset.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | from collections import defaultdict, deque 4 | 5 | import numpy as np 6 | import torch 7 | import tqdm 8 | from gym import Space 9 | from habitat.config.default import Config 10 | from habitat.sims.habitat_simulator.actions import HabitatSimActions 11 | from habitat_baselines.common.environments import get_env_class 12 | from habitat_baselines.common.obs_transformers import ( 13 | apply_obs_transforms_obs_space, 14 | get_active_obs_transforms, 15 | ) 16 | 17 | from habitat_extensions.task import ALL_ROLES_MASK, RxRVLNCEDatasetV1 18 | from vlnce_baselines.common.env_utils import construct_envs 19 | from vlnce_baselines.common.utils import extract_instruction_tokens 20 | 21 | 22 | class TeacherRecollectionDataset(torch.utils.data.IterableDataset): 23 | def __init__(self, config: Config): 24 | super().__init__() 25 | self.config = config 26 | # self._preload = [] 27 | self._preload = deque() 28 | self.world_size = self.config.GPU_NUMBERS 29 | self.rank = self.config.local_rank 30 | 31 | assert ( 32 | config.IL.RECOLLECT_TRAINER.preload_size >= config.IL.batch_size 33 | ), "preload size must be greater than batch size." 
34 | self.envs = None 35 | self._env_observations = None 36 | 37 | if config.IL.use_iw: 38 | self.inflec_weights = torch.tensor( 39 | [1.0, config.IL.inflection_weight_coef] 40 | ) 41 | else: 42 | self.inflec_weights = torch.tensor([1.0, 1.0]) 43 | 44 | if self.config.IL.RECOLLECT_TRAINER.preload_trajectories_file: 45 | self.config.defrost() 46 | self.config.IL.RECOLLECT_TRAINER.trajectories_file = \ 47 | self.config.IL.RECOLLECT_TRAINER.trajectories_file[ 48 | :-8] + '_w' + \ 49 | str(self.world_size) + '_r' + str(self.rank) + '.json.gz' 50 | self.config.freeze() 51 | with gzip.open( 52 | config.IL.RECOLLECT_TRAINER.trajectories_file, "rt" 53 | ) as f: 54 | self.trajectories = json.load(f) 55 | else: 56 | self.trajectories = self.collect_dataset() 57 | 58 | self.initialize_sims() 59 | 60 | def initialize_sims(self): 61 | config = self.config.clone() 62 | config.defrost() 63 | config.TASK_CONFIG.MEASUREMENTS = [] 64 | config.freeze() 65 | 66 | self.envs = construct_envs( 67 | config, 68 | get_env_class(config.ENV_NAME), 69 | episodes_allowed=list(self.trajectories.keys()), 70 | ) 71 | self.length = sum(self.envs.number_of_episodes) 72 | self.obs_transforms = get_active_obs_transforms(self.config) 73 | self._observation_space = apply_obs_transforms_obs_space( 74 | self.envs.observation_spaces[0], self.obs_transforms 75 | ) 76 | 77 | self.env_step = [0 for _ in range(self.envs.num_envs)] 78 | self._env_observations = [[] for _ in range(self.envs.num_envs)] 79 | 80 | observations = self.envs.reset() 81 | observations = extract_instruction_tokens( 82 | observations, 83 | self.config.TASK_CONFIG.TASK.INSTRUCTION_SENSOR_UUID, 84 | ) 85 | for i, ep in enumerate(self.envs.current_episodes()): 86 | path_step = self.trajectories[str(ep.episode_id)][0] 87 | self._env_observations[i].append( 88 | ( 89 | observations[i], 90 | path_step[0], # prev_action 91 | path_step[2], # oracle_action 92 | ) 93 | ) 94 | 95 | @property 96 | def batch_size(self): 97 | return self.config.IL.batch_size 98 | 99 | @property 100 | def observation_space(self) -> Space: 101 | assert self.envs is not None, "Simulator must first be loaded." 102 | assert self._observation_space is not None 103 | return self._observation_space 104 | 105 | @property 106 | def action_space(self) -> Space: 107 | assert self.envs is not None, "Simulator must first be loaded." 108 | return self.envs.action_spaces[0] 109 | 110 | def close_sims(self): 111 | self.envs.close() 112 | del self.envs 113 | del self._env_observations 114 | self.envs = None 115 | self._env_observations = None 116 | 117 | def collect_dataset(self): 118 | r"""Uses the ground truth trajectories to create a teacher forcing 119 | dataset for a given split. Loads both guide and follower episodes. 
120 | """ 121 | trajectories = defaultdict(list) 122 | split = self.config.TASK_CONFIG.DATASET.SPLIT 123 | 124 | if "{role}" in self.config.IL.RECOLLECT_TRAINER.gt_file: 125 | gt_data = {} 126 | for role in RxRVLNCEDatasetV1.annotation_roles: 127 | if ( 128 | ALL_ROLES_MASK not in self.config.TASK_CONFIG.DATASET.ROLES 129 | and role not in self.config.TASK_CONFIG.DATASET.ROLES 130 | ): 131 | continue 132 | 133 | with gzip.open( 134 | self.config.IL.RECOLLECT_TRAINER.gt_file.format( 135 | split=split, role=role 136 | ), 137 | "rt", 138 | ) as f: 139 | gt_data.update(json.load(f)) 140 | else: 141 | with gzip.open( 142 | self.config.IL.RECOLLECT_TRAINER.gt_path.format(split=split) 143 | ) as f: 144 | gt_data = json.load(f) 145 | 146 | t = ( 147 | tqdm.tqdm(gt_data.items(), "GT Collection") 148 | if self.config.use_pbar 149 | else gt_data.items() 150 | ) 151 | 152 | for episode_id, trajectory in t: 153 | if ( 154 | self.config.IL.RECOLLECT_TRAINER.max_traj_len != -1 155 | and len(trajectory["actions"]) 156 | > self.config.IL.RECOLLECT_TRAINER.max_traj_len 157 | ) or ( 158 | self.config.IL.RECOLLECT_TRAINER.min_traj_len != -1 159 | and len(trajectory["actions"]) 160 | < self.config.IL.RECOLLECT_TRAINER.min_traj_len 161 | ): 162 | continue 163 | 164 | for i, action in enumerate(trajectory["actions"]): 165 | prev_action = ( 166 | trajectories[episode_id][i - 1][1] 167 | if i 168 | else HabitatSimActions.STOP 169 | ) 170 | 171 | # [prev_action, action, oracle_action] 172 | trajectories[episode_id].append([prev_action, action, action]) 173 | 174 | trajectories = dict(list(trajectories.items())[self.rank::self.world_size]) 175 | self.config.defrost() 176 | self.config.IL.RECOLLECT_TRAINER.trajectories_file = \ 177 | self.config.IL.RECOLLECT_TRAINER.trajectories_file[:-8]+'_w'+ \ 178 | str(self.world_size)+'_r'+str(self.rank) + '.json.gz' 179 | self.config.freeze() 180 | with gzip.open( 181 | self.config.IL.RECOLLECT_TRAINER.trajectories_file, "wt" 182 | ) as f: 183 | f.write(json.dumps(trajectories)) 184 | return trajectories 185 | 186 | def _load_next(self): 187 | """ 188 | Episode length is currently not considered. We were previously batching episodes 189 | together with similar lengths. Not sure if we need to bring that back. 
190 | """ 191 | # self.rank = 0 192 | if len(self._preload): 193 | # out = self._preload[self.rank] 194 | # self._preload = self._preload[self.world_size:] 195 | # return out 196 | return self._preload.popleft() 197 | 198 | while ( 199 | len(self._preload) < self.config.IL.RECOLLECT_TRAINER.preload_size 200 | ): 201 | current_episodes = self.envs.current_episodes() 202 | prev_eps = current_episodes 203 | 204 | # get the next action for each env 205 | actions = [ 206 | self.trajectories[str(ep.episode_id)][self.env_step[i]][1] 207 | for i, ep in enumerate(current_episodes) 208 | ] 209 | 210 | outputs = self.envs.step(actions) 211 | observations, _, dones, _ = [list(x) for x in zip(*outputs)] 212 | observations = extract_instruction_tokens( 213 | observations, 214 | self.config.TASK_CONFIG.TASK.INSTRUCTION_SENSOR_UUID, 215 | ) 216 | 217 | current_episodes = self.envs.current_episodes() 218 | 219 | for i in range(self.envs.num_envs): 220 | self.env_step[i] += 1 221 | if dones[i]: 222 | assert len(self._env_observations[i]) == len( 223 | self.trajectories[str(prev_eps[i].episode_id)] 224 | ), "Collected episode does not match the step count of trajectory" 225 | self._preload.append( 226 | ( 227 | [o[0] for o in self._env_observations[i]], 228 | [o[1] for o in self._env_observations[i]], 229 | [o[2] for o in self._env_observations[i]], 230 | ) 231 | ) 232 | self._env_observations[i] = [] 233 | self.env_step[i] = 0 234 | 235 | path_step = self.trajectories[ 236 | str(current_episodes[i].episode_id) 237 | ][self.env_step[i]] 238 | self._env_observations[i].append( 239 | ( 240 | observations[i], 241 | path_step[0], # prev_action 242 | path_step[2], # oracle_action 243 | ) 244 | ) 245 | assert ( 246 | len(self._env_observations[i]) 247 | <= self.config.TASK_CONFIG.ENVIRONMENT.MAX_EPISODE_STEPS 248 | ), "Trajectories should be no more than the maximum episode steps." 249 | 250 | # out = self._preload[self.rank] 251 | # self._preload = self._preload[self.world_size:] 252 | # return out 253 | return self._preload.popleft() 254 | 255 | def __next__(self): 256 | """Takes about 1s to once self._load_next() has finished with a batch 257 | size of 5. For this reason, we probably don't need to use extra workers. 258 | """ 259 | x = self._load_next() 260 | obs, prev_actions, oracle_actions = x 261 | 262 | # transpose obs 263 | obs_t = defaultdict(list) 264 | for k in obs[0]: 265 | for i in range(len(obs)): 266 | obs_t[k].append(obs[i][k]) 267 | 268 | obs_t[k] = np.array(obs_t[k]) 269 | 270 | for k, v in obs_t.items(): 271 | obs_t[k] = torch.from_numpy(np.copy(v)) 272 | 273 | prev_actions = torch.from_numpy(np.copy(prev_actions)) 274 | oracle_actions = torch.from_numpy(np.copy(oracle_actions)) 275 | 276 | inflections = torch.cat( 277 | [ 278 | torch.tensor([1], dtype=torch.long), 279 | (oracle_actions[1:] != oracle_actions[:-1]).long(), 280 | ] 281 | ) 282 | 283 | return ( 284 | obs_t, 285 | prev_actions, 286 | oracle_actions, 287 | self.inflec_weights[inflections], 288 | ) 289 | 290 | def __iter__(self): 291 | worker_info = torch.utils.data.get_worker_info() 292 | if worker_info is not None: 293 | assert ( 294 | worker_info.num_workers == 1 295 | ), "multiple workers not supported." 
296 | 297 | return self 298 | -------------------------------------------------------------------------------- /vlnce_baselines/models/encoders/resnet_encoders.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torchvision.models as models 6 | from gym import spaces 7 | from habitat import logger 8 | from habitat_baselines.rl.ddppo.policy import resnet 9 | from habitat_baselines.rl.ddppo.policy.resnet_policy import ResNetEncoder 10 | import torchvision 11 | #import clip 12 | from .clip import CLIP 13 | from PIL import Image 14 | from torchvision import transforms 15 | 16 | class VlnResnetDepthEncoder(nn.Module): 17 | def __init__( 18 | self, 19 | observation_space, 20 | output_size=128, 21 | checkpoint="NONE", 22 | backbone="resnet50", 23 | resnet_baseplanes=32, 24 | normalize_visual_inputs=False, 25 | trainable=False, 26 | spatial_output: bool = False, 27 | ): 28 | super().__init__() 29 | self.visual_encoder = ResNetEncoder( 30 | spaces.Dict({"depth": observation_space.spaces["depth"]}), 31 | baseplanes=resnet_baseplanes, 32 | ngroups=resnet_baseplanes // 2, 33 | make_backbone=getattr(resnet, backbone), 34 | normalize_visual_inputs=normalize_visual_inputs, 35 | ) 36 | 37 | for param in self.visual_encoder.parameters(): 38 | param.requires_grad_(trainable) 39 | 40 | if checkpoint != "NONE": 41 | ddppo_weights = torch.load(checkpoint) 42 | 43 | weights_dict = {} 44 | for k, v in ddppo_weights["state_dict"].items(): 45 | split_layer_name = k.split(".")[2:] 46 | if split_layer_name[0] != "visual_encoder": 47 | continue 48 | 49 | layer_name = ".".join(split_layer_name[1:]) 50 | weights_dict[layer_name] = v 51 | 52 | del ddppo_weights 53 | self.visual_encoder.load_state_dict(weights_dict, strict=True) 54 | 55 | self.spatial_output = spatial_output 56 | 57 | if not self.spatial_output: 58 | self.output_shape = (output_size,) 59 | # self.visual_fc = nn.Sequential( 60 | # nn.Flatten(), 61 | # nn.Linear( 62 | # np.prod(self.visual_encoder.output_shape), output_size 63 | # ), 64 | # nn.ReLU(True), 65 | # ) 66 | None 67 | else: 68 | self.spatial_embeddings = nn.Embedding( 69 | self.visual_encoder.output_shape[1] 70 | * self.visual_encoder.output_shape[2], 71 | 64, 72 | ) 73 | 74 | self.output_shape = list(self.visual_encoder.output_shape) 75 | self.output_shape[0] += self.spatial_embeddings.embedding_dim 76 | self.output_shape = tuple(self.output_shape) 77 | 78 | 79 | def forward(self, observations): 80 | """ 81 | Args: 82 | observations: [BATCH, HEIGHT, WIDTH, CHANNEL] 83 | Returns: 84 | [BATCH, OUTPUT_SIZE] 85 | """ 86 | if "depth_features" in observations: 87 | x = observations["depth_features"] 88 | else: 89 | x = self.visual_encoder(observations) 90 | 91 | if self.spatial_output: 92 | b, c, h, w = x.size() 93 | 94 | spatial_features = ( 95 | self.spatial_embeddings( 96 | torch.arange( 97 | 0, 98 | self.spatial_embeddings.num_embeddings, 99 | device=x.device, 100 | dtype=torch.long, 101 | ) 102 | ) 103 | .view(1, -1, h, w) 104 | .expand(b, self.spatial_embeddings.embedding_dim, h, w) 105 | ) 106 | 107 | return torch.cat([x, spatial_features], dim=1) 108 | else: 109 | # return self.visual_fc(x) 110 | return x 111 | 112 | 113 | class TorchVisionResNet50(nn.Module): 114 | r""" 115 | Takes in observations and produces an embedding of the rgb component. 
116 | 117 | Args: 118 | observation_space: The observation_space of the agent 119 | output_size: The size of the embedding vector 120 | device: torch.device 121 | """ 122 | 123 | def __init__( 124 | self, 125 | observation_space, 126 | output_size, 127 | device, 128 | spatial_output: bool = False, 129 | ): 130 | super().__init__() 131 | self.device = device 132 | self.resnet_layer_size = 2048 133 | linear_layer_input_size = 0 134 | if "rgb" in observation_space.spaces: 135 | self._n_input_rgb = observation_space.spaces["rgb"].shape[2] 136 | obs_size_0 = observation_space.spaces["rgb"].shape[0] 137 | obs_size_1 = observation_space.spaces["rgb"].shape[1] 138 | if obs_size_0 != 224 or obs_size_1 != 224: 139 | logger.warn( 140 | "TorchVisionResNet50: observation size is not conformant to expected ResNet input size [3x224x224]" 141 | ) 142 | linear_layer_input_size += self.resnet_layer_size 143 | else: 144 | self._n_input_rgb = 0 145 | 146 | if self.is_blind: 147 | self.cnn = nn.Sequential() 148 | return 149 | 150 | rgb_resnet = models.resnet50(pretrained=True) 151 | rgb_modules = list(rgb_resnet.children())[:-2] 152 | self.cnn = torch.nn.Sequential(*rgb_modules) 153 | 154 | # disable gradients for resnet, params frozen 155 | for param in self.cnn.parameters(): 156 | param.requires_grad_(False) 157 | self.cnn.eval() 158 | 159 | self.spatial_output = spatial_output 160 | 161 | if not self.spatial_output: 162 | self.output_shape = (output_size,) 163 | # self.fc = nn.Linear(linear_layer_input_size, output_size) 164 | # self.activation = nn.ReLU() 165 | None 166 | else: 167 | class SpatialAvgPool(nn.Module): 168 | def forward(self, x): 169 | x = F.adaptive_avg_pool2d(x, (4, 4)) 170 | 171 | return x 172 | self.cnn.avgpool = SpatialAvgPool() 173 | self.cnn.fc = nn.Sequential() 174 | self.spatial_embeddings = nn.Embedding(4 * 4, 64) 175 | self.output_shape = ( 176 | self.resnet_layer_size + self.spatial_embeddings.embedding_dim, 177 | 4, 178 | 4, 179 | ) 180 | 181 | # self.layer_extract = self.cnn._modules.get("avgpool") 182 | 183 | self.rgb_transform = torch.nn.Sequential( 184 | transforms.ConvertImageDtype(torch.float), 185 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), 186 | ) 187 | 188 | @property 189 | def is_blind(self): 190 | return self._n_input_rgb == 0 191 | 192 | def forward(self, observations): 193 | r"""Sends RGB observation through the TorchVision ResNet50 pre-trained 194 | on ImageNet. Sends through fully connected layer, activates, and 195 | returns final embedding. 
196 | """ 197 | 198 | def resnet_forward(observation): 199 | # resnet_output = torch.zeros( 200 | # 1, dtype=torch.float32, device=observation.device 201 | # ) 202 | # def hook(m, i, o): 203 | # resnet_output.set_(o) 204 | 205 | # output: [BATCH x RESNET_DIM] 206 | # h = self.layer_extract.register_forward_hook(hook) 207 | resnet_output = self.cnn(observation) 208 | # h.remove() 209 | return resnet_output 210 | 211 | if "rgb_features" in observations: 212 | resnet_output = observations["rgb_features"] 213 | else: 214 | # permute tensor to dimension [BATCH x CHANNEL x HEIGHT x WIDTH] 215 | rgb_observations = observations["rgb"].permute(0, 3, 1, 2) 216 | 217 | rgb_observations = self.rgb_transform(rgb_observations) 218 | # rgb_observations = rgb_observations / 255.0 # normalize RGB 219 | 220 | resnet_output = resnet_forward(rgb_observations.contiguous()) 221 | 222 | if self.spatial_output: 223 | b, c, h, w = resnet_output.size() 224 | 225 | spatial_features = ( 226 | self.spatial_embeddings( 227 | torch.arange( 228 | 0, 229 | self.spatial_embeddings.num_embeddings, 230 | device=resnet_output.device, 231 | dtype=torch.long, 232 | ) 233 | ) 234 | .view(1, -1, h, w) 235 | .expand(b, self.spatial_embeddings.embedding_dim, h, w) 236 | ) 237 | 238 | return torch.cat([resnet_output, spatial_features], dim=1)#.to(self.device) 239 | else: 240 | # return self.activation( 241 | # self.fc(torch.flatten(resnet_output, 1)) 242 | # ) # [BATCH x OUTPUT_DIM] 243 | return resnet_output 244 | 245 | 246 | class CLIPEncoder(nn.Module): 247 | r""" 248 | Takes in observations and produces a CLIP embedding of the rgb component. 249 | 250 | Args: 251 | device: torch.device used for the CLIP visual encoder 252 | patch_size: patch size of the CLIP ViT backbone (16 or 32) 253 | 254 | """ 255 | 256 | def __init__( 257 | self, device, patch_size=16 258 | ): 259 | super().__init__() 260 | #self.model, _ = clip.load("data/ViT-B-32.pt", device=device) 261 | self.model = CLIP( 262 | input_resolution=224, patch_size=patch_size, width=768, layers=12, heads=12 263 | ) 264 | if patch_size == 16: 265 | # scripted_model = torch.load('data/ViT-B-'+str(patch_size)+'.pt') 266 | # state_dict = scripted_model.state_dict() 267 | # self.model.load_state_dict(state_dict) 268 | self.model.load_state_dict(torch.load('/data/ViT-B-'+str(patch_size)+'.pt', map_location = torch.device('cpu')).state_dict(),strict=False) 269 | self.model = self.model.to('cuda') # transfer to GPU 270 | elif patch_size == 32: 271 | self.model.load_state_dict(torch.jit.load('data/ViT-B-'+str(patch_size)+'.pt', map_location = torch.device('cpu')).state_dict(),strict=False) 272 | 273 | for param in self.model.parameters(): 274 | param.requires_grad_(False) 275 | self.model.eval() 276 | 277 | 278 | self.rgb_transform = torch.nn.Sequential( 279 | transforms.Resize((224,224), interpolation=Image.BICUBIC), 280 | transforms.ConvertImageDtype(torch.float), 281 | transforms.Normalize([0.48145466, 0.4578275, 0.40821073], [0.26862954, 0.26130258, 0.27577711]), 282 | ) 283 | 284 | 285 | def forward(self, observations, fine_grained_fts=False): 286 | r"""Sends the RGB observation through the pre-trained CLIP visual 287 | encoder (ViT) and returns the resulting image embedding as a 288 | float32 tensor. 
289 | """ 290 | 291 | rgb_observations = observations["rgb"].permute(0, 3, 1, 2) 292 | rgb_observations = self.rgb_transform(rgb_observations) 293 | rgb_observations = rgb_observations.to('cuda') #new 294 | 295 | if fine_grained_fts: 296 | output = self.model(rgb_observations.contiguous()) 297 | else: 298 | output = self.model.encode_image(rgb_observations.contiguous()) 299 | return output.float() # to fp32 -------------------------------------------------------------------------------- /vlnce_baselines/waypoint_networks/viz_utils.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import os 4 | import cv2 5 | import matplotlib.pyplot as plt 6 | import math 7 | import torch 8 | from PIL import Image 9 | 10 | ''' 11 | MP3D original semantic labels and reduced set correspondence 12 | # Original set from here: https://github.com/niessner/Matterport/blob/master/metadata/mpcat40.tsv 13 | 0 void 0 14 | 1 wall 15 structure 15 | 2 floor 17 free-space 16 | 3 chair 1 17 | 4 door 2 18 | 5 table 3 19 | 6 picture 18 20 | 7 cabinet 19 21 | 8 cushion 4 22 | 9 window 15 structure 23 | 10 sofa 5 24 | 11 bed 6 25 | 12 curtain 16 other 26 | 13 chest_of_drawers 20 27 | 14 plant 7 28 | 15 sink 8 29 | 16 stairs 17 free-space 30 | 17 ceiling 17 free-space 31 | 18 toilet 9 32 | 19 stool 21 33 | 20 towel 22 34 | 21 mirror 16 other 35 | 22 tv_monitor 10 36 | 23 shower 11 37 | 24 column 15 structure 38 | 25 bathtub 12 39 | 26 counter 13 40 | 27 fireplace 23 41 | 28 lighting 16 other 42 | 29 beam 16 other 43 | 30 railing 16 other 44 | 31 shelving 16 other 45 | 32 blinds 16 other 46 | 33 gym_equipment 24 47 | 34 seating 25 48 | 35 board_panel 16 other 49 | 36 furniture 16 other 50 | 37 appliances 14 51 | 38 clothes 26 52 | 39 objects 16 other 53 | 40 misc 16 other 54 | ''' 55 | # 27 categories which include the 21 object categories in the habitat challenge 56 | label_conversion_40_27 = {-1:0, 0:0, 1:15, 2:17, 3:1, 4:2, 5:3, 6:18, 7:19, 8:4, 9:15, 10:5, 11:6, 12:16, 13:20, 14:7, 15:8, 16:17, 17:17, 57 | 18:9, 19:21, 20:22, 21:16, 22:10, 23:11, 24:15, 25:12, 26:13, 27:23, 28:16, 29:16, 30:16, 31:16, 32:16, 58 | 33:24, 34:25, 35:16, 36:16, 37:14, 38:26, 39:16, 40:16} 59 | color_mapping_27 = { 60 | 0:(255,255,255), # white 61 | 1:(128,128,0), # olive (dark yellow) 62 | 2:(0,0,255), # blue 63 | 3:(255,0,0), # red 64 | 4:(255,0,255), # magenta 65 | 5:(0,255,255), # cyan 66 | 6:(255,165,0), # orange 67 | 7:(255,255,0), # yellow 68 | 8:(128,128,128), # gray 69 | 9:(128,0,0), # maroon 70 | 10:(255,20,147), # pink 71 | 11:(0,128,0), # dark green 72 | 12:(128,0,128), # purple 73 | 13:(0,128,128), # teal 74 | 14:(0,0,128), # navy (dark blue) 75 | 15:(210,105,30), # chocolate 76 | 16:(188,143,143), # rosy brown 77 | 17:(0,255,0), # green 78 | 18:(255,215,0), # gold 79 | 19:(0,0,0), # black 80 | 20:(192,192,192), # silver 81 | 21:(138,43,226), # blue violet 82 | 22:(255,127,80), # coral 83 | 23:(238,130,238), # violet 84 | 24:(245,245,220), # beige 85 | 25:(139,69,19), # saddle brown 86 | 26:(64,224,208) # turquoise 87 | } 88 | 89 | # three label classification (0:void, 1:occupied, 2:free) 90 | label_conversion_40_3 = {-1:0, 0:0, 1:1, 2:2, 3:1, 4:1, 5:1, 6:1, 7:1, 8:1, 9:1, 10:1, 11:1, 12:1, 13:1, 14:1, 15:1, 16:2, 17:2, 91 | 18:1, 19:1, 20:1, 21:1, 22:1, 23:1, 24:1, 25:1, 26:1, 27:1, 28:1, 29:1, 30:1, 31:1, 32:1, 92 | 33:1, 34:1, 35:1, 36:1, 37:1, 38:1, 39:1, 40:1} 93 | color_mapping_3 = { 94 | 0:(255,255,255), # white 95 | 1:(0,0,255), # blue 96 | 2:(0,255,0), # 
green 97 | } 98 | 99 | # visualize a scene map with it's sampled waypoints from RxR 100 | def vis_episode(gt_map_semantic, pose_coords, name="map_tmp", color_mapping=27): 101 | color_map = colorize_grid(gt_map_semantic.unsqueeze(0).unsqueeze(0), color_mapping=color_mapping) 102 | im_color_map = color_map[0,0,:,:,:].permute(1,2,0).cpu().numpy() 103 | 104 | plt.figure(figsize=(10 ,8)) 105 | plt.axis('off') 106 | plt.imshow(im_color_map) 107 | for i in range(len(pose_coords)): 108 | point = pose_coords[i] 109 | if point[0]>=0 and point[1]>=0: 110 | plt.scatter(point[0], point[1], color="blue", s=50) 111 | plt.savefig(name+'.png', bbox_inches='tight', pad_inches=0, dpi=100) 112 | plt.close() 113 | 114 | 115 | def vis_heatmaps(pred, gt): 116 | pred = pred.detach().cpu().numpy() 117 | gt = gt.detach().cpu().numpy() 118 | # heatmaps are 1 x h x w 119 | arr = [pred, gt] 120 | n = len(arr) 121 | plt.figure(figsize=(10,5)) 122 | for i, data in enumerate(arr): 123 | ax = plt.subplot(1, n, i+1) 124 | ax.axis('off') 125 | plt.imshow(data) 126 | plt.show() 127 | 128 | 129 | 130 | def write_img(img, savepath, name): 131 | # img: T x 3 x dim x dim, assumed normalized 132 | for i in range(img.shape[0]): 133 | vis_img = img[i,:,:,:].cpu().numpy() 134 | vis_img = np.transpose(vis_img, (1,2,0)) 135 | im_path = savepath + str(i) + "_" + name + ".png" 136 | cv2.imwrite(im_path, vis_img[:,:,::-1]*255.0) 137 | 138 | 139 | 140 | def colorize_grid(grid, color_mapping=27): # to pass into tensorboardX video 141 | # Input: grid -- B x T x C x grid_dim x grid_dim, where C=1,T=1 when gt and C=41,T>=1 for other 142 | # Output: grid_img -- B x T x 3 x grid_dim x grid_dim 143 | grid = grid.detach().cpu().numpy() 144 | grid_img = np.zeros((grid.shape[0], grid.shape[1], grid.shape[3], grid.shape[4], 3), dtype=np.uint8) 145 | if grid.shape[2] > 1: 146 | # For cells where prob distribution is all zeroes (or uniform), argmax returns arbitrary number (can be true for the accumulated maps) 147 | grid_prob_max = np.amax(grid, axis=2) 148 | inds = np.asarray(grid_prob_max<=0.05).nonzero() # if no label has prob higher than k then assume unobserved 149 | grid[inds[0], inds[1], 0, inds[2], inds[3]] = 1 # assign label 0 (void) to be the dominant label 150 | grid = np.argmax(grid, axis=2) # B x T x grid_dim x grid_dim 151 | else: 152 | grid = grid.squeeze(2) 153 | 154 | if color_mapping==27: 155 | color_mapping = color_mapping_27 156 | else: 157 | color_mapping = color_mapping_3 158 | for label in color_mapping.keys(): 159 | grid_img[ grid==label ] = color_mapping[label] 160 | 161 | return torch.tensor(grid_img.transpose(0, 1, 4, 2, 3), dtype=torch.uint8) 162 | 163 | 164 | def write_tensor_image(grid, savepath, name, sseg_labels=27): 165 | # grid: T x C x dim x dim 166 | grid_imgs = colorize_grid(grid.unsqueeze(0), color_mapping=sseg_labels) 167 | grid_imgs = grid_imgs.squeeze(0) 168 | grid_imgs = grid_imgs.detach().cpu().numpy() 169 | for t in range(grid_imgs.shape[0]): 170 | im = grid_imgs[t,:,:,:].transpose(1,2,0) 171 | im_path = savepath + str(t) + "_" + name + ".png" 172 | cv2.imwrite(im_path, im[:,:,::-1]) 173 | 174 | 175 | def write_tensor_imgSegm(img, savepath, name, t=None, labels=27, waypoints=None): 176 | # pred: T x C x dim x dim 177 | if img.shape[1] > 1: 178 | img = torch.argmax(img.cpu(), dim=1, keepdim=True) # T x 1 x cH x cW 179 | img_labels = img.squeeze(1) 180 | 181 | for i in range(img_labels.shape[0]): 182 | img0 = img_labels[i,:,:] 183 | 184 | vis_img = np.zeros((img0.shape[0], img0.shape[1], 3), dtype=np.uint8) 
185 | 186 | if labels==27: 187 | color_mapping = color_mapping_27 188 | else: 189 | color_mapping = color_mapping_3 190 | 191 | for label in color_mapping.keys(): 192 | vis_img[ img0==label ] = color_mapping[label] 193 | 194 | if t is None: 195 | im_path = savepath + str(i) + "_" + name + ".png" 196 | else: 197 | im_path = savepath + name + "_" + str(t) + "_" + str(i) + ".png" 198 | 199 | if waypoints != None: 200 | for coords in waypoints: 201 | vis_img[coords[1]-3:coords[1]+3,coords[0]-3:coords[0]+3,:] = np.array([0,0,1]) 202 | 203 | cv2.imwrite(im_path, vis_img[:,:,::-1]) 204 | 205 | 206 | def display_sample(rgb_obs, depth_obs, t, sseg_img=None, savepath=None): 207 | # sseg_img is semantic observation from Matterport habitat 208 | depth_obs = depth_obs / np.amax(depth_obs) # normalize for visualization 209 | rgb_img = Image.fromarray(rgb_obs, mode="RGB") 210 | depth_img = Image.fromarray((depth_obs * 255).astype(np.uint8), mode="L") 211 | 212 | plt.figure(figsize=(12 ,8)) 213 | plt.axis('off') 214 | plt.imshow(rgb_img) 215 | plt.savefig(savepath+str(t)+"_rgb.png", bbox_inches='tight', pad_inches=0, dpi=50) # 100 216 | plt.close() 217 | 218 | plt.figure(figsize=(12 ,8)) 219 | plt.axis('off') 220 | plt.imshow(depth_img) 221 | plt.savefig(savepath+str(t)+"_depth.png", bbox_inches='tight', pad_inches=0, dpi=50) # 100 222 | plt.close() 223 | 224 | 225 | def save_map_goal(gt_map_semantic, pose_coords, goal_pose_coords, save_img_dir_, t): 226 | color_map_sem = colorize_grid(gt_map_semantic) 227 | im = color_map_sem[0,0,:,:,:].permute(1,2,0).cpu().numpy() 228 | 229 | plt.figure(figsize=(10 ,7)) 230 | plt.axis('off') 231 | plt.imshow(im) 232 | if goal_pose_coords[0,0,0]>=0 and goal_pose_coords[0,0,1]>=0: 233 | plt.scatter(goal_pose_coords[0,0,0], goal_pose_coords[0,0,1], color="magenta", s=70) 234 | plt.scatter(pose_coords[0,0,0], pose_coords[0,0,1], color="blue", s=70) 235 | plt.savefig(save_img_dir_+str(t)+'.png', bbox_inches='tight', pad_inches=0, dpi=100) 236 | plt.close() 237 | 238 | 239 | 240 | def save_map_pred_steps(spatial_in, spatial_pred, objects_pred, ego_img_segm, save_img_dir_, t): 241 | 242 | color_spatial_in = colorize_grid(spatial_in, color_mapping=3) 243 | im_spatial_in = color_spatial_in[0,0,:,:,:].permute(1,2,0).cpu().numpy() 244 | 245 | color_spatial_pred = colorize_grid(spatial_pred, color_mapping=3) 246 | im_spatial_pred = color_spatial_pred[0,0,:,:,:].permute(1,2,0).cpu().numpy() 247 | 248 | color_objects_pred = colorize_grid(objects_pred, color_mapping=27) 249 | im_objects_pred = color_objects_pred[0,0,:,:,:].permute(1,2,0).cpu().numpy() 250 | 251 | color_ego_img_segm = colorize_grid(ego_img_segm, color_mapping=27) 252 | im_ego_img_segm = color_ego_img_segm[0,0,:,:,:].permute(1,2,0).cpu().numpy() 253 | 254 | plt.figure(figsize=(12 ,8)) 255 | plt.axis('off') 256 | plt.imshow(im_spatial_in) 257 | plt.savefig(save_img_dir_+str(t)+"_im_spatial_in.png", bbox_inches='tight', pad_inches=0, dpi=50) # 100 258 | plt.close() 259 | 260 | plt.figure(figsize=(12 ,8)) 261 | plt.axis('off') 262 | plt.imshow(im_spatial_pred) 263 | plt.savefig(save_img_dir_+str(t)+"_im_spatial_pred.png", bbox_inches='tight', pad_inches=0, dpi=50) # 100 264 | plt.close() 265 | 266 | plt.figure(figsize=(12 ,8)) 267 | plt.axis('off') 268 | plt.imshow(im_objects_pred) 269 | plt.savefig(save_img_dir_+str(t)+"_im_objects_pred.png", bbox_inches='tight', pad_inches=0, dpi=50) # 100 270 | plt.close() 271 | 272 | plt.figure(figsize=(12 ,8)) 273 | plt.axis('off') 274 | plt.imshow(im_ego_img_segm) 275 | 
plt.savefig(save_img_dir_+str(t)+"_im_ego_img_segm.png", bbox_inches='tight', pad_inches=0, dpi=50) # 100 276 | plt.close() 277 | 278 | 279 | def show_waypoint_pred(map_semantic, savepath, num_points, ltg=None, pose_coords=None, pred_waypoints=None, gt_waypoints=None): 280 | # Waypoints are provided in map pose coordinates 281 | color_map = colorize_grid(map_semantic.unsqueeze(0).unsqueeze(0)) 282 | im_color_map = color_map[0,0,:,:,:].permute(1,2,0).cpu().numpy() 283 | 284 | plt.figure(figsize=(10 ,8)) 285 | plt.axis('off') 286 | plt.imshow(im_color_map) 287 | 288 | for i in range(num_points): 289 | if gt_waypoints is not None: 290 | point_gt = gt_waypoints[i] 291 | if point_gt[0]>=0 and point_gt[1]>=0: 292 | plt.scatter(point_gt[0], point_gt[1], color="blue", s=50) 293 | plt.text(point_gt[0], point_gt[1], s=str(i), color="blue") 294 | if pred_waypoints is not None: 295 | point_pred = pred_waypoints[i] 296 | if point_pred[0]>=0 and point_pred[1]>=0: 297 | plt.scatter(point_pred[0], point_pred[1], color="red", s=50) 298 | plt.text(point_pred[0], point_pred[1], s=str(i), color="red") 299 | # ltg and agent position 300 | if ltg is not None: 301 | plt.scatter(ltg[0,0,0], ltg[0,0,1], color="magenta", s=50) 302 | if pose_coords is not None: 303 | plt.scatter(pose_coords[0,0,0], pose_coords[0,0,1], color="green", s=50) 304 | 305 | plt.savefig(savepath, bbox_inches='tight', pad_inches=0, dpi=100) 306 | plt.close() -------------------------------------------------------------------------------- /utils_p/memory.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from numpy.linalg import norm 4 | import pickle 5 | 6 | 7 | class Memory(object): 8 | """ 9 | Create the empty memory buffer 10 | """ 11 | 12 | def __init__(self, size, dimension=1 * 3 * 22 * 22, alpha=0.9): #1 * 3 * 224 * 224 13 | self.memory = {} 14 | self.size = size 15 | self.dimension = dimension 16 | self.alpha = alpha 17 | 18 | def reset(self): 19 | self.memory = {} 20 | 21 | def get_size(self): 22 | return len(self.memory) 23 | 24 | def push(self, keys, logits): 25 | # mo = 0.5 26 | keys = keys.reshape(len(keys), self.dimension) 27 | for i, key in enumerate(keys): 28 | 29 | if len(self.memory.keys()) >= self.size: 30 | # Memory is full, find the nearest neighbours and update them 31 | #key = key.reshape(len(key), self.dimension) 32 | all_keys = np.frombuffer(np.asarray(list(self.memory.keys())), dtype=np.float32).reshape(self.get_size(), self.dimension) 33 | similarity_scores = np.dot(all_keys, key.T) / (norm(all_keys, axis=1) * norm(key.T)) 34 | top_k_indices = np.argsort(similarity_scores)[-5:] # Top-k indices with highest similarity 35 | for idx in top_k_indices: 36 | mem_key = all_keys[idx].tobytes() 37 | top_k_logit = self.memory[mem_key] 38 | #self.memory[mem_key] = mo * top_k_logit + (1 - mo) * logits[i] 39 | # Update the memory with a weighted average of the top-k logits 40 | self.memory[mem_key] = self.alpha * top_k_logit + (1 - self.alpha) * logits[i] 41 | """ 42 | neighbors, similarity_scores = self.get_topk(np.array([key_flat]), k=5) 43 | 44 | for nkey, score in zip(neighbors, similarity_scores): 45 | mem_key = nkey.tobytes() 46 | self.memory[mem_key] = self.alpha * self.memory[mem_key] + (1 - self.alpha) * logits[i] 47 | 48 | """ 49 | else: 50 | # Memory is not full, add new key-logit pair to memory 51 | self.memory.update({key.reshape(self.dimension).tobytes(): logits[i]}) 52 | 53 | 54 | def _prepare_batch(self, sample, attention_weight): 55 | 
attention_weight = np.array(attention_weight / 0.2) 56 | attention_weight = np.exp(attention_weight) / (np.sum(np.exp(attention_weight))) 57 | ensemble_prediction = sample[0] * attention_weight[0] 58 | for i in range(1, len(sample)): 59 | ensemble_prediction = ensemble_prediction + sample[i] * attention_weight[i] 60 | 61 | return torch.FloatTensor(ensemble_prediction) 62 | 63 | def get_neighbours(self, keys, k): 64 | """ 65 | Returns samples from buffer using nearest neighbour approach 66 | """ 67 | samples = [] 68 | 69 | keys = keys.reshape(len(keys), self.dimension) 70 | total_keys = len(self.memory.keys()) 71 | self.all_keys = np.frombuffer( 72 | np.asarray(list(self.memory.keys())), dtype=np.float32).reshape(total_keys, self.dimension) 73 | 74 | for key in keys: 75 | similarity_scores = np.dot(self.all_keys, key.T) / (norm(self.all_keys, axis=1) * norm(key.T)) 76 | 77 | K_neighbour_keys = self.all_keys[np.argpartition(similarity_scores, -k)[-k:]] 78 | neighbours = [self.memory[nkey.tobytes()] for nkey in K_neighbour_keys] 79 | 80 | attention_weight = np.dot(K_neighbour_keys, key.T) / (norm(K_neighbour_keys, axis=1) * norm(key.T)) 81 | batch = self._prepare_batch(neighbours, attention_weight) 82 | samples.append(batch) 83 | 84 | return torch.stack(samples), np.mean(similarity_scores) 85 | 86 | 87 | 88 | def save_memory(self, file_path): 89 | with open(file_path, 'wb') as f: 90 | pickle.dump(self.memory, f) 91 | 92 | def load_memory(self, file_path): 93 | with open(file_path, 'rb') as f: 94 | self.memory = pickle.load(f) 95 | 96 | 97 | def get_topk_avg(self, keys, k): 98 | 99 | samples = [] 100 | 101 | keys = keys.reshape(len(keys), self.dimension) 102 | total_keys = len(self.memory.keys()) 103 | self.all_keys = np.frombuffer( 104 | np.asarray(list(self.memory.keys())), dtype=np.float32).reshape(total_keys, self.dimension) 105 | 106 | for key in keys: 107 | similarity_scores = np.dot(self.all_keys, key.T) / (norm(self.all_keys, axis=1) * norm(key.T)) 108 | 109 | K_neighbour_keys = self.all_keys[np.argpartition(similarity_scores, -k)[-k:]] 110 | neighbours = [self.memory[nkey.tobytes()] for nkey in K_neighbour_keys] 111 | 112 | mean_prompts = torch.FloatTensor(np.mean(neighbours, axis=0)) # (dimension,) 113 | # batch = self._prepare_batch(neighbours, attention_weight) #TEST 114 | samples.append(mean_prompts) 115 | 116 | #attention_weight = np.dot(K_neighbour_keys, key.T) / (norm(K_neighbour_keys, axis=1) * norm(key.T)) 117 | #batch = self._prepare_batch(neighbours, attention_weight) 118 | #samples.append(batch) 119 | 120 | return torch.stack(samples), np.mean(similarity_scores) 121 | 122 | 123 | 124 | #---------------------------------------------------------------------------------------Contextual Evolution Memory 125 | class Memory_vft(object): 126 | """ 127 | Create the empty memory buffer 128 | """ 129 | 130 | def __init__(self, size, dimension=1 * 1536, key_dimension=1*768, alpha=0.1): #, key_dim=768, dim = 1536 131 | # self.memory = {} 132 | self.size = size 133 | self.dimension = dimension 134 | self.key_dimension = key_dimension 135 | self.alpha = alpha 136 | # self.dim = dim 137 | # self.key_dim = key_dim 138 | logits = torch.randn(size, dimension) 139 | #self.memory = {torch.randn(1, key_dimension): torch.randn(1, dimension) for _ in range(size)} 140 | self.memory = { 141 | torch.randn(1, key_dimension).numpy().tobytes(): torch.randn(1, dimension) 142 | for _ in range(size) 143 | } 144 | 145 | 146 | 147 | def reset(self): 148 | self.memory = {} 149 | 150 | def get_size(self): 
151 | return len(self.memory) 152 | 153 | def push(self, keys, logits): 154 | 155 | keys = keys.reshape(len(keys), self.key_dimension) 156 | for i, key in enumerate(keys): 157 | 158 | if len(self.memory.keys()) >= self.size: 159 | # Memory is full, find the nearest neighbours and update them 160 | #key = key.reshape(len(key), self.dimension) 161 | all_keys = np.frombuffer(np.asarray(list(self.memory.keys())), dtype=np.float32).reshape(self.get_size(), self.key_dimension) 162 | similarity_scores = np.dot(all_keys, key.T) / (norm(all_keys, axis=1) * norm(key.T)) 163 | top_k_indices = np.argsort(similarity_scores)[-5:] # Top-k indices with highest similarity 164 | for idx in top_k_indices: 165 | mem_key = all_keys[idx].tobytes() 166 | #mem_key = all_keys[idx].tobytes() 167 | top_k_logit = self.memory[mem_key] 168 | #self.memory[mem_key] = mo * top_k_logit + (1 - mo) * logits[i] 169 | self.memory[mem_key] = self.alpha * top_k_logit + (1 - self.alpha) * logits[i] 170 | 171 | else: 172 | self.memory.update({key.reshape(self.key_dimension).tobytes(): logits[i]}) 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | def _prepare_batch(self, sample, attention_weight): 181 | attention_weight = np.array(attention_weight / 0.2) 182 | attention_weight = np.exp(attention_weight) / (np.sum(np.exp(attention_weight))) 183 | ensemble_prediction = sample[0] * attention_weight[0] 184 | for i in range(1, len(sample)): 185 | ensemble_prediction = ensemble_prediction + sample[i] * attention_weight[i] 186 | 187 | return torch.FloatTensor(ensemble_prediction) 188 | 189 | def get_neighbours(self, keys, k): 190 | """ 191 | Returns samples from buffer using nearest neighbour approach 192 | """ 193 | samples = [] 194 | 195 | keys = keys.reshape(len(keys), self.dimension) 196 | total_keys = len(self.memory.keys()) 197 | self.all_keys = np.frombuffer( 198 | np.asarray(list(self.memory.keys())), dtype=np.float32).reshape(total_keys, self.dimension) 199 | 200 | for key in keys: 201 | similarity_scores = np.dot(self.all_keys, key.T) / (norm(self.all_keys, axis=1) * norm(key.T)) 202 | 203 | K_neighbour_keys = self.all_keys[np.argpartition(similarity_scores, -k)[-k:]] 204 | neighbours = [self.memory[nkey.tobytes()] for nkey in K_neighbour_keys] 205 | 206 | attention_weight = np.dot(K_neighbour_keys, key.T) / (norm(K_neighbour_keys, axis=1) * norm(key.T)) 207 | batch = self._prepare_batch(neighbours, attention_weight) 208 | samples.append(batch) 209 | 210 | return torch.stack(samples), np.mean(similarity_scores) 211 | 212 | def save_memory(self, file_path): 213 | with open(file_path, 'wb') as f: 214 | pickle.dump(self.memory, f) 215 | 216 | def load_memory(self, file_path): 217 | with open(file_path, 'rb') as f: 218 | self.memory = pickle.load(f) 219 | 220 | 221 | def get_topk(self, keys, k): 222 | 223 | samples = [] 224 | 225 | keys = keys.reshape(len(keys), self.key_dimension) #(num_keys, dimension) 226 | total_keys = len(self.memory.keys()) 227 | 228 | self.all_keys = np.frombuffer( 229 | np.asarray(list(self.memory.keys())), dtype=np.float32).reshape(total_keys, self.key_dimension) #(total_keys, dimension) 230 | 231 | for key in keys: 232 | similarity_scores = np.dot(self.all_keys, key.T) / (norm(self.all_keys, axis=1) * norm(key.T)) 233 | 234 | K_neighbour_keys = self.all_keys[np.argpartition(similarity_scores, -k)[-k:]] #(k, dimension) 235 | neighbours = [self.memory[nkey.tobytes()] for nkey in K_neighbour_keys] #(k,dimension) 236 | 237 | attention_weight = np.dot(K_neighbour_keys, key.T) / (norm(K_neighbour_keys, axis=1) * 
norm(key.T)) 238 | batch = self._prepare_batch(neighbours, attention_weight) #TEST 239 | samples.append(batch) 240 | 241 | return torch.stack(samples), np.mean(similarity_scores) 242 | # return samples, np.mean(similarity_scores) 243 | 244 | 245 | def get_topk_mean(self, keys, k): 246 | 247 | samples = [] 248 | 249 | keys = keys.reshape(len(keys), self.dimension) #(num_keys, dimension) 250 | total_keys = len(self.memory.keys()) 251 | 252 | self.all_keys = np.frombuffer( 253 | np.asarray(list(self.memory.keys())), dtype=np.float32).reshape(total_keys, self.dimension) #(total_keys, dimension) 254 | 255 | for key in keys: 256 | similarity_scores = np.dot(self.all_keys, key.T) / (norm(self.all_keys, axis=1) * norm(key.T)) 257 | 258 | K_neighbour_keys = self.all_keys[np.argpartition(similarity_scores, -k)[-k:]] #(k, dimension) 259 | neighbours = [self.memory[nkey.tobytes()] for nkey in K_neighbour_keys] #(k,dimension) 260 | 261 | # attention_weight = np.dot(K_neighbour_keys, key.T) / (norm(K_neighbour_keys, axis=1) * norm(key.T)) 262 | # mean_prompts = np.mean(neighbours, axis=0) # (dimension,) 263 | mean_prompts = torch.FloatTensor(np.mean(neighbours, axis=0)) # (dimension,) 264 | # batch = self._prepare_batch(neighbours, attention_weight) #TEST 265 | samples.append(mean_prompts) 266 | 267 | return torch.stack(samples), np.mean(similarity_scores) 268 | 269 | 270 | 271 | 272 | def retrieve_prompt_add_avg(self, avg_pano_embeds, combined, top_k=16): #now for usage of visual feature 273 | """ 274 | Retrieve top-k similar prompts from memory for each directional pano_embeds (1*12*768) 275 | and prepend them to form new pano_embeds with prompts. 276 | 277 | Args: 278 | avg_pano_embeds: Tensor of shape (1, 768) representing the average panoramic embedding. 279 | top_k: Number of top similar prompts to retrieve from memory. 280 | 281 | Returns: 282 | avg_pano_with_prompts: Tensor of shape (1, 768) with enhanced embeddings. 
283 | """ 284 | ud = 0.2 285 | # (Unused) list kept from an earlier per-direction variant 286 | pano_with_prompts = [] 287 | 288 | # (Unused) linear layer for the commented-out concatenate-and-project variant below 289 | linear_layer = torch.nn.Linear(in_features=1536, out_features=768) 290 | 291 | # Ensure the query embedding is a NumPy array before searching the memory 292 | if isinstance(avg_pano_embeds, torch.Tensor): 293 | avg_pano_embeds = avg_pano_embeds.detach().cpu().numpy() 294 | 295 | posprompts, _ = self.get_topk(keys=avg_pano_embeds, k=top_k) 296 | 297 | # Calculate the mean of the top-k prompts (1*768) 298 | #mean_prompt = torch.from_numpy(prompts).mean(dim=0, keepdim=True) 299 | #mean_prompt = prompts.mean(dim=0, keepdim=True) 300 | avg_pano_embeds = torch.from_numpy(avg_pano_embeds).float() 301 | combined_embeds = torch.from_numpy(combined).float() 302 | # Concatenate the mean_prompt with direction_embed 303 | #concatenated = torch.cat([mean_prompt, avg_pano_embeds], dim=-1) # (1, 1536) 304 | 305 | # Use the linear layer to project back to (1, 768) 306 | #enhanced_embed = linear_layer(concatenated) 307 | enhanced_embed = combined_embeds * (1-ud) + posprompts.squeeze(0) * ud 308 | 309 | # Add the enhanced result to the pano_with_prompts list 310 | #pano_with_prompts.append(enhanced_embed) 311 | 312 | # Return the blended embedding directly; shape (1, 768) 313 | pano_with_posprompts = enhanced_embed 314 | 315 | return pano_with_posprompts 316 | 317 | -------------------------------------------------------------------------------- /vlnce_baselines/models/etp/nerf.py: -------------------------------------------------------------------------------- 1 | import torch 2 | # torch.autograd.set_detect_anomaly(True) 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | import tinycudann as tcnn 7 | import math 8 | import scipy.signal 9 | import heapq 10 | import cv2 11 | from tqdm import tqdm 12 | 13 | # Misc 14 | img2mse = lambda x, y : torch.mean((x - y) ** 2) 15 | mse2psnr = lambda x : -10. 
* torch.log(x) / torch.log(torch.Tensor([10.])) 16 | 17 | 18 | # Model 19 | class NeRF(nn.Module): 20 | def __init__(self, D=4, W=256, input_ch=3, output_ch=4): 21 | """ 22 | """ 23 | super(NeRF, self).__init__() 24 | self.D = D 25 | self.W = W 26 | 27 | self.tcnn = tcnn.Network( 28 | n_input_dims=input_ch, 29 | n_output_dims=output_ch, 30 | network_config={ 31 | "otype": "CutlassMLP", 32 | "activation": "ReLU", 33 | "output_activation": "None", 34 | "n_neurons": W, 35 | "n_hidden_layers": D, 36 | }, 37 | ) 38 | 39 | def forward(self, x): 40 | outputs = self.tcnn(x) 41 | return outputs 42 | 43 | 44 | 45 | 46 | def config_parser(): 47 | 48 | import configargparse 49 | parser = configargparse.ArgumentParser() 50 | 51 | 52 | # training options 53 | parser.add_argument("--near", type=float, default=0., 54 | help='near distance') 55 | parser.add_argument("--far", type=float, default=10., 56 | help='far distance') 57 | parser.add_argument("--camera_hfov", type=float, default= 90., #79., # 58 | help='camera HFOV angle') 59 | parser.add_argument("--camera_vfov", type=float, default= 90., # 79., # 60 | help='camera VFOV angle') 61 | parser.add_argument("--pointcloud_search_radius", type=float, default=0.1, 62 | help='pointcloud_search_radius') 63 | parser.add_argument("--featurecloud_search_radius", type=float, default=1., 64 | help='featurecloud_search_radius') 65 | parser.add_argument("--pointcloud_search_num", type=int, default=16, 66 | help='pointcloud_search_num') 67 | parser.add_argument("--featurecloud_search_num", type=int, default=4, 68 | help='featurecloud_search_num') 69 | parser.add_argument("--featuremap_scale", type=int, default=8, 70 | help='featuremap_scale') 71 | parser.add_argument("--chunk", type=int, default=1024, 72 | help='number of rays processed in parallel, decrease if running out of memory') 73 | parser.add_argument("--feature_loss_weight", type=float, default=0.01, 74 | help='weight of the language embedded feature loss') 75 | 76 | 77 | parser.add_argument("--rgba_net_layers", type=int, default=8, 78 | help='layers in rgb network') 79 | parser.add_argument("--rgba_net_width", type=int, default=512, 80 | help='channels per layer in rgb net') 81 | parser.add_argument("--clip_net_layers", type=int, default=8, 82 | help='layers in clip network') 83 | parser.add_argument("--clip_net_width", type=int, default=512, 84 | help='channels per layer in clip net') 85 | 86 | parser.add_argument("--N_rand", type=int, default=14*14, 87 | help='batch size (number of random rays per gradient step)') 88 | 89 | 90 | # rendering options 91 | parser.add_argument("--N_samples", type=int, default=256, 92 | help='number of coarse samples per ray') 93 | parser.add_argument("--N_importance", type=int, default=8, 94 | help='number of fine samples per ray') 95 | 96 | 97 | ## blender flags 98 | parser.add_argument("--white_bkgd", action='store_true', 99 | help='set to render synthetic data on a white bkgd (always use for dvoxels)') 100 | 101 | 102 | 103 | return parser 104 | 105 | 106 | 107 | def create_nerf(): 108 | """Instantiate NeRF's MLP model. 
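Returns a 3-tuple (args, rgba_mlp, clip_mlp): the parsed NeRF options plus two tiny-cuda-nn MLPs built below with width = 512, an RGBA head (input_ch = width*2, output_ch = 4) and a CLIP-feature head (input_ch = width, output_ch = width + 1).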
109 | """ 110 | parser = config_parser() 111 | args, unknown = parser.parse_known_args() #parser.parse_args() 112 | np.random.seed(0) 113 | torch.manual_seed(0) 114 | torch.cuda.manual_seed(0) 115 | 116 | width = 512 117 | scale = width ** -0.5 118 | 119 | rgba_mlp = NeRF(D=args.rgba_net_layers, W=args.rgba_net_width, 120 | input_ch=width*2, output_ch=4) # RGBA 121 | 122 | clip_mlp = NeRF(D=args.clip_net_layers, W=args.clip_net_width, 123 | input_ch=width, output_ch=width+1) # CLIP+Alpha 124 | 125 | return args, rgba_mlp, clip_mlp 126 | 127 | 128 | def raw2feature(raw, z_vals): 129 | """Transforms model's predictions to semantically meaningful values. 130 | Args: 131 | raw: [num_rays, num_samples along ray, 4]. Prediction from model. 132 | z_vals: [num_rays, num_samples along ray]. Integration time. 133 | rays_d: [num_rays, 3]. Direction of each ray. 134 | Returns: 135 | feature_map: [num_rays, 512]. Estimated semantic feature of a ray. 136 | disp_map: [num_rays]. Disparity map. Inverse of depth map. 137 | acc_map: [num_rays]. Sum of weights along each ray. 138 | weights: [num_rays, num_samples]. Weights assigned to each sampled color. 139 | depth_map: [num_rays]. Estimated distance to object. 140 | """ 141 | raw2alpha = lambda raw, dists, act_fn=F.relu: 1.-torch.exp(-act_fn(raw)*dists) 142 | 143 | dists = z_vals[...,1:] - z_vals[...,:-1] 144 | dists = torch.cat([dists, torch.Tensor([1e10]).expand(dists[...,:1].shape).to(dists.device)], -1) # [N_rays, N_samples] 145 | 146 | #dists = dists * torch.norm(rays_d[...,None,:], dim=-1) 147 | #rgb = torch.sigmoid(raw[...,:3]) # [N_rays, N_samples, 3] 148 | feature = raw[...,:-1] 149 | 150 | alpha = raw2alpha(raw[...,-1], dists) # [N_rays, N_samples] 151 | 152 | weights = alpha * torch.cumprod(torch.cat([torch.ones((alpha.shape[0], 1)).to(dists.device), 1.-alpha + 1e-10], -1), -1)[:, :-1] 153 | feature_map = torch.sum(weights[...,None] * feature, -2) # [N_rays, 3] 154 | feature_map = feature_map / torch.linalg.norm(feature_map, dim=-1, keepdim=True) 155 | 156 | depth_map = torch.sum(weights * z_vals, -1) 157 | disp_map = 1./torch.max(1e-10 * torch.ones_like(depth_map), depth_map / torch.sum(weights, -1)) 158 | acc_map = torch.sum(weights, -1) 159 | 160 | return feature_map, disp_map, acc_map, weights, depth_map 161 | 162 | 163 | 164 | def get_rays(args, H, W): 165 | rel_y = np.expand_dims(np.linspace(args.near, args.far, args.N_samples),axis=0).repeat(H*W,axis=0) 166 | hfov_angle = np.deg2rad(args.camera_hfov) 167 | vfov_angle = np.deg2rad(args.camera_vfov) 168 | half_W = W//2 169 | half_H = H//2 170 | tan_xy = np.array(([[i/half_W+1/W] for i in range(-half_W,half_W)])*W,np.float32) * math.tan(hfov_angle/2.) 
171 | rel_x = rel_y * tan_xy 172 | rel_z = rel_y * (np.array([[i/half_H-1/H for i in range(half_H,-half_H,-1)]]*W,np.float32).T.reshape((-1,1)) * math.tan(vfov_angle/2.)) 173 | return (rel_x,rel_y,rel_z) 174 | 175 | 176 | def RGB_to_BGR(cvimg): 177 | pilimg = cvimg.copy() 178 | pilimg[:, :, 0] = cvimg[:, :, 2] 179 | pilimg[:, :, 2] = cvimg[:, :, 0] 180 | return pilimg 181 | 182 | 183 | def run_nerf_feature(args, model, scene_memory, position, direction, H=14, W=14): 184 | 185 | 186 | camera_x, camera_y, camera_z = position 187 | heading_angle = - direction 188 | scene_fts, patch_directions, patch_scales, fcd, fcd_tree, occupancy_pcd_tree = scene_memory 189 | patch_directions = torch.tensor(patch_directions + heading_angle, dtype=torch.float32).to("cuda") 190 | patch_directions = torch.cat((torch.sin(patch_directions).unsqueeze(-1), torch.cos(patch_directions).unsqueeze(-1)), dim=-1) 191 | patch_scales = torch.tensor(patch_scales, dtype=torch.float32).to("cuda").unsqueeze(-1) 192 | 193 | fcd_points = fcd.to("cuda") 194 | 195 | rel_x, rel_y, rel_z = get_rays(args, H, W) 196 | 197 | ray_x = rel_x * math.cos(heading_angle) + rel_y * math.sin(heading_angle) + camera_x 198 | ray_y = -rel_y * math.cos(heading_angle) + rel_x * math.sin(heading_angle) + camera_y 199 | ray_z = rel_z + camera_z 200 | ray_z_vals = rel_y 201 | 202 | 203 | ray_xyz = torch.tensor(np.concatenate((np.expand_dims(ray_x,-1),np.expand_dims(ray_y,-1),np.expand_dims(ray_z,-1)),axis=-1),dtype=torch.float32).to('cuda') 204 | 205 | occupancy_unit_length = 1**2 206 | with torch.no_grad(): 207 | occupancy_query = ray_xyz.view(-1,3) 208 | searched_occupancy_dists, searched_occupancy_inds = occupancy_pcd_tree.query(occupancy_query, nr_nns_searches=1) #Note that the cupy_kdtree distances are squared 209 | occupancy_map = (searched_occupancy_dists < occupancy_unit_length).view(-1,) 210 | occupancy_ray_xyz = ray_xyz.view(-1,3)[occupancy_map] 211 | 212 | occupancy_ray_k_neighbor_dists, occupancy_ray_k_neighbor_inds = fcd_tree.query(occupancy_ray_xyz, nr_nns_searches=args.featurecloud_search_num) 213 | 214 | searched_ray_k_neighbor_dists = torch.full((ray_xyz.shape[0]*ray_xyz.shape[1],args.featurecloud_search_num),args.featurecloud_search_radius,dtype=torch.float32).to('cuda') 215 | searched_ray_k_neighbor_dists[occupancy_map] = occupancy_ray_k_neighbor_dists 216 | 217 | searched_ray_k_neighbor_inds = torch.full((ray_xyz.shape[0]*ray_xyz.shape[1],args.featurecloud_search_num),-1,dtype=torch.int64).to('cuda') 218 | searched_ray_k_neighbor_inds[occupancy_map] = occupancy_ray_k_neighbor_inds 219 | 220 | searched_ray_k_neighbor_dists = torch.sqrt(searched_ray_k_neighbor_dists) #Note that the cupy_kdtree distances are squared 221 | searched_ray_k_neighbor_inds[searched_ray_k_neighbor_dists >= args.featurecloud_search_radius] = -1 222 | searched_ray_k_neighbor_dists[searched_ray_k_neighbor_dists >= args.featurecloud_search_radius] = args.featurecloud_search_radius 223 | 224 | 225 | searched_ray_k_neighbor_inds = searched_ray_k_neighbor_inds.view(ray_xyz.shape[0],ray_xyz.shape[1],args.featurecloud_search_num) 226 | searched_ray_k_neighbor_dists = searched_ray_k_neighbor_dists.view(ray_xyz.shape[0],ray_xyz.shape[1],args.featurecloud_search_num) 227 | 228 | sample_ray_xyz = torch.zeros((ray_xyz.shape[0],args.N_importance,3),dtype=torch.float32).to('cuda') 229 | sample_ray_z_vals = np.zeros((ray_xyz.shape[0],args.N_importance)) 230 | 231 | for i in range(ray_xyz.shape[0]): 232 | idx = searched_ray_k_neighbor_inds[i] 233 | tmp_distance = 
searched_ray_k_neighbor_dists[i].sum(-1) 234 | tmp_density = (1/tmp_distance).cpu().numpy().tolist() 235 | 236 | peaks,_ = scipy.signal.find_peaks(tmp_density,distance=1) 237 | topk = heapq.nlargest(args.N_importance, range(len(tmp_density)), tmp_density.__getitem__) 238 | k = max(args.N_importance//2, args.N_importance-len(peaks)) 239 | topk_peaks = topk[:k] 240 | topk_peaks.extend(peaks[:args.N_importance-k]) 241 | topk_peaks.sort() 242 | inds = np.array(topk_peaks,dtype=np.int64) 243 | sample_ray_xyz[i] = ray_xyz[i][torch.tensor(inds).to(ray_xyz.device)] 244 | sample_ray_z_vals[i] = ray_z_vals[i][inds] 245 | 246 | 247 | with torch.no_grad(): 248 | sample_feature_k_neighbor_dists, sample_feature_k_neighbor_inds = fcd_tree.query(sample_ray_xyz.view(-1,3), nr_nns_searches=args.featurecloud_search_num) 249 | 250 | sample_feature_k_neighbor_dists = torch.sqrt(sample_feature_k_neighbor_dists) #Note that the cupy_kdtree distances are squared 251 | sample_feature_k_neighbor_inds[sample_feature_k_neighbor_dists >= args.featurecloud_search_radius] = -1 252 | sample_feature_k_neighbor_dists[sample_feature_k_neighbor_dists >= args.featurecloud_search_radius] = args.featurecloud_search_radius 253 | sample_feature_k_neighbor_inds = sample_feature_k_neighbor_inds.view(sample_ray_xyz.shape[0],sample_ray_xyz.shape[1],args.featurecloud_search_num) 254 | sample_feature_k_neighbor_dists = sample_feature_k_neighbor_dists.view(sample_ray_xyz.shape[0],sample_ray_xyz.shape[1],args.featurecloud_search_num) 255 | 256 | 257 | sample_ft_neighbor_xyzds = torch.zeros((sample_ray_xyz.shape[0],sample_ray_xyz.shape[1],args.featurecloud_search_num,6),dtype=torch.float32).to('cuda') 258 | 259 | idx = sample_feature_k_neighbor_inds 260 | sample_ft_neighbor_xyzds[...,:3] = fcd_points[idx] - sample_ray_xyz.unsqueeze(-2) 261 | 262 | sample_ft_neighbor_x = sample_ft_neighbor_xyzds[...,0] 263 | sample_ft_neighbor_y = sample_ft_neighbor_xyzds[...,1] 264 | 265 | # Get the relative angle to the NeRF camera, so the rotation angle is - heading_angle 266 | sample_ft_neighbor_xyzds[...,0] = sample_ft_neighbor_x * math.cos(-heading_angle) + sample_ft_neighbor_y * math.sin(-heading_angle) 267 | sample_ft_neighbor_xyzds[...,1] = -sample_ft_neighbor_y * math.cos(-heading_angle) + sample_ft_neighbor_x * math.sin(-heading_angle) 268 | 269 | sample_ft_neighbor_xyzds[...,:3][idx==-1] = args.far 270 | sample_ft_neighbor_xyzds[...,3:5] = patch_directions[idx] 271 | sample_ft_neighbor_xyzds[...,3:5][idx==-1] = 0 272 | sample_ft_neighbor_xyzds[...,5:] = patch_scales[idx] 273 | sample_ft_neighbor_xyzds[...,5:][idx==-1] = 0 274 | 275 | sample_ft_neighbor_embedding = scene_fts[idx.cpu().numpy()] 276 | sample_ft_neighbor_embedding = torch.tensor(sample_ft_neighbor_embedding,dtype=torch.float32).to('cuda') 277 | sample_ft_neighbor_embedding[idx==-1] = 0 278 | 279 | 280 | sample_ft_neighbor_xyzds = model.fcd_position_embedding(sample_ft_neighbor_xyzds) 281 | sample_ft = model.fcd_aggregation( (sample_ft_neighbor_embedding + sample_ft_neighbor_xyzds).view(-1,args.N_importance, args.featurecloud_search_num*512) ) 282 | 283 | sample_input = sample_ft 284 | 285 | sample_feature = model.clip_mlp(sample_input.view(-1,sample_input.shape[-1])).view(-1,args.N_importance,512+1) 286 | 287 | sample_ray_z_vals = torch.tensor(sample_ray_z_vals,dtype=torch.float32).to('cuda') 288 | feature_map, disp_map, acc_map, weights, depth_map = raw2feature(sample_feature, sample_ray_z_vals.view(-1,args.N_importance)) 289 | 290 | transformer_input = 
torch.cat((model.class_embedding,feature_map),dim=0) 291 | transformer_input = transformer_input + model.positional_embedding 292 | 293 | predicted_fts = model.nerf_view_encoder(transformer_input.unsqueeze(0)).squeeze(0) 294 | 295 | return predicted_fts 296 | 297 | 298 | 299 | -------------------------------------------------------------------------------- /vlnce_baselines/models/graph_utils.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import numpy as np 3 | from copy import deepcopy 4 | import networkx as nx 5 | import matplotlib.pyplot as plt 6 | from habitat.tasks.utils import cartesian_to_polar 7 | from habitat.utils.geometry_utils import quaternion_rotate_vector, quaternion_from_coeff 8 | import math 9 | 10 | MAX_DIST = 30 11 | MAX_STEP = 10 12 | # NOISE = 0.5 13 | 14 | def calc_position_distance(a, b): 15 | # a, b: (x, y, z) 16 | dx = b[0] - a[0] 17 | dy = b[1] - a[1] 18 | dz = b[2] - a[2] 19 | dist = np.sqrt(dx**2 + dy**2 + dz**2) 20 | return dist 21 | 22 | def calculate_vp_rel_pos_fts(a, b, base_heading=0, base_elevation=0, to_clock=False): 23 | # a, b: (x, y, z) 24 | dx = b[0] - a[0] 25 | dy = b[1] - a[1] 26 | dz = b[2] - a[2] 27 | # xy_dist = max(np.sqrt(dx**2 + dy**2), 1e-8) 28 | xz_dist = max(np.sqrt(dx**2 + dz**2), 1e-8) 29 | xyz_dist = max(np.sqrt(dx**2 + dy**2 + dz**2), 1e-8) 30 | 31 | # the simulator's api is weired (x-y axis is transposed) 32 | # heading = np.arcsin(dx/xy_dist) # [-pi/2, pi/2] 33 | heading = np.arcsin(-dx / xz_dist) # [-pi/2, pi/2] 34 | # if b[1] < a[1]: 35 | # heading = np.pi - heading 36 | if b[2] > a[2]: 37 | heading = np.pi - heading 38 | heading -= base_heading 39 | if to_clock: 40 | heading = 2 * np.pi - heading 41 | 42 | elevation = np.arcsin(dz / xyz_dist) # [-pi/2, pi/2] 43 | elevation -= base_elevation 44 | 45 | return heading, elevation, xyz_dist 46 | 47 | def get_angle_fts(headings, elevations, angle_feat_size): 48 | ang_fts = [np.sin(headings), np.cos(headings), np.sin(elevations), np.cos(elevations)] 49 | ang_fts = np.vstack(ang_fts).transpose().astype(np.float32) 50 | num_repeats = angle_feat_size // 4 51 | if num_repeats > 1: 52 | ang_fts = np.concatenate([ang_fts] * num_repeats, 1) 53 | return ang_fts 54 | 55 | def heading_from_quaternion(quat: np.array): 56 | # https://github.com/facebookresearch/habitat-lab/blob/v0.1.7/habitat/tasks/nav/nav.py#L356 57 | quat = quaternion_from_coeff(quat) 58 | heading_vector = quaternion_rotate_vector(quat.inverse(), np.array([0, 0, -1])) 59 | phi = cartesian_to_polar(-heading_vector[2], heading_vector[0])[1] 60 | return phi % (2 * np.pi) 61 | 62 | def estimate_cand_pos(pos, ori, ang, dis): 63 | cand_num = len(ang) 64 | cand_pos = np.zeros([cand_num, 3]) 65 | 66 | ang = np.array(ang) 67 | dis = np.array(dis) 68 | ang = (heading_from_quaternion(ori) + ang) % (2 * np.pi) 69 | cand_pos[:, 0] = pos[0] - dis * np.sin(ang) # x 70 | cand_pos[:, 1] = pos[1] # y 71 | cand_pos[:, 2] = pos[2] - dis * np.cos(ang) # z 72 | return cand_pos 73 | 74 | 75 | class FloydGraph(object): 76 | def __init__(self): 77 | self._dis = defaultdict(lambda :defaultdict(lambda: 95959595)) 78 | self._point = defaultdict(lambda :defaultdict(lambda: "")) 79 | self._visited = set() 80 | 81 | def distance(self, x, y): 82 | if x == y: 83 | return 0 84 | else: 85 | return self._dis[x][y] 86 | 87 | def add_edge(self, x, y, dis): 88 | if dis < self._dis[x][y]: 89 | self._dis[x][y] = dis 90 | self._dis[y][x] = dis 91 | self._point[x][y] = "" 92 | self._point[y][x] 
= "" 93 | 94 | def update(self, k): 95 | for x in self._dis: 96 | for y in self._dis: 97 | if x != y and x !=k and y != k: 98 | t_dis = self._dis[x][y] + self._dis[y][k] 99 | if t_dis < self._dis[x][k]: 100 | self._dis[x][k] = t_dis 101 | self._dis[k][x] = t_dis 102 | self._point[x][k] = y 103 | self._point[k][x] = y 104 | 105 | for x in self._dis: 106 | for y in self._dis: 107 | if x != y: 108 | t_dis = self._dis[x][k] + self._dis[k][y] 109 | if t_dis < self._dis[x][y]: 110 | self._dis[x][y] = t_dis 111 | self._dis[y][x] = t_dis 112 | self._point[x][y] = k 113 | self._point[y][x] = k 114 | 115 | self._visited.add(k) 116 | 117 | def visited(self, k): 118 | return (k in self._visited) 119 | 120 | def path(self, x, y): 121 | """ 122 | :param x: start 123 | :param y: end 124 | :return: the path from x to y [v1, v2, ..., v_n, y] 125 | """ 126 | if x == y: 127 | return [] 128 | if self._point[x][y] == "": # Direct edge 129 | return [y] 130 | else: 131 | k = self._point[x][y] 132 | # print(x, y, k) 133 | # for x1 in (x, k, y): 134 | # for x2 in (x, k, y): 135 | # print(x1, x2, "%.4f" % self._dis[x1][x2]) 136 | return self.path(x, k) + self.path(k, y) 137 | 138 | 139 | class GraphMap(object): 140 | def __init__(self, has_real_pos, loc_noise, merge_ghost, ghost_aug): 141 | 142 | self.graph_nx = nx.Graph() 143 | 144 | self.node_pos = {} # viewpoint to position (x, y, z) 145 | self.node_embeds = {} # viewpoint to pano feature 146 | self.node_stepId = {} 147 | 148 | self.ghost_cnt = 0 # id to create ghost 149 | self.ghost_pos = {} 150 | self.ghost_mean_pos = {} 151 | self.ghost_embeds = {} # viewpoint to single_view feature 152 | self.ghost_fronts = {} # viewpoint to front_vp id 153 | self.ghost_real_pos = {} # for training 154 | self.has_real_pos = has_real_pos 155 | self.merge_ghost = merge_ghost 156 | self.ghost_aug = ghost_aug # 0 ~ 1, noise level 157 | self.loc_noise = loc_noise 158 | 159 | self.shortest_path = None 160 | self.shortest_dist = None 161 | 162 | self.node_stop_scores = {} # viewpoint to stop_score 163 | 164 | def _localize(self, qpos, kpos_dict, ignore_height=False): 165 | min_dis = 10000 166 | min_vp = None 167 | for kvp, kpos in kpos_dict.items(): 168 | if ignore_height: 169 | dis = ((qpos[[0,2]] - kpos[[0,2]])**2).sum()**0.5 170 | else: 171 | dis = ((qpos - kpos)**2).sum()**0.5 172 | if dis < min_dis: 173 | min_dis = dis 174 | min_vp = kvp 175 | min_vp = None if min_dis > self.loc_noise else min_vp 176 | return min_vp 177 | 178 | def identify_node(self, cur_pos, cur_ori, cand_ang, cand_dis): 179 | # assume no repeated node 180 | # since action is restricted to ghosts 181 | cur_vp = str(len(self.node_pos)) 182 | cand_vp = [f'{cur_vp}_{str(i)}' for i in range(len(cand_ang))] 183 | cand_pos = [p for p in estimate_cand_pos(cur_pos, cur_ori, cand_ang, cand_dis)] 184 | return cur_vp, cand_vp, cand_pos 185 | 186 | def delete_ghost(self, vp): 187 | self.ghost_pos.pop(vp) 188 | self.ghost_mean_pos.pop(vp) 189 | self.ghost_embeds.pop(vp) 190 | self.ghost_fronts.pop(vp) 191 | if self.has_real_pos: 192 | self.ghost_real_pos.pop(vp) 193 | 194 | def update_graph(self, prev_vp, step_id, 195 | cur_vp, cur_pos, cur_embeds, 196 | cand_vp, cand_pos, cand_embeds, 197 | cand_real_pos): 198 | # 1. connect prev_vp 199 | self.graph_nx.add_node(cur_vp) 200 | if prev_vp is not None: 201 | prev_pos = self.node_pos[prev_vp] 202 | dis = calc_position_distance(prev_pos, cur_pos) 203 | self.graph_nx.add_edge(prev_vp, cur_vp, weight=dis) 204 | 205 | # 2. 
update node & ghost info 206 | self.node_pos[cur_vp] = cur_pos 207 | self.node_embeds[cur_vp] = cur_embeds 208 | self.node_stepId[cur_vp] = step_id 209 | 210 | gvp_list = [] 211 | 212 | for i, (cvp, cpos, cembeds) in enumerate(zip(cand_vp, cand_pos, cand_embeds)): 213 | localized_nvp = self._localize(cpos, self.node_pos) 214 | # cand overlap with node, connect cur_vp with localized_nvp 215 | if localized_nvp is not None : 216 | dis = calc_position_distance(cur_pos, self.node_pos[localized_nvp]) 217 | self.graph_nx.add_edge(cur_vp, localized_nvp, weight=dis) 218 | 219 | gvp_list.append(localized_nvp) 220 | # cand not overlap with node, create/update ghost 221 | else: 222 | if self.merge_ghost: 223 | localized_gvp = self._localize(cpos, self.ghost_mean_pos) 224 | # create ghost 225 | if localized_gvp is None: 226 | gvp = f'g{str(self.ghost_cnt)}' 227 | self.ghost_cnt += 1 228 | self.ghost_pos[gvp] = [cpos] 229 | self.ghost_mean_pos[gvp] = cpos 230 | self.ghost_embeds[gvp] = [cembeds, 1] 231 | self.ghost_fronts[gvp] = [cur_vp] 232 | if self.has_real_pos: 233 | self.ghost_real_pos[gvp] = [cand_real_pos[i]] 234 | 235 | # update ghost 236 | else: 237 | gvp = localized_gvp 238 | self.ghost_pos[gvp].append(cpos) 239 | self.ghost_mean_pos[gvp] = np.mean(self.ghost_pos[gvp], axis=0) 240 | self.ghost_embeds[gvp][0] = self.ghost_embeds[gvp][0] + cembeds 241 | self.ghost_embeds[gvp][1] += 1 242 | self.ghost_fronts[gvp].append(cur_vp) 243 | if self.has_real_pos: 244 | self.ghost_real_pos[gvp].append(cand_real_pos[i]) 245 | 246 | else: 247 | gvp = f'g{str(self.ghost_cnt)}' 248 | self.ghost_cnt += 1 249 | self.ghost_pos[gvp] = [cpos] 250 | self.ghost_mean_pos[gvp] = cpos 251 | self.ghost_embeds[gvp] = [cembeds, 1] 252 | self.ghost_fronts[gvp] = [cur_vp] 253 | if self.has_real_pos: 254 | self.ghost_real_pos[gvp] = [cand_real_pos[i]] 255 | 256 | gvp_list.append(gvp) 257 | 258 | self.ghost_aug_pos = deepcopy(self.ghost_mean_pos) 259 | if self.ghost_aug != 0: 260 | for gvp, gpos in self.ghost_aug_pos.items(): 261 | gpos_noise = np.random.normal(loc=(0,0,0), scale=(self.ghost_aug,0,self.ghost_aug), size=(3,)) 262 | gpos_noise[gpos_noise < -self.ghost_aug] = -self.ghost_aug 263 | gpos_noise[gpos_noise > self.ghost_aug] = self.ghost_aug 264 | self.ghost_aug_pos[gvp] = gpos + gpos_noise 265 | 266 | self.shortest_path = dict(nx.all_pairs_dijkstra_path(self.graph_nx)) 267 | self.shortest_dist = dict(nx.all_pairs_dijkstra_path_length(self.graph_nx)) 268 | return gvp_list 269 | 270 | 271 | def update_graph_no_overlap(self, prev_vp, step_id, 272 | cur_vp, cur_pos, cur_embeds, 273 | cand_vp, cand_pos, cand_embeds, cand_angles, 274 | cand_real_pos): 275 | # 1. connect prev_vp 276 | self.graph_nx.add_node(cur_vp) 277 | if prev_vp is not None: 278 | prev_pos = self.node_pos[prev_vp] 279 | dis = calc_position_distance(prev_pos, cur_pos) 280 | self.graph_nx.add_edge(prev_vp, cur_vp, weight=dis) 281 | 282 | # 2. 
update node & ghost info 283 | self.node_pos[cur_vp] = cur_pos 284 | self.node_embeds[cur_vp] = cur_embeds 285 | self.node_stepId[cur_vp] = step_id 286 | 287 | gvp_list = [] 288 | 289 | for i, (cvp, cpos, cembeds, cangles) in enumerate(zip(cand_vp, cand_pos, cand_embeds, cand_angles)): 290 | 291 | if i != 0 and i != len(cand_angles)-1 and 1/2 * math.pi < cangles and cangles < 3/2 * math.pi: 292 | continue 293 | 294 | gvp = f'g{str(self.ghost_cnt)}' 295 | self.ghost_cnt += 1 296 | self.ghost_pos[gvp] = [cpos] 297 | self.ghost_mean_pos[gvp] = cpos 298 | self.ghost_embeds[gvp] = [cembeds, 1] 299 | self.ghost_fronts[gvp] = [cur_vp] 300 | if self.has_real_pos: 301 | self.ghost_real_pos[gvp] = [cand_real_pos[i]] 302 | 303 | gvp_list.append(gvp) 304 | 305 | self.ghost_aug_pos = deepcopy(self.ghost_mean_pos) 306 | if self.ghost_aug != 0: 307 | for gvp, gpos in self.ghost_aug_pos.items(): 308 | gpos_noise = np.random.normal(loc=(0,0,0), scale=(self.ghost_aug,0,self.ghost_aug), size=(3,)) 309 | gpos_noise[gpos_noise < -self.ghost_aug] = -self.ghost_aug 310 | gpos_noise[gpos_noise > self.ghost_aug] = self.ghost_aug 311 | self.ghost_aug_pos[gvp] = gpos + gpos_noise 312 | 313 | self.shortest_path = dict(nx.all_pairs_dijkstra_path(self.graph_nx)) 314 | self.shortest_dist = dict(nx.all_pairs_dijkstra_path_length(self.graph_nx)) 315 | return gvp_list 316 | 317 | def front_to_ghost_dist(self, ghost_vp): 318 | # assume the nearest front 319 | min_dis = 10000 320 | min_front = None 321 | for front_vp in self.ghost_fronts[ghost_vp]: 322 | dis = calc_position_distance( 323 | self.node_pos[front_vp], self.ghost_aug_pos[ghost_vp] 324 | ) 325 | if dis < min_dis: 326 | min_dis = dis 327 | min_front = front_vp 328 | return min_dis, min_front 329 | 330 | def get_node_embeds(self, vp): 331 | if not vp.startswith('g'): 332 | return self.node_embeds[vp] 333 | else: 334 | return self.ghost_embeds[vp][0] / self.ghost_embeds[vp][1] 335 | 336 | def get_pos_fts(self, cur_vp, cur_pos, cur_ori, gmap_vp_ids): 337 | # dim=7 (sin(heading), cos(heading), sin(elevation), cos(elevation), 338 | # line_dist, shortest_dist, shortest_step) 339 | rel_angles, rel_dists = [], [] 340 | for vp in gmap_vp_ids: 341 | if vp is None: 342 | rel_angles.append([0, 0]) 343 | rel_dists.append([0, 0, 0]) 344 | # for ghost 345 | elif vp.startswith('g'): 346 | base_heading = heading_from_quaternion(cur_ori) 347 | base_elevation = 0 348 | vp_pos = self.ghost_aug_pos[vp] 349 | rel_heading, rel_elevation, rel_dist = calculate_vp_rel_pos_fts( 350 | cur_pos, vp_pos, base_heading, base_elevation, to_clock=True, 351 | ) 352 | rel_angles.append([rel_heading, rel_elevation]) 353 | front_dis, front_vp = self.front_to_ghost_dist(vp) 354 | shortest_dist = self.shortest_dist[cur_vp][front_vp] + front_dis 355 | shortest_step = len(self.shortest_path[cur_vp][front_vp]) + 1 356 | rel_dists.append( 357 | [rel_dist / MAX_DIST, 358 | shortest_dist / MAX_DIST, 359 | shortest_step / MAX_STEP] 360 | ) 361 | # for node 362 | else: 363 | base_heading = heading_from_quaternion(cur_ori) 364 | base_elevation = 0 365 | vp_pos = self.node_pos[vp] 366 | rel_heading, rel_elevation, rel_dist = calculate_vp_rel_pos_fts( 367 | cur_pos, vp_pos, base_heading, base_elevation, to_clock=True, 368 | ) 369 | rel_angles.append([rel_heading, rel_elevation]) 370 | shortest_dist = self.shortest_dist[cur_vp][vp] 371 | shortest_step = len(self.shortest_path[cur_vp][vp]) 372 | rel_dists.append( 373 | [rel_dist / MAX_DIST, 374 | shortest_dist / MAX_DIST, 375 | shortest_step / MAX_STEP] 376 | ) 377 | 
rel_angles = np.array(rel_angles).astype(np.float32) 378 | rel_dists = np.array(rel_dists).astype(np.float32) 379 | rel_ang_fts = get_angle_fts(rel_angles[:, 0], rel_angles[:, 1], angle_feat_size=4) 380 | return np.concatenate([rel_ang_fts, rel_dists], 1) --------------------------------------------------------------------------------
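Usage note (illustrative sketch, not part of the repository sources): the FloydGraph helper in vlnce_baselines/models/graph_utils.py maintains all-pairs shortest paths incrementally via update(k). A minimal example, assuming the project environment (habitat, networkx) is installed so the module imports cleanly:

from vlnce_baselines.models.graph_utils import FloydGraph

g = FloydGraph()
# Register symmetric edge costs between three viewpoints.
g.add_edge("A", "B", 1.0)
g.add_edge("B", "C", 2.0)
g.add_edge("A", "C", 5.0)
# Relax through each known node so distances and backpointers are up to date.
for node in ("A", "B", "C"):
    g.update(node)

print(g.distance("A", "C"))  # 3.0 -- the detour through "B" beats the direct 5.0 edge
print(g.path("A", "C"))      # ['B', 'C'] -- intermediate hop(s) followed by the goal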