├── utils_p ├── __init__.py ├── __pycache__ │ ├── losses.cpython-37.pyc │ ├── memory.cpython-37.pyc │ ├── prompt.cpython-37.pyc │ ├── __init__.cpython-37.pyc │ ├── convert.cpython-37.pyc │ └── metrics.cpython-37.pyc ├── prompt.py ├── losses.py ├── convert.py └── memory.py ├── vlnce_baselines ├── config │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ └── default.cpython-37.pyc │ ├── r2r_configs │ │ ├── test_set_inference.yaml │ │ ├── cma_sf.yaml │ │ ├── seq2seq_da.yaml │ │ ├── seq2seq_aug.yaml │ │ ├── seq2seq_pm.yaml │ │ ├── cma_aug.yaml │ │ ├── seq2seq_pm_aug.yaml │ │ ├── cma_da.yaml │ │ ├── cma_pm.yaml │ │ ├── cma_ss.yaml │ │ ├── cma_pm_da.yaml │ │ ├── cma_pm_aug.yaml │ │ ├── seq2seq_aug_tune.yaml │ │ ├── seq2seq.yaml │ │ ├── cma_aug_tune.yaml │ │ ├── cma_pm_aug_tune.yaml │ │ ├── cma_da_aug_tune.yaml │ │ ├── cma.yaml │ │ ├── seq2seq_pm_da_aug_tune.yaml │ │ └── cma_pm_da_aug_tune.yaml │ ├── nonlearning.yaml │ └── default.py ├── models │ ├── __init__.py │ ├── __pycache__ │ │ ├── policy.cpython-37.pyc │ │ ├── utils.cpython-37.pyc │ │ ├── __init__.cpython-37.pyc │ │ ├── graph_utils.cpython-37.pyc │ │ └── Policy_ViewSelection_ETP.cpython-37.pyc │ ├── etp │ │ ├── __pycache__ │ │ │ ├── nerf.cpython-37.pyc │ │ │ ├── vilmodel_cmt.cpython-37.pyc │ │ │ └── vlnbert_init.cpython-37.pyc │ │ ├── vlnbert_init.py │ │ └── nerf.py │ ├── encoders │ │ ├── __pycache__ │ │ │ ├── clip.cpython-37.pyc │ │ │ ├── resnet_encoders.cpython-37.pyc │ │ │ └── instruction_encoder.cpython-37.pyc │ │ ├── instruction_encoder.py │ │ ├── clip.py │ │ └── resnet_encoders.py │ ├── policy.py │ └── graph_utils.py ├── bert-base-uncased │ └── instruction ├── common │ ├── __pycache__ │ │ ├── ops.cpython-37.pyc │ │ ├── utils.cpython-37.pyc │ │ ├── aux_losses.cpython-37.pyc │ │ ├── env_utils.cpython-37.pyc │ │ ├── transformer.cpython-37.pyc │ │ ├── environments.cpython-37.pyc │ │ └── base_il_trainer.cpython-37.pyc │ ├── aux_losses.py │ ├── utils.py │ ├── ops.py │ ├── env_utils.py │ └── recollection_dataset.py ├── waypoint_networks │ ├── __pycache__ │ │ ├── utils.cpython-37.pyc │ │ ├── __init__.cpython-37.pyc │ │ ├── resnetUnet.cpython-37.pyc │ │ ├── viz_utils.cpython-37.pyc │ │ ├── semantic_grid.cpython-37.pyc │ │ └── img_segmentor_model.cpython-37.pyc │ ├── __init__.py │ ├── img_segmentor_model.py │ ├── resnetUnet.py │ ├── semantic_grid.py │ └── viz_utils.py ├── __init__.py └── utils.py ├── data └── instruction ├── img └── EWM.png ├── pretrained └── instruction ├── run_r2r ├── r2r_vlnce.yaml ├── iter_train.yaml └── main.bash ├── run.py ├── README.md └── environment.yaml /utils_p/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vlnce_baselines/config/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vlnce_baselines/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/instruction: -------------------------------------------------------------------------------- 1 | Please download the file 'data' and put it here. 
2 | -------------------------------------------------------------------------------- /img/EWM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/img/EWM.png -------------------------------------------------------------------------------- /pretrained/instruction: -------------------------------------------------------------------------------- 1 | Please download the file 'pretrained' and put it here. 2 | -------------------------------------------------------------------------------- /vlnce_baselines/bert-base-uncased/instruction: -------------------------------------------------------------------------------- 1 | Please download the file 'bert-base-uncased' and put it here. 2 | -------------------------------------------------------------------------------- /utils_p/__pycache__/losses.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/utils_p/__pycache__/losses.cpython-37.pyc -------------------------------------------------------------------------------- /utils_p/__pycache__/memory.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/utils_p/__pycache__/memory.cpython-37.pyc -------------------------------------------------------------------------------- /utils_p/__pycache__/prompt.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/utils_p/__pycache__/prompt.cpython-37.pyc -------------------------------------------------------------------------------- /utils_p/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/utils_p/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /utils_p/__pycache__/convert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/utils_p/__pycache__/convert.cpython-37.pyc -------------------------------------------------------------------------------- /utils_p/__pycache__/metrics.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/utils_p/__pycache__/metrics.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/common/__pycache__/ops.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/common/__pycache__/ops.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/common/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/common/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/models/__pycache__/policy.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/models/__pycache__/policy.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/models/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/models/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/config/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/config/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/config/__pycache__/default.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/config/__pycache__/default.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/models/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/models/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/models/etp/__pycache__/nerf.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/models/etp/__pycache__/nerf.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/common/__pycache__/aux_losses.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/common/__pycache__/aux_losses.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/common/__pycache__/env_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/common/__pycache__/env_utils.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/common/__pycache__/transformer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/common/__pycache__/transformer.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/models/__pycache__/graph_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/models/__pycache__/graph_utils.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/common/__pycache__/environments.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/common/__pycache__/environments.cpython-37.pyc 
-------------------------------------------------------------------------------- /vlnce_baselines/models/encoders/__pycache__/clip.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/models/encoders/__pycache__/clip.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/common/__pycache__/base_il_trainer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/common/__pycache__/base_il_trainer.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/models/etp/__pycache__/vilmodel_cmt.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/models/etp/__pycache__/vilmodel_cmt.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/models/etp/__pycache__/vlnbert_init.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/models/etp/__pycache__/vlnbert_init.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/waypoint_networks/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/waypoint_networks/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/waypoint_networks/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/waypoint_networks/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/waypoint_networks/__pycache__/resnetUnet.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/waypoint_networks/__pycache__/resnetUnet.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/waypoint_networks/__pycache__/viz_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/waypoint_networks/__pycache__/viz_utils.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/models/__pycache__/Policy_ViewSelection_ETP.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/models/__pycache__/Policy_ViewSelection_ETP.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/models/encoders/__pycache__/resnet_encoders.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/models/encoders/__pycache__/resnet_encoders.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/waypoint_networks/__pycache__/semantic_grid.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/waypoint_networks/__pycache__/semantic_grid.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/models/encoders/__pycache__/instruction_encoder.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/models/encoders/__pycache__/instruction_encoder.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/waypoint_networks/__pycache__/img_segmentor_model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Feliciaxyao/NavMorph/HEAD/vlnce_baselines/waypoint_networks/__pycache__/img_segmentor_model.cpython-37.pyc -------------------------------------------------------------------------------- /vlnce_baselines/__init__.py: -------------------------------------------------------------------------------- 1 | from vlnce_baselines import ss_trainer_ETP, dagger_trainer 2 | from vlnce_baselines.common import environments 3 | 4 | from vlnce_baselines.models import ( 5 | Policy_ViewSelection_ETP, 6 | ) 7 | -------------------------------------------------------------------------------- /vlnce_baselines/waypoint_networks/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from .resnetUnet import ResNetUNet 4 | from .img_segmentor_model import ImgSegmentor 5 | 6 | def get_img_segmentor_from_options(n_object_classes,img_segm_loss_scale): 7 | return ImgSegmentor(segmentation_model=ResNetUNet(n_channel_in=3, n_class_out=n_object_classes), 8 | loss_scale=img_segm_loss_scale) 9 | 10 | ''' 11 | Model ResNetUnet taken from: 12 | https://github.com/usuyama/pytorch-unet 13 | ''' -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/test_set_inference.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_PROCESSES: 1 5 | 6 | INFERENCE: 7 | SPLIT: test 8 | USE_CKPT_CONFIG: False 9 | SAMPLE: False 10 | CKPT_PATH: data/checkpoints/CMA_PM_DA_Aug.pth 11 | PREDICTIONS_FILE: predictions.json 12 | 13 | MODEL: 14 | policy_name: CMAPolicy 15 | 16 | INSTRUCTION_ENCODER: 17 | bidirectional: True 18 | 19 | CMA: 20 | use: True 21 | 22 | PROGRESS_MONITOR: 23 | use: True 24 | -------------------------------------------------------------------------------- /vlnce_baselines/config/nonlearning.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | EVAL: 3 | SPLIT: val_unseen 4 | # any num greater than the actual episode count evals every episode 5 | EPISODE_COUNT: 10 6 | EVAL_NONLEARNING: True 7 | NONLEARNING: 8 | # RandomAgent or HandcraftedAgent 9 | AGENT: RandomAgent 10 | 11 | INFERENCE: 12 | SPLIT: val_unseen 13 
| PREDICTIONS_FILE: predictions.json 14 | INFERENCE_NONLEARNING: True 15 | NONLEARNING: 16 | # RandomAgent or HandcraftedAgent 17 | AGENT: "RandomAgent" 18 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/cma_sf.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | SIMULATOR_GPU_IDS: [0] 3 | TORCH_GPU_ID: 0 4 | TORCH_GPU_IDS: [0] 5 | GPU_NUMBERS: 1 6 | NUM_ENVIRONMENTS: 1 7 | TENSORBOARD_DIR: data/tensorboard_dirs/cma_sf 8 | CHECKPOINT_FOLDER: data/checkpoints/cma_sf 9 | EVAL_CKPT_PATH_DIR: data/checkpoints/cma_sf 10 | 11 | EVAL: 12 | USE_CKPT_CONFIG: False 13 | SPLIT: val_unseen 14 | EPISODE_COUNT: -1 15 | 16 | IL: 17 | epochs: 50 18 | batch_size: 8 19 | schedule_ratio: 0.75 20 | decay_time: 10 21 | 22 | max_traj_len: 130 23 | 24 | MODEL: 25 | policy_name: CMAPolicyO 26 | 27 | INSTRUCTION_ENCODER: 28 | bidirectional: True 29 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/seq2seq_da.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_ENVIRONMENTS: 1 5 | TENSORBOARD_DIR: data/tensorboard_dirs/seq2seq_da 6 | CHECKPOINT_FOLDER: data/checkpoints/seq2seq_da 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/seq2seq_da 8 | 9 | EVAL: 10 | USE_CKPT_CONFIG: False 11 | SPLIT: val_unseen 12 | EPISODE_COUNT: -1 13 | 14 | IL: 15 | epochs: 4 16 | batch_size: 5 17 | 18 | DAGGER: 19 | iterations: 10 20 | update_size: 5000 21 | p: 0.75 22 | preload_lmdb_features: False 23 | lmdb_features_dir: data/trajectories_dirs/seq2seq_da/trajectories.lmdb 24 | 25 | MODEL: 26 | policy_name: Seq2SeqPolicy 27 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/seq2seq_aug.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task_aug.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_ENVIRONMENTS: 1 5 | TENSORBOARD_DIR: data/tensorboard_dirs/seq2seq_aug 6 | CHECKPOINT_FOLDER: data/checkpoints/seq2seq_aug 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/seq2seq_aug 8 | 9 | EVAL: 10 | USE_CKPT_CONFIG: False 11 | SPLIT: val_unseen 12 | EPISODE_COUNT: -1 13 | 14 | IL: 15 | epochs: 15 16 | batch_size: 5 17 | 18 | DAGGER: 19 | iterations: 1 20 | update_size: 157232 21 | p: 1.0 22 | preload_lmdb_features: False 23 | lmdb_features_dir: data/trajectories_dirs/seq2seq_aug/trajectories.lmdb 24 | 25 | MODEL: 26 | policy_name: Seq2SeqPolicy 27 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/seq2seq_pm.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_ENVIRONMENTS: 1 5 | TENSORBOARD_DIR: data/tensorboard_dirs/seq2seq_pm 6 | CHECKPOINT_FOLDER: data/checkpoints/seq2seq_pm 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/seq2seq_pm 8 | 9 | EVAL: 10 | USE_CKPT_CONFIG: False 11 | SPLIT: val_unseen 12 | EPISODE_COUNT: -1 13 | 14 | IL: 15 | epochs: 15 16 | batch_size: 5 17 | 18 | DAGGER: 19 | iterations: 1 20 | update_size: 10819 21 | p: 1.0 22 | preload_lmdb_features: False 23 | 
lmdb_features_dir: data/trajectories_dirs/seq2seq/trajectories.lmdb 24 | 25 | MODEL: 26 | policy_name: Seq2SeqPolicy 27 | 28 | PROGRESS_MONITOR: 29 | use: True 30 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/cma_aug.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task_aug.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_ENVIRONMENTS: 1 5 | TENSORBOARD_DIR: data/tensorboard_dirs/cma_aug 6 | CHECKPOINT_FOLDER: data/checkpoints/cma_aug 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/cma_aug 8 | 9 | EVAL: 10 | USE_CKPT_CONFIG: False 11 | SPLIT: val_unseen 12 | EPISODE_COUNT: -1 13 | 14 | IL: 15 | epochs: 45 16 | batch_size: 5 17 | 18 | DAGGER: 19 | iterations: 1 20 | update_size: 157232 21 | p: 1.0 22 | preload_lmdb_features: False 23 | lmdb_features_dir: data/trajectories_dirs/cma_aug/trajectories.lmdb 24 | 25 | MODEL: 26 | policy_name: CMAPolicy 27 | 28 | INSTRUCTION_ENCODER: 29 | bidirectional: True 30 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/seq2seq_pm_aug.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task_aug.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_ENVIRONMENTS: 1 5 | TENSORBOARD_DIR: data/tensorboard_dirs/seq2seq_pm_aug 6 | CHECKPOINT_FOLDER: data/checkpoints/seq2seq_pm_aug 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/seq2seq_pm_aug 8 | 9 | EVAL: 10 | USE_CKPT_CONFIG: False 11 | SPLIT: val_unseen 12 | EPISODE_COUNT: -1 13 | 14 | IL: 15 | epochs: 15 16 | batch_size: 5 17 | 18 | DAGGER: 19 | iterations: 1 20 | update_size: 157232 21 | p: 1.0 22 | preload_lmdb_features: False 23 | lmdb_features_dir: data/trajectories_dirs/seq2seq_aug/trajectories.lmdb 24 | 25 | MODEL: 26 | policy_name: Seq2SeqPolicy 27 | 28 | PROGRESS_MONITOR: 29 | use: True 30 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/cma_da.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | SIMULATOR_GPU_IDS: [0] 3 | TORCH_GPU_ID: 0 4 | TORCH_GPU_IDS: [0] 5 | GPU_NUMBERS: 1 6 | NUM_ENVIRONMENTS: 1 7 | TENSORBOARD_DIR: data/tensorboard_dirs/cma_da 8 | CHECKPOINT_FOLDER: data/checkpoints/cma_da 9 | EVAL_CKPT_PATH_DIR: data/checkpoints/cma_da 10 | 11 | EVAL: 12 | USE_CKPT_CONFIG: False 13 | SPLIT: val_unseen 14 | EPISODE_COUNT: -1 15 | 16 | IL: 17 | epochs: 4 18 | batch_size: 5 19 | 20 | DAGGER: 21 | iterations: 10 22 | update_size: 5000 23 | p: 0.75 24 | preload_lmdb_features: False 25 | lmdb_features_dir: data/trajectories_dirs/cma_da/trajectories.lmdb 26 | 27 | MODEL: 28 | policy_name: CMAPolicy 29 | 30 | INSTRUCTION_ENCODER: 31 | bidirectional: True 32 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/cma_pm.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_ENVIRONMENTS: 1 5 | TENSORBOARD_DIR: data/tensorboard_dirs/cma_pm 6 | CHECKPOINT_FOLDER: data/checkpoints/cma_pm 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/cma_pm 8 | 9 | EVAL: 10 | USE_CKPT_CONFIG: False 11 | SPLIT: 
val_unseen 12 | EPISODE_COUNT: -1 13 | 14 | IL: 15 | epochs: 45 16 | batch_size: 5 17 | 18 | DAGGER: 19 | iterations: 1 20 | update_size: 10819 21 | p: 1.0 22 | preload_lmdb_features: False 23 | lmdb_features_dir: data/trajectories_dirs/cma/trajectories.lmdb 24 | 25 | MODEL: 26 | policy_name: CMAPolicy 27 | 28 | INSTRUCTION_ENCODER: 29 | bidirectional: True 30 | 31 | PROGRESS_MONITOR: 32 | use: True 33 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/cma_ss.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | SIMULATOR_GPU_IDS: [0] 3 | TORCH_GPU_ID: 0 4 | TORCH_GPU_IDS: [0] 5 | TRAINER_NAME: ss 6 | GPU_NUMBERS: 1 7 | NUM_ENVIRONMENTS: 1 8 | TENSORBOARD_DIR: data/tensorboard_dirs/cma_ss 9 | CHECKPOINT_FOLDER: data/checkpoints/cma_ss 10 | EVAL_CKPT_PATH_DIR: data/checkpoints/cma_ss 11 | 12 | EVAL: 13 | USE_CKPT_CONFIG: False 14 | SPLIT: val_unseen 15 | EPISODE_COUNT: -1 16 | 17 | #RL: 18 | # POLICY: 19 | # OBS_TRANSFORMS: 20 | # ENABLED_TRANSFORMS: [Resize] 21 | 22 | IL: 23 | epochs: 50 24 | batch_size: 8 25 | schedule_ratio: 0.75 26 | decay_time: 10 27 | 28 | max_traj_len: 130 29 | 30 | MODEL: 31 | policy_name: CMAPolicyO 32 | 33 | INSTRUCTION_ENCODER: 34 | bidirectional: True 35 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/cma_pm_da.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_ENVIRONMENTS: 1 5 | TENSORBOARD_DIR: data/tensorboard_dirs/cma_pm_da 6 | CHECKPOINT_FOLDER: data/checkpoints/cma_pm_da 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/cma_pm_da 8 | 9 | EVAL: 10 | USE_CKPT_CONFIG: False 11 | SPLIT: val_unseen 12 | EPISODE_COUNT: -1 13 | 14 | IL: 15 | epochs: 4 16 | batch_size: 5 17 | 18 | DAGGER: 19 | iterations: 10 20 | update_size: 5000 21 | p: 0.75 22 | preload_lmdb_features: False 23 | lmdb_features_dir: data/trajectories_dirs/cma_pm_da/trajectories.lmdb 24 | 25 | MODEL: 26 | policy_name: CMAPolicy 27 | 28 | INSTRUCTION_ENCODER: 29 | bidirectional: True 30 | 31 | PROGRESS_MONITOR: 32 | use: True 33 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/cma_pm_aug.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task_aug.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_ENVIRONMENTS: 1 5 | TENSORBOARD_DIR: data/tensorboard_dirs/cma_pm_aug 6 | CHECKPOINT_FOLDER: data/checkpoints/cma_pm_aug 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/cma_pm_aug 8 | 9 | EVAL: 10 | USE_CKPT_CONFIG: False 11 | SPLIT: val_unseen 12 | EPISODE_COUNT: -1 13 | 14 | IL: 15 | epochs: 45 16 | batch_size: 5 17 | 18 | DAGGER: 19 | iterations: 1 20 | update_size: 157232 21 | p: 1.0 22 | preload_lmdb_features: False 23 | lmdb_features_dir: data/trajectories_dirs/cma_aug/trajectories.lmdb 24 | 25 | MODEL: 26 | policy_name: CMAPolicy 27 | 28 | INSTRUCTION_ENCODER: 29 | bidirectional: True 30 | 31 | PROGRESS_MONITOR: 32 | use: True 33 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/seq2seq_aug_tune.yaml: -------------------------------------------------------------------------------- 1 | 
BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_ENVIRONMENTS: 1 5 | TENSORBOARD_DIR: data/tensorboard_dirs/seq2seq_aug_tune 6 | CHECKPOINT_FOLDER: data/checkpoints/seq2seq_aug_tune 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/seq2seq_aug_tune 8 | 9 | EVAL: 10 | USE_CKPT_CONFIG: False 11 | SPLIT: val_unseen 12 | EPISODE_COUNT: -1 13 | 14 | IL: 15 | epochs: 15 16 | batch_size: 5 17 | 18 | DAGGER: 19 | iterations: 1 20 | update_size: 10819 21 | p: 1.0 22 | preload_lmdb_features: False 23 | lmdb_features_dir: data/trajectories_dirs/seq2seq/trajectories.lmdb 24 | load_from_ckpt: True 25 | ckpt_to_load: data/checkpoints/seq2seq_aug/best_checkpoint.pth # REPLACE 26 | 27 | MODEL: 28 | policy_name: Seq2SeqPolicy 29 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/seq2seq.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | TRAINER_NAME: dagger # recollect_trainer 3 | SIMULATOR_GPU_ID: 0 4 | TORCH_GPU_ID: 0 5 | NUM_ENVIRONMENTS: 1 6 | TENSORBOARD_DIR: data/tensorboard_dirs/seq2seq 7 | CHECKPOINT_FOLDER: data/checkpoints/seq2seq 8 | EVAL_CKPT_PATH_DIR: data/checkpoints/seq2seq 9 | 10 | EVAL: 11 | USE_CKPT_CONFIG: False 12 | SPLIT: val_unseen 13 | EPISODE_COUNT: -1 14 | 15 | IL: 16 | epochs: 15 17 | batch_size: 5 18 | 19 | RECOLLECT_TRAINER: 20 | gt_file: 21 | data/datasets/R2R_VLNCE_v1-2_preprocessed/{split}/{split}_gt.json.gz 22 | 23 | DAGGER: 24 | iterations: 1 25 | update_size: 10819 26 | p: 1.0 27 | preload_lmdb_features: False 28 | lmdb_features_dir: data/trajectories_dirs/seq2seq/trajectories.lmdb 29 | 30 | MODEL: 31 | policy_name: Seq2SeqPolicy 32 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/cma_aug_tune.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_ENVIRONMENTS: 1 5 | TENSORBOARD_DIR: data/tensorboard_dirs/cma_aug_tune 6 | CHECKPOINT_FOLDER: data/checkpoints/cma_aug_tune 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/cma_aug_tune 8 | 9 | EVAL: 10 | USE_CKPT_CONFIG: False 11 | SPLIT: val_unseen 12 | EPISODE_COUNT: -1 13 | 14 | IL: 15 | epochs: 45 16 | batch_size: 5 17 | 18 | DAGGER: 19 | iterations: 1 20 | update_size: 10819 21 | p: 1.0 22 | preload_lmdb_features: False 23 | lmdb_features_dir: data/trajectories_dirs/cma/trajectories.lmdb 24 | load_from_ckpt: True 25 | ckpt_to_load: data/checkpoints/cma_aug/best_checkpoint.pth # REPLACE 26 | 27 | MODEL: 28 | policy_name: CMAPolicy 29 | 30 | INSTRUCTION_ENCODER: 31 | bidirectional: True 32 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/cma_pm_aug_tune.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_ENVIRONMENTS: 1 5 | TENSORBOARD_DIR: data/tensorboard_dirs/cma_pm_aug_tune 6 | CHECKPOINT_FOLDER: data/checkpoints/cma_pm_aug_tune 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/cma_pm_aug_tune 8 | 9 | EVAL: 10 | USE_CKPT_CONFIG: False 11 | SPLIT: val_unseen 12 | EPISODE_COUNT: -1 13 | 14 | IL: 15 | epochs: 45 16 | batch_size: 5 17 | 18 | DAGGER: 19 | iterations: 1 
20 | update_size: 10819 21 | p: 1.0 22 | preload_lmdb_features: False 23 | lmdb_features_dir: data/trajectories_dirs/cma/trajectories.lmdb 24 | load_from_ckpt: True 25 | ckpt_to_load: data/checkpoints/cma_pm_aug/best_checkpoint.pth # REPLACE 26 | 27 | MODEL: 28 | policy_name: CMAPolicy 29 | 30 | INSTRUCTION_ENCODER: 31 | bidirectional: True 32 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/cma_da_aug_tune.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_ENVIRONMENTS: 1 5 | TENSORBOARD_DIR: data/tensorboard_dirs/cma_da_aug_tune 6 | CHECKPOINT_FOLDER: data/checkpoints/cma_da_aug_tune 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/cma_da_aug_tune 8 | 9 | EVAL: 10 | USE_CKPT_CONFIG: False 11 | SPLIT: val_unseen 12 | EPISODE_COUNT: -1 13 | 14 | IL: 15 | epochs: 4 16 | batch_size: 5 17 | 18 | DAGGER: 19 | iterations: 10 20 | update_size: 5000 21 | p: 0.5 22 | preload_lmdb_features: False 23 | lmdb_features_dir: data/trajectories_dirs/cma_da_aug_tune/trajectories.lmdb 24 | load_from_ckpt: True 25 | ckpt_to_load: data/checkpoints/cma_aug_tune/best_checkpoint.pth # REPLACE 26 | 27 | MODEL: 28 | policy_name: CMAPolicy 29 | 30 | INSTRUCTION_ENCODER: 31 | bidirectional: True 32 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/cma.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | TRAINER_NAME: dagger # recollect_trainer 3 | SIMULATOR_GPU_IDS: [0] 4 | TORCH_GPU_ID: 0 5 | GPU_NUMBERS: 1 6 | NUM_ENVIRONMENTS: 1 7 | TENSORBOARD_DIR: data/tensorboard_dirs/cma 8 | CHECKPOINT_FOLDER: data/checkpoints/cma 9 | EVAL_CKPT_PATH_DIR: data/checkpoints/cma 10 | 11 | EVAL: 12 | USE_CKPT_CONFIG: False 13 | SPLIT: val_unseen 14 | EPISODE_COUNT: -1 15 | 16 | IL: 17 | epochs: 45 18 | batch_size: 5 19 | 20 | RECOLLECT_TRAINER: 21 | gt_file: 22 | data/datasets/R2R_VLNCE_v1-2_preprocessed/{split}/{split}_gt.json.gz 23 | 24 | DAGGER: 25 | iterations: 1 26 | update_size: 10819 27 | p: 1.0 28 | preload_lmdb_features: False 29 | lmdb_features_dir: data/trajectories_dirs/cma/trajectories.lmdb 30 | 31 | MODEL: 32 | policy_name: CMAPolicy 33 | 34 | INSTRUCTION_ENCODER: 35 | bidirectional: True 36 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/seq2seq_pm_da_aug_tune.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_ENVIRONMENTS: 1 5 | TENSORBOARD_DIR: data/tensorboard_dirs/seq2seq_pm_da_aug_tune 6 | CHECKPOINT_FOLDER: data/checkpoints/seq2seq_pm_da_aug_tune 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/seq2seq_pm_da_aug_tune 8 | 9 | EVAL: 10 | USE_CKPT_CONFIG: False 11 | SPLIT: val_unseen 12 | EPISODE_COUNT: -1 13 | 14 | IL: 15 | epochs: 4 16 | batch_size: 5 17 | 18 | DAGGER: 19 | iterations: 10 20 | update_size: 5000 21 | p: 0.75 22 | preload_lmdb_features: False 23 | lmdb_features_dir: data/trajectories_dirs/seq2seq_pm_da_aug_tune/trajectories.lmdb 24 | load_from_ckpt: True 25 | ckpt_to_load: data/checkpoints/seq2seq_pm_aug/best_checkpoint.pth # REPLACE 26 | 27 | MODEL: 28 | policy_name: Seq2SeqPolicy 29 | 30 | 
PROGRESS_MONITOR: 31 | use: True 32 | -------------------------------------------------------------------------------- /vlnce_baselines/config/r2r_configs/cma_pm_da_aug_tune.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_ENVIRONMENTS: 1 5 | TENSORBOARD_DIR: data/tensorboard_dirs/cma_pm_da_aug_tune 6 | CHECKPOINT_FOLDER: data/checkpoints/cma_pm_da_aug_tune 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/cma_pm_da_aug_tune 8 | 9 | EVAL: 10 | USE_CKPT_CONFIG: False 11 | SPLIT: val_unseen 12 | EPISODE_COUNT: -1 13 | 14 | IL: 15 | epochs: 4 16 | batch_size: 5 17 | 18 | DAGGER: 19 | iterations: 10 20 | update_size: 5000 21 | p: 0.5 22 | preload_lmdb_features: False 23 | lmdb_features_dir: data/trajectories_dirs/cma_pm_da_aug_tune/trajectories.lmdb 24 | load_from_ckpt: True 25 | ckpt_to_load: data/checkpoints/cma_pm_aug/best_checkpoint.pth # REPLACE 26 | 27 | MODEL: 28 | policy_name: CMAPolicy 29 | 30 | INSTRUCTION_ENCODER: 31 | bidirectional: True 32 | 33 | PROGRESS_MONITOR: 34 | use: True 35 | -------------------------------------------------------------------------------- /vlnce_baselines/common/aux_losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class _AuxLosses: 5 | def __init__(self): 6 | self._losses = {} 7 | self._loss_alphas = {} 8 | self._is_active = False 9 | 10 | def clear(self): 11 | self._losses.clear() 12 | self._loss_alphas.clear() 13 | 14 | def register_loss(self, name, loss, alpha=1.0): 15 | assert self.is_active() 16 | assert name not in self._losses 17 | 18 | self._losses[name] = loss 19 | self._loss_alphas[name] = alpha 20 | 21 | def get_loss(self, name): 22 | return self._losses[name] 23 | 24 | def reduce(self, mask): 25 | assert self.is_active() 26 | total = torch.tensor(0.0).cuda() 27 | 28 | for k in self._losses.keys(): 29 | k_loss = torch.masked_select(self._losses[k], mask).mean() 30 | total = total + self._loss_alphas[k] * k_loss 31 | 32 | return total 33 | 34 | def is_active(self): 35 | return self._is_active 36 | 37 | def activate(self): 38 | self._is_active = True 39 | 40 | def deactivate(self): 41 | self._is_active = False 42 | 43 | 44 | AuxLosses = _AuxLosses() 45 | -------------------------------------------------------------------------------- /vlnce_baselines/waypoint_networks/img_segmentor_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class ImgSegmentor(nn.Module): 7 | 8 | def __init__(self, segmentation_model, loss_scale): 9 | super(ImgSegmentor, self).__init__() 10 | self._segmentation_model = segmentation_model 11 | self.loss_scale = loss_scale 12 | 13 | self.cel_loss = nn.CrossEntropyLoss() 14 | 15 | 16 | 17 | def forward(self, batch): 18 | 19 | imgs = batch['images'] 20 | B, T, _, H, W = imgs.shape 21 | 22 | pred_segm_raw = self._segmentation_model(imgs) 23 | 24 | C = pred_segm_raw.shape[1] 25 | 26 | # Get a prob distribution over the labels 27 | pred_segm_raw = pred_segm_raw.view(B,T,C,H,W) 28 | pred_segm = F.softmax(pred_segm_raw, dim=2) 29 | 30 | output = {'pred_segm_raw':pred_segm_raw, 31 | 'pred_segm':pred_segm} 32 | 33 | return output 34 | 35 | 36 | def loss_cel(self, batch, pred_outputs): 37 | pred_segm_raw = pred_outputs['pred_segm_raw'] 38 | B, T, C, H, W = pred_segm_raw.shape 39 | 40 | 
gt_segm = batch['gt_segm'] 41 | pred_segm_loss = self.cel_loss(input=pred_segm_raw.view(B*T,C,H,W), target=gt_segm.view(B*T,H,W)) 42 | 43 | pred_segm_err = pred_segm_loss.clone().detach() 44 | 45 | output={} 46 | output['pred_segm_err'] = pred_segm_err 47 | output['pred_segm_loss'] = self.loss_scale * pred_segm_loss 48 | return output -------------------------------------------------------------------------------- /run_r2r/r2r_vlnce.yaml: -------------------------------------------------------------------------------- 1 | ENVIRONMENT: 2 | MAX_EPISODE_STEPS: 5000 3 | 4 | SIMULATOR: 5 | ACTION_SPACE_CONFIG: v0 6 | AGENT_0: 7 | SENSORS: [RGB_SENSOR, DEPTH_SENSOR] 8 | FORWARD_STEP_SIZE: 0.25 9 | TURN_ANGLE: 15 10 | HABITAT_SIM_V0: 11 | GPU_DEVICE_ID: 0 12 | ALLOW_SLIDING: True 13 | RGB_SENSOR: 14 | WIDTH: 224 15 | HEIGHT: 224 16 | HFOV: 90 17 | TYPE: HabitatSimRGBSensor 18 | DEPTH_SENSOR: 19 | WIDTH: 256 # pretrained DDPPO resnet needs 256x256 20 | HEIGHT: 256 21 | HFOV: 90 22 | TYPE: 23 | Sim-v1 24 | 25 | TASK: 26 | TYPE: VLN-v0 27 | POSSIBLE_ACTIONS: [STOP, MOVE_FORWARD, TURN_LEFT, TURN_RIGHT, HIGHTOLOW] 28 | SUCCESS_DISTANCE: 3.0 29 | SENSORS: [ 30 | INSTRUCTION_SENSOR, 31 | # SHORTEST_PATH_SENSOR, 32 | # VLN_ORACLE_PROGRESS_SENSOR 33 | ] 34 | INSTRUCTION_SENSOR_UUID: instruction 35 | MEASUREMENTS: [ 36 | # DISTANCE_TO_GOAL, 37 | # SUCCESS, 38 | # SPL, 39 | # NDTW, 40 | # PATH_LENGTH, 41 | # ORACLE_SUCCESS, 42 | # STEPS_TAKEN 43 | ] 44 | SUCCESS: 45 | SUCCESS_DISTANCE: 3.0 46 | SPL: 47 | SUCCESS_DISTANCE: 3.0 48 | NDTW: 49 | SUCCESS_DISTANCE: 3.0 50 | GT_PATH: data/datasets/R2R_VLNCE_v1-2_preprocessed/{split}/{split}_gt.json.gz 51 | SDTW: 52 | SUCCESS_DISTANCE: 3.0 53 | GT_PATH: data/datasets/R2R_VLNCE_v1-2_preprocessed/{split}/{split}_gt.json.gz 54 | ORACLE_SUCCESS: 55 | SUCCESS_DISTANCE: 3.0 56 | 57 | DATASET: 58 | TYPE: VLN-CE-v1 59 | SPLIT: train 60 | DATA_PATH: data/datasets/R2R_VLNCE_v1-2_preprocessed_BERTidx/{split}/{split}_bertidx.json.gz 61 | SCENES_DIR: data/scene_datasets/ 62 | -------------------------------------------------------------------------------- /run_r2r/iter_train.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: run_r2r/r2r_vlnce.yaml 2 | SIMULATOR_GPU_IDS: [0] 3 | TORCH_GPU_ID: 0 4 | TORCH_GPU_IDS: [0] 5 | TRAINER_NAME: SS-ETP 6 | GPU_NUMBERS: 1 7 | NUM_ENVIRONMENTS: 1 8 | TENSORBOARD_DIR: data/logs/tensorboard_dirs/ 9 | CHECKPOINT_FOLDER: data/logs/checkpoints/ 10 | EVAL_CKPT_PATH_DIR: data/logs/checkpoints/ 11 | RESULTS_DIR: data/logs/eval_results/ 12 | VIDEO_DIR: data/logs/video/ 13 | 14 | INFERENCE: 15 | SPLIT: test 16 | USE_CKPT_CONFIG: False 17 | SAMPLE: False 18 | CKPT_PATH: '' # REPLACE THIS 19 | PREDICTIONS_FILE: '' 20 | FORMAT: r2r 21 | EPISODE_COUNT: -1 22 | 23 | EVAL: 24 | USE_CKPT_CONFIG: False 25 | SPLIT: val_unseen 26 | EPISODE_COUNT: -1 27 | CKPT_PATH_DIR: '' 28 | fast_eval: False 29 | 30 | IL: 31 | iters: 30000 32 | log_every: 500 33 | lr: 1e-5 34 | batch_size: 1 # equal to NUM_ENVIRONMENTS 35 | ml_weight: 1.0 36 | expert_policy: spl 37 | 38 | sample_ratio: 0.75 39 | decay_interval: 3000 40 | 41 | max_traj_len: 30 42 | max_text_len: 80 43 | loc_noise: 0.5 44 | waypoint_aug: False 45 | ghost_aug: 0.0 46 | back_algo: teleport 47 | # back_algo: control 48 | tryout: True 49 | 50 | MODEL: 51 | task_type: r2r 52 | 53 | policy_name: PolicyViewSelectionETP 54 | NUM_ANGLES: 12 55 | pretrained_path: pretrained/model_step_82500.pt 56 | fix_lang_embedding: False 57 | fix_pano_embedding: False 58 | 
use_depth_embedding: True 59 | use_sprels: True 60 | merge_ghost: True 61 | consume_ghost: True 62 | 63 | spatial_output: False 64 | RGB_ENCODER: 65 | output_size: 512 66 | DEPTH_ENCODER: 67 | output_size: 256 68 | VISUAL_DIM: 69 | vis_hidden: 768 70 | directional: 128 71 | INSTRUCTION_ENCODER: 72 | bidirectional: True 73 | 74 | -------------------------------------------------------------------------------- /run_r2r/main.bash: -------------------------------------------------------------------------------- 1 | export GLOG_minloglevel=2 2 | export MAGNUM_LOG=quiet 3 | 4 | flag1="--exp_name release_r2r 5 | --run-type train 6 | --exp-config run_r2r/iter_train.yaml 7 | SIMULATOR_GPU_IDS [0] 8 | TORCH_GPU_IDS [0] 9 | GPU_NUMBERS 1 10 | NUM_ENVIRONMENTS 1 11 | IL.iters 29000 12 | IL.lr 1e-5 13 | IL.log_every 500 14 | IL.ml_weight 1.0 15 | IL.sample_ratio 0.75 16 | IL.decay_interval 4000 17 | IL.load_from_ckpt True 18 | IL.is_requeue True 19 | IL.waypoint_aug True 20 | IL.ckpt_to_load data/checkpoints/ckpt.iter25000.pth 21 | TASK_CONFIG.SIMULATOR.HABITAT_SIM_V0.ALLOW_SLIDING True 22 | MODEL.pretrained_path pretrained/model_step_100000.pt 23 | " 24 | 25 | flag2=" --exp_name release_r2r 26 | --run-type eval 27 | --exp-config run_r2r/iter_train.yaml 28 | SIMULATOR_GPU_IDS [0] 29 | TORCH_GPU_IDS [0] 30 | GPU_NUMBERS 1 31 | NUM_ENVIRONMENTS 1 32 | TASK_CONFIG.SIMULATOR.HABITAT_SIM_V0.ALLOW_SLIDING True 33 | EVAL.CKPT_PATH_DIR data/checkpoints/ckpt.pth 34 | MODEL.pretrained_path pretrained/model_step_100000.pt 35 | IL.back_algo control 36 | " 37 | 38 | flag3="--exp_name release_r2r 39 | --run-type inference 40 | --exp-config run_r2r/iter_train.yaml 41 | SIMULATOR_GPU_IDS [0] 42 | TORCH_GPU_IDS [0] 43 | GPU_NUMBERS 1 44 | NUM_ENVIRONMENTS 1 45 | TASK_CONFIG.SIMULATOR.HABITAT_SIM_V0.ALLOW_SLIDING True 46 | INFERENCE.CKPT_PATH data/checkpoints/ckpt.pth 47 | INFERENCE.PREDICTIONS_FILE preds.json 48 | MODEL.pretrained_path pretrained/model_step_100000.pt 49 | IL.back_algo control 50 | " 51 | 52 | mode=$1 53 | case $mode in 54 | train) 55 | echo "###### train mode ######" 56 | CUDA_VISIBLE_DEVICES='7' python run.py $flag1 57 | ;; 58 | eval) 59 | echo "###### eval mode ######" 60 | #CUDA_VISIBLE_DEVICES='5' python -m pdb run.py $flag2 61 | CUDA_VISIBLE_DEVICES='0' python run.py $flag2 62 | ;; 63 | infer) 64 | echo "###### infer mode ######" 65 | CUDA_VISIBLE_DEVICES='7' python run.py $flag3 66 | ;; 67 | esac -------------------------------------------------------------------------------- /vlnce_baselines/common/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List 2 | import torch 3 | import torch.distributed as dist 4 | import numpy as np 5 | import copy 6 | import math 7 | 8 | def extract_instruction_tokens( 9 | observations: List[Dict], 10 | instruction_sensor_uuid: str, 11 | tokens_uuid: str = "tokens", 12 | max_length: int = 512, 13 | pad_id: int = 0, 14 | ): 15 | """Extracts instruction tokens from an instruction sensor if the tokens 16 | exist and are in a dict structure.""" 17 | if instruction_sensor_uuid not in observations[0]: 18 | return observations 19 | 20 | for i in range(len(observations)): 21 | if ( 22 | isinstance(observations[i][instruction_sensor_uuid], dict) 23 | and tokens_uuid in observations[i][instruction_sensor_uuid] 24 | ): 25 | token = observations[i][instruction_sensor_uuid]["tokens"][:max_length] 26 | if len(token) < max_length: 27 | token += [pad_id] * (max_length - len(token)) 28 | 
observations[i][instruction_sensor_uuid] = token 29 | else: 30 | break 31 | return observations 32 | 33 | def gather_list_and_concat(list_of_nums,world_size): 34 | if not torch.is_tensor(list_of_nums): 35 | tensor = torch.Tensor(list_of_nums).cuda() 36 | else: 37 | if list_of_nums.is_cuda == False: 38 | tensor = list_of_nums.cuda() 39 | else: 40 | tensor = list_of_nums 41 | gather_t = [torch.ones_like(tensor) for _ in 42 | range(world_size)] 43 | dist.all_gather(gather_t, tensor) 44 | return gather_t 45 | 46 | def dis_to_con(path, amount=0.25): 47 | starts = path[:-1] 48 | ends = path[1:] 49 | new_path = [path[0]] 50 | for s, e in zip(starts,ends): 51 | vec = np.array(e) - np.array(s) 52 | ratio = amount/np.linalg.norm(vec[[0,2]]) 53 | unit = vec*ratio 54 | times = int(1/ratio) 55 | for i in range(times): 56 | if i != times - 1: 57 | location = np.array(new_path[-1])+unit 58 | new_path.append(location.tolist()) 59 | new_path.append(e) 60 | 61 | return new_path 62 | 63 | def get_camera_orientations12(): 64 | base_angle_deg = 30 65 | base_angle_rad = math.pi / 6 66 | orient_dict = {} 67 | for k in range(1,12): 68 | orient_dict[str(base_angle_deg*k)] = [0.0, base_angle_rad*k, 0.0] 69 | return orient_dict -------------------------------------------------------------------------------- /vlnce_baselines/common/ops.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .transformer import TransformerEncoder, TransformerEncoderLayer 4 | 5 | try: 6 | from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm 7 | except (ImportError, AttributeError) as e: 8 | # logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .") 9 | BertLayerNorm = torch.nn.LayerNorm 10 | 11 | def create_transformer_encoder(config, num_layers, norm=False): 12 | enc_layer = TransformerEncoderLayer( 13 | config.hidden_size, config.num_attention_heads, 14 | dim_feedforward=config.intermediate_size, 15 | dropout=config.hidden_dropout_prob, 16 | activation=config.hidden_act, 17 | normalize_before=True 18 | ) 19 | if norm: 20 | norm_layer = BertLayerNorm(config.hidden_size, eps=1e-12) 21 | else: 22 | norm_layer = None 23 | return TransformerEncoder(enc_layer, num_layers, norm=norm_layer, batch_first=True) 24 | 25 | def extend_neg_masks(masks, dtype=None): 26 | """ 27 | mask from (N, L) into (N, 1(H), 1(L), L) and make it negative 28 | """ 29 | if dtype is None: 30 | dtype = torch.float 31 | extended_masks = masks.unsqueeze(1).unsqueeze(2) 32 | extended_masks = extended_masks.to(dtype=dtype) 33 | extended_masks = (1.0 - extended_masks) * -10000.0 34 | return extended_masks 35 | 36 | def gen_seq_masks(seq_lens, max_len=None): 37 | if max_len is None: 38 | max_len = max(seq_lens) 39 | batch_size = len(seq_lens) 40 | device = seq_lens.device 41 | 42 | masks = torch.arange(max_len).unsqueeze(0).repeat(batch_size, 1).to(device) 43 | masks = masks < seq_lens.unsqueeze(1) 44 | return masks 45 | 46 | def pad_tensors_wgrad(tensors, lens=None): 47 | """B x [T, ...] 
torch tensors""" 48 | if lens is None: 49 | lens = [t.size(0) for t in tensors] 50 | max_len = max(lens) 51 | batch_size = len(tensors) 52 | hid = list(tensors[0].size()[1:]) 53 | 54 | device = tensors[0].device 55 | dtype = tensors[0].dtype 56 | 57 | output = [] 58 | for i in range(batch_size): 59 | if lens[i] < max_len: 60 | tmp = torch.cat( 61 | [tensors[i], torch.zeros([max_len-lens[i]]+hid, dtype=dtype).to(device)], 62 | dim=0 63 | ) 64 | else: 65 | tmp = tensors[i] 66 | output.append(tmp) 67 | output = torch.stack(output, 0) 68 | return output 69 | -------------------------------------------------------------------------------- /vlnce_baselines/models/etp/vlnbert_init.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_tokenizer(args): 5 | from transformers import AutoTokenizer 6 | if args.dataset == 'rxr' or args.tokenizer == 'xlm': 7 | cfg_name = 'bert_config/xlm-roberta-base' 8 | else: 9 | cfg_name = 'bert_config/bert-base-uncased' 10 | tokenizer = AutoTokenizer.from_pretrained(cfg_name) 11 | return tokenizer 12 | 13 | def get_vlnbert_models(config=None): 14 | 15 | from transformers import PretrainedConfig, BertTokenizer, BertModel 16 | from vlnce_baselines.models.etp.vilmodel_cmt import GlocalTextPathNavCMT 17 | 18 | model_class = GlocalTextPathNavCMT 19 | 20 | model_name_or_path = config.pretrained_path 21 | new_ckpt_weights = {} 22 | if model_name_or_path is not None: 23 | ckpt_weights = torch.load(model_name_or_path, map_location='cpu') 24 | for k, v in ckpt_weights.items(): 25 | if k.startswith('module'): 26 | new_ckpt_weights[k[7:]] = v 27 | if 'sap_head' in k: 28 | new_ckpt_weights['bert.' + k] = v 29 | else: 30 | new_ckpt_weights[k] = v 31 | 32 | if config.task_type == 'r2r': 33 | #cfg_name = 'bert_config/bert-base-uncased' 34 | cfg_name = 'vlnce_baselines/bert-base-uncased' 35 | elif config.task_type == 'rxr': 36 | cfg_name = 'bert_config/xlm-roberta-base' 37 | vis_config = PretrainedConfig.from_pretrained(cfg_name) 38 | 39 | if config.task_type == 'rxr': 40 | vis_config.type_vocab_size = 2 41 | 42 | vis_config.max_action_steps = 100 43 | vis_config.image_feat_size = 512 44 | vis_config.use_depth_embedding = config.use_depth_embedding 45 | vis_config.depth_feat_size = 128 46 | vis_config.angle_feat_size = 4 47 | 48 | vis_config.num_l_layers = 9 49 | vis_config.num_pano_layers = 2 50 | vis_config.num_x_layers = 4 51 | vis_config.graph_sprels = config.use_sprels 52 | vis_config.glocal_fuse = 'global' 53 | 54 | vis_config.fix_lang_embedding = config.fix_lang_embedding 55 | vis_config.fix_pano_embedding = config.fix_pano_embedding 56 | 57 | vis_config.update_lang_bert = not vis_config.fix_lang_embedding 58 | vis_config.output_attentions = True 59 | vis_config.pred_head_dropout_prob = 0.1 60 | vis_config.use_lang2visn_attn = False 61 | 62 | visual_model = model_class.from_pretrained( 63 | pretrained_model_name_or_path=None, 64 | config=vis_config, 65 | state_dict=new_ckpt_weights) 66 | 67 | return visual_model 68 | -------------------------------------------------------------------------------- /vlnce_baselines/models/policy.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from typing import Any 3 | 4 | from habitat_baselines.rl.ppo.policy import Policy 5 | from habitat_baselines.utils.common import ( 6 | CategoricalNet, 7 | CustomFixedCategorical, 8 | ) 9 | from torch.distributions import Categorical 10 | 11 | 12 | class ILPolicy(Policy, 
metaclass=abc.ABCMeta): 13 | def __init__(self, net, dim_actions): 14 | r"""Defines an imitation learning policy as having functions act() and 15 | build_distribution(). 16 | """ 17 | super(Policy, self).__init__() 18 | self.net = net 19 | self.dim_actions = dim_actions 20 | 21 | # self.action_distribution = CategoricalNet( 22 | # self.net.output_size, self.dim_actions 23 | # ) 24 | 25 | def forward(self, *x): 26 | raise NotImplementedError 27 | 28 | def act( 29 | self, 30 | observations, 31 | rnn_hidden_states, 32 | prev_actions, 33 | masks, 34 | deterministic=False, 35 | ): 36 | 37 | print('need to revise for CMA and VLNBERT') 38 | import pdb; pdb.set_trace() 39 | 40 | features, rnn_hidden_states = self.net( 41 | observations, rnn_hidden_states, prev_actions, masks 42 | ) 43 | distribution = self.action_distribution(features) 44 | 45 | # if distribution.logit 46 | if deterministic: 47 | action = distribution.mode() 48 | else: 49 | action = distribution.sample() 50 | 51 | return action, rnn_hidden_states 52 | 53 | def get_value(self, *args: Any, **kwargs: Any): 54 | raise NotImplementedError 55 | 56 | def evaluate_actions(self, *args: Any, **kwargs: Any): 57 | raise NotImplementedError 58 | 59 | def build_distribution( 60 | self, observations, rnn_hidden_states, prev_actions, masks 61 | ) -> CustomFixedCategorical: 62 | features, rnn_hidden_states = self.net( 63 | observations, rnn_hidden_states, prev_actions, masks 64 | ) 65 | return self.action_distribution(features) 66 | 67 | def act2( 68 | self, 69 | observations, 70 | rnn_hidden_states, 71 | prev_actions, 72 | masks, 73 | deterministic=False, 74 | ): 75 | 76 | print('need to revise for CMA and VLNBERT') 77 | import pdb; pdb.set_trace() 78 | 79 | feature_rgb, feature_depth, rnn_hidden_states = self.net( 80 | observations, rnn_hidden_states, prev_actions, masks 81 | ) 82 | distribution_rgb = self.action_distribution(feature_rgb) 83 | distribution_depth = self.action_distribution(feature_depth) 84 | 85 | probs = (distribution_rgb.probs + distribution_depth.probs)/2 86 | # if distribution.logit 87 | if deterministic: 88 | action = probs.argmax(dim=-1, keepdim=True) 89 | else: 90 | action = Categorical(probs).sample().unsqueeze(-1) 91 | 92 | return action, rnn_hidden_states 93 | -------------------------------------------------------------------------------- /utils_p/prompt.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import time 5 | 6 | 7 | class Prompt(nn.Module): 8 | def __init__(self, prompt_alpha=0.01, image_size=224): 9 | super().__init__() 10 | self.prompt_size = int(image_size * prompt_alpha) if int(image_size * prompt_alpha) > 1 else 1 11 | self.padding_size = (image_size - self.prompt_size)//2 12 | self.init_para = torch.ones((1, 3, self.prompt_size, self.prompt_size)) 13 | self.data_prompt = nn.Parameter(self.init_para, requires_grad=True) 14 | self.pre_prompt = self.data_prompt.detach().cpu().data 15 | 16 | def update(self, init_data): 17 | with torch.no_grad(): 18 | self.data_prompt.copy_(init_data) 19 | 20 | def iFFT(self, amp_src_, pha_src, imgH, imgW): 21 | # recompose fft 22 | real = torch.cos(pha_src) * amp_src_ 23 | imag = torch.sin(pha_src) * amp_src_ 24 | fft_src_ = torch.complex(real=real, imag=imag) 25 | 26 | src_in_trg = torch.fft.ifft2(fft_src_, dim=(-2, -1), s=[imgH, imgW]).real 27 | return src_in_trg 28 | 29 | def forward(self, x): 30 | start = time.time() 31 | _, _, imgH, imgW = x.size() # 
image size 32 | 33 | fft = torch.fft.fft2(x.clone(), dim=(-2, -1)) # fft for image 34 | 35 | # extract amplitude and phase of both ffts 36 | amp_src, pha_src = torch.abs(fft), torch.angle(fft) # amp: amplitude spectrum (frequency magnitudes); pha: phase spectrum (phase offsets) 37 | amp_src = torch.fft.fftshift(amp_src) # shift the low-frequency components of the amplitude spectrum to the image center 38 | 39 | # F.pad: pad self.data_prompt so it is centered within the input image size 40 | prompt = F.pad(self.data_prompt, [self.padding_size, imgH - self.padding_size - self.prompt_size, 41 | self.padding_size, imgW - self.padding_size - self.prompt_size], 42 | mode='constant', value=1.0).contiguous() # self.data_prompt: the predefined prompt, padded to match the input image size; the padding ensures only a specific (low-frequency) part of the spectrum is processed 43 | 44 | amp_src_ = amp_src * prompt # multiply the amplitude spectrum by the prompt to focus on the selected low frequencies 45 | amp_src_ = torch.fft.ifftshift(amp_src_) # after prompt modulation, shift the frequency components back to their original arrangement 46 | 47 | amp_low_ = amp_src[:, :, self.padding_size:self.padding_size+self.prompt_size, self.padding_size:self.padding_size+self.prompt_size] # extract the low-frequency components 48 | 49 | src_in_trg = self.iFFT(amp_src_, pha_src, imgH, imgW) # reconstruct the image 50 | end = time.time() 51 | T = end - start 52 | #print(T) 53 | 54 | return src_in_trg, amp_low_ 55 | 56 | 57 | 58 | def enhance(self, x, retrieve_p): 59 | 60 | _, _, imgH, imgW = x.size() # image size 61 | 62 | fft = torch.fft.fft2(x.clone(), dim=(-2, -1)) # fft for image 63 | 64 | # extract amplitude and phase of both ffts 65 | amp_src, pha_src = torch.abs(fft), torch.angle(fft) # amp: amplitude spectrum (frequency magnitudes); pha: phase spectrum (phase offsets) 66 | amp_src = torch.fft.fftshift(amp_src) # shift the low-frequency components of the amplitude spectrum to the image center 67 | 68 | # F.pad: pad the retrieved prompt so it is centered within the input image size 69 | prompt = F.pad(retrieve_p.cuda(), [self.padding_size, imgH - self.padding_size - self.prompt_size, 70 | self.padding_size, imgW - self.padding_size - self.prompt_size], 71 | mode='constant', value=1.0).contiguous() # the retrieved prompt, padded to match the input image size; the padding ensures only a specific (low-frequency) part of the spectrum is processed 72 | 73 | amp_src_ = amp_src * prompt # multiply the amplitude spectrum by the prompt to focus on the selected low frequencies 74 | amp_src_ = torch.fft.ifftshift(amp_src_) # after prompt modulation, shift the frequency components back to their original arrangement 75 | 76 | # amp_low_ = amp_src[:, :, self.padding_size:self.padding_size+self.prompt_size, self.padding_size:self.padding_size+self.prompt_size] # extract the low-frequency components 77 | 78 | src_in_trg = self.iFFT(amp_src_, pha_src, imgH, imgW) # reconstruct the image 79 | #end = time.time() 80 | #T = end - start 81 | #print(T) 82 | 83 | return src_in_trg #, amp_low_ -------------------------------------------------------------------------------- /utils_p/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class RegressionLoss(nn.Module): 7 | def __init__(self, norm, channel_dim=-1): 8 | super().__init__() 9 | self.norm = norm 10 | self.channel_dim = channel_dim 11 | 12 | if norm == 1: 13 | self.loss_fn = F.l1_loss 14 | elif norm == 2: 15 | self.loss_fn = F.mse_loss 16 | else: 17 | raise ValueError(f'Expected norm 1 or 2, but got norm={norm}') 18 | 19 | def forward(self, prediction, target): 20 | loss = self.loss_fn(prediction.float(), target.float(), reduction='none') 21 | 22 | # Sum channel dimension 23 | loss = torch.sum(loss, dim=self.channel_dim, keepdims=True) 24 | return loss.mean() 25 | 26 | 27 | class SpatialRegressionLoss(nn.Module): 28 | def __init__(self, norm, ignore_index=255): 29 | super(SpatialRegressionLoss, self).__init__() 30 | self.norm = norm 31 | self.ignore_index = ignore_index 32 | 33 | if norm == 1: 34 | self.loss_fn = F.l1_loss 35 | elif norm == 2: 36 | self.loss_fn = F.mse_loss 37 | else: 38 | raise ValueError(f'Expected norm 1 or 2, but got norm={norm}') 39 | 40 | def forward(self, prediction, 
target): 41 | assert len(prediction.shape) == 5, 'Must be a 5D tensor' 42 | # ignore_index is the same across all channels 43 | mask = target[:, :, :1] != self.ignore_index 44 | if mask.sum() == 0: 45 | return prediction.new_zeros(1)[0].float() 46 | 47 | loss = self.loss_fn(prediction, target, reduction='none') 48 | 49 | # Sum channel dimension 50 | loss = torch.sum(loss, dim=-3, keepdims=True) 51 | 52 | return loss[mask].mean() 53 | 54 | 55 | class ProbabilisticLoss(nn.Module): 56 | """ Given a prior distribution and a posterior distribution, this module computes KL(posterior, prior)""" 57 | def __init__(self, remove_first_timestamp=True): 58 | super().__init__() 59 | self.remove_first_timestamp = remove_first_timestamp 60 | 61 | def forward(self, prior_mu, prior_sigma, posterior_mu, posterior_sigma): 62 | posterior_var = posterior_sigma[:, 1:] ** 2 63 | prior_var = prior_sigma[:, 1:] ** 2 64 | 65 | posterior_log_sigma = torch.log(posterior_sigma[:, 1:]) 66 | prior_log_sigma = torch.log(prior_sigma[:, 1:]) 67 | 68 | kl_div = ( 69 | prior_log_sigma - posterior_log_sigma - 0.5 70 | + (posterior_var + (posterior_mu[:, 1:] - prior_mu[:, 1:]) ** 2) / (2 * prior_var) 71 | ) 72 | first_kl = - posterior_log_sigma[:, :1] - 0.5 + (posterior_var[:, :1] + posterior_mu[:, :1] ** 2) / 2 73 | kl_div = torch.cat([first_kl, kl_div], dim=1) 74 | 75 | # Sum across channel dimension 76 | # Average across batch dimension, keep time dimension for monitoring 77 | kl_loss = torch.mean(torch.sum(kl_div, dim=-1)) 78 | return kl_loss 79 | 80 | 81 | class KLLoss(nn.Module): 82 | def __init__(self, alpha): 83 | super().__init__() 84 | self.alpha = alpha 85 | self.loss = ProbabilisticLoss(remove_first_timestamp=True) 86 | 87 | def forward(self, prior, posterior): 88 | prior_mu, prior_sigma = prior['mu'], prior['sigma'] 89 | posterior_mu, posterior_sigma = posterior['mu'], posterior['sigma'] 90 | prior_mu = prior_mu.float() 91 | prior_sigma = prior_sigma.float() 92 | posterior_mu = posterior_mu.float() 93 | posterior_sigma = posterior_sigma.float() 94 | 95 | prior_loss = self.loss(prior_mu, prior_sigma, posterior_mu.detach(), posterior_sigma.detach()) 96 | posterior_loss = self.loss(prior_mu.detach(), prior_sigma.detach(), posterior_mu, posterior_sigma) 97 | 98 | return self.alpha * prior_loss + (1 - self.alpha) * posterior_loss -------------------------------------------------------------------------------- /vlnce_baselines/waypoint_networks/resnetUnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torchvision import models 4 | 5 | def convrelu(in_channels, out_channels, kernel, padding): 6 | return nn.Sequential( 7 | nn.Conv2d(in_channels, out_channels, kernel, padding=padding), 8 | nn.BatchNorm2d(num_features=out_channels), 9 | nn.ReLU(inplace=True), 10 | ) 11 | 12 | 13 | class ResNetUNet(nn.Module): 14 | def __init__(self, n_channel_in, n_class_out, has_positional_embedding = False): 15 | super().__init__() 16 | 17 | self.has_positional_embedding = has_positional_embedding 18 | if self.has_positional_embedding == True: 19 | positional_embedding_dim = 16 20 | n_channel_in += positional_embedding_dim 21 | 22 | scale = positional_embedding_dim ** -0.5 23 | self.positional_embedding = nn.Parameter(scale * torch.randn((positional_embedding_dim,192,192))) 24 | 25 | self.base_model = models.resnet18(pretrained=False) 26 | self.base_model.load_state_dict(torch.load("pretrained/resnet18-f37072fd.pth")) 27 | self.base_model.conv1 = 
nn.Conv2d(n_channel_in, 64, kernel_size=7, stride=2, padding=3,bias=False) 28 | self.base_layers = list(self.base_model.children()) 29 | 30 | self.layer0 = nn.Sequential(*self.base_layers[:3]) # size=(N, 64, x.H/2, x.W/2) 31 | self.layer0_1x1 = convrelu(64, 64, 1, 0) 32 | self.layer1 = nn.Sequential(*self.base_layers[3:5]) # size=(N, 64, x.H/4, x.W/4) 33 | self.layer1_1x1 = convrelu(64, 64, 1, 0) 34 | self.layer2 = self.base_layers[5] # size=(N, 128, x.H/8, x.W/8) 35 | self.layer2_1x1 = convrelu(128, 128, 1, 0) 36 | self.layer3 = self.base_layers[6] # size=(N, 256, x.H/16, x.W/16) 37 | self.layer3_1x1 = convrelu(256, 256, 1, 0) 38 | self.layer4 = self.base_layers[7] # size=(N, 512, x.H/32, x.W/32) 39 | self.layer4_1x1 = convrelu(512, 512, 1, 0) 40 | 41 | self.upsample = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True) 42 | 43 | self.conv_up3 = convrelu(256 + 512, 512, 3, 1) 44 | self.conv_up2 = convrelu(128 + 512, 256, 3, 1) 45 | self.conv_up1 = convrelu(64 + 256, 256, 3, 1) 46 | self.conv_up0 = convrelu(64 + 256, 128, 3, 1) 47 | 48 | self.conv_original_size0 = convrelu(n_channel_in, 64, 3, 1) 49 | self.conv_original_size1 = convrelu(64, 64, 3, 1) 50 | self.conv_original_size2 = convrelu(64 + 128, 64, 3, 1) 51 | 52 | self.conv_last = nn.Conv2d(64, n_class_out, 1) 53 | 54 | def forward(self, input): 55 | B, T, C, cH, cW = input.shape 56 | input = input.view(B*T,C,cH,cW) 57 | 58 | if self.has_positional_embedding: 59 | input = torch.cat((self.positional_embedding.unsqueeze(0).repeat(B*T,1,1,1),input),dim=1) 60 | 61 | x_original = self.conv_original_size0(input) 62 | x_original = self.conv_original_size1(x_original) 63 | 64 | layer0 = self.layer0(input) 65 | layer1 = self.layer1(layer0) 66 | layer2 = self.layer2(layer1) 67 | layer3 = self.layer3(layer2) 68 | layer4 = self.layer4(layer3) 69 | 70 | layer4 = self.layer4_1x1(layer4) 71 | x = self.upsample(layer4) 72 | 73 | layer3 = self.layer3_1x1(layer3) 74 | x = torch.cat([x, layer3], dim=1) 75 | x = self.conv_up3(x) 76 | 77 | x = self.upsample(x) 78 | layer2 = self.layer2_1x1(layer2) 79 | x = torch.cat([x, layer2], dim=1) 80 | x = self.conv_up2(x) 81 | 82 | x = self.upsample(x) 83 | layer1 = self.layer1_1x1(layer1) 84 | x = torch.cat([x, layer1], dim=1) 85 | x = self.conv_up1(x) 86 | 87 | x = self.upsample(x) 88 | layer0 = self.layer0_1x1(layer0) 89 | x = torch.cat([x, layer0], dim=1) 90 | x = self.conv_up0(x) 91 | 92 | x = self.upsample(x) 93 | x = torch.cat([x, x_original], dim=1) 94 | x = self.conv_original_size2(x) 95 | 96 | out = self.conv_last(x) 97 | 98 | return out -------------------------------------------------------------------------------- /vlnce_baselines/models/encoders/instruction_encoder.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | 4 | import torch 5 | import torch.nn as nn 6 | from habitat import Config 7 | 8 | 9 | class InstructionEncoder(nn.Module): 10 | def __init__(self, config: Config): 11 | r"""An encoder that uses RNN to encode an instruction. Returns 12 | the final hidden state after processing the instruction sequence. 13 | 14 | Args: 15 | config: must have 16 | embedding_size: The dimension of each embedding vector 17 | hidden_size: The hidden (output) size 18 | rnn_type: The RNN cell type. 
Must be GRU or LSTM 19 | final_state_only: Whether or not to return just the final state 20 | """ 21 | super().__init__() 22 | 23 | self.config = config 24 | 25 | # lang_drop_ratio = 0.50 26 | # self.drop = nn.Dropout(p=lang_drop_ratio) 27 | 28 | rnn = nn.GRU if self.config.rnn_type == "GRU" else nn.LSTM 29 | self.encoder_rnn = rnn( 30 | input_size=config.embedding_size, 31 | hidden_size=config.hidden_size, 32 | bidirectional=config.bidirectional, 33 | ) 34 | 35 | if config.sensor_uuid == "instruction": 36 | if self.config.use_pretrained_embeddings: 37 | self.embedding_layer = nn.Embedding.from_pretrained( 38 | embeddings=self._load_embeddings(), 39 | freeze=not self.config.fine_tune_embeddings, 40 | ) 41 | else: # each embedding initialized to sampled Gaussian 42 | self.embedding_layer = nn.Embedding( 43 | num_embeddings=config.vocab_size, 44 | embedding_dim=config.embedding_size, 45 | padding_idx=0, 46 | ) 47 | 48 | @property 49 | def output_size(self): 50 | return self.config.hidden_size * (1 + int(self.config.bidirectional)) 51 | 52 | def _load_embeddings(self): 53 | """Loads word embeddings from a pretrained embeddings file. 54 | PAD: index 0. [0.0, ... 0.0] 55 | UNK: index 1. mean of all R2R word embeddings: [mean_0, ..., mean_n] 56 | why UNK is averaged: https://bit.ly/3u3hkYg 57 | Returns: 58 | embeddings tensor of size [num_words x embedding_dim] 59 | """ 60 | with gzip.open(self.config.embedding_file, "rt") as f: 61 | embeddings = torch.tensor(json.load(f)) 62 | return embeddings 63 | 64 | def forward(self, observations): 65 | """ 66 | Tensor sizes after computation: 67 | instruction: [batch_size x seq_length] 68 | lengths: [batch_size] 69 | hidden_state: [batch_size x hidden_size] 70 | """ 71 | 72 | if self.config.sensor_uuid == "instruction": 73 | instruction = observations["instruction"].long() 74 | lengths = (instruction != 0.0).long().sum(dim=1) 75 | instruction = self.embedding_layer(instruction) 76 | # instruction = self.drop(instruction) 77 | else: 78 | instruction = observations["rxr_instruction"] 79 | 80 | lengths = (instruction != 0.0).long().sum(dim=2) 81 | lengths = (lengths != 0.0).long().sum(dim=1) 82 | 83 | packed_seq = nn.utils.rnn.pack_padded_sequence( 84 | instruction, lengths.cpu(), batch_first=True, enforce_sorted=False 85 | ) 86 | output, final_state = self.encoder_rnn(packed_seq) 87 | 88 | if self.config.rnn_type == "LSTM": 89 | final_state = final_state[0] 90 | 91 | if self.config.final_state_only: # default False 92 | return final_state.squeeze(0) 93 | else: 94 | ctx = nn.utils.rnn.pad_packed_sequence(output, 95 | batch_first=True)[0].permute(0, 2, 1) 96 | all_lang_masks = (ctx == 0.0).all(dim=1) 97 | ctx = ctx.permute(0, 2, 1) 98 | 99 | # ctx = self.drop(ctx) 100 | 101 | return ctx, all_lang_masks 102 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import random 5 | import os 6 | import numpy as np 7 | import torch 8 | from habitat import logger 9 | from habitat_baselines.common.baseline_registry import baseline_registry 10 | 11 | import habitat_extensions # noqa: F401 12 | import vlnce_baselines # noqa: F401 13 | from vlnce_baselines.config.default import get_config 14 | # from vlnce_baselines.nonlearning_agents import ( 15 | # evaluate_agent, 16 | # nonlearning_inference, 17 | # ) 18 | os.environ["CUDA_VISIBLE_DEVICES"] = "4" 19 | 20 | 21 | def main(): 22 | parser = 
argparse.ArgumentParser() 23 | parser.add_argument( 24 | "--exp_name", 25 | type=str, 26 | default="release_r2r", 27 | #required=True, 28 | help="experiment id that matches to exp-id in Notion log", 29 | ) 30 | parser.add_argument( 31 | "--run-type", 32 | choices=["train", "eval", "inference"], 33 | default="eval", 34 | #required=True, 35 | help="run type of the experiment (train, eval, inference)", 36 | ) 37 | parser.add_argument( 38 | "--exp-config", 39 | type=str, 40 | default="run_r2r/iter_train.yaml", 41 | # required=True, 42 | help="path to config yaml containing info about experiment", 43 | ) 44 | parser.add_argument( 45 | "opts", 46 | default=None, 47 | nargs=argparse.REMAINDER, 48 | help="Modify config options from command line", 49 | ) 50 | parser.add_argument('--local_rank', type=int, default=0, help="local gpu id") 51 | 52 | #Prompt 53 | parser.add_argument('--memory_size', type=int, default=1000) 54 | parser.add_argument('--neighbor', type=int, default=5) 55 | parser.add_argument('--prompt_alpha', type=float, default=0.1) 56 | parser.add_argument('--warm_n', type=int, default=5) 57 | parser.add_argument('--image_size', type=int, default=224) 58 | parser.add_argument('--imagine_T', type=int, default=2) 59 | 60 | args = parser.parse_args() 61 | run_exp(**vars(args)) 62 | 63 | 64 | def run_exp(exp_name: str, exp_config: str, 65 | run_type: str, memory_size: int, neighbor: int, prompt_alpha: float, warm_n: int, image_size: int, imagine_T: int, 66 | opts=None, local_rank=None) -> None: 67 | r"""Runs experiment given mode and config 68 | 69 | Args: 70 | exp_config: path to config file. 71 | run_type: "train" or "eval. 72 | opts: list of strings of additional config options. 73 | 74 | Returns: 75 | None. 76 | """ 77 | config = get_config(exp_config, opts) 78 | config.defrost() 79 | 80 | config.TENSORBOARD_DIR += exp_name 81 | config.CHECKPOINT_FOLDER += exp_name 82 | if os.path.isdir(config.EVAL_CKPT_PATH_DIR): 83 | config.EVAL_CKPT_PATH_DIR += exp_name 84 | config.RESULTS_DIR += exp_name 85 | config.VIDEO_DIR += exp_name 86 | # config.TASK_CONFIG.TASK.RXR_INSTRUCTION_SENSOR.max_text_len = config.IL.max_text_len 87 | config.LOG_FILE = exp_name + '_' + config.LOG_FILE 88 | 89 | if 'CMA' in config.MODEL.policy_name and 'r2r' in config.BASE_TASK_CONFIG_PATH: 90 | config.TASK_CONFIG.DATASET.DATA_PATH = 'data/datasets/R2R_VLNCE_v1-2_preprocessed/{split}/{split}.json.gz' 91 | 92 | config.local_rank = local_rank 93 | 94 | #prompt 95 | config.memory_size = memory_size 96 | config.neighbor = neighbor 97 | config.prompt_alpha = prompt_alpha 98 | config.warm_n = warm_n 99 | config.image_size = image_size 100 | config.imagine_T = imagine_T 101 | 102 | config.freeze() 103 | os.system("mkdir -p data/logs/running_log") 104 | logger.add_filehandler('data/logs/running_log/'+config.LOG_FILE) 105 | 106 | random.seed(config.TASK_CONFIG.SEED) 107 | np.random.seed(config.TASK_CONFIG.SEED) 108 | torch.manual_seed(config.TASK_CONFIG.SEED) 109 | torch.backends.cudnn.benchmark = False 110 | torch.backends.cudnn.deterministic = False 111 | if torch.cuda.is_available(): 112 | torch.set_num_threads(1) 113 | 114 | # if run_type == "eval" and config.EVAL.EVAL_NONLEARNING: 115 | # evaluate_agent(config) 116 | # return 117 | 118 | # if run_type == "inference" and config.INFERENCE.INFERENCE_NONLEARNING: 119 | # nonlearning_inference(config) 120 | # return 121 | 122 | trainer_init = baseline_registry.get_trainer(config.TRAINER_NAME) 123 | assert trainer_init is not None, f"{config.TRAINER_NAME} is not supported" 124 | 
trainer = trainer_init(config) 125 | 126 | # import pdb; pdb.set_trace() 127 | if run_type == "train": 128 | trainer.train() 129 | elif run_type == "eval": 130 | trainer.eval() 131 | elif run_type == "inference": 132 | trainer.inference() 133 | 134 | if __name__ == "__main__": 135 | main() 136 | -------------------------------------------------------------------------------- /utils_p/convert.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import numpy as np 3 | 4 | 5 | class AdaBN(nn.BatchNorm2d): 6 | def __init__(self, in_ch, warm_n=5): 7 | super(AdaBN, self).__init__(in_ch) 8 | self.warm_n = warm_n 9 | self.sample_num = 0 10 | self.new_sample = False 11 | 12 | def get_mu_var(self, x): 13 | if self.new_sample: 14 | self.sample_num += 1 15 | C = x.shape[1] 16 | 17 | cur_mu = x.mean((0, 2, 3), keepdims=True).detach() 18 | cur_var = x.var((0, 2, 3), keepdims=True).detach() 19 | 20 | src_mu = self.running_mean.view(1, C, 1, 1) 21 | src_var = self.running_var.view(1, C, 1, 1) 22 | 23 | moment = 1 / ((np.sqrt(self.sample_num) / self.warm_n) + 1) 24 | 25 | new_mu = moment * cur_mu + (1 - moment) * src_mu 26 | new_var = moment * cur_var + (1 - moment) * src_var 27 | return new_mu, new_var 28 | 29 | def forward(self, x): 30 | N, C, H, W = x.shape 31 | 32 | new_mu, new_var = self.get_mu_var(x) 33 | 34 | cur_mu = x.mean((2, 3), keepdims=True) 35 | cur_std = x.std((2, 3), keepdims=True) 36 | self.bn_loss = ( 37 | (new_mu - cur_mu).abs().mean() + (new_var.sqrt() - cur_std).abs().mean() 38 | ) 39 | 40 | # Normalization with new statistics 41 | new_sig = (new_var + self.eps).sqrt() 42 | new_x = ((x - new_mu) / new_sig) * self.weight.view(1, C, 1, 1) + self.bias.view(1, C, 1, 1) 43 | return new_x 44 | 45 | 46 | def convert_encoder_to_target(net, norm, start=0, end=5, verbose=True, bottleneck=False, input_size=512, warm_n=5): 47 | def convert_norm(old_norm, new_norm, num_features, idx, fea_size): 48 | norm_layer = new_norm(num_features, warm_n).to(net.conv1.weight.device) 49 | if hasattr(norm_layer, 'load_old_dict'): 50 | info = 'Converted to : {}'.format(norm) 51 | norm_layer.load_old_dict(old_norm) 52 | elif hasattr(norm_layer, 'load_state_dict'): 53 | state_dict = old_norm.state_dict() 54 | info = norm_layer.load_state_dict(state_dict, strict=False) 55 | else: 56 | info = 'No load_old_dict() found!!!' 
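        # At this point the replacement norm layer has been created; for AdaBN (a BatchNorm2d subclass) the load_state_dict branch above copies the old layer's affine weights and running statistics with strict=False, and `info` records how that load went before it is optionally printed below.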
57 | if verbose: 58 | print(info) 59 | return norm_layer 60 | 61 | layers = [0, net.layer1, net.layer2, net.layer3, net.layer4] 62 | 63 | idx = 0 64 | for i, layer in enumerate(layers): 65 | if not (start <= i < end): 66 | continue 67 | if i == 0: 68 | net.bn1 = convert_norm(net.bn1, norm, net.bn1.num_features, idx, fea_size=input_size // 2) 69 | idx += 1 70 | else: 71 | down_sample = 2 ** (1 + i) 72 | 73 | for j, block in enumerate(layer): 74 | block.bn1 = convert_norm(block.bn1, norm, block.bn1.num_features, idx, fea_size=input_size // down_sample) 75 | idx += 1 76 | block.bn2 = convert_norm(block.bn2, norm, block.bn2.num_features, idx, fea_size=input_size // down_sample) 77 | idx += 1 78 | if bottleneck: 79 | block.bn3 = convert_norm(block.bn3, norm, block.bn3.num_features, idx, fea_size=input_size // down_sample) 80 | idx += 1 81 | if block.downsample is not None: 82 | block.downsample[1] = convert_norm(block.downsample[1], norm, block.downsample[1].num_features, idx, fea_size=input_size // down_sample) 83 | idx += 1 84 | return net 85 | 86 | 87 | def convert_decoder_to_target(net, norm, start=0, end=5, verbose=True, input_size=512, warm_n=5): 88 | def convert_norm(old_norm, new_norm, num_features, idx, fea_size): 89 | norm_layer = new_norm(num_features, warm_n).to(old_norm.weight.device) 90 | if hasattr(norm_layer, 'load_old_dict'): 91 | info = 'Converted to : {}'.format(norm) 92 | norm_layer.load_old_dict(old_norm) 93 | elif hasattr(norm_layer, 'load_state_dict'): 94 | state_dict = old_norm.state_dict() 95 | info = norm_layer.load_state_dict(state_dict, strict=False) 96 | else: 97 | info = 'No load_old_dict() found!!!' 98 | if verbose: 99 | print(info) 100 | return norm_layer 101 | 102 | layers = [net[0], net[1], net[2], net[3], net[4]] 103 | 104 | idx = 0 105 | for i, layer in enumerate(layers): 106 | if not (start <= i < end): 107 | continue 108 | if i == 4: 109 | net[4] = convert_norm(layer, norm, layer.num_features, idx, input_size) 110 | idx += 1 111 | else: 112 | down_sample = 2 ** (4 - i) 113 | layer.bn = convert_norm(layer.bn, norm, layer.bn.num_features, idx, input_size // down_sample) 114 | idx += 1 115 | return net 116 | 117 | -------------------------------------------------------------------------------- /vlnce_baselines/models/encoders/clip.py: -------------------------------------------------------------------------------- 1 | """ 2 | Adapted from: https://github.com/openai/CLIP/blob/main/clip/clip.py 3 | """ 4 | from collections import OrderedDict 5 | from typing import Tuple, Union 6 | 7 | import hashlib 8 | import os 9 | import urllib 10 | import warnings 11 | from tqdm import tqdm 12 | import numpy as np 13 | import torch 14 | import torch.nn.functional as F 15 | from torch import nn 16 | 17 | class LayerNorm(nn.LayerNorm): 18 | """Subclass torch's LayerNorm to handle fp16.""" 19 | 20 | def forward(self, x: torch.Tensor): 21 | orig_type = x.dtype 22 | ret = super().forward(x.type(torch.float32)) 23 | return ret.type(orig_type) 24 | 25 | 26 | class QuickGELU(nn.Module): 27 | def forward(self, x: torch.Tensor): 28 | return x * torch.sigmoid(1.702 * x) 29 | 30 | 31 | class ResidualAttentionBlock(nn.Module): 32 | def __init__(self, d_model: int, n_head: int, attn_mask=None): 33 | super().__init__() 34 | 35 | self.attn = nn.MultiheadAttention(d_model, n_head) 36 | self.ln_1 = LayerNorm(d_model) 37 | self.mlp = nn.Sequential(OrderedDict([ 38 | ("c_fc", nn.Linear(d_model, d_model * 4)), 39 | ("gelu", QuickGELU()), 40 | ("c_proj", nn.Linear(d_model * 4, d_model)) 41 | ])) 
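        # Pre-norm (pre-LN) layout: in forward() below, ln_1 normalizes the attention input and ln_2 (defined next) normalizes the MLP input; the LayerNorm subclass above upcasts to fp32 so the block stays stable under fp16.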
42 | self.ln_2 = LayerNorm(d_model) 43 | self.attn_mask = attn_mask 44 | 45 | def attention(self, x: torch.Tensor): 46 | attn_mask_ = self.attn_mask 47 | if self.attn_mask is not None and hasattr(self.attn_mask, '__call__'): 48 | attn_mask_ = self.attn_mask(x.size(0)) # LND 49 | 50 | attn_mask_ = attn_mask_.to(dtype=x.dtype, device=x.device) if attn_mask_ is not None else None 51 | return self.attn(x, x, x, need_weights=False, attn_mask=attn_mask_)[0] 52 | 53 | def forward(self, x): 54 | x = x + self.attention(self.ln_1(x)) 55 | x = x + self.mlp(self.ln_2(x)) 56 | return x 57 | 58 | 59 | class Transformer(nn.Module): 60 | def __init__(self, width: int, layers: int, heads: int, attn_mask = None): 61 | super().__init__() 62 | self.width = width 63 | self.layers = layers 64 | self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)]) 65 | 66 | def forward(self, x: torch.Tensor): 67 | return self.resblocks(x) 68 | 69 | class VisionTransformer(nn.Module): 70 | def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int): 71 | super().__init__() 72 | self.input_resolution = input_resolution 73 | self.layers = layers 74 | self.patch_size = patch_size 75 | 76 | self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False) 77 | scale = width ** -0.5 78 | self.class_embedding = nn.Parameter(scale * torch.randn(width)) 79 | self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width)) 80 | self.ln_pre = LayerNorm(width) 81 | self.transformer = Transformer(width, layers, heads) 82 | self.proj = nn.Parameter(scale * torch.randn(width, 512)) 83 | self.ln_post = LayerNorm(width) 84 | 85 | 86 | 87 | def forward(self, x: torch.Tensor): 88 | 89 | x = self.conv1(x) # shape = [*, width, grid, grid] 90 | x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2] 91 | x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] 92 | 93 | x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width] 94 | x = x + self.positional_embedding.to(x.dtype) 95 | x = self.ln_pre(x) 96 | 97 | x = x.permute(1, 0, 2) # NLD -> LND 98 | x = self.transformer(x) 99 | x = x.permute(1, 0, 2) # LND -> NLD 100 | x = self.ln_post(x) 101 | x = x @ self.proj 102 | return x 103 | 104 | class CLIP(nn.Module): 105 | def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int): 106 | super().__init__() 107 | 108 | self.visual = VisionTransformer(input_resolution, patch_size, width, layers, heads) 109 | 110 | transformer_width = 512 111 | self.vocab_size = 49408 112 | self.context_length = 77 113 | 114 | self.transformer = Transformer( 115 | width=512, 116 | layers=12, 117 | heads=8, 118 | attn_mask=self.build_attention_mask() 119 | ) 120 | self.token_embedding = nn.Embedding(self.vocab_size, transformer_width) 121 | self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width)) 122 | self.ln_final = LayerNorm(transformer_width) 123 | 124 | self.text_projection = nn.Parameter(torch.empty(transformer_width, 512)) 125 | self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) 126 | 127 | def build_attention_mask(self): 128 | # lazily create causal attention mask, with full attention between the vision tokens 129 | # pytorch uses additive attention mask; fill with -inf 130 
| mask = torch.empty(self.context_length, self.context_length) 131 | mask.fill_(float("-inf")) 132 | mask.triu_(1) # zero out the lower diagonal 133 | return mask 134 | 135 | def forward(self, x: torch.Tensor): 136 | 137 | return self.visual(x) 138 | 139 | def encode_image(self, x: torch.Tensor): 140 | return self.visual(x)[:,0] 141 | 142 | 143 | def encode_text(self, text): 144 | x = self.token_embedding(text) # [batch_size, n_ctx, d_model] 145 | x = x + self.positional_embedding 146 | x = x.permute(1, 0, 2) # NLD -> LND 147 | x = self.transformer(x) 148 | x = x.permute(1, 0, 2) # LND -> NLD 149 | x = self.ln_final(x) 150 | 151 | # x.shape = [batch_size, n_ctx, transformer.width] 152 | # take features from the eot embedding (eot_token is the highest number in each sequence) 153 | x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection 154 | 155 | return x 156 | 157 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NavMorph: A Self-Evolving World Model for Vision-and-Language Navigation in Continuous Environments 2 | 3 | **Xuan Yao, Junyu Gao, and Changsheng Xu** 4 | 5 | This repository is the official implementation of [NavMorph: A Self-Evolving World Model for Vision-and-Language Navigation in Continuous Environments](https://arxiv.org/abs/2506.23468). 6 | 7 | > Vision-and-Language Navigation in Continuous Environments (VLN-CE) requires agents to execute sequential navigation actions in complex environments guided by natural language instructions. Current approaches often struggle with generalizing to novel environments and adapting to ongoing changes during navigation. 8 | Inspired by human cognition, we present NavMorph, a self-evolving world model framework that enhances environmental understanding and decision-making in VLN-CE tasks. NavMorph employs compact latent representations to model environmental dynamics, equipping agents with foresight for adaptive planning and policy refinement. By integrating a novel Contextual Evolution Memory, NavMorph leverages scene-contextual information to support effective navigation while maintaining online adaptability. Extensive experiments demonstrate that our method achieves notable performance improvements on popular VLN-CE benchmarks. 9 | 10 | ![image](img/EWM.png) 11 | 12 | 13 | ## 🌍 Usage 14 | 15 | ### Prerequisites 16 | 17 | 1. Follow the [Habitat Installation Guide](https://github.com/facebookresearch/habitat-lab#installation) and [VLN-CE](https://github.com/jacobkrantz/VLN-CE) to install [`habitat-lab`](https://github.com/facebookresearch/habitat-lab) and [`habitat-sim`](https://github.com/facebookresearch/habitat-sim). We use version `v0.2.1` in our experiments. 18 | 19 | 2. Install `torch_kdtree` and `tinycudann`: follow instructions [here](https://github.com/MrZihan/Sim2Real-VLN-3DFF). 20 | 21 | 3. Install requirements: 22 | ```setup 23 | conda create --name morph python=3.7.11 24 | conda activate morph 25 | ``` 26 | * Required packages are listed in `environment.yaml`. You can install by running: 27 | 28 | ``` 29 | conda env create -f environment.yaml 30 | ``` 31 | 32 | 33 | ### Dataset Preparation 34 | 35 | 1. **Scenes for Matterport3D** 36 | 37 | > Instructions copied from [VLN-CE](https://github.com/jacobkrantz/VLN-CE) 38 | 39 | Matterport3D (MP3D) scene reconstructions are used. 
The official Matterport3D download script (`download_mp.py`) can be accessed by following the instructions on their [project webpage](https://niessner.github.io/Matterport/). The scene data can then be downloaded: 40 | 41 | ```bash 42 | # requires running with python 2.7 43 | python download_mp.py --task habitat -o data/scene_datasets/mp3d/ 44 | ``` 45 | 46 | Extract such that it has the form `scene_datasets/mp3d/{scene}/{scene}.glb`. There should be 90 scenes. Place the `scene_datasets` folder in `data/`. 47 | 48 | 2. **Data and Trained Models** 49 | 50 | Please download the pretrained models and checkpoints from [GoogleDrive](https://drive.google.com/file/d/1x01wods-LUA6EyAD8C3ahiEaO8lKD6jy/view?usp=sharing). 51 | 52 | ``` 53 | unzip NavMorph-8324.zip 54 | ``` 55 | Overall, files and folders should be organized as follows: 56 | 57 | ``` 58 | NavMorph 59 | ├── data 60 | │ ├── checkpoints 61 | │ │ └── ckpt.pth 62 | │ ├── vpm_1000_wm_im.pkl 63 | │ ├── datasets 64 | │ │ ├── R2R_VLNCE_v1-2 65 | │ │ ├── R2R_VLNCE_v1-2_preprocessed 66 | │ │ ├── R2R_VLNCE_v1-2_preprocessed_BERTidx 67 | │ │ └── RxR_VLNCE_v0_enc_xlmr 68 | │ ├── logs 69 | │ ├── scene_datasets 70 | │ └── wp_pred 71 | │ ├── check_cwp_bestdist_hfov90 72 | │ └── check_cwp_bestdist_hfov63 73 | ├── pretrained 74 | │ ├── NeRF_p16_8x8.pth 75 | │ ├── ViT-B-32.pt 76 | │ ├── segm.pt 77 | │ ├── resnet18-f37072fd.pth 78 | │ ├── cwp_predictor.pth 79 | │ └── model_step_100000.pt 80 | └── bert_config 81 | └── bert-base-uncased 82 | ``` 83 | 84 | 🧑‍💻 We will soon provide a clean, organized compressed package matching this structure for easy download. 85 | 86 | 3. **Supplementary Notes** 📌 87 | 88 | - **2025-11-28 Update:** → See [Issue #11](https://github.com/Feliciaxyao/NavMorph/issues/11) for details. 89 | 90 | Clarified missing pretrained files (e.g., the waypoint prediction models under `data/wp_pred/` and the vision backbone weights `data/pretrained/ViT-B-32.pth`) and provided external download links. 91 | 92 | - **2025-11-28 Update:** → See [Issue #12](https://github.com/Feliciaxyao/NavMorph/issues/12) for details. 93 | 94 | Clarified missing BERT model weights required by NavMorph (`data/bert_config/bert-base-uncased`) and provided external download links. 95 | 96 | - **2025-12-01 Update:** → See [Issue #13](https://github.com/Feliciaxyao/NavMorph/issues/13) for details. 97 | 98 | Clarified the absence of the datasets (`R2R_VLNCE_v1-2_preprocessed_BERTidx` and `RxR_VLNCE_v0_enc_xlmr`) and provided external download links. 99 | 100 | 101 | 102 | ### Training for R2R-CE / RxR-CE 103 | 104 | Use the pseudo interactive demonstrator to train the NavMorph world model: 105 | ``` 106 | bash run_r2r/main.bash train # (run_rxr/main.bash) 107 | ``` 108 | 109 | ### Online Evaluation on R2R-CE / RxR-CE 110 | 111 | Use the pseudo interactive demonstrator to evaluate the model equipped with our NavMorph: 112 | ``` 113 | bash run_r2r/main.bash eval # (run_rxr/main.bash) 114 | ``` 115 | 116 | ### Notes❗ 117 | 118 | When transitioning from the R2R dataset to the RxR dataset based on the baseline code, you need to adjust the camera settings in three places to prevent simulation issues. 119 | 120 | 1. **Camera HFOV and VFOV Adjustment**: 121 | In [vlnce_baselines/models/etp/nerf.py](https://github.com/Feliciaxyao/NavMorph/blob/ae3246b902cdedf8533211ff62b2062cb9ed0e39/vlnce_baselines/models/etp/nerf.py#L57-L60), update the camera's **HFOV** and **VFOV**: 122 | - Set `HFOV = 90` for R2R. 123 | - Set `HFOV = 79` for RxR. 124 | 125 | 2.
**Dataset Setting**: 126 | In [vlnce_baselines/models/Policy_ViewSelection_ETP.py](https://github.com/Feliciaxyao/NavMorph/blob/ae3246b902cdedf8533211ff62b2062cb9ed0e39/vlnce_baselines/models/Policy_ViewSelection_ETP.py#L41), modify the `DATASET` variable: 127 | - Set `DATASET = 'R2R'` for R2R. 128 | - Set `DATASET = 'RxR'` for RxR. 129 | 130 | 3. **Camera Configuration**: 131 | In [vlnce_baselines/ss_trainer_ETP.py](https://github.com/Feliciaxyao/NavMorph/blob/ae3246b902cdedf8533211ff62b2062cb9ed0e39/vlnce_baselines/ss_trainer_ETP.py#L181), ensure the camera configuration is updated: 132 | - Set `camera.config.HFOV = 90` for R2R. 133 | - Set `camera.config.HFOV = 79` for RxR. 134 | 135 | These adjustments are essential for proper camera calibration and to avoid discrepancies during simulation. 136 | 137 | ## 📢 TODO list: 138 | 139 | - ◻️ Checkpoints for RxR-CE release 140 | 141 | - ◻️ Pre-trained CEM for RxR-CE release 142 | 143 | - ◻️ Real-world Verification 144 | 145 | ## Acknowledgements 146 | Our implementations are partially based on [VLN-3DFF](https://github.com/MrZihan/Sim2Real-VLN-3DFF) and [ETPNav](https://github.com/MarSaKi/ETPNav). Thanks to the authors for sharing their code. 147 | 148 | 149 | ## Related Work 150 | * [Beyond the Nav-Graph: Vision-and-Language Navigation in Continuous Environments](https://arxiv.org/pdf/2004.02857) 151 | 152 | ## 📝 Citation 153 | 154 | If you find this project useful in your research, please consider citing: 155 | ``` 156 | @inproceedings{yao2025navmorph, 157 | title={NavMorph: A Self-Evolving World Model for Vision-and-Language Navigation in Continuous Environments}, 158 | author={Xuan Yao, Junyu Gao and Changsheng Xu}, 159 | booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, 160 | pages={5536-5546}, 161 | year={2025} 162 | } 163 | ``` -------------------------------------------------------------------------------- /vlnce_baselines/waypoint_networks/semantic_grid.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | 7 | class SemanticGrid(object): 8 | 9 | def __init__(self, batch_size, grid_dim, crop_size, cell_size, spatial_labels, object_labels): 10 | self.grid_dim = grid_dim 11 | self.cell_size = cell_size 12 | self.spatial_labels = spatial_labels 13 | self.object_labels = object_labels 14 | self.batch_size = batch_size 15 | self.crop_size = crop_size 16 | 17 | self.crop_start = int( (self.grid_dim[0] / 2) - (self.crop_size / 2) ) 18 | self.crop_end = int( (self.grid_dim[0] / 2) + (self.crop_size / 2) ) 19 | 20 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 21 | 22 | # observed ground-projected sem grid over entire scene 23 | self.occupancy_grid = torch.ones((self.batch_size, self.spatial_labels, self.grid_dim[0], self.grid_dim[1]), dtype=torch.float32, device=self.device) 24 | self.occupancy_grid = self.occupancy_grid*(1/self.spatial_labels) 25 | 26 | self.semantic_grid = torch.ones((self.batch_size, self.object_labels, self.grid_dim[0], self.grid_dim[1]), dtype=torch.float32, device=self.device) 27 | self.semantic_grid = self.semantic_grid*(1/self.object_labels) 28 | 29 | 30 | def pop(self, batch_id): 31 | self.batch_size -= 1 32 | self.occupancy_grid = torch.cat([self.occupancy_grid[:batch_id],self.occupancy_grid[batch_id+1:]],dim=0) 33 | self.semantic_grid = torch.cat([self.semantic_grid[:batch_id],self.semantic_grid[batch_id+1:]],dim=0) 34 | 35 | 36 | # Transform
each ground-projected grid into geocentric coordinates 37 | def spatialTransformer(self, grid, pose, abs_pose): 38 | # Input: 39 | # grid -- sequence len x number of classes x grid_dim x grid_dim 40 | # pose -- sequence len x 3 41 | # abs_pose -- same as pose 42 | 43 | geo_grid_out = torch.zeros((grid.shape[0], grid.shape[1], self.grid_dim[0], self.grid_dim[1]), dtype=torch.float32).to(grid.device) 44 | 45 | 46 | for j in range(grid.shape[0]): # sequence length 47 | 48 | init_pose = abs_pose[j,0,:] # init absolute pose of each sequence 49 | init_rot_mat = torch.tensor([[torch.cos(init_pose[2]), -torch.sin(init_pose[2])], 50 | [torch.sin(init_pose[2]),torch.cos(init_pose[2])]], dtype=torch.float32).to(grid.device) 51 | 52 | grid_step = grid[j,:,:,:].unsqueeze(0) 53 | pose_step = pose[j,:] 54 | 55 | rel_coord = torch.tensor([pose_step[1],pose_step[0]], dtype=torch.float32).to(grid.device) 56 | rel_coord = rel_coord.reshape((2,1)) 57 | rel_coord = torch.matmul(init_rot_mat,rel_coord) 58 | 59 | x = 2*(rel_coord[0]/self.cell_size)/(self.grid_dim[0]) 60 | z = 2*(rel_coord[1]/self.cell_size)/(self.grid_dim[1]) 61 | 62 | angle = pose_step[2] 63 | 64 | trans_theta = torch.tensor( [ [1, -0, x], [0, 1, z] ], dtype=torch.float32 ).unsqueeze(0) 65 | rot_theta = torch.tensor( [ [torch.cos(angle), -1.0*torch.sin(angle), 0], [torch.sin(angle), torch.cos(angle), 0] ], dtype=torch.float32 ).unsqueeze(0) 66 | trans_theta = trans_theta.to(grid.device) 67 | rot_theta = rot_theta.to(grid.device) 68 | 69 | trans_disp_grid = F.affine_grid(trans_theta, grid_step.size(), align_corners=False) # get grid translation displacement 70 | rot_disp_grid = F.affine_grid(rot_theta, grid_step.size(), align_corners=False) # get grid rotation displacement 71 | 72 | rot_geo_grid = F.grid_sample(grid_step, rot_disp_grid.float(), align_corners=False ) # apply rotation 73 | geo_grid = F.grid_sample(rot_geo_grid, trans_disp_grid.float(), align_corners=False) # apply translation 74 | 75 | geo_grid = geo_grid + 1e-12 76 | geo_grid_out[j,:,:,:] = geo_grid 77 | 78 | return geo_grid_out 79 | 80 | 81 | # Transform a geocentric map back to egocentric view 82 | def rotate_map(self, grid, rel_pose, abs_pose): 83 | # grid -- sequence len x number of classes x grid_dim x grid_dim 84 | # rel_pose -- sequence len x 3 85 | ego_grid_out = torch.zeros((grid.shape[0], grid.shape[1], self.grid_dim[0], self.grid_dim[1]), dtype=torch.float32).to(grid.device) 86 | 87 | for i in range(grid.shape[0]): # sequence length 88 | 89 | init_pose = abs_pose[i,0,:] # init absolute pose of each sequence 90 | init_rot_mat = torch.tensor([[torch.cos(init_pose[2]), -torch.sin(init_pose[2])], 91 | [torch.sin(init_pose[2]),torch.cos(init_pose[2])]], dtype=torch.float32).to(grid.device) 92 | 93 | grid_step = grid[i,:,:,:].unsqueeze(0) 94 | rel_pose_step = rel_pose[i,:] 95 | rel_coord = torch.tensor([rel_pose_step[1],rel_pose_step[0]], dtype=torch.float32).to(grid.device) 96 | rel_coord = rel_coord.reshape((2,1)) 97 | rel_coord = torch.matmul(init_rot_mat,rel_coord) 98 | x = -2*(rel_coord[0]/self.cell_size)/(self.grid_dim[0]) 99 | z = -2*(rel_coord[1]/self.cell_size)/(self.grid_dim[1]) 100 | angle = -rel_pose_step[2] 101 | 102 | trans_theta = torch.tensor( [ [1, -0, x], [0, 1, z] ], dtype=torch.float32 ).unsqueeze(0) 103 | rot_theta = torch.tensor( [ [torch.cos(angle), -1.0*torch.sin(angle), 0], [torch.sin(angle), torch.cos(angle), 0] ], dtype=torch.float32 ).unsqueeze(0) 104 | trans_theta = trans_theta.to(grid.device) 105 | rot_theta = rot_theta.to(grid.device) 106 | 107 
| trans_disp_grid = F.affine_grid(trans_theta, grid_step.size(), align_corners=False) # get grid translation displacement 108 | rot_disp_grid = F.affine_grid(rot_theta, grid_step.size(), align_corners=False) # get grid rotation displacement 109 | trans_ego_grid = F.grid_sample(grid_step, trans_disp_grid.float(), align_corners=False ) # apply translation 110 | ego_grid = F.grid_sample(trans_ego_grid, rot_disp_grid.float(), align_corners=False) # apply rotation 111 | ego_grid_out[i,:,:,:] = ego_grid 112 | return ego_grid_out 113 | 114 | 115 | def update_proj_grid_bayes(self, occup_grid=None, segm_grid=None): 116 | # Input geo_grid -- B x T (or 1) x num_of_classes x grid_dim x grid_dim 117 | # Update the ground-projected grid at each location4 118 | 119 | step_occup_grid = torch.zeros((occup_grid.shape[0], occup_grid.shape[1], self.spatial_labels, 120 | self.grid_dim[0], self.grid_dim[1]), dtype=torch.float32).to(occup_grid.device) 121 | occup_grid = occup_grid.to(self.device) 122 | for i in range(occup_grid.shape[1]): # sequence 123 | new_proj_grid = occup_grid[:,i,:,:,:] 124 | mul_proj_grid = new_proj_grid * self.occupancy_grid 125 | normalization_grid = torch.sum(mul_proj_grid, dim=1, keepdim=True) 126 | self.occupancy_grid = mul_proj_grid / normalization_grid.repeat(1, occup_grid.shape[2], 1, 1) 127 | step_occup_grid[:,i,:,:,:] = self.occupancy_grid.clone() 128 | 129 | 130 | step_segm_grid = torch.zeros((segm_grid.shape[0], segm_grid.shape[1], self.object_labels, 131 | self.grid_dim[0], self.grid_dim[1]), dtype=torch.float32).to(segm_grid.device) 132 | segm_grid = segm_grid.to(self.device) 133 | for i in range(segm_grid.shape[1]): # sequence 134 | new_proj_grid = segm_grid[:,i,:,:,:] 135 | mul_proj_grid = new_proj_grid * self.semantic_grid 136 | normalization_grid = torch.sum(mul_proj_grid, dim=1, keepdim=True) 137 | self.semantic_grid = mul_proj_grid / normalization_grid.repeat(1, segm_grid.shape[2], 1, 1) 138 | step_segm_grid[:,i,:,:,:] = self.semantic_grid.clone() 139 | 140 | return step_occup_grid, step_segm_grid 141 | 142 | -------------------------------------------------------------------------------- /vlnce_baselines/common/env_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import sys 4 | from typing import List, Optional, Type, Union 5 | 6 | import habitat 7 | from habitat import logger 8 | from habitat import Config, Env, RLEnv, VectorEnv, make_dataset 9 | from habitat_baselines.utils.env_utils import make_env_fn 10 | 11 | random.seed(0) 12 | 13 | SLURM_JOBID = os.environ.get("SLURM_JOB_ID", None) 14 | 15 | 16 | def is_slurm_job() -> bool: 17 | return SLURM_JOBID is not None 18 | 19 | 20 | def is_slurm_batch_job() -> bool: 21 | r"""Heuristic to determine if a slurm job is a batch job or not. Batch jobs 22 | will have a job name that is not a shell unless the user specifically set the job 23 | name to that of a shell. Interactive jobs have a shell name as their job name. 24 | """ 25 | return is_slurm_job() and os.environ.get("SLURM_JOB_NAME", None) not in ( 26 | None, 27 | "bash", 28 | "zsh", 29 | "fish", 30 | "tcsh", 31 | "sh", 32 | ) 33 | 34 | 35 | def construct_envs( 36 | config: Config, 37 | env_class: Type[Union[Env, RLEnv]], 38 | workers_ignore_signals: bool = False, 39 | auto_reset_done: bool = True, 40 | episodes_allowed: Optional[List[str]] = None, 41 | ) -> VectorEnv: 42 | r"""Create VectorEnv object with specified config and env class type. 
43 | To allow better performance, dataset are split into small ones for 44 | each individual env, grouped by scenes. 45 | :param config: configs that contain num_environments as well as information 46 | :param necessary to create individual environments. 47 | :param env_class: class type of the envs to be created. 48 | :param workers_ignore_signals: Passed to :ref:`habitat.VectorEnv`'s constructor 49 | :param auto_reset_done: Whether or not to automatically reset the env on done 50 | :return: VectorEnv object created according to specification. 51 | """ 52 | 53 | num_envs_per_gpu = config.NUM_ENVIRONMENTS 54 | if isinstance(config.SIMULATOR_GPU_IDS, list): 55 | gpus = config.SIMULATOR_GPU_IDS 56 | else: 57 | gpus = [config.SIMULATOR_GPU_IDS] 58 | num_gpus = len(gpus) 59 | num_envs = num_gpus * num_envs_per_gpu 60 | 61 | if episodes_allowed is not None: 62 | config.defrost() 63 | config.TASK_CONFIG.DATASET.EPISODES_ALLOWED = episodes_allowed 64 | config.freeze() 65 | 66 | configs = [] 67 | env_classes = [env_class for _ in range(num_envs)] 68 | dataset = make_dataset(config.TASK_CONFIG.DATASET.TYPE) 69 | scenes = config.TASK_CONFIG.DATASET.CONTENT_SCENES 70 | if "*" in config.TASK_CONFIG.DATASET.CONTENT_SCENES: 71 | scenes = dataset.get_scenes_to_load(config.TASK_CONFIG.DATASET) 72 | logger.info(f"SPLTI: {config.TASK_CONFIG.DATASET.SPLIT}, NUMBER OF SCENES: {len(scenes)}") 73 | 74 | if num_envs > 1: 75 | if len(scenes) == 0: 76 | raise RuntimeError( 77 | "No scenes to load, multi-process logic relies on being able" 78 | " to split scenes uniquely between processes" 79 | ) 80 | 81 | if len(scenes) < num_envs and len(scenes) != 1: 82 | raise RuntimeError( 83 | "reduce the number of GPUs or envs as there" 84 | " aren't enough number of scenes" 85 | ) 86 | 87 | random.shuffle(scenes) 88 | 89 | if len(scenes) == 1: 90 | scene_splits = [[scenes[0]] for _ in range(num_envs)] 91 | else: 92 | scene_splits = [[] for _ in range(num_envs)] 93 | for idx, scene in enumerate(scenes): 94 | scene_splits[idx % len(scene_splits)].append(scene) 95 | 96 | assert sum(map(len, scene_splits)) == len(scenes) 97 | 98 | for i in range(num_gpus): 99 | for j in range(num_envs_per_gpu): 100 | proc_config = config.clone() 101 | proc_config.defrost() 102 | proc_id = (i * num_envs_per_gpu) + j 103 | 104 | task_config = proc_config.TASK_CONFIG 105 | task_config.SEED += proc_id 106 | if len(scenes) > 0: 107 | task_config.DATASET.CONTENT_SCENES = scene_splits[proc_id] 108 | 109 | task_config.SIMULATOR.HABITAT_SIM_V0.GPU_DEVICE_ID = gpus[i] 110 | 111 | task_config.SIMULATOR.AGENT_0.SENSORS = config.SENSORS 112 | 113 | proc_config.freeze() 114 | configs.append(proc_config) 115 | 116 | is_debug = True if sys.gettrace() else False 117 | # env_entry = habitat.ThreadedVectorEnv if is_debug else habitat.VectorEnv 118 | env_entry = habitat.ThreadedVectorEnv 119 | envs = env_entry( 120 | make_env_fn=make_env_fn, 121 | env_fn_args=tuple(zip(configs, env_classes)), 122 | auto_reset_done=auto_reset_done, 123 | workers_ignore_signals=workers_ignore_signals, 124 | ) 125 | return envs 126 | 127 | 128 | def construct_envs_auto_reset_false( 129 | config: Config, env_class: Type[Union[Env, RLEnv]] 130 | ) -> VectorEnv: 131 | return construct_envs(config, env_class, auto_reset_done=False) 132 | 133 | def construct_envs_for_rl( 134 | config: Config, 135 | env_class: Type[Union[Env, RLEnv]], 136 | workers_ignore_signals: bool = False, 137 | auto_reset_done: bool = True, 138 | episodes_allowed: Optional[List[str]] = None, 139 | ) -> VectorEnv: 140 
| r"""Create VectorEnv object with specified config and env class type. 141 | To allow better performance, dataset are split into small ones for 142 | each individual env, grouped by scenes. 143 | :param config: configs that contain num_environments as well as information 144 | :param necessary to create individual environments. 145 | :param env_class: class type of the envs to be created. 146 | :param workers_ignore_signals: Passed to :ref:`habitat.VectorEnv`'s constructor 147 | :param auto_reset_done: Whether or not to automatically reset the env on done 148 | :return: VectorEnv object created according to specification. 149 | """ 150 | 151 | num_envs_per_gpu = config.NUM_ENVIRONMENTS 152 | if isinstance(config.SIMULATOR_GPU_IDS, list): 153 | gpus = config.SIMULATOR_GPU_IDS 154 | else: 155 | gpus = [config.SIMULATOR_GPU_IDS] 156 | num_gpus = len(gpus) 157 | num_envs = num_gpus * num_envs_per_gpu 158 | 159 | if episodes_allowed is not None: 160 | config.defrost() 161 | config.TASK_CONFIG.DATASET.EPISODES_ALLOWED = episodes_allowed 162 | config.freeze() 163 | 164 | configs = [] 165 | env_classes = [env_class for _ in range(num_envs)] 166 | dataset = make_dataset(config.TASK_CONFIG.DATASET.TYPE) 167 | scenes = config.TASK_CONFIG.DATASET.CONTENT_SCENES 168 | if "*" in config.TASK_CONFIG.DATASET.CONTENT_SCENES: 169 | scenes = dataset.get_scenes_to_load(config.TASK_CONFIG.DATASET) 170 | 171 | if num_envs > 1: 172 | if len(scenes) == 0: 173 | raise RuntimeError( 174 | "No scenes to load, multi-process logic relies on being able" 175 | " to split scenes uniquely between processes" 176 | ) 177 | 178 | if len(scenes) < num_envs and len(scenes) != 1: 179 | raise RuntimeError( 180 | "reduce the number of GPUs or envs as there" 181 | " aren't enough number of scenes" 182 | ) 183 | random.shuffle(scenes) 184 | 185 | if len(scenes) == 1: 186 | scene_splits = [[scenes[0]] for _ in range(num_envs)] 187 | else: 188 | scene_splits = [[] for _ in range(num_envs)] 189 | for idx, scene in enumerate(scenes): 190 | scene_splits[idx % len(scene_splits)].append(scene) 191 | 192 | assert sum(map(len, scene_splits)) == len(scenes) 193 | 194 | for i in range(num_gpus): 195 | for j in range(num_envs_per_gpu): 196 | proc_config = config.clone() 197 | proc_config.defrost() 198 | proc_id = (i * num_envs_per_gpu) + j 199 | 200 | task_config = proc_config.TASK_CONFIG 201 | task_config.SEED += proc_id 202 | if len(scenes) > 0: 203 | task_config.DATASET.CONTENT_SCENES = scene_splits[proc_id] 204 | 205 | task_config.SIMULATOR.HABITAT_SIM_V0.GPU_DEVICE_ID = gpus[i] 206 | 207 | task_config.SIMULATOR.AGENT_0.SENSORS = config.SENSORS 208 | 209 | proc_config.freeze() 210 | configs.append(proc_config) 211 | 212 | is_debug = True if sys.gettrace() else False 213 | env_entry = habitat.ThreadedVectorEnv if is_debug else habitat.VectorEnv 214 | envs = env_entry( 215 | make_env_fn=make_env_fn, 216 | env_fn_args=tuple(zip(configs, env_classes)), 217 | auto_reset_done=auto_reset_done, 218 | workers_ignore_signals=workers_ignore_signals, 219 | ) 220 | return envs 221 | -------------------------------------------------------------------------------- /vlnce_baselines/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | import numpy as np 4 | import math 5 | import copy 6 | 7 | class ARGS(): 8 | def __init__(self): 9 | self.local_rank = 0 10 | 11 | def reduce_loss(tensor, rank, world_size): 12 | with torch.no_grad(): 13 | dist.reduce(tensor, 
dst=0) 14 | if rank == 0: 15 | # print(tensor) 16 | tensor /= world_size 17 | 18 | def gather_list_and_concat(list_of_nums,world_size): 19 | if not torch.is_tensor(list_of_nums): 20 | tensor = torch.Tensor(list_of_nums).cuda() 21 | else: 22 | if list_of_nums.is_cuda == False: 23 | tensor = list_of_nums.cuda() 24 | else: 25 | tensor = list_of_nums 26 | gather_t = [torch.ones_like(tensor) for _ in 27 | range(world_size)] 28 | dist.all_gather(gather_t, tensor) 29 | return gather_t 30 | 31 | def repeat_allocation(allocations, max_number): 32 | if torch.is_tensor(max_number): 33 | max_number = max_number.long().item() 34 | else: 35 | max_number = max_number.long() 36 | allocation_number = len(allocations) 37 | repeat_time, res = max_number // allocation_number, max_number % allocation_number 38 | allocations_ = [] 39 | for i in range(repeat_time): 40 | allocations_ += copy.deepcopy(allocations) 41 | allocations_ += copy.deepcopy(allocations)[:res] 42 | 43 | return allocations_ 44 | 45 | 46 | def allocate(number, ep_length, size_per_time): 47 | length_to_indexes = {ep_length[i]: [] for i in 48 | range(len(ep_length))} 49 | for i in range(len(ep_length)): 50 | length_to_indexes[ep_length[i]] += [i]*number[i] 51 | 52 | values = [] 53 | for i in range(len(number)): 54 | values += [ep_length[i]] * number[i] 55 | 56 | groups = int((len(values) - 0.01) // size_per_time + 1) 57 | 58 | values.sort(reverse=True) 59 | 60 | load_balance_groups = [[] for grp in range(groups)] 61 | 62 | for v in values: 63 | load_balance_groups.sort(key=lambda x: sum(x)) 64 | load_balance_groups[0].append(v) 65 | 66 | indexes = [] 67 | set_length = list(set(ep_length)) 68 | for i in range(groups): 69 | index = np.zeros(len(load_balance_groups[i]),dtype=int) 70 | for j in range(len(set_length)): 71 | length_indexes = length_to_indexes[set_length[j]] 72 | position = np.where(np.array(load_balance_groups[i]) == 73 | set_length[j])[0] 74 | position_length = len(position) 75 | # print(position_length,j) 76 | index[position] = length_indexes[:position_length] 77 | # print(length_indexes) 78 | length_to_indexes[set_length[j]] = length_indexes[position_length:] 79 | indexes.append((index).tolist()) 80 | 81 | return indexes 82 | 83 | def allocate_instructions(instruction_lengths, allocations,ep_length, instruction_ids): 84 | instruction_ids_copy = copy.deepcopy(instruction_ids) 85 | allocations_copy = copy.deepcopy(allocations) 86 | instruction_lengths_copy = copy.deepcopy(instruction_lengths) 87 | values = [] 88 | value_indexes = [] 89 | weights = [] 90 | for i in range(len(instruction_lengths)): 91 | instruction_length = instruction_lengths[i] 92 | values += instruction_length 93 | value_indexes += len(instruction_length)*[i] 94 | weights += [ep_length[i]] * len(instruction_length) 95 | # values = np.array(values) 96 | # value_indexes = np.array(value_indexes) 97 | values = np.array(values) 98 | weights = np.array(weights) 99 | value_indexes = np.array(value_indexes) 100 | sorted_index = np.argsort(values*weights)[::-1] 101 | values = values[sorted_index] 102 | value_indexes = value_indexes[sorted_index] 103 | weights = weights[sorted_index] 104 | 105 | groups = len(allocations) 106 | load_balance_groups = [[] for grp in range(groups)] 107 | group_weights = [[] for grp in range(groups)] 108 | instruction_allocations = [[] for grp in range(groups)] 109 | for j in range(len(values)): 110 | summation = np.array([np.sum(np.array(load_balance_groups[i])*np.array(group_weights[i])) for i in range(groups)]) 111 | sorted_index = 
np.argsort(summation) 112 | for i in sorted_index: 113 | index = value_indexes[j] 114 | value = values[j] 115 | if index in allocations_copy[i]: 116 | allocations_copy[i].remove(index) 117 | load_balance_groups[i].append(value) 118 | group_weights[i].append(weights[j]) 119 | # check[i].append(index) 120 | index_in_length = np.where(np.array(instruction_lengths_copy[index]) == value)[0][0] 121 | instruction_lengths_copy[index].pop(index_in_length) 122 | instruction_allocations[i].append(instruction_ids_copy[index].pop(index_in_length)) 123 | break 124 | 125 | return instruction_allocations 126 | 127 | 128 | def allocate_by_scene_for_ddp(number, ep_length, size_per_time): 129 | length_to_indexes = {ep_length[i]: [] for i in 130 | range(len(ep_length))} 131 | for i in range(len(ep_length)): 132 | length_to_indexes[ep_length[i]] += [i]*number[i] 133 | 134 | values = [] 135 | for i in range(len(number)): 136 | values += [ep_length[i]] * number[i] 137 | 138 | groups = int((len(values) - 0.01) // size_per_time + 1) 139 | 140 | values.sort(reverse=True) 141 | 142 | load_balance_groups = [[] for grp in range(groups)] 143 | 144 | for v in values: 145 | load_balance_groups.sort(key=lambda x: sum(x)) 146 | load_balance_groups[0].append(v) 147 | 148 | indexes = [] 149 | set_length = list(set(ep_length)) 150 | for i in range(groups): 151 | index = np.zeros(len(load_balance_groups[i]),dtype=int) 152 | for j in range(len(set_length)): 153 | length_indexes = length_to_indexes[set_length[j]] 154 | position = np.where(np.array(load_balance_groups[i]) == 155 | set_length[j])[0] 156 | position_length = len(position) 157 | # print(position_length,j) 158 | index[position] = length_indexes[:position_length] 159 | # print(length_indexes) 160 | length_to_indexes[set_length[j]] = length_indexes[position_length:] 161 | indexes.append((index).tolist()) 162 | 163 | return indexes 164 | 165 | 166 | def get_camera_orientations12(): 167 | base_angle_deg = 30 168 | base_angle_rad = math.pi / 6 169 | orient_dict = {} 170 | for k in range(1,12): 171 | orient_dict[str(base_angle_deg*k)] = [0.0, base_angle_rad*k, 0.0] 172 | return orient_dict 173 | 174 | 175 | def get_camera_orientations24(): 176 | base_angle_deg = 15 177 | base_angle_rad = math.pi / 12 178 | orient_dict = {} 179 | for k in range(1,24): 180 | orient_dict[str(base_angle_deg*k)] = [0.0, base_angle_rad*k, 0.0] 181 | return orient_dict 182 | 183 | 184 | def length2mask(length, size=None): 185 | batch_size = len(length) 186 | size = int(max(length)) if size is None else size 187 | mask = (torch.arange(size, dtype=torch.int64).unsqueeze(0).repeat(batch_size, 1) 188 | > (torch.LongTensor(length) - 1).unsqueeze(1)).cuda() 189 | return mask 190 | 191 | 192 | def dir_angle_feature(angle_list, device=None): 193 | feature_dim = 64 194 | batch_size = len(angle_list) 195 | max_leng = max([len(k) for k in angle_list]) + 1 # +1 for stop 196 | heading_enc = torch.zeros( 197 | batch_size, max_leng, feature_dim, dtype=torch.float32) 198 | 199 | for i in range(batch_size): 200 | for j, angle_rad in enumerate(angle_list[i]): 201 | heading_enc[i][j] = torch.tensor( 202 | [math.sin(angle_rad), 203 | math.cos(angle_rad)] * (feature_dim // 2)) 204 | 205 | return heading_enc 206 | 207 | 208 | def dir_angle_feature_with_ele(angle_list, device=None): 209 | feature_dim = 128 210 | batch_size = len(angle_list) 211 | max_leng = max([len(k) for k in angle_list]) + 1 # +1 for stop 212 | heading_enc = torch.zeros( 213 | batch_size, max_leng, feature_dim, dtype=torch.float32) 214 | 215 | for i 
in range(batch_size): 216 | for j, angle_rad in enumerate(angle_list[i]): 217 | heading_enc[i][j] = torch.tensor( 218 | [ 219 | math.sin(angle_rad), math.cos(angle_rad), 220 | math.sin(0.0), math.cos(0.0), # elevation 221 | ] * (128 // 4)) 222 | 223 | return heading_enc 224 | 225 | 226 | 227 | -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | name: morph 2 | channels: 3 | - aihabitat 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=conda_forge 8 | - _openmp_mutex=4.5=1_llvm 9 | - attrs=21.4.0=pyhd8ed1ab_0 10 | - bzip2=1.0.8=h7b6447c_0 11 | - c-ares=1.18.1=h7f8727e_0 12 | - ca-certificates=2024.7.2=h06a4308_0 13 | - certifi=2022.12.7=py37h06a4308_0 14 | - cmake=3.14.0=h52cb24c_0 15 | - colorama=0.4.4=pyh9f0ad1d_0 16 | - cycler=0.11.0=pyhd8ed1ab_0 17 | - expat=2.4.4=h295c915_0 18 | - ffmpeg=4.3.2=h37c90e5_3 19 | - freetype=2.10.4=h0708190_1 20 | - fribidi=1.0.10=h36c2ea0_0 21 | - giflib=5.2.1=h36c2ea0_2 22 | - gitdb=4.0.9=pyhd8ed1ab_0 23 | - gitpython=3.1.27=pyhd8ed1ab_0 24 | - gmp=6.2.1=h58526e2_0 25 | - gnutls=3.6.13=h85f3911_1 26 | - habitat-sim-challenge-2022=0.2.1=py3.7_headless_bullet_linux_f02ce8317b4bcbccf1ed92a6436f6aabc9aecec6 27 | - habitat-sim-mutex=1.0=headless_bullet 28 | - headless=2.0=0 29 | - icu=67.1=he1b5a44_0 30 | - imageio=2.16.1=pyhcf75d05_0 31 | - imageio-ffmpeg=0.4.5=pyhd8ed1ab_0 32 | - jbig=2.1=h7f98852_2003 33 | - jpeg=9e=h7f98852_0 34 | - kiwisolver=1.4.0=py37h7cecad7_0 35 | - krb5=1.19.2=hac12032_0 36 | - lame=3.100=h7f98852_1001 37 | - lcms2=2.12=hddcbb42_0 38 | - ld_impl_linux-64=2.35.1=h7274673_9 39 | - lerc=3.0=h9c3ff4c_0 40 | - libblas=3.9.0=13_linux64_openblas 41 | - libcblas=3.9.0=13_linux64_openblas 42 | - libcurl=7.80.0=h0b77cf5_0 43 | - libdeflate=1.10=h7f98852_0 44 | - libedit=3.1.20210910=h7f8727e_0 45 | - libev=4.33=h7f8727e_1 46 | - libffi=3.3=he6710b0_2 47 | - libgcc-ng=11.2.0=h1d223b6_14 48 | - libgfortran-ng=11.2.0=h69a702a_14 49 | - libgfortran5=11.2.0=h5c6108e_14 50 | - libimagequant=2.17.0=h7f98852_1 51 | - liblapack=3.9.0=13_linux64_openblas 52 | - libllvm11=11.1.0=hf817b99_3 53 | - libnghttp2=1.46.0=hce63b2e_0 54 | - libopenblas=0.3.18=pthreads_h8fe5266_0 55 | - libpng=1.6.37=h21135ba_2 56 | - libssh2=1.9.0=h1ba5d50_1 57 | - libstdcxx-ng=11.2.0=he4da1e4_14 58 | - libtiff=4.3.0=h542a066_3 59 | - libwebp=1.2.2=h3452ae3_0 60 | - libwebp-base=1.2.2=h7f98852_1 61 | - libxcb=1.13=h7f98852_1004 62 | - libzlib=1.2.11=h36c2ea0_1013 63 | - llvm-openmp=13.0.1=he0ac6c6_1 64 | - lz4-c=1.9.3=h9c3ff4c_1 65 | - matplotlib=3.2.2=1 66 | - matplotlib-base=3.2.2=py37h1d35a4c_1 67 | - ncurses=6.3=h7f8727e_2 68 | - nettle=3.6=he412f7d_0 69 | - openh264=2.1.1=h780b84a_0 70 | - openjpeg=2.4.0=hb52868f_1 71 | - openssl=1.1.1w=h7f8727e_0 72 | - pip=21.2.2=py37h06a4308_0 73 | - pthread-stubs=0.4=h36c2ea0_1001 74 | - pyparsing=3.0.7=pyhd8ed1ab_0 75 | - python=3.7.11=h12debd9_0 76 | - python-dateutil=2.8.2=pyhd8ed1ab_0 77 | - python_abi=3.7=2_cp37m 78 | - quaternion=2022.2.10.14.20.39=py37h5e8e339_0 79 | - readline=8.1.2=h7f8727e_1 80 | - rhash=1.4.1=h3c74f83_1 81 | - scipy=1.7.3=py37hf2a6cf1_0 82 | - seaborn=0.12.2=py37h06a4308_0 83 | - six=1.16.0=pyh6c4a22f_0 84 | - smmap=3.0.5=pyh44b312d_0 85 | - sqlite=3.37.2=hc218d9a_0 86 | - tk=8.6.11=h1ccaba5_0 87 | - tornado=6.2=py37h5eee18b_0 88 | - tqdm=4.63.0=pyhd8ed1ab_0 89 | - typing_extensions=4.3.0=py37h06a4308_0 90 | - x264=1!161.3030=h7f98852_1 91 | - 
xorg-fixesproto=5.0=h7f98852_1002 92 | - xorg-inputproto=2.3.2=h7f98852_1002 93 | - xorg-kbproto=1.0.7=h7f98852_1002 94 | - xorg-libx11=1.7.2=h7f98852_0 95 | - xorg-libxau=1.0.9=h7f98852_0 96 | - xorg-libxcursor=1.2.0=h7f98852_0 97 | - xorg-libxdmcp=1.1.3=h7f98852_0 98 | - xorg-libxext=1.3.4=h7f98852_1 99 | - xorg-libxfixes=5.0.3=h7f98852_1004 100 | - xorg-libxi=1.7.10=h7f98852_0 101 | - xorg-libxinerama=1.1.4=h9c3ff4c_1001 102 | - xorg-libxrandr=1.5.2=h7f98852_1 103 | - xorg-libxrender=0.9.10=h7f98852_1003 104 | - xorg-randrproto=1.5.0=h7f98852_1001 105 | - xorg-renderproto=0.11.1=h7f98852_1002 106 | - xorg-xextproto=7.3.0=h7f98852_1002 107 | - xorg-xproto=7.0.31=h7f98852_1007 108 | - xz=5.2.5=h7b6447c_0 109 | - zlib=1.2.11=h36c2ea0_1013 110 | - zstd=1.5.2=ha95c52a_0 111 | - pip: 112 | - absl-py==1.0.0 113 | - addict==2.4.0 114 | - appdirs==1.4.4 115 | - astor==0.8.1 116 | - astunparse==1.6.3 117 | - backcall==0.2.0 118 | - backports-cached-property==1.0.2 119 | - beautifulsoup4==4.12.3 120 | - boto3==1.33.13 121 | - botocore==1.33.13 122 | - braceexpand==0.1.7 123 | - cachetools==5.0.0 124 | - charset-normalizer==2.0.12 125 | - click==8.0.4 126 | - clip==1.0 127 | - cloudpickle==2.0.0 128 | - comm==0.1.4 129 | - configargparse==1.7 130 | - dash==2.15.0 131 | - dash-core-components==2.0.0 132 | - dash-html-components==2.0.0 133 | - dash-table==5.0.0 134 | - decorator==4.4.2 135 | - distlib==0.3.8 136 | - docker-pycreds==0.4.0 137 | - docstring-parser==0.14.1 138 | - dtw==1.4.0 139 | - fastdtw==0.3.4 140 | - fastjsonschema==2.20.0 141 | - filelock==3.12.2 142 | - flask==2.2.5 143 | - flatbuffers==24.3.25 144 | - frozendict==2.4.4 145 | - ftfy==6.1.1 146 | - gast==0.4.0 147 | - gdown==4.7.3 148 | - google-auth==2.6.0 149 | - google-auth-oauthlib==0.4.6 150 | - google-pasta==0.2.0 151 | - grpcio==1.45.0rc1 152 | - gym==0.23.1 153 | - gym-notices==0.0.6 154 | - h5py==3.8.0 155 | - habitat-sim==0.2.1 156 | - huggingface-hub==0.0.12 157 | - idna==3.3 158 | - ifcfg==0.22 159 | - importlib-metadata==6.7.0 160 | - importlib-resources==5.12.0 161 | - ipython==7.34.0 162 | - ipywidgets==8.1.3 163 | - itsdangerous==2.1.2 164 | - jedi==0.19.1 165 | - jinja2==3.1.4 166 | - jmespath==1.0.1 167 | - joblib==1.3.2 168 | - jsonlines==3.1.0 169 | - jsonschema==4.17.3 170 | - jupyter-core==4.12.0 171 | - jupyterlab-widgets==3.0.11 172 | - keras==2.11.0 173 | - keras-applications==1.0.8 174 | - keras-preprocessing==1.1.2 175 | - libclang==18.1.1 176 | - llvmlite==0.31.0 177 | - lmdb==1.3.0 178 | - markdown==3.3.6 179 | - markdown-it-py==2.2.0 180 | - markupsafe==2.1.5 181 | - matplotlib-inline==0.1.6 182 | - mdurl==0.1.2 183 | - mock==5.1.0 184 | - moviepy==2.0.0.dev2 185 | - msgpack==1.0.3 186 | - msgpack-numpy==0.4.7.1 187 | - nbformat==5.7.0 188 | - nest-asyncio==1.6.0 189 | - networkx==2.6.3 190 | - numba==0.48.0 191 | - numpy==1.21.6 192 | - oauthlib==3.2.0 193 | - objectio==0.2.29 194 | - open3d==0.17.0 195 | - opencv-python==4.5.5.64 196 | - opt-einsum==3.3.0 197 | - packaging==24.0 198 | - pandas==1.3.0 199 | - pandocfilters==1.5.0 200 | - parso==0.8.4 201 | - pathtools==0.1.2 202 | - pexpect==4.9.0 203 | - pickle5==0.0.12 204 | - pickleshare==0.7.5 205 | - pillow==9.5.0 206 | - pkgutil-resolve-name==1.3.10 207 | - platformdirs==3.11.0 208 | - plotly==5.18.0 209 | - prompt-toolkit==3.0.47 210 | - protobuf==3.19.6 211 | - psutil==6.0.0 212 | - ptyprocess==0.7.0 213 | - pyasn1==0.4.8 214 | - pyasn1-modules==0.2.8 215 | - pygments==2.17.2 216 | - pyliblzfse==0.4.1 217 | - pyquaternion==0.9.9 218 | - 
pyrsistent==0.19.3 219 | - pysocks==1.7.1 220 | - pytorch-transformers==1.2.0 221 | - pytz==2024.1 222 | - pywavelets==1.3.0 223 | - pyyaml==6.0 224 | - regex==2024.4.16 225 | - requests==2.27.1 226 | - requests-oauthlib==1.3.1 227 | - retrying==1.3.4 228 | - rich==13.7.1 229 | - rsa==4.8 230 | - s3transfer==0.8.2 231 | - sacremoses==0.0.53 232 | - scikit-image==0.19.3 233 | - scikit-learn==1.0.2 234 | - sentencepiece==0.2.0 235 | - sentry-sdk==2.7.1 236 | - setproctitle==1.3.3 237 | - setuptools==68.0.0 238 | - shtab==1.7.1 239 | - simplejson==3.17.6 240 | - soupsieve==2.4.1 241 | - tenacity==8.2.3 242 | - tensorboard==2.11.2 243 | - tensorboard-data-server==0.6.1 244 | - tensorboard-plugin-wit==1.8.1 245 | - tensorflow==2.11.0 246 | - tensorflow-estimator==2.11.0 247 | - tensorflow-io-gcs-filesystem==0.34.0 248 | - termcolor==2.3.0 249 | - threadpoolctl==3.1.0 250 | - tifffile==2021.11.2 251 | - tinycudann==1.7 252 | - tokenizers==0.10.3 253 | - torch==1.12.1+cu113 254 | - torch-kdtree==1.0 255 | - torchvision==0.13.1+cu113 256 | - traitlets==5.8.0 257 | - transformers==4.9.2 258 | - trimesh==3.9.1 259 | - trove-classifiers==2023.7.6 260 | - typeguard==3.0.2 261 | - typer==0.4.0 262 | - typing-extensions==4.7.1 263 | - tyro==0.5.2 264 | - urllib3==1.26.15 265 | - virtualenv==20.23.1 266 | - viser==0.0.10 267 | - wandb==0.15.2 268 | - wcwidth==0.2.5 269 | - webdataset==0.1.40 270 | - webencodings==0.5.1 271 | - websocket-client==1.4.2 272 | - websockets==11.0.3 273 | - werkzeug==2.2.3 274 | - wheel==0.37.1 275 | - widgetsnbextension==4.0.11 276 | - wrapt==1.14.1 277 | - xatlas==0.0.7 278 | - xxhash==3.2.0 279 | - yacs==0.1.8 280 | - zipp==3.7.0 281 | prefix: /opt/conda/envs/morph 282 | -------------------------------------------------------------------------------- /vlnce_baselines/config/default.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Union 2 | 3 | import habitat_baselines.config.default 4 | from habitat.config.default import CONFIG_FILE_SEPARATOR 5 | from habitat.config.default import Config as CN 6 | 7 | from habitat_extensions.config.default import ( 8 | get_extended_config as get_task_config, 9 | ) 10 | 11 | # ----------------------------------------------------------------------------- 12 | # EXPERIMENT CONFIG 13 | # ----------------------------------------------------------------------------- 14 | _C = CN() 15 | _C.BASE_TASK_CONFIG_PATH = "habitat_extensions/config/vlnce_task.yaml" 16 | _C.TASK_CONFIG = CN() # task_config will be stored as a config node 17 | _C.TRAINER_NAME = "dagger" 18 | _C.ENV_NAME = "VLNCEDaggerEnv" 19 | _C.SIMULATOR_GPU_IDS = [0] 20 | _C.VIDEO_OPTION = [] # options: "disk", "tensorboard" 21 | _C.VIDEO_DIR = "videos/debug" 22 | _C.TENSORBOARD_DIR = "data/tensorboard_dirs/debug" 23 | _C.RESULTS_DIR = "data/checkpoints/pretrained/evals" 24 | 25 | # ----------------------------------------------------------------------------- 26 | # EVAL CONFIG 27 | # ----------------------------------------------------------------------------- 28 | _C.EVAL = CN() 29 | # The split to evaluate on 30 | _C.EVAL.SPLIT = "val_seen" 31 | _C.EVAL.EPISODE_COUNT = -1 32 | _C.EVAL.LANGUAGES = ["en-US", "en-IN"] 33 | _C.EVAL.SAMPLE = False 34 | _C.EVAL.SAVE_RESULTS = True 35 | _C.EVAL.EVAL_NONLEARNING = False 36 | _C.EVAL.NONLEARNING = CN() 37 | _C.EVAL.NONLEARNING.AGENT = "RandomAgent" 38 | 39 | # ----------------------------------------------------------------------------- 40 | # INFERENCE CONFIG 41 | # 
----------------------------------------------------------------------------- 42 | _C.INFERENCE = CN() 43 | _C.INFERENCE.SPLIT = "test" 44 | _C.INFERENCE.LANGUAGES = ["en-US", "en-IN"] 45 | _C.INFERENCE.SAMPLE = False 46 | _C.INFERENCE.USE_CKPT_CONFIG = True 47 | _C.INFERENCE.CKPT_PATH = "data/checkpoints/CMA_PM_DA_Aug.pth" 48 | _C.INFERENCE.PREDICTIONS_FILE = "predictions.json" 49 | _C.INFERENCE.INFERENCE_NONLEARNING = False 50 | _C.INFERENCE.NONLEARNING = CN() 51 | _C.INFERENCE.NONLEARNING.AGENT = "RandomAgent" 52 | _C.INFERENCE.FORMAT = "rxr" # either 'rxr' or 'r2r' 53 | # ----------------------------------------------------------------------------- 54 | # IMITATION LEARNING CONFIG 55 | # ----------------------------------------------------------------------------- 56 | _C.IL = CN() 57 | _C.IL.lr = 2.5e-4 58 | _C.IL.batch_size = 5 59 | _C.IL.epochs = 4 60 | _C.IL.use_iw = True 61 | # inflection coefficient for RxR training set GT trajectories (guide): 1.9 62 | # inflection coefficient for R2R training set GT trajectories: 3.2 63 | _C.IL.inflection_weight_coef = 3.2 64 | # load an already trained model for fine tuning 65 | _C.IL.waypoint_aug = False 66 | _C.IL.load_from_ckpt = False 67 | _C.IL.ckpt_to_load = "data/checkpoints/ckpt.0.pth" 68 | # if True, loads the optimizer state, epoch, and step_id from the ckpt dict. 69 | _C.IL.is_requeue = False 70 | # if True, start training from the saved epoch 71 | # ----------------------------------------------------------------------------- 72 | # IL: RXR TRAINER CONFIG 73 | # ----------------------------------------------------------------------------- 74 | _C.IL.RECOLLECT_TRAINER = CN() 75 | _C.IL.RECOLLECT_TRAINER.preload_trajectories_file = True 76 | _C.IL.RECOLLECT_TRAINER.trajectories_file = ( 77 | "data/trajectories_dirs/debug/trajectories.json.gz" 78 | ) 79 | # if set to a positive int, episodes with longer paths are ignored in training 80 | _C.IL.RECOLLECT_TRAINER.max_traj_len = -1 81 | # if set to a positive int, effective_batch_size must be some multiple of 82 | # IL.batch_size. Gradient accumulation enables an arbitrarily high "effective" 83 | # batch size. 84 | _C.IL.RECOLLECT_TRAINER.effective_batch_size = -1 85 | _C.IL.RECOLLECT_TRAINER.preload_size = 30 86 | _C.IL.RECOLLECT_TRAINER.use_iw = True 87 | _C.IL.RECOLLECT_TRAINER.gt_file = ( 88 | "data/datasets/RxR_VLNCE_v0_enc_xlmr/{split}/{split}_{role}_gt.json.gz" 89 | ) 90 | # ----------------------------------------------------------------------------- 91 | # IL: DAGGER CONFIG 92 | # ----------------------------------------------------------------------------- 93 | _C.IL.DAGGER = CN() 94 | _C.IL.DAGGER.iterations = 10 95 | _C.IL.DAGGER.update_size = 5000 96 | _C.IL.DAGGER.p = 0.75 97 | _C.IL.DAGGER.expert_policy_sensor = "SHORTEST_PATH_SENSOR" 98 | _C.IL.DAGGER.expert_policy_sensor_uuid = "shortest_path_sensor" 99 | _C.IL.DAGGER.load_space = False 100 | # if True, load saved observation space and action space 101 | _C.IL.DAGGER.lmdb_map_size = 1.0e12 102 | # if True, saves data to disk in fp16 and converts back to fp32 when loading. 103 | _C.IL.DAGGER.lmdb_fp16 = False 104 | # How often to commit the writes to the DB; fewer commits are 105 | # better, but everything must be in memory until a commit happens. 106 | _C.IL.DAGGER.lmdb_commit_frequency = 500 107 | # If True, load precomputed features directly from lmdb_features_dir. 
108 | _C.IL.DAGGER.preload_lmdb_features = False 109 | _C.IL.DAGGER.lmdb_features_dir = ( 110 | "data/trajectories_dirs/debug/trajectories.lmdb" 111 | ) 112 | # ----------------------------------------------------------------------------- 113 | # RL CONFIG 114 | # ----------------------------------------------------------------------------- 115 | _C.RL = CN() 116 | _C.RL.POLICY = CN() 117 | _C.RL.POLICY.OBS_TRANSFORMS = CN() 118 | _C.RL.POLICY.OBS_TRANSFORMS.ENABLED_TRANSFORMS = [ 119 | "CenterCropperPerSensor", 120 | ] 121 | _C.RL.POLICY.OBS_TRANSFORMS.CENTER_CROPPER_PER_SENSOR = CN() 122 | _C.RL.POLICY.OBS_TRANSFORMS.CENTER_CROPPER_PER_SENSOR.SENSOR_CROPS = [ 123 | ("rgb", (224, 224)), 124 | ("depth", (256, 256)), 125 | ] 126 | _C.RL.POLICY.OBS_TRANSFORMS.RESIZER_PER_SENSOR = CN() 127 | _C.RL.POLICY.OBS_TRANSFORMS.RESIZER_PER_SENSOR.SIZES = [ 128 | ("rgb", (224, 298)), 129 | ("depth", (256, 341)), 130 | ] 131 | # ----------------------------------------------------------------------------- 132 | # MODELING CONFIG 133 | # ----------------------------------------------------------------------------- 134 | _C.MODEL = CN() 135 | _C.MODEL.policy_name = "CMAPolicy" # or "Seq2SeqPolicy" 136 | _C.MODEL.ablate_depth = False 137 | _C.MODEL.ablate_rgb = False 138 | _C.MODEL.ablate_instruction = False 139 | 140 | _C.MODEL.INSTRUCTION_ENCODER = CN() 141 | _C.MODEL.INSTRUCTION_ENCODER.sensor_uuid = "instruction" 142 | _C.MODEL.INSTRUCTION_ENCODER.vocab_size = 2504 143 | _C.MODEL.INSTRUCTION_ENCODER.use_pretrained_embeddings = True 144 | _C.MODEL.INSTRUCTION_ENCODER.embedding_file = ( 145 | "data/datasets/R2R_VLNCE_v1-2_preprocessed/embeddings.json.gz" 146 | ) 147 | _C.MODEL.INSTRUCTION_ENCODER.dataset_vocab = ( 148 | "data/datasets/R2R_VLNCE_v1-2_preprocessed/train/train.json.gz" 149 | ) 150 | _C.MODEL.INSTRUCTION_ENCODER.fine_tune_embeddings = False 151 | _C.MODEL.INSTRUCTION_ENCODER.embedding_size = 50 152 | _C.MODEL.INSTRUCTION_ENCODER.hidden_size = 128 153 | _C.MODEL.INSTRUCTION_ENCODER.rnn_type = "LSTM" 154 | _C.MODEL.INSTRUCTION_ENCODER.final_state_only = True 155 | _C.MODEL.INSTRUCTION_ENCODER.bidirectional = False 156 | 157 | _C.MODEL.spatial_output = True 158 | _C.MODEL.RGB_ENCODER = CN() 159 | _C.MODEL.RGB_ENCODER.cnn_type = "TorchVisionResNet50" 160 | _C.MODEL.RGB_ENCODER.output_size = 256 161 | 162 | _C.MODEL.DEPTH_ENCODER = CN() 163 | _C.MODEL.DEPTH_ENCODER.cnn_type = "VlnResnetDepthEncoder" 164 | _C.MODEL.DEPTH_ENCODER.output_size = 128 165 | # type of resnet to use 166 | _C.MODEL.DEPTH_ENCODER.backbone = "resnet50" 167 | # path to DDPPO resnet weights 168 | _C.MODEL.DEPTH_ENCODER.ddppo_checkpoint = ( 169 | "data/ddppo-models/gibson-2plus-resnet50.pth" 170 | ) 171 | 172 | _C.MODEL.STATE_ENCODER = CN() 173 | _C.MODEL.STATE_ENCODER.hidden_size = 512 174 | _C.MODEL.STATE_ENCODER.rnn_type = "GRU" 175 | 176 | _C.MODEL.SEQ2SEQ = CN() 177 | _C.MODEL.SEQ2SEQ.use_prev_action = False 178 | 179 | _C.MODEL.PROGRESS_MONITOR = CN() 180 | _C.MODEL.PROGRESS_MONITOR.use = False 181 | _C.MODEL.PROGRESS_MONITOR.alpha = 1.0 # loss multiplier 182 | 183 | 184 | def purge_keys(config: CN, keys: List[str]) -> None: 185 | for k in keys: 186 | del config[k] 187 | config.register_deprecated_key(k) 188 | 189 | 190 | def get_config( 191 | config_paths: Optional[Union[List[str], str]] = None, 192 | opts: Optional[list] = None, 193 | ) -> CN: 194 | r"""Create a unified config with default values. Initialized from the 195 | habitat_baselines default config. 
Overwritten by values from 196 | `config_paths` and overwritten by options from `opts`. 197 | Args: 198 | config_paths: List of config paths or string that contains comma 199 | separated list of config paths. 200 | opts: Config options (keys, values) in a list (e.g., passed from 201 | command line into the config. For example, `opts = ['FOO.BAR', 202 | 0.5]`. Argument can be used for parameter sweeping or quick tests. 203 | """ 204 | config = CN() 205 | config.merge_from_other_cfg(habitat_baselines.config.default._C) 206 | purge_keys(config, ["SIMULATOR_GPU_ID", "TEST_EPISODE_COUNT"]) 207 | config.merge_from_other_cfg(_C.clone()) 208 | 209 | if config_paths: 210 | if isinstance(config_paths, str): 211 | if CONFIG_FILE_SEPARATOR in config_paths: 212 | config_paths = config_paths.split(CONFIG_FILE_SEPARATOR) 213 | else: 214 | config_paths = [config_paths] 215 | 216 | prev_task_config = "" 217 | for config_path in config_paths: 218 | config.merge_from_file(config_path) 219 | if config.BASE_TASK_CONFIG_PATH != prev_task_config: 220 | config.TASK_CONFIG = get_task_config( 221 | config.BASE_TASK_CONFIG_PATH 222 | ) 223 | prev_task_config = config.BASE_TASK_CONFIG_PATH 224 | 225 | if opts: 226 | config.CMD_TRAILING_OPTS = opts 227 | config.merge_from_list(opts) 228 | 229 | config.freeze() 230 | return config 231 | -------------------------------------------------------------------------------- /vlnce_baselines/common/recollection_dataset.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | from collections import defaultdict, deque 4 | 5 | import numpy as np 6 | import torch 7 | import tqdm 8 | from gym import Space 9 | from habitat.config.default import Config 10 | from habitat.sims.habitat_simulator.actions import HabitatSimActions 11 | from habitat_baselines.common.environments import get_env_class 12 | from habitat_baselines.common.obs_transformers import ( 13 | apply_obs_transforms_obs_space, 14 | get_active_obs_transforms, 15 | ) 16 | 17 | from habitat_extensions.task import ALL_ROLES_MASK, RxRVLNCEDatasetV1 18 | from vlnce_baselines.common.env_utils import construct_envs 19 | from vlnce_baselines.common.utils import extract_instruction_tokens 20 | 21 | 22 | class TeacherRecollectionDataset(torch.utils.data.IterableDataset): 23 | def __init__(self, config: Config): 24 | super().__init__() 25 | self.config = config 26 | # self._preload = [] 27 | self._preload = deque() 28 | self.world_size = self.config.GPU_NUMBERS 29 | self.rank = self.config.local_rank 30 | 31 | assert ( 32 | config.IL.RECOLLECT_TRAINER.preload_size >= config.IL.batch_size 33 | ), "preload size must be greater than batch size." 
34 | self.envs = None 35 | self._env_observations = None 36 | 37 | if config.IL.use_iw: 38 | self.inflec_weights = torch.tensor( 39 | [1.0, config.IL.inflection_weight_coef] 40 | ) 41 | else: 42 | self.inflec_weights = torch.tensor([1.0, 1.0]) 43 | 44 | if self.config.IL.RECOLLECT_TRAINER.preload_trajectories_file: 45 | self.config.defrost() 46 | self.config.IL.RECOLLECT_TRAINER.trajectories_file = \ 47 | self.config.IL.RECOLLECT_TRAINER.trajectories_file[ 48 | :-8] + '_w' + \ 49 | str(self.world_size) + '_r' + str(self.rank) + '.json.gz' 50 | self.config.freeze() 51 | with gzip.open( 52 | config.IL.RECOLLECT_TRAINER.trajectories_file, "rt" 53 | ) as f: 54 | self.trajectories = json.load(f) 55 | else: 56 | self.trajectories = self.collect_dataset() 57 | 58 | self.initialize_sims() 59 | 60 | def initialize_sims(self): 61 | config = self.config.clone() 62 | config.defrost() 63 | config.TASK_CONFIG.MEASUREMENTS = [] 64 | config.freeze() 65 | 66 | self.envs = construct_envs( 67 | config, 68 | get_env_class(config.ENV_NAME), 69 | episodes_allowed=list(self.trajectories.keys()), 70 | ) 71 | self.length = sum(self.envs.number_of_episodes) 72 | self.obs_transforms = get_active_obs_transforms(self.config) 73 | self._observation_space = apply_obs_transforms_obs_space( 74 | self.envs.observation_spaces[0], self.obs_transforms 75 | ) 76 | 77 | self.env_step = [0 for _ in range(self.envs.num_envs)] 78 | self._env_observations = [[] for _ in range(self.envs.num_envs)] 79 | 80 | observations = self.envs.reset() 81 | observations = extract_instruction_tokens( 82 | observations, 83 | self.config.TASK_CONFIG.TASK.INSTRUCTION_SENSOR_UUID, 84 | ) 85 | for i, ep in enumerate(self.envs.current_episodes()): 86 | path_step = self.trajectories[str(ep.episode_id)][0] 87 | self._env_observations[i].append( 88 | ( 89 | observations[i], 90 | path_step[0], # prev_action 91 | path_step[2], # oracle_action 92 | ) 93 | ) 94 | 95 | @property 96 | def batch_size(self): 97 | return self.config.IL.batch_size 98 | 99 | @property 100 | def observation_space(self) -> Space: 101 | assert self.envs is not None, "Simulator must first be loaded." 102 | assert self._observation_space is not None 103 | return self._observation_space 104 | 105 | @property 106 | def action_space(self) -> Space: 107 | assert self.envs is not None, "Simulator must first be loaded." 108 | return self.envs.action_spaces[0] 109 | 110 | def close_sims(self): 111 | self.envs.close() 112 | del self.envs 113 | del self._env_observations 114 | self.envs = None 115 | self._env_observations = None 116 | 117 | def collect_dataset(self): 118 | r"""Uses the ground truth trajectories to create a teacher forcing 119 | dataset for a given split. Loads both guide and follower episodes. 
120 | """ 121 | trajectories = defaultdict(list) 122 | split = self.config.TASK_CONFIG.DATASET.SPLIT 123 | 124 | if "{role}" in self.config.IL.RECOLLECT_TRAINER.gt_file: 125 | gt_data = {} 126 | for role in RxRVLNCEDatasetV1.annotation_roles: 127 | if ( 128 | ALL_ROLES_MASK not in self.config.TASK_CONFIG.DATASET.ROLES 129 | and role not in self.config.TASK_CONFIG.DATASET.ROLES 130 | ): 131 | continue 132 | 133 | with gzip.open( 134 | self.config.IL.RECOLLECT_TRAINER.gt_file.format( 135 | split=split, role=role 136 | ), 137 | "rt", 138 | ) as f: 139 | gt_data.update(json.load(f)) 140 | else: 141 | with gzip.open( 142 | self.config.IL.RECOLLECT_TRAINER.gt_path.format(split=split) 143 | ) as f: 144 | gt_data = json.load(f) 145 | 146 | t = ( 147 | tqdm.tqdm(gt_data.items(), "GT Collection") 148 | if self.config.use_pbar 149 | else gt_data.items() 150 | ) 151 | 152 | for episode_id, trajectory in t: 153 | if ( 154 | self.config.IL.RECOLLECT_TRAINER.max_traj_len != -1 155 | and len(trajectory["actions"]) 156 | > self.config.IL.RECOLLECT_TRAINER.max_traj_len 157 | ) or ( 158 | self.config.IL.RECOLLECT_TRAINER.min_traj_len != -1 159 | and len(trajectory["actions"]) 160 | < self.config.IL.RECOLLECT_TRAINER.min_traj_len 161 | ): 162 | continue 163 | 164 | for i, action in enumerate(trajectory["actions"]): 165 | prev_action = ( 166 | trajectories[episode_id][i - 1][1] 167 | if i 168 | else HabitatSimActions.STOP 169 | ) 170 | 171 | # [prev_action, action, oracle_action] 172 | trajectories[episode_id].append([prev_action, action, action]) 173 | 174 | trajectories = dict(list(trajectories.items())[self.rank::self.world_size]) 175 | self.config.defrost() 176 | self.config.IL.RECOLLECT_TRAINER.trajectories_file = \ 177 | self.config.IL.RECOLLECT_TRAINER.trajectories_file[:-8]+'_w'+ \ 178 | str(self.world_size)+'_r'+str(self.rank) + '.json.gz' 179 | self.config.freeze() 180 | with gzip.open( 181 | self.config.IL.RECOLLECT_TRAINER.trajectories_file, "wt" 182 | ) as f: 183 | f.write(json.dumps(trajectories)) 184 | return trajectories 185 | 186 | def _load_next(self): 187 | """ 188 | Episode length is currently not considered. We were previously batching episodes 189 | together with similar lengths. Not sure if we need to bring that back. 
190 | """ 191 | # self.rank = 0 192 | if len(self._preload): 193 | # out = self._preload[self.rank] 194 | # self._preload = self._preload[self.world_size:] 195 | # return out 196 | return self._preload.popleft() 197 | 198 | while ( 199 | len(self._preload) < self.config.IL.RECOLLECT_TRAINER.preload_size 200 | ): 201 | current_episodes = self.envs.current_episodes() 202 | prev_eps = current_episodes 203 | 204 | # get the next action for each env 205 | actions = [ 206 | self.trajectories[str(ep.episode_id)][self.env_step[i]][1] 207 | for i, ep in enumerate(current_episodes) 208 | ] 209 | 210 | outputs = self.envs.step(actions) 211 | observations, _, dones, _ = [list(x) for x in zip(*outputs)] 212 | observations = extract_instruction_tokens( 213 | observations, 214 | self.config.TASK_CONFIG.TASK.INSTRUCTION_SENSOR_UUID, 215 | ) 216 | 217 | current_episodes = self.envs.current_episodes() 218 | 219 | for i in range(self.envs.num_envs): 220 | self.env_step[i] += 1 221 | if dones[i]: 222 | assert len(self._env_observations[i]) == len( 223 | self.trajectories[str(prev_eps[i].episode_id)] 224 | ), "Collected episode does not match the step count of trajectory" 225 | self._preload.append( 226 | ( 227 | [o[0] for o in self._env_observations[i]], 228 | [o[1] for o in self._env_observations[i]], 229 | [o[2] for o in self._env_observations[i]], 230 | ) 231 | ) 232 | self._env_observations[i] = [] 233 | self.env_step[i] = 0 234 | 235 | path_step = self.trajectories[ 236 | str(current_episodes[i].episode_id) 237 | ][self.env_step[i]] 238 | self._env_observations[i].append( 239 | ( 240 | observations[i], 241 | path_step[0], # prev_action 242 | path_step[2], # oracle_action 243 | ) 244 | ) 245 | assert ( 246 | len(self._env_observations[i]) 247 | <= self.config.TASK_CONFIG.ENVIRONMENT.MAX_EPISODE_STEPS 248 | ), "Trajectories should be no more than the maximum episode steps." 249 | 250 | # out = self._preload[self.rank] 251 | # self._preload = self._preload[self.world_size:] 252 | # return out 253 | return self._preload.popleft() 254 | 255 | def __next__(self): 256 | """Takes about 1s to once self._load_next() has finished with a batch 257 | size of 5. For this reason, we probably don't need to use extra workers. 258 | """ 259 | x = self._load_next() 260 | obs, prev_actions, oracle_actions = x 261 | 262 | # transpose obs 263 | obs_t = defaultdict(list) 264 | for k in obs[0]: 265 | for i in range(len(obs)): 266 | obs_t[k].append(obs[i][k]) 267 | 268 | obs_t[k] = np.array(obs_t[k]) 269 | 270 | for k, v in obs_t.items(): 271 | obs_t[k] = torch.from_numpy(np.copy(v)) 272 | 273 | prev_actions = torch.from_numpy(np.copy(prev_actions)) 274 | oracle_actions = torch.from_numpy(np.copy(oracle_actions)) 275 | 276 | inflections = torch.cat( 277 | [ 278 | torch.tensor([1], dtype=torch.long), 279 | (oracle_actions[1:] != oracle_actions[:-1]).long(), 280 | ] 281 | ) 282 | 283 | return ( 284 | obs_t, 285 | prev_actions, 286 | oracle_actions, 287 | self.inflec_weights[inflections], 288 | ) 289 | 290 | def __iter__(self): 291 | worker_info = torch.utils.data.get_worker_info() 292 | if worker_info is not None: 293 | assert ( 294 | worker_info.num_workers == 1 295 | ), "multiple workers not supported." 
296 | 297 | return self 298 | -------------------------------------------------------------------------------- /vlnce_baselines/models/encoders/resnet_encoders.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torchvision.models as models 6 | from gym import spaces 7 | from habitat import logger 8 | from habitat_baselines.rl.ddppo.policy import resnet 9 | from habitat_baselines.rl.ddppo.policy.resnet_policy import ResNetEncoder 10 | import torchvision 11 | #import clip 12 | from .clip import CLIP 13 | from PIL import Image 14 | from torchvision import transforms 15 | 16 | class VlnResnetDepthEncoder(nn.Module): 17 | def __init__( 18 | self, 19 | observation_space, 20 | output_size=128, 21 | checkpoint="NONE", 22 | backbone="resnet50", 23 | resnet_baseplanes=32, 24 | normalize_visual_inputs=False, 25 | trainable=False, 26 | spatial_output: bool = False, 27 | ): 28 | super().__init__() 29 | self.visual_encoder = ResNetEncoder( 30 | spaces.Dict({"depth": observation_space.spaces["depth"]}), 31 | baseplanes=resnet_baseplanes, 32 | ngroups=resnet_baseplanes // 2, 33 | make_backbone=getattr(resnet, backbone), 34 | normalize_visual_inputs=normalize_visual_inputs, 35 | ) 36 | 37 | for param in self.visual_encoder.parameters(): 38 | param.requires_grad_(trainable) 39 | 40 | if checkpoint != "NONE": 41 | ddppo_weights = torch.load(checkpoint) 42 | 43 | weights_dict = {} 44 | for k, v in ddppo_weights["state_dict"].items(): 45 | split_layer_name = k.split(".")[2:] 46 | if split_layer_name[0] != "visual_encoder": 47 | continue 48 | 49 | layer_name = ".".join(split_layer_name[1:]) 50 | weights_dict[layer_name] = v 51 | 52 | del ddppo_weights 53 | self.visual_encoder.load_state_dict(weights_dict, strict=True) 54 | 55 | self.spatial_output = spatial_output 56 | 57 | if not self.spatial_output: 58 | self.output_shape = (output_size,) 59 | # self.visual_fc = nn.Sequential( 60 | # nn.Flatten(), 61 | # nn.Linear( 62 | # np.prod(self.visual_encoder.output_shape), output_size 63 | # ), 64 | # nn.ReLU(True), 65 | # ) 66 | None 67 | else: 68 | self.spatial_embeddings = nn.Embedding( 69 | self.visual_encoder.output_shape[1] 70 | * self.visual_encoder.output_shape[2], 71 | 64, 72 | ) 73 | 74 | self.output_shape = list(self.visual_encoder.output_shape) 75 | self.output_shape[0] += self.spatial_embeddings.embedding_dim 76 | self.output_shape = tuple(self.output_shape) 77 | 78 | 79 | def forward(self, observations): 80 | """ 81 | Args: 82 | observations: [BATCH, HEIGHT, WIDTH, CHANNEL] 83 | Returns: 84 | [BATCH, OUTPUT_SIZE] 85 | """ 86 | if "depth_features" in observations: 87 | x = observations["depth_features"] 88 | else: 89 | x = self.visual_encoder(observations) 90 | 91 | if self.spatial_output: 92 | b, c, h, w = x.size() 93 | 94 | spatial_features = ( 95 | self.spatial_embeddings( 96 | torch.arange( 97 | 0, 98 | self.spatial_embeddings.num_embeddings, 99 | device=x.device, 100 | dtype=torch.long, 101 | ) 102 | ) 103 | .view(1, -1, h, w) 104 | .expand(b, self.spatial_embeddings.embedding_dim, h, w) 105 | ) 106 | 107 | return torch.cat([x, spatial_features], dim=1) 108 | else: 109 | # return self.visual_fc(x) 110 | return x 111 | 112 | 113 | class TorchVisionResNet50(nn.Module): 114 | r""" 115 | Takes in observations and produces an embedding of the rgb component. 
116 | 117 | Args: 118 | observation_space: The observation_space of the agent 119 | output_size: The size of the embedding vector 120 | device: torch.device 121 | """ 122 | 123 | def __init__( 124 | self, 125 | observation_space, 126 | output_size, 127 | device, 128 | spatial_output: bool = False, 129 | ): 130 | super().__init__() 131 | self.device = device 132 | self.resnet_layer_size = 2048 133 | linear_layer_input_size = 0 134 | if "rgb" in observation_space.spaces: 135 | self._n_input_rgb = observation_space.spaces["rgb"].shape[2] 136 | obs_size_0 = observation_space.spaces["rgb"].shape[0] 137 | obs_size_1 = observation_space.spaces["rgb"].shape[1] 138 | if obs_size_0 != 224 or obs_size_1 != 224: 139 | logger.warn( 140 | "TorchVisionResNet50: observation size is not conformant to expected ResNet input size [3x224x224]" 141 | ) 142 | linear_layer_input_size += self.resnet_layer_size 143 | else: 144 | self._n_input_rgb = 0 145 | 146 | if self.is_blind: 147 | self.cnn = nn.Sequential() 148 | return 149 | 150 | rgb_resnet = models.resnet50(pretrained=True) 151 | rgb_modules = list(rgb_resnet.children())[:-2] 152 | self.cnn = torch.nn.Sequential(*rgb_modules) 153 | 154 | # disable gradients for resnet, params frozen 155 | for param in self.cnn.parameters(): 156 | param.requires_grad_(False) 157 | self.cnn.eval() 158 | 159 | self.spatial_output = spatial_output 160 | 161 | if not self.spatial_output: 162 | self.output_shape = (output_size,) 163 | # self.fc = nn.Linear(linear_layer_input_size, output_size) 164 | # self.activation = nn.ReLU() 165 | None 166 | else: 167 | class SpatialAvgPool(nn.Module): 168 | def forward(self, x): 169 | x = F.adaptive_avg_pool2d(x, (4, 4)) 170 | 171 | return x 172 | self.cnn.avgpool = SpatialAvgPool() 173 | self.cnn.fc = nn.Sequential() 174 | self.spatial_embeddings = nn.Embedding(4 * 4, 64) 175 | self.output_shape = ( 176 | self.resnet_layer_size + self.spatial_embeddings.embedding_dim, 177 | 4, 178 | 4, 179 | ) 180 | 181 | # self.layer_extract = self.cnn._modules.get("avgpool") 182 | 183 | self.rgb_transform = torch.nn.Sequential( 184 | transforms.ConvertImageDtype(torch.float), 185 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), 186 | ) 187 | 188 | @property 189 | def is_blind(self): 190 | return self._n_input_rgb == 0 191 | 192 | def forward(self, observations): 193 | r"""Sends RGB observation through the TorchVision ResNet50 pre-trained 194 | on ImageNet. Sends through fully connected layer, activates, and 195 | returns final embedding. 
196 | """ 197 | 198 | def resnet_forward(observation): 199 | # resnet_output = torch.zeros( 200 | # 1, dtype=torch.float32, device=observation.device 201 | # ) 202 | # def hook(m, i, o): 203 | # resnet_output.set_(o) 204 | 205 | # output: [BATCH x RESNET_DIM] 206 | # h = self.layer_extract.register_forward_hook(hook) 207 | resnet_output = self.cnn(observation) 208 | # h.remove() 209 | return resnet_output 210 | 211 | if "rgb_features" in observations: 212 | resnet_output = observations["rgb_features"] 213 | else: 214 | # permute tensor to dimension [BATCH x CHANNEL x HEIGHT x WIDTH] 215 | rgb_observations = observations["rgb"].permute(0, 3, 1, 2) 216 | 217 | rgb_observations = self.rgb_transform(rgb_observations) 218 | # rgb_observations = rgb_observations / 255.0 # normalize RGB 219 | 220 | resnet_output = resnet_forward(rgb_observations.contiguous()) 221 | 222 | if self.spatial_output: 223 | b, c, h, w = resnet_output.size() 224 | 225 | spatial_features = ( 226 | self.spatial_embeddings( 227 | torch.arange( 228 | 0, 229 | self.spatial_embeddings.num_embeddings, 230 | device=resnet_output.device, 231 | dtype=torch.long, 232 | ) 233 | ) 234 | .view(1, -1, h, w) 235 | .expand(b, self.spatial_embeddings.embedding_dim, h, w) 236 | ) 237 | 238 | return torch.cat([resnet_output, spatial_features], dim=1)#.to(self.device) 239 | else: 240 | # return self.activation( 241 | # self.fc(torch.flatten(resnet_output, 1)) 242 | # ) # [BATCH x OUTPUT_DIM] 243 | return resnet_output 244 | 245 | 246 | class CLIPEncoder(nn.Module): 247 | r""" 248 | Takes in observations and produces a CLIP embedding of the rgb component. 249 | 250 | Args: 251 | device: torch.device used for the CLIP visual encoder 252 | patch_size: patch size of the CLIP ViT backbone (16 or 32) 253 | 254 | """ 255 | 256 | def __init__( 257 | self, device, patch_size=16 258 | ): 259 | super().__init__() 260 | #self.model, _ = clip.load("data/ViT-B-32.pt", device=device) 261 | self.model = CLIP( 262 | input_resolution=224, patch_size=patch_size, width=768, layers=12, heads=12 263 | ) 264 | if patch_size == 16: 265 | # scripted_model = torch.load('data/ViT-B-'+str(patch_size)+'.pt') 266 | # state_dict = scripted_model.state_dict() 267 | # self.model.load_state_dict(state_dict) 268 | self.model.load_state_dict(torch.load('/data/ViT-B-'+str(patch_size)+'.pt', map_location = torch.device('cpu')).state_dict(),strict=False) 269 | self.model = self.model.to('cuda') # transfer to GPU 270 | elif patch_size == 32: 271 | self.model.load_state_dict(torch.jit.load('data/ViT-B-'+str(patch_size)+'.pt', map_location = torch.device('cpu')).state_dict(),strict=False) 272 | 273 | for param in self.model.parameters(): 274 | param.requires_grad_(False) 275 | self.model.eval() 276 | 277 | 278 | self.rgb_transform = torch.nn.Sequential( 279 | transforms.Resize((224,224), interpolation=Image.BICUBIC), 280 | transforms.ConvertImageDtype(torch.float), 281 | transforms.Normalize([0.48145466, 0.4578275, 0.40821073], [0.26862954, 0.26130258, 0.27577711]), 282 | ) 283 | 284 | 285 | def forward(self, observations, fine_grained_fts=False): 286 | r"""Sends the RGB observation through the pre-trained CLIP visual 287 | encoder (ViT) and returns the resulting image embedding as a 288 | float32 tensor. 
289 | """ 290 | 291 | rgb_observations = observations["rgb"].permute(0, 3, 1, 2) 292 | rgb_observations = self.rgb_transform(rgb_observations) 293 | rgb_observations = rgb_observations.to('cuda') #new 294 | 295 | if fine_grained_fts: 296 | output = self.model(rgb_observations.contiguous()) 297 | else: 298 | output = self.model.encode_image(rgb_observations.contiguous()) 299 | return output.float() # to fp32 -------------------------------------------------------------------------------- /vlnce_baselines/waypoint_networks/viz_utils.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import os 4 | import cv2 5 | import matplotlib.pyplot as plt 6 | import math 7 | import torch 8 | from PIL import Image 9 | 10 | ''' 11 | MP3D original semantic labels and reduced set correspondence 12 | # Original set from here: https://github.com/niessner/Matterport/blob/master/metadata/mpcat40.tsv 13 | 0 void 0 14 | 1 wall 15 structure 15 | 2 floor 17 free-space 16 | 3 chair 1 17 | 4 door 2 18 | 5 table 3 19 | 6 picture 18 20 | 7 cabinet 19 21 | 8 cushion 4 22 | 9 window 15 structure 23 | 10 sofa 5 24 | 11 bed 6 25 | 12 curtain 16 other 26 | 13 chest_of_drawers 20 27 | 14 plant 7 28 | 15 sink 8 29 | 16 stairs 17 free-space 30 | 17 ceiling 17 free-space 31 | 18 toilet 9 32 | 19 stool 21 33 | 20 towel 22 34 | 21 mirror 16 other 35 | 22 tv_monitor 10 36 | 23 shower 11 37 | 24 column 15 structure 38 | 25 bathtub 12 39 | 26 counter 13 40 | 27 fireplace 23 41 | 28 lighting 16 other 42 | 29 beam 16 other 43 | 30 railing 16 other 44 | 31 shelving 16 other 45 | 32 blinds 16 other 46 | 33 gym_equipment 24 47 | 34 seating 25 48 | 35 board_panel 16 other 49 | 36 furniture 16 other 50 | 37 appliances 14 51 | 38 clothes 26 52 | 39 objects 16 other 53 | 40 misc 16 other 54 | ''' 55 | # 27 categories which include the 21 object categories in the habitat challenge 56 | label_conversion_40_27 = {-1:0, 0:0, 1:15, 2:17, 3:1, 4:2, 5:3, 6:18, 7:19, 8:4, 9:15, 10:5, 11:6, 12:16, 13:20, 14:7, 15:8, 16:17, 17:17, 57 | 18:9, 19:21, 20:22, 21:16, 22:10, 23:11, 24:15, 25:12, 26:13, 27:23, 28:16, 29:16, 30:16, 31:16, 32:16, 58 | 33:24, 34:25, 35:16, 36:16, 37:14, 38:26, 39:16, 40:16} 59 | color_mapping_27 = { 60 | 0:(255,255,255), # white 61 | 1:(128,128,0), # olive (dark yellow) 62 | 2:(0,0,255), # blue 63 | 3:(255,0,0), # red 64 | 4:(255,0,255), # magenta 65 | 5:(0,255,255), # cyan 66 | 6:(255,165,0), # orange 67 | 7:(255,255,0), # yellow 68 | 8:(128,128,128), # gray 69 | 9:(128,0,0), # maroon 70 | 10:(255,20,147), # pink 71 | 11:(0,128,0), # dark green 72 | 12:(128,0,128), # purple 73 | 13:(0,128,128), # teal 74 | 14:(0,0,128), # navy (dark blue) 75 | 15:(210,105,30), # chocolate 76 | 16:(188,143,143), # rosy brown 77 | 17:(0,255,0), # green 78 | 18:(255,215,0), # gold 79 | 19:(0,0,0), # black 80 | 20:(192,192,192), # silver 81 | 21:(138,43,226), # blue violet 82 | 22:(255,127,80), # coral 83 | 23:(238,130,238), # violet 84 | 24:(245,245,220), # beige 85 | 25:(139,69,19), # saddle brown 86 | 26:(64,224,208) # turquoise 87 | } 88 | 89 | # three label classification (0:void, 1:occupied, 2:free) 90 | label_conversion_40_3 = {-1:0, 0:0, 1:1, 2:2, 3:1, 4:1, 5:1, 6:1, 7:1, 8:1, 9:1, 10:1, 11:1, 12:1, 13:1, 14:1, 15:1, 16:2, 17:2, 91 | 18:1, 19:1, 20:1, 21:1, 22:1, 23:1, 24:1, 25:1, 26:1, 27:1, 28:1, 29:1, 30:1, 31:1, 32:1, 92 | 33:1, 34:1, 35:1, 36:1, 37:1, 38:1, 39:1, 40:1} 93 | color_mapping_3 = { 94 | 0:(255,255,255), # white 95 | 1:(0,0,255), # blue 96 | 2:(0,255,0), # 
green 97 | } 98 | 99 | # visualize a scene map with it's sampled waypoints from RxR 100 | def vis_episode(gt_map_semantic, pose_coords, name="map_tmp", color_mapping=27): 101 | color_map = colorize_grid(gt_map_semantic.unsqueeze(0).unsqueeze(0), color_mapping=color_mapping) 102 | im_color_map = color_map[0,0,:,:,:].permute(1,2,0).cpu().numpy() 103 | 104 | plt.figure(figsize=(10 ,8)) 105 | plt.axis('off') 106 | plt.imshow(im_color_map) 107 | for i in range(len(pose_coords)): 108 | point = pose_coords[i] 109 | if point[0]>=0 and point[1]>=0: 110 | plt.scatter(point[0], point[1], color="blue", s=50) 111 | plt.savefig(name+'.png', bbox_inches='tight', pad_inches=0, dpi=100) 112 | plt.close() 113 | 114 | 115 | def vis_heatmaps(pred, gt): 116 | pred = pred.detach().cpu().numpy() 117 | gt = gt.detach().cpu().numpy() 118 | # heatmaps are 1 x h x w 119 | arr = [pred, gt] 120 | n = len(arr) 121 | plt.figure(figsize=(10,5)) 122 | for i, data in enumerate(arr): 123 | ax = plt.subplot(1, n, i+1) 124 | ax.axis('off') 125 | plt.imshow(data) 126 | plt.show() 127 | 128 | 129 | 130 | def write_img(img, savepath, name): 131 | # img: T x 3 x dim x dim, assumed normalized 132 | for i in range(img.shape[0]): 133 | vis_img = img[i,:,:,:].cpu().numpy() 134 | vis_img = np.transpose(vis_img, (1,2,0)) 135 | im_path = savepath + str(i) + "_" + name + ".png" 136 | cv2.imwrite(im_path, vis_img[:,:,::-1]*255.0) 137 | 138 | 139 | 140 | def colorize_grid(grid, color_mapping=27): # to pass into tensorboardX video 141 | # Input: grid -- B x T x C x grid_dim x grid_dim, where C=1,T=1 when gt and C=41,T>=1 for other 142 | # Output: grid_img -- B x T x 3 x grid_dim x grid_dim 143 | grid = grid.detach().cpu().numpy() 144 | grid_img = np.zeros((grid.shape[0], grid.shape[1], grid.shape[3], grid.shape[4], 3), dtype=np.uint8) 145 | if grid.shape[2] > 1: 146 | # For cells where prob distribution is all zeroes (or uniform), argmax returns arbitrary number (can be true for the accumulated maps) 147 | grid_prob_max = np.amax(grid, axis=2) 148 | inds = np.asarray(grid_prob_max<=0.05).nonzero() # if no label has prob higher than k then assume unobserved 149 | grid[inds[0], inds[1], 0, inds[2], inds[3]] = 1 # assign label 0 (void) to be the dominant label 150 | grid = np.argmax(grid, axis=2) # B x T x grid_dim x grid_dim 151 | else: 152 | grid = grid.squeeze(2) 153 | 154 | if color_mapping==27: 155 | color_mapping = color_mapping_27 156 | else: 157 | color_mapping = color_mapping_3 158 | for label in color_mapping.keys(): 159 | grid_img[ grid==label ] = color_mapping[label] 160 | 161 | return torch.tensor(grid_img.transpose(0, 1, 4, 2, 3), dtype=torch.uint8) 162 | 163 | 164 | def write_tensor_image(grid, savepath, name, sseg_labels=27): 165 | # grid: T x C x dim x dim 166 | grid_imgs = colorize_grid(grid.unsqueeze(0), color_mapping=sseg_labels) 167 | grid_imgs = grid_imgs.squeeze(0) 168 | grid_imgs = grid_imgs.detach().cpu().numpy() 169 | for t in range(grid_imgs.shape[0]): 170 | im = grid_imgs[t,:,:,:].transpose(1,2,0) 171 | im_path = savepath + str(t) + "_" + name + ".png" 172 | cv2.imwrite(im_path, im[:,:,::-1]) 173 | 174 | 175 | def write_tensor_imgSegm(img, savepath, name, t=None, labels=27, waypoints=None): 176 | # pred: T x C x dim x dim 177 | if img.shape[1] > 1: 178 | img = torch.argmax(img.cpu(), dim=1, keepdim=True) # T x 1 x cH x cW 179 | img_labels = img.squeeze(1) 180 | 181 | for i in range(img_labels.shape[0]): 182 | img0 = img_labels[i,:,:] 183 | 184 | vis_img = np.zeros((img0.shape[0], img0.shape[1], 3), dtype=np.uint8) 
185 | 186 | if labels==27: 187 | color_mapping = color_mapping_27 188 | else: 189 | color_mapping = color_mapping_3 190 | 191 | for label in color_mapping.keys(): 192 | vis_img[ img0==label ] = color_mapping[label] 193 | 194 | if t is None: 195 | im_path = savepath + str(i) + "_" + name + ".png" 196 | else: 197 | im_path = savepath + name + "_" + str(t) + "_" + str(i) + ".png" 198 | 199 | if waypoints != None: 200 | for coords in waypoints: 201 | vis_img[coords[1]-3:coords[1]+3,coords[0]-3:coords[0]+3,:] = np.array([0,0,1]) 202 | 203 | cv2.imwrite(im_path, vis_img[:,:,::-1]) 204 | 205 | 206 | def display_sample(rgb_obs, depth_obs, t, sseg_img=None, savepath=None): 207 | # sseg_img is semantic observation from Matterport habitat 208 | depth_obs = depth_obs / np.amax(depth_obs) # normalize for visualization 209 | rgb_img = Image.fromarray(rgb_obs, mode="RGB") 210 | depth_img = Image.fromarray((depth_obs * 255).astype(np.uint8), mode="L") 211 | 212 | plt.figure(figsize=(12 ,8)) 213 | plt.axis('off') 214 | plt.imshow(rgb_img) 215 | plt.savefig(savepath+str(t)+"_rgb.png", bbox_inches='tight', pad_inches=0, dpi=50) # 100 216 | plt.close() 217 | 218 | plt.figure(figsize=(12 ,8)) 219 | plt.axis('off') 220 | plt.imshow(depth_img) 221 | plt.savefig(savepath+str(t)+"_depth.png", bbox_inches='tight', pad_inches=0, dpi=50) # 100 222 | plt.close() 223 | 224 | 225 | def save_map_goal(gt_map_semantic, pose_coords, goal_pose_coords, save_img_dir_, t): 226 | color_map_sem = colorize_grid(gt_map_semantic) 227 | im = color_map_sem[0,0,:,:,:].permute(1,2,0).cpu().numpy() 228 | 229 | plt.figure(figsize=(10 ,7)) 230 | plt.axis('off') 231 | plt.imshow(im) 232 | if goal_pose_coords[0,0,0]>=0 and goal_pose_coords[0,0,1]>=0: 233 | plt.scatter(goal_pose_coords[0,0,0], goal_pose_coords[0,0,1], color="magenta", s=70) 234 | plt.scatter(pose_coords[0,0,0], pose_coords[0,0,1], color="blue", s=70) 235 | plt.savefig(save_img_dir_+str(t)+'.png', bbox_inches='tight', pad_inches=0, dpi=100) 236 | plt.close() 237 | 238 | 239 | 240 | def save_map_pred_steps(spatial_in, spatial_pred, objects_pred, ego_img_segm, save_img_dir_, t): 241 | 242 | color_spatial_in = colorize_grid(spatial_in, color_mapping=3) 243 | im_spatial_in = color_spatial_in[0,0,:,:,:].permute(1,2,0).cpu().numpy() 244 | 245 | color_spatial_pred = colorize_grid(spatial_pred, color_mapping=3) 246 | im_spatial_pred = color_spatial_pred[0,0,:,:,:].permute(1,2,0).cpu().numpy() 247 | 248 | color_objects_pred = colorize_grid(objects_pred, color_mapping=27) 249 | im_objects_pred = color_objects_pred[0,0,:,:,:].permute(1,2,0).cpu().numpy() 250 | 251 | color_ego_img_segm = colorize_grid(ego_img_segm, color_mapping=27) 252 | im_ego_img_segm = color_ego_img_segm[0,0,:,:,:].permute(1,2,0).cpu().numpy() 253 | 254 | plt.figure(figsize=(12 ,8)) 255 | plt.axis('off') 256 | plt.imshow(im_spatial_in) 257 | plt.savefig(save_img_dir_+str(t)+"_im_spatial_in.png", bbox_inches='tight', pad_inches=0, dpi=50) # 100 258 | plt.close() 259 | 260 | plt.figure(figsize=(12 ,8)) 261 | plt.axis('off') 262 | plt.imshow(im_spatial_pred) 263 | plt.savefig(save_img_dir_+str(t)+"_im_spatial_pred.png", bbox_inches='tight', pad_inches=0, dpi=50) # 100 264 | plt.close() 265 | 266 | plt.figure(figsize=(12 ,8)) 267 | plt.axis('off') 268 | plt.imshow(im_objects_pred) 269 | plt.savefig(save_img_dir_+str(t)+"_im_objects_pred.png", bbox_inches='tight', pad_inches=0, dpi=50) # 100 270 | plt.close() 271 | 272 | plt.figure(figsize=(12 ,8)) 273 | plt.axis('off') 274 | plt.imshow(im_ego_img_segm) 275 | 
plt.savefig(save_img_dir_+str(t)+"_im_ego_img_segm.png", bbox_inches='tight', pad_inches=0, dpi=50) # 100 276 | plt.close() 277 | 278 | 279 | def show_waypoint_pred(map_semantic, savepath, num_points, ltg=None, pose_coords=None, pred_waypoints=None, gt_waypoints=None): 280 | # Waypoints are provided in map pose coordinates 281 | color_map = colorize_grid(map_semantic.unsqueeze(0).unsqueeze(0)) 282 | im_color_map = color_map[0,0,:,:,:].permute(1,2,0).cpu().numpy() 283 | 284 | plt.figure(figsize=(10 ,8)) 285 | plt.axis('off') 286 | plt.imshow(im_color_map) 287 | 288 | for i in range(num_points): 289 | if gt_waypoints is not None: 290 | point_gt = gt_waypoints[i] 291 | if point_gt[0]>=0 and point_gt[1]>=0: 292 | plt.scatter(point_gt[0], point_gt[1], color="blue", s=50) 293 | plt.text(point_gt[0], point_gt[1], s=str(i), color="blue") 294 | if pred_waypoints is not None: 295 | point_pred = pred_waypoints[i] 296 | if point_pred[0]>=0 and point_pred[1]>=0: 297 | plt.scatter(point_pred[0], point_pred[1], color="red", s=50) 298 | plt.text(point_pred[0], point_pred[1], s=str(i), color="red") 299 | # ltg and agent position 300 | if ltg is not None: 301 | plt.scatter(ltg[0,0,0], ltg[0,0,1], color="magenta", s=50) 302 | if pose_coords is not None: 303 | plt.scatter(pose_coords[0,0,0], pose_coords[0,0,1], color="green", s=50) 304 | 305 | plt.savefig(savepath, bbox_inches='tight', pad_inches=0, dpi=100) 306 | plt.close() -------------------------------------------------------------------------------- /utils_p/memory.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from numpy.linalg import norm 4 | import pickle 5 | 6 | 7 | class Memory(object): 8 | """ 9 | Create the empty memory buffer 10 | """ 11 | 12 | def __init__(self, size, dimension=1 * 3 * 22 * 22, alpha=0.9): #1 * 3 * 224 * 224 13 | self.memory = {} 14 | self.size = size 15 | self.dimension = dimension 16 | self.alpha = alpha 17 | 18 | def reset(self): 19 | self.memory = {} 20 | 21 | def get_size(self): 22 | return len(self.memory) 23 | 24 | def push(self, keys, logits): 25 | # mo = 0.5 26 | keys = keys.reshape(len(keys), self.dimension) 27 | for i, key in enumerate(keys): 28 | 29 | if len(self.memory.keys()) >= self.size: 30 | # Memory is full, find the nearest neighbours and update them 31 | #key = key.reshape(len(key), self.dimension) 32 | all_keys = np.frombuffer(np.asarray(list(self.memory.keys())), dtype=np.float32).reshape(self.get_size(), self.dimension) 33 | similarity_scores = np.dot(all_keys, key.T) / (norm(all_keys, axis=1) * norm(key.T)) 34 | top_k_indices = np.argsort(similarity_scores)[-5:] # Top-k indices with highest similarity 35 | for idx in top_k_indices: 36 | mem_key = all_keys[idx].tobytes() 37 | top_k_logit = self.memory[mem_key] 38 | #self.memory[mem_key] = mo * top_k_logit + (1 - mo) * logits[i] 39 | # Update the memory with a weighted average of the top-k logits 40 | self.memory[mem_key] = self.alpha * top_k_logit + (1 - self.alpha) * logits[i] 41 | """ 42 | neighbors, similarity_scores = self.get_topk(np.array([key_flat]), k=5) 43 | 44 | for nkey, score in zip(neighbors, similarity_scores): 45 | mem_key = nkey.tobytes() 46 | self.memory[mem_key] = self.alpha * self.memory[mem_key] + (1 - self.alpha) * logits[i] 47 | 48 | """ 49 | else: 50 | # Memory is not full, add new key-logit pair to memory 51 | self.memory.update({key.reshape(self.dimension).tobytes(): logits[i]}) 52 | 53 | 54 | def _prepare_batch(self, sample, attention_weight): 55 | 
attention_weight = np.array(attention_weight / 0.2) 56 | attention_weight = np.exp(attention_weight) / (np.sum(np.exp(attention_weight))) 57 | ensemble_prediction = sample[0] * attention_weight[0] 58 | for i in range(1, len(sample)): 59 | ensemble_prediction = ensemble_prediction + sample[i] * attention_weight[i] 60 | 61 | return torch.FloatTensor(ensemble_prediction) 62 | 63 | def get_neighbours(self, keys, k): 64 | """ 65 | Returns samples from buffer using nearest neighbour approach 66 | """ 67 | samples = [] 68 | 69 | keys = keys.reshape(len(keys), self.dimension) 70 | total_keys = len(self.memory.keys()) 71 | self.all_keys = np.frombuffer( 72 | np.asarray(list(self.memory.keys())), dtype=np.float32).reshape(total_keys, self.dimension) 73 | 74 | for key in keys: 75 | similarity_scores = np.dot(self.all_keys, key.T) / (norm(self.all_keys, axis=1) * norm(key.T)) 76 | 77 | K_neighbour_keys = self.all_keys[np.argpartition(similarity_scores, -k)[-k:]] 78 | neighbours = [self.memory[nkey.tobytes()] for nkey in K_neighbour_keys] 79 | 80 | attention_weight = np.dot(K_neighbour_keys, key.T) / (norm(K_neighbour_keys, axis=1) * norm(key.T)) 81 | batch = self._prepare_batch(neighbours, attention_weight) 82 | samples.append(batch) 83 | 84 | return torch.stack(samples), np.mean(similarity_scores) 85 | 86 | 87 | 88 | def save_memory(self, file_path): 89 | with open(file_path, 'wb') as f: 90 | pickle.dump(self.memory, f) 91 | 92 | def load_memory(self, file_path): 93 | with open(file_path, 'rb') as f: 94 | self.memory = pickle.load(f) 95 | 96 | 97 | def get_topk_avg(self, keys, k): 98 | 99 | samples = [] 100 | 101 | keys = keys.reshape(len(keys), self.dimension) 102 | total_keys = len(self.memory.keys()) 103 | self.all_keys = np.frombuffer( 104 | np.asarray(list(self.memory.keys())), dtype=np.float32).reshape(total_keys, self.dimension) 105 | 106 | for key in keys: 107 | similarity_scores = np.dot(self.all_keys, key.T) / (norm(self.all_keys, axis=1) * norm(key.T)) 108 | 109 | K_neighbour_keys = self.all_keys[np.argpartition(similarity_scores, -k)[-k:]] 110 | neighbours = [self.memory[nkey.tobytes()] for nkey in K_neighbour_keys] 111 | 112 | mean_prompts = torch.FloatTensor(np.mean(neighbours, axis=0)) # (dimension,) 113 | # batch = self._prepare_batch(neighbours, attention_weight) #TEST 114 | samples.append(mean_prompts) 115 | 116 | #attention_weight = np.dot(K_neighbour_keys, key.T) / (norm(K_neighbour_keys, axis=1) * norm(key.T)) 117 | #batch = self._prepare_batch(neighbours, attention_weight) 118 | #samples.append(batch) 119 | 120 | return torch.stack(samples), np.mean(similarity_scores) 121 | 122 | 123 | 124 | #---------------------------------------------------------------------------------------Contextual Evolution Memory 125 | class Memory_vft(object): 126 | """ 127 | Create the empty memory buffer 128 | """ 129 | 130 | def __init__(self, size, dimension=1 * 1536, key_dimension=1*768, alpha=0.1): #, key_dim=768, dim = 1536 131 | # self.memory = {} 132 | self.size = size 133 | self.dimension = dimension 134 | self.key_dimension = key_dimension 135 | self.alpha = alpha 136 | # self.dim = dim 137 | # self.key_dim = key_dim 138 | logits = torch.randn(size, dimension) 139 | #self.memory = {torch.randn(1, key_dimension): torch.randn(1, dimension) for _ in range(size)} 140 | self.memory = { 141 | torch.randn(1, key_dimension).numpy().tobytes(): torch.randn(1, dimension) 142 | for _ in range(size) 143 | } 144 | 145 | 146 | 147 | def reset(self): 148 | self.memory = {} 149 | 150 | def get_size(self): 
151 | return len(self.memory) 152 | 153 | def push(self, keys, logits): 154 | 155 | keys = keys.reshape(len(keys), self.key_dimension) 156 | for i, key in enumerate(keys): 157 | 158 | if len(self.memory.keys()) >= self.size: 159 | # Memory is full, find the nearest neighbours and update them 160 | #key = key.reshape(len(key), self.dimension) 161 | all_keys = np.frombuffer(np.asarray(list(self.memory.keys())), dtype=np.float32).reshape(self.get_size(), self.key_dimension) 162 | similarity_scores = np.dot(all_keys, key.T) / (norm(all_keys, axis=1) * norm(key.T)) 163 | top_k_indices = np.argsort(similarity_scores)[-5:] # Top-k indices with highest similarity 164 | for idx in top_k_indices: 165 | mem_key = all_keys[idx].tobytes() 166 | #mem_key = all_keys[idx].tobytes() 167 | top_k_logit = self.memory[mem_key] 168 | #self.memory[mem_key] = mo * top_k_logit + (1 - mo) * logits[i] 169 | self.memory[mem_key] = self.alpha * top_k_logit + (1 - self.alpha) * logits[i] 170 | 171 | else: 172 | self.memory.update({key.reshape(self.key_dimension).tobytes(): logits[i]}) 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | def _prepare_batch(self, sample, attention_weight): 181 | attention_weight = np.array(attention_weight / 0.2) 182 | attention_weight = np.exp(attention_weight) / (np.sum(np.exp(attention_weight))) 183 | ensemble_prediction = sample[0] * attention_weight[0] 184 | for i in range(1, len(sample)): 185 | ensemble_prediction = ensemble_prediction + sample[i] * attention_weight[i] 186 | 187 | return torch.FloatTensor(ensemble_prediction) 188 | 189 | def get_neighbours(self, keys, k): 190 | """ 191 | Returns samples from buffer using nearest neighbour approach 192 | """ 193 | samples = [] 194 | 195 | keys = keys.reshape(len(keys), self.dimension) 196 | total_keys = len(self.memory.keys()) 197 | self.all_keys = np.frombuffer( 198 | np.asarray(list(self.memory.keys())), dtype=np.float32).reshape(total_keys, self.dimension) 199 | 200 | for key in keys: 201 | similarity_scores = np.dot(self.all_keys, key.T) / (norm(self.all_keys, axis=1) * norm(key.T)) 202 | 203 | K_neighbour_keys = self.all_keys[np.argpartition(similarity_scores, -k)[-k:]] 204 | neighbours = [self.memory[nkey.tobytes()] for nkey in K_neighbour_keys] 205 | 206 | attention_weight = np.dot(K_neighbour_keys, key.T) / (norm(K_neighbour_keys, axis=1) * norm(key.T)) 207 | batch = self._prepare_batch(neighbours, attention_weight) 208 | samples.append(batch) 209 | 210 | return torch.stack(samples), np.mean(similarity_scores) 211 | 212 | def save_memory(self, file_path): 213 | with open(file_path, 'wb') as f: 214 | pickle.dump(self.memory, f) 215 | 216 | def load_memory(self, file_path): 217 | with open(file_path, 'rb') as f: 218 | self.memory = pickle.load(f) 219 | 220 | 221 | def get_topk(self, keys, k): 222 | 223 | samples = [] 224 | 225 | keys = keys.reshape(len(keys), self.key_dimension) #(num_keys, dimension) 226 | total_keys = len(self.memory.keys()) 227 | 228 | self.all_keys = np.frombuffer( 229 | np.asarray(list(self.memory.keys())), dtype=np.float32).reshape(total_keys, self.key_dimension) #(total_keys, dimension) 230 | 231 | for key in keys: 232 | similarity_scores = np.dot(self.all_keys, key.T) / (norm(self.all_keys, axis=1) * norm(key.T)) 233 | 234 | K_neighbour_keys = self.all_keys[np.argpartition(similarity_scores, -k)[-k:]] #(k, dimension) 235 | neighbours = [self.memory[nkey.tobytes()] for nkey in K_neighbour_keys] #(k,dimension) 236 | 237 | attention_weight = np.dot(K_neighbour_keys, key.T) / (norm(K_neighbour_keys, axis=1) * 
norm(key.T)) 238 | batch = self._prepare_batch(neighbours, attention_weight) #TEST 239 | samples.append(batch) 240 | 241 | return torch.stack(samples), np.mean(similarity_scores) 242 | # return samples, np.mean(similarity_scores) 243 | 244 | 245 | def get_topk_mean(self, keys, k): 246 | 247 | samples = [] 248 | 249 | keys = keys.reshape(len(keys), self.dimension) #(num_keys, dimension) 250 | total_keys = len(self.memory.keys()) 251 | 252 | self.all_keys = np.frombuffer( 253 | np.asarray(list(self.memory.keys())), dtype=np.float32).reshape(total_keys, self.dimension) #(total_keys, dimension) 254 | 255 | for key in keys: 256 | similarity_scores = np.dot(self.all_keys, key.T) / (norm(self.all_keys, axis=1) * norm(key.T)) 257 | 258 | K_neighbour_keys = self.all_keys[np.argpartition(similarity_scores, -k)[-k:]] #(k, dimension) 259 | neighbours = [self.memory[nkey.tobytes()] for nkey in K_neighbour_keys] #(k,dimension) 260 | 261 | # attention_weight = np.dot(K_neighbour_keys, key.T) / (norm(K_neighbour_keys, axis=1) * norm(key.T)) 262 | # mean_prompts = np.mean(neighbours, axis=0) # (dimension,) 263 | mean_prompts = torch.FloatTensor(np.mean(neighbours, axis=0)) # (dimension,) 264 | # batch = self._prepare_batch(neighbours, attention_weight) #TEST 265 | samples.append(mean_prompts) 266 | 267 | return torch.stack(samples), np.mean(similarity_scores) 268 | 269 | 270 | 271 | 272 | def retrieve_prompt_add_avg(self, avg_pano_embeds, combined, top_k=16): #now for usage of visual feature 273 | """ 274 | Retrieve top-k similar prompts from memory for each directional pano_embeds (1*12*768) 275 | and prepend them to form new pano_embeds with prompts. 276 | 277 | Args: 278 | avg_pano_embeds: Tensor of shape (1, 768) representing the average panoramic embedding. 279 | top_k: Number of top similar prompts to retrieve from memory. 280 | 281 | Returns: 282 | avg_pano_with_prompts: Tensor of shape (1, 768) with enhanced embeddings. 
283 | """ 284 | ud = 0.2 285 | # (Unused) list kept from an earlier per-direction variant 286 | pano_with_prompts = [] 287 | 288 | # (Unused) linear layer for the commented-out concatenate-and-project variant below 289 | linear_layer = torch.nn.Linear(in_features=1536, out_features=768) 290 | 291 | # Ensure the query embedding is a NumPy array before searching the memory 292 | if isinstance(avg_pano_embeds, torch.Tensor): 293 | avg_pano_embeds = avg_pano_embeds.detach().cpu().numpy() 294 | 295 | posprompts, _ = self.get_topk(keys=avg_pano_embeds, k=top_k) 296 | 297 | # Calculate the mean of the top-k prompts (1*768) 298 | #mean_prompt = torch.from_numpy(prompts).mean(dim=0, keepdim=True) 299 | #mean_prompt = prompts.mean(dim=0, keepdim=True) 300 | avg_pano_embeds = torch.from_numpy(avg_pano_embeds).float() 301 | combined_embeds = torch.from_numpy(combined).float() 302 | # Concatenate the mean_prompt with direction_embed 303 | #concatenated = torch.cat([mean_prompt, avg_pano_embeds], dim=-1) # (1, 1536) 304 | 305 | # Use the linear layer to project back to (1, 768) 306 | #enhanced_embed = linear_layer(concatenated) 307 | enhanced_embed = combined_embeds * (1-ud) + posprompts.squeeze(0) * ud 308 | 309 | # Add the enhanced result to the pano_with_prompts list 310 | #pano_with_prompts.append(enhanced_embed) 311 | 312 | # Return the blended embedding directly; shape (1, 768) 313 | pano_with_posprompts = enhanced_embed 314 | 315 | return pano_with_posprompts 316 | 317 | -------------------------------------------------------------------------------- /vlnce_baselines/models/etp/nerf.py: -------------------------------------------------------------------------------- 1 | import torch 2 | # torch.autograd.set_detect_anomaly(True) 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | import tinycudann as tcnn 7 | import math 8 | import scipy.signal 9 | import heapq 10 | import cv2 11 | from tqdm import tqdm 12 | 13 | # Misc 14 | img2mse = lambda x, y : torch.mean((x - y) ** 2) 15 | mse2psnr = lambda x : -10. 
* torch.log(x) / torch.log(torch.Tensor([10.])) 16 | 17 | 18 | # Model 19 | class NeRF(nn.Module): 20 | def __init__(self, D=4, W=256, input_ch=3, output_ch=4): 21 | """ 22 | """ 23 | super(NeRF, self).__init__() 24 | self.D = D 25 | self.W = W 26 | 27 | self.tcnn = tcnn.Network( 28 | n_input_dims=input_ch, 29 | n_output_dims=output_ch, 30 | network_config={ 31 | "otype": "CutlassMLP", 32 | "activation": "ReLU", 33 | "output_activation": "None", 34 | "n_neurons": W, 35 | "n_hidden_layers": D, 36 | }, 37 | ) 38 | 39 | def forward(self, x): 40 | outputs = self.tcnn(x) 41 | return outputs 42 | 43 | 44 | 45 | 46 | def config_parser(): 47 | 48 | import configargparse 49 | parser = configargparse.ArgumentParser() 50 | 51 | 52 | # training options 53 | parser.add_argument("--near", type=float, default=0., 54 | help='near distance') 55 | parser.add_argument("--far", type=float, default=10., 56 | help='far distance') 57 | parser.add_argument("--camera_hfov", type=float, default= 90., #79., # 58 | help='camera HFOV angle') 59 | parser.add_argument("--camera_vfov", type=float, default= 90., # 79., # 60 | help='camera VFOV angle') 61 | parser.add_argument("--pointcloud_search_radius", type=float, default=0.1, 62 | help='pointcloud_search_radius') 63 | parser.add_argument("--featurecloud_search_radius", type=float, default=1., 64 | help='featurecloud_search_radius') 65 | parser.add_argument("--pointcloud_search_num", type=int, default=16, 66 | help='pointcloud_search_num') 67 | parser.add_argument("--featurecloud_search_num", type=int, default=4, 68 | help='featurecloud_search_num') 69 | parser.add_argument("--featuremap_scale", type=int, default=8, 70 | help='featuremap_scale') 71 | parser.add_argument("--chunk", type=int, default=1024, 72 | help='number of rays processed in parallel, decrease if running out of memory') 73 | parser.add_argument("--feature_loss_weight", type=float, default=0.01, 74 | help='weight of the language embedded feature loss') 75 | 76 | 77 | parser.add_argument("--rgba_net_layers", type=int, default=8, 78 | help='layers in rgb network') 79 | parser.add_argument("--rgba_net_width", type=int, default=512, 80 | help='channels per layer in rgb net') 81 | parser.add_argument("--clip_net_layers", type=int, default=8, 82 | help='layers in clip network') 83 | parser.add_argument("--clip_net_width", type=int, default=512, 84 | help='channels per layer in clip net') 85 | 86 | parser.add_argument("--N_rand", type=int, default=14*14, 87 | help='batch size (number of random rays per gradient step)') 88 | 89 | 90 | # rendering options 91 | parser.add_argument("--N_samples", type=int, default=256, 92 | help='number of coarse samples per ray') 93 | parser.add_argument("--N_importance", type=int, default=8, 94 | help='number of fine samples per ray') 95 | 96 | 97 | ## blender flags 98 | parser.add_argument("--white_bkgd", action='store_true', 99 | help='set to render synthetic data on a white bkgd (always use for dvoxels)') 100 | 101 | 102 | 103 | return parser 104 | 105 | 106 | 107 | def create_nerf(): 108 | """Instantiate NeRF's MLP model. 
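Returns a 3-tuple (args, rgba_mlp, clip_mlp): the parsed NeRF options plus two tiny-cuda-nn MLPs built below with width = 512, an RGBA head (input_ch = width*2, output_ch = 4) and a CLIP-feature head (input_ch = width, output_ch = width + 1).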
109 | """ 110 | parser = config_parser() 111 | args, unknown = parser.parse_known_args() #parser.parse_args() 112 | np.random.seed(0) 113 | torch.manual_seed(0) 114 | torch.cuda.manual_seed(0) 115 | 116 | width = 512 117 | scale = width ** -0.5 118 | 119 | rgba_mlp = NeRF(D=args.rgba_net_layers, W=args.rgba_net_width, 120 | input_ch=width*2, output_ch=4) # RGBA 121 | 122 | clip_mlp = NeRF(D=args.clip_net_layers, W=args.clip_net_width, 123 | input_ch=width, output_ch=width+1) # CLIP+Alpha 124 | 125 | return args, rgba_mlp, clip_mlp 126 | 127 | 128 | def raw2feature(raw, z_vals): 129 | """Transforms model's predictions to semantically meaningful values. 130 | Args: 131 | raw: [num_rays, num_samples along ray, 4]. Prediction from model. 132 | z_vals: [num_rays, num_samples along ray]. Integration time. 133 | rays_d: [num_rays, 3]. Direction of each ray. 134 | Returns: 135 | feature_map: [num_rays, 512]. Estimated semantic feature of a ray. 136 | disp_map: [num_rays]. Disparity map. Inverse of depth map. 137 | acc_map: [num_rays]. Sum of weights along each ray. 138 | weights: [num_rays, num_samples]. Weights assigned to each sampled color. 139 | depth_map: [num_rays]. Estimated distance to object. 140 | """ 141 | raw2alpha = lambda raw, dists, act_fn=F.relu: 1.-torch.exp(-act_fn(raw)*dists) 142 | 143 | dists = z_vals[...,1:] - z_vals[...,:-1] 144 | dists = torch.cat([dists, torch.Tensor([1e10]).expand(dists[...,:1].shape).to(dists.device)], -1) # [N_rays, N_samples] 145 | 146 | #dists = dists * torch.norm(rays_d[...,None,:], dim=-1) 147 | #rgb = torch.sigmoid(raw[...,:3]) # [N_rays, N_samples, 3] 148 | feature = raw[...,:-1] 149 | 150 | alpha = raw2alpha(raw[...,-1], dists) # [N_rays, N_samples] 151 | 152 | weights = alpha * torch.cumprod(torch.cat([torch.ones((alpha.shape[0], 1)).to(dists.device), 1.-alpha + 1e-10], -1), -1)[:, :-1] 153 | feature_map = torch.sum(weights[...,None] * feature, -2) # [N_rays, 3] 154 | feature_map = feature_map / torch.linalg.norm(feature_map, dim=-1, keepdim=True) 155 | 156 | depth_map = torch.sum(weights * z_vals, -1) 157 | disp_map = 1./torch.max(1e-10 * torch.ones_like(depth_map), depth_map / torch.sum(weights, -1)) 158 | acc_map = torch.sum(weights, -1) 159 | 160 | return feature_map, disp_map, acc_map, weights, depth_map 161 | 162 | 163 | 164 | def get_rays(args, H, W): 165 | rel_y = np.expand_dims(np.linspace(args.near, args.far, args.N_samples),axis=0).repeat(H*W,axis=0) 166 | hfov_angle = np.deg2rad(args.camera_hfov) 167 | vfov_angle = np.deg2rad(args.camera_vfov) 168 | half_W = W//2 169 | half_H = H//2 170 | tan_xy = np.array(([[i/half_W+1/W] for i in range(-half_W,half_W)])*W,np.float32) * math.tan(hfov_angle/2.) 
171 | rel_x = rel_y * tan_xy 172 | rel_z = rel_y * (np.array([[i/half_H-1/H for i in range(half_H,-half_H,-1)]]*W,np.float32).T.reshape((-1,1)) * math.tan(vfov_angle/2.)) 173 | return (rel_x,rel_y,rel_z) 174 | 175 | 176 | def RGB_to_BGR(cvimg): 177 | pilimg = cvimg.copy() 178 | pilimg[:, :, 0] = cvimg[:, :, 2] 179 | pilimg[:, :, 2] = cvimg[:, :, 0] 180 | return pilimg 181 | 182 | 183 | def run_nerf_feature(args, model, scene_memory, position, direction, H=14, W=14): 184 | 185 | 186 | camera_x, camera_y, camera_z = position 187 | heading_angle = - direction 188 | scene_fts, patch_directions, patch_scales, fcd, fcd_tree, occupancy_pcd_tree = scene_memory 189 | patch_directions = torch.tensor(patch_directions + heading_angle, dtype=torch.float32).to("cuda") 190 | patch_directions = torch.cat((torch.sin(patch_directions).unsqueeze(-1), torch.cos(patch_directions).unsqueeze(-1)), dim=-1) 191 | patch_scales = torch.tensor(patch_scales, dtype=torch.float32).to("cuda").unsqueeze(-1) 192 | 193 | fcd_points = fcd.to("cuda") 194 | 195 | rel_x, rel_y, rel_z = get_rays(args, H, W) 196 | 197 | ray_x = rel_x * math.cos(heading_angle) + rel_y * math.sin(heading_angle) + camera_x 198 | ray_y = -rel_y * math.cos(heading_angle) + rel_x * math.sin(heading_angle) + camera_y 199 | ray_z = rel_z + camera_z 200 | ray_z_vals = rel_y 201 | 202 | 203 | ray_xyz = torch.tensor(np.concatenate((np.expand_dims(ray_x,-1),np.expand_dims(ray_y,-1),np.expand_dims(ray_z,-1)),axis=-1),dtype=torch.float32).to('cuda') 204 | 205 | occupancy_unit_length = 1**2 206 | with torch.no_grad(): 207 | occupancy_query = ray_xyz.view(-1,3) 208 | searched_occupancy_dists, searched_occupancy_inds = occupancy_pcd_tree.query(occupancy_query, nr_nns_searches=1) #Note that the cupy_kdtree distances are squared 209 | occupancy_map = (searched_occupancy_dists < occupancy_unit_length).view(-1,) 210 | occupancy_ray_xyz = ray_xyz.view(-1,3)[occupancy_map] 211 | 212 | occupancy_ray_k_neighbor_dists, occupancy_ray_k_neighbor_inds = fcd_tree.query(occupancy_ray_xyz, nr_nns_searches=args.featurecloud_search_num) 213 | 214 | searched_ray_k_neighbor_dists = torch.full((ray_xyz.shape[0]*ray_xyz.shape[1],args.featurecloud_search_num),args.featurecloud_search_radius,dtype=torch.float32).to('cuda') 215 | searched_ray_k_neighbor_dists[occupancy_map] = occupancy_ray_k_neighbor_dists 216 | 217 | searched_ray_k_neighbor_inds = torch.full((ray_xyz.shape[0]*ray_xyz.shape[1],args.featurecloud_search_num),-1,dtype=torch.int64).to('cuda') 218 | searched_ray_k_neighbor_inds[occupancy_map] = occupancy_ray_k_neighbor_inds 219 | 220 | searched_ray_k_neighbor_dists = torch.sqrt(searched_ray_k_neighbor_dists) #Note that the cupy_kdtree distances are squared 221 | searched_ray_k_neighbor_inds[searched_ray_k_neighbor_dists >= args.featurecloud_search_radius] = -1 222 | searched_ray_k_neighbor_dists[searched_ray_k_neighbor_dists >= args.featurecloud_search_radius] = args.featurecloud_search_radius 223 | 224 | 225 | searched_ray_k_neighbor_inds = searched_ray_k_neighbor_inds.view(ray_xyz.shape[0],ray_xyz.shape[1],args.featurecloud_search_num) 226 | searched_ray_k_neighbor_dists = searched_ray_k_neighbor_dists.view(ray_xyz.shape[0],ray_xyz.shape[1],args.featurecloud_search_num) 227 | 228 | sample_ray_xyz = torch.zeros((ray_xyz.shape[0],args.N_importance,3),dtype=torch.float32).to('cuda') 229 | sample_ray_z_vals = np.zeros((ray_xyz.shape[0],args.N_importance)) 230 | 231 | for i in range(ray_xyz.shape[0]): 232 | idx = searched_ray_k_neighbor_inds[i] 233 | tmp_distance = 
searched_ray_k_neighbor_dists[i].sum(-1) 234 | tmp_density = (1/tmp_distance).cpu().numpy().tolist() 235 | 236 | peaks,_ = scipy.signal.find_peaks(tmp_density,distance=1) 237 | topk = heapq.nlargest(args.N_importance, range(len(tmp_density)), tmp_density.__getitem__) 238 | k = max(args.N_importance//2, args.N_importance-len(peaks)) 239 | topk_peaks = topk[:k] 240 | topk_peaks.extend(peaks[:args.N_importance-k]) 241 | topk_peaks.sort() 242 | inds = np.array(topk_peaks,dtype=np.int64) 243 | sample_ray_xyz[i] = ray_xyz[i][torch.tensor(inds).to(ray_xyz.device)] 244 | sample_ray_z_vals[i] = ray_z_vals[i][inds] 245 | 246 | 247 | with torch.no_grad(): 248 | sample_feature_k_neighbor_dists, sample_feature_k_neighbor_inds = fcd_tree.query(sample_ray_xyz.view(-1,3), nr_nns_searches=args.featurecloud_search_num) 249 | 250 | sample_feature_k_neighbor_dists = torch.sqrt(sample_feature_k_neighbor_dists) #Note that the cupy_kdtree distances are squared 251 | sample_feature_k_neighbor_inds[sample_feature_k_neighbor_dists >= args.featurecloud_search_radius] = -1 252 | sample_feature_k_neighbor_dists[sample_feature_k_neighbor_dists >= args.featurecloud_search_radius] = args.featurecloud_search_radius 253 | sample_feature_k_neighbor_inds = sample_feature_k_neighbor_inds.view(sample_ray_xyz.shape[0],sample_ray_xyz.shape[1],args.featurecloud_search_num) 254 | sample_feature_k_neighbor_dists = sample_feature_k_neighbor_dists.view(sample_ray_xyz.shape[0],sample_ray_xyz.shape[1],args.featurecloud_search_num) 255 | 256 | 257 | sample_ft_neighbor_xyzds = torch.zeros((sample_ray_xyz.shape[0],sample_ray_xyz.shape[1],args.featurecloud_search_num,6),dtype=torch.float32).to('cuda') 258 | 259 | idx = sample_feature_k_neighbor_inds 260 | sample_ft_neighbor_xyzds[...,:3] = fcd_points[idx] - sample_ray_xyz.unsqueeze(-2) 261 | 262 | sample_ft_neighbor_x = sample_ft_neighbor_xyzds[...,0] 263 | sample_ft_neighbor_y = sample_ft_neighbor_xyzds[...,1] 264 | 265 | # Get the relative angle to the NeRF camera, so the rotation angle is - heading_angle 266 | sample_ft_neighbor_xyzds[...,0] = sample_ft_neighbor_x * math.cos(-heading_angle) + sample_ft_neighbor_y * math.sin(-heading_angle) 267 | sample_ft_neighbor_xyzds[...,1] = -sample_ft_neighbor_y * math.cos(-heading_angle) + sample_ft_neighbor_x * math.sin(-heading_angle) 268 | 269 | sample_ft_neighbor_xyzds[...,:3][idx==-1] = args.far 270 | sample_ft_neighbor_xyzds[...,3:5] = patch_directions[idx] 271 | sample_ft_neighbor_xyzds[...,3:5][idx==-1] = 0 272 | sample_ft_neighbor_xyzds[...,5:] = patch_scales[idx] 273 | sample_ft_neighbor_xyzds[...,5:][idx==-1] = 0 274 | 275 | sample_ft_neighbor_embedding = scene_fts[idx.cpu().numpy()] 276 | sample_ft_neighbor_embedding = torch.tensor(sample_ft_neighbor_embedding,dtype=torch.float32).to('cuda') 277 | sample_ft_neighbor_embedding[idx==-1] = 0 278 | 279 | 280 | sample_ft_neighbor_xyzds = model.fcd_position_embedding(sample_ft_neighbor_xyzds) 281 | sample_ft = model.fcd_aggregation( (sample_ft_neighbor_embedding + sample_ft_neighbor_xyzds).view(-1,args.N_importance, args.featurecloud_search_num*512) ) 282 | 283 | sample_input = sample_ft 284 | 285 | sample_feature = model.clip_mlp(sample_input.view(-1,sample_input.shape[-1])).view(-1,args.N_importance,512+1) 286 | 287 | sample_ray_z_vals = torch.tensor(sample_ray_z_vals,dtype=torch.float32).to('cuda') 288 | feature_map, disp_map, acc_map, weights, depth_map = raw2feature(sample_feature, sample_ray_z_vals.view(-1,args.N_importance)) 289 | 290 | transformer_input = 
torch.cat((model.class_embedding,feature_map),dim=0) 291 | transformer_input = transformer_input + model.positional_embedding 292 | 293 | predicted_fts = model.nerf_view_encoder(transformer_input.unsqueeze(0)).squeeze(0) 294 | 295 | return predicted_fts 296 | 297 | 298 | 299 | -------------------------------------------------------------------------------- /vlnce_baselines/models/graph_utils.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import numpy as np 3 | from copy import deepcopy 4 | import networkx as nx 5 | import matplotlib.pyplot as plt 6 | from habitat.tasks.utils import cartesian_to_polar 7 | from habitat.utils.geometry_utils import quaternion_rotate_vector, quaternion_from_coeff 8 | import math 9 | 10 | MAX_DIST = 30 11 | MAX_STEP = 10 12 | # NOISE = 0.5 13 | 14 | def calc_position_distance(a, b): 15 | # a, b: (x, y, z) 16 | dx = b[0] - a[0] 17 | dy = b[1] - a[1] 18 | dz = b[2] - a[2] 19 | dist = np.sqrt(dx**2 + dy**2 + dz**2) 20 | return dist 21 | 22 | def calculate_vp_rel_pos_fts(a, b, base_heading=0, base_elevation=0, to_clock=False): 23 | # a, b: (x, y, z) 24 | dx = b[0] - a[0] 25 | dy = b[1] - a[1] 26 | dz = b[2] - a[2] 27 | # xy_dist = max(np.sqrt(dx**2 + dy**2), 1e-8) 28 | xz_dist = max(np.sqrt(dx**2 + dz**2), 1e-8) 29 | xyz_dist = max(np.sqrt(dx**2 + dy**2 + dz**2), 1e-8) 30 | 31 | # the simulator's api is weired (x-y axis is transposed) 32 | # heading = np.arcsin(dx/xy_dist) # [-pi/2, pi/2] 33 | heading = np.arcsin(-dx / xz_dist) # [-pi/2, pi/2] 34 | # if b[1] < a[1]: 35 | # heading = np.pi - heading 36 | if b[2] > a[2]: 37 | heading = np.pi - heading 38 | heading -= base_heading 39 | if to_clock: 40 | heading = 2 * np.pi - heading 41 | 42 | elevation = np.arcsin(dz / xyz_dist) # [-pi/2, pi/2] 43 | elevation -= base_elevation 44 | 45 | return heading, elevation, xyz_dist 46 | 47 | def get_angle_fts(headings, elevations, angle_feat_size): 48 | ang_fts = [np.sin(headings), np.cos(headings), np.sin(elevations), np.cos(elevations)] 49 | ang_fts = np.vstack(ang_fts).transpose().astype(np.float32) 50 | num_repeats = angle_feat_size // 4 51 | if num_repeats > 1: 52 | ang_fts = np.concatenate([ang_fts] * num_repeats, 1) 53 | return ang_fts 54 | 55 | def heading_from_quaternion(quat: np.array): 56 | # https://github.com/facebookresearch/habitat-lab/blob/v0.1.7/habitat/tasks/nav/nav.py#L356 57 | quat = quaternion_from_coeff(quat) 58 | heading_vector = quaternion_rotate_vector(quat.inverse(), np.array([0, 0, -1])) 59 | phi = cartesian_to_polar(-heading_vector[2], heading_vector[0])[1] 60 | return phi % (2 * np.pi) 61 | 62 | def estimate_cand_pos(pos, ori, ang, dis): 63 | cand_num = len(ang) 64 | cand_pos = np.zeros([cand_num, 3]) 65 | 66 | ang = np.array(ang) 67 | dis = np.array(dis) 68 | ang = (heading_from_quaternion(ori) + ang) % (2 * np.pi) 69 | cand_pos[:, 0] = pos[0] - dis * np.sin(ang) # x 70 | cand_pos[:, 1] = pos[1] # y 71 | cand_pos[:, 2] = pos[2] - dis * np.cos(ang) # z 72 | return cand_pos 73 | 74 | 75 | class FloydGraph(object): 76 | def __init__(self): 77 | self._dis = defaultdict(lambda :defaultdict(lambda: 95959595)) 78 | self._point = defaultdict(lambda :defaultdict(lambda: "")) 79 | self._visited = set() 80 | 81 | def distance(self, x, y): 82 | if x == y: 83 | return 0 84 | else: 85 | return self._dis[x][y] 86 | 87 | def add_edge(self, x, y, dis): 88 | if dis < self._dis[x][y]: 89 | self._dis[x][y] = dis 90 | self._dis[y][x] = dis 91 | self._point[x][y] = "" 92 | self._point[y][x] 
= "" 93 | 94 | def update(self, k): 95 | for x in self._dis: 96 | for y in self._dis: 97 | if x != y and x !=k and y != k: 98 | t_dis = self._dis[x][y] + self._dis[y][k] 99 | if t_dis < self._dis[x][k]: 100 | self._dis[x][k] = t_dis 101 | self._dis[k][x] = t_dis 102 | self._point[x][k] = y 103 | self._point[k][x] = y 104 | 105 | for x in self._dis: 106 | for y in self._dis: 107 | if x != y: 108 | t_dis = self._dis[x][k] + self._dis[k][y] 109 | if t_dis < self._dis[x][y]: 110 | self._dis[x][y] = t_dis 111 | self._dis[y][x] = t_dis 112 | self._point[x][y] = k 113 | self._point[y][x] = k 114 | 115 | self._visited.add(k) 116 | 117 | def visited(self, k): 118 | return (k in self._visited) 119 | 120 | def path(self, x, y): 121 | """ 122 | :param x: start 123 | :param y: end 124 | :return: the path from x to y [v1, v2, ..., v_n, y] 125 | """ 126 | if x == y: 127 | return [] 128 | if self._point[x][y] == "": # Direct edge 129 | return [y] 130 | else: 131 | k = self._point[x][y] 132 | # print(x, y, k) 133 | # for x1 in (x, k, y): 134 | # for x2 in (x, k, y): 135 | # print(x1, x2, "%.4f" % self._dis[x1][x2]) 136 | return self.path(x, k) + self.path(k, y) 137 | 138 | 139 | class GraphMap(object): 140 | def __init__(self, has_real_pos, loc_noise, merge_ghost, ghost_aug): 141 | 142 | self.graph_nx = nx.Graph() 143 | 144 | self.node_pos = {} # viewpoint to position (x, y, z) 145 | self.node_embeds = {} # viewpoint to pano feature 146 | self.node_stepId = {} 147 | 148 | self.ghost_cnt = 0 # id to create ghost 149 | self.ghost_pos = {} 150 | self.ghost_mean_pos = {} 151 | self.ghost_embeds = {} # viewpoint to single_view feature 152 | self.ghost_fronts = {} # viewpoint to front_vp id 153 | self.ghost_real_pos = {} # for training 154 | self.has_real_pos = has_real_pos 155 | self.merge_ghost = merge_ghost 156 | self.ghost_aug = ghost_aug # 0 ~ 1, noise level 157 | self.loc_noise = loc_noise 158 | 159 | self.shortest_path = None 160 | self.shortest_dist = None 161 | 162 | self.node_stop_scores = {} # viewpoint to stop_score 163 | 164 | def _localize(self, qpos, kpos_dict, ignore_height=False): 165 | min_dis = 10000 166 | min_vp = None 167 | for kvp, kpos in kpos_dict.items(): 168 | if ignore_height: 169 | dis = ((qpos[[0,2]] - kpos[[0,2]])**2).sum()**0.5 170 | else: 171 | dis = ((qpos - kpos)**2).sum()**0.5 172 | if dis < min_dis: 173 | min_dis = dis 174 | min_vp = kvp 175 | min_vp = None if min_dis > self.loc_noise else min_vp 176 | return min_vp 177 | 178 | def identify_node(self, cur_pos, cur_ori, cand_ang, cand_dis): 179 | # assume no repeated node 180 | # since action is restricted to ghosts 181 | cur_vp = str(len(self.node_pos)) 182 | cand_vp = [f'{cur_vp}_{str(i)}' for i in range(len(cand_ang))] 183 | cand_pos = [p for p in estimate_cand_pos(cur_pos, cur_ori, cand_ang, cand_dis)] 184 | return cur_vp, cand_vp, cand_pos 185 | 186 | def delete_ghost(self, vp): 187 | self.ghost_pos.pop(vp) 188 | self.ghost_mean_pos.pop(vp) 189 | self.ghost_embeds.pop(vp) 190 | self.ghost_fronts.pop(vp) 191 | if self.has_real_pos: 192 | self.ghost_real_pos.pop(vp) 193 | 194 | def update_graph(self, prev_vp, step_id, 195 | cur_vp, cur_pos, cur_embeds, 196 | cand_vp, cand_pos, cand_embeds, 197 | cand_real_pos): 198 | # 1. connect prev_vp 199 | self.graph_nx.add_node(cur_vp) 200 | if prev_vp is not None: 201 | prev_pos = self.node_pos[prev_vp] 202 | dis = calc_position_distance(prev_pos, cur_pos) 203 | self.graph_nx.add_edge(prev_vp, cur_vp, weight=dis) 204 | 205 | # 2. 
update node & ghost info 206 | self.node_pos[cur_vp] = cur_pos 207 | self.node_embeds[cur_vp] = cur_embeds 208 | self.node_stepId[cur_vp] = step_id 209 | 210 | gvp_list = [] 211 | 212 | for i, (cvp, cpos, cembeds) in enumerate(zip(cand_vp, cand_pos, cand_embeds)): 213 | localized_nvp = self._localize(cpos, self.node_pos) 214 | # cand overlap with node, connect cur_vp with localized_nvp 215 | if localized_nvp is not None : 216 | dis = calc_position_distance(cur_pos, self.node_pos[localized_nvp]) 217 | self.graph_nx.add_edge(cur_vp, localized_nvp, weight=dis) 218 | 219 | gvp_list.append(localized_nvp) 220 | # cand not overlap with node, create/update ghost 221 | else: 222 | if self.merge_ghost: 223 | localized_gvp = self._localize(cpos, self.ghost_mean_pos) 224 | # create ghost 225 | if localized_gvp is None: 226 | gvp = f'g{str(self.ghost_cnt)}' 227 | self.ghost_cnt += 1 228 | self.ghost_pos[gvp] = [cpos] 229 | self.ghost_mean_pos[gvp] = cpos 230 | self.ghost_embeds[gvp] = [cembeds, 1] 231 | self.ghost_fronts[gvp] = [cur_vp] 232 | if self.has_real_pos: 233 | self.ghost_real_pos[gvp] = [cand_real_pos[i]] 234 | 235 | # update ghost 236 | else: 237 | gvp = localized_gvp 238 | self.ghost_pos[gvp].append(cpos) 239 | self.ghost_mean_pos[gvp] = np.mean(self.ghost_pos[gvp], axis=0) 240 | self.ghost_embeds[gvp][0] = self.ghost_embeds[gvp][0] + cembeds 241 | self.ghost_embeds[gvp][1] += 1 242 | self.ghost_fronts[gvp].append(cur_vp) 243 | if self.has_real_pos: 244 | self.ghost_real_pos[gvp].append(cand_real_pos[i]) 245 | 246 | else: 247 | gvp = f'g{str(self.ghost_cnt)}' 248 | self.ghost_cnt += 1 249 | self.ghost_pos[gvp] = [cpos] 250 | self.ghost_mean_pos[gvp] = cpos 251 | self.ghost_embeds[gvp] = [cembeds, 1] 252 | self.ghost_fronts[gvp] = [cur_vp] 253 | if self.has_real_pos: 254 | self.ghost_real_pos[gvp] = [cand_real_pos[i]] 255 | 256 | gvp_list.append(gvp) 257 | 258 | self.ghost_aug_pos = deepcopy(self.ghost_mean_pos) 259 | if self.ghost_aug != 0: 260 | for gvp, gpos in self.ghost_aug_pos.items(): 261 | gpos_noise = np.random.normal(loc=(0,0,0), scale=(self.ghost_aug,0,self.ghost_aug), size=(3,)) 262 | gpos_noise[gpos_noise < -self.ghost_aug] = -self.ghost_aug 263 | gpos_noise[gpos_noise > self.ghost_aug] = self.ghost_aug 264 | self.ghost_aug_pos[gvp] = gpos + gpos_noise 265 | 266 | self.shortest_path = dict(nx.all_pairs_dijkstra_path(self.graph_nx)) 267 | self.shortest_dist = dict(nx.all_pairs_dijkstra_path_length(self.graph_nx)) 268 | return gvp_list 269 | 270 | 271 | def update_graph_no_overlap(self, prev_vp, step_id, 272 | cur_vp, cur_pos, cur_embeds, 273 | cand_vp, cand_pos, cand_embeds, cand_angles, 274 | cand_real_pos): 275 | # 1. connect prev_vp 276 | self.graph_nx.add_node(cur_vp) 277 | if prev_vp is not None: 278 | prev_pos = self.node_pos[prev_vp] 279 | dis = calc_position_distance(prev_pos, cur_pos) 280 | self.graph_nx.add_edge(prev_vp, cur_vp, weight=dis) 281 | 282 | # 2. 
update node & ghost info 283 | self.node_pos[cur_vp] = cur_pos 284 | self.node_embeds[cur_vp] = cur_embeds 285 | self.node_stepId[cur_vp] = step_id 286 | 287 | gvp_list = [] 288 | 289 | for i, (cvp, cpos, cembeds, cangles) in enumerate(zip(cand_vp, cand_pos, cand_embeds, cand_angles)): 290 | 291 | if i != 0 and i != len(cand_angles)-1 and 1/2 * math.pi < cangles and cangles < 3/2 * math.pi: 292 | continue 293 | 294 | gvp = f'g{str(self.ghost_cnt)}' 295 | self.ghost_cnt += 1 296 | self.ghost_pos[gvp] = [cpos] 297 | self.ghost_mean_pos[gvp] = cpos 298 | self.ghost_embeds[gvp] = [cembeds, 1] 299 | self.ghost_fronts[gvp] = [cur_vp] 300 | if self.has_real_pos: 301 | self.ghost_real_pos[gvp] = [cand_real_pos[i]] 302 | 303 | gvp_list.append(gvp) 304 | 305 | self.ghost_aug_pos = deepcopy(self.ghost_mean_pos) 306 | if self.ghost_aug != 0: 307 | for gvp, gpos in self.ghost_aug_pos.items(): 308 | gpos_noise = np.random.normal(loc=(0,0,0), scale=(self.ghost_aug,0,self.ghost_aug), size=(3,)) 309 | gpos_noise[gpos_noise < -self.ghost_aug] = -self.ghost_aug 310 | gpos_noise[gpos_noise > self.ghost_aug] = self.ghost_aug 311 | self.ghost_aug_pos[gvp] = gpos + gpos_noise 312 | 313 | self.shortest_path = dict(nx.all_pairs_dijkstra_path(self.graph_nx)) 314 | self.shortest_dist = dict(nx.all_pairs_dijkstra_path_length(self.graph_nx)) 315 | return gvp_list 316 | 317 | def front_to_ghost_dist(self, ghost_vp): 318 | # assume the nearest front 319 | min_dis = 10000 320 | min_front = None 321 | for front_vp in self.ghost_fronts[ghost_vp]: 322 | dis = calc_position_distance( 323 | self.node_pos[front_vp], self.ghost_aug_pos[ghost_vp] 324 | ) 325 | if dis < min_dis: 326 | min_dis = dis 327 | min_front = front_vp 328 | return min_dis, min_front 329 | 330 | def get_node_embeds(self, vp): 331 | if not vp.startswith('g'): 332 | return self.node_embeds[vp] 333 | else: 334 | return self.ghost_embeds[vp][0] / self.ghost_embeds[vp][1] 335 | 336 | def get_pos_fts(self, cur_vp, cur_pos, cur_ori, gmap_vp_ids): 337 | # dim=7 (sin(heading), cos(heading), sin(elevation), cos(elevation), 338 | # line_dist, shortest_dist, shortest_step) 339 | rel_angles, rel_dists = [], [] 340 | for vp in gmap_vp_ids: 341 | if vp is None: 342 | rel_angles.append([0, 0]) 343 | rel_dists.append([0, 0, 0]) 344 | # for ghost 345 | elif vp.startswith('g'): 346 | base_heading = heading_from_quaternion(cur_ori) 347 | base_elevation = 0 348 | vp_pos = self.ghost_aug_pos[vp] 349 | rel_heading, rel_elevation, rel_dist = calculate_vp_rel_pos_fts( 350 | cur_pos, vp_pos, base_heading, base_elevation, to_clock=True, 351 | ) 352 | rel_angles.append([rel_heading, rel_elevation]) 353 | front_dis, front_vp = self.front_to_ghost_dist(vp) 354 | shortest_dist = self.shortest_dist[cur_vp][front_vp] + front_dis 355 | shortest_step = len(self.shortest_path[cur_vp][front_vp]) + 1 356 | rel_dists.append( 357 | [rel_dist / MAX_DIST, 358 | shortest_dist / MAX_DIST, 359 | shortest_step / MAX_STEP] 360 | ) 361 | # for node 362 | else: 363 | base_heading = heading_from_quaternion(cur_ori) 364 | base_elevation = 0 365 | vp_pos = self.node_pos[vp] 366 | rel_heading, rel_elevation, rel_dist = calculate_vp_rel_pos_fts( 367 | cur_pos, vp_pos, base_heading, base_elevation, to_clock=True, 368 | ) 369 | rel_angles.append([rel_heading, rel_elevation]) 370 | shortest_dist = self.shortest_dist[cur_vp][vp] 371 | shortest_step = len(self.shortest_path[cur_vp][vp]) 372 | rel_dists.append( 373 | [rel_dist / MAX_DIST, 374 | shortest_dist / MAX_DIST, 375 | shortest_step / MAX_STEP] 376 | ) 377 | 
rel_angles = np.array(rel_angles).astype(np.float32) 378 | rel_dists = np.array(rel_dists).astype(np.float32) 379 | rel_ang_fts = get_angle_fts(rel_angles[:, 0], rel_angles[:, 1], angle_feat_size=4) 380 | return np.concatenate([rel_ang_fts, rel_dists], 1) --------------------------------------------------------------------------------
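Usage note (illustrative sketch, not part of the repository sources): the FloydGraph helper in vlnce_baselines/models/graph_utils.py maintains all-pairs shortest paths incrementally via update(k). A minimal example, assuming the project environment (habitat, networkx) is installed so the module imports cleanly:

from vlnce_baselines.models.graph_utils import FloydGraph

g = FloydGraph()
# Register symmetric edge costs between three viewpoints.
g.add_edge("A", "B", 1.0)
g.add_edge("B", "C", 2.0)
g.add_edge("A", "C", 5.0)
# Relax through each known node so distances and backpointers are up to date.
for node in ("A", "B", "C"):
    g.update(node)

print(g.distance("A", "C"))  # 3.0 -- the detour through "B" beats the direct 5.0 edge
print(g.path("A", "C"))      # ['B', 'C'] -- intermediate hop(s) followed by the goal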