├── habitat_extensions ├── config │ ├── __init__.py │ ├── vlnce_task.yaml │ ├── vlnce_task_aug.yaml │ └── default.py ├── __init__.py ├── task.py ├── shortest_path_follower.py ├── utils.py ├── measures.py └── sensors.py ├── vlnce_baselines ├── config │ ├── __init__.py │ ├── CMA_AUG.yaml │ ├── CMA_AUG_DA_TUNE.yaml │ └── default.py ├── models │ ├── __init__.py │ ├── encoders │ │ ├── instruction_encoder.py │ │ ├── resnet_encoders.py │ │ ├── unet_encoder.py │ │ └── map_encoder.py │ ├── policy.py │ ├── mg_map_policy.py │ └── ddppo_policy.py ├── __init__.py ├── common │ ├── aux_losses.py │ ├── distributions.py │ ├── env_utils.py │ ├── environments.py │ ├── action_maker.py │ ├── utils.py │ └── rgb_mapping.py └── common_trainer.py ├── img └── framework.png ├── requirements.txt ├── SETUP.md ├── .gitignore ├── run.py └── README.md /habitat_extensions/config/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vlnce_baselines/config/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vlnce_baselines/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /img/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PeihaoChen/WS-MGMap/HEAD/img/framework.png -------------------------------------------------------------------------------- /vlnce_baselines/__init__.py: -------------------------------------------------------------------------------- 1 | from vlnce_baselines import dagger_trainer 2 | from vlnce_baselines.common import environments 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | attrs>=19.1.0 2 | dtw==1.4.0 3 | fastdtw==0.3.4 4 | gym==0.10.9 5 | lmdb 6 | msgpack_numpy 7 | numpy 8 | torch>=1.3.1 9 | torchvision==0.2.2.post3 10 | tqdm>=4.0.0 11 | -------------------------------------------------------------------------------- /habitat_extensions/__init__.py: -------------------------------------------------------------------------------- 1 | from habitat_extensions import measures, sensors 2 | from habitat_extensions.config.default import get_extended_config 3 | from habitat_extensions.task import VLNCEDatasetV1 4 | -------------------------------------------------------------------------------- /vlnce_baselines/config/CMA_AUG.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task_aug.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_PROCESSES: 5 5 | TENSORBOARD_DIR: data/tensorboard_dirs/cma_aug 6 | CHECKPOINT_FOLDER: data/checkpoints/cma_aug 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/cma_aug 8 | 9 | SENSORS: ['RGB_SENSOR', 'DEPTH_SENSOR'] 10 | 11 | EVAL: 12 | USE_CKPT_CONFIG: False 13 | SPLIT: val_unseen 14 | EPISODE_COUNT: 50000 15 | 16 | DAGGER: 17 | ITERATIONS: 1 18 | EPOCHS: 30 19 | UPDATE_SIZE: 157232 20 | BATCH_SIZE: 8 21 | P: 1.0 22 | PRELOAD_LMDB_FEATURES: False 23 | LMDB_FEATURES_DIR: /mnt/cephfs/dataset/VLN-CE/result/jidongyu/_train_seen_data/trajectories.lmdb 24 | 25 | same_level_train: False 
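# Documentation-only notes (assumptions based on the README's two-stage recipe):
# with DAGGER.ITERATIONS: 1 and DAGGER.P: 1.0 this config corresponds to the
# stage-1 teacher-forcing run on the augmented split, while CMA_AUG_DA_TUNE.yaml
# raises ITERATIONS and lowers P for the stage-2 DAgger fine-tuning.
# LMDB_FEATURES_DIR above is a cluster-specific path; override it (or set
# PRELOAD_LMDB_FEATURES accordingly) for your own environment.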
26 | -------------------------------------------------------------------------------- /vlnce_baselines/config/CMA_AUG_DA_TUNE.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_PROCESSES: 5 5 | TENSORBOARD_DIR: data/tensorboard_dirs/cma_aug_da_tune 6 | CHECKPOINT_FOLDER: data/checkpoints/cma_aug_da_tune 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/cma_aug_da_tune 8 | 9 | SENSORS: ['RGB_SENSOR', 'DEPTH_SENSOR'] 10 | 11 | EVAL: 12 | USE_CKPT_CONFIG: False 13 | SPLIT: val_unseen 14 | EPISODE_COUNT: 50000 15 | 16 | DAGGER: 17 | ITERATIONS: 10 18 | EPOCHS: 4 19 | UPDATE_SIZE: 5000 20 | BATCH_SIZE: 8 21 | P: 0.5 22 | PRELOAD_LMDB_FEATURES: False 23 | LMDB_FEATURES_DIR: /mnt/cephfs/dataset/VLN-CE/result/jidongyu/_train_seen_data/trajectories.lmdb 24 | LOAD_FROM_CKPT: True 25 | CKPT_TO_LOAD: /mnt/cephfs/dataset/VLN-CE/result/jidongyu/_exp_4/IL_RgbMap_Step3_SegPred-Alpha0.1_KlLoss-Tau0.07_DataAug/run_train_base/checkpoint/ckpt.12.pth 26 | -------------------------------------------------------------------------------- /vlnce_baselines/common/aux_losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class _AuxLosses: 5 | def __init__(self): 6 | self._losses = {} 7 | self._loss_alphas = {} 8 | self._is_active = False 9 | 10 | def clear(self): 11 | self._losses.clear() 12 | self._loss_alphas.clear() 13 | 14 | def register_loss(self, name, loss, alpha=1.0): 15 | assert self.is_active() 16 | assert name not in self._losses 17 | 18 | self._losses[name] = loss 19 | self._loss_alphas[name] = alpha 20 | 21 | def get_loss(self, name): 22 | return self._losses[name] 23 | 24 | def reduce(self, mask=None): 25 | assert self.is_active() 26 | total = 0.0 27 | 28 | for k in self._losses.keys(): 29 | if mask is not None: 30 | k_loss = torch.masked_select(self._losses[k], mask).mean() 31 | else: 32 | k_loss = self._losses[k].mean() 33 | total = total + self._loss_alphas[k] * k_loss 34 | 35 | return total 36 | 37 | def is_active(self): 38 | return self._is_active 39 | 40 | def activate(self): 41 | self._is_active = True 42 | 43 | def deactivate(self): 44 | self._is_active = False 45 | 46 | 47 | AuxLosses = _AuxLosses() 48 | -------------------------------------------------------------------------------- /habitat_extensions/config/vlnce_task.yaml: -------------------------------------------------------------------------------- 1 | ENVIRONMENT: 2 | MAX_EPISODE_STEPS: 500 3 | SIMULATOR: 4 | AGENT_0: 5 | SENSORS: [RGB_SENSOR, DEPTH_SENSOR] 6 | FORWARD_STEP_SIZE: 0.25 7 | TURN_ANGLE: 15 8 | HABITAT_SIM_V0: 9 | GPU_DEVICE_ID: 0 10 | ALLOW_SLIDING: True 11 | RGB_SENSOR: 12 | WIDTH: 224 13 | HEIGHT: 224 14 | HFOV: 90 15 | TYPE: HabitatSimRGBSensor 16 | DEPTH_SENSOR: 17 | WIDTH: 256 # pretrained DDPPO resnet needs 256x256 18 | HEIGHT: 256 19 | SEMANTIC_SENSOR: 20 | WIDTH: 256 21 | HEIGHT: 256 22 | TASK: 23 | TYPE: VLN-v0 24 | SUCCESS_DISTANCE: 3.0 25 | SENSORS: [ 26 | INSTRUCTION_SENSOR, 27 | VLN_ORACLE_ACTION_SENSOR, 28 | VLN_ORACLE_PROGRESS_SENSOR, 29 | VLN_ORACLE_WAYPOINT_SENSOR, 30 | VLN_ORACLE_PATH_SENSOR, 31 | HEADING_SENSOR, 32 | COMPASS_SENSOR, 33 | GPS_SENSOR, 34 | GT_SEMANTIC_MAP_SENSOR, 35 | ] 36 | INSTRUCTION_SENSOR_UUID: instruction 37 | POSSIBLE_ACTIONS: [STOP, MOVE_FORWARD, TURN_LEFT, TURN_RIGHT] 38 | MEASUREMENTS: [ 39 | DISTANCE_TO_GOAL, 40 | SUCCESS, 41 | SPL, 42 | NDTW, 43 | PATH_LENGTH, 44 | 
ORACLE_SUCCESS, 45 | STEPS_TAKEN 46 | ] 47 | SUCCESS: 48 | SUCCESS_DISTANCE: 3.0 49 | SPL: 50 | SUCCESS_DISTANCE: 3.0 51 | NDTW: 52 | SUCCESS_DISTANCE: 3.0 53 | GT_PATH: data/datasets/R2R_VLNCE_v1-2_preprocessed/{split}/{split}_gt.json.gz 54 | SDTW: 55 | SUCCESS_DISTANCE: 3.0 56 | GT_PATH: data/datasets/R2R_VLNCE_v1-2_preprocessed/{split}/{split}_gt.json.gz 57 | ORACLE_SUCCESS: 58 | SUCCESS_DISTANCE: 3.0 59 | DATASET: 60 | TYPE: VLN-CE-v1 61 | SPLIT: train 62 | DATA_PATH: data/datasets/R2R_VLNCE_v1-2_preprocessed/{split}/{split}.json.gz 63 | SCENES_DIR: data/scene_datasets/ 64 | -------------------------------------------------------------------------------- /habitat_extensions/config/vlnce_task_aug.yaml: -------------------------------------------------------------------------------- 1 | # Same as vlnce_task.yaml but with a dataset split that 2 | # contains both the training and EnvDrop episodes. 3 | 4 | ENVIRONMENT: 5 | MAX_EPISODE_STEPS: 500 6 | SIMULATOR: 7 | AGENT_0: 8 | SENSORS: [RGB_SENSOR, DEPTH_SENSOR] 9 | FORWARD_STEP_SIZE: 0.25 10 | TURN_ANGLE: 15 11 | HABITAT_SIM_V0: 12 | GPU_DEVICE_ID: 0 13 | ALLOW_SLIDING: True 14 | RGB_SENSOR: 15 | WIDTH: 224 16 | HEIGHT: 224 17 | HFOV: 90 18 | TYPE: HabitatSimRGBSensor 19 | DEPTH_SENSOR: 20 | WIDTH: 256 # pretrained DDPPO resnet needs 256x256 21 | HEIGHT: 256 22 | SEMANTIC_SENSOR: 23 | WIDTH: 256 24 | HEIGHT: 256 25 | TASK: 26 | TYPE: VLN-v0 27 | SUCCESS_DISTANCE: 3.0 28 | SENSORS: [ 29 | INSTRUCTION_SENSOR, 30 | VLN_ORACLE_ACTION_SENSOR, 31 | VLN_ORACLE_PROGRESS_SENSOR, 32 | VLN_ORACLE_WAYPOINT_SENSOR, 33 | VLN_ORACLE_PATH_SENSOR, 34 | HEADING_SENSOR, 35 | COMPASS_SENSOR, 36 | GPS_SENSOR, 37 | GT_SEMANTIC_MAP_SENSOR, 38 | ] 39 | INSTRUCTION_SENSOR_UUID: instruction 40 | POSSIBLE_ACTIONS: [STOP, MOVE_FORWARD, TURN_LEFT, TURN_RIGHT] 41 | MEASUREMENTS: [ 42 | DISTANCE_TO_GOAL, 43 | SUCCESS, 44 | SPL, 45 | NDTW, 46 | PATH_LENGTH, 47 | ORACLE_SUCCESS, 48 | STEPS_TAKEN 49 | ] 50 | SUCCESS: 51 | SUCCESS_DISTANCE: 3.0 52 | SPL: 53 | SUCCESS_DISTANCE: 3.0 54 | NDTW: 55 | SUCCESS_DISTANCE: 3.0 56 | GT_PATH: data/datasets/R2R_VLNCE_v1-2_preprocessed/{split}/{split}_gt.json.gz 57 | SDTW: 58 | SUCCESS_DISTANCE: 3.0 59 | GT_PATH: data/datasets/R2R_VLNCE_v1-2_preprocessed/{split}/{split}_gt.json.gz 60 | ORACLE_SUCCESS: 61 | SUCCESS_DISTANCE: 3.0 62 | DATASET: 63 | TYPE: VLN-CE-v1 64 | SPLIT: joint_train_envdrop 65 | DATA_PATH: data/datasets/R2R_VLNCE_v1-2_preprocessed/{split}/{split}.json.gz 66 | SCENES_DIR: data/scene_datasets/ 67 | -------------------------------------------------------------------------------- /vlnce_baselines/common/distributions.py: -------------------------------------------------------------------------------- 1 | # The following code is largely borrowed from: 2 | # https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/distributions.py 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | """ 8 | Modify standard PyTorch distributions so they are compatible with this code. 
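Both FixedCategorical and FixedNormal are patched with `log_probs` and `mode`
helpers so that discrete and continuous action heads expose a uniform interface
to the policy.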
9 | """ 10 | 11 | FixedCategorical = torch.distributions.Categorical 12 | 13 | old_sample = FixedCategorical.sample 14 | FixedCategorical.sample = lambda self: old_sample(self) 15 | 16 | log_prob_cat = FixedCategorical.log_prob 17 | FixedCategorical.log_probs = lambda self, actions: \ 18 | log_prob_cat(self, actions.squeeze(-1)) 19 | FixedCategorical.mode = lambda self: self.probs.argmax(dim=1, keepdim=True) 20 | 21 | FixedNormal = torch.distributions.Normal 22 | log_prob_normal = FixedNormal.log_prob 23 | FixedNormal.log_probs = lambda self, actions: \ 24 | log_prob_normal(self, actions).sum(-1, keepdim=False) 25 | 26 | entropy = FixedNormal.entropy 27 | FixedNormal.entropy = lambda self: entropy(self).sum(-1) 28 | 29 | FixedNormal.mode = lambda self: self.mean 30 | 31 | 32 | class Categorical(nn.Module): 33 | def __init__(self, num_inputs, num_outputs): 34 | super(Categorical, self).__init__() 35 | self.linear = nn.Linear(num_inputs, num_outputs) 36 | 37 | def forward(self, x): 38 | x = self.linear(x) 39 | return FixedCategorical(logits=x) 40 | 41 | 42 | class DiagGaussian(nn.Module): 43 | def __init__(self, num_inputs, num_outputs): 44 | super(DiagGaussian, self).__init__() 45 | 46 | self.fc_mean = nn.Linear(num_inputs, num_outputs) 47 | self.logstd = AddBias(torch.zeros(num_outputs)) 48 | 49 | def forward(self, x): 50 | action_mean = self.fc_mean(x) 51 | 52 | zeros = torch.zeros(action_mean.size()) 53 | if x.is_cuda: 54 | zeros = zeros.cuda(x.device) 55 | 56 | action_logstd = self.logstd(zeros) 57 | return FixedNormal(action_mean, action_logstd.exp()) 58 | 59 | 60 | class AddBias(nn.Module): 61 | def __init__(self, bias): 62 | super(AddBias, self).__init__() 63 | self._bias = nn.Parameter(bias.unsqueeze(1)) 64 | 65 | def forward(self, x): 66 | if x.dim() == 2: 67 | bias = self._bias.t().view(1, -1) 68 | else: 69 | bias = self._bias.t().view(1, -1, 1, 1) 70 | 71 | return x + bias 72 | -------------------------------------------------------------------------------- /SETUP.md: -------------------------------------------------------------------------------- 1 | # Setup 2 | 3 | ## clone this code 4 | ```bash 5 | git clone https://github.com/PeihaoChen/WS-MGMap.git 6 | cd WS-MGMap 7 | ``` 8 | 9 | ## Python 10 | This project is developed with Python 3.6.13. If you are using miniconda or anaconda, you can create an environment: 11 | 12 | ```bash 13 | conda create -n wsmgmap python==3.6.13 14 | conda activate wsmgmap 15 | ``` 16 | 17 | ## Pytorch 18 | VLN-CE uses Pytorch 1.6.0 & Cuda 10.2 which can be built installed from conda: 19 | 20 | ```bash 21 | conda install pytorch==1.6.0 torchvision==0.7.0 cudatoolkit=10.2 -c pytorch 22 | ``` 23 | 24 | ## Habitat 25 | VLN-CE uses Habitat-Sim 0.1.5 which can be built from source or installed from conda: 26 | 27 | ```bash 28 | conda install -y -c aihabitat -c conda-forge bullet=2.88 habitat-sim=0.1.5 headless withbullet python=3.6 29 | ``` 30 | Tips: You'd better to install bullet and withbulllet simultaneously, in order to avoid ImportError at run time. 
31 | 32 | Then install Habitat-Lab: 33 | 34 | ```bash 35 | git clone --branch v0.1.5 https://github.com/facebookresearch/habitat-lab.git 36 | cd habitat-lab 37 | # installs both habitat and habitat_baselines 38 | pip install --upgrade pip # update pip 39 | python -m pip install -r requirements.txt 40 | 41 | python -m pip install -r habitat_baselines/rl/requirements.txt 42 | python -m pip install -r habitat_baselines/rl/ddppo/requirements.txt 43 | python setup.py develop --all 44 | ``` 45 | 46 | ## WS-MGMap for VLN 47 | ```bash 48 | cd .. 49 | pip install -r requirements.txt 50 | 51 | # requirements 52 | conda install psutil 53 | pip install einops 54 | 55 | # torch_scatter 56 | cd data 57 | wget https://data.pyg.org/whl/torch-1.6.0%2Bcu102/torch_scatter-2.0.6-cp36-cp36m-linux_x86_64.whl 58 | pip install torch_scatter-2.0.6-cp36-cp36m-linux_x86_64.whl 59 | cd .. 60 | ``` 61 | 62 | # Data 63 | ```bash 64 | # Fisrt install the gdown to download data in google drive. 65 | pip install gdown 66 | 67 | mkdir data 68 | cd data 69 | ``` 70 | 71 | ## Semantic Map 72 | ```bash 73 | # Download map_data.tar.gz 74 | gdown https://drive.google.com/uc?id=1pJwx0E95WsJXThcx8tPrUTB_6gTlryoy 75 | tar -xvf map_data.tar.gz 76 | 77 | # Unzip all train files 78 | cd map_data/semantic/train 79 | find . -name '*.tar.gz' -print0 | xargs -0 -I {} -P 10 tar -zvxf {} 80 | 81 | # Unzip all train_aug files 82 | cd ../train_aug 83 | find . -name '*.tar.gz' -print0 | xargs -0 -I {} -P 10 tar -zvxf {} 84 | ``` 85 | 86 | ## Pre-Trained Model 87 | ```bash 88 | gdown https://drive.google.com/uc?id=1DYkXbRIBVgMU1qHF_mLT41esSAdcQJaf 89 | tar -zxvf pretrain_model.tar.gz 90 | ``` 91 | 92 | ## Trained model 93 | ```bash 94 | gdown https://drive.google.com/uc?id=1HcD8s-tyBeH2LsXs6Rj5x5DC1hVD4GNs 95 | tar -zxvf trained_model.tar.gz 96 | ``` 97 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | result 2 | scripts/ 3 | result1 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | pip-wheel-metadata/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # VSCode 133 | .vscode 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | 138 | # exclude data from source control by default 139 | data 140 | 141 | # Generated videos 142 | videos 143 | 144 | # database files 145 | *.lmdb 146 | 147 | # logging 148 | .log 149 | 150 | # evaluation results 151 | stats_*.json 152 | 153 | # Other 154 | habitat-lab 155 | temp 156 | -------------------------------------------------------------------------------- /vlnce_baselines/common/env_utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import gym 4 | from typing import Type, Union 5 | 6 | import habitat 7 | from habitat import Config, Env, RLEnv, VectorEnv, make_dataset 8 | from habitat_baselines.common.env_utils import make_env_fn 9 | 10 | 11 | def construct_envs( 12 | config: Config, env_class: Type[Union[Env, RLEnv]], auto_reset_done: bool = True 13 | ) -> VectorEnv: 14 | r"""Create VectorEnv object with specified config and env class type. 15 | To allow better performance, dataset are split into small ones for 16 | each individual env, grouped by scenes. 17 | 18 | Args: 19 | config: configs that contain num_processes as well as information 20 | necessary to create individual environments. 21 | env_class: class type of the envs to be created. 22 | auto_reset_done: Whether or not to automatically reset the env on done 23 | 24 | Returns: 25 | VectorEnv object created according to specification. 
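Scenes are shuffled and assigned to the processes round-robin, and each
process pins its simulator to a GPU taken from SIMULATOR_GPU_IDS
(falling back to SIMULATOR_GPU_ID when that list is not set).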
26 | """ 27 | 28 | num_processes = config.NUM_PROCESSES 29 | configs = [] 30 | env_classes = [env_class for _ in range(num_processes)] 31 | dataset = make_dataset(config.TASK_CONFIG.DATASET.TYPE) 32 | scenes = dataset.get_scenes_to_load(config.TASK_CONFIG.DATASET) 33 | 34 | if num_processes > 1: 35 | if len(scenes) == 0: 36 | raise RuntimeError( 37 | "No scenes to load, multiple process logic relies on being able to split scenes uniquely between processes" 38 | ) 39 | 40 | if len(scenes) < num_processes: 41 | raise RuntimeError( 42 | "reduce the number of processes as there " 43 | "aren't enough number of scenes" 44 | ) 45 | 46 | random.shuffle(scenes) 47 | 48 | scene_splits = [[] for _ in range(num_processes)] 49 | for idx, scene in enumerate(scenes): 50 | scene_splits[idx % len(scene_splits)].append(scene) 51 | 52 | # assert sum(map(len, scene_splits)) == len(scenes) 53 | if config.SIMULATOR_GPU_IDS is None: 54 | devices = [config.SIMULATOR_GPU_ID] 55 | else: 56 | devices = config.SIMULATOR_GPU_IDS 57 | 58 | for i in range(num_processes): 59 | proc_config = config.clone() 60 | proc_config.defrost() 61 | 62 | task_config = proc_config.TASK_CONFIG 63 | if len(scenes) > 0: 64 | task_config.DATASET.CONTENT_SCENES = scene_splits[i] 65 | 66 | task_config.SIMULATOR.HABITAT_SIM_V0.GPU_DEVICE_ID = devices[i % len(devices)] 67 | 68 | task_config.SIMULATOR.AGENT_0.SENSORS = config.SENSORS 69 | 70 | proc_config.freeze() 71 | configs.append(proc_config) 72 | 73 | envs = habitat.VectorEnv( 74 | make_env_fn=make_env_fn, 75 | env_fn_args=tuple(tuple(zip(configs, env_classes, range(num_processes)))), 76 | auto_reset_done=auto_reset_done, 77 | ) 78 | 79 | action_space = gym.spaces.Box(low=0.0, high=0.99, shape=(2,), dtype=np.float32) 80 | envs.action_spaces = [action_space for _ in range(num_processes)] 81 | 82 | return envs 83 | 84 | 85 | def construct_envs_auto_reset_false( 86 | config: Config, env_class: Type[Union[Env, RLEnv]] 87 | ) -> VectorEnv: 88 | return construct_envs(config, env_class, auto_reset_done=False) 89 | -------------------------------------------------------------------------------- /vlnce_baselines/models/encoders/instruction_encoder.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from habitat import Config 8 | 9 | 10 | class InstructionEncoder(nn.Module): 11 | def __init__(self, config: Config): 12 | """An encoder that uses RNN to encode an instruction. Returns 13 | the final hidden state after processing the instruction sequence. 14 | Args: 15 | config: must have 16 | vocab_size: number of words in the vocabulary 17 | embedding_size: The dimension of each embedding vector 18 | use_pretrained_embeddings: 19 | embedding_file: 20 | fine_tune_embeddings: 21 | dataset_vocab: 22 | hidden_size: The hidden (output) size 23 | rnn_type: The RNN cell type. 
Must be GRU or LSTM 24 | final_state_only: Whether or not to return just the final state 25 | """ 26 | super().__init__() 27 | 28 | self.config = config 29 | 30 | if self.config.use_pretrained_embeddings: 31 | self.embedding_layer = nn.Embedding.from_pretrained( 32 | embeddings=self._load_embeddings(), 33 | freeze=not self.config.fine_tune_embeddings, 34 | ) 35 | else: # each embedding initialized to sampled Gaussian 36 | self.embedding_layer = nn.Embedding( 37 | num_embeddings=config.vocab_size, 38 | embedding_dim=config.embedding_size, 39 | padding_idx=0, 40 | ) 41 | 42 | rnn = nn.GRU if self.config.rnn_type == "GRU" else nn.LSTM 43 | self.bidir = config.bidirectional 44 | self.encoder_rnn = rnn( 45 | input_size=config.embedding_size, 46 | hidden_size=config.hidden_size, 47 | bidirectional=self.bidir, 48 | ) 49 | self.final_state_only = config.final_state_only 50 | 51 | @property 52 | def output_size(self): 53 | return self.config.hidden_size * (2 if self.bidir else 1) 54 | 55 | def _load_embeddings(self): 56 | """ Loads word embeddings from a pretrained embeddings file. 57 | PAD: index 0. [0.0, ... 0.0] 58 | UNK: index 1. mean of all R2R word embeddings: [mean_0, ..., mean_n] 59 | why UNK is averaged: 60 | https://groups.google.com/forum/#!searchin/globalvectors/unk|sort:date/globalvectors/9w8ZADXJclA/hRdn4prm-XUJ 61 | Returns: 62 | embeddings tensor of size [num_words x embedding_dim] 63 | """ 64 | with gzip.open(self.config.embedding_file, "rt") as f: 65 | embeddings = torch.tensor(json.load(f)) 66 | return embeddings 67 | 68 | def forward(self, observations): 69 | """ 70 | Tensor sizes after computation: 71 | instruction: [batch_size x seq_length] 72 | lengths: [batch_size] 73 | hidden_state: [batch_size x hidden_size] 74 | """ 75 | instruction = observations["instruction"].long() 76 | 77 | lengths = (instruction != 0.0).long().sum(dim=1) 78 | embedded = self.embedding_layer(instruction) 79 | 80 | packed_seq = nn.utils.rnn.pack_padded_sequence( 81 | embedded, lengths, batch_first=True, enforce_sorted=False 82 | ) 83 | 84 | output, final_state = self.encoder_rnn(packed_seq) 85 | 86 | if self.config.rnn_type == "LSTM": 87 | final_state = final_state[0] 88 | 89 | if self.final_state_only: 90 | return final_state.squeeze(0) 91 | else: 92 | hidden_states = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)[0].permute(0, 2, 1) 93 | return hidden_states, (hidden_states == 0.0).all(dim=1) 94 | -------------------------------------------------------------------------------- /vlnce_baselines/models/encoders/resnet_encoders.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import spaces 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from habitat_baselines.rl.ddppo.policy import resnet 8 | from habitat_baselines.rl.ddppo.policy.resnet_policy import ResNetEncoder 9 | from habitat_baselines.common.utils import Flatten 10 | 11 | 12 | class VlnResnetDepthEncoder(nn.Module): 13 | def __init__( 14 | self, 15 | observation_space, 16 | output_size=128, 17 | checkpoint="NONE", 18 | backbone="resnet50", 19 | resnet_baseplanes=32, 20 | normalize_visual_inputs=False, 21 | trainable=False, 22 | spatial_output: bool = False, 23 | ): 24 | super().__init__() 25 | self.visual_encoder = ResNetEncoder( 26 | spaces.Dict({"depth": observation_space.spaces["depth"]}), 27 | baseplanes=resnet_baseplanes, 28 | ngroups=resnet_baseplanes // 2, 29 | make_backbone=getattr(resnet, backbone), 30 | 
normalize_visual_inputs=normalize_visual_inputs, 31 | obs_transform=None, 32 | ) 33 | 34 | for param in self.visual_encoder.parameters(): 35 | param.requires_grad_(trainable) 36 | 37 | if checkpoint != "NONE": 38 | ddppo_weights = torch.load(checkpoint) 39 | 40 | weights_dict = {} 41 | for k, v in ddppo_weights["state_dict"].items(): 42 | split_layer_name = k.split(".")[2:] 43 | if split_layer_name[0] != "visual_encoder": 44 | continue 45 | 46 | layer_name = ".".join(split_layer_name[1:]) 47 | weights_dict[layer_name] = v 48 | 49 | del ddppo_weights 50 | self.visual_encoder.load_state_dict(weights_dict, strict=True) 51 | 52 | self.spatial_output = spatial_output 53 | 54 | if not self.spatial_output: 55 | self.output_shape = (output_size,) 56 | self.visual_fc = nn.Sequential( 57 | Flatten(), 58 | nn.Linear(np.prod(self.visual_encoder.output_shape), output_size), 59 | nn.ReLU(True), 60 | ) 61 | else: 62 | self.spatial_embeddings = nn.Embedding( 63 | self.visual_encoder.output_shape[1] 64 | * self.visual_encoder.output_shape[2], 65 | 64, 66 | ) 67 | 68 | self.output_shape = list(self.visual_encoder.output_shape) 69 | self.output_shape[0] += self.spatial_embeddings.embedding_dim 70 | self.output_shape = tuple(self.output_shape) 71 | 72 | def forward(self, observations): 73 | """ 74 | Args: 75 | observations: [BATCH, HEIGHT, WIDTH, CHANNEL] 76 | Returns: 77 | [BATCH, OUTPUT_SIZE] 78 | """ 79 | if "depth_features" in observations: 80 | x = observations["depth_features"] 81 | else: 82 | x = self.visual_encoder(observations) 83 | 84 | if self.spatial_output: 85 | b, c, h, w = x.size() 86 | 87 | spatial_features = ( 88 | self.spatial_embeddings( 89 | torch.arange( 90 | 0, 91 | self.spatial_embeddings.num_embeddings, 92 | device=x.device, 93 | dtype=torch.long, 94 | ) 95 | ) 96 | .view(1, -1, h, w) 97 | .expand(b, self.spatial_embeddings.embedding_dim, h, w) 98 | ) 99 | 100 | return torch.cat([x, spatial_features], dim=1) 101 | else: 102 | return self.visual_fc(x) 103 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import random 5 | import os 6 | import warnings 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | from typing import List 10 | plt.switch_backend('agg') 11 | os.environ['GLOG_minloglevel'] = '2' 12 | os.environ['MAGNUM_LOG'] = 'quiet' 13 | warnings.filterwarnings("ignore") 14 | 15 | import torch 16 | 17 | from habitat import logger 18 | from habitat_baselines.common.baseline_registry import baseline_registry 19 | 20 | from vlnce_baselines.config.default import get_config, refine_config, set_saveDir_GPUs 21 | from vlnce_baselines.common.utils import check_exist_file, save_sh_n_codes, save_config 22 | 23 | 24 | def main(): 25 | parser = argparse.ArgumentParser() 26 | 27 | parser.add_argument( 28 | "--run-type", 29 | choices=["train", "eval", "inference"], 30 | default="train", 31 | help="run type of the experiment (train, eval, inference)", 32 | ) 33 | parser.add_argument( 34 | "-c", "--exp-config", 35 | type=str, 36 | required=True, 37 | help="path to config yaml containing info about experiment", 38 | ) 39 | parser.add_argument( 40 | "-e", "--model-dir", 41 | default=None, 42 | help="path to save checkpoint, log and others", 43 | ) 44 | parser.add_argument( 45 | "--note", 46 | default='base', 47 | help="add extra note for running file", 48 | ) 49 | parser.add_argument( 50 | "-g", "--gpus", 51 | 
default=None, 52 | nargs="+", 53 | type=int, 54 | help="GPU id to run experiments", 55 | ) 56 | parser.add_argument( 57 | "opts", 58 | default=None, 59 | nargs=argparse.REMAINDER, 60 | help="Modify config options from command line", 61 | ) 62 | parser.add_argument( 63 | '--local_rank', 64 | default=-1, 65 | type=int, 66 | help='node rank for distributed training' 67 | ) 68 | 69 | args = parser.parse_args() 70 | run_exp(**vars(args)) 71 | 72 | 73 | def run_exp(exp_config: str, 74 | run_type: str, 75 | model_dir: str, 76 | note: str, 77 | gpus: List[int], 78 | opts=None, 79 | local_rank=-1) -> None: 80 | """Runs experiment given mode and config 81 | Args: 82 | exp_config: path to config file. 83 | run_type: "train" or "eval. 84 | model_dir: path to save. 85 | note: extra note. 86 | opts: list of strings of additional config options. 87 | Returns: 88 | None. 89 | 90 | """ 91 | config = get_config(exp_config, opts) 92 | config = set_saveDir_GPUs(config, run_type, model_dir, note, gpus, local_rank) 93 | config = refine_config(config, local_rank) 94 | if local_rank == 0: 95 | check_exist_file(config) 96 | save_sh_n_codes( 97 | config, 98 | run_type, 99 | ignore_dir=['habitat-lab', 'data', 'result', 'habitat-sim', 'temp'] 100 | ) 101 | save_config(config, run_type) 102 | logger.add_filehandler(config.LOG_FILE) 103 | 104 | random.seed(config.TASK_CONFIG.SEED) 105 | np.random.seed(config.TASK_CONFIG.SEED) 106 | torch.manual_seed(config.TASK_CONFIG.SEED) 107 | torch.backends.cudnn.benchmark = False 108 | torch.backends.cudnn.deterministic = True 109 | 110 | trainer_init = baseline_registry.get_trainer(config.TRAINER_NAME) 111 | assert trainer_init is not None, f"{config.TRAINER_NAME} is not supported" 112 | trainer = trainer_init(config) 113 | 114 | if run_type == "train": 115 | trainer.train() 116 | elif run_type == "eval": 117 | trainer.eval() 118 | elif run_type == "inference": 119 | trainer.inference() 120 | 121 | 122 | if __name__ == "__main__": 123 | main() 124 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # [NeurIPS 2022] WS-MGMap for VLN 2 | Official Pytorch implementation for NeurIPS 2022 paper "Weakly-Supervised Multi-Granularity Map Learning for Vision-and-Language Navigation” 3 | 4 | ![](img/framework.png) 5 | 6 | 7 | ## Setup 8 | This code is developed with Python 3.6, PyTorch 1.6.0. We follow [VLN-CE](https://github.com/jacobkrantz/VLN-CE) to install Habitat-Sim and Habitat-Lab. Then clone this repository and install requirements. (More details in SETUP.md) 9 | 10 | ```bash 11 | git clone https://github.com/PeihaoChen/WS-MGMap.git 12 | cd WS-MGMap 13 | pip install -r requirements.txt 14 | ``` 15 | 16 | 17 | ## Data 18 | 19 | ### Download Scenes and Episodes 20 | Follow the instructions in [VLN-CE](https://github.com/jacobkrantz/VLN-CE) to download Matterport3D scenes to `data/scene_datasets` folder and VLN-CE datasets to `data/datasets` folder and corresponding episodes data. 21 | 22 | ### Download Ground-Truth Semantic Map 23 | Download the cache ground-truth semantic map [here](https://drive.google.com/file/d/1pJwx0E95WsJXThcx8tPrUTB_6gTlryoy/view?usp=share_link) to `data/map_data` folder as the supervision for the semantic hallucination. 
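For example (mirroring the commands in SETUP.md), the archive can be fetched with `gdown` and unpacked in place; SETUP.md additionally shows how to untar the per-split `*.tar.gz` files inside `map_data/semantic/`:

```bash
pip install gdown
mkdir -p data && cd data
gdown https://drive.google.com/uc?id=1pJwx0E95WsJXThcx8tPrUTB_6gTlryoy
tar -xvf map_data.tar.gz
```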
24 | 25 | ### Download Pre-Trained Model 26 | The pre-trained semantic segmentation model for the semantic segmentation and DD-PPO model for the navigation control can be found [here](https://drive.google.com/file/d/1DYkXbRIBVgMU1qHF_mLT41esSAdcQJaf/view?usp=sharing). Download it to `data/pretrain_model` folder 27 | 28 | ### Data Format 29 | This code expects all data files in the following structure: 30 | 31 | ```graphql 32 | WS-MGMap 33 | ├─ data 34 | | ├─ datasets 35 | | | ├─ R2R_VLNCE_v1-2 36 | | | ├─ R2R_VLNCE_v1-2_preprocessed 37 | | ├─ map_data 38 | | | ├─ semantic 39 | | | | ├─ train 40 | | | | | ├─ ep_0.npy 41 | | | | | ├─ ... 42 | | | | ├─ train_aug 43 | | | | | ├─ ep_0.npy 44 | | | | | ├─ ... 45 | | ├─ pretrain_model 46 | | | ├─ ddppo-models 47 | | | | ├─ gibson-2plus-resnet50.pth 48 | | | ├─ unet-models 49 | | | | ├─ 2021_02_14-23_42_50.pt 50 | | ├─ scene_datasets 51 | | | ├─ mp3d 52 | | | | ├─ 1LXtFkjw3qL 53 | | | | ├─ ... 54 | ``` 55 | 56 | 57 | ## Usage 58 | 59 | ### Evaluation 60 | We provide our trained models [here](https://drive.google.com/file/d/1HcD8s-tyBeH2LsXs6Rj5x5DC1hVD4GNs/view?usp=share_link) for reproducing the results shown in the paper. 61 | Run the following to evaluate a trained model: 62 | 63 | ```bash 64 | export CUDA_VISIBLE_DEVICES=0 65 | python -m torch.distributed.launch --nproc_per_node=1 run.py \ 66 | --run-type eval \ 67 | -c vlnce_baselines/config/CMA_AUG_DA_TUNE.yaml \ 68 | -e $PATH_TO_SAVE_RESULT$ \ 69 | EVAL_CKPT_PATH_DIR $PATH_TO_TRAINED_MODEL$ \ 70 | NUM_PROCESSES 1 \ 71 | use_ddppo True 72 | ``` 73 | 74 | 75 | ### Training 76 | STAGE1: Run the following for teacher forcing training on augmented data: 77 | 78 | ```bash 79 | export CUDA_VISIBLE_DEVICES=0,1,2 80 | python -m torch.distributed.launch --nproc_per_node=3 run.py \ 81 | -c vlnce_baselines/config/CMA_AUG.yaml \ 82 | -e $PATH_TO_SAVE_RESULT$ \ 83 | NUM_PROCESSES 6 \ 84 | DAGGER.BATCH_SIZE 8 85 | ``` 86 | 87 | STAGE2: Run the following for dagger training to fine-tune the model: 88 | 89 | ```bash 90 | export CUDA_VISIBLE_DEVICES=0,1,2 91 | python -m torch.distributed.launch --nproc_per_node=3 run.py \ 92 | -c vlnce_baselines/config/CMA_AUG_DA_TUNE.yaml \ 93 | -e $PATH_TO_SAVE_RESULT$ \ 94 | NUM_PROCESSES 5 \ 95 | DAGGER.BATCH_SIZE 8 \ 96 | DAGGER.CKPT_TO_LOAD $PATH_TO_MODEL_FROM_STAGE1$ 97 | ``` 98 | 99 | 100 | ## Citation 101 | If you use or discuss WS-MGMap in your research, please consider citing the paper as follows 102 | ``` 103 | @article{chen2022weakly, 104 | title={Weakly-supervised multi-granularity map learning for vision-and-language navigation}, 105 | author={Chen, Peihao and Ji, Dongyu and Lin, Kunyang and Zeng, Runhao and Li, Thomas H and Tan, Mingkui and Gan, Chuang}, 106 | journal={arXiv preprint arXiv:2210.07506}, 107 | year={2022} 108 | } 109 | ``` 110 | -------------------------------------------------------------------------------- /vlnce_baselines/models/encoders/unet_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torchvision import models 4 | 5 | 6 | def convrelu(in_channels, out_channels, kernel, padding): 7 | return nn.Sequential( 8 | nn.Conv2d(in_channels, out_channels, kernel, padding=padding), 9 | nn.BatchNorm2d(num_features=out_channels), 10 | nn.ReLU(inplace=True), 11 | ) 12 | 13 | 14 | class UNet(nn.Module): 15 | def __init__(self, model_config): 16 | super().__init__() 17 | self.base_model = ResNetUNet(3, 27) 18 | 19 | state = 
torch.load(model_config.RGB_ENCODER.pretrain_model, map_location='cpu') 20 | model_state = state['models']['img_segm_model'] 21 | new_model_state = {'.'.join(k.split('.')[2:]):v for k,v in model_state.items()} 22 | self.base_model.load_state_dict(new_model_state) 23 | 24 | self.output_shape = self.base_model.output_shape 25 | 26 | def forward(self, observations): 27 | return self.base_model(observations) 28 | 29 | 30 | class ResNetUNet(nn.Module): 31 | def __init__(self, n_channel_in, n_class_out): 32 | super().__init__() 33 | 34 | self.base_model = models.resnet18(pretrained=True) 35 | self.base_model.conv1 = nn.Conv2d(n_channel_in, 64, kernel_size=7, stride=2, padding=3,bias=False) 36 | self.base_layers = list(self.base_model.children()) 37 | 38 | self.layer0 = nn.Sequential(*self.base_layers[:3]) # size=(N, 64, x.H/2, x.W/2) 39 | self.layer0_1x1 = convrelu(64, 64, 1, 0) 40 | self.layer1 = nn.Sequential(*self.base_layers[3:5]) # size=(N, 64, x.H/4, x.W/4) 41 | self.layer1_1x1 = convrelu(64, 64, 1, 0) 42 | self.layer2 = self.base_layers[5] # size=(N, 128, x.H/8, x.W/8) 43 | self.layer2_1x1 = convrelu(128, 128, 1, 0) 44 | self.layer3 = self.base_layers[6] # size=(N, 256, x.H/16, x.W/16) 45 | self.layer3_1x1 = convrelu(256, 256, 1, 0) 46 | self.layer4 = self.base_layers[7] # size=(N, 512, x.H/32, x.W/32) 47 | self.layer4_1x1 = convrelu(512, 512, 1, 0) 48 | 49 | self.upsample = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True) 50 | 51 | self.conv_up3 = convrelu(256 + 512, 512, 3, 1) 52 | self.conv_up2 = convrelu(128 + 512, 256, 3, 1) 53 | self.conv_up1 = convrelu(64 + 256, 256, 3, 1) 54 | self.conv_up0 = convrelu(64 + 256, 128, 3, 1) 55 | 56 | self.conv_original_size0 = convrelu(n_channel_in, 64, 3, 1) 57 | self.conv_original_size1 = convrelu(64, 64, 3, 1) 58 | self.conv_original_size2 = convrelu(64 + 128, 64, 3, 1) 59 | 60 | self.conv_last = nn.Conv2d(64, n_class_out, 1) 61 | 62 | self.output_shape = [512, 7, 7] 63 | 64 | def forward(self, observations): 65 | if 'rgb_features' in observations: 66 | return observations['rgb_features'], None 67 | else: 68 | input = observations['rgb'].permute(0, 3, 1, 2) 69 | B, C, cH, cW = input.shape 70 | input = input.view(B, C, cH, cW) 71 | 72 | x_original = self.conv_original_size0(input) 73 | x_original = self.conv_original_size1(x_original) 74 | 75 | layer0 = self.layer0(input) 76 | layer1 = self.layer1(layer0) 77 | layer2 = self.layer2(layer1) 78 | layer3 = self.layer3(layer2) 79 | layer4 = self.layer4(layer3) 80 | 81 | layer4 = self.layer4_1x1(layer4) 82 | x = self.upsample(layer4) 83 | 84 | layer3 = self.layer3_1x1(layer3) 85 | x = torch.cat([x, layer3], dim=1) 86 | x = self.conv_up3(x) 87 | 88 | x = self.upsample(x) 89 | layer2 = self.layer2_1x1(layer2) 90 | x = torch.cat([x, layer2], dim=1) 91 | x = self.conv_up2(x) 92 | 93 | x = self.upsample(x) 94 | layer1 = self.layer1_1x1(layer1) 95 | x = torch.cat([x, layer1], dim=1) 96 | x = self.conv_up1(x) 97 | 98 | x = self.upsample(x) 99 | layer0 = self.layer0_1x1(layer0) 100 | x = torch.cat([x, layer0], dim=1) 101 | x = self.conv_up0(x) 102 | 103 | x = self.upsample(x) 104 | x = torch.cat([x, x_original], dim=1) 105 | x = self.conv_original_size2(x) 106 | 107 | out = self.conv_last(x) 108 | 109 | proj_feat = x 110 | 111 | return layer4, proj_feat 112 | -------------------------------------------------------------------------------- /vlnce_baselines/common/environments.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from 
typing import Optional 3 | 4 | import habitat 5 | from habitat import Config, Dataset 6 | from habitat.tasks.utils import cartesian_to_polar 7 | from habitat.utils.geometry_utils import quaternion_rotate_vector 8 | from habitat_baselines.common.baseline_registry import baseline_registry 9 | from habitat_extensions.shortest_path_follower import ShortestPathFollowerCompat 10 | 11 | from vlnce_baselines.common.action_maker import GTMapActionMaker, DDPPOActionMaker 12 | 13 | 14 | @baseline_registry.register_env(name="VLNCEDaggerEnv") 15 | class VLNCEDaggerEnv(habitat.RLEnv): 16 | def __init__(self, config: Config, dataset: Optional[Dataset] = None): 17 | super().__init__(config.TASK_CONFIG, dataset) 18 | self.config = config 19 | self.device = self._env._config.SIMULATOR.HABITAT_SIM_V0.GPU_DEVICE_ID 20 | self._success_distance = config.TASK_CONFIG.TASK.SUCCESS_DISTANCE 21 | 22 | self.follower = ShortestPathFollowerCompat(self._env.sim, 0.5, return_one_hot=False) 23 | self.follower.mode = 'geodesic_path' 24 | self.steppppp = 0 25 | 26 | if self.config.use_ddppo: 27 | self.ddppo_action_maker = DDPPOActionMaker(config, self._env) 28 | else: 29 | self.gt_map_action_maker = GTMapActionMaker(config) 30 | 31 | def reset(self): 32 | observation = super(VLNCEDaggerEnv, self).reset() 33 | return observation 34 | 35 | def step(self, action, prog, epidsode_reset_flag=None, depth_img=None): 36 | if self.config.use_ddppo and epidsode_reset_flag is True: 37 | self.ddppo_action_maker.l_policy.reset() 38 | self.ddppo_action_maker.sg_reset() 39 | self.steppppp = 0 40 | 41 | agent_state = self._env._sim.get_agent_state() 42 | if self.config.use_ddppo: 43 | self.waypoint = self.ddppo_action_maker.preprocess(action, agent_state) 44 | action_choice = self.ddppo_action_maker.action_decision(self.steppppp, self.waypoint, depth_img) 45 | else: 46 | self.waypoint = self.gt_map_action_maker.preprocess(action, agent_state) 47 | action_choice = self.gt_map_action_maker.action_decision(self.waypoint, self.follower) 48 | 49 | stop = self.decide_stop(prog) 50 | if stop: 51 | action_choice = 0 52 | 53 | if self._env._elapsed_steps < 24: 54 | action_choice = 2 55 | 56 | observation, reward, done, info = self.step_bak(action_choice) 57 | 58 | self.steppppp += 1 59 | 60 | return observation, reward, done, info 61 | 62 | def step_bak(self, action): 63 | observations, reward, done, info = super().step(action) 64 | return observations, reward, done, info 65 | 66 | def decide_stop(self, prog): 67 | if prog == -1 and self._distance_waypoint(self._env.current_episode.goals[0].position) < 0.5: 68 | return True 69 | elif prog > self.config.STOP_CONDITION.PROG_THRESHOLD: 70 | return True 71 | return False 72 | 73 | def _distance_waypoint(self, waypoint): 74 | agent_position = self._env._sim.get_agent_state().position 75 | return self._env.sim.geodesic_distance(waypoint, agent_position) 76 | 77 | def get_reward_range(self): 78 | return (0.0, 0.0) 79 | 80 | def get_reward(self, observations): 81 | return 0.0 82 | 83 | def get_done(self, observations): 84 | return self._env.episode_over 85 | 86 | def get_info(self, observations): 87 | return self.habitat_env.get_metrics() 88 | 89 | 90 | @baseline_registry.register_env(name="VLNCEInferenceEnv") 91 | class VLNCEInferenceEnv(VLNCEDaggerEnv): 92 | def __init__(self, config: Config, dataset: Optional[Dataset] = None): 93 | super().__init__(config, dataset) 94 | 95 | def get_reward_range(self): 96 | return (0.0, 0.0) 97 | 98 | def get_reward(self, observations): 99 | return 0.0 100 | 101 | def 
get_done(self, observations): 102 | return self._env.episode_over 103 | 104 | def get_info(self, observations): 105 | agent_state = self._env.sim.get_agent_state() 106 | heading_vector = quaternion_rotate_vector( 107 | agent_state.rotation.inverse(), np.array([0, 0, -1]) 108 | ) 109 | heading = cartesian_to_polar(-heading_vector[2], heading_vector[0])[1] 110 | return { 111 | "position": agent_state.position.tolist(), 112 | "heading": heading, 113 | "stop": self._env.task.is_stop_called, 114 | } 115 | -------------------------------------------------------------------------------- /vlnce_baselines/models/encoders/map_encoder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torchvision.models as models 6 | 7 | 8 | def convrelu(in_channels, out_channels, kernel, padding): 9 | return nn.Sequential( 10 | nn.Conv2d(in_channels, out_channels, kernel, padding=padding), 11 | nn.BatchNorm2d(num_features=out_channels), 12 | nn.ReLU(inplace=True), 13 | ) 14 | 15 | 16 | class MapEncoder(nn.Module): 17 | def __init__(self, map_size, input_channel, output_channel): 18 | super().__init__() 19 | self.cnn = nn.Sequential( 20 | nn.Conv2d(input_channel, 64, 8, stride=2, padding=3), # 100 -> 50 21 | nn.BatchNorm2d(num_features=64), 22 | nn.ReLU(inplace=True), 23 | nn.Conv2d(64, 128, 5, stride=2, padding=1), 24 | nn.BatchNorm2d(num_features=128), 25 | nn.ReLU(inplace=True), 26 | nn.Conv2d(128, output_channel, 3, stride=1, padding=1), 27 | nn.BatchNorm2d(num_features=output_channel), 28 | nn.ReLU(inplace=True), 29 | ) 30 | 31 | cnn_dims = np.array([map_size, map_size], dtype=np.float32) 32 | self._cnn_layers_kernel = [(8, 8), (5, 5), (3, 3)] 33 | self._cnn_layers_stride = [(2, 2), (2, 2), (1, 1)] 34 | self._cnn_layers_padding = [(3, 3), (1, 1), (1, 1)] 35 | for kernel, stride, padding in zip(self._cnn_layers_kernel, self._cnn_layers_stride, self._cnn_layers_padding): 36 | cnn_dims = self._conv_output_dim( 37 | dimension=cnn_dims, 38 | padding=np.array(padding, dtype=np.float32), 39 | dilation=np.array([1, 1], dtype=np.float32), 40 | kernel=np.array(kernel, dtype=np.float32), 41 | stride=np.array(stride, dtype=np.float32), 42 | ) 43 | 44 | self.output_shape = [output_channel, cnn_dims[0], cnn_dims[1]] 45 | 46 | def _conv_output_dim(self, dimension, padding, dilation, kernel, stride): 47 | assert len(dimension) == 2 48 | out_dimension = [] 49 | for i in range(len(dimension)): 50 | out_dimension.append( 51 | int( 52 | np.floor( 53 | ( 54 | ( 55 | dimension[i] 56 | + 2 * padding[i] 57 | - dilation[i] * (kernel[i] - 1) 58 | - 1 59 | ) 60 | / stride[i] 61 | ) 62 | + 1 63 | ) 64 | ) 65 | ) 66 | return tuple(out_dimension) 67 | 68 | def forward(self, rgb_map): 69 | return self.cnn(rgb_map) 70 | 71 | 72 | class MapDecoder(nn.Module): 73 | def __init__(self, n_channel_in): 74 | super().__init__() 75 | self.base_model = models.resnet18(pretrained=True) 76 | self.base_model.conv1 = nn.Conv2d(n_channel_in, 64, kernel_size=7, stride=2, padding=3, bias=False) 77 | self.base_layers = list(self.base_model.children()) 78 | 79 | self.layer0 = nn.Sequential(*self.base_layers[:3]) # size=(N, 64, x.H/2, x.W/2) 80 | self.layer0_1x1 = convrelu(64, 64, 1, 0) 81 | self.layer1 = nn.Sequential(*self.base_layers[3:5]) # size=(N, 64, x.H/4, x.W/4) 82 | self.layer1_1x1 = convrelu(64, 64, 1, 0) 83 | 84 | self.upsample = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True) 85 | 86 | self.conv_up0 = convrelu(64 + 64, 
128, 3, 1) 87 | 88 | self.conv_original_size0 = convrelu(n_channel_in, 64, 3, 1) 89 | self.conv_original_size1 = convrelu(64, 64, 3, 1) 90 | self.conv_original_size2 = convrelu(64 + 128, 64, 3, 1) 91 | 92 | self.output_shape = [64, 100, 100] 93 | 94 | def forward(self, input): 95 | x_original = self.conv_original_size0(input) 96 | x_original = self.conv_original_size1(x_original) 97 | 98 | layer0 = self.layer0(input) 99 | layer1 = self.layer1(layer0) 100 | 101 | layer1 = self.layer1_1x1(layer1) 102 | x = self.upsample(layer1) 103 | 104 | layer0 = self.layer0_1x1(layer0) 105 | x = torch.cat([x, layer0], dim=1) 106 | x = self.conv_up0(x) 107 | 108 | x = self.upsample(x) 109 | x = torch.cat([x, x_original], dim=1) 110 | x = self.conv_original_size2(x) 111 | 112 | return x 113 | -------------------------------------------------------------------------------- /vlnce_baselines/models/policy.py: -------------------------------------------------------------------------------- 1 | from gym import Space 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | from habitat import Config 8 | from habitat_baselines.rl.ppo.policy import CriticHead 9 | 10 | from vlnce_baselines.models.mg_map_policy import MGMapNet 11 | from vlnce_baselines.common.distributions import DiagGaussian 12 | from vlnce_baselines.common.aux_losses import AuxLosses 13 | 14 | 15 | class BasePolicy(nn.Module): 16 | def __init__(self, observation_space: Space, action_space: Space, model_config: Config): 17 | super(BasePolicy, self).__init__() 18 | self.model_config = model_config 19 | 20 | # Forward Network 21 | self.net = MGMapNet(observation_space, model_config) 22 | 23 | # Actor_critic Network 24 | self.action_distribution = DiagGaussian(self.net.output_size, action_space.shape[0]) 25 | self.critic = CriticHead(self.net.output_size) 26 | 27 | # Aux Network 28 | self.prog_pred = nn.Linear(model_config.STATE_ENCODER.hidden_size, 1) 29 | 30 | def update_map(self, observations, masks): 31 | _, rgb_embedding_proj = self.net.rgb_encoder(observations) 32 | self.net.rgb_mapping_module(rgb_embedding_proj, observations, masks) 33 | 34 | def act( 35 | self, 36 | observations, 37 | rnn_hidden_states, 38 | prev_actions, 39 | masks, 40 | deterministic=False, 41 | ): 42 | features, rnn_hidden_states, pred_map = self.net( 43 | observations, rnn_hidden_states, prev_actions, masks 44 | ) 45 | self.aux_prediction(features, observations, pred_map) 46 | distribution = self.action_distribution(features) 47 | value = self.critic(features) 48 | 49 | if deterministic: 50 | action = distribution.mode() 51 | else: 52 | action = distribution.sample() 53 | 54 | action_log_probs = distribution.log_probs(action) 55 | 56 | return value, action, action_log_probs, rnn_hidden_states 57 | 58 | def aux_prediction(self, features, observations, pred_map): 59 | self.prog = torch.tanh(self.prog_pred(features)) 60 | 61 | # Calculate loss 62 | if AuxLosses.is_active(): 63 | if self.model_config.PREDICTION_MONITOR.use: 64 | target_map = torch.nn.functional.interpolate(observations['gt_semantic_map'].unsqueeze(1), size=(48, 48)).squeeze().long() 65 | prediction_loss = F.cross_entropy(pred_map, target_map, reduction='none') 66 | prediction_loss = prediction_loss.mean([1,2]) 67 | AuxLosses.register_loss('prediction_monitor', prediction_loss, self.model_config.PREDICTION_MONITOR.alpha) 68 | 69 | if self.model_config.CONTRASTIVE_MONITOR.use: 70 | feature_size = self.net.map_encoder.output_shape[-1] 71 | 72 | if 'gt_path' in observations.keys(): 
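# Prefer the ground-truth path map as the distance map when the sensor provides it;
# otherwise fall back to the waypoint-distribution observation.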
73 | dis_map = observations['gt_path'] 74 | else: 75 | dis_map = observations['waypoint_distribution'] 76 | target = (dis_map.max() - dis_map) / (dis_map.max() - dis_map.min()) 77 | target = F.interpolate(target.unsqueeze(1), size=[feature_size, feature_size], mode='area').squeeze(1) 78 | target = target.reshape(target.shape[0], -1) 79 | target = F.softmax(target/self.model_config.CONTRASTIVE_MONITOR.target_tau, dim=1) 80 | pred = self.net.att_map_t_m 81 | 82 | kl_loss = F.kl_div(torch.log(pred), target, reduction='none') 83 | kl_loss = kl_loss.mean(-1) 84 | AuxLosses.register_loss('contrastive_monitor', kl_loss, self.model_config.CONTRASTIVE_MONITOR.alpha) 85 | 86 | if self.model_config.PROGRESS_MONITOR.use: 87 | progress_loss = F.mse_loss(self.prog, observations['progress'], reduction='none') 88 | progress_loss = progress_loss.mean(-1) 89 | AuxLosses.register_loss('progress_monitor', progress_loss, self.model_config.PROGRESS_MONITOR.alpha) 90 | 91 | def forward(self, observations, rnn_hidden_states, prev_actions, masks, weights): 92 | features, rnn_hidden_states, pred_map = self.net( 93 | observations, rnn_hidden_states, prev_actions, masks 94 | ) 95 | 96 | distribution = self.action_distribution(features) 97 | pred = distribution.mean 98 | 99 | self.aux_prediction(features, observations, pred_map) 100 | aux_mask = (weights > 0).view(-1) 101 | aux_loss = AuxLosses.reduce(aux_mask) 102 | 103 | return pred, aux_loss 104 | -------------------------------------------------------------------------------- /habitat_extensions/task.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | import os 4 | from typing import List, Optional 5 | import numpy as np 6 | 7 | import attr 8 | from habitat.config import Config 9 | from habitat.core.dataset import Dataset 10 | from habitat.core.registry import registry 11 | from habitat.datasets.pointnav.pointnav_dataset import ALL_SCENES_MASK 12 | from habitat.datasets.utils import VocabDict 13 | from habitat.tasks.nav.nav import NavigationGoal 14 | from habitat.tasks.vln.vln import InstructionData, VLNEpisode 15 | 16 | DEFAULT_SCENE_PATH_PREFIX = "data/scene_datasets/" 17 | 18 | 19 | @attr.s(auto_attribs=True, kw_only=True) 20 | class VLNExtendedEpisode(VLNEpisode): 21 | r""" 22 | instruction_index_string: optional identifier of instruction. 23 | """ 24 | instruction_index_string: Optional[str] = attr.ib(default=None) 25 | goals: Optional[List[NavigationGoal]] = attr.ib(default=None) 26 | reference_path: Optional[List[List[float]]] = attr.ib(default=None) 27 | 28 | 29 | @registry.register_dataset(name="VLN-CE-v1") 30 | class VLNCEDatasetV1(Dataset): 31 | r"""Class inherited from Dataset that loads a Vision and Language 32 | Navigation dataset. 33 | """ 34 | 35 | episodes: List[VLNEpisode] 36 | instruction_vocab: VocabDict 37 | 38 | @staticmethod 39 | def check_config_paths_exist(config: Config) -> bool: 40 | return os.path.exists( 41 | config.DATA_PATH.format(split=config.SPLIT) 42 | ) and os.path.exists(config.SCENES_DIR) 43 | 44 | @staticmethod 45 | def _scene_from_episode(episode: VLNExtendedEpisode) -> str: 46 | r"""Helper method to get the scene name from an episode. Assumes 47 | the scene_id is formated /path/to/. 
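The scene name is recovered as the file basename with its extension stripped.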
48 | """ 49 | return os.path.splitext(os.path.basename(episode.scene_id))[0] 50 | 51 | @classmethod 52 | def get_scenes_to_load(cls, config: Config) -> List[str]: 53 | r"""Return a sorted list of scenes 54 | """ 55 | assert cls.check_config_paths_exist(config) 56 | dataset = cls(config) 57 | scenes = {cls._scene_from_episode(episode) for episode in dataset.episodes} 58 | 59 | return sorted(list(scenes)) 60 | 61 | def _split_dataset(self, config): 62 | all_scene = [] 63 | for ep in self.episodes: 64 | if ep.scene_id not in all_scene: 65 | all_scene.append(ep.scene_id) 66 | 67 | data_dict = [[] for _ in range(len(all_scene))] 68 | for ep in self.episodes: 69 | data_dict[all_scene.index(ep.scene_id)].append(ep) 70 | 71 | split_episode = [] 72 | for scene in range(len(all_scene)): 73 | if len(data_dict[scene]) < 4: 74 | continue 75 | split_num = int(np.floor(len(data_dict[scene]) / config.split_num)) 76 | split_scene = [data_dict[scene][i: i+split_num] for i in range(0, len(data_dict[scene]), split_num)] 77 | if len(split_scene) > config.split_num: 78 | split_scene[-2].extend(split_scene[-1]) 79 | del split_scene[-1] 80 | split_episode.extend(split_scene[config.split_rank]) 81 | 82 | return split_episode 83 | 84 | def __init__(self, config: Optional[Config] = None) -> None: 85 | self.episodes = [] 86 | 87 | if config is None: 88 | return 89 | 90 | dataset_filename = config.DATA_PATH.format(split=config.SPLIT) 91 | with gzip.open(dataset_filename, "rt") as f: 92 | self.from_json(f.read(), scenes_dir=config.SCENES_DIR) 93 | 94 | if config.split_num > 1: 95 | self.episodes = self._split_dataset(config) 96 | 97 | if ALL_SCENES_MASK not in config.CONTENT_SCENES: 98 | scenes_to_load = set(config.CONTENT_SCENES) 99 | self.episodes = [ 100 | episode 101 | for episode in self.episodes 102 | if self._scene_from_episode(episode) in scenes_to_load 103 | ] 104 | 105 | def from_json(self, json_str: str, scenes_dir: Optional[str] = None) -> None: 106 | 107 | deserialized = json.loads(json_str) 108 | self.instruction_vocab = VocabDict( 109 | word_list=deserialized["instruction_vocab"]["word_list"] 110 | ) 111 | 112 | for episode in deserialized["episodes"]: 113 | episode = VLNExtendedEpisode(**episode) 114 | 115 | if scenes_dir is not None: 116 | if episode.scene_id.startswith(DEFAULT_SCENE_PATH_PREFIX): 117 | episode.scene_id = episode.scene_id[ 118 | len(DEFAULT_SCENE_PATH_PREFIX) : 119 | ] 120 | 121 | episode.scene_id = os.path.join(scenes_dir, episode.scene_id) 122 | 123 | episode.instruction = InstructionData(**episode.instruction) 124 | if episode.goals is not None: 125 | for g_index, goal in enumerate(episode.goals): 126 | episode.goals[g_index] = NavigationGoal(**goal) 127 | self.episodes.append(episode) 128 | -------------------------------------------------------------------------------- /vlnce_baselines/common/action_maker.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from habitat.utils.visualizations import maps 4 | 5 | from vlnce_baselines.common.utils import TransfomationRealworldAgent 6 | from vlnce_baselines.models.ddppo_policy import DdppoPolicy, SemanticGrid, utils 7 | 8 | 9 | class ActionMaker(): 10 | def __init__(self, config) -> None: 11 | self.config = config 12 | self.ego_map_size = config.ego_map_size 13 | self.map_range_max = maps.COORDINATE_MAX 14 | self.map_range_min = maps.COORDINATE_MIN 15 | self.map_size = 1250 16 | 17 | def preprocess(self, action, agent_state): 18 | resolution = (self.map_range_max - 
self.map_range_min) / self.map_size 19 | tran_real_to_agent = TransfomationRealworldAgent(agent_state) 20 | 21 | waypoint_norm = torch.tanh(action) 22 | waypoint_a = torch.zeros([3]) 23 | waypoint_a[0] = waypoint_norm[0] * (self.ego_map_size / 2) * resolution 24 | waypoint_a[2] = -waypoint_norm[1] * (self.ego_map_size / 2) * resolution 25 | 26 | waypoint_w = tran_real_to_agent.agent2realworld(waypoint_a) 27 | 28 | return waypoint_w 29 | 30 | def action_decision(self) -> int: 31 | pass 32 | 33 | 34 | class GTMapActionMaker(ActionMaker): 35 | def __init__(self, config) -> None: 36 | super().__init__(config) 37 | 38 | def action_decision(self, goal, follower) -> int: 39 | action = follower.get_next_action(goal) 40 | 41 | if action is None: 42 | action = 1 43 | 44 | return action 45 | 46 | 47 | class DDPPOActionMaker(ActionMaker): 48 | def __init__(self, config, _env) -> None: 49 | super().__init__(config) 50 | self.utils = utils() 51 | self._env = _env 52 | self.device = torch.device("cuda", self._env._config.SIMULATOR.HABITAT_SIM_V0.GPU_DEVICE_ID) 53 | self.grid_dim = (192, 192) 54 | self.global_dim = (512,512) 55 | self.heatmap_size = 24 56 | self.cell_size = 0.05 57 | self.img_size = (256, 256) 58 | self.n_object_classes = 27 59 | self.n_spatial_classes = 3 60 | model_path = 'data/pretrain_model/ddppo-models/gibson-4plus-mp3d-train-val-test-resnet50.pth' 61 | self.l_policy = DdppoPolicy(path=model_path) 62 | self.l_policy = self.l_policy.to(self.device) 63 | self.sg_reset() 64 | 65 | def sg_reset(self): 66 | self.sg_global = SemanticGrid( 67 | 1, self.global_dim, self.heatmap_size, self.cell_size, 68 | spatial_labels=self.n_spatial_classes, object_labels=self.n_object_classes, 69 | device=self.device, 70 | ) 71 | self.abs_poses = [] 72 | self.agent_height = [] 73 | # long term goal in global grid map 74 | self.ltg_abs_coords = torch.zeros((1, 1, 2), dtype=torch.int64).to(self.device) 75 | self.ltg_abs_coords_list = [] 76 | 77 | def run_local_policy(self, depth, goal, pose_coords, rel_agent_o, step): 78 | planning_goal = goal.squeeze(0).squeeze(0) 79 | planning_pose = pose_coords.squeeze(0).squeeze(0) 80 | 81 | sq = torch.square(planning_goal[0]-planning_pose[0]) + torch.square(planning_goal[1]-planning_pose[1]) 82 | rho = torch.sqrt(sq.float()) 83 | phi = torch.atan2(((planning_pose[0]-planning_goal[0]).float()), (planning_pose[1]-planning_goal[1]).float()) 84 | phi = phi - rel_agent_o 85 | rho = rho * self.cell_size 86 | 87 | point_goal_with_gps_compass = torch.tensor([rho,phi], dtype=torch.float32).to(self.device) 88 | depth = depth.reshape(self.img_size[0], self.img_size[1], 1) 89 | return self.l_policy.plan(depth, point_goal_with_gps_compass, step) 90 | 91 | def transform_waypoint2cm2(self, t, ltg): 92 | ltg_cm2 = [] 93 | ltg_cm2.append(-ltg[2]) 94 | ltg_cm2.append(-ltg[0]) 95 | 96 | agent_state = self._env.sim.get_agent_state() 97 | agent_pose, y_height = self.utils.get_sim_location(agent_state) 98 | ltg_cm2.append(agent_pose[2]) 99 | self.abs_poses.append(agent_pose) 100 | self.agent_height.append(y_height) 101 | 102 | rel_abs_pose = self.utils.get_rel_pose(self.abs_poses[t], self.abs_poses[0]) 103 | _rel_abs_pose = torch.Tensor(rel_abs_pose).unsqueeze(0).float() 104 | _rel_abs_pose = _rel_abs_pose.to(self.device) 105 | abs_pose_coords = self.utils.get_coord_pose(self.sg_global, _rel_abs_pose, self.abs_poses[0], self.global_dim[0], self.cell_size, self.device) # B x T x 3 106 | 107 | rel_ltg_abs_pose = self.utils.get_rel_pose(pos2=ltg_cm2, pos1=self.abs_poses[0]) 108 | 
_rel_ltg_abs_pose = torch.Tensor(rel_ltg_abs_pose).unsqueeze(0).float() 109 | _rel_ltg_abs_pose = _rel_ltg_abs_pose.to(self.device) 110 | ltg_coords = self.utils.get_coord_pose(self.sg_global, _rel_ltg_abs_pose, self.abs_poses[0], self.global_dim[0], self.cell_size, self.device) 111 | 112 | return ltg_coords, abs_pose_coords, rel_abs_pose 113 | 114 | def action_decision(self, t, ltg, depth): 115 | ltg_abs_coords, abs_pose_coords, rel_abs_pose = self.transform_waypoint2cm2(t, ltg) 116 | depth = torch.tensor(depth).to(self.device) 117 | action_id = self.run_local_policy( 118 | depth=depth, 119 | goal=ltg_abs_coords.clone(), 120 | pose_coords=abs_pose_coords.clone(), 121 | rel_agent_o=rel_abs_pose[2], 122 | step=t, 123 | ) 124 | return action_id 125 | -------------------------------------------------------------------------------- /habitat_extensions/config/default.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Union 2 | 3 | from habitat.config.default import Config as CN 4 | from habitat.config.default import get_config 5 | 6 | 7 | _C = get_config() 8 | _C.defrost() 9 | 10 | # ----------------------------------------------------------------------------- 11 | # VLN ORACLE ACTION SENSOR 12 | # ----------------------------------------------------------------------------- 13 | _C.TASK.VLN_ORACLE_ACTION_SENSOR = CN() 14 | _C.TASK.VLN_ORACLE_ACTION_SENSOR.TYPE = "VLNOracleActionSensor" 15 | _C.TASK.VLN_ORACLE_ACTION_SENSOR.GOAL_RADIUS = 0.5 16 | # compatibility with the dataset generation oracle and paper results. 17 | # if False, use the ShortestPathFollower in Habitat 18 | _C.TASK.VLN_ORACLE_ACTION_SENSOR.USE_ORIGINAL_FOLLOWER = True 19 | # ----------------------------------------------------------------------------- 20 | # VLN ORACLE PROGRESS SENSOR 21 | # ----------------------------------------------------------------------------- 22 | _C.TASK.VLN_ORACLE_PROGRESS_SENSOR = CN() 23 | _C.TASK.VLN_ORACLE_PROGRESS_SENSOR.TYPE = "VLNOracleProgressSensor" 24 | # ----------------------------------------------------------------------------- 25 | # VLN ORACLE WAYPOINT SENSOR 26 | # ----------------------------------------------------------------------------- 27 | _C.TASK.VLN_ORACLE_WAYPOINT_SENSOR = CN() 28 | _C.TASK.VLN_ORACLE_WAYPOINT_SENSOR.TYPE = "VLNOracleWaypointSensor" 29 | _C.TASK.VLN_ORACLE_WAYPOINT_SENSOR.GOAL_RADIUS = 0.5 30 | _C.TASK.VLN_ORACLE_WAYPOINT_SENSOR.USE_ORIGINAL_FOLLOWER = True 31 | _C.TASK.VLN_ORACLE_WAYPOINT_SENSOR.MAP_SIZE = 100 32 | _C.TASK.VLN_ORACLE_WAYPOINT_SENSOR.MAP_RESOLUTION = 1250 33 | _C.TASK.VLN_ORACLE_WAYPOINT_SENSOR.LAW = CN() 34 | _C.TASK.VLN_ORACLE_WAYPOINT_SENSOR.LAW.USE = True 35 | _C.TASK.VLN_ORACLE_WAYPOINT_SENSOR.LAW.SPLIT = "train" 36 | _C.TASK.VLN_ORACLE_WAYPOINT_SENSOR.LAW.GT_PATH = "data/datasets/R2R_VLNCE_v1-2_preprocessed/{split}/{split}_gt.json.gz" 37 | _C.TASK.VLN_ORACLE_WAYPOINT_SENSOR.LAW.IS_SPARSE = True 38 | _C.TASK.VLN_ORACLE_WAYPOINT_SENSOR.LAW.NUM_WAYPOINTS = 6 39 | # ----------------------------------------------------------------------------- 40 | # VLN ORACLE PATH SENSOR 41 | # ----------------------------------------------------------------------------- 42 | _C.TASK.VLN_ORACLE_PATH_SENSOR = CN() 43 | _C.TASK.VLN_ORACLE_PATH_SENSOR.TYPE = "VLNOraclePathSensor" 44 | _C.TASK.VLN_ORACLE_PATH_SENSOR.MAP_RESOLUTION = 1250 45 | _C.TASK.VLN_ORACLE_PATH_SENSOR.MAP_SIZE = 100 46 | _C.TASK.VLN_ORACLE_PATH_SENSOR.LINE_WIDTH = 1 47 | # 
----------------------------------------------------------------------------- 48 | # SEMANTIC FILTER SENSOR 49 | # ----------------------------------------------------------------------------- 50 | _C.TASK.SEMANTIC_FILTER_SENSOR = CN() 51 | _C.TASK.SEMANTIC_FILTER_SENSOR.TYPE = "SemanticFilterSensor" 52 | _C.TASK.SEMANTIC_FILTER_SENSOR.HEIGHT = 256 53 | _C.TASK.SEMANTIC_FILTER_SENSOR.WIDTH = 256 54 | _C.TASK.SEMANTIC_FILTER_SENSOR.CATEGORY = 27 55 | # ----------------------------------------------------------------------------- 56 | # GT SEMANTIC MAP SENSOR 57 | # ----------------------------------------------------------------------------- 58 | _C.TASK.GT_SEMANTIC_MAP_SENSOR = CN() 59 | _C.TASK.GT_SEMANTIC_MAP_SENSOR.TYPE = "GtSemanticMapSensor" 60 | _C.TASK.GT_SEMANTIC_MAP_SENSOR.MAP_SIZE = 100 61 | _C.TASK.GT_SEMANTIC_MAP_SENSOR.SPLIT = 'train' # 'train', 'train_aug' 62 | # ----------------------------------------------------------------------------- 63 | # HEADING SENSOR 64 | # ----------------------------------------------------------------------------- 65 | _C.TASK.HEADING_SENSOR = CN() 66 | _C.TASK.HEADING_SENSOR.TYPE = "HeadingSensor" 67 | 68 | 69 | # ----------------------------------------------------------------------------- 70 | # NDTW MEASUREMENT 71 | # ----------------------------------------------------------------------------- 72 | _C.TASK.NDTW = CN() 73 | _C.TASK.NDTW.TYPE = "NDTW" 74 | _C.TASK.NDTW.SPLIT = "val_seen" 75 | _C.TASK.NDTW.FDTW = True # False: DTW 76 | _C.TASK.NDTW.GT_PATH = ( 77 | "data/datasets/R2R_VLNCE_v1-2_preprocessed/{split}/{split}_gt.json" 78 | ) 79 | _C.TASK.NDTW.SUCCESS_DISTANCE = 0.2 80 | # ----------------------------------------------------------------------------- 81 | # SDTW MEASUREMENT 82 | # ----------------------------------------------------------------------------- 83 | _C.TASK.SDTW = CN() 84 | _C.TASK.SDTW.TYPE = "SDTW" 85 | _C.TASK.SDTW.SPLIT = "val_seen" 86 | _C.TASK.SDTW.FDTW = True # False: DTW 87 | _C.TASK.SDTW.GT_PATH = ( 88 | "data/datasets/R2R_VLNCE_v1-2_preprocessed/{split}/{split}_gt.json" 89 | ) 90 | _C.TASK.SDTW.SUCCESS_DISTANCE = 0.2 91 | # ----------------------------------------------------------------------------- 92 | # PATH_LENGTH MEASUREMENT 93 | # ----------------------------------------------------------------------------- 94 | _C.TASK.PATH_LENGTH = CN() 95 | _C.TASK.PATH_LENGTH.TYPE = "PathLength" 96 | # ----------------------------------------------------------------------------- 97 | # ORACLE_NAVIGATION_ERROR MEASUREMENT 98 | # ----------------------------------------------------------------------------- 99 | _C.TASK.ORACLE_NAVIGATION_ERROR = CN() 100 | _C.TASK.ORACLE_NAVIGATION_ERROR.TYPE = "OracleNavigationError" 101 | # ----------------------------------------------------------------------------- 102 | # ORACLE_SUCCESS MEASUREMENT 103 | # ----------------------------------------------------------------------------- 104 | _C.TASK.ORACLE_SUCCESS = CN() 105 | _C.TASK.ORACLE_SUCCESS.TYPE = "OracleSuccess" 106 | _C.TASK.ORACLE_SUCCESS.SUCCESS_DISTANCE = 0.2 107 | # ----------------------------------------------------------------------------- 108 | # ORACLE_SPL MEASUREMENT 109 | # ----------------------------------------------------------------------------- 110 | _C.TASK.ORACLE_SPL = CN() 111 | _C.TASK.ORACLE_SPL.TYPE = "OracleSPL" 112 | _C.TASK.ORACLE_SPL.SUCCESS_DISTANCE = 0.2 113 | # ----------------------------------------------------------------------------- 114 | # STEPS_TAKEN MEASUREMENT 115 | # 
----------------------------------------------------------------------------- 116 | _C.TASK.STEPS_TAKEN = CN() 117 | _C.TASK.STEPS_TAKEN.TYPE = "StepsTaken" 118 | 119 | _C.DATASET.split_num = 0 120 | _C.DATASET.split_rank = 0 121 | 122 | 123 | def get_extended_config( 124 | config_paths: Optional[Union[List[str], str]] = None, opts: Optional[list] = None 125 | ) -> CN: 126 | """Create a unified config with default values overwritten by values from 127 | :p:`config_paths` and overwritten by options from :p:`opts`. 128 | :param config_paths: List of config paths or string that contains comma 129 | separated list of config paths. 130 | :param opts: Config options (keys, values) in a list (e.g., passed from 131 | command line into the config. For example, 132 | :py:`opts = ['FOO.BAR', 0.5]`. Argument can be used for parameter 133 | sweeping or quick tests. 134 | """ 135 | config = _C.clone() 136 | 137 | if config_paths: 138 | if isinstance(config_paths, str): 139 | config_paths = [config_paths] 140 | 141 | for config_path in config_paths: 142 | config.merge_from_file(config_path) 143 | 144 | if opts: 145 | config.merge_from_list(opts) 146 | config.freeze() 147 | return config 148 | -------------------------------------------------------------------------------- /habitat_extensions/shortest_path_follower.py: -------------------------------------------------------------------------------- 1 | # Copied from https://github.com/facebookresearch/habitat-lab/blob/v0.1.4/habitat/tasks/nav/shortest_path_follower.py 2 | # Use the Habitat v0.1.4 ShortestPathFollower for compatibility with 3 | # the dataset generation oracle. 4 | 5 | from typing import Optional, Union 6 | 7 | import habitat_sim 8 | import numpy as np 9 | from habitat.sims.habitat_simulator.actions import HabitatSimActions 10 | from habitat.sims.habitat_simulator.habitat_simulator import HabitatSim 11 | from habitat.utils.geometry_utils import ( 12 | angle_between_quaternions, 13 | quaternion_from_two_vectors, 14 | ) 15 | 16 | EPSILON = 1e-6 17 | 18 | 19 | def action_to_one_hot(action: int) -> np.array: 20 | one_hot = np.zeros(len(HabitatSimActions), dtype=np.float32) 21 | one_hot[action] = 1 22 | return one_hot 23 | 24 | 25 | class ShortestPathFollowerCompat: 26 | r"""Utility class for extracting the action on the shortest path to the 27 | goal. 28 | Args: 29 | sim: HabitatSim instance. 30 | goal_radius: Distance between the agent and the goal for it to be 31 | considered successful. 32 | return_one_hot: If true, returns a one-hot encoding of the action 33 | (useful for training ML agents). If false, returns the 34 | SimulatorAction. 
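A minimal usage sketch (sim and goal_position are placeholders for objects supplied by the surrounding task code):
    follower = ShortestPathFollowerCompat(sim, goal_radius=0.5, return_one_hot=False)
    action = follower.get_next_action(goal_position)  # an action id, or None once the agent is within goal_radius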
35 | """ 36 | 37 | def __init__( 38 | self, sim: HabitatSim, goal_radius: float, return_one_hot: bool = True 39 | ): 40 | assert ( 41 | getattr(sim, "geodesic_distance", None) is not None 42 | ), "{} must have a method called geodesic_distance".format(type(sim).__name__) 43 | 44 | self._sim = sim 45 | self._max_delta = self._sim.config.FORWARD_STEP_SIZE - EPSILON 46 | self._goal_radius = goal_radius 47 | self._step_size = self._sim.config.FORWARD_STEP_SIZE 48 | 49 | self._mode = ( 50 | "geodesic_path" 51 | if getattr(sim, "get_straight_shortest_path_points", None) is not None 52 | else "greedy" 53 | ) 54 | self._return_one_hot = return_one_hot 55 | 56 | def _get_return_value(self, action) -> Union[int, np.array]: 57 | if self._return_one_hot: 58 | return action_to_one_hot(action) 59 | else: 60 | return action 61 | 62 | def get_next_action(self, goal_pos: np.array) -> Optional[Union[int, np.array]]: 63 | """Returns the next action along the shortest path. 64 | """ 65 | if ( 66 | self._sim.geodesic_distance(self._sim.get_agent_state().position, goal_pos) 67 | <= self._goal_radius 68 | ): 69 | return None 70 | 71 | max_grad_dir = self._est_max_grad_dir(goal_pos) 72 | if max_grad_dir is None: 73 | return self._get_return_value(HabitatSimActions.MOVE_FORWARD) 74 | return self._step_along_grad(max_grad_dir) 75 | 76 | def _step_along_grad(self, grad_dir: np.quaternion) -> Union[int, np.array]: 77 | current_state = self._sim.get_agent_state() 78 | alpha = angle_between_quaternions(grad_dir, current_state.rotation) 79 | if alpha <= np.deg2rad(self._sim.config.TURN_ANGLE) + EPSILON: 80 | return self._get_return_value(HabitatSimActions.MOVE_FORWARD) 81 | else: 82 | sim_action = HabitatSimActions.TURN_LEFT 83 | self._sim.step(sim_action) 84 | best_turn = ( 85 | HabitatSimActions.TURN_LEFT 86 | if ( 87 | angle_between_quaternions( 88 | grad_dir, self._sim.get_agent_state().rotation 89 | ) 90 | < alpha 91 | ) 92 | else HabitatSimActions.TURN_RIGHT 93 | ) 94 | self._reset_agent_state(current_state) 95 | return self._get_return_value(best_turn) 96 | 97 | def _reset_agent_state(self, state: habitat_sim.AgentState) -> None: 98 | self._sim.set_agent_state(state.position, state.rotation, reset_sensors=False) 99 | 100 | def _geo_dist(self, goal_pos: np.array) -> float: 101 | return self._sim.geodesic_distance( 102 | self._sim.get_agent_state().position, goal_pos 103 | ) 104 | 105 | def _est_max_grad_dir(self, goal_pos: np.array) -> np.array: 106 | 107 | current_state = self._sim.get_agent_state() 108 | current_pos = current_state.position 109 | 110 | if self.mode == "geodesic_path": 111 | points = self._sim.get_straight_shortest_path_points( 112 | self._sim.get_agent_state().position, goal_pos 113 | ) 114 | # Add a little offset as things get weird if 115 | # points[1] - points[0] is anti-parallel with forward 116 | if len(points) < 2: 117 | return None 118 | max_grad_dir = quaternion_from_two_vectors( 119 | self._sim.forward_vector, 120 | points[1] 121 | - points[0] 122 | + EPSILON * np.cross(self._sim.up_vector, self._sim.forward_vector), 123 | ) 124 | max_grad_dir.x = 0 125 | max_grad_dir = np.normalized(max_grad_dir) 126 | else: 127 | current_rotation = self._sim.get_agent_state().rotation 128 | current_dist = self._geo_dist(goal_pos) 129 | 130 | best_geodesic_delta = -2 * self._max_delta 131 | best_rotation = current_rotation 132 | for _ in range(0, 360, self._sim.config.TURN_ANGLE): 133 | sim_action = HabitatSimActions.MOVE_FORWARD 134 | self._sim.step(sim_action) 135 | new_delta = current_dist - 
self._geo_dist(goal_pos) 136 | 137 | if new_delta > best_geodesic_delta: 138 | best_rotation = self._sim.get_agent_state().rotation 139 | best_geodesic_delta = new_delta 140 | 141 | # If the best delta is within (1 - cos(TURN_ANGLE))% of the 142 | # best delta (the step size), then we almost certainly have 143 | # found the max grad dir and should just exit 144 | if np.isclose( 145 | best_geodesic_delta, 146 | self._max_delta, 147 | rtol=1 - np.cos(np.deg2rad(self._sim.config.TURN_ANGLE)), 148 | ): 149 | break 150 | 151 | self._sim.set_agent_state( 152 | current_pos, 153 | self._sim.get_agent_state().rotation, 154 | reset_sensors=False, 155 | ) 156 | 157 | sim_action = HabitatSimActions.TURN_LEFT 158 | self._sim.step(sim_action) 159 | 160 | self._reset_agent_state(current_state) 161 | 162 | max_grad_dir = best_rotation 163 | 164 | return max_grad_dir 165 | 166 | @property 167 | def mode(self): 168 | return self._mode 169 | 170 | @mode.setter 171 | def mode(self, new_mode: str): 172 | r"""Sets the mode for how the greedy follower determines the best next 173 | step. 174 | Args: 175 | new_mode: geodesic_path indicates using the simulator's shortest 176 | path algorithm to find points on the map to navigate between. 177 | greedy indicates trying to move forward at all possible 178 | orientations and selecting the one which reduces the geodesic 179 | distance the most. 180 | """ 181 | assert new_mode in {"geodesic_path", "greedy"} 182 | if new_mode == "geodesic_path": 183 | assert ( 184 | getattr(self._sim, "get_straight_shortest_path_points", None) 185 | is not None 186 | ) 187 | self._mode = new_mode 188 | -------------------------------------------------------------------------------- /vlnce_baselines/config/default.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List, Optional, Union 3 | 4 | from habitat.config.default import Config as CN 5 | from habitat_extensions.config.default import get_extended_config as get_task_config 6 | 7 | 8 | # ----------------------------------------------------------------------------- 9 | # EXPERIMENT CONFIG 10 | # ----------------------------------------------------------------------------- 11 | _C = CN() 12 | _C.BASE_TASK_CONFIG_PATH = "habitat_extensions/config/vlnce_task.yaml" 13 | _C.TASK_CONFIG = CN() # task_config will be stored as a config node 14 | _C.CMD_TRAILING_OPTS = [] # store command line options as list of strings 15 | _C.TRAINER_NAME = "dagger" 16 | _C.ENV_NAME = "VLNCEDaggerEnv" 17 | _C.SIMULATOR_GPU_ID = 0 18 | _C.SIMULATOR_GPU_IDS = None 19 | _C.TORCH_GPU_ID = 0 20 | _C.NUM_PROCESSES = 4 21 | _C.VIDEO_OPTION = [] # options: "disk", "tensorboard" 22 | _C.VIDEO_DIR = "videos/debug" 23 | _C.TENSORBOARD_DIR = "data/tensorboard_dirs/debug" 24 | _C.SENSORS = ["RGB_SENSOR", "DEPTH_SENSOR"] 25 | _C.CHECKPOINT_FOLDER = "data/checkpoints" 26 | _C.LOG_FILE = "train.log" 27 | _C.EVAL_CKPT_PATH_DIR = "data/checkpoints" # path to ckpt or path to ckpts dir 28 | _C.NUM_UPDATES = 300000 29 | _C.CHECKPOINT_INTERVAL = 512000 30 | 31 | # ----------------------------------------------------------------------------- 32 | # EVAL CONFIG 33 | # ----------------------------------------------------------------------------- 34 | _C.EVAL = CN() 35 | _C.EVAL.SPLIT = "val_seen" # The split to evaluate on 36 | _C.EVAL.USE_CKPT_CONFIG = True 37 | _C.EVAL.EPISODE_COUNT = 2 38 | 39 | # ----------------------------------------------------------------------------- 40 | # INFERENCE CONFIG 41 | # 
----------------------------------------------------------------------------- 42 | _C.INFERENCE = CN() 43 | _C.INFERENCE.SPLIT = "test" 44 | _C.INFERENCE.USE_CKPT_CONFIG = True 45 | _C.INFERENCE.CKPT_PATH = "data/checkpoints/CMA_PM_DA_Aug.pth" 46 | _C.INFERENCE.PREDICTIONS_FILE = "predictions.json" 47 | 48 | # ----------------------------------------------------------------------------- 49 | # DAGGER ENVIRONMENT CONFIG 50 | # ----------------------------------------------------------------------------- 51 | _C.DAGGER = CN() 52 | _C.DAGGER.LR = 2.5e-4 53 | _C.DAGGER.ITERATIONS = 10 54 | _C.DAGGER.EPOCHS = 4 55 | _C.DAGGER.UPDATE_SIZE = 5000 56 | _C.DAGGER.BATCH_SIZE = 5 57 | _C.DAGGER.P = 0.75 58 | _C.DAGGER.LMDB_MAP_SIZE = 5.0e12 59 | # How often to commit the writes to the DB, less commits is 60 | # better, but everything must be in memory until a commit happens/ 61 | _C.DAGGER.LMDB_COMMIT_FREQUENCY = 50 62 | _C.DAGGER.USE_IW = True 63 | # If True, load precomputed features directly from LMDB_FEATURES_DIR. 64 | _C.DAGGER.PRELOAD_LMDB_FEATURES = False 65 | _C.DAGGER.LMDB_FEATURES_DIR = "data/trajectories_dirs/debug/trajectories.lmdb" 66 | # load an already trained model for fine tuning 67 | _C.DAGGER.LOAD_FROM_CKPT = False 68 | _C.DAGGER.CKPT_TO_LOAD = "data/checkpoints/ckpt.0.pth" 69 | 70 | # ----------------------------------------------------------------------------- 71 | # MODELING CONFIG 72 | # ----------------------------------------------------------------------------- 73 | _C.MODEL = CN() 74 | # on GT trajectories in the training set 75 | _C.MODEL.inflection_weight_coef = 3.2 76 | 77 | _C.MODEL.ablate_depth = False 78 | _C.MODEL.ablate_rgb = False 79 | _C.MODEL.ablate_instruction = False 80 | 81 | _C.MODEL.INSTRUCTION_ENCODER = CN() 82 | _C.MODEL.INSTRUCTION_ENCODER.vocab_size = 2504 83 | _C.MODEL.INSTRUCTION_ENCODER.max_length = 200 84 | _C.MODEL.INSTRUCTION_ENCODER.use_pretrained_embeddings = True 85 | _C.MODEL.INSTRUCTION_ENCODER.embedding_file = ( 86 | "data/datasets/R2R_VLNCE_v1-2_preprocessed/embeddings.json.gz" 87 | ) 88 | _C.MODEL.INSTRUCTION_ENCODER.dataset_vocab = ( 89 | "data/datasets/R2R_VLNCE_v1-2_preprocessed/train/train.json.gz" 90 | ) 91 | _C.MODEL.INSTRUCTION_ENCODER.fine_tune_embeddings = False 92 | _C.MODEL.INSTRUCTION_ENCODER.embedding_size = 50 93 | _C.MODEL.INSTRUCTION_ENCODER.hidden_size = 128 94 | _C.MODEL.INSTRUCTION_ENCODER.rnn_type = "LSTM" 95 | _C.MODEL.INSTRUCTION_ENCODER.final_state_only = False 96 | _C.MODEL.INSTRUCTION_ENCODER.bidirectional = True 97 | _C.MODEL.INSTRUCTION_ENCODER.backbone = 'lstm' 98 | 99 | _C.MODEL.RGB_ENCODER = CN() 100 | _C.MODEL.RGB_ENCODER.output_size = 256 101 | _C.MODEL.RGB_ENCODER.backbone = "unet" 102 | _C.MODEL.RGB_ENCODER.pretrain_model = 'data/pretrain_model/unet-models/2021_02_14-23_42_50.pt' 103 | 104 | _C.MODEL.DEPTH_ENCODER = CN() 105 | _C.MODEL.DEPTH_ENCODER.output_size = 128 106 | _C.MODEL.DEPTH_ENCODER.backbone = "resnet50" # type of resnet to use 107 | _C.MODEL.DEPTH_ENCODER.ddppo_checkpoint = "data/pretrain_model/ddppo-models/gibson-2plus-resnet50.pth" # path to DDPPO resnet weights 108 | 109 | _C.MODEL.MAP_ENCODER = CN() 110 | _C.MODEL.MAP_ENCODER.ego_map_size = 100 111 | _C.MODEL.MAP_ENCODER.output_size = 256 112 | 113 | _C.MODEL.STATE_ENCODER = CN() 114 | _C.MODEL.STATE_ENCODER.hidden_size = 512 115 | _C.MODEL.STATE_ENCODER.rnn_type = "GRU" 116 | _C.MODEL.STATE_ENCODER.input_type = ['rgb', 'depth', 'map'] 117 | 118 | _C.MODEL.PROGRESS_MONITOR = CN() 119 | _C.MODEL.PROGRESS_MONITOR.use = True 120 | 
_C.MODEL.PROGRESS_MONITOR.alpha = 1.0 # loss multiplier 121 | 122 | _C.MODEL.CONTRASTIVE_MONITOR = CN() 123 | _C.MODEL.CONTRASTIVE_MONITOR.target_tau = 0.07 124 | _C.MODEL.CONTRASTIVE_MONITOR.use = True 125 | _C.MODEL.CONTRASTIVE_MONITOR.alpha = 1.0 126 | 127 | _C.MODEL.PREDICTION_MONITOR = CN() 128 | _C.MODEL.PREDICTION_MONITOR.use = True 129 | _C.MODEL.PREDICTION_MONITOR.alpha = 0.1 130 | 131 | _C.MODEL.RGBMAPPING = CN() 132 | _C.MODEL.RGBMAPPING.map_depth = 64 133 | _C.MODEL.RGBMAPPING.global_map_size = 240 134 | _C.MODEL.RGBMAPPING.egocentric_map_size = 100 135 | _C.MODEL.RGBMAPPING.resolution = 0.12 136 | _C.MODEL.RGBMAPPING.gpu_id = 0 137 | _C.MODEL.RGBMAPPING.num_proc = 1 138 | 139 | _C.STOP_CONDITION = CN() 140 | _C.STOP_CONDITION.TYPE = 'prog' 141 | _C.STOP_CONDITION.PROG_THRESHOLD = 0.8 142 | 143 | _C.OVERWRITE = False 144 | _C.LOG_INTERVAL = 100 145 | _C.random_agent = False 146 | _C.RESUME_CKPT = None # resume from this ckpt 147 | _C.VIDEO_NUM = 99999 148 | _C.ego_map_size = 100 149 | _C.same_level_train = False 150 | _C.ep_max_len = 200 151 | _C.step_num = 3 152 | _C.use_ddppo = False 153 | 154 | 155 | def get_config( 156 | config_paths: Optional[Union[List[str], str]] = None, opts: Optional[list] = None 157 | ) -> CN: 158 | r"""Create a unified config with default values overwritten by values from 159 | `config_paths` and overwritten by options from `opts`. 160 | Args: 161 | config_paths: List of config paths or string that contains comma 162 | separated list of config paths. 163 | opts: Config options (keys, values) in a list (e.g., passed from 164 | command line into the config. For example, `opts = ['FOO.BAR', 165 | 0.5]`. Argument can be used for parameter sweeping or quick tests. 166 | """ 167 | config = _C.clone() 168 | if config_paths: 169 | if isinstance(config_paths, str): 170 | config_paths = [config_paths] 171 | 172 | for config_path in config_paths: 173 | config.merge_from_file(config_path) 174 | 175 | if config.BASE_TASK_CONFIG_PATH != "": 176 | config.TASK_CONFIG = get_task_config(config.BASE_TASK_CONFIG_PATH) 177 | if opts: 178 | config.CMD_TRAILING_OPTS = opts 179 | config.merge_from_list(opts) 180 | 181 | return config 182 | 183 | 184 | def refine_config(config, local_rank): 185 | config.defrost() 186 | 187 | config.TORCH_GPU_ID = local_rank 188 | config.MODEL.RGBMAPPING.gpu_id = config.TORCH_GPU_ID 189 | config.MODEL.RGBMAPPING.num_proc = config.NUM_PROCESSES 190 | 191 | split = config.TASK_CONFIG.DATASET.SPLIT 192 | config.TASK_CONFIG.TASK.NDTW.SPLIT = split 193 | config.TASK_CONFIG.TASK.SDTW.SPLIT = split 194 | config.TASK_CONFIG.TASK.VLN_ORACLE_WAYPOINT_SENSOR.LAW.SPLIT = split 195 | 196 | if config.DAGGER.P == 1.0: # if doing teacher forcing, don't switch the scene until it is complete 197 | config.TASK_CONFIG.ENVIRONMENT.ITERATOR_OPTIONS.MAX_SCENE_REPEAT_STEPS = (-1) 198 | 199 | if config.same_level_train: 200 | config.TASK_CONFIG.DATASET.DATA_PATH = 'data/datasets/R2R_VLNCE_v1-2_preprocessed/train/train_same_level.json.gz' 201 | 202 | if 'aug' in config.BASE_TASK_CONFIG_PATH: 203 | config.TASK_CONFIG.TASK.GT_SEMANTIC_MAP_SENSOR.SPLIT = 'train_aug' 204 | 205 | config.freeze() 206 | return config 207 | 208 | def set_saveDir_GPUs(config, run_type, model_dir, note, gpus, local_rank): 209 | config.defrost() 210 | 211 | run_dir = os.path.join(model_dir, "run_{}_{}".format(run_type, note)) 212 | os.makedirs(run_dir, exist_ok=True) 213 | 214 | config.CHECKPOINT_FOLDER = os.path.join(run_dir, 'checkpoint') 215 | config.LOG_FILE = os.path.join(run_dir, 
'{}.log'.format(run_type)) 216 | config.TENSORBOARD_DIR = os.path.join(run_dir, 'tensorboard') 217 | if config.DAGGER.PRELOAD_LMDB_FEATURES is False: 218 | config.DAGGER.LMDB_FEATURES_DIR = os.path.join(run_dir, 'trajectories.lmdb') 219 | config.VIDEO_DIR = os.path.join(run_dir, 'video_dir') 220 | config.CODE_DIR = os.path.join(run_dir, 'sh_n_codes') 221 | config.CONFIG_DIR = os.path.join(run_dir, 'config') 222 | config.METRIC_DIR = os.path.join(run_dir, 'metric') 223 | 224 | config.SIMULATOR_GPU_ID = local_rank 225 | config.SIMULATOR_GPU_IDS = None 226 | if gpus is not None: 227 | config.TORCH_GPU_ID = gpus[0] 228 | config.SIMULATOR_GPU_IDS = gpus if len(gpus) == 1 else gpus[1:] 229 | config.freeze() 230 | 231 | return config 232 | -------------------------------------------------------------------------------- /vlnce_baselines/models/mg_map_policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import Space 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | from habitat import Config 9 | from habitat_baselines.rl.models.rnn_state_encoder import RNNStateEncoder 10 | from habitat_baselines.rl.ppo.policy import Net 11 | 12 | from vlnce_baselines.models.encoders.instruction_encoder import InstructionEncoder 13 | from vlnce_baselines.models.encoders.unet_encoder import UNet 14 | from vlnce_baselines.models.encoders.resnet_encoders import VlnResnetDepthEncoder 15 | from vlnce_baselines.models.encoders.map_encoder import MapEncoder, MapDecoder 16 | from vlnce_baselines.common.rgb_mapping import RGBMapping 17 | 18 | 19 | class MGMapNet(Net): 20 | """ A multi-granularity map (MGMap) network that contains: 21 | Instruction encoder 22 | RGB encoder 23 | Depth encoder 24 | Map encoder and decoder 25 | RNN state encoder 26 | """ 27 | def __init__(self, observation_space: Space, model_config: Config): 28 | super().__init__() 29 | self.model_config = model_config 30 | 31 | # Init the instruction encoder 32 | self.instruction_encoder = InstructionEncoder(model_config.INSTRUCTION_ENCODER) 33 | 34 | # # Init the rgb encoder 35 | self.rgb_encoder = UNet(model_config) 36 | for param in self.rgb_encoder.parameters(): 37 | param.requires_grad = False 38 | self.rgb_linear = nn.Sequential( 39 | nn.AdaptiveAvgPool1d(1), 40 | nn.Flatten(), 41 | nn.Linear( 42 | self.rgb_encoder.output_shape[0], 43 | model_config.RGB_ENCODER.output_size, 44 | ), 45 | nn.ReLU(True), 46 | ) 47 | 48 | # Init the depth encoder 49 | self.depth_encoder = VlnResnetDepthEncoder( 50 | observation_space, 51 | output_size=model_config.DEPTH_ENCODER.output_size, 52 | checkpoint=model_config.DEPTH_ENCODER.ddppo_checkpoint, 53 | backbone=model_config.DEPTH_ENCODER.backbone, 54 | spatial_output=True, 55 | ) 56 | self.depth_linear = nn.Sequential( 57 | nn.Flatten(), 58 | nn.Linear( 59 | np.prod(self.depth_encoder.output_shape), 60 | model_config.DEPTH_ENCODER.output_size, 61 | ), 62 | nn.ReLU(True), 63 | ) 64 | 65 | # Init the mapping network 66 | self.rgb_mapping_module = RGBMapping(model_config.RGBMAPPING) 67 | map_channel = model_config.RGBMAPPING.map_depth 68 | 69 | # Init the map encoder 70 | self.map_encoder = MapEncoder( 71 | model_config.MAP_ENCODER.ego_map_size, 72 | map_channel, 73 | model_config.MAP_ENCODER.output_size, 74 | ) 75 | 76 | # Init the map decoder 77 | self.map_decoder = MapDecoder(model_config.MAP_ENCODER.output_size) 78 | self.map_classfier = nn.Sequential( 79 | 
nn.ConvTranspose2d(self.map_decoder.output_shape[0], 32, kernel_size=4, stride=2, padding=1, bias=False), 80 | nn.BatchNorm2d(32), 81 | nn.ReLU(inplace=True), 82 | nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1, bias=False), 83 | nn.BatchNorm2d(32), 84 | nn.ReLU(inplace=True), 85 | nn.Conv2d(32, 27, kernel_size=1, stride=1, padding=0, bias=True), 86 | ) 87 | 88 | # Init the map linear 89 | self.map_encoded_linear = nn.Sequential( 90 | nn.Conv2d(self.map_encoder.output_shape[0], 128, 3, stride=1, padding=1), 91 | nn.ReLU(), 92 | ) 93 | self.map_classified_linear = nn.Sequential( 94 | nn.Conv2d(27, 128, 3, stride=1, padding=1), 95 | nn.ReLU(), 96 | ) 97 | self.map_cated_linear = nn.Sequential( 98 | nn.Conv2d(128*2, model_config.MAP_ENCODER.output_size, 3, stride=1, padding=1), 99 | nn.ReLU(), 100 | ) 101 | self.map_linear = nn.Sequential( 102 | nn.AdaptiveAvgPool1d(1), 103 | nn.Flatten(), 104 | nn.Linear( 105 | model_config.MAP_ENCODER.output_size, 106 | model_config.MAP_ENCODER.output_size, 107 | ), 108 | nn.ReLU(True), 109 | ) 110 | 111 | # Init the first rnn state decoder 112 | self._hidden_size = model_config.STATE_ENCODER.hidden_size 113 | first_state_input_size = ( 114 | (model_config.RGB_ENCODER.output_size if 'rgb' in model_config.STATE_ENCODER.input_type else 0) 115 | + (model_config.DEPTH_ENCODER.output_size if 'depth' in model_config.STATE_ENCODER.input_type else 0) 116 | + (model_config.MAP_ENCODER.output_size if 'map' in model_config.STATE_ENCODER.input_type else 0) 117 | ) 118 | self.state_encoder = RNNStateEncoder( 119 | input_size=first_state_input_size, 120 | hidden_size=self._hidden_size, 121 | num_layers=1, 122 | rnn_type=model_config.STATE_ENCODER.rnn_type, 123 | ) 124 | 125 | # Init the attention encoder 126 | self.state_text_q_layer = nn.Linear(self._hidden_size, self._hidden_size // 2) 127 | self.state_text_k_layer = nn.Conv1d(self.instruction_encoder.output_size, self._hidden_size // 2, 1) 128 | 129 | self.text_map_q_layer = nn.Linear(self.instruction_encoder.output_size, self._hidden_size // 2) 130 | self.text_map_k_layer = nn.Conv1d(self.map_encoder.output_shape[0], self._hidden_size // 2, 1) 131 | 132 | self.register_buffer("_scale", torch.tensor(1.0 / ((self._hidden_size // 2) ** 0.5))) 133 | 134 | # Init the second rnn state decoder 135 | second_state_input_size = ( 136 | model_config.STATE_ENCODER.hidden_size 137 | + model_config.STATE_ENCODER.hidden_size // 2 138 | + (model_config.STATE_ENCODER.hidden_size // 2 if 'map' in model_config.STATE_ENCODER.input_type else 0) 139 | ) 140 | self.second_state_compress = nn.Sequential( 141 | nn.Linear( 142 | second_state_input_size, 143 | self._hidden_size, 144 | ), 145 | nn.ReLU(True), 146 | ) 147 | self.second_state_encoder = RNNStateEncoder( 148 | input_size=self._hidden_size, 149 | hidden_size=self._hidden_size, 150 | num_layers=1, 151 | rnn_type=model_config.STATE_ENCODER.rnn_type, 152 | ) 153 | self._output_size = model_config.STATE_ENCODER.hidden_size 154 | 155 | self.train() 156 | self.depth_encoder.eval() 157 | self.rgb_encoder.eval() 158 | 159 | @property 160 | def output_size(self): 161 | return self._output_size 162 | 163 | @property 164 | def is_blind(self): 165 | return False 166 | 167 | @property 168 | def num_recurrent_layers(self): 169 | return self.state_encoder.num_recurrent_layers + ( 170 | self.second_state_encoder.num_recurrent_layers 171 | ) 172 | 173 | def _attn(self, q, k, v, mask=None): 174 | logits = torch.einsum("nc, nci -> ni", q, k) 175 | if mask is not None: 176 | logits = logits - 
mask.float() * 1e8 177 | attn = F.softmax(logits * self._scale, dim=1) 178 | return torch.einsum("ni, nci -> nc", attn, v), attn 179 | 180 | def forward(self, observations, rnn_hidden_states, prev_actions, masks): 181 | instruction_embedding, text_mask = self.instruction_encoder(observations) 182 | rgb_embedding, rgb_embedding_proj = self.rgb_encoder(observations) 183 | depth_embedding = self.depth_encoder(observations) 184 | 185 | # Get map 186 | self.rgb_mapping_module(rgb_embedding_proj, observations, masks) 187 | ego_map = observations['rgb_ego_map'] 188 | 189 | # Encoding map 190 | map_encoded = self.map_encoder(ego_map) 191 | map_encoded_proj = self.map_encoded_linear(map_encoded) 192 | 193 | # Decoding map (ie segmentation prediction) 194 | map_decoded = self.map_decoder(map_encoded) # [bs, 64, 64] 195 | pred_sem_map = self.map_classfier(map_decoded) 196 | map_classified_proj = self.map_classified_linear( 197 | torch.nn.functional.avg_pool2d(pred_sem_map, kernel_size=2, stride=2) 198 | ) 199 | 200 | # Get concated map embedding 201 | map_cat = [map_encoded_proj, map_classified_proj] 202 | map_embedding = torch.cat(map_cat, dim=1) # [bs, 2*c / c, 50, 50] 203 | map_embedding = self.map_cated_linear(map_embedding) 204 | 205 | rgb_embedding = torch.flatten(rgb_embedding, 2) 206 | depth_embedding = torch.flatten(depth_embedding, 2) 207 | map_embedding = torch.flatten(map_embedding, 2) 208 | 209 | state_in = [] 210 | if 'rgb' in self.model_config.STATE_ENCODER.input_type: 211 | rgb_in = self.rgb_linear(rgb_embedding) 212 | state_in.append(rgb_in) 213 | if 'depth' in self.model_config.STATE_ENCODER.input_type: 214 | depth_in = self.depth_linear(depth_embedding) 215 | state_in.append(depth_in) 216 | if 'map' in self.model_config.STATE_ENCODER.input_type: 217 | map_in = self.map_linear(map_embedding) 218 | state_in.append(map_in) 219 | state_in = torch.cat(state_in, dim=1) 220 | ( 221 | state, 222 | rnn_hidden_states[0: self.state_encoder.num_recurrent_layers], 223 | ) = self.state_encoder( 224 | state_in, 225 | rnn_hidden_states[0: self.state_encoder.num_recurrent_layers], 226 | masks, 227 | ) 228 | 229 | state_text_q = self.state_text_q_layer(state) 230 | state_text_k = self.state_text_k_layer(instruction_embedding) 231 | text_embedding, _ = self._attn(state_text_q, state_text_k, instruction_embedding, text_mask) 232 | 233 | text_map_q = self.text_map_q_layer(text_embedding) 234 | text_map_k = self.text_map_k_layer(map_embedding) 235 | map_embedding, self.att_map_t_m = self._attn(text_map_q, text_map_k, map_embedding, None) 236 | 237 | if 'map' in self.model_config.STATE_ENCODER.input_type: 238 | x = torch.cat([state, text_embedding, map_embedding], dim=1) 239 | else: 240 | x = torch.cat([state, text_embedding], dim=1) 241 | x = self.second_state_compress(x) 242 | ( 243 | x, 244 | rnn_hidden_states[self.state_encoder.num_recurrent_layers:], 245 | ) = self.second_state_encoder( 246 | x, 247 | rnn_hidden_states[self.state_encoder.num_recurrent_layers:], 248 | masks 249 | ) 250 | 251 | return x, rnn_hidden_states, pred_sem_map 252 | -------------------------------------------------------------------------------- /vlnce_baselines/common/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | import zipfile 5 | import socket 6 | import shutil 7 | import quaternion 8 | from glob import glob 9 | from shlex import quote 10 | from typing import Dict, List 11 | 12 | import torch 13 | 14 | 15 | def 
transform_obs( 16 | observations: List[Dict], instruction_sensor_uuid: str, device=None 17 | ) -> Dict[str, torch.Tensor]: 18 | """Extracts instruction tokens from an instruction sensor and 19 | transposes a batch of observation dicts to a dict of batched 20 | observations. 21 | 22 | Args: 23 | observations: list of dicts of observations. 24 | instruction_sensor_uuid: name of the instructoin sensor to 25 | extract from. 26 | device: The torch.device to put the resulting tensors on. 27 | Will not move the tensors if None 28 | 29 | Returns: 30 | transposed dict of lists of observations. 31 | """ 32 | for i in range(len(observations)): 33 | observations[i][instruction_sensor_uuid] = observations[i][ 34 | instruction_sensor_uuid 35 | ]["tokens"] 36 | 37 | for obs in observations: 38 | if 'semantic' in obs: 39 | del obs['semantic'] 40 | 41 | for obs in observations: 42 | for sensor in obs: 43 | if type(obs[sensor]) == torch.Tensor: 44 | obs[sensor] = obs[sensor].to(device) 45 | return observations 46 | 47 | 48 | def check_exist_file(config): 49 | dirs = [config.VIDEO_DIR, config.TENSORBOARD_DIR, config.CHECKPOINT_FOLDER] 50 | if any([os.path.exists(d) for d in dirs]): 51 | if config.OVERWRITE: 52 | for d in dirs: 53 | if os.path.exists(d): 54 | shutil.rmtree(d) 55 | else: 56 | order = None 57 | while order not in ['y', 'n']: 58 | order = input('Output directory already exists! Overwrite the folder? (y/n)') 59 | if order == 'y': 60 | for d in dirs: 61 | if os.path.exists(d): 62 | shutil.rmtree(d) 63 | elif order == 'n': 64 | break 65 | 66 | 67 | def save_sh_n_codes(config, run_type, ignore_dir=['']): 68 | os.makedirs(config.CODE_DIR, exist_ok=True) 69 | 70 | name = os.path.join(config.CODE_DIR, 'run_{}_{}.sh'.format(run_type, socket.gethostname())) 71 | with open(name, 'w') as f: 72 | envs = ['CUDA_VISIBLE_DEVICES'] 73 | for env in envs: 74 | value = os.environ.get(env, None) 75 | if value is not None: 76 | f.write(f'export {env}={quote(value)}\n') 77 | f.write(sys.executable + ' ' + ' '.join(quote(arg) for arg in sys.argv) + '\n') 78 | 79 | name = os.path.join(config.CODE_DIR, 'code.zip') 80 | with zipfile.ZipFile(name, mode='w', compression=zipfile.ZIP_DEFLATED) as zf: 81 | 82 | first_list = glob('*', recursive=True) 83 | first_list = [i for i in first_list if i not in ignore_dir] 84 | 85 | file_list = [] 86 | patterns = [x + '/**' for x in first_list] 87 | for pattern in patterns: 88 | file_list.extend(glob(pattern, recursive=True)) 89 | 90 | file_list = [x[:-1] if x[-1] == "/" else x for x in file_list] 91 | for filename in file_list: 92 | zf.write(filename) 93 | 94 | 95 | def save_config(config, run_type): 96 | os.makedirs(config.CONFIG_DIR, exist_ok=True) 97 | name = os.path.join(config.CONFIG_DIR, 'config_of_{}.txt'.format(run_type)) 98 | with open(name, 'w') as f: 99 | f.write(str(config)) 100 | 101 | 102 | label_conversion_40_27 = {-1:0, 0:0, 1:15, 2:17, 3:1, 4:2, 5:3, 6:18, 7:19, 8:4, 9:15, 10:5, 11:6, 12:16, 13:20, 14:7, 15:8, 16:17, 17:17, 103 | 18:9, 19:21, 20:22, 21:16, 22:10, 23:11, 24:15, 25:12, 26:13, 27:23, 28:16, 29:16, 30:16, 31:16, 32:16, 104 | 33:24, 34:25, 35:16, 36:16, 37:14, 38:26, 39:16, 40:16} 105 | label_conversion_40_3 = {-1:0, 0:0, 1:1, 2:2, 3:1, 4:1, 5:1, 6:1, 7:1, 8:1, 9:1, 10:1, 11:1, 12:1, 13:1, 14:1, 15:1, 16:2, 17:2, 106 | 18:1, 19:1, 20:1, 21:1, 22:1, 23:1, 24:1, 25:1, 26:1, 27:1, 28:1, 29:1, 30:1, 31:1, 32:1, 107 | 33:1, 34:1, 35:1, 36:1, 37:1, 38:1, 39:1, 40:1} 108 | 109 | 110 | def get_sim_location(agent_state): 111 | x = -agent_state.position[2] 112 | y = 
-agent_state.position[0] 113 | height = agent_state.position[1] 114 | axis = quaternion.as_euler_angles(agent_state.rotation)[0] 115 | if (axis%(2*np.pi)) < 0.1 or (axis%(2*np.pi)) > 2*np.pi - 0.1: 116 | o = quaternion.as_euler_angles(agent_state.rotation)[1] 117 | else: 118 | o = 2*np.pi - quaternion.as_euler_angles(agent_state.rotation)[1] 119 | if o > np.pi: 120 | o -= 2 * np.pi 121 | pose = x, y, o 122 | return pose, height 123 | 124 | def load_scene_pcloud(preprocessed_scenes_dir, scene_id, n_object_classes): 125 | pcloud_path = preprocessed_scenes_dir+scene_id+'_pcloud.npz' 126 | if not os.path.exists(pcloud_path): 127 | raise Exception('Preprocessed point cloud for scene', scene_id,'not found!') 128 | 129 | data = np.load(pcloud_path) 130 | x = data['x'] 131 | y = data['y'] 132 | z = data['z'] 133 | label_seq = data['label_seq'] 134 | data.close() 135 | 136 | label_seq[ label_seq<0.0 ] = 0.0 137 | # Convert the labels to the reduced set of categories 138 | label_seq_spatial = label_seq.copy() 139 | label_seq_objects = label_seq.copy() 140 | for i in range(label_seq.shape[0]): 141 | curr_lbl = label_seq[i,0] 142 | label_seq_spatial[i] = label_conversion_40_3[curr_lbl] 143 | label_seq_objects[i] = label_conversion_40_27[curr_lbl] 144 | return (x, y, z), label_seq_spatial, label_seq_objects 145 | 146 | def load_scene_color(preprocessed_scenes_dir, scene_id): 147 | # loads the rgb information of the map 148 | color_path = preprocessed_scenes_dir+scene_id+'_color.npz' 149 | if not os.path.exists(color_path): 150 | raise Exception('Preprocessed color for scene', scene_id,'not found!') 151 | 152 | data = np.load(color_path) 153 | r = data['r'] 154 | g = data['g'] 155 | b = data['b'] 156 | color_pcloud = np.stack((r,g,b)) # 3 x Npoints 157 | return color_pcloud 158 | 159 | def discretize_coords(x, z, grid_dim, cell_size, translation=0): 160 | # x, z are the coordinates of the 3D point (either in camera coordinate frame, or the ground-truth camera position) 161 | # If translation=0, assumes the agent is at the center 162 | # If we want the agent to be positioned lower then use positive translation. 
When getting the gt_crop, we need negative translation 163 | #map_coords = torch.zeros((len(x), 2), device='cuda') 164 | map_coords = torch.zeros((len(x), 2)) 165 | xb = torch.floor(x[:]/cell_size) + (grid_dim[0]-1)/2.0 166 | zb = torch.floor(z[:]/cell_size) + (grid_dim[1]-1)/2.0 + translation 167 | xb = xb.int() 168 | zb = zb.int() 169 | map_coords[:,0] = xb 170 | map_coords[:,1] = zb 171 | # keep bin coords within dimensions 172 | map_coords[map_coords>grid_dim[0]-1] = grid_dim[0]-1 173 | map_coords[map_coords<0] = 0 174 | return map_coords.long() 175 | 176 | def slice_scene(x, y, z, label_seq, position, height, color_pcloud=None, device='cuda'): 177 | # z = -z 178 | # Slice the scene below and above the agent 179 | below_thresh = height-0.2 180 | above_thresh = height+2.0 181 | all_inds = np.arange(y.shape[0]) 182 | below_inds = np.where(z < below_thresh)[0] 183 | above_inds = np.where(z > above_thresh)[0] 184 | # xout_inds = np.where(abs(x-position[1]) > 8)[0] 185 | # yout_inds = np.where(abs(y-position[0]) > 8)[0] 186 | invalid_inds = np.concatenate( (below_inds, above_inds), 0) # remove the floor and ceiling inds from the local3D points 187 | inds = np.delete(all_inds, invalid_inds) 188 | x_fil = x[inds] 189 | y_fil = y[inds] 190 | z_fil = z[inds] 191 | label_seq_fil = torch.tensor(label_seq[inds], dtype=torch.float, device=device) 192 | if color_pcloud is not None: 193 | color_pcloud_fil = torch.tensor(color_pcloud[:,inds], dtype=torch.float, device=device) 194 | return x_fil, y_fil, z_fil, label_seq_fil, color_pcloud_fil 195 | else: 196 | return x_fil, y_fil, z_fil, label_seq_fil 197 | 198 | def get_gt_map(x, y, label_seq, abs_pose, grid_dim, cell_size, color_pcloud=None, z=None, device='cuda'): 199 | # Transform the ground-truth map to align with the agent's pose 200 | # The agent is at the center looking upwards 201 | point_map = np.array([x,y]) 202 | angle = -abs_pose[2] 203 | rot_mat_abs = np.array([[np.cos(angle), -np.sin(angle)],[np.sin(angle),np.cos(angle)]]) 204 | trans_mat_abs = np.array([[-abs_pose[1]],[abs_pose[0]]]) #### This is important, the first index is negative. 205 | ##rotating and translating point map points 206 | t_points = point_map - trans_mat_abs 207 | rot_points = np.matmul(rot_mat_abs,t_points) 208 | x_abs = torch.tensor(rot_points[0,:], device=device) 209 | y_abs = torch.tensor(rot_points[1,:], device=device) 210 | 211 | map_coords = discretize_coords(x=x_abs, z=y_abs, grid_dim=grid_dim, cell_size=cell_size) 212 | 213 | # Coordinates in map_coords need to be sorted based on their height, floor values go first 214 | # Still not perfect 215 | if z is not None: 216 | z = np.asarray(z) 217 | sort_inds = np.argsort(z) 218 | map_coords = map_coords[sort_inds,:] 219 | label_seq = label_seq[sort_inds,:] 220 | 221 | true_seg_grid = torch.zeros((grid_dim[0], grid_dim[1], 1), device=device) 222 | true_seg_grid[map_coords[:,1], map_coords[:,0]] = label_seq.clone() 223 | 224 | ### We need to flip the ground truth to align with the observations. 225 | ### Probably because the -y to -z is a rotation about the x axis, which also flips the y coordinate for Matterport.
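### In the notation above: with abs_pose = (x0, y0, o), each map point p = (px, py) is re-expressed in the
### agent's frame as R(-o) @ (p - t) with t = (-y0, x0), which is what rot_mat_abs / trans_mat_abs compute
### before discretize_coords bins the points into the grid. The flip below mirrors the rows of the
### (grid_dim[0], grid_dim[1], 1) grid so the ground-truth map matches the row ordering of the egocentric
### observation maps, and the permute converts it to the channel-first (1, H, W) layout.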
226 | true_seg_grid = torch.flip(true_seg_grid, dims=[0]) 227 | true_seg_grid = true_seg_grid.permute(2, 0, 1) 228 | 229 | if color_pcloud is not None: 230 | color_grid = torch.zeros((grid_dim[0], grid_dim[1], 3), device=device) 231 | color_grid[map_coords[:,1], map_coords[:,0],0] = color_pcloud[0] 232 | color_grid[map_coords[:,1], map_coords[:,0],1] = color_pcloud[1] 233 | color_grid[map_coords[:,1], map_coords[:,0],2] = color_pcloud[2] 234 | color_grid = torch.flip(color_grid, dims=[0]) 235 | color_grid = color_grid.permute(2, 0 ,1) 236 | return true_seg_grid, color_grid/255.0 237 | else: 238 | return true_seg_grid 239 | 240 | 241 | class TransfomationRealworldAgent(): 242 | def __init__(self, agent_state) -> None: 243 | self.agent_state = agent_state 244 | self.T = self.agent_state.position.reshape(1,-1).T 245 | self.R = quaternion.as_rotation_matrix(self.agent_state.rotation) 246 | 247 | def original_matrix(self, position): 248 | original_matrix = np.matrix( 249 | [[position[0]], [position[1]], [position[2]]] 250 | ) 251 | return original_matrix 252 | 253 | def realworld2agent(self, point): 254 | O = self.original_matrix(point) 255 | point_a = (self.R.T @ O) + (self.R.T @ -self.T) 256 | return np.squeeze(np.asarray(point_a)) 257 | 258 | def agent2realworld(self, point): 259 | O = self.original_matrix(point) 260 | point_w = (self.R @ O) + self.T 261 | return np.squeeze(np.asarray(point_w)) 262 | -------------------------------------------------------------------------------- /habitat_extensions/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import Dict 3 | 4 | import numpy as np 5 | import torch 6 | 7 | from habitat.core.utils import try_cv2_import 8 | from habitat.utils.visualizations import maps 9 | from habitat.utils.visualizations.utils import draw_collision 10 | 11 | 12 | cv2 = try_cv2_import() 13 | 14 | COLOR_ProjSem_27 = [ 15 | [255,255,255] # white 16 | ,[128,128,0] # olive (dark yellow) 17 | ,[0,0,255] # blue 18 | ,[255,0,0] # red 19 | ,[255,0,255] # magenta 20 | ,[0,255,255] # cyan 21 | ,[255,165,0] # orange 22 | ,[255,255,0] # yellow 23 | ,[128,128,128] # gray 24 | ,[128,0,0] # maroon 25 | ,[255,20,147] # pink 26 | ,[0,128,0] # dark green 27 | ,[128,0,128] # purple 28 | ,[0,128,128] # teal 29 | ,[0,0,128] # navy (dark blue) 30 | ,[210,105,30] # chocolate 31 | ,[188,143,143] # rosy brown 32 | ,[0,255,0] # green 33 | ,[255,215,0] # gold 34 | ,[0,0,0] # black 35 | ,[192,192,192] # silver 36 | ,[138,43,226] # blue violet 37 | ,[255,127,80] # coral 38 | ,[238,130,238] # violet 39 | ,[245,245,220] # beige 40 | ,[139,69,19] # saddle brown 41 | ,[64,224,208] # turquoise 42 | ] 43 | 44 | OBJECTS_ProjSem_27 = [ 45 | 'void', 'chair', 'door', 'table', 'cushion', 46 | 'sofa', 'bed', 'plant', 'sink', 'toilet', 47 | 'tv_monitor', 'shower', 'bathtub', 48 | 'counter', 'appliances', 'structure', 'other', 49 | 'free-space', 'picture', 'cabinet', 'chest_of_drawers', 'stool', 50 | 'towel', 'fireplace', 'gym_equipment', 'seating', 51 | 'clothes' 52 | ] 53 | 54 | COLOR_ProjSem = [ 55 | [235, 190, 157], [235, 219, 156], [255, 255, 255], [189, 234, 155], [163, 233, 158], 56 | [156, 234, 180], [156, 235, 206], [157, 226, 236], [156, 198, 235], [156, 170, 231], 57 | [170, 155, 235], [198, 154, 234], [230, 156, 235], [234, 154, 213], [235, 156, 181], 58 | [157, 190, 181], [198, 156, 206], 59 | ] 60 | OBJECTS_ProjSem = [ 61 | 'wall', 'chair', 'door', 'table', 'picture', 62 | 'cabinet', 'window', 'sofa', 'bed', 'plant', 63 | 
'sink', 'stairs', 'mirror', 'shower', 'counter', 64 | 'fireplace', 'railing', 65 | ] 66 | 67 | COLOR_AABBSem = [ 68 | [235, 190, 157], [235, 219, 156], [208, 234, 157], [189, 234, 155], [163, 233, 158], 69 | [156, 234, 180], [156, 235, 206], [157, 226, 236], [156, 198, 235], [156, 170, 231], 70 | [170, 155, 235], [198, 154, 234], [230, 156, 235], [234, 154, 213], [235, 156, 181], 71 | ] 72 | OBJECTS_AABBSem = [ 73 | 'door', 'stair', 'bed', 'doorway', 'table', 74 | 'chair', 'couch', 'sink', 'closet', 'fireplace', 75 | 'rug', 'counter', 'desk', 'painting', 'window', 76 | ] 77 | 78 | COCO_COLOR = [ 79 | [1.0, 1.0, 1.0], 80 | [0.6, 0.6, 0.6], 81 | [0.95, 0.95, 0.95], 82 | [0.96, 0.36, 0.26], 83 | [0.12156862745098039, 0.47058823529411764, 0.7058823529411765], 84 | [0.9400000000000001, 0.7818, 0.66], 85 | [0.9400000000000001, 0.8868, 0.66], 86 | [0.8882000000000001, 0.9400000000000001, 0.66], 87 | [0.7832000000000001, 0.9400000000000001, 0.66], 88 | [0.6782000000000001, 0.9400000000000001, 0.66], 89 | [0.66, 0.9400000000000001, 0.7468000000000001], 90 | [0.66, 0.9400000000000001, 0.8518000000000001], 91 | [0.66, 0.9232, 0.9400000000000001], 92 | [0.66, 0.8182, 0.9400000000000001], 93 | [0.66, 0.7132, 0.9400000000000001], 94 | [0.7117999999999999, 0.66, 0.9400000000000001], 95 | [0.8168, 0.66, 0.9400000000000001], 96 | [0.9218, 0.66, 0.9400000000000001], 97 | [0.9400000000000001, 0.66, 0.8531999999999998], 98 | [0.9400000000000001, 0.66, 0.748199999999999]] 99 | 100 | COCO_OBJECTS = [ 'unexplored', 'obstacle', 'free', 'waypoint', 'agent', 101 | 'chair', 'couch', 'potted plant', 'bed', 'toilet', 102 | 'tv', 'dining-table', 'oven', 'sink', 'refrigerator', 103 | 'book', 'clock', 'vase', 'cup', 'bottle', 104 | ] 105 | 106 | COLOR_HEAT = { 107 | 'R': [ 108 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 109 | 240, 220, 200, 180, 160, 140, 120, 100, 80, 60, 40, 20, 0, 110 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 111 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 112 | 0, 20, 40, 60, 80, 100, 120, 140, 160, 180, 200, 220, 240, 113 | ], 114 | 'G': [ 115 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 116 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 117 | 0, 20, 40, 60, 80, 100, 120, 140, 160, 180, 200, 220, 240, 118 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 119 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 120 | ], 121 | 'B': [ 122 | 0, 20, 40, 60, 80, 100, 120, 140, 160, 180, 200, 220, 240, 123 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 124 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 125 | 240, 220, 200, 180, 160, 140, 120, 100, 80, 60, 40, 20, 0, 126 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127 | ] 128 | } 129 | 130 | 131 | def observations_to_image(observation: Dict, info: Dict, waypoint_info, att_map) -> np.ndarray: 132 | r"""Generate image of single frame from observation and info 133 | returned from a single environment step(). 134 | 135 | Args: 136 | observation: observation returned from an environment step(). 137 | info: info returned from an environment step(). 138 | 139 | Returns: 140 | generated image of a single frame. 141 | """ 142 | egocentric_view = [] 143 | observation_size = -1 144 | if "rgb" in observation: 145 | observation_size = observation["rgb"].shape[0] 146 | rgb = observation["rgb"][:, :, :3] 147 | egocentric_view.append(rgb) 148 | 149 | # draw depth map if observation has depth info. resize to rgb size. 
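# (The depth visualization below is commented out; the final frame returned by this function concatenates,
# from left to right, the RGB view, the top-down map, the projected semantic map, its legend, and the
# attention heatmap built from att_map.)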
150 | # if "depth" in observation: 151 | # if observation_size == -1: 152 | # observation_size = observation["depth"].shape[0] 153 | # depth_map = (observation["depth"].squeeze() * 255).astype(np.uint8) 154 | # depth_map = np.stack([depth_map for _ in range(3)], axis=2) 155 | # depth_map = cv2.resize( 156 | # depth_map, 157 | # dsize=(observation_size, observation_size), 158 | # interpolation=cv2.INTER_CUBIC, 159 | # ) 160 | # egocentric_view.append(depth_map) 161 | 162 | assert len(egocentric_view) > 0, "Expected at least one visual sensor enabled." 163 | egocentric_view = np.concatenate(egocentric_view, axis=1) 164 | 165 | # draw collision 166 | if "collisions" in info and info["collisions"]["is_collision"]: 167 | egocentric_view = draw_collision(egocentric_view) 168 | 169 | frame = egocentric_view 170 | 171 | if "top_down_map" in info: 172 | top_down_map = info["top_down_map"]["map"] 173 | top_down_map = maps.colorize_topdown_map( 174 | top_down_map, info["top_down_map"]["fog_of_war_mask"] 175 | ) 176 | map_agent_pos = info["top_down_map"]["agent_map_coord"] 177 | top_down_map = maps.draw_agent( 178 | image=top_down_map, 179 | agent_center_coord=map_agent_pos, 180 | agent_rotation=info["top_down_map"]["agent_angle"], 181 | agent_radius_px=top_down_map.shape[0] // 16, 182 | ) 183 | 184 | # if 'waypoint' in waypoint_info: 185 | # waypoint = maps.to_grid( 186 | # waypoint_info['action'][0], 187 | # waypoint_info['action'][2], 188 | # maps.COORDINATE_MIN, maps.COORDINATE_MAX, (1250, 1250), 189 | # ) 190 | # crop_waypoint = ( 191 | # waypoint[0] - info['top_down_map']['waypoint']['ind_x_min'], 192 | # waypoint[1] - info['top_down_map']['waypoint']['ind_y_min'] 193 | # ) 194 | # maps.draw_path( 195 | # top_down_map=top_down_map, 196 | # path_points=[crop_waypoint, crop_waypoint], 197 | # color=[200, 0, 0], 198 | # thickness=5, 199 | # ) 200 | 201 | if top_down_map.shape[0] > top_down_map.shape[1]: 202 | top_down_map = np.rot90(top_down_map, 1) 203 | 204 | # scale top down map to align with rgb view 205 | old_h, old_w, _ = top_down_map.shape 206 | top_down_height = observation_size 207 | top_down_width = int(float(top_down_height) / old_h * old_w) 208 | # cv2 resize (dsize is width first) 209 | top_down_map = cv2.resize( 210 | top_down_map, 211 | (top_down_width, top_down_height), 212 | interpolation=cv2.INTER_CUBIC, 213 | ) 214 | frame = np.concatenate((frame, top_down_map), axis=1) 215 | 216 | # DRAW SEMANTIC MAP 217 | ego_map = observation['ego_map_vis'] 218 | channel = ego_map.shape[0] 219 | semantic_map = np.ones([*ego_map.shape[1:], 3], dtype=np.uint8) * 255 220 | 221 | if channel == 17: # AABBSem: occ map + history path + 15 objects 222 | offset = 2 223 | objects, color = OBJECTS_AABBSem, COLOR_AABBSem 224 | semantic_map[ego_map[0, :, :] == 1, :] = [75, 75, 75] 225 | elif channel == 18: # PorjSem: occ map + 17 objects 226 | offset = 1 227 | objects, color = OBJECTS_ProjSem, COLOR_ProjSem 228 | semantic_map[ego_map[0, :, :] < 0.1, :] = [75, 75, 75] 229 | elif channel == 29: # ProjSem: occ map + explored map + 27 objects 230 | offset = 2 231 | objects, color = OBJECTS_ProjSem_27, COLOR_ProjSem_27 232 | 233 | for i in range(len(objects)): 234 | semantic_map[ego_map[i + offset, :, :] > 0.5, :] = color[i] 235 | 236 | semantic_map = maps.draw_agent( 237 | image=semantic_map, 238 | agent_center_coord=[50, 50], # FIXME 用参数代替 239 | agent_rotation=info["top_down_map"]["agent_angle"], 240 | agent_radius_px=top_down_map.shape[0] // 64, 241 | ) 242 | wp_grid_x = 
-torch.tanh(waypoint_info['action'])[1] * 50 + 50 243 | wp_grid_y = torch.tanh(waypoint_info['action'])[0] * 50 + 50 244 | _limit = lambda x: min(max(int(x), 0), 100) 245 | semantic_map[_limit(wp_grid_x - 2):_limit(wp_grid_x + 2), 246 | _limit(wp_grid_y - 2):_limit(wp_grid_y + 2), :] = [200, 0, 0] # draw waypoint 247 | 248 | semantic_map = cv2.resize(semantic_map, 249 | (observation_size, observation_size), 250 | interpolation=cv2.INTER_CUBIC) 251 | frame = np.concatenate((frame, semantic_map), axis=1) 252 | 253 | legend = np.ones([observation_size, 120, 3], dtype=np.uint8) * 255 254 | grid = legend.shape[0] // 30 * 2 255 | for i in range(len(objects)): 256 | cv2.rectangle(legend, (grid, grid * i + 10), (grid * 2, grid * i + 10 + grid // 2), color[i], -1) 257 | cv2.putText(legend, objects[i], (grid * 2 + 5, grid * i + 10 + grid // 2), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1) 258 | frame = np.concatenate((frame, legend), axis=1) 259 | 260 | vis_att_map = np.ones([att_map.shape[0], 3], dtype=np.uint8) * 255 261 | for idx, value in enumerate(att_map): 262 | color_idx = ((1 - (value - att_map.min()) / (att_map.max() - att_map.min() + 1e-6)) * (len(COLOR_HEAT['R'])) - 1) 263 | color_idx = int(color_idx.item()) 264 | vis_att_map[idx, :] = [COLOR_HEAT['R'][color_idx], COLOR_HEAT['G'][color_idx], COLOR_HEAT['B'][color_idx]] 265 | vis_att_map = vis_att_map.reshape(24, 24, 3) 266 | vis_att_map = cv2.resize(vis_att_map, (observation_size, observation_size), interpolation=cv2.INTER_CUBIC) 267 | frame = np.concatenate((frame, vis_att_map), axis=1) 268 | 269 | return frame 270 | -------------------------------------------------------------------------------- /vlnce_baselines/models/ddppo_policy.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import quaternion 4 | from gym.spaces import Box, Dict, Discrete 5 | from gym.spaces.box import Box 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | 11 | from habitat_baselines.rl.ddppo.policy.resnet_policy import PointNavResNetPolicy 12 | 13 | 14 | class DdppoPolicy(nn.Module): 15 | def __init__(self, path): 16 | super().__init__() 17 | spaces = { 18 | 'pointgoal_with_gps_compass': Box( 19 | low=np.finfo(np.float32).min, 20 | high=np.finfo(np.float32).max, 21 | shape=(2,), 22 | dtype=np.float32, 23 | ), 24 | 'depth': Box( 25 | low=0, 26 | high=1, 27 | shape=(256, 256, 1), 28 | dtype=np.float32, 29 | ) 30 | } 31 | observation_space = Dict(spaces) 32 | action_space = Discrete(4) 33 | 34 | checkpoint = torch.load(path) 35 | self.hidden_size = checkpoint['model_args'].hidden_size 36 | # The model must be named self.actor_critic to make the namespaces correct for loading 37 | self.actor_critic = PointNavResNetPolicy( 38 | observation_space=observation_space, 39 | action_space=action_space, 40 | hidden_size=self.hidden_size, 41 | num_recurrent_layers=2, 42 | rnn_type='LSTM', 43 | backbone='resnet50', 44 | ) 45 | self.actor_critic.load_state_dict( 46 | { 47 | k[len("actor_critic."):]: v 48 | for k, v in checkpoint['state_dict'].items() 49 | if "actor_critic" in k 50 | } 51 | ) 52 | self.actor_critic.eval() 53 | 54 | self.hidden_state = torch.zeros(self.actor_critic.net.num_recurrent_layers, 1, checkpoint['model_args'].hidden_size) 55 | self.prev_actions = torch.zeros(1, 1, dtype=torch.long) 56 | 57 | def plan(self, depth, goal, t): 58 | batch = { 59 | 'pointgoal_with_gps_compass': goal.view(1, -1), 60 | 'depth': depth.view(1, depth.shape[0], 
depth.shape[1], depth.shape[2]), 61 | } 62 | 63 | if t ==0: 64 | not_done_masks = torch.zeros(1, 1, dtype=torch.bool, device=depth.device) 65 | else: 66 | not_done_masks = torch.ones(1, 1, dtype=torch.bool, device=depth.device) 67 | 68 | _, actions, _, self.hidden_state = self.actor_critic.act( 69 | batch, 70 | self.hidden_state.to(depth.device), 71 | self.prev_actions.to(depth.device), 72 | not_done_masks, 73 | deterministic=True, 74 | ) 75 | self.prev_actions = torch.clone(actions) 76 | 77 | return actions.item() 78 | 79 | def reset(self): 80 | self.hidden_state = torch.zeros_like(self.hidden_state) 81 | self.prev_actions = torch.zeros_like(self.prev_actions) 82 | 83 | 84 | class SemanticGrid(object): 85 | def __init__(self, batch_size, grid_dim, crop_size, cell_size, spatial_labels, object_labels, device): 86 | self.batch_size = batch_size 87 | self.grid_dim = grid_dim 88 | self.crop_size = crop_size 89 | self.cell_size = cell_size 90 | self.spatial_labels = spatial_labels 91 | self.object_labels = object_labels 92 | self.device = device 93 | 94 | self.crop_start = int((self.grid_dim[0] / 2) - (self.crop_size / 2)) 95 | self.crop_end = int((self.grid_dim[0] / 2) + (self.crop_size / 2)) 96 | 97 | # Transform each ground-projected grid into geocentric coordinates 98 | def spatialTransformer(self, grid, pose, abs_pose): 99 | geo_grid_out = torch.zeros( 100 | (grid.shape[0], grid.shape[1], self.grid_dim[0], self.grid_dim[1]), 101 | dtype=torch.float32, 102 | ) 103 | 104 | init_pose = abs_pose[0, :] 105 | init_rot_mat = torch.tensor( 106 | [ 107 | [torch.cos(init_pose[2]), -torch.sin(init_pose[2])], 108 | [torch.sin(init_pose[2]), torch.cos(init_pose[2])] 109 | ], 110 | dtype=torch.float32, 111 | ) 112 | 113 | for i in range(grid.shape[0]): 114 | pose_step = pose[i, :] 115 | 116 | rel_coord = torch.tensor([pose_step[1], pose_step[0]], dtype=torch.float32) 117 | rel_coord = rel_coord.reshape((2, 1)) 118 | rel_coord = torch.matmul(init_rot_mat, rel_coord) 119 | 120 | goal_grid_pos = torch.tensor([ 121 | round(-rel_coord[1].item() / self.cell_size + 255), 122 | round(-rel_coord[0].item() / self.cell_size + 255), 123 | ]) 124 | 125 | return geo_grid_out, goal_grid_pos 126 | 127 | # Transform a geocentric map back to egocentric view 128 | def rotate_map(self, grid, rel_pose, abs_pose): 129 | ego_grid_out = torch.zeros( 130 | (grid.shape[0], grid.shape[1], self.grid_dim[0], self.grid_dim[1]), 131 | dtype=torch.float32, 132 | ).to(grid.device) 133 | 134 | init_pose = abs_pose[0, :] 135 | init_rot_mat = torch.tensor( 136 | [ 137 | [torch.cos(init_pose[2]), -torch.sin(init_pose[2])], 138 | [torch.sin(init_pose[2]), torch.cos(init_pose[2])] 139 | ], 140 | dtype=torch.float32, 141 | ).to(grid.device) 142 | 143 | for i in range(grid.shape[0]): 144 | rel_pose_step = rel_pose[i, :] 145 | 146 | rel_coord = torch.tensor([rel_pose_step[1], rel_pose_step[0]], dtype=torch.float32).to(grid.device) 147 | rel_coord = rel_coord.reshape((2, 1)) 148 | rel_coord = torch.matmul(init_rot_mat, rel_coord) 149 | 150 | x = -2*(rel_coord[0] / self.cell_size) / (self.grid_dim[0]) 151 | z = -2*(rel_coord[1] / self.cell_size) / (self.grid_dim[1]) 152 | angle = -rel_pose_step[2] 153 | 154 | trans_theta = torch.tensor([[1, -0, x], [0, 1, z]], dtype=torch.float32).unsqueeze(0) 155 | rot_theta = torch.tensor( 156 | [ 157 | [torch.cos(angle), -torch.sin(angle), 0], 158 | [torch.sin(angle), torch.cos(angle), 0] 159 | ], 160 | dtype=torch.float32, 161 | ).unsqueeze(0) 162 | trans_theta = trans_theta.to(grid.device) 163 | rot_theta = 
rot_theta.to(grid.device) 164 | 165 | grid_step = grid[i, :, :, :].unsqueeze(0) 166 | trans_disp_grid = F.affine_grid(trans_theta, grid_step.size(), align_corners=False) 167 | rot_disp_grid = F.affine_grid(rot_theta, grid_step.size(), align_corners=False) 168 | trans_ego_grid = F.grid_sample(grid_step, trans_disp_grid.float(), align_corners=False) 169 | ego_grid = F.grid_sample(trans_ego_grid, rot_disp_grid.float(), align_corners=False) 170 | ego_grid_out[i, :, :, :] = ego_grid 171 | 172 | return ego_grid_out 173 | 174 | 175 | class utils(): 176 | def get_rel_pose(self, pos2, pos1): 177 | x1, y1, o1 = pos1 178 | if len(pos2) == 2: # if pos2 has no rotation 179 | x2, y2 = pos2 180 | dx = x2 - x1 181 | dy = y2 - y1 182 | return dx, dy 183 | else: 184 | x2, y2, o2 = pos2 185 | dx = x2 - x1 186 | dy = y2 - y1 187 | do = o2 - o1 188 | if do < -math.pi: 189 | do += 2 * math.pi 190 | if do > math.pi: 191 | do -= 2 * math.pi 192 | return dx, dy, do 193 | 194 | def discretize_coords(self, x, z, grid_dim, cell_size, translation=0): 195 | map_coords = torch.zeros((len(x), 2)) 196 | xb = torch.floor(x[:]/cell_size) + (grid_dim[0]-1)/2.0 197 | zb = torch.floor(z[:]/cell_size) + (grid_dim[1]-1)/2.0 + translation 198 | xb = xb.int() 199 | zb = zb.int() 200 | map_coords[:,0] = xb 201 | map_coords[:,1] = zb 202 | # keep bin coords within dimensions 203 | map_coords[map_coords > grid_dim[0] - 1] = grid_dim[0] - 1 204 | map_coords[map_coords < 0] = 0 205 | return map_coords.long() 206 | 207 | def get_sim_location(self, agent_state): 208 | x = -agent_state.position[2] 209 | y = -agent_state.position[0] 210 | height = agent_state.position[1] 211 | axis = quaternion.as_euler_angles(agent_state.rotation)[0] 212 | if (axis%(2*np.pi)) < 0.1 or (axis%(2*np.pi)) > 2*np.pi - 0.1: 213 | o = quaternion.as_euler_angles(agent_state.rotation)[1] 214 | else: 215 | o = 2*np.pi - quaternion.as_euler_angles(agent_state.rotation)[1] 216 | if o > np.pi: 217 | o -= 2 * np.pi 218 | pose = x, y, o 219 | return pose, height 220 | 221 | def unravel_index(self, indices, shape): 222 | """Converts flat indices into unraveled coordinates in a target shape. 223 | This is a `torch` implementation of `numpy.unravel_index`. 224 | Args: 225 | indices: A tensor of indices, (*, N). 226 | shape: The targeted shape, (D,). 227 | Returns: 228 | unravel coordinates, (*, N, D). 
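        Example (worked through the loop below): indices = torch.tensor([7]) with
        shape = (2, 4) gives 7 % 4 = 3 and 7 // 4 = 1, so the result is
        tensor([[1, 3]]), matching numpy.unravel_index(7, (2, 4)).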
229 | """ 230 | shape = torch.tensor(shape) 231 | indices = indices % shape.prod() # prevent out-of-bounds indices 232 | 233 | coord = torch.zeros(indices.size() + shape.size(), dtype=int) 234 | 235 | for i, dim in enumerate(reversed(shape)): 236 | coord[..., i] = indices % dim 237 | indices = indices // dim 238 | 239 | return coord.flip(-1) 240 | 241 | def get_coord_pose(self, sg, rel_pose, init_pose, grid_dim, cell_size, device=None): 242 | if isinstance(init_pose, list) or isinstance(init_pose, tuple): 243 | init_pose = torch.tensor(init_pose).unsqueeze(0) 244 | else: 245 | init_pose = init_pose.unsqueeze(0) 246 | 247 | zero_pose = torch.tensor([[0., 0., 0.]]) 248 | if device != None: 249 | init_pose = init_pose.to(device) 250 | zero_pose = zero_pose.to(device) 251 | 252 | zero_coords = self.discretize_coords( 253 | x=zero_pose[:, 0], 254 | z=zero_pose[:, 1], 255 | grid_dim=(grid_dim, grid_dim), 256 | cell_size=cell_size, 257 | ) 258 | 259 | pose_grid = torch.zeros((1, 1, grid_dim, grid_dim), dtype=torch.float32) 260 | pose_grid[0, 0, zero_coords[0,0], zero_coords[0,1]] = 1 261 | 262 | _, goal_grid_pos = sg.spatialTransformer(grid=pose_grid, pose=rel_pose, abs_pose=init_pose) 263 | inds = goal_grid_pos 264 | 265 | pose_coord = torch.zeros((1, 1, 2), dtype=torch.int64) 266 | pose_coord[0, 0, 0] = inds[1] 267 | pose_coord[0, 0, 1] = inds[0] 268 | return pose_coord 269 | 270 | def transform_ego_to_geo(self, ego_point, pose_coords, abs_pose_coords, abs_poses, t): 271 | rel_rot = torch.tensor(abs_poses[0][2]) - torch.tensor(abs_poses[t][2]) 272 | dist_x = (ego_point[0, 0, 0] - pose_coords[0, 0, 0]) 273 | dist_z = (ego_point[0, 0, 1] - pose_coords[0, 0, 1]) 274 | rel_rot_mat = torch.tensor( 275 | [ 276 | [torch.cos(rel_rot), -torch.sin(rel_rot)], 277 | [torch.sin(rel_rot), torch.cos(rel_rot)] 278 | ], 279 | dtype=torch.float32, 280 | ) 281 | dist_vect = torch.tensor([dist_x, dist_z], dtype=torch.float) 282 | dist_vect = dist_vect.reshape((2, 1)) 283 | rot_vect = torch.matmul(rel_rot_mat, dist_vect) 284 | 285 | abs_coords_x = abs_pose_coords[0, 0, 0] + rot_vect[0] 286 | abs_coords_z = abs_pose_coords[0, 0, 1] + rot_vect[1] 287 | abs_coords = torch.tensor([[[abs_coords_x, abs_coords_z]]]) 288 | return abs_coords 289 | -------------------------------------------------------------------------------- /habitat_extensions/measures.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | from typing import Any 4 | 5 | import numpy as np 6 | from dtw import dtw 7 | from fastdtw import fastdtw 8 | from habitat.config import Config 9 | from habitat.core.embodied_task import EmbodiedTask, Measure 10 | from habitat.core.registry import registry 11 | from habitat.core.simulator import Simulator 12 | 13 | 14 | @registry.register_measure 15 | class PathLength(Measure): 16 | r"""Path Length (PL) 17 | 18 | PL = sum(geodesic_distance(agent_prev_position, agent_position) 19 | over all agent positions. 
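    Note: update_metric() accumulates the straight-line (Euclidean) distance
    between consecutive agent positions at each step.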
20 | """ 21 | 22 | def __init__(self, sim: Simulator, config: Config, *args: Any, **kwargs: Any): 23 | self._previous_position = None 24 | self._start_end_episode_distance = None 25 | self._agent_episode_distance = None 26 | self._sim = sim 27 | self._config = config 28 | 29 | super().__init__(**kwargs) 30 | 31 | def reset_metric(self, episode, *args: Any, **kwargs: Any): 32 | self._previous_position = self._sim.get_agent_state().position.tolist() 33 | self._start_end_episode_distance = self._sim.geodesic_distance( 34 | self._previous_position, episode.goals[0].position 35 | ) 36 | self._agent_episode_distance = 0.0 37 | self._metric = None 38 | 39 | def _euclidean_distance(self, position_a, position_b): 40 | return np.linalg.norm(np.array(position_b) - np.array(position_a), ord=2) 41 | 42 | def update_metric(self, episode, action, *args: Any, **kwargs: Any): 43 | current_position = self._sim.get_agent_state().position.tolist() 44 | 45 | distance_to_target = self._sim.geodesic_distance( 46 | current_position, episode.goals[0].position 47 | ) 48 | 49 | self._agent_episode_distance += self._euclidean_distance( 50 | current_position, self._previous_position 51 | ) 52 | 53 | self._previous_position = current_position 54 | 55 | self._metric = self._agent_episode_distance 56 | 57 | @staticmethod 58 | def _get_uuid(*args: Any, **kwargs: Any): 59 | return "path_length" 60 | 61 | 62 | @registry.register_measure 63 | class OracleNavigationError(Measure): 64 | r"""Oracle Navigation Error (ONE) 65 | 66 | ONE = min(geosdesic_distance(agent_pos, goal)) 67 | over all agent_pos in agent path. 68 | 69 | This computes oracle navigation error for every update regardless of 70 | whether or not the end of the episode has been reached. 71 | """ 72 | 73 | def __init__(self, *args: Any, sim: Simulator, config: Config, **kwargs: Any): 74 | self._sim = sim 75 | self._config = config 76 | super().__init__() 77 | 78 | def reset_metric(self, *args: Any, episode, **kwargs: Any): 79 | self._metric = float("inf") 80 | 81 | def update_metric(self, *args: Any, episode, action, **kwargs: Any): 82 | current_position = self._sim.get_agent_state().position.tolist() 83 | distance_to_target = self._sim.geodesic_distance( 84 | current_position, episode.goals[0].position 85 | ) 86 | if distance_to_target < self._metric: 87 | self._metric = distance_to_target 88 | 89 | @staticmethod 90 | def _get_uuid(*args: Any, **kwargs: Any): 91 | return "oracle_navigation_error" 92 | 93 | 94 | @registry.register_measure 95 | class OracleSuccess(Measure): 96 | r"""Oracle Success Rate (OSR) 97 | 98 | OSR = I(ONE <= goal_radius), 99 | where ONE is Oracle Navigation Error. 
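    Note: the metric latches to 1 the first time the agent comes within
    SUCCESS_DISTANCE of the goal and keeps that value for the rest of the episode.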
100 | """ 101 | 102 | def __init__(self, *args: Any, sim: Simulator, config: Config, **kwargs: Any): 103 | self._sim = sim 104 | self._config = config 105 | super().__init__() 106 | 107 | def reset_metric(self, *args: Any, episode, **kwargs: Any): 108 | self._metric = 0 109 | 110 | def update_metric( 111 | self, *args: Any, episode, action, task: EmbodiedTask, **kwargs: Any 112 | ): 113 | if self._metric: 114 | # skip, already had oracle success 115 | return 116 | 117 | current_position = self._sim.get_agent_state().position.tolist() 118 | distance_to_target = self._sim.geodesic_distance( 119 | current_position, episode.goals[0].position 120 | ) 121 | 122 | if distance_to_target < self._config.SUCCESS_DISTANCE: 123 | self._metric = 1 124 | 125 | @staticmethod 126 | def _get_uuid(*args: Any, **kwargs: Any): 127 | return "oracle_success" 128 | 129 | 130 | @registry.register_measure 131 | class OracleSPL(Measure): 132 | r"""OracleSPL (Oracle Success weighted by Path Length) 133 | 134 | OracleSPL = max(SPL) over all points in the agent path 135 | """ 136 | 137 | def __init__(self, *args: Any, sim: Simulator, config: Config, **kwargs: Any): 138 | self._previous_position = None 139 | self._start_end_episode_distance = None 140 | self._agent_episode_distance = None 141 | self._ep_success = None 142 | self._sim = sim 143 | self._config = config 144 | super().__init__() 145 | 146 | def reset_metric(self, *args: Any, episode, **kwargs: Any): 147 | self._previous_position = self._sim.get_agent_state().position.tolist() 148 | self._start_end_episode_distance = episode.info["geodesic_distance"] 149 | self._agent_episode_distance = 0.0 150 | self._ep_success = 0 151 | self._metric = 0.0 152 | 153 | def _euclidean_distance(self, position_a, position_b): 154 | return np.linalg.norm(np.array(position_b) - np.array(position_a), ord=2) 155 | 156 | def update_metric( 157 | self, *args: Any, episode, action, task: EmbodiedTask, **kwargs: Any 158 | ): 159 | if self._ep_success: # shortest path already found 160 | return 161 | 162 | current_position = self._sim.get_agent_state().position.tolist() 163 | 164 | self._agent_episode_distance += self._euclidean_distance( 165 | current_position, self._previous_position 166 | ) 167 | self._previous_position = current_position 168 | 169 | distance_to_target = self._sim.geodesic_distance( 170 | current_position, episode.goals[0].position 171 | ) 172 | if distance_to_target < self._config.SUCCESS_DISTANCE: 173 | self._ep_success = 1 174 | self._metric = self._ep_success * ( 175 | self._start_end_episode_distance 176 | / max(self._start_end_episode_distance, self._agent_episode_distance) 177 | ) 178 | 179 | @staticmethod 180 | def _get_uuid(*args: Any, **kwargs: Any): 181 | return "oracle_spl" 182 | 183 | 184 | @registry.register_measure 185 | class StepsTaken(Measure): 186 | r"""Counts the number of times update_metric() is called. This is equal to 187 | the number of times that the agent takes an action. STOP counts as an 188 | action. 
189 | """ 190 | 191 | def __init__(self, *args: Any, sim: Simulator, config: Config, **kwargs: Any): 192 | self._sim = sim 193 | self._config = config 194 | self._metric = 0 195 | super().__init__() 196 | 197 | def reset_metric(self, *args: Any, episode, **kwargs: Any): 198 | self._metric = 0 199 | 200 | def update_metric( 201 | self, *args: Any, episode, action, task: EmbodiedTask, **kwargs: Any 202 | ): 203 | self._metric += 1 204 | 205 | @staticmethod 206 | def _get_uuid(*args: Any, **kwargs: Any): 207 | return "steps_taken" 208 | 209 | 210 | @registry.register_measure 211 | class NDTW(Measure): 212 | r"""NDTW (Normalized Dynamic Time Warping) 213 | 214 | ref: Effective and General Evaluation for Instruction 215 | Conditioned Navigation using Dynamic Time 216 | Warping - Magalhaes et. al 217 | https://arxiv.org/pdf/1907.05446.pdf 218 | """ 219 | 220 | def __init__(self, *args: Any, sim: Simulator, config: Config, **kwargs: Any): 221 | self._sim = sim 222 | self._config = config 223 | self.locations = [] 224 | self.gt_locations = [] 225 | self.dtw_func = fastdtw if config.FDTW else dtw 226 | 227 | gt_path = config.GT_PATH.format(split=config.SPLIT) 228 | with gzip.open(gt_path, "rt") as f: 229 | self.gt_json = json.load(f) 230 | super().__init__() 231 | 232 | @staticmethod 233 | def _get_uuid(*args: Any, **kwargs: Any): 234 | return "ndtw" 235 | 236 | def reset_metric(self, *args: Any, episode, **kwargs: Any): 237 | self.locations.clear() 238 | self.gt_locations = self.gt_json[str(episode.episode_id)]["locations"] 239 | self._metric = None 240 | 241 | def _euclidean_distance(self, position_a, position_b): 242 | return np.linalg.norm(np.array(position_b) - np.array(position_a), ord=2) 243 | 244 | def update_metric( 245 | self, *args: Any, episode, action, task: EmbodiedTask, **kwargs: Any 246 | ): 247 | current_position = self._sim.get_agent_state().position.tolist() 248 | if len(self.locations) == 0: 249 | self.locations.append(current_position) 250 | else: 251 | if current_position == self.locations[-1]: 252 | return 253 | self.locations.append(current_position) 254 | 255 | dtw_distance = self.dtw_func( 256 | self.locations, self.gt_locations, dist=self._euclidean_distance 257 | )[0] 258 | 259 | nDTW = np.exp( 260 | -dtw_distance / (len(self.gt_locations) * self._config.SUCCESS_DISTANCE) 261 | ) 262 | self._metric = nDTW 263 | 264 | 265 | @registry.register_measure 266 | class SDTW(Measure): 267 | r"""SDTW (Success Weighted be nDTW) 268 | 269 | ref: Effective and General Evaluation for Instruction 270 | Conditioned Navigation using Dynamic Time 271 | Warping - Magalhaes et. 
al 272 | https://arxiv.org/pdf/1907.05446.pdf 273 | """ 274 | 275 | def __init__(self, *args: Any, sim: Simulator, config: Config, **kwargs: Any): 276 | self._sim = sim 277 | self._config = config 278 | self.locations = [] 279 | self.gt_locations = [] 280 | self.dtw_func = fastdtw if config.FDTW else dtw 281 | 282 | gt_path = config.GT_PATH.format(split=config.SPLIT) 283 | with gzip.open(gt_path, "rt") as f: 284 | self.gt_json = json.load(f) 285 | super().__init__() 286 | 287 | @staticmethod 288 | def _get_uuid(*args: Any, **kwargs: Any): 289 | return "sdtw" 290 | 291 | def reset_metric(self, *args: Any, episode, **kwargs: Any): 292 | self.locations.clear() 293 | self.gt_locations = self.gt_json[str(episode.episode_id)]["locations"] 294 | self._metric = None 295 | 296 | def _euclidean_distance(self, position_a, position_b): 297 | return np.linalg.norm(np.array(position_b) - np.array(position_a), ord=2) 298 | 299 | def update_metric( 300 | self, *args: Any, episode, action, task: EmbodiedTask, **kwargs: Any 301 | ): 302 | current_position = self._sim.get_agent_state().position.tolist() 303 | if len(self.locations) == 0: 304 | self.locations.append(current_position) 305 | else: 306 | if current_position != self.locations[-1]: 307 | self.locations.append(current_position) 308 | 309 | dtw_distance = self.dtw_func( 310 | self.locations, self.gt_locations, dist=self._euclidean_distance 311 | )[0] 312 | 313 | nDTW = np.exp( 314 | -dtw_distance / (len(self.gt_locations) * self._config.SUCCESS_DISTANCE) 315 | ) 316 | 317 | distance_to_target = self._sim.geodesic_distance( 318 | current_position, episode.goals[0].position 319 | ) 320 | if task.is_stop_called and distance_to_target < self._config.SUCCESS_DISTANCE: 321 | ep_success = 1 322 | else: 323 | ep_success = 0 324 | 325 | self._metric = ep_success * nDTW 326 | -------------------------------------------------------------------------------- /vlnce_baselines/common/rgb_mapping.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | from einops import rearrange 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import torch_scatter 9 | 10 | 11 | class Mapping(nn.Module): 12 | def __init__(self, model_config): 13 | super().__init__() 14 | self.device = torch.device("cuda", model_config.gpu_id) 15 | self.num_proc = model_config.num_proc 16 | 17 | self.resolution = model_config.resolution 18 | self.egocentric_map_size = model_config.egocentric_map_size 19 | self.global_map_size = model_config.global_map_size 20 | self.global_map_depth = model_config.map_depth 21 | coordinate_min = - self.global_map_size * self.resolution / 2 22 | coordinate_max = self.global_map_size * self.resolution / 2 23 | 24 | self.to_grid = to_grid(self.global_map_size, coordinate_min, coordinate_max) 25 | self.rotate_tensor = RotateTensor(self.device) 26 | 27 | self.projection = Projection(self.egocentric_map_size, self.global_map_size, self.device, coordinate_min, coordinate_max) 28 | 29 | self.full_global_map = torch.zeros(self.num_proc, self.global_map_size, self.global_map_size, self.global_map_depth, device=self.device) 30 | self.agent_view = torch.zeros(self.num_proc, self.global_map_depth, self.global_map_size, self.global_map_size, device=self.device) 31 | 32 | def project_feat_to_map(self, features, full_global_map, observations, masks): 33 | bs = features.shape[0] 34 | grid_x, grid_y = self.to_grid.get_grid_coords(observations['gps']) 35 | full_global_map[:bs, 
:, :, :] = full_global_map[:bs, :, :, :] * masks.unsqueeze(1).unsqueeze(1) 36 | 37 | proj_sem = self.projection.forward(features, observations['depth'] * 10, -(observations["compass"])) 38 | projection = torch.cat([proj_sem], dim=1) 39 | 40 | agent_view = self.agent_view[:bs] * 0 41 | agent_view[:, :, 42 | self.global_map_size//2 - math.floor(self.egocentric_map_size/2): self.global_map_size//2 + math.ceil(self.egocentric_map_size/2), 43 | self.global_map_size//2 - math.floor(self.egocentric_map_size/2): self.global_map_size//2 + math.ceil(self.egocentric_map_size/2) 44 | ] = projection 45 | st_pose = torch.cat( 46 | [ 47 | -(grid_y.unsqueeze(1) - (self.global_map_size//2)) / (self.global_map_size//2), 48 | -(grid_x.unsqueeze(1) - (self.global_map_size//2)) / (self.global_map_size//2), 49 | torch.zeros_like(observations['compass']), 50 | ], dim=1 51 | ) 52 | _, trans_mat = get_grid(st_pose, agent_view.size(), self.device) 53 | translated = F.grid_sample(agent_view, trans_mat) 54 | 55 | fusion_map = torch.cat([full_global_map[:bs, :, :, :].unsqueeze(1), translated.permute(0, 2, 3, 1).unsqueeze(1)], dim=1) 56 | full_global_map[:bs, :, :, :], _ = torch.max(fusion_map, dim=1) 57 | st_pose_retrieval = torch.cat( 58 | [ 59 | (grid_y.unsqueeze(1) - (self.global_map_size//2)) / (self.global_map_size//2), 60 | (grid_x.unsqueeze(1) - (self.global_map_size//2)) / (self.global_map_size//2), 61 | torch.zeros_like(observations['compass']), 62 | ], dim=1 63 | ) 64 | _, trans_mat_retrieval = get_grid(st_pose_retrieval, agent_view.size(), self.device) 65 | translated_retrieval = F.grid_sample(full_global_map[:bs, :, :, :].permute(0, 3, 1, 2).contiguous(), trans_mat_retrieval) 66 | translated_retrieval = translated_retrieval[:, :, 67 | self.global_map_size//2 - math.floor(self.egocentric_map_size/2): self.global_map_size//2 + math.ceil(self.egocentric_map_size/2), 68 | self.global_map_size//2 - math.floor(self.egocentric_map_size/2): self.global_map_size//2 + math.ceil(self.egocentric_map_size/2) 69 | ] 70 | final_retrieval = self.rotate_tensor.forward(translated_retrieval, observations["compass"]) 71 | 72 | return final_retrieval, full_global_map 73 | 74 | 75 | class RGBMapping(Mapping): 76 | def __init__(self, model_config): 77 | super().__init__(model_config) 78 | 79 | def forward(self, rgb_features, observations, masks): 80 | if 'rgb_ego_map' not in observations: 81 | bs, c, h, w = rgb_features.shape 82 | rgb_features = rgb_features.permute(0, 2, 3, 1).reshape(bs, -1, c) # [bs, hxw, c] 83 | rgb_features = torch.nn.functional.adaptive_max_pool1d(rgb_features, self.global_map_depth) # [bs, hxw, self.global_map_depth] 84 | rgb_features = rgb_features.reshape(bs, h, w, -1).permute(0, 3, 1, 2) # [bs, self.global_map_depth, h, w] 85 | final_retrieval, self.full_global_map = self.project_feat_to_map(rgb_features, self.full_global_map, observations, masks) 86 | observations['rgb_ego_map'] = final_retrieval 87 | else: 88 | final_retrieval = observations['rgb_ego_map'] 89 | 90 | return final_retrieval 91 | 92 | 93 | class to_grid(): 94 | def __init__(self, global_map_size, coordinate_min, coordinate_max): 95 | self.global_map_size = global_map_size 96 | self.coordinate_min = coordinate_min 97 | self.coordinate_max = coordinate_max 98 | self.grid_size = (coordinate_max - coordinate_min) / global_map_size 99 | 100 | def get_grid_coords(self, positions): 101 | grid_x = ((self.coordinate_max - positions[:, 0]) / self.grid_size).round() 102 | grid_y = ((positions[:, 1] - self.coordinate_min) / self.grid_size).round() 
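        # Illustrative note: with the symmetric range used by Mapping
        # (coordinate_min = -coordinate_max), grid_size equals the map resolution,
        # so a GPS reading of (0, 0) lands on the centre cell
        # (global_map_size / 2, global_map_size / 2); grid_x grows as the agent
        # moves towards coordinate_min, grid_y towards coordinate_max.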
103 | return grid_x, grid_y 104 | 105 | 106 | def get_grid(pose, grid_size, device): 107 | """ 108 | Input: 109 | `pose` FloatTensor(bs, 3) 110 | `grid_size` 4-tuple (bs, _, grid_h, grid_w) 111 | `device` torch.device (cpu or gpu) 112 | Output: 113 | `rot_grid` FloatTensor(bs, grid_h, grid_w, 2) 114 | `trans_grid` FloatTensor(bs, grid_h, grid_w, 2) 115 | """ 116 | pose = pose.float() 117 | x = pose[:, 0] 118 | y = pose[:, 1] 119 | t = pose[:, 2] 120 | 121 | cos_t = t.cos() 122 | sin_t = t.sin() 123 | 124 | theta11 = torch.stack([cos_t, -sin_t, 125 | torch.zeros(cos_t.shape).float().to(device)], 1) 126 | theta12 = torch.stack([sin_t, cos_t, 127 | torch.zeros(cos_t.shape).float().to(device)], 1) 128 | theta1 = torch.stack([theta11, theta12], 1) 129 | 130 | theta21 = torch.stack([torch.ones(x.shape).to(device), 131 | -torch.zeros(x.shape).to(device), x], 1) 132 | theta22 = torch.stack([torch.zeros(x.shape).to(device), 133 | torch.ones(x.shape).to(device), y], 1) 134 | theta2 = torch.stack([theta21, theta22], 1) 135 | 136 | rot_grid = F.affine_grid(theta1, torch.Size(grid_size)) 137 | trans_grid = F.affine_grid(theta2, torch.Size(grid_size)) 138 | 139 | return rot_grid, trans_grid 140 | 141 | 142 | class ComputeSpatialLocs(): 143 | def __init__(self, egocentric_map_size, global_map_size, device, coordinate_min, coordinate_max): 144 | self.device = device 145 | self.egocentric_map_size = egocentric_map_size 146 | self.local_scale = float(coordinate_max - coordinate_min) / float(global_map_size) 147 | 148 | def get_camera_matrix(self, imh, imw, fov): 149 | self.cx, self.cy = imh / 2., imw / 2. 150 | self.fx = (imh / 2.) / np.tan(np.deg2rad(fov / 2.)) 151 | self.fy = (imw / 2.) / np.tan(np.deg2rad(fov / 2.)) 152 | 153 | def forward(self, depth): 154 | depth = depth.permute(0, 3, 1, 2) 155 | _, _, imh, imw = depth.shape # batchsize, 1, imh, imw 156 | 157 | self.get_camera_matrix(imh, imw, 90) 158 | 159 | x = rearrange(torch.arange(0, imw), 'w -> () () () w').to(self.device) 160 | y = rearrange(torch.arange(imh, 0, step=-1), 'h -> () () h ()').to(self.device) 161 | xx = (x - self.cx) / self.fx 162 | yy = (y - self.cy) / self.fy 163 | 164 | # 3D real-world coordinates (in meters) 165 | Z = depth 166 | X = xx * Z 167 | Y = yy * Z 168 | 169 | # Valid inputs 170 | valid_inputs = (depth != 0) & ((Y > -1.5) & (Y < 0.1)) 171 | 172 | # X ground projection and Y ground projection 173 | x_gp = ((X / self.local_scale) + (self.egocentric_map_size - 1) / 2).round().long() # (bs, imh, imw, 1) 174 | y_gp = (-(Z / self.local_scale) + (self.egocentric_map_size - 1) / 2).round().long() # (bs, imh, imw, 1) 175 | 176 | return torch.cat([x_gp, y_gp], dim=1), valid_inputs 177 | 178 | 179 | class ProjectToGroundPlane(): 180 | def __init__(self, egocentric_map_size, device): 181 | self.egocentric_map_size = egocentric_map_size 182 | self.device = device 183 | 184 | def forward(self, conv, spatial_locs, valid_inputs): 185 | outh, outw = (self.egocentric_map_size, self.egocentric_map_size) 186 | bs, f, HbyK, WbyK = conv.shape 187 | eps = -1e16 188 | depth_h = spatial_locs.shape[-1] 189 | K = depth_h / WbyK # Hardcoded value of K 190 | 191 | # Sub-sample spatial_locs, valid_inputs according to img_feats resolution. 
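        # Note: K is the stride between the depth image and the feature map. With
        # hypothetical sizes, a 256-pixel-wide depth map and a 32x32 conv feature
        # map give K = 8, so every 8th pixel's projected location is kept below.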
192 | idxes_ss = ((torch.arange(0, HbyK, 1) * K).long().to(self.device), \ 193 | (torch.arange(0, WbyK, 1) * K).long().to(self.device)) 194 | 195 | spatial_locs_ss = spatial_locs[:, :, idxes_ss[0][:, None], idxes_ss[1]] # (bs, 2, HbyK, WbyK) 196 | valid_inputs_ss = valid_inputs[:, :, idxes_ss[0][:, None], idxes_ss[1]] # (bs, 1, HbyK, WbyK) 197 | valid_inputs_ss = valid_inputs_ss.squeeze(1) # (bs, HbyK, WbyK) 198 | invalid_inputs_ss = ~valid_inputs_ss 199 | 200 | # Filter out invalid spatial locations 201 | invalid_spatial_locs = (spatial_locs_ss[:, 1] >= outh) | (spatial_locs_ss[:, 1] < 0) | \ 202 | (spatial_locs_ss[:, 0] >= outw) | (spatial_locs_ss[:, 0] < 0) # (bs, H, W) 203 | 204 | invalid_writes = invalid_spatial_locs | invalid_inputs_ss 205 | 206 | # Set the idxes for all invalid locations to (0, 0) 207 | spatial_locs_ss[:, 0][invalid_writes] = 0 208 | spatial_locs_ss[:, 1][invalid_writes] = 0 209 | 210 | # Weird hack to account for max-pooling negative feature values 211 | invalid_writes_f = rearrange(invalid_writes, 'b h w -> b () h w').float() 212 | conv_masked = conv * (1 - invalid_writes_f) + eps * invalid_writes_f 213 | conv_masked = rearrange(conv_masked, 'b e h w -> b e (h w)') 214 | 215 | # Linearize ground-plane indices (linear idx = y * W + x) 216 | linear_locs_ss = spatial_locs_ss[:, 1] * outw + spatial_locs_ss[:, 0] # (bs, H, W) 217 | linear_locs_ss = rearrange(linear_locs_ss, 'b h w -> b () (h w)') 218 | linear_locs_ss = linear_locs_ss.expand(-1, f, -1) # .contiguous() 219 | 220 | proj_feats, _ = torch_scatter.scatter_max( 221 | conv_masked, 222 | linear_locs_ss, 223 | dim=2, 224 | dim_size=outh * outw, 225 | ) 226 | proj_feats = rearrange(proj_feats, 'b e (h w) -> b e h w', h=outh) 227 | 228 | # Replace invalid features with zeros 229 | eps_mask = (proj_feats == eps).float() 230 | proj_feats = proj_feats * (1 - eps_mask) + eps_mask * (proj_feats - eps) 231 | 232 | return proj_feats 233 | 234 | 235 | class RotateTensor: 236 | def __init__(self, device): 237 | self.device = device 238 | 239 | def forward(self, x_gp, heading): 240 | sin_t = torch.sin(heading.squeeze(1)) 241 | cos_t = torch.cos(heading.squeeze(1)) 242 | A = torch.zeros(x_gp.size(0), 2, 3).to(self.device) 243 | A[:, 0, 0] = cos_t 244 | A[:, 0, 1] = sin_t 245 | A[:, 1, 0] = -sin_t 246 | A[:, 1, 1] = cos_t 247 | 248 | grid = F.affine_grid(A, x_gp.size()) 249 | rotated_x_gp = F.grid_sample(x_gp, grid) 250 | return rotated_x_gp 251 | 252 | 253 | class Projection: 254 | def __init__(self, egocentric_map_size, global_map_size, device, coordinate_min, coordinate_max): 255 | self.egocentric_map_size = egocentric_map_size 256 | self.global_map_size = global_map_size 257 | self.compute_spatial_locs = ComputeSpatialLocs( 258 | egocentric_map_size, global_map_size, 259 | device, coordinate_min, coordinate_max 260 | ) 261 | self.project_to_ground_plane = ProjectToGroundPlane(egocentric_map_size, device) 262 | self.rotate_tensor = RotateTensor(device) 263 | 264 | def forward(self, conv, depth, heading): 265 | spatial_locs, valid_inputs = self.compute_spatial_locs.forward(depth) 266 | x_gp = self.project_to_ground_plane.forward(conv, spatial_locs, valid_inputs) 267 | rotated_x_gp = self.rotate_tensor.forward(x_gp, heading) 268 | return rotated_x_gp 269 | -------------------------------------------------------------------------------- /habitat_extensions/sensors.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import gzip 4 | import json 5 | import numpy 
as np 6 | from gym import spaces 7 | from typing import Any 8 | 9 | import torch 10 | import torch.nn.functional as F 11 | 12 | from habitat.config import Config 13 | from habitat.core.registry import registry 14 | from habitat.core.simulator import Sensor, SensorTypes, Simulator 15 | from habitat.sims.habitat_simulator.actions import HabitatSimActions 16 | from habitat.tasks.nav.shortest_path_follower import ShortestPathFollower 17 | from habitat.tasks.utils import cartesian_to_polar 18 | from habitat.utils.geometry_utils import quaternion_rotate_vector 19 | from habitat.utils.visualizations import maps 20 | from habitat_extensions.shortest_path_follower import ShortestPathFollowerCompat 21 | 22 | from vlnce_baselines.common.rgb_mapping import get_grid 23 | from vlnce_baselines.common.action_maker import TransfomationRealworldAgent 24 | 25 | 26 | @registry.register_sensor 27 | class VLNOracleActionSensor(Sensor): 28 | """Sensor for observing the optimal action to take. The assumption this 29 | sensor currently makes is that the shortest path to the goal is the 30 | optimal path. 31 | Args: 32 | sim: reference to the simulator for calculating task observations. 33 | config: config for the sensor. 34 | """ 35 | def __init__(self, sim: Simulator, config: Config, *args: Any, **kwargs: Any): 36 | super().__init__(config=config) 37 | 38 | # all goals can be navigated to within 0.5m. 39 | goal_radius = getattr(config, "GOAL_RADIUS", 0.5) 40 | if config.USE_ORIGINAL_FOLLOWER: 41 | self.follower = ShortestPathFollowerCompat( 42 | sim, goal_radius, return_one_hot=False 43 | ) 44 | self.follower.mode = "geodesic_path" 45 | else: 46 | self.follower = ShortestPathFollower(sim, goal_radius, return_one_hot=False) 47 | 48 | def _get_uuid(self, *args: Any, **kwargs: Any): 49 | return "vln_oracle_action_sensor" 50 | 51 | def _get_sensor_type(self, *args: Any, **kwargs: Any): 52 | return SensorTypes.TACTILE 53 | 54 | def _get_observation_space(self, *args: Any, **kwargs: Any): 55 | return spaces.Box(low=0.0, high=100, shape=(1,), dtype=np.float) 56 | 57 | def get_observation(self, observations, *args: Any, episode, **kwargs: Any): 58 | best_action = self.follower.get_next_action(episode.goals[0].position) 59 | return np.array( 60 | [best_action if best_action is not None else HabitatSimActions.STOP] 61 | ) 62 | 63 | 64 | @registry.register_sensor 65 | class VLNOracleProgressSensor(Sensor): 66 | """Sensor for observing how much progress has been made towards the goal. 67 | Args: 68 | sim: reference to the simulator for calculating task observations. 69 | config: config for the sensor. 70 | """ 71 | def __init__(self, sim: Simulator, config: Config, *args: Any, **kwargs: Any): 72 | self._sim = sim 73 | super().__init__(config=config) 74 | 75 | def _get_uuid(self, *args: Any, **kwargs: Any): 76 | return "progress" 77 | 78 | def _get_sensor_type(self, *args: Any, **kwargs: Any): 79 | # TODO: what is the correct sensor type? 
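        # Note: MEASUREMENT seems a reasonable fit, since progress is a scalar
        # derived from geodesic distances rather than a physical sensor reading.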
80 | return SensorTypes.MEASUREMENT 81 | 82 | def _get_observation_space(self, *args: Any, **kwargs: Any): 83 | return spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float) 84 | 85 | def get_observation(self, observations, *args: Any, episode, **kwargs: Any): 86 | current_position = self._sim.get_agent_state().position.tolist() 87 | 88 | distance_to_target = self._sim.geodesic_distance( 89 | current_position, episode.goals[0].position 90 | ) 91 | 92 | distance_from_start = episode.info["geodesic_distance"] 93 | progress = (distance_from_start - distance_to_target) / distance_from_start 94 | return np.array([progress]) 95 | 96 | 97 | @registry.register_sensor 98 | class VLNOracleWaypointSensor(Sensor): 99 | """Sensor for waypoint towards the goal. 100 | Args: 101 | sim: reference to the simulator for calculating task observations. 102 | config: config for the sensor. 103 | """ 104 | def __init__(self, sim: Simulator, config: Config, *args: Any, **kwargs: Any): 105 | super().__init__(config=config) 106 | self._coordinate_min = maps.COORDINATE_MIN 107 | self._coordinate_max = maps.COORDINATE_MAX 108 | self._map_resolution = (config.MAP_RESOLUTION, config.MAP_RESOLUTION) 109 | self._map_size = config.MAP_SIZE 110 | 111 | goal_radius = getattr(config, "GOAL_RADIUS", 0.5) 112 | if config.USE_ORIGINAL_FOLLOWER: 113 | self.follower = ShortestPathFollowerCompat( 114 | sim, goal_radius, return_one_hot=False 115 | ) 116 | self.follower.mode = "geodesic_path" 117 | else: 118 | self.follower = ShortestPathFollower(sim, goal_radius, return_one_hot=False) 119 | self._sim = sim 120 | 121 | self.use_law = config.LAW.USE 122 | gt_path = config.LAW.GT_PATH.format(split=config.LAW.SPLIT) 123 | with gzip.open(gt_path, "rt") as f: 124 | self.gt_waypoint_locations = json.load(f) 125 | self.is_sparse = config.LAW.IS_SPARSE 126 | self.num_inter_waypoints = config.LAW.NUM_WAYPOINTS 127 | 128 | def _get_uuid(self, *args: Any, **kwargs: Any): 129 | return "waypoint" 130 | 131 | def _get_sensor_type(self, *args: Any, **kwargs: Any): 132 | return SensorTypes.TACTILE 133 | 134 | def _get_observation_space(self, *args: Any, **kwargs: Any): 135 | return spaces.Box(low=0.0, high=100, shape=(4,), dtype=np.float) 136 | 137 | def get_observation(self, observations, *args: Any, episode, **kwargs: Any): 138 | agent_position = self._sim.get_agent_state().position 139 | if self.use_law: 140 | goal_pos = self.get_goal(episode) 141 | else: 142 | goal_pos = episode.goals[0].position 143 | points = self._sim.get_straight_shortest_path_points(agent_position, goal_pos) 144 | if len(points) < 2: 145 | return None 146 | 147 | waypoint = self.get_waypoint(points) 148 | 149 | self.trans_tool = TransfomationRealworldAgent(self._sim.get_agent_state()) 150 | wp_a = self.trans_tool.realworld2agent(waypoint) 151 | 152 | resolution = (self._coordinate_max - self._coordinate_min) / self._map_resolution[0] 153 | wp_ego_x = (wp_a[0] / resolution).astype(np.int) 154 | wp_ego_y = (-wp_a[2] / resolution).astype(np.int) 155 | wp_norm_x = wp_ego_x / (self._map_size // 2) 156 | wp_norm_y = wp_ego_y / (self._map_size // 2) 157 | 158 | return np.array([wp_norm_x, wp_norm_y]) 159 | 160 | def get_goal(self, episode): 161 | if self.num_inter_waypoints > 0: 162 | locs = self.gt_waypoint_locations[str(episode.episode_id)]["locations"] 163 | ep_path_length = self._sim.geodesic_distance(locs[0], episode.goals[0].position) 164 | 165 | way_locations = [locs[0]] 166 | count = 0 167 | dist = ep_path_length / (self.num_inter_waypoints+1) 168 | for way in 
locs[:-1]: 169 | d = self._sim.geodesic_distance(locs[0], way) 170 | if d >= dist: 171 | way_locations.append(way) 172 | if count >= (self.num_inter_waypoints-1): 173 | break 174 | count += 1 175 | dist += ep_path_length / (self.num_inter_waypoints+1) 176 | 177 | way_locations.append(episode.goals[0].position) 178 | else: 179 | if self.is_sparse: 180 | # Sparse supervision of waypoints 181 | way_locations = episode.reference_path 182 | else: 183 | # Dense supervision of waypoints 184 | way_locations = self.gt_waypoint_locations[str(episode.episode_id)]["locations"] 185 | 186 | current_position = self._sim.get_agent_state().position.tolist() 187 | nearest_dist = float("inf") 188 | nearest_way = way_locations[-1] 189 | 190 | for ind, way in reversed(list(enumerate(way_locations))): 191 | distance_to_way = self._sim.geodesic_distance(current_position, way) 192 | 193 | if distance_to_way >= 3.0 and distance_to_way < nearest_dist: 194 | dist_way_to_goal = self._sim.geodesic_distance(way, episode.goals[0].position) 195 | dist_agent_to_goal = self._sim.geodesic_distance(current_position, episode.goals[0].position) 196 | 197 | if dist_agent_to_goal > dist_way_to_goal: 198 | nearest_dist = distance_to_way 199 | nearest_way = way 200 | 201 | return nearest_way 202 | 203 | def get_waypoint(self, points): 204 | path_line = np.zeros(self._map_resolution, dtype=np.uint8) 205 | for index in range(len(points) - 1): 206 | x_t_1, y_t_1 = maps.to_grid( 207 | points[index][0], points[index][2], 208 | self._coordinate_min, self._coordinate_max, self._map_resolution, 209 | ) 210 | x_t_2, y_t_2 = maps.to_grid( 211 | points[index + 1][0], points[index + 1][2], 212 | self._coordinate_min, self._coordinate_max, self._map_resolution, 213 | ) 214 | cv2.line(path_line, (y_t_1, x_t_1), (y_t_2, x_t_2), 255, 1) 215 | 216 | agent_position = self._sim.get_agent_state().position 217 | a_x, a_y = maps.to_grid( 218 | agent_position[0], 219 | agent_position[2], 220 | self._coordinate_min, 221 | self._coordinate_max, 222 | self._map_resolution, 223 | ) 224 | fog_line = np.zeros(self._map_resolution, dtype=np.uint8) 225 | cv2.circle(fog_line, (a_y, a_x), 20, 255, 2) 226 | 227 | searched = [] 228 | def search(point): 229 | searched.append([point[0], point[1]]) 230 | if fog_line[point[0], point[1]]: 231 | return point 232 | for p in [(-1,0), (0,-1), (1,0), (0,1), (-1,-1), (1,-1), (1,1), (-1,1)]: 233 | if path_line[point[0]+p[0], point[1]+p[1]] and [point[0]+p[0], point[1]+p[1]] not in searched: 234 | s_point = search([point[0]+p[0], point[1]+p[1]]) 235 | if s_point is None: 236 | continue 237 | return s_point 238 | 239 | cross_line = np.where((path_line & fog_line) != 0) 240 | if cross_line[0].shape[0] > 0: 241 | frontier = search([a_x, a_y]) 242 | if frontier is None: 243 | frontier = [cross_line[0][0], cross_line[1][0]] 244 | frontier = maps.from_grid( 245 | frontier[0], frontier[1], 246 | self._coordinate_min, 247 | self._coordinate_max, 248 | self._map_resolution, 249 | ) 250 | else: 251 | frontier = [points[-1][0], points[-1][2]] 252 | 253 | waypoint = np.array([frontier[0], points[0][1], frontier[1]]) 254 | return waypoint 255 | 256 | 257 | @registry.register_sensor 258 | class VLNOraclePathSensor(Sensor): 259 | def __init__(self, sim: Simulator, config: Config, *args: Any, **kwargs: Any): 260 | super().__init__(config=config) 261 | self._sim = sim 262 | self._coordinate_min = maps.COORDINATE_MIN 263 | self._coordinate_max = maps.COORDINATE_MAX 264 | self._map_resolution = (config.MAP_RESOLUTION, config.MAP_RESOLUTION) 265 | 
self._map_size = config.MAP_SIZE 266 | 267 | def _get_uuid(self, *args: Any, **kwargs: Any): 268 | return "gt_path" 269 | 270 | def _get_sensor_type(self, *args: Any, **kwargs: Any): 271 | return SensorTypes.TACTILE 272 | 273 | def _get_observation_space(self, *args: Any, **kwargs: Any): 274 | return spaces.Box(low=0.0, high=1.0, shape=(100, 100), dtype=np.float) 275 | 276 | def get_observation(self, observations, *args: Any, episode, **kwargs: Any): 277 | agent_position = self._sim.get_agent_state().position 278 | goal_pos = episode.goals[0].position 279 | points = self._sim.get_straight_shortest_path_points(agent_position, goal_pos) 280 | if len(points) < 2: 281 | return None 282 | gt_path = self.get_gt_path(points) 283 | return gt_path 284 | 285 | def get_gt_path(self, points): 286 | path_line = np.zeros([self._map_size, self._map_size]) 287 | self.trans_tool = TransfomationRealworldAgent(self._sim.get_agent_state()) 288 | 289 | for index in range(len(points) - 1): 290 | resolution = (self._coordinate_max - self._coordinate_min) / self._map_resolution[0] 291 | 292 | a1 = self.trans_tool.realworld2agent(points[index]) 293 | x_t_1 = (a1[2] / resolution + self._map_size // 2).astype(np.int) 294 | y_t_1 = (a1[0] / resolution + self._map_size // 2).astype(np.int) 295 | 296 | a2 = self.trans_tool.realworld2agent(points[index + 1]) 297 | x_t_2 = (a2[2] / resolution + self._map_size // 2).astype(np.int) 298 | y_t_2 = (a2[0] / resolution + self._map_size // 2).astype(np.int) 299 | 300 | cv2.line(path_line, (y_t_1, x_t_1), (y_t_2, x_t_2), 255, self.config.LINE_WIDTH) 301 | 302 | waypoint_dis = path_line / 255 303 | line_point_x, line_point_y = np.where(waypoint_dis != 0) 304 | line_point = np.concatenate([line_point_x[np.newaxis, :], line_point_y[np.newaxis, :]], axis=0) 305 | line_point = np.repeat(line_point[np.newaxis, :, :], 100, axis=0) 306 | line_point = np.repeat(line_point[np.newaxis, :, :, :], 100, axis=0) 307 | 308 | x, y = np.linspace(0, 99, 100), np.linspace(0, 99, 100) 309 | xv, yv = np.meshgrid(x, y) 310 | all_point = np.concatenate([yv[:, :, np.newaxis], xv[:, :, np.newaxis]], axis=2) 311 | all_point = np.repeat(all_point[:, :, :, np.newaxis], line_point.shape[-1], axis=3) 312 | 313 | dis_map = np.min(np.sqrt(np.sum((all_point - line_point)**2, axis=2)), axis=2) 314 | 315 | return dis_map 316 | 317 | 318 | @registry.register_sensor 319 | class SemanticFilterSensor(Sensor): 320 | def __init__(self, sim: Simulator, config: Config, *args: Any, **kwargs: Any): 321 | super().__init__(config=config) 322 | self.sim = sim 323 | self.prev_episode_id = None 324 | self.label_to_27 = np.array([ 325 | 0, 15, 17, 1, 2, 3, 18, 19, 4, 15, 5, 6, 16, 20, 7, 8, 17, 326 | 17, 9, 21, 22, 16, 10, 11, 15, 12, 13, 23, 16, 16, 16, 16, 327 | 16, 24, 25, 16, 16, 14, 26, 16, 16, 328 | ]) 329 | 330 | def _get_uuid(self, *args: Any, **kwargs: Any): 331 | return "semantic_filter" 332 | 333 | def _get_sensor_type(self, *args: Any, **kwargs: Any): 334 | return SensorTypes.TACTILE 335 | 336 | def _get_observation_space(self, *args: Any, **kwargs: Any): 337 | return spaces.Box( 338 | low=np.iinfo(np.uint32).min, 339 | high=np.iinfo(np.uint32).max, 340 | shape=(self.config.HEIGHT, self.config.WIDTH, self.config.CATEGORY), 341 | dtype=np.float, 342 | ) 343 | 344 | def get_observation(self, observations, episode, *args: Any, **kwargs: Any): 345 | semantic = observations['semantic'] 346 | 347 | if self.prev_episode_id != episode.episode_id: 348 | scene = self.sim.semantic_annotations() 349 | instance_id_to_label_id = 
{int(obj.id.split("_")[-1]): obj.category.index() for obj in scene.objects} 350 | self.mapping = np.array([instance_id_to_label_id[i] for i in range(len(instance_id_to_label_id))]) 351 | self.prev_episode_id = episode.episode_id 352 | 353 | semantic = np.take(self.mapping, semantic) 354 | semantic[semantic == -1] = 0 355 | semantic = np.take(self.label_to_27, semantic) 356 | h, w = semantic.shape 357 | semantic_filter = np.eye(27, dtype=np.float32)[semantic.reshape(-1)].reshape(h, w, 27) 358 | 359 | return semantic_filter 360 | 361 | 362 | @registry.register_sensor 363 | class GtSemanticMapSensor(Sensor): 364 | r"""Sensor for generating semantic map grounth truth 365 | """ 366 | def __init__(self, sim: Simulator, config: Config, *args: Any, **kwargs: Any): 367 | self._sim = sim 368 | self.gt_path = 'data/map_data/semantic/{}'.format(config.SPLIT) 369 | self.half_size = config.MAP_SIZE // 2 370 | self.prev_episode_id = None 371 | super().__init__() 372 | 373 | @staticmethod 374 | def _get_uuid(*args: Any, **kwargs: Any): 375 | return "gt_semantic_map" 376 | 377 | def _get_sensor_type(self, *args: Any, **kwargs: Any): 378 | return SensorTypes.TACTILE 379 | 380 | def _get_observation_space(self, *args: Any, **kwargs: Any): 381 | return spaces.Box(low=0.0, high=27.0, shape=(100, 100), dtype=np.long) 382 | 383 | def get_observation(self, observations, episode, *args: Any, **kwargs: Any): 384 | if self.prev_episode_id != episode.episode_id: 385 | self.init_agent_state = self._sim.get_agent_state() 386 | 387 | self.global_gt_semmap = np.load(os.path.join(self.gt_path, 'ep_'+str(episode.episode_id)+'.npy')) 388 | self.global_gt_semmap = torch.from_numpy(self.global_gt_semmap).unsqueeze(0).unsqueeze(0).float() 389 | 390 | rever_pose = torch.FloatTensor([0, 0, self._sim.record_heading]).unsqueeze(0) 391 | rot_mat, _ = get_grid(rever_pose, self.global_gt_semmap.size(), 'cpu') 392 | self.global_gt_semmap = F.grid_sample(self.global_gt_semmap, rot_mat, mode='nearest') 393 | 394 | agent_state = self._sim.get_agent_state() 395 | grid_y = (agent_state.position[0] - self.init_agent_state.position[0]) / 0.12 + 240 396 | grid_x = (agent_state.position[2] - self.init_agent_state.position[2]) / 0.12 + 240 397 | st_pose = torch.FloatTensor([ 398 | (grid_y - (480//2)) / (480//2), 399 | (grid_x - (480//2)) / (480//2), 400 | - self._sim.record_heading, 401 | ]).unsqueeze(0) 402 | 403 | rot_mat, tra_mat = get_grid(st_pose, self.global_gt_semmap.size(), 'cpu') 404 | transed_map = F.grid_sample(self.global_gt_semmap, tra_mat, mode='nearest') 405 | rotated_map = F.grid_sample(transed_map, rot_mat, mode='nearest') 406 | rotated_map = F.pad(rotated_map, (self.half_size, self.half_size, self.half_size, self.half_size), 'constant', 0) 407 | 408 | self.prev_episode_id = episode.episode_id 409 | 410 | return rotated_map.squeeze()[289-self.half_size: 289+self.half_size, 289-self.half_size: 289+self.half_size].long() 411 | 412 | @registry.register_sensor 413 | class HeadingSensor(Sensor): 414 | r"""Sensor for observing the agent's heading in the global coordinate 415 | frame. 416 | Args: 417 | sim: reference to the simulator for calculating task observations. 418 | config: config for the sensor. 
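    Note: the heading is returned in radians in [-pi, pi], and the latest value is
    also cached on the simulator as record_heading, which GtSemanticMapSensor uses
    to re-orient the ground-truth semantic map.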
419 | """ 420 | 421 | def __init__( 422 | self, sim: Simulator, config: Config, *args: Any, **kwargs: Any 423 | ): 424 | self._sim = sim 425 | super().__init__(config=config) 426 | 427 | def _get_uuid(self, *args: Any, **kwargs: Any): 428 | return "heading" 429 | 430 | def _get_sensor_type(self, *args: Any, **kwargs: Any): 431 | return SensorTypes.HEADING 432 | 433 | def _get_observation_space(self, *args: Any, **kwargs: Any): 434 | return spaces.Box(low=-np.pi, high=np.pi, shape=(1,), dtype=np.float) 435 | 436 | def _quat_to_xy_heading(self, quat): 437 | direction_vector = np.array([0, 0, -1]) 438 | heading_vector = quaternion_rotate_vector(quat, direction_vector) 439 | phi = cartesian_to_polar(-heading_vector[2], heading_vector[0])[1] 440 | return np.array([phi], dtype=np.float32) 441 | 442 | def get_observation( 443 | self, observations, episode, *args: Any, **kwargs: Any 444 | ): 445 | agent_state = self._sim.get_agent_state() 446 | rotation_world_agent = agent_state.rotation 447 | 448 | heading = self._quat_to_xy_heading(rotation_world_agent.inverse()) 449 | self._sim.record_heading = heading 450 | 451 | return heading 452 | -------------------------------------------------------------------------------- /vlnce_baselines/common_trainer.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import json 3 | import os 4 | import time 5 | import tqdm 6 | import math 7 | import datetime 8 | import numpy as np 9 | from collections import defaultdict 10 | from typing import Dict, Optional 11 | 12 | import torch 13 | import torch.distributed as dist 14 | from torch.nn.parallel import DistributedDataParallel as DDP 15 | 16 | from habitat import Config, logger 17 | from habitat.utils.visualizations.utils import append_text_to_image 18 | from habitat_baselines.common.base_trainer import BaseRLTrainer 19 | from habitat_baselines.common.environments import get_env_class 20 | from habitat_baselines.common.tensorboard_utils import TensorboardWriter 21 | from habitat_baselines.common.utils import batch_obs, generate_video, poll_checkpoint_folder 22 | from habitat_extensions.utils import observations_to_image 23 | 24 | from vlnce_baselines.models.policy import BasePolicy 25 | from vlnce_baselines.common.env_utils import construct_envs_auto_reset_false 26 | from vlnce_baselines.common.utils import transform_obs 27 | 28 | 29 | class CommonTrainer(BaseRLTrainer): 30 | def __init__(self, config=None): 31 | super().__init__(config) 32 | self.actor_critic = None 33 | self.envs = None 34 | 35 | self.local_rank = int(os.environ["LOCAL_RANK"]) 36 | self.world_size = int(os.environ["WORLD_SIZE"]) 37 | torch.cuda.set_device(self.local_rank) 38 | dist.init_process_group(backend="nccl", timeout=datetime.timedelta(seconds=18000)) 39 | self.device = ( 40 | torch.device("cuda", self.local_rank) 41 | if torch.cuda.is_available() 42 | else torch.device("cpu") 43 | ) 44 | print(f"[init] == local rank: {self.local_rank}") 45 | 46 | def _setup_actor_critic( 47 | self, config: Config, load_from_ckpt: bool, ckpt_path: str 48 | ) -> None: 49 | """Sets up actor critic and agent. 
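        Note: the policy is wrapped in DistributedDataParallel on the local rank;
        when load_from_ckpt is set, checkpoint keys are re-prefixed with 'module.'
        and loaded non-strictly.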
50 | Args: 51 | config: config 52 | Returns: 53 | None 54 | """ 55 | self.actor_critic = BasePolicy( 56 | observation_space=self.envs.observation_spaces[0], 57 | action_space=self.envs.action_spaces[0], 58 | model_config=config.MODEL, 59 | ) 60 | self.actor_critic.to(self.device) 61 | self.actor_critic = DDP( 62 | self.actor_critic, 63 | device_ids=[self.local_rank], 64 | output_device=self.local_rank, 65 | find_unused_parameters=True 66 | ) 67 | self.optimizer = torch.optim.Adam( 68 | self.actor_critic.parameters(), lr=self.config.DAGGER.LR 69 | ) 70 | 71 | if load_from_ckpt: 72 | ckpt_dict = self.load_checkpoint(ckpt_path, map_location="cpu") 73 | ckpt_dict["state_dict"] = {'module.'+k: v for k, v in ckpt_dict["state_dict"].items()} 74 | msg = self.actor_critic.load_state_dict(ckpt_dict["state_dict"], strict=False) 75 | logger.warning(f'Missing keys: {msg.missing_keys}, Unexpected keys: {msg.unexpected_keys}') 76 | logger.info(f"Loaded weights from checkpoint: {ckpt_path}") 77 | logger.info("Finished setting up actor critic model.") 78 | 79 | if self.local_rank == 0: 80 | logger.info( 81 | "agent number of parameters: {}".format( 82 | sum(param.numel() for param in self.actor_critic.parameters()) 83 | ) 84 | ) 85 | logger.info( 86 | "agent number of trainable parameters: {}".format( 87 | sum(p.numel() for p in self.actor_critic.parameters() if p.requires_grad) 88 | ) 89 | ) 90 | 91 | def save_checkpoint(self, file_name, extra_state: Optional[Dict] = None) -> None: 92 | """Save checkpoint with specified name. 93 | Args: 94 | file_name: file name for checkpoint 95 | Returns: 96 | None 97 | """ 98 | checkpoint = { 99 | "state_dict": self.actor_critic.module.state_dict(), 100 | "config": self.config, 101 | } 102 | if extra_state is not None: 103 | checkpoint["extra_state"] = extra_state 104 | torch.save(checkpoint, os.path.join(self.config.CHECKPOINT_FOLDER, file_name)) 105 | 106 | def load_checkpoint(self, checkpoint_path, *args, **kwargs) -> Dict: 107 | """Load checkpoint of specified path as a dict. 
108 | Args: 109 | checkpoint_path: path of target checkpoint 110 | *args: additional positional args 111 | **kwargs: additional keyword args 112 | Returns: 113 | dict containing checkpoint info 114 | """ 115 | ckpt = torch.load(checkpoint_path, *args, **kwargs) 116 | return ckpt 117 | 118 | def resume_dagger(self): 119 | start_dagger_it = 0 120 | start_epoch_it = 0 121 | 122 | ckpt_file = None 123 | if self.config.RESUME_CKPT is not None: 124 | ckpt_file = self.config.RESUME_CKPT 125 | if len(os.listdir(self.config.CHECKPOINT_FOLDER)) != 0: 126 | dir_list = sorted(os.listdir(self.config.CHECKPOINT_FOLDER), key=lambda x: os.path.getmtime(os.path.join(self.config.CHECKPOINT_FOLDER, x))) 127 | ckpt_file = os.path.join(self.config.CHECKPOINT_FOLDER, dir_list[-1]) # load the last saved ckpt 128 | 129 | if ckpt_file is not None: 130 | previous_model = self.load_checkpoint(ckpt_file, map_location=torch.device('cpu')) 131 | msg = self.actor_critic.module.load_state_dict(previous_model["state_dict"], strict=False) 132 | logger.warning(f'Missing keys: {msg.missing_keys}, Unexpected keys: {msg.unexpected_keys}') 133 | logger.info("Loaded previous checkpoint:%s"%ckpt_file) 134 | start_dagger_it = previous_model['extra_state']['dagger_it'] 135 | start_epoch_it = (int(ckpt_file.split('/')[-1].split('.')[1]) + 1) % self.config.DAGGER.EPOCHS 136 | if start_epoch_it == 0: 137 | start_dagger_it += 1 138 | 139 | return start_dagger_it, start_epoch_it 140 | 141 | @staticmethod 142 | def _pause_envs( 143 | envs_to_pause, 144 | envs, 145 | recurrent_hidden_states, 146 | not_done_masks, 147 | prev_actions, 148 | batch, 149 | actions=None, 150 | prog=None, 151 | rgb_full_global_map=None, 152 | rgb_frames=None, 153 | ): 154 | # pausing self.envs with no new episode 155 | if len(envs_to_pause) > 0: 156 | state_index = list(range(envs.num_envs)) 157 | for idx in reversed(envs_to_pause): 158 | if rgb_frames is not None: 159 | rgb_frames.pop(idx) 160 | state_index.pop(idx) 161 | envs.pause_at(idx) 162 | 163 | # indexing along the batch dimensions 164 | recurrent_hidden_states = recurrent_hidden_states[:, state_index] 165 | not_done_masks = not_done_masks[state_index] 166 | prev_actions = prev_actions[state_index] 167 | if actions is not None: 168 | actions = actions[state_index] 169 | if prog is not None: 170 | prog = prog[state_index] 171 | if rgb_full_global_map is not None: 172 | rgb_full_global_map = rgb_full_global_map[state_index] 173 | 174 | for k, v in batch.items(): 175 | batch[k] = v[state_index] 176 | 177 | return ( 178 | envs, 179 | recurrent_hidden_states, 180 | not_done_masks, 181 | prev_actions, 182 | batch, 183 | actions, 184 | prog, 185 | rgb_full_global_map, 186 | rgb_frames, 187 | ) 188 | 189 | def eval(self) -> None: 190 | """Main method of trainer evaluation. 
Calls _eval_checkpoint(), which 191 | is implemented by the trainer class that inherits from BaseRLTrainer. 192 | """ 193 | if "tensorboard" in self.config.VIDEO_OPTION: 194 | assert ( 195 | len(self.config.TENSORBOARD_DIR) > 0 196 | ), "Must specify a tensorboard directory for video display" 197 | os.makedirs(self.config.TENSORBOARD_DIR, exist_ok=True) 198 | if "disk" in self.config.VIDEO_OPTION: 199 | assert ( 200 | len(self.config.VIDEO_DIR) > 0 201 | ), "Must specify a directory for storing videos on disk" 202 | 203 | with TensorboardWriter( 204 | '', flush_secs=self.flush_secs 205 | ) as writer: 206 | if os.path.isfile(self.config.EVAL_CKPT_PATH_DIR): 207 | # evaluate a single checkpoint 208 | self._eval_checkpoint(self.config.EVAL_CKPT_PATH_DIR, writer) 209 | else: 210 | # evaluate multiple checkpoints, starting from the most recent 211 | num_ckpt = len(os.listdir(self.config.EVAL_CKPT_PATH_DIR)) 212 | prev_ckpt_ind = num_ckpt - 2 213 | while True: 214 | current_ckpt = None 215 | while current_ckpt is None: 216 | current_ckpt = poll_checkpoint_folder( 217 | self.config.EVAL_CKPT_PATH_DIR, prev_ckpt_ind 218 | ) 219 | time.sleep(2) # sleep for 2 secs before polling again 220 | logger.info(f"=======current_ckpt: {current_ckpt}=======") 221 | self._eval_checkpoint( 222 | checkpoint_path=current_ckpt, 223 | writer=writer, 224 | checkpoint_index=prev_ckpt_ind + 1, 225 | ) 226 | prev_ckpt_ind -= 1 227 | 228 | def _eval_checkpoint( 229 | self, checkpoint_path: str, writer: TensorboardWriter, checkpoint_index: int = 0, training=False, training_step=0 230 | ) -> None: 231 | """Evaluates a single checkpoint. Assumes episode IDs are unique. 232 | Args: 233 | checkpoint_path: path of checkpoint 234 | writer: tensorboard writer object for logging to tensorboard 235 | checkpoint_index: index of the current checkpoint, used for logging 236 | Returns: 237 | None 238 | """ 239 | if training: 240 | checkpoint_path = '' 241 | 242 | finish_process = [] 243 | logger.info(f"checkpoint_path: {checkpoint_path}") 244 | 245 | if self.config.EVAL.USE_CKPT_CONFIG and not training: 246 | config = self._setup_eval_config( 247 | self.load_checkpoint(checkpoint_path, map_location="cpu")["config"] 248 | ) 249 | else: 250 | config = self.config.clone() 251 | 252 | config.defrost() 253 | config.TASK_CONFIG.DATASET.SPLIT = config.EVAL.SPLIT 254 | config.TASK_CONFIG.TASK.NDTW.SPLIT = config.EVAL.SPLIT 255 | config.TASK_CONFIG.TASK.SDTW.SPLIT = config.EVAL.SPLIT 256 | config.TASK_CONFIG.TASK.VLN_ORACLE_WAYPOINT_SENSOR.LAW.SPLIT = config.EVAL.SPLIT 257 | config.TASK_CONFIG.ENVIRONMENT.ITERATOR_OPTIONS.SHUFFLE = False 258 | config.TASK_CONFIG.ENVIRONMENT.ITERATOR_OPTIONS.MAX_SCENE_REPEAT_STEPS = -1 259 | config.TASK_CONFIG.DATASET.split_num = 1 260 | if config.MODEL.PREDICTION_MONITOR.use: 261 | config.TASK_CONFIG.TASK.SENSORS.remove('GT_SEMANTIC_MAP_SENSOR') 262 | config.NUM_PROCESSES = min(config.NUM_PROCESSES, 11) 263 | config.SIMULATOR_GPU_IDS = list(range(len(os.environ["CUDA_VISIBLE_DEVICES"].split(',')))) 264 | 265 | if training: 266 | self.actor_critic.module.net.rgb_mapping_module.full_global_map = torch.zeros([config.NUM_PROCESSES] + list(self.actor_critic.module.net.rgb_mapping_module.full_global_map.shape[1:]), device=self.device) 267 | self.actor_critic.module.net.rgb_mapping_module.agent_view = torch.zeros([config.NUM_PROCESSES] + list(self.actor_critic.module.net.rgb_mapping_module.agent_view.shape[1:]), device=self.device) 268 | 269 | if training: 270 | config.STOP_CONDITION.TYPE = 'prog' 271 |
config.TASK_CONFIG.DATASET.DATA_PATH = 'data/datasets/R2R_VLNCE_v1-2_preprocessed/val_unseen/val_unseen_min.json.gz' 272 | # config.TASK_CONFIG.DATASET.DATA_PATH = 'data/datasets/R2R_VLNCE_v1-2_preprocessed/val_unseen/val_unseen.json.gz' 273 | if len(config.VIDEO_OPTION) > 0 and not training: 274 | config.SENSORS.append('SEMANTIC_SENSOR') 275 | config.TASK_CONFIG.TASK.SENSORS.append('SEMANTIC_FILTER_SENSOR') 276 | config.TASK_CONFIG.TASK.MEASUREMENTS.append("TOP_DOWN_MAP") 277 | config.TASK_CONFIG.TASK.MEASUREMENTS.append("COLLISIONS") 278 | config.freeze() 279 | 280 | # setup agent 281 | if self.envs is not None: 282 | self.envs.close() 283 | self.envs = None 284 | self.envs = construct_envs_auto_reset_false( 285 | config, get_env_class(config.ENV_NAME) 286 | ) 287 | 288 | if not training: 289 | self._setup_actor_critic(config, not config.random_agent, checkpoint_path) 290 | 291 | observations = self.envs.reset() 292 | epidsode_reset_flag = True 293 | observations = transform_obs( 294 | observations, config.TASK_CONFIG.TASK.INSTRUCTION_SENSOR_UUID, self.device 295 | ) 296 | batch = batch_obs(observations, self.device) 297 | 298 | eval_recurrent_hidden_states = torch.zeros( 299 | self.actor_critic.module.net.num_recurrent_layers, 300 | config.NUM_PROCESSES, 301 | self.config.MODEL.STATE_ENCODER.hidden_size, 302 | device=self.device, 303 | ) 304 | prev_actions = torch.zeros( 305 | config.NUM_PROCESSES, 2, device=self.device 306 | ) 307 | not_done_masks = torch.zeros(config.NUM_PROCESSES, 1, device=self.device) 308 | 309 | stats_episodes = {} # dict of dicts that stores stats per episode 310 | 311 | count_step = 0 312 | 313 | rgb_frames = None 314 | if len(config.VIDEO_OPTION) > 0 and not training: 315 | os.makedirs(config.VIDEO_DIR, exist_ok=True) 316 | rgb_frames = [[] for _ in range(config.NUM_PROCESSES)] 317 | 318 | pbar = tqdm.tqdm(total=sum(self.envs.number_of_episodes), dynamic_ncols=True, desc="Eval_ckpt_{}".format(str(training_step))) 319 | self.actor_critic.eval() 320 | step = 0 321 | while ( 322 | self.envs.num_envs > 0 and len(stats_episodes) < config.EVAL.EPISODE_COUNT 323 | ): 324 | current_episodes = self.envs.current_episodes() 325 | 326 | with torch.no_grad(): 327 | if count_step % config.step_num == 0 and count_step >= 24: 328 | (_, actions, _, eval_recurrent_hidden_states) = self.actor_critic.module.act( 329 | batch, 330 | eval_recurrent_hidden_states, 331 | prev_actions, 332 | not_done_masks, 333 | deterministic=True, 334 | ) 335 | else: 336 | self.actor_critic.module.update_map(batch, not_done_masks) 337 | if count_step < 24: 338 | actions = batch['waypoint'][:, :2] 339 | prev_actions.copy_(actions) 340 | 341 | step_inputs = [ 342 | { 343 | 'action': actions[e].cpu(), 344 | 'prog': self.actor_critic.module.prog[e].cpu().item() if count_step >= 24 else -1, 345 | 'epidsode_reset_flag': epidsode_reset_flag , 346 | 'depth_img': observations[e]['depth'], 347 | } 348 | for e in range(self.envs.num_envs) 349 | ] 350 | outputs = self.envs.step(step_inputs) 351 | epidsode_reset_flag = False 352 | step += 1 353 | observations, _, dones, infos = [list(x) for x in zip(*outputs)] 354 | if len(config.VIDEO_OPTION) > 0 and not training: 355 | for i in range(self.envs.num_envs): 356 | observations[i]['ego_map_vis'] = batch['ego_map'][i].cpu().numpy() 357 | 358 | count_step += 1 359 | 360 | not_done_masks = torch.tensor( 361 | [[0.0] if done else [1.0] for done in dones], 362 | dtype=torch.float, 363 | device=self.device, 364 | ) 365 | 366 | # reset envs and observations if necessary 367 
| for i in range(self.envs.num_envs): 368 | if len(config.VIDEO_OPTION) > 0 and len(os.listdir(config.VIDEO_DIR)) < config.VIDEO_NUM and not training: 369 | att_map = self.actor_critic.module.net.att_map_t_m[i] if count_step-1 >= 24 else torch.zeros(24*24) 370 | frame = observations_to_image(observations[i], infos[i], step_inputs[i], att_map) 371 | frame = append_text_to_image( 372 | frame, current_episodes[i].instruction.instruction_text 373 | ) 374 | rgb_frames[i].append(frame) 375 | 376 | if not dones[i]: 377 | continue 378 | 379 | pbar.update() 380 | stats_episodes[current_episodes[i].episode_id] = infos[i] 381 | prev_actions[i] = torch.zeros(2) 382 | 383 | finish_process.append(i) 384 | if len(config.VIDEO_OPTION) > 0 and len(os.listdir(config.VIDEO_DIR)) < config.VIDEO_NUM and not training \ 385 | and finish_process.count(i) // 3 <= math.ceil(config.VIDEO_NUM / config.NUM_PROCESSES) and finish_process.count(i) % 3 == 1: 386 | generate_video( 387 | video_option=config.VIDEO_OPTION, 388 | video_dir=config.VIDEO_DIR, 389 | images=rgb_frames[i], 390 | episode_id=current_episodes[i].episode_id, 391 | checkpoint_idx=checkpoint_index, 392 | metrics={ 393 | "spl": stats_episodes[current_episodes[i].episode_id]["spl"] 394 | }, 395 | tb_writer=writer, 396 | ) 397 | 398 | if len(config.VIDEO_OPTION) > 0: 399 | del stats_episodes[current_episodes[i].episode_id]["top_down_map"] 400 | del stats_episodes[current_episodes[i].episode_id]["collisions"] 401 | rgb_frames[i] = [] 402 | 403 | if not training: 404 | aggregated_stats = {} 405 | num_episodes = len(stats_episodes) 406 | for stat_key in next(iter(stats_episodes.values())).keys(): 407 | aggregated_stats[stat_key] = ( 408 | sum([v[stat_key] for v in stats_episodes.values()]) / num_episodes 409 | ) 410 | logger.info(aggregated_stats) 411 | 412 | if np.array(dones).all(): 413 | self.envs.resume_all() 414 | observations = self.envs.reset() 415 | epidsode_reset_flag = True 416 | count_step = 0 417 | eval_recurrent_hidden_states = torch.zeros( 418 | self.actor_critic.module.net.num_recurrent_layers, 419 | config.NUM_PROCESSES, 420 | self.config.MODEL.STATE_ENCODER.hidden_size, 421 | device=self.device, 422 | ) 423 | prev_actions = torch.zeros( 424 | config.NUM_PROCESSES, 2, device=self.device 425 | ) 426 | not_done_masks = torch.zeros(config.NUM_PROCESSES, 1, device=self.device) 427 | self.actor_critic.module.prog = torch.zeros(config.NUM_PROCESSES, 1, device=self.device) 428 | if self.actor_critic.module.net.rgb_mapping_module is not None: 429 | self.actor_critic.module.net.rgb_mapping_module.full_global_map = torch.zeros( 430 | config.NUM_PROCESSES, 431 | config.MODEL.RGBMAPPING.global_map_size, 432 | config.MODEL.RGBMAPPING.global_map_size, 433 | config.MODEL.RGBMAPPING.map_depth, 434 | device=self.device 435 | ) 436 | if len(config.VIDEO_OPTION) > 0: 437 | rgb_frames = [[] for _ in range(config.NUM_PROCESSES)] 438 | 439 | observations = transform_obs( 440 | observations, config.TASK_CONFIG.TASK.INSTRUCTION_SENSOR_UUID, self.device 441 | ) 442 | batch = batch_obs(observations, self.device) 443 | 444 | if np.array(dones).all(): 445 | actions = batch['waypoint'][:, :2] 446 | 447 | envs_to_pause = [] 448 | next_episodes = self.envs.current_episodes() 449 | 450 | for i in range(self.envs.num_envs): 451 | if next_episodes[i].episode_id in stats_episodes: 452 | envs_to_pause.append(i) 453 | 454 | ( 455 | self.envs, 456 | eval_recurrent_hidden_states, 457 | not_done_masks, 458 | prev_actions, 459 | batch, 460 | actions, 461 | 
self.actor_critic.module.prog, 462 | rgb_full_global_map, 463 | rgb_frames, 464 | ) = self._pause_envs( 465 | envs_to_pause, 466 | self.envs, 467 | eval_recurrent_hidden_states, 468 | not_done_masks, 469 | prev_actions, 470 | batch, 471 | actions, 472 | self.actor_critic.module.prog if count_step >= 24 else None, 473 | self.actor_critic.module.net.rgb_mapping_module.full_global_map, 474 | rgb_frames, 475 | ) 476 | self.actor_critic.module.net.rgb_mapping_module.full_global_map = rgb_full_global_map 477 | 478 | self.envs.close() 479 | self.envs = None 480 | 481 | aggregated_stats = {} 482 | num_episodes = len(stats_episodes) 483 | for stat_key in next(iter(stats_episodes.values())).keys(): 484 | aggregated_stats[stat_key] = ( 485 | sum([v[stat_key] for v in stats_episodes.values()]) / num_episodes 486 | ) 487 | 488 | if not training: 489 | split = config.TASK_CONFIG.DATASET.SPLIT 490 | os.makedirs(config.METRIC_DIR, exist_ok=True) 491 | with open(os.path.join(config.METRIC_DIR, f"stats_ckpt_{checkpoint_index}_{split}.json"), "w") as f: 492 | json.dump(aggregated_stats, f, indent=4) 493 | with open(os.path.join(config.METRIC_DIR, f"each_stat_ckpt_{checkpoint_index}_{split}.json"), "w") as f: 494 | json.dump(stats_episodes, f) 495 | 496 | if not training: 497 | logger.info(f"Episodes evaluated: {num_episodes}") 498 | checkpoint_num = checkpoint_index + 1 499 | for k, v in aggregated_stats.items(): 500 | logger.info(f"Average episode {k}: {v:.6f}") 501 | writer.add_scalar(f"eval_{split}_{k}", v, checkpoint_num) 502 | else: 503 | for k, v in aggregated_stats.items(): 504 | logger.info(f"Eval while training average episode {k}: {v:.6f}") 505 | writer.add_scalar(f"eval_while_training_{k}", v, training_step) 506 | writer.flush() 507 | 508 | def empty_cuda_cache(self): 509 | if torch.cuda.is_available(): 510 | with torch.cuda.device(self.device): 511 | torch.cuda.empty_cache() 512 | gc.collect() 513 | 514 | def change_data_type(self, traj_obs): 515 | for k, v in traj_obs.items(): 516 | traj_obs[k] = v.numpy() 517 | if k == 'vln_oracle_action_sensor': 518 | traj_obs[k] = traj_obs[k].astype(np.uint8) 519 | elif k == 'rgb_ego_map': 520 | traj_obs[k] = traj_obs[k].astype(np.float16) 521 | elif k == 'gt_path': 522 | traj_obs[k] = traj_obs[k].astype(np.float16) 523 | elif k == 'rgb': 524 | traj_obs[k] = traj_obs[k].astype(np.uint8) 525 | elif k == 'depth': 526 | traj_obs[k] = traj_obs[k].astype(np.float16) 527 | elif k == 'rgb_features': 528 | traj_obs[k] = traj_obs[k].astype(np.float16) 529 | elif k == 'depth_features': 530 | traj_obs[k] = traj_obs[k].astype(np.float16) 531 | elif k == 'gt_semantic_map': 532 | traj_obs[k] = traj_obs[k].astype(np.int64)  # np.int was removed in NumPy 1.24+; use a fixed-width dtype 533 | 534 | def inference(self) -> None: 535 | pass 536 | --------------------------------------------------------------------------------
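Note on the checkpoint format: save_checkpoint() above writes a plain dict with "state_dict", "config", and optionally "extra_state", so a saved file can be inspected outside the trainer. The snippet below is a minimal sketch and not part of the repository; the file name ckpt.5.pth and its location are assumptions, chosen only to match CHECKPOINT_FOLDER from the config above and the naming that resume_dagger() parses (it reads the epoch index from the second dot-separated field of the file name).

import torch

# Hypothetical checkpoint path; point this at an actual file in CHECKPOINT_FOLDER.
ckpt = torch.load("data/checkpoints/cma_aug/ckpt.5.pth", map_location="cpu")
print(sorted(ckpt.keys()))               # e.g. ['config', 'extra_state', 'state_dict']
print(ckpt["extra_state"]["dagger_it"])  # DAgger iteration, present only if extra_state was saved
policy_state = ckpt["state_dict"]        # parameters of actor_critic.module saved by the trainer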