├── habitat_extensions ├── config │ ├── __init__.py │ ├── vlnce_task.yaml │ ├── vlnce_task_aug.yaml │ └── default.py ├── __init__.py ├── task.py ├── shortest_path_follower.py ├── utils.py ├── measures.py └── sensors.py ├── vlnce_baselines ├── config │ ├── __init__.py │ ├── CMA_AUG.yaml │ ├── CMA_AUG_DA_TUNE.yaml │ └── default.py ├── models │ ├── __init__.py │ ├── encoders │ │ ├── instruction_encoder.py │ │ ├── resnet_encoders.py │ │ ├── unet_encoder.py │ │ └── map_encoder.py │ ├── policy.py │ ├── mg_map_policy.py │ └── ddppo_policy.py ├── __init__.py ├── common │ ├── aux_losses.py │ ├── distributions.py │ ├── env_utils.py │ ├── environments.py │ ├── action_maker.py │ ├── utils.py │ └── rgb_mapping.py └── common_trainer.py ├── img └── framework.png ├── requirements.txt ├── SETUP.md ├── .gitignore ├── run.py └── README.md /habitat_extensions/config/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vlnce_baselines/config/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vlnce_baselines/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /img/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PeihaoChen/WS-MGMap/HEAD/img/framework.png -------------------------------------------------------------------------------- /vlnce_baselines/__init__.py: -------------------------------------------------------------------------------- 1 | from vlnce_baselines import dagger_trainer 2 | from vlnce_baselines.common import environments 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | attrs>=19.1.0 2 | dtw==1.4.0 3 | fastdtw==0.3.4 4 | gym==0.10.9 5 | lmdb 6 | msgpack_numpy 7 | numpy 8 | torch>=1.3.1 9 | torchvision==0.2.2.post3 10 | tqdm>=4.0.0 11 | -------------------------------------------------------------------------------- /habitat_extensions/__init__.py: -------------------------------------------------------------------------------- 1 | from habitat_extensions import measures, sensors 2 | from habitat_extensions.config.default import get_extended_config 3 | from habitat_extensions.task import VLNCEDatasetV1 4 | -------------------------------------------------------------------------------- /vlnce_baselines/config/CMA_AUG.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task_aug.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_PROCESSES: 5 5 | TENSORBOARD_DIR: data/tensorboard_dirs/cma_aug 6 | CHECKPOINT_FOLDER: data/checkpoints/cma_aug 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/cma_aug 8 | 9 | SENSORS: ['RGB_SENSOR', 'DEPTH_SENSOR'] 10 | 11 | EVAL: 12 | USE_CKPT_CONFIG: False 13 | SPLIT: val_unseen 14 | EPISODE_COUNT: 50000 15 | 16 | DAGGER: 17 | ITERATIONS: 1 18 | EPOCHS: 30 19 | UPDATE_SIZE: 157232 20 | BATCH_SIZE: 8 21 | P: 1.0 22 | PRELOAD_LMDB_FEATURES: False 23 | LMDB_FEATURES_DIR: /mnt/cephfs/dataset/VLN-CE/result/jidongyu/_train_seen_data/trajectories.lmdb 24 | 25 | same_level_train: False 
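# Documentation-only notes (assumptions based on the README's two-stage recipe):
# with DAGGER.ITERATIONS: 1 and DAGGER.P: 1.0 this config corresponds to the
# stage-1 teacher-forcing run on the augmented split, while CMA_AUG_DA_TUNE.yaml
# raises ITERATIONS and lowers P for the stage-2 DAgger fine-tuning.
# LMDB_FEATURES_DIR above is a cluster-specific path; override it (or set
# PRELOAD_LMDB_FEATURES accordingly) for your own environment.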
26 | -------------------------------------------------------------------------------- /vlnce_baselines/config/CMA_AUG_DA_TUNE.yaml: -------------------------------------------------------------------------------- 1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/vlnce_task.yaml 2 | SIMULATOR_GPU_ID: 0 3 | TORCH_GPU_ID: 0 4 | NUM_PROCESSES: 5 5 | TENSORBOARD_DIR: data/tensorboard_dirs/cma_aug_da_tune 6 | CHECKPOINT_FOLDER: data/checkpoints/cma_aug_da_tune 7 | EVAL_CKPT_PATH_DIR: data/checkpoints/cma_aug_da_tune 8 | 9 | SENSORS: ['RGB_SENSOR', 'DEPTH_SENSOR'] 10 | 11 | EVAL: 12 | USE_CKPT_CONFIG: False 13 | SPLIT: val_unseen 14 | EPISODE_COUNT: 50000 15 | 16 | DAGGER: 17 | ITERATIONS: 10 18 | EPOCHS: 4 19 | UPDATE_SIZE: 5000 20 | BATCH_SIZE: 8 21 | P: 0.5 22 | PRELOAD_LMDB_FEATURES: False 23 | LMDB_FEATURES_DIR: /mnt/cephfs/dataset/VLN-CE/result/jidongyu/_train_seen_data/trajectories.lmdb 24 | LOAD_FROM_CKPT: True 25 | CKPT_TO_LOAD: /mnt/cephfs/dataset/VLN-CE/result/jidongyu/_exp_4/IL_RgbMap_Step3_SegPred-Alpha0.1_KlLoss-Tau0.07_DataAug/run_train_base/checkpoint/ckpt.12.pth 26 | -------------------------------------------------------------------------------- /vlnce_baselines/common/aux_losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class _AuxLosses: 5 | def __init__(self): 6 | self._losses = {} 7 | self._loss_alphas = {} 8 | self._is_active = False 9 | 10 | def clear(self): 11 | self._losses.clear() 12 | self._loss_alphas.clear() 13 | 14 | def register_loss(self, name, loss, alpha=1.0): 15 | assert self.is_active() 16 | assert name not in self._losses 17 | 18 | self._losses[name] = loss 19 | self._loss_alphas[name] = alpha 20 | 21 | def get_loss(self, name): 22 | return self._losses[name] 23 | 24 | def reduce(self, mask=None): 25 | assert self.is_active() 26 | total = 0.0 27 | 28 | for k in self._losses.keys(): 29 | if mask is not None: 30 | k_loss = torch.masked_select(self._losses[k], mask).mean() 31 | else: 32 | k_loss = self._losses[k].mean() 33 | total = total + self._loss_alphas[k] * k_loss 34 | 35 | return total 36 | 37 | def is_active(self): 38 | return self._is_active 39 | 40 | def activate(self): 41 | self._is_active = True 42 | 43 | def deactivate(self): 44 | self._is_active = False 45 | 46 | 47 | AuxLosses = _AuxLosses() 48 | -------------------------------------------------------------------------------- /habitat_extensions/config/vlnce_task.yaml: -------------------------------------------------------------------------------- 1 | ENVIRONMENT: 2 | MAX_EPISODE_STEPS: 500 3 | SIMULATOR: 4 | AGENT_0: 5 | SENSORS: [RGB_SENSOR, DEPTH_SENSOR] 6 | FORWARD_STEP_SIZE: 0.25 7 | TURN_ANGLE: 15 8 | HABITAT_SIM_V0: 9 | GPU_DEVICE_ID: 0 10 | ALLOW_SLIDING: True 11 | RGB_SENSOR: 12 | WIDTH: 224 13 | HEIGHT: 224 14 | HFOV: 90 15 | TYPE: HabitatSimRGBSensor 16 | DEPTH_SENSOR: 17 | WIDTH: 256 # pretrained DDPPO resnet needs 256x256 18 | HEIGHT: 256 19 | SEMANTIC_SENSOR: 20 | WIDTH: 256 21 | HEIGHT: 256 22 | TASK: 23 | TYPE: VLN-v0 24 | SUCCESS_DISTANCE: 3.0 25 | SENSORS: [ 26 | INSTRUCTION_SENSOR, 27 | VLN_ORACLE_ACTION_SENSOR, 28 | VLN_ORACLE_PROGRESS_SENSOR, 29 | VLN_ORACLE_WAYPOINT_SENSOR, 30 | VLN_ORACLE_PATH_SENSOR, 31 | HEADING_SENSOR, 32 | COMPASS_SENSOR, 33 | GPS_SENSOR, 34 | GT_SEMANTIC_MAP_SENSOR, 35 | ] 36 | INSTRUCTION_SENSOR_UUID: instruction 37 | POSSIBLE_ACTIONS: [STOP, MOVE_FORWARD, TURN_LEFT, TURN_RIGHT] 38 | MEASUREMENTS: [ 39 | DISTANCE_TO_GOAL, 40 | SUCCESS, 41 | SPL, 42 | NDTW, 43 | PATH_LENGTH, 44 | 
ORACLE_SUCCESS, 45 | STEPS_TAKEN 46 | ] 47 | SUCCESS: 48 | SUCCESS_DISTANCE: 3.0 49 | SPL: 50 | SUCCESS_DISTANCE: 3.0 51 | NDTW: 52 | SUCCESS_DISTANCE: 3.0 53 | GT_PATH: data/datasets/R2R_VLNCE_v1-2_preprocessed/{split}/{split}_gt.json.gz 54 | SDTW: 55 | SUCCESS_DISTANCE: 3.0 56 | GT_PATH: data/datasets/R2R_VLNCE_v1-2_preprocessed/{split}/{split}_gt.json.gz 57 | ORACLE_SUCCESS: 58 | SUCCESS_DISTANCE: 3.0 59 | DATASET: 60 | TYPE: VLN-CE-v1 61 | SPLIT: train 62 | DATA_PATH: data/datasets/R2R_VLNCE_v1-2_preprocessed/{split}/{split}.json.gz 63 | SCENES_DIR: data/scene_datasets/ 64 | -------------------------------------------------------------------------------- /habitat_extensions/config/vlnce_task_aug.yaml: -------------------------------------------------------------------------------- 1 | # Same as vlnce_task.yaml but with a dataset split that 2 | # contains both the training and EnvDrop episodes. 3 | 4 | ENVIRONMENT: 5 | MAX_EPISODE_STEPS: 500 6 | SIMULATOR: 7 | AGENT_0: 8 | SENSORS: [RGB_SENSOR, DEPTH_SENSOR] 9 | FORWARD_STEP_SIZE: 0.25 10 | TURN_ANGLE: 15 11 | HABITAT_SIM_V0: 12 | GPU_DEVICE_ID: 0 13 | ALLOW_SLIDING: True 14 | RGB_SENSOR: 15 | WIDTH: 224 16 | HEIGHT: 224 17 | HFOV: 90 18 | TYPE: HabitatSimRGBSensor 19 | DEPTH_SENSOR: 20 | WIDTH: 256 # pretrained DDPPO resnet needs 256x256 21 | HEIGHT: 256 22 | SEMANTIC_SENSOR: 23 | WIDTH: 256 24 | HEIGHT: 256 25 | TASK: 26 | TYPE: VLN-v0 27 | SUCCESS_DISTANCE: 3.0 28 | SENSORS: [ 29 | INSTRUCTION_SENSOR, 30 | VLN_ORACLE_ACTION_SENSOR, 31 | VLN_ORACLE_PROGRESS_SENSOR, 32 | VLN_ORACLE_WAYPOINT_SENSOR, 33 | VLN_ORACLE_PATH_SENSOR, 34 | HEADING_SENSOR, 35 | COMPASS_SENSOR, 36 | GPS_SENSOR, 37 | GT_SEMANTIC_MAP_SENSOR, 38 | ] 39 | INSTRUCTION_SENSOR_UUID: instruction 40 | POSSIBLE_ACTIONS: [STOP, MOVE_FORWARD, TURN_LEFT, TURN_RIGHT] 41 | MEASUREMENTS: [ 42 | DISTANCE_TO_GOAL, 43 | SUCCESS, 44 | SPL, 45 | NDTW, 46 | PATH_LENGTH, 47 | ORACLE_SUCCESS, 48 | STEPS_TAKEN 49 | ] 50 | SUCCESS: 51 | SUCCESS_DISTANCE: 3.0 52 | SPL: 53 | SUCCESS_DISTANCE: 3.0 54 | NDTW: 55 | SUCCESS_DISTANCE: 3.0 56 | GT_PATH: data/datasets/R2R_VLNCE_v1-2_preprocessed/{split}/{split}_gt.json.gz 57 | SDTW: 58 | SUCCESS_DISTANCE: 3.0 59 | GT_PATH: data/datasets/R2R_VLNCE_v1-2_preprocessed/{split}/{split}_gt.json.gz 60 | ORACLE_SUCCESS: 61 | SUCCESS_DISTANCE: 3.0 62 | DATASET: 63 | TYPE: VLN-CE-v1 64 | SPLIT: joint_train_envdrop 65 | DATA_PATH: data/datasets/R2R_VLNCE_v1-2_preprocessed/{split}/{split}.json.gz 66 | SCENES_DIR: data/scene_datasets/ 67 | -------------------------------------------------------------------------------- /vlnce_baselines/common/distributions.py: -------------------------------------------------------------------------------- 1 | # The following code is largely borrowed from: 2 | # https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/distributions.py 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | """ 8 | Modify standard PyTorch distributions so they are compatible with this code. 
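Both FixedCategorical and FixedNormal are patched with `log_probs` and `mode`
helpers so that discrete and continuous action heads expose a uniform interface
to the policy.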
9 | """ 10 | 11 | FixedCategorical = torch.distributions.Categorical 12 | 13 | old_sample = FixedCategorical.sample 14 | FixedCategorical.sample = lambda self: old_sample(self) 15 | 16 | log_prob_cat = FixedCategorical.log_prob 17 | FixedCategorical.log_probs = lambda self, actions: \ 18 | log_prob_cat(self, actions.squeeze(-1)) 19 | FixedCategorical.mode = lambda self: self.probs.argmax(dim=1, keepdim=True) 20 | 21 | FixedNormal = torch.distributions.Normal 22 | log_prob_normal = FixedNormal.log_prob 23 | FixedNormal.log_probs = lambda self, actions: \ 24 | log_prob_normal(self, actions).sum(-1, keepdim=False) 25 | 26 | entropy = FixedNormal.entropy 27 | FixedNormal.entropy = lambda self: entropy(self).sum(-1) 28 | 29 | FixedNormal.mode = lambda self: self.mean 30 | 31 | 32 | class Categorical(nn.Module): 33 | def __init__(self, num_inputs, num_outputs): 34 | super(Categorical, self).__init__() 35 | self.linear = nn.Linear(num_inputs, num_outputs) 36 | 37 | def forward(self, x): 38 | x = self.linear(x) 39 | return FixedCategorical(logits=x) 40 | 41 | 42 | class DiagGaussian(nn.Module): 43 | def __init__(self, num_inputs, num_outputs): 44 | super(DiagGaussian, self).__init__() 45 | 46 | self.fc_mean = nn.Linear(num_inputs, num_outputs) 47 | self.logstd = AddBias(torch.zeros(num_outputs)) 48 | 49 | def forward(self, x): 50 | action_mean = self.fc_mean(x) 51 | 52 | zeros = torch.zeros(action_mean.size()) 53 | if x.is_cuda: 54 | zeros = zeros.cuda(x.device) 55 | 56 | action_logstd = self.logstd(zeros) 57 | return FixedNormal(action_mean, action_logstd.exp()) 58 | 59 | 60 | class AddBias(nn.Module): 61 | def __init__(self, bias): 62 | super(AddBias, self).__init__() 63 | self._bias = nn.Parameter(bias.unsqueeze(1)) 64 | 65 | def forward(self, x): 66 | if x.dim() == 2: 67 | bias = self._bias.t().view(1, -1) 68 | else: 69 | bias = self._bias.t().view(1, -1, 1, 1) 70 | 71 | return x + bias 72 | -------------------------------------------------------------------------------- /SETUP.md: -------------------------------------------------------------------------------- 1 | # Setup 2 | 3 | ## clone this code 4 | ```bash 5 | git clone https://github.com/PeihaoChen/WS-MGMap.git 6 | cd WS-MGMap 7 | ``` 8 | 9 | ## Python 10 | This project is developed with Python 3.6.13. If you are using miniconda or anaconda, you can create an environment: 11 | 12 | ```bash 13 | conda create -n wsmgmap python==3.6.13 14 | conda activate wsmgmap 15 | ``` 16 | 17 | ## Pytorch 18 | VLN-CE uses Pytorch 1.6.0 & Cuda 10.2 which can be built installed from conda: 19 | 20 | ```bash 21 | conda install pytorch==1.6.0 torchvision==0.7.0 cudatoolkit=10.2 -c pytorch 22 | ``` 23 | 24 | ## Habitat 25 | VLN-CE uses Habitat-Sim 0.1.5 which can be built from source or installed from conda: 26 | 27 | ```bash 28 | conda install -y -c aihabitat -c conda-forge bullet=2.88 habitat-sim=0.1.5 headless withbullet python=3.6 29 | ``` 30 | Tips: You'd better to install bullet and withbulllet simultaneously, in order to avoid ImportError at run time. 
31 | 32 | Then install Habitat-Lab: 33 | 34 | ```bash 35 | git clone --branch v0.1.5 https://github.com/facebookresearch/habitat-lab.git 36 | cd habitat-lab 37 | # installs both habitat and habitat_baselines 38 | pip install --upgrade pip # update pip 39 | python -m pip install -r requirements.txt 40 | 41 | python -m pip install -r habitat_baselines/rl/requirements.txt 42 | python -m pip install -r habitat_baselines/rl/ddppo/requirements.txt 43 | python setup.py develop --all 44 | ``` 45 | 46 | ## WS-MGMap for VLN 47 | ```bash 48 | cd .. 49 | pip install -r requirements.txt 50 | 51 | # requirements 52 | conda install psutil 53 | pip install einops 54 | 55 | # torch_scatter 56 | cd data 57 | wget https://data.pyg.org/whl/torch-1.6.0%2Bcu102/torch_scatter-2.0.6-cp36-cp36m-linux_x86_64.whl 58 | pip install torch_scatter-2.0.6-cp36-cp36m-linux_x86_64.whl 59 | cd .. 60 | ``` 61 | 62 | # Data 63 | ```bash 64 | # Fisrt install the gdown to download data in google drive. 65 | pip install gdown 66 | 67 | mkdir data 68 | cd data 69 | ``` 70 | 71 | ## Semantic Map 72 | ```bash 73 | # Download map_data.tar.gz 74 | gdown https://drive.google.com/uc?id=1pJwx0E95WsJXThcx8tPrUTB_6gTlryoy 75 | tar -xvf map_data.tar.gz 76 | 77 | # Unzip all train files 78 | cd map_data/semantic/train 79 | find . -name '*.tar.gz' -print0 | xargs -0 -I {} -P 10 tar -zvxf {} 80 | 81 | # Unzip all train_aug files 82 | cd ../train_aug 83 | find . -name '*.tar.gz' -print0 | xargs -0 -I {} -P 10 tar -zvxf {} 84 | ``` 85 | 86 | ## Pre-Trained Model 87 | ```bash 88 | gdown https://drive.google.com/uc?id=1DYkXbRIBVgMU1qHF_mLT41esSAdcQJaf 89 | tar -zxvf pretrain_model.tar.gz 90 | ``` 91 | 92 | ## Trained model 93 | ```bash 94 | gdown https://drive.google.com/uc?id=1HcD8s-tyBeH2LsXs6Rj5x5DC1hVD4GNs 95 | tar -zxvf trained_model.tar.gz 96 | ``` 97 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | result 2 | scripts/ 3 | result1 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | pip-wheel-metadata/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # VSCode 133 | .vscode 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | 138 | # exclude data from source control by default 139 | data 140 | 141 | # Generated videos 142 | videos 143 | 144 | # database files 145 | *.lmdb 146 | 147 | # logging 148 | .log 149 | 150 | # evaluation results 151 | stats_*.json 152 | 153 | # Other 154 | habitat-lab 155 | temp 156 | -------------------------------------------------------------------------------- /vlnce_baselines/common/env_utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import gym 4 | from typing import Type, Union 5 | 6 | import habitat 7 | from habitat import Config, Env, RLEnv, VectorEnv, make_dataset 8 | from habitat_baselines.common.env_utils import make_env_fn 9 | 10 | 11 | def construct_envs( 12 | config: Config, env_class: Type[Union[Env, RLEnv]], auto_reset_done: bool = True 13 | ) -> VectorEnv: 14 | r"""Create VectorEnv object with specified config and env class type. 15 | To allow better performance, dataset are split into small ones for 16 | each individual env, grouped by scenes. 17 | 18 | Args: 19 | config: configs that contain num_processes as well as information 20 | necessary to create individual environments. 21 | env_class: class type of the envs to be created. 22 | auto_reset_done: Whether or not to automatically reset the env on done 23 | 24 | Returns: 25 | VectorEnv object created according to specification. 
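Scenes are shuffled and assigned to the processes round-robin, and each
process pins its simulator to a GPU taken from SIMULATOR_GPU_IDS
(falling back to SIMULATOR_GPU_ID when that list is not set).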
26 | """ 27 | 28 | num_processes = config.NUM_PROCESSES 29 | configs = [] 30 | env_classes = [env_class for _ in range(num_processes)] 31 | dataset = make_dataset(config.TASK_CONFIG.DATASET.TYPE) 32 | scenes = dataset.get_scenes_to_load(config.TASK_CONFIG.DATASET) 33 | 34 | if num_processes > 1: 35 | if len(scenes) == 0: 36 | raise RuntimeError( 37 | "No scenes to load, multiple process logic relies on being able to split scenes uniquely between processes" 38 | ) 39 | 40 | if len(scenes) < num_processes: 41 | raise RuntimeError( 42 | "reduce the number of processes as there " 43 | "aren't enough number of scenes" 44 | ) 45 | 46 | random.shuffle(scenes) 47 | 48 | scene_splits = [[] for _ in range(num_processes)] 49 | for idx, scene in enumerate(scenes): 50 | scene_splits[idx % len(scene_splits)].append(scene) 51 | 52 | # assert sum(map(len, scene_splits)) == len(scenes) 53 | if config.SIMULATOR_GPU_IDS is None: 54 | devices = [config.SIMULATOR_GPU_ID] 55 | else: 56 | devices = config.SIMULATOR_GPU_IDS 57 | 58 | for i in range(num_processes): 59 | proc_config = config.clone() 60 | proc_config.defrost() 61 | 62 | task_config = proc_config.TASK_CONFIG 63 | if len(scenes) > 0: 64 | task_config.DATASET.CONTENT_SCENES = scene_splits[i] 65 | 66 | task_config.SIMULATOR.HABITAT_SIM_V0.GPU_DEVICE_ID = devices[i % len(devices)] 67 | 68 | task_config.SIMULATOR.AGENT_0.SENSORS = config.SENSORS 69 | 70 | proc_config.freeze() 71 | configs.append(proc_config) 72 | 73 | envs = habitat.VectorEnv( 74 | make_env_fn=make_env_fn, 75 | env_fn_args=tuple(tuple(zip(configs, env_classes, range(num_processes)))), 76 | auto_reset_done=auto_reset_done, 77 | ) 78 | 79 | action_space = gym.spaces.Box(low=0.0, high=0.99, shape=(2,), dtype=np.float32) 80 | envs.action_spaces = [action_space for _ in range(num_processes)] 81 | 82 | return envs 83 | 84 | 85 | def construct_envs_auto_reset_false( 86 | config: Config, env_class: Type[Union[Env, RLEnv]] 87 | ) -> VectorEnv: 88 | return construct_envs(config, env_class, auto_reset_done=False) 89 | -------------------------------------------------------------------------------- /vlnce_baselines/models/encoders/instruction_encoder.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from habitat import Config 8 | 9 | 10 | class InstructionEncoder(nn.Module): 11 | def __init__(self, config: Config): 12 | """An encoder that uses RNN to encode an instruction. Returns 13 | the final hidden state after processing the instruction sequence. 14 | Args: 15 | config: must have 16 | vocab_size: number of words in the vocabulary 17 | embedding_size: The dimension of each embedding vector 18 | use_pretrained_embeddings: 19 | embedding_file: 20 | fine_tune_embeddings: 21 | dataset_vocab: 22 | hidden_size: The hidden (output) size 23 | rnn_type: The RNN cell type. 
Must be GRU or LSTM 24 | final_state_only: Whether or not to return just the final state 25 | """ 26 | super().__init__() 27 | 28 | self.config = config 29 | 30 | if self.config.use_pretrained_embeddings: 31 | self.embedding_layer = nn.Embedding.from_pretrained( 32 | embeddings=self._load_embeddings(), 33 | freeze=not self.config.fine_tune_embeddings, 34 | ) 35 | else: # each embedding initialized to sampled Gaussian 36 | self.embedding_layer = nn.Embedding( 37 | num_embeddings=config.vocab_size, 38 | embedding_dim=config.embedding_size, 39 | padding_idx=0, 40 | ) 41 | 42 | rnn = nn.GRU if self.config.rnn_type == "GRU" else nn.LSTM 43 | self.bidir = config.bidirectional 44 | self.encoder_rnn = rnn( 45 | input_size=config.embedding_size, 46 | hidden_size=config.hidden_size, 47 | bidirectional=self.bidir, 48 | ) 49 | self.final_state_only = config.final_state_only 50 | 51 | @property 52 | def output_size(self): 53 | return self.config.hidden_size * (2 if self.bidir else 1) 54 | 55 | def _load_embeddings(self): 56 | """ Loads word embeddings from a pretrained embeddings file. 57 | PAD: index 0. [0.0, ... 0.0] 58 | UNK: index 1. mean of all R2R word embeddings: [mean_0, ..., mean_n] 59 | why UNK is averaged: 60 | https://groups.google.com/forum/#!searchin/globalvectors/unk|sort:date/globalvectors/9w8ZADXJclA/hRdn4prm-XUJ 61 | Returns: 62 | embeddings tensor of size [num_words x embedding_dim] 63 | """ 64 | with gzip.open(self.config.embedding_file, "rt") as f: 65 | embeddings = torch.tensor(json.load(f)) 66 | return embeddings 67 | 68 | def forward(self, observations): 69 | """ 70 | Tensor sizes after computation: 71 | instruction: [batch_size x seq_length] 72 | lengths: [batch_size] 73 | hidden_state: [batch_size x hidden_size] 74 | """ 75 | instruction = observations["instruction"].long() 76 | 77 | lengths = (instruction != 0.0).long().sum(dim=1) 78 | embedded = self.embedding_layer(instruction) 79 | 80 | packed_seq = nn.utils.rnn.pack_padded_sequence( 81 | embedded, lengths, batch_first=True, enforce_sorted=False 82 | ) 83 | 84 | output, final_state = self.encoder_rnn(packed_seq) 85 | 86 | if self.config.rnn_type == "LSTM": 87 | final_state = final_state[0] 88 | 89 | if self.final_state_only: 90 | return final_state.squeeze(0) 91 | else: 92 | hidden_states = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)[0].permute(0, 2, 1) 93 | return hidden_states, (hidden_states == 0.0).all(dim=1) 94 | -------------------------------------------------------------------------------- /vlnce_baselines/models/encoders/resnet_encoders.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import spaces 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from habitat_baselines.rl.ddppo.policy import resnet 8 | from habitat_baselines.rl.ddppo.policy.resnet_policy import ResNetEncoder 9 | from habitat_baselines.common.utils import Flatten 10 | 11 | 12 | class VlnResnetDepthEncoder(nn.Module): 13 | def __init__( 14 | self, 15 | observation_space, 16 | output_size=128, 17 | checkpoint="NONE", 18 | backbone="resnet50", 19 | resnet_baseplanes=32, 20 | normalize_visual_inputs=False, 21 | trainable=False, 22 | spatial_output: bool = False, 23 | ): 24 | super().__init__() 25 | self.visual_encoder = ResNetEncoder( 26 | spaces.Dict({"depth": observation_space.spaces["depth"]}), 27 | baseplanes=resnet_baseplanes, 28 | ngroups=resnet_baseplanes // 2, 29 | make_backbone=getattr(resnet, backbone), 30 | 
normalize_visual_inputs=normalize_visual_inputs, 31 | obs_transform=None, 32 | ) 33 | 34 | for param in self.visual_encoder.parameters(): 35 | param.requires_grad_(trainable) 36 | 37 | if checkpoint != "NONE": 38 | ddppo_weights = torch.load(checkpoint) 39 | 40 | weights_dict = {} 41 | for k, v in ddppo_weights["state_dict"].items(): 42 | split_layer_name = k.split(".")[2:] 43 | if split_layer_name[0] != "visual_encoder": 44 | continue 45 | 46 | layer_name = ".".join(split_layer_name[1:]) 47 | weights_dict[layer_name] = v 48 | 49 | del ddppo_weights 50 | self.visual_encoder.load_state_dict(weights_dict, strict=True) 51 | 52 | self.spatial_output = spatial_output 53 | 54 | if not self.spatial_output: 55 | self.output_shape = (output_size,) 56 | self.visual_fc = nn.Sequential( 57 | Flatten(), 58 | nn.Linear(np.prod(self.visual_encoder.output_shape), output_size), 59 | nn.ReLU(True), 60 | ) 61 | else: 62 | self.spatial_embeddings = nn.Embedding( 63 | self.visual_encoder.output_shape[1] 64 | * self.visual_encoder.output_shape[2], 65 | 64, 66 | ) 67 | 68 | self.output_shape = list(self.visual_encoder.output_shape) 69 | self.output_shape[0] += self.spatial_embeddings.embedding_dim 70 | self.output_shape = tuple(self.output_shape) 71 | 72 | def forward(self, observations): 73 | """ 74 | Args: 75 | observations: [BATCH, HEIGHT, WIDTH, CHANNEL] 76 | Returns: 77 | [BATCH, OUTPUT_SIZE] 78 | """ 79 | if "depth_features" in observations: 80 | x = observations["depth_features"] 81 | else: 82 | x = self.visual_encoder(observations) 83 | 84 | if self.spatial_output: 85 | b, c, h, w = x.size() 86 | 87 | spatial_features = ( 88 | self.spatial_embeddings( 89 | torch.arange( 90 | 0, 91 | self.spatial_embeddings.num_embeddings, 92 | device=x.device, 93 | dtype=torch.long, 94 | ) 95 | ) 96 | .view(1, -1, h, w) 97 | .expand(b, self.spatial_embeddings.embedding_dim, h, w) 98 | ) 99 | 100 | return torch.cat([x, spatial_features], dim=1) 101 | else: 102 | return self.visual_fc(x) 103 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import random 5 | import os 6 | import warnings 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | from typing import List 10 | plt.switch_backend('agg') 11 | os.environ['GLOG_minloglevel'] = '2' 12 | os.environ['MAGNUM_LOG'] = 'quiet' 13 | warnings.filterwarnings("ignore") 14 | 15 | import torch 16 | 17 | from habitat import logger 18 | from habitat_baselines.common.baseline_registry import baseline_registry 19 | 20 | from vlnce_baselines.config.default import get_config, refine_config, set_saveDir_GPUs 21 | from vlnce_baselines.common.utils import check_exist_file, save_sh_n_codes, save_config 22 | 23 | 24 | def main(): 25 | parser = argparse.ArgumentParser() 26 | 27 | parser.add_argument( 28 | "--run-type", 29 | choices=["train", "eval", "inference"], 30 | default="train", 31 | help="run type of the experiment (train, eval, inference)", 32 | ) 33 | parser.add_argument( 34 | "-c", "--exp-config", 35 | type=str, 36 | required=True, 37 | help="path to config yaml containing info about experiment", 38 | ) 39 | parser.add_argument( 40 | "-e", "--model-dir", 41 | default=None, 42 | help="path to save checkpoint, log and others", 43 | ) 44 | parser.add_argument( 45 | "--note", 46 | default='base', 47 | help="add extra note for running file", 48 | ) 49 | parser.add_argument( 50 | "-g", "--gpus", 51 | 
default=None, 52 | nargs="+", 53 | type=int, 54 | help="GPU id to run experiments", 55 | ) 56 | parser.add_argument( 57 | "opts", 58 | default=None, 59 | nargs=argparse.REMAINDER, 60 | help="Modify config options from command line", 61 | ) 62 | parser.add_argument( 63 | '--local_rank', 64 | default=-1, 65 | type=int, 66 | help='node rank for distributed training' 67 | ) 68 | 69 | args = parser.parse_args() 70 | run_exp(**vars(args)) 71 | 72 | 73 | def run_exp(exp_config: str, 74 | run_type: str, 75 | model_dir: str, 76 | note: str, 77 | gpus: List[int], 78 | opts=None, 79 | local_rank=-1) -> None: 80 | """Runs experiment given mode and config 81 | Args: 82 | exp_config: path to config file. 83 | run_type: "train" or "eval. 84 | model_dir: path to save. 85 | note: extra note. 86 | opts: list of strings of additional config options. 87 | Returns: 88 | None. 89 | 90 | """ 91 | config = get_config(exp_config, opts) 92 | config = set_saveDir_GPUs(config, run_type, model_dir, note, gpus, local_rank) 93 | config = refine_config(config, local_rank) 94 | if local_rank == 0: 95 | check_exist_file(config) 96 | save_sh_n_codes( 97 | config, 98 | run_type, 99 | ignore_dir=['habitat-lab', 'data', 'result', 'habitat-sim', 'temp'] 100 | ) 101 | save_config(config, run_type) 102 | logger.add_filehandler(config.LOG_FILE) 103 | 104 | random.seed(config.TASK_CONFIG.SEED) 105 | np.random.seed(config.TASK_CONFIG.SEED) 106 | torch.manual_seed(config.TASK_CONFIG.SEED) 107 | torch.backends.cudnn.benchmark = False 108 | torch.backends.cudnn.deterministic = True 109 | 110 | trainer_init = baseline_registry.get_trainer(config.TRAINER_NAME) 111 | assert trainer_init is not None, f"{config.TRAINER_NAME} is not supported" 112 | trainer = trainer_init(config) 113 | 114 | if run_type == "train": 115 | trainer.train() 116 | elif run_type == "eval": 117 | trainer.eval() 118 | elif run_type == "inference": 119 | trainer.inference() 120 | 121 | 122 | if __name__ == "__main__": 123 | main() 124 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # [NeurIPS 2022] WS-MGMap for VLN 2 | Official Pytorch implementation for NeurIPS 2022 paper "Weakly-Supervised Multi-Granularity Map Learning for Vision-and-Language Navigation” 3 | 4 | ![](img/framework.png) 5 | 6 | 7 | ## Setup 8 | This code is developed with Python 3.6, PyTorch 1.6.0. We follow [VLN-CE](https://github.com/jacobkrantz/VLN-CE) to install Habitat-Sim and Habitat-Lab. Then clone this repository and install requirements. (More details in SETUP.md) 9 | 10 | ```bash 11 | git clone https://github.com/PeihaoChen/WS-MGMap.git 12 | cd WS-MGMap 13 | pip install -r requirements.txt 14 | ``` 15 | 16 | 17 | ## Data 18 | 19 | ### Download Scenes and Episodes 20 | Follow the instructions in [VLN-CE](https://github.com/jacobkrantz/VLN-CE) to download Matterport3D scenes to `data/scene_datasets` folder and VLN-CE datasets to `data/datasets` folder and corresponding episodes data. 21 | 22 | ### Download Ground-Truth Semantic Map 23 | Download the cache ground-truth semantic map [here](https://drive.google.com/file/d/1pJwx0E95WsJXThcx8tPrUTB_6gTlryoy/view?usp=share_link) to `data/map_data` folder as the supervision for the semantic hallucination. 
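For example (mirroring the commands in SETUP.md), the archive can be fetched with `gdown` and unpacked in place; SETUP.md additionally shows how to untar the per-split `*.tar.gz` files inside `map_data/semantic/`:

```bash
pip install gdown
mkdir -p data && cd data
gdown https://drive.google.com/uc?id=1pJwx0E95WsJXThcx8tPrUTB_6gTlryoy
tar -xvf map_data.tar.gz
```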
24 | 25 | ### Download Pre-Trained Model 26 | The pre-trained semantic segmentation model for the semantic segmentation and DD-PPO model for the navigation control can be found [here](https://drive.google.com/file/d/1DYkXbRIBVgMU1qHF_mLT41esSAdcQJaf/view?usp=sharing). Download it to `data/pretrain_model` folder 27 | 28 | ### Data Format 29 | This code expects all data files in the following structure: 30 | 31 | ```graphql 32 | WS-MGMap 33 | ├─ data 34 | | ├─ datasets 35 | | | ├─ R2R_VLNCE_v1-2 36 | | | ├─ R2R_VLNCE_v1-2_preprocessed 37 | | ├─ map_data 38 | | | ├─ semantic 39 | | | | ├─ train 40 | | | | | ├─ ep_0.npy 41 | | | | | ├─ ... 42 | | | | ├─ train_aug 43 | | | | | ├─ ep_0.npy 44 | | | | | ├─ ... 45 | | ├─ pretrain_model 46 | | | ├─ ddppo-models 47 | | | | ├─ gibson-2plus-resnet50.pth 48 | | | ├─ unet-models 49 | | | | ├─ 2021_02_14-23_42_50.pt 50 | | ├─ scene_datasets 51 | | | ├─ mp3d 52 | | | | ├─ 1LXtFkjw3qL 53 | | | | ├─ ... 54 | ``` 55 | 56 | 57 | ## Usage 58 | 59 | ### Evaluation 60 | We provide our trained models [here](https://drive.google.com/file/d/1HcD8s-tyBeH2LsXs6Rj5x5DC1hVD4GNs/view?usp=share_link) for reproducing the results shown in the paper. 61 | Run the following to evaluate a trained model: 62 | 63 | ```bash 64 | export CUDA_VISIBLE_DEVICES=0 65 | python -m torch.distributed.launch --nproc_per_node=1 run.py \ 66 | --run-type eval \ 67 | -c vlnce_baselines/config/CMA_AUG_DA_TUNE.yaml \ 68 | -e $PATH_TO_SAVE_RESULT$ \ 69 | EVAL_CKPT_PATH_DIR $PATH_TO_TRAINED_MODEL$ \ 70 | NUM_PROCESSES 1 \ 71 | use_ddppo True 72 | ``` 73 | 74 | 75 | ### Training 76 | STAGE1: Run the following for teacher forcing training on augmented data: 77 | 78 | ```bash 79 | export CUDA_VISIBLE_DEVICES=0,1,2 80 | python -m torch.distributed.launch --nproc_per_node=3 run.py \ 81 | -c vlnce_baselines/config/CMA_AUG.yaml \ 82 | -e $PATH_TO_SAVE_RESULT$ \ 83 | NUM_PROCESSES 6 \ 84 | DAGGER.BATCH_SIZE 8 85 | ``` 86 | 87 | STAGE2: Run the following for dagger training to fine-tune the model: 88 | 89 | ```bash 90 | export CUDA_VISIBLE_DEVICES=0,1,2 91 | python -m torch.distributed.launch --nproc_per_node=3 run.py \ 92 | -c vlnce_baselines/config/CMA_AUG_DA_TUNE.yaml \ 93 | -e $PATH_TO_SAVE_RESULT$ \ 94 | NUM_PROCESSES 5 \ 95 | DAGGER.BATCH_SIZE 8 \ 96 | DAGGER.CKPT_TO_LOAD $PATH_TO_MODEL_FROM_STAGE1$ 97 | ``` 98 | 99 | 100 | ## Citation 101 | If you use or discuss WS-MGMap in your research, please consider citing the paper as follows 102 | ``` 103 | @article{chen2022weakly, 104 | title={Weakly-supervised multi-granularity map learning for vision-and-language navigation}, 105 | author={Chen, Peihao and Ji, Dongyu and Lin, Kunyang and Zeng, Runhao and Li, Thomas H and Tan, Mingkui and Gan, Chuang}, 106 | journal={arXiv preprint arXiv:2210.07506}, 107 | year={2022} 108 | } 109 | ``` 110 | -------------------------------------------------------------------------------- /vlnce_baselines/models/encoders/unet_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torchvision import models 4 | 5 | 6 | def convrelu(in_channels, out_channels, kernel, padding): 7 | return nn.Sequential( 8 | nn.Conv2d(in_channels, out_channels, kernel, padding=padding), 9 | nn.BatchNorm2d(num_features=out_channels), 10 | nn.ReLU(inplace=True), 11 | ) 12 | 13 | 14 | class UNet(nn.Module): 15 | def __init__(self, model_config): 16 | super().__init__() 17 | self.base_model = ResNetUNet(3, 27) 18 | 19 | state = 
torch.load(model_config.RGB_ENCODER.pretrain_model, map_location='cpu') 20 | model_state = state['models']['img_segm_model'] 21 | new_model_state = {'.'.join(k.split('.')[2:]):v for k,v in model_state.items()} 22 | self.base_model.load_state_dict(new_model_state) 23 | 24 | self.output_shape = self.base_model.output_shape 25 | 26 | def forward(self, observations): 27 | return self.base_model(observations) 28 | 29 | 30 | class ResNetUNet(nn.Module): 31 | def __init__(self, n_channel_in, n_class_out): 32 | super().__init__() 33 | 34 | self.base_model = models.resnet18(pretrained=True) 35 | self.base_model.conv1 = nn.Conv2d(n_channel_in, 64, kernel_size=7, stride=2, padding=3,bias=False) 36 | self.base_layers = list(self.base_model.children()) 37 | 38 | self.layer0 = nn.Sequential(*self.base_layers[:3]) # size=(N, 64, x.H/2, x.W/2) 39 | self.layer0_1x1 = convrelu(64, 64, 1, 0) 40 | self.layer1 = nn.Sequential(*self.base_layers[3:5]) # size=(N, 64, x.H/4, x.W/4) 41 | self.layer1_1x1 = convrelu(64, 64, 1, 0) 42 | self.layer2 = self.base_layers[5] # size=(N, 128, x.H/8, x.W/8) 43 | self.layer2_1x1 = convrelu(128, 128, 1, 0) 44 | self.layer3 = self.base_layers[6] # size=(N, 256, x.H/16, x.W/16) 45 | self.layer3_1x1 = convrelu(256, 256, 1, 0) 46 | self.layer4 = self.base_layers[7] # size=(N, 512, x.H/32, x.W/32) 47 | self.layer4_1x1 = convrelu(512, 512, 1, 0) 48 | 49 | self.upsample = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True) 50 | 51 | self.conv_up3 = convrelu(256 + 512, 512, 3, 1) 52 | self.conv_up2 = convrelu(128 + 512, 256, 3, 1) 53 | self.conv_up1 = convrelu(64 + 256, 256, 3, 1) 54 | self.conv_up0 = convrelu(64 + 256, 128, 3, 1) 55 | 56 | self.conv_original_size0 = convrelu(n_channel_in, 64, 3, 1) 57 | self.conv_original_size1 = convrelu(64, 64, 3, 1) 58 | self.conv_original_size2 = convrelu(64 + 128, 64, 3, 1) 59 | 60 | self.conv_last = nn.Conv2d(64, n_class_out, 1) 61 | 62 | self.output_shape = [512, 7, 7] 63 | 64 | def forward(self, observations): 65 | if 'rgb_features' in observations: 66 | return observations['rgb_features'], None 67 | else: 68 | input = observations['rgb'].permute(0, 3, 1, 2) 69 | B, C, cH, cW = input.shape 70 | input = input.view(B, C, cH, cW) 71 | 72 | x_original = self.conv_original_size0(input) 73 | x_original = self.conv_original_size1(x_original) 74 | 75 | layer0 = self.layer0(input) 76 | layer1 = self.layer1(layer0) 77 | layer2 = self.layer2(layer1) 78 | layer3 = self.layer3(layer2) 79 | layer4 = self.layer4(layer3) 80 | 81 | layer4 = self.layer4_1x1(layer4) 82 | x = self.upsample(layer4) 83 | 84 | layer3 = self.layer3_1x1(layer3) 85 | x = torch.cat([x, layer3], dim=1) 86 | x = self.conv_up3(x) 87 | 88 | x = self.upsample(x) 89 | layer2 = self.layer2_1x1(layer2) 90 | x = torch.cat([x, layer2], dim=1) 91 | x = self.conv_up2(x) 92 | 93 | x = self.upsample(x) 94 | layer1 = self.layer1_1x1(layer1) 95 | x = torch.cat([x, layer1], dim=1) 96 | x = self.conv_up1(x) 97 | 98 | x = self.upsample(x) 99 | layer0 = self.layer0_1x1(layer0) 100 | x = torch.cat([x, layer0], dim=1) 101 | x = self.conv_up0(x) 102 | 103 | x = self.upsample(x) 104 | x = torch.cat([x, x_original], dim=1) 105 | x = self.conv_original_size2(x) 106 | 107 | out = self.conv_last(x) 108 | 109 | proj_feat = x 110 | 111 | return layer4, proj_feat 112 | -------------------------------------------------------------------------------- /vlnce_baselines/common/environments.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from 
typing import Optional 3 | 4 | import habitat 5 | from habitat import Config, Dataset 6 | from habitat.tasks.utils import cartesian_to_polar 7 | from habitat.utils.geometry_utils import quaternion_rotate_vector 8 | from habitat_baselines.common.baseline_registry import baseline_registry 9 | from habitat_extensions.shortest_path_follower import ShortestPathFollowerCompat 10 | 11 | from vlnce_baselines.common.action_maker import GTMapActionMaker, DDPPOActionMaker 12 | 13 | 14 | @baseline_registry.register_env(name="VLNCEDaggerEnv") 15 | class VLNCEDaggerEnv(habitat.RLEnv): 16 | def __init__(self, config: Config, dataset: Optional[Dataset] = None): 17 | super().__init__(config.TASK_CONFIG, dataset) 18 | self.config = config 19 | self.device = self._env._config.SIMULATOR.HABITAT_SIM_V0.GPU_DEVICE_ID 20 | self._success_distance = config.TASK_CONFIG.TASK.SUCCESS_DISTANCE 21 | 22 | self.follower = ShortestPathFollowerCompat(self._env.sim, 0.5, return_one_hot=False) 23 | self.follower.mode = 'geodesic_path' 24 | self.steppppp = 0 25 | 26 | if self.config.use_ddppo: 27 | self.ddppo_action_maker = DDPPOActionMaker(config, self._env) 28 | else: 29 | self.gt_map_action_maker = GTMapActionMaker(config) 30 | 31 | def reset(self): 32 | observation = super(VLNCEDaggerEnv, self).reset() 33 | return observation 34 | 35 | def step(self, action, prog, epidsode_reset_flag=None, depth_img=None): 36 | if self.config.use_ddppo and epidsode_reset_flag is True: 37 | self.ddppo_action_maker.l_policy.reset() 38 | self.ddppo_action_maker.sg_reset() 39 | self.steppppp = 0 40 | 41 | agent_state = self._env._sim.get_agent_state() 42 | if self.config.use_ddppo: 43 | self.waypoint = self.ddppo_action_maker.preprocess(action, agent_state) 44 | action_choice = self.ddppo_action_maker.action_decision(self.steppppp, self.waypoint, depth_img) 45 | else: 46 | self.waypoint = self.gt_map_action_maker.preprocess(action, agent_state) 47 | action_choice = self.gt_map_action_maker.action_decision(self.waypoint, self.follower) 48 | 49 | stop = self.decide_stop(prog) 50 | if stop: 51 | action_choice = 0 52 | 53 | if self._env._elapsed_steps < 24: 54 | action_choice = 2 55 | 56 | observation, reward, done, info = self.step_bak(action_choice) 57 | 58 | self.steppppp += 1 59 | 60 | return observation, reward, done, info 61 | 62 | def step_bak(self, action): 63 | observations, reward, done, info = super().step(action) 64 | return observations, reward, done, info 65 | 66 | def decide_stop(self, prog): 67 | if prog == -1 and self._distance_waypoint(self._env.current_episode.goals[0].position) < 0.5: 68 | return True 69 | elif prog > self.config.STOP_CONDITION.PROG_THRESHOLD: 70 | return True 71 | return False 72 | 73 | def _distance_waypoint(self, waypoint): 74 | agent_position = self._env._sim.get_agent_state().position 75 | return self._env.sim.geodesic_distance(waypoint, agent_position) 76 | 77 | def get_reward_range(self): 78 | return (0.0, 0.0) 79 | 80 | def get_reward(self, observations): 81 | return 0.0 82 | 83 | def get_done(self, observations): 84 | return self._env.episode_over 85 | 86 | def get_info(self, observations): 87 | return self.habitat_env.get_metrics() 88 | 89 | 90 | @baseline_registry.register_env(name="VLNCEInferenceEnv") 91 | class VLNCEInferenceEnv(VLNCEDaggerEnv): 92 | def __init__(self, config: Config, dataset: Optional[Dataset] = None): 93 | super().__init__(config, dataset) 94 | 95 | def get_reward_range(self): 96 | return (0.0, 0.0) 97 | 98 | def get_reward(self, observations): 99 | return 0.0 100 | 101 | def 
get_done(self, observations): 102 | return self._env.episode_over 103 | 104 | def get_info(self, observations): 105 | agent_state = self._env.sim.get_agent_state() 106 | heading_vector = quaternion_rotate_vector( 107 | agent_state.rotation.inverse(), np.array([0, 0, -1]) 108 | ) 109 | heading = cartesian_to_polar(-heading_vector[2], heading_vector[0])[1] 110 | return { 111 | "position": agent_state.position.tolist(), 112 | "heading": heading, 113 | "stop": self._env.task.is_stop_called, 114 | } 115 | -------------------------------------------------------------------------------- /vlnce_baselines/models/encoders/map_encoder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torchvision.models as models 6 | 7 | 8 | def convrelu(in_channels, out_channels, kernel, padding): 9 | return nn.Sequential( 10 | nn.Conv2d(in_channels, out_channels, kernel, padding=padding), 11 | nn.BatchNorm2d(num_features=out_channels), 12 | nn.ReLU(inplace=True), 13 | ) 14 | 15 | 16 | class MapEncoder(nn.Module): 17 | def __init__(self, map_size, input_channel, output_channel): 18 | super().__init__() 19 | self.cnn = nn.Sequential( 20 | nn.Conv2d(input_channel, 64, 8, stride=2, padding=3), # 100 -> 50 21 | nn.BatchNorm2d(num_features=64), 22 | nn.ReLU(inplace=True), 23 | nn.Conv2d(64, 128, 5, stride=2, padding=1), 24 | nn.BatchNorm2d(num_features=128), 25 | nn.ReLU(inplace=True), 26 | nn.Conv2d(128, output_channel, 3, stride=1, padding=1), 27 | nn.BatchNorm2d(num_features=output_channel), 28 | nn.ReLU(inplace=True), 29 | ) 30 | 31 | cnn_dims = np.array([map_size, map_size], dtype=np.float32) 32 | self._cnn_layers_kernel = [(8, 8), (5, 5), (3, 3)] 33 | self._cnn_layers_stride = [(2, 2), (2, 2), (1, 1)] 34 | self._cnn_layers_padding = [(3, 3), (1, 1), (1, 1)] 35 | for kernel, stride, padding in zip(self._cnn_layers_kernel, self._cnn_layers_stride, self._cnn_layers_padding): 36 | cnn_dims = self._conv_output_dim( 37 | dimension=cnn_dims, 38 | padding=np.array(padding, dtype=np.float32), 39 | dilation=np.array([1, 1], dtype=np.float32), 40 | kernel=np.array(kernel, dtype=np.float32), 41 | stride=np.array(stride, dtype=np.float32), 42 | ) 43 | 44 | self.output_shape = [output_channel, cnn_dims[0], cnn_dims[1]] 45 | 46 | def _conv_output_dim(self, dimension, padding, dilation, kernel, stride): 47 | assert len(dimension) == 2 48 | out_dimension = [] 49 | for i in range(len(dimension)): 50 | out_dimension.append( 51 | int( 52 | np.floor( 53 | ( 54 | ( 55 | dimension[i] 56 | + 2 * padding[i] 57 | - dilation[i] * (kernel[i] - 1) 58 | - 1 59 | ) 60 | / stride[i] 61 | ) 62 | + 1 63 | ) 64 | ) 65 | ) 66 | return tuple(out_dimension) 67 | 68 | def forward(self, rgb_map): 69 | return self.cnn(rgb_map) 70 | 71 | 72 | class MapDecoder(nn.Module): 73 | def __init__(self, n_channel_in): 74 | super().__init__() 75 | self.base_model = models.resnet18(pretrained=True) 76 | self.base_model.conv1 = nn.Conv2d(n_channel_in, 64, kernel_size=7, stride=2, padding=3, bias=False) 77 | self.base_layers = list(self.base_model.children()) 78 | 79 | self.layer0 = nn.Sequential(*self.base_layers[:3]) # size=(N, 64, x.H/2, x.W/2) 80 | self.layer0_1x1 = convrelu(64, 64, 1, 0) 81 | self.layer1 = nn.Sequential(*self.base_layers[3:5]) # size=(N, 64, x.H/4, x.W/4) 82 | self.layer1_1x1 = convrelu(64, 64, 1, 0) 83 | 84 | self.upsample = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True) 85 | 86 | self.conv_up0 = convrelu(64 + 64, 
128, 3, 1) 87 | 88 | self.conv_original_size0 = convrelu(n_channel_in, 64, 3, 1) 89 | self.conv_original_size1 = convrelu(64, 64, 3, 1) 90 | self.conv_original_size2 = convrelu(64 + 128, 64, 3, 1) 91 | 92 | self.output_shape = [64, 100, 100] 93 | 94 | def forward(self, input): 95 | x_original = self.conv_original_size0(input) 96 | x_original = self.conv_original_size1(x_original) 97 | 98 | layer0 = self.layer0(input) 99 | layer1 = self.layer1(layer0) 100 | 101 | layer1 = self.layer1_1x1(layer1) 102 | x = self.upsample(layer1) 103 | 104 | layer0 = self.layer0_1x1(layer0) 105 | x = torch.cat([x, layer0], dim=1) 106 | x = self.conv_up0(x) 107 | 108 | x = self.upsample(x) 109 | x = torch.cat([x, x_original], dim=1) 110 | x = self.conv_original_size2(x) 111 | 112 | return x 113 | -------------------------------------------------------------------------------- /vlnce_baselines/models/policy.py: -------------------------------------------------------------------------------- 1 | from gym import Space 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | from habitat import Config 8 | from habitat_baselines.rl.ppo.policy import CriticHead 9 | 10 | from vlnce_baselines.models.mg_map_policy import MGMapNet 11 | from vlnce_baselines.common.distributions import DiagGaussian 12 | from vlnce_baselines.common.aux_losses import AuxLosses 13 | 14 | 15 | class BasePolicy(nn.Module): 16 | def __init__(self, observation_space: Space, action_space: Space, model_config: Config): 17 | super(BasePolicy, self).__init__() 18 | self.model_config = model_config 19 | 20 | # Forward Network 21 | self.net = MGMapNet(observation_space, model_config) 22 | 23 | # Actor_critic Network 24 | self.action_distribution = DiagGaussian(self.net.output_size, action_space.shape[0]) 25 | self.critic = CriticHead(self.net.output_size) 26 | 27 | # Aux Network 28 | self.prog_pred = nn.Linear(model_config.STATE_ENCODER.hidden_size, 1) 29 | 30 | def update_map(self, observations, masks): 31 | _, rgb_embedding_proj = self.net.rgb_encoder(observations) 32 | self.net.rgb_mapping_module(rgb_embedding_proj, observations, masks) 33 | 34 | def act( 35 | self, 36 | observations, 37 | rnn_hidden_states, 38 | prev_actions, 39 | masks, 40 | deterministic=False, 41 | ): 42 | features, rnn_hidden_states, pred_map = self.net( 43 | observations, rnn_hidden_states, prev_actions, masks 44 | ) 45 | self.aux_prediction(features, observations, pred_map) 46 | distribution = self.action_distribution(features) 47 | value = self.critic(features) 48 | 49 | if deterministic: 50 | action = distribution.mode() 51 | else: 52 | action = distribution.sample() 53 | 54 | action_log_probs = distribution.log_probs(action) 55 | 56 | return value, action, action_log_probs, rnn_hidden_states 57 | 58 | def aux_prediction(self, features, observations, pred_map): 59 | self.prog = torch.tanh(self.prog_pred(features)) 60 | 61 | # Calculate loss 62 | if AuxLosses.is_active(): 63 | if self.model_config.PREDICTION_MONITOR.use: 64 | target_map = torch.nn.functional.interpolate(observations['gt_semantic_map'].unsqueeze(1), size=(48, 48)).squeeze().long() 65 | prediction_loss = F.cross_entropy(pred_map, target_map, reduction='none') 66 | prediction_loss = prediction_loss.mean([1,2]) 67 | AuxLosses.register_loss('prediction_monitor', prediction_loss, self.model_config.PREDICTION_MONITOR.alpha) 68 | 69 | if self.model_config.CONTRASTIVE_MONITOR.use: 70 | feature_size = self.net.map_encoder.output_shape[-1] 71 | 72 | if 'gt_path' in observations.keys(): 
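# Prefer the ground-truth path map as the distance map when the sensor provides it;
# otherwise fall back to the waypoint-distribution observation.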
73 | dis_map = observations['gt_path'] 74 | else: 75 | dis_map = observations['waypoint_distribution'] 76 | target = (dis_map.max() - dis_map) / (dis_map.max() - dis_map.min()) 77 | target = F.interpolate(target.unsqueeze(1), size=[feature_size, feature_size], mode='area').squeeze(1) 78 | target = target.reshape(target.shape[0], -1) 79 | target = F.softmax(target/self.model_config.CONTRASTIVE_MONITOR.target_tau, dim=1) 80 | pred = self.net.att_map_t_m 81 | 82 | kl_loss = F.kl_div(torch.log(pred), target, reduction='none') 83 | kl_loss = kl_loss.mean(-1) 84 | AuxLosses.register_loss('contrastive_monitor', kl_loss, self.model_config.CONTRASTIVE_MONITOR.alpha) 85 | 86 | if self.model_config.PROGRESS_MONITOR.use: 87 | progress_loss = F.mse_loss(self.prog, observations['progress'], reduction='none') 88 | progress_loss = progress_loss.mean(-1) 89 | AuxLosses.register_loss('progress_monitor', progress_loss, self.model_config.PROGRESS_MONITOR.alpha) 90 | 91 | def forward(self, observations, rnn_hidden_states, prev_actions, masks, weights): 92 | features, rnn_hidden_states, pred_map = self.net( 93 | observations, rnn_hidden_states, prev_actions, masks 94 | ) 95 | 96 | distribution = self.action_distribution(features) 97 | pred = distribution.mean 98 | 99 | self.aux_prediction(features, observations, pred_map) 100 | aux_mask = (weights > 0).view(-1) 101 | aux_loss = AuxLosses.reduce(aux_mask) 102 | 103 | return pred, aux_loss 104 | -------------------------------------------------------------------------------- /habitat_extensions/task.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | import os 4 | from typing import List, Optional 5 | import numpy as np 6 | 7 | import attr 8 | from habitat.config import Config 9 | from habitat.core.dataset import Dataset 10 | from habitat.core.registry import registry 11 | from habitat.datasets.pointnav.pointnav_dataset import ALL_SCENES_MASK 12 | from habitat.datasets.utils import VocabDict 13 | from habitat.tasks.nav.nav import NavigationGoal 14 | from habitat.tasks.vln.vln import InstructionData, VLNEpisode 15 | 16 | DEFAULT_SCENE_PATH_PREFIX = "data/scene_datasets/" 17 | 18 | 19 | @attr.s(auto_attribs=True, kw_only=True) 20 | class VLNExtendedEpisode(VLNEpisode): 21 | r""" 22 | instruction_index_string: optional identifier of instruction. 23 | """ 24 | instruction_index_string: Optional[str] = attr.ib(default=None) 25 | goals: Optional[List[NavigationGoal]] = attr.ib(default=None) 26 | reference_path: Optional[List[List[float]]] = attr.ib(default=None) 27 | 28 | 29 | @registry.register_dataset(name="VLN-CE-v1") 30 | class VLNCEDatasetV1(Dataset): 31 | r"""Class inherited from Dataset that loads a Vision and Language 32 | Navigation dataset. 33 | """ 34 | 35 | episodes: List[VLNEpisode] 36 | instruction_vocab: VocabDict 37 | 38 | @staticmethod 39 | def check_config_paths_exist(config: Config) -> bool: 40 | return os.path.exists( 41 | config.DATA_PATH.format(split=config.SPLIT) 42 | ) and os.path.exists(config.SCENES_DIR) 43 | 44 | @staticmethod 45 | def _scene_from_episode(episode: VLNExtendedEpisode) -> str: 46 | r"""Helper method to get the scene name from an episode. Assumes 47 | the scene_id is formated /path/to/. 
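The scene name is recovered as the file basename with its extension stripped.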
48 | """ 49 | return os.path.splitext(os.path.basename(episode.scene_id))[0] 50 | 51 | @classmethod 52 | def get_scenes_to_load(cls, config: Config) -> List[str]: 53 | r"""Return a sorted list of scenes 54 | """ 55 | assert cls.check_config_paths_exist(config) 56 | dataset = cls(config) 57 | scenes = {cls._scene_from_episode(episode) for episode in dataset.episodes} 58 | 59 | return sorted(list(scenes)) 60 | 61 | def _split_dataset(self, config): 62 | all_scene = [] 63 | for ep in self.episodes: 64 | if ep.scene_id not in all_scene: 65 | all_scene.append(ep.scene_id) 66 | 67 | data_dict = [[] for _ in range(len(all_scene))] 68 | for ep in self.episodes: 69 | data_dict[all_scene.index(ep.scene_id)].append(ep) 70 | 71 | split_episode = [] 72 | for scene in range(len(all_scene)): 73 | if len(data_dict[scene]) < 4: 74 | continue 75 | split_num = int(np.floor(len(data_dict[scene]) / config.split_num)) 76 | split_scene = [data_dict[scene][i: i+split_num] for i in range(0, len(data_dict[scene]), split_num)] 77 | if len(split_scene) > config.split_num: 78 | split_scene[-2].extend(split_scene[-1]) 79 | del split_scene[-1] 80 | split_episode.extend(split_scene[config.split_rank]) 81 | 82 | return split_episode 83 | 84 | def __init__(self, config: Optional[Config] = None) -> None: 85 | self.episodes = [] 86 | 87 | if config is None: 88 | return 89 | 90 | dataset_filename = config.DATA_PATH.format(split=config.SPLIT) 91 | with gzip.open(dataset_filename, "rt") as f: 92 | self.from_json(f.read(), scenes_dir=config.SCENES_DIR) 93 | 94 | if config.split_num > 1: 95 | self.episodes = self._split_dataset(config) 96 | 97 | if ALL_SCENES_MASK not in config.CONTENT_SCENES: 98 | scenes_to_load = set(config.CONTENT_SCENES) 99 | self.episodes = [ 100 | episode 101 | for episode in self.episodes 102 | if self._scene_from_episode(episode) in scenes_to_load 103 | ] 104 | 105 | def from_json(self, json_str: str, scenes_dir: Optional[str] = None) -> None: 106 | 107 | deserialized = json.loads(json_str) 108 | self.instruction_vocab = VocabDict( 109 | word_list=deserialized["instruction_vocab"]["word_list"] 110 | ) 111 | 112 | for episode in deserialized["episodes"]: 113 | episode = VLNExtendedEpisode(**episode) 114 | 115 | if scenes_dir is not None: 116 | if episode.scene_id.startswith(DEFAULT_SCENE_PATH_PREFIX): 117 | episode.scene_id = episode.scene_id[ 118 | len(DEFAULT_SCENE_PATH_PREFIX) : 119 | ] 120 | 121 | episode.scene_id = os.path.join(scenes_dir, episode.scene_id) 122 | 123 | episode.instruction = InstructionData(**episode.instruction) 124 | if episode.goals is not None: 125 | for g_index, goal in enumerate(episode.goals): 126 | episode.goals[g_index] = NavigationGoal(**goal) 127 | self.episodes.append(episode) 128 | -------------------------------------------------------------------------------- /vlnce_baselines/common/action_maker.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from habitat.utils.visualizations import maps 4 | 5 | from vlnce_baselines.common.utils import TransfomationRealworldAgent 6 | from vlnce_baselines.models.ddppo_policy import DdppoPolicy, SemanticGrid, utils 7 | 8 | 9 | class ActionMaker(): 10 | def __init__(self, config) -> None: 11 | self.config = config 12 | self.ego_map_size = config.ego_map_size 13 | self.map_range_max = maps.COORDINATE_MAX 14 | self.map_range_min = maps.COORDINATE_MIN 15 | self.map_size = 1250 16 | 17 | def preprocess(self, action, agent_state): 18 | resolution = (self.map_range_max - 
self.map_range_min) / self.map_size 19 | tran_real_to_agent = TransfomationRealworldAgent(agent_state) 20 | 21 | waypoint_norm = torch.tanh(action) 22 | waypoint_a = torch.zeros([3]) 23 | waypoint_a[0] = waypoint_norm[0] * (self.ego_map_size / 2) * resolution 24 | waypoint_a[2] = -waypoint_norm[1] * (self.ego_map_size / 2) * resolution 25 | 26 | waypoint_w = tran_real_to_agent.agent2realworld(waypoint_a) 27 | 28 | return waypoint_w 29 | 30 | def action_decision(self) -> int: 31 | pass 32 | 33 | 34 | class GTMapActionMaker(ActionMaker): 35 | def __init__(self, config) -> None: 36 | super().__init__(config) 37 | 38 | def action_decision(self, goal, follower) -> int: 39 | action = follower.get_next_action(goal) 40 | 41 | if action is None: 42 | action = 1 43 | 44 | return action 45 | 46 | 47 | class DDPPOActionMaker(ActionMaker): 48 | def __init__(self, config, _env) -> None: 49 | super().__init__(config) 50 | self.utils = utils() 51 | self._env = _env 52 | self.device = torch.device("cuda", self._env._config.SIMULATOR.HABITAT_SIM_V0.GPU_DEVICE_ID) 53 | self.grid_dim = (192, 192) 54 | self.global_dim = (512,512) 55 | self.heatmap_size = 24 56 | self.cell_size = 0.05 57 | self.img_size = (256, 256) 58 | self.n_object_classes = 27 59 | self.n_spatial_classes = 3 60 | model_path = 'data/pretrain_model/ddppo-models/gibson-4plus-mp3d-train-val-test-resnet50.pth' 61 | self.l_policy = DdppoPolicy(path=model_path) 62 | self.l_policy = self.l_policy.to(self.device) 63 | self.sg_reset() 64 | 65 | def sg_reset(self): 66 | self.sg_global = SemanticGrid( 67 | 1, self.global_dim, self.heatmap_size, self.cell_size, 68 | spatial_labels=self.n_spatial_classes, object_labels=self.n_object_classes, 69 | device=self.device, 70 | ) 71 | self.abs_poses = [] 72 | self.agent_height = [] 73 | # long term goal in global grid map 74 | self.ltg_abs_coords = torch.zeros((1, 1, 2), dtype=torch.int64).to(self.device) 75 | self.ltg_abs_coords_list = [] 76 | 77 | def run_local_policy(self, depth, goal, pose_coords, rel_agent_o, step): 78 | planning_goal = goal.squeeze(0).squeeze(0) 79 | planning_pose = pose_coords.squeeze(0).squeeze(0) 80 | 81 | sq = torch.square(planning_goal[0]-planning_pose[0]) + torch.square(planning_goal[1]-planning_pose[1]) 82 | rho = torch.sqrt(sq.float()) 83 | phi = torch.atan2(((planning_pose[0]-planning_goal[0]).float()), (planning_pose[1]-planning_goal[1]).float()) 84 | phi = phi - rel_agent_o 85 | rho = rho * self.cell_size 86 | 87 | point_goal_with_gps_compass = torch.tensor([rho,phi], dtype=torch.float32).to(self.device) 88 | depth = depth.reshape(self.img_size[0], self.img_size[1], 1) 89 | return self.l_policy.plan(depth, point_goal_with_gps_compass, step) 90 | 91 | def transform_waypoint2cm2(self, t, ltg): 92 | ltg_cm2 = [] 93 | ltg_cm2.append(-ltg[2]) 94 | ltg_cm2.append(-ltg[0]) 95 | 96 | agent_state = self._env.sim.get_agent_state() 97 | agent_pose, y_height = self.utils.get_sim_location(agent_state) 98 | ltg_cm2.append(agent_pose[2]) 99 | self.abs_poses.append(agent_pose) 100 | self.agent_height.append(y_height) 101 | 102 | rel_abs_pose = self.utils.get_rel_pose(self.abs_poses[t], self.abs_poses[0]) 103 | _rel_abs_pose = torch.Tensor(rel_abs_pose).unsqueeze(0).float() 104 | _rel_abs_pose = _rel_abs_pose.to(self.device) 105 | abs_pose_coords = self.utils.get_coord_pose(self.sg_global, _rel_abs_pose, self.abs_poses[0], self.global_dim[0], self.cell_size, self.device) # B x T x 3 106 | 107 | rel_ltg_abs_pose = self.utils.get_rel_pose(pos2=ltg_cm2, pos1=self.abs_poses[0]) 108 | 
_rel_ltg_abs_pose = torch.Tensor(rel_ltg_abs_pose).unsqueeze(0).float() 109 | _rel_ltg_abs_pose = _rel_ltg_abs_pose.to(self.device) 110 | ltg_coords = self.utils.get_coord_pose(self.sg_global, _rel_ltg_abs_pose, self.abs_poses[0], self.global_dim[0], self.cell_size, self.device) 111 | 112 | return ltg_coords, abs_pose_coords, rel_abs_pose 113 | 114 | def action_decision(self, t, ltg, depth): 115 | ltg_abs_coords, abs_pose_coords, rel_abs_pose = self.transform_waypoint2cm2(t, ltg) 116 | depth = torch.tensor(depth).to(self.device) 117 | action_id = self.run_local_policy( 118 | depth=depth, 119 | goal=ltg_abs_coords.clone(), 120 | pose_coords=abs_pose_coords.clone(), 121 | rel_agent_o=rel_abs_pose[2], 122 | step=t, 123 | ) 124 | return action_id 125 | -------------------------------------------------------------------------------- /habitat_extensions/config/default.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Union 2 | 3 | from habitat.config.default import Config as CN 4 | from habitat.config.default import get_config 5 | 6 | 7 | _C = get_config() 8 | _C.defrost() 9 | 10 | # ----------------------------------------------------------------------------- 11 | # VLN ORACLE ACTION SENSOR 12 | # ----------------------------------------------------------------------------- 13 | _C.TASK.VLN_ORACLE_ACTION_SENSOR = CN() 14 | _C.TASK.VLN_ORACLE_ACTION_SENSOR.TYPE = "VLNOracleActionSensor" 15 | _C.TASK.VLN_ORACLE_ACTION_SENSOR.GOAL_RADIUS = 0.5 16 | # compatibility with the dataset generation oracle and paper results. 17 | # if False, use the ShortestPathFollower in Habitat 18 | _C.TASK.VLN_ORACLE_ACTION_SENSOR.USE_ORIGINAL_FOLLOWER = True 19 | # ----------------------------------------------------------------------------- 20 | # VLN ORACLE PROGRESS SENSOR 21 | # ----------------------------------------------------------------------------- 22 | _C.TASK.VLN_ORACLE_PROGRESS_SENSOR = CN() 23 | _C.TASK.VLN_ORACLE_PROGRESS_SENSOR.TYPE = "VLNOracleProgressSensor" 24 | # ----------------------------------------------------------------------------- 25 | # VLN ORACLE WAYPOINT SENSOR 26 | # ----------------------------------------------------------------------------- 27 | _C.TASK.VLN_ORACLE_WAYPOINT_SENSOR = CN() 28 | _C.TASK.VLN_ORACLE_WAYPOINT_SENSOR.TYPE = "VLNOracleWaypointSensor" 29 | _C.TASK.VLN_ORACLE_WAYPOINT_SENSOR.GOAL_RADIUS = 0.5 30 | _C.TASK.VLN_ORACLE_WAYPOINT_SENSOR.USE_ORIGINAL_FOLLOWER = True 31 | _C.TASK.VLN_ORACLE_WAYPOINT_SENSOR.MAP_SIZE = 100 32 | _C.TASK.VLN_ORACLE_WAYPOINT_SENSOR.MAP_RESOLUTION = 1250 33 | _C.TASK.VLN_ORACLE_WAYPOINT_SENSOR.LAW = CN() 34 | _C.TASK.VLN_ORACLE_WAYPOINT_SENSOR.LAW.USE = True 35 | _C.TASK.VLN_ORACLE_WAYPOINT_SENSOR.LAW.SPLIT = "train" 36 | _C.TASK.VLN_ORACLE_WAYPOINT_SENSOR.LAW.GT_PATH = "data/datasets/R2R_VLNCE_v1-2_preprocessed/{split}/{split}_gt.json.gz" 37 | _C.TASK.VLN_ORACLE_WAYPOINT_SENSOR.LAW.IS_SPARSE = True 38 | _C.TASK.VLN_ORACLE_WAYPOINT_SENSOR.LAW.NUM_WAYPOINTS = 6 39 | # ----------------------------------------------------------------------------- 40 | # VLN ORACLE PATH SENSOR 41 | # ----------------------------------------------------------------------------- 42 | _C.TASK.VLN_ORACLE_PATH_SENSOR = CN() 43 | _C.TASK.VLN_ORACLE_PATH_SENSOR.TYPE = "VLNOraclePathSensor" 44 | _C.TASK.VLN_ORACLE_PATH_SENSOR.MAP_RESOLUTION = 1250 45 | _C.TASK.VLN_ORACLE_PATH_SENSOR.MAP_SIZE = 100 46 | _C.TASK.VLN_ORACLE_PATH_SENSOR.LINE_WIDTH = 1 47 | # 
----------------------------------------------------------------------------- 48 | # SEMANTIC FILTER SENSOR 49 | # ----------------------------------------------------------------------------- 50 | _C.TASK.SEMANTIC_FILTER_SENSOR = CN() 51 | _C.TASK.SEMANTIC_FILTER_SENSOR.TYPE = "SemanticFilterSensor" 52 | _C.TASK.SEMANTIC_FILTER_SENSOR.HEIGHT = 256 53 | _C.TASK.SEMANTIC_FILTER_SENSOR.WIDTH = 256 54 | _C.TASK.SEMANTIC_FILTER_SENSOR.CATEGORY = 27 55 | # ----------------------------------------------------------------------------- 56 | # GT SEMANTIC MAP SENSOR 57 | # ----------------------------------------------------------------------------- 58 | _C.TASK.GT_SEMANTIC_MAP_SENSOR = CN() 59 | _C.TASK.GT_SEMANTIC_MAP_SENSOR.TYPE = "GtSemanticMapSensor" 60 | _C.TASK.GT_SEMANTIC_MAP_SENSOR.MAP_SIZE = 100 61 | _C.TASK.GT_SEMANTIC_MAP_SENSOR.SPLIT = 'train' # 'train', 'train_aug' 62 | # ----------------------------------------------------------------------------- 63 | # HEADING SENSOR 64 | # ----------------------------------------------------------------------------- 65 | _C.TASK.HEADING_SENSOR = CN() 66 | _C.TASK.HEADING_SENSOR.TYPE = "HeadingSensor" 67 | 68 | 69 | # ----------------------------------------------------------------------------- 70 | # NDTW MEASUREMENT 71 | # ----------------------------------------------------------------------------- 72 | _C.TASK.NDTW = CN() 73 | _C.TASK.NDTW.TYPE = "NDTW" 74 | _C.TASK.NDTW.SPLIT = "val_seen" 75 | _C.TASK.NDTW.FDTW = True # False: DTW 76 | _C.TASK.NDTW.GT_PATH = ( 77 | "data/datasets/R2R_VLNCE_v1-2_preprocessed/{split}/{split}_gt.json" 78 | ) 79 | _C.TASK.NDTW.SUCCESS_DISTANCE = 0.2 80 | # ----------------------------------------------------------------------------- 81 | # SDTW MEASUREMENT 82 | # ----------------------------------------------------------------------------- 83 | _C.TASK.SDTW = CN() 84 | _C.TASK.SDTW.TYPE = "SDTW" 85 | _C.TASK.SDTW.SPLIT = "val_seen" 86 | _C.TASK.SDTW.FDTW = True # False: DTW 87 | _C.TASK.SDTW.GT_PATH = ( 88 | "data/datasets/R2R_VLNCE_v1-2_preprocessed/{split}/{split}_gt.json" 89 | ) 90 | _C.TASK.SDTW.SUCCESS_DISTANCE = 0.2 91 | # ----------------------------------------------------------------------------- 92 | # PATH_LENGTH MEASUREMENT 93 | # ----------------------------------------------------------------------------- 94 | _C.TASK.PATH_LENGTH = CN() 95 | _C.TASK.PATH_LENGTH.TYPE = "PathLength" 96 | # ----------------------------------------------------------------------------- 97 | # ORACLE_NAVIGATION_ERROR MEASUREMENT 98 | # ----------------------------------------------------------------------------- 99 | _C.TASK.ORACLE_NAVIGATION_ERROR = CN() 100 | _C.TASK.ORACLE_NAVIGATION_ERROR.TYPE = "OracleNavigationError" 101 | # ----------------------------------------------------------------------------- 102 | # ORACLE_SUCCESS MEASUREMENT 103 | # ----------------------------------------------------------------------------- 104 | _C.TASK.ORACLE_SUCCESS = CN() 105 | _C.TASK.ORACLE_SUCCESS.TYPE = "OracleSuccess" 106 | _C.TASK.ORACLE_SUCCESS.SUCCESS_DISTANCE = 0.2 107 | # ----------------------------------------------------------------------------- 108 | # ORACLE_SPL MEASUREMENT 109 | # ----------------------------------------------------------------------------- 110 | _C.TASK.ORACLE_SPL = CN() 111 | _C.TASK.ORACLE_SPL.TYPE = "OracleSPL" 112 | _C.TASK.ORACLE_SPL.SUCCESS_DISTANCE = 0.2 113 | # ----------------------------------------------------------------------------- 114 | # STEPS_TAKEN MEASUREMENT 115 | # 
----------------------------------------------------------------------------- 116 | _C.TASK.STEPS_TAKEN = CN() 117 | _C.TASK.STEPS_TAKEN.TYPE = "StepsTaken" 118 | 119 | _C.DATASET.split_num = 0 120 | _C.DATASET.split_rank = 0 121 | 122 | 123 | def get_extended_config( 124 | config_paths: Optional[Union[List[str], str]] = None, opts: Optional[list] = None 125 | ) -> CN: 126 | """Create a unified config with default values overwritten by values from 127 | :p:`config_paths` and overwritten by options from :p:`opts`. 128 | :param config_paths: List of config paths or string that contains comma 129 | separated list of config paths. 130 | :param opts: Config options (keys, values) in a list (e.g., passed from 131 | command line into the config. For example, 132 | :py:`opts = ['FOO.BAR', 0.5]`. Argument can be used for parameter 133 | sweeping or quick tests. 134 | """ 135 | config = _C.clone() 136 | 137 | if config_paths: 138 | if isinstance(config_paths, str): 139 | config_paths = [config_paths] 140 | 141 | for config_path in config_paths: 142 | config.merge_from_file(config_path) 143 | 144 | if opts: 145 | config.merge_from_list(opts) 146 | config.freeze() 147 | return config 148 | -------------------------------------------------------------------------------- /habitat_extensions/shortest_path_follower.py: -------------------------------------------------------------------------------- 1 | # Copied from https://github.com/facebookresearch/habitat-lab/blob/v0.1.4/habitat/tasks/nav/shortest_path_follower.py 2 | # Use the Habitat v0.1.4 ShortestPathFollower for compatibility with 3 | # the dataset generation oracle. 4 | 5 | from typing import Optional, Union 6 | 7 | import habitat_sim 8 | import numpy as np 9 | from habitat.sims.habitat_simulator.actions import HabitatSimActions 10 | from habitat.sims.habitat_simulator.habitat_simulator import HabitatSim 11 | from habitat.utils.geometry_utils import ( 12 | angle_between_quaternions, 13 | quaternion_from_two_vectors, 14 | ) 15 | 16 | EPSILON = 1e-6 17 | 18 | 19 | def action_to_one_hot(action: int) -> np.array: 20 | one_hot = np.zeros(len(HabitatSimActions), dtype=np.float32) 21 | one_hot[action] = 1 22 | return one_hot 23 | 24 | 25 | class ShortestPathFollowerCompat: 26 | r"""Utility class for extracting the action on the shortest path to the 27 | goal. 28 | Args: 29 | sim: HabitatSim instance. 30 | goal_radius: Distance between the agent and the goal for it to be 31 | considered successful. 32 | return_one_hot: If true, returns a one-hot encoding of the action 33 | (useful for training ML agents). If false, returns the 34 | SimulatorAction. 
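A minimal usage sketch (sim and goal_position are placeholders for objects supplied by the surrounding task code):
    follower = ShortestPathFollowerCompat(sim, goal_radius=0.5, return_one_hot=False)
    action = follower.get_next_action(goal_position)  # an action id, or None once the agent is within goal_radius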
35 | """ 36 | 37 | def __init__( 38 | self, sim: HabitatSim, goal_radius: float, return_one_hot: bool = True 39 | ): 40 | assert ( 41 | getattr(sim, "geodesic_distance", None) is not None 42 | ), "{} must have a method called geodesic_distance".format(type(sim).__name__) 43 | 44 | self._sim = sim 45 | self._max_delta = self._sim.config.FORWARD_STEP_SIZE - EPSILON 46 | self._goal_radius = goal_radius 47 | self._step_size = self._sim.config.FORWARD_STEP_SIZE 48 | 49 | self._mode = ( 50 | "geodesic_path" 51 | if getattr(sim, "get_straight_shortest_path_points", None) is not None 52 | else "greedy" 53 | ) 54 | self._return_one_hot = return_one_hot 55 | 56 | def _get_return_value(self, action) -> Union[int, np.array]: 57 | if self._return_one_hot: 58 | return action_to_one_hot(action) 59 | else: 60 | return action 61 | 62 | def get_next_action(self, goal_pos: np.array) -> Optional[Union[int, np.array]]: 63 | """Returns the next action along the shortest path. 64 | """ 65 | if ( 66 | self._sim.geodesic_distance(self._sim.get_agent_state().position, goal_pos) 67 | <= self._goal_radius 68 | ): 69 | return None 70 | 71 | max_grad_dir = self._est_max_grad_dir(goal_pos) 72 | if max_grad_dir is None: 73 | return self._get_return_value(HabitatSimActions.MOVE_FORWARD) 74 | return self._step_along_grad(max_grad_dir) 75 | 76 | def _step_along_grad(self, grad_dir: np.quaternion) -> Union[int, np.array]: 77 | current_state = self._sim.get_agent_state() 78 | alpha = angle_between_quaternions(grad_dir, current_state.rotation) 79 | if alpha <= np.deg2rad(self._sim.config.TURN_ANGLE) + EPSILON: 80 | return self._get_return_value(HabitatSimActions.MOVE_FORWARD) 81 | else: 82 | sim_action = HabitatSimActions.TURN_LEFT 83 | self._sim.step(sim_action) 84 | best_turn = ( 85 | HabitatSimActions.TURN_LEFT 86 | if ( 87 | angle_between_quaternions( 88 | grad_dir, self._sim.get_agent_state().rotation 89 | ) 90 | < alpha 91 | ) 92 | else HabitatSimActions.TURN_RIGHT 93 | ) 94 | self._reset_agent_state(current_state) 95 | return self._get_return_value(best_turn) 96 | 97 | def _reset_agent_state(self, state: habitat_sim.AgentState) -> None: 98 | self._sim.set_agent_state(state.position, state.rotation, reset_sensors=False) 99 | 100 | def _geo_dist(self, goal_pos: np.array) -> float: 101 | return self._sim.geodesic_distance( 102 | self._sim.get_agent_state().position, goal_pos 103 | ) 104 | 105 | def _est_max_grad_dir(self, goal_pos: np.array) -> np.array: 106 | 107 | current_state = self._sim.get_agent_state() 108 | current_pos = current_state.position 109 | 110 | if self.mode == "geodesic_path": 111 | points = self._sim.get_straight_shortest_path_points( 112 | self._sim.get_agent_state().position, goal_pos 113 | ) 114 | # Add a little offset as things get weird if 115 | # points[1] - points[0] is anti-parallel with forward 116 | if len(points) < 2: 117 | return None 118 | max_grad_dir = quaternion_from_two_vectors( 119 | self._sim.forward_vector, 120 | points[1] 121 | - points[0] 122 | + EPSILON * np.cross(self._sim.up_vector, self._sim.forward_vector), 123 | ) 124 | max_grad_dir.x = 0 125 | max_grad_dir = np.normalized(max_grad_dir) 126 | else: 127 | current_rotation = self._sim.get_agent_state().rotation 128 | current_dist = self._geo_dist(goal_pos) 129 | 130 | best_geodesic_delta = -2 * self._max_delta 131 | best_rotation = current_rotation 132 | for _ in range(0, 360, self._sim.config.TURN_ANGLE): 133 | sim_action = HabitatSimActions.MOVE_FORWARD 134 | self._sim.step(sim_action) 135 | new_delta = current_dist - 
self._geo_dist(goal_pos) 136 | 137 | if new_delta > best_geodesic_delta: 138 | best_rotation = self._sim.get_agent_state().rotation 139 | best_geodesic_delta = new_delta 140 | 141 | # If the best delta is within (1 - cos(TURN_ANGLE))% of the 142 | # best delta (the step size), then we almost certainly have 143 | # found the max grad dir and should just exit 144 | if np.isclose( 145 | best_geodesic_delta, 146 | self._max_delta, 147 | rtol=1 - np.cos(np.deg2rad(self._sim.config.TURN_ANGLE)), 148 | ): 149 | break 150 | 151 | self._sim.set_agent_state( 152 | current_pos, 153 | self._sim.get_agent_state().rotation, 154 | reset_sensors=False, 155 | ) 156 | 157 | sim_action = HabitatSimActions.TURN_LEFT 158 | self._sim.step(sim_action) 159 | 160 | self._reset_agent_state(current_state) 161 | 162 | max_grad_dir = best_rotation 163 | 164 | return max_grad_dir 165 | 166 | @property 167 | def mode(self): 168 | return self._mode 169 | 170 | @mode.setter 171 | def mode(self, new_mode: str): 172 | r"""Sets the mode for how the greedy follower determines the best next 173 | step. 174 | Args: 175 | new_mode: geodesic_path indicates using the simulator's shortest 176 | path algorithm to find points on the map to navigate between. 177 | greedy indicates trying to move forward at all possible 178 | orientations and selecting the one which reduces the geodesic 179 | distance the most. 180 | """ 181 | assert new_mode in {"geodesic_path", "greedy"} 182 | if new_mode == "geodesic_path": 183 | assert ( 184 | getattr(self._sim, "get_straight_shortest_path_points", None) 185 | is not None 186 | ) 187 | self._mode = new_mode 188 | -------------------------------------------------------------------------------- /vlnce_baselines/config/default.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List, Optional, Union 3 | 4 | from habitat.config.default import Config as CN 5 | from habitat_extensions.config.default import get_extended_config as get_task_config 6 | 7 | 8 | # ----------------------------------------------------------------------------- 9 | # EXPERIMENT CONFIG 10 | # ----------------------------------------------------------------------------- 11 | _C = CN() 12 | _C.BASE_TASK_CONFIG_PATH = "habitat_extensions/config/vlnce_task.yaml" 13 | _C.TASK_CONFIG = CN() # task_config will be stored as a config node 14 | _C.CMD_TRAILING_OPTS = [] # store command line options as list of strings 15 | _C.TRAINER_NAME = "dagger" 16 | _C.ENV_NAME = "VLNCEDaggerEnv" 17 | _C.SIMULATOR_GPU_ID = 0 18 | _C.SIMULATOR_GPU_IDS = None 19 | _C.TORCH_GPU_ID = 0 20 | _C.NUM_PROCESSES = 4 21 | _C.VIDEO_OPTION = [] # options: "disk", "tensorboard" 22 | _C.VIDEO_DIR = "videos/debug" 23 | _C.TENSORBOARD_DIR = "data/tensorboard_dirs/debug" 24 | _C.SENSORS = ["RGB_SENSOR", "DEPTH_SENSOR"] 25 | _C.CHECKPOINT_FOLDER = "data/checkpoints" 26 | _C.LOG_FILE = "train.log" 27 | _C.EVAL_CKPT_PATH_DIR = "data/checkpoints" # path to ckpt or path to ckpts dir 28 | _C.NUM_UPDATES = 300000 29 | _C.CHECKPOINT_INTERVAL = 512000 30 | 31 | # ----------------------------------------------------------------------------- 32 | # EVAL CONFIG 33 | # ----------------------------------------------------------------------------- 34 | _C.EVAL = CN() 35 | _C.EVAL.SPLIT = "val_seen" # The split to evaluate on 36 | _C.EVAL.USE_CKPT_CONFIG = True 37 | _C.EVAL.EPISODE_COUNT = 2 38 | 39 | # ----------------------------------------------------------------------------- 40 | # INFERENCE CONFIG 41 | # 
----------------------------------------------------------------------------- 42 | _C.INFERENCE = CN() 43 | _C.INFERENCE.SPLIT = "test" 44 | _C.INFERENCE.USE_CKPT_CONFIG = True 45 | _C.INFERENCE.CKPT_PATH = "data/checkpoints/CMA_PM_DA_Aug.pth" 46 | _C.INFERENCE.PREDICTIONS_FILE = "predictions.json" 47 | 48 | # ----------------------------------------------------------------------------- 49 | # DAGGER ENVIRONMENT CONFIG 50 | # ----------------------------------------------------------------------------- 51 | _C.DAGGER = CN() 52 | _C.DAGGER.LR = 2.5e-4 53 | _C.DAGGER.ITERATIONS = 10 54 | _C.DAGGER.EPOCHS = 4 55 | _C.DAGGER.UPDATE_SIZE = 5000 56 | _C.DAGGER.BATCH_SIZE = 5 57 | _C.DAGGER.P = 0.75 58 | _C.DAGGER.LMDB_MAP_SIZE = 5.0e12 59 | # How often to commit the writes to the DB, less commits is 60 | # better, but everything must be in memory until a commit happens/ 61 | _C.DAGGER.LMDB_COMMIT_FREQUENCY = 50 62 | _C.DAGGER.USE_IW = True 63 | # If True, load precomputed features directly from LMDB_FEATURES_DIR. 64 | _C.DAGGER.PRELOAD_LMDB_FEATURES = False 65 | _C.DAGGER.LMDB_FEATURES_DIR = "data/trajectories_dirs/debug/trajectories.lmdb" 66 | # load an already trained model for fine tuning 67 | _C.DAGGER.LOAD_FROM_CKPT = False 68 | _C.DAGGER.CKPT_TO_LOAD = "data/checkpoints/ckpt.0.pth" 69 | 70 | # ----------------------------------------------------------------------------- 71 | # MODELING CONFIG 72 | # ----------------------------------------------------------------------------- 73 | _C.MODEL = CN() 74 | # on GT trajectories in the training set 75 | _C.MODEL.inflection_weight_coef = 3.2 76 | 77 | _C.MODEL.ablate_depth = False 78 | _C.MODEL.ablate_rgb = False 79 | _C.MODEL.ablate_instruction = False 80 | 81 | _C.MODEL.INSTRUCTION_ENCODER = CN() 82 | _C.MODEL.INSTRUCTION_ENCODER.vocab_size = 2504 83 | _C.MODEL.INSTRUCTION_ENCODER.max_length = 200 84 | _C.MODEL.INSTRUCTION_ENCODER.use_pretrained_embeddings = True 85 | _C.MODEL.INSTRUCTION_ENCODER.embedding_file = ( 86 | "data/datasets/R2R_VLNCE_v1-2_preprocessed/embeddings.json.gz" 87 | ) 88 | _C.MODEL.INSTRUCTION_ENCODER.dataset_vocab = ( 89 | "data/datasets/R2R_VLNCE_v1-2_preprocessed/train/train.json.gz" 90 | ) 91 | _C.MODEL.INSTRUCTION_ENCODER.fine_tune_embeddings = False 92 | _C.MODEL.INSTRUCTION_ENCODER.embedding_size = 50 93 | _C.MODEL.INSTRUCTION_ENCODER.hidden_size = 128 94 | _C.MODEL.INSTRUCTION_ENCODER.rnn_type = "LSTM" 95 | _C.MODEL.INSTRUCTION_ENCODER.final_state_only = False 96 | _C.MODEL.INSTRUCTION_ENCODER.bidirectional = True 97 | _C.MODEL.INSTRUCTION_ENCODER.backbone = 'lstm' 98 | 99 | _C.MODEL.RGB_ENCODER = CN() 100 | _C.MODEL.RGB_ENCODER.output_size = 256 101 | _C.MODEL.RGB_ENCODER.backbone = "unet" 102 | _C.MODEL.RGB_ENCODER.pretrain_model = 'data/pretrain_model/unet-models/2021_02_14-23_42_50.pt' 103 | 104 | _C.MODEL.DEPTH_ENCODER = CN() 105 | _C.MODEL.DEPTH_ENCODER.output_size = 128 106 | _C.MODEL.DEPTH_ENCODER.backbone = "resnet50" # type of resnet to use 107 | _C.MODEL.DEPTH_ENCODER.ddppo_checkpoint = "data/pretrain_model/ddppo-models/gibson-2plus-resnet50.pth" # path to DDPPO resnet weights 108 | 109 | _C.MODEL.MAP_ENCODER = CN() 110 | _C.MODEL.MAP_ENCODER.ego_map_size = 100 111 | _C.MODEL.MAP_ENCODER.output_size = 256 112 | 113 | _C.MODEL.STATE_ENCODER = CN() 114 | _C.MODEL.STATE_ENCODER.hidden_size = 512 115 | _C.MODEL.STATE_ENCODER.rnn_type = "GRU" 116 | _C.MODEL.STATE_ENCODER.input_type = ['rgb', 'depth', 'map'] 117 | 118 | _C.MODEL.PROGRESS_MONITOR = CN() 119 | _C.MODEL.PROGRESS_MONITOR.use = True 120 | 
_C.MODEL.PROGRESS_MONITOR.alpha = 1.0 # loss multiplier 121 | 122 | _C.MODEL.CONTRASTIVE_MONITOR = CN() 123 | _C.MODEL.CONTRASTIVE_MONITOR.target_tau = 0.07 124 | _C.MODEL.CONTRASTIVE_MONITOR.use = True 125 | _C.MODEL.CONTRASTIVE_MONITOR.alpha = 1.0 126 | 127 | _C.MODEL.PREDICTION_MONITOR = CN() 128 | _C.MODEL.PREDICTION_MONITOR.use = True 129 | _C.MODEL.PREDICTION_MONITOR.alpha = 0.1 130 | 131 | _C.MODEL.RGBMAPPING = CN() 132 | _C.MODEL.RGBMAPPING.map_depth = 64 133 | _C.MODEL.RGBMAPPING.global_map_size = 240 134 | _C.MODEL.RGBMAPPING.egocentric_map_size = 100 135 | _C.MODEL.RGBMAPPING.resolution = 0.12 136 | _C.MODEL.RGBMAPPING.gpu_id = 0 137 | _C.MODEL.RGBMAPPING.num_proc = 1 138 | 139 | _C.STOP_CONDITION = CN() 140 | _C.STOP_CONDITION.TYPE = 'prog' 141 | _C.STOP_CONDITION.PROG_THRESHOLD = 0.8 142 | 143 | _C.OVERWRITE = False 144 | _C.LOG_INTERVAL = 100 145 | _C.random_agent = False 146 | _C.RESUME_CKPT = None # resume from this ckpt 147 | _C.VIDEO_NUM = 99999 148 | _C.ego_map_size = 100 149 | _C.same_level_train = False 150 | _C.ep_max_len = 200 151 | _C.step_num = 3 152 | _C.use_ddppo = False 153 | 154 | 155 | def get_config( 156 | config_paths: Optional[Union[List[str], str]] = None, opts: Optional[list] = None 157 | ) -> CN: 158 | r"""Create a unified config with default values overwritten by values from 159 | `config_paths` and overwritten by options from `opts`. 160 | Args: 161 | config_paths: List of config paths or string that contains comma 162 | separated list of config paths. 163 | opts: Config options (keys, values) in a list (e.g., passed from 164 | command line into the config. For example, `opts = ['FOO.BAR', 165 | 0.5]`. Argument can be used for parameter sweeping or quick tests. 166 | """ 167 | config = _C.clone() 168 | if config_paths: 169 | if isinstance(config_paths, str): 170 | config_paths = [config_paths] 171 | 172 | for config_path in config_paths: 173 | config.merge_from_file(config_path) 174 | 175 | if config.BASE_TASK_CONFIG_PATH != "": 176 | config.TASK_CONFIG = get_task_config(config.BASE_TASK_CONFIG_PATH) 177 | if opts: 178 | config.CMD_TRAILING_OPTS = opts 179 | config.merge_from_list(opts) 180 | 181 | return config 182 | 183 | 184 | def refine_config(config, local_rank): 185 | config.defrost() 186 | 187 | config.TORCH_GPU_ID = local_rank 188 | config.MODEL.RGBMAPPING.gpu_id = config.TORCH_GPU_ID 189 | config.MODEL.RGBMAPPING.num_proc = config.NUM_PROCESSES 190 | 191 | split = config.TASK_CONFIG.DATASET.SPLIT 192 | config.TASK_CONFIG.TASK.NDTW.SPLIT = split 193 | config.TASK_CONFIG.TASK.SDTW.SPLIT = split 194 | config.TASK_CONFIG.TASK.VLN_ORACLE_WAYPOINT_SENSOR.LAW.SPLIT = split 195 | 196 | if config.DAGGER.P == 1.0: # if doing teacher forcing, don't switch the scene until it is complete 197 | config.TASK_CONFIG.ENVIRONMENT.ITERATOR_OPTIONS.MAX_SCENE_REPEAT_STEPS = (-1) 198 | 199 | if config.same_level_train: 200 | config.TASK_CONFIG.DATASET.DATA_PATH = 'data/datasets/R2R_VLNCE_v1-2_preprocessed/train/train_same_level.json.gz' 201 | 202 | if 'aug' in config.BASE_TASK_CONFIG_PATH: 203 | config.TASK_CONFIG.TASK.GT_SEMANTIC_MAP_SENSOR.SPLIT = 'train_aug' 204 | 205 | config.freeze() 206 | return config 207 | 208 | def set_saveDir_GPUs(config, run_type, model_dir, note, gpus, local_rank): 209 | config.defrost() 210 | 211 | run_dir = os.path.join(model_dir, "run_{}_{}".format(run_type, note)) 212 | os.makedirs(run_dir, exist_ok=True) 213 | 214 | config.CHECKPOINT_FOLDER = os.path.join(run_dir, 'checkpoint') 215 | config.LOG_FILE = os.path.join(run_dir, 
'{}.log'.format(run_type)) 216 | config.TENSORBOARD_DIR = os.path.join(run_dir, 'tensorboard') 217 | if config.DAGGER.PRELOAD_LMDB_FEATURES is False: 218 | config.DAGGER.LMDB_FEATURES_DIR = os.path.join(run_dir, 'trajectories.lmdb') 219 | config.VIDEO_DIR = os.path.join(run_dir, 'video_dir') 220 | config.CODE_DIR = os.path.join(run_dir, 'sh_n_codes') 221 | config.CONFIG_DIR = os.path.join(run_dir, 'config') 222 | config.METRIC_DIR = os.path.join(run_dir, 'metric') 223 | 224 | config.SIMULATOR_GPU_ID = local_rank 225 | config.SIMULATOR_GPU_IDS = None 226 | if gpus is not None: 227 | config.TORCH_GPU_ID = gpus[0] 228 | config.SIMULATOR_GPU_IDS = gpus if len(gpus) == 1 else gpus[1:] 229 | config.freeze() 230 | 231 | return config 232 | -------------------------------------------------------------------------------- /vlnce_baselines/models/mg_map_policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import Space 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | from habitat import Config 9 | from habitat_baselines.rl.models.rnn_state_encoder import RNNStateEncoder 10 | from habitat_baselines.rl.ppo.policy import Net 11 | 12 | from vlnce_baselines.models.encoders.instruction_encoder import InstructionEncoder 13 | from vlnce_baselines.models.encoders.unet_encoder import UNet 14 | from vlnce_baselines.models.encoders.resnet_encoders import VlnResnetDepthEncoder 15 | from vlnce_baselines.models.encoders.map_encoder import MapEncoder, MapDecoder 16 | from vlnce_baselines.common.rgb_mapping import RGBMapping 17 | 18 | 19 | class MGMapNet(Net): 20 | """ A multi-granularity map (MGMap) network that contains: 21 | Instruction encoder 22 | RGB encoder 23 | Depth encoder 24 | Map encoder and decoder 25 | RNN state encoder 26 | """ 27 | def __init__(self, observation_space: Space, model_config: Config): 28 | super().__init__() 29 | self.model_config = model_config 30 | 31 | # Init the instruction encoder 32 | self.instruction_encoder = InstructionEncoder(model_config.INSTRUCTION_ENCODER) 33 | 34 | # # Init the rgb encoder 35 | self.rgb_encoder = UNet(model_config) 36 | for param in self.rgb_encoder.parameters(): 37 | param.requires_grad = False 38 | self.rgb_linear = nn.Sequential( 39 | nn.AdaptiveAvgPool1d(1), 40 | nn.Flatten(), 41 | nn.Linear( 42 | self.rgb_encoder.output_shape[0], 43 | model_config.RGB_ENCODER.output_size, 44 | ), 45 | nn.ReLU(True), 46 | ) 47 | 48 | # Init the depth encoder 49 | self.depth_encoder = VlnResnetDepthEncoder( 50 | observation_space, 51 | output_size=model_config.DEPTH_ENCODER.output_size, 52 | checkpoint=model_config.DEPTH_ENCODER.ddppo_checkpoint, 53 | backbone=model_config.DEPTH_ENCODER.backbone, 54 | spatial_output=True, 55 | ) 56 | self.depth_linear = nn.Sequential( 57 | nn.Flatten(), 58 | nn.Linear( 59 | np.prod(self.depth_encoder.output_shape), 60 | model_config.DEPTH_ENCODER.output_size, 61 | ), 62 | nn.ReLU(True), 63 | ) 64 | 65 | # Init the mapping network 66 | self.rgb_mapping_module = RGBMapping(model_config.RGBMAPPING) 67 | map_channel = model_config.RGBMAPPING.map_depth 68 | 69 | # Init the map encoder 70 | self.map_encoder = MapEncoder( 71 | model_config.MAP_ENCODER.ego_map_size, 72 | map_channel, 73 | model_config.MAP_ENCODER.output_size, 74 | ) 75 | 76 | # Init the map decoder 77 | self.map_decoder = MapDecoder(model_config.MAP_ENCODER.output_size) 78 | self.map_classfier = nn.Sequential( 79 | 
nn.ConvTranspose2d(self.map_decoder.output_shape[0], 32, kernel_size=4, stride=2, padding=1, bias=False), 80 | nn.BatchNorm2d(32), 81 | nn.ReLU(inplace=True), 82 | nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1, bias=False), 83 | nn.BatchNorm2d(32), 84 | nn.ReLU(inplace=True), 85 | nn.Conv2d(32, 27, kernel_size=1, stride=1, padding=0, bias=True), 86 | ) 87 | 88 | # Init the map linear 89 | self.map_encoded_linear = nn.Sequential( 90 | nn.Conv2d(self.map_encoder.output_shape[0], 128, 3, stride=1, padding=1), 91 | nn.ReLU(), 92 | ) 93 | self.map_classified_linear = nn.Sequential( 94 | nn.Conv2d(27, 128, 3, stride=1, padding=1), 95 | nn.ReLU(), 96 | ) 97 | self.map_cated_linear = nn.Sequential( 98 | nn.Conv2d(128*2, model_config.MAP_ENCODER.output_size, 3, stride=1, padding=1), 99 | nn.ReLU(), 100 | ) 101 | self.map_linear = nn.Sequential( 102 | nn.AdaptiveAvgPool1d(1), 103 | nn.Flatten(), 104 | nn.Linear( 105 | model_config.MAP_ENCODER.output_size, 106 | model_config.MAP_ENCODER.output_size, 107 | ), 108 | nn.ReLU(True), 109 | ) 110 | 111 | # Init the first rnn state decoder 112 | self._hidden_size = model_config.STATE_ENCODER.hidden_size 113 | first_state_input_size = ( 114 | (model_config.RGB_ENCODER.output_size if 'rgb' in model_config.STATE_ENCODER.input_type else 0) 115 | + (model_config.DEPTH_ENCODER.output_size if 'depth' in model_config.STATE_ENCODER.input_type else 0) 116 | + (model_config.MAP_ENCODER.output_size if 'map' in model_config.STATE_ENCODER.input_type else 0) 117 | ) 118 | self.state_encoder = RNNStateEncoder( 119 | input_size=first_state_input_size, 120 | hidden_size=self._hidden_size, 121 | num_layers=1, 122 | rnn_type=model_config.STATE_ENCODER.rnn_type, 123 | ) 124 | 125 | # Init the attention encoder 126 | self.state_text_q_layer = nn.Linear(self._hidden_size, self._hidden_size // 2) 127 | self.state_text_k_layer = nn.Conv1d(self.instruction_encoder.output_size, self._hidden_size // 2, 1) 128 | 129 | self.text_map_q_layer = nn.Linear(self.instruction_encoder.output_size, self._hidden_size // 2) 130 | self.text_map_k_layer = nn.Conv1d(self.map_encoder.output_shape[0], self._hidden_size // 2, 1) 131 | 132 | self.register_buffer("_scale", torch.tensor(1.0 / ((self._hidden_size // 2) ** 0.5))) 133 | 134 | # Init the second rnn state decoder 135 | second_state_input_size = ( 136 | model_config.STATE_ENCODER.hidden_size 137 | + model_config.STATE_ENCODER.hidden_size // 2 138 | + (model_config.STATE_ENCODER.hidden_size // 2 if 'map' in model_config.STATE_ENCODER.input_type else 0) 139 | ) 140 | self.second_state_compress = nn.Sequential( 141 | nn.Linear( 142 | second_state_input_size, 143 | self._hidden_size, 144 | ), 145 | nn.ReLU(True), 146 | ) 147 | self.second_state_encoder = RNNStateEncoder( 148 | input_size=self._hidden_size, 149 | hidden_size=self._hidden_size, 150 | num_layers=1, 151 | rnn_type=model_config.STATE_ENCODER.rnn_type, 152 | ) 153 | self._output_size = model_config.STATE_ENCODER.hidden_size 154 | 155 | self.train() 156 | self.depth_encoder.eval() 157 | self.rgb_encoder.eval() 158 | 159 | @property 160 | def output_size(self): 161 | return self._output_size 162 | 163 | @property 164 | def is_blind(self): 165 | return False 166 | 167 | @property 168 | def num_recurrent_layers(self): 169 | return self.state_encoder.num_recurrent_layers + ( 170 | self.second_state_encoder.num_recurrent_layers 171 | ) 172 | 173 | def _attn(self, q, k, v, mask=None): 174 | logits = torch.einsum("nc, nci -> ni", q, k) 175 | if mask is not None: 176 | logits = logits - 
mask.float() * 1e8 177 | attn = F.softmax(logits * self._scale, dim=1) 178 | return torch.einsum("ni, nci -> nc", attn, v), attn 179 | 180 | def forward(self, observations, rnn_hidden_states, prev_actions, masks): 181 | instruction_embedding, text_mask = self.instruction_encoder(observations) 182 | rgb_embedding, rgb_embedding_proj = self.rgb_encoder(observations) 183 | depth_embedding = self.depth_encoder(observations) 184 | 185 | # Get map 186 | self.rgb_mapping_module(rgb_embedding_proj, observations, masks) 187 | ego_map = observations['rgb_ego_map'] 188 | 189 | # Encoding map 190 | map_encoded = self.map_encoder(ego_map) 191 | map_encoded_proj = self.map_encoded_linear(map_encoded) 192 | 193 | # Decoding map (ie segmentation prediction) 194 | map_decoded = self.map_decoder(map_encoded) # [bs, 64, 64] 195 | pred_sem_map = self.map_classfier(map_decoded) 196 | map_classified_proj = self.map_classified_linear( 197 | torch.nn.functional.avg_pool2d(pred_sem_map, kernel_size=2, stride=2) 198 | ) 199 | 200 | # Get concated map embedding 201 | map_cat = [map_encoded_proj, map_classified_proj] 202 | map_embedding = torch.cat(map_cat, dim=1) # [bs, 2*c / c, 50, 50] 203 | map_embedding = self.map_cated_linear(map_embedding) 204 | 205 | rgb_embedding = torch.flatten(rgb_embedding, 2) 206 | depth_embedding = torch.flatten(depth_embedding, 2) 207 | map_embedding = torch.flatten(map_embedding, 2) 208 | 209 | state_in = [] 210 | if 'rgb' in self.model_config.STATE_ENCODER.input_type: 211 | rgb_in = self.rgb_linear(rgb_embedding) 212 | state_in.append(rgb_in) 213 | if 'depth' in self.model_config.STATE_ENCODER.input_type: 214 | depth_in = self.depth_linear(depth_embedding) 215 | state_in.append(depth_in) 216 | if 'map' in self.model_config.STATE_ENCODER.input_type: 217 | map_in = self.map_linear(map_embedding) 218 | state_in.append(map_in) 219 | state_in = torch.cat(state_in, dim=1) 220 | ( 221 | state, 222 | rnn_hidden_states[0: self.state_encoder.num_recurrent_layers], 223 | ) = self.state_encoder( 224 | state_in, 225 | rnn_hidden_states[0: self.state_encoder.num_recurrent_layers], 226 | masks, 227 | ) 228 | 229 | state_text_q = self.state_text_q_layer(state) 230 | state_text_k = self.state_text_k_layer(instruction_embedding) 231 | text_embedding, _ = self._attn(state_text_q, state_text_k, instruction_embedding, text_mask) 232 | 233 | text_map_q = self.text_map_q_layer(text_embedding) 234 | text_map_k = self.text_map_k_layer(map_embedding) 235 | map_embedding, self.att_map_t_m = self._attn(text_map_q, text_map_k, map_embedding, None) 236 | 237 | if 'map' in self.model_config.STATE_ENCODER.input_type: 238 | x = torch.cat([state, text_embedding, map_embedding], dim=1) 239 | else: 240 | x = torch.cat([state, text_embedding], dim=1) 241 | x = self.second_state_compress(x) 242 | ( 243 | x, 244 | rnn_hidden_states[self.state_encoder.num_recurrent_layers:], 245 | ) = self.second_state_encoder( 246 | x, 247 | rnn_hidden_states[self.state_encoder.num_recurrent_layers:], 248 | masks 249 | ) 250 | 251 | return x, rnn_hidden_states, pred_sem_map 252 | -------------------------------------------------------------------------------- /vlnce_baselines/common/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | import zipfile 5 | import socket 6 | import shutil 7 | import quaternion 8 | from glob import glob 9 | from shlex import quote 10 | from typing import Dict, List 11 | 12 | import torch 13 | 14 | 15 | def 
transform_obs( 16 | observations: List[Dict], instruction_sensor_uuid: str, device=None 17 | ) -> Dict[str, torch.Tensor]: 18 | """Extracts instruction tokens from an instruction sensor and 19 | transposes a batch of observation dicts to a dict of batched 20 | observations. 21 | 22 | Args: 23 | observations: list of dicts of observations. 24 | instruction_sensor_uuid: name of the instructoin sensor to 25 | extract from. 26 | device: The torch.device to put the resulting tensors on. 27 | Will not move the tensors if None 28 | 29 | Returns: 30 | transposed dict of lists of observations. 31 | """ 32 | for i in range(len(observations)): 33 | observations[i][instruction_sensor_uuid] = observations[i][ 34 | instruction_sensor_uuid 35 | ]["tokens"] 36 | 37 | for obs in observations: 38 | if 'semantic' in obs: 39 | del obs['semantic'] 40 | 41 | for obs in observations: 42 | for sensor in obs: 43 | if type(obs[sensor]) == torch.Tensor: 44 | obs[sensor] = obs[sensor].to(device) 45 | return observations 46 | 47 | 48 | def check_exist_file(config): 49 | dirs = [config.VIDEO_DIR, config.TENSORBOARD_DIR, config.CHECKPOINT_FOLDER] 50 | if any([os.path.exists(d) for d in dirs]): 51 | if config.OVERWRITE: 52 | for d in dirs: 53 | if os.path.exists(d): 54 | shutil.rmtree(d) 55 | else: 56 | order = None 57 | while order not in ['y', 'n']: 58 | order = input('Output directory already exists! Overwrite the folder? (y/n)') 59 | if order == 'y': 60 | for d in dirs: 61 | if os.path.exists(d): 62 | shutil.rmtree(d) 63 | elif order == 'n': 64 | break 65 | 66 | 67 | def save_sh_n_codes(config, run_type, ignore_dir=['']): 68 | os.makedirs(config.CODE_DIR, exist_ok=True) 69 | 70 | name = os.path.join(config.CODE_DIR, 'run_{}_{}.sh'.format(run_type, socket.gethostname())) 71 | with open(name, 'w') as f: 72 | envs = ['CUDA_VISIBLE_DEVICES'] 73 | for env in envs: 74 | value = os.environ.get(env, None) 75 | if value is not None: 76 | f.write(f'export {env}={quote(value)}\n') 77 | f.write(sys.executable + ' ' + ' '.join(quote(arg) for arg in sys.argv) + '\n') 78 | 79 | name = os.path.join(config.CODE_DIR, 'code.zip') 80 | with zipfile.ZipFile(name, mode='w', compression=zipfile.ZIP_DEFLATED) as zf: 81 | 82 | first_list = glob('*', recursive=True) 83 | first_list = [i for i in first_list if i not in ignore_dir] 84 | 85 | file_list = [] 86 | patterns = [x + '/**' for x in first_list] 87 | for pattern in patterns: 88 | file_list.extend(glob(pattern, recursive=True)) 89 | 90 | file_list = [x[:-1] if x[-1] == "/" else x for x in file_list] 91 | for filename in file_list: 92 | zf.write(filename) 93 | 94 | 95 | def save_config(config, run_type): 96 | os.makedirs(config.CONFIG_DIR, exist_ok=True) 97 | name = os.path.join(config.CONFIG_DIR, 'config_of_{}.txt'.format(run_type)) 98 | with open(name, 'w') as f: 99 | f.write(str(config)) 100 | 101 | 102 | label_conversion_40_27 = {-1:0, 0:0, 1:15, 2:17, 3:1, 4:2, 5:3, 6:18, 7:19, 8:4, 9:15, 10:5, 11:6, 12:16, 13:20, 14:7, 15:8, 16:17, 17:17, 103 | 18:9, 19:21, 20:22, 21:16, 22:10, 23:11, 24:15, 25:12, 26:13, 27:23, 28:16, 29:16, 30:16, 31:16, 32:16, 104 | 33:24, 34:25, 35:16, 36:16, 37:14, 38:26, 39:16, 40:16} 105 | label_conversion_40_3 = {-1:0, 0:0, 1:1, 2:2, 3:1, 4:1, 5:1, 6:1, 7:1, 8:1, 9:1, 10:1, 11:1, 12:1, 13:1, 14:1, 15:1, 16:2, 17:2, 106 | 18:1, 19:1, 20:1, 21:1, 22:1, 23:1, 24:1, 25:1, 26:1, 27:1, 28:1, 29:1, 30:1, 31:1, 32:1, 107 | 33:1, 34:1, 35:1, 36:1, 37:1, 38:1, 39:1, 40:1} 108 | 109 | 110 | def get_sim_location(agent_state): 111 | x = -agent_state.position[2] 112 | y = 
-agent_state.position[0] 113 | height = agent_state.position[1] 114 | axis = quaternion.as_euler_angles(agent_state.rotation)[0] 115 | if (axis%(2*np.pi)) < 0.1 or (axis%(2*np.pi)) > 2*np.pi - 0.1: 116 | o = quaternion.as_euler_angles(agent_state.rotation)[1] 117 | else: 118 | o = 2*np.pi - quaternion.as_euler_angles(agent_state.rotation)[1] 119 | if o > np.pi: 120 | o -= 2 * np.pi 121 | pose = x, y, o 122 | return pose, height 123 | 124 | def load_scene_pcloud(preprocessed_scenes_dir, scene_id, n_object_classes): 125 | pcloud_path = preprocessed_scenes_dir+scene_id+'_pcloud.npz' 126 | if not os.path.exists(pcloud_path): 127 | raise Exception('Preprocessed point cloud for scene', scene_id,'not found!') 128 | 129 | data = np.load(pcloud_path) 130 | x = data['x'] 131 | y = data['y'] 132 | z = data['z'] 133 | label_seq = data['label_seq'] 134 | data.close() 135 | 136 | label_seq[ label_seq<0.0 ] = 0.0 137 | # Convert the labels to the reduced set of categories 138 | label_seq_spatial = label_seq.copy() 139 | label_seq_objects = label_seq.copy() 140 | for i in range(label_seq.shape[0]): 141 | curr_lbl = label_seq[i,0] 142 | label_seq_spatial[i] = label_conversion_40_3[curr_lbl] 143 | label_seq_objects[i] = label_conversion_40_27[curr_lbl] 144 | return (x, y, z), label_seq_spatial, label_seq_objects 145 | 146 | def load_scene_color(preprocessed_scenes_dir, scene_id): 147 | # loads the rgb information of the map 148 | color_path = preprocessed_scenes_dir+scene_id+'_color.npz' 149 | if not os.path.exists(color_path): 150 | raise Exception('Preprocessed color for scene', scene_id,'not found!') 151 | 152 | data = np.load(color_path) 153 | r = data['r'] 154 | g = data['g'] 155 | b = data['b'] 156 | color_pcloud = np.stack((r,g,b)) # 3 x Npoints 157 | return color_pcloud 158 | 159 | def discretize_coords(x, z, grid_dim, cell_size, translation=0): 160 | # x, z are the coordinates of the 3D point (either in camera coordinate frame, or the ground-truth camera position) 161 | # If translation=0, assumes the agent is at the center 162 | # If we want the agent to be positioned lower then use positive translation. 
When getting the gt_crop, we need negative translation 163 | #map_coords = torch.zeros((len(x), 2), device='cuda') 164 | map_coords = torch.zeros((len(x), 2)) 165 | xb = torch.floor(x[:]/cell_size) + (grid_dim[0]-1)/2.0 166 | zb = torch.floor(z[:]/cell_size) + (grid_dim[1]-1)/2.0 + translation 167 | xb = xb.int() 168 | zb = zb.int() 169 | map_coords[:,0] = xb 170 | map_coords[:,1] = zb 171 | # keep bin coords within dimensions 172 | map_coords[map_coords>grid_dim[0]-1] = grid_dim[0]-1 173 | map_coords[map_coords<0] = 0 174 | return map_coords.long() 175 | 176 | def slice_scene(x, y, z, label_seq, position, height, color_pcloud=None, device='cuda'): 177 | # z = -z 178 | # Slice the scene below and above the agent 179 | below_thresh = height-0.2 180 | above_thresh = height+2.0 181 | all_inds = np.arange(y.shape[0]) 182 | below_inds = np.where(z < below_thresh)[0] 183 | above_inds = np.where(z > above_thresh)[0] 184 | # xout_inds = np.where(abs(x-position[1]) > 8)[0] 185 | # yout_inds = np.where(abs(y-position[0]) > 8)[0] 186 | invalid_inds = np.concatenate( (below_inds, above_inds), 0) # remove the floor and ceiling inds from the local3D points 187 | inds = np.delete(all_inds, invalid_inds) 188 | x_fil = x[inds] 189 | y_fil = y[inds] 190 | z_fil = z[inds] 191 | label_seq_fil = torch.tensor(label_seq[inds], dtype=torch.float, device=device) 192 | if color_pcloud is not None: 193 | color_pcloud_fil = torch.tensor(color_pcloud[:,inds], dtype=torch.float, device=device) 194 | return x_fil, y_fil, z_fil, label_seq_fil, color_pcloud_fil 195 | else: 196 | return x_fil, y_fil, z_fil, label_seq_fil 197 | 198 | def get_gt_map(x, y, label_seq, abs_pose, grid_dim, cell_size, color_pcloud=None, z=None, device='cuda'): 199 | # Transform the ground-truth map to align with the agent's pose 200 | # The agent is at the center looking upwards 201 | point_map = np.array([x,y]) 202 | angle = -abs_pose[2] 203 | rot_mat_abs = np.array([[np.cos(angle), -np.sin(angle)],[np.sin(angle),np.cos(angle)]]) 204 | trans_mat_abs = np.array([[-abs_pose[1]],[abs_pose[0]]]) #### This is important, the first index is negative. 205 | ##rotating and translating point map points 206 | t_points = point_map - trans_mat_abs 207 | rot_points = np.matmul(rot_mat_abs,t_points) 208 | x_abs = torch.tensor(rot_points[0,:], device=device) 209 | y_abs = torch.tensor(rot_points[1,:], device=device) 210 | 211 | map_coords = discretize_coords(x=x_abs, z=y_abs, grid_dim=grid_dim, cell_size=cell_size) 212 | 213 | # Coordinates in map_coords need to be sorted based on their height, floor values go first 214 | # Still not perfect 215 | if z is not None: 216 | z = np.asarray(z) 217 | sort_inds = np.argsort(z) 218 | map_coords = map_coords[sort_inds,:] 219 | label_seq = label_seq[sort_inds,:] 220 | 221 | true_seg_grid = torch.zeros((grid_dim[0], grid_dim[1], 1), device=device) 222 | true_seg_grid[map_coords[:,1], map_coords[:,0]] = label_seq.clone() 223 | 224 | ### We need to flip the ground truth to align with the observations. 225 | ### Probably because the -y to -z is a rotation about the x axis, which also flips the y coordinate for Matterport.
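### In the notation above: with abs_pose = (x0, y0, o), each map point p = (px, py) is re-expressed in the
### agent's frame as R(-o) @ (p - t) with t = (-y0, x0), which is what rot_mat_abs / trans_mat_abs compute
### before discretize_coords bins the points into the grid. The flip below mirrors the rows of the
### (grid_dim[0], grid_dim[1], 1) grid so the ground-truth map matches the row ordering of the egocentric
### observation maps, and the permute converts it to the channel-first (1, H, W) layout.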
226 | true_seg_grid = torch.flip(true_seg_grid, dims=[0]) 227 | true_seg_grid = true_seg_grid.permute(2, 0, 1) 228 | 229 | if color_pcloud is not None: 230 | color_grid = torch.zeros((grid_dim[0], grid_dim[1], 3), device=device) 231 | color_grid[map_coords[:,1], map_coords[:,0],0] = color_pcloud[0] 232 | color_grid[map_coords[:,1], map_coords[:,0],1] = color_pcloud[1] 233 | color_grid[map_coords[:,1], map_coords[:,0],2] = color_pcloud[2] 234 | color_grid = torch.flip(color_grid, dims=[0]) 235 | color_grid = color_grid.permute(2, 0 ,1) 236 | return true_seg_grid, color_grid/255.0 237 | else: 238 | return true_seg_grid 239 | 240 | 241 | class TransfomationRealworldAgent(): 242 | def __init__(self, agent_state) -> None: 243 | self.agent_state = agent_state 244 | self.T = self.agent_state.position.reshape(1,-1).T 245 | self.R = quaternion.as_rotation_matrix(self.agent_state.rotation) 246 | 247 | def original_matrix(self, position): 248 | original_matrix = np.matrix( 249 | [[position[0]], [position[1]], [position[2]]] 250 | ) 251 | return original_matrix 252 | 253 | def realworld2agent(self, point): 254 | O = self.original_matrix(point) 255 | point_a = (self.R.T @ O) + (self.R.T @ -self.T) 256 | return np.squeeze(np.asarray(point_a)) 257 | 258 | def agent2realworld(self, point): 259 | O = self.original_matrix(point) 260 | point_w = (self.R @ O) + self.T 261 | return np.squeeze(np.asarray(point_w)) 262 | -------------------------------------------------------------------------------- /habitat_extensions/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import Dict 3 | 4 | import numpy as np 5 | import torch 6 | 7 | from habitat.core.utils import try_cv2_import 8 | from habitat.utils.visualizations import maps 9 | from habitat.utils.visualizations.utils import draw_collision 10 | 11 | 12 | cv2 = try_cv2_import() 13 | 14 | COLOR_ProjSem_27 = [ 15 | [255,255,255] # white 16 | ,[128,128,0] # olive (dark yellow) 17 | ,[0,0,255] # blue 18 | ,[255,0,0] # red 19 | ,[255,0,255] # magenta 20 | ,[0,255,255] # cyan 21 | ,[255,165,0] # orange 22 | ,[255,255,0] # yellow 23 | ,[128,128,128] # gray 24 | ,[128,0,0] # maroon 25 | ,[255,20,147] # pink 26 | ,[0,128,0] # dark green 27 | ,[128,0,128] # purple 28 | ,[0,128,128] # teal 29 | ,[0,0,128] # navy (dark blue) 30 | ,[210,105,30] # chocolate 31 | ,[188,143,143] # rosy brown 32 | ,[0,255,0] # green 33 | ,[255,215,0] # gold 34 | ,[0,0,0] # black 35 | ,[192,192,192] # silver 36 | ,[138,43,226] # blue violet 37 | ,[255,127,80] # coral 38 | ,[238,130,238] # violet 39 | ,[245,245,220] # beige 40 | ,[139,69,19] # saddle brown 41 | ,[64,224,208] # turquoise 42 | ] 43 | 44 | OBJECTS_ProjSem_27 = [ 45 | 'void', 'chair', 'door', 'table', 'cushion', 46 | 'sofa', 'bed', 'plant', 'sink', 'toilet', 47 | 'tv_monitor', 'shower', 'bathtub', 48 | 'counter', 'appliances', 'structure', 'other', 49 | 'free-space', 'picture', 'cabinet', 'chest_of_drawers', 'stool', 50 | 'towel', 'fireplace', 'gym_equipment', 'seating', 51 | 'clothes' 52 | ] 53 | 54 | COLOR_ProjSem = [ 55 | [235, 190, 157], [235, 219, 156], [255, 255, 255], [189, 234, 155], [163, 233, 158], 56 | [156, 234, 180], [156, 235, 206], [157, 226, 236], [156, 198, 235], [156, 170, 231], 57 | [170, 155, 235], [198, 154, 234], [230, 156, 235], [234, 154, 213], [235, 156, 181], 58 | [157, 190, 181], [198, 156, 206], 59 | ] 60 | OBJECTS_ProjSem = [ 61 | 'wall', 'chair', 'door', 'table', 'picture', 62 | 'cabinet', 'window', 'sofa', 'bed', 'plant', 63 | 
'sink', 'stairs', 'mirror', 'shower', 'counter', 64 | 'fireplace', 'railing', 65 | ] 66 | 67 | COLOR_AABBSem = [ 68 | [235, 190, 157], [235, 219, 156], [208, 234, 157], [189, 234, 155], [163, 233, 158], 69 | [156, 234, 180], [156, 235, 206], [157, 226, 236], [156, 198, 235], [156, 170, 231], 70 | [170, 155, 235], [198, 154, 234], [230, 156, 235], [234, 154, 213], [235, 156, 181], 71 | ] 72 | OBJECTS_AABBSem = [ 73 | 'door', 'stair', 'bed', 'doorway', 'table', 74 | 'chair', 'couch', 'sink', 'closet', 'fireplace', 75 | 'rug', 'counter', 'desk', 'painting', 'window', 76 | ] 77 | 78 | COCO_COLOR = [ 79 | [1.0, 1.0, 1.0], 80 | [0.6, 0.6, 0.6], 81 | [0.95, 0.95, 0.95], 82 | [0.96, 0.36, 0.26], 83 | [0.12156862745098039, 0.47058823529411764, 0.7058823529411765], 84 | [0.9400000000000001, 0.7818, 0.66], 85 | [0.9400000000000001, 0.8868, 0.66], 86 | [0.8882000000000001, 0.9400000000000001, 0.66], 87 | [0.7832000000000001, 0.9400000000000001, 0.66], 88 | [0.6782000000000001, 0.9400000000000001, 0.66], 89 | [0.66, 0.9400000000000001, 0.7468000000000001], 90 | [0.66, 0.9400000000000001, 0.8518000000000001], 91 | [0.66, 0.9232, 0.9400000000000001], 92 | [0.66, 0.8182, 0.9400000000000001], 93 | [0.66, 0.7132, 0.9400000000000001], 94 | [0.7117999999999999, 0.66, 0.9400000000000001], 95 | [0.8168, 0.66, 0.9400000000000001], 96 | [0.9218, 0.66, 0.9400000000000001], 97 | [0.9400000000000001, 0.66, 0.8531999999999998], 98 | [0.9400000000000001, 0.66, 0.748199999999999]] 99 | 100 | COCO_OBJECTS = [ 'unexplored', 'obstacle', 'free', 'waypoint', 'agent', 101 | 'chair', 'couch', 'potted plant', 'bed', 'toilet', 102 | 'tv', 'dining-table', 'oven', 'sink', 'refrigerator', 103 | 'book', 'clock', 'vase', 'cup', 'bottle', 104 | ] 105 | 106 | COLOR_HEAT = { 107 | 'R': [ 108 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 109 | 240, 220, 200, 180, 160, 140, 120, 100, 80, 60, 40, 20, 0, 110 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 111 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 112 | 0, 20, 40, 60, 80, 100, 120, 140, 160, 180, 200, 220, 240, 113 | ], 114 | 'G': [ 115 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 116 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 117 | 0, 20, 40, 60, 80, 100, 120, 140, 160, 180, 200, 220, 240, 118 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 119 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 120 | ], 121 | 'B': [ 122 | 0, 20, 40, 60, 80, 100, 120, 140, 160, 180, 200, 220, 240, 123 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 124 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 125 | 240, 220, 200, 180, 160, 140, 120, 100, 80, 60, 40, 20, 0, 126 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127 | ] 128 | } 129 | 130 | 131 | def observations_to_image(observation: Dict, info: Dict, waypoint_info, att_map) -> np.ndarray: 132 | r"""Generate image of single frame from observation and info 133 | returned from a single environment step(). 134 | 135 | Args: 136 | observation: observation returned from an environment step(). 137 | info: info returned from an environment step(). 138 | 139 | Returns: 140 | generated image of a single frame. 141 | """ 142 | egocentric_view = [] 143 | observation_size = -1 144 | if "rgb" in observation: 145 | observation_size = observation["rgb"].shape[0] 146 | rgb = observation["rgb"][:, :, :3] 147 | egocentric_view.append(rgb) 148 | 149 | # draw depth map if observation has depth info. resize to rgb size. 
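# (The depth visualization below is commented out; the final frame returned by this function concatenates,
# from left to right, the RGB view, the top-down map, the projected semantic map, its legend, and the
# attention heatmap built from att_map.)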
150 | # if "depth" in observation: 151 | # if observation_size == -1: 152 | # observation_size = observation["depth"].shape[0] 153 | # depth_map = (observation["depth"].squeeze() * 255).astype(np.uint8) 154 | # depth_map = np.stack([depth_map for _ in range(3)], axis=2) 155 | # depth_map = cv2.resize( 156 | # depth_map, 157 | # dsize=(observation_size, observation_size), 158 | # interpolation=cv2.INTER_CUBIC, 159 | # ) 160 | # egocentric_view.append(depth_map) 161 | 162 | assert len(egocentric_view) > 0, "Expected at least one visual sensor enabled." 163 | egocentric_view = np.concatenate(egocentric_view, axis=1) 164 | 165 | # draw collision 166 | if "collisions" in info and info["collisions"]["is_collision"]: 167 | egocentric_view = draw_collision(egocentric_view) 168 | 169 | frame = egocentric_view 170 | 171 | if "top_down_map" in info: 172 | top_down_map = info["top_down_map"]["map"] 173 | top_down_map = maps.colorize_topdown_map( 174 | top_down_map, info["top_down_map"]["fog_of_war_mask"] 175 | ) 176 | map_agent_pos = info["top_down_map"]["agent_map_coord"] 177 | top_down_map = maps.draw_agent( 178 | image=top_down_map, 179 | agent_center_coord=map_agent_pos, 180 | agent_rotation=info["top_down_map"]["agent_angle"], 181 | agent_radius_px=top_down_map.shape[0] // 16, 182 | ) 183 | 184 | # if 'waypoint' in waypoint_info: 185 | # waypoint = maps.to_grid( 186 | # waypoint_info['action'][0], 187 | # waypoint_info['action'][2], 188 | # maps.COORDINATE_MIN, maps.COORDINATE_MAX, (1250, 1250), 189 | # ) 190 | # crop_waypoint = ( 191 | # waypoint[0] - info['top_down_map']['waypoint']['ind_x_min'], 192 | # waypoint[1] - info['top_down_map']['waypoint']['ind_y_min'] 193 | # ) 194 | # maps.draw_path( 195 | # top_down_map=top_down_map, 196 | # path_points=[crop_waypoint, crop_waypoint], 197 | # color=[200, 0, 0], 198 | # thickness=5, 199 | # ) 200 | 201 | if top_down_map.shape[0] > top_down_map.shape[1]: 202 | top_down_map = np.rot90(top_down_map, 1) 203 | 204 | # scale top down map to align with rgb view 205 | old_h, old_w, _ = top_down_map.shape 206 | top_down_height = observation_size 207 | top_down_width = int(float(top_down_height) / old_h * old_w) 208 | # cv2 resize (dsize is width first) 209 | top_down_map = cv2.resize( 210 | top_down_map, 211 | (top_down_width, top_down_height), 212 | interpolation=cv2.INTER_CUBIC, 213 | ) 214 | frame = np.concatenate((frame, top_down_map), axis=1) 215 | 216 | # DRAW SEMANTIC MAP 217 | ego_map = observation['ego_map_vis'] 218 | channel = ego_map.shape[0] 219 | semantic_map = np.ones([*ego_map.shape[1:], 3], dtype=np.uint8) * 255 220 | 221 | if channel == 17: # AABBSem: occ map + history path + 15 objects 222 | offset = 2 223 | objects, color = OBJECTS_AABBSem, COLOR_AABBSem 224 | semantic_map[ego_map[0, :, :] == 1, :] = [75, 75, 75] 225 | elif channel == 18: # PorjSem: occ map + 17 objects 226 | offset = 1 227 | objects, color = OBJECTS_ProjSem, COLOR_ProjSem 228 | semantic_map[ego_map[0, :, :] < 0.1, :] = [75, 75, 75] 229 | elif channel == 29: # ProjSem: occ map + explored map + 27 objects 230 | offset = 2 231 | objects, color = OBJECTS_ProjSem_27, COLOR_ProjSem_27 232 | 233 | for i in range(len(objects)): 234 | semantic_map[ego_map[i + offset, :, :] > 0.5, :] = color[i] 235 | 236 | semantic_map = maps.draw_agent( 237 | image=semantic_map, 238 | agent_center_coord=[50, 50], # FIXME 用参数代替 239 | agent_rotation=info["top_down_map"]["agent_angle"], 240 | agent_radius_px=top_down_map.shape[0] // 64, 241 | ) 242 | wp_grid_x = 
-torch.tanh(waypoint_info['action'])[1] * 50 + 50 243 | wp_grid_y = torch.tanh(waypoint_info['action'])[0] * 50 + 50 244 | _limit = lambda x: min(max(int(x), 0), 100) 245 | semantic_map[_limit(wp_grid_x - 2):_limit(wp_grid_x + 2), 246 | _limit(wp_grid_y - 2):_limit(wp_grid_y + 2), :] = [200, 0, 0] # draw waypoint 247 | 248 | semantic_map = cv2.resize(semantic_map, 249 | (observation_size, observation_size), 250 | interpolation=cv2.INTER_CUBIC) 251 | frame = np.concatenate((frame, semantic_map), axis=1) 252 | 253 | legend = np.ones([observation_size, 120, 3], dtype=np.uint8) * 255 254 | grid = legend.shape[0] // 30 * 2 255 | for i in range(len(objects)): 256 | cv2.rectangle(legend, (grid, grid * i + 10), (grid * 2, grid * i + 10 + grid // 2), color[i], -1) 257 | cv2.putText(legend, objects[i], (grid * 2 + 5, grid * i + 10 + grid // 2), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1) 258 | frame = np.concatenate((frame, legend), axis=1) 259 | 260 | vis_att_map = np.ones([att_map.shape[0], 3], dtype=np.uint8) * 255 261 | for idx, value in enumerate(att_map): 262 | color_idx = ((1 - (value - att_map.min()) / (att_map.max() - att_map.min() + 1e-6)) * (len(COLOR_HEAT['R'])) - 1) 263 | color_idx = int(color_idx.item()) 264 | vis_att_map[idx, :] = [COLOR_HEAT['R'][color_idx], COLOR_HEAT['G'][color_idx], COLOR_HEAT['B'][color_idx]] 265 | vis_att_map = vis_att_map.reshape(24, 24, 3) 266 | vis_att_map = cv2.resize(vis_att_map, (observation_size, observation_size), interpolation=cv2.INTER_CUBIC) 267 | frame = np.concatenate((frame, vis_att_map), axis=1) 268 | 269 | return frame 270 | -------------------------------------------------------------------------------- /vlnce_baselines/models/ddppo_policy.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import quaternion 4 | from gym.spaces import Box, Dict, Discrete 5 | from gym.spaces.box import Box 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | 11 | from habitat_baselines.rl.ddppo.policy.resnet_policy import PointNavResNetPolicy 12 | 13 | 14 | class DdppoPolicy(nn.Module): 15 | def __init__(self, path): 16 | super().__init__() 17 | spaces = { 18 | 'pointgoal_with_gps_compass': Box( 19 | low=np.finfo(np.float32).min, 20 | high=np.finfo(np.float32).max, 21 | shape=(2,), 22 | dtype=np.float32, 23 | ), 24 | 'depth': Box( 25 | low=0, 26 | high=1, 27 | shape=(256, 256, 1), 28 | dtype=np.float32, 29 | ) 30 | } 31 | observation_space = Dict(spaces) 32 | action_space = Discrete(4) 33 | 34 | checkpoint = torch.load(path) 35 | self.hidden_size = checkpoint['model_args'].hidden_size 36 | # The model must be named self.actor_critic to make the namespaces correct for loading 37 | self.actor_critic = PointNavResNetPolicy( 38 | observation_space=observation_space, 39 | action_space=action_space, 40 | hidden_size=self.hidden_size, 41 | num_recurrent_layers=2, 42 | rnn_type='LSTM', 43 | backbone='resnet50', 44 | ) 45 | self.actor_critic.load_state_dict( 46 | { 47 | k[len("actor_critic."):]: v 48 | for k, v in checkpoint['state_dict'].items() 49 | if "actor_critic" in k 50 | } 51 | ) 52 | self.actor_critic.eval() 53 | 54 | self.hidden_state = torch.zeros(self.actor_critic.net.num_recurrent_layers, 1, checkpoint['model_args'].hidden_size) 55 | self.prev_actions = torch.zeros(1, 1, dtype=torch.long) 56 | 57 | def plan(self, depth, goal, t): 58 | batch = { 59 | 'pointgoal_with_gps_compass': goal.view(1, -1), 60 | 'depth': depth.view(1, depth.shape[0], 
depth.shape[1], depth.shape[2]), 61 | } 62 | 63 | if t ==0: 64 | not_done_masks = torch.zeros(1, 1, dtype=torch.bool, device=depth.device) 65 | else: 66 | not_done_masks = torch.ones(1, 1, dtype=torch.bool, device=depth.device) 67 | 68 | _, actions, _, self.hidden_state = self.actor_critic.act( 69 | batch, 70 | self.hidden_state.to(depth.device), 71 | self.prev_actions.to(depth.device), 72 | not_done_masks, 73 | deterministic=True, 74 | ) 75 | self.prev_actions = torch.clone(actions) 76 | 77 | return actions.item() 78 | 79 | def reset(self): 80 | self.hidden_state = torch.zeros_like(self.hidden_state) 81 | self.prev_actions = torch.zeros_like(self.prev_actions) 82 | 83 | 84 | class SemanticGrid(object): 85 | def __init__(self, batch_size, grid_dim, crop_size, cell_size, spatial_labels, object_labels, device): 86 | self.batch_size = batch_size 87 | self.grid_dim = grid_dim 88 | self.crop_size = crop_size 89 | self.cell_size = cell_size 90 | self.spatial_labels = spatial_labels 91 | self.object_labels = object_labels 92 | self.device = device 93 | 94 | self.crop_start = int((self.grid_dim[0] / 2) - (self.crop_size / 2)) 95 | self.crop_end = int((self.grid_dim[0] / 2) + (self.crop_size / 2)) 96 | 97 | # Transform each ground-projected grid into geocentric coordinates 98 | def spatialTransformer(self, grid, pose, abs_pose): 99 | geo_grid_out = torch.zeros( 100 | (grid.shape[0], grid.shape[1], self.grid_dim[0], self.grid_dim[1]), 101 | dtype=torch.float32, 102 | ) 103 | 104 | init_pose = abs_pose[0, :] 105 | init_rot_mat = torch.tensor( 106 | [ 107 | [torch.cos(init_pose[2]), -torch.sin(init_pose[2])], 108 | [torch.sin(init_pose[2]), torch.cos(init_pose[2])] 109 | ], 110 | dtype=torch.float32, 111 | ) 112 | 113 | for i in range(grid.shape[0]): 114 | pose_step = pose[i, :] 115 | 116 | rel_coord = torch.tensor([pose_step[1], pose_step[0]], dtype=torch.float32) 117 | rel_coord = rel_coord.reshape((2, 1)) 118 | rel_coord = torch.matmul(init_rot_mat, rel_coord) 119 | 120 | goal_grid_pos = torch.tensor([ 121 | round(-rel_coord[1].item() / self.cell_size + 255), 122 | round(-rel_coord[0].item() / self.cell_size + 255), 123 | ]) 124 | 125 | return geo_grid_out, goal_grid_pos 126 | 127 | # Transform a geocentric map back to egocentric view 128 | def rotate_map(self, grid, rel_pose, abs_pose): 129 | ego_grid_out = torch.zeros( 130 | (grid.shape[0], grid.shape[1], self.grid_dim[0], self.grid_dim[1]), 131 | dtype=torch.float32, 132 | ).to(grid.device) 133 | 134 | init_pose = abs_pose[0, :] 135 | init_rot_mat = torch.tensor( 136 | [ 137 | [torch.cos(init_pose[2]), -torch.sin(init_pose[2])], 138 | [torch.sin(init_pose[2]), torch.cos(init_pose[2])] 139 | ], 140 | dtype=torch.float32, 141 | ).to(grid.device) 142 | 143 | for i in range(grid.shape[0]): 144 | rel_pose_step = rel_pose[i, :] 145 | 146 | rel_coord = torch.tensor([rel_pose_step[1], rel_pose_step[0]], dtype=torch.float32).to(grid.device) 147 | rel_coord = rel_coord.reshape((2, 1)) 148 | rel_coord = torch.matmul(init_rot_mat, rel_coord) 149 | 150 | x = -2*(rel_coord[0] / self.cell_size) / (self.grid_dim[0]) 151 | z = -2*(rel_coord[1] / self.cell_size) / (self.grid_dim[1]) 152 | angle = -rel_pose_step[2] 153 | 154 | trans_theta = torch.tensor([[1, -0, x], [0, 1, z]], dtype=torch.float32).unsqueeze(0) 155 | rot_theta = torch.tensor( 156 | [ 157 | [torch.cos(angle), -torch.sin(angle), 0], 158 | [torch.sin(angle), torch.cos(angle), 0] 159 | ], 160 | dtype=torch.float32, 161 | ).unsqueeze(0) 162 | trans_theta = trans_theta.to(grid.device) 163 | rot_theta = 
rot_theta.to(grid.device) 164 | 165 | grid_step = grid[i, :, :, :].unsqueeze(0) 166 | trans_disp_grid = F.affine_grid(trans_theta, grid_step.size(), align_corners=False) 167 | rot_disp_grid = F.affine_grid(rot_theta, grid_step.size(), align_corners=False) 168 | trans_ego_grid = F.grid_sample(grid_step, trans_disp_grid.float(), align_corners=False) 169 | ego_grid = F.grid_sample(trans_ego_grid, rot_disp_grid.float(), align_corners=False) 170 | ego_grid_out[i, :, :, :] = ego_grid 171 | 172 | return ego_grid_out 173 | 174 | 175 | class utils(): 176 | def get_rel_pose(self, pos2, pos1): 177 | x1, y1, o1 = pos1 178 | if len(pos2) == 2: # if pos2 has no rotation 179 | x2, y2 = pos2 180 | dx = x2 - x1 181 | dy = y2 - y1 182 | return dx, dy 183 | else: 184 | x2, y2, o2 = pos2 185 | dx = x2 - x1 186 | dy = y2 - y1 187 | do = o2 - o1 188 | if do < -math.pi: 189 | do += 2 * math.pi 190 | if do > math.pi: 191 | do -= 2 * math.pi 192 | return dx, dy, do 193 | 194 | def discretize_coords(self, x, z, grid_dim, cell_size, translation=0): 195 | map_coords = torch.zeros((len(x), 2)) 196 | xb = torch.floor(x[:]/cell_size) + (grid_dim[0]-1)/2.0 197 | zb = torch.floor(z[:]/cell_size) + (grid_dim[1]-1)/2.0 + translation 198 | xb = xb.int() 199 | zb = zb.int() 200 | map_coords[:,0] = xb 201 | map_coords[:,1] = zb 202 | # keep bin coords within dimensions 203 | map_coords[map_coords > grid_dim[0] - 1] = grid_dim[0] - 1 204 | map_coords[map_coords < 0] = 0 205 | return map_coords.long() 206 | 207 | def get_sim_location(self, agent_state): 208 | x = -agent_state.position[2] 209 | y = -agent_state.position[0] 210 | height = agent_state.position[1] 211 | axis = quaternion.as_euler_angles(agent_state.rotation)[0] 212 | if (axis%(2*np.pi)) < 0.1 or (axis%(2*np.pi)) > 2*np.pi - 0.1: 213 | o = quaternion.as_euler_angles(agent_state.rotation)[1] 214 | else: 215 | o = 2*np.pi - quaternion.as_euler_angles(agent_state.rotation)[1] 216 | if o > np.pi: 217 | o -= 2 * np.pi 218 | pose = x, y, o 219 | return pose, height 220 | 221 | def unravel_index(self, indices, shape): 222 | """Converts flat indices into unraveled coordinates in a target shape. 223 | This is a `torch` implementation of `numpy.unravel_index`. 224 | Args: 225 | indices: A tensor of indices, (*, N). 226 | shape: The targeted shape, (D,). 227 | Returns: 228 | unravel coordinates, (*, N, D). 
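        Example (worked through the loop below): indices = torch.tensor([7]) with
        shape = (2, 4) gives 7 % 4 = 3 and 7 // 4 = 1, so the result is
        tensor([[1, 3]]), matching numpy.unravel_index(7, (2, 4)).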
229 | """ 230 | shape = torch.tensor(shape) 231 | indices = indices % shape.prod() # prevent out-of-bounds indices 232 | 233 | coord = torch.zeros(indices.size() + shape.size(), dtype=int) 234 | 235 | for i, dim in enumerate(reversed(shape)): 236 | coord[..., i] = indices % dim 237 | indices = indices // dim 238 | 239 | return coord.flip(-1) 240 | 241 | def get_coord_pose(self, sg, rel_pose, init_pose, grid_dim, cell_size, device=None): 242 | if isinstance(init_pose, list) or isinstance(init_pose, tuple): 243 | init_pose = torch.tensor(init_pose).unsqueeze(0) 244 | else: 245 | init_pose = init_pose.unsqueeze(0) 246 | 247 | zero_pose = torch.tensor([[0., 0., 0.]]) 248 | if device != None: 249 | init_pose = init_pose.to(device) 250 | zero_pose = zero_pose.to(device) 251 | 252 | zero_coords = self.discretize_coords( 253 | x=zero_pose[:, 0], 254 | z=zero_pose[:, 1], 255 | grid_dim=(grid_dim, grid_dim), 256 | cell_size=cell_size, 257 | ) 258 | 259 | pose_grid = torch.zeros((1, 1, grid_dim, grid_dim), dtype=torch.float32) 260 | pose_grid[0, 0, zero_coords[0,0], zero_coords[0,1]] = 1 261 | 262 | _, goal_grid_pos = sg.spatialTransformer(grid=pose_grid, pose=rel_pose, abs_pose=init_pose) 263 | inds = goal_grid_pos 264 | 265 | pose_coord = torch.zeros((1, 1, 2), dtype=torch.int64) 266 | pose_coord[0, 0, 0] = inds[1] 267 | pose_coord[0, 0, 1] = inds[0] 268 | return pose_coord 269 | 270 | def transform_ego_to_geo(self, ego_point, pose_coords, abs_pose_coords, abs_poses, t): 271 | rel_rot = torch.tensor(abs_poses[0][2]) - torch.tensor(abs_poses[t][2]) 272 | dist_x = (ego_point[0, 0, 0] - pose_coords[0, 0, 0]) 273 | dist_z = (ego_point[0, 0, 1] - pose_coords[0, 0, 1]) 274 | rel_rot_mat = torch.tensor( 275 | [ 276 | [torch.cos(rel_rot), -torch.sin(rel_rot)], 277 | [torch.sin(rel_rot), torch.cos(rel_rot)] 278 | ], 279 | dtype=torch.float32, 280 | ) 281 | dist_vect = torch.tensor([dist_x, dist_z], dtype=torch.float) 282 | dist_vect = dist_vect.reshape((2, 1)) 283 | rot_vect = torch.matmul(rel_rot_mat, dist_vect) 284 | 285 | abs_coords_x = abs_pose_coords[0, 0, 0] + rot_vect[0] 286 | abs_coords_z = abs_pose_coords[0, 0, 1] + rot_vect[1] 287 | abs_coords = torch.tensor([[[abs_coords_x, abs_coords_z]]]) 288 | return abs_coords 289 | -------------------------------------------------------------------------------- /habitat_extensions/measures.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | from typing import Any 4 | 5 | import numpy as np 6 | from dtw import dtw 7 | from fastdtw import fastdtw 8 | from habitat.config import Config 9 | from habitat.core.embodied_task import EmbodiedTask, Measure 10 | from habitat.core.registry import registry 11 | from habitat.core.simulator import Simulator 12 | 13 | 14 | @registry.register_measure 15 | class PathLength(Measure): 16 | r"""Path Length (PL) 17 | 18 | PL = sum(geodesic_distance(agent_prev_position, agent_position) 19 | over all agent positions. 
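    Note: update_metric() accumulates the straight-line (Euclidean) distance
    between consecutive agent positions at each step.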
20 | """ 21 | 22 | def __init__(self, sim: Simulator, config: Config, *args: Any, **kwargs: Any): 23 | self._previous_position = None 24 | self._start_end_episode_distance = None 25 | self._agent_episode_distance = None 26 | self._sim = sim 27 | self._config = config 28 | 29 | super().__init__(**kwargs) 30 | 31 | def reset_metric(self, episode, *args: Any, **kwargs: Any): 32 | self._previous_position = self._sim.get_agent_state().position.tolist() 33 | self._start_end_episode_distance = self._sim.geodesic_distance( 34 | self._previous_position, episode.goals[0].position 35 | ) 36 | self._agent_episode_distance = 0.0 37 | self._metric = None 38 | 39 | def _euclidean_distance(self, position_a, position_b): 40 | return np.linalg.norm(np.array(position_b) - np.array(position_a), ord=2) 41 | 42 | def update_metric(self, episode, action, *args: Any, **kwargs: Any): 43 | current_position = self._sim.get_agent_state().position.tolist() 44 | 45 | distance_to_target = self._sim.geodesic_distance( 46 | current_position, episode.goals[0].position 47 | ) 48 | 49 | self._agent_episode_distance += self._euclidean_distance( 50 | current_position, self._previous_position 51 | ) 52 | 53 | self._previous_position = current_position 54 | 55 | self._metric = self._agent_episode_distance 56 | 57 | @staticmethod 58 | def _get_uuid(*args: Any, **kwargs: Any): 59 | return "path_length" 60 | 61 | 62 | @registry.register_measure 63 | class OracleNavigationError(Measure): 64 | r"""Oracle Navigation Error (ONE) 65 | 66 | ONE = min(geosdesic_distance(agent_pos, goal)) 67 | over all agent_pos in agent path. 68 | 69 | This computes oracle navigation error for every update regardless of 70 | whether or not the end of the episode has been reached. 71 | """ 72 | 73 | def __init__(self, *args: Any, sim: Simulator, config: Config, **kwargs: Any): 74 | self._sim = sim 75 | self._config = config 76 | super().__init__() 77 | 78 | def reset_metric(self, *args: Any, episode, **kwargs: Any): 79 | self._metric = float("inf") 80 | 81 | def update_metric(self, *args: Any, episode, action, **kwargs: Any): 82 | current_position = self._sim.get_agent_state().position.tolist() 83 | distance_to_target = self._sim.geodesic_distance( 84 | current_position, episode.goals[0].position 85 | ) 86 | if distance_to_target < self._metric: 87 | self._metric = distance_to_target 88 | 89 | @staticmethod 90 | def _get_uuid(*args: Any, **kwargs: Any): 91 | return "oracle_navigation_error" 92 | 93 | 94 | @registry.register_measure 95 | class OracleSuccess(Measure): 96 | r"""Oracle Success Rate (OSR) 97 | 98 | OSR = I(ONE <= goal_radius), 99 | where ONE is Oracle Navigation Error. 
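    Note: the metric latches to 1 the first time the agent comes within
    SUCCESS_DISTANCE of the goal and keeps that value for the rest of the episode.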
100 | """ 101 | 102 | def __init__(self, *args: Any, sim: Simulator, config: Config, **kwargs: Any): 103 | self._sim = sim 104 | self._config = config 105 | super().__init__() 106 | 107 | def reset_metric(self, *args: Any, episode, **kwargs: Any): 108 | self._metric = 0 109 | 110 | def update_metric( 111 | self, *args: Any, episode, action, task: EmbodiedTask, **kwargs: Any 112 | ): 113 | if self._metric: 114 | # skip, already had oracle success 115 | return 116 | 117 | current_position = self._sim.get_agent_state().position.tolist() 118 | distance_to_target = self._sim.geodesic_distance( 119 | current_position, episode.goals[0].position 120 | ) 121 | 122 | if distance_to_target < self._config.SUCCESS_DISTANCE: 123 | self._metric = 1 124 | 125 | @staticmethod 126 | def _get_uuid(*args: Any, **kwargs: Any): 127 | return "oracle_success" 128 | 129 | 130 | @registry.register_measure 131 | class OracleSPL(Measure): 132 | r"""OracleSPL (Oracle Success weighted by Path Length) 133 | 134 | OracleSPL = max(SPL) over all points in the agent path 135 | """ 136 | 137 | def __init__(self, *args: Any, sim: Simulator, config: Config, **kwargs: Any): 138 | self._previous_position = None 139 | self._start_end_episode_distance = None 140 | self._agent_episode_distance = None 141 | self._ep_success = None 142 | self._sim = sim 143 | self._config = config 144 | super().__init__() 145 | 146 | def reset_metric(self, *args: Any, episode, **kwargs: Any): 147 | self._previous_position = self._sim.get_agent_state().position.tolist() 148 | self._start_end_episode_distance = episode.info["geodesic_distance"] 149 | self._agent_episode_distance = 0.0 150 | self._ep_success = 0 151 | self._metric = 0.0 152 | 153 | def _euclidean_distance(self, position_a, position_b): 154 | return np.linalg.norm(np.array(position_b) - np.array(position_a), ord=2) 155 | 156 | def update_metric( 157 | self, *args: Any, episode, action, task: EmbodiedTask, **kwargs: Any 158 | ): 159 | if self._ep_success: # shortest path already found 160 | return 161 | 162 | current_position = self._sim.get_agent_state().position.tolist() 163 | 164 | self._agent_episode_distance += self._euclidean_distance( 165 | current_position, self._previous_position 166 | ) 167 | self._previous_position = current_position 168 | 169 | distance_to_target = self._sim.geodesic_distance( 170 | current_position, episode.goals[0].position 171 | ) 172 | if distance_to_target < self._config.SUCCESS_DISTANCE: 173 | self._ep_success = 1 174 | self._metric = self._ep_success * ( 175 | self._start_end_episode_distance 176 | / max(self._start_end_episode_distance, self._agent_episode_distance) 177 | ) 178 | 179 | @staticmethod 180 | def _get_uuid(*args: Any, **kwargs: Any): 181 | return "oracle_spl" 182 | 183 | 184 | @registry.register_measure 185 | class StepsTaken(Measure): 186 | r"""Counts the number of times update_metric() is called. This is equal to 187 | the number of times that the agent takes an action. STOP counts as an 188 | action. 
189 | """ 190 | 191 | def __init__(self, *args: Any, sim: Simulator, config: Config, **kwargs: Any): 192 | self._sim = sim 193 | self._config = config 194 | self._metric = 0 195 | super().__init__() 196 | 197 | def reset_metric(self, *args: Any, episode, **kwargs: Any): 198 | self._metric = 0 199 | 200 | def update_metric( 201 | self, *args: Any, episode, action, task: EmbodiedTask, **kwargs: Any 202 | ): 203 | self._metric += 1 204 | 205 | @staticmethod 206 | def _get_uuid(*args: Any, **kwargs: Any): 207 | return "steps_taken" 208 | 209 | 210 | @registry.register_measure 211 | class NDTW(Measure): 212 | r"""NDTW (Normalized Dynamic Time Warping) 213 | 214 | ref: Effective and General Evaluation for Instruction 215 | Conditioned Navigation using Dynamic Time 216 | Warping - Magalhaes et. al 217 | https://arxiv.org/pdf/1907.05446.pdf 218 | """ 219 | 220 | def __init__(self, *args: Any, sim: Simulator, config: Config, **kwargs: Any): 221 | self._sim = sim 222 | self._config = config 223 | self.locations = [] 224 | self.gt_locations = [] 225 | self.dtw_func = fastdtw if config.FDTW else dtw 226 | 227 | gt_path = config.GT_PATH.format(split=config.SPLIT) 228 | with gzip.open(gt_path, "rt") as f: 229 | self.gt_json = json.load(f) 230 | super().__init__() 231 | 232 | @staticmethod 233 | def _get_uuid(*args: Any, **kwargs: Any): 234 | return "ndtw" 235 | 236 | def reset_metric(self, *args: Any, episode, **kwargs: Any): 237 | self.locations.clear() 238 | self.gt_locations = self.gt_json[str(episode.episode_id)]["locations"] 239 | self._metric = None 240 | 241 | def _euclidean_distance(self, position_a, position_b): 242 | return np.linalg.norm(np.array(position_b) - np.array(position_a), ord=2) 243 | 244 | def update_metric( 245 | self, *args: Any, episode, action, task: EmbodiedTask, **kwargs: Any 246 | ): 247 | current_position = self._sim.get_agent_state().position.tolist() 248 | if len(self.locations) == 0: 249 | self.locations.append(current_position) 250 | else: 251 | if current_position == self.locations[-1]: 252 | return 253 | self.locations.append(current_position) 254 | 255 | dtw_distance = self.dtw_func( 256 | self.locations, self.gt_locations, dist=self._euclidean_distance 257 | )[0] 258 | 259 | nDTW = np.exp( 260 | -dtw_distance / (len(self.gt_locations) * self._config.SUCCESS_DISTANCE) 261 | ) 262 | self._metric = nDTW 263 | 264 | 265 | @registry.register_measure 266 | class SDTW(Measure): 267 | r"""SDTW (Success Weighted be nDTW) 268 | 269 | ref: Effective and General Evaluation for Instruction 270 | Conditioned Navigation using Dynamic Time 271 | Warping - Magalhaes et. 
al 272 | https://arxiv.org/pdf/1907.05446.pdf 273 | """ 274 | 275 | def __init__(self, *args: Any, sim: Simulator, config: Config, **kwargs: Any): 276 | self._sim = sim 277 | self._config = config 278 | self.locations = [] 279 | self.gt_locations = [] 280 | self.dtw_func = fastdtw if config.FDTW else dtw 281 | 282 | gt_path = config.GT_PATH.format(split=config.SPLIT) 283 | with gzip.open(gt_path, "rt") as f: 284 | self.gt_json = json.load(f) 285 | super().__init__() 286 | 287 | @staticmethod 288 | def _get_uuid(*args: Any, **kwargs: Any): 289 | return "sdtw" 290 | 291 | def reset_metric(self, *args: Any, episode, **kwargs: Any): 292 | self.locations.clear() 293 | self.gt_locations = self.gt_json[str(episode.episode_id)]["locations"] 294 | self._metric = None 295 | 296 | def _euclidean_distance(self, position_a, position_b): 297 | return np.linalg.norm(np.array(position_b) - np.array(position_a), ord=2) 298 | 299 | def update_metric( 300 | self, *args: Any, episode, action, task: EmbodiedTask, **kwargs: Any 301 | ): 302 | current_position = self._sim.get_agent_state().position.tolist() 303 | if len(self.locations) == 0: 304 | self.locations.append(current_position) 305 | else: 306 | if current_position != self.locations[-1]: 307 | self.locations.append(current_position) 308 | 309 | dtw_distance = self.dtw_func( 310 | self.locations, self.gt_locations, dist=self._euclidean_distance 311 | )[0] 312 | 313 | nDTW = np.exp( 314 | -dtw_distance / (len(self.gt_locations) * self._config.SUCCESS_DISTANCE) 315 | ) 316 | 317 | distance_to_target = self._sim.geodesic_distance( 318 | current_position, episode.goals[0].position 319 | ) 320 | if task.is_stop_called and distance_to_target < self._config.SUCCESS_DISTANCE: 321 | ep_success = 1 322 | else: 323 | ep_success = 0 324 | 325 | self._metric = ep_success * nDTW 326 | -------------------------------------------------------------------------------- /vlnce_baselines/common/rgb_mapping.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | from einops import rearrange 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import torch_scatter 9 | 10 | 11 | class Mapping(nn.Module): 12 | def __init__(self, model_config): 13 | super().__init__() 14 | self.device = torch.device("cuda", model_config.gpu_id) 15 | self.num_proc = model_config.num_proc 16 | 17 | self.resolution = model_config.resolution 18 | self.egocentric_map_size = model_config.egocentric_map_size 19 | self.global_map_size = model_config.global_map_size 20 | self.global_map_depth = model_config.map_depth 21 | coordinate_min = - self.global_map_size * self.resolution / 2 22 | coordinate_max = self.global_map_size * self.resolution / 2 23 | 24 | self.to_grid = to_grid(self.global_map_size, coordinate_min, coordinate_max) 25 | self.rotate_tensor = RotateTensor(self.device) 26 | 27 | self.projection = Projection(self.egocentric_map_size, self.global_map_size, self.device, coordinate_min, coordinate_max) 28 | 29 | self.full_global_map = torch.zeros(self.num_proc, self.global_map_size, self.global_map_size, self.global_map_depth, device=self.device) 30 | self.agent_view = torch.zeros(self.num_proc, self.global_map_depth, self.global_map_size, self.global_map_size, device=self.device) 31 | 32 | def project_feat_to_map(self, features, full_global_map, observations, masks): 33 | bs = features.shape[0] 34 | grid_x, grid_y = self.to_grid.get_grid_coords(observations['gps']) 35 | full_global_map[:bs, 
:, :, :] = full_global_map[:bs, :, :, :] * masks.unsqueeze(1).unsqueeze(1) 36 | 37 | proj_sem = self.projection.forward(features, observations['depth'] * 10, -(observations["compass"])) 38 | projection = torch.cat([proj_sem], dim=1) 39 | 40 | agent_view = self.agent_view[:bs] * 0 41 | agent_view[:, :, 42 | self.global_map_size//2 - math.floor(self.egocentric_map_size/2): self.global_map_size//2 + math.ceil(self.egocentric_map_size/2), 43 | self.global_map_size//2 - math.floor(self.egocentric_map_size/2): self.global_map_size//2 + math.ceil(self.egocentric_map_size/2) 44 | ] = projection 45 | st_pose = torch.cat( 46 | [ 47 | -(grid_y.unsqueeze(1) - (self.global_map_size//2)) / (self.global_map_size//2), 48 | -(grid_x.unsqueeze(1) - (self.global_map_size//2)) / (self.global_map_size//2), 49 | torch.zeros_like(observations['compass']), 50 | ], dim=1 51 | ) 52 | _, trans_mat = get_grid(st_pose, agent_view.size(), self.device) 53 | translated = F.grid_sample(agent_view, trans_mat) 54 | 55 | fusion_map = torch.cat([full_global_map[:bs, :, :, :].unsqueeze(1), translated.permute(0, 2, 3, 1).unsqueeze(1)], dim=1) 56 | full_global_map[:bs, :, :, :], _ = torch.max(fusion_map, dim=1) 57 | st_pose_retrieval = torch.cat( 58 | [ 59 | (grid_y.unsqueeze(1) - (self.global_map_size//2)) / (self.global_map_size//2), 60 | (grid_x.unsqueeze(1) - (self.global_map_size//2)) / (self.global_map_size//2), 61 | torch.zeros_like(observations['compass']), 62 | ], dim=1 63 | ) 64 | _, trans_mat_retrieval = get_grid(st_pose_retrieval, agent_view.size(), self.device) 65 | translated_retrieval = F.grid_sample(full_global_map[:bs, :, :, :].permute(0, 3, 1, 2).contiguous(), trans_mat_retrieval) 66 | translated_retrieval = translated_retrieval[:, :, 67 | self.global_map_size//2 - math.floor(self.egocentric_map_size/2): self.global_map_size//2 + math.ceil(self.egocentric_map_size/2), 68 | self.global_map_size//2 - math.floor(self.egocentric_map_size/2): self.global_map_size//2 + math.ceil(self.egocentric_map_size/2) 69 | ] 70 | final_retrieval = self.rotate_tensor.forward(translated_retrieval, observations["compass"]) 71 | 72 | return final_retrieval, full_global_map 73 | 74 | 75 | class RGBMapping(Mapping): 76 | def __init__(self, model_config): 77 | super().__init__(model_config) 78 | 79 | def forward(self, rgb_features, observations, masks): 80 | if 'rgb_ego_map' not in observations: 81 | bs, c, h, w = rgb_features.shape 82 | rgb_features = rgb_features.permute(0, 2, 3, 1).reshape(bs, -1, c) # [bs, hxw, c] 83 | rgb_features = torch.nn.functional.adaptive_max_pool1d(rgb_features, self.global_map_depth) # [bs, hxw, self.global_map_depth] 84 | rgb_features = rgb_features.reshape(bs, h, w, -1).permute(0, 3, 1, 2) # [bs, self.global_map_depth, h, w] 85 | final_retrieval, self.full_global_map = self.project_feat_to_map(rgb_features, self.full_global_map, observations, masks) 86 | observations['rgb_ego_map'] = final_retrieval 87 | else: 88 | final_retrieval = observations['rgb_ego_map'] 89 | 90 | return final_retrieval 91 | 92 | 93 | class to_grid(): 94 | def __init__(self, global_map_size, coordinate_min, coordinate_max): 95 | self.global_map_size = global_map_size 96 | self.coordinate_min = coordinate_min 97 | self.coordinate_max = coordinate_max 98 | self.grid_size = (coordinate_max - coordinate_min) / global_map_size 99 | 100 | def get_grid_coords(self, positions): 101 | grid_x = ((self.coordinate_max - positions[:, 0]) / self.grid_size).round() 102 | grid_y = ((positions[:, 1] - self.coordinate_min) / self.grid_size).round() 
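        # Illustrative note: with the symmetric range used by Mapping
        # (coordinate_min = -coordinate_max), grid_size equals the map resolution,
        # so a GPS reading of (0, 0) lands on the centre cell
        # (global_map_size / 2, global_map_size / 2); grid_x grows as the agent
        # moves towards coordinate_min, grid_y towards coordinate_max.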
103 | return grid_x, grid_y 104 | 105 | 106 | def get_grid(pose, grid_size, device): 107 | """ 108 | Input: 109 | `pose` FloatTensor(bs, 3) 110 | `grid_size` 4-tuple (bs, _, grid_h, grid_w) 111 | `device` torch.device (cpu or gpu) 112 | Output: 113 | `rot_grid` FloatTensor(bs, grid_h, grid_w, 2) 114 | `trans_grid` FloatTensor(bs, grid_h, grid_w, 2) 115 | """ 116 | pose = pose.float() 117 | x = pose[:, 0] 118 | y = pose[:, 1] 119 | t = pose[:, 2] 120 | 121 | cos_t = t.cos() 122 | sin_t = t.sin() 123 | 124 | theta11 = torch.stack([cos_t, -sin_t, 125 | torch.zeros(cos_t.shape).float().to(device)], 1) 126 | theta12 = torch.stack([sin_t, cos_t, 127 | torch.zeros(cos_t.shape).float().to(device)], 1) 128 | theta1 = torch.stack([theta11, theta12], 1) 129 | 130 | theta21 = torch.stack([torch.ones(x.shape).to(device), 131 | -torch.zeros(x.shape).to(device), x], 1) 132 | theta22 = torch.stack([torch.zeros(x.shape).to(device), 133 | torch.ones(x.shape).to(device), y], 1) 134 | theta2 = torch.stack([theta21, theta22], 1) 135 | 136 | rot_grid = F.affine_grid(theta1, torch.Size(grid_size)) 137 | trans_grid = F.affine_grid(theta2, torch.Size(grid_size)) 138 | 139 | return rot_grid, trans_grid 140 | 141 | 142 | class ComputeSpatialLocs(): 143 | def __init__(self, egocentric_map_size, global_map_size, device, coordinate_min, coordinate_max): 144 | self.device = device 145 | self.egocentric_map_size = egocentric_map_size 146 | self.local_scale = float(coordinate_max - coordinate_min) / float(global_map_size) 147 | 148 | def get_camera_matrix(self, imh, imw, fov): 149 | self.cx, self.cy = imh / 2., imw / 2. 150 | self.fx = (imh / 2.) / np.tan(np.deg2rad(fov / 2.)) 151 | self.fy = (imw / 2.) / np.tan(np.deg2rad(fov / 2.)) 152 | 153 | def forward(self, depth): 154 | depth = depth.permute(0, 3, 1, 2) 155 | _, _, imh, imw = depth.shape # batchsize, 1, imh, imw 156 | 157 | self.get_camera_matrix(imh, imw, 90) 158 | 159 | x = rearrange(torch.arange(0, imw), 'w -> () () () w').to(self.device) 160 | y = rearrange(torch.arange(imh, 0, step=-1), 'h -> () () h ()').to(self.device) 161 | xx = (x - self.cx) / self.fx 162 | yy = (y - self.cy) / self.fy 163 | 164 | # 3D real-world coordinates (in meters) 165 | Z = depth 166 | X = xx * Z 167 | Y = yy * Z 168 | 169 | # Valid inputs 170 | valid_inputs = (depth != 0) & ((Y > -1.5) & (Y < 0.1)) 171 | 172 | # X ground projection and Y ground projection 173 | x_gp = ((X / self.local_scale) + (self.egocentric_map_size - 1) / 2).round().long() # (bs, imh, imw, 1) 174 | y_gp = (-(Z / self.local_scale) + (self.egocentric_map_size - 1) / 2).round().long() # (bs, imh, imw, 1) 175 | 176 | return torch.cat([x_gp, y_gp], dim=1), valid_inputs 177 | 178 | 179 | class ProjectToGroundPlane(): 180 | def __init__(self, egocentric_map_size, device): 181 | self.egocentric_map_size = egocentric_map_size 182 | self.device = device 183 | 184 | def forward(self, conv, spatial_locs, valid_inputs): 185 | outh, outw = (self.egocentric_map_size, self.egocentric_map_size) 186 | bs, f, HbyK, WbyK = conv.shape 187 | eps = -1e16 188 | depth_h = spatial_locs.shape[-1] 189 | K = depth_h / WbyK # Hardcoded value of K 190 | 191 | # Sub-sample spatial_locs, valid_inputs according to img_feats resolution. 
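        # Note: K is the stride between the depth image and the feature map. With
        # hypothetical sizes, a 256-pixel-wide depth map and a 32x32 conv feature
        # map give K = 8, so every 8th pixel's projected location is kept below.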
192 | idxes_ss = ((torch.arange(0, HbyK, 1) * K).long().to(self.device), \ 193 | (torch.arange(0, WbyK, 1) * K).long().to(self.device)) 194 | 195 | spatial_locs_ss = spatial_locs[:, :, idxes_ss[0][:, None], idxes_ss[1]] # (bs, 2, HbyK, WbyK) 196 | valid_inputs_ss = valid_inputs[:, :, idxes_ss[0][:, None], idxes_ss[1]] # (bs, 1, HbyK, WbyK) 197 | valid_inputs_ss = valid_inputs_ss.squeeze(1) # (bs, HbyK, WbyK) 198 | invalid_inputs_ss = ~valid_inputs_ss 199 | 200 | # Filter out invalid spatial locations 201 | invalid_spatial_locs = (spatial_locs_ss[:, 1] >= outh) | (spatial_locs_ss[:, 1] < 0) | \ 202 | (spatial_locs_ss[:, 0] >= outw) | (spatial_locs_ss[:, 0] < 0) # (bs, H, W) 203 | 204 | invalid_writes = invalid_spatial_locs | invalid_inputs_ss 205 | 206 | # Set the idxes for all invalid locations to (0, 0) 207 | spatial_locs_ss[:, 0][invalid_writes] = 0 208 | spatial_locs_ss[:, 1][invalid_writes] = 0 209 | 210 | # Weird hack to account for max-pooling negative feature values 211 | invalid_writes_f = rearrange(invalid_writes, 'b h w -> b () h w').float() 212 | conv_masked = conv * (1 - invalid_writes_f) + eps * invalid_writes_f 213 | conv_masked = rearrange(conv_masked, 'b e h w -> b e (h w)') 214 | 215 | # Linearize ground-plane indices (linear idx = y * W + x) 216 | linear_locs_ss = spatial_locs_ss[:, 1] * outw + spatial_locs_ss[:, 0] # (bs, H, W) 217 | linear_locs_ss = rearrange(linear_locs_ss, 'b h w -> b () (h w)') 218 | linear_locs_ss = linear_locs_ss.expand(-1, f, -1) # .contiguous() 219 | 220 | proj_feats, _ = torch_scatter.scatter_max( 221 | conv_masked, 222 | linear_locs_ss, 223 | dim=2, 224 | dim_size=outh * outw, 225 | ) 226 | proj_feats = rearrange(proj_feats, 'b e (h w) -> b e h w', h=outh) 227 | 228 | # Replace invalid features with zeros 229 | eps_mask = (proj_feats == eps).float() 230 | proj_feats = proj_feats * (1 - eps_mask) + eps_mask * (proj_feats - eps) 231 | 232 | return proj_feats 233 | 234 | 235 | class RotateTensor: 236 | def __init__(self, device): 237 | self.device = device 238 | 239 | def forward(self, x_gp, heading): 240 | sin_t = torch.sin(heading.squeeze(1)) 241 | cos_t = torch.cos(heading.squeeze(1)) 242 | A = torch.zeros(x_gp.size(0), 2, 3).to(self.device) 243 | A[:, 0, 0] = cos_t 244 | A[:, 0, 1] = sin_t 245 | A[:, 1, 0] = -sin_t 246 | A[:, 1, 1] = cos_t 247 | 248 | grid = F.affine_grid(A, x_gp.size()) 249 | rotated_x_gp = F.grid_sample(x_gp, grid) 250 | return rotated_x_gp 251 | 252 | 253 | class Projection: 254 | def __init__(self, egocentric_map_size, global_map_size, device, coordinate_min, coordinate_max): 255 | self.egocentric_map_size = egocentric_map_size 256 | self.global_map_size = global_map_size 257 | self.compute_spatial_locs = ComputeSpatialLocs( 258 | egocentric_map_size, global_map_size, 259 | device, coordinate_min, coordinate_max 260 | ) 261 | self.project_to_ground_plane = ProjectToGroundPlane(egocentric_map_size, device) 262 | self.rotate_tensor = RotateTensor(device) 263 | 264 | def forward(self, conv, depth, heading): 265 | spatial_locs, valid_inputs = self.compute_spatial_locs.forward(depth) 266 | x_gp = self.project_to_ground_plane.forward(conv, spatial_locs, valid_inputs) 267 | rotated_x_gp = self.rotate_tensor.forward(x_gp, heading) 268 | return rotated_x_gp 269 | -------------------------------------------------------------------------------- /habitat_extensions/sensors.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import gzip 4 | import json 5 | import numpy 
as np 6 | from gym import spaces 7 | from typing import Any 8 | 9 | import torch 10 | import torch.nn.functional as F 11 | 12 | from habitat.config import Config 13 | from habitat.core.registry import registry 14 | from habitat.core.simulator import Sensor, SensorTypes, Simulator 15 | from habitat.sims.habitat_simulator.actions import HabitatSimActions 16 | from habitat.tasks.nav.shortest_path_follower import ShortestPathFollower 17 | from habitat.tasks.utils import cartesian_to_polar 18 | from habitat.utils.geometry_utils import quaternion_rotate_vector 19 | from habitat.utils.visualizations import maps 20 | from habitat_extensions.shortest_path_follower import ShortestPathFollowerCompat 21 | 22 | from vlnce_baselines.common.rgb_mapping import get_grid 23 | from vlnce_baselines.common.action_maker import TransfomationRealworldAgent 24 | 25 | 26 | @registry.register_sensor 27 | class VLNOracleActionSensor(Sensor): 28 | """Sensor for observing the optimal action to take. The assumption this 29 | sensor currently makes is that the shortest path to the goal is the 30 | optimal path. 31 | Args: 32 | sim: reference to the simulator for calculating task observations. 33 | config: config for the sensor. 34 | """ 35 | def __init__(self, sim: Simulator, config: Config, *args: Any, **kwargs: Any): 36 | super().__init__(config=config) 37 | 38 | # all goals can be navigated to within 0.5m. 39 | goal_radius = getattr(config, "GOAL_RADIUS", 0.5) 40 | if config.USE_ORIGINAL_FOLLOWER: 41 | self.follower = ShortestPathFollowerCompat( 42 | sim, goal_radius, return_one_hot=False 43 | ) 44 | self.follower.mode = "geodesic_path" 45 | else: 46 | self.follower = ShortestPathFollower(sim, goal_radius, return_one_hot=False) 47 | 48 | def _get_uuid(self, *args: Any, **kwargs: Any): 49 | return "vln_oracle_action_sensor" 50 | 51 | def _get_sensor_type(self, *args: Any, **kwargs: Any): 52 | return SensorTypes.TACTILE 53 | 54 | def _get_observation_space(self, *args: Any, **kwargs: Any): 55 | return spaces.Box(low=0.0, high=100, shape=(1,), dtype=np.float) 56 | 57 | def get_observation(self, observations, *args: Any, episode, **kwargs: Any): 58 | best_action = self.follower.get_next_action(episode.goals[0].position) 59 | return np.array( 60 | [best_action if best_action is not None else HabitatSimActions.STOP] 61 | ) 62 | 63 | 64 | @registry.register_sensor 65 | class VLNOracleProgressSensor(Sensor): 66 | """Sensor for observing how much progress has been made towards the goal. 67 | Args: 68 | sim: reference to the simulator for calculating task observations. 69 | config: config for the sensor. 70 | """ 71 | def __init__(self, sim: Simulator, config: Config, *args: Any, **kwargs: Any): 72 | self._sim = sim 73 | super().__init__(config=config) 74 | 75 | def _get_uuid(self, *args: Any, **kwargs: Any): 76 | return "progress" 77 | 78 | def _get_sensor_type(self, *args: Any, **kwargs: Any): 79 | # TODO: what is the correct sensor type? 
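        # Note: MEASUREMENT seems a reasonable fit, since progress is a scalar
        # derived from geodesic distances rather than a physical sensor reading.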
80 | return SensorTypes.MEASUREMENT 81 | 82 | def _get_observation_space(self, *args: Any, **kwargs: Any): 83 | return spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float) 84 | 85 | def get_observation(self, observations, *args: Any, episode, **kwargs: Any): 86 | current_position = self._sim.get_agent_state().position.tolist() 87 | 88 | distance_to_target = self._sim.geodesic_distance( 89 | current_position, episode.goals[0].position 90 | ) 91 | 92 | distance_from_start = episode.info["geodesic_distance"] 93 | progress = (distance_from_start - distance_to_target) / distance_from_start 94 | return np.array([progress]) 95 | 96 | 97 | @registry.register_sensor 98 | class VLNOracleWaypointSensor(Sensor): 99 | """Sensor for waypoint towards the goal. 100 | Args: 101 | sim: reference to the simulator for calculating task observations. 102 | config: config for the sensor. 103 | """ 104 | def __init__(self, sim: Simulator, config: Config, *args: Any, **kwargs: Any): 105 | super().__init__(config=config) 106 | self._coordinate_min = maps.COORDINATE_MIN 107 | self._coordinate_max = maps.COORDINATE_MAX 108 | self._map_resolution = (config.MAP_RESOLUTION, config.MAP_RESOLUTION) 109 | self._map_size = config.MAP_SIZE 110 | 111 | goal_radius = getattr(config, "GOAL_RADIUS", 0.5) 112 | if config.USE_ORIGINAL_FOLLOWER: 113 | self.follower = ShortestPathFollowerCompat( 114 | sim, goal_radius, return_one_hot=False 115 | ) 116 | self.follower.mode = "geodesic_path" 117 | else: 118 | self.follower = ShortestPathFollower(sim, goal_radius, return_one_hot=False) 119 | self._sim = sim 120 | 121 | self.use_law = config.LAW.USE 122 | gt_path = config.LAW.GT_PATH.format(split=config.LAW.SPLIT) 123 | with gzip.open(gt_path, "rt") as f: 124 | self.gt_waypoint_locations = json.load(f) 125 | self.is_sparse = config.LAW.IS_SPARSE 126 | self.num_inter_waypoints = config.LAW.NUM_WAYPOINTS 127 | 128 | def _get_uuid(self, *args: Any, **kwargs: Any): 129 | return "waypoint" 130 | 131 | def _get_sensor_type(self, *args: Any, **kwargs: Any): 132 | return SensorTypes.TACTILE 133 | 134 | def _get_observation_space(self, *args: Any, **kwargs: Any): 135 | return spaces.Box(low=0.0, high=100, shape=(4,), dtype=np.float) 136 | 137 | def get_observation(self, observations, *args: Any, episode, **kwargs: Any): 138 | agent_position = self._sim.get_agent_state().position 139 | if self.use_law: 140 | goal_pos = self.get_goal(episode) 141 | else: 142 | goal_pos = episode.goals[0].position 143 | points = self._sim.get_straight_shortest_path_points(agent_position, goal_pos) 144 | if len(points) < 2: 145 | return None 146 | 147 | waypoint = self.get_waypoint(points) 148 | 149 | self.trans_tool = TransfomationRealworldAgent(self._sim.get_agent_state()) 150 | wp_a = self.trans_tool.realworld2agent(waypoint) 151 | 152 | resolution = (self._coordinate_max - self._coordinate_min) / self._map_resolution[0] 153 | wp_ego_x = (wp_a[0] / resolution).astype(np.int) 154 | wp_ego_y = (-wp_a[2] / resolution).astype(np.int) 155 | wp_norm_x = wp_ego_x / (self._map_size // 2) 156 | wp_norm_y = wp_ego_y / (self._map_size // 2) 157 | 158 | return np.array([wp_norm_x, wp_norm_y]) 159 | 160 | def get_goal(self, episode): 161 | if self.num_inter_waypoints > 0: 162 | locs = self.gt_waypoint_locations[str(episode.episode_id)]["locations"] 163 | ep_path_length = self._sim.geodesic_distance(locs[0], episode.goals[0].position) 164 | 165 | way_locations = [locs[0]] 166 | count = 0 167 | dist = ep_path_length / (self.num_inter_waypoints+1) 168 | for way in 
locs[:-1]: 169 | d = self._sim.geodesic_distance(locs[0], way) 170 | if d >= dist: 171 | way_locations.append(way) 172 | if count >= (self.num_inter_waypoints-1): 173 | break 174 | count += 1 175 | dist += ep_path_length / (self.num_inter_waypoints+1) 176 | 177 | way_locations.append(episode.goals[0].position) 178 | else: 179 | if self.is_sparse: 180 | # Sparse supervision of waypoints 181 | way_locations = episode.reference_path 182 | else: 183 | # Dense supervision of waypoints 184 | way_locations = self.gt_waypoint_locations[str(episode.episode_id)]["locations"] 185 | 186 | current_position = self._sim.get_agent_state().position.tolist() 187 | nearest_dist = float("inf") 188 | nearest_way = way_locations[-1] 189 | 190 | for ind, way in reversed(list(enumerate(way_locations))): 191 | distance_to_way = self._sim.geodesic_distance(current_position, way) 192 | 193 | if distance_to_way >= 3.0 and distance_to_way < nearest_dist: 194 | dist_way_to_goal = self._sim.geodesic_distance(way, episode.goals[0].position) 195 | dist_agent_to_goal = self._sim.geodesic_distance(current_position, episode.goals[0].position) 196 | 197 | if dist_agent_to_goal > dist_way_to_goal: 198 | nearest_dist = distance_to_way 199 | nearest_way = way 200 | 201 | return nearest_way 202 | 203 | def get_waypoint(self, points): 204 | path_line = np.zeros(self._map_resolution, dtype=np.uint8) 205 | for index in range(len(points) - 1): 206 | x_t_1, y_t_1 = maps.to_grid( 207 | points[index][0], points[index][2], 208 | self._coordinate_min, self._coordinate_max, self._map_resolution, 209 | ) 210 | x_t_2, y_t_2 = maps.to_grid( 211 | points[index + 1][0], points[index + 1][2], 212 | self._coordinate_min, self._coordinate_max, self._map_resolution, 213 | ) 214 | cv2.line(path_line, (y_t_1, x_t_1), (y_t_2, x_t_2), 255, 1) 215 | 216 | agent_position = self._sim.get_agent_state().position 217 | a_x, a_y = maps.to_grid( 218 | agent_position[0], 219 | agent_position[2], 220 | self._coordinate_min, 221 | self._coordinate_max, 222 | self._map_resolution, 223 | ) 224 | fog_line = np.zeros(self._map_resolution, dtype=np.uint8) 225 | cv2.circle(fog_line, (a_y, a_x), 20, 255, 2) 226 | 227 | searched = [] 228 | def search(point): 229 | searched.append([point[0], point[1]]) 230 | if fog_line[point[0], point[1]]: 231 | return point 232 | for p in [(-1,0), (0,-1), (1,0), (0,1), (-1,-1), (1,-1), (1,1), (-1,1)]: 233 | if path_line[point[0]+p[0], point[1]+p[1]] and [point[0]+p[0], point[1]+p[1]] not in searched: 234 | s_point = search([point[0]+p[0], point[1]+p[1]]) 235 | if s_point is None: 236 | continue 237 | return s_point 238 | 239 | cross_line = np.where((path_line & fog_line) != 0) 240 | if cross_line[0].shape[0] > 0: 241 | frontier = search([a_x, a_y]) 242 | if frontier is None: 243 | frontier = [cross_line[0][0], cross_line[1][0]] 244 | frontier = maps.from_grid( 245 | frontier[0], frontier[1], 246 | self._coordinate_min, 247 | self._coordinate_max, 248 | self._map_resolution, 249 | ) 250 | else: 251 | frontier = [points[-1][0], points[-1][2]] 252 | 253 | waypoint = np.array([frontier[0], points[0][1], frontier[1]]) 254 | return waypoint 255 | 256 | 257 | @registry.register_sensor 258 | class VLNOraclePathSensor(Sensor): 259 | def __init__(self, sim: Simulator, config: Config, *args: Any, **kwargs: Any): 260 | super().__init__(config=config) 261 | self._sim = sim 262 | self._coordinate_min = maps.COORDINATE_MIN 263 | self._coordinate_max = maps.COORDINATE_MAX 264 | self._map_resolution = (config.MAP_RESOLUTION, config.MAP_RESOLUTION) 265 | 
self._map_size = config.MAP_SIZE 266 | 267 | def _get_uuid(self, *args: Any, **kwargs: Any): 268 | return "gt_path" 269 | 270 | def _get_sensor_type(self, *args: Any, **kwargs: Any): 271 | return SensorTypes.TACTILE 272 | 273 | def _get_observation_space(self, *args: Any, **kwargs: Any): 274 | return spaces.Box(low=0.0, high=1.0, shape=(100, 100), dtype=np.float) 275 | 276 | def get_observation(self, observations, *args: Any, episode, **kwargs: Any): 277 | agent_position = self._sim.get_agent_state().position 278 | goal_pos = episode.goals[0].position 279 | points = self._sim.get_straight_shortest_path_points(agent_position, goal_pos) 280 | if len(points) < 2: 281 | return None 282 | gt_path = self.get_gt_path(points) 283 | return gt_path 284 | 285 | def get_gt_path(self, points): 286 | path_line = np.zeros([self._map_size, self._map_size]) 287 | self.trans_tool = TransfomationRealworldAgent(self._sim.get_agent_state()) 288 | 289 | for index in range(len(points) - 1): 290 | resolution = (self._coordinate_max - self._coordinate_min) / self._map_resolution[0] 291 | 292 | a1 = self.trans_tool.realworld2agent(points[index]) 293 | x_t_1 = (a1[2] / resolution + self._map_size // 2).astype(np.int) 294 | y_t_1 = (a1[0] / resolution + self._map_size // 2).astype(np.int) 295 | 296 | a2 = self.trans_tool.realworld2agent(points[index + 1]) 297 | x_t_2 = (a2[2] / resolution + self._map_size // 2).astype(np.int) 298 | y_t_2 = (a2[0] / resolution + self._map_size // 2).astype(np.int) 299 | 300 | cv2.line(path_line, (y_t_1, x_t_1), (y_t_2, x_t_2), 255, self.config.LINE_WIDTH) 301 | 302 | waypoint_dis = path_line / 255 303 | line_point_x, line_point_y = np.where(waypoint_dis != 0) 304 | line_point = np.concatenate([line_point_x[np.newaxis, :], line_point_y[np.newaxis, :]], axis=0) 305 | line_point = np.repeat(line_point[np.newaxis, :, :], 100, axis=0) 306 | line_point = np.repeat(line_point[np.newaxis, :, :, :], 100, axis=0) 307 | 308 | x, y = np.linspace(0, 99, 100), np.linspace(0, 99, 100) 309 | xv, yv = np.meshgrid(x, y) 310 | all_point = np.concatenate([yv[:, :, np.newaxis], xv[:, :, np.newaxis]], axis=2) 311 | all_point = np.repeat(all_point[:, :, :, np.newaxis], line_point.shape[-1], axis=3) 312 | 313 | dis_map = np.min(np.sqrt(np.sum((all_point - line_point)**2, axis=2)), axis=2) 314 | 315 | return dis_map 316 | 317 | 318 | @registry.register_sensor 319 | class SemanticFilterSensor(Sensor): 320 | def __init__(self, sim: Simulator, config: Config, *args: Any, **kwargs: Any): 321 | super().__init__(config=config) 322 | self.sim = sim 323 | self.prev_episode_id = None 324 | self.label_to_27 = np.array([ 325 | 0, 15, 17, 1, 2, 3, 18, 19, 4, 15, 5, 6, 16, 20, 7, 8, 17, 326 | 17, 9, 21, 22, 16, 10, 11, 15, 12, 13, 23, 16, 16, 16, 16, 327 | 16, 24, 25, 16, 16, 14, 26, 16, 16, 328 | ]) 329 | 330 | def _get_uuid(self, *args: Any, **kwargs: Any): 331 | return "semantic_filter" 332 | 333 | def _get_sensor_type(self, *args: Any, **kwargs: Any): 334 | return SensorTypes.TACTILE 335 | 336 | def _get_observation_space(self, *args: Any, **kwargs: Any): 337 | return spaces.Box( 338 | low=np.iinfo(np.uint32).min, 339 | high=np.iinfo(np.uint32).max, 340 | shape=(self.config.HEIGHT, self.config.WIDTH, self.config.CATEGORY), 341 | dtype=np.float, 342 | ) 343 | 344 | def get_observation(self, observations, episode, *args: Any, **kwargs: Any): 345 | semantic = observations['semantic'] 346 | 347 | if self.prev_episode_id != episode.episode_id: 348 | scene = self.sim.semantic_annotations() 349 | instance_id_to_label_id = 
{int(obj.id.split("_")[-1]): obj.category.index() for obj in scene.objects} 350 | self.mapping = np.array([instance_id_to_label_id[i] for i in range(len(instance_id_to_label_id))]) 351 | self.prev_episode_id = episode.episode_id 352 | 353 | semantic = np.take(self.mapping, semantic) 354 | semantic[semantic == -1] = 0 355 | semantic = np.take(self.label_to_27, semantic) 356 | h, w = semantic.shape 357 | semantic_filter = np.eye(27, dtype=np.float32)[semantic.reshape(-1)].reshape(h, w, 27) 358 | 359 | return semantic_filter 360 | 361 | 362 | @registry.register_sensor 363 | class GtSemanticMapSensor(Sensor): 364 | r"""Sensor for generating semantic map grounth truth 365 | """ 366 | def __init__(self, sim: Simulator, config: Config, *args: Any, **kwargs: Any): 367 | self._sim = sim 368 | self.gt_path = 'data/map_data/semantic/{}'.format(config.SPLIT) 369 | self.half_size = config.MAP_SIZE // 2 370 | self.prev_episode_id = None 371 | super().__init__() 372 | 373 | @staticmethod 374 | def _get_uuid(*args: Any, **kwargs: Any): 375 | return "gt_semantic_map" 376 | 377 | def _get_sensor_type(self, *args: Any, **kwargs: Any): 378 | return SensorTypes.TACTILE 379 | 380 | def _get_observation_space(self, *args: Any, **kwargs: Any): 381 | return spaces.Box(low=0.0, high=27.0, shape=(100, 100), dtype=np.long) 382 | 383 | def get_observation(self, observations, episode, *args: Any, **kwargs: Any): 384 | if self.prev_episode_id != episode.episode_id: 385 | self.init_agent_state = self._sim.get_agent_state() 386 | 387 | self.global_gt_semmap = np.load(os.path.join(self.gt_path, 'ep_'+str(episode.episode_id)+'.npy')) 388 | self.global_gt_semmap = torch.from_numpy(self.global_gt_semmap).unsqueeze(0).unsqueeze(0).float() 389 | 390 | rever_pose = torch.FloatTensor([0, 0, self._sim.record_heading]).unsqueeze(0) 391 | rot_mat, _ = get_grid(rever_pose, self.global_gt_semmap.size(), 'cpu') 392 | self.global_gt_semmap = F.grid_sample(self.global_gt_semmap, rot_mat, mode='nearest') 393 | 394 | agent_state = self._sim.get_agent_state() 395 | grid_y = (agent_state.position[0] - self.init_agent_state.position[0]) / 0.12 + 240 396 | grid_x = (agent_state.position[2] - self.init_agent_state.position[2]) / 0.12 + 240 397 | st_pose = torch.FloatTensor([ 398 | (grid_y - (480//2)) / (480//2), 399 | (grid_x - (480//2)) / (480//2), 400 | - self._sim.record_heading, 401 | ]).unsqueeze(0) 402 | 403 | rot_mat, tra_mat = get_grid(st_pose, self.global_gt_semmap.size(), 'cpu') 404 | transed_map = F.grid_sample(self.global_gt_semmap, tra_mat, mode='nearest') 405 | rotated_map = F.grid_sample(transed_map, rot_mat, mode='nearest') 406 | rotated_map = F.pad(rotated_map, (self.half_size, self.half_size, self.half_size, self.half_size), 'constant', 0) 407 | 408 | self.prev_episode_id = episode.episode_id 409 | 410 | return rotated_map.squeeze()[289-self.half_size: 289+self.half_size, 289-self.half_size: 289+self.half_size].long() 411 | 412 | @registry.register_sensor 413 | class HeadingSensor(Sensor): 414 | r"""Sensor for observing the agent's heading in the global coordinate 415 | frame. 416 | Args: 417 | sim: reference to the simulator for calculating task observations. 418 | config: config for the sensor. 
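    Note: the heading is returned in radians in [-pi, pi], and the latest value is
    also cached on the simulator as record_heading, which GtSemanticMapSensor uses
    to re-orient the ground-truth semantic map.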
419 | """ 420 | 421 | def __init__( 422 | self, sim: Simulator, config: Config, *args: Any, **kwargs: Any 423 | ): 424 | self._sim = sim 425 | super().__init__(config=config) 426 | 427 | def _get_uuid(self, *args: Any, **kwargs: Any): 428 | return "heading" 429 | 430 | def _get_sensor_type(self, *args: Any, **kwargs: Any): 431 | return SensorTypes.HEADING 432 | 433 | def _get_observation_space(self, *args: Any, **kwargs: Any): 434 | return spaces.Box(low=-np.pi, high=np.pi, shape=(1,), dtype=np.float) 435 | 436 | def _quat_to_xy_heading(self, quat): 437 | direction_vector = np.array([0, 0, -1]) 438 | heading_vector = quaternion_rotate_vector(quat, direction_vector) 439 | phi = cartesian_to_polar(-heading_vector[2], heading_vector[0])[1] 440 | return np.array([phi], dtype=np.float32) 441 | 442 | def get_observation( 443 | self, observations, episode, *args: Any, **kwargs: Any 444 | ): 445 | agent_state = self._sim.get_agent_state() 446 | rotation_world_agent = agent_state.rotation 447 | 448 | heading = self._quat_to_xy_heading(rotation_world_agent.inverse()) 449 | self._sim.record_heading = heading 450 | 451 | return heading 452 | -------------------------------------------------------------------------------- /vlnce_baselines/common_trainer.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import json 3 | import os 4 | import time 5 | import tqdm 6 | import math 7 | import datetime 8 | import numpy as np 9 | from collections import defaultdict 10 | from typing import Dict, Optional 11 | 12 | import torch 13 | import torch.distributed as dist 14 | from torch.nn.parallel import DistributedDataParallel as DDP 15 | 16 | from habitat import Config, logger 17 | from habitat.utils.visualizations.utils import append_text_to_image 18 | from habitat_baselines.common.base_trainer import BaseRLTrainer 19 | from habitat_baselines.common.environments import get_env_class 20 | from habitat_baselines.common.tensorboard_utils import TensorboardWriter 21 | from habitat_baselines.common.utils import batch_obs, generate_video, poll_checkpoint_folder 22 | from habitat_extensions.utils import observations_to_image 23 | 24 | from vlnce_baselines.models.policy import BasePolicy 25 | from vlnce_baselines.common.env_utils import construct_envs_auto_reset_false 26 | from vlnce_baselines.common.utils import transform_obs 27 | 28 | 29 | class CommonTrainer(BaseRLTrainer): 30 | def __init__(self, config=None): 31 | super().__init__(config) 32 | self.actor_critic = None 33 | self.envs = None 34 | 35 | self.local_rank = int(os.environ["LOCAL_RANK"]) 36 | self.world_size = int(os.environ["WORLD_SIZE"]) 37 | torch.cuda.set_device(self.local_rank) 38 | dist.init_process_group(backend="nccl", timeout=datetime.timedelta(seconds=18000)) 39 | self.device = ( 40 | torch.device("cuda", self.local_rank) 41 | if torch.cuda.is_available() 42 | else torch.device("cpu") 43 | ) 44 | print(f"[init] == local rank: {self.local_rank}") 45 | 46 | def _setup_actor_critic( 47 | self, config: Config, load_from_ckpt: bool, ckpt_path: str 48 | ) -> None: 49 | """Sets up actor critic and agent. 
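        Note: the policy is wrapped in DistributedDataParallel on the local rank;
        when load_from_ckpt is set, checkpoint keys are re-prefixed with 'module.'
        and loaded non-strictly.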
50 | Args: 51 | config: config 52 | Returns: 53 | None 54 | """ 55 | self.actor_critic = BasePolicy( 56 | observation_space=self.envs.observation_spaces[0], 57 | action_space=self.envs.action_spaces[0], 58 | model_config=config.MODEL, 59 | ) 60 | self.actor_critic.to(self.device) 61 | self.actor_critic = DDP( 62 | self.actor_critic, 63 | device_ids=[self.local_rank], 64 | output_device=self.local_rank, 65 | find_unused_parameters=True 66 | ) 67 | self.optimizer = torch.optim.Adam( 68 | self.actor_critic.parameters(), lr=self.config.DAGGER.LR 69 | ) 70 | 71 | if load_from_ckpt: 72 | ckpt_dict = self.load_checkpoint(ckpt_path, map_location="cpu") 73 | ckpt_dict["state_dict"] = {'module.'+k: v for k, v in ckpt_dict["state_dict"].items()} 74 | msg = self.actor_critic.load_state_dict(ckpt_dict["state_dict"], strict=False) 75 | logger.warning(f'Missing keys: {msg.missing_keys}, Unexpected keys: {msg.unexpected_keys}') 76 | logger.info(f"Loaded weights from checkpoint: {ckpt_path}") 77 | logger.info("Finished setting up actor critic model.") 78 | 79 | if self.local_rank == 0: 80 | logger.info( 81 | "agent number of parameters: {}".format( 82 | sum(param.numel() for param in self.actor_critic.parameters()) 83 | ) 84 | ) 85 | logger.info( 86 | "agent number of trainable parameters: {}".format( 87 | sum(p.numel() for p in self.actor_critic.parameters() if p.requires_grad) 88 | ) 89 | ) 90 | 91 | def save_checkpoint(self, file_name, extra_state: Optional[Dict] = None) -> None: 92 | """Save checkpoint with specified name. 93 | Args: 94 | file_name: file name for checkpoint 95 | Returns: 96 | None 97 | """ 98 | checkpoint = { 99 | "state_dict": self.actor_critic.module.state_dict(), 100 | "config": self.config, 101 | } 102 | if extra_state is not None: 103 | checkpoint["extra_state"] = extra_state 104 | torch.save(checkpoint, os.path.join(self.config.CHECKPOINT_FOLDER, file_name)) 105 | 106 | def load_checkpoint(self, checkpoint_path, *args, **kwargs) -> Dict: 107 | """Load checkpoint of specified path as a dict. 
108 | Args: 109 | checkpoint_path: path of target checkpoint 110 | *args: additional positional args 111 | **kwargs: additional keyword args 112 | Returns: 113 | dict containing checkpoint info 114 | """ 115 | ckpt = torch.load(checkpoint_path, *args, **kwargs) 116 | return ckpt 117 | 118 | def resume_dagger(self): 119 | start_dagger_it = 0 120 | start_epoch_it = 0 121 | 122 | ckpt_file = None 123 | if self.config.RESUME_CKPT is not None: 124 | ckpt_file = self.config.RESUME_CKPT 125 | if len(os.listdir(self.config.CHECKPOINT_FOLDER)) != 0: 126 | dir_list = sorted(os.listdir(self.config.CHECKPOINT_FOLDER), key=lambda x: os.path.getmtime(os.path.join(self.config.CHECKPOINT_FOLDER, x))) 127 | ckpt_file = os.path.join(self.config.CHECKPOINT_FOLDER, dir_list[-1]) # load the last saved ckpt 128 | 129 | if ckpt_file is not None: 130 | previous_model = self.load_checkpoint(ckpt_file, map_location=torch.device('cpu')) 131 | msg = self.actor_critic.module.load_state_dict(previous_model["state_dict"], strict=False) 132 | logger.warning(f'Missing keys: {msg.missing_keys}, Unexpected keys: {msg.unexpected_keys}') 133 | logger.info("Loaded previous checkpoint:%s"%ckpt_file) 134 | start_dagger_it = previous_model['extra_state']['dagger_it'] 135 | start_epoch_it = (int(ckpt_file.split('/')[-1].split('.')[1]) + 1) % self.config.DAGGER.EPOCHS 136 | if start_epoch_it == 0: 137 | start_dagger_it += 1 138 | 139 | return start_dagger_it, start_epoch_it 140 | 141 | @staticmethod 142 | def _pause_envs( 143 | envs_to_pause, 144 | envs, 145 | recurrent_hidden_states, 146 | not_done_masks, 147 | prev_actions, 148 | batch, 149 | actions=None, 150 | prog=None, 151 | rgb_full_global_map=None, 152 | rgb_frames=None, 153 | ): 154 | # pausing self.envs with no new episode 155 | if len(envs_to_pause) > 0: 156 | state_index = list(range(envs.num_envs)) 157 | for idx in reversed(envs_to_pause): 158 | if rgb_frames is not None: 159 | rgb_frames.pop(idx) 160 | state_index.pop(idx) 161 | envs.pause_at(idx) 162 | 163 | # indexing along the batch dimensions 164 | recurrent_hidden_states = recurrent_hidden_states[:, state_index] 165 | not_done_masks = not_done_masks[state_index] 166 | prev_actions = prev_actions[state_index] 167 | if actions is not None: 168 | actions = actions[state_index] 169 | if prog is not None: 170 | prog = prog[state_index] 171 | if rgb_full_global_map is not None: 172 | rgb_full_global_map = rgb_full_global_map[state_index] 173 | 174 | for k, v in batch.items(): 175 | batch[k] = v[state_index] 176 | 177 | return ( 178 | envs, 179 | recurrent_hidden_states, 180 | not_done_masks, 181 | prev_actions, 182 | batch, 183 | actions, 184 | prog, 185 | rgb_full_global_map, 186 | rgb_frames, 187 | ) 188 | 189 | def eval(self) -> None: 190 | """Main method of trainer evaluation. 
Calls _eval_checkpoint(), which 191 | is implemented by the trainer class that inherits from BaseRLTrainer. 192 | """ 193 | if "tensorboard" in self.config.VIDEO_OPTION: 194 | assert ( 195 | len(self.config.TENSORBOARD_DIR) > 0 196 | ), "Must specify a tensorboard directory for video display" 197 | os.makedirs(self.config.TENSORBOARD_DIR, exist_ok=True) 198 | if "disk" in self.config.VIDEO_OPTION: 199 | assert ( 200 | len(self.config.VIDEO_DIR) > 0 201 | ), "Must specify a directory for storing videos on disk" 202 | 203 | with TensorboardWriter( 204 | '', flush_secs=self.flush_secs 205 | ) as writer: 206 | if os.path.isfile(self.config.EVAL_CKPT_PATH_DIR): 207 | # evaluate a single checkpoint 208 | self._eval_checkpoint(self.config.EVAL_CKPT_PATH_DIR, writer) 209 | else: 210 | # evaluate multiple checkpoints, starting from the most recent 211 | num_ckpt = len(os.listdir(self.config.EVAL_CKPT_PATH_DIR)) 212 | prev_ckpt_ind = num_ckpt - 2 213 | while True: 214 | current_ckpt = None 215 | while current_ckpt is None: 216 | current_ckpt = poll_checkpoint_folder( 217 | self.config.EVAL_CKPT_PATH_DIR, prev_ckpt_ind 218 | ) 219 | time.sleep(2) # sleep for 2 secs before polling again 220 | logger.info(f"=======current_ckpt: {current_ckpt}=======") 221 | self._eval_checkpoint( 222 | checkpoint_path=current_ckpt, 223 | writer=writer, 224 | checkpoint_index=prev_ckpt_ind + 1, 225 | ) 226 | prev_ckpt_ind -= 1 227 | 228 | def _eval_checkpoint( 229 | self, checkpoint_path: str, writer: TensorboardWriter, checkpoint_index: int = 0, training=False, training_step=0 230 | ) -> None: 231 | """Evaluates a single checkpoint. Assumes episode IDs are unique. 232 | Args: 233 | checkpoint_path: path of checkpoint 234 | writer: tensorboard writer object for logging to tensorboard 235 | checkpoint_index: index of the current checkpoint, used for logging 236 | Returns: 237 | None 238 | """ 239 | if training: 240 | checkpoint_path = '' 241 | 242 | finish_process = [] 243 | logger.info(f"checkpoint_path: {checkpoint_path}") 244 | 245 | if self.config.EVAL.USE_CKPT_CONFIG and not training: 246 | config = self._setup_eval_config( 247 | self.load_checkpoint(checkpoint_path, map_location="cpu")["config"] 248 | ) 249 | else: 250 | config = self.config.clone() 251 | 252 | config.defrost() 253 | config.TASK_CONFIG.DATASET.SPLIT = config.EVAL.SPLIT 254 | config.TASK_CONFIG.TASK.NDTW.SPLIT = config.EVAL.SPLIT 255 | config.TASK_CONFIG.TASK.SDTW.SPLIT = config.EVAL.SPLIT 256 | config.TASK_CONFIG.TASK.VLN_ORACLE_WAYPOINT_SENSOR.LAW.SPLIT = config.EVAL.SPLIT 257 | config.TASK_CONFIG.ENVIRONMENT.ITERATOR_OPTIONS.SHUFFLE = False 258 | config.TASK_CONFIG.ENVIRONMENT.ITERATOR_OPTIONS.MAX_SCENE_REPEAT_STEPS = -1 259 | config.TASK_CONFIG.DATASET.split_num = 1 260 | if config.MODEL.PREDICTION_MONITOR.use: 261 | config.TASK_CONFIG.TASK.SENSORS.remove('GT_SEMANTIC_MAP_SENSOR') 262 | config.NUM_PROCESSES = min(config.NUM_PROCESSES, 11) 263 | config.SIMULATOR_GPU_IDS = list(range(len(os.environ["CUDA_VISIBLE_DEVICES"].split(',')))) 264 | 265 | if training: 266 | self.actor_critic.module.net.rgb_mapping_module.full_global_map = torch.zeros([config.NUM_PROCESSES] + list(self.actor_critic.module.net.rgb_mapping_module.full_global_map.shape[1:]), device=self.device) 267 | self.actor_critic.module.net.rgb_mapping_module.agent_view = torch.zeros([config.NUM_PROCESSES] + list(self.actor_critic.module.net.rgb_mapping_module.agent_view.shape[1:]), device=self.device) 268 | 269 | if training: 270 | config.STOP_CONDITION.TYPE = 'prog' 271 |
config.TASK_CONFIG.DATASET.DATA_PATH = 'data/datasets/R2R_VLNCE_v1-2_preprocessed/val_unseen/val_unseen_min.json.gz' 272 | # config.TASK_CONFIG.DATASET.DATA_PATH = 'data/datasets/R2R_VLNCE_v1-2_preprocessed/val_unseen/val_unseen.json.gz' 273 | if len(config.VIDEO_OPTION) > 0 and not training: 274 | config.SENSORS.append('SEMANTIC_SENSOR') 275 | config.TASK_CONFIG.TASK.SENSORS.append('SEMANTIC_FILTER_SENSOR') 276 | config.TASK_CONFIG.TASK.MEASUREMENTS.append("TOP_DOWN_MAP") 277 | config.TASK_CONFIG.TASK.MEASUREMENTS.append("COLLISIONS") 278 | config.freeze() 279 | 280 | # setup agent 281 | if self.envs is not None: 282 | self.envs.close() 283 | self.envs = None 284 | self.envs = construct_envs_auto_reset_false( 285 | config, get_env_class(config.ENV_NAME) 286 | ) 287 | 288 | if not training: 289 | self._setup_actor_critic(config, not config.random_agent, checkpoint_path) 290 | 291 | observations = self.envs.reset() 292 | epidsode_reset_flag = True 293 | observations = transform_obs( 294 | observations, config.TASK_CONFIG.TASK.INSTRUCTION_SENSOR_UUID, self.device 295 | ) 296 | batch = batch_obs(observations, self.device) 297 | 298 | eval_recurrent_hidden_states = torch.zeros( 299 | self.actor_critic.module.net.num_recurrent_layers, 300 | config.NUM_PROCESSES, 301 | self.config.MODEL.STATE_ENCODER.hidden_size, 302 | device=self.device, 303 | ) 304 | prev_actions = torch.zeros( 305 | config.NUM_PROCESSES, 2, device=self.device 306 | ) 307 | not_done_masks = torch.zeros(config.NUM_PROCESSES, 1, device=self.device) 308 | 309 | stats_episodes = {} # dict of dicts that stores stats per episode 310 | 311 | count_step = 0 312 | 313 | rgb_frames = None 314 | if len(config.VIDEO_OPTION) > 0 and not training: 315 | os.makedirs(config.VIDEO_DIR, exist_ok=True) 316 | rgb_frames = [[] for _ in range(config.NUM_PROCESSES)] 317 | 318 | pbar = tqdm.tqdm(total=sum(self.envs.number_of_episodes), dynamic_ncols=True, desc="Eval_ckpt_{}".format(str(training_step))) 319 | self.actor_critic.eval() 320 | step = 0 321 | while ( 322 | self.envs.num_envs > 0 and len(stats_episodes) < config.EVAL.EPISODE_COUNT 323 | ): 324 | current_episodes = self.envs.current_episodes() 325 | 326 | with torch.no_grad(): 327 | if count_step % config.step_num == 0 and count_step >= 24: 328 | (_, actions, _, eval_recurrent_hidden_states) = self.actor_critic.module.act( 329 | batch, 330 | eval_recurrent_hidden_states, 331 | prev_actions, 332 | not_done_masks, 333 | deterministic=True, 334 | ) 335 | else: 336 | self.actor_critic.module.update_map(batch, not_done_masks) 337 | if count_step < 24: 338 | actions = batch['waypoint'][:, :2] 339 | prev_actions.copy_(actions) 340 | 341 | step_inputs = [ 342 | { 343 | 'action': actions[e].cpu(), 344 | 'prog': self.actor_critic.module.prog[e].cpu().item() if count_step >= 24 else -1, 345 | 'epidsode_reset_flag': epidsode_reset_flag , 346 | 'depth_img': observations[e]['depth'], 347 | } 348 | for e in range(self.envs.num_envs) 349 | ] 350 | outputs = self.envs.step(step_inputs) 351 | epidsode_reset_flag = False 352 | step += 1 353 | observations, _, dones, infos = [list(x) for x in zip(*outputs)] 354 | if len(config.VIDEO_OPTION) > 0 and not training: 355 | for i in range(self.envs.num_envs): 356 | observations[i]['ego_map_vis'] = batch['ego_map'][i].cpu().numpy() 357 | 358 | count_step += 1 359 | 360 | not_done_masks = torch.tensor( 361 | [[0.0] if done else [1.0] for done in dones], 362 | dtype=torch.float, 363 | device=self.device, 364 | ) 365 | 366 | # reset envs and observations if necessary 367 
| for i in range(self.envs.num_envs): 368 | if len(config.VIDEO_OPTION) > 0 and len(os.listdir(config.VIDEO_DIR)) < config.VIDEO_NUM and not training: 369 | att_map = self.actor_critic.module.net.att_map_t_m[i] if count_step-1 >= 24 else torch.zeros(24*24) 370 | frame = observations_to_image(observations[i], infos[i], step_inputs[i], att_map) 371 | frame = append_text_to_image( 372 | frame, current_episodes[i].instruction.instruction_text 373 | ) 374 | rgb_frames[i].append(frame) 375 | 376 | if not dones[i]: 377 | continue 378 | 379 | pbar.update() 380 | stats_episodes[current_episodes[i].episode_id] = infos[i] 381 | prev_actions[i] = torch.zeros(2) 382 | 383 | finish_process.append(i) 384 | if len(config.VIDEO_OPTION) > 0 and len(os.listdir(config.VIDEO_DIR)) < config.VIDEO_NUM and not training \ 385 | and finish_process.count(i) // 3 <= math.ceil(config.VIDEO_NUM / config.NUM_PROCESSES) and finish_process.count(i) % 3 == 1: 386 | generate_video( 387 | video_option=config.VIDEO_OPTION, 388 | video_dir=config.VIDEO_DIR, 389 | images=rgb_frames[i], 390 | episode_id=current_episodes[i].episode_id, 391 | checkpoint_idx=checkpoint_index, 392 | metrics={ 393 | "spl": stats_episodes[current_episodes[i].episode_id]["spl"] 394 | }, 395 | tb_writer=writer, 396 | ) 397 | 398 | if len(config.VIDEO_OPTION) > 0: 399 | del stats_episodes[current_episodes[i].episode_id]["top_down_map"] 400 | del stats_episodes[current_episodes[i].episode_id]["collisions"] 401 | rgb_frames[i] = [] 402 | 403 | if not training: 404 | aggregated_stats = {} 405 | num_episodes = len(stats_episodes) 406 | for stat_key in next(iter(stats_episodes.values())).keys(): 407 | aggregated_stats[stat_key] = ( 408 | sum([v[stat_key] for v in stats_episodes.values()]) / num_episodes 409 | ) 410 | logger.info(aggregated_stats) 411 | 412 | if np.array(dones).all(): 413 | self.envs.resume_all() 414 | observations = self.envs.reset() 415 | epidsode_reset_flag = True 416 | count_step = 0 417 | eval_recurrent_hidden_states = torch.zeros( 418 | self.actor_critic.module.net.num_recurrent_layers, 419 | config.NUM_PROCESSES, 420 | self.config.MODEL.STATE_ENCODER.hidden_size, 421 | device=self.device, 422 | ) 423 | prev_actions = torch.zeros( 424 | config.NUM_PROCESSES, 2, device=self.device 425 | ) 426 | not_done_masks = torch.zeros(config.NUM_PROCESSES, 1, device=self.device) 427 | self.actor_critic.module.prog = torch.zeros(config.NUM_PROCESSES, 1, device=self.device) 428 | if self.actor_critic.module.net.rgb_mapping_module is not None: 429 | self.actor_critic.module.net.rgb_mapping_module.full_global_map = torch.zeros( 430 | config.NUM_PROCESSES, 431 | config.MODEL.RGBMAPPING.global_map_size, 432 | config.MODEL.RGBMAPPING.global_map_size, 433 | config.MODEL.RGBMAPPING.map_depth, 434 | device=self.device 435 | ) 436 | if len(config.VIDEO_OPTION) > 0: 437 | rgb_frames = [[] for _ in range(config.NUM_PROCESSES)] 438 | 439 | observations = transform_obs( 440 | observations, config.TASK_CONFIG.TASK.INSTRUCTION_SENSOR_UUID, self.device 441 | ) 442 | batch = batch_obs(observations, self.device) 443 | 444 | if np.array(dones).all(): 445 | actions = batch['waypoint'][:, :2] 446 | 447 | envs_to_pause = [] 448 | next_episodes = self.envs.current_episodes() 449 | 450 | for i in range(self.envs.num_envs): 451 | if next_episodes[i].episode_id in stats_episodes: 452 | envs_to_pause.append(i) 453 | 454 | ( 455 | self.envs, 456 | eval_recurrent_hidden_states, 457 | not_done_masks, 458 | prev_actions, 459 | batch, 460 | actions, 461 | 
self.actor_critic.module.prog, 462 | rgb_full_global_map, 463 | rgb_frames, 464 | ) = self._pause_envs( 465 | envs_to_pause, 466 | self.envs, 467 | eval_recurrent_hidden_states, 468 | not_done_masks, 469 | prev_actions, 470 | batch, 471 | actions, 472 | self.actor_critic.module.prog if count_step >= 24 else None, 473 | self.actor_critic.module.net.rgb_mapping_module.full_global_map, 474 | rgb_frames, 475 | ) 476 | self.actor_critic.module.net.rgb_mapping_module.full_global_map = rgb_full_global_map 477 | 478 | self.envs.close() 479 | self.envs = None 480 | 481 | aggregated_stats = {} 482 | num_episodes = len(stats_episodes) 483 | for stat_key in next(iter(stats_episodes.values())).keys(): 484 | aggregated_stats[stat_key] = ( 485 | sum([v[stat_key] for v in stats_episodes.values()]) / num_episodes 486 | ) 487 | 488 | if not training: 489 | split = config.TASK_CONFIG.DATASET.SPLIT 490 | os.makedirs(config.METRIC_DIR, exist_ok=True) 491 | with open(os.path.join(config.METRIC_DIR, f"stats_ckpt_{checkpoint_index}_{split}.json"), "w") as f: 492 | json.dump(aggregated_stats, f, indent=4) 493 | with open(os.path.join(config.METRIC_DIR, f"each_stat_ckpt_{checkpoint_index}_{split}.json"), "w") as f: 494 | json.dump(stats_episodes, f) 495 | 496 | if not training: 497 | logger.info(f"Episodes evaluated: {num_episodes}") 498 | checkpoint_num = checkpoint_index + 1 499 | for k, v in aggregated_stats.items(): 500 | logger.info(f"Average episode {k}: {v:.6f}") 501 | writer.add_scalar(f"eval_{split}_{k}", v, checkpoint_num) 502 | else: 503 | for k, v in aggregated_stats.items(): 504 | logger.info(f"Eval while training average episode {k}: {v:.6f}") 505 | writer.add_scalar(f"eval_while_training_{k}", v, training_step) 506 | writer.flush() 507 | 508 | def empty_cuda_cache(self): 509 | if torch.cuda.is_available(): 510 | with torch.cuda.device(self.device): 511 | torch.cuda.empty_cache() 512 | gc.collect() 513 | 514 | def change_data_type(self, traj_obs): 515 | for k, v in traj_obs.items(): 516 | traj_obs[k] = v.numpy() 517 | if k == 'vln_oracle_action_sensor': 518 | traj_obs[k] = traj_obs[k].astype(np.uint8) 519 | elif k == 'rgb_ego_map': 520 | traj_obs[k] = traj_obs[k].astype(np.float16) 521 | elif k == 'gt_path': 522 | traj_obs[k] = traj_obs[k].astype(np.float16) 523 | elif k == 'rgb': 524 | traj_obs[k] = traj_obs[k].astype(np.uint8) 525 | elif k == 'depth': 526 | traj_obs[k] = traj_obs[k].astype(np.float16) 527 | elif k == 'rgb_features': 528 | traj_obs[k] = traj_obs[k].astype(np.float16) 529 | elif k == 'depth_features': 530 | traj_obs[k] = traj_obs[k].astype(np.float16) 531 | elif k == 'gt_semantic_map': 532 | traj_obs[k] = traj_obs[k].astype(np.int64)  # np.int was removed in NumPy 1.24+; use a fixed-width dtype 533 | 534 | def inference(self) -> None: 535 | pass 536 | --------------------------------------------------------------------------------
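Note on the checkpoint format: save_checkpoint() above writes a plain dict with "state_dict", "config", and optionally "extra_state", so a saved file can be inspected outside the trainer. The snippet below is a minimal sketch and not part of the repository; the file name ckpt.5.pth and its location are assumptions, chosen only to match CHECKPOINT_FOLDER from the config above and the naming that resume_dagger() parses (it reads the epoch index from the second dot-separated field of the file name).

import torch

# Hypothetical checkpoint path; point this at an actual file in CHECKPOINT_FOLDER.
ckpt = torch.load("data/checkpoints/cma_aug/ckpt.5.pth", map_location="cpu")
print(sorted(ckpt.keys()))               # e.g. ['config', 'extra_state', 'state_dict']
print(ckpt["extra_state"]["dagger_it"])  # DAgger iteration, present only if extra_state was saved
policy_state = ckpt["state_dict"]        # parameters of actor_critic.module saved by the trainer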