├── .gitignore
├── README.md
├── assets
├── framework.png
├── teaser.png
└── valuemap.png
├── habitat_extensions
├── __init__.py
├── config
│ ├── __init__.py
│ ├── default.py
│ └── zs_vlnce_task.yaml
├── habitat_simulator.py
├── maps.py
├── measures.py
├── pose_utils.py
├── sensors.py
└── task.py
├── requirements.txt
├── requirements2.txt
├── run_mp.py
├── run_r2r
└── main.sh
└── vlnce_baselines
├── ZS_Evaluator_mp.py
├── __init__.py
├── common
├── constraints.py
├── env_utils.py
├── environments.py
├── instruction_tools.py
└── utils.py
├── config
├── __init__.py
├── default.py
└── exp1.yaml
├── map
├── RepViTSAM
│ ├── repvit.py
│ └── setup_repvit_sam.py
├── direction_map.py
├── history_map.py
├── mapping.py
├── semantic_prediction.py
└── value_map.py
├── models
├── Policy.py
├── __init__.py
├── fmm_planner.py
├── frontier_policy.py
├── frontier_waypoint_selector.py
├── super_pixel_policy.py
├── superpixel_waypoint_selector.py
└── vanilla_waypoint_selector.py
└── utils
├── acyclic_enforcer.py
├── constant.py
├── data_utils.py
├── depth_utils.py
├── map_utils.py
├── pose.py
├── rotation_utils.py
└── visualization.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .ftpignore
2 | .ftpconfig
3 | .vscode
4 |
5 | # Byte-compiled / optimized / DLL files
6 | .ipynb_checkpoints/
7 | __pycache__/
8 | *.py[cod]
9 | *$py.class
10 | *.swp
11 | *.zip
12 |
13 | /data
14 | /tests
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Constraint-Aware Zero-Shot Vision-Language Navigation in Continuous Environments
2 |
3 | ![teaser](assets/teaser.png)
4 |
25 | We address the task of Vision-Language Navigation in Continuous Environments (VLN-CE) under the zero-shot setting. Zero-shot VLN-CE is particularly challenging due to the absence of expert demonstrations for training and minimal environmental structural priors to guide navigation. To tackle these challenges, we propose a Constraint-Aware Navigator (CA-Nav), which reframes zero-shot VLN-CE as a sequential, constraint-aware sub-instruction completion process. CA-Nav continuously translates sub-instructions into navigation plans using two core modules: the Constraint-Aware Sub-instruction Manager (CSM) and the Constraint-Aware Value Mapper (CVM). CSM defines the completion criteria of decomposed sub-instructions as constraints and tracks navigation progress by switching sub-instructions in a constraint-aware manner. CVM, guided by CSM’s constraints, generates a value map on the fly and refines it using superpixel clustering to improve navigation stability. CA-Nav achieves state-of-the-art performance on two VLN-CE benchmarks, surpassing the previous best method by 12% and 13% in Success Rate on the validation unseen splits of R2R-CE and RxR-CE, respectively. Moreover, CA-Nav demonstrates its effectiveness in real-world robot deployments across various indoor scenes and instructions.
26 |
27 | ![framework](assets/framework.png)
28 |
33 | # Setup
34 |
35 | ## Installation
36 |
37 | 1. Create a virtual environment. We develop this project with Python 3.8:
38 |
39 | ```bash
40 | conda create -n CA-Nav python==3.8
41 | conda activate CA-Nav
42 | ```
43 |
44 | 2. Install `habitat-sim-v0.1.7` for a machine with multiple GPUs or without an attached display (e.g., a cluster):
45 |
46 | ```bash
47 | git clone https://github.com/facebookresearch/habitat-sim.git
48 | cd habitat-sim
49 | git checkout tags/v0.1.7
50 | pip install -r requirements.txt
51 | python setup.py install --headless
52 | ```
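
Optionally, you can smoke-test the headless build before moving on (this check is not part of the original instructions; it just verifies that the module imports):

```bash
python -c "import habitat_sim; print('habitat_sim imported successfully')"
```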
53 |
54 | 3. Install `habitat-lab-v0.1.7`:
55 |
56 | ```bash
57 | git clone https://github.com/facebookresearch/habitat-lab.git
58 | cd habitat-lab
59 | git checkout tags/v0.1.7
60 | cd habitat_baselines/rl
61 | vi requirements.txt # delete tensorflow==1.13.1
62 | cd ../../ # (return to the habitat-lab directory)
63 |
64 | pip install torch==1.10.0+cu111 torchvision==0.11.0+cu111 torchaudio==0.10.0 -f https://download.pytorch.org/whl/torch_stable.html
65 |
66 | pip install -r requirements.txt
67 | python setup.py develop --all # install habitat and habitat_baselines; if it fails, try again (failures are usually due to network problems)
68 | ```
69 |
70 | If you run into problems installing Habitat, please follow the [Official Habitat Installation Guide](https://github.com/facebookresearch/habitat-lab#installation) to install [`habitat-lab`](https://github.com/facebookresearch/habitat-lab) and [`habitat-sim`](https://github.com/facebookresearch/habitat-sim). We use version [`v0.1.7`](https://github.com/facebookresearch/habitat-lab/releases/tag/v0.1.7) in our experiments, the same as VLN-CE; please refer to the [VLN-CE](https://github.com/jacobkrantz/VLN-CE) page for more details.
71 |
72 | 4. Install Grounded-SAM and refine its `phrases2classes` function
73 |
74 | ```bash
75 | git clone https://github.com/IDEA-Research/GroundingDINO.git
76 | cd GroundingDINO
77 | git checkout -q 57535c5a79791cb76e36fdb64975271354f10251
78 | pip install -q -e .
79 | pip install 'git+https://github.com/facebookresearch/segment-anything.git'
80 | ```
81 |
82 | ATTENTION: We found that optimizing the phrase-to-class mapping logic in Grounded-SAM using minimum edit distance leads to more stable prediction outputs.
83 |
84 | ```bash
85 | pip install nltk
86 | vi GroundingDINO/groundingdino/util/inference.py
87 | ```
88 |
89 | Find and comment out the original `phrases2classes` function around Line 235, then add the refined version:
90 | ```python
91 | # @staticmethod
92 | # def phrases2classes(phrases: List[str], classes: List[str]) -> np.ndarray:
93 | # class_ids = []
94 | # for phrase in phrases:
95 | # try:
96 | # class_ids.append(classes.index(phrase))
97 | # except ValueError:
98 | # class_ids.append(None)
99 | # return np.array(class_ids)
100 |
101 | from nltk.metrics import edit_distance
102 | @staticmethod
103 | def phrases2classes(phrases: List[str], classes: List[str]) -> np.ndarray:
104 |     class_ids = []
105 |     for phrase in phrases:
106 |         if phrase in classes:
107 |             class_ids.append(classes.index(phrase))
108 |         else:
109 |             distances = np.array([edit_distance(phrase, cls) for cls in classes])  # fall back to the closest class by edit distance
110 |             idx = np.argmin(distances)
111 |             class_ids.append(idx)
112 |     return np.array(class_ids)
113 | ```
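
To see what this changes, here is a small standalone sketch of the edit-distance fallback (the class names below are made up purely for illustration):

```python
import numpy as np
from nltk.metrics import edit_distance

classes = ["chair", "table", "sofa"]
phrase = "chairs"  # GroundingDINO may return a phrase that is not an exact class name

# exact match fails, so fall back to the class with the smallest edit distance
distances = np.array([edit_distance(phrase, c) for c in classes])
print(classes[int(np.argmin(distances))])  # -> "chair"
```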
114 |
115 | 5. Install other requirements
116 |
117 | ```bash
118 | git clone https://github.com/Chenkehan21/CA-Nav-code.git
119 | cd CA-Nav-code
120 | pip install -r requirements.txt
121 | pip install -r requirements2.txt
122 | ```
123 |
124 |
125 | ## Datasets
126 | 1. R2R-CE
127 | - Instructions:
128 | Download the `R2R_VLNCE_v1-3_preprocessed` instructions from [VLN-CE](https://github.com/jacobkrantz/VLN-CE) and place them under `data/datasets/R2R_VLNCE_v1-3_preprocessed/` (the path expected by `habitat_extensions/config/zs_vlnce_task.yaml`).
129 |
130 | - Scenes:
131 | Matterport3D (MP3D) scene reconstructions are used. The official Matterport3D download script (`download_mp.py`) can be accessed by following the instructions on their [project webpage](https://niessner.github.io/Matterport/). The scene data can then be downloaded:
132 |
133 | ```bash
134 | # requires running with python 2.7
135 | python download_mp.py --task habitat -o data/scene_datasets/mp3d/
136 | ```
137 |
138 | Extract such that it has the form `scene_datasets/mp3d/{scene}/{scene}.glb`. There should be 90 scenes. Place the `scene_datasets` folder in `data/`.
139 |
140 |
141 | 2. CA-Nav LLM Replies / BLIP2-ITM / BLIP2-VQA / Grounded-SAM
142 |
143 | Download from [CA-Nav-Google-Drive](https://drive.google.com/drive/folders/1fHUDDnK-gNNABrcb5u_F93mAQhu8tC8z?usp=sharing)
144 |
145 | Overall, the data is organized as follows:
146 |
147 | ```
148 | CA-Nav-code
149 | ├── data
150 | │ ├── blip2
151 | │ ├── datasets
152 | │ ├── LLM_REPLYS_VAL_UNSEEN
153 | │ ├── R2R_VLNCE_v1-3_preprocessed
154 | │ ├── grounded_sam
155 | │ ├── logs
156 | │ ├── scene_datasets
157 | │ └── vqa
158 | └── ...
159 | ```
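
In particular, you can sanity-check the MP3D scene data (there should be 90 scenes, each with a `{scene}/{scene}.glb` mesh; `17DRP5sb8fy` below is just one example scene ID):

```bash
ls data/scene_datasets/mp3d | wc -l                      # should print 90
ls data/scene_datasets/mp3d/17DRP5sb8fy/17DRP5sb8fy.glb  # one example scene mesh
```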
160 |
161 |
162 | ## Running
163 |
164 | ```bash
165 | cd CA-Nav-code
166 | sh run_r2r/main.sh
167 | ```
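
The evaluation runs with multiple worker processes; when they finish, `run_mp.py` merges the per-episode results (`stats_ep_ckpt_*.json`) into a single summary with the averaged metrics (success, SPL, nDTW, etc.), which you can locate and inspect, e.g.:

```bash
# run_mp.py writes the aggregated metrics to <CHECKPOINT_FOLDER><exp_name>/stats_ckpt_val_unseen.json,
# where CHECKPOINT_FOLDER comes from vlnce_baselines/config/exp1.yaml
find . -name "stats_ckpt_val_unseen.json"
```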
168 |
169 |
170 | # Contact Information
171 |
172 | * kehan.chen@cripac.ia.ac.cn, [Kehan Chen](https://github.com/Chenkehan21)
173 | * dong.an@mbzuai.ac.ae, [Dong An](https://marsaki.github.io/)
174 | * yhuang@nlpr.ia.ac.cn, [Yan Huang](https://yanrockhuang.github.io/)
175 |
176 | # Acknowledgements
177 |
178 | Our implementations are partially inspired by [SemExp](https://arxiv.org/abs/2007.00643) and [ETPNav](https://arxiv.org/abs/2304.03047v2).
179 | Thanks for their great work!
180 |
181 | # Citation
182 |
183 | If you find this repository useful, please consider citing our paper:
184 |
185 | ```
186 | @article{chen2024CANav,
187 | title={Constraint-Aware Zero-Shot Vision-Language Navigation in Continuous Environments},
188 | author={Kehan Chen and Dong An and Yan Huang and Rongtao Xu and Yifei Su and Yonggen Ling and Ian Reid and Liang Wang},
189 | year={2024},
190 | journal={arXiv preprint arXiv:2412.10137}
191 | }
192 | ```
--------------------------------------------------------------------------------
/assets/framework.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Chenkehan21/CA-Nav-code/23481246e9e54e9ea890cf81986397709cee8ac8/assets/framework.png
--------------------------------------------------------------------------------
/assets/teaser.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Chenkehan21/CA-Nav-code/23481246e9e54e9ea890cf81986397709cee8ac8/assets/teaser.png
--------------------------------------------------------------------------------
/assets/valuemap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Chenkehan21/CA-Nav-code/23481246e9e54e9ea890cf81986397709cee8ac8/assets/valuemap.png
--------------------------------------------------------------------------------
/habitat_extensions/__init__.py:
--------------------------------------------------------------------------------
1 | from habitat_extensions.task import VLNCEDatasetV1
2 | from habitat_extensions.habitat_simulator import Simulator
3 | from habitat_extensions import measures
4 | from habitat_extensions import sensors
--------------------------------------------------------------------------------
/habitat_extensions/config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Chenkehan21/CA-Nav-code/23481246e9e54e9ea890cf81986397709cee8ac8/habitat_extensions/config/__init__.py
--------------------------------------------------------------------------------
/habitat_extensions/config/default.py:
--------------------------------------------------------------------------------
1 | """
2 | Default environment, simulator, and agent configs
3 | """
4 | from typing import List, Optional, Union
5 |
6 | from habitat.config.default import Config as CN
7 | from habitat.config.default import get_config
8 |
9 |
10 | _C = get_config()
11 | _C.defrost()
12 |
13 | # -----------------------------------------------------------------------------
14 | # VLN SENSOR POSE SENSOR
15 | # -----------------------------------------------------------------------------
16 | _C.TASK.SENSOR_POSE_SENSOR = CN()
17 | _C.TASK.SENSOR_POSE_SENSOR.TYPE = "SensorPoseSensor"
18 |
19 | # -----------------------------------------------------------------------------
20 | # VLN LLM SENSOR
21 | # -----------------------------------------------------------------------------
22 | _C.TASK.LLM_SENSOR = CN()
23 | _C.TASK.LLM_SENSOR.TYPE = "LLMSensor"
24 |
25 | # -----------------------------------------------------------------------------
26 | # VLN POSITION SENSOR
27 | # -----------------------------------------------------------------------------
28 | _C.TASK.POSITION_SENSOR = CN()
29 | _C.TASK.POSITION_SENSOR.TYPE = "PositionSensor"
30 |
31 | # -----------------------------------------------------------------------------
32 | # VLN HEADING SENSOR
33 | # -----------------------------------------------------------------------------
34 | _C.TASK.HEADING_SENSOR = CN()
35 | _C.TASK.HEADING_SENSOR.TYPE = "HeadingSensor"
36 |
37 | # ----------------------------------------------------------------------------
38 | # NDTW MEASUREMENT
39 | # ----------------------------------------------------------------------------
40 | _C.TASK.NDTW = CN()
41 | _C.TASK.NDTW.TYPE = "NDTW"
42 | _C.TASK.NDTW.SPLIT = "val_unseen"
43 | _C.TASK.NDTW.FDTW = True # False: DTW
44 | _C.TASK.NDTW.GT_PATH = (
45 | "data/datasets/R2R_VLNCE_v1-2_preprocessed/{split}/{split}_gt.json"
46 | )
47 | _C.TASK.NDTW.SUCCESS_DISTANCE = 3.0
48 |
49 | # ----------------------------------------------------------------------------
50 | # SDTW MEASUREMENT
51 | # ----------------------------------------------------------------------------
52 | _C.TASK.SDTW = CN()
53 | _C.TASK.SDTW.TYPE = "SDTW"
54 |
55 | # ----------------------------------------------------------------------------
56 | # PATH_LENGTH MEASUREMENT
57 | # ----------------------------------------------------------------------------
58 | _C.TASK.PATH_LENGTH = CN()
59 | _C.TASK.PATH_LENGTH.TYPE = "PathLength"
60 |
61 | # -----------------------------------------------------------------------------
62 | # ORACLE_NAVIGATION_ERROR MEASUREMENT
63 | # -----------------------------------------------------------------------------
64 | _C.TASK.ORACLE_NAVIGATION_ERROR = CN()
65 | _C.TASK.ORACLE_NAVIGATION_ERROR.TYPE = "OracleNavigationError"
66 |
67 | # -----------------------------------------------------------------------------
68 | # ORACLE_SUCCESS MEASUREMENT
69 | # -----------------------------------------------------------------------------
70 | _C.TASK.ORACLE_SUCCESS = CN()
71 | _C.TASK.ORACLE_SUCCESS.TYPE = "OracleSuccess"
72 | _C.TASK.ORACLE_SUCCESS.SUCCESS_DISTANCE = 3.0
73 |
74 | # -----------------------------------------------------------------------------
75 | # ORACLE_SPL MEASUREMENT
76 | # -----------------------------------------------------------------------------
77 | _C.TASK.ORACLE_SPL = CN()
78 | _C.TASK.ORACLE_SPL.TYPE = "OracleSPL"
79 | _C.TASK.ORACLE_SPL.SUCCESS_DISTANCE = 3.0
80 |
81 | # -----------------------------------------------------------------------------
82 | # STEPS_TAKEN MEASUREMENT
83 | # -----------------------------------------------------------------------------
84 | _C.TASK.STEPS_TAKEN = CN()
85 | _C.TASK.STEPS_TAKEN.TYPE = "StepsTaken"
86 |
87 | # ----------------------------------------------------------------------------
88 | # POSITION MEASUREMENT For faster eval
89 | # ----------------------------------------------------------------------------
90 | _C.TASK.POSITION = CN()
91 | _C.TASK.POSITION.TYPE = 'Position'
92 |
93 | _C.DATASET.split_num = 0
94 | _C.DATASET.split_rank = 0
95 |
96 | # -----------------------------------------------------------------------------
97 | # TOP_DOWN_MAP_VLNCE MEASUREMENT
98 | # -----------------------------------------------------------------------------
99 | _C.TASK.TOP_DOWN_MAP_VLNCE = CN()
100 | _C.TASK.TOP_DOWN_MAP_VLNCE.TYPE = "TopDownMapVLNCE"
101 | _C.TASK.TOP_DOWN_MAP_VLNCE.MAX_EPISODE_STEPS = _C.ENVIRONMENT.MAX_EPISODE_STEPS
102 | _C.TASK.TOP_DOWN_MAP_VLNCE.MAP_RESOLUTION = 512
103 | _C.TASK.TOP_DOWN_MAP_VLNCE.DRAW_SOURCE_AND_TARGET = True
104 | _C.TASK.TOP_DOWN_MAP_VLNCE.DRAW_BORDER = False
105 | _C.TASK.TOP_DOWN_MAP_VLNCE.DRAW_SHORTEST_PATH = False
106 | _C.TASK.TOP_DOWN_MAP_VLNCE.DRAW_REFERENCE_PATH = False
107 | _C.TASK.TOP_DOWN_MAP_VLNCE.DRAW_FIXED_WAYPOINTS = False
108 | _C.TASK.TOP_DOWN_MAP_VLNCE.DRAW_MP3D_AGENT_PATH = False
109 | _C.TASK.TOP_DOWN_MAP_VLNCE.GRAPHS_FILE = "data/connectivity_graphs.pkl"
110 | _C.TASK.TOP_DOWN_MAP_VLNCE.FOG_OF_WAR = CN()
111 | _C.TASK.TOP_DOWN_MAP_VLNCE.FOG_OF_WAR.DRAW = False
112 | _C.TASK.TOP_DOWN_MAP_VLNCE.FOG_OF_WAR.FOV = 79
113 | _C.TASK.TOP_DOWN_MAP_VLNCE.FOG_OF_WAR.VISIBILITY_DIST = 5.0
114 |
115 | # ----------------------------------------------------------------------------
116 | # DATASET EXTENSIONS
117 | # ----------------------------------------------------------------------------
118 | _C.DATASET.EPISODES_ALLOWED = [701]
119 | # _C.DATASET.EPISODES_ALLOWED = [389]
120 |
121 |
122 | def get_extended_config(
123 | config_paths: Optional[Union[List[str], str]] = None,
124 | opts: Optional[list] = None,
125 | ) -> CN:
126 | r"""Create a unified config with default values overwritten by values from
127 | :p:`config_paths` and overwritten by options from :p:`opts`.
128 |
129 | :param config_paths: List of config paths or string that contains comma
130 | separated list of config paths.
131 | :param opts: Config options (keys, values) in a list (e.g., passed from
132 | command line into the config. For example,
133 | :py:`opts = ['FOO.BAR', 0.5]`. Argument can be used for parameter
134 | sweeping or quick tests.
135 | """
136 | config = _C.clone()
137 |
138 | if config_paths:
139 | if isinstance(config_paths, str):
140 | config_paths = [config_paths]
141 |
142 | for config_path in config_paths:
143 | config.merge_from_file(config_path)
144 |
145 | if opts:
146 | config.merge_from_list(opts)
147 | config.freeze()
148 | return config
--------------------------------------------------------------------------------
/habitat_extensions/config/zs_vlnce_task.yaml:
--------------------------------------------------------------------------------
1 | SEED: 0
2 | ENVIRONMENT:
3 | MAX_EPISODE_STEPS: 250
4 | SIMULATOR:
5 | ACTION_SPACE_CONFIG: v1
6 | AGENT_0:
7 | SENSORS: [RGB_SENSOR, DEPTH_SENSOR]
8 | HEIGHT: 0.88
9 | RADIUS: 0.1
10 | FORWARD_STEP_SIZE: 0.25
11 | TURN_ANGLE: 30
12 | HABITAT_SIM_V0:
13 | GPU_DEVICE_ID: 0
14 | ALLOW_SLIDING: True
15 | RGB_SENSOR:
16 | WIDTH: 640
17 | HEIGHT: 480
18 | HFOV: 79
19 | POSITION: [0, 0.88, 0]
20 | TYPE: HabitatSimRGBSensor
21 | DEPTH_SENSOR:
22 | WIDTH: 640
23 | HEIGHT: 480
24 | HFOV: 79
25 | MIN_DEPTH: 0.5
26 | MAX_DEPTH: 5.0
27 | POSITION: [0, 0.88, 0]
28 | TYPE:
29 | Sim-v1
30 | TASK:
31 | TYPE: VLN-v0
32 | SUCCESS_DISTANCE: 3.0
33 | SENSORS: [
34 | INSTRUCTION_SENSOR,
35 | SENSOR_POSE_SENSOR,
36 | LLM_SENSOR,
37 | POSITION_SENSOR,
38 | HEADING_SENSOR
39 | ]
40 | INSTRUCTION_SENSOR_UUID: instruction
41 | POSSIBLE_ACTIONS: [STOP, MOVE_FORWARD, TURN_LEFT, TURN_RIGHT, LOOK_UP, LOOK_DOWN]
42 | MEASUREMENTS: [
43 | DISTANCE_TO_GOAL,
44 | SUCCESS,
45 | SPL,
46 | NDTW,
47 | PATH_LENGTH,
48 | ORACLE_SUCCESS,
49 | STEPS_TAKEN,
50 | POSITION
51 | ]
52 | SUCCESS:
53 | SUCCESS_DISTANCE: 3.0
54 | SPL:
55 | SUCCESS_DISTANCE: 3.0
56 | NDTW:
57 | SUCCESS_DISTANCE: 3.0
58 | GT_PATH: data/datasets/R2R_VLNCE_v1-3_preprocessed/{split}/{split}_gt.json.gz
59 | SDTW:
60 | SUCCESS_DISTANCE: 3.0
61 | GT_PATH: data/datasets/R2R_VLNCE_v1-3_preprocessed/{split}/{split}_gt.json.gz
62 | ORACLE_SUCCESS:
63 | SUCCESS_DISTANCE: 3.0
64 | DATASET:
65 | TYPE: VLN-CE-v1
66 | SPLIT: val_unseen
67 | DATA_PATH: data/datasets/R2R_VLNCE_v1-3_preprocessed/{split}/{split}.json.gz
68 | SCENES_DIR: data/scene_datasets/
69 | LLM_REPLYS_PATH: data/datasets/LLM_REPLYS_VAL_UNSEEN/llm_reply_valunseen1839.json
--------------------------------------------------------------------------------
/habitat_extensions/habitat_simulator.py:
--------------------------------------------------------------------------------
1 | """
2 | Implement a VLN Simulator ("Sim-v1") that can step without returning observations
3 | in order to save space and time.
4 | """
5 |
6 | #!/usr/bin/env python3
7 |
8 | # Copyright (c) Facebook, Inc. and its affiliates.
9 | # This source code is licensed under the MIT license found in the
10 | # LICENSE file in the root directory of this source tree.
11 |
12 | from typing import (
13 | TYPE_CHECKING,
14 | Any,
15 | Dict,
16 | List,
17 | Optional,
18 | Sequence,
19 | Set,
20 | Union,
21 | cast,
22 | )
23 |
24 | import numpy as np
25 | from gym import spaces
26 | from gym.spaces.box import Box
27 | from numpy import ndarray
28 |
29 | if TYPE_CHECKING:
30 | from torch import Tensor
31 |
32 | import habitat_sim
33 |
34 | from habitat_sim.simulator import MutableMapping, MutableMapping_T
35 | from habitat.sims.habitat_simulator.habitat_simulator import HabitatSim
36 | from habitat.core.dataset import Episode
37 | from habitat.core.registry import registry
38 | from habitat.core.simulator import (
39 | AgentState,
40 | Config,
41 | DepthSensor,
42 | Observations,
43 | RGBSensor,
44 | SemanticSensor,
45 | Sensor,
46 | SensorSuite,
47 | ShortestPathPoint,
48 | Simulator,
49 | VisualObservation,
50 | )
51 | from habitat.core.spaces import Space
52 |
53 | # inherit habitat-lab/habitat/sims/habitat_simulator/habitat_simulator.py
54 | @registry.register_simulator(name="Sim-v1")
55 | class Simulator(HabitatSim):
56 | r"""Simulator wrapper over habitat-sim
57 |
58 | habitat-sim repo: https://github.com/facebookresearch/habitat-sim
59 |
60 | Args:
61 | config: configuration for initializing the simulator.
62 | """
63 |
64 | def __init__(self, config: Config) -> None:
65 | super().__init__(config)
66 |
67 | def step_without_obs(self,
68 | action: Union[str, int, MutableMapping_T[int, Union[str, int]]],
69 | dt: float = 1.0 / 60.0,):
70 | self._num_total_frames += 1
71 | if isinstance(action, MutableMapping):
72 | return_single = False
73 | else:
74 | action = cast(Dict[int, Union[str, int]], {self._default_agent_id: action})
75 | return_single = True
76 | collided_dict: Dict[int, bool] = {}
77 | for agent_id, agent_act in action.items():
78 | agent = self.get_agent(agent_id)
79 | collided_dict[agent_id] = agent.act(agent_act)
80 | self.__last_state[agent_id] = agent.get_state()
81 |
82 | # # step physics by dt
83 | # step_start_Time = time.time()
84 | # super().step_world(dt)
85 | # self._previous_step_time = time.time() - step_start_Time
86 |
87 | multi_observations = {}
88 | for agent_id in action.keys():
89 | agent_observation = {}
90 | agent_observation["collided"] = collided_dict[agent_id]
91 | multi_observations[agent_id] = agent_observation
92 |
93 | if return_single:
94 | sim_obs = multi_observations[self._default_agent_id]
95 | else:
96 | sim_obs = multi_observations
97 |
98 | self._prev_sim_obs = sim_obs
--------------------------------------------------------------------------------
/habitat_extensions/maps.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List, Optional, Tuple, Union
2 |
3 | import networkx as nx
4 | import numpy as np
5 | from habitat.core.simulator import Simulator
6 | from habitat.core.utils import try_cv2_import
7 | from habitat.tasks.vln.vln import VLNEpisode
8 | from habitat.utils.visualizations import maps as habitat_maps
9 |
10 | cv2 = try_cv2_import()
11 |
12 | AGENT_SPRITE = habitat_maps.AGENT_SPRITE
13 |
14 | MAP_THICKNESS_SCALAR: int = 128
15 |
16 | MAP_INVALID_POINT = 0
17 | MAP_VALID_POINT = 1
18 | MAP_BORDER_INDICATOR = 2
19 | MAP_SOURCE_POINT_INDICATOR = 4
20 | MAP_TARGET_POINT_INDICATOR = 6
21 | MAP_MP3D_WAYPOINT = 7
22 | MAP_VIEW_POINT_INDICATOR = 8
23 | MAP_TARGET_BOUNDING_BOX = 9
24 | MAP_REFERENCE_POINT = 10
25 | MAP_MP3D_REFERENCE_PATH = 11
26 | MAP_WAYPOINT_PREDICTION = 12
27 | MAP_ORACLE_WAYPOINT = 13
28 | MAP_SHORTEST_PATH_WAYPOINT = 14
29 |
30 | MAP_CAND_WP = 15
31 | MAP_ORACLE_WP = 16
32 | MAP_HIST_WP = 17
33 | MAP_ACTION_WP = 18
34 |
35 | NODE = 19
36 | GHOST = 20
37 | PREDICT_GHOST = 21
38 | TEACHER_GHOST = 22
39 |
40 | TOP_DOWN_MAP_COLORS = np.full((256, 3), 150, dtype=np.uint8)
41 | TOP_DOWN_MAP_COLORS[15:] = cv2.applyColorMap(
42 | np.arange(241, dtype=np.uint8), cv2.COLORMAP_JET
43 | ).squeeze(1)[:, ::-1]
44 | TOP_DOWN_MAP_COLORS[MAP_INVALID_POINT] = [255, 255, 255] # White
45 | TOP_DOWN_MAP_COLORS[MAP_VALID_POINT] = [150, 150, 150] # Light Grey
46 | TOP_DOWN_MAP_COLORS[MAP_BORDER_INDICATOR] = [50, 50, 50] # Grey
47 | TOP_DOWN_MAP_COLORS[MAP_SOURCE_POINT_INDICATOR] = [0, 200, 0] # Green
48 | TOP_DOWN_MAP_COLORS[MAP_TARGET_POINT_INDICATOR] = [200, 0, 0] # Red
49 | TOP_DOWN_MAP_COLORS[MAP_MP3D_WAYPOINT] = [0, 200, 0] # Green
50 | TOP_DOWN_MAP_COLORS[MAP_VIEW_POINT_INDICATOR] = [255, 165, 0] # Orange
51 | TOP_DOWN_MAP_COLORS[MAP_TARGET_BOUNDING_BOX] = [0, 175, 0] # Dark Green
52 | TOP_DOWN_MAP_COLORS[MAP_REFERENCE_POINT] = [0, 0, 0] # Black
53 | TOP_DOWN_MAP_COLORS[MAP_MP3D_REFERENCE_PATH] = [0, 0, 0] # Black
54 | TOP_DOWN_MAP_COLORS[MAP_WAYPOINT_PREDICTION] = [240, 240, 0] # Yellow
55 | TOP_DOWN_MAP_COLORS[MAP_ORACLE_WAYPOINT] = [240, 0, 0] # Red
56 | TOP_DOWN_MAP_COLORS[MAP_SHORTEST_PATH_WAYPOINT] = [0, 150, 0] # Dark Green
57 |
58 | TOP_DOWN_MAP_COLORS[MAP_CAND_WP] = [240, 240, 0] # Yellow
59 | TOP_DOWN_MAP_COLORS[MAP_ORACLE_WP] = [240, 0, 0] # Red
60 | TOP_DOWN_MAP_COLORS[MAP_ACTION_WP] = [255, 165, 0] # Orange
61 |
62 | TOP_DOWN_MAP_COLORS[NODE] = [255, 165, 0] # Orange
63 | TOP_DOWN_MAP_COLORS[GHOST] = [0, 0, 200] # Blue
64 | TOP_DOWN_MAP_COLORS[PREDICT_GHOST] = [240, 240, 0] # Yellow
65 | TOP_DOWN_MAP_COLORS[TEACHER_GHOST] = [240, 0, 0] # Red
66 |
67 |
68 | def get_top_down_map(sim, map_resolution, meters_per_pixel):
69 | base_height = sim.get_agent(0).state.position[1]
70 | td_map = habitat_maps.get_topdown_map(
71 | sim.pathfinder,
72 | base_height,
73 | map_resolution,
74 | False,
75 | meters_per_pixel,
76 | )
77 | return td_map
78 |
79 |
80 | def colorize_topdown_map(
81 | top_down_map: np.ndarray,
82 | fog_of_war_mask: Optional[np.ndarray] = None,
83 | fog_of_war_desat_amount: float = 0.5,
84 | ) -> np.ndarray:
85 | """Same as `maps.colorize_topdown_map` in Habitat-Lab, but with different
86 | colors.
87 | """
88 | _map = TOP_DOWN_MAP_COLORS[top_down_map]
89 |
90 | if fog_of_war_mask is not None:
91 | fog_of_war_desat_values = np.array([[fog_of_war_desat_amount], [1.0]])
92 | # Only desaturate valid points as only valid points get revealed
93 | desat_mask = top_down_map != MAP_INVALID_POINT
94 |
95 | _map[desat_mask] = (
96 | _map * fog_of_war_desat_values[fog_of_war_mask]
97 | ).astype(np.uint8)[desat_mask]
98 |
99 | return _map
100 |
101 |
102 | def static_to_grid(
103 | realworld_x: float,
104 | realworld_y: float,
105 | grid_resolution: Tuple[int, int],
106 | bounds: Dict[str, Tuple[float, float]],
107 | ) -> Tuple[int, int]:
108 | """Return gridworld index of realworld coordinates assuming top-left
109 | corner is the origin. The real world coordinates of lower left corner are
110 | (coordinate_min, coordinate_min) and of top right corner are
111 | (coordinate_max, coordinate_max). Same as the Habitat-Lab maps.to_grid
112 | function but with a static `bounds` instead of requiring a simulator or
113 | pathfinder instance.
114 | """
115 | grid_size = (
116 | abs(bounds["upper"][2] - bounds["lower"][2]) / grid_resolution[0],
117 | abs(bounds["upper"][0] - bounds["lower"][0]) / grid_resolution[1],
118 | )
119 | grid_x = int((realworld_x - bounds["lower"][2]) / grid_size[0])
120 | grid_y = int((realworld_y - bounds["lower"][0]) / grid_size[1])
121 | return grid_x, grid_y
122 |
123 |
124 | def drawline(
125 | img: np.ndarray,
126 | pt1: Union[Tuple[float], List[float]],
127 | pt2: Union[Tuple[float], List[float]],
128 | color: List[int],
129 | thickness: int = 1,
130 | style: str = "dotted",
131 | gap: int = 15,
132 | ) -> None:
133 | """https://stackoverflow.com/questions/26690932/opencv-rectangle-with-dotted-or-dashed-lines
134 | style: "dotted", "dashed", or "filled"
135 | """
136 | assert style in ["dotted", "dashed", "filled"]
137 |
138 | if style == "filled":
139 | cv2.line(img, pt1, pt2, color, thickness)
140 | return
141 |
142 | dist = ((pt1[0] - pt2[0]) ** 2 + (pt1[1] - pt2[1]) ** 2) ** 0.5
143 | pts = []
144 | for i in np.arange(0, dist, gap):
145 | r = i / dist
146 | x = int((pt1[0] * (1 - r) + pt2[0] * r) + 0.5)
147 | y = int((pt1[1] * (1 - r) + pt2[1] * r) + 0.5)
148 | pts.append((x, y))
149 |
150 | if style == "dotted":
151 | for p in pts:
152 | cv2.circle(img, p, thickness, color, -1)
153 | else:
154 | s = pts[0]
155 | e = pts[0]
156 | for i, p in enumerate(pts):
157 | s = e
158 | e = p
159 | if i % 2 == 1:
160 | cv2.line(img, s, e, color, thickness)
161 |
162 |
163 | def drawpoint(
164 | img: np.ndarray,
165 | position: Union[Tuple[int], List[int]],
166 | color: List[int],
167 | meters_per_px: float,
168 | pad: float = 0.2,
169 | ) -> None:
170 | point_padding = int(pad / meters_per_px)
171 | img[
172 | position[0] - point_padding : position[0] + point_padding + 1,
173 | position[1] - point_padding : position[1] + point_padding + 1,
174 | ] = color
175 |
176 |
177 | def draw_triangle(
178 | img: np.ndarray,
179 | centroid: Union[Tuple[int], List[int]],
180 | color: List[int],
181 | meters_per_px: float,
182 | pad: float = 0.35,
183 | ) -> None:
184 | point_padding = int(pad / meters_per_px)
185 |
186 | # (Y, X)
187 | left = (centroid[1] - point_padding, centroid[0] + point_padding)
188 | right = (centroid[1] + point_padding, centroid[0] + point_padding)
189 | top = (centroid[1], centroid[0] - point_padding)
190 | cv2.drawContours(img, [np.array([left, right, top])], 0, color, -1)
191 |
192 |
193 | def draw_reference_path(
194 | img: np.ndarray,
195 | sim: Simulator,
196 | episode: VLNEpisode,
197 | map_resolution: int,
198 | meters_per_px: float,
199 | ) -> None:
200 | """Draws lines between each waypoint in the reference path."""
201 | shortest_path_points = [
202 | habitat_maps.to_grid(
203 | p[2],
204 | p[0],
205 | img.shape[0:2],
206 | sim,
207 | )[::-1]
208 | for p in episode.reference_path
209 | ]
210 |
211 | pt_from = None
212 | for i, pt_to in enumerate(shortest_path_points):
213 |
214 | if i != 0:
215 | drawline(
216 | img,
217 | (pt_from[0], pt_from[1]),
218 | (pt_to[0], pt_to[1]),
219 | MAP_SHORTEST_PATH_WAYPOINT,
220 | thickness=int(0.6 * map_resolution / MAP_THICKNESS_SCALAR),
221 | style="filled",
222 | gap=10,
223 | )
224 | pt_from = pt_to
225 |
226 | # for pt in shortest_path_points:
227 | # drawpoint(
228 | # img, (pt[1], pt[0]), MAP_SHORTEST_PATH_WAYPOINT, meters_per_px
229 | # )
230 |
231 |
232 | def draw_straight_shortest_path_points(
233 | img: np.ndarray,
234 | sim: Simulator,
235 | map_resolution: int,
236 | shortest_path_points: List[List[float]],
237 | ) -> None:
238 | """Draws the shortest path from start to goal assuming a standard
239 | discrete action space.
240 | """
241 | shortest_path_points = [
242 | habitat_maps.to_grid(p[2], p[0], img.shape[0:2], sim)[::-1]
243 | for p in shortest_path_points
244 | ]
245 |
246 | habitat_maps.draw_path(
247 | img,
248 | [(p[1], p[0]) for p in shortest_path_points],
249 | MAP_SHORTEST_PATH_WAYPOINT,
250 | int(0.4 * map_resolution / MAP_THICKNESS_SCALAR),
251 | )
252 |
253 |
254 | def draw_source_and_target(
255 | img: np.ndarray, sim: Simulator, episode: VLNEpisode, meters_per_px: float
256 | ) -> None:
257 | # s_x, s_y = habitat_maps.to_grid(
258 | # episode.start_position[2],
259 | # episode.start_position[0],
260 | # img.shape[0:2],
261 | # sim,
262 | # )
263 | # drawpoint(img, (s_x, s_y), MAP_SOURCE_POINT_INDICATOR, meters_per_px)
264 |
265 | # mark target point
266 | t_x, t_y = habitat_maps.to_grid(
267 | episode.goals[0].position[2],
268 | episode.goals[0].position[0],
269 | img.shape[0:2],
270 | sim,
271 | )
272 | drawpoint(img, (t_x, t_y), MAP_TARGET_POINT_INDICATOR, meters_per_px)
273 |
274 |
275 | def draw_waypoint_prediction(
276 | img: np.ndarray,
277 | waypoint: Union[Tuple[float], List[float]],
278 | meters_per_px: float,
279 | bounds: Dict[str, Tuple[float]],
280 | ) -> None:
281 | w_x, w_y = static_to_grid(waypoint[1], waypoint[0], img.shape[0:2], bounds)
282 | if w_x < img.shape[0] and w_x > 0 and w_y < img.shape[1] and w_y > 0:
283 | drawpoint(img, (w_x, w_y), MAP_WAYPOINT_PREDICTION, meters_per_px, pad=0.1)
284 |
285 |
286 | def draw_waypoint(
287 | img: np.ndarray,
288 | waypoint: Union[Tuple[float], List[float]],
289 | meters_per_px: float,
290 | bounds: Dict[str, Tuple[float]],
291 | color,
292 | ):
293 | w_x, w_y = static_to_grid(waypoint[1], waypoint[0], img.shape[0:2], bounds)
294 | if w_x < img.shape[0] and w_x > 0 and w_y < img.shape[1] and w_y > 0:
295 | drawpoint(img, (w_x, w_y), color, meters_per_px, pad=0.1)
296 |
297 |
298 | def draw_conn(img, p1, p2, bounds, color):
299 | p1_x, p1_y = static_to_grid(p1[1], p1[0], img.shape[0:2], bounds)
300 | p2_x, p2_y = static_to_grid(p2[1], p2[0], img.shape[0:2], bounds)
301 | drawline(img, [p1_y,p1_x], [p2_y,p2_x], color, thickness=2, style="dotted", gap=10)
302 |
303 |
304 | def get_nearest_node(graph: nx.Graph, current_position: List[float]) -> str:
305 | """Determine the closest MP3D node to the agent's start position as given
306 | by a [x,z] position vector.
307 | Returns:
308 | node ID
309 | """
310 | nearest = None
311 | dist = float("inf")
312 | for node in graph:
313 | node_pos = graph.nodes[node]["position"]
314 | node_pos = np.take(node_pos, (0, 2))
315 | cur_dist = np.linalg.norm(
316 | np.array(node_pos) - np.array(current_position), ord=2
317 | )
318 | if cur_dist < dist:
319 | dist = cur_dist
320 | nearest = node
321 | return nearest
322 |
323 |
324 | def update_nearest_node(
325 | graph: nx.Graph, nearest_node: str, current_position: np.ndarray
326 | ) -> str:
327 | """Determine the closest MP3D node to the agent's current position as
328 | given by a [x,z] position vector. The selected node must be reachable
329 | from the previous MP3D node as specified in the nav-graph edges.
330 | Returns:
331 | node ID
332 | """
333 | nearest = None
334 | dist = float("inf")
335 |
336 | for node in [nearest_node] + [e[1] for e in graph.edges(nearest_node)]:
337 | node_pos = graph.nodes[node]["position"]
338 | node_pos = np.take(node_pos, (0, 2))
339 | cur_dist = np.linalg.norm(
340 | np.array(node_pos) - np.array(current_position), ord=2
341 | )
342 | if cur_dist < dist:
343 | dist = cur_dist
344 | nearest = node
345 | return nearest
346 |
347 |
348 | def draw_mp3d_nodes(
349 | img: np.ndarray,
350 | sim: Simulator,
351 | episode: VLNEpisode,
352 | graph: nx.Graph,
353 | meters_per_px: float,
354 | ) -> None:
355 | n = get_nearest_node(
356 | graph, (episode.start_position[0], episode.start_position[2])
357 | )
358 | starting_height = graph.nodes[n]["position"][1]
359 | for node in graph:
360 | pos = graph.nodes[node]["position"]
361 |
362 | # no obvious way to differentiate between floors. Use this for now.
363 | if abs(pos[1] - starting_height) < 1.0:
364 | r_x, r_y = habitat_maps.to_grid(
365 | pos[2], pos[0], img.shape[0:2], sim
366 | )
367 |
368 | # only paint if over a valid point
369 | if img[r_x, r_y]:
370 | drawpoint(img, (r_x, r_y), MAP_MP3D_WAYPOINT, meters_per_px)
371 |
--------------------------------------------------------------------------------
/habitat_extensions/pose_utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Pose and coordinate utility functions
3 | """
4 |
5 | import quaternion
6 | import numpy as np
7 | from typing import List
8 |
9 |
10 | def get_l2_distance(x1, x2, y1, y2):
11 | """
12 | Computes the L2 distance between two points.
13 | """
14 | return ((x1 - x2) ** 2 + (y1 - y2) ** 2) ** 0.5
15 |
16 |
17 | def get_rel_pose_change(pos2, pos1):
18 | x1, y1, o1 = pos1
19 | x2, y2, o2 = pos2
20 |
21 | theta = np.arctan2(y2 - y1, x2 - x1) - o1
22 | dist = get_l2_distance(x1, x2, y1, y2)
23 | dx = dist * np.cos(theta)
24 | dy = dist * np.sin(theta)
25 | do = o2 - o1
26 |
27 | return dx, dy, do
28 |
29 |
30 | def get_new_pose(pose, rel_pose_change):
31 | x, y, o = pose
32 | dx, dy, do = rel_pose_change
33 |
34 | global_dx = dx * np.sin(np.deg2rad(o)) + dy * np.cos(np.deg2rad(o))
35 | global_dy = dx * np.cos(np.deg2rad(o)) - dy * np.sin(np.deg2rad(o))
36 | x += global_dy
37 | y += global_dx
38 | o += np.rad2deg(do)
39 | if o > 180.:
40 | o -= 360.
41 |
42 | return x, y, o
43 |
44 |
45 | def threshold_poses(coords, shape):
46 | coords[0] = min(max(0, coords[0]), shape[0] - 1)
47 | coords[1] = min(max(0, coords[1]), shape[1] - 1)
48 | return coords
49 |
50 |
51 | def get_sim_location(sim):
52 | """Returns x, y, o pose of the agent in the Habitat simulator."""
53 | agent_state = sim.get_agent_state(0)
54 | x = -agent_state.position[2]
55 | y = -agent_state.position[0]
56 | axis = quaternion.as_euler_angles(agent_state.rotation)[0]
57 | if (axis % (2 * np.pi)) < 0.1 or (axis %
58 | (2 * np.pi)) > 2 * np.pi - 0.1:
59 | o = quaternion.as_euler_angles(agent_state.rotation)[1]
60 | else:
61 | o = 2 * np.pi - quaternion.as_euler_angles(agent_state.rotation)[1]
62 | if o > np.pi:
63 | o -= 2 * np.pi
64 | return x, y, o
65 |
66 |
67 | def get_start_sim_location(position: List, rotation: List) -> tuple:
68 | x = -position[2]
69 | y = -position[0]
70 | axis = quaternion.as_euler_angles(rotation)[0]
71 | if (axis % (2 * np.pi)) < 0.1 or (axis %
72 | (2 * np.pi)) > 2 * np.pi - 0.1:
73 | o = quaternion.as_euler_angles(rotation)[1]
74 | else:
75 | o = 2 * np.pi - quaternion.as_euler_angles(rotation)[1]
76 | if o > np.pi:
77 | o -= 2 * np.pi
78 |
79 | return x, y, o
80 |
81 |
82 | def get_pose_change(sim, last_sim_location):
83 | """Returns dx, dy, do pose change of the agent relative to the last
84 | timestep."""
85 | curr_sim_pose = get_sim_location(sim)
86 | dx, dy, do = get_rel_pose_change(curr_sim_pose, last_sim_location)
87 |
88 | return dx, dy, do, curr_sim_pose
--------------------------------------------------------------------------------
/habitat_extensions/sensors.py:
--------------------------------------------------------------------------------
1 | """
2 | Implement other sensors if needed.
3 | """
4 |
5 | import numpy as np
6 | from gym import spaces
7 | from typing import Any, Dict
8 |
9 | from habitat.config import Config
10 | from habitat.core.registry import registry
11 | from habitat.core.simulator import Sensor, Simulator, SensorTypes
12 |
13 | from habitat.tasks.utils import cartesian_to_polar
14 | from habitat.utils.geometry_utils import quaternion_rotate_vector
15 |
16 | from habitat_extensions.pose_utils import get_pose_change, get_start_sim_location, get_sim_location
17 |
18 |
19 | @registry.register_sensor
20 | class SensorPoseSensor(Sensor):
21 | """It is a senor to get sensor's pose
22 | """
23 | def __init__(self, sim: Simulator, config: Config, *args: Any, **kwargs: Any) -> None:
24 | super().__init__(config=config)
25 | self._sim = sim
26 | self.episode_start = False
27 | self.last_sim_location = get_sim_location(self._sim)
28 | self.sensor_pose = [0., 0., 0.] # initialize last sensor pose as [0,0,0]
29 |
30 | def _get_uuid(self, *args: Any, **kwargs: Any) -> str:
31 | return "sensor_pose"
32 |
33 | def _get_sensor_type(self, *args: Any, **kwargs: Any):
34 | return SensorTypes.TACTILE
35 |
36 | def _get_observation_space(self, *args: Any, **kwargs: Any):
37 | return spaces.Box(low=-100., high=100., shape=(3,), dtype=np.float64)
38 |
39 | def get_observation(self, observations, *args: Any, episode, **kwargs: Any):
40 | if not self.episode_start:
41 | start_position = episode.start_position
42 | start_rotation = np.quaternion(episode.start_rotation[-1], *episode.start_rotation[:-1])
43 | self.last_sim_location = get_start_sim_location(start_position, start_rotation)
44 | self.episode_start = True
45 | dx, dy, do, self.last_sim_location = get_pose_change(self._sim, self.last_sim_location)
46 | self.sensor_pose = [dx, dy, do]
47 |
48 | return self.sensor_pose
49 |
50 |
51 | @registry.register_sensor
52 | class LLMSensor(Sensor):
53 | def __init__(self, *args: Any, **kwargs: Any) -> None:
54 | super().__init__(*args, **kwargs)
55 | pass
56 |
57 | def _get_uuid(self, *args: Any, **kwargs: Any) -> str:
58 | return "llm_reply"
59 |
60 | def _get_sensor_type(self, *args: Any, **kwargs: Any) -> SensorTypes:
61 | return SensorTypes.TEXT
62 |
63 | def _get_observation_space(self, *args: Any, **kwargs: Any) -> spaces.Space:
64 | return spaces.Discrete(0)
65 |
66 | def get_observation(self, observations, *args: Any, episode, **kwargs: Any) -> Dict:
67 | return episode.llm_reply
68 |
69 |
70 | @registry.register_sensor
71 | class PositionSensor(Sensor):
72 | r"""The agents current location in the global coordinate frame
73 |
74 | Args:
75 | sim: reference to the simulator for calculating task observations.
76 | config: Contains the DIMENSIONALITY field for the number of dimensions
77 | to express the agent's position
78 | Attributes:
79 | _dimensionality: number of dimensions used to specify the agent's position
80 | """
81 |
82 | cls_uuid: str = "position"
83 |
84 | def __init__(
85 | self, sim: Simulator, config: Config, *args: Any, **kwargs: Any
86 | ):
87 | self._sim = sim
88 | super().__init__(config=config)
89 |
90 | def _get_uuid(self, *args: Any, **kwargs: Any):
91 | return self.cls_uuid
92 |
93 | def _get_sensor_type(self, *args: Any, **kwargs: Any):
94 | return SensorTypes.POSITION
95 |
96 | def _get_observation_space(self, *args: Any, **kwargs: Any):
97 | return spaces.Box(
98 | low=np.finfo(np.float32).min,
99 | high=np.finfo(np.float32).max,
100 | shape=(2,),
101 | dtype=np.float32,
102 | )
103 |
104 | def get_observation(self, *args: Any, **kwargs: Any):
105 | return self._sim.get_agent_state().position.astype(np.float32)
106 |
107 |
108 | @registry.register_sensor
109 | class HeadingSensor(Sensor):
110 | r"""Sensor for observing the agent's heading in the global coordinate
111 | frame.
112 | Args:
113 | sim: reference to the simulator for calculating task observations.
114 | config: config for the sensor.
115 | """
116 |
117 | def __init__(
118 | self, sim: Simulator, config: Config, *args: Any, **kwargs: Any
119 | ):
120 | self._sim = sim
121 | super().__init__(config=config)
122 |
123 | def _get_uuid(self, *args: Any, **kwargs: Any):
124 | return "heading"
125 |
126 | def _get_sensor_type(self, *args: Any, **kwargs: Any):
127 | return SensorTypes.HEADING
128 |
129 | def _get_observation_space(self, *args: Any, **kwargs: Any):
130 | return spaces.Box(low=-np.pi, high=np.pi, shape=(1,), dtype=np.float64)
131 |
132 | def _quat_to_xy_heading(self, quat):
133 | direction_vector = np.array([0, 0, -1])
134 | heading_vector = quaternion_rotate_vector(quat, direction_vector)
135 | phi = cartesian_to_polar(-heading_vector[2], heading_vector[0])[1]
136 | return np.array([phi], dtype=np.float32)
137 |
138 | def get_observation(
139 | self, observations, episode, *args: Any, **kwargs: Any
140 | ):
141 | agent_state = self._sim.get_agent_state()
142 | rotation_world_agent = agent_state.rotation
143 |
144 | heading = self._quat_to_xy_heading(rotation_world_agent.inverse())
145 | self._sim.record_heading = heading
146 |
147 | return heading
--------------------------------------------------------------------------------
/habitat_extensions/task.py:
--------------------------------------------------------------------------------
1 | """
2 | Define the zero-shot VLN task: Dataset, Episode
3 | """
4 |
5 | import gzip
6 | import json
7 | import os
8 | from typing import Dict, List, Optional, Union
9 |
10 | import attr
11 | from habitat.config import Config
12 | from habitat.core.dataset import Dataset
13 | from habitat.core.registry import registry
14 | from habitat.core.utils import not_none_validator
15 | from habitat.datasets.pointnav.pointnav_dataset import ALL_SCENES_MASK
16 | from habitat.datasets.utils import VocabDict
17 | from habitat.tasks.nav.nav import NavigationGoal
18 | from habitat.tasks.vln.vln import InstructionData, VLNEpisode
19 | import random
20 |
21 | random.seed(0)
22 |
23 | DEFAULT_SCENE_PATH_PREFIX = "data/scene_datasets/"
24 | ALL_LANGUAGES_MASK = "*"
25 | ALL_ROLES_MASK = "*"
26 |
27 |
28 | @attr.s(auto_attribs=True)
29 | class ExtendedInstructionData:
30 | instruction_text: str = attr.ib(default=None, validator=not_none_validator)
31 | instruction_id: Optional[str] = attr.ib(default=None)
32 | language: Optional[str] = attr.ib(default=None)
33 | annotator_id: Optional[str] = attr.ib(default=None)
34 | edit_distance: Optional[float] = attr.ib(default=None)
35 | timed_instruction: Optional[List[Dict[str, Union[float, str]]]] = attr.ib(
36 | default=None
37 | )
38 | instruction_tokens: Optional[List[str]] = attr.ib(default=None)
39 | split: Optional[str] = attr.ib(default=None)
40 |
41 |
42 | @attr.s(auto_attribs=True, kw_only=True)
43 | class VLNExtendedEpisode(VLNEpisode):
44 | goals: Optional[List[NavigationGoal]] = attr.ib(default=None)
45 | reference_path: Optional[List[List[float]]] = attr.ib(default=None)
46 | instruction: ExtendedInstructionData = attr.ib(
47 | default=None, validator=not_none_validator
48 | )
49 | trajectory_id: Optional[Union[int, str]] = attr.ib(default=None)
50 | llm_reply: Dict = attr.ib(default=None)
51 |
52 |
53 | @registry.register_dataset(name="VLN-CE-v1")
54 | class VLNCEDatasetV1(Dataset):
55 | r"""Class inherited from Dataset that loads a Vision and Language
56 | Navigation dataset.
57 | """
58 |
59 | episodes: List[VLNEpisode]
60 | instruction_vocab: VocabDict
61 |
62 | @staticmethod
63 | def check_config_paths_exist(config: Config) -> bool:
64 | return os.path.exists(
65 | config.DATA_PATH.format(split=config.SPLIT)
66 | ) and os.path.exists(config.SCENES_DIR)
67 |
68 | @staticmethod
69 | def _scene_from_episode(episode: VLNEpisode) -> str:
70 | r"""Helper method to get the scene name from an episode. Assumes
71 | the scene_id is formatted as /path/to/<scene_name>.<suffix>.
72 | """
73 | return os.path.splitext(os.path.basename(episode.scene_id))[0]
74 |
75 | @classmethod
76 | def get_scenes_to_load(cls, config: Config) -> List[str]:
77 | r"""Return a sorted list of scenes"""
78 | assert cls.check_config_paths_exist(config)
79 | dataset = cls(config)
80 | return sorted(
81 | {cls._scene_from_episode(episode) for episode in dataset.episodes}
82 | )
83 |
84 | def __init__(self, config: Optional[Config] = None) -> None:
85 | self.episodes = []
86 |
87 | if config is None:
88 | return
89 |
90 | dataset_filename = config.DATA_PATH.format(split=config.SPLIT)
91 | llm_replys_filename = config.LLM_REPLYS_PATH
92 | with gzip.open(dataset_filename, "rt") as f:
93 | self.from_json(f.read(),
94 | scenes_dir=config.SCENES_DIR,
95 | llm_replys_filename=llm_replys_filename,
96 | config=config)
97 |
98 | if ALL_SCENES_MASK not in config.CONTENT_SCENES:
99 | scenes_to_load = set(config.CONTENT_SCENES)
100 | self.episodes = [
101 | episode
102 | for episode in self.episodes
103 | if self._scene_from_episode(episode) in scenes_to_load
104 | ]
105 |
106 | if config.EPISODES_ALLOWED is not None:
107 | ep_ids_before = {ep.episode_id for ep in self.episodes}
108 | ep_ids_to_purge = ep_ids_before - set([ int(id) for id in config.EPISODES_ALLOWED])
109 | self.episodes = [
110 | episode
111 | for episode in self.episodes
112 | if episode.episode_id not in ep_ids_to_purge
113 | ]
114 |
115 | def from_json(self,
116 | json_str: str, scenes_dir: Optional[str] = None,
117 | llm_replys_filename = None, config: Optional[Config] = None) -> None:
118 |
119 | deserialized = json.loads(json_str)
120 | if llm_replys_filename:
121 | with open(llm_replys_filename, 'r') as f:
122 | llm_replys = json.load(f)
123 | # if config.EPISODES_ALLOWED is not None:
124 | # config.defrost()
125 | # config.EPISODES_ALLOWED = list(map(lambda x: int(x), list(llm_replys.keys()))) # the episode subset is sampled here
126 | # config.freeze()
127 |
128 | self.instruction_vocab = VocabDict(
129 | word_list=deserialized["instruction_vocab"]["word_list"]
130 | )
131 |
132 | for episode in deserialized["episodes"]:
133 | if llm_replys_filename:
134 | llm_reply = llm_replys.get(str(episode["episode_id"]), {})
135 | episode["llm_reply"] = llm_reply
136 | episode = VLNExtendedEpisode(**episode)
137 |
138 | if scenes_dir is not None:
139 | if episode.scene_id.startswith(DEFAULT_SCENE_PATH_PREFIX):
140 | episode.scene_id = episode.scene_id[
141 | len(DEFAULT_SCENE_PATH_PREFIX) :
142 | ]
143 |
144 | episode.scene_id = os.path.join(scenes_dir, episode.scene_id)
145 |
146 | episode.instruction = InstructionData(**episode.instruction)
147 | if episode.goals is not None:
148 | for g_index, goal in enumerate(episode.goals):
149 | episode.goals[g_index] = NavigationGoal(**goal)
150 | self.episodes.append(episode)
151 |
152 | random.shuffle(self.episodes)
153 |
154 |
155 | @registry.register_dataset(name="RxR-VLN-CE-v1")
156 | class RxRVLNCEDatasetV1(Dataset):
157 | r"""Loads the RxR VLN-CE Dataset."""
158 |
159 | episodes: List[VLNEpisode]
160 | instruction_vocab: VocabDict
161 | annotation_roles: List[str] = ["guide", "follower"]
162 | languages: List[str] = ["en-US", "en-IN", "hi-IN", "te-IN"]
163 |
164 | @staticmethod
165 | def _scene_from_episode(episode: VLNEpisode) -> str:
166 | r"""Helper method to get the scene name from an episode. Assumes
167 | the scene_id is formatted as /path/to/<scene_name>.<suffix>.
168 | """
169 | return os.path.splitext(os.path.basename(episode.scene_id))[0]
170 |
171 | @staticmethod
172 | def _language_from_episode(episode: VLNExtendedEpisode) -> str:
173 | return episode.instruction.language
174 |
175 | @classmethod
176 | def get_scenes_to_load(cls, config: Config) -> List[str]:
177 | r"""Return a sorted list of scenes"""
178 | assert cls.check_config_paths_exist(config)
179 | dataset = cls(config)
180 | return sorted(
181 | {cls._scene_from_episode(episode) for episode in dataset.episodes}
182 | )
183 |
184 | @classmethod
185 | def extract_roles_from_config(cls, config: Config) -> List[str]:
186 | if ALL_ROLES_MASK in config.ROLES:
187 | return cls.annotation_roles
188 | assert set(config.ROLES).issubset(set(cls.annotation_roles))
189 | return config.ROLES
190 |
191 | @classmethod
192 | def check_config_paths_exist(cls, config: Config) -> bool:
193 | return all(
194 | os.path.exists(
195 | config.DATA_PATH.format(split=config.SPLIT, role=role)
196 | )
197 | for role in cls.extract_roles_from_config(config)
198 | ) and os.path.exists(config.SCENES_DIR)
199 |
200 | def __init__(self, config: Optional[Config] = None) -> None:
201 | self.episodes = []
202 | self.config = config
203 |
204 | if config is None:
205 | return
206 |
207 | for role in self.extract_roles_from_config(config):
208 | with gzip.open(
209 | config.DATA_PATH.format(split=config.SPLIT, role=role), "rt"
210 | ) as f:
211 | self.from_json(f.read(), scenes_dir=config.SCENES_DIR)
212 |
213 | if ALL_SCENES_MASK not in config.CONTENT_SCENES:
214 | scenes_to_load = set(config.CONTENT_SCENES)
215 | self.episodes = [
216 | episode
217 | for episode in self.episodes
218 | if self._scene_from_episode(episode) in scenes_to_load
219 | ]
220 |
221 | if ALL_LANGUAGES_MASK not in config.LANGUAGES:
222 | languages_to_load = set(config.LANGUAGES)
223 | self.episodes = [
224 | episode
225 | for episode in self.episodes
226 | if self._language_from_episode(episode) in languages_to_load
227 | ]
228 |
229 | if config.EPISODES_ALLOWED is not None:
230 | ep_ids_before = {ep.episode_id for ep in self.episodes}
231 | ep_ids_to_purge = ep_ids_before - set(config.EPISODES_ALLOWED)
232 | self.episodes = [
233 | episode
234 | for episode in self.episodes
235 | if episode.episode_id not in ep_ids_to_purge
236 | ]
237 |
238 | def from_json(
239 | self, json_str: str, scenes_dir: Optional[str] = None
240 | ) -> None:
241 |
242 | deserialized = json.loads(json_str)
243 |
244 | for episode in deserialized["episodes"]:
245 | episode = VLNExtendedEpisode(**episode)
246 |
247 | if scenes_dir is not None:
248 | if episode.scene_id.startswith(DEFAULT_SCENE_PATH_PREFIX):
249 | episode.scene_id = episode.scene_id[
250 | len(DEFAULT_SCENE_PATH_PREFIX) :
251 | ]
252 |
253 | episode.scene_id = os.path.join(scenes_dir, episode.scene_id)
254 |
255 | episode.instruction = ExtendedInstructionData(
256 | **episode.instruction
257 | )
258 | episode.instruction.split = self.config.SPLIT
259 | if episode.goals is not None:
260 | for g_index, goal in enumerate(episode.goals):
261 | episode.goals[g_index] = NavigationGoal(**goal)
262 | self.episodes.append(episode)
263 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | setuptools==65.5.0
2 | wheel==0.38.0
3 | dtw_python==1.4.2
4 | fast_slic==0.4.0
5 | fastdtw==0.3.4
6 | matplotlib==3.7.2
7 | networkx==3.1
8 | numpy==1.23.4
9 | openai==1.28.1
10 | Pillow==8.4.0
11 | pyinstrument
12 | scikit_fmm
13 | scikit_image==0.15.0
14 | supervision==0.6.0
15 | timm==0.4.12
16 | tqdm==4.66.4
17 | roboflow==1.1.28
18 | scipy==1.10.1
19 | transformers==4.38.1
20 | nltk
--------------------------------------------------------------------------------
/requirements2.txt:
--------------------------------------------------------------------------------
1 | gym==0.21.0
2 | opencv_python==4.9.0.80
3 | salesforce_lavis==1.0.2
--------------------------------------------------------------------------------
/run_mp.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import random
3 | import os
4 | import json
5 | from copy import deepcopy
6 | import glob
7 | from pprint import pprint
8 |
9 | import numpy as np
10 | import torch
11 | import torch.multiprocessing as mp
12 | torch.multiprocessing.set_start_method('spawn', force=True)
13 | from multiprocessing import Pool
14 |
15 | from habitat import logger
16 | from habitat_baselines.common.baseline_registry import baseline_registry
17 |
18 | from vlnce_baselines.config.default import get_config
19 | from vlnce_baselines.common.utils import seed_everything
20 |
21 |
22 | def run_exp(exp_name: str, exp_config: str,
23 | run_type: str, nprocesses: int, opts=None) -> None:
24 | r"""Runs experiment given mode and config
25 |
26 | Args:
27 | exp_config: path to config file.
28 | run_type: "train" or "eval.
29 | opts: list of strings of additional config options.
30 |
31 | Returns:
32 | None.
33 | """
34 | config = get_config(exp_config, opts)
35 | config.defrost()
36 | config.TENSORBOARD_DIR += exp_name
37 | config.CHECKPOINT_FOLDER += exp_name
38 | config.EVAL_CKPT_PATH_DIR += exp_name
39 | config.RESULTS_DIR += exp_name
40 | config.VIDEO_DIR += exp_name
41 | config.LOG_FILE = exp_name + '_' + config.LOG_FILE
42 | config.freeze()
43 |
44 | os.makedirs(config.RESULTS_DIR, exist_ok=True)
45 | os.makedirs(config.EVAL_CKPT_PATH_DIR, exist_ok=True)
46 | os.system("mkdir -p data/logs/running_log")
47 | logger.add_filehandler('data/logs/running_log/' + config.LOG_FILE)
48 | logger.info(f"hyper parameters:\n{config.EVAL}")
49 | logger.info(f"llm reply file: {config.TASK_CONFIG.DATASET.LLM_REPLYS_PATH}")
50 |
51 | # dataset split, start multi-processes
52 | num_devices = torch.cuda.device_count()
53 | print(f'num devices: {num_devices}, num processes: {nprocesses}')
54 | with open(config.TASK_CONFIG.DATASET.LLM_REPLYS_PATH, 'r') as f:
55 | llm_reply_dataset = json.load(f)
56 | episode_ids = list(llm_reply_dataset.keys())
57 | split_episode_ids = [episode_ids[i::nprocesses] for i in range(nprocesses)]
58 |
59 | configs = []
60 | for i, ep_ids in enumerate(split_episode_ids):
61 | shared_config = deepcopy(config)
62 | shared_config.defrost()
63 | device_num = i % num_devices
64 | shared_config.local_rank = i
65 | shared_config.world_size = nprocesses
66 | shared_config.TORCH_GPU_ID = device_num
67 | shared_config.TORCH_GPU_IDS = [device_num]
68 | shared_config.SIMULATOR_GPU_IDS = [device_num]
69 | shared_config.TASK_CONFIG.DATASET.EPISODES_ALLOWED = ep_ids
70 | shared_config.freeze()
71 | configs.append(shared_config)
72 |
73 | pool = Pool(processes=nprocesses)
74 | pool.map(worker, configs)
75 | pool.close()
76 | pool.join()
77 | fns = glob.glob(config.CHECKPOINT_FOLDER + '/stats_ep_ckpt_*.json')
78 | summary = {}
79 | for fn in fns:
80 | with open(fn, 'r') as f:
81 | summary.update(json.load(f))
82 | summary_metrics = {
83 | "steps_taken": [],
84 | "distance_to_goal": [],
85 | "success": [],
86 | "oracle_success": [],
87 | "path_length": [],
88 | "spl": [],
89 | "ndtw": [],
90 | "sdtw": [],
91 | }
92 | for epid, metric in summary.items():
93 | for k, v in metric.items():
94 | summary_metrics[k].append(v)
95 | for k, v in summary_metrics.items():
96 | summary_metrics[k] = np.mean(v)
97 | pprint(summary_metrics)
98 | with open(config.CHECKPOINT_FOLDER + '/stats_ckpt_val_unseen.json', 'w') as f:
99 | json.dump(summary_metrics, f, indent=2)
100 |
101 | def worker(config):
102 | seed_everything(config.TASK_CONFIG.SEED)
103 | torch.backends.cudnn.benchmark = False
104 | torch.backends.cudnn.deterministic = False
105 | if torch.cuda.is_available():
106 | torch.set_num_threads(1)
107 |
108 | TRAINER = baseline_registry.get_trainer(config.TRAINER_NAME)
109 | assert TRAINER is not None, f"{config.TRAINER_NAME} is not supported"
110 | trainer = TRAINER(config)
111 | trainer.eval()
112 |
113 | if __name__ == "__main__":
114 | parser = argparse.ArgumentParser()
115 | parser.add_argument(
116 | "--exp_name",
117 | type=str,
118 | default="test",
119 | required=True,
120 | help="experiment id that matches to exp-id in Notion log",
121 | )
122 | parser.add_argument(
123 | "--run-type",
124 | choices=["eval"],
125 | required=True,
126 | help="run type of the experiment(train, eval, inference), only eval for zero-shot vln",
127 | )
128 | parser.add_argument(
129 | "--nprocesses",
130 | type=int,
131 | default=1,
132 | help="number of processes",
133 | )
134 | parser.add_argument(
135 | "--exp-config",
136 | type=str,
137 | required=True,
138 | help="path to config yaml containing info about experiment",
139 | )
140 | parser.add_argument(
141 | "opts",
142 | default=None,
143 | nargs=argparse.REMAINDER,
144 | help="Modify config options from command line",
145 | )
146 | args = parser.parse_args()
147 | print(args)
148 |
149 | mp.set_start_method('spawn', force=True)
150 | run_exp(**vars(args))
--------------------------------------------------------------------------------
/run_r2r/main.sh:
--------------------------------------------------------------------------------
1 | export GLOG_minloglevel=2
2 | export MAGNUM_LOG=quiet
3 |
4 | flag=" --exp_name exp_1
5 | --run-type eval
6 | --exp-config vlnce_baselines/config/exp1.yaml
7 | --nprocesses 16
8 | NUM_ENVIRONMENTS 1
9 | TRAINER_NAME ZS-Evaluator-mp
10 | TORCH_GPU_IDS [0,1,2,3,4,5,6,7]
11 | SIMULATOR_GPU_IDS [0,1,2,3,4,5,6,7]
12 | "
13 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python run_mp.py $flag
--------------------------------------------------------------------------------
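
main.sh launches 16 evaluation workers over 8 GPUs; run_mp.py then splits the LLM-reply episode ids round-robin across workers and pins worker i to GPU i % num_devices. For a smaller machine the same run can be expressed programmatically; a minimal sketch, assuming a single GPU, two workers, and the default exp1.yaml (the overrides mirror the flags above):

```python
import torch.multiprocessing as mp
from run_mp import run_exp  # run_mp.py sits at the repository root

if __name__ == "__main__":
    mp.set_start_method("spawn", force=True)
    run_exp(
        exp_name="exp_1",
        exp_config="vlnce_baselines/config/exp1.yaml",
        run_type="eval",
        nprocesses=2,  # two workers sharing GPU 0
        opts=["NUM_ENVIRONMENTS", "1",
              "TRAINER_NAME", "ZS-Evaluator-mp",
              "TORCH_GPU_IDS", "[0]",
              "SIMULATOR_GPU_IDS", "[0]"],
    )
```
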
/vlnce_baselines/__init__.py:
--------------------------------------------------------------------------------
1 | from vlnce_baselines import ZS_Evaluator_mp
2 | from vlnce_baselines.common import environments
3 | from vlnce_baselines.models import Policy
--------------------------------------------------------------------------------
/vlnce_baselines/common/constraints.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from PIL import Image
4 | import torch.nn as nn
5 | from typing import List
6 | import supervision as sv
7 | from habitat import Config
8 | from collections.abc import Sequence
9 | from vlnce_baselines.utils.map_utils import *
10 | from lavis.models import load_model_and_preprocess
11 | from vlnce_baselines.utils.constant import direction_mapping
12 |
13 |
14 | class ConstraintsMonitor(nn.Module):
15 | def __init__(self, config: Config, device: torch.device) -> None:
16 | super().__init__()
17 | self.config = config
18 | self.resolution = config.MAP.MAP_RESOLUTION
19 | self.turn_angle = config.TASK_CONFIG.SIMULATOR.TURN_ANGLE
20 | self.device = device
21 | self._load_from_disk()
22 |
23 | def _create_model(self):
24 | self.model, vis_processors, text_processors = \
25 | load_model_and_preprocess("blip_vqa", model_type="vqav2", device=self.device, is_eval=True)
26 | self.vis_processors = vis_processors["eval"]
27 | self.text_processors = text_processors["eval"]
28 |
29 | def _load_from_disk(self):
30 | self.model = torch.load(self.config.VQA_MODEL_DIR, map_location='cpu').to(self.device)
31 | self.vis_processors = torch.load(self.config.VQA_VIS_PROCESSORS_DIR)["eval"]
32 | self.text_processors = torch.load(self.config.VQA_TEXT_PROCESSORS_DIR)["eval"]
33 |
34 |     def location_constraint(self, obs: dict, scene: str):
35 | """
36 | use VQA to check scene type
37 | """
38 | image = Image.fromarray(obs['rgb'].astype(np.uint8))
39 |         question = f"Are you in the {scene}?"
40 | image = self.vis_processors(image).unsqueeze(0).to(self.device)
41 | question = self.text_processors(question)
42 | samples = {"image": image, "text_input": question}
43 | answer_candidates = ["yes", "no"]
44 | answer = self.model.predict_answers(samples, answer_list=answer_candidates, inference_method="rank")[0]
45 | if answer == "yes":
46 | return True
47 | else:
48 | return False
49 |
50 | def object_constraint(self, current_detection: sv.Detections, object: str, classes: List):
51 | """
52 | use grounded-sam's detections to check object
53 | """
54 | class_ids = current_detection.class_id
55 | class_names = [classes[i] for i in class_ids]
56 | if object in class_names:
57 | return True
58 | else:
59 | return False
60 |
61 | def direction_constraint(self, current_pose: Sequence, last_pose: Sequence, object):
62 | """
63 | check by geometric relation
64 | """
65 | heading = -1 * last_pose[-1]
66 | current_position, _ = get_agent_position(current_pose, self.resolution)
67 | last_position, _ = get_agent_position(last_pose, self.resolution)
68 | position_vector = current_position - last_position
69 | displacement = np.linalg.norm(position_vector)
70 | heading_vector = angle_to_vector(heading)
71 | rotation_matrix = np.array([[0, -1],
72 | [1, 0]])
73 | heading_vector = np.dot(rotation_matrix, heading_vector)
74 | if np.array_equal(position_vector, np.array([0., 0.])):
75 | return False
76 | degrees, direction = angle_and_direction(heading_vector, position_vector, self.turn_angle + 1)
77 | if degrees >= 120:
78 | movement = "backward"
79 | elif degrees == 0 or degrees == 180 or direction == 1:
80 | movement = "forward"
81 | else:
82 | if direction == 2:
83 | movement = "left"
84 | elif direction == 3:
85 | movement = "right"
86 | object_direction = direction_mapping.get(object, "ambiguous direction")
87 | if object_direction == "ambiguous direction":
88 | print("!Won't check ambiguous direction!")
89 | return True
90 | elif movement == object_direction and displacement >= 0.5 * 100 / self.resolution:
91 | return True
92 | else:
93 | return False
94 |
95 | def forward(self,
96 |                 constraints: List, obs: dict,
97 | detection: sv.Detections, classes: List,
98 | current_pose: Sequence, last_pose: Sequence):
99 | res = []
100 | for item in constraints:
101 | constraint_type, constraint_object = item[:2]
102 | constraint_type = constraint_type.lower().strip()
103 | if constraint_type == "location constraint":
104 | check = self.location_constraint(obs, constraint_object)
105 | res.append(check)
106 | elif constraint_type == "object constraint":
107 | check = self.object_constraint(detection, constraint_object, classes)
108 | res.append(check)
109 | elif constraint_type == "direction constraint":
110 | check = self.direction_constraint(current_pose, last_pose, constraint_object)
111 | res.append(check)
112 |
113 | return res
--------------------------------------------------------------------------------
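
The monitor above checks each sub-instruction with three mechanisms: BLIP VQA for location constraints, Grounded-SAM detections for object constraints, and pose geometry for direction constraints. The geometric idea behind `direction_constraint` can be sketched standalone; this is a simplified, self-contained version (the function name and the 30/120-degree thresholds are illustrative, and it omits the map-grid resolution, the displacement threshold, and the `map_utils` helpers the real method uses):

```python
import numpy as np

def classify_movement(last_xy, current_xy, heading_rad):
    """Classify the displacement between two poses relative to the agent's heading."""
    move = np.asarray(current_xy, dtype=float) - np.asarray(last_xy, dtype=float)
    if np.linalg.norm(move) < 1e-6:
        return "none"
    heading_vec = np.array([np.cos(heading_rad), np.sin(heading_rad)])
    cos_angle = np.dot(heading_vec, move) / np.linalg.norm(move)
    cross = heading_vec[0] * move[1] - heading_vec[1] * move[0]
    angle = np.degrees(np.arccos(np.clip(cos_angle, -1.0, 1.0)))
    if angle >= 120:
        return "backward"
    if angle <= 30:
        return "forward"
    return "left" if cross > 0 else "right"

print(classify_movement((0, 0), (1, 1), heading_rad=0.0))  # -> "left" (45 deg to the left)
```
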
/vlnce_baselines/common/env_utils.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import random
3 | from typing import List, Optional, Type, Union
4 |
5 | import habitat
6 | from habitat import logger
7 | from habitat_baselines.utils.env_utils import make_env_fn
8 | from habitat import Config, Env, RLEnv, VectorEnv, make_dataset
9 |
10 | random.seed(0)
11 |
12 |
13 | def construct_envs(
14 | config: Config,
15 | env_class: Type[Union[Env, RLEnv]],
16 | workers_ignore_signals: bool = False,
17 | auto_reset_done: bool = True,
18 | episodes_allowed: Optional[List[str]] = None,
19 | ) -> VectorEnv:
20 | r"""Create VectorEnv object with specified config and env class type.
21 |     To allow better performance, the dataset is split into smaller ones,
22 |     one for each individual env, grouped by scenes.
23 |     :param config: configs that contain num_environments as well as information
24 |         necessary to create individual environments.
25 | :param env_class: class type of the envs to be created.
26 | :param workers_ignore_signals: Passed to :ref:`habitat.VectorEnv`'s constructor
27 | :param auto_reset_done: Whether or not to automatically reset the env on done
28 | :return: VectorEnv object created according to specification.
29 | """
30 |
31 | num_envs_per_gpu = config.NUM_ENVIRONMENTS
32 | if isinstance(config.SIMULATOR_GPU_IDS, list):
33 | gpus = config.SIMULATOR_GPU_IDS
34 | else:
35 | gpus = [config.SIMULATOR_GPU_IDS]
36 | num_gpus = len(gpus)
37 | num_envs = num_gpus * num_envs_per_gpu
38 |
39 | if episodes_allowed is not None:
40 | config.defrost()
41 | config.TASK_CONFIG.DATASET.EPISODES_ALLOWED = episodes_allowed
42 | config.freeze()
43 |
44 | configs = []
45 | env_classes = [env_class for _ in range(num_envs)]
46 | dataset = make_dataset(config.TASK_CONFIG.DATASET.TYPE)
47 | scenes = config.TASK_CONFIG.DATASET.CONTENT_SCENES
48 | if "*" in config.TASK_CONFIG.DATASET.CONTENT_SCENES:
49 | scenes = dataset.get_scenes_to_load(config.TASK_CONFIG.DATASET)
50 |     logger.info(f"SPLIT: {config.TASK_CONFIG.DATASET.SPLIT}, NUMBER OF SCENES: {len(scenes)}")
51 |
52 | if num_envs > 1:
53 | if len(scenes) == 0:
54 | raise RuntimeError(
55 | "No scenes to load, multi-process logic relies on being able"
56 | " to split scenes uniquely between processes"
57 | )
58 |
59 | if len(scenes) < num_envs and len(scenes) != 1:
60 | raise RuntimeError(
61 | "reduce the number of GPUs or envs as there"
62 | " aren't enough number of scenes"
63 | )
64 |
65 | random.shuffle(scenes)
66 |
67 | if len(scenes) == 1:
68 | scene_splits = [[scenes[0]] for _ in range(num_envs)]
69 | else:
70 | scene_splits = [[] for _ in range(num_envs)]
71 | for idx, scene in enumerate(scenes):
72 | scene_splits[idx % len(scene_splits)].append(scene)
73 |
74 | assert sum(map(len, scene_splits)) == len(scenes)
75 |
76 | for i in range(num_gpus):
77 | for j in range(num_envs_per_gpu):
78 | proc_config = config.clone()
79 | proc_config.defrost()
80 | proc_id = (i * num_envs_per_gpu) + j
81 |
82 | task_config = proc_config.TASK_CONFIG
83 | task_config.SEED += proc_id
84 | if len(scenes) > 0:
85 | task_config.DATASET.CONTENT_SCENES = scene_splits[proc_id]
86 |
87 | task_config.SIMULATOR.HABITAT_SIM_V0.GPU_DEVICE_ID = gpus[i]
88 |
89 | task_config.SIMULATOR.AGENT_0.SENSORS = config.SENSORS
90 |
91 | proc_config.freeze()
92 | configs.append(proc_config)
93 |
94 | is_debug = True if sys.gettrace() else False
95 | # env_entry = habitat.ThreadedVectorEnv if is_debug else habitat.VectorEnv
96 | env_entry = habitat.ThreadedVectorEnv
97 | envs = env_entry(
98 | make_env_fn=make_env_fn,
99 | env_fn_args=tuple(zip(configs, env_classes)),
100 | auto_reset_done=auto_reset_done,
101 | workers_ignore_signals=workers_ignore_signals,
102 | )
103 | return envs
104 |
--------------------------------------------------------------------------------
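
The scene-splitting logic above distributes scenes round-robin so every env process works on a disjoint subset. A minimal standalone sketch of that split (the scene names are hypothetical):

```python
import random

scenes = [f"scene_{i}" for i in range(10)]  # hypothetical scene list
num_envs = 4
random.shuffle(scenes)

scene_splits = [[] for _ in range(num_envs)]
for idx, scene in enumerate(scenes):
    scene_splits[idx % len(scene_splits)].append(scene)

# every scene is assigned exactly once, as asserted in construct_envs
assert sum(map(len, scene_splits)) == len(scenes)
print([len(split) for split in scene_splits])  # e.g. [3, 3, 2, 2]
```
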
/vlnce_baselines/common/environments.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Dict, Union
2 |
3 | import habitat
4 | from habitat import Config, Dataset
5 | from habitat.core.embodied_task import Metrics
6 | from habitat.core.simulator import Observations
7 | from habitat_baselines.common.baseline_registry import baseline_registry
8 | from habitat_extensions.pose_utils import get_sim_location
9 |
10 |
11 | @baseline_registry.register_env(name="VLNCEZeroShotEnv")
12 | class VLNCEZeroShotEnv(habitat.RLEnv):
13 | def __init__(self, config: Config, dataset: Union[Dataset,None]=None) -> None:
14 | super().__init__(config.TASK_CONFIG, dataset)
15 | self.sensor_pose_sensor = self.habitat_env.task.sensor_suite.get('sensor_pose')
16 |
17 | def reset(self) -> Observations:
18 | self.sensor_pose_sensor.episode_start = False
19 | self.last_sim_location = get_sim_location(self.habitat_env.sim)
20 | self.sensor_pose = [0., 0., 0.] # initialize last sensor pose as [0,0,0]
21 | obs = super().reset()
22 |
23 | return obs
24 |
25 | def step(self, action: Union[int, str, Dict[str, Any]], **kwargs) -> Observations:
26 | obs, reward, done, info = super().step(action, **kwargs)
27 |
28 | return obs, reward, done, info
29 |
30 | def get_reward(self, observations: Observations) -> float:
31 | return 0.0
32 |
33 | def get_info(self, observations: Observations) -> Dict[Any, Any]:
34 | return self.habitat_env.get_metrics()
35 |
36 | def get_done(self, observations):
37 | return self._env.episode_over
38 |
39 | def get_reward_range(self):
40 | return (0.0, 0.0)
41 |
42 | def get_reward(self, observations: Observations) -> Any:
43 | return 0.0
44 |
45 | def _get_sensor_pose(self):
46 | pass
--------------------------------------------------------------------------------
/vlnce_baselines/common/instruction_tools.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import gzip
4 | import json
5 | import time
6 | import random
7 | from openai import OpenAI
8 | from concurrent.futures import ThreadPoolExecutor, as_completed
9 |
10 |
11 | R2R_VALUNSEEN_PATH = "data/datasets/R2R_VLNCE_v1-3_preprocessed/val_unseen/val_unseen.json.gz"
12 | DIR_NAME = "data/datasets/LLM_REPLYS_VAL_UNSEEN/"
13 | FILE_NAME = "llm_reply_valunseen"
14 | TEMP_SAVE_PATH = '/data/ckh/Zero-Shot-VLN-FusionMap-mp/tests/llm_reply_valunseen_temp.json'
15 | dones = 0
16 |
17 |
18 | prompt_template = f"""Parse a navigation instruction delimited by triple quotes and your task is to perform the following actions:
19 | 1. Extract Destination: Understand the entire instruction and summarize a description of the destination. The description should be a sentence containing landmark and roomtype. The description of the destination should not accurately describe the orientation and order. Here are examples about destination: "second room on the left" -> "room"(neglect order and direction); "between the bottom of the first stair and the console table in the entry way" -> "console table near entry way"(simplify description); "in front of the railing about halfway between the two upstairs rooms" -> "railing near two upstair rooms";
20 | 2. Split instructions: Split the instruction into a series of sub-instructions according to the execution steps. Each sub-instruction contains one landmark.
21 | 3. Infer agent's state constraints: Infer the state constraints that the agent should satisfy for each sub-instruction. There are three constraint types: location constraints, direction constraints, object constraints. You need to select an appropriate constraint type and give the corresponding constraint object. Direction constraint objects have two types: left, right. Constraints are formatted as a tuple: (constraint type, constraint object)
22 | 4. Make a decision: Analyze the landmarks, actions, and directions in each sub-instruction to determine how the agent should act. For a landmark, the agent has three options: approach, move away, or approach and then move away. For direction, the agent has three options: turn left, turn right, or go forward
23 | Provide your answer in JSON format with the following details:
24 | 1. use the following keys: destination, sub-instructions, state-constraints, decisions
25 | 2. the value of destination is a string
26 | 3. the value of sub-instructions is a list of all sub-instructions
27 | 4. the value of state-constraints is a JSON. The key is an index starting from zero and the value is a list of all constraints; each constraint is a tuple
28 | 5. the value of decisions is a nested JSON. The first-level JSON's key is an index starting from zero and its value is a second-level JSON with keys: landmarks, directions. The value of landmarks is a list of tuples; each tuple contains (landmark, action). The value of directions is a list of direction choices for each sub-instruction.
29 | An Example:
30 | User: "Walk into the living room and keep walking straight past the living room. Then walk into the entrance under the balcony. Wait in the entrance to the other room."
31 | You: {{"destination": "entrance to the other room under the balcony", "sub-instructions": ["Walk into the living room", "keep walking straight past the living room", "walk into the entrance under the balcony", "wait in the entrance to the other room"], "state-constraints": {{"0": [["location constraint", "living room"]], "1": [["location constraint", "living room"]], "2": [["location constraint", "balcony"], ["object constraint", "entrance"]], "3": [["location constraint", "other room"], ["object constraint", "entrance"]]}}, "decisions": {{"0": {{"landmarks": [["living room", "approach"]], "directions": ["forward"]}}, "1": {{"landmarks": [["living room", "move away"]], "directions": ["forward"]}}, "2": {{"landmarks": [["balcony", "approach"], ["entrance", "approach"]], "directions": ["forward"]}}, "3": {{"landmarks": [["other room", "approach"], ["entrance", "approach"]], "directions": ["forward"]}}}}}}
32 | ATTENTION:
33 | 1. constraint type: location constraint is for room type, object constraint is for object type, direction constraint is for directions. Don't confuse object constraint with location constraint!
34 | 2. landmark choice: approach, move away, approach then move away
35 | 3. direction choice: left, right, forward
36 | 4. The landmark and constraint object should not accurately describe the orientation and order. Here are examples about landmark: "second step from the top" -> "step"(neglect order and position relation); "room directly ahead" -> "room"; "right bedroom door" -> "bedroom door"
37 | """
38 |
39 |
40 | def get_reply(client, id, prompt, max_retry_times=5, retry_interval_initial=1):
41 | global dones
42 | retry_interval = retry_interval_initial
43 | reply = None
44 | for _ in range(max_retry_times):
45 | try:
46 | chat_completion = client.chat.completions.create(
47 | messages=[
48 | {
49 | "role": "user",
50 | "content": prompt,
51 | }
52 | ],
53 | # model="gpt-3.5-turbo",
54 | model="gpt-4",
55 | temperature=0
56 | )
57 | reply = eval(chat_completion.choices[0].message.content.strip())
58 | res = {str(id): reply}
59 | with open(TEMP_SAVE_PATH, 'a') as f:
60 | json.dump(res, f, indent=4, ensure_ascii=False)
61 |
62 | dones += 1
63 | print(id, dones)
64 |
65 | return id, reply
66 |         except Exception as e:
67 |             print(f"{id} Error: {e}, retrying...")
68 | time.sleep(max(retry_interval, 10))
69 | retry_interval *= 2
70 |
71 | dones += 1
72 | print(id, dones)
73 | return id, reply
74 |
75 |
76 | def check_exist_replys(path):
77 | if os.path.exists(path):
78 | with open(path, 'r') as f:
79 | existing_data = json.load(f)
80 | exist_keys = list(existing_data.keys())
81 | exist_keys = [int(k) for k in exist_keys]
82 |
83 | return exist_keys
84 | else:
85 | return []
86 |
87 |
88 | def generate_prompts(exist_replys, num=None):
89 | with gzip.open(R2R_VALUNSEEN_PATH, 'r') as f:
90 | eps_data = json.loads(f.read().decode('utf-8'))
91 | eps_data = eps_data["episodes"]
92 | eps_data = [item for item in eps_data if item["episode_id"] not in exist_replys]
93 | if len(eps_data) == 0:
94 | print("all episodes are generated")
95 | return {}
96 | random.shuffle(eps_data)
97 |     episodes = random.sample(eps_data, len(eps_data) if num is None else min(num, len(eps_data)))
98 | prompts = {}
99 | for episode in episodes:
100 | id = episode["episode_id"]
101 | instruction = episode["instruction"]["instruction_text"]
102 | prompts[id] = prompt_template + f"\"\"\"{instruction}\"\"\""
103 |
104 | return prompts
105 |
106 |
107 | def generate_specific_prompts(id: int):
108 | with gzip.open(R2R_VALUNSEEN_PATH, 'r') as f:
109 | eps_data = json.loads(f.read().decode('utf-8'))
110 | eps_data = eps_data["episodes"]
111 | prompts = {}
112 | for episode in eps_data:
113 | if episode["episode_id"] == id:
114 | instruction = episode["instruction"]["instruction_text"]
115 | prompts[id] = prompt_template + f"\"\"\"{instruction}\"\"\""
116 |
117 | return prompts
118 |
119 |
120 | def regenerate_exist_keys(exist_replys):
121 | with gzip.open(R2R_VALUNSEEN_PATH, 'r') as f:
122 | eps_data = json.loads(f.read().decode('utf-8'))
123 | eps_data = eps_data["episodes"]
124 | eps_data = [item for item in eps_data if item["episode_id"] in exist_replys]
125 | prompts = {}
126 | for episode in eps_data:
127 | id = episode["episode_id"]
128 | instruction = episode["instruction"]["instruction_text"]
129 | prompts[id] = prompt_template + f"\"\"\"{instruction}\"\"\""
130 |
131 | return prompts
132 |
133 |
134 | def natural_sort_key(s):
135 | sub_strings = re.split(r'(\d+)', s)
136 | sub_strings = [int(c) if c.isdigit() else c for c in sub_strings]
137 |
138 | return sub_strings
139 |
140 |
141 | def main():
142 | client = OpenAI(
143 | api_key="dc4e8ca91bb9509727662d60ff4ad16b",
144 | base_url="https://flag.smarttrot.com/v1/"
145 | )
146 |
147 | if os.path.exists(DIR_NAME):
148 | all_exist_files = sorted(os.listdir(DIR_NAME), key=natural_sort_key, reverse=True)
149 | if len(all_exist_files) > 0:
150 | current_file = all_exist_files[0]
151 | file_path = os.path.join(DIR_NAME, current_file)
152 | else:
153 | file_path = ''
154 | else:
155 | os.makedirs(DIR_NAME, exist_ok=True)
156 | file_path = ''
157 |
158 | exist_replys = check_exist_replys(path=file_path)
159 | prompts = generate_prompts(exist_replys, num=1)
160 | if len(prompts) == 0:
161 | return
162 | # prompts = generate_specific_prompts(id=164)
163 | # prompts = regenerate_exist_keys(exist_replys)
164 |
165 |
166 | with ThreadPoolExecutor(max_workers=1) as executor:
167 | results = [executor.submit(get_reply, client, id, prompt) for id, prompt in prompts.items()]
168 | query2res = {job.result()[0]: job.result()[1] for job in as_completed(results)}
169 |
170 | # sorted_data = {k: query2res[k] for k in sorted(query2res, key=int)}
171 | # length = len(sorted_data)
172 | # new_filename = FILE_NAME + str(length) + "_version2.json"
173 | # with open(os.path.join(DIR_NAME, new_filename), 'w') as f:
174 | # json.dump(sorted_data, f, indent=4, ensure_ascii=False)
175 |
176 | if os.path.exists(file_path):
177 | with open(file_path, 'r') as f:
178 | existing_data = json.load(f)
179 | for key, value in query2res.items():
180 | if str(key) not in existing_data:
181 | existing_data[str(key)] = value
182 | sorted_data = {k: existing_data[k] for k in sorted(existing_data, key=int)}
183 |
184 | # avoid overwrite
185 | length = len(sorted_data)
186 | new_filename = FILE_NAME + str(length) + ".json"
187 | with open(os.path.join(DIR_NAME, new_filename), 'w') as f:
188 | json.dump(sorted_data, f, indent=4, ensure_ascii=False)
189 | else:
190 | sorted_data = {k: query2res[k] for k in sorted(query2res, key=int)}
191 | length = len(sorted_data)
192 | new_filename = FILE_NAME + str(length) + ".json"
193 | with open(os.path.join(DIR_NAME, new_filename), 'w') as f:
194 | json.dump(sorted_data, f, indent=4, ensure_ascii=False)
195 |
196 |
197 | if __name__ == "__main__":
198 | main()
--------------------------------------------------------------------------------
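
The prompt above asks GPT-4 for a JSON object with four keys; the per-episode replies are merged into llm_reply_valunseen*.json, keyed by episode id, and consumed at evaluation time through TASK_CONFIG.DATASET.LLM_REPLYS_PATH. A minimal sketch of the expected reply shape, abridged from the in-prompt example (the episode id "42" is hypothetical):

```python
import json

reply = {
    "destination": "entrance to the other room under the balcony",
    "sub-instructions": ["Walk into the living room",
                         "wait in the entrance to the other room"],
    "state-constraints": {"0": [["location constraint", "living room"]],
                          "1": [["location constraint", "other room"],
                                ["object constraint", "entrance"]]},
    "decisions": {"0": {"landmarks": [["living room", "approach"]], "directions": ["forward"]},
                  "1": {"landmarks": [["entrance", "approach"]], "directions": ["forward"]}},
}

# one entry per sub-instruction in both the constraints and the decisions
assert set(reply) == {"destination", "sub-instructions", "state-constraints", "decisions"}
assert len(reply["sub-instructions"]) == len(reply["state-constraints"]) == len(reply["decisions"])

print(json.dumps({"42": reply}, indent=4)[:80])  # same layout as the saved reply files
```
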
/vlnce_baselines/common/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import random
3 | import numpy as np
4 | import torch
5 | import torch.distributed as distr
6 |
7 | def gather_list_and_concat(list_of_nums,world_size):
8 | if not torch.is_tensor(list_of_nums):
9 | tensor = torch.Tensor(list_of_nums).cuda()
10 | else:
11 | if list_of_nums.is_cuda == False:
12 | tensor = list_of_nums.cuda()
13 | else:
14 | tensor = list_of_nums
15 | gather_t = [torch.ones_like(tensor) for _ in
16 | range(world_size)]
17 | distr.all_gather(gather_t, tensor)
18 |
19 | return gather_t
20 |
21 | def seed_everything(seed: int):
22 | # print(f"setting seed: {seed}")
23 | os.environ["PL_GLOBAL_SEED"] = str(seed)
24 | random.seed(seed)
25 | np.random.seed(seed)
26 | torch.manual_seed(seed)
27 | torch.cuda.manual_seed_all(seed)
28 |
29 | def get_device(device_number):
30 | device = torch.device("cpu")
31 | if device_number >= 0 and torch.cuda.is_available():
32 | device = torch.device("cuda:{0}".format(device_number))
33 |
34 | return device
--------------------------------------------------------------------------------
/vlnce_baselines/config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Chenkehan21/CA-Nav-code/23481246e9e54e9ea890cf81986397709cee8ac8/vlnce_baselines/config/__init__.py
--------------------------------------------------------------------------------
/vlnce_baselines/config/default.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional, Union
2 |
3 | import habitat_baselines.config.default
4 | from habitat.config.default import Config as CN
5 | from habitat.config.default import CONFIG_FILE_SEPARATOR
6 | from habitat_extensions.config.default import get_extended_config as get_task_config
7 |
8 |
9 | # -----------------------------------------------------------------------------
10 | # EXPERIMENT CONFIG
11 | # -----------------------------------------------------------------------------
12 | _C = CN()
13 | _C.BASE_TASK_CONFIG_PATH = "habitat_extensions/config/zs_vlnce_task.yaml"
14 | _C.TASK_CONFIG = CN() # task_config will be stored as a config node
15 | _C.TRAINER_NAME = "ZS-Evaluator"
16 | _C.ENV_NAME = "VLNCEZeroShotEnv"
17 | _C.SIMULATOR_GPU_IDS = [0]
18 | _C.VIDEO_OPTION = [] # options: "disk", "tensorboard"
19 | _C.VIDEO_DIR = "videos/debug"
20 | _C.TENSORBOARD_DIR = "data/tensorboard_dirs/debug"
21 | _C.RESULTS_DIR = "data/checkpoints/pretrained/evals"
22 | _C.BLIP2_MODEL_DIR = "data/blip2/blip2_model.pt"
23 | _C.BLIP2_VIS_PROCESSORS_DIR = "data/blip2/blip2_vis_processors.pt"
24 | _C.BLIP2_TEXT_PROCESSORS_DIR = "data/blip2/blip2_text_processors.pt"
25 | _C.VQA_MODEL_DIR = "data/vqa/vqa_model.pt"
26 | _C.VQA_VIS_PROCESSORS_DIR = "data/vqa/vqa_vis_processors.pt"
27 | _C.VQA_TEXT_PROCESSORS_DIR = "data/vqa/vqa_text_processors.pt"
28 | _C.KEYBOARD_CONTROL = 0
29 |
30 | # -----------------------------------------------------------------------------
31 | # MAP CONFIG
32 | # -----------------------------------------------------------------------------
33 | _C.MAP = CN()
34 | _C.MAP.GROUNDING_DINO_CONFIG_PATH = "data/grounded_sam/GroundingDINO_SwinT_OGC.py"
35 | _C.MAP.GROUNDING_DINO_CHECKPOINT_PATH = "data/grounded_sam/groundingdino_swint_ogc.pth"
36 | _C.MAP.SAM_CHECKPOINT_PATH = "data/grounded_sam/sam_vit_h_4b8939.pth"
37 | _C.MAP.RepViTSAM_CHECKPOINT_PATH = "data/grounded_sam/repvit_sam.pt"
38 | _C.MAP.SAM_ENCODER_VERSION = "vit_h"
39 | _C.MAP.BOX_THRESHOLD = 0.25
40 | _C.MAP.TEXT_THRESHOLD = 0.25
41 | _C.MAP.FRAME_WIDTH = 160
42 | _C.MAP.FRAME_HEIGHT = 120
43 | _C.MAP.MAP_RESOLUTION = 5
44 | _C.MAP.MAP_SIZE_CM = 2400
45 | _C.MAP.GLOBAL_DOWNSCALING = 2
46 | _C.MAP.VISION_RANGE = 100
47 | _C.MAP.DU_SCALE = 1
48 | _C.MAP.CAT_PRED_THRESHOLD = 5.0
49 | _C.MAP.EXP_PRED_THRESHOLD = 1.0
50 | _C.MAP.MAP_PRED_THRESHOLD = 1.0
51 | _C.MAP.MAX_SEM_CATEGORIES = 16
52 | _C.MAP.CENTER_RESET_STEPS = 25
53 | _C.MAP.MIN_Z = 2  # a larger min_z could lose some information on the floor; 2 cm is ok
54 | _C.MAP.VISUALIZE = False
55 | _C.MAP.PRINT_IMAGES = False
56 | _C.MAP.REPVITSAM = 0
57 |
58 |
59 | # -----------------------------------------------------------------------------
60 | # EVAL CONFIG
61 | # -----------------------------------------------------------------------------
62 | _C.EVAL = CN()
63 | _C.EVAL.SPLIT = "val_unseen" # The split to evaluate on
64 | _C.EVAL.USE_CKPT_CONFIG = True
65 | _C.EVAL.EPISODE_COUNT = 5000
66 | _C.EVAL.SAVE_RESULTS = True
67 |
68 |
69 | def purge_keys(config: CN, keys: List[str]) -> None:
70 | for k in keys:
71 | del config[k]
72 | config.register_deprecated_key(k)
73 |
74 |
75 | def get_config(
76 | config_paths: Optional[Union[List[str], str]] = None,
77 | opts: Optional[list] = None,
78 | ) -> CN:
79 | r"""Create a unified config with default values. Initialized from the
80 | habitat_baselines default config. Overwritten by values from
81 | `config_paths` and overwritten by options from `opts`.
82 | Args:
83 | config_paths: List of config paths or string that contains comma
84 | separated list of config paths.
85 |         opts: Config options (keys, values) in a list, e.g., passed from the
86 |             command line into the config. For example, `opts = ['FOO.BAR',
87 |             0.5]`. Argument can be used for parameter sweeping or quick tests.
88 | """
89 | config = CN()
90 | config.merge_from_other_cfg(habitat_baselines.config.default._C)
91 | purge_keys(config, ["SIMULATOR_GPU_ID", "TEST_EPISODE_COUNT"])
92 | config.merge_from_other_cfg(_C.clone())
93 |
94 | if config_paths:
95 | if isinstance(config_paths, str):
96 | if CONFIG_FILE_SEPARATOR in config_paths:
97 | config_paths = config_paths.split(CONFIG_FILE_SEPARATOR)
98 | else:
99 | config_paths = [config_paths]
100 |
101 | prev_task_config = ""
102 | for config_path in config_paths:
103 | config.merge_from_file(config_path)
104 | if config.BASE_TASK_CONFIG_PATH != prev_task_config:
105 | config.TASK_CONFIG = get_task_config(
106 | config.BASE_TASK_CONFIG_PATH
107 | )
108 | prev_task_config = config.BASE_TASK_CONFIG_PATH
109 |
110 | if opts:
111 | config.CMD_TRAILING_OPTS = opts
112 | config.merge_from_list(opts)
113 |
114 | config.freeze()
115 | return config
--------------------------------------------------------------------------------
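
get_config layers the habitat_baselines defaults, the experiment defaults above, the YAML file, and finally any trailing KEY VALUE pairs, which is exactly how main.sh overrides options. A minimal sketch, assuming the repository layout and default YAMLs:

```python
from vlnce_baselines.config.default import get_config

config = get_config("vlnce_baselines/config/exp1.yaml",
                    opts=["NUM_ENVIRONMENTS", "1", "MAP.VISUALIZE", "False"])
print(config.TRAINER_NAME, config.MAP.MAP_RESOLUTION, config.EVAL.SPLIT)
```
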
/vlnce_baselines/config/exp1.yaml:
--------------------------------------------------------------------------------
1 | BASE_TASK_CONFIG_PATH: habitat_extensions/config/zs_vlnce_task.yaml
2 | SIMULATOR_GPU_IDS: [0]
3 | TORCH_GPU_ID: 0
4 | TORCH_GPU_IDS: [0]
5 | GPU_NUMBERS: 1
6 | NUM_ENVIRONMENTS: 1
7 | TENSORBOARD_DIR: data/tensorboard_dirs/
8 | CHECKPOINT_FOLDER: data/checkpoints/
9 | EVAL_CKPT_PATH_DIR: data/checkpoints/
10 | RESULTS_DIR: data/logs/eval_results/
11 | VIDEO_DIR: data/logs/video/
12 | KEYBOARD_CONTROL: 0
13 |
14 | SENSORS: ['RGB_SENSOR', 'DEPTH_SENSOR']
15 |
16 | MAP:
17 | VISUALIZE: False
18 | PRINT_IMAGES: False
19 | REPVITSAM: 1
20 |
21 | EVAL:
22 | USE_CKPT_CONFIG: False
23 | SAVE_RESULTS: True
24 | SPLIT: val_unseen
25 | MIN_CONSTRAINT_STEPS: 10
26 | MAX_CONSTRAINT_STEPS: 25
27 | SCORE_THRESHOLD: 0.5
28 | VALUE_THRESHOLD: 0.30
29 | DECISION_THRESHOLD: 0.4
30 | FMM_WAYPOINT_THRESHOLD: 2.0
31 | FMM_GOAL_THRESHOLD: 1.0
32 | CHANGE_THRESHOLD: -0.03
--------------------------------------------------------------------------------
/vlnce_baselines/map/RepViTSAM/repvit.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 |
3 |
4 | __all__ = ['repvit_m1']
5 |
6 |
7 | def _make_divisible(v, divisor, min_value=None):
8 | """
9 | This function is taken from the original tf repo.
10 | It ensures that all layers have a channel number that is divisible by 8
11 | It can be seen here:
12 | https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
13 | :param v:
14 | :param divisor:
15 | :param min_value:
16 | :return:
17 | """
18 | if min_value is None:
19 | min_value = divisor
20 | new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
21 | # Make sure that round down does not go down by more than 10%.
22 | if new_v < 0.9 * v:
23 | new_v += divisor
24 | return new_v
25 |
26 | from timm.models.layers import SqueezeExcite
27 |
28 | import torch
29 |
30 | # From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa
31 | # Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa
32 | class LayerNorm2d(nn.Module):
33 | def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
34 | super().__init__()
35 | self.weight = nn.Parameter(torch.ones(num_channels))
36 | self.bias = nn.Parameter(torch.zeros(num_channels))
37 | self.eps = eps
38 |
39 | def forward(self, x: torch.Tensor) -> torch.Tensor:
40 | u = x.mean(1, keepdim=True)
41 | s = (x - u).pow(2).mean(1, keepdim=True)
42 | x = (x - u) / torch.sqrt(s + self.eps)
43 | x = self.weight[:, None, None] * x + self.bias[:, None, None]
44 | return x
45 |
46 | class Conv2d_BN(torch.nn.Sequential):
47 | def __init__(self, a, b, ks=1, stride=1, pad=0, dilation=1,
48 | groups=1, bn_weight_init=1, resolution=-10000):
49 | super().__init__()
50 | self.add_module('c', torch.nn.Conv2d(
51 | a, b, ks, stride, pad, dilation, groups, bias=False))
52 | self.add_module('bn', torch.nn.BatchNorm2d(b))
53 | torch.nn.init.constant_(self.bn.weight, bn_weight_init)
54 | torch.nn.init.constant_(self.bn.bias, 0)
55 |
56 | @torch.no_grad()
57 | def fuse(self):
58 | c, bn = self._modules.values()
59 | w = bn.weight / (bn.running_var + bn.eps)**0.5
60 | w = c.weight * w[:, None, None, None]
61 | b = bn.bias - bn.running_mean * bn.weight / \
62 | (bn.running_var + bn.eps)**0.5
63 | m = torch.nn.Conv2d(w.size(1) * self.c.groups, w.size(
64 | 0), w.shape[2:], stride=self.c.stride, padding=self.c.padding, dilation=self.c.dilation, groups=self.c.groups,
65 | device=c.weight.device)
66 | m.weight.data.copy_(w)
67 | m.bias.data.copy_(b)
68 | return m
69 |
70 | class Residual(torch.nn.Module):
71 | def __init__(self, m, drop=0.):
72 | super().__init__()
73 | self.m = m
74 | self.drop = drop
75 |
76 | def forward(self, x):
77 | if self.training and self.drop > 0:
78 | return x + self.m(x) * torch.rand(x.size(0), 1, 1, 1,
79 | device=x.device).ge_(self.drop).div(1 - self.drop).detach()
80 | else:
81 | return x + self.m(x)
82 |
83 | @torch.no_grad()
84 | def fuse(self):
85 | if isinstance(self.m, Conv2d_BN):
86 | m = self.m.fuse()
87 | assert(m.groups == m.in_channels)
88 | identity = torch.ones(m.weight.shape[0], m.weight.shape[1], 1, 1)
89 | identity = torch.nn.functional.pad(identity, [1,1,1,1])
90 | m.weight += identity.to(m.weight.device)
91 | return m
92 | elif isinstance(self.m, torch.nn.Conv2d):
93 | m = self.m
94 | assert(m.groups != m.in_channels)
95 | identity = torch.ones(m.weight.shape[0], m.weight.shape[1], 1, 1)
96 | identity = torch.nn.functional.pad(identity, [1,1,1,1])
97 | m.weight += identity.to(m.weight.device)
98 | return m
99 | else:
100 | return self
101 |
102 |
103 | class RepVGGDW(torch.nn.Module):
104 | def __init__(self, ed) -> None:
105 | super().__init__()
106 | self.conv = Conv2d_BN(ed, ed, 3, 1, 1, groups=ed)
107 | self.conv1 = torch.nn.Conv2d(ed, ed, 1, 1, 0, groups=ed)
108 | self.dim = ed
109 | self.bn = torch.nn.BatchNorm2d(ed)
110 |
111 | def forward(self, x):
112 | return self.bn((self.conv(x) + self.conv1(x)) + x)
113 |
114 | @torch.no_grad()
115 | def fuse(self):
116 | conv = self.conv.fuse()
117 | conv1 = self.conv1
118 |
119 | conv_w = conv.weight
120 | conv_b = conv.bias
121 | conv1_w = conv1.weight
122 | conv1_b = conv1.bias
123 |
124 | conv1_w = torch.nn.functional.pad(conv1_w, [1,1,1,1])
125 |
126 | identity = torch.nn.functional.pad(torch.ones(conv1_w.shape[0], conv1_w.shape[1], 1, 1, device=conv1_w.device), [1,1,1,1])
127 |
128 | final_conv_w = conv_w + conv1_w + identity
129 | final_conv_b = conv_b + conv1_b
130 |
131 | conv.weight.data.copy_(final_conv_w)
132 | conv.bias.data.copy_(final_conv_b)
133 |
134 | bn = self.bn
135 | w = bn.weight / (bn.running_var + bn.eps)**0.5
136 | w = conv.weight * w[:, None, None, None]
137 | b = bn.bias + (conv.bias - bn.running_mean) * bn.weight / \
138 | (bn.running_var + bn.eps)**0.5
139 | conv.weight.data.copy_(w)
140 | conv.bias.data.copy_(b)
141 | return conv
142 |
143 |
144 | class RepViTBlock(nn.Module):
145 | def __init__(self, inp, hidden_dim, oup, kernel_size, stride, use_se, use_hs):
146 | super(RepViTBlock, self).__init__()
147 | assert stride in [1, 2]
148 |
149 | self.identity = stride == 1 and inp == oup
150 | assert(hidden_dim == 2 * inp)
151 |
152 | if stride == 2:
153 | self.token_mixer = nn.Sequential(
154 | Conv2d_BN(inp, inp, kernel_size, stride if inp != 320 else 1, (kernel_size - 1) // 2, groups=inp),
155 | SqueezeExcite(inp, 0.25) if use_se else nn.Identity(),
156 | Conv2d_BN(inp, oup, ks=1, stride=1, pad=0)
157 | )
158 | self.channel_mixer = Residual(nn.Sequential(
159 | # pw
160 | Conv2d_BN(oup, 2 * oup, 1, 1, 0),
161 | nn.GELU() if use_hs else nn.GELU(),
162 | # pw-linear
163 | Conv2d_BN(2 * oup, oup, 1, 1, 0, bn_weight_init=0),
164 | ))
165 | else:
166 | # assert(self.identity)
167 | self.token_mixer = nn.Sequential(
168 | RepVGGDW(inp),
169 | SqueezeExcite(inp, 0.25) if use_se else nn.Identity(),
170 | )
171 | if self.identity:
172 | self.channel_mixer = Residual(nn.Sequential(
173 | # pw
174 | Conv2d_BN(inp, hidden_dim, 1, 1, 0),
175 | nn.GELU() if use_hs else nn.GELU(),
176 | # pw-linear
177 | Conv2d_BN(hidden_dim, oup, 1, 1, 0, bn_weight_init=0),
178 | ))
179 | else:
180 | self.channel_mixer = nn.Sequential(
181 | # pw
182 | Conv2d_BN(inp, hidden_dim, 1, 1, 0),
183 | nn.GELU() if use_hs else nn.GELU(),
184 | # pw-linear
185 | Conv2d_BN(hidden_dim, oup, 1, 1, 0, bn_weight_init=0),
186 | )
187 |
188 | def forward(self, x):
189 | return self.channel_mixer(self.token_mixer(x))
190 |
191 | from timm.models.vision_transformer import trunc_normal_
192 | class BN_Linear(torch.nn.Sequential):
193 | def __init__(self, a, b, bias=True, std=0.02):
194 | super().__init__()
195 | self.add_module('bn', torch.nn.BatchNorm1d(a))
196 | self.add_module('l', torch.nn.Linear(a, b, bias=bias))
197 | trunc_normal_(self.l.weight, std=std)
198 | if bias:
199 | torch.nn.init.constant_(self.l.bias, 0)
200 |
201 | @torch.no_grad()
202 | def fuse(self):
203 | bn, l = self._modules.values()
204 | w = bn.weight / (bn.running_var + bn.eps)**0.5
205 | b = bn.bias - self.bn.running_mean * \
206 | self.bn.weight / (bn.running_var + bn.eps)**0.5
207 | w = l.weight * w[None, :]
208 | if l.bias is None:
209 | b = b @ self.l.weight.T
210 | else:
211 | b = (l.weight @ b[:, None]).view(-1) + self.l.bias
212 | m = torch.nn.Linear(w.size(1), w.size(0), device=l.weight.device)
213 | m.weight.data.copy_(w)
214 | m.bias.data.copy_(b)
215 | return m
216 |
217 | class Classfier(nn.Module):
218 | def __init__(self, dim, num_classes, distillation=True):
219 | super().__init__()
220 | self.classifier = BN_Linear(dim, num_classes) if num_classes > 0 else torch.nn.Identity()
221 | self.distillation = distillation
222 | if distillation:
223 | self.classifier_dist = BN_Linear(dim, num_classes) if num_classes > 0 else torch.nn.Identity()
224 |
225 | def forward(self, x):
226 | if self.distillation:
227 | x = self.classifier(x), self.classifier_dist(x)
228 | if not self.training:
229 | x = (x[0] + x[1]) / 2
230 | else:
231 | x = self.classifier(x)
232 | return x
233 |
234 | @torch.no_grad()
235 | def fuse(self):
236 | classifier = self.classifier.fuse()
237 | if self.distillation:
238 | classifier_dist = self.classifier_dist.fuse()
239 | classifier.weight += classifier_dist.weight
240 | classifier.bias += classifier_dist.bias
241 | classifier.weight /= 2
242 | classifier.bias /= 2
243 | return classifier
244 | else:
245 | return classifier
246 |
247 | class RepViT(nn.Module):
248 | def __init__(self, cfgs, num_classes=1000, distillation=False, img_size=1024):
249 | super(RepViT, self).__init__()
250 | # setting of inverted residual blocks
251 | self.cfgs = cfgs
252 |
253 | self.img_size = img_size
254 |
255 | # building first layer
256 | input_channel = self.cfgs[0][2]
257 | patch_embed = torch.nn.Sequential(Conv2d_BN(3, input_channel // 2, 3, 2, 1), torch.nn.GELU(),
258 | Conv2d_BN(input_channel // 2, input_channel, 3, 2, 1))
259 | layers = [patch_embed]
260 | # building inverted residual blocks
261 | block = RepViTBlock
262 | for k, t, c, use_se, use_hs, s in self.cfgs:
263 | output_channel = _make_divisible(c, 8)
264 | exp_size = _make_divisible(input_channel * t, 8)
265 | layers.append(block(input_channel, exp_size, output_channel, k, s, use_se, use_hs))
266 | input_channel = output_channel
267 | self.features = nn.ModuleList(layers)
268 | # self.classifier = Classfier(output_channel, num_classes, distillation)
269 |
270 | self.neck = nn.Sequential(
271 | nn.Conv2d(
272 | output_channel,
273 | 256,
274 | kernel_size=1,
275 | bias=False,
276 | ),
277 | LayerNorm2d(256),
278 | nn.Conv2d(
279 | 256,
280 | 256,
281 | kernel_size=3,
282 | padding=1,
283 | bias=False,
284 | ),
285 | LayerNorm2d(256),
286 | )
287 |
288 | def forward(self, x):
289 | # x = self.features(x)
290 | for f in self.features:
291 | x = f(x)
292 | # x = torch.nn.functional.adaptive_avg_pool2d(x, 1).flatten(1)
293 | x = self.neck(x)
294 | return x
295 |
296 | from timm.models import register_model
297 |
298 | @register_model
299 | def repvit(pretrained=False, num_classes = 1000, distillation=False, **kwargs):
300 | """
301 |     Constructs the RepViT image encoder used by RepViT-SAM
302 | """
303 | cfgs = [
304 | # k, t, c, SE, HS, s
305 | [3, 2, 80, 1, 0, 1],
306 | [3, 2, 80, 0, 0, 1],
307 | [3, 2, 80, 1, 0, 1],
308 | [3, 2, 80, 0, 0, 1],
309 | [3, 2, 80, 1, 0, 1],
310 | [3, 2, 80, 0, 0, 1],
311 | [3, 2, 80, 0, 0, 1],
312 | [3, 2, 160, 0, 0, 2],
313 | [3, 2, 160, 1, 0, 1],
314 | [3, 2, 160, 0, 0, 1],
315 | [3, 2, 160, 1, 0, 1],
316 | [3, 2, 160, 0, 0, 1],
317 | [3, 2, 160, 1, 0, 1],
318 | [3, 2, 160, 0, 0, 1],
319 | [3, 2, 160, 0, 0, 1],
320 | [3, 2, 320, 0, 1, 2],
321 | [3, 2, 320, 1, 1, 1],
322 | [3, 2, 320, 0, 1, 1],
323 | [3, 2, 320, 1, 1, 1],
324 | [3, 2, 320, 0, 1, 1],
325 | [3, 2, 320, 1, 1, 1],
326 | [3, 2, 320, 0, 1, 1],
327 | [3, 2, 320, 1, 1, 1],
328 | [3, 2, 320, 0, 1, 1],
329 | [3, 2, 320, 1, 1, 1],
330 | [3, 2, 320, 0, 1, 1],
331 | [3, 2, 320, 1, 1, 1],
332 | [3, 2, 320, 0, 1, 1],
333 | [3, 2, 320, 1, 1, 1],
334 | [3, 2, 320, 0, 1, 1],
335 | [3, 2, 320, 1, 1, 1],
336 | [3, 2, 320, 0, 1, 1],
337 | [3, 2, 320, 1, 1, 1],
338 | [3, 2, 320, 0, 1, 1],
339 | [3, 2, 320, 1, 1, 1],
340 | [3, 2, 320, 0, 1, 1],
341 | [3, 2, 320, 1, 1, 1],
342 | [3, 2, 320, 0, 1, 1],
343 | [3, 2, 320, 1, 1, 1],
344 | [3, 2, 320, 0, 1, 1],
345 | [3, 2, 320, 1, 1, 1],
346 | [3, 2, 320, 0, 1, 1],
347 | [3, 2, 320, 1, 1, 1],
348 | [3, 2, 320, 0, 1, 1],
349 | [3, 2, 320, 1, 1, 1],
350 | [3, 2, 320, 0, 1, 1],
351 | [3, 2, 320, 1, 1, 1],
352 | [3, 2, 320, 0, 1, 1],
353 | [3, 2, 320, 1, 1, 1],
354 | [3, 2, 320, 0, 1, 1],
355 | # [3, 2, 320, 1, 1, 1],
356 | # [3, 2, 320, 0, 1, 1],
357 | [3, 2, 320, 0, 1, 1],
358 | [3, 2, 640, 0, 1, 2],
359 | [3, 2, 640, 1, 1, 1],
360 | [3, 2, 640, 0, 1, 1],
361 | # [3, 2, 640, 1, 1, 1],
362 | # [3, 2, 640, 0, 1, 1]
363 | ]
364 | return RepViT(cfgs, num_classes=num_classes, distillation=distillation)
--------------------------------------------------------------------------------
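
The registered `repvit` model is the SAM-style image encoder: the patch embed and the stride-2 stages reduce a 1024x1024 input to a 64x64 feature grid, and the neck projects the final 640 channels to 256. A minimal forward-pass sketch (CPU, random weights):

```python
import torch
from timm.models import create_model
from vlnce_baselines.map.RepViTSAM import repvit  # noqa: F401 -- registers the 'repvit' model with timm

model = create_model("repvit").eval()
with torch.no_grad():
    feats = model(torch.zeros(1, 3, 1024, 1024))
print(feats.shape)  # expected: torch.Size([1, 256, 64, 64])
```
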
/vlnce_baselines/map/RepViTSAM/setup_repvit_sam.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 |
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | import torch
8 | from functools import partial
9 | from segment_anything.modeling import ImageEncoderViT, MaskDecoder, PromptEncoder, Sam, TwoWayTransformer
10 | from timm.models import create_model
11 |
12 | from vlnce_baselines.map.RepViTSAM import repvit
13 |
14 |
15 | def build_sam_repvit(checkpoint=None):
16 | prompt_embed_dim = 256
17 | image_size = 1024
18 | vit_patch_size = 16
19 | image_embedding_size = image_size // vit_patch_size
20 | repvit_sam = Sam(
21 | image_encoder=create_model('repvit'),
22 | prompt_encoder=PromptEncoder(
23 | embed_dim=prompt_embed_dim,
24 | image_embedding_size=(image_embedding_size, image_embedding_size),
25 | input_image_size=(image_size, image_size),
26 | mask_in_chans=16,
27 | ),
28 | mask_decoder=MaskDecoder(
29 | num_multimask_outputs=3,
30 | transformer=TwoWayTransformer(
31 | depth=2,
32 | embedding_dim=prompt_embed_dim,
33 | mlp_dim=2048,
34 | num_heads=8,
35 | ),
36 | transformer_dim=prompt_embed_dim,
37 | iou_head_depth=3,
38 | iou_head_hidden_dim=256,
39 | ),
40 | pixel_mean=[123.675, 116.28, 103.53],
41 | pixel_std=[58.395, 57.12, 57.375],
42 | )
43 |
44 | repvit_sam.eval()
45 | if checkpoint is not None:
46 | with open(checkpoint, "rb") as f:
47 | state_dict = torch.load(f)
48 | repvit_sam.load_state_dict(state_dict)
49 | return repvit_sam
50 |
51 | from functools import partial
52 |
53 | sam_model_registry = {
54 | "repvit": partial(build_sam_repvit),
55 | }
--------------------------------------------------------------------------------
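
build_sam_repvit swaps SAM's ViT-H image encoder for the RepViT backbone above while keeping the prompt encoder and mask decoder unchanged, so the result plugs into the usual SamPredictor interface. A minimal loading sketch (the checkpoint path is the default MAP.RepViTSAM_CHECKPOINT_PATH):

```python
import torch
from segment_anything import SamPredictor
from vlnce_baselines.map.RepViTSAM.setup_repvit_sam import build_sam_repvit

sam = build_sam_repvit(checkpoint="data/grounded_sam/repvit_sam.pt")
sam.to("cuda" if torch.cuda.is_available() else "cpu")
predictor = SamPredictor(sam)  # same predictor interface GroundedSAM wraps below
```
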
/vlnce_baselines/map/direction_map.py:
--------------------------------------------------------------------------------
1 | import os
2 | import cv2
3 | import numpy as np
4 | import torch.nn as nn
5 | from typing import Union, Tuple, List
6 | from vlnce_baselines.utils.map_utils import *
7 | from vlnce_baselines.utils.constant import direction_mapping
8 |
9 | from habitat import Config
10 |
11 |
12 | class DirectionMap(nn.Module):
13 | def __init__(self, config: Config, full_map_shape: Union[Tuple, List, np.ndarray],
14 | theta: float=90.0, radius: float=5.0) -> None:
15 | super().__init__()
16 | self.config = config
17 | self.shape = full_map_shape
18 | self.visualize = config.MAP.VISUALIZE
19 | self.print_images = config.MAP.PRINT_IMAGES
20 | self.resolution = config.MAP.MAP_RESOLUTION
21 |
22 | self.theta = theta
23 | self.radius = radius
24 |
25 | self.direction_map = np.zeros(self.shape)
26 |
27 | def reset(self):
28 | self.direction_map = np.ones(self.shape)
29 |
30 | def _create_sector_mask(self, position: np.ndarray, heading: float, direction_weight: float=1.5):
31 | """
32 | arg "position" came from full pose, full pose use standard Cartesian coordinate.
33 | """
34 | mask = np.zeros(self.shape)
35 | heading = (360 - heading) % 360
36 | angle_high = (heading + self.theta / 2) % 360
37 | angle_low = (heading - self.theta / 2) % 360
38 | heading = np.ones(self.shape) * heading
39 |
40 | y, x = np.meshgrid(np.arange(self.shape[0]) - position[0], np.arange(self.shape[1]) - position[1])
41 | distance = np.sqrt(x**2 + y**2)
42 | angle = np.arctan2(x, y) * 180 / np.pi
43 | angle = (360 - angle) % 360
44 |
45 | valid_distance = distance <= self.radius * 100 / self.resolution
46 | if angle_high > angle_low:
47 | valid_angle = (angle_low <= angle) & (angle <= angle_high)
48 | else:
49 | valid_angle = (angle_low <= angle) | (angle <= angle_high)
50 | mask[valid_distance & valid_angle] = direction_weight
51 |
52 | return mask
53 |
54 | def forward(self, current_position: np.ndarray, last_five_step_position: np.ndarray, heading: float,
55 | direction: str, step: int, current_episode_id: int) -> np.ndarray:
56 | heading_vector = current_position - last_five_step_position
57 | if np.linalg.norm(heading_vector) <= 0.2 * 100 / self.resolution:
58 | heading_angle = heading
59 | else:
60 | heading_angle = angle_between_vectors(np.array([1, 0]), heading_vector)
61 | print("!!!!heading angle: ", heading_angle, direction, "left" in direction)
62 | direction = direction_mapping.get(direction, "ambiguous direction")
63 | if direction == "forward":
64 | sector_mask = self._create_sector_mask(current_position, heading_angle)
65 | elif direction == "left":
66 | heading_angle += 45
67 | sector_mask = self._create_sector_mask(current_position, heading_angle)
68 | elif direction == "right":
69 | heading_angle -= 45
70 | sector_mask = self._create_sector_mask(current_position, heading_angle)
71 | elif direction == "backward":
72 | heading_angle += 180
73 | sector_mask = self._create_sector_mask(current_position, heading_angle)
74 | else:
75 | sector_mask = np.ones(self.shape)
76 |
77 | if self.visualize:
78 | self._visualize(sector_mask, step, current_episode_id)
79 |
80 | return sector_mask
81 |
82 | def _visualize(self, direction_map: np.ndarray, step: int, current_episode_id: int):
83 | direction_map_vis = direction_map.copy()
84 | direction_map_vis[direction_map_vis == 0] = 1
85 |         direction_map_vis = np.flipud((direction_map_vis * 255).astype(np.uint8))
86 |         cv2.imshow("direction map", direction_map_vis)
87 | cv2.waitKey(1)
88 |
89 | if self.print_images:
90 | save_dir = os.path.join(self.config.RESULTS_DIR, "direction_map/eps_%d"%current_episode_id)
91 | os.makedirs(save_dir, exist_ok=True)
92 | fn = "{}/step-{}.png".format(save_dir, step)
93 | cv2.imwrite(fn, direction_map_vis)
--------------------------------------------------------------------------------
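
_create_sector_mask keeps the map cells that lie within `radius` meters of the agent and inside a `theta`-degree wedge around the (possibly left/right-shifted) heading. A simplified standalone sketch of that geometry (it drops the class's heading-flip convention and the direction weighting):

```python
import numpy as np

def sector_mask(shape, position, heading_deg, theta_deg=90.0, radius_cells=100):
    """Boolean wedge: cells within radius_cells of position and +/- theta/2 of heading."""
    rows, cols = np.meshgrid(np.arange(shape[0]) - position[0],
                             np.arange(shape[1]) - position[1], indexing="ij")
    dist = np.hypot(rows, cols)
    ang = np.degrees(np.arctan2(rows, cols)) % 360   # 0 deg points along +columns
    diff = np.abs((ang - heading_deg + 180) % 360 - 180)  # wrapped angular difference
    return (dist <= radius_cells) & (diff <= theta_deg / 2)

mask = sector_mask((480, 480), position=(240, 240), heading_deg=0.0)
print(mask.sum())  # roughly a quarter-disc of radius 100 cells (~7850 cells)
```
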
/vlnce_baselines/map/history_map.py:
--------------------------------------------------------------------------------
1 | import os
2 | import cv2
3 | import numpy as np
4 | import torch.nn as nn
5 | from typing import List
6 |
7 |
8 | class HistoryMap(nn.Module):
9 | def __init__(self, config, full_map_shape) -> None:
10 | super().__init__()
11 | self.config = config
12 | self.shape = full_map_shape
13 | self.visualize = config.MAP.VISUALIZE
14 | self.print_images = config.MAP.PRINT_IMAGES
15 | self.history_map = np.ones(self.shape)
16 |
17 | def reset(self):
18 | self.trajectory = np.zeros(self.shape)
19 | self.history_map = np.ones(self.shape)
20 |
21 | def _draw_polyline(self, trajectory_points: List, thickness: int=20):
22 | image = np.zeros(self.shape)
23 | points_array = np.array(trajectory_points, dtype=np.int32)
24 | color = (255, 255, 255)
25 | is_closed = False
26 |
27 |         # thickness = 20 => 1.0 meter spans 20 grid cells in the map
28 | trajectory = cv2.polylines(image, [points_array], is_closed, color, thickness)
29 |
30 | return trajectory.astype(bool)
31 |
32 | def forward(self, trajectory_points: List, step: int, current_episode_id: int, alpha: float=0.95):
33 | if len(trajectory_points) == 2 and trajectory_points[0] == trajectory_points[1]:
34 | return self.history_map
35 | trajectory = self._draw_polyline(trajectory_points)
36 | self.history_map[trajectory == True] *= alpha
37 |
38 | if self.visualize:
39 | self._visualize(self.history_map, step, current_episode_id)
40 |
41 | return self.history_map
42 |
43 | def _visualize(self, history_map, step, current_episode_id):
44 | history_map_vis = history_map.copy()
45 | history_map_vis[history_map_vis == 0] = 1
46 |         history_map_vis = np.flipud((history_map_vis * 255).astype(np.uint8))
47 | cv2.imshow("history map", history_map_vis)
48 | cv2.waitKey(1)
49 |
50 | if self.print_images:
51 | save_dir = os.path.join(self.config.RESULTS_DIR, "history_map/eps_%d"%current_episode_id)
52 | os.makedirs(save_dir, exist_ok=True)
53 | fn = "{}/step-{}.png".format(save_dir, step)
54 | cv2.imwrite(fn, history_map_vis)
--------------------------------------------------------------------------------
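
HistoryMap rasterizes the agent's trajectory with cv2.polylines and multiplies the visited cells by alpha each step, so frequently revisited cells decay toward zero and can later down-weight the value map. A minimal sketch of the decay rule on a hypothetical straight path (the real class draws a polyline instead of indexing a slice):

```python
import numpy as np

history = np.ones((480, 480))                 # fresh map: every cell scores 1.0
trajectory_mask = np.zeros((480, 480), dtype=bool)
trajectory_mask[240, 100:240] = True          # hypothetical straight path, already rasterized

for _ in range(3):                            # the agent walks the same cells three times
    history[trajectory_mask] *= 0.95          # default alpha in HistoryMap.forward

print(round(float(history[240, 150]), 4))     # 0.8574 -- revisited cells are penalized
```
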
/vlnce_baselines/map/semantic_prediction.py:
--------------------------------------------------------------------------------
1 | import attr
2 | import time
3 | from typing import Any, Union, List, Tuple
4 | from abc import ABCMeta, abstractmethod
5 |
6 | import cv2
7 | import torch
8 | import numpy as np
9 |
10 | from habitat import Config
11 |
12 | import supervision as sv
13 | from groundingdino.util.inference import Model
14 | from segment_anything import sam_model_registry, SamPredictor
15 |
16 | from vlnce_baselines.map.RepViTSAM.setup_repvit_sam import build_sam_repvit
17 | from vlnce_baselines.common.utils import get_device
18 |
19 |
20 | VisualObservation = Union[torch.Tensor, np.ndarray]
21 |
22 |
23 | @attr.s(auto_attribs=True)
24 | class Segment(metaclass=ABCMeta):
25 | config: Config
26 | device: torch.device
27 |
28 | def __attrs_post_init__(self):
29 | self._create_model(self.config, self.device)
30 |
31 | @abstractmethod
32 | def _create_model(self, config: Config, device: torch.device) -> None:
33 | pass
34 |
35 | @abstractmethod
36 | def segment(self, image: VisualObservation, **kwargs) -> Any:
37 | pass
38 |
39 |
40 | @attr.s(auto_attribs=True)
41 | class GroundedSAM(Segment):
42 | height: float = 480.
43 | width: float = 640.
44 |
45 | def _create_model(self, config: Config, device: torch.device) -> Any:
46 | GROUNDING_DINO_CONFIG_PATH = config.MAP.GROUNDING_DINO_CONFIG_PATH
47 | GROUNDING_DINO_CHECKPOINT_PATH = config.MAP.GROUNDING_DINO_CHECKPOINT_PATH
48 | SAM_CHECKPOINT_PATH = config.MAP.SAM_CHECKPOINT_PATH
49 | SAM_ENCODER_VERSION = config.MAP.SAM_ENCODER_VERSION
50 | RepViTSAM_CHECKPOINT_PATH = config.MAP.RepViTSAM_CHECKPOINT_PATH
51 | # device = torch.device("cuda", config.TORCH_GPU_ID if torch.cuda.is_available() else "cpu")
52 |
53 | self.grounding_dino_model = Model(
54 | model_config_path=GROUNDING_DINO_CONFIG_PATH,
55 | model_checkpoint_path=GROUNDING_DINO_CHECKPOINT_PATH,
56 | device=device
57 | )
58 | if config.MAP.REPVITSAM:
59 | sam = build_sam_repvit(checkpoint=RepViTSAM_CHECKPOINT_PATH)
60 | sam.to(device=device)
61 | else:
62 | sam = sam_model_registry[SAM_ENCODER_VERSION](checkpoint=SAM_CHECKPOINT_PATH).to(device=device)
63 | self.sam_predictor = SamPredictor(sam)
64 | self.box_threshold = config.MAP.BOX_THRESHOLD
65 | self.text_threshold = config.MAP.TEXT_THRESHOLD
66 | self.grounding_dino_model.model.eval()
67 |
68 | def _segment(self, sam_predictor: SamPredictor, image: np.ndarray, xyxy: np.ndarray) -> np.ndarray:
69 | sam_predictor.set_image(image)
70 | result_masks = []
71 | for box in xyxy:
72 | masks, scores, logits = sam_predictor.predict(
73 | box=box,
74 | multimask_output=True
75 | )
76 | index = np.argmax(scores)
77 | result_masks.append(masks[index])
78 | return np.array(result_masks)
79 |
80 | def _process_detections(self, detections: sv.Detections) -> sv.Detections:
81 | box_areas = detections.box_area
82 | i = len(detections) - 1
83 | while i >= 0:
84 | if box_areas[i] / (self.width * self.height) < 0.95:
85 | i -= 1
86 | continue
87 | else:
88 | detections.xyxy = np.delete(detections.xyxy, i, axis=0)
89 | if detections.mask is not None:
90 | detections.mask = np.delete(detections.mask, i, axis=0)
91 | if detections.confidence is not None:
92 | detections.confidence = np.delete(detections.confidence, i)
93 | if detections.class_id is not None:
94 | detections.class_id = np.delete(detections.class_id, i)
95 | if detections.tracker_id is not None:
96 | detections.tracker_id = np.delete(detections.tracker_id, i)
97 | i -= 1
98 |
99 | return detections
100 |
101 | @torch.no_grad()
102 |     def segment(self, image: VisualObservation, **kwargs) -> Tuple[np.ndarray, List[str], np.ndarray, sv.Detections]:
103 | classes = kwargs.get("classes", [])
104 | box_annotator = sv.BoxAnnotator()
105 | mask_annotator = sv.MaskAnnotator()
106 | labels = []
107 | # t1 = time.time()
108 | detections = self.grounding_dino_model.predict_with_classes(
109 | image=image,
110 | classes=classes,
111 | box_threshold=self.box_threshold,
112 | text_threshold=self.text_threshold
113 | )
114 | # t2 = time.time()
115 | detections = self._process_detections(detections)
116 | for _, _, confidence, class_id, _ in detections:
117 | if class_id is not None:
118 | labels.append(f"{classes[class_id]} {confidence:0.2f}")
119 | else:
120 |                 labels.append(f"unknown {confidence:0.2f}")
121 | # t3 = time.time()
122 | detections.mask = self._segment(
123 | sam_predictor=self.sam_predictor,
124 | image=cv2.cvtColor(image, cv2.COLOR_BGR2RGB),
125 | xyxy=detections.xyxy
126 | )
127 | # t4 = time.time()
128 | # print("grounding dino: ", t2 - t1)
129 | # print("process detections: ", t3 - t2)
130 | # print("sam: ", t4 - t3)
131 | # annotated_image.shape=(h,w,3)
132 | annotated_image = mask_annotator.annotate(scene=image.copy(), detections=detections)
133 | annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections, labels=labels)
134 |
135 |         # detections.mask.shape = [num_detected_classes, h, w]
136 | # attention: sometimes the model can't detect all classes, so num_detected_classes <= len(classes)
137 | return (detections.mask.astype(np.float32), labels, annotated_image, detections)
138 |
139 |
140 | class BatchWrapper:
141 | """
142 | Create a simple end-to-end predictor with the given config that runs on
143 | single device for a list of input images.
144 | """
145 | def __init__(self, model) -> None:
146 | self.model = model
147 |
148 | def __call__(self, images: List[VisualObservation]) -> List:
149 | return [self.model(image) for image in images]
--------------------------------------------------------------------------------
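
GroundedSAM chains GroundingDINO (open-vocabulary boxes for the queried classes) with SAM, or the lighter RepViT-SAM when MAP.REPVITSAM is set, to produce per-class masks. A minimal usage sketch, assuming the GroundingDINO/SAM checkpoints sit under data/ as in the default config; `example.jpg` and the class list are hypothetical:

```python
import cv2
from vlnce_baselines.common.utils import get_device
from vlnce_baselines.config.default import get_config
from vlnce_baselines.map.semantic_prediction import GroundedSAM

config = get_config("vlnce_baselines/config/exp1.yaml")
segmenter = GroundedSAM(config=config, device=get_device(0))

bgr = cv2.imread("example.jpg")  # segment() expects BGR, as returned by cv2.imread
masks, labels, annotated, detections = segmenter.segment(bgr, classes=["door", "sofa", "table"])
print(labels)  # e.g. ['door 0.41', ...]; detected classes may be a subset of the query list
```
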
/vlnce_baselines/map/value_map.py:
--------------------------------------------------------------------------------
1 | """
2 | The value map module calculates the cosine similarity
3 | between the current observation and the destination description.
4 | """
5 |
6 | import os
7 | import cv2
8 | import torch
9 | import torch.nn as nn
10 |
11 | import numpy as np
12 | from PIL import Image
13 | from habitat import Config
14 | from collections.abc import Sequence
15 | from typing import Union, Tuple, List
16 | from lavis.models import load_model_and_preprocess
17 | from skimage.morphology import remove_small_objects
18 |
19 | from vlnce_baselines.utils.map_utils import *
20 | from vlnce_baselines.utils.visualization import *
21 |
22 |
23 | class ValueMap(nn.Module):
24 | def __init__(self,
25 | config: Config,
26 | full_value_map_shape: Union[Tuple, List, np.ndarray],
27 | device: torch.device) -> None:
28 | super(ValueMap, self).__init__()
29 | self.config = config
30 | self.shape = full_value_map_shape
31 | self.visualize = config.MAP.VISUALIZE
32 | self.print_images = config.MAP.PRINT_IMAGES
33 |
34 | # two channels for value map:
35 | # channel 0 is confidence map;
36 | # channel 1 is blip value map;
37 | self.value_map = np.zeros((2, *self.shape))
38 | self.accumulated_mask = np.zeros(self.shape)
39 | self.resolution = config.MAP.MAP_RESOLUTION
40 | self.hfov = config.TASK_CONFIG.SIMULATOR.DEPTH_SENSOR.HFOV
41 | self.radius = config.TASK_CONFIG.SIMULATOR.DEPTH_SENSOR.MAX_DEPTH
42 | self.device = device
43 | # self.device = (torch.device("cuda", self.config.TORCH_GPU_ID) if
44 | # torch.cuda.is_available() else torch.device("cpu"))
45 | self.vis_image = np.ones((580, 480 * 3 + 20 * 4, 3)).astype(np.uint8) * 255
46 | self.previous_floor = np.zeros(self.shape)
47 | self._load_model_from_disk()
48 | self.model.eval()
49 |
50 | def _create_model(self):
51 | self.model, vis_processors, text_processors = \
52 | load_model_and_preprocess(
53 | "blip2_image_text_matching",
54 | "pretrain",
55 | device=self.device,
56 | is_eval=True)
57 | self.vis_processors = vis_processors["eval"]
58 | self.text_processors = text_processors["eval"]
59 |
60 | def _load_model_from_disk(self):
61 | self.model = torch.load(self.config.BLIP2_MODEL_DIR, map_location="cpu").to(self.device)
62 | self.vis_processors = torch.load(self.config.BLIP2_VIS_PROCESSORS_DIR)["eval"]
63 | self.text_processors = torch.load(self.config.BLIP2_TEXT_PROCESSORS_DIR)["eval"]
64 |
65 | def _calculate_confidence(self, theta: np.ndarray) -> np.ndarray:
66 | return (np.cos(0.5 * np.pi * theta / (self.hfov / 2)))**2
67 |
68 | def _angle_to_vector(self, angle: np.ndarray) -> np.ndarray:
69 | angle_rad = np.radians(angle)
70 | x = np.cos(angle_rad)
71 | y = np.sin(angle_rad)
72 |
73 | return np.array([x, y])
74 |
75 | def _angle_between_vectors(self, vector1: np.ndarray, vector2: np.ndarray) -> np.ndarray:
76 | # return [0, pi]
77 | dot_product = np.sum(vector1 * vector2, axis=0)
78 | vector1_length = np.linalg.norm(vector1, axis=0)
79 | vector2_length = np.linalg.norm(vector2, axis=0)
80 | angle = np.arccos(dot_product / (vector1_length * vector2_length))
81 |
82 | return np.degrees(angle)
83 |
84 | def _create_sector_mask(self, position: Sequence, heading: float):
85 | """
86 | The "position" argument comes from full_pose, which uses standard Cartesian coordinates.
87 | """
88 | mask = np.zeros(self.shape)
89 | confidence_mask = np.zeros(self.shape)
90 | heading = (360 - heading) % 360
91 | angle_high = (heading + self.hfov / 2) % 360
92 | angle_low = (heading - self.hfov / 2) % 360
93 | heading = np.ones(self.shape) * heading
94 | heading_vector = self._angle_to_vector(heading)
95 |
96 | y, x = np.meshgrid(np.arange(self.shape[0]) - position[0], np.arange(self.shape[1]) - position[1])
97 | # x = np.flipud(x)
98 | distance = np.sqrt(x**2 + y**2)
99 | angle = np.arctan2(x, y) * 180 / np.pi
100 | angle = (360 - angle) % 360
101 |
102 | angle_vector = self._angle_to_vector(angle) # (2, 480, 480)
103 | theta = self._angle_between_vectors(heading_vector, angle_vector)
104 |
105 | confidence = self._calculate_confidence(theta)
106 |
107 | valid_distance = distance <= self.radius * 100 / self.resolution
108 | if angle_high > angle_low:
109 | valid_angle = (angle_low <= angle) & (angle <= angle_high)
110 | else:
111 | valid_angle = (angle_low <= angle) | (angle <= angle_high)
112 | mask[valid_distance & valid_angle] = 1
113 | confidence_mask[valid_distance & valid_angle] = confidence[valid_distance & valid_angle]
114 |
115 | return mask, confidence_mask
116 |
117 | def _update_value_map(self,
118 | prev_value: np.ndarray,
119 | curr_value: np.ndarray,
120 | prev_confidence: np.ndarray,
121 | curr_confidence: np.ndarray,
122 | one_step_floor: np.ndarray,
123 | mask: np.ndarray) -> np.ndarray:
124 | new_map_mask = np.logical_and(curr_confidence < 0.35, curr_confidence < prev_confidence)
125 | curr_confidence[new_map_mask] = 0.0
126 | new_value = curr_confidence * curr_value * self.current_floor + prev_confidence * prev_value
127 | new_confidence = (self.current_floor * curr_confidence)**2 + prev_confidence**2
128 | partition = curr_confidence * self.current_floor + prev_confidence
129 | partition[partition == 0] = np.inf
130 | new_value /= partition
131 | new_confidence /= partition
132 | self.value_map[0][one_step_floor == 1] = new_confidence[one_step_floor == 1]
133 | self.value_map[1][one_step_floor == 1] = new_value[one_step_floor == 1]
134 | self.value_map *= self.current_floor
135 |
136 | def reset(self) -> None:
137 | self.value_map = np.zeros((2, *self.shape))
138 | self.vis_image = np.ones((580, 480 * 3 + 20 * 4, 3)).astype(np.uint8) * 255
139 |
140 | @torch.no_grad()
141 | def get_blip_value(self, image: Image.Image, caption: str) -> torch.Tensor:
142 | img = self.vis_processors(image).unsqueeze(0).to(self.device)
143 | txt = self.text_processors(caption)
144 | itc_score = self.model({"image": img, "text_input": txt}, match_head='itc')
145 |
146 | return itc_score
147 |
148 | def forward(self,
149 | step: int,
150 | full_map: np.ndarray,
151 | floor: np.ndarray,
152 | one_step_floor: np.ndarray,
153 | collision_map: np.ndarray,
154 | blip_value: np.ndarray,
155 | full_pose: Sequence,
156 | classes: List,
157 | current_episode_id: int):
158 | """Project the BLIP cosine-similarity value onto the explored floor.
159 |
160 | Args:
161 | floor (np.ndarray): traversible floor mask of the current map, shape [h, w]
162 | one_step_floor (np.ndarray): floor cells observed at the current step
163 | blip_value (np.ndarray): BLIP ITC score of the current observation (a 1x1 value broadcast over the visible sector)
164 | full_pose (Sequence): agent pose [x, y, heading] in world coordinates
165 | """
166 | self.current_floor = floor
167 | self.current_floor[collision_map == 1] = 0
168 | position = full_pose[:2] * (100 / self.resolution)
169 | heading = full_pose[-1]
170 | mask, confidence_mask = self._create_sector_mask(position, heading)
171 | current_confidence = confidence_mask
172 | previous_confidence = self.value_map[0]
173 | current_value = blip_value
174 | previous_value = self.value_map[1]
175 | self._update_value_map(previous_value, current_value, previous_confidence, current_confidence, one_step_floor, mask)
176 | if self.visualize:
177 | self._visualize(step, current_episode_id)
178 |
179 | return self.value_map[1]
180 |
181 | def _visualize(self, step: int, current_episode_id: int):
182 | confidence_mask_vis = cv2.convertScaleAbs(self.value_map[0] * 255)
183 | confidence_mask_vis = np.stack((confidence_mask_vis,) * 3, axis=-1)
184 | value_map_vis = self.value_map[1]
185 |
186 | min_val = np.min(value_map_vis)
187 | max_val = np.max(value_map_vis)
188 | normalized_values = (value_map_vis - min_val) / (max_val - min_val + 1e-8)
189 | normalized_values[value_map_vis == 0] = 1
190 | value_map_vis = cv2.applyColorMap((normalized_values* 255).astype(np.uint8), cv2.COLORMAP_HOT)
191 | floor_vis = cv2.convertScaleAbs(self.current_floor * 255)
192 | floor_vis = np.stack((floor_vis,) * 3, axis=-1)
193 | self.vis_image[80 : 560, 20 : 500] = np.flipud(floor_vis)
194 | self.vis_image[80: 560, 520 : 1000] = np.flipud(confidence_mask_vis)
195 | self.vis_image[80: 560, 1020: 1500] = np.flipud(value_map_vis)
196 |
197 | self.vis_image = add_text(self.vis_image, "Floor", (20, 50))
198 | self.vis_image = add_text(self.vis_image, "Confidence Mask", (520, 50))
199 | self.vis_image = add_text(self.vis_image, "Value Map", (1020, 50))
200 |
201 | cv2.imshow("info", self.vis_image)
202 | cv2.waitKey(1)
203 |
204 | if self.print_images:
205 | save_dir = os.path.join(self.config.RESULTS_DIR, "floor_confidence_value/eps_%d"%current_episode_id)
206 | os.makedirs(save_dir, exist_ok=True)
207 | fn = "{}/step-{}.png".format(save_dir, step)
208 | cv2.imwrite(fn, self.vis_image)
--------------------------------------------------------------------------------
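For reference, a minimal numeric sketch of the confidence-weighted fusion performed in `ValueMap._update_value_map` above, reduced to a single scalar cell; the floor mask and the low-confidence gating are omitted, and the HFOV value is illustrative rather than taken from the config.

```python
import numpy as np


def sector_confidence(theta_deg: float, hfov_deg: float = 79.0) -> float:
    # Confidence peaks at the optical axis (theta = 0) and falls to 0 at the FOV edge,
    # matching ValueMap._calculate_confidence.
    return float(np.cos(0.5 * np.pi * theta_deg / (hfov_deg / 2)) ** 2)


def fuse(prev_value, prev_conf, curr_value, curr_conf):
    # Per-cell arithmetic of _update_value_map for a cell on the floor (floor = 1).
    denom = curr_conf + prev_conf
    if denom == 0:
        return 0.0, 0.0
    new_value = (curr_conf * curr_value + prev_conf * prev_value) / denom
    new_conf = (curr_conf ** 2 + prev_conf ** 2) / denom
    return new_value, new_conf


# A cell observed twice: first near the FOV edge (low confidence), then head-on.
v, c = fuse(prev_value=0.2, prev_conf=sector_confidence(35.0),
            curr_value=0.8, curr_conf=sector_confidence(5.0))
print(round(v, 3), round(c, 3))  # the fused value is pulled strongly toward the head-on observation
```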
/vlnce_baselines/models/Policy.py:
--------------------------------------------------------------------------------
1 | """
2 | Design a policy to combine different maps then decide action
3 | """
4 | import os
5 | import cv2
6 | import torch
7 | import numpy as np
8 | import torch.nn as nn
9 | from typing import List
10 | import supervision as sv
11 | from collections.abc import Sequence
12 |
13 | from habitat import Config
14 |
15 | from vlnce_baselines.utils.map_utils import *
16 | from vlnce_baselines.utils.data_utils import OrderedSet
17 | from vlnce_baselines.models.fmm_planner import FMMPlanner
18 | from vlnce_baselines.models.frontier_policy import FrontierPolicy
19 | from vlnce_baselines.models.super_pixel_policy import SuperPixelPolicy
20 |
21 |
22 | class FusionMapPolicy(nn.Module):
23 | def __init__(self, config: Config, map_shape: float=480) -> None:
24 | super().__init__()
25 | self.config = config
26 | self.map_shape = map_shape
27 | self.visualize = config.MAP.VISUALIZE
28 | self.print_images = config.MAP.PRINT_IMAGES
29 | self.resolution = config.MAP.MAP_RESOLUTION
30 | self.turn_angle = config.TASK_CONFIG.SIMULATOR.TURN_ANGLE
31 | self.superpixel_policy = SuperPixelPolicy(config)
32 | self.max_destination_socre = -1e5
33 | self.fixed_destination = None
34 | self.fmm_dist = np.zeros((self.map_shape, self.map_shape))
35 | self.decision_threshold = config.EVAL.DECISION_THRESHOLD
36 | self.score_threshold = config.EVAL.SCORE_THRESHOLD
37 | self.value_threshold = config.EVAL.VALUE_THRESHOLD
38 |
39 | def reset(self) -> None:
40 | self.superpixel_policy.reset()
41 | self.fixed_destination = None
42 | self.max_destination_socre = -1e5
43 | self.max_destination_confidence = -1.
44 | self.vis_image = np.ones((self.map_shape, self.map_shape, 3)).astype(np.uint8) * 255
45 |
46 | def _get_action(self,
47 | full_pose: Sequence,
48 | waypoint: np.ndarray,
49 | map: np.ndarray,
50 | traversible: np.ndarray,
51 | collision_map: np.ndarray,
52 | step: int,
53 | current_episode_id: int,
54 | classes: List,
55 | search_destination: bool) -> int:
56 | """
57 | The coordinate conventions for the agent's pose in full_pose, its index in full_map,
58 | and its position in the visualization are easy to confuse, and the many np.flipud
59 | calls make it even more so.
60 |
61 | PAY ATTENTION:
62 |
63 | 1. full pose: [x, y, heading] -> standard cartesian coordinates
64 | agent's initial pose is [12, 12, 0]. (12, 12) is the center of the whole range
65 | heading = 0 in Cartesian coordinates points to the right (the +x direction).
66 |
67 | Now let's take an example: agent's full_pose=[7, 21, 0].
68 | ^ y
69 | |
70 | | * (7, 21) => (7*100/5, 21*100/5)=(140, 420)
71 | |
72 | |
73 | |
74 | -------------> x
75 |
76 |
77 | 2. what's the agent's position in full map?
78 | full_map.shape=[480, 480], agent's initial index is [240, 240]
79 | since full_map is a 2D ndarray, the x axis points downward and the y axis points rightward
80 | -------------> y
81 | |
82 | | * (60, 140)
83 | |
84 | |
85 | |
86 | V x
87 |
88 | when we want to convert agent's position from cartesian coordinate to ndarray coordinate
89 | x_ndarray = 480 - y_cartesian
90 | y_ndarray = x_cartesian
91 | so the index in full_map is [60, 140]
92 |
93 | NOTICE: the agent does not move when its coordinates are converted from Cartesian
94 | to ndarray, so you must not simply rotate the coordinate frame by 90 degrees
95 |
96 | 3. Is that all?
97 | No! Be extremely careful: (60, 140) is the position we want to see in the visualization,
98 | but the map is flipped upside-down before being shown, so (60, 140) is the index after
99 | the flip. So, what is the index before the flip?
100 | x_ndarray_raw = 480 - x_ndarray = y_cartesian
101 | y_ndarray_raw = y_ndarray = x_cartesian
102 | so the index in full_map before flip should be (420, 140)
103 |
104 | By now, we have converted full_pose from Cartesian coordinates to ndarray coordinates.
105 | The function "angle_and_direction" computes whether the agent should turn
106 | left or right to face the goal; it takes all of its inputs in ndarray coordinates.
107 |
108 | We designed it this way because the ndarray coordinate frame is the most commonly used one.
109 |
110 | """
111 | x, y, heading = full_pose
112 | x, y = x * (100 / self.resolution), y * (100 / self.resolution)
113 | position = np.array([y, x])
114 | heading = -1 * full_pose[-1]
115 | rotation_matrix = np.array([[0, -1],
116 | [1, 0]])
117 | traversible[collision_map == 1] = 0
118 | planner = FMMPlanner(self.config, traversible, visualize=self.visualize)
119 | if traversible[waypoint[0], waypoint[1]] == 0:
120 | goal = get_nearest_nonzero_waypoint(traversible, waypoint)
121 | else:
122 | goal = waypoint
123 | planner.set_goal(goal)
124 | self.fmm_dist = planner.fmm_dist
125 | stg_x, stg_y, stop = planner.get_short_term_goal(position, self.fixed_destination)
126 | sub_waypoint = (stg_x, stg_y)
127 | heading_vector = angle_to_vector(heading)
128 | heading_vector = np.dot(rotation_matrix, heading_vector)
129 | waypoint_vector = sub_waypoint - position
130 |
131 | if stop and self.fixed_destination is not None:
132 | action = 0
133 | print("stop")
134 | elif stop and self.fixed_destination is None:
135 | action = 2
136 | else:
137 | relative_angle, action = angle_and_direction(heading_vector, waypoint_vector, self.turn_angle)
138 |
139 | if self.visualize:
140 | normalized_data = ((planner.fmm_dist - np.min(planner.fmm_dist)) /
141 | (np.max(planner.fmm_dist) - np.min(planner.fmm_dist)) * 255).astype(np.uint8)
142 | normalized_data = np.stack((normalized_data,) * 3, axis=-1)
143 | normalized_data = cv2.circle(normalized_data, (int(x), int(y)), radius=5, color=(255,0,0), thickness=1)
144 | normalized_data = cv2.circle(normalized_data, (waypoint[1], waypoint[0]),
145 | radius=5, color=(0,0,255), thickness=1)
146 | cv2.imshow("fmm distance field", np.flipud(normalized_data))
147 |
148 | cv2.waitKey(1)
149 | if self.print_images:
150 | save_dir = os.path.join(self.config.RESULTS_DIR, "fmm_fields/eps_%d"%current_episode_id)
151 | os.makedirs(save_dir, exist_ok=True)
152 | fn = "{}/step-{}.png".format(save_dir, step)
153 | cv2.imwrite(fn, np.flipud(normalized_data))
154 |
155 | return action
156 |
157 | def _search_destination(self,
158 | destinations: List[str],
159 | classes: List,
160 | current_value: float,
161 | max_value: float,
162 | detected_classes: OrderedSet,
163 | one_step_full_map: np.ndarray,
164 | value_map: np.ndarray,
165 | floor: np.ndarray,
166 | traversible: np.ndarray,
167 | current_detection: sv.Detections, step: int):
168 | check = [item in detected_classes for item in destinations]
169 | if sum(check) == 0:
170 | """
171 | the destination has not been detected yet
172 | """
173 | return None, -1e5
174 |
175 | candidates = []
176 | for i, destination in enumerate(destinations):
177 | if not check[i]:
178 | continue
179 | map_idx = detected_classes.index(destination)
180 | destination_map = one_step_full_map[4 + map_idx]
181 | class_idx = classes.index(destination)
182 | class_ids = current_detection.class_id
183 | confidences = current_detection.confidence
184 | # masks = current_detection.mask
185 |
186 | if class_idx not in class_ids:
187 | """
188 | The agent has seen the destination before but did not detect it in the current step
189 | """
190 | continue
191 |
192 | destination_ids = np.argwhere(class_ids == class_idx)
193 | destination_confidences = confidences[destination_ids]
194 | max_confidence_idx = np.argmax(destination_confidences)
195 | max_idx = destination_ids[max_confidence_idx].item()
196 | destination_confidence = confidences[max_idx]
197 | if destination_confidence > self.max_destination_confidence:
198 | self.max_destination_confidence = destination_confidence
199 |
200 | destination_waypoint = process_destination2(destination_map, floor, traversible)
201 | if destination_waypoint is not None:
202 | x, y = destination_waypoint
203 | destination_value = value_map[x, y]
204 | if destination_value == 0 or traversible[x, y] == 0:
205 | destination_waypoint = get_nearest_nonzero_waypoint(np.logical_and(value_map, traversible),
206 | destination_waypoint)
207 | x, y = destination_waypoint
208 | destination_value = value_map[x, y]
209 |
210 | confidence_part = destination_confidence / self.max_destination_confidence
211 | value_part = destination_value / max_value
212 | score = (confidence_part + value_part) / 2.0
213 | print("value part: ", value_part)
214 | print("confidence part: ", confidence_part)
215 | print("destination value: ", destination_value)
216 | print("destination waypoint: ", destination_waypoint)
217 |
218 | if current_value >= self.decision_threshold:
219 | candidates.append((destination_waypoint, score))
220 | elif (score >= self.score_threshold and
221 | destination_value >= self.value_threshold and
222 | destination_waypoint is not None):
223 | print("!!! APPEND !!!")
224 | candidates.append((destination_waypoint, score))
225 | else:
226 | candidates.append((None, -1e5))
227 | else:
228 | candidates.append((None, -1e5))
229 |
230 | if len(candidates) > 0:
231 | candidates = sorted(candidates, key=lambda x: x[1], reverse=True)
232 | waypoint, score = candidates[0]
233 | else:
234 | waypoint, score = None, -1e5
235 |
236 | return waypoint, score
237 |
238 | def forward(self,
239 | value_map: np.ndarray,
240 | collision_map: np.ndarray,
241 | full_map: np.ndarray,
242 | floor: np.ndarray,
243 | traversible: np.ndarray,
244 | full_pose: Sequence,
245 | frontiers: np.ndarray,
246 | detected_classes: OrderedSet,
247 | destination: List,
248 | classes: List,
249 | search_destination: bool,
250 | one_step_full_map: np.ndarray,
251 | current_detection: sv.Detections,
252 | current_episode_id: int,
253 | replan: bool,
254 | step: int):
255 |
256 | x, y, heading = full_pose
257 | x, y = x * (100 / self.resolution), y * (100 / self.resolution)
258 | position = np.array([y, x])
259 | best_waypoint, best_value, sorted_waypoints = self.superpixel_policy(full_map, traversible, value_map, collision_map,
260 | detected_classes, position, self.fmm_dist, replan,
261 | step, current_episode_id)
262 | print("current_position's value: ", value_map[min(int(y), self.map_shape - 1), min(int(x), self.map_shape - 1)])
263 | print("current pose: ", full_pose)
264 | current_value = value_map[min(int(y), self.map_shape - 1), min(int(x), self.map_shape - 1)]
265 | max_value = np.max(value_map)
266 | if search_destination:
267 | destination_waypoint, score = self._search_destination(destination, classes, current_value, max_value,
268 | detected_classes, one_step_full_map,
269 | value_map, floor, traversible, current_detection, step)
270 | if destination_waypoint is not None and score >= self.max_destination_socre:
271 | print("!!!!!!!find destination: ", destination_waypoint)
272 | self.fixed_destination = destination_waypoint
273 |
274 | if score >= self.max_destination_socre + 0.03:
275 | self.max_destination_socre = score
276 |
277 | if self.fixed_destination is not None:
278 | action = self._get_action(full_pose, self.fixed_destination, full_map,
279 | traversible, collision_map,
280 | step, current_episode_id, detected_classes,
281 | search_destination)
282 | else:
283 | action = self._get_action(full_pose, best_waypoint, full_map,
284 | traversible, collision_map,
285 | step, current_episode_id, detected_classes,
286 | search_destination)
287 | else:
288 | action = self._get_action(full_pose, best_waypoint, full_map,
289 | traversible, collision_map,
290 | step, current_episode_id, detected_classes,
291 | search_destination)
292 |
293 | if self.visualize:
294 | if self.fixed_destination is not None:
295 | best_waypoint = self.fixed_destination
296 | self._visualization(value_map, sorted_waypoints, best_waypoint, step, current_episode_id)
297 |
298 | return {"action": action}
299 |
300 | def _visualization(self,
301 | value_map: np.ndarray,
302 | waypoints: np.ndarray,
303 | best_waypoint: np.ndarray,
304 | step: int,
305 | current_episode_id: int):
306 |
307 | min_val = np.min(value_map)
308 | max_val = np.max(value_map)
309 | normalized_values = (value_map - min_val) / (max_val - min_val + 1e-8)
310 | normalized_values[value_map == 0] = 1
311 | map_vis = cv2.applyColorMap((normalized_values* 255).astype(np.uint8), cv2.COLORMAP_HOT)
312 |
313 | for i, waypoint in enumerate(waypoints):
314 | cx, cy = waypoint
315 | if i == 0:
316 | color = (0, 0, 255)
317 | else:
318 | color = (255, 0, 0)
319 | map_vis = cv2.circle(map_vis, (cy, cx), radius=3, color=color, thickness=1)
320 | map_vis = cv2.circle(map_vis, (best_waypoint[1], best_waypoint[0]), radius=5, color=(0,255,0), thickness=1)
321 | map_vis = np.flipud(map_vis)
322 | self.vis_image[:, :] = map_vis
323 | cv2.imshow("waypoints", self.vis_image)
324 | cv2.waitKey(1)
325 |
326 | if self.print_images:
327 | save_dir = os.path.join(self.config.RESULTS_DIR, "waypoints/eps_%d"%current_episode_id)
328 | os.makedirs(save_dir, exist_ok=True)
329 | fn = "{}/step-{}.png".format(save_dir, step)
330 | cv2.imwrite(fn, self.vis_image)
--------------------------------------------------------------------------------
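A worked example of the Cartesian-to-ndarray index conversion described in the `FusionMapPolicy._get_action` docstring above; the 480-cell map and 5 cm resolution are the values used in that docstring's example, not necessarily your config.

```python
# Worked example for the pose [7, 21, 0] from the docstring above.
map_shape, resolution = 480, 5                  # cells, cm per cell (illustrative)
x_m, y_m, heading = 7.0, 21.0, 0.0              # agent pose in metres (Cartesian)

x_cart = int(x_m * 100 / resolution)            # 140
y_cart = int(y_m * 100 / resolution)            # 420

# Index in the visualised (flipped-up-down) map:
row_vis, col_vis = map_shape - y_cart, x_cart   # (60, 140)
# Index in the raw full_map array, i.e. before np.flipud:
row_raw, col_raw = y_cart, x_cart               # (420, 140)

assert (row_vis, col_vis) == (60, 140)
assert (row_raw, col_raw) == (420, 140)
```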
/vlnce_baselines/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Chenkehan21/CA-Nav-code/23481246e9e54e9ea890cf81986397709cee8ac8/vlnce_baselines/models/__init__.py
--------------------------------------------------------------------------------
/vlnce_baselines/models/fmm_planner.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import copy
3 | import skfmm
4 | import numpy as np
5 | from numpy import ma
6 | from typing import Tuple
7 |
8 | from vlnce_baselines.utils.map_utils import get_mask, get_dist
9 |
10 |
11 | class FMMPlanner:
12 | def __init__(self, config, traversible: np.ndarray, scale: int=1, step_size: int=5, visualize: bool=False) -> None:
13 | self.scale = scale
14 | self.step_size = step_size
15 | self.visualize = visualize
16 | if scale != 1.:
17 | self.traversible = cv2.resize(traversible,
18 | (traversible.shape[1] // scale,
19 | traversible.shape[0] // scale),
20 | interpolation=cv2.INTER_NEAREST)
21 | self.traversible = np.rint(self.traversible)
22 | else:
23 | self.traversible = traversible
24 |
25 | self.du = int(self.step_size / (self.scale * 1.)) # du=5
26 | self.fmm_dist = None
27 | self.waypoint_threshold = config.EVAL.FMM_WAYPOINT_THRESHOLD
28 | self.goal_threshold = config.EVAL.FMM_GOAL_THRESHOLD
29 | self.resolution = config.MAP.MAP_RESOLUTION
30 |
31 | def set_goal(self, goal: np.ndarray) -> None:
32 | traversible_ma = ma.masked_values(self.traversible * 1, 0)
33 | goal_x, goal_y = goal
34 |
35 | traversible_ma[goal_x, goal_y] = 0
36 | dd = skfmm.distance(traversible_ma, dx=1)
37 | dd = ma.filled(dd, np.max(dd) + 1)
38 | self.fmm_dist = dd
39 |
40 | def get_short_term_goal(self, agent_position: np.ndarray, fixed_destination: np.ndarray, pad: int=5) -> Tuple:
41 | dist = np.pad(self.fmm_dist, pad, 'constant', constant_values=np.max(self.fmm_dist))
42 | x, y = int(agent_position[0]), int(agent_position[1])
43 | dx, dy = agent_position[0] - x, agent_position[1] - y
44 | mask = get_mask(dx, dy, scale=1, step_size=5)
45 | dist_mask = get_dist(dx, dy, scale=1, step_size=5)
46 | x += pad
47 | y += pad
48 | subset = dist[x - 5 : x + 6, y - 5: y + 6].copy()
49 | if subset.shape != mask.shape:
50 | print("subset and mask have different shape")
51 | print(f"subset shape:{subset.shape}, mask shape:{mask.shape}")
52 | print(f"current position: {agent_position}")
53 | return x, y, True
54 | subset *= mask
55 | subset += (1 - mask) * 1e5
56 | if subset[5, 5] < self.waypoint_threshold * 100 / self.resolution:
57 | stop = True
58 | if fixed_destination is not None and subset[5, 5] < self.goal_threshold * 100 / self.resolution:
59 | stop = True
60 | else:
61 | stop = False
62 | (stg_x, stg_y) = np.unravel_index(np.argmin(subset), subset.shape)
63 | offset_x, offset_y = stg_x - 5, stg_y - 5
64 | goal_x = x + offset_x - pad
65 | goal_y = y + offset_y - pad
66 |
67 | return goal_x, goal_y, stop
--------------------------------------------------------------------------------
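A self-contained sketch of the fast-marching step that `FMMPlanner.set_goal` performs with `skfmm`; the grid, wall, and goal below are made up for illustration.

```python
import numpy as np
import skfmm
from numpy import ma

traversible = np.ones((100, 100))
traversible[40:60, 0:80] = 0                 # a wall with a gap on the right side
goal = (80, 20)

phi = ma.masked_values(traversible * 1, 0)   # non-traversible cells become masked
phi[goal] = 0                                # distances are measured from the goal cell
dist = skfmm.distance(phi, dx=1)
dist = ma.filled(dist, np.max(dist) + 1)     # obstacles get the largest distance

# The short-term goal is then the reachable cell near the agent with the smallest distance.
print(dist[10, 20])  # the path goes around the wall, so this exceeds the straight-line distance of 70
```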
/vlnce_baselines/models/frontier_policy.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import numpy as np
3 | import torch.nn as nn
4 | from typing import List
5 | from collections.abc import Sequence
6 | from scipy.spatial.distance import cdist
7 | from vlnce_baselines.utils.map_utils import get_nearest_nonzero_waypoint
8 | from vlnce_baselines.models.vanilla_waypoint_selector import WaypointSelector
9 |
10 |
11 | class FrontierPolicy(nn.Module):
12 | def __init__(self, config) -> None:
13 | super().__init__()
14 | self.config = config
15 | self.waypoint_selector = WaypointSelector(config)
16 |
17 | def reset(self) -> None:
18 | self.waypoint_selector.reset()
19 |
20 | def _sort_waypoints_by_value(self, frontiers: np.ndarray, value_map: np.ndarray,
21 | floor: np.ndarray, traversible: np.ndarray, position: np.ndarray) -> List:
22 | nb_components, output, stats, centroids = cv2.connectedComponentsWithStats(frontiers)
23 | centroids = centroids[1:]
24 | tmp_waypoints = [[int(item[1]), int(item[0])] for item in centroids]
25 | waypoints = []
26 | for waypoint in tmp_waypoints:
27 | value = value_map[waypoint[0], waypoint[1]]
28 | if value == 0:
29 | target_area = np.logical_and(value_map.astype(bool), traversible)
30 | nearest_waypoint = get_nearest_nonzero_waypoint(target_area, waypoint)
31 | waypoints.append(nearest_waypoint)
32 | else:
33 | waypoints.append(waypoint)
34 |
35 | waypoints_value = [[waypoint, value_map[waypoint[0], waypoint[1]]] for waypoint in waypoints]
36 | waypoints_value = sorted(waypoints_value, key=lambda x: x[1], reverse=True)
37 | if len(waypoints_value) > 0:
38 | sorted_waypoints = np.concatenate([[np.array(item[0])] for item in waypoints_value], axis=0)
39 | sorted_values = [item[1] for item in waypoints_value]
40 | else:
41 | sorted_waypoints = np.expand_dims(position.astype(int), axis=0)
42 | sorted_values = [value_map[int(position[0]), int(position[1])]]
43 |
44 | return sorted_waypoints, sorted_values
45 |
46 | def forward(self, frontiers: np.ndarray, value_map: np.ndarray, collision_map: np.ndarray,
47 | floor: np.ndarray, traversible: np.ndarray, position: np.ndarray):
48 | sorted_waypoints, sorted_values = self._sort_waypoints_by_value(frontiers, value_map,
49 | floor, traversible, position)
50 | best_waypoint, best_value, sorted_waypoints = \
51 | self.waypoint_selector(sorted_waypoints, frontiers, position, collision_map, value_map)
52 |
53 | return best_waypoint, best_value, sorted_waypoints
--------------------------------------------------------------------------------
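A small sketch of the frontier-centroid extraction in `FrontierPolicy._sort_waypoints_by_value` above; the frontier blobs are made up for illustration.

```python
import cv2
import numpy as np

frontiers = np.zeros((480, 480), dtype=np.uint8)
frontiers[100:105, 200:260] = 1              # one frontier blob
frontiers[300:360, 50:55] = 1                # another one

num, labels, stats, centroids = cv2.connectedComponentsWithStats(frontiers)
centroids = centroids[1:]                    # index 0 is the background component
waypoints = [[int(c[1]), int(c[0])] for c in centroids]   # (row, col) order, as in the code above
print(waypoints)                             # roughly [[102, 229], [329, 52]]
```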
/vlnce_baselines/models/frontier_waypoint_selector.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch.nn as nn
3 | from typing import List
4 | from vlnce_baselines.utils.map_utils import *
5 | from vlnce_baselines.utils.acyclic_enforcer import AcyclicEnforcer
6 |
7 |
8 | class WaypointSelector(nn.Module):
9 | def __init__(self) -> None:
10 | super().__init__()
11 | self.reset()
12 |
13 | def reset(self) -> None:
14 | self._acyclic_enforcer = AcyclicEnforcer()
15 | self._last_value = float("-inf")
16 | self._last_waypoint = np.zeros(2)
17 |
18 | def forward(self, sorted_waypoints: np.ndarray, sorted_values: List, position: np.ndarray):
19 | best_waypoint_idx = None
20 | if not np.array_equal(self._last_waypoint, np.zeros(2)):
21 | curr_index = None
22 |
23 | for idx, waypoint in enumerate(sorted_waypoints):
24 | if np.array_equal(waypoint, self._last_waypoint):
25 | curr_index = idx
26 | break
27 |
28 | if curr_index is None:
29 | closest_index = closest_point_within_threshold(sorted_waypoints, self._last_waypoint, threshold=0.5)
30 | if closest_index != -1:
31 | curr_index = closest_index
32 | else:
33 | curr_value = sorted_values[curr_index]
34 | if curr_value + 0.01 > self._last_value:
35 | best_waypoint_idx = curr_index
36 |
37 | if best_waypoint_idx is None:
38 | for idx, waypoint in enumerate(sorted_waypoints):
39 | cyclic = self._acyclic_enforcer.check_cyclic(position, waypoint, threshold=0.5*20)
40 | if cyclic:
41 | continue
42 | best_waypoint_idx = idx
43 | break
44 |
45 | if best_waypoint_idx is None:
46 | print("All waypoints are cyclic! Choosing the closest one.")
47 | best_waypoint_idx = max(range(len(sorted_waypoints)),
48 | key=lambda i: np.linalg.norm(sorted_waypoints[i] - position))
49 |
50 | best_waypoint = sorted_waypoints[best_waypoint_idx]
51 | best_value = sorted_values[best_waypoint_idx]
52 | self._acyclic_enforcer.add_state_action(position, best_waypoint)
53 | self._last_value = best_value
54 | self._last_waypoint = best_waypoint
55 |
56 | return best_waypoint, best_value, sorted_waypoints
--------------------------------------------------------------------------------
/vlnce_baselines/models/super_pixel_policy.py:
--------------------------------------------------------------------------------
1 | import os
2 | import cv2
3 | import numpy as np
4 | import torch.nn as nn
5 | from typing import List
6 | from fast_slic import Slic
7 | from collections.abc import Sequence
8 | from scipy.spatial.distance import cdist
9 | from vlnce_baselines.utils.map_utils import *
10 | from vlnce_baselines.utils.data_utils import OrderedSet
11 | from vlnce_baselines.models.superpixel_waypoint_selector import WaypointSelector
12 |
13 | import time
14 | from pyinstrument import Profiler
15 |
16 |
17 | class SuperPixelPolicy(nn.Module):
18 | def __init__(self, config) -> None:
19 | super().__init__()
20 | self.config = config
21 | self.visualize = config.MAP.VISUALIZE
22 | self.print_images = config.MAP.PRINT_IMAGES
23 |
24 | self.waypoint_selector = WaypointSelector(config)
25 |
26 | def reset(self) -> None:
27 | self.waypoint_selector.reset()
28 |
29 | def _get_sorted_regions(self, full_map: np.ndarray, traversible: np.ndarray, value_map: np.ndarray,
30 | collision_map: np.ndarray, detected_classes: OrderedSet) -> List:
31 | valid_mask = value_map.astype(bool)
32 | min_val = np.min(value_map)
33 | max_val = np.max(value_map)
34 | normalized_values = (value_map - min_val) / (max_val - min_val + 1e-5)
35 | normalized_values[value_map == 0] = 1
36 | img = cv2.applyColorMap((normalized_values * 255).astype(np.uint8), cv2.COLORMAP_HOT)
37 | slic = cv2.ximgproc.createSuperpixelSLIC(img, region_size=20, ruler=20.0)
38 | slic.iterate(10)
39 | mask_slic = slic.getLabelContourMask()
40 | mask_slic *= valid_mask
41 | label_slic = slic.getLabels()
42 | label_slic *= valid_mask
43 | valid_labels = np.unique(label_slic)[1:]
44 | value_regions = []
45 | for label in valid_labels:
46 | mask = np.zeros_like(mask_slic)
47 | mask[label_slic == label] = 1
48 | nb_components, output, stats, centroids = cv2.connectedComponentsWithStats(mask, connectivity=8)
49 | if np.sum(output == 1) < 100:
50 | continue
51 | waypoint = np.array([int(centroids[1][1]), int(centroids[1][0])])
52 | value_mask = np.zeros_like(value_map)
53 | value_mask[waypoint[0] - 5: waypoint[0] + 5, waypoint[1] - 5: waypoint[1] + 5] = 1
54 | masked_value = value_mask * value_map
55 | waypoint = get_nearest_nonzero_waypoint(traversible, waypoint)
56 | if np.sum(collision_map) > 0:
57 | nonzero_indices = np.argwhere(collision_map != 0)
58 | distances = cdist([waypoint], nonzero_indices)
59 | if np.min(distances) <= 5:
60 | print("!!!!!!!!!!!!!!!!!waypoint close to collision area, change waypoint!")
61 | continue
62 | value_regions.append((mask, np.mean(masked_value[masked_value != 0]), waypoint))
63 | sorted_regions = sorted(value_regions, key=lambda x: x[1], reverse=True)
64 | waypoint_values = np.array([item[1] for item in value_regions])
65 |
66 | return sorted_regions
67 |
68 | def _get_sorted_region_fast_slic(self, full_map: np.ndarray, traversible: np.ndarray, value_map: np.ndarray,
69 | collision_map: np.ndarray, detected_classes: OrderedSet) -> List:
70 | valid_mask = value_map.astype(bool)
71 | min_val = np.min(value_map)
72 | max_val = np.max(value_map)
73 | normalized_values = (value_map - min_val) / (max_val - min_val + 1e-5)
74 | normalized_values[value_map == 0] = 1
75 | img = cv2.applyColorMap((normalized_values * 255).astype(np.uint8), cv2.COLORMAP_HOT)
76 | slic = Slic(num_components=24**2, compactness=100)
77 | assignment = slic.iterate(img)
78 | assignment *= valid_mask
79 | valid_labels = np.unique(assignment)[1:]
80 | value_regions = []
81 | for label in valid_labels:
82 | mask = np.zeros_like(value_map)
83 | mask[assignment == label] = 1
84 | nb_components, output, stats, centroids = \
85 | cv2.connectedComponentsWithStats(mask.astype(np.uint8), connectivity=8)
86 | if np.sum(output == 1) < 100:
87 | continue
88 | waypoint = np.array([int(centroids[1][1]), int(centroids[1][0])])
89 | value_mask = np.zeros_like(value_map)
90 | value_mask[waypoint[0] - 5: waypoint[0] + 5, waypoint[1] - 5: waypoint[1] + 5] = 1
91 | masked_value = value_mask * value_map
92 | if np.sum(masked_value) > 0:
93 | value_regions.append((mask, np.mean(masked_value[masked_value != 0]), waypoint))
94 | else:
95 | value_regions.append((mask, 0., waypoint))
96 | sorted_regions = sorted(value_regions, key=lambda x: x[1], reverse=True)
97 |
98 | return sorted_regions
99 |
100 | def _sorted_waypoints(self, sorted_regions: List, top_k: int=3):
101 | waypoints, values = [], []
102 | for item in sorted_regions[:top_k]:
103 | waypoints.append([item[2]])
104 | values.append(item[1])
105 | waypoints = np.concatenate(waypoints, axis=0)
106 |
107 | return waypoints, values
108 |
109 | def forward(self, full_map: np.ndarray, traversible: np.ndarray, value_map: np.ndarray, collision_map: np.ndarray,
110 | detected_classes: OrderedSet, position: Sequence, fmm_dist: np.ndarray, replan: bool, step: int, current_episode_id: int):
111 | if np.sum(value_map.astype(bool)) < 24**2:
112 | best_waypoint = np.array([int(position[0]), int(position[1])])
113 | best_value = 0.
114 | sorted_waypoints = [np.array([int(position[0]), int(position[1])])]
115 |
116 | return best_waypoint, best_value, sorted_waypoints
117 | else:
118 | sorted_regions = self._get_sorted_region_fast_slic(full_map, traversible, value_map, collision_map, detected_classes)
119 | sorted_waypoints, sorted_values = self._sorted_waypoints(sorted_regions)
120 | best_waypoint, best_value, sorted_waypoints = \
121 | self.waypoint_selector(sorted_waypoints, position, collision_map, value_map, fmm_dist, traversible, replan)
122 |
123 | if self.visualize:
124 | self._visualize(sorted_regions, value_map, step, current_episode_id)
125 | return best_waypoint, best_value, sorted_waypoints
126 |
127 | def _visualize(self, sorted_regions: List, value_map: np.ndarray, step: int, current_episode_id: int):
128 | waypoints = []
129 | res = np.zeros(value_map.shape)
130 | num_regions = len(sorted_regions)
131 |
132 | for i, (mask, _, _) in enumerate(sorted_regions):
133 | res[mask == 1] = num_regions + 1 - i
134 | _, _, _, centroids = cv2.connectedComponentsWithStats(mask.astype(np.uint8), connectivity=8)
135 | waypoint = np.array([int(centroids[1][1]), int(centroids[1][0])])
136 | waypoints.append(waypoint)
137 |
138 | min_val = np.min(res)
139 | max_val = np.max(res)
140 | normalized_values = (res - min_val) / (max_val - min_val + 1)
141 | normalized_values[res == 0] = 1
142 | res = cv2.applyColorMap((normalized_values* 255).astype(np.uint8), cv2.COLORMAP_HOT)
143 | for waypoint in waypoints:
144 | cv2.circle(res, (waypoint[1], waypoint[0]), radius=2, color=(0,0,0), thickness=-1)
145 |
146 | cv2.imshow("super pixel", np.flipud(res))
147 | cv2.waitKey(1)
148 |
149 | if self.print_images:
150 | save_dir = os.path.join(self.config.RESULTS_DIR, "super_pixel/eps_%d"%current_episode_id)
151 | os.makedirs(save_dir, exist_ok=True)
152 | fn = "{}/step-{}.png".format(save_dir, step)
153 | cv2.imwrite(fn, np.flipud(res))
--------------------------------------------------------------------------------
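A minimal sketch of the `fast_slic` clustering used in `SuperPixelPolicy._get_sorted_region_fast_slic` above; the random value map stands in for a real BLIP value map.

```python
import cv2
import numpy as np
from fast_slic import Slic

value_map = np.random.rand(480, 480).astype(np.float32)
value_map[value_map < 0.3] = 0                          # pretend unexplored cells are zero

norm = (value_map - value_map.min()) / (value_map.max() - value_map.min() + 1e-5)
img = cv2.applyColorMap((norm * 255).astype(np.uint8), cv2.COLORMAP_HOT)

slic = Slic(num_components=24 ** 2, compactness=100)
assignment = slic.iterate(img)                          # one integer label per pixel
assignment *= value_map.astype(bool)                    # drop segments outside the explored area

for label in np.unique(assignment)[1:][:3]:             # inspect a few segments
    print(label, float(value_map[assignment == label].mean()))
```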
/vlnce_baselines/models/superpixel_waypoint_selector.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch.nn as nn
3 | from scipy.spatial.distance import cdist
4 | from vlnce_baselines.utils.acyclic_enforcer import AcyclicEnforcer
5 | from vlnce_baselines.utils.map_utils import get_nearest_nonzero_waypoint
6 |
7 |
8 | class WaypointSelector(nn.Module):
9 | def __init__(self, config) -> None:
10 | super().__init__()
11 | self.config = config
12 | self.resolution = config.MAP.MAP_RESOLUTION
13 | self.distance_threshold = 0.25 * 100 / self.resolution
14 | self._acyclic_enforcer = AcyclicEnforcer()
15 | self._last_value = float("-inf")
16 | self._last_waypoint = np.zeros(2)
17 | self._stick_current_waypoint = False
18 | self.change_threshold = self.config.EVAL.CHANGE_THRESHOLD
19 |
20 | def reset(self) -> None:
21 | self._last_value = float("-inf")
22 | self._last_waypoint = np.zeros(2)
23 | self._stick_current_waypoint = False
24 | self._acyclic_enforcer.reset()
25 |
26 | def closest_point(self, points: np.ndarray, target_point: np.ndarray) -> np.ndarray:
27 | distances = np.linalg.norm(points - target_point, axis=1)
28 |
29 | return points[np.argmin(distances)]
30 |
31 | def _get_value(self, position: np.ndarray, value_map: np.ndarray) -> float:
32 | x, y = position
33 | value = value_map[x - 5 : x + 6, y - 5: y + 6]
34 | value = np.mean(value[value != 0])
35 |
36 | return value
37 |
38 | def forward(self, sorted_waypoints: np.ndarray, position: np.ndarray, collision_map: np.ndarray,
39 | value_map: np.ndarray, fmm_dist: np.ndarray, traversible: np.ndarray, replan: bool):
40 | best_waypoint, best_value = None, None
41 | invalid_waypoint = False
42 | if not np.array_equal(self._last_waypoint, np.zeros(2)):
43 | if replan:
44 | invalid_waypoint = True
45 |
46 | if np.sum(collision_map) > 0:
47 | """
48 | check if the last_waypoint is too close to the current collision area
49 | """
50 | nonzero_indices = np.argwhere(collision_map != 0)
51 | distances = cdist([self._last_waypoint], nonzero_indices)
52 | if np.min(distances) <= 5:
53 | invalid_waypoint = True
54 | print("################################################ close to collision")
55 |
56 | if np.linalg.norm(self._last_waypoint - position) < self.distance_threshold:
57 | invalid_waypoint = True
58 | print("################################################ achieve")
59 |
60 | x, y = int(position[0]), int(position[1])
61 | if fmm_dist is not None:
62 | print("fmm dist: ", np.mean(fmm_dist[x-10:x+11, y-10:y+11]), np.max(fmm_dist))
63 | if fmm_dist is not None and abs(np.mean(fmm_dist[x-10:x+11, y-10:y+11]) - np.max(fmm_dist)) <= 5.0:
64 | invalid_waypoint = True
65 | print("################################################ created an enclosed area!")
66 |
67 | if invalid_waypoint:
68 | idx = 0
69 | new_waypoint = sorted_waypoints[idx]
70 | distance_flag = np.linalg.norm(new_waypoint - position) < self.distance_threshold
71 | last_waypoint_flag = np.linalg.norm(new_waypoint - self._last_waypoint) < self.distance_threshold
72 | flag = distance_flag or last_waypoint_flag
73 | while ( flag and idx + 1 < len(sorted_waypoints)):
74 | idx += 1
75 | new_waypoint = sorted_waypoints[idx]
76 | self._last_waypoint = new_waypoint
77 |
78 | """
79 | if the last waypoint's value has not degraded too much,
80 | stick with it.
81 | """
82 | curr_value = self._get_value(self._last_waypoint, value_map)
83 |
84 | if ((np.linalg.norm(self._last_waypoint - position) > self.distance_threshold) and
85 | (curr_value - self._last_value > self.change_threshold)):
86 | best_waypoint = self._last_waypoint
87 |
88 | if best_waypoint is None:
89 | for waypoint in sorted_waypoints:
90 | cyclic = self._acyclic_enforcer.check_cyclic(position, waypoint,
91 | threshold=0.5*100/self.resolution)
92 | if cyclic or np.linalg.norm(waypoint - position) < self.distance_threshold:
93 | continue
94 |
95 | best_waypoint = waypoint
96 | break
97 |
98 | if best_waypoint is None:
99 | print("All waypoints are cyclic! Choosing the closest one.")
100 | best_waypoint = self.closest_point(sorted_waypoints, position)
101 |
102 | if traversible[best_waypoint[0], best_waypoint[1]] == 0:
103 | best_waypoint = get_nearest_nonzero_waypoint(traversible, best_waypoint)
104 |
105 | best_value = self._get_value(best_waypoint, value_map)
106 |
107 | self._acyclic_enforcer.add_state_action(position, best_waypoint)
108 | self._last_value = best_value
109 | self._last_waypoint = best_waypoint
110 |
111 | return best_waypoint, best_value, sorted_waypoints
--------------------------------------------------------------------------------
/vlnce_baselines/models/vanilla_waypoint_selector.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch.nn as nn
3 | from scipy.spatial.distance import cdist
4 | from vlnce_baselines.utils.acyclic_enforcer import AcyclicEnforcer
5 | from vlnce_baselines.utils.map_utils import get_nearest_nonzero_waypoint
6 |
7 |
8 | class WaypointSelector(nn.Module):
9 | def __init__(self, config) -> None:
10 | super().__init__()
11 | self.config = config
12 | self.resolution = config.MAP.MAP_RESOLUTION
13 | self.distance_threshold = 0.5 * 100 / self.resolution
14 |
15 | self.reset()
16 |
17 | def reset(self) -> None:
18 | self._last_value = float("-inf")
19 | self._last_waypoint = np.zeros(2)
20 | self._stick_current_waypoint = False
21 | self._acyclic_enforcer = AcyclicEnforcer()
22 |
23 | def closest_point(self, points: np.ndarray, target_point: np.ndarray) -> np.ndarray:
24 | distances = np.linalg.norm(points - target_point, axis=1)
25 |
26 | return points[np.argmin(distances)]
27 |
28 | def _get_value(self, position: np.ndarray, value_map: np.ndarray) -> float:
29 | x, y = position
30 | value = value_map[x - 5 : x + 6, y - 5: y + 6]
31 | value = np.mean(value[value != 0])
32 |
33 | return value
34 |
35 | def forward(self, sorted_waypoints: np.ndarray, frontiers: np.ndarray, position: np.ndarray,
36 | collision_map: np.ndarray, value_map: np.ndarray):
37 | best_waypoint, best_value = None, None
38 | invalid_waypoint = False
39 | print("last waypoint: ", self._last_waypoint, self._last_value)
40 | if not np.array_equal(self._last_waypoint, np.zeros(2)):
41 | if np.sum(collision_map) > 0:
42 | """
43 | check if the last_waypoint is too close to the current collision area
44 | """
45 | nonzero_indices = np.argwhere(collision_map != 0)
46 | distances = cdist([self._last_waypoint], nonzero_indices)
47 | if np.min(distances) <= 5:
48 | invalid_waypoint = True
49 | print("################################################ close to collision")
50 |
51 | if np.sum(frontiers) > 0:
52 | nonzero_indices = np.argwhere(frontiers != 0)
53 | distances = cdist([self._last_waypoint], nonzero_indices)
54 | if np.min(distances) >= 5 * 100 / self.resolution:
55 | invalid_waypoint = True
56 | print("################################################ too far from frontiers")
57 |
58 | if np.linalg.norm(self._last_waypoint - position) <= self.distance_threshold:
59 | """
60 | already reached the last waypoint, need to choose another one
61 | """
62 | invalid_waypoint = True
63 | print("################################################ achieve")
64 |
65 | if invalid_waypoint:
66 | idx = 0
67 | new_waypoint = sorted_waypoints[idx]
68 | while (np.linalg.norm(new_waypoint - position) < self.distance_threshold and
69 | idx + 1 < len(sorted_waypoints)):
70 | idx += 1
71 | new_waypoint = sorted_waypoints[idx]
72 | self._last_waypoint = new_waypoint
73 | print("################################################ get new last waypoint")
74 |
75 | """
76 | do not change the waypoint if the last waypoint's value has not degraded too much
77 | """
78 | curr_value = self._get_value(self._last_waypoint, value_map)
79 |
80 | if (np.linalg.norm(self._last_waypoint - position) > self.distance_threshold and
81 | (curr_value - self._last_value > -0.03)):
82 | best_waypoint = self._last_waypoint
83 | else:
84 | print("!!!!!!!! already achieved last waypoint")
85 |
86 | if best_waypoint is None:
87 | for waypoint in sorted_waypoints:
88 | cyclic = self._acyclic_enforcer.check_cyclic(position, waypoint,
89 | threshold=0.5*100/self.resolution)
90 | if cyclic or np.linalg.norm(waypoint - position) <= self.distance_threshold:
91 | continue
92 |
93 | best_waypoint = waypoint
94 | break
95 |
96 | if best_waypoint is None:
97 | print("All waypoints are cyclic! Choosing the closest one.")
98 | best_waypoint = self.closest_point(sorted_waypoints, position)
99 |
100 | if value_map[best_waypoint[0], best_waypoint[1]] == 0:
101 | best_waypoint = get_nearest_nonzero_waypoint(value_map, best_waypoint)
102 |
103 | best_value = self._get_value(best_waypoint, value_map)
104 | self._acyclic_enforcer.add_state_action(position, best_waypoint)
105 | self._last_value = best_value
106 | self._last_waypoint = best_waypoint
107 |
108 | return best_waypoint, best_value, sorted_waypoints
--------------------------------------------------------------------------------
/vlnce_baselines/utils/acyclic_enforcer.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Boston Dynamics AI Institute LLC. All rights reserved.
2 |
3 | from typing import Any, Set
4 |
5 | import numpy as np
6 |
7 |
8 | class StateAction:
9 | def __init__(self, position: np.ndarray, waypoint: Any):
10 | self.position = position
11 | self.waypoint = waypoint
12 |
13 | def __hash__(self) -> int:
14 | string_repr = f"{self.position}_{self.waypoint}"
15 | return hash(string_repr)
16 |
17 |
18 | class AcyclicEnforcer:
19 | history: Set[StateAction] = set()
20 |
21 | def reset(self):
22 | self.history = set()
23 |
24 | def check_cyclic(self, position: np.ndarray, waypoint: Any, threshold: float) -> bool:
25 | for item in self.history:
26 | if (np.linalg.norm(item.waypoint - waypoint) <= threshold and
27 | np.linalg.norm(item.position - position) <= threshold):
28 | return True
29 |
30 | return False
31 |
32 | def add_state_action(self, position: np.ndarray, waypoint: Any) -> None:
33 | state_action = StateAction(position, waypoint)
34 | self.history.add(state_action)
--------------------------------------------------------------------------------
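A short usage sketch for `AcyclicEnforcer`; the positions, waypoints, and threshold are illustrative.

```python
import numpy as np
from vlnce_baselines.utils.acyclic_enforcer import AcyclicEnforcer

enforcer = AcyclicEnforcer()
enforcer.reset()                                        # start with a fresh history

position = np.array([240.0, 240.0])
waypoint = np.array([260.0, 250.0])
enforcer.add_state_action(position, waypoint)

# Re-proposing (almost) the same state-action pair is flagged as cyclic ...
print(enforcer.check_cyclic(position + 1.0, waypoint, threshold=10.0))              # True
# ... while a clearly different waypoint is not.
print(enforcer.check_cyclic(position, np.array([300.0, 300.0]), threshold=10.0))    # False
```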
/vlnce_baselines/utils/constant.py:
--------------------------------------------------------------------------------
1 | color_palette = [
2 | 1.00, 1.00, 1.00,
3 | 0.60, 0.60, 0.60,
4 | 0.95, 0.95, 0.95,
5 | 0.96, 0.36, 0.26,
6 | 0.12, 0.47, 0.71,
7 | 0.94, 0.78, 0.66,
8 | 0.94, 0.89, 0.66,
9 | 0.89, 0.94, 0.66,
10 | 0.78, 0.94, 0.66,
11 | 0.68, 0.94, 0.66,
12 | 0.66, 0.94, 0.75,
13 | 0.66, 0.94, 0.85,
14 | 0.66, 0.92, 0.94,
15 | 0.66, 0.82, 0.94,
16 | 0.66, 0.71, 0.94,
17 | 0.71, 0.66, 0.94,
18 | 0.82, 0.66, 0.94,
19 | 0.92, 0.66, 0.94,
20 | 0.94, 0.66, 0.85,
21 | 0.94, 0.66, 0.75,
22 | 0.66, 0.66, 0.66,
23 | 0.77, 0.77, 0.77,
24 | 0.99, 0.50, 0.40,
25 | 0.14, 0.49, 0.74,
26 | 0.99, 0.83, 0.71,
27 | 0.99, 0.94, 0.71,
28 | 0.94, 0.99, 0.71,
29 | 0.83, 0.99, 0.71,
30 | 0.72, 0.99, 0.71,
31 | 0.71, 0.99, 0.77
32 | ]
33 |
34 | legend_color_palette = [
35 | (1.00, 1.00, 1.00),
36 | (0.60, 0.60, 0.60),
37 | (0.95, 0.95, 0.95),
38 | (0.96, 0.36, 0.26),
39 | (0.12, 0.47, 0.71),
40 | (0.94, 0.78, 0.66),
41 | (0.94, 0.89, 0.66),
42 | (0.89, 0.94, 0.66),
43 | (0.78, 0.94, 0.66),
44 | (0.68, 0.94, 0.66),
45 | (0.66, 0.94, 0.75),
46 | (0.66, 0.94, 0.85),
47 | (0.66, 0.92, 0.94),
48 | (0.66, 0.82, 0.94),
49 | (0.66, 0.71, 0.94),
50 | (0.71, 0.66, 0.94),
51 | (0.82, 0.66, 0.94),
52 | (0.92, 0.66, 0.94),
53 | (0.94, 0.66, 0.85),
54 | (0.94, 0.66, 0.75),
55 | (0.66, 0.66, 0.66),
56 | (0.77, 0.77, 0.77),
57 | (0.99, 0.50, 0.40),
58 | (0.14, 0.49, 0.74),
59 | (0.99, 0.83, 0.71),
60 | (0.99, 0.94, 0.71),
61 | (0.94, 0.99, 0.71),
62 | (0.83, 0.99, 0.71),
63 | (0.72, 0.99, 0.71),
64 | (0.71, 0.99, 0.77)
65 | ]
66 |
67 |
68 | legend_color_palette = [(int(color[0] * 255),
69 | int(color[1] * 255),
70 | int(color[2] * 255)) for color in legend_color_palette]
71 |
72 | map_channels = 4
73 |
74 | base_classes = [
75 | "chair", "couch", "plant", "bed", "toilet",
76 | "tv", "table", "oven", "sink", "refrigerator",
77 | "book", "clock", "vase", "cup", "bottle", "floor"
78 | ]
79 |
80 |
81 | navigable_classes = [
82 | "stair", "stairs",
83 | "step", "steps",
84 | "stairway", "stairways",
85 | "staircase", "staircases",
86 | "floor", "ground",
87 | "walkway"
88 | ]
89 |
90 |
91 | direction_mapping = {
92 | "forward": "forward",
93 | "straight": "forward",
94 | "front": "forward",
95 | "left": "left",
96 | "right": "right",
97 | "around": "backward",
98 | "round": "backward",
99 | "back": "backward",
100 | "backward": "backward"
101 | }
--------------------------------------------------------------------------------
/vlnce_baselines/utils/data_utils.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Union
2 |
3 |
4 | class OrderedSet:
5 | def __init__(self) -> None:
6 | self.order = ['unknow']
7 | self.unique_elements = set()
8 |
9 | def add(self, x: Union[int, float, str]) -> None:
10 | if x not in self.unique_elements:
11 | self.order.remove('unknow')
12 | self.order.append(x)
13 | self.order.append('unknow')
14 | self.unique_elements.add(x)
15 |
16 | def remove(self, x: Any) -> None:
17 | if x in self.order:
18 | self.order.remove(x)
19 | self.unique_elements.remove(x)
20 |
21 | def clear(self) -> None:
22 | self.order.clear()
23 | self.unique_elements.clear()
24 |
25 | def index(self, x: Any) -> int:
26 | if x in self.order:
27 | return self.order.index(x)
28 | else:
29 | raise ValueError(f"{x} not found in OrderedSet")
30 |
31 | def __len__(self):
32 | return len(self.order)
33 |
34 | def __str__(self):
35 | return str(self.order)
36 |
37 | def __getitem__(self, index: int):
38 | return self.order[index]
--------------------------------------------------------------------------------
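A short usage sketch for `OrderedSet`; the class names are illustrative, and 'unknow' is the sentinel exactly as it is spelled in the code.

```python
from vlnce_baselines.utils.data_utils import OrderedSet

detected = OrderedSet()
detected.add("chair")
detected.add("table")
detected.add("chair")            # duplicates are ignored

print(detected)                  # ['chair', 'table', 'unknow']  (the sentinel stays last)
print(detected.index("table"))   # 1 -> e.g. the semantic-map channel offset for "table"
print(len(detected))             # 3
```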
/vlnce_baselines/utils/depth_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 The TensorFlow Authors All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Utilities for processing depth images.
17 | """
18 | from argparse import Namespace
19 |
20 | import itertools
21 | import numpy as np
22 | import torch
23 |
24 | import vlnce_baselines.utils.rotation_utils as ru
25 |
26 |
27 | def get_camera_matrix(width, height, fov):
28 | """Returns a camera matrix from image size and fov."""
29 | xc = (width - 1.) / 2. # x camera
30 | zc = (height - 1.) / 2. # z camera
31 | f = (width / 2.) / np.tan(np.deg2rad(fov / 2.))
32 | camera_matrix = {'xc': xc, 'zc': zc, 'f': f}
33 | camera_matrix = Namespace(**camera_matrix)
34 | return camera_matrix
35 |
36 |
37 | def get_point_cloud_from_z(Y, camera_matrix, scale=1):
38 | """Projects the depth image Y into a 3D point cloud.
39 | Inputs:
40 | Y is ...xHxW
41 | camera_matrix
42 | Outputs:
43 | X is positive going right
44 | Y is positive into the image
45 | Z is positive up in the image
46 | XYZ is ...xHxWx3
47 | """
48 | x, z = np.meshgrid(np.arange(Y.shape[-1]),
49 | np.arange(Y.shape[-2] - 1, -1, -1))
50 | for _ in range(Y.ndim - 2):
51 | x = np.expand_dims(x, axis=0)
52 | z = np.expand_dims(z, axis=0)
53 | X = (x[::scale, ::scale] - camera_matrix.xc) * \
54 | Y[::scale, ::scale] / camera_matrix.f
55 | Z = (z[::scale, ::scale] - camera_matrix.zc) * \
56 | Y[::scale, ::scale] / camera_matrix.f
57 | XYZ = np.concatenate((X[..., np.newaxis],
58 | Y[::scale, ::scale][..., np.newaxis],
59 | Z[..., np.newaxis]), axis=X.ndim)
60 | return XYZ
61 |
62 |
63 | def transform_camera_view(XYZ, sensor_height, camera_elevation_degree):
64 | """
65 | Transforms the point cloud into geocentric frame to account for
66 | camera elevation and angle
67 | Input:
68 | XYZ : ...x3
69 | sensor_height : height of the sensor
70 | camera_elevation_degree : camera elevation to rectify.
71 | Output:
72 | XYZ : ...x3
73 | """
74 | R = ru.get_r_matrix(
75 | [1., 0., 0.], angle=np.deg2rad(camera_elevation_degree))
76 | XYZ = np.matmul(XYZ.reshape(-1, 3), R.T).reshape(XYZ.shape)
77 | XYZ[..., 2] = XYZ[..., 2] + sensor_height
78 | return XYZ
79 |
80 |
81 | def transform_pose(XYZ, current_pose):
82 | """
83 | Transforms the point cloud into geocentric frame to account for
84 | camera position
85 | Input:
86 | XYZ : ...x3
87 | current_pose : camera position (x, y, theta (radians))
88 | Output:
89 | XYZ : ...x3
90 | """
91 | R = ru.get_r_matrix([0., 0., 1.], angle=current_pose[2] - np.pi / 2.)
92 | XYZ = np.matmul(XYZ.reshape(-1, 3), R.T).reshape(XYZ.shape)
93 | XYZ[:, :, 0] = XYZ[:, :, 0] + current_pose[0]
94 | XYZ[:, :, 1] = XYZ[:, :, 1] + current_pose[1]
95 | return XYZ
96 |
97 |
98 | def bin_points(XYZ_cms, map_size, z_bins, xy_resolution):
99 | """Bins points into xy-z bins
100 | XYZ_cms is ... x H x W x3
101 | Outputs is ... x map_size x map_size x (len(z_bins)+1)
102 | """
103 | sh = XYZ_cms.shape
104 | XYZ_cms = XYZ_cms.reshape([-1, sh[-3], sh[-2], sh[-1]])
105 | n_z_bins = len(z_bins) + 1
106 | counts = []
107 | for XYZ_cm in XYZ_cms:
108 | isnotnan = np.logical_not(np.isnan(XYZ_cm[:, :, 0]))
109 | X_bin = np.round(XYZ_cm[:, :, 0] / xy_resolution).astype(np.int32)
110 | Y_bin = np.round(XYZ_cm[:, :, 1] / xy_resolution).astype(np.int32)
111 | Z_bin = np.digitize(XYZ_cm[:, :, 2], bins=z_bins).astype(np.int32)
112 |
113 | isvalid = np.array([X_bin >= 0, X_bin < map_size, Y_bin >= 0,
114 | Y_bin < map_size,
115 | Z_bin >= 0, Z_bin < n_z_bins, isnotnan])
116 | isvalid = np.all(isvalid, axis=0)
117 |
118 | ind = (Y_bin * map_size + X_bin) * n_z_bins + Z_bin
119 | ind[np.logical_not(isvalid)] = 0
120 | count = np.bincount(ind.ravel(), isvalid.ravel().astype(np.int32),
121 | minlength=map_size * map_size * n_z_bins)
122 | counts = np.reshape(count, [map_size, map_size, n_z_bins])
123 |
124 | counts = counts.reshape(list(sh[:-3]) + [map_size, map_size, n_z_bins])
125 |
126 | return counts
127 |
128 |
129 | def get_point_cloud_from_z_t(Y_t, camera_matrix, device, scale=1):
130 | """
131 | Transform from pixel axis to camera axis.
132 | Projects the depth image Y into a 3D point cloud.
133 | Inputs:
134 | Y is ...xHxW
135 | camera_matrix
136 | Outputs:
137 | X is positive going right
138 | Y is positive into the image
139 | Z is positive up in the image
140 | XYZ is ...xHxWx3
141 | """
142 | # Y_t.shape = (batchsize, H, W)
143 | # grid_x.shape = grid_z.shape = (W, H)
144 | grid_x, grid_z = torch.meshgrid(torch.arange(Y_t.shape[-1]), # grid_x range: [0, W - 1]
145 | torch.arange(Y_t.shape[-2] - 1, -1, -1)) # grid_z range: [H - 1, 0]
146 | grid_x = grid_x.transpose(1, 0).to(device) # (H, W)
147 | grid_z = grid_z.transpose(1, 0).to(device) # (H, W)
148 | grid_x = grid_x.unsqueeze(0).expand(Y_t.size()) # (batchsize, H, W)
149 | grid_z = grid_z.unsqueeze(0).expand(Y_t.size())  # (batchsize, H, W)
150 |
151 | X_t = (grid_x[:, ::scale, ::scale] - camera_matrix.xc) * \
152 | Y_t[:, ::scale, ::scale] / camera_matrix.f
153 | Z_t = (grid_z[:, ::scale, ::scale] - camera_matrix.zc) * \
154 | Y_t[:, ::scale, ::scale] / camera_matrix.f
155 |
156 | XYZ = torch.stack(
157 | (X_t, Y_t[:, ::scale, ::scale], Z_t), dim=len(Y_t.size()))
158 |
159 | return XYZ
160 |
161 |
162 | def transform_camera_view_t(
163 | XYZ, sensor_height, camera_elevation_degree, device):
164 | """
165 | Transform from camera axis to world axis, height and elevation first
166 | Transforms the point cloud into geocentric frame to account for
167 | camera elevation and angle
168 | Input:
169 | XYZ : ...x3
170 | sensor_height : height of the sensor
171 | camera_elevation_degree : camera elevation to rectify.
172 | Output:
173 | XYZ : ...x3
174 | """
175 | R = ru.get_r_matrix([1., 0., 0.], angle=np.deg2rad(camera_elevation_degree))
176 | XYZ = torch.matmul(XYZ.reshape(-1, 3), torch.from_numpy(R).transpose(1, 0).to(device)).reshape(XYZ.shape)
177 | XYZ[..., 2] = XYZ[..., 2] + sensor_height
178 |
179 | return XYZ
180 |
181 |
182 | def transform_pose_t(XYZ, current_pose, device):
183 | """
184 | Transform from camera axis to world axis, x,y,heading second
185 | Transforms the point cloud into geocentric frame to account for
186 | camera position
187 | Input:
188 | XYZ : ...x3
189 | current_pose : camera position (x, y, theta (radians))
190 | Output:
191 | XYZ : ...x3
192 | """
193 | R = ru.get_r_matrix([0., 0., 1.], angle=current_pose[2] - np.pi / 2.)
194 | XYZ = torch.matmul(XYZ.reshape(-1, 3), torch.from_numpy(R).transpose(1, 0).to(device)).reshape(XYZ.shape)
195 | XYZ[..., 0] += current_pose[0]
196 | XYZ[..., 1] += current_pose[1]
197 |
198 | return XYZ
199 |
200 |
201 | def splat_feat_nd(init_grid, feat, coords):
202 | """
203 | Args:
204 | init_grid: B X nF X W X H X D X ..
205 | feat: B X nF X nPt
206 | coords: B X nDims X nPt in [-1, 1]
207 | Returns:
208 | grid: B X nF X W X H X D X ..
209 | """
210 | wts_dim = [] # store weights
211 | pos_dim = []
212 | grid_dims = init_grid.shape[2:] # [vr, vr, max_height - min_height]
213 |
214 | B = init_grid.shape[0]
215 | F = init_grid.shape[1] # num_categories + 1
216 |
217 | n_dims = len(grid_dims) # n_dims=3
218 |
219 | grid_flat = init_grid.view(B, F, -1) # [bs, 17, 100*100*80]
220 |
221 | '''
222 | coords: XYZ_cm_std => [bs, 3, x*y]
223 | XYZ_cm_std[..., :2] = (XYZ_cm_std[..., :2] / xy_resolution)
224 | XYZ_cm_std[..., :2] = (XYZ_cm_std[..., :2] - vision_range // 2.) / vision_range * 2.
225 | XYZ_cm_std[..., 2] = XYZ_cm_std[..., 2] / z_resolution
226 | XYZ_cm_std[..., 2] = (XYZ_cm_std[..., 2] - (max_h + min_h) // 2.) / (max_h - min_h) * 2.
227 | since XYZ_cm_std was normalized to [-1, 1], pos = coords[:, [d], :] * grid_dims[d] / 2 + grid_dims[d] / 2 recovers the unnormalized coordinates
228 | '''
229 | for d in range(n_dims):
230 | pos = coords[:, [d], :] * grid_dims[d] / 2 + grid_dims[d] / 2 # [bs, 1, 19200=120*160]; pos has negative values
231 | pos_d = []
232 | wts_d = []
233 |
234 | for ix in [0, 1]:
235 | # when ix=0, round down; when ix=1, round up
236 | pos_ix = torch.floor(pos) + ix # [bs, 19200]
237 | safe_ix = (pos_ix > 0) & (pos_ix < grid_dims[d])
238 | safe_ix = safe_ix.type(pos.dtype)
239 |
240 | # when round down: e.g. 1.75 -> 1.0; weight = 1 - abs(1.75 - 1.0) = 0.25;
241 | # when round up: e.g. 1.75 -> 1.0 + 1 = 2.0; weight = 1 - abs(1.75 - 2.0) = 0.75
242 | wts_ix = 1 - torch.abs(pos - pos_ix)
243 |
244 | wts_ix = wts_ix * safe_ix # positions outside range have weight=0
245 | pos_ix = pos_ix * safe_ix # positions outside range are all set to 0
246 |
247 | pos_d.append(pos_ix)
248 | wts_d.append(wts_ix)
249 |
250 | # len(pos_d)=2, len(pos_dim)=3; pos_dim = [[[...],[...]],[],[]], pos_dim[0][0].shape=[bs, 1, 19200]
251 | # pos_dim[0][0]: x_floor; pos_dim[0][1]: x_ceil
252 | # pos_dim[1][0]: y_floor; pos_dim[1][1]: y_ceil
253 | # pos_dim[2][0]: z_floor; pos_dim[2][1]: z_ceil
254 | # pos_dim actually stores the point-cloud positions inside the vision range, which are the ones that matter for the subsequent steps
255 | pos_dim.append(pos_d)
256 | wts_dim.append(wts_d)
257 |
258 | l_ix = [[0, 1] for d in range(n_dims)]
259 | for ix_d in itertools.product(*l_ix):
260 | '''
261 | each dimension (x, y, z) should consider floor and ceiling
262 | ix_d:
263 | (0,0,0) => x floor, y floor, z floor
264 | (0,0,1) => x floor, y floor, z ceiling
265 | (0,1,0) => x floor, y ceiling, z floor
266 | (0,1,1) => x floor, y ceiling, z ceiling
267 | (1,0,0) => x ceiling, y floor, z floor
268 | (1,0,1) => x ceiling, y floor, z ceiling
269 | (1,1,0) => x ceiling, y ceiling, z floor
270 | (1,1,1) => x ceiling, y ceiling, z ceiling
271 | '''
272 | wts = torch.ones_like(wts_dim[0][0]) # [bs, 1, 19200]
273 | index = torch.zeros_like(wts_dim[0][0]) # [bs, 1, 19200]
274 | for d in range(n_dims):
275 | # pos_dim[0]=[floor(x), floor(x) + 1]; len(pos_dim[0])=2
276 | # ix_d of first iteration: (0,0,0); ix_d[0]=0
277 | # pos_dim[0][0].shape=[bs, 1, 19200]
278 | index = index * grid_dims[d] + pos_dim[d][ix_d[d]] # [bs, 1, 19200]
279 | wts = wts * wts_dim[d][ix_d[d]] # [bs, 1, 19200]
280 |
281 | index = index.long() #[bs, 1, 19200]
282 |
283 | # grid_flat.shape = [bs, 17, 100*100*80]
284 | # index.expand(-1, F, -1).shape=[1, 17, 19200] => repeat 17 times
285 | # scatter_add_(dim, index, src)
286 | # index = [233960(index=0), 241960, ..., 233960(index=320)]
287 | # src = feat * wts; feat.shape=[bs, 17, 19200]
288 | # grid_flat[233960] = src[0] + src[320]
289 | grid_flat.scatter_add_(2, index.expand(-1, F, -1), feat * wts) # [bs, 17, 100*100*80]
290 | grid_flat = torch.round(grid_flat)
291 |
292 | return grid_flat.view(init_grid.shape) # [bs, 17, 100, 100, 80]
293 |
--------------------------------------------------------------------------------
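A minimal usage sketch for the helpers above, not part of the repository: it chains `get_point_cloud_from_z_t`, `transform_camera_view_t` and `transform_pose_t` on a dummy depth image, then scatters a few synthetic point features with `splat_feat_nd`. The image size, the `Namespace`-based camera intrinsics (`xc`, `zc`, `f`) and all numeric values are illustrative assumptions; only the function signatures come from the file above, which the repository tree suggests is `vlnce_baselines/utils/depth_utils.py`.

```python
# Illustrative sketch only -- values and the Namespace-based camera intrinsics are assumptions.
import numpy as np
import torch
from argparse import Namespace

from vlnce_baselines.utils.depth_utils import (
    get_point_cloud_from_z_t, transform_camera_view_t, transform_pose_t, splat_feat_nd)

device = torch.device("cpu")
H, W, hfov = 120, 160, 90.0
f = W / (2.0 * np.tan(np.deg2rad(hfov) / 2.0))               # focal length in pixels
camera_matrix = Namespace(xc=(W - 1) / 2.0, zc=(H - 1) / 2.0, f=f)

depth = torch.full((1, H, W), 250.0, dtype=torch.float64)    # flat wall 250 cm away
pc = get_point_cloud_from_z_t(depth, camera_matrix, device)                  # (1, H, W, 3)
pc = transform_camera_view_t(pc, sensor_height=88.0,
                             camera_elevation_degree=0.0, device=device)     # add sensor height
pc = transform_pose_t(pc, current_pose=(0.0, 0.0, np.pi / 2.0), device=device)
print(pc.shape)                                              # torch.Size([1, 120, 160, 3])

# splat_feat_nd: scatter per-point features into a voxel grid with trilinear weights.
init_grid = torch.zeros(1, 2, 8, 8, 4)                       # B x nF x W x H x D
feat = torch.ones(1, 2, 5)                                   # 5 points, 2 feature channels
coords = torch.rand(1, 3, 5) * 2 - 1                         # normalized to [-1, 1]
print(splat_feat_nd(init_grid, feat, coords).shape)          # torch.Size([1, 2, 8, 8, 4])
```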
/vlnce_baselines/utils/map_utils.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import torch
3 | import numpy as np
4 | from typing import Tuple, List
5 | import torch.nn.functional as F
6 | from collections.abc import Sequence
7 | from scipy.spatial.distance import cdist
8 | from skimage.morphology import remove_small_objects, closing, disk, dilation
9 |
10 | from vlnce_baselines.utils.constant import *
11 | from vlnce_baselines.utils.pose import get_agent_position, threshold_poses
12 | import time
13 |
14 |
15 | def get_grid(pose: torch.Tensor, grid_size: Tuple, device: torch.device):
16 | """
17 | Input:
18 | `pose` FloatTensor(bs, 3)
19 | `grid_size` 4-tuple (bs, _, grid_h, grid_w)
20 | `device` torch.device (cpu or gpu)
21 | Output:
22 | `rot_grid` FloatTensor(bs, grid_h, grid_w, 2)
23 | `trans_grid` FloatTensor(bs, grid_h, grid_w, 2)
24 |
25 | """
26 | pose = pose.float()
27 | x = pose[:, 0]
28 | y = pose[:, 1]
29 | t = pose[:, 2]
30 |
31 | bs = x.size(0)
32 | t = t * np.pi / 180. # t.shape=[bs]
33 | cos_t = t.cos()
34 | sin_t = t.sin()
35 |
36 | theta11 = torch.stack([cos_t, -sin_t,
37 | torch.zeros(cos_t.shape).float().to(device)], 1) # [bs, 3]
38 | theta12 = torch.stack([sin_t, cos_t,
39 | torch.zeros(cos_t.shape).float().to(device)], 1)
40 | theta1 = torch.stack([theta11, theta12], 1) # [bs, 2, 3] rotation matrix
41 |
42 | theta21 = torch.stack([torch.ones(x.shape).to(device),
43 | -torch.zeros(x.shape).to(device), x], 1)
44 | theta22 = torch.stack([torch.zeros(x.shape).to(device),
45 | torch.ones(x.shape).to(device), y], 1)
46 | theta2 = torch.stack([theta21, theta22], 1) # [bs, 2, 3] translation matrix
47 |
48 | rot_grid = F.affine_grid(theta1, torch.Size(grid_size))
49 | trans_grid = F.affine_grid(theta2, torch.Size(grid_size))
50 |
51 | return rot_grid, trans_grid
52 |
53 |
54 | def get_mask(sx, sy, scale, step_size):
55 | size = int(step_size // scale) * 2 + 1 # size=11
56 | mask = np.zeros((size, size)) # (11,11)
57 | for i in range(size):
58 | for j in range(size):
59 | if ((i + 0.5) - (size // 2 + sx)) ** 2 + \
60 | ((j + 0.5) - (size // 2 + sy)) ** 2 <= \
61 | step_size ** 2 \
62 | and ((i + 0.5) - (size // 2 + sx)) ** 2 + \
63 | ((j + 0.5) - (size // 2 + sy)) ** 2 > \
64 | (step_size - 1) ** 2:
65 | mask[i, j] = 1
66 |
67 | mask[size // 2, size // 2] = 1
68 | return mask
69 |
70 |
71 | def get_dist(sx, sy, scale, step_size):
72 | size = int(step_size // scale) * 2 + 1
73 | mask = np.zeros((size, size)) + 1e-10
74 | for i in range(size):
75 | for j in range(size):
76 | if ((i + 0.5) - (size // 2 + sx)) ** 2 + \
77 | ((j + 0.5) - (size // 2 + sy)) ** 2 <= \
78 | step_size ** 2:
79 | mask[i, j] = max(5,
80 | (((i + 0.5) - (size // 2 + sx)) ** 2 +
81 | ((j + 0.5) - (size // 2 + sy)) ** 2) ** 0.5)
82 | return mask
83 |
84 |
85 | def create_sector_mask(position: Sequence, heading: float, radius: float,
86 | angle: float, map_shape: Sequence):
87 | """
88 | The "position" argument comes from the full pose, which uses standard Cartesian coordinates.
89 | """
90 | mask = np.zeros(map_shape)
91 | heading = (360 - heading) % 360
92 | angle_high = (heading + angle / 2) % 360
93 | angle_low = (heading - angle / 2) % 360
94 |
95 | y, x = np.meshgrid(np.arange(map_shape[0]) - position[0], np.arange(map_shape[1]) - position[1])
96 | distance = np.sqrt(x**2 + y**2)
97 | angle = np.arctan2(x, y) * 180 / np.pi
98 | angle = (360 - angle) % 360
99 |
100 | valid_distance = distance <= radius
101 | if angle_high > angle_low:
102 | valid_angle = (angle_low <= angle) & (angle <= angle_high)
103 | else:
104 | valid_angle = (angle_low <= angle) | (angle <= angle_high)
105 | mask[valid_distance & valid_angle] = 1
106 |
107 | return mask
108 |
109 |
110 | def get_collision_mask(known_vector: np.ndarray, mask_data: np.ndarray, angle_threshold: float):
111 | collision_map = np.zeros_like(mask_data)
112 | center = np.array(mask_data.shape) // 2
113 |
114 | rows, cols = np.indices(mask_data.shape)
115 | rows_from_center = rows - center[0]
116 | cols_from_center = cols - center[1]
117 |
118 | nonzero_indices = np.nonzero(mask_data)
119 |
120 | vectors = np.array([rows_from_center[nonzero_indices], cols_from_center[nonzero_indices]])
121 | vector_lengths = np.linalg.norm(vectors, axis=0)
122 | known_vector_length = np.linalg.norm(known_vector)
123 | rotation_matrix = np.array([[0, -1],
124 | [1, 0]])
125 | known_vector = np.dot(rotation_matrix, known_vector)
126 | known_vector_expanded = np.tile(known_vector[:, np.newaxis], vectors.shape[1])
127 |
128 | cos_angles = np.sum(known_vector_expanded * vectors, axis=0) / (known_vector_length * vector_lengths + 1e-10)
129 | angles_rad = np.arccos(np.clip(cos_angles, -1.0, 1.0))
130 | angles_deg = np.degrees(angles_rad)
131 | print("angles: ", angles_deg)
132 | collision_map[nonzero_indices[0][angles_deg <= angle_threshold],
133 | nonzero_indices[1][angles_deg <= angle_threshold]] = 1
134 |
135 | return collision_map
136 |
137 |
138 | def process_navigable_classes(classes: List):
139 | classes = [item.strip().lower() for item in classes]
140 | common_items = set(navigable_classes) & set(classes)
141 | if len(common_items) > 0:
142 | navigable_index = [classes.index(item) for item in common_items]
143 | else:
144 | navigable_index = []
145 |
146 | return navigable_index
147 |
148 |
149 | def get_obstacle(map: np.ndarray, kernel_size: int=3) -> np.ndarray:
150 | """
151 | The agent radius is 0.18m and the map resolution is 5cm, so the agent
152 | occupies at least an 8*8 square (64 cells) in the map.
153 | We first remove small objects (treated as noise) and then apply
154 | morphological closing, so min_size=64, which happens to be the
155 | default value of min_size, is a good choice.
156 | """
157 | obstacle = map[0, ...]
158 | obstacle = remove_small_objects(
159 | obstacle.astype(bool),
160 | min_size=64, # you can try different minimum object size
161 | connectivity=2) # full 8-connectivity on a 2D map; values above ndim behave the same
162 | selem = disk(kernel_size)
163 | obstacle = closing(obstacle, selem)
164 |
165 | return obstacle.astype(bool)
166 |
167 |
168 | def get_objects(map: np.ndarray, classes: List, kernel_size: int=3) -> Tuple:
169 | navigable = np.zeros(map.shape[-2:])
170 | navigable_index = process_navigable_classes(classes)
171 | objects = np.zeros(map.shape[-2:])
172 | for i, obj in enumerate(map[map_channels:, ...]):
173 | obj = remove_small_objects(obj.astype(bool), min_size=64)
174 | obj = closing(obj, selem=disk(kernel_size))
175 | if i in navigable_index:
176 | navigable = np.logical_or(navigable, obj)
177 | else:
178 | objects = np.logical_or(obj, objects)
179 |
180 | return objects.astype(bool), navigable.astype(bool)
181 |
182 |
183 | def get_explored_area(map: np.ndarray, kernel_size: int=3) -> np.ndarray:
184 | """
185 | When extracting a large area such as the explored area, we prefer to do
186 | morphological closing first and then remove small objects.
187 | The agent occupies an 8*8 area and one step is 0.25m, which spans 5 cells,
188 | so a 20*20 area is enough for the agent to take one step in each of the four directions.
189 | """
190 | explored_area = map[1, ...]
191 | selem = disk(kernel_size)
192 | explored_area = closing(explored_area, selem)
193 | explored_area = remove_small_objects(explored_area.astype(bool), min_size=400)
194 |
195 | return explored_area
196 |
197 |
198 | def process_floor(map: np.ndarray, classes: List, kernel_size: int=3) -> np.ndarray:
199 | """
200 | We don't use get_objects() and get_explored_area() here because
201 | we want to extract the floor area more precisely, so the
202 | morphological closing is done as the last step.
203 | """
204 | t1 = time.time()
205 | navigable_index = process_navigable_classes(classes)
206 | navigable = np.zeros(map.shape[-2:])
207 | explored_area = map[1, ...]
208 | obstacles = map[0, ...]
209 | objects = np.zeros(map.shape[-2:])
210 | for i, obj in enumerate(map[map_channels:, ...]):
211 | if i in navigable_index:
212 | navigable += obj
213 | else:
214 | objects += obj
215 | free_mask = 1 - np.logical_or(obstacles, objects)
216 | free_mask = np.logical_or(free_mask, navigable)
217 | free_space = explored_area * free_mask
218 | floor = remove_small_objects(free_space.astype(bool), min_size=400)
219 | floor = closing(floor, selem=disk(kernel_size))
220 | t2 = time.time()
221 | print("process floor cost time: ", t2 - t1)
222 | return floor
223 |
224 |
225 | def find_frontiers(map: np.ndarray, classes: List) -> np.ndarray:
226 | floor = get_floor_area(map, classes)
227 | explored_area = get_explored_area(map)
228 | contours, _ = cv2.findContours(explored_area.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
229 | image = np.zeros(map.shape[-2:], dtype=np.uint8)
230 | image = cv2.drawContours(image, contours, -1, (255, 255, 255), thickness=3)
231 | res = np.logical_and(floor, image)
232 | res = dilation(res, selem=disk(2))
233 | res = remove_small_objects(res.astype(bool), min_size=64)
234 |
235 | return res.astype(np.uint8)
236 |
237 |
238 | def get_traversible_area(map: np.ndarray, classes: List) -> np.ndarray:
239 | """
240 | Sometimes there may be holes in enclosed obstacle areas;
241 | this function aims to fill these holes.
242 | """
243 | objects, navigable = get_objects(map, classes)
244 | obstacles = get_obstacle(map)
245 | untraversible = np.logical_or(objects, obstacles)
246 | untraversible[navigable == 1] = 0
247 | untraversible = remove_small_objects(untraversible, min_size=64)
248 | untraversible = closing(untraversible, selem=disk(3))
249 | traversible = 1 - untraversible
250 |
251 | return traversible
252 |
253 | def get_floor_area(map: np.ndarray, classes: List) -> np.ndarray:
254 | """
255 | Find the traversible areas that are connected with the floor area.
256 | """
257 | traversible = get_traversible_area(map, classes)
258 | floor = process_floor(map, classes)
259 | res = np.logical_xor(floor, traversible)
260 | res = remove_small_objects(res, min_size=64)
261 | nb_components, output, stats, centroids = cv2.connectedComponentsWithStats(res.astype(np.uint8))
262 | if nb_components > 2:
263 | areas = [np.sum(output == i) for i in range(1, nb_components)]
264 | max_id = areas.index(max(areas)) + 1
265 | for i in range(1, nb_components):
266 | if i != max_id:
267 | floor = np.logical_or(floor, output==i)
268 |
269 | return floor.astype(bool)
270 |
271 |
272 | def get_nearest_nonzero_waypoint(arr: np.ndarray, start: Sequence) -> np.ndarray:
273 | nonzero_indices = np.argwhere(arr != 0)
274 | if len(nonzero_indices) > 0:
275 | distances = cdist([start], nonzero_indices)
276 | nearest_index = np.argmin(distances)
277 |
278 | return np.array(nonzero_indices[nearest_index])
279 | else:
280 | return np.array([int(start[0]), int(start[1])])
281 |
282 |
283 | def angle_between_vectors(vector1: np.ndarray, vector2: np.ndarray) -> np.ndarray:
284 | dot_product = np.dot(vector1, vector2)
285 | vector1_length = np.linalg.norm(vector1)
286 | vector2_length = np.linalg.norm(vector2)
287 | angle = np.arccos(dot_product / (vector1_length * vector2_length))
288 |
289 | cross_product = np.cross(vector1, vector2)
290 | if cross_product == 0 and vector1[0] == vector2[0] * -1:
291 | return 180.
292 | signed_angle = np.sign(cross_product) * angle
293 | angle_degrees = np.degrees(signed_angle)
294 |
295 | return angle_degrees
296 |
297 |
298 | def angle_to_vector(angle: float) -> np.ndarray:
299 | angle_rad = np.radians(angle)
300 | x = np.cos(angle_rad)
301 | y = np.sin(angle_rad)
302 |
303 | return np.array([x, y])
304 |
305 |
306 | def process_destination(destination: np.ndarray, full_map: np.ndarray, classes: List) -> np.ndarray:
307 | """
308 | The destination mask may consist of small objects, so we dilate it first
309 | and then remove small objects.
310 | """
311 | floor = process_floor(full_map, classes)
312 | traversible = get_traversible_area(full_map, classes)
313 | destination = dilation(destination, selem=disk(5))
314 | destination = remove_small_objects(destination.astype(bool), min_size=64).astype(np.uint8)
315 | nb_components, output, stats, centroids = cv2.connectedComponentsWithStats(destination)
316 | if len(centroids) > 1:
317 | centroid = centroids[1] # the first one is background
318 | waypoint = np.array([int(centroid[1]), int(centroid[0])])
319 | waypoint = get_nearest_nonzero_waypoint(np.logical_and(floor, traversible), waypoint)
320 | return waypoint
321 | else:
322 | return None
323 |
324 | def process_destination2(destination: np.ndarray, floor: np.ndarray, traversible: np.ndarray) -> np.ndarray:
325 | """
326 | The destination mask may consist of small objects, so we dilate it first
327 | and then remove small objects.
328 | """
329 | destination = dilation(destination, selem=disk(5))
330 | destination = remove_small_objects(destination.astype(bool), min_size=64).astype(np.uint8)
331 | nb_components, output, stats, centroids = cv2.connectedComponentsWithStats(destination)
332 | if len(centroids) > 1:
333 | centroid = centroids[1] # the first one is background
334 | waypoint = np.array([int(centroid[1]), int(centroid[0])])
335 | waypoint = get_nearest_nonzero_waypoint(traversible, waypoint)
336 | return waypoint
337 | else:
338 | return None
339 |
340 |
341 | def angle_and_direction(a: np.ndarray, b: np.ndarray, turn_angle: float) -> Tuple:
342 | unit_a = a / (np.linalg.norm(a) + 1e-5)
343 | unit_b = b / (np.linalg.norm(b) + 1e-5)
344 |
345 | cross_product = np.cross(unit_a, unit_b)
346 | dot_product = np.dot(unit_a, unit_b)
347 |
348 | angle = np.arccos(dot_product)
349 | angle_degrees = np.degrees(angle)
350 |
351 | if cross_product > 0 and angle_degrees >= (turn_angle / 2 + 0.01):
352 | direction = 3 # right
353 | # print("turn right", angle_degrees)
354 | elif cross_product < 0 and angle_degrees >= (turn_angle / 2):
355 | direction = 2 # left
356 | # print("turn left", angle_degrees)
357 | elif cross_product == 0 and angle_degrees == 180:
358 | direction = 3
359 | else:
360 | direction = 1 # forward
361 | # print("go forward", angle_degrees, cross_product)
362 |
363 | return angle_degrees, direction
364 |
365 |
366 | def closest_point_within_threshold(points_array: np.ndarray, target_point: np.ndarray, threshold: float) -> int:
367 | """Find the point within the threshold distance that is closest to the target_point.
368 |
369 | Args:
370 | points_array (np.ndarray): An array of 2D points, where each point is a tuple
371 | (x, y).
372 | target_point (np.ndarray): The target 2D point (x, y).
373 | threshold (float): The maximum distance threshold.
374 |
375 | Returns:
376 | int: The index of the closest point within the threshold distance, or -1 if no point is within the threshold.
377 | """
378 | distances = np.sqrt((points_array[:, 0] - target_point[0]) ** 2 + (points_array[:, 1] - target_point[1]) ** 2)
379 | within_threshold = distances <= threshold
380 |
381 | if np.any(within_threshold):
382 | closest_index = np.argmin(distances)
383 | return int(closest_index)
384 |
385 | return -1
386 |
387 |
388 | def collision_check(last_pose: np.ndarray, current_pose: np.ndarray,
389 | resolution: float, map_shape: Sequence,
390 | collision_threshold: float=0.2,
391 | width: float=0.4, height: float=1.5, buf: float=0.2) -> np.ndarray:
392 | last_position, last_heading = get_agent_position(last_pose, resolution)
393 | x0, y0 = last_position
394 | current_position, _ = get_agent_position(current_pose, resolution)
395 | position_vector = current_position - last_position
396 | displacement = np.linalg.norm(position_vector)
397 | collision_map = np.zeros(map_shape)
398 | print("displacement: ", displacement)
399 |
400 | if displacement < collision_threshold * 100 / resolution:
401 | print("!!!!!!!!! COLLISION !!!!!!!!!")
402 | theta = np.deg2rad(last_heading)
403 | width_range = int(width * 100 / resolution)
404 | height_range = int(height * 100 / resolution)
405 | # width_range = int((0.25 - displacement) * 100 / resolution * 2)
406 | # height_range = int((0.25 - displacement) * 100 / resolution * 2)
407 | buf = displacement * 100 / resolution + 3
408 |
409 | for i in range(height_range):
410 | for j in range(width_range):
411 | l1 = j + buf
412 | l2 = i - width_range // 2
413 | dy = l1 * np.cos(theta) + l2 * np.sin(theta) # change to ndarray coordinate
414 | dx = l1 * np.sin(theta) - l2 * np.cos(theta) # change to ndarray coordinate
415 | x1 = int(x0 - dx)
416 | y1 = int(y0 + dy)
417 | x1, y1 = threshold_poses([x1, y1], collision_map.shape)
418 | collision_map[x1, y1] = 1
419 |
420 | dy = l1 * np.cos(theta) - l2 * np.sin(theta) # change to ndarray coordinate
421 | dx = l1 * np.sin(theta) + l2 * np.cos(theta) # change to ndarray coordinate
422 | x1 = int(x0 - dx)
423 | y1 = int(y0 + dy)
424 | x1, y1 = threshold_poses([x1, y1], collision_map.shape)
425 | collision_map[x1, y1] = 1
426 | collision_map = closing(collision_map, selem=disk(1))
427 |
428 | return collision_map
429 |
430 |
431 | def calculate_displacement(last_pose: np.ndarray, current_pose: np.ndarray, resolution: float):
432 | last_position, last_heading = get_agent_position(last_pose, resolution)
433 | current_position, current_heading = get_agent_position(current_pose, resolution)
434 | x, y = current_position
435 | position_vector = current_position - last_position
436 | displacement = np.linalg.norm(position_vector)
437 |
438 | return displacement
439 |
440 | def collision_check_fmm(last_pose: np.ndarray, current_pose: np.ndarray,
441 | resolution: float, map_shape: Sequence,
442 | collision_threshold: float=0.2,
443 | width: float=0.4, height: float=1.5, buf: float=0.2) -> np.ndarray:
444 | last_position, last_heading = get_agent_position(last_pose, resolution)
445 | current_position, current_heading = get_agent_position(current_pose, resolution)
446 | x, y = current_position
447 | position_vector = current_position - last_position
448 | displacement = np.linalg.norm(position_vector)
449 | collision_map = np.zeros(map_shape)
450 | collision_mask = None
451 | print("displacement: ", displacement)
452 |
453 | if displacement < collision_threshold * 100 / resolution:
454 | print("!!!!!!!!! COLLISION !!!!!!!!!")
455 | dx, dy = x - int(x), y - int(y)
456 | mask = get_mask(dx, dy, scale=1, step_size=5)
457 | heading_vector = angle_to_vector(current_heading)
458 | collision_mask = get_collision_mask(heading_vector, mask, 32)
459 | x, y = int(x), int(y)
460 | if x - 5 >= 0 and x + 6 < map_shape[0] and y - 5 >= 0 and y + 6 < map_shape[1]:
461 | collision_map[x - 5 : x + 6, y - 5 : y + 6] = collision_mask
462 |
463 | return collision_map
--------------------------------------------------------------------------------
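A small hedged sketch exercising a few of the geometric helpers above in isolation. The grid size, agent position, frontier cell and heading/goal vectors are arbitrary illustrative values; only the three function signatures come from map_utils.py.

```python
# Illustrative sketch only -- map size, positions and angles are made-up values.
import numpy as np
from vlnce_baselines.utils.map_utils import (
    create_sector_mask, get_nearest_nonzero_waypoint, angle_and_direction)

map_shape = (100, 100)
agent_pos = (50, 50)                                     # (row, col) in map cells

# Cells inside a 90-degree field of view of radius 20 cells around the agent.
fov_mask = create_sector_mask(agent_pos, heading=0.0, radius=20.0,
                              angle=90.0, map_shape=map_shape)

# Snap a start position to the nearest non-zero cell of a (here trivial) frontier map.
frontier = np.zeros(map_shape)
frontier[60, 70] = 1
waypoint = get_nearest_nonzero_waypoint(frontier, start=agent_pos)

# Decide whether to turn left/right or go forward given heading and goal vectors.
angle_deg, action = angle_and_direction(np.array([1.0, 0.0]), np.array([0.0, 1.0]), turn_angle=30.0)
print(int(fov_mask.sum()), waypoint, angle_deg, action)  # ..., [60 70], 90.0, 3 (turn right)
```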
/vlnce_baselines/utils/pose.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from typing import List, Tuple
3 | from collections.abc import Sequence
4 |
5 |
6 | def get_agent_position(agent_pose: Sequence, resolution: float) -> Tuple[np.ndarray, float]:
7 | x, y, heading = agent_pose
8 | heading *= -1
9 | x, y = x * (100 / resolution), y * (100 / resolution)
10 | position = np.array([y, x])
11 |
12 | return position, heading
13 |
14 |
15 | def threshold_poses(coords: List, shape: Sequence) -> List:
16 | coords[0] = min(max(0, coords[0]), shape[0] - 1)
17 | coords[1] = min(max(0, coords[1]), shape[1] - 1)
18 |
19 | return coords
--------------------------------------------------------------------------------
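For reference, a tiny hedged example of the two pose helpers above; the pose and resolution values are arbitrary.

```python
# Illustrative sketch only -- the pose and resolution values are arbitrary.
import numpy as np
from vlnce_baselines.utils.pose import get_agent_position, threshold_poses

pose = np.array([2.0, 1.5, 30.0])                     # x (m), y (m), heading (deg)
position, heading = get_agent_position(pose, resolution=5)   # 5 cm per map cell
print(position, heading)                              # [30. 40.] -30.0  -> (row, col), negated heading

print(threshold_poses([120, -3], shape=(100, 100)))   # [99, 0] -- clamped inside the map
```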
/vlnce_baselines/utils/rotation_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 The TensorFlow Authors All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Utilities for generating and applying rotation matrices.
17 | """
18 | import numpy as np
19 |
20 | ANGLE_EPS = 0.001
21 |
22 |
23 | def normalize(v):
24 | return v / np.linalg.norm(v)
25 |
26 |
27 | def get_r_matrix(ax_, angle):
28 | ax = normalize(ax_)
29 | if np.abs(angle) > ANGLE_EPS:
30 | S_hat = np.array(
31 | [[0.0, -ax[2], ax[1]], [ax[2], 0.0, -ax[0]], [-ax[1], ax[0], 0.0]],
32 | dtype=np.float32)
33 | R = np.eye(3) + np.sin(angle) * S_hat + \
34 | (1 - np.cos(angle)) * (np.linalg.matrix_power(S_hat, 2))
35 | else:
36 | R = np.eye(3)
37 | return R
38 |
39 |
40 | def r_between(v_from_, v_to_):
41 | v_from = normalize(v_from_)
42 | v_to = normalize(v_to_)
43 | ax = normalize(np.cross(v_from, v_to))
44 | angle = np.arccos(np.dot(v_from, v_to))
45 | return get_r_matrix(ax, angle)
46 |
47 |
48 | def rotate_camera_to_point_at(up_from, lookat_from, up_to, lookat_to):
49 | inputs = [up_from, lookat_from, up_to, lookat_to]
50 | for i in range(4):
51 | inputs[i] = normalize(np.array(inputs[i]).reshape((-1,)))
52 | up_from, lookat_from, up_to, lookat_to = inputs
53 | r1 = r_between(lookat_from, lookat_to)
54 |
55 | new_x = np.dot(r1, np.array([1, 0, 0]).reshape((-1, 1))).reshape((-1))
56 | to_x = normalize(np.cross(lookat_to, up_to))
57 | angle = np.arccos(np.dot(new_x, to_x))
58 | if angle > ANGLE_EPS:
59 | if angle < np.pi - ANGLE_EPS:
60 | ax = normalize(np.cross(new_x, to_x))
61 | flip = np.dot(lookat_to, ax)
62 | if flip > 0:
63 | r2 = get_r_matrix(lookat_to, angle)
64 | elif flip < 0:
65 | r2 = get_r_matrix(lookat_to, -1. * angle)
66 | else:
67 | # Angle of rotation is too close to 180 degrees, direction of
68 | # rotation does not matter.
69 | r2 = get_r_matrix(lookat_to, angle)
70 | else:
71 | r2 = np.eye(3)
72 | return np.dot(r2, r1)
73 |
--------------------------------------------------------------------------------
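As a quick sanity check (not part of the repository), the sketch below rotates the x-axis about z with `get_r_matrix` and recovers the same rotation with `r_between`.

```python
# Illustrative sketch only.
import numpy as np
from vlnce_baselines.utils.rotation_utils import get_r_matrix, r_between

R = get_r_matrix([0., 0., 1.], angle=np.pi / 2)     # 90-degree rotation about the z axis
print(np.round(R @ np.array([1., 0., 0.]), 3))      # [0. 1. 0.]

R2 = r_between([1., 0., 0.], [0., 1., 0.])          # rotation taking the x-axis onto the y-axis
print(np.allclose(R, R2, atol=1e-6))                # True
```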
/vlnce_baselines/utils/visualization.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import numpy as np
3 | from collections.abc import Sequence
4 | from vlnce_baselines.utils.constant import legend_color_palette
5 |
6 |
7 | def get_contour_points(pos, origin, size=20):
8 | x, y, o = pos
9 | pt1 = (int(x) + origin[0],
10 | int(y) + origin[1])
11 | pt2 = (int(x + size / 1.5 * np.cos(o + np.pi * 4 / 3)) + origin[0],
12 | int(y + size / 1.5 * np.sin(o + np.pi * 4 / 3)) + origin[1])
13 | pt3 = (int(x + size * np.cos(o)) + origin[0],
14 | int(y + size * np.sin(o)) + origin[1])
15 | pt4 = (int(x + size / 1.5 * np.cos(o - np.pi * 4 / 3)) + origin[0],
16 | int(y + size / 1.5 * np.sin(o - np.pi * 4 / 3)) + origin[1])
17 |
18 | return np.array([pt1, pt2, pt3, pt4])
19 |
20 |
21 | def draw_line(start, end, mat, steps=25, w=1):
22 | for i in range(steps + 1):
23 | x = int(np.rint(start[0] + (end[0] - start[0]) * i / steps))
24 | y = int(np.rint(start[1] + (end[1] - start[1]) * i / steps))
25 | mat[x - w:x + w, y - w:y + w] = 1
26 | return mat
27 |
28 |
29 | def init_vis_image():
30 | vis_image = np.ones((755, 1165, 3)).astype(np.uint8) * 255
31 | font = cv2.FONT_HERSHEY_SIMPLEX
32 | fontScale = 1
33 | color = (20, 20, 20) # BGR
34 | thickness = 2
35 |
36 | text = "Goal: "
37 | textsize = cv2.getTextSize(text, font, fontScale, thickness)[0]
38 | textX = (640 - textsize[0]) // 2 + 15
39 | textY = (50 + textsize[1]) // 2
40 | vis_image = cv2.putText(vis_image, text, (textX, textY),
41 | font, fontScale, color, thickness,
42 | cv2.LINE_AA)
43 |
44 | text = "Predicted Semantic Map"
45 | textsize = cv2.getTextSize(text, font, fontScale, thickness)[0]
46 | textX = 640 + (480 - textsize[0]) // 2 + 30
47 | textY = (50 + textsize[1]) // 2
48 | vis_image = cv2.putText(vis_image, text, (textX, textY),
49 | font, fontScale, color, thickness,
50 | cv2.LINE_AA)
51 |
52 | color = [100, 100, 100]
53 | vis_image[49, 15:655] = color
54 | vis_image[49, 670:1150] = color
55 | vis_image[50:530, 14] = color
56 | vis_image[50:530, 655] = color
57 | vis_image[50:530, 669] = color
58 | vis_image[50:530, 1150] = color
59 | vis_image[530, 15:655] = color
60 | vis_image[530, 670:1150] = color
61 |
62 | vis_image = add_class(vis_image, 0, "out of map", legend_color_palette)
63 | vis_image = add_class(vis_image, 1, "obstacle", legend_color_palette)
64 | vis_image = add_class(vis_image, 2, "free space", legend_color_palette)
65 | vis_image = add_class(vis_image, 3, "agent trajectory", legend_color_palette)
66 | vis_image = add_class(vis_image, 4, "waypoint", legend_color_palette)
67 |
68 | return vis_image
69 |
70 |
71 | def add_class(vis_image, id, name, color_palette):
72 | text = f"{id}:{name}"
73 | font_color = (0, 0, 0)
74 | class_color = list(color_palette[id])
75 | class_color.reverse() # RGB -> BGR (OpenCV uses BGR)
76 | fontScale = 0.6
77 | thickness = 1
78 | font = cv2.FONT_HERSHEY_SIMPLEX
79 | h, w = 20, 50
80 | start_x, start_y = 25, 540 # x is horizontal and points right, y is vertical and points down
81 | gap_x, gap_y = 280, 30
82 |
83 | x1 = start_x + (id % 4) * gap_x
84 | y1 = start_y + (id // 4) * gap_y
85 | x2 = x1 + w
86 | y2 = y1 + h
87 | text_x = x2 + 10
88 | text_y = (y1 + y2) // 2 + 5
89 |
90 | cv2.rectangle(vis_image, (x1, y1), (x2, y2), class_color, thickness=cv2.FILLED)
91 | vis_image = cv2.putText(vis_image, text, (text_x, text_y),
92 | font, fontScale, font_color, thickness, cv2.LINE_AA)
93 |
94 | return vis_image
95 |
96 |
97 | def add_text(image: np.ndarray, text: str, position: Sequence):
98 | textsize = cv2.getTextSize(text, fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, thickness=1)[0]
99 | x, y = position
100 | x += (480 - textsize[0]) // 2
101 | image = cv2.putText(image, text, (x, y), fontFace=cv2.FONT_HERSHEY_SIMPLEX,
102 | fontScale=1, color=0, thickness=1, lineType=cv2.LINE_AA)
103 |
104 | return image
--------------------------------------------------------------------------------
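Finally, a hedged sketch of the visualization helpers above: it builds the legend canvas, stamps a caption and rasterizes a short trajectory segment. The caption text and coordinates are arbitrary; `legend_color_palette` is whatever `constant.py` defines.

```python
# Illustrative sketch only -- caption text and coordinates are arbitrary.
import numpy as np
from vlnce_baselines.utils.visualization import init_vis_image, add_text, draw_line

canvas = init_vis_image()                          # 755 x 1165 white canvas with titles + legend
canvas = add_text(canvas, "episode 0", (670, 560))

trajectory = np.zeros((480, 480))
trajectory = draw_line(start=(100, 100), end=(200, 260), mat=trajectory, steps=25, w=1)
print(canvas.shape, int(trajectory.sum()))         # (755, 1165, 3) <number of marked cells>
```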