├── img └── t.png ├── .gitignore ├── requirements.txt ├── configs ├── COCO-InstanceSegmentation │ └── mask_rcnn_R_50_FPN_3x.yaml └── Base-RCNN-FPN.yaml ├── envs ├── utils │ ├── pose.py │ ├── rotation_utils.py │ ├── map_builder.py │ ├── fmm_planner.py │ ├── depth_utils.py │ └── depth_utils_multi.py ├── habitat │ ├── configs │ │ └── tasks │ │ │ ├── objectnav_gibson.yaml │ │ │ └── objectnav_hm3d.yaml │ ├── __init__.py │ ├── objectgoal_env21.py │ └── objectgoal_env.py └── __init__.py ├── README.md ├── agents ├── utils │ ├── visualization.py │ └── semantic_prediction.py └── sem_exp.py ├── utils ├── tensorboard_utils.py └── model.py ├── constants.py ├── RedNet ├── utils.py └── RedNet_model.py ├── arguments.py ├── model_multi.py └── get_point_score.py /img/t.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shalexyuan/GAMap/HEAD/img/t.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | RedNet/model/* 2 | vis/* 3 | tmp/* 4 | data/* 5 | *.symlink 6 | **/__pycache__/ 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-fmm==2019.1.30 2 | scikit-learn==0.22.2.post1 3 | scikit-image==0.15.0 4 | numpy>=1.20.2 5 | ifcfg 6 | transformers==4.23.1 7 | tensorboard 8 | -------------------------------------------------------------------------------- /configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /envs/utils/pose.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def get_l2_distance(x1, x2, y1, y2): 5 | """ 6 | Computes the L2 distance between two points. 7 | """ 8 | return ((x1 - x2) ** 2 + (y1 - y2) ** 2) ** 0.5 9 | 10 | 11 | def get_rel_pose_change(pos2, pos1): 12 | x1, y1, o1 = pos1 13 | x2, y2, o2 = pos2 14 | 15 | theta = np.arctan2(y2 - y1, x2 - x1) - o1 16 | dist = get_l2_distance(x1, x2, y1, y2) 17 | dx = dist * np.cos(theta) 18 | dy = dist * np.sin(theta) 19 | do = o2 - o1 20 | 21 | return dx, dy, do 22 | 23 | 24 | def get_new_pose(pose, rel_pose_change): 25 | x, y, o = pose 26 | dx, dy, do = rel_pose_change 27 | 28 | global_dx = dx * np.sin(np.deg2rad(o)) + dy * np.cos(np.deg2rad(o)) 29 | global_dy = dx * np.cos(np.deg2rad(o)) - dy * np.sin(np.deg2rad(o)) 30 | x += global_dy 31 | y += global_dx 32 | o += np.rad2deg(do) 33 | if o > 180.: 34 | o -= 360. 
35 | 36 | return x, y, o 37 | 38 | 39 | def threshold_poses(coords, shape): 40 | coords[0] = min(max(0, coords[0]), shape[0] - 1) 41 | coords[1] = min(max(0, coords[1]), shape[1] - 1) 42 | return coords 43 | -------------------------------------------------------------------------------- /envs/habitat/configs/tasks/objectnav_gibson.yaml: -------------------------------------------------------------------------------- 1 | ENVIRONMENT: 2 | MAX_EPISODE_STEPS: 500 3 | SIMULATOR: 4 | TURN_ANGLE: 30 5 | TILT_ANGLE: 30 6 | ACTION_SPACE_CONFIG: "v1" 7 | AGENT_0: 8 | SENSORS: ['RGB_SENSOR', 'DEPTH_SENSOR'] 9 | HEIGHT: 0.88 10 | RADIUS: 0.18 11 | HABITAT_SIM_V0: 12 | GPU_DEVICE_ID: 0 13 | ALLOW_SLIDING: False 14 | SEMANTIC_SENSOR: 15 | WIDTH: 640 16 | HEIGHT: 480 17 | HFOV: 79 18 | POSITION: [0, 0.88, 0] 19 | RGB_SENSOR: 20 | WIDTH: 640 21 | HEIGHT: 480 22 | HFOV: 79 23 | POSITION: [0, 0.88, 0] 24 | DEPTH_SENSOR: 25 | WIDTH: 640 26 | HEIGHT: 480 27 | HFOV: 79 28 | MIN_DEPTH: 0.5 29 | MAX_DEPTH: 5.0 30 | POSITION: [0, 0.88, 0] 31 | TASK: 32 | TYPE: ObjectNav-v1 33 | POSSIBLE_ACTIONS: ["STOP", "MOVE_FORWARD", "TURN_LEFT", "TURN_RIGHT", "LOOK_UP", "LOOK_DOWN"] 34 | SENSORS: ['GPS_SENSOR', 'COMPASS_SENSOR'] 35 | MEASUREMENTS: ['DISTANCE_TO_GOAL', 'SUCCESS', 'SPL'] 36 | GOAL_SENSOR_UUID: objectgoal 37 | SUCCESS: 38 | SUCCESS_DISTANCE: 0.5 39 | 40 | DATASET: 41 | TYPE: PointNav-v1 42 | SPLIT: train 43 | DATA_PATH: "data/datasets/objectnav/gibson/v1.1/{split}/{split}.json.gz" 44 | EPISODES_DIR: "data/datasets/objectnav/gibson/v1.1/{split}/" 45 | SCENES_DIR: "/data/p305574/data/scene_datasets" 46 | -------------------------------------------------------------------------------- /envs/habitat/configs/tasks/objectnav_hm3d.yaml: -------------------------------------------------------------------------------- 1 | ENVIRONMENT: 2 | MAX_EPISODE_STEPS: 500 3 | SIMULATOR: 4 | TURN_ANGLE: 30 5 | TILT_ANGLE: 30 6 | ACTION_SPACE_CONFIG: "v1" 7 | AGENT_0: 8 | SENSORS: ['RGB_SENSOR', 'DEPTH_SENSOR', 'SEMANTIC_SENSOR'] 9 | HEIGHT: 0.88 10 | RADIUS: 0.18 11 | HABITAT_SIM_V0: 12 | GPU_DEVICE_ID: 0 13 | ALLOW_SLIDING: False 14 | SEMANTIC_SENSOR: 15 | WIDTH: 640 16 | HEIGHT: 480 17 | HFOV: 79 18 | POSITION: [0, 0.88, 0] 19 | RGB_SENSOR: 20 | WIDTH: 640 21 | HEIGHT: 480 22 | HFOV: 79 23 | POSITION: [0, 0.88, 0] 24 | DEPTH_SENSOR: 25 | WIDTH: 640 26 | HEIGHT: 480 27 | HFOV: 79 28 | MIN_DEPTH: 0.5 29 | MAX_DEPTH: 5.0 30 | POSITION: [0, 0.88, 0] 31 | SCENE_DATASET: "data/scene_datasets/hm3d/hm3d_annotated_basis.scene_dataset_config.json" 32 | TASK: 33 | TYPE: ObjectNav-v1 34 | POSSIBLE_ACTIONS: ["STOP", "MOVE_FORWARD", "TURN_LEFT", "TURN_RIGHT", "LOOK_UP", "LOOK_DOWN"] 35 | SENSORS: ['OBJECTGOAL_SENSOR', 'GPS_SENSOR', 'COMPASS_SENSOR'] 36 | GOAL_SENSOR_UUID: objectgoal 37 | MEASUREMENTS: ['DISTANCE_TO_GOAL', 'SUCCESS', 'SPL'] 38 | DISTANCE_TO_GOAL: 39 | DISTANCE_TO: VIEW_POINTS 40 | SUCCESS: 41 | SUCCESS_DISTANCE: 0.1 42 | 43 | DATASET: 44 | TYPE: ObjectNav-v1 45 | SPLIT: val 46 | DATA_PATH: "data/objectgoal_hm3d/{split}/{split}.json.gz" 47 | SCENES_DIR: "data/scene_datasets" 48 | 49 | -------------------------------------------------------------------------------- /configs/Base-RCNN-FPN.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | BACKBONE: 4 | NAME: "build_resnet_fpn_backbone" 5 | RESNETS: 6 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 7 | FPN: 8 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 9 | ANCHOR_GENERATOR: 10 | SIZES: 
[[32], [64], [128], [256], [512]] # One size for each in feature map 11 | ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps) 12 | RPN: 13 | IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"] 14 | PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level 15 | PRE_NMS_TOPK_TEST: 1000 # Per FPN level 16 | # Detectron1 uses 2000 proposals per-batch, 17 | # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue) 18 | # which is approximately 1000 proposals per-image since the default batch size for FPN is 2. 19 | POST_NMS_TOPK_TRAIN: 1000 20 | POST_NMS_TOPK_TEST: 1000 21 | ROI_HEADS: 22 | NAME: "StandardROIHeads" 23 | IN_FEATURES: ["p2", "p3", "p4", "p5"] 24 | ROI_BOX_HEAD: 25 | NAME: "FastRCNNConvFCHead" 26 | NUM_FC: 2 27 | POOLER_RESOLUTION: 7 28 | ROI_MASK_HEAD: 29 | NAME: "MaskRCNNConvUpsampleHead" 30 | NUM_CONV: 4 31 | POOLER_RESOLUTION: 14 32 | DATASETS: 33 | TRAIN: ("coco_2017_train",) 34 | TEST: ("coco_2017_val",) 35 | SOLVER: 36 | IMS_PER_BATCH: 16 37 | BASE_LR: 0.02 38 | STEPS: (60000, 80000) 39 | MAX_ITER: 90000 40 | INPUT: 41 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 42 | VERSION: 2 43 | -------------------------------------------------------------------------------- /envs/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .habitat import construct_envs, construct_envs21 4 | 5 | 6 | def make_vec_envs(args): 7 | if args.task_config == "tasks/objectnav_gibson.yaml": 8 | envs = construct_envs(args) 9 | else: 10 | envs = construct_envs21(args) 11 | envs = VecPyTorch(envs, args.device) 12 | return envs 13 | 14 | 15 | # Adapted from 16 | # https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/envs.py#L159 17 | class VecPyTorch(): 18 | 19 | def __init__(self, venv, device): 20 | self.venv = venv 21 | self.num_envs = venv.num_envs 22 | self.observation_space = venv.observation_space 23 | self.action_space = venv.action_space 24 | self.device = device 25 | 26 | def reset(self): 27 | obs, info = self.venv.reset() 28 | obs = torch.from_numpy(obs).float().to(self.device) 29 | return obs, info 30 | 31 | def step_async(self, actions): 32 | actions = actions.cpu().numpy() 33 | self.venv.step_async(actions) 34 | 35 | def step_wait(self): 36 | obs, reward, done, info = self.venv.step_wait() 37 | obs = torch.from_numpy(obs).float().to(self.device) 38 | reward = torch.from_numpy(reward).float() 39 | return obs, reward, done, info 40 | 41 | def step(self, actions): 42 | actions = actions.cpu().numpy() 43 | obs, reward, done, info = self.venv.step(actions) 44 | obs = torch.from_numpy(obs).float().to(self.device) 45 | reward = torch.from_numpy(reward).float() 46 | return obs, reward, done, info 47 | 48 | def get_rewards(self, inputs): 49 | reward = self.venv.get_rewards(inputs) 50 | reward = torch.from_numpy(reward).float() 51 | return reward 52 | 53 | def plan_act_and_preprocess(self, inputs): 54 | obs, reward, done, info = self.venv.plan_act_and_preprocess(inputs) 55 | obs = torch.from_numpy(obs).float().to(self.device) 56 | # reward = torch.from_numpy(reward).float() 57 | return obs, reward, done, info 58 | 59 | def rotate_at_beginning_act(self, inputs): 60 | obs, reward, done, info = self.venv.rotate_at_beginning_act(inputs) 61 | obs = torch.from_numpy(obs).float().to(self.device) 62 | # reward = torch.from_numpy(reward).float() 63 | return obs, reward, done, info 64 | 65 | def close(self): 66 | return self.venv.close() 67 | 
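A minimal usage sketch for the `make_vec_envs` / `VecPyTorch` wrapper defined in `envs/__init__.py` above (this is not a file in the repository): it assumes `arguments.py` exposes a `get_args()` parser, as in the Sem-Exp/L3MVN code this repo builds on, and that each parallel environment accepts a discrete action index; only the attributes actually read in `envs/__init__.py` (`args.task_config`, `args.device`) are taken from this file.

```python
# Hypothetical driver script; arguments.get_args() and the action encoding
# are assumptions, everything else follows the VecPyTorch wrapper above.
import torch

from arguments import get_args   # assumed CLI entry point (arguments.py)
from envs import make_vec_envs

if __name__ == "__main__":
    args = get_args()
    args.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    envs = make_vec_envs(args)        # VecPyTorch wrapping a Habitat VectorEnv
    obs, infos = envs.reset()         # obs: float tensor already on args.device

    for _ in range(10):
        # one discrete action per parallel env; VecPyTorch converts to numpy
        actions = torch.ones(envs.num_envs, dtype=torch.long)
        obs, rewards, dones, infos = envs.step(actions)

    envs.close()
```

Note that `make_vec_envs` dispatches between `construct_envs` (for the Gibson task config) and `construct_envs21` (for HM3D) based on `args.task_config`, so the same driver covers both configurations.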
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | GAMap: Zero-Shot Object Goal Navigation with Multi-Scale Geometric-Affordance Guidance 3 | 4 | NeurIPS 2024

6 | 7 | This is the official repository of [GAMap: Zero-Shot Object Goal Navigation with Multi-Scale Geometric-Affordance Guidance](https://arxiv.org/pdf/2410.23978). 8 | 9 |
10 | 11 |
12 |
13 | ## Setup
14 |
15 | ### Dataset Preparation
16 |
17 | Please follow [HM3D](https://aihabitat.org/datasets/hm3d/) to download the dataset and prepare the data. The data format should be:
18 |
19 | ```
20 | data/
21 |   scene_datasets/
22 |   matterport_category_mappings.tsv
23 |   object_norm_inv_perplexity.npy
24 |   versioned_data
25 |   objectgoal_hm3d/
26 |     train/
27 |     val/
28 |     val_mini/
29 | ```
30 |
31 | ### Dependencies
32 |
33 | 1. Python & PyTorch
34 |
35 | This code is tested with Python 3.9.16 and PyTorch 1.11.0+cu113 on Ubuntu 20.04.
36 |
37 | 2. Habitat-Sim & Habitat-Lab
38 |
39 | ```
40 | # Habitat-Sim
41 | git clone https://github.com/facebookresearch/habitat-sim.git
42 | cd habitat-sim; git checkout tags/challenge-2022;
43 | pip install -r requirements.txt;
44 | python setup.py install --headless
45 |
46 | # Habitat-Lab
47 | git clone https://github.com/facebookresearch/habitat-lab.git
48 | cd habitat-lab; git checkout tags/challenge-2022;
49 | pip install -e .
50 | ```
51 |
52 | 3. Others
53 |
54 | ```
55 | pip install -r requirements.txt
56 | ```
57 |
58 | ## Running
59 |
60 | ### Example
61 |
62 | An example command to run the pipeline:
63 |
64 | ```
65 | python ga.py --split val_mini --eval 0 --auto_gpu_config 0 -n 1 \
66 | --num_eval_episodes 2000 --use_gtsem 0 --num_local_steps 10 \
67 | --exp_name gamap --agent multi_attr_exp
68 | ```
69 |
70 | [//]: # ()
71 | [//]: # (### Visualization)
72 |
73 | [//]: # ()
74 | [//]: # (To make a demo video on your saved images, you can either use `ffmpeg` to make separate videos or use)
75 |
76 | [//]: # ()
77 | [//]: # (```)
78 |
79 | [//]: # (python make_demo.py --exp_name test # add `--delete_img` to delete images after making video)
80 |
81 | [//]: # (```)
82 |
83 | [//]: # ()
84 | [//]: # (to make batched videos.)
85 |
86 | ## Acknowledgements
87 |
88 | This repo is heavily based on [L3MVN](https://github.com/ybgdgh/L3MVN). We thank the authors for their great work.
89 | 90 | ## Citation 91 | 92 | If you find this work helpful, please consider citing: 93 | -------------------------------------------------------------------------------- /agents/utils/visualization.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | 4 | 5 | def get_contour_points(pos, origin, size=20): 6 | x, y, o = pos 7 | pt1 = (int(x) + origin[0], 8 | int(y) + origin[1]) 9 | pt2 = (int(x + size / 1.5 * np.cos(o + np.pi * 4 / 3)) + origin[0], 10 | int(y + size / 1.5 * np.sin(o + np.pi * 4 / 3)) + origin[1]) 11 | pt3 = (int(x + size * np.cos(o)) + origin[0], 12 | int(y + size * np.sin(o)) + origin[1]) 13 | pt4 = (int(x + size / 1.5 * np.cos(o - np.pi * 4 / 3)) + origin[0], 14 | int(y + size / 1.5 * np.sin(o - np.pi * 4 / 3)) + origin[1]) 15 | 16 | return np.array([pt1, pt2, pt3, pt4]) 17 | 18 | 19 | def draw_line(start, end, mat, steps=25, w=1): 20 | for i in range(steps + 1): 21 | x = int(np.rint(start[0] + (end[0] - start[0]) * i / steps)) 22 | y = int(np.rint(start[1] + (end[1] - start[1]) * i / steps)) 23 | mat[x - w:x + w, y - w:y + w] = 1 24 | return mat 25 | 26 | 27 | def init_vis_image(goal_name, legend): 28 | vis_image = np.ones((655, 1165, 3)).astype(np.uint8) * 255 29 | font = cv2.FONT_HERSHEY_SIMPLEX 30 | fontScale = 1 31 | color = (20, 20, 20) # BGR 32 | thickness = 2 33 | 34 | text = "Observations (Goal: {})".format(goal_name) 35 | textsize = cv2.getTextSize(text, font, fontScale, thickness)[0] 36 | textX = (640 - textsize[0]) // 2 + 15 37 | textY = (50 + textsize[1]) // 2 38 | vis_image = cv2.putText(vis_image, text, (textX, textY), 39 | font, fontScale, color, thickness, 40 | cv2.LINE_AA) 41 | 42 | text = "Predicted Semantic Map" 43 | textsize = cv2.getTextSize(text, font, fontScale, thickness)[0] 44 | textX = 640 + (480 - textsize[0]) // 2 + 30 45 | textY = (50 + textsize[1]) // 2 46 | vis_image = cv2.putText(vis_image, text, (textX, textY), 47 | font, fontScale, color, thickness, 48 | cv2.LINE_AA) 49 | 50 | # draw outlines 51 | color = [100, 100, 100] 52 | vis_image[49, 15:655] = color 53 | vis_image[49, 670:1150] = color 54 | vis_image[50:530, 14] = color 55 | vis_image[50:530, 655] = color 56 | vis_image[50:530, 669] = color 57 | vis_image[50:530, 1150] = color 58 | vis_image[530, 15:655] = color 59 | vis_image[530, 670:1150] = color 60 | 61 | # draw legend 62 | # lx, ly, _ = legend.shape 63 | # vis_image[537:537 + lx, 155:155 + ly, :] = legend 64 | 65 | return vis_image 66 | -------------------------------------------------------------------------------- /utils/tensorboard_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import Any 8 | 9 | import numpy as np 10 | import torch 11 | from torch.utils.tensorboard import SummaryWriter 12 | 13 | 14 | class TensorboardWriter: 15 | def __init__(self, log_dir: str, *args: Any, **kwargs: Any): 16 | r"""A Wrapper for tensorboard SummaryWriter. It creates a dummy writer 17 | when log_dir is empty string or None. It also has functionality that 18 | generates tb video directly from numpy images. 19 | 20 | Args: 21 | log_dir: Save directory location. Will not write to disk if 22 | log_dir is an empty string. 
23 | *args: Additional positional args for SummaryWriter 24 | **kwargs: Additional keyword args for SummaryWriter 25 | """ 26 | self.writer = None 27 | if log_dir is not None and len(log_dir) > 0: 28 | self.writer = SummaryWriter(log_dir, *args, **kwargs) 29 | 30 | def __getattr__(self, item): 31 | if self.writer: 32 | return self.writer.__getattribute__(item) 33 | else: 34 | return lambda *args, **kwargs: None 35 | 36 | def __enter__(self): 37 | return self 38 | 39 | def __exit__(self, exc_type, exc_val, exc_tb): 40 | if self.writer: 41 | self.writer.close() 42 | 43 | def add_video_from_np_images( 44 | self, video_name: str, step_idx: int, images: np.ndarray, fps: int = 10 45 | ) -> None: 46 | r"""Write video into tensorboard from images frames. 47 | 48 | Args: 49 | video_name: name of video string. 50 | step_idx: int of checkpoint index to be displayed. 51 | images: list of n frames. Each frame is a np.ndarray of shape. 52 | fps: frame per second for output video. 53 | 54 | Returns: 55 | None. 56 | """ 57 | if not self.writer: 58 | return 59 | # initial shape of np.ndarray list: N * (H, W, 3) 60 | frame_tensors = [ 61 | torch.from_numpy(np_arr).unsqueeze(0) for np_arr in images 62 | ] 63 | video_tensor = torch.cat(tuple(frame_tensors)) 64 | video_tensor = video_tensor.permute(0, 3, 1, 2).unsqueeze(0) 65 | # final shape of video tensor: (1, n, 3, H, W) 66 | self.writer.add_video( 67 | video_name, video_tensor, fps=fps, global_step=step_idx 68 | ) 69 | -------------------------------------------------------------------------------- /envs/utils/rotation_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Utilities for generating and applying rotation matrices. 
17 | """ 18 | import numpy as np 19 | 20 | ANGLE_EPS = 0.001 21 | 22 | 23 | def normalize(v): 24 | return v / np.linalg.norm(v) 25 | 26 | 27 | def get_r_matrix(ax_, angle): 28 | ax = normalize(ax_) 29 | if np.abs(angle) > ANGLE_EPS: 30 | S_hat = np.array( 31 | [[0.0, -ax[2], ax[1]], [ax[2], 0.0, -ax[0]], [-ax[1], ax[0], 0.0]], 32 | dtype=np.float32) 33 | R = np.eye(3) + np.sin(angle) * S_hat + \ 34 | (1 - np.cos(angle)) * (np.linalg.matrix_power(S_hat, 2)) 35 | else: 36 | R = np.eye(3) 37 | return R 38 | 39 | 40 | def r_between(v_from_, v_to_): 41 | v_from = normalize(v_from_) 42 | v_to = normalize(v_to_) 43 | ax = normalize(np.cross(v_from, v_to)) 44 | angle = np.arccos(np.dot(v_from, v_to)) 45 | return get_r_matrix(ax, angle) 46 | 47 | 48 | def rotate_camera_to_point_at(up_from, lookat_from, up_to, lookat_to): 49 | inputs = [up_from, lookat_from, up_to, lookat_to] 50 | for i in range(4): 51 | inputs[i] = normalize(np.array(inputs[i]).reshape((-1,))) 52 | up_from, lookat_from, up_to, lookat_to = inputs 53 | r1 = r_between(lookat_from, lookat_to) 54 | 55 | new_x = np.dot(r1, np.array([1, 0, 0]).reshape((-1, 1))).reshape((-1)) 56 | to_x = normalize(np.cross(lookat_to, up_to)) 57 | angle = np.arccos(np.dot(new_x, to_x)) 58 | if angle > ANGLE_EPS: 59 | if angle < np.pi - ANGLE_EPS: 60 | ax = normalize(np.cross(new_x, to_x)) 61 | flip = np.dot(lookat_to, ax) 62 | if flip > 0: 63 | r2 = get_r_matrix(lookat_to, angle) 64 | elif flip < 0: 65 | r2 = get_r_matrix(lookat_to, -1. * angle) 66 | else: 67 | # Angle of rotation is too close to 180 degrees, direction of 68 | # rotation does not matter. 69 | r2 = get_r_matrix(lookat_to, angle) 70 | else: 71 | r2 = np.eye(3) 72 | return np.dot(r2, r1) 73 | -------------------------------------------------------------------------------- /constants.py: -------------------------------------------------------------------------------- 1 | coco_categories = { 2 | "chair": 0, 3 | "couch": 1, 4 | "potted plant": 2, 5 | "bed": 3, 6 | "toilet": 4, 7 | "tv": 5, 8 | "dining-table": 6, 9 | "oven": 7, 10 | "sink": 8, 11 | "refrigerator": 9, 12 | "book": 10, 13 | "clock": 11, 14 | "vase": 12, 15 | "cup": 13, 16 | "bottle": 14 17 | } 18 | 19 | category_to_id = [ 20 | "chair", 21 | "bed", 22 | "plant", 23 | "toilet", 24 | "tv_monitor", 25 | "sofa" 26 | ] 27 | 28 | category_to_id_gibson = [ 29 | "chair", 30 | "couch", 31 | "potted plant", 32 | "bed", 33 | "toilet", 34 | "tv" 35 | ] 36 | 37 | mp3d_category_id = { 38 | 'void': 1, 39 | 'chair': 2, 40 | 'sofa': 3, 41 | 'plant': 4, 42 | 'bed': 5, 43 | 'toilet': 6, 44 | 'tv_monitor': 7, 45 | 'table': 8, 46 | 'refrigerator': 9, 47 | 'sink': 10, 48 | 'stairs': 16, 49 | 'fireplace': 12 50 | } 51 | 52 | # mp_categories_mapping = [4, 11, 15, 12, 19, 23, 6, 7, 15, 38, 40, 28, 29, 8, 17] 53 | 54 | mp_categories_mapping = [4, 11, 15, 12, 19, 23, 26, 24, 28, 38, 21, 16, 14, 6, 16] 55 | 56 | hm3d_category = [ 57 | "chair", 58 | "sofa", 59 | "plant", 60 | "bed", 61 | "toilet", 62 | "tv_monitor", 63 | "bathtub", 64 | "shower", 65 | "fireplace", 66 | "appliances", 67 | "towel", 68 | "sink", 69 | "chest_of_drawers", 70 | "table", 71 | "stairs" 72 | ] 73 | 74 | coco_categories_mapping = { 75 | 56: 0, # chair 76 | 57: 1, # couch 77 | 58: 2, # potted plant 78 | 59: 3, # bed 79 | 61: 4, # toilet 80 | 62: 5, # tv 81 | 60: 6, # dining-table 82 | 69: 7, # oven 83 | 71: 8, # sink 84 | 72: 9, # refrigerator 85 | 73: 10, # book 86 | 74: 11, # clock 87 | 75: 12, # vase 88 | 41: 13, # cup 89 | 39: 14, # bottle 90 | } 91 | 92 | color_palette = [ 93 | 1.0, 
1.0, 1.0, 94 | 0.6, 0.6, 0.6, 95 | 0.95, 0.95, 0.95, 96 | 0.96, 0.36, 0.26, 97 | 0.12156862745098039, 0.47058823529411764, 0.7058823529411765, 98 | 0.9400000000000001, 0.7818, 0.66, 99 | 0.9400000000000001, 0.8868, 0.66, 100 | 0.8882000000000001, 0.9400000000000001, 0.66, 101 | 0.7832000000000001, 0.9400000000000001, 0.66, 102 | 0.6782000000000001, 0.9400000000000001, 0.66, 103 | 0.66, 0.9400000000000001, 0.7468000000000001, 104 | 0.66, 0.9400000000000001, 0.8518000000000001, 105 | 0.66, 0.9232, 0.9400000000000001, 106 | 0.66, 0.8182, 0.9400000000000001, 107 | 0.66, 0.7132, 0.9400000000000001, 108 | 0.7117999999999999, 0.66, 0.9400000000000001, 109 | 0.8168, 0.66, 0.9400000000000001, 110 | 0.9218, 0.66, 0.9400000000000001, 111 | 0.9400000000000001, 0.66, 0.8531999999999998, 112 | 0.9400000000000001, 0.66, 0.748199999999999] 113 | 114 | 115 | 116 | cat_2_multi_att_prompt={ 117 | "chair": ['chair','leg','backrest','seat','armrest'], 118 | "bed": ['bed','sleep','lie','mattress','bedframe'], 119 | "plant": ['plant','leaf','stem','branch','pot'], 120 | "toilet": ['toilet','bathroom','defecating','urinating','flushing'], 121 | "tv_monitor": ['tv monitor','livingroom','bedroom','screen','display'], 122 | "sofa": ['sofa','sit','cushion','armrest','backrest'], 123 | } -------------------------------------------------------------------------------- /envs/utils/map_builder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import envs.utils.depth_utils as du 3 | 4 | 5 | class MapBuilder(object): 6 | def __init__(self, params): 7 | self.params = params 8 | frame_width = params['frame_width'] 9 | frame_height = params['frame_height'] 10 | fov = params['fov'] 11 | self.camera_matrix = du.get_camera_matrix( 12 | frame_width, 13 | frame_height, 14 | fov) 15 | self.vision_range = params['vision_range'] 16 | 17 | self.map_size_cm = params['map_size_cm'] 18 | self.resolution = params['resolution'] 19 | agent_min_z = params['agent_min_z'] 20 | agent_max_z = params['agent_max_z'] 21 | self.z_bins = [agent_min_z, agent_max_z] 22 | self.du_scale = params['du_scale'] 23 | self.visualize = params['visualize'] 24 | self.obs_threshold = params['obs_threshold'] 25 | 26 | self.map = np.zeros((self.map_size_cm // self.resolution, 27 | self.map_size_cm // self.resolution, 28 | len(self.z_bins) + 1), dtype=np.float32) 29 | 30 | self.agent_height = params['agent_height'] 31 | self.agent_view_angle = params['agent_view_angle'] 32 | return 33 | 34 | def update_map(self, depth, current_pose): 35 | with np.errstate(invalid="ignore"): 36 | depth[depth > self.vision_range * self.resolution] = np.NaN 37 | point_cloud = du.get_point_cloud_from_z(depth, self.camera_matrix, 38 | scale=self.du_scale) 39 | 40 | agent_view = du.transform_camera_view(point_cloud, 41 | self.agent_height, 42 | self.agent_view_angle) 43 | 44 | shift_loc = [self.vision_range * self.resolution // 2, 0, np.pi / 2.0] 45 | agent_view_centered = du.transform_pose(agent_view, shift_loc) 46 | 47 | agent_view_flat = du.bin_points( 48 | agent_view_centered, 49 | self.vision_range, 50 | self.z_bins, 51 | self.resolution) 52 | 53 | agent_view_cropped = agent_view_flat[:, :, 1] 54 | agent_view_cropped = agent_view_cropped / self.obs_threshold 55 | agent_view_cropped[agent_view_cropped >= 0.5] = 1.0 56 | agent_view_cropped[agent_view_cropped < 0.5] = 0.0 57 | 58 | agent_view_explored = agent_view_flat.sum(2) 59 | agent_view_explored[agent_view_explored > 0] = 1.0 60 | 61 | geocentric_pc = 
du.transform_pose(agent_view, current_pose) 62 | 63 | geocentric_flat = du.bin_points( 64 | geocentric_pc, 65 | self.map.shape[0], 66 | self.z_bins, 67 | self.resolution) 68 | 69 | self.map = self.map + geocentric_flat 70 | 71 | map_gt = self.map[:, :, 1] / self.obs_threshold 72 | map_gt[map_gt >= 0.5] = 1.0 73 | map_gt[map_gt < 0.5] = 0.0 74 | 75 | explored_gt = self.map.sum(2) 76 | explored_gt[explored_gt > 1] = 1.0 77 | 78 | return agent_view_cropped, map_gt, agent_view_explored, explored_gt 79 | 80 | def get_st_pose(self, current_loc): 81 | loc = [- (current_loc[0] / self.resolution 82 | - self.map_size_cm // (self.resolution * 2)) / 83 | (self.map_size_cm // (self.resolution * 2)), 84 | - (current_loc[1] / self.resolution 85 | - self.map_size_cm // (self.resolution * 2)) / 86 | (self.map_size_cm // (self.resolution * 2)), 87 | 90 - np.rad2deg(current_loc[2])] 88 | return loc 89 | 90 | def reset_map(self, map_size): 91 | self.map_size_cm = map_size 92 | 93 | self.map = np.zeros((self.map_size_cm // self.resolution, 94 | self.map_size_cm // self.resolution, 95 | len(self.z_bins) + 1), dtype=np.float32) 96 | 97 | def get_map(self): 98 | return self.map 99 | -------------------------------------------------------------------------------- /utils/model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | 7 | def get_grid(pose, grid_size, device): 8 | """ 9 | Input: 10 | `pose` FloatTensor(bs, 3) 11 | `grid_size` 4-tuple (bs, _, grid_h, grid_w) 12 | `device` torch.device (cpu or gpu) 13 | Output: 14 | `rot_grid` FloatTensor(bs, grid_h, grid_w, 2) 15 | `trans_grid` FloatTensor(bs, grid_h, grid_w, 2) 16 | 17 | """ 18 | pose = pose.float() 19 | x = pose[:, 0] 20 | y = pose[:, 1] 21 | t = pose[:, 2] 22 | 23 | bs = x.size(0) 24 | t = t * np.pi / 180. 
25 | cos_t = t.cos() 26 | sin_t = t.sin() 27 | 28 | theta11 = torch.stack([cos_t, -sin_t, 29 | torch.zeros(cos_t.shape).float().to(device)], 1) 30 | theta12 = torch.stack([sin_t, cos_t, 31 | torch.zeros(cos_t.shape).float().to(device)], 1) 32 | theta1 = torch.stack([theta11, theta12], 1) 33 | 34 | theta21 = torch.stack([torch.ones(x.shape).to(device), 35 | -torch.zeros(x.shape).to(device), x], 1) 36 | theta22 = torch.stack([torch.zeros(x.shape).to(device), 37 | torch.ones(x.shape).to(device), y], 1) 38 | theta2 = torch.stack([theta21, theta22], 1) 39 | 40 | rot_grid = F.affine_grid(theta1, torch.Size(grid_size)) 41 | trans_grid = F.affine_grid(theta2, torch.Size(grid_size)) 42 | 43 | return rot_grid, trans_grid 44 | 45 | 46 | class ChannelPool(nn.MaxPool1d): 47 | def forward(self, x): 48 | n, c, w, h = x.size() 49 | x = x.view(n, c, w * h).permute(0, 2, 1) 50 | x = x.contiguous() 51 | pooled = F.max_pool1d(x, c, 1) 52 | _, _, c = pooled.size() 53 | pooled = pooled.permute(0, 2, 1) 54 | return pooled.view(n, c, w, h) 55 | 56 | 57 | # https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/utils.py#L32 58 | class AddBias(nn.Module): 59 | def __init__(self, bias): 60 | super(AddBias, self).__init__() 61 | self._bias = nn.Parameter(bias.unsqueeze(1)) 62 | 63 | def forward(self, x): 64 | if x.dim() == 2: 65 | bias = self._bias.t().view(1, -1) 66 | else: 67 | bias = self._bias.t().view(1, -1, 1, 1) 68 | 69 | return x + bias 70 | 71 | 72 | # https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/model.py#L10 73 | class Flatten(nn.Module): 74 | def forward(self, x): 75 | return x.view(x.size(0), -1) 76 | 77 | 78 | # https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/model.py#L82 79 | class NNBase(nn.Module): 80 | 81 | def __init__(self, recurrent, recurrent_input_size, hidden_size): 82 | 83 | super(NNBase, self).__init__() 84 | self._hidden_size = hidden_size 85 | self._recurrent = recurrent 86 | 87 | if recurrent: 88 | self.gru = nn.GRUCell(recurrent_input_size, hidden_size) 89 | nn.init.orthogonal_(self.gru.weight_ih.data) 90 | nn.init.orthogonal_(self.gru.weight_hh.data) 91 | self.gru.bias_ih.data.fill_(0) 92 | self.gru.bias_hh.data.fill_(0) 93 | 94 | @property 95 | def is_recurrent(self): 96 | return self._recurrent 97 | 98 | @property 99 | def rec_state_size(self): 100 | if self._recurrent: 101 | return self._hidden_size 102 | return 1 103 | 104 | @property 105 | def output_size(self): 106 | return self._hidden_size 107 | 108 | def _forward_gru(self, x, hxs, masks): 109 | if x.size(0) == hxs.size(0): 110 | x = hxs = self.gru(x, hxs * masks[:, None]) 111 | else: 112 | # x is a (T, N, -1) tensor that has been flatten to (T * N, -1) 113 | N = hxs.size(0) 114 | T = int(x.size(0) / N) 115 | 116 | # unflatten 117 | x = x.view(T, N, x.size(1)) 118 | 119 | # Same deal with masks 120 | masks = masks.view(T, N, 1) 121 | 122 | outputs = [] 123 | for i in range(T): 124 | hx = hxs = self.gru(x[i], hxs * masks[i]) 125 | outputs.append(hx) 126 | 127 | # x is a (T, N, -1) tensor 128 | x = torch.stack(outputs, dim=0) 129 | # flatten 130 | x = x.view(T * N, -1) 131 | 132 | return x, hxs 133 | -------------------------------------------------------------------------------- /RedNet/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torch import nn 3 | import torch 4 | import os 5 | 6 | med_frq = [0.382900, 0.452448, 0.637584, 0.377464, 0.585595, 7 | 
0.479574, 0.781544, 0.982534, 1.017466, 0.624581, 8 | 2.589096, 0.980794, 0.920340, 0.667984, 1.172291, 9 | 0.862240, 0.921714, 2.154782, 1.187832, 1.178115, 10 | 1.848545, 1.428922, 2.849658, 0.771605, 1.656668, 11 | 4.483506, 2.209922, 1.120280, 2.790182, 0.706519, 12 | 3.994768, 2.220004, 0.972934, 1.481525, 5.342475, 13 | 0.750738, 4.040773] 14 | 15 | model_urls = { 16 | 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', 17 | 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', 18 | 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', 19 | 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', 20 | 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', 21 | } 22 | 23 | label_colours = [(0, 0, 0), 24 | # 0=background 25 | (148, 65, 137), (255, 116, 69), (86, 156, 137), 26 | (202, 179, 158), (155, 99, 235), (161, 107, 108), 27 | (133, 160, 103), (76, 152, 126), (84, 62, 35), 28 | (44, 80, 130), (31, 184, 157), (101, 144, 77), 29 | (23, 197, 62), (141, 168, 145), (142, 151, 136), 30 | (115, 201, 77), (100, 216, 255), (57, 156, 36), 31 | (88, 108, 129), (105, 129, 112), (42, 137, 126), 32 | (155, 108, 249), (166, 148, 143), (81, 91, 87), 33 | (100, 124, 51), (73, 131, 121), (157, 210, 220), 34 | (134, 181, 60), (221, 223, 147), (123, 108, 131), 35 | (161, 66, 179), (163, 221, 160), (31, 146, 98), 36 | (99, 121, 30), (49, 89, 240), (116, 108, 9), 37 | (161, 176, 169), (80, 29, 135), (177, 105, 197), 38 | (139, 110, 246)] 39 | 40 | 41 | class CrossEntropyLoss2d(nn.Module): 42 | def __init__(self, weight=med_frq): 43 | super(CrossEntropyLoss2d, self).__init__() 44 | self.ce_loss = nn.CrossEntropyLoss(reduction='none') 45 | 46 | def forward(self, inputs_scales, targets_scales): 47 | losses = [] 48 | for inputs, targets in zip(inputs_scales, targets_scales): 49 | mask = targets > 0 50 | targets_m = targets.clone() 51 | targets_m[mask] -= 1 52 | loss_all = self.ce_loss(inputs, targets_m.long()) 53 | losses.append(torch.sum(torch.masked_select(loss_all, mask)) / torch.sum(mask.float())) 54 | total_loss = sum(losses) 55 | return total_loss 56 | 57 | 58 | def color_label(label): 59 | label = label.clone().cpu().data.numpy() 60 | colored_label = np.vectorize(lambda x: label_colours[int(x)]) 61 | 62 | colored = np.asarray(colored_label(label)).astype(np.float32) 63 | colored = colored.squeeze() 64 | 65 | try: 66 | return torch.from_numpy(colored.transpose([1, 0, 2, 3])) 67 | except ValueError: 68 | return torch.from_numpy(colored[np.newaxis, ...]) 69 | 70 | 71 | def print_log(global_step, epoch, local_count, count_inter, dataset_size, loss, time_inter): 72 | print('Step: {:>5} Train Epoch: {:>3} [{:>4}/{:>4} ({:3.1f}%)] ' 73 | 'Loss: {:.6f} [{:.2f}s every {:>4} data]'.format( 74 | global_step, epoch, local_count, dataset_size, 75 | 100. 
* local_count / dataset_size, loss.data, time_inter, count_inter)) 76 | 77 | 78 | def save_ckpt(ckpt_dir, model, optimizer, global_step, epoch, local_count, num_train): 79 | # usually this happens only on the start of a epoch 80 | epoch_float = epoch + (local_count / num_train) 81 | state = { 82 | 'global_step': global_step, 83 | 'epoch': epoch_float, 84 | 'state_dict': model.state_dict(), 85 | 'optimizer': optimizer.state_dict(), 86 | } 87 | ckpt_model_filename = "ckpt_epoch_{:0.2f}.pth".format(epoch_float) 88 | path = os.path.join(ckpt_dir, ckpt_model_filename) 89 | torch.save(state, path) 90 | print('{:>2} has been successfully saved'.format(path)) 91 | 92 | 93 | # def load_ckpt(model, optimizer, model_file, device): 94 | # if os.path.isfile(model_file): 95 | # print("=> loading checkpoint '{}'".format(model_file)) 96 | # if device.type == 'cuda': 97 | # checkpoint = torch.load(model_file) 98 | # else: 99 | # checkpoint = torch.load(model_file, map_location=lambda storage, loc: storage) 100 | # model.load_state_dict(checkpoint['state_dict']) 101 | # if optimizer: 102 | # optimizer.load_state_dict(checkpoint['optimizer']) 103 | # print("=> loaded checkpoint '{}' (epoch {})" 104 | # .format(model_file, checkpoint['epoch'])) 105 | # step = checkpoint['global_step'] 106 | # epoch = checkpoint['epoch'] 107 | # return step, epoch 108 | # else: 109 | # print("=> no checkpoint found at '{}'".format(model_file)) 110 | # os._exit(0) 111 | 112 | 113 | def load_ckpt(model, optimizer, model_file, device): 114 | if os.path.isfile(model_file): 115 | print("=> loading checkpoint '{}'".format(model_file)) 116 | if device.type == 'cuda': 117 | checkpoint = torch.load(model_file) 118 | else: 119 | checkpoint = torch.load(model_file, map_location=lambda storage, loc: storage) 120 | state_dict = checkpoint['model_state'] 121 | prefix = 'module.' 
122 | state_dict = { 123 | (k[len(prefix):] if k[:len(prefix)] == prefix else k): v for k, v in state_dict.items() 124 | } 125 | model.load_state_dict(state_dict) 126 | if optimizer: 127 | optimizer.load_state_dict(checkpoint['optimizer']) 128 | print("=> loaded checkpoint '{}' (epoch {})" 129 | .format(model_file, checkpoint['epoch'])) 130 | # step = checkpoint['global_step'] 131 | # epoch = checkpoint['epoch'] 132 | # return step, epoch 133 | else: 134 | print("=> no checkpoint found at '{}'".format(model_file)) 135 | os._exit(0) -------------------------------------------------------------------------------- /envs/utils/fmm_planner.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import skfmm 4 | import skimage 5 | from numpy import around, array, ma 6 | 7 | 8 | def get_mask(sx, sy, scale, step_size): 9 | size = int(step_size // scale) * 2 + 1 10 | mask = np.zeros((size, size)) 11 | for i in range(size): 12 | for j in range(size): 13 | if ((i + 0.5) - (size // 2 + sx)) ** 2 + \ 14 | ((j + 0.5) - (size // 2 + sy)) ** 2 <= \ 15 | step_size ** 2 \ 16 | and ((i + 0.5) - (size // 2 + sx)) ** 2 + \ 17 | ((j + 0.5) - (size // 2 + sy)) ** 2 > \ 18 | (step_size - 1) ** 2: 19 | mask[i, j] = 1 20 | 21 | mask[size // 2, size // 2] = 1 22 | return mask 23 | 24 | 25 | def get_dist(sx, sy, scale, step_size): 26 | size = int(step_size // scale) * 2 + 1 27 | mask = np.zeros((size, size)) + 1e-10 28 | for i in range(size): 29 | for j in range(size): 30 | if ((i + 0.5) - (size // 2 + sx)) ** 2 + \ 31 | ((j + 0.5) - (size // 2 + sy)) ** 2 <= \ 32 | step_size ** 2: 33 | mask[i, j] = max(5, 34 | (((i + 0.5) - (size // 2 + sx)) ** 2 + 35 | ((j + 0.5) - (size // 2 + sy)) ** 2) ** 0.5) 36 | return mask 37 | 38 | 39 | class FMMPlanner(): 40 | def __init__(self, traversible, scale=1, step_size=5): 41 | self.scale = scale 42 | self.step_size = step_size 43 | if scale != 1.: 44 | self.traversible = cv2.resize(traversible, 45 | (traversible.shape[1] // scale, 46 | traversible.shape[0] // scale), 47 | interpolation=cv2.INTER_NEAREST) 48 | self.traversible = np.rint(self.traversible) 49 | else: 50 | self.traversible = traversible 51 | 52 | self.du = int(self.step_size / (self.scale * 1.)) 53 | self.fmm_dist = None 54 | self.around = np.zeros(traversible.shape) 55 | 56 | kernel = cv2.getStructuringElement(cv2.MORPH_RECT,(9, 9)) 57 | self.traversible_around = cv2.dilate((1-self.traversible*1).astype('uint8'), kernel) 58 | traversible_around_ma = ma.masked_values(self.traversible_around*1, 0) 59 | traversible_around_ma[self.traversible==0] = 0 60 | if np.any(traversible_around_ma == 0): 61 | dd = skfmm.distance(traversible_around_ma, dx=1) 62 | dd = (np.max(dd) - dd) 63 | dd = ma.filled(dd, 0) 64 | self.around = dd 65 | 66 | 67 | def set_goal(self, goal, auto_improve=False): 68 | traversible_ma = ma.masked_values(self.traversible * 1, 0) 69 | goal_x, goal_y = int(goal[0] / (self.scale * 1.)), \ 70 | int(goal[1] / (self.scale * 1.)) 71 | 72 | if self.traversible[goal_x, goal_y] == 0. 
and auto_improve: 73 | goal_x, goal_y = self._find_nearest_goal([goal_x, goal_y]) 74 | 75 | traversible_ma[goal_x, goal_y] = 0 76 | dd = skfmm.distance(traversible_ma, dx=1) 77 | dd = ma.filled(dd, np.max(dd) + 1) 78 | self.fmm_dist = dd 79 | return 80 | 81 | def set_multi_goal(self, goal_map): 82 | traversible_ma = ma.masked_values(self.traversible * 1, 0) # mask out the obstacle layer (0), keeping only free space 83 | traversible_ma[goal_map == 1] = 0 # on the obstacle-free map, goal cells are 0 and everything else is 1 84 | dd = skfmm.distance(traversible_ma, dx=1) 85 | dd = ma.filled(dd, np.max(dd) + 1) # replace masked values with max(dd) + 1, i.e. obstacles get the maximum distance 86 | self.fmm_dist = dd 87 | self.around[goal_map == 1] = 0 88 | return 89 | 90 | def get_short_term_goal(self, state): 91 | scale = self.scale * 1. 92 | state = [x / scale for x in state] 93 | dx, dy = state[0] - int(state[0]), state[1] - int(state[1]) 94 | mask = get_mask(dx, dy, scale, self.step_size) 95 | dist_mask = get_dist(dx, dy, scale, self.step_size) 96 | 97 | state = [int(x) for x in state] 98 | self.fmm_dist += self.around 99 | dist = np.pad(self.fmm_dist, self.du, 100 | 'constant', constant_values=self.fmm_dist.shape[0] ** 2) 101 | subset = dist[state[0]:state[0] + 2 * self.du + 1, 102 | state[1]:state[1] + 2 * self.du + 1] 103 | 104 | dist_around = np.pad(self.around, self.du, 105 | 'constant', constant_values=self.fmm_dist.shape[0] ** 2) 106 | subset_around = dist_around[state[0]:state[0] + 2 * self.du + 1, 107 | state[1]:state[1] + 2 * self.du + 1] 108 | 109 | assert subset.shape[0] == 2 * self.du + 1 and \ 110 | subset.shape[1] == 2 * self.du + 1, \ 111 | "Planning error: unexpected subset shape {}".format(subset.shape) 112 | 113 | subset *= mask 114 | subset += (1 - mask) * self.fmm_dist.shape[0] ** 2 115 | 116 | if subset[self.du, self.du] < 0.25 * 100 / 5.: # 25cm 117 | stop = True 118 | else: 119 | stop = False 120 | 121 | subset -= subset[self.du, self.du] 122 | ratio1 = subset / dist_mask 123 | subset[ratio1 < -1.5] = 1 124 | 125 | (stg_x, stg_y) = np.unravel_index(np.argmin(subset), subset.shape) 126 | 127 | # remove the influence of the around map 128 | if subset[stg_x, stg_y]+subset_around[self.du, self.du] > -0.0001: 129 | replan = True 130 | else: 131 | replan = False 132 | 133 | return (stg_x + state[0] - self.du) * scale, \ 134 | (stg_y + state[1] - self.du) * scale, replan, stop 135 | 136 | def _find_nearest_goal(self, goal): 137 | traversible = skimage.morphology.binary_dilation( 138 | np.zeros(self.traversible.shape), 139 | skimage.morphology.disk(2)) != True 140 | traversible = traversible * 1.
141 | planner = FMMPlanner(traversible) 142 | planner.set_goal(goal) 143 | 144 | mask = self.traversible 145 | 146 | dist_map = planner.fmm_dist * mask 147 | dist_map[dist_map == 0] = dist_map.max() 148 | 149 | goal = np.unravel_index(dist_map.argmin(), dist_map.shape) 150 | 151 | return goal 152 | -------------------------------------------------------------------------------- /agents/utils/semantic_prediction.py: -------------------------------------------------------------------------------- 1 | # The following code is largely borrowed from 2 | # https://github.com/facebookresearch/detectron2/blob/master/demo/demo.py and 3 | # https://github.com/facebookresearch/detectron2/blob/master/demo/predictor.py 4 | 5 | import argparse 6 | import time 7 | 8 | import torch 9 | import numpy as np 10 | 11 | from detectron2.config import get_cfg 12 | from detectron2.utils.logger import setup_logger 13 | from detectron2.data.catalog import MetadataCatalog 14 | from detectron2.modeling import build_model 15 | from detectron2.checkpoint import DetectionCheckpointer 16 | from detectron2.utils.visualizer import ColorMode, Visualizer 17 | import detectron2.data.transforms as T 18 | 19 | from constants import coco_categories_mapping 20 | 21 | 22 | class SemanticPredMaskRCNN(): 23 | 24 | def __init__(self, args): 25 | self.segmentation_model = ImageSegmentation(args) 26 | self.args = args 27 | 28 | def get_prediction(self, img): 29 | args = self.args 30 | image_list = [] 31 | img = img[:, :, ::-1] 32 | image_list.append(img) 33 | seg_predictions, vis_output = self.segmentation_model.get_predictions( 34 | image_list, visualize=args.visualize == 2) 35 | 36 | if args.visualize == 2: 37 | img = vis_output.get_image() 38 | 39 | semantic_input = np.zeros((img.shape[0], img.shape[1], 15 + 1)) 40 | 41 | for j, class_idx in enumerate( 42 | seg_predictions[0]['instances'].pred_classes.cpu().numpy()): 43 | if class_idx in list(coco_categories_mapping.keys()): 44 | idx = coco_categories_mapping[class_idx] 45 | obj_mask = seg_predictions[0]['instances'].pred_masks[j] * 1. 46 | semantic_input[:, :, idx] += obj_mask.cpu().numpy() 47 | 48 | return semantic_input, img 49 | 50 | 51 | def compress_sem_map(sem_map): 52 | c_map = np.zeros((sem_map.shape[1], sem_map.shape[2])) 53 | for i in range(sem_map.shape[0]): 54 | c_map[sem_map[i] > 0.] 
= i + 1 55 | return c_map 56 | 57 | 58 | class ImageSegmentation(): 59 | def __init__(self, args): 60 | string_args = """ 61 | --config-file configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml 62 | --input input1.jpeg 63 | --confidence-threshold {} 64 | --opts MODEL.WEIGHTS 65 | detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl 66 | """.format(args.sem_pred_prob_thr) 67 | 68 | if args.sem_gpu_id == -2: 69 | string_args += """ MODEL.DEVICE cpu""" 70 | else: 71 | string_args += """ MODEL.DEVICE cuda:{}""".format(args.sem_gpu_id) 72 | 73 | string_args = string_args.split() 74 | 75 | args = get_seg_parser().parse_args(string_args) 76 | logger = setup_logger() 77 | logger.info("Arguments: " + str(args)) 78 | 79 | cfg = setup_cfg(args) 80 | self.demo = VisualizationDemo(cfg) 81 | 82 | def get_predictions(self, img, visualize=0): 83 | return self.demo.run_on_image(img, visualize=visualize) 84 | 85 | 86 | def setup_cfg(args): 87 | # load config from file and command-line arguments 88 | cfg = get_cfg() 89 | cfg.merge_from_file(args.config_file) 90 | cfg.merge_from_list(args.opts) 91 | # Set score_threshold for builtin models 92 | cfg.MODEL.RETINANET.SCORE_THRESH_TEST = args.confidence_threshold 93 | cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = args.confidence_threshold 94 | cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = \ 95 | args.confidence_threshold 96 | cfg.freeze() 97 | return cfg 98 | 99 | 100 | def get_seg_parser(): 101 | parser = argparse.ArgumentParser( 102 | description="Detectron2 demo for builtin models") 103 | parser.add_argument( 104 | "--config-file", 105 | default="configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml", 106 | metavar="FILE", 107 | help="path to config file", 108 | ) 109 | parser.add_argument( 110 | "--webcam", 111 | action="store_true", 112 | help="Take inputs from webcam.") 113 | parser.add_argument("--video-input", help="Path to video file.") 114 | parser.add_argument( 115 | "--input", 116 | nargs="+", 117 | help="A list of space separated input images") 118 | parser.add_argument( 119 | "--output", 120 | help="A file or directory to save output visualizations. " 121 | "If not given, will show output in an OpenCV window.", 122 | ) 123 | 124 | parser.add_argument( 125 | "--confidence-threshold", 126 | type=float, 127 | default=0.5, 128 | help="Minimum score for instance predictions to be shown", 129 | ) 130 | parser.add_argument( 131 | "--opts", 132 | help="Modify config options using the command-line 'KEY VALUE' pairs", 133 | default=[], 134 | nargs=argparse.REMAINDER, 135 | ) 136 | return parser 137 | 138 | 139 | class VisualizationDemo(object): 140 | def __init__(self, cfg, instance_mode=ColorMode.IMAGE): 141 | """ 142 | Args: 143 | cfg (CfgNode): 144 | instance_mode (ColorMode): 145 | """ 146 | self.metadata = MetadataCatalog.get( 147 | cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused" 148 | ) 149 | self.cpu_device = torch.device("cpu") 150 | self.instance_mode = instance_mode 151 | 152 | self.predictor = BatchPredictor(cfg) 153 | 154 | def run_on_image(self, image_list, visualize=0): 155 | """ 156 | Args: 157 | image (np.ndarray): an image of shape (H, W, C) (in BGR order). 158 | This is the format used by OpenCV. 159 | 160 | Returns: 161 | predictions (dict): the output of the model. 162 | vis_output (VisImage): the visualized image output. 
163 | """ 164 | vis_output = None 165 | all_predictions = self.predictor(image_list) 166 | # Convert image from OpenCV BGR format to Matplotlib RGB format. 167 | 168 | if visualize: 169 | predictions = all_predictions[0] 170 | image = image_list[0] 171 | visualizer = Visualizer( 172 | image, self.metadata, instance_mode=self.instance_mode) 173 | if "panoptic_seg" in predictions: 174 | panoptic_seg, segments_info = predictions["panoptic_seg"] 175 | vis_output = visualizer.draw_panoptic_seg_predictions( 176 | panoptic_seg.to(self.cpu_device), segments_info 177 | ) 178 | else: 179 | if "sem_seg" in predictions: 180 | vis_output = visualizer.draw_sem_seg( 181 | predictions["sem_seg"].argmax( 182 | dim=0).to(self.cpu_device) 183 | ) 184 | if "instances" in predictions: 185 | instances = predictions["instances"].to(self.cpu_device) 186 | vis_output = visualizer.draw_instance_predictions( 187 | predictions=instances) 188 | 189 | return all_predictions, vis_output 190 | 191 | 192 | class BatchPredictor: 193 | """ 194 | Create a simple end-to-end predictor with the given config that runs on 195 | single device for a list of input images. 196 | 197 | Compared to using the model directly, this class does the following 198 | additions: 199 | 200 | 1. Load checkpoint from `cfg.MODEL.WEIGHTS`. 201 | 2. Always take BGR image as the input and apply conversion defined by 202 | `cfg.INPUT.FORMAT`. 203 | 3. Apply resizing defined by `cfg.INPUT.{MIN,MAX}_SIZE_TEST`. 204 | 4. Take a list of input images 205 | 206 | Attributes: 207 | metadata (Metadata): the metadata of the underlying dataset, obtained 208 | from cfg.DATASETS.TEST. 209 | 210 | """ 211 | 212 | def __init__(self, cfg): 213 | self.cfg = cfg.clone() # cfg can be modified by model 214 | self.model = build_model(self.cfg) 215 | self.model.eval() 216 | self.metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0]) 217 | 218 | checkpointer = DetectionCheckpointer(self.model) 219 | checkpointer.load(cfg.MODEL.WEIGHTS) 220 | 221 | self.input_format = cfg.INPUT.FORMAT 222 | assert self.input_format in ["RGB", "BGR"], self.input_format 223 | 224 | def __call__(self, image_list): 225 | """ 226 | Args: 227 | image_list (list of np.ndarray): a list of images of 228 | shape (H, W, C) (in BGR order). 229 | 230 | Returns: 231 | predictions (dict): 232 | the output of the model for all images. 233 | See :doc:`/tutorials/models` for details about the format. 234 | """ 235 | inputs = [] 236 | for original_image in image_list: 237 | # https://github.com/sphinx-doc/sphinx/issues/4258 238 | # Apply pre-processing to image. 
239 | if self.input_format == "RGB": 240 | # whether the model expects BGR inputs or RGB 241 | original_image = original_image[:, :, ::-1] 242 | height, width = original_image.shape[:2] 243 | image = original_image 244 | image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1)) 245 | 246 | instance = {"image": image, "height": height, "width": width} 247 | 248 | inputs.append(instance) 249 | 250 | with torch.no_grad(): 251 | predictions = self.model(inputs) 252 | return predictions 253 | -------------------------------------------------------------------------------- /envs/habitat/__init__.py: -------------------------------------------------------------------------------- 1 | # Parts of the code in this file have been borrowed from: 2 | # https://github.com/facebookresearch/habitat-api 3 | import os 4 | import pdb 5 | import sys 6 | 7 | import numpy as np 8 | import torch 9 | from habitat.config.default import get_config as cfg_env 10 | from habitat.datasets.pointnav.pointnav_dataset import PointNavDatasetV1 11 | from habitat import Config, Env, RLEnv, VectorEnv, make_dataset 12 | 13 | from agents.sem_exp import Sem_Exp_Env_Agent 14 | from agents.multi_attr_exp import Multi_Attr_Exp_Env_Agent 15 | from .objectgoal_env import ObjectGoal_Env 16 | from .objectgoal_env21 import ObjectGoal_Env21 17 | 18 | from .utils.vector_env import VectorEnv 19 | 20 | 21 | def make_env_fn(args, config_env, rank): 22 | dataset = make_dataset(config_env.DATASET.TYPE, config=config_env.DATASET) 23 | config_env.defrost() 24 | config_env.SIMULATOR.SCENE = dataset.episodes[0].scene_id 25 | config_env.freeze() 26 | if args.agent == "sem_exp": 27 | env = Sem_Exp_Env_Agent(args=args, rank=rank, 28 | config_env=config_env, 29 | dataset=dataset 30 | ) 31 | elif args.agent=='multi_attr_exp': 32 | env = Multi_Attr_Exp_Env_Agent(args=args, rank=rank, 33 | config_env=config_env, 34 | dataset=dataset 35 | ) 36 | elif args.agent == "obj21": 37 | env = ObjectGoal_Env21(args=args, rank=rank, 38 | config_env=config_env, 39 | dataset=dataset 40 | ) 41 | else: 42 | env = ObjectGoal_Env(args=args, rank=rank, 43 | config_env=config_env, 44 | dataset=dataset 45 | ) 46 | 47 | env.seed(rank) 48 | return env 49 | 50 | 51 | def _get_scenes_from_folder(content_dir): 52 | scene_dataset_ext = ".glb.json.gz" 53 | scenes = [] 54 | for filename in os.listdir(content_dir): 55 | if filename.endswith(scene_dataset_ext): 56 | scene = filename[: -len(scene_dataset_ext) + 4] 57 | scenes.append(scene) 58 | scenes.sort() 59 | return scenes 60 | 61 | 62 | def construct_envs(args): 63 | env_configs = [] 64 | args_list = [] 65 | 66 | basic_config = cfg_env(config_paths=["envs/habitat/configs/" 67 | + args.task_config]) 68 | basic_config.defrost() 69 | basic_config.DATASET.SPLIT = args.split 70 | 71 | # basic_config.DATASET.DATA_PATH = \ 72 | # basic_config.DATASET.DATA_PATH.replace("v1", args.version) 73 | # basic_config.DATASET.EPISODES_DIR = \ 74 | # basic_config.DATASET.EPISODES_DIR.replace("v1", args.version) 75 | basic_config.freeze() 76 | 77 | scenes = basic_config.DATASET.CONTENT_SCENES 78 | if "*" in basic_config.DATASET.CONTENT_SCENES: 79 | content_dir = os.path.join("data/datasets/objectnav/gibson/v1.1/" + args.split, "content") 80 | scenes = _get_scenes_from_folder(content_dir) 81 | 82 | if len(scenes) > 0: 83 | assert len(scenes) >= args.num_processes, ( 84 | "reduce the number of processes as there " 85 | "aren't enough number of scenes" 86 | ) 87 | 88 | scene_split_sizes = [int(np.floor(len(scenes) / args.num_processes)) 89 | 
for _ in range(args.num_processes)] 90 | for i in range(len(scenes) % args.num_processes): 91 | scene_split_sizes[i] += 1 92 | 93 | print("Scenes per thread:") 94 | for i in range(args.num_processes): 95 | config_env = cfg_env(config_paths=["envs/habitat/configs/" 96 | + args.task_config]) 97 | config_env.defrost() 98 | 99 | if len(scenes) > 0: 100 | config_env.DATASET.CONTENT_SCENES = scenes[ 101 | sum(scene_split_sizes[:i]): 102 | sum(scene_split_sizes[:i + 1]) 103 | ] 104 | print("Thread {}: {}".format(i, config_env.DATASET.CONTENT_SCENES)) 105 | 106 | if i < args.num_processes_on_first_gpu: 107 | gpu_id = 0 108 | else: 109 | gpu_id = int((i - args.num_processes_on_first_gpu) 110 | // args.num_processes_per_gpu) + args.sim_gpu_id 111 | gpu_id = min(torch.cuda.device_count() - 1, gpu_id) 112 | config_env.SIMULATOR.HABITAT_SIM_V0.GPU_DEVICE_ID = gpu_id 113 | 114 | agent_sensors = [] 115 | agent_sensors.append("RGB_SENSOR") 116 | agent_sensors.append("DEPTH_SENSOR") 117 | agent_sensors.append("SEMANTIC_SENSOR") 118 | 119 | config_env.SIMULATOR.AGENT_0.SENSORS = agent_sensors 120 | 121 | # Reseting episodes manually, setting high max episode length in sim 122 | config_env.ENVIRONMENT.MAX_EPISODE_STEPS = 10000000 123 | config_env.ENVIRONMENT.ITERATOR_OPTIONS.SHUFFLE = False 124 | 125 | config_env.SIMULATOR.RGB_SENSOR.WIDTH = args.env_frame_width 126 | config_env.SIMULATOR.RGB_SENSOR.HEIGHT = args.env_frame_height 127 | config_env.SIMULATOR.RGB_SENSOR.HFOV = args.hfov 128 | config_env.SIMULATOR.RGB_SENSOR.POSITION = [0, args.camera_height, 0] 129 | 130 | config_env.SIMULATOR.DEPTH_SENSOR.WIDTH = args.env_frame_width 131 | config_env.SIMULATOR.DEPTH_SENSOR.HEIGHT = args.env_frame_height 132 | config_env.SIMULATOR.DEPTH_SENSOR.HFOV = args.hfov 133 | config_env.SIMULATOR.DEPTH_SENSOR.MIN_DEPTH = args.min_depth 134 | config_env.SIMULATOR.DEPTH_SENSOR.MAX_DEPTH = args.max_depth 135 | config_env.SIMULATOR.DEPTH_SENSOR.POSITION = [0, args.camera_height, 0] 136 | 137 | config_env.SIMULATOR.SEMANTIC_SENSOR.WIDTH = args.env_frame_width 138 | config_env.SIMULATOR.SEMANTIC_SENSOR.HEIGHT = args.env_frame_height 139 | config_env.SIMULATOR.SEMANTIC_SENSOR.HFOV = args.hfov 140 | config_env.SIMULATOR.SEMANTIC_SENSOR.POSITION = \ 141 | [0, args.camera_height, 0] 142 | 143 | config_env.SIMULATOR.TURN_ANGLE = args.turn_angle 144 | config_env.DATASET.SPLIT = args.split 145 | # config_env.DATASET.DATA_PATH = \ 146 | # config_env.DATASET.DATA_PATH.replace("v1", args.version) 147 | # config_env.DATASET.EPISODES_DIR = \ 148 | # config_env.DATASET.EPISODES_DIR.replace("v1", args.version) 149 | 150 | config_env.freeze() 151 | env_configs.append(config_env) 152 | 153 | args_list.append(args) 154 | 155 | envs = VectorEnv( 156 | make_env_fn=make_env_fn, 157 | env_fn_args=tuple( 158 | tuple( 159 | zip(args_list, env_configs, range(args.num_processes)) 160 | ) 161 | ), 162 | ) 163 | 164 | return envs 165 | 166 | 167 | def construct_envs21(args): 168 | env_configs = [] 169 | args_list = [] 170 | 171 | basic_config = cfg_env(config_paths=["envs/habitat/configs/" 172 | + args.task_config]) 173 | basic_config.defrost() 174 | basic_config.DATASET.SPLIT = args.split 175 | basic_config.DATASET.DATA_PATH = \ 176 | basic_config.DATASET.DATA_PATH.replace("v1", args.version) 177 | # basic_config.DATASET.EPISODES_DIR = \ 178 | # basic_config.DATASET.EPISODES_DIR.replace("v1", args.version) 179 | basic_config.freeze() 180 | 181 | scenes = basic_config.DATASET.CONTENT_SCENES 182 | dataset = make_dataset(basic_config.DATASET.TYPE, 
config=basic_config.DATASET) 183 | if "*" in basic_config.DATASET.CONTENT_SCENES: 184 | scenes = dataset.get_scenes_to_load(basic_config.DATASET) 185 | 186 | if len(scenes) > 0: 187 | assert len(scenes) >= args.num_processes, ( 188 | "reduce the number of processes as there " 189 | "aren't enough number of scenes" 190 | ) 191 | 192 | scene_split_sizes = [int(np.floor(len(scenes) / args.num_processes)) 193 | for _ in range(args.num_processes)] 194 | for i in range(len(scenes) % args.num_processes): 195 | scene_split_sizes[i] += 1 196 | 197 | print("Scenes per thread:") 198 | for i in range(args.num_processes): 199 | config_env = cfg_env(config_paths=["envs/habitat/configs/" 200 | + args.task_config]) 201 | config_env.defrost() 202 | 203 | if len(scenes) > 0: 204 | config_env.DATASET.CONTENT_SCENES = scenes[ 205 | sum(scene_split_sizes[:i]): 206 | sum(scene_split_sizes[:i + 1]) 207 | ] 208 | print("Thread {}: {}".format(i, config_env.DATASET.CONTENT_SCENES)) 209 | 210 | if i < args.num_processes_on_first_gpu: 211 | gpu_id = 0 212 | else: 213 | gpu_id = int((i - args.num_processes_on_first_gpu) 214 | // args.num_processes_per_gpu) + args.sim_gpu_id 215 | gpu_id = min(torch.cuda.device_count() - 1, gpu_id) 216 | config_env.SIMULATOR.HABITAT_SIM_V0.GPU_DEVICE_ID = gpu_id 217 | 218 | agent_sensors = [] 219 | agent_sensors.append("RGB_SENSOR") 220 | agent_sensors.append("DEPTH_SENSOR") 221 | agent_sensors.append("SEMANTIC_SENSOR") 222 | 223 | config_env.SIMULATOR.AGENT_0.SENSORS = agent_sensors 224 | 225 | # Reseting episodes manually, setting high max episode length in sim 226 | config_env.ENVIRONMENT.MAX_EPISODE_STEPS = 10000000 227 | config_env.ENVIRONMENT.ITERATOR_OPTIONS.SHUFFLE = False 228 | 229 | config_env.SIMULATOR.RGB_SENSOR.WIDTH = args.env_frame_width 230 | config_env.SIMULATOR.RGB_SENSOR.HEIGHT = args.env_frame_height 231 | config_env.SIMULATOR.RGB_SENSOR.HFOV = args.hfov 232 | config_env.SIMULATOR.RGB_SENSOR.POSITION = [0, args.camera_height, 0] 233 | 234 | config_env.SIMULATOR.DEPTH_SENSOR.WIDTH = args.env_frame_width 235 | config_env.SIMULATOR.DEPTH_SENSOR.HEIGHT = args.env_frame_height 236 | config_env.SIMULATOR.DEPTH_SENSOR.HFOV = args.hfov 237 | config_env.SIMULATOR.DEPTH_SENSOR.MIN_DEPTH = args.min_depth 238 | config_env.SIMULATOR.DEPTH_SENSOR.MAX_DEPTH = args.max_depth 239 | config_env.SIMULATOR.DEPTH_SENSOR.POSITION = [0, args.camera_height, 0] 240 | 241 | config_env.SIMULATOR.SEMANTIC_SENSOR.WIDTH = args.env_frame_width 242 | config_env.SIMULATOR.SEMANTIC_SENSOR.HEIGHT = args.env_frame_height 243 | config_env.SIMULATOR.SEMANTIC_SENSOR.HFOV = args.hfov 244 | config_env.SIMULATOR.SEMANTIC_SENSOR.POSITION = \ 245 | [0, args.camera_height, 0] 246 | 247 | 248 | config_env.SIMULATOR.TURN_ANGLE = args.turn_angle 249 | config_env.DATASET.SPLIT = args.split 250 | config_env.DATASET.DATA_PATH = \ 251 | config_env.DATASET.DATA_PATH.replace("v1", args.version) 252 | # config_env.DATASET.EPISODES_DIR = \ 253 | # config_env.DATASET.EPISODES_DIR.replace("v1", args.version) 254 | 255 | config_env.freeze() 256 | env_configs.append(config_env) 257 | 258 | args_list.append(args) 259 | 260 | envs = VectorEnv( 261 | make_env_fn=make_env_fn, 262 | env_fn_args=tuple( 263 | tuple( 264 | zip(args_list, env_configs, range(args.num_processes)) 265 | ) 266 | ), 267 | ) 268 | 269 | return envs 270 | 271 | 272 | 273 | -------------------------------------------------------------------------------- /envs/habitat/objectgoal_env21.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | import bz2 3 | import gzip 4 | import _pickle as cPickle 5 | import pdb 6 | 7 | import gym 8 | import numpy as np 9 | import quaternion 10 | import skimage.morphology 11 | import habitat 12 | 13 | from envs.utils.fmm_planner import FMMPlanner 14 | from constants import category_to_id, mp3d_category_id 15 | import envs.utils.pose as pu 16 | 17 | coco_categories = [0, 3, 2, 4, 5, 1] 18 | 19 | class ObjectGoal_Env21(habitat.RLEnv): 20 | """The Object Goal Navigation environment class. The class is responsible 21 | for loading the dataset, generating episodes, and computing evaluation 22 | metrics. 23 | """ 24 | 25 | def __init__(self, args, rank, config_env, dataset): 26 | self.args = args 27 | self.rank = rank 28 | 29 | super().__init__(config_env, dataset) 30 | 31 | # Initializations 32 | self.episode_no = 0 33 | 34 | # Scene info 35 | self.last_scene_path = None 36 | self.scene_path = None 37 | self.scene_name = None 38 | 39 | # Episode Dataset info 40 | self.eps_data = None 41 | self.eps_data_idx = None 42 | self.gt_planner = None 43 | self.object_boundary = None 44 | self.goal_idx = None 45 | self.goal_name = None 46 | self.map_obj_origin = None 47 | self.starting_loc = None 48 | self.starting_distance = None 49 | 50 | # Episode tracking info 51 | self.curr_distance = None 52 | self.prev_distance = None 53 | self.timestep = None 54 | self.stopped = None 55 | self.path_length = None 56 | self.last_sim_location = None 57 | self.trajectory_states = [] 58 | self.info = {} 59 | self.info['distance_to_goal'] = None 60 | self.info['spl'] = None 61 | self.info['success'] = None 62 | 63 | # self.scene = self._env.sim.semantic_annotations() 64 | 65 | fileName = 'data/matterport_category_mappings.tsv' 66 | 67 | text = '' 68 | lines = [] 69 | items = [] 70 | self.hm3d_semantic_mapping={} 71 | 72 | with open(fileName, 'r') as f: 73 | text = f.read() 74 | lines = text.split('\n') 75 | 76 | for l in lines: 77 | items.append(l.split(' ')) 78 | 79 | for i in items: 80 | if len(i) > 3: 81 | self.hm3d_semantic_mapping[i[2]] = i[-1] 82 | 83 | # print() 84 | 85 | # for obj in self.scene.objects: 86 | # if obj is not None: 87 | # print( 88 | # f"Object id:{obj.id}, category:{obj.category.name()}, index:{obj.category.index()}" 89 | # f" center:{obj.aabb.center}, dims:{obj.aabb.sizes}" 90 | # ) 91 | 92 | def reset(self): 93 | """Resets the environment to a new episode. 
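Besides the RGB-D channels, the returned state also stacks the preprocessed semantic channel (see _preprocess_semantic).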
94 | 95 | Returns: 96 | obs (ndarray): RGBD observations (4 x H x W) 97 | info (dict): contains timestep, pose, goal category and 98 | evaluation metric info 99 | """ 100 | args = self.args 101 | # new_scene = self.episode_no % args.num_train_episodes == 0 102 | 103 | 104 | self.episode_no += 1 105 | 106 | # Initializations 107 | self.timestep = 0 108 | self.stopped = False 109 | self.path_length = 1e-5 110 | self.trajectory_states = [] 111 | 112 | # if new_scene: 113 | obs = super().reset() 114 | start_height = 0 115 | self.scene = self._env.sim.semantic_annotations() 116 | # start_height = self._env.current_episode.start_position[1] 117 | # goal_height = self.scene.objects[self._env.current_episode.info['closest_goal_object_id']].aabb.center[1] 118 | 119 | # floor_height = [] 120 | # floor_size = [] 121 | # for obj in self.scene.objects: 122 | # if obj.category.name() in self.hm3d_semantic_mapping and \ 123 | # self.hm3d_semantic_mapping[obj.category.name()] == 'floor': 124 | # floor_height.append(abs(obj.aabb.center[1] - start_height)) 125 | # floor_size.append(obj.aabb.sizes[0]*obj.aabb.sizes[2]) 126 | 127 | 128 | # if not args.eval: 129 | # while all(h > 0.1 or s < 12 for (h,s) in zip(floor_height, floor_size)) or abs(start_height-goal_height) > 1.2: 130 | # obs = super().reset() 131 | 132 | # self.scene = self._env.sim.semantic_annotations() 133 | # start_height = self._env.current_episode.start_position[1] 134 | # goal_height = self.scene.objects[self._env.current_episode.info['closest_goal_object_id']].aabb.center[1] 135 | 136 | # floor_height = [] 137 | # floor_size = [] 138 | # for obj in self.scene.objects: 139 | # if obj.category.name() in self.hm3d_semantic_mapping and \ 140 | # self.hm3d_semantic_mapping[obj.category.name()] == 'floor': 141 | # floor_height.append(abs(obj.aabb.center[1] - start_height)) 142 | # floor_size.append(obj.aabb.sizes[0]*obj.aabb.sizes[2]) 143 | 144 | self.prev_distance = self._env.get_metrics()["distance_to_goal"] 145 | self.starting_distance = self._env.get_metrics()["distance_to_goal"] 146 | self.info['starting_distance_to_goal']=self.starting_distance 147 | 148 | # print("obs: ,", obs) 149 | # print("obs shape: ,", obs.shape) 150 | rgb = obs['rgb'].astype(np.uint8) 151 | depth = obs['depth'] 152 | semantic = self._preprocess_semantic(obs["semantic"]) 153 | 154 | state = np.concatenate((rgb, depth, semantic), axis=2).transpose(2, 0, 1) 155 | self.last_sim_location = self.get_sim_location() 156 | 157 | # Set info 158 | self.info['time'] = self.timestep 159 | self.info['sensor_pose'] = [0., 0., 0.] 160 | self.info['goal_cat_id'] = coco_categories[obs['objectgoal'][0]] 161 | self.info['goal_name'] = category_to_id[obs['objectgoal'][0]] 162 | 163 | self.goal_name = category_to_id[obs['objectgoal'][0]] 164 | 165 | return state, self.info 166 | 167 | def step(self, action): 168 | """Function to take an action in the environment. 
169 | 170 | Args: 171 | action (dict): 172 | dict with following keys: 173 | 'action' (int): 0: stop, 1: forward, 2: left, 3: right 174 | 175 | Returns: 176 | obs (ndarray): RGBD observations (4 x H x W) 177 | reward (float): amount of reward returned after previous action 178 | done (bool): whether the episode has ended 179 | info (dict): contains timestep, pose, goal category and 180 | evaluation metric info 181 | """ 182 | action = action["action"] 183 | if action == 0: 184 | self.stopped = True 185 | # Not sending stop to simulator, resetting manually 186 | action = 3 187 | 188 | obs, rew, done, _ = super().step(action) 189 | 190 | # Get pose change 191 | dx, dy, do = self.get_pose_change() 192 | self.info['sensor_pose'] = [dx, dy, do] 193 | self.path_length += pu.get_l2_distance(0, dx, 0, dy) 194 | 195 | self.info['distance_to_goal'] = self._env.get_metrics()["distance_to_goal"] 196 | spl, success, dist = 0., 0., 0. 197 | if done: 198 | spl, success, dist = self.get_metrics() 199 | self.info['distance_to_goal'] = dist 200 | self.info['spl'] = spl 201 | self.info['success'] = success 202 | 203 | rgb = obs['rgb'].astype(np.uint8) 204 | depth = obs['depth'] 205 | semantic = self._preprocess_semantic(obs["semantic"]) 206 | state = np.concatenate((rgb, depth, semantic), axis=2).transpose(2, 0, 1) 207 | 208 | self.timestep += 1 209 | self.info['time'] = self.timestep 210 | 211 | 212 | 213 | 214 | return state, rew, done, self.info 215 | 216 | def _preprocess_semantic(self, semantic): 217 | # print("*********semantic type: ", type(semantic)) 218 | se = list(set(semantic.ravel())) 219 | # pdb.set_trace() 220 | for i in range(len(se)): 221 | if self.scene.objects[se[i]].category.name() in self.hm3d_semantic_mapping: 222 | hm3d_category_name = self.hm3d_semantic_mapping[self.scene.objects[se[i]].category.name()] 223 | else: 224 | hm3d_category_name = self.scene.objects[se[i]].category.name() 225 | 226 | if hm3d_category_name in mp3d_category_id: 227 | # print("sum: ", np.sum(sem_output[sem_output==se[i]])/se[i]) 228 | semantic[semantic==se[i]] = mp3d_category_id[hm3d_category_name]-1 229 | else : 230 | semantic[ 231 | semantic==se[i] 232 | ] = 0 233 | 234 | # se = list(set(semantic.ravel())) 235 | # print("semantic: ", se) # [] 236 | # semantic = np.expand_dims(semantic.astype(np.uint8), 2) 237 | return semantic.astype(np.uint8) 238 | 239 | def get_reward_range(self): 240 | """This function is not used, Habitat-RLEnv requires this function""" 241 | return (0., 1.0) 242 | 243 | def get_reward(self, observations): 244 | self.curr_distance = self._env.get_metrics()['distance_to_goal'] 245 | 246 | reward = (self.prev_distance - self.curr_distance) * \ 247 | self.args.reward_coeff 248 | 249 | self.prev_distance = self.curr_distance 250 | return reward 251 | 252 | def get_llm_distance(self, target_point_map, frontier_loc_g): 253 | 254 | frontier_dis_g = self.gt_planner.fmm_dist[frontier_loc_g[0], 255 | frontier_loc_g[1]] / 20.0 256 | tpm = len(list(set(target_point_map.ravel()))) -1 257 | for lay in range(tpm): 258 | frontier_loc = np.argwhere(target_point_map == lay+1) 259 | frontier_distance = self.gt_planner.fmm_dist[frontier_loc[0], 260 | frontier_loc[1]] / 20.0 261 | if frontier_distance > frontier_dis_g: 262 | return 0 263 | return 1 264 | 265 | def get_metrics(self): 266 | """This function computes evaluation metrics for the Object Goal task 267 | 268 | Returns: 269 | spl (float): Success weighted by Path Length 270 | (See https://arxiv.org/pdf/1807.06757.pdf) 271 | success (int): 0: Failure, 
1: Successful 272 | dist (float): Distance to Success (DTS), distance of the agent 273 | from the success threshold boundary in meters. 274 | (See https://arxiv.org/pdf/2007.00643.pdf) 275 | """ 276 | dist = self._env.get_metrics()['distance_to_goal'] 277 | if dist < 0.1: 278 | success = 1 279 | else: 280 | success = 0 281 | spl = min(success * self.starting_distance / self.path_length, 1) 282 | return spl, success, dist 283 | 284 | def get_done(self, observations): 285 | if self.info['time'] >= self.args.max_episode_length - 1: 286 | done = True 287 | elif self.stopped: 288 | done = True 289 | # print(self._env.get_metrics()) 290 | else: 291 | done = False 292 | return done 293 | 294 | def _episode_success(self): 295 | return self._env.get_metrics()['success'] 296 | 297 | def get_info(self, observations): 298 | """This function is not used, Habitat-RLEnv requires this function""" 299 | info = {} 300 | return info 301 | 302 | def get_sim_location(self): 303 | """Returns x, y, o pose of the agent in the Habitat simulator.""" 304 | 305 | agent_state = super().habitat_env.sim.get_agent_state(0) 306 | x = -agent_state.position[2] 307 | y = -agent_state.position[0] 308 | axis = quaternion.as_euler_angles(agent_state.rotation)[0] 309 | if (axis % (2 * np.pi)) < 0.1 or (axis % 310 | (2 * np.pi)) > 2 * np.pi - 0.1: 311 | o = quaternion.as_euler_angles(agent_state.rotation)[1] 312 | else: 313 | o = 2 * np.pi - quaternion.as_euler_angles(agent_state.rotation)[1] 314 | if o > np.pi: 315 | o -= 2 * np.pi 316 | return x, y, o 317 | 318 | def get_pose_change(self): 319 | """Returns dx, dy, do pose change of the agent relative to the last 320 | timestep.""" 321 | curr_sim_pose = self.get_sim_location() 322 | dx, dy, do = pu.get_rel_pose_change( 323 | curr_sim_pose, self.last_sim_location) 324 | self.last_sim_location = curr_sim_pose 325 | return dx, dy, do 326 | -------------------------------------------------------------------------------- /envs/utils/depth_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Utilities for processing depth images. 17 | """ 18 | import pdb 19 | from argparse import Namespace 20 | 21 | import itertools 22 | import numpy as np 23 | import torch 24 | 25 | import envs.utils.rotation_utils as ru 26 | 27 | 28 | def get_camera_matrix(width, height, fov): 29 | """Returns a camera matrix from image size and fov.""" 30 | xc = (width - 1.) / 2. 31 | zc = (height - 1.) / 2. 32 | f = (width / 2.) / np.tan(np.deg2rad(fov / 2.)) 33 | camera_matrix = {'xc': xc, 'zc': zc, 'f': f} 34 | camera_matrix = Namespace(**camera_matrix) 35 | return camera_matrix 36 | 37 | 38 | def get_point_cloud_from_z(Y, camera_matrix, scale=1): 39 | """Projects the depth image Y into a 3D point cloud. 
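Back-projection uses the pinhole model: X = (u - xc) * Y / f and Z = (v - zc) * Y / f, where u is the column index and v the vertically flipped row index of each pixel.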
40 | Inputs: 41 | Y is ...xHxW 42 | camera_matrix 43 | Outputs: 44 | X is positive going right 45 | Y is positive into the image 46 | Z is positive up in the image 47 | XYZ is ...xHxWx3 48 | """ 49 | x, z = np.meshgrid(np.arange(Y.shape[-1]), 50 | np.arange(Y.shape[-2] - 1, -1, -1)) 51 | for _ in range(Y.ndim - 2): 52 | x = np.expand_dims(x, axis=0) 53 | z = np.expand_dims(z, axis=0) 54 | X = (x[::scale, ::scale] - camera_matrix.xc) * \ 55 | Y[::scale, ::scale] / camera_matrix.f 56 | Z = (z[::scale, ::scale] - camera_matrix.zc) * \ 57 | Y[::scale, ::scale] / camera_matrix.f 58 | XYZ = np.concatenate((X[..., np.newaxis], 59 | Y[::scale, ::scale][..., np.newaxis], 60 | Z[..., np.newaxis]), axis=X.ndim) 61 | return XYZ 62 | 63 | 64 | def transform_camera_view(XYZ, sensor_height, camera_elevation_degree): 65 | """ 66 | Transforms the point cloud into geocentric frame to account for 67 | camera elevation and angle 68 | Input: 69 | XYZ : ...x3 70 | sensor_height : height of the sensor 71 | camera_elevation_degree : camera elevation to rectify. 72 | Output: 73 | XYZ : ...x3 74 | """ 75 | R = ru.get_r_matrix( 76 | [1., 0., 0.], angle=np.deg2rad(camera_elevation_degree)) 77 | XYZ = np.matmul(XYZ.reshape(-1, 3), R.T).reshape(XYZ.shape) 78 | XYZ[..., 2] = XYZ[..., 2] + sensor_height 79 | return XYZ 80 | 81 | 82 | def transform_pose(XYZ, current_pose): 83 | """ 84 | Transforms the point cloud into geocentric frame to account for 85 | camera position 86 | Input: 87 | XYZ : ...x3 88 | current_pose : camera position (x, y, theta (radians)) 89 | Output: 90 | XYZ : ...x3 91 | """ 92 | R = ru.get_r_matrix([0., 0., 1.], angle=current_pose[2] - np.pi / 2.) 93 | XYZ = np.matmul(XYZ.reshape(-1, 3), R.T).reshape(XYZ.shape) 94 | XYZ[:, :, 0] = XYZ[:, :, 0] + current_pose[0] 95 | XYZ[:, :, 1] = XYZ[:, :, 1] + current_pose[1] 96 | return XYZ 97 | 98 | 99 | def bin_points(XYZ_cms, map_size, z_bins, xy_resolution): 100 | """Bins points into xy-z bins 101 | XYZ_cms is ... x H x W x3 102 | Outputs is ... x map_size x map_size x (len(z_bins)+1) 103 | """ 104 | sh = XYZ_cms.shape 105 | XYZ_cms = XYZ_cms.reshape([-1, sh[-3], sh[-2], sh[-1]]) 106 | n_z_bins = len(z_bins) + 1 107 | counts = [] 108 | for XYZ_cm in XYZ_cms: 109 | isnotnan = np.logical_not(np.isnan(XYZ_cm[:, :, 0])) 110 | X_bin = np.round(XYZ_cm[:, :, 0] / xy_resolution).astype(np.int32) 111 | Y_bin = np.round(XYZ_cm[:, :, 1] / xy_resolution).astype(np.int32) 112 | Z_bin = np.digitize(XYZ_cm[:, :, 2], bins=z_bins).astype(np.int32) 113 | 114 | isvalid = np.array([X_bin >= 0, X_bin < map_size, Y_bin >= 0, 115 | Y_bin < map_size, 116 | Z_bin >= 0, Z_bin < n_z_bins, isnotnan]) 117 | isvalid = np.all(isvalid, axis=0) 118 | 119 | ind = (Y_bin * map_size + X_bin) * n_z_bins + Z_bin 120 | ind[np.logical_not(isvalid)] = 0 121 | count = np.bincount(ind.ravel(), isvalid.ravel().astype(np.int32), 122 | minlength=map_size * map_size * n_z_bins) 123 | counts = np.reshape(count, [map_size, map_size, n_z_bins]) 124 | 125 | counts = counts.reshape(list(sh[:-3]) + [map_size, map_size, n_z_bins]) 126 | 127 | return counts 128 | 129 | def bin_semantic_points(XYZ_cms, semantic, map_size, semantic_map_len, xy_resolution): 130 | """Bins points into xy-z bins 131 | XYZ_cms is ... x H x W x3 132 | semantic is ... x H x W 133 | Outputs is ... 
x map_size x map_size x len 134 | """ 135 | # print("XYZ_cms: ",XYZ_cms.shape) #128*128*3 136 | # XYZ_cms = XYZ_cms.reshape([-1, sh[-3], sh[-2], sh[-1]]) 137 | # print("XYZ_cms: ",XYZ_cms) 138 | # print("XYZ_cms: ",XYZ_cms.shape) #1*128*128*3 139 | # print("z_bins: ",z_bins) 140 | # se = list(set(semantic.ravel())) 141 | # print("bin_semantic_points: ", se) # [] 142 | # print("XYZ_cm:", XYZ_cm.shape) # 128*128*3 143 | isnotnan = np.logical_not(np.isnan(XYZ_cms[:, :, 0])) 144 | X_bin = np.round(XYZ_cms[:, :, 0] / xy_resolution).astype(np.int32) 145 | Y_bin = np.round(XYZ_cms[:, :, 1] / xy_resolution).astype(np.int32) 146 | Z_bin = semantic.astype(np.int32) 147 | 148 | # print("X_bin S count: ", Counter(X_bin.ravel())) 149 | # print("Y_bin S count: ", Counter(Y_bin.ravel())) 150 | # print("Z_bin S count: ", Counter(Z_bin.ravel())) 151 | # print("Z_bin: {}".format(X_bin)) 152 | # print("X_bin: {}".format(X_bin.shape)) #128*128 153 | # print("Z_bin: {}".format(Z_bin.shape)) #128*128 154 | 155 | isvalid = np.array([X_bin >= 0, X_bin < map_size, Y_bin >= 0, Y_bin < map_size, 156 | Z_bin > 0, Z_bin <= semantic_map_len, isnotnan]) 157 | isvalid = np.all(isvalid, axis=0) 158 | 159 | # print("isvalid: ", Counter(isvalid.ravel())) 160 | # print("isvalid: ", isvalid.shape) #128*128 161 | 162 | ind = (Y_bin * map_size + X_bin) * semantic_map_len + Z_bin-1 163 | ind[np.logical_not(isvalid)] = 0 164 | # print("ind: ", Counter(ind.ravel())) 165 | # print("ind: ", ind.shape) # 128*128 166 | count = np.bincount(ind.ravel(), isvalid.ravel().astype(np.int32), 167 | minlength=map_size * map_size * semantic_map_len) 168 | # print("counts: ", count.shape) # 4838400 169 | 170 | counts = np.reshape(count, [map_size, map_size, semantic_map_len]) 171 | # print("counts: ", counts.shape) # 480*480*semantic_map_len 172 | 173 | return counts 174 | 175 | 176 | def get_point_cloud_from_z_t(Y_t, camera_matrix, device, scale=1): 177 | """Projects the depth image Y into a 3D point cloud. 178 | Inputs: 179 | Y is ...xHxW 180 | camera_matrix 181 | Outputs: 182 | X is positive going right 183 | Y is positive into the image 184 | Z is positive up in the image 185 | XYZ is ...xHxWx3 186 | """ 187 | grid_x, grid_z = torch.meshgrid(torch.arange(Y_t.shape[-1]), 188 | torch.arange(Y_t.shape[-2] - 1, -1, -1)) 189 | grid_x = grid_x.transpose(1, 0).to(device) 190 | grid_z = grid_z.transpose(1, 0).to(device) 191 | grid_x = grid_x.unsqueeze(0).expand(Y_t.size()) 192 | grid_z = grid_z.unsqueeze(0).expand(Y_t.size()) 193 | 194 | X_t = (grid_x[:, ::scale, ::scale] - camera_matrix.xc) * \ 195 | Y_t[:, ::scale, ::scale] / camera_matrix.f 196 | Z_t = (grid_z[:, ::scale, ::scale] - camera_matrix.zc) * \ 197 | Y_t[:, ::scale, ::scale] / camera_matrix.f 198 | 199 | XYZ = torch.stack( 200 | (X_t, Y_t[:, ::scale, ::scale], Z_t), dim=len(Y_t.size())) 201 | 202 | return XYZ 203 | 204 | 205 | def transform_camera_view_t( 206 | XYZ, sensor_height, camera_elevation_degree, device): 207 | """ 208 | Transforms the point cloud into geocentric frame to account for 209 | camera elevation and angle 210 | Input: 211 | XYZ : ...x3 212 | sensor_height : height of the sensor 213 | camera_elevation_degree : camera elevation to rectify. 
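(one angle per batch element, in degrees)
device : torch device used for the rotation matrices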
214 | Output: 215 | XYZ : ...x3 216 | """ 217 | for i in range(XYZ.shape[0]): 218 | R = ru.get_r_matrix( 219 | [1., 0., 0.], angle=np.deg2rad(camera_elevation_degree[i])) 220 | XYZ[i] = torch.matmul(XYZ[i],torch.from_numpy(R).float().transpose(1, 0).to(device) 221 | ) 222 | XYZ[..., 2] = XYZ[..., 2] + sensor_height 223 | return XYZ 224 | 225 | 226 | def transform_pose_t(XYZ, current_pose, device): 227 | """ 228 | Transforms the point cloud into geocentric frame to account for 229 | camera position 230 | Input: 231 | XYZ : ...x3 232 | current_pose : camera position (x, y, theta (radians)) 233 | Output: 234 | XYZ : ...x3 235 | """ 236 | R = ru.get_r_matrix([0., 0., 1.], angle=current_pose[2] - np.pi / 2.) 237 | XYZ = torch.matmul(XYZ.reshape(-1, 3), 238 | torch.from_numpy(R).float().transpose(1, 0).to(device) 239 | ).reshape(XYZ.shape) 240 | XYZ[..., 0] += current_pose[0] 241 | XYZ[..., 1] += current_pose[1] 242 | return XYZ 243 | 244 | 245 | def splat_feat_nd(init_grid, feat, coords): 246 | """ 247 | Args: 248 | init_grid: B X nF X W X H X D X .. 249 | feat: B X nF X nPt 250 | coords: B X nDims X nPt in [-1, 1] 251 | Returns: 252 | grid: B X nF X W X H X D X .. 253 | """ 254 | wts_dim = [] 255 | pos_dim = [] 256 | grid_dims = init_grid.shape[2:] 257 | 258 | B = init_grid.shape[0] 259 | F = init_grid.shape[1] 260 | 261 | n_dims = len(grid_dims) 262 | 263 | grid_flat = init_grid.view(B, F, -1) 264 | 265 | for d in range(n_dims): 266 | pos = coords[:, [d], :] * grid_dims[d] / 2 + grid_dims[d] / 2 267 | pos_d = [] 268 | wts_d = [] 269 | 270 | for ix in [0, 1]: 271 | pos_ix = torch.floor(pos) + ix 272 | safe_ix = (pos_ix > 0) & (pos_ix < grid_dims[d]) 273 | safe_ix = safe_ix.type(pos.dtype) 274 | 275 | wts_ix = 1 - torch.abs(pos - pos_ix) 276 | 277 | wts_ix = wts_ix * safe_ix 278 | pos_ix = pos_ix * safe_ix 279 | 280 | pos_d.append(pos_ix) 281 | wts_d.append(wts_ix) 282 | 283 | pos_dim.append(pos_d) 284 | wts_dim.append(wts_d) 285 | 286 | l_ix = [[0, 1] for d in range(n_dims)] 287 | # pdb.set_trace() 288 | for ix_d in itertools.product(*l_ix): 289 | wts = torch.ones_like(wts_dim[0][0]) 290 | index = torch.zeros_like(wts_dim[0][0]) 291 | for d in range(n_dims): 292 | index = index * grid_dims[d] + pos_dim[d][ix_d[d]] 293 | wts = wts * wts_dim[d][ix_d[d]] 294 | 295 | index = index.long() 296 | grid_flat.scatter_add_(2, index.expand(-1, F, -1), feat * wts) 297 | grid_flat = torch.round(grid_flat) 298 | 299 | return grid_flat.view(init_grid.shape) 300 | 301 | def splat_att_feat_nd(init_grid, feat, coords): 302 | """ 303 | Args: 304 | init_grid: B X nF X W X H X D X .. 305 | feat: B X nF X nPt 306 | coords: B X nDims X nPt in [-1, 1] 307 | Returns: 308 | grid: B X nF X W X H X D X .. 
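Note: the last channel of feat is treated as an attention/score channel and is merged into the grid with an element-wise max, while the remaining channels use the additive splat of splat_feat_nd.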
309 | """ 310 | wts_dim = [] 311 | pos_dim = [] 312 | grid_dims = init_grid.shape[2:] 313 | 314 | B = init_grid.shape[0] 315 | F = init_grid.shape[1] 316 | 317 | n_dims = len(grid_dims) 318 | 319 | grid_flat = init_grid[:,:-1,...].view(B, F-1, -1) 320 | grid_flat_att = init_grid[:,-1:,...].view(B,1, -1) 321 | 322 | for d in range(n_dims): 323 | pos = coords[:, [d], :] * grid_dims[d] / 2 + grid_dims[d] / 2 324 | pos_d = [] 325 | wts_d = [] 326 | 327 | for ix in [0, 1]: 328 | pos_ix = torch.floor(pos) + ix 329 | safe_ix = (pos_ix > 0) & (pos_ix < grid_dims[d]) 330 | safe_ix = safe_ix.type(pos.dtype) 331 | 332 | wts_ix = 1 - torch.abs(pos - pos_ix) 333 | 334 | wts_ix = wts_ix * safe_ix 335 | pos_ix = pos_ix * safe_ix 336 | 337 | pos_d.append(pos_ix) 338 | wts_d.append(wts_ix) 339 | 340 | pos_dim.append(pos_d) 341 | wts_dim.append(wts_d) 342 | 343 | l_ix = [[0, 1] for d in range(n_dims)] 344 | for ix_d in itertools.product(*l_ix): 345 | wts = torch.ones_like(wts_dim[0][0]) 346 | index = torch.zeros_like(wts_dim[0][0]) 347 | for d in range(n_dims): 348 | index = index * grid_dims[d] + pos_dim[d][ix_d[d]] 349 | wts = wts * wts_dim[d][ix_d[d]] 350 | 351 | index = index.long() 352 | grid_flat.scatter_add_(2, index.expand(-1, F-1, -1), feat[:,:-1,:] * wts) 353 | grid_flat = torch.round(grid_flat) 354 | 355 | current_values = grid_flat_att.gather(2, index) 356 | new_val = torch.max(current_values, (feat[:,-1:,:] * wts)) 357 | grid_flat_att.scatter_(2, index, new_val) 358 | 359 | final_grid_flat=torch.cat([grid_flat,grid_flat_att],dim=1) 360 | 361 | assert final_grid_flat.shape==init_grid.view(B, F, -1).shape 362 | return final_grid_flat.view(init_grid.shape) 363 | -------------------------------------------------------------------------------- /envs/utils/depth_utils_multi.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Utilities for processing depth images. 17 | """ 18 | import pdb 19 | from argparse import Namespace 20 | 21 | import itertools 22 | import numpy as np 23 | import torch 24 | 25 | import envs.utils.rotation_utils as ru 26 | 27 | 28 | def get_camera_matrix(width, height, fov): 29 | """Returns a camera matrix from image size and fov.""" 30 | xc = (width - 1.) / 2. 31 | zc = (height - 1.) / 2. 32 | f = (width / 2.) / np.tan(np.deg2rad(fov / 2.)) 33 | camera_matrix = {'xc': xc, 'zc': zc, 'f': f} 34 | camera_matrix = Namespace(**camera_matrix) 35 | return camera_matrix 36 | 37 | 38 | def get_point_cloud_from_z(Y, camera_matrix, scale=1): 39 | """Projects the depth image Y into a 3D point cloud. 
40 | Inputs: 41 | Y is ...xHxW 42 | camera_matrix 43 | Outputs: 44 | X is positive going right 45 | Y is positive into the image 46 | Z is positive up in the image 47 | XYZ is ...xHxWx3 48 | """ 49 | x, z = np.meshgrid(np.arange(Y.shape[-1]), 50 | np.arange(Y.shape[-2] - 1, -1, -1)) 51 | for _ in range(Y.ndim - 2): 52 | x = np.expand_dims(x, axis=0) 53 | z = np.expand_dims(z, axis=0) 54 | X = (x[::scale, ::scale] - camera_matrix.xc) * \ 55 | Y[::scale, ::scale] / camera_matrix.f 56 | Z = (z[::scale, ::scale] - camera_matrix.zc) * \ 57 | Y[::scale, ::scale] / camera_matrix.f 58 | XYZ = np.concatenate((X[..., np.newaxis], 59 | Y[::scale, ::scale][..., np.newaxis], 60 | Z[..., np.newaxis]), axis=X.ndim) 61 | return XYZ 62 | 63 | 64 | def transform_camera_view(XYZ, sensor_height, camera_elevation_degree): 65 | """ 66 | Transforms the point cloud into geocentric frame to account for 67 | camera elevation and angle 68 | Input: 69 | XYZ : ...x3 70 | sensor_height : height of the sensor 71 | camera_elevation_degree : camera elevation to rectify. 72 | Output: 73 | XYZ : ...x3 74 | """ 75 | R = ru.get_r_matrix( 76 | [1., 0., 0.], angle=np.deg2rad(camera_elevation_degree)) 77 | XYZ = np.matmul(XYZ.reshape(-1, 3), R.T).reshape(XYZ.shape) 78 | XYZ[..., 2] = XYZ[..., 2] + sensor_height 79 | return XYZ 80 | 81 | 82 | def transform_pose(XYZ, current_pose): 83 | """ 84 | Transforms the point cloud into geocentric frame to account for 85 | camera position 86 | Input: 87 | XYZ : ...x3 88 | current_pose : camera position (x, y, theta (radians)) 89 | Output: 90 | XYZ : ...x3 91 | """ 92 | R = ru.get_r_matrix([0., 0., 1.], angle=current_pose[2] - np.pi / 2.) 93 | XYZ = np.matmul(XYZ.reshape(-1, 3), R.T).reshape(XYZ.shape) 94 | XYZ[:, :, 0] = XYZ[:, :, 0] + current_pose[0] 95 | XYZ[:, :, 1] = XYZ[:, :, 1] + current_pose[1] 96 | return XYZ 97 | 98 | 99 | def bin_points(XYZ_cms, map_size, z_bins, xy_resolution): 100 | """Bins points into xy-z bins 101 | XYZ_cms is ... x H x W x3 102 | Outputs is ... x map_size x map_size x (len(z_bins)+1) 103 | """ 104 | sh = XYZ_cms.shape 105 | XYZ_cms = XYZ_cms.reshape([-1, sh[-3], sh[-2], sh[-1]]) 106 | n_z_bins = len(z_bins) + 1 107 | counts = [] 108 | for XYZ_cm in XYZ_cms: 109 | isnotnan = np.logical_not(np.isnan(XYZ_cm[:, :, 0])) 110 | X_bin = np.round(XYZ_cm[:, :, 0] / xy_resolution).astype(np.int32) 111 | Y_bin = np.round(XYZ_cm[:, :, 1] / xy_resolution).astype(np.int32) 112 | Z_bin = np.digitize(XYZ_cm[:, :, 2], bins=z_bins).astype(np.int32) 113 | 114 | isvalid = np.array([X_bin >= 0, X_bin < map_size, Y_bin >= 0, 115 | Y_bin < map_size, 116 | Z_bin >= 0, Z_bin < n_z_bins, isnotnan]) 117 | isvalid = np.all(isvalid, axis=0) 118 | 119 | ind = (Y_bin * map_size + X_bin) * n_z_bins + Z_bin 120 | ind[np.logical_not(isvalid)] = 0 121 | count = np.bincount(ind.ravel(), isvalid.ravel().astype(np.int32), 122 | minlength=map_size * map_size * n_z_bins) 123 | counts = np.reshape(count, [map_size, map_size, n_z_bins]) 124 | 125 | counts = counts.reshape(list(sh[:-3]) + [map_size, map_size, n_z_bins]) 126 | 127 | return counts 128 | 129 | def bin_semantic_points(XYZ_cms, semantic, map_size, semantic_map_len, xy_resolution): 130 | """Bins points into xy-z bins 131 | XYZ_cms is ... x H x W x3 132 | semantic is ... x H x W 133 | Outputs is ... 
x map_size x map_size x len 134 | """ 135 | # print("XYZ_cms: ",XYZ_cms.shape) #128*128*3 136 | # XYZ_cms = XYZ_cms.reshape([-1, sh[-3], sh[-2], sh[-1]]) 137 | # print("XYZ_cms: ",XYZ_cms) 138 | # print("XYZ_cms: ",XYZ_cms.shape) #1*128*128*3 139 | # print("z_bins: ",z_bins) 140 | # se = list(set(semantic.ravel())) 141 | # print("bin_semantic_points: ", se) # [] 142 | # print("XYZ_cm:", XYZ_cm.shape) # 128*128*3 143 | isnotnan = np.logical_not(np.isnan(XYZ_cms[:, :, 0])) 144 | X_bin = np.round(XYZ_cms[:, :, 0] / xy_resolution).astype(np.int32) 145 | Y_bin = np.round(XYZ_cms[:, :, 1] / xy_resolution).astype(np.int32) 146 | Z_bin = semantic.astype(np.int32) 147 | 148 | # print("X_bin S count: ", Counter(X_bin.ravel())) 149 | # print("Y_bin S count: ", Counter(Y_bin.ravel())) 150 | # print("Z_bin S count: ", Counter(Z_bin.ravel())) 151 | # print("Z_bin: {}".format(X_bin)) 152 | # print("X_bin: {}".format(X_bin.shape)) #128*128 153 | # print("Z_bin: {}".format(Z_bin.shape)) #128*128 154 | 155 | isvalid = np.array([X_bin >= 0, X_bin < map_size, Y_bin >= 0, Y_bin < map_size, 156 | Z_bin > 0, Z_bin <= semantic_map_len, isnotnan]) 157 | isvalid = np.all(isvalid, axis=0) 158 | 159 | # print("isvalid: ", Counter(isvalid.ravel())) 160 | # print("isvalid: ", isvalid.shape) #128*128 161 | 162 | ind = (Y_bin * map_size + X_bin) * semantic_map_len + Z_bin-1 163 | ind[np.logical_not(isvalid)] = 0 164 | # print("ind: ", Counter(ind.ravel())) 165 | # print("ind: ", ind.shape) # 128*128 166 | count = np.bincount(ind.ravel(), isvalid.ravel().astype(np.int32), 167 | minlength=map_size * map_size * semantic_map_len) 168 | # print("counts: ", count.shape) # 4838400 169 | 170 | counts = np.reshape(count, [map_size, map_size, semantic_map_len]) 171 | # print("counts: ", counts.shape) # 480*480*semantic_map_len 172 | 173 | return counts 174 | 175 | 176 | def get_point_cloud_from_z_t(Y_t, camera_matrix, device, scale=1): 177 | """Projects the depth image Y into a 3D point cloud. 178 | Inputs: 179 | Y is ...xHxW 180 | camera_matrix 181 | Outputs: 182 | X is positive going right 183 | Y is positive into the image 184 | Z is positive up in the image 185 | XYZ is ...xHxWx3 186 | """ 187 | grid_x, grid_z = torch.meshgrid(torch.arange(Y_t.shape[-1]), 188 | torch.arange(Y_t.shape[-2] - 1, -1, -1)) 189 | grid_x = grid_x.transpose(1, 0).to(device) 190 | grid_z = grid_z.transpose(1, 0).to(device) 191 | grid_x = grid_x.unsqueeze(0).expand(Y_t.size()) 192 | grid_z = grid_z.unsqueeze(0).expand(Y_t.size()) 193 | 194 | X_t = (grid_x[:, ::scale, ::scale] - camera_matrix.xc) * \ 195 | Y_t[:, ::scale, ::scale] / camera_matrix.f 196 | Z_t = (grid_z[:, ::scale, ::scale] - camera_matrix.zc) * \ 197 | Y_t[:, ::scale, ::scale] / camera_matrix.f 198 | 199 | XYZ = torch.stack( 200 | (X_t, Y_t[:, ::scale, ::scale], Z_t), dim=len(Y_t.size())) 201 | 202 | return XYZ 203 | 204 | 205 | def transform_camera_view_t( 206 | XYZ, sensor_height, camera_elevation_degree, device): 207 | """ 208 | Transforms the point cloud into geocentric frame to account for 209 | camera elevation and angle 210 | Input: 211 | XYZ : ...x3 212 | sensor_height : height of the sensor 213 | camera_elevation_degree : camera elevation to rectify. 
214 | Output: 215 | XYZ : ...x3 216 | """ 217 | for i in range(XYZ.shape[0]): 218 | R = ru.get_r_matrix( 219 | [1., 0., 0.], angle=np.deg2rad(camera_elevation_degree[i])) 220 | XYZ[i] = torch.matmul(XYZ[i],torch.from_numpy(R).float().transpose(1, 0).to(device) 221 | ) 222 | XYZ[..., 2] = XYZ[..., 2] + sensor_height 223 | return XYZ 224 | 225 | 226 | def transform_pose_t(XYZ, current_pose, device): 227 | """ 228 | Transforms the point cloud into geocentric frame to account for 229 | camera position 230 | Input: 231 | XYZ : ...x3 232 | current_pose : camera position (x, y, theta (radians)) 233 | Output: 234 | XYZ : ...x3 235 | """ 236 | R = ru.get_r_matrix([0., 0., 1.], angle=current_pose[2] - np.pi / 2.) 237 | XYZ = torch.matmul(XYZ.reshape(-1, 3), 238 | torch.from_numpy(R).float().transpose(1, 0).to(device) 239 | ).reshape(XYZ.shape) 240 | XYZ[..., 0] += current_pose[0] 241 | XYZ[..., 1] += current_pose[1] 242 | return XYZ 243 | 244 | 245 | def splat_feat_nd(init_grid, feat, coords): 246 | """ 247 | Args: 248 | init_grid: B X nF X W X H X D X .. 249 | feat: B X nF X nPt 250 | coords: B X nDims X nPt in [-1, 1] 251 | Returns: 252 | grid: B X nF X W X H X D X .. 253 | """ 254 | wts_dim = [] 255 | pos_dim = [] 256 | grid_dims = init_grid.shape[2:] 257 | 258 | B = init_grid.shape[0] 259 | F = init_grid.shape[1] 260 | 261 | n_dims = len(grid_dims) 262 | 263 | grid_flat = init_grid.view(B, F, -1) 264 | 265 | for d in range(n_dims): 266 | pos = coords[:, [d], :] * grid_dims[d] / 2 + grid_dims[d] / 2 267 | pos_d = [] 268 | wts_d = [] 269 | 270 | for ix in [0, 1]: 271 | pos_ix = torch.floor(pos) + ix 272 | safe_ix = (pos_ix > 0) & (pos_ix < grid_dims[d]) 273 | safe_ix = safe_ix.type(pos.dtype) 274 | 275 | wts_ix = 1 - torch.abs(pos - pos_ix) 276 | 277 | wts_ix = wts_ix * safe_ix 278 | pos_ix = pos_ix * safe_ix 279 | 280 | pos_d.append(pos_ix) 281 | wts_d.append(wts_ix) 282 | 283 | pos_dim.append(pos_d) 284 | wts_dim.append(wts_d) 285 | 286 | l_ix = [[0, 1] for d in range(n_dims)] 287 | # pdb.set_trace() 288 | for ix_d in itertools.product(*l_ix): 289 | wts = torch.ones_like(wts_dim[0][0]) 290 | index = torch.zeros_like(wts_dim[0][0]) 291 | for d in range(n_dims): 292 | index = index * grid_dims[d] + pos_dim[d][ix_d[d]] 293 | wts = wts * wts_dim[d][ix_d[d]] 294 | 295 | index = index.long() 296 | grid_flat.scatter_add_(2, index.expand(-1, F, -1), feat * wts) 297 | grid_flat = torch.round(grid_flat) 298 | 299 | return grid_flat.view(init_grid.shape) 300 | 301 | def splat_att_feat_nd(init_grid, feat, coords,num_atts): 302 | """ 303 | Args: 304 | init_grid: B X nF X W X H X D X .. 305 | feat: B X nF X nPt 306 | coords: B X nDims X nPt in [-1, 1] 307 | Returns: 308 | grid: B X nF X W X H X D X .. 
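num_atts: number of trailing attribute/score channels in feat and init_grid; these channels are merged with an element-wise max instead of scatter_add_, the remaining nF - num_atts channels behave as in splat_feat_nd.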
309 | """ 310 | wts_dim = [] 311 | pos_dim = [] 312 | grid_dims = init_grid.shape[2:] 313 | 314 | B = init_grid.shape[0] 315 | F = init_grid.shape[1] 316 | 317 | n_dims = len(grid_dims) 318 | # pdb.set_trace() 319 | grid_flat = init_grid[:,:-num_atts,...].view(B, F-num_atts, -1) 320 | grid_flat_att = init_grid[:,-num_atts:,...].view(B,num_atts, -1) 321 | 322 | for d in range(n_dims): 323 | pos = coords[:, [d], :] * grid_dims[d] / 2 + grid_dims[d] / 2 324 | pos_d = [] 325 | wts_d = [] 326 | 327 | for ix in [0, 1]: 328 | pos_ix = torch.floor(pos) + ix 329 | safe_ix = (pos_ix > 0) & (pos_ix < grid_dims[d]) 330 | safe_ix = safe_ix.type(pos.dtype) 331 | 332 | wts_ix = 1 - torch.abs(pos - pos_ix) 333 | 334 | wts_ix = wts_ix * safe_ix 335 | pos_ix = pos_ix * safe_ix 336 | 337 | pos_d.append(pos_ix) 338 | wts_d.append(wts_ix) 339 | 340 | pos_dim.append(pos_d) 341 | wts_dim.append(wts_d) 342 | 343 | l_ix = [[0, 1] for d in range(n_dims)] 344 | for ix_d in itertools.product(*l_ix): 345 | wts = torch.ones_like(wts_dim[0][0]) 346 | index = torch.zeros_like(wts_dim[0][0]) 347 | for d in range(n_dims): 348 | index = index * grid_dims[d] + pos_dim[d][ix_d[d]] 349 | wts = wts * wts_dim[d][ix_d[d]] 350 | 351 | index = index.long() 352 | grid_flat.scatter_add_(2, index.expand(-1, F-num_atts, -1), feat[:,:-num_atts,:] * wts) 353 | grid_flat = torch.round(grid_flat) 354 | 355 | current_values = grid_flat_att.gather(2, index.expand(-1, num_atts, -1)) 356 | new_val = torch.max(current_values, (feat[:,-num_atts:,:] * wts)) 357 | grid_flat_att.scatter_(2, index.expand(-1, num_atts, -1), new_val) 358 | 359 | final_grid_flat=torch.cat([grid_flat,grid_flat_att],dim=1) 360 | 361 | assert final_grid_flat.shape==init_grid.view(B, F, -1).shape 362 | return final_grid_flat.view(init_grid.shape) 363 | -------------------------------------------------------------------------------- /arguments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | 4 | 5 | def get_args(): 6 | parser = argparse.ArgumentParser( 7 | description='Goal-Oriented-Semantic-Exploration') 8 | 9 | # General Arguments 10 | parser.add_argument('--seed', type=int, default=1, 11 | help='random seed (default: 1)') 12 | parser.add_argument('--auto_gpu_config', type=int, default=1) 13 | parser.add_argument('--total_num_scenes', type=str, default="auto") 14 | parser.add_argument('-n', '--num_processes', type=int, default=5, 15 | help="""how many training processes to use (default:5) 16 | Overridden when auto_gpu_config=1 17 | and training on gpus""") 18 | parser.add_argument('--num_processes_per_gpu', type=int, default=6) 19 | parser.add_argument('--num_processes_on_first_gpu', type=int, default=1) 20 | parser.add_argument('--eval', type=int, default=0, 21 | help='0: Train, 1: Evaluate (default: 0)') 22 | parser.add_argument('--num_training_frames', type=int, default=10000000, 23 | help='total number of training frames') 24 | parser.add_argument('--num_eval_episodes', type=int, default=400, 25 | help="number of test episodes per scene") 26 | parser.add_argument('--num_train_episodes', type=int, default=10000, 27 | help="""number of train episodes per scene 28 | before loading the next scene""") 29 | parser.add_argument('--no_cuda', action='store_true', default=False, 30 | help='disables CUDA training') 31 | parser.add_argument("--sim_gpu_id", type=int, default=0, 32 | help="gpu id on which scenes are loaded") 33 | parser.add_argument("--sem_gpu_id", type=int, default=-1, 34 | help="""gpu id 
for semantic model, 35 | -1: same as sim gpu, -2: cpu""") 36 | parser.add_argument("--blip_gpu_id", type=int, default=1, 37 | help="""gpu id for BLIP model, 38 | -1: same as sim gpu, -2: cpu""") 39 | 40 | # Logging, loading models, visualization 41 | parser.add_argument('--log_interval', type=int, default=10, 42 | help="""log interval, one log per n updates 43 | (default: 10) """) 44 | parser.add_argument('--save_interval', type=int, default=1, 45 | help="""save interval""") 46 | parser.add_argument('-d', '--dump_location', type=str, default="./tmp/", 47 | help='path to dump models and log (default: ./tmp/)') 48 | parser.add_argument('--exp_name', type=str, default="exp1", 49 | help='experiment name (default: exp1)') 50 | parser.add_argument('--save_periodic', type=int, default=500000, 51 | help='Model save frequency in number of updates') 52 | parser.add_argument('--load', type=str, default="0", 53 | help="""model path to load, 54 | 0 to not reload (default: 0)""") 55 | parser.add_argument('-v', '--visualize', type=int, default=0, 56 | help="""1: Render the observation and 57 | the predicted semantic map, 58 | 2: Render the observation with semantic 59 | predictions and the predicted semantic map 60 | (default: 0)""") 61 | parser.add_argument('--print_images', type=int, default=0, 62 | help='1: save visualization as images') 63 | 64 | # Environment, dataset and episode specifications 65 | parser.add_argument('-efw', '--env_frame_width', type=int, default=640, 66 | help='Frame width (default:640)') 67 | parser.add_argument('-efh', '--env_frame_height', type=int, default=480, 68 | help='Frame height (default:480)') 69 | parser.add_argument('-fw', '--frame_width', type=int, default=160, 70 | help='Frame width (default:160)') 71 | parser.add_argument('-fh', '--frame_height', type=int, default=120, 72 | help='Frame height (default:120)') 73 | parser.add_argument('-el', '--max_episode_length', type=int, default=532, 74 | help="""Maximum episode length""") 75 | parser.add_argument("--task_config", type=str, 76 | default="tasks/objectnav_hm3d.yaml", 77 | help="path to config yaml containing task information") 78 | parser.add_argument("--split", type=str, default="train", 79 | help="dataset split (train | val | val_mini) ") 80 | parser.add_argument('--camera_height', type=float, default=0.88, 81 | help="agent camera height in metres") 82 | parser.add_argument('--hfov', type=float, default=79.0, 83 | help="horizontal field of view in degrees") 84 | parser.add_argument('--turn_angle', type=float, default=30, 85 | help="Agent turn angle in degrees") 86 | parser.add_argument('--min_depth', type=float, default=0.5, 87 | help="Minimum depth for depth sensor in meters") 88 | parser.add_argument('--max_depth', type=float, default=5.0, 89 | help="Maximum depth for depth sensor in meters") 90 | parser.add_argument('--success_dist', type=float, default=1.5, 91 | help="success distance threshold in meters") 92 | parser.add_argument('--floor_thr', type=int, default=50, 93 | help="floor threshold in cm") 94 | parser.add_argument('--min_d', type=float, default=1.5, 95 | help="min distance to goal during training in meters") 96 | parser.add_argument('--max_d', type=float, default=100.0, 97 | help="max distance to goal during training in meters") 98 | parser.add_argument('--version', type=str, default="v1.1", 99 | help="dataset version") 100 | 101 | # Model Hyperparameters 102 | parser.add_argument('--agent', type=str, default="sem_exp") 103 | parser.add_argument('--lr', type=float, default=2.5e-5, 104 | 
help='learning rate (default: 2.5e-5)') 105 | parser.add_argument('--global_hidden_size', type=int, default=256, 106 | help='global_hidden_size') 107 | parser.add_argument('--eps', type=float, default=1e-5, 108 | help='RL Optimizer epsilon (default: 1e-5)') 109 | parser.add_argument('--alpha', type=float, default=0.99, 110 | help='RL Optimizer alpha (default: 0.99)') 111 | parser.add_argument('--gamma', type=float, default=0.99, 112 | help='discount factor for rewards (default: 0.99)') 113 | parser.add_argument('--use_gae', action='store_true', default=False, 114 | help='use generalized advantage estimation') 115 | parser.add_argument('--tau', type=float, default=0.95, 116 | help='gae parameter (default: 0.95)') 117 | parser.add_argument('--entropy_coef', type=float, default=0.001, 118 | help='entropy term coefficient (default: 0.01)') 119 | parser.add_argument('--value_loss_coef', type=float, default=0.5, 120 | help='value loss coefficient (default: 0.5)') 121 | parser.add_argument('--max_grad_norm', type=float, default=0.5, 122 | help='max norm of gradients (default: 0.5)') 123 | parser.add_argument('--num_global_steps', type=int, default=20, 124 | help='number of forward steps in A2C (default: 5)') 125 | parser.add_argument('--ppo_epoch', type=int, default=4, 126 | help='number of ppo epochs (default: 4)') 127 | parser.add_argument('--num_mini_batch', type=str, default="auto", 128 | help='number of batches for ppo (default: 32)') 129 | parser.add_argument('--clip_param', type=float, default=0.2, 130 | help='ppo clip parameter (default: 0.2)') 131 | parser.add_argument('--use_recurrent_global', type=int, default=0, 132 | help='use a recurrent global policy') 133 | parser.add_argument('--num_local_steps', type=int, default=25, 134 | help="""Number of steps the local policy 135 | between each global step""") 136 | parser.add_argument('--reward_coeff', type=float, default=0.1, 137 | help="Object goal reward coefficient") 138 | parser.add_argument('--intrinsic_rew_coeff', type=float, default=0.05, 139 | help="intrinsic exploration reward coefficient") 140 | parser.add_argument('--num_sem_categories', type=float, default=16) 141 | parser.add_argument('--sem_pred_prob_thr', type=float, default=0.9, 142 | help="Semantic prediction confidence threshold") 143 | 144 | # Mapping 145 | parser.add_argument('--global_downscaling', type=int, default=2) 146 | parser.add_argument('--vision_range', type=int, default=100) 147 | parser.add_argument('--map_resolution', type=int, default=5) 148 | parser.add_argument('--du_scale', type=int, default=1) 149 | parser.add_argument('--map_size_cm', type=int, default=4800) 150 | parser.add_argument('--cat_pred_threshold', type=float, default=5.0) 151 | parser.add_argument('--map_pred_threshold', type=float, default=1.0) 152 | parser.add_argument('--exp_pred_threshold', type=float, default=1.0) 153 | parser.add_argument('--collision_threshold', type=float, default=0.10) 154 | parser.add_argument('--collision_queue_size', type=int, default=3) 155 | parser.add_argument('--wtec', type=float, default=0.5) 156 | parser.add_argument('--num_attrs', type=float, default=5) 157 | 158 | parser.add_argument('--use_gtsem', type=int, default=0) 159 | 160 | # train_se_frontier 161 | parser.add_argument('--train_se_f', type=int, default=0) 162 | parser.add_argument('--load_se_edge', type=str, default="0", 163 | help="""model path to load, 164 | 0 to not reload (default: 0)""") 165 | 166 | 167 | # parse arguments 168 | args = parser.parse_args() 169 | 170 | args.cuda = not 
args.no_cuda and torch.cuda.is_available() 171 | 172 | if args.cuda: 173 | if args.auto_gpu_config: 174 | num_gpus = torch.cuda.device_count() 175 | if args.total_num_scenes != "auto": 176 | args.total_num_scenes = int(args.total_num_scenes) 177 | elif "objectnav_gibson" in args.task_config and \ 178 | "train" in args.split: 179 | args.total_num_scenes = 25 180 | elif "objectnav_gibson" in args.task_config and \ 181 | "val" in args.split: 182 | args.total_num_scenes = 5 183 | else: 184 | assert False, "Unknown task config, please specify" + \ 185 | " total_num_scenes" 186 | 187 | # GPU Memory required for the SemExp model: 188 | # 0.8 + 0.4 * args.total_num_scenes (GB) 189 | # GPU Memory required per thread: 2.6 (GB) 190 | min_memory_required = max(0.8 + 0.4 * args.total_num_scenes, 2.6) 191 | # Automatically configure number of training threads based on 192 | # number of GPUs available and GPU memory size 193 | gpu_memory = 1000 194 | for i in range(num_gpus): 195 | gpu_memory = min(gpu_memory, 196 | torch.cuda.get_device_properties( 197 | i).total_memory 198 | / 1024 / 1024 / 1024) 199 | assert gpu_memory > min_memory_required, \ 200 | """Insufficient GPU memory for GPU {}, gpu memory ({}GB) 201 | needs to be greater than {}GB""".format( 202 | i, gpu_memory, min_memory_required) 203 | 204 | num_processes_per_gpu = int(gpu_memory / 2.6) 205 | num_processes_on_first_gpu = \ 206 | int((gpu_memory - min_memory_required) / 2.6) 207 | 208 | if args.eval: 209 | max_threads = num_processes_per_gpu * (num_gpus - 1) \ 210 | + num_processes_on_first_gpu 211 | assert max_threads >= args.total_num_scenes, \ 212 | """Insufficient GPU memory for evaluation""" 213 | 214 | if num_gpus == 1: 215 | args.num_processes_on_first_gpu = num_processes_on_first_gpu 216 | args.num_processes_per_gpu = 0 217 | args.num_processes = num_processes_on_first_gpu 218 | assert args.num_processes > 0, "Insufficient GPU memory" 219 | else: 220 | num_threads = num_processes_per_gpu * (num_gpus - 1) \ 221 | + num_processes_on_first_gpu 222 | num_threads = min(num_threads, args.total_num_scenes) 223 | args.num_processes_per_gpu = num_processes_per_gpu 224 | args.num_processes_on_first_gpu = max( 225 | 0, 226 | num_threads - args.num_processes_per_gpu * (num_gpus - 1)) 227 | args.num_processes = num_threads 228 | 229 | args.sim_gpu_id = 1 230 | 231 | print("Auto GPU config:") 232 | print("Number of processes: {}".format(args.num_processes)) 233 | print("Number of processes on GPU 0: {}".format( 234 | args.num_processes_on_first_gpu)) 235 | print("Number of processes per GPU: {}".format( 236 | args.num_processes_per_gpu)) 237 | else: 238 | args.sem_gpu_id = -2 239 | 240 | if args.num_mini_batch == "auto": 241 | args.num_mini_batch = max(args.num_processes // 2, 1) 242 | else: 243 | args.num_mini_batch = int(args.num_mini_batch) 244 | 245 | return args 246 | -------------------------------------------------------------------------------- /model_multi.py: -------------------------------------------------------------------------------- 1 | import pdb 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.nn import functional as F 6 | import numpy as np 7 | 8 | # from utils.distributions import Categorical, DiagGaussian 9 | from torch.distributions.categorical import Categorical 10 | from utils.model import get_grid, ChannelPool, Flatten, NNBase 11 | import envs.utils.depth_utils_multi as du 12 | 13 | class Semantic_Mapping(nn.Module): 14 | 15 | """ 16 | Semantic_Mapping 17 | """ 18 | 19 | def __init__(self, args): 20 | 
super(Semantic_Mapping, self).__init__() 21 | 22 | self.device = args.device 23 | self.screen_h = args.frame_height 24 | self.screen_w = args.frame_width 25 | self.resolution = args.map_resolution 26 | self.z_resolution = args.map_resolution 27 | self.map_size_cm = args.map_size_cm // args.global_downscaling 28 | self.n_channels = 3 29 | self.vision_range = args.vision_range 30 | self.dropout = 0.5 31 | self.fov = args.hfov 32 | self.du_scale = args.du_scale 33 | self.cat_pred_threshold = args.cat_pred_threshold 34 | self.exp_pred_threshold = args.exp_pred_threshold 35 | self.map_pred_threshold = args.map_pred_threshold 36 | self.num_sem_categories = args.num_sem_categories 37 | self.num_atts = args.num_attrs 38 | 39 | self.max_height = int(200 / self.z_resolution) 40 | self.min_height = int(-40 / self.z_resolution) 41 | self.agent_height = args.camera_height * 100. 42 | self.shift_loc = [self.vision_range * 43 | self.resolution // 2, 0, np.pi / 2.0] 44 | self.camera_matrix = du.get_camera_matrix( 45 | self.screen_w, self.screen_h, self.fov) 46 | 47 | self.pool = ChannelPool(1) 48 | 49 | vr = self.vision_range 50 | # pdb.set_trace() 51 | self.init_grid = torch.zeros( 52 | args.num_processes, 1 + self.num_sem_categories+self.num_atts, vr, vr, 53 | self.max_height - self.min_height 54 | ).float().to(self.device) 55 | self.feat = torch.ones( 56 | args.num_processes, 1 + self.num_sem_categories+self.num_atts, 57 | self.screen_h // self.du_scale * self.screen_w // self.du_scale 58 | ).float().to(self.device) 59 | 60 | 61 | self.max_pool = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 62 | 63 | self.stair_mask_radius = 30 64 | self.stair_mask = self.get_mask(self.stair_mask_radius).to(self.device) 65 | 66 | def forward(self, obs, pose_obs, maps_last, poses_last, eve_angle,goal_found): 67 | bs, c, h, w = obs.size() 68 | depth = obs[:, 3, :, :] 69 | point_cloud_t = du.get_point_cloud_from_z_t( 70 | depth, self.camera_matrix, self.device, scale=self.du_scale) 71 | 72 | agent_view_t = du.transform_camera_view_t( 73 | point_cloud_t, self.agent_height, eve_angle, self.device) 74 | 75 | agent_view_centered_t = du.transform_pose_t( 76 | agent_view_t, self.shift_loc, self.device) 77 | 78 | max_h = self.max_height 79 | min_h = self.min_height 80 | xy_resolution = self.resolution 81 | z_resolution = self.z_resolution 82 | vision_range = self.vision_range 83 | XYZ_cm_std = agent_view_centered_t.float() 84 | XYZ_cm_std[..., :2] = (XYZ_cm_std[..., :2] / xy_resolution) 85 | XYZ_cm_std[..., :2] = (XYZ_cm_std[..., :2] - 86 | vision_range // 2.) / vision_range * 2. 87 | XYZ_cm_std[..., 2] = XYZ_cm_std[..., 2] / z_resolution 88 | XYZ_cm_std[..., 2] = (XYZ_cm_std[..., 2] - 89 | (max_h + min_h) // 2.) / (max_h - min_h) * 2. 
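# XYZ_cm_std now holds voxel coordinates normalised to [-1, 1]: x/y over the
# local vision_range grid and z over the slab between min_height and max_height,
# matching the layout of self.init_grid used for the splat below.
# Channel 0 of self.feat stays at 1 (occupancy count); the remaining channels are
# filled with the downsampled semantic / attribute observations.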
90 | self.feat[:, 1:, :] = nn.AvgPool2d(self.du_scale)( 91 | obs[:, 4:, :, :] 92 | ).view(bs, c - 4, h // self.du_scale * w // self.du_scale) 93 | 94 | XYZ_cm_std = XYZ_cm_std.permute(0, 3, 1, 2) 95 | XYZ_cm_std = XYZ_cm_std.view(XYZ_cm_std.shape[0], 96 | XYZ_cm_std.shape[1], 97 | XYZ_cm_std.shape[2] * XYZ_cm_std.shape[3]) 98 | 99 | # voxels = du.splat_feat_nd( 100 | # self.init_grid * 0., self.feat, XYZ_cm_std).transpose(2, 3) 101 | voxels = du.splat_att_feat_nd( 102 | self.init_grid * 0., self.feat, XYZ_cm_std,self.num_atts).transpose(2, 3) 103 | 104 | min_z = int(25 / z_resolution - min_h) 105 | max_z = int((self.agent_height + 50) / z_resolution - min_h) 106 | mid_z = int(self.agent_height / z_resolution - min_h) 107 | 108 | # print("========== IN SEM MAP ==========") 109 | # print(np.unique(voxels[:,[-1],...].max(4)[0].detach().cpu().numpy())) 110 | 111 | agent_height_proj = voxels[:,:-self.num_atts,..., min_z:max_z].sum(4) 112 | agent_height_proj_att = voxels[:,-self.num_atts:,..., min_z:max_z].max(4)[0] 113 | agent_height_proj=torch.cat([agent_height_proj,agent_height_proj_att],dim=1) 114 | agent_height_stair_proj = voxels[:,:-self.num_atts,..., mid_z-5:mid_z].sum(4) 115 | agent_height_stair_proj_att = voxels[:,-self.num_atts:,..., mid_z-5:mid_z].max(4)[0] 116 | agent_height_stair_proj=torch.cat([agent_height_stair_proj,agent_height_stair_proj_att],dim=1) 117 | all_height_proj = voxels[:,:-self.num_atts,...].sum(4) 118 | all_height_proj_att = voxels[:,-self.num_atts:,...].max(4)[0] 119 | all_height_proj=torch.cat([all_height_proj,all_height_proj_att],dim=1) 120 | 121 | # pdb.set_trace() 122 | fp_map_pred = agent_height_proj[:, 0:1, :, :] 123 | fp_exp_pred = all_height_proj[:, 0:1, :, :] 124 | fp_stair_pred = agent_height_stair_proj[:, 0:1, :, :] 125 | fp_map_pred = fp_map_pred / self.map_pred_threshold 126 | fp_stair_pred = fp_stair_pred / self.map_pred_threshold 127 | fp_exp_pred = fp_exp_pred / self.exp_pred_threshold 128 | fp_map_pred = torch.clamp(fp_map_pred, min=0.0, max=1.0) 129 | fp_stair_pred = torch.clamp(fp_stair_pred, min=0.0, max=1.0) 130 | fp_exp_pred = torch.clamp(fp_exp_pred, min=0.0, max=1.0) 131 | 132 | pose_pred = poses_last 133 | 134 | agent_view = torch.zeros(bs, c, 135 | self.map_size_cm // self.resolution, 136 | self.map_size_cm // self.resolution 137 | ).to(self.device) 138 | 139 | x1 = self.map_size_cm // (self.resolution * 2) - self.vision_range // 2 140 | x2 = x1 + self.vision_range 141 | y1 = self.map_size_cm // (self.resolution * 2) 142 | y2 = y1 + self.vision_range 143 | agent_view[:, 0:1, y1:y2, x1:x2] = fp_map_pred 144 | agent_view[:, 1:2, y1:y2, x1:x2] = fp_exp_pred 145 | agent_view[:, 4:-self.num_atts, y1:y2, x1:x2] = torch.clamp( 146 | agent_height_proj[:, 1:-self.num_atts, :, :] / self.cat_pred_threshold, 147 | min=0.0, max=1.0) 148 | agent_view[:, -self.num_atts:, y1:y2, x1:x2] =agent_height_proj[:, -self.num_atts:, :, :] 149 | 150 | # print("SEM MAP: MAX ",agent_height_proj[:, -1, :, :].max(),", MIN: ",agent_height_proj[:, -1, :, :].min()) 151 | # print("SEM MAP/HEIGHT: MAX ",agent_view[:, -1, ...].max(),", MIN: ",agent_view[:, -1, ...].min()) 152 | 153 | agent_view_stair = agent_view.clone().detach() 154 | agent_view_stair[:, 0:1, y1:y2, x1:x2] = fp_stair_pred 155 | 156 | corrected_pose = pose_obs 157 | 158 | def get_new_pose_batch(pose, rel_pose_change): 159 | 160 | pose[:, 1] += rel_pose_change[:, 0] * \ 161 | torch.sin(pose[:, 2] / 57.29577951308232) \ 162 | + rel_pose_change[:, 1] * \ 163 | torch.cos(pose[:, 2] / 57.29577951308232) 164 | 
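# 57.29577951308232 == 180 / pi. Poses are stored in degrees while the relative
# pose change is in radians, so headings are converted to radians for sin/cos
# here and the angular change is converted back to degrees below.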
pose[:, 0] += rel_pose_change[:, 0] * \ 165 | torch.cos(pose[:, 2] / 57.29577951308232) \ 166 | - rel_pose_change[:, 1] * \ 167 | torch.sin(pose[:, 2] / 57.29577951308232) 168 | pose[:, 2] += rel_pose_change[:, 2] * 57.29577951308232 169 | 170 | pose[:, 2] = torch.fmod(pose[:, 2] - 180.0, 360.0) + 180.0 171 | pose[:, 2] = torch.fmod(pose[:, 2] + 180.0, 360.0) - 180.0 172 | 173 | return pose 174 | 175 | current_poses = get_new_pose_batch(poses_last, corrected_pose) 176 | st_pose = current_poses.clone().detach() 177 | 178 | st_pose[:, :2] = - (st_pose[:, :2] 179 | * 100.0 / self.resolution 180 | - self.map_size_cm // (self.resolution * 2)) /\ 181 | (self.map_size_cm // (self.resolution * 2)) 182 | st_pose[:, 2] = 90. - (st_pose[:, 2]) 183 | 184 | rot_mat, trans_mat = get_grid(st_pose, agent_view.size(), 185 | self.device) 186 | 187 | rotated = F.grid_sample(agent_view, rot_mat, align_corners=True) 188 | translated = F.grid_sample(rotated, trans_mat, align_corners=True) 189 | 190 | # translated[:, 18:19, :, :] = -self.max_pool(-translated[:, 18:19, :, :]) 191 | 192 | diff_ob_ex = translated[:, 1:2, :, :] - self.max_pool(translated[:, 0:1, :, :]) 193 | 194 | diff_ob_ex[diff_ob_ex>0.8] = 1.0 195 | diff_ob_ex[diff_ob_ex!=1.0] = 0.0 196 | 197 | # pdb.set_trace() 198 | if not goal_found: 199 | # TODO: better way to get the score 200 | # maps2 = torch.cat((maps_last.unsqueeze(1), translated.unsqueeze(1)), 1) 201 | _score_maps_last=maps_last[:,-self.num_atts:,...] 202 | _score_maps_new=translated[:, -self.num_atts:, ...] 203 | updated_mask=_score_maps_new>0 204 | _s_map=torch.cat((_score_maps_last.unsqueeze(1), _score_maps_new.unsqueeze(1)), 1) 205 | _s_map,_=torch.max(_s_map, 1) 206 | map_score_new = torch.where(updated_mask, _s_map / 2, _score_maps_last) 207 | maps2 = torch.cat((maps_last[:, :-self.num_atts, ...].unsqueeze(1), translated[:, :-self.num_atts, ...].unsqueeze(1)), 1) 208 | map_pred, _ = torch.max(maps2, 1) 209 | map_pred=torch.cat([map_pred,map_score_new],1) 210 | # print("map_score_new: MAX ",map_score_new.max(),", MIN: ",map_score_new.min()) 211 | else: 212 | maps2 = torch.cat((maps_last.unsqueeze(1), translated.unsqueeze(1)), 1) 213 | map_pred, _ = torch.max(maps2, 1) 214 | 215 | 216 | # map_pred=torch.cat((_map_pred, map_score_new), dim=1) 217 | 218 | for i in range(eve_angle.shape[0]): 219 | if eve_angle[i] == 0: 220 | map_pred[i, 0:1, :, :][diff_ob_ex[i] == 1.0] = 0.0 221 | 222 | # stairs view 223 | rot_mat_stair, trans_mat_stair = get_grid(st_pose, agent_view_stair.size(), 224 | self.device) 225 | 226 | rotated_stair = F.grid_sample(agent_view_stair, rot_mat_stair, align_corners=True) 227 | translated_stair = F.grid_sample(rotated_stair, trans_mat_stair, align_corners=True) 228 | 229 | stair_mask = torch.zeros(self.map_size_cm // self.resolution, self.map_size_cm // self.resolution).to(self.device) 230 | 231 | s_y = int(current_poses[0][1]*100/5) 232 | s_x = int(current_poses[0][0]*100/5) 233 | limit_up = self.map_size_cm // self.resolution - self.stair_mask_radius - 1 234 | limit_be = self.stair_mask_radius 235 | if s_y > limit_up: 236 | s_y = limit_up 237 | if s_y < self.stair_mask_radius: 238 | s_y = self.stair_mask_radius 239 | if s_x > limit_up: 240 | s_x = limit_up 241 | if s_x < self.stair_mask_radius: 242 | s_x = self.stair_mask_radius 243 | stair_mask[int(s_y-self.stair_mask_radius):int(s_y+self.stair_mask_radius), int(s_x-self.stair_mask_radius):int(s_x+self.stair_mask_radius)] = self.stair_mask 244 | 245 | translated_stair[0, 0:1, :, :] *= stair_mask 246 | 
translated_stair[0, 1:2, :, :] *= stair_mask 247 | 248 | # translated_stair[:, 13:14, :, :] = -self.max_pool(-translated_stair[:, 13:14, :, :]) 249 | 250 | diff_ob_ex = translated_stair[:, 1:2, :, :] - translated_stair[:, 0:1, :, :] 251 | 252 | diff_ob_ex[diff_ob_ex>0.8] = 1.0 253 | diff_ob_ex[diff_ob_ex!=1.0] = 0.0 254 | 255 | maps3 = torch.cat((maps_last.unsqueeze(1), translated_stair.unsqueeze(1)), 1) 256 | 257 | map_pred_stair, _ = torch.max(maps3, 1) 258 | 259 | for i in range(eve_angle.shape[0]): 260 | if eve_angle[i] == 0: 261 | map_pred_stair[i, 0:1, :, :][diff_ob_ex[i] == 1.0] = 0.0 262 | 263 | 264 | # print("map_pred_-1: MAX ",map_pred[:,-1,...].max(),", MIN: ",map_pred[:,-1,...].min()) 265 | return translated, map_pred, map_pred_stair, current_poses 266 | 267 | 268 | def get_mask(self, step_size): 269 | size = int(step_size) * 2 270 | mask = torch.zeros(size, size) 271 | for i in range(size): 272 | for j in range(size): 273 | if ((i + 0.5) - (size // 2)) ** 2 + \ 274 | ((j + 0.5) - (size // 2)) ** 2 <= \ 275 | step_size ** 2: 276 | mask[i, j] = 1 277 | return mask 278 | 279 | 280 | class FeedforwardNet(nn.Module): 281 | 282 | def __init__(self, input_dim, output_dim): 283 | super(FeedforwardNet, self).__init__() 284 | """ self.layers = nn.Sequential( 285 | nn.Linear(input_dim, 512), 286 | nn.BatchNorm1d(512), 287 | nn.ReLU(inplace=True), 288 | nn.Linear(512, 512), 289 | nn.BatchNorm1d(512), 290 | nn.ReLU(inplace=True), 291 | nn.Linear(512, 256), 292 | nn.BatchNorm1d(256), 293 | nn.ReLU(inplace=True), 294 | nn.Linear(256, output_dim), 295 | ) """ 296 | self.layers = nn.Sequential( 297 | nn.Linear(input_dim, 512), 298 | nn.BatchNorm1d(512), 299 | nn.ReLU(inplace=True), 300 | nn.Linear(512, 256), 301 | nn.BatchNorm1d(256), 302 | nn.ReLU(inplace=True), 303 | nn.Linear(256, output_dim), 304 | ) 305 | 306 | def forward(self, x): 307 | return self.layers(x) -------------------------------------------------------------------------------- /RedNet/RedNet_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | import math 5 | import torch.utils.model_zoo as model_zoo 6 | import utils 7 | from torch.utils.checkpoint import checkpoint 8 | import os 9 | import torchvision 10 | 11 | 12 | class RedNet(nn.Module): 13 | def __init__(self, num_classes=40, pretrained=False): 14 | 15 | super().__init__() 16 | 17 | block = Bottleneck 18 | transblock = TransBasicBlock 19 | layers = [3, 4, 6, 3] 20 | # original resnet 21 | self.inplanes = 64 22 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, 23 | bias=False) 24 | self.bn1 = nn.BatchNorm2d(64) 25 | self.relu = nn.ReLU(inplace=True) 26 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 27 | self.layer1 = self._make_layer(block, 64, layers[0]) 28 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 29 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 30 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 31 | 32 | # resnet for depth channel 33 | self.inplanes = 64 34 | self.conv1_d = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, 35 | bias=False) 36 | self.bn1_d = nn.BatchNorm2d(64) 37 | self.layer1_d = self._make_layer(block, 64, layers[0]) 38 | self.layer2_d = self._make_layer(block, 128, layers[1], stride=2) 39 | self.layer3_d = self._make_layer(block, 256, layers[2], stride=2) 40 | self.layer4_d = self._make_layer(block, 512, layers[3], stride=2) 41 | 42 
| self.inplanes = 512 43 | self.deconv1 = self._make_transpose(transblock, 256, 6, stride=2) 44 | self.deconv2 = self._make_transpose(transblock, 128, 4, stride=2) 45 | self.deconv3 = self._make_transpose(transblock, 64, 3, stride=2) 46 | self.deconv4 = self._make_transpose(transblock, 64, 3, stride=2) 47 | 48 | self.agant0 = self._make_agant_layer(64, 64) 49 | self.agant1 = self._make_agant_layer(64 * 4, 64) 50 | self.agant2 = self._make_agant_layer(128 * 4, 128) 51 | self.agant3 = self._make_agant_layer(256 * 4, 256) 52 | self.agant4 = self._make_agant_layer(512 * 4, 512) 53 | 54 | # final block 55 | self.inplanes = 64 56 | self.final_conv = self._make_transpose(transblock, 64, 3) 57 | 58 | self.final_deconv_custom = nn.ConvTranspose2d(self.inplanes, num_classes, kernel_size=2, 59 | stride=2, padding=0, bias=True) 60 | 61 | self.out5_conv_custom = nn.Conv2d(256, num_classes, kernel_size=1, stride=1, bias=True) 62 | self.out4_conv_custom = nn.Conv2d(128, num_classes, kernel_size=1, stride=1, bias=True) 63 | self.out3_conv_custom = nn.Conv2d(64, num_classes, kernel_size=1, stride=1, bias=True) 64 | self.out2_conv_custom = nn.Conv2d(64, num_classes, kernel_size=1, stride=1, bias=True) 65 | 66 | if pretrained: 67 | self._load_resnet_pretrained() 68 | 69 | def weights_init(self, m): 70 | classname = m.__class__.__name__ 71 | if classname.find('Conv') != -1: 72 | nn.init.kaiming_normal_(m.weight) 73 | if m.bias is not None: 74 | nn.init.zeros_(m.bias) 75 | elif isinstance(m, nn.BatchNorm2d): 76 | m.weight.data.fill_(1) 77 | m.bias.data.zero_() 78 | 79 | 80 | 81 | def _make_layer(self, block, planes, blocks, stride=1): 82 | downsample = None 83 | if stride != 1 or self.inplanes != planes * block.expansion: 84 | downsample = nn.Sequential( 85 | nn.Conv2d(self.inplanes, planes * block.expansion, 86 | kernel_size=1, stride=stride, bias=False), 87 | nn.BatchNorm2d(planes * block.expansion), 88 | ) 89 | 90 | layers = [] 91 | 92 | layers.append(block(self.inplanes, planes, stride, downsample)) 93 | self.inplanes = planes * block.expansion 94 | for i in range(1, blocks): 95 | layers.append(block(self.inplanes, planes)) 96 | 97 | return nn.Sequential(*layers) 98 | 99 | def _make_transpose(self, block, planes, blocks, stride=1): 100 | 101 | upsample = None 102 | if stride != 1: 103 | upsample = nn.Sequential( 104 | nn.ConvTranspose2d(self.inplanes, planes, 105 | kernel_size=2, stride=stride, 106 | padding=0, bias=False), 107 | nn.BatchNorm2d(planes), 108 | ) 109 | elif self.inplanes != planes: 110 | upsample = nn.Sequential( 111 | nn.Conv2d(self.inplanes, planes, 112 | kernel_size=1, stride=stride, bias=False), 113 | nn.BatchNorm2d(planes), 114 | ) 115 | 116 | layers = [] 117 | 118 | for i in range(1, blocks): 119 | layers.append(block(self.inplanes, self.inplanes)) 120 | 121 | layers.append(block(self.inplanes, planes, stride, upsample)) 122 | self.inplanes = planes 123 | 124 | return nn.Sequential(*layers) 125 | 126 | def _make_agant_layer(self, inplanes, planes): 127 | 128 | layers = nn.Sequential( 129 | nn.Conv2d(inplanes, planes, kernel_size=1, 130 | stride=1, padding=0, bias=False), 131 | nn.BatchNorm2d(planes), 132 | nn.ReLU(inplace=True) 133 | ) 134 | return layers 135 | 136 | def _load_resnet_pretrained(self): 137 | pretrain_dict = torch.load(ckpt, map_location='cpu')['model_state'] 138 | prefix = 'module.' 
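# Descriptive note: checkpoints saved from an nn.DataParallel model store every
# parameter key with a leading 'module.' prefix; the comprehension below strips
# that prefix so the keys line up with this unwrapped module's state_dict.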
139 | pretrain_dict = { 140 | (k[len(prefix):] if k[:len(prefix)] == prefix else k): v for k, v in pretrain_dict.items() 141 | } 142 | pretrain_dict.pop('final_deconv_custom.weight') 143 | pretrain_dict.pop('final_deconv_custom.bias') 144 | 145 | pretrain_dict.pop('out5_conv_custom.weight') 146 | pretrain_dict.pop('out5_conv_custom.bias') 147 | 148 | pretrain_dict.pop('out4_conv_custom.weight') 149 | pretrain_dict.pop('out4_conv_custom.bias') 150 | 151 | pretrain_dict.pop('out3_conv_custom.weight') 152 | pretrain_dict.pop('out3_conv_custom.bias') 153 | 154 | pretrain_dict.pop('out2_conv_custom.weight') 155 | pretrain_dict.pop('out2_conv_custom.bias') 156 | 157 | model_dict = {} 158 | state_dict = self.state_dict() 159 | 160 | state_dict.update(model_dict) 161 | self.load_state_dict(state_dict) 162 | 163 | def forward_downsample(self, rgb, depth): 164 | x = self.conv1(rgb) 165 | x = self.bn1(x) 166 | x = self.relu(x) 167 | depth = self.conv1_d(depth) 168 | depth = self.bn1_d(depth) 169 | depth = self.relu(depth) 170 | 171 | fuse0 = x + depth 172 | 173 | x = self.maxpool(fuse0) 174 | depth = self.maxpool(depth) 175 | 176 | # block 1 177 | x = self.layer1(x) 178 | # debug_tensor('post1', depth) 179 | # for i, mod in enumerate(self.layer1_d): 180 | # depth = mod(depth) 181 | # debug_tensor(f'post1d-{i}', depth) 182 | 183 | depth = self.layer1_d(depth) 184 | # debug_tensor('post1d', depth) 185 | 186 | fuse1 = x + depth 187 | # block 2 188 | x = self.layer2(fuse1) 189 | depth = self.layer2_d(depth) 190 | fuse2 = x + depth 191 | # block 3 192 | x = self.layer3(fuse2) 193 | depth = self.layer3_d(depth) 194 | fuse3 = x + depth 195 | # block 4 196 | x = self.layer4(fuse3) 197 | depth = self.layer4_d(depth) 198 | fuse4 = x + depth 199 | 200 | agant4 = self.agant4(fuse4) 201 | 202 | return fuse0, fuse1, fuse2, fuse3, agant4 203 | 204 | def forward_upsample(self, fuse0, fuse1, fuse2, fuse3, agant4, train=False): 205 | 206 | # upsample 1 207 | x = self.deconv1(agant4) 208 | if train: 209 | out5 = self.out5_conv_custom(x) 210 | x = x + self.agant3(fuse3) 211 | # upsample 2 212 | x = self.deconv2(x) 213 | if train: 214 | out4 = self.out4_conv_custom(x) 215 | x = x + self.agant2(fuse2) 216 | # upsample 3 217 | x = self.deconv3(x) 218 | if train: 219 | out3 = self.out3_conv_custom(x) 220 | x = x + self.agant1(fuse1) 221 | # upsample 4 222 | x = self.deconv4(x) 223 | if train: 224 | out2 = self.out2_conv_custom(x) 225 | x = x + self.agant0(fuse0) 226 | # final 227 | x = self.final_conv(x) 228 | 229 | last_layer = x 230 | 231 | out = self.final_deconv_custom(x) 232 | 233 | if train: 234 | return out, out2, out3, out4, out5 235 | 236 | return out 237 | 238 | def forward(self, rgb, depth, train=False): 239 | 240 | fuses = self.forward_downsample(rgb, depth) 241 | 242 | # We only need predictions. 
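# A minimal usage sketch of this forward pass (the batch size and 480x640
# input shape below are illustrative only, not taken from a config file):
#   model = RedNet(num_classes=40)
#   rgb, depth = torch.randn(2, 3, 480, 640), torch.randn(2, 1, 480, 640)
#   scores = model(rgb, depth)             # (2, 40, 480, 640) per-pixel logits
#   outs = model(rgb, depth, train=True)   # main output plus four auxiliary heads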
243 | # features_encoder = fuses[-1] 244 | # scores, features_lastlayer = self.forward_upsample(*fuses) 245 | scores = self.forward_upsample(*fuses, train) 246 | # debug_tensor('scores', scores) 247 | # return features_encoder, features_lastlayer, scores 248 | # print("scores: ", scores.shape) 249 | return scores 250 | 251 | 252 | def conv3x3(in_planes, out_planes, stride=1): 253 | "3x3 convolution with padding" 254 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 255 | padding=1, bias=False) 256 | 257 | 258 | class Bottleneck(nn.Module): 259 | expansion = 4 260 | 261 | def __init__(self, inplanes, planes, stride=1, downsample=None): 262 | super(Bottleneck, self).__init__() 263 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) 264 | self.bn1 = nn.BatchNorm2d(planes) 265 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, 266 | padding=1, bias=False) 267 | self.bn2 = nn.BatchNorm2d(planes) 268 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 269 | self.bn3 = nn.BatchNorm2d(planes * 4) 270 | self.relu = nn.ReLU(inplace=True) 271 | self.downsample = downsample 272 | self.stride = stride 273 | 274 | def forward(self, x): 275 | residual = x 276 | 277 | out = self.conv1(x) 278 | out = self.bn1(out) 279 | out = self.relu(out) 280 | 281 | out = self.conv2(out) 282 | out = self.bn2(out) 283 | out = self.relu(out) 284 | 285 | out = self.conv3(out) 286 | out = self.bn3(out) 287 | 288 | if self.downsample is not None: 289 | residual = self.downsample(x) 290 | 291 | out += residual 292 | out = self.relu(out) 293 | 294 | return out 295 | 296 | class TransBasicBlock(nn.Module): 297 | expansion = 1 298 | 299 | def __init__(self, inplanes, planes, stride=1, upsample=None, **kwargs): 300 | super(TransBasicBlock, self).__init__() 301 | self.conv1 = conv3x3(inplanes, inplanes) 302 | self.bn1 = nn.BatchNorm2d(inplanes) 303 | self.relu = nn.ReLU(inplace=True) 304 | if upsample is not None and stride != 1: 305 | self.conv2 = nn.ConvTranspose2d(inplanes, planes, 306 | kernel_size=3, stride=stride, padding=1, 307 | output_padding=1, bias=False) 308 | else: 309 | self.conv2 = conv3x3(inplanes, planes, stride) 310 | self.bn2 = nn.BatchNorm2d(planes) 311 | self.upsample = upsample 312 | self.stride = stride 313 | 314 | def forward(self, x): 315 | residual = x 316 | 317 | out = self.conv1(x) 318 | out = self.bn1(out) 319 | out = self.relu(out) 320 | 321 | out = self.conv2(out) 322 | out = self.bn2(out) 323 | 324 | if self.upsample is not None: 325 | residual = self.upsample(x) 326 | 327 | out += residual 328 | out = self.relu(out) 329 | 330 | return out 331 | 332 | class BatchNormalize(nn.Module): 333 | r""" 334 | I can't believe this isn't supported 335 | https://github.com/pytorch/vision/issues/157 336 | """ 337 | 338 | def __init__(self, mean, std, device): 339 | super().__init__() 340 | self.mean = torch.tensor(mean, device=device)[None, :, None, None] 341 | self.std = torch.tensor(std, device=device)[None, :, None, None] 342 | 343 | def forward(self, x): 344 | # mean = torch.tensor(self.mean, device=x.device) 345 | # std = torch.tensor(self.std, device=x.device) 346 | # mean = self.mean[None, :, None, None] 347 | # std = self.std[None, :, None, None] 348 | # print(x.shape) 349 | # print(self.mean.shape) 350 | return (x - self.mean) / self.std 351 | # x.sub_(self.mean).div_(self.std) 352 | # return x 353 | 354 | class RedNetResizeWrapper(nn.Module): 355 | def __init__(self, device, resize=True, stabilize=False): 356 | 
super().__init__() 357 | self.rednet = RedNet() 358 | self.rednet.eval() 359 | self.semmap_rgb_norm = BatchNormalize( 360 | mean=[0.493, 0.468, 0.438], 361 | std=[0.544, 0.521, 0.499], 362 | device=device 363 | ) 364 | self.semmap_depth_norm = BatchNormalize( 365 | mean=[0.213], 366 | std=[0.285], 367 | device=device 368 | ) 369 | self.pretrained_size = (480, 640) 370 | self.resize = resize 371 | self.stabilize = stabilize 372 | 373 | def forward(self, rgb, depth, train=False): 374 | r""" 375 | Args: 376 | Raw sensor inputs. 377 | rgb: B x H=256 x W=256 x 3 378 | depth: B x H x W x 1 379 | Returns: 380 | semantic: drop-in replacement for default semantic sensor. B x H x W (no channel, for some reason) 381 | """ 382 | # Not quite sure what depth is produced here. Let's just check 383 | if self.resize: 384 | _, og_h, og_w, _ = rgb.size() # b h w c 385 | rgb = rgb.permute(0, 3, 1, 2) 386 | depth = depth.permute(0, 3, 1, 2) 387 | 388 | rgb = rgb.float() / 255 389 | if self.resize: 390 | rgb = F.interpolate(rgb, self.pretrained_size, mode='bilinear') 391 | depth = F.interpolate(depth, self.pretrained_size, mode='nearest') 392 | # print("rgb: ", rgb.shape) 393 | rgb = self.semmap_rgb_norm(rgb) 394 | 395 | # rgb = torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], 396 | # std=[0.229, 0.224, 0.225])(rgb) 397 | # depth = torchvision.transforms.Normalize(mean=[19050], 398 | # std=[9650])(depth) 399 | 400 | depth_clip = (depth < 1.0).squeeze(1) 401 | # depth_clip = ((depth < 1.0) & (depth > 0.0)).squeeze(1) 402 | # print("depth: ", depth.shape) 403 | depth = self.semmap_depth_norm(depth) 404 | with torch.no_grad(): 405 | scores = self.rednet(rgb, depth, train) 406 | # print("scores: ", scores[0][5][100]) 407 | 408 | if train: 409 | return scores 410 | else: 411 | with torch.no_grad(): 412 | scores = nn.Softmax(dim=1)(scores) 413 | # print("score: ", scores.shape) #torch.Size([1, 40, 480, 640]) 414 | # print("pred: ", pred.shape) # torch.Size([1, 40, 480, 640]) 415 | pred = (torch.max(scores, 1)[1] + 1).float() # B x 480 x 640 416 | 417 | # print("torch: ", pred[0][100]) 418 | pred[torch.max(scores, 1)[0] < 0.8] = 1 419 | 420 | if self.stabilize: # downsample tiny 421 | # Mask out out of depth samples 422 | pred[~depth_clip] = -1 # 41 is UNK in MP3D, but hab-sim renders with -1 423 | # pred = F.interpolate(pred.unsqueeze(1), (15, 20), mode='nearest').squeeze(1) 424 | if self.resize: 425 | pred = F.interpolate(pred.unsqueeze(1), (og_h, og_w), mode='nearest') 426 | 427 | return pred.squeeze(1) 428 | 429 | # def load_rednet(device, ckpt="", resize=True, stabilize=False): 430 | # if not os.path.isfile(ckpt): 431 | # raise Exception(f"invalid path {ckpt} provided for rednet weights") 432 | 433 | # model = RedNetResizeWrapper(device, resize=resize, stabilize=stabilize).to(device) 434 | 435 | # print("=> loading RedNet checkpoint '{}'".format(ckpt)) 436 | # if device.type == 'cuda': 437 | # checkpoint = torch.load(ckpt, map_location='cpu') 438 | # else: 439 | # checkpoint = torch.load(ckpt, map_location=lambda storage, loc: storage) 440 | 441 | # state_dict = checkpoint['model_state'] 442 | # prefix = 'module.' 
443 | # state_dict = { 444 | # (k[len(prefix):] if k[:len(prefix)] == prefix else k): v for k, v in state_dict.items() 445 | # } 446 | # state_dict.pop('final_deconv_custom.weight') 447 | # state_dict.pop('final_deconv_custom.bias') 448 | 449 | # state_dict.pop('out5_conv_custom.weight') 450 | # state_dict.pop('out5_conv_custom.bias') 451 | 452 | # state_dict.pop('out4_conv_custom.weight') 453 | # state_dict.pop('out4_conv_custom.bias') 454 | 455 | # state_dict.pop('out3_conv_custom.weight') 456 | # state_dict.pop('out3_conv_custom.bias') 457 | 458 | # state_dict.pop('out2_conv_custom.weight') 459 | # state_dict.pop('out2_conv_custom.bias') 460 | 461 | # model.rednet.state_dict().update(state_dict) 462 | # # model.rednet.load_state_dict(state_dict) 463 | # print("=> loaded checkpoint '{}' (epoch {})" 464 | # .format(ckpt, checkpoint['epoch'])) 465 | # # print(model) 466 | # print("Model's state_dict:") 467 | # # for param_tensor in model.state_dict(): 468 | # # print(param_tensor, "\t", model.state_dict()[param_tensor].size()) 469 | 470 | # return model 471 | 472 | 473 | def load_rednet(device, ckpt="", resize=True, stabilize=False): 474 | if not os.path.isfile(ckpt): 475 | raise Exception(f"invalid path {ckpt} provided for rednet weights") 476 | 477 | model = RedNetResizeWrapper(device, resize=resize, stabilize=stabilize).to(device) 478 | 479 | print("=> loading RedNet checkpoint '{}'".format(ckpt)) 480 | if device.type == 'cuda': 481 | checkpoint = torch.load(ckpt, map_location='cpu') 482 | else: 483 | checkpoint = torch.load(ckpt, map_location=lambda storage, loc: storage) 484 | 485 | # state_dict = checkpoint['state_dict'] 486 | state_dict = checkpoint['model_state'] 487 | prefix = 'module.' 488 | state_dict = { 489 | (k[len(prefix):] if k[:len(prefix)] == prefix else k): v for k, v in state_dict.items() 490 | } 491 | # model.load_state_dict(state_dict) 492 | model.rednet.load_state_dict(state_dict) 493 | print("=> loaded checkpoint '{}' (epoch {})" 494 | .format(ckpt, checkpoint['epoch'])) 495 | # print(model) 496 | print("Model's state_dict:") 497 | # for param_tensor in model.state_dict(): 498 | # print(param_tensor, "\t", model.state_dict()[param_tensor].size()) 499 | 500 | return model 501 | 502 | def save_ckpt(ckpt_dir, model, optimizer, global_step, epoch, local_count, num_train): 503 | # usually this happens only on the start of a epoch 504 | epoch_float = epoch + (local_count / num_train) 505 | epoch_float = epoch + (local_count / num_train) 506 | state = { 507 | 'global_step': global_step, 508 | 'epoch': epoch_float, 509 | 'state_dict': model.state_dict(), 510 | 'optimizer': optimizer.state_dict(), 511 | } 512 | ckpt_model_filename = "ckpt_epoch_{:0.2f}.pth".format(epoch_float) 513 | path = os.path.join(ckpt_dir, ckpt_model_filename) 514 | torch.save(state, path) 515 | print('{:>2} has been successfully saved'.format(path)) -------------------------------------------------------------------------------- /get_point_score.py: -------------------------------------------------------------------------------- 1 | import sys,pdb,os,time 2 | import torch 3 | from PIL import Image 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | from torchvision.transforms.functional import to_pil_image 7 | from lavis.models import model_zoo 8 | from lavis.models import load_model_and_preprocess 9 | from lavis.common.gradcam import getAttMap 10 | from lavis.models.blip_models.blip_image_text_matching import compute_gradcam 11 | import cv2 12 | import scipy.ndimage 13 | import gzip 14 | 
import json 15 | 16 | def crop_img(image, vis_processors,scales=[1,2,4]): 17 | """ 18 | Crops the image into multiple patches at different scales and visualizes the patches. 19 | 20 | Parameters: 21 | - image_path: The path to the image to be processed. 22 | - scales: A list of scales at which to crop the image. 23 | 24 | Returns: 25 | - A list of PIL Image objects representing the cropped patches. 26 | """ 27 | 28 | # Initialize a list to hold the cropped images 29 | cropped_images = [] 30 | cropped_batch=[] 31 | # Define the crop function using numpy slicing 32 | 33 | def crop_numpy(img, num_x, num_y): 34 | # Convert to numpy array for slicing 35 | np_img = np.array(img) 36 | h, w, _ = np_img.shape 37 | patches=list(Image.fromarray(np_img[i * h // num_y:(i + 1) * h // num_y, 38 | j * w // num_x:(j + 1) * w // num_x, :]) 39 | for i in range(num_y) for j in range(num_x)) 40 | patches_batch = list(vis_processors(Image.fromarray(np_img[i * h // num_y:(i + 1) * h // num_y, 41 | j * w // num_x:(j + 1) * w // num_x, :])).to(device) 42 | for i in range(num_y) for j in range(num_x)) 43 | 44 | return patches, patches_batch 45 | 46 | for scale in scales: 47 | # Calculate number of patches in each axis 48 | num_x, num_y = scale, scale 49 | # Crop using numpy and store in dictionary 50 | patch,batch=crop_numpy(image, num_x, num_y) 51 | cropped_images.extend(patch) 52 | cropped_batch.extend(batch) 53 | 54 | return cropped_images,torch.stack(cropped_batch) 55 | 56 | def crop_img_padding(image, vis_processors,scales=[0.5, 0.25]): 57 | w, h = image.size 58 | patches = [image] 59 | for scale in scales: 60 | # Determine the size of the patches 61 | new_w, new_h = int(w * scale), int(h * scale) 62 | 63 | # Calculate the number of patches to create 64 | patches_across_width = w // new_w 65 | patches_across_height = h // new_h 66 | 67 | for i in range(patches_across_width): 68 | for j in range(patches_across_height): 69 | # Calculate the left, upper, right, and lower pixel coordinates for cropping 70 | left = i * new_w 71 | upper = j * new_h 72 | right = left + new_w 73 | lower = upper + new_h 74 | 75 | # Crop the patch and create a masked image of the original size 76 | patch = image.crop((left, upper, right, lower)) 77 | mask = Image.new("RGB", (w, h), (0, 0, 0)) 78 | mask.paste(patch, (left, upper)) 79 | patches.append(mask) 80 | return patches #from top to down, from large patch to small patch 81 | 82 | def vis_patches(cropped_images,pth): 83 | # Visualize the cropped images 84 | fig, axes = plt.subplots(len(cropped_images), 1, figsize=(10, len(cropped_images) * 5)) 85 | if len(cropped_images) == 1: 86 | axes = [axes] 87 | for ax, img in zip(axes, cropped_images): 88 | ax.imshow(img) 89 | ax.axis('off') 90 | plt.tight_layout() 91 | plt.savefig(pth) 92 | 93 | def save_images_with_scores(batch, itc_score, attributes,directory): 94 | for attr in attributes: 95 | for i, tensor in enumerate(batch): 96 | # Convert the tensor to a PIL Image 97 | mean = torch.tensor([0.48145466, 0.4578275, 0.40821073])[:,None,None].cuda() 98 | std = torch.tensor([0.26862954, 0.26130258, 0.27577711])[:,None,None].cuda() 99 | # pdb.set_trace() 100 | tensor=tensor*std+mean 101 | image = to_pil_image(tensor) 102 | score = itc_score[attr][i].item() 103 | # Save the image with the score in the filename 104 | filename = f"image_{i}_attri_{attr}_score_{score:.4f}.png" 105 | image.save(os.path.join(directory, filename)) 106 | 107 | def depth_to_colormap(depth_array, colormap=cv2.COLORMAP_JET): 108 | # Normalize the depth array to the 
range [0, 255] 109 | normalized_depth = cv2.normalize(depth_array, None, 0, 255, cv2.NORM_MINMAX) 110 | # Convert to unsigned 8-bit type 111 | normalized_depth = normalized_depth.astype(np.uint8) 112 | # Apply colormap 113 | colored_depth = cv2.applyColorMap(normalized_depth, colormap) 114 | colored_depth_pil = Image.fromarray(cv2.cvtColor(colored_depth, cv2.COLOR_BGR2RGB)) 115 | return colored_depth_pil 116 | 117 | def get_pyramid_score(image, vis_processors, txt_processors,score_processor,attributes,blip_devices=None,scales=[1, 2, 4]): 118 | """ 119 | Crops the image into multiple patches at different scales, computes scores for each patch, 120 | normalizes these scores, and maps them back to the original image pixels. 121 | 122 | Parameters: 123 | - image: The PIL Image to be processed. 124 | - vis_processors: A function that processes each patch and returns a tensor suitable for scoring. 125 | - score_processor: A function that takes a batch of patches and returns scores. 126 | - scales: A list of scales at which to crop the image. 127 | 128 | Returns: 129 | - A tuple of (cropped_images, scores_map) where scores_map is a numpy array of the same shape as the original image with scores assigned to each pixel. 130 | """ 131 | 132 | # Initialize lists to hold cropped images and their scores 133 | cropped_images = [] 134 | cropped_batches = [] 135 | 136 | # Define the crop function using numpy slicing 137 | def crop_numpy(img, num_x, num_y): 138 | np_img = np.array(img) 139 | h, w, _ = np_img.shape 140 | return [vis_processors(Image.fromarray(np_img[i * h // num_y:(i + 1) * h // num_y, 141 | j * w // num_x:(j + 1) * w // num_x, :])).to(blip_devices) 142 | for i in range(num_y) for j in range(num_x)] 143 | 144 | 145 | # Process each scale 146 | for scale in scales: 147 | num_x, num_y = scale, scale 148 | patches = crop_numpy(image, num_x, num_y) 149 | cropped_images.extend(patches) 150 | # batch = torch.stack(cropped_images) 151 | 152 | attr_score={} 153 | attr_score_map={} 154 | for attr in attributes: 155 | 156 | image_scores = score_processor({"image": torch.stack(cropped_images), "text_input": attr}, match_head='itc') 157 | attr_score[attr] = image_scores.detach().cpu().numpy() 158 | 159 | # Normalize the scores: TODO 160 | # total_score = sum(image_scores) 161 | # normalized_scores = [score / total_score for score in image_scores] 162 | 163 | # Create a map of scores for the original image 164 | scores_map = np.zeros(np.array(image).shape[:2], dtype=np.float32) 165 | patch_idx = 0 166 | for scale in scales: 167 | num_x, num_y = scale, scale 168 | patch_height = scores_map.shape[0] // num_y 169 | patch_width = scores_map.shape[1] // num_x 170 | for i in range(num_y): 171 | for j in range(num_x): 172 | # scores_map[i * patch_height:(i + 1) * patch_height, j * patch_width:(j + 1) * patch_width] += normalized_scores[patch_idx] 173 | scores_map[i * patch_height:(i + 1) * patch_height, j * patch_width:(j + 1) * patch_width] += image_scores[patch_idx].detach().cpu().numpy() 174 | patch_idx += 1 175 | 176 | # Average the scores map if there is overlap (simple averaging for overlap handling) 177 | # scores_map /= len(scales) 178 | attr_score_map[attr]=scores_map/3 179 | # pdb.set_trace() 180 | return cropped_images, attr_score_map 181 | 182 | def get_pyramid_clip_score(image, vis_processors, txt_processors,score_processor,attributes,blip_devices=None,scales=[1, 2, 4]): 183 | """ 184 | Crops the image into multiple patches at different scales, computes scores for each patch, 185 | normalizes 
these scores, and maps them back to the original image pixels. 186 | 187 | Parameters: 188 | - image: The PIL Image to be processed. 189 | - vis_processors: A function that processes each patch and returns a tensor suitable for scoring. 190 | - score_processor: A function that takes a batch of patches and returns scores. 191 | - scales: A list of scales at which to crop the image. 192 | 193 | Returns: 194 | - A tuple of (cropped_images, scores_map) where scores_map is a numpy array of the same shape as the original image with scores assigned to each pixel. 195 | """ 196 | 197 | # Initialize lists to hold cropped images and their scores 198 | cropped_images = [] 199 | cropped_batches = [] 200 | 201 | # Define the crop function using numpy slicing 202 | def crop_numpy(img, num_x, num_y): 203 | np_img = np.array(img) 204 | h, w, _ = np_img.shape 205 | return [vis_processors(Image.fromarray(np_img[i * h // num_y:(i + 1) * h // num_y, 206 | j * w // num_x:(j + 1) * w // num_x, :])).to(blip_devices) 207 | for i in range(num_y) for j in range(num_x)] 208 | 209 | # Process each scale 210 | for scale in scales: 211 | num_x, num_y = scale, scale 212 | patches = crop_numpy(image, num_x, num_y) 213 | cropped_images.extend(patches) 214 | # batch = torch.stack(cropped_images) 215 | 216 | attr_score_map = {} 217 | for att_i,attr in enumerate(attributes): 218 | sample = {"image": torch.stack(cropped_images), "text_input": txt_processors(attr)} 219 | 220 | clip_features = score_processor.extract_features(sample) 221 | 222 | features_image = clip_features.image_embeds_proj 223 | features_text = clip_features.text_embeds_proj 224 | 225 | image_scores = (features_image @ features_text.t()).squeeze() 226 | 227 | scores_map = np.zeros(np.array(image).shape[:2], dtype=np.float32) 228 | patch_idx = 0 229 | for scale in scales: 230 | num_x, num_y = scale, scale 231 | patch_height = scores_map.shape[0] // num_y 232 | patch_width = scores_map.shape[1] // num_x 233 | for i in range(num_y): 234 | for j in range(num_x): 235 | # scores_map[i * patch_height:(i + 1) * patch_height, j * patch_width:(j + 1) * patch_width] += normalized_scores[patch_idx] 236 | scores_map[i * patch_height:(i + 1) * patch_height, j * patch_width:(j + 1) * patch_width] += \ 237 | image_scores[patch_idx].detach().cpu().numpy() 238 | patch_idx += 1 239 | 240 | attr_score_map[attr] = scores_map / len(scales) 241 | 242 | return cropped_images, attr_score_map 243 | 244 | def get_att_score(image, vis_processors, txt_processors,score_processor,attributes,blip_devices=None): 245 | img=vis_processors(image).unsqueeze(0).to(blip_devices) 246 | 247 | txt = txt_processors(attributes[0]) 248 | txt_tokens = score_processor.tokenizer(txt, return_tensors="pt").to(blip_devices) 249 | # pdb.set_trace() 250 | gradcam, _ = compute_gradcam(score_processor, img, txt, txt_tokens, block_num=7) 251 | gradcam_np = gradcam[0][1].numpy().astype(np.float32) 252 | avg_gradcam = getAttMap(np.array(rgb_obs)/ 255., gradcam_np, blur=True,overlap=False) 253 | return avg_gradcam 254 | 255 | def save_scores_map(scores_map, filename='score_map.png'): 256 | """ 257 | Visualizes and saves the scores map with a color bar. 258 | 259 | Parameters: 260 | - scores_map: A numpy array with scores assigned to each pixel. 261 | - filename: The filename to save the image to. 
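Note: the scores are min-max normalised before plotting, so the colour bar
only reflects relative scores within this one figure.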
262 | """ 263 | 264 | # Normalize scores_map to the range 0-1 for better color mapping 265 | normalized_map = (scores_map - np.min(scores_map)) / (np.max(scores_map) - np.min(scores_map)) 266 | 267 | # Create a figure and an axes object 268 | fig, ax = plt.subplots() 269 | 270 | # Display the image 271 | cax = ax.imshow(normalized_map, cmap='viridis') # Choose a colormap that you prefer 272 | 273 | # Create a color bar 274 | cbar = fig.colorbar(cax, ax=ax) 275 | cbar.set_label('Score') 276 | 277 | # Save the figure 278 | plt.axis('off') # Turn off axis 279 | plt.savefig(filename, bbox_inches='tight', pad_inches=0) 280 | 281 | # Close the plot to free up memory 282 | plt.close(fig) 283 | 284 | def overlay_attention_map(image, scores_map, filename='attention_overlay.png'): 285 | """ 286 | Applies a smoothed attention map over an image to highlight areas based on the scores, 287 | and then saves the resulting image. 288 | 289 | Parameters: 290 | - image: A PIL Image to be enhanced. 291 | - scores_map: A numpy array with scores assigned to each pixel. 292 | - filename: The filename to save the image to. 293 | """ 294 | 295 | # Normalize scores_map to the range 0-1 for better color mapping 296 | normalized_map = (scores_map - np.min(scores_map)) / (np.max(scores_map) - np.min(scores_map)) 297 | 298 | # Apply Gaussian smoothing to the scores map 299 | smoothed_map = scipy.ndimage.gaussian_filter(normalized_map, sigma=2) # Adjust sigma as needed for the desired smoothness 300 | 301 | # Convert the original image to a numpy array for processing 302 | img_array = np.array(image) 303 | 304 | # Convert the smoothed scores map to a heatmap 305 | plt.figure(figsize=(10, 6)) 306 | plt.imshow(img_array, cmap='gray', interpolation='nearest') # Display the original image 307 | plt.imshow(smoothed_map, cmap='jet', alpha=0.6, interpolation='bilinear') # Overlay the attention map 308 | plt.colorbar() # Optionally add a color bar to indicate scoring scale 309 | plt.axis('off') # Hide axes 310 | # Save the figure 311 | plt.savefig(filename, bbox_inches='tight', pad_inches=0) 312 | plt.close() 313 | 314 | def doubly_right_detection(image,model, vis_processors,target,blip_devices=None): 315 | image = vis_processors(image).unsqueeze(0).to(blip_devices) 316 | out=model.generate({"image": image, "prompt": f"Question: is {target} shown in this image? 
Answer:"}) 317 | print(out) 318 | if "yes" in out[0].lower(): 319 | return True 320 | else: 321 | return False 322 | 323 | class Pos_Queue: 324 | def __init__(self, capacity): 325 | self.capacity = capacity 326 | self.queue = [] 327 | 328 | def add_element(self, element): 329 | # Add the new element to the queue 330 | self.queue.append(element) 331 | # Check if the queue is beyond capacity 332 | if len(self.queue) > self.capacity: 333 | # If it's full, pop and return the oldest element 334 | return self.queue.pop(0) 335 | # If not full, return None 336 | return None 337 | 338 | 339 | 340 | if __name__ == "__main__": 341 | # setup model 342 | # device = torch.device("cuda") if torch.cuda.is_available() else "cpu" 343 | 344 | # model, vis_processors, text_processors = load_model_and_preprocess(name="blip2_feature_extractor", 345 | # model_type="pretrain", is_eval=True, 346 | # device=device) 347 | 348 | # model, vis_processors, text_processors = load_model_and_preprocess("blip2_image_text_matching", "pretrain", 349 | # device=device, is_eval=True) 350 | 351 | # model, vis_processors, text_processors = load_model_and_preprocess("blip_image_text_matching", "large", 352 | # device=device, is_eval=True) 353 | 354 | # model, vis_processors, _ = load_model_and_preprocess(name="blip2_opt", model_type="pretrain_opt2.7b", 355 | # is_eval=True, device=device) 356 | 357 | # model, vis_processors, text_processors = load_model_and_preprocess("clip_feature_extractor", model_type="ViT-B-32", 358 | # is_eval=True, device=device) 359 | # load the observation 360 | # data_root="/home/mmvc/Desktop/L3MVN_Reproduce/L3MVN/vis/3d_score" 361 | # obs = np.load(os.path.join(data_root,"sample_T.npy"))#0:3 rgb, 4 depth 362 | 363 | # get rgb 364 | # rgb=obs[:,:,0:3].astype(np.uint8) 365 | # rgb_obs=Image.fromarray(rgb) 366 | # rgb_obs.save(os.path.join(data_root,"rgb.png")) 367 | # patches,batch=crop_img(rgb_obs,vis_processors["eval"]) 368 | # vis_patches(patches,os.path.join(data_root,"patches.png")) 369 | 370 | # get detph 371 | # depth=obs[:,:,4] 372 | # depth_img=depth_to_colormap(depth,) 373 | # depth_img.save(os.path.join(data_root,"depth.png")) 374 | 375 | #batch img and list txt 3.2s 376 | # txt_list=[] 377 | # for i in range(batch.shape[0]): 378 | # txt_list.append(txt) 379 | # for i in range(5): 380 | # itc_score = model({"image": batch, "text_input": txt_list}, match_head='itc') 381 | 382 | # single img and single txt 8.13s 383 | # txt="chair" 384 | # for j in range(5): 385 | # for i in range(batch.shape[0]): 386 | # itc_score = model({"image": batch[i][None,:], "text_input": txt}, match_head='itc') 387 | # print(itc_score) 388 | #batch img and single txt 3.1s 389 | # "leg", 390 | # "backrest", 391 | # "seat", 392 | # "armrest" 393 | 394 | # attributes=['chair backrest', 'chair seat', 'chair leg', 'chair armrest'] 395 | # target=["chair"] 396 | 397 | t=time.time() 398 | # ------------- multi_scale approach ------------------ 399 | # attributes=['object with chair leg, chair backrest, chair seat, and chair armrest'] 400 | # imgs,score_mape=get_pyramid_score(rgb_obs,vis_processors["eval"],text_processors["eval"],model,attributes,device) 401 | # imgs,score_mape=get_pyramid_score_feature(rgb_obs,vis_processors["eval"],text_processors["eval"],model,attributes,device) 402 | 403 | # ------------- gradcam approach ------------- 404 | # img=vis_processors["eval"](rgb_obs).unsqueeze(0).to(device) 405 | # txt = text_processors["eval"](attributes[0]) 406 | # txt_tokens = model.tokenizer(txt, 
return_tensors="pt").to(device) 407 | # gradcam, _ = compute_gradcam(model, img, txt, txt_tokens, block_num=7) 408 | # gradcam_np = gradcam[0][1].numpy().astype(np.float32) 409 | # avg_gradcam = getAttMap(np.array(rgb_obs)/ 255., gradcam_np, blur=True,overlap=False) 410 | 411 | # fig, ax = plt.subplots(1, 1, figsize=(10, 10)) 412 | # ax.imshow(avg_gradcam) 413 | # ax.axis('off') 414 | # plt.tight_layout() 415 | # plt.savefig(os.path.join(data_root, f"grad_score_map.png")) 416 | 417 | # ------------------ doubly right detection ------------------ 418 | # print(doubly_right_detection(rgb_obs,vis_processors["eval"],target[0],device)) 419 | # end_t=time.time() 420 | 421 | # ------------------ CLIP multi-scale ------------------ 422 | # imgs,score_mape=get_pyramid_clip_score(rgb_obs,vis_processors["eval"],text_processors["eval"],model,attributes,device) 423 | 424 | # check dataset 425 | data_pth='data/objectgoal_hm3d/val_mini/content/TEEsavR23oF.json.gz' 426 | with gzip.open(data_pth, 'r') as f: 427 | data = json.loads(f.read().decode('utf-8')) 428 | for k in data.keys(): 429 | print(k,len(data[k])) 430 | print("========================") 431 | print(data['episodes'][1]) 432 | print("========================") 433 | print(data['episodes'][2]) 434 | print("========================") 435 | print(data['episodes'][3]) 436 | end_t=time.time() 437 | 438 | # for k in score_mape.keys(): 439 | # overlay_attention_map(rgb_obs,score_mape[k],os.path.join(data_root,f"clip_score_map_{k}.png")) 440 | 441 | print(f"duration: {end_t-t} s") 442 | 443 | # print('The image feature and text feature has a cosine similarity of %.4f'%itc_score) 444 | -------------------------------------------------------------------------------- /envs/habitat/objectgoal_env.py: -------------------------------------------------------------------------------- 1 | import json 2 | import bz2 3 | import gzip 4 | import _pickle as cPickle 5 | import gym 6 | import numpy as np 7 | import quaternion 8 | import skimage.morphology 9 | import habitat 10 | 11 | from envs.utils.fmm_planner import FMMPlanner 12 | from constants import coco_categories, hm3d_category 13 | import envs.utils.pose as pu 14 | 15 | 16 | class ObjectGoal_Env(habitat.RLEnv): 17 | """The Object Goal Navigation environment class. The class is responsible 18 | for loading the dataset, generating episodes, and computing evaluation 19 | metrics. 
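The per-scene ground-truth semantic maps loaded from the dataset info file in
__init__ are used both to build episodes and to compute success, SPL and
distance-to-goal.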
20 | """ 21 | 22 | def __init__(self, args, rank, config_env, dataset): 23 | self.args = args 24 | self.rank = rank 25 | 26 | super().__init__(config_env, dataset) 27 | 28 | # Loading dataset info file 29 | self.split = config_env.DATASET.SPLIT 30 | self.episodes_dir = config_env.DATASET.EPISODES_DIR.format( 31 | split=self.split) 32 | 33 | dataset_info_file = self.episodes_dir + \ 34 | "{split}_info.pbz2".format(split=self.split) 35 | with bz2.BZ2File(dataset_info_file, 'rb') as f: 36 | self.dataset_info = cPickle.load(f) 37 | 38 | # Specifying action and observation space 39 | self.action_space = gym.spaces.Discrete(3) 40 | 41 | self.observation_space = gym.spaces.Box(0, 255, 42 | (3, args.frame_height, 43 | args.frame_width), 44 | dtype='uint8') 45 | 46 | # Initializations 47 | self.episode_no = 0 48 | 49 | # Scene info 50 | self.last_scene_path = None 51 | self.scene_path = None 52 | self.scene_name = None 53 | 54 | # Episode Dataset info 55 | self.eps_data = None 56 | self.eps_data_idx = None 57 | self.gt_planner = None 58 | self.object_boundary = None 59 | self.goal_idx = None 60 | self.goal_name = None 61 | self.map_obj_origin = None 62 | self.starting_loc = None 63 | self.starting_distance = None 64 | 65 | # Episode tracking info 66 | self.curr_distance = None 67 | self.prev_distance = None 68 | self.timestep = None 69 | self.stopped = None 70 | self.path_length = None 71 | self.last_sim_location = None 72 | self.trajectory_states = [] 73 | self.info = {} 74 | self.info['distance_to_goal'] = None 75 | self.info['spl'] = None 76 | self.info['success'] = None 77 | 78 | fileName = 'data/matterport_category_mappings.tsv' 79 | 80 | text = '' 81 | lines = [] 82 | items = [] 83 | self.hm3d_semantic_mapping={} 84 | 85 | with open(fileName, 'r') as f: 86 | text = f.read() 87 | lines = text.split('\n') 88 | 89 | for l in lines: 90 | items.append(l.split(' ')) 91 | 92 | for i in items: 93 | if len(i) > 3: 94 | self.hm3d_semantic_mapping[i[2]] = i[-1] 95 | 96 | def load_new_episode(self): 97 | """The function loads a fixed episode from the episode dataset. This 98 | function is used for evaluating a trained model on the val split. 
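Start positions are converted from Habitat world coordinates to ground-truth
map indices at 20 cells per metre, and the FMM distance field on that map
gives the starting geodesic distance to the goal.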
99 | """ 100 | 101 | args = self.args 102 | self.scene_path = self.habitat_env.sim.curr_scene_name 103 | scene_name = self.scene_path.split("/")[-1].split(".")[0] 104 | 105 | if self.scene_path != self.last_scene_path: 106 | episodes_file = self.episodes_dir + \ 107 | "content/{}_episodes.json.gz".format(scene_name) 108 | 109 | print("Loading episodes from: {}".format(episodes_file)) 110 | with gzip.open(episodes_file, 'r') as f: 111 | self.eps_data = json.loads( 112 | f.read().decode('utf-8'))["episodes"] 113 | 114 | self.eps_data_idx = 0 115 | self.last_scene_path = self.scene_path 116 | 117 | # Load episode info 118 | episode = self.eps_data[self.eps_data_idx] 119 | self.eps_data_idx += 1 120 | self.eps_data_idx = self.eps_data_idx % len(self.eps_data) 121 | pos = episode["start_position"] 122 | rot = quaternion.from_float_array(episode["start_rotation"]) 123 | 124 | goal_name = episode["object_category"] 125 | goal_idx = episode["object_id"] 126 | floor_idx = episode["floor_id"] 127 | 128 | # Load scene info 129 | scene_info = self.dataset_info[scene_name] 130 | sem_map = scene_info[floor_idx]['sem_map'] 131 | map_obj_origin = scene_info[floor_idx]['origin'] 132 | 133 | # Setup ground truth planner 134 | object_boundary = args.success_dist 135 | map_resolution = args.map_resolution 136 | selem = skimage.morphology.disk(2) 137 | traversible = skimage.morphology.binary_dilation( 138 | sem_map[0], selem) != True 139 | traversible = 1 - traversible 140 | planner = FMMPlanner(traversible) 141 | selem = skimage.morphology.disk( 142 | int(object_boundary * 100. / map_resolution)) 143 | goal_map = skimage.morphology.binary_dilation( 144 | sem_map[goal_idx + 1], selem) != True 145 | goal_map = 1 - goal_map 146 | planner.set_multi_goal(goal_map) 147 | 148 | # Get starting loc in GT map coordinates 149 | x = -pos[2] 150 | y = -pos[0] 151 | min_x, min_y = map_obj_origin / 100.0 152 | map_loc = int((-y - min_y) * 20.), int((-x - min_x) * 20.) 153 | 154 | self.gt_planner = planner 155 | self.starting_loc = map_loc 156 | self.object_boundary = object_boundary 157 | self.goal_idx = goal_idx 158 | self.goal_name = goal_name 159 | self.map_obj_origin = map_obj_origin 160 | 161 | self.starting_distance = self.gt_planner.fmm_dist[self.starting_loc]\ 162 | / 20.0 + self.object_boundary 163 | self.prev_distance = self.starting_distance 164 | self._env.sim.set_agent_state(pos, rot) 165 | 166 | # The following two should match approximately 167 | # print(starting_loc) 168 | # print(self.sim_continuous_to_sim_map(self.get_sim_location())) 169 | 170 | obs = self._env.sim.get_observations_at(pos, rot) 171 | 172 | return obs 173 | 174 | def generate_new_episode(self): 175 | """The function generates a random valid episode. This function is used 176 | for training a model on the train split. 
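A floor and a goal category present on that floor are sampled at random, and
the start location is restricted to traversible cells whose geodesic distance
to the goal lies roughly between args.min_d and args.max_d.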
177 | """ 178 | 179 | args = self.args 180 | 181 | self.scene_path = self.habitat_env.sim.config.SCENE 182 | scene_name = self.scene_path.split("/")[-1].split(".")[0] 183 | 184 | scene_info = self.dataset_info[scene_name] 185 | map_resolution = args.map_resolution 186 | 187 | floor_idx = np.random.randint(len(scene_info.keys())) 188 | floor_height = scene_info[floor_idx]['floor_height'] 189 | sem_map = scene_info[floor_idx]['sem_map'] 190 | map_obj_origin = scene_info[floor_idx]['origin'] 191 | 192 | cat_counts = sem_map.sum(2).sum(1) 193 | possible_cats = list(np.arange(6)) 194 | 195 | for i in range(6): 196 | if cat_counts[i + 1] == 0: 197 | possible_cats.remove(i) 198 | 199 | object_boundary = args.success_dist 200 | 201 | loc_found = False 202 | while not loc_found: 203 | if len(possible_cats) == 0: 204 | print("No valid objects for {}".format(floor_height)) 205 | eps = eps - 1 206 | continue 207 | 208 | goal_idx = np.random.choice(possible_cats) 209 | 210 | for key, value in coco_categories.items(): 211 | if value == goal_idx: 212 | goal_name = key 213 | 214 | selem = skimage.morphology.disk(2) 215 | traversible = skimage.morphology.binary_dilation( 216 | sem_map[0], selem) != True 217 | traversible = 1 - traversible 218 | 219 | planner = FMMPlanner(traversible) 220 | 221 | selem = skimage.morphology.disk( 222 | int(object_boundary * 100. / map_resolution)) 223 | goal_map = skimage.morphology.binary_dilation( 224 | sem_map[goal_idx + 1], selem) != True 225 | goal_map = 1 - goal_map 226 | 227 | planner.set_multi_goal(goal_map) 228 | 229 | m1 = sem_map[0] > 0 230 | m2 = planner.fmm_dist > (args.min_d - object_boundary) * 20.0 231 | m3 = planner.fmm_dist < (args.max_d - object_boundary) * 20.0 232 | 233 | possible_starting_locs = np.logical_and(m1, m2) 234 | possible_starting_locs = np.logical_and( 235 | possible_starting_locs, m3) * 1. 236 | if possible_starting_locs.sum() != 0: 237 | loc_found = True 238 | else: 239 | print("Invalid object: {} / {} / {}".format( 240 | scene_name, floor_height, goal_name)) 241 | possible_cats.remove(goal_idx) 242 | scene_info[floor_idx]["sem_map"][goal_idx + 1, :, :] = 0. 243 | self.dataset_info[scene_name][floor_idx][ 244 | "sem_map"][goal_idx + 1, :, :] = 0. 245 | 246 | loc_found = False 247 | while not loc_found: 248 | pos = self._env.sim.sample_navigable_point() 249 | x = -pos[2] 250 | y = -pos[0] 251 | min_x, min_y = map_obj_origin / 100.0 252 | map_loc = int((-y - min_y) * 20.), int((-x - min_x) * 20.) 
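# Accept the sampled point only if it lies on the chosen floor (within
# floor_thr centimetres of its height) and falls inside the precomputed set
# of valid starting cells.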
253 | if abs(pos[1] - floor_height) < args.floor_thr / 100.0 and \ 254 | possible_starting_locs[map_loc[0], map_loc[1]] == 1: 255 | loc_found = True 256 | 257 | agent_state = self._env.sim.get_agent_state(0) 258 | rotation = agent_state.rotation 259 | rvec = quaternion.as_rotation_vector(rotation) 260 | rvec[1] = np.random.rand() * 2 * np.pi 261 | rot = quaternion.from_rotation_vector(rvec) 262 | 263 | self.gt_planner = planner 264 | self.starting_loc = map_loc 265 | self.object_boundary = object_boundary 266 | self.goal_idx = goal_idx 267 | self.goal_name = goal_name 268 | self.map_obj_origin = map_obj_origin 269 | 270 | self.starting_distance = self.gt_planner.fmm_dist[self.starting_loc] \ 271 | / 20.0 + self.object_boundary 272 | self.prev_distance = self.starting_distance 273 | 274 | self._env.sim.set_agent_state(pos, rot) 275 | 276 | # The following two should match approximately 277 | # print(starting_loc) 278 | # print(self.sim_continuous_to_sim_map(self.get_sim_location())) 279 | 280 | obs = self._env.sim.get_observations_at(pos, rot) 281 | 282 | return obs 283 | 284 | def sim_map_to_sim_continuous(self, coords): 285 | """Converts ground-truth 2D Map coordinates to absolute Habitat 286 | simulator position and rotation. 287 | """ 288 | agent_state = self._env.sim.get_agent_state(0) 289 | y, x = coords 290 | min_x, min_y = self.map_obj_origin / 100.0 291 | 292 | cont_x = x / 20. + min_x 293 | cont_y = y / 20. + min_y 294 | agent_state.position[0] = cont_y 295 | agent_state.position[2] = cont_x 296 | 297 | rotation = agent_state.rotation 298 | rvec = quaternion.as_rotation_vector(rotation) 299 | 300 | if self.args.train_single_eps: 301 | rvec[1] = 0.0 302 | else: 303 | rvec[1] = np.random.rand() * 2 * np.pi 304 | rot = quaternion.from_rotation_vector(rvec) 305 | 306 | return agent_state.position, rot 307 | 308 | def sim_continuous_to_sim_map(self, sim_loc): 309 | """Converts absolute Habitat simulator pose to ground-truth 2D Map 310 | coordinates. 311 | """ 312 | x, y, o = sim_loc 313 | min_x, min_y = self.map_obj_origin / 100.0 314 | x, y = int((-x - min_x) * 20.), int((-y - min_y) * 20.) 315 | 316 | o = np.rad2deg(o) + 180.0 317 | return y, x, o 318 | 319 | def reset(self): 320 | """Resets the environment to a new episode. 
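A new scene is loaded every args.num_train_episodes episodes; on the val
split a fixed episode is read from disk, otherwise a random episode is
generated.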
321 | 322 | Returns: 323 | obs (ndarray): RGBD observations (4 x H x W) 324 | info (dict): contains timestep, pose, goal category and 325 | evaluation metric info 326 | """ 327 | args = self.args 328 | new_scene = self.episode_no % args.num_train_episodes == 0 329 | 330 | self.scene = self._env.sim.semantic_annotations() 331 | 332 | self.episode_no += 1 333 | 334 | # Initializations 335 | self.timestep = 0 336 | self.stopped = False 337 | self.path_length = 1e-5 338 | self.trajectory_states = [] 339 | 340 | if new_scene: 341 | obs = super().reset() 342 | self.scene_name = self.habitat_env.sim.curr_scene_name 343 | print("Changing scene: {}/{}".format(self.rank, self.scene_name)) 344 | 345 | self.scene_path = self.habitat_env.sim.curr_scene_name 346 | 347 | if self.split == "val": 348 | obs = self.load_new_episode() 349 | else: 350 | obs = self.generate_new_episode() 351 | 352 | # print("obs: ,", obs) 353 | # print("obs shape: ,", obs.shape) 354 | rgb = obs['rgb'].astype(np.uint8) 355 | depth = obs['depth'] 356 | semantic = self._preprocess_semantic(obs["semantic"]) 357 | # print("rgb shape: ,", rgb.shape) 358 | # print("depth shape: ,", depth.shape) 359 | # print("semantic shape: ,", semantic.shape) 360 | 361 | state = np.concatenate((rgb, depth, semantic), axis=2).transpose(2, 0, 1) 362 | self.last_sim_location = self.get_sim_location() 363 | 364 | # Set info 365 | self.info['time'] = self.timestep 366 | self.info['sensor_pose'] = [0., 0., 0.] 367 | self.info['goal_cat_id'] = self.goal_idx 368 | self.info['goal_name'] = hm3d_category[self.goal_idx] 369 | 370 | return state, self.info 371 | 372 | def step(self, action): 373 | """Function to take an action in the environment. 374 | 375 | Args: 376 | action (dict): 377 | dict with following keys: 378 | 'action' (int): 0: stop, 1: forward, 2: left, 3: right 379 | 380 | Returns: 381 | obs (ndarray): RGBD observations (4 x H x W) 382 | reward (float): amount of reward returned after previous action 383 | done (bool): whether the episode has ended 384 | info (dict): contains timestep, pose, goal category and 385 | evaluation metric info 386 | """ 387 | action = action["action"] 388 | if action == 0: 389 | self.stopped = True 390 | # Not sending stop to simulator, resetting manually 391 | action = 3 392 | 393 | obs, rew, done, _ = super().step(action) 394 | 395 | # Get pose change 396 | dx, dy, do = self.get_pose_change() 397 | self.info['sensor_pose'] = [dx, dy, do] 398 | self.path_length += pu.get_l2_distance(0, dx, 0, dy) 399 | 400 | spl, success, dist = 0., 0., 0. 
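# The navigation metrics are only computed once the episode terminates;
# until then they are reported as zeros in info.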
401 | if done: 402 | spl, success, dist = self.get_metrics() 403 | self.info['distance_to_goal'] = dist 404 | self.info['spl'] = spl 405 | self.info['success'] = success 406 | 407 | rgb = obs['rgb'].astype(np.uint8) 408 | depth = obs['depth'] 409 | semantic = self._preprocess_semantic(obs["semantic"]) 410 | state = np.concatenate((rgb, depth, semantic), axis=2).transpose(2, 0, 1) 411 | 412 | self.timestep += 1 413 | self.info['time'] = self.timestep 414 | 415 | return state, rew, done, self.info 416 | 417 | def _preprocess_semantic(self, semantic): 418 | # print("*********semantic type: ", type(semantic)) 419 | se = list(set(semantic.ravel())) 420 | # print(se) # [] 421 | for i in range(len(se)): 422 | if self.scene.objects[se[i]] is not None and self.scene.objects[se[i]].category.name() in coco_categories: 423 | # print(self.scene.objects[se[i]].id) 424 | # print(self.scene.objects[se[i]].category.index()) 425 | # print(type(self.scene.objects[se[i]].category.index()) ) # int 426 | semantic[semantic==se[i]] = coco_categories[self.scene.objects[se[i]].category.name()]+1 427 | # print(self.scene.objects[se[i]+1].category.name()) 428 | else : 429 | semantic[ 430 | semantic==se[i] 431 | ] = 0 432 | 433 | # se = list(set(semantic.ravel())) 434 | # print("semantic: ", se) # [] 435 | # semantic = np.expand_dims(semantic.astype(np.uint8), 2) 436 | return semantic 437 | 438 | def get_reward_range(self): 439 | """This function is not used, Habitat-RLEnv requires this function""" 440 | return (0., 1.0) 441 | 442 | def get_reward(self, observations): 443 | curr_loc = self.sim_continuous_to_sim_map(self.get_sim_location()) 444 | self.curr_distance = self.gt_planner.fmm_dist[curr_loc[0], 445 | curr_loc[1]] / 20.0 446 | 447 | reward = (self.prev_distance - self.curr_distance) * \ 448 | self.args.reward_coeff 449 | 450 | self.prev_distance = self.curr_distance 451 | return reward 452 | 453 | def get_llm_distance(self, target_point_map, frontier_loc_g): 454 | 455 | tpm = len(list(set(target_point_map.ravel()))) -1 456 | frontier_dis_g = 0 457 | if tpm > 1: 458 | curr_loc = self.sim_continuous_to_sim_map(self.get_sim_location()) 459 | frontier_loc_g[0][0] = frontier_loc_g[0][0] - target_point_map.shape[0]/2 + curr_loc[0] 460 | frontier_loc_g[1][0] = frontier_loc_g[1][0] - target_point_map.shape[1]/2 + curr_loc[1] 461 | (aa, bb) = self.gt_planner.traversible.shape 462 | if frontier_loc_g[0][0] > aa-1: 463 | frontier_loc_g[0][0] = aa-1 464 | if frontier_loc_g[1][0] > bb-1: 465 | frontier_loc_g[1][0] = bb -1 466 | frontier_dis_g = self.gt_planner.fmm_dist[frontier_loc_g[0], 467 | frontier_loc_g[1]] / 20.0 468 | for lay in range(tpm): 469 | frontier_loc = np.where(target_point_map == lay+1) 470 | curr_loc = self.sim_continuous_to_sim_map(self.get_sim_location()) 471 | frontier_loc[0][0] = frontier_loc[0][0] - target_point_map.shape[0]/2 + curr_loc[0] 472 | frontier_loc[1][0] = frontier_loc[1][0] - target_point_map.shape[1]/2 + curr_loc[1] 473 | (aa, bb) = self.gt_planner.traversible.shape 474 | if frontier_loc[0][0] > aa-1: 475 | frontier_loc[0][0] = aa-1 476 | if frontier_loc[1][0] > bb-1: 477 | frontier_loc[1][0] = bb -1 478 | frontier_distance = self.gt_planner.fmm_dist[frontier_loc[0], 479 | frontier_loc[1]] / 20.0 480 | if frontier_distance < frontier_dis_g: 481 | return 0 482 | return 1 483 | else: 484 | return 0 485 | 486 | 487 | def get_metrics(self): 488 | """This function computes evaluation metrics for the Object Goal task 489 | 490 | Returns: 491 | spl (float): Success weighted by Path Length 492 | 
(See https://arxiv.org/pdf/1807.06757.pdf) 493 | success (int): 0: Failure, 1: Successful 494 | dist (float): Distance to Success (DTS), distance of the agent 495 | from the success threshold boundary in meters. 496 | (See https://arxiv.org/pdf/2007.00643.pdf) 497 | """ 498 | curr_loc = self.sim_continuous_to_sim_map(self.get_sim_location()) 499 | dist = self.gt_planner.fmm_dist[curr_loc[0], curr_loc[1]] / 20.0 500 | if dist == 0.0: 501 | success = 1 502 | else: 503 | success = 0 504 | spl = min(success * self.starting_distance / self.path_length, 1) 505 | return spl, success, dist 506 | 507 | def get_done(self, observations): 508 | if self.info['time'] >= self.args.max_episode_length - 1: 509 | done = True 510 | elif self.stopped: 511 | done = True 512 | else: 513 | done = False 514 | return done 515 | 516 | def get_info(self, observations): 517 | """This function is not used, Habitat-RLEnv requires this function""" 518 | info = {} 519 | return info 520 | 521 | def get_spaces(self): 522 | """Returns observation and action spaces for the ObjectGoal task.""" 523 | return self.observation_space, self.action_space 524 | 525 | def get_sim_location(self): 526 | """Returns x, y, o pose of the agent in the Habitat simulator.""" 527 | 528 | agent_state = super().habitat_env.sim.get_agent_state(0) 529 | x = -agent_state.position[2] 530 | y = -agent_state.position[0] 531 | axis = quaternion.as_euler_angles(agent_state.rotation)[0] 532 | if (axis % (2 * np.pi)) < 0.1 or (axis % 533 | (2 * np.pi)) > 2 * np.pi - 0.1: 534 | o = quaternion.as_euler_angles(agent_state.rotation)[1] 535 | else: 536 | o = 2 * np.pi - quaternion.as_euler_angles(agent_state.rotation)[1] 537 | if o > np.pi: 538 | o -= 2 * np.pi 539 | return x, y, o 540 | 541 | def get_pose_change(self): 542 | """Returns dx, dy, do pose change of the agent relative to the last 543 | timestep.""" 544 | curr_sim_pose = self.get_sim_location() 545 | dx, dy, do = pu.get_rel_pose_change( 546 | curr_sim_pose, self.last_sim_location) 547 | self.last_sim_location = curr_sim_pose 548 | return dx, dy, do 549 | -------------------------------------------------------------------------------- /agents/sem_exp.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import sys 4 | 5 | import cv2 6 | import numpy as np 7 | import skimage.morphology 8 | from PIL import Image 9 | from torchvision import transforms 10 | import time 11 | 12 | from envs.utils.fmm_planner import FMMPlanner 13 | from envs.habitat.objectgoal_env import ObjectGoal_Env 14 | from envs.habitat.objectgoal_env21 import ObjectGoal_Env21 15 | from agents.utils.semantic_prediction import SemanticPredMaskRCNN 16 | from constants import color_palette 17 | import envs.utils.pose as pu 18 | import agents.utils.visualization as vu 19 | 20 | from RedNet.RedNet_model import load_rednet 21 | from constants import mp_categories_mapping 22 | import torch 23 | from get_point_score import * 24 | 25 | class Sem_Exp_Env_Agent(ObjectGoal_Env21): 26 | """The Sem_Exp environment agent class. A seperate Sem_Exp_Env_Agent class 27 | object is used for each environment thread. 
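It extends ObjectGoal_Env21 with semantic segmentation (Mask R-CNN and
RedNet), local FMM-based planning, and optional visualisation.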
28 | 29 | """ 30 | 31 | def __init__(self, args, rank, config_env, dataset): 32 | 33 | self.args = args 34 | super().__init__(args, rank, config_env, dataset) 35 | 36 | # initialize transform for RGB observations 37 | self.res = transforms.Compose( 38 | [transforms.ToPILImage(), 39 | transforms.Resize((args.frame_height, args.frame_width), 40 | interpolation=Image.NEAREST)]) 41 | 42 | # initialize semantic segmentation prediction model 43 | if args.sem_gpu_id == -1: 44 | args.sem_gpu_id = config_env.SIMULATOR.HABITAT_SIM_V0.GPU_DEVICE_ID 45 | 46 | self.device = args.device 47 | self.sem_pred = SemanticPredMaskRCNN(args) 48 | self.red_sem_pred = load_rednet( 49 | self.device, ckpt='RedNet/model/rednet_semmap_mp3d_40.pth', resize=True, # since we train on half-vision 50 | ) 51 | self.red_sem_pred.eval() 52 | # self.red_sem_pred.to(self.device) 53 | 54 | 55 | # initializations for planning: 56 | self.selem = skimage.morphology.disk(3) 57 | 58 | self.obs = None 59 | self.obs_shape = None 60 | self.collision_map = None 61 | self.visited = None 62 | self.visited_vis = None 63 | self.col_width = None 64 | self.curr_loc = None 65 | self.last_loc = None 66 | self.last_action = None 67 | self.count_forward_actions = None 68 | 69 | self.replan_count = 0 70 | self.collision_n = 0 71 | self.kernel = cv2.getStructuringElement(cv2.MORPH_RECT,(3, 3)) 72 | 73 | if args.visualize or args.print_images: 74 | self.legend = cv2.imread('docs/legend.png') 75 | self.vis_image = None 76 | self.rgb_vis = None 77 | 78 | self.fail_case = {} 79 | self.fail_case['collision'] = 0 80 | self.fail_case['success'] = 0 81 | self.fail_case['detection'] = 0 82 | self.fail_case['exploration'] = 0 83 | 84 | self.eve_angle = 0 85 | 86 | def reset(self): 87 | args = self.args 88 | 89 | self.replan_count = 0 90 | self.collision_n = 0 91 | 92 | obs, info = super().reset() 93 | store_obs=obs.transpose(1, 2, 0) 94 | # print(obs.shape) 95 | # print(store_obs.shape) 96 | # print("!!!") 97 | # np.save("vis/3d_score/sample_T.npy",store_obs) 98 | obs = self._preprocess_obs(obs) 99 | 100 | self.obs_shape = obs.shape 101 | 102 | # Episode initializations 103 | map_shape = (args.map_size_cm // args.map_resolution, 104 | args.map_size_cm // args.map_resolution) 105 | self.collision_map = np.zeros(map_shape) 106 | self.visited = np.zeros(map_shape) 107 | self.visited_vis = np.zeros(map_shape) 108 | self.col_width = 1 109 | self.count_forward_actions = 0 110 | self.curr_loc = [args.map_size_cm / 100.0 / 2.0, 111 | args.map_size_cm / 100.0 / 2.0, 0.] 
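        # Start pose: the agent is placed at the centre of the global map,
        # (map_size_cm / 100) / 2 metres along each axis, with heading 0 deg.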
112 | self.last_action = None 113 | 114 | self.eve_angle = 0 115 | self.eve_angle_old = 0 116 | 117 | info['eve_angle'] = self.eve_angle 118 | 119 | 120 | if args.visualize or args.print_images: 121 | self.vis_image = vu.init_vis_image(self.goal_name, self.legend) 122 | 123 | # print(obs.shape) 124 | return obs, info 125 | 126 | def plan_act_and_preprocess(self, planner_inputs): 127 | """Function responsible for planning, taking the action and 128 | preprocessing observations 129 | 130 | Args: 131 | planner_inputs (dict): 132 | dict with following keys: 133 | 'map_pred' (ndarray): (M, M) map prediction 134 | 'goal' (ndarray): (M, M) mat denoting goal locations 135 | 'pose_pred' (ndarray): (7,) array denoting pose (x,y,o) 136 | and planning window (gx1, gx2, gy1, gy2) 137 | 'found_goal' (bool): whether the goal object is found 138 | 139 | Returns: 140 | obs (ndarray): preprocessed observations ((4+C) x H x W) 141 | reward (float): amount of reward returned after previous action 142 | done (bool): whether the episode has ended 143 | info (dict): contains timestep, pose, goal category and 144 | evaluation metric info 145 | """ 146 | # s_time = time.time() 147 | 148 | # plan 149 | if planner_inputs["wait"]: 150 | self.last_action = None 151 | self.info["sensor_pose"] = [0., 0., 0.] 152 | return np.zeros(self.obs.shape), self.fail_case, False, self.info 153 | 154 | # Reset reward if new long-term goal 155 | if planner_inputs["new_goal"]: 156 | goal = planner_inputs['goal'] 157 | if np.sum(goal == 1) == 1 and self.args.task_config == "tasks/objectnav_gibson.yaml": 158 | frontier_loc = np.where(goal == 1) 159 | self.info["g_reward"] = self.get_llm_distance(planner_inputs["map_target"], frontier_loc) 160 | 161 | # self.collision_map = np.zeros(self.visited.shape) 162 | self.info['clear_flag'] = 0 163 | 164 | action = self._plan(planner_inputs) 165 | 166 | # c_time = time.time() 167 | # ss_time = c_time - s_time 168 | # print('plan map: %.3f秒'%ss_time) 0.19 169 | 170 | if self.collision_n > 20 or self.replan_count > 26: 171 | self.info['clear_flag'] = 1 172 | self.collision_n = 0 173 | 174 | if self.args.visualize or self.args.print_images: 175 | 176 | self._visualize(planner_inputs) 177 | 178 | if action >= 0: 179 | 180 | # act 181 | action = {'action': action} 182 | obs, rew, done, info = super().step(action) 183 | 184 | if done and self.info['success'] == 0: 185 | if self.info['time'] >= self.args.max_episode_length - 1: 186 | self.fail_case['exploration'] += 1 187 | elif self.replan_count > 26: 188 | self.fail_case['collision'] += 1 189 | else: 190 | self.fail_case['detection'] += 1 191 | 192 | if done and self.info['success'] == 1: 193 | self.fail_case['success'] += 1 194 | 195 | # preprocess obs 196 | obs = self._preprocess_obs(obs) 197 | self.last_action = action['action'] 198 | self.obs = obs 199 | 200 | info['last_action'] = action['action'] # Updated to get action info as well 201 | self.info = info 202 | info['eve_angle'] = self.eve_angle 203 | 204 | 205 | # e_time = time.time() 206 | # ss_time = e_time - c_time 207 | # print('act map: %.3f秒'%ss_time) 0.23 208 | 209 | # info['g_reward'] += rew 210 | 211 | return obs, self.fail_case, done, info 212 | 213 | else: 214 | self.last_action = None 215 | self.info["sensor_pose"] = [0., 0., 0.] 
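            # No simulator step is taken on this branch, so a zero (dx, dy, do)
            # sensor pose change is reported and the cached info dict is
            # returned unchanged.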
216 | info['last_action'] = None # Updated to get action info as well 217 | 218 | return np.zeros(self.obs_shape), self.fail_case, False, self.info 219 | 220 | def _plan(self, planner_inputs): 221 | """Function responsible for planning 222 | 223 | Args: 224 | planner_inputs (dict): 225 | dict with following keys: 226 | 'map_pred' (ndarray): (M, M) map prediction 227 | 'goal' (ndarray): (M, M) goal locations 228 | 'pose_pred' (ndarray): (7,) array denoting pose (x,y,o) 229 | and planning window (gx1, gx2, gy1, gy2) 230 | 'found_goal' (bool): whether the goal object is found 231 | 232 | Returns: 233 | action (int): action id 234 | """ 235 | args = self.args 236 | 237 | self.last_loc = self.curr_loc 238 | 239 | # Get Map prediction 240 | map_pred = np.rint(planner_inputs['map_pred']) 241 | exp_pred = np.rint(planner_inputs['exp_pred']) 242 | goal = planner_inputs['goal'] 243 | 244 | # Get pose prediction and global policy planning window 245 | start_x, start_y, start_o, gx1, gx2, gy1, gy2 = \ 246 | planner_inputs['pose_pred'] 247 | gx1, gx2, gy1, gy2 = int(gx1), int(gx2), int(gy1), int(gy2) 248 | planning_window = [gx1, gx2, gy1, gy2] 249 | 250 | # Get curr loc 251 | self.curr_loc = [start_x, start_y, start_o] 252 | r, c = start_y, start_x 253 | start = [int(r * 100.0 / args.map_resolution - gx1), 254 | int(c * 100.0 / args.map_resolution - gy1)] 255 | start = pu.threshold_poses(start, map_pred.shape) 256 | 257 | self.visited[gx1:gx2, gy1:gy2][start[0] - 0:start[0] + 1, 258 | start[1] - 0:start[1] + 1] = 1 259 | 260 | # if args.visualize or args.print_images: 261 | # Get last loc 262 | last_start_x, last_start_y = self.last_loc[0], self.last_loc[1] 263 | r, c = last_start_y, last_start_x 264 | last_start = [int(r * 100.0 / args.map_resolution - gx1), 265 | int(c * 100.0 / args.map_resolution - gy1)] 266 | last_start = pu.threshold_poses(last_start, map_pred.shape) 267 | self.visited_vis[gx1:gx2, gy1:gy2] = \ 268 | vu.draw_line(last_start, start, 269 | self.visited_vis[gx1:gx2, gy1:gy2]) 270 | 271 | # Collision check 272 | if self.last_action == 1 and not planner_inputs["new_goal"]: 273 | x1, y1, t1 = self.last_loc 274 | x2, y2, _ = self.curr_loc 275 | buf = 4 276 | length = 2 277 | 278 | if abs(x1 - x2) < 0.05 and abs(y1 - y2) < 0.05: 279 | self.col_width += 2 280 | if self.col_width == 7: 281 | length = 4 282 | buf = 3 283 | self.col_width = min(self.col_width, 5) 284 | else: 285 | self.col_width = 1 286 | 287 | dist = pu.get_l2_distance(x1, x2, y1, y2) 288 | if dist < args.collision_threshold: # Collision 289 | self.collision_n += 1 290 | width = self.col_width 291 | for i in range(length): 292 | for j in range(width): 293 | wx = x1 + 0.05 * \ 294 | ((i + buf) * np.cos(np.deg2rad(t1)) 295 | + (j - width // 2) * np.sin(np.deg2rad(t1))) 296 | wy = y1 + 0.05 * \ 297 | ((i + buf) * np.sin(np.deg2rad(t1)) 298 | - (j - width // 2) * np.cos(np.deg2rad(t1))) 299 | r, c = wy, wx 300 | r, c = int(r * 100 / args.map_resolution), \ 301 | int(c * 100 / args.map_resolution) 302 | [r, c] = pu.threshold_poses([r, c], 303 | self.collision_map.shape) 304 | self.collision_map[r, c] = 1 305 | 306 | stg, replan, stop = self._get_stg(map_pred, start, np.copy(goal), 307 | planning_window) 308 | 309 | if replan: 310 | self.replan_count += 1 311 | print("false: ", self.replan_count) 312 | else: 313 | self.replan_count = 0 314 | 315 | # Deterministic Local Policy 316 | if (stop and planner_inputs['found_goal'] == 1) or self.replan_count > 26: 317 | action = 0 # Stop 318 | else: 319 | (stg_x, stg_y) = stg 320 | 
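            # Sketch of the local policy below: compute the bearing of the
            # short-term goal relative to the agent's heading; look down
            # (action 5) while the cell ~5 grid cells ahead is unexplored and
            # look back up (action 4) once it is explored, otherwise turn
            # right (3) / left (2) when the relative angle exceeds half the
            # turn angle, and move forward (1) when roughly facing the goal.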
angle_st_goal = math.degrees(math.atan2(stg_x - start[0],
321 |                                                     stg_y - start[1]))
322 |             angle_agent = (start_o) % 360.0
323 |             if angle_agent > 180:
324 |                 angle_agent -= 360
325 | 
326 |             relative_angle = (angle_agent - angle_st_goal) % 360.0
327 |             if relative_angle > 180:
328 |                 relative_angle -= 360
329 | 
330 |             ## add the elevation angle
331 |             eve_start_x = int(5 * math.sin(math.radians(angle_st_goal)) + start[0])
332 |             eve_start_y = int(5 * math.cos(math.radians(angle_st_goal)) + start[1])
333 |             if eve_start_x >= map_pred.shape[0]: eve_start_x = map_pred.shape[0] - 1
334 |             if eve_start_y >= map_pred.shape[1]: eve_start_y = map_pred.shape[1] - 1
335 |             if eve_start_x < 0: eve_start_x = 0
336 |             if eve_start_y < 0: eve_start_y = 0
337 |             if exp_pred[eve_start_x, eve_start_y] == 0 and self.eve_angle > -60:
338 |                 action = 5
339 |                 self.eve_angle -= 30
340 |             elif exp_pred[eve_start_x, eve_start_y] == 1 and self.eve_angle < 0:
341 |                 action = 4
342 |                 self.eve_angle += 30
343 |             elif relative_angle > self.args.turn_angle / 2.:
344 |                 action = 3 # Right
345 |             elif relative_angle < -self.args.turn_angle / 2.:
346 |                 action = 2 # Left
347 |             else:
348 |                 action = 1 # Forward
349 | 
350 |         return action
351 | 
352 |     def _get_stg(self, grid, start, goal, planning_window):
353 |         """Get short-term goal"""
354 | 
355 |         [gx1, gx2, gy1, gy2] = planning_window
356 | 
357 |         x1, y1 = 0, 0
358 |         x2, y2 = grid.shape
359 | 
360 |         def add_boundary(mat, value=1):
361 |             h, w = mat.shape
362 |             new_mat = np.zeros((h + 2, w + 2)) + value
363 |             new_mat[1:h + 1, 1:w + 1] = mat
364 |             return new_mat
365 | 
366 |         traversible = skimage.morphology.binary_dilation(
367 |             grid[x1:x2, y1:y2],
368 |             self.selem) != True
369 |         traversible[self.collision_map[gx1:gx2, gy1:gy2]
370 |                     [x1:x2, y1:y2] == 1] = 0
371 |         traversible[cv2.dilate(self.visited_vis[gx1:gx2, gy1:gy2][x1:x2, y1:y2], self.kernel) == 1] = 1
372 | 
373 |         traversible[int(start[0] - x1) - 1:int(start[0] - x1) + 2,
374 |                     int(start[1] - y1) - 1:int(start[1] - y1) + 2] = 1
375 | 
376 |         traversible = add_boundary(traversible)
377 |         goal = add_boundary(goal, value=0)
378 | 
379 |         planner = FMMPlanner(traversible)
380 |         selem = skimage.morphology.disk(10)
381 |         goal = skimage.morphology.binary_dilation(
382 |             goal, selem) != True
383 |         goal = 1 - goal * 1.
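        # 'goal' is now a 0/1 float map in which 1 marks every cell within a
        # 10-cell radius of a goal cell; set_multi_goal presumably uses all of
        # these cells as sources when building the FMM distance field.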
384 | planner.set_multi_goal(goal) 385 | 386 | state = [start[0] - x1 + 1, start[1] - y1 + 1] 387 | stg_x, stg_y, replan, stop = planner.get_short_term_goal(state) 388 | 389 | stg_x, stg_y = stg_x + x1 - 1, stg_y + y1 - 1 390 | 391 | return (stg_x, stg_y), replan, stop 392 | 393 | def _preprocess_obs(self, obs, use_seg=True): 394 | args = self.args 395 | # print("obs: ", obs) 396 | 397 | obs = obs.transpose(1, 2, 0) 398 | rgb = obs[:, :, :3] 399 | depth = obs[:, :, 3:4] 400 | semantic = obs[:,:,4:5].squeeze() 401 | # print("obs: ", semantic.shape) 402 | if args.use_gtsem: 403 | self.rgb_vis = rgb 404 | sem_seg_pred = np.zeros((rgb.shape[0], rgb.shape[1], 15 + 1)) 405 | for i in range(16): 406 | sem_seg_pred[:,:,i][semantic == i+1] = 1 407 | else: 408 | # print("sem_exp_preprocess_semantic") 409 | red_semantic_pred, semantic_pred = self._get_sem_pred( 410 | rgb.astype(np.uint8), depth, use_seg=use_seg) 411 | # print(red_semantic_pred.shape) 412 | # print(semantic_pred.shape) 413 | # print(mp_categories_mapping) 414 | # sys.exit(0) 415 | sem_seg_pred = np.zeros((rgb.shape[0], rgb.shape[1], 15 + 1)) 416 | for i in range(0, 15): 417 | # print(mp_categories_mapping[i]) 418 | sem_seg_pred[:,:,i][red_semantic_pred == mp_categories_mapping[i]] = 1 419 | 420 | sem_seg_pred[:,:,0][semantic_pred[:,:,0] == 0] = 0 421 | sem_seg_pred[:,:,1][semantic_pred[:,:,1] == 0] = 0 422 | sem_seg_pred[:,:,3][semantic_pred[:,:,3] == 0] = 0 423 | sem_seg_pred[:,:,4][semantic_pred[:,:,4] == 1] = 1 424 | sem_seg_pred[:,:,5][semantic_pred[:,:,5] == 1] = 1 425 | 426 | 427 | # sem_seg_pred = self._get_sem_pred( 428 | # rgb.astype(np.uint8), depth, use_seg=use_seg) 429 | 430 | depth = self._preprocess_depth(depth, args.min_depth, args.max_depth) 431 | 432 | ds = args.env_frame_width // args.frame_width # Downscaling factor 433 | 434 | if ds != 1: 435 | rgb = np.asarray(self.res(rgb.astype(np.uint8))) 436 | depth = depth[ds // 2::ds, ds // 2::ds] 437 | sem_seg_pred = sem_seg_pred[ds // 2::ds, ds // 2::ds] 438 | 439 | depth = np.expand_dims(depth, axis=2) 440 | state = np.concatenate((rgb, depth, sem_seg_pred), 441 | axis=2).transpose(2, 0, 1) 442 | 443 | return state 444 | 445 | def _preprocess_depth(self, depth, min_d, max_d): 446 | depth = depth[:, :, 0] * 1 447 | 448 | for i in range(depth.shape[1]): 449 | depth[:, i][depth[:, i] == 0.] = depth[:, i].max() 450 | 451 | mask2 = depth > 0.99 452 | depth[mask2] = 0. 453 | 454 | mask1 = depth == 0 455 | depth[mask1] = 100.0 456 | # depth = min_d * 100.0 + depth * max_d * 100.0 457 | depth = min_d * 100.0 + depth * (max_d-min_d) * 100.0 458 | # depth = depth*1000. 
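        # Depth is now in centimetres: a normalized reading d in [0, 1] maps to
        # min_d * 100 + d * (max_d - min_d) * 100. For example, assuming
        # min_d = 0.5 m and max_d = 5.0 m, d = 0.5 gives 50 + 0.5 * 450 = 275 cm,
        # while the invalid pixels set to 100 above land far beyond max_d,
        # presumably so they are not registered as nearby obstacles.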
459 | 460 | return depth 461 | 462 | def _get_sem_pred(self, rgb, depth, use_seg=True): 463 | if use_seg: 464 | image = torch.from_numpy(rgb).to(self.device).unsqueeze_(0).float() 465 | depth = torch.from_numpy(depth).to(self.device).unsqueeze_(0).float() 466 | with torch.no_grad(): 467 | red_semantic_pred = self.red_sem_pred(image, depth) 468 | red_semantic_pred = red_semantic_pred.squeeze().cpu().detach().numpy() 469 | semantic_pred, self.rgb_vis = self.sem_pred.get_prediction(rgb) 470 | semantic_pred = semantic_pred.astype(np.float32) 471 | else: 472 | semantic_pred = np.zeros((rgb.shape[0], rgb.shape[1], 16)) 473 | self.rgb_vis = rgb[:, :, ::-1] 474 | return red_semantic_pred, semantic_pred 475 | 476 | def _visualize(self, inputs): 477 | args = self.args 478 | dump_dir = "{}/dump/{}/".format(args.dump_location, 479 | args.exp_name) 480 | ep_dir = '{}/episodes/thread_{}/eps_{}/'.format( 481 | dump_dir, self.rank, self.episode_no) 482 | score_dir = '{}/episodes/thread_{}/score_{}/'.format( 483 | dump_dir, self.rank, self.episode_no) 484 | 485 | if not os.path.exists(ep_dir): 486 | os.makedirs(ep_dir) 487 | if not os.path.exists(score_dir): 488 | os.makedirs(score_dir) 489 | 490 | local_w = inputs['map_pred'].shape[0] 491 | 492 | map_pred = inputs['map_pred'] 493 | exp_pred = inputs['exp_pred'] 494 | map_edge = inputs['map_edge'] 495 | start_x, start_y, start_o, gx1, gx2, gy1, gy2 = inputs['pose_pred'] 496 | 497 | goal = inputs['goal'] 498 | sem_map = inputs['sem_map_pred'] 499 | 500 | gx1, gx2, gy1, gy2 = int(gx1), int(gx2), int(gy1), int(gy2) 501 | 502 | sem_map += 5 503 | 504 | no_cat_mask = sem_map == 20 505 | map_mask = np.rint(map_pred) == 1 506 | exp_mask = np.rint(exp_pred) == 1 507 | vis_mask = self.visited_vis[gx1:gx2, gy1:gy2] == 1 508 | edge_mask = map_edge == 1 509 | 510 | sem_map[no_cat_mask] = 0 511 | m1 = np.logical_and(no_cat_mask, exp_mask) 512 | sem_map[m1] = 2 513 | 514 | m2 = np.logical_and(no_cat_mask, map_mask) 515 | sem_map[m2] = 1 516 | 517 | sem_map[vis_mask] = 3 518 | sem_map[edge_mask] = 3 519 | 520 | selem = skimage.morphology.disk(4) 521 | goal_mat = 1 - skimage.morphology.binary_dilation( 522 | goal, selem) != True 523 | 524 | goal_mask = goal_mat == 1 525 | sem_map[goal_mask] = 4 526 | if np.sum(goal) == 1: 527 | f_pos = np.argwhere(goal == 1) 528 | # fmb = get_frontier_boundaries((f_pos[0][0], f_pos[0][1])) 529 | # goal_fmb = skimage.draw.circle_perimeter(int((fmb[0]+fmb[1])/2), int((fmb[2]+fmb[3])/2), 23) 530 | goal_fmb = skimage.draw.circle_perimeter(f_pos[0][0], f_pos[0][1], int(local_w/4-2)) 531 | goal_fmb[0][goal_fmb[0] > local_w-1] = local_w-1 532 | goal_fmb[1][goal_fmb[1] > local_w-1] = local_w-1 533 | goal_fmb[0][goal_fmb[0] < 0] = 0 534 | goal_fmb[1][goal_fmb[1] < 0] = 0 535 | # goal_fmb[goal_fmb < 0] =0 536 | goal_mask[goal_fmb[0], goal_fmb[1]] = 1 537 | sem_map[goal_mask] = 4 538 | 539 | 540 | color_pal = [int(x * 255.) for x in color_palette] 541 | sem_map_vis = Image.new("P", (sem_map.shape[1], 542 | sem_map.shape[0])) 543 | sem_map_vis.putpalette(color_pal) 544 | sem_map_vis.putdata(sem_map.flatten().astype(np.uint8)) 545 | sem_map_vis = sem_map_vis.convert("RGB") 546 | sem_map_vis = np.flipud(sem_map_vis) 547 | 548 | sem_map_vis = sem_map_vis[:, :, [2, 1, 0]] 549 | sem_map_vis = cv2.resize(sem_map_vis, (480, 480), 550 | interpolation=cv2.INTER_NEAREST) 551 | self.vis_image[50:530, 15:655] = self.rgb_vis 552 | self.vis_image[50:530, 670:1150] = sem_map_vis 553 | 554 | pos = ( 555 | (start_x * 100. 
/ args.map_resolution - gy1) 556 | * 480 / map_pred.shape[0], 557 | (map_pred.shape[1] - start_y * 100. / args.map_resolution + gx1) 558 | * 480 / map_pred.shape[1], 559 | np.deg2rad(-start_o) 560 | ) 561 | 562 | agent_arrow = vu.get_contour_points(pos, origin=(670, 50), size=10) 563 | color = (int(color_palette[11] * 255), 564 | int(color_palette[10] * 255), 565 | int(color_palette[9] * 255)) 566 | cv2.drawContours(self.vis_image, [agent_arrow], 0, color, -1) 567 | 568 | if args.visualize: 569 | # Displaying the image 570 | cv2.imshow("Thread {}".format(self.rank), self.vis_image) 571 | cv2.waitKey(1) 572 | 573 | if args.print_images: 574 | fn = '{}/episodes/thread_{}/eps_{}/{}-{}-Vis-{}.png'.format( 575 | dump_dir, self.rank, self.episode_no, 576 | self.rank, self.episode_no, self.timestep) 577 | cv2.imwrite(fn, self.vis_image) 578 | 579 | 580 | --------------------------------------------------------------------------------