├── img └── t.png ├── .gitignore ├── requirements.txt ├── configs ├── COCO-InstanceSegmentation │ └── mask_rcnn_R_50_FPN_3x.yaml └── Base-RCNN-FPN.yaml ├── envs ├── utils │ ├── pose.py │ ├── rotation_utils.py │ ├── map_builder.py │ ├── fmm_planner.py │ ├── depth_utils.py │ └── depth_utils_multi.py ├── habitat │ ├── configs │ │ └── tasks │ │ │ ├── objectnav_gibson.yaml │ │ │ └── objectnav_hm3d.yaml │ ├── __init__.py │ ├── objectgoal_env21.py │ └── objectgoal_env.py └── __init__.py ├── README.md ├── agents ├── utils │ ├── visualization.py │ └── semantic_prediction.py └── sem_exp.py ├── utils ├── tensorboard_utils.py └── model.py ├── constants.py ├── RedNet ├── utils.py └── RedNet_model.py ├── arguments.py ├── model_multi.py └── get_point_score.py /img/t.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shalexyuan/GAMap/HEAD/img/t.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | RedNet/model/* 2 | vis/* 3 | tmp/* 4 | data/* 5 | *.symlink 6 | **/__pycache__/ 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-fmm==2019.1.30 2 | scikit-learn==0.22.2.post1 3 | scikit-image==0.15.0 4 | numpy>=1.20.2 5 | ifcfg 6 | transformers==4.23.1 7 | tensorboard 8 | -------------------------------------------------------------------------------- /configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | SOLVER: 8 | STEPS: (210000, 250000) 9 | MAX_ITER: 270000 10 | -------------------------------------------------------------------------------- /envs/utils/pose.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def get_l2_distance(x1, x2, y1, y2): 5 | """ 6 | Computes the L2 distance between two points. 7 | """ 8 | return ((x1 - x2) ** 2 + (y1 - y2) ** 2) ** 0.5 9 | 10 | 11 | def get_rel_pose_change(pos2, pos1): 12 | x1, y1, o1 = pos1 13 | x2, y2, o2 = pos2 14 | 15 | theta = np.arctan2(y2 - y1, x2 - x1) - o1 16 | dist = get_l2_distance(x1, x2, y1, y2) 17 | dx = dist * np.cos(theta) 18 | dy = dist * np.sin(theta) 19 | do = o2 - o1 20 | 21 | return dx, dy, do 22 | 23 | 24 | def get_new_pose(pose, rel_pose_change): 25 | x, y, o = pose 26 | dx, dy, do = rel_pose_change 27 | 28 | global_dx = dx * np.sin(np.deg2rad(o)) + dy * np.cos(np.deg2rad(o)) 29 | global_dy = dx * np.cos(np.deg2rad(o)) - dy * np.sin(np.deg2rad(o)) 30 | x += global_dy 31 | y += global_dx 32 | o += np.rad2deg(do) 33 | if o > 180.: 34 | o -= 360. 
35 | 36 | return x, y, o 37 | 38 | 39 | def threshold_poses(coords, shape): 40 | coords[0] = min(max(0, coords[0]), shape[0] - 1) 41 | coords[1] = min(max(0, coords[1]), shape[1] - 1) 42 | return coords 43 | -------------------------------------------------------------------------------- /envs/habitat/configs/tasks/objectnav_gibson.yaml: -------------------------------------------------------------------------------- 1 | ENVIRONMENT: 2 | MAX_EPISODE_STEPS: 500 3 | SIMULATOR: 4 | TURN_ANGLE: 30 5 | TILT_ANGLE: 30 6 | ACTION_SPACE_CONFIG: "v1" 7 | AGENT_0: 8 | SENSORS: ['RGB_SENSOR', 'DEPTH_SENSOR'] 9 | HEIGHT: 0.88 10 | RADIUS: 0.18 11 | HABITAT_SIM_V0: 12 | GPU_DEVICE_ID: 0 13 | ALLOW_SLIDING: False 14 | SEMANTIC_SENSOR: 15 | WIDTH: 640 16 | HEIGHT: 480 17 | HFOV: 79 18 | POSITION: [0, 0.88, 0] 19 | RGB_SENSOR: 20 | WIDTH: 640 21 | HEIGHT: 480 22 | HFOV: 79 23 | POSITION: [0, 0.88, 0] 24 | DEPTH_SENSOR: 25 | WIDTH: 640 26 | HEIGHT: 480 27 | HFOV: 79 28 | MIN_DEPTH: 0.5 29 | MAX_DEPTH: 5.0 30 | POSITION: [0, 0.88, 0] 31 | TASK: 32 | TYPE: ObjectNav-v1 33 | POSSIBLE_ACTIONS: ["STOP", "MOVE_FORWARD", "TURN_LEFT", "TURN_RIGHT", "LOOK_UP", "LOOK_DOWN"] 34 | SENSORS: ['GPS_SENSOR', 'COMPASS_SENSOR'] 35 | MEASUREMENTS: ['DISTANCE_TO_GOAL', 'SUCCESS', 'SPL'] 36 | GOAL_SENSOR_UUID: objectgoal 37 | SUCCESS: 38 | SUCCESS_DISTANCE: 0.5 39 | 40 | DATASET: 41 | TYPE: PointNav-v1 42 | SPLIT: train 43 | DATA_PATH: "data/datasets/objectnav/gibson/v1.1/{split}/{split}.json.gz" 44 | EPISODES_DIR: "data/datasets/objectnav/gibson/v1.1/{split}/" 45 | SCENES_DIR: "/data/p305574/data/scene_datasets" 46 | -------------------------------------------------------------------------------- /envs/habitat/configs/tasks/objectnav_hm3d.yaml: -------------------------------------------------------------------------------- 1 | ENVIRONMENT: 2 | MAX_EPISODE_STEPS: 500 3 | SIMULATOR: 4 | TURN_ANGLE: 30 5 | TILT_ANGLE: 30 6 | ACTION_SPACE_CONFIG: "v1" 7 | AGENT_0: 8 | SENSORS: ['RGB_SENSOR', 'DEPTH_SENSOR', 'SEMANTIC_SENSOR'] 9 | HEIGHT: 0.88 10 | RADIUS: 0.18 11 | HABITAT_SIM_V0: 12 | GPU_DEVICE_ID: 0 13 | ALLOW_SLIDING: False 14 | SEMANTIC_SENSOR: 15 | WIDTH: 640 16 | HEIGHT: 480 17 | HFOV: 79 18 | POSITION: [0, 0.88, 0] 19 | RGB_SENSOR: 20 | WIDTH: 640 21 | HEIGHT: 480 22 | HFOV: 79 23 | POSITION: [0, 0.88, 0] 24 | DEPTH_SENSOR: 25 | WIDTH: 640 26 | HEIGHT: 480 27 | HFOV: 79 28 | MIN_DEPTH: 0.5 29 | MAX_DEPTH: 5.0 30 | POSITION: [0, 0.88, 0] 31 | SCENE_DATASET: "data/scene_datasets/hm3d/hm3d_annotated_basis.scene_dataset_config.json" 32 | TASK: 33 | TYPE: ObjectNav-v1 34 | POSSIBLE_ACTIONS: ["STOP", "MOVE_FORWARD", "TURN_LEFT", "TURN_RIGHT", "LOOK_UP", "LOOK_DOWN"] 35 | SENSORS: ['OBJECTGOAL_SENSOR', 'GPS_SENSOR', 'COMPASS_SENSOR'] 36 | GOAL_SENSOR_UUID: objectgoal 37 | MEASUREMENTS: ['DISTANCE_TO_GOAL', 'SUCCESS', 'SPL'] 38 | DISTANCE_TO_GOAL: 39 | DISTANCE_TO: VIEW_POINTS 40 | SUCCESS: 41 | SUCCESS_DISTANCE: 0.1 42 | 43 | DATASET: 44 | TYPE: ObjectNav-v1 45 | SPLIT: val 46 | DATA_PATH: "data/objectgoal_hm3d/{split}/{split}.json.gz" 47 | SCENES_DIR: "data/scene_datasets" 48 | 49 | -------------------------------------------------------------------------------- /configs/Base-RCNN-FPN.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | BACKBONE: 4 | NAME: "build_resnet_fpn_backbone" 5 | RESNETS: 6 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 7 | FPN: 8 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 9 | ANCHOR_GENERATOR: 10 | SIZES: 
[[32], [64], [128], [256], [512]] # One size for each in feature map 11 | ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps) 12 | RPN: 13 | IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"] 14 | PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level 15 | PRE_NMS_TOPK_TEST: 1000 # Per FPN level 16 | # Detectron1 uses 2000 proposals per-batch, 17 | # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue) 18 | # which is approximately 1000 proposals per-image since the default batch size for FPN is 2. 19 | POST_NMS_TOPK_TRAIN: 1000 20 | POST_NMS_TOPK_TEST: 1000 21 | ROI_HEADS: 22 | NAME: "StandardROIHeads" 23 | IN_FEATURES: ["p2", "p3", "p4", "p5"] 24 | ROI_BOX_HEAD: 25 | NAME: "FastRCNNConvFCHead" 26 | NUM_FC: 2 27 | POOLER_RESOLUTION: 7 28 | ROI_MASK_HEAD: 29 | NAME: "MaskRCNNConvUpsampleHead" 30 | NUM_CONV: 4 31 | POOLER_RESOLUTION: 14 32 | DATASETS: 33 | TRAIN: ("coco_2017_train",) 34 | TEST: ("coco_2017_val",) 35 | SOLVER: 36 | IMS_PER_BATCH: 16 37 | BASE_LR: 0.02 38 | STEPS: (60000, 80000) 39 | MAX_ITER: 90000 40 | INPUT: 41 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 42 | VERSION: 2 43 | -------------------------------------------------------------------------------- /envs/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .habitat import construct_envs, construct_envs21 4 | 5 | 6 | def make_vec_envs(args): 7 | if args.task_config == "tasks/objectnav_gibson.yaml": 8 | envs = construct_envs(args) 9 | else: 10 | envs = construct_envs21(args) 11 | envs = VecPyTorch(envs, args.device) 12 | return envs 13 | 14 | 15 | # Adapted from 16 | # https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/envs.py#L159 17 | class VecPyTorch(): 18 | 19 | def __init__(self, venv, device): 20 | self.venv = venv 21 | self.num_envs = venv.num_envs 22 | self.observation_space = venv.observation_space 23 | self.action_space = venv.action_space 24 | self.device = device 25 | 26 | def reset(self): 27 | obs, info = self.venv.reset() 28 | obs = torch.from_numpy(obs).float().to(self.device) 29 | return obs, info 30 | 31 | def step_async(self, actions): 32 | actions = actions.cpu().numpy() 33 | self.venv.step_async(actions) 34 | 35 | def step_wait(self): 36 | obs, reward, done, info = self.venv.step_wait() 37 | obs = torch.from_numpy(obs).float().to(self.device) 38 | reward = torch.from_numpy(reward).float() 39 | return obs, reward, done, info 40 | 41 | def step(self, actions): 42 | actions = actions.cpu().numpy() 43 | obs, reward, done, info = self.venv.step(actions) 44 | obs = torch.from_numpy(obs).float().to(self.device) 45 | reward = torch.from_numpy(reward).float() 46 | return obs, reward, done, info 47 | 48 | def get_rewards(self, inputs): 49 | reward = self.venv.get_rewards(inputs) 50 | reward = torch.from_numpy(reward).float() 51 | return reward 52 | 53 | def plan_act_and_preprocess(self, inputs): 54 | obs, reward, done, info = self.venv.plan_act_and_preprocess(inputs) 55 | obs = torch.from_numpy(obs).float().to(self.device) 56 | # reward = torch.from_numpy(reward).float() 57 | return obs, reward, done, info 58 | 59 | def rotate_at_beginning_act(self, inputs): 60 | obs, reward, done, info = self.venv.rotate_at_beginning_act(inputs) 61 | obs = torch.from_numpy(obs).float().to(self.device) 62 | # reward = torch.from_numpy(reward).float() 63 | return obs, reward, done, info 64 | 65 | def close(self): 66 | return self.venv.close() 67 | 
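A minimal usage sketch for the `make_vec_envs` / `VecPyTorch` wrapper defined in `envs/__init__.py` above (this is not a file in the repository): it assumes `arguments.py` exposes a `get_args()` parser, as in the Sem-Exp/L3MVN code this repo builds on, and that each parallel environment accepts a discrete action index; only the attributes actually read in `envs/__init__.py` (`args.task_config`, `args.device`) are taken from this file.

```python
# Hypothetical driver script; arguments.get_args() and the action encoding
# are assumptions, everything else follows the VecPyTorch wrapper above.
import torch

from arguments import get_args   # assumed CLI entry point (arguments.py)
from envs import make_vec_envs

if __name__ == "__main__":
    args = get_args()
    args.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    envs = make_vec_envs(args)        # VecPyTorch wrapping a Habitat VectorEnv
    obs, infos = envs.reset()         # obs: float tensor already on args.device

    for _ in range(10):
        # one discrete action per parallel env; VecPyTorch converts to numpy
        actions = torch.ones(envs.num_envs, dtype=torch.long)
        obs, rewards, dones, infos = envs.step(actions)

    envs.close()
```

Note that `make_vec_envs` dispatches between `construct_envs` (for the Gibson task config) and `construct_envs21` (for HM3D) based on `args.task_config`, so the same driver covers both configurations.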
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | GAMap: Zero-Shot Object Goal Navigation with Multi-Scale Geometric-Affordance Guidance 3 | 4 | NeurIPS 2024

6 | 7 | This is the official repository of [GAMap: Zero-Shot Object Goal Navigation with Multi-Scale Geometric-Affordance Guidance](https://arxiv.org/pdf/2410.23978). 8 | 9 |
10 | 11 |
12 |
13 | ## Setup
14 |
15 | ### Dataset Preparation
16 |
17 | Please follow [HM3D](https://aihabitat.org/datasets/hm3d/) to download the dataset and prepare the data. The data format should be:
18 |
19 | ```
20 | data/
21 |   scene_datasets/
22 |   matterport_category_mappings.tsv
23 |   object_norm_inv_perplexity.npy
24 |   versioned_data
25 |   objectgoal_hm3d/
26 |     train/
27 |     val/
28 |     val_mini/
29 | ```
30 |
31 | ### Dependencies
32 |
33 | 1. Python & PyTorch
34 |
35 | This code is tested with Python 3.9.16 and PyTorch 1.11.0+cu113 on Ubuntu 20.04.
36 |
37 | 2. Habitat-Sim & Habitat-Lab
38 |
39 | ```
40 | # Habitat-Sim
41 | git clone https://github.com/facebookresearch/habitat-sim.git
42 | cd habitat-sim; git checkout tags/challenge-2022;
43 | pip install -r requirements.txt;
44 | python setup.py install --headless
45 |
46 | # Habitat-Lab
47 | git clone https://github.com/facebookresearch/habitat-lab.git
48 | cd habitat-lab; git checkout tags/challenge-2022;
49 | pip install -e .
50 | ```
51 |
52 | 3. Others
53 |
54 | ```
55 | pip install -r requirements.txt
56 | ```
57 |
58 | ## Running
59 |
60 | ### Example
61 |
62 | An example command to run the pipeline:
63 |
64 | ```
65 | python ga.py --split val_mini --eval 0 --auto_gpu_config 0 -n 1 \
66 | --num_eval_episodes 2000 --use_gtsem 0 --num_local_steps 10 \
67 | --exp_name gamap --agent multi_attr_exp
68 | ```
69 |
70 | [//]: # ()
71 | [//]: # (### Visualization)
72 |
73 | [//]: # ()
74 | [//]: # (To make a demo video on your saved images, you can either use `ffmpeg` to make separate videos or use)
75 |
76 | [//]: # ()
77 | [//]: # (```)
78 |
79 | [//]: # (python make_demo.py --exp_name test # add `--delete_img` to delete images after making video)
80 |
81 | [//]: # (```)
82 |
83 | [//]: # ()
84 | [//]: # (to make batched videos.)
85 |
86 | ## Acknowledgements
87 |
88 | This repo is heavily based on [L3MVN](https://github.com/ybgdgh/L3MVN). We thank the authors for their great work.
89 | 90 | ## Citation 91 | 92 | If you find this work helpful, please consider citing: 93 | -------------------------------------------------------------------------------- /agents/utils/visualization.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | 4 | 5 | def get_contour_points(pos, origin, size=20): 6 | x, y, o = pos 7 | pt1 = (int(x) + origin[0], 8 | int(y) + origin[1]) 9 | pt2 = (int(x + size / 1.5 * np.cos(o + np.pi * 4 / 3)) + origin[0], 10 | int(y + size / 1.5 * np.sin(o + np.pi * 4 / 3)) + origin[1]) 11 | pt3 = (int(x + size * np.cos(o)) + origin[0], 12 | int(y + size * np.sin(o)) + origin[1]) 13 | pt4 = (int(x + size / 1.5 * np.cos(o - np.pi * 4 / 3)) + origin[0], 14 | int(y + size / 1.5 * np.sin(o - np.pi * 4 / 3)) + origin[1]) 15 | 16 | return np.array([pt1, pt2, pt3, pt4]) 17 | 18 | 19 | def draw_line(start, end, mat, steps=25, w=1): 20 | for i in range(steps + 1): 21 | x = int(np.rint(start[0] + (end[0] - start[0]) * i / steps)) 22 | y = int(np.rint(start[1] + (end[1] - start[1]) * i / steps)) 23 | mat[x - w:x + w, y - w:y + w] = 1 24 | return mat 25 | 26 | 27 | def init_vis_image(goal_name, legend): 28 | vis_image = np.ones((655, 1165, 3)).astype(np.uint8) * 255 29 | font = cv2.FONT_HERSHEY_SIMPLEX 30 | fontScale = 1 31 | color = (20, 20, 20) # BGR 32 | thickness = 2 33 | 34 | text = "Observations (Goal: {})".format(goal_name) 35 | textsize = cv2.getTextSize(text, font, fontScale, thickness)[0] 36 | textX = (640 - textsize[0]) // 2 + 15 37 | textY = (50 + textsize[1]) // 2 38 | vis_image = cv2.putText(vis_image, text, (textX, textY), 39 | font, fontScale, color, thickness, 40 | cv2.LINE_AA) 41 | 42 | text = "Predicted Semantic Map" 43 | textsize = cv2.getTextSize(text, font, fontScale, thickness)[0] 44 | textX = 640 + (480 - textsize[0]) // 2 + 30 45 | textY = (50 + textsize[1]) // 2 46 | vis_image = cv2.putText(vis_image, text, (textX, textY), 47 | font, fontScale, color, thickness, 48 | cv2.LINE_AA) 49 | 50 | # draw outlines 51 | color = [100, 100, 100] 52 | vis_image[49, 15:655] = color 53 | vis_image[49, 670:1150] = color 54 | vis_image[50:530, 14] = color 55 | vis_image[50:530, 655] = color 56 | vis_image[50:530, 669] = color 57 | vis_image[50:530, 1150] = color 58 | vis_image[530, 15:655] = color 59 | vis_image[530, 670:1150] = color 60 | 61 | # draw legend 62 | # lx, ly, _ = legend.shape 63 | # vis_image[537:537 + lx, 155:155 + ly, :] = legend 64 | 65 | return vis_image 66 | -------------------------------------------------------------------------------- /utils/tensorboard_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import Any 8 | 9 | import numpy as np 10 | import torch 11 | from torch.utils.tensorboard import SummaryWriter 12 | 13 | 14 | class TensorboardWriter: 15 | def __init__(self, log_dir: str, *args: Any, **kwargs: Any): 16 | r"""A Wrapper for tensorboard SummaryWriter. It creates a dummy writer 17 | when log_dir is empty string or None. It also has functionality that 18 | generates tb video directly from numpy images. 19 | 20 | Args: 21 | log_dir: Save directory location. Will not write to disk if 22 | log_dir is an empty string. 
23 | *args: Additional positional args for SummaryWriter 24 | **kwargs: Additional keyword args for SummaryWriter 25 | """ 26 | self.writer = None 27 | if log_dir is not None and len(log_dir) > 0: 28 | self.writer = SummaryWriter(log_dir, *args, **kwargs) 29 | 30 | def __getattr__(self, item): 31 | if self.writer: 32 | return self.writer.__getattribute__(item) 33 | else: 34 | return lambda *args, **kwargs: None 35 | 36 | def __enter__(self): 37 | return self 38 | 39 | def __exit__(self, exc_type, exc_val, exc_tb): 40 | if self.writer: 41 | self.writer.close() 42 | 43 | def add_video_from_np_images( 44 | self, video_name: str, step_idx: int, images: np.ndarray, fps: int = 10 45 | ) -> None: 46 | r"""Write video into tensorboard from images frames. 47 | 48 | Args: 49 | video_name: name of video string. 50 | step_idx: int of checkpoint index to be displayed. 51 | images: list of n frames. Each frame is a np.ndarray of shape. 52 | fps: frame per second for output video. 53 | 54 | Returns: 55 | None. 56 | """ 57 | if not self.writer: 58 | return 59 | # initial shape of np.ndarray list: N * (H, W, 3) 60 | frame_tensors = [ 61 | torch.from_numpy(np_arr).unsqueeze(0) for np_arr in images 62 | ] 63 | video_tensor = torch.cat(tuple(frame_tensors)) 64 | video_tensor = video_tensor.permute(0, 3, 1, 2).unsqueeze(0) 65 | # final shape of video tensor: (1, n, 3, H, W) 66 | self.writer.add_video( 67 | video_name, video_tensor, fps=fps, global_step=step_idx 68 | ) 69 | -------------------------------------------------------------------------------- /envs/utils/rotation_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Utilities for generating and applying rotation matrices. 
17 | """ 18 | import numpy as np 19 | 20 | ANGLE_EPS = 0.001 21 | 22 | 23 | def normalize(v): 24 | return v / np.linalg.norm(v) 25 | 26 | 27 | def get_r_matrix(ax_, angle): 28 | ax = normalize(ax_) 29 | if np.abs(angle) > ANGLE_EPS: 30 | S_hat = np.array( 31 | [[0.0, -ax[2], ax[1]], [ax[2], 0.0, -ax[0]], [-ax[1], ax[0], 0.0]], 32 | dtype=np.float32) 33 | R = np.eye(3) + np.sin(angle) * S_hat + \ 34 | (1 - np.cos(angle)) * (np.linalg.matrix_power(S_hat, 2)) 35 | else: 36 | R = np.eye(3) 37 | return R 38 | 39 | 40 | def r_between(v_from_, v_to_): 41 | v_from = normalize(v_from_) 42 | v_to = normalize(v_to_) 43 | ax = normalize(np.cross(v_from, v_to)) 44 | angle = np.arccos(np.dot(v_from, v_to)) 45 | return get_r_matrix(ax, angle) 46 | 47 | 48 | def rotate_camera_to_point_at(up_from, lookat_from, up_to, lookat_to): 49 | inputs = [up_from, lookat_from, up_to, lookat_to] 50 | for i in range(4): 51 | inputs[i] = normalize(np.array(inputs[i]).reshape((-1,))) 52 | up_from, lookat_from, up_to, lookat_to = inputs 53 | r1 = r_between(lookat_from, lookat_to) 54 | 55 | new_x = np.dot(r1, np.array([1, 0, 0]).reshape((-1, 1))).reshape((-1)) 56 | to_x = normalize(np.cross(lookat_to, up_to)) 57 | angle = np.arccos(np.dot(new_x, to_x)) 58 | if angle > ANGLE_EPS: 59 | if angle < np.pi - ANGLE_EPS: 60 | ax = normalize(np.cross(new_x, to_x)) 61 | flip = np.dot(lookat_to, ax) 62 | if flip > 0: 63 | r2 = get_r_matrix(lookat_to, angle) 64 | elif flip < 0: 65 | r2 = get_r_matrix(lookat_to, -1. * angle) 66 | else: 67 | # Angle of rotation is too close to 180 degrees, direction of 68 | # rotation does not matter. 69 | r2 = get_r_matrix(lookat_to, angle) 70 | else: 71 | r2 = np.eye(3) 72 | return np.dot(r2, r1) 73 | -------------------------------------------------------------------------------- /constants.py: -------------------------------------------------------------------------------- 1 | coco_categories = { 2 | "chair": 0, 3 | "couch": 1, 4 | "potted plant": 2, 5 | "bed": 3, 6 | "toilet": 4, 7 | "tv": 5, 8 | "dining-table": 6, 9 | "oven": 7, 10 | "sink": 8, 11 | "refrigerator": 9, 12 | "book": 10, 13 | "clock": 11, 14 | "vase": 12, 15 | "cup": 13, 16 | "bottle": 14 17 | } 18 | 19 | category_to_id = [ 20 | "chair", 21 | "bed", 22 | "plant", 23 | "toilet", 24 | "tv_monitor", 25 | "sofa" 26 | ] 27 | 28 | category_to_id_gibson = [ 29 | "chair", 30 | "couch", 31 | "potted plant", 32 | "bed", 33 | "toilet", 34 | "tv" 35 | ] 36 | 37 | mp3d_category_id = { 38 | 'void': 1, 39 | 'chair': 2, 40 | 'sofa': 3, 41 | 'plant': 4, 42 | 'bed': 5, 43 | 'toilet': 6, 44 | 'tv_monitor': 7, 45 | 'table': 8, 46 | 'refrigerator': 9, 47 | 'sink': 10, 48 | 'stairs': 16, 49 | 'fireplace': 12 50 | } 51 | 52 | # mp_categories_mapping = [4, 11, 15, 12, 19, 23, 6, 7, 15, 38, 40, 28, 29, 8, 17] 53 | 54 | mp_categories_mapping = [4, 11, 15, 12, 19, 23, 26, 24, 28, 38, 21, 16, 14, 6, 16] 55 | 56 | hm3d_category = [ 57 | "chair", 58 | "sofa", 59 | "plant", 60 | "bed", 61 | "toilet", 62 | "tv_monitor", 63 | "bathtub", 64 | "shower", 65 | "fireplace", 66 | "appliances", 67 | "towel", 68 | "sink", 69 | "chest_of_drawers", 70 | "table", 71 | "stairs" 72 | ] 73 | 74 | coco_categories_mapping = { 75 | 56: 0, # chair 76 | 57: 1, # couch 77 | 58: 2, # potted plant 78 | 59: 3, # bed 79 | 61: 4, # toilet 80 | 62: 5, # tv 81 | 60: 6, # dining-table 82 | 69: 7, # oven 83 | 71: 8, # sink 84 | 72: 9, # refrigerator 85 | 73: 10, # book 86 | 74: 11, # clock 87 | 75: 12, # vase 88 | 41: 13, # cup 89 | 39: 14, # bottle 90 | } 91 | 92 | color_palette = [ 93 | 1.0, 
1.0, 1.0, 94 | 0.6, 0.6, 0.6, 95 | 0.95, 0.95, 0.95, 96 | 0.96, 0.36, 0.26, 97 | 0.12156862745098039, 0.47058823529411764, 0.7058823529411765, 98 | 0.9400000000000001, 0.7818, 0.66, 99 | 0.9400000000000001, 0.8868, 0.66, 100 | 0.8882000000000001, 0.9400000000000001, 0.66, 101 | 0.7832000000000001, 0.9400000000000001, 0.66, 102 | 0.6782000000000001, 0.9400000000000001, 0.66, 103 | 0.66, 0.9400000000000001, 0.7468000000000001, 104 | 0.66, 0.9400000000000001, 0.8518000000000001, 105 | 0.66, 0.9232, 0.9400000000000001, 106 | 0.66, 0.8182, 0.9400000000000001, 107 | 0.66, 0.7132, 0.9400000000000001, 108 | 0.7117999999999999, 0.66, 0.9400000000000001, 109 | 0.8168, 0.66, 0.9400000000000001, 110 | 0.9218, 0.66, 0.9400000000000001, 111 | 0.9400000000000001, 0.66, 0.8531999999999998, 112 | 0.9400000000000001, 0.66, 0.748199999999999] 113 | 114 | 115 | 116 | cat_2_multi_att_prompt={ 117 | "chair": ['chair','leg','backrest','seat','armrest'], 118 | "bed": ['bed','sleep','lie','mattress','bedframe'], 119 | "plant": ['plant','leaf','stem','branch','pot'], 120 | "toilet": ['toilet','bathroom','defecating','urinating','flushing'], 121 | "tv_monitor": ['tv monitor','livingroom','bedroom','screen','display'], 122 | "sofa": ['sofa','sit','cushion','armrest','backrest'], 123 | } -------------------------------------------------------------------------------- /envs/utils/map_builder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import envs.utils.depth_utils as du 3 | 4 | 5 | class MapBuilder(object): 6 | def __init__(self, params): 7 | self.params = params 8 | frame_width = params['frame_width'] 9 | frame_height = params['frame_height'] 10 | fov = params['fov'] 11 | self.camera_matrix = du.get_camera_matrix( 12 | frame_width, 13 | frame_height, 14 | fov) 15 | self.vision_range = params['vision_range'] 16 | 17 | self.map_size_cm = params['map_size_cm'] 18 | self.resolution = params['resolution'] 19 | agent_min_z = params['agent_min_z'] 20 | agent_max_z = params['agent_max_z'] 21 | self.z_bins = [agent_min_z, agent_max_z] 22 | self.du_scale = params['du_scale'] 23 | self.visualize = params['visualize'] 24 | self.obs_threshold = params['obs_threshold'] 25 | 26 | self.map = np.zeros((self.map_size_cm // self.resolution, 27 | self.map_size_cm // self.resolution, 28 | len(self.z_bins) + 1), dtype=np.float32) 29 | 30 | self.agent_height = params['agent_height'] 31 | self.agent_view_angle = params['agent_view_angle'] 32 | return 33 | 34 | def update_map(self, depth, current_pose): 35 | with np.errstate(invalid="ignore"): 36 | depth[depth > self.vision_range * self.resolution] = np.NaN 37 | point_cloud = du.get_point_cloud_from_z(depth, self.camera_matrix, 38 | scale=self.du_scale) 39 | 40 | agent_view = du.transform_camera_view(point_cloud, 41 | self.agent_height, 42 | self.agent_view_angle) 43 | 44 | shift_loc = [self.vision_range * self.resolution // 2, 0, np.pi / 2.0] 45 | agent_view_centered = du.transform_pose(agent_view, shift_loc) 46 | 47 | agent_view_flat = du.bin_points( 48 | agent_view_centered, 49 | self.vision_range, 50 | self.z_bins, 51 | self.resolution) 52 | 53 | agent_view_cropped = agent_view_flat[:, :, 1] 54 | agent_view_cropped = agent_view_cropped / self.obs_threshold 55 | agent_view_cropped[agent_view_cropped >= 0.5] = 1.0 56 | agent_view_cropped[agent_view_cropped < 0.5] = 0.0 57 | 58 | agent_view_explored = agent_view_flat.sum(2) 59 | agent_view_explored[agent_view_explored > 0] = 1.0 60 | 61 | geocentric_pc = 
du.transform_pose(agent_view, current_pose) 62 | 63 | geocentric_flat = du.bin_points( 64 | geocentric_pc, 65 | self.map.shape[0], 66 | self.z_bins, 67 | self.resolution) 68 | 69 | self.map = self.map + geocentric_flat 70 | 71 | map_gt = self.map[:, :, 1] / self.obs_threshold 72 | map_gt[map_gt >= 0.5] = 1.0 73 | map_gt[map_gt < 0.5] = 0.0 74 | 75 | explored_gt = self.map.sum(2) 76 | explored_gt[explored_gt > 1] = 1.0 77 | 78 | return agent_view_cropped, map_gt, agent_view_explored, explored_gt 79 | 80 | def get_st_pose(self, current_loc): 81 | loc = [- (current_loc[0] / self.resolution 82 | - self.map_size_cm // (self.resolution * 2)) / 83 | (self.map_size_cm // (self.resolution * 2)), 84 | - (current_loc[1] / self.resolution 85 | - self.map_size_cm // (self.resolution * 2)) / 86 | (self.map_size_cm // (self.resolution * 2)), 87 | 90 - np.rad2deg(current_loc[2])] 88 | return loc 89 | 90 | def reset_map(self, map_size): 91 | self.map_size_cm = map_size 92 | 93 | self.map = np.zeros((self.map_size_cm // self.resolution, 94 | self.map_size_cm // self.resolution, 95 | len(self.z_bins) + 1), dtype=np.float32) 96 | 97 | def get_map(self): 98 | return self.map 99 | -------------------------------------------------------------------------------- /utils/model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | 7 | def get_grid(pose, grid_size, device): 8 | """ 9 | Input: 10 | `pose` FloatTensor(bs, 3) 11 | `grid_size` 4-tuple (bs, _, grid_h, grid_w) 12 | `device` torch.device (cpu or gpu) 13 | Output: 14 | `rot_grid` FloatTensor(bs, grid_h, grid_w, 2) 15 | `trans_grid` FloatTensor(bs, grid_h, grid_w, 2) 16 | 17 | """ 18 | pose = pose.float() 19 | x = pose[:, 0] 20 | y = pose[:, 1] 21 | t = pose[:, 2] 22 | 23 | bs = x.size(0) 24 | t = t * np.pi / 180. 
25 | cos_t = t.cos() 26 | sin_t = t.sin() 27 | 28 | theta11 = torch.stack([cos_t, -sin_t, 29 | torch.zeros(cos_t.shape).float().to(device)], 1) 30 | theta12 = torch.stack([sin_t, cos_t, 31 | torch.zeros(cos_t.shape).float().to(device)], 1) 32 | theta1 = torch.stack([theta11, theta12], 1) 33 | 34 | theta21 = torch.stack([torch.ones(x.shape).to(device), 35 | -torch.zeros(x.shape).to(device), x], 1) 36 | theta22 = torch.stack([torch.zeros(x.shape).to(device), 37 | torch.ones(x.shape).to(device), y], 1) 38 | theta2 = torch.stack([theta21, theta22], 1) 39 | 40 | rot_grid = F.affine_grid(theta1, torch.Size(grid_size)) 41 | trans_grid = F.affine_grid(theta2, torch.Size(grid_size)) 42 | 43 | return rot_grid, trans_grid 44 | 45 | 46 | class ChannelPool(nn.MaxPool1d): 47 | def forward(self, x): 48 | n, c, w, h = x.size() 49 | x = x.view(n, c, w * h).permute(0, 2, 1) 50 | x = x.contiguous() 51 | pooled = F.max_pool1d(x, c, 1) 52 | _, _, c = pooled.size() 53 | pooled = pooled.permute(0, 2, 1) 54 | return pooled.view(n, c, w, h) 55 | 56 | 57 | # https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/utils.py#L32 58 | class AddBias(nn.Module): 59 | def __init__(self, bias): 60 | super(AddBias, self).__init__() 61 | self._bias = nn.Parameter(bias.unsqueeze(1)) 62 | 63 | def forward(self, x): 64 | if x.dim() == 2: 65 | bias = self._bias.t().view(1, -1) 66 | else: 67 | bias = self._bias.t().view(1, -1, 1, 1) 68 | 69 | return x + bias 70 | 71 | 72 | # https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/model.py#L10 73 | class Flatten(nn.Module): 74 | def forward(self, x): 75 | return x.view(x.size(0), -1) 76 | 77 | 78 | # https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/model.py#L82 79 | class NNBase(nn.Module): 80 | 81 | def __init__(self, recurrent, recurrent_input_size, hidden_size): 82 | 83 | super(NNBase, self).__init__() 84 | self._hidden_size = hidden_size 85 | self._recurrent = recurrent 86 | 87 | if recurrent: 88 | self.gru = nn.GRUCell(recurrent_input_size, hidden_size) 89 | nn.init.orthogonal_(self.gru.weight_ih.data) 90 | nn.init.orthogonal_(self.gru.weight_hh.data) 91 | self.gru.bias_ih.data.fill_(0) 92 | self.gru.bias_hh.data.fill_(0) 93 | 94 | @property 95 | def is_recurrent(self): 96 | return self._recurrent 97 | 98 | @property 99 | def rec_state_size(self): 100 | if self._recurrent: 101 | return self._hidden_size 102 | return 1 103 | 104 | @property 105 | def output_size(self): 106 | return self._hidden_size 107 | 108 | def _forward_gru(self, x, hxs, masks): 109 | if x.size(0) == hxs.size(0): 110 | x = hxs = self.gru(x, hxs * masks[:, None]) 111 | else: 112 | # x is a (T, N, -1) tensor that has been flatten to (T * N, -1) 113 | N = hxs.size(0) 114 | T = int(x.size(0) / N) 115 | 116 | # unflatten 117 | x = x.view(T, N, x.size(1)) 118 | 119 | # Same deal with masks 120 | masks = masks.view(T, N, 1) 121 | 122 | outputs = [] 123 | for i in range(T): 124 | hx = hxs = self.gru(x[i], hxs * masks[i]) 125 | outputs.append(hx) 126 | 127 | # x is a (T, N, -1) tensor 128 | x = torch.stack(outputs, dim=0) 129 | # flatten 130 | x = x.view(T * N, -1) 131 | 132 | return x, hxs 133 | -------------------------------------------------------------------------------- /RedNet/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torch import nn 3 | import torch 4 | import os 5 | 6 | med_frq = [0.382900, 0.452448, 0.637584, 0.377464, 0.585595, 7 | 
0.479574, 0.781544, 0.982534, 1.017466, 0.624581, 8 | 2.589096, 0.980794, 0.920340, 0.667984, 1.172291, 9 | 0.862240, 0.921714, 2.154782, 1.187832, 1.178115, 10 | 1.848545, 1.428922, 2.849658, 0.771605, 1.656668, 11 | 4.483506, 2.209922, 1.120280, 2.790182, 0.706519, 12 | 3.994768, 2.220004, 0.972934, 1.481525, 5.342475, 13 | 0.750738, 4.040773] 14 | 15 | model_urls = { 16 | 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', 17 | 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', 18 | 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', 19 | 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', 20 | 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', 21 | } 22 | 23 | label_colours = [(0, 0, 0), 24 | # 0=background 25 | (148, 65, 137), (255, 116, 69), (86, 156, 137), 26 | (202, 179, 158), (155, 99, 235), (161, 107, 108), 27 | (133, 160, 103), (76, 152, 126), (84, 62, 35), 28 | (44, 80, 130), (31, 184, 157), (101, 144, 77), 29 | (23, 197, 62), (141, 168, 145), (142, 151, 136), 30 | (115, 201, 77), (100, 216, 255), (57, 156, 36), 31 | (88, 108, 129), (105, 129, 112), (42, 137, 126), 32 | (155, 108, 249), (166, 148, 143), (81, 91, 87), 33 | (100, 124, 51), (73, 131, 121), (157, 210, 220), 34 | (134, 181, 60), (221, 223, 147), (123, 108, 131), 35 | (161, 66, 179), (163, 221, 160), (31, 146, 98), 36 | (99, 121, 30), (49, 89, 240), (116, 108, 9), 37 | (161, 176, 169), (80, 29, 135), (177, 105, 197), 38 | (139, 110, 246)] 39 | 40 | 41 | class CrossEntropyLoss2d(nn.Module): 42 | def __init__(self, weight=med_frq): 43 | super(CrossEntropyLoss2d, self).__init__() 44 | self.ce_loss = nn.CrossEntropyLoss(reduction='none') 45 | 46 | def forward(self, inputs_scales, targets_scales): 47 | losses = [] 48 | for inputs, targets in zip(inputs_scales, targets_scales): 49 | mask = targets > 0 50 | targets_m = targets.clone() 51 | targets_m[mask] -= 1 52 | loss_all = self.ce_loss(inputs, targets_m.long()) 53 | losses.append(torch.sum(torch.masked_select(loss_all, mask)) / torch.sum(mask.float())) 54 | total_loss = sum(losses) 55 | return total_loss 56 | 57 | 58 | def color_label(label): 59 | label = label.clone().cpu().data.numpy() 60 | colored_label = np.vectorize(lambda x: label_colours[int(x)]) 61 | 62 | colored = np.asarray(colored_label(label)).astype(np.float32) 63 | colored = colored.squeeze() 64 | 65 | try: 66 | return torch.from_numpy(colored.transpose([1, 0, 2, 3])) 67 | except ValueError: 68 | return torch.from_numpy(colored[np.newaxis, ...]) 69 | 70 | 71 | def print_log(global_step, epoch, local_count, count_inter, dataset_size, loss, time_inter): 72 | print('Step: {:>5} Train Epoch: {:>3} [{:>4}/{:>4} ({:3.1f}%)] ' 73 | 'Loss: {:.6f} [{:.2f}s every {:>4} data]'.format( 74 | global_step, epoch, local_count, dataset_size, 75 | 100. 
* local_count / dataset_size, loss.data, time_inter, count_inter)) 76 | 77 | 78 | def save_ckpt(ckpt_dir, model, optimizer, global_step, epoch, local_count, num_train): 79 | # usually this happens only on the start of a epoch 80 | epoch_float = epoch + (local_count / num_train) 81 | state = { 82 | 'global_step': global_step, 83 | 'epoch': epoch_float, 84 | 'state_dict': model.state_dict(), 85 | 'optimizer': optimizer.state_dict(), 86 | } 87 | ckpt_model_filename = "ckpt_epoch_{:0.2f}.pth".format(epoch_float) 88 | path = os.path.join(ckpt_dir, ckpt_model_filename) 89 | torch.save(state, path) 90 | print('{:>2} has been successfully saved'.format(path)) 91 | 92 | 93 | # def load_ckpt(model, optimizer, model_file, device): 94 | # if os.path.isfile(model_file): 95 | # print("=> loading checkpoint '{}'".format(model_file)) 96 | # if device.type == 'cuda': 97 | # checkpoint = torch.load(model_file) 98 | # else: 99 | # checkpoint = torch.load(model_file, map_location=lambda storage, loc: storage) 100 | # model.load_state_dict(checkpoint['state_dict']) 101 | # if optimizer: 102 | # optimizer.load_state_dict(checkpoint['optimizer']) 103 | # print("=> loaded checkpoint '{}' (epoch {})" 104 | # .format(model_file, checkpoint['epoch'])) 105 | # step = checkpoint['global_step'] 106 | # epoch = checkpoint['epoch'] 107 | # return step, epoch 108 | # else: 109 | # print("=> no checkpoint found at '{}'".format(model_file)) 110 | # os._exit(0) 111 | 112 | 113 | def load_ckpt(model, optimizer, model_file, device): 114 | if os.path.isfile(model_file): 115 | print("=> loading checkpoint '{}'".format(model_file)) 116 | if device.type == 'cuda': 117 | checkpoint = torch.load(model_file) 118 | else: 119 | checkpoint = torch.load(model_file, map_location=lambda storage, loc: storage) 120 | state_dict = checkpoint['model_state'] 121 | prefix = 'module.' 
122 | state_dict = { 123 | (k[len(prefix):] if k[:len(prefix)] == prefix else k): v for k, v in state_dict.items() 124 | } 125 | model.load_state_dict(state_dict) 126 | if optimizer: 127 | optimizer.load_state_dict(checkpoint['optimizer']) 128 | print("=> loaded checkpoint '{}' (epoch {})" 129 | .format(model_file, checkpoint['epoch'])) 130 | # step = checkpoint['global_step'] 131 | # epoch = checkpoint['epoch'] 132 | # return step, epoch 133 | else: 134 | print("=> no checkpoint found at '{}'".format(model_file)) 135 | os._exit(0) -------------------------------------------------------------------------------- /envs/utils/fmm_planner.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import skfmm 4 | import skimage 5 | from numpy import around, array, ma 6 | 7 | 8 | def get_mask(sx, sy, scale, step_size): 9 | size = int(step_size // scale) * 2 + 1 10 | mask = np.zeros((size, size)) 11 | for i in range(size): 12 | for j in range(size): 13 | if ((i + 0.5) - (size // 2 + sx)) ** 2 + \ 14 | ((j + 0.5) - (size // 2 + sy)) ** 2 <= \ 15 | step_size ** 2 \ 16 | and ((i + 0.5) - (size // 2 + sx)) ** 2 + \ 17 | ((j + 0.5) - (size // 2 + sy)) ** 2 > \ 18 | (step_size - 1) ** 2: 19 | mask[i, j] = 1 20 | 21 | mask[size // 2, size // 2] = 1 22 | return mask 23 | 24 | 25 | def get_dist(sx, sy, scale, step_size): 26 | size = int(step_size // scale) * 2 + 1 27 | mask = np.zeros((size, size)) + 1e-10 28 | for i in range(size): 29 | for j in range(size): 30 | if ((i + 0.5) - (size // 2 + sx)) ** 2 + \ 31 | ((j + 0.5) - (size // 2 + sy)) ** 2 <= \ 32 | step_size ** 2: 33 | mask[i, j] = max(5, 34 | (((i + 0.5) - (size // 2 + sx)) ** 2 + 35 | ((j + 0.5) - (size // 2 + sy)) ** 2) ** 0.5) 36 | return mask 37 | 38 | 39 | class FMMPlanner(): 40 | def __init__(self, traversible, scale=1, step_size=5): 41 | self.scale = scale 42 | self.step_size = step_size 43 | if scale != 1.: 44 | self.traversible = cv2.resize(traversible, 45 | (traversible.shape[1] // scale, 46 | traversible.shape[0] // scale), 47 | interpolation=cv2.INTER_NEAREST) 48 | self.traversible = np.rint(self.traversible) 49 | else: 50 | self.traversible = traversible 51 | 52 | self.du = int(self.step_size / (self.scale * 1.)) 53 | self.fmm_dist = None 54 | self.around = np.zeros(traversible.shape) 55 | 56 | kernel = cv2.getStructuringElement(cv2.MORPH_RECT,(9, 9)) 57 | self.traversible_around = cv2.dilate((1-self.traversible*1).astype('uint8'), kernel) 58 | traversible_around_ma = ma.masked_values(self.traversible_around*1, 0) 59 | traversible_around_ma[self.traversible==0] = 0 60 | if np.any(traversible_around_ma == 0): 61 | dd = skfmm.distance(traversible_around_ma, dx=1) 62 | dd = (np.max(dd) - dd) 63 | dd = ma.filled(dd, 0) 64 | self.around = dd 65 | 66 | 67 | def set_goal(self, goal, auto_improve=False): 68 | traversible_ma = ma.masked_values(self.traversible * 1, 0) 69 | goal_x, goal_y = int(goal[0] / (self.scale * 1.)), \ 70 | int(goal[1] / (self.scale * 1.)) 71 | 72 | if self.traversible[goal_x, goal_y] == 0. 
and auto_improve: 73 | goal_x, goal_y = self._find_nearest_goal([goal_x, goal_y]) 74 | 75 | traversible_ma[goal_x, goal_y] = 0 76 | dd = skfmm.distance(traversible_ma, dx=1) 77 | dd = ma.filled(dd, np.max(dd) + 1) 78 | self.fmm_dist = dd 79 | return 80 | 81 | def set_multi_goal(self, goal_map): 82 | traversible_ma = ma.masked_values(self.traversible * 1, 0) # mask out the obstacle layer (0), keeping only free space 83 | traversible_ma[goal_map == 1] = 0 # on the obstacle-free map, goal cells are 0 and everything else is 1 84 | dd = skfmm.distance(traversible_ma, dx=1) 85 | dd = ma.filled(dd, np.max(dd) + 1) # replace masked values with max(dd) + 1, i.e. obstacles get the maximum distance 86 | self.fmm_dist = dd 87 | self.around[goal_map == 1] = 0 88 | return 89 | 90 | def get_short_term_goal(self, state): 91 | scale = self.scale * 1. 92 | state = [x / scale for x in state] 93 | dx, dy = state[0] - int(state[0]), state[1] - int(state[1]) 94 | mask = get_mask(dx, dy, scale, self.step_size) 95 | dist_mask = get_dist(dx, dy, scale, self.step_size) 96 | 97 | state = [int(x) for x in state] 98 | self.fmm_dist += self.around 99 | dist = np.pad(self.fmm_dist, self.du, 100 | 'constant', constant_values=self.fmm_dist.shape[0] ** 2) 101 | subset = dist[state[0]:state[0] + 2 * self.du + 1, 102 | state[1]:state[1] + 2 * self.du + 1] 103 | 104 | dist_around = np.pad(self.around, self.du, 105 | 'constant', constant_values=self.fmm_dist.shape[0] ** 2) 106 | subset_around = dist_around[state[0]:state[0] + 2 * self.du + 1, 107 | state[1]:state[1] + 2 * self.du + 1] 108 | 109 | assert subset.shape[0] == 2 * self.du + 1 and \ 110 | subset.shape[1] == 2 * self.du + 1, \ 111 | "Planning error: unexpected subset shape {}".format(subset.shape) 112 | 113 | subset *= mask 114 | subset += (1 - mask) * self.fmm_dist.shape[0] ** 2 115 | 116 | if subset[self.du, self.du] < 0.25 * 100 / 5.: # 25cm 117 | stop = True 118 | else: 119 | stop = False 120 | 121 | subset -= subset[self.du, self.du] 122 | ratio1 = subset / dist_mask 123 | subset[ratio1 < -1.5] = 1 124 | 125 | (stg_x, stg_y) = np.unravel_index(np.argmin(subset), subset.shape) 126 | 127 | # remove the influence of the around map 128 | if subset[stg_x, stg_y]+subset_around[self.du, self.du] > -0.0001: 129 | replan = True 130 | else: 131 | replan = False 132 | 133 | return (stg_x + state[0] - self.du) * scale, \ 134 | (stg_y + state[1] - self.du) * scale, replan, stop 135 | 136 | def _find_nearest_goal(self, goal): 137 | traversible = skimage.morphology.binary_dilation( 138 | np.zeros(self.traversible.shape), 139 | skimage.morphology.disk(2)) != True 140 | traversible = traversible * 1.
141 | planner = FMMPlanner(traversible) 142 | planner.set_goal(goal) 143 | 144 | mask = self.traversible 145 | 146 | dist_map = planner.fmm_dist * mask 147 | dist_map[dist_map == 0] = dist_map.max() 148 | 149 | goal = np.unravel_index(dist_map.argmin(), dist_map.shape) 150 | 151 | return goal 152 | -------------------------------------------------------------------------------- /agents/utils/semantic_prediction.py: -------------------------------------------------------------------------------- 1 | # The following code is largely borrowed from 2 | # https://github.com/facebookresearch/detectron2/blob/master/demo/demo.py and 3 | # https://github.com/facebookresearch/detectron2/blob/master/demo/predictor.py 4 | 5 | import argparse 6 | import time 7 | 8 | import torch 9 | import numpy as np 10 | 11 | from detectron2.config import get_cfg 12 | from detectron2.utils.logger import setup_logger 13 | from detectron2.data.catalog import MetadataCatalog 14 | from detectron2.modeling import build_model 15 | from detectron2.checkpoint import DetectionCheckpointer 16 | from detectron2.utils.visualizer import ColorMode, Visualizer 17 | import detectron2.data.transforms as T 18 | 19 | from constants import coco_categories_mapping 20 | 21 | 22 | class SemanticPredMaskRCNN(): 23 | 24 | def __init__(self, args): 25 | self.segmentation_model = ImageSegmentation(args) 26 | self.args = args 27 | 28 | def get_prediction(self, img): 29 | args = self.args 30 | image_list = [] 31 | img = img[:, :, ::-1] 32 | image_list.append(img) 33 | seg_predictions, vis_output = self.segmentation_model.get_predictions( 34 | image_list, visualize=args.visualize == 2) 35 | 36 | if args.visualize == 2: 37 | img = vis_output.get_image() 38 | 39 | semantic_input = np.zeros((img.shape[0], img.shape[1], 15 + 1)) 40 | 41 | for j, class_idx in enumerate( 42 | seg_predictions[0]['instances'].pred_classes.cpu().numpy()): 43 | if class_idx in list(coco_categories_mapping.keys()): 44 | idx = coco_categories_mapping[class_idx] 45 | obj_mask = seg_predictions[0]['instances'].pred_masks[j] * 1. 46 | semantic_input[:, :, idx] += obj_mask.cpu().numpy() 47 | 48 | return semantic_input, img 49 | 50 | 51 | def compress_sem_map(sem_map): 52 | c_map = np.zeros((sem_map.shape[1], sem_map.shape[2])) 53 | for i in range(sem_map.shape[0]): 54 | c_map[sem_map[i] > 0.] 
= i + 1 55 | return c_map 56 | 57 | 58 | class ImageSegmentation(): 59 | def __init__(self, args): 60 | string_args = """ 61 | --config-file configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml 62 | --input input1.jpeg 63 | --confidence-threshold {} 64 | --opts MODEL.WEIGHTS 65 | detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl 66 | """.format(args.sem_pred_prob_thr) 67 | 68 | if args.sem_gpu_id == -2: 69 | string_args += """ MODEL.DEVICE cpu""" 70 | else: 71 | string_args += """ MODEL.DEVICE cuda:{}""".format(args.sem_gpu_id) 72 | 73 | string_args = string_args.split() 74 | 75 | args = get_seg_parser().parse_args(string_args) 76 | logger = setup_logger() 77 | logger.info("Arguments: " + str(args)) 78 | 79 | cfg = setup_cfg(args) 80 | self.demo = VisualizationDemo(cfg) 81 | 82 | def get_predictions(self, img, visualize=0): 83 | return self.demo.run_on_image(img, visualize=visualize) 84 | 85 | 86 | def setup_cfg(args): 87 | # load config from file and command-line arguments 88 | cfg = get_cfg() 89 | cfg.merge_from_file(args.config_file) 90 | cfg.merge_from_list(args.opts) 91 | # Set score_threshold for builtin models 92 | cfg.MODEL.RETINANET.SCORE_THRESH_TEST = args.confidence_threshold 93 | cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = args.confidence_threshold 94 | cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = \ 95 | args.confidence_threshold 96 | cfg.freeze() 97 | return cfg 98 | 99 | 100 | def get_seg_parser(): 101 | parser = argparse.ArgumentParser( 102 | description="Detectron2 demo for builtin models") 103 | parser.add_argument( 104 | "--config-file", 105 | default="configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml", 106 | metavar="FILE", 107 | help="path to config file", 108 | ) 109 | parser.add_argument( 110 | "--webcam", 111 | action="store_true", 112 | help="Take inputs from webcam.") 113 | parser.add_argument("--video-input", help="Path to video file.") 114 | parser.add_argument( 115 | "--input", 116 | nargs="+", 117 | help="A list of space separated input images") 118 | parser.add_argument( 119 | "--output", 120 | help="A file or directory to save output visualizations. " 121 | "If not given, will show output in an OpenCV window.", 122 | ) 123 | 124 | parser.add_argument( 125 | "--confidence-threshold", 126 | type=float, 127 | default=0.5, 128 | help="Minimum score for instance predictions to be shown", 129 | ) 130 | parser.add_argument( 131 | "--opts", 132 | help="Modify config options using the command-line 'KEY VALUE' pairs", 133 | default=[], 134 | nargs=argparse.REMAINDER, 135 | ) 136 | return parser 137 | 138 | 139 | class VisualizationDemo(object): 140 | def __init__(self, cfg, instance_mode=ColorMode.IMAGE): 141 | """ 142 | Args: 143 | cfg (CfgNode): 144 | instance_mode (ColorMode): 145 | """ 146 | self.metadata = MetadataCatalog.get( 147 | cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused" 148 | ) 149 | self.cpu_device = torch.device("cpu") 150 | self.instance_mode = instance_mode 151 | 152 | self.predictor = BatchPredictor(cfg) 153 | 154 | def run_on_image(self, image_list, visualize=0): 155 | """ 156 | Args: 157 | image (np.ndarray): an image of shape (H, W, C) (in BGR order). 158 | This is the format used by OpenCV. 159 | 160 | Returns: 161 | predictions (dict): the output of the model. 162 | vis_output (VisImage): the visualized image output. 
163 | """ 164 | vis_output = None 165 | all_predictions = self.predictor(image_list) 166 | # Convert image from OpenCV BGR format to Matplotlib RGB format. 167 | 168 | if visualize: 169 | predictions = all_predictions[0] 170 | image = image_list[0] 171 | visualizer = Visualizer( 172 | image, self.metadata, instance_mode=self.instance_mode) 173 | if "panoptic_seg" in predictions: 174 | panoptic_seg, segments_info = predictions["panoptic_seg"] 175 | vis_output = visualizer.draw_panoptic_seg_predictions( 176 | panoptic_seg.to(self.cpu_device), segments_info 177 | ) 178 | else: 179 | if "sem_seg" in predictions: 180 | vis_output = visualizer.draw_sem_seg( 181 | predictions["sem_seg"].argmax( 182 | dim=0).to(self.cpu_device) 183 | ) 184 | if "instances" in predictions: 185 | instances = predictions["instances"].to(self.cpu_device) 186 | vis_output = visualizer.draw_instance_predictions( 187 | predictions=instances) 188 | 189 | return all_predictions, vis_output 190 | 191 | 192 | class BatchPredictor: 193 | """ 194 | Create a simple end-to-end predictor with the given config that runs on 195 | single device for a list of input images. 196 | 197 | Compared to using the model directly, this class does the following 198 | additions: 199 | 200 | 1. Load checkpoint from `cfg.MODEL.WEIGHTS`. 201 | 2. Always take BGR image as the input and apply conversion defined by 202 | `cfg.INPUT.FORMAT`. 203 | 3. Apply resizing defined by `cfg.INPUT.{MIN,MAX}_SIZE_TEST`. 204 | 4. Take a list of input images 205 | 206 | Attributes: 207 | metadata (Metadata): the metadata of the underlying dataset, obtained 208 | from cfg.DATASETS.TEST. 209 | 210 | """ 211 | 212 | def __init__(self, cfg): 213 | self.cfg = cfg.clone() # cfg can be modified by model 214 | self.model = build_model(self.cfg) 215 | self.model.eval() 216 | self.metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0]) 217 | 218 | checkpointer = DetectionCheckpointer(self.model) 219 | checkpointer.load(cfg.MODEL.WEIGHTS) 220 | 221 | self.input_format = cfg.INPUT.FORMAT 222 | assert self.input_format in ["RGB", "BGR"], self.input_format 223 | 224 | def __call__(self, image_list): 225 | """ 226 | Args: 227 | image_list (list of np.ndarray): a list of images of 228 | shape (H, W, C) (in BGR order). 229 | 230 | Returns: 231 | predictions (dict): 232 | the output of the model for all images. 233 | See :doc:`/tutorials/models` for details about the format. 234 | """ 235 | inputs = [] 236 | for original_image in image_list: 237 | # https://github.com/sphinx-doc/sphinx/issues/4258 238 | # Apply pre-processing to image. 
239 | if self.input_format == "RGB": 240 | # whether the model expects BGR inputs or RGB 241 | original_image = original_image[:, :, ::-1] 242 | height, width = original_image.shape[:2] 243 | image = original_image 244 | image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1)) 245 | 246 | instance = {"image": image, "height": height, "width": width} 247 | 248 | inputs.append(instance) 249 | 250 | with torch.no_grad(): 251 | predictions = self.model(inputs) 252 | return predictions 253 | -------------------------------------------------------------------------------- /envs/habitat/__init__.py: -------------------------------------------------------------------------------- 1 | # Parts of the code in this file have been borrowed from: 2 | # https://github.com/facebookresearch/habitat-api 3 | import os 4 | import pdb 5 | import sys 6 | 7 | import numpy as np 8 | import torch 9 | from habitat.config.default import get_config as cfg_env 10 | from habitat.datasets.pointnav.pointnav_dataset import PointNavDatasetV1 11 | from habitat import Config, Env, RLEnv, VectorEnv, make_dataset 12 | 13 | from agents.sem_exp import Sem_Exp_Env_Agent 14 | from agents.multi_attr_exp import Multi_Attr_Exp_Env_Agent 15 | from .objectgoal_env import ObjectGoal_Env 16 | from .objectgoal_env21 import ObjectGoal_Env21 17 | 18 | from .utils.vector_env import VectorEnv 19 | 20 | 21 | def make_env_fn(args, config_env, rank): 22 | dataset = make_dataset(config_env.DATASET.TYPE, config=config_env.DATASET) 23 | config_env.defrost() 24 | config_env.SIMULATOR.SCENE = dataset.episodes[0].scene_id 25 | config_env.freeze() 26 | if args.agent == "sem_exp": 27 | env = Sem_Exp_Env_Agent(args=args, rank=rank, 28 | config_env=config_env, 29 | dataset=dataset 30 | ) 31 | elif args.agent=='multi_attr_exp': 32 | env = Multi_Attr_Exp_Env_Agent(args=args, rank=rank, 33 | config_env=config_env, 34 | dataset=dataset 35 | ) 36 | elif args.agent == "obj21": 37 | env = ObjectGoal_Env21(args=args, rank=rank, 38 | config_env=config_env, 39 | dataset=dataset 40 | ) 41 | else: 42 | env = ObjectGoal_Env(args=args, rank=rank, 43 | config_env=config_env, 44 | dataset=dataset 45 | ) 46 | 47 | env.seed(rank) 48 | return env 49 | 50 | 51 | def _get_scenes_from_folder(content_dir): 52 | scene_dataset_ext = ".glb.json.gz" 53 | scenes = [] 54 | for filename in os.listdir(content_dir): 55 | if filename.endswith(scene_dataset_ext): 56 | scene = filename[: -len(scene_dataset_ext) + 4] 57 | scenes.append(scene) 58 | scenes.sort() 59 | return scenes 60 | 61 | 62 | def construct_envs(args): 63 | env_configs = [] 64 | args_list = [] 65 | 66 | basic_config = cfg_env(config_paths=["envs/habitat/configs/" 67 | + args.task_config]) 68 | basic_config.defrost() 69 | basic_config.DATASET.SPLIT = args.split 70 | 71 | # basic_config.DATASET.DATA_PATH = \ 72 | # basic_config.DATASET.DATA_PATH.replace("v1", args.version) 73 | # basic_config.DATASET.EPISODES_DIR = \ 74 | # basic_config.DATASET.EPISODES_DIR.replace("v1", args.version) 75 | basic_config.freeze() 76 | 77 | scenes = basic_config.DATASET.CONTENT_SCENES 78 | if "*" in basic_config.DATASET.CONTENT_SCENES: 79 | content_dir = os.path.join("data/datasets/objectnav/gibson/v1.1/" + args.split, "content") 80 | scenes = _get_scenes_from_folder(content_dir) 81 | 82 | if len(scenes) > 0: 83 | assert len(scenes) >= args.num_processes, ( 84 | "reduce the number of processes as there " 85 | "aren't enough number of scenes" 86 | ) 87 | 88 | scene_split_sizes = [int(np.floor(len(scenes) / args.num_processes)) 89 | 
for _ in range(args.num_processes)] 90 | for i in range(len(scenes) % args.num_processes): 91 | scene_split_sizes[i] += 1 92 | 93 | print("Scenes per thread:") 94 | for i in range(args.num_processes): 95 | config_env = cfg_env(config_paths=["envs/habitat/configs/" 96 | + args.task_config]) 97 | config_env.defrost() 98 | 99 | if len(scenes) > 0: 100 | config_env.DATASET.CONTENT_SCENES = scenes[ 101 | sum(scene_split_sizes[:i]): 102 | sum(scene_split_sizes[:i + 1]) 103 | ] 104 | print("Thread {}: {}".format(i, config_env.DATASET.CONTENT_SCENES)) 105 | 106 | if i < args.num_processes_on_first_gpu: 107 | gpu_id = 0 108 | else: 109 | gpu_id = int((i - args.num_processes_on_first_gpu) 110 | // args.num_processes_per_gpu) + args.sim_gpu_id 111 | gpu_id = min(torch.cuda.device_count() - 1, gpu_id) 112 | config_env.SIMULATOR.HABITAT_SIM_V0.GPU_DEVICE_ID = gpu_id 113 | 114 | agent_sensors = [] 115 | agent_sensors.append("RGB_SENSOR") 116 | agent_sensors.append("DEPTH_SENSOR") 117 | agent_sensors.append("SEMANTIC_SENSOR") 118 | 119 | config_env.SIMULATOR.AGENT_0.SENSORS = agent_sensors 120 | 121 | # Reseting episodes manually, setting high max episode length in sim 122 | config_env.ENVIRONMENT.MAX_EPISODE_STEPS = 10000000 123 | config_env.ENVIRONMENT.ITERATOR_OPTIONS.SHUFFLE = False 124 | 125 | config_env.SIMULATOR.RGB_SENSOR.WIDTH = args.env_frame_width 126 | config_env.SIMULATOR.RGB_SENSOR.HEIGHT = args.env_frame_height 127 | config_env.SIMULATOR.RGB_SENSOR.HFOV = args.hfov 128 | config_env.SIMULATOR.RGB_SENSOR.POSITION = [0, args.camera_height, 0] 129 | 130 | config_env.SIMULATOR.DEPTH_SENSOR.WIDTH = args.env_frame_width 131 | config_env.SIMULATOR.DEPTH_SENSOR.HEIGHT = args.env_frame_height 132 | config_env.SIMULATOR.DEPTH_SENSOR.HFOV = args.hfov 133 | config_env.SIMULATOR.DEPTH_SENSOR.MIN_DEPTH = args.min_depth 134 | config_env.SIMULATOR.DEPTH_SENSOR.MAX_DEPTH = args.max_depth 135 | config_env.SIMULATOR.DEPTH_SENSOR.POSITION = [0, args.camera_height, 0] 136 | 137 | config_env.SIMULATOR.SEMANTIC_SENSOR.WIDTH = args.env_frame_width 138 | config_env.SIMULATOR.SEMANTIC_SENSOR.HEIGHT = args.env_frame_height 139 | config_env.SIMULATOR.SEMANTIC_SENSOR.HFOV = args.hfov 140 | config_env.SIMULATOR.SEMANTIC_SENSOR.POSITION = \ 141 | [0, args.camera_height, 0] 142 | 143 | config_env.SIMULATOR.TURN_ANGLE = args.turn_angle 144 | config_env.DATASET.SPLIT = args.split 145 | # config_env.DATASET.DATA_PATH = \ 146 | # config_env.DATASET.DATA_PATH.replace("v1", args.version) 147 | # config_env.DATASET.EPISODES_DIR = \ 148 | # config_env.DATASET.EPISODES_DIR.replace("v1", args.version) 149 | 150 | config_env.freeze() 151 | env_configs.append(config_env) 152 | 153 | args_list.append(args) 154 | 155 | envs = VectorEnv( 156 | make_env_fn=make_env_fn, 157 | env_fn_args=tuple( 158 | tuple( 159 | zip(args_list, env_configs, range(args.num_processes)) 160 | ) 161 | ), 162 | ) 163 | 164 | return envs 165 | 166 | 167 | def construct_envs21(args): 168 | env_configs = [] 169 | args_list = [] 170 | 171 | basic_config = cfg_env(config_paths=["envs/habitat/configs/" 172 | + args.task_config]) 173 | basic_config.defrost() 174 | basic_config.DATASET.SPLIT = args.split 175 | basic_config.DATASET.DATA_PATH = \ 176 | basic_config.DATASET.DATA_PATH.replace("v1", args.version) 177 | # basic_config.DATASET.EPISODES_DIR = \ 178 | # basic_config.DATASET.EPISODES_DIR.replace("v1", args.version) 179 | basic_config.freeze() 180 | 181 | scenes = basic_config.DATASET.CONTENT_SCENES 182 | dataset = make_dataset(basic_config.DATASET.TYPE, 
config=basic_config.DATASET) 183 | if "*" in basic_config.DATASET.CONTENT_SCENES: 184 | scenes = dataset.get_scenes_to_load(basic_config.DATASET) 185 | 186 | if len(scenes) > 0: 187 | assert len(scenes) >= args.num_processes, ( 188 | "reduce the number of processes as there " 189 | "aren't enough number of scenes" 190 | ) 191 | 192 | scene_split_sizes = [int(np.floor(len(scenes) / args.num_processes)) 193 | for _ in range(args.num_processes)] 194 | for i in range(len(scenes) % args.num_processes): 195 | scene_split_sizes[i] += 1 196 | 197 | print("Scenes per thread:") 198 | for i in range(args.num_processes): 199 | config_env = cfg_env(config_paths=["envs/habitat/configs/" 200 | + args.task_config]) 201 | config_env.defrost() 202 | 203 | if len(scenes) > 0: 204 | config_env.DATASET.CONTENT_SCENES = scenes[ 205 | sum(scene_split_sizes[:i]): 206 | sum(scene_split_sizes[:i + 1]) 207 | ] 208 | print("Thread {}: {}".format(i, config_env.DATASET.CONTENT_SCENES)) 209 | 210 | if i < args.num_processes_on_first_gpu: 211 | gpu_id = 0 212 | else: 213 | gpu_id = int((i - args.num_processes_on_first_gpu) 214 | // args.num_processes_per_gpu) + args.sim_gpu_id 215 | gpu_id = min(torch.cuda.device_count() - 1, gpu_id) 216 | config_env.SIMULATOR.HABITAT_SIM_V0.GPU_DEVICE_ID = gpu_id 217 | 218 | agent_sensors = [] 219 | agent_sensors.append("RGB_SENSOR") 220 | agent_sensors.append("DEPTH_SENSOR") 221 | agent_sensors.append("SEMANTIC_SENSOR") 222 | 223 | config_env.SIMULATOR.AGENT_0.SENSORS = agent_sensors 224 | 225 | # Reseting episodes manually, setting high max episode length in sim 226 | config_env.ENVIRONMENT.MAX_EPISODE_STEPS = 10000000 227 | config_env.ENVIRONMENT.ITERATOR_OPTIONS.SHUFFLE = False 228 | 229 | config_env.SIMULATOR.RGB_SENSOR.WIDTH = args.env_frame_width 230 | config_env.SIMULATOR.RGB_SENSOR.HEIGHT = args.env_frame_height 231 | config_env.SIMULATOR.RGB_SENSOR.HFOV = args.hfov 232 | config_env.SIMULATOR.RGB_SENSOR.POSITION = [0, args.camera_height, 0] 233 | 234 | config_env.SIMULATOR.DEPTH_SENSOR.WIDTH = args.env_frame_width 235 | config_env.SIMULATOR.DEPTH_SENSOR.HEIGHT = args.env_frame_height 236 | config_env.SIMULATOR.DEPTH_SENSOR.HFOV = args.hfov 237 | config_env.SIMULATOR.DEPTH_SENSOR.MIN_DEPTH = args.min_depth 238 | config_env.SIMULATOR.DEPTH_SENSOR.MAX_DEPTH = args.max_depth 239 | config_env.SIMULATOR.DEPTH_SENSOR.POSITION = [0, args.camera_height, 0] 240 | 241 | config_env.SIMULATOR.SEMANTIC_SENSOR.WIDTH = args.env_frame_width 242 | config_env.SIMULATOR.SEMANTIC_SENSOR.HEIGHT = args.env_frame_height 243 | config_env.SIMULATOR.SEMANTIC_SENSOR.HFOV = args.hfov 244 | config_env.SIMULATOR.SEMANTIC_SENSOR.POSITION = \ 245 | [0, args.camera_height, 0] 246 | 247 | 248 | config_env.SIMULATOR.TURN_ANGLE = args.turn_angle 249 | config_env.DATASET.SPLIT = args.split 250 | config_env.DATASET.DATA_PATH = \ 251 | config_env.DATASET.DATA_PATH.replace("v1", args.version) 252 | # config_env.DATASET.EPISODES_DIR = \ 253 | # config_env.DATASET.EPISODES_DIR.replace("v1", args.version) 254 | 255 | config_env.freeze() 256 | env_configs.append(config_env) 257 | 258 | args_list.append(args) 259 | 260 | envs = VectorEnv( 261 | make_env_fn=make_env_fn, 262 | env_fn_args=tuple( 263 | tuple( 264 | zip(args_list, env_configs, range(args.num_processes)) 265 | ) 266 | ), 267 | ) 268 | 269 | return envs 270 | 271 | 272 | 273 | -------------------------------------------------------------------------------- /envs/habitat/objectgoal_env21.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | import bz2 3 | import gzip 4 | import _pickle as cPickle 5 | import pdb 6 | 7 | import gym 8 | import numpy as np 9 | import quaternion 10 | import skimage.morphology 11 | import habitat 12 | 13 | from envs.utils.fmm_planner import FMMPlanner 14 | from constants import category_to_id, mp3d_category_id 15 | import envs.utils.pose as pu 16 | 17 | coco_categories = [0, 3, 2, 4, 5, 1] 18 | 19 | class ObjectGoal_Env21(habitat.RLEnv): 20 | """The Object Goal Navigation environment class. The class is responsible 21 | for loading the dataset, generating episodes, and computing evaluation 22 | metrics. 23 | """ 24 | 25 | def __init__(self, args, rank, config_env, dataset): 26 | self.args = args 27 | self.rank = rank 28 | 29 | super().__init__(config_env, dataset) 30 | 31 | # Initializations 32 | self.episode_no = 0 33 | 34 | # Scene info 35 | self.last_scene_path = None 36 | self.scene_path = None 37 | self.scene_name = None 38 | 39 | # Episode Dataset info 40 | self.eps_data = None 41 | self.eps_data_idx = None 42 | self.gt_planner = None 43 | self.object_boundary = None 44 | self.goal_idx = None 45 | self.goal_name = None 46 | self.map_obj_origin = None 47 | self.starting_loc = None 48 | self.starting_distance = None 49 | 50 | # Episode tracking info 51 | self.curr_distance = None 52 | self.prev_distance = None 53 | self.timestep = None 54 | self.stopped = None 55 | self.path_length = None 56 | self.last_sim_location = None 57 | self.trajectory_states = [] 58 | self.info = {} 59 | self.info['distance_to_goal'] = None 60 | self.info['spl'] = None 61 | self.info['success'] = None 62 | 63 | # self.scene = self._env.sim.semantic_annotations() 64 | 65 | fileName = 'data/matterport_category_mappings.tsv' 66 | 67 | text = '' 68 | lines = [] 69 | items = [] 70 | self.hm3d_semantic_mapping={} 71 | 72 | with open(fileName, 'r') as f: 73 | text = f.read() 74 | lines = text.split('\n') 75 | 76 | for l in lines: 77 | items.append(l.split(' ')) 78 | 79 | for i in items: 80 | if len(i) > 3: 81 | self.hm3d_semantic_mapping[i[2]] = i[-1] 82 | 83 | # print() 84 | 85 | # for obj in self.scene.objects: 86 | # if obj is not None: 87 | # print( 88 | # f"Object id:{obj.id}, category:{obj.category.name()}, index:{obj.category.index()}" 89 | # f" center:{obj.aabb.center}, dims:{obj.aabb.sizes}" 90 | # ) 91 | 92 | def reset(self): 93 | """Resets the environment to a new episode. 
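Besides the RGB-D channels, the returned state also stacks the preprocessed semantic channel (see _preprocess_semantic).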
94 | 95 | Returns: 96 | obs (ndarray): RGBD observations (4 x H x W) 97 | info (dict): contains timestep, pose, goal category and 98 | evaluation metric info 99 | """ 100 | args = self.args 101 | # new_scene = self.episode_no % args.num_train_episodes == 0 102 | 103 | 104 | self.episode_no += 1 105 | 106 | # Initializations 107 | self.timestep = 0 108 | self.stopped = False 109 | self.path_length = 1e-5 110 | self.trajectory_states = [] 111 | 112 | # if new_scene: 113 | obs = super().reset() 114 | start_height = 0 115 | self.scene = self._env.sim.semantic_annotations() 116 | # start_height = self._env.current_episode.start_position[1] 117 | # goal_height = self.scene.objects[self._env.current_episode.info['closest_goal_object_id']].aabb.center[1] 118 | 119 | # floor_height = [] 120 | # floor_size = [] 121 | # for obj in self.scene.objects: 122 | # if obj.category.name() in self.hm3d_semantic_mapping and \ 123 | # self.hm3d_semantic_mapping[obj.category.name()] == 'floor': 124 | # floor_height.append(abs(obj.aabb.center[1] - start_height)) 125 | # floor_size.append(obj.aabb.sizes[0]*obj.aabb.sizes[2]) 126 | 127 | 128 | # if not args.eval: 129 | # while all(h > 0.1 or s < 12 for (h,s) in zip(floor_height, floor_size)) or abs(start_height-goal_height) > 1.2: 130 | # obs = super().reset() 131 | 132 | # self.scene = self._env.sim.semantic_annotations() 133 | # start_height = self._env.current_episode.start_position[1] 134 | # goal_height = self.scene.objects[self._env.current_episode.info['closest_goal_object_id']].aabb.center[1] 135 | 136 | # floor_height = [] 137 | # floor_size = [] 138 | # for obj in self.scene.objects: 139 | # if obj.category.name() in self.hm3d_semantic_mapping and \ 140 | # self.hm3d_semantic_mapping[obj.category.name()] == 'floor': 141 | # floor_height.append(abs(obj.aabb.center[1] - start_height)) 142 | # floor_size.append(obj.aabb.sizes[0]*obj.aabb.sizes[2]) 143 | 144 | self.prev_distance = self._env.get_metrics()["distance_to_goal"] 145 | self.starting_distance = self._env.get_metrics()["distance_to_goal"] 146 | self.info['starting_distance_to_goal']=self.starting_distance 147 | 148 | # print("obs: ,", obs) 149 | # print("obs shape: ,", obs.shape) 150 | rgb = obs['rgb'].astype(np.uint8) 151 | depth = obs['depth'] 152 | semantic = self._preprocess_semantic(obs["semantic"]) 153 | 154 | state = np.concatenate((rgb, depth, semantic), axis=2).transpose(2, 0, 1) 155 | self.last_sim_location = self.get_sim_location() 156 | 157 | # Set info 158 | self.info['time'] = self.timestep 159 | self.info['sensor_pose'] = [0., 0., 0.] 160 | self.info['goal_cat_id'] = coco_categories[obs['objectgoal'][0]] 161 | self.info['goal_name'] = category_to_id[obs['objectgoal'][0]] 162 | 163 | self.goal_name = category_to_id[obs['objectgoal'][0]] 164 | 165 | return state, self.info 166 | 167 | def step(self, action): 168 | """Function to take an action in the environment. 
169 | 170 | Args: 171 | action (dict): 172 | dict with following keys: 173 | 'action' (int): 0: stop, 1: forward, 2: left, 3: right 174 | 175 | Returns: 176 | obs (ndarray): RGBD observations (4 x H x W) 177 | reward (float): amount of reward returned after previous action 178 | done (bool): whether the episode has ended 179 | info (dict): contains timestep, pose, goal category and 180 | evaluation metric info 181 | """ 182 | action = action["action"] 183 | if action == 0: 184 | self.stopped = True 185 | # Not sending stop to simulator, resetting manually 186 | action = 3 187 | 188 | obs, rew, done, _ = super().step(action) 189 | 190 | # Get pose change 191 | dx, dy, do = self.get_pose_change() 192 | self.info['sensor_pose'] = [dx, dy, do] 193 | self.path_length += pu.get_l2_distance(0, dx, 0, dy) 194 | 195 | self.info['distance_to_goal'] = self._env.get_metrics()["distance_to_goal"] 196 | spl, success, dist = 0., 0., 0. 197 | if done: 198 | spl, success, dist = self.get_metrics() 199 | self.info['distance_to_goal'] = dist 200 | self.info['spl'] = spl 201 | self.info['success'] = success 202 | 203 | rgb = obs['rgb'].astype(np.uint8) 204 | depth = obs['depth'] 205 | semantic = self._preprocess_semantic(obs["semantic"]) 206 | state = np.concatenate((rgb, depth, semantic), axis=2).transpose(2, 0, 1) 207 | 208 | self.timestep += 1 209 | self.info['time'] = self.timestep 210 | 211 | 212 | 213 | 214 | return state, rew, done, self.info 215 | 216 | def _preprocess_semantic(self, semantic): 217 | # print("*********semantic type: ", type(semantic)) 218 | se = list(set(semantic.ravel())) 219 | # pdb.set_trace() 220 | for i in range(len(se)): 221 | if self.scene.objects[se[i]].category.name() in self.hm3d_semantic_mapping: 222 | hm3d_category_name = self.hm3d_semantic_mapping[self.scene.objects[se[i]].category.name()] 223 | else: 224 | hm3d_category_name = self.scene.objects[se[i]].category.name() 225 | 226 | if hm3d_category_name in mp3d_category_id: 227 | # print("sum: ", np.sum(sem_output[sem_output==se[i]])/se[i]) 228 | semantic[semantic==se[i]] = mp3d_category_id[hm3d_category_name]-1 229 | else : 230 | semantic[ 231 | semantic==se[i] 232 | ] = 0 233 | 234 | # se = list(set(semantic.ravel())) 235 | # print("semantic: ", se) # [] 236 | # semantic = np.expand_dims(semantic.astype(np.uint8), 2) 237 | return semantic.astype(np.uint8) 238 | 239 | def get_reward_range(self): 240 | """This function is not used, Habitat-RLEnv requires this function""" 241 | return (0., 1.0) 242 | 243 | def get_reward(self, observations): 244 | self.curr_distance = self._env.get_metrics()['distance_to_goal'] 245 | 246 | reward = (self.prev_distance - self.curr_distance) * \ 247 | self.args.reward_coeff 248 | 249 | self.prev_distance = self.curr_distance 250 | return reward 251 | 252 | def get_llm_distance(self, target_point_map, frontier_loc_g): 253 | 254 | frontier_dis_g = self.gt_planner.fmm_dist[frontier_loc_g[0], 255 | frontier_loc_g[1]] / 20.0 256 | tpm = len(list(set(target_point_map.ravel()))) -1 257 | for lay in range(tpm): 258 | frontier_loc = np.argwhere(target_point_map == lay+1) 259 | frontier_distance = self.gt_planner.fmm_dist[frontier_loc[0], 260 | frontier_loc[1]] / 20.0 261 | if frontier_distance > frontier_dis_g: 262 | return 0 263 | return 1 264 | 265 | def get_metrics(self): 266 | """This function computes evaluation metrics for the Object Goal task 267 | 268 | Returns: 269 | spl (float): Success weighted by Path Length 270 | (See https://arxiv.org/pdf/1807.06757.pdf) 271 | success (int): 0: Failure, 
1: Successful 272 | dist (float): Distance to Success (DTS), distance of the agent 273 | from the success threshold boundary in meters. 274 | (See https://arxiv.org/pdf/2007.00643.pdf) 275 | """ 276 | dist = self._env.get_metrics()['distance_to_goal'] 277 | if dist < 0.1: 278 | success = 1 279 | else: 280 | success = 0 281 | spl = min(success * self.starting_distance / self.path_length, 1) 282 | return spl, success, dist 283 | 284 | def get_done(self, observations): 285 | if self.info['time'] >= self.args.max_episode_length - 1: 286 | done = True 287 | elif self.stopped: 288 | done = True 289 | # print(self._env.get_metrics()) 290 | else: 291 | done = False 292 | return done 293 | 294 | def _episode_success(self): 295 | return self._env.get_metrics()['success'] 296 | 297 | def get_info(self, observations): 298 | """This function is not used, Habitat-RLEnv requires this function""" 299 | info = {} 300 | return info 301 | 302 | def get_sim_location(self): 303 | """Returns x, y, o pose of the agent in the Habitat simulator.""" 304 | 305 | agent_state = super().habitat_env.sim.get_agent_state(0) 306 | x = -agent_state.position[2] 307 | y = -agent_state.position[0] 308 | axis = quaternion.as_euler_angles(agent_state.rotation)[0] 309 | if (axis % (2 * np.pi)) < 0.1 or (axis % 310 | (2 * np.pi)) > 2 * np.pi - 0.1: 311 | o = quaternion.as_euler_angles(agent_state.rotation)[1] 312 | else: 313 | o = 2 * np.pi - quaternion.as_euler_angles(agent_state.rotation)[1] 314 | if o > np.pi: 315 | o -= 2 * np.pi 316 | return x, y, o 317 | 318 | def get_pose_change(self): 319 | """Returns dx, dy, do pose change of the agent relative to the last 320 | timestep.""" 321 | curr_sim_pose = self.get_sim_location() 322 | dx, dy, do = pu.get_rel_pose_change( 323 | curr_sim_pose, self.last_sim_location) 324 | self.last_sim_location = curr_sim_pose 325 | return dx, dy, do 326 | -------------------------------------------------------------------------------- /envs/utils/depth_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Utilities for processing depth images. 17 | """ 18 | import pdb 19 | from argparse import Namespace 20 | 21 | import itertools 22 | import numpy as np 23 | import torch 24 | 25 | import envs.utils.rotation_utils as ru 26 | 27 | 28 | def get_camera_matrix(width, height, fov): 29 | """Returns a camera matrix from image size and fov.""" 30 | xc = (width - 1.) / 2. 31 | zc = (height - 1.) / 2. 32 | f = (width / 2.) / np.tan(np.deg2rad(fov / 2.)) 33 | camera_matrix = {'xc': xc, 'zc': zc, 'f': f} 34 | camera_matrix = Namespace(**camera_matrix) 35 | return camera_matrix 36 | 37 | 38 | def get_point_cloud_from_z(Y, camera_matrix, scale=1): 39 | """Projects the depth image Y into a 3D point cloud. 
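Back-projection uses the pinhole model: X = (u - xc) * Y / f and Z = (v - zc) * Y / f, where u is the column index and v the vertically flipped row index of each pixel.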
40 | Inputs: 41 | Y is ...xHxW 42 | camera_matrix 43 | Outputs: 44 | X is positive going right 45 | Y is positive into the image 46 | Z is positive up in the image 47 | XYZ is ...xHxWx3 48 | """ 49 | x, z = np.meshgrid(np.arange(Y.shape[-1]), 50 | np.arange(Y.shape[-2] - 1, -1, -1)) 51 | for _ in range(Y.ndim - 2): 52 | x = np.expand_dims(x, axis=0) 53 | z = np.expand_dims(z, axis=0) 54 | X = (x[::scale, ::scale] - camera_matrix.xc) * \ 55 | Y[::scale, ::scale] / camera_matrix.f 56 | Z = (z[::scale, ::scale] - camera_matrix.zc) * \ 57 | Y[::scale, ::scale] / camera_matrix.f 58 | XYZ = np.concatenate((X[..., np.newaxis], 59 | Y[::scale, ::scale][..., np.newaxis], 60 | Z[..., np.newaxis]), axis=X.ndim) 61 | return XYZ 62 | 63 | 64 | def transform_camera_view(XYZ, sensor_height, camera_elevation_degree): 65 | """ 66 | Transforms the point cloud into geocentric frame to account for 67 | camera elevation and angle 68 | Input: 69 | XYZ : ...x3 70 | sensor_height : height of the sensor 71 | camera_elevation_degree : camera elevation to rectify. 72 | Output: 73 | XYZ : ...x3 74 | """ 75 | R = ru.get_r_matrix( 76 | [1., 0., 0.], angle=np.deg2rad(camera_elevation_degree)) 77 | XYZ = np.matmul(XYZ.reshape(-1, 3), R.T).reshape(XYZ.shape) 78 | XYZ[..., 2] = XYZ[..., 2] + sensor_height 79 | return XYZ 80 | 81 | 82 | def transform_pose(XYZ, current_pose): 83 | """ 84 | Transforms the point cloud into geocentric frame to account for 85 | camera position 86 | Input: 87 | XYZ : ...x3 88 | current_pose : camera position (x, y, theta (radians)) 89 | Output: 90 | XYZ : ...x3 91 | """ 92 | R = ru.get_r_matrix([0., 0., 1.], angle=current_pose[2] - np.pi / 2.) 93 | XYZ = np.matmul(XYZ.reshape(-1, 3), R.T).reshape(XYZ.shape) 94 | XYZ[:, :, 0] = XYZ[:, :, 0] + current_pose[0] 95 | XYZ[:, :, 1] = XYZ[:, :, 1] + current_pose[1] 96 | return XYZ 97 | 98 | 99 | def bin_points(XYZ_cms, map_size, z_bins, xy_resolution): 100 | """Bins points into xy-z bins 101 | XYZ_cms is ... x H x W x3 102 | Outputs is ... x map_size x map_size x (len(z_bins)+1) 103 | """ 104 | sh = XYZ_cms.shape 105 | XYZ_cms = XYZ_cms.reshape([-1, sh[-3], sh[-2], sh[-1]]) 106 | n_z_bins = len(z_bins) + 1 107 | counts = [] 108 | for XYZ_cm in XYZ_cms: 109 | isnotnan = np.logical_not(np.isnan(XYZ_cm[:, :, 0])) 110 | X_bin = np.round(XYZ_cm[:, :, 0] / xy_resolution).astype(np.int32) 111 | Y_bin = np.round(XYZ_cm[:, :, 1] / xy_resolution).astype(np.int32) 112 | Z_bin = np.digitize(XYZ_cm[:, :, 2], bins=z_bins).astype(np.int32) 113 | 114 | isvalid = np.array([X_bin >= 0, X_bin < map_size, Y_bin >= 0, 115 | Y_bin < map_size, 116 | Z_bin >= 0, Z_bin < n_z_bins, isnotnan]) 117 | isvalid = np.all(isvalid, axis=0) 118 | 119 | ind = (Y_bin * map_size + X_bin) * n_z_bins + Z_bin 120 | ind[np.logical_not(isvalid)] = 0 121 | count = np.bincount(ind.ravel(), isvalid.ravel().astype(np.int32), 122 | minlength=map_size * map_size * n_z_bins) 123 | counts = np.reshape(count, [map_size, map_size, n_z_bins]) 124 | 125 | counts = counts.reshape(list(sh[:-3]) + [map_size, map_size, n_z_bins]) 126 | 127 | return counts 128 | 129 | def bin_semantic_points(XYZ_cms, semantic, map_size, semantic_map_len, xy_resolution): 130 | """Bins points into xy-z bins 131 | XYZ_cms is ... x H x W x3 132 | semantic is ... x H x W 133 | Outputs is ... 
x map_size x map_size x len 134 | """ 135 | # print("XYZ_cms: ",XYZ_cms.shape) #128*128*3 136 | # XYZ_cms = XYZ_cms.reshape([-1, sh[-3], sh[-2], sh[-1]]) 137 | # print("XYZ_cms: ",XYZ_cms) 138 | # print("XYZ_cms: ",XYZ_cms.shape) #1*128*128*3 139 | # print("z_bins: ",z_bins) 140 | # se = list(set(semantic.ravel())) 141 | # print("bin_semantic_points: ", se) # [] 142 | # print("XYZ_cm:", XYZ_cm.shape) # 128*128*3 143 | isnotnan = np.logical_not(np.isnan(XYZ_cms[:, :, 0])) 144 | X_bin = np.round(XYZ_cms[:, :, 0] / xy_resolution).astype(np.int32) 145 | Y_bin = np.round(XYZ_cms[:, :, 1] / xy_resolution).astype(np.int32) 146 | Z_bin = semantic.astype(np.int32) 147 | 148 | # print("X_bin S count: ", Counter(X_bin.ravel())) 149 | # print("Y_bin S count: ", Counter(Y_bin.ravel())) 150 | # print("Z_bin S count: ", Counter(Z_bin.ravel())) 151 | # print("Z_bin: {}".format(X_bin)) 152 | # print("X_bin: {}".format(X_bin.shape)) #128*128 153 | # print("Z_bin: {}".format(Z_bin.shape)) #128*128 154 | 155 | isvalid = np.array([X_bin >= 0, X_bin < map_size, Y_bin >= 0, Y_bin < map_size, 156 | Z_bin > 0, Z_bin <= semantic_map_len, isnotnan]) 157 | isvalid = np.all(isvalid, axis=0) 158 | 159 | # print("isvalid: ", Counter(isvalid.ravel())) 160 | # print("isvalid: ", isvalid.shape) #128*128 161 | 162 | ind = (Y_bin * map_size + X_bin) * semantic_map_len + Z_bin-1 163 | ind[np.logical_not(isvalid)] = 0 164 | # print("ind: ", Counter(ind.ravel())) 165 | # print("ind: ", ind.shape) # 128*128 166 | count = np.bincount(ind.ravel(), isvalid.ravel().astype(np.int32), 167 | minlength=map_size * map_size * semantic_map_len) 168 | # print("counts: ", count.shape) # 4838400 169 | 170 | counts = np.reshape(count, [map_size, map_size, semantic_map_len]) 171 | # print("counts: ", counts.shape) # 480*480*semantic_map_len 172 | 173 | return counts 174 | 175 | 176 | def get_point_cloud_from_z_t(Y_t, camera_matrix, device, scale=1): 177 | """Projects the depth image Y into a 3D point cloud. 178 | Inputs: 179 | Y is ...xHxW 180 | camera_matrix 181 | Outputs: 182 | X is positive going right 183 | Y is positive into the image 184 | Z is positive up in the image 185 | XYZ is ...xHxWx3 186 | """ 187 | grid_x, grid_z = torch.meshgrid(torch.arange(Y_t.shape[-1]), 188 | torch.arange(Y_t.shape[-2] - 1, -1, -1)) 189 | grid_x = grid_x.transpose(1, 0).to(device) 190 | grid_z = grid_z.transpose(1, 0).to(device) 191 | grid_x = grid_x.unsqueeze(0).expand(Y_t.size()) 192 | grid_z = grid_z.unsqueeze(0).expand(Y_t.size()) 193 | 194 | X_t = (grid_x[:, ::scale, ::scale] - camera_matrix.xc) * \ 195 | Y_t[:, ::scale, ::scale] / camera_matrix.f 196 | Z_t = (grid_z[:, ::scale, ::scale] - camera_matrix.zc) * \ 197 | Y_t[:, ::scale, ::scale] / camera_matrix.f 198 | 199 | XYZ = torch.stack( 200 | (X_t, Y_t[:, ::scale, ::scale], Z_t), dim=len(Y_t.size())) 201 | 202 | return XYZ 203 | 204 | 205 | def transform_camera_view_t( 206 | XYZ, sensor_height, camera_elevation_degree, device): 207 | """ 208 | Transforms the point cloud into geocentric frame to account for 209 | camera elevation and angle 210 | Input: 211 | XYZ : ...x3 212 | sensor_height : height of the sensor 213 | camera_elevation_degree : camera elevation to rectify. 
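(one angle per batch element, in degrees)
device : torch device used for the rotation matrices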
214 | Output: 215 | XYZ : ...x3 216 | """ 217 | for i in range(XYZ.shape[0]): 218 | R = ru.get_r_matrix( 219 | [1., 0., 0.], angle=np.deg2rad(camera_elevation_degree[i])) 220 | XYZ[i] = torch.matmul(XYZ[i],torch.from_numpy(R).float().transpose(1, 0).to(device) 221 | ) 222 | XYZ[..., 2] = XYZ[..., 2] + sensor_height 223 | return XYZ 224 | 225 | 226 | def transform_pose_t(XYZ, current_pose, device): 227 | """ 228 | Transforms the point cloud into geocentric frame to account for 229 | camera position 230 | Input: 231 | XYZ : ...x3 232 | current_pose : camera position (x, y, theta (radians)) 233 | Output: 234 | XYZ : ...x3 235 | """ 236 | R = ru.get_r_matrix([0., 0., 1.], angle=current_pose[2] - np.pi / 2.) 237 | XYZ = torch.matmul(XYZ.reshape(-1, 3), 238 | torch.from_numpy(R).float().transpose(1, 0).to(device) 239 | ).reshape(XYZ.shape) 240 | XYZ[..., 0] += current_pose[0] 241 | XYZ[..., 1] += current_pose[1] 242 | return XYZ 243 | 244 | 245 | def splat_feat_nd(init_grid, feat, coords): 246 | """ 247 | Args: 248 | init_grid: B X nF X W X H X D X .. 249 | feat: B X nF X nPt 250 | coords: B X nDims X nPt in [-1, 1] 251 | Returns: 252 | grid: B X nF X W X H X D X .. 253 | """ 254 | wts_dim = [] 255 | pos_dim = [] 256 | grid_dims = init_grid.shape[2:] 257 | 258 | B = init_grid.shape[0] 259 | F = init_grid.shape[1] 260 | 261 | n_dims = len(grid_dims) 262 | 263 | grid_flat = init_grid.view(B, F, -1) 264 | 265 | for d in range(n_dims): 266 | pos = coords[:, [d], :] * grid_dims[d] / 2 + grid_dims[d] / 2 267 | pos_d = [] 268 | wts_d = [] 269 | 270 | for ix in [0, 1]: 271 | pos_ix = torch.floor(pos) + ix 272 | safe_ix = (pos_ix > 0) & (pos_ix < grid_dims[d]) 273 | safe_ix = safe_ix.type(pos.dtype) 274 | 275 | wts_ix = 1 - torch.abs(pos - pos_ix) 276 | 277 | wts_ix = wts_ix * safe_ix 278 | pos_ix = pos_ix * safe_ix 279 | 280 | pos_d.append(pos_ix) 281 | wts_d.append(wts_ix) 282 | 283 | pos_dim.append(pos_d) 284 | wts_dim.append(wts_d) 285 | 286 | l_ix = [[0, 1] for d in range(n_dims)] 287 | # pdb.set_trace() 288 | for ix_d in itertools.product(*l_ix): 289 | wts = torch.ones_like(wts_dim[0][0]) 290 | index = torch.zeros_like(wts_dim[0][0]) 291 | for d in range(n_dims): 292 | index = index * grid_dims[d] + pos_dim[d][ix_d[d]] 293 | wts = wts * wts_dim[d][ix_d[d]] 294 | 295 | index = index.long() 296 | grid_flat.scatter_add_(2, index.expand(-1, F, -1), feat * wts) 297 | grid_flat = torch.round(grid_flat) 298 | 299 | return grid_flat.view(init_grid.shape) 300 | 301 | def splat_att_feat_nd(init_grid, feat, coords): 302 | """ 303 | Args: 304 | init_grid: B X nF X W X H X D X .. 305 | feat: B X nF X nPt 306 | coords: B X nDims X nPt in [-1, 1] 307 | Returns: 308 | grid: B X nF X W X H X D X .. 
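Note: the last channel of feat is treated as an attention/score channel and is merged into the grid with an element-wise max, while the remaining channels use the additive splat of splat_feat_nd.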
309 | """ 310 | wts_dim = [] 311 | pos_dim = [] 312 | grid_dims = init_grid.shape[2:] 313 | 314 | B = init_grid.shape[0] 315 | F = init_grid.shape[1] 316 | 317 | n_dims = len(grid_dims) 318 | 319 | grid_flat = init_grid[:,:-1,...].view(B, F-1, -1) 320 | grid_flat_att = init_grid[:,-1:,...].view(B,1, -1) 321 | 322 | for d in range(n_dims): 323 | pos = coords[:, [d], :] * grid_dims[d] / 2 + grid_dims[d] / 2 324 | pos_d = [] 325 | wts_d = [] 326 | 327 | for ix in [0, 1]: 328 | pos_ix = torch.floor(pos) + ix 329 | safe_ix = (pos_ix > 0) & (pos_ix < grid_dims[d]) 330 | safe_ix = safe_ix.type(pos.dtype) 331 | 332 | wts_ix = 1 - torch.abs(pos - pos_ix) 333 | 334 | wts_ix = wts_ix * safe_ix 335 | pos_ix = pos_ix * safe_ix 336 | 337 | pos_d.append(pos_ix) 338 | wts_d.append(wts_ix) 339 | 340 | pos_dim.append(pos_d) 341 | wts_dim.append(wts_d) 342 | 343 | l_ix = [[0, 1] for d in range(n_dims)] 344 | for ix_d in itertools.product(*l_ix): 345 | wts = torch.ones_like(wts_dim[0][0]) 346 | index = torch.zeros_like(wts_dim[0][0]) 347 | for d in range(n_dims): 348 | index = index * grid_dims[d] + pos_dim[d][ix_d[d]] 349 | wts = wts * wts_dim[d][ix_d[d]] 350 | 351 | index = index.long() 352 | grid_flat.scatter_add_(2, index.expand(-1, F-1, -1), feat[:,:-1,:] * wts) 353 | grid_flat = torch.round(grid_flat) 354 | 355 | current_values = grid_flat_att.gather(2, index) 356 | new_val = torch.max(current_values, (feat[:,-1:,:] * wts)) 357 | grid_flat_att.scatter_(2, index, new_val) 358 | 359 | final_grid_flat=torch.cat([grid_flat,grid_flat_att],dim=1) 360 | 361 | assert final_grid_flat.shape==init_grid.view(B, F, -1).shape 362 | return final_grid_flat.view(init_grid.shape) 363 | -------------------------------------------------------------------------------- /envs/utils/depth_utils_multi.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Utilities for processing depth images. 17 | """ 18 | import pdb 19 | from argparse import Namespace 20 | 21 | import itertools 22 | import numpy as np 23 | import torch 24 | 25 | import envs.utils.rotation_utils as ru 26 | 27 | 28 | def get_camera_matrix(width, height, fov): 29 | """Returns a camera matrix from image size and fov.""" 30 | xc = (width - 1.) / 2. 31 | zc = (height - 1.) / 2. 32 | f = (width / 2.) / np.tan(np.deg2rad(fov / 2.)) 33 | camera_matrix = {'xc': xc, 'zc': zc, 'f': f} 34 | camera_matrix = Namespace(**camera_matrix) 35 | return camera_matrix 36 | 37 | 38 | def get_point_cloud_from_z(Y, camera_matrix, scale=1): 39 | """Projects the depth image Y into a 3D point cloud. 
40 | Inputs: 41 | Y is ...xHxW 42 | camera_matrix 43 | Outputs: 44 | X is positive going right 45 | Y is positive into the image 46 | Z is positive up in the image 47 | XYZ is ...xHxWx3 48 | """ 49 | x, z = np.meshgrid(np.arange(Y.shape[-1]), 50 | np.arange(Y.shape[-2] - 1, -1, -1)) 51 | for _ in range(Y.ndim - 2): 52 | x = np.expand_dims(x, axis=0) 53 | z = np.expand_dims(z, axis=0) 54 | X = (x[::scale, ::scale] - camera_matrix.xc) * \ 55 | Y[::scale, ::scale] / camera_matrix.f 56 | Z = (z[::scale, ::scale] - camera_matrix.zc) * \ 57 | Y[::scale, ::scale] / camera_matrix.f 58 | XYZ = np.concatenate((X[..., np.newaxis], 59 | Y[::scale, ::scale][..., np.newaxis], 60 | Z[..., np.newaxis]), axis=X.ndim) 61 | return XYZ 62 | 63 | 64 | def transform_camera_view(XYZ, sensor_height, camera_elevation_degree): 65 | """ 66 | Transforms the point cloud into geocentric frame to account for 67 | camera elevation and angle 68 | Input: 69 | XYZ : ...x3 70 | sensor_height : height of the sensor 71 | camera_elevation_degree : camera elevation to rectify. 72 | Output: 73 | XYZ : ...x3 74 | """ 75 | R = ru.get_r_matrix( 76 | [1., 0., 0.], angle=np.deg2rad(camera_elevation_degree)) 77 | XYZ = np.matmul(XYZ.reshape(-1, 3), R.T).reshape(XYZ.shape) 78 | XYZ[..., 2] = XYZ[..., 2] + sensor_height 79 | return XYZ 80 | 81 | 82 | def transform_pose(XYZ, current_pose): 83 | """ 84 | Transforms the point cloud into geocentric frame to account for 85 | camera position 86 | Input: 87 | XYZ : ...x3 88 | current_pose : camera position (x, y, theta (radians)) 89 | Output: 90 | XYZ : ...x3 91 | """ 92 | R = ru.get_r_matrix([0., 0., 1.], angle=current_pose[2] - np.pi / 2.) 93 | XYZ = np.matmul(XYZ.reshape(-1, 3), R.T).reshape(XYZ.shape) 94 | XYZ[:, :, 0] = XYZ[:, :, 0] + current_pose[0] 95 | XYZ[:, :, 1] = XYZ[:, :, 1] + current_pose[1] 96 | return XYZ 97 | 98 | 99 | def bin_points(XYZ_cms, map_size, z_bins, xy_resolution): 100 | """Bins points into xy-z bins 101 | XYZ_cms is ... x H x W x3 102 | Outputs is ... x map_size x map_size x (len(z_bins)+1) 103 | """ 104 | sh = XYZ_cms.shape 105 | XYZ_cms = XYZ_cms.reshape([-1, sh[-3], sh[-2], sh[-1]]) 106 | n_z_bins = len(z_bins) + 1 107 | counts = [] 108 | for XYZ_cm in XYZ_cms: 109 | isnotnan = np.logical_not(np.isnan(XYZ_cm[:, :, 0])) 110 | X_bin = np.round(XYZ_cm[:, :, 0] / xy_resolution).astype(np.int32) 111 | Y_bin = np.round(XYZ_cm[:, :, 1] / xy_resolution).astype(np.int32) 112 | Z_bin = np.digitize(XYZ_cm[:, :, 2], bins=z_bins).astype(np.int32) 113 | 114 | isvalid = np.array([X_bin >= 0, X_bin < map_size, Y_bin >= 0, 115 | Y_bin < map_size, 116 | Z_bin >= 0, Z_bin < n_z_bins, isnotnan]) 117 | isvalid = np.all(isvalid, axis=0) 118 | 119 | ind = (Y_bin * map_size + X_bin) * n_z_bins + Z_bin 120 | ind[np.logical_not(isvalid)] = 0 121 | count = np.bincount(ind.ravel(), isvalid.ravel().astype(np.int32), 122 | minlength=map_size * map_size * n_z_bins) 123 | counts = np.reshape(count, [map_size, map_size, n_z_bins]) 124 | 125 | counts = counts.reshape(list(sh[:-3]) + [map_size, map_size, n_z_bins]) 126 | 127 | return counts 128 | 129 | def bin_semantic_points(XYZ_cms, semantic, map_size, semantic_map_len, xy_resolution): 130 | """Bins points into xy-z bins 131 | XYZ_cms is ... x H x W x3 132 | semantic is ... x H x W 133 | Outputs is ... 
x map_size x map_size x len 134 | """ 135 | # print("XYZ_cms: ",XYZ_cms.shape) #128*128*3 136 | # XYZ_cms = XYZ_cms.reshape([-1, sh[-3], sh[-2], sh[-1]]) 137 | # print("XYZ_cms: ",XYZ_cms) 138 | # print("XYZ_cms: ",XYZ_cms.shape) #1*128*128*3 139 | # print("z_bins: ",z_bins) 140 | # se = list(set(semantic.ravel())) 141 | # print("bin_semantic_points: ", se) # [] 142 | # print("XYZ_cm:", XYZ_cm.shape) # 128*128*3 143 | isnotnan = np.logical_not(np.isnan(XYZ_cms[:, :, 0])) 144 | X_bin = np.round(XYZ_cms[:, :, 0] / xy_resolution).astype(np.int32) 145 | Y_bin = np.round(XYZ_cms[:, :, 1] / xy_resolution).astype(np.int32) 146 | Z_bin = semantic.astype(np.int32) 147 | 148 | # print("X_bin S count: ", Counter(X_bin.ravel())) 149 | # print("Y_bin S count: ", Counter(Y_bin.ravel())) 150 | # print("Z_bin S count: ", Counter(Z_bin.ravel())) 151 | # print("Z_bin: {}".format(X_bin)) 152 | # print("X_bin: {}".format(X_bin.shape)) #128*128 153 | # print("Z_bin: {}".format(Z_bin.shape)) #128*128 154 | 155 | isvalid = np.array([X_bin >= 0, X_bin < map_size, Y_bin >= 0, Y_bin < map_size, 156 | Z_bin > 0, Z_bin <= semantic_map_len, isnotnan]) 157 | isvalid = np.all(isvalid, axis=0) 158 | 159 | # print("isvalid: ", Counter(isvalid.ravel())) 160 | # print("isvalid: ", isvalid.shape) #128*128 161 | 162 | ind = (Y_bin * map_size + X_bin) * semantic_map_len + Z_bin-1 163 | ind[np.logical_not(isvalid)] = 0 164 | # print("ind: ", Counter(ind.ravel())) 165 | # print("ind: ", ind.shape) # 128*128 166 | count = np.bincount(ind.ravel(), isvalid.ravel().astype(np.int32), 167 | minlength=map_size * map_size * semantic_map_len) 168 | # print("counts: ", count.shape) # 4838400 169 | 170 | counts = np.reshape(count, [map_size, map_size, semantic_map_len]) 171 | # print("counts: ", counts.shape) # 480*480*semantic_map_len 172 | 173 | return counts 174 | 175 | 176 | def get_point_cloud_from_z_t(Y_t, camera_matrix, device, scale=1): 177 | """Projects the depth image Y into a 3D point cloud. 178 | Inputs: 179 | Y is ...xHxW 180 | camera_matrix 181 | Outputs: 182 | X is positive going right 183 | Y is positive into the image 184 | Z is positive up in the image 185 | XYZ is ...xHxWx3 186 | """ 187 | grid_x, grid_z = torch.meshgrid(torch.arange(Y_t.shape[-1]), 188 | torch.arange(Y_t.shape[-2] - 1, -1, -1)) 189 | grid_x = grid_x.transpose(1, 0).to(device) 190 | grid_z = grid_z.transpose(1, 0).to(device) 191 | grid_x = grid_x.unsqueeze(0).expand(Y_t.size()) 192 | grid_z = grid_z.unsqueeze(0).expand(Y_t.size()) 193 | 194 | X_t = (grid_x[:, ::scale, ::scale] - camera_matrix.xc) * \ 195 | Y_t[:, ::scale, ::scale] / camera_matrix.f 196 | Z_t = (grid_z[:, ::scale, ::scale] - camera_matrix.zc) * \ 197 | Y_t[:, ::scale, ::scale] / camera_matrix.f 198 | 199 | XYZ = torch.stack( 200 | (X_t, Y_t[:, ::scale, ::scale], Z_t), dim=len(Y_t.size())) 201 | 202 | return XYZ 203 | 204 | 205 | def transform_camera_view_t( 206 | XYZ, sensor_height, camera_elevation_degree, device): 207 | """ 208 | Transforms the point cloud into geocentric frame to account for 209 | camera elevation and angle 210 | Input: 211 | XYZ : ...x3 212 | sensor_height : height of the sensor 213 | camera_elevation_degree : camera elevation to rectify. 
214 | Output: 215 | XYZ : ...x3 216 | """ 217 | for i in range(XYZ.shape[0]): 218 | R = ru.get_r_matrix( 219 | [1., 0., 0.], angle=np.deg2rad(camera_elevation_degree[i])) 220 | XYZ[i] = torch.matmul(XYZ[i],torch.from_numpy(R).float().transpose(1, 0).to(device) 221 | ) 222 | XYZ[..., 2] = XYZ[..., 2] + sensor_height 223 | return XYZ 224 | 225 | 226 | def transform_pose_t(XYZ, current_pose, device): 227 | """ 228 | Transforms the point cloud into geocentric frame to account for 229 | camera position 230 | Input: 231 | XYZ : ...x3 232 | current_pose : camera position (x, y, theta (radians)) 233 | Output: 234 | XYZ : ...x3 235 | """ 236 | R = ru.get_r_matrix([0., 0., 1.], angle=current_pose[2] - np.pi / 2.) 237 | XYZ = torch.matmul(XYZ.reshape(-1, 3), 238 | torch.from_numpy(R).float().transpose(1, 0).to(device) 239 | ).reshape(XYZ.shape) 240 | XYZ[..., 0] += current_pose[0] 241 | XYZ[..., 1] += current_pose[1] 242 | return XYZ 243 | 244 | 245 | def splat_feat_nd(init_grid, feat, coords): 246 | """ 247 | Args: 248 | init_grid: B X nF X W X H X D X .. 249 | feat: B X nF X nPt 250 | coords: B X nDims X nPt in [-1, 1] 251 | Returns: 252 | grid: B X nF X W X H X D X .. 253 | """ 254 | wts_dim = [] 255 | pos_dim = [] 256 | grid_dims = init_grid.shape[2:] 257 | 258 | B = init_grid.shape[0] 259 | F = init_grid.shape[1] 260 | 261 | n_dims = len(grid_dims) 262 | 263 | grid_flat = init_grid.view(B, F, -1) 264 | 265 | for d in range(n_dims): 266 | pos = coords[:, [d], :] * grid_dims[d] / 2 + grid_dims[d] / 2 267 | pos_d = [] 268 | wts_d = [] 269 | 270 | for ix in [0, 1]: 271 | pos_ix = torch.floor(pos) + ix 272 | safe_ix = (pos_ix > 0) & (pos_ix < grid_dims[d]) 273 | safe_ix = safe_ix.type(pos.dtype) 274 | 275 | wts_ix = 1 - torch.abs(pos - pos_ix) 276 | 277 | wts_ix = wts_ix * safe_ix 278 | pos_ix = pos_ix * safe_ix 279 | 280 | pos_d.append(pos_ix) 281 | wts_d.append(wts_ix) 282 | 283 | pos_dim.append(pos_d) 284 | wts_dim.append(wts_d) 285 | 286 | l_ix = [[0, 1] for d in range(n_dims)] 287 | # pdb.set_trace() 288 | for ix_d in itertools.product(*l_ix): 289 | wts = torch.ones_like(wts_dim[0][0]) 290 | index = torch.zeros_like(wts_dim[0][0]) 291 | for d in range(n_dims): 292 | index = index * grid_dims[d] + pos_dim[d][ix_d[d]] 293 | wts = wts * wts_dim[d][ix_d[d]] 294 | 295 | index = index.long() 296 | grid_flat.scatter_add_(2, index.expand(-1, F, -1), feat * wts) 297 | grid_flat = torch.round(grid_flat) 298 | 299 | return grid_flat.view(init_grid.shape) 300 | 301 | def splat_att_feat_nd(init_grid, feat, coords,num_atts): 302 | """ 303 | Args: 304 | init_grid: B X nF X W X H X D X .. 305 | feat: B X nF X nPt 306 | coords: B X nDims X nPt in [-1, 1] 307 | Returns: 308 | grid: B X nF X W X H X D X .. 
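num_atts: number of trailing attribute/score channels in feat and init_grid; these channels are merged with an element-wise max instead of scatter_add_, the remaining nF - num_atts channels behave as in splat_feat_nd.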
309 | """ 310 | wts_dim = [] 311 | pos_dim = [] 312 | grid_dims = init_grid.shape[2:] 313 | 314 | B = init_grid.shape[0] 315 | F = init_grid.shape[1] 316 | 317 | n_dims = len(grid_dims) 318 | # pdb.set_trace() 319 | grid_flat = init_grid[:,:-num_atts,...].view(B, F-num_atts, -1) 320 | grid_flat_att = init_grid[:,-num_atts:,...].view(B,num_atts, -1) 321 | 322 | for d in range(n_dims): 323 | pos = coords[:, [d], :] * grid_dims[d] / 2 + grid_dims[d] / 2 324 | pos_d = [] 325 | wts_d = [] 326 | 327 | for ix in [0, 1]: 328 | pos_ix = torch.floor(pos) + ix 329 | safe_ix = (pos_ix > 0) & (pos_ix < grid_dims[d]) 330 | safe_ix = safe_ix.type(pos.dtype) 331 | 332 | wts_ix = 1 - torch.abs(pos - pos_ix) 333 | 334 | wts_ix = wts_ix * safe_ix 335 | pos_ix = pos_ix * safe_ix 336 | 337 | pos_d.append(pos_ix) 338 | wts_d.append(wts_ix) 339 | 340 | pos_dim.append(pos_d) 341 | wts_dim.append(wts_d) 342 | 343 | l_ix = [[0, 1] for d in range(n_dims)] 344 | for ix_d in itertools.product(*l_ix): 345 | wts = torch.ones_like(wts_dim[0][0]) 346 | index = torch.zeros_like(wts_dim[0][0]) 347 | for d in range(n_dims): 348 | index = index * grid_dims[d] + pos_dim[d][ix_d[d]] 349 | wts = wts * wts_dim[d][ix_d[d]] 350 | 351 | index = index.long() 352 | grid_flat.scatter_add_(2, index.expand(-1, F-num_atts, -1), feat[:,:-num_atts,:] * wts) 353 | grid_flat = torch.round(grid_flat) 354 | 355 | current_values = grid_flat_att.gather(2, index.expand(-1, num_atts, -1)) 356 | new_val = torch.max(current_values, (feat[:,-num_atts:,:] * wts)) 357 | grid_flat_att.scatter_(2, index.expand(-1, num_atts, -1), new_val) 358 | 359 | final_grid_flat=torch.cat([grid_flat,grid_flat_att],dim=1) 360 | 361 | assert final_grid_flat.shape==init_grid.view(B, F, -1).shape 362 | return final_grid_flat.view(init_grid.shape) 363 | -------------------------------------------------------------------------------- /arguments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | 4 | 5 | def get_args(): 6 | parser = argparse.ArgumentParser( 7 | description='Goal-Oriented-Semantic-Exploration') 8 | 9 | # General Arguments 10 | parser.add_argument('--seed', type=int, default=1, 11 | help='random seed (default: 1)') 12 | parser.add_argument('--auto_gpu_config', type=int, default=1) 13 | parser.add_argument('--total_num_scenes', type=str, default="auto") 14 | parser.add_argument('-n', '--num_processes', type=int, default=5, 15 | help="""how many training processes to use (default:5) 16 | Overridden when auto_gpu_config=1 17 | and training on gpus""") 18 | parser.add_argument('--num_processes_per_gpu', type=int, default=6) 19 | parser.add_argument('--num_processes_on_first_gpu', type=int, default=1) 20 | parser.add_argument('--eval', type=int, default=0, 21 | help='0: Train, 1: Evaluate (default: 0)') 22 | parser.add_argument('--num_training_frames', type=int, default=10000000, 23 | help='total number of training frames') 24 | parser.add_argument('--num_eval_episodes', type=int, default=400, 25 | help="number of test episodes per scene") 26 | parser.add_argument('--num_train_episodes', type=int, default=10000, 27 | help="""number of train episodes per scene 28 | before loading the next scene""") 29 | parser.add_argument('--no_cuda', action='store_true', default=False, 30 | help='disables CUDA training') 31 | parser.add_argument("--sim_gpu_id", type=int, default=0, 32 | help="gpu id on which scenes are loaded") 33 | parser.add_argument("--sem_gpu_id", type=int, default=-1, 34 | help="""gpu id 
for semantic model, 35 | -1: same as sim gpu, -2: cpu""") 36 | parser.add_argument("--blip_gpu_id", type=int, default=1, 37 | help="""gpu id for BLIP model, 38 | -1: same as sim gpu, -2: cpu""") 39 | 40 | # Logging, loading models, visualization 41 | parser.add_argument('--log_interval', type=int, default=10, 42 | help="""log interval, one log per n updates 43 | (default: 10) """) 44 | parser.add_argument('--save_interval', type=int, default=1, 45 | help="""save interval""") 46 | parser.add_argument('-d', '--dump_location', type=str, default="./tmp/", 47 | help='path to dump models and log (default: ./tmp/)') 48 | parser.add_argument('--exp_name', type=str, default="exp1", 49 | help='experiment name (default: exp1)') 50 | parser.add_argument('--save_periodic', type=int, default=500000, 51 | help='Model save frequency in number of updates') 52 | parser.add_argument('--load', type=str, default="0", 53 | help="""model path to load, 54 | 0 to not reload (default: 0)""") 55 | parser.add_argument('-v', '--visualize', type=int, default=0, 56 | help="""1: Render the observation and 57 | the predicted semantic map, 58 | 2: Render the observation with semantic 59 | predictions and the predicted semantic map 60 | (default: 0)""") 61 | parser.add_argument('--print_images', type=int, default=0, 62 | help='1: save visualization as images') 63 | 64 | # Environment, dataset and episode specifications 65 | parser.add_argument('-efw', '--env_frame_width', type=int, default=640, 66 | help='Frame width (default:640)') 67 | parser.add_argument('-efh', '--env_frame_height', type=int, default=480, 68 | help='Frame height (default:480)') 69 | parser.add_argument('-fw', '--frame_width', type=int, default=160, 70 | help='Frame width (default:160)') 71 | parser.add_argument('-fh', '--frame_height', type=int, default=120, 72 | help='Frame height (default:120)') 73 | parser.add_argument('-el', '--max_episode_length', type=int, default=532, 74 | help="""Maximum episode length""") 75 | parser.add_argument("--task_config", type=str, 76 | default="tasks/objectnav_hm3d.yaml", 77 | help="path to config yaml containing task information") 78 | parser.add_argument("--split", type=str, default="train", 79 | help="dataset split (train | val | val_mini) ") 80 | parser.add_argument('--camera_height', type=float, default=0.88, 81 | help="agent camera height in metres") 82 | parser.add_argument('--hfov', type=float, default=79.0, 83 | help="horizontal field of view in degrees") 84 | parser.add_argument('--turn_angle', type=float, default=30, 85 | help="Agent turn angle in degrees") 86 | parser.add_argument('--min_depth', type=float, default=0.5, 87 | help="Minimum depth for depth sensor in meters") 88 | parser.add_argument('--max_depth', type=float, default=5.0, 89 | help="Maximum depth for depth sensor in meters") 90 | parser.add_argument('--success_dist', type=float, default=1.5, 91 | help="success distance threshold in meters") 92 | parser.add_argument('--floor_thr', type=int, default=50, 93 | help="floor threshold in cm") 94 | parser.add_argument('--min_d', type=float, default=1.5, 95 | help="min distance to goal during training in meters") 96 | parser.add_argument('--max_d', type=float, default=100.0, 97 | help="max distance to goal during training in meters") 98 | parser.add_argument('--version', type=str, default="v1.1", 99 | help="dataset version") 100 | 101 | # Model Hyperparameters 102 | parser.add_argument('--agent', type=str, default="sem_exp") 103 | parser.add_argument('--lr', type=float, default=2.5e-5, 104 | 
help='learning rate (default: 2.5e-5)') 105 | parser.add_argument('--global_hidden_size', type=int, default=256, 106 | help='global_hidden_size') 107 | parser.add_argument('--eps', type=float, default=1e-5, 108 | help='RL Optimizer epsilon (default: 1e-5)') 109 | parser.add_argument('--alpha', type=float, default=0.99, 110 | help='RL Optimizer alpha (default: 0.99)') 111 | parser.add_argument('--gamma', type=float, default=0.99, 112 | help='discount factor for rewards (default: 0.99)') 113 | parser.add_argument('--use_gae', action='store_true', default=False, 114 | help='use generalized advantage estimation') 115 | parser.add_argument('--tau', type=float, default=0.95, 116 | help='gae parameter (default: 0.95)') 117 | parser.add_argument('--entropy_coef', type=float, default=0.001, 118 | help='entropy term coefficient (default: 0.01)') 119 | parser.add_argument('--value_loss_coef', type=float, default=0.5, 120 | help='value loss coefficient (default: 0.5)') 121 | parser.add_argument('--max_grad_norm', type=float, default=0.5, 122 | help='max norm of gradients (default: 0.5)') 123 | parser.add_argument('--num_global_steps', type=int, default=20, 124 | help='number of forward steps in A2C (default: 5)') 125 | parser.add_argument('--ppo_epoch', type=int, default=4, 126 | help='number of ppo epochs (default: 4)') 127 | parser.add_argument('--num_mini_batch', type=str, default="auto", 128 | help='number of batches for ppo (default: 32)') 129 | parser.add_argument('--clip_param', type=float, default=0.2, 130 | help='ppo clip parameter (default: 0.2)') 131 | parser.add_argument('--use_recurrent_global', type=int, default=0, 132 | help='use a recurrent global policy') 133 | parser.add_argument('--num_local_steps', type=int, default=25, 134 | help="""Number of steps the local policy 135 | between each global step""") 136 | parser.add_argument('--reward_coeff', type=float, default=0.1, 137 | help="Object goal reward coefficient") 138 | parser.add_argument('--intrinsic_rew_coeff', type=float, default=0.05, 139 | help="intrinsic exploration reward coefficient") 140 | parser.add_argument('--num_sem_categories', type=float, default=16) 141 | parser.add_argument('--sem_pred_prob_thr', type=float, default=0.9, 142 | help="Semantic prediction confidence threshold") 143 | 144 | # Mapping 145 | parser.add_argument('--global_downscaling', type=int, default=2) 146 | parser.add_argument('--vision_range', type=int, default=100) 147 | parser.add_argument('--map_resolution', type=int, default=5) 148 | parser.add_argument('--du_scale', type=int, default=1) 149 | parser.add_argument('--map_size_cm', type=int, default=4800) 150 | parser.add_argument('--cat_pred_threshold', type=float, default=5.0) 151 | parser.add_argument('--map_pred_threshold', type=float, default=1.0) 152 | parser.add_argument('--exp_pred_threshold', type=float, default=1.0) 153 | parser.add_argument('--collision_threshold', type=float, default=0.10) 154 | parser.add_argument('--collision_queue_size', type=int, default=3) 155 | parser.add_argument('--wtec', type=float, default=0.5) 156 | parser.add_argument('--num_attrs', type=float, default=5) 157 | 158 | parser.add_argument('--use_gtsem', type=int, default=0) 159 | 160 | # train_se_frontier 161 | parser.add_argument('--train_se_f', type=int, default=0) 162 | parser.add_argument('--load_se_edge', type=str, default="0", 163 | help="""model path to load, 164 | 0 to not reload (default: 0)""") 165 | 166 | 167 | # parse arguments 168 | args = parser.parse_args() 169 | 170 | args.cuda = not 
args.no_cuda and torch.cuda.is_available() 171 | 172 | if args.cuda: 173 | if args.auto_gpu_config: 174 | num_gpus = torch.cuda.device_count() 175 | if args.total_num_scenes != "auto": 176 | args.total_num_scenes = int(args.total_num_scenes) 177 | elif "objectnav_gibson" in args.task_config and \ 178 | "train" in args.split: 179 | args.total_num_scenes = 25 180 | elif "objectnav_gibson" in args.task_config and \ 181 | "val" in args.split: 182 | args.total_num_scenes = 5 183 | else: 184 | assert False, "Unknown task config, please specify" + \ 185 | " total_num_scenes" 186 | 187 | # GPU Memory required for the SemExp model: 188 | # 0.8 + 0.4 * args.total_num_scenes (GB) 189 | # GPU Memory required per thread: 2.6 (GB) 190 | min_memory_required = max(0.8 + 0.4 * args.total_num_scenes, 2.6) 191 | # Automatically configure number of training threads based on 192 | # number of GPUs available and GPU memory size 193 | gpu_memory = 1000 194 | for i in range(num_gpus): 195 | gpu_memory = min(gpu_memory, 196 | torch.cuda.get_device_properties( 197 | i).total_memory 198 | / 1024 / 1024 / 1024) 199 | assert gpu_memory > min_memory_required, \ 200 | """Insufficient GPU memory for GPU {}, gpu memory ({}GB) 201 | needs to be greater than {}GB""".format( 202 | i, gpu_memory, min_memory_required) 203 | 204 | num_processes_per_gpu = int(gpu_memory / 2.6) 205 | num_processes_on_first_gpu = \ 206 | int((gpu_memory - min_memory_required) / 2.6) 207 | 208 | if args.eval: 209 | max_threads = num_processes_per_gpu * (num_gpus - 1) \ 210 | + num_processes_on_first_gpu 211 | assert max_threads >= args.total_num_scenes, \ 212 | """Insufficient GPU memory for evaluation""" 213 | 214 | if num_gpus == 1: 215 | args.num_processes_on_first_gpu = num_processes_on_first_gpu 216 | args.num_processes_per_gpu = 0 217 | args.num_processes = num_processes_on_first_gpu 218 | assert args.num_processes > 0, "Insufficient GPU memory" 219 | else: 220 | num_threads = num_processes_per_gpu * (num_gpus - 1) \ 221 | + num_processes_on_first_gpu 222 | num_threads = min(num_threads, args.total_num_scenes) 223 | args.num_processes_per_gpu = num_processes_per_gpu 224 | args.num_processes_on_first_gpu = max( 225 | 0, 226 | num_threads - args.num_processes_per_gpu * (num_gpus - 1)) 227 | args.num_processes = num_threads 228 | 229 | args.sim_gpu_id = 1 230 | 231 | print("Auto GPU config:") 232 | print("Number of processes: {}".format(args.num_processes)) 233 | print("Number of processes on GPU 0: {}".format( 234 | args.num_processes_on_first_gpu)) 235 | print("Number of processes per GPU: {}".format( 236 | args.num_processes_per_gpu)) 237 | else: 238 | args.sem_gpu_id = -2 239 | 240 | if args.num_mini_batch == "auto": 241 | args.num_mini_batch = max(args.num_processes // 2, 1) 242 | else: 243 | args.num_mini_batch = int(args.num_mini_batch) 244 | 245 | return args 246 | -------------------------------------------------------------------------------- /model_multi.py: -------------------------------------------------------------------------------- 1 | import pdb 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.nn import functional as F 6 | import numpy as np 7 | 8 | # from utils.distributions import Categorical, DiagGaussian 9 | from torch.distributions.categorical import Categorical 10 | from utils.model import get_grid, ChannelPool, Flatten, NNBase 11 | import envs.utils.depth_utils_multi as du 12 | 13 | class Semantic_Mapping(nn.Module): 14 | 15 | """ 16 | Semantic_Mapping 17 | """ 18 | 19 | def __init__(self, args): 20 | 
super(Semantic_Mapping, self).__init__() 21 | 22 | self.device = args.device 23 | self.screen_h = args.frame_height 24 | self.screen_w = args.frame_width 25 | self.resolution = args.map_resolution 26 | self.z_resolution = args.map_resolution 27 | self.map_size_cm = args.map_size_cm // args.global_downscaling 28 | self.n_channels = 3 29 | self.vision_range = args.vision_range 30 | self.dropout = 0.5 31 | self.fov = args.hfov 32 | self.du_scale = args.du_scale 33 | self.cat_pred_threshold = args.cat_pred_threshold 34 | self.exp_pred_threshold = args.exp_pred_threshold 35 | self.map_pred_threshold = args.map_pred_threshold 36 | self.num_sem_categories = args.num_sem_categories 37 | self.num_atts = args.num_attrs 38 | 39 | self.max_height = int(200 / self.z_resolution) 40 | self.min_height = int(-40 / self.z_resolution) 41 | self.agent_height = args.camera_height * 100. 42 | self.shift_loc = [self.vision_range * 43 | self.resolution // 2, 0, np.pi / 2.0] 44 | self.camera_matrix = du.get_camera_matrix( 45 | self.screen_w, self.screen_h, self.fov) 46 | 47 | self.pool = ChannelPool(1) 48 | 49 | vr = self.vision_range 50 | # pdb.set_trace() 51 | self.init_grid = torch.zeros( 52 | args.num_processes, 1 + self.num_sem_categories+self.num_atts, vr, vr, 53 | self.max_height - self.min_height 54 | ).float().to(self.device) 55 | self.feat = torch.ones( 56 | args.num_processes, 1 + self.num_sem_categories+self.num_atts, 57 | self.screen_h // self.du_scale * self.screen_w // self.du_scale 58 | ).float().to(self.device) 59 | 60 | 61 | self.max_pool = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 62 | 63 | self.stair_mask_radius = 30 64 | self.stair_mask = self.get_mask(self.stair_mask_radius).to(self.device) 65 | 66 | def forward(self, obs, pose_obs, maps_last, poses_last, eve_angle,goal_found): 67 | bs, c, h, w = obs.size() 68 | depth = obs[:, 3, :, :] 69 | point_cloud_t = du.get_point_cloud_from_z_t( 70 | depth, self.camera_matrix, self.device, scale=self.du_scale) 71 | 72 | agent_view_t = du.transform_camera_view_t( 73 | point_cloud_t, self.agent_height, eve_angle, self.device) 74 | 75 | agent_view_centered_t = du.transform_pose_t( 76 | agent_view_t, self.shift_loc, self.device) 77 | 78 | max_h = self.max_height 79 | min_h = self.min_height 80 | xy_resolution = self.resolution 81 | z_resolution = self.z_resolution 82 | vision_range = self.vision_range 83 | XYZ_cm_std = agent_view_centered_t.float() 84 | XYZ_cm_std[..., :2] = (XYZ_cm_std[..., :2] / xy_resolution) 85 | XYZ_cm_std[..., :2] = (XYZ_cm_std[..., :2] - 86 | vision_range // 2.) / vision_range * 2. 87 | XYZ_cm_std[..., 2] = XYZ_cm_std[..., 2] / z_resolution 88 | XYZ_cm_std[..., 2] = (XYZ_cm_std[..., 2] - 89 | (max_h + min_h) // 2.) / (max_h - min_h) * 2. 
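# XYZ_cm_std now holds voxel coordinates normalised to [-1, 1]: x/y over the
# local vision_range grid and z over the slab between min_height and max_height,
# matching the layout of self.init_grid used for the splat below.
# Channel 0 of self.feat stays at 1 (occupancy count); the remaining channels are
# filled with the downsampled semantic / attribute observations.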
90 | self.feat[:, 1:, :] = nn.AvgPool2d(self.du_scale)( 91 | obs[:, 4:, :, :] 92 | ).view(bs, c - 4, h // self.du_scale * w // self.du_scale) 93 | 94 | XYZ_cm_std = XYZ_cm_std.permute(0, 3, 1, 2) 95 | XYZ_cm_std = XYZ_cm_std.view(XYZ_cm_std.shape[0], 96 | XYZ_cm_std.shape[1], 97 | XYZ_cm_std.shape[2] * XYZ_cm_std.shape[3]) 98 | 99 | # voxels = du.splat_feat_nd( 100 | # self.init_grid * 0., self.feat, XYZ_cm_std).transpose(2, 3) 101 | voxels = du.splat_att_feat_nd( 102 | self.init_grid * 0., self.feat, XYZ_cm_std,self.num_atts).transpose(2, 3) 103 | 104 | min_z = int(25 / z_resolution - min_h) 105 | max_z = int((self.agent_height + 50) / z_resolution - min_h) 106 | mid_z = int(self.agent_height / z_resolution - min_h) 107 | 108 | # print("========== IN SEM MAP ==========") 109 | # print(np.unique(voxels[:,[-1],...].max(4)[0].detach().cpu().numpy())) 110 | 111 | agent_height_proj = voxels[:,:-self.num_atts,..., min_z:max_z].sum(4) 112 | agent_height_proj_att = voxels[:,-self.num_atts:,..., min_z:max_z].max(4)[0] 113 | agent_height_proj=torch.cat([agent_height_proj,agent_height_proj_att],dim=1) 114 | agent_height_stair_proj = voxels[:,:-self.num_atts,..., mid_z-5:mid_z].sum(4) 115 | agent_height_stair_proj_att = voxels[:,-self.num_atts:,..., mid_z-5:mid_z].max(4)[0] 116 | agent_height_stair_proj=torch.cat([agent_height_stair_proj,agent_height_stair_proj_att],dim=1) 117 | all_height_proj = voxels[:,:-self.num_atts,...].sum(4) 118 | all_height_proj_att = voxels[:,-self.num_atts:,...].max(4)[0] 119 | all_height_proj=torch.cat([all_height_proj,all_height_proj_att],dim=1) 120 | 121 | # pdb.set_trace() 122 | fp_map_pred = agent_height_proj[:, 0:1, :, :] 123 | fp_exp_pred = all_height_proj[:, 0:1, :, :] 124 | fp_stair_pred = agent_height_stair_proj[:, 0:1, :, :] 125 | fp_map_pred = fp_map_pred / self.map_pred_threshold 126 | fp_stair_pred = fp_stair_pred / self.map_pred_threshold 127 | fp_exp_pred = fp_exp_pred / self.exp_pred_threshold 128 | fp_map_pred = torch.clamp(fp_map_pred, min=0.0, max=1.0) 129 | fp_stair_pred = torch.clamp(fp_stair_pred, min=0.0, max=1.0) 130 | fp_exp_pred = torch.clamp(fp_exp_pred, min=0.0, max=1.0) 131 | 132 | pose_pred = poses_last 133 | 134 | agent_view = torch.zeros(bs, c, 135 | self.map_size_cm // self.resolution, 136 | self.map_size_cm // self.resolution 137 | ).to(self.device) 138 | 139 | x1 = self.map_size_cm // (self.resolution * 2) - self.vision_range // 2 140 | x2 = x1 + self.vision_range 141 | y1 = self.map_size_cm // (self.resolution * 2) 142 | y2 = y1 + self.vision_range 143 | agent_view[:, 0:1, y1:y2, x1:x2] = fp_map_pred 144 | agent_view[:, 1:2, y1:y2, x1:x2] = fp_exp_pred 145 | agent_view[:, 4:-self.num_atts, y1:y2, x1:x2] = torch.clamp( 146 | agent_height_proj[:, 1:-self.num_atts, :, :] / self.cat_pred_threshold, 147 | min=0.0, max=1.0) 148 | agent_view[:, -self.num_atts:, y1:y2, x1:x2] =agent_height_proj[:, -self.num_atts:, :, :] 149 | 150 | # print("SEM MAP: MAX ",agent_height_proj[:, -1, :, :].max(),", MIN: ",agent_height_proj[:, -1, :, :].min()) 151 | # print("SEM MAP/HEIGHT: MAX ",agent_view[:, -1, ...].max(),", MIN: ",agent_view[:, -1, ...].min()) 152 | 153 | agent_view_stair = agent_view.clone().detach() 154 | agent_view_stair[:, 0:1, y1:y2, x1:x2] = fp_stair_pred 155 | 156 | corrected_pose = pose_obs 157 | 158 | def get_new_pose_batch(pose, rel_pose_change): 159 | 160 | pose[:, 1] += rel_pose_change[:, 0] * \ 161 | torch.sin(pose[:, 2] / 57.29577951308232) \ 162 | + rel_pose_change[:, 1] * \ 163 | torch.cos(pose[:, 2] / 57.29577951308232) 164 | 
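# 57.29577951308232 == 180 / pi. Poses are stored in degrees while the relative
# pose change is in radians, so headings are converted to radians for sin/cos
# here and the angular change is converted back to degrees below.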
pose[:, 0] += rel_pose_change[:, 0] * \ 165 | torch.cos(pose[:, 2] / 57.29577951308232) \ 166 | - rel_pose_change[:, 1] * \ 167 | torch.sin(pose[:, 2] / 57.29577951308232) 168 | pose[:, 2] += rel_pose_change[:, 2] * 57.29577951308232 169 | 170 | pose[:, 2] = torch.fmod(pose[:, 2] - 180.0, 360.0) + 180.0 171 | pose[:, 2] = torch.fmod(pose[:, 2] + 180.0, 360.0) - 180.0 172 | 173 | return pose 174 | 175 | current_poses = get_new_pose_batch(poses_last, corrected_pose) 176 | st_pose = current_poses.clone().detach() 177 | 178 | st_pose[:, :2] = - (st_pose[:, :2] 179 | * 100.0 / self.resolution 180 | - self.map_size_cm // (self.resolution * 2)) /\ 181 | (self.map_size_cm // (self.resolution * 2)) 182 | st_pose[:, 2] = 90. - (st_pose[:, 2]) 183 | 184 | rot_mat, trans_mat = get_grid(st_pose, agent_view.size(), 185 | self.device) 186 | 187 | rotated = F.grid_sample(agent_view, rot_mat, align_corners=True) 188 | translated = F.grid_sample(rotated, trans_mat, align_corners=True) 189 | 190 | # translated[:, 18:19, :, :] = -self.max_pool(-translated[:, 18:19, :, :]) 191 | 192 | diff_ob_ex = translated[:, 1:2, :, :] - self.max_pool(translated[:, 0:1, :, :]) 193 | 194 | diff_ob_ex[diff_ob_ex>0.8] = 1.0 195 | diff_ob_ex[diff_ob_ex!=1.0] = 0.0 196 | 197 | # pdb.set_trace() 198 | if not goal_found: 199 | # TODO: better way to get the score 200 | # maps2 = torch.cat((maps_last.unsqueeze(1), translated.unsqueeze(1)), 1) 201 | _score_maps_last=maps_last[:,-self.num_atts:,...] 202 | _score_maps_new=translated[:, -self.num_atts:, ...] 203 | updated_mask=_score_maps_new>0 204 | _s_map=torch.cat((_score_maps_last.unsqueeze(1), _score_maps_new.unsqueeze(1)), 1) 205 | _s_map,_=torch.max(_s_map, 1) 206 | map_score_new = torch.where(updated_mask, _s_map / 2, _score_maps_last) 207 | maps2 = torch.cat((maps_last[:, :-self.num_atts, ...].unsqueeze(1), translated[:, :-self.num_atts, ...].unsqueeze(1)), 1) 208 | map_pred, _ = torch.max(maps2, 1) 209 | map_pred=torch.cat([map_pred,map_score_new],1) 210 | # print("map_score_new: MAX ",map_score_new.max(),", MIN: ",map_score_new.min()) 211 | else: 212 | maps2 = torch.cat((maps_last.unsqueeze(1), translated.unsqueeze(1)), 1) 213 | map_pred, _ = torch.max(maps2, 1) 214 | 215 | 216 | # map_pred=torch.cat((_map_pred, map_score_new), dim=1) 217 | 218 | for i in range(eve_angle.shape[0]): 219 | if eve_angle[i] == 0: 220 | map_pred[i, 0:1, :, :][diff_ob_ex[i] == 1.0] = 0.0 221 | 222 | # stairs view 223 | rot_mat_stair, trans_mat_stair = get_grid(st_pose, agent_view_stair.size(), 224 | self.device) 225 | 226 | rotated_stair = F.grid_sample(agent_view_stair, rot_mat_stair, align_corners=True) 227 | translated_stair = F.grid_sample(rotated_stair, trans_mat_stair, align_corners=True) 228 | 229 | stair_mask = torch.zeros(self.map_size_cm // self.resolution, self.map_size_cm // self.resolution).to(self.device) 230 | 231 | s_y = int(current_poses[0][1]*100/5) 232 | s_x = int(current_poses[0][0]*100/5) 233 | limit_up = self.map_size_cm // self.resolution - self.stair_mask_radius - 1 234 | limit_be = self.stair_mask_radius 235 | if s_y > limit_up: 236 | s_y = limit_up 237 | if s_y < self.stair_mask_radius: 238 | s_y = self.stair_mask_radius 239 | if s_x > limit_up: 240 | s_x = limit_up 241 | if s_x < self.stair_mask_radius: 242 | s_x = self.stair_mask_radius 243 | stair_mask[int(s_y-self.stair_mask_radius):int(s_y+self.stair_mask_radius), int(s_x-self.stair_mask_radius):int(s_x+self.stair_mask_radius)] = self.stair_mask 244 | 245 | translated_stair[0, 0:1, :, :] *= stair_mask 246 | 
translated_stair[0, 1:2, :, :] *= stair_mask 247 | 248 | # translated_stair[:, 13:14, :, :] = -self.max_pool(-translated_stair[:, 13:14, :, :]) 249 | 250 | diff_ob_ex = translated_stair[:, 1:2, :, :] - translated_stair[:, 0:1, :, :] 251 | 252 | diff_ob_ex[diff_ob_ex>0.8] = 1.0 253 | diff_ob_ex[diff_ob_ex!=1.0] = 0.0 254 | 255 | maps3 = torch.cat((maps_last.unsqueeze(1), translated_stair.unsqueeze(1)), 1) 256 | 257 | map_pred_stair, _ = torch.max(maps3, 1) 258 | 259 | for i in range(eve_angle.shape[0]): 260 | if eve_angle[i] == 0: 261 | map_pred_stair[i, 0:1, :, :][diff_ob_ex[i] == 1.0] = 0.0 262 | 263 | 264 | # print("map_pred_-1: MAX ",map_pred[:,-1,...].max(),", MIN: ",map_pred[:,-1,...].min()) 265 | return translated, map_pred, map_pred_stair, current_poses 266 | 267 | 268 | def get_mask(self, step_size): 269 | size = int(step_size) * 2 270 | mask = torch.zeros(size, size) 271 | for i in range(size): 272 | for j in range(size): 273 | if ((i + 0.5) - (size // 2)) ** 2 + \ 274 | ((j + 0.5) - (size // 2)) ** 2 <= \ 275 | step_size ** 2: 276 | mask[i, j] = 1 277 | return mask 278 | 279 | 280 | class FeedforwardNet(nn.Module): 281 | 282 | def __init__(self, input_dim, output_dim): 283 | super(FeedforwardNet, self).__init__() 284 | """ self.layers = nn.Sequential( 285 | nn.Linear(input_dim, 512), 286 | nn.BatchNorm1d(512), 287 | nn.ReLU(inplace=True), 288 | nn.Linear(512, 512), 289 | nn.BatchNorm1d(512), 290 | nn.ReLU(inplace=True), 291 | nn.Linear(512, 256), 292 | nn.BatchNorm1d(256), 293 | nn.ReLU(inplace=True), 294 | nn.Linear(256, output_dim), 295 | ) """ 296 | self.layers = nn.Sequential( 297 | nn.Linear(input_dim, 512), 298 | nn.BatchNorm1d(512), 299 | nn.ReLU(inplace=True), 300 | nn.Linear(512, 256), 301 | nn.BatchNorm1d(256), 302 | nn.ReLU(inplace=True), 303 | nn.Linear(256, output_dim), 304 | ) 305 | 306 | def forward(self, x): 307 | return self.layers(x) -------------------------------------------------------------------------------- /RedNet/RedNet_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | import math 5 | import torch.utils.model_zoo as model_zoo 6 | import utils 7 | from torch.utils.checkpoint import checkpoint 8 | import os 9 | import torchvision 10 | 11 | 12 | class RedNet(nn.Module): 13 | def __init__(self, num_classes=40, pretrained=False): 14 | 15 | super().__init__() 16 | 17 | block = Bottleneck 18 | transblock = TransBasicBlock 19 | layers = [3, 4, 6, 3] 20 | # original resnet 21 | self.inplanes = 64 22 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, 23 | bias=False) 24 | self.bn1 = nn.BatchNorm2d(64) 25 | self.relu = nn.ReLU(inplace=True) 26 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 27 | self.layer1 = self._make_layer(block, 64, layers[0]) 28 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 29 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 30 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 31 | 32 | # resnet for depth channel 33 | self.inplanes = 64 34 | self.conv1_d = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, 35 | bias=False) 36 | self.bn1_d = nn.BatchNorm2d(64) 37 | self.layer1_d = self._make_layer(block, 64, layers[0]) 38 | self.layer2_d = self._make_layer(block, 128, layers[1], stride=2) 39 | self.layer3_d = self._make_layer(block, 256, layers[2], stride=2) 40 | self.layer4_d = self._make_layer(block, 512, layers[3], stride=2) 41 | 42 
| self.inplanes = 512 43 | self.deconv1 = self._make_transpose(transblock, 256, 6, stride=2) 44 | self.deconv2 = self._make_transpose(transblock, 128, 4, stride=2) 45 | self.deconv3 = self._make_transpose(transblock, 64, 3, stride=2) 46 | self.deconv4 = self._make_transpose(transblock, 64, 3, stride=2) 47 | 48 | self.agant0 = self._make_agant_layer(64, 64) 49 | self.agant1 = self._make_agant_layer(64 * 4, 64) 50 | self.agant2 = self._make_agant_layer(128 * 4, 128) 51 | self.agant3 = self._make_agant_layer(256 * 4, 256) 52 | self.agant4 = self._make_agant_layer(512 * 4, 512) 53 | 54 | # final block 55 | self.inplanes = 64 56 | self.final_conv = self._make_transpose(transblock, 64, 3) 57 | 58 | self.final_deconv_custom = nn.ConvTranspose2d(self.inplanes, num_classes, kernel_size=2, 59 | stride=2, padding=0, bias=True) 60 | 61 | self.out5_conv_custom = nn.Conv2d(256, num_classes, kernel_size=1, stride=1, bias=True) 62 | self.out4_conv_custom = nn.Conv2d(128, num_classes, kernel_size=1, stride=1, bias=True) 63 | self.out3_conv_custom = nn.Conv2d(64, num_classes, kernel_size=1, stride=1, bias=True) 64 | self.out2_conv_custom = nn.Conv2d(64, num_classes, kernel_size=1, stride=1, bias=True) 65 | 66 | if pretrained: 67 | self._load_resnet_pretrained() 68 | 69 | def weights_init(self, m): 70 | classname = m.__class__.__name__ 71 | if classname.find('Conv') != -1: 72 | nn.init.kaiming_normal_(m.weight) 73 | if m.bias is not None: 74 | nn.init.zeros_(m.bias) 75 | elif isinstance(m, nn.BatchNorm2d): 76 | m.weight.data.fill_(1) 77 | m.bias.data.zero_() 78 | 79 | 80 | 81 | def _make_layer(self, block, planes, blocks, stride=1): 82 | downsample = None 83 | if stride != 1 or self.inplanes != planes * block.expansion: 84 | downsample = nn.Sequential( 85 | nn.Conv2d(self.inplanes, planes * block.expansion, 86 | kernel_size=1, stride=stride, bias=False), 87 | nn.BatchNorm2d(planes * block.expansion), 88 | ) 89 | 90 | layers = [] 91 | 92 | layers.append(block(self.inplanes, planes, stride, downsample)) 93 | self.inplanes = planes * block.expansion 94 | for i in range(1, blocks): 95 | layers.append(block(self.inplanes, planes)) 96 | 97 | return nn.Sequential(*layers) 98 | 99 | def _make_transpose(self, block, planes, blocks, stride=1): 100 | 101 | upsample = None 102 | if stride != 1: 103 | upsample = nn.Sequential( 104 | nn.ConvTranspose2d(self.inplanes, planes, 105 | kernel_size=2, stride=stride, 106 | padding=0, bias=False), 107 | nn.BatchNorm2d(planes), 108 | ) 109 | elif self.inplanes != planes: 110 | upsample = nn.Sequential( 111 | nn.Conv2d(self.inplanes, planes, 112 | kernel_size=1, stride=stride, bias=False), 113 | nn.BatchNorm2d(planes), 114 | ) 115 | 116 | layers = [] 117 | 118 | for i in range(1, blocks): 119 | layers.append(block(self.inplanes, self.inplanes)) 120 | 121 | layers.append(block(self.inplanes, planes, stride, upsample)) 122 | self.inplanes = planes 123 | 124 | return nn.Sequential(*layers) 125 | 126 | def _make_agant_layer(self, inplanes, planes): 127 | 128 | layers = nn.Sequential( 129 | nn.Conv2d(inplanes, planes, kernel_size=1, 130 | stride=1, padding=0, bias=False), 131 | nn.BatchNorm2d(planes), 132 | nn.ReLU(inplace=True) 133 | ) 134 | return layers 135 | 136 | def _load_resnet_pretrained(self): 137 | pretrain_dict = torch.load(ckpt, map_location='cpu')['model_state'] 138 | prefix = 'module.' 
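# Descriptive note: checkpoints saved from an nn.DataParallel model store every
# parameter key with a leading 'module.' prefix; the comprehension below strips
# that prefix so the keys line up with this unwrapped module's state_dict.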
139 | pretrain_dict = { 140 | (k[len(prefix):] if k[:len(prefix)] == prefix else k): v for k, v in pretrain_dict.items() 141 | } 142 | pretrain_dict.pop('final_deconv_custom.weight') 143 | pretrain_dict.pop('final_deconv_custom.bias') 144 | 145 | pretrain_dict.pop('out5_conv_custom.weight') 146 | pretrain_dict.pop('out5_conv_custom.bias') 147 | 148 | pretrain_dict.pop('out4_conv_custom.weight') 149 | pretrain_dict.pop('out4_conv_custom.bias') 150 | 151 | pretrain_dict.pop('out3_conv_custom.weight') 152 | pretrain_dict.pop('out3_conv_custom.bias') 153 | 154 | pretrain_dict.pop('out2_conv_custom.weight') 155 | pretrain_dict.pop('out2_conv_custom.bias') 156 | 157 | model_dict = {} 158 | state_dict = self.state_dict() 159 | 160 | state_dict.update(model_dict) 161 | self.load_state_dict(state_dict) 162 | 163 | def forward_downsample(self, rgb, depth): 164 | x = self.conv1(rgb) 165 | x = self.bn1(x) 166 | x = self.relu(x) 167 | depth = self.conv1_d(depth) 168 | depth = self.bn1_d(depth) 169 | depth = self.relu(depth) 170 | 171 | fuse0 = x + depth 172 | 173 | x = self.maxpool(fuse0) 174 | depth = self.maxpool(depth) 175 | 176 | # block 1 177 | x = self.layer1(x) 178 | # debug_tensor('post1', depth) 179 | # for i, mod in enumerate(self.layer1_d): 180 | # depth = mod(depth) 181 | # debug_tensor(f'post1d-{i}', depth) 182 | 183 | depth = self.layer1_d(depth) 184 | # debug_tensor('post1d', depth) 185 | 186 | fuse1 = x + depth 187 | # block 2 188 | x = self.layer2(fuse1) 189 | depth = self.layer2_d(depth) 190 | fuse2 = x + depth 191 | # block 3 192 | x = self.layer3(fuse2) 193 | depth = self.layer3_d(depth) 194 | fuse3 = x + depth 195 | # block 4 196 | x = self.layer4(fuse3) 197 | depth = self.layer4_d(depth) 198 | fuse4 = x + depth 199 | 200 | agant4 = self.agant4(fuse4) 201 | 202 | return fuse0, fuse1, fuse2, fuse3, agant4 203 | 204 | def forward_upsample(self, fuse0, fuse1, fuse2, fuse3, agant4, train=False): 205 | 206 | # upsample 1 207 | x = self.deconv1(agant4) 208 | if train: 209 | out5 = self.out5_conv_custom(x) 210 | x = x + self.agant3(fuse3) 211 | # upsample 2 212 | x = self.deconv2(x) 213 | if train: 214 | out4 = self.out4_conv_custom(x) 215 | x = x + self.agant2(fuse2) 216 | # upsample 3 217 | x = self.deconv3(x) 218 | if train: 219 | out3 = self.out3_conv_custom(x) 220 | x = x + self.agant1(fuse1) 221 | # upsample 4 222 | x = self.deconv4(x) 223 | if train: 224 | out2 = self.out2_conv_custom(x) 225 | x = x + self.agant0(fuse0) 226 | # final 227 | x = self.final_conv(x) 228 | 229 | last_layer = x 230 | 231 | out = self.final_deconv_custom(x) 232 | 233 | if train: 234 | return out, out2, out3, out4, out5 235 | 236 | return out 237 | 238 | def forward(self, rgb, depth, train=False): 239 | 240 | fuses = self.forward_downsample(rgb, depth) 241 | 242 | # We only need predictions. 
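# A minimal usage sketch of this forward pass (the batch size and 480x640
# input shape below are illustrative only, not taken from a config file):
#   model = RedNet(num_classes=40)
#   rgb, depth = torch.randn(2, 3, 480, 640), torch.randn(2, 1, 480, 640)
#   scores = model(rgb, depth)             # (2, 40, 480, 640) per-pixel logits
#   outs = model(rgb, depth, train=True)   # main output plus four auxiliary heads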
243 | # features_encoder = fuses[-1] 244 | # scores, features_lastlayer = self.forward_upsample(*fuses) 245 | scores = self.forward_upsample(*fuses, train) 246 | # debug_tensor('scores', scores) 247 | # return features_encoder, features_lastlayer, scores 248 | # print("scores: ", scores.shape) 249 | return scores 250 | 251 | 252 | def conv3x3(in_planes, out_planes, stride=1): 253 | "3x3 convolution with padding" 254 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 255 | padding=1, bias=False) 256 | 257 | 258 | class Bottleneck(nn.Module): 259 | expansion = 4 260 | 261 | def __init__(self, inplanes, planes, stride=1, downsample=None): 262 | super(Bottleneck, self).__init__() 263 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) 264 | self.bn1 = nn.BatchNorm2d(planes) 265 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, 266 | padding=1, bias=False) 267 | self.bn2 = nn.BatchNorm2d(planes) 268 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 269 | self.bn3 = nn.BatchNorm2d(planes * 4) 270 | self.relu = nn.ReLU(inplace=True) 271 | self.downsample = downsample 272 | self.stride = stride 273 | 274 | def forward(self, x): 275 | residual = x 276 | 277 | out = self.conv1(x) 278 | out = self.bn1(out) 279 | out = self.relu(out) 280 | 281 | out = self.conv2(out) 282 | out = self.bn2(out) 283 | out = self.relu(out) 284 | 285 | out = self.conv3(out) 286 | out = self.bn3(out) 287 | 288 | if self.downsample is not None: 289 | residual = self.downsample(x) 290 | 291 | out += residual 292 | out = self.relu(out) 293 | 294 | return out 295 | 296 | class TransBasicBlock(nn.Module): 297 | expansion = 1 298 | 299 | def __init__(self, inplanes, planes, stride=1, upsample=None, **kwargs): 300 | super(TransBasicBlock, self).__init__() 301 | self.conv1 = conv3x3(inplanes, inplanes) 302 | self.bn1 = nn.BatchNorm2d(inplanes) 303 | self.relu = nn.ReLU(inplace=True) 304 | if upsample is not None and stride != 1: 305 | self.conv2 = nn.ConvTranspose2d(inplanes, planes, 306 | kernel_size=3, stride=stride, padding=1, 307 | output_padding=1, bias=False) 308 | else: 309 | self.conv2 = conv3x3(inplanes, planes, stride) 310 | self.bn2 = nn.BatchNorm2d(planes) 311 | self.upsample = upsample 312 | self.stride = stride 313 | 314 | def forward(self, x): 315 | residual = x 316 | 317 | out = self.conv1(x) 318 | out = self.bn1(out) 319 | out = self.relu(out) 320 | 321 | out = self.conv2(out) 322 | out = self.bn2(out) 323 | 324 | if self.upsample is not None: 325 | residual = self.upsample(x) 326 | 327 | out += residual 328 | out = self.relu(out) 329 | 330 | return out 331 | 332 | class BatchNormalize(nn.Module): 333 | r""" 334 | I can't believe this isn't supported 335 | https://github.com/pytorch/vision/issues/157 336 | """ 337 | 338 | def __init__(self, mean, std, device): 339 | super().__init__() 340 | self.mean = torch.tensor(mean, device=device)[None, :, None, None] 341 | self.std = torch.tensor(std, device=device)[None, :, None, None] 342 | 343 | def forward(self, x): 344 | # mean = torch.tensor(self.mean, device=x.device) 345 | # std = torch.tensor(self.std, device=x.device) 346 | # mean = self.mean[None, :, None, None] 347 | # std = self.std[None, :, None, None] 348 | # print(x.shape) 349 | # print(self.mean.shape) 350 | return (x - self.mean) / self.std 351 | # x.sub_(self.mean).div_(self.std) 352 | # return x 353 | 354 | class RedNetResizeWrapper(nn.Module): 355 | def __init__(self, device, resize=True, stabilize=False): 356 | 
super().__init__() 357 | self.rednet = RedNet() 358 | self.rednet.eval() 359 | self.semmap_rgb_norm = BatchNormalize( 360 | mean=[0.493, 0.468, 0.438], 361 | std=[0.544, 0.521, 0.499], 362 | device=device 363 | ) 364 | self.semmap_depth_norm = BatchNormalize( 365 | mean=[0.213], 366 | std=[0.285], 367 | device=device 368 | ) 369 | self.pretrained_size = (480, 640) 370 | self.resize = resize 371 | self.stabilize = stabilize 372 | 373 | def forward(self, rgb, depth, train=False): 374 | r""" 375 | Args: 376 | Raw sensor inputs. 377 | rgb: B x H=256 x W=256 x 3 378 | depth: B x H x W x 1 379 | Returns: 380 | semantic: drop-in replacement for default semantic sensor. B x H x W (no channel, for some reason) 381 | """ 382 | # Not quite sure what depth is produced here. Let's just check 383 | if self.resize: 384 | _, og_h, og_w, _ = rgb.size() # b h w c 385 | rgb = rgb.permute(0, 3, 1, 2) 386 | depth = depth.permute(0, 3, 1, 2) 387 | 388 | rgb = rgb.float() / 255 389 | if self.resize: 390 | rgb = F.interpolate(rgb, self.pretrained_size, mode='bilinear') 391 | depth = F.interpolate(depth, self.pretrained_size, mode='nearest') 392 | # print("rgb: ", rgb.shape) 393 | rgb = self.semmap_rgb_norm(rgb) 394 | 395 | # rgb = torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], 396 | # std=[0.229, 0.224, 0.225])(rgb) 397 | # depth = torchvision.transforms.Normalize(mean=[19050], 398 | # std=[9650])(depth) 399 | 400 | depth_clip = (depth < 1.0).squeeze(1) 401 | # depth_clip = ((depth < 1.0) & (depth > 0.0)).squeeze(1) 402 | # print("depth: ", depth.shape) 403 | depth = self.semmap_depth_norm(depth) 404 | with torch.no_grad(): 405 | scores = self.rednet(rgb, depth, train) 406 | # print("scores: ", scores[0][5][100]) 407 | 408 | if train: 409 | return scores 410 | else: 411 | with torch.no_grad(): 412 | scores = nn.Softmax(dim=1)(scores) 413 | # print("score: ", scores.shape) #torch.Size([1, 40, 480, 640]) 414 | # print("pred: ", pred.shape) # torch.Size([1, 40, 480, 640]) 415 | pred = (torch.max(scores, 1)[1] + 1).float() # B x 480 x 640 416 | 417 | # print("torch: ", pred[0][100]) 418 | pred[torch.max(scores, 1)[0] < 0.8] = 1 419 | 420 | if self.stabilize: # downsample tiny 421 | # Mask out out of depth samples 422 | pred[~depth_clip] = -1 # 41 is UNK in MP3D, but hab-sim renders with -1 423 | # pred = F.interpolate(pred.unsqueeze(1), (15, 20), mode='nearest').squeeze(1) 424 | if self.resize: 425 | pred = F.interpolate(pred.unsqueeze(1), (og_h, og_w), mode='nearest') 426 | 427 | return pred.squeeze(1) 428 | 429 | # def load_rednet(device, ckpt="", resize=True, stabilize=False): 430 | # if not os.path.isfile(ckpt): 431 | # raise Exception(f"invalid path {ckpt} provided for rednet weights") 432 | 433 | # model = RedNetResizeWrapper(device, resize=resize, stabilize=stabilize).to(device) 434 | 435 | # print("=> loading RedNet checkpoint '{}'".format(ckpt)) 436 | # if device.type == 'cuda': 437 | # checkpoint = torch.load(ckpt, map_location='cpu') 438 | # else: 439 | # checkpoint = torch.load(ckpt, map_location=lambda storage, loc: storage) 440 | 441 | # state_dict = checkpoint['model_state'] 442 | # prefix = 'module.' 
443 | # state_dict = { 444 | # (k[len(prefix):] if k[:len(prefix)] == prefix else k): v for k, v in state_dict.items() 445 | # } 446 | # state_dict.pop('final_deconv_custom.weight') 447 | # state_dict.pop('final_deconv_custom.bias') 448 | 449 | # state_dict.pop('out5_conv_custom.weight') 450 | # state_dict.pop('out5_conv_custom.bias') 451 | 452 | # state_dict.pop('out4_conv_custom.weight') 453 | # state_dict.pop('out4_conv_custom.bias') 454 | 455 | # state_dict.pop('out3_conv_custom.weight') 456 | # state_dict.pop('out3_conv_custom.bias') 457 | 458 | # state_dict.pop('out2_conv_custom.weight') 459 | # state_dict.pop('out2_conv_custom.bias') 460 | 461 | # model.rednet.state_dict().update(state_dict) 462 | # # model.rednet.load_state_dict(state_dict) 463 | # print("=> loaded checkpoint '{}' (epoch {})" 464 | # .format(ckpt, checkpoint['epoch'])) 465 | # # print(model) 466 | # print("Model's state_dict:") 467 | # # for param_tensor in model.state_dict(): 468 | # # print(param_tensor, "\t", model.state_dict()[param_tensor].size()) 469 | 470 | # return model 471 | 472 | 473 | def load_rednet(device, ckpt="", resize=True, stabilize=False): 474 | if not os.path.isfile(ckpt): 475 | raise Exception(f"invalid path {ckpt} provided for rednet weights") 476 | 477 | model = RedNetResizeWrapper(device, resize=resize, stabilize=stabilize).to(device) 478 | 479 | print("=> loading RedNet checkpoint '{}'".format(ckpt)) 480 | if device.type == 'cuda': 481 | checkpoint = torch.load(ckpt, map_location='cpu') 482 | else: 483 | checkpoint = torch.load(ckpt, map_location=lambda storage, loc: storage) 484 | 485 | # state_dict = checkpoint['state_dict'] 486 | state_dict = checkpoint['model_state'] 487 | prefix = 'module.' 488 | state_dict = { 489 | (k[len(prefix):] if k[:len(prefix)] == prefix else k): v for k, v in state_dict.items() 490 | } 491 | # model.load_state_dict(state_dict) 492 | model.rednet.load_state_dict(state_dict) 493 | print("=> loaded checkpoint '{}' (epoch {})" 494 | .format(ckpt, checkpoint['epoch'])) 495 | # print(model) 496 | print("Model's state_dict:") 497 | # for param_tensor in model.state_dict(): 498 | # print(param_tensor, "\t", model.state_dict()[param_tensor].size()) 499 | 500 | return model 501 | 502 | def save_ckpt(ckpt_dir, model, optimizer, global_step, epoch, local_count, num_train): 503 | # usually this happens only on the start of a epoch 504 | epoch_float = epoch + (local_count / num_train) 505 | epoch_float = epoch + (local_count / num_train) 506 | state = { 507 | 'global_step': global_step, 508 | 'epoch': epoch_float, 509 | 'state_dict': model.state_dict(), 510 | 'optimizer': optimizer.state_dict(), 511 | } 512 | ckpt_model_filename = "ckpt_epoch_{:0.2f}.pth".format(epoch_float) 513 | path = os.path.join(ckpt_dir, ckpt_model_filename) 514 | torch.save(state, path) 515 | print('{:>2} has been successfully saved'.format(path)) -------------------------------------------------------------------------------- /get_point_score.py: -------------------------------------------------------------------------------- 1 | import sys,pdb,os,time 2 | import torch 3 | from PIL import Image 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | from torchvision.transforms.functional import to_pil_image 7 | from lavis.models import model_zoo 8 | from lavis.models import load_model_and_preprocess 9 | from lavis.common.gradcam import getAttMap 10 | from lavis.models.blip_models.blip_image_text_matching import compute_gradcam 11 | import cv2 12 | import scipy.ndimage 13 | import gzip 14 | 
import json 15 | 16 | def crop_img(image, vis_processors,scales=[1,2,4]): 17 | """ 18 | Crops the image into multiple patches at different scales and visualizes the patches. 19 | 20 | Parameters: 21 | - image_path: The path to the image to be processed. 22 | - scales: A list of scales at which to crop the image. 23 | 24 | Returns: 25 | - A list of PIL Image objects representing the cropped patches. 26 | """ 27 | 28 | # Initialize a list to hold the cropped images 29 | cropped_images = [] 30 | cropped_batch=[] 31 | # Define the crop function using numpy slicing 32 | 33 | def crop_numpy(img, num_x, num_y): 34 | # Convert to numpy array for slicing 35 | np_img = np.array(img) 36 | h, w, _ = np_img.shape 37 | patches=list(Image.fromarray(np_img[i * h // num_y:(i + 1) * h // num_y, 38 | j * w // num_x:(j + 1) * w // num_x, :]) 39 | for i in range(num_y) for j in range(num_x)) 40 | patches_batch = list(vis_processors(Image.fromarray(np_img[i * h // num_y:(i + 1) * h // num_y, 41 | j * w // num_x:(j + 1) * w // num_x, :])).to(device) 42 | for i in range(num_y) for j in range(num_x)) 43 | 44 | return patches, patches_batch 45 | 46 | for scale in scales: 47 | # Calculate number of patches in each axis 48 | num_x, num_y = scale, scale 49 | # Crop using numpy and store in dictionary 50 | patch,batch=crop_numpy(image, num_x, num_y) 51 | cropped_images.extend(patch) 52 | cropped_batch.extend(batch) 53 | 54 | return cropped_images,torch.stack(cropped_batch) 55 | 56 | def crop_img_padding(image, vis_processors,scales=[0.5, 0.25]): 57 | w, h = image.size 58 | patches = [image] 59 | for scale in scales: 60 | # Determine the size of the patches 61 | new_w, new_h = int(w * scale), int(h * scale) 62 | 63 | # Calculate the number of patches to create 64 | patches_across_width = w // new_w 65 | patches_across_height = h // new_h 66 | 67 | for i in range(patches_across_width): 68 | for j in range(patches_across_height): 69 | # Calculate the left, upper, right, and lower pixel coordinates for cropping 70 | left = i * new_w 71 | upper = j * new_h 72 | right = left + new_w 73 | lower = upper + new_h 74 | 75 | # Crop the patch and create a masked image of the original size 76 | patch = image.crop((left, upper, right, lower)) 77 | mask = Image.new("RGB", (w, h), (0, 0, 0)) 78 | mask.paste(patch, (left, upper)) 79 | patches.append(mask) 80 | return patches #from top to down, from large patch to small patch 81 | 82 | def vis_patches(cropped_images,pth): 83 | # Visualize the cropped images 84 | fig, axes = plt.subplots(len(cropped_images), 1, figsize=(10, len(cropped_images) * 5)) 85 | if len(cropped_images) == 1: 86 | axes = [axes] 87 | for ax, img in zip(axes, cropped_images): 88 | ax.imshow(img) 89 | ax.axis('off') 90 | plt.tight_layout() 91 | plt.savefig(pth) 92 | 93 | def save_images_with_scores(batch, itc_score, attributes,directory): 94 | for attr in attributes: 95 | for i, tensor in enumerate(batch): 96 | # Convert the tensor to a PIL Image 97 | mean = torch.tensor([0.48145466, 0.4578275, 0.40821073])[:,None,None].cuda() 98 | std = torch.tensor([0.26862954, 0.26130258, 0.27577711])[:,None,None].cuda() 99 | # pdb.set_trace() 100 | tensor=tensor*std+mean 101 | image = to_pil_image(tensor) 102 | score = itc_score[attr][i].item() 103 | # Save the image with the score in the filename 104 | filename = f"image_{i}_attri_{attr}_score_{score:.4f}.png" 105 | image.save(os.path.join(directory, filename)) 106 | 107 | def depth_to_colormap(depth_array, colormap=cv2.COLORMAP_JET): 108 | # Normalize the depth array to the 
range [0, 255] 109 | normalized_depth = cv2.normalize(depth_array, None, 0, 255, cv2.NORM_MINMAX) 110 | # Convert to unsigned 8-bit type 111 | normalized_depth = normalized_depth.astype(np.uint8) 112 | # Apply colormap 113 | colored_depth = cv2.applyColorMap(normalized_depth, colormap) 114 | colored_depth_pil = Image.fromarray(cv2.cvtColor(colored_depth, cv2.COLOR_BGR2RGB)) 115 | return colored_depth_pil 116 | 117 | def get_pyramid_score(image, vis_processors, txt_processors,score_processor,attributes,blip_devices=None,scales=[1, 2, 4]): 118 | """ 119 | Crops the image into multiple patches at different scales, computes scores for each patch, 120 | normalizes these scores, and maps them back to the original image pixels. 121 | 122 | Parameters: 123 | - image: The PIL Image to be processed. 124 | - vis_processors: A function that processes each patch and returns a tensor suitable for scoring. 125 | - score_processor: A function that takes a batch of patches and returns scores. 126 | - scales: A list of scales at which to crop the image. 127 | 128 | Returns: 129 | - A tuple of (cropped_images, scores_map) where scores_map is a numpy array of the same shape as the original image with scores assigned to each pixel. 130 | """ 131 | 132 | # Initialize lists to hold cropped images and their scores 133 | cropped_images = [] 134 | cropped_batches = [] 135 | 136 | # Define the crop function using numpy slicing 137 | def crop_numpy(img, num_x, num_y): 138 | np_img = np.array(img) 139 | h, w, _ = np_img.shape 140 | return [vis_processors(Image.fromarray(np_img[i * h // num_y:(i + 1) * h // num_y, 141 | j * w // num_x:(j + 1) * w // num_x, :])).to(blip_devices) 142 | for i in range(num_y) for j in range(num_x)] 143 | 144 | 145 | # Process each scale 146 | for scale in scales: 147 | num_x, num_y = scale, scale 148 | patches = crop_numpy(image, num_x, num_y) 149 | cropped_images.extend(patches) 150 | # batch = torch.stack(cropped_images) 151 | 152 | attr_score={} 153 | attr_score_map={} 154 | for attr in attributes: 155 | 156 | image_scores = score_processor({"image": torch.stack(cropped_images), "text_input": attr}, match_head='itc') 157 | attr_score[attr] = image_scores.detach().cpu().numpy() 158 | 159 | # Normalize the scores: TODO 160 | # total_score = sum(image_scores) 161 | # normalized_scores = [score / total_score for score in image_scores] 162 | 163 | # Create a map of scores for the original image 164 | scores_map = np.zeros(np.array(image).shape[:2], dtype=np.float32) 165 | patch_idx = 0 166 | for scale in scales: 167 | num_x, num_y = scale, scale 168 | patch_height = scores_map.shape[0] // num_y 169 | patch_width = scores_map.shape[1] // num_x 170 | for i in range(num_y): 171 | for j in range(num_x): 172 | # scores_map[i * patch_height:(i + 1) * patch_height, j * patch_width:(j + 1) * patch_width] += normalized_scores[patch_idx] 173 | scores_map[i * patch_height:(i + 1) * patch_height, j * patch_width:(j + 1) * patch_width] += image_scores[patch_idx].detach().cpu().numpy() 174 | patch_idx += 1 175 | 176 | # Average the scores map if there is overlap (simple averaging for overlap handling) 177 | # scores_map /= len(scales) 178 | attr_score_map[attr]=scores_map/3 179 | # pdb.set_trace() 180 | return cropped_images, attr_score_map 181 | 182 | def get_pyramid_clip_score(image, vis_processors, txt_processors,score_processor,attributes,blip_devices=None,scales=[1, 2, 4]): 183 | """ 184 | Crops the image into multiple patches at different scales, computes scores for each patch, 185 | normalizes 
these scores, and maps them back to the original image pixels. 186 | 187 | Parameters: 188 | - image: The PIL Image to be processed. 189 | - vis_processors: A function that processes each patch and returns a tensor suitable for scoring. 190 | - score_processor: A function that takes a batch of patches and returns scores. 191 | - scales: A list of scales at which to crop the image. 192 | 193 | Returns: 194 | - A tuple of (cropped_images, scores_map) where scores_map is a numpy array of the same shape as the original image with scores assigned to each pixel. 195 | """ 196 | 197 | # Initialize lists to hold cropped images and their scores 198 | cropped_images = [] 199 | cropped_batches = [] 200 | 201 | # Define the crop function using numpy slicing 202 | def crop_numpy(img, num_x, num_y): 203 | np_img = np.array(img) 204 | h, w, _ = np_img.shape 205 | return [vis_processors(Image.fromarray(np_img[i * h // num_y:(i + 1) * h // num_y, 206 | j * w // num_x:(j + 1) * w // num_x, :])).to(blip_devices) 207 | for i in range(num_y) for j in range(num_x)] 208 | 209 | # Process each scale 210 | for scale in scales: 211 | num_x, num_y = scale, scale 212 | patches = crop_numpy(image, num_x, num_y) 213 | cropped_images.extend(patches) 214 | # batch = torch.stack(cropped_images) 215 | 216 | attr_score_map = {} 217 | for att_i,attr in enumerate(attributes): 218 | sample = {"image": torch.stack(cropped_images), "text_input": txt_processors(attr)} 219 | 220 | clip_features = score_processor.extract_features(sample) 221 | 222 | features_image = clip_features.image_embeds_proj 223 | features_text = clip_features.text_embeds_proj 224 | 225 | image_scores = (features_image @ features_text.t()).squeeze() 226 | 227 | scores_map = np.zeros(np.array(image).shape[:2], dtype=np.float32) 228 | patch_idx = 0 229 | for scale in scales: 230 | num_x, num_y = scale, scale 231 | patch_height = scores_map.shape[0] // num_y 232 | patch_width = scores_map.shape[1] // num_x 233 | for i in range(num_y): 234 | for j in range(num_x): 235 | # scores_map[i * patch_height:(i + 1) * patch_height, j * patch_width:(j + 1) * patch_width] += normalized_scores[patch_idx] 236 | scores_map[i * patch_height:(i + 1) * patch_height, j * patch_width:(j + 1) * patch_width] += \ 237 | image_scores[patch_idx].detach().cpu().numpy() 238 | patch_idx += 1 239 | 240 | attr_score_map[attr] = scores_map / len(scales) 241 | 242 | return cropped_images, attr_score_map 243 | 244 | def get_att_score(image, vis_processors, txt_processors,score_processor,attributes,blip_devices=None): 245 | img=vis_processors(image).unsqueeze(0).to(blip_devices) 246 | 247 | txt = txt_processors(attributes[0]) 248 | txt_tokens = score_processor.tokenizer(txt, return_tensors="pt").to(blip_devices) 249 | # pdb.set_trace() 250 | gradcam, _ = compute_gradcam(score_processor, img, txt, txt_tokens, block_num=7) 251 | gradcam_np = gradcam[0][1].numpy().astype(np.float32) 252 | avg_gradcam = getAttMap(np.array(rgb_obs)/ 255., gradcam_np, blur=True,overlap=False) 253 | return avg_gradcam 254 | 255 | def save_scores_map(scores_map, filename='score_map.png'): 256 | """ 257 | Visualizes and saves the scores map with a color bar. 258 | 259 | Parameters: 260 | - scores_map: A numpy array with scores assigned to each pixel. 261 | - filename: The filename to save the image to. 
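Note: the scores are min-max normalised before plotting, so the colour bar
only reflects relative scores within this one figure.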
262 | """ 263 | 264 | # Normalize scores_map to the range 0-1 for better color mapping 265 | normalized_map = (scores_map - np.min(scores_map)) / (np.max(scores_map) - np.min(scores_map)) 266 | 267 | # Create a figure and an axes object 268 | fig, ax = plt.subplots() 269 | 270 | # Display the image 271 | cax = ax.imshow(normalized_map, cmap='viridis') # Choose a colormap that you prefer 272 | 273 | # Create a color bar 274 | cbar = fig.colorbar(cax, ax=ax) 275 | cbar.set_label('Score') 276 | 277 | # Save the figure 278 | plt.axis('off') # Turn off axis 279 | plt.savefig(filename, bbox_inches='tight', pad_inches=0) 280 | 281 | # Close the plot to free up memory 282 | plt.close(fig) 283 | 284 | def overlay_attention_map(image, scores_map, filename='attention_overlay.png'): 285 | """ 286 | Applies a smoothed attention map over an image to highlight areas based on the scores, 287 | and then saves the resulting image. 288 | 289 | Parameters: 290 | - image: A PIL Image to be enhanced. 291 | - scores_map: A numpy array with scores assigned to each pixel. 292 | - filename: The filename to save the image to. 293 | """ 294 | 295 | # Normalize scores_map to the range 0-1 for better color mapping 296 | normalized_map = (scores_map - np.min(scores_map)) / (np.max(scores_map) - np.min(scores_map)) 297 | 298 | # Apply Gaussian smoothing to the scores map 299 | smoothed_map = scipy.ndimage.gaussian_filter(normalized_map, sigma=2) # Adjust sigma as needed for the desired smoothness 300 | 301 | # Convert the original image to a numpy array for processing 302 | img_array = np.array(image) 303 | 304 | # Convert the smoothed scores map to a heatmap 305 | plt.figure(figsize=(10, 6)) 306 | plt.imshow(img_array, cmap='gray', interpolation='nearest') # Display the original image 307 | plt.imshow(smoothed_map, cmap='jet', alpha=0.6, interpolation='bilinear') # Overlay the attention map 308 | plt.colorbar() # Optionally add a color bar to indicate scoring scale 309 | plt.axis('off') # Hide axes 310 | # Save the figure 311 | plt.savefig(filename, bbox_inches='tight', pad_inches=0) 312 | plt.close() 313 | 314 | def doubly_right_detection(image,model, vis_processors,target,blip_devices=None): 315 | image = vis_processors(image).unsqueeze(0).to(blip_devices) 316 | out=model.generate({"image": image, "prompt": f"Question: is {target} shown in this image? 
Answer:"}) 317 | print(out) 318 | if "yes" in out[0].lower(): 319 | return True 320 | else: 321 | return False 322 | 323 | class Pos_Queue: 324 | def __init__(self, capacity): 325 | self.capacity = capacity 326 | self.queue = [] 327 | 328 | def add_element(self, element): 329 | # Add the new element to the queue 330 | self.queue.append(element) 331 | # Check if the queue is beyond capacity 332 | if len(self.queue) > self.capacity: 333 | # If it's full, pop and return the oldest element 334 | return self.queue.pop(0) 335 | # If not full, return None 336 | return None 337 | 338 | 339 | 340 | if __name__ == "__main__": 341 | # setup model 342 | # device = torch.device("cuda") if torch.cuda.is_available() else "cpu" 343 | 344 | # model, vis_processors, text_processors = load_model_and_preprocess(name="blip2_feature_extractor", 345 | # model_type="pretrain", is_eval=True, 346 | # device=device) 347 | 348 | # model, vis_processors, text_processors = load_model_and_preprocess("blip2_image_text_matching", "pretrain", 349 | # device=device, is_eval=True) 350 | 351 | # model, vis_processors, text_processors = load_model_and_preprocess("blip_image_text_matching", "large", 352 | # device=device, is_eval=True) 353 | 354 | # model, vis_processors, _ = load_model_and_preprocess(name="blip2_opt", model_type="pretrain_opt2.7b", 355 | # is_eval=True, device=device) 356 | 357 | # model, vis_processors, text_processors = load_model_and_preprocess("clip_feature_extractor", model_type="ViT-B-32", 358 | # is_eval=True, device=device) 359 | # load the observation 360 | # data_root="/home/mmvc/Desktop/L3MVN_Reproduce/L3MVN/vis/3d_score" 361 | # obs = np.load(os.path.join(data_root,"sample_T.npy"))#0:3 rgb, 4 depth 362 | 363 | # get rgb 364 | # rgb=obs[:,:,0:3].astype(np.uint8) 365 | # rgb_obs=Image.fromarray(rgb) 366 | # rgb_obs.save(os.path.join(data_root,"rgb.png")) 367 | # patches,batch=crop_img(rgb_obs,vis_processors["eval"]) 368 | # vis_patches(patches,os.path.join(data_root,"patches.png")) 369 | 370 | # get detph 371 | # depth=obs[:,:,4] 372 | # depth_img=depth_to_colormap(depth,) 373 | # depth_img.save(os.path.join(data_root,"depth.png")) 374 | 375 | #batch img and list txt 3.2s 376 | # txt_list=[] 377 | # for i in range(batch.shape[0]): 378 | # txt_list.append(txt) 379 | # for i in range(5): 380 | # itc_score = model({"image": batch, "text_input": txt_list}, match_head='itc') 381 | 382 | # single img and single txt 8.13s 383 | # txt="chair" 384 | # for j in range(5): 385 | # for i in range(batch.shape[0]): 386 | # itc_score = model({"image": batch[i][None,:], "text_input": txt}, match_head='itc') 387 | # print(itc_score) 388 | #batch img and single txt 3.1s 389 | # "leg", 390 | # "backrest", 391 | # "seat", 392 | # "armrest" 393 | 394 | # attributes=['chair backrest', 'chair seat', 'chair leg', 'chair armrest'] 395 | # target=["chair"] 396 | 397 | t=time.time() 398 | # ------------- multi_scale approach ------------------ 399 | # attributes=['object with chair leg, chair backrest, chair seat, and chair armrest'] 400 | # imgs,score_mape=get_pyramid_score(rgb_obs,vis_processors["eval"],text_processors["eval"],model,attributes,device) 401 | # imgs,score_mape=get_pyramid_score_feature(rgb_obs,vis_processors["eval"],text_processors["eval"],model,attributes,device) 402 | 403 | # ------------- gradcam approach ------------- 404 | # img=vis_processors["eval"](rgb_obs).unsqueeze(0).to(device) 405 | # txt = text_processors["eval"](attributes[0]) 406 | # txt_tokens = model.tokenizer(txt, 
return_tensors="pt").to(device) 407 | # gradcam, _ = compute_gradcam(model, img, txt, txt_tokens, block_num=7) 408 | # gradcam_np = gradcam[0][1].numpy().astype(np.float32) 409 | # avg_gradcam = getAttMap(np.array(rgb_obs)/ 255., gradcam_np, blur=True,overlap=False) 410 | 411 | # fig, ax = plt.subplots(1, 1, figsize=(10, 10)) 412 | # ax.imshow(avg_gradcam) 413 | # ax.axis('off') 414 | # plt.tight_layout() 415 | # plt.savefig(os.path.join(data_root, f"grad_score_map.png")) 416 | 417 | # ------------------ doubly right detection ------------------ 418 | # print(doubly_right_detection(rgb_obs,vis_processors["eval"],target[0],device)) 419 | # end_t=time.time() 420 | 421 | # ------------------ CLIP multi-scale ------------------ 422 | # imgs,score_mape=get_pyramid_clip_score(rgb_obs,vis_processors["eval"],text_processors["eval"],model,attributes,device) 423 | 424 | # check dataset 425 | data_pth='data/objectgoal_hm3d/val_mini/content/TEEsavR23oF.json.gz' 426 | with gzip.open(data_pth, 'r') as f: 427 | data = json.loads(f.read().decode('utf-8')) 428 | for k in data.keys(): 429 | print(k,len(data[k])) 430 | print("========================") 431 | print(data['episodes'][1]) 432 | print("========================") 433 | print(data['episodes'][2]) 434 | print("========================") 435 | print(data['episodes'][3]) 436 | end_t=time.time() 437 | 438 | # for k in score_mape.keys(): 439 | # overlay_attention_map(rgb_obs,score_mape[k],os.path.join(data_root,f"clip_score_map_{k}.png")) 440 | 441 | print(f"duration: {end_t-t} s") 442 | 443 | # print('The image feature and text feature has a cosine similarity of %.4f'%itc_score) 444 | -------------------------------------------------------------------------------- /envs/habitat/objectgoal_env.py: -------------------------------------------------------------------------------- 1 | import json 2 | import bz2 3 | import gzip 4 | import _pickle as cPickle 5 | import gym 6 | import numpy as np 7 | import quaternion 8 | import skimage.morphology 9 | import habitat 10 | 11 | from envs.utils.fmm_planner import FMMPlanner 12 | from constants import coco_categories, hm3d_category 13 | import envs.utils.pose as pu 14 | 15 | 16 | class ObjectGoal_Env(habitat.RLEnv): 17 | """The Object Goal Navigation environment class. The class is responsible 18 | for loading the dataset, generating episodes, and computing evaluation 19 | metrics. 
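The per-scene ground-truth semantic maps loaded from the dataset info file in
__init__ are used both to build episodes and to compute success, SPL and
distance-to-goal.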
20 | """ 21 | 22 | def __init__(self, args, rank, config_env, dataset): 23 | self.args = args 24 | self.rank = rank 25 | 26 | super().__init__(config_env, dataset) 27 | 28 | # Loading dataset info file 29 | self.split = config_env.DATASET.SPLIT 30 | self.episodes_dir = config_env.DATASET.EPISODES_DIR.format( 31 | split=self.split) 32 | 33 | dataset_info_file = self.episodes_dir + \ 34 | "{split}_info.pbz2".format(split=self.split) 35 | with bz2.BZ2File(dataset_info_file, 'rb') as f: 36 | self.dataset_info = cPickle.load(f) 37 | 38 | # Specifying action and observation space 39 | self.action_space = gym.spaces.Discrete(3) 40 | 41 | self.observation_space = gym.spaces.Box(0, 255, 42 | (3, args.frame_height, 43 | args.frame_width), 44 | dtype='uint8') 45 | 46 | # Initializations 47 | self.episode_no = 0 48 | 49 | # Scene info 50 | self.last_scene_path = None 51 | self.scene_path = None 52 | self.scene_name = None 53 | 54 | # Episode Dataset info 55 | self.eps_data = None 56 | self.eps_data_idx = None 57 | self.gt_planner = None 58 | self.object_boundary = None 59 | self.goal_idx = None 60 | self.goal_name = None 61 | self.map_obj_origin = None 62 | self.starting_loc = None 63 | self.starting_distance = None 64 | 65 | # Episode tracking info 66 | self.curr_distance = None 67 | self.prev_distance = None 68 | self.timestep = None 69 | self.stopped = None 70 | self.path_length = None 71 | self.last_sim_location = None 72 | self.trajectory_states = [] 73 | self.info = {} 74 | self.info['distance_to_goal'] = None 75 | self.info['spl'] = None 76 | self.info['success'] = None 77 | 78 | fileName = 'data/matterport_category_mappings.tsv' 79 | 80 | text = '' 81 | lines = [] 82 | items = [] 83 | self.hm3d_semantic_mapping={} 84 | 85 | with open(fileName, 'r') as f: 86 | text = f.read() 87 | lines = text.split('\n') 88 | 89 | for l in lines: 90 | items.append(l.split(' ')) 91 | 92 | for i in items: 93 | if len(i) > 3: 94 | self.hm3d_semantic_mapping[i[2]] = i[-1] 95 | 96 | def load_new_episode(self): 97 | """The function loads a fixed episode from the episode dataset. This 98 | function is used for evaluating a trained model on the val split. 
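Start positions are converted from Habitat world coordinates to ground-truth
map indices at 20 cells per metre, and the FMM distance field on that map
gives the starting geodesic distance to the goal.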
99 | """ 100 | 101 | args = self.args 102 | self.scene_path = self.habitat_env.sim.curr_scene_name 103 | scene_name = self.scene_path.split("/")[-1].split(".")[0] 104 | 105 | if self.scene_path != self.last_scene_path: 106 | episodes_file = self.episodes_dir + \ 107 | "content/{}_episodes.json.gz".format(scene_name) 108 | 109 | print("Loading episodes from: {}".format(episodes_file)) 110 | with gzip.open(episodes_file, 'r') as f: 111 | self.eps_data = json.loads( 112 | f.read().decode('utf-8'))["episodes"] 113 | 114 | self.eps_data_idx = 0 115 | self.last_scene_path = self.scene_path 116 | 117 | # Load episode info 118 | episode = self.eps_data[self.eps_data_idx] 119 | self.eps_data_idx += 1 120 | self.eps_data_idx = self.eps_data_idx % len(self.eps_data) 121 | pos = episode["start_position"] 122 | rot = quaternion.from_float_array(episode["start_rotation"]) 123 | 124 | goal_name = episode["object_category"] 125 | goal_idx = episode["object_id"] 126 | floor_idx = episode["floor_id"] 127 | 128 | # Load scene info 129 | scene_info = self.dataset_info[scene_name] 130 | sem_map = scene_info[floor_idx]['sem_map'] 131 | map_obj_origin = scene_info[floor_idx]['origin'] 132 | 133 | # Setup ground truth planner 134 | object_boundary = args.success_dist 135 | map_resolution = args.map_resolution 136 | selem = skimage.morphology.disk(2) 137 | traversible = skimage.morphology.binary_dilation( 138 | sem_map[0], selem) != True 139 | traversible = 1 - traversible 140 | planner = FMMPlanner(traversible) 141 | selem = skimage.morphology.disk( 142 | int(object_boundary * 100. / map_resolution)) 143 | goal_map = skimage.morphology.binary_dilation( 144 | sem_map[goal_idx + 1], selem) != True 145 | goal_map = 1 - goal_map 146 | planner.set_multi_goal(goal_map) 147 | 148 | # Get starting loc in GT map coordinates 149 | x = -pos[2] 150 | y = -pos[0] 151 | min_x, min_y = map_obj_origin / 100.0 152 | map_loc = int((-y - min_y) * 20.), int((-x - min_x) * 20.) 153 | 154 | self.gt_planner = planner 155 | self.starting_loc = map_loc 156 | self.object_boundary = object_boundary 157 | self.goal_idx = goal_idx 158 | self.goal_name = goal_name 159 | self.map_obj_origin = map_obj_origin 160 | 161 | self.starting_distance = self.gt_planner.fmm_dist[self.starting_loc]\ 162 | / 20.0 + self.object_boundary 163 | self.prev_distance = self.starting_distance 164 | self._env.sim.set_agent_state(pos, rot) 165 | 166 | # The following two should match approximately 167 | # print(starting_loc) 168 | # print(self.sim_continuous_to_sim_map(self.get_sim_location())) 169 | 170 | obs = self._env.sim.get_observations_at(pos, rot) 171 | 172 | return obs 173 | 174 | def generate_new_episode(self): 175 | """The function generates a random valid episode. This function is used 176 | for training a model on the train split. 
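A floor and a goal category present on that floor are sampled at random, and
the start location is restricted to traversible cells whose geodesic distance
to the goal lies roughly between args.min_d and args.max_d.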
177 | """ 178 | 179 | args = self.args 180 | 181 | self.scene_path = self.habitat_env.sim.config.SCENE 182 | scene_name = self.scene_path.split("/")[-1].split(".")[0] 183 | 184 | scene_info = self.dataset_info[scene_name] 185 | map_resolution = args.map_resolution 186 | 187 | floor_idx = np.random.randint(len(scene_info.keys())) 188 | floor_height = scene_info[floor_idx]['floor_height'] 189 | sem_map = scene_info[floor_idx]['sem_map'] 190 | map_obj_origin = scene_info[floor_idx]['origin'] 191 | 192 | cat_counts = sem_map.sum(2).sum(1) 193 | possible_cats = list(np.arange(6)) 194 | 195 | for i in range(6): 196 | if cat_counts[i + 1] == 0: 197 | possible_cats.remove(i) 198 | 199 | object_boundary = args.success_dist 200 | 201 | loc_found = False 202 | while not loc_found: 203 | if len(possible_cats) == 0: 204 | print("No valid objects for {}".format(floor_height)) 205 | eps = eps - 1 206 | continue 207 | 208 | goal_idx = np.random.choice(possible_cats) 209 | 210 | for key, value in coco_categories.items(): 211 | if value == goal_idx: 212 | goal_name = key 213 | 214 | selem = skimage.morphology.disk(2) 215 | traversible = skimage.morphology.binary_dilation( 216 | sem_map[0], selem) != True 217 | traversible = 1 - traversible 218 | 219 | planner = FMMPlanner(traversible) 220 | 221 | selem = skimage.morphology.disk( 222 | int(object_boundary * 100. / map_resolution)) 223 | goal_map = skimage.morphology.binary_dilation( 224 | sem_map[goal_idx + 1], selem) != True 225 | goal_map = 1 - goal_map 226 | 227 | planner.set_multi_goal(goal_map) 228 | 229 | m1 = sem_map[0] > 0 230 | m2 = planner.fmm_dist > (args.min_d - object_boundary) * 20.0 231 | m3 = planner.fmm_dist < (args.max_d - object_boundary) * 20.0 232 | 233 | possible_starting_locs = np.logical_and(m1, m2) 234 | possible_starting_locs = np.logical_and( 235 | possible_starting_locs, m3) * 1. 236 | if possible_starting_locs.sum() != 0: 237 | loc_found = True 238 | else: 239 | print("Invalid object: {} / {} / {}".format( 240 | scene_name, floor_height, goal_name)) 241 | possible_cats.remove(goal_idx) 242 | scene_info[floor_idx]["sem_map"][goal_idx + 1, :, :] = 0. 243 | self.dataset_info[scene_name][floor_idx][ 244 | "sem_map"][goal_idx + 1, :, :] = 0. 245 | 246 | loc_found = False 247 | while not loc_found: 248 | pos = self._env.sim.sample_navigable_point() 249 | x = -pos[2] 250 | y = -pos[0] 251 | min_x, min_y = map_obj_origin / 100.0 252 | map_loc = int((-y - min_y) * 20.), int((-x - min_x) * 20.) 
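# Accept the sampled point only if it lies on the chosen floor (within
# floor_thr centimetres of its height) and falls inside the precomputed set
# of valid starting cells.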
253 | if abs(pos[1] - floor_height) < args.floor_thr / 100.0 and \ 254 | possible_starting_locs[map_loc[0], map_loc[1]] == 1: 255 | loc_found = True 256 | 257 | agent_state = self._env.sim.get_agent_state(0) 258 | rotation = agent_state.rotation 259 | rvec = quaternion.as_rotation_vector(rotation) 260 | rvec[1] = np.random.rand() * 2 * np.pi 261 | rot = quaternion.from_rotation_vector(rvec) 262 | 263 | self.gt_planner = planner 264 | self.starting_loc = map_loc 265 | self.object_boundary = object_boundary 266 | self.goal_idx = goal_idx 267 | self.goal_name = goal_name 268 | self.map_obj_origin = map_obj_origin 269 | 270 | self.starting_distance = self.gt_planner.fmm_dist[self.starting_loc] \ 271 | / 20.0 + self.object_boundary 272 | self.prev_distance = self.starting_distance 273 | 274 | self._env.sim.set_agent_state(pos, rot) 275 | 276 | # The following two should match approximately 277 | # print(starting_loc) 278 | # print(self.sim_continuous_to_sim_map(self.get_sim_location())) 279 | 280 | obs = self._env.sim.get_observations_at(pos, rot) 281 | 282 | return obs 283 | 284 | def sim_map_to_sim_continuous(self, coords): 285 | """Converts ground-truth 2D Map coordinates to absolute Habitat 286 | simulator position and rotation. 287 | """ 288 | agent_state = self._env.sim.get_agent_state(0) 289 | y, x = coords 290 | min_x, min_y = self.map_obj_origin / 100.0 291 | 292 | cont_x = x / 20. + min_x 293 | cont_y = y / 20. + min_y 294 | agent_state.position[0] = cont_y 295 | agent_state.position[2] = cont_x 296 | 297 | rotation = agent_state.rotation 298 | rvec = quaternion.as_rotation_vector(rotation) 299 | 300 | if self.args.train_single_eps: 301 | rvec[1] = 0.0 302 | else: 303 | rvec[1] = np.random.rand() * 2 * np.pi 304 | rot = quaternion.from_rotation_vector(rvec) 305 | 306 | return agent_state.position, rot 307 | 308 | def sim_continuous_to_sim_map(self, sim_loc): 309 | """Converts absolute Habitat simulator pose to ground-truth 2D Map 310 | coordinates. 311 | """ 312 | x, y, o = sim_loc 313 | min_x, min_y = self.map_obj_origin / 100.0 314 | x, y = int((-x - min_x) * 20.), int((-y - min_y) * 20.) 315 | 316 | o = np.rad2deg(o) + 180.0 317 | return y, x, o 318 | 319 | def reset(self): 320 | """Resets the environment to a new episode. 
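A new scene is loaded every args.num_train_episodes episodes; on the val
split a fixed episode is read from disk, otherwise a random episode is
generated.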
321 | 322 | Returns: 323 | obs (ndarray): RGBD observations (4 x H x W) 324 | info (dict): contains timestep, pose, goal category and 325 | evaluation metric info 326 | """ 327 | args = self.args 328 | new_scene = self.episode_no % args.num_train_episodes == 0 329 | 330 | self.scene = self._env.sim.semantic_annotations() 331 | 332 | self.episode_no += 1 333 | 334 | # Initializations 335 | self.timestep = 0 336 | self.stopped = False 337 | self.path_length = 1e-5 338 | self.trajectory_states = [] 339 | 340 | if new_scene: 341 | obs = super().reset() 342 | self.scene_name = self.habitat_env.sim.curr_scene_name 343 | print("Changing scene: {}/{}".format(self.rank, self.scene_name)) 344 | 345 | self.scene_path = self.habitat_env.sim.curr_scene_name 346 | 347 | if self.split == "val": 348 | obs = self.load_new_episode() 349 | else: 350 | obs = self.generate_new_episode() 351 | 352 | # print("obs: ,", obs) 353 | # print("obs shape: ,", obs.shape) 354 | rgb = obs['rgb'].astype(np.uint8) 355 | depth = obs['depth'] 356 | semantic = self._preprocess_semantic(obs["semantic"]) 357 | # print("rgb shape: ,", rgb.shape) 358 | # print("depth shape: ,", depth.shape) 359 | # print("semantic shape: ,", semantic.shape) 360 | 361 | state = np.concatenate((rgb, depth, semantic), axis=2).transpose(2, 0, 1) 362 | self.last_sim_location = self.get_sim_location() 363 | 364 | # Set info 365 | self.info['time'] = self.timestep 366 | self.info['sensor_pose'] = [0., 0., 0.] 367 | self.info['goal_cat_id'] = self.goal_idx 368 | self.info['goal_name'] = hm3d_category[self.goal_idx] 369 | 370 | return state, self.info 371 | 372 | def step(self, action): 373 | """Function to take an action in the environment. 374 | 375 | Args: 376 | action (dict): 377 | dict with following keys: 378 | 'action' (int): 0: stop, 1: forward, 2: left, 3: right 379 | 380 | Returns: 381 | obs (ndarray): RGBD observations (4 x H x W) 382 | reward (float): amount of reward returned after previous action 383 | done (bool): whether the episode has ended 384 | info (dict): contains timestep, pose, goal category and 385 | evaluation metric info 386 | """ 387 | action = action["action"] 388 | if action == 0: 389 | self.stopped = True 390 | # Not sending stop to simulator, resetting manually 391 | action = 3 392 | 393 | obs, rew, done, _ = super().step(action) 394 | 395 | # Get pose change 396 | dx, dy, do = self.get_pose_change() 397 | self.info['sensor_pose'] = [dx, dy, do] 398 | self.path_length += pu.get_l2_distance(0, dx, 0, dy) 399 | 400 | spl, success, dist = 0., 0., 0. 
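# The navigation metrics are only computed once the episode terminates;
# until then they are reported as zeros in info.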
401 | if done: 402 | spl, success, dist = self.get_metrics() 403 | self.info['distance_to_goal'] = dist 404 | self.info['spl'] = spl 405 | self.info['success'] = success 406 | 407 | rgb = obs['rgb'].astype(np.uint8) 408 | depth = obs['depth'] 409 | semantic = self._preprocess_semantic(obs["semantic"]) 410 | state = np.concatenate((rgb, depth, semantic), axis=2).transpose(2, 0, 1) 411 | 412 | self.timestep += 1 413 | self.info['time'] = self.timestep 414 | 415 | return state, rew, done, self.info 416 | 417 | def _preprocess_semantic(self, semantic): 418 | # print("*********semantic type: ", type(semantic)) 419 | se = list(set(semantic.ravel())) 420 | # print(se) # [] 421 | for i in range(len(se)): 422 | if self.scene.objects[se[i]] is not None and self.scene.objects[se[i]].category.name() in coco_categories: 423 | # print(self.scene.objects[se[i]].id) 424 | # print(self.scene.objects[se[i]].category.index()) 425 | # print(type(self.scene.objects[se[i]].category.index()) ) # int 426 | semantic[semantic==se[i]] = coco_categories[self.scene.objects[se[i]].category.name()]+1 427 | # print(self.scene.objects[se[i]+1].category.name()) 428 | else : 429 | semantic[ 430 | semantic==se[i] 431 | ] = 0 432 | 433 | # se = list(set(semantic.ravel())) 434 | # print("semantic: ", se) # [] 435 | # semantic = np.expand_dims(semantic.astype(np.uint8), 2) 436 | return semantic 437 | 438 | def get_reward_range(self): 439 | """This function is not used, Habitat-RLEnv requires this function""" 440 | return (0., 1.0) 441 | 442 | def get_reward(self, observations): 443 | curr_loc = self.sim_continuous_to_sim_map(self.get_sim_location()) 444 | self.curr_distance = self.gt_planner.fmm_dist[curr_loc[0], 445 | curr_loc[1]] / 20.0 446 | 447 | reward = (self.prev_distance - self.curr_distance) * \ 448 | self.args.reward_coeff 449 | 450 | self.prev_distance = self.curr_distance 451 | return reward 452 | 453 | def get_llm_distance(self, target_point_map, frontier_loc_g): 454 | 455 | tpm = len(list(set(target_point_map.ravel()))) -1 456 | frontier_dis_g = 0 457 | if tpm > 1: 458 | curr_loc = self.sim_continuous_to_sim_map(self.get_sim_location()) 459 | frontier_loc_g[0][0] = frontier_loc_g[0][0] - target_point_map.shape[0]/2 + curr_loc[0] 460 | frontier_loc_g[1][0] = frontier_loc_g[1][0] - target_point_map.shape[1]/2 + curr_loc[1] 461 | (aa, bb) = self.gt_planner.traversible.shape 462 | if frontier_loc_g[0][0] > aa-1: 463 | frontier_loc_g[0][0] = aa-1 464 | if frontier_loc_g[1][0] > bb-1: 465 | frontier_loc_g[1][0] = bb -1 466 | frontier_dis_g = self.gt_planner.fmm_dist[frontier_loc_g[0], 467 | frontier_loc_g[1]] / 20.0 468 | for lay in range(tpm): 469 | frontier_loc = np.where(target_point_map == lay+1) 470 | curr_loc = self.sim_continuous_to_sim_map(self.get_sim_location()) 471 | frontier_loc[0][0] = frontier_loc[0][0] - target_point_map.shape[0]/2 + curr_loc[0] 472 | frontier_loc[1][0] = frontier_loc[1][0] - target_point_map.shape[1]/2 + curr_loc[1] 473 | (aa, bb) = self.gt_planner.traversible.shape 474 | if frontier_loc[0][0] > aa-1: 475 | frontier_loc[0][0] = aa-1 476 | if frontier_loc[1][0] > bb-1: 477 | frontier_loc[1][0] = bb -1 478 | frontier_distance = self.gt_planner.fmm_dist[frontier_loc[0], 479 | frontier_loc[1]] / 20.0 480 | if frontier_distance < frontier_dis_g: 481 | return 0 482 | return 1 483 | else: 484 | return 0 485 | 486 | 487 | def get_metrics(self): 488 | """This function computes evaluation metrics for the Object Goal task 489 | 490 | Returns: 491 | spl (float): Success weighted by Path Length 492 | 
(See https://arxiv.org/pdf/1807.06757.pdf) 493 | success (int): 0: Failure, 1: Successful 494 | dist (float): Distance to Success (DTS), distance of the agent 495 | from the success threshold boundary in meters. 496 | (See https://arxiv.org/pdf/2007.00643.pdf) 497 | """ 498 | curr_loc = self.sim_continuous_to_sim_map(self.get_sim_location()) 499 | dist = self.gt_planner.fmm_dist[curr_loc[0], curr_loc[1]] / 20.0 500 | if dist == 0.0: 501 | success = 1 502 | else: 503 | success = 0 504 | spl = min(success * self.starting_distance / self.path_length, 1) 505 | return spl, success, dist 506 | 507 | def get_done(self, observations): 508 | if self.info['time'] >= self.args.max_episode_length - 1: 509 | done = True 510 | elif self.stopped: 511 | done = True 512 | else: 513 | done = False 514 | return done 515 | 516 | def get_info(self, observations): 517 | """This function is not used, Habitat-RLEnv requires this function""" 518 | info = {} 519 | return info 520 | 521 | def get_spaces(self): 522 | """Returns observation and action spaces for the ObjectGoal task.""" 523 | return self.observation_space, self.action_space 524 | 525 | def get_sim_location(self): 526 | """Returns x, y, o pose of the agent in the Habitat simulator.""" 527 | 528 | agent_state = super().habitat_env.sim.get_agent_state(0) 529 | x = -agent_state.position[2] 530 | y = -agent_state.position[0] 531 | axis = quaternion.as_euler_angles(agent_state.rotation)[0] 532 | if (axis % (2 * np.pi)) < 0.1 or (axis % 533 | (2 * np.pi)) > 2 * np.pi - 0.1: 534 | o = quaternion.as_euler_angles(agent_state.rotation)[1] 535 | else: 536 | o = 2 * np.pi - quaternion.as_euler_angles(agent_state.rotation)[1] 537 | if o > np.pi: 538 | o -= 2 * np.pi 539 | return x, y, o 540 | 541 | def get_pose_change(self): 542 | """Returns dx, dy, do pose change of the agent relative to the last 543 | timestep.""" 544 | curr_sim_pose = self.get_sim_location() 545 | dx, dy, do = pu.get_rel_pose_change( 546 | curr_sim_pose, self.last_sim_location) 547 | self.last_sim_location = curr_sim_pose 548 | return dx, dy, do 549 | -------------------------------------------------------------------------------- /agents/sem_exp.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import sys 4 | 5 | import cv2 6 | import numpy as np 7 | import skimage.morphology 8 | from PIL import Image 9 | from torchvision import transforms 10 | import time 11 | 12 | from envs.utils.fmm_planner import FMMPlanner 13 | from envs.habitat.objectgoal_env import ObjectGoal_Env 14 | from envs.habitat.objectgoal_env21 import ObjectGoal_Env21 15 | from agents.utils.semantic_prediction import SemanticPredMaskRCNN 16 | from constants import color_palette 17 | import envs.utils.pose as pu 18 | import agents.utils.visualization as vu 19 | 20 | from RedNet.RedNet_model import load_rednet 21 | from constants import mp_categories_mapping 22 | import torch 23 | from get_point_score import * 24 | 25 | class Sem_Exp_Env_Agent(ObjectGoal_Env21): 26 | """The Sem_Exp environment agent class. A seperate Sem_Exp_Env_Agent class 27 | object is used for each environment thread. 
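It extends ObjectGoal_Env21 with semantic segmentation (Mask R-CNN and
RedNet), local FMM-based planning, and optional visualisation.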
28 | 29 | """ 30 | 31 | def __init__(self, args, rank, config_env, dataset): 32 | 33 | self.args = args 34 | super().__init__(args, rank, config_env, dataset) 35 | 36 | # initialize transform for RGB observations 37 | self.res = transforms.Compose( 38 | [transforms.ToPILImage(), 39 | transforms.Resize((args.frame_height, args.frame_width), 40 | interpolation=Image.NEAREST)]) 41 | 42 | # initialize semantic segmentation prediction model 43 | if args.sem_gpu_id == -1: 44 | args.sem_gpu_id = config_env.SIMULATOR.HABITAT_SIM_V0.GPU_DEVICE_ID 45 | 46 | self.device = args.device 47 | self.sem_pred = SemanticPredMaskRCNN(args) 48 | self.red_sem_pred = load_rednet( 49 | self.device, ckpt='RedNet/model/rednet_semmap_mp3d_40.pth', resize=True, # since we train on half-vision 50 | ) 51 | self.red_sem_pred.eval() 52 | # self.red_sem_pred.to(self.device) 53 | 54 | 55 | # initializations for planning: 56 | self.selem = skimage.morphology.disk(3) 57 | 58 | self.obs = None 59 | self.obs_shape = None 60 | self.collision_map = None 61 | self.visited = None 62 | self.visited_vis = None 63 | self.col_width = None 64 | self.curr_loc = None 65 | self.last_loc = None 66 | self.last_action = None 67 | self.count_forward_actions = None 68 | 69 | self.replan_count = 0 70 | self.collision_n = 0 71 | self.kernel = cv2.getStructuringElement(cv2.MORPH_RECT,(3, 3)) 72 | 73 | if args.visualize or args.print_images: 74 | self.legend = cv2.imread('docs/legend.png') 75 | self.vis_image = None 76 | self.rgb_vis = None 77 | 78 | self.fail_case = {} 79 | self.fail_case['collision'] = 0 80 | self.fail_case['success'] = 0 81 | self.fail_case['detection'] = 0 82 | self.fail_case['exploration'] = 0 83 | 84 | self.eve_angle = 0 85 | 86 | def reset(self): 87 | args = self.args 88 | 89 | self.replan_count = 0 90 | self.collision_n = 0 91 | 92 | obs, info = super().reset() 93 | store_obs=obs.transpose(1, 2, 0) 94 | # print(obs.shape) 95 | # print(store_obs.shape) 96 | # print("!!!") 97 | # np.save("vis/3d_score/sample_T.npy",store_obs) 98 | obs = self._preprocess_obs(obs) 99 | 100 | self.obs_shape = obs.shape 101 | 102 | # Episode initializations 103 | map_shape = (args.map_size_cm // args.map_resolution, 104 | args.map_size_cm // args.map_resolution) 105 | self.collision_map = np.zeros(map_shape) 106 | self.visited = np.zeros(map_shape) 107 | self.visited_vis = np.zeros(map_shape) 108 | self.col_width = 1 109 | self.count_forward_actions = 0 110 | self.curr_loc = [args.map_size_cm / 100.0 / 2.0, 111 | args.map_size_cm / 100.0 / 2.0, 0.] 
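        # Start pose: the agent is placed at the centre of the global map,
        # (map_size_cm / 100) / 2 metres along each axis, with heading 0 deg.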
112 | self.last_action = None 113 | 114 | self.eve_angle = 0 115 | self.eve_angle_old = 0 116 | 117 | info['eve_angle'] = self.eve_angle 118 | 119 | 120 | if args.visualize or args.print_images: 121 | self.vis_image = vu.init_vis_image(self.goal_name, self.legend) 122 | 123 | # print(obs.shape) 124 | return obs, info 125 | 126 | def plan_act_and_preprocess(self, planner_inputs): 127 | """Function responsible for planning, taking the action and 128 | preprocessing observations 129 | 130 | Args: 131 | planner_inputs (dict): 132 | dict with following keys: 133 | 'map_pred' (ndarray): (M, M) map prediction 134 | 'goal' (ndarray): (M, M) mat denoting goal locations 135 | 'pose_pred' (ndarray): (7,) array denoting pose (x,y,o) 136 | and planning window (gx1, gx2, gy1, gy2) 137 | 'found_goal' (bool): whether the goal object is found 138 | 139 | Returns: 140 | obs (ndarray): preprocessed observations ((4+C) x H x W) 141 | reward (float): amount of reward returned after previous action 142 | done (bool): whether the episode has ended 143 | info (dict): contains timestep, pose, goal category and 144 | evaluation metric info 145 | """ 146 | # s_time = time.time() 147 | 148 | # plan 149 | if planner_inputs["wait"]: 150 | self.last_action = None 151 | self.info["sensor_pose"] = [0., 0., 0.] 152 | return np.zeros(self.obs.shape), self.fail_case, False, self.info 153 | 154 | # Reset reward if new long-term goal 155 | if planner_inputs["new_goal"]: 156 | goal = planner_inputs['goal'] 157 | if np.sum(goal == 1) == 1 and self.args.task_config == "tasks/objectnav_gibson.yaml": 158 | frontier_loc = np.where(goal == 1) 159 | self.info["g_reward"] = self.get_llm_distance(planner_inputs["map_target"], frontier_loc) 160 | 161 | # self.collision_map = np.zeros(self.visited.shape) 162 | self.info['clear_flag'] = 0 163 | 164 | action = self._plan(planner_inputs) 165 | 166 | # c_time = time.time() 167 | # ss_time = c_time - s_time 168 | # print('plan map: %.3f秒'%ss_time) 0.19 169 | 170 | if self.collision_n > 20 or self.replan_count > 26: 171 | self.info['clear_flag'] = 1 172 | self.collision_n = 0 173 | 174 | if self.args.visualize or self.args.print_images: 175 | 176 | self._visualize(planner_inputs) 177 | 178 | if action >= 0: 179 | 180 | # act 181 | action = {'action': action} 182 | obs, rew, done, info = super().step(action) 183 | 184 | if done and self.info['success'] == 0: 185 | if self.info['time'] >= self.args.max_episode_length - 1: 186 | self.fail_case['exploration'] += 1 187 | elif self.replan_count > 26: 188 | self.fail_case['collision'] += 1 189 | else: 190 | self.fail_case['detection'] += 1 191 | 192 | if done and self.info['success'] == 1: 193 | self.fail_case['success'] += 1 194 | 195 | # preprocess obs 196 | obs = self._preprocess_obs(obs) 197 | self.last_action = action['action'] 198 | self.obs = obs 199 | 200 | info['last_action'] = action['action'] # Updated to get action info as well 201 | self.info = info 202 | info['eve_angle'] = self.eve_angle 203 | 204 | 205 | # e_time = time.time() 206 | # ss_time = e_time - c_time 207 | # print('act map: %.3f秒'%ss_time) 0.23 208 | 209 | # info['g_reward'] += rew 210 | 211 | return obs, self.fail_case, done, info 212 | 213 | else: 214 | self.last_action = None 215 | self.info["sensor_pose"] = [0., 0., 0.] 
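            # No simulator step is taken on this branch, so a zero (dx, dy, do)
            # sensor pose change is reported and the cached info dict is
            # returned unchanged.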
216 | info['last_action'] = None # Updated to get action info as well 217 | 218 | return np.zeros(self.obs_shape), self.fail_case, False, self.info 219 | 220 | def _plan(self, planner_inputs): 221 | """Function responsible for planning 222 | 223 | Args: 224 | planner_inputs (dict): 225 | dict with following keys: 226 | 'map_pred' (ndarray): (M, M) map prediction 227 | 'goal' (ndarray): (M, M) goal locations 228 | 'pose_pred' (ndarray): (7,) array denoting pose (x,y,o) 229 | and planning window (gx1, gx2, gy1, gy2) 230 | 'found_goal' (bool): whether the goal object is found 231 | 232 | Returns: 233 | action (int): action id 234 | """ 235 | args = self.args 236 | 237 | self.last_loc = self.curr_loc 238 | 239 | # Get Map prediction 240 | map_pred = np.rint(planner_inputs['map_pred']) 241 | exp_pred = np.rint(planner_inputs['exp_pred']) 242 | goal = planner_inputs['goal'] 243 | 244 | # Get pose prediction and global policy planning window 245 | start_x, start_y, start_o, gx1, gx2, gy1, gy2 = \ 246 | planner_inputs['pose_pred'] 247 | gx1, gx2, gy1, gy2 = int(gx1), int(gx2), int(gy1), int(gy2) 248 | planning_window = [gx1, gx2, gy1, gy2] 249 | 250 | # Get curr loc 251 | self.curr_loc = [start_x, start_y, start_o] 252 | r, c = start_y, start_x 253 | start = [int(r * 100.0 / args.map_resolution - gx1), 254 | int(c * 100.0 / args.map_resolution - gy1)] 255 | start = pu.threshold_poses(start, map_pred.shape) 256 | 257 | self.visited[gx1:gx2, gy1:gy2][start[0] - 0:start[0] + 1, 258 | start[1] - 0:start[1] + 1] = 1 259 | 260 | # if args.visualize or args.print_images: 261 | # Get last loc 262 | last_start_x, last_start_y = self.last_loc[0], self.last_loc[1] 263 | r, c = last_start_y, last_start_x 264 | last_start = [int(r * 100.0 / args.map_resolution - gx1), 265 | int(c * 100.0 / args.map_resolution - gy1)] 266 | last_start = pu.threshold_poses(last_start, map_pred.shape) 267 | self.visited_vis[gx1:gx2, gy1:gy2] = \ 268 | vu.draw_line(last_start, start, 269 | self.visited_vis[gx1:gx2, gy1:gy2]) 270 | 271 | # Collision check 272 | if self.last_action == 1 and not planner_inputs["new_goal"]: 273 | x1, y1, t1 = self.last_loc 274 | x2, y2, _ = self.curr_loc 275 | buf = 4 276 | length = 2 277 | 278 | if abs(x1 - x2) < 0.05 and abs(y1 - y2) < 0.05: 279 | self.col_width += 2 280 | if self.col_width == 7: 281 | length = 4 282 | buf = 3 283 | self.col_width = min(self.col_width, 5) 284 | else: 285 | self.col_width = 1 286 | 287 | dist = pu.get_l2_distance(x1, x2, y1, y2) 288 | if dist < args.collision_threshold: # Collision 289 | self.collision_n += 1 290 | width = self.col_width 291 | for i in range(length): 292 | for j in range(width): 293 | wx = x1 + 0.05 * \ 294 | ((i + buf) * np.cos(np.deg2rad(t1)) 295 | + (j - width // 2) * np.sin(np.deg2rad(t1))) 296 | wy = y1 + 0.05 * \ 297 | ((i + buf) * np.sin(np.deg2rad(t1)) 298 | - (j - width // 2) * np.cos(np.deg2rad(t1))) 299 | r, c = wy, wx 300 | r, c = int(r * 100 / args.map_resolution), \ 301 | int(c * 100 / args.map_resolution) 302 | [r, c] = pu.threshold_poses([r, c], 303 | self.collision_map.shape) 304 | self.collision_map[r, c] = 1 305 | 306 | stg, replan, stop = self._get_stg(map_pred, start, np.copy(goal), 307 | planning_window) 308 | 309 | if replan: 310 | self.replan_count += 1 311 | print("false: ", self.replan_count) 312 | else: 313 | self.replan_count = 0 314 | 315 | # Deterministic Local Policy 316 | if (stop and planner_inputs['found_goal'] == 1) or self.replan_count > 26: 317 | action = 0 # Stop 318 | else: 319 | (stg_x, stg_y) = stg 320 | 
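            # Sketch of the local policy below: compute the bearing of the
            # short-term goal relative to the agent's heading; look down
            # (action 5) while the cell ~5 grid cells ahead is unexplored and
            # look back up (action 4) once it is explored, otherwise turn
            # right (3) / left (2) when the relative angle exceeds half the
            # turn angle, and move forward (1) when roughly facing the goal.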
angle_st_goal = math.degrees(math.atan2(stg_x - start[0],
321 |                                                     stg_y - start[1]))
322 |             angle_agent = (start_o) % 360.0
323 |             if angle_agent > 180:
324 |                 angle_agent -= 360
325 | 
326 |             relative_angle = (angle_agent - angle_st_goal) % 360.0
327 |             if relative_angle > 180:
328 |                 relative_angle -= 360
329 | 
330 |             ## add the elevation angle
331 |             eve_start_x = int(5 * math.sin(math.radians(angle_st_goal)) + start[0])
332 |             eve_start_y = int(5 * math.cos(math.radians(angle_st_goal)) + start[1])
333 |             if eve_start_x >= map_pred.shape[0]: eve_start_x = map_pred.shape[0] - 1
334 |             if eve_start_y >= map_pred.shape[1]: eve_start_y = map_pred.shape[1] - 1
335 |             if eve_start_x < 0: eve_start_x = 0
336 |             if eve_start_y < 0: eve_start_y = 0
337 |             if exp_pred[eve_start_x, eve_start_y] == 0 and self.eve_angle > -60:
338 |                 action = 5
339 |                 self.eve_angle -= 30
340 |             elif exp_pred[eve_start_x, eve_start_y] == 1 and self.eve_angle < 0:
341 |                 action = 4
342 |                 self.eve_angle += 30
343 |             elif relative_angle > self.args.turn_angle / 2.:
344 |                 action = 3 # Right
345 |             elif relative_angle < -self.args.turn_angle / 2.:
346 |                 action = 2 # Left
347 |             else:
348 |                 action = 1 # Forward
349 | 
350 |         return action
351 | 
352 |     def _get_stg(self, grid, start, goal, planning_window):
353 |         """Get short-term goal"""
354 | 
355 |         [gx1, gx2, gy1, gy2] = planning_window
356 | 
357 |         x1, y1 = 0, 0
358 |         x2, y2 = grid.shape
359 | 
360 |         def add_boundary(mat, value=1):
361 |             h, w = mat.shape
362 |             new_mat = np.zeros((h + 2, w + 2)) + value
363 |             new_mat[1:h + 1, 1:w + 1] = mat
364 |             return new_mat
365 | 
366 |         traversible = skimage.morphology.binary_dilation(
367 |             grid[x1:x2, y1:y2],
368 |             self.selem) != True
369 |         traversible[self.collision_map[gx1:gx2, gy1:gy2]
370 |                     [x1:x2, y1:y2] == 1] = 0
371 |         traversible[cv2.dilate(self.visited_vis[gx1:gx2, gy1:gy2][x1:x2, y1:y2], self.kernel) == 1] = 1
372 | 
373 |         traversible[int(start[0] - x1) - 1:int(start[0] - x1) + 2,
374 |                     int(start[1] - y1) - 1:int(start[1] - y1) + 2] = 1
375 | 
376 |         traversible = add_boundary(traversible)
377 |         goal = add_boundary(goal, value=0)
378 | 
379 |         planner = FMMPlanner(traversible)
380 |         selem = skimage.morphology.disk(10)
381 |         goal = skimage.morphology.binary_dilation(
382 |             goal, selem) != True
383 |         goal = 1 - goal * 1.
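        # 'goal' is now a 0/1 float map in which 1 marks every cell within a
        # 10-cell radius of a goal cell; set_multi_goal presumably uses all of
        # these cells as sources when building the FMM distance field.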
384 | planner.set_multi_goal(goal) 385 | 386 | state = [start[0] - x1 + 1, start[1] - y1 + 1] 387 | stg_x, stg_y, replan, stop = planner.get_short_term_goal(state) 388 | 389 | stg_x, stg_y = stg_x + x1 - 1, stg_y + y1 - 1 390 | 391 | return (stg_x, stg_y), replan, stop 392 | 393 | def _preprocess_obs(self, obs, use_seg=True): 394 | args = self.args 395 | # print("obs: ", obs) 396 | 397 | obs = obs.transpose(1, 2, 0) 398 | rgb = obs[:, :, :3] 399 | depth = obs[:, :, 3:4] 400 | semantic = obs[:,:,4:5].squeeze() 401 | # print("obs: ", semantic.shape) 402 | if args.use_gtsem: 403 | self.rgb_vis = rgb 404 | sem_seg_pred = np.zeros((rgb.shape[0], rgb.shape[1], 15 + 1)) 405 | for i in range(16): 406 | sem_seg_pred[:,:,i][semantic == i+1] = 1 407 | else: 408 | # print("sem_exp_preprocess_semantic") 409 | red_semantic_pred, semantic_pred = self._get_sem_pred( 410 | rgb.astype(np.uint8), depth, use_seg=use_seg) 411 | # print(red_semantic_pred.shape) 412 | # print(semantic_pred.shape) 413 | # print(mp_categories_mapping) 414 | # sys.exit(0) 415 | sem_seg_pred = np.zeros((rgb.shape[0], rgb.shape[1], 15 + 1)) 416 | for i in range(0, 15): 417 | # print(mp_categories_mapping[i]) 418 | sem_seg_pred[:,:,i][red_semantic_pred == mp_categories_mapping[i]] = 1 419 | 420 | sem_seg_pred[:,:,0][semantic_pred[:,:,0] == 0] = 0 421 | sem_seg_pred[:,:,1][semantic_pred[:,:,1] == 0] = 0 422 | sem_seg_pred[:,:,3][semantic_pred[:,:,3] == 0] = 0 423 | sem_seg_pred[:,:,4][semantic_pred[:,:,4] == 1] = 1 424 | sem_seg_pred[:,:,5][semantic_pred[:,:,5] == 1] = 1 425 | 426 | 427 | # sem_seg_pred = self._get_sem_pred( 428 | # rgb.astype(np.uint8), depth, use_seg=use_seg) 429 | 430 | depth = self._preprocess_depth(depth, args.min_depth, args.max_depth) 431 | 432 | ds = args.env_frame_width // args.frame_width # Downscaling factor 433 | 434 | if ds != 1: 435 | rgb = np.asarray(self.res(rgb.astype(np.uint8))) 436 | depth = depth[ds // 2::ds, ds // 2::ds] 437 | sem_seg_pred = sem_seg_pred[ds // 2::ds, ds // 2::ds] 438 | 439 | depth = np.expand_dims(depth, axis=2) 440 | state = np.concatenate((rgb, depth, sem_seg_pred), 441 | axis=2).transpose(2, 0, 1) 442 | 443 | return state 444 | 445 | def _preprocess_depth(self, depth, min_d, max_d): 446 | depth = depth[:, :, 0] * 1 447 | 448 | for i in range(depth.shape[1]): 449 | depth[:, i][depth[:, i] == 0.] = depth[:, i].max() 450 | 451 | mask2 = depth > 0.99 452 | depth[mask2] = 0. 453 | 454 | mask1 = depth == 0 455 | depth[mask1] = 100.0 456 | # depth = min_d * 100.0 + depth * max_d * 100.0 457 | depth = min_d * 100.0 + depth * (max_d-min_d) * 100.0 458 | # depth = depth*1000. 
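        # Depth is now in centimetres: a normalized reading d in [0, 1] maps to
        # min_d * 100 + d * (max_d - min_d) * 100. For example, assuming
        # min_d = 0.5 m and max_d = 5.0 m, d = 0.5 gives 50 + 0.5 * 450 = 275 cm,
        # while the invalid pixels set to 100 above land far beyond max_d,
        # presumably so they are not registered as nearby obstacles.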
459 | 460 | return depth 461 | 462 | def _get_sem_pred(self, rgb, depth, use_seg=True): 463 | if use_seg: 464 | image = torch.from_numpy(rgb).to(self.device).unsqueeze_(0).float() 465 | depth = torch.from_numpy(depth).to(self.device).unsqueeze_(0).float() 466 | with torch.no_grad(): 467 | red_semantic_pred = self.red_sem_pred(image, depth) 468 | red_semantic_pred = red_semantic_pred.squeeze().cpu().detach().numpy() 469 | semantic_pred, self.rgb_vis = self.sem_pred.get_prediction(rgb) 470 | semantic_pred = semantic_pred.astype(np.float32) 471 | else: 472 | semantic_pred = np.zeros((rgb.shape[0], rgb.shape[1], 16)) 473 | self.rgb_vis = rgb[:, :, ::-1] 474 | return red_semantic_pred, semantic_pred 475 | 476 | def _visualize(self, inputs): 477 | args = self.args 478 | dump_dir = "{}/dump/{}/".format(args.dump_location, 479 | args.exp_name) 480 | ep_dir = '{}/episodes/thread_{}/eps_{}/'.format( 481 | dump_dir, self.rank, self.episode_no) 482 | score_dir = '{}/episodes/thread_{}/score_{}/'.format( 483 | dump_dir, self.rank, self.episode_no) 484 | 485 | if not os.path.exists(ep_dir): 486 | os.makedirs(ep_dir) 487 | if not os.path.exists(score_dir): 488 | os.makedirs(score_dir) 489 | 490 | local_w = inputs['map_pred'].shape[0] 491 | 492 | map_pred = inputs['map_pred'] 493 | exp_pred = inputs['exp_pred'] 494 | map_edge = inputs['map_edge'] 495 | start_x, start_y, start_o, gx1, gx2, gy1, gy2 = inputs['pose_pred'] 496 | 497 | goal = inputs['goal'] 498 | sem_map = inputs['sem_map_pred'] 499 | 500 | gx1, gx2, gy1, gy2 = int(gx1), int(gx2), int(gy1), int(gy2) 501 | 502 | sem_map += 5 503 | 504 | no_cat_mask = sem_map == 20 505 | map_mask = np.rint(map_pred) == 1 506 | exp_mask = np.rint(exp_pred) == 1 507 | vis_mask = self.visited_vis[gx1:gx2, gy1:gy2] == 1 508 | edge_mask = map_edge == 1 509 | 510 | sem_map[no_cat_mask] = 0 511 | m1 = np.logical_and(no_cat_mask, exp_mask) 512 | sem_map[m1] = 2 513 | 514 | m2 = np.logical_and(no_cat_mask, map_mask) 515 | sem_map[m2] = 1 516 | 517 | sem_map[vis_mask] = 3 518 | sem_map[edge_mask] = 3 519 | 520 | selem = skimage.morphology.disk(4) 521 | goal_mat = 1 - skimage.morphology.binary_dilation( 522 | goal, selem) != True 523 | 524 | goal_mask = goal_mat == 1 525 | sem_map[goal_mask] = 4 526 | if np.sum(goal) == 1: 527 | f_pos = np.argwhere(goal == 1) 528 | # fmb = get_frontier_boundaries((f_pos[0][0], f_pos[0][1])) 529 | # goal_fmb = skimage.draw.circle_perimeter(int((fmb[0]+fmb[1])/2), int((fmb[2]+fmb[3])/2), 23) 530 | goal_fmb = skimage.draw.circle_perimeter(f_pos[0][0], f_pos[0][1], int(local_w/4-2)) 531 | goal_fmb[0][goal_fmb[0] > local_w-1] = local_w-1 532 | goal_fmb[1][goal_fmb[1] > local_w-1] = local_w-1 533 | goal_fmb[0][goal_fmb[0] < 0] = 0 534 | goal_fmb[1][goal_fmb[1] < 0] = 0 535 | # goal_fmb[goal_fmb < 0] =0 536 | goal_mask[goal_fmb[0], goal_fmb[1]] = 1 537 | sem_map[goal_mask] = 4 538 | 539 | 540 | color_pal = [int(x * 255.) for x in color_palette] 541 | sem_map_vis = Image.new("P", (sem_map.shape[1], 542 | sem_map.shape[0])) 543 | sem_map_vis.putpalette(color_pal) 544 | sem_map_vis.putdata(sem_map.flatten().astype(np.uint8)) 545 | sem_map_vis = sem_map_vis.convert("RGB") 546 | sem_map_vis = np.flipud(sem_map_vis) 547 | 548 | sem_map_vis = sem_map_vis[:, :, [2, 1, 0]] 549 | sem_map_vis = cv2.resize(sem_map_vis, (480, 480), 550 | interpolation=cv2.INTER_NEAREST) 551 | self.vis_image[50:530, 15:655] = self.rgb_vis 552 | self.vis_image[50:530, 670:1150] = sem_map_vis 553 | 554 | pos = ( 555 | (start_x * 100. 
/ args.map_resolution - gy1) 556 | * 480 / map_pred.shape[0], 557 | (map_pred.shape[1] - start_y * 100. / args.map_resolution + gx1) 558 | * 480 / map_pred.shape[1], 559 | np.deg2rad(-start_o) 560 | ) 561 | 562 | agent_arrow = vu.get_contour_points(pos, origin=(670, 50), size=10) 563 | color = (int(color_palette[11] * 255), 564 | int(color_palette[10] * 255), 565 | int(color_palette[9] * 255)) 566 | cv2.drawContours(self.vis_image, [agent_arrow], 0, color, -1) 567 | 568 | if args.visualize: 569 | # Displaying the image 570 | cv2.imshow("Thread {}".format(self.rank), self.vis_image) 571 | cv2.waitKey(1) 572 | 573 | if args.print_images: 574 | fn = '{}/episodes/thread_{}/eps_{}/{}-{}-Vis-{}.png'.format( 575 | dump_dir, self.rank, self.episode_no, 576 | self.rank, self.episode_no, self.timestep) 577 | cv2.imwrite(fn, self.vis_image) 578 | 579 | 580 | --------------------------------------------------------------------------------