├── img
│   └── t.png
├── .gitignore
├── requirements.txt
├── configs
│   ├── COCO-InstanceSegmentation
│   │   └── mask_rcnn_R_50_FPN_3x.yaml
│   └── Base-RCNN-FPN.yaml
├── envs
│   ├── utils
│   │   ├── pose.py
│   │   ├── rotation_utils.py
│   │   ├── map_builder.py
│   │   ├── fmm_planner.py
│   │   ├── depth_utils.py
│   │   └── depth_utils_multi.py
│   ├── habitat
│   │   ├── configs
│   │   │   └── tasks
│   │   │       ├── objectnav_gibson.yaml
│   │   │       └── objectnav_hm3d.yaml
│   │   ├── __init__.py
│   │   ├── objectgoal_env21.py
│   │   └── objectgoal_env.py
│   └── __init__.py
├── README.md
├── agents
│   ├── utils
│   │   ├── visualization.py
│   │   └── semantic_prediction.py
│   └── sem_exp.py
├── utils
│   ├── tensorboard_utils.py
│   └── model.py
├── constants.py
├── RedNet
│   ├── utils.py
│   └── RedNet_model.py
├── arguments.py
├── model_multi.py
└── get_point_score.py
/img/t.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shalexyuan/GAMap/HEAD/img/t.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | RedNet/model/*
2 | vis/*
3 | tmp/*
4 | data/*
5 | *.symlink
6 | **/__pycache__/
7 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | scikit-fmm==2019.1.30
2 | scikit-learn==0.22.2.post1
3 | scikit-image==0.15.0
4 | numpy>=1.20.2
5 | ifcfg
6 | transformers==4.23.1
7 | tensorboard
8 |
--------------------------------------------------------------------------------
/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "../Base-RCNN-FPN.yaml"
2 | MODEL:
3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
4 | MASK_ON: True
5 | RESNETS:
6 | DEPTH: 50
7 | SOLVER:
8 | STEPS: (210000, 250000)
9 | MAX_ITER: 270000
10 |
--------------------------------------------------------------------------------
/envs/utils/pose.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def get_l2_distance(x1, x2, y1, y2):
5 | """
6 | Computes the L2 distance between two points.
7 | """
8 | return ((x1 - x2) ** 2 + (y1 - y2) ** 2) ** 0.5
9 |
10 |
11 | def get_rel_pose_change(pos2, pos1):
12 | x1, y1, o1 = pos1
13 | x2, y2, o2 = pos2
14 |
15 | theta = np.arctan2(y2 - y1, x2 - x1) - o1
16 | dist = get_l2_distance(x1, x2, y1, y2)
17 | dx = dist * np.cos(theta)
18 | dy = dist * np.sin(theta)
19 | do = o2 - o1
20 |
21 | return dx, dy, do
22 |
23 |
24 | def get_new_pose(pose, rel_pose_change):
25 | x, y, o = pose
26 | dx, dy, do = rel_pose_change
27 |
28 | global_dx = dx * np.sin(np.deg2rad(o)) + dy * np.cos(np.deg2rad(o))
29 | global_dy = dx * np.cos(np.deg2rad(o)) - dy * np.sin(np.deg2rad(o))
30 | x += global_dy
31 | y += global_dx
32 | o += np.rad2deg(do)
33 | if o > 180.:
34 | o -= 360.
35 |
36 | return x, y, o
37 |
38 |
39 | def threshold_poses(coords, shape):
40 | coords[0] = min(max(0, coords[0]), shape[0] - 1)
41 | coords[1] = min(max(0, coords[1]), shape[1] - 1)
42 | return coords
43 |
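# Unit conventions (as implied by the functions above, stated here for clarity):
# get_rel_pose_change expresses (dx, dy) in the frame of pos1 and returns do = o2 - o1 in the
# inputs' angular units, while get_new_pose expects its heading `o` in degrees and converts
# `do` from radians via np.rad2deg. Quick sanity check for the distance helper:
#   get_l2_distance(0, 3, 0, 4) == 5.0   # sqrt(3**2 + 4**2)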
--------------------------------------------------------------------------------
/envs/habitat/configs/tasks/objectnav_gibson.yaml:
--------------------------------------------------------------------------------
1 | ENVIRONMENT:
2 | MAX_EPISODE_STEPS: 500
3 | SIMULATOR:
4 | TURN_ANGLE: 30
5 | TILT_ANGLE: 30
6 | ACTION_SPACE_CONFIG: "v1"
7 | AGENT_0:
8 | SENSORS: ['RGB_SENSOR', 'DEPTH_SENSOR']
9 | HEIGHT: 0.88
10 | RADIUS: 0.18
11 | HABITAT_SIM_V0:
12 | GPU_DEVICE_ID: 0
13 | ALLOW_SLIDING: False
14 | SEMANTIC_SENSOR:
15 | WIDTH: 640
16 | HEIGHT: 480
17 | HFOV: 79
18 | POSITION: [0, 0.88, 0]
19 | RGB_SENSOR:
20 | WIDTH: 640
21 | HEIGHT: 480
22 | HFOV: 79
23 | POSITION: [0, 0.88, 0]
24 | DEPTH_SENSOR:
25 | WIDTH: 640
26 | HEIGHT: 480
27 | HFOV: 79
28 | MIN_DEPTH: 0.5
29 | MAX_DEPTH: 5.0
30 | POSITION: [0, 0.88, 0]
31 | TASK:
32 | TYPE: ObjectNav-v1
33 | POSSIBLE_ACTIONS: ["STOP", "MOVE_FORWARD", "TURN_LEFT", "TURN_RIGHT", "LOOK_UP", "LOOK_DOWN"]
34 | SENSORS: ['GPS_SENSOR', 'COMPASS_SENSOR']
35 | MEASUREMENTS: ['DISTANCE_TO_GOAL', 'SUCCESS', 'SPL']
36 | GOAL_SENSOR_UUID: objectgoal
37 | SUCCESS:
38 | SUCCESS_DISTANCE: 0.5
39 |
40 | DATASET:
41 | TYPE: PointNav-v1
42 | SPLIT: train
43 | DATA_PATH: "data/datasets/objectnav/gibson/v1.1/{split}/{split}.json.gz"
44 | EPISODES_DIR: "data/datasets/objectnav/gibson/v1.1/{split}/"
45 | SCENES_DIR: "/data/p305574/data/scene_datasets"
46 |
--------------------------------------------------------------------------------
/envs/habitat/configs/tasks/objectnav_hm3d.yaml:
--------------------------------------------------------------------------------
1 | ENVIRONMENT:
2 | MAX_EPISODE_STEPS: 500
3 | SIMULATOR:
4 | TURN_ANGLE: 30
5 | TILT_ANGLE: 30
6 | ACTION_SPACE_CONFIG: "v1"
7 | AGENT_0:
8 | SENSORS: ['RGB_SENSOR', 'DEPTH_SENSOR', 'SEMANTIC_SENSOR']
9 | HEIGHT: 0.88
10 | RADIUS: 0.18
11 | HABITAT_SIM_V0:
12 | GPU_DEVICE_ID: 0
13 | ALLOW_SLIDING: False
14 | SEMANTIC_SENSOR:
15 | WIDTH: 640
16 | HEIGHT: 480
17 | HFOV: 79
18 | POSITION: [0, 0.88, 0]
19 | RGB_SENSOR:
20 | WIDTH: 640
21 | HEIGHT: 480
22 | HFOV: 79
23 | POSITION: [0, 0.88, 0]
24 | DEPTH_SENSOR:
25 | WIDTH: 640
26 | HEIGHT: 480
27 | HFOV: 79
28 | MIN_DEPTH: 0.5
29 | MAX_DEPTH: 5.0
30 | POSITION: [0, 0.88, 0]
31 | SCENE_DATASET: "data/scene_datasets/hm3d/hm3d_annotated_basis.scene_dataset_config.json"
32 | TASK:
33 | TYPE: ObjectNav-v1
34 | POSSIBLE_ACTIONS: ["STOP", "MOVE_FORWARD", "TURN_LEFT", "TURN_RIGHT", "LOOK_UP", "LOOK_DOWN"]
35 | SENSORS: ['OBJECTGOAL_SENSOR', 'GPS_SENSOR', 'COMPASS_SENSOR']
36 | GOAL_SENSOR_UUID: objectgoal
37 | MEASUREMENTS: ['DISTANCE_TO_GOAL', 'SUCCESS', 'SPL']
38 | DISTANCE_TO_GOAL:
39 | DISTANCE_TO: VIEW_POINTS
40 | SUCCESS:
41 | SUCCESS_DISTANCE: 0.1
42 |
43 | DATASET:
44 | TYPE: ObjectNav-v1
45 | SPLIT: val
46 | DATA_PATH: "data/objectgoal_hm3d/{split}/{split}.json.gz"
47 | SCENES_DIR: "data/scene_datasets"
48 |
49 |
--------------------------------------------------------------------------------
/configs/Base-RCNN-FPN.yaml:
--------------------------------------------------------------------------------
1 | MODEL:
2 | META_ARCHITECTURE: "GeneralizedRCNN"
3 | BACKBONE:
4 | NAME: "build_resnet_fpn_backbone"
5 | RESNETS:
6 | OUT_FEATURES: ["res2", "res3", "res4", "res5"]
7 | FPN:
8 | IN_FEATURES: ["res2", "res3", "res4", "res5"]
9 | ANCHOR_GENERATOR:
10 | SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map
11 | ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps)
12 | RPN:
13 | IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
14 | PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level
15 | PRE_NMS_TOPK_TEST: 1000 # Per FPN level
16 | # Detectron1 uses 2000 proposals per-batch,
17 | # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
18 | # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
19 | POST_NMS_TOPK_TRAIN: 1000
20 | POST_NMS_TOPK_TEST: 1000
21 | ROI_HEADS:
22 | NAME: "StandardROIHeads"
23 | IN_FEATURES: ["p2", "p3", "p4", "p5"]
24 | ROI_BOX_HEAD:
25 | NAME: "FastRCNNConvFCHead"
26 | NUM_FC: 2
27 | POOLER_RESOLUTION: 7
28 | ROI_MASK_HEAD:
29 | NAME: "MaskRCNNConvUpsampleHead"
30 | NUM_CONV: 4
31 | POOLER_RESOLUTION: 14
32 | DATASETS:
33 | TRAIN: ("coco_2017_train",)
34 | TEST: ("coco_2017_val",)
35 | SOLVER:
36 | IMS_PER_BATCH: 16
37 | BASE_LR: 0.02
38 | STEPS: (60000, 80000)
39 | MAX_ITER: 90000
40 | INPUT:
41 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
42 | VERSION: 2
43 |
--------------------------------------------------------------------------------
/envs/__init__.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from .habitat import construct_envs, construct_envs21
4 |
5 |
6 | def make_vec_envs(args):
7 | if args.task_config == "tasks/objectnav_gibson.yaml":
8 | envs = construct_envs(args)
9 | else:
10 | envs = construct_envs21(args)
11 | envs = VecPyTorch(envs, args.device)
12 | return envs
13 |
14 |
15 | # Adapted from
16 | # https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/envs.py#L159
17 | class VecPyTorch():
18 |
19 | def __init__(self, venv, device):
20 | self.venv = venv
21 | self.num_envs = venv.num_envs
22 | self.observation_space = venv.observation_space
23 | self.action_space = venv.action_space
24 | self.device = device
25 |
26 | def reset(self):
27 | obs, info = self.venv.reset()
28 | obs = torch.from_numpy(obs).float().to(self.device)
29 | return obs, info
30 |
31 | def step_async(self, actions):
32 | actions = actions.cpu().numpy()
33 | self.venv.step_async(actions)
34 |
35 | def step_wait(self):
36 | obs, reward, done, info = self.venv.step_wait()
37 | obs = torch.from_numpy(obs).float().to(self.device)
38 | reward = torch.from_numpy(reward).float()
39 | return obs, reward, done, info
40 |
41 | def step(self, actions):
42 | actions = actions.cpu().numpy()
43 | obs, reward, done, info = self.venv.step(actions)
44 | obs = torch.from_numpy(obs).float().to(self.device)
45 | reward = torch.from_numpy(reward).float()
46 | return obs, reward, done, info
47 |
48 | def get_rewards(self, inputs):
49 | reward = self.venv.get_rewards(inputs)
50 | reward = torch.from_numpy(reward).float()
51 | return reward
52 |
53 | def plan_act_and_preprocess(self, inputs):
54 | obs, reward, done, info = self.venv.plan_act_and_preprocess(inputs)
55 | obs = torch.from_numpy(obs).float().to(self.device)
56 | # reward = torch.from_numpy(reward).float()
57 | return obs, reward, done, info
58 |
59 | def rotate_at_beginning_act(self, inputs):
60 | obs, reward, done, info = self.venv.rotate_at_beginning_act(inputs)
61 | obs = torch.from_numpy(obs).float().to(self.device)
62 | # reward = torch.from_numpy(reward).float()
63 | return obs, reward, done, info
64 |
65 | def close(self):
66 | return self.venv.close()
67 |
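# Usage sketch: a minimal way to build the vectorized environments. The argument parser lives
# in arguments.py (not shown here), so `get_args` and its exact fields are assumptions.
#
#   from arguments import get_args
#   from envs import make_vec_envs
#
#   args = get_args()              # must provide task_config, device, num_processes, ...
#   envs = make_vec_envs(args)
#   obs, infos = envs.reset()      # obs is a float tensor already moved to args.device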
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # GAMap: Zero-Shot Object Goal Navigation with Multi-Scale Geometric-Affordance Guidance
3 |
4 | **NeurIPS 2024**
5 |
6 |
7 | This is the official repository of [GAMap: Zero-Shot Object Goal Navigation with Multi-Scale Geometric-Affordance Guidance](https://arxiv.org/pdf/2410.23978).
8 |
9 |
10 |

11 |
12 |
13 | ## Setup
14 |
15 | ### Dataset Preparation
16 |
17 | Please follow [HM3D](https://aihabitat.org/datasets/hm3d/) to download the dataset and prepare the data. The data format should be:
18 |
19 | ```
20 | data/
21 | scene_datasets/
22 | matterport_category_mappings.tsv
23 | object_norm_inv_perplexity.npy
24 | versioned_data
25 | objectgoal_hm3d/
26 | train/
27 | val/
28 | val_mini/
29 | ```
30 |
31 | ### Dependencies
32 |
33 | 1. Python & PyTorch
34 |
35 | This code is tested on Python 3.9.16 on Ubuntu 20.04, with PyTorch 1.11.0+cu113.
36 |
37 | 2. Habitat-Sim & Habitat-Lab
38 |
39 | ```
40 | # Habitat-Sim
41 | git clone https://github.com/facebookresearch/habitat-sim.git
42 | cd habitat-sim; git checkout tags/challenge-2022;
43 | pip install -r requirements.txt;
44 | python setup.py install --headless
45 |
46 | # Habitat-Lab
47 | git clone https://github.com/facebookresearch/habitat-lab.git
48 | cd habitat-lab; git checkout tags/challenge-2022;
49 | pip install -e .
50 | ```
51 |
52 | 3. Others
53 |
54 | ```
55 | pip install -r requirements.txt
56 | ```
57 |
58 | ## Running
59 |
60 | ### Example
61 |
62 | An example command to run the pipeline:
63 |
64 | ```
65 | python ga.py --split val_mini --eval 0 --auto_gpu_config 0 -n 1 \
66 | --num_eval_episodes 2000 --use_gtsem 0 --num_local_steps 10 \
67 | --exp_name gamap --agent multi_attr_exp
68 | ```
69 |
70 | [//]: # ()
71 | [//]: # (### Visualization)
72 |
73 | [//]: # ()
74 | [//]: # (To make a demo video on your saved images, you can either use `ffmpeg` to make separate videos or use)
75 |
76 | [//]: # ()
77 | [//]: # (```)
78 |
79 | [//]: # (python make_demo.py --exp_name test # add `--delete_img` to delete images after making video)
80 |
81 | [//]: # (```)
82 |
83 | [//]: # ()
84 | [//]: # (to make batched videos.)
85 |
86 | ## Acknowledgements
87 |
88 | This repo is heavily based on [L3MVN](https://github.com/ybgdgh/L3MVN). We thank the authors for their great work.
89 |
90 | ## Citation
91 |
92 | If you find this work helpful, please consider citing:
93 |
--------------------------------------------------------------------------------
/agents/utils/visualization.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import numpy as np
3 |
4 |
5 | def get_contour_points(pos, origin, size=20):
6 | x, y, o = pos
7 | pt1 = (int(x) + origin[0],
8 | int(y) + origin[1])
9 | pt2 = (int(x + size / 1.5 * np.cos(o + np.pi * 4 / 3)) + origin[0],
10 | int(y + size / 1.5 * np.sin(o + np.pi * 4 / 3)) + origin[1])
11 | pt3 = (int(x + size * np.cos(o)) + origin[0],
12 | int(y + size * np.sin(o)) + origin[1])
13 | pt4 = (int(x + size / 1.5 * np.cos(o - np.pi * 4 / 3)) + origin[0],
14 | int(y + size / 1.5 * np.sin(o - np.pi * 4 / 3)) + origin[1])
15 |
16 | return np.array([pt1, pt2, pt3, pt4])
17 |
18 |
19 | def draw_line(start, end, mat, steps=25, w=1):
20 | for i in range(steps + 1):
21 | x = int(np.rint(start[0] + (end[0] - start[0]) * i / steps))
22 | y = int(np.rint(start[1] + (end[1] - start[1]) * i / steps))
23 | mat[x - w:x + w, y - w:y + w] = 1
24 | return mat
25 |
26 |
27 | def init_vis_image(goal_name, legend):
28 | vis_image = np.ones((655, 1165, 3)).astype(np.uint8) * 255
29 | font = cv2.FONT_HERSHEY_SIMPLEX
30 | fontScale = 1
31 | color = (20, 20, 20) # BGR
32 | thickness = 2
33 |
34 | text = "Observations (Goal: {})".format(goal_name)
35 | textsize = cv2.getTextSize(text, font, fontScale, thickness)[0]
36 | textX = (640 - textsize[0]) // 2 + 15
37 | textY = (50 + textsize[1]) // 2
38 | vis_image = cv2.putText(vis_image, text, (textX, textY),
39 | font, fontScale, color, thickness,
40 | cv2.LINE_AA)
41 |
42 | text = "Predicted Semantic Map"
43 | textsize = cv2.getTextSize(text, font, fontScale, thickness)[0]
44 | textX = 640 + (480 - textsize[0]) // 2 + 30
45 | textY = (50 + textsize[1]) // 2
46 | vis_image = cv2.putText(vis_image, text, (textX, textY),
47 | font, fontScale, color, thickness,
48 | cv2.LINE_AA)
49 |
50 | # draw outlines
51 | color = [100, 100, 100]
52 | vis_image[49, 15:655] = color
53 | vis_image[49, 670:1150] = color
54 | vis_image[50:530, 14] = color
55 | vis_image[50:530, 655] = color
56 | vis_image[50:530, 669] = color
57 | vis_image[50:530, 1150] = color
58 | vis_image[530, 15:655] = color
59 | vis_image[530, 670:1150] = color
60 |
61 | # draw legend
62 | # lx, ly, _ = legend.shape
63 | # vis_image[537:537 + lx, 155:155 + ly, :] = legend
64 |
65 | return vis_image
66 |
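# Layout summary (derived from the coordinates above): the canvas is 1165 x 655 px; the RGB
# observation panel spans columns 15:655 and the predicted semantic map panel spans columns
# 670:1150, both covering rows 50:530 (480 px tall). get_contour_points returns the four
# vertices of the arrow-shaped agent marker, with pt3 as the tip along heading o.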
--------------------------------------------------------------------------------
/utils/tensorboard_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # Copyright (c) Facebook, Inc. and its affiliates.
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | from typing import Any
8 |
9 | import numpy as np
10 | import torch
11 | from torch.utils.tensorboard import SummaryWriter
12 |
13 |
14 | class TensorboardWriter:
15 | def __init__(self, log_dir: str, *args: Any, **kwargs: Any):
16 | r"""A Wrapper for tensorboard SummaryWriter. It creates a dummy writer
17 | when log_dir is empty string or None. It also has functionality that
18 | generates tb video directly from numpy images.
19 |
20 | Args:
21 | log_dir: Save directory location. Will not write to disk if
22 | log_dir is an empty string.
23 | *args: Additional positional args for SummaryWriter
24 | **kwargs: Additional keyword args for SummaryWriter
25 | """
26 | self.writer = None
27 | if log_dir is not None and len(log_dir) > 0:
28 | self.writer = SummaryWriter(log_dir, *args, **kwargs)
29 |
30 | def __getattr__(self, item):
31 | if self.writer:
32 | return self.writer.__getattribute__(item)
33 | else:
34 | return lambda *args, **kwargs: None
35 |
36 | def __enter__(self):
37 | return self
38 |
39 | def __exit__(self, exc_type, exc_val, exc_tb):
40 | if self.writer:
41 | self.writer.close()
42 |
43 | def add_video_from_np_images(
44 | self, video_name: str, step_idx: int, images: np.ndarray, fps: int = 10
45 | ) -> None:
46 | r"""Write video into tensorboard from images frames.
47 |
48 | Args:
49 | video_name: name of video string.
50 | step_idx: int of checkpoint index to be displayed.
51 |             images: list of n frames. Each frame is a np.ndarray of shape (H, W, 3).
52 | fps: frame per second for output video.
53 |
54 | Returns:
55 | None.
56 | """
57 | if not self.writer:
58 | return
59 | # initial shape of np.ndarray list: N * (H, W, 3)
60 | frame_tensors = [
61 | torch.from_numpy(np_arr).unsqueeze(0) for np_arr in images
62 | ]
63 | video_tensor = torch.cat(tuple(frame_tensors))
64 | video_tensor = video_tensor.permute(0, 3, 1, 2).unsqueeze(0)
65 | # final shape of video tensor: (1, n, 3, H, W)
66 | self.writer.add_video(
67 | video_name, video_tensor, fps=fps, global_step=step_idx
68 | )
69 |
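# Usage sketch (log directory and tag names are illustrative):
#   with TensorboardWriter("tmp/tb/exp1", flush_secs=30) as writer:
#       writer.add_scalar("eval/success", 0.5, 0)
# Passing an empty log_dir ("" or None) yields a dummy writer: attribute lookups return a
# no-op callable, so all logging calls are silently ignored.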
--------------------------------------------------------------------------------
/envs/utils/rotation_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 The TensorFlow Authors All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Utilities for generating and applying rotation matrices.
17 | """
18 | import numpy as np
19 |
20 | ANGLE_EPS = 0.001
21 |
22 |
23 | def normalize(v):
24 | return v / np.linalg.norm(v)
25 |
26 |
27 | def get_r_matrix(ax_, angle):
28 | ax = normalize(ax_)
29 | if np.abs(angle) > ANGLE_EPS:
30 | S_hat = np.array(
31 | [[0.0, -ax[2], ax[1]], [ax[2], 0.0, -ax[0]], [-ax[1], ax[0], 0.0]],
32 | dtype=np.float32)
33 | R = np.eye(3) + np.sin(angle) * S_hat + \
34 | (1 - np.cos(angle)) * (np.linalg.matrix_power(S_hat, 2))
35 | else:
36 | R = np.eye(3)
37 | return R
38 |
39 |
40 | def r_between(v_from_, v_to_):
41 | v_from = normalize(v_from_)
42 | v_to = normalize(v_to_)
43 | ax = normalize(np.cross(v_from, v_to))
44 | angle = np.arccos(np.dot(v_from, v_to))
45 | return get_r_matrix(ax, angle)
46 |
47 |
48 | def rotate_camera_to_point_at(up_from, lookat_from, up_to, lookat_to):
49 | inputs = [up_from, lookat_from, up_to, lookat_to]
50 | for i in range(4):
51 | inputs[i] = normalize(np.array(inputs[i]).reshape((-1,)))
52 | up_from, lookat_from, up_to, lookat_to = inputs
53 | r1 = r_between(lookat_from, lookat_to)
54 |
55 | new_x = np.dot(r1, np.array([1, 0, 0]).reshape((-1, 1))).reshape((-1))
56 | to_x = normalize(np.cross(lookat_to, up_to))
57 | angle = np.arccos(np.dot(new_x, to_x))
58 | if angle > ANGLE_EPS:
59 | if angle < np.pi - ANGLE_EPS:
60 | ax = normalize(np.cross(new_x, to_x))
61 | flip = np.dot(lookat_to, ax)
62 | if flip > 0:
63 | r2 = get_r_matrix(lookat_to, angle)
64 | elif flip < 0:
65 | r2 = get_r_matrix(lookat_to, -1. * angle)
66 | else:
67 | # Angle of rotation is too close to 180 degrees, direction of
68 | # rotation does not matter.
69 | r2 = get_r_matrix(lookat_to, angle)
70 | else:
71 | r2 = np.eye(3)
72 | return np.dot(r2, r1)
73 |
--------------------------------------------------------------------------------
/constants.py:
--------------------------------------------------------------------------------
1 | coco_categories = {
2 | "chair": 0,
3 | "couch": 1,
4 | "potted plant": 2,
5 | "bed": 3,
6 | "toilet": 4,
7 | "tv": 5,
8 | "dining-table": 6,
9 | "oven": 7,
10 | "sink": 8,
11 | "refrigerator": 9,
12 | "book": 10,
13 | "clock": 11,
14 | "vase": 12,
15 | "cup": 13,
16 | "bottle": 14
17 | }
18 |
19 | category_to_id = [
20 | "chair",
21 | "bed",
22 | "plant",
23 | "toilet",
24 | "tv_monitor",
25 | "sofa"
26 | ]
27 |
28 | category_to_id_gibson = [
29 | "chair",
30 | "couch",
31 | "potted plant",
32 | "bed",
33 | "toilet",
34 | "tv"
35 | ]
36 |
37 | mp3d_category_id = {
38 | 'void': 1,
39 | 'chair': 2,
40 | 'sofa': 3,
41 | 'plant': 4,
42 | 'bed': 5,
43 | 'toilet': 6,
44 | 'tv_monitor': 7,
45 | 'table': 8,
46 | 'refrigerator': 9,
47 | 'sink': 10,
48 | 'stairs': 16,
49 | 'fireplace': 12
50 | }
51 |
52 | # mp_categories_mapping = [4, 11, 15, 12, 19, 23, 6, 7, 15, 38, 40, 28, 29, 8, 17]
53 |
54 | mp_categories_mapping = [4, 11, 15, 12, 19, 23, 26, 24, 28, 38, 21, 16, 14, 6, 16]
55 |
56 | hm3d_category = [
57 | "chair",
58 | "sofa",
59 | "plant",
60 | "bed",
61 | "toilet",
62 | "tv_monitor",
63 | "bathtub",
64 | "shower",
65 | "fireplace",
66 | "appliances",
67 | "towel",
68 | "sink",
69 | "chest_of_drawers",
70 | "table",
71 | "stairs"
72 | ]
73 |
74 | coco_categories_mapping = {
75 | 56: 0, # chair
76 | 57: 1, # couch
77 | 58: 2, # potted plant
78 | 59: 3, # bed
79 | 61: 4, # toilet
80 | 62: 5, # tv
81 | 60: 6, # dining-table
82 | 69: 7, # oven
83 | 71: 8, # sink
84 | 72: 9, # refrigerator
85 | 73: 10, # book
86 | 74: 11, # clock
87 | 75: 12, # vase
88 | 41: 13, # cup
89 | 39: 14, # bottle
90 | }
91 |
92 | color_palette = [
93 | 1.0, 1.0, 1.0,
94 | 0.6, 0.6, 0.6,
95 | 0.95, 0.95, 0.95,
96 | 0.96, 0.36, 0.26,
97 | 0.12156862745098039, 0.47058823529411764, 0.7058823529411765,
98 | 0.9400000000000001, 0.7818, 0.66,
99 | 0.9400000000000001, 0.8868, 0.66,
100 | 0.8882000000000001, 0.9400000000000001, 0.66,
101 | 0.7832000000000001, 0.9400000000000001, 0.66,
102 | 0.6782000000000001, 0.9400000000000001, 0.66,
103 | 0.66, 0.9400000000000001, 0.7468000000000001,
104 | 0.66, 0.9400000000000001, 0.8518000000000001,
105 | 0.66, 0.9232, 0.9400000000000001,
106 | 0.66, 0.8182, 0.9400000000000001,
107 | 0.66, 0.7132, 0.9400000000000001,
108 | 0.7117999999999999, 0.66, 0.9400000000000001,
109 | 0.8168, 0.66, 0.9400000000000001,
110 | 0.9218, 0.66, 0.9400000000000001,
111 | 0.9400000000000001, 0.66, 0.8531999999999998,
112 | 0.9400000000000001, 0.66, 0.748199999999999]
113 |
114 |
115 |
116 | cat_2_multi_att_prompt={
117 | "chair": ['chair','leg','backrest','seat','armrest'],
118 | "bed": ['bed','sleep','lie','mattress','bedframe'],
119 | "plant": ['plant','leaf','stem','branch','pot'],
120 | "toilet": ['toilet','bathroom','defecating','urinating','flushing'],
121 | "tv_monitor": ['tv monitor','livingroom','bedroom','screen','display'],
122 | "sofa": ['sofa','sit','cushion','armrest','backrest'],
123 | }
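# Note: cat_2_multi_att_prompt pairs each HM3D goal category with part/affordance cues
# (the object name itself is always the first entry), e.g. 'sit', 'cushion', 'armrest'
# and 'backrest' for "sofa".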
--------------------------------------------------------------------------------
/envs/utils/map_builder.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import envs.utils.depth_utils as du
3 |
4 |
5 | class MapBuilder(object):
6 | def __init__(self, params):
7 | self.params = params
8 | frame_width = params['frame_width']
9 | frame_height = params['frame_height']
10 | fov = params['fov']
11 | self.camera_matrix = du.get_camera_matrix(
12 | frame_width,
13 | frame_height,
14 | fov)
15 | self.vision_range = params['vision_range']
16 |
17 | self.map_size_cm = params['map_size_cm']
18 | self.resolution = params['resolution']
19 | agent_min_z = params['agent_min_z']
20 | agent_max_z = params['agent_max_z']
21 | self.z_bins = [agent_min_z, agent_max_z]
22 | self.du_scale = params['du_scale']
23 | self.visualize = params['visualize']
24 | self.obs_threshold = params['obs_threshold']
25 |
26 | self.map = np.zeros((self.map_size_cm // self.resolution,
27 | self.map_size_cm // self.resolution,
28 | len(self.z_bins) + 1), dtype=np.float32)
29 |
30 | self.agent_height = params['agent_height']
31 | self.agent_view_angle = params['agent_view_angle']
32 | return
33 |
34 | def update_map(self, depth, current_pose):
35 | with np.errstate(invalid="ignore"):
36 | depth[depth > self.vision_range * self.resolution] = np.NaN
37 | point_cloud = du.get_point_cloud_from_z(depth, self.camera_matrix,
38 | scale=self.du_scale)
39 |
40 | agent_view = du.transform_camera_view(point_cloud,
41 | self.agent_height,
42 | self.agent_view_angle)
43 |
44 | shift_loc = [self.vision_range * self.resolution // 2, 0, np.pi / 2.0]
45 | agent_view_centered = du.transform_pose(agent_view, shift_loc)
46 |
47 | agent_view_flat = du.bin_points(
48 | agent_view_centered,
49 | self.vision_range,
50 | self.z_bins,
51 | self.resolution)
52 |
53 | agent_view_cropped = agent_view_flat[:, :, 1]
54 | agent_view_cropped = agent_view_cropped / self.obs_threshold
55 | agent_view_cropped[agent_view_cropped >= 0.5] = 1.0
56 | agent_view_cropped[agent_view_cropped < 0.5] = 0.0
57 |
58 | agent_view_explored = agent_view_flat.sum(2)
59 | agent_view_explored[agent_view_explored > 0] = 1.0
60 |
61 | geocentric_pc = du.transform_pose(agent_view, current_pose)
62 |
63 | geocentric_flat = du.bin_points(
64 | geocentric_pc,
65 | self.map.shape[0],
66 | self.z_bins,
67 | self.resolution)
68 |
69 | self.map = self.map + geocentric_flat
70 |
71 | map_gt = self.map[:, :, 1] / self.obs_threshold
72 | map_gt[map_gt >= 0.5] = 1.0
73 | map_gt[map_gt < 0.5] = 0.0
74 |
75 | explored_gt = self.map.sum(2)
76 | explored_gt[explored_gt > 1] = 1.0
77 |
78 | return agent_view_cropped, map_gt, agent_view_explored, explored_gt
79 |
80 | def get_st_pose(self, current_loc):
81 | loc = [- (current_loc[0] / self.resolution
82 | - self.map_size_cm // (self.resolution * 2)) /
83 | (self.map_size_cm // (self.resolution * 2)),
84 | - (current_loc[1] / self.resolution
85 | - self.map_size_cm // (self.resolution * 2)) /
86 | (self.map_size_cm // (self.resolution * 2)),
87 | 90 - np.rad2deg(current_loc[2])]
88 | return loc
89 |
90 | def reset_map(self, map_size):
91 | self.map_size_cm = map_size
92 |
93 | self.map = np.zeros((self.map_size_cm // self.resolution,
94 | self.map_size_cm // self.resolution,
95 | len(self.z_bins) + 1), dtype=np.float32)
96 |
97 | def get_map(self):
98 | return self.map
99 |
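# Usage sketch: the parameter values below are illustrative, not taken from this repo; depth is
# expected in the same length unit as map_size_cm / resolution (centimeters).
#   params = dict(frame_width=640, frame_height=480, fov=79, vision_range=64,
#                 map_size_cm=2400, resolution=5, agent_min_z=25, agent_max_z=150,
#                 du_scale=1, visualize=0, obs_threshold=1,
#                 agent_height=88, agent_view_angle=0)
#   builder = MapBuilder(params)
#   local_obst, global_obst, local_explored, global_explored = \
#       builder.update_map(depth, current_pose)   # current_pose: (x_cm, y_cm, theta_rad)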
--------------------------------------------------------------------------------
/utils/model.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from torch import nn
4 | from torch.nn import functional as F
5 |
6 |
7 | def get_grid(pose, grid_size, device):
8 | """
9 | Input:
10 | `pose` FloatTensor(bs, 3)
11 | `grid_size` 4-tuple (bs, _, grid_h, grid_w)
12 | `device` torch.device (cpu or gpu)
13 | Output:
14 | `rot_grid` FloatTensor(bs, grid_h, grid_w, 2)
15 | `trans_grid` FloatTensor(bs, grid_h, grid_w, 2)
16 |
17 | """
18 | pose = pose.float()
19 | x = pose[:, 0]
20 | y = pose[:, 1]
21 | t = pose[:, 2]
22 |
23 | bs = x.size(0)
24 | t = t * np.pi / 180.
25 | cos_t = t.cos()
26 | sin_t = t.sin()
27 |
28 | theta11 = torch.stack([cos_t, -sin_t,
29 | torch.zeros(cos_t.shape).float().to(device)], 1)
30 | theta12 = torch.stack([sin_t, cos_t,
31 | torch.zeros(cos_t.shape).float().to(device)], 1)
32 | theta1 = torch.stack([theta11, theta12], 1)
33 |
34 | theta21 = torch.stack([torch.ones(x.shape).to(device),
35 | -torch.zeros(x.shape).to(device), x], 1)
36 | theta22 = torch.stack([torch.zeros(x.shape).to(device),
37 | torch.ones(x.shape).to(device), y], 1)
38 | theta2 = torch.stack([theta21, theta22], 1)
39 |
40 | rot_grid = F.affine_grid(theta1, torch.Size(grid_size))
41 | trans_grid = F.affine_grid(theta2, torch.Size(grid_size))
42 |
43 | return rot_grid, trans_grid
44 |
45 |
46 | class ChannelPool(nn.MaxPool1d):
47 | def forward(self, x):
48 | n, c, w, h = x.size()
49 | x = x.view(n, c, w * h).permute(0, 2, 1)
50 | x = x.contiguous()
51 | pooled = F.max_pool1d(x, c, 1)
52 | _, _, c = pooled.size()
53 | pooled = pooled.permute(0, 2, 1)
54 | return pooled.view(n, c, w, h)
55 |
56 |
57 | # https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/utils.py#L32
58 | class AddBias(nn.Module):
59 | def __init__(self, bias):
60 | super(AddBias, self).__init__()
61 | self._bias = nn.Parameter(bias.unsqueeze(1))
62 |
63 | def forward(self, x):
64 | if x.dim() == 2:
65 | bias = self._bias.t().view(1, -1)
66 | else:
67 | bias = self._bias.t().view(1, -1, 1, 1)
68 |
69 | return x + bias
70 |
71 |
72 | # https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/model.py#L10
73 | class Flatten(nn.Module):
74 | def forward(self, x):
75 | return x.view(x.size(0), -1)
76 |
77 |
78 | # https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/model.py#L82
79 | class NNBase(nn.Module):
80 |
81 | def __init__(self, recurrent, recurrent_input_size, hidden_size):
82 |
83 | super(NNBase, self).__init__()
84 | self._hidden_size = hidden_size
85 | self._recurrent = recurrent
86 |
87 | if recurrent:
88 | self.gru = nn.GRUCell(recurrent_input_size, hidden_size)
89 | nn.init.orthogonal_(self.gru.weight_ih.data)
90 | nn.init.orthogonal_(self.gru.weight_hh.data)
91 | self.gru.bias_ih.data.fill_(0)
92 | self.gru.bias_hh.data.fill_(0)
93 |
94 | @property
95 | def is_recurrent(self):
96 | return self._recurrent
97 |
98 | @property
99 | def rec_state_size(self):
100 | if self._recurrent:
101 | return self._hidden_size
102 | return 1
103 |
104 | @property
105 | def output_size(self):
106 | return self._hidden_size
107 |
108 | def _forward_gru(self, x, hxs, masks):
109 | if x.size(0) == hxs.size(0):
110 | x = hxs = self.gru(x, hxs * masks[:, None])
111 | else:
112 |             # x is a (T, N, -1) tensor that has been flattened to (T * N, -1)
113 | N = hxs.size(0)
114 | T = int(x.size(0) / N)
115 |
116 | # unflatten
117 | x = x.view(T, N, x.size(1))
118 |
119 | # Same deal with masks
120 | masks = masks.view(T, N, 1)
121 |
122 | outputs = []
123 | for i in range(T):
124 | hx = hxs = self.gru(x[i], hxs * masks[i])
125 | outputs.append(hx)
126 |
127 | # x is a (T, N, -1) tensor
128 | x = torch.stack(outputs, dim=0)
129 | # flatten
130 | x = x.view(T * N, -1)
131 |
132 | return x, hxs
133 |
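# Usage sketch: the grids returned by get_grid are typically consumed by F.grid_sample to
# rotate and then translate a spatial map according to a pose; the sizes below are illustrative.
#   pose = torch.tensor([[0.0, 0.0, 90.0]])                 # (x, y, heading in degrees)
#   rot_grid, trans_grid = get_grid(pose, (1, 1, 240, 240), torch.device("cpu"))
#   rotated = F.grid_sample(maps, rot_grid)                 # maps: FloatTensor(1, 1, 240, 240)
#   translated = F.grid_sample(rotated, trans_grid)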
--------------------------------------------------------------------------------
/RedNet/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from torch import nn
3 | import torch
4 | import os
5 |
6 | med_frq = [0.382900, 0.452448, 0.637584, 0.377464, 0.585595,
7 | 0.479574, 0.781544, 0.982534, 1.017466, 0.624581,
8 | 2.589096, 0.980794, 0.920340, 0.667984, 1.172291,
9 | 0.862240, 0.921714, 2.154782, 1.187832, 1.178115,
10 | 1.848545, 1.428922, 2.849658, 0.771605, 1.656668,
11 | 4.483506, 2.209922, 1.120280, 2.790182, 0.706519,
12 | 3.994768, 2.220004, 0.972934, 1.481525, 5.342475,
13 | 0.750738, 4.040773]
14 |
15 | model_urls = {
16 | 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
17 | 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
18 | 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
19 | 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
20 | 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
21 | }
22 |
23 | label_colours = [(0, 0, 0),
24 | # 0=background
25 | (148, 65, 137), (255, 116, 69), (86, 156, 137),
26 | (202, 179, 158), (155, 99, 235), (161, 107, 108),
27 | (133, 160, 103), (76, 152, 126), (84, 62, 35),
28 | (44, 80, 130), (31, 184, 157), (101, 144, 77),
29 | (23, 197, 62), (141, 168, 145), (142, 151, 136),
30 | (115, 201, 77), (100, 216, 255), (57, 156, 36),
31 | (88, 108, 129), (105, 129, 112), (42, 137, 126),
32 | (155, 108, 249), (166, 148, 143), (81, 91, 87),
33 | (100, 124, 51), (73, 131, 121), (157, 210, 220),
34 | (134, 181, 60), (221, 223, 147), (123, 108, 131),
35 | (161, 66, 179), (163, 221, 160), (31, 146, 98),
36 | (99, 121, 30), (49, 89, 240), (116, 108, 9),
37 | (161, 176, 169), (80, 29, 135), (177, 105, 197),
38 | (139, 110, 246)]
39 |
40 |
41 | class CrossEntropyLoss2d(nn.Module):
42 | def __init__(self, weight=med_frq):
43 | super(CrossEntropyLoss2d, self).__init__()
44 | self.ce_loss = nn.CrossEntropyLoss(reduction='none')
45 |
46 | def forward(self, inputs_scales, targets_scales):
47 | losses = []
48 | for inputs, targets in zip(inputs_scales, targets_scales):
49 | mask = targets > 0
50 | targets_m = targets.clone()
51 | targets_m[mask] -= 1
52 | loss_all = self.ce_loss(inputs, targets_m.long())
53 | losses.append(torch.sum(torch.masked_select(loss_all, mask)) / torch.sum(mask.float()))
54 | total_loss = sum(losses)
55 | return total_loss
56 |
57 |
58 | def color_label(label):
59 | label = label.clone().cpu().data.numpy()
60 | colored_label = np.vectorize(lambda x: label_colours[int(x)])
61 |
62 | colored = np.asarray(colored_label(label)).astype(np.float32)
63 | colored = colored.squeeze()
64 |
65 | try:
66 | return torch.from_numpy(colored.transpose([1, 0, 2, 3]))
67 | except ValueError:
68 | return torch.from_numpy(colored[np.newaxis, ...])
69 |
70 |
71 | def print_log(global_step, epoch, local_count, count_inter, dataset_size, loss, time_inter):
72 | print('Step: {:>5} Train Epoch: {:>3} [{:>4}/{:>4} ({:3.1f}%)] '
73 | 'Loss: {:.6f} [{:.2f}s every {:>4} data]'.format(
74 | global_step, epoch, local_count, dataset_size,
75 | 100. * local_count / dataset_size, loss.data, time_inter, count_inter))
76 |
77 |
78 | def save_ckpt(ckpt_dir, model, optimizer, global_step, epoch, local_count, num_train):
79 |     # usually this happens only at the start of an epoch
80 | epoch_float = epoch + (local_count / num_train)
81 | state = {
82 | 'global_step': global_step,
83 | 'epoch': epoch_float,
84 | 'state_dict': model.state_dict(),
85 | 'optimizer': optimizer.state_dict(),
86 | }
87 | ckpt_model_filename = "ckpt_epoch_{:0.2f}.pth".format(epoch_float)
88 | path = os.path.join(ckpt_dir, ckpt_model_filename)
89 | torch.save(state, path)
90 | print('{:>2} has been successfully saved'.format(path))
91 |
92 |
93 | # def load_ckpt(model, optimizer, model_file, device):
94 | # if os.path.isfile(model_file):
95 | # print("=> loading checkpoint '{}'".format(model_file))
96 | # if device.type == 'cuda':
97 | # checkpoint = torch.load(model_file)
98 | # else:
99 | # checkpoint = torch.load(model_file, map_location=lambda storage, loc: storage)
100 | # model.load_state_dict(checkpoint['state_dict'])
101 | # if optimizer:
102 | # optimizer.load_state_dict(checkpoint['optimizer'])
103 | # print("=> loaded checkpoint '{}' (epoch {})"
104 | # .format(model_file, checkpoint['epoch']))
105 | # step = checkpoint['global_step']
106 | # epoch = checkpoint['epoch']
107 | # return step, epoch
108 | # else:
109 | # print("=> no checkpoint found at '{}'".format(model_file))
110 | # os._exit(0)
111 |
112 |
113 | def load_ckpt(model, optimizer, model_file, device):
114 | if os.path.isfile(model_file):
115 | print("=> loading checkpoint '{}'".format(model_file))
116 | if device.type == 'cuda':
117 | checkpoint = torch.load(model_file)
118 | else:
119 | checkpoint = torch.load(model_file, map_location=lambda storage, loc: storage)
120 | state_dict = checkpoint['model_state']
121 | prefix = 'module.'
122 | state_dict = {
123 | (k[len(prefix):] if k[:len(prefix)] == prefix else k): v for k, v in state_dict.items()
124 | }
125 | model.load_state_dict(state_dict)
126 | if optimizer:
127 | optimizer.load_state_dict(checkpoint['optimizer'])
128 | print("=> loaded checkpoint '{}' (epoch {})"
129 | .format(model_file, checkpoint['epoch']))
130 | # step = checkpoint['global_step']
131 | # epoch = checkpoint['epoch']
132 | # return step, epoch
133 | else:
134 | print("=> no checkpoint found at '{}'".format(model_file))
135 | os._exit(0)
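# Note: load_ckpt expects a checkpoint dict with a 'model_state' entry (stripping any
# DataParallel 'module.' prefixes) plus 'optimizer' and 'epoch', whereas save_ckpt above
# writes 'state_dict' / 'optimizer' / 'epoch' / 'global_step', so the two functions target
# different checkpoint formats.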
--------------------------------------------------------------------------------
/envs/utils/fmm_planner.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import numpy as np
3 | import skfmm
4 | import skimage
5 | from numpy import around, array, ma
6 |
7 |
8 | def get_mask(sx, sy, scale, step_size):
9 | size = int(step_size // scale) * 2 + 1
10 | mask = np.zeros((size, size))
11 | for i in range(size):
12 | for j in range(size):
13 | if ((i + 0.5) - (size // 2 + sx)) ** 2 + \
14 | ((j + 0.5) - (size // 2 + sy)) ** 2 <= \
15 | step_size ** 2 \
16 | and ((i + 0.5) - (size // 2 + sx)) ** 2 + \
17 | ((j + 0.5) - (size // 2 + sy)) ** 2 > \
18 | (step_size - 1) ** 2:
19 | mask[i, j] = 1
20 |
21 | mask[size // 2, size // 2] = 1
22 | return mask
23 |
24 |
25 | def get_dist(sx, sy, scale, step_size):
26 | size = int(step_size // scale) * 2 + 1
27 | mask = np.zeros((size, size)) + 1e-10
28 | for i in range(size):
29 | for j in range(size):
30 | if ((i + 0.5) - (size // 2 + sx)) ** 2 + \
31 | ((j + 0.5) - (size // 2 + sy)) ** 2 <= \
32 | step_size ** 2:
33 | mask[i, j] = max(5,
34 | (((i + 0.5) - (size // 2 + sx)) ** 2 +
35 | ((j + 0.5) - (size // 2 + sy)) ** 2) ** 0.5)
36 | return mask
37 |
38 |
39 | class FMMPlanner():
40 | def __init__(self, traversible, scale=1, step_size=5):
41 | self.scale = scale
42 | self.step_size = step_size
43 | if scale != 1.:
44 | self.traversible = cv2.resize(traversible,
45 | (traversible.shape[1] // scale,
46 | traversible.shape[0] // scale),
47 | interpolation=cv2.INTER_NEAREST)
48 | self.traversible = np.rint(self.traversible)
49 | else:
50 | self.traversible = traversible
51 |
52 | self.du = int(self.step_size / (self.scale * 1.))
53 | self.fmm_dist = None
54 | self.around = np.zeros(traversible.shape)
55 |
56 | kernel = cv2.getStructuringElement(cv2.MORPH_RECT,(9, 9))
57 | self.traversible_around = cv2.dilate((1-self.traversible*1).astype('uint8'), kernel)
58 | traversible_around_ma = ma.masked_values(self.traversible_around*1, 0)
59 | traversible_around_ma[self.traversible==0] = 0
60 | if np.any(traversible_around_ma == 0):
61 | dd = skfmm.distance(traversible_around_ma, dx=1)
62 | dd = (np.max(dd) - dd)
63 | dd = ma.filled(dd, 0)
64 | self.around = dd
65 |
66 |
67 | def set_goal(self, goal, auto_improve=False):
68 | traversible_ma = ma.masked_values(self.traversible * 1, 0)
69 | goal_x, goal_y = int(goal[0] / (self.scale * 1.)), \
70 | int(goal[1] / (self.scale * 1.))
71 |
72 | if self.traversible[goal_x, goal_y] == 0. and auto_improve:
73 | goal_x, goal_y = self._find_nearest_goal([goal_x, goal_y])
74 |
75 | traversible_ma[goal_x, goal_y] = 0
76 | dd = skfmm.distance(traversible_ma, dx=1)
77 | dd = ma.filled(dd, np.max(dd) + 1)
78 | self.fmm_dist = dd
79 | return
80 |
81 | def set_multi_goal(self, goal_map):
82 |         traversible_ma = ma.masked_values(self.traversible * 1, 0)  # mask out obstacle cells (value 0), keeping only free space
83 |         traversible_ma[goal_map == 1] = 0  # on the obstacle-masked map, goal cells become 0 and all other free cells stay 1
84 |         dd = skfmm.distance(traversible_ma, dx=1)
85 |         dd = ma.filled(dd, np.max(dd) + 1)  # fill masked (obstacle) cells with max(dd) + 1, i.e. the largest distance
86 | self.fmm_dist = dd
87 | self.around[goal_map == 1] = 0
88 | return
89 |
90 | def get_short_term_goal(self, state):
91 | scale = self.scale * 1.
92 | state = [x / scale for x in state]
93 | dx, dy = state[0] - int(state[0]), state[1] - int(state[1])
94 | mask = get_mask(dx, dy, scale, self.step_size)
95 | dist_mask = get_dist(dx, dy, scale, self.step_size)
96 |
97 | state = [int(x) for x in state]
98 | self.fmm_dist += self.around
99 | dist = np.pad(self.fmm_dist, self.du,
100 | 'constant', constant_values=self.fmm_dist.shape[0] ** 2)
101 | subset = dist[state[0]:state[0] + 2 * self.du + 1,
102 | state[1]:state[1] + 2 * self.du + 1]
103 |
104 | dist_around = np.pad(self.around, self.du,
105 | 'constant', constant_values=self.fmm_dist.shape[0] ** 2)
106 | subset_around = dist_around[state[0]:state[0] + 2 * self.du + 1,
107 | state[1]:state[1] + 2 * self.du + 1]
108 |
109 | assert subset.shape[0] == 2 * self.du + 1 and \
110 | subset.shape[1] == 2 * self.du + 1, \
111 | "Planning error: unexpected subset shape {}".format(subset.shape)
112 |
113 | subset *= mask
114 | subset += (1 - mask) * self.fmm_dist.shape[0] ** 2
115 |
116 | if subset[self.du, self.du] < 0.25 * 100 / 5.: # 25cm
117 | stop = True
118 | else:
119 | stop = False
120 |
121 | subset -= subset[self.du, self.du]
122 | ratio1 = subset / dist_mask
123 | subset[ratio1 < -1.5] = 1
124 |
125 | (stg_x, stg_y) = np.unravel_index(np.argmin(subset), subset.shape)
126 |
127 |         # remove the influence of the around map
128 | if subset[stg_x, stg_y]+subset_around[self.du, self.du] > -0.0001:
129 | replan = True
130 | else:
131 | replan = False
132 |
133 | return (stg_x + state[0] - self.du) * scale, \
134 | (stg_y + state[1] - self.du) * scale, replan, stop
135 |
136 | def _find_nearest_goal(self, goal):
137 | traversible = skimage.morphology.binary_dilation(
138 | np.zeros(self.traversible.shape),
139 | skimage.morphology.disk(2)) != True
140 | traversible = traversible * 1.
141 | planner = FMMPlanner(traversible)
142 | planner.set_goal(goal)
143 |
144 | mask = self.traversible
145 |
146 | dist_map = planner.fmm_dist * mask
147 | dist_map[dist_map == 0] = dist_map.max()
148 |
149 | goal = np.unravel_index(dist_map.argmin(), dist_map.shape)
150 |
151 | return goal
152 |
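# Usage sketch: traversible is a binary map with 1 for free space and 0 for obstacles, and
# positions are given in map cells; the variable names are illustrative.
#   planner = FMMPlanner(traversible, step_size=5)
#   planner.set_multi_goal(goal_map)               # goal_map: 1 at goal cells, 0 elsewhere
#   stg_row, stg_col, replan, stop = planner.get_short_term_goal((row, col))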
--------------------------------------------------------------------------------
/agents/utils/semantic_prediction.py:
--------------------------------------------------------------------------------
1 | # The following code is largely borrowed from
2 | # https://github.com/facebookresearch/detectron2/blob/master/demo/demo.py and
3 | # https://github.com/facebookresearch/detectron2/blob/master/demo/predictor.py
4 |
5 | import argparse
6 | import time
7 |
8 | import torch
9 | import numpy as np
10 |
11 | from detectron2.config import get_cfg
12 | from detectron2.utils.logger import setup_logger
13 | from detectron2.data.catalog import MetadataCatalog
14 | from detectron2.modeling import build_model
15 | from detectron2.checkpoint import DetectionCheckpointer
16 | from detectron2.utils.visualizer import ColorMode, Visualizer
17 | import detectron2.data.transforms as T
18 |
19 | from constants import coco_categories_mapping
20 |
21 |
22 | class SemanticPredMaskRCNN():
23 |
24 | def __init__(self, args):
25 | self.segmentation_model = ImageSegmentation(args)
26 | self.args = args
27 |
28 | def get_prediction(self, img):
29 | args = self.args
30 | image_list = []
31 | img = img[:, :, ::-1]
32 | image_list.append(img)
33 | seg_predictions, vis_output = self.segmentation_model.get_predictions(
34 | image_list, visualize=args.visualize == 2)
35 |
36 | if args.visualize == 2:
37 | img = vis_output.get_image()
38 |
39 | semantic_input = np.zeros((img.shape[0], img.shape[1], 15 + 1))
40 |
41 | for j, class_idx in enumerate(
42 | seg_predictions[0]['instances'].pred_classes.cpu().numpy()):
43 | if class_idx in list(coco_categories_mapping.keys()):
44 | idx = coco_categories_mapping[class_idx]
45 | obj_mask = seg_predictions[0]['instances'].pred_masks[j] * 1.
46 | semantic_input[:, :, idx] += obj_mask.cpu().numpy()
47 |
48 | return semantic_input, img
49 |
50 |
51 | def compress_sem_map(sem_map):
52 | c_map = np.zeros((sem_map.shape[1], sem_map.shape[2]))
53 | for i in range(sem_map.shape[0]):
54 | c_map[sem_map[i] > 0.] = i + 1
55 | return c_map
56 |
57 |
58 | class ImageSegmentation():
59 | def __init__(self, args):
60 | string_args = """
61 | --config-file configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml
62 | --input input1.jpeg
63 | --confidence-threshold {}
64 | --opts MODEL.WEIGHTS
65 | detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl
66 | """.format(args.sem_pred_prob_thr)
67 |
68 | if args.sem_gpu_id == -2:
69 | string_args += """ MODEL.DEVICE cpu"""
70 | else:
71 | string_args += """ MODEL.DEVICE cuda:{}""".format(args.sem_gpu_id)
72 |
73 | string_args = string_args.split()
74 |
75 | args = get_seg_parser().parse_args(string_args)
76 | logger = setup_logger()
77 | logger.info("Arguments: " + str(args))
78 |
79 | cfg = setup_cfg(args)
80 | self.demo = VisualizationDemo(cfg)
81 |
82 | def get_predictions(self, img, visualize=0):
83 | return self.demo.run_on_image(img, visualize=visualize)
84 |
85 |
86 | def setup_cfg(args):
87 | # load config from file and command-line arguments
88 | cfg = get_cfg()
89 | cfg.merge_from_file(args.config_file)
90 | cfg.merge_from_list(args.opts)
91 | # Set score_threshold for builtin models
92 | cfg.MODEL.RETINANET.SCORE_THRESH_TEST = args.confidence_threshold
93 | cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = args.confidence_threshold
94 | cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = \
95 | args.confidence_threshold
96 | cfg.freeze()
97 | return cfg
98 |
99 |
100 | def get_seg_parser():
101 | parser = argparse.ArgumentParser(
102 | description="Detectron2 demo for builtin models")
103 | parser.add_argument(
104 | "--config-file",
105 | default="configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml",
106 | metavar="FILE",
107 | help="path to config file",
108 | )
109 | parser.add_argument(
110 | "--webcam",
111 | action="store_true",
112 | help="Take inputs from webcam.")
113 | parser.add_argument("--video-input", help="Path to video file.")
114 | parser.add_argument(
115 | "--input",
116 | nargs="+",
117 | help="A list of space separated input images")
118 | parser.add_argument(
119 | "--output",
120 | help="A file or directory to save output visualizations. "
121 | "If not given, will show output in an OpenCV window.",
122 | )
123 |
124 | parser.add_argument(
125 | "--confidence-threshold",
126 | type=float,
127 | default=0.5,
128 | help="Minimum score for instance predictions to be shown",
129 | )
130 | parser.add_argument(
131 | "--opts",
132 | help="Modify config options using the command-line 'KEY VALUE' pairs",
133 | default=[],
134 | nargs=argparse.REMAINDER,
135 | )
136 | return parser
137 |
138 |
139 | class VisualizationDemo(object):
140 | def __init__(self, cfg, instance_mode=ColorMode.IMAGE):
141 | """
142 | Args:
143 | cfg (CfgNode):
144 | instance_mode (ColorMode):
145 | """
146 | self.metadata = MetadataCatalog.get(
147 | cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused"
148 | )
149 | self.cpu_device = torch.device("cpu")
150 | self.instance_mode = instance_mode
151 |
152 | self.predictor = BatchPredictor(cfg)
153 |
154 | def run_on_image(self, image_list, visualize=0):
155 | """
156 | Args:
157 |             image_list (list of np.ndarray): images of shape (H, W, C)
158 |                 (in BGR order). This is the format used by OpenCV.
159 |
160 | Returns:
161 | predictions (dict): the output of the model.
162 | vis_output (VisImage): the visualized image output.
163 | """
164 | vis_output = None
165 | all_predictions = self.predictor(image_list)
166 | # Convert image from OpenCV BGR format to Matplotlib RGB format.
167 |
168 | if visualize:
169 | predictions = all_predictions[0]
170 | image = image_list[0]
171 | visualizer = Visualizer(
172 | image, self.metadata, instance_mode=self.instance_mode)
173 | if "panoptic_seg" in predictions:
174 | panoptic_seg, segments_info = predictions["panoptic_seg"]
175 | vis_output = visualizer.draw_panoptic_seg_predictions(
176 | panoptic_seg.to(self.cpu_device), segments_info
177 | )
178 | else:
179 | if "sem_seg" in predictions:
180 | vis_output = visualizer.draw_sem_seg(
181 | predictions["sem_seg"].argmax(
182 | dim=0).to(self.cpu_device)
183 | )
184 | if "instances" in predictions:
185 | instances = predictions["instances"].to(self.cpu_device)
186 | vis_output = visualizer.draw_instance_predictions(
187 | predictions=instances)
188 |
189 | return all_predictions, vis_output
190 |
191 |
192 | class BatchPredictor:
193 | """
194 | Create a simple end-to-end predictor with the given config that runs on
195 | single device for a list of input images.
196 |
197 | Compared to using the model directly, this class does the following
198 | additions:
199 |
200 | 1. Load checkpoint from `cfg.MODEL.WEIGHTS`.
201 | 2. Always take BGR image as the input and apply conversion defined by
202 | `cfg.INPUT.FORMAT`.
203 | 3. Apply resizing defined by `cfg.INPUT.{MIN,MAX}_SIZE_TEST`.
204 | 4. Take a list of input images
205 |
206 | Attributes:
207 | metadata (Metadata): the metadata of the underlying dataset, obtained
208 | from cfg.DATASETS.TEST.
209 |
210 | """
211 |
212 | def __init__(self, cfg):
213 | self.cfg = cfg.clone() # cfg can be modified by model
214 | self.model = build_model(self.cfg)
215 | self.model.eval()
216 | self.metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0])
217 |
218 | checkpointer = DetectionCheckpointer(self.model)
219 | checkpointer.load(cfg.MODEL.WEIGHTS)
220 |
221 | self.input_format = cfg.INPUT.FORMAT
222 | assert self.input_format in ["RGB", "BGR"], self.input_format
223 |
224 | def __call__(self, image_list):
225 | """
226 | Args:
227 | image_list (list of np.ndarray): a list of images of
228 | shape (H, W, C) (in BGR order).
229 |
230 | Returns:
231 | predictions (dict):
232 | the output of the model for all images.
233 | See :doc:`/tutorials/models` for details about the format.
234 | """
235 | inputs = []
236 | for original_image in image_list:
237 | # https://github.com/sphinx-doc/sphinx/issues/4258
238 | # Apply pre-processing to image.
239 | if self.input_format == "RGB":
240 | # whether the model expects BGR inputs or RGB
241 | original_image = original_image[:, :, ::-1]
242 | height, width = original_image.shape[:2]
243 | image = original_image
244 | image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))
245 |
246 | instance = {"image": image, "height": height, "width": width}
247 |
248 | inputs.append(instance)
249 |
250 | with torch.no_grad():
251 | predictions = self.model(inputs)
252 | return predictions
253 |
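# Usage sketch: `args` must carry sem_pred_prob_thr, sem_gpu_id and visualize, as read above;
# the input frame is illustrative.
#   sem_pred = SemanticPredMaskRCNN(args)
#   semantic_input, vis_img = sem_pred.get_prediction(rgb)   # rgb: (H, W, 3) uint8, RGB order
#   # semantic_input: (H, W, 16) masks for the 15 mapped COCO categories plus one extra channel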
--------------------------------------------------------------------------------
/envs/habitat/__init__.py:
--------------------------------------------------------------------------------
1 | # Parts of the code in this file have been borrowed from:
2 | # https://github.com/facebookresearch/habitat-api
3 | import os
4 | import pdb
5 | import sys
6 |
7 | import numpy as np
8 | import torch
9 | from habitat.config.default import get_config as cfg_env
10 | from habitat.datasets.pointnav.pointnav_dataset import PointNavDatasetV1
11 | from habitat import Config, Env, RLEnv, VectorEnv, make_dataset
12 |
13 | from agents.sem_exp import Sem_Exp_Env_Agent
14 | from agents.multi_attr_exp import Multi_Attr_Exp_Env_Agent
15 | from .objectgoal_env import ObjectGoal_Env
16 | from .objectgoal_env21 import ObjectGoal_Env21
17 |
18 | from .utils.vector_env import VectorEnv
19 |
20 |
21 | def make_env_fn(args, config_env, rank):
22 | dataset = make_dataset(config_env.DATASET.TYPE, config=config_env.DATASET)
23 | config_env.defrost()
24 | config_env.SIMULATOR.SCENE = dataset.episodes[0].scene_id
25 | config_env.freeze()
26 | if args.agent == "sem_exp":
27 | env = Sem_Exp_Env_Agent(args=args, rank=rank,
28 | config_env=config_env,
29 | dataset=dataset
30 | )
31 | elif args.agent=='multi_attr_exp':
32 | env = Multi_Attr_Exp_Env_Agent(args=args, rank=rank,
33 | config_env=config_env,
34 | dataset=dataset
35 | )
36 | elif args.agent == "obj21":
37 | env = ObjectGoal_Env21(args=args, rank=rank,
38 | config_env=config_env,
39 | dataset=dataset
40 | )
41 | else:
42 | env = ObjectGoal_Env(args=args, rank=rank,
43 | config_env=config_env,
44 | dataset=dataset
45 | )
46 |
47 | env.seed(rank)
48 | return env
49 |
50 |
51 | def _get_scenes_from_folder(content_dir):
52 | scene_dataset_ext = ".glb.json.gz"
53 | scenes = []
54 | for filename in os.listdir(content_dir):
55 | if filename.endswith(scene_dataset_ext):
56 | scene = filename[: -len(scene_dataset_ext) + 4]
57 | scenes.append(scene)
58 | scenes.sort()
59 | return scenes
60 |
61 |
62 | def construct_envs(args):
63 | env_configs = []
64 | args_list = []
65 |
66 | basic_config = cfg_env(config_paths=["envs/habitat/configs/"
67 | + args.task_config])
68 | basic_config.defrost()
69 | basic_config.DATASET.SPLIT = args.split
70 |
71 | # basic_config.DATASET.DATA_PATH = \
72 | # basic_config.DATASET.DATA_PATH.replace("v1", args.version)
73 | # basic_config.DATASET.EPISODES_DIR = \
74 | # basic_config.DATASET.EPISODES_DIR.replace("v1", args.version)
75 | basic_config.freeze()
76 |
77 | scenes = basic_config.DATASET.CONTENT_SCENES
78 | if "*" in basic_config.DATASET.CONTENT_SCENES:
79 | content_dir = os.path.join("data/datasets/objectnav/gibson/v1.1/" + args.split, "content")
80 | scenes = _get_scenes_from_folder(content_dir)
81 |
82 | if len(scenes) > 0:
83 | assert len(scenes) >= args.num_processes, (
84 | "reduce the number of processes as there "
85 | "aren't enough number of scenes"
86 | )
87 |
88 | scene_split_sizes = [int(np.floor(len(scenes) / args.num_processes))
89 | for _ in range(args.num_processes)]
90 | for i in range(len(scenes) % args.num_processes):
91 | scene_split_sizes[i] += 1
92 |
93 | print("Scenes per thread:")
94 | for i in range(args.num_processes):
95 | config_env = cfg_env(config_paths=["envs/habitat/configs/"
96 | + args.task_config])
97 | config_env.defrost()
98 |
99 | if len(scenes) > 0:
100 | config_env.DATASET.CONTENT_SCENES = scenes[
101 | sum(scene_split_sizes[:i]):
102 | sum(scene_split_sizes[:i + 1])
103 | ]
104 | print("Thread {}: {}".format(i, config_env.DATASET.CONTENT_SCENES))
105 |
106 | if i < args.num_processes_on_first_gpu:
107 | gpu_id = 0
108 | else:
109 | gpu_id = int((i - args.num_processes_on_first_gpu)
110 | // args.num_processes_per_gpu) + args.sim_gpu_id
111 | gpu_id = min(torch.cuda.device_count() - 1, gpu_id)
112 | config_env.SIMULATOR.HABITAT_SIM_V0.GPU_DEVICE_ID = gpu_id
113 |
114 | agent_sensors = []
115 | agent_sensors.append("RGB_SENSOR")
116 | agent_sensors.append("DEPTH_SENSOR")
117 | agent_sensors.append("SEMANTIC_SENSOR")
118 |
119 | config_env.SIMULATOR.AGENT_0.SENSORS = agent_sensors
120 |
121 |         # Resetting episodes manually, setting high max episode length in sim
122 | config_env.ENVIRONMENT.MAX_EPISODE_STEPS = 10000000
123 | config_env.ENVIRONMENT.ITERATOR_OPTIONS.SHUFFLE = False
124 |
125 | config_env.SIMULATOR.RGB_SENSOR.WIDTH = args.env_frame_width
126 | config_env.SIMULATOR.RGB_SENSOR.HEIGHT = args.env_frame_height
127 | config_env.SIMULATOR.RGB_SENSOR.HFOV = args.hfov
128 | config_env.SIMULATOR.RGB_SENSOR.POSITION = [0, args.camera_height, 0]
129 |
130 | config_env.SIMULATOR.DEPTH_SENSOR.WIDTH = args.env_frame_width
131 | config_env.SIMULATOR.DEPTH_SENSOR.HEIGHT = args.env_frame_height
132 | config_env.SIMULATOR.DEPTH_SENSOR.HFOV = args.hfov
133 | config_env.SIMULATOR.DEPTH_SENSOR.MIN_DEPTH = args.min_depth
134 | config_env.SIMULATOR.DEPTH_SENSOR.MAX_DEPTH = args.max_depth
135 | config_env.SIMULATOR.DEPTH_SENSOR.POSITION = [0, args.camera_height, 0]
136 |
137 | config_env.SIMULATOR.SEMANTIC_SENSOR.WIDTH = args.env_frame_width
138 | config_env.SIMULATOR.SEMANTIC_SENSOR.HEIGHT = args.env_frame_height
139 | config_env.SIMULATOR.SEMANTIC_SENSOR.HFOV = args.hfov
140 | config_env.SIMULATOR.SEMANTIC_SENSOR.POSITION = \
141 | [0, args.camera_height, 0]
142 |
143 | config_env.SIMULATOR.TURN_ANGLE = args.turn_angle
144 | config_env.DATASET.SPLIT = args.split
145 | # config_env.DATASET.DATA_PATH = \
146 | # config_env.DATASET.DATA_PATH.replace("v1", args.version)
147 | # config_env.DATASET.EPISODES_DIR = \
148 | # config_env.DATASET.EPISODES_DIR.replace("v1", args.version)
149 |
150 | config_env.freeze()
151 | env_configs.append(config_env)
152 |
153 | args_list.append(args)
154 |
155 | envs = VectorEnv(
156 | make_env_fn=make_env_fn,
157 | env_fn_args=tuple(
158 | tuple(
159 | zip(args_list, env_configs, range(args.num_processes))
160 | )
161 | ),
162 | )
163 |
164 | return envs
165 |
166 |
167 | def construct_envs21(args):
168 | env_configs = []
169 | args_list = []
170 |
171 | basic_config = cfg_env(config_paths=["envs/habitat/configs/"
172 | + args.task_config])
173 | basic_config.defrost()
174 | basic_config.DATASET.SPLIT = args.split
175 | basic_config.DATASET.DATA_PATH = \
176 | basic_config.DATASET.DATA_PATH.replace("v1", args.version)
177 | # basic_config.DATASET.EPISODES_DIR = \
178 | # basic_config.DATASET.EPISODES_DIR.replace("v1", args.version)
179 | basic_config.freeze()
180 |
181 | scenes = basic_config.DATASET.CONTENT_SCENES
182 | dataset = make_dataset(basic_config.DATASET.TYPE, config=basic_config.DATASET)
183 | if "*" in basic_config.DATASET.CONTENT_SCENES:
184 | scenes = dataset.get_scenes_to_load(basic_config.DATASET)
185 |
186 | if len(scenes) > 0:
187 | assert len(scenes) >= args.num_processes, (
188 | "reduce the number of processes as there "
189 |             "aren't enough scenes"
190 | )
191 |
192 | scene_split_sizes = [int(np.floor(len(scenes) / args.num_processes))
193 | for _ in range(args.num_processes)]
194 | for i in range(len(scenes) % args.num_processes):
195 | scene_split_sizes[i] += 1
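   |         # A small illustrative example (mine, not from the source): 11 scenes split
   |         # over num_processes=4 gives scene_split_sizes == [3, 3, 3, 2], i.e. the
   |         # remainder scenes are absorbed by the first threads.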
196 |
197 | print("Scenes per thread:")
198 | for i in range(args.num_processes):
199 | config_env = cfg_env(config_paths=["envs/habitat/configs/"
200 | + args.task_config])
201 | config_env.defrost()
202 |
203 | if len(scenes) > 0:
204 | config_env.DATASET.CONTENT_SCENES = scenes[
205 | sum(scene_split_sizes[:i]):
206 | sum(scene_split_sizes[:i + 1])
207 | ]
208 | print("Thread {}: {}".format(i, config_env.DATASET.CONTENT_SCENES))
209 |
210 | if i < args.num_processes_on_first_gpu:
211 | gpu_id = 0
212 | else:
213 | gpu_id = int((i - args.num_processes_on_first_gpu)
214 | // args.num_processes_per_gpu) + args.sim_gpu_id
215 | gpu_id = min(torch.cuda.device_count() - 1, gpu_id)
216 | config_env.SIMULATOR.HABITAT_SIM_V0.GPU_DEVICE_ID = gpu_id
217 |
218 | agent_sensors = []
219 | agent_sensors.append("RGB_SENSOR")
220 | agent_sensors.append("DEPTH_SENSOR")
221 | agent_sensors.append("SEMANTIC_SENSOR")
222 |
223 | config_env.SIMULATOR.AGENT_0.SENSORS = agent_sensors
224 |
225 |         # Resetting episodes manually, setting a high max episode length in sim
226 | config_env.ENVIRONMENT.MAX_EPISODE_STEPS = 10000000
227 | config_env.ENVIRONMENT.ITERATOR_OPTIONS.SHUFFLE = False
228 |
229 | config_env.SIMULATOR.RGB_SENSOR.WIDTH = args.env_frame_width
230 | config_env.SIMULATOR.RGB_SENSOR.HEIGHT = args.env_frame_height
231 | config_env.SIMULATOR.RGB_SENSOR.HFOV = args.hfov
232 | config_env.SIMULATOR.RGB_SENSOR.POSITION = [0, args.camera_height, 0]
233 |
234 | config_env.SIMULATOR.DEPTH_SENSOR.WIDTH = args.env_frame_width
235 | config_env.SIMULATOR.DEPTH_SENSOR.HEIGHT = args.env_frame_height
236 | config_env.SIMULATOR.DEPTH_SENSOR.HFOV = args.hfov
237 | config_env.SIMULATOR.DEPTH_SENSOR.MIN_DEPTH = args.min_depth
238 | config_env.SIMULATOR.DEPTH_SENSOR.MAX_DEPTH = args.max_depth
239 | config_env.SIMULATOR.DEPTH_SENSOR.POSITION = [0, args.camera_height, 0]
240 |
241 | config_env.SIMULATOR.SEMANTIC_SENSOR.WIDTH = args.env_frame_width
242 | config_env.SIMULATOR.SEMANTIC_SENSOR.HEIGHT = args.env_frame_height
243 | config_env.SIMULATOR.SEMANTIC_SENSOR.HFOV = args.hfov
244 | config_env.SIMULATOR.SEMANTIC_SENSOR.POSITION = \
245 | [0, args.camera_height, 0]
246 |
247 |
248 | config_env.SIMULATOR.TURN_ANGLE = args.turn_angle
249 | config_env.DATASET.SPLIT = args.split
250 | config_env.DATASET.DATA_PATH = \
251 | config_env.DATASET.DATA_PATH.replace("v1", args.version)
252 | # config_env.DATASET.EPISODES_DIR = \
253 | # config_env.DATASET.EPISODES_DIR.replace("v1", args.version)
254 |
255 | config_env.freeze()
256 | env_configs.append(config_env)
257 |
258 | args_list.append(args)
259 |
260 | envs = VectorEnv(
261 | make_env_fn=make_env_fn,
262 | env_fn_args=tuple(
263 | tuple(
264 | zip(args_list, env_configs, range(args.num_processes))
265 | )
266 | ),
267 | )
268 |
269 | return envs
270 |
271 |
272 |
273 |
--------------------------------------------------------------------------------
/envs/habitat/objectgoal_env21.py:
--------------------------------------------------------------------------------
1 | import json
2 | import bz2
3 | import gzip
4 | import _pickle as cPickle
5 | import pdb
6 |
7 | import gym
8 | import numpy as np
9 | import quaternion
10 | import skimage.morphology
11 | import habitat
12 |
13 | from envs.utils.fmm_planner import FMMPlanner
14 | from constants import category_to_id, mp3d_category_id
15 | import envs.utils.pose as pu
16 |
17 | coco_categories = [0, 3, 2, 4, 5, 1]
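   | # Descriptive note: reset() uses this list to remap the dataset's `objectgoal`
   | # index (the same index used to look up the goal name in constants.category_to_id)
   | # to the internal goal category id used downstream.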
18 |
19 | class ObjectGoal_Env21(habitat.RLEnv):
20 | """The Object Goal Navigation environment class. The class is responsible
21 | for loading the dataset, generating episodes, and computing evaluation
22 | metrics.
23 | """
24 |
25 | def __init__(self, args, rank, config_env, dataset):
26 | self.args = args
27 | self.rank = rank
28 |
29 | super().__init__(config_env, dataset)
30 |
31 | # Initializations
32 | self.episode_no = 0
33 |
34 | # Scene info
35 | self.last_scene_path = None
36 | self.scene_path = None
37 | self.scene_name = None
38 |
39 | # Episode Dataset info
40 | self.eps_data = None
41 | self.eps_data_idx = None
42 | self.gt_planner = None
43 | self.object_boundary = None
44 | self.goal_idx = None
45 | self.goal_name = None
46 | self.map_obj_origin = None
47 | self.starting_loc = None
48 | self.starting_distance = None
49 |
50 | # Episode tracking info
51 | self.curr_distance = None
52 | self.prev_distance = None
53 | self.timestep = None
54 | self.stopped = None
55 | self.path_length = None
56 | self.last_sim_location = None
57 | self.trajectory_states = []
58 | self.info = {}
59 | self.info['distance_to_goal'] = None
60 | self.info['spl'] = None
61 | self.info['success'] = None
62 |
63 | # self.scene = self._env.sim.semantic_annotations()
64 |
65 | fileName = 'data/matterport_category_mappings.tsv'
66 |
67 | text = ''
68 | lines = []
69 | items = []
70 | self.hm3d_semantic_mapping={}
71 |
72 | with open(fileName, 'r') as f:
73 | text = f.read()
74 | lines = text.split('\n')
75 |
76 | for l in lines:
77 | items.append(l.split(' '))
78 |
79 | for i in items:
80 | if len(i) > 3:
81 | self.hm3d_semantic_mapping[i[2]] = i[-1]
82 |
83 | # print()
84 |
85 | # for obj in self.scene.objects:
86 | # if obj is not None:
87 | # print(
88 | # f"Object id:{obj.id}, category:{obj.category.name()}, index:{obj.category.index()}"
89 | # f" center:{obj.aabb.center}, dims:{obj.aabb.sizes}"
90 | # )
91 |
92 | def reset(self):
93 | """Resets the environment to a new episode.
94 |
95 | Returns:
96 |             obs (ndarray): stacked RGB, depth and semantic observations (channels first)
97 | info (dict): contains timestep, pose, goal category and
98 | evaluation metric info
99 | """
100 | args = self.args
101 | # new_scene = self.episode_no % args.num_train_episodes == 0
102 |
103 |
104 | self.episode_no += 1
105 |
106 | # Initializations
107 | self.timestep = 0
108 | self.stopped = False
109 | self.path_length = 1e-5
110 | self.trajectory_states = []
111 |
112 | # if new_scene:
113 | obs = super().reset()
114 | start_height = 0
115 | self.scene = self._env.sim.semantic_annotations()
116 | # start_height = self._env.current_episode.start_position[1]
117 | # goal_height = self.scene.objects[self._env.current_episode.info['closest_goal_object_id']].aabb.center[1]
118 |
119 | # floor_height = []
120 | # floor_size = []
121 | # for obj in self.scene.objects:
122 | # if obj.category.name() in self.hm3d_semantic_mapping and \
123 | # self.hm3d_semantic_mapping[obj.category.name()] == 'floor':
124 | # floor_height.append(abs(obj.aabb.center[1] - start_height))
125 | # floor_size.append(obj.aabb.sizes[0]*obj.aabb.sizes[2])
126 |
127 |
128 | # if not args.eval:
129 | # while all(h > 0.1 or s < 12 for (h,s) in zip(floor_height, floor_size)) or abs(start_height-goal_height) > 1.2:
130 | # obs = super().reset()
131 |
132 | # self.scene = self._env.sim.semantic_annotations()
133 | # start_height = self._env.current_episode.start_position[1]
134 | # goal_height = self.scene.objects[self._env.current_episode.info['closest_goal_object_id']].aabb.center[1]
135 |
136 | # floor_height = []
137 | # floor_size = []
138 | # for obj in self.scene.objects:
139 | # if obj.category.name() in self.hm3d_semantic_mapping and \
140 | # self.hm3d_semantic_mapping[obj.category.name()] == 'floor':
141 | # floor_height.append(abs(obj.aabb.center[1] - start_height))
142 | # floor_size.append(obj.aabb.sizes[0]*obj.aabb.sizes[2])
143 |
144 | self.prev_distance = self._env.get_metrics()["distance_to_goal"]
145 | self.starting_distance = self._env.get_metrics()["distance_to_goal"]
146 | self.info['starting_distance_to_goal']=self.starting_distance
147 |
148 | # print("obs: ,", obs)
149 | # print("obs shape: ,", obs.shape)
150 | rgb = obs['rgb'].astype(np.uint8)
151 | depth = obs['depth']
152 | semantic = self._preprocess_semantic(obs["semantic"])
153 |
154 | state = np.concatenate((rgb, depth, semantic), axis=2).transpose(2, 0, 1)
155 | self.last_sim_location = self.get_sim_location()
156 |
157 | # Set info
158 | self.info['time'] = self.timestep
159 | self.info['sensor_pose'] = [0., 0., 0.]
160 | self.info['goal_cat_id'] = coco_categories[obs['objectgoal'][0]]
161 | self.info['goal_name'] = category_to_id[obs['objectgoal'][0]]
162 |
163 | self.goal_name = category_to_id[obs['objectgoal'][0]]
164 |
165 | return state, self.info
166 |
167 | def step(self, action):
168 | """Function to take an action in the environment.
169 |
170 | Args:
171 | action (dict):
172 | dict with following keys:
173 | 'action' (int): 0: stop, 1: forward, 2: left, 3: right
174 |
175 | Returns:
176 |             obs (ndarray): stacked RGB, depth and semantic observations (channels first)
177 | reward (float): amount of reward returned after previous action
178 | done (bool): whether the episode has ended
179 | info (dict): contains timestep, pose, goal category and
180 | evaluation metric info
181 | """
182 | action = action["action"]
183 | if action == 0:
184 | self.stopped = True
185 | # Not sending stop to simulator, resetting manually
186 | action = 3
187 |
188 | obs, rew, done, _ = super().step(action)
189 |
190 | # Get pose change
191 | dx, dy, do = self.get_pose_change()
192 | self.info['sensor_pose'] = [dx, dy, do]
193 | self.path_length += pu.get_l2_distance(0, dx, 0, dy)
194 |
195 | self.info['distance_to_goal'] = self._env.get_metrics()["distance_to_goal"]
196 | spl, success, dist = 0., 0., 0.
197 | if done:
198 | spl, success, dist = self.get_metrics()
199 | self.info['distance_to_goal'] = dist
200 | self.info['spl'] = spl
201 | self.info['success'] = success
202 |
203 | rgb = obs['rgb'].astype(np.uint8)
204 | depth = obs['depth']
205 | semantic = self._preprocess_semantic(obs["semantic"])
206 | state = np.concatenate((rgb, depth, semantic), axis=2).transpose(2, 0, 1)
207 |
208 | self.timestep += 1
209 | self.info['time'] = self.timestep
210 |
211 |
212 |
213 |
214 | return state, rew, done, self.info
215 |
216 | def _preprocess_semantic(self, semantic):
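   |         # Descriptive note: `semantic` arrives as a per-pixel instance-id frame from
   |         # the simulator. Each instance id present in the frame has its category name
   |         # looked up (via the HM3D -> MP3D name mapping when available) and is
   |         # rewritten in place to the MP3D category index minus one; categories not in
   |         # mp3d_category_id are zeroed, so the returned uint8 frame holds category
   |         # labels rather than instance ids.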
217 | # print("*********semantic type: ", type(semantic))
218 | se = list(set(semantic.ravel()))
219 | # pdb.set_trace()
220 | for i in range(len(se)):
221 | if self.scene.objects[se[i]].category.name() in self.hm3d_semantic_mapping:
222 | hm3d_category_name = self.hm3d_semantic_mapping[self.scene.objects[se[i]].category.name()]
223 | else:
224 | hm3d_category_name = self.scene.objects[se[i]].category.name()
225 |
226 | if hm3d_category_name in mp3d_category_id:
227 | # print("sum: ", np.sum(sem_output[sem_output==se[i]])/se[i])
228 | semantic[semantic==se[i]] = mp3d_category_id[hm3d_category_name]-1
229 | else :
230 | semantic[
231 | semantic==se[i]
232 | ] = 0
233 |
234 | # se = list(set(semantic.ravel()))
235 | # print("semantic: ", se) # []
236 | # semantic = np.expand_dims(semantic.astype(np.uint8), 2)
237 | return semantic.astype(np.uint8)
238 |
239 | def get_reward_range(self):
240 |         """Not used directly; required by the Habitat-RLEnv interface."""
241 | return (0., 1.0)
242 |
243 | def get_reward(self, observations):
244 | self.curr_distance = self._env.get_metrics()['distance_to_goal']
245 |
246 | reward = (self.prev_distance - self.curr_distance) * \
247 | self.args.reward_coeff
248 |
249 | self.prev_distance = self.curr_distance
250 | return reward
251 |
252 | def get_llm_distance(self, target_point_map, frontier_loc_g):
253 |
254 | frontier_dis_g = self.gt_planner.fmm_dist[frontier_loc_g[0],
255 | frontier_loc_g[1]] / 20.0
256 | tpm = len(list(set(target_point_map.ravel()))) -1
257 | for lay in range(tpm):
258 | frontier_loc = np.argwhere(target_point_map == lay+1)
259 | frontier_distance = self.gt_planner.fmm_dist[frontier_loc[0],
260 | frontier_loc[1]] / 20.0
261 | if frontier_distance > frontier_dis_g:
262 | return 0
263 | return 1
264 |
265 | def get_metrics(self):
266 | """This function computes evaluation metrics for the Object Goal task
267 |
268 | Returns:
269 | spl (float): Success weighted by Path Length
270 | (See https://arxiv.org/pdf/1807.06757.pdf)
271 | success (int): 0: Failure, 1: Successful
272 | dist (float): Distance to Success (DTS), distance of the agent
273 | from the success threshold boundary in meters.
274 | (See https://arxiv.org/pdf/2007.00643.pdf)
275 | """
276 | dist = self._env.get_metrics()['distance_to_goal']
277 | if dist < 0.1:
278 | success = 1
279 | else:
280 | success = 0
281 | spl = min(success * self.starting_distance / self.path_length, 1)
282 | return spl, success, dist
283 |
284 | def get_done(self, observations):
285 | if self.info['time'] >= self.args.max_episode_length - 1:
286 | done = True
287 | elif self.stopped:
288 | done = True
289 | # print(self._env.get_metrics())
290 | else:
291 | done = False
292 | return done
293 |
294 | def _episode_success(self):
295 | return self._env.get_metrics()['success']
296 |
297 | def get_info(self, observations):
298 |         """Not used directly; required by the Habitat-RLEnv interface."""
299 | info = {}
300 | return info
301 |
302 | def get_sim_location(self):
303 | """Returns x, y, o pose of the agent in the Habitat simulator."""
304 |
305 | agent_state = super().habitat_env.sim.get_agent_state(0)
306 | x = -agent_state.position[2]
307 | y = -agent_state.position[0]
308 | axis = quaternion.as_euler_angles(agent_state.rotation)[0]
309 | if (axis % (2 * np.pi)) < 0.1 or (axis %
310 | (2 * np.pi)) > 2 * np.pi - 0.1:
311 | o = quaternion.as_euler_angles(agent_state.rotation)[1]
312 | else:
313 | o = 2 * np.pi - quaternion.as_euler_angles(agent_state.rotation)[1]
314 | if o > np.pi:
315 | o -= 2 * np.pi
316 | return x, y, o
317 |
318 | def get_pose_change(self):
319 | """Returns dx, dy, do pose change of the agent relative to the last
320 | timestep."""
321 | curr_sim_pose = self.get_sim_location()
322 | dx, dy, do = pu.get_rel_pose_change(
323 | curr_sim_pose, self.last_sim_location)
324 | self.last_sim_location = curr_sim_pose
325 | return dx, dy, do
326 |
--------------------------------------------------------------------------------
/envs/utils/depth_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 The TensorFlow Authors All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Utilities for processing depth images.
17 | """
18 | import pdb
19 | from argparse import Namespace
20 |
21 | import itertools
22 | import numpy as np
23 | import torch
24 |
25 | import envs.utils.rotation_utils as ru
26 |
27 |
28 | def get_camera_matrix(width, height, fov):
29 | """Returns a camera matrix from image size and fov."""
30 | xc = (width - 1.) / 2.
31 | zc = (height - 1.) / 2.
32 | f = (width / 2.) / np.tan(np.deg2rad(fov / 2.))
33 | camera_matrix = {'xc': xc, 'zc': zc, 'f': f}
34 | camera_matrix = Namespace(**camera_matrix)
35 | return camera_matrix
36 |
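   | # A rough usage sketch (mine, not from the original file), using the downscaled
   | # frame defaults from arguments.py (frame_width=160, frame_height=120, hfov=79):
   | #   cam = get_camera_matrix(160, 120, 79.0)
   | #   # cam.xc == 79.5, cam.zc == 59.5, cam.f == 80 / tan(39.5 deg) ~ 97.0 px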
37 |
38 | def get_point_cloud_from_z(Y, camera_matrix, scale=1):
39 | """Projects the depth image Y into a 3D point cloud.
40 | Inputs:
41 | Y is ...xHxW
42 | camera_matrix
43 | Outputs:
44 | X is positive going right
45 | Y is positive into the image
46 | Z is positive up in the image
47 | XYZ is ...xHxWx3
48 | """
49 | x, z = np.meshgrid(np.arange(Y.shape[-1]),
50 | np.arange(Y.shape[-2] - 1, -1, -1))
51 | for _ in range(Y.ndim - 2):
52 | x = np.expand_dims(x, axis=0)
53 | z = np.expand_dims(z, axis=0)
54 | X = (x[::scale, ::scale] - camera_matrix.xc) * \
55 | Y[::scale, ::scale] / camera_matrix.f
56 | Z = (z[::scale, ::scale] - camera_matrix.zc) * \
57 | Y[::scale, ::scale] / camera_matrix.f
58 | XYZ = np.concatenate((X[..., np.newaxis],
59 | Y[::scale, ::scale][..., np.newaxis],
60 | Z[..., np.newaxis]), axis=X.ndim)
61 | return XYZ
62 |
63 |
64 | def transform_camera_view(XYZ, sensor_height, camera_elevation_degree):
65 | """
66 | Transforms the point cloud into geocentric frame to account for
67 | camera elevation and angle
68 | Input:
69 | XYZ : ...x3
70 | sensor_height : height of the sensor
71 | camera_elevation_degree : camera elevation to rectify.
72 | Output:
73 | XYZ : ...x3
74 | """
75 | R = ru.get_r_matrix(
76 | [1., 0., 0.], angle=np.deg2rad(camera_elevation_degree))
77 | XYZ = np.matmul(XYZ.reshape(-1, 3), R.T).reshape(XYZ.shape)
78 | XYZ[..., 2] = XYZ[..., 2] + sensor_height
79 | return XYZ
80 |
81 |
82 | def transform_pose(XYZ, current_pose):
83 | """
84 | Transforms the point cloud into geocentric frame to account for
85 | camera position
86 | Input:
87 | XYZ : ...x3
88 | current_pose : camera position (x, y, theta (radians))
89 | Output:
90 | XYZ : ...x3
91 | """
92 | R = ru.get_r_matrix([0., 0., 1.], angle=current_pose[2] - np.pi / 2.)
93 | XYZ = np.matmul(XYZ.reshape(-1, 3), R.T).reshape(XYZ.shape)
94 | XYZ[:, :, 0] = XYZ[:, :, 0] + current_pose[0]
95 | XYZ[:, :, 1] = XYZ[:, :, 1] + current_pose[1]
96 | return XYZ
97 |
98 |
99 | def bin_points(XYZ_cms, map_size, z_bins, xy_resolution):
100 | """Bins points into xy-z bins
101 |     XYZ_cms is ... x H x W x 3
102 |     Output is ... x map_size x map_size x (len(z_bins)+1)
103 | """
104 | sh = XYZ_cms.shape
105 | XYZ_cms = XYZ_cms.reshape([-1, sh[-3], sh[-2], sh[-1]])
106 | n_z_bins = len(z_bins) + 1
107 | counts = []
108 | for XYZ_cm in XYZ_cms:
109 | isnotnan = np.logical_not(np.isnan(XYZ_cm[:, :, 0]))
110 | X_bin = np.round(XYZ_cm[:, :, 0] / xy_resolution).astype(np.int32)
111 | Y_bin = np.round(XYZ_cm[:, :, 1] / xy_resolution).astype(np.int32)
112 | Z_bin = np.digitize(XYZ_cm[:, :, 2], bins=z_bins).astype(np.int32)
113 |
114 | isvalid = np.array([X_bin >= 0, X_bin < map_size, Y_bin >= 0,
115 | Y_bin < map_size,
116 | Z_bin >= 0, Z_bin < n_z_bins, isnotnan])
117 | isvalid = np.all(isvalid, axis=0)
118 |
119 | ind = (Y_bin * map_size + X_bin) * n_z_bins + Z_bin
120 | ind[np.logical_not(isvalid)] = 0
121 | count = np.bincount(ind.ravel(), isvalid.ravel().astype(np.int32),
122 | minlength=map_size * map_size * n_z_bins)
123 | counts = np.reshape(count, [map_size, map_size, n_z_bins])
124 |
125 | counts = counts.reshape(list(sh[:-3]) + [map_size, map_size, n_z_bins])
126 |
127 | return counts
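   | # A small worked example (mine, not from the original source): with
   | # xy_resolution=5, z_bins=[25, 150] and e.g. map_size=100, a point at
   | # (x=12, y=7, z=60) cm falls into X_bin=2, Y_bin=1, Z_bin=1, so it increments
   | # counts[..., 1, 2, 1] (indexed Y, X, Z).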
128 |
129 | def bin_semantic_points(XYZ_cms, semantic, map_size, semantic_map_len, xy_resolution):
130 |     """Bins points into xy-semantic-category bins
131 |     XYZ_cms is H x W x 3
132 |     semantic is H x W
133 |     Output is map_size x map_size x semantic_map_len
134 | """
135 | # print("XYZ_cms: ",XYZ_cms.shape) #128*128*3
136 | # XYZ_cms = XYZ_cms.reshape([-1, sh[-3], sh[-2], sh[-1]])
137 | # print("XYZ_cms: ",XYZ_cms)
138 | # print("XYZ_cms: ",XYZ_cms.shape) #1*128*128*3
139 | # print("z_bins: ",z_bins)
140 | # se = list(set(semantic.ravel()))
141 | # print("bin_semantic_points: ", se) # []
142 | # print("XYZ_cm:", XYZ_cm.shape) # 128*128*3
143 | isnotnan = np.logical_not(np.isnan(XYZ_cms[:, :, 0]))
144 | X_bin = np.round(XYZ_cms[:, :, 0] / xy_resolution).astype(np.int32)
145 | Y_bin = np.round(XYZ_cms[:, :, 1] / xy_resolution).astype(np.int32)
146 | Z_bin = semantic.astype(np.int32)
147 |
148 | # print("X_bin S count: ", Counter(X_bin.ravel()))
149 | # print("Y_bin S count: ", Counter(Y_bin.ravel()))
150 | # print("Z_bin S count: ", Counter(Z_bin.ravel()))
151 | # print("Z_bin: {}".format(X_bin))
152 | # print("X_bin: {}".format(X_bin.shape)) #128*128
153 | # print("Z_bin: {}".format(Z_bin.shape)) #128*128
154 |
155 | isvalid = np.array([X_bin >= 0, X_bin < map_size, Y_bin >= 0, Y_bin < map_size,
156 | Z_bin > 0, Z_bin <= semantic_map_len, isnotnan])
157 | isvalid = np.all(isvalid, axis=0)
158 |
159 | # print("isvalid: ", Counter(isvalid.ravel()))
160 | # print("isvalid: ", isvalid.shape) #128*128
161 |
162 | ind = (Y_bin * map_size + X_bin) * semantic_map_len + Z_bin-1
163 | ind[np.logical_not(isvalid)] = 0
164 | # print("ind: ", Counter(ind.ravel()))
165 | # print("ind: ", ind.shape) # 128*128
166 | count = np.bincount(ind.ravel(), isvalid.ravel().astype(np.int32),
167 | minlength=map_size * map_size * semantic_map_len)
168 | # print("counts: ", count.shape) # 4838400
169 |
170 | counts = np.reshape(count, [map_size, map_size, semantic_map_len])
171 | # print("counts: ", counts.shape) # 480*480*semantic_map_len
172 |
173 | return counts
174 |
175 |
176 | def get_point_cloud_from_z_t(Y_t, camera_matrix, device, scale=1):
177 | """Projects the depth image Y into a 3D point cloud.
178 | Inputs:
179 | Y is ...xHxW
180 | camera_matrix
181 | Outputs:
182 | X is positive going right
183 | Y is positive into the image
184 | Z is positive up in the image
185 | XYZ is ...xHxWx3
186 | """
187 | grid_x, grid_z = torch.meshgrid(torch.arange(Y_t.shape[-1]),
188 | torch.arange(Y_t.shape[-2] - 1, -1, -1))
189 | grid_x = grid_x.transpose(1, 0).to(device)
190 | grid_z = grid_z.transpose(1, 0).to(device)
191 | grid_x = grid_x.unsqueeze(0).expand(Y_t.size())
192 | grid_z = grid_z.unsqueeze(0).expand(Y_t.size())
193 |
194 | X_t = (grid_x[:, ::scale, ::scale] - camera_matrix.xc) * \
195 | Y_t[:, ::scale, ::scale] / camera_matrix.f
196 | Z_t = (grid_z[:, ::scale, ::scale] - camera_matrix.zc) * \
197 | Y_t[:, ::scale, ::scale] / camera_matrix.f
198 |
199 | XYZ = torch.stack(
200 | (X_t, Y_t[:, ::scale, ::scale], Z_t), dim=len(Y_t.size()))
201 |
202 | return XYZ
203 |
204 |
205 | def transform_camera_view_t(
206 | XYZ, sensor_height, camera_elevation_degree, device):
207 | """
208 | Transforms the point cloud into geocentric frame to account for
209 | camera elevation and angle
210 | Input:
211 | XYZ : ...x3
212 | sensor_height : height of the sensor
213 | camera_elevation_degree : camera elevation to rectify.
214 | Output:
215 | XYZ : ...x3
216 | """
217 | for i in range(XYZ.shape[0]):
218 | R = ru.get_r_matrix(
219 | [1., 0., 0.], angle=np.deg2rad(camera_elevation_degree[i]))
220 | XYZ[i] = torch.matmul(XYZ[i],torch.from_numpy(R).float().transpose(1, 0).to(device)
221 | )
222 | XYZ[..., 2] = XYZ[..., 2] + sensor_height
223 | return XYZ
224 |
225 |
226 | def transform_pose_t(XYZ, current_pose, device):
227 | """
228 | Transforms the point cloud into geocentric frame to account for
229 | camera position
230 | Input:
231 | XYZ : ...x3
232 | current_pose : camera position (x, y, theta (radians))
233 | Output:
234 | XYZ : ...x3
235 | """
236 | R = ru.get_r_matrix([0., 0., 1.], angle=current_pose[2] - np.pi / 2.)
237 | XYZ = torch.matmul(XYZ.reshape(-1, 3),
238 | torch.from_numpy(R).float().transpose(1, 0).to(device)
239 | ).reshape(XYZ.shape)
240 | XYZ[..., 0] += current_pose[0]
241 | XYZ[..., 1] += current_pose[1]
242 | return XYZ
243 |
244 |
245 | def splat_feat_nd(init_grid, feat, coords):
246 | """
247 | Args:
248 | init_grid: B X nF X W X H X D X ..
249 | feat: B X nF X nPt
250 | coords: B X nDims X nPt in [-1, 1]
251 | Returns:
252 | grid: B X nF X W X H X D X ..
253 | """
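   |     # Descriptive note: each point's feature vector is distributed over the
   |     # 2**n_dims grid cells surrounding its fractional coordinate, weighted by the
   |     # usual multilinear interpolation weights, and accumulated with scatter_add_;
   |     # coords are expected in [-1, 1] per dimension, as stated above.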
254 | wts_dim = []
255 | pos_dim = []
256 | grid_dims = init_grid.shape[2:]
257 |
258 | B = init_grid.shape[0]
259 | F = init_grid.shape[1]
260 |
261 | n_dims = len(grid_dims)
262 |
263 | grid_flat = init_grid.view(B, F, -1)
264 |
265 | for d in range(n_dims):
266 | pos = coords[:, [d], :] * grid_dims[d] / 2 + grid_dims[d] / 2
267 | pos_d = []
268 | wts_d = []
269 |
270 | for ix in [0, 1]:
271 | pos_ix = torch.floor(pos) + ix
272 | safe_ix = (pos_ix > 0) & (pos_ix < grid_dims[d])
273 | safe_ix = safe_ix.type(pos.dtype)
274 |
275 | wts_ix = 1 - torch.abs(pos - pos_ix)
276 |
277 | wts_ix = wts_ix * safe_ix
278 | pos_ix = pos_ix * safe_ix
279 |
280 | pos_d.append(pos_ix)
281 | wts_d.append(wts_ix)
282 |
283 | pos_dim.append(pos_d)
284 | wts_dim.append(wts_d)
285 |
286 | l_ix = [[0, 1] for d in range(n_dims)]
287 | # pdb.set_trace()
288 | for ix_d in itertools.product(*l_ix):
289 | wts = torch.ones_like(wts_dim[0][0])
290 | index = torch.zeros_like(wts_dim[0][0])
291 | for d in range(n_dims):
292 | index = index * grid_dims[d] + pos_dim[d][ix_d[d]]
293 | wts = wts * wts_dim[d][ix_d[d]]
294 |
295 | index = index.long()
296 | grid_flat.scatter_add_(2, index.expand(-1, F, -1), feat * wts)
297 | grid_flat = torch.round(grid_flat)
298 |
299 | return grid_flat.view(init_grid.shape)
300 |
301 | def splat_att_feat_nd(init_grid, feat, coords):
302 | """
303 | Args:
304 | init_grid: B X nF X W X H X D X ..
305 | feat: B X nF X nPt
306 | coords: B X nDims X nPt in [-1, 1]
307 | Returns:
308 | grid: B X nF X W X H X D X ..
309 | """
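   |     # Descriptive note: identical to splat_feat_nd for all channels except the
   |     # last, which is treated as an attribute/score plane: instead of being
   |     # scatter-added it is fused with an elementwise max (gather the current value,
   |     # take the max with the new weighted feature, scatter_ it back), so repeated
   |     # observations of a cell keep the highest score rather than a running sum.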
310 | wts_dim = []
311 | pos_dim = []
312 | grid_dims = init_grid.shape[2:]
313 |
314 | B = init_grid.shape[0]
315 | F = init_grid.shape[1]
316 |
317 | n_dims = len(grid_dims)
318 |
319 | grid_flat = init_grid[:,:-1,...].view(B, F-1, -1)
320 | grid_flat_att = init_grid[:,-1:,...].view(B,1, -1)
321 |
322 | for d in range(n_dims):
323 | pos = coords[:, [d], :] * grid_dims[d] / 2 + grid_dims[d] / 2
324 | pos_d = []
325 | wts_d = []
326 |
327 | for ix in [0, 1]:
328 | pos_ix = torch.floor(pos) + ix
329 | safe_ix = (pos_ix > 0) & (pos_ix < grid_dims[d])
330 | safe_ix = safe_ix.type(pos.dtype)
331 |
332 | wts_ix = 1 - torch.abs(pos - pos_ix)
333 |
334 | wts_ix = wts_ix * safe_ix
335 | pos_ix = pos_ix * safe_ix
336 |
337 | pos_d.append(pos_ix)
338 | wts_d.append(wts_ix)
339 |
340 | pos_dim.append(pos_d)
341 | wts_dim.append(wts_d)
342 |
343 | l_ix = [[0, 1] for d in range(n_dims)]
344 | for ix_d in itertools.product(*l_ix):
345 | wts = torch.ones_like(wts_dim[0][0])
346 | index = torch.zeros_like(wts_dim[0][0])
347 | for d in range(n_dims):
348 | index = index * grid_dims[d] + pos_dim[d][ix_d[d]]
349 | wts = wts * wts_dim[d][ix_d[d]]
350 |
351 | index = index.long()
352 | grid_flat.scatter_add_(2, index.expand(-1, F-1, -1), feat[:,:-1,:] * wts)
353 | grid_flat = torch.round(grid_flat)
354 |
355 | current_values = grid_flat_att.gather(2, index)
356 | new_val = torch.max(current_values, (feat[:,-1:,:] * wts))
357 | grid_flat_att.scatter_(2, index, new_val)
358 |
359 | final_grid_flat=torch.cat([grid_flat,grid_flat_att],dim=1)
360 |
361 | assert final_grid_flat.shape==init_grid.view(B, F, -1).shape
362 | return final_grid_flat.view(init_grid.shape)
363 |
--------------------------------------------------------------------------------
/envs/utils/depth_utils_multi.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 The TensorFlow Authors All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Utilities for processing depth images.
17 | """
18 | import pdb
19 | from argparse import Namespace
20 |
21 | import itertools
22 | import numpy as np
23 | import torch
24 |
25 | import envs.utils.rotation_utils as ru
26 |
27 |
28 | def get_camera_matrix(width, height, fov):
29 | """Returns a camera matrix from image size and fov."""
30 | xc = (width - 1.) / 2.
31 | zc = (height - 1.) / 2.
32 | f = (width / 2.) / np.tan(np.deg2rad(fov / 2.))
33 | camera_matrix = {'xc': xc, 'zc': zc, 'f': f}
34 | camera_matrix = Namespace(**camera_matrix)
35 | return camera_matrix
36 |
37 |
38 | def get_point_cloud_from_z(Y, camera_matrix, scale=1):
39 | """Projects the depth image Y into a 3D point cloud.
40 | Inputs:
41 | Y is ...xHxW
42 | camera_matrix
43 | Outputs:
44 | X is positive going right
45 | Y is positive into the image
46 | Z is positive up in the image
47 | XYZ is ...xHxWx3
48 | """
49 | x, z = np.meshgrid(np.arange(Y.shape[-1]),
50 | np.arange(Y.shape[-2] - 1, -1, -1))
51 | for _ in range(Y.ndim - 2):
52 | x = np.expand_dims(x, axis=0)
53 | z = np.expand_dims(z, axis=0)
54 | X = (x[::scale, ::scale] - camera_matrix.xc) * \
55 | Y[::scale, ::scale] / camera_matrix.f
56 | Z = (z[::scale, ::scale] - camera_matrix.zc) * \
57 | Y[::scale, ::scale] / camera_matrix.f
58 | XYZ = np.concatenate((X[..., np.newaxis],
59 | Y[::scale, ::scale][..., np.newaxis],
60 | Z[..., np.newaxis]), axis=X.ndim)
61 | return XYZ
62 |
63 |
64 | def transform_camera_view(XYZ, sensor_height, camera_elevation_degree):
65 | """
66 | Transforms the point cloud into geocentric frame to account for
67 | camera elevation and angle
68 | Input:
69 | XYZ : ...x3
70 | sensor_height : height of the sensor
71 | camera_elevation_degree : camera elevation to rectify.
72 | Output:
73 | XYZ : ...x3
74 | """
75 | R = ru.get_r_matrix(
76 | [1., 0., 0.], angle=np.deg2rad(camera_elevation_degree))
77 | XYZ = np.matmul(XYZ.reshape(-1, 3), R.T).reshape(XYZ.shape)
78 | XYZ[..., 2] = XYZ[..., 2] + sensor_height
79 | return XYZ
80 |
81 |
82 | def transform_pose(XYZ, current_pose):
83 | """
84 | Transforms the point cloud into geocentric frame to account for
85 | camera position
86 | Input:
87 | XYZ : ...x3
88 | current_pose : camera position (x, y, theta (radians))
89 | Output:
90 | XYZ : ...x3
91 | """
92 | R = ru.get_r_matrix([0., 0., 1.], angle=current_pose[2] - np.pi / 2.)
93 | XYZ = np.matmul(XYZ.reshape(-1, 3), R.T).reshape(XYZ.shape)
94 | XYZ[:, :, 0] = XYZ[:, :, 0] + current_pose[0]
95 | XYZ[:, :, 1] = XYZ[:, :, 1] + current_pose[1]
96 | return XYZ
97 |
98 |
99 | def bin_points(XYZ_cms, map_size, z_bins, xy_resolution):
100 | """Bins points into xy-z bins
101 |     XYZ_cms is ... x H x W x 3
102 |     Output is ... x map_size x map_size x (len(z_bins)+1)
103 | """
104 | sh = XYZ_cms.shape
105 | XYZ_cms = XYZ_cms.reshape([-1, sh[-3], sh[-2], sh[-1]])
106 | n_z_bins = len(z_bins) + 1
107 | counts = []
108 | for XYZ_cm in XYZ_cms:
109 | isnotnan = np.logical_not(np.isnan(XYZ_cm[:, :, 0]))
110 | X_bin = np.round(XYZ_cm[:, :, 0] / xy_resolution).astype(np.int32)
111 | Y_bin = np.round(XYZ_cm[:, :, 1] / xy_resolution).astype(np.int32)
112 | Z_bin = np.digitize(XYZ_cm[:, :, 2], bins=z_bins).astype(np.int32)
113 |
114 | isvalid = np.array([X_bin >= 0, X_bin < map_size, Y_bin >= 0,
115 | Y_bin < map_size,
116 | Z_bin >= 0, Z_bin < n_z_bins, isnotnan])
117 | isvalid = np.all(isvalid, axis=0)
118 |
119 | ind = (Y_bin * map_size + X_bin) * n_z_bins + Z_bin
120 | ind[np.logical_not(isvalid)] = 0
121 | count = np.bincount(ind.ravel(), isvalid.ravel().astype(np.int32),
122 | minlength=map_size * map_size * n_z_bins)
123 | counts = np.reshape(count, [map_size, map_size, n_z_bins])
124 |
125 | counts = counts.reshape(list(sh[:-3]) + [map_size, map_size, n_z_bins])
126 |
127 | return counts
128 |
129 | def bin_semantic_points(XYZ_cms, semantic, map_size, semantic_map_len, xy_resolution):
130 |     """Bins points into xy-semantic-category bins
131 |     XYZ_cms is H x W x 3
132 |     semantic is H x W
133 |     Output is map_size x map_size x semantic_map_len
134 | """
135 | # print("XYZ_cms: ",XYZ_cms.shape) #128*128*3
136 | # XYZ_cms = XYZ_cms.reshape([-1, sh[-3], sh[-2], sh[-1]])
137 | # print("XYZ_cms: ",XYZ_cms)
138 | # print("XYZ_cms: ",XYZ_cms.shape) #1*128*128*3
139 | # print("z_bins: ",z_bins)
140 | # se = list(set(semantic.ravel()))
141 | # print("bin_semantic_points: ", se) # []
142 | # print("XYZ_cm:", XYZ_cm.shape) # 128*128*3
143 | isnotnan = np.logical_not(np.isnan(XYZ_cms[:, :, 0]))
144 | X_bin = np.round(XYZ_cms[:, :, 0] / xy_resolution).astype(np.int32)
145 | Y_bin = np.round(XYZ_cms[:, :, 1] / xy_resolution).astype(np.int32)
146 | Z_bin = semantic.astype(np.int32)
147 |
148 | # print("X_bin S count: ", Counter(X_bin.ravel()))
149 | # print("Y_bin S count: ", Counter(Y_bin.ravel()))
150 | # print("Z_bin S count: ", Counter(Z_bin.ravel()))
151 | # print("Z_bin: {}".format(X_bin))
152 | # print("X_bin: {}".format(X_bin.shape)) #128*128
153 | # print("Z_bin: {}".format(Z_bin.shape)) #128*128
154 |
155 | isvalid = np.array([X_bin >= 0, X_bin < map_size, Y_bin >= 0, Y_bin < map_size,
156 | Z_bin > 0, Z_bin <= semantic_map_len, isnotnan])
157 | isvalid = np.all(isvalid, axis=0)
158 |
159 | # print("isvalid: ", Counter(isvalid.ravel()))
160 | # print("isvalid: ", isvalid.shape) #128*128
161 |
162 | ind = (Y_bin * map_size + X_bin) * semantic_map_len + Z_bin-1
163 | ind[np.logical_not(isvalid)] = 0
164 | # print("ind: ", Counter(ind.ravel()))
165 | # print("ind: ", ind.shape) # 128*128
166 | count = np.bincount(ind.ravel(), isvalid.ravel().astype(np.int32),
167 | minlength=map_size * map_size * semantic_map_len)
168 | # print("counts: ", count.shape) # 4838400
169 |
170 | counts = np.reshape(count, [map_size, map_size, semantic_map_len])
171 | # print("counts: ", counts.shape) # 480*480*semantic_map_len
172 |
173 | return counts
174 |
175 |
176 | def get_point_cloud_from_z_t(Y_t, camera_matrix, device, scale=1):
177 | """Projects the depth image Y into a 3D point cloud.
178 | Inputs:
179 | Y is ...xHxW
180 | camera_matrix
181 | Outputs:
182 | X is positive going right
183 | Y is positive into the image
184 | Z is positive up in the image
185 | XYZ is ...xHxWx3
186 | """
187 | grid_x, grid_z = torch.meshgrid(torch.arange(Y_t.shape[-1]),
188 | torch.arange(Y_t.shape[-2] - 1, -1, -1))
189 | grid_x = grid_x.transpose(1, 0).to(device)
190 | grid_z = grid_z.transpose(1, 0).to(device)
191 | grid_x = grid_x.unsqueeze(0).expand(Y_t.size())
192 | grid_z = grid_z.unsqueeze(0).expand(Y_t.size())
193 |
194 | X_t = (grid_x[:, ::scale, ::scale] - camera_matrix.xc) * \
195 | Y_t[:, ::scale, ::scale] / camera_matrix.f
196 | Z_t = (grid_z[:, ::scale, ::scale] - camera_matrix.zc) * \
197 | Y_t[:, ::scale, ::scale] / camera_matrix.f
198 |
199 | XYZ = torch.stack(
200 | (X_t, Y_t[:, ::scale, ::scale], Z_t), dim=len(Y_t.size()))
201 |
202 | return XYZ
203 |
204 |
205 | def transform_camera_view_t(
206 | XYZ, sensor_height, camera_elevation_degree, device):
207 | """
208 | Transforms the point cloud into geocentric frame to account for
209 | camera elevation and angle
210 | Input:
211 | XYZ : ...x3
212 | sensor_height : height of the sensor
213 | camera_elevation_degree : camera elevation to rectify.
214 | Output:
215 | XYZ : ...x3
216 | """
217 | for i in range(XYZ.shape[0]):
218 | R = ru.get_r_matrix(
219 | [1., 0., 0.], angle=np.deg2rad(camera_elevation_degree[i]))
220 | XYZ[i] = torch.matmul(XYZ[i],torch.from_numpy(R).float().transpose(1, 0).to(device)
221 | )
222 | XYZ[..., 2] = XYZ[..., 2] + sensor_height
223 | return XYZ
224 |
225 |
226 | def transform_pose_t(XYZ, current_pose, device):
227 | """
228 | Transforms the point cloud into geocentric frame to account for
229 | camera position
230 | Input:
231 | XYZ : ...x3
232 | current_pose : camera position (x, y, theta (radians))
233 | Output:
234 | XYZ : ...x3
235 | """
236 | R = ru.get_r_matrix([0., 0., 1.], angle=current_pose[2] - np.pi / 2.)
237 | XYZ = torch.matmul(XYZ.reshape(-1, 3),
238 | torch.from_numpy(R).float().transpose(1, 0).to(device)
239 | ).reshape(XYZ.shape)
240 | XYZ[..., 0] += current_pose[0]
241 | XYZ[..., 1] += current_pose[1]
242 | return XYZ
243 |
244 |
245 | def splat_feat_nd(init_grid, feat, coords):
246 | """
247 | Args:
248 | init_grid: B X nF X W X H X D X ..
249 | feat: B X nF X nPt
250 | coords: B X nDims X nPt in [-1, 1]
251 | Returns:
252 | grid: B X nF X W X H X D X ..
253 | """
254 | wts_dim = []
255 | pos_dim = []
256 | grid_dims = init_grid.shape[2:]
257 |
258 | B = init_grid.shape[0]
259 | F = init_grid.shape[1]
260 |
261 | n_dims = len(grid_dims)
262 |
263 | grid_flat = init_grid.view(B, F, -1)
264 |
265 | for d in range(n_dims):
266 | pos = coords[:, [d], :] * grid_dims[d] / 2 + grid_dims[d] / 2
267 | pos_d = []
268 | wts_d = []
269 |
270 | for ix in [0, 1]:
271 | pos_ix = torch.floor(pos) + ix
272 | safe_ix = (pos_ix > 0) & (pos_ix < grid_dims[d])
273 | safe_ix = safe_ix.type(pos.dtype)
274 |
275 | wts_ix = 1 - torch.abs(pos - pos_ix)
276 |
277 | wts_ix = wts_ix * safe_ix
278 | pos_ix = pos_ix * safe_ix
279 |
280 | pos_d.append(pos_ix)
281 | wts_d.append(wts_ix)
282 |
283 | pos_dim.append(pos_d)
284 | wts_dim.append(wts_d)
285 |
286 | l_ix = [[0, 1] for d in range(n_dims)]
287 | # pdb.set_trace()
288 | for ix_d in itertools.product(*l_ix):
289 | wts = torch.ones_like(wts_dim[0][0])
290 | index = torch.zeros_like(wts_dim[0][0])
291 | for d in range(n_dims):
292 | index = index * grid_dims[d] + pos_dim[d][ix_d[d]]
293 | wts = wts * wts_dim[d][ix_d[d]]
294 |
295 | index = index.long()
296 | grid_flat.scatter_add_(2, index.expand(-1, F, -1), feat * wts)
297 | grid_flat = torch.round(grid_flat)
298 |
299 | return grid_flat.view(init_grid.shape)
300 |
301 | def splat_att_feat_nd(init_grid, feat, coords,num_atts):
302 | """
303 | Args:
304 | init_grid: B X nF X W X H X D X ..
305 | feat: B X nF X nPt
306 | coords: B X nDims X nPt in [-1, 1]
307 | Returns:
308 | grid: B X nF X W X H X D X ..
309 | """
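   |     # Descriptive note: same splatting as splat_feat_nd, except that the trailing
   |     # `num_atts` channels of init_grid / feat are treated as attribute score
   |     # planes and fused by elementwise max instead of scatter-add, so each cell
   |     # keeps the highest attribute score observed so far.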
310 | wts_dim = []
311 | pos_dim = []
312 | grid_dims = init_grid.shape[2:]
313 |
314 | B = init_grid.shape[0]
315 | F = init_grid.shape[1]
316 |
317 | n_dims = len(grid_dims)
318 | # pdb.set_trace()
319 | grid_flat = init_grid[:,:-num_atts,...].view(B, F-num_atts, -1)
320 | grid_flat_att = init_grid[:,-num_atts:,...].view(B,num_atts, -1)
321 |
322 | for d in range(n_dims):
323 | pos = coords[:, [d], :] * grid_dims[d] / 2 + grid_dims[d] / 2
324 | pos_d = []
325 | wts_d = []
326 |
327 | for ix in [0, 1]:
328 | pos_ix = torch.floor(pos) + ix
329 | safe_ix = (pos_ix > 0) & (pos_ix < grid_dims[d])
330 | safe_ix = safe_ix.type(pos.dtype)
331 |
332 | wts_ix = 1 - torch.abs(pos - pos_ix)
333 |
334 | wts_ix = wts_ix * safe_ix
335 | pos_ix = pos_ix * safe_ix
336 |
337 | pos_d.append(pos_ix)
338 | wts_d.append(wts_ix)
339 |
340 | pos_dim.append(pos_d)
341 | wts_dim.append(wts_d)
342 |
343 | l_ix = [[0, 1] for d in range(n_dims)]
344 | for ix_d in itertools.product(*l_ix):
345 | wts = torch.ones_like(wts_dim[0][0])
346 | index = torch.zeros_like(wts_dim[0][0])
347 | for d in range(n_dims):
348 | index = index * grid_dims[d] + pos_dim[d][ix_d[d]]
349 | wts = wts * wts_dim[d][ix_d[d]]
350 |
351 | index = index.long()
352 | grid_flat.scatter_add_(2, index.expand(-1, F-num_atts, -1), feat[:,:-num_atts,:] * wts)
353 | grid_flat = torch.round(grid_flat)
354 |
355 | current_values = grid_flat_att.gather(2, index.expand(-1, num_atts, -1))
356 | new_val = torch.max(current_values, (feat[:,-num_atts:,:] * wts))
357 | grid_flat_att.scatter_(2, index.expand(-1, num_atts, -1), new_val)
358 |
359 | final_grid_flat=torch.cat([grid_flat,grid_flat_att],dim=1)
360 |
361 | assert final_grid_flat.shape==init_grid.view(B, F, -1).shape
362 | return final_grid_flat.view(init_grid.shape)
363 |
--------------------------------------------------------------------------------
/arguments.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import torch
3 |
4 |
5 | def get_args():
6 | parser = argparse.ArgumentParser(
7 | description='Goal-Oriented-Semantic-Exploration')
8 |
9 | # General Arguments
10 | parser.add_argument('--seed', type=int, default=1,
11 | help='random seed (default: 1)')
12 | parser.add_argument('--auto_gpu_config', type=int, default=1)
13 | parser.add_argument('--total_num_scenes', type=str, default="auto")
14 | parser.add_argument('-n', '--num_processes', type=int, default=5,
15 | help="""how many training processes to use (default:5)
16 | Overridden when auto_gpu_config=1
17 | and training on gpus""")
18 | parser.add_argument('--num_processes_per_gpu', type=int, default=6)
19 | parser.add_argument('--num_processes_on_first_gpu', type=int, default=1)
20 | parser.add_argument('--eval', type=int, default=0,
21 | help='0: Train, 1: Evaluate (default: 0)')
22 | parser.add_argument('--num_training_frames', type=int, default=10000000,
23 | help='total number of training frames')
24 | parser.add_argument('--num_eval_episodes', type=int, default=400,
25 | help="number of test episodes per scene")
26 | parser.add_argument('--num_train_episodes', type=int, default=10000,
27 | help="""number of train episodes per scene
28 | before loading the next scene""")
29 | parser.add_argument('--no_cuda', action='store_true', default=False,
30 | help='disables CUDA training')
31 | parser.add_argument("--sim_gpu_id", type=int, default=0,
32 | help="gpu id on which scenes are loaded")
33 | parser.add_argument("--sem_gpu_id", type=int, default=-1,
34 | help="""gpu id for semantic model,
35 | -1: same as sim gpu, -2: cpu""")
36 | parser.add_argument("--blip_gpu_id", type=int, default=1,
37 | help="""gpu id for BLIP model,
38 | -1: same as sim gpu, -2: cpu""")
39 |
40 | # Logging, loading models, visualization
41 | parser.add_argument('--log_interval', type=int, default=10,
42 | help="""log interval, one log per n updates
43 | (default: 10) """)
44 | parser.add_argument('--save_interval', type=int, default=1,
45 | help="""save interval""")
46 | parser.add_argument('-d', '--dump_location', type=str, default="./tmp/",
47 | help='path to dump models and log (default: ./tmp/)')
48 | parser.add_argument('--exp_name', type=str, default="exp1",
49 | help='experiment name (default: exp1)')
50 | parser.add_argument('--save_periodic', type=int, default=500000,
51 | help='Model save frequency in number of updates')
52 | parser.add_argument('--load', type=str, default="0",
53 | help="""model path to load,
54 | 0 to not reload (default: 0)""")
55 | parser.add_argument('-v', '--visualize', type=int, default=0,
56 | help="""1: Render the observation and
57 | the predicted semantic map,
58 | 2: Render the observation with semantic
59 | predictions and the predicted semantic map
60 | (default: 0)""")
61 | parser.add_argument('--print_images', type=int, default=0,
62 | help='1: save visualization as images')
63 |
64 | # Environment, dataset and episode specifications
65 | parser.add_argument('-efw', '--env_frame_width', type=int, default=640,
66 | help='Frame width (default:640)')
67 | parser.add_argument('-efh', '--env_frame_height', type=int, default=480,
68 | help='Frame height (default:480)')
69 | parser.add_argument('-fw', '--frame_width', type=int, default=160,
70 | help='Frame width (default:160)')
71 | parser.add_argument('-fh', '--frame_height', type=int, default=120,
72 | help='Frame height (default:120)')
73 | parser.add_argument('-el', '--max_episode_length', type=int, default=532,
74 | help="""Maximum episode length""")
75 | parser.add_argument("--task_config", type=str,
76 | default="tasks/objectnav_hm3d.yaml",
77 | help="path to config yaml containing task information")
78 | parser.add_argument("--split", type=str, default="train",
79 | help="dataset split (train | val | val_mini) ")
80 | parser.add_argument('--camera_height', type=float, default=0.88,
81 | help="agent camera height in metres")
82 | parser.add_argument('--hfov', type=float, default=79.0,
83 | help="horizontal field of view in degrees")
84 | parser.add_argument('--turn_angle', type=float, default=30,
85 | help="Agent turn angle in degrees")
86 | parser.add_argument('--min_depth', type=float, default=0.5,
87 | help="Minimum depth for depth sensor in meters")
88 | parser.add_argument('--max_depth', type=float, default=5.0,
89 | help="Maximum depth for depth sensor in meters")
90 | parser.add_argument('--success_dist', type=float, default=1.5,
91 | help="success distance threshold in meters")
92 | parser.add_argument('--floor_thr', type=int, default=50,
93 | help="floor threshold in cm")
94 | parser.add_argument('--min_d', type=float, default=1.5,
95 | help="min distance to goal during training in meters")
96 | parser.add_argument('--max_d', type=float, default=100.0,
97 | help="max distance to goal during training in meters")
98 | parser.add_argument('--version', type=str, default="v1.1",
99 | help="dataset version")
100 |
101 | # Model Hyperparameters
102 | parser.add_argument('--agent', type=str, default="sem_exp")
103 | parser.add_argument('--lr', type=float, default=2.5e-5,
104 | help='learning rate (default: 2.5e-5)')
105 | parser.add_argument('--global_hidden_size', type=int, default=256,
106 | help='global_hidden_size')
107 | parser.add_argument('--eps', type=float, default=1e-5,
108 | help='RL Optimizer epsilon (default: 1e-5)')
109 | parser.add_argument('--alpha', type=float, default=0.99,
110 | help='RL Optimizer alpha (default: 0.99)')
111 | parser.add_argument('--gamma', type=float, default=0.99,
112 | help='discount factor for rewards (default: 0.99)')
113 | parser.add_argument('--use_gae', action='store_true', default=False,
114 | help='use generalized advantage estimation')
115 | parser.add_argument('--tau', type=float, default=0.95,
116 | help='gae parameter (default: 0.95)')
117 | parser.add_argument('--entropy_coef', type=float, default=0.001,
118 |                         help='entropy term coefficient (default: 0.001)')
119 | parser.add_argument('--value_loss_coef', type=float, default=0.5,
120 | help='value loss coefficient (default: 0.5)')
121 | parser.add_argument('--max_grad_norm', type=float, default=0.5,
122 | help='max norm of gradients (default: 0.5)')
123 | parser.add_argument('--num_global_steps', type=int, default=20,
124 |                         help='number of forward steps in A2C (default: 20)')
125 | parser.add_argument('--ppo_epoch', type=int, default=4,
126 | help='number of ppo epochs (default: 4)')
127 | parser.add_argument('--num_mini_batch', type=str, default="auto",
128 |                         help='number of batches for ppo (default: auto)')
129 | parser.add_argument('--clip_param', type=float, default=0.2,
130 | help='ppo clip parameter (default: 0.2)')
131 | parser.add_argument('--use_recurrent_global', type=int, default=0,
132 | help='use a recurrent global policy')
133 | parser.add_argument('--num_local_steps', type=int, default=25,
134 |                         help="""Number of local policy steps
135 |                         between each global step""")
136 | parser.add_argument('--reward_coeff', type=float, default=0.1,
137 | help="Object goal reward coefficient")
138 | parser.add_argument('--intrinsic_rew_coeff', type=float, default=0.05,
139 | help="intrinsic exploration reward coefficient")
140 |     parser.add_argument('--num_sem_categories', type=int, default=16)
141 | parser.add_argument('--sem_pred_prob_thr', type=float, default=0.9,
142 | help="Semantic prediction confidence threshold")
143 |
144 | # Mapping
145 | parser.add_argument('--global_downscaling', type=int, default=2)
146 | parser.add_argument('--vision_range', type=int, default=100)
147 | parser.add_argument('--map_resolution', type=int, default=5)
148 | parser.add_argument('--du_scale', type=int, default=1)
149 | parser.add_argument('--map_size_cm', type=int, default=4800)
150 | parser.add_argument('--cat_pred_threshold', type=float, default=5.0)
151 | parser.add_argument('--map_pred_threshold', type=float, default=1.0)
152 | parser.add_argument('--exp_pred_threshold', type=float, default=1.0)
153 | parser.add_argument('--collision_threshold', type=float, default=0.10)
154 | parser.add_argument('--collision_queue_size', type=int, default=3)
155 | parser.add_argument('--wtec', type=float, default=0.5)
156 |     parser.add_argument('--num_attrs', type=int, default=5)
157 |
158 | parser.add_argument('--use_gtsem', type=int, default=0)
159 |
160 | # train_se_frontier
161 | parser.add_argument('--train_se_f', type=int, default=0)
162 | parser.add_argument('--load_se_edge', type=str, default="0",
163 | help="""model path to load,
164 | 0 to not reload (default: 0)""")
165 |
166 |
167 | # parse arguments
168 | args = parser.parse_args()
169 |
170 | args.cuda = not args.no_cuda and torch.cuda.is_available()
171 |
172 | if args.cuda:
173 | if args.auto_gpu_config:
174 | num_gpus = torch.cuda.device_count()
175 | if args.total_num_scenes != "auto":
176 | args.total_num_scenes = int(args.total_num_scenes)
177 | elif "objectnav_gibson" in args.task_config and \
178 | "train" in args.split:
179 | args.total_num_scenes = 25
180 | elif "objectnav_gibson" in args.task_config and \
181 | "val" in args.split:
182 | args.total_num_scenes = 5
183 | else:
184 | assert False, "Unknown task config, please specify" + \
185 | " total_num_scenes"
186 |
187 | # GPU Memory required for the SemExp model:
188 | # 0.8 + 0.4 * args.total_num_scenes (GB)
189 | # GPU Memory required per thread: 2.6 (GB)
190 | min_memory_required = max(0.8 + 0.4 * args.total_num_scenes, 2.6)
191 | # Automatically configure number of training threads based on
192 | # number of GPUs available and GPU memory size
193 | gpu_memory = 1000
194 | for i in range(num_gpus):
195 | gpu_memory = min(gpu_memory,
196 | torch.cuda.get_device_properties(
197 | i).total_memory
198 | / 1024 / 1024 / 1024)
199 | assert gpu_memory > min_memory_required, \
200 | """Insufficient GPU memory for GPU {}, gpu memory ({}GB)
201 | needs to be greater than {}GB""".format(
202 | i, gpu_memory, min_memory_required)
203 |
204 | num_processes_per_gpu = int(gpu_memory / 2.6)
205 | num_processes_on_first_gpu = \
206 | int((gpu_memory - min_memory_required) / 2.6)
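   |             # A rough worked example (numbers are mine, not from the source): with
   |             # total_num_scenes=25 and an 11.9 GB GPU, min_memory_required =
   |             # max(0.8 + 0.4 * 25, 2.6) = 10.8 GB, num_processes_per_gpu =
   |             # int(11.9 / 2.6) = 4 and num_processes_on_first_gpu =
   |             # int((11.9 - 10.8) / 2.6) = 0, i.e. GPU 0 is left to the model.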
207 |
208 | if args.eval:
209 | max_threads = num_processes_per_gpu * (num_gpus - 1) \
210 | + num_processes_on_first_gpu
211 | assert max_threads >= args.total_num_scenes, \
212 | """Insufficient GPU memory for evaluation"""
213 |
214 | if num_gpus == 1:
215 | args.num_processes_on_first_gpu = num_processes_on_first_gpu
216 | args.num_processes_per_gpu = 0
217 | args.num_processes = num_processes_on_first_gpu
218 | assert args.num_processes > 0, "Insufficient GPU memory"
219 | else:
220 | num_threads = num_processes_per_gpu * (num_gpus - 1) \
221 | + num_processes_on_first_gpu
222 | num_threads = min(num_threads, args.total_num_scenes)
223 | args.num_processes_per_gpu = num_processes_per_gpu
224 | args.num_processes_on_first_gpu = max(
225 | 0,
226 | num_threads - args.num_processes_per_gpu * (num_gpus - 1))
227 | args.num_processes = num_threads
228 |
229 | args.sim_gpu_id = 1
230 |
231 | print("Auto GPU config:")
232 | print("Number of processes: {}".format(args.num_processes))
233 | print("Number of processes on GPU 0: {}".format(
234 | args.num_processes_on_first_gpu))
235 | print("Number of processes per GPU: {}".format(
236 | args.num_processes_per_gpu))
237 | else:
238 | args.sem_gpu_id = -2
239 |
240 | if args.num_mini_batch == "auto":
241 | args.num_mini_batch = max(args.num_processes // 2, 1)
242 | else:
243 | args.num_mini_batch = int(args.num_mini_batch)
244 |
245 | return args
246 |
--------------------------------------------------------------------------------
/model_multi.py:
--------------------------------------------------------------------------------
1 | import pdb
2 |
3 | import torch
4 | import torch.nn as nn
5 | from torch.nn import functional as F
6 | import numpy as np
7 |
8 | # from utils.distributions import Categorical, DiagGaussian
9 | from torch.distributions.categorical import Categorical
10 | from utils.model import get_grid, ChannelPool, Flatten, NNBase
11 | import envs.utils.depth_utils_multi as du
12 |
13 | class Semantic_Mapping(nn.Module):
14 |
15 | """
16 |     Semantic_Mapping: projects depth and semantic/attribute observations into an egocentric top-down map and fuses it with the previous map at the agent's estimated pose.
17 | """
18 |
19 | def __init__(self, args):
20 | super(Semantic_Mapping, self).__init__()
21 |
22 | self.device = args.device
23 | self.screen_h = args.frame_height
24 | self.screen_w = args.frame_width
25 | self.resolution = args.map_resolution
26 | self.z_resolution = args.map_resolution
27 | self.map_size_cm = args.map_size_cm // args.global_downscaling
28 | self.n_channels = 3
29 | self.vision_range = args.vision_range
30 | self.dropout = 0.5
31 | self.fov = args.hfov
32 | self.du_scale = args.du_scale
33 | self.cat_pred_threshold = args.cat_pred_threshold
34 | self.exp_pred_threshold = args.exp_pred_threshold
35 | self.map_pred_threshold = args.map_pred_threshold
36 | self.num_sem_categories = args.num_sem_categories
37 | self.num_atts = args.num_attrs
38 |
39 | self.max_height = int(200 / self.z_resolution)
40 | self.min_height = int(-40 / self.z_resolution)
41 | self.agent_height = args.camera_height * 100.
42 | self.shift_loc = [self.vision_range *
43 | self.resolution // 2, 0, np.pi / 2.0]
44 | self.camera_matrix = du.get_camera_matrix(
45 | self.screen_w, self.screen_h, self.fov)
46 |
47 | self.pool = ChannelPool(1)
48 |
49 | vr = self.vision_range
50 | # pdb.set_trace()
51 | self.init_grid = torch.zeros(
52 | args.num_processes, 1 + self.num_sem_categories+self.num_atts, vr, vr,
53 | self.max_height - self.min_height
54 | ).float().to(self.device)
55 | self.feat = torch.ones(
56 | args.num_processes, 1 + self.num_sem_categories+self.num_atts,
57 | self.screen_h // self.du_scale * self.screen_w // self.du_scale
58 | ).float().to(self.device)
59 |
60 |
61 | self.max_pool = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
62 |
63 | self.stair_mask_radius = 30
64 | self.stair_mask = self.get_mask(self.stair_mask_radius).to(self.device)
65 |
66 | def forward(self, obs, pose_obs, maps_last, poses_last, eve_angle,goal_found):
67 | bs, c, h, w = obs.size()
68 | depth = obs[:, 3, :, :]
69 | point_cloud_t = du.get_point_cloud_from_z_t(
70 | depth, self.camera_matrix, self.device, scale=self.du_scale)
71 |
72 | agent_view_t = du.transform_camera_view_t(
73 | point_cloud_t, self.agent_height, eve_angle, self.device)
74 |
75 | agent_view_centered_t = du.transform_pose_t(
76 | agent_view_t, self.shift_loc, self.device)
77 |
78 | max_h = self.max_height
79 | min_h = self.min_height
80 | xy_resolution = self.resolution
81 | z_resolution = self.z_resolution
82 | vision_range = self.vision_range
83 | XYZ_cm_std = agent_view_centered_t.float()
84 | XYZ_cm_std[..., :2] = (XYZ_cm_std[..., :2] / xy_resolution)
85 | XYZ_cm_std[..., :2] = (XYZ_cm_std[..., :2] -
86 | vision_range // 2.) / vision_range * 2.
87 | XYZ_cm_std[..., 2] = XYZ_cm_std[..., 2] / z_resolution
88 | XYZ_cm_std[..., 2] = (XYZ_cm_std[..., 2] -
89 | (max_h + min_h) // 2.) / (max_h - min_h) * 2.
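   |         # Descriptive note: at this point XYZ_cm_std holds per-pixel voxel
   |         # coordinates normalised to [-1, 1] over the vision range (x, y) and over
   |         # the [min_height, max_height] bin range (z), which is the coordinate
   |         # convention expected by du.splat_att_feat_nd below.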
90 | self.feat[:, 1:, :] = nn.AvgPool2d(self.du_scale)(
91 | obs[:, 4:, :, :]
92 | ).view(bs, c - 4, h // self.du_scale * w // self.du_scale)
93 |
94 | XYZ_cm_std = XYZ_cm_std.permute(0, 3, 1, 2)
95 | XYZ_cm_std = XYZ_cm_std.view(XYZ_cm_std.shape[0],
96 | XYZ_cm_std.shape[1],
97 | XYZ_cm_std.shape[2] * XYZ_cm_std.shape[3])
98 |
99 | # voxels = du.splat_feat_nd(
100 | # self.init_grid * 0., self.feat, XYZ_cm_std).transpose(2, 3)
101 | voxels = du.splat_att_feat_nd(
102 | self.init_grid * 0., self.feat, XYZ_cm_std,self.num_atts).transpose(2, 3)
103 |
104 | min_z = int(25 / z_resolution - min_h)
105 | max_z = int((self.agent_height + 50) / z_resolution - min_h)
106 | mid_z = int(self.agent_height / z_resolution - min_h)
107 |
108 | # print("========== IN SEM MAP ==========")
109 | # print(np.unique(voxels[:,[-1],...].max(4)[0].detach().cpu().numpy()))
110 |
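   |         # Descriptive note: the non-attribute channels (obstacles / explored area /
   |         # semantic categories) are projected by summing voxel counts over a height
   |         # slice, while the trailing num_atts attribute channels are projected with a
   |         # max over height so the per-cell attribute score is preserved rather than
   |         # accumulated.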
111 | agent_height_proj = voxels[:,:-self.num_atts,..., min_z:max_z].sum(4)
112 | agent_height_proj_att = voxels[:,-self.num_atts:,..., min_z:max_z].max(4)[0]
113 | agent_height_proj=torch.cat([agent_height_proj,agent_height_proj_att],dim=1)
114 | agent_height_stair_proj = voxels[:,:-self.num_atts,..., mid_z-5:mid_z].sum(4)
115 | agent_height_stair_proj_att = voxels[:,-self.num_atts:,..., mid_z-5:mid_z].max(4)[0]
116 | agent_height_stair_proj=torch.cat([agent_height_stair_proj,agent_height_stair_proj_att],dim=1)
117 | all_height_proj = voxels[:,:-self.num_atts,...].sum(4)
118 | all_height_proj_att = voxels[:,-self.num_atts:,...].max(4)[0]
119 | all_height_proj=torch.cat([all_height_proj,all_height_proj_att],dim=1)
120 |
121 | # pdb.set_trace()
122 | fp_map_pred = agent_height_proj[:, 0:1, :, :]
123 | fp_exp_pred = all_height_proj[:, 0:1, :, :]
124 | fp_stair_pred = agent_height_stair_proj[:, 0:1, :, :]
125 | fp_map_pred = fp_map_pred / self.map_pred_threshold
126 | fp_stair_pred = fp_stair_pred / self.map_pred_threshold
127 | fp_exp_pred = fp_exp_pred / self.exp_pred_threshold
128 | fp_map_pred = torch.clamp(fp_map_pred, min=0.0, max=1.0)
129 | fp_stair_pred = torch.clamp(fp_stair_pred, min=0.0, max=1.0)
130 | fp_exp_pred = torch.clamp(fp_exp_pred, min=0.0, max=1.0)
131 |
132 | pose_pred = poses_last
133 |
134 | agent_view = torch.zeros(bs, c,
135 | self.map_size_cm // self.resolution,
136 | self.map_size_cm // self.resolution
137 | ).to(self.device)
138 |
139 | x1 = self.map_size_cm // (self.resolution * 2) - self.vision_range // 2
140 | x2 = x1 + self.vision_range
141 | y1 = self.map_size_cm // (self.resolution * 2)
142 | y2 = y1 + self.vision_range
143 | agent_view[:, 0:1, y1:y2, x1:x2] = fp_map_pred
144 | agent_view[:, 1:2, y1:y2, x1:x2] = fp_exp_pred
145 | agent_view[:, 4:-self.num_atts, y1:y2, x1:x2] = torch.clamp(
146 | agent_height_proj[:, 1:-self.num_atts, :, :] / self.cat_pred_threshold,
147 | min=0.0, max=1.0)
148 |         agent_view[:, -self.num_atts:, y1:y2, x1:x2] = agent_height_proj[:, -self.num_atts:, :, :]
149 |
150 | # print("SEM MAP: MAX ",agent_height_proj[:, -1, :, :].max(),", MIN: ",agent_height_proj[:, -1, :, :].min())
151 | # print("SEM MAP/HEIGHT: MAX ",agent_view[:, -1, ...].max(),", MIN: ",agent_view[:, -1, ...].min())
152 |
153 | agent_view_stair = agent_view.clone().detach()
154 | agent_view_stair[:, 0:1, y1:y2, x1:x2] = fp_stair_pred
155 |
156 | corrected_pose = pose_obs
157 |
158 | def get_new_pose_batch(pose, rel_pose_change):
159 |
160 | pose[:, 1] += rel_pose_change[:, 0] * \
161 | torch.sin(pose[:, 2] / 57.29577951308232) \
162 | + rel_pose_change[:, 1] * \
163 | torch.cos(pose[:, 2] / 57.29577951308232)
164 | pose[:, 0] += rel_pose_change[:, 0] * \
165 | torch.cos(pose[:, 2] / 57.29577951308232) \
166 | - rel_pose_change[:, 1] * \
167 | torch.sin(pose[:, 2] / 57.29577951308232)
168 |             pose[:, 2] += rel_pose_change[:, 2] * 57.29577951308232  # 180 / pi: radians -> degrees
169 |
170 |             pose[:, 2] = torch.fmod(pose[:, 2] - 180.0, 360.0) + 180.0  # the two fmod passes together
171 |             pose[:, 2] = torch.fmod(pose[:, 2] + 180.0, 360.0) - 180.0  # wrap the heading into [-180, 180)
172 |
173 | return pose
174 |
175 | current_poses = get_new_pose_batch(poses_last, corrected_pose)
176 | st_pose = current_poses.clone().detach()
177 |
178 | st_pose[:, :2] = - (st_pose[:, :2]
179 | * 100.0 / self.resolution
180 | - self.map_size_cm // (self.resolution * 2)) /\
181 | (self.map_size_cm // (self.resolution * 2))
182 | st_pose[:, 2] = 90. - (st_pose[:, 2])
183 |
184 | rot_mat, trans_mat = get_grid(st_pose, agent_view.size(),
185 | self.device)
186 |
187 | rotated = F.grid_sample(agent_view, rot_mat, align_corners=True)
188 | translated = F.grid_sample(rotated, trans_mat, align_corners=True)
189 |
190 | # translated[:, 18:19, :, :] = -self.max_pool(-translated[:, 18:19, :, :])
191 |
192 |         diff_ob_ex = translated[:, 1:2, :, :] - self.max_pool(translated[:, 0:1, :, :])  # explored minus dilated obstacles
193 |
194 |         diff_ob_ex[diff_ob_ex > 0.8] = 1.0   # binarise: explored-but-free cells
195 |         diff_ob_ex[diff_ob_ex != 1.0] = 0.0  # (cleared from the obstacle channel below when the camera is level)
196 |
197 | # pdb.set_trace()
198 | if not goal_found:
199 | # TODO: better way to get the score
200 | # maps2 = torch.cat((maps_last.unsqueeze(1), translated.unsqueeze(1)), 1)
201 |             _score_maps_last = maps_last[:, -self.num_atts:, ...]  # running attribute-score channels
202 |             _score_maps_new = translated[:, -self.num_atts:, ...]  # scores observed this step
203 |             updated_mask = _score_maps_new > 0
204 |             _s_map = torch.cat((_score_maps_last.unsqueeze(1), _score_maps_new.unsqueeze(1)), 1)
205 |             _s_map, _ = torch.max(_s_map, 1)
206 |             map_score_new = torch.where(updated_mask, _s_map / 2, _score_maps_last)  # halve the running max where fresh scores arrived
207 |             maps2 = torch.cat((maps_last[:, :-self.num_atts, ...].unsqueeze(1), translated[:, :-self.num_atts, ...].unsqueeze(1)), 1)
208 |             map_pred, _ = torch.max(maps2, 1)
209 |             map_pred = torch.cat([map_pred, map_score_new], 1)
210 | # print("map_score_new: MAX ",map_score_new.max(),", MIN: ",map_score_new.min())
211 | else:
212 | maps2 = torch.cat((maps_last.unsqueeze(1), translated.unsqueeze(1)), 1)
213 | map_pred, _ = torch.max(maps2, 1)
214 |
215 |
216 | # map_pred=torch.cat((_map_pred, map_score_new), dim=1)
217 |
218 | for i in range(eve_angle.shape[0]):
219 | if eve_angle[i] == 0:
220 | map_pred[i, 0:1, :, :][diff_ob_ex[i] == 1.0] = 0.0
221 |
222 | # stairs view
223 | rot_mat_stair, trans_mat_stair = get_grid(st_pose, agent_view_stair.size(),
224 | self.device)
225 |
226 | rotated_stair = F.grid_sample(agent_view_stair, rot_mat_stair, align_corners=True)
227 | translated_stair = F.grid_sample(rotated_stair, trans_mat_stair, align_corners=True)
228 |
229 | stair_mask = torch.zeros(self.map_size_cm // self.resolution, self.map_size_cm // self.resolution).to(self.device)
230 |
231 |         s_y = int(current_poses[0][1]*100/5)  # agent position in map cells (100 cm/m over an assumed 5 cm cell)
232 |         s_x = int(current_poses[0][0]*100/5)
233 |         limit_up = self.map_size_cm // self.resolution - self.stair_mask_radius - 1  # clamp so the circular mask stays inside the map
234 |         limit_be = self.stair_mask_radius
235 | if s_y > limit_up:
236 | s_y = limit_up
237 | if s_y < self.stair_mask_radius:
238 | s_y = self.stair_mask_radius
239 | if s_x > limit_up:
240 | s_x = limit_up
241 | if s_x < self.stair_mask_radius:
242 | s_x = self.stair_mask_radius
243 | stair_mask[int(s_y-self.stair_mask_radius):int(s_y+self.stair_mask_radius), int(s_x-self.stair_mask_radius):int(s_x+self.stair_mask_radius)] = self.stair_mask
244 |
245 | translated_stair[0, 0:1, :, :] *= stair_mask
246 | translated_stair[0, 1:2, :, :] *= stair_mask
247 |
248 | # translated_stair[:, 13:14, :, :] = -self.max_pool(-translated_stair[:, 13:14, :, :])
249 |
250 | diff_ob_ex = translated_stair[:, 1:2, :, :] - translated_stair[:, 0:1, :, :]
251 |
252 | diff_ob_ex[diff_ob_ex>0.8] = 1.0
253 | diff_ob_ex[diff_ob_ex!=1.0] = 0.0
254 |
255 | maps3 = torch.cat((maps_last.unsqueeze(1), translated_stair.unsqueeze(1)), 1)
256 |
257 | map_pred_stair, _ = torch.max(maps3, 1)
258 |
259 | for i in range(eve_angle.shape[0]):
260 | if eve_angle[i] == 0:
261 | map_pred_stair[i, 0:1, :, :][diff_ob_ex[i] == 1.0] = 0.0
262 |
263 |
264 | # print("map_pred_-1: MAX ",map_pred[:,-1,...].max(),", MIN: ",map_pred[:,-1,...].min())
265 | return translated, map_pred, map_pred_stair, current_poses
266 |
267 |
268 | def get_mask(self, step_size):
269 | size = int(step_size) * 2
270 | mask = torch.zeros(size, size)
271 | for i in range(size):
272 | for j in range(size):
273 | if ((i + 0.5) - (size // 2)) ** 2 + \
274 | ((j + 0.5) - (size // 2)) ** 2 <= \
275 | step_size ** 2:
276 | mask[i, j] = 1
277 | return mask
278 |
279 |
280 | class FeedforwardNet(nn.Module):
281 |
282 | def __init__(self, input_dim, output_dim):
283 | super(FeedforwardNet, self).__init__()
284 | """ self.layers = nn.Sequential(
285 | nn.Linear(input_dim, 512),
286 | nn.BatchNorm1d(512),
287 | nn.ReLU(inplace=True),
288 | nn.Linear(512, 512),
289 | nn.BatchNorm1d(512),
290 | nn.ReLU(inplace=True),
291 | nn.Linear(512, 256),
292 | nn.BatchNorm1d(256),
293 | nn.ReLU(inplace=True),
294 | nn.Linear(256, output_dim),
295 | ) """
296 | self.layers = nn.Sequential(
297 | nn.Linear(input_dim, 512),
298 | nn.BatchNorm1d(512),
299 | nn.ReLU(inplace=True),
300 | nn.Linear(512, 256),
301 | nn.BatchNorm1d(256),
302 | nn.ReLU(inplace=True),
303 | nn.Linear(256, output_dim),
304 | )
305 |
306 | def forward(self, x):
307 | return self.layers(x)
--------------------------------------------------------------------------------
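Note on the map registration above: Semantic_Mapping.forward builds a rotation grid and a translation grid from the normalized pose `st_pose` (via `get_grid`) and applies `F.grid_sample` twice to paste the egocentric prediction into the allocentric map. Below is a minimal standalone sketch of that rotate-then-translate pattern; it is illustrative only and does not reproduce the exact conventions of the repo's `get_grid`.

    # Illustrative sketch (not the repo's utils.model.get_grid): register a local
    # egocentric map into an allocentric map with two grid_sample passes.
    import torch
    import torch.nn.functional as F

    def register_local_map(local_map, st_pose):
        """local_map: B x C x H x W; st_pose: B x 3 with (x, y, theta_deg) already
        normalized to [-1, 1] map coordinates, as done for `st_pose` in forward()."""
        bs = local_map.shape[0]
        x, y, t = st_pose[:, 0], st_pose[:, 1], torch.deg2rad(st_pose[:, 2])
        cos_t, sin_t = torch.cos(t), torch.sin(t)
        # Rotation-only affine (theta about the map centre)
        rot = torch.zeros(bs, 2, 3, device=local_map.device)
        rot[:, 0, 0], rot[:, 0, 1] = cos_t, -sin_t
        rot[:, 1, 0], rot[:, 1, 1] = sin_t, cos_t
        # Translation-only affine
        trans = torch.zeros(bs, 2, 3, device=local_map.device)
        trans[:, 0, 0] = trans[:, 1, 1] = 1.0
        trans[:, 0, 2], trans[:, 1, 2] = x, y
        rot_grid = F.affine_grid(rot, local_map.size(), align_corners=True)
        trans_grid = F.affine_grid(trans, local_map.size(), align_corners=True)
        rotated = F.grid_sample(local_map, rot_grid, align_corners=True)
        return F.grid_sample(rotated, trans_grid, align_corners=True)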
/RedNet/RedNet_model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | import torch.nn.functional as F
4 | import math
5 | import torch.utils.model_zoo as model_zoo
6 | import utils
7 | from torch.utils.checkpoint import checkpoint
8 | import os
9 | import torchvision
10 |
11 |
12 | class RedNet(nn.Module):
13 | def __init__(self, num_classes=40, pretrained=False):
14 |
15 | super().__init__()
16 |
17 | block = Bottleneck
18 | transblock = TransBasicBlock
19 | layers = [3, 4, 6, 3]
20 | # original resnet
21 | self.inplanes = 64
22 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
23 | bias=False)
24 | self.bn1 = nn.BatchNorm2d(64)
25 | self.relu = nn.ReLU(inplace=True)
26 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
27 | self.layer1 = self._make_layer(block, 64, layers[0])
28 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
29 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
30 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
31 |
32 | # resnet for depth channel
33 | self.inplanes = 64
34 | self.conv1_d = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3,
35 | bias=False)
36 | self.bn1_d = nn.BatchNorm2d(64)
37 | self.layer1_d = self._make_layer(block, 64, layers[0])
38 | self.layer2_d = self._make_layer(block, 128, layers[1], stride=2)
39 | self.layer3_d = self._make_layer(block, 256, layers[2], stride=2)
40 | self.layer4_d = self._make_layer(block, 512, layers[3], stride=2)
41 |
42 | self.inplanes = 512
43 | self.deconv1 = self._make_transpose(transblock, 256, 6, stride=2)
44 | self.deconv2 = self._make_transpose(transblock, 128, 4, stride=2)
45 | self.deconv3 = self._make_transpose(transblock, 64, 3, stride=2)
46 | self.deconv4 = self._make_transpose(transblock, 64, 3, stride=2)
47 |
48 | self.agant0 = self._make_agant_layer(64, 64)
49 | self.agant1 = self._make_agant_layer(64 * 4, 64)
50 | self.agant2 = self._make_agant_layer(128 * 4, 128)
51 | self.agant3 = self._make_agant_layer(256 * 4, 256)
52 | self.agant4 = self._make_agant_layer(512 * 4, 512)
53 |
54 | # final block
55 | self.inplanes = 64
56 | self.final_conv = self._make_transpose(transblock, 64, 3)
57 |
58 | self.final_deconv_custom = nn.ConvTranspose2d(self.inplanes, num_classes, kernel_size=2,
59 | stride=2, padding=0, bias=True)
60 |
61 | self.out5_conv_custom = nn.Conv2d(256, num_classes, kernel_size=1, stride=1, bias=True)
62 | self.out4_conv_custom = nn.Conv2d(128, num_classes, kernel_size=1, stride=1, bias=True)
63 | self.out3_conv_custom = nn.Conv2d(64, num_classes, kernel_size=1, stride=1, bias=True)
64 | self.out2_conv_custom = nn.Conv2d(64, num_classes, kernel_size=1, stride=1, bias=True)
65 |
66 |         if pretrained:  # if used, `pretrained` should be the path to a RedNet checkpoint
67 |             self._load_resnet_pretrained(pretrained)
68 |
69 | def weights_init(self, m):
70 | classname = m.__class__.__name__
71 | if classname.find('Conv') != -1:
72 | nn.init.kaiming_normal_(m.weight)
73 | if m.bias is not None:
74 | nn.init.zeros_(m.bias)
75 | elif isinstance(m, nn.BatchNorm2d):
76 | m.weight.data.fill_(1)
77 | m.bias.data.zero_()
78 |
79 |
80 |
81 | def _make_layer(self, block, planes, blocks, stride=1):
82 | downsample = None
83 | if stride != 1 or self.inplanes != planes * block.expansion:
84 | downsample = nn.Sequential(
85 | nn.Conv2d(self.inplanes, planes * block.expansion,
86 | kernel_size=1, stride=stride, bias=False),
87 | nn.BatchNorm2d(planes * block.expansion),
88 | )
89 |
90 | layers = []
91 |
92 | layers.append(block(self.inplanes, planes, stride, downsample))
93 | self.inplanes = planes * block.expansion
94 | for i in range(1, blocks):
95 | layers.append(block(self.inplanes, planes))
96 |
97 | return nn.Sequential(*layers)
98 |
99 | def _make_transpose(self, block, planes, blocks, stride=1):
100 |
101 | upsample = None
102 | if stride != 1:
103 | upsample = nn.Sequential(
104 | nn.ConvTranspose2d(self.inplanes, planes,
105 | kernel_size=2, stride=stride,
106 | padding=0, bias=False),
107 | nn.BatchNorm2d(planes),
108 | )
109 | elif self.inplanes != planes:
110 | upsample = nn.Sequential(
111 | nn.Conv2d(self.inplanes, planes,
112 | kernel_size=1, stride=stride, bias=False),
113 | nn.BatchNorm2d(planes),
114 | )
115 |
116 | layers = []
117 |
118 | for i in range(1, blocks):
119 | layers.append(block(self.inplanes, self.inplanes))
120 |
121 | layers.append(block(self.inplanes, planes, stride, upsample))
122 | self.inplanes = planes
123 |
124 | return nn.Sequential(*layers)
125 |
126 | def _make_agant_layer(self, inplanes, planes):
127 |
128 | layers = nn.Sequential(
129 | nn.Conv2d(inplanes, planes, kernel_size=1,
130 | stride=1, padding=0, bias=False),
131 | nn.BatchNorm2d(planes),
132 | nn.ReLU(inplace=True)
133 | )
134 | return layers
135 |
136 |     def _load_resnet_pretrained(self, ckpt):  # `ckpt`: path to a checkpoint saved with a 'model_state' entry
137 |         pretrain_dict = torch.load(ckpt, map_location='cpu')['model_state']
138 | prefix = 'module.'
139 | pretrain_dict = {
140 | (k[len(prefix):] if k[:len(prefix)] == prefix else k): v for k, v in pretrain_dict.items()
141 | }
142 | pretrain_dict.pop('final_deconv_custom.weight')
143 | pretrain_dict.pop('final_deconv_custom.bias')
144 |
145 | pretrain_dict.pop('out5_conv_custom.weight')
146 | pretrain_dict.pop('out5_conv_custom.bias')
147 |
148 | pretrain_dict.pop('out4_conv_custom.weight')
149 | pretrain_dict.pop('out4_conv_custom.bias')
150 |
151 | pretrain_dict.pop('out3_conv_custom.weight')
152 | pretrain_dict.pop('out3_conv_custom.bias')
153 |
154 | pretrain_dict.pop('out2_conv_custom.weight')
155 | pretrain_dict.pop('out2_conv_custom.bias')
156 |
157 |         state_dict = self.state_dict()
158 |         model_dict = {k: v for k, v in pretrain_dict.items() if k in state_dict}  # keep only weights this model actually has
159 |
160 | state_dict.update(model_dict)
161 | self.load_state_dict(state_dict)
162 |
163 | def forward_downsample(self, rgb, depth):
164 | x = self.conv1(rgb)
165 | x = self.bn1(x)
166 | x = self.relu(x)
167 | depth = self.conv1_d(depth)
168 | depth = self.bn1_d(depth)
169 | depth = self.relu(depth)
170 |
171 | fuse0 = x + depth
172 |
173 | x = self.maxpool(fuse0)
174 | depth = self.maxpool(depth)
175 |
176 | # block 1
177 | x = self.layer1(x)
178 | # debug_tensor('post1', depth)
179 | # for i, mod in enumerate(self.layer1_d):
180 | # depth = mod(depth)
181 | # debug_tensor(f'post1d-{i}', depth)
182 |
183 | depth = self.layer1_d(depth)
184 | # debug_tensor('post1d', depth)
185 |
186 | fuse1 = x + depth
187 | # block 2
188 | x = self.layer2(fuse1)
189 | depth = self.layer2_d(depth)
190 | fuse2 = x + depth
191 | # block 3
192 | x = self.layer3(fuse2)
193 | depth = self.layer3_d(depth)
194 | fuse3 = x + depth
195 | # block 4
196 | x = self.layer4(fuse3)
197 | depth = self.layer4_d(depth)
198 | fuse4 = x + depth
199 |
200 | agant4 = self.agant4(fuse4)
201 |
202 | return fuse0, fuse1, fuse2, fuse3, agant4
203 |
204 | def forward_upsample(self, fuse0, fuse1, fuse2, fuse3, agant4, train=False):
205 |
206 | # upsample 1
207 | x = self.deconv1(agant4)
208 | if train:
209 | out5 = self.out5_conv_custom(x)
210 | x = x + self.agant3(fuse3)
211 | # upsample 2
212 | x = self.deconv2(x)
213 | if train:
214 | out4 = self.out4_conv_custom(x)
215 | x = x + self.agant2(fuse2)
216 | # upsample 3
217 | x = self.deconv3(x)
218 | if train:
219 | out3 = self.out3_conv_custom(x)
220 | x = x + self.agant1(fuse1)
221 | # upsample 4
222 | x = self.deconv4(x)
223 | if train:
224 | out2 = self.out2_conv_custom(x)
225 | x = x + self.agant0(fuse0)
226 | # final
227 | x = self.final_conv(x)
228 |
229 | last_layer = x
230 |
231 | out = self.final_deconv_custom(x)
232 |
233 | if train:
234 | return out, out2, out3, out4, out5
235 |
236 | return out
237 |
238 | def forward(self, rgb, depth, train=False):
239 |
240 | fuses = self.forward_downsample(rgb, depth)
241 |
242 | # We only need predictions.
243 | # features_encoder = fuses[-1]
244 | # scores, features_lastlayer = self.forward_upsample(*fuses)
245 | scores = self.forward_upsample(*fuses, train)
246 | # debug_tensor('scores', scores)
247 | # return features_encoder, features_lastlayer, scores
248 | # print("scores: ", scores.shape)
249 | return scores
250 |
251 |
252 | def conv3x3(in_planes, out_planes, stride=1):
253 | "3x3 convolution with padding"
254 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
255 | padding=1, bias=False)
256 |
257 |
258 | class Bottleneck(nn.Module):
259 | expansion = 4
260 |
261 | def __init__(self, inplanes, planes, stride=1, downsample=None):
262 | super(Bottleneck, self).__init__()
263 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
264 | self.bn1 = nn.BatchNorm2d(planes)
265 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
266 | padding=1, bias=False)
267 | self.bn2 = nn.BatchNorm2d(planes)
268 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
269 | self.bn3 = nn.BatchNorm2d(planes * 4)
270 | self.relu = nn.ReLU(inplace=True)
271 | self.downsample = downsample
272 | self.stride = stride
273 |
274 | def forward(self, x):
275 | residual = x
276 |
277 | out = self.conv1(x)
278 | out = self.bn1(out)
279 | out = self.relu(out)
280 |
281 | out = self.conv2(out)
282 | out = self.bn2(out)
283 | out = self.relu(out)
284 |
285 | out = self.conv3(out)
286 | out = self.bn3(out)
287 |
288 | if self.downsample is not None:
289 | residual = self.downsample(x)
290 |
291 | out += residual
292 | out = self.relu(out)
293 |
294 | return out
295 |
296 | class TransBasicBlock(nn.Module):
297 | expansion = 1
298 |
299 | def __init__(self, inplanes, planes, stride=1, upsample=None, **kwargs):
300 | super(TransBasicBlock, self).__init__()
301 | self.conv1 = conv3x3(inplanes, inplanes)
302 | self.bn1 = nn.BatchNorm2d(inplanes)
303 | self.relu = nn.ReLU(inplace=True)
304 | if upsample is not None and stride != 1:
305 | self.conv2 = nn.ConvTranspose2d(inplanes, planes,
306 | kernel_size=3, stride=stride, padding=1,
307 | output_padding=1, bias=False)
308 | else:
309 | self.conv2 = conv3x3(inplanes, planes, stride)
310 | self.bn2 = nn.BatchNorm2d(planes)
311 | self.upsample = upsample
312 | self.stride = stride
313 |
314 | def forward(self, x):
315 | residual = x
316 |
317 | out = self.conv1(x)
318 | out = self.bn1(out)
319 | out = self.relu(out)
320 |
321 | out = self.conv2(out)
322 | out = self.bn2(out)
323 |
324 | if self.upsample is not None:
325 | residual = self.upsample(x)
326 |
327 | out += residual
328 | out = self.relu(out)
329 |
330 | return out
331 |
332 | class BatchNormalize(nn.Module):
333 | r"""
334 | I can't believe this isn't supported
335 | https://github.com/pytorch/vision/issues/157
336 | """
337 |
338 | def __init__(self, mean, std, device):
339 | super().__init__()
340 | self.mean = torch.tensor(mean, device=device)[None, :, None, None]
341 | self.std = torch.tensor(std, device=device)[None, :, None, None]
342 |
343 | def forward(self, x):
344 | # mean = torch.tensor(self.mean, device=x.device)
345 | # std = torch.tensor(self.std, device=x.device)
346 | # mean = self.mean[None, :, None, None]
347 | # std = self.std[None, :, None, None]
348 | # print(x.shape)
349 | # print(self.mean.shape)
350 | return (x - self.mean) / self.std
351 | # x.sub_(self.mean).div_(self.std)
352 | # return x
353 |
354 | class RedNetResizeWrapper(nn.Module):
355 | def __init__(self, device, resize=True, stabilize=False):
356 | super().__init__()
357 | self.rednet = RedNet()
358 | self.rednet.eval()
359 | self.semmap_rgb_norm = BatchNormalize(
360 | mean=[0.493, 0.468, 0.438],
361 | std=[0.544, 0.521, 0.499],
362 | device=device
363 | )
364 | self.semmap_depth_norm = BatchNormalize(
365 | mean=[0.213],
366 | std=[0.285],
367 | device=device
368 | )
369 | self.pretrained_size = (480, 640)
370 | self.resize = resize
371 | self.stabilize = stabilize
372 |
373 | def forward(self, rgb, depth, train=False):
374 | r"""
375 | Args:
376 | Raw sensor inputs.
377 | rgb: B x H=256 x W=256 x 3
378 | depth: B x H x W x 1
379 | Returns:
380 | semantic: drop-in replacement for default semantic sensor. B x H x W (no channel, for some reason)
381 | """
382 | # Not quite sure what depth is produced here. Let's just check
383 | if self.resize:
384 | _, og_h, og_w, _ = rgb.size() # b h w c
385 | rgb = rgb.permute(0, 3, 1, 2)
386 | depth = depth.permute(0, 3, 1, 2)
387 |
388 | rgb = rgb.float() / 255
389 | if self.resize:
390 | rgb = F.interpolate(rgb, self.pretrained_size, mode='bilinear')
391 | depth = F.interpolate(depth, self.pretrained_size, mode='nearest')
392 | # print("rgb: ", rgb.shape)
393 | rgb = self.semmap_rgb_norm(rgb)
394 |
395 | # rgb = torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406],
396 | # std=[0.229, 0.224, 0.225])(rgb)
397 | # depth = torchvision.transforms.Normalize(mean=[19050],
398 | # std=[9650])(depth)
399 |
400 | depth_clip = (depth < 1.0).squeeze(1)
401 | # depth_clip = ((depth < 1.0) & (depth > 0.0)).squeeze(1)
402 | # print("depth: ", depth.shape)
403 | depth = self.semmap_depth_norm(depth)
404 | with torch.no_grad():
405 | scores = self.rednet(rgb, depth, train)
406 | # print("scores: ", scores[0][5][100])
407 |
408 | if train:
409 | return scores
410 | else:
411 | with torch.no_grad():
412 | scores = nn.Softmax(dim=1)(scores)
413 | # print("score: ", scores.shape) #torch.Size([1, 40, 480, 640])
414 | # print("pred: ", pred.shape) # torch.Size([1, 40, 480, 640])
415 | pred = (torch.max(scores, 1)[1] + 1).float() # B x 480 x 640
416 |
417 | # print("torch: ", pred[0][100])
418 | pred[torch.max(scores, 1)[0] < 0.8] = 1
419 |
420 | if self.stabilize: # downsample tiny
421 | # Mask out out of depth samples
422 | pred[~depth_clip] = -1 # 41 is UNK in MP3D, but hab-sim renders with -1
423 | # pred = F.interpolate(pred.unsqueeze(1), (15, 20), mode='nearest').squeeze(1)
424 | if self.resize:
425 | pred = F.interpolate(pred.unsqueeze(1), (og_h, og_w), mode='nearest')
426 |
427 | return pred.squeeze(1)
428 |
429 | # def load_rednet(device, ckpt="", resize=True, stabilize=False):
430 | # if not os.path.isfile(ckpt):
431 | # raise Exception(f"invalid path {ckpt} provided for rednet weights")
432 |
433 | # model = RedNetResizeWrapper(device, resize=resize, stabilize=stabilize).to(device)
434 |
435 | # print("=> loading RedNet checkpoint '{}'".format(ckpt))
436 | # if device.type == 'cuda':
437 | # checkpoint = torch.load(ckpt, map_location='cpu')
438 | # else:
439 | # checkpoint = torch.load(ckpt, map_location=lambda storage, loc: storage)
440 |
441 | # state_dict = checkpoint['model_state']
442 | # prefix = 'module.'
443 | # state_dict = {
444 | # (k[len(prefix):] if k[:len(prefix)] == prefix else k): v for k, v in state_dict.items()
445 | # }
446 | # state_dict.pop('final_deconv_custom.weight')
447 | # state_dict.pop('final_deconv_custom.bias')
448 |
449 | # state_dict.pop('out5_conv_custom.weight')
450 | # state_dict.pop('out5_conv_custom.bias')
451 |
452 | # state_dict.pop('out4_conv_custom.weight')
453 | # state_dict.pop('out4_conv_custom.bias')
454 |
455 | # state_dict.pop('out3_conv_custom.weight')
456 | # state_dict.pop('out3_conv_custom.bias')
457 |
458 | # state_dict.pop('out2_conv_custom.weight')
459 | # state_dict.pop('out2_conv_custom.bias')
460 |
461 | # model.rednet.state_dict().update(state_dict)
462 | # # model.rednet.load_state_dict(state_dict)
463 | # print("=> loaded checkpoint '{}' (epoch {})"
464 | # .format(ckpt, checkpoint['epoch']))
465 | # # print(model)
466 | # print("Model's state_dict:")
467 | # # for param_tensor in model.state_dict():
468 | # # print(param_tensor, "\t", model.state_dict()[param_tensor].size())
469 |
470 | # return model
471 |
472 |
473 | def load_rednet(device, ckpt="", resize=True, stabilize=False):
474 | if not os.path.isfile(ckpt):
475 | raise Exception(f"invalid path {ckpt} provided for rednet weights")
476 |
477 | model = RedNetResizeWrapper(device, resize=resize, stabilize=stabilize).to(device)
478 |
479 | print("=> loading RedNet checkpoint '{}'".format(ckpt))
480 | if device.type == 'cuda':
481 | checkpoint = torch.load(ckpt, map_location='cpu')
482 | else:
483 | checkpoint = torch.load(ckpt, map_location=lambda storage, loc: storage)
484 |
485 | # state_dict = checkpoint['state_dict']
486 | state_dict = checkpoint['model_state']
487 | prefix = 'module.'
488 | state_dict = {
489 | (k[len(prefix):] if k[:len(prefix)] == prefix else k): v for k, v in state_dict.items()
490 | }
491 | # model.load_state_dict(state_dict)
492 | model.rednet.load_state_dict(state_dict)
493 | print("=> loaded checkpoint '{}' (epoch {})"
494 | .format(ckpt, checkpoint['epoch']))
495 | # print(model)
496 | print("Model's state_dict:")
497 | # for param_tensor in model.state_dict():
498 | # print(param_tensor, "\t", model.state_dict()[param_tensor].size())
499 |
500 | return model
501 |
502 | def save_ckpt(ckpt_dir, model, optimizer, global_step, epoch, local_count, num_train):
503 |     # usually this happens only at the start of an epoch
504 |     epoch_float = epoch + (local_count / num_train)
506 | state = {
507 | 'global_step': global_step,
508 | 'epoch': epoch_float,
509 | 'state_dict': model.state_dict(),
510 | 'optimizer': optimizer.state_dict(),
511 | }
512 | ckpt_model_filename = "ckpt_epoch_{:0.2f}.pth".format(epoch_float)
513 | path = os.path.join(ckpt_dir, ckpt_model_filename)
514 | torch.save(state, path)
515 | print('{:>2} has been successfully saved'.format(path))
--------------------------------------------------------------------------------
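Usage sketch for the loader above. The checkpoint path is a placeholder (the repo's .gitignore reserves RedNet/model/ for weights); any RedNet checkpoint saved with 'model_state' and 'epoch' keys should work.

    import torch
    # from RedNet.RedNet_model import load_rednet   # when imported from the repo root

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = load_rednet(device, ckpt="RedNet/model/rednet.pth", resize=True)  # placeholder path
    rgb = torch.zeros(1, 256, 256, 3, device=device)    # B x H x W x 3, raw 0-255 values
    depth = torch.zeros(1, 256, 256, 1, device=device)  # B x H x W x 1
    with torch.no_grad():
        sem = model(rgb, depth)  # B x H x W category ids from RedNetResizeWrapper.forward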
/get_point_score.py:
--------------------------------------------------------------------------------
1 | import sys,pdb,os,time
2 | import torch
3 | from PIL import Image
4 | import matplotlib.pyplot as plt
5 | import numpy as np
6 | from torchvision.transforms.functional import to_pil_image
7 | from lavis.models import model_zoo
8 | from lavis.models import load_model_and_preprocess
9 | from lavis.common.gradcam import getAttMap
10 | from lavis.models.blip_models.blip_image_text_matching import compute_gradcam
11 | import cv2
12 | import scipy.ndimage
13 | import gzip
14 | import json
15 |
16 | def crop_img(image, vis_processors,scales=[1,2,4]):
17 | """
18 | Crops the image into multiple patches at different scales and visualizes the patches.
19 |
20 | Parameters:
21 |     - image: The PIL Image to be processed.
22 | - scales: A list of scales at which to crop the image.
23 |
24 | Returns:
25 | - A list of PIL Image objects representing the cropped patches.
26 | """
27 |
28 | # Initialize a list to hold the cropped images
29 | cropped_images = []
30 | cropped_batch=[]
31 | # Define the crop function using numpy slicing
32 |
33 | def crop_numpy(img, num_x, num_y):
34 | # Convert to numpy array for slicing
35 | np_img = np.array(img)
36 | h, w, _ = np_img.shape
37 | patches=list(Image.fromarray(np_img[i * h // num_y:(i + 1) * h // num_y,
38 | j * w // num_x:(j + 1) * w // num_x, :])
39 | for i in range(num_y) for j in range(num_x))
40 | patches_batch = list(vis_processors(Image.fromarray(np_img[i * h // num_y:(i + 1) * h // num_y,
41 |                                             j * w // num_x:(j + 1) * w // num_x, :])).to(device)  # note: relies on a globally defined `device`
42 | for i in range(num_y) for j in range(num_x))
43 |
44 | return patches, patches_batch
45 |
46 | for scale in scales:
47 | # Calculate number of patches in each axis
48 | num_x, num_y = scale, scale
49 | # Crop using numpy and store in dictionary
50 | patch,batch=crop_numpy(image, num_x, num_y)
51 | cropped_images.extend(patch)
52 | cropped_batch.extend(batch)
53 |
54 | return cropped_images,torch.stack(cropped_batch)
55 |
56 | def crop_img_padding(image, vis_processors,scales=[0.5, 0.25]):
57 | w, h = image.size
58 | patches = [image]
59 | for scale in scales:
60 | # Determine the size of the patches
61 | new_w, new_h = int(w * scale), int(h * scale)
62 |
63 | # Calculate the number of patches to create
64 | patches_across_width = w // new_w
65 | patches_across_height = h // new_h
66 |
67 | for i in range(patches_across_width):
68 | for j in range(patches_across_height):
69 | # Calculate the left, upper, right, and lower pixel coordinates for cropping
70 | left = i * new_w
71 | upper = j * new_h
72 | right = left + new_w
73 | lower = upper + new_h
74 |
75 | # Crop the patch and create a masked image of the original size
76 | patch = image.crop((left, upper, right, lower))
77 | mask = Image.new("RGB", (w, h), (0, 0, 0))
78 | mask.paste(patch, (left, upper))
79 | patches.append(mask)
80 | return patches #from top to down, from large patch to small patch
81 |
82 | def vis_patches(cropped_images,pth):
83 | # Visualize the cropped images
84 | fig, axes = plt.subplots(len(cropped_images), 1, figsize=(10, len(cropped_images) * 5))
85 | if len(cropped_images) == 1:
86 | axes = [axes]
87 | for ax, img in zip(axes, cropped_images):
88 | ax.imshow(img)
89 | ax.axis('off')
90 | plt.tight_layout()
91 | plt.savefig(pth)
92 |
93 | def save_images_with_scores(batch, itc_score, attributes,directory):
94 | for attr in attributes:
95 | for i, tensor in enumerate(batch):
96 | # Convert the tensor to a PIL Image
97 | mean = torch.tensor([0.48145466, 0.4578275, 0.40821073])[:,None,None].cuda()
98 | std = torch.tensor([0.26862954, 0.26130258, 0.27577711])[:,None,None].cuda()
99 | # pdb.set_trace()
100 | tensor=tensor*std+mean
101 | image = to_pil_image(tensor)
102 | score = itc_score[attr][i].item()
103 | # Save the image with the score in the filename
104 | filename = f"image_{i}_attri_{attr}_score_{score:.4f}.png"
105 | image.save(os.path.join(directory, filename))
106 |
107 | def depth_to_colormap(depth_array, colormap=cv2.COLORMAP_JET):
108 | # Normalize the depth array to the range [0, 255]
109 | normalized_depth = cv2.normalize(depth_array, None, 0, 255, cv2.NORM_MINMAX)
110 | # Convert to unsigned 8-bit type
111 | normalized_depth = normalized_depth.astype(np.uint8)
112 | # Apply colormap
113 | colored_depth = cv2.applyColorMap(normalized_depth, colormap)
114 | colored_depth_pil = Image.fromarray(cv2.cvtColor(colored_depth, cv2.COLOR_BGR2RGB))
115 | return colored_depth_pil
116 |
117 | def get_pyramid_score(image, vis_processors, txt_processors,score_processor,attributes,blip_devices=None,scales=[1, 2, 4]):
118 | """
119 | Crops the image into multiple patches at different scales, computes scores for each patch,
120 |     and maps the summed scores back to the original image pixels (per-patch normalization is left as a TODO below).
121 |
122 | Parameters:
123 | - image: The PIL Image to be processed.
124 | - vis_processors: A function that processes each patch and returns a tensor suitable for scoring.
125 | - score_processor: A function that takes a batch of patches and returns scores.
126 | - scales: A list of scales at which to crop the image.
127 |
128 | Returns:
129 | - A tuple of (cropped_images, scores_map) where scores_map is a numpy array of the same shape as the original image with scores assigned to each pixel.
130 | """
131 |
132 | # Initialize lists to hold cropped images and their scores
133 | cropped_images = []
134 | cropped_batches = []
135 |
136 | # Define the crop function using numpy slicing
137 | def crop_numpy(img, num_x, num_y):
138 | np_img = np.array(img)
139 | h, w, _ = np_img.shape
140 | return [vis_processors(Image.fromarray(np_img[i * h // num_y:(i + 1) * h // num_y,
141 | j * w // num_x:(j + 1) * w // num_x, :])).to(blip_devices)
142 | for i in range(num_y) for j in range(num_x)]
143 |
144 |
145 | # Process each scale
146 | for scale in scales:
147 | num_x, num_y = scale, scale
148 | patches = crop_numpy(image, num_x, num_y)
149 | cropped_images.extend(patches)
150 | # batch = torch.stack(cropped_images)
151 |
152 | attr_score={}
153 | attr_score_map={}
154 | for attr in attributes:
155 |
156 | image_scores = score_processor({"image": torch.stack(cropped_images), "text_input": attr}, match_head='itc')
157 | attr_score[attr] = image_scores.detach().cpu().numpy()
158 |
159 | # Normalize the scores: TODO
160 | # total_score = sum(image_scores)
161 | # normalized_scores = [score / total_score for score in image_scores]
162 |
163 | # Create a map of scores for the original image
164 | scores_map = np.zeros(np.array(image).shape[:2], dtype=np.float32)
165 | patch_idx = 0
166 | for scale in scales:
167 | num_x, num_y = scale, scale
168 | patch_height = scores_map.shape[0] // num_y
169 | patch_width = scores_map.shape[1] // num_x
170 | for i in range(num_y):
171 | for j in range(num_x):
172 | # scores_map[i * patch_height:(i + 1) * patch_height, j * patch_width:(j + 1) * patch_width] += normalized_scores[patch_idx]
173 | scores_map[i * patch_height:(i + 1) * patch_height, j * patch_width:(j + 1) * patch_width] += image_scores[patch_idx].detach().cpu().numpy()
174 | patch_idx += 1
175 |
176 | # Average the scores map if there is overlap (simple averaging for overlap handling)
177 | # scores_map /= len(scales)
178 |         attr_score_map[attr] = scores_map / len(scales)  # average over the pyramid levels (was hard-coded /3)
179 | # pdb.set_trace()
180 | return cropped_images, attr_score_map
181 |
182 | def get_pyramid_clip_score(image, vis_processors, txt_processors,score_processor,attributes,blip_devices=None,scales=[1, 2, 4]):
183 | """
184 | Crops the image into multiple patches at different scales, computes scores for each patch,
185 | normalizes these scores, and maps them back to the original image pixels.
186 |
187 | Parameters:
188 | - image: The PIL Image to be processed.
189 | - vis_processors: A function that processes each patch and returns a tensor suitable for scoring.
190 | - score_processor: A function that takes a batch of patches and returns scores.
191 | - scales: A list of scales at which to crop the image.
192 |
193 | Returns:
194 | - A tuple of (cropped_images, scores_map) where scores_map is a numpy array of the same shape as the original image with scores assigned to each pixel.
195 | """
196 |
197 | # Initialize lists to hold cropped images and their scores
198 | cropped_images = []
199 | cropped_batches = []
200 |
201 | # Define the crop function using numpy slicing
202 | def crop_numpy(img, num_x, num_y):
203 | np_img = np.array(img)
204 | h, w, _ = np_img.shape
205 | return [vis_processors(Image.fromarray(np_img[i * h // num_y:(i + 1) * h // num_y,
206 | j * w // num_x:(j + 1) * w // num_x, :])).to(blip_devices)
207 | for i in range(num_y) for j in range(num_x)]
208 |
209 | # Process each scale
210 | for scale in scales:
211 | num_x, num_y = scale, scale
212 | patches = crop_numpy(image, num_x, num_y)
213 | cropped_images.extend(patches)
214 | # batch = torch.stack(cropped_images)
215 |
216 | attr_score_map = {}
217 | for att_i,attr in enumerate(attributes):
218 | sample = {"image": torch.stack(cropped_images), "text_input": txt_processors(attr)}
219 |
220 | clip_features = score_processor.extract_features(sample)
221 |
222 | features_image = clip_features.image_embeds_proj
223 | features_text = clip_features.text_embeds_proj
224 |
225 | image_scores = (features_image @ features_text.t()).squeeze()
226 |
227 | scores_map = np.zeros(np.array(image).shape[:2], dtype=np.float32)
228 | patch_idx = 0
229 | for scale in scales:
230 | num_x, num_y = scale, scale
231 | patch_height = scores_map.shape[0] // num_y
232 | patch_width = scores_map.shape[1] // num_x
233 | for i in range(num_y):
234 | for j in range(num_x):
235 | # scores_map[i * patch_height:(i + 1) * patch_height, j * patch_width:(j + 1) * patch_width] += normalized_scores[patch_idx]
236 | scores_map[i * patch_height:(i + 1) * patch_height, j * patch_width:(j + 1) * patch_width] += \
237 | image_scores[patch_idx].detach().cpu().numpy()
238 | patch_idx += 1
239 |
240 | attr_score_map[attr] = scores_map / len(scales)
241 |
242 | return cropped_images, attr_score_map
243 |
244 | def get_att_score(image, vis_processors, txt_processors,score_processor,attributes,blip_devices=None):
245 | img=vis_processors(image).unsqueeze(0).to(blip_devices)
246 |
247 | txt = txt_processors(attributes[0])
248 | txt_tokens = score_processor.tokenizer(txt, return_tensors="pt").to(blip_devices)
249 | # pdb.set_trace()
250 | gradcam, _ = compute_gradcam(score_processor, img, txt, txt_tokens, block_num=7)
251 | gradcam_np = gradcam[0][1].numpy().astype(np.float32)
252 |     avg_gradcam = getAttMap(np.array(image) / 255., gradcam_np, blur=True, overlap=False)  # use the function's own `image` argument
253 | return avg_gradcam
254 |
255 | def save_scores_map(scores_map, filename='score_map.png'):
256 | """
257 | Visualizes and saves the scores map with a color bar.
258 |
259 | Parameters:
260 | - scores_map: A numpy array with scores assigned to each pixel.
261 | - filename: The filename to save the image to.
262 | """
263 |
264 | # Normalize scores_map to the range 0-1 for better color mapping
265 | normalized_map = (scores_map - np.min(scores_map)) / (np.max(scores_map) - np.min(scores_map))
266 |
267 | # Create a figure and an axes object
268 | fig, ax = plt.subplots()
269 |
270 | # Display the image
271 | cax = ax.imshow(normalized_map, cmap='viridis') # Choose a colormap that you prefer
272 |
273 | # Create a color bar
274 | cbar = fig.colorbar(cax, ax=ax)
275 | cbar.set_label('Score')
276 |
277 | # Save the figure
278 | plt.axis('off') # Turn off axis
279 | plt.savefig(filename, bbox_inches='tight', pad_inches=0)
280 |
281 | # Close the plot to free up memory
282 | plt.close(fig)
283 |
284 | def overlay_attention_map(image, scores_map, filename='attention_overlay.png'):
285 | """
286 | Applies a smoothed attention map over an image to highlight areas based on the scores,
287 | and then saves the resulting image.
288 |
289 | Parameters:
290 | - image: A PIL Image to be enhanced.
291 | - scores_map: A numpy array with scores assigned to each pixel.
292 | - filename: The filename to save the image to.
293 | """
294 |
295 | # Normalize scores_map to the range 0-1 for better color mapping
296 | normalized_map = (scores_map - np.min(scores_map)) / (np.max(scores_map) - np.min(scores_map))
297 |
298 | # Apply Gaussian smoothing to the scores map
299 | smoothed_map = scipy.ndimage.gaussian_filter(normalized_map, sigma=2) # Adjust sigma as needed for the desired smoothness
300 |
301 | # Convert the original image to a numpy array for processing
302 | img_array = np.array(image)
303 |
304 | # Convert the smoothed scores map to a heatmap
305 | plt.figure(figsize=(10, 6))
306 | plt.imshow(img_array, cmap='gray', interpolation='nearest') # Display the original image
307 | plt.imshow(smoothed_map, cmap='jet', alpha=0.6, interpolation='bilinear') # Overlay the attention map
308 | plt.colorbar() # Optionally add a color bar to indicate scoring scale
309 | plt.axis('off') # Hide axes
310 | # Save the figure
311 | plt.savefig(filename, bbox_inches='tight', pad_inches=0)
312 | plt.close()
313 |
314 | def doubly_right_detection(image,model, vis_processors,target,blip_devices=None):
315 | image = vis_processors(image).unsqueeze(0).to(blip_devices)
316 | out=model.generate({"image": image, "prompt": f"Question: is {target} shown in this image? Answer:"})
317 | print(out)
318 | if "yes" in out[0].lower():
319 | return True
320 | else:
321 | return False
322 |
323 | class Pos_Queue:
324 | def __init__(self, capacity):
325 | self.capacity = capacity
326 | self.queue = []
327 |
328 | def add_element(self, element):
329 | # Add the new element to the queue
330 | self.queue.append(element)
331 | # Check if the queue is beyond capacity
332 | if len(self.queue) > self.capacity:
333 | # If it's full, pop and return the oldest element
334 | return self.queue.pop(0)
335 | # If not full, return None
336 | return None
337 |
338 |
339 |
340 | if __name__ == "__main__":
341 | # setup model
342 | # device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
343 |
344 | # model, vis_processors, text_processors = load_model_and_preprocess(name="blip2_feature_extractor",
345 | # model_type="pretrain", is_eval=True,
346 | # device=device)
347 |
348 | # model, vis_processors, text_processors = load_model_and_preprocess("blip2_image_text_matching", "pretrain",
349 | # device=device, is_eval=True)
350 |
351 | # model, vis_processors, text_processors = load_model_and_preprocess("blip_image_text_matching", "large",
352 | # device=device, is_eval=True)
353 |
354 | # model, vis_processors, _ = load_model_and_preprocess(name="blip2_opt", model_type="pretrain_opt2.7b",
355 | # is_eval=True, device=device)
356 |
357 | # model, vis_processors, text_processors = load_model_and_preprocess("clip_feature_extractor", model_type="ViT-B-32",
358 | # is_eval=True, device=device)
359 | # load the observation
360 | # data_root="/home/mmvc/Desktop/L3MVN_Reproduce/L3MVN/vis/3d_score"
361 | # obs = np.load(os.path.join(data_root,"sample_T.npy"))#0:3 rgb, 4 depth
362 |
363 | # get rgb
364 | # rgb=obs[:,:,0:3].astype(np.uint8)
365 | # rgb_obs=Image.fromarray(rgb)
366 | # rgb_obs.save(os.path.join(data_root,"rgb.png"))
367 | # patches,batch=crop_img(rgb_obs,vis_processors["eval"])
368 | # vis_patches(patches,os.path.join(data_root,"patches.png"))
369 |
370 | # get detph
371 | # depth=obs[:,:,4]
372 | # depth_img=depth_to_colormap(depth,)
373 | # depth_img.save(os.path.join(data_root,"depth.png"))
374 |
375 | #batch img and list txt 3.2s
376 | # txt_list=[]
377 | # for i in range(batch.shape[0]):
378 | # txt_list.append(txt)
379 | # for i in range(5):
380 | # itc_score = model({"image": batch, "text_input": txt_list}, match_head='itc')
381 |
382 | # single img and single txt 8.13s
383 | # txt="chair"
384 | # for j in range(5):
385 | # for i in range(batch.shape[0]):
386 | # itc_score = model({"image": batch[i][None,:], "text_input": txt}, match_head='itc')
387 | # print(itc_score)
388 | #batch img and single txt 3.1s
389 | # "leg",
390 | # "backrest",
391 | # "seat",
392 | # "armrest"
393 |
394 | # attributes=['chair backrest', 'chair seat', 'chair leg', 'chair armrest']
395 | # target=["chair"]
396 |
397 | t=time.time()
398 | # ------------- multi_scale approach ------------------
399 | # attributes=['object with chair leg, chair backrest, chair seat, and chair armrest']
400 | # imgs,score_mape=get_pyramid_score(rgb_obs,vis_processors["eval"],text_processors["eval"],model,attributes,device)
401 | # imgs,score_mape=get_pyramid_score_feature(rgb_obs,vis_processors["eval"],text_processors["eval"],model,attributes,device)
402 |
403 | # ------------- gradcam approach -------------
404 | # img=vis_processors["eval"](rgb_obs).unsqueeze(0).to(device)
405 | # txt = text_processors["eval"](attributes[0])
406 | # txt_tokens = model.tokenizer(txt, return_tensors="pt").to(device)
407 | # gradcam, _ = compute_gradcam(model, img, txt, txt_tokens, block_num=7)
408 | # gradcam_np = gradcam[0][1].numpy().astype(np.float32)
409 | # avg_gradcam = getAttMap(np.array(rgb_obs)/ 255., gradcam_np, blur=True,overlap=False)
410 |
411 | # fig, ax = plt.subplots(1, 1, figsize=(10, 10))
412 | # ax.imshow(avg_gradcam)
413 | # ax.axis('off')
414 | # plt.tight_layout()
415 | # plt.savefig(os.path.join(data_root, f"grad_score_map.png"))
416 |
417 | # ------------------ doubly right detection ------------------
418 | # print(doubly_right_detection(rgb_obs,vis_processors["eval"],target[0],device))
419 | # end_t=time.time()
420 |
421 | # ------------------ CLIP multi-scale ------------------
422 | # imgs,score_mape=get_pyramid_clip_score(rgb_obs,vis_processors["eval"],text_processors["eval"],model,attributes,device)
423 |
424 | # check dataset
425 | data_pth='data/objectgoal_hm3d/val_mini/content/TEEsavR23oF.json.gz'
426 | with gzip.open(data_pth, 'r') as f:
427 | data = json.loads(f.read().decode('utf-8'))
428 | for k in data.keys():
429 | print(k,len(data[k]))
430 | print("========================")
431 | print(data['episodes'][1])
432 | print("========================")
433 | print(data['episodes'][2])
434 | print("========================")
435 | print(data['episodes'][3])
436 | end_t=time.time()
437 |
438 | # for k in score_mape.keys():
439 | # overlay_attention_map(rgb_obs,score_mape[k],os.path.join(data_root,f"clip_score_map_{k}.png"))
440 |
441 | print(f"duration: {end_t-t} s")
442 |
443 | # print('The image feature and text feature has a cosine similarity of %.4f'%itc_score)
444 |
--------------------------------------------------------------------------------
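The pyramid scoring in get_pyramid_score / get_pyramid_clip_score above follows one pattern: score each patch at scales 1x1, 2x2 and 4x4, paint each patch score back over the pixels it covers, and average over the number of levels. A self-contained sketch of just that aggregation step (the patch scores here are synthetic; in the file above they come from the BLIP/CLIP matching head):

    import numpy as np

    def splat_pyramid_scores(img_hw, patch_scores, scales=(1, 2, 4)):
        h, w = img_hw
        scores_map = np.zeros((h, w), dtype=np.float32)
        idx = 0
        for s in scales:
            ph, pw = h // s, w // s          # patch size at this scale
            for i in range(s):
                for j in range(s):
                    scores_map[i * ph:(i + 1) * ph, j * pw:(j + 1) * pw] += patch_scores[idx]
                    idx += 1
        return scores_map / len(scales)      # average over the pyramid levels

    # 1 + 4 + 16 = 21 patch scores for scales (1, 2, 4)
    demo = splat_pyramid_scores((480, 640), np.random.rand(21).astype(np.float32))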
/envs/habitat/objectgoal_env.py:
--------------------------------------------------------------------------------
1 | import json
2 | import bz2
3 | import gzip
4 | import _pickle as cPickle
5 | import gym
6 | import numpy as np
7 | import quaternion
8 | import skimage.morphology
9 | import habitat
10 |
11 | from envs.utils.fmm_planner import FMMPlanner
12 | from constants import coco_categories, hm3d_category
13 | import envs.utils.pose as pu
14 |
15 |
16 | class ObjectGoal_Env(habitat.RLEnv):
17 | """The Object Goal Navigation environment class. The class is responsible
18 | for loading the dataset, generating episodes, and computing evaluation
19 | metrics.
20 | """
21 |
22 | def __init__(self, args, rank, config_env, dataset):
23 | self.args = args
24 | self.rank = rank
25 |
26 | super().__init__(config_env, dataset)
27 |
28 | # Loading dataset info file
29 | self.split = config_env.DATASET.SPLIT
30 | self.episodes_dir = config_env.DATASET.EPISODES_DIR.format(
31 | split=self.split)
32 |
33 | dataset_info_file = self.episodes_dir + \
34 | "{split}_info.pbz2".format(split=self.split)
35 | with bz2.BZ2File(dataset_info_file, 'rb') as f:
36 | self.dataset_info = cPickle.load(f)
37 |
38 | # Specifying action and observation space
39 | self.action_space = gym.spaces.Discrete(3)
40 |
41 | self.observation_space = gym.spaces.Box(0, 255,
42 | (3, args.frame_height,
43 | args.frame_width),
44 | dtype='uint8')
45 |
46 | # Initializations
47 | self.episode_no = 0
48 |
49 | # Scene info
50 | self.last_scene_path = None
51 | self.scene_path = None
52 | self.scene_name = None
53 |
54 | # Episode Dataset info
55 | self.eps_data = None
56 | self.eps_data_idx = None
57 | self.gt_planner = None
58 | self.object_boundary = None
59 | self.goal_idx = None
60 | self.goal_name = None
61 | self.map_obj_origin = None
62 | self.starting_loc = None
63 | self.starting_distance = None
64 |
65 | # Episode tracking info
66 | self.curr_distance = None
67 | self.prev_distance = None
68 | self.timestep = None
69 | self.stopped = None
70 | self.path_length = None
71 | self.last_sim_location = None
72 | self.trajectory_states = []
73 | self.info = {}
74 | self.info['distance_to_goal'] = None
75 | self.info['spl'] = None
76 | self.info['success'] = None
77 |
78 | fileName = 'data/matterport_category_mappings.tsv'
79 |
80 | text = ''
81 | lines = []
82 | items = []
83 | self.hm3d_semantic_mapping={}
84 |
85 | with open(fileName, 'r') as f:
86 | text = f.read()
87 | lines = text.split('\n')
88 |
89 | for l in lines:
90 | items.append(l.split(' '))
91 |
92 | for i in items:
93 | if len(i) > 3:
94 | self.hm3d_semantic_mapping[i[2]] = i[-1]
95 |
96 | def load_new_episode(self):
97 | """The function loads a fixed episode from the episode dataset. This
98 | function is used for evaluating a trained model on the val split.
99 | """
100 |
101 | args = self.args
102 | self.scene_path = self.habitat_env.sim.curr_scene_name
103 | scene_name = self.scene_path.split("/")[-1].split(".")[0]
104 |
105 | if self.scene_path != self.last_scene_path:
106 | episodes_file = self.episodes_dir + \
107 | "content/{}_episodes.json.gz".format(scene_name)
108 |
109 | print("Loading episodes from: {}".format(episodes_file))
110 | with gzip.open(episodes_file, 'r') as f:
111 | self.eps_data = json.loads(
112 | f.read().decode('utf-8'))["episodes"]
113 |
114 | self.eps_data_idx = 0
115 | self.last_scene_path = self.scene_path
116 |
117 | # Load episode info
118 | episode = self.eps_data[self.eps_data_idx]
119 | self.eps_data_idx += 1
120 | self.eps_data_idx = self.eps_data_idx % len(self.eps_data)
121 | pos = episode["start_position"]
122 | rot = quaternion.from_float_array(episode["start_rotation"])
123 |
124 | goal_name = episode["object_category"]
125 | goal_idx = episode["object_id"]
126 | floor_idx = episode["floor_id"]
127 |
128 | # Load scene info
129 | scene_info = self.dataset_info[scene_name]
130 | sem_map = scene_info[floor_idx]['sem_map']
131 | map_obj_origin = scene_info[floor_idx]['origin']
132 |
133 | # Setup ground truth planner
134 | object_boundary = args.success_dist
135 | map_resolution = args.map_resolution
136 | selem = skimage.morphology.disk(2)
137 | traversible = skimage.morphology.binary_dilation(
138 | sem_map[0], selem) != True
139 | traversible = 1 - traversible
140 | planner = FMMPlanner(traversible)
141 | selem = skimage.morphology.disk(
142 | int(object_boundary * 100. / map_resolution))
143 | goal_map = skimage.morphology.binary_dilation(
144 | sem_map[goal_idx + 1], selem) != True
145 | goal_map = 1 - goal_map
146 | planner.set_multi_goal(goal_map)
147 |
148 | # Get starting loc in GT map coordinates
149 | x = -pos[2]
150 | y = -pos[0]
151 | min_x, min_y = map_obj_origin / 100.0
152 | map_loc = int((-y - min_y) * 20.), int((-x - min_x) * 20.)
153 |
154 | self.gt_planner = planner
155 | self.starting_loc = map_loc
156 | self.object_boundary = object_boundary
157 | self.goal_idx = goal_idx
158 | self.goal_name = goal_name
159 | self.map_obj_origin = map_obj_origin
160 |
161 | self.starting_distance = self.gt_planner.fmm_dist[self.starting_loc]\
162 | / 20.0 + self.object_boundary
163 | self.prev_distance = self.starting_distance
164 | self._env.sim.set_agent_state(pos, rot)
165 |
166 | # The following two should match approximately
167 | # print(starting_loc)
168 | # print(self.sim_continuous_to_sim_map(self.get_sim_location()))
169 |
170 | obs = self._env.sim.get_observations_at(pos, rot)
171 |
172 | return obs
173 |
174 | def generate_new_episode(self):
175 | """The function generates a random valid episode. This function is used
176 | for training a model on the train split.
177 | """
178 |
179 | args = self.args
180 |
181 | self.scene_path = self.habitat_env.sim.config.SCENE
182 | scene_name = self.scene_path.split("/")[-1].split(".")[0]
183 |
184 | scene_info = self.dataset_info[scene_name]
185 | map_resolution = args.map_resolution
186 |
187 | floor_idx = np.random.randint(len(scene_info.keys()))
188 | floor_height = scene_info[floor_idx]['floor_height']
189 | sem_map = scene_info[floor_idx]['sem_map']
190 | map_obj_origin = scene_info[floor_idx]['origin']
191 |
192 | cat_counts = sem_map.sum(2).sum(1)
193 | possible_cats = list(np.arange(6))
194 |
195 | for i in range(6):
196 | if cat_counts[i + 1] == 0:
197 | possible_cats.remove(i)
198 |
199 | object_boundary = args.success_dist
200 |
201 | loc_found = False
202 | while not loc_found:
203 |             if len(possible_cats) == 0:
204 |                 print("No valid objects for {}".format(floor_height))
205 |                 # no category left to sample on this floor; fail loudly instead of looping forever
206 |                 raise ValueError("No valid object categories for floor {}".format(floor_height))
207 |
208 | goal_idx = np.random.choice(possible_cats)
209 |
210 | for key, value in coco_categories.items():
211 | if value == goal_idx:
212 | goal_name = key
213 |
214 | selem = skimage.morphology.disk(2)
215 | traversible = skimage.morphology.binary_dilation(
216 | sem_map[0], selem) != True
217 | traversible = 1 - traversible
218 |
219 | planner = FMMPlanner(traversible)
220 |
221 | selem = skimage.morphology.disk(
222 | int(object_boundary * 100. / map_resolution))
223 | goal_map = skimage.morphology.binary_dilation(
224 | sem_map[goal_idx + 1], selem) != True
225 | goal_map = 1 - goal_map
226 |
227 | planner.set_multi_goal(goal_map)
228 |
229 | m1 = sem_map[0] > 0
230 | m2 = planner.fmm_dist > (args.min_d - object_boundary) * 20.0
231 | m3 = planner.fmm_dist < (args.max_d - object_boundary) * 20.0
232 |
233 | possible_starting_locs = np.logical_and(m1, m2)
234 | possible_starting_locs = np.logical_and(
235 | possible_starting_locs, m3) * 1.
236 | if possible_starting_locs.sum() != 0:
237 | loc_found = True
238 | else:
239 | print("Invalid object: {} / {} / {}".format(
240 | scene_name, floor_height, goal_name))
241 | possible_cats.remove(goal_idx)
242 | scene_info[floor_idx]["sem_map"][goal_idx + 1, :, :] = 0.
243 | self.dataset_info[scene_name][floor_idx][
244 | "sem_map"][goal_idx + 1, :, :] = 0.
245 |
246 | loc_found = False
247 | while not loc_found:
248 | pos = self._env.sim.sample_navigable_point()
249 | x = -pos[2]
250 | y = -pos[0]
251 | min_x, min_y = map_obj_origin / 100.0
252 | map_loc = int((-y - min_y) * 20.), int((-x - min_x) * 20.)
253 | if abs(pos[1] - floor_height) < args.floor_thr / 100.0 and \
254 | possible_starting_locs[map_loc[0], map_loc[1]] == 1:
255 | loc_found = True
256 |
257 | agent_state = self._env.sim.get_agent_state(0)
258 | rotation = agent_state.rotation
259 | rvec = quaternion.as_rotation_vector(rotation)
260 | rvec[1] = np.random.rand() * 2 * np.pi
261 | rot = quaternion.from_rotation_vector(rvec)
262 |
263 | self.gt_planner = planner
264 | self.starting_loc = map_loc
265 | self.object_boundary = object_boundary
266 | self.goal_idx = goal_idx
267 | self.goal_name = goal_name
268 | self.map_obj_origin = map_obj_origin
269 |
270 | self.starting_distance = self.gt_planner.fmm_dist[self.starting_loc] \
271 | / 20.0 + self.object_boundary
272 | self.prev_distance = self.starting_distance
273 |
274 | self._env.sim.set_agent_state(pos, rot)
275 |
276 | # The following two should match approximately
277 | # print(starting_loc)
278 | # print(self.sim_continuous_to_sim_map(self.get_sim_location()))
279 |
280 | obs = self._env.sim.get_observations_at(pos, rot)
281 |
282 | return obs
283 |
284 | def sim_map_to_sim_continuous(self, coords):
285 | """Converts ground-truth 2D Map coordinates to absolute Habitat
286 | simulator position and rotation.
287 | """
288 | agent_state = self._env.sim.get_agent_state(0)
289 | y, x = coords
290 | min_x, min_y = self.map_obj_origin / 100.0
291 |
292 | cont_x = x / 20. + min_x
293 | cont_y = y / 20. + min_y
294 | agent_state.position[0] = cont_y
295 | agent_state.position[2] = cont_x
296 |
297 | rotation = agent_state.rotation
298 | rvec = quaternion.as_rotation_vector(rotation)
299 |
300 | if self.args.train_single_eps:
301 | rvec[1] = 0.0
302 | else:
303 | rvec[1] = np.random.rand() * 2 * np.pi
304 | rot = quaternion.from_rotation_vector(rvec)
305 |
306 | return agent_state.position, rot
307 |
308 | def sim_continuous_to_sim_map(self, sim_loc):
309 | """Converts absolute Habitat simulator pose to ground-truth 2D Map
310 | coordinates.
311 | """
312 | x, y, o = sim_loc
313 | min_x, min_y = self.map_obj_origin / 100.0
314 | x, y = int((-x - min_x) * 20.), int((-y - min_y) * 20.)
315 |
316 | o = np.rad2deg(o) + 180.0
317 | return y, x, o
318 |
319 | def reset(self):
320 | """Resets the environment to a new episode.
321 |
322 | Returns:
323 | obs (ndarray): RGBD observations (4 x H x W)
324 | info (dict): contains timestep, pose, goal category and
325 | evaluation metric info
326 | """
327 | args = self.args
328 | new_scene = self.episode_no % args.num_train_episodes == 0
329 |
330 | self.scene = self._env.sim.semantic_annotations()
331 |
332 | self.episode_no += 1
333 |
334 | # Initializations
335 | self.timestep = 0
336 | self.stopped = False
337 | self.path_length = 1e-5
338 | self.trajectory_states = []
339 |
340 | if new_scene:
341 | obs = super().reset()
342 | self.scene_name = self.habitat_env.sim.curr_scene_name
343 | print("Changing scene: {}/{}".format(self.rank, self.scene_name))
344 |
345 | self.scene_path = self.habitat_env.sim.curr_scene_name
346 |
347 | if self.split == "val":
348 | obs = self.load_new_episode()
349 | else:
350 | obs = self.generate_new_episode()
351 |
352 | # print("obs: ,", obs)
353 | # print("obs shape: ,", obs.shape)
354 | rgb = obs['rgb'].astype(np.uint8)
355 | depth = obs['depth']
356 | semantic = self._preprocess_semantic(obs["semantic"])
357 | # print("rgb shape: ,", rgb.shape)
358 | # print("depth shape: ,", depth.shape)
359 | # print("semantic shape: ,", semantic.shape)
360 |
361 | state = np.concatenate((rgb, depth, semantic), axis=2).transpose(2, 0, 1)
362 | self.last_sim_location = self.get_sim_location()
363 |
364 | # Set info
365 | self.info['time'] = self.timestep
366 | self.info['sensor_pose'] = [0., 0., 0.]
367 | self.info['goal_cat_id'] = self.goal_idx
368 | self.info['goal_name'] = hm3d_category[self.goal_idx]
369 |
370 | return state, self.info
371 |
372 | def step(self, action):
373 | """Function to take an action in the environment.
374 |
375 | Args:
376 | action (dict):
377 | dict with following keys:
378 | 'action' (int): 0: stop, 1: forward, 2: left, 3: right
379 |
380 | Returns:
381 | obs (ndarray): RGBD observations (4 x H x W)
382 | reward (float): amount of reward returned after previous action
383 | done (bool): whether the episode has ended
384 | info (dict): contains timestep, pose, goal category and
385 | evaluation metric info
386 | """
387 | action = action["action"]
388 | if action == 0:
389 | self.stopped = True
390 | # Not sending stop to simulator, resetting manually
391 | action = 3
392 |
393 | obs, rew, done, _ = super().step(action)
394 |
395 | # Get pose change
396 | dx, dy, do = self.get_pose_change()
397 | self.info['sensor_pose'] = [dx, dy, do]
398 | self.path_length += pu.get_l2_distance(0, dx, 0, dy)
399 |
400 | spl, success, dist = 0., 0., 0.
401 | if done:
402 | spl, success, dist = self.get_metrics()
403 | self.info['distance_to_goal'] = dist
404 | self.info['spl'] = spl
405 | self.info['success'] = success
406 |
407 | rgb = obs['rgb'].astype(np.uint8)
408 | depth = obs['depth']
409 | semantic = self._preprocess_semantic(obs["semantic"])
410 | state = np.concatenate((rgb, depth, semantic), axis=2).transpose(2, 0, 1)
411 |
412 | self.timestep += 1
413 | self.info['time'] = self.timestep
414 |
415 | return state, rew, done, self.info
416 |
417 | def _preprocess_semantic(self, semantic):
418 | # print("*********semantic type: ", type(semantic))
419 | se = list(set(semantic.ravel()))
420 | # print(se) # []
421 | for i in range(len(se)):
422 | if self.scene.objects[se[i]] is not None and self.scene.objects[se[i]].category.name() in coco_categories:
423 | # print(self.scene.objects[se[i]].id)
424 | # print(self.scene.objects[se[i]].category.index())
425 | # print(type(self.scene.objects[se[i]].category.index()) ) # int
426 | semantic[semantic==se[i]] = coco_categories[self.scene.objects[se[i]].category.name()]+1
427 | # print(self.scene.objects[se[i]+1].category.name())
428 | else:
429 | semantic[
430 | semantic==se[i]
431 | ] = 0
432 |
433 | # se = list(set(semantic.ravel()))
434 | # print("semantic: ", se) # []
435 | # semantic = np.expand_dims(semantic.astype(np.uint8), 2)
436 | return semantic
437 |
438 | def get_reward_range(self):
439 | """This function is not used; Habitat-RLEnv requires it."""
440 | return (0., 1.0)
441 |
442 | def get_reward(self, observations):
443 | curr_loc = self.sim_continuous_to_sim_map(self.get_sim_location())
444 | self.curr_distance = self.gt_planner.fmm_dist[curr_loc[0],
445 | curr_loc[1]] / 20.0
446 |
447 | reward = (self.prev_distance - self.curr_distance) * \
448 | self.args.reward_coeff
449 |
450 | self.prev_distance = self.curr_distance
451 | return reward
452 |
453 | def get_llm_distance(self, target_point_map, frontier_loc_g):
454 | """Return 1 if the selected frontier is at least as close to the goal (in ground-truth FMM distance) as every candidate target point, else 0; used as the long-term goal reward (info['g_reward'])."""
455 | tpm = len(list(set(target_point_map.ravel()))) -1
456 | frontier_dis_g = 0
457 | if tpm > 1:
458 | curr_loc = self.sim_continuous_to_sim_map(self.get_sim_location())
459 | frontier_loc_g[0][0] = frontier_loc_g[0][0] - target_point_map.shape[0]/2 + curr_loc[0]
460 | frontier_loc_g[1][0] = frontier_loc_g[1][0] - target_point_map.shape[1]/2 + curr_loc[1]
461 | (aa, bb) = self.gt_planner.traversible.shape
462 | if frontier_loc_g[0][0] > aa-1:
463 | frontier_loc_g[0][0] = aa-1
464 | if frontier_loc_g[1][0] > bb-1:
465 | frontier_loc_g[1][0] = bb -1
466 | frontier_dis_g = self.gt_planner.fmm_dist[frontier_loc_g[0],
467 | frontier_loc_g[1]] / 20.0
468 | for lay in range(tpm):
469 | frontier_loc = np.where(target_point_map == lay+1)
470 | curr_loc = self.sim_continuous_to_sim_map(self.get_sim_location())
471 | frontier_loc[0][0] = frontier_loc[0][0] - target_point_map.shape[0]/2 + curr_loc[0]
472 | frontier_loc[1][0] = frontier_loc[1][0] - target_point_map.shape[1]/2 + curr_loc[1]
473 | (aa, bb) = self.gt_planner.traversible.shape
474 | if frontier_loc[0][0] > aa-1:
475 | frontier_loc[0][0] = aa-1
476 | if frontier_loc[1][0] > bb-1:
477 | frontier_loc[1][0] = bb -1
478 | frontier_distance = self.gt_planner.fmm_dist[frontier_loc[0],
479 | frontier_loc[1]] / 20.0
480 | if frontier_distance < frontier_dis_g:
481 | return 0
482 | return 1
483 | else:
484 | return 0
485 |
486 |
487 | def get_metrics(self):
488 | """This function computes evaluation metrics for the Object Goal task
489 |
490 | Returns:
491 | spl (float): Success weighted by Path Length
492 | (See https://arxiv.org/pdf/1807.06757.pdf)
493 | success (int): 0: Failure, 1: Successful
494 | dist (float): Distance to Success (DTS), distance of the agent
495 | from the success threshold boundary in meters.
496 | (See https://arxiv.org/pdf/2007.00643.pdf)
497 | """
498 | curr_loc = self.sim_continuous_to_sim_map(self.get_sim_location())
499 | dist = self.gt_planner.fmm_dist[curr_loc[0], curr_loc[1]] / 20.0
500 | if dist == 0.0:
501 | success = 1
502 | else:
503 | success = 0
504 | spl = min(success * self.starting_distance / self.path_length, 1)
505 | return spl, success, dist
506 |
507 | def get_done(self, observations):
508 | if self.info['time'] >= self.args.max_episode_length - 1:
509 | done = True
510 | elif self.stopped:
511 | done = True
512 | else:
513 | done = False
514 | return done
515 |
516 | def get_info(self, observations):
517 | """This function is not used; Habitat-RLEnv requires it."""
518 | info = {}
519 | return info
520 |
521 | def get_spaces(self):
522 | """Returns observation and action spaces for the ObjectGoal task."""
523 | return self.observation_space, self.action_space
524 |
525 | def get_sim_location(self):
526 | """Returns x, y, o pose of the agent in the Habitat simulator."""
527 |
528 | agent_state = super().habitat_env.sim.get_agent_state(0)
529 | x = -agent_state.position[2]
530 | y = -agent_state.position[0]
531 | axis = quaternion.as_euler_angles(agent_state.rotation)[0]
532 | if (axis % (2 * np.pi)) < 0.1 or (axis %
533 | (2 * np.pi)) > 2 * np.pi - 0.1:
534 | o = quaternion.as_euler_angles(agent_state.rotation)[1]
535 | else:
536 | o = 2 * np.pi - quaternion.as_euler_angles(agent_state.rotation)[1]
537 | if o > np.pi:
538 | o -= 2 * np.pi
539 | return x, y, o
540 |
541 | def get_pose_change(self):
542 | """Returns dx, dy, do pose change of the agent relative to the last
543 | timestep."""
544 | curr_sim_pose = self.get_sim_location()
545 | dx, dy, do = pu.get_rel_pose_change(
546 | curr_sim_pose, self.last_sim_location)
547 | self.last_sim_location = curr_sim_pose
548 | return dx, dy, do
549 |
--------------------------------------------------------------------------------
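For reference, the SPL and DTS values returned by get_metrics() above reduce to a few lines of arithmetic. The stand-alone sketch below only mirrors that computation: geodesic_dist_to_goal stands in for gt_planner.fmm_dist[...] / 20.0, shortest_dist for self.starting_distance, and path_length for self.path_length.

def episode_metrics(geodesic_dist_to_goal, shortest_dist, path_length):
    # Success: the agent stopped with zero remaining FMM distance to the goal region.
    success = 1 if geodesic_dist_to_goal == 0.0 else 0
    # SPL: success weighted by the shortest-path / actual-path ratio, clipped to 1
    # exactly as in get_metrics().
    spl = min(success * shortest_dist / path_length, 1)
    # DTS: remaining distance (meters) from the agent to the success threshold boundary.
    dts = geodesic_dist_to_goal
    return spl, success, dts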
/agents/sem_exp.py:
--------------------------------------------------------------------------------
1 | import math
2 | import os
3 | import sys
4 |
5 | import cv2
6 | import numpy as np
7 | import skimage.morphology
8 | from PIL import Image
9 | from torchvision import transforms
10 | import time
11 |
12 | from envs.utils.fmm_planner import FMMPlanner
13 | from envs.habitat.objectgoal_env import ObjectGoal_Env
14 | from envs.habitat.objectgoal_env21 import ObjectGoal_Env21
15 | from agents.utils.semantic_prediction import SemanticPredMaskRCNN
16 | from constants import color_palette
17 | import envs.utils.pose as pu
18 | import agents.utils.visualization as vu
19 |
20 | from RedNet.RedNet_model import load_rednet
21 | from constants import mp_categories_mapping
22 | import torch
23 | from get_point_score import *
24 |
25 | class Sem_Exp_Env_Agent(ObjectGoal_Env21):
26 | """The Sem_Exp environment agent class. A separate Sem_Exp_Env_Agent class
27 | object is used for each environment thread.
28 |
29 | """
30 |
31 | def __init__(self, args, rank, config_env, dataset):
32 |
33 | self.args = args
34 | super().__init__(args, rank, config_env, dataset)
35 |
36 | # initialize transform for RGB observations
37 | self.res = transforms.Compose(
38 | [transforms.ToPILImage(),
39 | transforms.Resize((args.frame_height, args.frame_width),
40 | interpolation=Image.NEAREST)])
41 |
42 | # initialize semantic segmentation prediction model
43 | if args.sem_gpu_id == -1:
44 | args.sem_gpu_id = config_env.SIMULATOR.HABITAT_SIM_V0.GPU_DEVICE_ID
45 |
46 | self.device = args.device
47 | self.sem_pred = SemanticPredMaskRCNN(args)
48 | self.red_sem_pred = load_rednet(
49 | self.device, ckpt='RedNet/model/rednet_semmap_mp3d_40.pth', resize=True, # resize since the model was trained on half-resolution inputs
50 | )
51 | self.red_sem_pred.eval()
52 | # self.red_sem_pred.to(self.device)
53 |
54 |
55 | # initializations for planning:
56 | self.selem = skimage.morphology.disk(3)
57 |
58 | self.obs = None
59 | self.obs_shape = None
60 | self.collision_map = None
61 | self.visited = None
62 | self.visited_vis = None
63 | self.col_width = None
64 | self.curr_loc = None
65 | self.last_loc = None
66 | self.last_action = None
67 | self.count_forward_actions = None
68 |
69 | self.replan_count = 0
70 | self.collision_n = 0
71 | self.kernel = cv2.getStructuringElement(cv2.MORPH_RECT,(3, 3))
72 |
73 | if args.visualize or args.print_images:
74 | self.legend = cv2.imread('docs/legend.png')
75 | self.vis_image = None
76 | self.rgb_vis = None
77 |
78 | self.fail_case = {}
79 | self.fail_case['collision'] = 0
80 | self.fail_case['success'] = 0
81 | self.fail_case['detection'] = 0
82 | self.fail_case['exploration'] = 0
83 |
84 | self.eve_angle = 0
85 |
86 | def reset(self):
87 | args = self.args
88 |
89 | self.replan_count = 0
90 | self.collision_n = 0
91 |
92 | obs, info = super().reset()
93 | store_obs=obs.transpose(1, 2, 0)
94 | # print(obs.shape)
95 | # print(store_obs.shape)
96 | # print("!!!")
97 | # np.save("vis/3d_score/sample_T.npy",store_obs)
98 | obs = self._preprocess_obs(obs)
99 |
100 | self.obs_shape = obs.shape
101 |
102 | # Episode initializations
103 | map_shape = (args.map_size_cm // args.map_resolution,
104 | args.map_size_cm // args.map_resolution)
105 | self.collision_map = np.zeros(map_shape)
106 | self.visited = np.zeros(map_shape)
107 | self.visited_vis = np.zeros(map_shape)
108 | self.col_width = 1
109 | self.count_forward_actions = 0
110 | self.curr_loc = [args.map_size_cm / 100.0 / 2.0,
111 | args.map_size_cm / 100.0 / 2.0, 0.]
112 | self.last_action = None
113 |
114 | self.eve_angle = 0
115 | self.eve_angle_old = 0
116 |
117 | info['eve_angle'] = self.eve_angle
118 |
119 |
120 | if args.visualize or args.print_images:
121 | self.vis_image = vu.init_vis_image(self.goal_name, self.legend)
122 |
123 | # print(obs.shape)
124 | return obs, info
125 |
126 | def plan_act_and_preprocess(self, planner_inputs):
127 | """Function responsible for planning, taking the action and
128 | preprocessing observations
129 |
130 | Args:
131 | planner_inputs (dict):
132 | dict with following keys:
133 | 'map_pred' (ndarray): (M, M) map prediction
134 | 'goal' (ndarray): (M, M) mat denoting goal locations
135 | 'pose_pred' (ndarray): (7,) array denoting pose (x,y,o)
136 | and planning window (gx1, gx2, gy1, gy2)
137 | 'found_goal' (bool): whether the goal object is found
138 |
139 | Returns:
140 | obs (ndarray): preprocessed observations ((4+C) x H x W)
141 | reward (float): amount of reward returned after previous action
142 | done (bool): whether the episode has ended
143 | info (dict): contains timestep, pose, goal category and
144 | evaluation metric info
145 | """
146 | # s_time = time.time()
147 |
148 | # plan
149 | if planner_inputs["wait"]:
150 | self.last_action = None
151 | self.info["sensor_pose"] = [0., 0., 0.]
152 | return np.zeros(self.obs.shape), self.fail_case, False, self.info
153 |
154 | # Reset reward if new long-term goal
155 | if planner_inputs["new_goal"]:
156 | goal = planner_inputs['goal']
157 | if np.sum(goal == 1) == 1 and self.args.task_config == "tasks/objectnav_gibson.yaml":
158 | frontier_loc = np.where(goal == 1)
159 | self.info["g_reward"] = self.get_llm_distance(planner_inputs["map_target"], frontier_loc)
160 |
161 | # self.collision_map = np.zeros(self.visited.shape)
162 | self.info['clear_flag'] = 0
163 |
164 | action = self._plan(planner_inputs)
165 |
166 | # c_time = time.time()
167 | # ss_time = c_time - s_time
168 | # print('plan map: %.3f s' % ss_time) # ~0.19 s
169 |
170 | if self.collision_n > 20 or self.replan_count > 26:
171 | self.info['clear_flag'] = 1
172 | self.collision_n = 0
173 |
174 | if self.args.visualize or self.args.print_images:
175 |
176 | self._visualize(planner_inputs)
177 |
178 | if action >= 0:
179 |
180 | # act
181 | action = {'action': action}
182 | obs, rew, done, info = super().step(action)
183 |
184 | if done and self.info['success'] == 0:
185 | if self.info['time'] >= self.args.max_episode_length - 1:
186 | self.fail_case['exploration'] += 1
187 | elif self.replan_count > 26:
188 | self.fail_case['collision'] += 1
189 | else:
190 | self.fail_case['detection'] += 1
191 |
192 | if done and self.info['success'] == 1:
193 | self.fail_case['success'] += 1
194 |
195 | # preprocess obs
196 | obs = self._preprocess_obs(obs)
197 | self.last_action = action['action']
198 | self.obs = obs
199 |
200 | info['last_action'] = action['action'] # Updated to get action info as well
201 | self.info = info
202 | info['eve_angle'] = self.eve_angle
203 |
204 |
205 | # e_time = time.time()
206 | # ss_time = e_time - c_time
207 | # print('act map: %.3f s' % ss_time) # ~0.23 s
208 |
209 | # info['g_reward'] += rew
210 |
211 | return obs, self.fail_case, done, info
212 |
213 | else:
214 | self.last_action = None
215 | self.info["sensor_pose"] = [0., 0., 0.]
216 | info['last_action'] = None # Updated to get action info as well
217 |
218 | return np.zeros(self.obs_shape), self.fail_case, False, self.info
219 |
220 | def _plan(self, planner_inputs):
221 | """Function responsible for planning
222 |
223 | Args:
224 | planner_inputs (dict):
225 | dict with following keys:
226 | 'map_pred' (ndarray): (M, M) map prediction
227 | 'goal' (ndarray): (M, M) goal locations
228 | 'pose_pred' (ndarray): (7,) array denoting pose (x,y,o)
229 | and planning window (gx1, gx2, gy1, gy2)
230 | 'found_goal' (bool): whether the goal object is found
231 |
232 | Returns:
233 | action (int): action id
234 | """
235 | args = self.args
236 |
237 | self.last_loc = self.curr_loc
238 |
239 | # Get Map prediction
240 | map_pred = np.rint(planner_inputs['map_pred'])
241 | exp_pred = np.rint(planner_inputs['exp_pred'])
242 | goal = planner_inputs['goal']
243 |
244 | # Get pose prediction and global policy planning window
245 | start_x, start_y, start_o, gx1, gx2, gy1, gy2 = \
246 | planner_inputs['pose_pred']
247 | gx1, gx2, gy1, gy2 = int(gx1), int(gx2), int(gy1), int(gy2)
248 | planning_window = [gx1, gx2, gy1, gy2]
249 |
250 | # Get curr loc
251 | self.curr_loc = [start_x, start_y, start_o]
252 | r, c = start_y, start_x
253 | start = [int(r * 100.0 / args.map_resolution - gx1),
254 | int(c * 100.0 / args.map_resolution - gy1)]
255 | start = pu.threshold_poses(start, map_pred.shape)
256 |
257 | self.visited[gx1:gx2, gy1:gy2][start[0] - 0:start[0] + 1,
258 | start[1] - 0:start[1] + 1] = 1
259 |
260 | # if args.visualize or args.print_images:
261 | # Get last loc
262 | last_start_x, last_start_y = self.last_loc[0], self.last_loc[1]
263 | r, c = last_start_y, last_start_x
264 | last_start = [int(r * 100.0 / args.map_resolution - gx1),
265 | int(c * 100.0 / args.map_resolution - gy1)]
266 | last_start = pu.threshold_poses(last_start, map_pred.shape)
267 | self.visited_vis[gx1:gx2, gy1:gy2] = \
268 | vu.draw_line(last_start, start,
269 | self.visited_vis[gx1:gx2, gy1:gy2])
270 |
271 | # Collision check
272 | if self.last_action == 1 and not planner_inputs["new_goal"]:
273 | x1, y1, t1 = self.last_loc
274 | x2, y2, _ = self.curr_loc
275 | buf = 4
276 | length = 2
277 |
278 | if abs(x1 - x2) < 0.05 and abs(y1 - y2) < 0.05:
279 | self.col_width += 2
280 | if self.col_width == 7:
281 | length = 4
282 | buf = 3
283 | self.col_width = min(self.col_width, 5)
284 | else:
285 | self.col_width = 1
286 |
287 | dist = pu.get_l2_distance(x1, x2, y1, y2)
288 | if dist < args.collision_threshold: # Collision
289 | self.collision_n += 1
290 | width = self.col_width
291 | for i in range(length):
292 | for j in range(width):
293 | wx = x1 + 0.05 * \
294 | ((i + buf) * np.cos(np.deg2rad(t1))
295 | + (j - width // 2) * np.sin(np.deg2rad(t1)))
296 | wy = y1 + 0.05 * \
297 | ((i + buf) * np.sin(np.deg2rad(t1))
298 | - (j - width // 2) * np.cos(np.deg2rad(t1)))
299 | r, c = wy, wx
300 | r, c = int(r * 100 / args.map_resolution), \
301 | int(c * 100 / args.map_resolution)
302 | [r, c] = pu.threshold_poses([r, c],
303 | self.collision_map.shape)
304 | self.collision_map[r, c] = 1
305 |
306 | stg, replan, stop = self._get_stg(map_pred, start, np.copy(goal),
307 | planning_window)
308 |
309 | if replan:
310 | self.replan_count += 1
311 | print("replan: ", self.replan_count)
312 | else:
313 | self.replan_count = 0
314 |
315 | # Deterministic Local Policy
316 | if (stop and planner_inputs['found_goal'] == 1) or self.replan_count > 26:
317 | action = 0 # Stop
318 | else:
319 | (stg_x, stg_y) = stg
320 | angle_st_goal = math.degrees(math.atan2(stg_x - start[0],
321 | stg_y - start[1]))
322 | angle_agent = (start_o) % 360.0
323 | if angle_agent > 180:
324 | angle_agent -= 360
325 |
326 | relative_angle = (angle_agent - angle_st_goal) % 360.0
327 | if relative_angle > 180:
328 | relative_angle -= 360
329 |
330 | ## adjust the camera elevation angle
331 | eve_start_x = int(5 * math.sin(np.deg2rad(angle_st_goal)) + start[0])
332 | eve_start_y = int(5 * math.cos(np.deg2rad(angle_st_goal)) + start[1])
333 | if eve_start_x > map_pred.shape[0] - 1: eve_start_x = map_pred.shape[0] - 1
334 | if eve_start_y > map_pred.shape[1] - 1: eve_start_y = map_pred.shape[1] - 1
335 | if eve_start_x < 0: eve_start_x = 0
336 | if eve_start_y < 0: eve_start_y = 0
337 | if exp_pred[eve_start_x, eve_start_y] == 0 and self.eve_angle > -60:
338 | action = 5
339 | self.eve_angle -= 30
340 | elif exp_pred[eve_start_x, eve_start_y] == 1 and self.eve_angle < 0:
341 | action = 4
342 | self.eve_angle += 30
343 | elif relative_angle > self.args.turn_angle / 2.:
344 | action = 3 # Right
345 | elif relative_angle < -self.args.turn_angle / 2.:
346 | action = 2 # Left
347 | else:
348 | action = 1 # Forward
349 |
350 | return action
351 |
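# --- Illustrative sketch (not part of the file above): a stand-alone version of the
# deterministic local policy in _plan(), with the camera-elevation handling omitted.
# It assumes start and stg are (row, col) grid cells, start_o is the agent heading in
# degrees, and turn_angle corresponds to args.turn_angle.
import math

def local_policy(start, stg, start_o, turn_angle=30.0):
    stg_x, stg_y = stg
    # Heading from the agent cell to the short-term goal cell, in degrees.
    angle_st_goal = math.degrees(math.atan2(stg_x - start[0], stg_y - start[1]))
    angle_agent = start_o % 360.0
    if angle_agent > 180:
        angle_agent -= 360
    # Signed difference between agent heading and goal heading, wrapped to (-180, 180].
    relative_angle = (angle_agent - angle_st_goal) % 360.0
    if relative_angle > 180:
        relative_angle -= 360
    if relative_angle > turn_angle / 2.0:
        return 3  # turn right
    if relative_angle < -turn_angle / 2.0:
        return 2  # turn left
    return 1      # move forward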
352 | def _get_stg(self, grid, start, goal, planning_window):
353 | """Get short-term goal"""
354 |
355 | [gx1, gx2, gy1, gy2] = planning_window
356 |
357 | x1, y1 = 0, 0
358 | x2, y2 = grid.shape
359 |
360 | def add_boundary(mat, value=1):
361 | h, w = mat.shape
362 | new_mat = np.zeros((h + 2, w + 2)) + value
363 | new_mat[1:h + 1, 1:w + 1] = mat
364 | return new_mat
365 |
366 | traversible = skimage.morphology.binary_dilation(
367 | grid[x1:x2, y1:y2],
368 | self.selem) != True
369 | traversible[self.collision_map[gx1:gx2, gy1:gy2]
370 | [x1:x2, y1:y2] == 1] = 0
371 | traversible[cv2.dilate(self.visited_vis[gx1:gx2, gy1:gy2][x1:x2, y1:y2], self.kernel) == 1] = 1
372 |
373 | traversible[int(start[0] - x1) - 1:int(start[0] - x1) + 2,
374 | int(start[1] - y1) - 1:int(start[1] - y1) + 2] = 1
375 |
376 | traversible = add_boundary(traversible)
377 | goal = add_boundary(goal, value=0)
378 |
379 | planner = FMMPlanner(traversible)
380 | selem = skimage.morphology.disk(10)
381 | goal = skimage.morphology.binary_dilation(
382 | goal, selem) != True
383 | goal = 1 - goal * 1.
384 | planner.set_multi_goal(goal)
385 |
386 | state = [start[0] - x1 + 1, start[1] - y1 + 1]
387 | stg_x, stg_y, replan, stop = planner.get_short_term_goal(state)
388 |
389 | stg_x, stg_y = stg_x + x1 - 1, stg_y + y1 - 1
390 |
391 | return (stg_x, stg_y), replan, stop
392 |
393 | def _preprocess_obs(self, obs, use_seg=True):
394 | args = self.args
395 | # print("obs: ", obs)
396 |
397 | obs = obs.transpose(1, 2, 0)
398 | rgb = obs[:, :, :3]
399 | depth = obs[:, :, 3:4]
400 | semantic = obs[:,:,4:5].squeeze()
401 | # print("obs: ", semantic.shape)
402 | if args.use_gtsem:
403 | self.rgb_vis = rgb
404 | sem_seg_pred = np.zeros((rgb.shape[0], rgb.shape[1], 15 + 1))
405 | for i in range(16):
406 | sem_seg_pred[:,:,i][semantic == i+1] = 1
407 | else:
408 | # print("sem_exp_preprocess_semantic")
409 | red_semantic_pred, semantic_pred = self._get_sem_pred(
410 | rgb.astype(np.uint8), depth, use_seg=use_seg)
411 | # print(red_semantic_pred.shape)
412 | # print(semantic_pred.shape)
413 | # print(mp_categories_mapping)
414 | # sys.exit(0)
415 | sem_seg_pred = np.zeros((rgb.shape[0], rgb.shape[1], 15 + 1))
416 | for i in range(0, 15):
417 | # print(mp_categories_mapping[i])
418 | sem_seg_pred[:,:,i][red_semantic_pred == mp_categories_mapping[i]] = 1
419 |
420 | sem_seg_pred[:,:,0][semantic_pred[:,:,0] == 0] = 0
421 | sem_seg_pred[:,:,1][semantic_pred[:,:,1] == 0] = 0
422 | sem_seg_pred[:,:,3][semantic_pred[:,:,3] == 0] = 0
423 | sem_seg_pred[:,:,4][semantic_pred[:,:,4] == 1] = 1
424 | sem_seg_pred[:,:,5][semantic_pred[:,:,5] == 1] = 1
425 |
426 |
427 | # sem_seg_pred = self._get_sem_pred(
428 | # rgb.astype(np.uint8), depth, use_seg=use_seg)
429 |
430 | depth = self._preprocess_depth(depth, args.min_depth, args.max_depth)
431 |
432 | ds = args.env_frame_width // args.frame_width # Downscaling factor
433 |
434 | if ds != 1:
435 | rgb = np.asarray(self.res(rgb.astype(np.uint8)))
436 | depth = depth[ds // 2::ds, ds // 2::ds]
437 | sem_seg_pred = sem_seg_pred[ds // 2::ds, ds // 2::ds]
438 |
439 | depth = np.expand_dims(depth, axis=2)
440 | state = np.concatenate((rgb, depth, sem_seg_pred),
441 | axis=2).transpose(2, 0, 1)
442 |
443 | return state
444 |
445 | def _preprocess_depth(self, depth, min_d, max_d):
446 | depth = depth[:, :, 0] * 1
447 |
448 | for i in range(depth.shape[1]):
449 | depth[:, i][depth[:, i] == 0.] = depth[:, i].max()
450 |
451 | mask2 = depth > 0.99
452 | depth[mask2] = 0.
453 |
454 | mask1 = depth == 0
455 | depth[mask1] = 100.0
456 | # depth = min_d * 100.0 + depth * max_d * 100.0
457 | depth = min_d * 100.0 + depth * (max_d-min_d) * 100.0
458 | # depth = depth*1000.
459 |
460 | return depth
461 |
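# Worked example (illustrative values, not from the original file): _preprocess_depth()
# maps a normalized depth reading in (0, 1] to centimetres via
#   min_d * 100 + depth * (max_d - min_d) * 100.
# Assuming min_depth = 0.5 m and max_depth = 5.0 m, a reading of 0.5 gives
#   0.5 * 100 + 0.5 * (5.0 - 0.5) * 100 = 275.0 cm,
# while pixels flagged as invalid above (set to 100.0) map far beyond max_depth.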
462 | def _get_sem_pred(self, rgb, depth, use_seg=True):
463 | if use_seg:
464 | image = torch.from_numpy(rgb).to(self.device).unsqueeze_(0).float()
465 | depth = torch.from_numpy(depth).to(self.device).unsqueeze_(0).float()
466 | with torch.no_grad():
467 | red_semantic_pred = self.red_sem_pred(image, depth)
468 | red_semantic_pred = red_semantic_pred.squeeze().cpu().detach().numpy()
469 | semantic_pred, self.rgb_vis = self.sem_pred.get_prediction(rgb)
470 | semantic_pred = semantic_pred.astype(np.float32)
471 | else:
472 | semantic_pred = np.zeros((rgb.shape[0], rgb.shape[1], 16))
473 | self.rgb_vis = rgb[:, :, ::-1]
474 | return red_semantic_pred, semantic_pred
475 |
476 | def _visualize(self, inputs):
477 | args = self.args
478 | dump_dir = "{}/dump/{}/".format(args.dump_location,
479 | args.exp_name)
480 | ep_dir = '{}/episodes/thread_{}/eps_{}/'.format(
481 | dump_dir, self.rank, self.episode_no)
482 | score_dir = '{}/episodes/thread_{}/score_{}/'.format(
483 | dump_dir, self.rank, self.episode_no)
484 |
485 | if not os.path.exists(ep_dir):
486 | os.makedirs(ep_dir)
487 | if not os.path.exists(score_dir):
488 | os.makedirs(score_dir)
489 |
490 | local_w = inputs['map_pred'].shape[0]
491 |
492 | map_pred = inputs['map_pred']
493 | exp_pred = inputs['exp_pred']
494 | map_edge = inputs['map_edge']
495 | start_x, start_y, start_o, gx1, gx2, gy1, gy2 = inputs['pose_pred']
496 |
497 | goal = inputs['goal']
498 | sem_map = inputs['sem_map_pred']
499 |
500 | gx1, gx2, gy1, gy2 = int(gx1), int(gx2), int(gy1), int(gy2)
501 |
502 | sem_map += 5
503 |
504 | no_cat_mask = sem_map == 20
505 | map_mask = np.rint(map_pred) == 1
506 | exp_mask = np.rint(exp_pred) == 1
507 | vis_mask = self.visited_vis[gx1:gx2, gy1:gy2] == 1
508 | edge_mask = map_edge == 1
509 |
510 | sem_map[no_cat_mask] = 0
511 | m1 = np.logical_and(no_cat_mask, exp_mask)
512 | sem_map[m1] = 2
513 |
514 | m2 = np.logical_and(no_cat_mask, map_mask)
515 | sem_map[m2] = 1
516 |
517 | sem_map[vis_mask] = 3
518 | sem_map[edge_mask] = 3
519 |
520 | selem = skimage.morphology.disk(4)
521 | goal_mat = 1 - skimage.morphology.binary_dilation(
522 | goal, selem) != True
523 |
524 | goal_mask = goal_mat == 1
525 | sem_map[goal_mask] = 4
526 | if np.sum(goal) == 1:
527 | f_pos = np.argwhere(goal == 1)
528 | # fmb = get_frontier_boundaries((f_pos[0][0], f_pos[0][1]))
529 | # goal_fmb = skimage.draw.circle_perimeter(int((fmb[0]+fmb[1])/2), int((fmb[2]+fmb[3])/2), 23)
530 | goal_fmb = skimage.draw.circle_perimeter(f_pos[0][0], f_pos[0][1], int(local_w/4-2))
531 | goal_fmb[0][goal_fmb[0] > local_w-1] = local_w-1
532 | goal_fmb[1][goal_fmb[1] > local_w-1] = local_w-1
533 | goal_fmb[0][goal_fmb[0] < 0] = 0
534 | goal_fmb[1][goal_fmb[1] < 0] = 0
535 | # goal_fmb[goal_fmb < 0] =0
536 | goal_mask[goal_fmb[0], goal_fmb[1]] = 1
537 | sem_map[goal_mask] = 4
538 |
539 |
540 | color_pal = [int(x * 255.) for x in color_palette]
541 | sem_map_vis = Image.new("P", (sem_map.shape[1],
542 | sem_map.shape[0]))
543 | sem_map_vis.putpalette(color_pal)
544 | sem_map_vis.putdata(sem_map.flatten().astype(np.uint8))
545 | sem_map_vis = sem_map_vis.convert("RGB")
546 | sem_map_vis = np.flipud(sem_map_vis)
547 |
548 | sem_map_vis = sem_map_vis[:, :, [2, 1, 0]]
549 | sem_map_vis = cv2.resize(sem_map_vis, (480, 480),
550 | interpolation=cv2.INTER_NEAREST)
551 | self.vis_image[50:530, 15:655] = self.rgb_vis
552 | self.vis_image[50:530, 670:1150] = sem_map_vis
553 |
554 | pos = (
555 | (start_x * 100. / args.map_resolution - gy1)
556 | * 480 / map_pred.shape[0],
557 | (map_pred.shape[1] - start_y * 100. / args.map_resolution + gx1)
558 | * 480 / map_pred.shape[1],
559 | np.deg2rad(-start_o)
560 | )
561 |
562 | agent_arrow = vu.get_contour_points(pos, origin=(670, 50), size=10)
563 | color = (int(color_palette[11] * 255),
564 | int(color_palette[10] * 255),
565 | int(color_palette[9] * 255))
566 | cv2.drawContours(self.vis_image, [agent_arrow], 0, color, -1)
567 |
568 | if args.visualize:
569 | # Displaying the image
570 | cv2.imshow("Thread {}".format(self.rank), self.vis_image)
571 | cv2.waitKey(1)
572 |
573 | if args.print_images:
574 | fn = '{}/episodes/thread_{}/eps_{}/{}-{}-Vis-{}.png'.format(
575 | dump_dir, self.rank, self.episode_no,
576 | self.rank, self.episode_no, self.timestep)
577 | cv2.imwrite(fn, self.vis_image)
578 |
579 |
580 |
--------------------------------------------------------------------------------
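reset() and plan_act_and_preprocess() together form the per-thread interface of Sem_Exp_Env_Agent. The loop below is only a hypothetical sketch of how a driver could call them; make_agent() and build_planner_inputs() are placeholder names for the environment construction and the global mapping/policy code that live elsewhere in the repository.

# Hypothetical driver loop; the helper names are placeholders, not repository APIs.
agent = make_agent(args, rank, config_env, dataset)   # a Sem_Exp_Env_Agent instance
obs, info = agent.reset()
done = False
while not done:
    # planner_inputs must provide the keys documented in plan_act_and_preprocess(),
    # e.g. 'map_pred', 'exp_pred', 'goal', 'pose_pred', 'found_goal', 'new_goal', 'wait'
    # (plus 'map_target', 'map_edge', 'sem_map_pred' for reward and visualization).
    planner_inputs = build_planner_inputs(obs, info)
    obs, fail_case, done, info = agent.plan_act_and_preprocess(planner_inputs)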