├── .gitignore
├── LICENSE
├── README.md
├── benchmark
│   ├── eval.py
│   └── predict.py
├── data
│   └── scannet
│       ├── README.md
│       ├── batch_load_scannet_data.py
│       ├── load_scannet_data.py
│       ├── meta_data
│       │   ├── nyu40_labels.csv
│       │   ├── scannet_means.npz
│       │   ├── scannet_reference_means.npz
│       │   ├── scannetv2-labels.combined.tsv
│       │   ├── scannetv2.txt
│       │   ├── scannetv2_test.txt
│       │   ├── scannetv2_train.txt
│       │   └── scannetv2_val.txt
│       ├── model_util_scannet.py
│       ├── scannet_utils.py
│       └── visualize.py
├── demo
│   └── ScanRefer.gif
├── docs
│   ├── browser.png
│   ├── davezchen_eccv2020_scanrefer.pdf
│   ├── index.html
│   ├── paper.jpg
│   ├── teaser.png
│   └── w3.css
├── lib
│   ├── ap_helper.py
│   ├── config.py
│   ├── dataset.py
│   ├── enet.py
│   ├── eval_helper.py
│   ├── loss.py
│   ├── loss_helper.py
│   ├── pointnet2
│   │   ├── _ext_src
│   │   │   ├── include
│   │   │   │   ├── ball_query.h
│   │   │   │   ├── cuda_utils.h
│   │   │   │   ├── group_points.h
│   │   │   │   ├── interpolate.h
│   │   │   │   ├── sampling.h
│   │   │   │   └── utils.h
│   │   │   └── src
│   │   │       ├── ball_query.cpp
│   │   │       ├── ball_query_gpu.cu
│   │   │       ├── bindings.cpp
│   │   │       ├── group_points.cpp
│   │   │       ├── group_points_gpu.cu
│   │   │       ├── interpolate.cpp
│   │   │       ├── interpolate_gpu.cu
│   │   │       ├── sampling.cpp
│   │   │       └── sampling_gpu.cu
│   │   ├── _version.py
│   │   ├── pointnet2_modules.py
│   │   ├── pointnet2_test.py
│   │   ├── pointnet2_utils.py
│   │   ├── pytorch_utils.py
│   │   └── setup.py
│   ├── projection.py
│   └── solver.py
├── models
│   ├── backbone_module.py
│   ├── lang_module.py
│   ├── match_module.py
│   ├── proposal_module.py
│   ├── refnet.py
│   └── voting_module.py
├── requirements.txt
├── scripts
│   ├── compute_multiview_features.py
│   ├── eval.py
│   ├── project_multiview_features.py
│   ├── project_multiview_labels.py
│   ├── train.py
│   └── visualize.py
└── utils
    ├── box_util.py
    ├── eta.py
    ├── eval_det.py
    ├── metric_util.py
    ├── nms.py
    ├── nn_distance.py
    └── pc_utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # dataset
2 | data/scanrefer*
3 | data/ScanRefer*
4 | data/glove*
5 | data/scannet/scannet_data
6 | data/scannet/scans
7 | data/scannetv2_enet.pth
8 |
9 | # cache
10 | data/scannet/__pycache__
11 | lib/__pycache__
12 | lib/pointnet2/__pycache__
13 | models/__pycache__
14 | utils/__pycache__
15 | .DS_Store
16 |
17 | # pointnet2
18 | lib/pointnet2/build/
19 | lib/pointnet2/dist/
20 | lib/pointnet2/pointnet2.egg-info/
21 |
22 | # output
23 | outputs/
24 |
25 | # delete
26 | docs/.DS_Store
27 | demo/.DS_Store
28 |
29 | # misc
30 | upload/
--------------------------------------------------------------------------------
/benchmark/eval.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import json
4 | import argparse
5 |
6 | import numpy as np
7 |
8 | from tqdm import tqdm
9 |
10 | sys.path.append(os.path.join(os.getcwd())) # HACK add the root folder
11 | from lib.config import CONF
12 | from utils.box_util import box3d_iou
13 |
14 | SCANREFER_GT = json.load(open(os.path.join(CONF.PATH.DATA, "ScanRefer_filtered_test_gt_bbox.json")))
15 |
16 | def organize_gt():
17 | organized = {}
18 |
19 | for data in SCANREFER_GT:
20 | scene_id = data["scene_id"]
21 | object_id = data["object_id"]
22 | ann_id = data["ann_id"]
23 |
24 | if scene_id not in organized:
25 | organized[scene_id] = {}
26 |
27 | if object_id not in organized[scene_id]:
28 | organized[scene_id][object_id] = {}
29 |
30 | if ann_id not in organized[scene_id][object_id]:
31 | organized[scene_id][object_id][ann_id] = {}
32 |
33 | organized[scene_id][object_id][ann_id] = data
34 |
35 | return organized
36 |
37 | def evaluate(args):
38 | pred_path = os.path.join(CONF.PATH.OUTPUT, args.folder, "pred.json")
39 | if not os.path.isfile(pred_path):
40 | print("please run `benchmark/predict.py` first to generate bounding boxes")
41 | exit()
42 |
43 | organized_gt = organize_gt()
44 |
45 | with open(pred_path) as f:
46 | predictions = json.load(f)
47 | ious = []
48 | masks = []
49 | others = []
50 | print("evaluating...")
51 | for data in tqdm(predictions):
52 | scene_id = data["scene_id"]
53 | object_id = data["object_id"]
54 | ann_id = data["ann_id"]
55 | pred_bbox = np.array(data["bbox"])
56 | mask = data["unique_multiple"]
57 | other = data["others"]
58 |
59 | try:
60 | gt_bbox = np.array(organized_gt[scene_id][object_id][ann_id]["bbox"])
61 | # iou, _ = box3d_iou(pred_bbox, gt_bbox)
62 | iou = box3d_iou(pred_bbox, gt_bbox)
63 |
64 | except KeyError:
65 | iou = 0
66 |
67 | ious.append(iou)
68 | masks.append(mask)
69 | others.append(other)
70 |
71 | # ious = np.array(ious)
72 | # iou_rate_025 = ious[ious >= 0.25].shape[0] / ious.shape[0]
73 | # iou_rate_05 = ious[ious >= 0.5].shape[0] / ious.shape[0]
74 |
75 | # print("\nAcc@0.25IoU: {}".format(iou_rate_025))
76 | # print("Acc@0.5IoU: {}".format(iou_rate_05))
77 |
78 | ious = np.array(ious)
79 | masks = np.array(masks)
80 | others = np.array(others)
81 |
82 | multiple_dict = {
83 | "unique": 0,
84 | "multiple": 1
85 | }
86 | others_dict = {
87 | "not_in_others": 0,
88 | "in_others": 1
89 | }
90 |
91 | # evaluation stats
92 | stats = {k: np.sum(masks == v) for k, v in multiple_dict.items()}
93 | stats["overall"] = masks.shape[0]
94 | stats = {}
95 | for k, v in multiple_dict.items():
96 | stats[k] = {}
97 | for k_o, v_o in others_dict.items():
98 | stats[k][k_o] = np.sum(np.logical_and(masks == v, others == v_o))
99 |
100 | stats[k]["overall"] = np.sum(masks == v)
101 |
102 | stats["overall"] = {}
103 | for k_o, v_o in others_dict.items():
104 | stats["overall"][k_o] = np.sum(others == v_o)
105 |
106 | stats["overall"]["overall"] = masks.shape[0]
107 |
108 | # aggregate scores
109 | scores = {}
110 | for k, v in multiple_dict.items():
111 | for k_o in others_dict.keys():
112 | acc_025iou = ious[np.logical_and(np.logical_and(masks == multiple_dict[k], others == others_dict[k_o]), ious >= 0.25)].shape[0] \
113 | / ious[np.logical_and(masks == multiple_dict[k], others == others_dict[k_o])].shape[0] \
114 | if np.sum(np.logical_and(masks == multiple_dict[k], others == others_dict[k_o])) > 0 else 0
115 | acc_05iou = ious[np.logical_and(np.logical_and(masks == multiple_dict[k], others == others_dict[k_o]), ious >= 0.5)].shape[0] \
116 | / ious[np.logical_and(masks == multiple_dict[k], others == others_dict[k_o])].shape[0] \
117 | if np.sum(np.logical_and(masks == multiple_dict[k], others == others_dict[k_o])) > 0 else 0
118 |
119 | if k not in scores:
120 | scores[k] = {k_o: {} for k_o in others_dict.keys()}
121 |
122 | scores[k][k_o]["acc@0.25iou"] = acc_025iou
123 | scores[k][k_o]["acc@0.5iou"] = acc_05iou
124 |
125 | acc_025iou = ious[np.logical_and(masks == multiple_dict[k], ious >= 0.25)].shape[0] \
126 | / ious[masks == multiple_dict[k]].shape[0] if np.sum(masks == multiple_dict[k]) > 0 else 0
127 | acc_05iou = ious[np.logical_and(masks == multiple_dict[k], ious >= 0.5)].shape[0] \
128 | / ious[masks == multiple_dict[k]].shape[0] if np.sum(masks == multiple_dict[k]) > 0 else 0
129 |
130 | scores[k]["overall"] = {}
131 | scores[k]["overall"]["acc@0.25iou"] = acc_025iou
132 | scores[k]["overall"]["acc@0.5iou"] = acc_05iou
133 |
134 | scores["overall"] = {}
135 | for k_o in others_dict.keys():
136 | acc_025iou = ious[np.logical_and(others == others_dict[k_o], ious >= 0.25)].shape[0] \
137 | / ious[others == others_dict[k_o]].shape[0] if np.sum(others == others_dict[k_o]) > 0 else 0
138 | acc_05iou = ious[np.logical_and(others == others_dict[k_o], ious >= 0.5)].shape[0] \
139 | / ious[others == others_dict[k_o]].shape[0] if np.sum(others == others_dict[k_o]) > 0 else 0
140 |
141 | # aggregate
142 | scores["overall"][k_o] = {}
143 | scores["overall"][k_o]["acc@0.25iou"] = acc_025iou
144 | scores["overall"][k_o]["acc@0.5iou"] = acc_05iou
145 |
146 | acc_025iou = ious[ious >= 0.25].shape[0] / ious.shape[0]
147 | acc_05iou = ious[ious >= 0.5].shape[0] / ious.shape[0]
148 |
149 |
150 | # aggregate
151 | scores["overall"]["overall"] = {}
152 | scores["overall"]["overall"]["acc@0.25iou"] = acc_025iou
153 | scores["overall"]["overall"]["acc@0.5iou"] = acc_05iou
154 |
155 | # report
156 | print("\nstats:")
157 | for k_s in stats.keys():
158 | for k_o in stats[k_s].keys():
159 | print("{} | {}: {}".format(k_s, k_o, stats[k_s][k_o]))
160 |
161 | for k_s in scores.keys():
162 | print("\n{}:".format(k_s))
163 | for k_m in scores[k_s].keys():
164 | for metric in scores[k_s][k_m].keys():
165 | print("{} | {} | {}: {}".format(k_s, k_m, metric, scores[k_s][k_m][metric]))
166 |
167 | if __name__ == "__main__":
168 | parser = argparse.ArgumentParser()
169 | parser.add_argument("--folder", type=str, help="Folder containing the model")
170 | args = parser.parse_args()
171 |
172 | evaluate(args)
--------------------------------------------------------------------------------
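
The nested expressions in `evaluate` above all reduce to the same operation: the fraction of IoUs inside a bucket (unique/multiple crossed with in_others/not_in_others) that clear a threshold. Below is a minimal sketch of that reduction; the helper `acc_at_iou` is hypothetical and not part of the repository.

```python
import numpy as np

def acc_at_iou(ious, bucket_mask, threshold):
    """Acc@<threshold>IoU for one bucket: fraction of selected IoUs >= threshold (0 for empty buckets)."""
    selected = np.asarray(ious)[np.asarray(bucket_mask, dtype=bool)]
    if selected.shape[0] == 0:
        return 0.0
    return float((selected >= threshold).sum()) / selected.shape[0]

# e.g. the "multiple" / "overall" cell printed by eval.py corresponds to
# acc_at_iou(ious, masks == 1, 0.25) and acc_at_iou(ious, masks == 1, 0.5)
```
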
/benchmark/predict.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import json
4 | import pickle
5 | import argparse
6 | import importlib
7 | import torch
8 | import torch.optim as optim
9 | import torch.nn as nn
10 | import numpy as np
11 |
12 | from torch.utils.data import DataLoader
13 | from datetime import datetime
14 | from tqdm import tqdm
15 | from copy import deepcopy
16 |
17 | sys.path.append(os.path.join(os.getcwd())) # HACK add the root folder
18 | from lib.config import CONF
19 | from lib.dataset import ScannetReferenceDataset
20 | from lib.solver import Solver
21 | from lib.ap_helper import APCalculator, parse_predictions, parse_groundtruths
22 | from lib.loss_helper import get_loss
23 | from lib.eval_helper import get_eval
24 | from models.refnet import RefNet
25 | from utils.box_util import get_3d_box
26 | from data.scannet.model_util_scannet import ScannetDatasetConfig
27 |
28 | SCANREFER_TEST = json.load(open(os.path.join(CONF.PATH.DATA, "ScanRefer_filtered_test.json")))
29 |
30 | def get_dataloader(args, scanrefer, all_scene_list, split, config):
31 | dataset = ScannetReferenceDataset(
32 | scanrefer=scanrefer,
33 | scanrefer_all_scene=all_scene_list,
34 | split=split,
35 | num_points=args.num_points,
36 | use_color=args.use_color,
37 | use_height=(not args.no_height),
38 | use_normal=args.use_normal,
39 | use_multiview=args.use_multiview
40 | )
41 | print("predict for {} samples".format(len(dataset)))
42 |
43 | dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False)
44 |
45 | return dataset, dataloader
46 |
47 | def get_model(args, config):
48 | # load model
49 | input_channels = int(args.use_multiview) * 128 + int(args.use_normal) * 3 + int(args.use_color) * 3 + int(not args.no_height)
50 | model = RefNet(
51 | num_class=config.num_class,
52 | num_heading_bin=config.num_heading_bin,
53 | num_size_cluster=config.num_size_cluster,
54 | mean_size_arr=config.mean_size_arr,
55 | num_proposal=args.num_proposals,
56 | input_feature_dim=input_channels,
57 | use_lang_classifier=(not args.no_lang_cls),
58 | use_bidir=args.use_bidir
59 | ).cuda()
60 |
61 | model_name = "model.pth"
62 | path = os.path.join(CONF.PATH.OUTPUT, args.folder, model_name)
63 | model.load_state_dict(torch.load(path), strict=False)
64 | model.eval()
65 |
66 | return model
67 |
68 | def get_scannet_scene_list(split):
69 | scene_list = sorted([line.rstrip() for line in open(os.path.join(CONF.PATH.SCANNET_META, "scannetv2_{}.txt".format(split)))])
70 |
71 | return scene_list
72 |
73 | def get_scanrefer(args):
74 | scanrefer = SCANREFER_TEST
75 | scene_list = sorted(list(set([data["scene_id"] for data in scanrefer])))
76 | scanrefer = [data for data in scanrefer if data["scene_id"] in scene_list]
77 |
78 | return scanrefer, scene_list
79 |
80 | def predict(args):
81 | print("predict bounding boxes...")
82 | # constant
83 | DC = ScannetDatasetConfig()
84 |
85 | # init training dataset
86 | print("preparing data...")
87 | scanrefer, scene_list = get_scanrefer(args)
88 |
89 | # dataloader
90 | _, dataloader = get_dataloader(args, scanrefer, scene_list, "test", DC)
91 |
92 | # model
93 | model = get_model(args, DC)
94 |
95 | # config
96 | POST_DICT = {
97 | "remove_empty_box": True,
98 | "use_3d_nms": True,
99 | "nms_iou": 0.25,
100 | "use_old_type_nms": False,
101 | "cls_nms": True,
102 | "per_class_proposal": True,
103 | "conf_thresh": 0.05,
104 | "dataset_config": DC
105 | } if not args.no_nms else None
106 |
107 | # predict
108 | print("predicting...")
109 | pred_bboxes = []
110 | for data_dict in tqdm(dataloader):
111 | for key in data_dict:
112 | data_dict[key] = data_dict[key].cuda()
113 |
114 | # feed
115 | data_dict = model(data_dict)
116 | _, data_dict = get_loss(
117 | data_dict=data_dict,
118 | config=DC,
119 | detection=False,
120 | reference=True
121 | )
122 |
123 | objectness_preds_batch = torch.argmax(data_dict['objectness_scores'], 2).long()
124 |
125 | if POST_DICT:
126 | _ = parse_predictions(data_dict, POST_DICT)
127 | nms_masks = torch.LongTensor(data_dict['pred_mask']).cuda()
128 |
129 | # construct valid mask
130 | pred_masks = (nms_masks * objectness_preds_batch == 1).float()
131 | else:
132 | # construct valid mask
133 | pred_masks = (objectness_preds_batch == 1).float()
134 |
135 | pred_ref = torch.argmax(data_dict['cluster_ref'] * pred_masks, 1) # (B,)
136 | pred_center = data_dict['center'] # (B,K,3)
137 | pred_heading_class = torch.argmax(data_dict['heading_scores'], -1) # B,num_proposal
138 | pred_heading_residual = torch.gather(data_dict['heading_residuals'], 2, pred_heading_class.unsqueeze(-1)) # B,num_proposal,1
139 | pred_heading_class = pred_heading_class # B,num_proposal
140 | pred_heading_residual = pred_heading_residual.squeeze(2) # B,num_proposal
141 | pred_size_class = torch.argmax(data_dict['size_scores'], -1) # B,num_proposal
142 | pred_size_residual = torch.gather(data_dict['size_residuals'], 2, pred_size_class.unsqueeze(-1).unsqueeze(-1).repeat(1,1,1,3)) # B,num_proposal,1,3
143 | pred_size_class = pred_size_class
144 | pred_size_residual = pred_size_residual.squeeze(2) # B,num_proposal,3
145 |
146 | for i in range(pred_ref.shape[0]):
147 | # compute the iou
148 | pred_ref_idx = pred_ref[i]
149 | pred_obb = DC.param2obb(
150 | pred_center[i, pred_ref_idx, 0:3].detach().cpu().numpy(),
151 | pred_heading_class[i, pred_ref_idx].detach().cpu().numpy(),
152 | pred_heading_residual[i, pred_ref_idx].detach().cpu().numpy(),
153 | pred_size_class[i, pred_ref_idx].detach().cpu().numpy(),
154 | pred_size_residual[i, pred_ref_idx].detach().cpu().numpy()
155 | )
156 | pred_bbox = get_3d_box(pred_obb[3:6], pred_obb[6], pred_obb[0:3])
157 |
158 | # construct the multiple mask
159 | multiple = data_dict["unique_multiple"][i].item()
160 |
161 | # construct the others mask
162 | others = 1 if data_dict["object_cat"][i] == 17 else 0
163 |
164 | # store data
165 | scanrefer_idx = data_dict["scan_idx"][i].item()
166 | pred_data = {
167 | "scene_id": scanrefer[scanrefer_idx]["scene_id"],
168 | "object_id": scanrefer[scanrefer_idx]["object_id"],
169 | "ann_id": scanrefer[scanrefer_idx]["ann_id"],
170 | "bbox": pred_bbox.tolist(),
171 | "unique_multiple": multiple,
172 | "others": others
173 | }
174 | pred_bboxes.append(pred_data)
175 |
176 | # dump
177 | print("dumping...")
178 | pred_path = os.path.join(CONF.PATH.OUTPUT, args.folder, "pred.json")
179 | with open(pred_path, "w") as f:
180 | json.dump(pred_bboxes, f, indent=4)
181 |
182 | print("done!")
183 |
184 | if __name__ == "__main__":
185 | parser = argparse.ArgumentParser()
186 | parser.add_argument("--folder", type=str, help="Folder containing the model")
187 | parser.add_argument("--gpu", type=str, help="gpu", default="0")
188 | parser.add_argument("--batch_size", type=int, help="batch size", default=8)
189 | parser.add_argument("--num_points", type=int, default=40000, help="Point Number [default: 40000]")
190 | parser.add_argument("--num_proposals", type=int, default=256, help="Proposal number [default: 256]")
191 | parser.add_argument("--seed", type=int, default=42, help="random seed")
192 | parser.add_argument("--no_height", action="store_true", help="Do NOT use height signal in input.")
193 | parser.add_argument("--no_lang_cls", action="store_true", help="Do NOT use language classifier.")
194 | parser.add_argument("--no_nms", action="store_true", help="do NOT use non-maximum suppression for post-processing.")
195 | parser.add_argument("--use_color", action="store_true", help="Use RGB color in input.")
196 |     parser.add_argument("--use_normal", action="store_true", help="Use normals in input.")
197 | parser.add_argument("--use_multiview", action="store_true", help="Use multiview images.")
198 | parser.add_argument("--use_bidir", action="store_true", help="Use bi-directional GRU.")
199 | args = parser.parse_args()
200 |
201 | # setting
202 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
203 | os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
204 |
205 | # reproducibility
206 | torch.manual_seed(args.seed)
207 | torch.backends.cudnn.deterministic = True
208 | torch.backends.cudnn.benchmark = False
209 | np.random.seed(args.seed)
210 |
211 | predict(args)
212 |
--------------------------------------------------------------------------------
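
`predict.py` writes its boxes to `outputs/<folder>/pred.json`, which `benchmark/eval.py` then reads. For reference, each entry of that JSON list has the following shape; this is a sketch with placeholder values, the field names and types follow `pred_data` above.

```python
# One entry of outputs/<folder>/pred.json; values are placeholders.
entry = {
    "scene_id": "scene0707_00",
    "object_id": "1",                # copied from the ScanRefer annotation
    "ann_id": "0",
    "bbox": [[0.0, 0.0, 0.0]] * 8,   # 8 box corners (x, y, z) from utils.box_util.get_3d_box
    "unique_multiple": 0,            # 0: unique, 1: multiple
    "others": 0                      # 1 if the object category is "others" (class 17)
}
```

A typical sequence is `python benchmark/predict.py --folder <folder>` (with the same input flags the model was trained with) followed by `python benchmark/eval.py --folder <folder>`.
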
/data/scannet/README.md:
--------------------------------------------------------------------------------
1 | # ScanNet Instructions
2 |
3 | To acquire access to the ScanNet dataset, please refer to the [ScanNet project page](https://github.com/ScanNet/ScanNet) and follow the instructions there. You will receive a `download-scannet.py` script once your request for the ScanNet dataset is approved. Note that only a subset of ScanNet is needed. Once you have `download-scannet.py`, please use the commands below to download the portion of ScanNet that is necessary for ScanRefer:
4 |
5 | ```shell
6 | python2 download-scannet.py -o data/scannet --type _vh_clean_2.ply
7 | python2 download-scannet.py -o data/scannet --type .aggregation.json
8 | python2 download-scannet.py -o data/scannet --type _vh_clean_2.0.010000.segs.json
9 | python2 download-scannet.py -o data/scannet --type .txt
10 | ```
11 | Roughly 10.6 GB of free disk space is needed.
12 |
--------------------------------------------------------------------------------
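
A quick sanity check after downloading can save time before preprocessing. The snippet below is a sketch (not part of the repository), assuming it is run from `data/scannet/` with the scans downloaded into `data/scannet/scans/` via the commands above; note that the hidden test scenes are released without the `.aggregation.json` and `.segs.json` annotation files.

```python
import os

SCANNET_DIR = "scans"  # one folder per scan id
SCAN_NAMES = [line.rstrip() for line in open("meta_data/scannetv2.txt")]
SUFFIXES = ["_vh_clean_2.ply", ".aggregation.json", "_vh_clean_2.0.010000.segs.json", ".txt"]

missing = []
for scan in SCAN_NAMES:
    for suffix in SUFFIXES:
        path = os.path.join(SCANNET_DIR, scan, scan + suffix)
        if not os.path.isfile(path):
            missing.append(path)

print("{} expected file(s) missing".format(len(missing)))
```
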
/data/scannet/batch_load_scannet_data.py:
--------------------------------------------------------------------------------
1 | """
2 | Modified from: https://github.com/facebookresearch/votenet/blob/master/scannet/batch_load_scannet_data.py
3 |
4 | Batch mode for loading ScanNet scenes with vertices and ground-truth labels for semantic and instance segmentation
5 |
6 | Usage example: python ./batch_load_scannet_data.py
7 | """
8 |
9 | import os
10 | import datetime
11 | import numpy as np
12 | from load_scannet_data import export
13 | from multiprocessing import Pool
14 |
15 |
16 | SCANNET_DIR = 'scans'
17 | SCAN_NAMES = sorted([line.rstrip() for line in open('meta_data/scannetv2.txt')])
18 | LABEL_MAP_FILE = 'meta_data/scannetv2-labels.combined.tsv'
19 | DONOTCARE_CLASS_IDS = np.array([])
20 | OBJ_CLASS_IDS = np.array([3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40]) # exclude wall (1), floor (2), ceiling (22)
21 | MAX_NUM_POINT = 50000
22 | OUTPUT_FOLDER = './scannet_data'
23 |
24 | def export_one_scan(scan_name):
25 | output_filename_prefix = os.path.join(OUTPUT_FOLDER, scan_name)
26 | mesh_file = os.path.join(SCANNET_DIR, scan_name, scan_name + '_vh_clean_2.ply')
27 | agg_file = os.path.join(SCANNET_DIR, scan_name, scan_name + '.aggregation.json')
28 | seg_file = os.path.join(SCANNET_DIR, scan_name, scan_name + '_vh_clean_2.0.010000.segs.json')
29 | meta_file = os.path.join(SCANNET_DIR, scan_name, scan_name + '.txt') # includes axisAlignment info for the train set scans.
30 | mesh_vertices, aligned_vertices, semantic_labels, instance_labels, instance_bboxes, aligned_instance_bboxes = export(mesh_file, agg_file, seg_file, meta_file, LABEL_MAP_FILE, None)
31 |
32 | mask = np.logical_not(np.in1d(semantic_labels, DONOTCARE_CLASS_IDS))
33 | mesh_vertices = mesh_vertices[mask,:]
34 | aligned_vertices = aligned_vertices[mask,:]
35 | semantic_labels = semantic_labels[mask]
36 | instance_labels = instance_labels[mask]
37 |
38 | if instance_bboxes.shape[0] > 1:
39 | num_instances = len(np.unique(instance_labels))
40 | print('Num of instances: ', num_instances)
41 |
42 | # bbox_mask = np.in1d(instance_bboxes[:,-1], OBJ_CLASS_IDS)
43 | bbox_mask = np.in1d(instance_bboxes[:,-2], OBJ_CLASS_IDS) # match the mesh2cap
44 | instance_bboxes = instance_bboxes[bbox_mask,:]
45 | aligned_instance_bboxes = aligned_instance_bboxes[bbox_mask,:]
46 | print('Num of care instances: ', instance_bboxes.shape[0])
47 | else:
48 | print("No semantic/instance annotation for test scenes")
49 |
50 | N = mesh_vertices.shape[0]
51 | if N > MAX_NUM_POINT:
52 | choices = np.random.choice(N, MAX_NUM_POINT, replace=False)
53 | mesh_vertices = mesh_vertices[choices, :]
54 | aligned_vertices = aligned_vertices[choices, :]
55 | semantic_labels = semantic_labels[choices]
56 | instance_labels = instance_labels[choices]
57 |
58 | print("Shape of points: {}".format(mesh_vertices.shape))
59 |
60 | np.save(output_filename_prefix+'_vert.npy', mesh_vertices)
61 | np.save(output_filename_prefix+'_aligned_vert.npy', aligned_vertices)
62 | np.save(output_filename_prefix+'_sem_label.npy', semantic_labels)
63 | np.save(output_filename_prefix+'_ins_label.npy', instance_labels)
64 | np.save(output_filename_prefix+'_bbox.npy', instance_bboxes)
65 | np.save(output_filename_prefix+'_aligned_bbox.npy', aligned_instance_bboxes)
66 |
67 | def batch_export():
68 |
69 | if not os.path.exists(OUTPUT_FOLDER):
70 | print('Creating new data folder: {}'.format(OUTPUT_FOLDER))
71 | os.mkdir(OUTPUT_FOLDER)
72 |
73 | with Pool() as pool:
74 | pool.map(export_one_scan, SCAN_NAMES)
75 |
76 | if __name__=='__main__':
77 | batch_export()
78 |
--------------------------------------------------------------------------------
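
Each exported scan ends up as six `.npy` files under `scannet_data/`. The snippet below is a sketch for inspecting one of them, assuming `scene0000_00` has already been exported; the column layouts follow the comments in `load_scannet_data.py` below.

```python
import numpy as np

prefix = "scannet_data/scene0000_00"

verts = np.load(prefix + "_vert.npy")            # (N, 9): xyz, rgb in 0-255, normals
aligned = np.load(prefix + "_aligned_vert.npy")  # (N, 9): same, with xyz axis-aligned
sem = np.load(prefix + "_sem_label.npy")         # (N,)   nyu40 semantic ids, 0 = unannotated
ins = np.load(prefix + "_ins_label.npy")         # (N,)   instance ids, 0 = unannotated
bbox = np.load(prefix + "_bbox.npy")             # (K, 8): cx, cy, cz, dx, dy, dz, nyu40 id, object id
print(verts.shape, aligned.shape, sem.shape, ins.shape, bbox.shape)
```
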
/data/scannet/load_scannet_data.py:
--------------------------------------------------------------------------------
1 | """
2 | Modified from: https://github.com/facebookresearch/votenet/blob/master/scannet/load_scannet_data.py
3 |
4 | Load ScanNet scenes with vertices and ground-truth labels for semantic and instance segmentation
5 | """
6 |
7 | # python imports
8 | import math
9 | import os, sys, argparse
10 | import inspect
11 | import json
12 | import pdb
13 | import numpy as np
14 | import scannet_utils
15 |
16 | def read_aggregation(filename):
17 | object_id_to_segs = {}
18 | label_to_segs = {}
19 | with open(filename) as f:
20 | data = json.load(f)
21 | num_objects = len(data['segGroups'])
22 | for i in range(num_objects):
23 | object_id = data['segGroups'][i]['objectId'] + 1 # instance ids should be 1-indexed
24 | label = data['segGroups'][i]['label']
25 | segs = data['segGroups'][i]['segments']
26 | object_id_to_segs[object_id] = segs
27 | if label in label_to_segs:
28 | label_to_segs[label].extend(segs)
29 | else:
30 | label_to_segs[label] = segs
31 | return object_id_to_segs, label_to_segs
32 |
33 |
34 | def read_segmentation(filename):
35 | seg_to_verts = {}
36 | with open(filename) as f:
37 | data = json.load(f)
38 | num_verts = len(data['segIndices'])
39 | for i in range(num_verts):
40 | seg_id = data['segIndices'][i]
41 | if seg_id in seg_to_verts:
42 | seg_to_verts[seg_id].append(i)
43 | else:
44 | seg_to_verts[seg_id] = [i]
45 | return seg_to_verts, num_verts
46 |
47 |
48 | def export(mesh_file, agg_file, seg_file, meta_file, label_map_file, output_file=None):
49 | """ points are XYZ RGB (RGB in 0-255),
50 | semantic label as nyu40 ids,
51 | instance label as 1-#instance,
52 |         box as (cx,cy,cz,dx,dy,dz,semantic_label,object_id)
53 | """
54 | label_map = scannet_utils.read_label_mapping(label_map_file, label_from='raw_category', label_to='nyu40id')
55 | # mesh_vertices = scannet_utils.read_mesh_vertices_rgb(mesh_file)
56 | mesh_vertices = scannet_utils.read_mesh_vertices_rgb_normal(mesh_file)
57 |
58 | # Load scene axis alignment matrix
59 | lines = open(meta_file).readlines()
60 | axis_align_matrix = None
61 | for line in lines:
62 | if 'axisAlignment' in line:
63 | axis_align_matrix = [float(x) for x in line.rstrip().strip('axisAlignment = ').split(' ')]
64 |
65 | if axis_align_matrix != None:
66 | axis_align_matrix = np.array(axis_align_matrix).reshape((4,4))
67 | pts = np.ones((mesh_vertices.shape[0], 4))
68 | pts[:,0:3] = mesh_vertices[:,0:3]
69 | pts = np.dot(pts, axis_align_matrix.transpose()) # Nx4
70 | aligned_vertices = np.copy(mesh_vertices)
71 | aligned_vertices[:,0:3] = pts[:,0:3]
72 | else:
73 | print("No axis alignment matrix found")
74 | aligned_vertices = mesh_vertices
75 |
76 | # Load semantic and instance labels
77 | if os.path.isfile(agg_file):
78 | object_id_to_segs, label_to_segs = read_aggregation(agg_file)
79 | seg_to_verts, num_verts = read_segmentation(seg_file)
80 |
81 | label_ids = np.zeros(shape=(num_verts), dtype=np.uint32) # 0: unannotated
82 | object_id_to_label_id = {}
83 | for label, segs in label_to_segs.items():
84 | label_id = label_map[label]
85 | for seg in segs:
86 | verts = seg_to_verts[seg]
87 | label_ids[verts] = label_id
88 | instance_ids = np.zeros(shape=(num_verts), dtype=np.uint32) # 0: unannotated
89 | num_instances = len(np.unique(list(object_id_to_segs.keys())))
90 | for object_id, segs in object_id_to_segs.items():
91 | for seg in segs:
92 | verts = seg_to_verts[seg]
93 | instance_ids[verts] = object_id
94 | if object_id not in object_id_to_label_id:
95 | object_id_to_label_id[object_id] = label_ids[verts][0]
96 |
97 | instance_bboxes = np.zeros((num_instances,8)) # also include object id
98 | aligned_instance_bboxes = np.zeros((num_instances,8)) # also include object id
99 | for obj_id in object_id_to_segs:
100 | label_id = object_id_to_label_id[obj_id]
101 |
102 | # bboxes in the original meshes
103 | obj_pc = mesh_vertices[instance_ids==obj_id, 0:3]
104 | if len(obj_pc) == 0: continue
105 | # Compute axis aligned box
106 | # An axis aligned bounding box is parameterized by
107 | # (cx,cy,cz) and (dx,dy,dz) and label id
108 | # where (cx,cy,cz) is the center point of the box,
109 | # dx is the x-axis length of the box.
110 | xmin = np.min(obj_pc[:,0])
111 | ymin = np.min(obj_pc[:,1])
112 | zmin = np.min(obj_pc[:,2])
113 | xmax = np.max(obj_pc[:,0])
114 | ymax = np.max(obj_pc[:,1])
115 | zmax = np.max(obj_pc[:,2])
116 | bbox = np.array([(xmin+xmax)/2, (ymin+ymax)/2, (zmin+zmax)/2, xmax-xmin, ymax-ymin, zmax-zmin, label_id, obj_id-1]) # also include object id
117 | # NOTE: this assumes obj_id is in 1,2,3,.,,,.NUM_INSTANCES
118 | instance_bboxes[obj_id-1,:] = bbox
119 |
120 | # bboxes in the aligned meshes
121 | obj_pc = aligned_vertices[instance_ids==obj_id, 0:3]
122 | if len(obj_pc) == 0: continue
123 | # Compute axis aligned box
124 | # An axis aligned bounding box is parameterized by
125 | # (cx,cy,cz) and (dx,dy,dz) and label id
126 | # where (cx,cy,cz) is the center point of the box,
127 | # dx is the x-axis length of the box.
128 | xmin = np.min(obj_pc[:,0])
129 | ymin = np.min(obj_pc[:,1])
130 | zmin = np.min(obj_pc[:,2])
131 | xmax = np.max(obj_pc[:,0])
132 | ymax = np.max(obj_pc[:,1])
133 | zmax = np.max(obj_pc[:,2])
134 | bbox = np.array([(xmin+xmax)/2, (ymin+ymax)/2, (zmin+zmax)/2, xmax-xmin, ymax-ymin, zmax-zmin, label_id, obj_id-1]) # also include object id
135 | # NOTE: this assumes obj_id is in 1,2,3,.,,,.NUM_INSTANCES
136 | aligned_instance_bboxes[obj_id-1,:] = bbox
137 | else:
138 | # use zero as placeholders for the test scene
139 | print("use placeholders")
140 | num_verts = mesh_vertices.shape[0]
141 | label_ids = np.zeros(shape=(num_verts), dtype=np.uint32) # 0: unannotated
142 | instance_ids = np.zeros(shape=(num_verts), dtype=np.uint32) # 0: unannotated
143 | instance_bboxes = np.zeros((1, 8)) # also include object id
144 | aligned_instance_bboxes = np.zeros((1, 8)) # also include object id
145 |
146 | if output_file is not None:
147 | np.save(output_file+'_vert.npy', mesh_vertices)
148 | np.save(output_file+'_aligned_vert.npy', aligned_vertices)
149 | np.save(output_file+'_sem_label.npy', label_ids)
150 | np.save(output_file+'_ins_label.npy', instance_ids)
151 | np.save(output_file+'_bbox.npy', instance_bboxes)
152 |         np.save(output_file+'_aligned_bbox.npy', aligned_instance_bboxes)
153 |
154 | return mesh_vertices, aligned_vertices, label_ids, instance_ids, instance_bboxes, aligned_instance_bboxes
155 |
156 | def main():
157 | parser = argparse.ArgumentParser()
158 | parser.add_argument('--scan_path', required=True, help='path to scannet scene (e.g., data/ScanNet/v2/scene0000_00')
159 | parser.add_argument('--output_file', required=True, help='output file')
160 | parser.add_argument('--label_map_file', required=True, help='path to scannetv2-labels.combined.tsv')
161 | opt = parser.parse_args()
162 |
163 | scan_name = os.path.split(opt.scan_path)[-1]
164 | mesh_file = os.path.join(opt.scan_path, scan_name + '_vh_clean_2.ply')
165 | agg_file = os.path.join(opt.scan_path, scan_name + '.aggregation.json')
166 | seg_file = os.path.join(opt.scan_path, scan_name + '_vh_clean_2.0.010000.segs.json')
167 | meta_file = os.path.join(opt.scan_path, scan_name + '.txt') # includes axisAlignment info for the train set scans.
168 | export(mesh_file, agg_file, seg_file, meta_file, opt.label_map_file, opt.output_file)
169 |
170 | if __name__ == '__main__':
171 | main()
172 |
--------------------------------------------------------------------------------
/data/scannet/meta_data/nyu40_labels.csv:
--------------------------------------------------------------------------------
1 | nyu40id,nyu40class,mappedId,mappedIdConsecutive,weight
2 | 1,wall,(ignore),19,0.0
3 | 2,floor,(ignore),19,0.0
4 | 3,cabinet,3,1,3.9644974086960434
5 | 4,bed,4,2,5.459494152836571
6 | 5,chair,5,3,2.241522691584157
7 | 6,sofa,6,4,4.820655512680854
8 | 7,table,7,5,3.565918577548873
9 | 8,door,8,6,3.538498341919445
10 | 9,window,9,7,4.636521236560596
11 | 10,bookshelf,10,8,5.445050937449535
12 | 11,picture,11,9,5.079250281008131
13 | 12,counter,12,10,6.2030429647735845
14 | 13,blinds,(ignore),19,0.0
15 | 14,desk,14,11,4.622662494840168
16 | 15,shelves,(ignore),19,0.0
17 | 16,curtain,16,12,5.956294301248057
18 | 17,dresser,(ignore),19,0.0
19 | 18,pillow,(ignore),19,0.0
20 | 19,mirror,(ignore),19,0.0
21 | 20,floor_mat,(ignore),19,0.0
22 | 21,clothes,(ignore),19,0.0
23 | 22,ceiling,(ignore),19,0.0
24 | 23,books,(ignore),19,0.0
25 | 24,refridgerator,24,13,5.459141107819665
26 | 25,television,(ignore),19,0.0
27 | 26,paper,(ignore),19,0.0
28 | 27,towel,(ignore),19,0.0
29 | 28,shower_curtain,28,14,6.724871661883906
30 | 29,box,(ignore),19,0.0
31 | 30,whiteboard,(ignore),19,0.0
32 | 31,person,(ignore),19,0.0
33 | 32,night_stand,(ignore),19,0.0
34 | 33,toilet,33,15,5.832442848923174
35 | 34,sink,34,16,5.064773947290611
36 | 35,lamp,(ignore),19,0.0
37 | 36,bathtub,36,17,6.738988357113375
38 | 37,bag,(ignore),19,0.0
39 | 38,otherstructure,(ignore),19,0.0
40 | 39,otherfurniture,39,18,3.375217918833916
41 | 40,otherprop,(ignore),19,0.0
--------------------------------------------------------------------------------
/data/scannet/meta_data/scannet_means.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daveredrum/ScanRefer/9d7483053e8d29acfd4db4eb1bc28f1564f5dddb/data/scannet/meta_data/scannet_means.npz
--------------------------------------------------------------------------------
/data/scannet/meta_data/scannet_reference_means.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daveredrum/ScanRefer/9d7483053e8d29acfd4db4eb1bc28f1564f5dddb/data/scannet/meta_data/scannet_reference_means.npz
--------------------------------------------------------------------------------
/data/scannet/meta_data/scannetv2_test.txt:
--------------------------------------------------------------------------------
1 | scene0707_00
2 | scene0708_00
3 | scene0709_00
4 | scene0710_00
5 | scene0711_00
6 | scene0712_00
7 | scene0713_00
8 | scene0714_00
9 | scene0715_00
10 | scene0716_00
11 | scene0717_00
12 | scene0718_00
13 | scene0719_00
14 | scene0720_00
15 | scene0721_00
16 | scene0722_00
17 | scene0723_00
18 | scene0724_00
19 | scene0725_00
20 | scene0726_00
21 | scene0727_00
22 | scene0728_00
23 | scene0729_00
24 | scene0730_00
25 | scene0731_00
26 | scene0732_00
27 | scene0733_00
28 | scene0734_00
29 | scene0735_00
30 | scene0736_00
31 | scene0737_00
32 | scene0738_00
33 | scene0739_00
34 | scene0740_00
35 | scene0741_00
36 | scene0742_00
37 | scene0743_00
38 | scene0744_00
39 | scene0745_00
40 | scene0746_00
41 | scene0747_00
42 | scene0748_00
43 | scene0749_00
44 | scene0750_00
45 | scene0751_00
46 | scene0752_00
47 | scene0753_00
48 | scene0754_00
49 | scene0755_00
50 | scene0756_00
51 | scene0757_00
52 | scene0758_00
53 | scene0759_00
54 | scene0760_00
55 | scene0761_00
56 | scene0762_00
57 | scene0763_00
58 | scene0764_00
59 | scene0765_00
60 | scene0766_00
61 | scene0767_00
62 | scene0768_00
63 | scene0769_00
64 | scene0770_00
65 | scene0771_00
66 | scene0772_00
67 | scene0773_00
68 | scene0774_00
69 | scene0775_00
70 | scene0776_00
71 | scene0777_00
72 | scene0778_00
73 | scene0779_00
74 | scene0780_00
75 | scene0781_00
76 | scene0782_00
77 | scene0783_00
78 | scene0784_00
79 | scene0785_00
80 | scene0786_00
81 | scene0787_00
82 | scene0788_00
83 | scene0789_00
84 | scene0790_00
85 | scene0791_00
86 | scene0792_00
87 | scene0793_00
88 | scene0794_00
89 | scene0795_00
90 | scene0796_00
91 | scene0797_00
92 | scene0798_00
93 | scene0799_00
94 | scene0800_00
95 | scene0801_00
96 | scene0802_00
97 | scene0803_00
98 | scene0804_00
99 | scene0805_00
100 | scene0806_00
101 |
--------------------------------------------------------------------------------
/data/scannet/meta_data/scannetv2_val.txt:
--------------------------------------------------------------------------------
1 | scene0011_00
2 | scene0011_01
3 | scene0015_00
4 | scene0019_00
5 | scene0019_01
6 | scene0025_00
7 | scene0025_01
8 | scene0025_02
9 | scene0030_00
10 | scene0030_01
11 | scene0030_02
12 | scene0046_00
13 | scene0046_01
14 | scene0046_02
15 | scene0050_00
16 | scene0050_01
17 | scene0050_02
18 | scene0063_00
19 | scene0064_00
20 | scene0064_01
21 | scene0077_00
22 | scene0077_01
23 | scene0081_00
24 | scene0081_01
25 | scene0081_02
26 | scene0084_00
27 | scene0084_01
28 | scene0084_02
29 | scene0086_00
30 | scene0086_01
31 | scene0086_02
32 | scene0088_00
33 | scene0088_01
34 | scene0088_02
35 | scene0088_03
36 | scene0095_00
37 | scene0095_01
38 | scene0100_00
39 | scene0100_01
40 | scene0100_02
41 | scene0131_00
42 | scene0131_01
43 | scene0131_02
44 | scene0139_00
45 | scene0144_00
46 | scene0144_01
47 | scene0146_00
48 | scene0146_01
49 | scene0146_02
50 | scene0149_00
51 | scene0153_00
52 | scene0153_01
53 | scene0164_00
54 | scene0164_01
55 | scene0164_02
56 | scene0164_03
57 | scene0169_00
58 | scene0169_01
59 | scene0187_00
60 | scene0187_01
61 | scene0193_00
62 | scene0193_01
63 | scene0196_00
64 | scene0203_00
65 | scene0203_01
66 | scene0203_02
67 | scene0207_00
68 | scene0207_01
69 | scene0207_02
70 | scene0208_00
71 | scene0217_00
72 | scene0221_00
73 | scene0221_01
74 | scene0222_00
75 | scene0222_01
76 | scene0231_00
77 | scene0231_01
78 | scene0231_02
79 | scene0246_00
80 | scene0249_00
81 | scene0251_00
82 | scene0256_00
83 | scene0256_01
84 | scene0256_02
85 | scene0257_00
86 | scene0277_00
87 | scene0277_01
88 | scene0277_02
89 | scene0278_00
90 | scene0278_01
91 | scene0300_00
92 | scene0300_01
93 | scene0304_00
94 | scene0307_00
95 | scene0307_01
96 | scene0307_02
97 | scene0314_00
98 | scene0316_00
99 | scene0328_00
100 | scene0329_00
101 | scene0329_01
102 | scene0329_02
103 | scene0334_00
104 | scene0334_01
105 | scene0334_02
106 | scene0338_00
107 | scene0338_01
108 | scene0338_02
109 | scene0342_00
110 | scene0343_00
111 | scene0351_00
112 | scene0351_01
113 | scene0353_00
114 | scene0353_01
115 | scene0353_02
116 | scene0354_00
117 | scene0355_00
118 | scene0355_01
119 | scene0356_00
120 | scene0356_01
121 | scene0356_02
122 | scene0357_00
123 | scene0357_01
124 | scene0377_00
125 | scene0377_01
126 | scene0377_02
127 | scene0378_00
128 | scene0378_01
129 | scene0378_02
130 | scene0382_00
131 | scene0382_01
132 | scene0389_00
133 | scene0406_00
134 | scene0406_01
135 | scene0406_02
136 | scene0412_00
137 | scene0412_01
138 | scene0414_00
139 | scene0423_00
140 | scene0423_01
141 | scene0423_02
142 | scene0426_00
143 | scene0426_01
144 | scene0426_02
145 | scene0426_03
146 | scene0427_00
147 | scene0430_00
148 | scene0430_01
149 | scene0432_00
150 | scene0432_01
151 | scene0435_00
152 | scene0435_01
153 | scene0435_02
154 | scene0435_03
155 | scene0441_00
156 | scene0458_00
157 | scene0458_01
158 | scene0461_00
159 | scene0462_00
160 | scene0474_00
161 | scene0474_01
162 | scene0474_02
163 | scene0474_03
164 | scene0474_04
165 | scene0474_05
166 | scene0488_00
167 | scene0488_01
168 | scene0490_00
169 | scene0494_00
170 | scene0496_00
171 | scene0500_00
172 | scene0500_01
173 | scene0518_00
174 | scene0527_00
175 | scene0535_00
176 | scene0549_00
177 | scene0549_01
178 | scene0550_00
179 | scene0552_00
180 | scene0552_01
181 | scene0553_00
182 | scene0553_01
183 | scene0553_02
184 | scene0558_00
185 | scene0558_01
186 | scene0558_02
187 | scene0559_00
188 | scene0559_01
189 | scene0559_02
190 | scene0565_00
191 | scene0568_00
192 | scene0568_01
193 | scene0568_02
194 | scene0574_00
195 | scene0574_01
196 | scene0574_02
197 | scene0575_00
198 | scene0575_01
199 | scene0575_02
200 | scene0578_00
201 | scene0578_01
202 | scene0578_02
203 | scene0580_00
204 | scene0580_01
205 | scene0583_00
206 | scene0583_01
207 | scene0583_02
208 | scene0591_00
209 | scene0591_01
210 | scene0591_02
211 | scene0593_00
212 | scene0593_01
213 | scene0595_00
214 | scene0598_00
215 | scene0598_01
216 | scene0598_02
217 | scene0599_00
218 | scene0599_01
219 | scene0599_02
220 | scene0606_00
221 | scene0606_01
222 | scene0606_02
223 | scene0607_00
224 | scene0607_01
225 | scene0608_00
226 | scene0608_01
227 | scene0608_02
228 | scene0609_00
229 | scene0609_01
230 | scene0609_02
231 | scene0609_03
232 | scene0616_00
233 | scene0616_01
234 | scene0618_00
235 | scene0621_00
236 | scene0629_00
237 | scene0629_01
238 | scene0629_02
239 | scene0633_00
240 | scene0633_01
241 | scene0643_00
242 | scene0644_00
243 | scene0645_00
244 | scene0645_01
245 | scene0645_02
246 | scene0647_00
247 | scene0647_01
248 | scene0648_00
249 | scene0648_01
250 | scene0651_00
251 | scene0651_01
252 | scene0651_02
253 | scene0652_00
254 | scene0653_00
255 | scene0653_01
256 | scene0655_00
257 | scene0655_01
258 | scene0655_02
259 | scene0658_00
260 | scene0660_00
261 | scene0663_00
262 | scene0663_01
263 | scene0663_02
264 | scene0664_00
265 | scene0664_01
266 | scene0664_02
267 | scene0665_00
268 | scene0665_01
269 | scene0670_00
270 | scene0670_01
271 | scene0671_00
272 | scene0671_01
273 | scene0678_00
274 | scene0678_01
275 | scene0678_02
276 | scene0684_00
277 | scene0684_01
278 | scene0685_00
279 | scene0685_01
280 | scene0685_02
281 | scene0686_00
282 | scene0686_01
283 | scene0686_02
284 | scene0689_00
285 | scene0690_00
286 | scene0690_01
287 | scene0693_00
288 | scene0693_01
289 | scene0693_02
290 | scene0695_00
291 | scene0695_01
292 | scene0695_02
293 | scene0695_03
294 | scene0696_00
295 | scene0696_01
296 | scene0696_02
297 | scene0697_00
298 | scene0697_01
299 | scene0697_02
300 | scene0697_03
301 | scene0699_00
302 | scene0700_00
303 | scene0700_01
304 | scene0700_02
305 | scene0701_00
306 | scene0701_01
307 | scene0701_02
308 | scene0702_00
309 | scene0702_01
310 | scene0702_02
311 | scene0704_00
312 | scene0704_01
313 |
--------------------------------------------------------------------------------
/data/scannet/model_util_scannet.py:
--------------------------------------------------------------------------------
1 | """
2 | Modified from: https://github.com/facebookresearch/votenet/blob/master/scannet/model_util_scannet.py
3 | """
4 |
5 | import numpy as np
6 | import sys
7 | import os
8 |
9 | sys.path.append(os.path.join(os.getcwd(), os.pardir, "lib")) # HACK add the lib folder
10 | from lib.config import CONF
11 | from utils.box_util import get_3d_box
12 |
13 | def in_hull(p, hull):
14 | from scipy.spatial import Delaunay
15 | if not isinstance(hull,Delaunay):
16 | hull = Delaunay(hull)
17 | return hull.find_simplex(p)>=0
18 |
19 | def extract_pc_in_box3d(pc, box3d):
20 | ''' pc: (N,3), box3d: (8,3) '''
21 | box3d_roi_inds = in_hull(pc[:,0:3], box3d)
22 | return pc[box3d_roi_inds,:], box3d_roi_inds
23 |
24 | def rotate_aligned_boxes(input_boxes, rot_mat):
25 | centers, lengths = input_boxes[:,0:3], input_boxes[:,3:6]
26 | new_centers = np.dot(centers, np.transpose(rot_mat))
27 |
28 | dx, dy = lengths[:,0]/2.0, lengths[:,1]/2.0
29 | new_x = np.zeros((dx.shape[0], 4))
30 | new_y = np.zeros((dx.shape[0], 4))
31 |
32 | for i, crnr in enumerate([(-1,-1), (1, -1), (1, 1), (-1, 1)]):
33 | crnrs = np.zeros((dx.shape[0], 3))
34 | crnrs[:,0] = crnr[0]*dx
35 | crnrs[:,1] = crnr[1]*dy
36 | crnrs = np.dot(crnrs, np.transpose(rot_mat))
37 | new_x[:,i] = crnrs[:,0]
38 | new_y[:,i] = crnrs[:,1]
39 |
40 |
41 | new_dx = 2.0*np.max(new_x, 1)
42 | new_dy = 2.0*np.max(new_y, 1)
43 | new_lengths = np.stack((new_dx, new_dy, lengths[:,2]), axis=1)
44 |
45 | return np.concatenate([new_centers, new_lengths], axis=1)
46 |
47 | def rotate_aligned_boxes_along_axis(input_boxes, rot_mat, axis):
48 | centers, lengths = input_boxes[:,0:3], input_boxes[:,3:6]
49 | new_centers = np.dot(centers, np.transpose(rot_mat))
50 |
51 | if axis == "x":
52 | d1, d2 = lengths[:,1]/2.0, lengths[:,2]/2.0
53 | elif axis == "y":
54 | d1, d2 = lengths[:,0]/2.0, lengths[:,2]/2.0
55 | else:
56 | d1, d2 = lengths[:,0]/2.0, lengths[:,1]/2.0
57 |
58 | new_1 = np.zeros((d1.shape[0], 4))
59 | new_2 = np.zeros((d1.shape[0], 4))
60 |
61 | for i, crnr in enumerate([(-1,-1), (1, -1), (1, 1), (-1, 1)]):
62 | crnrs = np.zeros((d1.shape[0], 3))
63 | crnrs[:,0] = crnr[0]*d1
64 | crnrs[:,1] = crnr[1]*d2
65 | crnrs = np.dot(crnrs, np.transpose(rot_mat))
66 | new_1[:,i] = crnrs[:,0]
67 | new_2[:,i] = crnrs[:,1]
68 |
69 | new_d1 = 2.0*np.max(new_1, 1)
70 | new_d2 = 2.0*np.max(new_2, 1)
71 |
72 | if axis == "x":
73 | new_lengths = np.stack((lengths[:,0], new_d1, new_d2), axis=1)
74 | elif axis == "y":
75 | new_lengths = np.stack((new_d1, lengths[:,1], new_d2), axis=1)
76 | else:
77 | new_lengths = np.stack((new_d1, new_d2, lengths[:,2]), axis=1)
78 |
79 | return np.concatenate([new_centers, new_lengths], axis=1)
80 |
81 | class ScannetDatasetConfig(object):
82 | def __init__(self):
83 | self.type2class = {'cabinet':0, 'bed':1, 'chair':2, 'sofa':3, 'table':4, 'door':5,
84 | 'window':6,'bookshelf':7,'picture':8, 'counter':9, 'desk':10, 'curtain':11,
85 | 'refrigerator':12, 'shower curtain':13, 'toilet':14, 'sink':15, 'bathtub':16, 'others':17}
86 | self.class2type = {self.type2class[t]:t for t in self.type2class}
87 |
88 | self.nyu40ids = np.array([3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40]) # exclude wall (1), floor (2), ceiling (22)
89 | self.nyu40id2class = self._get_nyu40id2class()
90 | self.mean_size_arr = np.load(os.path.join(CONF.PATH.SCANNET, 'meta_data/scannet_reference_means.npz'))['arr_0']
91 |
92 | self.num_class = len(self.type2class.keys())
93 | self.num_heading_bin = 1
94 | self.num_size_cluster = len(self.type2class.keys())
95 |
96 | self.type_mean_size = {}
97 | for i in range(self.num_size_cluster):
98 | self.type_mean_size[self.class2type[i]] = self.mean_size_arr[i,:]
99 |
100 | def _get_nyu40id2class(self):
101 | lines = [line.rstrip() for line in open(os.path.join(CONF.PATH.SCANNET, 'meta_data/scannetv2-labels.combined.tsv'))]
102 | lines = lines[1:]
103 | nyu40ids2class = {}
104 | for i in range(len(lines)):
105 | label_classes_set = set(self.type2class.keys())
106 | elements = lines[i].split('\t')
107 | nyu40_id = int(elements[4])
108 | nyu40_name = elements[7]
109 | if nyu40_id in self.nyu40ids:
110 | if nyu40_name not in label_classes_set:
111 | nyu40ids2class[nyu40_id] = self.type2class["others"]
112 | else:
113 | nyu40ids2class[nyu40_id] = self.type2class[nyu40_name]
114 |
115 | return nyu40ids2class
116 |
117 | def angle2class(self, angle):
118 | ''' Convert continuous angle to discrete class
119 |         [optional] also returns a small regression number from
120 | class center angle to current angle.
121 |
122 | angle is from 0-2pi (or -pi~pi), class center at 0, 1*(2pi/N), 2*(2pi/N) ... (N-1)*(2pi/N)
123 | return is class of int32 of 0,1,...,N-1 and a number such that
124 | class*(2pi/N) + number = angle
125 |
126 | NOT USED.
127 | '''
128 | assert(False)
129 |
130 | def class2angle(self, pred_cls, residual, to_label_format=True):
131 | ''' Inverse function to angle2class.
132 |
133 |         As ScanNet only has axis-aligned boxes, angles are always 0. '''
134 | return 0
135 |
136 | def class2angle_batch(self, pred_cls, residual, to_label_format=True):
137 | ''' Inverse function to angle2class.
138 |
139 |         As ScanNet only has axis-aligned boxes, angles are always 0. '''
140 | return np.zeros(pred_cls.shape[0])
141 |
142 | def size2class(self, size, type_name):
143 | ''' Convert 3D box size (l,w,h) to size class and size residual '''
144 | size_class = self.type2class[type_name]
145 | size_residual = size - self.type_mean_size[type_name]
146 | return size_class, size_residual
147 |
148 | def class2size(self, pred_cls, residual):
149 | ''' Inverse function to size2class '''
150 | return self.mean_size_arr[pred_cls] + residual
151 |
152 | def class2size_batch(self, pred_cls, residual):
153 | ''' Inverse function to size2class '''
154 | return self.mean_size_arr[pred_cls] + residual
155 |
156 | def param2obb(self, center, heading_class, heading_residual, size_class, size_residual):
157 | heading_angle = self.class2angle(heading_class, heading_residual)
158 | box_size = self.class2size(int(size_class), size_residual)
159 | obb = np.zeros((7,))
160 | obb[0:3] = center
161 | obb[3:6] = box_size
162 | obb[6] = heading_angle*-1
163 | return obb
164 |
165 | def param2obb_batch(self, center, heading_class, heading_residual, size_class, size_residual):
166 | heading_angle = self.class2angle_batch(heading_class, heading_residual)
167 | box_size = self.class2size_batch(size_class, size_residual)
168 | obb = np.zeros((heading_class.shape[0], 7))
169 | obb[:, 0:3] = center
170 | obb[:, 3:6] = box_size
171 | obb[:, 6] = heading_angle*-1
172 | return obb
173 |
--------------------------------------------------------------------------------
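
`param2obb` turns a predicted (center, heading class/residual, size class/residual) tuple into a 7-dim oriented box, and `get_3d_box` expands it into 8 corners, exactly as done in `benchmark/predict.py`. Below is a minimal sketch; it assumes the repository root is on `sys.path` and the paths in `lib/config.py` have been configured, since `ScannetDatasetConfig` loads the mean sizes from `meta_data/`.

```python
import numpy as np
from data.scannet.model_util_scannet import ScannetDatasetConfig
from utils.box_util import get_3d_box

DC = ScannetDatasetConfig()

# toy proposal: size class 2 ("chair") with zero residuals, centered at the origin
obb = DC.param2obb(np.zeros(3), 0, 0.0, 2, np.zeros(3))  # (7,): cx, cy, cz, dx, dy, dz, heading
corners = get_3d_box(obb[3:6], obb[6], obb[0:3])          # (8, 3) corners, as in predict.py
print(obb, corners.shape)
```
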
/data/scannet/scannet_utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Modified from: https://github.com/facebookresearch/votenet/blob/master/scannet/scannet_utils.py
3 | """
4 |
5 | import os
6 | import sys
7 | import json
8 | import csv
9 | import numpy as np
10 |
11 | try:
12 | from plyfile import PlyData, PlyElement
13 | except:
14 | print("Please install the module 'plyfile' for PLY i/o, e.g.")
15 | print("pip install plyfile")
16 | sys.exit(-1)
17 |
18 | def normalize_v3(arr):
19 | ''' Normalize a numpy array of 3 component vectors shape=(n,3) '''
20 | lens = np.sqrt( arr[:,0]**2 + arr[:,1]**2 + arr[:,2]**2 )
21 | arr[:,0] /= (lens + 1e-8)
22 | arr[:,1] /= (lens + 1e-8)
23 | arr[:,2] /= (lens + 1e-8)
24 | return arr
25 |
26 | def compute_normal(vertices, faces):
27 | #Create a zeroed array with the same type and shape as our vertices i.e., per vertex normal
28 | normals = np.zeros( vertices.shape, dtype=vertices.dtype )
29 | #Create an indexed view into the vertex array using the array of three indices for triangles
30 | tris = vertices[faces]
31 | #Calculate the normal for all the triangles, by taking the cross product of the vectors v1-v0, and v2-v0 in each triangle
32 | n = np.cross( tris[::,1 ] - tris[::,0] , tris[::,2 ] - tris[::,0] )
33 | # n is now an array of normals per triangle. The length of each normal is dependent the vertices,
34 | # we need to normalize these, so that our next step weights each normal equally.
35 | normalize_v3(n)
36 | # now we have a normalized array of normals, one per triangle, i.e., per triangle normals.
37 | # But instead of one per triangle (i.e., flat shading), we add to each vertex in that triangle,
38 | # the triangles' normal. Multiple triangles would then contribute to every vertex, so we need to normalize again afterwards.
39 | # The cool part, we can actually add the normals through an indexed view of our (zeroed) per vertex normal array
40 | normals[ faces[:,0] ] += n
41 | normals[ faces[:,1] ] += n
42 | normals[ faces[:,2] ] += n
43 | normalize_v3(normals)
44 |
45 | return normals
46 |
47 | def represents_int(s):
48 | ''' if string s represents an int. '''
49 | try:
50 | int(s)
51 | return True
52 | except ValueError:
53 | return False
54 |
55 |
56 | def read_label_mapping(filename, label_from='raw_category', label_to='nyu40id'):
57 | assert os.path.isfile(filename)
58 | mapping = dict()
59 | with open(filename) as csvfile:
60 | reader = csv.DictReader(csvfile, delimiter='\t')
61 | for row in reader:
62 | mapping[row[label_from]] = int(row[label_to])
63 | if represents_int(list(mapping.keys())[0]):
64 | mapping = {int(k):v for k,v in mapping.items()}
65 | return mapping
66 |
67 | def read_mesh_vertices(filename):
68 | """ read XYZ for each vertex.
69 | """
70 | assert os.path.isfile(filename)
71 | with open(filename, 'rb') as f:
72 | plydata = PlyData.read(f)
73 | num_verts = plydata['vertex'].count
74 | vertices = np.zeros(shape=[num_verts, 3], dtype=np.float32)
75 | vertices[:,0] = plydata['vertex'].data['x']
76 | vertices[:,1] = plydata['vertex'].data['y']
77 | vertices[:,2] = plydata['vertex'].data['z']
78 | return vertices
79 |
80 | def read_mesh_vertices_rgb(filename):
81 | """ read XYZ RGB for each vertex.
82 | Note: RGB values are in 0-255
83 | """
84 | assert os.path.isfile(filename)
85 | with open(filename, 'rb') as f:
86 | plydata = PlyData.read(f)
87 | num_verts = plydata['vertex'].count
88 | vertices = np.zeros(shape=[num_verts, 6], dtype=np.float32)
89 | vertices[:,0] = plydata['vertex'].data['x']
90 | vertices[:,1] = plydata['vertex'].data['y']
91 | vertices[:,2] = plydata['vertex'].data['z']
92 | vertices[:,3] = plydata['vertex'].data['red']
93 | vertices[:,4] = plydata['vertex'].data['green']
94 | vertices[:,5] = plydata['vertex'].data['blue']
95 | return vertices
96 |
97 | def read_mesh_vertices_rgb_normal(filename):
98 | """ read XYZ RGB normals point cloud from filename PLY file """
99 | assert(os.path.isfile(filename))
100 | with open(filename, 'rb') as f:
101 | plydata = PlyData.read(f)
102 | num_verts = plydata['vertex'].count
103 | vertices = np.zeros(shape=[num_verts, 9], dtype=np.float32)
104 | vertices[:,0] = plydata['vertex'].data['x']
105 | vertices[:,1] = plydata['vertex'].data['y']
106 | vertices[:,2] = plydata['vertex'].data['z']
107 | vertices[:,3] = plydata['vertex'].data['red']
108 | vertices[:,4] = plydata['vertex'].data['green']
109 | vertices[:,5] = plydata['vertex'].data['blue']
110 |
111 | # compute normals
112 | xyz = np.array([[x, y, z] for x, y, z, _, _, _, _ in plydata["vertex"].data])
113 | face = np.array([f[0] for f in plydata["face"].data])
114 | nxnynz = compute_normal(xyz, face)
115 | vertices[:,6:] = nxnynz
116 | return vertices
117 |
--------------------------------------------------------------------------------
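
These helpers are what `load_scannet_data.export` builds on. A small usage sketch, assuming it is run from `data/scannet/` with a downloaded scan in place:

```python
import scannet_utils

# raw ScanNet category strings -> nyu40 ids, as used in load_scannet_data.export
label_map = scannet_utils.read_label_mapping(
    "meta_data/scannetv2-labels.combined.tsv",
    label_from="raw_category", label_to="nyu40id")
print(label_map["chair"])  # 5

# XYZ + RGB (0-255) + per-vertex normals, shape (num_verts, 9)
verts = scannet_utils.read_mesh_vertices_rgb_normal("scans/scene0000_00/scene0000_00_vh_clean_2.ply")
print(verts.shape)
```
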
/data/scannet/visualize.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 |
4 | import numpy as np
5 |
6 | if __name__ == "__main__":
7 | parser = argparse.ArgumentParser()
8 | parser.add_argument("--scene_id", type=str, help="scene id of scene to be visualized", default="scene0000_00")
9 | args = parser.parse_args()
10 |
11 | verts = np.load("scannet_data/{}_vert.npy".format(args.scene_id))
12 | aligned_verts = np.load("scannet_data/{}_aligned_vert.npy".format(args.scene_id))
13 |
14 |
15 | with open("scannet_data/{}_verts.obj".format(args.scene_id), "w") as f:
16 | for i in range(verts.shape[0]):
17 | f.write("v {} {} {} {} {} {}\n".format(
18 | verts[i, 0],
19 | verts[i, 1],
20 | verts[i, 2],
21 | verts[i, 3],
22 | verts[i, 4],
23 | verts[i, 5]
24 | ))
25 |
26 | with open("scannet_data/{}_aligned_verts.obj".format(args.scene_id), "w") as f:
27 | for i in range(aligned_verts.shape[0]):
28 | f.write("v {} {} {} {} {} {}\n".format(
29 | aligned_verts[i, 0],
30 | aligned_verts[i, 1],
31 | aligned_verts[i, 2],
32 | aligned_verts[i, 3],
33 | aligned_verts[i, 4],
34 | aligned_verts[i, 5]
35 | ))
36 |
--------------------------------------------------------------------------------
/demo/ScanRefer.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daveredrum/ScanRefer/9d7483053e8d29acfd4db4eb1bc28f1564f5dddb/demo/ScanRefer.gif
--------------------------------------------------------------------------------
/docs/browser.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daveredrum/ScanRefer/9d7483053e8d29acfd4db4eb1bc28f1564f5dddb/docs/browser.png
--------------------------------------------------------------------------------
/docs/davezchen_eccv2020_scanrefer.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daveredrum/ScanRefer/9d7483053e8d29acfd4db4eb1bc28f1564f5dddb/docs/davezchen_eccv2020_scanrefer.pdf
--------------------------------------------------------------------------------
/docs/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | ScanRefer: 3D Object Localization in RGB-D Scans using Natural Language
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
ScanRefer: 3D Object Localization in RGB-D Scans using Natural Language
23 |
24 |
25 |
European Conference on Computer Vision (ECCV), 2020.
26 |
27 |
28 | Dave Zhenyu Chen 1
29 |
30 | Angel X. Chang 2
31 |
32 | Matthias Nießner 1
33 |
34 |
35 |
36 |
37 | 1 Technical University of Munich
38 |
39 | 2 Simon Fraser University
40 |
41 |
42 |
43 |
Submit to our ScanRefer Localization Benchmark here !
44 |
45 |
46 |
47 |
Introduction
48 |
49 | We introduce the task of 3D object localization in RGB-D scans using natural language descriptions.
50 | As input, we assume a point cloud of a scanned 3D scene along with a free-form description of a specified target object.
51 | To address this task, we propose ScanRefer, learning a fused descriptor from 3D object proposals and encoded sentence embeddings.
52 | This fused descriptor correlates language expressions with geometric features, enabling regression of the 3D bounding box of a target object.
53 | We also introduce the ScanRefer dataset, containing 51,583 descriptions of 11,046 objects from 800 ScanNet scenes.
54 | ScanRefer is the first large-scale effort to perform object localization via natural language expression directly in 3D.
55 |
56 |
57 |
Video
58 |
59 | VIDEO
60 |
61 |
62 |
Browse
63 |
64 | The ScanRefer data can be browsed online in your web browser. Learn more at the ScanRefer Data Browser .
65 | (For a better browsing experience, we recommend using Google Chrome.)
66 |
67 |
68 |
69 |
70 |
Publication
71 | European Conference on Computer Vision (ECCV), 2020.
72 |
Paper |
arXiv |
Code
73 |
74 |
75 |
76 |
77 | If you find our project useful, please consider citing us:
78 |
79 |
80 | @article{chen2020scanrefer,
81 | title={ScanRefer: 3D Object Localization in RGB-D Scans using Natural Language},
82 | author={Chen, Dave Zhenyu and Chang, Angel X and Nie{\ss}ner, Matthias},
83 | journal={16th European Conference on Computer Vision (ECCV)},
84 | year={2020}
85 | }
86 |
87 |
88 |
89 |
Dataset Download
90 |
91 | If you would like to access to the ScanRefer dataset, please fill out the
ScanRefer Terms of Use Form . Once your request is accepted, you will receive an email with the download link.
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
--------------------------------------------------------------------------------
/docs/paper.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daveredrum/ScanRefer/9d7483053e8d29acfd4db4eb1bc28f1564f5dddb/docs/paper.jpg
--------------------------------------------------------------------------------
/docs/teaser.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daveredrum/ScanRefer/9d7483053e8d29acfd4db4eb1bc28f1564f5dddb/docs/teaser.png
--------------------------------------------------------------------------------
/lib/config.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from easydict import EasyDict
4 |
5 | CONF = EasyDict()
6 |
7 | # path
8 | CONF.PATH = EasyDict()
9 | CONF.PATH.BASE = "/home/davech2y/ScanRefer/" # TODO: change this
10 | CONF.PATH.DATA = os.path.join(CONF.PATH.BASE, "data")
11 | CONF.PATH.SCANNET = os.path.join(CONF.PATH.DATA, "scannet")
12 | CONF.PATH.LIB = os.path.join(CONF.PATH.BASE, "lib")
13 | CONF.PATH.MODELS = os.path.join(CONF.PATH.BASE, "models")
14 | CONF.PATH.UTILS = os.path.join(CONF.PATH.BASE, "utils")
15 |
16 | # append to syspath
17 | for _, path in CONF.PATH.items():
18 | sys.path.append(path)
19 |
20 | # scannet data
21 | CONF.PATH.SCANNET_SCANS = os.path.join(CONF.PATH.SCANNET, "scans")
22 | CONF.PATH.SCANNET_META = os.path.join(CONF.PATH.SCANNET, "meta_data")
23 | CONF.PATH.SCANNET_DATA = os.path.join(CONF.PATH.SCANNET, "scannet_data")
24 |
25 | # data
26 | CONF.SCANNET_DIR = "/mnt/canis/Datasets/ScanNet/public/v2/scans" # TODO change this
27 | CONF.SCANNET_FRAMES_ROOT = "/home/davech2y/frames_square/" # TODO change this
28 | CONF.PROJECTION = "/home/davech2y/multiview_projection_scanrefer" # TODO change this
29 | CONF.ENET_FEATURES_ROOT = "/home/davech2y/enet_features" # TODO change this
30 | CONF.ENET_FEATURES_SUBROOT = os.path.join(CONF.ENET_FEATURES_ROOT, "{}") # scene_id
31 | CONF.ENET_FEATURES_PATH = os.path.join(CONF.ENET_FEATURES_SUBROOT, "{}.npy") # frame_id
32 | CONF.SCANNET_FRAMES = os.path.join(CONF.SCANNET_FRAMES_ROOT, "{}/{}") # scene_id, mode
33 | CONF.SCENE_NAMES = sorted(os.listdir(CONF.SCANNET_DIR))
34 | CONF.ENET_WEIGHTS = os.path.join(CONF.PATH.BASE, "data/scannetv2_enet.pth")
35 | # CONF.MULTIVIEW = os.path.join(CONF.PATH.SCANNET_DATA, "enet_feats.hdf5")
36 | CONF.MULTIVIEW = os.path.join(CONF.PATH.SCANNET_DATA, "enet_feats_maxpool.hdf5")
37 | CONF.NYU40_LABELS = os.path.join(CONF.PATH.SCANNET_META, "nyu40_labels.csv")
38 |
39 | # scannet
40 | CONF.SCANNETV2_TRAIN = os.path.join(CONF.PATH.SCANNET_META, "scannetv2_train.txt")
41 | CONF.SCANNETV2_VAL = os.path.join(CONF.PATH.SCANNET_META, "scannetv2_val.txt")
42 | CONF.SCANNETV2_TEST = os.path.join(CONF.PATH.SCANNET_META, "scannetv2_test.txt")
43 | CONF.SCANNETV2_LIST = os.path.join(CONF.PATH.SCANNET_META, "scannetv2.txt")
44 |
45 | # output
46 | CONF.PATH.OUTPUT = os.path.join(CONF.PATH.BASE, "outputs")
47 |
48 | # train
49 | CONF.TRAIN = EasyDict()
50 | CONF.TRAIN.MAX_DES_LEN = 126
51 | CONF.TRAIN.SEED = 42
52 |
--------------------------------------------------------------------------------
/lib/eval_helper.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | #
3 | # This source code is licensed under the MIT license found in the
4 | # LICENSE file in the root directory of this source tree.
5 |
6 | import torch
7 | import torch.nn as nn
8 | import numpy as np
9 | import sys
10 | import os
11 |
12 | sys.path.append(os.path.join(os.getcwd(), "lib")) # HACK add the lib folder
13 | from utils.nn_distance import nn_distance, huber_loss
14 | from lib.ap_helper import parse_predictions
15 | from lib.loss import SoftmaxRankingLoss
16 | from utils.box_util import get_3d_box, get_3d_box_batch, box3d_iou
17 |
18 | def eval_ref_one_sample(pred_bbox, gt_bbox):
19 | """ Evaluate one reference prediction
20 |
21 | Args:
22 | pred_bbox: 8 corners of prediction bounding box, (8, 3)
23 | gt_bbox: 8 corners of ground truth bounding box, (8, 3)
24 | Returns:
25 | iou: intersection over union score
26 | """
27 |
28 | iou = box3d_iou(pred_bbox, gt_bbox)
29 |
30 | return iou
31 |
32 | def construct_bbox_corners(center, box_size):
33 | sx, sy, sz = box_size
34 | x_corners = [sx/2, sx/2, -sx/2, -sx/2, sx/2, sx/2, -sx/2, -sx/2]
35 | y_corners = [sy/2, -sy/2, -sy/2, sy/2, sy/2, -sy/2, -sy/2, sy/2]
36 | z_corners = [sz/2, sz/2, sz/2, sz/2, -sz/2, -sz/2, -sz/2, -sz/2]
37 | corners_3d = np.vstack([x_corners, y_corners, z_corners])
38 | corners_3d[0,:] = corners_3d[0,:] + center[0];
39 | corners_3d[1,:] = corners_3d[1,:] + center[1];
40 | corners_3d[2,:] = corners_3d[2,:] + center[2];
41 | corners_3d = np.transpose(corners_3d)
42 |
43 | return corners_3d
44 |
45 | def get_eval(data_dict, config, reference, use_lang_classifier=False, use_oracle=False, use_cat_rand=False, use_best=False, post_processing=None):
46 | """ Loss functions
47 |
48 | Args:
49 | data_dict: dict
50 | config: dataset config instance
51 | reference: flag (False/True)
52 | post_processing: config dict
53 | Returns:
54 | data_dict: dict, updated in place with the evaluation metrics
55 | (e.g. ref_acc, ref_iou, lang_acc, obj_acc, sem_acc)
56 | """
57 |
58 | batch_size, num_words, _ = data_dict["lang_feat"].shape
59 |
60 |
61 | objectness_preds_batch = torch.argmax(data_dict['objectness_scores'], 2).long()
62 | objectness_labels_batch = data_dict['objectness_label'].long()
63 |
64 | if post_processing:
65 | _ = parse_predictions(data_dict, post_processing)
66 | nms_masks = torch.LongTensor(data_dict['pred_mask']).cuda()
67 |
68 | # construct valid mask
69 | pred_masks = (nms_masks * objectness_preds_batch == 1).float()
70 | label_masks = (objectness_labels_batch == 1).float()
71 | else:
72 | # construct valid mask
73 | pred_masks = (objectness_preds_batch == 1).float()
74 | label_masks = (objectness_labels_batch == 1).float()
75 |
76 | cluster_preds = torch.argmax(data_dict["cluster_ref"] * pred_masks, 1).long().unsqueeze(1).repeat(1, pred_masks.shape[1])
77 | preds = torch.zeros(pred_masks.shape).cuda()
78 | preds = preds.scatter_(1, cluster_preds, 1)
79 | cluster_preds = preds
80 | cluster_labels = data_dict["cluster_labels"].float()
81 | cluster_labels *= label_masks
82 |
83 | # compute classification scores
84 | corrects = torch.sum((cluster_preds == 1) * (cluster_labels == 1), dim=1).float()
85 | labels = torch.ones(corrects.shape[0]).cuda()
86 | ref_acc = corrects / (labels + 1e-8)
87 |
88 | # store
89 | data_dict["ref_acc"] = ref_acc.cpu().numpy().tolist()
90 |
91 | # compute localization metrics
92 | if use_best:
93 | pred_ref = torch.argmax(data_dict["cluster_labels"], 1) # (B,)
94 | # store the calibrated predictions and masks
95 | data_dict['cluster_ref'] = data_dict["cluster_labels"]
96 | if use_cat_rand:
97 | cluster_preds = torch.zeros(cluster_labels.shape).cuda()
98 | for i in range(cluster_preds.shape[0]):
99 | num_bbox = data_dict["num_bbox"][i]
100 | sem_cls_label = data_dict["sem_cls_label"][i]
101 | # sem_cls_label = torch.argmax(end_points["sem_cls_scores"], 2)[i]
102 | sem_cls_label[num_bbox:] -= 1
103 | candidate_masks = torch.gather(sem_cls_label == data_dict["object_cat"][i], 0, data_dict["object_assignment"][i])
104 | candidates = torch.arange(cluster_labels.shape[1])[candidate_masks]
105 | try:
106 | chosen_idx = torch.randperm(candidates.shape[0])[0]
107 | chosen_candidate = candidates[chosen_idx]
108 | cluster_preds[i, chosen_candidate] = 1
109 | except IndexError:
110 | cluster_preds[i, candidates] = 1
111 |
112 | pred_ref = torch.argmax(cluster_preds, 1) # (B,)
113 | # store the calibrated predictions and masks
114 | data_dict['cluster_ref'] = cluster_preds
115 | else:
116 | pred_ref = torch.argmax(data_dict['cluster_ref'] * pred_masks, 1) # (B,)
117 | # store the calibrated predictions and masks
118 | data_dict['cluster_ref'] = data_dict['cluster_ref'] * pred_masks
119 |
120 | if use_oracle:
121 | pred_center = data_dict['center_label'] # (B,MAX_NUM_OBJ,3)
122 | pred_heading_class = data_dict['heading_class_label'] # B,K2
123 | pred_heading_residual = data_dict['heading_residual_label'] # B,K2
124 | pred_size_class = data_dict['size_class_label'] # B,K2
125 | pred_size_residual = data_dict['size_residual_label'] # B,K2,3
126 |
127 | # assign
128 | pred_center = torch.gather(pred_center, 1, data_dict["object_assignment"].unsqueeze(2).repeat(1, 1, 3))
129 | pred_heading_class = torch.gather(pred_heading_class, 1, data_dict["object_assignment"])
130 | pred_heading_residual = torch.gather(pred_heading_residual, 1, data_dict["object_assignment"]).unsqueeze(-1)
131 | pred_size_class = torch.gather(pred_size_class, 1, data_dict["object_assignment"])
132 | pred_size_residual = torch.gather(pred_size_residual, 1, data_dict["object_assignment"].unsqueeze(2).repeat(1, 1, 3))
133 | else:
134 | pred_center = data_dict['center'] # (B,K,3)
135 | pred_heading_class = torch.argmax(data_dict['heading_scores'], -1) # B,num_proposal
136 | pred_heading_residual = torch.gather(data_dict['heading_residuals'], 2, pred_heading_class.unsqueeze(-1)) # B,num_proposal,1
137 | pred_heading_class = pred_heading_class # B,num_proposal
138 | pred_heading_residual = pred_heading_residual.squeeze(2) # B,num_proposal
139 | pred_size_class = torch.argmax(data_dict['size_scores'], -1) # B,num_proposal
140 | pred_size_residual = torch.gather(data_dict['size_residuals'], 2, pred_size_class.unsqueeze(-1).unsqueeze(-1).repeat(1,1,1,3)) # B,num_proposal,1,3
141 | pred_size_class = pred_size_class
142 | pred_size_residual = pred_size_residual.squeeze(2) # B,num_proposal,3
143 |
144 | # store
145 | data_dict["pred_mask"] = pred_masks
146 | data_dict["label_mask"] = label_masks
147 | data_dict['pred_center'] = pred_center
148 | data_dict['pred_heading_class'] = pred_heading_class
149 | data_dict['pred_heading_residual'] = pred_heading_residual
150 | data_dict['pred_size_class'] = pred_size_class
151 | data_dict['pred_size_residual'] = pred_size_residual
152 |
153 | gt_ref = torch.argmax(data_dict["ref_box_label"], 1)
154 | gt_center = data_dict['center_label'] # (B,MAX_NUM_OBJ,3)
155 | gt_heading_class = data_dict['heading_class_label'] # B,K2
156 | gt_heading_residual = data_dict['heading_residual_label'] # B,K2
157 | gt_size_class = data_dict['size_class_label'] # B,K2
158 | gt_size_residual = data_dict['size_residual_label'] # B,K2,3
159 |
160 | ious = []
161 | multiple = []
162 | others = []
163 | pred_bboxes = []
164 | gt_bboxes = []
165 | for i in range(pred_ref.shape[0]):
166 | # compute the iou
167 | pred_ref_idx, gt_ref_idx = pred_ref[i], gt_ref[i]
168 | pred_obb = config.param2obb(
169 | pred_center[i, pred_ref_idx, 0:3].detach().cpu().numpy(),
170 | pred_heading_class[i, pred_ref_idx].detach().cpu().numpy(),
171 | pred_heading_residual[i, pred_ref_idx].detach().cpu().numpy(),
172 | pred_size_class[i, pred_ref_idx].detach().cpu().numpy(),
173 | pred_size_residual[i, pred_ref_idx].detach().cpu().numpy()
174 | )
175 | gt_obb = config.param2obb(
176 | gt_center[i, gt_ref_idx, 0:3].detach().cpu().numpy(),
177 | gt_heading_class[i, gt_ref_idx].detach().cpu().numpy(),
178 | gt_heading_residual[i, gt_ref_idx].detach().cpu().numpy(),
179 | gt_size_class[i, gt_ref_idx].detach().cpu().numpy(),
180 | gt_size_residual[i, gt_ref_idx].detach().cpu().numpy()
181 | )
182 | pred_bbox = get_3d_box(pred_obb[3:6], pred_obb[6], pred_obb[0:3])
183 | gt_bbox = get_3d_box(gt_obb[3:6], gt_obb[6], gt_obb[0:3])
184 | iou = eval_ref_one_sample(pred_bbox, gt_bbox)
185 | ious.append(iou)
186 |
187 | # NOTE: get_3d_box() will return problematic bboxes
188 | pred_bbox = construct_bbox_corners(pred_obb[0:3], pred_obb[3:6])
189 | gt_bbox = construct_bbox_corners(gt_obb[0:3], gt_obb[3:6])
190 | pred_bboxes.append(pred_bbox)
191 | gt_bboxes.append(gt_bbox)
192 |
193 | # construct the multiple mask
194 | multiple.append(data_dict["unique_multiple"][i].item())
195 |
196 | # construct the others mask
197 | flag = 1 if data_dict["object_cat"][i] == 17 else 0
198 | others.append(flag)
199 |
200 | # lang
201 | if reference and use_lang_classifier:
202 | data_dict["lang_acc"] = (torch.argmax(data_dict['lang_scores'], 1) == data_dict["object_cat"]).float().mean()
203 | else:
204 | data_dict["lang_acc"] = torch.zeros(1)[0].cuda()
205 |
206 | # store
207 | data_dict["ref_iou"] = ious
208 | data_dict["ref_iou_rate_0.25"] = np.array(ious)[np.array(ious) >= 0.25].shape[0] / np.array(ious).shape[0]
209 | data_dict["ref_iou_rate_0.5"] = np.array(ious)[np.array(ious) >= 0.5].shape[0] / np.array(ious).shape[0]
210 | data_dict["ref_multiple_mask"] = multiple
211 | data_dict["ref_others_mask"] = others
212 | data_dict["pred_bboxes"] = pred_bboxes
213 | data_dict["gt_bboxes"] = gt_bboxes
214 |
215 | # --------------------------------------------
216 | # Some other statistics
217 | obj_pred_val = torch.argmax(data_dict['objectness_scores'], 2) # B,K
218 | obj_acc = torch.sum((obj_pred_val==data_dict['objectness_label'].long()).float()*data_dict['objectness_mask'])/(torch.sum(data_dict['objectness_mask'])+1e-6)
219 | data_dict['obj_acc'] = obj_acc
220 | # detection semantic classification
221 | sem_cls_label = torch.gather(data_dict['sem_cls_label'], 1, data_dict['object_assignment']) # select (B,K) from (B,K2)
222 | sem_cls_pred = data_dict['sem_cls_scores'].argmax(-1) # (B,K)
223 | sem_match = (sem_cls_label == sem_cls_pred).float()
224 | data_dict["sem_acc"] = (sem_match * data_dict["pred_mask"]).sum() / data_dict["pred_mask"].sum()
225 |
226 | return data_dict
227 |
--------------------------------------------------------------------------------
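Note: a minimal usage sketch for the helpers in eval_helper.py above (not part of the repository). It builds the 8 box corners from a (center, size) pair and scores a single prediction with the 3D IoU; the centers and sizes below are made up.

    from lib.eval_helper import construct_bbox_corners, eval_ref_one_sample

    # hypothetical predicted and ground-truth boxes given as (center, box_size)
    pred_corners = construct_bbox_corners([1.00, 2.00, 0.50], [0.80, 0.60, 1.20])  # (8, 3) corners
    gt_corners = construct_bbox_corners([1.10, 2.05, 0.50], [0.80, 0.60, 1.20])    # (8, 3) corners
    iou = eval_ref_one_sample(pred_corners, gt_corners)  # 3D IoU via utils.box_util.box3d_iou
    print("IoU: {:.3f}".format(iou))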
/lib/loss.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | class SoftmaxRankingLoss(nn.Module):
6 | def __init__(self):
7 | super().__init__()
8 |
9 | def forward(self, inputs, targets):
10 | # input check
11 | assert inputs.shape == targets.shape
12 |
13 | # compute the probabilities
14 | probs = F.softmax(inputs + 1e-8, dim=1)
15 |
16 | # reduction
17 | loss = -torch.sum(torch.log(probs + 1e-8) * targets, dim=1).mean()
18 |
19 | return loss
--------------------------------------------------------------------------------
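Note: a minimal sketch of how SoftmaxRankingLoss is typically applied (not part of the repository). The scores are per-proposal confidences of shape (batch, num_proposals) and the target is a one-hot mask over proposals; all shapes and indices below are illustrative.

    import torch
    from lib.loss import SoftmaxRankingLoss

    criterion = SoftmaxRankingLoss()
    scores = torch.randn(4, 256)                                # per-proposal confidence scores
    targets = torch.zeros(4, 256)
    targets[torch.arange(4), torch.tensor([3, 10, 42, 7])] = 1  # mark the ground-truth proposal per sample
    loss = criterion(scores, targets)                           # mean over the batch of -sum(targets * log softmax(scores))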
/lib/pointnet2/_ext_src/include/ball_query.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <torch/extension.h>
3 |
4 | at::Tensor ball_query(at::Tensor new_xyz, at::Tensor xyz, const float radius,
5 | const int nsample);
6 |
--------------------------------------------------------------------------------
/lib/pointnet2/_ext_src/include/cuda_utils.h:
--------------------------------------------------------------------------------
1 | #ifndef _CUDA_UTILS_H
2 | #define _CUDA_UTILS_H
3 |
4 | #include <ATen/ATen.h>
5 | #include <ATen/cuda/CUDAContext.h>
6 | #include <cmath>
7 |
8 | #include <cuda.h>
9 | #include <cuda_runtime.h>
10 |
11 | #include <vector>
12 |
13 | #define TOTAL_THREADS 512
14 |
15 | inline int opt_n_threads(int work_size) {
16 | const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);
17 |
18 | return max(min(1 << pow_2, TOTAL_THREADS), 1);
19 | }
20 |
21 | inline dim3 opt_block_config(int x, int y) {
22 | const int x_threads = opt_n_threads(x);
23 | const int y_threads =
24 | max(min(opt_n_threads(y), TOTAL_THREADS / x_threads), 1);
25 | dim3 block_config(x_threads, y_threads, 1);
26 |
27 | return block_config;
28 | }
29 |
30 | #define CUDA_CHECK_ERRORS() \
31 | do { \
32 | cudaError_t err = cudaGetLastError(); \
33 | if (cudaSuccess != err) { \
34 | fprintf(stderr, "CUDA kernel failed : %s\n%s at L:%d in %s\n", \
35 | cudaGetErrorString(err), __PRETTY_FUNCTION__, __LINE__, \
36 | __FILE__); \
37 | exit(-1); \
38 | } \
39 | } while (0)
40 |
41 | #endif
42 |
--------------------------------------------------------------------------------
/lib/pointnet2/_ext_src/include/group_points.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <torch/extension.h>
3 |
4 | at::Tensor group_points(at::Tensor points, at::Tensor idx);
5 | at::Tensor group_points_grad(at::Tensor grad_out, at::Tensor idx, const int n);
6 |
--------------------------------------------------------------------------------
/lib/pointnet2/_ext_src/include/interpolate.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include <torch/extension.h>
4 | #include <vector>
5 |
6 | std::vector<at::Tensor> three_nn(at::Tensor unknowns, at::Tensor knows);
7 | at::Tensor three_interpolate(at::Tensor points, at::Tensor idx,
8 | at::Tensor weight);
9 | at::Tensor three_interpolate_grad(at::Tensor grad_out, at::Tensor idx,
10 | at::Tensor weight, const int m);
11 |
--------------------------------------------------------------------------------
/lib/pointnet2/_ext_src/include/sampling.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <torch/extension.h>
3 |
4 | at::Tensor gather_points(at::Tensor points, at::Tensor idx);
5 | at::Tensor gather_points_grad(at::Tensor grad_out, at::Tensor idx, const int n);
6 | at::Tensor furthest_point_sampling(at::Tensor points, const int nsamples);
7 |
--------------------------------------------------------------------------------
/lib/pointnet2/_ext_src/include/utils.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <ATen/cuda/CUDAContext.h>
3 | #include <torch/extension.h>
4 |
5 | #define CHECK_CUDA(x) \
6 | do { \
7 | AT_ASSERT(x.is_cuda(), #x " must be a CUDA tensor"); \
8 | } while (0)
9 |
10 | #define CHECK_CONTIGUOUS(x) \
11 | do { \
12 | AT_ASSERT(x.is_contiguous(), #x " must be a contiguous tensor"); \
13 | } while (0)
14 |
15 | #define CHECK_IS_INT(x) \
16 | do { \
17 | AT_ASSERT(x.scalar_type() == at::ScalarType::Int, \
18 | #x " must be an int tensor"); \
19 | } while (0)
20 |
21 | #define CHECK_IS_FLOAT(x) \
22 | do { \
23 | AT_ASSERT(x.scalar_type() == at::ScalarType::Float, \
24 | #x " must be a float tensor"); \
25 | } while (0)
26 |
--------------------------------------------------------------------------------
/lib/pointnet2/_ext_src/src/ball_query.cpp:
--------------------------------------------------------------------------------
1 | #include "ball_query.h"
2 | #include "utils.h"
3 |
4 | void query_ball_point_kernel_wrapper(int b, int n, int m, float radius,
5 | int nsample, const float *new_xyz,
6 | const float *xyz, int *idx);
7 |
8 | at::Tensor ball_query(at::Tensor new_xyz, at::Tensor xyz, const float radius,
9 | const int nsample) {
10 | CHECK_CONTIGUOUS(new_xyz);
11 | CHECK_CONTIGUOUS(xyz);
12 | CHECK_IS_FLOAT(new_xyz);
13 | CHECK_IS_FLOAT(xyz);
14 |
15 | if (new_xyz.is_cuda()) {
16 | CHECK_CUDA(xyz);
17 | }
18 |
19 | at::Tensor idx =
20 | torch::zeros({new_xyz.size(0), new_xyz.size(1), nsample},
21 | at::device(new_xyz.device()).dtype(at::ScalarType::Int));
22 |
23 | if (new_xyz.is_cuda()) {
24 | query_ball_point_kernel_wrapper(xyz.size(0), xyz.size(1), new_xyz.size(1),
25 | radius, nsample, new_xyz.data_ptr<float>(),
26 | xyz.data_ptr<float>(), idx.data_ptr<int>());
27 | } else {
28 | AT_ASSERT(false, "CPU not supported");
29 | }
30 |
31 | return idx;
32 | }
33 |
--------------------------------------------------------------------------------
/lib/pointnet2/_ext_src/src/ball_query_gpu.cu:
--------------------------------------------------------------------------------
1 | #include <math.h>
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 |
5 | #include "cuda_utils.h"
6 |
7 | // input: new_xyz(b, m, 3) xyz(b, n, 3)
8 | // output: idx(b, m, nsample)
9 | __global__ void query_ball_point_kernel(int b, int n, int m, float radius,
10 | int nsample,
11 | const float *__restrict__ new_xyz,
12 | const float *__restrict__ xyz,
13 | int *__restrict__ idx) {
14 | int batch_index = blockIdx.x;
15 | xyz += batch_index * n * 3;
16 | new_xyz += batch_index * m * 3;
17 | idx += m * nsample * batch_index;
18 |
19 | int index = threadIdx.x;
20 | int stride = blockDim.x;
21 |
22 | float radius2 = radius * radius;
23 | for (int j = index; j < m; j += stride) {
24 | float new_x = new_xyz[j * 3 + 0];
25 | float new_y = new_xyz[j * 3 + 1];
26 | float new_z = new_xyz[j * 3 + 2];
27 | for (int k = 0, cnt = 0; k < n && cnt < nsample; ++k) {
28 | float x = xyz[k * 3 + 0];
29 | float y = xyz[k * 3 + 1];
30 | float z = xyz[k * 3 + 2];
31 | float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
32 | (new_z - z) * (new_z - z);
33 | if (d2 < radius2) {
34 | if (cnt == 0) {
35 | for (int l = 0; l < nsample; ++l) {
36 | idx[j * nsample + l] = k;
37 | }
38 | }
39 | idx[j * nsample + cnt] = k;
40 | ++cnt;
41 | }
42 | }
43 | }
44 | }
45 |
46 | void query_ball_point_kernel_wrapper(int b, int n, int m, float radius,
47 | int nsample, const float *new_xyz,
48 | const float *xyz, int *idx) {
49 | cudaStream_t stream = at::cuda::getCurrentCUDAStream();
50 | query_ball_point_kernel<<<b, opt_n_threads(m), 0, stream>>>(
51 | b, n, m, radius, nsample, new_xyz, xyz, idx);
52 |
53 | CUDA_CHECK_ERRORS();
54 | }
55 |
--------------------------------------------------------------------------------
/lib/pointnet2/_ext_src/src/bindings.cpp:
--------------------------------------------------------------------------------
1 | #include "ball_query.h"
2 | #include "group_points.h"
3 | #include "interpolate.h"
4 | #include "sampling.h"
5 |
6 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
7 | m.def("gather_points", &gather_points);
8 | m.def("gather_points_grad", &gather_points_grad);
9 | m.def("furthest_point_sampling", &furthest_point_sampling);
10 |
11 | m.def("three_nn", &three_nn);
12 | m.def("three_interpolate", &three_interpolate);
13 | m.def("three_interpolate_grad", &three_interpolate_grad);
14 |
15 | m.def("ball_query", &ball_query);
16 |
17 | m.def("group_points", &group_points);
18 | m.def("group_points_grad", &group_points_grad);
19 | }
20 |
--------------------------------------------------------------------------------
/lib/pointnet2/_ext_src/src/group_points.cpp:
--------------------------------------------------------------------------------
1 | #include "group_points.h"
2 | #include "utils.h"
3 |
4 | void group_points_kernel_wrapper(int b, int c, int n, int npoints, int nsample,
5 | const float *points, const int *idx,
6 | float *out);
7 |
8 | void group_points_grad_kernel_wrapper(int b, int c, int n, int npoints,
9 | int nsample, const float *grad_out,
10 | const int *idx, float *grad_points);
11 |
12 | at::Tensor group_points(at::Tensor points, at::Tensor idx) {
13 | CHECK_CONTIGUOUS(points);
14 | CHECK_CONTIGUOUS(idx);
15 | CHECK_IS_FLOAT(points);
16 | CHECK_IS_INT(idx);
17 |
18 | if (points.is_cuda()) {
19 | CHECK_CUDA(idx);
20 | }
21 |
22 | at::Tensor output =
23 | torch::zeros({points.size(0), points.size(1), idx.size(1), idx.size(2)},
24 | at::device(points.device()).dtype(at::ScalarType::Float));
25 |
26 | if (points.is_cuda()) {
27 | group_points_kernel_wrapper(points.size(0), points.size(1), points.size(2),
28 | idx.size(1), idx.size(2),
29 | points.data_ptr<float>(), idx.data_ptr<int>(),
30 | output.data_ptr<float>());
31 | } else {
32 | AT_ASSERT(false, "CPU not supported");
33 | }
34 |
35 | return output;
36 | }
37 |
38 | at::Tensor group_points_grad(at::Tensor grad_out, at::Tensor idx, const int n) {
39 | CHECK_CONTIGUOUS(grad_out);
40 | CHECK_CONTIGUOUS(idx);
41 | CHECK_IS_FLOAT(grad_out);
42 | CHECK_IS_INT(idx);
43 |
44 | if (grad_out.is_cuda()) {
45 | CHECK_CUDA(idx);
46 | }
47 |
48 | at::Tensor output =
49 | torch::zeros({grad_out.size(0), grad_out.size(1), n},
50 | at::device(grad_out.device()).dtype(at::ScalarType::Float));
51 |
52 | if (grad_out.is_cuda()) {
53 | group_points_grad_kernel_wrapper(
54 | grad_out.size(0), grad_out.size(1), n, idx.size(1), idx.size(2),
55 | grad_out.data_ptr<float>(), idx.data_ptr<int>(),
56 | output.data_ptr<float>());
57 | } else {
58 | AT_ASSERT(false, "CPU not supported");
59 | }
60 |
61 | return output;
62 | }
63 |
--------------------------------------------------------------------------------
/lib/pointnet2/_ext_src/src/group_points_gpu.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 |
4 | #include "cuda_utils.h"
5 |
6 | // input: points(b, c, n) idx(b, npoints, nsample)
7 | // output: out(b, c, npoints, nsample)
8 | __global__ void group_points_kernel(int b, int c, int n, int npoints,
9 | int nsample,
10 | const float *__restrict__ points,
11 | const int *__restrict__ idx,
12 | float *__restrict__ out) {
13 | int batch_index = blockIdx.x;
14 | points += batch_index * n * c;
15 | idx += batch_index * npoints * nsample;
16 | out += batch_index * npoints * nsample * c;
17 |
18 | const int index = threadIdx.y * blockDim.x + threadIdx.x;
19 | const int stride = blockDim.y * blockDim.x;
20 | for (int i = index; i < c * npoints; i += stride) {
21 | const int l = i / npoints;
22 | const int j = i % npoints;
23 | for (int k = 0; k < nsample; ++k) {
24 | int ii = idx[j * nsample + k];
25 | out[(l * npoints + j) * nsample + k] = points[l * n + ii];
26 | }
27 | }
28 | }
29 |
30 | void group_points_kernel_wrapper(int b, int c, int n, int npoints, int nsample,
31 | const float *points, const int *idx,
32 | float *out) {
33 | cudaStream_t stream = at::cuda::getCurrentCUDAStream();
34 |
35 | group_points_kernel<<<b, opt_block_config(npoints, c), 0, stream>>>(
36 | b, c, n, npoints, nsample, points, idx, out);
37 |
38 | CUDA_CHECK_ERRORS();
39 | }
40 |
41 | // input: grad_out(b, c, npoints, nsample), idx(b, npoints, nsample)
42 | // output: grad_points(b, c, n)
43 | __global__ void group_points_grad_kernel(int b, int c, int n, int npoints,
44 | int nsample,
45 | const float *__restrict__ grad_out,
46 | const int *__restrict__ idx,
47 | float *__restrict__ grad_points) {
48 | int batch_index = blockIdx.x;
49 | grad_out += batch_index * npoints * nsample * c;
50 | idx += batch_index * npoints * nsample;
51 | grad_points += batch_index * n * c;
52 |
53 | const int index = threadIdx.y * blockDim.x + threadIdx.x;
54 | const int stride = blockDim.y * blockDim.x;
55 | for (int i = index; i < c * npoints; i += stride) {
56 | const int l = i / npoints;
57 | const int j = i % npoints;
58 | for (int k = 0; k < nsample; ++k) {
59 | int ii = idx[j * nsample + k];
60 | atomicAdd(grad_points + l * n + ii,
61 | grad_out[(l * npoints + j) * nsample + k]);
62 | }
63 | }
64 | }
65 |
66 | void group_points_grad_kernel_wrapper(int b, int c, int n, int npoints,
67 | int nsample, const float *grad_out,
68 | const int *idx, float *grad_points) {
69 | cudaStream_t stream = at::cuda::getCurrentCUDAStream();
70 |
71 | group_points_grad_kernel<<<b, opt_block_config(npoints, c), 0, stream>>>(
72 | b, c, n, npoints, nsample, grad_out, idx, grad_points);
73 |
74 | CUDA_CHECK_ERRORS();
75 | }
76 |
--------------------------------------------------------------------------------
/lib/pointnet2/_ext_src/src/interpolate.cpp:
--------------------------------------------------------------------------------
1 | #include "interpolate.h"
2 | #include "utils.h"
3 |
4 | void three_nn_kernel_wrapper(int b, int n, int m, const float *unknown,
5 | const float *known, float *dist2, int *idx);
6 | void three_interpolate_kernel_wrapper(int b, int c, int m, int n,
7 | const float *points, const int *idx,
8 | const float *weight, float *out);
9 | void three_interpolate_grad_kernel_wrapper(int b, int c, int n, int m,
10 | const float *grad_out,
11 | const int *idx, const float *weight,
12 | float *grad_points);
13 |
14 | std::vector<at::Tensor> three_nn(at::Tensor unknowns, at::Tensor knows) {
15 | CHECK_CONTIGUOUS(unknowns);
16 | CHECK_CONTIGUOUS(knows);
17 | CHECK_IS_FLOAT(unknowns);
18 | CHECK_IS_FLOAT(knows);
19 |
20 | if (unknowns.is_cuda()) {
21 | CHECK_CUDA(knows);
22 | }
23 |
24 | at::Tensor idx =
25 | torch::zeros({unknowns.size(0), unknowns.size(1), 3},
26 | at::device(unknowns.device()).dtype(at::ScalarType::Int));
27 | at::Tensor dist2 =
28 | torch::zeros({unknowns.size(0), unknowns.size(1), 3},
29 | at::device(unknowns.device()).dtype(at::ScalarType::Float));
30 |
31 | if (unknowns.is_cuda()) {
32 | three_nn_kernel_wrapper(unknowns.size(0), unknowns.size(1), knows.size(1),
33 | unknowns.data_ptr<float>(), knows.data_ptr<float>(),
34 | dist2.data_ptr<float>(), idx.data_ptr<int>());
35 | } else {
36 | AT_ASSERT(false, "CPU not supported");
37 | }
38 |
39 | return {dist2, idx};
40 | }
41 |
42 | at::Tensor three_interpolate(at::Tensor points, at::Tensor idx,
43 | at::Tensor weight) {
44 | CHECK_CONTIGUOUS(points);
45 | CHECK_CONTIGUOUS(idx);
46 | CHECK_CONTIGUOUS(weight);
47 | CHECK_IS_FLOAT(points);
48 | CHECK_IS_INT(idx);
49 | CHECK_IS_FLOAT(weight);
50 |
51 | if (points.is_cuda()) {
52 | CHECK_CUDA(idx);
53 | CHECK_CUDA(weight);
54 | }
55 |
56 | at::Tensor output =
57 | torch::zeros({points.size(0), points.size(1), idx.size(1)},
58 | at::device(points.device()).dtype(at::ScalarType::Float));
59 |
60 | if (points.is_cuda()) {
61 | three_interpolate_kernel_wrapper(
62 | points.size(0), points.size(1), points.size(2), idx.size(1),
63 | points.data_ptr<float>(), idx.data_ptr<int>(), weight.data_ptr<float>(),
64 | output.data_ptr<float>());
65 | } else {
66 | AT_ASSERT(false, "CPU not supported");
67 | }
68 |
69 | return output;
70 | }
71 | at::Tensor three_interpolate_grad(at::Tensor grad_out, at::Tensor idx,
72 | at::Tensor weight, const int m) {
73 | CHECK_CONTIGUOUS(grad_out);
74 | CHECK_CONTIGUOUS(idx);
75 | CHECK_CONTIGUOUS(weight);
76 | CHECK_IS_FLOAT(grad_out);
77 | CHECK_IS_INT(idx);
78 | CHECK_IS_FLOAT(weight);
79 |
80 | if (grad_out.is_cuda()) {
81 | CHECK_CUDA(idx);
82 | CHECK_CUDA(weight);
83 | }
84 |
85 | at::Tensor output =
86 | torch::zeros({grad_out.size(0), grad_out.size(1), m},
87 | at::device(grad_out.device()).dtype(at::ScalarType::Float));
88 |
89 | if (grad_out.is_cuda()) {
90 | three_interpolate_grad_kernel_wrapper(
91 | grad_out.size(0), grad_out.size(1), grad_out.size(2), m,
92 | grad_out.data_ptr<float>(), idx.data_ptr<int>(),
93 | weight.data_ptr<float>(), output.data_ptr<float>());
94 | } else {
95 | AT_ASSERT(false, "CPU not supported");
96 | }
97 |
98 | return output;
99 | }
100 |
--------------------------------------------------------------------------------
/lib/pointnet2/_ext_src/src/interpolate_gpu.cu:
--------------------------------------------------------------------------------
1 | #include <math.h>
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 |
5 | #include "cuda_utils.h"
6 |
7 | // input: unknown(b, n, 3) known(b, m, 3)
8 | // output: dist2(b, n, 3), idx(b, n, 3)
9 | __global__ void three_nn_kernel(int b, int n, int m,
10 | const float *__restrict__ unknown,
11 | const float *__restrict__ known,
12 | float *__restrict__ dist2,
13 | int *__restrict__ idx) {
14 | int batch_index = blockIdx.x;
15 | unknown += batch_index * n * 3;
16 | known += batch_index * m * 3;
17 | dist2 += batch_index * n * 3;
18 | idx += batch_index * n * 3;
19 |
20 | int index = threadIdx.x;
21 | int stride = blockDim.x;
22 | for (int j = index; j < n; j += stride) {
23 | float ux = unknown[j * 3 + 0];
24 | float uy = unknown[j * 3 + 1];
25 | float uz = unknown[j * 3 + 2];
26 |
27 | double best1 = 1e40, best2 = 1e40, best3 = 1e40;
28 | int besti1 = 0, besti2 = 0, besti3 = 0;
29 | for (int k = 0; k < m; ++k) {
30 | float x = known[k * 3 + 0];
31 | float y = known[k * 3 + 1];
32 | float z = known[k * 3 + 2];
33 | float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
34 | if (d < best1) {
35 | best3 = best2;
36 | besti3 = besti2;
37 | best2 = best1;
38 | besti2 = besti1;
39 | best1 = d;
40 | besti1 = k;
41 | } else if (d < best2) {
42 | best3 = best2;
43 | besti3 = besti2;
44 | best2 = d;
45 | besti2 = k;
46 | } else if (d < best3) {
47 | best3 = d;
48 | besti3 = k;
49 | }
50 | }
51 | dist2[j * 3 + 0] = best1;
52 | dist2[j * 3 + 1] = best2;
53 | dist2[j * 3 + 2] = best3;
54 |
55 | idx[j * 3 + 0] = besti1;
56 | idx[j * 3 + 1] = besti2;
57 | idx[j * 3 + 2] = besti3;
58 | }
59 | }
60 |
61 | void three_nn_kernel_wrapper(int b, int n, int m, const float *unknown,
62 | const float *known, float *dist2, int *idx) {
63 | cudaStream_t stream = at::cuda::getCurrentCUDAStream();
64 | three_nn_kernel<<<b, opt_n_threads(n), 0, stream>>>(b, n, m, unknown, known,
65 | dist2, idx);
66 |
67 | CUDA_CHECK_ERRORS();
68 | }
69 |
70 | // input: points(b, c, m), idx(b, n, 3), weight(b, n, 3)
71 | // output: out(b, c, n)
72 | __global__ void three_interpolate_kernel(int b, int c, int m, int n,
73 | const float *__restrict__ points,
74 | const int *__restrict__ idx,
75 | const float *__restrict__ weight,
76 | float *__restrict__ out) {
77 | int batch_index = blockIdx.x;
78 | points += batch_index * m * c;
79 |
80 | idx += batch_index * n * 3;
81 | weight += batch_index * n * 3;
82 |
83 | out += batch_index * n * c;
84 |
85 | const int index = threadIdx.y * blockDim.x + threadIdx.x;
86 | const int stride = blockDim.y * blockDim.x;
87 | for (int i = index; i < c * n; i += stride) {
88 | const int l = i / n;
89 | const int j = i % n;
90 | float w1 = weight[j * 3 + 0];
91 | float w2 = weight[j * 3 + 1];
92 | float w3 = weight[j * 3 + 2];
93 |
94 | int i1 = idx[j * 3 + 0];
95 | int i2 = idx[j * 3 + 1];
96 | int i3 = idx[j * 3 + 2];
97 |
98 | out[i] = points[l * m + i1] * w1 + points[l * m + i2] * w2 +
99 | points[l * m + i3] * w3;
100 | }
101 | }
102 |
103 | void three_interpolate_kernel_wrapper(int b, int c, int m, int n,
104 | const float *points, const int *idx,
105 | const float *weight, float *out) {
106 | cudaStream_t stream = at::cuda::getCurrentCUDAStream();
107 | three_interpolate_kernel<<<b, opt_block_config(n, c), 0, stream>>>(
108 | b, c, m, n, points, idx, weight, out);
109 |
110 | CUDA_CHECK_ERRORS();
111 | }
112 |
113 | // input: grad_out(b, c, n), idx(b, n, 3), weight(b, n, 3)
114 | // output: grad_points(b, c, m)
115 |
116 | __global__ void three_interpolate_grad_kernel(
117 | int b, int c, int n, int m, const float *__restrict__ grad_out,
118 | const int *__restrict__ idx, const float *__restrict__ weight,
119 | float *__restrict__ grad_points) {
120 | int batch_index = blockIdx.x;
121 | grad_out += batch_index * n * c;
122 | idx += batch_index * n * 3;
123 | weight += batch_index * n * 3;
124 | grad_points += batch_index * m * c;
125 |
126 | const int index = threadIdx.y * blockDim.x + threadIdx.x;
127 | const int stride = blockDim.y * blockDim.x;
128 | for (int i = index; i < c * n; i += stride) {
129 | const int l = i / n;
130 | const int j = i % n;
131 | float w1 = weight[j * 3 + 0];
132 | float w2 = weight[j * 3 + 1];
133 | float w3 = weight[j * 3 + 2];
134 |
135 | int i1 = idx[j * 3 + 0];
136 | int i2 = idx[j * 3 + 1];
137 | int i3 = idx[j * 3 + 2];
138 |
139 | atomicAdd(grad_points + l * m + i1, grad_out[i] * w1);
140 | atomicAdd(grad_points + l * m + i2, grad_out[i] * w2);
141 | atomicAdd(grad_points + l * m + i3, grad_out[i] * w3);
142 | }
143 | }
144 |
145 | void three_interpolate_grad_kernel_wrapper(int b, int c, int n, int m,
146 | const float *grad_out,
147 | const int *idx, const float *weight,
148 | float *grad_points) {
149 | cudaStream_t stream = at::cuda::getCurrentCUDAStream();
150 | three_interpolate_grad_kernel<<<b, opt_block_config(n, c), 0, stream>>>(
151 | b, c, n, m, grad_out, idx, weight, grad_points);
152 |
153 | CUDA_CHECK_ERRORS();
154 | }
155 |
--------------------------------------------------------------------------------
/lib/pointnet2/_ext_src/src/sampling.cpp:
--------------------------------------------------------------------------------
1 | #include "sampling.h"
2 | #include "utils.h"
3 |
4 | void gather_points_kernel_wrapper(int b, int c, int n, int npoints,
5 | const float *points, const int *idx,
6 | float *out);
7 | void gather_points_grad_kernel_wrapper(int b, int c, int n, int npoints,
8 | const float *grad_out, const int *idx,
9 | float *grad_points);
10 |
11 | void furthest_point_sampling_kernel_wrapper(int b, int n, int m,
12 | const float *dataset, float *temp,
13 | int *idxs);
14 |
15 | at::Tensor gather_points(at::Tensor points, at::Tensor idx) {
16 | CHECK_CONTIGUOUS(points);
17 | CHECK_CONTIGUOUS(idx);
18 | CHECK_IS_FLOAT(points);
19 | CHECK_IS_INT(idx);
20 |
21 | if (points.is_cuda()) {
22 | CHECK_CUDA(idx);
23 | }
24 |
25 | at::Tensor output =
26 | torch::zeros({points.size(0), points.size(1), idx.size(1)},
27 | at::device(points.device()).dtype(at::ScalarType::Float));
28 |
29 | if (points.is_cuda()) {
30 | gather_points_kernel_wrapper(points.size(0), points.size(1), points.size(2),
31 | idx.size(1), points.data_ptr<float>(),
32 | idx.data_ptr<int>(), output.data_ptr<float>());
33 | } else {
34 | AT_ASSERT(false, "CPU not supported");
35 | }
36 |
37 | return output;
38 | }
39 |
40 | at::Tensor gather_points_grad(at::Tensor grad_out, at::Tensor idx,
41 | const int n) {
42 | CHECK_CONTIGUOUS(grad_out);
43 | CHECK_CONTIGUOUS(idx);
44 | CHECK_IS_FLOAT(grad_out);
45 | CHECK_IS_INT(idx);
46 |
47 | if (grad_out.is_cuda()) {
48 | CHECK_CUDA(idx);
49 | }
50 |
51 | at::Tensor output =
52 | torch::zeros({grad_out.size(0), grad_out.size(1), n},
53 | at::device(grad_out.device()).dtype(at::ScalarType::Float));
54 |
55 | if (grad_out.is_cuda()) {
56 | gather_points_grad_kernel_wrapper(grad_out.size(0), grad_out.size(1), n,
57 | idx.size(1), grad_out.data_ptr<float>(),
58 | idx.data_ptr<int>(),
59 | output.data_ptr<float>());
60 | } else {
61 | AT_ASSERT(false, "CPU not supported");
62 | }
63 |
64 | return output;
65 | }
66 | at::Tensor furthest_point_sampling(at::Tensor points, const int nsamples) {
67 | CHECK_CONTIGUOUS(points);
68 | CHECK_IS_FLOAT(points);
69 |
70 | at::Tensor output =
71 | torch::zeros({points.size(0), nsamples},
72 | at::device(points.device()).dtype(at::ScalarType::Int));
73 |
74 | at::Tensor tmp =
75 | torch::full({points.size(0), points.size(1)}, 1e10,
76 | at::device(points.device()).dtype(at::ScalarType::Float));
77 |
78 | if (points.is_cuda()) {
79 | furthest_point_sampling_kernel_wrapper(
80 | points.size(0), points.size(1), nsamples, points.data_ptr<float>(),
81 | tmp.data_ptr<float>(), output.data_ptr<int>());
82 | } else {
83 | AT_ASSERT(false, "CPU not supported");
84 | }
85 |
86 | return output;
87 | }
88 |
--------------------------------------------------------------------------------
/lib/pointnet2/_ext_src/src/sampling_gpu.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 |
4 | #include "cuda_utils.h"
5 |
6 | // input: points(b, c, n) idx(b, m)
7 | // output: out(b, c, m)
8 | __global__ void gather_points_kernel(int b, int c, int n, int m,
9 | const float *__restrict__ points,
10 | const int *__restrict__ idx,
11 | float *__restrict__ out) {
12 | for (int i = blockIdx.x; i < b; i += gridDim.x) {
13 | for (int l = blockIdx.y; l < c; l += gridDim.y) {
14 | for (int j = threadIdx.x; j < m; j += blockDim.x) {
15 | int a = idx[i * m + j];
16 | out[(i * c + l) * m + j] = points[(i * c + l) * n + a];
17 | }
18 | }
19 | }
20 | }
21 |
22 | void gather_points_kernel_wrapper(int b, int c, int n, int npoints,
23 | const float *points, const int *idx,
24 | float *out) {
25 | gather_points_kernel<<<dim3(b, c, 1), opt_n_threads(npoints), 0,
26 | at::cuda::getCurrentCUDAStream()>>>(b, c, n, npoints,
27 | points, idx, out);
28 |
29 | CUDA_CHECK_ERRORS();
30 | }
31 |
32 | // input: grad_out(b, c, m) idx(b, m)
33 | // output: grad_points(b, c, n)
34 | __global__ void gather_points_grad_kernel(int b, int c, int n, int m,
35 | const float *__restrict__ grad_out,
36 | const int *__restrict__ idx,
37 | float *__restrict__ grad_points) {
38 | for (int i = blockIdx.x; i < b; i += gridDim.x) {
39 | for (int l = blockIdx.y; l < c; l += gridDim.y) {
40 | for (int j = threadIdx.x; j < m; j += blockDim.x) {
41 | int a = idx[i * m + j];
42 | atomicAdd(grad_points + (i * c + l) * n + a,
43 | grad_out[(i * c + l) * m + j]);
44 | }
45 | }
46 | }
47 | }
48 |
49 | void gather_points_grad_kernel_wrapper(int b, int c, int n, int npoints,
50 | const float *grad_out, const int *idx,
51 | float *grad_points) {
52 | gather_points_grad_kernel<<<dim3(b, c, 1), opt_n_threads(npoints), 0,
53 | at::cuda::getCurrentCUDAStream()>>>(
54 | b, c, n, npoints, grad_out, idx, grad_points);
55 |
56 | CUDA_CHECK_ERRORS();
57 | }
58 |
59 | __device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i,
60 | int idx1, int idx2) {
61 | const float v1 = dists[idx1], v2 = dists[idx2];
62 | const int i1 = dists_i[idx1], i2 = dists_i[idx2];
63 | dists[idx1] = max(v1, v2);
64 | dists_i[idx1] = v2 > v1 ? i2 : i1;
65 | }
66 |
67 | // Input dataset: (b, n, 3), tmp: (b, n)
68 | // Output idxs (b, m)
69 | template <unsigned int block_size>
70 | __global__ void furthest_point_sampling_kernel(
71 | int b, int n, int m, const float *__restrict__ dataset,
72 | float *__restrict__ temp, int *__restrict__ idxs) {
73 | if (m <= 0) return;
74 | __shared__ float dists[block_size];
75 | __shared__ int dists_i[block_size];
76 |
77 | int batch_index = blockIdx.x;
78 | dataset += batch_index * n * 3;
79 | temp += batch_index * n;
80 | idxs += batch_index * m;
81 |
82 | int tid = threadIdx.x;
83 | const int stride = block_size;
84 |
85 | int old = 0;
86 | if (threadIdx.x == 0) idxs[0] = old;
87 |
88 | __syncthreads();
89 | for (int j = 1; j < m; j++) {
90 | int besti = 0;
91 | float best = -1;
92 | float x1 = dataset[old * 3 + 0];
93 | float y1 = dataset[old * 3 + 1];
94 | float z1 = dataset[old * 3 + 2];
95 | for (int k = tid; k < n; k += stride) {
96 | float x2, y2, z2;
97 | x2 = dataset[k * 3 + 0];
98 | y2 = dataset[k * 3 + 1];
99 | z2 = dataset[k * 3 + 2];
100 | float mag = (x2 * x2) + (y2 * y2) + (z2 * z2);
101 | if (mag <= 1e-3) continue;
102 |
103 | float d =
104 | (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1);
105 |
106 | float d2 = min(d, temp[k]);
107 | temp[k] = d2;
108 | besti = d2 > best ? k : besti;
109 | best = d2 > best ? d2 : best;
110 | }
111 | dists[tid] = best;
112 | dists_i[tid] = besti;
113 | __syncthreads();
114 |
115 | if (block_size >= 512) {
116 | if (tid < 256) {
117 | __update(dists, dists_i, tid, tid + 256);
118 | }
119 | __syncthreads();
120 | }
121 | if (block_size >= 256) {
122 | if (tid < 128) {
123 | __update(dists, dists_i, tid, tid + 128);
124 | }
125 | __syncthreads();
126 | }
127 | if (block_size >= 128) {
128 | if (tid < 64) {
129 | __update(dists, dists_i, tid, tid + 64);
130 | }
131 | __syncthreads();
132 | }
133 | if (block_size >= 64) {
134 | if (tid < 32) {
135 | __update(dists, dists_i, tid, tid + 32);
136 | }
137 | __syncthreads();
138 | }
139 | if (block_size >= 32) {
140 | if (tid < 16) {
141 | __update(dists, dists_i, tid, tid + 16);
142 | }
143 | __syncthreads();
144 | }
145 | if (block_size >= 16) {
146 | if (tid < 8) {
147 | __update(dists, dists_i, tid, tid + 8);
148 | }
149 | __syncthreads();
150 | }
151 | if (block_size >= 8) {
152 | if (tid < 4) {
153 | __update(dists, dists_i, tid, tid + 4);
154 | }
155 | __syncthreads();
156 | }
157 | if (block_size >= 4) {
158 | if (tid < 2) {
159 | __update(dists, dists_i, tid, tid + 2);
160 | }
161 | __syncthreads();
162 | }
163 | if (block_size >= 2) {
164 | if (tid < 1) {
165 | __update(dists, dists_i, tid, tid + 1);
166 | }
167 | __syncthreads();
168 | }
169 |
170 | old = dists_i[0];
171 | if (tid == 0) idxs[j] = old;
172 | }
173 | }
174 |
175 | void furthest_point_sampling_kernel_wrapper(int b, int n, int m,
176 | const float *dataset, float *temp,
177 | int *idxs) {
178 | unsigned int n_threads = opt_n_threads(n);
179 |
180 | cudaStream_t stream = at::cuda::getCurrentCUDAStream();
181 |
182 | switch (n_threads) {
183 | case 512:
184 | furthest_point_sampling_kernel<512>
185 | <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
186 | break;
187 | case 256:
188 | furthest_point_sampling_kernel<256>
189 | <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
190 | break;
191 | case 128:
192 | furthest_point_sampling_kernel<128>
193 | <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
194 | break;
195 | case 64:
196 | furthest_point_sampling_kernel<64>
197 | <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
198 | break;
199 | case 32:
200 | furthest_point_sampling_kernel<32>
201 | <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
202 | break;
203 | case 16:
204 | furthest_point_sampling_kernel<16>
205 | <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
206 | break;
207 | case 8:
208 | furthest_point_sampling_kernel<8>
209 | <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
210 | break;
211 | case 4:
212 | furthest_point_sampling_kernel<4>
213 | <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
214 | break;
215 | case 2:
216 | furthest_point_sampling_kernel<2>
217 | <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
218 | break;
219 | case 1:
220 | furthest_point_sampling_kernel<1>
221 | <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
222 | break;
223 | default:
224 | furthest_point_sampling_kernel<512>
225 | <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
226 | }
227 |
228 | CUDA_CHECK_ERRORS();
229 | }
230 |
--------------------------------------------------------------------------------
/lib/pointnet2/_version.py:
--------------------------------------------------------------------------------
1 | __version__ = "3.0.0"
2 |
--------------------------------------------------------------------------------
/lib/pointnet2/pointnet2_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | #
3 | # This source code is licensed under the MIT license found in the
4 | # LICENSE file in the root directory of this source tree.
5 |
6 | ''' Testing customized ops. '''
7 |
8 | import torch
9 | from torch.autograd import gradcheck
10 | import numpy as np
11 |
12 | import os
13 | import sys
14 | BASE_DIR = os.path.dirname(os.path.abspath(__file__))
15 | sys.path.append(BASE_DIR)
16 | import pointnet2_utils
17 |
18 | def test_interpolation_grad():
19 | batch_size = 1
20 | feat_dim = 2
21 | m = 4
22 | feats = torch.randn(batch_size, feat_dim, m, requires_grad=True).float().cuda()
23 |
24 | def interpolate_func(inputs):
25 | idx = torch.from_numpy(np.array([[[0,1,2],[1,2,3]]])).int().cuda()
26 | weight = torch.from_numpy(np.array([[[1,1,1],[2,2,2]]])).float().cuda()
27 | interpolated_feats = pointnet2_utils.three_interpolate(inputs, idx, weight)
28 | return interpolated_feats
29 |
30 | assert (gradcheck(interpolate_func, feats, atol=1e-1, rtol=1e-1))
31 |
32 | if __name__=='__main__':
33 | test_interpolation_grad()
34 |
--------------------------------------------------------------------------------
/lib/pointnet2/pytorch_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | #
3 | # This source code is licensed under the MIT license found in the
4 | # LICENSE file in the root directory of this source tree.
5 |
6 | ''' Modified based on Ref: https://github.com/erikwijmans/Pointnet2_PyTorch '''
7 | import torch
8 | import torch.nn as nn
9 | from typing import List, Tuple
10 |
11 | class SharedMLP(nn.Sequential):
12 |
13 | def __init__(
14 | self,
15 | args: List[int],
16 | *,
17 | bn: bool = False,
18 | activation=nn.ReLU(inplace=True),
19 | preact: bool = False,
20 | first: bool = False,
21 | name: str = ""
22 | ):
23 | super().__init__()
24 |
25 | for i in range(len(args) - 1):
26 | self.add_module(
27 | name + 'layer{}'.format(i),
28 | Conv2d(
29 | args[i],
30 | args[i + 1],
31 | bn=(not first or not preact or (i != 0)) and bn,
32 | activation=activation
33 | if (not first or not preact or (i != 0)) else None,
34 | preact=preact
35 | )
36 | )
37 |
38 |
39 | class _BNBase(nn.Sequential):
40 |
41 | def __init__(self, in_size, batch_norm=None, name=""):
42 | super().__init__()
43 | self.add_module(name + "bn", batch_norm(in_size))
44 |
45 | nn.init.constant_(self[0].weight, 1.0)
46 | nn.init.constant_(self[0].bias, 0)
47 |
48 |
49 | class BatchNorm1d(_BNBase):
50 |
51 | def __init__(self, in_size: int, *, name: str = ""):
52 | super().__init__(in_size, batch_norm=nn.BatchNorm1d, name=name)
53 |
54 |
55 | class BatchNorm2d(_BNBase):
56 |
57 | def __init__(self, in_size: int, name: str = ""):
58 | super().__init__(in_size, batch_norm=nn.BatchNorm2d, name=name)
59 |
60 |
61 | class BatchNorm3d(_BNBase):
62 |
63 | def __init__(self, in_size: int, name: str = ""):
64 | super().__init__(in_size, batch_norm=nn.BatchNorm3d, name=name)
65 |
66 |
67 | class _ConvBase(nn.Sequential):
68 |
69 | def __init__(
70 | self,
71 | in_size,
72 | out_size,
73 | kernel_size,
74 | stride,
75 | padding,
76 | activation,
77 | bn,
78 | init,
79 | conv=None,
80 | batch_norm=None,
81 | bias=True,
82 | preact=False,
83 | name=""
84 | ):
85 | super().__init__()
86 |
87 | bias = bias and (not bn)
88 | conv_unit = conv(
89 | in_size,
90 | out_size,
91 | kernel_size=kernel_size,
92 | stride=stride,
93 | padding=padding,
94 | bias=bias
95 | )
96 | init(conv_unit.weight)
97 | if bias:
98 | nn.init.constant_(conv_unit.bias, 0)
99 |
100 | if bn:
101 | if not preact:
102 | bn_unit = batch_norm(out_size)
103 | else:
104 | bn_unit = batch_norm(in_size)
105 |
106 | if preact:
107 | if bn:
108 | self.add_module(name + 'bn', bn_unit)
109 |
110 | if activation is not None:
111 | self.add_module(name + 'activation', activation)
112 |
113 | self.add_module(name + 'conv', conv_unit)
114 |
115 | if not preact:
116 | if bn:
117 | self.add_module(name + 'bn', bn_unit)
118 |
119 | if activation is not None:
120 | self.add_module(name + 'activation', activation)
121 |
122 |
123 | class Conv1d(_ConvBase):
124 |
125 | def __init__(
126 | self,
127 | in_size: int,
128 | out_size: int,
129 | *,
130 | kernel_size: int = 1,
131 | stride: int = 1,
132 | padding: int = 0,
133 | activation=nn.ReLU(inplace=True),
134 | bn: bool = False,
135 | init=nn.init.kaiming_normal_,
136 | bias: bool = True,
137 | preact: bool = False,
138 | name: str = ""
139 | ):
140 | super().__init__(
141 | in_size,
142 | out_size,
143 | kernel_size,
144 | stride,
145 | padding,
146 | activation,
147 | bn,
148 | init,
149 | conv=nn.Conv1d,
150 | batch_norm=BatchNorm1d,
151 | bias=bias,
152 | preact=preact,
153 | name=name
154 | )
155 |
156 |
157 | class Conv2d(_ConvBase):
158 |
159 | def __init__(
160 | self,
161 | in_size: int,
162 | out_size: int,
163 | *,
164 | kernel_size: Tuple[int, int] = (1, 1),
165 | stride: Tuple[int, int] = (1, 1),
166 | padding: Tuple[int, int] = (0, 0),
167 | activation=nn.ReLU(inplace=True),
168 | bn: bool = False,
169 | init=nn.init.kaiming_normal_,
170 | bias: bool = True,
171 | preact: bool = False,
172 | name: str = ""
173 | ):
174 | super().__init__(
175 | in_size,
176 | out_size,
177 | kernel_size,
178 | stride,
179 | padding,
180 | activation,
181 | bn,
182 | init,
183 | conv=nn.Conv2d,
184 | batch_norm=BatchNorm2d,
185 | bias=bias,
186 | preact=preact,
187 | name=name
188 | )
189 |
190 |
191 | class Conv3d(_ConvBase):
192 |
193 | def __init__(
194 | self,
195 | in_size: int,
196 | out_size: int,
197 | *,
198 | kernel_size: Tuple[int, int, int] = (1, 1, 1),
199 | stride: Tuple[int, int, int] = (1, 1, 1),
200 | padding: Tuple[int, int, int] = (0, 0, 0),
201 | activation=nn.ReLU(inplace=True),
202 | bn: bool = False,
203 | init=nn.init.kaiming_normal_,
204 | bias: bool = True,
205 | preact: bool = False,
206 | name: str = ""
207 | ):
208 | super().__init__(
209 | in_size,
210 | out_size,
211 | kernel_size,
212 | stride,
213 | padding,
214 | activation,
215 | bn,
216 | init,
217 | conv=nn.Conv3d,
218 | batch_norm=BatchNorm3d,
219 | bias=bias,
220 | preact=preact,
221 | name=name
222 | )
223 |
224 |
225 | class FC(nn.Sequential):
226 |
227 | def __init__(
228 | self,
229 | in_size: int,
230 | out_size: int,
231 | *,
232 | activation=nn.ReLU(inplace=True),
233 | bn: bool = False,
234 | init=None,
235 | preact: bool = False,
236 | name: str = ""
237 | ):
238 | super().__init__()
239 |
240 | fc = nn.Linear(in_size, out_size, bias=not bn)
241 | if init is not None:
242 | init(fc.weight)
243 | if not bn:
244 | nn.init.constant_(fc.bias, 0)
245 |
246 | if preact:
247 | if bn:
248 | self.add_module(name + 'bn', BatchNorm1d(in_size))
249 |
250 | if activation is not None:
251 | self.add_module(name + 'activation', activation)
252 |
253 | self.add_module(name + 'fc', fc)
254 |
255 | if not preact:
256 | if bn:
257 | self.add_module(name + 'bn', BatchNorm1d(out_size))
258 |
259 | if activation is not None:
260 | self.add_module(name + 'activation', activation)
261 |
262 | def set_bn_momentum_default(bn_momentum):
263 |
264 | def fn(m):
265 | if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)):
266 | m.momentum = bn_momentum
267 |
268 | return fn
269 |
270 |
271 | class BNMomentumScheduler(object):
272 |
273 | def __init__(
274 | self, model, bn_lambda, last_epoch=-1,
275 | setter=set_bn_momentum_default
276 | ):
277 | if not isinstance(model, nn.Module):
278 | raise RuntimeError(
279 | "Class '{}' is not a PyTorch nn Module".format(
280 | type(model).__name__
281 | )
282 | )
283 |
284 | self.model = model
285 | self.setter = setter
286 | self.lmbd = bn_lambda
287 |
288 | self.step(last_epoch + 1)
289 | self.last_epoch = last_epoch
290 |
291 | def step(self, epoch=None):
292 | if epoch is None:
293 | epoch = self.last_epoch + 1
294 |
295 | self.last_epoch = epoch
296 | self.model.apply(self.setter(self.lmbd(epoch)))
297 |
298 |
299 |
--------------------------------------------------------------------------------
/lib/pointnet2/setup.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import os.path as osp
4 |
5 | from setuptools import find_packages, setup
6 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension
7 |
8 | _this_dir = osp.dirname(osp.abspath(__file__))
9 | _ext_src_root = "_ext_src"
10 | _ext_sources = glob.glob("{}/src/*.cpp".format(_ext_src_root)) + glob.glob(
11 | "{}/src/*.cu".format(_ext_src_root)
12 | )
13 | _ext_headers = glob.glob("{}/include/*".format(_ext_src_root))
14 |
15 | requirements = ["torch>=1.4"]
16 |
17 | os.environ["TORCH_CUDA_ARCH_LIST"] = "3.7+PTX;5.0;6.0;6.1;6.2;7.0;7.5"
18 |
19 | exec(open("_version.py").read())
20 |
21 | setup(
22 | name='pointnet2',
23 | version=__version__,
24 | packages=find_packages(),
25 | install_requires=requirements,
26 | ext_modules=[
27 | CUDAExtension(
28 | name='pointnet2._ext',
29 | sources=_ext_sources,
30 | extra_compile_args={
31 | "cxx": ["-O3"],
32 | "nvcc": ["-O3", "-Xfatbin", "-compress-all"],
33 | },
34 | include_dirs=[osp.join(_this_dir, _ext_src_root, "include")],
35 | )
36 | ],
37 | cmdclass={"build_ext": BuildExtension},
38 | include_package_data=True,
39 | )
--------------------------------------------------------------------------------
/models/backbone_module.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import numpy as np
5 | import sys
6 | import os
7 |
8 | sys.path.append(os.path.join(os.getcwd(), "lib")) # HACK add the lib folder
9 | from lib.pointnet2.pointnet2_modules import PointnetSAModuleVotes, PointnetFPModule
10 |
11 | class Pointnet2Backbone(nn.Module):
12 | r"""
13 | Backbone network for point cloud feature learning.
14 | Based on Pointnet++ single-scale grouping network.
15 |
16 | Parameters
17 | ----------
18 | input_feature_dim: int
19 | Number of input channels in the feature descriptor for each point.
20 | e.g. 3 for RGB.
21 | """
22 | def __init__(self, input_feature_dim=0):
23 | super().__init__()
24 |
25 | self.input_feature_dim = input_feature_dim
26 |
27 | # --------- 4 SET ABSTRACTION LAYERS ---------
28 | self.sa1 = PointnetSAModuleVotes(
29 | npoint=2048,
30 | radius=0.2,
31 | nsample=64,
32 | mlp=[input_feature_dim, 64, 64, 128],
33 | use_xyz=True,
34 | normalize_xyz=True
35 | )
36 |
37 | self.sa2 = PointnetSAModuleVotes(
38 | npoint=1024,
39 | radius=0.4,
40 | nsample=32,
41 | mlp=[128, 128, 128, 256],
42 | use_xyz=True,
43 | normalize_xyz=True
44 | )
45 |
46 | self.sa3 = PointnetSAModuleVotes(
47 | npoint=512,
48 | radius=0.8,
49 | nsample=16,
50 | mlp=[256, 128, 128, 256],
51 | use_xyz=True,
52 | normalize_xyz=True
53 | )
54 |
55 | self.sa4 = PointnetSAModuleVotes(
56 | npoint=256,
57 | radius=1.2,
58 | nsample=16,
59 | mlp=[256, 128, 128, 256],
60 | use_xyz=True,
61 | normalize_xyz=True
62 | )
63 |
64 | # --------- 2 FEATURE UPSAMPLING LAYERS --------
65 | self.fp1 = PointnetFPModule(mlp=[256+256,256,256])
66 | self.fp2 = PointnetFPModule(mlp=[256+256,256,256])
67 |
68 | def _break_up_pc(self, pc):
69 | xyz = pc[..., :3].contiguous()
70 | features = pc[..., 3:].transpose(1, 2).contiguous() if pc.size(-1) > 3 else None
71 |
72 | return xyz, features
73 |
74 | def forward(self, data_dict):
75 | r"""
76 | Forward pass of the network
77 |
78 | Parameters
79 | ----------
80 | data_dict: dict with key "point_clouds", a torch.cuda.FloatTensor of shape
81 | (B, N, 3 + input_feature_dim)
82 | Point cloud to run predictions on
83 | Each point in the point cloud MUST
84 | be formatted as (x, y, z, features...)
85 |
86 | Returns
87 | ----------
88 | data_dict: {XXX_xyz, XXX_features, XXX_inds}
89 | XXX_xyz: float32 Tensor of shape (B,K,3)
90 | XXX_features: float32 Tensor of shape (B,K,D)
91 | XXX_inds: int64 Tensor of shape (B,K) with values in [0,N-1]
92 | """
93 |
94 | pointcloud = data_dict["point_clouds"]
95 |
96 | batch_size = pointcloud.shape[0]
97 |
98 | xyz, features = self._break_up_pc(pointcloud)
99 |
100 | # --------- 4 SET ABSTRACTION LAYERS ---------
101 | xyz, features, fps_inds = self.sa1(xyz, features)
102 | data_dict['sa1_inds'] = fps_inds
103 | data_dict['sa1_xyz'] = xyz
104 | data_dict['sa1_features'] = features
105 |
106 | xyz, features, fps_inds = self.sa2(xyz, features) # this fps_inds is just 0,1,...,1023
107 | data_dict['sa2_inds'] = fps_inds
108 | data_dict['sa2_xyz'] = xyz
109 | data_dict['sa2_features'] = features
110 |
111 | xyz, features, fps_inds = self.sa3(xyz, features) # this fps_inds is just 0,1,...,511
112 | data_dict['sa3_xyz'] = xyz
113 | data_dict['sa3_features'] = features
114 |
115 | xyz, features, fps_inds = self.sa4(xyz, features) # this fps_inds is just 0,1,...,255
116 | data_dict['sa4_xyz'] = xyz
117 | data_dict['sa4_features'] = features
118 |
119 | # --------- 2 FEATURE UPSAMPLING LAYERS --------
120 | features = self.fp1(data_dict['sa3_xyz'], data_dict['sa4_xyz'], data_dict['sa3_features'], data_dict['sa4_features'])
121 | features = self.fp2(data_dict['sa2_xyz'], data_dict['sa3_xyz'], data_dict['sa2_features'], features)
122 | data_dict['fp2_features'] = features
123 | data_dict['fp2_xyz'] = data_dict['sa2_xyz']
124 | num_seed = data_dict['fp2_xyz'].shape[1]
125 | data_dict['fp2_inds'] = data_dict['sa1_inds'][:,0:num_seed] # indices among the entire input point clouds
126 | return data_dict
127 |
128 | if __name__=='__main__':
129 | backbone_net = Pointnet2Backbone(input_feature_dim=3).cuda()
130 | print(backbone_net)
131 | backbone_net.eval()
132 |     out = backbone_net({"point_clouds": torch.rand(16,20000,6).cuda()})  # forward expects a data_dict
133 | for key in sorted(out.keys()):
134 | print(key, '\t', out[key].shape)
135 |
--------------------------------------------------------------------------------
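
For orientation, here is a minimal sketch (not repository code) of the tensor shapes Pointnet2Backbone.forward stores in data_dict for the __main__ test above, i.e. a batch of 16 clouds with 20000 points and input_feature_dim=3. It assumes the CUDA pointnet2 extension is built; SA/FP feature tensors are channel-first.

# illustrative only: expected shapes of the data_dict entries produced by the backbone
expected_shapes = {
    "sa1_xyz": (16, 2048, 3), "sa1_features": (16, 128, 2048), "sa1_inds": (16, 2048),
    "sa2_xyz": (16, 1024, 3), "sa2_features": (16, 256, 1024), "sa2_inds": (16, 1024),
    "sa3_xyz": (16, 512, 3),  "sa3_features": (16, 256, 512),
    "sa4_xyz": (16, 256, 3),  "sa4_features": (16, 256, 256),
    "fp2_xyz": (16, 1024, 3), "fp2_features": (16, 256, 1024), "fp2_inds": (16, 1024),
}
for key, shape in sorted(expected_shapes.items()):
    print(key, shape)

The 1024 fp2 points are the seed points consumed by the voting module further down the pipeline.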
/models/lang_module.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import torch
4 | import torch.nn as nn
5 |
6 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
7 |
8 | class LangModule(nn.Module):
9 | def __init__(self, num_text_classes, use_lang_classifier=True, use_bidir=False,
10 | emb_size=300, hidden_size=256):
11 | super().__init__()
12 |
13 | self.num_text_classes = num_text_classes
14 | self.use_lang_classifier = use_lang_classifier
15 | self.use_bidir = use_bidir
16 |
17 | self.gru = nn.GRU(
18 | input_size=emb_size,
19 | hidden_size=hidden_size,
20 | batch_first=True,
21 | bidirectional=self.use_bidir
22 | )
23 | lang_size = hidden_size * 2 if self.use_bidir else hidden_size
24 |
25 | # language classifier
26 | if use_lang_classifier:
27 | self.lang_cls = nn.Sequential(
28 | nn.Linear(lang_size, num_text_classes),
29 | nn.Dropout()
30 | )
31 |
32 |
33 | def forward(self, data_dict):
34 | """
35 | encode the input descriptions
36 | """
37 |
38 | word_embs = data_dict["lang_feat"]
39 | lang_feat = pack_padded_sequence(word_embs, data_dict["lang_len"], batch_first=True, enforce_sorted=False)
40 |
41 | # encode description
42 | _, lang_last = self.gru(lang_feat)
43 | lang_last = lang_last.permute(1, 0, 2).contiguous().flatten(start_dim=1) # batch_size, hidden_size * num_dir
44 |
45 | # store the encoded language features
46 | data_dict["lang_emb"] = lang_last # B, hidden_size
47 |
48 | # classify
49 | if self.use_lang_classifier:
50 | data_dict["lang_scores"] = self.lang_cls(data_dict["lang_emb"])
51 |
52 | return data_dict
53 |
54 |
--------------------------------------------------------------------------------
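
A minimal CPU sketch of how this module is driven, assuming it is run from the repository root. The 300-d embeddings, the lengths, and num_text_classes=18 are made-up stand-ins for what the dataset normally provides via "lang_feat", "lang_len" and the ScanNet class count.

import torch
from models.lang_module import LangModule

module = LangModule(num_text_classes=18, use_lang_classifier=True, use_bidir=False)
data_dict = {
    "lang_feat": torch.rand(4, 30, 300),        # 4 descriptions, up to 30 tokens, 300-d GloVe-like embeddings
    "lang_len": torch.tensor([30, 22, 17, 9]),  # true token count of each description
}
data_dict = module(data_dict)
print(data_dict["lang_emb"].shape)     # torch.Size([4, 256])
print(data_dict["lang_scores"].shape)  # torch.Size([4, 18])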
/models/match_module.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | class MatchModule(nn.Module):
5 | def __init__(self, num_proposals=256, lang_size=256, hidden_size=128):
6 | super().__init__()
7 |
8 | self.num_proposals = num_proposals
9 | self.lang_size = lang_size
10 | self.hidden_size = hidden_size
11 |
12 | self.fuse = nn.Sequential(
13 | nn.Conv1d(self.lang_size + 128, hidden_size, 1),
14 | nn.ReLU()
15 | )
16 | # self.match = nn.Conv1d(hidden_size, 1, 1)
17 | self.match = nn.Sequential(
18 | nn.Conv1d(hidden_size, hidden_size, 1),
19 | nn.ReLU(),
20 | nn.BatchNorm1d(hidden_size),
21 | nn.Conv1d(hidden_size, hidden_size, 1),
22 | nn.ReLU(),
23 | nn.BatchNorm1d(hidden_size),
24 | nn.Conv1d(hidden_size, 1, 1)
25 | )
26 |
27 | def forward(self, data_dict):
28 | """
29 |         Args:
30 |             data_dict: dict with 'aggregated_vote_features' (B, num_proposals, 128),
31 |                 'objectness_scores' (B, num_proposals, 2) and 'lang_emb' (B, lang_size)
32 |         Returns:
33 |             data_dict with 'cluster_ref': (B, num_proposals) matching confidence per proposal
34 | """
35 |
36 | # unpack outputs from detection branch
37 | features = data_dict['aggregated_vote_features'] # batch_size, num_proposal, 128
38 | objectness_masks = data_dict['objectness_scores'].max(2)[1].float().unsqueeze(2) # batch_size, num_proposals, 1
39 |
40 | # unpack outputs from language branch
41 | lang_feat = data_dict["lang_emb"] # batch_size, lang_size
42 | lang_feat = lang_feat.unsqueeze(1).repeat(1, self.num_proposals, 1) # batch_size, num_proposals, lang_size
43 |
44 | # fuse
45 | features = torch.cat([features, lang_feat], dim=-1) # batch_size, num_proposals, 128 + lang_size
46 | features = features.permute(0, 2, 1).contiguous() # batch_size, 128 + lang_size, num_proposals
47 |
48 | # fuse features
49 | features = self.fuse(features) # batch_size, hidden_size, num_proposals
50 |
51 | # mask out invalid proposals
52 | objectness_masks = objectness_masks.permute(0, 2, 1).contiguous() # batch_size, 1, num_proposals
53 | features = features * objectness_masks
54 |
55 | # match
56 | confidences = self.match(features).squeeze(1) # batch_size, num_proposals
57 |
58 | data_dict["cluster_ref"] = confidences
59 |
60 | return data_dict
61 |
--------------------------------------------------------------------------------
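
A small CPU sketch (illustrative only) of the shape flow through MatchModule, replacing the real detection and language outputs with random tensors of the shapes documented above.

import torch
from models.match_module import MatchModule

B, K = 2, 256
module = MatchModule(num_proposals=K, lang_size=256, hidden_size=128)
data_dict = {
    "aggregated_vote_features": torch.rand(B, K, 128),  # normally from ProposalModule
    "objectness_scores": torch.rand(B, K, 2),           # normally from ProposalModule
    "lang_emb": torch.rand(B, 256),                      # normally from LangModule
}
data_dict = module(data_dict)
print(data_dict["cluster_ref"].shape)  # torch.Size([2, 256]): one confidence per proposal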
/models/proposal_module.py:
--------------------------------------------------------------------------------
1 | """
2 | Modified from: https://github.com/facebookresearch/votenet/blob/master/models/proposal_module.py
3 | """
4 |
5 | import torch
6 | import torch.nn as nn
7 | import torch.nn.functional as F
8 | import numpy as np
9 | import os
10 | import sys
11 |
12 | sys.path.append(os.path.join(os.getcwd(), "lib")) # HACK add the lib folder
13 | import lib.pointnet2.pointnet2_utils
14 | from lib.pointnet2.pointnet2_modules import PointnetSAModuleVotes
15 |
16 | class ProposalModule(nn.Module):
17 | def __init__(self, num_class, num_heading_bin, num_size_cluster, mean_size_arr, num_proposal, sampling, seed_feat_dim=256):
18 | super().__init__()
19 |
20 | self.num_class = num_class
21 | self.num_heading_bin = num_heading_bin
22 | self.num_size_cluster = num_size_cluster
23 | self.mean_size_arr = mean_size_arr
24 | self.num_proposal = num_proposal
25 | self.sampling = sampling
26 | self.seed_feat_dim = seed_feat_dim
27 |
28 | # Vote clustering
29 | self.vote_aggregation = PointnetSAModuleVotes(
30 | npoint=self.num_proposal,
31 | radius=0.3,
32 | nsample=16,
33 | mlp=[self.seed_feat_dim, 128, 128, 128],
34 | use_xyz=True,
35 | normalize_xyz=True
36 | )
37 |
38 | # Object proposal/detection
39 | # Objectness scores (2), center residual (3),
40 | # heading class+residual (num_heading_bin*2), size class+residual(num_size_cluster*4)
41 | self.proposal = nn.Sequential(
42 | nn.Conv1d(128,128,1, bias=False),
43 | nn.BatchNorm1d(128),
44 | nn.ReLU(),
45 | nn.Conv1d(128,128,1, bias=False),
46 | nn.BatchNorm1d(128),
47 | nn.ReLU(),
48 | nn.Conv1d(128,2+3+num_heading_bin*2+num_size_cluster*4+self.num_class,1)
49 | )
50 |
51 | def forward(self, xyz, features, data_dict):
52 | """
53 | Args:
54 | xyz: (B,K,3)
55 | features: (B,C,K)
56 | Returns:
57 |             data_dict with decoded proposals (objectness, center, heading, size, semantic scores)
58 | """
59 |
60 | # Farthest point sampling (FPS) on votes
61 | xyz, features, fps_inds = self.vote_aggregation(xyz, features)
62 |
63 | sample_inds = fps_inds
64 |
65 | data_dict['aggregated_vote_xyz'] = xyz # (batch_size, num_proposal, 3)
66 | data_dict['aggregated_vote_features'] = features.permute(0, 2, 1).contiguous() # (batch_size, num_proposal, 128)
67 | data_dict['aggregated_vote_inds'] = sample_inds # (batch_size, num_proposal,) # should be 0,1,2,...,num_proposal
68 |
69 | # --------- PROPOSAL GENERATION ---------
70 | net = self.proposal(features)
71 | data_dict = self.decode_scores(net, data_dict, self.num_class, self.num_heading_bin, self.num_size_cluster, self.mean_size_arr)
72 |
73 | return data_dict
74 |
75 | def decode_scores(self, net, data_dict, num_class, num_heading_bin, num_size_cluster, mean_size_arr):
76 | """
77 | decode the predicted parameters for the bounding boxes
78 |
79 | """
80 |         net_transposed = net.transpose(2,1).contiguous() # (batch_size, num_proposal, ..)
81 | batch_size = net_transposed.shape[0]
82 | num_proposal = net_transposed.shape[1]
83 |
84 | objectness_scores = net_transposed[:,:,0:2]
85 |
86 | base_xyz = data_dict['aggregated_vote_xyz'] # (batch_size, num_proposal, 3)
87 | center = base_xyz + net_transposed[:,:,2:5] # (batch_size, num_proposal, 3)
88 |
89 | heading_scores = net_transposed[:,:,5:5+num_heading_bin]
90 | heading_residuals_normalized = net_transposed[:,:,5+num_heading_bin:5+num_heading_bin*2]
91 |
92 | size_scores = net_transposed[:,:,5+num_heading_bin*2:5+num_heading_bin*2+num_size_cluster]
93 | size_residuals_normalized = net_transposed[:,:,5+num_heading_bin*2+num_size_cluster:5+num_heading_bin*2+num_size_cluster*4].view([batch_size, num_proposal, num_size_cluster, 3]) # Bxnum_proposalxnum_size_clusterx3
94 |
95 |         sem_cls_scores = net_transposed[:,:,5+num_heading_bin*2+num_size_cluster*4:] # Bxnum_proposalxnum_class
96 |
97 | # store
98 | data_dict['objectness_scores'] = objectness_scores
99 | data_dict['center'] = center
100 | data_dict['heading_scores'] = heading_scores # Bxnum_proposalxnum_heading_bin
101 | data_dict['heading_residuals_normalized'] = heading_residuals_normalized # Bxnum_proposalxnum_heading_bin (should be -1 to 1)
102 | data_dict['heading_residuals'] = heading_residuals_normalized * (np.pi/num_heading_bin) # Bxnum_proposalxnum_heading_bin
103 | data_dict['size_scores'] = size_scores
104 | data_dict['size_residuals_normalized'] = size_residuals_normalized
105 | data_dict['size_residuals'] = size_residuals_normalized * torch.from_numpy(mean_size_arr.astype(np.float32)).cuda().unsqueeze(0).unsqueeze(0)
106 | data_dict['sem_cls_scores'] = sem_cls_scores
107 |
108 | return data_dict
109 |
110 |
--------------------------------------------------------------------------------
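
The slicing in decode_scores follows a fixed per-proposal channel layout. The sketch below works it out for illustration, assuming the usual ScanNet detection configuration (num_class=18, num_heading_bin=1, num_size_cluster=18); these values normally come from ScannetDatasetConfig and are only an assumption here.

# illustrative only: channel layout of the proposal head output, per proposal
num_class, num_heading_bin, num_size_cluster = 18, 1, 18

layout = [
    ("objectness scores",         2),
    ("center residual",           3),
    ("heading class scores",      num_heading_bin),
    ("heading residuals (norm.)", num_heading_bin),
    ("size class scores",         num_size_cluster),
    ("size residuals (norm.)",    num_size_cluster * 3),
    ("semantic class scores",     num_class),
]

offset = 0
for name, width in layout:
    print("{:27s} channels [{:2d}, {:2d})".format(name, offset, offset + width))
    offset += width
print("total channels per proposal:", offset)  # 2 + 3 + NH*2 + NS*4 + num_class = 97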
/models/refnet.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import numpy as np
4 | import sys
5 | import os
6 |
7 | sys.path.append(os.path.join(os.getcwd(), "lib")) # HACK add the lib folder
8 | from models.backbone_module import Pointnet2Backbone
9 | from models.voting_module import VotingModule
10 | from models.proposal_module import ProposalModule
11 | from models.lang_module import LangModule
12 | from models.match_module import MatchModule
13 |
14 | class RefNet(nn.Module):
15 | def __init__(self, num_class, num_heading_bin, num_size_cluster, mean_size_arr,
16 | input_feature_dim=0, num_proposal=128, vote_factor=1, sampling="vote_fps",
17 | use_lang_classifier=True, use_bidir=False, no_reference=False,
18 | emb_size=300, hidden_size=256):
19 | super().__init__()
20 |
21 | self.num_class = num_class
22 | self.num_heading_bin = num_heading_bin
23 | self.num_size_cluster = num_size_cluster
24 | self.mean_size_arr = mean_size_arr
25 | assert(mean_size_arr.shape[0] == self.num_size_cluster)
26 | self.input_feature_dim = input_feature_dim
27 | self.num_proposal = num_proposal
28 | self.vote_factor = vote_factor
29 | self.sampling = sampling
30 | self.use_lang_classifier = use_lang_classifier
31 | self.use_bidir = use_bidir
32 | self.no_reference = no_reference
33 |
34 |
35 | # --------- PROPOSAL GENERATION ---------
36 | # Backbone point feature learning
37 | self.backbone_net = Pointnet2Backbone(input_feature_dim=self.input_feature_dim)
38 |
39 | # Hough voting
40 | self.vgen = VotingModule(self.vote_factor, 256)
41 |
42 | # Vote aggregation and object proposal
43 | self.proposal = ProposalModule(num_class, num_heading_bin, num_size_cluster, mean_size_arr, num_proposal, sampling)
44 |
45 | if not no_reference:
46 | # --------- LANGUAGE ENCODING ---------
47 | # Encode the input descriptions into vectors
48 | # (including attention and language classification)
49 | self.lang = LangModule(num_class, use_lang_classifier, use_bidir, emb_size, hidden_size)
50 |
51 | # --------- PROPOSAL MATCHING ---------
52 | # Match the generated proposals and select the most confident ones
53 | self.match = MatchModule(num_proposals=num_proposal, lang_size=(1 + int(self.use_bidir)) * hidden_size)
54 |
55 | def forward(self, data_dict):
56 | """ Forward pass of the network
57 |
58 | Args:
59 | data_dict: dict
60 | {
61 | point_clouds,
62 | lang_feat
63 | }
64 |
65 | point_clouds: Variable(torch.cuda.FloatTensor)
66 | (B, N, 3 + input_channels) tensor
67 |                 Point cloud to run predictions on.
68 |                 Each point in the point cloud MUST
69 |                 be formatted as (x, y, z, features...)
70 | Returns:
71 | end_points: dict
72 | """
73 |
74 | #######################################
75 | # #
76 | # DETECTION BRANCH #
77 | # #
78 | #######################################
79 |
80 |         # --------- BACKBONE POINT FEATURE LEARNING ---------
81 | data_dict = self.backbone_net(data_dict)
82 |
83 | # --------- HOUGH VOTING ---------
84 | xyz = data_dict["fp2_xyz"]
85 | features = data_dict["fp2_features"]
86 | data_dict["seed_inds"] = data_dict["fp2_inds"]
87 | data_dict["seed_xyz"] = xyz
88 | data_dict["seed_features"] = features
89 |
90 | xyz, features = self.vgen(xyz, features)
91 | features_norm = torch.norm(features, p=2, dim=1)
92 | features = features.div(features_norm.unsqueeze(1))
93 | data_dict["vote_xyz"] = xyz
94 | data_dict["vote_features"] = features
95 |
96 | # --------- PROPOSAL GENERATION ---------
97 | data_dict = self.proposal(xyz, features, data_dict)
98 |
99 | if not self.no_reference:
100 | #######################################
101 | # #
102 | # LANGUAGE BRANCH #
103 | # #
104 | #######################################
105 |
106 | # --------- LANGUAGE ENCODING ---------
107 | data_dict = self.lang(data_dict)
108 |
109 | #######################################
110 | # #
111 | # PROPOSAL MATCHING #
112 | # #
113 | #######################################
114 |
115 | # --------- PROPOSAL MATCHING ---------
116 | data_dict = self.match(data_dict)
117 |
118 | return data_dict
119 |
--------------------------------------------------------------------------------
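
As a reading aid (not repository code), a rough summary of the keys each stage adds to data_dict, collected from the modules above; the group names are just labels for this sketch, and the backbone's sa*/fp2 entries plus the *_normalized residuals are omitted for brevity.

refnet_data_dict_keys = {
    "voting":   ["seed_xyz", "seed_features", "seed_inds", "vote_xyz", "vote_features"],
    "proposal": ["aggregated_vote_xyz", "aggregated_vote_features", "objectness_scores",
                 "center", "heading_scores", "heading_residuals",
                 "size_scores", "size_residuals", "sem_cls_scores"],
    "language": ["lang_emb", "lang_scores"],  # lang_scores only if use_lang_classifier; skipped if no_reference
    "matching": ["cluster_ref"],              # skipped if no_reference
}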
/models/voting_module.py:
--------------------------------------------------------------------------------
1 | '''
2 | Voting module: generate votes from XYZ and features of seed points.
3 |
4 | Modified from: https://github.com/facebookresearch/votenet/blob/master/models/voting_module.py
5 | '''
6 |
7 | import torch
8 | import torch.nn as nn
9 | import torch.nn.functional as F
10 |
11 | class VotingModule(nn.Module):
12 | def __init__(self, vote_factor, seed_feature_dim):
13 | """ Votes generation from seed point features.
14 |
15 | Args:
16 |             vote_factor: int
17 | number of votes generated from each seed point
18 | seed_feature_dim: int
19 | number of channels of seed point features
20 | vote_feature_dim: int
21 | number of channels of vote features
22 | """
23 | super().__init__()
24 | self.vote_factor = vote_factor
25 | self.in_dim = seed_feature_dim
26 | self.out_dim = self.in_dim # due to residual feature, in_dim has to be == out_dim
27 | self.conv1 = torch.nn.Conv1d(self.in_dim, self.in_dim, 1)
28 | self.conv2 = torch.nn.Conv1d(self.in_dim, self.in_dim, 1)
29 | self.conv3 = torch.nn.Conv1d(self.in_dim, (3+self.out_dim) * self.vote_factor, 1)
30 | self.bn1 = torch.nn.BatchNorm1d(self.in_dim)
31 | self.bn2 = torch.nn.BatchNorm1d(self.in_dim)
32 |
33 | def forward(self, seed_xyz, seed_features):
34 | """ Forward pass.
35 |
36 | Arguments:
37 | seed_xyz: (batch_size, num_seed, 3) Pytorch tensor
38 | seed_features: (batch_size, feature_dim, num_seed) Pytorch tensor
39 | Returns:
40 | vote_xyz: (batch_size, num_seed*vote_factor, 3)
41 | vote_features: (batch_size, vote_feature_dim, num_seed*vote_factor)
42 | """
43 | batch_size = seed_xyz.shape[0]
44 | num_seed = seed_xyz.shape[1]
45 | num_vote = num_seed*self.vote_factor
46 | net = F.relu(self.bn1(self.conv1(seed_features)))
47 | net = F.relu(self.bn2(self.conv2(net)))
48 | net = self.conv3(net) # (batch_size, (3+out_dim)*vote_factor, num_seed)
49 |
50 | net = net.transpose(2,1).view(batch_size, num_seed, self.vote_factor, 3+self.out_dim).contiguous()
51 | offset = net[:,:,:,0:3]
52 | vote_xyz = seed_xyz.unsqueeze(2) + offset
53 | vote_xyz = vote_xyz.contiguous().view(batch_size, num_vote, 3)
54 |
55 | residual_features = net[:,:,:,3:] # (batch_size, num_seed, vote_factor, out_dim)
56 | vote_features = seed_features.transpose(2,1).unsqueeze(2).contiguous() + residual_features
57 | vote_features = vote_features.contiguous().view(batch_size, num_vote, self.out_dim)
58 | vote_features = vote_features.transpose(2,1).contiguous()
59 |
60 | return vote_xyz, vote_features
61 |
62 | if __name__=='__main__':
63 | net = VotingModule(2, 256).cuda()
64 | xyz, features = net(torch.rand(8,1024,3).cuda(), torch.rand(8,256,1024).cuda())
65 | print('xyz', xyz.shape)
66 | print('features', features.shape)
67 |
--------------------------------------------------------------------------------
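
The __main__ test above needs a GPU, but the same shape check can be run on the CPU since the module only uses Conv1d and BatchNorm1d. A minimal sketch, assuming it is launched from the repository root:

import torch
from models.voting_module import VotingModule

net = VotingModule(vote_factor=2, seed_feature_dim=256)
vote_xyz, vote_features = net(torch.rand(8, 1024, 3), torch.rand(8, 256, 1024))
print(vote_xyz.shape)       # torch.Size([8, 2048, 3]): each seed casts 2 votes
print(vote_features.shape)  # torch.Size([8, 256, 2048])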
/requirements.txt:
--------------------------------------------------------------------------------
1 | plyfile
2 | opencv-python
3 | trimesh==2.35.39
4 | tensorboardX
5 | easydict
6 | tqdm
7 | h5py
8 | matplotlib
--------------------------------------------------------------------------------
/scripts/compute_multiview_features.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import math
4 | import torch
5 | import argparse
6 | import numpy as np
7 | import torch.nn as nn
8 | import torchvision.transforms as transforms
9 | from torch.utils.data import Dataset, DataLoader
10 | from imageio import imread
11 | from PIL import Image
12 | from tqdm import tqdm
13 |
14 | sys.path.append(os.path.join(os.getcwd())) # HACK add the root folder
15 | from lib.enet import create_enet_for_3d
16 | from lib.config import CONF
17 |
18 | # scannet data
19 | # NOTE: read only!
20 | SCANNET_FRAME_ROOT = CONF.SCANNET_FRAMES
21 | SCANNET_FRAME_PATH = os.path.join(SCANNET_FRAME_ROOT, "{}") # name of the file
22 | SCANNET_LIST = CONF.SCANNETV2_LIST
23 |
24 | ENET_PATH = CONF.ENET_WEIGHTS
25 | ENET_FEATURE_ROOT = CONF.ENET_FEATURES_SUBROOT
26 | ENET_FEATURE_PATH = CONF.ENET_FEATURES_PATH
27 |
28 | class EnetDataset(Dataset):
29 | def __init__(self):
30 | self._init_resources()
31 |
32 | def __len__(self):
33 | return len(self.data)
34 |
35 | def __getitem__(self, idx):
36 | scene_id, frame_id = self.data[idx]
37 | image = self._load_image(SCANNET_FRAME_PATH.format(scene_id, "color", "{}.jpg".format(frame_id)), [328, 256])
38 |
39 | return scene_id, frame_id, image
40 |
41 | def _init_resources(self):
42 | self._get_scene_list()
43 | self.data = []
44 | for scene_id in self.scene_list:
45 | frame_list = sorted(os.listdir(SCANNET_FRAME_ROOT.format(scene_id, "color")), key=lambda x:int(x.split(".")[0]))
46 | for frame_file in frame_list:
47 | self.data.append(
48 | (
49 | scene_id,
50 | int(frame_file.split(".")[0])
51 | )
52 | )
53 |
54 | def _get_scene_list(self):
55 | with open(SCANNET_LIST, 'r') as f:
56 | self.scene_list = sorted(list(set(f.read().splitlines())))
57 |
58 | def _resize_crop_image(self, image, new_image_dims):
59 | image_dims = [image.shape[1], image.shape[0]]
60 | if image_dims != new_image_dims:
61 | resize_width = int(math.floor(new_image_dims[1] * float(image_dims[0]) / float(image_dims[1])))
62 | image = transforms.Resize([new_image_dims[1], resize_width], interpolation=Image.NEAREST)(Image.fromarray(image))
63 | image = transforms.CenterCrop([new_image_dims[1], new_image_dims[0]])(image)
64 |
65 | return np.array(image)
66 |
67 | def _load_image(self, file, image_dims):
68 | image = imread(file)
69 | # preprocess
70 | image = self._resize_crop_image(image, image_dims)
71 | if len(image.shape) == 3: # color image
72 | image = np.transpose(image, [2, 0, 1]) # move feature to front
73 | image = transforms.Normalize(mean=[0.496342, 0.466664, 0.440796], std=[0.277856, 0.28623, 0.291129])(torch.Tensor(image.astype(np.float32) / 255.0))
74 | elif len(image.shape) == 2: # label image
75 | image = np.expand_dims(image, 0)
76 | else:
77 | raise ValueError
78 |
79 | return image
80 |
81 | def collate_fn(self, data):
82 | scene_ids, frame_ids, images = zip(*data)
83 | scene_ids = list(scene_ids)
84 | frame_ids = list(frame_ids)
85 | images = torch.stack(images, 0).cuda()
86 |
87 | return scene_ids, frame_ids, images
88 |
89 | def create_enet():
90 | enet_fixed, enet_trainable, _ = create_enet_for_3d(41, ENET_PATH, 21)
91 | enet = nn.Sequential(
92 | enet_fixed,
93 | enet_trainable
94 | ).cuda()
95 | enet.eval()
96 | for param in enet.parameters():
97 | param.requires_grad = False
98 |
99 | return enet
100 |
101 | if __name__ == "__main__":
102 | parser = argparse.ArgumentParser()
103 | parser.add_argument('--gpu', type=str, help='gpu', default='0')
104 | args = parser.parse_args()
105 |
106 | # setting
107 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
108 | os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
109 |
110 | # init
111 | dataset = EnetDataset()
112 | dataloader = DataLoader(dataset, batch_size=256, shuffle=False, collate_fn=dataset.collate_fn)
113 | enet = create_enet()
114 |
115 | # feed
116 | print("extracting multiview features from ENet...")
117 | for scene_ids, frame_ids, images in tqdm(dataloader):
118 | features = enet(images)
119 | batch_size = images.shape[0]
120 | for batch_id in range(batch_size):
121 | os.makedirs(ENET_FEATURE_ROOT.format(scene_ids[batch_id]), exist_ok=True)
122 | np.save(ENET_FEATURE_PATH.format(scene_ids[batch_id], frame_ids[batch_id]), features[batch_id].cpu().numpy())
123 |
124 | print("done!")
125 |
126 |
--------------------------------------------------------------------------------
/scripts/project_multiview_features.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import h5py
4 | import torch
5 | import torch.nn as nn
6 | import argparse
7 | import numpy as np
8 | from tqdm import tqdm
9 | from plyfile import PlyData, PlyElement
10 | import math
11 | from imageio import imread
12 | from PIL import Image
13 | import torchvision.transforms as transforms
14 |
15 | sys.path.append(os.path.join(os.getcwd())) # HACK add the root folder
16 | from lib.config import CONF
17 | from lib.projection import ProjectionHelper
18 |
19 | SCANNET_LIST = CONF.SCANNETV2_LIST
20 | SCANNET_DATA = CONF.PATH.SCANNET_DATA
21 | SCANNET_FRAME_ROOT = CONF.SCANNET_FRAMES
22 | SCANNET_FRAME_PATH = os.path.join(SCANNET_FRAME_ROOT, "{}") # name of the file
23 |
24 | ENET_FEATURE_PATH = CONF.ENET_FEATURES_PATH
25 | ENET_FEATURE_DATABASE = CONF.MULTIVIEW
26 |
27 | # projection
28 | INTRINSICS = [[37.01983, 0, 20, 0],[0, 38.52470, 15.5, 0],[0, 0, 1, 0],[0, 0, 0, 1]]
29 | PROJECTOR = ProjectionHelper(INTRINSICS, 0.1, 4.0, [41, 32], 0.05)
30 |
31 | def get_scene_list():
32 | with open(SCANNET_LIST, 'r') as f:
33 | return sorted(list(set(f.read().splitlines())))
34 |
35 | def to_tensor(arr):
36 | return torch.Tensor(arr).cuda()
37 |
38 | def resize_crop_image(image, new_image_dims):
39 | image_dims = [image.shape[1], image.shape[0]]
40 | if image_dims == new_image_dims:
41 | return image
42 | resize_width = int(math.floor(new_image_dims[1] * float(image_dims[0]) / float(image_dims[1])))
43 | image = transforms.Resize([new_image_dims[1], resize_width], interpolation=Image.NEAREST)(Image.fromarray(image))
44 | image = transforms.CenterCrop([new_image_dims[1], new_image_dims[0]])(image)
45 | image = np.array(image)
46 |
47 | return image
48 |
49 | def load_image(file, image_dims):
50 | image = imread(file)
51 | # preprocess
52 | image = resize_crop_image(image, image_dims)
53 | if len(image.shape) == 3: # color image
54 | image = np.transpose(image, [2, 0, 1]) # move feature to front
55 | image = transforms.Normalize(mean=[0.496342, 0.466664, 0.440796], std=[0.277856, 0.28623, 0.291129])(torch.Tensor(image.astype(np.float32) / 255.0))
56 | elif len(image.shape) == 2: # label image
57 | # image = np.expand_dims(image, 0)
58 | pass
59 | else:
60 |         raise ValueError("unsupported image shape: {}".format(image.shape))
61 |
62 | return image
63 |
64 | def load_pose(filename):
65 | lines = open(filename).read().splitlines()
66 | assert len(lines) == 4
67 | lines = [[x[0],x[1],x[2],x[3]] for x in (x.split(" ") for x in lines)]
68 |
69 | return np.asarray(lines).astype(np.float32)
70 |
71 | def load_depth(file, image_dims):
72 | depth_image = imread(file)
73 | # preprocess
74 | depth_image = resize_crop_image(depth_image, image_dims)
75 | depth_image = depth_image.astype(np.float32) / 1000.0
76 |
77 | return depth_image
78 |
79 | def get_scene_data(scene_list):
80 | scene_data = {}
81 | for scene_id in scene_list:
82 | # load the original vertices, not the axis-aligned ones
83 | scene_data[scene_id] = np.load(os.path.join(SCANNET_DATA, scene_id)+"_vert.npy")[:, :3]
84 |
85 | return scene_data
86 |
87 | def compute_projection(points, depth, camera_to_world):
88 | """
89 | :param points: tensor containing all points of the point cloud (num_points, 3)
90 | :param depth: depth map (size: proj_image)
91 | :param camera_to_world: camera pose (4, 4)
92 |
93 | :return indices_3d (array with point indices that correspond to a pixel),
94 | :return indices_2d (array with pixel indices that correspond to a point)
95 |
96 | note:
97 | the first digit of indices represents the number of relevant points
98 | the rest digits are for the projection mapping
99 | """
100 | num_points = points.shape[0]
101 | num_frames = depth.shape[0]
102 | indices_3ds = torch.zeros(num_frames, num_points + 1).long().cuda()
103 | indices_2ds = torch.zeros(num_frames, num_points + 1).long().cuda()
104 |
105 | for i in range(num_frames):
106 | indices = PROJECTOR.compute_projection(to_tensor(points), to_tensor(depth[i]), to_tensor(camera_to_world[i]))
107 | if indices:
108 | indices_3ds[i] = indices[0].long()
109 | indices_2ds[i] = indices[1].long()
110 | print("found {} mappings in {} points from frame {}".format(indices_3ds[i][0], num_points, i))
111 |
112 | return indices_3ds, indices_2ds
113 |
114 | if __name__ == "__main__":
115 | parser = argparse.ArgumentParser()
116 | parser.add_argument('--gpu', type=str, help='gpu', default='0')
117 | parser.add_argument("--maxpool", action="store_true", help="use max pooling to aggregate features \
118 | (use majority voting in label projection mode)")
119 | args = parser.parse_args()
120 |
121 | # setting
122 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
123 | os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
124 |
125 | scene_list = get_scene_list()
126 | scene_data = get_scene_data(scene_list)
127 | with h5py.File(ENET_FEATURE_DATABASE, "w", libver="latest") as database:
128 | print("projecting multiview features to point cloud...")
129 | for scene_id in scene_list:
130 | print("processing {}...".format(scene_id))
131 | scene = scene_data[scene_id]
132 | # load frames
133 | frame_list = list(map(lambda x: x.split(".")[0], sorted(os.listdir(SCANNET_FRAME_ROOT.format(scene_id, "color")))))
134 | scene_images = np.zeros((len(frame_list), 3, 256, 328))
135 | scene_depths = np.zeros((len(frame_list), 32, 41))
136 | scene_poses = np.zeros((len(frame_list), 4, 4))
137 | for i, frame_id in enumerate(frame_list):
138 | scene_images[i] = load_image(SCANNET_FRAME_PATH.format(scene_id, "color", "{}.jpg".format(frame_id)), [328, 256])
139 | scene_depths[i] = load_depth(SCANNET_FRAME_PATH.format(scene_id, "depth", "{}.png".format(frame_id)), [41, 32])
140 | scene_poses[i] = load_pose(SCANNET_FRAME_PATH.format(scene_id, "pose", "{}.txt".format(frame_id)))
141 |
142 | # compute projections for each chunk
143 | projection_3d, projection_2d = compute_projection(scene, scene_depths, scene_poses)
144 |
145 | # compute valid projections
146 | projections = []
147 | for i in range(projection_3d.shape[0]):
148 | num_valid = projection_3d[i, 0]
149 | if num_valid == 0:
150 | continue
151 |
152 | projections.append((frame_list[i], projection_3d[i], projection_2d[i]))
153 |
154 | # # project
155 | # point_features = to_tensor(scene).new(scene.shape[0], 128).fill_(0)
156 | # for i, projection in enumerate(projections):
157 | # frame_id = projection[0]
158 | # projection_3d = projection[1]
159 | # projection_2d = projection[2]
160 | # feat = to_tensor(np.load(ENET_FEATURE_PATH.format(scene_id, frame_id)))
161 | # proj_feat = PROJECTOR.project(feat, projection_3d, projection_2d, scene.shape[0]).transpose(1, 0)
162 | # if i == 0:
163 | # point_features = proj_feat
164 | # else:
165 | # mask = ((point_features == 0).sum(1) == 128).nonzero().squeeze(1)
166 | # point_features[mask] = proj_feat[mask]
167 |
168 | # project
169 | point_features = to_tensor(scene).new(scene.shape[0], 128).fill_(0)
170 | for i, projection in enumerate(projections):
171 | frame_id = projection[0]
172 | projection_3d = projection[1]
173 | projection_2d = projection[2]
174 | feat = to_tensor(np.load(ENET_FEATURE_PATH.format(scene_id, frame_id)))
175 |
176 | proj_feat = PROJECTOR.project(feat, projection_3d, projection_2d, scene.shape[0]).transpose(1, 0)
177 |
178 | if args.maxpool:
179 | # only apply max pooling on the overlapping points
180 | # find out the points that are covered in projection
181 | feat_mask = ((proj_feat == 0).sum(1) != 128).bool()
182 | # find out the points that are not filled with features
183 | point_mask = ((point_features == 0).sum(1) == 128).bool()
184 |
185 | # for the points that are not filled with features
186 | # and are covered in projection,
187 | # simply fill those points with projected features
188 | mask = point_mask * feat_mask
189 | point_features[mask] = proj_feat[mask]
190 |
191 | # for the points that have already been filled with features
192 | # and are covered in projection,
193 | # apply max pooling first and then fill with pooled values
194 | mask = ~point_mask * feat_mask
195 | point_features[mask] = torch.max(point_features[mask], proj_feat[mask])
196 | else:
197 | if i == 0:
198 | point_features = proj_feat
199 | else:
200 | mask = (point_features == 0).sum(1) == 128
201 | point_features[mask] = proj_feat[mask]
202 |
203 | # save
204 | database.create_dataset(scene_id, data=point_features.cpu().numpy())
205 |
206 | print("done!")
207 |
208 |
209 |
--------------------------------------------------------------------------------
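
A tiny self-contained illustration (not repository code) of the --maxpool aggregation above, using 4 feature channels instead of 128: newly projected features fill points that are still empty and are max-pooled into points that already carry features.

import torch

C = 4  # stands in for the 128 ENet feature channels
point_features = torch.zeros(3, C)
point_features[0] = torch.tensor([1., 2., 3., 4.])   # point 0 already has features

proj_feat = torch.zeros(3, C)
proj_feat[0] = torch.tensor([5., 1., 1., 1.])        # this frame covers points 0 and 1
proj_feat[1] = torch.tensor([2., 2., 2., 2.])

feat_mask = ((proj_feat == 0).sum(1) != C).bool()        # covered by this projection
point_mask = ((point_features == 0).sum(1) == C).bool()  # still empty

mask = point_mask * feat_mask                            # empty and covered: fill
point_features[mask] = proj_feat[mask]
mask = ~point_mask * feat_mask                           # filled and covered: max-pool
point_features[mask] = torch.max(point_features[mask], proj_feat[mask])

print(point_features)
# point 0 -> [5., 2., 3., 4.] (elementwise max), point 1 -> [2., 2., 2., 2.], point 2 untouched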
/scripts/project_multiview_labels.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import h5py
4 | import math
5 | import argparse
6 | import torch
7 | import torch.nn as nn
8 | import numpy as np
9 | import pandas as pd
10 | import torchvision.transforms as transforms
11 |
12 | from imageio import imread
13 | from PIL import Image
14 | from tqdm import tqdm
15 | from plyfile import PlyData, PlyElement
16 | from collections import Counter
17 |
18 | sys.path.append(os.path.join(os.getcwd())) # HACK add the root folder
19 | from lib.config import CONF
20 | from lib.projection import ProjectionHelper
21 | from lib.enet import create_enet_for_3d
22 |
23 | SCANNET_LIST = CONF.SCANNETV2_LIST
24 | SCANNET_DATA = CONF.PATH.SCANNET_DATA
25 | SCANNET_FRAME_ROOT = CONF.SCANNET_FRAMES
26 | SCANNET_FRAME_PATH = os.path.join(SCANNET_FRAME_ROOT, "{}") # name of the file
27 |
28 | ENET_FEATURE_PATH = CONF.ENET_FEATURES_PATH
29 | ENET_FEATURE_DATABASE = CONF.MULTIVIEW
30 |
31 | # projection
32 | INTRINSICS = [[37.01983, 0, 20, 0],[0, 38.52470, 15.5, 0],[0, 0, 1, 0],[0, 0, 0, 1]]
33 | PROJECTOR = ProjectionHelper(INTRINSICS, 0.1, 4.0, [41, 32], 0.05)
34 |
35 | ENET_PATH = CONF.ENET_WEIGHTS
36 | ENET_GT_PATH = SCANNET_FRAME_PATH
37 |
38 | NYU40_LABELS = CONF.NYU40_LABELS
39 | SCANNET_LABELS = ['unannotated', 'wall', 'floor', 'chair', 'table', 'desk', 'bed', 'bookshelf', 'sofa', 'sink', 'bathtub', 'toilet', 'curtain', 'counter', 'door', 'window', 'shower curtain', 'refridgerator', 'picture', 'cabinet', 'otherfurniture']
40 |
41 | PC_LABEL_ROOT = os.path.join(CONF.PATH.OUTPUT, "projections")
42 | PC_LABEL_PATH = os.path.join(PC_LABEL_ROOT, "{}.ply")
43 |
44 | def get_nyu40_labels():
45 | labels = ["unannotated"]
46 | labels += pd.read_csv(NYU40_LABELS)["nyu40class"].tolist()
47 |
48 | return labels
49 |
50 | def get_prediction_to_raw():
51 | labels = get_nyu40_labels()
52 | mapping = {i: label for i, label in enumerate(labels)}
53 |
54 | return mapping
55 |
56 | def get_nyu_to_scannet():
57 | nyu_idx_to_nyu_label = get_prediction_to_raw()
58 | scannet_label_to_scannet_idx = {label: i for i, label in enumerate(SCANNET_LABELS)}
59 |
60 | # mapping
61 | nyu_to_scannet = {}
62 | for nyu_idx in range(41):
63 | nyu_label = nyu_idx_to_nyu_label[nyu_idx]
64 | if nyu_label in scannet_label_to_scannet_idx.keys():
65 | scannet_idx = scannet_label_to_scannet_idx[nyu_label]
66 | else:
67 | scannet_idx = 0
68 | nyu_to_scannet[nyu_idx] = scannet_idx
69 |
70 | return nyu_to_scannet
71 |
72 | def create_color_palette():
73 | return {
74 | "unannotated": (0, 0, 0),
75 | "floor": (152, 223, 138),
76 | "wall": (174, 199, 232),
77 | "cabinet": (31, 119, 180),
78 | "bed": (255, 187, 120),
79 | "chair": (188, 189, 34),
80 | "sofa": (140, 86, 75),
81 | "table": (255, 152, 150),
82 | "door": (214, 39, 40),
83 | "window": (197, 176, 213),
84 | "bookshelf": (148, 103, 189),
85 | "picture": (196, 156, 148),
86 | "counter": (23, 190, 207),
87 | "desk": (247, 182, 210),
88 | "curtain": (219, 219, 141),
89 | "refridgerator": (255, 127, 14),
90 | "bathtub": (227, 119, 194),
91 | "shower curtain": (158, 218, 229),
92 | "toilet": (44, 160, 44),
93 | "sink": (112, 128, 144),
94 | "otherfurniture": (82, 84, 163),
95 | }
96 |
97 | def get_scene_list(args):
98 | if args.scene_id == "-1":
99 | with open(SCANNET_LIST, 'r') as f:
100 | return sorted(list(set(f.read().splitlines())))
101 | else:
102 | return [args.scene_id]
103 |
104 | def to_tensor(arr):
105 | return torch.Tensor(arr).cuda()
106 |
107 | def resize_crop_image(image, new_image_dims):
108 | image_dims = [image.shape[1], image.shape[0]]
109 | if image_dims == new_image_dims:
110 | return image
111 | resize_width = int(math.floor(new_image_dims[1] * float(image_dims[0]) / float(image_dims[1])))
112 | image = transforms.Resize([new_image_dims[1], resize_width], interpolation=Image.NEAREST)(Image.fromarray(image))
113 | image = transforms.CenterCrop([new_image_dims[1], new_image_dims[0]])(image)
114 | image = np.array(image)
115 |
116 | return image
117 |
118 | def load_image(file, image_dims):
119 | image = imread(file)
120 | # preprocess
121 | image = resize_crop_image(image, image_dims)
122 | if len(image.shape) == 3: # color image
123 | image = np.transpose(image, [2, 0, 1]) # move feature to front
124 | image = transforms.Normalize(mean=[0.496342, 0.466664, 0.440796], std=[0.277856, 0.28623, 0.291129])(torch.Tensor(image.astype(np.float32) / 255.0))
125 | elif len(image.shape) == 2: # label image
126 | # image = np.expand_dims(image, 0)
127 | pass
128 | else:
129 | raise
130 |
131 | return image
132 |
133 | def load_pose(filename):
134 | lines = open(filename).read().splitlines()
135 | assert len(lines) == 4
136 | lines = [[x[0],x[1],x[2],x[3]] for x in (x.split(" ") for x in lines)]
137 |
138 | return np.asarray(lines).astype(np.float32)
139 |
140 | def load_depth(file, image_dims):
141 | depth_image = imread(file)
142 | # preprocess
143 | depth_image = resize_crop_image(depth_image, image_dims)
144 | depth_image = depth_image.astype(np.float32) / 1000.0
145 |
146 | return depth_image
147 |
148 | def visualize(coords, labels, scene_id):
149 | palette = create_color_palette()
150 | nyu_to_scannet = get_nyu_to_scannet()
151 | vertex = []
152 | for i in range(coords.shape[0]):
153 | vertex.append(
154 | (
155 | coords[i][0],
156 | coords[i][1],
157 | coords[i][2],
158 | palette[SCANNET_LABELS[nyu_to_scannet[labels[i]]]][0],
159 | palette[SCANNET_LABELS[nyu_to_scannet[labels[i]]]][1],
160 | palette[SCANNET_LABELS[nyu_to_scannet[labels[i]]]][2]
161 | )
162 | )
163 |
164 | vertex = np.array(
165 | vertex,
166 | dtype=[
167 | ("x", np.dtype("float32")),
168 | ("y", np.dtype("float32")),
169 | ("z", np.dtype("float32")),
170 | ("red", np.dtype("uint8")),
171 | ("green", np.dtype("uint8")),
172 | ("blue", np.dtype("uint8"))
173 | ]
174 | )
175 |
176 | output_pc = PlyElement.describe(vertex, "vertex")
177 | output_pc = PlyData([output_pc])
178 | os.makedirs(PC_LABEL_ROOT, exist_ok=True)
179 |     output_pc.write(PC_LABEL_PATH.format(scene_id))
180 |
181 | def get_scene_data(scene_list):
182 | scene_data = {}
183 | for scene_id in scene_list:
184 | scene_data[scene_id] = {}
185 | scene_data[scene_id] = np.load(os.path.join(SCANNET_DATA, scene_id)+"_vert.npy")[:, :3]
186 |
187 | return scene_data
188 |
189 | def compute_projection(points, depth, camera_to_world):
190 | """
191 | :param points: tensor containing all points of the point cloud (num_points, 3)
192 | :param depth: depth map (size: proj_image)
193 | :param camera_to_world: camera pose (4, 4)
194 |
195 | :return indices_3d (array with point indices that correspond to a pixel),
196 | :return indices_2d (array with pixel indices that correspond to a point)
197 |
198 | note:
199 | the first digit of indices represents the number of relevant points
200 | the rest digits are for the projection mapping
201 | """
202 | num_points = points.shape[0]
203 | num_frames = depth.shape[0]
204 | indices_3ds = torch.zeros(num_frames, num_points + 1).long().cuda()
205 | indices_2ds = torch.zeros(num_frames, num_points + 1).long().cuda()
206 |
207 | for i in range(num_frames):
208 | indices = PROJECTOR.compute_projection(to_tensor(points), to_tensor(depth[i]), to_tensor(camera_to_world[i]))
209 | if indices:
210 | indices_3ds[i] = indices[0].long()
211 | indices_2ds[i] = indices[1].long()
212 |
213 | return indices_3ds, indices_2ds
214 |
215 | def create_enet():
216 | enet_fixed, enet_trainable, enet_classifier = create_enet_for_3d(41, ENET_PATH, 21)
217 | enet = nn.Sequential(
218 | enet_fixed,
219 | enet_trainable,
220 | enet_classifier
221 | ).cuda()
222 | enet.eval()
223 | for param in enet.parameters():
224 | param.requires_grad = False
225 |
226 | return enet
227 |
228 |
229 | if __name__ == "__main__":
230 | parser = argparse.ArgumentParser()
231 | parser.add_argument("--scene_id", type=str, default="-1")
232 | parser.add_argument("--gt", action="store_true")
233 | parser.add_argument("--maxpool", action="store_true", help="use max pooling to aggregate features \
234 | (use majority voting in label projection mode)")
235 | args = parser.parse_args()
236 |
237 | scene_list = get_scene_list(args)
238 | scene_data = get_scene_data(scene_list)
239 | enet = create_enet()
240 | for scene_id in tqdm(scene_list):
241 | scene = scene_data[scene_id]
242 | # load frames
243 | frame_list = list(map(lambda x: x.split(".")[0], sorted(os.listdir(SCANNET_FRAME_ROOT.format(scene_id, "color")))))
244 | scene_images = np.zeros((len(frame_list), 3, 256, 328))
245 | scene_depths = np.zeros((len(frame_list), 32, 41))
246 | scene_poses = np.zeros((len(frame_list), 4, 4))
247 | for i, frame_id in enumerate(frame_list):
248 | scene_images[i] = load_image(SCANNET_FRAME_PATH.format(scene_id, "color", "{}.jpg".format(frame_id)), [328, 256])
249 | scene_depths[i] = load_depth(SCANNET_FRAME_PATH.format(scene_id, "depth", "{}.png".format(frame_id)), [41, 32])
250 | scene_poses[i] = load_pose(SCANNET_FRAME_PATH.format(scene_id, "pose", "{}.txt".format(frame_id)))
251 |
252 | # compute projections for each chunk
253 | projection_3d, projection_2d = compute_projection(scene, scene_depths, scene_poses)
254 |
255 | # compute valid projections
256 | projections = []
257 | for i in range(projection_3d.shape[0]):
258 | num_valid = projection_3d[i, 0]
259 | if num_valid == 0:
260 | continue
261 |
262 | projections.append((frame_list[i], projection_3d[i], projection_2d[i]))
263 |
264 | # # project
265 | # labels = None
266 | # for i, projection in enumerate(projections):
267 | # frame_id = projection[0]
268 | # projection_3d = projection[1]
269 | # projection_2d = projection[2]
270 | # if args.gt:
271 | # feat = to_tensor(load_image(ENET_GT_PATH.format(scene_id, "labelv2", "{}.png".format(frame_id)), [41, 32])).unsqueeze(0)
272 | # else:
273 | # image = load_image(SCANNET_FRAME_PATH.format(scene_id, "color", "{}.jpg".format(frame_id)), [328, 256])
274 | # feat = enet(to_tensor(image).unsqueeze(0)).max(1)[1].unsqueeze(1)
275 |
276 | # proj_label = PROJECTOR.project(feat, projection_3d, projection_2d, scene.shape[0]).transpose(1, 0)
277 | # if i == 0:
278 | # labels = proj_label
279 | # else:
280 | # labels[labels == 0] = proj_label[labels == 0]
281 |
282 | # project
283 | labels = to_tensor(scene).new(scene.shape[0], len(projections)).fill_(0).long()
284 | for i, projection in enumerate(projections):
285 | frame_id = projection[0]
286 | projection_3d = projection[1]
287 | projection_2d = projection[2]
288 |
289 | if args.gt:
290 | feat = to_tensor(load_image(ENET_GT_PATH.format(scene_id, "labelv2", "{}.png".format(frame_id)), [41, 32])).unsqueeze(0)
291 | else:
292 | image = load_image(SCANNET_FRAME_PATH.format(scene_id, "color", "{}.jpg".format(frame_id)), [328, 256])
293 | feat = enet(to_tensor(image).unsqueeze(0)).max(1)[1].unsqueeze(1)
294 |
295 | proj_label = PROJECTOR.project(feat, projection_3d, projection_2d, scene.shape[0]).transpose(1, 0) # num_points, 1
296 |
297 | if args.maxpool:
298 | # only apply max pooling on the overlapping points
299 | # find out the points that are covered in projection
300 | feat_mask = ((proj_label == 0).sum(1) != 1).bool()
301 | # find out the points that are not filled with labels
302 | point_mask = ((labels == 0).sum(1) == len(projections)).bool()
303 |
304 | # for the points that are not filled with features
305 | # and are covered in projection,
306 | # simply fill those points with labels
307 | mask = point_mask * feat_mask
308 | labels[mask, i] = proj_label[mask, 0]
309 |
310 |                     # for the points that already carry labels
311 |                     # and are covered in this projection,
312 |                     # also record the label to be aggregated by majority voting below
313 | mask = ~point_mask * feat_mask
314 | labels[mask, i] = proj_label[mask, 0]
315 | else:
316 | if i == 0:
317 | labels = proj_label
318 | else:
319 | labels[labels == 0] = proj_label[labels == 0]
320 |
321 | # aggregate
322 | if args.maxpool:
323 | new_labels = []
324 | for label_id in range(labels.shape[0]):
325 | point_label = labels[label_id].cpu().numpy().tolist()
326 | count = dict(Counter(point_label))
327 | count = sorted(count.items(), key=lambda x: x[1], reverse=True)
328 | count = [c for c in count if c[0] != 0]
329 | if count:
330 | new_labels.append(count[0][0])
331 | else:
332 | new_labels.append(0)
333 |
334 | labels = torch.FloatTensor(np.array(new_labels)[:, np.newaxis])
335 |
336 | # output
337 |         visualize(scene, labels.long().squeeze(1).cpu().numpy(), scene_id)
338 |
339 |
--------------------------------------------------------------------------------
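
A tiny illustration (not repository code) of the majority vote applied in the --maxpool branch above: for each point, the most frequent non-zero label across all projecting frames wins, and 'unannotated' (label 0) is only a fallback.

from collections import Counter

point_label = [0, 3, 3, 7, 0, 3]            # labels projected onto one point from 6 frames
count = dict(Counter(point_label))
count = sorted(count.items(), key=lambda x: x[1], reverse=True)
count = [c for c in count if c[0] != 0]      # drop 'unannotated'
print(count[0][0] if count else 0)           # -> 3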
/scripts/train.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import json
4 | import h5py
5 | import argparse
6 | import importlib
7 | import torch
8 | import torch.optim as optim
9 | import torch.nn as nn
10 | import numpy as np
11 |
12 | from torch.utils.data import DataLoader
13 | from datetime import datetime
14 | from copy import deepcopy
15 |
16 | sys.path.append(os.path.join(os.getcwd())) # HACK add the root folder
17 | from data.scannet.model_util_scannet import ScannetDatasetConfig
18 | from lib.dataset import ScannetReferenceDataset
19 | from lib.solver import Solver
20 | from lib.config import CONF
21 | from models.refnet import RefNet
22 |
23 | SCANREFER_TRAIN = json.load(open(os.path.join(CONF.PATH.DATA, "ScanRefer_filtered_train.json")))
24 | SCANREFER_VAL = json.load(open(os.path.join(CONF.PATH.DATA, "ScanRefer_filtered_val.json")))
25 |
26 | # constants
27 | DC = ScannetDatasetConfig()
28 |
29 | def get_dataloader(args, scanrefer, all_scene_list, split, config, augment):
30 | dataset = ScannetReferenceDataset(
31 | scanrefer=scanrefer[split],
32 | scanrefer_all_scene=all_scene_list,
33 | split=split,
34 | num_points=args.num_points,
35 | use_height=(not args.no_height),
36 | use_color=args.use_color,
37 | use_normal=args.use_normal,
38 | use_multiview=args.use_multiview
39 | )
40 | # dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)
41 | dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True, num_workers=4)
42 |
43 | return dataset, dataloader
44 |
45 | def get_model(args):
46 | # initiate model
47 | input_channels = int(args.use_multiview) * 128 + int(args.use_normal) * 3 + int(args.use_color) * 3 + int(not args.no_height)
48 | model = RefNet(
49 | num_class=DC.num_class,
50 | num_heading_bin=DC.num_heading_bin,
51 | num_size_cluster=DC.num_size_cluster,
52 | mean_size_arr=DC.mean_size_arr,
53 | input_feature_dim=input_channels,
54 | num_proposal=args.num_proposals,
55 | use_lang_classifier=(not args.no_lang_cls),
56 | use_bidir=args.use_bidir,
57 | no_reference=args.no_reference
58 | )
59 |
60 | # trainable model
61 | if args.use_pretrained:
62 | # load model
63 | print("loading pretrained VoteNet...")
64 | pretrained_model = RefNet(
65 | num_class=DC.num_class,
66 | num_heading_bin=DC.num_heading_bin,
67 | num_size_cluster=DC.num_size_cluster,
68 | mean_size_arr=DC.mean_size_arr,
69 | num_proposal=args.num_proposals,
70 | input_feature_dim=input_channels,
71 | use_bidir=args.use_bidir,
72 | no_reference=True
73 | )
74 |
75 | pretrained_path = os.path.join(CONF.PATH.OUTPUT, args.use_pretrained, "model_last.pth")
76 | pretrained_model.load_state_dict(torch.load(pretrained_path), strict=False)
77 |
78 | # mount
79 | model.backbone_net = pretrained_model.backbone_net
80 | model.vgen = pretrained_model.vgen
81 | model.proposal = pretrained_model.proposal
82 |
83 | if args.no_detection:
84 | # freeze pointnet++ backbone
85 | for param in model.backbone_net.parameters():
86 | param.requires_grad = False
87 |
88 | # freeze voting
89 | for param in model.vgen.parameters():
90 | param.requires_grad = False
91 |
92 | # freeze detector
93 | for param in model.proposal.parameters():
94 | param.requires_grad = False
95 |
96 | # to CUDA
97 | model = model.cuda()
98 |
99 | return model
100 |
101 | def get_num_params(model):
102 | model_parameters = filter(lambda p: p.requires_grad, model.parameters())
103 | num_params = int(sum([np.prod(p.size()) for p in model_parameters]))
104 |
105 | return num_params
106 |
107 | def get_solver(args, dataloader):
108 | model = get_model(args)
109 | optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd)
110 |
111 | if args.use_checkpoint:
112 | print("loading checkpoint {}...".format(args.use_checkpoint))
113 | stamp = args.use_checkpoint
114 | root = os.path.join(CONF.PATH.OUTPUT, stamp)
115 | checkpoint = torch.load(os.path.join(CONF.PATH.OUTPUT, args.use_checkpoint, "checkpoint.tar"))
116 | model.load_state_dict(checkpoint["model_state_dict"])
117 | optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
118 | else:
119 | stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
120 | if args.tag: stamp += "_"+args.tag.upper()
121 | root = os.path.join(CONF.PATH.OUTPUT, stamp)
122 | os.makedirs(root, exist_ok=True)
123 |
124 | # scheduler parameters for training solely the detection pipeline
125 | LR_DECAY_STEP = [80, 120, 160] if args.no_reference else None
126 | LR_DECAY_RATE = 0.1 if args.no_reference else None
127 | BN_DECAY_STEP = 20 if args.no_reference else None
128 | BN_DECAY_RATE = 0.5 if args.no_reference else None
129 |
130 | solver = Solver(
131 | model=model,
132 | config=DC,
133 | dataloader=dataloader,
134 | optimizer=optimizer,
135 | stamp=stamp,
136 | val_step=args.val_step,
137 | detection=not args.no_detection,
138 | reference=not args.no_reference,
139 | use_lang_classifier=not args.no_lang_cls,
140 | lr_decay_step=LR_DECAY_STEP,
141 | lr_decay_rate=LR_DECAY_RATE,
142 | bn_decay_step=BN_DECAY_STEP,
143 | bn_decay_rate=BN_DECAY_RATE
144 | )
145 | num_params = get_num_params(model)
146 |
147 | return solver, num_params, root
148 |
149 | def save_info(args, root, num_params, train_dataset, val_dataset):
150 | info = {}
151 | for key, value in vars(args).items():
152 | info[key] = value
153 |
154 | info["num_train"] = len(train_dataset)
155 | info["num_val"] = len(val_dataset)
156 | info["num_train_scenes"] = len(train_dataset.scene_list)
157 | info["num_val_scenes"] = len(val_dataset.scene_list)
158 | info["num_params"] = num_params
159 |
160 | with open(os.path.join(root, "info.json"), "w") as f:
161 | json.dump(info, f, indent=4)
162 |
163 | def get_scannet_scene_list(split):
164 | scene_list = sorted([line.rstrip() for line in open(os.path.join(CONF.PATH.SCANNET_META, "scannetv2_{}.txt".format(split)))])
165 |
166 | return scene_list
167 |
168 | def get_scanrefer(scanrefer_train, scanrefer_val, num_scenes):
169 | if args.no_reference:
170 | train_scene_list = get_scannet_scene_list("train")
171 | new_scanrefer_train = []
172 | for scene_id in train_scene_list:
173 | data = deepcopy(SCANREFER_TRAIN[0])
174 | data["scene_id"] = scene_id
175 | new_scanrefer_train.append(data)
176 |
177 | val_scene_list = get_scannet_scene_list("val")
178 | new_scanrefer_val = []
179 | for scene_id in val_scene_list:
180 | data = deepcopy(SCANREFER_VAL[0])
181 | data["scene_id"] = scene_id
182 | new_scanrefer_val.append(data)
183 | else:
184 | # get initial scene list
185 | train_scene_list = sorted(list(set([data["scene_id"] for data in scanrefer_train])))
186 | val_scene_list = sorted(list(set([data["scene_id"] for data in scanrefer_val])))
187 | if num_scenes == -1:
188 | num_scenes = len(train_scene_list)
189 | else:
190 | assert len(train_scene_list) >= num_scenes
191 |
192 | # slice train_scene_list
193 | train_scene_list = train_scene_list[:num_scenes]
194 |
195 | # filter data in chosen scenes
196 | new_scanrefer_train = []
197 | for data in scanrefer_train:
198 | if data["scene_id"] in train_scene_list:
199 | new_scanrefer_train.append(data)
200 |
201 | new_scanrefer_val = scanrefer_val
202 |
203 | # all scanrefer scene
204 | all_scene_list = train_scene_list + val_scene_list
205 |
206 | print("train on {} samples and val on {} samples".format(len(new_scanrefer_train), len(new_scanrefer_val)))
207 |
208 | return new_scanrefer_train, new_scanrefer_val, all_scene_list
209 |
210 | def train(args):
211 | # init training dataset
212 | print("preparing data...")
213 | scanrefer_train, scanrefer_val, all_scene_list = get_scanrefer(SCANREFER_TRAIN, SCANREFER_VAL, args.num_scenes)
214 | scanrefer = {
215 | "train": scanrefer_train,
216 | "val": scanrefer_val
217 | }
218 |
219 | # dataloader
220 | train_dataset, train_dataloader = get_dataloader(args, scanrefer, all_scene_list, "train", DC, True)
221 | val_dataset, val_dataloader = get_dataloader(args, scanrefer, all_scene_list, "val", DC, False)
222 | dataloader = {
223 | "train": train_dataloader,
224 | "val": val_dataloader
225 | }
226 |
227 | print("initializing...")
228 | solver, num_params, root = get_solver(args, dataloader)
229 |
230 | print("Start training...\n")
231 | save_info(args, root, num_params, train_dataset, val_dataset)
232 | solver(args.epoch, args.verbose)
233 |
234 | if __name__ == "__main__":
235 | parser = argparse.ArgumentParser()
236 | parser.add_argument("--tag", type=str, help="tag for the training, e.g. cuda_wl", default="")
237 | parser.add_argument("--gpu", type=str, help="gpu", default="0")
238 | parser.add_argument("--batch_size", type=int, help="batch size", default=14)
239 | parser.add_argument("--epoch", type=int, help="number of epochs", default=50)
240 | parser.add_argument("--verbose", type=int, help="iterations of showing verbose", default=10)
241 | parser.add_argument("--val_step", type=int, help="iterations of validating", default=5000)
242 | parser.add_argument("--lr", type=float, help="learning rate", default=1e-3)
243 | parser.add_argument("--wd", type=float, help="weight decay", default=1e-5)
244 | parser.add_argument("--num_points", type=int, default=40000, help="Point Number [default: 40000]")
245 | parser.add_argument("--num_proposals", type=int, default=256, help="Proposal number [default: 256]")
246 | parser.add_argument("--num_scenes", type=int, default=-1, help="Number of scenes [default: -1]")
247 | parser.add_argument("--seed", type=int, default=42, help="random seed")
248 | parser.add_argument("--no_height", action="store_true", help="Do NOT use height signal in input.")
249 |     parser.add_argument("--no_augment", action="store_true", help="Do NOT use data augmentation on the input point clouds.")
250 | parser.add_argument("--no_lang_cls", action="store_true", help="Do NOT use language classifier.")
251 | parser.add_argument("--no_detection", action="store_true", help="Do NOT train the detection module.")
252 | parser.add_argument("--no_reference", action="store_true", help="Do NOT train the localization module.")
253 | parser.add_argument("--use_color", action="store_true", help="Use RGB color in input.")
254 |     parser.add_argument("--use_normal", action="store_true", help="Use normals in input.")
255 | parser.add_argument("--use_multiview", action="store_true", help="Use multiview images.")
256 | parser.add_argument("--use_bidir", action="store_true", help="Use bi-directional GRU.")
257 | parser.add_argument("--use_pretrained", type=str, help="Specify the folder name containing the pretrained detection module.")
258 | parser.add_argument("--use_checkpoint", type=str, help="Specify the checkpoint root", default="")
259 | args = parser.parse_args()
260 |
261 | # setting
262 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
263 | os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
264 |
265 | # reproducibility
266 | torch.manual_seed(args.seed)
267 | torch.backends.cudnn.deterministic = True
268 | torch.backends.cudnn.benchmark = False
269 | np.random.seed(args.seed)
270 |
271 | train(args)
272 |
273 |
--------------------------------------------------------------------------------
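
For reference, a small sketch (not repository code) of how get_model derives the per-point feature dimension from the CLI flags; the xyz coordinates themselves are handled separately by the backbone.

def input_channels(use_multiview=False, use_normal=False, use_color=False, no_height=False):
    # mirrors the expression in get_model above
    return int(use_multiview) * 128 + int(use_normal) * 3 + int(use_color) * 3 + int(not no_height)

print(input_channels())                                     # 1   (height only, the default)
print(input_channels(use_color=True))                       # 4   (RGB + height)
print(input_channels(use_multiview=True, use_normal=True))  # 132 (multiview + normals + height)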
/utils/box_util.py:
--------------------------------------------------------------------------------
1 | """
2 | Helper functions for calculating 2D and 3D bounding box IoU.
3 | From: https://github.com/facebookresearch/votenet/blob/master/utils/box_util.py
4 |
5 | Collected and written by Charles R. Qi
6 | Last modified: Jul 2019
7 | """
8 |
9 | from __future__ import print_function
10 |
11 | import numpy as np
12 | from scipy.spatial import ConvexHull
13 |
14 | def polygon_clip(subjectPolygon, clipPolygon):
15 | """ Clip a polygon with another polygon.
16 |
17 | Ref: https://rosettacode.org/wiki/Sutherland-Hodgman_polygon_clipping#Python
18 |
19 | Args:
20 | subjectPolygon: a list of (x,y) 2d points, any polygon.
21 | clipPolygon: a list of (x,y) 2d points, has to be *convex*
22 | Note:
23 | **points have to be counter-clockwise ordered**
24 |
25 | Return:
26 | a list of (x,y) vertex point for the intersection polygon.
27 | """
28 | def inside(p):
29 | return(cp2[0]-cp1[0])*(p[1]-cp1[1]) > (cp2[1]-cp1[1])*(p[0]-cp1[0])
30 |
31 | def computeIntersection():
32 | dc = [ cp1[0] - cp2[0], cp1[1] - cp2[1] ]
33 | dp = [ s[0] - e[0], s[1] - e[1] ]
34 | n1 = cp1[0] * cp2[1] - cp1[1] * cp2[0]
35 | n2 = s[0] * e[1] - s[1] * e[0]
36 | n3 = 1.0 / (dc[0] * dp[1] - dc[1] * dp[0])
37 | return [(n1*dp[0] - n2*dc[0]) * n3, (n1*dp[1] - n2*dc[1]) * n3]
38 |
39 | outputList = subjectPolygon
40 | cp1 = clipPolygon[-1]
41 |
42 | for clipVertex in clipPolygon:
43 | cp2 = clipVertex
44 | inputList = outputList
45 | outputList = []
46 | s = inputList[-1]
47 |
48 | for subjectVertex in inputList:
49 | e = subjectVertex
50 | if inside(e):
51 | if not inside(s):
52 | outputList.append(computeIntersection())
53 | outputList.append(e)
54 | elif inside(s):
55 | outputList.append(computeIntersection())
56 | s = e
57 | cp1 = cp2
58 | if len(outputList) == 0:
59 | return None
60 | return(outputList)
61 |
62 | def poly_area(x,y):
63 | """ Ref: http://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates """
64 | return 0.5*np.abs(np.dot(x,np.roll(y,1))-np.dot(y,np.roll(x,1)))
65 |
66 | def poly_area_batch(x,y):
67 | """ Ref: http://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates """
68 | return 0.5 * np.abs(np.matmul(np.expand_dims(x, axis=1), np.roll(np.expand_dims(y, axis=2), 1, axis=1)) \
69 | - np.matmul(np.expand_dims(y, axis=1), np.roll(np.expand_dims(x, axis=2), 1, axis=1))).squeeze(axis=(1,2))
70 |
71 | def convex_hull_intersection(p1, p2):
72 | """ Compute area of two convex hull's intersection area.
73 | p1,p2 are a list of (x,y) tuples of hull vertices.
74 | return a list of (x,y) for the intersection and its volume
75 | """
76 | inter_p = polygon_clip(p1,p2)
77 | if inter_p is not None:
78 | hull_inter = ConvexHull(inter_p)
79 | return inter_p, hull_inter.volume
80 | else:
81 | return None, 0.0
82 |
83 | def box3d_vol(corners):
84 | ''' corners: (8,3) no assumption on axis direction '''
85 | a = np.sqrt(np.sum((corners[0,:] - corners[1,:])**2))
86 | b = np.sqrt(np.sum((corners[1,:] - corners[2,:])**2))
87 | c = np.sqrt(np.sum((corners[0,:] - corners[4,:])**2))
88 | return a*b*c
89 |
90 | def is_clockwise(p):
91 | x = p[:,0]
92 | y = p[:,1]
93 | return np.dot(x,np.roll(y,1))-np.dot(y,np.roll(x,1)) > 0
94 |
95 | def box3d_iou(corners1, corners2):
96 | ''' Compute 3D bounding box IoU.
97 |
98 | Input:
99 | corners1: numpy array (8,3), assume up direction is Z
100 | corners2: numpy array (8,3), assume up direction is Z
101 | Output:
102 | iou: 3D bounding box IoU
103 |
104 | '''
105 | # # corner points are in counter clockwise order
106 | # rect1 = [(corners1[i,0], corners1[i,2]) for i in range(3,-1,-1)]
107 | # rect2 = [(corners2[i,0], corners2[i,2]) for i in range(3,-1,-1)]
108 | # area1 = poly_area(np.array(rect1)[:,0], np.array(rect1)[:,1])
109 | # area2 = poly_area(np.array(rect2)[:,0], np.array(rect2)[:,1])
110 | # inter, inter_area = convex_hull_intersection(rect1, rect2)
111 | # iou_2d = inter_area/(area1+area2-inter_area)
112 | # ymax = min(corners1[0,1], corners2[0,1])
113 | # ymin = max(corners1[4,1], corners2[4,1])
114 | # inter_vol = inter_area * max(0.0, ymax-ymin)
115 | # vol1 = box3d_vol(corners1)
116 | # vol2 = box3d_vol(corners2)
117 | # iou = inter_vol / (vol1 + vol2 - inter_vol)
118 | # return iou, iou_2d
119 |
120 | x_min_1, x_max_1, y_min_1, y_max_1, z_min_1, z_max_1 = get_box3d_min_max(corners1)
121 | x_min_2, x_max_2, y_min_2, y_max_2, z_min_2, z_max_2 = get_box3d_min_max(corners2)
122 | xA = np.maximum(x_min_1, x_min_2)
123 | yA = np.maximum(y_min_1, y_min_2)
124 | zA = np.maximum(z_min_1, z_min_2)
125 | xB = np.minimum(x_max_1, x_max_2)
126 | yB = np.minimum(y_max_1, y_max_2)
127 | zB = np.minimum(z_max_1, z_max_2)
128 | inter_vol = np.maximum((xB - xA), 0) * np.maximum((yB - yA), 0) * np.maximum((zB - zA), 0)
129 | box_vol_1 = (x_max_1 - x_min_1) * (y_max_1 - y_min_1) * (z_max_1 - z_min_1)
130 | box_vol_2 = (x_max_2 - x_min_2) * (y_max_2 - y_min_2) * (z_max_2 - z_min_2)
131 | iou = inter_vol / (box_vol_1 + box_vol_2 - inter_vol + 1e-8)
132 |
133 | return iou
134 |
135 | def get_box3d_min_max(corner):
136 | ''' Compute min and max coordinates for 3D bounding box
137 | Note: only for axis-aligned bounding boxes
138 |
139 | Input:
140 |         corner: numpy array (8,3), assume up direction is Z
141 | Output:
142 |         box_min_max: min and max coordinates of the 3D bounding box
143 |
144 | '''
145 |
146 | min_coord = corner.min(axis=0)
147 | max_coord = corner.max(axis=0)
148 | x_min, x_max = min_coord[0], max_coord[0]
149 | y_min, y_max = min_coord[1], max_coord[1]
150 | z_min, z_max = min_coord[2], max_coord[2]
151 |
152 | return x_min, x_max, y_min, y_max, z_min, z_max
153 |
154 | def box3d_iou_batch(corners1, corners2):
155 | ''' Compute 3D bounding box IoU.
156 | Note: only for axis-aligned bounding boxes
157 |
158 | Input:
159 | corners1: numpy array (N,8,3), assume up direction is Z (batch of N samples)
160 | corners2: numpy array (N,8,3), assume up direction is Z (batch of N samples)
161 | Output:
162 | iou: an array of 3D bounding box IoU
163 |
164 | '''
165 |
166 | x_min_1, x_max_1, y_min_1, y_max_1, z_min_1, z_max_1 = get_box3d_min_max_batch(corners1)
167 | x_min_2, x_max_2, y_min_2, y_max_2, z_min_2, z_max_2 = get_box3d_min_max_batch(corners2)
168 | xA = np.maximum(x_min_1, x_min_2)
169 | yA = np.maximum(y_min_1, y_min_2)
170 | zA = np.maximum(z_min_1, z_min_2)
171 | xB = np.minimum(x_max_1, x_max_2)
172 | yB = np.minimum(y_max_1, y_max_2)
173 | zB = np.minimum(z_max_1, z_max_2)
174 | inter_vol = np.maximum((xB - xA), 0) * np.maximum((yB - yA), 0) * np.maximum((zB - zA), 0)
175 | box_vol_1 = (x_max_1 - x_min_1) * (y_max_1 - y_min_1) * (z_max_1 - z_min_1)
176 | box_vol_2 = (x_max_2 - x_min_2) * (y_max_2 - y_min_2) * (z_max_2 - z_min_2)
177 | iou = inter_vol / (box_vol_1 + box_vol_2 - inter_vol + 1e-8)
178 |
179 | return iou
180 |
181 | def get_box3d_min_max_batch(corner):
182 | ''' Compute min and max coordinates for 3D bounding box
183 | Note: only for axis-aligned bounding boxes
184 |
185 | Input:
186 |         corner: numpy array (N,8,3), assume up direction is Z (batch of N samples)
187 | Output:
188 |         box_min_max: arrays of min and max coordinates of the 3D bounding boxes
189 |
190 | '''
191 |
192 | min_coord = corner.min(axis=1)
193 | max_coord = corner.max(axis=1)
194 | x_min, x_max = min_coord[:, 0], max_coord[:, 0]
195 | y_min, y_max = min_coord[:, 1], max_coord[:, 1]
196 | z_min, z_max = min_coord[:, 2], max_coord[:, 2]
197 |
198 | return x_min, x_max, y_min, y_max, z_min, z_max
199 |
200 | def get_iou(bb1, bb2):
201 | """
202 | Calculate the Intersection over Union (IoU) of two 2D bounding boxes.
203 |
204 | Parameters
205 | ----------
206 | bb1 : dict
207 | Keys: {'x1', 'x2', 'y1', 'y2'}
208 | The (x1, y1) position is at the top left corner,
209 | the (x2, y2) position is at the bottom right corner
210 | bb2 : dict
211 | Keys: {'x1', 'x2', 'y1', 'y2'}
212 |         The (x1, y1) position is at the top left corner,
213 | the (x2, y2) position is at the bottom right corner
214 |
215 | Returns
216 | -------
217 | float
218 | in [0, 1]
219 | """
220 | assert bb1['x1'] < bb1['x2']
221 | assert bb1['y1'] < bb1['y2']
222 | assert bb2['x1'] < bb2['x2']
223 | assert bb2['y1'] < bb2['y2']
224 |
225 | # determine the coordinates of the intersection rectangle
226 | x_left = max(bb1['x1'], bb2['x1'])
227 | y_top = max(bb1['y1'], bb2['y1'])
228 | x_right = min(bb1['x2'], bb2['x2'])
229 | y_bottom = min(bb1['y2'], bb2['y2'])
230 |
231 | if x_right < x_left or y_bottom < y_top:
232 | return 0.0
233 |
234 | # The intersection of two axis-aligned bounding boxes is always an
235 | # axis-aligned bounding box
236 | intersection_area = (x_right - x_left) * (y_bottom - y_top)
237 |
238 | # compute the area of both AABBs
239 | bb1_area = (bb1['x2'] - bb1['x1']) * (bb1['y2'] - bb1['y1'])
240 | bb2_area = (bb2['x2'] - bb2['x1']) * (bb2['y2'] - bb2['y1'])
241 |
242 | # compute the intersection over union by taking the intersection
243 | # area and dividing it by the sum of prediction + ground-truth
244 |     # areas - the intersection area
245 | iou = intersection_area / float(bb1_area + bb2_area - intersection_area)
246 | assert iou >= 0.0
247 | assert iou <= 1.0
248 | return iou
249 |
250 | def box2d_iou(box1, box2):
251 | ''' Compute 2D bounding box IoU.
252 |
253 | Input:
254 | box1: tuple of (xmin,ymin,xmax,ymax)
255 | box2: tuple of (xmin,ymin,xmax,ymax)
256 | Output:
257 | iou: 2D IoU scalar
258 | '''
259 | return get_iou({'x1':box1[0], 'y1':box1[1], 'x2':box1[2], 'y2':box1[3]}, \
260 | {'x1':box2[0], 'y1':box2[1], 'x2':box2[2], 'y2':box2[3]})
261 |
262 | # -----------------------------------------------------------
263 | # Convert from box parameters to corner coordinates
264 | # -----------------------------------------------------------
265 | def roty(t):
266 | """Rotation about the y-axis."""
267 | c = np.cos(t)
268 | s = np.sin(t)
269 | return np.array([[c, 0, s],
270 | [0, 1, 0],
271 | [-s, 0, c]])
272 |
273 | def roty_batch(t):
274 | """Rotation about the y-axis.
275 | t: (x1,x2,...xn)
276 | return: (x1,x2,...,xn,3,3)
277 | """
278 | input_shape = t.shape
279 | output = np.zeros(tuple(list(input_shape)+[3,3]))
280 | c = np.cos(t)
281 | s = np.sin(t)
282 | output[...,0,0] = c
283 | output[...,0,2] = s
284 | output[...,1,1] = 1
285 | output[...,2,0] = -s
286 | output[...,2,2] = c
287 | return output
288 |
289 |
290 | def get_3d_box(box_size, heading_angle, center):
291 |     ''' box_size is array(l,w,h), heading_angle is in radians, clockwise from the positive x axis, center is xyz of box center
292 |         output (8,3) array for 3D box corners
293 | Similar to utils/compute_orientation_3d
294 | '''
295 | R = roty(heading_angle)
296 | l,w,h = box_size
297 | # x_corners = [l/2,l/2,-l/2,-l/2,l/2,l/2,-l/2,-l/2]
298 | # y_corners = [h/2,h/2,h/2,h/2,-h/2,-h/2,-h/2,-h/2]
299 | # z_corners = [w/2,-w/2,-w/2,w/2,w/2,-w/2,-w/2,w/2]
300 | x_corners = [l/2,l/2,-l/2,-l/2,l/2,l/2,-l/2,-l/2]
301 | y_corners = [w/2,-w/2,-w/2,w/2,w/2,-w/2,-w/2,w/2]
302 | z_corners = [h/2,h/2,h/2,h/2,-h/2,-h/2,-h/2,-h/2]
303 | corners_3d = np.dot(R, np.vstack([x_corners,y_corners,z_corners]))
304 | corners_3d[0,:] = corners_3d[0,:] + center[0]
305 | corners_3d[1,:] = corners_3d[1,:] + center[1]
306 | corners_3d[2,:] = corners_3d[2,:] + center[2]
307 | corners_3d = np.transpose(corners_3d)
308 | return corners_3d
309 |
310 | def get_3d_box_batch(box_size, heading_angle, center):
311 | ''' box_size: [x1,x2,...,xn,3]
312 | heading_angle: [x1,x2,...,xn]
313 | center: [x1,x2,...,xn,3]
314 | Return:
315 |         [x1,x2,...,xn,8,3]
316 | '''
317 | input_shape = heading_angle.shape
318 | R = roty_batch(heading_angle)
319 | l = np.expand_dims(box_size[...,0], -1) # [x1,...,xn,1]
320 | w = np.expand_dims(box_size[...,1], -1)
321 | h = np.expand_dims(box_size[...,2], -1)
322 | corners_3d = np.zeros(tuple(list(input_shape)+[8,3]))
323 | # corners_3d[...,:,0] = np.concatenate((l/2,l/2,-l/2,-l/2,l/2,l/2,-l/2,-l/2), -1)
324 | # corners_3d[...,:,1] = np.concatenate((h/2,h/2,h/2,h/2,-h/2,-h/2,-h/2,-h/2), -1)
325 | # corners_3d[...,:,2] = np.concatenate((w/2,-w/2,-w/2,w/2,w/2,-w/2,-w/2,w/2), -1)
326 | corners_3d[...,:,0] = np.concatenate((l/2,l/2,-l/2,-l/2,l/2,l/2,-l/2,-l/2), -1)
327 | corners_3d[...,:,1] = np.concatenate((w/2,-w/2,-w/2,w/2,w/2,-w/2,-w/2,w/2), -1)
328 | corners_3d[...,:,2] = np.concatenate((h/2,h/2,h/2,h/2,-h/2,-h/2,-h/2,-h/2), -1)
329 | tlist = [i for i in range(len(input_shape))]
330 | tlist += [len(input_shape)+1, len(input_shape)]
331 | corners_3d = np.matmul(corners_3d, np.transpose(R, tuple(tlist)))
332 | corners_3d += np.expand_dims(center, -2)
333 | return corners_3d
334 |
--------------------------------------------------------------------------------
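For reference, a minimal usage sketch (not part of the repository) showing how get_3d_box and box3d_iou fit together; array shapes follow the docstrings above:

import numpy as np
from utils.box_util import get_3d_box, box3d_iou

# two axis-aligned unit cubes (heading 0), offset by 0.5 along x
corners_a = get_3d_box(np.array([1.0, 1.0, 1.0]), 0.0, np.array([0.0, 0.0, 0.0]))
corners_b = get_3d_box(np.array([1.0, 1.0, 1.0]), 0.0, np.array([0.5, 0.0, 0.0]))
print(box3d_iou(corners_a, corners_b))  # intersection 0.5, union 1.5 -> IoU ~ 0.333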
/utils/eta.py:
--------------------------------------------------------------------------------
1 | '''
2 | File Created: Monday, 25th November 2019 1:35:30 pm
3 | Author: Dave Zhenyu Chen (zhenyu.chen@tum.de)
4 | '''
5 |
6 | def get_eta(start, end, extra, num_left):
7 | exe_s = end - start
8 | eta_s = (exe_s + extra) * num_left
9 | eta = {'h': 0, 'm': 0, 's': 0}
10 | if eta_s < 60:
11 | eta['s'] = int(eta_s)
12 | elif eta_s >= 60 and eta_s < 3600:
13 | eta['m'] = int(eta_s / 60)
14 | eta['s'] = int(eta_s % 60)
15 | else:
16 | eta['h'] = int(eta_s / (60 * 60))
17 | eta['m'] = int(eta_s % (60 * 60) / 60)
18 | eta['s'] = int(eta_s % (60 * 60) % 60)
19 |
20 | return eta
21 |
22 | def decode_eta(eta_sec):
23 | eta = {'h': 0, 'm': 0, 's': 0}
24 | if eta_sec < 60:
25 | eta['s'] = int(eta_sec)
26 | elif eta_sec >= 60 and eta_sec < 3600:
27 | eta['m'] = int(eta_sec / 60)
28 | eta['s'] = int(eta_sec % 60)
29 | else:
30 | eta['h'] = int(eta_sec / (60 * 60))
31 | eta['m'] = int(eta_sec % (60 * 60) / 60)
32 | eta['s'] = int(eta_sec % (60 * 60) % 60)
33 |
34 | return eta
--------------------------------------------------------------------------------
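For reference, a small usage sketch (not part of the repository) of decode_eta:

from utils.eta import decode_eta

eta = decode_eta(3725)                      # 3725 s -> 1 h, 2 m, 5 s
print("ETA: {h}h {m}m {s}s".format(**eta))  # ETA: 1h 2m 5s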
/utils/eval_det.py:
--------------------------------------------------------------------------------
1 | """
2 | Generic Code for Object Detection Evaluation
3 | From: https://github.com/facebookresearch/votenet/blob/master/utils/eval_det.py
4 |
5 | Input:
6 | For each class:
7 | For each image:
8 | Predictions: box, score
9 | Groundtruths: box
10 |
11 | Output:
12 | For each class:
13 |         precision-recall and average precision
14 |
15 | Author: Charles R. Qi
16 |
17 | Ref: https://raw.githubusercontent.com/rbgirshick/py-faster-rcnn/master/lib/datasets/voc_eval.py
18 | """
19 | import numpy as np
20 |
21 | def voc_ap(rec, prec, use_07_metric=False):
22 | """ ap = voc_ap(rec, prec, [use_07_metric])
23 | Compute VOC AP given precision and recall.
24 | If use_07_metric is true, uses the
25 | VOC 07 11 point method (default:False).
26 | """
27 | if use_07_metric:
28 | # 11 point metric
29 | ap = 0.
30 | for t in np.arange(0., 1.1, 0.1):
31 | if np.sum(rec >= t) == 0:
32 | p = 0
33 | else:
34 | p = np.max(prec[rec >= t])
35 | ap = ap + p / 11.
36 | else:
37 | # correct AP calculation
38 | # first append sentinel values at the end
39 | mrec = np.concatenate(([0.], rec, [1.]))
40 | mpre = np.concatenate(([0.], prec, [0.]))
41 |
42 | # compute the precision envelope
43 | for i in range(mpre.size - 1, 0, -1):
44 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
45 |
46 | # to calculate area under PR curve, look for points
47 | # where X axis (recall) changes value
48 | i = np.where(mrec[1:] != mrec[:-1])[0]
49 |
50 | # and sum (\Delta recall) * prec
51 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
52 | return ap
53 |
54 | import os
55 | import sys
56 | BASE_DIR = os.path.dirname(os.path.abspath(__file__))
57 | from utils.metric_util import calc_iou # axis-aligned 3D box IoU
58 | def get_iou(bb1, bb2):
59 | """ Compute IoU of two bounding boxes.
60 |     ** Define your box IoU function HERE **
61 | """
62 | #pass
63 | iou3d = calc_iou(bb1, bb2)
64 | return iou3d
65 |
66 | from utils.box_util import box3d_iou
67 | def get_iou_obb(bb1,bb2):
68 | iou3d = box3d_iou(bb1,bb2)
69 | return iou3d
70 |
71 | def get_iou_main(get_iou_func, args):
72 | return get_iou_func(*args)
73 |
74 | def eval_det_cls(pred, gt, ovthresh=0.25, use_07_metric=False, get_iou_func=get_iou):
75 | """ Generic functions to compute precision/recall for object detection
76 | for a single class.
77 | Input:
78 | pred: map of {img_id: [(bbox, score)]} where bbox is numpy array
79 | gt: map of {img_id: [bbox]}
80 | ovthresh: scalar, iou threshold
81 | use_07_metric: bool, if True use VOC07 11 point method
82 | Output:
83 | rec: numpy array of length nd
84 | prec: numpy array of length nd
85 | ap: scalar, average precision
86 | """
87 |
88 | # construct gt objects
89 | class_recs = {} # {img_id: {'bbox': bbox list, 'det': matched list}}
90 | npos = 0
91 | for img_id in gt.keys():
92 | bbox = np.array(gt[img_id])
93 | det = [False] * len(bbox)
94 | npos += len(bbox)
95 | class_recs[img_id] = {'bbox': bbox, 'det': det}
96 | # pad empty list to all other imgids
97 | for img_id in pred.keys():
98 | if img_id not in gt:
99 | class_recs[img_id] = {'bbox': np.array([]), 'det': []}
100 |
101 | # construct dets
102 | image_ids = []
103 | confidence = []
104 | BB = []
105 | for img_id in pred.keys():
106 | for box,score in pred[img_id]:
107 | image_ids.append(img_id)
108 | confidence.append(score)
109 | BB.append(box)
110 | confidence = np.array(confidence)
111 | BB = np.array(BB) # (nd,4 or 8,3 or 6)
112 |
113 | # sort by confidence
114 | sorted_ind = np.argsort(-confidence)
115 | sorted_scores = np.sort(-confidence)
116 | BB = BB[sorted_ind, ...]
117 | image_ids = [image_ids[x] for x in sorted_ind]
118 |
119 | # go down dets and mark TPs and FPs
120 | nd = len(image_ids)
121 | tp = np.zeros(nd)
122 | fp = np.zeros(nd)
123 | for d in range(nd):
124 | #if d%100==0: print(d)
125 | R = class_recs[image_ids[d]]
126 | bb = BB[d,...].astype(float)
127 | ovmax = -np.inf
128 | BBGT = R['bbox'].astype(float)
129 |
130 | if BBGT.size > 0:
131 | # compute overlaps
132 | for j in range(BBGT.shape[0]):
133 | iou = get_iou_main(get_iou_func, (bb, BBGT[j,...]))
134 | if iou > ovmax:
135 | ovmax = iou
136 | jmax = j
137 |
138 | #print d, ovmax
139 | if ovmax > ovthresh:
140 | if not R['det'][jmax]:
141 | tp[d] = 1.
142 | R['det'][jmax] = 1
143 | else:
144 | fp[d] = 1.
145 | else:
146 | fp[d] = 1.
147 |
148 | # compute precision recall
149 | fp = np.cumsum(fp)
150 | tp = np.cumsum(tp)
151 | rec = tp / float(npos + 1e-8)
152 | #print('NPOS: ', npos)
153 | # avoid divide by zero in case the first detection matches a difficult
154 | # ground truth
155 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
156 | ap = voc_ap(rec, prec, use_07_metric)
157 |
158 | return rec, prec, ap
159 |
160 | def eval_det_cls_wrapper(arguments):
161 | pred, gt, ovthresh, use_07_metric, get_iou_func = arguments
162 | rec, prec, ap = eval_det_cls(pred, gt, ovthresh, use_07_metric, get_iou_func)
163 | return (rec, prec, ap)
164 |
165 | def eval_det(pred_all, gt_all, ovthresh=0.25, use_07_metric=False, get_iou_func=get_iou):
166 | """ Generic functions to compute precision/recall for object detection
167 | for multiple classes.
168 | Input:
169 | pred_all: map of {img_id: [(classname, bbox, score)]}
170 | gt_all: map of {img_id: [(classname, bbox)]}
171 | ovthresh: scalar, iou threshold
172 | use_07_metric: bool, if true use VOC07 11 point method
173 | Output:
174 | rec: {classname: rec}
175 | prec: {classname: prec_all}
176 | ap: {classname: scalar}
177 | """
178 | pred = {} # map {classname: pred}
179 | gt = {} # map {classname: gt}
180 | for img_id in pred_all.keys():
181 | for classname, bbox, score in pred_all[img_id]:
182 | if classname not in pred: pred[classname] = {}
183 | if img_id not in pred[classname]:
184 | pred[classname][img_id] = []
185 | if classname not in gt: gt[classname] = {}
186 | if img_id not in gt[classname]:
187 | gt[classname][img_id] = []
188 | pred[classname][img_id].append((bbox,score))
189 | for img_id in gt_all.keys():
190 | for classname, bbox in gt_all[img_id]:
191 | if classname not in gt: gt[classname] = {}
192 | if img_id not in gt[classname]:
193 | gt[classname][img_id] = []
194 | gt[classname][img_id].append(bbox)
195 |
196 | rec = {}
197 | prec = {}
198 | ap = {}
199 | for classname in gt.keys():
200 | print('Computing AP for class: ', classname)
201 | rec[classname], prec[classname], ap[classname] = eval_det_cls(pred[classname], gt[classname], ovthresh, use_07_metric, get_iou_func)
202 | print(classname, ap[classname])
203 |
204 | return rec, prec, ap
205 |
206 | from multiprocessing import Pool
207 | def eval_det_multiprocessing(pred_all, gt_all, ovthresh=0.25, use_07_metric=False, get_iou_func=get_iou):
208 | """ Generic functions to compute precision/recall for object detection
209 | for multiple classes.
210 | Input:
211 | pred_all: map of {img_id: [(classname, bbox, score)]}
212 | gt_all: map of {img_id: [(classname, bbox)]}
213 | ovthresh: scalar, iou threshold
214 | use_07_metric: bool, if true use VOC07 11 point method
215 | Output:
216 | rec: {classname: rec}
217 | prec: {classname: prec_all}
218 | ap: {classname: scalar}
219 | """
220 | pred = {} # map {classname: pred}
221 | gt = {} # map {classname: gt}
222 | for img_id in pred_all.keys():
223 | for classname, bbox, score in pred_all[img_id]:
224 | if classname not in pred: pred[classname] = {}
225 | if img_id not in pred[classname]:
226 | pred[classname][img_id] = []
227 | if classname not in gt: gt[classname] = {}
228 | if img_id not in gt[classname]:
229 | gt[classname][img_id] = []
230 | pred[classname][img_id].append((bbox,score))
231 | for img_id in gt_all.keys():
232 | for classname, bbox in gt_all[img_id]:
233 | if classname not in gt: gt[classname] = {}
234 | if img_id not in gt[classname]:
235 | gt[classname][img_id] = []
236 | gt[classname][img_id].append(bbox)
237 |
238 | rec = {}
239 | prec = {}
240 | ap = {}
241 | p = Pool(processes=10)
242 | ret_values = p.map(eval_det_cls_wrapper, [(pred[classname], gt[classname], ovthresh, use_07_metric, get_iou_func) for classname in gt.keys() if classname in pred])
243 | p.close()
244 |     # ret_values only covers classes present in both gt and pred, so map the
245 |     # results back by class name instead of by position in gt.keys()
246 |     ret_values = dict(zip([classname for classname in gt.keys() if classname in pred], ret_values))
247 |     for classname in gt.keys():
248 |         if classname in pred:
249 |             rec[classname], prec[classname], ap[classname] = ret_values[classname]
250 |         else:
251 |             rec[classname] = 0
252 |             prec[classname] = 0
253 |             ap[classname] = 0
254 |         print(classname, ap[classname])
255 | 
256 |     return rec, prec, ap
257 | 
--------------------------------------------------------------------------------
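For reference, a minimal usage sketch (not part of the repository) of eval_det with the default axis-aligned IoU; boxes are (cx, cy, cz, dx, dy, dz) arrays as expected by calc_iou, and the scene/class names are illustrative:

import numpy as np
from utils.eval_det import eval_det

pred_all = {"scene0000_00": [("chair", np.array([0, 0, 0, 1, 1, 1]), 0.9)]}
gt_all   = {"scene0000_00": [("chair", np.array([0, 0, 0, 1, 1, 1]))]}
rec, prec, ap = eval_det(pred_all, gt_all, ovthresh=0.25)
print(ap["chair"])  # ~1.0 for a perfect match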
/utils/metric_util.py:
--------------------------------------------------------------------------------
1 | """
2 | Utility functions for metric evaluation.
3 | From: https://github.com/facebookresearch/votenet/blob/master/utils/metric_util.py
4 |
5 | Author: Or Litany and Charles R. Qi
6 | """
7 |
8 | import os
9 | import sys
10 | import torch
11 | BASE_DIR = os.path.dirname(os.path.abspath(__file__))
12 | sys.path.append(BASE_DIR)
13 |
14 | import numpy as np
15 |
16 | # Mesh IO
17 | import trimesh
18 |
19 |
20 | # ----------------------------------------
21 | # Precision and Recall
22 | # ----------------------------------------
23 |
24 | def multi_scene_precision_recall(labels, pred, iou_thresh, conf_thresh, label_mask, pred_mask=None):
25 | '''
26 | Args:
27 | labels: (B, N, 6)
28 |         pred: (B, M, 7), bbox (6) followed by a confidence score
29 | iou_thresh: scalar
30 | conf_thresh: scalar
31 | label_mask: (B, N,) with values in 0 or 1 to indicate which GT boxes to consider.
32 | pred_mask: (B, M,) with values in 0 or 1 to indicate which PRED boxes to consider.
33 | Returns:
34 | TP,FP,FN,Precision,Recall
35 | '''
36 | # Make sure the masks are not Torch tensor, otherwise the mask==1 returns uint8 array instead
37 | # of True/False array as in numpy
38 | assert(not torch.is_tensor(label_mask))
39 | assert(not torch.is_tensor(pred_mask))
40 | TP, FP, FN = 0, 0, 0
41 | if label_mask is None: label_mask = np.ones((labels.shape[0], labels.shape[1]))
42 | if pred_mask is None: pred_mask = np.ones((pred.shape[0], pred.shape[1]))
43 | for batch_idx in range(labels.shape[0]):
44 | TP_i, FP_i, FN_i = single_scene_precision_recall(labels[batch_idx, label_mask[batch_idx,:]==1, :],
45 | pred[batch_idx, pred_mask[batch_idx,:]==1, :],
46 | iou_thresh, conf_thresh)
47 | TP += TP_i
48 | FP += FP_i
49 | FN += FN_i
50 |
51 | return TP, FP, FN, precision_recall(TP, FP, FN)
52 |
53 |
54 | def single_scene_precision_recall(labels, pred, iou_thresh, conf_thresh):
55 | """Compute P and R for predicted bounding boxes. Ignores classes!
56 | Args:
57 | labels: (N x bbox) ground-truth bounding boxes (6 dims)
58 | pred: (M x (bbox + conf)) predicted bboxes with confidence and maybe classification
59 | Returns:
60 | TP, FP, FN
61 | """
62 |
63 |
64 | # for each pred box with high conf (C), compute IoU with all gt boxes.
65 | # TP = number of times IoU > th ; FP = C - TP
66 | # FN - number of scene objects without good match
67 |
68 | gt_bboxes = labels[:, :6]
69 |
70 | num_scene_bboxes = gt_bboxes.shape[0]
71 | conf = pred[:, 6]
72 |
73 | conf_pred_bbox = pred[np.where(conf > conf_thresh)[0], :6]
74 | num_conf_pred_bboxes = conf_pred_bbox.shape[0]
75 |
76 | # init an array to keep iou between generated and scene bboxes
77 | iou_arr = np.zeros([num_conf_pred_bboxes, num_scene_bboxes])
78 | for g_idx in range(num_conf_pred_bboxes):
79 | for s_idx in range(num_scene_bboxes):
80 | iou_arr[g_idx, s_idx] = calc_iou(conf_pred_bbox[g_idx ,:], gt_bboxes[s_idx, :])
81 |
82 |
83 | good_match_arr = (iou_arr >= iou_thresh)
84 |
85 | TP = good_match_arr.any(axis=1).sum()
86 | FP = num_conf_pred_bboxes - TP
87 | FN = num_scene_bboxes - good_match_arr.any(axis=0).sum()
88 |
89 | return TP, FP, FN
90 |
91 |
92 | def precision_recall(TP, FP, FN):
93 | Prec = 1.0 * TP / (TP + FP) if TP+FP>0 else 0
94 |     Rec = 1.0 * TP / (TP + FN) if TP+FN>0 else 0
95 | return Prec, Rec
96 |
97 |
98 | def calc_iou(box_a, box_b):
99 | """Computes IoU of two axis aligned bboxes.
100 | Args:
101 |         box_a, box_b: 6D vectors, center (x,y,z) followed by box lengths (dx,dy,dz)
102 | Returns:
103 | iou
104 | """
105 |
106 | max_a = box_a[0:3] + box_a[3:6]/2
107 | max_b = box_b[0:3] + box_b[3:6]/2
108 | min_max = np.array([max_a, max_b]).min(0)
109 |
110 | min_a = box_a[0:3] - box_a[3:6]/2
111 | min_b = box_b[0:3] - box_b[3:6]/2
112 | max_min = np.array([min_a, min_b]).max(0)
113 | if not ((min_max > max_min).all()):
114 | return 0.0
115 |
116 | intersection = (min_max - max_min).prod()
117 | vol_a = box_a[3:6].prod()
118 | vol_b = box_b[3:6].prod()
119 | union = vol_a + vol_b - intersection
120 | return 1.0*intersection / union
121 |
122 |
123 | if __name__ == '__main__':
124 | print('running some tests')
125 |
126 | ############
127 | ## Test IoU
128 | ############
129 | box_a = np.array([0,0,0,1,1,1])
130 | box_b = np.array([0,0,0,2,2,2])
131 | expected_iou = 1.0/8
132 | pred_iou = calc_iou(box_a, box_b)
133 | assert expected_iou == pred_iou, 'function returned wrong IoU'
134 |
135 | box_a = np.array([0,0,0,1,1,1])
136 | box_b = np.array([10,10,10,2,2,2])
137 | expected_iou = 0.0
138 | pred_iou = calc_iou(box_a, box_b)
139 | assert expected_iou == pred_iou, 'function returned wrong IoU'
140 |
141 | print('IoU test -- PASSED')
142 |
143 | #########################
144 |     ## Test Precision Recall
145 | #########################
146 | gt_boxes = np.array([[0,0,0,1,1,1],[3, 0, 1, 1, 10, 1]])
147 | detected_boxes = np.array([[0,0,0,1,1,1, 1.0],[3, 0, 1, 1, 10, 1, 0.9]])
148 | TP, FP, FN = single_scene_precision_recall(gt_boxes, detected_boxes, 0.5, 0.5)
149 | assert TP == 2 and FP == 0 and FN == 0
150 | assert precision_recall(TP, FP, FN) == (1, 1)
151 |
152 | detected_boxes = np.array([[0,0,0,1,1,1, 1.0]])
153 | TP, FP, FN = single_scene_precision_recall(gt_boxes, detected_boxes, 0.5, 0.5)
154 | assert TP == 1 and FP == 0 and FN == 1
155 | assert precision_recall(TP, FP, FN) == (1, 0.5)
156 |
157 | detected_boxes = np.array([[0,0,0,1,1,1, 1.0], [-1,-1,0,0.1,0.1,1, 1.0]])
158 | TP, FP, FN = single_scene_precision_recall(gt_boxes, detected_boxes, 0.5, 0.5)
159 | assert TP == 1 and FP == 1 and FN == 1
160 | assert precision_recall(TP, FP, FN) == (0.5, 0.5)
161 |
162 | # wrong box has low confidence
163 | detected_boxes = np.array([[0,0,0,1,1,1, 1.0], [-1,-1,0,0.1,0.1,1, 0.1]])
164 | TP, FP, FN = single_scene_precision_recall(gt_boxes, detected_boxes, 0.5, 0.5)
165 | assert TP == 1 and FP == 0 and FN == 1
166 | assert precision_recall(TP, FP, FN) == (1, 0.5)
167 |
168 |     print('Precision Recall test -- PASSED')
169 |
170 |
--------------------------------------------------------------------------------
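For reference, a small usage sketch (not part of the repository) of multi_scene_precision_recall on a single-scene batch; predictions carry a confidence column as described in single_scene_precision_recall, and the values reuse the boxes from the tests above:

import numpy as np
from utils.metric_util import multi_scene_precision_recall

labels = np.array([[[0, 0, 0, 1, 1, 1], [3, 0, 1, 1, 10, 1]]], dtype=np.float64)          # (B=1, N=2, 6)
pred = np.array([[[0, 0, 0, 1, 1, 1, 1.0], [3, 0, 1, 1, 10, 1, 0.9]]], dtype=np.float64)  # (B=1, M=2, 7)
label_mask = np.ones((1, 2))
TP, FP, FN, (precision, recall) = multi_scene_precision_recall(labels, pred, 0.5, 0.5, label_mask)
print(TP, FP, FN, precision, recall)  # 2 0 0 1.0 1.0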
/utils/nms.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from utils.pc_utils import bbox_corner_dist_measure
3 |
4 | # boxes are axis aligned 2D boxes of shape (n,5) in FLOAT numbers with (x1,y1,x2,y2,score)
5 | ''' Ref: https://www.pyimagesearch.com/2015/02/16/faster-non-maximum-suppression-python/
6 | Ref: https://github.com/vickyboy47/nms-python/blob/master/nms.py
7 | '''
8 | def nms_2d(boxes, overlap_threshold):
9 | x1 = boxes[:,0]
10 | y1 = boxes[:,1]
11 | x2 = boxes[:,2]
12 | y2 = boxes[:,3]
13 | score = boxes[:,4]
14 | area = (x2-x1)*(y2-y1)
15 |
16 | I = np.argsort(score)
17 | pick = []
18 | while (I.size!=0):
19 | last = I.size
20 | i = I[-1]
21 | pick.append(i)
22 | suppress = [last-1]
23 | for pos in range(last-1):
24 | j = I[pos]
25 | xx1 = max(x1[i],x1[j])
26 | yy1 = max(y1[i],y1[j])
27 | xx2 = min(x2[i],x2[j])
28 | yy2 = min(y2[i],y2[j])
29 | w = xx2-xx1
30 | h = yy2-yy1
31 | if (w>0 and h>0):
32 | o = w*h/area[j]
33 | print('Overlap is', o)
34 | if (o>overlap_threshold):
35 | suppress.append(pos)
36 | I = np.delete(I,suppress)
37 | return pick
38 |
39 | def nms_2d_faster(boxes, overlap_threshold, old_type=False):
40 | x1 = boxes[:,0]
41 | y1 = boxes[:,1]
42 | x2 = boxes[:,2]
43 | y2 = boxes[:,3]
44 | score = boxes[:,4]
45 | area = (x2-x1)*(y2-y1)
46 |
47 | I = np.argsort(score)
48 | pick = []
49 | while (I.size!=0):
50 | last = I.size
51 | i = I[-1]
52 | pick.append(i)
53 |
54 | xx1 = np.maximum(x1[i], x1[I[:last-1]])
55 | yy1 = np.maximum(y1[i], y1[I[:last-1]])
56 | xx2 = np.minimum(x2[i], x2[I[:last-1]])
57 | yy2 = np.minimum(y2[i], y2[I[:last-1]])
58 |
59 | w = np.maximum(0, xx2-xx1)
60 | h = np.maximum(0, yy2-yy1)
61 |
62 | if old_type:
63 | o = (w*h)/area[I[:last-1]]
64 | else:
65 | inter = w*h
66 | o = inter / (area[i] + area[I[:last-1]] - inter)
67 |
68 | I = np.delete(I, np.concatenate(([last-1], np.where(o>overlap_threshold)[0])))
69 |
70 | return pick
71 |
72 | def nms_3d_faster(boxes, overlap_threshold, old_type=False):
73 | x1 = boxes[:,0]
74 | y1 = boxes[:,1]
75 | z1 = boxes[:,2]
76 | x2 = boxes[:,3]
77 | y2 = boxes[:,4]
78 | z2 = boxes[:,5]
79 | score = boxes[:,6]
80 | area = (x2-x1)*(y2-y1)*(z2-z1)
81 |
82 | I = np.argsort(score)
83 | pick = []
84 | while (I.size!=0):
85 | last = I.size
86 | i = I[-1]
87 | pick.append(i)
88 |
89 | xx1 = np.maximum(x1[i], x1[I[:last-1]])
90 | yy1 = np.maximum(y1[i], y1[I[:last-1]])
91 | zz1 = np.maximum(z1[i], z1[I[:last-1]])
92 | xx2 = np.minimum(x2[i], x2[I[:last-1]])
93 | yy2 = np.minimum(y2[i], y2[I[:last-1]])
94 | zz2 = np.minimum(z2[i], z2[I[:last-1]])
95 |
96 | l = np.maximum(0, xx2-xx1)
97 | w = np.maximum(0, yy2-yy1)
98 | h = np.maximum(0, zz2-zz1)
99 |
100 | if old_type:
101 | o = (l*w*h)/area[I[:last-1]]
102 | else:
103 | inter = l*w*h
104 | o = inter / (area[i] + area[I[:last-1]] - inter)
105 |
106 | I = np.delete(I, np.concatenate(([last-1], np.where(o>overlap_threshold)[0])))
107 |
108 | return pick
109 |
110 | def nms_3d_faster_samecls(boxes, overlap_threshold, old_type=False):
111 | x1 = boxes[:,0]
112 | y1 = boxes[:,1]
113 | z1 = boxes[:,2]
114 | x2 = boxes[:,3]
115 | y2 = boxes[:,4]
116 | z2 = boxes[:,5]
117 | score = boxes[:,6]
118 | cls = boxes[:,7]
119 | area = (x2-x1)*(y2-y1)*(z2-z1)
120 |
121 | I = np.argsort(score)
122 | pick = []
123 | while (I.size!=0):
124 | last = I.size
125 | i = I[-1]
126 | pick.append(i)
127 |
128 | xx1 = np.maximum(x1[i], x1[I[:last-1]])
129 | yy1 = np.maximum(y1[i], y1[I[:last-1]])
130 | zz1 = np.maximum(z1[i], z1[I[:last-1]])
131 | xx2 = np.minimum(x2[i], x2[I[:last-1]])
132 | yy2 = np.minimum(y2[i], y2[I[:last-1]])
133 | zz2 = np.minimum(z2[i], z2[I[:last-1]])
134 | cls1 = cls[i]
135 | cls2 = cls[I[:last-1]]
136 |
137 | l = np.maximum(0, xx2-xx1)
138 | w = np.maximum(0, yy2-yy1)
139 | h = np.maximum(0, zz2-zz1)
140 |
141 | if old_type:
142 | o = (l*w*h)/area[I[:last-1]]
143 | else:
144 | inter = l*w*h
145 | o = inter / (area[i] + area[I[:last-1]] - inter)
146 | o = o * (cls1==cls2)
147 |
148 | I = np.delete(I, np.concatenate(([last-1], np.where(o>overlap_threshold)[0])))
149 |
150 | return pick
151 |
152 |
153 | def nms_crnr_dist(boxes, conf, overlap_threshold):
154 |
155 | I = np.argsort(conf)
156 | pick = []
157 | while (I.size!=0):
158 | last = I.size
159 | i = I[-1]
160 | pick.append(i)
161 |
162 | scores = []
163 | for ind in I[:-1]:
164 | scores.append(bbox_corner_dist_measure(boxes[i,:], boxes[ind, :]))
165 |
166 | I = np.delete(I, np.concatenate(([last-1], np.where(np.array(scores)>overlap_threshold)[0])))
167 |
168 | return pick
169 |
170 | if __name__=='__main__':
171 | a = np.random.random((100,5))
172 | print(nms_2d(a,0.9))
173 | print(nms_2d_faster(a,0.9))
174 |
--------------------------------------------------------------------------------
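For reference, a small usage sketch (not part of the repository) of nms_3d_faster; rows are (x1, y1, z1, x2, y2, z2, score):

import numpy as np
from utils.nms import nms_3d_faster

boxes = np.array([
    [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.9],
    [0.1, 0.1, 0.1, 1.1, 1.1, 1.1, 0.8],
])
print(nms_3d_faster(boxes, overlap_threshold=0.25))  # [0], the lower-scoring overlapping box is suppressed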
/utils/nn_distance.py:
--------------------------------------------------------------------------------
1 | """
2 | Chamfer distance in Pytorch.
3 | Author: Charles R. Qi
4 |
5 | From: https://github.com/facebookresearch/votenet/blob/master/utils/nn_distance.py
6 | """
7 |
8 | import torch
9 | import torch.nn as nn
10 | import numpy as np
11 |
12 |
13 | def huber_loss(error, delta=1.0):
14 | """
15 | Args:
16 | error: Torch tensor (d1,d2,...,dk)
17 | Returns:
18 | loss: Torch tensor (d1,d2,...,dk)
19 |
20 | x = error = pred - gt or dist(pred,gt)
21 | 0.5 * |x|^2 if |x|<=d
22 | 0.5 * d^2 + d * (|x|-d) if |x|>d
23 | Ref: https://github.com/charlesq34/frustum-pointnets/blob/master/models/model_util.py
24 | """
25 | abs_error = torch.abs(error)
26 | #quadratic = torch.min(abs_error, torch.FloatTensor([delta]))
27 | quadratic = torch.clamp(abs_error, max=delta)
28 | linear = (abs_error - quadratic)
29 | loss = 0.5 * quadratic**2 + delta * linear
30 | return loss
31 |
32 | def nn_distance(pc1, pc2, l1smooth=False, delta=1.0, l1=False):
33 | """
34 | Input:
35 | pc1: (B,N,C) torch tensor
36 | pc2: (B,M,C) torch tensor
37 | l1smooth: bool, whether to use l1smooth loss
38 | delta: scalar, the delta used in l1smooth loss
39 | Output:
40 | dist1: (B,N) torch float32 tensor
41 | idx1: (B,N) torch int64 tensor
42 | dist2: (B,M) torch float32 tensor
43 | idx2: (B,M) torch int64 tensor
44 | """
45 | N = pc1.shape[1]
46 | M = pc2.shape[1]
47 | pc1_expand_tile = pc1.unsqueeze(2).repeat(1,1,M,1)
48 | pc2_expand_tile = pc2.unsqueeze(1).repeat(1,N,1,1)
49 | pc_diff = pc1_expand_tile - pc2_expand_tile
50 |
51 | if l1smooth:
52 | pc_dist = torch.sum(huber_loss(pc_diff, delta), dim=-1) # (B,N,M)
53 | elif l1:
54 | pc_dist = torch.sum(torch.abs(pc_diff), dim=-1) # (B,N,M)
55 | else:
56 | pc_dist = torch.sum(pc_diff**2, dim=-1) # (B,N,M)
57 | dist1, idx1 = torch.min(pc_dist, dim=2) # (B,N)
58 | dist2, idx2 = torch.min(pc_dist, dim=1) # (B,M)
59 | return dist1, idx1, dist2, idx2
60 |
61 | def demo_nn_distance():
62 | np.random.seed(0)
63 | pc1arr = np.random.random((1,5,3))
64 | pc2arr = np.random.random((1,6,3))
65 | pc1 = torch.from_numpy(pc1arr.astype(np.float32))
66 | pc2 = torch.from_numpy(pc2arr.astype(np.float32))
67 | dist1, idx1, dist2, idx2 = nn_distance(pc1, pc2)
68 | print(dist1)
69 | print(idx1)
70 | dist = np.zeros((5,6))
71 | for i in range(5):
72 | for j in range(6):
73 | dist[i,j] = np.sum((pc1arr[0,i,:] - pc2arr[0,j,:]) ** 2)
74 | print(dist)
75 | print('-'*30)
76 | print('L1smooth dists:')
77 | dist1, idx1, dist2, idx2 = nn_distance(pc1, pc2, True)
78 | print(dist1)
79 | print(idx1)
80 | dist = np.zeros((5,6))
81 | for i in range(5):
82 | for j in range(6):
83 | error = np.abs(pc1arr[0,i,:] - pc2arr[0,j,:])
84 | quad = np.minimum(error, 1.0)
85 | linear = error - quad
86 | loss = 0.5*quad**2 + 1.0*linear
87 | dist[i,j] = np.sum(loss)
88 | print(dist)
89 |
90 |
91 | if __name__ == '__main__':
92 | demo_nn_distance()
93 |
--------------------------------------------------------------------------------