├── LICENSE ├── assets ├── mean_points_emb.npy └── teaser.png ├── data ├── pose_dataset.py └── shape_dataset.py ├── evaluate.py ├── lib ├── align.py ├── auto_encoder.py ├── loss.py ├── network.py ├── nn_distance │ ├── chamfer_loss.py │ ├── setup.py │ └── src │ │ ├── nn_distance.cpp │ │ └── nn_distance_cuda.cu ├── pspnet.py └── utils.py ├── mean_shape.py ├── preprocess ├── pose_data.py └── shape_data.py ├── readme.md ├── tools └── tsne.py ├── train_ae.py └── train_deform.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Meng Tian 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /assets/mean_points_emb.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mentian/object-deformnet/a2dcdb87dd88912c6b51b0f693443212fde5696e/assets/mean_points_emb.npy -------------------------------------------------------------------------------- /assets/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mentian/object-deformnet/a2dcdb87dd88912c6b51b0f693443212fde5696e/assets/teaser.png -------------------------------------------------------------------------------- /data/pose_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import math 4 | import random 5 | import numpy as np 6 | import _pickle as cPickle 7 | from PIL import Image 8 | import torch.utils.data as data 9 | import torchvision.transforms as transforms 10 | from lib.utils import load_depth, get_bbox 11 | 12 | 13 | class PoseDataset(data.Dataset): 14 | def __init__(self, source, mode, data_dir, n_pts, img_size): 15 | """ 16 | Args: 17 | source: 'CAMERA', 'Real' or 'CAMERA+Real' 18 | mode: 'train' or 'test' 19 | data_dir: 20 | n_pts: number of selected foreground points 21 | img_size: square image window 22 | """ 23 | self.source = source 24 | self.mode = mode 25 | self.data_dir = data_dir 26 | self.n_pts = n_pts 27 | self.img_size = img_size 28 | 29 | assert source in ['CAMERA', 'Real', 'CAMERA+Real'] 30 | assert mode in ['train', 'test'] 31 | img_list_path = ['CAMERA/train_list.txt', 'Real/train_list.txt', 32 | 'CAMERA/val_list.txt', 'Real/test_list.txt'] 33 | model_file_path = ['obj_models/camera_train.pkl', 
'obj_models/real_train.pkl', 34 | 'obj_models/camera_val.pkl', 'obj_models/real_test.pkl'] 35 | if mode == 'train': 36 | del img_list_path[2:] 37 | del model_file_path[2:] 38 | else: 39 | del img_list_path[:2] 40 | del model_file_path[:2] 41 | if source == 'CAMERA': 42 | del img_list_path[-1] 43 | del model_file_path[-1] 44 | elif source == 'Real': 45 | del img_list_path[0] 46 | del model_file_path[0] 47 | else: 48 | # only use Real to test when source is CAMERA+Real 49 | if mode == 'test': 50 | del img_list_path[0] 51 | del model_file_path[0] 52 | 53 | img_list = [] 54 | subset_len = [] 55 | for path in img_list_path: 56 | img_list += [os.path.join(path.split('/')[0], line.rstrip('\n')) 57 | for line in open(os.path.join(data_dir, path))] 58 | subset_len.append(len(img_list)) 59 | if len(subset_len) == 2: 60 | self.subset_len = [subset_len[0], subset_len[1]-subset_len[0]] 61 | self.img_list = img_list 62 | self.length = len(self.img_list) 63 | 64 | models = {} 65 | for path in model_file_path: 66 | with open(os.path.join(data_dir, path), 'rb') as f: 67 | models.update(cPickle.load(f)) 68 | self.models = models 69 | 70 | # meta info for re-label mug category 71 | with open(os.path.join(data_dir, 'obj_models/mug_meta.pkl'), 'rb') as f: 72 | self.mug_meta = cPickle.load(f) 73 | 74 | self.mean_shapes = np.load('assets/mean_points_emb.npy') 75 | self.cat_names = ['bottle', 'bowl', 'camera', 'can', 'laptop', 'mug'] 76 | self.camera_intrinsics = [577.5, 577.5, 319.5, 239.5] # [fx, fy, cx, cy] 77 | self.real_intrinsics = [591.0125, 590.16775, 322.525, 244.11084] 78 | self.sym_ids = [0, 1, 3] # 0-indexed 79 | self.norm_scale = 1000.0 # normalization scale 80 | self.xmap = np.array([[i for i in range(640)] for j in range(480)]) 81 | self.ymap = np.array([[j for i in range(640)] for j in range(480)]) 82 | self.shift_range = 0.01 83 | self.colorjitter = transforms.ColorJitter(0.2, 0.2, 0.2, 0.05) 84 | self.transform = transforms.Compose([transforms.ToTensor(), 85 | transforms.Normalize(mean=[0.485, 0.456, 0.406], 86 | std=[0.229, 0.224, 0.225])]) 87 | print('{} images found.'.format(self.length)) 88 | print('{} models loaded.'.format(len(self.models))) 89 | 90 | def __len__(self): 91 | return self.length 92 | 93 | def __getitem__(self, index): 94 | img_path = os.path.join(self.data_dir, self.img_list[index]) 95 | rgb = cv2.imread(img_path + '_color.png')[:, :, :3] 96 | rgb = rgb[:, :, ::-1] 97 | depth = load_depth(img_path) 98 | mask = cv2.imread(img_path + '_mask.png')[:, :, 2] 99 | coord = cv2.imread(img_path + '_coord.png')[:, :, :3] 100 | coord = coord[:, :, (2, 1, 0)] 101 | coord = np.array(coord, dtype=np.float32) / 255 102 | coord[:, :, 2] = 1 - coord[:, :, 2] 103 | with open(img_path + '_label.pkl', 'rb') as f: 104 | gts = cPickle.load(f) 105 | if 'CAMERA' in img_path.split('/'): 106 | cam_fx, cam_fy, cam_cx, cam_cy = self.camera_intrinsics 107 | else: 108 | cam_fx, cam_fy, cam_cx, cam_cy = self.real_intrinsics 109 | 110 | # select one foreground object 111 | idx = random.randint(0, len(gts['instance_ids'])-1) 112 | inst_id = gts['instance_ids'][idx] 113 | rmin, rmax, cmin, cmax = get_bbox(gts['bboxes'][idx]) 114 | # sample points 115 | mask = np.equal(mask, inst_id) 116 | mask = np.logical_and(mask, depth > 0) 117 | choose = mask[rmin:rmax, cmin:cmax].flatten().nonzero()[0] 118 | if len(choose) > self.n_pts: 119 | c_mask = np.zeros(len(choose), dtype=int) 120 | c_mask[:self.n_pts] = 1 121 | np.random.shuffle(c_mask) 122 | choose = choose[c_mask.nonzero()] 123 | else: 124 | choose = 
np.pad(choose, (0, self.n_pts-len(choose)), 'wrap') 125 | depth_masked = depth[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis] 126 | xmap_masked = self.xmap[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis] 127 | ymap_masked = self.ymap[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis] 128 | pt2 = depth_masked / self.norm_scale 129 | pt0 = (xmap_masked - cam_cx) * pt2 / cam_fx 130 | pt1 = (ymap_masked - cam_cy) * pt2 / cam_fy 131 | points = np.concatenate((pt0, pt1, pt2), axis=1) 132 | nocs = coord[rmin:rmax, cmin:cmax, :].reshape((-1, 3))[choose, :] - 0.5 133 | # resize cropped image to standard size and adjust 'choose' accordingly 134 | rgb = rgb[rmin:rmax, cmin:cmax, :] 135 | rgb = cv2.resize(rgb, (self.img_size, self.img_size), interpolation=cv2.INTER_LINEAR) 136 | crop_w = rmax - rmin 137 | ratio = self.img_size / crop_w 138 | col_idx = choose % crop_w 139 | row_idx = choose // crop_w 140 | choose = (np.floor(row_idx * ratio) * self.img_size + np.floor(col_idx * ratio)).astype(np.int64) 141 | # label 142 | cat_id = gts['class_ids'][idx] - 1 # convert to 0-indexed 143 | model = self.models[gts['model_list'][idx]].astype(np.float32) # 1024 points 144 | prior = self.mean_shapes[cat_id].astype(np.float32) 145 | scale = gts['scales'][idx] 146 | rotation = gts['rotations'][idx] 147 | translation = gts['translations'][idx] 148 | # data augmentation 149 | if self.mode == 'train': 150 | # color jitter 151 | rgb = self.colorjitter(Image.fromarray(np.uint8(rgb))) 152 | rgb = np.array(rgb) 153 | # point shift 154 | add_t = np.random.uniform(-self.shift_range, self.shift_range, (1, 3)) 155 | translation = translation + add_t[0] 156 | # point jitter 157 | add_t = add_t + np.clip(0.001*np.random.randn(points.shape[0], 3), -0.005, 0.005) 158 | points = np.add(points, add_t) 159 | rgb = self.transform(rgb) 160 | points = points.astype(np.float32) 161 | # adjust nocs coords for mug category 162 | if cat_id == 5: 163 | T0 = self.mug_meta[gts['model_list'][idx]][0] 164 | s0 = self.mug_meta[gts['model_list'][idx]][1] 165 | nocs = s0 * (nocs + T0) 166 | # map ambiguous rotation to canonical rotation 167 | if cat_id in self.sym_ids: 168 | rotation = gts['rotations'][idx] 169 | # assume continuous axis rotation symmetry 170 | theta_x = rotation[0, 0] + rotation[2, 2] 171 | theta_y = rotation[0, 2] - rotation[2, 0] 172 | r_norm = math.sqrt(theta_x**2 + theta_y**2) 173 | s_map = np.array([[theta_x/r_norm, 0.0, -theta_y/r_norm], 174 | [0.0, 1.0, 0.0 ], 175 | [theta_y/r_norm, 0.0, theta_x/r_norm]]) 176 | rotation = rotation @ s_map 177 | nocs = nocs @ s_map 178 | sRT = np.identity(4, dtype=np.float32) 179 | sRT[:3, :3] = scale * rotation 180 | sRT[:3, 3] = translation 181 | nocs = nocs.astype(np.float32) 182 | 183 | return points, rgb, choose, cat_id, model, prior, sRT, nocs 184 | -------------------------------------------------------------------------------- /data/shape_dataset.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import numpy as np 3 | import torch.utils.data as data 4 | 5 | 6 | class ShapeDataset(data.Dataset): 7 | def __init__(self, h5_file, mode, n_points=2048, augment=False): 8 | assert (mode == 'train' or mode == 'val'), 'Mode must be "train" or "val".' 
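        # The h5 file is expected to hold one group per mode ('train'/'val'), each with
        # a 'data' array of per-shape point clouds, a 'label' array of 1-indexed
        # category ids, and a 'len' attribute giving the number of shapes.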
9 | self.mode = mode 10 | self.n_points = n_points 11 | self.augment = augment 12 | # load data from h5py file 13 | with h5py.File(h5_file, 'r') as f: 14 | self.length = f[self.mode].attrs['len'] 15 | self.data = f[self.mode]['data'][:] 16 | self.label = f[self.mode]['label'][:] 17 | # augmentation parameters 18 | self.sigma = 0.01 19 | self.clip = 0.02 20 | self.shift_range = 0.02 21 | 22 | def __len__(self): 23 | return self.length 24 | 25 | def __getitem__(self, index): 26 | xyz = self.data[index] 27 | label = self.label[index] - 1 # data saved indexed from 1 28 | # randomly downsample 29 | np_data = xyz.shape[0] 30 | assert np_data >= self.n_points, 'Not enough points in shape.' 31 | idx = np.random.choice(np_data, self.n_points) 32 | xyz = xyz[idx, :] 33 | # data augmentation 34 | if self.augment: 35 | jitter = np.clip(self.sigma*np.random.randn(self.n_points, 3), -self.clip, self.clip) 36 | xyz[:, :3] += jitter 37 | shift = np.random.uniform(-self.shift_range, self.shift_range, (1, 3)) 38 | xyz[:, :3] += shift 39 | return xyz, label 40 | -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import argparse 4 | import cv2 5 | import glob 6 | import numpy as np 7 | from tqdm import tqdm 8 | import _pickle as cPickle 9 | import torch 10 | import torch.nn.functional as F 11 | import torchvision.transforms as transforms 12 | from lib.network import DeformNet 13 | from lib.align import estimateSimilarityTransform 14 | from lib.utils import load_depth, get_bbox, compute_mAP, plot_mAP 15 | 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--data', type=str, default='val', help='val, real_test') 19 | parser.add_argument('--data_dir', type=str, default='data', help='data directory') 20 | parser.add_argument('--n_cat', type=int, default=6, help='number of object categories') 21 | parser.add_argument('--nv_prior', type=int, default=1024, help='number of vertices in shape priors') 22 | parser.add_argument('--model', type=str, default='results/camera/model_50.pth', help='resume from saved model') 23 | parser.add_argument('--n_pts', type=int, default=1024, help='number of foreground points') 24 | parser.add_argument('--img_size', type=int, default=192, help='cropped image size') 25 | parser.add_argument('--gpu', type=str, default='1', help='GPU to use') 26 | opt = parser.parse_args() 27 | 28 | mean_shapes = np.load('assets/mean_points_emb.npy') 29 | 30 | assert opt.data in ['val', 'real_test'] 31 | if opt.data == 'val': 32 | result_dir = 'results/eval_camera' 33 | file_path = 'CAMERA/val_list.txt' 34 | cam_fx, cam_fy, cam_cx, cam_cy = 577.5, 577.5, 319.5, 239.5 35 | else: 36 | result_dir = 'results/eval_real' 37 | file_path = 'Real/test_list.txt' 38 | cam_fx, cam_fy, cam_cx, cam_cy = 591.0125, 590.16775, 322.525, 244.11084 39 | 40 | if not os.path.exists(result_dir): 41 | os.makedirs(result_dir) 42 | 43 | xmap = np.array([[i for i in range(640)] for j in range(480)]) 44 | ymap = np.array([[j for i in range(640)] for j in range(480)]) 45 | norm_scale = 1000.0 46 | norm_color = transforms.Compose( 47 | [transforms.ToTensor(), 48 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])] 49 | ) 50 | 51 | 52 | def detect(): 53 | # resume model 54 | os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu 55 | estimator = DeformNet(opt.n_cat, opt.nv_prior) 56 | estimator.cuda() 57 | 
estimator.load_state_dict(torch.load(opt.model)) 58 | estimator.eval() 59 | # get test data list 60 | img_list = [os.path.join(file_path.split('/')[0], line.rstrip('\n')) 61 | for line in open(os.path.join(opt.data_dir, file_path))] 62 | # frame by frame test 63 | t_inference = 0.0 64 | t_umeyama = 0.0 65 | inst_count = 0 66 | img_count = 0 67 | t_start = time.time() 68 | for path in tqdm(img_list): 69 | img_path = os.path.join(opt.data_dir, path) 70 | raw_rgb = cv2.imread(img_path + '_color.png')[:, :, :3] 71 | raw_rgb = raw_rgb[:, :, ::-1] 72 | raw_depth = load_depth(img_path) 73 | # load mask-rcnn detection results 74 | img_path_parsing = img_path.split('/') 75 | mrcnn_path = os.path.join('results/mrcnn_results', opt.data, 'results_{}_{}_{}.pkl'.format( 76 | opt.data.split('_')[-1], img_path_parsing[-2], img_path_parsing[-1])) 77 | with open(mrcnn_path, 'rb') as f: 78 | mrcnn_result = cPickle.load(f) 79 | num_insts = len(mrcnn_result['class_ids']) 80 | f_sRT = np.zeros((num_insts, 4, 4), dtype=float) 81 | f_size = np.zeros((num_insts, 3), dtype=float) 82 | # prepare frame data 83 | f_points, f_rgb, f_choose, f_catId, f_prior = [], [], [], [], [] 84 | valid_inst = [] 85 | for i in range(num_insts): 86 | cat_id = mrcnn_result['class_ids'][i] - 1 87 | prior = mean_shapes[cat_id] 88 | rmin, rmax, cmin, cmax = get_bbox(mrcnn_result['rois'][i]) 89 | mask = np.logical_and(mrcnn_result['masks'][:, :, i], raw_depth > 0) 90 | choose = mask[rmin:rmax, cmin:cmax].flatten().nonzero()[0] 91 | # no depth observation for background in CAMERA dataset 92 | # beacuase of how we compute the bbox in function get_bbox 93 | # there might be a chance that no foreground points after cropping the mask 94 | # cuased by false positive of mask_rcnn, most of the regions are background 95 | if len(choose) < 32: 96 | f_sRT[i] = np.identity(4, dtype=float) 97 | f_size[i] = 2 * np.amax(np.abs(prior), axis=0) 98 | continue 99 | else: 100 | valid_inst.append(i) 101 | # process objects with valid depth observation 102 | if len(choose) > opt.n_pts: 103 | c_mask = np.zeros(len(choose), dtype=int) 104 | c_mask[:opt.n_pts] = 1 105 | np.random.shuffle(c_mask) 106 | choose = choose[c_mask.nonzero()] 107 | else: 108 | choose = np.pad(choose, (0, opt.n_pts-len(choose)), 'wrap') 109 | depth_masked = raw_depth[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis] 110 | xmap_masked = xmap[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis] 111 | ymap_masked = ymap[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis] 112 | pt2 = depth_masked / norm_scale 113 | pt0 = (xmap_masked - cam_cx) * pt2 / cam_fx 114 | pt1 = (ymap_masked - cam_cy) * pt2 / cam_fy 115 | points = np.concatenate((pt0, pt1, pt2), axis=1) 116 | rgb = raw_rgb[rmin:rmax, cmin:cmax, :] 117 | rgb = cv2.resize(rgb, (opt.img_size, opt.img_size), interpolation=cv2.INTER_LINEAR) 118 | rgb = norm_color(rgb) 119 | crop_w = rmax - rmin 120 | ratio = opt.img_size / crop_w 121 | col_idx = choose % crop_w 122 | row_idx = choose // crop_w 123 | choose = (np.floor(row_idx * ratio) * opt.img_size + np.floor(col_idx * ratio)).astype(np.int64) 124 | # concatenate instances 125 | f_points.append(points) 126 | f_rgb.append(rgb) 127 | f_choose.append(choose) 128 | f_catId.append(cat_id) 129 | f_prior.append(prior) 130 | if len(valid_inst): 131 | f_points = torch.cuda.FloatTensor(f_points) 132 | f_rgb = torch.stack(f_rgb, dim=0).cuda() 133 | f_choose = torch.cuda.LongTensor(f_choose) 134 | f_catId = torch.cuda.LongTensor(f_catId) 135 | f_prior = torch.cuda.FloatTensor(f_prior) 136 | 
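            # all valid instances in this frame are batched together and estimated
            # in a single forward pass of the network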
# inference 137 | torch.cuda.synchronize() 138 | t_now = time.time() 139 | assign_mat, deltas = estimator(f_points, f_rgb, f_choose, f_catId, f_prior) 140 | # assign_mat, deltas = estimator(f_rgb, f_choose, f_catId, f_prior) 141 | inst_shape = f_prior + deltas 142 | assign_mat = F.softmax(assign_mat, dim=2) 143 | f_coords = torch.bmm(assign_mat, inst_shape) # bs x n_pts x 3 144 | torch.cuda.synchronize() 145 | t_inference += (time.time() - t_now) 146 | f_coords = f_coords.detach().cpu().numpy() 147 | f_points = f_points.cpu().numpy() 148 | f_choose = f_choose.cpu().numpy() 149 | f_insts = inst_shape.detach().cpu().numpy() 150 | t_now = time.time() 151 | for i in range(len(valid_inst)): 152 | inst_idx = valid_inst[i] 153 | choose = f_choose[i] 154 | _, choose = np.unique(choose, return_index=True) 155 | nocs_coords = f_coords[i, choose, :] 156 | f_size[inst_idx] = 2 * np.amax(np.abs(f_insts[i]), axis=0) 157 | points = f_points[i, choose, :] 158 | _, _, _, pred_sRT = estimateSimilarityTransform(nocs_coords, points) 159 | if pred_sRT is None: 160 | pred_sRT = np.identity(4, dtype=float) 161 | f_sRT[inst_idx] = pred_sRT 162 | t_umeyama += (time.time() - t_now) 163 | img_count += 1 164 | inst_count += len(valid_inst) 165 | 166 | # save results 167 | result = {} 168 | with open(img_path + '_label.pkl', 'rb') as f: 169 | gts = cPickle.load(f) 170 | result['gt_class_ids'] = gts['class_ids'] 171 | result['gt_bboxes'] = gts['bboxes'] 172 | result['gt_RTs'] = gts['poses'] 173 | result['gt_scales'] = gts['size'] 174 | result['gt_handle_visibility'] = gts['handle_visibility'] 175 | 176 | result['pred_class_ids'] = mrcnn_result['class_ids'] 177 | result['pred_bboxes'] = mrcnn_result['rois'] 178 | result['pred_scores'] = mrcnn_result['scores'] 179 | result['pred_RTs'] = f_sRT 180 | result['pred_scales'] = f_size 181 | 182 | image_short_path = '_'.join(img_path_parsing[-3:]) 183 | save_path = os.path.join(result_dir, 'results_{}.pkl'.format(image_short_path)) 184 | with open(save_path, 'wb') as f: 185 | cPickle.dump(result, f) 186 | # write statistics 187 | fw = open('{0}/eval_logs.txt'.format(result_dir), 'w') 188 | messages = [] 189 | messages.append("Total images: {}".format(len(img_list))) 190 | messages.append("Valid images: {}, Total instances: {}, Average: {:.2f}/image".format( 191 | img_count, inst_count, inst_count/img_count)) 192 | messages.append("Inference time: {:06f} Average: {:06f}/image".format(t_inference, t_inference/img_count)) 193 | messages.append("Umeyama time: {:06f} Average: {:06f}/image".format(t_umeyama, t_umeyama/img_count)) 194 | messages.append("Total time: {:06f}".format(time.time() - t_start)) 195 | for msg in messages: 196 | print(msg) 197 | fw.write(msg + '\n') 198 | fw.close() 199 | 200 | 201 | def evaluate(): 202 | degree_thres_list = list(range(0, 61, 1)) 203 | shift_thres_list = [i / 2 for i in range(21)] 204 | iou_thres_list = [i / 100 for i in range(101)] 205 | # predictions 206 | result_pkl_list = glob.glob(os.path.join(result_dir, 'results_*.pkl')) 207 | result_pkl_list = sorted(result_pkl_list) 208 | assert len(result_pkl_list) 209 | pred_results = [] 210 | for pkl_path in result_pkl_list: 211 | with open(pkl_path, 'rb') as f: 212 | result = cPickle.load(f) 213 | if 'gt_handle_visibility' not in result: 214 | result['gt_handle_visibility'] = np.ones_like(result['gt_class_ids']) 215 | else: 216 | assert len(result['gt_handle_visibility']) == len(result['gt_class_ids']), "{} {}".format( 217 | result['gt_handle_visibility'], result['gt_class_ids']) 218 | if 
type(result) is list: 219 | pred_results += result 220 | elif type(result) is dict: 221 | pred_results.append(result) 222 | else: 223 | assert False 224 | # To be consistent with NOCS, set use_matches_for_pose=True for mAP evaluation 225 | iou_aps, pose_aps, iou_acc, pose_acc = compute_mAP(pred_results, result_dir, degree_thres_list, shift_thres_list, 226 | iou_thres_list, iou_pose_thres=0.1, use_matches_for_pose=True) 227 | # metric 228 | fw = open('{0}/eval_logs.txt'.format(result_dir), 'a') 229 | iou_25_idx = iou_thres_list.index(0.25) 230 | iou_50_idx = iou_thres_list.index(0.5) 231 | iou_75_idx = iou_thres_list.index(0.75) 232 | degree_05_idx = degree_thres_list.index(5) 233 | degree_10_idx = degree_thres_list.index(10) 234 | shift_02_idx = shift_thres_list.index(2) 235 | shift_05_idx = shift_thres_list.index(5) 236 | messages = [] 237 | messages.append('mAP:') 238 | messages.append('3D IoU at 25: {:.1f}'.format(iou_aps[-1, iou_25_idx] * 100)) 239 | messages.append('3D IoU at 50: {:.1f}'.format(iou_aps[-1, iou_50_idx] * 100)) 240 | messages.append('3D IoU at 75: {:.1f}'.format(iou_aps[-1, iou_75_idx] * 100)) 241 | messages.append('5 degree, 2cm: {:.1f}'.format(pose_aps[-1, degree_05_idx, shift_02_idx] * 100)) 242 | messages.append('5 degree, 5cm: {:.1f}'.format(pose_aps[-1, degree_05_idx, shift_05_idx] * 100)) 243 | messages.append('10 degree, 2cm: {:.1f}'.format(pose_aps[-1, degree_10_idx, shift_02_idx] * 100)) 244 | messages.append('10 degree, 5cm: {:.1f}'.format(pose_aps[-1, degree_10_idx, shift_05_idx] * 100)) 245 | messages.append('Acc:') 246 | messages.append('3D IoU at 25: {:.1f}'.format(iou_acc[-1, iou_25_idx] * 100)) 247 | messages.append('3D IoU at 50: {:.1f}'.format(iou_acc[-1, iou_50_idx] * 100)) 248 | messages.append('3D IoU at 75: {:.1f}'.format(iou_acc[-1, iou_75_idx] * 100)) 249 | messages.append('5 degree, 2cm: {:.1f}'.format(pose_acc[-1, degree_05_idx, shift_02_idx] * 100)) 250 | messages.append('5 degree, 5cm: {:.1f}'.format(pose_acc[-1, degree_05_idx, shift_05_idx] * 100)) 251 | messages.append('10 degree, 2cm: {:.1f}'.format(pose_acc[-1, degree_10_idx, shift_02_idx] * 100)) 252 | messages.append('10 degree, 5cm: {:.1f}'.format(pose_acc[-1, degree_10_idx, shift_05_idx] * 100)) 253 | for msg in messages: 254 | print(msg) 255 | fw.write(msg + '\n') 256 | fw.close() 257 | # load NOCS results 258 | pkl_path = os.path.join('results/nocs_results', opt.data, 'mAP_Acc.pkl') 259 | with open(pkl_path, 'rb') as f: 260 | nocs_results = cPickle.load(f) 261 | nocs_iou_aps = nocs_results['iou_aps'][-1, :] 262 | nocs_pose_aps = nocs_results['pose_aps'][-1, :, :] 263 | iou_aps = np.concatenate((iou_aps, nocs_iou_aps[None, :]), axis=0) 264 | pose_aps = np.concatenate((pose_aps, nocs_pose_aps[None, :, :]), axis=0) 265 | # plot 266 | plot_mAP(iou_aps, pose_aps, result_dir, iou_thres_list, degree_thres_list, shift_thres_list) 267 | 268 | 269 | if __name__ == '__main__': 270 | print('Detecting ...') 271 | detect() 272 | print('Evaluating ...') 273 | evaluate() 274 | -------------------------------------------------------------------------------- /lib/align.py: -------------------------------------------------------------------------------- 1 | """ 2 | RANSAC for Similarity Transformation Estimation 3 | Modified from https://github.com/hughw19/NOCS_CVPR2019 4 | Originally Written by Srinath Sridhar 5 | """ 6 | import time 7 | import numpy as np 8 | 9 | 10 | def estimateSimilarityUmeyama(SourceHom, TargetHom): 11 | # Copy of original paper is at: 
http://web.stanford.edu/class/cs273/refs/umeyama.pdf 12 | SourceCentroid = np.mean(SourceHom[:3, :], axis=1) 13 | TargetCentroid = np.mean(TargetHom[:3, :], axis=1) 14 | nPoints = SourceHom.shape[1] 15 | CenteredSource = SourceHom[:3, :] - np.tile(SourceCentroid, (nPoints, 1)).transpose() 16 | CenteredTarget = TargetHom[:3, :] - np.tile(TargetCentroid, (nPoints, 1)).transpose() 17 | CovMatrix = np.matmul(CenteredTarget, np.transpose(CenteredSource)) / nPoints 18 | if np.isnan(CovMatrix).any(): 19 | print('nPoints:', nPoints) 20 | print(SourceHom.shape) 21 | print(TargetHom.shape) 22 | raise RuntimeError('There are NANs in the input.') 23 | 24 | U, D, Vh = np.linalg.svd(CovMatrix, full_matrices=True) 25 | d = (np.linalg.det(U) * np.linalg.det(Vh)) < 0.0 26 | if d: 27 | D[-1] = -D[-1] 28 | U[:, -1] = -U[:, -1] 29 | # rotation 30 | Rotation = np.matmul(U, Vh) 31 | # scale 32 | varP = np.var(SourceHom[:3, :], axis=1).sum() 33 | Scale = 1 / varP * np.sum(D) 34 | # translation 35 | Translation = TargetHom[:3, :].mean(axis=1) - SourceHom[:3, :].mean(axis=1).dot(Scale*Rotation.T) 36 | # transformation matrix 37 | OutTransform = np.identity(4) 38 | OutTransform[:3, :3] = Scale * Rotation 39 | OutTransform[:3, 3] = Translation 40 | 41 | return Scale, Rotation, Translation, OutTransform 42 | 43 | 44 | def estimateSimilarityTransform(source: np.array, target: np.array, verbose=False): 45 | """ Add RANSAC algorithm to account for outliers. 46 | 47 | """ 48 | assert source.shape[0] == target.shape[0], 'Source and Target must have same number of points.' 49 | SourceHom = np.transpose(np.hstack([source, np.ones([source.shape[0], 1])])) 50 | TargetHom = np.transpose(np.hstack([target, np.ones([target.shape[0], 1])])) 51 | # Auto-parameter selection based on source heuristics 52 | # Assume source is object model or gt nocs map, which is of high quality 53 | SourceCentroid = np.mean(SourceHom[:3, :], axis=1) 54 | nPoints = SourceHom.shape[1] 55 | CenteredSource = SourceHom[:3, :] - np.tile(SourceCentroid, (nPoints, 1)).transpose() 56 | SourceDiameter = 2 * np.amax(np.linalg.norm(CenteredSource, axis=0)) 57 | InlierT = SourceDiameter / 10.0 # 0.1 of source diameter 58 | maxIter = 128 59 | confidence = 0.99 60 | 61 | if verbose: 62 | print('Inlier threshold: ', InlierT) 63 | print('Max number of iterations: ', maxIter) 64 | 65 | BestInlierRatio = 0 66 | BestInlierIdx = np.arange(nPoints) 67 | for i in range(0, maxIter): 68 | # Pick 5 random (but corresponding) points from source and target 69 | RandIdx = np.random.randint(nPoints, size=5) 70 | Scale, _, _, OutTransform = estimateSimilarityUmeyama(SourceHom[:, RandIdx], TargetHom[:, RandIdx]) 71 | PassThreshold = Scale * InlierT # propagate inlier threshold to target scale 72 | Diff = TargetHom - np.matmul(OutTransform, SourceHom) 73 | ResidualVec = np.linalg.norm(Diff[:3, :], axis=0) 74 | InlierIdx = np.where(ResidualVec < PassThreshold)[0] 75 | nInliers = InlierIdx.shape[0] 76 | InlierRatio = nInliers / nPoints 77 | # update best hypothesis 78 | if InlierRatio > BestInlierRatio: 79 | BestInlierRatio = InlierRatio 80 | BestInlierIdx = InlierIdx 81 | if verbose: 82 | print('Iteration: ', i) 83 | print('Inlier ratio: ', BestInlierRatio) 84 | # early break 85 | if (1 - (1 - BestInlierRatio ** 5) ** i) > confidence: 86 | break 87 | 88 | if(BestInlierRatio < 0.1): 89 | print('[ WARN ] - Something is wrong. 
Small BestInlierRatio: ', BestInlierRatio) 90 | return None, None, None, None 91 | 92 | SourceInliersHom = SourceHom[:, BestInlierIdx] 93 | TargetInliersHom = TargetHom[:, BestInlierIdx] 94 | Scale, Rotation, Translation, OutTransform = estimateSimilarityUmeyama(SourceInliersHom, TargetInliersHom) 95 | 96 | if verbose: 97 | print('BestInlierRatio:', BestInlierRatio) 98 | print('Rotation:\n', Rotation) 99 | print('Translation:\n', Translation) 100 | print('Scale:', Scale) 101 | 102 | return Scale, Rotation, Translation, OutTransform 103 | 104 | 105 | def backproject(depth, intrinsics, instance_mask): 106 | """ Back-projection, use opencv camera coordinate frame. 107 | 108 | """ 109 | cam_fx = intrinsics[0, 0] 110 | cam_fy = intrinsics[1, 1] 111 | cam_cx = intrinsics[0, 2] 112 | cam_cy = intrinsics[1, 2] 113 | 114 | non_zero_mask = (depth > 0) 115 | final_instance_mask = np.logical_and(instance_mask, non_zero_mask) 116 | idxs = np.where(final_instance_mask) 117 | 118 | z = depth[idxs[0], idxs[1]] 119 | x = (idxs[1] - cam_cx) * z / cam_fx 120 | y = (idxs[0] - cam_cy) * z / cam_fy 121 | pts = np.stack((x, y, z), axis=1) 122 | 123 | return pts, idxs 124 | 125 | 126 | def align_nocs_to_depth(masks, coords, depth, intrinsics, instance_ids, img_path, verbose=False): 127 | num_instances = len(instance_ids) 128 | error_messages = '' 129 | elapses = [] 130 | scales = np.zeros(num_instances) 131 | rotations = np.zeros((num_instances, 3, 3)) 132 | translations = np.zeros((num_instances, 3)) 133 | 134 | for i in range(num_instances): 135 | mask = masks[:, :, i] 136 | coord = coords[:, :, i, :] 137 | pts, idxs = backproject(depth, intrinsics, mask) 138 | coord_pts = coord[idxs[0], idxs[1], :] - 0.5 139 | try: 140 | start = time.time() 141 | s, R, T, outtransform = estimateSimilarityTransform(coord_pts, pts, False) 142 | elapsed = time.time() - start 143 | if verbose: 144 | print('elapsed: ', elapsed) 145 | elapses.append(elapsed) 146 | except Exception as e: 147 | message = '[ Error ] aligning instance {} in {} fails. 
Message: {}.'.format(instance_ids[i], img_path, str(e)) 148 | print(message) 149 | error_messages += message + '\n' 150 | s = 1.0 151 | R = np.eye(3) 152 | T = np.zeros(3) 153 | outtransform = np.identity(4, dtype=np.float32) 154 | 155 | scales[i] = s / 1000.0 156 | rotations[i, :, :] = R 157 | translations[i, :] = T / 1000.0 158 | 159 | return scales, rotations, translations, error_messages, elapses 160 | -------------------------------------------------------------------------------- /lib/auto_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class PointCloudEncoder(nn.Module): 7 | def __init__(self, emb_dim): 8 | super(PointCloudEncoder, self).__init__() 9 | self.conv1 = nn.Conv1d(3, 64, 1) 10 | self.conv2 = nn.Conv1d(64, 128, 1) 11 | self.conv3 = nn.Conv1d(256, 256, 1) 12 | self.conv4 = nn.Conv1d(256, 1024, 1) 13 | self.fc = nn.Linear(1024, emb_dim) 14 | 15 | def forward(self, xyz): 16 | """ 17 | Args: 18 | xyz: (B, 3, N) 19 | 20 | """ 21 | np = xyz.size()[2] 22 | x = F.relu(self.conv1(xyz)) 23 | x = F.relu(self.conv2(x)) 24 | global_feat = F.adaptive_max_pool1d(x, 1) 25 | x = torch.cat((x, global_feat.repeat(1, 1, np)), dim=1) 26 | x = F.relu(self.conv3(x)) 27 | x = F.relu(self.conv4(x)) 28 | x = torch.squeeze(F.adaptive_max_pool1d(x, 1), dim=2) 29 | embedding = self.fc(x) 30 | return embedding 31 | 32 | 33 | class PointCloudDecoder(nn.Module): 34 | def __init__(self, emb_dim, n_pts): 35 | super(PointCloudDecoder, self).__init__() 36 | self.fc1 = nn.Linear(emb_dim, 512) 37 | self.fc2 = nn.Linear(512, 1024) 38 | self.fc3 = nn.Linear(1024, 3*n_pts) 39 | 40 | def forward(self, embedding): 41 | """ 42 | Args: 43 | embedding: (B, 512) 44 | 45 | """ 46 | bs = embedding.size()[0] 47 | out = F.relu(self.fc1(embedding)) 48 | out = F.relu(self.fc2(out)) 49 | out = self.fc3(out) 50 | out_pc = out.view(bs, -1, 3) 51 | return out_pc 52 | 53 | 54 | class PointCloudAE(nn.Module): 55 | def __init__(self, emb_dim=512, n_pts=1024): 56 | super(PointCloudAE, self).__init__() 57 | self.encoder = PointCloudEncoder(emb_dim) 58 | self.decoder = PointCloudDecoder(emb_dim, n_pts) 59 | 60 | def forward(self, in_pc, emb=None): 61 | """ 62 | Args: 63 | in_pc: (B, N, 3) 64 | emb: (B, 512) 65 | 66 | Returns: 67 | emb: (B, emb_dim) 68 | out_pc: (B, n_pts, 3) 69 | 70 | """ 71 | if emb is None: 72 | xyz = in_pc.permute(0, 2, 1) 73 | emb = self.encoder(xyz) 74 | out_pc = self.decoder(emb) 75 | return emb, out_pc 76 | -------------------------------------------------------------------------------- /lib/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from .nn_distance.chamfer_loss import ChamferLoss 5 | 6 | 7 | class Loss(nn.Module): 8 | """ Loss for training DeformNet. 9 | Use NOCS coords to supervise training. 
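    The total loss is a weighted sum of four terms: a smooth-L1 correspondence loss
    between the soft-assigned coordinates and the ground-truth NOCS coordinates, a
    Chamfer distance between the deformed prior and the instance model, an entropy
    term that encourages peaked assignment distributions, and an L2 regularizer on
    the predicted deformation field.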
10 | """ 11 | def __init__(self, corr_wt, cd_wt, entropy_wt, deform_wt): 12 | super(Loss, self).__init__() 13 | self.threshold = 0.1 14 | self.chamferloss = ChamferLoss() 15 | self.corr_wt = corr_wt 16 | self.cd_wt = cd_wt 17 | self.entropy_wt = entropy_wt 18 | self.deform_wt = deform_wt 19 | 20 | def forward(self, assign_mat, deltas, prior, nocs, model): 21 | """ 22 | Args: 23 | assign_mat: bs x n_pts x nv 24 | deltas: bs x nv x 3 25 | prior: bs x nv x 3 26 | """ 27 | inst_shape = prior + deltas 28 | # smooth L1 loss for correspondences 29 | soft_assign = F.softmax(assign_mat, dim=2) 30 | coords = torch.bmm(soft_assign, inst_shape) # bs x n_pts x 3 31 | diff = torch.abs(coords - nocs) 32 | less = torch.pow(diff, 2) / (2.0 * self.threshold) 33 | higher = diff - self.threshold / 2.0 34 | corr_loss = torch.where(diff > self.threshold, higher, less) 35 | corr_loss = torch.mean(torch.sum(corr_loss, dim=2)) 36 | corr_loss = self.corr_wt * corr_loss 37 | # entropy loss to encourage peaked distribution 38 | log_assign = F.log_softmax(assign_mat, dim=2) 39 | entropy_loss = torch.mean(-torch.sum(soft_assign * log_assign, 2)) 40 | entropy_loss = self.entropy_wt * entropy_loss 41 | # cd-loss for instance reconstruction 42 | cd_loss, _, _ = self.chamferloss(inst_shape, model) 43 | cd_loss = self.cd_wt * cd_loss 44 | # L2 regularizations on deformation 45 | deform_loss = torch.norm(deltas, p=2, dim=2).mean() 46 | deform_loss = self.deform_wt * deform_loss 47 | # total loss 48 | total_loss = corr_loss + entropy_loss + cd_loss + deform_loss 49 | return total_loss, corr_loss, cd_loss, entropy_loss, deform_loss 50 | -------------------------------------------------------------------------------- /lib/network.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from lib.pspnet import PSPNet 4 | 5 | 6 | class DeformNet(nn.Module): 7 | def __init__(self, n_cat=6, nv_prior=1024): 8 | super(DeformNet, self).__init__() 9 | self.n_cat = n_cat 10 | self.psp = PSPNet(bins=(1, 2, 3, 6), backend='resnet18') 11 | self.instance_color = nn.Sequential( 12 | nn.Conv1d(32, 64, 1), 13 | nn.ReLU(), 14 | ) 15 | self.instance_geometry = nn.Sequential( 16 | nn.Conv1d(3, 64, 1), 17 | nn.ReLU(), 18 | nn.Conv1d(64, 64, 1), 19 | nn.ReLU(), 20 | nn.Conv1d(64, 64, 1), 21 | nn.ReLU(), 22 | ) 23 | self.instance_global = nn.Sequential( 24 | nn.Conv1d(128, 128, 1), 25 | nn.ReLU(), 26 | nn.Conv1d(128, 1024, 1), 27 | nn.ReLU(), 28 | nn.AdaptiveAvgPool1d(1), 29 | ) 30 | self.category_local = nn.Sequential( 31 | nn.Conv1d(3, 64, 1), 32 | nn.ReLU(), 33 | nn.Conv1d(64, 64, 1), 34 | nn.ReLU(), 35 | nn.Conv1d(64, 64, 1), 36 | nn.ReLU(), 37 | ) 38 | self.category_global = nn.Sequential( 39 | nn.Conv1d(64, 128, 1), 40 | nn.ReLU(), 41 | nn.Conv1d(128, 1024, 1), 42 | nn.ReLU(), 43 | nn.AdaptiveAvgPool1d(1), 44 | ) 45 | self.assignment = nn.Sequential( 46 | nn.Conv1d(2176, 512, 1), 47 | nn.ReLU(), 48 | nn.Conv1d(512, 256, 1), 49 | nn.ReLU(), 50 | nn.Conv1d(256, n_cat*nv_prior, 1), 51 | ) 52 | self.deformation = nn.Sequential( 53 | nn.Conv1d(2112, 512, 1), 54 | nn.ReLU(), 55 | nn.Conv1d(512, 256, 1), 56 | nn.ReLU(), 57 | nn.Conv1d(256, n_cat*3, 1), 58 | ) 59 | # Initialize weights to be small so initial deformations aren't so big 60 | self.deformation[4].weight.data.normal_(0, 0.0001) 61 | 62 | def forward(self, points, img, choose, cat_id, prior): 63 | """ 64 | Args: 65 | points: bs x n_pts x 3 66 | img: bs x 3 x H x W 67 | choose: bs x n_pts 68 | cat_id: bs 69 | prior: bs 
x nv x 3 70 | 71 | Returns: 72 | assign_mat: bs x n_pts x nv 73 | inst_shape: bs x nv x 3 74 | deltas: bs x nv x 3 75 | log_assign: bs x n_pts x nv, for numerical stability 76 | 77 | """ 78 | bs, n_pts = points.size()[:2] 79 | nv = prior.size()[1] 80 | # instance-specific features 81 | points = points.permute(0, 2, 1) 82 | points = self.instance_geometry(points) 83 | out_img = self.psp(img) 84 | di = out_img.size()[1] 85 | emb = out_img.view(bs, di, -1) 86 | choose = choose.unsqueeze(1).repeat(1, di, 1) 87 | emb = torch.gather(emb, 2, choose).contiguous() 88 | emb = self.instance_color(emb) 89 | inst_local = torch.cat((points, emb), dim=1) # bs x 128 x n_pts 90 | inst_global = self.instance_global(inst_local) # bs x 1024 x 1 91 | # category-specific features 92 | cat_prior = prior.permute(0, 2, 1) 93 | cat_local = self.category_local(cat_prior) # bs x 64 x n_pts 94 | cat_global = self.category_global(cat_local) # bs x 1024 x 1 95 | # assignemnt matrix 96 | assign_feat = torch.cat((inst_local, inst_global.repeat(1, 1, n_pts), cat_global.repeat(1, 1, n_pts)), dim=1) # bs x 2176 x n_pts 97 | assign_mat = self.assignment(assign_feat) 98 | assign_mat = assign_mat.view(-1, nv, n_pts).contiguous() # bs, nc*nv, n_pts -> bs*nc, nv, n_pts 99 | index = cat_id + torch.arange(bs, dtype=torch.long).cuda() * self.n_cat 100 | assign_mat = torch.index_select(assign_mat, 0, index) # bs x nv x n_pts 101 | assign_mat = assign_mat.permute(0, 2, 1).contiguous() # bs x n_pts x nv 102 | # deformation field 103 | deform_feat = torch.cat((cat_local, cat_global.repeat(1, 1, nv), inst_global.repeat(1, 1, nv)), dim=1) # bs x 2112 x n_pts 104 | deltas = self.deformation(deform_feat) 105 | deltas = deltas.view(-1, 3, nv).contiguous() # bs, nc*3, nv -> bs*nc, 3, nv 106 | deltas = torch.index_select(deltas, 0, index) # bs x 3 x nv 107 | deltas = deltas.permute(0, 2, 1).contiguous() # bs x nv x 3 108 | 109 | return assign_mat, deltas 110 | -------------------------------------------------------------------------------- /lib/nn_distance/chamfer_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import nn_distance 3 | 4 | 5 | class NnDistanceFunction(torch.autograd.Function): 6 | """ 3D point set to 3D point set distance. 
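    Given xyz1 (B, N, 3) and xyz2 (B, M, 3), forward() returns the squared distance
    from each point to its nearest neighbour in the other set (dist1: B x N,
    dist2: B x M) together with the corresponding indices; backward() propagates
    gradients through both directions.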
7 | 8 | """ 9 | @staticmethod 10 | def forward(ctx, xyz1, xyz2): 11 | B, N, _ = xyz1.size() 12 | B, M, _ = xyz2.size() 13 | result = torch.empty(B, N, dtype=xyz1.dtype, device=xyz1.device) 14 | result_i = torch.empty(B, N, dtype=torch.int32, device=xyz1.device) 15 | result2 = torch.empty(B, M, dtype=xyz2.dtype, device=xyz2.device) 16 | result2_i = torch.empty(B, M, dtype=torch.int32, device=xyz2.device) 17 | nn_distance.forward(xyz1, xyz2, result, result2, result_i, result2_i) 18 | ctx.save_for_backward(xyz1, xyz2, result_i, result2_i) 19 | ctx.mark_non_differentiable(result_i, result2_i) 20 | return result, result2, result_i, result2_i 21 | 22 | @staticmethod 23 | def backward(ctx, d_dist1, d_dist2, d_i1, d_i2): 24 | B, N = d_dist1.size() 25 | B, M = d_dist2.size() 26 | xyz1, xyz2, idx1, idx2 = ctx.saved_variables 27 | d_xyz1 = torch.zeros_like(xyz1) 28 | d_xyz2 = torch.zeros_like(xyz2) 29 | gradient1, gradient2 = ctx.needs_input_grad 30 | nn_distance.backward(xyz1, xyz2, d_xyz1, d_xyz2, d_dist1, d_dist2, idx1, idx2) 31 | if not gradient1: 32 | return None, d_xyz2 33 | if not gradient2: 34 | return d_xyz1, None 35 | else: 36 | return d_xyz1, d_xyz2 37 | 38 | 39 | class ChamferLoss(torch.nn.Module): 40 | """ Chamfer Loss: bidirectional nearest neighbor distance of two point sets. 41 | 42 | """ 43 | def __init__(self, threshold=None, backward_weight=1.0): 44 | super(ChamferLoss, self).__init__() 45 | # only consider distance smaller than threshold*mean(distance) (remove outlier) 46 | self.__threshold = threshold 47 | self.backward_weight = backward_weight 48 | 49 | def set_threshold(self, value): 50 | self.__threshold = value 51 | 52 | def unset_threshold(self): 53 | self.__threshold = None 54 | 55 | def forward(self, pred, gt): 56 | assert(pred.dim() == 3 and gt.dim() == 3), \ 57 | "input for ChamferLoss must be a 3D-tensor, but pred.size() is {} gt.size() is {}".format(pred.size(), gt.size()) 58 | # need transpose 59 | if pred.size(2) != 3: 60 | assert(pred.size(1) == 3), "ChamferLoss is implemented for 3D points" 61 | pred = pred.transpose(2, 1).contiguous() 62 | if gt.size(2) != 3: 63 | assert(gt.size(1) == 3), "ChamferLoss is implemented for 3D points" 64 | gt = gt.transpose(2, 1).contiguous() 65 | assert(pred.size(2) == 3 and gt.size(2) == 3), "ChamferLoss is implemented for 3D points" 66 | pred2gt, gt2pred, idx1, idx2 = NnDistanceFunction.apply(pred, gt) 67 | 68 | if self.__threshold is not None: 69 | threshold = self.__threshold 70 | forward_threshold = torch.mean(pred2gt, dim=1, keepdim=True) * threshold 71 | backward_threshold = torch.mean(gt2pred, dim=1, keepdim=True) * threshold 72 | # only care about distance within threshold (ignore strong outliers) 73 | pred2gt = torch.where(pred2gt < forward_threshold, pred2gt, torch.zeros_like(pred2gt)) 74 | gt2pred = torch.where(gt2pred < backward_threshold, gt2pred, torch.zeros_like(gt2pred)) 75 | 76 | pred2gt = torch.mean(pred2gt, dim=1) 77 | gt2pred = torch.mean(gt2pred, dim=1) 78 | cd_dist = pred2gt + self.backward_weight * gt2pred 79 | cd_loss = torch.mean(cd_dist) 80 | return cd_loss, idx1, idx2 81 | 82 | 83 | if __name__ == '__main__': 84 | from torch.autograd import gradcheck 85 | nndistance = NnDistanceFunction.apply 86 | pc1 = torch.randn([2, 60, 3], dtype=torch.float, requires_grad=True).cuda() 87 | pc2 = torch.randn([2, 30, 3], dtype=torch.float, requires_grad=True).cuda() 88 | test = gradcheck(nndistance, (pc1, pc2), eps=1e-3, atol=1e-3) 89 | print(test) 90 | 
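# A quick CPU-side sanity check for the compiled extension: the same bidirectional
# nearest-neighbour cost can be computed by brute force with torch.cdist. This is a
# minimal sketch and not part of the repository (the function name and reduction are
# illustrative); distances are squared to match the CUDA kernel, so the result should
# agree with ChamferLoss up to numerical precision on small random inputs.
import torch

def chamfer_distance_bruteforce(pred, gt, backward_weight=1.0):
    """Brute-force bidirectional nearest-neighbour distance.

    pred: (B, N, 3), gt: (B, M, 3). Mirrors the reduction used in ChamferLoss.
    """
    dist = torch.cdist(pred, gt)                    # (B, N, M) pairwise Euclidean distances
    pred2gt = dist.min(dim=2).values ** 2           # squared distance to nearest gt point
    gt2pred = dist.min(dim=1).values ** 2           # squared distance to nearest pred point
    cd_dist = pred2gt.mean(dim=1) + backward_weight * gt2pred.mean(dim=1)
    return cd_dist.mean()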
-------------------------------------------------------------------------------- /lib/nn_distance/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 3 | 4 | 5 | setup( 6 | name='nn_distance', 7 | ext_modules=[ 8 | CUDAExtension('nn_distance', [ 9 | 'src/nn_distance.cpp', 10 | 'src/nn_distance_cuda.cu', ], 11 | extra_compile_args={'cxx': ['-g'], 'nvcc': ['-O2']}) 12 | ], 13 | 14 | cmdclass={ 15 | 'build_ext': BuildExtension 16 | }) 17 | -------------------------------------------------------------------------------- /lib/nn_distance/src/nn_distance.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int nn_distance_cuda_forward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor dist1, at::Tensor dist2, at::Tensor idx1, at::Tensor idx2); 5 | 6 | 7 | int nn_distance_cuda_backward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor gradxyz1, at::Tensor gradxyz2, at::Tensor graddist1, at::Tensor graddist2, at::Tensor idx1, at::Tensor idx2); 8 | 9 | 10 | int nn_distance_forward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor dist1, at::Tensor dist2, at::Tensor idx1, at::Tensor idx2) { 11 | return nn_distance_cuda_forward(xyz1, xyz2, dist1, dist2, idx1, idx2); 12 | } 13 | 14 | 15 | int nn_distance_backward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor gradxyz1, at::Tensor gradxyz2, at::Tensor graddist1, 16 | at::Tensor graddist2, at::Tensor idx1, at::Tensor idx2) { 17 | return nn_distance_cuda_backward(xyz1, xyz2, gradxyz1, gradxyz2, graddist1, graddist2, idx1, idx2); 18 | } 19 | 20 | 21 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 22 | m.def("forward", &nn_distance_forward, "nn_distance forward (CUDA)"); 23 | m.def("backward", &nn_distance_backward, "nn_distance backward (CUDA)"); 24 | } -------------------------------------------------------------------------------- /lib/nn_distance/src/nn_distance_cuda.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | 8 | #define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") 9 | #define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") 10 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 11 | 12 | 13 | __global__ void NmDistanceKernel(int b,int n,const float * xyz,int m,const float * xyz2,float * result,int * result_i){ 14 | const int batch=512; 15 | __shared__ float buf[batch*3]; 16 | for (int i=blockIdx.x;ibest){ 128 | result[(i*n+j)]=best; 129 | result_i[(i*n+j)]=best_i; 130 | } 131 | } 132 | __syncthreads(); 133 | } 134 | } 135 | } 136 | 137 | 138 | int nn_distance_cuda_forward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor dist1, at::Tensor dist2, at::Tensor idx1, at::Tensor idx2){ 139 | CHECK_INPUT(xyz1); 140 | CHECK_INPUT(xyz2); 141 | 142 | const auto batch_size = xyz1.size(0); 143 | const auto n = xyz1.size(1); //num_points point cloud A 144 | const auto m = xyz2.size(1); //num_points point cloud B 145 | 146 | NmDistanceKernel<<>>(batch_size, n, xyz1.data(), m, xyz2.data(), dist1.data(), idx1.data()); 147 | NmDistanceKernel<<>>(batch_size, m, xyz2.data(), n, xyz1.data(), dist2.data(), idx2.data()); 148 | 149 | cudaError_t err = cudaGetLastError(); 150 | if (err != cudaSuccess) { 151 | printf("error in nnd updateOutput: %s\n", cudaGetErrorString(err)); 152 | return 0; 153 | } 154 | return 1; 
155 | } 156 | 157 | 158 | __global__ void NmDistanceGradKernel(int b,int n,const float * xyz1,int m,const float * xyz2,const float * grad_dist1,const int * idx1,float * grad_xyz1,float * grad_xyz2){ 159 | for (int i=blockIdx.x;i>>(batch_size,n,xyz1.data(),m,xyz2.data(),graddist1.data(),idx1.data(),gradxyz1.data(),gradxyz2.data()); 186 | NmDistanceGradKernel<<>>(batch_size,m,xyz2.data(),n,xyz1.data(),graddist2.data(),idx2.data(),gradxyz2.data(),gradxyz1.data()); 187 | 188 | cudaError_t err = cudaGetLastError(); 189 | if (err != cudaSuccess) { 190 | printf("error in nnd get grad: %s\n", cudaGetErrorString(err)); 191 | return 0; 192 | } 193 | return 1; 194 | } 195 | -------------------------------------------------------------------------------- /lib/pspnet.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | 7 | def conv3x3(in_planes, out_planes, stride=1, dilation=1): 8 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=dilation, dilation=dilation, bias=False) 9 | 10 | 11 | class BasicBlock(nn.Module): 12 | expansion = 1 13 | def __init__(self, inplanes, planes, stride=1, downsample=None, dilation=1): 14 | super(BasicBlock, self).__init__() 15 | self.conv1 = conv3x3(inplanes, planes, stride=stride, dilation=dilation) 16 | self.relu = nn.ReLU(inplace=True) 17 | self.conv2 = conv3x3(planes, planes, stride=1, dilation=dilation) 18 | self.downsample = downsample 19 | self.stride = stride 20 | 21 | def forward(self, x): 22 | residual = x 23 | out = self.conv1(x) 24 | out = self.relu(out) 25 | out = self.conv2(out) 26 | if self.downsample is not None: 27 | residual = self.downsample(x) 28 | out += residual 29 | out = self.relu(out) 30 | return out 31 | 32 | 33 | class ResNet(nn.Module): 34 | def __init__(self, block, layers=(3, 4, 23, 3)): 35 | self.inplanes = 64 36 | super(ResNet, self).__init__() 37 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) 38 | self.relu = nn.ReLU(inplace=True) 39 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 40 | self.layer1 = self._make_layer(block, 64, layers[0]) 41 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 42 | self.layer3 = self._make_layer(block, 256, layers[2], stride=1, dilation=2) 43 | self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=4) 44 | 45 | for m in self.modules(): 46 | if isinstance(m, nn.Conv2d): 47 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 48 | m.weight.data.normal_(0, math.sqrt(2./n)) 49 | elif isinstance(m, nn.BatchNorm2d): 50 | m.weight.data.fill_(1) 51 | m.bias.data.zero_() 52 | 53 | def _make_layer(self, block, planes, blocks, stride=1, dilation=1): 54 | downsample = None 55 | if stride != 1 or self.inplanes != planes*block.expansion: 56 | downsample = nn.Sequential( 57 | nn.Conv2d(self.inplanes, planes*block.expansion, kernel_size=1, stride=stride, bias=False) 58 | ) 59 | layers = [block(self.inplanes, planes, stride, downsample)] 60 | self.inplanes = planes * block.expansion 61 | for i in range(1, blocks): 62 | layers.append(block(self.inplanes, planes, dilation=dilation)) 63 | return nn.Sequential(*layers) 64 | 65 | def forward(self, x): 66 | x = self.conv1(x) 67 | x = self.relu(x) 68 | x = self.maxpool(x) 69 | x = self.layer1(x) 70 | x = self.layer2(x) 71 | x = self.layer3(x) 72 | x = self.layer4(x) 73 | return x 74 | 75 | 76 | class PSPModule(nn.Module): 77 | def 
__init__(self, feat_dim, bins=(1, 2, 3, 6)): 78 | super(PSPModule, self).__init__() 79 | self.reduction_dim = feat_dim // len(bins) 80 | self.stages = [] 81 | self.stages = nn.ModuleList([self._make_stage(feat_dim, size) for size in bins]) 82 | 83 | def _make_stage(self, feat_dim, size): 84 | prior = nn.AdaptiveAvgPool2d(output_size=(size, size)) 85 | conv = nn.Conv2d(feat_dim, self.reduction_dim, kernel_size=1, bias=False) 86 | relu = nn.ReLU(inplace=True) 87 | return nn.Sequential(prior, conv, relu) 88 | 89 | def forward(self, feats): 90 | h, w = feats.size(2), feats.size(3) 91 | priors = [feats] 92 | for stage in self.stages: 93 | priors.append(F.interpolate(input=stage(feats), size=(h, w), mode='bilinear', align_corners=True)) 94 | return torch.cat(priors, 1) 95 | 96 | 97 | class PSPUpsample(nn.Module): 98 | def __init__(self, in_channels, out_channels): 99 | super(PSPUpsample, self).__init__() 100 | self.conv = nn.Sequential( 101 | nn.Conv2d(in_channels, out_channels, 3, padding=1), 102 | nn.PReLU() 103 | ) 104 | 105 | def forward(self, x): 106 | x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True) 107 | return self.conv(x) 108 | 109 | 110 | class PSPNet(nn.Module): 111 | def __init__(self, bins=(1, 2, 3, 6), backend='resnet18'): 112 | super(PSPNet, self).__init__() 113 | if backend == 'resnet18': 114 | self.feats = ResNet(BasicBlock, [2, 2, 2, 2]) 115 | feat_dim = 512 116 | else: 117 | raise NotImplementedError 118 | self.psp = PSPModule(feat_dim, bins) 119 | self.drop = nn.Dropout2d(p=0.15) 120 | self.up_1 = PSPUpsample(1024, 256) 121 | self.up_2 = PSPUpsample(256, 64) 122 | self.up_3 = PSPUpsample(64, 64) 123 | self.final = nn.Conv2d(64, 32, kernel_size=1) 124 | 125 | def forward(self, x): 126 | f = self.feats(x) 127 | p = self.psp(f) 128 | p = self.up_1(p) 129 | p = self.drop(p) 130 | p = self.up_2(p) 131 | p = self.drop(p) 132 | p = self.up_3(p) 133 | return self.final(p) 134 | -------------------------------------------------------------------------------- /lib/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Evaluation-related codes are modified from 3 | https://github.com/hughw19/NOCS_CVPR2019 4 | """ 5 | import logging 6 | import os 7 | import math 8 | import cv2 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | import _pickle as cPickle 12 | from tqdm import tqdm 13 | 14 | 15 | def setup_logger(logger_name, log_file, level=logging.INFO): 16 | logger = logging.getLogger(logger_name) 17 | formatter = logging.Formatter('%(asctime)s : %(message)s') 18 | fileHandler = logging.FileHandler(log_file, mode='a') 19 | fileHandler.setFormatter(formatter) 20 | logger.setLevel(level) 21 | logger.addHandler(fileHandler) 22 | streamHandler = logging.StreamHandler() 23 | streamHandler.setFormatter(formatter) 24 | logger.addHandler(streamHandler) 25 | return logger 26 | 27 | 28 | def load_obj(path_to_file): 29 | """ Load obj file. 
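    Only 'v ' and 'f' records are parsed; face indices are converted to 0-based,
    and only the vertex index of each 'v/vt/vn' triplet is kept.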
30 | 31 | Args: 32 | path_to_file: path 33 | 34 | Returns: 35 | vertices: ndarray 36 | faces: ndarray, index of triangle vertices 37 | 38 | """ 39 | vertices = [] 40 | faces = [] 41 | with open(path_to_file, 'r') as f: 42 | for line in f: 43 | if line[:2] == 'v ': 44 | vertex = line[2:].strip().split(' ') 45 | vertex = [float(xyz) for xyz in vertex] 46 | vertices.append(vertex) 47 | elif line[0] == 'f': 48 | face = line[1:].replace('//', '/').strip().split(' ') 49 | face = [int(idx.split('/')[0])-1 for idx in face] 50 | faces.append(face) 51 | else: 52 | continue 53 | vertices = np.asarray(vertices) 54 | faces = np.asarray(faces) 55 | return vertices, faces 56 | 57 | 58 | def create_sphere(): 59 | # 642 verts, 1280 faces, 60 | verts, faces = load_obj('assets/sphere_mesh_template.obj') 61 | return verts, faces 62 | 63 | 64 | def random_point(face_vertices): 65 | """ Sampling point using Barycentric coordiante. 66 | 67 | """ 68 | r1, r2 = np.random.random(2) 69 | sqrt_r1 = np.sqrt(r1) 70 | point = (1 - sqrt_r1) * face_vertices[0, :] + \ 71 | sqrt_r1 * (1 - r2) * face_vertices[1, :] + \ 72 | sqrt_r1 * r2 * face_vertices[2, :] 73 | 74 | return point 75 | 76 | 77 | def pairwise_distance(A, B): 78 | """ Compute pairwise distance of two point clouds.point 79 | 80 | Args: 81 | A: n x 3 numpy array 82 | B: m x 3 numpy array 83 | 84 | Return: 85 | C: n x m numpy array 86 | 87 | """ 88 | diff = A[:, :, None] - B[:, :, None].T 89 | C = np.sqrt(np.sum(diff**2, axis=1)) 90 | 91 | return C 92 | 93 | 94 | def uniform_sample(vertices, faces, n_samples, with_normal=False): 95 | """ Sampling points according to the area of mesh surface. 96 | 97 | """ 98 | sampled_points = np.zeros((n_samples, 3), dtype=float) 99 | normals = np.zeros((n_samples, 3), dtype=float) 100 | faces = vertices[faces] 101 | vec_cross = np.cross(faces[:, 1, :] - faces[:, 0, :], 102 | faces[:, 2, :] - faces[:, 0, :]) 103 | face_area = 0.5 * np.linalg.norm(vec_cross, axis=1) 104 | cum_area = np.cumsum(face_area) 105 | for i in range(n_samples): 106 | face_id = np.searchsorted(cum_area, np.random.random() * cum_area[-1]) 107 | sampled_points[i] = random_point(faces[face_id, :, :]) 108 | normals[i] = vec_cross[face_id] 109 | normals = normals / np.linalg.norm(normals, axis=1, keepdims=True) 110 | if with_normal: 111 | sampled_points = np.concatenate((sampled_points, normals), axis=1) 112 | return sampled_points 113 | 114 | 115 | def farthest_point_sampling(points, n_samples): 116 | """ Farthest point sampling. 117 | 118 | """ 119 | selected_pts = np.zeros((n_samples,), dtype=int) 120 | dist_mat = pairwise_distance(points, points) 121 | # start from first point 122 | pt_idx = 0 123 | dist_to_set = dist_mat[:, pt_idx] 124 | for i in range(n_samples): 125 | selected_pts[i] = pt_idx 126 | dist_to_set = np.minimum(dist_to_set, dist_mat[:, pt_idx]) 127 | pt_idx = np.argmax(dist_to_set) 128 | return selected_pts 129 | 130 | 131 | def sample_points_from_mesh(path, n_pts, with_normal=False, fps=False, ratio=2): 132 | """ Uniformly sampling points from mesh model. 133 | 134 | Args: 135 | path: path to OBJ file. 136 | n_pts: int, number of points being sampled. 137 | with_normal: return points with normal, approximated by mesh triangle normal 138 | fps: whether to use fps for post-processing, default False. 139 | ratio: int, if use fps, sample ratio*n_pts first, then use fps to sample final output. 
140 | 141 | Returns: 142 | points: n_pts x 3, n_pts x 6 if with_normal = True 143 | 144 | """ 145 | vertices, faces = load_obj(path) 146 | if fps: 147 | points = uniform_sample(vertices, faces, ratio*n_pts, with_normal) 148 | pts_idx = farthest_point_sampling(points[:, :3], n_pts) 149 | points = points[pts_idx] 150 | else: 151 | points = uniform_sample(vertices, faces, n_pts, with_normal) 152 | return points 153 | 154 | 155 | def load_depth(img_path): 156 | """ Load depth image from img_path. """ 157 | depth_path = img_path + '_depth.png' 158 | depth = cv2.imread(depth_path, -1) 159 | if len(depth.shape) == 3: 160 | # This is encoded depth image, let's convert 161 | # NOTE: RGB is actually BGR in opencv 162 | depth16 = depth[:, :, 1]*256 + depth[:, :, 2] 163 | depth16 = np.where(depth16==32001, 0, depth16) 164 | depth16 = depth16.astype(np.uint16) 165 | elif len(depth.shape) == 2 and depth.dtype == 'uint16': 166 | depth16 = depth 167 | else: 168 | assert False, '[ Error ]: Unsupported depth type.' 169 | return depth16 170 | 171 | 172 | def get_bbox(bbox): 173 | """ Compute square image crop window. """ 174 | y1, x1, y2, x2 = bbox 175 | img_width = 480 176 | img_length = 640 177 | window_size = (max(y2-y1, x2-x1) // 40 + 1) * 40 178 | window_size = min(window_size, 440) 179 | center = [(y1 + y2) // 2, (x1 + x2) // 2] 180 | rmin = center[0] - int(window_size / 2) 181 | rmax = center[0] + int(window_size / 2) 182 | cmin = center[1] - int(window_size / 2) 183 | cmax = center[1] + int(window_size / 2) 184 | if rmin < 0: 185 | delt = -rmin 186 | rmin = 0 187 | rmax += delt 188 | if cmin < 0: 189 | delt = -cmin 190 | cmin = 0 191 | cmax += delt 192 | if rmax > img_width: 193 | delt = rmax - img_width 194 | rmax = img_width 195 | rmin -= delt 196 | if cmax > img_length: 197 | delt = cmax - img_length 198 | cmax = img_length 199 | cmin -= delt 200 | return rmin, rmax, cmin, cmax 201 | 202 | 203 | def compute_sRT_errors(sRT1, sRT2): 204 | """ 205 | Args: 206 | sRT1: [4, 4]. homogeneous affine transformation 207 | sRT2: [4, 4]. 
homogeneous affine transformation 208 | 209 | Returns: 210 | R_error: angle difference in degree, 211 | T_error: Euclidean distance 212 | IoU: relative scale error 213 | 214 | """ 215 | try: 216 | assert np.array_equal(sRT1[3, :], sRT2[3, :]) 217 | assert np.array_equal(sRT1[3, :], np.array([0, 0, 0, 1])) 218 | except AssertionError: 219 | print(sRT1[3, :], sRT2[3, :]) 220 | 221 | s1 = np.cbrt(np.linalg.det(sRT1[:3, :3])) 222 | R1 = sRT1[:3, :3] / s1 223 | T1 = sRT1[:3, 3] 224 | s2 = np.cbrt(np.linalg.det(sRT2[:3, :3])) 225 | R2 = sRT2[:3, :3] / s2 226 | T2 = sRT2[:3, 3] 227 | R12 = R1 @ R2.transpose() 228 | R_error = np.arccos(np.clip((np.trace(R12)-1)/2, -1.0, 1.0)) * 180 / np.pi 229 | T_error = np.linalg.norm(T1 - T2) 230 | IoU = np.abs(s1 - s2) / s2 231 | 232 | return R_error, T_error, IoU 233 | 234 | 235 | ############################################################ 236 | # Evaluation 237 | ############################################################ 238 | 239 | def get_3d_bbox(size, shift=0): 240 | """ 241 | Args: 242 | size: [3] or scalar 243 | shift: [3] or scalar 244 | Returns: 245 | bbox_3d: [3, N] 246 | 247 | """ 248 | bbox_3d = np.array([[+size[0] / 2, +size[1] / 2, +size[2] / 2], 249 | [+size[0] / 2, +size[1] / 2, -size[2] / 2], 250 | [-size[0] / 2, +size[1] / 2, +size[2] / 2], 251 | [-size[0] / 2, +size[1] / 2, -size[2] / 2], 252 | [+size[0] / 2, -size[1] / 2, +size[2] / 2], 253 | [+size[0] / 2, -size[1] / 2, -size[2] / 2], 254 | [-size[0] / 2, -size[1] / 2, +size[2] / 2], 255 | [-size[0] / 2, -size[1] / 2, -size[2] / 2]]) + shift 256 | bbox_3d = bbox_3d.transpose() 257 | return bbox_3d 258 | 259 | 260 | def transform_coordinates_3d(coordinates, sRT): 261 | """ 262 | Args: 263 | coordinates: [3, N] 264 | sRT: [4, 4] 265 | 266 | Returns: 267 | new_coordinates: [3, N] 268 | 269 | """ 270 | assert coordinates.shape[0] == 3 271 | coordinates = np.vstack([coordinates, np.ones((1, coordinates.shape[1]), dtype=np.float32)]) 272 | new_coordinates = sRT @ coordinates 273 | new_coordinates = new_coordinates[:3, :] / new_coordinates[3, :] 274 | return new_coordinates 275 | 276 | 277 | def compute_3d_IoU(sRT_1, sRT_2, size_1, size_2, class_name_1, class_name_2, handle_visibility): 278 | """ Computes IoU overlaps between two 3D bboxes. 
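Example (illustrative; identity poses and unit sizes are made-up inputs):
    sRT = np.identity(4, dtype=np.float32)
    iou = compute_3d_IoU(sRT, sRT, np.ones(3), np.ones(3), 'bottle', 'bottle', 0)
    # identical boxes give iou == 1.0; for symmetric classes (bottle/bowl/can and
    # handle-occluded mugs) the first box is rotated about y to find the best overlap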
""" 279 | def asymmetric_3d_iou(sRT_1, sRT_2, size_1, size_2): 280 | noc_cube_1 = get_3d_bbox(size_1, 0) 281 | bbox_3d_1 = transform_coordinates_3d(noc_cube_1, sRT_1) 282 | noc_cube_2 = get_3d_bbox(size_2, 0) 283 | bbox_3d_2 = transform_coordinates_3d(noc_cube_2, sRT_2) 284 | 285 | bbox_1_max = np.amax(bbox_3d_1, axis=0) 286 | bbox_1_min = np.amin(bbox_3d_1, axis=0) 287 | bbox_2_max = np.amax(bbox_3d_2, axis=0) 288 | bbox_2_min = np.amin(bbox_3d_2, axis=0) 289 | 290 | overlap_min = np.maximum(bbox_1_min, bbox_2_min) 291 | overlap_max = np.minimum(bbox_1_max, bbox_2_max) 292 | 293 | # intersections and union 294 | if np.amin(overlap_max - overlap_min) < 0: 295 | intersections = 0 296 | else: 297 | intersections = np.prod(overlap_max - overlap_min) 298 | union = np.prod(bbox_1_max - bbox_1_min) + np.prod(bbox_2_max - bbox_2_min) - intersections 299 | overlaps = intersections / union 300 | return overlaps 301 | 302 | if sRT_1 is None or sRT_2 is None: 303 | return -1 304 | 305 | if (class_name_1 in ['bottle', 'bowl', 'can'] and class_name_1 == class_name_2) or \ 306 | (class_name_1 == 'mug' and class_name_1 == class_name_2 and handle_visibility==0): 307 | def y_rotation_matrix(theta): 308 | return np.array([[ np.cos(theta), 0, np.sin(theta), 0], 309 | [ 0, 1, 0, 0], 310 | [-np.sin(theta), 0, np.cos(theta), 0], 311 | [ 0, 0, 0, 1]]) 312 | n = 20 313 | max_iou = 0 314 | for i in range(n): 315 | rotated_RT_1 = sRT_1 @ y_rotation_matrix(2 * math.pi * i / float(n)) 316 | max_iou = max(max_iou, asymmetric_3d_iou(rotated_RT_1, sRT_2, size_1, size_2)) 317 | else: 318 | max_iou = asymmetric_3d_iou(sRT_1, sRT_2, size_1, size_2) 319 | 320 | return max_iou 321 | 322 | 323 | def compute_IoU_matches(gt_class_ids, gt_sRT, gt_size, gt_handle_visibility, 324 | pred_class_ids, pred_sRT, pred_size, pred_scores, 325 | synset_names, iou_3d_thresholds, score_threshold=0): 326 | """ Find matches between NOCS prediction and ground truth instances. 327 | 328 | Args: 329 | size: 3D bounding box size 330 | bboxes: 2D bounding boxes 331 | 332 | Returns: 333 | gt_matches: 2-D array. For each GT box it has the index of the matched predicted box. 334 | pred_matches: 2-D array. For each predicted box, it has the index of the matched ground truth box. 335 | overlaps: IoU overlaps. 336 | indices: 337 | 338 | """ 339 | num_pred = len(pred_class_ids) 340 | num_gt = len(gt_class_ids) 341 | indices = np.zeros(0) 342 | if num_pred: 343 | # Sort predictions by score from high to low 344 | indices = np.argsort(pred_scores)[::-1] 345 | pred_class_ids = pred_class_ids[indices].copy() 346 | pred_size = pred_size[indices].copy() 347 | pred_sRT = pred_sRT[indices].copy() 348 | # compute IoU overlaps [pred_bboxs gt_bboxs] 349 | overlaps = np.zeros((num_pred, num_gt), dtype=np.float32) 350 | for i in range(num_pred): 351 | for j in range(num_gt): 352 | overlaps[i, j] = compute_3d_IoU(pred_sRT[i], gt_sRT[j], pred_size[i, :], gt_size[j], 353 | synset_names[pred_class_ids[i]], synset_names[gt_class_ids[j]], gt_handle_visibility[j]) 354 | # loop through predictions and find matching ground truth boxes 355 | num_iou_3d_thres = len(iou_3d_thresholds) 356 | pred_matches = -1 * np.ones([num_iou_3d_thres, num_pred]) 357 | gt_matches = -1 * np.ones([num_iou_3d_thres, num_gt]) 358 | for s, iou_thres in enumerate(iou_3d_thresholds): 359 | for i in range(indices.shape[0]): 360 | # Find best matching ground truth box 361 | # 1. Sort matches by score 362 | sorted_ixs = np.argsort(overlaps[i])[::-1] 363 | # 2. 
Remove low scores 364 | low_score_idx = np.where(overlaps[i, sorted_ixs] < score_threshold)[0] 365 | if low_score_idx.size > 0: 366 | sorted_ixs = sorted_ixs[:low_score_idx[0]] 367 | # 3. Find the match 368 | for j in sorted_ixs: 369 | # If ground truth box is already matched, go to next one 370 | if gt_matches[s, j] > -1: 371 | continue 372 | # If we reach IoU smaller than the threshold, end the loop 373 | iou = overlaps[i, j] 374 | if iou < iou_thres: 375 | break 376 | # Do we have a match? 377 | if not pred_class_ids[i] == gt_class_ids[j]: 378 | continue 379 | if iou > iou_thres: 380 | gt_matches[s, j] = i 381 | pred_matches[s, i] = j 382 | break 383 | return gt_matches, pred_matches, overlaps, indices 384 | 385 | 386 | def compute_RT_errors(sRT_1, sRT_2, class_id, handle_visibility, synset_names): 387 | """ 388 | Args: 389 | sRT_1: [4, 4]. homogeneous affine transformation 390 | sRT_2: [4, 4]. homogeneous affine transformation 391 | 392 | Returns: 393 | theta: angle difference of R in degree 394 | shift: l2 difference of T in centimeter 395 | """ 396 | # make sure the last row is [0, 0, 0, 1] 397 | if sRT_1 is None or sRT_2 is None: 398 | return -1 399 | try: 400 | assert np.array_equal(sRT_1[3, :], sRT_2[3, :]) 401 | assert np.array_equal(sRT_1[3, :], np.array([0, 0, 0, 1])) 402 | except AssertionError: 403 | print(sRT_1[3, :], sRT_2[3, :]) 404 | exit() 405 | 406 | R1 = sRT_1[:3, :3] / np.cbrt(np.linalg.det(sRT_1[:3, :3])) 407 | T1 = sRT_1[:3, 3] 408 | R2 = sRT_2[:3, :3] / np.cbrt(np.linalg.det(sRT_2[:3, :3])) 409 | T2 = sRT_2[:3, 3] 410 | # symmetric when rotating around y-axis 411 | if synset_names[class_id] in ['bottle', 'can', 'bowl'] or \ 412 | (synset_names[class_id] == 'mug' and handle_visibility == 0): 413 | y = np.array([0, 1, 0]) 414 | y1 = R1 @ y 415 | y2 = R2 @ y 416 | cos_theta = y1.dot(y2) / (np.linalg.norm(y1) * np.linalg.norm(y2)) 417 | else: 418 | R = R1 @ R2.transpose() 419 | cos_theta = (np.trace(R) - 1) / 2 420 | 421 | theta = np.arccos(np.clip(cos_theta, -1.0, 1.0)) * 180 / np.pi 422 | shift = np.linalg.norm(T1 - T2) * 100 423 | result = np.array([theta, shift]) 424 | 425 | return result 426 | 427 | 428 | def compute_RT_overlaps(gt_class_ids, gt_sRT, gt_handle_visibility, pred_class_ids, pred_sRT, synset_names): 429 | """ Finds overlaps between prediction and ground truth instances. 
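Example (illustrative shapes only):
    overlaps = compute_RT_overlaps(gt_class_ids, gt_sRT, gt_handle_visibility,
                                   pred_class_ids, pred_sRT, synset_names)
    # overlaps has shape (num_pred, num_gt, 2): [..., 0] is the rotation error in
    # degrees and [..., 1] the translation error in centimeters, as returned by compute_RT_errors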
430 | 431 | Returns: 432 | overlaps: 433 | 434 | """ 435 | num_pred = len(pred_class_ids) 436 | num_gt = len(gt_class_ids) 437 | overlaps = np.zeros((num_pred, num_gt, 2)) 438 | 439 | for i in range(num_pred): 440 | for j in range(num_gt): 441 | overlaps[i, j, :] = compute_RT_errors(pred_sRT[i], gt_sRT[j], gt_class_ids[j], 442 | gt_handle_visibility[j], synset_names) 443 | return overlaps 444 | 445 | 446 | def compute_RT_matches(overlaps, pred_class_ids, gt_class_ids, degree_thres_list, shift_thres_list): 447 | num_degree_thres = len(degree_thres_list) 448 | num_shift_thres = len(shift_thres_list) 449 | num_pred = len(pred_class_ids) 450 | num_gt = len(gt_class_ids) 451 | 452 | pred_matches = -1 * np.ones((num_degree_thres, num_shift_thres, num_pred)) 453 | gt_matches = -1 * np.ones((num_degree_thres, num_shift_thres, num_gt)) 454 | 455 | if num_pred == 0 or num_gt == 0: 456 | return gt_matches, pred_matches 457 | 458 | assert num_pred == overlaps.shape[0] 459 | assert num_gt == overlaps.shape[1] 460 | assert overlaps.shape[2] == 2 461 | 462 | for d, degree_thres in enumerate(degree_thres_list): 463 | for s, shift_thres in enumerate(shift_thres_list): 464 | for i in range(num_pred): 465 | # Find best matching ground truth box 466 | # 1. Sort matches by scores from low to high 467 | sum_degree_shift = np.sum(overlaps[i, :, :], axis=-1) 468 | sorted_ixs = np.argsort(sum_degree_shift) 469 | # 2. Find the match 470 | for j in sorted_ixs: 471 | # If ground truth box is already matched, go to next one 472 | if gt_matches[d, s, j] > -1 or pred_class_ids[i] != gt_class_ids[j]: 473 | continue 474 | # If we reach IoU smaller than the threshold, end the loop 475 | if overlaps[i, j, 0] > degree_thres or overlaps[i, j, 1] > shift_thres: 476 | continue 477 | gt_matches[d, s, j] = i 478 | pred_matches[d, s, i] = j 479 | break 480 | 481 | return gt_matches, pred_matches 482 | 483 | 484 | def compute_ap_and_acc(pred_matches, pred_scores, gt_matches): 485 | # sort the scores from high to low 486 | assert pred_matches.shape[0] == pred_scores.shape[0] 487 | score_indices = np.argsort(pred_scores)[::-1] 488 | # pred_scores = pred_scores[score_indices] 489 | pred_matches = pred_matches[score_indices] 490 | precisions = np.cumsum(pred_matches > -1) / (np.arange(len(pred_matches)) + 1) 491 | recalls = np.cumsum(pred_matches > -1).astype(np.float32) / len(gt_matches) 492 | # Pad with start and end values to simplify the math 493 | precisions = np.concatenate([[0], precisions, [0]]) 494 | recalls = np.concatenate([[0], recalls, [1]]) 495 | # Ensure precision values decrease but don't increase. This way, the 496 | # precision value at each recall threshold is the maximum it can be 497 | # for all following recall thresholds, as specified by the VOC paper. 498 | for i in range(len(precisions) - 2, -1, -1): 499 | precisions[i] = np.maximum(precisions[i], precisions[i + 1]) 500 | # compute mean AP over recall range 501 | indices = np.where(recalls[:-1] != recalls[1:])[0] + 1 502 | ap = np.sum((recalls[indices] - recalls[indices - 1]) * precisions[indices]) 503 | # accuracy 504 | acc = np.sum(pred_matches > -1) / len(pred_matches) 505 | 506 | return ap, acc 507 | 508 | 509 | def compute_mAP(pred_results, out_dir, degree_thresholds=[180], shift_thresholds=[100], 510 | iou_3d_thresholds=[0.1], iou_pose_thres=0.1, use_matches_for_pose=False): 511 | """ Compute mean Average Precision. 
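Example (a minimal sketch; the output directory and threshold values are placeholders):
    iou_aps, pose_aps, iou_acc, pose_acc = compute_mAP(
        pred_results, 'results/eval',
        degree_thresholds=[5, 10], shift_thresholds=[5, 10],
        iou_3d_thresholds=[0.1, 0.25, 0.5],
        iou_pose_thres=0.1, use_matches_for_pose=True)
    # iou_aps has shape (num_classes + 1, num_iou_thres); the last row is the mean
    # over the six object categories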
512 | 513 | Returns: 514 | iou_aps: 515 | pose_aps: 516 | iou_acc: 517 | pose_acc: 518 | 519 | """ 520 | synset_names = ['BG', 'bottle', 'bowl', 'camera', 'can', 'laptop', 'mug'] 521 | num_classes = len(synset_names) 522 | degree_thres_list = list(degree_thresholds) + [360] 523 | num_degree_thres = len(degree_thres_list) 524 | shift_thres_list = list(shift_thresholds) + [100] 525 | num_shift_thres = len(shift_thres_list) 526 | iou_thres_list = list(iou_3d_thresholds) 527 | num_iou_thres = len(iou_thres_list) 528 | 529 | if use_matches_for_pose: 530 | assert iou_pose_thres in iou_thres_list 531 | 532 | # pre-allocate more than enough memory 533 | iou_aps = np.zeros((num_classes + 1, num_iou_thres)) 534 | iou_acc = np.zeros((num_classes + 1, num_iou_thres)) 535 | iou_pred_matches_all = [np.zeros((num_iou_thres, 30000)) for _ in range(num_classes)] 536 | iou_pred_scores_all = [np.zeros((num_iou_thres, 30000)) for _ in range(num_classes)] 537 | iou_gt_matches_all = [np.zeros((num_iou_thres, 30000)) for _ in range(num_classes)] 538 | iou_pred_count = [0 for _ in range(num_classes)] 539 | iou_gt_count = [0 for _ in range(num_classes)] 540 | 541 | pose_aps = np.zeros((num_classes + 1, num_degree_thres, num_shift_thres)) 542 | pose_acc = np.zeros((num_classes + 1, num_degree_thres, num_shift_thres)) 543 | pose_pred_matches_all = [np.zeros((num_degree_thres, num_shift_thres, 30000)) for _ in range(num_classes)] 544 | pose_pred_scores_all = [np.zeros((num_degree_thres, num_shift_thres, 30000)) for _ in range(num_classes)] 545 | pose_gt_matches_all = [np.zeros((num_degree_thres, num_shift_thres, 30000)) for _ in range(num_classes)] 546 | pose_pred_count = [0 for _ in range(num_classes)] 547 | pose_gt_count = [0 for _ in range(num_classes)] 548 | 549 | # loop over results to gather pred matches and gt matches for iou and pose metrics 550 | progress = 0 551 | for progress, result in enumerate(tqdm(pred_results)): 552 | gt_class_ids = result['gt_class_ids'].astype(np.int32) 553 | gt_sRT = np.array(result['gt_RTs']) 554 | gt_size = np.array(result['gt_scales']) 555 | gt_handle_visibility = result['gt_handle_visibility'] 556 | 557 | pred_class_ids = result['pred_class_ids'] 558 | pred_sRT = np.array(result['pred_RTs']) 559 | pred_size = result['pred_scales'] 560 | pred_scores = result['pred_scores'] 561 | 562 | if len(gt_class_ids) == 0 and len(pred_class_ids) == 0: 563 | continue 564 | 565 | for cls_id in range(1, num_classes): 566 | # get gt and predictions in this class 567 | cls_gt_class_ids = gt_class_ids[gt_class_ids==cls_id] if len(gt_class_ids) else np.zeros(0) 568 | cls_gt_sRT = gt_sRT[gt_class_ids==cls_id] if len(gt_class_ids) else np.zeros((0, 4, 4)) 569 | cls_gt_size = gt_size[gt_class_ids==cls_id] if len(gt_class_ids) else np.zeros((0, 3)) 570 | if synset_names[cls_id] != 'mug': 571 | cls_gt_handle_visibility = np.ones_like(cls_gt_class_ids) 572 | else: 573 | cls_gt_handle_visibility = gt_handle_visibility[gt_class_ids==cls_id] if len(gt_class_ids) else np.ones(0) 574 | 575 | cls_pred_class_ids = pred_class_ids[pred_class_ids==cls_id] if len(pred_class_ids) else np.zeros(0) 576 | cls_pred_sRT = pred_sRT[pred_class_ids==cls_id] if len(pred_class_ids) else np.zeros((0, 4, 4)) 577 | cls_pred_size = pred_size[pred_class_ids==cls_id] if len(pred_class_ids) else np.zeros((0, 3)) 578 | cls_pred_scores = pred_scores[pred_class_ids==cls_id] if len(pred_class_ids) else np.zeros(0) 579 | 580 | # calculate the overlap between each gt instance and pred instance 581 | iou_cls_gt_match, iou_cls_pred_match, 
_, iou_pred_indices = \ 582 | compute_IoU_matches(cls_gt_class_ids, cls_gt_sRT, cls_gt_size, cls_gt_handle_visibility, 583 | cls_pred_class_ids, cls_pred_sRT, cls_pred_size, cls_pred_scores, 584 | synset_names, iou_thres_list) 585 | if len(iou_pred_indices): 586 | cls_pred_class_ids = cls_pred_class_ids[iou_pred_indices] 587 | cls_pred_sRT = cls_pred_sRT[iou_pred_indices] 588 | cls_pred_scores = cls_pred_scores[iou_pred_indices] 589 | 590 | num_pred = iou_cls_pred_match.shape[1] 591 | pred_start = iou_pred_count[cls_id] 592 | pred_end = pred_start + num_pred 593 | iou_pred_count[cls_id] = pred_end 594 | iou_pred_matches_all[cls_id][:, pred_start:pred_end] = iou_cls_pred_match 595 | cls_pred_scores_tile = np.tile(cls_pred_scores, (num_iou_thres, 1)) 596 | assert cls_pred_scores_tile.shape[1] == num_pred 597 | iou_pred_scores_all[cls_id][:, pred_start:pred_end] = cls_pred_scores_tile 598 | num_gt = iou_cls_gt_match.shape[1] 599 | gt_start = iou_gt_count[cls_id] 600 | gt_end = gt_start + num_gt 601 | iou_gt_count[cls_id] = gt_end 602 | iou_gt_matches_all[cls_id][:, gt_start:gt_end] = iou_cls_gt_match 603 | 604 | if use_matches_for_pose: 605 | thres_ind = list(iou_thres_list).index(iou_pose_thres) 606 | iou_thres_pred_match = iou_cls_pred_match[thres_ind, :] 607 | cls_pred_class_ids = cls_pred_class_ids[iou_thres_pred_match > -1] if len(iou_thres_pred_match) > 0 else np.zeros(0) 608 | cls_pred_sRT = cls_pred_sRT[iou_thres_pred_match > -1] if len(iou_thres_pred_match) > 0 else np.zeros((0, 4, 4)) 609 | cls_pred_scores = cls_pred_scores[iou_thres_pred_match > -1] if len(iou_thres_pred_match) > 0 else np.zeros(0) 610 | iou_thres_gt_match = iou_cls_gt_match[thres_ind, :] 611 | cls_gt_class_ids = cls_gt_class_ids[iou_thres_gt_match > -1] if len(iou_thres_gt_match) > 0 else np.zeros(0) 612 | cls_gt_sRT = cls_gt_sRT[iou_thres_gt_match > -1] if len(iou_thres_gt_match) > 0 else np.zeros((0, 4, 4)) 613 | cls_gt_handle_visibility = cls_gt_handle_visibility[iou_thres_gt_match > -1] if len(iou_thres_gt_match) > 0 else np.zeros(0) 614 | 615 | RT_overlaps = compute_RT_overlaps(cls_gt_class_ids, cls_gt_sRT, cls_gt_handle_visibility, 616 | cls_pred_class_ids, cls_pred_sRT, synset_names) 617 | pose_cls_gt_match, pose_cls_pred_match = compute_RT_matches(RT_overlaps, cls_pred_class_ids, cls_gt_class_ids, 618 | degree_thres_list, shift_thres_list) 619 | num_pred = pose_cls_pred_match.shape[2] 620 | pred_start = pose_pred_count[cls_id] 621 | pred_end = pred_start + num_pred 622 | pose_pred_count[cls_id] = pred_end 623 | pose_pred_matches_all[cls_id][:, :, pred_start:pred_end] = pose_cls_pred_match 624 | cls_pred_scores_tile = np.tile(cls_pred_scores, (num_degree_thres, num_shift_thres, 1)) 625 | assert cls_pred_scores_tile.shape[2] == num_pred 626 | pose_pred_scores_all[cls_id][:, :, pred_start:pred_end] = cls_pred_scores_tile 627 | num_gt = pose_cls_gt_match.shape[2] 628 | gt_start = pose_gt_count[cls_id] 629 | gt_end = gt_start + num_gt 630 | pose_gt_count[cls_id] = gt_end 631 | pose_gt_matches_all[cls_id][:, :, gt_start:gt_end] = pose_cls_gt_match 632 | 633 | # trim zeros 634 | for cls_id in range(num_classes): 635 | # IoU 636 | iou_pred_matches_all[cls_id] = iou_pred_matches_all[cls_id][:, :iou_pred_count[cls_id]] 637 | iou_pred_scores_all[cls_id] = iou_pred_scores_all[cls_id][:, :iou_pred_count[cls_id]] 638 | iou_gt_matches_all[cls_id] = iou_gt_matches_all[cls_id][:, :iou_gt_count[cls_id]] 639 | # pose 640 | pose_pred_matches_all[cls_id] = pose_pred_matches_all[cls_id][:, :, :pose_pred_count[cls_id]] 641 | 
pose_pred_scores_all[cls_id] = pose_pred_scores_all[cls_id][:, :, :pose_pred_count[cls_id]] 642 | pose_gt_matches_all[cls_id] = pose_gt_matches_all[cls_id][:, :, :pose_gt_count[cls_id]] 643 | 644 | # compute 3D IoU mAP 645 | for cls_id in range(1, num_classes): 646 | for s, iou_thres in enumerate(iou_thres_list): 647 | iou_aps[cls_id, s], iou_acc[cls_id, s] = compute_ap_and_acc(iou_pred_matches_all[cls_id][s, :], 648 | iou_pred_scores_all[cls_id][s, :], 649 | iou_gt_matches_all[cls_id][s, :]) 650 | iou_aps[-1, :] = np.mean(iou_aps[1:-1, :], axis=0) 651 | iou_acc[-1, :] = np.mean(iou_acc[1:-1, :], axis=0) 652 | # compute pose mAP 653 | for i, degree_thres in enumerate(degree_thres_list): 654 | for j, shift_thres in enumerate(shift_thres_list): 655 | for cls_id in range(1, num_classes): 656 | cls_pose_pred_matches_all = pose_pred_matches_all[cls_id][i, j, :] 657 | cls_pose_gt_matches_all = pose_gt_matches_all[cls_id][i, j, :] 658 | cls_pose_pred_scores_all = pose_pred_scores_all[cls_id][i, j, :] 659 | pose_aps[cls_id, i, j], pose_acc[cls_id, i, j] = compute_ap_and_acc(cls_pose_pred_matches_all, 660 | cls_pose_pred_scores_all, 661 | cls_pose_gt_matches_all) 662 | pose_aps[-1, i, j] = np.mean(pose_aps[1:-1, i, j]) 663 | pose_acc[-1, i, j] = np.mean(pose_acc[1:-1, i, j]) 664 | 665 | # save results to pkl 666 | result_dict = {} 667 | result_dict['iou_thres_list'] = iou_thres_list 668 | result_dict['degree_thres_list'] = degree_thres_list 669 | result_dict['shift_thres_list'] = shift_thres_list 670 | result_dict['iou_aps'] = iou_aps 671 | result_dict['pose_aps'] = pose_aps 672 | result_dict['iou_acc'] = iou_acc 673 | result_dict['pose_acc'] = pose_acc 674 | pkl_path = os.path.join(out_dir, 'mAP_Acc.pkl') 675 | with open(pkl_path, 'wb') as f: 676 | cPickle.dump(result_dict, f) 677 | return iou_aps, pose_aps, iou_acc, pose_acc 678 | 679 | 680 | def plot_mAP(iou_aps, pose_aps, out_dir, iou_thres_list, degree_thres_list, shift_thres_list): 681 | """ Draw iou 3d AP vs. iou thresholds. 
682 | """ 683 | 684 | labels = ['bottle', 'bowl', 'camera', 'can', 'laptop', 'mug', 'mean', 'nocs'] 685 | colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:pink', 'tab:olive', 'tab:purple', 'tab:red', 'tab:gray'] 686 | styles = ['-', '-', '-', '-', '-', '-', '--', ':'] 687 | 688 | fig, (ax_iou, ax_degree, ax_shift) = plt.subplots(1, 3, figsize=(8, 3.5)) 689 | # IoU subplot 690 | ax_iou.set_title('3D IoU', fontsize=10) 691 | ax_iou.set_ylabel('Average Precision') 692 | ax_iou.set_ylim(0, 100) 693 | ax_iou.set_xlabel('Percent') 694 | ax_iou.set_xlim(0, 100) 695 | ax_iou.xaxis.set_ticks([0, 25, 50, 75, 100]) 696 | ax_iou.grid() 697 | for i in range(1, iou_aps.shape[0]): 698 | ax_iou.plot(100*np.array(iou_thres_list), 100*iou_aps[i, :], 699 | color=colors[i-1], linestyle=styles[i-1], label=labels[i-1]) 700 | # rotation subplot 701 | ax_degree.set_title('Rotation', fontsize=10) 702 | ax_degree.set_ylim(0, 100) 703 | ax_degree.yaxis.set_ticklabels([]) 704 | ax_degree.set_xlabel('Degree') 705 | ax_degree.set_xlim(0, 60) 706 | ax_degree.xaxis.set_ticks([0, 20, 40, 60]) 707 | ax_degree.grid() 708 | for i in range(1, pose_aps.shape[0]): 709 | ax_degree.plot(np.array(degree_thres_list), 100*pose_aps[i, :len(degree_thres_list), -1], 710 | color=colors[i-1], linestyle=styles[i-1], label=labels[i-1]) 711 | # translation subplot 712 | ax_shift.set_title('Translation', fontsize=10) 713 | ax_shift.set_ylim(0, 100) 714 | ax_shift.yaxis.set_ticklabels([]) 715 | ax_shift.set_xlabel('Centimeter') 716 | ax_shift.set_xlim(0, 10) 717 | ax_shift.xaxis.set_ticks([0, 5, 10]) 718 | ax_shift.grid() 719 | for i in range(1, pose_aps.shape[0]): 720 | ax_shift.plot(np.array(shift_thres_list), 100*pose_aps[i, -1, :len(shift_thres_list)], 721 | color=colors[i-1], linestyle=styles[i-1], label=labels[i-1]) 722 | ax_shift.legend(loc='lower right', fontsize='small') 723 | plt.tight_layout() 724 | # plt.show() 725 | plt.savefig(os.path.join(out_dir, 'mAP.png')) 726 | plt.close(fig) 727 | return 728 | 729 | 730 | def calculate_2d_projections(coordinates_3d, intrinsics): 731 | """ 732 | Args: 733 | coordinates_3d: [3, N] 734 | intrinsics: [3, 3] 735 | 736 | Returns: 737 | projected_coordinates: [N, 2] 738 | """ 739 | projected_coordinates = intrinsics @ coordinates_3d 740 | projected_coordinates = projected_coordinates[:2, :] / projected_coordinates[2, :] 741 | projected_coordinates = projected_coordinates.transpose() 742 | projected_coordinates = np.array(projected_coordinates, dtype=np.int32) 743 | 744 | return projected_coordinates 745 | 746 | 747 | def align_rotation(sRT): 748 | """ Align rotations for symmetric objects. 
749 | Args: 750 | sRT: 4 x 4 751 | """ 752 | s = np.cbrt(np.linalg.det(sRT[:3, :3])) 753 | R = sRT[:3, :3] / s 754 | T = sRT[:3, 3] 755 | 756 | theta_x = R[0, 0] + R[2, 2] 757 | theta_y = R[0, 2] - R[2, 0] 758 | r_norm = math.sqrt(theta_x**2 + theta_y**2) 759 | s_map = np.array([[theta_x/r_norm, 0.0, -theta_y/r_norm], 760 | [0.0, 1.0, 0.0 ], 761 | [theta_y/r_norm, 0.0, theta_x/r_norm]]) 762 | rotation = R @ s_map 763 | aligned_sRT = np.identity(4, dtype=np.float32) 764 | aligned_sRT[:3, :3] = s * rotation 765 | aligned_sRT[:3, 3] = T 766 | return aligned_sRT 767 | 768 | 769 | def draw_bboxes(img, img_pts, color): 770 | img_pts = np.int32(img_pts).reshape(-1, 2) 771 | # draw ground layer in darker color 772 | color_ground = (int(color[0]*0.3), int(color[1]*0.3), int(color[2]*0.3)) 773 | for i, j in zip([4, 5, 6, 7], [5, 7, 4, 6]): 774 | img = cv2.line(img, tuple(img_pts[i]), tuple(img_pts[j]), color_ground, 2) 775 | # draw pillars in minor darker color 776 | color_pillar = (int(color[0]*0.6), int(color[1]*0.6), int(color[2]*0.6)) 777 | for i, j in zip(range(4), range(4, 8)): 778 | img = cv2.line(img, tuple(img_pts[i]), tuple(img_pts[j]), color_pillar, 2) 779 | # draw top layer in original color 780 | for i, j in zip([0, 1, 2, 3], [1, 3, 0, 2]): 781 | img = cv2.line(img, tuple(img_pts[i]), tuple(img_pts[j]), color, 2) 782 | 783 | return img 784 | 785 | 786 | def draw_detections(img, out_dir, data_name, img_id, intrinsics, pred_sRT, pred_size, pred_class_ids, 787 | gt_sRT, gt_size, gt_class_ids, nocs_sRT, nocs_size, nocs_class_ids, draw_gt=True, draw_nocs=True): 788 | """ Visualize pose predictions. 789 | """ 790 | out_path = os.path.join(out_dir, '{}_{}_pred.png'.format(data_name, img_id)) 791 | 792 | # draw nocs results - BLUE color 793 | if draw_nocs: 794 | for i in range(nocs_sRT.shape[0]): 795 | if nocs_class_ids[i] in [1, 2, 4]: 796 | sRT = align_rotation(nocs_sRT[i, :, :]) 797 | else: 798 | sRT = nocs_sRT[i, :, :] 799 | bbox_3d = get_3d_bbox(nocs_size[i, :], 0) 800 | transformed_bbox_3d = transform_coordinates_3d(bbox_3d, sRT) 801 | projected_bbox = calculate_2d_projections(transformed_bbox_3d, intrinsics) 802 | img = draw_bboxes(img, projected_bbox, (255, 0, 0)) 803 | # darw ground truth - GREEN color 804 | if draw_gt: 805 | for i in range(gt_sRT.shape[0]): 806 | if gt_class_ids[i] in [1, 2, 4]: 807 | sRT = align_rotation(gt_sRT[i, :, :]) 808 | else: 809 | sRT = gt_sRT[i, :, :] 810 | bbox_3d = get_3d_bbox(gt_size[i, :], 0) 811 | transformed_bbox_3d = transform_coordinates_3d(bbox_3d, sRT) 812 | projected_bbox = calculate_2d_projections(transformed_bbox_3d, intrinsics) 813 | img = draw_bboxes(img, projected_bbox, (0, 255, 0)) 814 | # darw prediction - RED color 815 | for i in range(pred_sRT.shape[0]): 816 | if pred_class_ids[i] in [1, 2, 4]: 817 | sRT = align_rotation(pred_sRT[i, :, :]) 818 | else: 819 | sRT = pred_sRT[i, :, :] 820 | bbox_3d = get_3d_bbox(pred_size[i, :], 0) 821 | transformed_bbox_3d = transform_coordinates_3d(bbox_3d, sRT) 822 | projected_bbox = calculate_2d_projections(transformed_bbox_3d, intrinsics) 823 | img = draw_bboxes(img, projected_bbox, (0, 0, 255)) 824 | 825 | cv2.imwrite(out_path, img) 826 | # cv2.imshow('vis', img) 827 | # cv2.waitKey(0) 828 | -------------------------------------------------------------------------------- /mean_shape.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import numpy as np 4 | import open3d as o3d 5 | import matplotlib.pyplot as plt 6 | import torch 7 | 
from lib.auto_encoder import PointCloudAE 8 | from data.shape_dataset import ShapeDataset 9 | from tools.tsne import tsne 10 | 11 | 12 | def visualize_shape(name, shape_list, result_dir): 13 | """ Visualization and save image. 14 | 15 | Args: 16 | name: window name 17 | shape: list of geoemtries 18 | 19 | """ 20 | vis = o3d.visualization.Visualizer() 21 | vis.create_window(window_name=name, width=512, height=512, left=50, top=25) 22 | for shape in shape_list: 23 | vis.add_geometry(shape) 24 | ctr = vis.get_view_control() 25 | ctr.rotate(-300.0, 150.0) 26 | if name == 'camera': 27 | ctr.translate(20.0, -20.0) # (horizontal right +, vertical down +) 28 | if name == 'laptop': 29 | ctr.translate(25.0, -60.0) 30 | vis.run() 31 | vis.capture_screen_image(os.path.join(result_dir, name+'.png'), False) 32 | vis.destroy_window() 33 | 34 | 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument('--h5_file', type=str, default='data/obj_models/ShapeNetCore_2048.h5', help='h5py file') 37 | parser.add_argument('--model', type=str, default='results/ae_points/model_50.pth', help='resume model') 38 | parser.add_argument('--result_dir', type=str, default='results/ae_points', help='directory to save mean shapes') 39 | parser.add_argument('--gpu', type=str, default='0', help='GPU to use') 40 | opt = parser.parse_args() 41 | 42 | opt.emb_dim = 512 43 | opt.n_cat = 6 44 | opt.n_pts = 1024 45 | 46 | os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu 47 | 48 | estimator = PointCloudAE(opt.emb_dim, opt.n_pts) 49 | estimator.cuda() 50 | estimator.load_state_dict(torch.load(opt.model)) 51 | estimator.eval() 52 | train_dataset = ShapeDataset(opt.h5_file, mode='train', augment=False) 53 | train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=1, shuffle=False, num_workers=0) 54 | 55 | obj_models = [] 56 | embedding = [] 57 | catId = [] # zero-indexed 58 | for i, data in enumerate(train_dataloader): 59 | batch_xyz, batch_label = data 60 | batch_xyz = batch_xyz[:, :, :3].cuda() 61 | batch_label = batch_label.cuda() 62 | emb, pred_points = estimator(batch_xyz) 63 | emb = emb.cpu().detach().numpy() 64 | inst_shape = batch_xyz.cpu().numpy() 65 | label = batch_label.cpu().numpy() 66 | embedding.append(emb) 67 | obj_models.append(inst_shape) 68 | catId.append(label) 69 | 70 | embedding = np.squeeze(np.array(embedding).astype(np.float64), axis=1) 71 | catId = np.squeeze((np.array(catId)), axis=1) 72 | obj_models = np.squeeze(np.array(obj_models), axis=1) 73 | 74 | # enbedding visualization 75 | Y = tsne(embedding, 2, 50, 30.0) 76 | y_bottle = Y[np.where(catId == 0)[0], :] 77 | s_bottle = plt.scatter(y_bottle[:, 0], y_bottle[:, 1], s=20, marker='o', c='tab:orange') 78 | y_bowl = Y[np.where(catId == 1)[0], :] 79 | s_bowl = plt.scatter(y_bowl[:, 0], y_bowl[:, 1], s=20, marker='^', c='tab:blue') 80 | y_camera = Y[np.where(catId == 2)[0], :] 81 | s_camera = plt.scatter(y_camera[:, 0], y_camera[:, 1], s=20, marker='s', c='tab:olive') 82 | y_can = Y[np.where(catId == 3)[0], :] 83 | s_can = plt.scatter(y_can[:, 0], y_can[:, 1], s=20, marker='d', c='tab:gray') 84 | y_laptop = Y[np.where(catId == 4)[0], :] 85 | s_laptop = plt.scatter(y_laptop[:, 0], y_laptop[:, 1], s=20, marker='P', c='tab:purple') 86 | y_mug = Y[np.where(catId == 5)[0], :] 87 | s_mug = plt.scatter(y_mug[:, 0], y_mug[:, 1], s=20, marker='v', c='tab:brown') 88 | plt.legend((s_bottle, s_bowl, s_camera, s_can, s_laptop, s_mug), 89 | ('bottle', 'bowl', 'camera', 'can', 'laptop', 'mug'), 90 | loc='best', ncol=1, fontsize=8, frameon=False) 91 | 
plt.xticks([]) 92 | plt.yticks([]) 93 | plt.savefig(os.path.join(opt.result_dir, 'visual_embedding.png'), bbox_inches='tight') 94 | 95 | # mean embedding and mean shape 96 | mean_emb = np.empty((opt.n_cat, opt.emb_dim), dtype=np.float) 97 | catId_to_name = {0: 'bottle', 1: 'bowl', 2: 'camera', 3: 'can', 4: 'laptop', 5: 'mug'} 98 | mean_points = np.empty((opt.n_cat, opt.n_pts, 3), dtype=np.float) 99 | for i in range(opt.n_cat): 100 | mean = np.mean(embedding[np.where(catId==i)[0], :], axis=0, keepdims=False) 101 | mean_emb[i] = mean 102 | assigned_emb = torch.cuda.FloatTensor(mean[None, :]) 103 | _, mean_shape = estimator(None, assigned_emb) 104 | mean_shape = mean_shape.cpu().detach().numpy()[0] 105 | mean_points[i] = mean_shape 106 | # save point cloud and visualize 107 | pcd = o3d.geometry.PointCloud() 108 | pcd.points = o3d.utility.Vector3dVector(mean_shape) 109 | visualize_shape(catId_to_name[i], [pcd], opt.result_dir) 110 | # save results 111 | np.save(os.path.join(opt.result_dir, 'mean_embedding'), mean_emb) 112 | np.save(os.path.join(opt.result_dir, 'mean_points_emb'), mean_points) 113 | -------------------------------------------------------------------------------- /preprocess/pose_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import glob 4 | import cv2 5 | import numpy as np 6 | import _pickle as cPickle 7 | from tqdm import tqdm 8 | sys.path.append('../lib') 9 | from align import align_nocs_to_depth 10 | from utils import load_depth 11 | 12 | 13 | def create_img_list(data_dir): 14 | """ Create train/val/test data list for CAMERA and Real. """ 15 | # CAMERA dataset 16 | for subset in ['train', 'val']: 17 | img_list = [] 18 | img_dir = os.path.join(data_dir, 'CAMERA', subset) 19 | folder_list = [name for name in os.listdir(img_dir) if os.path.isdir(os.path.join(img_dir, name))] 20 | for i in range(10*len(folder_list)): 21 | folder_id = int(i) // 10 22 | img_id = int(i) % 10 23 | img_path = os.path.join(subset, '{:05d}'.format(folder_id), '{:04d}'.format(img_id)) 24 | img_list.append(img_path) 25 | with open(os.path.join(data_dir, 'CAMERA', subset+'_list_all.txt'), 'w') as f: 26 | for img_path in img_list: 27 | f.write("%s\n" % img_path) 28 | # Real dataset 29 | for subset in ['train', 'test']: 30 | img_list = [] 31 | img_dir = os.path.join(data_dir, 'Real', subset) 32 | folder_list = [name for name in sorted(os.listdir(img_dir)) if os.path.isdir(os.path.join(img_dir, name))] 33 | for folder in folder_list: 34 | img_paths = glob.glob(os.path.join(img_dir, folder, '*_color.png')) 35 | img_paths = sorted(img_paths) 36 | for img_full_path in img_paths: 37 | img_name = os.path.basename(img_full_path) 38 | img_ind = img_name.split('_')[0] 39 | img_path = os.path.join(subset, folder, img_ind) 40 | img_list.append(img_path) 41 | with open(os.path.join(data_dir, 'Real', subset+'_list_all.txt'), 'w') as f: 42 | for img_path in img_list: 43 | f.write("%s\n" % img_path) 44 | print('Write all data paths to file done!') 45 | 46 | 47 | def process_data(img_path, depth): 48 | """ Load instance masks for the objects in the image. 
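Example (a minimal sketch; the image prefix is a placeholder):
    depth = load_depth('Real/train/scene_1/0000')
    masks, coords, class_ids, instance_ids, model_list, bboxes = \
        process_data('Real/train/scene_1/0000', depth)
    # masks: (H, W, num_inst) uint8, coords: (H, W, num_inst, 3) NOCS map in [0, 1],
    # bboxes: rows of [y1, x1, y2, x2]; all outputs are None if no valid instance remains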
""" 49 | mask_path = img_path + '_mask.png' 50 | mask = cv2.imread(mask_path)[:, :, 2] 51 | mask = np.array(mask, dtype=np.int32) 52 | all_inst_ids = sorted(list(np.unique(mask))) 53 | assert all_inst_ids[-1] == 255 54 | del all_inst_ids[-1] # remove background 55 | num_all_inst = len(all_inst_ids) 56 | h, w = mask.shape 57 | 58 | coord_path = img_path + '_coord.png' 59 | coord_map = cv2.imread(coord_path)[:, :, :3] 60 | coord_map = coord_map[:, :, (2, 1, 0)] 61 | # flip z axis of coord map 62 | coord_map = np.array(coord_map, dtype=np.float32) / 255 63 | coord_map[:, :, 2] = 1 - coord_map[:, :, 2] 64 | 65 | class_ids = [] 66 | instance_ids = [] 67 | model_list = [] 68 | masks = np.zeros([h, w, num_all_inst], dtype=np.uint8) 69 | coords = np.zeros((h, w, num_all_inst, 3), dtype=np.float32) 70 | bboxes = np.zeros((num_all_inst, 4), dtype=np.int32) 71 | 72 | meta_path = img_path + '_meta.txt' 73 | with open(meta_path, 'r') as f: 74 | i = 0 75 | for line in f: 76 | line_info = line.strip().split(' ') 77 | inst_id = int(line_info[0]) 78 | cls_id = int(line_info[1]) 79 | # background objects and non-existing objects 80 | if cls_id == 0 or (inst_id not in all_inst_ids): 81 | continue 82 | if len(line_info) == 3: 83 | model_id = line_info[2] # Real scanned objs 84 | else: 85 | model_id = line_info[3] # CAMERA objs 86 | # remove one mug instance in CAMERA train due to improper model 87 | if model_id == 'b9be7cfe653740eb7633a2dd89cec754': 88 | continue 89 | # process foreground objects 90 | inst_mask = np.equal(mask, inst_id) 91 | # bounding box 92 | horizontal_indicies = np.where(np.any(inst_mask, axis=0))[0] 93 | vertical_indicies = np.where(np.any(inst_mask, axis=1))[0] 94 | assert horizontal_indicies.shape[0], print(img_path) 95 | x1, x2 = horizontal_indicies[[0, -1]] 96 | y1, y2 = vertical_indicies[[0, -1]] 97 | # x2 and y2 should not be part of the box. Increment by 1. 98 | x2 += 1 99 | y2 += 1 100 | # object occupies full image, rendering error, happens in CAMERA dataset 101 | if np.any(np.logical_or((x2-x1) > 600, (y2-y1) > 440)): 102 | return None, None, None, None, None, None 103 | # not enough valid depth observation 104 | final_mask = np.logical_and(inst_mask, depth > 0) 105 | if np.sum(final_mask) < 64: 106 | continue 107 | class_ids.append(cls_id) 108 | instance_ids.append(inst_id) 109 | model_list.append(model_id) 110 | masks[:, :, i] = inst_mask 111 | coords[:, :, i, :] = np.multiply(coord_map, np.expand_dims(inst_mask, axis=-1)) 112 | bboxes[i] = np.array([y1, x1, y2, x2]) 113 | i += 1 114 | # no valid foreground objects 115 | if i == 0: 116 | return None, None, None, None, None, None 117 | 118 | masks = masks[:, :, :i] 119 | coords = np.clip(coords[:, :, :i, :], 0, 1) 120 | bboxes = bboxes[:i, :] 121 | 122 | return masks, coords, class_ids, instance_ids, model_list, bboxes 123 | 124 | 125 | def annotate_camera_train(data_dir): 126 | """ Generate gt labels for CAMERA train data. 
""" 127 | camera_train = open(os.path.join(data_dir, 'CAMERA', 'train_list_all.txt')).read().splitlines() 128 | intrinsics = np.array([[577.5, 0, 319.5], [0, 577.5, 239.5], [0, 0, 1]]) 129 | # meta info for re-label mug category 130 | with open(os.path.join(data_dir, 'obj_models/mug_meta.pkl'), 'rb') as f: 131 | mug_meta = cPickle.load(f) 132 | 133 | valid_img_list = [] 134 | for img_path in tqdm(camera_train): 135 | img_full_path = os.path.join(data_dir, 'CAMERA', img_path) 136 | all_exist = os.path.exists(img_full_path + '_color.png') and \ 137 | os.path.exists(img_full_path + '_coord.png') and \ 138 | os.path.exists(img_full_path + '_depth.png') and \ 139 | os.path.exists(img_full_path + '_mask.png') and \ 140 | os.path.exists(img_full_path + '_meta.txt') 141 | if not all_exist: 142 | continue 143 | depth = load_depth(img_full_path) 144 | masks, coords, class_ids, instance_ids, model_list, bboxes = process_data(img_full_path, depth) 145 | if instance_ids is None: 146 | continue 147 | # Umeyama alignment of GT NOCS map with depth image 148 | scales, rotations, translations, error_messages, _ = \ 149 | align_nocs_to_depth(masks, coords, depth, intrinsics, instance_ids, img_path) 150 | if error_messages: 151 | continue 152 | # re-label for mug category 153 | for i in range(len(class_ids)): 154 | if class_ids[i] == 6: 155 | T0 = mug_meta[model_list[i]][0] 156 | s0 = mug_meta[model_list[i]][1] 157 | T = translations[i] - scales[i] * rotations[i] @ T0 158 | s = scales[i] / s0 159 | scales[i] = s 160 | translations[i] = T 161 | # write results 162 | gts = {} 163 | gts['class_ids'] = class_ids # int list, 1 to 6 164 | gts['bboxes'] = bboxes # np.array, [[y1, x1, y2, x2], ...] 165 | gts['scales'] = scales.astype(np.float32) # np.array, scale factor from NOCS model to depth observation 166 | gts['rotations'] = rotations.astype(np.float32) # np.array, R 167 | gts['translations'] = translations.astype(np.float32) # np.array, T 168 | gts['instance_ids'] = instance_ids # int list, start from 1 169 | gts['model_list'] = model_list # str list, model id/name 170 | with open(img_full_path + '_label.pkl', 'wb') as f: 171 | cPickle.dump(gts, f) 172 | valid_img_list.append(img_path) 173 | # write valid img list to file 174 | with open(os.path.join(data_dir, 'CAMERA/train_list.txt'), 'w') as f: 175 | for img_path in valid_img_list: 176 | f.write("%s\n" % img_path) 177 | 178 | 179 | def annotate_real_train(data_dir): 180 | """ Generate gt labels for Real train data through PnP. 
""" 181 | real_train = open(os.path.join(data_dir, 'Real/train_list_all.txt')).read().splitlines() 182 | intrinsics = np.array([[591.0125, 0, 322.525], [0, 590.16775, 244.11084], [0, 0, 1]]) 183 | # scale factors for all instances 184 | scale_factors = {} 185 | path_to_size = glob.glob(os.path.join(data_dir, 'obj_models/real_train', '*_norm.txt')) 186 | for inst_path in sorted(path_to_size): 187 | instance = os.path.basename(inst_path).split('.')[0] 188 | bbox_dims = np.loadtxt(inst_path) 189 | scale_factors[instance] = np.linalg.norm(bbox_dims) 190 | # meta info for re-label mug category 191 | with open(os.path.join(data_dir, 'obj_models/mug_meta.pkl'), 'rb') as f: 192 | mug_meta = cPickle.load(f) 193 | 194 | valid_img_list = [] 195 | for img_path in tqdm(real_train): 196 | img_full_path = os.path.join(data_dir, 'Real', img_path) 197 | all_exist = os.path.exists(img_full_path + '_color.png') and \ 198 | os.path.exists(img_full_path + '_coord.png') and \ 199 | os.path.exists(img_full_path + '_depth.png') and \ 200 | os.path.exists(img_full_path + '_mask.png') and \ 201 | os.path.exists(img_full_path + '_meta.txt') 202 | if not all_exist: 203 | continue 204 | depth = load_depth(img_full_path) 205 | masks, coords, class_ids, instance_ids, model_list, bboxes = process_data(img_full_path, depth) 206 | if instance_ids is None: 207 | continue 208 | # compute pose 209 | num_insts = len(class_ids) 210 | scales = np.zeros(num_insts) 211 | rotations = np.zeros((num_insts, 3, 3)) 212 | translations = np.zeros((num_insts, 3)) 213 | for i in range(num_insts): 214 | s = scale_factors[model_list[i]] 215 | mask = masks[:, :, i] 216 | idxs = np.where(mask) 217 | coord = coords[:, :, i, :] 218 | coord_pts = s * (coord[idxs[0], idxs[1], :] - 0.5) 219 | coord_pts = coord_pts[:, :, None] 220 | img_pts = np.array([idxs[1], idxs[0]]).transpose() 221 | img_pts = img_pts[:, :, None].astype(float) 222 | distCoeffs = np.zeros((4, 1)) # no distoration 223 | retval, rvec, tvec = cv2.solvePnP(coord_pts, img_pts, intrinsics, distCoeffs) 224 | assert retval 225 | R, _ = cv2.Rodrigues(rvec) 226 | T = np.squeeze(tvec) 227 | # re-label for mug category 228 | if class_ids[i] == 6: 229 | T0 = mug_meta[model_list[i]][0] 230 | s0 = mug_meta[model_list[i]][1] 231 | T = T - s * R @ T0 232 | s = s / s0 233 | scales[i] = s 234 | rotations[i] = R 235 | translations[i] = T 236 | # write results 237 | gts = {} 238 | gts['class_ids'] = class_ids # int list, 1 to 6 239 | gts['bboxes'] = bboxes # np.array, [[y1, x1, y2, x2], ...] 240 | gts['scales'] = scales.astype(np.float32) # np.array, scale factor from NOCS model to depth observation 241 | gts['rotations'] = rotations.astype(np.float32) # np.array, R 242 | gts['translations'] = translations.astype(np.float32) # np.array, T 243 | gts['instance_ids'] = instance_ids # int list, start from 1 244 | gts['model_list'] = model_list # str list, model id/name 245 | with open(img_full_path + '_label.pkl', 'wb') as f: 246 | cPickle.dump(gts, f) 247 | valid_img_list.append(img_path) 248 | # write valid img list to file 249 | with open(os.path.join(data_dir, 'Real/train_list.txt'), 'w') as f: 250 | for img_path in valid_img_list: 251 | f.write("%s\n" % img_path) 252 | 253 | 254 | def annotate_test_data(data_dir): 255 | """ Generate gt labels for test data. 256 | Properly copy handle_visibility provided by NOCS gts. 
257 | """ 258 | # Statistics: 259 | # test_set missing file bad rendering no (occluded) fg occlusion (< 64 pts) 260 | # val 3792 imgs 132 imgs 1856 (23) imgs 50 insts 261 | # test 0 img 0 img 0 img 2 insts 262 | 263 | camera_val = open(os.path.join(data_dir, 'CAMERA', 'val_list_all.txt')).read().splitlines() 264 | real_test = open(os.path.join(data_dir, 'Real', 'test_list_all.txt')).read().splitlines() 265 | camera_intrinsics = np.array([[577.5, 0, 319.5], [0, 577.5, 239.5], [0, 0, 1]]) 266 | real_intrinsics = np.array([[591.0125, 0, 322.525], [0, 590.16775, 244.11084], [0, 0, 1]]) 267 | # compute model size 268 | model_file_path = ['obj_models/camera_val.pkl', 'obj_models/real_test.pkl'] 269 | models = {} 270 | for path in model_file_path: 271 | with open(os.path.join(data_dir, path), 'rb') as f: 272 | models.update(cPickle.load(f)) 273 | model_sizes = {} 274 | for key in models.keys(): 275 | model_sizes[key] = 2 * np.amax(np.abs(models[key]), axis=0) 276 | # meta info for re-label mug category 277 | with open(os.path.join(data_dir, 'obj_models/mug_meta.pkl'), 'rb') as f: 278 | mug_meta = cPickle.load(f) 279 | 280 | subset_meta = [('CAMERA', camera_val, camera_intrinsics, 'val'), ('Real', real_test, real_intrinsics, 'test')] 281 | for source, img_list, intrinsics, subset in subset_meta: 282 | valid_img_list = [] 283 | for img_path in tqdm(img_list): 284 | img_full_path = os.path.join(data_dir, source, img_path) 285 | all_exist = os.path.exists(img_full_path + '_color.png') and \ 286 | os.path.exists(img_full_path + '_coord.png') and \ 287 | os.path.exists(img_full_path + '_depth.png') and \ 288 | os.path.exists(img_full_path + '_mask.png') and \ 289 | os.path.exists(img_full_path + '_meta.txt') 290 | if not all_exist: 291 | continue 292 | depth = load_depth(img_full_path) 293 | masks, coords, class_ids, instance_ids, model_list, bboxes = process_data(img_full_path, depth) 294 | if instance_ids is None: 295 | continue 296 | num_insts = len(instance_ids) 297 | # match each instance with NOCS ground truth to properly assign gt_handle_visibility 298 | nocs_dir = os.path.join(os.path.dirname(data_dir), 'results/nocs_results') 299 | if source == 'CAMERA': 300 | nocs_path = os.path.join(nocs_dir, 'val', 'results_val_{}_{}.pkl'.format( 301 | img_path.split('/')[-2], img_path.split('/')[-1])) 302 | else: 303 | nocs_path = os.path.join(nocs_dir, 'real_test', 'results_test_{}_{}.pkl'.format( 304 | img_path.split('/')[-2], img_path.split('/')[-1])) 305 | with open(nocs_path, 'rb') as f: 306 | nocs = cPickle.load(f) 307 | gt_class_ids = nocs['gt_class_ids'] 308 | gt_bboxes = nocs['gt_bboxes'] 309 | gt_sRT = nocs['gt_RTs'] 310 | gt_handle_visibility = nocs['gt_handle_visibility'] 311 | map_to_nocs = [] 312 | for i in range(num_insts): 313 | gt_match = -1 314 | for j in range(len(gt_class_ids)): 315 | if gt_class_ids[j] != class_ids[i]: 316 | continue 317 | if np.sum(np.abs(bboxes[i] - gt_bboxes[j])) > 5: 318 | continue 319 | # match found 320 | gt_match = j 321 | break 322 | # check match validity 323 | assert gt_match > -1, print(img_path, instance_ids[i], 'no match for instance') 324 | assert gt_match not in map_to_nocs, print(img_path, instance_ids[i], 'duplicate match') 325 | map_to_nocs.append(gt_match) 326 | # copy from ground truth, re-label for mug category 327 | handle_visibility = gt_handle_visibility[map_to_nocs] 328 | sizes = np.zeros((num_insts, 3)) 329 | poses = np.zeros((num_insts, 4, 4)) 330 | scales = np.zeros(num_insts) 331 | rotations = np.zeros((num_insts, 3, 3)) 332 | translations 
= np.zeros((num_insts, 3)) 333 | for i in range(num_insts): 334 | gt_idx = map_to_nocs[i] 335 | sizes[i] = model_sizes[model_list[i]] 336 | sRT = gt_sRT[gt_idx] 337 | s = np.cbrt(np.linalg.det(sRT[:3, :3])) 338 | R = sRT[:3, :3] / s 339 | T = sRT[:3, 3] 340 | # re-label mug category 341 | if class_ids[i] == 6: 342 | T0 = mug_meta[model_list[i]][0] 343 | s0 = mug_meta[model_list[i]][1] 344 | T = T - s * R @ T0 345 | s = s / s0 346 | # used for test during training 347 | scales[i] = s 348 | rotations[i] = R 349 | translations[i] = T 350 | # used for evaluation 351 | sRT = np.identity(4, dtype=np.float32) 352 | sRT[:3, :3] = s * R 353 | sRT[:3, 3] = T 354 | poses[i] = sRT 355 | # write results 356 | gts = {} 357 | gts['class_ids'] = np.array(class_ids) # int list, 1 to 6 358 | gts['bboxes'] = bboxes # np.array, [[y1, x1, y2, x2], ...] 359 | gts['instance_ids'] = instance_ids # int list, start from 1 360 | gts['model_list'] = model_list # str list, model id/name 361 | gts['size'] = sizes # 3D size of NOCS model 362 | gts['scales'] = scales.astype(np.float32) # np.array, scale factor from NOCS model to depth observation 363 | gts['rotations'] = rotations.astype(np.float32) # np.array, R 364 | gts['translations'] = translations.astype(np.float32) # np.array, T 365 | gts['poses'] = poses.astype(np.float32) # np.array 366 | gts['handle_visibility'] = handle_visibility # handle visibility of mug 367 | with open(img_full_path + '_label.pkl', 'wb') as f: 368 | cPickle.dump(gts, f) 369 | valid_img_list.append(img_path) 370 | # write valid img list to file 371 | with open(os.path.join(data_dir, source, subset+'_list.txt'), 'w') as f: 372 | for img_path in valid_img_list: 373 | f.write("%s\n" % img_path) 374 | 375 | 376 | if __name__ == '__main__': 377 | data_dir = '/home/tianmeng/Documents/pose_ws/object-deformnet/data' 378 | # create list for all data 379 | create_img_list(data_dir) 380 | # annotate dataset and re-write valid data to list 381 | annotate_camera_train(data_dir) 382 | annotate_real_train(data_dir) 383 | annotate_test_data(data_dir) 384 | -------------------------------------------------------------------------------- /preprocess/shape_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import h5py 4 | import glob 5 | import numpy as np 6 | import _pickle as cPickle 7 | sys.path.append('../lib') 8 | from utils import sample_points_from_mesh 9 | 10 | 11 | def save_nocs_model_to_file(obj_model_dir): 12 | """ Sampling points from mesh model and normalize to NOCS. 13 | Models are centered at origin, i.e. 
NOCS-0.5 14 | 15 | """ 16 | mug_meta = {} 17 | # used for re-align mug category 18 | special_cases = {'3a7439cfaa9af51faf1af397e14a566d': np.array([0.115, 0.0, 0.0]), 19 | '5b0c679eb8a2156c4314179664d18101': np.array([0.083, 0.0, -0.044]), 20 | '649a51c711dc7f3b32e150233fdd42e9': np.array([0.0, 0.0, -0.017]), 21 | 'bf2b5e941b43d030138af902bc222a59': np.array([0.0534, 0.0, 0.0]), 22 | 'ca198dc3f7dc0cacec6338171298c66b': np.array([0.120, 0.0, 0.0]), 23 | 'f42a9784d165ad2f5e723252788c3d6e': np.array([0.117, 0.0, -0.026])} 24 | 25 | # CAMERA dataset 26 | for subset in ['train', 'val']: 27 | camera = {} 28 | for synsetId in ['02876657', '02880940', '02942699', '02946921', '03642806', '03797390']: 29 | synset_dir = os.path.join(obj_model_dir, subset, synsetId) 30 | inst_list = sorted(os.listdir(synset_dir)) 31 | for instance in inst_list: 32 | path_to_mesh_model = os.path.join(synset_dir, instance, 'model.obj') 33 | model_points = sample_points_from_mesh(path_to_mesh_model, 1024, fps=True, ratio=3) 34 | # flip z-axis in CAMERA 35 | model_points = model_points * np.array([[1.0, 1.0, -1.0]]) 36 | # re-align mug category 37 | if synsetId == '03797390': 38 | if instance == 'b9be7cfe653740eb7633a2dd89cec754': 39 | # skip this instance in train set, improper mug model, only influence training. 40 | continue 41 | if instance in special_cases.keys(): 42 | shift = special_cases[instance] 43 | else: 44 | shift_x = (np.amin(model_points[:, 2]) - np.amax(model_points[:, 2])) / 2 - np.amin(model_points[:, 0]) 45 | shift = np.array([shift_x, 0.0, 0.0]) 46 | model_points += shift 47 | size = 2 * np.amax(np.abs(model_points), axis=0) 48 | scale = 1 / np.linalg.norm(size) 49 | model_points *= scale 50 | mug_meta[instance] = [shift, scale] 51 | camera[instance] = model_points 52 | with open(os.path.join(obj_model_dir, 'camera_{}.pkl'.format(subset)), 'wb') as f: 53 | cPickle.dump(camera, f) 54 | # Real dataset 55 | for subset in ['real_train', 'real_test']: 56 | real = {} 57 | inst_list = glob.glob(os.path.join(obj_model_dir, subset, '*.obj')) 58 | for inst_path in inst_list: 59 | instance = os.path.basename(inst_path).split('.')[0] 60 | bbox_file = inst_path.replace('.obj', '.txt') 61 | bbox_dims = np.loadtxt(bbox_file) 62 | scale = np.linalg.norm(bbox_dims) 63 | model_points = sample_points_from_mesh(inst_path, 1024, fps=True, ratio=3) 64 | model_points /= scale 65 | # relable mug category 66 | if 'mug' in instance: 67 | shift_x = (np.amin(model_points[:, 2]) - np.amax(model_points[:, 2])) / 2 - np.amin(model_points[:, 0]) 68 | shift = np.array([shift_x, 0.0, 0.0]) 69 | model_points += shift 70 | size = 2 * np.amax(np.abs(model_points), axis=0) 71 | scale = 1 / np.linalg.norm(size) 72 | model_points *= scale 73 | mug_meta[instance] = [shift, scale] 74 | real[instance] = model_points 75 | with open(os.path.join(obj_model_dir, '{}.pkl'.format(subset)), 'wb') as f: 76 | cPickle.dump(real, f) 77 | # save mug_meta information for re-labeling 78 | with open(os.path.join(obj_model_dir, 'mug_meta.pkl'), 'wb') as f: 79 | cPickle.dump(mug_meta, f) 80 | 81 | 82 | def save_model_to_hdf5(obj_model_dir, n_points, fps=False, include_distractors=False, with_normal=False): 83 | """ Save object models (point cloud) to HDF5 file. 84 | Dataset used to train the auto-encoder. 85 | Only use models from ShapeNetCore. 86 | Background objects are not inlcuded as default. We did not observe that it helps 87 | to train the auto-encoder. 
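Example (a sketch of reading the resulting file back; n_points=2048 is assumed here):
    with h5py.File(os.path.join(obj_model_dir, 'ShapeNetCore_2048.h5'), 'r') as f:
        train_data = f['train']['data'][:]    # (num_train, n_points, 3)
        train_label = f['train']['label'][:]  # category ids 1-6 (0 would mark distractors if included)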
88 | 89 | """ 90 | catId_to_synsetId = {1: '02876657', 2: '02880940', 3: '02942699', 4: '02946921', 5: '03642806', 6: '03797390'} 91 | distractors_synsetId = ['00000000', '02954340', '02992529', '03211117'] 92 | with open(os.path.join(obj_model_dir, 'mug_meta.pkl'), 'rb') as f: 93 | mug_meta = cPickle.load(f) 94 | # read all the paths to models 95 | print('Sampling points from mesh model ...') 96 | if with_normal: 97 | train_data = np.zeros((3000, n_points, 6), dtype=np.float32) 98 | val_data = np.zeros((500, n_points, 6), dtype=np.float32) 99 | else: 100 | train_data = np.zeros((3000, n_points, 3), dtype=np.float32) 101 | val_data = np.zeros((500, n_points, 3), dtype=np.float32) 102 | train_label = [] 103 | val_label = [] 104 | train_count = 0 105 | val_count = 0 106 | # CAMERA 107 | for subset in ['train', 'val']: 108 | for catId in range(1, 7): 109 | synset_dir = os.path.join(obj_model_dir, subset, catId_to_synsetId[catId]) 110 | inst_list = sorted(os.listdir(synset_dir)) 111 | for instance in inst_list: 112 | path_to_mesh_model = os.path.join(synset_dir, instance, 'model.obj') 113 | if instance == 'b9be7cfe653740eb7633a2dd89cec754': 114 | continue 115 | model_points = sample_points_from_mesh(path_to_mesh_model, n_points, with_normal, fps=fps, ratio=2) 116 | model_points = model_points * np.array([[1.0, 1.0, -1.0]]) 117 | if catId == 6: 118 | shift = mug_meta[instance][0] 119 | scale = mug_meta[instance][1] 120 | model_points = scale * (model_points + shift) 121 | if subset == 'train': 122 | train_data[train_count] = model_points 123 | train_label.append(catId) 124 | train_count += 1 125 | else: 126 | val_data[val_count] = model_points 127 | val_label.append(catId) 128 | val_count += 1 129 | # distractors 130 | if include_distractors: 131 | for synsetId in distractors_synsetId: 132 | synset_dir = os.path.join(obj_model_dir, subset, synsetId) 133 | inst_list = sorted(os.listdir(synset_dir)) 134 | for instance in inst_list: 135 | path_to_mesh_model = os.path.join(synset_dir, instance, 'model.obj') 136 | model_points = sample_points_from_mesh(path_to_mesh_model, n_points, with_normal, fps=fps, ratio=2) 137 | # TODO: check whether need to flip z-axis, currently not used 138 | model_points = model_points * np.array([[1.0, 1.0, -1.0]]) 139 | if subset == 'train': 140 | train_data[train_count] = model_points 141 | train_label.append(0) 142 | train_count += 1 143 | else: 144 | val_data[val_count] = model_points 145 | val_label.append(0) 146 | val_count += 1 147 | # Real 148 | for subset in ['real_train', 'real_test']: 149 | path_to_mesh_models = glob.glob(os.path.join(obj_model_dir, subset, '*.obj')) 150 | for inst_path in sorted(path_to_mesh_models): 151 | instance = os.path.basename(inst_path).split('.')[0] 152 | if instance.startswith('bottle'): 153 | catId = 1 154 | elif instance.startswith('bowl'): 155 | catId = 2 156 | elif instance.startswith('camera'): 157 | catId = 3 158 | elif instance.startswith('can'): 159 | catId = 4 160 | elif instance.startswith('laptop'): 161 | catId = 5 162 | elif instance.startswith('mug'): 163 | catId = 6 164 | else: 165 | raise NotImplementedError 166 | model_points = sample_points_from_mesh(inst_path, n_points, with_normal, fps=fps, ratio=2) 167 | bbox_file = inst_path.replace('.obj', '.txt') 168 | bbox_dims = np.loadtxt(bbox_file) 169 | model_points /= np.linalg.norm(bbox_dims) 170 | if catId == 6: 171 | shift = mug_meta[instance][0] 172 | scale = mug_meta[instance][1] 173 | model_points = scale * (model_points + shift) 174 | if subset == 'real_train': 175 
| train_data[train_count] = model_points 176 | train_label.append(catId) 177 | train_count += 1 178 | else: 179 | val_data[val_count] = model_points 180 | val_label.append(catId) 181 | val_count += 1 182 | 183 | num_train_instances = len(train_label) 184 | num_val_instances = len(val_label) 185 | assert num_train_instances == train_count 186 | assert num_val_instances == val_count 187 | train_data = train_data[:num_train_instances] 188 | val_data = val_data[:num_val_instances] 189 | train_label = np.array(train_label, dtype=np.uint8) 190 | val_label = np.array(val_label, dtype=np.uint8) 191 | print('{} shapes found in train dataset'.format(num_train_instances)) 192 | print('{} shapes found in val dataset'.format(num_val_instances)) 193 | 194 | # write to HDF5 file 195 | print('Writing data to HDF5 file ...') 196 | if with_normal: 197 | filename = 'ShapeNetCore_{}_with_normal.h5'.format(n_points) 198 | else: 199 | filename = 'ShapeNetCore_{}.h5'.format(n_points) 200 | hfile = h5py.File(os.path.join(obj_model_dir, filename), 'w') 201 | train_dataset = hfile.create_group('train') 202 | train_dataset.attrs.create('len', num_train_instances) 203 | train_dataset.create_dataset('data', data=train_data, compression='gzip', dtype='float32') 204 | train_dataset.create_dataset('label', data=train_label, compression='gzip', dtype='uint8') 205 | val_dataset = hfile.create_group('val') 206 | val_dataset.attrs.create('len', num_val_instances) 207 | val_dataset.create_dataset('data', data=val_data, compression='gzip', dtype='float32') 208 | val_dataset.create_dataset('label', data=val_label, compression='gzip', dtype='uint8') 209 | hfile.close() 210 | 211 | 212 | if __name__ == '__main__': 213 | obj_model_dir = '/home/tianmeng/Documents/pose_ws/object-deformnet/data/obj_models' 214 | # Save ground truth models for training deform network 215 | save_nocs_model_to_file(obj_model_dir) 216 | # Save models to HDF5 file for training the auto-encoder. 217 | save_model_to_hdf5(obj_model_dir, n_points=4096, fps=False) 218 | # Save nmodels to HDF5 file, which used to generate mean shape. 
219 | save_model_to_hdf5(obj_model_dir, n_points=2048, fps=True) 220 | 221 | # import random 222 | # import open3d as o3d 223 | # for file in ['camera_train.pkl', 'camera_val.pkl', 'real_train.pkl', 'real_test.pkl']: 224 | # with open(os.path.join(obj_model_dir, file), 'rb') as f: 225 | # obj_models = cPickle.load(f) 226 | # instance = random.choice(list(obj_models.keys())) 227 | # model_points = obj_models[instance] 228 | # print('Diameter: {}'.format(np.linalg.norm(2*np.amax(np.abs(model_points), axis=0)))) 229 | # color = np.repeat(np.array([[1, 0, 0]]), model_points.shape[0], axis=0) 230 | # pcd = o3d.geometry.PointCloud() 231 | # pcd.points = o3d.utility.Vector3dVector(model_points) 232 | # pcd.colors = o3d.utility.Vector3dVector(color) 233 | # # visualization: camera coordinate frame 234 | # points = [[0, 0, 0], [0.5, 0, 0], [0, 0.5, 0], [0, 0, 0.5]] 235 | # lines = [[0, 1], [0, 2], [0, 3]] 236 | # colors = [[1, 0, 0], [0, 1, 0], [0, 0, 1]] 237 | # line_set = o3d.geometry.LineSet() 238 | # line_set.points = o3d.utility.Vector3dVector(points) 239 | # line_set.lines = o3d.utility.Vector2iVector(lines) 240 | # line_set.colors = o3d.utility.Vector3dVector(colors) 241 | # o3d.visualization.draw_geometries([pcd, line_set]) 242 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Shape Prior Deformation for Categorical 6D Object Pose and Size Estimation 2 | ![teaser](assets/teaser.png) 3 | 4 | ## Overview 5 | This repository contains the PyTorch implementation of the paper "Shape Prior Deformation for Categorical 6D Object Pose and Size Estimation" 6 | ([arXiv](http://arxiv.org/abs/2007.08454)). 7 | Our approach recovers the 6D pose and size of unseen object instances from an RGB-D image and reconstructs their complete 3D models. 8 | 9 | ## Dependencies 10 | * Python 3.6 11 | * PyTorch 1.0.1 12 | * CUDA 9.0 13 | 14 | ## Installation 15 | ``` 16 | ROOT=/path/to/object-deformnet 17 | cd $ROOT/lib/nn_distance 18 | python setup.py install --user 19 | ``` 20 | 21 | ## Datasets 22 | Download [camera_train](http://download.cs.stanford.edu/orion/nocs/camera_train.zip), [camera_val](http://download.cs.stanford.edu/orion/nocs/camera_val25K.zip), 23 | [real_train](http://download.cs.stanford.edu/orion/nocs/real_train.zip), [real_test](http://download.cs.stanford.edu/orion/nocs/real_test.zip), 24 | [ground-truth annotations](http://download.cs.stanford.edu/orion/nocs/gts.zip), 25 | and [mesh models](http://download.cs.stanford.edu/orion/nocs/obj_models.zip) 26 | provided by [NOCS](https://github.com/hughw19/NOCS_CVPR2019).
27 | Unzip and organize these files in $ROOT/data as follows: 28 | ``` 29 | data 30 | ├── CAMERA 31 | │ ├── train 32 | │ └── val 33 | ├── Real 34 | │ ├── train 35 | │ └── test 36 | ├── gts 37 | │ ├── val 38 | │ └── real_test 39 | └── obj_models 40 | ├── train 41 | ├── val 42 | ├── real_train 43 | └── real_test 44 | ``` 45 | Run the following scripts to prepare the datasets. 46 | ``` 47 | cd $ROOT/preprocess 48 | python shape_data.py 49 | python pose_data.py 50 | ``` 51 | Note that running the scripts will additionally shift and re-scale the models of the mug category (without modifying the original files), 52 | so that the origin of the object coordinate frame lies on the axis of symmetry. 53 | This step was implemented for one of our early experiments and turned out to be unnecessary. 54 | Skipping it should make no difference to the performance of our approach. 55 | We keep it in this repo for reproducibility. 56 | 57 | ## Training 58 | ``` 59 | # optional - train an Autoencoder from scratch and prepare the shape priors 60 | python train_ae.py 61 | python mean_shape.py 62 | 63 | # train DeformNet 64 | python train_deform.py 65 | ``` 66 | 67 | ## Evaluation 68 | Download the pre-trained models, segmentation results from Mask R-CNN, and predictions of NOCS from [here](https://drive.google.com/file/d/1p72NdY4Bie_sra9U8zoUNI4fTrQZdbnc/view?usp=sharing). 69 | ``` 70 | unzip -q deformnet_eval.zip 71 | mv deformnet_eval/* $ROOT/results 72 | rmdir deformnet_eval 73 | cd $ROOT 74 | python evaluate.py 75 | ``` 76 | 77 | ## Citation 78 | If you find our work helpful, please consider citing: 79 | ``` 80 | @InProceedings{Tian_2020_ECCV, 81 | author = {Tian, Meng and Ang Jr, Marcelo H and Lee, Gim Hee}, 82 | title = {Shape Prior Deformation for Categorical 6D Object Pose and Size Estimation}, 83 | booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)}, 84 | month = {August}, 85 | year = {2020} 86 | } 87 | ``` 88 | 89 | ## Acknowledgment 90 | Our implementation leverages code from [NOCS](https://github.com/hughw19/NOCS_CVPR2019) and [3PU](https://github.com/yifita/3PU_pytorch). 91 | -------------------------------------------------------------------------------- /tools/tsne.py: -------------------------------------------------------------------------------- 1 | # 2 | # tsne.py 3 | # 4 | # Implementation of t-SNE in Python. The implementation was tested on Python 5 | # 2.7.10, and it requires a working installation of NumPy. The implementation 6 | # comes with an example on the MNIST dataset. In order to plot the 7 | # results of this example, a working installation of matplotlib is required. 8 | # 9 | # The example can be run by executing: `ipython tsne.py` 10 | # 11 | # 12 | # Created by Laurens van der Maaten on 20-12-08. 13 | # Copyright (c) 2008 Tilburg University. All rights reserved. 14 | 15 | import numpy as np 16 | import pylab 17 | 18 | 19 | def Hbeta(D=np.array([]), beta=1.0): 20 | """ 21 | Compute the perplexity and the P-row for a specific value of the 22 | precision of a Gaussian distribution. 23 | """ 24 | 25 | # Compute P-row and corresponding perplexity 26 | P = np.exp(-D.copy() * beta) 27 | sumP = sum(P) 28 | H = np.log(sumP) + beta * np.sum(D * P) / sumP 29 | P = P / sumP 30 | return H, P 31 | 32 | 33 | def x2p(X=np.array([]), tol=1e-5, perplexity=30.0): 34 | """ 35 | Performs a binary search to get P-values in such a way that each 36 | conditional Gaussian has the same perplexity.
37 | """ 38 | 39 | # Initialize some variables 40 | print("Computing pairwise distances...") 41 | (n, d) = X.shape 42 | sum_X = np.sum(np.square(X), 1) 43 | D = np.add(np.add(-2 * np.dot(X, X.T), sum_X).T, sum_X) 44 | P = np.zeros((n, n)) 45 | beta = np.ones((n, 1)) 46 | logU = np.log(perplexity) 47 | 48 | # Loop over all datapoints 49 | for i in range(n): 50 | 51 | # Print progress 52 | if i % 500 == 0: 53 | print("Computing P-values for point %d of %d..." % (i, n)) 54 | 55 | # Compute the Gaussian kernel and entropy for the current precision 56 | betamin = -np.inf 57 | betamax = np.inf 58 | Di = D[i, np.concatenate((np.r_[0:i], np.r_[i+1:n]))] 59 | (H, thisP) = Hbeta(Di, beta[i]) 60 | 61 | # Evaluate whether the perplexity is within tolerance 62 | Hdiff = H - logU 63 | tries = 0 64 | while np.abs(Hdiff) > tol and tries < 50: 65 | 66 | # If not, increase or decrease precision 67 | if Hdiff > 0: 68 | betamin = beta[i].copy() 69 | if betamax == np.inf or betamax == -np.inf: 70 | beta[i] = beta[i] * 2. 71 | else: 72 | beta[i] = (beta[i] + betamax) / 2. 73 | else: 74 | betamax = beta[i].copy() 75 | if betamin == np.inf or betamin == -np.inf: 76 | beta[i] = beta[i] / 2. 77 | else: 78 | beta[i] = (beta[i] + betamin) / 2. 79 | 80 | # Recompute the values 81 | (H, thisP) = Hbeta(Di, beta[i]) 82 | Hdiff = H - logU 83 | tries += 1 84 | 85 | # Set the final row of P 86 | P[i, np.concatenate((np.r_[0:i], np.r_[i+1:n]))] = thisP 87 | 88 | # Return final P-matrix 89 | print("Mean value of sigma: %f" % np.mean(np.sqrt(1 / beta))) 90 | return P 91 | 92 | 93 | def pca(X=np.array([]), no_dims=50): 94 | """ 95 | Runs PCA on the NxD array X in order to reduce its dimensionality to 96 | no_dims dimensions. 97 | """ 98 | 99 | print("Preprocessing the data using PCA...") 100 | (n, d) = X.shape 101 | X = X - np.tile(np.mean(X, 0), (n, 1)) 102 | (l, M) = np.linalg.eig(np.dot(X.T, X)) 103 | Y = np.dot(X, M[:, 0:no_dims]) 104 | return Y 105 | 106 | 107 | def tsne(X=np.array([]), no_dims=2, initial_dims=50, perplexity=30.0): 108 | """ 109 | Runs t-SNE on the dataset in the NxD array X to reduce its 110 | dimensionality to no_dims dimensions. The syntaxis of the function is 111 | `Y = tsne.tsne(X, no_dims, perplexity), where X is an NxD NumPy array. 112 | """ 113 | 114 | # Check inputs 115 | if isinstance(no_dims, float): 116 | print("Error: array X should have type float.") 117 | return -1 118 | if round(no_dims) != no_dims: 119 | print("Error: number of dimensions should be an integer.") 120 | return -1 121 | 122 | # Initialize variables 123 | X = pca(X, initial_dims).real 124 | (n, d) = X.shape 125 | max_iter = 1000 126 | initial_momentum = 0.5 127 | final_momentum = 0.8 128 | eta = 500 129 | min_gain = 0.01 130 | Y = np.random.randn(n, no_dims) 131 | dY = np.zeros((n, no_dims)) 132 | iY = np.zeros((n, no_dims)) 133 | gains = np.ones((n, no_dims)) 134 | 135 | # Compute P-values 136 | P = x2p(X, 1e-5, perplexity) 137 | P = P + np.transpose(P) 138 | P = P / np.sum(P) 139 | P = P * 4. # early exaggeration 140 | P = np.maximum(P, 1e-12) 141 | 142 | # Run iterations 143 | for iter in range(max_iter): 144 | 145 | # Compute pairwise affinities 146 | sum_Y = np.sum(np.square(Y), 1) 147 | num = -2. * np.dot(Y, Y.T) 148 | num = 1. / (1. + np.add(np.add(num, sum_Y).T, sum_Y)) 149 | num[range(n), range(n)] = 0. 
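        # num holds the unnormalized Student-t affinities 1 / (1 + ||y_i - y_j||^2),
        # with the diagonal zeroed so that self-pairs do not contribute; normalizing
        # below yields the low-dimensional joint distribution Q, which is clipped at
        # 1e-12 for numerical stability.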
150 | Q = num / np.sum(num) 151 | Q = np.maximum(Q, 1e-12) 152 | 153 | # Compute gradient 154 | PQ = P - Q 155 | for i in range(n): 156 | dY[i, :] = np.sum(np.tile(PQ[:, i] * num[:, i], (no_dims, 1)).T * (Y[i, :] - Y), 0) 157 | 158 | # Perform the update 159 | if iter < 20: 160 | momentum = initial_momentum 161 | else: 162 | momentum = final_momentum 163 | gains = (gains + 0.2) * ((dY > 0.) != (iY > 0.)) + \ 164 | (gains * 0.8) * ((dY > 0.) == (iY > 0.)) 165 | gains[gains < min_gain] = min_gain 166 | iY = momentum * iY - eta * (gains * dY) 167 | Y = Y + iY 168 | Y = Y - np.tile(np.mean(Y, 0), (n, 1)) 169 | 170 | # Compute current value of cost function 171 | if (iter + 1) % 10 == 0: 172 | C = np.sum(P * np.log(P / Q)) 173 | print("Iteration %d: error is %f" % (iter + 1, C)) 174 | 175 | # Stop lying about P-values 176 | if iter == 100: 177 | P = P / 4. 178 | 179 | # Return solution 180 | return Y 181 | 182 | 183 | if __name__ == "__main__": 184 | print("Run Y = tsne.tsne(X, no_dims, perplexity) to perform t-SNE on your dataset.") 185 | print("Running example on 2,500 MNIST digits...") 186 | X = np.loadtxt("mnist2500_X.txt") 187 | labels = np.loadtxt("mnist2500_labels.txt") 188 | Y = tsne(X, 2, 50, 20.0) 189 | pylab.scatter(Y[:, 0], Y[:, 1], 20, labels) 190 | pylab.show() 191 | -------------------------------------------------------------------------------- /train_ae.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import argparse 4 | import torch 5 | import tensorflow as tf 6 | from lib.auto_encoder import PointCloudAE 7 | from lib.loss import ChamferLoss 8 | from data.shape_dataset import ShapeDataset 9 | from lib.utils import setup_logger 10 | 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--num_point', type=int, default=1024, help='number of points, needed if use points') 14 | parser.add_argument('--emb_dim', type=int, default=512, help='dimension of latent embedding [default: 512]') 15 | parser.add_argument('--h5_file', type=str, default='data/obj_models/ShapeNetCore_4096.h5', help='h5 file') 16 | parser.add_argument('--batch_size', type=int, default=32, help='batch size') 17 | parser.add_argument('--num_workers', type=int, default=10, help='number of data loading workers') 18 | parser.add_argument('--gpu', type=str, default='0', help='GPU to use') 19 | parser.add_argument('--lr', type=float, default=0.0001, help='initial learning rate') 20 | parser.add_argument('--start_epoch', type=int, default=1, help='which epoch to start') 21 | parser.add_argument('--max_epoch', type=int, default=50, help='max number of epochs to train') 22 | parser.add_argument('--resume_model', type=str, default='', help='resume from saved model') 23 | parser.add_argument('--result_dir', type=str, default='results/ae_points', help='directory to save train results') 24 | opt = parser.parse_args() 25 | 26 | opt.repeat_epoch = 10 27 | opt.decay_step = 5000 28 | opt.decay_rate = [1.0, 0.6, 0.3, 0.1] 29 | 30 | 31 | def train_net(): 32 | # set result directory 33 | if not os.path.exists(opt.result_dir): 34 | os.makedirs(opt.result_dir) 35 | tb_writer = tf.summary.FileWriter(opt.result_dir) 36 | logger = setup_logger('train_log', os.path.join(opt.result_dir, 'log.txt')) 37 | for key, value in vars(opt).items(): 38 | logger.info(key + ': ' + str(value)) 39 | os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu 40 | # model & loss 41 | estimator = PointCloudAE(opt.emb_dim, opt.num_point) 42 | estimator.cuda() 43 | criterion = 
ChamferLoss() 44 | if opt.resume_model != '': 45 | estimator.load_state_dict(torch.load(opt.resume_model)) 46 | # dataset 47 | train_dataset = ShapeDataset(opt.h5_file, mode='train', augment=True) 48 | train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=opt.batch_size, 49 | shuffle=True, num_workers=opt.num_workers) 50 | val_dataset = ShapeDataset(opt.h5_file, mode='val', augment=False) 51 | val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=opt.batch_size, 52 | shuffle=False, num_workers=opt.num_workers) 53 | # train 54 | st_time = time.time() 55 | global_step = ((train_dataset.length + opt.batch_size - 1) // opt.batch_size) * opt.repeat_epoch * (opt.start_epoch - 1) 56 | decay_count = -1 57 | for epoch in range(opt.start_epoch, opt.max_epoch+1): 58 | # train one epoch 59 | logger.info('Time {0}'.format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - st_time)) + \ 60 | ', ' + 'Epoch %02d' % epoch + ', ' + 'Training started')) 61 | # create optimizer and adjust learning rate if needed 62 | if global_step // opt.decay_step > decay_count: 63 | decay_count += 1 64 | if decay_count < len(opt.decay_rate): 65 | current_lr = opt.lr * opt.decay_rate[decay_count] 66 | optimizer = torch.optim.Adam(estimator.parameters(), lr=current_lr) 67 | batch_idx = 0 68 | estimator.train() 69 | for rep in range(opt.repeat_epoch): 70 | for i, data in enumerate(train_dataloader): 71 | # label must be zero_indexed 72 | batch_xyz, batch_label = data 73 | batch_xyz = batch_xyz[:, :, :3].cuda() 74 | optimizer.zero_grad() 75 | embedding, point_cloud = estimator(batch_xyz) 76 | loss, _, _ = criterion(point_cloud, batch_xyz) 77 | summary = tf.Summary(value=[tf.Summary.Value(tag='learning_rate', simple_value=current_lr), 78 | tf.Summary.Value(tag='train_loss', simple_value=loss)]) 79 | # backward 80 | loss.backward() 81 | optimizer.step() 82 | global_step += 1 83 | batch_idx += 1 84 | # write results to tensorboard 85 | tb_writer.add_summary(summary, global_step) 86 | if batch_idx % 10 == 0: 87 | logger.info('Batch {0} Loss:{1:f}'.format(batch_idx, loss)) 88 | logger.info('>>>>>>>>----------Epoch {:02d} train finish---------<<<<<<<<'.format(epoch)) 89 | # evaluate one epoch 90 | logger.info('Time {0}'.format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - st_time)) + \ 91 | ', ' + 'Epoch %02d' % epoch + ', ' + 'Testing started')) 92 | estimator.eval() 93 | val_loss = 0.0 94 | for i, data in enumerate(val_dataloader, 1): 95 | batch_xyz, batch_label = data 96 | batch_xyz = batch_xyz[:, :, :3].cuda() 97 | embedding, point_cloud = estimator(batch_xyz) 98 | loss, _, _ = criterion(point_cloud, batch_xyz) 99 | val_loss += loss.item() 100 | logger.info('Batch {0} Loss:{1:f}'.format(i, loss)) 101 | val_loss = val_loss / i 102 | summary = tf.Summary(value=[tf.Summary.Value(tag='val_loss', simple_value=val_loss)]) 103 | tb_writer.add_summary(summary, global_step) 104 | logger.info('Epoch {0:02d} test average loss: {1:06f}'.format(epoch, val_loss)) 105 | logger.info('>>>>>>>>----------Epoch {:02d} test finish---------<<<<<<<<'.format(epoch)) 106 | # save model after each epoch 107 | torch.save(estimator.state_dict(), '{0}/model_{1:02d}.pth'.format(opt.result_dir, epoch)) 108 | 109 | 110 | if __name__ == '__main__': 111 | train_net() 112 | -------------------------------------------------------------------------------- /train_deform.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import argparse 4 | import random 
5 | import numpy as np 6 | import torch 7 | import torch.nn.functional as F 8 | import tensorflow as tf 9 | from lib.network import DeformNet 10 | from lib.loss import Loss 11 | from data.pose_dataset import PoseDataset 12 | from lib.utils import setup_logger, compute_sRT_errors 13 | from lib.align import estimateSimilarityTransform 14 | 15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('--dataset', type=str, default='CAMERA', help='CAMERA or CAMERA+Real') 18 | parser.add_argument('--data_dir', type=str, default='data', help='data directory') 19 | parser.add_argument('--n_pts', type=int, default=1024, help='number of foreground points') 20 | parser.add_argument('--n_cat', type=int, default=6, help='number of object categories') 21 | parser.add_argument('--nv_prior', type=int, default=1024, help='number of vertices in shape priors') 22 | parser.add_argument('--img_size', type=int, default=192, help='cropped image size') 23 | parser.add_argument('--batch_size', type=int, default=32, help='batch size') 24 | parser.add_argument('--num_workers', type=int, default=10, help='number of data loading workers') 25 | parser.add_argument('--gpu', type=str, default='0', help='GPU to use') 26 | parser.add_argument('--lr', type=float, default=0.0001, help='initial learning rate') 27 | parser.add_argument('--start_epoch', type=int, default=1, help='which epoch to start') 28 | parser.add_argument('--max_epoch', type=int, default=50, help='max number of epochs to train') 29 | parser.add_argument('--resume_model', type=str, default='', help='resume from saved model') 30 | parser.add_argument('--result_dir', type=str, default='results/camera', help='directory to save train results') 31 | opt = parser.parse_args() 32 | 33 | opt.decay_epoch = [0, 10, 20, 30, 40] 34 | opt.decay_rate = [1.0, 0.6, 0.3, 0.1, 0.01] 35 | opt.corr_wt = 1.0 36 | opt.cd_wt = 5.0 37 | opt.entropy_wt = 0.0001 38 | opt.deform_wt = 0.01 39 | 40 | 41 | def train_net(): 42 | # set result directory 43 | if not os.path.exists(opt.result_dir): 44 | os.makedirs(opt.result_dir) 45 | tb_writer = tf.summary.FileWriter(opt.result_dir) 46 | logger = setup_logger('train_log', os.path.join(opt.result_dir, 'log.txt')) 47 | for key, value in vars(opt).items(): 48 | logger.info(key + ': ' + str(value)) 49 | os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu 50 | # model & loss 51 | estimator = DeformNet(opt.n_cat, opt.nv_prior) 52 | estimator.cuda() 53 | criterion = Loss(opt.corr_wt, opt.cd_wt, opt.entropy_wt, opt.deform_wt) 54 | if opt.resume_model != '': 55 | estimator.load_state_dict(torch.load(opt.resume_model)) 56 | # dataset 57 | train_dataset = PoseDataset(opt.dataset, 'train', opt.data_dir, opt.n_pts, opt.img_size) 58 | val_dataset = PoseDataset(opt.dataset, 'test', opt.data_dir, opt.n_pts, opt.img_size) 59 | # start training 60 | st_time = time.time() 61 | train_steps = 1500 62 | global_step = train_steps * (opt.start_epoch - 1) 63 | n_decays = len(opt.decay_epoch) 64 | assert len(opt.decay_rate) == n_decays 65 | for i in range(n_decays): 66 | if opt.start_epoch > opt.decay_epoch[i]: 67 | decay_count = i 68 | train_size = train_steps * opt.batch_size 69 | indices = [] 70 | page_start = -train_size 71 | for epoch in range(opt.start_epoch, opt.max_epoch + 1): 72 | # train one epoch 73 | logger.info('Time {0}'.format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - st_time)) + \ 74 | ', ' + 'Epoch %02d' % epoch + ', ' + 'Training started')) 75 | # create optimizer and adjust learning rate if needed 76 | if decay_count < 
len(opt.decay_rate): 77 | if epoch > opt.decay_epoch[decay_count]: 78 | current_lr = opt.lr * opt.decay_rate[decay_count] 79 | optimizer = torch.optim.Adam(estimator.parameters(), lr=current_lr) 80 | decay_count += 1 81 | # sample train subset 82 | page_start += train_size 83 | len_last = len(indices) - page_start 84 | if len_last < train_size: 85 | indices = indices[page_start:] 86 | if opt.dataset == 'CAMERA+Real': 87 | # CAMERA : Real = 3 : 1 88 | camera_len = train_dataset.subset_len[0] 89 | real_len = train_dataset.subset_len[1] 90 | real_indices = list(range(camera_len, camera_len+real_len)) 91 | camera_indices = list(range(camera_len)) 92 | n_repeat = (train_size - len_last) // (4 * real_len) + 1 93 | data_list = random.sample(camera_indices, 3*n_repeat*real_len) + real_indices*n_repeat 94 | random.shuffle(data_list) 95 | indices += data_list 96 | else: 97 | data_list = list(range(train_dataset.length)) 98 | for i in range((train_size - len_last) // train_dataset.length + 1): 99 | random.shuffle(data_list) 100 | indices += data_list 101 | page_start = 0 102 | train_idx = indices[page_start:(page_start+train_size)] 103 | train_sampler = torch.utils.data.sampler.SubsetRandomSampler(train_idx) 104 | train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=opt.batch_size, sampler=train_sampler, 105 | num_workers=opt.num_workers, pin_memory=True) 106 | estimator.train() 107 | for i, data in enumerate(train_dataloader, 1): 108 | points, rgb, choose, cat_id, model, prior, sRT, nocs = data 109 | points = points.cuda() 110 | rgb = rgb.cuda() 111 | choose = choose.cuda() 112 | cat_id = cat_id.cuda() 113 | model = model.cuda() 114 | prior = prior.cuda() 115 | sRT = sRT.cuda() 116 | nocs = nocs.cuda() 117 | assign_mat, deltas = estimator(points, rgb, choose, cat_id, prior) 118 | loss, corr_loss, cd_loss, entropy_loss, deform_loss = criterion(assign_mat, deltas, prior, nocs, model) 119 | optimizer.zero_grad() 120 | loss.backward() 121 | optimizer.step() 122 | global_step += 1 123 | # write results to tensorboard 124 | summary = tf.Summary(value=[tf.Summary.Value(tag='learning_rate', simple_value=current_lr), 125 | tf.Summary.Value(tag='train_loss', simple_value=loss), 126 | tf.Summary.Value(tag='corr_loss', simple_value=corr_loss), 127 | tf.Summary.Value(tag='cd_loss', simple_value=cd_loss), 128 | tf.Summary.Value(tag='entropy_loss', simple_value=entropy_loss), 129 | tf.Summary.Value(tag='deform_loss', simple_value=deform_loss)]) 130 | tb_writer.add_summary(summary, global_step) 131 | if i % 10 == 0: 132 | logger.info('Batch {0} Loss:{1:f}, corr_loss:{2:f}, cd_loss:{3:f}, entropy_loss:{4:f}, deform_loss:{5:f}'.format( 133 | i, loss.item(), corr_loss.item(), cd_loss.item(), entropy_loss.item(), deform_loss.item())) 134 | 135 | logger.info('>>>>>>>>----------Epoch {:02d} train finish---------<<<<<<<<'.format(epoch)) 136 | 137 | # evaluate one epoch 138 | logger.info('Time {0}'.format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - st_time)) + 139 | ', ' + 'Epoch %02d' % epoch + ', ' + 'Testing started')) 140 | val_loss = 0.0 141 | total_count = np.zeros((opt.n_cat,), dtype=int) 142 | strict_success = np.zeros((opt.n_cat,), dtype=int) # 5 degree and 5 cm 143 | easy_success = np.zeros((opt.n_cat,), dtype=int) # 10 degree and 5 cm 144 | iou_success = np.zeros((opt.n_cat,), dtype=int) # relative scale error < 0.1 145 | # sample validation subset 146 | val_size = 1500 147 | val_idx = random.sample(list(range(val_dataset.length)), val_size) 148 | val_sampler = 
torch.utils.data.sampler.SubsetRandomSampler(val_idx) 149 | val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=1, sampler=val_sampler, 150 | num_workers=opt.num_workers, pin_memory=True) 151 | estimator.eval() 152 | for i, data in enumerate(val_dataloader, 1): 153 | points, rgb, choose, cat_id, model, prior, sRT, nocs = data 154 | points = points.cuda() 155 | rgb = rgb.cuda() 156 | choose = choose.cuda() 157 | cat_id = cat_id.cuda() 158 | model = model.cuda() 159 | prior = prior.cuda() 160 | sRT = sRT.cuda() 161 | nocs = nocs.cuda() 162 | assign_mat, deltas = estimator(points, rgb, choose, cat_id, prior) 163 | loss, _, _, _, _ = criterion(assign_mat, deltas, prior, nocs, model) 164 | # estimate pose and scale 165 | inst_shape = prior + deltas 166 | assign_mat = F.softmax(assign_mat, dim=2) 167 | nocs_coords = torch.bmm(assign_mat, inst_shape) 168 | nocs_coords = nocs_coords.detach().cpu().numpy()[0] 169 | points = points.cpu().numpy()[0] 170 | # use choose to remove repeated points 171 | choose = choose.cpu().numpy()[0] 172 | _, choose = np.unique(choose, return_index=True) 173 | nocs_coords = nocs_coords[choose, :] 174 | points = points[choose, :] 175 | _, _, _, pred_sRT = estimateSimilarityTransform(nocs_coords, points) 176 | # evaluate pose 177 | cat_id = cat_id.item() 178 | if pred_sRT is not None: 179 | sRT = sRT.detach().cpu().numpy()[0] 180 | R_error, T_error, IoU = compute_sRT_errors(pred_sRT, sRT) 181 | if R_error < 5 and T_error < 0.05: 182 | strict_success[cat_id] += 1 183 | if R_error < 10 and T_error < 0.05: 184 | easy_success[cat_id] += 1 185 | if IoU < 0.1: 186 | iou_success[cat_id] += 1 187 | total_count[cat_id] += 1 188 | val_loss += loss.item() 189 | if i % 100 == 0: 190 | logger.info('Batch {0} Loss:{1:f}'.format(i, loss.item())) 191 | # compute accuracy 192 | strict_acc = 100 * (strict_success / total_count) 193 | easy_acc = 100 * (easy_success / total_count) 194 | iou_acc = 100 * (iou_success / total_count) 195 | for i in range(opt.n_cat): 196 | logger.info('{} accuracies:'.format(val_dataset.cat_names[i])) 197 | logger.info('5^o 5cm: {:4f}'.format(strict_acc[i])) 198 | logger.info('10^o 5cm: {:4f}'.format(easy_acc[i])) 199 | logger.info('IoU < 0.1: {:4f}'.format(iou_acc[i])) 200 | strict_acc = np.mean(strict_acc) 201 | easy_acc = np.mean(easy_acc) 202 | iou_acc = np.mean(iou_acc) 203 | val_loss = val_loss / val_size 204 | summary = tf.Summary(value=[tf.Summary.Value(tag='val_loss', simple_value=val_loss), 205 | tf.Summary.Value(tag='5^o5cm_acc', simple_value=strict_acc), 206 | tf.Summary.Value(tag='10^o5cm_acc', simple_value=easy_acc), 207 | tf.Summary.Value(tag='iou_acc', simple_value=iou_acc)]) 208 | tb_writer.add_summary(summary, global_step) 209 | logger.info('Epoch {0:02d} test average loss: {1:06f}'.format(epoch, val_loss)) 210 | logger.info('Overall accuracies:') 211 | logger.info('5^o 5cm: {:4f} 10^o 5cm: {:4f} IoU: {:4f}'.format(strict_acc, easy_acc, iou_acc)) 212 | logger.info('>>>>>>>>----------Epoch {:02d} test finish---------<<<<<<<<'.format(epoch)) 213 | # save model after each epoch 214 | torch.save(estimator.state_dict(), '{0}/model_{1:02d}.pth'.format(opt.result_dir, epoch)) 215 | 216 | 217 | if __name__ == '__main__': 218 | train_net() 219 | --------------------------------------------------------------------------------
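Note on the pose-recovery step in the validation loop of train_deform.py: estimateSimilarityTransform (lib/align.py) fits a scaled rigid transform that maps the predicted NOCS coordinates onto the observed points. The snippet below is a minimal NumPy sketch of such a least-squares similarity fit (Umeyama-style), assuming clean one-to-one correspondences and no outlier handling; it is illustrative only, the helper name umeyama_similarity is hypothetical, and the repository's estimateSimilarityTransform remains the authoritative implementation.
```
import numpy as np

def umeyama_similarity(src, dst):
    """Least-squares similarity transform (scale, rotation, translation) mapping
    src onto dst, both (N, 3) arrays. Returns a 4x4 matrix [[s*R, t], [0, 1]]."""
    n = src.shape[0]
    mu_src, mu_dst = src.mean(axis=0), dst.mean(axis=0)
    src_c, dst_c = src - mu_src, dst - mu_dst
    # cross-covariance of the centered point sets
    cov = dst_c.T @ src_c / n
    U, D, Vt = np.linalg.svd(cov)
    S = np.eye(3)
    if np.linalg.det(U) * np.linalg.det(Vt) < 0:
        S[2, 2] = -1.0  # flip one axis to avoid returning a reflection
    R = U @ S @ Vt
    var_src = (src_c ** 2).sum() / n
    scale = np.trace(np.diag(D) @ S) / var_src
    t = mu_dst - scale * R @ mu_src
    sRT = np.eye(4)
    sRT[:3, :3] = scale * R
    sRT[:3, 3] = t
    return sRT

# Analogous in spirit to the call in the validation loop:
# pred_sRT = umeyama_similarity(nocs_coords, points)
```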