├── LICENSE ├── assets ├── mean_points_emb.npy └── teaser.png ├── data ├── pose_dataset.py └── shape_dataset.py ├── evaluate.py ├── lib ├── align.py ├── auto_encoder.py ├── loss.py ├── network.py ├── nn_distance │ ├── chamfer_loss.py │ ├── setup.py │ └── src │ │ ├── nn_distance.cpp │ │ └── nn_distance_cuda.cu ├── pspnet.py └── utils.py ├── mean_shape.py ├── preprocess ├── pose_data.py └── shape_data.py ├── readme.md ├── tools └── tsne.py ├── train_ae.py └── train_deform.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Meng Tian 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /assets/mean_points_emb.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mentian/object-deformnet/a2dcdb87dd88912c6b51b0f693443212fde5696e/assets/mean_points_emb.npy -------------------------------------------------------------------------------- /assets/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mentian/object-deformnet/a2dcdb87dd88912c6b51b0f693443212fde5696e/assets/teaser.png -------------------------------------------------------------------------------- /data/pose_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import math 4 | import random 5 | import numpy as np 6 | import _pickle as cPickle 7 | from PIL import Image 8 | import torch.utils.data as data 9 | import torchvision.transforms as transforms 10 | from lib.utils import load_depth, get_bbox 11 | 12 | 13 | class PoseDataset(data.Dataset): 14 | def __init__(self, source, mode, data_dir, n_pts, img_size): 15 | """ 16 | Args: 17 | source: 'CAMERA', 'Real' or 'CAMERA+Real' 18 | mode: 'train' or 'test' 19 | data_dir: 20 | n_pts: number of selected foreground points 21 | img_size: square image window 22 | """ 23 | self.source = source 24 | self.mode = mode 25 | self.data_dir = data_dir 26 | self.n_pts = n_pts 27 | self.img_size = img_size 28 | 29 | assert source in ['CAMERA', 'Real', 'CAMERA+Real'] 30 | assert mode in ['train', 'test'] 31 | img_list_path = ['CAMERA/train_list.txt', 'Real/train_list.txt', 32 | 'CAMERA/val_list.txt', 'Real/test_list.txt'] 33 | model_file_path = ['obj_models/camera_train.pkl', 
'obj_models/real_train.pkl', 34 | 'obj_models/camera_val.pkl', 'obj_models/real_test.pkl'] 35 | if mode == 'train': 36 | del img_list_path[2:] 37 | del model_file_path[2:] 38 | else: 39 | del img_list_path[:2] 40 | del model_file_path[:2] 41 | if source == 'CAMERA': 42 | del img_list_path[-1] 43 | del model_file_path[-1] 44 | elif source == 'Real': 45 | del img_list_path[0] 46 | del model_file_path[0] 47 | else: 48 | # only use Real to test when source is CAMERA+Real 49 | if mode == 'test': 50 | del img_list_path[0] 51 | del model_file_path[0] 52 | 53 | img_list = [] 54 | subset_len = [] 55 | for path in img_list_path: 56 | img_list += [os.path.join(path.split('/')[0], line.rstrip('\n')) 57 | for line in open(os.path.join(data_dir, path))] 58 | subset_len.append(len(img_list)) 59 | if len(subset_len) == 2: 60 | self.subset_len = [subset_len[0], subset_len[1]-subset_len[0]] 61 | self.img_list = img_list 62 | self.length = len(self.img_list) 63 | 64 | models = {} 65 | for path in model_file_path: 66 | with open(os.path.join(data_dir, path), 'rb') as f: 67 | models.update(cPickle.load(f)) 68 | self.models = models 69 | 70 | # meta info for re-label mug category 71 | with open(os.path.join(data_dir, 'obj_models/mug_meta.pkl'), 'rb') as f: 72 | self.mug_meta = cPickle.load(f) 73 | 74 | self.mean_shapes = np.load('assets/mean_points_emb.npy') 75 | self.cat_names = ['bottle', 'bowl', 'camera', 'can', 'laptop', 'mug'] 76 | self.camera_intrinsics = [577.5, 577.5, 319.5, 239.5] # [fx, fy, cx, cy] 77 | self.real_intrinsics = [591.0125, 590.16775, 322.525, 244.11084] 78 | self.sym_ids = [0, 1, 3] # 0-indexed 79 | self.norm_scale = 1000.0 # normalization scale 80 | self.xmap = np.array([[i for i in range(640)] for j in range(480)]) 81 | self.ymap = np.array([[j for i in range(640)] for j in range(480)]) 82 | self.shift_range = 0.01 83 | self.colorjitter = transforms.ColorJitter(0.2, 0.2, 0.2, 0.05) 84 | self.transform = transforms.Compose([transforms.ToTensor(), 85 | transforms.Normalize(mean=[0.485, 0.456, 0.406], 86 | std=[0.229, 0.224, 0.225])]) 87 | print('{} images found.'.format(self.length)) 88 | print('{} models loaded.'.format(len(self.models))) 89 | 90 | def __len__(self): 91 | return self.length 92 | 93 | def __getitem__(self, index): 94 | img_path = os.path.join(self.data_dir, self.img_list[index]) 95 | rgb = cv2.imread(img_path + '_color.png')[:, :, :3] 96 | rgb = rgb[:, :, ::-1] 97 | depth = load_depth(img_path) 98 | mask = cv2.imread(img_path + '_mask.png')[:, :, 2] 99 | coord = cv2.imread(img_path + '_coord.png')[:, :, :3] 100 | coord = coord[:, :, (2, 1, 0)] 101 | coord = np.array(coord, dtype=np.float32) / 255 102 | coord[:, :, 2] = 1 - coord[:, :, 2] 103 | with open(img_path + '_label.pkl', 'rb') as f: 104 | gts = cPickle.load(f) 105 | if 'CAMERA' in img_path.split('/'): 106 | cam_fx, cam_fy, cam_cx, cam_cy = self.camera_intrinsics 107 | else: 108 | cam_fx, cam_fy, cam_cx, cam_cy = self.real_intrinsics 109 | 110 | # select one foreground object 111 | idx = random.randint(0, len(gts['instance_ids'])-1) 112 | inst_id = gts['instance_ids'][idx] 113 | rmin, rmax, cmin, cmax = get_bbox(gts['bboxes'][idx]) 114 | # sample points 115 | mask = np.equal(mask, inst_id) 116 | mask = np.logical_and(mask, depth > 0) 117 | choose = mask[rmin:rmax, cmin:cmax].flatten().nonzero()[0] 118 | if len(choose) > self.n_pts: 119 | c_mask = np.zeros(len(choose), dtype=int) 120 | c_mask[:self.n_pts] = 1 121 | np.random.shuffle(c_mask) 122 | choose = choose[c_mask.nonzero()] 123 | else: 124 | choose = 
np.pad(choose, (0, self.n_pts-len(choose)), 'wrap') 125 | depth_masked = depth[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis] 126 | xmap_masked = self.xmap[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis] 127 | ymap_masked = self.ymap[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis] 128 | pt2 = depth_masked / self.norm_scale 129 | pt0 = (xmap_masked - cam_cx) * pt2 / cam_fx 130 | pt1 = (ymap_masked - cam_cy) * pt2 / cam_fy 131 | points = np.concatenate((pt0, pt1, pt2), axis=1) 132 | nocs = coord[rmin:rmax, cmin:cmax, :].reshape((-1, 3))[choose, :] - 0.5 133 | # resize cropped image to standard size and adjust 'choose' accordingly 134 | rgb = rgb[rmin:rmax, cmin:cmax, :] 135 | rgb = cv2.resize(rgb, (self.img_size, self.img_size), interpolation=cv2.INTER_LINEAR) 136 | crop_w = rmax - rmin 137 | ratio = self.img_size / crop_w 138 | col_idx = choose % crop_w 139 | row_idx = choose // crop_w 140 | choose = (np.floor(row_idx * ratio) * self.img_size + np.floor(col_idx * ratio)).astype(np.int64) 141 | # label 142 | cat_id = gts['class_ids'][idx] - 1 # convert to 0-indexed 143 | model = self.models[gts['model_list'][idx]].astype(np.float32) # 1024 points 144 | prior = self.mean_shapes[cat_id].astype(np.float32) 145 | scale = gts['scales'][idx] 146 | rotation = gts['rotations'][idx] 147 | translation = gts['translations'][idx] 148 | # data augmentation 149 | if self.mode == 'train': 150 | # color jitter 151 | rgb = self.colorjitter(Image.fromarray(np.uint8(rgb))) 152 | rgb = np.array(rgb) 153 | # point shift 154 | add_t = np.random.uniform(-self.shift_range, self.shift_range, (1, 3)) 155 | translation = translation + add_t[0] 156 | # point jitter 157 | add_t = add_t + np.clip(0.001*np.random.randn(points.shape[0], 3), -0.005, 0.005) 158 | points = np.add(points, add_t) 159 | rgb = self.transform(rgb) 160 | points = points.astype(np.float32) 161 | # adjust nocs coords for mug category 162 | if cat_id == 5: 163 | T0 = self.mug_meta[gts['model_list'][idx]][0] 164 | s0 = self.mug_meta[gts['model_list'][idx]][1] 165 | nocs = s0 * (nocs + T0) 166 | # map ambiguous rotation to canonical rotation 167 | if cat_id in self.sym_ids: 168 | rotation = gts['rotations'][idx] 169 | # assume continuous axis rotation symmetry 170 | theta_x = rotation[0, 0] + rotation[2, 2] 171 | theta_y = rotation[0, 2] - rotation[2, 0] 172 | r_norm = math.sqrt(theta_x**2 + theta_y**2) 173 | s_map = np.array([[theta_x/r_norm, 0.0, -theta_y/r_norm], 174 | [0.0, 1.0, 0.0 ], 175 | [theta_y/r_norm, 0.0, theta_x/r_norm]]) 176 | rotation = rotation @ s_map 177 | nocs = nocs @ s_map 178 | sRT = np.identity(4, dtype=np.float32) 179 | sRT[:3, :3] = scale * rotation 180 | sRT[:3, 3] = translation 181 | nocs = nocs.astype(np.float32) 182 | 183 | return points, rgb, choose, cat_id, model, prior, sRT, nocs 184 | -------------------------------------------------------------------------------- /data/shape_dataset.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import numpy as np 3 | import torch.utils.data as data 4 | 5 | 6 | class ShapeDataset(data.Dataset): 7 | def __init__(self, h5_file, mode, n_points=2048, augment=False): 8 | assert (mode == 'train' or mode == 'val'), 'Mode must be "train" or "val".' 
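        # The h5 file is expected to hold one group per mode ('train'/'val'), each with
        # a 'data' array of per-shape point clouds, a 'label' array of 1-indexed
        # category ids, and a 'len' attribute giving the number of shapes.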
9 | self.mode = mode 10 | self.n_points = n_points 11 | self.augment = augment 12 | # load data from h5py file 13 | with h5py.File(h5_file, 'r') as f: 14 | self.length = f[self.mode].attrs['len'] 15 | self.data = f[self.mode]['data'][:] 16 | self.label = f[self.mode]['label'][:] 17 | # augmentation parameters 18 | self.sigma = 0.01 19 | self.clip = 0.02 20 | self.shift_range = 0.02 21 | 22 | def __len__(self): 23 | return self.length 24 | 25 | def __getitem__(self, index): 26 | xyz = self.data[index] 27 | label = self.label[index] - 1 # data saved indexed from 1 28 | # randomly downsample 29 | np_data = xyz.shape[0] 30 | assert np_data >= self.n_points, 'Not enough points in shape.' 31 | idx = np.random.choice(np_data, self.n_points) 32 | xyz = xyz[idx, :] 33 | # data augmentation 34 | if self.augment: 35 | jitter = np.clip(self.sigma*np.random.randn(self.n_points, 3), -self.clip, self.clip) 36 | xyz[:, :3] += jitter 37 | shift = np.random.uniform(-self.shift_range, self.shift_range, (1, 3)) 38 | xyz[:, :3] += shift 39 | return xyz, label 40 | -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import argparse 4 | import cv2 5 | import glob 6 | import numpy as np 7 | from tqdm import tqdm 8 | import _pickle as cPickle 9 | import torch 10 | import torch.nn.functional as F 11 | import torchvision.transforms as transforms 12 | from lib.network import DeformNet 13 | from lib.align import estimateSimilarityTransform 14 | from lib.utils import load_depth, get_bbox, compute_mAP, plot_mAP 15 | 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--data', type=str, default='val', help='val, real_test') 19 | parser.add_argument('--data_dir', type=str, default='data', help='data directory') 20 | parser.add_argument('--n_cat', type=int, default=6, help='number of object categories') 21 | parser.add_argument('--nv_prior', type=int, default=1024, help='number of vertices in shape priors') 22 | parser.add_argument('--model', type=str, default='results/camera/model_50.pth', help='resume from saved model') 23 | parser.add_argument('--n_pts', type=int, default=1024, help='number of foreground points') 24 | parser.add_argument('--img_size', type=int, default=192, help='cropped image size') 25 | parser.add_argument('--gpu', type=str, default='1', help='GPU to use') 26 | opt = parser.parse_args() 27 | 28 | mean_shapes = np.load('assets/mean_points_emb.npy') 29 | 30 | assert opt.data in ['val', 'real_test'] 31 | if opt.data == 'val': 32 | result_dir = 'results/eval_camera' 33 | file_path = 'CAMERA/val_list.txt' 34 | cam_fx, cam_fy, cam_cx, cam_cy = 577.5, 577.5, 319.5, 239.5 35 | else: 36 | result_dir = 'results/eval_real' 37 | file_path = 'Real/test_list.txt' 38 | cam_fx, cam_fy, cam_cx, cam_cy = 591.0125, 590.16775, 322.525, 244.11084 39 | 40 | if not os.path.exists(result_dir): 41 | os.makedirs(result_dir) 42 | 43 | xmap = np.array([[i for i in range(640)] for j in range(480)]) 44 | ymap = np.array([[j for i in range(640)] for j in range(480)]) 45 | norm_scale = 1000.0 46 | norm_color = transforms.Compose( 47 | [transforms.ToTensor(), 48 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])] 49 | ) 50 | 51 | 52 | def detect(): 53 | # resume model 54 | os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu 55 | estimator = DeformNet(opt.n_cat, opt.nv_prior) 56 | estimator.cuda() 57 | 
estimator.load_state_dict(torch.load(opt.model)) 58 | estimator.eval() 59 | # get test data list 60 | img_list = [os.path.join(file_path.split('/')[0], line.rstrip('\n')) 61 | for line in open(os.path.join(opt.data_dir, file_path))] 62 | # frame by frame test 63 | t_inference = 0.0 64 | t_umeyama = 0.0 65 | inst_count = 0 66 | img_count = 0 67 | t_start = time.time() 68 | for path in tqdm(img_list): 69 | img_path = os.path.join(opt.data_dir, path) 70 | raw_rgb = cv2.imread(img_path + '_color.png')[:, :, :3] 71 | raw_rgb = raw_rgb[:, :, ::-1] 72 | raw_depth = load_depth(img_path) 73 | # load mask-rcnn detection results 74 | img_path_parsing = img_path.split('/') 75 | mrcnn_path = os.path.join('results/mrcnn_results', opt.data, 'results_{}_{}_{}.pkl'.format( 76 | opt.data.split('_')[-1], img_path_parsing[-2], img_path_parsing[-1])) 77 | with open(mrcnn_path, 'rb') as f: 78 | mrcnn_result = cPickle.load(f) 79 | num_insts = len(mrcnn_result['class_ids']) 80 | f_sRT = np.zeros((num_insts, 4, 4), dtype=float) 81 | f_size = np.zeros((num_insts, 3), dtype=float) 82 | # prepare frame data 83 | f_points, f_rgb, f_choose, f_catId, f_prior = [], [], [], [], [] 84 | valid_inst = [] 85 | for i in range(num_insts): 86 | cat_id = mrcnn_result['class_ids'][i] - 1 87 | prior = mean_shapes[cat_id] 88 | rmin, rmax, cmin, cmax = get_bbox(mrcnn_result['rois'][i]) 89 | mask = np.logical_and(mrcnn_result['masks'][:, :, i], raw_depth > 0) 90 | choose = mask[rmin:rmax, cmin:cmax].flatten().nonzero()[0] 91 | # no depth observation for background in CAMERA dataset 92 | # beacuase of how we compute the bbox in function get_bbox 93 | # there might be a chance that no foreground points after cropping the mask 94 | # cuased by false positive of mask_rcnn, most of the regions are background 95 | if len(choose) < 32: 96 | f_sRT[i] = np.identity(4, dtype=float) 97 | f_size[i] = 2 * np.amax(np.abs(prior), axis=0) 98 | continue 99 | else: 100 | valid_inst.append(i) 101 | # process objects with valid depth observation 102 | if len(choose) > opt.n_pts: 103 | c_mask = np.zeros(len(choose), dtype=int) 104 | c_mask[:opt.n_pts] = 1 105 | np.random.shuffle(c_mask) 106 | choose = choose[c_mask.nonzero()] 107 | else: 108 | choose = np.pad(choose, (0, opt.n_pts-len(choose)), 'wrap') 109 | depth_masked = raw_depth[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis] 110 | xmap_masked = xmap[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis] 111 | ymap_masked = ymap[rmin:rmax, cmin:cmax].flatten()[choose][:, np.newaxis] 112 | pt2 = depth_masked / norm_scale 113 | pt0 = (xmap_masked - cam_cx) * pt2 / cam_fx 114 | pt1 = (ymap_masked - cam_cy) * pt2 / cam_fy 115 | points = np.concatenate((pt0, pt1, pt2), axis=1) 116 | rgb = raw_rgb[rmin:rmax, cmin:cmax, :] 117 | rgb = cv2.resize(rgb, (opt.img_size, opt.img_size), interpolation=cv2.INTER_LINEAR) 118 | rgb = norm_color(rgb) 119 | crop_w = rmax - rmin 120 | ratio = opt.img_size / crop_w 121 | col_idx = choose % crop_w 122 | row_idx = choose // crop_w 123 | choose = (np.floor(row_idx * ratio) * opt.img_size + np.floor(col_idx * ratio)).astype(np.int64) 124 | # concatenate instances 125 | f_points.append(points) 126 | f_rgb.append(rgb) 127 | f_choose.append(choose) 128 | f_catId.append(cat_id) 129 | f_prior.append(prior) 130 | if len(valid_inst): 131 | f_points = torch.cuda.FloatTensor(f_points) 132 | f_rgb = torch.stack(f_rgb, dim=0).cuda() 133 | f_choose = torch.cuda.LongTensor(f_choose) 134 | f_catId = torch.cuda.LongTensor(f_catId) 135 | f_prior = torch.cuda.FloatTensor(f_prior) 136 | 
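            # all valid instances in this frame are batched together and estimated
            # in a single forward pass of the network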
# inference 137 | torch.cuda.synchronize() 138 | t_now = time.time() 139 | assign_mat, deltas = estimator(f_points, f_rgb, f_choose, f_catId, f_prior) 140 | # assign_mat, deltas = estimator(f_rgb, f_choose, f_catId, f_prior) 141 | inst_shape = f_prior + deltas 142 | assign_mat = F.softmax(assign_mat, dim=2) 143 | f_coords = torch.bmm(assign_mat, inst_shape) # bs x n_pts x 3 144 | torch.cuda.synchronize() 145 | t_inference += (time.time() - t_now) 146 | f_coords = f_coords.detach().cpu().numpy() 147 | f_points = f_points.cpu().numpy() 148 | f_choose = f_choose.cpu().numpy() 149 | f_insts = inst_shape.detach().cpu().numpy() 150 | t_now = time.time() 151 | for i in range(len(valid_inst)): 152 | inst_idx = valid_inst[i] 153 | choose = f_choose[i] 154 | _, choose = np.unique(choose, return_index=True) 155 | nocs_coords = f_coords[i, choose, :] 156 | f_size[inst_idx] = 2 * np.amax(np.abs(f_insts[i]), axis=0) 157 | points = f_points[i, choose, :] 158 | _, _, _, pred_sRT = estimateSimilarityTransform(nocs_coords, points) 159 | if pred_sRT is None: 160 | pred_sRT = np.identity(4, dtype=float) 161 | f_sRT[inst_idx] = pred_sRT 162 | t_umeyama += (time.time() - t_now) 163 | img_count += 1 164 | inst_count += len(valid_inst) 165 | 166 | # save results 167 | result = {} 168 | with open(img_path + '_label.pkl', 'rb') as f: 169 | gts = cPickle.load(f) 170 | result['gt_class_ids'] = gts['class_ids'] 171 | result['gt_bboxes'] = gts['bboxes'] 172 | result['gt_RTs'] = gts['poses'] 173 | result['gt_scales'] = gts['size'] 174 | result['gt_handle_visibility'] = gts['handle_visibility'] 175 | 176 | result['pred_class_ids'] = mrcnn_result['class_ids'] 177 | result['pred_bboxes'] = mrcnn_result['rois'] 178 | result['pred_scores'] = mrcnn_result['scores'] 179 | result['pred_RTs'] = f_sRT 180 | result['pred_scales'] = f_size 181 | 182 | image_short_path = '_'.join(img_path_parsing[-3:]) 183 | save_path = os.path.join(result_dir, 'results_{}.pkl'.format(image_short_path)) 184 | with open(save_path, 'wb') as f: 185 | cPickle.dump(result, f) 186 | # write statistics 187 | fw = open('{0}/eval_logs.txt'.format(result_dir), 'w') 188 | messages = [] 189 | messages.append("Total images: {}".format(len(img_list))) 190 | messages.append("Valid images: {}, Total instances: {}, Average: {:.2f}/image".format( 191 | img_count, inst_count, inst_count/img_count)) 192 | messages.append("Inference time: {:06f} Average: {:06f}/image".format(t_inference, t_inference/img_count)) 193 | messages.append("Umeyama time: {:06f} Average: {:06f}/image".format(t_umeyama, t_umeyama/img_count)) 194 | messages.append("Total time: {:06f}".format(time.time() - t_start)) 195 | for msg in messages: 196 | print(msg) 197 | fw.write(msg + '\n') 198 | fw.close() 199 | 200 | 201 | def evaluate(): 202 | degree_thres_list = list(range(0, 61, 1)) 203 | shift_thres_list = [i / 2 for i in range(21)] 204 | iou_thres_list = [i / 100 for i in range(101)] 205 | # predictions 206 | result_pkl_list = glob.glob(os.path.join(result_dir, 'results_*.pkl')) 207 | result_pkl_list = sorted(result_pkl_list) 208 | assert len(result_pkl_list) 209 | pred_results = [] 210 | for pkl_path in result_pkl_list: 211 | with open(pkl_path, 'rb') as f: 212 | result = cPickle.load(f) 213 | if 'gt_handle_visibility' not in result: 214 | result['gt_handle_visibility'] = np.ones_like(result['gt_class_ids']) 215 | else: 216 | assert len(result['gt_handle_visibility']) == len(result['gt_class_ids']), "{} {}".format( 217 | result['gt_handle_visibility'], result['gt_class_ids']) 218 | if 
type(result) is list: 219 | pred_results += result 220 | elif type(result) is dict: 221 | pred_results.append(result) 222 | else: 223 | assert False 224 | # To be consistent with NOCS, set use_matches_for_pose=True for mAP evaluation 225 | iou_aps, pose_aps, iou_acc, pose_acc = compute_mAP(pred_results, result_dir, degree_thres_list, shift_thres_list, 226 | iou_thres_list, iou_pose_thres=0.1, use_matches_for_pose=True) 227 | # metric 228 | fw = open('{0}/eval_logs.txt'.format(result_dir), 'a') 229 | iou_25_idx = iou_thres_list.index(0.25) 230 | iou_50_idx = iou_thres_list.index(0.5) 231 | iou_75_idx = iou_thres_list.index(0.75) 232 | degree_05_idx = degree_thres_list.index(5) 233 | degree_10_idx = degree_thres_list.index(10) 234 | shift_02_idx = shift_thres_list.index(2) 235 | shift_05_idx = shift_thres_list.index(5) 236 | messages = [] 237 | messages.append('mAP:') 238 | messages.append('3D IoU at 25: {:.1f}'.format(iou_aps[-1, iou_25_idx] * 100)) 239 | messages.append('3D IoU at 50: {:.1f}'.format(iou_aps[-1, iou_50_idx] * 100)) 240 | messages.append('3D IoU at 75: {:.1f}'.format(iou_aps[-1, iou_75_idx] * 100)) 241 | messages.append('5 degree, 2cm: {:.1f}'.format(pose_aps[-1, degree_05_idx, shift_02_idx] * 100)) 242 | messages.append('5 degree, 5cm: {:.1f}'.format(pose_aps[-1, degree_05_idx, shift_05_idx] * 100)) 243 | messages.append('10 degree, 2cm: {:.1f}'.format(pose_aps[-1, degree_10_idx, shift_02_idx] * 100)) 244 | messages.append('10 degree, 5cm: {:.1f}'.format(pose_aps[-1, degree_10_idx, shift_05_idx] * 100)) 245 | messages.append('Acc:') 246 | messages.append('3D IoU at 25: {:.1f}'.format(iou_acc[-1, iou_25_idx] * 100)) 247 | messages.append('3D IoU at 50: {:.1f}'.format(iou_acc[-1, iou_50_idx] * 100)) 248 | messages.append('3D IoU at 75: {:.1f}'.format(iou_acc[-1, iou_75_idx] * 100)) 249 | messages.append('5 degree, 2cm: {:.1f}'.format(pose_acc[-1, degree_05_idx, shift_02_idx] * 100)) 250 | messages.append('5 degree, 5cm: {:.1f}'.format(pose_acc[-1, degree_05_idx, shift_05_idx] * 100)) 251 | messages.append('10 degree, 2cm: {:.1f}'.format(pose_acc[-1, degree_10_idx, shift_02_idx] * 100)) 252 | messages.append('10 degree, 5cm: {:.1f}'.format(pose_acc[-1, degree_10_idx, shift_05_idx] * 100)) 253 | for msg in messages: 254 | print(msg) 255 | fw.write(msg + '\n') 256 | fw.close() 257 | # load NOCS results 258 | pkl_path = os.path.join('results/nocs_results', opt.data, 'mAP_Acc.pkl') 259 | with open(pkl_path, 'rb') as f: 260 | nocs_results = cPickle.load(f) 261 | nocs_iou_aps = nocs_results['iou_aps'][-1, :] 262 | nocs_pose_aps = nocs_results['pose_aps'][-1, :, :] 263 | iou_aps = np.concatenate((iou_aps, nocs_iou_aps[None, :]), axis=0) 264 | pose_aps = np.concatenate((pose_aps, nocs_pose_aps[None, :, :]), axis=0) 265 | # plot 266 | plot_mAP(iou_aps, pose_aps, result_dir, iou_thres_list, degree_thres_list, shift_thres_list) 267 | 268 | 269 | if __name__ == '__main__': 270 | print('Detecting ...') 271 | detect() 272 | print('Evaluating ...') 273 | evaluate() 274 | -------------------------------------------------------------------------------- /lib/align.py: -------------------------------------------------------------------------------- 1 | """ 2 | RANSAC for Similarity Transformation Estimation 3 | Modified from https://github.com/hughw19/NOCS_CVPR2019 4 | Originally Written by Srinath Sridhar 5 | """ 6 | import time 7 | import numpy as np 8 | 9 | 10 | def estimateSimilarityUmeyama(SourceHom, TargetHom): 11 | # Copy of original paper is at: 
http://web.stanford.edu/class/cs273/refs/umeyama.pdf 12 | SourceCentroid = np.mean(SourceHom[:3, :], axis=1) 13 | TargetCentroid = np.mean(TargetHom[:3, :], axis=1) 14 | nPoints = SourceHom.shape[1] 15 | CenteredSource = SourceHom[:3, :] - np.tile(SourceCentroid, (nPoints, 1)).transpose() 16 | CenteredTarget = TargetHom[:3, :] - np.tile(TargetCentroid, (nPoints, 1)).transpose() 17 | CovMatrix = np.matmul(CenteredTarget, np.transpose(CenteredSource)) / nPoints 18 | if np.isnan(CovMatrix).any(): 19 | print('nPoints:', nPoints) 20 | print(SourceHom.shape) 21 | print(TargetHom.shape) 22 | raise RuntimeError('There are NANs in the input.') 23 | 24 | U, D, Vh = np.linalg.svd(CovMatrix, full_matrices=True) 25 | d = (np.linalg.det(U) * np.linalg.det(Vh)) < 0.0 26 | if d: 27 | D[-1] = -D[-1] 28 | U[:, -1] = -U[:, -1] 29 | # rotation 30 | Rotation = np.matmul(U, Vh) 31 | # scale 32 | varP = np.var(SourceHom[:3, :], axis=1).sum() 33 | Scale = 1 / varP * np.sum(D) 34 | # translation 35 | Translation = TargetHom[:3, :].mean(axis=1) - SourceHom[:3, :].mean(axis=1).dot(Scale*Rotation.T) 36 | # transformation matrix 37 | OutTransform = np.identity(4) 38 | OutTransform[:3, :3] = Scale * Rotation 39 | OutTransform[:3, 3] = Translation 40 | 41 | return Scale, Rotation, Translation, OutTransform 42 | 43 | 44 | def estimateSimilarityTransform(source: np.array, target: np.array, verbose=False): 45 | """ Add RANSAC algorithm to account for outliers. 46 | 47 | """ 48 | assert source.shape[0] == target.shape[0], 'Source and Target must have same number of points.' 49 | SourceHom = np.transpose(np.hstack([source, np.ones([source.shape[0], 1])])) 50 | TargetHom = np.transpose(np.hstack([target, np.ones([target.shape[0], 1])])) 51 | # Auto-parameter selection based on source heuristics 52 | # Assume source is object model or gt nocs map, which is of high quality 53 | SourceCentroid = np.mean(SourceHom[:3, :], axis=1) 54 | nPoints = SourceHom.shape[1] 55 | CenteredSource = SourceHom[:3, :] - np.tile(SourceCentroid, (nPoints, 1)).transpose() 56 | SourceDiameter = 2 * np.amax(np.linalg.norm(CenteredSource, axis=0)) 57 | InlierT = SourceDiameter / 10.0 # 0.1 of source diameter 58 | maxIter = 128 59 | confidence = 0.99 60 | 61 | if verbose: 62 | print('Inlier threshold: ', InlierT) 63 | print('Max number of iterations: ', maxIter) 64 | 65 | BestInlierRatio = 0 66 | BestInlierIdx = np.arange(nPoints) 67 | for i in range(0, maxIter): 68 | # Pick 5 random (but corresponding) points from source and target 69 | RandIdx = np.random.randint(nPoints, size=5) 70 | Scale, _, _, OutTransform = estimateSimilarityUmeyama(SourceHom[:, RandIdx], TargetHom[:, RandIdx]) 71 | PassThreshold = Scale * InlierT # propagate inlier threshold to target scale 72 | Diff = TargetHom - np.matmul(OutTransform, SourceHom) 73 | ResidualVec = np.linalg.norm(Diff[:3, :], axis=0) 74 | InlierIdx = np.where(ResidualVec < PassThreshold)[0] 75 | nInliers = InlierIdx.shape[0] 76 | InlierRatio = nInliers / nPoints 77 | # update best hypothesis 78 | if InlierRatio > BestInlierRatio: 79 | BestInlierRatio = InlierRatio 80 | BestInlierIdx = InlierIdx 81 | if verbose: 82 | print('Iteration: ', i) 83 | print('Inlier ratio: ', BestInlierRatio) 84 | # early break 85 | if (1 - (1 - BestInlierRatio ** 5) ** i) > confidence: 86 | break 87 | 88 | if(BestInlierRatio < 0.1): 89 | print('[ WARN ] - Something is wrong. 
Small BestInlierRatio: ', BestInlierRatio) 90 | return None, None, None, None 91 | 92 | SourceInliersHom = SourceHom[:, BestInlierIdx] 93 | TargetInliersHom = TargetHom[:, BestInlierIdx] 94 | Scale, Rotation, Translation, OutTransform = estimateSimilarityUmeyama(SourceInliersHom, TargetInliersHom) 95 | 96 | if verbose: 97 | print('BestInlierRatio:', BestInlierRatio) 98 | print('Rotation:\n', Rotation) 99 | print('Translation:\n', Translation) 100 | print('Scale:', Scale) 101 | 102 | return Scale, Rotation, Translation, OutTransform 103 | 104 | 105 | def backproject(depth, intrinsics, instance_mask): 106 | """ Back-projection, use opencv camera coordinate frame. 107 | 108 | """ 109 | cam_fx = intrinsics[0, 0] 110 | cam_fy = intrinsics[1, 1] 111 | cam_cx = intrinsics[0, 2] 112 | cam_cy = intrinsics[1, 2] 113 | 114 | non_zero_mask = (depth > 0) 115 | final_instance_mask = np.logical_and(instance_mask, non_zero_mask) 116 | idxs = np.where(final_instance_mask) 117 | 118 | z = depth[idxs[0], idxs[1]] 119 | x = (idxs[1] - cam_cx) * z / cam_fx 120 | y = (idxs[0] - cam_cy) * z / cam_fy 121 | pts = np.stack((x, y, z), axis=1) 122 | 123 | return pts, idxs 124 | 125 | 126 | def align_nocs_to_depth(masks, coords, depth, intrinsics, instance_ids, img_path, verbose=False): 127 | num_instances = len(instance_ids) 128 | error_messages = '' 129 | elapses = [] 130 | scales = np.zeros(num_instances) 131 | rotations = np.zeros((num_instances, 3, 3)) 132 | translations = np.zeros((num_instances, 3)) 133 | 134 | for i in range(num_instances): 135 | mask = masks[:, :, i] 136 | coord = coords[:, :, i, :] 137 | pts, idxs = backproject(depth, intrinsics, mask) 138 | coord_pts = coord[idxs[0], idxs[1], :] - 0.5 139 | try: 140 | start = time.time() 141 | s, R, T, outtransform = estimateSimilarityTransform(coord_pts, pts, False) 142 | elapsed = time.time() - start 143 | if verbose: 144 | print('elapsed: ', elapsed) 145 | elapses.append(elapsed) 146 | except Exception as e: 147 | message = '[ Error ] aligning instance {} in {} fails. 
Message: {}.'.format(instance_ids[i], img_path, str(e)) 148 | print(message) 149 | error_messages += message + '\n' 150 | s = 1.0 151 | R = np.eye(3) 152 | T = np.zeros(3) 153 | outtransform = np.identity(4, dtype=np.float32) 154 | 155 | scales[i] = s / 1000.0 156 | rotations[i, :, :] = R 157 | translations[i, :] = T / 1000.0 158 | 159 | return scales, rotations, translations, error_messages, elapses 160 | -------------------------------------------------------------------------------- /lib/auto_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class PointCloudEncoder(nn.Module): 7 | def __init__(self, emb_dim): 8 | super(PointCloudEncoder, self).__init__() 9 | self.conv1 = nn.Conv1d(3, 64, 1) 10 | self.conv2 = nn.Conv1d(64, 128, 1) 11 | self.conv3 = nn.Conv1d(256, 256, 1) 12 | self.conv4 = nn.Conv1d(256, 1024, 1) 13 | self.fc = nn.Linear(1024, emb_dim) 14 | 15 | def forward(self, xyz): 16 | """ 17 | Args: 18 | xyz: (B, 3, N) 19 | 20 | """ 21 | np = xyz.size()[2] 22 | x = F.relu(self.conv1(xyz)) 23 | x = F.relu(self.conv2(x)) 24 | global_feat = F.adaptive_max_pool1d(x, 1) 25 | x = torch.cat((x, global_feat.repeat(1, 1, np)), dim=1) 26 | x = F.relu(self.conv3(x)) 27 | x = F.relu(self.conv4(x)) 28 | x = torch.squeeze(F.adaptive_max_pool1d(x, 1), dim=2) 29 | embedding = self.fc(x) 30 | return embedding 31 | 32 | 33 | class PointCloudDecoder(nn.Module): 34 | def __init__(self, emb_dim, n_pts): 35 | super(PointCloudDecoder, self).__init__() 36 | self.fc1 = nn.Linear(emb_dim, 512) 37 | self.fc2 = nn.Linear(512, 1024) 38 | self.fc3 = nn.Linear(1024, 3*n_pts) 39 | 40 | def forward(self, embedding): 41 | """ 42 | Args: 43 | embedding: (B, 512) 44 | 45 | """ 46 | bs = embedding.size()[0] 47 | out = F.relu(self.fc1(embedding)) 48 | out = F.relu(self.fc2(out)) 49 | out = self.fc3(out) 50 | out_pc = out.view(bs, -1, 3) 51 | return out_pc 52 | 53 | 54 | class PointCloudAE(nn.Module): 55 | def __init__(self, emb_dim=512, n_pts=1024): 56 | super(PointCloudAE, self).__init__() 57 | self.encoder = PointCloudEncoder(emb_dim) 58 | self.decoder = PointCloudDecoder(emb_dim, n_pts) 59 | 60 | def forward(self, in_pc, emb=None): 61 | """ 62 | Args: 63 | in_pc: (B, N, 3) 64 | emb: (B, 512) 65 | 66 | Returns: 67 | emb: (B, emb_dim) 68 | out_pc: (B, n_pts, 3) 69 | 70 | """ 71 | if emb is None: 72 | xyz = in_pc.permute(0, 2, 1) 73 | emb = self.encoder(xyz) 74 | out_pc = self.decoder(emb) 75 | return emb, out_pc 76 | -------------------------------------------------------------------------------- /lib/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from .nn_distance.chamfer_loss import ChamferLoss 5 | 6 | 7 | class Loss(nn.Module): 8 | """ Loss for training DeformNet. 9 | Use NOCS coords to supervise training. 
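    The total loss is a weighted sum of four terms: a smooth-L1 correspondence loss
    between the soft-assigned coordinates and the ground-truth NOCS coordinates, a
    Chamfer distance between the deformed prior and the instance model, an entropy
    term that encourages peaked assignment distributions, and an L2 regularizer on
    the predicted deformation field.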
10 | """ 11 | def __init__(self, corr_wt, cd_wt, entropy_wt, deform_wt): 12 | super(Loss, self).__init__() 13 | self.threshold = 0.1 14 | self.chamferloss = ChamferLoss() 15 | self.corr_wt = corr_wt 16 | self.cd_wt = cd_wt 17 | self.entropy_wt = entropy_wt 18 | self.deform_wt = deform_wt 19 | 20 | def forward(self, assign_mat, deltas, prior, nocs, model): 21 | """ 22 | Args: 23 | assign_mat: bs x n_pts x nv 24 | deltas: bs x nv x 3 25 | prior: bs x nv x 3 26 | """ 27 | inst_shape = prior + deltas 28 | # smooth L1 loss for correspondences 29 | soft_assign = F.softmax(assign_mat, dim=2) 30 | coords = torch.bmm(soft_assign, inst_shape) # bs x n_pts x 3 31 | diff = torch.abs(coords - nocs) 32 | less = torch.pow(diff, 2) / (2.0 * self.threshold) 33 | higher = diff - self.threshold / 2.0 34 | corr_loss = torch.where(diff > self.threshold, higher, less) 35 | corr_loss = torch.mean(torch.sum(corr_loss, dim=2)) 36 | corr_loss = self.corr_wt * corr_loss 37 | # entropy loss to encourage peaked distribution 38 | log_assign = F.log_softmax(assign_mat, dim=2) 39 | entropy_loss = torch.mean(-torch.sum(soft_assign * log_assign, 2)) 40 | entropy_loss = self.entropy_wt * entropy_loss 41 | # cd-loss for instance reconstruction 42 | cd_loss, _, _ = self.chamferloss(inst_shape, model) 43 | cd_loss = self.cd_wt * cd_loss 44 | # L2 regularizations on deformation 45 | deform_loss = torch.norm(deltas, p=2, dim=2).mean() 46 | deform_loss = self.deform_wt * deform_loss 47 | # total loss 48 | total_loss = corr_loss + entropy_loss + cd_loss + deform_loss 49 | return total_loss, corr_loss, cd_loss, entropy_loss, deform_loss 50 | -------------------------------------------------------------------------------- /lib/network.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from lib.pspnet import PSPNet 4 | 5 | 6 | class DeformNet(nn.Module): 7 | def __init__(self, n_cat=6, nv_prior=1024): 8 | super(DeformNet, self).__init__() 9 | self.n_cat = n_cat 10 | self.psp = PSPNet(bins=(1, 2, 3, 6), backend='resnet18') 11 | self.instance_color = nn.Sequential( 12 | nn.Conv1d(32, 64, 1), 13 | nn.ReLU(), 14 | ) 15 | self.instance_geometry = nn.Sequential( 16 | nn.Conv1d(3, 64, 1), 17 | nn.ReLU(), 18 | nn.Conv1d(64, 64, 1), 19 | nn.ReLU(), 20 | nn.Conv1d(64, 64, 1), 21 | nn.ReLU(), 22 | ) 23 | self.instance_global = nn.Sequential( 24 | nn.Conv1d(128, 128, 1), 25 | nn.ReLU(), 26 | nn.Conv1d(128, 1024, 1), 27 | nn.ReLU(), 28 | nn.AdaptiveAvgPool1d(1), 29 | ) 30 | self.category_local = nn.Sequential( 31 | nn.Conv1d(3, 64, 1), 32 | nn.ReLU(), 33 | nn.Conv1d(64, 64, 1), 34 | nn.ReLU(), 35 | nn.Conv1d(64, 64, 1), 36 | nn.ReLU(), 37 | ) 38 | self.category_global = nn.Sequential( 39 | nn.Conv1d(64, 128, 1), 40 | nn.ReLU(), 41 | nn.Conv1d(128, 1024, 1), 42 | nn.ReLU(), 43 | nn.AdaptiveAvgPool1d(1), 44 | ) 45 | self.assignment = nn.Sequential( 46 | nn.Conv1d(2176, 512, 1), 47 | nn.ReLU(), 48 | nn.Conv1d(512, 256, 1), 49 | nn.ReLU(), 50 | nn.Conv1d(256, n_cat*nv_prior, 1), 51 | ) 52 | self.deformation = nn.Sequential( 53 | nn.Conv1d(2112, 512, 1), 54 | nn.ReLU(), 55 | nn.Conv1d(512, 256, 1), 56 | nn.ReLU(), 57 | nn.Conv1d(256, n_cat*3, 1), 58 | ) 59 | # Initialize weights to be small so initial deformations aren't so big 60 | self.deformation[4].weight.data.normal_(0, 0.0001) 61 | 62 | def forward(self, points, img, choose, cat_id, prior): 63 | """ 64 | Args: 65 | points: bs x n_pts x 3 66 | img: bs x 3 x H x W 67 | choose: bs x n_pts 68 | cat_id: bs 69 | prior: bs 
x nv x 3 70 | 71 | Returns: 72 | assign_mat: bs x n_pts x nv 73 | inst_shape: bs x nv x 3 74 | deltas: bs x nv x 3 75 | log_assign: bs x n_pts x nv, for numerical stability 76 | 77 | """ 78 | bs, n_pts = points.size()[:2] 79 | nv = prior.size()[1] 80 | # instance-specific features 81 | points = points.permute(0, 2, 1) 82 | points = self.instance_geometry(points) 83 | out_img = self.psp(img) 84 | di = out_img.size()[1] 85 | emb = out_img.view(bs, di, -1) 86 | choose = choose.unsqueeze(1).repeat(1, di, 1) 87 | emb = torch.gather(emb, 2, choose).contiguous() 88 | emb = self.instance_color(emb) 89 | inst_local = torch.cat((points, emb), dim=1) # bs x 128 x n_pts 90 | inst_global = self.instance_global(inst_local) # bs x 1024 x 1 91 | # category-specific features 92 | cat_prior = prior.permute(0, 2, 1) 93 | cat_local = self.category_local(cat_prior) # bs x 64 x n_pts 94 | cat_global = self.category_global(cat_local) # bs x 1024 x 1 95 | # assignemnt matrix 96 | assign_feat = torch.cat((inst_local, inst_global.repeat(1, 1, n_pts), cat_global.repeat(1, 1, n_pts)), dim=1) # bs x 2176 x n_pts 97 | assign_mat = self.assignment(assign_feat) 98 | assign_mat = assign_mat.view(-1, nv, n_pts).contiguous() # bs, nc*nv, n_pts -> bs*nc, nv, n_pts 99 | index = cat_id + torch.arange(bs, dtype=torch.long).cuda() * self.n_cat 100 | assign_mat = torch.index_select(assign_mat, 0, index) # bs x nv x n_pts 101 | assign_mat = assign_mat.permute(0, 2, 1).contiguous() # bs x n_pts x nv 102 | # deformation field 103 | deform_feat = torch.cat((cat_local, cat_global.repeat(1, 1, nv), inst_global.repeat(1, 1, nv)), dim=1) # bs x 2112 x n_pts 104 | deltas = self.deformation(deform_feat) 105 | deltas = deltas.view(-1, 3, nv).contiguous() # bs, nc*3, nv -> bs*nc, 3, nv 106 | deltas = torch.index_select(deltas, 0, index) # bs x 3 x nv 107 | deltas = deltas.permute(0, 2, 1).contiguous() # bs x nv x 3 108 | 109 | return assign_mat, deltas 110 | -------------------------------------------------------------------------------- /lib/nn_distance/chamfer_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import nn_distance 3 | 4 | 5 | class NnDistanceFunction(torch.autograd.Function): 6 | """ 3D point set to 3D point set distance. 
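    Given xyz1 (B, N, 3) and xyz2 (B, M, 3), forward() returns the squared distance
    from each point to its nearest neighbour in the other set (dist1: B x N,
    dist2: B x M) together with the corresponding indices; backward() propagates
    gradients through both directions.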
7 | 8 | """ 9 | @staticmethod 10 | def forward(ctx, xyz1, xyz2): 11 | B, N, _ = xyz1.size() 12 | B, M, _ = xyz2.size() 13 | result = torch.empty(B, N, dtype=xyz1.dtype, device=xyz1.device) 14 | result_i = torch.empty(B, N, dtype=torch.int32, device=xyz1.device) 15 | result2 = torch.empty(B, M, dtype=xyz2.dtype, device=xyz2.device) 16 | result2_i = torch.empty(B, M, dtype=torch.int32, device=xyz2.device) 17 | nn_distance.forward(xyz1, xyz2, result, result2, result_i, result2_i) 18 | ctx.save_for_backward(xyz1, xyz2, result_i, result2_i) 19 | ctx.mark_non_differentiable(result_i, result2_i) 20 | return result, result2, result_i, result2_i 21 | 22 | @staticmethod 23 | def backward(ctx, d_dist1, d_dist2, d_i1, d_i2): 24 | B, N = d_dist1.size() 25 | B, M = d_dist2.size() 26 | xyz1, xyz2, idx1, idx2 = ctx.saved_variables 27 | d_xyz1 = torch.zeros_like(xyz1) 28 | d_xyz2 = torch.zeros_like(xyz2) 29 | gradient1, gradient2 = ctx.needs_input_grad 30 | nn_distance.backward(xyz1, xyz2, d_xyz1, d_xyz2, d_dist1, d_dist2, idx1, idx2) 31 | if not gradient1: 32 | return None, d_xyz2 33 | if not gradient2: 34 | return d_xyz1, None 35 | else: 36 | return d_xyz1, d_xyz2 37 | 38 | 39 | class ChamferLoss(torch.nn.Module): 40 | """ Chamfer Loss: bidirectional nearest neighbor distance of two point sets. 41 | 42 | """ 43 | def __init__(self, threshold=None, backward_weight=1.0): 44 | super(ChamferLoss, self).__init__() 45 | # only consider distance smaller than threshold*mean(distance) (remove outlier) 46 | self.__threshold = threshold 47 | self.backward_weight = backward_weight 48 | 49 | def set_threshold(self, value): 50 | self.__threshold = value 51 | 52 | def unset_threshold(self): 53 | self.__threshold = None 54 | 55 | def forward(self, pred, gt): 56 | assert(pred.dim() == 3 and gt.dim() == 3), \ 57 | "input for ChamferLoss must be a 3D-tensor, but pred.size() is {} gt.size() is {}".format(pred.size(), gt.size()) 58 | # need transpose 59 | if pred.size(2) != 3: 60 | assert(pred.size(1) == 3), "ChamferLoss is implemented for 3D points" 61 | pred = pred.transpose(2, 1).contiguous() 62 | if gt.size(2) != 3: 63 | assert(gt.size(1) == 3), "ChamferLoss is implemented for 3D points" 64 | gt = gt.transpose(2, 1).contiguous() 65 | assert(pred.size(2) == 3 and gt.size(2) == 3), "ChamferLoss is implemented for 3D points" 66 | pred2gt, gt2pred, idx1, idx2 = NnDistanceFunction.apply(pred, gt) 67 | 68 | if self.__threshold is not None: 69 | threshold = self.__threshold 70 | forward_threshold = torch.mean(pred2gt, dim=1, keepdim=True) * threshold 71 | backward_threshold = torch.mean(gt2pred, dim=1, keepdim=True) * threshold 72 | # only care about distance within threshold (ignore strong outliers) 73 | pred2gt = torch.where(pred2gt < forward_threshold, pred2gt, torch.zeros_like(pred2gt)) 74 | gt2pred = torch.where(gt2pred < backward_threshold, gt2pred, torch.zeros_like(gt2pred)) 75 | 76 | pred2gt = torch.mean(pred2gt, dim=1) 77 | gt2pred = torch.mean(gt2pred, dim=1) 78 | cd_dist = pred2gt + self.backward_weight * gt2pred 79 | cd_loss = torch.mean(cd_dist) 80 | return cd_loss, idx1, idx2 81 | 82 | 83 | if __name__ == '__main__': 84 | from torch.autograd import gradcheck 85 | nndistance = NnDistanceFunction.apply 86 | pc1 = torch.randn([2, 60, 3], dtype=torch.float, requires_grad=True).cuda() 87 | pc2 = torch.randn([2, 30, 3], dtype=torch.float, requires_grad=True).cuda() 88 | test = gradcheck(nndistance, (pc1, pc2), eps=1e-3, atol=1e-3) 89 | print(test) 90 | 
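# A quick CPU-side sanity check for the compiled extension: the same bidirectional
# nearest-neighbour cost can be computed by brute force with torch.cdist. This is a
# minimal sketch and not part of the repository (the function name and reduction are
# illustrative); distances are squared to match the CUDA kernel, so the result should
# agree with ChamferLoss up to numerical precision on small random inputs.
import torch

def chamfer_distance_bruteforce(pred, gt, backward_weight=1.0):
    """Brute-force bidirectional nearest-neighbour distance.

    pred: (B, N, 3), gt: (B, M, 3). Mirrors the reduction used in ChamferLoss.
    """
    dist = torch.cdist(pred, gt)                    # (B, N, M) pairwise Euclidean distances
    pred2gt = dist.min(dim=2).values ** 2           # squared distance to nearest gt point
    gt2pred = dist.min(dim=1).values ** 2           # squared distance to nearest pred point
    cd_dist = pred2gt.mean(dim=1) + backward_weight * gt2pred.mean(dim=1)
    return cd_dist.mean()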
-------------------------------------------------------------------------------- /lib/nn_distance/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 3 | 4 | 5 | setup( 6 | name='nn_distance', 7 | ext_modules=[ 8 | CUDAExtension('nn_distance', [ 9 | 'src/nn_distance.cpp', 10 | 'src/nn_distance_cuda.cu', ], 11 | extra_compile_args={'cxx': ['-g'], 'nvcc': ['-O2']}) 12 | ], 13 | 14 | cmdclass={ 15 | 'build_ext': BuildExtension 16 | }) 17 | -------------------------------------------------------------------------------- /lib/nn_distance/src/nn_distance.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int nn_distance_cuda_forward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor dist1, at::Tensor dist2, at::Tensor idx1, at::Tensor idx2); 5 | 6 | 7 | int nn_distance_cuda_backward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor gradxyz1, at::Tensor gradxyz2, at::Tensor graddist1, at::Tensor graddist2, at::Tensor idx1, at::Tensor idx2); 8 | 9 | 10 | int nn_distance_forward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor dist1, at::Tensor dist2, at::Tensor idx1, at::Tensor idx2) { 11 | return nn_distance_cuda_forward(xyz1, xyz2, dist1, dist2, idx1, idx2); 12 | } 13 | 14 | 15 | int nn_distance_backward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor gradxyz1, at::Tensor gradxyz2, at::Tensor graddist1, 16 | at::Tensor graddist2, at::Tensor idx1, at::Tensor idx2) { 17 | return nn_distance_cuda_backward(xyz1, xyz2, gradxyz1, gradxyz2, graddist1, graddist2, idx1, idx2); 18 | } 19 | 20 | 21 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 22 | m.def("forward", &nn_distance_forward, "nn_distance forward (CUDA)"); 23 | m.def("backward", &nn_distance_backward, "nn_distance backward (CUDA)"); 24 | } -------------------------------------------------------------------------------- /lib/nn_distance/src/nn_distance_cuda.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | 8 | #define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") 9 | #define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") 10 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 11 | 12 | 13 | __global__ void NmDistanceKernel(int b,int n,const float * xyz,int m,const float * xyz2,float * result,int * result_i){ 14 | const int batch=512; 15 | __shared__ float buf[batch*3]; 16 | for (int i=blockIdx.x;ibest){ 128 | result[(i*n+j)]=best; 129 | result_i[(i*n+j)]=best_i; 130 | } 131 | } 132 | __syncthreads(); 133 | } 134 | } 135 | } 136 | 137 | 138 | int nn_distance_cuda_forward(at::Tensor xyz1, at::Tensor xyz2, at::Tensor dist1, at::Tensor dist2, at::Tensor idx1, at::Tensor idx2){ 139 | CHECK_INPUT(xyz1); 140 | CHECK_INPUT(xyz2); 141 | 142 | const auto batch_size = xyz1.size(0); 143 | const auto n = xyz1.size(1); //num_points point cloud A 144 | const auto m = xyz2.size(1); //num_points point cloud B 145 | 146 | NmDistanceKernel<<>>(batch_size, n, xyz1.data(), m, xyz2.data(), dist1.data(), idx1.data()); 147 | NmDistanceKernel<<>>(batch_size, m, xyz2.data(), n, xyz1.data(), dist2.data(), idx2.data()); 148 | 149 | cudaError_t err = cudaGetLastError(); 150 | if (err != cudaSuccess) { 151 | printf("error in nnd updateOutput: %s\n", cudaGetErrorString(err)); 152 | return 0; 153 | } 154 | return 1; 
155 | } 156 | 157 | 158 | __global__ void NmDistanceGradKernel(int b,int n,const float * xyz1,int m,const float * xyz2,const float * grad_dist1,const int * idx1,float * grad_xyz1,float * grad_xyz2){ 159 | for (int i=blockIdx.x;i>>(batch_size,n,xyz1.data(),m,xyz2.data(),graddist1.data(),idx1.data(),gradxyz1.data(),gradxyz2.data()); 186 | NmDistanceGradKernel<<>>(batch_size,m,xyz2.data(),n,xyz1.data(),graddist2.data(),idx2.data(),gradxyz2.data(),gradxyz1.data()); 187 | 188 | cudaError_t err = cudaGetLastError(); 189 | if (err != cudaSuccess) { 190 | printf("error in nnd get grad: %s\n", cudaGetErrorString(err)); 191 | return 0; 192 | } 193 | return 1; 194 | } 195 | -------------------------------------------------------------------------------- /lib/pspnet.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | 7 | def conv3x3(in_planes, out_planes, stride=1, dilation=1): 8 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=dilation, dilation=dilation, bias=False) 9 | 10 | 11 | class BasicBlock(nn.Module): 12 | expansion = 1 13 | def __init__(self, inplanes, planes, stride=1, downsample=None, dilation=1): 14 | super(BasicBlock, self).__init__() 15 | self.conv1 = conv3x3(inplanes, planes, stride=stride, dilation=dilation) 16 | self.relu = nn.ReLU(inplace=True) 17 | self.conv2 = conv3x3(planes, planes, stride=1, dilation=dilation) 18 | self.downsample = downsample 19 | self.stride = stride 20 | 21 | def forward(self, x): 22 | residual = x 23 | out = self.conv1(x) 24 | out = self.relu(out) 25 | out = self.conv2(out) 26 | if self.downsample is not None: 27 | residual = self.downsample(x) 28 | out += residual 29 | out = self.relu(out) 30 | return out 31 | 32 | 33 | class ResNet(nn.Module): 34 | def __init__(self, block, layers=(3, 4, 23, 3)): 35 | self.inplanes = 64 36 | super(ResNet, self).__init__() 37 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) 38 | self.relu = nn.ReLU(inplace=True) 39 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 40 | self.layer1 = self._make_layer(block, 64, layers[0]) 41 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 42 | self.layer3 = self._make_layer(block, 256, layers[2], stride=1, dilation=2) 43 | self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=4) 44 | 45 | for m in self.modules(): 46 | if isinstance(m, nn.Conv2d): 47 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 48 | m.weight.data.normal_(0, math.sqrt(2./n)) 49 | elif isinstance(m, nn.BatchNorm2d): 50 | m.weight.data.fill_(1) 51 | m.bias.data.zero_() 52 | 53 | def _make_layer(self, block, planes, blocks, stride=1, dilation=1): 54 | downsample = None 55 | if stride != 1 or self.inplanes != planes*block.expansion: 56 | downsample = nn.Sequential( 57 | nn.Conv2d(self.inplanes, planes*block.expansion, kernel_size=1, stride=stride, bias=False) 58 | ) 59 | layers = [block(self.inplanes, planes, stride, downsample)] 60 | self.inplanes = planes * block.expansion 61 | for i in range(1, blocks): 62 | layers.append(block(self.inplanes, planes, dilation=dilation)) 63 | return nn.Sequential(*layers) 64 | 65 | def forward(self, x): 66 | x = self.conv1(x) 67 | x = self.relu(x) 68 | x = self.maxpool(x) 69 | x = self.layer1(x) 70 | x = self.layer2(x) 71 | x = self.layer3(x) 72 | x = self.layer4(x) 73 | return x 74 | 75 | 76 | class PSPModule(nn.Module): 77 | def 
__init__(self, feat_dim, bins=(1, 2, 3, 6)): 78 | super(PSPModule, self).__init__() 79 | self.reduction_dim = feat_dim // len(bins) 80 | self.stages = [] 81 | self.stages = nn.ModuleList([self._make_stage(feat_dim, size) for size in bins]) 82 | 83 | def _make_stage(self, feat_dim, size): 84 | prior = nn.AdaptiveAvgPool2d(output_size=(size, size)) 85 | conv = nn.Conv2d(feat_dim, self.reduction_dim, kernel_size=1, bias=False) 86 | relu = nn.ReLU(inplace=True) 87 | return nn.Sequential(prior, conv, relu) 88 | 89 | def forward(self, feats): 90 | h, w = feats.size(2), feats.size(3) 91 | priors = [feats] 92 | for stage in self.stages: 93 | priors.append(F.interpolate(input=stage(feats), size=(h, w), mode='bilinear', align_corners=True)) 94 | return torch.cat(priors, 1) 95 | 96 | 97 | class PSPUpsample(nn.Module): 98 | def __init__(self, in_channels, out_channels): 99 | super(PSPUpsample, self).__init__() 100 | self.conv = nn.Sequential( 101 | nn.Conv2d(in_channels, out_channels, 3, padding=1), 102 | nn.PReLU() 103 | ) 104 | 105 | def forward(self, x): 106 | x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True) 107 | return self.conv(x) 108 | 109 | 110 | class PSPNet(nn.Module): 111 | def __init__(self, bins=(1, 2, 3, 6), backend='resnet18'): 112 | super(PSPNet, self).__init__() 113 | if backend == 'resnet18': 114 | self.feats = ResNet(BasicBlock, [2, 2, 2, 2]) 115 | feat_dim = 512 116 | else: 117 | raise NotImplementedError 118 | self.psp = PSPModule(feat_dim, bins) 119 | self.drop = nn.Dropout2d(p=0.15) 120 | self.up_1 = PSPUpsample(1024, 256) 121 | self.up_2 = PSPUpsample(256, 64) 122 | self.up_3 = PSPUpsample(64, 64) 123 | self.final = nn.Conv2d(64, 32, kernel_size=1) 124 | 125 | def forward(self, x): 126 | f = self.feats(x) 127 | p = self.psp(f) 128 | p = self.up_1(p) 129 | p = self.drop(p) 130 | p = self.up_2(p) 131 | p = self.drop(p) 132 | p = self.up_3(p) 133 | return self.final(p) 134 | -------------------------------------------------------------------------------- /lib/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Evaluation-related codes are modified from 3 | https://github.com/hughw19/NOCS_CVPR2019 4 | """ 5 | import logging 6 | import os 7 | import math 8 | import cv2 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | import _pickle as cPickle 12 | from tqdm import tqdm 13 | 14 | 15 | def setup_logger(logger_name, log_file, level=logging.INFO): 16 | logger = logging.getLogger(logger_name) 17 | formatter = logging.Formatter('%(asctime)s : %(message)s') 18 | fileHandler = logging.FileHandler(log_file, mode='a') 19 | fileHandler.setFormatter(formatter) 20 | logger.setLevel(level) 21 | logger.addHandler(fileHandler) 22 | streamHandler = logging.StreamHandler() 23 | streamHandler.setFormatter(formatter) 24 | logger.addHandler(streamHandler) 25 | return logger 26 | 27 | 28 | def load_obj(path_to_file): 29 | """ Load obj file. 
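    Only 'v ' and 'f' records are parsed; face indices are converted to 0-based,
    and only the vertex index of each 'v/vt/vn' triplet is kept.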
30 | 31 | Args: 32 | path_to_file: path 33 | 34 | Returns: 35 | vertices: ndarray 36 | faces: ndarray, index of triangle vertices 37 | 38 | """ 39 | vertices = [] 40 | faces = [] 41 | with open(path_to_file, 'r') as f: 42 | for line in f: 43 | if line[:2] == 'v ': 44 | vertex = line[2:].strip().split(' ') 45 | vertex = [float(xyz) for xyz in vertex] 46 | vertices.append(vertex) 47 | elif line[0] == 'f': 48 | face = line[1:].replace('//', '/').strip().split(' ') 49 | face = [int(idx.split('/')[0])-1 for idx in face] 50 | faces.append(face) 51 | else: 52 | continue 53 | vertices = np.asarray(vertices) 54 | faces = np.asarray(faces) 55 | return vertices, faces 56 | 57 | 58 | def create_sphere(): 59 | # 642 verts, 1280 faces, 60 | verts, faces = load_obj('assets/sphere_mesh_template.obj') 61 | return verts, faces 62 | 63 | 64 | def random_point(face_vertices): 65 | """ Sampling point using Barycentric coordiante. 66 | 67 | """ 68 | r1, r2 = np.random.random(2) 69 | sqrt_r1 = np.sqrt(r1) 70 | point = (1 - sqrt_r1) * face_vertices[0, :] + \ 71 | sqrt_r1 * (1 - r2) * face_vertices[1, :] + \ 72 | sqrt_r1 * r2 * face_vertices[2, :] 73 | 74 | return point 75 | 76 | 77 | def pairwise_distance(A, B): 78 | """ Compute pairwise distance of two point clouds.point 79 | 80 | Args: 81 | A: n x 3 numpy array 82 | B: m x 3 numpy array 83 | 84 | Return: 85 | C: n x m numpy array 86 | 87 | """ 88 | diff = A[:, :, None] - B[:, :, None].T 89 | C = np.sqrt(np.sum(diff**2, axis=1)) 90 | 91 | return C 92 | 93 | 94 | def uniform_sample(vertices, faces, n_samples, with_normal=False): 95 | """ Sampling points according to the area of mesh surface. 96 | 97 | """ 98 | sampled_points = np.zeros((n_samples, 3), dtype=float) 99 | normals = np.zeros((n_samples, 3), dtype=float) 100 | faces = vertices[faces] 101 | vec_cross = np.cross(faces[:, 1, :] - faces[:, 0, :], 102 | faces[:, 2, :] - faces[:, 0, :]) 103 | face_area = 0.5 * np.linalg.norm(vec_cross, axis=1) 104 | cum_area = np.cumsum(face_area) 105 | for i in range(n_samples): 106 | face_id = np.searchsorted(cum_area, np.random.random() * cum_area[-1]) 107 | sampled_points[i] = random_point(faces[face_id, :, :]) 108 | normals[i] = vec_cross[face_id] 109 | normals = normals / np.linalg.norm(normals, axis=1, keepdims=True) 110 | if with_normal: 111 | sampled_points = np.concatenate((sampled_points, normals), axis=1) 112 | return sampled_points 113 | 114 | 115 | def farthest_point_sampling(points, n_samples): 116 | """ Farthest point sampling. 117 | 118 | """ 119 | selected_pts = np.zeros((n_samples,), dtype=int) 120 | dist_mat = pairwise_distance(points, points) 121 | # start from first point 122 | pt_idx = 0 123 | dist_to_set = dist_mat[:, pt_idx] 124 | for i in range(n_samples): 125 | selected_pts[i] = pt_idx 126 | dist_to_set = np.minimum(dist_to_set, dist_mat[:, pt_idx]) 127 | pt_idx = np.argmax(dist_to_set) 128 | return selected_pts 129 | 130 | 131 | def sample_points_from_mesh(path, n_pts, with_normal=False, fps=False, ratio=2): 132 | """ Uniformly sampling points from mesh model. 133 | 134 | Args: 135 | path: path to OBJ file. 136 | n_pts: int, number of points being sampled. 137 | with_normal: return points with normal, approximated by mesh triangle normal 138 | fps: whether to use fps for post-processing, default False. 139 | ratio: int, if use fps, sample ratio*n_pts first, then use fps to sample final output. 
140 | 141 | Returns: 142 | points: n_pts x 3, n_pts x 6 if with_normal = True 143 | 144 | """ 145 | vertices, faces = load_obj(path) 146 | if fps: 147 | points = uniform_sample(vertices, faces, ratio*n_pts, with_normal) 148 | pts_idx = farthest_point_sampling(points[:, :3], n_pts) 149 | points = points[pts_idx] 150 | else: 151 | points = uniform_sample(vertices, faces, n_pts, with_normal) 152 | return points 153 | 154 | 155 | def load_depth(img_path): 156 | """ Load depth image from img_path. """ 157 | depth_path = img_path + '_depth.png' 158 | depth = cv2.imread(depth_path, -1) 159 | if len(depth.shape) == 3: 160 | # This is encoded depth image, let's convert 161 | # NOTE: RGB is actually BGR in opencv 162 | depth16 = depth[:, :, 1]*256 + depth[:, :, 2] 163 | depth16 = np.where(depth16==32001, 0, depth16) 164 | depth16 = depth16.astype(np.uint16) 165 | elif len(depth.shape) == 2 and depth.dtype == 'uint16': 166 | depth16 = depth 167 | else: 168 | assert False, '[ Error ]: Unsupported depth type.' 169 | return depth16 170 | 171 | 172 | def get_bbox(bbox): 173 | """ Compute square image crop window. """ 174 | y1, x1, y2, x2 = bbox 175 | img_width = 480 176 | img_length = 640 177 | window_size = (max(y2-y1, x2-x1) // 40 + 1) * 40 178 | window_size = min(window_size, 440) 179 | center = [(y1 + y2) // 2, (x1 + x2) // 2] 180 | rmin = center[0] - int(window_size / 2) 181 | rmax = center[0] + int(window_size / 2) 182 | cmin = center[1] - int(window_size / 2) 183 | cmax = center[1] + int(window_size / 2) 184 | if rmin < 0: 185 | delt = -rmin 186 | rmin = 0 187 | rmax += delt 188 | if cmin < 0: 189 | delt = -cmin 190 | cmin = 0 191 | cmax += delt 192 | if rmax > img_width: 193 | delt = rmax - img_width 194 | rmax = img_width 195 | rmin -= delt 196 | if cmax > img_length: 197 | delt = cmax - img_length 198 | cmax = img_length 199 | cmin -= delt 200 | return rmin, rmax, cmin, cmax 201 | 202 | 203 | def compute_sRT_errors(sRT1, sRT2): 204 | """ 205 | Args: 206 | sRT1: [4, 4]. homogeneous affine transformation 207 | sRT2: [4, 4]. 
homogeneous affine transformation 208 | 209 | Returns: 210 | R_error: angle difference in degree, 211 | T_error: Euclidean distance 212 | IoU: relative scale error 213 | 214 | """ 215 | try: 216 | assert np.array_equal(sRT1[3, :], sRT2[3, :]) 217 | assert np.array_equal(sRT1[3, :], np.array([0, 0, 0, 1])) 218 | except AssertionError: 219 | print(sRT1[3, :], sRT2[3, :]) 220 | 221 | s1 = np.cbrt(np.linalg.det(sRT1[:3, :3])) 222 | R1 = sRT1[:3, :3] / s1 223 | T1 = sRT1[:3, 3] 224 | s2 = np.cbrt(np.linalg.det(sRT2[:3, :3])) 225 | R2 = sRT2[:3, :3] / s2 226 | T2 = sRT2[:3, 3] 227 | R12 = R1 @ R2.transpose() 228 | R_error = np.arccos(np.clip((np.trace(R12)-1)/2, -1.0, 1.0)) * 180 / np.pi 229 | T_error = np.linalg.norm(T1 - T2) 230 | IoU = np.abs(s1 - s2) / s2 231 | 232 | return R_error, T_error, IoU 233 | 234 | 235 | ############################################################ 236 | # Evaluation 237 | ############################################################ 238 | 239 | def get_3d_bbox(size, shift=0): 240 | """ 241 | Args: 242 | size: [3] or scalar 243 | shift: [3] or scalar 244 | Returns: 245 | bbox_3d: [3, N] 246 | 247 | """ 248 | bbox_3d = np.array([[+size[0] / 2, +size[1] / 2, +size[2] / 2], 249 | [+size[0] / 2, +size[1] / 2, -size[2] / 2], 250 | [-size[0] / 2, +size[1] / 2, +size[2] / 2], 251 | [-size[0] / 2, +size[1] / 2, -size[2] / 2], 252 | [+size[0] / 2, -size[1] / 2, +size[2] / 2], 253 | [+size[0] / 2, -size[1] / 2, -size[2] / 2], 254 | [-size[0] / 2, -size[1] / 2, +size[2] / 2], 255 | [-size[0] / 2, -size[1] / 2, -size[2] / 2]]) + shift 256 | bbox_3d = bbox_3d.transpose() 257 | return bbox_3d 258 | 259 | 260 | def transform_coordinates_3d(coordinates, sRT): 261 | """ 262 | Args: 263 | coordinates: [3, N] 264 | sRT: [4, 4] 265 | 266 | Returns: 267 | new_coordinates: [3, N] 268 | 269 | """ 270 | assert coordinates.shape[0] == 3 271 | coordinates = np.vstack([coordinates, np.ones((1, coordinates.shape[1]), dtype=np.float32)]) 272 | new_coordinates = sRT @ coordinates 273 | new_coordinates = new_coordinates[:3, :] / new_coordinates[3, :] 274 | return new_coordinates 275 | 276 | 277 | def compute_3d_IoU(sRT_1, sRT_2, size_1, size_2, class_name_1, class_name_2, handle_visibility): 278 | """ Computes IoU overlaps between two 3D bboxes. 
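Example (illustrative; identity poses and unit sizes are made-up inputs):
    sRT = np.identity(4, dtype=np.float32)
    iou = compute_3d_IoU(sRT, sRT, np.ones(3), np.ones(3), 'bottle', 'bottle', 0)
    # identical boxes give iou == 1.0; for symmetric classes (bottle/bowl/can and
    # handle-occluded mugs) the first box is rotated about y to find the best overlap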
""" 279 | def asymmetric_3d_iou(sRT_1, sRT_2, size_1, size_2): 280 | noc_cube_1 = get_3d_bbox(size_1, 0) 281 | bbox_3d_1 = transform_coordinates_3d(noc_cube_1, sRT_1) 282 | noc_cube_2 = get_3d_bbox(size_2, 0) 283 | bbox_3d_2 = transform_coordinates_3d(noc_cube_2, sRT_2) 284 | 285 | bbox_1_max = np.amax(bbox_3d_1, axis=0) 286 | bbox_1_min = np.amin(bbox_3d_1, axis=0) 287 | bbox_2_max = np.amax(bbox_3d_2, axis=0) 288 | bbox_2_min = np.amin(bbox_3d_2, axis=0) 289 | 290 | overlap_min = np.maximum(bbox_1_min, bbox_2_min) 291 | overlap_max = np.minimum(bbox_1_max, bbox_2_max) 292 | 293 | # intersections and union 294 | if np.amin(overlap_max - overlap_min) < 0: 295 | intersections = 0 296 | else: 297 | intersections = np.prod(overlap_max - overlap_min) 298 | union = np.prod(bbox_1_max - bbox_1_min) + np.prod(bbox_2_max - bbox_2_min) - intersections 299 | overlaps = intersections / union 300 | return overlaps 301 | 302 | if sRT_1 is None or sRT_2 is None: 303 | return -1 304 | 305 | if (class_name_1 in ['bottle', 'bowl', 'can'] and class_name_1 == class_name_2) or \ 306 | (class_name_1 == 'mug' and class_name_1 == class_name_2 and handle_visibility==0): 307 | def y_rotation_matrix(theta): 308 | return np.array([[ np.cos(theta), 0, np.sin(theta), 0], 309 | [ 0, 1, 0, 0], 310 | [-np.sin(theta), 0, np.cos(theta), 0], 311 | [ 0, 0, 0, 1]]) 312 | n = 20 313 | max_iou = 0 314 | for i in range(n): 315 | rotated_RT_1 = sRT_1 @ y_rotation_matrix(2 * math.pi * i / float(n)) 316 | max_iou = max(max_iou, asymmetric_3d_iou(rotated_RT_1, sRT_2, size_1, size_2)) 317 | else: 318 | max_iou = asymmetric_3d_iou(sRT_1, sRT_2, size_1, size_2) 319 | 320 | return max_iou 321 | 322 | 323 | def compute_IoU_matches(gt_class_ids, gt_sRT, gt_size, gt_handle_visibility, 324 | pred_class_ids, pred_sRT, pred_size, pred_scores, 325 | synset_names, iou_3d_thresholds, score_threshold=0): 326 | """ Find matches between NOCS prediction and ground truth instances. 327 | 328 | Args: 329 | size: 3D bounding box size 330 | bboxes: 2D bounding boxes 331 | 332 | Returns: 333 | gt_matches: 2-D array. For each GT box it has the index of the matched predicted box. 334 | pred_matches: 2-D array. For each predicted box, it has the index of the matched ground truth box. 335 | overlaps: IoU overlaps. 336 | indices: 337 | 338 | """ 339 | num_pred = len(pred_class_ids) 340 | num_gt = len(gt_class_ids) 341 | indices = np.zeros(0) 342 | if num_pred: 343 | # Sort predictions by score from high to low 344 | indices = np.argsort(pred_scores)[::-1] 345 | pred_class_ids = pred_class_ids[indices].copy() 346 | pred_size = pred_size[indices].copy() 347 | pred_sRT = pred_sRT[indices].copy() 348 | # compute IoU overlaps [pred_bboxs gt_bboxs] 349 | overlaps = np.zeros((num_pred, num_gt), dtype=np.float32) 350 | for i in range(num_pred): 351 | for j in range(num_gt): 352 | overlaps[i, j] = compute_3d_IoU(pred_sRT[i], gt_sRT[j], pred_size[i, :], gt_size[j], 353 | synset_names[pred_class_ids[i]], synset_names[gt_class_ids[j]], gt_handle_visibility[j]) 354 | # loop through predictions and find matching ground truth boxes 355 | num_iou_3d_thres = len(iou_3d_thresholds) 356 | pred_matches = -1 * np.ones([num_iou_3d_thres, num_pred]) 357 | gt_matches = -1 * np.ones([num_iou_3d_thres, num_gt]) 358 | for s, iou_thres in enumerate(iou_3d_thresholds): 359 | for i in range(indices.shape[0]): 360 | # Find best matching ground truth box 361 | # 1. Sort matches by score 362 | sorted_ixs = np.argsort(overlaps[i])[::-1] 363 | # 2. 
Remove low scores 364 | low_score_idx = np.where(overlaps[i, sorted_ixs] < score_threshold)[0] 365 | if low_score_idx.size > 0: 366 | sorted_ixs = sorted_ixs[:low_score_idx[0]] 367 | # 3. Find the match 368 | for j in sorted_ixs: 369 | # If ground truth box is already matched, go to next one 370 | if gt_matches[s, j] > -1: 371 | continue 372 | # If we reach IoU smaller than the threshold, end the loop 373 | iou = overlaps[i, j] 374 | if iou < iou_thres: 375 | break 376 | # Do we have a match? 377 | if not pred_class_ids[i] == gt_class_ids[j]: 378 | continue 379 | if iou > iou_thres: 380 | gt_matches[s, j] = i 381 | pred_matches[s, i] = j 382 | break 383 | return gt_matches, pred_matches, overlaps, indices 384 | 385 | 386 | def compute_RT_errors(sRT_1, sRT_2, class_id, handle_visibility, synset_names): 387 | """ 388 | Args: 389 | sRT_1: [4, 4]. homogeneous affine transformation 390 | sRT_2: [4, 4]. homogeneous affine transformation 391 | 392 | Returns: 393 | theta: angle difference of R in degree 394 | shift: l2 difference of T in centimeter 395 | """ 396 | # make sure the last row is [0, 0, 0, 1] 397 | if sRT_1 is None or sRT_2 is None: 398 | return -1 399 | try: 400 | assert np.array_equal(sRT_1[3, :], sRT_2[3, :]) 401 | assert np.array_equal(sRT_1[3, :], np.array([0, 0, 0, 1])) 402 | except AssertionError: 403 | print(sRT_1[3, :], sRT_2[3, :]) 404 | exit() 405 | 406 | R1 = sRT_1[:3, :3] / np.cbrt(np.linalg.det(sRT_1[:3, :3])) 407 | T1 = sRT_1[:3, 3] 408 | R2 = sRT_2[:3, :3] / np.cbrt(np.linalg.det(sRT_2[:3, :3])) 409 | T2 = sRT_2[:3, 3] 410 | # symmetric when rotating around y-axis 411 | if synset_names[class_id] in ['bottle', 'can', 'bowl'] or \ 412 | (synset_names[class_id] == 'mug' and handle_visibility == 0): 413 | y = np.array([0, 1, 0]) 414 | y1 = R1 @ y 415 | y2 = R2 @ y 416 | cos_theta = y1.dot(y2) / (np.linalg.norm(y1) * np.linalg.norm(y2)) 417 | else: 418 | R = R1 @ R2.transpose() 419 | cos_theta = (np.trace(R) - 1) / 2 420 | 421 | theta = np.arccos(np.clip(cos_theta, -1.0, 1.0)) * 180 / np.pi 422 | shift = np.linalg.norm(T1 - T2) * 100 423 | result = np.array([theta, shift]) 424 | 425 | return result 426 | 427 | 428 | def compute_RT_overlaps(gt_class_ids, gt_sRT, gt_handle_visibility, pred_class_ids, pred_sRT, synset_names): 429 | """ Finds overlaps between prediction and ground truth instances. 
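Example (illustrative shapes only):
    overlaps = compute_RT_overlaps(gt_class_ids, gt_sRT, gt_handle_visibility,
                                   pred_class_ids, pred_sRT, synset_names)
    # overlaps has shape (num_pred, num_gt, 2): [..., 0] is the rotation error in
    # degrees and [..., 1] the translation error in centimeters, as returned by compute_RT_errors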
430 | 431 | Returns: 432 | overlaps: 433 | 434 | """ 435 | num_pred = len(pred_class_ids) 436 | num_gt = len(gt_class_ids) 437 | overlaps = np.zeros((num_pred, num_gt, 2)) 438 | 439 | for i in range(num_pred): 440 | for j in range(num_gt): 441 | overlaps[i, j, :] = compute_RT_errors(pred_sRT[i], gt_sRT[j], gt_class_ids[j], 442 | gt_handle_visibility[j], synset_names) 443 | return overlaps 444 | 445 | 446 | def compute_RT_matches(overlaps, pred_class_ids, gt_class_ids, degree_thres_list, shift_thres_list): 447 | num_degree_thres = len(degree_thres_list) 448 | num_shift_thres = len(shift_thres_list) 449 | num_pred = len(pred_class_ids) 450 | num_gt = len(gt_class_ids) 451 | 452 | pred_matches = -1 * np.ones((num_degree_thres, num_shift_thres, num_pred)) 453 | gt_matches = -1 * np.ones((num_degree_thres, num_shift_thres, num_gt)) 454 | 455 | if num_pred == 0 or num_gt == 0: 456 | return gt_matches, pred_matches 457 | 458 | assert num_pred == overlaps.shape[0] 459 | assert num_gt == overlaps.shape[1] 460 | assert overlaps.shape[2] == 2 461 | 462 | for d, degree_thres in enumerate(degree_thres_list): 463 | for s, shift_thres in enumerate(shift_thres_list): 464 | for i in range(num_pred): 465 | # Find best matching ground truth box 466 | # 1. Sort matches by scores from low to high 467 | sum_degree_shift = np.sum(overlaps[i, :, :], axis=-1) 468 | sorted_ixs = np.argsort(sum_degree_shift) 469 | # 2. Find the match 470 | for j in sorted_ixs: 471 | # If ground truth box is already matched, go to next one 472 | if gt_matches[d, s, j] > -1 or pred_class_ids[i] != gt_class_ids[j]: 473 | continue 474 | # If we reach IoU smaller than the threshold, end the loop 475 | if overlaps[i, j, 0] > degree_thres or overlaps[i, j, 1] > shift_thres: 476 | continue 477 | gt_matches[d, s, j] = i 478 | pred_matches[d, s, i] = j 479 | break 480 | 481 | return gt_matches, pred_matches 482 | 483 | 484 | def compute_ap_and_acc(pred_matches, pred_scores, gt_matches): 485 | # sort the scores from high to low 486 | assert pred_matches.shape[0] == pred_scores.shape[0] 487 | score_indices = np.argsort(pred_scores)[::-1] 488 | # pred_scores = pred_scores[score_indices] 489 | pred_matches = pred_matches[score_indices] 490 | precisions = np.cumsum(pred_matches > -1) / (np.arange(len(pred_matches)) + 1) 491 | recalls = np.cumsum(pred_matches > -1).astype(np.float32) / len(gt_matches) 492 | # Pad with start and end values to simplify the math 493 | precisions = np.concatenate([[0], precisions, [0]]) 494 | recalls = np.concatenate([[0], recalls, [1]]) 495 | # Ensure precision values decrease but don't increase. This way, the 496 | # precision value at each recall threshold is the maximum it can be 497 | # for all following recall thresholds, as specified by the VOC paper. 498 | for i in range(len(precisions) - 2, -1, -1): 499 | precisions[i] = np.maximum(precisions[i], precisions[i + 1]) 500 | # compute mean AP over recall range 501 | indices = np.where(recalls[:-1] != recalls[1:])[0] + 1 502 | ap = np.sum((recalls[indices] - recalls[indices - 1]) * precisions[indices]) 503 | # accuracy 504 | acc = np.sum(pred_matches > -1) / len(pred_matches) 505 | 506 | return ap, acc 507 | 508 | 509 | def compute_mAP(pred_results, out_dir, degree_thresholds=[180], shift_thresholds=[100], 510 | iou_3d_thresholds=[0.1], iou_pose_thres=0.1, use_matches_for_pose=False): 511 | """ Compute mean Average Precision. 
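Example (a minimal sketch; the output directory and threshold values are placeholders):
    iou_aps, pose_aps, iou_acc, pose_acc = compute_mAP(
        pred_results, 'results/eval',
        degree_thresholds=[5, 10], shift_thresholds=[5, 10],
        iou_3d_thresholds=[0.1, 0.25, 0.5],
        iou_pose_thres=0.1, use_matches_for_pose=True)
    # iou_aps has shape (num_classes + 1, num_iou_thres); the last row is the mean
    # over the six object categories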
512 | 513 | Returns: 514 | iou_aps: 515 | pose_aps: 516 | iou_acc: 517 | pose_acc: 518 | 519 | """ 520 | synset_names = ['BG', 'bottle', 'bowl', 'camera', 'can', 'laptop', 'mug'] 521 | num_classes = len(synset_names) 522 | degree_thres_list = list(degree_thresholds) + [360] 523 | num_degree_thres = len(degree_thres_list) 524 | shift_thres_list = list(shift_thresholds) + [100] 525 | num_shift_thres = len(shift_thres_list) 526 | iou_thres_list = list(iou_3d_thresholds) 527 | num_iou_thres = len(iou_thres_list) 528 | 529 | if use_matches_for_pose: 530 | assert iou_pose_thres in iou_thres_list 531 | 532 | # pre-allocate more than enough memory 533 | iou_aps = np.zeros((num_classes + 1, num_iou_thres)) 534 | iou_acc = np.zeros((num_classes + 1, num_iou_thres)) 535 | iou_pred_matches_all = [np.zeros((num_iou_thres, 30000)) for _ in range(num_classes)] 536 | iou_pred_scores_all = [np.zeros((num_iou_thres, 30000)) for _ in range(num_classes)] 537 | iou_gt_matches_all = [np.zeros((num_iou_thres, 30000)) for _ in range(num_classes)] 538 | iou_pred_count = [0 for _ in range(num_classes)] 539 | iou_gt_count = [0 for _ in range(num_classes)] 540 | 541 | pose_aps = np.zeros((num_classes + 1, num_degree_thres, num_shift_thres)) 542 | pose_acc = np.zeros((num_classes + 1, num_degree_thres, num_shift_thres)) 543 | pose_pred_matches_all = [np.zeros((num_degree_thres, num_shift_thres, 30000)) for _ in range(num_classes)] 544 | pose_pred_scores_all = [np.zeros((num_degree_thres, num_shift_thres, 30000)) for _ in range(num_classes)] 545 | pose_gt_matches_all = [np.zeros((num_degree_thres, num_shift_thres, 30000)) for _ in range(num_classes)] 546 | pose_pred_count = [0 for _ in range(num_classes)] 547 | pose_gt_count = [0 for _ in range(num_classes)] 548 | 549 | # loop over results to gather pred matches and gt matches for iou and pose metrics 550 | progress = 0 551 | for progress, result in enumerate(tqdm(pred_results)): 552 | gt_class_ids = result['gt_class_ids'].astype(np.int32) 553 | gt_sRT = np.array(result['gt_RTs']) 554 | gt_size = np.array(result['gt_scales']) 555 | gt_handle_visibility = result['gt_handle_visibility'] 556 | 557 | pred_class_ids = result['pred_class_ids'] 558 | pred_sRT = np.array(result['pred_RTs']) 559 | pred_size = result['pred_scales'] 560 | pred_scores = result['pred_scores'] 561 | 562 | if len(gt_class_ids) == 0 and len(pred_class_ids) == 0: 563 | continue 564 | 565 | for cls_id in range(1, num_classes): 566 | # get gt and predictions in this class 567 | cls_gt_class_ids = gt_class_ids[gt_class_ids==cls_id] if len(gt_class_ids) else np.zeros(0) 568 | cls_gt_sRT = gt_sRT[gt_class_ids==cls_id] if len(gt_class_ids) else np.zeros((0, 4, 4)) 569 | cls_gt_size = gt_size[gt_class_ids==cls_id] if len(gt_class_ids) else np.zeros((0, 3)) 570 | if synset_names[cls_id] != 'mug': 571 | cls_gt_handle_visibility = np.ones_like(cls_gt_class_ids) 572 | else: 573 | cls_gt_handle_visibility = gt_handle_visibility[gt_class_ids==cls_id] if len(gt_class_ids) else np.ones(0) 574 | 575 | cls_pred_class_ids = pred_class_ids[pred_class_ids==cls_id] if len(pred_class_ids) else np.zeros(0) 576 | cls_pred_sRT = pred_sRT[pred_class_ids==cls_id] if len(pred_class_ids) else np.zeros((0, 4, 4)) 577 | cls_pred_size = pred_size[pred_class_ids==cls_id] if len(pred_class_ids) else np.zeros((0, 3)) 578 | cls_pred_scores = pred_scores[pred_class_ids==cls_id] if len(pred_class_ids) else np.zeros(0) 579 | 580 | # calculate the overlap between each gt instance and pred instance 581 | iou_cls_gt_match, iou_cls_pred_match, 
_, iou_pred_indices = \ 582 | compute_IoU_matches(cls_gt_class_ids, cls_gt_sRT, cls_gt_size, cls_gt_handle_visibility, 583 | cls_pred_class_ids, cls_pred_sRT, cls_pred_size, cls_pred_scores, 584 | synset_names, iou_thres_list) 585 | if len(iou_pred_indices): 586 | cls_pred_class_ids = cls_pred_class_ids[iou_pred_indices] 587 | cls_pred_sRT = cls_pred_sRT[iou_pred_indices] 588 | cls_pred_scores = cls_pred_scores[iou_pred_indices] 589 | 590 | num_pred = iou_cls_pred_match.shape[1] 591 | pred_start = iou_pred_count[cls_id] 592 | pred_end = pred_start + num_pred 593 | iou_pred_count[cls_id] = pred_end 594 | iou_pred_matches_all[cls_id][:, pred_start:pred_end] = iou_cls_pred_match 595 | cls_pred_scores_tile = np.tile(cls_pred_scores, (num_iou_thres, 1)) 596 | assert cls_pred_scores_tile.shape[1] == num_pred 597 | iou_pred_scores_all[cls_id][:, pred_start:pred_end] = cls_pred_scores_tile 598 | num_gt = iou_cls_gt_match.shape[1] 599 | gt_start = iou_gt_count[cls_id] 600 | gt_end = gt_start + num_gt 601 | iou_gt_count[cls_id] = gt_end 602 | iou_gt_matches_all[cls_id][:, gt_start:gt_end] = iou_cls_gt_match 603 | 604 | if use_matches_for_pose: 605 | thres_ind = list(iou_thres_list).index(iou_pose_thres) 606 | iou_thres_pred_match = iou_cls_pred_match[thres_ind, :] 607 | cls_pred_class_ids = cls_pred_class_ids[iou_thres_pred_match > -1] if len(iou_thres_pred_match) > 0 else np.zeros(0) 608 | cls_pred_sRT = cls_pred_sRT[iou_thres_pred_match > -1] if len(iou_thres_pred_match) > 0 else np.zeros((0, 4, 4)) 609 | cls_pred_scores = cls_pred_scores[iou_thres_pred_match > -1] if len(iou_thres_pred_match) > 0 else np.zeros(0) 610 | iou_thres_gt_match = iou_cls_gt_match[thres_ind, :] 611 | cls_gt_class_ids = cls_gt_class_ids[iou_thres_gt_match > -1] if len(iou_thres_gt_match) > 0 else np.zeros(0) 612 | cls_gt_sRT = cls_gt_sRT[iou_thres_gt_match > -1] if len(iou_thres_gt_match) > 0 else np.zeros((0, 4, 4)) 613 | cls_gt_handle_visibility = cls_gt_handle_visibility[iou_thres_gt_match > -1] if len(iou_thres_gt_match) > 0 else np.zeros(0) 614 | 615 | RT_overlaps = compute_RT_overlaps(cls_gt_class_ids, cls_gt_sRT, cls_gt_handle_visibility, 616 | cls_pred_class_ids, cls_pred_sRT, synset_names) 617 | pose_cls_gt_match, pose_cls_pred_match = compute_RT_matches(RT_overlaps, cls_pred_class_ids, cls_gt_class_ids, 618 | degree_thres_list, shift_thres_list) 619 | num_pred = pose_cls_pred_match.shape[2] 620 | pred_start = pose_pred_count[cls_id] 621 | pred_end = pred_start + num_pred 622 | pose_pred_count[cls_id] = pred_end 623 | pose_pred_matches_all[cls_id][:, :, pred_start:pred_end] = pose_cls_pred_match 624 | cls_pred_scores_tile = np.tile(cls_pred_scores, (num_degree_thres, num_shift_thres, 1)) 625 | assert cls_pred_scores_tile.shape[2] == num_pred 626 | pose_pred_scores_all[cls_id][:, :, pred_start:pred_end] = cls_pred_scores_tile 627 | num_gt = pose_cls_gt_match.shape[2] 628 | gt_start = pose_gt_count[cls_id] 629 | gt_end = gt_start + num_gt 630 | pose_gt_count[cls_id] = gt_end 631 | pose_gt_matches_all[cls_id][:, :, gt_start:gt_end] = pose_cls_gt_match 632 | 633 | # trim zeros 634 | for cls_id in range(num_classes): 635 | # IoU 636 | iou_pred_matches_all[cls_id] = iou_pred_matches_all[cls_id][:, :iou_pred_count[cls_id]] 637 | iou_pred_scores_all[cls_id] = iou_pred_scores_all[cls_id][:, :iou_pred_count[cls_id]] 638 | iou_gt_matches_all[cls_id] = iou_gt_matches_all[cls_id][:, :iou_gt_count[cls_id]] 639 | # pose 640 | pose_pred_matches_all[cls_id] = pose_pred_matches_all[cls_id][:, :, :pose_pred_count[cls_id]] 641 | 
pose_pred_scores_all[cls_id] = pose_pred_scores_all[cls_id][:, :, :pose_pred_count[cls_id]] 642 | pose_gt_matches_all[cls_id] = pose_gt_matches_all[cls_id][:, :, :pose_gt_count[cls_id]] 643 | 644 | # compute 3D IoU mAP 645 | for cls_id in range(1, num_classes): 646 | for s, iou_thres in enumerate(iou_thres_list): 647 | iou_aps[cls_id, s], iou_acc[cls_id, s] = compute_ap_and_acc(iou_pred_matches_all[cls_id][s, :], 648 | iou_pred_scores_all[cls_id][s, :], 649 | iou_gt_matches_all[cls_id][s, :]) 650 | iou_aps[-1, :] = np.mean(iou_aps[1:-1, :], axis=0) 651 | iou_acc[-1, :] = np.mean(iou_acc[1:-1, :], axis=0) 652 | # compute pose mAP 653 | for i, degree_thres in enumerate(degree_thres_list): 654 | for j, shift_thres in enumerate(shift_thres_list): 655 | for cls_id in range(1, num_classes): 656 | cls_pose_pred_matches_all = pose_pred_matches_all[cls_id][i, j, :] 657 | cls_pose_gt_matches_all = pose_gt_matches_all[cls_id][i, j, :] 658 | cls_pose_pred_scores_all = pose_pred_scores_all[cls_id][i, j, :] 659 | pose_aps[cls_id, i, j], pose_acc[cls_id, i, j] = compute_ap_and_acc(cls_pose_pred_matches_all, 660 | cls_pose_pred_scores_all, 661 | cls_pose_gt_matches_all) 662 | pose_aps[-1, i, j] = np.mean(pose_aps[1:-1, i, j]) 663 | pose_acc[-1, i, j] = np.mean(pose_acc[1:-1, i, j]) 664 | 665 | # save results to pkl 666 | result_dict = {} 667 | result_dict['iou_thres_list'] = iou_thres_list 668 | result_dict['degree_thres_list'] = degree_thres_list 669 | result_dict['shift_thres_list'] = shift_thres_list 670 | result_dict['iou_aps'] = iou_aps 671 | result_dict['pose_aps'] = pose_aps 672 | result_dict['iou_acc'] = iou_acc 673 | result_dict['pose_acc'] = pose_acc 674 | pkl_path = os.path.join(out_dir, 'mAP_Acc.pkl') 675 | with open(pkl_path, 'wb') as f: 676 | cPickle.dump(result_dict, f) 677 | return iou_aps, pose_aps, iou_acc, pose_acc 678 | 679 | 680 | def plot_mAP(iou_aps, pose_aps, out_dir, iou_thres_list, degree_thres_list, shift_thres_list): 681 | """ Draw iou 3d AP vs. iou thresholds. 
682 | """ 683 | 684 | labels = ['bottle', 'bowl', 'camera', 'can', 'laptop', 'mug', 'mean', 'nocs'] 685 | colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:pink', 'tab:olive', 'tab:purple', 'tab:red', 'tab:gray'] 686 | styles = ['-', '-', '-', '-', '-', '-', '--', ':'] 687 | 688 | fig, (ax_iou, ax_degree, ax_shift) = plt.subplots(1, 3, figsize=(8, 3.5)) 689 | # IoU subplot 690 | ax_iou.set_title('3D IoU', fontsize=10) 691 | ax_iou.set_ylabel('Average Precision') 692 | ax_iou.set_ylim(0, 100) 693 | ax_iou.set_xlabel('Percent') 694 | ax_iou.set_xlim(0, 100) 695 | ax_iou.xaxis.set_ticks([0, 25, 50, 75, 100]) 696 | ax_iou.grid() 697 | for i in range(1, iou_aps.shape[0]): 698 | ax_iou.plot(100*np.array(iou_thres_list), 100*iou_aps[i, :], 699 | color=colors[i-1], linestyle=styles[i-1], label=labels[i-1]) 700 | # rotation subplot 701 | ax_degree.set_title('Rotation', fontsize=10) 702 | ax_degree.set_ylim(0, 100) 703 | ax_degree.yaxis.set_ticklabels([]) 704 | ax_degree.set_xlabel('Degree') 705 | ax_degree.set_xlim(0, 60) 706 | ax_degree.xaxis.set_ticks([0, 20, 40, 60]) 707 | ax_degree.grid() 708 | for i in range(1, pose_aps.shape[0]): 709 | ax_degree.plot(np.array(degree_thres_list), 100*pose_aps[i, :len(degree_thres_list), -1], 710 | color=colors[i-1], linestyle=styles[i-1], label=labels[i-1]) 711 | # translation subplot 712 | ax_shift.set_title('Translation', fontsize=10) 713 | ax_shift.set_ylim(0, 100) 714 | ax_shift.yaxis.set_ticklabels([]) 715 | ax_shift.set_xlabel('Centimeter') 716 | ax_shift.set_xlim(0, 10) 717 | ax_shift.xaxis.set_ticks([0, 5, 10]) 718 | ax_shift.grid() 719 | for i in range(1, pose_aps.shape[0]): 720 | ax_shift.plot(np.array(shift_thres_list), 100*pose_aps[i, -1, :len(shift_thres_list)], 721 | color=colors[i-1], linestyle=styles[i-1], label=labels[i-1]) 722 | ax_shift.legend(loc='lower right', fontsize='small') 723 | plt.tight_layout() 724 | # plt.show() 725 | plt.savefig(os.path.join(out_dir, 'mAP.png')) 726 | plt.close(fig) 727 | return 728 | 729 | 730 | def calculate_2d_projections(coordinates_3d, intrinsics): 731 | """ 732 | Args: 733 | coordinates_3d: [3, N] 734 | intrinsics: [3, 3] 735 | 736 | Returns: 737 | projected_coordinates: [N, 2] 738 | """ 739 | projected_coordinates = intrinsics @ coordinates_3d 740 | projected_coordinates = projected_coordinates[:2, :] / projected_coordinates[2, :] 741 | projected_coordinates = projected_coordinates.transpose() 742 | projected_coordinates = np.array(projected_coordinates, dtype=np.int32) 743 | 744 | return projected_coordinates 745 | 746 | 747 | def align_rotation(sRT): 748 | """ Align rotations for symmetric objects. 
749 | Args: 750 | sRT: 4 x 4 751 | """ 752 | s = np.cbrt(np.linalg.det(sRT[:3, :3])) 753 | R = sRT[:3, :3] / s 754 | T = sRT[:3, 3] 755 | 756 | theta_x = R[0, 0] + R[2, 2] 757 | theta_y = R[0, 2] - R[2, 0] 758 | r_norm = math.sqrt(theta_x**2 + theta_y**2) 759 | s_map = np.array([[theta_x/r_norm, 0.0, -theta_y/r_norm], 760 | [0.0, 1.0, 0.0 ], 761 | [theta_y/r_norm, 0.0, theta_x/r_norm]]) 762 | rotation = R @ s_map 763 | aligned_sRT = np.identity(4, dtype=np.float32) 764 | aligned_sRT[:3, :3] = s * rotation 765 | aligned_sRT[:3, 3] = T 766 | return aligned_sRT 767 | 768 | 769 | def draw_bboxes(img, img_pts, color): 770 | img_pts = np.int32(img_pts).reshape(-1, 2) 771 | # draw ground layer in darker color 772 | color_ground = (int(color[0]*0.3), int(color[1]*0.3), int(color[2]*0.3)) 773 | for i, j in zip([4, 5, 6, 7], [5, 7, 4, 6]): 774 | img = cv2.line(img, tuple(img_pts[i]), tuple(img_pts[j]), color_ground, 2) 775 | # draw pillars in minor darker color 776 | color_pillar = (int(color[0]*0.6), int(color[1]*0.6), int(color[2]*0.6)) 777 | for i, j in zip(range(4), range(4, 8)): 778 | img = cv2.line(img, tuple(img_pts[i]), tuple(img_pts[j]), color_pillar, 2) 779 | # draw top layer in original color 780 | for i, j in zip([0, 1, 2, 3], [1, 3, 0, 2]): 781 | img = cv2.line(img, tuple(img_pts[i]), tuple(img_pts[j]), color, 2) 782 | 783 | return img 784 | 785 | 786 | def draw_detections(img, out_dir, data_name, img_id, intrinsics, pred_sRT, pred_size, pred_class_ids, 787 | gt_sRT, gt_size, gt_class_ids, nocs_sRT, nocs_size, nocs_class_ids, draw_gt=True, draw_nocs=True): 788 | """ Visualize pose predictions. 789 | """ 790 | out_path = os.path.join(out_dir, '{}_{}_pred.png'.format(data_name, img_id)) 791 | 792 | # draw nocs results - BLUE color 793 | if draw_nocs: 794 | for i in range(nocs_sRT.shape[0]): 795 | if nocs_class_ids[i] in [1, 2, 4]: 796 | sRT = align_rotation(nocs_sRT[i, :, :]) 797 | else: 798 | sRT = nocs_sRT[i, :, :] 799 | bbox_3d = get_3d_bbox(nocs_size[i, :], 0) 800 | transformed_bbox_3d = transform_coordinates_3d(bbox_3d, sRT) 801 | projected_bbox = calculate_2d_projections(transformed_bbox_3d, intrinsics) 802 | img = draw_bboxes(img, projected_bbox, (255, 0, 0)) 803 | # darw ground truth - GREEN color 804 | if draw_gt: 805 | for i in range(gt_sRT.shape[0]): 806 | if gt_class_ids[i] in [1, 2, 4]: 807 | sRT = align_rotation(gt_sRT[i, :, :]) 808 | else: 809 | sRT = gt_sRT[i, :, :] 810 | bbox_3d = get_3d_bbox(gt_size[i, :], 0) 811 | transformed_bbox_3d = transform_coordinates_3d(bbox_3d, sRT) 812 | projected_bbox = calculate_2d_projections(transformed_bbox_3d, intrinsics) 813 | img = draw_bboxes(img, projected_bbox, (0, 255, 0)) 814 | # darw prediction - RED color 815 | for i in range(pred_sRT.shape[0]): 816 | if pred_class_ids[i] in [1, 2, 4]: 817 | sRT = align_rotation(pred_sRT[i, :, :]) 818 | else: 819 | sRT = pred_sRT[i, :, :] 820 | bbox_3d = get_3d_bbox(pred_size[i, :], 0) 821 | transformed_bbox_3d = transform_coordinates_3d(bbox_3d, sRT) 822 | projected_bbox = calculate_2d_projections(transformed_bbox_3d, intrinsics) 823 | img = draw_bboxes(img, projected_bbox, (0, 0, 255)) 824 | 825 | cv2.imwrite(out_path, img) 826 | # cv2.imshow('vis', img) 827 | # cv2.waitKey(0) 828 | -------------------------------------------------------------------------------- /mean_shape.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import numpy as np 4 | import open3d as o3d 5 | import matplotlib.pyplot as plt 6 | import torch 7 | 
from lib.auto_encoder import PointCloudAE 8 | from data.shape_dataset import ShapeDataset 9 | from tools.tsne import tsne 10 | 11 | 12 | def visualize_shape(name, shape_list, result_dir): 13 | """ Visualization and save image. 14 | 15 | Args: 16 | name: window name 17 | shape: list of geoemtries 18 | 19 | """ 20 | vis = o3d.visualization.Visualizer() 21 | vis.create_window(window_name=name, width=512, height=512, left=50, top=25) 22 | for shape in shape_list: 23 | vis.add_geometry(shape) 24 | ctr = vis.get_view_control() 25 | ctr.rotate(-300.0, 150.0) 26 | if name == 'camera': 27 | ctr.translate(20.0, -20.0) # (horizontal right +, vertical down +) 28 | if name == 'laptop': 29 | ctr.translate(25.0, -60.0) 30 | vis.run() 31 | vis.capture_screen_image(os.path.join(result_dir, name+'.png'), False) 32 | vis.destroy_window() 33 | 34 | 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument('--h5_file', type=str, default='data/obj_models/ShapeNetCore_2048.h5', help='h5py file') 37 | parser.add_argument('--model', type=str, default='results/ae_points/model_50.pth', help='resume model') 38 | parser.add_argument('--result_dir', type=str, default='results/ae_points', help='directory to save mean shapes') 39 | parser.add_argument('--gpu', type=str, default='0', help='GPU to use') 40 | opt = parser.parse_args() 41 | 42 | opt.emb_dim = 512 43 | opt.n_cat = 6 44 | opt.n_pts = 1024 45 | 46 | os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu 47 | 48 | estimator = PointCloudAE(opt.emb_dim, opt.n_pts) 49 | estimator.cuda() 50 | estimator.load_state_dict(torch.load(opt.model)) 51 | estimator.eval() 52 | train_dataset = ShapeDataset(opt.h5_file, mode='train', augment=False) 53 | train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=1, shuffle=False, num_workers=0) 54 | 55 | obj_models = [] 56 | embedding = [] 57 | catId = [] # zero-indexed 58 | for i, data in enumerate(train_dataloader): 59 | batch_xyz, batch_label = data 60 | batch_xyz = batch_xyz[:, :, :3].cuda() 61 | batch_label = batch_label.cuda() 62 | emb, pred_points = estimator(batch_xyz) 63 | emb = emb.cpu().detach().numpy() 64 | inst_shape = batch_xyz.cpu().numpy() 65 | label = batch_label.cpu().numpy() 66 | embedding.append(emb) 67 | obj_models.append(inst_shape) 68 | catId.append(label) 69 | 70 | embedding = np.squeeze(np.array(embedding).astype(np.float64), axis=1) 71 | catId = np.squeeze((np.array(catId)), axis=1) 72 | obj_models = np.squeeze(np.array(obj_models), axis=1) 73 | 74 | # enbedding visualization 75 | Y = tsne(embedding, 2, 50, 30.0) 76 | y_bottle = Y[np.where(catId == 0)[0], :] 77 | s_bottle = plt.scatter(y_bottle[:, 0], y_bottle[:, 1], s=20, marker='o', c='tab:orange') 78 | y_bowl = Y[np.where(catId == 1)[0], :] 79 | s_bowl = plt.scatter(y_bowl[:, 0], y_bowl[:, 1], s=20, marker='^', c='tab:blue') 80 | y_camera = Y[np.where(catId == 2)[0], :] 81 | s_camera = plt.scatter(y_camera[:, 0], y_camera[:, 1], s=20, marker='s', c='tab:olive') 82 | y_can = Y[np.where(catId == 3)[0], :] 83 | s_can = plt.scatter(y_can[:, 0], y_can[:, 1], s=20, marker='d', c='tab:gray') 84 | y_laptop = Y[np.where(catId == 4)[0], :] 85 | s_laptop = plt.scatter(y_laptop[:, 0], y_laptop[:, 1], s=20, marker='P', c='tab:purple') 86 | y_mug = Y[np.where(catId == 5)[0], :] 87 | s_mug = plt.scatter(y_mug[:, 0], y_mug[:, 1], s=20, marker='v', c='tab:brown') 88 | plt.legend((s_bottle, s_bowl, s_camera, s_can, s_laptop, s_mug), 89 | ('bottle', 'bowl', 'camera', 'can', 'laptop', 'mug'), 90 | loc='best', ncol=1, fontsize=8, frameon=False) 91 | 
plt.xticks([]) 92 | plt.yticks([]) 93 | plt.savefig(os.path.join(opt.result_dir, 'visual_embedding.png'), bbox_inches='tight') 94 | 95 | # mean embedding and mean shape 96 | mean_emb = np.empty((opt.n_cat, opt.emb_dim), dtype=np.float) 97 | catId_to_name = {0: 'bottle', 1: 'bowl', 2: 'camera', 3: 'can', 4: 'laptop', 5: 'mug'} 98 | mean_points = np.empty((opt.n_cat, opt.n_pts, 3), dtype=np.float) 99 | for i in range(opt.n_cat): 100 | mean = np.mean(embedding[np.where(catId==i)[0], :], axis=0, keepdims=False) 101 | mean_emb[i] = mean 102 | assigned_emb = torch.cuda.FloatTensor(mean[None, :]) 103 | _, mean_shape = estimator(None, assigned_emb) 104 | mean_shape = mean_shape.cpu().detach().numpy()[0] 105 | mean_points[i] = mean_shape 106 | # save point cloud and visualize 107 | pcd = o3d.geometry.PointCloud() 108 | pcd.points = o3d.utility.Vector3dVector(mean_shape) 109 | visualize_shape(catId_to_name[i], [pcd], opt.result_dir) 110 | # save results 111 | np.save(os.path.join(opt.result_dir, 'mean_embedding'), mean_emb) 112 | np.save(os.path.join(opt.result_dir, 'mean_points_emb'), mean_points) 113 | -------------------------------------------------------------------------------- /preprocess/pose_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import glob 4 | import cv2 5 | import numpy as np 6 | import _pickle as cPickle 7 | from tqdm import tqdm 8 | sys.path.append('../lib') 9 | from align import align_nocs_to_depth 10 | from utils import load_depth 11 | 12 | 13 | def create_img_list(data_dir): 14 | """ Create train/val/test data list for CAMERA and Real. """ 15 | # CAMERA dataset 16 | for subset in ['train', 'val']: 17 | img_list = [] 18 | img_dir = os.path.join(data_dir, 'CAMERA', subset) 19 | folder_list = [name for name in os.listdir(img_dir) if os.path.isdir(os.path.join(img_dir, name))] 20 | for i in range(10*len(folder_list)): 21 | folder_id = int(i) // 10 22 | img_id = int(i) % 10 23 | img_path = os.path.join(subset, '{:05d}'.format(folder_id), '{:04d}'.format(img_id)) 24 | img_list.append(img_path) 25 | with open(os.path.join(data_dir, 'CAMERA', subset+'_list_all.txt'), 'w') as f: 26 | for img_path in img_list: 27 | f.write("%s\n" % img_path) 28 | # Real dataset 29 | for subset in ['train', 'test']: 30 | img_list = [] 31 | img_dir = os.path.join(data_dir, 'Real', subset) 32 | folder_list = [name for name in sorted(os.listdir(img_dir)) if os.path.isdir(os.path.join(img_dir, name))] 33 | for folder in folder_list: 34 | img_paths = glob.glob(os.path.join(img_dir, folder, '*_color.png')) 35 | img_paths = sorted(img_paths) 36 | for img_full_path in img_paths: 37 | img_name = os.path.basename(img_full_path) 38 | img_ind = img_name.split('_')[0] 39 | img_path = os.path.join(subset, folder, img_ind) 40 | img_list.append(img_path) 41 | with open(os.path.join(data_dir, 'Real', subset+'_list_all.txt'), 'w') as f: 42 | for img_path in img_list: 43 | f.write("%s\n" % img_path) 44 | print('Write all data paths to file done!') 45 | 46 | 47 | def process_data(img_path, depth): 48 | """ Load instance masks for the objects in the image. 
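Example (a minimal sketch; the image prefix is a placeholder):
    depth = load_depth('Real/train/scene_1/0000')
    masks, coords, class_ids, instance_ids, model_list, bboxes = \
        process_data('Real/train/scene_1/0000', depth)
    # masks: (H, W, num_inst) uint8, coords: (H, W, num_inst, 3) NOCS map in [0, 1],
    # bboxes: rows of [y1, x1, y2, x2]; all outputs are None if no valid instance remains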
""" 49 | mask_path = img_path + '_mask.png' 50 | mask = cv2.imread(mask_path)[:, :, 2] 51 | mask = np.array(mask, dtype=np.int32) 52 | all_inst_ids = sorted(list(np.unique(mask))) 53 | assert all_inst_ids[-1] == 255 54 | del all_inst_ids[-1] # remove background 55 | num_all_inst = len(all_inst_ids) 56 | h, w = mask.shape 57 | 58 | coord_path = img_path + '_coord.png' 59 | coord_map = cv2.imread(coord_path)[:, :, :3] 60 | coord_map = coord_map[:, :, (2, 1, 0)] 61 | # flip z axis of coord map 62 | coord_map = np.array(coord_map, dtype=np.float32) / 255 63 | coord_map[:, :, 2] = 1 - coord_map[:, :, 2] 64 | 65 | class_ids = [] 66 | instance_ids = [] 67 | model_list = [] 68 | masks = np.zeros([h, w, num_all_inst], dtype=np.uint8) 69 | coords = np.zeros((h, w, num_all_inst, 3), dtype=np.float32) 70 | bboxes = np.zeros((num_all_inst, 4), dtype=np.int32) 71 | 72 | meta_path = img_path + '_meta.txt' 73 | with open(meta_path, 'r') as f: 74 | i = 0 75 | for line in f: 76 | line_info = line.strip().split(' ') 77 | inst_id = int(line_info[0]) 78 | cls_id = int(line_info[1]) 79 | # background objects and non-existing objects 80 | if cls_id == 0 or (inst_id not in all_inst_ids): 81 | continue 82 | if len(line_info) == 3: 83 | model_id = line_info[2] # Real scanned objs 84 | else: 85 | model_id = line_info[3] # CAMERA objs 86 | # remove one mug instance in CAMERA train due to improper model 87 | if model_id == 'b9be7cfe653740eb7633a2dd89cec754': 88 | continue 89 | # process foreground objects 90 | inst_mask = np.equal(mask, inst_id) 91 | # bounding box 92 | horizontal_indicies = np.where(np.any(inst_mask, axis=0))[0] 93 | vertical_indicies = np.where(np.any(inst_mask, axis=1))[0] 94 | assert horizontal_indicies.shape[0], print(img_path) 95 | x1, x2 = horizontal_indicies[[0, -1]] 96 | y1, y2 = vertical_indicies[[0, -1]] 97 | # x2 and y2 should not be part of the box. Increment by 1. 98 | x2 += 1 99 | y2 += 1 100 | # object occupies full image, rendering error, happens in CAMERA dataset 101 | if np.any(np.logical_or((x2-x1) > 600, (y2-y1) > 440)): 102 | return None, None, None, None, None, None 103 | # not enough valid depth observation 104 | final_mask = np.logical_and(inst_mask, depth > 0) 105 | if np.sum(final_mask) < 64: 106 | continue 107 | class_ids.append(cls_id) 108 | instance_ids.append(inst_id) 109 | model_list.append(model_id) 110 | masks[:, :, i] = inst_mask 111 | coords[:, :, i, :] = np.multiply(coord_map, np.expand_dims(inst_mask, axis=-1)) 112 | bboxes[i] = np.array([y1, x1, y2, x2]) 113 | i += 1 114 | # no valid foreground objects 115 | if i == 0: 116 | return None, None, None, None, None, None 117 | 118 | masks = masks[:, :, :i] 119 | coords = np.clip(coords[:, :, :i, :], 0, 1) 120 | bboxes = bboxes[:i, :] 121 | 122 | return masks, coords, class_ids, instance_ids, model_list, bboxes 123 | 124 | 125 | def annotate_camera_train(data_dir): 126 | """ Generate gt labels for CAMERA train data. 
""" 127 | camera_train = open(os.path.join(data_dir, 'CAMERA', 'train_list_all.txt')).read().splitlines() 128 | intrinsics = np.array([[577.5, 0, 319.5], [0, 577.5, 239.5], [0, 0, 1]]) 129 | # meta info for re-label mug category 130 | with open(os.path.join(data_dir, 'obj_models/mug_meta.pkl'), 'rb') as f: 131 | mug_meta = cPickle.load(f) 132 | 133 | valid_img_list = [] 134 | for img_path in tqdm(camera_train): 135 | img_full_path = os.path.join(data_dir, 'CAMERA', img_path) 136 | all_exist = os.path.exists(img_full_path + '_color.png') and \ 137 | os.path.exists(img_full_path + '_coord.png') and \ 138 | os.path.exists(img_full_path + '_depth.png') and \ 139 | os.path.exists(img_full_path + '_mask.png') and \ 140 | os.path.exists(img_full_path + '_meta.txt') 141 | if not all_exist: 142 | continue 143 | depth = load_depth(img_full_path) 144 | masks, coords, class_ids, instance_ids, model_list, bboxes = process_data(img_full_path, depth) 145 | if instance_ids is None: 146 | continue 147 | # Umeyama alignment of GT NOCS map with depth image 148 | scales, rotations, translations, error_messages, _ = \ 149 | align_nocs_to_depth(masks, coords, depth, intrinsics, instance_ids, img_path) 150 | if error_messages: 151 | continue 152 | # re-label for mug category 153 | for i in range(len(class_ids)): 154 | if class_ids[i] == 6: 155 | T0 = mug_meta[model_list[i]][0] 156 | s0 = mug_meta[model_list[i]][1] 157 | T = translations[i] - scales[i] * rotations[i] @ T0 158 | s = scales[i] / s0 159 | scales[i] = s 160 | translations[i] = T 161 | # write results 162 | gts = {} 163 | gts['class_ids'] = class_ids # int list, 1 to 6 164 | gts['bboxes'] = bboxes # np.array, [[y1, x1, y2, x2], ...] 165 | gts['scales'] = scales.astype(np.float32) # np.array, scale factor from NOCS model to depth observation 166 | gts['rotations'] = rotations.astype(np.float32) # np.array, R 167 | gts['translations'] = translations.astype(np.float32) # np.array, T 168 | gts['instance_ids'] = instance_ids # int list, start from 1 169 | gts['model_list'] = model_list # str list, model id/name 170 | with open(img_full_path + '_label.pkl', 'wb') as f: 171 | cPickle.dump(gts, f) 172 | valid_img_list.append(img_path) 173 | # write valid img list to file 174 | with open(os.path.join(data_dir, 'CAMERA/train_list.txt'), 'w') as f: 175 | for img_path in valid_img_list: 176 | f.write("%s\n" % img_path) 177 | 178 | 179 | def annotate_real_train(data_dir): 180 | """ Generate gt labels for Real train data through PnP. 
""" 181 | real_train = open(os.path.join(data_dir, 'Real/train_list_all.txt')).read().splitlines() 182 | intrinsics = np.array([[591.0125, 0, 322.525], [0, 590.16775, 244.11084], [0, 0, 1]]) 183 | # scale factors for all instances 184 | scale_factors = {} 185 | path_to_size = glob.glob(os.path.join(data_dir, 'obj_models/real_train', '*_norm.txt')) 186 | for inst_path in sorted(path_to_size): 187 | instance = os.path.basename(inst_path).split('.')[0] 188 | bbox_dims = np.loadtxt(inst_path) 189 | scale_factors[instance] = np.linalg.norm(bbox_dims) 190 | # meta info for re-label mug category 191 | with open(os.path.join(data_dir, 'obj_models/mug_meta.pkl'), 'rb') as f: 192 | mug_meta = cPickle.load(f) 193 | 194 | valid_img_list = [] 195 | for img_path in tqdm(real_train): 196 | img_full_path = os.path.join(data_dir, 'Real', img_path) 197 | all_exist = os.path.exists(img_full_path + '_color.png') and \ 198 | os.path.exists(img_full_path + '_coord.png') and \ 199 | os.path.exists(img_full_path + '_depth.png') and \ 200 | os.path.exists(img_full_path + '_mask.png') and \ 201 | os.path.exists(img_full_path + '_meta.txt') 202 | if not all_exist: 203 | continue 204 | depth = load_depth(img_full_path) 205 | masks, coords, class_ids, instance_ids, model_list, bboxes = process_data(img_full_path, depth) 206 | if instance_ids is None: 207 | continue 208 | # compute pose 209 | num_insts = len(class_ids) 210 | scales = np.zeros(num_insts) 211 | rotations = np.zeros((num_insts, 3, 3)) 212 | translations = np.zeros((num_insts, 3)) 213 | for i in range(num_insts): 214 | s = scale_factors[model_list[i]] 215 | mask = masks[:, :, i] 216 | idxs = np.where(mask) 217 | coord = coords[:, :, i, :] 218 | coord_pts = s * (coord[idxs[0], idxs[1], :] - 0.5) 219 | coord_pts = coord_pts[:, :, None] 220 | img_pts = np.array([idxs[1], idxs[0]]).transpose() 221 | img_pts = img_pts[:, :, None].astype(float) 222 | distCoeffs = np.zeros((4, 1)) # no distoration 223 | retval, rvec, tvec = cv2.solvePnP(coord_pts, img_pts, intrinsics, distCoeffs) 224 | assert retval 225 | R, _ = cv2.Rodrigues(rvec) 226 | T = np.squeeze(tvec) 227 | # re-label for mug category 228 | if class_ids[i] == 6: 229 | T0 = mug_meta[model_list[i]][0] 230 | s0 = mug_meta[model_list[i]][1] 231 | T = T - s * R @ T0 232 | s = s / s0 233 | scales[i] = s 234 | rotations[i] = R 235 | translations[i] = T 236 | # write results 237 | gts = {} 238 | gts['class_ids'] = class_ids # int list, 1 to 6 239 | gts['bboxes'] = bboxes # np.array, [[y1, x1, y2, x2], ...] 240 | gts['scales'] = scales.astype(np.float32) # np.array, scale factor from NOCS model to depth observation 241 | gts['rotations'] = rotations.astype(np.float32) # np.array, R 242 | gts['translations'] = translations.astype(np.float32) # np.array, T 243 | gts['instance_ids'] = instance_ids # int list, start from 1 244 | gts['model_list'] = model_list # str list, model id/name 245 | with open(img_full_path + '_label.pkl', 'wb') as f: 246 | cPickle.dump(gts, f) 247 | valid_img_list.append(img_path) 248 | # write valid img list to file 249 | with open(os.path.join(data_dir, 'Real/train_list.txt'), 'w') as f: 250 | for img_path in valid_img_list: 251 | f.write("%s\n" % img_path) 252 | 253 | 254 | def annotate_test_data(data_dir): 255 | """ Generate gt labels for test data. 256 | Properly copy handle_visibility provided by NOCS gts. 
257 | """ 258 | # Statistics: 259 | # test_set missing file bad rendering no (occluded) fg occlusion (< 64 pts) 260 | # val 3792 imgs 132 imgs 1856 (23) imgs 50 insts 261 | # test 0 img 0 img 0 img 2 insts 262 | 263 | camera_val = open(os.path.join(data_dir, 'CAMERA', 'val_list_all.txt')).read().splitlines() 264 | real_test = open(os.path.join(data_dir, 'Real', 'test_list_all.txt')).read().splitlines() 265 | camera_intrinsics = np.array([[577.5, 0, 319.5], [0, 577.5, 239.5], [0, 0, 1]]) 266 | real_intrinsics = np.array([[591.0125, 0, 322.525], [0, 590.16775, 244.11084], [0, 0, 1]]) 267 | # compute model size 268 | model_file_path = ['obj_models/camera_val.pkl', 'obj_models/real_test.pkl'] 269 | models = {} 270 | for path in model_file_path: 271 | with open(os.path.join(data_dir, path), 'rb') as f: 272 | models.update(cPickle.load(f)) 273 | model_sizes = {} 274 | for key in models.keys(): 275 | model_sizes[key] = 2 * np.amax(np.abs(models[key]), axis=0) 276 | # meta info for re-label mug category 277 | with open(os.path.join(data_dir, 'obj_models/mug_meta.pkl'), 'rb') as f: 278 | mug_meta = cPickle.load(f) 279 | 280 | subset_meta = [('CAMERA', camera_val, camera_intrinsics, 'val'), ('Real', real_test, real_intrinsics, 'test')] 281 | for source, img_list, intrinsics, subset in subset_meta: 282 | valid_img_list = [] 283 | for img_path in tqdm(img_list): 284 | img_full_path = os.path.join(data_dir, source, img_path) 285 | all_exist = os.path.exists(img_full_path + '_color.png') and \ 286 | os.path.exists(img_full_path + '_coord.png') and \ 287 | os.path.exists(img_full_path + '_depth.png') and \ 288 | os.path.exists(img_full_path + '_mask.png') and \ 289 | os.path.exists(img_full_path + '_meta.txt') 290 | if not all_exist: 291 | continue 292 | depth = load_depth(img_full_path) 293 | masks, coords, class_ids, instance_ids, model_list, bboxes = process_data(img_full_path, depth) 294 | if instance_ids is None: 295 | continue 296 | num_insts = len(instance_ids) 297 | # match each instance with NOCS ground truth to properly assign gt_handle_visibility 298 | nocs_dir = os.path.join(os.path.dirname(data_dir), 'results/nocs_results') 299 | if source == 'CAMERA': 300 | nocs_path = os.path.join(nocs_dir, 'val', 'results_val_{}_{}.pkl'.format( 301 | img_path.split('/')[-2], img_path.split('/')[-1])) 302 | else: 303 | nocs_path = os.path.join(nocs_dir, 'real_test', 'results_test_{}_{}.pkl'.format( 304 | img_path.split('/')[-2], img_path.split('/')[-1])) 305 | with open(nocs_path, 'rb') as f: 306 | nocs = cPickle.load(f) 307 | gt_class_ids = nocs['gt_class_ids'] 308 | gt_bboxes = nocs['gt_bboxes'] 309 | gt_sRT = nocs['gt_RTs'] 310 | gt_handle_visibility = nocs['gt_handle_visibility'] 311 | map_to_nocs = [] 312 | for i in range(num_insts): 313 | gt_match = -1 314 | for j in range(len(gt_class_ids)): 315 | if gt_class_ids[j] != class_ids[i]: 316 | continue 317 | if np.sum(np.abs(bboxes[i] - gt_bboxes[j])) > 5: 318 | continue 319 | # match found 320 | gt_match = j 321 | break 322 | # check match validity 323 | assert gt_match > -1, print(img_path, instance_ids[i], 'no match for instance') 324 | assert gt_match not in map_to_nocs, print(img_path, instance_ids[i], 'duplicate match') 325 | map_to_nocs.append(gt_match) 326 | # copy from ground truth, re-label for mug category 327 | handle_visibility = gt_handle_visibility[map_to_nocs] 328 | sizes = np.zeros((num_insts, 3)) 329 | poses = np.zeros((num_insts, 4, 4)) 330 | scales = np.zeros(num_insts) 331 | rotations = np.zeros((num_insts, 3, 3)) 332 | translations 
= np.zeros((num_insts, 3)) 333 | for i in range(num_insts): 334 | gt_idx = map_to_nocs[i] 335 | sizes[i] = model_sizes[model_list[i]] 336 | sRT = gt_sRT[gt_idx] 337 | s = np.cbrt(np.linalg.det(sRT[:3, :3])) 338 | R = sRT[:3, :3] / s 339 | T = sRT[:3, 3] 340 | # re-label mug category 341 | if class_ids[i] == 6: 342 | T0 = mug_meta[model_list[i]][0] 343 | s0 = mug_meta[model_list[i]][1] 344 | T = T - s * R @ T0 345 | s = s / s0 346 | # used for test during training 347 | scales[i] = s 348 | rotations[i] = R 349 | translations[i] = T 350 | # used for evaluation 351 | sRT = np.identity(4, dtype=np.float32) 352 | sRT[:3, :3] = s * R 353 | sRT[:3, 3] = T 354 | poses[i] = sRT 355 | # write results 356 | gts = {} 357 | gts['class_ids'] = np.array(class_ids) # int list, 1 to 6 358 | gts['bboxes'] = bboxes # np.array, [[y1, x1, y2, x2], ...] 359 | gts['instance_ids'] = instance_ids # int list, start from 1 360 | gts['model_list'] = model_list # str list, model id/name 361 | gts['size'] = sizes # 3D size of NOCS model 362 | gts['scales'] = scales.astype(np.float32) # np.array, scale factor from NOCS model to depth observation 363 | gts['rotations'] = rotations.astype(np.float32) # np.array, R 364 | gts['translations'] = translations.astype(np.float32) # np.array, T 365 | gts['poses'] = poses.astype(np.float32) # np.array 366 | gts['handle_visibility'] = handle_visibility # handle visibility of mug 367 | with open(img_full_path + '_label.pkl', 'wb') as f: 368 | cPickle.dump(gts, f) 369 | valid_img_list.append(img_path) 370 | # write valid img list to file 371 | with open(os.path.join(data_dir, source, subset+'_list.txt'), 'w') as f: 372 | for img_path in valid_img_list: 373 | f.write("%s\n" % img_path) 374 | 375 | 376 | if __name__ == '__main__': 377 | data_dir = '/home/tianmeng/Documents/pose_ws/object-deformnet/data' 378 | # create list for all data 379 | create_img_list(data_dir) 380 | # annotate dataset and re-write valid data to list 381 | annotate_camera_train(data_dir) 382 | annotate_real_train(data_dir) 383 | annotate_test_data(data_dir) 384 | -------------------------------------------------------------------------------- /preprocess/shape_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import h5py 4 | import glob 5 | import numpy as np 6 | import _pickle as cPickle 7 | sys.path.append('../lib') 8 | from utils import sample_points_from_mesh 9 | 10 | 11 | def save_nocs_model_to_file(obj_model_dir): 12 | """ Sampling points from mesh model and normalize to NOCS. 13 | Models are centered at origin, i.e. 
NOCS-0.5 14 | 15 | """ 16 | mug_meta = {} 17 | # used for re-align mug category 18 | special_cases = {'3a7439cfaa9af51faf1af397e14a566d': np.array([0.115, 0.0, 0.0]), 19 | '5b0c679eb8a2156c4314179664d18101': np.array([0.083, 0.0, -0.044]), 20 | '649a51c711dc7f3b32e150233fdd42e9': np.array([0.0, 0.0, -0.017]), 21 | 'bf2b5e941b43d030138af902bc222a59': np.array([0.0534, 0.0, 0.0]), 22 | 'ca198dc3f7dc0cacec6338171298c66b': np.array([0.120, 0.0, 0.0]), 23 | 'f42a9784d165ad2f5e723252788c3d6e': np.array([0.117, 0.0, -0.026])} 24 | 25 | # CAMERA dataset 26 | for subset in ['train', 'val']: 27 | camera = {} 28 | for synsetId in ['02876657', '02880940', '02942699', '02946921', '03642806', '03797390']: 29 | synset_dir = os.path.join(obj_model_dir, subset, synsetId) 30 | inst_list = sorted(os.listdir(synset_dir)) 31 | for instance in inst_list: 32 | path_to_mesh_model = os.path.join(synset_dir, instance, 'model.obj') 33 | model_points = sample_points_from_mesh(path_to_mesh_model, 1024, fps=True, ratio=3) 34 | # flip z-axis in CAMERA 35 | model_points = model_points * np.array([[1.0, 1.0, -1.0]]) 36 | # re-align mug category 37 | if synsetId == '03797390': 38 | if instance == 'b9be7cfe653740eb7633a2dd89cec754': 39 | # skip this instance in train set, improper mug model, only influence training. 40 | continue 41 | if instance in special_cases.keys(): 42 | shift = special_cases[instance] 43 | else: 44 | shift_x = (np.amin(model_points[:, 2]) - np.amax(model_points[:, 2])) / 2 - np.amin(model_points[:, 0]) 45 | shift = np.array([shift_x, 0.0, 0.0]) 46 | model_points += shift 47 | size = 2 * np.amax(np.abs(model_points), axis=0) 48 | scale = 1 / np.linalg.norm(size) 49 | model_points *= scale 50 | mug_meta[instance] = [shift, scale] 51 | camera[instance] = model_points 52 | with open(os.path.join(obj_model_dir, 'camera_{}.pkl'.format(subset)), 'wb') as f: 53 | cPickle.dump(camera, f) 54 | # Real dataset 55 | for subset in ['real_train', 'real_test']: 56 | real = {} 57 | inst_list = glob.glob(os.path.join(obj_model_dir, subset, '*.obj')) 58 | for inst_path in inst_list: 59 | instance = os.path.basename(inst_path).split('.')[0] 60 | bbox_file = inst_path.replace('.obj', '.txt') 61 | bbox_dims = np.loadtxt(bbox_file) 62 | scale = np.linalg.norm(bbox_dims) 63 | model_points = sample_points_from_mesh(inst_path, 1024, fps=True, ratio=3) 64 | model_points /= scale 65 | # relable mug category 66 | if 'mug' in instance: 67 | shift_x = (np.amin(model_points[:, 2]) - np.amax(model_points[:, 2])) / 2 - np.amin(model_points[:, 0]) 68 | shift = np.array([shift_x, 0.0, 0.0]) 69 | model_points += shift 70 | size = 2 * np.amax(np.abs(model_points), axis=0) 71 | scale = 1 / np.linalg.norm(size) 72 | model_points *= scale 73 | mug_meta[instance] = [shift, scale] 74 | real[instance] = model_points 75 | with open(os.path.join(obj_model_dir, '{}.pkl'.format(subset)), 'wb') as f: 76 | cPickle.dump(real, f) 77 | # save mug_meta information for re-labeling 78 | with open(os.path.join(obj_model_dir, 'mug_meta.pkl'), 'wb') as f: 79 | cPickle.dump(mug_meta, f) 80 | 81 | 82 | def save_model_to_hdf5(obj_model_dir, n_points, fps=False, include_distractors=False, with_normal=False): 83 | """ Save object models (point cloud) to HDF5 file. 84 | Dataset used to train the auto-encoder. 85 | Only use models from ShapeNetCore. 86 | Background objects are not inlcuded as default. We did not observe that it helps 87 | to train the auto-encoder. 
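Example (a sketch of reading the resulting file back; n_points=2048 is assumed here):
    with h5py.File(os.path.join(obj_model_dir, 'ShapeNetCore_2048.h5'), 'r') as f:
        train_data = f['train']['data'][:]    # (num_train, n_points, 3)
        train_label = f['train']['label'][:]  # category ids 1-6 (0 would mark distractors if included)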
88 | 89 | """ 90 | catId_to_synsetId = {1: '02876657', 2: '02880940', 3: '02942699', 4: '02946921', 5: '03642806', 6: '03797390'} 91 | distractors_synsetId = ['00000000', '02954340', '02992529', '03211117'] 92 | with open(os.path.join(obj_model_dir, 'mug_meta.pkl'), 'rb') as f: 93 | mug_meta = cPickle.load(f) 94 | # read all the paths to models 95 | print('Sampling points from mesh model ...') 96 | if with_normal: 97 | train_data = np.zeros((3000, n_points, 6), dtype=np.float32) 98 | val_data = np.zeros((500, n_points, 6), dtype=np.float32) 99 | else: 100 | train_data = np.zeros((3000, n_points, 3), dtype=np.float32) 101 | val_data = np.zeros((500, n_points, 3), dtype=np.float32) 102 | train_label = [] 103 | val_label = [] 104 | train_count = 0 105 | val_count = 0 106 | # CAMERA 107 | for subset in ['train', 'val']: 108 | for catId in range(1, 7): 109 | synset_dir = os.path.join(obj_model_dir, subset, catId_to_synsetId[catId]) 110 | inst_list = sorted(os.listdir(synset_dir)) 111 | for instance in inst_list: 112 | path_to_mesh_model = os.path.join(synset_dir, instance, 'model.obj') 113 | if instance == 'b9be7cfe653740eb7633a2dd89cec754': 114 | continue 115 | model_points = sample_points_from_mesh(path_to_mesh_model, n_points, with_normal, fps=fps, ratio=2) 116 | model_points = model_points * np.array([[1.0, 1.0, -1.0]]) 117 | if catId == 6: 118 | shift = mug_meta[instance][0] 119 | scale = mug_meta[instance][1] 120 | model_points = scale * (model_points + shift) 121 | if subset == 'train': 122 | train_data[train_count] = model_points 123 | train_label.append(catId) 124 | train_count += 1 125 | else: 126 | val_data[val_count] = model_points 127 | val_label.append(catId) 128 | val_count += 1 129 | # distractors 130 | if include_distractors: 131 | for synsetId in distractors_synsetId: 132 | synset_dir = os.path.join(obj_model_dir, subset, synsetId) 133 | inst_list = sorted(os.listdir(synset_dir)) 134 | for instance in inst_list: 135 | path_to_mesh_model = os.path.join(synset_dir, instance, 'model.obj') 136 | model_points = sample_points_from_mesh(path_to_mesh_model, n_points, with_normal, fps=fps, ratio=2) 137 | # TODO: check whether need to flip z-axis, currently not used 138 | model_points = model_points * np.array([[1.0, 1.0, -1.0]]) 139 | if subset == 'train': 140 | train_data[train_count] = model_points 141 | train_label.append(0) 142 | train_count += 1 143 | else: 144 | val_data[val_count] = model_points 145 | val_label.append(0) 146 | val_count += 1 147 | # Real 148 | for subset in ['real_train', 'real_test']: 149 | path_to_mesh_models = glob.glob(os.path.join(obj_model_dir, subset, '*.obj')) 150 | for inst_path in sorted(path_to_mesh_models): 151 | instance = os.path.basename(inst_path).split('.')[0] 152 | if instance.startswith('bottle'): 153 | catId = 1 154 | elif instance.startswith('bowl'): 155 | catId = 2 156 | elif instance.startswith('camera'): 157 | catId = 3 158 | elif instance.startswith('can'): 159 | catId = 4 160 | elif instance.startswith('laptop'): 161 | catId = 5 162 | elif instance.startswith('mug'): 163 | catId = 6 164 | else: 165 | raise NotImplementedError 166 | model_points = sample_points_from_mesh(inst_path, n_points, with_normal, fps=fps, ratio=2) 167 | bbox_file = inst_path.replace('.obj', '.txt') 168 | bbox_dims = np.loadtxt(bbox_file) 169 | model_points /= np.linalg.norm(bbox_dims) 170 | if catId == 6: 171 | shift = mug_meta[instance][0] 172 | scale = mug_meta[instance][1] 173 | model_points = scale * (model_points + shift) 174 | if subset == 'real_train': 175 
| train_data[train_count] = model_points 176 | train_label.append(catId) 177 | train_count += 1 178 | else: 179 | val_data[val_count] = model_points 180 | val_label.append(catId) 181 | val_count += 1 182 | 183 | num_train_instances = len(train_label) 184 | num_val_instances = len(val_label) 185 | assert num_train_instances == train_count 186 | assert num_val_instances == val_count 187 | train_data = train_data[:num_train_instances] 188 | val_data = val_data[:num_val_instances] 189 | train_label = np.array(train_label, dtype=np.uint8) 190 | val_label = np.array(val_label, dtype=np.uint8) 191 | print('{} shapes found in train dataset'.format(num_train_instances)) 192 | print('{} shapes found in val dataset'.format(num_val_instances)) 193 | 194 | # write to HDF5 file 195 | print('Writing data to HDF5 file ...') 196 | if with_normal: 197 | filename = 'ShapeNetCore_{}_with_normal.h5'.format(n_points) 198 | else: 199 | filename = 'ShapeNetCore_{}.h5'.format(n_points) 200 | hfile = h5py.File(os.path.join(obj_model_dir, filename), 'w') 201 | train_dataset = hfile.create_group('train') 202 | train_dataset.attrs.create('len', num_train_instances) 203 | train_dataset.create_dataset('data', data=train_data, compression='gzip', dtype='float32') 204 | train_dataset.create_dataset('label', data=train_label, compression='gzip', dtype='uint8') 205 | val_dataset = hfile.create_group('val') 206 | val_dataset.attrs.create('len', num_val_instances) 207 | val_dataset.create_dataset('data', data=val_data, compression='gzip', dtype='float32') 208 | val_dataset.create_dataset('label', data=val_label, compression='gzip', dtype='uint8') 209 | hfile.close() 210 | 211 | 212 | if __name__ == '__main__': 213 | obj_model_dir = '/home/tianmeng/Documents/pose_ws/object-deformnet/data/obj_models' 214 | # Save ground truth models for training deform network 215 | save_nocs_model_to_file(obj_model_dir) 216 | # Save models to HDF5 file for training the auto-encoder. 217 | save_model_to_hdf5(obj_model_dir, n_points=4096, fps=False) 218 | # Save nmodels to HDF5 file, which used to generate mean shape. 
219 | save_model_to_hdf5(obj_model_dir, n_points=2048, fps=True) 220 | 221 | # import random 222 | # import open3d as o3d 223 | # for file in ['camera_train.pkl', 'camera_val.pkl', 'real_train.pkl', 'real_test.pkl']: 224 | # with open(os.path.join(obj_model_dir, file), 'rb') as f: 225 | # obj_models = cPickle.load(f) 226 | # instance = random.choice(list(obj_models.keys())) 227 | # model_points = obj_models[instance] 228 | # print('Diameter: {}'.format(np.linalg.norm(2*np.amax(np.abs(model_points), axis=0)))) 229 | # color = np.repeat(np.array([[1, 0, 0]]), model_points.shape[0], axis=0) 230 | # pcd = o3d.geometry.PointCloud() 231 | # pcd.points = o3d.utility.Vector3dVector(model_points) 232 | # pcd.colors = o3d.utility.Vector3dVector(color) 233 | # # visualization: camera coordinate frame 234 | # points = [[0, 0, 0], [0.5, 0, 0], [0, 0.5, 0], [0, 0, 0.5]] 235 | # lines = [[0, 1], [0, 2], [0, 3]] 236 | # colors = [[1, 0, 0], [0, 1, 0], [0, 0, 1]] 237 | # line_set = o3d.geometry.LineSet() 238 | # line_set.points = o3d.utility.Vector3dVector(points) 239 | # line_set.lines = o3d.utility.Vector2iVector(lines) 240 | # line_set.colors = o3d.utility.Vector3dVector(colors) 241 | # o3d.visualization.draw_geometries([pcd, line_set]) 242 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Shape Prior Deformation for Categorical 6D Object Pose and Size Estimation 2 | ![teaser](assets/teaser.png) 3 | 4 | ## Overview 5 | This repository contains the PyTorch implementation of the paper "Shape Prior Deformation for Categorical 6D Object Pose and Size Estimation" 6 | ([arXiv](http://arxiv.org/abs/2007.08454)). 7 | Our approach recovers the 6D pose and size of unseen object instances from an RGB-D image and reconstructs their complete 3D models. 8 | 9 | ## Dependencies 10 | * Python 3.6 11 | * PyTorch 1.0.1 12 | * CUDA 9.0 13 | 14 | ## Installation 15 | ``` 16 | ROOT=/path/to/object-deformnet 17 | cd $ROOT/lib/nn_distance 18 | python setup.py install --user 19 | ``` 20 | 21 | ## Datasets 22 | Download [camera_train](http://download.cs.stanford.edu/orion/nocs/camera_train.zip), [camera_val](http://download.cs.stanford.edu/orion/nocs/camera_val25K.zip), 23 | [real_train](http://download.cs.stanford.edu/orion/nocs/real_train.zip), [real_test](http://download.cs.stanford.edu/orion/nocs/real_test.zip), 24 | [ground-truth annotations](http://download.cs.stanford.edu/orion/nocs/gts.zip), 25 | and [mesh models](http://download.cs.stanford.edu/orion/nocs/obj_models.zip) 26 | provided by [NOCS](https://github.com/hughw19/NOCS_CVPR2019).
27 | Unzip and organize these files in $ROOT/data as follows: 28 | ``` 29 | data 30 | ├── CAMERA 31 | │ ├── train 32 | │ └── val 33 | ├── Real 34 | │ ├── train 35 | │ └── test 36 | ├── gts 37 | │ ├── val 38 | │ └── real_test 39 | └── obj_models 40 | ├── train 41 | ├── val 42 | ├── real_train 43 | └── real_test 44 | ``` 45 | Run the following scripts to prepare the datasets. 46 | ``` 47 | cd $ROOT/preprocess 48 | python shape_data.py 49 | python pose_data.py 50 | ``` 51 | Note that running the scripts will additionally shift and re-scale the models of the mug category (without modifying the original files), 52 | so that the origin of the object coordinate frame lies on the axis of symmetry. 53 | This step was implemented for one of our early experiments and turned out to be unnecessary. 54 | Skipping it should make no difference to the performance of our approach. 55 | We keep it in this repo for reproducibility. 56 | 57 | ## Training 58 | ``` 59 | # optional - train an Autoencoder from scratch and prepare the shape priors 60 | python train_ae.py 61 | python mean_shape.py 62 | 63 | # train DeformNet 64 | python train_deform.py 65 | ``` 66 | 67 | ## Evaluation 68 | Download the pre-trained models, segmentation results from Mask R-CNN, and predictions of NOCS from [here](https://drive.google.com/file/d/1p72NdY4Bie_sra9U8zoUNI4fTrQZdbnc/view?usp=sharing). 69 | ``` 70 | unzip -q deformnet_eval.zip 71 | mv deformnet_eval/* $ROOT/results 72 | rmdir deformnet_eval 73 | cd $ROOT 74 | python evaluate.py 75 | ``` 76 | 77 | ## Citation 78 | If you find our work helpful, please consider citing: 79 | ``` 80 | @InProceedings{Tian_2020_ECCV, 81 | author = {Tian, Meng and Ang Jr, Marcelo H and Lee, Gim Hee}, 82 | title = {Shape Prior Deformation for Categorical 6D Object Pose and Size Estimation}, 83 | booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)}, 84 | month = {August}, 85 | year = {2020} 86 | } 87 | ``` 88 | 89 | ## Acknowledgment 90 | Our implementation leverages code from [NOCS](https://github.com/hughw19/NOCS_CVPR2019) and [3PU](https://github.com/yifita/3PU_pytorch). 91 | -------------------------------------------------------------------------------- /tools/tsne.py: -------------------------------------------------------------------------------- 1 | # 2 | # tsne.py 3 | # 4 | # Implementation of t-SNE in Python. The implementation was tested on Python 5 | # 2.7.10, and it requires a working installation of NumPy. The implementation 6 | # comes with an example on the MNIST dataset. In order to plot the 7 | # results of this example, a working installation of matplotlib is required. 8 | # 9 | # The example can be run by executing: `ipython tsne.py` 10 | # 11 | # 12 | # Created by Laurens van der Maaten on 20-12-08. 13 | # Copyright (c) 2008 Tilburg University. All rights reserved. 14 | 15 | import numpy as np 16 | import pylab 17 | 18 | 19 | def Hbeta(D=np.array([]), beta=1.0): 20 | """ 21 | Compute the perplexity and the P-row for a specific value of the 22 | precision of a Gaussian distribution. 23 | """ 24 | 25 | # Compute P-row and corresponding perplexity 26 | P = np.exp(-D.copy() * beta) 27 | sumP = sum(P) 28 | H = np.log(sumP) + beta * np.sum(D * P) / sumP 29 | P = P / sumP 30 | return H, P 31 | 32 | 33 | def x2p(X=np.array([]), tol=1e-5, perplexity=30.0): 34 | """ 35 | Performs a binary search to get P-values in such a way that each 36 | conditional Gaussian has the same perplexity.
37 | """ 38 | 39 | # Initialize some variables 40 | print("Computing pairwise distances...") 41 | (n, d) = X.shape 42 | sum_X = np.sum(np.square(X), 1) 43 | D = np.add(np.add(-2 * np.dot(X, X.T), sum_X).T, sum_X) 44 | P = np.zeros((n, n)) 45 | beta = np.ones((n, 1)) 46 | logU = np.log(perplexity) 47 | 48 | # Loop over all datapoints 49 | for i in range(n): 50 | 51 | # Print progress 52 | if i % 500 == 0: 53 | print("Computing P-values for point %d of %d..." % (i, n)) 54 | 55 | # Compute the Gaussian kernel and entropy for the current precision 56 | betamin = -np.inf 57 | betamax = np.inf 58 | Di = D[i, np.concatenate((np.r_[0:i], np.r_[i+1:n]))] 59 | (H, thisP) = Hbeta(Di, beta[i]) 60 | 61 | # Evaluate whether the perplexity is within tolerance 62 | Hdiff = H - logU 63 | tries = 0 64 | while np.abs(Hdiff) > tol and tries < 50: 65 | 66 | # If not, increase or decrease precision 67 | if Hdiff > 0: 68 | betamin = beta[i].copy() 69 | if betamax == np.inf or betamax == -np.inf: 70 | beta[i] = beta[i] * 2. 71 | else: 72 | beta[i] = (beta[i] + betamax) / 2. 73 | else: 74 | betamax = beta[i].copy() 75 | if betamin == np.inf or betamin == -np.inf: 76 | beta[i] = beta[i] / 2. 77 | else: 78 | beta[i] = (beta[i] + betamin) / 2. 79 | 80 | # Recompute the values 81 | (H, thisP) = Hbeta(Di, beta[i]) 82 | Hdiff = H - logU 83 | tries += 1 84 | 85 | # Set the final row of P 86 | P[i, np.concatenate((np.r_[0:i], np.r_[i+1:n]))] = thisP 87 | 88 | # Return final P-matrix 89 | print("Mean value of sigma: %f" % np.mean(np.sqrt(1 / beta))) 90 | return P 91 | 92 | 93 | def pca(X=np.array([]), no_dims=50): 94 | """ 95 | Runs PCA on the NxD array X in order to reduce its dimensionality to 96 | no_dims dimensions. 97 | """ 98 | 99 | print("Preprocessing the data using PCA...") 100 | (n, d) = X.shape 101 | X = X - np.tile(np.mean(X, 0), (n, 1)) 102 | (l, M) = np.linalg.eig(np.dot(X.T, X)) 103 | Y = np.dot(X, M[:, 0:no_dims]) 104 | return Y 105 | 106 | 107 | def tsne(X=np.array([]), no_dims=2, initial_dims=50, perplexity=30.0): 108 | """ 109 | Runs t-SNE on the dataset in the NxD array X to reduce its 110 | dimensionality to no_dims dimensions. The syntaxis of the function is 111 | `Y = tsne.tsne(X, no_dims, perplexity), where X is an NxD NumPy array. 112 | """ 113 | 114 | # Check inputs 115 | if isinstance(no_dims, float): 116 | print("Error: array X should have type float.") 117 | return -1 118 | if round(no_dims) != no_dims: 119 | print("Error: number of dimensions should be an integer.") 120 | return -1 121 | 122 | # Initialize variables 123 | X = pca(X, initial_dims).real 124 | (n, d) = X.shape 125 | max_iter = 1000 126 | initial_momentum = 0.5 127 | final_momentum = 0.8 128 | eta = 500 129 | min_gain = 0.01 130 | Y = np.random.randn(n, no_dims) 131 | dY = np.zeros((n, no_dims)) 132 | iY = np.zeros((n, no_dims)) 133 | gains = np.ones((n, no_dims)) 134 | 135 | # Compute P-values 136 | P = x2p(X, 1e-5, perplexity) 137 | P = P + np.transpose(P) 138 | P = P / np.sum(P) 139 | P = P * 4. # early exaggeration 140 | P = np.maximum(P, 1e-12) 141 | 142 | # Run iterations 143 | for iter in range(max_iter): 144 | 145 | # Compute pairwise affinities 146 | sum_Y = np.sum(np.square(Y), 1) 147 | num = -2. * np.dot(Y, Y.T) 148 | num = 1. / (1. + np.add(np.add(num, sum_Y).T, sum_Y)) 149 | num[range(n), range(n)] = 0. 
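        # num holds the unnormalized Student-t affinities 1 / (1 + ||y_i - y_j||^2),
        # with the diagonal zeroed so that self-pairs do not contribute; normalizing
        # below yields the low-dimensional joint distribution Q, which is clipped at
        # 1e-12 for numerical stability.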
150 | Q = num / np.sum(num) 151 | Q = np.maximum(Q, 1e-12) 152 | 153 | # Compute gradient 154 | PQ = P - Q 155 | for i in range(n): 156 | dY[i, :] = np.sum(np.tile(PQ[:, i] * num[:, i], (no_dims, 1)).T * (Y[i, :] - Y), 0) 157 | 158 | # Perform the update 159 | if iter < 20: 160 | momentum = initial_momentum 161 | else: 162 | momentum = final_momentum 163 | gains = (gains + 0.2) * ((dY > 0.) != (iY > 0.)) + \ 164 | (gains * 0.8) * ((dY > 0.) == (iY > 0.)) 165 | gains[gains < min_gain] = min_gain 166 | iY = momentum * iY - eta * (gains * dY) 167 | Y = Y + iY 168 | Y = Y - np.tile(np.mean(Y, 0), (n, 1)) 169 | 170 | # Compute current value of cost function 171 | if (iter + 1) % 10 == 0: 172 | C = np.sum(P * np.log(P / Q)) 173 | print("Iteration %d: error is %f" % (iter + 1, C)) 174 | 175 | # Stop lying about P-values 176 | if iter == 100: 177 | P = P / 4. 178 | 179 | # Return solution 180 | return Y 181 | 182 | 183 | if __name__ == "__main__": 184 | print("Run Y = tsne.tsne(X, no_dims, perplexity) to perform t-SNE on your dataset.") 185 | print("Running example on 2,500 MNIST digits...") 186 | X = np.loadtxt("mnist2500_X.txt") 187 | labels = np.loadtxt("mnist2500_labels.txt") 188 | Y = tsne(X, 2, 50, 20.0) 189 | pylab.scatter(Y[:, 0], Y[:, 1], 20, labels) 190 | pylab.show() 191 | -------------------------------------------------------------------------------- /train_ae.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import argparse 4 | import torch 5 | import tensorflow as tf 6 | from lib.auto_encoder import PointCloudAE 7 | from lib.loss import ChamferLoss 8 | from data.shape_dataset import ShapeDataset 9 | from lib.utils import setup_logger 10 | 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--num_point', type=int, default=1024, help='number of points, needed if use points') 14 | parser.add_argument('--emb_dim', type=int, default=512, help='dimension of latent embedding [default: 512]') 15 | parser.add_argument('--h5_file', type=str, default='data/obj_models/ShapeNetCore_4096.h5', help='h5 file') 16 | parser.add_argument('--batch_size', type=int, default=32, help='batch size') 17 | parser.add_argument('--num_workers', type=int, default=10, help='number of data loading workers') 18 | parser.add_argument('--gpu', type=str, default='0', help='GPU to use') 19 | parser.add_argument('--lr', type=float, default=0.0001, help='initial learning rate') 20 | parser.add_argument('--start_epoch', type=int, default=1, help='which epoch to start') 21 | parser.add_argument('--max_epoch', type=int, default=50, help='max number of epochs to train') 22 | parser.add_argument('--resume_model', type=str, default='', help='resume from saved model') 23 | parser.add_argument('--result_dir', type=str, default='results/ae_points', help='directory to save train results') 24 | opt = parser.parse_args() 25 | 26 | opt.repeat_epoch = 10 27 | opt.decay_step = 5000 28 | opt.decay_rate = [1.0, 0.6, 0.3, 0.1] 29 | 30 | 31 | def train_net(): 32 | # set result directory 33 | if not os.path.exists(opt.result_dir): 34 | os.makedirs(opt.result_dir) 35 | tb_writer = tf.summary.FileWriter(opt.result_dir) 36 | logger = setup_logger('train_log', os.path.join(opt.result_dir, 'log.txt')) 37 | for key, value in vars(opt).items(): 38 | logger.info(key + ': ' + str(value)) 39 | os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu 40 | # model & loss 41 | estimator = PointCloudAE(opt.emb_dim, opt.num_point) 42 | estimator.cuda() 43 | criterion = 
ChamferLoss() 44 | if opt.resume_model != '': 45 | estimator.load_state_dict(torch.load(opt.resume_model)) 46 | # dataset 47 | train_dataset = ShapeDataset(opt.h5_file, mode='train', augment=True) 48 | train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=opt.batch_size, 49 | shuffle=True, num_workers=opt.num_workers) 50 | val_dataset = ShapeDataset(opt.h5_file, mode='val', augment=False) 51 | val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=opt.batch_size, 52 | shuffle=False, num_workers=opt.num_workers) 53 | # train 54 | st_time = time.time() 55 | global_step = ((train_dataset.length + opt.batch_size - 1) // opt.batch_size) * opt.repeat_epoch * (opt.start_epoch - 1) 56 | decay_count = -1 57 | for epoch in range(opt.start_epoch, opt.max_epoch+1): 58 | # train one epoch 59 | logger.info('Time {0}'.format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - st_time)) + \ 60 | ', ' + 'Epoch %02d' % epoch + ', ' + 'Training started')) 61 | # create optimizer and adjust learning rate if needed 62 | if global_step // opt.decay_step > decay_count: 63 | decay_count += 1 64 | if decay_count < len(opt.decay_rate): 65 | current_lr = opt.lr * opt.decay_rate[decay_count] 66 | optimizer = torch.optim.Adam(estimator.parameters(), lr=current_lr) 67 | batch_idx = 0 68 | estimator.train() 69 | for rep in range(opt.repeat_epoch): 70 | for i, data in enumerate(train_dataloader): 71 | # label must be zero_indexed 72 | batch_xyz, batch_label = data 73 | batch_xyz = batch_xyz[:, :, :3].cuda() 74 | optimizer.zero_grad() 75 | embedding, point_cloud = estimator(batch_xyz) 76 | loss, _, _ = criterion(point_cloud, batch_xyz) 77 | summary = tf.Summary(value=[tf.Summary.Value(tag='learning_rate', simple_value=current_lr), 78 | tf.Summary.Value(tag='train_loss', simple_value=loss)]) 79 | # backward 80 | loss.backward() 81 | optimizer.step() 82 | global_step += 1 83 | batch_idx += 1 84 | # write results to tensorboard 85 | tb_writer.add_summary(summary, global_step) 86 | if batch_idx % 10 == 0: 87 | logger.info('Batch {0} Loss:{1:f}'.format(batch_idx, loss)) 88 | logger.info('>>>>>>>>----------Epoch {:02d} train finish---------<<<<<<<<'.format(epoch)) 89 | # evaluate one epoch 90 | logger.info('Time {0}'.format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - st_time)) + \ 91 | ', ' + 'Epoch %02d' % epoch + ', ' + 'Testing started')) 92 | estimator.eval() 93 | val_loss = 0.0 94 | for i, data in enumerate(val_dataloader, 1): 95 | batch_xyz, batch_label = data 96 | batch_xyz = batch_xyz[:, :, :3].cuda() 97 | embedding, point_cloud = estimator(batch_xyz) 98 | loss, _, _ = criterion(point_cloud, batch_xyz) 99 | val_loss += loss.item() 100 | logger.info('Batch {0} Loss:{1:f}'.format(i, loss)) 101 | val_loss = val_loss / i 102 | summary = tf.Summary(value=[tf.Summary.Value(tag='val_loss', simple_value=val_loss)]) 103 | tb_writer.add_summary(summary, global_step) 104 | logger.info('Epoch {0:02d} test average loss: {1:06f}'.format(epoch, val_loss)) 105 | logger.info('>>>>>>>>----------Epoch {:02d} test finish---------<<<<<<<<'.format(epoch)) 106 | # save model after each epoch 107 | torch.save(estimator.state_dict(), '{0}/model_{1:02d}.pth'.format(opt.result_dir, epoch)) 108 | 109 | 110 | if __name__ == '__main__': 111 | train_net() 112 | -------------------------------------------------------------------------------- /train_deform.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import argparse 4 | import random 
5 | import numpy as np 6 | import torch 7 | import torch.nn.functional as F 8 | import tensorflow as tf 9 | from lib.network import DeformNet 10 | from lib.loss import Loss 11 | from data.pose_dataset import PoseDataset 12 | from lib.utils import setup_logger, compute_sRT_errors 13 | from lib.align import estimateSimilarityTransform 14 | 15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('--dataset', type=str, default='CAMERA', help='CAMERA or CAMERA+Real') 18 | parser.add_argument('--data_dir', type=str, default='data', help='data directory') 19 | parser.add_argument('--n_pts', type=int, default=1024, help='number of foreground points') 20 | parser.add_argument('--n_cat', type=int, default=6, help='number of object categories') 21 | parser.add_argument('--nv_prior', type=int, default=1024, help='number of vertices in shape priors') 22 | parser.add_argument('--img_size', type=int, default=192, help='cropped image size') 23 | parser.add_argument('--batch_size', type=int, default=32, help='batch size') 24 | parser.add_argument('--num_workers', type=int, default=10, help='number of data loading workers') 25 | parser.add_argument('--gpu', type=str, default='0', help='GPU to use') 26 | parser.add_argument('--lr', type=float, default=0.0001, help='initial learning rate') 27 | parser.add_argument('--start_epoch', type=int, default=1, help='which epoch to start') 28 | parser.add_argument('--max_epoch', type=int, default=50, help='max number of epochs to train') 29 | parser.add_argument('--resume_model', type=str, default='', help='resume from saved model') 30 | parser.add_argument('--result_dir', type=str, default='results/camera', help='directory to save train results') 31 | opt = parser.parse_args() 32 | 33 | opt.decay_epoch = [0, 10, 20, 30, 40] 34 | opt.decay_rate = [1.0, 0.6, 0.3, 0.1, 0.01] 35 | opt.corr_wt = 1.0 36 | opt.cd_wt = 5.0 37 | opt.entropy_wt = 0.0001 38 | opt.deform_wt = 0.01 39 | 40 | 41 | def train_net(): 42 | # set result directory 43 | if not os.path.exists(opt.result_dir): 44 | os.makedirs(opt.result_dir) 45 | tb_writer = tf.summary.FileWriter(opt.result_dir) 46 | logger = setup_logger('train_log', os.path.join(opt.result_dir, 'log.txt')) 47 | for key, value in vars(opt).items(): 48 | logger.info(key + ': ' + str(value)) 49 | os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu 50 | # model & loss 51 | estimator = DeformNet(opt.n_cat, opt.nv_prior) 52 | estimator.cuda() 53 | criterion = Loss(opt.corr_wt, opt.cd_wt, opt.entropy_wt, opt.deform_wt) 54 | if opt.resume_model != '': 55 | estimator.load_state_dict(torch.load(opt.resume_model)) 56 | # dataset 57 | train_dataset = PoseDataset(opt.dataset, 'train', opt.data_dir, opt.n_pts, opt.img_size) 58 | val_dataset = PoseDataset(opt.dataset, 'test', opt.data_dir, opt.n_pts, opt.img_size) 59 | # start training 60 | st_time = time.time() 61 | train_steps = 1500 62 | global_step = train_steps * (opt.start_epoch - 1) 63 | n_decays = len(opt.decay_epoch) 64 | assert len(opt.decay_rate) == n_decays 65 | for i in range(n_decays): 66 | if opt.start_epoch > opt.decay_epoch[i]: 67 | decay_count = i 68 | train_size = train_steps * opt.batch_size 69 | indices = [] 70 | page_start = -train_size 71 | for epoch in range(opt.start_epoch, opt.max_epoch + 1): 72 | # train one epoch 73 | logger.info('Time {0}'.format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - st_time)) + \ 74 | ', ' + 'Epoch %02d' % epoch + ', ' + 'Training started')) 75 | # create optimizer and adjust learning rate if needed 76 | if decay_count < 
len(opt.decay_rate): 77 | if epoch > opt.decay_epoch[decay_count]: 78 | current_lr = opt.lr * opt.decay_rate[decay_count] 79 | optimizer = torch.optim.Adam(estimator.parameters(), lr=current_lr) 80 | decay_count += 1 81 | # sample train subset 82 | page_start += train_size 83 | len_last = len(indices) - page_start 84 | if len_last < train_size: 85 | indices = indices[page_start:] 86 | if opt.dataset == 'CAMERA+Real': 87 | # CAMERA : Real = 3 : 1 88 | camera_len = train_dataset.subset_len[0] 89 | real_len = train_dataset.subset_len[1] 90 | real_indices = list(range(camera_len, camera_len+real_len)) 91 | camera_indices = list(range(camera_len)) 92 | n_repeat = (train_size - len_last) // (4 * real_len) + 1 93 | data_list = random.sample(camera_indices, 3*n_repeat*real_len) + real_indices*n_repeat 94 | random.shuffle(data_list) 95 | indices += data_list 96 | else: 97 | data_list = list(range(train_dataset.length)) 98 | for i in range((train_size - len_last) // train_dataset.length + 1): 99 | random.shuffle(data_list) 100 | indices += data_list 101 | page_start = 0 102 | train_idx = indices[page_start:(page_start+train_size)] 103 | train_sampler = torch.utils.data.sampler.SubsetRandomSampler(train_idx) 104 | train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=opt.batch_size, sampler=train_sampler, 105 | num_workers=opt.num_workers, pin_memory=True) 106 | estimator.train() 107 | for i, data in enumerate(train_dataloader, 1): 108 | points, rgb, choose, cat_id, model, prior, sRT, nocs = data 109 | points = points.cuda() 110 | rgb = rgb.cuda() 111 | choose = choose.cuda() 112 | cat_id = cat_id.cuda() 113 | model = model.cuda() 114 | prior = prior.cuda() 115 | sRT = sRT.cuda() 116 | nocs = nocs.cuda() 117 | assign_mat, deltas = estimator(points, rgb, choose, cat_id, prior) 118 | loss, corr_loss, cd_loss, entropy_loss, deform_loss = criterion(assign_mat, deltas, prior, nocs, model) 119 | optimizer.zero_grad() 120 | loss.backward() 121 | optimizer.step() 122 | global_step += 1 123 | # write results to tensorboard 124 | summary = tf.Summary(value=[tf.Summary.Value(tag='learning_rate', simple_value=current_lr), 125 | tf.Summary.Value(tag='train_loss', simple_value=loss), 126 | tf.Summary.Value(tag='corr_loss', simple_value=corr_loss), 127 | tf.Summary.Value(tag='cd_loss', simple_value=cd_loss), 128 | tf.Summary.Value(tag='entropy_loss', simple_value=entropy_loss), 129 | tf.Summary.Value(tag='deform_loss', simple_value=deform_loss)]) 130 | tb_writer.add_summary(summary, global_step) 131 | if i % 10 == 0: 132 | logger.info('Batch {0} Loss:{1:f}, corr_loss:{2:f}, cd_loss:{3:f}, entropy_loss:{4:f}, deform_loss:{5:f}'.format( 133 | i, loss.item(), corr_loss.item(), cd_loss.item(), entropy_loss.item(), deform_loss.item())) 134 | 135 | logger.info('>>>>>>>>----------Epoch {:02d} train finish---------<<<<<<<<'.format(epoch)) 136 | 137 | # evaluate one epoch 138 | logger.info('Time {0}'.format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - st_time)) + 139 | ', ' + 'Epoch %02d' % epoch + ', ' + 'Testing started')) 140 | val_loss = 0.0 141 | total_count = np.zeros((opt.n_cat,), dtype=int) 142 | strict_success = np.zeros((opt.n_cat,), dtype=int) # 5 degree and 5 cm 143 | easy_success = np.zeros((opt.n_cat,), dtype=int) # 10 degree and 5 cm 144 | iou_success = np.zeros((opt.n_cat,), dtype=int) # relative scale error < 0.1 145 | # sample validation subset 146 | val_size = 1500 147 | val_idx = random.sample(list(range(val_dataset.length)), val_size) 148 | val_sampler = 
torch.utils.data.sampler.SubsetRandomSampler(val_idx) 149 | val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=1, sampler=val_sampler, 150 | num_workers=opt.num_workers, pin_memory=True) 151 | estimator.eval() 152 | for i, data in enumerate(val_dataloader, 1): 153 | points, rgb, choose, cat_id, model, prior, sRT, nocs = data 154 | points = points.cuda() 155 | rgb = rgb.cuda() 156 | choose = choose.cuda() 157 | cat_id = cat_id.cuda() 158 | model = model.cuda() 159 | prior = prior.cuda() 160 | sRT = sRT.cuda() 161 | nocs = nocs.cuda() 162 | assign_mat, deltas = estimator(points, rgb, choose, cat_id, prior) 163 | loss, _, _, _, _ = criterion(assign_mat, deltas, prior, nocs, model) 164 | # estimate pose and scale 165 | inst_shape = prior + deltas 166 | assign_mat = F.softmax(assign_mat, dim=2) 167 | nocs_coords = torch.bmm(assign_mat, inst_shape) 168 | nocs_coords = nocs_coords.detach().cpu().numpy()[0] 169 | points = points.cpu().numpy()[0] 170 | # use choose to remove repeated points 171 | choose = choose.cpu().numpy()[0] 172 | _, choose = np.unique(choose, return_index=True) 173 | nocs_coords = nocs_coords[choose, :] 174 | points = points[choose, :] 175 | _, _, _, pred_sRT = estimateSimilarityTransform(nocs_coords, points) 176 | # evaluate pose 177 | cat_id = cat_id.item() 178 | if pred_sRT is not None: 179 | sRT = sRT.detach().cpu().numpy()[0] 180 | R_error, T_error, IoU = compute_sRT_errors(pred_sRT, sRT) 181 | if R_error < 5 and T_error < 0.05: 182 | strict_success[cat_id] += 1 183 | if R_error < 10 and T_error < 0.05: 184 | easy_success[cat_id] += 1 185 | if IoU < 0.1: 186 | iou_success[cat_id] += 1 187 | total_count[cat_id] += 1 188 | val_loss += loss.item() 189 | if i % 100 == 0: 190 | logger.info('Batch {0} Loss:{1:f}'.format(i, loss.item())) 191 | # compute accuracy 192 | strict_acc = 100 * (strict_success / total_count) 193 | easy_acc = 100 * (easy_success / total_count) 194 | iou_acc = 100 * (iou_success / total_count) 195 | for i in range(opt.n_cat): 196 | logger.info('{} accuracies:'.format(val_dataset.cat_names[i])) 197 | logger.info('5^o 5cm: {:4f}'.format(strict_acc[i])) 198 | logger.info('10^o 5cm: {:4f}'.format(easy_acc[i])) 199 | logger.info('IoU < 0.1: {:4f}'.format(iou_acc[i])) 200 | strict_acc = np.mean(strict_acc) 201 | easy_acc = np.mean(easy_acc) 202 | iou_acc = np.mean(iou_acc) 203 | val_loss = val_loss / val_size 204 | summary = tf.Summary(value=[tf.Summary.Value(tag='val_loss', simple_value=val_loss), 205 | tf.Summary.Value(tag='5^o5cm_acc', simple_value=strict_acc), 206 | tf.Summary.Value(tag='10^o5cm_acc', simple_value=easy_acc), 207 | tf.Summary.Value(tag='iou_acc', simple_value=iou_acc)]) 208 | tb_writer.add_summary(summary, global_step) 209 | logger.info('Epoch {0:02d} test average loss: {1:06f}'.format(epoch, val_loss)) 210 | logger.info('Overall accuracies:') 211 | logger.info('5^o 5cm: {:4f} 10^o 5cm: {:4f} IoU: {:4f}'.format(strict_acc, easy_acc, iou_acc)) 212 | logger.info('>>>>>>>>----------Epoch {:02d} test finish---------<<<<<<<<'.format(epoch)) 213 | # save model after each epoch 214 | torch.save(estimator.state_dict(), '{0}/model_{1:02d}.pth'.format(opt.result_dir, epoch)) 215 | 216 | 217 | if __name__ == '__main__': 218 | train_net() 219 | --------------------------------------------------------------------------------
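Note on the pose-recovery step in the validation loop of train_deform.py: estimateSimilarityTransform (lib/align.py) fits a scaled rigid transform that maps the predicted NOCS coordinates onto the observed points. The snippet below is a minimal NumPy sketch of such a least-squares similarity fit (Umeyama-style), assuming clean one-to-one correspondences and no outlier handling; it is illustrative only, the helper name umeyama_similarity is hypothetical, and the repository's estimateSimilarityTransform remains the authoritative implementation.
```
import numpy as np

def umeyama_similarity(src, dst):
    """Least-squares similarity transform (scale, rotation, translation) mapping
    src onto dst, both (N, 3) arrays. Returns a 4x4 matrix [[s*R, t], [0, 1]]."""
    n = src.shape[0]
    mu_src, mu_dst = src.mean(axis=0), dst.mean(axis=0)
    src_c, dst_c = src - mu_src, dst - mu_dst
    # cross-covariance of the centered point sets
    cov = dst_c.T @ src_c / n
    U, D, Vt = np.linalg.svd(cov)
    S = np.eye(3)
    if np.linalg.det(U) * np.linalg.det(Vt) < 0:
        S[2, 2] = -1.0  # flip one axis to avoid returning a reflection
    R = U @ S @ Vt
    var_src = (src_c ** 2).sum() / n
    scale = np.trace(np.diag(D) @ S) / var_src
    t = mu_dst - scale * R @ mu_src
    sRT = np.eye(4)
    sRT[:3, :3] = scale * R
    sRT[:3, 3] = t
    return sRT

# Analogous in spirit to the call in the validation loop:
# pred_sRT = umeyama_similarity(nocs_coords, points)
```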