├── doc
│   ├── FigNetwork.pdf
│   └── FigNetwork2.png
├── eval.py
├── LICENSE
├── main.py
├── utils
│   ├── pc_utils.py
│   ├── configs.py
│   ├── tfnp_compatibility.py
│   ├── align_utils.py
│   ├── spherical_utils.py
│   └── evaluation_utils.py
├── README.md
├── provider
│   ├── dataloder.py
│   └── training_data_prepare.py
└── model
    ├── modules.py
    ├── layers.py
    └── dualposenet.py

/doc/FigNetwork.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Gorilla-Lab-SCUT/DualPoseNet/HEAD/doc/FigNetwork.pdf
--------------------------------------------------------------------------------
/doc/FigNetwork2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Gorilla-Lab-SCUT/DualPoseNet/HEAD/doc/FigNetwork2.png
--------------------------------------------------------------------------------
/eval.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Gorilla Lab, SCUT.
2 | #
3 | # This source code is licensed under the MIT license found in the
4 | # LICENSE file in the root directory of this source tree.
5 |
6 | """ Evaluation of results of DualPoseNet on CAMERA25 and REAL275.
7 | """
8 |
9 | from __future__ import absolute_import
10 | from __future__ import division
11 | from __future__ import print_function
12 |
13 | import os
14 | import sys
15 |
16 | BASE_DIR = os.path.dirname(os.path.abspath(__file__))
17 | ROOT_DIR = BASE_DIR
18 | sys.path.append(os.path.join(ROOT_DIR, 'utils'))
19 | from evaluation_utils import evaluate
20 |
21 |
22 | CAMERA25_path = os.path.join('results', 'CAMERA25')
23 | REAL275_path = os.path.join('results', 'REAL275')
24 |
25 |
26 | print('\n*********** Evaluate the results of DualPoseNet on CAMERA25 ***********')
27 | evaluate(CAMERA25_path)
28 |
29 | print('\n*********** Evaluate the results of DualPoseNet on REAL275 ***********')
30 | evaluate(REAL275_path)
31 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Jiehong
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Gorilla Lab, SCUT.
2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | """ 7 | Training/testing routines of DualPoseNet for category-level pose estimation on CAMERA25 or REAL275. 8 | """ 9 | 10 | from __future__ import absolute_import 11 | from __future__ import division 12 | from __future__ import print_function 13 | 14 | import os 15 | import sys 16 | import logging 17 | import pprint 18 | pp = pprint.PrettyPrinter() 19 | 20 | BASE_DIR = os.path.dirname(os.path.abspath(__file__)) 21 | ROOT_DIR = BASE_DIR 22 | sys.path.append(os.path.join(ROOT_DIR, 'model')) 23 | sys.path.append(os.path.join(ROOT_DIR, 'provider')) 24 | sys.path.append(os.path.join(ROOT_DIR, 'utils')) 25 | 26 | 27 | import tensorflow as tf 28 | import configs 29 | from dualposenet import DualPoseNet 30 | from evaluation_utils import evaluate 31 | 32 | 33 | def run(): 34 | FLAGS = configs.parse() 35 | assert FLAGS.dataset=='REAL275' or FLAGS.dataset=='CAMERA25', 'Error dataset of {}, which should be chosen from [REAL275, CAMERA25]'.format(FLAGS.dataset) 36 | assert FLAGS.phase in ['train', 'test', 'test_refine_encoder', 'test_refine_feature'], 'Error dataset of {}, which should be chosen from [train, test, test_refine_encoder, test_refine_feature]'.format(FLAGS.phase) 37 | 38 | FLAGS.log_dir = os.path.join('log', FLAGS.dataset) 39 | if not os.path.exists('log'): 40 | os.makedirs('log') 41 | if not os.path.exists(FLAGS.log_dir): 42 | os.makedirs(FLAGS.log_dir) 43 | if FLAGS.phase !='train': 44 | FLAGS.test_log_dir = os.path.join(FLAGS.log_dir, FLAGS.phase + '_epoch' + str(FLAGS.test_epoch)) 45 | if not os.path.exists(FLAGS.test_log_dir): 46 | os.makedirs(FLAGS.test_log_dir) 47 | 48 | run_config = tf.ConfigProto() 49 | run_config.gpu_options.allow_growth = True 50 | 51 | with tf.Session(config=run_config) as sess: 52 | model = DualPoseNet(FLAGS,sess) 53 | 54 | if FLAGS.phase == 'train': 55 | model.train() 56 | else: 57 | if FLAGS.phase == 'test': 58 | model.test() 59 | elif FLAGS.phase == 'test_refine_encoder': 60 | model.test_refine_encoder() 61 | elif FLAGS.phase == 'test_refine_feature': 62 | model.test_refine_feature() 63 | 64 | print('\n*********** Evaluate the results on {} ***********'.format(FLAGS.dataset)) 65 | evaluate(FLAGS.test_log_dir) 66 | 67 | 68 | def main(unused_argv): 69 | run() 70 | 71 | if __name__ == '__main__': 72 | logging.basicConfig(level=logging.INFO) 73 | tf.app.run() 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /utils/pc_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Gorilla Lab, SCUT. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | """ 7 | Modified based on: https://github.com/hughw19/NOCS_CVPR2019. 8 | """ 9 | 10 | import cv2 11 | import numpy as np 12 | 13 | import math 14 | import cmath 15 | 16 | 17 | def load_depth(img_path): 18 | """ Load depth image from img_path. 
""" 19 | depth_path = img_path + '_depth.png' 20 | depth = cv2.imread(depth_path, -1) 21 | if len(depth.shape) == 3: 22 | # This is encoded depth image, let's convert 23 | # NOTE: RGB is actually BGR in opencv 24 | depth16 = depth[:, :, 1]*256 + depth[:, :, 2] 25 | depth16 = np.where(depth16==32001, 0, depth16) 26 | depth16 = depth16.astype(np.uint16) 27 | elif len(depth.shape) == 2 and depth.dtype == 'uint16': 28 | depth16 = depth 29 | else: 30 | assert False, '[ Error ]: Unsupported depth type.' 31 | return depth16 32 | 33 | 34 | def backproject(depth, intrinsics, instance_mask): 35 | intrinsics_inv = np.linalg.inv(intrinsics) 36 | image_shape = depth.shape 37 | width = image_shape[1] 38 | height = image_shape[0] 39 | 40 | x = np.arange(width) 41 | y = np.arange(height) 42 | 43 | #non_zero_mask = np.logical_and(depth > 0, depth < 5000) 44 | non_zero_mask = (depth > 0) 45 | final_instance_mask = np.logical_and(instance_mask, non_zero_mask) 46 | 47 | idxs = np.where(final_instance_mask) 48 | grid = np.array([idxs[1], idxs[0]]) 49 | 50 | # shape: height * width 51 | # mesh_grid = np.meshgrid(x, y) #[height, width, 2] 52 | # mesh_grid = np.reshape(mesh_grid, [2, -1]) 53 | length = grid.shape[1] 54 | ones = np.ones([1, length]) 55 | uv_grid = np.concatenate((grid, ones), axis=0) # [3, num_pixel] 56 | 57 | xyz = intrinsics_inv @ uv_grid # [3, num_pixel] 58 | xyz = np.transpose(xyz) #[num_pixel, 3] 59 | 60 | z = depth[idxs[0], idxs[1]] 61 | 62 | # print(np.amax(z), np.amin(z)) 63 | pts = xyz * z[:, np.newaxis]/xyz[:, -1:] 64 | pts[:, 0] = -pts[:, 0] 65 | pts[:, 1] = -pts[:, 1] 66 | 67 | return pts, idxs 68 | 69 | 70 | def pc2sphericalmap(pc, img, resolution=64): 71 | n = pc.shape[0] 72 | assert pc.shape[1] == 3 73 | 74 | pc = pc.astype(np.float32) 75 | x = pc[:, 0] 76 | y = pc[:, 1] 77 | z = pc[:, 2] 78 | t = np.pi / resolution 79 | k = 2 * np.pi / resolution 80 | r = np.sqrt(np.sum(pc ** 2, axis=1) + 1e-10).astype(np.float32) 81 | 82 | phi = np.around(np.arccos(z / r) / t).astype('int') % resolution 83 | arr = np.arctan2(y, x) 84 | rho = np.zeros(n) 85 | rho[y > 0] = np.around(arr[y > 0] / k) 86 | rho[y < 0] = np.around((arr[y < 0] + 2 * np.pi) / k) 87 | rho = rho.astype('int') % resolution 88 | f1 = np.zeros([resolution, resolution, 1], dtype='float32') 89 | f2 = np.zeros([resolution, resolution, 3], dtype='float32') 90 | 91 | for i in range(pc.shape[0]): 92 | tmp = np.real(cmath.rect(r[i], 0)) 93 | if f1[rho[i], phi[i], 0] <= tmp: 94 | f1[rho[i], phi[i], 0] = tmp 95 | f2[rho[i], phi[i], :] = img[i] 96 | 97 | return f1[np.newaxis, :, :, :], f2[np.newaxis, :, :, :] -------------------------------------------------------------------------------- /utils/configs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Gorilla Lab, SCUT. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | 6 | 7 | import argparse 8 | import os 9 | import yaml 10 | 11 | def parse(args=None): 12 | parser = argparse.ArgumentParser(description='Configurations of DualPoseNet.', 13 | fromfile_prefix_chars='@') 14 | parser.add_argument('--phase', default='train', 15 | help="train/test/test_refine_encoder/test_refine_feature") 16 | parser.add_argument('--dataset', default='REAL275', 17 | help="REAL275/CAMERA25") 18 | 19 | # dataset 20 | parser.add_argument('--n_classes', '-nc', type=int, default=6, 21 | help='Number of classes in dataset') 22 | parser.add_argument('--input_res', '-res', type=int, default=64, 23 | help='Resolution for spherical inputs; may subsample if larger') 24 | 25 | # model 26 | parser.add_argument('--nfilters', default=[16, 16, 32, 32, 64, 64, 128, 128], 27 | type=lambda x: [int(_) for _ in x.split(',')], 28 | help='Number of filters per layer') 29 | parser.add_argument('--pool_layers', default=[0, 0, 1, 0, 1, 0, 1, 0], 30 | type=lambda x: [int(_) for _ in x.split(',')], 31 | help='Pooling layer indicator') 32 | parser.add_argument('--n_filter_params', '-nfp', type=int, default=8, 33 | help='Number of filter params (if 0, use max, else do spectral linear interpolation for localized filters.)') 34 | parser.add_argument('--nonlin', '-nl', type=str, default='prelu', 35 | help='Nonlinearity to be used') 36 | parser.add_argument('--fusion', '-fs', type=str, default='Max', 37 | help='fusion method in multi-scale aggregation[Max, Avg, Cat]') 38 | parser.add_argument('--transform_method', '-tm', choices=['naive', 'sep'], default='naive', 39 | help='SH transform method: NAIVE or SEParation of variables') 40 | parser.add_argument('--real_inputs', '-ri', action='store_true', default=True, 41 | help='Leverage symmetry when inputs are real.') 42 | 43 | # train 44 | parser.add_argument('--learning_rate', '-lr', type=yaml.load, default=0.0001, 45 | help='Initial learning rate') 46 | parser.add_argument('--lr_decay_steps', type=int, default=40000) 47 | parser.add_argument('--lr_decay_rate', type=float, default=0.5) 48 | parser.add_argument('--training_epoch', '-ne', type=int, default=30, 49 | help='Number of training epochs') 50 | parser.add_argument('--batch_size', '-bs', type=int, default=64, 51 | help='training batch size') 52 | parser.add_argument('--weight_decay', type=float, default=0.0) 53 | parser.add_argument('--implicit_loss_weight', type=float, default=10.0, 54 | help='Balanced parameter for the loss term of the inplicit pose decoder') 55 | 56 | # test 57 | parser.add_argument('--test_epoch', type=int, 58 | default=30, help='Epoch for test') 59 | 60 | # use given args instead of cmd line, if they exist 61 | if isinstance(args, list): 62 | # if list, parse as cmd lines arguments 63 | args_out = parser.parse_args(args) 64 | elif args is not None: 65 | # if dict, set values directly 66 | args_out = parser.parse_args('') 67 | for k, v in args.items(): 68 | setattr(args_out, k, v) 69 | else: 70 | args_out = parser.parse_args() 71 | return args_out 72 | -------------------------------------------------------------------------------- /utils/tfnp_compatibility.py: -------------------------------------------------------------------------------- 1 | """ 2 | Some functions are designed to work with either numpy arrays or tensorflow tensors. 3 | These wrappers ensure compatibility. 4 | 5 | Source: https://github.com/daniilidis-group/spherical-cnn 6 | """ 7 | # this is a questionable design decision... 
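# Illustrative behaviour of the wrappers below (assuming TF1-style tensors):
#     shape(np.zeros((2, 3)))   -> (2, 3)            # numpy branch
#     shape(tf.zeros((2, 3)))   -> (2, 3)            # static shape via get_shape()
#     sum(x, axis=-1)           -> np.sum(...) or tf.reduce_sum(...) depending on x
# istf() is the single place where the numpy-vs-tensorflow dispatch happens.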
8 | 9 | import tensorflow as tf 10 | import numpy as np 11 | 12 | 13 | def safe_cast(x, y): 14 | """ Cast x to type of y or y to type of x, without loss of precision. 15 | 16 | Works with complex and floats of any precision 17 | """ 18 | t = 'complex' if (x.dtype.is_complex or y.dtype.is_complex) else 'float' 19 | s = max(x.dtype.size, y.dtype.size) 20 | dtype = '{}{}'.format(t, s*8) 21 | 22 | return tf.cast(x, dtype), tf.cast(y, dtype) 23 | 24 | 25 | def istf(x): 26 | """ Check if argument is a tensorflow structure (or list of). """ 27 | if isinstance(x, list): 28 | x = x[0] 29 | return (isinstance(x, tf.Tensor) or 30 | isinstance(x, tf.Variable)) 31 | 32 | 33 | def shape(x): 34 | """ Return shape of either tensorflow Tensor or numpy array. """ 35 | return (tuple(x.get_shape().as_list()) if istf(x) 36 | else x.shape) 37 | 38 | 39 | def sum(*args, **kwargs): 40 | """ Return np.sum or tf.reduce_sum according to input. """ 41 | return fun(['reduce_sum', 'sum'], *args, **kwargs) 42 | 43 | 44 | def fun(fun_name, *args, **kwargs): 45 | """ Return np or tf version of function according to input. 46 | 47 | Args: 48 | fun_name (list or str): if str, return tf.fun_name or np.fun_name 49 | if list, return tf.fun_name[0] or np.fun_name[1] 50 | """ 51 | if isinstance(fun_name, list): 52 | f1, f2 = fun_name 53 | else: 54 | f1, f2 = fun_name, fun_name 55 | return (getattr(tf, f1)(*args, **kwargs) if any([istf(a) for a in args]) 56 | else getattr(np, f2)(*args, **kwargs)) 57 | 58 | 59 | def dot(x, y, *args, **kwargs): 60 | """ Return np.tensordot or tf.tensordot according to input. """ 61 | if istf(x) and not istf(y): 62 | y = tf.constant(y) 63 | if istf(y) and not istf(x): 64 | x = tf.constant(x) 65 | if istf(x) and istf(y): 66 | x, y = safe_cast(x, y) 67 | return tf.tensordot(x, y, *args, **kwargs) 68 | else: 69 | return np.tensordot(x, y, *args, **kwargs) 70 | 71 | 72 | def concat(*args, axis=0): 73 | """ Return np.concatenate or tf.concat according to input. """ 74 | # this is a bit fragile, why should y take type of x and not the other way around? 75 | return (tf.concat([*args], axis=axis) if istf(args[0]) 76 | else np.concatenate(args, axis=axis)) 77 | 78 | 79 | def fft(x, *args, **kwargs): 80 | """ Return np.fft.fft or tf.fft according to input. """ 81 | return (tf.fft(x, *args, **kwargs) if istf(x) 82 | else np.fft.fft(x, *args, **kwargs)) 83 | 84 | 85 | def conj(*args, **kwargs): 86 | return fun('conj', *args, **kwargs) 87 | 88 | 89 | def transpose(*args, **kwargs): 90 | return fun('transpose', *args, **kwargs) 91 | 92 | 93 | def reshape(*args, **kwargs): 94 | return fun('reshape', *args, **kwargs) 95 | 96 | 97 | def real_or_imag(x, part): 98 | """ Return real or imaginary part of either tensorflow Tensor or numpy array. """ 99 | if istf(x): 100 | fun = getattr(tf, part) 101 | if x.dtype.is_complex: 102 | return fun(x) 103 | else: 104 | nbits = x.dtype.size*8 105 | return fun(tf.cast(x, 'complex{}'.format(nbits*2))) 106 | else: 107 | fun = getattr(np, part) 108 | return fun(x) 109 | 110 | 111 | def real(x): 112 | return real_or_imag(x, 'real') 113 | 114 | 115 | def imag(x): 116 | return real_or_imag(x, 'imag') 117 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DualPoseNet 2 | Code for "DualPoseNet: Category-level 6D Object Pose and Size Estimation Using Dual Pose Network with Refined Learning of Pose Consistency", ICCV2021. 
[[Paper](https://openaccess.thecvf.com/content/ICCV2021/papers/Lin_DualPoseNet_Category-Level_6D_Object_Pose_and_Size_Estimation_Using_Dual_ICCV_2021_paper.pdf)][[Supp](https://openaccess.thecvf.com/content/ICCV2021/supplemental/Lin_DualPoseNet_Category-Level_6D_ICCV_2021_supplemental.pdf)][[Arxiv](https://arxiv.org/abs/2103.06526)]
3 |
4 | Created by Jiehong Lin, Zewei Wei, Zhihao Li, Songcen Xu, [Kui Jia](http://kuijia.site/), and Yuanqing Li.
5 |
6 | ![image](https://github.com/Gorilla-Lab-SCUT/DualPoseNet/blob/main/doc/FigNetwork2.png)
7 |
8 |
9 | ## Citation
10 | If you find our work useful in your research, please consider citing:
11 |
12 |     @InProceedings{Lin_2021_ICCV,
13 |         author    = {Lin, Jiehong and Wei, Zewei and Li, Zhihao and Xu, Songcen and Jia, Kui and Li, Yuanqing},
14 |         title     = {DualPoseNet: Category-Level 6D Object Pose and Size Estimation Using Dual Pose Network With Refined Learning of Pose Consistency},
15 |         booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
16 |         month     = {October},
17 |         year      = {2021},
18 |         pages     = {3560-3569}
19 |     }
20 |
21 | ## Requirements
22 | This code has been tested with
23 | - python 3.6.5
24 | - tensorflow-gpu 1.11.0
25 | - CUDA 11.2
26 |
27 | ## Downloads
28 | - Pre-trained models [[link](https://drive.google.com/file/d/16DVaudTE_K_dqXbKoW7SASgxRQnfHApI/view?usp=sharing)]
29 | - Segmentation predictions on CAMERA25 and REAL275 [[link](https://drive.google.com/file/d/1RwAbFWw2ITX9mXzLUEBjPy_g-MNdyHET/view?usp=sharing)]
30 | - Pose predictions on CAMERA25 and REAL275 [[link](https://drive.google.com/file/d/10TBFY73BMmTxfErlbMqZKfClZgxMzMd9/view?usp=sharing)]
31 |
32 | ## Evaluation
33 | Evaluate the results of DualPoseNet reported in the paper:
34 |
35 | ```
36 | python eval.py
37 | ```
38 | ## Data Preparation
39 |
40 | Download the data provided by [NOCS](https://github.com/hughw19/NOCS_CVPR2019) ([real_train](http://download.cs.stanford.edu/orion/nocs/real_train.zip), [real_test](http://download.cs.stanford.edu/orion/nocs/real_test.zip),
41 | [ground truths](http://download.cs.stanford.edu/orion/nocs/gts.zip),
42 | and [mesh models](http://download.cs.stanford.edu/orion/nocs/obj_models.zip)), and unzip them in the folder ```data/``` as follows:
43 |
44 | ```
45 | data
46 | ├── CAMERA
47 | │   ├── train
48 | │   └── val
49 | ├── Real
50 | │   ├── train
51 | │   └── test
52 | ├── gts
53 | │   ├── val
54 | │   └── real_test
55 | └── obj_models
56 |     ├── train
57 |     ├── val
58 |     ├── real_train
59 |     └── real_test
60 | ```
61 |
62 | Run the following script to prepare training instances:
63 |
64 | ```
65 | cd provider
66 | python training_data_prepare.py
67 | ```
68 |
69 | ## Training
70 |
71 | Command for training DualPoseNet:
72 | ```
73 | python main.py --phase train --dataset REAL275
74 | ```
75 | The configurations can be modified in ```utils/configs.py```.
76 |
77 | ## Testing
78 | Command for testing DualPoseNet without refined learning:
79 | ```
80 | python main.py --phase test --dataset REAL275
81 | ```
82 |
83 | Command for testing DualPoseNet with refined learning:
84 | ```
85 | python main.py --phase test_refine_encoder --dataset REAL275
86 | ```
87 |
88 | We also provide another, faster way of refinement by directly fine-tuning the pose-sensitive features:
89 | ```
90 | python main.py --phase test_refine_feature --dataset REAL275
91 | ```
92 |
93 | The configurations can also be modified in ```utils/configs.py```.
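Since ```configs.parse``` also accepts a dict of overrides (see ```utils/configs.py```), the test phases can be scripted without command-line flags. Below is a minimal sketch that mirrors what ```main.py``` does internally; it assumes it is run from the repository root and that a trained checkpoint for the chosen epoch already exists under ```log/REAL275```:

```
import os, sys
sys.path.extend(['model', 'provider', 'utils'])

import tensorflow as tf
import configs
from dualposenet import DualPoseNet
from evaluation_utils import evaluate

# Override only the options that differ from the defaults in utils/configs.py.
FLAGS = configs.parse({'phase': 'test', 'dataset': 'REAL275', 'test_epoch': 30})
FLAGS.log_dir = os.path.join('log', FLAGS.dataset)
FLAGS.test_log_dir = os.path.join(FLAGS.log_dir, FLAGS.phase + '_epoch' + str(FLAGS.test_epoch))
if not os.path.exists(FLAGS.test_log_dir):
    os.makedirs(FLAGS.test_log_dir)

run_config = tf.ConfigProto()
run_config.gpu_options.allow_growth = True
with tf.Session(config=run_config) as sess:
    DualPoseNet(FLAGS, sess).test()   # or .test_refine_encoder() / .test_refine_feature()

evaluate(FLAGS.test_log_dir)          # same metrics as eval.py
```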
94 | 95 | ## Acknowledgements 96 | 97 | Our implementation leverages the code from [SCNN](https://github.com/daniilidis-group/spherical-cnn), [NOCS](https://github.com/hughw19/NOCS_CVPR2019) and [SPD](https://github.com/mentian/object-deformnet). 98 | 99 | ## License 100 | Our code is released under MIT License (see LICENSE file for details). 101 | 102 | ## Contact 103 | `lin.jiehong@mail.scut.edu.cn` 104 | 105 | `kuijia@scut.edu.cn` 106 | 107 | 108 | -------------------------------------------------------------------------------- /provider/dataloder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Gorilla Lab, SCUT. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | """ 7 | Dataloders of REAL275 and CAMERA25. 8 | 9 | Author: Jiehong Lin 10 | """ 11 | 12 | import os 13 | import numpy as np 14 | import queue 15 | import threading 16 | import glob 17 | import _pickle as cPickle 18 | 19 | class Fetcher(threading.Thread): 20 | def __init__(self, opts): 21 | super(Fetcher, self).__init__() 22 | self.queue = queue.Queue(50) 23 | self.stopped = False 24 | self.opts = opts 25 | 26 | 27 | data_paths = glob.glob('data/training_instance/CAMERA25_*.pkl') 28 | if self.opts.dataset == 'REAL275': 29 | data_paths.append('data/training_instance/REAL275.pkl') 30 | print(data_paths) 31 | 32 | self.observed_pc = [] 33 | self.input_dis = [] 34 | self.input_rgb = [] 35 | self.rotation = [] 36 | self.translation = [] 37 | self.scale = [] 38 | 39 | for data_path in data_paths: 40 | print(data_path) 41 | with open(data_path, 'rb') as f: 42 | data = cPickle.load(f) 43 | self.observed_pc.append(data['observed_pc']) 44 | self.input_dis.append(data['input_dis']) 45 | self.input_rgb.append(data['input_rgb']) 46 | self.rotation.append(data['rotation']) 47 | self.translation.append(data['translation']) 48 | self.scale.append(data['scale']) 49 | 50 | self.observed_pc = np.concatenate(self.observed_pc, axis=0) 51 | self.input_dis = np.concatenate(self.input_dis, axis=0) 52 | self.input_rgb = np.concatenate(self.input_rgb, axis=0) 53 | self.rotation = np.concatenate(self.rotation, axis=0) 54 | self.translation = np.concatenate(self.translation, axis=0) 55 | self.scale = np.concatenate(self.scale, axis=0) 56 | 57 | self.batch_size = self.opts.batch_size 58 | self.sample_cnt = self.input_dis.shape[0] 59 | self.num_batches = self.sample_cnt//self.batch_size 60 | print ("NUM_INSTANCE is %s"%(self.sample_cnt)) 61 | print ("NUM_BATCH is %s"%(self.num_batches)) 62 | 63 | def run(self): 64 | while not self.stopped: 65 | idx = np.arange(self.sample_cnt) 66 | np.random.shuffle(idx) 67 | self.observed_pc = self.observed_pc[idx, ...] 68 | self.input_dis = self.input_dis[idx, ...] 69 | self.input_rgb = self.input_rgb[idx, ...] 70 | self.rotation = self.rotation[idx, ...] 71 | self.translation = self.translation[idx, ...] 72 | self.scale = self.scale[idx, ...] 
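            # All six arrays above are permuted with the same index, so the spherical
            # maps, observed points and pose labels stay aligned per instance. The loop
            # below slices them into batches of batch_size and pushes each tuple onto
            # the bounded queue (maxsize 50); the training loop pops them via fetch(),
            # which is a thin wrapper around self.queue.get().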
73 | 74 | for batch_idx in range(self.num_batches): 75 | if self.stopped: 76 | return None 77 | start_idx = batch_idx * self.batch_size 78 | end_idx = (batch_idx + 1) * self.batch_size 79 | 80 | batch_input_dis = self.input_dis[start_idx:end_idx, :, :, :].copy() 81 | batch_input_rgb = self.input_rgb[start_idx:end_idx, :, :, :].copy() 82 | batch_observed_pc = self.observed_pc[start_idx:end_idx, :, :].copy() 83 | batch_rotation = self.rotation[start_idx:end_idx, :].copy() 84 | batch_translation = self.translation[start_idx:end_idx, :].copy() 85 | batch_scale = self.scale[start_idx:end_idx, :].copy() 86 | self.queue.put((batch_input_dis, batch_input_rgb, batch_observed_pc, batch_rotation, batch_translation, batch_scale)) 87 | return None 88 | 89 | def fetch(self): 90 | if self.stopped: 91 | return None 92 | return self.queue.get() 93 | 94 | def shutdown(self): 95 | self.stopped = True 96 | print ("Shutdown .....") 97 | while not self.queue.empty(): 98 | self.queue.get() 99 | print ("Remove all queue data") 100 | 101 | -------------------------------------------------------------------------------- /utils/align_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | RANSAC for Similarity Transformation Estimation 3 | Modified from https://github.com/hughw19/NOCS_CVPR2019 & https://github.com/mentian/object-deformnet 4 | Originally Written by Srinath Sridhar 5 | """ 6 | import time 7 | import numpy as np 8 | 9 | from pc_utils import backproject, pc2sphericalmap 10 | 11 | 12 | def estimateSimilarityUmeyama(SourceHom, TargetHom): 13 | # Copy of original paper is at: http://web.stanford.edu/class/cs273/refs/umeyama.pdf 14 | SourceCentroid = np.mean(SourceHom[:3, :], axis=1) 15 | TargetCentroid = np.mean(TargetHom[:3, :], axis=1) 16 | nPoints = SourceHom.shape[1] 17 | CenteredSource = SourceHom[:3, :] - np.tile(SourceCentroid, (nPoints, 1)).transpose() 18 | CenteredTarget = TargetHom[:3, :] - np.tile(TargetCentroid, (nPoints, 1)).transpose() 19 | CovMatrix = np.matmul(CenteredTarget, np.transpose(CenteredSource)) / nPoints 20 | if np.isnan(CovMatrix).any(): 21 | print('nPoints:', nPoints) 22 | print(SourceHom.shape) 23 | print(TargetHom.shape) 24 | raise RuntimeError('There are NANs in the input.') 25 | 26 | U, D, Vh = np.linalg.svd(CovMatrix, full_matrices=True) 27 | d = (np.linalg.det(U) * np.linalg.det(Vh)) < 0.0 28 | if d: 29 | D[-1] = -D[-1] 30 | U[:, -1] = -U[:, -1] 31 | # rotation 32 | Rotation = np.matmul(U, Vh) 33 | # scale 34 | varP = np.var(SourceHom[:3, :], axis=1).sum() 35 | Scale = 1 / varP * np.sum(D) 36 | # translation 37 | Translation = TargetHom[:3, :].mean(axis=1) - SourceHom[:3, :].mean(axis=1).dot(Scale*Rotation.T) 38 | # transformation matrix 39 | OutTransform = np.identity(4) 40 | OutTransform[:3, :3] = Scale * Rotation 41 | OutTransform[:3, 3] = Translation 42 | 43 | return Scale, Rotation, Translation, OutTransform 44 | 45 | 46 | def estimateSimilarityTransform(source: np.array, target: np.array, verbose=False): 47 | """ Add RANSAC algorithm to account for outliers. 48 | """ 49 | assert source.shape[0] == target.shape[0], 'Source and Target must have same number of points.' 
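    # Flow of the code below: lift both point sets to homogeneous coordinates,
    # derive an inlier threshold from the source diameter (diameter / 10), then run
    # up to 128 RANSAC iterations, each fitting a similarity transform to 5 random
    # correspondences with estimateSimilarityUmeyama. The hypothesis with the best
    # inlier ratio is refit on all of its inliers to produce the final scale,
    # rotation and translation.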
50 | SourceHom = np.transpose(np.hstack([source, np.ones([source.shape[0], 1])])) 51 | TargetHom = np.transpose(np.hstack([target, np.ones([target.shape[0], 1])])) 52 | # Auto-parameter selection based on source heuristics 53 | # Assume source is object model or gt nocs map, which is of high quality 54 | SourceCentroid = np.mean(SourceHom[:3, :], axis=1) 55 | nPoints = SourceHom.shape[1] 56 | CenteredSource = SourceHom[:3, :] - np.tile(SourceCentroid, (nPoints, 1)).transpose() 57 | SourceDiameter = 2 * np.amax(np.linalg.norm(CenteredSource, axis=0)) 58 | InlierT = SourceDiameter / 10.0 # 0.1 of source diameter 59 | maxIter = 128 60 | confidence = 0.99 61 | 62 | if verbose: 63 | print('Inlier threshold: ', InlierT) 64 | print('Max number of iterations: ', maxIter) 65 | 66 | BestInlierRatio = 0 67 | BestInlierIdx = np.arange(nPoints) 68 | for i in range(0, maxIter): 69 | # Pick 5 random (but corresponding) points from source and target 70 | RandIdx = np.random.randint(nPoints, size=5) 71 | Scale, _, _, OutTransform = estimateSimilarityUmeyama(SourceHom[:, RandIdx], TargetHom[:, RandIdx]) 72 | PassThreshold = Scale * InlierT # propagate inlier threshold to target scale 73 | Diff = TargetHom - np.matmul(OutTransform, SourceHom) 74 | ResidualVec = np.linalg.norm(Diff[:3, :], axis=0) 75 | InlierIdx = np.where(ResidualVec < PassThreshold)[0] 76 | nInliers = InlierIdx.shape[0] 77 | InlierRatio = nInliers / nPoints 78 | # update best hypothesis 79 | if InlierRatio > BestInlierRatio: 80 | BestInlierRatio = InlierRatio 81 | BestInlierIdx = InlierIdx 82 | if verbose: 83 | print('Iteration: ', i) 84 | print('Inlier ratio: ', BestInlierRatio) 85 | # early break 86 | if (1 - (1 - BestInlierRatio ** 5) ** i) > confidence: 87 | break 88 | 89 | if(BestInlierRatio < 0.1): 90 | print('[ WARN ] - Something is wrong. Small BestInlierRatio: ', BestInlierRatio) 91 | return None, None, None, None 92 | 93 | SourceInliersHom = SourceHom[:, BestInlierIdx] 94 | TargetInliersHom = TargetHom[:, BestInlierIdx] 95 | Scale, Rotation, Translation, OutTransform = estimateSimilarityUmeyama(SourceInliersHom, TargetInliersHom) 96 | 97 | if verbose: 98 | print('BestInlierRatio:', BestInlierRatio) 99 | print('Rotation:\n', Rotation) 100 | print('Translation:\n', Translation) 101 | print('Scale:', Scale) 102 | 103 | return Scale, Rotation, Translation, OutTransform 104 | 105 | 106 | def align_nocs_to_depth(masks, coords, depth, image, intrinsics, instance_ids, img_path, verbose=False): 107 | num_instances = len(instance_ids) 108 | error_messages = '' 109 | elapses = [] 110 | 111 | instance_pts = np.zeros((num_instances, 1024, 3)) 112 | dis_smaps = np.zeros((num_instances, 64, 64, 1)) 113 | rgb_smaps = np.zeros((num_instances, 64, 64, 3)) 114 | 115 | scales = np.zeros(num_instances) 116 | rotations = np.zeros((num_instances, 3, 3)) 117 | translations = np.zeros((num_instances, 3)) 118 | 119 | 120 | for i in range(num_instances): 121 | mask = masks[:, :, i] 122 | coord = coords[:, :, i, :] 123 | pts, idxs = backproject(depth, intrinsics, mask) 124 | coord_pts = coord[idxs[0], idxs[1], :] - 0.5 125 | try: 126 | start = time.time() 127 | s, R, T, outtransform = estimateSimilarityTransform(coord_pts, pts, False) 128 | elapsed = time.time() - start 129 | if verbose: 130 | print('elapsed: ', elapsed) 131 | elapses.append(elapsed) 132 | except Exception as e: 133 | message = '[ Error ] aligning instance {} in {} fails. 
Message: {}.'.format(instance_ids[i], img_path, str(e)) 134 | print(message) 135 | error_messages += message + '\n' 136 | s = 1.0 137 | R = np.eye(3) 138 | T = np.zeros(3) 139 | outtransform = np.identity(4, dtype=np.float32) 140 | 141 | pts = pts / 1000.0 142 | centroid = np.mean(pts, axis=0) 143 | pts = pts - centroid[np.newaxis, :] 144 | T = T / 1000.0 - centroid 145 | 146 | img = image[idxs[0], idxs[1], :] 147 | img = (img-np.array([123.7, 116.8, 103.9])[np.newaxis, :])/255.0 148 | dis_map, rgb_map = pc2sphericalmap( 149 | pts, img, resolution=64) 150 | 151 | if pts.shape[0]>1024: 152 | pts = pts[np.random.choice( 153 | pts.shape[0], 1024, replace=False), :] 154 | else: 155 | pts = pts[np.random.choice( 156 | pts.shape[0], 1024), :] 157 | 158 | instance_pts[i, :, :] = pts 159 | dis_smaps[i, :, :, :] = dis_map[0] 160 | rgb_smaps[i, :, :, :] = rgb_map[0] 161 | 162 | scales[i] = s / 1000.0 163 | rotations[i, :, :] = R.transpose() 164 | translations[i, :] = T 165 | 166 | return instance_pts, dis_smaps, rgb_smaps, scales, rotations, translations, error_messages, elapses -------------------------------------------------------------------------------- /model/modules.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Gorilla Lab, SCUT. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | """ 7 | Main Modules in DualPoseNet. 8 | 9 | * The code of encoder is modified based on: https://github.com/daniilidis-group/spherical-cnn. 10 | 11 | Author: Jiehong Lin 12 | """ 13 | import numpy as np 14 | import tensorflow as tf 15 | 16 | import spherical_utils 17 | from layers import sphconv, conv1d, fully_connected, area_weights, block 18 | 19 | 20 | def dup(x): 21 | """ Return two references for input; useful when creating NNs and storing references to layers """ 22 | return [x, x] 23 | 24 | 25 | class encoder(object): 26 | def __init__(self, opts, is_training, name="encoder"): 27 | self.opts = opts 28 | self.is_training = is_training 29 | self.name = name 30 | self.reuse = False 31 | self.bn = False 32 | 33 | self.method = self.opts.transform_method 34 | self.real = self.opts.real_inputs 35 | 36 | if self.method == 'naive': 37 | args = self.opts 38 | fun = lambda *args, **kwargs: spherical_utils.sph_harm_all( 39 | *args, **kwargs, real=self.real) 40 | 41 | with tf.name_scope('harmonics_or_legendre'): 42 | res = self.opts.input_res 43 | self.harmonics = [fun(res // (2**i), as_tfvar=True) 44 | for i in range(sum(self.opts.pool_layers) + 1)] 45 | 46 | def __call__(self, dis_inputs, rgb_inputs): 47 | 48 | net = {} 49 | high = 0 50 | low = 1 51 | args = self.opts 52 | convfun = sphconv 53 | l_or_h = self.harmonics 54 | method = self.method 55 | dis_curr = dis_inputs 56 | rgb_curr = rgb_inputs 57 | 58 | with tf.variable_scope(self.name, reuse=self.reuse): 59 | 60 | # Main Streams with Spherical Fusion 61 | for i, (nf, pool) in enumerate(zip(args.nfilters, args.pool_layers)): 62 | if i in [3, 5, 7]: 63 | with tf.variable_scope('fusion_conv{}'.format(i), reuse=tf.AUTO_REUSE): 64 | fusion_curr = tf.concat([dis_curr, rgb_curr], axis=-1) 65 | net['fusion_conv{}'.format(i)], fusion_curr = dup(block(args, convfun, self.is_training, fusion_curr, nf, 66 | n_filter_params=args.n_filter_params, weight_decay=args.weight_decay, 67 | harmonics_or_legendre=l_or_h[high], method=method)) 68 | dis_curr = tf.concat([dis_curr, fusion_curr], axis=-1) 69 | rgb_curr = 
tf.concat([rgb_curr, fusion_curr], axis=-1) 70 | else: 71 | if not pool: 72 | with tf.variable_scope('dis_conv{}'.format(i), reuse=tf.AUTO_REUSE): 73 | net['dis_conv{}'.format(i)], dis_curr = dup(block(args, convfun, self.is_training, dis_curr, nf, 74 | n_filter_params=args.n_filter_params, weight_decay=args.weight_decay, 75 | harmonics_or_legendre=l_or_h[high], method=method)) 76 | with tf.variable_scope('rgb_conv{}'.format(i), reuse=tf.AUTO_REUSE): 77 | net['rgb_conv{}'.format(i)], rgb_curr = dup(block(args, convfun, self.is_training, rgb_curr, nf, 78 | n_filter_params=args.n_filter_params, weight_decay=args.weight_decay, 79 | harmonics_or_legendre=l_or_h[high], method=method)) 80 | else: 81 | with tf.variable_scope('dis_conv{}'.format(i), reuse=tf.AUTO_REUSE): 82 | net['dis_conv{}'.format(i)], dis_curr = dup(block(args, convfun, self.is_training, dis_curr, nf, n_filter_params=args.n_filter_params, 83 | weight_decay=args.weight_decay, harmonics_or_legendre=l_or_h[ 84 | high], method=method, 85 | spectral_pool=0, harmonics_or_legendre_low=l_or_h[low])) 86 | with tf.variable_scope('rgb_conv{}'.format(i), reuse=tf.AUTO_REUSE): 87 | net['rgb_conv{}'.format(i)], rgb_curr = dup(block(args, convfun, self.is_training, rgb_curr, nf, n_filter_params=args.n_filter_params, 88 | weight_decay=args.weight_decay, harmonics_or_legendre=l_or_h[ 89 | high], method=method, 90 | spectral_pool=0, harmonics_or_legendre_low=l_or_h[low])) 91 | # pooling 92 | dis_curr = area_weights(tf.layers.average_pooling2d( 93 | area_weights(dis_curr), 2*pool, 2*pool, 'same'), invert=True) 94 | rgb_curr = area_weights(tf.layers.average_pooling2d( 95 | area_weights(rgb_curr), 2*pool, 2*pool, 'same'), invert=True) 96 | 97 | high += 1 98 | low += 1 99 | 100 | # Aggregation of Multi-scale Spherical Features 101 | feat1 = net['fusion_conv3'] 102 | feat1 = tf.layers.flatten(inputs=feat1) 103 | feat1 = fully_connected(feat1, 1024, scope='flatten_conv3', weight_decay=self.opts.weight_decay, 104 | activation_fn=tf.nn.leaky_relu, is_training=self.is_training) 105 | 106 | feat2 = net['fusion_conv5'] 107 | feat2 = tf.layers.flatten(inputs=feat2) 108 | feat2 = fully_connected(feat2, 1024, scope='flatten_conv5', weight_decay=self.opts.weight_decay, 109 | activation_fn=tf.nn.leaky_relu, is_training=self.is_training) 110 | 111 | feat3 = net['fusion_conv7'] 112 | feat3 = tf.layers.flatten(inputs=feat3) 113 | feat3 = fully_connected(feat3, 1024, scope='flatten_conv7', weight_decay=self.opts.weight_decay, 114 | activation_fn=tf.nn.leaky_relu, is_training=self.is_training) 115 | 116 | if self.opts.fusion == 'Avg': 117 | curr = tf.concat([tf.expand_dims(feat1, 2), tf.expand_dims( 118 | feat2, 2), tf.expand_dims(feat3, 2)], axis=2) 119 | curr = tf.reduce_mean(curr, axis=2) 120 | elif self.opts.fusion == 'Max': 121 | curr = tf.concat([tf.expand_dims(feat1, 2), tf.expand_dims( 122 | feat2, 2), tf.expand_dims(feat3, 2)], axis=2) 123 | curr = tf.reduce_max(curr, axis=2) 124 | elif self.opts.fusion == 'Cat': 125 | curr = tf.concat([feat1, feat2, feat3], axis=-1) 126 | else: 127 | assert False 128 | 129 | curr = fully_connected(curr, 1024, scope='fc1', weight_decay=self.opts.weight_decay, 130 | activation_fn=tf.nn.leaky_relu, is_training=self.is_training) 131 | curr = fully_connected(curr, 1024, scope='fc2', weight_decay=self.opts.weight_decay, 132 | activation_fn=tf.nn.leaky_relu, is_training=self.is_training) 133 | 134 | self.reuse = True 135 | self.variables = tf.get_collection( 136 | tf.GraphKeys.TRAINABLE_VARIABLES, self.name) 137 | 138 | return 
curr 139 | 140 | 141 | class explicit_decoder(object): 142 | def __init__(self, opts, is_training, name="explicit_decoder"): 143 | self.opts = opts 144 | self.is_training = is_training 145 | self.name = name 146 | self.reuse = False 147 | self.bn = False 148 | 149 | def __call__(self, feat): 150 | 151 | with tf.variable_scope(self.name, reuse=self.reuse): 152 | 153 | rot_feat = fully_connected(feat, 1024, scope='rot_fc1', weight_decay=self.opts.weight_decay, 154 | activation_fn=tf.nn.leaky_relu, is_training=self.is_training) 155 | rot_feat = fully_connected(rot_feat, 512, scope='rot_fc2', weight_decay=self.opts.weight_decay, 156 | activation_fn=tf.nn.leaky_relu, is_training=self.is_training) 157 | rot_feat = tf.layers.dense( 158 | inputs=rot_feat, activation=None, units=4, name="rot_fc3") 159 | rotation = rot_feat / \ 160 | (tf.linalg.norm(rot_feat, axis=1, keepdims=True)+1e-10) 161 | 162 | trans_feat = fully_connected(feat, 1024, scope='trans_fc1', weight_decay=self.opts.weight_decay, 163 | activation_fn=tf.nn.leaky_relu, is_training=self.is_training) 164 | trans_feat = fully_connected(trans_feat, 512, scope='trans_fc2', weight_decay=self.opts.weight_decay, 165 | activation_fn=tf.nn.leaky_relu, is_training=self.is_training) 166 | translation = tf.layers.dense( 167 | inputs=trans_feat, activation=None, units=3, name="trans_fc3") 168 | 169 | scale_feat = fully_connected(feat, 1024, scope='scale_fc1', weight_decay=self.opts.weight_decay, 170 | activation_fn=tf.nn.leaky_relu, bn=self.opts.batch_norm, is_training=self.is_training) 171 | scale_feat = fully_connected(scale_feat, 512, scope='scale_fc2', weight_decay=self.opts.weight_decay, 172 | activation_fn=tf.nn.leaky_relu, bn=self.opts.batch_norm, is_training=self.is_training) 173 | scale = tf.layers.dense( 174 | inputs=scale_feat, activation=None, units=3, name="scale_fc3") 175 | 176 | 177 | self.reuse = True 178 | self.variables = tf.get_collection( 179 | tf.GraphKeys.TRAINABLE_VARIABLES, self.name) 180 | 181 | return translation, rotation, scale 182 | 183 | 184 | class implicit_decoder(object): 185 | def __init__(self, opts, is_training, output_npoint=1024, name="implcit_decoder"): 186 | self.opts = opts 187 | self.npoint = output_npoint 188 | self.is_training = is_training 189 | self.name = name 190 | self.reuse = False 191 | 192 | def __call__(self, pts, feat): 193 | with tf.variable_scope(self.name, reuse=self.reuse): 194 | b = pts.get_shape()[0].value 195 | n = pts.get_shape()[1].value 196 | c = feat.get_shape()[1].value 197 | 198 | feat = tf.concat( 199 | [pts, tf.tile(tf.reshape(feat, [b, 1, c]), [1, n, 1])], axis=-1) 200 | feat = conv1d(feat, 1024, 1, scope='conv1', weight_decay=self.opts.weight_decay, 201 | activation_fn=tf.nn.leaky_relu, is_training=self.is_training) 202 | feat = conv1d(feat, 512, 1, scope='conv2', weight_decay=self.opts.weight_decay, 203 | activation_fn=tf.nn.leaky_relu, is_training=self.is_training) 204 | feat = conv1d(feat, 256, 1, scope='conv3', weight_decay=self.opts.weight_decay, 205 | activation_fn=tf.nn.leaky_relu, is_training=self.is_training) 206 | Q = conv1d(feat, 3, 1, scope='conv4', weight_decay=self.opts.weight_decay, 207 | activation_fn=None, bn=False, is_training=self.is_training) 208 | 209 | self.reuse = True 210 | self.variables = tf.get_collection( 211 | tf.GraphKeys.TRAINABLE_VARIABLES, self.name) 212 | 213 | return Q 214 | -------------------------------------------------------------------------------- /model/layers.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Gorilla Lab, SCUT. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | """ Custom layers. 7 | 8 | Modified based on: https://github.com/daniilidis-group/spherical-cnn. 9 | """ 10 | 11 | 12 | import tensorflow as tf 13 | import numpy as np 14 | 15 | import spherical_utils 16 | import tfnp_compatibility as tfnp 17 | 18 | 19 | def sphconv(inputs, filters, use_bias=True, n_filter_params=0, weight_decay=0, 20 | *args, **kwargs): 21 | shapei = tfnp.shape(inputs) 22 | spectral_input = True if len(shapei) == 5 else False 23 | nchan = shapei[-1] 24 | n = shapei[2] 25 | if spectral_input: 26 | n *= 2 27 | 28 | with tf.variable_scope(None, default_name='sphconv'): 29 | # we desire that variance to stay constant after every layer 30 | # factor n // 2 is because we sum n // 2 coefficients 31 | # factor nchan is because we sum nchan channels 32 | # factor 2pi is the 'gain' of integrating over SO(3) 33 | # the factor 2 takes into account non-zero mean 34 | # (see He et al 2015 "Delving deep into rectifiers") 35 | std = 2./(2 * np.pi * np.sqrt((n // 2) * (nchan))) 36 | regularizer = tf.contrib.layers.l2_regularizer( 37 | weight_decay) if weight_decay > 0 else None 38 | 39 | if n_filter_params == 0: 40 | weights = tf.get_variable('W', 41 | trainable=True, 42 | initializer=tf.truncated_normal([nchan, n // 2, filters], 43 | stddev=std), 44 | regularizer=regularizer) 45 | ker = weights[:, np.newaxis, :, np.newaxis, :] 46 | else: 47 | nw_in = n_filter_params 48 | if nw_in > n // 2: 49 | nw_in = n // 2 50 | 51 | weights = tf.get_variable('W', 52 | trainable=True, 53 | initializer=tf.truncated_normal([nchan, 54 | nw_in, 55 | filters], 56 | stddev=std), 57 | regularizer=regularizer) 58 | xw_in = np.linspace(0, 1, nw_in) 59 | xw_out = np.linspace(0, 1, n // 2) 60 | id_out = np.searchsorted(xw_in, xw_out) 61 | subws = [] 62 | for i, x in zip(id_out, xw_out): 63 | # linear interpolation 64 | # HACK! 
we assume the first indices match so i-1 when i==0 cancels out 65 | subws.append(weights[:, i-1, :] + 66 | (weights[:, i, :] - weights[:, i-1, :]) * 67 | (x-xw_in[i-1]) / 68 | ((xw_in[i]-xw_in[i-1]))) 69 | ker = tf.stack(subws, axis=1)[:, np.newaxis, :, np.newaxis, :] 70 | 71 | if use_bias: 72 | bias = tf.get_variable('b', 73 | trainable=True, 74 | initializer=tf.zeros([1, 1, 1, filters], dtype=tf.float32)) 75 | else: 76 | bias = tf.zeros([1, 1, 1, filters], dtype=tf.float32) 77 | 78 | conv = spherical_utils.sph_conv_batch(inputs, ker, *args, **kwargs) 79 | conv = conv + bias 80 | 81 | for k, v in {'W': weights, 'b': bias, 'activations': conv}.items(): 82 | tf.summary.histogram(k, v) 83 | # avg 84 | tf.summary.scalar('norm_activation', tf.reduce_mean( 85 | tf.norm(conv, axis=(1, 2)) / n)) 86 | 87 | return conv 88 | 89 | 90 | def block(params, fun, is_training=None, *args, **kwargs): 91 | """ Block consisting of weight layer + batch norm + nonlinearity""" 92 | params.batch_norm = False 93 | use_bias = not params.batch_norm 94 | curr = fun(*args, **kwargs, use_bias=use_bias) 95 | if params.batch_norm: 96 | curr = tf.layers.batch_normalization(curr, 97 | # doesn't make sense to learn scale when using ReLU 98 | fused=False, 99 | scale=False, 100 | training=is_training, 101 | renorm=params.batch_renorm) 102 | for v in tf.get_variable_scope().trainable_variables(): 103 | if 'batch_normalization' in v.name: 104 | tf.summary.histogram(v.name, v) 105 | 106 | return nonlin(params)(curr) 107 | 108 | 109 | def nonlin(params): 110 | return getattr(tf.nn, params.nonlin, globals().get(params.nonlin)) 111 | 112 | 113 | def identity(inputs): 114 | return inputs 115 | 116 | 117 | def prelu(inputs): 118 | """ From: https://stackoverflow.com/a/40264459 """ 119 | alphas = tf.Variable(0.1 * tf.ones(inputs.get_shape()[-1]), 120 | trainable=True, 121 | dtype=tf.float32) 122 | pos = tf.nn.relu(inputs) 123 | neg = alphas * (inputs - abs(inputs)) * 0.5 124 | 125 | return pos + neg 126 | 127 | 128 | def area_weights(x, invert=False): 129 | """ Apply weight each cell according to its area; useful for averaging/avg pooling. """ 130 | n = tfnp.shape(x)[1] 131 | phi, theta = spherical_utils.sph_sample(n) 132 | phi += np.diff(phi)[0]/2 133 | # this is proportional to the cell area, not exactly the area 134 | # this is the same as using |cos\phi_1 - cos\phi_2| 135 | if invert: 136 | x /= np.sin(phi)[np.newaxis, np.newaxis, :, np.newaxis] 137 | else: 138 | x *= np.sin(phi)[np.newaxis, np.newaxis, :, np.newaxis] 139 | 140 | return x 141 | 142 | 143 | def fully_connected(inputs, 144 | num_outputs, 145 | scope, 146 | use_xavier=True, 147 | stddev=1e-3, 148 | weight_decay=0.00001, 149 | activation_fn=tf.nn.relu, 150 | bn=False, 151 | bn_decay=None, 152 | use_bias=True, 153 | is_training=None): 154 | """ Fully connected layer with non-linear operation. 155 | 156 | Args: 157 | inputs: 2-D tensor BxN 158 | num_outputs: int 159 | 160 | Returns: 161 | Variable tensor of size B x num_outputs. 
162 | """ 163 | 164 | with tf.variable_scope(scope) as sc: 165 | if use_xavier: 166 | initializer = tf.contrib.layers.xavier_initializer() 167 | else: 168 | initializer = tf.truncated_normal_initializer(stddev=stddev) 169 | 170 | if weight_decay > 0: 171 | outputs = tf.layers.dense(inputs, num_outputs, 172 | use_bias=use_bias, kernel_initializer=initializer, 173 | kernel_regularizer=tf.contrib.layers.l2_regularizer( 174 | weight_decay), 175 | bias_regularizer=tf.contrib.layers.l2_regularizer( 176 | weight_decay), 177 | reuse=None) 178 | else: 179 | outputs = tf.layers.dense(inputs, num_outputs, 180 | use_bias=use_bias, kernel_initializer=initializer, 181 | kernel_regularizer=None, 182 | bias_regularizer=None, 183 | reuse=None) 184 | if bn: 185 | # outputs = tf.layers.batch_normalization(outputs, momentum=bn_decay, training=is_training, renorm=False) 186 | outputs = tf.layers.batch_normalization( 187 | outputs, training=is_training, renorm=False) 188 | 189 | if activation_fn is not None: 190 | outputs = activation_fn(outputs) 191 | 192 | return outputs 193 | 194 | 195 | def conv1d(inputs, 196 | num_output_channels, 197 | kernel_size, 198 | scope=None, 199 | stride=1, 200 | padding='SAME', 201 | use_xavier=True, 202 | stddev=1e-3, 203 | weight_decay=0.00001, 204 | activation_fn=tf.nn.relu, 205 | bn=False, 206 | bn_decay=None, 207 | use_bias=True, 208 | is_training=None, 209 | reuse=None): 210 | """ 1D convolution with non-linear operation. 211 | 212 | Args: 213 | inputs: 3-D tensor variable BxHxWxC 214 | num_output_channels: int 215 | kernel_size: int 216 | scope: string 217 | stride: a list of 2 ints 218 | padding: 'SAME' or 'VALID' 219 | use_xavier: bool, use xavier_initializer if true 220 | stddev: float, stddev for truncated_normal init 221 | weight_decay: float 222 | activation_fn: function 223 | bn: bool, whether to use batch norm 224 | bn_decay: float or float tensor variable in [0,1] 225 | is_training: bool Tensor variable 226 | 227 | Returns: 228 | Variable tensor 229 | """ 230 | with tf.variable_scope(scope, reuse=reuse): 231 | if use_xavier: 232 | initializer = tf.contrib.layers.xavier_initializer() 233 | else: 234 | initializer = tf.truncated_normal_initializer(stddev=stddev) 235 | 236 | if weight_decay > 0: 237 | outputs = tf.layers.conv1d(inputs, num_output_channels, kernel_size, stride, padding, 238 | kernel_initializer=initializer, 239 | kernel_regularizer=tf.contrib.layers.l2_regularizer( 240 | weight_decay), 241 | bias_regularizer=tf.contrib.layers.l2_regularizer( 242 | weight_decay), 243 | use_bias=use_bias, reuse=None) 244 | else: 245 | outputs = tf.layers.conv1d(inputs, num_output_channels, kernel_size, stride, padding, 246 | kernel_initializer=initializer, 247 | kernel_regularizer=None, 248 | bias_regularizer=None, 249 | use_bias=use_bias, reuse=None) 250 | 251 | if bn: 252 | outputs = tf.layers.batch_normalization( 253 | outputs, momentum=bn_decay, training=is_training, renorm=False, fused=True) 254 | 255 | if activation_fn is not None: 256 | outputs = activation_fn(outputs) 257 | 258 | return outputs 259 | 260 | 261 | def quaternion2mat(q): 262 | # quaternion2mat(q) = transforms3d.euler.qua2mat(q).T 263 | b = tf.shape(q)[0] 264 | 265 | w = tf.slice(q, [0, 0], [-1, 1]) 266 | x = tf.slice(q, [0, 1], [-1, 1]) 267 | y = tf.slice(q, [0, 2], [-1, 1]) 268 | z = tf.slice(q, [0, 3], [-1, 1]) 269 | 270 | r1 = tf.reshape( 271 | tf.concat([1-2*(y**2)-2*(z**2), 2*x*y+2*w*z, 2*x*z-2*w*y], axis=1), [b, 1, 3]) 272 | r2 = tf.reshape( 273 | tf.concat([2*x*y-2*w*z, 1-2*(x**2)-2*(z**2), 
2*y*z+2*w*x], axis=1), [b, 1, 3]) 274 | r3 = tf.reshape( 275 | tf.concat([2*x*z+2*w*y, 2*y*z-2*w*x, 1-2*(x**2)-2*(y**2)], axis=1), [b, 1, 3]) 276 | 277 | r = tf.concat([r1, r2, r3], axis=1) 278 | return r 279 | 280 | 281 | def point_transformation(pc, rotation, translation, scale): 282 | b = tf.shape(pc)[0] 283 | 284 | r = quaternion2mat(rotation) 285 | pc = (pc - tf.tile(tf.reshape(translation, [b, 1, 3]), [1, 1024, 1]))/tf.tile( 286 | tf.reshape((tf.linalg.norm(scale, axis=1)+1e-10), [b, 1, 1]), [1, 1024, 3]) 287 | pc = tf.matmul(pc, r) 288 | return pc -------------------------------------------------------------------------------- /utils/spherical_utils.py: -------------------------------------------------------------------------------- 1 | """ Spherical harmonics transforms/inverses and spherical convolution. 2 | 3 | Source: https://github.com/daniilidis-group/spherical-cnn 4 | """ 5 | import functools 6 | 7 | import tensorflow as tf 8 | import numpy as np 9 | from scipy.special import sph_harm 10 | 11 | import tfnp_compatibility as tfnp 12 | from tfnp_compatibility import istf 13 | 14 | 15 | def safe_cast(x, y): 16 | """ Cast x to type of y or y to type of x, without loss of precision. 17 | 18 | Works with complex and floats of any precision 19 | """ 20 | t = 'complex' if (x.dtype.is_complex or y.dtype.is_complex) else 'float' 21 | s = max(x.dtype.size, y.dtype.size) 22 | dtype = '{}{}'.format(t, s*8) 23 | 24 | return tf.cast(x, dtype), tf.cast(y, dtype) 25 | 26 | 27 | def sph_sample(n, mode='DH'): 28 | """ Sample grid on a sphere. 29 | 30 | Args: 31 | n (int): dimension is n x n 32 | mode (str): sampling mode; DH or GLQ 33 | 34 | Returns: 35 | theta, phi (1D arrays): polar and azimuthal angles 36 | """ 37 | assert n % 2 == 0 38 | j = np.arange(0, n) 39 | if mode == 'DH': 40 | return j*np.pi/n, j*2*np.pi/n 41 | elif mode == 'ours': 42 | return (2*j+1)*np.pi/2/n, j*2*np.pi/n 43 | elif mode == 'GLQ': 44 | from pyshtools.shtools import GLQGridCoord 45 | phi, theta = GLQGridCoord(n-1) 46 | # convert latitude to [0, np.pi/2] 47 | return np.radians(phi+90), np.radians(theta) 48 | elif mode == 'naive': 49 | # repeat first and last points; useful for plotting 50 | return np.linspace(0, np.pi, n), np.linspace(0, 2*np.pi, n) 51 | 52 | 53 | # cache outputs; 2050 > 32*64 54 | @functools.lru_cache(maxsize=2050, typed=False) 55 | def sph_harm_lm(l, m, n): 56 | """ Wrapper around scipy.special.sph_harm. Return spherical harmonic of degree l and order m. """ 57 | phi, theta = sph_sample(n) 58 | phi, theta = np.meshgrid(phi, theta) 59 | f = sph_harm(m, l, theta, phi) 60 | 61 | return f 62 | 63 | 64 | def sph_harm_all(n, as_tfvar=False, real=False): 65 | """ Compute spherical harmonics for an n x n input (degree up to n // 2) 66 | 67 | Args: 68 | n (int): input dimensions; order will be n // 2 69 | as_tfvar (bool): if True, return as list of tensorflow Variables. 70 | real (bool): if True, return real harmonics 71 | """ 72 | harmonics = [] 73 | 74 | for l in range(n // 2): 75 | if real: 76 | minl = 0 77 | else: 78 | minl = -l 79 | row = [] 80 | for m in range(minl, l+1): 81 | row.append(sph_harm_lm(l, m, n)) 82 | harmonics.append(row) 83 | 84 | if as_tfvar: 85 | return tf.cast(tf.constant(sph_harm_to_shtools(harmonics)), 86 | 'complex64') 87 | 88 | else: 89 | return harmonics 90 | 91 | 92 | def DHaj(n, mode='DH'): 93 | """ Sampling weights. 
""" 94 | # Driscoll and Healy sampling weights (on the phi dimension) 95 | # note: weights depend on the chosen grid, given by sph_sample 96 | if mode == 'DH': 97 | def gridfun(j): return np.pi*j/n 98 | elif mode == 'ours': 99 | def gridfun(j): return np.pi*(2*j+1)/2/n 100 | else: 101 | raise NotImplementedError() 102 | 103 | l = np.arange(0, n/2) 104 | a = [(2*np.sqrt(2)/n * 105 | np.sin(gridfun(j)) * 106 | (1/(2*l+1) * np.sin((2*l+1)*gridfun(j))).sum()) 107 | for j in range(n)] 108 | 109 | return a 110 | 111 | 112 | def sph_harm_transform(f, mode='DH', harmonics=None): 113 | """ Project spherical function into the spherical harmonics basis. """ 114 | assert f.shape[0] == f.shape[1] 115 | 116 | if isinstance(f, tf.Tensor): 117 | sumfun = tf.reduce_sum 118 | def conjfun(x): return tf.conj(x) 119 | n = f.shape[0].value 120 | else: 121 | sumfun = np.sum 122 | conjfun = np.conj 123 | n = f.shape[0] 124 | assert np.log2(n).is_integer() 125 | 126 | if harmonics is None: 127 | harmonics = sph_harm_all(n) 128 | 129 | a = DHaj(n, mode) 130 | 131 | f = f*np.array(a)[np.newaxis, :] 132 | 133 | real = is_real_sft(harmonics) 134 | 135 | coeffs = [] 136 | for l in range(n // 2): 137 | row = [] 138 | minl = 0 if real else -l 139 | for m in range(minl, l+1): 140 | # WARNING: results are off by this factor, when using driscoll1994computing formulas 141 | factor = 2*np.sqrt(np.pi) 142 | row.append(sumfun(factor * np.sqrt(2*np.pi)/n * 143 | f * conjfun(harmonics[l][m-minl]))) 144 | coeffs.append(row) 145 | 146 | return coeffs 147 | 148 | 149 | def sph_harm_inverse(c, harmonics=None): 150 | """ Inverse spherical harmonics transform. """ 151 | n = 2*len(c) 152 | 153 | real = is_real_sft(c) 154 | dtype = 'float32' if real else c[1][1].dtype 155 | if harmonics is None: 156 | harmonics = sph_harm_all(n, real=real) 157 | 158 | if isinstance(c[0][0], tf.Tensor): 159 | f = tf.zeros((n, n), dtype=dtype) 160 | else: 161 | f = np.zeros((n, n), dtype=dtype) 162 | 163 | for l in range(n // 2): 164 | lenm = l+1 if real else 2*l+1 165 | for m in range(lenm): 166 | if real: 167 | # leverage symmetry of coefficients and harmonics 168 | factor = 1 if m == 0 else 2 169 | f += factor*(tfnp.real(c[l][m]) * tfnp.real(harmonics[l][m]) - 170 | tfnp.imag(c[l][m]) * tfnp.imag(harmonics[l][m])) 171 | else: 172 | f += c[l][m] * harmonics[l][m] 173 | 174 | return f 175 | 176 | 177 | def sph_harm_transform_batch(f, method=None, *args, **kwargs): 178 | return sph_harm_transform_batch_naive(f, *args, **kwargs) 179 | 180 | 181 | def sph_harm_inverse_batch(c, method=None, *args, **kwargs): 182 | return sph_harm_inverse_batch_naive(c, *args, **kwargs) 183 | 184 | 185 | def sph_harm_transform_batch_naive(f, harmonics=None, m0_only=False): 186 | """ Spherical harmonics batch-transform. 
187 | 188 | Args: 189 | f (n, l, l, c)-array : functions are on l x l grid 190 | harmonics (2, l/2, l/2, l, l)-array: 191 | m0_only (bool): return only coefficients with order 0; 192 | only them are needed when computing convolutions 193 | 194 | Returns: 195 | coeffs ((n, 2, l/2, l/2, c)-array): 196 | """ 197 | shapef = tfnp.shape(f) 198 | n, l = shapef[:2] 199 | assert shapef[2] == l 200 | if harmonics is None: 201 | harmonics = sph_harm_to_shtools(sph_harm_all(l)) 202 | shapeh = tfnp.shape(harmonics) 203 | assert shapeh[1:] == (l//2, l//2, l, l) 204 | assert shapeh[0] in [1, 2] 205 | 206 | aj = np.array(DHaj(l)) 207 | 208 | # returns m=0 only; useful to expand kernel in spherical convolution 209 | if m0_only: 210 | harmonics = harmonics[slice(0, 1), :, slice(0, 1), ...] 211 | 212 | na = np.newaxis 213 | coeffs = tfnp.transpose(2*np.sqrt(2)*np.pi/l * 214 | tfnp.dot(f * aj[na, na, :, na], 215 | tfnp.conj(harmonics), 216 | [[1, 2], [3, 4]]), 217 | (0, 2, 3, 4, 1)) 218 | 219 | return coeffs 220 | 221 | 222 | def sph_harm_inverse_batch_naive(c, harmonics=None): 223 | """ Spherical harmonics batch inverse transform. 224 | 225 | Args: 226 | c ((n, 2, l/2, l/2, c)-array): sph harm coefficients; max degree is l/2 227 | harmonics (2, l/2, l/2, l, l)-array: 228 | 229 | Returns: 230 | recons ((n, l, l, c)-array): 231 | """ 232 | shapec = tfnp.shape(c) 233 | l = 2*shapec[2] 234 | assert shapec[3] == l//2 235 | if harmonics is None: 236 | harmonics = sph_harm_to_shtools(sph_harm_all(l)) 237 | shapeh = tfnp.shape(harmonics) 238 | assert shapeh[1:] == (l//2, l//2, l, l) 239 | assert shapeh[0] in [1, 2] 240 | 241 | real = True if shapeh[0] == 1 else False 242 | 243 | na = np.newaxis 244 | 245 | if real: 246 | # using m, -m symmetry: 247 | # c^{-m}Y^{-m} + c^mY^m = 2(Re(c^{m})Re(Y^m) - Im(c^{m})Im(Y^m)) 248 | # that does not apply to c_0 so we compensate by dividing it by two 249 | factor = np.ones(tfnp.shape(c)[1:])[np.newaxis, ...] 250 | factor[..., 0, :] = factor[..., 0, :]/2 251 | c = c * factor 252 | # c[..., 0, :] = c[..., 0, :]/2 253 | recons = tfnp.transpose(2*(tfnp.dot(tfnp.real(c), tfnp.real(harmonics), 254 | [[1, 2, 3], [0, 1, 2]]) - 255 | tfnp.dot(tfnp.imag(c), tfnp.imag(harmonics), 256 | [[1, 2, 3], [0, 1, 2]])), 257 | (0, 2, 3, 1)) 258 | else: 259 | recons = tfnp.transpose(tfnp.dot(c, harmonics, 260 | [[1, 2, 3], [0, 1, 2]]), 261 | (0, 2, 3, 1)) 262 | return recons 263 | 264 | 265 | def sph_conv(f, g, harmonics=None): 266 | """ Spherical convolution f * g. """ 267 | stackfun = tf.stack if isinstance(f, tf.Tensor) else np.array 268 | cf, cg = [sph_harm_transform(x, harmonics=harmonics) for x in [f, g]] 269 | cfg = [2*np.pi*np.sqrt(4*np.pi / (2*l+1)) * stackfun(c1) * c2[l] 270 | for l, (c1, c2) in enumerate(zip(cf, cg))] 271 | 272 | return sph_harm_inverse(cfg) 273 | 274 | 275 | def sph_conv_batch(f, g, 276 | harmonics_or_legendre=None, 277 | method=None, 278 | spectral_pool=0, 279 | harmonics_or_legendre_low=None): 280 | """ CNN-like batch spherical convolution. 281 | 282 | Args: 283 | f (n, l, l, c)-array: input feature map. 
n entries, c channels 284 | g (c, l, l, d)-array: convolution kernels 285 | harmonics_or_legendre (): spherical harmonics or legendre polynomials to expand f and g 286 | method (str): see sph_harm_transform_batch 287 | spectral_pool (int): if > 0 run spectral pooling before ISHT 288 | (bandwidth is reduced by a factor of 2**spectral_pool) 289 | harmonics_or_legendre_low (): low frequency harmonics of legendre to be used when spectral_pool==True 290 | 291 | Returns: 292 | fg (n, l, l, d)-array 293 | """ 294 | shapef, shapeg = [tfnp.shape(x) for x in [f, g]] 295 | spectral_filter = True if len(shapeg) == 5 else False 296 | spectral_input = True if len(shapef) == 5 else False 297 | n = shapef[2] 298 | if spectral_input: 299 | n *= 2 300 | 301 | if not spectral_input: 302 | cf = sph_harm_transform_batch( 303 | f, method, harmonics_or_legendre, m0_only=False) 304 | else: 305 | cf = f 306 | if not spectral_filter: 307 | cg = sph_harm_transform_batch( 308 | g, method, harmonics_or_legendre, m0_only=True) 309 | else: 310 | cg = g 311 | 312 | shapecf, shapecg = [tfnp.shape(x) for x in [cf, cg]] 313 | assert shapecf[4] == shapecg[0] 314 | assert shapecf[2] == shapecg[2] 315 | 316 | na = np.newaxis 317 | # per degree factor 318 | factor = (2*np.pi*np.sqrt(4*np.pi / (2*np.arange(n/2)+1)) 319 | )[na, na, :, na, na, na] 320 | cg = tfnp.transpose(cg, (1, 2, 3, 0, 4))[na, ...] 321 | cf = cf[..., na] 322 | if istf(cg) and istf(cf): 323 | cg, cf = safe_cast(cg, cf) 324 | cfg = factor * cf * cg 325 | else: 326 | cfg = factor * cf * cg 327 | 328 | if spectral_pool > 0: 329 | cfg = cfg[:, :, :n//(2**(spectral_pool+1)), :n // 330 | (2**(spectral_pool+1)), ...] 331 | hol = harmonics_or_legendre_low 332 | else: 333 | hol = harmonics_or_legendre 334 | 335 | # sum over channels 336 | cfg = tfnp.sum(cfg, axis=-2) 337 | 338 | return sph_harm_inverse_batch(cfg, method, hol) 339 | 340 | 341 | def is_real_sft(h_or_c): 342 | """ Detect if list of lists of harmonics or coefficients assumes real inputs (m>0) """ 343 | if istf(h_or_c): 344 | d = tfnp.shape(h_or_c[1])[0] 345 | else: 346 | d = len(h_or_c[1]) 347 | 348 | isreal = True if d == 2 else False 349 | 350 | return isreal 351 | 352 | 353 | def sph_harm_to_shtools(c): 354 | """ Convert our list format for the sph harm coefficients/harmonics to pyshtools (2, n, n) format. """ 355 | n = len(c) 356 | real = is_real_sft(c) 357 | dim1 = 1 if real else 2 358 | out = np.zeros((dim1, n, n, *c[0][0].shape)) + 0j 359 | for l, cc in enumerate(c): 360 | cc = np.array(cc) 361 | if not real: 362 | m_minus = cc[:l][::-1] 363 | m_plus = cc[l:] 364 | else: 365 | m_minus = np.array([]) 366 | m_plus = cc 367 | 368 | # we get warnings here when using reals 369 | if m_minus.size > 0: 370 | out[1, l, 1:l+1, ...] = m_minus 371 | out[0, l, :l+1, ...] 
= m_plus 372 | 373 | return out 374 | -------------------------------------------------------------------------------- /provider/training_data_prepare.py: -------------------------------------------------------------------------------- 1 | """ 2 | Modified from https://github.com/mentian/object-deformnet 3 | """ 4 | 5 | import os 6 | import sys 7 | import glob 8 | import cv2 9 | import math 10 | import numpy as np 11 | import _pickle as cPickle 12 | from tqdm import tqdm 13 | from autolab_core import RigidTransform 14 | 15 | BASE_DIR = os.path.dirname(os.path.abspath(__file__)) 16 | ROOT_DIR = BASE_DIR 17 | sys.path.append(os.path.join(ROOT_DIR, '..', 'utils')) 18 | from align_utils import align_nocs_to_depth 19 | 20 | sym_id = [0,1,3] 21 | DATA_DIR = 'data' 22 | OBJ_MODEL_DIR = os.path.join(DATA_DIR, 'obj_models') 23 | 24 | 25 | def create_img_list(data_dir): 26 | """ Create train/val/test data list for CAMERA and Real. """ 27 | # CAMERA dataset 28 | for subset in ['train', 'val']: 29 | img_list = [] 30 | img_dir = os.path.join(data_dir, 'camera', subset) 31 | folder_list = [name for name in os.listdir(img_dir) if os.path.isdir(os.path.join(img_dir, name))] 32 | for i in range(10*len(folder_list)): 33 | folder_id = int(i) // 10 34 | img_id = int(i) % 10 35 | img_path = os.path.join(subset, '{:05d}'.format(folder_id), '{:04d}'.format(img_id)) 36 | img_list.append(img_path) 37 | with open(os.path.join(data_dir, 'camera', subset+'_list_all.txt'), 'w') as f: 38 | for img_path in img_list: 39 | f.write("%s\n" % img_path) 40 | # Real dataset 41 | for subset in ['train', 'test']: 42 | img_list = [] 43 | img_dir = os.path.join(data_dir, 'real', subset) 44 | folder_list = [name for name in sorted(os.listdir(img_dir)) if os.path.isdir(os.path.join(img_dir, name))] 45 | for folder in folder_list: 46 | img_paths = glob.glob(os.path.join(img_dir, folder, '*_color.png')) 47 | img_paths = sorted(img_paths) 48 | for img_full_path in img_paths: 49 | img_name = os.path.basename(img_full_path) 50 | img_ind = img_name.split('_')[0] 51 | img_path = os.path.join(subset, folder, img_ind) 52 | img_list.append(img_path) 53 | with open(os.path.join(data_dir, 'real', subset+'_list_all.txt'), 'w') as f: 54 | for img_path in img_list: 55 | f.write("%s\n" % img_path) 56 | print('Write all data paths to file done!') 57 | 58 | def load_depth(img_path): 59 | """ Load depth image from img_path. """ 60 | depth_path = img_path + '_depth.png' 61 | depth = cv2.imread(depth_path, -1) 62 | if len(depth.shape) == 3: 63 | # This is encoded depth image, let's convert 64 | # NOTE: RGB is actually BGR in opencv 65 | depth16 = depth[:, :, 1]*256 + depth[:, :, 2] 66 | depth16 = np.where(depth16==32001, 0, depth16) 67 | depth16 = depth16.astype(np.uint16) 68 | elif len(depth.shape) == 2 and depth.dtype == 'uint16': 69 | depth16 = depth 70 | else: 71 | assert False, '[ Error ]: Unsupported depth type.' 72 | return depth16 73 | 74 | 75 | def process_data(img_path, depth, subset=None): 76 | """ Load instance masks for the objects in the image. 
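    Returns (masks, coords, class_ids, instance_ids, model_list, bboxes, scales)
    for the valid foreground instances, or a tuple of Nones when no valid
    instance is found or a rendering error is detected (see the checks below).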
""" 77 | 78 | mask_path = img_path + '_mask.png' 79 | mask = cv2.imread(mask_path)[:, :, 2] 80 | mask = np.array(mask, dtype=np.int32) 81 | all_inst_ids = sorted(list(np.unique(mask))) 82 | assert all_inst_ids[-1] == 255 83 | del all_inst_ids[-1] # remove background 84 | num_all_inst = len(all_inst_ids) 85 | h, w = mask.shape 86 | 87 | coord_path = img_path + '_coord.png' 88 | coord_map = cv2.imread(coord_path)[:, :, :3] 89 | coord_map = coord_map[:, :, (2, 1, 0)] 90 | # flip z axis of coord map 91 | coord_map = np.array(coord_map, dtype=np.float32) / 255 92 | coord_map[:, :, 2] = 1 - coord_map[:, :, 2] 93 | 94 | class_ids = [] 95 | instance_ids = [] 96 | model_list = [] 97 | masks = np.zeros([h, w, num_all_inst], dtype=np.uint8) 98 | coords = np.zeros((h, w, num_all_inst, 3), dtype=np.float32) 99 | bboxes = np.zeros((num_all_inst, 4), dtype=np.int32) 100 | scales = np.zeros([num_all_inst, 3], dtype=np.float32) 101 | 102 | meta_path = img_path + '_meta.txt' 103 | with open(meta_path, 'r') as f: 104 | i = 0 105 | for line in f: 106 | line_info = line.strip().split(' ') 107 | inst_id = int(line_info[0]) 108 | cls_id = int(line_info[1]) 109 | # background objects and non-existing objects 110 | if cls_id == 0 or (inst_id not in all_inst_ids): 111 | continue 112 | 113 | if len(line_info) == 3: 114 | model_id = line_info[2] # Real scanned objs 115 | if model_id[-3:] == 'npz': 116 | npz_path = os.path.join(OBJ_MODEL_DIR, 'real_val', model_id) 117 | with np.load(npz_path) as npz_file: 118 | scale = npz_file['scale'] 119 | else: 120 | bbox_file = os.path.join(OBJ_MODEL_DIR, 'real_'+subset, model_id+'.txt') 121 | scale = np.loadtxt(bbox_file) 122 | 123 | scales[i, :] = scale/ (np.linalg.norm(scale)+1e-10) 124 | else: 125 | model_id = line_info[3] # CAMERA objs 126 | bbox_file = os.path.join(OBJ_MODEL_DIR, subset, line_info[2], line_info[3], 'bbox.txt') 127 | bbox = np.loadtxt(bbox_file) 128 | scales[i, :] = bbox[0, :] - bbox[1, :] 129 | 130 | # remove one mug instance in CAMERA train due to improper model 131 | if model_id == 'b9be7cfe653740eb7633a2dd89cec754': 132 | continue 133 | # process foreground objects 134 | inst_mask = np.equal(mask, inst_id) 135 | # bounding box 136 | horizontal_indicies = np.where(np.any(inst_mask, axis=0))[0] 137 | vertical_indicies = np.where(np.any(inst_mask, axis=1))[0] 138 | assert horizontal_indicies.shape[0], print(img_path) 139 | x1, x2 = horizontal_indicies[[0, -1]] 140 | y1, y2 = vertical_indicies[[0, -1]] 141 | # x2 and y2 should not be part of the box. Increment by 1. 
142 | x2 += 1 143 | y2 += 1 144 | # object occupies full image, rendering error, happens in CAMERA dataset 145 | if np.any(np.logical_or((x2-x1) > 600, (y2-y1) > 440)): 146 | return None, None, None, None, None, None, None 147 | # not enough valid depth observation 148 | final_mask = np.logical_and(inst_mask, depth > 0) 149 | if np.sum(final_mask) < 64: 150 | continue 151 | class_ids.append(cls_id) 152 | instance_ids.append(inst_id) 153 | model_list.append(model_id) 154 | masks[:, :, i] = inst_mask 155 | coords[:, :, i, :] = np.multiply(coord_map, np.expand_dims(inst_mask, axis=-1)) 156 | bboxes[i] = np.array([y1, x1, y2, x2]) 157 | 158 | i += 1 159 | # no valid foreground objects 160 | if i == 0: 161 | return None, None, None, None, None, None, None 162 | 163 | masks = masks[:, :, :i] 164 | coords = np.clip(coords[:, :, :i, :], 0, 1) 165 | bboxes = bboxes[:i, :] 166 | scales = scales[:i, :] 167 | 168 | return masks, coords, class_ids, instance_ids, model_list, bboxes, scales 169 | 170 | 171 | def annotate_camera_train(data_dir): 172 | """ Generate gt labels for CAMERA train data. """ 173 | camera_train = open(os.path.join(data_dir, 'camera', 'train_list_all.txt')).read().splitlines() 174 | intrinsics = np.array([[577.5, 0, 319.5], [0, 577.5, 239.5], [0, 0, 1]]) 175 | 176 | all_input_dis = [] 177 | all_input_rgb = [] 178 | all_observed_pc = [] 179 | all_translation = [] 180 | all_rotation = [] 181 | all_scale = [] 182 | 183 | part_counter = 1 184 | instance_counter = 0 185 | 186 | for i, img_path in enumerate(tqdm(camera_train)): 187 | img_full_path = os.path.join(data_dir, 'camera', img_path) 188 | all_exist = os.path.exists(img_full_path + '_color.png') and \ 189 | os.path.exists(img_full_path + '_coord.png') and \ 190 | os.path.exists(img_full_path + '_depth.png') and \ 191 | os.path.exists(img_full_path + '_mask.png') and \ 192 | os.path.exists(img_full_path + '_meta.txt') 193 | if not all_exist: 194 | continue 195 | 196 | image = cv2.imread(img_full_path + '_color.png')[:, :, :3] 197 | image = image[:, :, ::-1] 198 | depth = load_depth(img_full_path) 199 | masks, coords, class_ids, instance_ids, model_list, bboxes, sizes = process_data(img_full_path, depth, subset='train') 200 | if instance_ids is None: 201 | continue 202 | # Umeyama alignment of GT NOCS map with depth image 203 | pts, input_dis, input_rgb, scales, rotations, translations, error_messages, _ = \ 204 | align_nocs_to_depth(masks, coords, depth, image, intrinsics, instance_ids, img_path) 205 | if error_messages: 206 | continue 207 | # write results 208 | quaternions = np.zeros((rotations.shape[0], 4)) 209 | for k in range(pts.shape[0]): 210 | label = class_ids[k] - 1 211 | r = rotations[k] 212 | if label in sym_id: 213 | theta_x = r[0, 0] + r[2, 2] 214 | theta_y = r[0, 2] - r[2, 0] 215 | r_norm = math.sqrt(theta_x**2 + theta_y**2) 216 | s_map = np.array([[theta_x/r_norm, 0.0, -theta_y/r_norm], 217 | [0.0, 1.0, 0.0 ], 218 | [theta_y/r_norm, 0.0, theta_x/r_norm]]) 219 | r = s_map@r 220 | quaternions[k] = RigidTransform(rotation=r).quaternion 221 | 222 | all_input_dis.append(input_dis) 223 | all_input_rgb.append(input_rgb) 224 | all_observed_pc.append(pts) 225 | all_translation.append(translations) 226 | all_rotation.append(quaternions) 227 | all_scale.append(scales[:, np.newaxis] * sizes) 228 | 229 | instance_counter += pts.shape[0] 230 | if instance_counter >=80000 or i==len(camera_train)-1: 231 | dataset = {} 232 | dataset['input_dis'] = np.concatenate(all_input_dis, axis=0).astype(np.float32) 233 | dataset['input_rgb'] = 
np.concatenate(all_input_rgb, axis=0).astype(np.float32) 234 | dataset['observed_pc'] = np.concatenate(all_observed_pc, axis=0).astype(np.float32) 235 | dataset['translation'] = np.concatenate(all_translation, axis=0).astype(np.float32) 236 | dataset['rotation'] = np.concatenate(all_rotation, axis=0).astype(np.float32) 237 | dataset['scale'] = np.concatenate(all_scale, axis=0).astype(np.float32) 238 | 239 | with open(os.path.join(ROOT_DIR, '..', 'data', 'training_instance', 'CAMERA25_'+str(part_counter)+'.pkl'), 'wb') as f: 240 | cPickle.dump(dataset, f) 241 | 242 | all_input_dis = [] 243 | all_input_rgb = [] 244 | all_observed_pc = [] 245 | all_translation = [] 246 | all_rotation = [] 247 | all_scale = [] 248 | 249 | part_counter += 1 250 | instance_counter = 0 251 | 252 | 253 | def annotate_real_train(data_dir): 254 | """ Generate gt labels for Real train data through PnP. """ 255 | real_train = open(os.path.join(data_dir, 'real/train_list_all.txt')).read().splitlines() 256 | intrinsics = np.array([[591.0125, 0, 322.525], [0, 590.16775, 244.11084], [0, 0, 1]]) 257 | 258 | all_input_dis = [] 259 | all_input_rgb = [] 260 | all_observed_pc = [] 261 | all_translation = [] 262 | all_rotation = [] 263 | all_scale = [] 264 | 265 | for img_path in tqdm(real_train): 266 | img_full_path = os.path.join(data_dir, 'real', img_path) 267 | all_exist = os.path.exists(img_full_path + '_color.png') and \ 268 | os.path.exists(img_full_path + '_coord.png') and \ 269 | os.path.exists(img_full_path + '_depth.png') and \ 270 | os.path.exists(img_full_path + '_mask.png') and \ 271 | os.path.exists(img_full_path + '_meta.txt') 272 | if not all_exist: 273 | continue 274 | image = cv2.imread(img_full_path + '_color.png')[:, :, :3] 275 | image = image[:, :, ::-1] 276 | depth = load_depth(img_full_path) 277 | masks, coords, class_ids, instance_ids, model_list, bboxes, sizes = process_data(img_full_path, depth, 'train') 278 | if instance_ids is None: 279 | continue 280 | # Umeyama alignment of GT NOCS map with depth image 281 | pts, input_dis, input_rgb, scales, rotations, translations, error_messages, _ = \ 282 | align_nocs_to_depth(masks, coords, depth, image, intrinsics, instance_ids, img_path) 283 | if error_messages: 284 | continue 285 | # write results 286 | quaternions = np.zeros((rotations.shape[0], 4)) 287 | for k in range(pts.shape[0]): 288 | label = class_ids[k] - 1 289 | r = rotations[k] 290 | if label in sym_id: 291 | theta_x = r[0, 0] + r[2, 2] 292 | theta_y = r[0, 2] - r[2, 0] 293 | r_norm = math.sqrt(theta_x**2 + theta_y**2) 294 | s_map = np.array([[theta_x/r_norm, 0.0, -theta_y/r_norm], 295 | [0.0, 1.0, 0.0 ], 296 | [theta_y/r_norm, 0.0, theta_x/r_norm]]) 297 | r = s_map@r 298 | quaternions[k] = RigidTransform(rotation=r).quaternion 299 | 300 | all_input_dis.append(input_dis) 301 | all_input_rgb.append(input_rgb) 302 | all_observed_pc.append(pts) 303 | all_translation.append(translations) 304 | all_rotation.append(quaternions) 305 | all_scale.append(scales[:, np.newaxis] * sizes) 306 | 307 | dataset = {} 308 | dataset['input_dis'] = np.concatenate(all_input_dis, axis=0).astype(np.float32) 309 | dataset['input_rgb'] = np.concatenate(all_input_rgb, axis=0).astype(np.float32) 310 | dataset['observed_pc'] = np.concatenate(all_observed_pc, axis=0).astype(np.float32) 311 | dataset['translation'] = np.concatenate(all_translation, axis=0).astype(np.float32) 312 | dataset['rotation'] = np.concatenate(all_rotation, axis=0).astype(np.float32) 313 | dataset['scale'] = np.concatenate(all_scale, 
axis=0).astype(np.float32) 314 | 315 | with open(os.path.join(ROOT_DIR, '..', 'data', 'training_instance', 'REAL275.pkl'), 'wb') as f: 316 | cPickle.dump(dataset, f) 317 | 318 | 319 | if __name__ == '__main__': 320 | 321 | if not os.path.exists(os.path.join(ROOT_DIR, '..', 'data', 'training_instance')): 322 | os.makedirs(os.path.join(ROOT_DIR, '..', 'data', 'training_instance')) 323 | 324 | # create list for all data 325 | create_img_list(DATA_DIR) 326 | # annotate dataset and re-write valid data to list 327 | annotate_camera_train(DATA_DIR) 328 | annotate_real_train(DATA_DIR) 329 | -------------------------------------------------------------------------------- /model/dualposenet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Gorilla Lab, SCUT. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | """ 7 | Dual Pose Network with Refined Learning of Pose Consistency. 8 | 9 | Author: Jiehong Lin 10 | """ 11 | 12 | import os 13 | import tensorflow as tf 14 | import numpy as np 15 | import math 16 | import cmath 17 | import glob 18 | import _pickle as cPickle 19 | from tqdm import tqdm 20 | import cv2 21 | from transforms3d.euler import quat2mat 22 | 23 | from layers import point_transformation 24 | from modules import encoder, explicit_decoder, implicit_decoder 25 | from dataloder import Fetcher 26 | from pc_utils import load_depth, backproject, pc2sphericalmap 27 | 28 | 29 | class DualPoseNet(object): 30 | def __init__(self, opts, sess): 31 | self.sess = sess 32 | self.opts = opts 33 | 34 | if self.opts.dataset == 'REAL275': 35 | self.intrinsics = np.array( 36 | [[591.0125, 0, 322.525], [0, 590.16775, 244.11084], [0, 0, 1]]) 37 | elif self.opts.dataset == 'CAMERA25': 38 | self.intrinsics = np.array( 39 | [[577.5, 0, 319.5], [0, 577.5, 239.5], [0, 0, 1]]) 40 | 41 | def allocate_placeholders(self): 42 | self.is_training = tf.placeholder_with_default( 43 | True, shape=[], name='is_training') 44 | self.global_step = tf.Variable(0, trainable=False, name='global_step') 45 | 46 | self.input_dis = tf.placeholder(tf.float32, shape=[ 47 | self.opts.batch_size, self.opts.input_res, self.opts.input_res, 1]) 48 | self.input_rgb = tf.placeholder(tf.float32, shape=[ 49 | self.opts.batch_size, self.opts.input_res, self.opts.input_res, 3]) 50 | self.observed_pc = tf.placeholder( 51 | tf.float32, shape=[self.opts.batch_size, 1024, 3]) 52 | 53 | self.gt_rotation = tf.placeholder( 54 | tf.float32, shape=[self.opts.batch_size, 4]) 55 | self.gt_translation = tf.placeholder( 56 | tf.float32, shape=[self.opts.batch_size, 3]) 57 | self.gt_scale = tf.placeholder( 58 | tf.float32, shape=[self.opts.batch_size, 3]) 59 | 60 | def build_model(self): 61 | # model 62 | self.encoder = encoder(self.opts, self.is_training, name='encoder') 63 | self.explicit_decoder = explicit_decoder( 64 | self.opts, self.is_training, name='explicit_decoder') 65 | self.implicit_decoder = implicit_decoder( 66 | self.opts, self.is_training, name="implicit_decoder") 67 | 68 | # graphs 69 | self.pose_feat = self.encoder(self.input_dis, self.input_rgb) 70 | self.pred_translation, self.pred_rotation, self.pred_scale = self.explicit_decoder( 71 | self.pose_feat) 72 | self.pred_canonical_points = self.implicit_decoder( 73 | self.observed_pc, self.pose_feat) 74 | self.gt_canonical_points = point_transformation( 75 | self.observed_pc, self.gt_rotation, self.gt_translation, self.gt_scale) 76 | 77 | # loss 
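        # Each explicit pose component (translation, rotation quaternion, scale)
        # is supervised with a Huber loss below; the implicit canonical-point
        # loss is added with weight opts.implicit_loss_weight in the total loss.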
78 | self.translation_loss = tf.losses.huber_loss( 79 | self.pred_translation, self.gt_translation) 80 | self.rotation_loss = tf.losses.huber_loss( 81 | self.pred_rotation, self.gt_rotation) 82 | self.scale_loss = tf.losses.huber_loss(self.pred_scale, self.gt_scale) 83 | self.implicit_loss = tf.losses.huber_loss( 84 | self.pred_canonical_points, self.gt_canonical_points) 85 | self.loss = self.rotation_loss + self.translation_loss + \ 86 | self.scale_loss + self.opts.implicit_loss_weight*self.implicit_loss 87 | 88 | def setup_optimizer(self): 89 | self.learning_rate = tf.train.exponential_decay(self.opts.learning_rate, self.global_step, 90 | self.opts.lr_decay_steps, self.opts.lr_decay_rate, staircase=True) 91 | self.learning_rate = tf.maximum(self.learning_rate, 0.000001) 92 | 93 | all_update_ops = [op for op in tf.get_collection(tf.GraphKeys.UPDATE_OPS) if op.name.startswith( 94 | "encoder") or op.name.startswith("explicit_decoder") or op.name.startswith("implicit_decoder")] 95 | all_tvars = [var for var in tf.trainable_variables() if var.name.startswith( 96 | "encoder") or var.name.startswith("explicit_decoder") or var.name.startswith("implicit_decoder")] 97 | with tf.control_dependencies(all_update_ops): 98 | self.all_optimizers = tf.train.AdamOptimizer(self.learning_rate).minimize( 99 | self.loss, var_list=all_tvars, colocate_gradients_with_ops=True, global_step=self.global_step) 100 | 101 | def train(self): 102 | print('\n*********** Training of DualPoseNet ***********') 103 | 104 | # model & graph 105 | print('building model ...') 106 | self.allocate_placeholders() 107 | self.build_model() 108 | self.setup_optimizer() 109 | print('model built !') 110 | 111 | # dataset 112 | print('loading data ...') 113 | fetchworker = Fetcher(self.opts) 114 | fetchworker.start() 115 | print('data loaded !') 116 | 117 | print('starting training ...') 118 | self.sess.run(tf.global_variables_initializer()) 119 | self.saver = tf.train.Saver(max_to_keep=None) 120 | step = self.sess.run(self.global_step) 121 | 122 | for epoch in range(1, self.opts.training_epoch+1): 123 | sum_loss = 0.0 124 | sum_r_loss = 0.0 125 | sum_t_loss = 0.0 126 | sum_s_loss = 0.0 127 | sum_i_loss = 0.0 128 | count = 0 129 | 130 | for batch_idx in range(fetchworker.num_batches): 131 | 132 | batch_input_dis, batch_input_rgb, batch_observed_pc, batch_rotation, batch_translation, batch_scale = fetchworker.fetch() 133 | curr_bs = batch_input_dis.shape[0] 134 | 135 | feed_dict = {self.input_dis: batch_input_dis, 136 | self.input_rgb: batch_input_rgb, 137 | self.observed_pc: batch_observed_pc, 138 | self.gt_rotation: batch_rotation, 139 | self.gt_translation: batch_translation, 140 | self.gt_scale: batch_scale, 141 | self.is_training: True} 142 | 143 | _, loss, r_loss, t_loss, s_loss, i_loss = self.sess.run( 144 | [self.all_optimizers, self.loss, self.rotation_loss, self.translation_loss, self.scale_loss, self.implicit_loss], feed_dict=feed_dict) 145 | 146 | sum_loss += (loss*curr_bs) 147 | sum_r_loss += (r_loss*curr_bs) 148 | sum_t_loss += (t_loss*curr_bs) 149 | sum_s_loss += (s_loss*curr_bs) 150 | sum_i_loss += (i_loss*curr_bs) 151 | count += curr_bs 152 | 153 | print('[{}][{}/{}] loss: {:.5f}({:.5f}) r_loss: {:.5f}({:.5f}) t_loss: {:.5f}({:.5f}) s_loss: {:.5f}({:.5f}) i_loss: {:.5f}({:.5f})'.format( 154 | epoch, batch_idx, fetchworker.num_batches, sum_loss / 155 | float(count), loss, sum_r_loss / 156 | float(count), r_loss, sum_t_loss/float(count), 157 | t_loss, sum_s_loss/float(count), s_loss, sum_i_loss/float(count), i_loss)) 158 | 
step += 1 159 | 160 | if epoch % 10 == 0: 161 | self.saver.save(self.sess, os.path.join( 162 | self.opts.log_dir, 'model'), epoch) 163 | 164 | fetchworker.shutdown() 165 | print('training finished !') 166 | 167 | def test(self): 168 | 169 | print('\n*********** Testing on {} ***********'.format(self.opts.dataset)) 170 | 171 | # inputs 172 | self.input_dis = tf.placeholder( 173 | tf.float32, shape=[1, self.opts.input_res, self.opts.input_res, 1]) 174 | self.input_rgb = tf.placeholder( 175 | tf.float32, shape=[1, self.opts.input_res, self.opts.input_res, 3]) 176 | self.observed_pc = tf.placeholder(tf.float32, shape=[1, 1024, 3]) 177 | self.is_training = tf.placeholder_with_default( 178 | False, shape=[], name='is_training') 179 | 180 | # modules 181 | self.encoder = encoder(self.opts, self.is_training, name='encoder') 182 | self.explicit_decoder = explicit_decoder( 183 | self.opts, self.is_training, name='explicit_decoder') 184 | self.implicit_decoder = implicit_decoder( 185 | self.opts, self.is_training, name="implicit_decoder") 186 | 187 | # graphs 188 | self.pose_feat = self.encoder(self.input_dis, self.input_rgb) 189 | self.pred_translation, self.pred_rotation, self.pred_scale = self.explicit_decoder( 190 | self.pose_feat) 191 | self.pred_canonical_points = self.implicit_decoder( 192 | self.observed_pc, self.pose_feat) 193 | 194 | # checkpoints 195 | print("loading from checkpoint ...") 196 | saver = tf.train.Saver() 197 | checkpoint_path = os.path.join( 198 | self.opts.log_dir, 'model-'+str(self.opts.test_epoch)) 199 | saver.restore(self.sess, checkpoint_path) 200 | 201 | # test 202 | result_pkl_list = glob.glob( 203 | 'data/segmentation_results/{}/results_*.pkl'.format(self.opts.dataset)) 204 | result_pkl_list = sorted(result_pkl_list) 205 | n_image = len(result_pkl_list) 206 | print('no. 
of test images: {}\n'.format(n_image)) 207 | 208 | for i, path in tqdm(enumerate(result_pkl_list)): 209 | with open(path, 'rb') as f: 210 | data = cPickle.load(f) 211 | image_path = data['image_path'] 212 | num_instance = len(data['pred_class_ids']) 213 | 214 | image = cv2.imread(image_path + '_color.png')[:, :, :3] 215 | image = image[:, :, ::-1] 216 | depth = load_depth(image_path) 217 | pred_mask = data['pred_masks'] 218 | 219 | pred_RTs = np.zeros((num_instance, 4, 4)) 220 | pred_scales = np.zeros((num_instance, 3)) 221 | pred_RTs[:, 3, 3] = 1 222 | 223 | if num_instance != 0: 224 | for j in range(num_instance): 225 | inst_mask = 255 * pred_mask[:, :, j].astype('uint8') 226 | pts_ori, idx = backproject( 227 | depth, self.intrinsics, inst_mask) 228 | pts_ori = pts_ori/1000.0 229 | rgb_ori = image[idx[0], idx[1], :] 230 | rgb_ori = ( 231 | rgb_ori - np.array([123.7, 116.8, 103.9])[np.newaxis, :])/255.0 232 | 233 | FLAG = pts_ori.shape[0] > 32 234 | pts = pts_ori.copy() 235 | rgb = rgb_ori.copy() 236 | 237 | for k in range(3): 238 | if FLAG: 239 | centroid = np.mean(pts, axis=0) 240 | pts = pts - centroid[np.newaxis, :] 241 | 242 | input_dis, input_rgb = pc2sphericalmap( 243 | pts, rgb, resolution=self.opts.input_res) 244 | if pts.shape[0] > 1024: 245 | input_pts = pts[np.random.choice( 246 | pts.shape[0], 1024, replace=False), :] 247 | else: 248 | input_pts = pts[np.random.choice( 249 | pts.shape[0], 1024), :] 250 | feed_dict = {self.input_dis: input_dis, self.input_rgb: input_rgb, 251 | self.observed_pc: input_pts[np.newaxis, :, :], self.is_training: False} 252 | pred_rotation, pred_translation, pred_size = self.sess.run( 253 | [self.pred_rotation, self.pred_translation, self.pred_scale], feed_dict=feed_dict) 254 | 255 | pred_rotation = quat2mat(pred_rotation[0]) 256 | pred_translation = pred_translation[0] + centroid 257 | pred_scale = np.linalg.norm(pred_size[0]) 258 | pred_size = pred_size[0]/pred_scale 259 | 260 | pred_canonical_pts = ( 261 | (pts_ori - pred_translation[np.newaxis, :])/pred_scale) @ np.transpose(pred_rotation) 262 | dis = np.linalg.norm(pred_canonical_pts, axis=1) 263 | 264 | FLAG = np.sum(dis < 0.6) > 32 265 | pts = pts_ori[dis < 0.6].copy() 266 | rgb = rgb_ori[dis < 0.6].copy() 267 | else: 268 | break 269 | 270 | if pts_ori.shape[0] > 32: 271 | pred_RTs[j, :3, :3] = np.diag( 272 | np.ones((3))*pred_scale) @ pred_rotation.transpose() 273 | pred_RTs[j, :3, 3] = pred_translation 274 | pred_scales[j] = pred_size 275 | 276 | z_180_RT = np.zeros((4, 4), dtype=np.float32) 277 | z_180_RT[:3, :3] = np.diag([-1, -1, 1]) 278 | z_180_RT[3, 3] = 1 279 | pred_RTs[j, :, :] = z_180_RT @ pred_RTs[j, :, :] 280 | else: 281 | pred_RTs[j] = np.eye(4) 282 | pred_scales[j] = np.ones((3)) 283 | 284 | data.pop('pred_masks') 285 | data['pred_RTs'] = pred_RTs 286 | data['pred_scales'] = pred_scales 287 | 288 | with open(os.path.join(self.opts.test_log_dir, path.split('/')[-1]), 'wb') as f: 289 | cPickle.dump(data, f) 290 | 291 | def test_refine_encoder(self): 292 | 293 | print( 294 | '\n*********** Testing & Refining on {} ***********'.format(self.opts.dataset)) 295 | 296 | # inputs 297 | self.input_dis = tf.placeholder( 298 | tf.float32, shape=[1, self.opts.input_res, self.opts.input_res, 1]) 299 | self.input_rgb = tf.placeholder( 300 | tf.float32, shape=[1, self.opts.input_res, self.opts.input_res, 3]) 301 | self.observed_pc = tf.placeholder(tf.float32, shape=[1, 1024, 3]) 302 | self.is_training = tf.placeholder_with_default( 303 | False, shape=[], name='is_training') 304 | 
self.global_step = tf.Variable(0, trainable=False, name='global_step') 305 | 306 | # modules 307 | self.encoder = encoder(self.opts, self.is_training, name='encoder') 308 | self.explicit_decoder = explicit_decoder( 309 | self.opts, self.is_training, name='explicit_decoder') 310 | self.implicit_decoder = implicit_decoder( 311 | self.opts, self.is_training, name="implicit_decoder") 312 | 313 | # graphs 314 | self.pose_feat = self.encoder(self.input_dis, self.input_rgb) 315 | self.pred_translation, self.pred_rotation, self.pred_scale = self.explicit_decoder( 316 | self.pose_feat) 317 | self.ex_canonical_points = point_transformation( 318 | self.observed_pc, self.pred_rotation, self.pred_translation, self.pred_scale) 319 | self.im_canonical_points = self.implicit_decoder( 320 | self.observed_pc, self.pose_feat) 321 | 322 | # self-adaptive loss 323 | self.loss = tf.losses.huber_loss( 324 | self.ex_canonical_points, self.im_canonical_points) 325 | 326 | # optimizer 327 | self.learning_rate = tf.train.exponential_decay( 328 | self.opts.refine_learning_rate, self.global_step, 100000000, 0, staircase=True) 329 | all_update_ops = [op for op in tf.get_collection( 330 | tf.GraphKeys.UPDATE_OPS) if op.name.startswith("encoder")] 331 | all_tvars = [var for var in tf.trainable_variables() 332 | if var.name.startswith("encoder")] 333 | with tf.control_dependencies(all_update_ops): 334 | self.all_optimizers = tf.train.AdamOptimizer(self.learning_rate).minimize( 335 | self.loss, var_list=all_tvars, colocate_gradients_with_ops=True, global_step=self.global_step) 336 | 337 | # checkpoints 338 | print("loading from checkpoint ...") 339 | self.sess.run(tf.global_variables_initializer()) 340 | checkpoint_path = os.path.join( 341 | self.opts.log_dir, 'model-'+str(self.opts.test_epoch)) 342 | var_to_restore = [ 343 | val for val in tf.global_variables() if ('/Adam' not in val.name) and ('encoder' in val.name or 'explicit_decoder' in val.name or 'implicit_decoder' in val.name)] 344 | saver = tf.train.Saver(var_to_restore) 345 | saver.restore(self.sess, checkpoint_path) 346 | step = self.sess.run(self.global_step) 347 | 348 | # test 349 | result_pkl_list = glob.glob( 350 | 'data/segmentation_results/{}/results_*.pkl'.format(self.opts.dataset)) 351 | result_pkl_list = sorted(result_pkl_list) 352 | n_image = len(result_pkl_list) 353 | print('no. 
of test images: {}\n'.format(n_image)) 354 | 355 | for i, path in tqdm(enumerate(result_pkl_list)): 356 | with open(path, 'rb') as f: 357 | data = cPickle.load(f) 358 | image_path = data['image_path'] 359 | num_instance = len(data['pred_class_ids']) 360 | 361 | image = cv2.imread(image_path + '_color.png')[:, :, :3] 362 | image = image[:, :, ::-1] 363 | depth = load_depth(image_path) 364 | pred_mask = data['pred_masks'] 365 | 366 | pred_RTs = np.zeros((num_instance, 4, 4)) 367 | pred_scales = np.zeros((num_instance, 3)) 368 | pred_RTs[:, 3, 3] = 1 369 | 370 | if num_instance != 0: 371 | for j in range(num_instance): 372 | saver.restore(self.sess, checkpoint_path) # re-load model 373 | 374 | inst_mask = 255 * pred_mask[:, :, j].astype('uint8') 375 | pts_ori, idx = backproject( 376 | depth, self.intrinsics, inst_mask) 377 | pts_ori = pts_ori/1000.0 378 | rgb_ori = image[idx[0], idx[1], :] 379 | rgb_ori = ( 380 | rgb_ori - np.array([123.7, 116.8, 103.9])[np.newaxis, :])/255.0 381 | 382 | # test 383 | dis = np.zeros((pts_ori.shape[0])) 384 | for k in range(3): 385 | pts = pts_ori[dis < 0.6].copy() 386 | rgb = rgb_ori[dis < 0.6].copy() 387 | 388 | if pts.shape[0] > 32: 389 | centroid = np.mean(pts, axis=0) 390 | pts = pts - centroid[np.newaxis, :] 391 | 392 | input_dis, input_rgb = pc2sphericalmap( 393 | pts, rgb, resolution=self.opts.input_res) 394 | if pts.shape[0] > 1024: 395 | input_pts = pts[np.random.choice( 396 | pts.shape[0], 1024, replace=False), :] 397 | else: 398 | input_pts = pts[np.random.choice( 399 | pts.shape[0], 1024), :] 400 | feed_dict = {self.input_dis: input_dis, self.input_rgb: input_rgb, 401 | self.observed_pc: input_pts[np.newaxis, :, :], self.is_training: False} 402 | iter_rotation, iter_translation, iter_size, iter_loss = self.sess.run( 403 | [self.pred_rotation, self.pred_translation, self.pred_scale, self.loss], feed_dict=feed_dict) 404 | 405 | pred_rotation = quat2mat(iter_rotation[0]) 406 | pred_translation = iter_translation[0] + centroid 407 | pred_scale = np.linalg.norm(iter_size[0]) 408 | pred_size = iter_size[0]/pred_scale 409 | pred_loss = iter_loss 410 | 411 | pred_canonical_pts = ( 412 | (pts_ori - pred_translation[np.newaxis, :])/pred_scale) @ np.transpose(pred_rotation) 413 | dis = np.linalg.norm(pred_canonical_pts, axis=1) 414 | 415 | else: 416 | break 417 | 418 | # refinement 419 | if pts.shape[0] > 32: 420 | for k in range(self.opts.refine_iteration): 421 | _, iter_rotation, iter_translation, iter_size, iter_loss = self.sess.run( 422 | [self.all_optimizers, self.pred_rotation, self.pred_translation, self.pred_scale, self.loss], feed_dict=feed_dict) 423 | if iter_loss < pred_loss: 424 | pred_rotation = quat2mat(iter_rotation[0]) 425 | pred_translation = iter_translation[0] + centroid 426 | pred_scale = np.linalg.norm(iter_size[0]) 427 | pred_size = iter_size[0]/pred_scale 428 | pred_loss = iter_loss 429 | if pred_loss <= self.opts.refine_threshold: 430 | break 431 | 432 | if pts_ori.shape[0] > 32: 433 | pred_RTs[j, :3, :3] = np.diag( 434 | np.ones((3))*pred_scale) @ pred_rotation.transpose() 435 | pred_RTs[j, :3, 3] = pred_translation 436 | pred_scales[j] = pred_size 437 | 438 | z_180_RT = np.zeros((4, 4), dtype=np.float32) 439 | z_180_RT[:3, :3] = np.diag([-1, -1, 1]) 440 | z_180_RT[3, 3] = 1 441 | pred_RTs[j, :, :] = z_180_RT @ pred_RTs[j, :, :] 442 | else: 443 | pred_RTs[j] = np.eye(4) 444 | pred_scales[j] = np.ones((3)) 445 | 446 | data.pop('pred_masks') 447 | data['pred_RTs'] = pred_RTs 448 | data['pred_scales'] = pred_scales 449 | 450 | with 
open(os.path.join(self.opts.test_log_dir, path.split('/')[-1]), 'wb') as f: 451 | cPickle.dump(data, f) 452 | 453 | def test_refine_feature(self): 454 | print( 455 | '\n*********** Testing & Refining on {} ***********'.format(self.opts.dataset)) 456 | 457 | # inputs 458 | self.input_dis = tf.placeholder( 459 | tf.float32, shape=[1, self.opts.input_res, self.opts.input_res, 1]) 460 | self.input_rgb = tf.placeholder( 461 | tf.float32, shape=[1, self.opts.input_res, self.opts.input_res, 3]) 462 | self.observed_pc = tf.placeholder(tf.float32, shape=[1, 1024, 3]) 463 | self.new_pose_feat = tf.Variable(tf.zeros([1,1024]), trainable=True, name='new_pose_feat') 464 | self.is_training = tf.placeholder_with_default( 465 | False, shape=[], name='is_training') 466 | self.global_step = tf.Variable(0, trainable=False, name='global_step') 467 | 468 | # modules 469 | self.encoder = encoder(self.opts, self.is_training, name='encoder') 470 | self.explicit_decoder = explicit_decoder( 471 | self.opts, self.is_training, name='explicit_decoder') 472 | self.implicit_decoder = implicit_decoder( 473 | self.opts, self.is_training, name="implicit_decoder") 474 | 475 | # graphs 476 | self.pose_feat = self.encoder(self.input_dis, self.input_rgb) 477 | self.pred_translation0, self.pred_rotation0, self.pred_scale0 = self.explicit_decoder( 478 | self.pose_feat) 479 | self.ex_canonical_points0 = point_transformation( 480 | self.observed_pc, self.pred_rotation0, self.pred_translation0, self.pred_scale0) 481 | self.im_canonical_points0 = self.implicit_decoder( 482 | self.observed_pc, self.pose_feat) 483 | 484 | # refine_graphs 485 | self.pred_translation, self.pred_rotation, self.pred_scale = self.explicit_decoder( 486 | self.new_pose_feat) 487 | self.ex_canonical_points = point_transformation( 488 | self.observed_pc, self.pred_rotation, self.pred_translation, self.pred_scale) 489 | self.im_canonical_points = self.implicit_decoder( 490 | self.observed_pc, self.new_pose_feat) 491 | 492 | # self.adaptive loss 493 | self.loss = tf.losses.huber_loss( 494 | self.ex_canonical_points, self.im_canonical_points) 495 | 496 | # optimizer 497 | self.learning_rate = tf.train.exponential_decay( 498 | self.opts.refine_learning_rate, self.global_step, 100000000, 0, staircase=True) 499 | self.all_optimizers = tf.train.AdamOptimizer(self.opts.refine_f_learning_rate).minimize( 500 | self.loss, var_list=[self.new_pose_feat], colocate_gradients_with_ops=True, global_step=self.global_step) 501 | 502 | # checkpoints 503 | print("loading from checkpoint ...") 504 | self.sess.run(tf.global_variables_initializer()) 505 | checkpoint_path = os.path.join( 506 | self.opts.log_dir, 'model-'+str(self.opts.test_epoch)) 507 | var_to_restore = [ 508 | val for val in tf.global_variables() if ('/Adam' not in val.name) and ('encoder' in val.name or 'explicit_decoder' in val.name or 'implicit_decoder' in val.name)] 509 | saver = tf.train.Saver(var_to_restore) 510 | saver.restore(self.sess, checkpoint_path) 511 | step = self.sess.run(self.global_step) 512 | 513 | # test 514 | result_pkl_list = glob.glob( 515 | 'data/segmentation_results/{}/results_*.pkl'.format(self.opts.dataset)) 516 | result_pkl_list = sorted(result_pkl_list) 517 | n_image = len(result_pkl_list) 518 | print('no. 
of test images: {}\n'.format(n_image)) 519 | 520 | for i, path in tqdm(enumerate(result_pkl_list)): 521 | with open(path, 'rb') as f: 522 | data = cPickle.load(f) 523 | image_path = data['image_path'] 524 | num_instance = len(data['pred_class_ids']) 525 | 526 | image = cv2.imread(image_path + '_color.png')[:, :, :3] 527 | image = image[:, :, ::-1] 528 | depth = load_depth(image_path) 529 | pred_mask = data['pred_masks'] 530 | 531 | pred_RTs = np.zeros((num_instance, 4, 4)) 532 | pred_scales = np.zeros((num_instance, 3)) 533 | pred_RTs[:, 3, 3] = 1 534 | 535 | if num_instance != 0: 536 | for j in range(num_instance): 537 | inst_mask = 255 * pred_mask[:, :, j].astype('uint8') 538 | pts_ori, idx = backproject( 539 | depth, self.intrinsics, inst_mask) 540 | pts_ori = pts_ori/1000.0 541 | rgb_ori = image[idx[0], idx[1], :] 542 | rgb_ori = ( 543 | rgb_ori - np.array([123.7, 116.8, 103.9])[np.newaxis, :])/255.0 544 | 545 | # test 546 | dis = np.zeros((pts_ori.shape[0])) 547 | for k in range(3): 548 | pts = pts_ori[dis < 0.6].copy() 549 | rgb = rgb_ori[dis < 0.6].copy() 550 | 551 | if pts.shape[0] > 32: 552 | centroid = np.mean(pts, axis=0) 553 | pts = pts - centroid[np.newaxis, :] 554 | 555 | input_dis, input_rgb = pc2sphericalmap( 556 | pts, rgb, resolution=self.opts.input_res) 557 | if pts.shape[0] > 1024: 558 | input_pts = pts[np.random.choice( 559 | pts.shape[0], 1024, replace=False), :] 560 | else: 561 | input_pts = pts[np.random.choice( 562 | pts.shape[0], 1024), :] 563 | feed_dict = {self.input_dis: input_dis, self.input_rgb: input_rgb, 564 | self.observed_pc: input_pts[np.newaxis, :, :], self.is_training: False} 565 | iter_rotation, iter_translation, iter_size = self.sess.run( 566 | [self.pred_rotation0, self.pred_translation0, self.pred_scale0], feed_dict=feed_dict) 567 | 568 | pred_rotation = quat2mat(iter_rotation[0]) 569 | pred_translation = iter_translation[0] + centroid 570 | pred_scale = np.linalg.norm(iter_size[0]) 571 | pred_size = iter_size[0]/pred_scale 572 | 573 | pred_canonical_pts = ( 574 | (pts_ori - pred_translation[np.newaxis, :])/pred_scale) @ np.transpose(pred_rotation) 575 | dis = np.linalg.norm(pred_canonical_pts, axis=1) 576 | 577 | else: 578 | break 579 | 580 | # refinement 581 | if pts.shape[0] > 32: 582 | pose_feat = self.sess.run([self.pose_feat], feed_dict=feed_dict) 583 | update = tf.assign(self.new_pose_feat, pose_feat[0]) 584 | self.sess.run(update) 585 | pred_loss = np.inf 586 | 587 | for k in range(self.opts.refine_f_iteration): 588 | _, iter_rotation, iter_translation, iter_size, iter_loss = self.sess.run( 589 | [self.all_optimizers, self.pred_rotation, self.pred_translation, self.pred_scale, self.loss], feed_dict=feed_dict) 590 | if iter_loss < pred_loss: 591 | pred_rotation = quat2mat(iter_rotation[0]) 592 | pred_translation = iter_translation[0] + centroid 593 | pred_scale = np.linalg.norm(iter_size[0]) 594 | pred_size = iter_size[0]/pred_scale 595 | pred_loss = iter_loss 596 | if pred_loss <= self.opts.refine_f_threshold: 597 | break 598 | 599 | if pts_ori.shape[0] > 32: 600 | pred_RTs[j, :3, :3] = np.diag( 601 | np.ones((3))*pred_scale) @ pred_rotation.transpose() 602 | pred_RTs[j, :3, 3] = pred_translation 603 | pred_scales[j] = pred_size 604 | 605 | z_180_RT = np.zeros((4, 4), dtype=np.float32) 606 | z_180_RT[:3, :3] = np.diag([-1, -1, 1]) 607 | z_180_RT[3, 3] = 1 608 | pred_RTs[j, :, :] = z_180_RT @ pred_RTs[j, :, :] 609 | else: 610 | pred_RTs[j] = np.eye(4) 611 | pred_scales[j] = np.ones((3)) 612 | 613 | data.pop('pred_masks') 614 | 
data['pred_RTs'] = pred_RTs 615 | data['pred_scales'] = pred_scales 616 | 617 | with open(os.path.join(self.opts.test_log_dir, path.split('/')[-1]), 'wb') as f: 618 | cPickle.dump(data, f) 619 | -------------------------------------------------------------------------------- /utils/evaluation_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Gorilla Lab, SCUT. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | """ Modified based on https://github.com/hughw19/NOCS_CVPR2019.""" 7 | 8 | 9 | import os 10 | import sys 11 | import numpy as np 12 | import glob 13 | import math 14 | import _pickle as cPickle 15 | from tqdm import tqdm 16 | 17 | 18 | def trim_zeros(x): 19 | """It's common to have tensors larger than the available data and 20 | pad with zeros. This function removes rows that are all zeros. 21 | x: [rows, columns]. 22 | """ 23 | 24 | pre_shape = x.shape 25 | assert len(x.shape) == 2, x.shape 26 | new_x = x[~np.all(x == 0, axis=1)] 27 | post_shape = new_x.shape 28 | assert pre_shape[0] == post_shape[0] 29 | assert pre_shape[1] == post_shape[1] 30 | 31 | return new_x 32 | 33 | 34 | def get_3d_bbox(scale, shift=0): 35 | """ 36 | Input: 37 | scale: [3] or scalar 38 | shift: [3] or scalar 39 | Return 40 | bbox_3d: [3, N] 41 | 42 | """ 43 | if hasattr(scale, "__iter__"): 44 | bbox_3d = np.array([[scale[0] / 2, +scale[1] / 2, scale[2] / 2], 45 | [scale[0] / 2, +scale[1] / 2, -scale[2] / 2], 46 | [-scale[0] / 2, +scale[1] / 2, scale[2] / 2], 47 | [-scale[0] / 2, +scale[1] / 2, -scale[2] / 2], 48 | [+scale[0] / 2, -scale[1] / 2, scale[2] / 2], 49 | [+scale[0] / 2, -scale[1] / 2, -scale[2] / 2], 50 | [-scale[0] / 2, -scale[1] / 2, scale[2] / 2], 51 | [-scale[0] / 2, -scale[1] / 2, -scale[2] / 2]]) + shift 52 | else: 53 | bbox_3d = np.array([[scale / 2, +scale / 2, scale / 2], 54 | [scale / 2, +scale / 2, -scale / 2], 55 | [-scale / 2, +scale / 2, scale / 2], 56 | [-scale / 2, +scale / 2, -scale / 2], 57 | [+scale / 2, -scale / 2, scale / 2], 58 | [+scale / 2, -scale / 2, -scale / 2], 59 | [-scale / 2, -scale / 2, scale / 2], 60 | [-scale / 2, -scale / 2, -scale / 2]]) + shift 61 | 62 | bbox_3d = bbox_3d.transpose() 63 | return bbox_3d 64 | 65 | 66 | def transform_coordinates_3d(coordinates, RT): 67 | """ 68 | Input: 69 | coordinates: [3, N] 70 | RT: [4, 4] 71 | Return 72 | new_coordinates: [3, N] 73 | 74 | """ 75 | assert coordinates.shape[0] == 3 76 | coordinates = np.vstack([coordinates, np.ones( 77 | (1, coordinates.shape[1]), dtype=np.float32)]) 78 | new_coordinates = RT @ coordinates 79 | new_coordinates = new_coordinates[:3, :]/new_coordinates[3, :] 80 | return new_coordinates 81 | 82 | 83 | def compute_ap_from_matches_scores(pred_match, pred_scores, gt_match): 84 | # sort the scores from high to low 85 | # print(pred_match.shape, pred_scores.shape) 86 | assert pred_match.shape[0] == pred_scores.shape[0] 87 | 88 | score_indices = np.argsort(pred_scores)[::-1] 89 | pred_scores = pred_scores[score_indices] 90 | pred_match = pred_match[score_indices] 91 | 92 | precisions = np.cumsum(pred_match > -1) / (np.arange(len(pred_match)) + 1) 93 | recalls = np.cumsum(pred_match > -1).astype(np.float32) / len(gt_match) 94 | 95 | # Pad with start and end values to simplify the math 96 | precisions = np.concatenate([[0], precisions, [0]]) 97 | recalls = np.concatenate([[0], recalls, [1]]) 98 | 99 | # Ensure precision values decrease but don't 
increase. This way, the 100 | # precision value at each recall threshold is the maximum it can be 101 | # for all following recall thresholds, as specified by the VOC paper. 102 | for i in range(len(precisions) - 2, -1, -1): 103 | precisions[i] = np.maximum(precisions[i], precisions[i + 1]) 104 | 105 | # Compute mean AP over recall range 106 | indices = np.where(recalls[:-1] != recalls[1:])[0] + 1 107 | ap = np.sum((recalls[indices] - recalls[indices - 1]) 108 | * precisions[indices]) 109 | return ap 110 | 111 | 112 | def compute_3d_iou_new(RT_1, RT_2, scales_1, scales_2, handle_visibility, class_name_1, class_name_2): 113 | '''Computes IoU overlaps between two 3d bboxes. 114 | bbox_3d_1, bbox_3d_1: [3, 8] 115 | ''' 116 | # flatten masks 117 | def asymmetric_3d_iou(RT_1, RT_2, scales_1, scales_2): 118 | noc_cube_1 = get_3d_bbox(scales_1, 0) 119 | bbox_3d_1 = transform_coordinates_3d(noc_cube_1, RT_1) 120 | 121 | noc_cube_2 = get_3d_bbox(scales_2, 0) 122 | bbox_3d_2 = transform_coordinates_3d(noc_cube_2, RT_2) 123 | 124 | bbox_1_max = np.amax(bbox_3d_1, axis=0) 125 | bbox_1_min = np.amin(bbox_3d_1, axis=0) 126 | bbox_2_max = np.amax(bbox_3d_2, axis=0) 127 | bbox_2_min = np.amin(bbox_3d_2, axis=0) 128 | 129 | overlap_min = np.maximum(bbox_1_min, bbox_2_min) 130 | overlap_max = np.minimum(bbox_1_max, bbox_2_max) 131 | 132 | # intersections and union 133 | if np.amin(overlap_max - overlap_min) < 0: 134 | intersections = 0 135 | else: 136 | intersections = np.prod(overlap_max - overlap_min) 137 | union = np.prod(bbox_1_max - bbox_1_min) + \ 138 | np.prod(bbox_2_max - bbox_2_min) - intersections 139 | overlaps = intersections / union 140 | return overlaps 141 | 142 | if RT_1 is None or RT_2 is None: 143 | return -1 144 | 145 | symmetry_flag = False 146 | if (class_name_1 in ['bottle', 'bowl', 'can'] and class_name_1 == class_name_2) or (class_name_1 == 'mug' and class_name_1 == class_name_2 and handle_visibility == 0): 147 | # print('*'*10) 148 | 149 | noc_cube_1 = get_3d_bbox(scales_1, 0) 150 | noc_cube_2 = get_3d_bbox(scales_2, 0) 151 | bbox_3d_2 = transform_coordinates_3d(noc_cube_2, RT_2) 152 | 153 | def y_rotation_matrix(theta): 154 | return np.array([[np.cos(theta), 0, np.sin(theta), 0], 155 | [0, 1, 0, 0], 156 | [-np.sin(theta), 0, np.cos(theta), 0], 157 | [0, 0, 0, 1]]) 158 | 159 | n = 20 160 | max_iou = 0 161 | for i in range(n): 162 | rotated_RT_1 = RT_1@y_rotation_matrix(2*math.pi*i/float(n)) 163 | max_iou = max(max_iou, 164 | asymmetric_3d_iou(rotated_RT_1, RT_2, scales_1, scales_2)) 165 | else: 166 | max_iou = asymmetric_3d_iou(RT_1, RT_2, scales_1, scales_2) 167 | 168 | return max_iou 169 | 170 | 171 | def compute_combination_RT_degree_cm_symmetry(RT_1, RT_2, scale, class_id, handle_visibility, synset_names): 172 | ''' 173 | :param RT_1: [4, 4]. homogeneous affine transformation 174 | :param RT_2: [4, 4]. 
homogeneous affine transformation 175 | :return: theta: angle difference of R in degree, shift: l2 difference of T in centimeter 176 | 177 | 178 | synset_names = ['BG', # 0 179 | 'bottle', # 1 180 | 'bowl', # 2 181 | 'camera', # 3 182 | 'can', # 4 183 | 'cap', # 5 184 | 'phone', # 6 185 | 'monitor', # 7 186 | 'laptop', # 8 187 | 'mug' # 9 188 | ] 189 | 190 | synset_names = ['BG', # 0 191 | 'bottle', # 1 192 | 'bowl', # 2 193 | 'camera', # 3 194 | 'can', # 4 195 | 'laptop', # 5 196 | 'mug' # 6 197 | ] 198 | ''' 199 | 200 | # make sure the last row is [0, 0, 0, 1] 201 | if RT_1 is None or RT_2 is None: 202 | return -1 203 | try: 204 | assert np.array_equal(RT_1[3, :], RT_2[3, :]) 205 | assert np.array_equal(RT_1[3, :], np.array([0, 0, 0, 1])) 206 | except AssertionError: 207 | print(RT_1[3, :], RT_2[3, :]) 208 | exit() 209 | 210 | R1 = RT_1[:3, :3] / np.cbrt(np.linalg.det(RT_1[:3, :3])) 211 | T1 = RT_1[:3, 3] 212 | 213 | R2 = RT_2[:3, :3] / np.cbrt(np.linalg.det(RT_2[:3, :3])) 214 | T2 = RT_2[:3, 3] 215 | 216 | # symmetric when rotating around y-axis 217 | if synset_names[class_id] in ['bottle', 'can', 'bowl']: 218 | y = np.array([0, 1, 0]) 219 | y1 = R1 @ y 220 | y2 = R2 @ y 221 | theta = np.arccos( 222 | y1.dot(y2) / (np.linalg.norm(y1) * np.linalg.norm(y2))) 223 | # symmetric when rotating around y-axis 224 | elif synset_names[class_id] == 'mug' and handle_visibility == 0: 225 | y = np.array([0, 1, 0]) 226 | y1 = R1 @ y 227 | y2 = R2 @ y 228 | theta = np.arccos( 229 | y1.dot(y2) / (np.linalg.norm(y1) * np.linalg.norm(y2))) 230 | elif synset_names[class_id] in ['phone', 'eggbox', 'glue']: 231 | y_180_RT = np.diag([-1.0, 1.0, -1.0]) 232 | R = R1 @ R2.transpose() 233 | R_rot = R1 @ y_180_RT @ R2.transpose() 234 | theta = min(np.arccos((np.trace(R) - 1) / 2), 235 | np.arccos((np.trace(R_rot) - 1) / 2)) 236 | else: 237 | R = R1 @ R2.transpose() 238 | theta = np.arccos(np.clip((np.trace(R) - 1) / 2, -1.0, 1.0)) 239 | 240 | theta *= 180 / np.pi 241 | shift = np.linalg.norm(T1 - T2) / scale 242 | result = np.array([theta, shift]) 243 | 244 | return result 245 | 246 | 247 | def compute_combination_3d_matches(gt_class_ids, gt_RTs, gt_scales, gt_handle_visibility, synset_names, 248 | pred_boxes, pred_class_ids, pred_scores, pred_RTs, pred_scales, 249 | iou_3d_thresholds, degree_thesholds, shift_thesholds, score_threshold=0): 250 | """Finds matches between prediction and ground truth instances. 251 | Returns: 252 | gt_matches: 2-D array. For each GT box it has the index of the matched 253 | predicted box. 254 | pred_matches: 2-D array. For each predicted box, it has the index of 255 | the matched ground truth box. 256 | overlaps: [pred_boxes, gt_boxes] IoU overlaps. 
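        (In this combined variant the match arrays are additionally indexed by
        the degree/shift/IoU threshold lists, i.e. they have shape
        [num_degree_thres, num_shift_thres, num_iou_thres, num_instances];
        the third value actually returned is the score-sorting index array.)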
257 | """ 258 | # Trim zero padding 259 | # TODO: cleaner to do zero unpadding upstream 260 | num_pred = len(pred_class_ids) 261 | num_gt = len(gt_class_ids) 262 | indices = np.zeros(0) 263 | 264 | if num_pred: 265 | pred_boxes = trim_zeros(pred_boxes).copy() 266 | pred_scores = pred_scores[:pred_boxes.shape[0]].copy() 267 | 268 | # Sort predictions by score from high to low 269 | indices = np.argsort(pred_scores)[::-1] 270 | 271 | pred_boxes = pred_boxes[indices].copy() 272 | pred_class_ids = pred_class_ids[indices].copy() 273 | pred_scores = pred_scores[indices].copy() 274 | pred_scales = pred_scales[indices].copy() 275 | pred_RTs = pred_RTs[indices].copy() 276 | 277 | # Compute IoU overlaps [pred_bboxs gt_bboxs] 278 | #overlaps = [[0 for j in range(num_gt)] for i in range(num_pred)] 279 | overlaps = np.zeros((num_pred, num_gt), dtype=np.float32) 280 | RT_overlaps = np.zeros((num_pred, num_gt, 2), dtype=np.float32) 281 | for i in range(num_pred): 282 | for j in range(num_gt): 283 | # overlaps[i, j] = compute_3d_iou(pred_3d_bboxs[i], gt_3d_bboxs[j], gt_handle_visibility[j], 284 | # synset_names[pred_class_ids[i]], synset_names[gt_class_ids[j]]) 285 | overlaps[i, j] = compute_3d_iou_new(pred_RTs[i], gt_RTs[j], pred_scales[i, :], gt_scales[j], 286 | gt_handle_visibility[j], synset_names[pred_class_ids[i]], synset_names[gt_class_ids[j]]) 287 | 288 | RT_overlaps[i, j, :] = compute_combination_RT_degree_cm_symmetry(pred_RTs[i], gt_RTs[j], np.cbrt( 289 | np.linalg.det(gt_RTs[j, :3, :3])), gt_class_ids[j], gt_handle_visibility[j], synset_names) 290 | 291 | # Loop through predictions and find matching ground truth boxes 292 | num_iou_3d_thres = len(iou_3d_thresholds) 293 | num_degree_thes = len(degree_thesholds) 294 | num_shift_thes = len(shift_thesholds) 295 | pred_matches = -1 * \ 296 | np.ones([num_degree_thes, num_shift_thes, num_iou_3d_thres, num_pred]) 297 | gt_matches = -1 * \ 298 | np.ones([num_degree_thes, num_shift_thes, num_iou_3d_thres, num_gt]) 299 | 300 | for s, iou_thres in enumerate(iou_3d_thresholds): 301 | for d, degree_thres in enumerate(degree_thesholds): 302 | for t, shift_thres in enumerate(shift_thesholds): 303 | for i in range(len(pred_boxes)): 304 | # Find best matching ground truth box 305 | # 1. Sort matches by score 306 | sorted_ixs_by_iou = np.argsort(overlaps[i])[::-1] 307 | # 2. Remove low scores 308 | low_score_idx = np.where( 309 | overlaps[i, sorted_ixs_by_iou] < score_threshold)[0] 310 | if low_score_idx.size > 0: 311 | sorted_ixs_by_iou = sorted_ixs_by_iou[:low_score_idx[0]] 312 | # 3. 
Find the match 313 | for j in sorted_ixs_by_iou: 314 | if gt_matches[d, t, s, j] > -1: 315 | continue 316 | # If we reach IoU smaller than the threshold, end the loop 317 | iou = overlaps[i, j] 318 | r_error = RT_overlaps[i, j, 0] 319 | t_error = RT_overlaps[i, j, 1] 320 | 321 | if iou < iou_thres or r_error > degree_thres or t_error > shift_thres: 322 | break 323 | 324 | if not pred_class_ids[i] == gt_class_ids[j]: 325 | continue 326 | 327 | if iou >= iou_thres or r_error <= degree_thres or t_error <= shift_thres: 328 | gt_matches[d, t, s, j] = i 329 | pred_matches[d, t, s, i] = j 330 | break 331 | 332 | return gt_matches, pred_matches, indices 333 | 334 | 335 | def compute_combination_mAP(final_results, synset_names, degree_thresholds=[5, 10, 15], shift_thresholds=[0.1, 0.2], iou_3d_thresholds=[0.1]): 336 | num_classes = len(synset_names) 337 | degree_thres_list = list(degree_thresholds) + [360] 338 | num_degree_thres = len(degree_thres_list) 339 | 340 | shift_thres_list = list(shift_thresholds) + [100] 341 | num_shift_thres = len(shift_thres_list) 342 | 343 | iou_thres_list = list(iou_3d_thresholds) 344 | num_iou_thres = len(iou_thres_list) 345 | 346 | aps = np.zeros((num_classes + 1, num_degree_thres, 347 | num_shift_thres, num_iou_thres)) 348 | pred_matches_all = [np.zeros( 349 | (num_degree_thres, num_shift_thres, num_iou_thres, 0)) for _ in range(num_classes)] 350 | gt_matches_all = [np.zeros( 351 | (num_degree_thres, num_shift_thres, num_iou_thres, 0)) for _ in range(num_classes)] 352 | pred_scores_all = [np.zeros( 353 | (num_degree_thres, num_shift_thres, num_iou_thres, 0)) for _ in range(num_classes)] 354 | 355 | for progress, result in tqdm(enumerate(final_results)): 356 | gt_class_ids = result['gt_class_ids'].astype(np.int32) 357 | gt_RTs = np.array(result['gt_RTs']) 358 | gt_scales = np.array(result['gt_scales']) 359 | gt_handle_visibility = result['gt_handle_visibility'] 360 | 361 | pred_bboxes = np.array(result['pred_bboxes']) 362 | pred_class_ids = result['pred_class_ids'] 363 | pred_scales = result['pred_scales'] 364 | pred_scores = result['pred_scores'] 365 | pred_RTs = np.array(result['pred_RTs']) 366 | 367 | if len(gt_class_ids) == 0 and len(pred_class_ids) == 0: 368 | continue 369 | 370 | for cls_id in range(1, num_classes): 371 | # get gt and predictions in this class 372 | cls_gt_class_ids = gt_class_ids[gt_class_ids == cls_id] if len( 373 | gt_class_ids) else np.zeros(0) 374 | cls_gt_scales = gt_scales[gt_class_ids == cls_id] if len( 375 | gt_class_ids) else np.zeros((0, 3)) 376 | cls_gt_RTs = gt_RTs[gt_class_ids == cls_id] if len( 377 | gt_class_ids) else np.zeros((0, 4, 4)) 378 | 379 | cls_pred_class_ids = pred_class_ids[pred_class_ids == cls_id] if len( 380 | pred_class_ids) else np.zeros(0) 381 | cls_pred_bboxes = pred_bboxes[pred_class_ids == cls_id, :] if len( 382 | pred_class_ids) else np.zeros((0, 4)) 383 | cls_pred_scores = pred_scores[pred_class_ids == cls_id] if len( 384 | pred_class_ids) else np.zeros(0) 385 | cls_pred_RTs = pred_RTs[pred_class_ids == cls_id] if len( 386 | pred_class_ids) else np.zeros((0, 4, 4)) 387 | cls_pred_scales = pred_scales[pred_class_ids == cls_id] if len( 388 | pred_class_ids) else np.zeros((0, 3)) 389 | 390 | if synset_names[cls_id] != 'mug': 391 | cls_gt_handle_visibility = np.ones_like(cls_gt_class_ids) 392 | else: 393 | cls_gt_handle_visibility = gt_handle_visibility[gt_class_ids == cls_id] if len( 394 | gt_class_ids) else np.ones(0) 395 | 396 | gt_match, pred_match, pred_indiced = 
compute_combination_3d_matches(cls_gt_class_ids, cls_gt_RTs, cls_gt_scales, cls_gt_handle_visibility, synset_names, 397 | cls_pred_bboxes, cls_pred_class_ids, cls_pred_scores, cls_pred_RTs, cls_pred_scales, 398 | iou_thres_list, degree_thres_list, shift_thres_list) 399 | if len(pred_indiced): 400 | cls_pred_class_ids = cls_pred_class_ids[pred_indiced] 401 | cls_pred_RTs = cls_pred_RTs[pred_indiced] 402 | cls_pred_scores = cls_pred_scores[pred_indiced] 403 | cls_pred_bboxes = cls_pred_bboxes[pred_indiced] 404 | 405 | pred_matches_all[cls_id] = np.concatenate( 406 | (pred_matches_all[cls_id], pred_match), axis=-1) 407 | cls_pred_scores_tile = np.tile( 408 | cls_pred_scores, (num_degree_thres, num_shift_thres, num_iou_thres, 1)) 409 | pred_scores_all[cls_id] = np.concatenate( 410 | (pred_scores_all[cls_id], cls_pred_scores_tile), axis=-1) 411 | assert pred_matches_all[cls_id].shape[-1] == pred_scores_all[cls_id].shape[-1] 412 | gt_matches_all[cls_id] = np.concatenate( 413 | (gt_matches_all[cls_id], gt_match), axis=-1) 414 | 415 | for cls_id in range(1, num_classes): 416 | class_name = synset_names[cls_id] 417 | for s, iou_thres in enumerate(iou_thres_list): 418 | for d, degree_thres in enumerate(degree_thres_list): 419 | for t, shift_thres in enumerate(shift_thres_list): 420 | aps[cls_id, d, t, s] = compute_ap_from_matches_scores(pred_matches_all[cls_id][d, t, s, :], 421 | pred_scores_all[cls_id][d, 422 | t, s, :], 423 | gt_matches_all[cls_id][d, t, s, :]) 424 | # for i in range(6): 425 | # print(i+1) 426 | # print('IoU75, 5 degree, 5% translation: {:.2f}'.format(aps[i+1, degree_thres_list.index(5), shift_thres_list.index(0.05), iou_thres_list.index(0.75)]*100)) 427 | # print('IoU75, 10 degree, 5% translation: {:.2f}'.format(aps[i+1, degree_thres_list.index(10), shift_thres_list.index(0.05), iou_thres_list.index(0.75)]*100)) 428 | # print('IoU75, 5 degree, 10% translation: {:.2f}'.format(aps[i+1, degree_thres_list.index(5), shift_thres_list.index(0.10), iou_thres_list.index(0.75)]*100)) 429 | # print('IoU50, 5 degree, 20% translation: {:.2f}'.format(aps[i+1, degree_thres_list.index(5), shift_thres_list.index(0.20), iou_thres_list.index(0.50)]*100)) 430 | # print('IoU50, 10 degree, 10% translation: {:.2f}'.format(aps[i+1, degree_thres_list.index(10), shift_thres_list.index(0.10), iou_thres_list.index(0.50)]*100)) 431 | # print('IoU50, 10 degree, 20% translation: {:.2f}'.format(aps[i+1, degree_thres_list.index(10), shift_thres_list.index(0.20), iou_thres_list.index(0.50)]*100)) 432 | 433 | # print('ALL:') 434 | aps[-1, :, :, :] = np.mean(aps[1:-1, :, :, :], axis=0) 435 | 436 | print('IoU75, 5 degree, 5% translation: {:.2f}'.format( 437 | aps[-1, degree_thres_list.index(5), shift_thres_list.index(0.05), iou_thres_list.index(0.75)]*100)) 438 | print('IoU75, 10 degree, 5% translation: {:.2f}'.format( 439 | aps[-1, degree_thres_list.index(10), shift_thres_list.index(0.05), iou_thres_list.index(0.75)]*100)) 440 | print('IoU75, 5 degree, 10% translation: {:.2f}'.format( 441 | aps[-1, degree_thres_list.index(5), shift_thres_list.index(0.10), iou_thres_list.index(0.75)]*100)) 442 | print('IoU50, 5 degree, 20% translation: {:.2f}'.format( 443 | aps[-1, degree_thres_list.index(5), shift_thres_list.index(0.20), iou_thres_list.index(0.50)]*100)) 444 | print('IoU50, 10 degree, 10% translation: {:.2f}'.format( 445 | aps[-1, degree_thres_list.index(10), shift_thres_list.index(0.10), iou_thres_list.index(0.50)]*100)) 446 | print('IoU50, 10 degree, 20% translation: {:.2f}'.format( 447 | aps[-1, 
degree_thres_list.index(10), shift_thres_list.index(0.20), iou_thres_list.index(0.50)]*100)) 448 | 449 | return aps 450 | 451 | 452 | def compute_3d_matches(gt_class_ids, gt_RTs, gt_scales, gt_handle_visibility, synset_names, 453 | pred_boxes, pred_class_ids, pred_scores, pred_RTs, pred_scales, 454 | iou_3d_thresholds, score_threshold=0): 455 | """Finds matches between prediction and ground truth instances. 456 | Returns: 457 | gt_matches: 2-D array. For each GT box it has the index of the matched 458 | predicted box. 459 | pred_matches: 2-D array. For each predicted box, it has the index of 460 | the matched ground truth box. 461 | overlaps: [pred_boxes, gt_boxes] IoU overlaps. 462 | """ 463 | # Trim zero padding 464 | # TODO: cleaner to do zero unpadding upstream 465 | num_pred = len(pred_class_ids) 466 | num_gt = len(gt_class_ids) 467 | indices = np.zeros(0) 468 | 469 | if num_pred: 470 | pred_boxes = trim_zeros(pred_boxes).copy() 471 | pred_scores = pred_scores[:pred_boxes.shape[0]].copy() 472 | 473 | # Sort predictions by score from high to low 474 | indices = np.argsort(pred_scores)[::-1] 475 | 476 | pred_boxes = pred_boxes[indices].copy() 477 | pred_class_ids = pred_class_ids[indices].copy() 478 | pred_scores = pred_scores[indices].copy() 479 | pred_scales = pred_scales[indices].copy() 480 | pred_RTs = pred_RTs[indices].copy() 481 | 482 | # Compute IoU overlaps [pred_bboxs gt_bboxs] 483 | #overlaps = [[0 for j in range(num_gt)] for i in range(num_pred)] 484 | overlaps = np.zeros((num_pred, num_gt), dtype=np.float32) 485 | for i in range(num_pred): 486 | for j in range(num_gt): 487 | # overlaps[i, j] = compute_3d_iou(pred_3d_bboxs[i], gt_3d_bboxs[j], gt_handle_visibility[j], 488 | # synset_names[pred_class_ids[i]], synset_names[gt_class_ids[j]]) 489 | overlaps[i, j] = compute_3d_iou_new(pred_RTs[i], gt_RTs[j], pred_scales[i, :], gt_scales[j], 490 | gt_handle_visibility[j], synset_names[pred_class_ids[i]], synset_names[gt_class_ids[j]]) 491 | 492 | # Loop through predictions and find matching ground truth boxes 493 | num_iou_3d_thres = len(iou_3d_thresholds) 494 | pred_matches = -1 * np.ones([num_iou_3d_thres, num_pred]) 495 | gt_matches = -1 * np.ones([num_iou_3d_thres, num_gt]) 496 | 497 | for s, iou_thres in enumerate(iou_3d_thresholds): 498 | for i in range(len(pred_boxes)): 499 | # Find best matching ground truth box 500 | # 1. Sort matches by score 501 | sorted_ixs = np.argsort(overlaps[i])[::-1] 502 | # 2. Remove low scores 503 | low_score_idx = np.where( 504 | overlaps[i, sorted_ixs] < score_threshold)[0] 505 | if low_score_idx.size > 0: 506 | sorted_ixs = sorted_ixs[:low_score_idx[0]] 507 | # 3. Find the match 508 | for j in sorted_ixs: 509 | # If ground truth box is already matched, go to next one 510 | #print('gt_match: ', gt_match[j]) 511 | if gt_matches[s, j] > -1: 512 | continue 513 | # If we reach IoU smaller than the threshold, end the loop 514 | iou = overlaps[i, j] 515 | #print('iou: ', iou) 516 | if iou < iou_thres: 517 | break 518 | # Do we have a match? 519 | if not pred_class_ids[i] == gt_class_ids[j]: 520 | continue 521 | 522 | if iou > iou_thres: 523 | gt_matches[s, j] = i 524 | pred_matches[s, i] = j 525 | break 526 | 527 | return gt_matches, pred_matches, overlaps, indices 528 | 529 | 530 | def compute_RT_degree_cm_symmetry(RT_1, RT_2, class_id, handle_visibility, synset_names): 531 | ''' 532 | :param RT_1: [4, 4]. homogeneous affine transformation 533 | :param RT_2: [4, 4]. 
homogeneous affine transformation 534 | :return: theta: angle difference of R in degrees, shift: L2 difference of T in centimeters 535 | 536 | 537 | synset_names = ['BG', # 0 538 | 'bottle', # 1 539 | 'bowl', # 2 540 | 'camera', # 3 541 | 'can', # 4 542 | 'cap', # 5 543 | 'phone', # 6 544 | 'monitor', # 7 545 | 'laptop', # 8 546 | 'mug' # 9 547 | ] 548 | 549 | synset_names = ['BG', # 0 550 | 'bottle', # 1 551 | 'bowl', # 2 552 | 'camera', # 3 553 | 'can', # 4 554 | 'laptop', # 5 555 | 'mug' # 6 556 | ] 557 | ''' 558 | 559 | # make sure the last row is [0, 0, 0, 1] 560 | if RT_1 is None or RT_2 is None: 561 | return -1 562 | try: 563 | assert np.array_equal(RT_1[3, :], RT_2[3, :]) 564 | assert np.array_equal(RT_1[3, :], np.array([0, 0, 0, 1])) 565 | except AssertionError: 566 | print(RT_1[3, :], RT_2[3, :]) 567 | exit() 568 | 569 | R1 = RT_1[:3, :3] / np.cbrt(np.linalg.det(RT_1[:3, :3])) 570 | T1 = RT_1[:3, 3] 571 | 572 | R2 = RT_2[:3, :3] / np.cbrt(np.linalg.det(RT_2[:3, :3])) 573 | T2 = RT_2[:3, 3] 574 | 575 | # symmetric when rotating around y-axis 576 | if synset_names[class_id] in ['bottle', 'can', 'bowl']: 577 | y = np.array([0, 1, 0]) 578 | y1 = R1 @ y 579 | y2 = R2 @ y 580 | theta = np.arccos(np.clip( # clip guards against NaN from floating-point round-off 581 | y1.dot(y2) / (np.linalg.norm(y1) * np.linalg.norm(y2)), -1.0, 1.0)) 582 | # a mug is also treated as y-axis symmetric when its handle is not visible 583 | elif synset_names[class_id] == 'mug' and handle_visibility == 0: 584 | y = np.array([0, 1, 0]) 585 | y1 = R1 @ y 586 | y2 = R2 @ y 587 | theta = np.arccos(np.clip( 588 | y1.dot(y2) / (np.linalg.norm(y1) * np.linalg.norm(y2)), -1.0, 1.0)) 589 | elif synset_names[class_id] in ['phone', 'eggbox', 'glue']: 590 | y_180_RT = np.diag([-1.0, 1.0, -1.0]) 591 | R = R1 @ R2.transpose() 592 | R_rot = R1 @ y_180_RT @ R2.transpose() 593 | theta = min(np.arccos(np.clip((np.trace(R) - 1) / 2, -1.0, 1.0)), 594 | np.arccos(np.clip((np.trace(R_rot) - 1) / 2, -1.0, 1.0))) 595 | else: 596 | R = R1 @ R2.transpose() 597 | theta = np.arccos(np.clip((np.trace(R) - 1) / 2, -1.0, 1.0)) 598 | 599 | theta *= 180 / np.pi 600 | shift = np.linalg.norm(T1 - T2) * 100 601 | result = np.array([theta, shift]) 602 | 603 | return result 604 | 605 | 606 | def compute_RT_overlaps(gt_class_ids, gt_RTs, gt_handle_visibility, 607 | pred_class_ids, pred_RTs, 608 | synset_names): 609 | """Finds overlaps between prediction and ground truth instances. 610 | Returns: 611 | overlaps: [num_pred, num_gt, 2] array of (rotation error in degrees, translation error in cm).
612 | """ 613 | # print('num of gt instances: {}, num of pred instances: {}'.format(len(gt_class_ids), len(gt_class_ids))) 614 | num_pred = len(pred_class_ids) 615 | num_gt = len(gt_class_ids) 616 | 617 | # Compute IoU overlaps [pred_bboxs gt_bboxs] 618 | #overlaps = [[0 for j in range(num_gt)] for i in range(num_pred)] 619 | overlaps = np.zeros((num_pred, num_gt, 2)) 620 | 621 | for i in range(num_pred): 622 | for j in range(num_gt): 623 | overlaps[i, j, :] = compute_RT_degree_cm_symmetry(pred_RTs[i], 624 | gt_RTs[j], 625 | gt_class_ids[j], 626 | gt_handle_visibility[j], 627 | synset_names) 628 | 629 | return overlaps 630 | 631 | 632 | def compute_match_from_degree_cm(overlaps, pred_class_ids, gt_class_ids, degree_thres_list, shift_thres_list): 633 | num_degree_thres = len(degree_thres_list) 634 | num_shift_thres = len(shift_thres_list) 635 | 636 | num_pred = len(pred_class_ids) 637 | num_gt = len(gt_class_ids) 638 | 639 | pred_matches = -1 * np.ones((num_degree_thres, num_shift_thres, num_pred)) 640 | gt_matches = -1 * np.ones((num_degree_thres, num_shift_thres, num_gt)) 641 | 642 | if num_pred == 0 or num_gt == 0: 643 | return gt_matches, pred_matches 644 | 645 | assert num_pred == overlaps.shape[0] 646 | assert num_gt == overlaps.shape[1] 647 | assert overlaps.shape[2] == 2 648 | 649 | for d, degree_thres in enumerate(degree_thres_list): 650 | for s, shift_thres in enumerate(shift_thres_list): 651 | for i in range(num_pred): 652 | # Find best matching ground truth box 653 | # 1. Sort matches by scores from low to high 654 | sum_degree_shift = np.sum(overlaps[i, :, :], axis=-1) 655 | sorted_ixs = np.argsort(sum_degree_shift) 656 | # 2. Remove low scores 657 | # low_score_idx = np.where(sum_degree_shift >= 100)[0] 658 | # if low_score_idx.size > 0: 659 | # sorted_ixs = sorted_ixs[:low_score_idx[0]] 660 | # 3. 
Find the match 661 | for j in sorted_ixs: 662 | # If ground truth box is already matched, go to next one 663 | #print(j, len(gt_match), len(pred_class_ids), len(gt_class_ids)) 664 | if gt_matches[d, s, j] > -1 or pred_class_ids[i] != gt_class_ids[j]: 665 | continue 666 | # If we reach IoU smaller than the threshold, end the loop 667 | if overlaps[i, j, 0] > degree_thres or overlaps[i, j, 1] > shift_thres: 668 | continue 669 | 670 | gt_matches[d, s, j] = i 671 | pred_matches[d, s, i] = j 672 | break 673 | 674 | return gt_matches, pred_matches 675 | 676 | 677 | def compute_independent_mAP(final_results, synset_names, degree_thresholds=[360], shift_thresholds=[100], iou_3d_thresholds=[0.1], iou_pose_thres=0.1, use_matches_for_pose=True): 678 | 679 | num_classes = len(synset_names) 680 | degree_thres_list = list(degree_thresholds) + [360] 681 | num_degree_thres = len(degree_thres_list) 682 | 683 | shift_thres_list = list(shift_thresholds) + [100] 684 | num_shift_thres = len(shift_thres_list) 685 | 686 | iou_thres_list = list(iou_3d_thresholds) 687 | num_iou_thres = len(iou_thres_list) 688 | 689 | if use_matches_for_pose: 690 | assert iou_pose_thres in iou_thres_list 691 | 692 | iou_3d_aps = np.zeros((num_classes + 1, num_iou_thres)) 693 | iou_pred_matches_all = [np.zeros((num_iou_thres, 0)) 694 | for _ in range(num_classes)] 695 | iou_pred_scores_all = [np.zeros((num_iou_thres, 0)) 696 | for _ in range(num_classes)] 697 | iou_gt_matches_all = [np.zeros((num_iou_thres, 0)) 698 | for _ in range(num_classes)] 699 | 700 | pose_aps = np.zeros((num_classes + 1, num_degree_thres, num_shift_thres)) 701 | pose_pred_matches_all = [ 702 | np.zeros((num_degree_thres, num_shift_thres, 0)) for _ in range(num_classes)] 703 | pose_gt_matches_all = [ 704 | np.zeros((num_degree_thres, num_shift_thres, 0)) for _ in range(num_classes)] 705 | pose_pred_scores_all = [ 706 | np.zeros((num_degree_thres, num_shift_thres, 0)) for _ in range(num_classes)] 707 | 708 | # loop over results to gather pred matches and gt matches for iou and pose metrics 709 | progress = 0 710 | for progress, result in tqdm(enumerate(final_results)): 711 | gt_class_ids = result['gt_class_ids'].astype(np.int32) 712 | gt_RTs = np.array(result['gt_RTs']) 713 | gt_scales = np.array(result['gt_scales']) 714 | gt_handle_visibility = result['gt_handle_visibility'] 715 | 716 | pred_bboxes = np.array(result['pred_bboxes']) 717 | pred_class_ids = result['pred_class_ids'] 718 | pred_scales = result['pred_scales'] 719 | pred_scores = result['pred_scores'] 720 | pred_RTs = np.array(result['pred_RTs']) 721 | 722 | if len(gt_class_ids) == 0 and len(pred_class_ids) == 0: 723 | continue 724 | 725 | for cls_id in range(1, num_classes): 726 | # get gt and predictions in this class 727 | cls_gt_class_ids = gt_class_ids[gt_class_ids == cls_id] if len( 728 | gt_class_ids) else np.zeros(0) 729 | cls_gt_scales = gt_scales[gt_class_ids == cls_id] if len( 730 | gt_class_ids) else np.zeros((0, 3)) 731 | cls_gt_RTs = gt_RTs[gt_class_ids == cls_id] if len( 732 | gt_class_ids) else np.zeros((0, 4, 4)) 733 | 734 | # ipdb.set_trace() 735 | cls_pred_class_ids = pred_class_ids[pred_class_ids == cls_id] if len( 736 | pred_class_ids) else np.zeros(0) 737 | cls_pred_bboxes = pred_bboxes[pred_class_ids == cls_id, :] if len( 738 | pred_class_ids) else np.zeros((0, 4)) 739 | cls_pred_scores = pred_scores[pred_class_ids == cls_id] if len( 740 | pred_class_ids) else np.zeros(0) 741 | cls_pred_RTs = pred_RTs[pred_class_ids == cls_id] if len( 742 | pred_class_ids) else np.zeros((0, 4, 
4)) 743 | cls_pred_scales = pred_scales[pred_class_ids == cls_id] if len( 744 | pred_class_ids) else np.zeros((0, 3)) 745 | 746 | # calculate the overlap between each gt instance and pred instance 747 | if synset_names[cls_id] != 'mug': 748 | cls_gt_handle_visibility = np.ones_like(cls_gt_class_ids) 749 | else: 750 | cls_gt_handle_visibility = gt_handle_visibility[gt_class_ids == cls_id] if len( 751 | gt_class_ids) else np.ones(0) 752 | 753 | iou_cls_gt_match, iou_cls_pred_match, _, iou_pred_indices = compute_3d_matches(cls_gt_class_ids, cls_gt_RTs, cls_gt_scales, cls_gt_handle_visibility, synset_names, 754 | cls_pred_bboxes, cls_pred_class_ids, cls_pred_scores, cls_pred_RTs, cls_pred_scales, 755 | iou_thres_list) 756 | if len(iou_pred_indices): 757 | cls_pred_class_ids = cls_pred_class_ids[iou_pred_indices] 758 | cls_pred_RTs = cls_pred_RTs[iou_pred_indices] 759 | cls_pred_scores = cls_pred_scores[iou_pred_indices] 760 | cls_pred_bboxes = cls_pred_bboxes[iou_pred_indices] 761 | 762 | iou_pred_matches_all[cls_id] = np.concatenate( 763 | (iou_pred_matches_all[cls_id], iou_cls_pred_match), axis=-1) 764 | cls_pred_scores_tile = np.tile(cls_pred_scores, (num_iou_thres, 1)) 765 | iou_pred_scores_all[cls_id] = np.concatenate( 766 | (iou_pred_scores_all[cls_id], cls_pred_scores_tile), axis=-1) 767 | assert iou_pred_matches_all[cls_id].shape[1] == iou_pred_scores_all[cls_id].shape[1] 768 | iou_gt_matches_all[cls_id] = np.concatenate( 769 | (iou_gt_matches_all[cls_id], iou_cls_gt_match), axis=-1) 770 | 771 | if use_matches_for_pose: 772 | thres_ind = list(iou_thres_list).index(iou_pose_thres) 773 | 774 | iou_thres_pred_match = iou_cls_pred_match[thres_ind, :] 775 | 776 | cls_pred_class_ids = cls_pred_class_ids[iou_thres_pred_match > -1] if len( 777 | iou_thres_pred_match) > 0 else np.zeros(0) 778 | cls_pred_RTs = cls_pred_RTs[iou_thres_pred_match > -1] if len( 779 | iou_thres_pred_match) > 0 else np.zeros((0, 4, 4)) 780 | cls_pred_scores = cls_pred_scores[iou_thres_pred_match > -1] if len( 781 | iou_thres_pred_match) > 0 else np.zeros(0) 782 | cls_pred_bboxes = cls_pred_bboxes[iou_thres_pred_match > -1] if len( 783 | iou_thres_pred_match) > 0 else np.zeros((0, 4)) 784 | 785 | iou_thres_gt_match = iou_cls_gt_match[thres_ind, :] 786 | cls_gt_class_ids = cls_gt_class_ids[iou_thres_gt_match > -1] if len( 787 | iou_thres_gt_match) > 0 else np.zeros(0) 788 | cls_gt_RTs = cls_gt_RTs[iou_thres_gt_match > -1] if len( 789 | iou_thres_gt_match) > 0 else np.zeros((0, 4, 4)) 790 | cls_gt_handle_visibility = cls_gt_handle_visibility[iou_thres_gt_match > -1] if len( 791 | iou_thres_gt_match) > 0 else np.zeros(0) 792 | 793 | RT_overlaps = compute_RT_overlaps(cls_gt_class_ids, cls_gt_RTs, cls_gt_handle_visibility, 794 | cls_pred_class_ids, cls_pred_RTs, 795 | synset_names) 796 | 797 | pose_cls_gt_match, pose_cls_pred_match = compute_match_from_degree_cm(RT_overlaps, 798 | cls_pred_class_ids, 799 | cls_gt_class_ids, 800 | degree_thres_list, 801 | shift_thres_list) 802 | 803 | pose_pred_matches_all[cls_id] = np.concatenate( 804 | (pose_pred_matches_all[cls_id], pose_cls_pred_match), axis=-1) 805 | 806 | cls_pred_scores_tile = np.tile( 807 | cls_pred_scores, (num_degree_thres, num_shift_thres, 1)) 808 | pose_pred_scores_all[cls_id] = np.concatenate( 809 | (pose_pred_scores_all[cls_id], cls_pred_scores_tile), axis=-1) 810 | assert pose_pred_scores_all[cls_id].shape[2] == pose_pred_matches_all[cls_id].shape[2], '{} vs. 
{}'.format( 811 | pose_pred_scores_all[cls_id].shape, pose_pred_matches_all[cls_id].shape) 812 | pose_gt_matches_all[cls_id] = np.concatenate( 813 | (pose_gt_matches_all[cls_id], pose_cls_gt_match), axis=-1) 814 | 815 | iou_dict = {} 816 | iou_dict['thres_list'] = iou_thres_list 817 | for cls_id in range(1, num_classes): 818 | class_name = synset_names[cls_id] 819 | for s, iou_thres in enumerate(iou_thres_list): 820 | iou_3d_aps[cls_id, s] = compute_ap_from_matches_scores(iou_pred_matches_all[cls_id][s, :], 821 | iou_pred_scores_all[cls_id][s, :], 822 | iou_gt_matches_all[cls_id][s, :]) 823 | 824 | iou_3d_aps[-1, :] = np.mean(iou_3d_aps[1:-1, :], axis=0) 825 | 826 | for i, degree_thres in enumerate(degree_thres_list): 827 | for j, shift_thres in enumerate(shift_thres_list): 828 | for cls_id in range(1, num_classes): 829 | cls_pose_pred_matches_all = pose_pred_matches_all[cls_id][i, j, :] 830 | cls_pose_gt_matches_all = pose_gt_matches_all[cls_id][i, j, :] 831 | cls_pose_pred_scores_all = pose_pred_scores_all[cls_id][i, j, :] 832 | 833 | pose_aps[cls_id, i, j] = compute_ap_from_matches_scores(cls_pose_pred_matches_all, 834 | cls_pose_pred_scores_all, 835 | cls_pose_gt_matches_all) 836 | 837 | pose_aps[-1, i, j] = np.mean(pose_aps[1:-1, i, j]) 838 | 839 | print('3D IoU at 25: {:.1f}'.format( 840 | iou_3d_aps[-1, iou_thres_list.index(0.25)] * 100)) 841 | print('3D IoU at 50: {:.1f}'.format( 842 | iou_3d_aps[-1, iou_thres_list.index(0.5)] * 100)) 843 | print('3D IoU at 75: {:.1f}'.format( 844 | iou_3d_aps[-1, iou_thres_list.index(0.75)] * 100)) 845 | 846 | print('5 degree, 2cm: {:.1f}'.format( 847 | pose_aps[-1, degree_thres_list.index(5), shift_thres_list.index(2)] * 100)) 848 | print('5 degree, 5cm: {:.1f}'.format( 849 | pose_aps[-1, degree_thres_list.index(5), shift_thres_list.index(5)] * 100)) 850 | 851 | print('10 degree, 2cm: {:.1f}'.format( 852 | pose_aps[-1, degree_thres_list.index(10), shift_thres_list.index(2)] * 100)) 853 | print('10 degree, 5cm: {:.1f}'.format( 854 | pose_aps[-1, degree_thres_list.index(10), shift_thres_list.index(5)] * 100)) 855 | 856 | return iou_3d_aps, pose_aps 857 | 858 | 859 | def evaluate(path): 860 | synset_names = ['BG', # 0 861 | 'bottle', # 1 862 | 'bowl', # 2 863 | 'camera', # 3 864 | 'can', # 4 865 | 'laptop', # 5 866 | 'mug'] # 6 867 | 868 | result_pkl_list = glob.glob(os.path.join(path, 'results_*.pkl')) 869 | result_pkl_list = sorted(result_pkl_list) 870 | print('image num: {}'.format(len(result_pkl_list))) 871 | 872 | final_results = [] 873 | for pkl_path in result_pkl_list: 874 | with open(pkl_path, 'rb') as f: 875 | result = cPickle.load(f) 876 | if not 'gt_handle_visibility' in result: 877 | result['gt_handle_visibility'] = np.ones_like( 878 | result['gt_class_ids']) 879 | print('can\'t find gt_handle_visibility in the pkl.') 880 | else: 881 | assert len(result['gt_handle_visibility']) == len(result['gt_class_ids']), "{} {}".format( 882 | result['gt_handle_visibility'], result['gt_class_ids']) 883 | 884 | if type(result) is list: 885 | final_results += result 886 | elif type(result) is dict: 887 | final_results.append(result) 888 | else: 889 | assert False 890 | 891 | print("Compute combined mAP: ") 892 | compute_combination_mAP(final_results, synset_names, 893 | degree_thresholds=[5, 10, 20], 894 | shift_thresholds=[0.05, 0.1, 0.2], 895 | iou_3d_thresholds=[0.25, 0.50, 0.75]) 896 | 897 | print("Compute independent mAP: ") 898 | compute_independent_mAP(final_results, synset_names, 899 | degree_thresholds=[5, 10], 900 | shift_thresholds=[2, 5], 
901 | iou_3d_thresholds=[0.10, 0.25, 0.50, 0.75]) 902 | --------------------------------------------------------------------------------
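A quick way to sanity-check the pose-error convention in evaluation_utils.py above: compute_RT_degree_cm_symmetry takes two 4x4 homogeneous transforms with translations in meters, returns [rotation error in degrees, translation error in centimeters] (the *100 factor), and ignores rotation about the y-axis for the symmetric classes ('bottle', 'can', 'bowl', and 'mug' with a hidden handle). The snippet below is a minimal, hypothetical sketch rather than part of the repository; it assumes numpy and the module's own dependencies are installed, that the repository root is the working directory so utils/ can be added to sys.path, and make_RT, RT_a, RT_b are illustrative names introduced only here.

import sys
import numpy as np

sys.path.append('utils')  # assumption: run from the repository root
from evaluation_utils import compute_RT_degree_cm_symmetry

synset_names = ['BG', 'bottle', 'bowl', 'camera', 'can', 'laptop', 'mug']

def make_RT(R, t):
    # Assemble a 4x4 homogeneous transform from a 3x3 rotation and a translation in meters.
    RT = np.eye(4)
    RT[:3, :3] = R
    RT[:3, 3] = t
    return RT

angle = np.deg2rad(30.0)
R_y = np.array([[np.cos(angle), 0.0, np.sin(angle)],
                [0.0, 1.0, 0.0],
                [-np.sin(angle), 0.0, np.cos(angle)]])  # 30-degree rotation about the y-axis
RT_a = make_RT(np.eye(3), np.zeros(3))
RT_b = make_RT(R_y, np.array([0.0, 0.0, 0.02]))  # 2 cm offset along z

# 'bottle' (class 1) is modeled as y-axis symmetric, so the spin about y is not counted as error.
print(compute_RT_degree_cm_symmetry(RT_a, RT_b, 1, 1, synset_names))  # approx. [0.0, 2.0]

# 'laptop' (class 5) has no modeled symmetry, so the same pair yields roughly 30 degrees.
print(compute_RT_degree_cm_symmetry(RT_a, RT_b, 5, 1, synset_names))  # approx. [30.0, 2.0]

The *100 conversion is also why the 2 cm and 5 cm shift thresholds passed to compute_independent_mAP in evaluate() are expressed directly in centimeters while the stored ground-truth and predicted translations are in meters.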